From 77a3bbf74e7320d6c1f86cd168f5aef48dc34ffe Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 18 Jul 2022 04:55:25 +0000 Subject: [PATCH 01/12] change number of broker --- kafka/50kafka.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kafka/50kafka.yml b/kafka/50kafka.yml index a001f1d5..c04d162a 100644 --- a/kafka/50kafka.yml +++ b/kafka/50kafka.yml @@ -8,7 +8,7 @@ spec: matchLabels: app: kafka serviceName: "kafka" - replicas: 3 + replicas: 5 updateStrategy: type: RollingUpdate podManagementPolicy: Parallel From 67cd84d7fafa11843d3547500dddc537fdae0cdf Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 18 Jul 2022 05:28:29 +0000 Subject: [PATCH 02/12] delete unnecessary dir --- 01-test-namespace.yml | 5 - cruise-control/11cruise-control-config.yml | 380 ------------------ .../20kafka-broker-reporter-patch.yml | 25 -- cruise-control/40cruise-control-service.yml | 12 - cruise-control/50cruise-control.yml | 51 --- cruise-control/README.md | 26 -- cruise-control/topic-create.yml | 24 -- events-kube/README.md | 5 - events-kube/config.yml | 16 - events-kube/events-kube-kafka.yml | 47 --- events-kube/rbac/cluster-events-watcher.yml | 39 -- events-kube/topic-create.yaml | 24 -- native/distroless.yaml | 9 - native/kustomization.yaml | 17 - native/native-image-zookeeper.yaml | 29 -- nonroot/entrypoint-from-image.yaml | 2 - nonroot/fsgroup-65534.yaml | 4 - nonroot/kustomization.yaml | 44 -- nonroot/nonroot-image-kafka.yaml | 15 - nonroot/nonroot-image-zookeeper.yaml | 31 -- pixy/Kustomization | 3 - pixy/pixy-service.yml | 12 - pixy/pixy.yml | 30 -- yahoo-kafka-manager/Kustomization | 3 - yahoo-kafka-manager/kafka-manager-service.yml | 12 - yahoo-kafka-manager/kafka-manager.yml | 26 -- 26 files changed, 891 deletions(-) delete mode 100644 01-test-namespace.yml delete mode 100644 cruise-control/11cruise-control-config.yml delete mode 100644 cruise-control/20kafka-broker-reporter-patch.yml delete mode 100644 cruise-control/40cruise-control-service.yml delete mode 100644 cruise-control/50cruise-control.yml delete mode 100644 cruise-control/README.md delete mode 100644 cruise-control/topic-create.yml delete mode 100644 events-kube/README.md delete mode 100644 events-kube/config.yml delete mode 100644 events-kube/events-kube-kafka.yml delete mode 100644 events-kube/rbac/cluster-events-watcher.yml delete mode 100644 events-kube/topic-create.yaml delete mode 100644 native/distroless.yaml delete mode 100644 native/kustomization.yaml delete mode 100644 native/native-image-zookeeper.yaml delete mode 100644 nonroot/entrypoint-from-image.yaml delete mode 100644 nonroot/fsgroup-65534.yaml delete mode 100644 nonroot/kustomization.yaml delete mode 100644 nonroot/nonroot-image-kafka.yaml delete mode 100644 nonroot/nonroot-image-zookeeper.yaml delete mode 100644 pixy/Kustomization delete mode 100644 pixy/pixy-service.yml delete mode 100644 pixy/pixy.yml delete mode 100644 yahoo-kafka-manager/Kustomization delete mode 100644 yahoo-kafka-manager/kafka-manager-service.yml delete mode 100644 yahoo-kafka-manager/kafka-manager.yml diff --git a/01-test-namespace.yml b/01-test-namespace.yml deleted file mode 100644 index fbb6e0ef..00000000 --- a/01-test-namespace.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- -apiVersion: v1 -kind: Namespace -metadata: - name: test-kafka diff --git a/cruise-control/11cruise-control-config.yml b/cruise-control/11cruise-control-config.yml deleted file mode 100644 index b964d199..00000000 --- a/cruise-control/11cruise-control-config.yml +++ /dev/null @@ -1,380 +0,0 @@ -kind: ConfigMap -metadata: - name: broker-cruise-control-config - namespace: kafka -apiVersion: v1 -data: - cruisecontrol.properties: |- - # - # Copyright 2017 LinkedIn Corp. Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information. - # - - # This is an example property file for Kafka Cruise Control. See KafkaCruiseControlConfig for more details. - - # Configuration for the metadata client. - # ======================================= - - # The Kafka cluster to control. - bootstrap.servers=bootstrap:9092 - - # The maximum interval in milliseconds between two metadata refreshes. - #metadata.max.age.ms=300000 - - # Client id for the Cruise Control. It is used for the metadata client. - #client.id=kafka-cruise-control - - # The size of TCP send buffer bytes for the metadata client. - #send.buffer.bytes=131072 - - # The size of TCP receive buffer size for the metadata client. - #receive.buffer.bytes=131072 - - # The time to wait before disconnect an idle TCP connection. - #connections.max.idle.ms=540000 - - # The time to wait before reconnect to a given host. - #reconnect.backoff.ms=50 - - # The time to wait for a response from a host after sending a request. - #request.timeout.ms=30000 - - - # Configurations for the load monitor - # ======================================= - - # The number of metric fetcher thread to fetch metrics for the Kafka cluster - num.metric.fetchers=1 - - # The metric sampler class - metric.sampler.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.CruiseControlMetricsReporterSampler - # Configurations for CruiseControlMetricsReporterSampler - metric.reporter.topic.pattern=__CruiseControlMetrics - - # The sample store class name - sample.store.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.KafkaSampleStore - - # The config for the Kafka sample store to save the partition metric samples - partition.metric.sample.store.topic=__KafkaCruiseControlPartitionMetricSamples - - # The config for the Kafka sample store to save the model training samples - broker.metric.sample.store.topic=__KafkaCruiseControlModelTrainingSamples - - # The replication factor of Kafka metric sample store topic - sample.store.topic.replication.factor=2 - - # The config for the number of Kafka sample store consumer threads - num.sample.loading.threads=8 - - # The partition assignor class for the metric samplers - metric.sampler.partition.assignor.class=com.linkedin.kafka.cruisecontrol.monitor.sampling.DefaultMetricSamplerPartitionAssignor - - # The metric sampling interval in milliseconds - metric.sampling.interval.ms=120000 - - # The partition metrics window size in milliseconds - partition.metrics.window.ms=300000 - - # The number of partition metric windows to keep in memory - num.partition.metrics.windows=1 - - # The minimum partition metric samples required for a partition in each window - min.samples.per.partition.metrics.window=1 - - # The broker metrics window size in milliseconds - broker.metrics.window.ms=300000 - - # The number of broker metric windows to keep in memory - num.broker.metrics.windows=20 - - # The minimum broker metric samples required for a partition in each window - min.samples.per.broker.metrics.window=1 - - # The configuration for the BrokerCapacityConfigFileResolver (supports JBOD and non-JBOD broker capacities) - capacity.config.file=config/capacity.json - #capacity.config.file=config/capacityJBOD.json - - # Configurations for the analyzer - # ======================================= - - # The list of goals to optimize the Kafka cluster for with pre-computed proposals - default.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal - - # The list of supported goals - goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PotentialNwOutGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.TopicReplicaDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.LeaderBytesInDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.kafkaassigner.KafkaAssignerDiskUsageDistributionGoal,com.linkedin.kafka.cruisecontrol.analyzer.kafkaassigner.KafkaAssignerEvenRackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.PreferredLeaderElectionGoal - - # The list of supported hard goals - hard.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal - - # The minimum percentage of well monitored partitions out of all the partitions - min.monitored.partition.percentage=0.95 - - # The balance threshold for CPU - cpu.balance.threshold=1.1 - - # The balance threshold for disk - disk.balance.threshold=1.1 - - # The balance threshold for network inbound utilization - network.inbound.balance.threshold=1.1 - - # The balance threshold for network outbound utilization - network.outbound.balance.threshold=1.1 - - # The balance threshold for the replica count - replica.count.balance.threshold=1.1 - - # The capacity threshold for CPU in percentage - cpu.capacity.threshold=0.8 - - # The capacity threshold for disk in percentage - disk.capacity.threshold=0.8 - - # The capacity threshold for network inbound utilization in percentage - network.inbound.capacity.threshold=0.8 - - # The capacity threshold for network outbound utilization in percentage - network.outbound.capacity.threshold=0.8 - - # The threshold to define the cluster to be in a low CPU utilization state - cpu.low.utilization.threshold=0.0 - - # The threshold to define the cluster to be in a low disk utilization state - disk.low.utilization.threshold=0.0 - - # The threshold to define the cluster to be in a low network inbound utilization state - network.inbound.low.utilization.threshold=0.0 - - # The threshold to define the cluster to be in a low disk utilization state - network.outbound.low.utilization.threshold=0.0 - - # The metric anomaly percentile upper threshold - metric.anomaly.percentile.upper.threshold=90.0 - - # The metric anomaly percentile lower threshold - metric.anomaly.percentile.lower.threshold=10.0 - - # How often should the cached proposal be expired and recalculated if necessary - proposal.expiration.ms=60000 - - # The maximum number of replicas that can reside on a broker at any given time. - max.replicas.per.broker=10000 - - # The number of threads to use for proposal candidate precomputing. - num.proposal.precompute.threads=1 - - # the topics that should be excluded from the partition movement. - #topics.excluded.from.partition.movement - - # Configurations for the executor - # ======================================= - - # The zookeeper connect of the Kafka cluster - zookeeper.connect=zookeeper:2181/ - - # The max number of partitions to move in/out on a given broker at a given time. - num.concurrent.partition.movements.per.broker=10 - - # The interval between two execution progress checks. - execution.progress.check.interval.ms=10000 - - - # Configurations for anomaly detector - # ======================================= - - # The goal violation notifier class - anomaly.notifier.class=com.linkedin.kafka.cruisecontrol.detector.notifier.SelfHealingNotifier - - # The metric anomaly finder class - metric.anomaly.finder.class=com.linkedin.kafka.cruisecontrol.detector.KafkaMetricAnomalyFinder - - # The anomaly detection interval - anomaly.detection.interval.ms=10000 - - # The goal violation to detect. - anomaly.detection.goals=com.linkedin.kafka.cruisecontrol.analyzer.goals.RackAwareGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.ReplicaCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.DiskCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkInboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.NetworkOutboundCapacityGoal,com.linkedin.kafka.cruisecontrol.analyzer.goals.CpuCapacityGoal - - # The interested metrics for metric anomaly analyzer. - metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_MAX,BROKER_PRODUCE_LOCAL_TIME_MS_MEAN,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MAX,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MAX,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_MEAN,BROKER_LOG_FLUSH_TIME_MS_MAX,BROKER_LOG_FLUSH_TIME_MS_MEAN - - ## Adjust accordingly if your metrics reporter is an older version and does not produce these metrics. - #metric.anomaly.analyzer.metrics=BROKER_PRODUCE_LOCAL_TIME_MS_50TH,BROKER_PRODUCE_LOCAL_TIME_MS_999TH,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_50TH,BROKER_CONSUMER_FETCH_LOCAL_TIME_MS_999TH,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_50TH,BROKER_FOLLOWER_FETCH_LOCAL_TIME_MS_999TH,BROKER_LOG_FLUSH_TIME_MS_50TH,BROKER_LOG_FLUSH_TIME_MS_999TH - - # The zk path to store failed broker information. - failed.brokers.zk.path=/CruiseControlBrokerList - - # Topic config provider class - topic.config.provider.class=com.linkedin.kafka.cruisecontrol.config.KafkaTopicConfigProvider - - # The cluster configurations for the KafkaTopicConfigProvider - cluster.configs.file=config/clusterConfigs.json - - # The maximum time in milliseconds to store the response and access details of a completed user task. - completed.user.task.retention.time.ms=21600000 - - # The maximum time in milliseconds to retain the demotion history of brokers. - demotion.history.retention.time.ms=86400000 - - # The maximum number of completed user tasks for which the response and access details will be cached. - max.cached.completed.user.tasks=100 - - # The maximum number of user tasks for concurrently running in async endpoints across all users. - max.active.user.tasks=5 - - # Enable self healing for all anomaly detectors, unless the particular anomaly detector is explicitly disabled - self.healing.enabled=true - - # Enable self healing for broker failure detector - #self.healing.broker.failure.enabled=true - - # Enable self healing for goal violation detector - #self.healing.goal.violation.enabled=true - - # Enable self healing for metric anomaly detector - #self.healing.metric.anomaly.enabled=true - - - # configurations for the webserver - # ================================ - - # HTTP listen port - webserver.http.port=9090 - - # HTTP listen address - webserver.http.address=0.0.0.0 - - # Whether CORS support is enabled for API or not - webserver.http.cors.enabled=false - - # Value for Access-Control-Allow-Origin - webserver.http.cors.origin=http://localhost:8080/ - - # Value for Access-Control-Request-Method - webserver.http.cors.allowmethods=OPTIONS,GET,POST - - # Headers that should be exposed to the Browser (Webapp) - # This is a special header that is used by the - # User Tasks subsystem and should be explicitly - # Enabled when CORS mode is used as part of the - # Admin Interface - webserver.http.cors.exposeheaders=User-Task-ID - - # REST API default prefix - # (dont forget the ending *) - webserver.api.urlprefix=/kafkacruisecontrol/* - - # Location where the Cruise Control frontend is deployed - webserver.ui.diskpath=./cruise-control-ui/dist/ - - # URL path prefix for UI - # (dont forget the ending *) - webserver.ui.urlprefix=/* - - # Time After which request is converted to Async - webserver.request.maxBlockTimeMs=10000 - - # Default Session Expiry Period - webserver.session.maxExpiryTimeMs=60000 - - # Session cookie path - webserver.session.path=/ - - # Server Access Logs - webserver.accesslog.enabled=true - - # Location of HTTP Request Logs - webserver.accesslog.path=access.log - - # HTTP Request Log retention days - webserver.accesslog.retention.days=14 - - capacityJBOD.json: |- - { - "brokerCapacities":[ - { - "brokerId": "-1", - "capacity": { - "DISK": {"/tmp/kafka-logs-1": "100000", "/tmp/kafka-logs-2": "100000", "/tmp/kafka-logs-3": "50000", - "/tmp/kafka-logs-4": "50000", "/tmp/kafka-logs-5": "150000", "/tmp/kafka-logs-6": "50000"}, - "CPU": "100", - "NW_IN": "10000", - "NW_OUT": "10000" - }, - "doc": "The default capacity for a broker with multiple logDirs each on a separate heterogeneous disk." - }, - { - "brokerId": "0", - "capacity": { - "DISK": {"/tmp/kafka-logs": "500000"}, - "CPU": "100", - "NW_IN": "50000", - "NW_OUT": "50000" - }, - "doc": "This overrides the capacity for broker 0. This broker is not a JBOD broker." - }, - { - "brokerId": "1", - "capacity": { - "DISK": {"/tmp/kafka-logs-1": "250000", "/tmp/kafka-logs-2": "250000"}, - "CPU": "100", - "NW_IN": "50000", - "NW_OUT": "50000" - }, - "doc": "This overrides the capacity for broker 1. This broker is a JBOD broker." - } - ] - } - - capacity.json: |- - { - "brokerCapacities":[ - { - "brokerId": "-1", - "capacity": { - "DISK": "100000", - "CPU": "100", - "NW_IN": "10000", - "NW_OUT": "10000" - }, - "doc": "This is the default capacity. Capacity unit used for disk is in MB, cpu is in percentage, network throughput is in KB." - }, - { - "brokerId": "0", - "capacity": { - "DISK": "500000", - "CPU": "100", - "NW_IN": "50000", - "NW_OUT": "50000" - }, - "doc": "This overrides the capacity for broker 0." - } - ] - } - - clusterConfigs.json: |- - { - "min.insync.replicas": 2 - } - - log4j2.xml: |- - - - - - - - - - - - - - - - log4j.properties: |- - log4j.rootLogger = INFO, FILE - - log4j.appender.FILE=org.apache.log4j.FileAppender - log4j.appender.FILE.File=/dev/stdout - - log4j.appender.FILE.layout=org.apache.log4j.PatternLayout - log4j.appender.FILE.layout.conversionPattern=%-6r [%15.15t] %-5p %30.30c %x - %m%n diff --git a/cruise-control/20kafka-broker-reporter-patch.yml b/cruise-control/20kafka-broker-reporter-patch.yml deleted file mode 100644 index f3305ecd..00000000 --- a/cruise-control/20kafka-broker-reporter-patch.yml +++ /dev/null @@ -1,25 +0,0 @@ -# meant to be applied using -# kubectl --namespace kafka patch statefulset kafka --patch "$(cat cruise-control/20kafka-broker-reporter-patch.yml)" -metadata: - name: kafka - namespace: kafka -spec: - template: - spec: - initContainers: - - name: cruise-control-reporter - image: solsson/kafka-cruise-control@sha256:c70eae329b4ececba58e8cf4fa6e774dd2e0205988d8e5be1a70e622fcc46716 - command: - - /bin/bash - - -cex - - | - cp -v /opt/cruise-control/cruise-control/build/dependant-libs/cruise-control-metrics-reporter.jar /opt/kafka/libs/extensions/cruise-control-metrics-reporter.jar - echo -e "\n\nmetric.reporters = com.linkedin.kafka.cruisecontrol.metricsreporter.CruiseControlMetricsReporter" | tee -a /etc/kafka/server.properties - volumeMounts: - - name: config - mountPath: /etc/kafka - - name: extensions - mountPath: /opt/kafka/libs/extensions - $setElementOrder/initContainers: - - name: init-config - - name: cruise-control-reporter diff --git a/cruise-control/40cruise-control-service.yml b/cruise-control/40cruise-control-service.yml deleted file mode 100644 index dcb8f243..00000000 --- a/cruise-control/40cruise-control-service.yml +++ /dev/null @@ -1,12 +0,0 @@ -kind: Service -apiVersion: v1 -metadata: - name: cruise-control - namespace: kafka -spec: - selector: - app: cruise-control - ports: - - protocol: TCP - port: 8090 - targetPort: 8090 diff --git a/cruise-control/50cruise-control.yml b/cruise-control/50cruise-control.yml deleted file mode 100644 index d2892b8c..00000000 --- a/cruise-control/50cruise-control.yml +++ /dev/null @@ -1,51 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: cruise-control - namespace: kafka -spec: - selector: - matchLabels: - app: cruise-control - replicas: 1 - template: - metadata: - labels: - app: cruise-control - annotations: - spec: - terminationGracePeriodSeconds: 30 - initContainers: - - name: init-config - image: busybox@sha256:2a03a6059f21e150ae84b0973863609494aad70f0a80eaeb64bddd8d92465812 - command: ['/bin/sh'] - args: [ '-c', 'cp /etc/cruise-control-configmap/* /opt/cruise-control/config'] - volumeMounts: - - name: configmap - mountPath: /etc/cruise-control-configmap - - name: config - mountPath: /opt/cruise-control/config - containers: - - name: cruise-control - image: solsson/kafka-cruise-control@sha256:2c2d113ec3d960bfa75e8e51d1fed6a2f2818e329990f61c895e611403ba64d0 - imagePullPolicy: IfNotPresent - ports: - - name: api - containerPort: 8090 - resources: - requests: - cpu: 100m - memory: 512Mi - readinessProbe: - tcpSocket: - port: 8090 - timeoutSeconds: 1 - volumeMounts: - - name: config - mountPath: /opt/cruise-control/config - volumes: - - name: configmap - configMap: - name: broker-cruise-control-config - - name: config - emptyDir: {} diff --git a/cruise-control/README.md b/cruise-control/README.md deleted file mode 100644 index c27ba2f1..00000000 --- a/cruise-control/README.md +++ /dev/null @@ -1,26 +0,0 @@ -## Cruise Control - -Cruise Control is used to automate the dynamic workload rebalance and self-healing of a Kafka cluster. This tool will allow you to add, replace or remove nodes and the cluster will be automatically adjusted. Partitions will be rebalanced based on resource usage of CPU, network, disk, etc. - -*Disclaimer*: It is important to understand Cruise Control will modify the Kafka cluster without operator intervention. Bugs or misconfiguration may cause loss of data or denial of service. You bear the responsibility of configuring and testing properly and taking precautions based on the importance of your data. - -### Configuration - -There are several configuration files that need to be mounted in `/opt/cruise-control/config`. The files in `11cruise-control-config.yml` are the defaults from [the Cruise Control GitHub repo, migrate_to_kafka_2_0 branch](https://github.com/linkedin/cruise-control/tree/migrate_to_kafka_2_0/config). The significant modification from the GitHub repo is that self healing has been enabled using `self.healing.enabled=true`. - -Following are the files in `11cruise-control-config.yml`. Nearly all changes you would make are in `cruisecontrol.properties`. - -- cruisecontrol.properties -- capacityJBOD.json -- capacity.json -- clusterConfigs.json -- log4j2.xml -- log4j.properties - -### Patching - -Cruise control requires broker metrics to make informed decisions. Each broker runs a metric collector that pushes metrics into a topic, by default named `__CruiseControlMetrics`. Configuring the collector requires patching the broker StatefulSet. An example command to apply this patch is below. - -```shell -$ kubectl --namespace kafka patch statefulset kafka --patch "$(cat cruise-control/20kafka-broker-reporter-patch.yml)" -``` diff --git a/cruise-control/topic-create.yml b/cruise-control/topic-create.yml deleted file mode 100644 index c8730c15..00000000 --- a/cruise-control/topic-create.yml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: topic-cruise-control-metrics - namespace: kafka -spec: - template: - spec: - containers: - - name: topic-create - image: solsson/kafka:native-cli@sha256:fbf29c59182fb87921c5199783d2d5796856ecbfe34a9c03eca658b3cf50f3c4 - command: - - ./bin/kafka-topics.sh - - --zookeeper - - zookeeper.kafka.svc.cluster.local:2181 - - --create - - --if-not-exists - - --topic - - __CruiseControlMetrics - resources: - limits: - cpu: 100m - memory: 20Mi - restartPolicy: Never diff --git a/events-kube/README.md b/events-kube/README.md deleted file mode 100644 index e2eff305..00000000 --- a/events-kube/README.md +++ /dev/null @@ -1,5 +0,0 @@ -## Kubernetes event streaming - -This is meant as input for Ops work and automation. - -Using https://github.com/heptiolabs/eventrouter with Kafka sink. diff --git a/events-kube/config.yml b/events-kube/config.yml deleted file mode 100644 index 2f40f15f..00000000 --- a/events-kube/config.yml +++ /dev/null @@ -1,16 +0,0 @@ -kind: ConfigMap -apiVersion: v1 -metadata: - name: eventrouter-cm - namespace: kube-system - annotations: - origin: github.com/Yolean/kubernetes-kafka -data: - config.json: |- - { - "sink": "kafka", - "kafkaBrokers": "bootstrap.kafka:9092", - "kafkaTopic": "ops.kube-events.stream.json", - "kafkaAsync": false, - "kafkaRetryMax": 5 - } diff --git a/events-kube/events-kube-kafka.yml b/events-kube/events-kube-kafka.yml deleted file mode 100644 index 6a90e667..00000000 --- a/events-kube/events-kube-kafka.yml +++ /dev/null @@ -1,47 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: eventrouter - namespace: kube-system - labels: - app: eventrouter - annotations: - origin: github.com/Yolean/kubernetes-kafka -spec: - replicas: 1 - strategy: - type: RollingUpdate - rollingUpdate: - # prefer duplicate events over missed - maxUnavailable: 0 - maxSurge: 1 - selector: - matchLabels: - app: eventrouter - template: - metadata: - labels: - app: eventrouter - tier: control-plane-addons - annotations: - prometheus.io/scrape: "true" - prometheus.io/port: "8080" - spec: - serviceAccount: eventrouter - containers: - - name: kube-eventrouter - image: gcr.io/heptio-images/eventrouter@sha256:e613b48c6235426fa334867d661118322b4e1973c23e2e8cf5c066b982cc8596 - resources: - requests: - memory: "5Mi" - cpu: "2m" - limits: - memory: "20Mi" - cpu: "100m" - volumeMounts: - - name: config-volume - mountPath: /etc/eventrouter - volumes: - - name: config-volume - configMap: - name: eventrouter-cm diff --git a/events-kube/rbac/cluster-events-watcher.yml b/events-kube/rbac/cluster-events-watcher.yml deleted file mode 100644 index 3d9ef4d7..00000000 --- a/events-kube/rbac/cluster-events-watcher.yml +++ /dev/null @@ -1,39 +0,0 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: eventrouter - namespace: kube-system - annotations: - origin: github.com/Yolean/kubernetes-kafka ---- -kind: ClusterRole -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: eventrouter - annotations: - origin: github.com/Yolean/kubernetes-kafka -rules: -- apiGroups: - - "" - resources: - - events - verbs: - - get - - watch - - list ---- -kind: ClusterRoleBinding -apiVersion: rbac.authorization.k8s.io/v1 -metadata: - name: eventrouter - annotations: - origin: github.com/Yolean/kubernetes-kafka -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: eventrouter -subjects: -- kind: ServiceAccount - name: eventrouter - namespace: kube-system diff --git a/events-kube/topic-create.yaml b/events-kube/topic-create.yaml deleted file mode 100644 index 2cf4f062..00000000 --- a/events-kube/topic-create.yaml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: topic-create-event-kube - namespace: kafka -spec: - template: - spec: - containers: - - name: topic-create - image: solsson/kafka:native-cli@sha256:fbf29c59182fb87921c5199783d2d5796856ecbfe34a9c03eca658b3cf50f3c4 - command: - - ./bin/kafka-topics.sh - - --zookeeper - - zookeeper.kafka.svc.cluster.local:2181 - - --create - - --if-not-exists - - --topic - - ops.kube-events.stream.json - resources: - limits: - cpu: 100m - memory: 20Mi - restartPolicy: Never diff --git a/native/distroless.yaml b/native/distroless.yaml deleted file mode 100644 index e00b5205..00000000 --- a/native/distroless.yaml +++ /dev/null @@ -1,9 +0,0 @@ -# The more specific removes are to make sure that there was a shell that we're removing -- op: remove - path: /spec/template/spec/containers/0/readinessProbe/exec -- op: remove - path: /spec/template/spec/containers/0/readinessProbe -- op: remove - path: /spec/template/spec/containers/0/lifecycle/preStop/exec -- op: remove - path: /spec/template/spec/containers/0/lifecycle/preStop diff --git a/native/kustomization.yaml b/native/kustomization.yaml deleted file mode 100644 index 6a489db7..00000000 --- a/native/kustomization.yaml +++ /dev/null @@ -1,17 +0,0 @@ -bases: -- ../nonroot -patchesStrategicMerge: -- native-image-zookeeper.yaml -patchesJson6902: -- target: - group: apps - version: v1 - kind: StatefulSet - name: pzoo - path: distroless.yaml -- target: - group: apps - version: v1 - kind: StatefulSet - name: zoo - path: distroless.yaml \ No newline at end of file diff --git a/native/native-image-zookeeper.yaml b/native/native-image-zookeeper.yaml deleted file mode 100644 index 5d38e65d..00000000 --- a/native/native-image-zookeeper.yaml +++ /dev/null @@ -1,29 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo -spec: - template: - spec: - containers: - - name: zookeeper - image: solsson/kafka:native-zookeeper-server-start@sha256:ba3a0632240b8906a3b5bb6441e98ad9d9de73cb716b156ca68f1b435c819e8b - resources: - requests: - cpu: 10m - memory: 25Mi ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo -spec: - template: - spec: - containers: - - name: zookeeper - image: solsson/kafka:native-zookeeper-server-start@sha256:ba3a0632240b8906a3b5bb6441e98ad9d9de73cb716b156ca68f1b435c819e8b - resources: - requests: - cpu: 10m - memory: 25Mi diff --git a/nonroot/entrypoint-from-image.yaml b/nonroot/entrypoint-from-image.yaml deleted file mode 100644 index fe110c35..00000000 --- a/nonroot/entrypoint-from-image.yaml +++ /dev/null @@ -1,2 +0,0 @@ -- op: remove - path: /spec/template/spec/containers/0/command diff --git a/nonroot/fsgroup-65534.yaml b/nonroot/fsgroup-65534.yaml deleted file mode 100644 index 5ebd6cba..00000000 --- a/nonroot/fsgroup-65534.yaml +++ /dev/null @@ -1,4 +0,0 @@ -- op: add - path: /spec/template/spec/securityContext - value: - fsGroup: 65534 diff --git a/nonroot/kustomization.yaml b/nonroot/kustomization.yaml deleted file mode 100644 index a3526dfb..00000000 --- a/nonroot/kustomization.yaml +++ /dev/null @@ -1,44 +0,0 @@ -bases: -- ../rbac-namespace-default -- ../kafka -- ../zookeeper -patchesStrategicMerge: -- nonroot-image-kafka.yaml -- nonroot-image-zookeeper.yaml -patchesJson6902: -- target: - group: apps - version: v1 - kind: StatefulSet - name: kafka - path: fsgroup-65534.yaml -- target: - group: apps - version: v1 - kind: StatefulSet - name: pzoo - path: fsgroup-65534.yaml -- target: - group: apps - version: v1 - kind: StatefulSet - name: zoo - path: fsgroup-65534.yaml -- target: - group: apps - version: v1 - kind: StatefulSet - name: kafka - path: entrypoint-from-image.yaml -- target: - group: apps - version: v1 - kind: StatefulSet - name: pzoo - path: entrypoint-from-image.yaml -- target: - group: apps - version: v1 - kind: StatefulSet - name: zoo - path: entrypoint-from-image.yaml diff --git a/nonroot/nonroot-image-kafka.yaml b/nonroot/nonroot-image-kafka.yaml deleted file mode 100644 index 8d124f4b..00000000 --- a/nonroot/nonroot-image-kafka.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: kafka -spec: - template: - spec: - initContainers: - - name: init-config - image: solsson/kafka:initutils-nonroot@sha256:8988aca5b34feabe8d7d4e368f74b2ede398f692c7e99a38b262a938d475812c - containers: - - name: broker - image: solsson/kafka:2.5.1-kafka-server-start@sha256:e4d34530e500eb9724e7778595f4cb244df228336090bb7ed5f7dd7433d4d41d - args: - - /etc/kafka/server.properties.$(POD_NAME) diff --git a/nonroot/nonroot-image-zookeeper.yaml b/nonroot/nonroot-image-zookeeper.yaml deleted file mode 100644 index 880a33d2..00000000 --- a/nonroot/nonroot-image-zookeeper.yaml +++ /dev/null @@ -1,31 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo -spec: - template: - spec: - initContainers: - - name: init-config - image: solsson/kafka:initutils-nonroot@sha256:8988aca5b34feabe8d7d4e368f74b2ede398f692c7e99a38b262a938d475812c - containers: - - name: zookeeper - image: solsson/kafka:2.5.1-zookeeper-server-start@sha256:b3af82c547b8188fa303520901eee6a526c6e34d87cfd78c1569a3a2c96ad5cd - args: - - /etc/kafka/zookeeper.properties.scale-$(REPLICAS).$(POD_NAME) ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo -spec: - template: - spec: - initContainers: - - name: init-config - image: solsson/kafka:initutils-nonroot@sha256:8988aca5b34feabe8d7d4e368f74b2ede398f692c7e99a38b262a938d475812c - containers: - - name: zookeeper - image: solsson/kafka:2.5.1-zookeeper-server-start@sha256:b3af82c547b8188fa303520901eee6a526c6e34d87cfd78c1569a3a2c96ad5cd - args: - - /etc/kafka/zookeeper.properties.scale-$(REPLICAS).$(POD_NAME) diff --git a/pixy/Kustomization b/pixy/Kustomization deleted file mode 100644 index 17eaa10f..00000000 --- a/pixy/Kustomization +++ /dev/null @@ -1,3 +0,0 @@ -resources: -- pixy-service.yml -- pixy.yml diff --git a/pixy/pixy-service.yml b/pixy/pixy-service.yml deleted file mode 100644 index c15b7379..00000000 --- a/pixy/pixy-service.yml +++ /dev/null @@ -1,12 +0,0 @@ -kind: Service -apiVersion: v1 -metadata: - name: pixy - namespace: kafka -spec: - selector: - app: pixy - ports: - - name: web - protocol: TCP - port: 80 diff --git a/pixy/pixy.yml b/pixy/pixy.yml deleted file mode 100644 index 190195b4..00000000 --- a/pixy/pixy.yml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: pixy - namespace: kafka - labels: - app: pixy -spec: - replicas: 2 - selector: - matchLabels: - app: pixy - template: - metadata: - labels: - app: pixy - spec: - containers: - - name: pixy - image: mailgun/kafka-pixy:0.17.0@sha256:0b5f4795c0b0d80729fa7415ec70ae4d411e152c6149656dddf01b18184792e0 - ports: - - containerPort: 80 - command: - - kafka-pixy - - -kafkaPeers - - bootstrap.kafka:9092 - - -zookeeperPeers - - zookeeper.kafka:2181 - - -tcpAddr - - 0.0.0.0:80 diff --git a/yahoo-kafka-manager/Kustomization b/yahoo-kafka-manager/Kustomization deleted file mode 100644 index d32e595c..00000000 --- a/yahoo-kafka-manager/Kustomization +++ /dev/null @@ -1,3 +0,0 @@ -resources: -- kafka-manager-service.yml -- kafka-manager.yml diff --git a/yahoo-kafka-manager/kafka-manager-service.yml b/yahoo-kafka-manager/kafka-manager-service.yml deleted file mode 100644 index 3d26adf3..00000000 --- a/yahoo-kafka-manager/kafka-manager-service.yml +++ /dev/null @@ -1,12 +0,0 @@ -kind: Service -apiVersion: v1 -metadata: - name: kafka-manager - namespace: kafka -spec: - selector: - app: kafka-manager - ports: - - protocol: TCP - port: 80 - targetPort: 80 diff --git a/yahoo-kafka-manager/kafka-manager.yml b/yahoo-kafka-manager/kafka-manager.yml deleted file mode 100644 index 16eea0b0..00000000 --- a/yahoo-kafka-manager/kafka-manager.yml +++ /dev/null @@ -1,26 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kafka-manager - namespace: kafka -spec: - replicas: 1 - selector: - matchLabels: - app: kafka-manager - template: - metadata: - labels: - app: kafka-manager - spec: - containers: - - name: kafka-manager - image: solsson/kafka-manager@sha256:9da595ecbb733074a1d3c6091a1e0c384da4f4e1f19f4e16276062278da8e592 - ports: - - containerPort: 80 - env: - - name: ZK_HOSTS - value: zookeeper.kafka:2181 - command: - - ./bin/kafka-manager - - -Dhttp.port=80 \ No newline at end of file From 67e222c6ff99b17fc8bd159d8b27fa98bad27e6a Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Mon, 18 Jul 2022 08:59:02 +0000 Subject: [PATCH 03/12] change to my version of broker and zookeeper set --- kafka/Kustomization | 8 +- ...trap-service.yml => bootstrap-service.yml} | 0 ...{10broker-config.yml => broker-config.yml} | 0 kafka/{20dns.yml => dns.yml} | 0 kafka/{50kafka.yml => kafka.yml} | 2 +- 00-namespace.yml => namespace.yml | 0 ...patch.yml => kafka-jmx-exporter-patch.yml} | 0 prometheus/kustomization.yaml | 4 +- ...-metrics-config.yml => metrics-config.yml} | 0 zookeeper/21zoo-service.yml | 15 --- zookeeper/51zoo.yml | 102 ------------------ zookeeper/Kustomization | 10 +- zookeeper/{30service.yml => dns.yml} | 0 .../{20pzoo-service.yml => pzoo-service.yml} | 0 zookeeper/{50pzoo.yml => pzoo.yml} | 2 +- ...{10zookeeper-config.yml => zoo-config.yml} | 13 ++- 16 files changed, 20 insertions(+), 136 deletions(-) rename kafka/{30bootstrap-service.yml => bootstrap-service.yml} (100%) rename kafka/{10broker-config.yml => broker-config.yml} (100%) rename kafka/{20dns.yml => dns.yml} (100%) rename kafka/{50kafka.yml => kafka.yml} (99%) rename 00-namespace.yml => namespace.yml (100%) rename prometheus/{50-kafka-jmx-exporter-patch.yml => kafka-jmx-exporter-patch.yml} (100%) rename prometheus/{10-metrics-config.yml => metrics-config.yml} (100%) delete mode 100644 zookeeper/21zoo-service.yml delete mode 100644 zookeeper/51zoo.yml rename zookeeper/{30service.yml => dns.yml} (100%) rename zookeeper/{20pzoo-service.yml => pzoo-service.yml} (100%) rename zookeeper/{50pzoo.yml => pzoo.yml} (99%) rename zookeeper/{10zookeeper-config.yml => zoo-config.yml} (86%) diff --git a/kafka/Kustomization b/kafka/Kustomization index 0590b6b5..351a643f 100644 --- a/kafka/Kustomization +++ b/kafka/Kustomization @@ -1,5 +1,5 @@ resources: -- 10broker-config.yml -- 20dns.yml -- 30bootstrap-service.yml -- 50kafka.yml +- broker-config.yml +- dns.yml +- bootstrap-service.yml +- kafka.yml diff --git a/kafka/30bootstrap-service.yml b/kafka/bootstrap-service.yml similarity index 100% rename from kafka/30bootstrap-service.yml rename to kafka/bootstrap-service.yml diff --git a/kafka/10broker-config.yml b/kafka/broker-config.yml similarity index 100% rename from kafka/10broker-config.yml rename to kafka/broker-config.yml diff --git a/kafka/20dns.yml b/kafka/dns.yml similarity index 100% rename from kafka/20dns.yml rename to kafka/dns.yml diff --git a/kafka/50kafka.yml b/kafka/kafka.yml similarity index 99% rename from kafka/50kafka.yml rename to kafka/kafka.yml index c04d162a..0a15d729 100644 --- a/kafka/50kafka.yml +++ b/kafka/kafka.yml @@ -8,7 +8,7 @@ spec: matchLabels: app: kafka serviceName: "kafka" - replicas: 5 + replicas: 7 updateStrategy: type: RollingUpdate podManagementPolicy: Parallel diff --git a/00-namespace.yml b/namespace.yml similarity index 100% rename from 00-namespace.yml rename to namespace.yml diff --git a/prometheus/50-kafka-jmx-exporter-patch.yml b/prometheus/kafka-jmx-exporter-patch.yml similarity index 100% rename from prometheus/50-kafka-jmx-exporter-patch.yml rename to prometheus/kafka-jmx-exporter-patch.yml diff --git a/prometheus/kustomization.yaml b/prometheus/kustomization.yaml index 81c00fed..21eac282 100644 --- a/prometheus/kustomization.yaml +++ b/prometheus/kustomization.yaml @@ -4,6 +4,6 @@ bases: #- ../kafka #- ../variants/scale-1 resources: -- 10-metrics-config.yml +- metrics-config.yml patchesStrategicMerge: -- 50-kafka-jmx-exporter-patch.yml +- kafka-jmx-exporter-patch.yml diff --git a/prometheus/10-metrics-config.yml b/prometheus/metrics-config.yml similarity index 100% rename from prometheus/10-metrics-config.yml rename to prometheus/metrics-config.yml diff --git a/zookeeper/21zoo-service.yml b/zookeeper/21zoo-service.yml deleted file mode 100644 index 53beaeb7..00000000 --- a/zookeeper/21zoo-service.yml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: zoo - namespace: kafka -spec: - ports: - - port: 2888 - name: peer - - port: 3888 - name: leader-election - clusterIP: None - selector: - app: zookeeper - storage: persistent-regional diff --git a/zookeeper/51zoo.yml b/zookeeper/51zoo.yml deleted file mode 100644 index 52e930f7..00000000 --- a/zookeeper/51zoo.yml +++ /dev/null @@ -1,102 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo - namespace: kafka -spec: - selector: - matchLabels: - app: zookeeper - storage: persistent-regional - serviceName: "zoo" - replicas: 2 - updateStrategy: - type: RollingUpdate - podManagementPolicy: Parallel - template: - metadata: - labels: - app: zookeeper - storage: persistent-regional - annotations: - spec: - terminationGracePeriodSeconds: 10 - initContainers: - - name: init-config - image: solsson/kafka:initutils@sha256:8988aca5b34feabe8d7d4e368f74b2ede398f692c7e99a38b262a938d475812c - command: ['/bin/bash', '/etc/kafka-configmap/init.sh'] - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: ID_OFFSET - value: "4" - volumeMounts: - - name: configmap - mountPath: /etc/kafka-configmap - - name: config - mountPath: /etc/kafka - - name: data - mountPath: /var/lib/zookeeper - containers: - - name: zookeeper - image: solsson/kafka:2.5.1@sha256:5c52620bd8e1bcd47805eb8ca285843168e1684aa27f1ae11ce330c3e12f6b0c - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: KAFKA_LOG4J_OPTS - value: -Dlog4j.configuration=file:/etc/kafka/log4j.properties - command: - - ./bin/zookeeper-server-start.sh - - /etc/kafka/zookeeper.properties.scale-5.$(POD_NAME) - lifecycle: - preStop: - exec: - command: ["sh", "-ce", "kill -s TERM 1; while $(kill -0 1 2>/dev/null); do sleep 1; done"] - ports: - - containerPort: 2181 - name: client - - containerPort: 2888 - name: peer - - containerPort: 3888 - name: leader-election - resources: - requests: - cpu: 10m - memory: 100Mi - limits: - memory: 120Mi - readinessProbe: - exec: - command: - - /bin/sh - - -c - - '[ "imok" = "$(echo ruok | nc -w 1 -q 1 127.0.0.1 2181)" ]' - timeoutSeconds: 2 - periodSeconds: 30 - volumeMounts: - - name: config - mountPath: /etc/kafka - - name: data - mountPath: /var/lib/zookeeper - volumes: - - name: configmap - configMap: - name: zookeeper-config - - name: config - emptyDir: {} - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: [ "ReadWriteOnce" ] - resources: - requests: - storage: 1Gi diff --git a/zookeeper/Kustomization b/zookeeper/Kustomization index 978b228a..16255850 100644 --- a/zookeeper/Kustomization +++ b/zookeeper/Kustomization @@ -1,7 +1,5 @@ resources: -- 10zookeeper-config.yml -- 20pzoo-service.yml -- 21zoo-service.yml -- 30service.yml -- 50pzoo.yml -- 51zoo.yml +- zoo-config.yml +- pzoo-service.yml +- dns.yml +- pzoo.yml diff --git a/zookeeper/30service.yml b/zookeeper/dns.yml similarity index 100% rename from zookeeper/30service.yml rename to zookeeper/dns.yml diff --git a/zookeeper/20pzoo-service.yml b/zookeeper/pzoo-service.yml similarity index 100% rename from zookeeper/20pzoo-service.yml rename to zookeeper/pzoo-service.yml diff --git a/zookeeper/50pzoo.yml b/zookeeper/pzoo.yml similarity index 99% rename from zookeeper/50pzoo.yml rename to zookeeper/pzoo.yml index cdaae36b..e79a973e 100644 --- a/zookeeper/50pzoo.yml +++ b/zookeeper/pzoo.yml @@ -9,7 +9,7 @@ spec: app: zookeeper storage: persistent serviceName: "pzoo" - replicas: 3 + replicas: 5 updateStrategy: type: RollingUpdate podManagementPolicy: Parallel diff --git a/zookeeper/10zookeeper-config.yml b/zookeeper/zoo-config.yml similarity index 86% rename from zookeeper/10zookeeper-config.yml rename to zookeeper/zoo-config.yml index 57728474..a48eb1d6 100644 --- a/zookeeper/10zookeeper-config.yml +++ b/zookeeper/zoo-config.yml @@ -19,15 +19,15 @@ data: for N in $(seq $PZOO_REPLICAS); do echo "server.$N=pzoo-$(( $N - 1 )).pzoo.$POD_NAMESPACE.svc.cluster.local:2888:3888:participant" >> /etc/kafka/zookeeper.properties; done for N in $(seq $(( $REPLICAS - $PZOO_REPLICAS ))); do echo "server.$(( $PZOO_REPLICAS + $N ))=zoo-$(( $N - 1 )).zoo.$POD_NAMESPACE.svc.cluster.local:2888:3888:participant" >> /etc/kafka/zookeeper.properties; done } - ln -s /etc/kafka/zookeeper.properties /etc/kafka/zookeeper.properties.scale-$REPLICAS.$POD_NAME + ln -s /etc/kafka/zookeeper.properties /etc/kafka/zookeeper.properties.scale-5.$POD_NAME zookeeper.properties: | - 4lw.commands.whitelist=ruok + 4lw.commands.whitelist=stat,ruok,conf,isro,mntr tickTime=2000 dataDir=/var/lib/zookeeper/data dataLogDir=/var/lib/zookeeper/log clientPort=2181 - maxClientCnxns=3 + maxClientCnxns=0 initLimit=5 syncLimit=2 tcpKeepAlive=true @@ -35,8 +35,11 @@ data: server.1=pzoo-0.pzoo:2888:3888:participant server.2=pzoo-1.pzoo:2888:3888:participant server.3=pzoo-2.pzoo:2888:3888:participant - server.4=zoo-0.zoo:2888:3888:participant - server.5=zoo-1.zoo:2888:3888:participant + server.4=pzoo-3.pzoo:2888:3888:participant + server.5=pzoo-4.pzoo:2888:3888:participant + admin.enableServer=false + autopurge.snapRetainCount=3 + autopurge.purgeInterval=24 log4j.properties: |- log4j.rootLogger=INFO, stdout From 9832df4007d5ff84fc6f2164e30214ac4a061b30 Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Tue, 19 Jul 2022 00:43:17 +0000 Subject: [PATCH 04/12] chagne burrow-config --- linkedin-burrow/burrow-config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/linkedin-burrow/burrow-config.yml b/linkedin-burrow/burrow-config.yml index 5fe6dda1..548ab285 100644 --- a/linkedin-burrow/burrow-config.yml +++ b/linkedin-burrow/burrow-config.yml @@ -6,27 +6,27 @@ apiVersion: v1 data: burrow.toml: |- [zookeeper] - servers=[ "zookeeper:2181" ] + servers=[ "pzoo-0:2181", "pzoo-1:2181","pzoo-2:2181","pzoo-3:2181","pzoo-4:2181" ] timeout=6 root-path="/burrow" [cluster.local] class-name="kafka" - servers=[ "kafka-0.broker:9092", "kafka-1.broker:9092", "kafka-2.broker:9092" ] + servers=[ "kafka-0", "kafka-1:9092", "kafka-2:9092", "kafka-3:9092", "kafka-4:9092", "kafka-5:9092", "kafka-6:9092" ] topic-refresh=60 offset-refresh=30 [consumer.local] class-name="kafka" cluster="local" - servers=[ "kafka-0.broker:9092", "kafka-1.broker:9092", "kafka-2.broker:9092" ] + servers=[ "kafka-0:9092", "kafka-1:9092", "kafka-2:9092", "kafka-3:9092", "kafka-4:9092", "kafka-5:9092", "kafka-6:9092" ] group-blacklist="" group-whitelist="" [consumer.local_zk] class-name="kafka_zk" cluster="local" - servers=[ "zookeeper:2181" ] + servers=[ pzoo-0:2181", "pzoo-1:2181","pzoo-2:2181","pzoo-3:2181","pzoo-4:2181" ] zookeeper-path="/local" zookeeper-timeout=30 group-blacklist="" From 18d14a11576ddaa257482b6057967e0fba1a920e Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Tue, 19 Jul 2022 04:58:19 +0000 Subject: [PATCH 05/12] working burrow --- linkedin-burrow/burrow-config.yml | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/linkedin-burrow/burrow-config.yml b/linkedin-burrow/burrow-config.yml index 548ab285..f1d41ceb 100644 --- a/linkedin-burrow/burrow-config.yml +++ b/linkedin-burrow/burrow-config.yml @@ -6,31 +6,27 @@ apiVersion: v1 data: burrow.toml: |- [zookeeper] - servers=[ "pzoo-0:2181", "pzoo-1:2181","pzoo-2:2181","pzoo-3:2181","pzoo-4:2181" ] + servers=[ "zookeeper:2181" ] timeout=6 root-path="/burrow" - [cluster.local] class-name="kafka" - servers=[ "kafka-0", "kafka-1:9092", "kafka-2:9092", "kafka-3:9092", "kafka-4:9092", "kafka-5:9092", "kafka-6:9092" ] + servers=[ "bootstrap:9092" ] topic-refresh=60 offset-refresh=30 - [consumer.local] class-name="kafka" cluster="local" - servers=[ "kafka-0:9092", "kafka-1:9092", "kafka-2:9092", "kafka-3:9092", "kafka-4:9092", "kafka-5:9092", "kafka-6:9092" ] + servers=[ "bootstrap:9092" ] group-blacklist="" group-whitelist="" - [consumer.local_zk] class-name="kafka_zk" cluster="local" - servers=[ pzoo-0:2181", "pzoo-1:2181","pzoo-2:2181","pzoo-3:2181","pzoo-4:2181" ] + servers=[ "zookeeper:2181" ] zookeeper-path="/local" zookeeper-timeout=30 group-blacklist="" group-whitelist="" - [httpserver.default] address=":8000" From f68ae6d5b5e1b445da879634aa46db7ba9901680 Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Tue, 19 Jul 2022 05:01:44 +0000 Subject: [PATCH 06/12] add CMAK --- kubernetes-kafka | 1 + yahoo-kafka-manager/Kustomization | 3 +++ yahoo-kafka-manager/kafka-manager-service.yml | 12 +++++++++ yahoo-kafka-manager/kafka-manager.yml | 26 +++++++++++++++++++ 4 files changed, 42 insertions(+) create mode 160000 kubernetes-kafka create mode 100644 yahoo-kafka-manager/Kustomization create mode 100644 yahoo-kafka-manager/kafka-manager-service.yml create mode 100644 yahoo-kafka-manager/kafka-manager.yml diff --git a/kubernetes-kafka b/kubernetes-kafka new file mode 160000 index 00000000..9832df40 --- /dev/null +++ b/kubernetes-kafka @@ -0,0 +1 @@ +Subproject commit 9832df4007d5ff84fc6f2164e30214ac4a061b30 diff --git a/yahoo-kafka-manager/Kustomization b/yahoo-kafka-manager/Kustomization new file mode 100644 index 00000000..d32e595c --- /dev/null +++ b/yahoo-kafka-manager/Kustomization @@ -0,0 +1,3 @@ +resources: +- kafka-manager-service.yml +- kafka-manager.yml diff --git a/yahoo-kafka-manager/kafka-manager-service.yml b/yahoo-kafka-manager/kafka-manager-service.yml new file mode 100644 index 00000000..3d26adf3 --- /dev/null +++ b/yahoo-kafka-manager/kafka-manager-service.yml @@ -0,0 +1,12 @@ +kind: Service +apiVersion: v1 +metadata: + name: kafka-manager + namespace: kafka +spec: + selector: + app: kafka-manager + ports: + - protocol: TCP + port: 80 + targetPort: 80 diff --git a/yahoo-kafka-manager/kafka-manager.yml b/yahoo-kafka-manager/kafka-manager.yml new file mode 100644 index 00000000..16eea0b0 --- /dev/null +++ b/yahoo-kafka-manager/kafka-manager.yml @@ -0,0 +1,26 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kafka-manager + namespace: kafka +spec: + replicas: 1 + selector: + matchLabels: + app: kafka-manager + template: + metadata: + labels: + app: kafka-manager + spec: + containers: + - name: kafka-manager + image: solsson/kafka-manager@sha256:9da595ecbb733074a1d3c6091a1e0c384da4f4e1f19f4e16276062278da8e592 + ports: + - containerPort: 80 + env: + - name: ZK_HOSTS + value: zookeeper.kafka:2181 + command: + - ./bin/kafka-manager + - -Dhttp.port=80 \ No newline at end of file From 9bc4084ad2e7ce6dcb5802557f389b9b5892442c Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Tue, 19 Jul 2022 05:04:47 +0000 Subject: [PATCH 07/12] delete kubernetes-kafka --- kubernetes-kafka | 1 - 1 file changed, 1 deletion(-) delete mode 160000 kubernetes-kafka diff --git a/kubernetes-kafka b/kubernetes-kafka deleted file mode 160000 index 9832df40..00000000 --- a/kubernetes-kafka +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9832df4007d5ff84fc6f2164e30214ac4a061b30 From f3b60cbc55bf375f4866b9f5ed43da0ca8e469bc Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Wed, 20 Jul 2022 12:36:16 +0000 Subject: [PATCH 08/12] add node-exporter --- consumers-prometheus/kminion-service.yaml | 3 + kafka/Kustomization | 4 +- kafka/bootstrap-service.yml | 11 - kafka/dns.yml | 17 - linkedin-burrow/burrow-service.yml | 4 + monitoring/manifests-all.yaml | 2913 +++++++++++++++++ outside-services/outside-0.yml | 15 - outside-services/outside-1.yml | 4 +- prometheus/kafka-jmx-exporter-svc.yml | 14 + prometheus/metrics-config.yml | 3 +- prometheus/node-exporter-daemonset.yml | 53 + prometheus/node-exporter-svc.yml | 18 + yahoo-kafka-manager/kafka-manager-service.yml | 2 + yahoo-kafka-manager/kafka-manager.yml | 4 +- zookeeper/Kustomization | 4 +- zookeeper/dns.yml | 12 - zookeeper/pzoo-service.yml | 15 - 17 files changed, 3017 insertions(+), 79 deletions(-) delete mode 100644 kafka/bootstrap-service.yml delete mode 100644 kafka/dns.yml create mode 100644 monitoring/manifests-all.yaml delete mode 100644 outside-services/outside-0.yml create mode 100644 prometheus/kafka-jmx-exporter-svc.yml create mode 100644 prometheus/node-exporter-daemonset.yml create mode 100644 prometheus/node-exporter-svc.yml delete mode 100644 zookeeper/dns.yml delete mode 100644 zookeeper/pzoo-service.yml diff --git a/consumers-prometheus/kminion-service.yaml b/consumers-prometheus/kminion-service.yaml index 04bb8d9f..f0abb551 100644 --- a/consumers-prometheus/kminion-service.yaml +++ b/consumers-prometheus/kminion-service.yaml @@ -6,7 +6,10 @@ metadata: labels: &labels app: kminion spec: + type: NodePort selector: *labels ports: - name: http port: 8080 + targetPort: 8080 + nodePort: 30077 diff --git a/kafka/Kustomization b/kafka/Kustomization index 351a643f..27a9d9fe 100644 --- a/kafka/Kustomization +++ b/kafka/Kustomization @@ -1,5 +1,5 @@ resources: - broker-config.yml -- dns.yml -- bootstrap-service.yml +- kafka-headless.yml +- bootstrap-svc.yml - kafka.yml diff --git a/kafka/bootstrap-service.yml b/kafka/bootstrap-service.yml deleted file mode 100644 index 7c2a3376..00000000 --- a/kafka/bootstrap-service.yml +++ /dev/null @@ -1,11 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: bootstrap - namespace: kafka -spec: - ports: - - port: 9092 - selector: - app: kafka diff --git a/kafka/dns.yml b/kafka/dns.yml deleted file mode 100644 index 3a74b708..00000000 --- a/kafka/dns.yml +++ /dev/null @@ -1,17 +0,0 @@ -# A headless service to create DNS records ---- -apiVersion: v1 -kind: Service -metadata: - name: kafka - namespace: kafka - annotations: - service.alpha.kubernetes.io/tolerate-unready-endpoints: "true" -spec: - publishNotReadyAddresses: true - ports: - - port: 9092 - # [podname].broker.kafka.svc.cluster.local - clusterIP: None - selector: - app: kafka diff --git a/linkedin-burrow/burrow-service.yml b/linkedin-burrow/burrow-service.yml index 15eac06a..13c5d6a0 100644 --- a/linkedin-burrow/burrow-service.yml +++ b/linkedin-burrow/burrow-service.yml @@ -4,15 +4,19 @@ metadata: name: burrow namespace: kafka spec: + type: NodePort selector: app: burrow ports: - name: web protocol: TCP port: 80 + nodePort: 30031 - name: api protocol: TCP port: 8000 + nodePort: 30032 - name: prometheus protocol: TCP port: 8080 + nodePort: 30033 diff --git a/monitoring/manifests-all.yaml b/monitoring/manifests-all.yaml new file mode 100644 index 00000000..4414f1fe --- /dev/null +++ b/monitoring/manifests-all.yaml @@ -0,0 +1,2913 @@ +# Derived from ./manifests +--- +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus +rules: +- apiGroups: [""] + resources: + - nodes + - nodes/proxy + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: v1 +data: + default.tmpl: | + {{ define "__alertmanager" }}AlertManager{{ end }} + {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} + + {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} + {{ define "__description" }}{{ end }} + + {{ define "__text_alert_list" }}{{ range . }}Labels: + {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Annotations: + {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Source: {{ .GeneratorURL }} + {{ end }}{{ end }} + + + {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} + {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} + {{ define "slack.default.pretext" }}{{ end }} + {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "slack.default.iconemoji" }}{{ end }} + {{ define "slack.default.iconurl" }}{{ end }} + {{ define "slack.default.text" }}{{ end }} + + + {{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }} + + + {{ define "pagerduty.default.description" }}{{ template "__subject" . }}{{ end }} + {{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }} + + + {{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }} + {{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} + {{ if gt (len .Alerts.Firing) 0 -}} + Alerts Firing: + {{ template "__text_alert_list" .Alerts.Firing }} + {{- end }} + {{ if gt (len .Alerts.Resolved) 0 -}} + Alerts Resolved: + {{ template "__text_alert_list" .Alerts.Resolved }} + {{- end }} + {{- end }} + {{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }} + + + {{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }} + {{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }} + + + {{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }} + {{ define "email.default.html" }} + + + + + + + {{ template "__subject" . }} + + + + + + + + + + + +
+
+ + + + + + + +
+ {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} + {{ .Name }}={{ .Value }} + {{ end }} +
+ + + + + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + {{ range .Alerts.Firing }} + + + + {{ end }} + + {{ if gt (len .Alerts.Resolved) 0 }} + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + + + + {{ end }} + {{ range .Alerts.Resolved }} + + + + {{ end }} +
+ View in {{ template "__alertmanager" . }} +
+ [{{ .Alerts.Firing | len }}] Firing +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+
+
+
+ [{{ .Alerts.Resolved | len }}] Resolved +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+ +
+
+ + + + + {{ end }} + + {{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }} + {{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} + {{ if gt (len .Alerts.Firing) 0 }} + Alerts Firing: + {{ template "__text_alert_list" .Alerts.Firing }} + {{ end }} + {{ if gt (len .Alerts.Resolved) 0 }} + Alerts Resolved: + {{ template "__text_alert_list" .Alerts.Resolved }} + {{ end }} + {{ end }} + {{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }} + slack.tmpl: | + {{ define "slack.devops.text" }} + {{range .Alerts}}{{.Annotations.DESCRIPTION}} + {{end}} + {{ end }} +kind: ConfigMap +metadata: + creationTimestamp: null + name: alertmanager-templates + namespace: monitoring +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: alertmanager + namespace: monitoring +data: + config.yml: |- + global: + # ResolveTimeout is the time after which an alert is declared resolved + # if it has not been updated. + resolve_timeout: 5m + + # The smarthost and SMTP sender used for mail notifications. + smtp_smarthost: 'smtp.gmail.com:587' + smtp_from: 'foo@bar.com' + smtp_auth_username: 'foo@bar.com' + smtp_auth_password: 'barfoo' + + # The API URL to use for Slack notifications. + slack_api_url: 'https://hooks.slack.com/services/some/api/token' + + # # The directory from which notification templates are read. + templates: + - '/etc/alertmanager-templates/*.tmpl' + + # The root route on which each incoming alert enters. + route: + + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + + group_by: ['alertname', 'cluster', 'service'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + + #repeat_interval: 1m + repeat_interval: 15m + + # A default receiver + + # If an alert isn't caught by a route, send it to default. + receiver: default + + # All the above attributes are inherited by all child routes and can + # overwritten on each. + + # The child route trees. + routes: + # Send severity=slack alerts to slack. + - match: + severity: slack + receiver: slack_alert + # - match: + # severity: email + # receiver: email_alert + + receivers: + - name: 'default' + slack_configs: + - channel: '#alertmanager-test' + text: '{{ template "slack.devops.text" . }}' + send_resolved: true + + - name: 'slack_alert' + slack_configs: + - channel: '#alertmanager-test' + send_resolved: true +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alertmanager + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app: alertmanager + template: + metadata: + name: alertmanager + labels: + app: alertmanager + spec: + containers: + - name: alertmanager + image: quay.io/prometheus/alertmanager:v0.7.1 + args: + - '-config.file=/etc/alertmanager/config.yml' + - '-storage.path=/alertmanager' + ports: + - name: alertmanager + containerPort: 9093 + volumeMounts: + - name: config-volume + mountPath: /etc/alertmanager + - name: templates-volume + mountPath: /etc/alertmanager-templates + - name: alertmanager + mountPath: /alertmanager + volumes: + - name: config-volume + configMap: + name: alertmanager + - name: templates-volume + configMap: + name: alertmanager-templates + - name: alertmanager + emptyDir: {} +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: 'true' + prometheus.io/path: '/metrics' + labels: + name: alertmanager + name: alertmanager + namespace: monitoring +spec: + selector: + app: alertmanager + type: NodePort + ports: + - name: alertmanager + protocol: TCP + port: 9093 + targetPort: 9093 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana-core + namespace: monitoring + labels: + app: grafana + component: core +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + component: core + spec: + containers: + - image: grafana/grafana:4.2.0 + name: grafana-core + imagePullPolicy: IfNotPresent + # env: + resources: + # keep request = limit to keep this container in guaranteed class + limits: + cpu: 100m + memory: 100Mi + requests: + cpu: 100m + memory: 100Mi + env: + # The following env variables set up basic auth twith the default admin user and admin password. + - name: GF_AUTH_BASIC_ENABLED + value: "true" + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana + key: admin-username + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana + key: admin-password + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "false" + # - name: GF_AUTH_ANONYMOUS_ORG_ROLE + # value: Admin + # does not really work, because of template variables in exported dashboards: + # - name: GF_DASHBOARDS_JSON_ENABLED + # value: "true" + readinessProbe: + httpGet: + path: /login + port: 3000 + # initialDelaySeconds: 30 + # timeoutSeconds: 1 + volumeMounts: + - name: grafana-persistent-storage + mountPath: /var/lib/grafana + volumes: + - name: grafana-persistent-storage + emptyDir: {} +--- +apiVersion: v1 +data: + grafana-net-2-dashboard.json: | + { + "__inputs": [{ + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }], + "__requires": [{ + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + }, { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.0" + }, { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }], + "id": null, + "title": "Prometheus Stats", + "tags": [], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": true, + "sharedCrosshair": false, + "rows": [{ + "collapse": false, + "editable": true, + "height": 178, + "panels": [{ + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], + "datasource": "${DS_PROMETHEUS}", + "decimals": 1, + "editable": true, + "error": false, + "format": "s", + "id": 5, + "interval": null, + "links": [], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "(time() - container_start_time_seconds{container_name=\"kube-apiserver\"})", + "intervalFactor": 2, + "refId": "A", + "step": 4 + }], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current", + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "rangeMaps": [{ + "from": "null", + "to": "null", + "text": "N/A" + }], + "mappingType": 1, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "id": 6, + "interval": null, + "links": [], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [{ + "expr": "prometheus_local_storage_memory_series", + "intervalFactor": 2, + "refId": "A", + "step": 4 + }], + "thresholds": "1,5", + "title": "Local Storage Memory Series", + "type": "singlestat", + "valueFontSize": "70%", + "valueMaps": [], + "valueName": "current", + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "rangeMaps": [{ + "from": "null", + "to": "null", + "text": "N/A" + }], + "mappingType": 1, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "id": 7, + "interval": null, + "links": [], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [{ + "expr": "prometheus_local_storage_indexing_queue_length", + "intervalFactor": 2, + "refId": "A", + "step": 4 + }], + "thresholds": "500,4000", + "title": "Internal Storage Queue Length", + "type": "singlestat", + "valueFontSize": "70%", + "valueMaps": [{ + "op": "=", + "text": "Empty", + "value": "0" + }], + "valueName": "current", + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "rangeMaps": [{ + "from": "null", + "to": "null", + "text": "N/A" + }], + "mappingType": 1, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, { + "content": "\"Prometheus\nPrometheus\n\n

You're using Prometheus, an open-source systems monitoring and alerting toolkit originally built at SoundCloud. For more information, check out the Grafana and Prometheus projects.

", + "editable": true, + "error": false, + "id": 9, + "links": [], + "mode": "html", + "span": 3, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }], + "title": "New row" + }, { + "collapse": false, + "editable": true, + "height": 227, + "panels": [{ + "aliasColors": { + "prometheus": "#C15C17", + "{instance=\"localhost:9090\",job=\"prometheus\"}": "#C15C17" + }, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "rate(prometheus_local_storage_ingested_samples_total[5m])", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "metric": "", + "refId": "A", + "step": 2 + }], + "timeFrom": null, + "timeShift": null, + "title": "Samples ingested (rate-5m)", + "tooltip": { + "shared": true, + "value_type": "cumulative", + "ordering": "alphabetical", + "msResolution": false + }, + "type": "graph", + "yaxes": [{ + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }, { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }], + "xaxis": { + "show": true + } + }, { + "content": "#### Samples Ingested\nThis graph displays the count of samples ingested by the Prometheus server, as measured over the last 5 minutes, per time series in the range vector. When troubleshooting an issue on IRC or Github, this is often the first stat requested by the Prometheus team. ", + "editable": true, + "error": false, + "id": 8, + "links": [], + "mode": "markdown", + "span": 2.995914043583536, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }], + "title": "New row" + }, { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [{ + "aliasColors": { + "prometheus": "#F9BA8F", + "{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}": "#F9BA8F" + }, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "rate(prometheus_target_interval_length_seconds_count[5m])", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "refId": "A", + "step": 2 + }], + "timeFrom": null, + "timeShift": null, + "title": "Target Scrapes (last 5m)", + "tooltip": { + "shared": true, + "value_type": "cumulative", + "ordering": "alphabetical", + "msResolution": false + }, + "type": "graph", + "yaxes": [{ + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }, { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }], + "xaxis": { + "show": true + } + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "prometheus_target_interval_length_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{quantile}} ({{interval}})", + "metric": "", + "refId": "A", + "step": 2 + }], + "timeFrom": null, + "timeShift": null, + "title": "Scrape Duration", + "tooltip": { + "shared": true, + "value_type": "cumulative", + "ordering": "alphabetical", + "msResolution": false + }, + "type": "graph", + "yaxes": [{ + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }, { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }], + "xaxis": { + "show": true + } + }, { + "content": "#### Scrapes\nPrometheus scrapes metrics from instrumented jobs, either directly or via an intermediary push gateway for short-lived jobs. Target scrapes will show how frequently targets are scraped, as measured over the last 5 minutes, per time series in the range vector. Scrape Duration will show how long the scrapes are taking, with percentiles available as series. ", + "editable": true, + "error": false, + "id": 11, + "links": [], + "mode": "markdown", + "span": 3, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }], + "title": "New row" + }, { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [{ + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "prometheus_evaluator_duration_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{quantile}}", + "refId": "A", + "step": 2 + }], + "timeFrom": null, + "timeShift": null, + "title": "Rule Eval Duration", + "tooltip": { + "shared": true, + "value_type": "cumulative", + "ordering": "alphabetical", + "msResolution": false + }, + "type": "graph", + "yaxes": [{ + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "percentunit", + "label": "" + }, { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }], + "xaxis": { + "show": true + } + }, { + "content": "#### Rule Evaluation Duration\nThis graph panel plots the duration for all evaluations to execute. The 50th percentile, 90th percentile and 99th percentile are shown as three separate series to help identify outliers that may be skewing the data.", + "editable": true, + "error": false, + "id": 15, + "links": [], + "mode": "markdown", + "span": 3, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }], + "title": "New row" + }], + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"], + "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] + }, + "templating": { + "list": [] + }, + "annotations": { + "list": [] + }, + "refresh": false, + "schemaVersion": 12, + "version": 0, + "links": [{ + "icon": "info", + "tags": [], + "targetBlank": true, + "title": "Grafana Docs", + "tooltip": "", + "type": "link", + "url": "http://www.grafana.org/docs" + }, { + "icon": "info", + "tags": [], + "targetBlank": true, + "title": "Prometheus Docs", + "type": "link", + "url": "http://prometheus.io/docs/introduction/overview/" + }], + "gnetId": 2, + "description": "The official, pre-built Prometheus Stats Dashboard." + } + grafana-net-737-dashboard.json: | + { + "__inputs": [{ + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }], + "__requires": [{ + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.0" + }, { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }], + "id": null, + "title": "Kubernetes Pod Resources", + "description": "Shows resource usage of Kubernetes pods.", + "tags": [ + "kubernetes" + ], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": false, + "rows": [{ + "collapse": false, + "editable": true, + "height": "250px", + "panels": [{ + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "180px", + "id": 4, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum (machine_memory_bytes{instance=~\"^$instance$\"}) * 100", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 2 + }], + "thresholds": "65, 90", + "timeFrom": "1m", + "timeShift": null, + "title": "Memory Working Set", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "180px", + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum(rate(container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m])) / sum (machine_cpu_cores{instance=~\"^$instance$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "65, 90", + "timeFrom": "1m", + "timeShift": null, + "title": "Cpu Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "180px", + "id": 7, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum(container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 10 + }], + "thresholds": "65, 90", + "timeFrom": "1m", + "timeShift": null, + "title": "Filesystem Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 9, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "20%", + "prefix": "", + "prefixFontSize": "20%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum(container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Used", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 10, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (machine_memory_bytes{instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 11, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": " cores", + "postfixFontSize": "30%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m]))", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "timeShift": null, + "title": "Used", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 12, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": " cores", + "postfixFontSize": "30%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (machine_cpu_cores{instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 13, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Used", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 14, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)", + "thresholdLine": false + }, + "height": "200px", + "id": 32, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum(rate(container_network_receive_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "receive", + "metric": "network", + "refId": "A", + "step": 240 + }, { + "expr": "- sum(rate(container_network_transmit_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "transmit", + "metric": "network", + "refId": "B", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Network", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "Bps", + "label": "transmit / receive", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }], + "showTitle": true, + "title": "all pods" + }, { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [{ + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "height": "", + "id": 17, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "metric": "container_cpu", + "refId": "A", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Cpu Usage", + "tooltip": { + "msResolution": true, + "shared": false, + "sort": 2, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "none", + "label": "cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 33, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "metric": "", + "refId": "A", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Memory Working Set", + "tooltip": { + "msResolution": false, + "shared": false, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "bytes", + "label": "used", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 16, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }} < in", + "metric": "network", + "refId": "A", + "step": 240 + }, { + "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }} > out", + "metric": "network", + "refId": "B", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Network", + "tooltip": { + "msResolution": false, + "shared": false, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "Bps", + "label": "transmit / receive", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 34, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum(container_fs_usage_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "metric": "network", + "refId": "A", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Filesystem", + "tooltip": { + "msResolution": false, + "shared": false, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "bytes", + "label": "used", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }], + "showTitle": true, + "title": "each pod" + }], + "time": { + "from": "now-3d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + "list": [{ + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(instance)", + "refresh": 1, + "regex": "", + "type": "query" + }, { + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": "label_values(namespace)", + "refresh": 1, + "regex": "", + "type": "query" + }] + }, + "annotations": { + "list": [] + }, + "refresh": false, + "schemaVersion": 12, + "version": 8, + "links": [], + "gnetId": 737 + } + prometheus-datasource.json: | + { + "name": "prometheus", + "type": "prometheus", + "url": "http://prometheus:9090", + "access": "proxy", + "basicAuth": false + } +kind: ConfigMap +metadata: + creationTimestamp: null + name: grafana-import-dashboards + namespace: monitoring +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: grafana-import-dashboards + namespace: monitoring + labels: + app: grafana + component: import-dashboards +spec: + template: + metadata: + name: grafana-import-dashboards + labels: + app: grafana + component: import-dashboards + spec: + serviceAccountName: prometheus-k8s + initContainers: + - name: wait-for-grafana + image: giantswarm/tiny-tools + args: + - /bin/sh + - -c + - > + set -x; + while [ $(curl -Lsw '%{http_code}' "http://grafana:3000" -o /dev/null) -ne 200 ]; do + echo '.' + sleep 15; + done + containers: + - name: grafana-import-dashboards + image: giantswarm/tiny-tools + command: ["/bin/sh", "-c"] + workingDir: /opt/grafana-import-dashboards + args: + - > + for file in *-datasource.json ; do + if [ -e "$file" ] ; then + echo "importing $file" && + curl --silent --fail --show-error \ + --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/datasources \ + --header "Content-Type: application/json" \ + --data-binary "@$file" ; + echo "" ; + fi + done ; + for file in *-dashboard.json ; do + if [ -e "$file" ] ; then + echo "importing $file" && + ( echo '{"dashboard":'; \ + cat "$file"; \ + echo ',"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}' ) \ + | jq -c '.' \ + | curl --silent --fail --show-error \ + --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/dashboards/import \ + --header "Content-Type: application/json" \ + --data-binary "@-" ; + echo "" ; + fi + done + + env: + - name: GF_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana + key: admin-username + - name: GF_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana + key: admin-password + volumeMounts: + - name: config-volume + mountPath: /opt/grafana-import-dashboards + restartPolicy: Never + volumes: + - name: config-volume + configMap: + name: grafana-import-dashboards +--- +# apiVersion: extensions/v1beta1 +# kind: Ingress +# metadata: +# name: grafana +# namespace: monitoring +# spec: +# rules: +# - host: ..k8s.gigantic.io +# http: +# paths: +# - path: / +# backend: +# serviceName: grafana +# servicePort: 3000 +--- +apiVersion: v1 +kind: Secret +data: + admin-password: YWRtaW4= + admin-username: YWRtaW4= +metadata: + name: grafana + namespace: monitoring +type: Opaque +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: monitoring + labels: + app: grafana + component: core +spec: + type: NodePort + ports: + - port: 3000 + selector: + app: grafana + component: core +--- +apiVersion: v1 +data: + prometheus.yaml: | + global: + scrape_interval: 10s + scrape_timeout: 10s + evaluation_interval: 10s + rule_files: + - "/etc/prometheus-rules/*.rules" + scrape_configs: + - job_name: 'node' + static_configs: + - targets: ['localhost:9100'] + # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37 + - job_name: 'kubernetes-nodes' + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - source_labels: [__address__] + regex: '(.*):10250' + replacement: '${1}:10255' + target_label: __address__ + + # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79 + - job_name: 'kubernetes-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+)(?::\d+);(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + + # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119 + - job_name: 'kubernetes-services' + metrics_path: /probe + params: + module: [http_2xx] + kubernetes_sd_configs: + - role: service + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + target_label: kubernetes_name + + # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156 + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: (.+):(?:\d+);(\d+) + replacement: ${1}:${2} + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - source_labels: [__meta_kubernetes_pod_container_port_number] + action: keep + regex: 9\d{3} + + - job_name: 'kubernetes-cadvisor' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor + +kind: ConfigMap +metadata: + creationTimestamp: null + name: prometheus-core + namespace: monitoring +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-core + namespace: monitoring + labels: + app: prometheus + component: core +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + name: prometheus-main + labels: + app: prometheus + component: core + spec: + serviceAccountName: prometheus-k8s + containers: + - name: prometheus + image: prom/prometheus:v1.7.0 + args: + - '-storage.local.retention=12h' + - '-storage.local.memory-chunks=500000' + - '-config.file=/etc/prometheus/prometheus.yaml' + - '-alertmanager.url=http://alertmanager:9093/' + ports: + - name: webui + containerPort: 9090 + resources: + requests: + cpu: 100m + memory: 500M + limits: + cpu: 100m + memory: 500M + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: rules-volume + mountPath: /etc/prometheus-rules + volumes: + - name: config-volume + configMap: + name: prometheus-core + - name: rules-volume + configMap: + name: prometheus-rules +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring + labels: + app: kube-state-metrics +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: gcr.io/google_containers/kube-state-metrics:v0.5.0 + ports: + - containerPort: 8080 +--- +# --- +# apiVersion: rbac.authorization.k8s.io/v1beta1 +# kind: ClusterRoleBinding +# metadata: +# name: kube-state-metrics +# roleRef: +# apiGroup: rbac.authorization.k8s.io +# kind: ClusterRole +# name: kube-state-metrics +# subjects: +# - kind: ServiceAccount +# name: kube-state-metrics +# namespace: monitoring +# --- +# apiVersion: rbac.authorization.k8s.io/v1beta1 +# kind: ClusterRole +# metadata: +# name: kube-state-metrics +# rules: +# - apiGroups: [""] +# resources: +# - nodes +# - pods +# - services +# - resourcequotas +# - replicationcontrollers +# - limitranges +# verbs: ["list", "watch"] +# - apiGroups: ["apps"] +# resources: +# - daemonsets +# - deployments +# - replicasets +# verbs: ["list", "watch"] +# --- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: monitoring +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: 'true' + name: kube-state-metrics + namespace: monitoring + labels: + app: kube-state-metrics +spec: + ports: + - name: kube-state-metrics + port: 8080 + protocol: TCP + selector: + app: kube-state-metrics + +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-directory-size-metrics + namespace: monitoring + labels: + app: node-directory-size-metrics + annotations: + description: | + This `DaemonSet` provides metrics in Prometheus format about disk usage on the nodes. + The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now. + The other container `caddy` just hands out the contents of that file on request via `http` on `/metrics` at port `9102` which are the defaults for Prometheus. + These are scheduled on every node in the Kubernetes cluster. + To choose directories from the node to check, just mount them on the `read-du` container below `/mnt`. +spec: + selector: + matchLabels: + app: node-directory-size-metrics + template: + metadata: + labels: + app: node-directory-size-metrics + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '9102' + description: | + This `Pod` provides metrics in Prometheus format about disk usage on the node. + The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now. + The other container `caddy` just hands out the contents of that file on request on `/metrics` at port `9102` which are the defaults for Prometheus. + This `Pod` is scheduled on every node in the Kubernetes cluster. + To choose directories from the node to check just mount them on `read-du` below `/mnt`. + spec: + containers: + - name: read-du + image: giantswarm/tiny-tools + imagePullPolicy: Always + # FIXME threshold via env var + # The + command: + - fish + - --command + - | + touch /tmp/metrics-temp + while true + for directory in (du --bytes --separate-dirs --threshold=100M /mnt) + echo $directory | read size path + echo "node_directory_size_bytes{path=\"$path\"} $size" \ + >> /tmp/metrics-temp + end + mv /tmp/metrics-temp /tmp/metrics + sleep 300 + end + volumeMounts: + - name: host-fs-var + mountPath: /mnt/var + readOnly: true + - name: metrics + mountPath: /tmp + - name: caddy + image: dockermuenster/caddy:0.9.3 + command: + - "caddy" + - "-port=9102" + - "-root=/var/www" + ports: + - containerPort: 9102 + volumeMounts: + - name: metrics + mountPath: /var/www + volumes: + - name: host-fs-var + hostPath: + path: /var + - name: metrics + emptyDir: + medium: Memory +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: prometheus-node-exporter + namespace: monitoring + labels: + app: prometheus + component: node-exporter +spec: + selector: + matchLabels: + app: prometheus + template: + metadata: + name: prometheus-node-exporter + labels: + app: prometheus + component: node-exporter + spec: + containers: + - image: prom/node-exporter:v0.14.0 + name: prometheus-node-exporter + ports: + - name: prom-node-exp + #^ must be an IANA_SVC_NAME (at most 15 characters, ..) + containerPort: 9100 + hostPort: 9100 + hostNetwork: true + hostPID: true +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: 'true' + name: prometheus-node-exporter + namespace: monitoring + labels: + app: prometheus + component: node-exporter +spec: + clusterIP: None + ports: + - name: prometheus-node-exporter + port: 9100 + protocol: TCP + selector: + app: prometheus + component: node-exporter + type: ClusterIP +--- +apiVersion: v1 +data: + cpu-usage.rules: | + ALERT NodeCPUUsage + IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: High CPU usage detected", + DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})" + } + instance-availability.rules: | + ALERT InstanceDown + IF up == 0 + FOR 1m + LABELS { severity = "page" } + ANNOTATIONS { + summary = "Instance {{ $labels.instance }} down", + description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.", + } + low-disk-space.rules: | + ALERT NodeLowRootDisk + IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"} ) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: Low root disk space", + DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})" + } + + ALERT NodeLowDataDisk + IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: Low data disk space", + DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})" + } + mem-usage.rules: | + ALERT NodeSwapUsage + IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: Swap usage detected", + DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})" + } + + ALERT NodeMemoryUsage + IF (((node_memory_MemTotal-node_memory_MemAvailable)/(node_memory_MemTotal)*100)) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: High memory usage detected", + DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})" + } +kind: ConfigMap +metadata: + creationTimestamp: null + name: prometheus-rules + namespace: monitoring +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: monitoring + labels: + app: prometheus + component: core + annotations: + prometheus.io/scrape: 'true' +spec: + type: NodePort + ports: + - port: 9090 + protocol: TCP + name: webui + selector: + app: prometheus + component: core diff --git a/outside-services/outside-0.yml b/outside-services/outside-0.yml deleted file mode 100644 index 7bc12bd7..00000000 --- a/outside-services/outside-0.yml +++ /dev/null @@ -1,15 +0,0 @@ -kind: Service -apiVersion: v1 -metadata: - name: outside-0 - namespace: kafka -spec: - selector: - app: kafka - kafka-broker-id: "0" - ports: - - protocol: TCP - targetPort: 9094 - port: 32400 - nodePort: 32400 - type: NodePort \ No newline at end of file diff --git a/outside-services/outside-1.yml b/outside-services/outside-1.yml index 1642ee02..a0660851 100644 --- a/outside-services/outside-1.yml +++ b/outside-services/outside-1.yml @@ -9,7 +9,7 @@ spec: kafka-broker-id: "1" ports: - protocol: TCP - targetPort: 9094 + targetPort: 5556 port: 32401 nodePort: 32401 - type: NodePort \ No newline at end of file + type: NodePort diff --git a/prometheus/kafka-jmx-exporter-svc.yml b/prometheus/kafka-jmx-exporter-svc.yml new file mode 100644 index 00000000..024e53a0 --- /dev/null +++ b/prometheus/kafka-jmx-exporter-svc.yml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: meteric + namespace: kafka +spec: + ports: + - name: http + port: 5556 + nodePort: 30001 + protocol: TCP + type: NodePort + selector: + app: kafka diff --git a/prometheus/metrics-config.yml b/prometheus/metrics-config.yml index 345e1929..1c89d3cd 100644 --- a/prometheus/metrics-config.yml +++ b/prometheus/metrics-config.yml @@ -7,6 +7,7 @@ data: jmx-kafka-prometheus.yml: |+ lowercaseOutputName: true + #hostPort: 127.0.0.1: 5555 jmxUrl: service:jmx:rmi:///jndi/rmi://127.0.0.1:5555/jmxrmi ssl: false whitelistObjectNames: ["kafka.server:*","kafka.controller:*","java.lang:*"] @@ -42,4 +43,4 @@ data: name: "zookeeper_$4_$5" labels: replicaId: "$2" - memberType: "$3" \ No newline at end of file + memberType: "$3" diff --git a/prometheus/node-exporter-daemonset.yml b/prometheus/node-exporter-daemonset.yml new file mode 100644 index 00000000..41b28ace --- /dev/null +++ b/prometheus/node-exporter-daemonset.yml @@ -0,0 +1,53 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter + namespace: kafka + labels: + app: node-exporter +spec: + selector: + matchLabels: + app: node-exporter + template: + metadata: + labels: + app: node-exporter + spec: + hostNetwork: true + hostIPC: true + hostPID: true + containers: + - name: node-exporter + image: prom/node-exporter:v1.0.1 + imagePullPolicy: IfNotPresent + args: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + resources: + requests: + cpu: 10m + memory: 100Mi + limits: + cpu: 100m + memory: 100Mi + ports: + - name: scrape + containerPort: 9100 + hostPort: 9100 + volumeMounts: + - mountPath: /host/proc + name: proc + readOnly: true + - mountPath: /host/sys + name: sys + readOnly: true + volumes: + - name: proc + hostPath: + path: /proc + type: "" + - name: sys + hostPath: + path: /sys + type: "" diff --git a/prometheus/node-exporter-svc.yml b/prometheus/node-exporter-svc.yml new file mode 100644 index 00000000..bfbc37e4 --- /dev/null +++ b/prometheus/node-exporter-svc.yml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: "true" + name: node-exporter-http + namespace: kafka + labels: + app: node-exporter +spec: + type: NodePort + selector: + app: node-exporter + ports: + - name: scrape + port: 9100 + nodePort: 30002 + protocol: TCP diff --git a/yahoo-kafka-manager/kafka-manager-service.yml b/yahoo-kafka-manager/kafka-manager-service.yml index 3d26adf3..0a28c12d 100644 --- a/yahoo-kafka-manager/kafka-manager-service.yml +++ b/yahoo-kafka-manager/kafka-manager-service.yml @@ -4,9 +4,11 @@ metadata: name: kafka-manager namespace: kafka spec: + type: NodePort selector: app: kafka-manager ports: - protocol: TCP port: 80 targetPort: 80 + nodePort: 30010 diff --git a/yahoo-kafka-manager/kafka-manager.yml b/yahoo-kafka-manager/kafka-manager.yml index 16eea0b0..5727f031 100644 --- a/yahoo-kafka-manager/kafka-manager.yml +++ b/yahoo-kafka-manager/kafka-manager.yml @@ -20,7 +20,7 @@ spec: - containerPort: 80 env: - name: ZK_HOSTS - value: zookeeper.kafka:2181 + value: zookeeper:2181 command: - ./bin/kafka-manager - - -Dhttp.port=80 \ No newline at end of file + - -Dhttp.port=80 diff --git a/zookeeper/Kustomization b/zookeeper/Kustomization index 16255850..7a8093d2 100644 --- a/zookeeper/Kustomization +++ b/zookeeper/Kustomization @@ -1,5 +1,5 @@ resources: - zoo-config.yml -- pzoo-service.yml -- dns.yml +- pzoo-svc.yml +- zoo-headless.yml - pzoo.yml diff --git a/zookeeper/dns.yml b/zookeeper/dns.yml deleted file mode 100644 index f33f3938..00000000 --- a/zookeeper/dns.yml +++ /dev/null @@ -1,12 +0,0 @@ -# the headless service is for PetSet DNS, this one is for clients -apiVersion: v1 -kind: Service -metadata: - name: zookeeper - namespace: kafka -spec: - ports: - - port: 2181 - name: client - selector: - app: zookeeper diff --git a/zookeeper/pzoo-service.yml b/zookeeper/pzoo-service.yml deleted file mode 100644 index 00c33e1c..00000000 --- a/zookeeper/pzoo-service.yml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: pzoo - namespace: kafka -spec: - ports: - - port: 2888 - name: peer - - port: 3888 - name: leader-election - clusterIP: None - selector: - app: zookeeper - storage: persistent From 16f3a0c992cc66d12b6dbea0f4455e7f5f7f8661 Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Thu, 21 Jul 2022 12:41:41 +0000 Subject: [PATCH 09/12] add prometheus and grafana --- kafka/bootstrap-svc.yml | 12 ++++++++++++ kafka/kafka-headless.yml | 17 +++++++++++++++++ zookeeper/pzoo-svc.yml | 15 +++++++++++++++ zookeeper/zoo-headless.yml | 12 ++++++++++++ 4 files changed, 56 insertions(+) create mode 100644 kafka/bootstrap-svc.yml create mode 100644 kafka/kafka-headless.yml create mode 100644 zookeeper/pzoo-svc.yml create mode 100644 zookeeper/zoo-headless.yml diff --git a/kafka/bootstrap-svc.yml b/kafka/bootstrap-svc.yml new file mode 100644 index 00000000..ab0b8e84 --- /dev/null +++ b/kafka/bootstrap-svc.yml @@ -0,0 +1,12 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: bootstrap + namespace: kafka +spec: + type: ClusterIP + ports: + - port: 9092 + selector: + app: kafka diff --git a/kafka/kafka-headless.yml b/kafka/kafka-headless.yml new file mode 100644 index 00000000..3a74b708 --- /dev/null +++ b/kafka/kafka-headless.yml @@ -0,0 +1,17 @@ +# A headless service to create DNS records +--- +apiVersion: v1 +kind: Service +metadata: + name: kafka + namespace: kafka + annotations: + service.alpha.kubernetes.io/tolerate-unready-endpoints: "true" +spec: + publishNotReadyAddresses: true + ports: + - port: 9092 + # [podname].broker.kafka.svc.cluster.local + clusterIP: None + selector: + app: kafka diff --git a/zookeeper/pzoo-svc.yml b/zookeeper/pzoo-svc.yml new file mode 100644 index 00000000..00c33e1c --- /dev/null +++ b/zookeeper/pzoo-svc.yml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: pzoo + namespace: kafka +spec: + ports: + - port: 2888 + name: peer + - port: 3888 + name: leader-election + clusterIP: None + selector: + app: zookeeper + storage: persistent diff --git a/zookeeper/zoo-headless.yml b/zookeeper/zoo-headless.yml new file mode 100644 index 00000000..f33f3938 --- /dev/null +++ b/zookeeper/zoo-headless.yml @@ -0,0 +1,12 @@ +# the headless service is for PetSet DNS, this one is for clients +apiVersion: v1 +kind: Service +metadata: + name: zookeeper + namespace: kafka +spec: + ports: + - port: 2181 + name: client + selector: + app: zookeeper From 6b2051b820ced386bdd18769b885fcdbe9b5eb7d Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Mon, 25 Jul 2022 04:20:05 +0000 Subject: [PATCH 10/12] separate grafana and prometheus --- monitoring/manifests-all.yaml | 2429 +-------------------------------- 1 file changed, 2 insertions(+), 2427 deletions(-) diff --git a/monitoring/manifests-all.yaml b/monitoring/manifests-all.yaml index 4414f1fe..dfe6d388 100644 --- a/monitoring/manifests-all.yaml +++ b/monitoring/manifests-all.yaml @@ -1,2428 +1,3 @@ -# Derived from ./manifests ---- -apiVersion: v1 -kind: Namespace -metadata: - name: monitoring ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: prometheus -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: prometheus -rules: -- apiGroups: [""] - resources: - - nodes - - nodes/proxy - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prometheus-k8s - namespace: monitoring ---- -apiVersion: v1 -data: - default.tmpl: | - {{ define "__alertmanager" }}AlertManager{{ end }} - {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} - - {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} - {{ define "__description" }}{{ end }} - - {{ define "__text_alert_list" }}{{ range . }}Labels: - {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} - {{ end }}Annotations: - {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} - {{ end }}Source: {{ .GeneratorURL }} - {{ end }}{{ end }} - - - {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} - {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} - {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} - {{ define "slack.default.pretext" }}{{ end }} - {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} - {{ define "slack.default.iconemoji" }}{{ end }} - {{ define "slack.default.iconurl" }}{{ end }} - {{ define "slack.default.text" }}{{ end }} - - - {{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }} - {{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }} - - - {{ define "pagerduty.default.description" }}{{ template "__subject" . }}{{ end }} - {{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }} - {{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }} - {{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }} - - - {{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }} - {{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} - {{ if gt (len .Alerts.Firing) 0 -}} - Alerts Firing: - {{ template "__text_alert_list" .Alerts.Firing }} - {{- end }} - {{ if gt (len .Alerts.Resolved) 0 -}} - Alerts Resolved: - {{ template "__text_alert_list" .Alerts.Resolved }} - {{- end }} - {{- end }} - {{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }} - - - {{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }} - {{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }} - - - {{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }} - {{ define "email.default.html" }} - - - - - - - {{ template "__subject" . }} - - - - - - - - - - - -
-
- - - - - - - -
- {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} - {{ .Name }}={{ .Value }} - {{ end }} -
- - - - - {{ if gt (len .Alerts.Firing) 0 }} - - - - {{ end }} - {{ range .Alerts.Firing }} - - - - {{ end }} - - {{ if gt (len .Alerts.Resolved) 0 }} - {{ if gt (len .Alerts.Firing) 0 }} - - - - {{ end }} - - - - {{ end }} - {{ range .Alerts.Resolved }} - - - - {{ end }} -
- View in {{ template "__alertmanager" . }} -
- [{{ .Alerts.Firing | len }}] Firing -
- Labels
- {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} - {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} - {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} - Source
-
-
-
-
-
- [{{ .Alerts.Resolved | len }}] Resolved -
- Labels
- {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} - {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} - {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} - Source
-
-
- -
-
- - - - - {{ end }} - - {{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }} - {{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} - {{ if gt (len .Alerts.Firing) 0 }} - Alerts Firing: - {{ template "__text_alert_list" .Alerts.Firing }} - {{ end }} - {{ if gt (len .Alerts.Resolved) 0 }} - Alerts Resolved: - {{ template "__text_alert_list" .Alerts.Resolved }} - {{ end }} - {{ end }} - {{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }} - slack.tmpl: | - {{ define "slack.devops.text" }} - {{range .Alerts}}{{.Annotations.DESCRIPTION}} - {{end}} - {{ end }} -kind: ConfigMap -metadata: - creationTimestamp: null - name: alertmanager-templates - namespace: monitoring ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: alertmanager - namespace: monitoring -data: - config.yml: |- - global: - # ResolveTimeout is the time after which an alert is declared resolved - # if it has not been updated. - resolve_timeout: 5m - - # The smarthost and SMTP sender used for mail notifications. - smtp_smarthost: 'smtp.gmail.com:587' - smtp_from: 'foo@bar.com' - smtp_auth_username: 'foo@bar.com' - smtp_auth_password: 'barfoo' - - # The API URL to use for Slack notifications. - slack_api_url: 'https://hooks.slack.com/services/some/api/token' - - # # The directory from which notification templates are read. - templates: - - '/etc/alertmanager-templates/*.tmpl' - - # The root route on which each incoming alert enters. - route: - - # The labels by which incoming alerts are grouped together. For example, - # multiple alerts coming in for cluster=A and alertname=LatencyHigh would - # be batched into a single group. - - group_by: ['alertname', 'cluster', 'service'] - - # When a new group of alerts is created by an incoming alert, wait at - # least 'group_wait' to send the initial notification. - # This way ensures that you get multiple alerts for the same group that start - # firing shortly after another are batched together on the first - # notification. - - group_wait: 30s - - # When the first notification was sent, wait 'group_interval' to send a batch - # of new alerts that started firing for that group. - - group_interval: 5m - - # If an alert has successfully been sent, wait 'repeat_interval' to - # resend them. - - #repeat_interval: 1m - repeat_interval: 15m - - # A default receiver - - # If an alert isn't caught by a route, send it to default. - receiver: default - - # All the above attributes are inherited by all child routes and can - # overwritten on each. - - # The child route trees. - routes: - # Send severity=slack alerts to slack. - - match: - severity: slack - receiver: slack_alert - # - match: - # severity: email - # receiver: email_alert - - receivers: - - name: 'default' - slack_configs: - - channel: '#alertmanager-test' - text: '{{ template "slack.devops.text" . }}' - send_resolved: true - - - name: 'slack_alert' - slack_configs: - - channel: '#alertmanager-test' - send_resolved: true ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: alertmanager - namespace: monitoring -spec: - replicas: 1 - selector: - matchLabels: - app: alertmanager - template: - metadata: - name: alertmanager - labels: - app: alertmanager - spec: - containers: - - name: alertmanager - image: quay.io/prometheus/alertmanager:v0.7.1 - args: - - '-config.file=/etc/alertmanager/config.yml' - - '-storage.path=/alertmanager' - ports: - - name: alertmanager - containerPort: 9093 - volumeMounts: - - name: config-volume - mountPath: /etc/alertmanager - - name: templates-volume - mountPath: /etc/alertmanager-templates - - name: alertmanager - mountPath: /alertmanager - volumes: - - name: config-volume - configMap: - name: alertmanager - - name: templates-volume - configMap: - name: alertmanager-templates - - name: alertmanager - emptyDir: {} ---- -apiVersion: v1 -kind: Service -metadata: - annotations: - prometheus.io/scrape: 'true' - prometheus.io/path: '/metrics' - labels: - name: alertmanager - name: alertmanager - namespace: monitoring -spec: - selector: - app: alertmanager - type: NodePort - ports: - - name: alertmanager - protocol: TCP - port: 9093 - targetPort: 9093 ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: grafana-core - namespace: monitoring - labels: - app: grafana - component: core -spec: - replicas: 1 - selector: - matchLabels: - app: grafana - template: - metadata: - labels: - app: grafana - component: core - spec: - containers: - - image: grafana/grafana:4.2.0 - name: grafana-core - imagePullPolicy: IfNotPresent - # env: - resources: - # keep request = limit to keep this container in guaranteed class - limits: - cpu: 100m - memory: 100Mi - requests: - cpu: 100m - memory: 100Mi - env: - # The following env variables set up basic auth twith the default admin user and admin password. - - name: GF_AUTH_BASIC_ENABLED - value: "true" - - name: GF_SECURITY_ADMIN_USER - valueFrom: - secretKeyRef: - name: grafana - key: admin-username - - name: GF_SECURITY_ADMIN_PASSWORD - valueFrom: - secretKeyRef: - name: grafana - key: admin-password - - name: GF_AUTH_ANONYMOUS_ENABLED - value: "false" - # - name: GF_AUTH_ANONYMOUS_ORG_ROLE - # value: Admin - # does not really work, because of template variables in exported dashboards: - # - name: GF_DASHBOARDS_JSON_ENABLED - # value: "true" - readinessProbe: - httpGet: - path: /login - port: 3000 - # initialDelaySeconds: 30 - # timeoutSeconds: 1 - volumeMounts: - - name: grafana-persistent-storage - mountPath: /var/lib/grafana - volumes: - - name: grafana-persistent-storage - emptyDir: {} ---- -apiVersion: v1 -data: - grafana-net-2-dashboard.json: | - { - "__inputs": [{ - "name": "DS_PROMETHEUS", - "label": "Prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }], - "__requires": [{ - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - }, { - "type": "panel", - "id": "text", - "name": "Text", - "version": "" - }, { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "3.1.0" - }, { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }], - "id": null, - "title": "Prometheus Stats", - "tags": [], - "style": "dark", - "timezone": "browser", - "editable": true, - "hideControls": true, - "sharedCrosshair": false, - "rows": [{ - "collapse": false, - "editable": true, - "height": 178, - "panels": [{ - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], - "datasource": "${DS_PROMETHEUS}", - "decimals": 1, - "editable": true, - "error": false, - "format": "s", - "id": 5, - "interval": null, - "links": [], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "(time() - container_start_time_seconds{container_name=\"kube-apiserver\"})", - "intervalFactor": 2, - "refId": "A", - "step": 4 - }], - "thresholds": "", - "title": "Uptime", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current", - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "rangeMaps": [{ - "from": "null", - "to": "null", - "text": "N/A" - }], - "mappingType": 1, - "gauge": { - "show": false, - "minValue": 0, - "maxValue": 100, - "thresholdMarkers": true, - "thresholdLabels": false - } - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "id": 6, - "interval": null, - "links": [], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [{ - "expr": "prometheus_local_storage_memory_series", - "intervalFactor": 2, - "refId": "A", - "step": 4 - }], - "thresholds": "1,5", - "title": "Local Storage Memory Series", - "type": "singlestat", - "valueFontSize": "70%", - "valueMaps": [], - "valueName": "current", - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "rangeMaps": [{ - "from": "null", - "to": "null", - "text": "N/A" - }], - "mappingType": 1, - "gauge": { - "show": false, - "minValue": 0, - "maxValue": 100, - "thresholdMarkers": true, - "thresholdLabels": false - } - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "id": 7, - "interval": null, - "links": [], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": true - }, - "targets": [{ - "expr": "prometheus_local_storage_indexing_queue_length", - "intervalFactor": 2, - "refId": "A", - "step": 4 - }], - "thresholds": "500,4000", - "title": "Internal Storage Queue Length", - "type": "singlestat", - "valueFontSize": "70%", - "valueMaps": [{ - "op": "=", - "text": "Empty", - "value": "0" - }], - "valueName": "current", - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "rangeMaps": [{ - "from": "null", - "to": "null", - "text": "N/A" - }], - "mappingType": 1, - "gauge": { - "show": false, - "minValue": 0, - "maxValue": 100, - "thresholdMarkers": true, - "thresholdLabels": false - } - }, { - "content": "\"Prometheus\nPrometheus\n\n

You're using Prometheus, an open-source systems monitoring and alerting toolkit originally built at SoundCloud. For more information, check out the Grafana and Prometheus projects.

", - "editable": true, - "error": false, - "id": 9, - "links": [], - "mode": "html", - "span": 3, - "style": {}, - "title": "", - "transparent": true, - "type": "text" - }], - "title": "New row" - }, { - "collapse": false, - "editable": true, - "height": 227, - "panels": [{ - "aliasColors": { - "prometheus": "#C15C17", - "{instance=\"localhost:9090\",job=\"prometheus\"}": "#C15C17" - }, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [{ - "expr": "rate(prometheus_local_storage_ingested_samples_total[5m])", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{job}}", - "metric": "", - "refId": "A", - "step": 2 - }], - "timeFrom": null, - "timeShift": null, - "title": "Samples ingested (rate-5m)", - "tooltip": { - "shared": true, - "value_type": "cumulative", - "ordering": "alphabetical", - "msResolution": false - }, - "type": "graph", - "yaxes": [{ - "show": true, - "min": null, - "max": null, - "logBase": 1, - "format": "short" - }, { - "show": true, - "min": null, - "max": null, - "logBase": 1, - "format": "short" - }], - "xaxis": { - "show": true - } - }, { - "content": "#### Samples Ingested\nThis graph displays the count of samples ingested by the Prometheus server, as measured over the last 5 minutes, per time series in the range vector. When troubleshooting an issue on IRC or Github, this is often the first stat requested by the Prometheus team. ", - "editable": true, - "error": false, - "id": 8, - "links": [], - "mode": "markdown", - "span": 2.995914043583536, - "style": {}, - "title": "", - "transparent": true, - "type": "text" - }], - "title": "New row" - }, { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [{ - "aliasColors": { - "prometheus": "#F9BA8F", - "{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}": "#F9BA8F" - }, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 2, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 5, - "stack": false, - "steppedLine": false, - "targets": [{ - "expr": "rate(prometheus_target_interval_length_seconds_count[5m])", - "intervalFactor": 2, - "legendFormat": "{{job}}", - "refId": "A", - "step": 2 - }], - "timeFrom": null, - "timeShift": null, - "title": "Target Scrapes (last 5m)", - "tooltip": { - "shared": true, - "value_type": "cumulative", - "ordering": "alphabetical", - "msResolution": false - }, - "type": "graph", - "yaxes": [{ - "show": true, - "min": null, - "max": null, - "logBase": 1, - "format": "short" - }, { - "show": true, - "min": null, - "max": null, - "logBase": 1, - "format": "short" - }], - "xaxis": { - "show": true - } - }, { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 14, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [{ - "expr": "prometheus_target_interval_length_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{quantile}} ({{interval}})", - "metric": "", - "refId": "A", - "step": 2 - }], - "timeFrom": null, - "timeShift": null, - "title": "Scrape Duration", - "tooltip": { - "shared": true, - "value_type": "cumulative", - "ordering": "alphabetical", - "msResolution": false - }, - "type": "graph", - "yaxes": [{ - "show": true, - "min": null, - "max": null, - "logBase": 1, - "format": "short" - }, { - "show": true, - "min": null, - "max": null, - "logBase": 1, - "format": "short" - }], - "xaxis": { - "show": true - } - }, { - "content": "#### Scrapes\nPrometheus scrapes metrics from instrumented jobs, either directly or via an intermediary push gateway for short-lived jobs. Target scrapes will show how frequently targets are scraped, as measured over the last 5 minutes, per time series in the range vector. Scrape Duration will show how long the scrapes are taking, with percentiles available as series. ", - "editable": true, - "error": false, - "id": 11, - "links": [], - "mode": "markdown", - "span": 3, - "style": {}, - "title": "", - "transparent": true, - "type": "text" - }], - "title": "New row" - }, { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [{ - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 12, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 9, - "stack": false, - "steppedLine": false, - "targets": [{ - "expr": "prometheus_evaluator_duration_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{quantile}}", - "refId": "A", - "step": 2 - }], - "timeFrom": null, - "timeShift": null, - "title": "Rule Eval Duration", - "tooltip": { - "shared": true, - "value_type": "cumulative", - "ordering": "alphabetical", - "msResolution": false - }, - "type": "graph", - "yaxes": [{ - "show": true, - "min": null, - "max": null, - "logBase": 1, - "format": "percentunit", - "label": "" - }, { - "show": true, - "min": null, - "max": null, - "logBase": 1, - "format": "short" - }], - "xaxis": { - "show": true - } - }, { - "content": "#### Rule Evaluation Duration\nThis graph panel plots the duration for all evaluations to execute. The 50th percentile, 90th percentile and 99th percentile are shown as three separate series to help identify outliers that may be skewing the data.", - "editable": true, - "error": false, - "id": 15, - "links": [], - "mode": "markdown", - "span": 3, - "style": {}, - "title": "", - "transparent": true, - "type": "text" - }], - "title": "New row" - }], - "time": { - "from": "now-5m", - "to": "now" - }, - "timepicker": { - "now": true, - "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"], - "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] - }, - "templating": { - "list": [] - }, - "annotations": { - "list": [] - }, - "refresh": false, - "schemaVersion": 12, - "version": 0, - "links": [{ - "icon": "info", - "tags": [], - "targetBlank": true, - "title": "Grafana Docs", - "tooltip": "", - "type": "link", - "url": "http://www.grafana.org/docs" - }, { - "icon": "info", - "tags": [], - "targetBlank": true, - "title": "Prometheus Docs", - "type": "link", - "url": "http://prometheus.io/docs/introduction/overview/" - }], - "gnetId": 2, - "description": "The official, pre-built Prometheus Stats Dashboard." - } - grafana-net-737-dashboard.json: | - { - "__inputs": [{ - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }], - "__requires": [{ - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - }, { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "3.1.0" - }, { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }], - "id": null, - "title": "Kubernetes Pod Resources", - "description": "Shows resource usage of Kubernetes pods.", - "tags": [ - "kubernetes" - ], - "style": "dark", - "timezone": "browser", - "editable": true, - "hideControls": false, - "sharedCrosshair": false, - "rows": [{ - "collapse": false, - "editable": true, - "height": "250px", - "panels": [{ - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "height": "180px", - "id": 4, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [{ - "from": "null", - "text": "N/A", - "to": "null" - }], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "sum (container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum (machine_memory_bytes{instance=~\"^$instance$\"}) * 100", - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 2 - }], - "thresholds": "65, 90", - "timeFrom": "1m", - "timeShift": null, - "title": "Memory Working Set", - "transparent": false, - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current" - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "height": "180px", - "id": 6, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [{ - "from": "null", - "text": "N/A", - "to": "null" - }], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "sum(rate(container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m])) / sum (machine_cpu_cores{instance=~\"^$instance$\"}) * 100", - "interval": "10s", - "intervalFactor": 1, - "refId": "A", - "step": 10 - }], - "thresholds": "65, 90", - "timeFrom": "1m", - "timeShift": null, - "title": "Cpu Usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current" - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "height": "180px", - "id": 7, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [{ - "from": "null", - "text": "N/A", - "to": "null" - }], - "span": 4, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum(container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"}) * 100", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "", - "metric": "", - "refId": "A", - "step": 10 - }], - "thresholds": "65, 90", - "timeFrom": "1m", - "timeShift": null, - "title": "Filesystem Usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current" - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "height": "1px", - "hideTimeOverride": true, - "id": 9, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "20%", - "prefix": "", - "prefixFontSize": "20%", - "rangeMaps": [{ - "from": "null", - "text": "N/A", - "to": "null" - }], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "sum(container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"})", - "interval": "10s", - "intervalFactor": 1, - "refId": "A", - "step": 10 - }], - "thresholds": "", - "timeFrom": "1m", - "title": "Used", - "type": "singlestat", - "valueFontSize": "50%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current" - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "height": "1px", - "hideTimeOverride": true, - "id": 10, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [{ - "from": "null", - "text": "N/A", - "to": "null" - }], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "sum (machine_memory_bytes{instance=~\"^$instance$\"})", - "interval": "10s", - "intervalFactor": 1, - "refId": "A", - "step": 10 - }], - "thresholds": "", - "timeFrom": "1m", - "title": "Total", - "type": "singlestat", - "valueFontSize": "50%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current" - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "height": "1px", - "hideTimeOverride": true, - "id": 11, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": " cores", - "postfixFontSize": "30%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [{ - "from": "null", - "text": "N/A", - "to": "null" - }], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m]))", - "interval": "10s", - "intervalFactor": 1, - "refId": "A", - "step": 10 - }], - "thresholds": "", - "timeFrom": "1m", - "timeShift": null, - "title": "Used", - "type": "singlestat", - "valueFontSize": "50%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current" - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "height": "1px", - "hideTimeOverride": true, - "id": 12, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": " cores", - "postfixFontSize": "30%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [{ - "from": "null", - "text": "N/A", - "to": "null" - }], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "sum (machine_cpu_cores{instance=~\"^$instance$\"})", - "interval": "10s", - "intervalFactor": 1, - "refId": "A", - "step": 10 - }], - "thresholds": "", - "timeFrom": "1m", - "title": "Total", - "type": "singlestat", - "valueFontSize": "50%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current" - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "height": "1px", - "hideTimeOverride": true, - "id": 13, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [{ - "from": "null", - "text": "N/A", - "to": "null" - }], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"})", - "interval": "10s", - "intervalFactor": 1, - "refId": "A", - "step": 10 - }], - "thresholds": "", - "timeFrom": "1m", - "title": "Used", - "type": "singlestat", - "valueFontSize": "50%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current" - }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "format": "bytes", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "height": "1px", - "hideTimeOverride": true, - "id": 14, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [{ - "name": "value to text", - "value": 1 - }, { - "name": "range to text", - "value": 2 - }], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [{ - "from": "null", - "text": "N/A", - "to": "null" - }], - "span": 2, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "targets": [{ - "expr": "sum (container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"})", - "interval": "10s", - "intervalFactor": 1, - "refId": "A", - "step": 10 - }], - "thresholds": "", - "timeFrom": "1m", - "title": "Total", - "type": "singlestat", - "valueFontSize": "50%", - "valueMaps": [{ - "op": "=", - "text": "N/A", - "value": "null" - }], - "valueName": "current" - }, { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)", - "thresholdLine": false - }, - "height": "200px", - "id": 32, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": 200, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [{ - "expr": "sum(rate(container_network_receive_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))", - "interval": "", - "intervalFactor": 2, - "legendFormat": "receive", - "metric": "network", - "refId": "A", - "step": 240 - }, { - "expr": "- sum(rate(container_network_transmit_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))", - "interval": "", - "intervalFactor": 2, - "legendFormat": "transmit", - "metric": "network", - "refId": "B", - "step": 240 - }], - "timeFrom": null, - "timeShift": null, - "title": "Network", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transparent": false, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [{ - "format": "Bps", - "label": "transmit / receive", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, { - "format": "Bps", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - }] - }], - "showTitle": true, - "title": "all pods" - }, { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [{ - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 3, - "editable": true, - "error": false, - "fill": 0, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "height": "", - "id": 17, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [{ - "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "metric": "container_cpu", - "refId": "A", - "step": 240 - }], - "timeFrom": null, - "timeShift": null, - "title": "Cpu Usage", - "tooltip": { - "msResolution": true, - "shared": false, - "sort": 2, - "value_type": "cumulative" - }, - "transparent": false, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [{ - "format": "none", - "label": "cores", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - }] - }, { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "fill": 0, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 33, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [{ - "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "metric": "", - "refId": "A", - "step": 240 - }], - "timeFrom": null, - "timeShift": null, - "title": "Memory Working Set", - "tooltip": { - "msResolution": false, - "shared": false, - "sort": 2, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [{ - "format": "bytes", - "label": "used", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - }] - }, { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 16, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": 200, - "sort": "avg", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [{ - "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }} < in", - "metric": "network", - "refId": "A", - "step": 240 - }, { - "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }} > out", - "metric": "network", - "refId": "B", - "step": 240 - }], - "timeFrom": null, - "timeShift": null, - "title": "Network", - "tooltip": { - "msResolution": false, - "shared": false, - "sort": 2, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [{ - "format": "Bps", - "label": "transmit / receive", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - }] - }, { - "aliasColors": {}, - "bars": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "id": 34, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": 200, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "span": 12, - "stack": false, - "steppedLine": false, - "targets": [{ - "expr": "sum(container_fs_usage_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ pod_name }}", - "metric": "network", - "refId": "A", - "step": 240 - }], - "timeFrom": null, - "timeShift": null, - "title": "Filesystem", - "tooltip": { - "msResolution": false, - "shared": false, - "sort": 2, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "show": true - }, - "yaxes": [{ - "format": "bytes", - "label": "used", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - }] - }], - "showTitle": true, - "title": "each pod" - }], - "time": { - "from": "now-3d", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "templating": { - "list": [{ - "allValue": ".*", - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Instance", - "multi": false, - "name": "instance", - "options": [], - "query": "label_values(instance)", - "refresh": 1, - "regex": "", - "type": "query" - }, { - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "Namespace", - "multi": true, - "name": "namespace", - "options": [], - "query": "label_values(namespace)", - "refresh": 1, - "regex": "", - "type": "query" - }] - }, - "annotations": { - "list": [] - }, - "refresh": false, - "schemaVersion": 12, - "version": 8, - "links": [], - "gnetId": 737 - } - prometheus-datasource.json: | - { - "name": "prometheus", - "type": "prometheus", - "url": "http://prometheus:9090", - "access": "proxy", - "basicAuth": false - } -kind: ConfigMap -metadata: - creationTimestamp: null - name: grafana-import-dashboards - namespace: monitoring ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: grafana-import-dashboards - namespace: monitoring - labels: - app: grafana - component: import-dashboards -spec: - template: - metadata: - name: grafana-import-dashboards - labels: - app: grafana - component: import-dashboards - spec: - serviceAccountName: prometheus-k8s - initContainers: - - name: wait-for-grafana - image: giantswarm/tiny-tools - args: - - /bin/sh - - -c - - > - set -x; - while [ $(curl -Lsw '%{http_code}' "http://grafana:3000" -o /dev/null) -ne 200 ]; do - echo '.' - sleep 15; - done - containers: - - name: grafana-import-dashboards - image: giantswarm/tiny-tools - command: ["/bin/sh", "-c"] - workingDir: /opt/grafana-import-dashboards - args: - - > - for file in *-datasource.json ; do - if [ -e "$file" ] ; then - echo "importing $file" && - curl --silent --fail --show-error \ - --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/datasources \ - --header "Content-Type: application/json" \ - --data-binary "@$file" ; - echo "" ; - fi - done ; - for file in *-dashboard.json ; do - if [ -e "$file" ] ; then - echo "importing $file" && - ( echo '{"dashboard":'; \ - cat "$file"; \ - echo ',"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}' ) \ - | jq -c '.' \ - | curl --silent --fail --show-error \ - --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/dashboards/import \ - --header "Content-Type: application/json" \ - --data-binary "@-" ; - echo "" ; - fi - done - - env: - - name: GF_ADMIN_USER - valueFrom: - secretKeyRef: - name: grafana - key: admin-username - - name: GF_ADMIN_PASSWORD - valueFrom: - secretKeyRef: - name: grafana - key: admin-password - volumeMounts: - - name: config-volume - mountPath: /opt/grafana-import-dashboards - restartPolicy: Never - volumes: - - name: config-volume - configMap: - name: grafana-import-dashboards ---- -# apiVersion: extensions/v1beta1 -# kind: Ingress -# metadata: -# name: grafana -# namespace: monitoring -# spec: -# rules: -# - host: ..k8s.gigantic.io -# http: -# paths: -# - path: / -# backend: -# serviceName: grafana -# servicePort: 3000 ---- -apiVersion: v1 -kind: Secret -data: - admin-password: YWRtaW4= - admin-username: YWRtaW4= -metadata: - name: grafana - namespace: monitoring -type: Opaque ---- -apiVersion: v1 -kind: Service -metadata: - name: grafana - namespace: monitoring - labels: - app: grafana - component: core -spec: - type: NodePort - ports: - - port: 3000 - selector: - app: grafana - component: core --- apiVersion: v1 data: @@ -2434,9 +9,9 @@ data: rule_files: - "/etc/prometheus-rules/*.rules" scrape_configs: - - job_name: 'node' + - job_name: 'jmx-kafka' static_configs: - - targets: ['localhost:9100'] + - targets: ['54.241.141.202:32401'] # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37 - job_name: 'kubernetes-nodes' tls_config: From ce225b1597bd66b94994b68b81cc3d97583f422f Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Mon, 25 Jul 2022 04:43:22 +0000 Subject: [PATCH 11/12] separate grafana and prometheus2 --- alertmanager/Kustomization | 5 + alertmanager/alert-config.yml | 65 + alertmanager/alert-svc.yml | 20 + alertmanager/alert-template-config.yml | 177 ++ alertmanager/alertmanager.yml | 42 + {yahoo-kafka-manager => cmak}/Kustomization | 0 .../kafka-manager-service.yml | 0 .../kafka-manager.yml | 0 grafana/grafana-dashboard.yml | 1841 +++++++++++++++++ grafana/grafana-deploy.yml | 66 + grafana/grafana-secret.yml | 10 + grafana/grafana-svc.yml | 16 + grafana/import-dashboard-job.yml | 80 + .../kminion-service.yaml | 0 .../kminion.yaml | 0 .../kustomization.yaml | 0 .../avro-tools}/avro-tools-config.yml | 0 .../avro-tools}/rest-service.yml | 0 .../avro-tools}/rest.yml | 0 .../avro-tools}/schemas-service.yml | 0 .../avro-tools}/schemas.yml | 0 .../avro-tools}/test/70rest-test1.yml | 0 .../avro-tools}/test/rest-curl.yml | 0 namespace.yml | 5 - ...onfig.yml => kafka-jmx-metrics-config.yml} | 0 prometheus/node-exporter-daemonset.yml | 53 - prometheus/node-exporter-svc.yml | 18 - rbac-namespace-default/Kustomization | 2 + rbac-namespace-default/namespace.yml | 12 + .../prometheus.yml | 43 +- .../aks-storageclass-broker-managed.yaml | 9 - .../aks-storageclass-zookeeper-managed.yaml | 9 - variants/aks-managed/kustomization.yaml | 7 - variants/aks-managed/volume-claims.yaml | 32 - .../aws-gp2/aws-storageclass-broker-gp2.yaml | 11 - .../aws-storageclass-zookeeper-gp2.yaml | 11 - variants/aws-gp2/kustomization.yaml | 7 - variants/aws-gp2/volume-claims.yaml | 32 - variants/dev-small/jmx-disable.yaml | 13 - variants/dev-small/kustomization.yaml | 20 - variants/dev-small/listener-localhost.json | 4 - variants/dev-small/num-partitions-1.json | 4 - .../gke-storageclass-broker-pd.yaml | 9 - ...lass-zookeeper-regional-euwest1cd-ssh.yaml | 11 - .../gke-storageclass-zookeeper-ssd.yaml | 9 - variants/gke-regional/kustomization.yaml | 8 - variants/gke-regional/volume-claims.yaml | 48 - .../additional-scrape-configs.yaml | 11 - .../alertmanager-main-scale-1.yaml | 7 - .../k8s-kafka-rbac.yaml | 32 - .../k8s-kafka-servicemonitor.yaml | 38 - .../k8s-minion-servicemonitor.yaml | 22 - .../kustomization.yaml | 30 - .../prometheus-k8s-2.9.2.yaml | 8 - .../prometheus-k8s-nodeport.yaml | 10 - .../prometheus-k8s-scale-1.yaml | 7 - ...iscovery-by-prometheus-io-annotations.yaml | 35 - .../kafka-scale1-overrides.json | 10 - variants/scale-1-ephemeral/kafka.yaml | 12 - variants/scale-1-ephemeral/kustomization.yaml | 5 - variants/scale-1-ephemeral/zookeeper.yaml | 32 - variants/scale-1/kafka-scale1-overrides.json | 12 - variants/scale-1/kafka.yaml | 7 - variants/scale-1/kustomization.yaml | 13 - variants/scale-1/zookeeper.yaml | 38 - variants/scale-2/kafka-scale2-overrides.json | 10 - variants/scale-2/kafka.yaml | 7 - variants/scale-2/kustomization.yaml | 13 - variants/scale-2/zookeeper.yaml | 33 - .../kafka-zookeeper-connect-only-zoo.json | 6 - variants/scale-3-3/kustomization.yaml | 12 - variants/scale-3-3/only-zoo-3.yaml | 33 - .../kafka-zookeeper-connect-only-zoo.json | 4 - variants/scale-3-5-nopzoo/kustomization.yaml | 12 - variants/scale-3-5-nopzoo/only-zoo-5.yaml | 33 - variants/scale-3-5/kustomization.yaml | 2 - variants/scale-6-9/kafka-6.yaml | 6 - .../kafka-zookeeper-connect-only-zoo.json | 6 - variants/scale-6-9/kustomization.yaml | 26 - variants/scale-6-9/lifecycle-remove.json | 4 - variants/scale-6-9/zoo-9.yaml | 33 - .../zoo-readiness-without-shell.yaml | 6 - 82 files changed, 2362 insertions(+), 912 deletions(-) create mode 100644 alertmanager/Kustomization create mode 100644 alertmanager/alert-config.yml create mode 100644 alertmanager/alert-svc.yml create mode 100644 alertmanager/alert-template-config.yml create mode 100644 alertmanager/alertmanager.yml rename {yahoo-kafka-manager => cmak}/Kustomization (100%) rename {yahoo-kafka-manager => cmak}/kafka-manager-service.yml (100%) rename {yahoo-kafka-manager => cmak}/kafka-manager.yml (100%) create mode 100644 grafana/grafana-dashboard.yml create mode 100644 grafana/grafana-deploy.yml create mode 100644 grafana/grafana-secret.yml create mode 100644 grafana/grafana-svc.yml create mode 100644 grafana/import-dashboard-job.yml rename {consumers-prometheus => kminions}/kminion-service.yaml (100%) rename {consumers-prometheus => kminions}/kminion.yaml (100%) rename {consumers-prometheus => kminions}/kustomization.yaml (100%) rename {avro-tools => maintenance/avro-tools}/avro-tools-config.yml (100%) rename {avro-tools => maintenance/avro-tools}/rest-service.yml (100%) rename {avro-tools => maintenance/avro-tools}/rest.yml (100%) rename {avro-tools => maintenance/avro-tools}/schemas-service.yml (100%) rename {avro-tools => maintenance/avro-tools}/schemas.yml (100%) rename {avro-tools => maintenance/avro-tools}/test/70rest-test1.yml (100%) rename {avro-tools => maintenance/avro-tools}/test/rest-curl.yml (100%) delete mode 100644 namespace.yml rename prometheus/{metrics-config.yml => kafka-jmx-metrics-config.yml} (100%) delete mode 100644 prometheus/node-exporter-daemonset.yml delete mode 100644 prometheus/node-exporter-svc.yml create mode 100644 rbac-namespace-default/namespace.yml rename variants/prometheus-operator-example/k8s-cluster-rbac.yaml => rbac-namespace-default/prometheus.yml (54%) delete mode 100644 variants/aks-managed/aks-storageclass-broker-managed.yaml delete mode 100644 variants/aks-managed/aks-storageclass-zookeeper-managed.yaml delete mode 100644 variants/aks-managed/kustomization.yaml delete mode 100644 variants/aks-managed/volume-claims.yaml delete mode 100644 variants/aws-gp2/aws-storageclass-broker-gp2.yaml delete mode 100644 variants/aws-gp2/aws-storageclass-zookeeper-gp2.yaml delete mode 100644 variants/aws-gp2/kustomization.yaml delete mode 100644 variants/aws-gp2/volume-claims.yaml delete mode 100644 variants/dev-small/jmx-disable.yaml delete mode 100644 variants/dev-small/kustomization.yaml delete mode 100644 variants/dev-small/listener-localhost.json delete mode 100644 variants/dev-small/num-partitions-1.json delete mode 100644 variants/gke-regional/gke-storageclass-broker-pd.yaml delete mode 100644 variants/gke-regional/gke-storageclass-zookeeper-regional-euwest1cd-ssh.yaml delete mode 100644 variants/gke-regional/gke-storageclass-zookeeper-ssd.yaml delete mode 100644 variants/gke-regional/kustomization.yaml delete mode 100644 variants/gke-regional/volume-claims.yaml delete mode 100644 variants/prometheus-operator-example/additional-scrape-configs.yaml delete mode 100644 variants/prometheus-operator-example/alertmanager-main-scale-1.yaml delete mode 100644 variants/prometheus-operator-example/k8s-kafka-rbac.yaml delete mode 100644 variants/prometheus-operator-example/k8s-kafka-servicemonitor.yaml delete mode 100644 variants/prometheus-operator-example/k8s-minion-servicemonitor.yaml delete mode 100644 variants/prometheus-operator-example/kustomization.yaml delete mode 100644 variants/prometheus-operator-example/prometheus-k8s-2.9.2.yaml delete mode 100644 variants/prometheus-operator-example/prometheus-k8s-nodeport.yaml delete mode 100644 variants/prometheus-operator-example/prometheus-k8s-scale-1.yaml delete mode 100644 variants/prometheus-operator-example/scrape-configs/pods-discovery-by-prometheus-io-annotations.yaml delete mode 100644 variants/scale-1-ephemeral/kafka-scale1-overrides.json delete mode 100644 variants/scale-1-ephemeral/kafka.yaml delete mode 100644 variants/scale-1-ephemeral/kustomization.yaml delete mode 100644 variants/scale-1-ephemeral/zookeeper.yaml delete mode 100644 variants/scale-1/kafka-scale1-overrides.json delete mode 100644 variants/scale-1/kafka.yaml delete mode 100644 variants/scale-1/kustomization.yaml delete mode 100644 variants/scale-1/zookeeper.yaml delete mode 100644 variants/scale-2/kafka-scale2-overrides.json delete mode 100644 variants/scale-2/kafka.yaml delete mode 100644 variants/scale-2/kustomization.yaml delete mode 100644 variants/scale-2/zookeeper.yaml delete mode 100644 variants/scale-3-3/kafka-zookeeper-connect-only-zoo.json delete mode 100644 variants/scale-3-3/kustomization.yaml delete mode 100644 variants/scale-3-3/only-zoo-3.yaml delete mode 100644 variants/scale-3-5-nopzoo/kafka-zookeeper-connect-only-zoo.json delete mode 100644 variants/scale-3-5-nopzoo/kustomization.yaml delete mode 100644 variants/scale-3-5-nopzoo/only-zoo-5.yaml delete mode 100644 variants/scale-3-5/kustomization.yaml delete mode 100644 variants/scale-6-9/kafka-6.yaml delete mode 100644 variants/scale-6-9/kafka-zookeeper-connect-only-zoo.json delete mode 100644 variants/scale-6-9/kustomization.yaml delete mode 100644 variants/scale-6-9/lifecycle-remove.json delete mode 100644 variants/scale-6-9/zoo-9.yaml delete mode 100644 variants/scale-6-9/zoo-readiness-without-shell.yaml diff --git a/alertmanager/Kustomization b/alertmanager/Kustomization new file mode 100644 index 00000000..324049ed --- /dev/null +++ b/alertmanager/Kustomization @@ -0,0 +1,5 @@ +resources: +- alert-config.yml +- alert-template-config.yml +- alertmanager.yml +- alert-svc.yml diff --git a/alertmanager/alert-config.yml b/alertmanager/alert-config.yml new file mode 100644 index 00000000..99806f1f --- /dev/null +++ b/alertmanager/alert-config.yml @@ -0,0 +1,65 @@ +--- +kind: ConfigMap +apiVersion: v1 +metadata: + name: alertmanager + namespace: monitoring +data: + config.yml: |- + global: + # ResolveTimeout is the time after which an alert is declared resolved + # if it has not been updated. + resolve_timeout: 5m + # The smarthost and SMTP sender used for mail notifications. + smtp_smarthost: 'smtp.gmail.com:587' + smtp_from: 'foo@bar.com' + smtp_auth_username: 'foo@bar.com' + smtp_auth_password: 'barfoo' + # The API URL to use for Slack notifications. + slack_api_url: 'https://hooks.slack.com/services/some/api/token' + # # The directory from which notification templates are read. + templates: + - '/etc/alertmanager-templates/*.tmpl' + # The root route on which each incoming alert enters. + route: + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + group_by: ['alertname', 'cluster', 'service'] + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + group_wait: 30s + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + #repeat_interval: 1m + repeat_interval: 15m + # A default receiver + # If an alert isn't caught by a route, send it to default. + receiver: default + # All the above attributes are inherited by all child routes and can + # overwritten on each. + # The child route trees. + routes: + # Send severity=slack alerts to slack. + - match: + severity: slack + receiver: slack_alert + # - match: + # severity: email + # receiver: email_alert + receivers: + - name: 'default' + slack_configs: + - channel: '#alertmanager-test' + text: '{{ template "slack.devops.text" . }}' + send_resolved: true + - name: 'slack_alert' + slack_configs: + - channel: '#alertmanager-test' + send_resolved: true diff --git a/alertmanager/alert-svc.yml b/alertmanager/alert-svc.yml new file mode 100644 index 00000000..a002f60c --- /dev/null +++ b/alertmanager/alert-svc.yml @@ -0,0 +1,20 @@ +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: 'true' + prometheus.io/path: '/metrics' + labels: + name: alertmanager + name: alertmanager + namespace: monitoring +spec: + selector: + app: alertmanager + type: NodePort + ports: + - name: alertmanager + protocol: TCP + port: 9093 + targetPort: 9093 diff --git a/alertmanager/alert-template-config.yml b/alertmanager/alert-template-config.yml new file mode 100644 index 00000000..74451d30 --- /dev/null +++ b/alertmanager/alert-template-config.yml @@ -0,0 +1,177 @@ +apiVersion: v1 +data: + default.tmpl: | + {{ define "__alertmanager" }}AlertManager{{ end }} + {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }} + {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }} + {{ define "__description" }}{{ end }} + {{ define "__text_alert_list" }}{{ range . }}Labels: + {{ range .Labels.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Annotations: + {{ range .Annotations.SortedPairs }} - {{ .Name }} = {{ .Value }} + {{ end }}Source: {{ .GeneratorURL }} + {{ end }}{{ end }} + {{ define "slack.default.title" }}{{ template "__subject" . }}{{ end }} + {{ define "slack.default.username" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "slack.default.fallback" }}{{ template "slack.default.title" . }} | {{ template "slack.default.titlelink" . }}{{ end }} + {{ define "slack.default.pretext" }}{{ end }} + {{ define "slack.default.titlelink" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "slack.default.iconemoji" }}{{ end }} + {{ define "slack.default.iconurl" }}{{ end }} + {{ define "slack.default.text" }}{{ end }} + {{ define "hipchat.default.from" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "hipchat.default.message" }}{{ template "__subject" . }}{{ end }} + {{ define "pagerduty.default.description" }}{{ template "__subject" . }}{{ end }} + {{ define "pagerduty.default.client" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "pagerduty.default.clientURL" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "pagerduty.default.instances" }}{{ template "__text_alert_list" . }}{{ end }} + {{ define "opsgenie.default.message" }}{{ template "__subject" . }}{{ end }} + {{ define "opsgenie.default.description" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} + {{ if gt (len .Alerts.Firing) 0 -}} + Alerts Firing: + {{ template "__text_alert_list" .Alerts.Firing }} + {{- end }} + {{ if gt (len .Alerts.Resolved) 0 -}} + Alerts Resolved: + {{ template "__text_alert_list" .Alerts.Resolved }} + {{- end }} + {{- end }} + {{ define "opsgenie.default.source" }}{{ template "__alertmanagerURL" . }}{{ end }} + {{ define "victorops.default.message" }}{{ template "__subject" . }} | {{ template "__alertmanagerURL" . }}{{ end }} + {{ define "victorops.default.from" }}{{ template "__alertmanager" . }}{{ end }} + {{ define "email.default.subject" }}{{ template "__subject" . }}{{ end }} + {{ define "email.default.html" }} + + + + + + + {{ template "__subject" . }} + + + + + + + + +
+
+ + + + + + + +
+ {{ .Alerts | len }} alert{{ if gt (len .Alerts) 1 }}s{{ end }} for {{ range .GroupLabels.SortedPairs }} + {{ .Name }}={{ .Value }} + {{ end }} +
+ + + + + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + {{ range .Alerts.Firing }} + + + + {{ end }} + {{ if gt (len .Alerts.Resolved) 0 }} + {{ if gt (len .Alerts.Firing) 0 }} + + + + {{ end }} + + + + {{ end }} + {{ range .Alerts.Resolved }} + + + + {{ end }} +
+ View in {{ template "__alertmanager" . }} +
+ [{{ .Alerts.Firing | len }}] Firing +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+
+
+
+ [{{ .Alerts.Resolved | len }}] Resolved +
+ Labels
+ {{ range .Labels.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + {{ if gt (len .Annotations) 0 }}Annotations
{{ end }} + {{ range .Annotations.SortedPairs }}{{ .Name }} = {{ .Value }}
{{ end }} + Source
+
+
+
+
+ + + {{ end }} + {{ define "pushover.default.title" }}{{ template "__subject" . }}{{ end }} + {{ define "pushover.default.message" }}{{ .CommonAnnotations.SortedPairs.Values | join " " }} + {{ if gt (len .Alerts.Firing) 0 }} + Alerts Firing: + {{ template "__text_alert_list" .Alerts.Firing }} + {{ end }} + {{ if gt (len .Alerts.Resolved) 0 }} + Alerts Resolved: + {{ template "__text_alert_list" .Alerts.Resolved }} + {{ end }} + {{ end }} + {{ define "pushover.default.url" }}{{ template "__alertmanagerURL" . }}{{ end }} + slack.tmpl: | + {{ define "slack.devops.text" }} + {{range .Alerts}}{{.Annotations.DESCRIPTION}} + {{end}} + {{ end }} +kind: ConfigMap +metadata: + creationTimestamp: null + name: alertmanager-templates + namespace: monitoring diff --git a/alertmanager/alertmanager.yml b/alertmanager/alertmanager.yml new file mode 100644 index 00000000..5954c30b --- /dev/null +++ b/alertmanager/alertmanager.yml @@ -0,0 +1,42 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alertmanager + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app: alertmanager + template: + metadata: + name: alertmanager + labels: + app: alertmanager + spec: + containers: + - name: alertmanager + image: quay.io/prometheus/alertmanager:v0.7.1 + args: + - '-config.file=/etc/alertmanager/config.yml' + - '-storage.path=/alertmanager' + ports: + - name: alertmanager + containerPort: 9093 + volumeMounts: + - name: config-volume + mountPath: /etc/alertmanager + - name: templates-volume + mountPath: /etc/alertmanager-templates + - name: alertmanager + mountPath: /alertmanager + volumes: + - name: config-volume + configMap: + name: alertmanager + - name: templates-volume + configMap: + name: alertmanager-templates + - name: alertmanager + emptyDir: {} diff --git a/yahoo-kafka-manager/Kustomization b/cmak/Kustomization similarity index 100% rename from yahoo-kafka-manager/Kustomization rename to cmak/Kustomization diff --git a/yahoo-kafka-manager/kafka-manager-service.yml b/cmak/kafka-manager-service.yml similarity index 100% rename from yahoo-kafka-manager/kafka-manager-service.yml rename to cmak/kafka-manager-service.yml diff --git a/yahoo-kafka-manager/kafka-manager.yml b/cmak/kafka-manager.yml similarity index 100% rename from yahoo-kafka-manager/kafka-manager.yml rename to cmak/kafka-manager.yml diff --git a/grafana/grafana-dashboard.yml b/grafana/grafana-dashboard.yml new file mode 100644 index 00000000..1fddd5a7 --- /dev/null +++ b/grafana/grafana-dashboard.yml @@ -0,0 +1,1841 @@ +--- +apiVersion: v1 +data: + grafana-net-2-dashboard.json: | + { + "__inputs": [{ + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }], + "__requires": [{ + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, { + "type": "panel", + "id": "text", + "name": "Text", + "version": "" + }, { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.0" + }, { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }], + "id": null, + "title": "Prometheus Stats", + "tags": [], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": true, + "sharedCrosshair": false, + "rows": [{ + "collapse": false, + "editable": true, + "height": 178, + "panels": [{ + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": ["rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)"], + "datasource": "${DS_PROMETHEUS}", + "decimals": 1, + "editable": true, + "error": false, + "format": "s", + "id": 5, + "interval": null, + "links": [], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "(time() - container_start_time_seconds{container_name=\"kube-apiserver\"})", + "intervalFactor": 2, + "refId": "A", + "step": 4 + }], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current", + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "rangeMaps": [{ + "from": "null", + "to": "null", + "text": "N/A" + }], + "mappingType": 1, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "id": 6, + "interval": null, + "links": [], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [{ + "expr": "prometheus_local_storage_memory_series", + "intervalFactor": 2, + "refId": "A", + "step": 4 + }], + "thresholds": "1,5", + "title": "Local Storage Memory Series", + "type": "singlestat", + "valueFontSize": "70%", + "valueMaps": [], + "valueName": "current", + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "rangeMaps": [{ + "from": "null", + "to": "null", + "text": "N/A" + }], + "mappingType": 1, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": ["rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)"], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "none", + "id": 7, + "interval": null, + "links": [], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [{ + "expr": "prometheus_local_storage_indexing_queue_length", + "intervalFactor": 2, + "refId": "A", + "step": 4 + }], + "thresholds": "500,4000", + "title": "Internal Storage Queue Length", + "type": "singlestat", + "valueFontSize": "70%", + "valueMaps": [{ + "op": "=", + "text": "Empty", + "value": "0" + }], + "valueName": "current", + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "rangeMaps": [{ + "from": "null", + "to": "null", + "text": "N/A" + }], + "mappingType": 1, + "gauge": { + "show": false, + "minValue": 0, + "maxValue": 100, + "thresholdMarkers": true, + "thresholdLabels": false + } + }, { + "content": "\"Prometheus\nPrometheus\n\n

You're using Prometheus, an open-source systems monitoring and alerting toolkit originally built at SoundCloud. For more information, check out the Grafana and Prometheus projects.

", + "editable": true, + "error": false, + "id": 9, + "links": [], + "mode": "html", + "span": 3, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }], + "title": "New row" + }, { + "collapse": false, + "editable": true, + "height": 227, + "panels": [{ + "aliasColors": { + "prometheus": "#C15C17", + "{instance=\"localhost:9090\",job=\"prometheus\"}": "#C15C17" + }, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "rate(prometheus_local_storage_ingested_samples_total[5m])", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "metric": "", + "refId": "A", + "step": 2 + }], + "timeFrom": null, + "timeShift": null, + "title": "Samples ingested (rate-5m)", + "tooltip": { + "shared": true, + "value_type": "cumulative", + "ordering": "alphabetical", + "msResolution": false + }, + "type": "graph", + "yaxes": [{ + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }, { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }], + "xaxis": { + "show": true + } + }, { + "content": "#### Samples Ingested\nThis graph displays the count of samples ingested by the Prometheus server, as measured over the last 5 minutes, per time series in the range vector. When troubleshooting an issue on IRC or Github, this is often the first stat requested by the Prometheus team. ", + "editable": true, + "error": false, + "id": 8, + "links": [], + "mode": "markdown", + "span": 2.995914043583536, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }], + "title": "New row" + }, { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [{ + "aliasColors": { + "prometheus": "#F9BA8F", + "{instance=\"localhost:9090\",interval=\"5s\",job=\"prometheus\"}": "#F9BA8F" + }, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "rate(prometheus_target_interval_length_seconds_count[5m])", + "intervalFactor": 2, + "legendFormat": "{{job}}", + "refId": "A", + "step": 2 + }], + "timeFrom": null, + "timeShift": null, + "title": "Target Scrapes (last 5m)", + "tooltip": { + "shared": true, + "value_type": "cumulative", + "ordering": "alphabetical", + "msResolution": false + }, + "type": "graph", + "yaxes": [{ + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }, { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }], + "xaxis": { + "show": true + } + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "prometheus_target_interval_length_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{quantile}} ({{interval}})", + "metric": "", + "refId": "A", + "step": 2 + }], + "timeFrom": null, + "timeShift": null, + "title": "Scrape Duration", + "tooltip": { + "shared": true, + "value_type": "cumulative", + "ordering": "alphabetical", + "msResolution": false + }, + "type": "graph", + "yaxes": [{ + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }, { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }], + "xaxis": { + "show": true + } + }, { + "content": "#### Scrapes\nPrometheus scrapes metrics from instrumented jobs, either directly or via an intermediary push gateway for short-lived jobs. Target scrapes will show how frequently targets are scraped, as measured over the last 5 minutes, per time series in the range vector. Scrape Duration will show how long the scrapes are taking, with percentiles available as series. ", + "editable": true, + "error": false, + "id": 11, + "links": [], + "mode": "markdown", + "span": 3, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }], + "title": "New row" + }, { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [{ + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": null, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "prometheus_evaluator_duration_seconds{quantile!=\"0.01\", quantile!=\"0.05\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{quantile}}", + "refId": "A", + "step": 2 + }], + "timeFrom": null, + "timeShift": null, + "title": "Rule Eval Duration", + "tooltip": { + "shared": true, + "value_type": "cumulative", + "ordering": "alphabetical", + "msResolution": false + }, + "type": "graph", + "yaxes": [{ + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "percentunit", + "label": "" + }, { + "show": true, + "min": null, + "max": null, + "logBase": 1, + "format": "short" + }], + "xaxis": { + "show": true + } + }, { + "content": "#### Rule Evaluation Duration\nThis graph panel plots the duration for all evaluations to execute. The 50th percentile, 90th percentile and 99th percentile are shown as three separate series to help identify outliers that may be skewing the data.", + "editable": true, + "error": false, + "id": 15, + "links": [], + "mode": "markdown", + "span": 3, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }], + "title": "New row" + }], + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": ["5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d"], + "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] + }, + "templating": { + "list": [] + }, + "annotations": { + "list": [] + }, + "refresh": false, + "schemaVersion": 12, + "version": 0, + "links": [{ + "icon": "info", + "tags": [], + "targetBlank": true, + "title": "Grafana Docs", + "tooltip": "", + "type": "link", + "url": "http://www.grafana.org/docs" + }, { + "icon": "info", + "tags": [], + "targetBlank": true, + "title": "Prometheus Docs", + "type": "link", + "url": "http://prometheus.io/docs/introduction/overview/" + }], + "gnetId": 2, + "description": "The official, pre-built Prometheus Stats Dashboard." + } + grafana-net-737-dashboard.json: | + { + "__inputs": [{ + "name": "DS_PROMETHEUS", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + }], + "__requires": [{ + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + }, { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "3.1.0" + }, { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }], + "id": null, + "title": "Kubernetes Pod Resources", + "description": "Shows resource usage of Kubernetes pods.", + "tags": [ + "kubernetes" + ], + "style": "dark", + "timezone": "browser", + "editable": true, + "hideControls": false, + "sharedCrosshair": false, + "rows": [{ + "collapse": false, + "editable": true, + "height": "250px", + "panels": [{ + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "180px", + "id": 4, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum (machine_memory_bytes{instance=~\"^$instance$\"}) * 100", + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 2 + }], + "thresholds": "65, 90", + "timeFrom": "1m", + "timeShift": null, + "title": "Memory Working Set", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "180px", + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum(rate(container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m])) / sum (machine_cpu_cores{instance=~\"^$instance$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "65, 90", + "timeFrom": "1m", + "timeShift": null, + "title": "Cpu Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "180px", + "id": 7, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"}) / sum(container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"}) * 100", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 10 + }], + "thresholds": "65, 90", + "timeFrom": "1m", + "timeShift": null, + "title": "Filesystem Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 9, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "20%", + "prefix": "", + "prefixFontSize": "20%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum(container_memory_working_set_bytes{id=\"/\",instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Used", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 10, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (machine_memory_bytes{instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 11, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": " cores", + "postfixFontSize": "30%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (rate (container_cpu_usage_seconds_total{id=\"/\",instance=~\"^$instance$\"}[1m]))", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "timeShift": null, + "title": "Used", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 12, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": " cores", + "postfixFontSize": "30%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (machine_cpu_cores{instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 13, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum(container_fs_usage_bytes{id=\"/\",instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Used", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "height": "1px", + "hideTimeOverride": true, + "id": 14, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [{ + "name": "value to text", + "value": 1 + }, { + "name": "range to text", + "value": 2 + }], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [{ + "from": "null", + "text": "N/A", + "to": "null" + }], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [{ + "expr": "sum (container_fs_limit_bytes{id=\"/\",instance=~\"^$instance$\"})", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + }], + "thresholds": "", + "timeFrom": "1m", + "title": "Total", + "type": "singlestat", + "valueFontSize": "50%", + "valueMaps": [{ + "op": "=", + "text": "N/A", + "value": "null" + }], + "valueName": "current" + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)", + "thresholdLine": false + }, + "height": "200px", + "id": 32, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum(rate(container_network_receive_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "receive", + "metric": "network", + "refId": "A", + "step": 240 + }, { + "expr": "- sum(rate(container_network_transmit_bytes_total{instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m]))", + "interval": "", + "intervalFactor": 2, + "legendFormat": "transmit", + "metric": "network", + "refId": "B", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Network", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "Bps", + "label": "transmit / receive", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }], + "showTitle": true, + "title": "all pods" + }, { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [{ + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 3, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "height": "", + "id": 17, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum(rate(container_cpu_usage_seconds_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "metric": "container_cpu", + "refId": "A", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Cpu Usage", + "tooltip": { + "msResolution": true, + "shared": false, + "sort": 2, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "none", + "label": "cores", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 33, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum (container_memory_working_set_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "metric": "", + "refId": "A", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Memory Working Set", + "tooltip": { + "msResolution": false, + "shared": false, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "bytes", + "label": "used", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 16, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum (rate (container_network_receive_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }} < in", + "metric": "network", + "refId": "A", + "step": 240 + }, { + "expr": "- sum (rate (container_network_transmit_bytes_total{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}[1m])) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }} > out", + "metric": "network", + "refId": "B", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Network", + "tooltip": { + "msResolution": false, + "shared": false, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "Bps", + "label": "transmit / receive", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }, { + "aliasColors": {}, + "bars": false, + "datasource": "${DS_PROMETHEUS}", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 34, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [{ + "expr": "sum(container_fs_usage_bytes{image!=\"\",name=~\"^k8s_.*\",instance=~\"^$instance$\",namespace=~\"^$namespace$\"}) by (pod_name)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "metric": "network", + "refId": "A", + "step": 240 + }], + "timeFrom": null, + "timeShift": null, + "title": "Filesystem", + "tooltip": { + "msResolution": false, + "shared": false, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "show": true + }, + "yaxes": [{ + "format": "bytes", + "label": "used", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + }] + }], + "showTitle": true, + "title": "each pod" + }], + "time": { + "from": "now-3d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "templating": { + "list": [{ + "allValue": ".*", + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": false, + "name": "instance", + "options": [], + "query": "label_values(instance)", + "refresh": 1, + "regex": "", + "type": "query" + }, { + "current": {}, + "datasource": "${DS_PROMETHEUS}", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": true, + "name": "namespace", + "options": [], + "query": "label_values(namespace)", + "refresh": 1, + "regex": "", + "type": "query" + }] + }, + "annotations": { + "list": [] + }, + "refresh": false, + "schemaVersion": 12, + "version": 8, + "links": [], + "gnetId": 737 + } + prometheus-datasource.json: | + { + "name": "prometheus", + "type": "prometheus", + "url": "http://prometheus:9090", + "access": "proxy", + "basicAuth": false + } +kind: ConfigMap +metadata: + creationTimestamp: null + name: grafana-import-dashboards + namespace: monitoring diff --git a/grafana/grafana-deploy.yml b/grafana/grafana-deploy.yml new file mode 100644 index 00000000..c7993d80 --- /dev/null +++ b/grafana/grafana-deploy.yml @@ -0,0 +1,66 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana-core + namespace: monitoring + labels: + app: grafana + component: core +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + component: core + spec: + containers: + - image: grafana/grafana:4.2.0 + name: grafana-core + imagePullPolicy: IfNotPresent + # env: + resources: + # keep request = limit to keep this container in guaranteed class + limits: + cpu: 100m + memory: 100Mi + requests: + cpu: 100m + memory: 100Mi + env: + # The following env variables set up basic auth twith the default admin user and admin password. + - name: GF_AUTH_BASIC_ENABLED + value: "true" + - name: GF_SECURITY_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana + key: admin-username + - name: GF_SECURITY_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana + key: admin-password + - name: GF_AUTH_ANONYMOUS_ENABLED + value: "false" + # - name: GF_AUTH_ANONYMOUS_ORG_ROLE + # value: Admin + # does not really work, because of template variables in exported dashboards: + # - name: GF_DASHBOARDS_JSON_ENABLED + # value: "true" + readinessProbe: + httpGet: + path: /login + port: 3000 + # initialDelaySeconds: 30 + # timeoutSeconds: 1 + volumeMounts: + - name: grafana-persistent-storage + mountPath: /var/lib/grafana + volumes: + - name: grafana-persistent-storage + emptyDir: {} diff --git a/grafana/grafana-secret.yml b/grafana/grafana-secret.yml new file mode 100644 index 00000000..9023b292 --- /dev/null +++ b/grafana/grafana-secret.yml @@ -0,0 +1,10 @@ +--- +apiVersion: v1 +kind: Secret +data: + admin-password: YWRtaW4= + admin-username: YWRtaW4= +metadata: + name: grafana + namespace: monitoring +type: Opaque diff --git a/grafana/grafana-svc.yml b/grafana/grafana-svc.yml new file mode 100644 index 00000000..ac1e0d7a --- /dev/null +++ b/grafana/grafana-svc.yml @@ -0,0 +1,16 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: monitoring + labels: + app: grafana + component: core +spec: + type: NodePort + ports: + - port: 3000 + selector: + app: grafana + component: core diff --git a/grafana/import-dashboard-job.yml b/grafana/import-dashboard-job.yml new file mode 100644 index 00000000..5bcfdf73 --- /dev/null +++ b/grafana/import-dashboard-job.yml @@ -0,0 +1,80 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: grafana-import-dashboards + namespace: monitoring + labels: + app: grafana + component: import-dashboards +spec: + template: + metadata: + name: grafana-import-dashboards + labels: + app: grafana + component: import-dashboards + spec: + serviceAccountName: prometheus-k8s + initContainers: + - name: wait-for-grafana + image: giantswarm/tiny-tools + args: + - /bin/sh + - -c + - > + set -x; + while [ $(curl -Lsw '%{http_code}' "http://grafana:3000" -o /dev/null) -ne 200 ]; do + echo '.' + sleep 15; + done + containers: + - name: grafana-import-dashboards + image: giantswarm/tiny-tools + command: ["/bin/sh", "-c"] + workingDir: /opt/grafana-import-dashboards + args: + - > + for file in *-datasource.json ; do + if [ -e "$file" ] ; then + echo "importing $file" && + curl --silent --fail --show-error \ + --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/datasources \ + --header "Content-Type: application/json" \ + --data-binary "@$file" ; + echo "" ; + fi + done ; + for file in *-dashboard.json ; do + if [ -e "$file" ] ; then + echo "importing $file" && + ( echo '{"dashboard":'; \ + cat "$file"; \ + echo ',"overwrite":true,"inputs":[{"name":"DS_PROMETHEUS","type":"datasource","pluginId":"prometheus","value":"prometheus"}]}' ) \ + | jq -c '.' \ + | curl --silent --fail --show-error \ + --request POST http://${GF_ADMIN_USER}:${GF_ADMIN_PASSWORD}@grafana:3000/api/dashboards/import \ + --header "Content-Type: application/json" \ + --data-binary "@-" ; + echo "" ; + fi + done + env: + - name: GF_ADMIN_USER + valueFrom: + secretKeyRef: + name: grafana + key: admin-username + - name: GF_ADMIN_PASSWORD + valueFrom: + secretKeyRef: + name: grafana + key: admin-password + volumeMounts: + - name: config-volume + mountPath: /opt/grafana-import-dashboards + restartPolicy: Never + volumes: + - name: config-volume + configMap: + name: grafana-import-dashboards diff --git a/consumers-prometheus/kminion-service.yaml b/kminions/kminion-service.yaml similarity index 100% rename from consumers-prometheus/kminion-service.yaml rename to kminions/kminion-service.yaml diff --git a/consumers-prometheus/kminion.yaml b/kminions/kminion.yaml similarity index 100% rename from consumers-prometheus/kminion.yaml rename to kminions/kminion.yaml diff --git a/consumers-prometheus/kustomization.yaml b/kminions/kustomization.yaml similarity index 100% rename from consumers-prometheus/kustomization.yaml rename to kminions/kustomization.yaml diff --git a/avro-tools/avro-tools-config.yml b/maintenance/avro-tools/avro-tools-config.yml similarity index 100% rename from avro-tools/avro-tools-config.yml rename to maintenance/avro-tools/avro-tools-config.yml diff --git a/avro-tools/rest-service.yml b/maintenance/avro-tools/rest-service.yml similarity index 100% rename from avro-tools/rest-service.yml rename to maintenance/avro-tools/rest-service.yml diff --git a/avro-tools/rest.yml b/maintenance/avro-tools/rest.yml similarity index 100% rename from avro-tools/rest.yml rename to maintenance/avro-tools/rest.yml diff --git a/avro-tools/schemas-service.yml b/maintenance/avro-tools/schemas-service.yml similarity index 100% rename from avro-tools/schemas-service.yml rename to maintenance/avro-tools/schemas-service.yml diff --git a/avro-tools/schemas.yml b/maintenance/avro-tools/schemas.yml similarity index 100% rename from avro-tools/schemas.yml rename to maintenance/avro-tools/schemas.yml diff --git a/avro-tools/test/70rest-test1.yml b/maintenance/avro-tools/test/70rest-test1.yml similarity index 100% rename from avro-tools/test/70rest-test1.yml rename to maintenance/avro-tools/test/70rest-test1.yml diff --git a/avro-tools/test/rest-curl.yml b/maintenance/avro-tools/test/rest-curl.yml similarity index 100% rename from avro-tools/test/rest-curl.yml rename to maintenance/avro-tools/test/rest-curl.yml diff --git a/namespace.yml b/namespace.yml deleted file mode 100644 index a6cf001d..00000000 --- a/namespace.yml +++ /dev/null @@ -1,5 +0,0 @@ ---- -apiVersion: v1 -kind: Namespace -metadata: - name: kafka diff --git a/prometheus/metrics-config.yml b/prometheus/kafka-jmx-metrics-config.yml similarity index 100% rename from prometheus/metrics-config.yml rename to prometheus/kafka-jmx-metrics-config.yml diff --git a/prometheus/node-exporter-daemonset.yml b/prometheus/node-exporter-daemonset.yml deleted file mode 100644 index 41b28ace..00000000 --- a/prometheus/node-exporter-daemonset.yml +++ /dev/null @@ -1,53 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: node-exporter - namespace: kafka - labels: - app: node-exporter -spec: - selector: - matchLabels: - app: node-exporter - template: - metadata: - labels: - app: node-exporter - spec: - hostNetwork: true - hostIPC: true - hostPID: true - containers: - - name: node-exporter - image: prom/node-exporter:v1.0.1 - imagePullPolicy: IfNotPresent - args: - - --path.procfs=/host/proc - - --path.sysfs=/host/sys - resources: - requests: - cpu: 10m - memory: 100Mi - limits: - cpu: 100m - memory: 100Mi - ports: - - name: scrape - containerPort: 9100 - hostPort: 9100 - volumeMounts: - - mountPath: /host/proc - name: proc - readOnly: true - - mountPath: /host/sys - name: sys - readOnly: true - volumes: - - name: proc - hostPath: - path: /proc - type: "" - - name: sys - hostPath: - path: /sys - type: "" diff --git a/prometheus/node-exporter-svc.yml b/prometheus/node-exporter-svc.yml deleted file mode 100644 index bfbc37e4..00000000 --- a/prometheus/node-exporter-svc.yml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - annotations: - prometheus.io/scrape: "true" - name: node-exporter-http - namespace: kafka - labels: - app: node-exporter -spec: - type: NodePort - selector: - app: node-exporter - ports: - - name: scrape - port: 9100 - nodePort: 30002 - protocol: TCP diff --git a/rbac-namespace-default/Kustomization b/rbac-namespace-default/Kustomization index c2155fb0..6cb5aba7 100644 --- a/rbac-namespace-default/Kustomization +++ b/rbac-namespace-default/Kustomization @@ -1,3 +1,5 @@ resources: - node-reader.yml - pod-labler.yml +- prometheus.yml +- namespace.yml diff --git a/rbac-namespace-default/namespace.yml b/rbac-namespace-default/namespace.yml new file mode 100644 index 00000000..58a46786 --- /dev/null +++ b/rbac-namespace-default/namespace.yml @@ -0,0 +1,12 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: kafka + +--- +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring + diff --git a/variants/prometheus-operator-example/k8s-cluster-rbac.yaml b/rbac-namespace-default/prometheus.yml similarity index 54% rename from variants/prometheus-operator-example/k8s-cluster-rbac.yaml rename to rbac-namespace-default/prometheus.yml index 3f57d21d..4748e38d 100644 --- a/variants/prometheus-operator-example/k8s-cluster-rbac.yaml +++ b/rbac-namespace-default/prometheus.yml @@ -1,30 +1,39 @@ -# Allows the "k8s" prometheus from Prometheus Operator contrib to do service discovery in the kafka namespace --- apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole +kind: ClusterRoleBinding metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: +- kind: ServiceAccount name: prometheus-k8s + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus rules: -- apiGroups: - - "" +- apiGroups: [""] resources: + - nodes + - nodes/proxy - services - endpoints - pods - verbs: - - get - - list - - watch + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] --- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding +apiVersion: v1 +kind: ServiceAccount metadata: name: prometheus-k8s -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s namespace: monitoring diff --git a/variants/aks-managed/aks-storageclass-broker-managed.yaml b/variants/aks-managed/aks-storageclass-broker-managed.yaml deleted file mode 100644 index 50ae160b..00000000 --- a/variants/aks-managed/aks-storageclass-broker-managed.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: kafka-broker -provisioner: kubernetes.io/azure-disk -reclaimPolicy: Retain -parameters: - kind: "Managed" - storageaccounttype: Premium_LRS diff --git a/variants/aks-managed/aks-storageclass-zookeeper-managed.yaml b/variants/aks-managed/aks-storageclass-zookeeper-managed.yaml deleted file mode 100644 index 2c4e0ff1..00000000 --- a/variants/aks-managed/aks-storageclass-zookeeper-managed.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: kafka-zookeeper -provisioner: kubernetes.io/azure-disk -reclaimPolicy: Retain -parameters: - kind: "Managed" - storageaccounttype: Premium_LRS diff --git a/variants/aks-managed/kustomization.yaml b/variants/aks-managed/kustomization.yaml deleted file mode 100644 index aca693de..00000000 --- a/variants/aks-managed/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ -bases: -- ../scale-3-5 -resources: -- aks-storageclass-broker-managed.yaml -- aks-storageclass-zookeeper-managed.yaml -patchesStrategicMerge: -- volume-claims.yaml diff --git a/variants/aks-managed/volume-claims.yaml b/variants/aks-managed/volume-claims.yaml deleted file mode 100644 index fb20db89..00000000 --- a/variants/aks-managed/volume-claims.yaml +++ /dev/null @@ -1,32 +0,0 @@ ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: kafka - namespace: kafka -spec: - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: [ "ReadWriteOnce" ] - storageClassName: kafka-broker - resources: - requests: - storage: 10Gi ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo - namespace: kafka -spec: - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: [ "ReadWriteOnce" ] - storageClassName: kafka-zookeeper - resources: - requests: - storage: 1Gi diff --git a/variants/aws-gp2/aws-storageclass-broker-gp2.yaml b/variants/aws-gp2/aws-storageclass-broker-gp2.yaml deleted file mode 100644 index 16289b51..00000000 --- a/variants/aws-gp2/aws-storageclass-broker-gp2.yaml +++ /dev/null @@ -1,11 +0,0 @@ -kind: StorageClass -apiVersion: storage.k8s.io/v1 -metadata: - name: kafka-broker - labels: - k8s-addon: storage-aws.addons.k8s.io -provisioner: kubernetes.io/aws-ebs -reclaimPolicy: Retain -allowVolumeExpansion: true -parameters: - type: gp2 diff --git a/variants/aws-gp2/aws-storageclass-zookeeper-gp2.yaml b/variants/aws-gp2/aws-storageclass-zookeeper-gp2.yaml deleted file mode 100644 index 1e651df1..00000000 --- a/variants/aws-gp2/aws-storageclass-zookeeper-gp2.yaml +++ /dev/null @@ -1,11 +0,0 @@ -kind: StorageClass -apiVersion: storage.k8s.io/v1 -metadata: - name: kafka-zookeeper - labels: - k8s-addon: storage-aws.addons.k8s.io -provisioner: kubernetes.io/aws-ebs -reclaimPolicy: Retain -allowVolumeExpansion: true -parameters: - type: gp2 diff --git a/variants/aws-gp2/kustomization.yaml b/variants/aws-gp2/kustomization.yaml deleted file mode 100644 index ea9f8aac..00000000 --- a/variants/aws-gp2/kustomization.yaml +++ /dev/null @@ -1,7 +0,0 @@ -bases: -- ../scale-3-5 -resources: -- aws-storageclass-broker-gp2.yaml -- aws-storageclass-zookeeper-gp2.yaml -patchesStrategicMerge: -- volume-claims.yaml diff --git a/variants/aws-gp2/volume-claims.yaml b/variants/aws-gp2/volume-claims.yaml deleted file mode 100644 index fb20db89..00000000 --- a/variants/aws-gp2/volume-claims.yaml +++ /dev/null @@ -1,32 +0,0 @@ ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: kafka - namespace: kafka -spec: - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: [ "ReadWriteOnce" ] - storageClassName: kafka-broker - resources: - requests: - storage: 10Gi ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo - namespace: kafka -spec: - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: [ "ReadWriteOnce" ] - storageClassName: kafka-zookeeper - resources: - requests: - storage: 1Gi diff --git a/variants/dev-small/jmx-disable.yaml b/variants/dev-small/jmx-disable.yaml deleted file mode 100644 index fb7dd092..00000000 --- a/variants/dev-small/jmx-disable.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: kafka - namespace: kafka -spec: - template: - spec: - containers: - - name: broker - env: - - name: JMX_PORT - value: "" diff --git a/variants/dev-small/kustomization.yaml b/variants/dev-small/kustomization.yaml deleted file mode 100644 index a22670a3..00000000 --- a/variants/dev-small/kustomization.yaml +++ /dev/null @@ -1,20 +0,0 @@ -# With kubectl -n kafka port-forward kafka-0 9094 -bases: -- ../scale-1 -patchesStrategicMerge: -- jmx-disable.yaml -patchesJson6902: -- target: - group: apps - version: v1 - kind: StatefulSet - name: kafka - namespace: kafka - path: listener-localhost.json -- target: - group: apps - version: v1 - kind: StatefulSet - name: kafka - namespace: kafka - path: num-partitions-1.json diff --git a/variants/dev-small/listener-localhost.json b/variants/dev-small/listener-localhost.json deleted file mode 100644 index 496b1664..00000000 --- a/variants/dev-small/listener-localhost.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - {"op": "add", "path": "/spec/template/spec/containers/0/args/1", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/2", "value": "advertised.listeners=PLAINTEXT://:9092,OUTSIDE://localhost:9094"} -] diff --git a/variants/dev-small/num-partitions-1.json b/variants/dev-small/num-partitions-1.json deleted file mode 100644 index b8211f7f..00000000 --- a/variants/dev-small/num-partitions-1.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "num.partitions=1"} -] diff --git a/variants/gke-regional/gke-storageclass-broker-pd.yaml b/variants/gke-regional/gke-storageclass-broker-pd.yaml deleted file mode 100644 index 25c7ed5b..00000000 --- a/variants/gke-regional/gke-storageclass-broker-pd.yaml +++ /dev/null @@ -1,9 +0,0 @@ -kind: StorageClass -apiVersion: storage.k8s.io/v1 -metadata: - name: kafka-broker -provisioner: kubernetes.io/gce-pd -reclaimPolicy: Retain -allowVolumeExpansion: true -parameters: - type: pd-standard diff --git a/variants/gke-regional/gke-storageclass-zookeeper-regional-euwest1cd-ssh.yaml b/variants/gke-regional/gke-storageclass-zookeeper-regional-euwest1cd-ssh.yaml deleted file mode 100644 index 22d9602f..00000000 --- a/variants/gke-regional/gke-storageclass-zookeeper-regional-euwest1cd-ssh.yaml +++ /dev/null @@ -1,11 +0,0 @@ -kind: StorageClass -apiVersion: storage.k8s.io/v1 -metadata: - name: kafka-zookeeper-regional -provisioner: kubernetes.io/gce-pd -reclaimPolicy: Retain -allowVolumeExpansion: true -parameters: - type: pd-ssd - replication-type: regional-pd - zones: europe-west1-c, europe-west1-d diff --git a/variants/gke-regional/gke-storageclass-zookeeper-ssd.yaml b/variants/gke-regional/gke-storageclass-zookeeper-ssd.yaml deleted file mode 100644 index 2223d7b2..00000000 --- a/variants/gke-regional/gke-storageclass-zookeeper-ssd.yaml +++ /dev/null @@ -1,9 +0,0 @@ -kind: StorageClass -apiVersion: storage.k8s.io/v1 -metadata: - name: kafka-zookeeper -provisioner: kubernetes.io/gce-pd -reclaimPolicy: Retain -allowVolumeExpansion: true -parameters: - type: pd-ssd diff --git a/variants/gke-regional/kustomization.yaml b/variants/gke-regional/kustomization.yaml deleted file mode 100644 index 44acb293..00000000 --- a/variants/gke-regional/kustomization.yaml +++ /dev/null @@ -1,8 +0,0 @@ -bases: -- ../scale-3-5 -resources: -- gke-storageclass-broker-pd.yaml -- gke-storageclass-zookeeper-ssd.yaml -- gke-storageclass-zookeeper-regional-euwest1cd-ssh.yaml -patchesStrategicMerge: -- volume-claims.yaml diff --git a/variants/gke-regional/volume-claims.yaml b/variants/gke-regional/volume-claims.yaml deleted file mode 100644 index 42357194..00000000 --- a/variants/gke-regional/volume-claims.yaml +++ /dev/null @@ -1,48 +0,0 @@ ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: kafka - namespace: kafka -spec: - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: [ "ReadWriteOnce" ] - storageClassName: kafka-broker - resources: - requests: - storage: 10Gi ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo - namespace: kafka -spec: - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: [ "ReadWriteOnce" ] - storageClassName: kafka-zookeeper - resources: - requests: - storage: 1Gi ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo - namespace: kafka -spec: - volumeClaimTemplates: - - metadata: - name: data - spec: - accessModes: [ "ReadWriteOnce" ] - storageClassName: kafka-zookeeper-regional - resources: - requests: - storage: 1Gi diff --git a/variants/prometheus-operator-example/additional-scrape-configs.yaml b/variants/prometheus-operator-example/additional-scrape-configs.yaml deleted file mode 100644 index 8c0541a4..00000000 --- a/variants/prometheus-operator-example/additional-scrape-configs.yaml +++ /dev/null @@ -1,11 +0,0 @@ - -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus -metadata: - name: k8s - namespace: monitoring -spec: - additionalScrapeConfigs: - name: additional-scrape-configs - # See https://github.com/prometheus/prometheus/pull/4131, and upon disagreement see https://github.com/prometheus/prometheus/issues/4484 - key: pods-discovery-by-prometheus-io-annotations.yaml diff --git a/variants/prometheus-operator-example/alertmanager-main-scale-1.yaml b/variants/prometheus-operator-example/alertmanager-main-scale-1.yaml deleted file mode 100644 index 38da8bdf..00000000 --- a/variants/prometheus-operator-example/alertmanager-main-scale-1.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Alertmanager -metadata: - name: main - namespace: monitoring -spec: - replicas: 1 diff --git a/variants/prometheus-operator-example/k8s-kafka-rbac.yaml b/variants/prometheus-operator-example/k8s-kafka-rbac.yaml deleted file mode 100644 index 311961ce..00000000 --- a/variants/prometheus-operator-example/k8s-kafka-rbac.yaml +++ /dev/null @@ -1,32 +0,0 @@ -# Allows the "k8s" prometheus from Prometheus Operator contrib to do service discovery in the kafka namespace ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: prometheus-k8s - namespace: kafka -rules: -- apiGroups: - - "" - resources: - - services - - endpoints - - pods - verbs: - - get - - list - - watch ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: prometheus-k8s - namespace: kafka -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring diff --git a/variants/prometheus-operator-example/k8s-kafka-servicemonitor.yaml b/variants/prometheus-operator-example/k8s-kafka-servicemonitor.yaml deleted file mode 100644 index 35b5b416..00000000 --- a/variants/prometheus-operator-example/k8s-kafka-servicemonitor.yaml +++ /dev/null @@ -1,38 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: broker-monitoring - namespace: kafka - labels: - app: kafka -spec: - publishNotReadyAddresses: true - ports: - - name: fromjmx - port: 5556 - selector: - app: kafka ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kafka - namespace: monitoring - labels: - k8s-app: kafka -spec: - namespaceSelector: - matchNames: - - kafka - selector: - matchLabels: - app: kafka - endpoints: - # https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#endpoint - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 120s - scrapeTimeout: 119s - port: fromjmx - scheme: http - path: /metrics diff --git a/variants/prometheus-operator-example/k8s-minion-servicemonitor.yaml b/variants/prometheus-operator-example/k8s-minion-servicemonitor.yaml deleted file mode 100644 index 79bbfa22..00000000 --- a/variants/prometheus-operator-example/k8s-minion-servicemonitor.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: kafka-metrics-minion - namespace: monitoring - labels: - k8s-app: kafka-metrics-minion -spec: - namespaceSelector: - matchNames: - - kafka - selector: - matchLabels: - app: kafka-minion - type: openmetrics - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s - scrapeTimeout: 30s - port: http - scheme: http - path: /metrics diff --git a/variants/prometheus-operator-example/kustomization.yaml b/variants/prometheus-operator-example/kustomization.yaml deleted file mode 100644 index 3fe25c81..00000000 --- a/variants/prometheus-operator-example/kustomization.yaml +++ /dev/null @@ -1,30 +0,0 @@ -bases: -# With prometheus-operator only you must add your own Prometheus and Alertmanager resources -#- github.com/coreos/prometheus-operator?ref=de9a6e1 -- github.com/coreos/kube-prometheus?ref=3a64636 -- ../../consumers-prometheus -# The ../../prometheus base must be edited to point to the chosen kafka base -# Actually to apply the sidecar with apply -k it has to be included with the kafka variant; can't be its own kustomization because you'll get -# either "failed to find an object with apps_v1_StatefulSet|kafka to apply the patch" or "id 'apps_v1_StatefulSet|kafka|~P|zoo|~S' already used" -#- ../../prometheus -resources: -- k8s-kafka-rbac.yaml -# or, to scrape all namespaces -#- k8s-cluster-rbac.yaml -# with base ../../prometheus -#- k8s-kafka-servicemonitor.yaml -# with base ../../consumers-prometheus -- k8s-minion-servicemonitor.yaml -patchesStrategicMerge: -- prometheus-k8s-scale-1.yaml -- prometheus-k8s-2.9.2.yaml -- alertmanager-main-scale-1.yaml -- prometheus-k8s-nodeport.yaml -- additional-scrape-configs.yaml -generatorOptions: - disableNameSuffixHash: true -secretGenerator: -- name: additional-scrape-configs - namespace: monitoring - files: - - scrape-configs/pods-discovery-by-prometheus-io-annotations.yaml diff --git a/variants/prometheus-operator-example/prometheus-k8s-2.9.2.yaml b/variants/prometheus-operator-example/prometheus-k8s-2.9.2.yaml deleted file mode 100644 index ad928e9b..00000000 --- a/variants/prometheus-operator-example/prometheus-k8s-2.9.2.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus -metadata: - name: k8s - namespace: monitoring -spec: - baseImage: quay.io/prometheus/prometheus - version: v2.7.2 diff --git a/variants/prometheus-operator-example/prometheus-k8s-nodeport.yaml b/variants/prometheus-operator-example/prometheus-k8s-nodeport.yaml deleted file mode 100644 index 3ef4ab1b..00000000 --- a/variants/prometheus-operator-example/prometheus-k8s-nodeport.yaml +++ /dev/null @@ -1,10 +0,0 @@ -apiVersion: v1 -kind: Service -metadata: - name: prometheus-k8s - namespace: monitoring -spec: - type: NodePort - ports: - - port: 9090 - nodePort: 32490 diff --git a/variants/prometheus-operator-example/prometheus-k8s-scale-1.yaml b/variants/prometheus-operator-example/prometheus-k8s-scale-1.yaml deleted file mode 100644 index 939ddb73..00000000 --- a/variants/prometheus-operator-example/prometheus-k8s-scale-1.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: Prometheus -metadata: - name: k8s - namespace: monitoring -spec: - replicas: 1 diff --git a/variants/prometheus-operator-example/scrape-configs/pods-discovery-by-prometheus-io-annotations.yaml b/variants/prometheus-operator-example/scrape-configs/pods-discovery-by-prometheus-io-annotations.yaml deleted file mode 100644 index 4d812001..00000000 --- a/variants/prometheus-operator-example/scrape-configs/pods-discovery-by-prometheus-io-annotations.yaml +++ /dev/null @@ -1,35 +0,0 @@ -# Example scrape config for pods -# -# The relabeling allows the actual pod scrape endpoint to be configured via the -# following annotations: -# -# * `prometheus.io/scrape`: Only scrape pods that have a value of `true` -# * `prometheus.io/path`: If the metrics path is not `/metrics` override this. -# * `prometheus.io/port`: Scrape the pod on the indicated port instead of the -# pod's declared ports (default is a port-free target if none are declared). -- job_name: 'kubernetes-pods' - - kubernetes_sd_configs: - - role: pod - - relabel_configs: - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] - action: keep - regex: true - - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] - action: replace - target_label: __metrics_path__ - regex: (.+) - - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] - action: replace - regex: ([^:]+)(?::\d+)?;(\d+) - replacement: $1:$2 - target_label: __address__ - - action: labelmap - regex: __meta_kubernetes_pod_label_(.+) - - source_labels: [__meta_kubernetes_namespace] - action: replace - target_label: kubernetes_namespace - - source_labels: [__meta_kubernetes_pod_name] - action: replace - target_label: kubernetes_pod_name diff --git a/variants/scale-1-ephemeral/kafka-scale1-overrides.json b/variants/scale-1-ephemeral/kafka-scale1-overrides.json deleted file mode 100644 index 13187671..00000000 --- a/variants/scale-1-ephemeral/kafka-scale1-overrides.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "default.replication.factor=1"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "min.insync.replicas=1"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "offsets.topic.replication.factor=1"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "offsets.topic.num.partitions=1"} -] \ No newline at end of file diff --git a/variants/scale-1-ephemeral/kafka.yaml b/variants/scale-1-ephemeral/kafka.yaml deleted file mode 100644 index 436fa722..00000000 --- a/variants/scale-1-ephemeral/kafka.yaml +++ /dev/null @@ -1,12 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: kafka - namespace: kafka -spec: - template: - spec: - volumes: - - name: data - emptyDir: {} - volumeClaimTemplates: [] diff --git a/variants/scale-1-ephemeral/kustomization.yaml b/variants/scale-1-ephemeral/kustomization.yaml deleted file mode 100644 index 0cef8d2c..00000000 --- a/variants/scale-1-ephemeral/kustomization.yaml +++ /dev/null @@ -1,5 +0,0 @@ -bases: -- ../scale-1 -patchesStrategicMerge: -- kafka.yaml -- zookeeper.yaml diff --git a/variants/scale-1-ephemeral/zookeeper.yaml b/variants/scale-1-ephemeral/zookeeper.yaml deleted file mode 100644 index c2810703..00000000 --- a/variants/scale-1-ephemeral/zookeeper.yaml +++ /dev/null @@ -1,32 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: pzoo - namespace: kafka ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo - namespace: kafka -spec: - template: - spec: - volumes: - - name: data - emptyDir: {} - volumeClaimTemplates: [] ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo - namespace: kafka -spec: - template: - spec: - volumes: - - name: data - emptyDir: {} - volumeClaimTemplates: [] diff --git a/variants/scale-1/kafka-scale1-overrides.json b/variants/scale-1/kafka-scale1-overrides.json deleted file mode 100644 index b4dcceca..00000000 --- a/variants/scale-1/kafka-scale1-overrides.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "zookeeper.connect=zoo-0.zoo.$(POD_NAMESPACE).svc.cluster.local:2181" }, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "default.replication.factor=1"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "min.insync.replicas=1"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "offsets.topic.replication.factor=1"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "offsets.topic.num.partitions=1"} -] \ No newline at end of file diff --git a/variants/scale-1/kafka.yaml b/variants/scale-1/kafka.yaml deleted file mode 100644 index 18d01c80..00000000 --- a/variants/scale-1/kafka.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: kafka - namespace: kafka -spec: - replicas: 1 diff --git a/variants/scale-1/kustomization.yaml b/variants/scale-1/kustomization.yaml deleted file mode 100644 index ab741e0b..00000000 --- a/variants/scale-1/kustomization.yaml +++ /dev/null @@ -1,13 +0,0 @@ -bases: -- ../../native -patchesStrategicMerge: -- kafka.yaml -- zookeeper.yaml -patchesJson6902: -- target: - group: apps - version: v1 - kind: StatefulSet - name: kafka - namespace: kafka - path: kafka-scale1-overrides.json diff --git a/variants/scale-1/zookeeper.yaml b/variants/scale-1/zookeeper.yaml deleted file mode 100644 index ae027b0b..00000000 --- a/variants/scale-1/zookeeper.yaml +++ /dev/null @@ -1,38 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - name: pzoo - namespace: kafka ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo - namespace: kafka -spec: - replicas: 0 ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo - namespace: kafka -spec: - replicas: 1 - template: - spec: - initContainers: - - name: init-config - env: - - name: PZOO_REPLICAS - value: '0' - - name: REPLICAS - value: '1' - - name: ID_OFFSET - value: '1' - containers: - - name: zookeeper - env: - - name: REPLICAS - value: '1' diff --git a/variants/scale-2/kafka-scale2-overrides.json b/variants/scale-2/kafka-scale2-overrides.json deleted file mode 100644 index a18753cb..00000000 --- a/variants/scale-2/kafka-scale2-overrides.json +++ /dev/null @@ -1,10 +0,0 @@ -[ - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "default.replication.factor=2"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "min.insync.replicas=2"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "offsets.topic.replication.factor=2"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "offsets.topic.num.partitions=2"} -] diff --git a/variants/scale-2/kafka.yaml b/variants/scale-2/kafka.yaml deleted file mode 100644 index c80e25a9..00000000 --- a/variants/scale-2/kafka.yaml +++ /dev/null @@ -1,7 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: kafka - namespace: kafka -spec: - replicas: 2 diff --git a/variants/scale-2/kustomization.yaml b/variants/scale-2/kustomization.yaml deleted file mode 100644 index 89e86df5..00000000 --- a/variants/scale-2/kustomization.yaml +++ /dev/null @@ -1,13 +0,0 @@ -bases: -- ../../nonroot -patchesStrategicMerge: -- kafka.yaml -- zookeeper.yaml -patchesJson6902: -- target: - group: apps - version: v1 - kind: StatefulSet - name: kafka - namespace: kafka - path: kafka-scale2-overrides.json diff --git a/variants/scale-2/zookeeper.yaml b/variants/scale-2/zookeeper.yaml deleted file mode 100644 index 96c8b189..00000000 --- a/variants/scale-2/zookeeper.yaml +++ /dev/null @@ -1,33 +0,0 @@ ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo - namespace: kafka -spec: - replicas: 0 ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo - namespace: kafka -spec: - replicas: 2 - template: - spec: - initContainers: - - name: init-config - env: - # There's no validation on these numbers adding up to a coherent zk config, so watch out - - name: PZOO_REPLICAS - value: '0' - - name: REPLICAS - value: '2' - - name: ID_OFFSET - value: '1' - containers: - - name: zookeeper - env: - - name: REPLICAS - value: '2' diff --git a/variants/scale-3-3/kafka-zookeeper-connect-only-zoo.json b/variants/scale-3-3/kafka-zookeeper-connect-only-zoo.json deleted file mode 100644 index 605bf743..00000000 --- a/variants/scale-3-3/kafka-zookeeper-connect-only-zoo.json +++ /dev/null @@ -1,6 +0,0 @@ -[ - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": - "zookeeper.connect=zoo-0.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-1.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-2.zoo.$(POD_NAMESPACE).svc.cluster.local:2181" - } -] \ No newline at end of file diff --git a/variants/scale-3-3/kustomization.yaml b/variants/scale-3-3/kustomization.yaml deleted file mode 100644 index 11b82724..00000000 --- a/variants/scale-3-3/kustomization.yaml +++ /dev/null @@ -1,12 +0,0 @@ -bases: -- ../scale-3-5 -patchesStrategicMerge: -- ./only-zoo-3.yaml -patchesJson6902: -- target: - group: apps - version: v1 - kind: StatefulSet - name: kafka - namespace: kafka - path: kafka-zookeeper-connect-only-zoo.json diff --git a/variants/scale-3-3/only-zoo-3.yaml b/variants/scale-3-3/only-zoo-3.yaml deleted file mode 100644 index bced2320..00000000 --- a/variants/scale-3-3/only-zoo-3.yaml +++ /dev/null @@ -1,33 +0,0 @@ ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo - namespace: kafka -spec: - replicas: 0 ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo - namespace: kafka -spec: - replicas: 3 - template: - spec: - initContainers: - - name: init-config - env: - # There's no validation on these numbers adding up to a coherent zk config, so watch out - - name: PZOO_REPLICAS - value: '0' - - name: REPLICAS - value: '3' - - name: ID_OFFSET - value: '1' - containers: - - name: zookeeper - env: - - name: REPLICAS - value: '3' diff --git a/variants/scale-3-5-nopzoo/kafka-zookeeper-connect-only-zoo.json b/variants/scale-3-5-nopzoo/kafka-zookeeper-connect-only-zoo.json deleted file mode 100644 index 37c9f1fc..00000000 --- a/variants/scale-3-5-nopzoo/kafka-zookeeper-connect-only-zoo.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "zookeeper.connect=zoo:2181"} -] \ No newline at end of file diff --git a/variants/scale-3-5-nopzoo/kustomization.yaml b/variants/scale-3-5-nopzoo/kustomization.yaml deleted file mode 100644 index 5c7ec29d..00000000 --- a/variants/scale-3-5-nopzoo/kustomization.yaml +++ /dev/null @@ -1,12 +0,0 @@ -bases: -- ../scale-3-5 -patchesStrategicMerge: -- ./only-zoo-5.yaml -patchesJson6902: -- target: - group: apps - version: v1 - kind: StatefulSet - name: kafka - namespace: kafka - path: kafka-zookeeper-connect-only-zoo.json diff --git a/variants/scale-3-5-nopzoo/only-zoo-5.yaml b/variants/scale-3-5-nopzoo/only-zoo-5.yaml deleted file mode 100644 index ed59e8de..00000000 --- a/variants/scale-3-5-nopzoo/only-zoo-5.yaml +++ /dev/null @@ -1,33 +0,0 @@ ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo - namespace: kafka -spec: - replicas: 0 ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo - namespace: kafka -spec: - replicas: 5 - template: - spec: - initContainers: - - name: init-config - env: - # There's no validation on these numbers adding up to a coherent zk config, so watch out - - name: PZOO_REPLICAS - value: '0' - - name: REPLICAS - value: '5' - - name: ID_OFFSET - value: '1' - containers: - - name: zookeeper - env: - - name: REPLICAS - value: '5' diff --git a/variants/scale-3-5/kustomization.yaml b/variants/scale-3-5/kustomization.yaml deleted file mode 100644 index 80c534f7..00000000 --- a/variants/scale-3-5/kustomization.yaml +++ /dev/null @@ -1,2 +0,0 @@ -bases: -- ../../nonroot diff --git a/variants/scale-6-9/kafka-6.yaml b/variants/scale-6-9/kafka-6.yaml deleted file mode 100644 index a7e319a7..00000000 --- a/variants/scale-6-9/kafka-6.yaml +++ /dev/null @@ -1,6 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: kafka -spec: - replicas: 6 diff --git a/variants/scale-6-9/kafka-zookeeper-connect-only-zoo.json b/variants/scale-6-9/kafka-zookeeper-connect-only-zoo.json deleted file mode 100644 index 06d3bb1a..00000000 --- a/variants/scale-6-9/kafka-zookeeper-connect-only-zoo.json +++ /dev/null @@ -1,6 +0,0 @@ -[ - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--override"}, - {"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": - "zookeeper.connect=zoo-0.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-1.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-2.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-3.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-4.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-5.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-6.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-7.zoo.$(POD_NAMESPACE).svc.cluster.local:2181,zoo-8.zoo.$(POD_NAMESPACE).svc.cluster.local:2181" - } -] \ No newline at end of file diff --git a/variants/scale-6-9/kustomization.yaml b/variants/scale-6-9/kustomization.yaml deleted file mode 100644 index bd1afdd4..00000000 --- a/variants/scale-6-9/kustomization.yaml +++ /dev/null @@ -1,26 +0,0 @@ -bases: -- ../../native -patchesStrategicMerge: -- zoo-9.yaml -- kafka-6.yaml -patchesJson6902: -- target: - group: apps - version: v1 - kind: StatefulSet - name: kafka - path: kafka-zookeeper-connect-only-zoo.json -# The nonroot image is distroless and doesn't support a shell that the prestop hook needs -- target: - group: apps - version: v1 - kind: StatefulSet - name: zoo - path: lifecycle-remove.json -# The nonroot image is distroless and has neither shell nor the nc command -- target: - group: apps - version: v1 - kind: StatefulSet - name: zoo - path: zoo-readiness-without-shell.yaml diff --git a/variants/scale-6-9/lifecycle-remove.json b/variants/scale-6-9/lifecycle-remove.json deleted file mode 100644 index d6ed5370..00000000 --- a/variants/scale-6-9/lifecycle-remove.json +++ /dev/null @@ -1,4 +0,0 @@ -[ - {"op": "remove", "path": "/spec/template/spec/containers/0/lifecycle"} -] - \ No newline at end of file diff --git a/variants/scale-6-9/zoo-9.yaml b/variants/scale-6-9/zoo-9.yaml deleted file mode 100644 index e2babff0..00000000 --- a/variants/scale-6-9/zoo-9.yaml +++ /dev/null @@ -1,33 +0,0 @@ ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: pzoo - namespace: kafka -spec: - replicas: 0 ---- -apiVersion: apps/v1 -kind: StatefulSet -metadata: - name: zoo - namespace: kafka -spec: - replicas: 9 - template: - spec: - initContainers: - - name: init-config - env: - # There's no validation on these numbers adding up to a coherent zk config, so watch out - - name: PZOO_REPLICAS - value: '0' - - name: REPLICAS - value: '9' - - name: ID_OFFSET - value: '1' - containers: - - name: zookeeper - env: - - name: REPLICAS - value: '9' diff --git a/variants/scale-6-9/zoo-readiness-without-shell.yaml b/variants/scale-6-9/zoo-readiness-without-shell.yaml deleted file mode 100644 index 72fe2284..00000000 --- a/variants/scale-6-9/zoo-readiness-without-shell.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- path: /spec/template/spec/containers/0/readinessProbe - # op: replace - # value: - # tcpSocket: - # port: 2181 - op: remove From 8f89b384e256c29127d6001ae3bfce365d5199ac Mon Sep 17 00:00:00 2001 From: joyfulbean Date: Mon, 25 Jul 2022 04:58:03 +0000 Subject: [PATCH 12/12] separate prometheus and exporter --- prometheus/kube-state-deployment.yml | 24 ++++ prometheus/kube-state-svc.yml | 23 ++++ .../{kustomization.yaml => kustomization.yml} | 0 prometheus/node-dir-exporter.yml | 77 +++++++++++ prometheus/node-exporter-svc.yml | 21 +++ prometheus/node-exporter.yml | 30 ++++ prometheus/prometheus-config.yml | 128 ++++++++++++++++++ prometheus/prometheus-deployment.yml | 52 +++++++ prometheus/prometheus-svc.yml | 21 +++ prometheus/rules.yml | 71 ++++++++++ 10 files changed, 447 insertions(+) create mode 100644 prometheus/kube-state-deployment.yml create mode 100644 prometheus/kube-state-svc.yml rename prometheus/{kustomization.yaml => kustomization.yml} (100%) create mode 100644 prometheus/node-dir-exporter.yml create mode 100644 prometheus/node-exporter-svc.yml create mode 100644 prometheus/node-exporter.yml create mode 100644 prometheus/prometheus-config.yml create mode 100644 prometheus/prometheus-deployment.yml create mode 100644 prometheus/prometheus-svc.yml create mode 100644 prometheus/rules.yml diff --git a/prometheus/kube-state-deployment.yml b/prometheus/kube-state-deployment.yml new file mode 100644 index 00000000..09df9c74 --- /dev/null +++ b/prometheus/kube-state-deployment.yml @@ -0,0 +1,24 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring + labels: + app: kube-state-metrics +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + containers: + - name: kube-state-metrics + image: gcr.io/google_containers/kube-state-metrics:v0.5.0 + ports: + - containerPort: 8080 diff --git a/prometheus/kube-state-svc.yml b/prometheus/kube-state-svc.yml new file mode 100644 index 00000000..40559e07 --- /dev/null +++ b/prometheus/kube-state-svc.yml @@ -0,0 +1,23 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: monitoring +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: 'true' + name: kube-state-metrics + namespace: monitoring + labels: + app: kube-state-metrics +spec: + ports: + - name: kube-state-metrics + port: 8080 + protocol: TCP + selector: + app: kube-state-metrics diff --git a/prometheus/kustomization.yaml b/prometheus/kustomization.yml similarity index 100% rename from prometheus/kustomization.yaml rename to prometheus/kustomization.yml diff --git a/prometheus/node-dir-exporter.yml b/prometheus/node-dir-exporter.yml new file mode 100644 index 00000000..e8240ecd --- /dev/null +++ b/prometheus/node-dir-exporter.yml @@ -0,0 +1,77 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-directory-size-metrics + namespace: monitoring + labels: + app: node-directory-size-metrics + annotations: + description: | + This `DaemonSet` provides metrics in Prometheus format about disk usage on the nodes. + The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now. + The other container `caddy` just hands out the contents of that file on request via `http` on `/metrics` at port `9102` which are the defaults for Prometheus. + These are scheduled on every node in the Kubernetes cluster. + To choose directories from the node to check, just mount them on the `read-du` container below `/mnt`. +spec: + selector: + matchLabels: + app: node-directory-size-metrics + template: + metadata: + labels: + app: node-directory-size-metrics + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '9102' + description: | + This `Pod` provides metrics in Prometheus format about disk usage on the node. + The container `read-du` reads in sizes of all directories below /mnt and writes that to `/tmp/metrics`. It only reports directories larger then `100M` for now. + The other container `caddy` just hands out the contents of that file on request on `/metrics` at port `9102` which are the defaults for Prometheus. + This `Pod` is scheduled on every node in the Kubernetes cluster. + To choose directories from the node to check just mount them on `read-du` below `/mnt`. + spec: + containers: + - name: read-du + image: giantswarm/tiny-tools + imagePullPolicy: Always + # FIXME threshold via env var + # The + command: + - fish + - --command + - | + touch /tmp/metrics-temp + while true + for directory in (du --bytes --separate-dirs --threshold=100M /mnt) + echo $directory | read size path + echo "node_directory_size_bytes{path=\"$path\"} $size" \ + >> /tmp/metrics-temp + end + mv /tmp/metrics-temp /tmp/metrics + sleep 300 + end + volumeMounts: + - name: host-fs-var + mountPath: /mnt/var + readOnly: true + - name: metrics + mountPath: /tmp + - name: caddy + image: dockermuenster/caddy:0.9.3 + command: + - "caddy" + - "-port=9102" + - "-root=/var/www" + ports: + - containerPort: 9102 + volumeMounts: + - name: metrics + mountPath: /var/www + volumes: + - name: host-fs-var + hostPath: + path: /var + - name: metrics + emptyDir: + medium: Memory diff --git a/prometheus/node-exporter-svc.yml b/prometheus/node-exporter-svc.yml new file mode 100644 index 00000000..dd4b5243 --- /dev/null +++ b/prometheus/node-exporter-svc.yml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: Service +metadata: + annotations: + prometheus.io/scrape: 'true' + name: prometheus-node-exporter + namespace: monitoring + labels: + app: prometheus + component: node-exporter +spec: + clusterIP: None + ports: + - name: prometheus-node-exporter + port: 9100 + protocol: TCP + selector: + app: prometheus + component: node-exporter + type: ClusterIP diff --git a/prometheus/node-exporter.yml b/prometheus/node-exporter.yml new file mode 100644 index 00000000..b9d30584 --- /dev/null +++ b/prometheus/node-exporter.yml @@ -0,0 +1,30 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: prometheus-node-exporter + namespace: monitoring + labels: + app: prometheus + component: node-exporter +spec: + selector: + matchLabels: + app: prometheus + template: + metadata: + name: prometheus-node-exporter + labels: + app: prometheus + component: node-exporter + spec: + containers: + - image: prom/node-exporter:v0.14.0 + name: prometheus-node-exporter + ports: + - name: prom-node-exp + #^ must be an IANA_SVC_NAME (at most 15 characters, ..) + containerPort: 9100 + hostPort: 9100 + hostNetwork: true + hostPID: true diff --git a/prometheus/prometheus-config.yml b/prometheus/prometheus-config.yml new file mode 100644 index 00000000..958a2098 --- /dev/null +++ b/prometheus/prometheus-config.yml @@ -0,0 +1,128 @@ +--- +apiVersion: v1 +data: + prometheus.yaml: | + global: + scrape_interval: 10s + scrape_timeout: 10s + evaluation_interval: 10s + rule_files: + - "/etc/prometheus-rules/*.rules" + scrape_configs: + - job_name: 'jmx-kafka' + static_configs: + - targets: ['54.241.141.202:32401'] + # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L37 + - job_name: 'kubernetes-nodes' + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - source_labels: [__address__] + regex: '(.*):10250' + replacement: '${1}:10255' + target_label: __address__ + # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L79 + - job_name: 'kubernetes-endpoints' + kubernetes_sd_configs: + - role: endpoints + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: (.+)(?::\d+);(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: kubernetes_name + # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L119 + - job_name: 'kubernetes-services' + metrics_path: /probe + params: + module: [http_2xx] + kubernetes_sd_configs: + - role: service + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_service_name] + target_label: kubernetes_name + # https://github.com/prometheus/prometheus/blob/master/documentation/examples/prometheus-kubernetes.yml#L156 + - job_name: 'kubernetes-pods' + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: (.+):(?:\d+);(\d+) + replacement: ${1}:${2} + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + - source_labels: [__meta_kubernetes_pod_container_port_number] + action: keep + regex: 9\d{3} + - job_name: 'kubernetes-cadvisor' + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + kubernetes_sd_configs: + - role: node + relabel_configs: + - action: labelmap + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - target_label: __address__ + replacement: kubernetes.default.svc:443 + - source_labels: [__meta_kubernetes_node_name] + regex: (.+) + target_label: __metrics_path__ + replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor +kind: ConfigMap +metadata: + creationTimestamp: null + name: prometheus-core + namespace: monitoring diff --git a/prometheus/prometheus-deployment.yml b/prometheus/prometheus-deployment.yml new file mode 100644 index 00000000..e91bf454 --- /dev/null +++ b/prometheus/prometheus-deployment.yml @@ -0,0 +1,52 @@ +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-core + namespace: monitoring + labels: + app: prometheus + component: core +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus + template: + metadata: + name: prometheus-main + labels: + app: prometheus + component: core + spec: + serviceAccountName: prometheus-k8s + containers: + - name: prometheus + image: prom/prometheus:v1.7.0 + args: + - '-storage.local.retention=12h' + - '-storage.local.memory-chunks=500000' + - '-config.file=/etc/prometheus/prometheus.yaml' + - '-alertmanager.url=http://alertmanager:9093/' + ports: + - name: webui + containerPort: 9090 + resources: + requests: + cpu: 100m + memory: 500M + limits: + cpu: 100m + memory: 500M + volumeMounts: + - name: config-volume + mountPath: /etc/prometheus + - name: rules-volume + mountPath: /etc/prometheus-rules + volumes: + - name: config-volume + configMap: + name: prometheus-core + - name: rules-volume + configMap: + name: prometheus-rules diff --git a/prometheus/prometheus-svc.yml b/prometheus/prometheus-svc.yml new file mode 100644 index 00000000..c8f9d09c --- /dev/null +++ b/prometheus/prometheus-svc.yml @@ -0,0 +1,21 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: monitoring + labels: + app: prometheus + component: core + annotations: + prometheus.io/scrape: 'true' +spec: + type: NodePort + ports: + - port: 9090 + protocol: TCP + nodeport: 32334 + name: webui + selector: + app: prometheus + component: core diff --git a/prometheus/rules.yml b/prometheus/rules.yml new file mode 100644 index 00000000..7e336caf --- /dev/null +++ b/prometheus/rules.yml @@ -0,0 +1,71 @@ +--- +apiVersion: v1 +data: + cpu-usage.rules: | + ALERT NodeCPUUsage + IF (100 - (avg by (instance) (irate(node_cpu{name="node-exporter",mode="idle"}[5m])) * 100)) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: High CPU usage detected", + DESCRIPTION = "{{$labels.instance}}: CPU usage is above 75% (current value is: {{ $value }})" + } + instance-availability.rules: | + ALERT InstanceDown + IF up == 0 + FOR 1m + LABELS { severity = "page" } + ANNOTATIONS { + summary = "Instance {{ $labels.instance }} down", + description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.", + } + low-disk-space.rules: | + ALERT NodeLowRootDisk + IF ((node_filesystem_size{mountpoint="/root-disk"} - node_filesystem_free{mountpoint="/root-disk"} ) / node_filesystem_size{mountpoint="/root-disk"} * 100) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: Low root disk space", + DESCRIPTION = "{{$labels.instance}}: Root disk usage is above 75% (current value is: {{ $value }})" + } + ALERT NodeLowDataDisk + IF ((node_filesystem_size{mountpoint="/data-disk"} - node_filesystem_free{mountpoint="/data-disk"} ) / node_filesystem_size{mountpoint="/data-disk"} * 100) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: Low data disk space", + DESCRIPTION = "{{$labels.instance}}: Data disk usage is above 75% (current value is: {{ $value }})" + } + mem-usage.rules: | + ALERT NodeSwapUsage + IF (((node_memory_SwapTotal-node_memory_SwapFree)/node_memory_SwapTotal)*100) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: Swap usage detected", + DESCRIPTION = "{{$labels.instance}}: Swap usage usage is above 75% (current value is: {{ $value }})" + } + ALERT NodeMemoryUsage + IF (((node_memory_MemTotal-node_memory_MemAvailable)/(node_memory_MemTotal)*100)) > 75 + FOR 2m + LABELS { + severity="page" + } + ANNOTATIONS { + SUMMARY = "{{$labels.instance}}: High memory usage detected", + DESCRIPTION = "{{$labels.instance}}: Memory usage is above 75% (current value is: {{ $value }})" + } + +kind: ConfigMap +metadata: + creationTimestamp: null + name: prometheus-rules + namespace: monitoring