From c09fed355cd552e2c98e77eea501aadd6faeb7e7 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 17 Aug 2021 16:47:12 +0000 Subject: [PATCH 1/2] Cortex kv panels: narrow to one instance Distributor uses multiple kv stores - for global limits and ha-tracker, as well as reading from the ingester ring - so we need to narrow the panel to just the one it says it is showing. For consistency, do the same on the ingester panel, although currently ingesters only have one kv store. Note that the renamed recording rule will mean that dashboards show no data for latency prior to the change. --- cortex-mixin/dashboards/writes.libsonnet | 10 ++++++---- cortex-mixin/recording_rules.libsonnet | 2 +- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index e99faee4..999603be 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -104,11 +104,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Key-value store for high-availability (HA) deduplication') .addPanel( $.panel('Requests / sec') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s,kv_name="distributor-hatracker"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.eq('kv_name', 'distributor-hatracker')]) + ) ) ) .addRow( @@ -133,11 +134,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('Key-value store for the ingesters ring') .addPanel( $.panel('Requests / sec') + - $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s,kv_name="ingester-lifecycler"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)+ [utils.selector.eq('kv_name', 'ingester-lifecycler')]) + ) ) ) .addRowIf( diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 433fa8e6..041a099b 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -42,7 +42,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.histogramRules('cortex_chunk_store_chunks_per_query', ['cluster', 'job']) + utils.histogramRules('cortex_database_request_duration_seconds', ['cluster', 'job', 'method']) + utils.histogramRules('cortex_gcs_request_duration_seconds', ['cluster', 'job', 'operation']) + - utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job']), + utils.histogramRules('cortex_kv_request_duration_seconds', ['cluster', 'job', 'kv_name']), }, { name: 'cortex_queries', From 76b89ff7d05edd7f2b9c4c6edd793d37e0a0f929 Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 17 Aug 2021 16:59:05 +0000 Subject: [PATCH 2/2] Cortex writes dashboard: add panels for KV activity For HA-tracker, show which tenants are changing election. For ingester, show how many are active, leaving, etc. --- CHANGELOG.md | 1 + cortex-mixin/dashboards/writes.libsonnet | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a61ebcf7..e3bca8af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ * [CHANGE] Increased `CortexIngesterReachingSeriesLimit` critical alert threshold from 80% to 85%. #363 * [CHANGE] Decreased `-server.grpc-max-concurrent-streams` from 100k to 10k. #369 * [CHANGE] Decreased blocks storage ingesters graceful termination period from 80m to 20m. #369 +* [ENHANCEMENT] Writes dashboard: fix HA-tracker KV panels; add elections panel and ingester state panel. #371 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317 * [ENHANCEMENT] Cortex-mixin: Include `cortex-gw-internal` naming variation in default `gateway` job names. #328 * [ENHANCEMENT] Ruler dashboard: added object storage metrics. #354 diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index 999603be..bf383ab2 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -110,6 +110,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Latency') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.eq('kv_name', 'distributor-hatracker')]) ) + .addPanel( + $.panel('Elected replica changes / min') + + $.queryPanel([ + 'max by(exported_cluster, user)(increase(cortex_ha_tracker_elected_replica_changes_total{%s}[1m])) >0' % $.jobMatcher($._config.job_names.distributor), + ], [ + '{{user}}/{{exported_cluster}}', + ]) + + $.stack + { + yaxes: $.yaxes('cpm'), + }, ) ) .addRow( @@ -138,8 +148,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)+ [utils.selector.eq('kv_name', 'ingester-lifecycler')]) + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('kv_name', 'ingester-lifecycler')]) ) + .addPanel( + $.panel('Ingester status') + + $.queryPanel([ + 'max by (state)(cortex_ring_members{%s}) >0' % $.jobMatcher($._config.job_names.distributor), + ], [ + '{{state}}', + ]) ) ) .addRowIf(