diff --git a/CHANGELOG.md b/CHANGELOG.md index 03e0f87..4b60c57 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## master * [ENHANCEMENT] Add bigger tenants and configure default compactor tenant shards +* [ENHANCEMENT] Add alert `CortexCompactorWriteVisitMarkerIsFailing` to monitor compactors ## 1.17.1 / 2024-10-23 * [CHANGE] Use cortex v1.17.1 diff --git a/cortex-mixin/alerts/compactor.libsonnet b/cortex-mixin/alerts/compactor.libsonnet index 73e50a4..7552e30 100644 --- a/cortex-mixin/alerts/compactor.libsonnet +++ b/cortex-mixin/alerts/compactor.libsonnet @@ -102,6 +102,22 @@ ||| % $._config, }, }, + { + // Alert if compactors are not able to update the visit marker. + alert: 'CortexCompactorWriteVisitMarkerIsFailing', + 'for': '2h', + expr: ||| + sum(increase(cortex_compactor_block_visit_marker_write_failed{job=~".+/%(compactor)s"}[2h]))>0 + ||| % $._config.job_names, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex compactors are not able to update the visit marker, double check logs to see what is happening + |||, + }, + }, ], }, ], diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 3958687..fa30a9d 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -379,6 +379,17 @@ How to **investigate**: - Ensure ingesters are successfully shipping blocks to the storage - Look for any error in the compactor logs +### CortexCompactorWriteVisitMarkerIsFailing + +Only applies to compactors when using shuffle sharding. +This alert fires if the compactor is not able to update the visit marker across all tenants. +The marker file is a very small json file that should never have any problems getting updated. 
+ +How to **investigate**: +- Check the compactor logs; they should show the exact reason for the failure +- If you see `context canceled` or other timeout errors in the logs, +consider increasing `-compactor.compaction-visit-marker-timeout` and `-compactor.compaction-visit-marker-file-update-interval`. + ### CortexCompactorHasNotSuccessfullyRunCompaction This alert fires if the compactor is not able to successfully compact all discovered compactable blocks (across all tenants).