
Commit 2cb22f8

Add integration label to notification metrics (#3056)
* Add integration label to notification metrics

  Also, add a metric for when creation of rules manager would fail.

  Signed-off-by: Goutham Veeramachaneni <[email protected]>

* Address feedback

  Signed-off-by: Goutham Veeramachaneni <[email protected]>
1 parent: 2c8c1cb · commit: 2cb22f8

File tree

7 files changed: +91 −31 lines

CHANGELOG.md
pkg/alertmanager/alertmanager_metrics.go
pkg/alertmanager/alertmanager_metrics_test.go
pkg/alertmanager/multitenant.go
pkg/alertmanager/multitenant_test.go
pkg/ruler/manager.go
pkg/ruler/ruler.go

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -12,12 +12,15 @@
 * [CHANGE] Experimental Delete Series: `/api/v1/admin/tsdb/delete_series` and `/api/v1/admin/tsdb/cancel_delete_request` purger APIs to return status code `204` instead of `200` for success. #2946
 * [CHANGE] Histogram `cortex_memcache_request_duration_seconds` `method` label value changes from `Memcached.Get` to `Memcached.GetBatched` for batched lookups, and is not reported for non-batched lookups (label value `Memcached.GetMulti` remains, and had exactly the same value as `Get` in nonbatched lookups). The same change applies to tracing spans. #3046
 * [CHANGE] TLS server validation is now enabled by default, a new parameter `tls_insecure_skip_verify` can be set to true to skip validation optionally. #3030
+* [CHANGE] `cortex_ruler_config_update_failures_total` has been removed in favor of `cortex_ruler_config_last_reload_successful`. #3056
 * [ENHANCEMENT] Add support for azure storage in China, German and US Government environments. #2988
 * [ENHANCEMENT] Query-tee: added a small tolerance to floating point sample values comparison. #2994
 * [ENHANCEMENT] Query-tee: add support for doing a passthrough of requests to preferred backend for unregistered routes #3018
 * [ENHANCEMENT] Expose `storage.aws.dynamodb.backoff_config` configuration file field. #3026
 * [ENHANCEMENT] Added `cortex_request_message_bytes` and `cortex_response_message_bytes` histograms to track received and sent gRPC message and HTTP request/response sizes. Added `cortex_inflight_requests` gauge to track number of inflight gRPC and HTTP requests. #3064
 * [ENHANCEMENT] Add config validation to the experimental Alertmanager API. Invalid configs are no longer accepted. #3053
+* [ENHANCEMENT] Add "integration" as a label for `cortex_alertmanager_notifications_total` and `cortex_alertmanager_notifications_failed_total` metrics. #3056
+* [ENHANCEMENT] Add `cortex_ruler_config_last_reload_successful` and `cortex_ruler_config_last_reload_successful_seconds` to check status of users rule manager. #3056
 * [BUGFIX] Query-frontend: Fixed rounding for incoming query timestamps, to be 100% Prometheus compatible. #2990
 * [BUGFIX] Querier: Merge results from chunks and blocks ingesters when using streaming of results. #3013
 * [BUGFIX] Querier: query /series from ingesters regardless the `-querier.query-ingesters-within` setting. #3035
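A note on the query side (illustrative, not part of this change): because the new `integration` label subdivides the old per-user series, existing dashboards can be kept intact by aggregating the label away, e.g. `sum by (user) (rate(cortex_alertmanager_notifications_total[5m]))`, while a per-integration failure ratio becomes a one-liner such as `sum by (integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) / sum by (integration) (rate(cortex_alertmanager_notifications_total[5m]))`.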

pkg/alertmanager/alertmanager_metrics.go

Lines changed: 4 additions & 4 deletions

@@ -62,11 +62,11 @@ func newAlertmanagerMetrics() *alertmanagerMetrics {
 		numNotifications: prometheus.NewDesc(
 			"cortex_alertmanager_notifications_total",
 			"The total number of attempted notifications.",
-			[]string{"user"}, nil),
+			[]string{"user", "integration"}, nil),
 		numFailedNotifications: prometheus.NewDesc(
 			"cortex_alertmanager_notifications_failed_total",
 			"The total number of failed notifications.",
-			[]string{"user"}, nil),
+			[]string{"user", "integration"}, nil),
 		notificationLatencySeconds: prometheus.NewDesc(
 			"cortex_alertmanager_notification_latency_seconds",
 			"The latency of notifications in seconds.",
@@ -186,8 +186,8 @@ func (m *alertmanagerMetrics) Collect(out chan<- prometheus.Metric) {
 	data.SendSumOfCountersPerUser(out, m.alertsReceived, "alertmanager_alerts_received_total")
 	data.SendSumOfCountersPerUser(out, m.alertsInvalid, "alertmanager_alerts_invalid_total")
 
-	data.SendSumOfCountersPerUser(out, m.numNotifications, "alertmanager_notifications_total")
-	data.SendSumOfCountersPerUser(out, m.numFailedNotifications, "alertmanager_notifications_failed_total")
+	data.SendSumOfCountersPerUserWithLabels(out, m.numNotifications, "alertmanager_notifications_total", "integration")
+	data.SendSumOfCountersPerUserWithLabels(out, m.numFailedNotifications, "alertmanager_notifications_failed_total", "integration")
 	data.SendSumOfHistograms(out, m.notificationLatencySeconds, "alertmanager_notification_latency_seconds")
 	data.SendSumOfGaugesPerUserWithLabels(out, m.markerAlerts, "alertmanager_alerts", "state")
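The `Collect` side re-exports per-tenant counters under the new two-label `Desc`; judging by its name and usage here, `SendSumOfCountersPerUserWithLabels` sums counters per tenant while keeping the named source label. A minimal, self-contained sketch of the emission pattern this relies on — the aggregation map and its values are hypothetical stand-ins for the real per-tenant sums:

```go
package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	dto "github.com/prometheus/client_model/go"
)

func main() {
	// Hypothetical pre-aggregated sums; the real Collect() derives these by
	// summing counters across every tenant's Alertmanager registry.
	sums := map[string]map[string]float64{
		"user1": {"email": 0, "pagerduty": 1, "wechat": 2},
	}

	// Mirrors the Desc in the diff: "user" plus the preserved "integration" label.
	desc := prometheus.NewDesc(
		"cortex_alertmanager_notifications_failed_total",
		"The total number of failed notifications.",
		[]string{"user", "integration"}, nil,
	)

	out := make(chan prometheus.Metric, 8)
	for user, byIntegration := range sums {
		for integration, v := range byIntegration {
			// Label values are positional and must follow the Desc's label order.
			out <- prometheus.MustNewConstMetric(desc, prometheus.CounterValue, v, user, integration)
		}
	}
	close(out)

	// Decode each emitted metric to show its label pairs and value.
	for m := range out {
		var pb dto.Metric
		if err := m.Write(&pb); err == nil {
			fmt.Println(pb.GetLabel(), pb.GetCounter().GetValue())
		}
	}
}
```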

pkg/alertmanager/alertmanager_metrics_test.go

Lines changed: 48 additions & 6 deletions

@@ -99,14 +99,56 @@ func TestAlertmanagerMetricsStore(t *testing.T) {
 	cortex_alertmanager_notification_latency_seconds_count 24
 	# HELP cortex_alertmanager_notifications_failed_total The total number of failed notifications.
 	# TYPE cortex_alertmanager_notifications_failed_total counter
-	cortex_alertmanager_notifications_failed_total{user="user1"} 28
-	cortex_alertmanager_notifications_failed_total{user="user2"} 280
-	cortex_alertmanager_notifications_failed_total{user="user3"} 2800
+	cortex_alertmanager_notifications_failed_total{integration="email",user="user1"} 0
+	cortex_alertmanager_notifications_failed_total{integration="email",user="user2"} 0
+	cortex_alertmanager_notifications_failed_total{integration="email",user="user3"} 0
+	cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user1"} 5
+	cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user2"} 50
+	cortex_alertmanager_notifications_failed_total{integration="opsgenie",user="user3"} 500
+	cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user1"} 1
+	cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user2"} 10
+	cortex_alertmanager_notifications_failed_total{integration="pagerduty",user="user3"} 100
+	cortex_alertmanager_notifications_failed_total{integration="pushover",user="user1"} 3
+	cortex_alertmanager_notifications_failed_total{integration="pushover",user="user2"} 30
+	cortex_alertmanager_notifications_failed_total{integration="pushover",user="user3"} 300
+	cortex_alertmanager_notifications_failed_total{integration="slack",user="user1"} 4
+	cortex_alertmanager_notifications_failed_total{integration="slack",user="user2"} 40
+	cortex_alertmanager_notifications_failed_total{integration="slack",user="user3"} 400
+	cortex_alertmanager_notifications_failed_total{integration="victorops",user="user1"} 7
+	cortex_alertmanager_notifications_failed_total{integration="victorops",user="user2"} 70
+	cortex_alertmanager_notifications_failed_total{integration="victorops",user="user3"} 700
+	cortex_alertmanager_notifications_failed_total{integration="webhook",user="user1"} 6
+	cortex_alertmanager_notifications_failed_total{integration="webhook",user="user2"} 60
+	cortex_alertmanager_notifications_failed_total{integration="webhook",user="user3"} 600
+	cortex_alertmanager_notifications_failed_total{integration="wechat",user="user1"} 2
+	cortex_alertmanager_notifications_failed_total{integration="wechat",user="user2"} 20
+	cortex_alertmanager_notifications_failed_total{integration="wechat",user="user3"} 200
 	# HELP cortex_alertmanager_notifications_total The total number of attempted notifications.
 	# TYPE cortex_alertmanager_notifications_total counter
-	cortex_alertmanager_notifications_total{user="user1"} 28
-	cortex_alertmanager_notifications_total{user="user2"} 280
-	cortex_alertmanager_notifications_total{user="user3"} 2800
+	cortex_alertmanager_notifications_total{integration="email",user="user1"} 0
+	cortex_alertmanager_notifications_total{integration="email",user="user2"} 0
+	cortex_alertmanager_notifications_total{integration="email",user="user3"} 0
+	cortex_alertmanager_notifications_total{integration="opsgenie",user="user1"} 5
+	cortex_alertmanager_notifications_total{integration="opsgenie",user="user2"} 50
+	cortex_alertmanager_notifications_total{integration="opsgenie",user="user3"} 500
+	cortex_alertmanager_notifications_total{integration="pagerduty",user="user1"} 1
+	cortex_alertmanager_notifications_total{integration="pagerduty",user="user2"} 10
+	cortex_alertmanager_notifications_total{integration="pagerduty",user="user3"} 100
+	cortex_alertmanager_notifications_total{integration="pushover",user="user1"} 3
+	cortex_alertmanager_notifications_total{integration="pushover",user="user2"} 30
+	cortex_alertmanager_notifications_total{integration="pushover",user="user3"} 300
+	cortex_alertmanager_notifications_total{integration="slack",user="user1"} 4
+	cortex_alertmanager_notifications_total{integration="slack",user="user2"} 40
+	cortex_alertmanager_notifications_total{integration="slack",user="user3"} 400
+	cortex_alertmanager_notifications_total{integration="victorops",user="user1"} 7
+	cortex_alertmanager_notifications_total{integration="victorops",user="user2"} 70
+	cortex_alertmanager_notifications_total{integration="victorops",user="user3"} 700
+	cortex_alertmanager_notifications_total{integration="webhook",user="user1"} 6
+	cortex_alertmanager_notifications_total{integration="webhook",user="user2"} 60
+	cortex_alertmanager_notifications_total{integration="webhook",user="user3"} 600
+	cortex_alertmanager_notifications_total{integration="wechat",user="user1"} 2
+	cortex_alertmanager_notifications_total{integration="wechat",user="user2"} 20
+	cortex_alertmanager_notifications_total{integration="wechat",user="user3"} 200
 	# HELP cortex_alertmanager_silences How many silences by state.
 	# TYPE cortex_alertmanager_silences gauge
 	cortex_alertmanager_silences{state="active",user="user1"} 1
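A quick consistency check on the new expectations: for each user the per-integration values still sum to the old totals (user1: 0+5+1+3+4+7+6+2 = 28, user2: 280, user3: 2800), so a query that aggregates the label away, e.g. `sum by (user) (cortex_alertmanager_notifications_failed_total)`, returns the same numbers as before the change.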

pkg/alertmanager/multitenant.go

Lines changed: 1 addition & 1 deletion

@@ -135,7 +135,7 @@ func newMultitenantAlertmanagerMetrics(reg prometheus.Registerer) *multitenantAl
 	m.invalidConfig = promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
 		Namespace: "cortex",
 		Name:      "alertmanager_config_invalid",
-		Help:      "Whenever the Alertmanager config is invalid for a user.",
+		Help:      "Boolean set to 1 whenever the Alertmanager config is invalid for a user.",
 	}, []string{"user"})
 
 	return m
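With the boolean semantics now explicit in the help text, the gauge is easy to alert on directly; for example (illustrative, not part of this change), `cortex_alertmanager_config_invalid == 1` pages on any tenant whose stored Alertmanager configuration is invalid.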

pkg/alertmanager/multitenant_test.go

Lines changed: 4 additions & 4 deletions

@@ -98,7 +98,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.Equal(t, simpleConfigOne, currentConfig.RawConfig)
 
 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
@@ -115,7 +115,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.Len(t, am.alertmanagers, 3)
 
 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
@@ -148,7 +148,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.False(t, userAM.IsActive())
 
 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0
@@ -172,7 +172,7 @@ func TestLoadAllConfigs(t *testing.T) {
 	require.True(t, userAM.IsActive())
 
 	assert.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
-	# HELP cortex_alertmanager_config_invalid Whenever the Alertmanager config is invalid for a user.
+	# HELP cortex_alertmanager_config_invalid Boolean set to 1 whenever the Alertmanager config is invalid for a user.
 	# TYPE cortex_alertmanager_config_invalid gauge
 	cortex_alertmanager_config_invalid{user="user1"} 0
 	cortex_alertmanager_config_invalid{user="user2"} 0

pkg/ruler/manager.go

Lines changed: 31 additions & 6 deletions

@@ -37,9 +37,12 @@ type DefaultMultiTenantManager struct {
 	notifiersMtx sync.Mutex
 	notifiers    map[string]*rulerNotifier
 
-	managersTotal prometheus.Gauge
-	registry      prometheus.Registerer
-	logger        log.Logger
+	managersTotal                 prometheus.Gauge
+	lastReloadSuccessful          *prometheus.GaugeVec
+	lastReloadSuccessfulTimestamp *prometheus.GaugeVec
+	configUpdatesTotal            *prometheus.CounterVec
+	registry                      prometheus.Registerer
+	logger                        log.Logger
 }
 
 func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg prometheus.Registerer, logger log.Logger) (*DefaultMultiTenantManager, error) {
@@ -66,6 +69,21 @@ func NewDefaultMultiTenantManager(cfg Config, managerFactory ManagerFactory, reg
 			Name: "ruler_managers_total",
 			Help: "Total number of managers registered and running in the ruler",
 		}),
+		lastReloadSuccessful: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_last_reload_successful",
+			Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
+		}, []string{"user"}),
+		lastReloadSuccessfulTimestamp: promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_last_reload_successful_seconds",
+			Help:      "Timestamp of the last successful configuration reload.",
+		}, []string{"user"}),
+		configUpdatesTotal: promauto.With(reg).NewCounterVec(prometheus.CounterOpts{
+			Namespace: "cortex",
+			Name:      "ruler_config_updates_total",
+			Help:      "Total number of config updates triggered by a user",
+		}, []string{"user"}),
 		registry: reg,
 		logger:   logger,
 	}, nil
@@ -86,6 +104,9 @@ func (r *DefaultMultiTenantManager) SyncRuleGroups(ctx context.Context, ruleGrou
 		if _, exists := ruleGroups[userID]; !exists {
 			go mngr.Stop()
 			delete(r.userManagers, userID)
+			r.lastReloadSuccessful.DeleteLabelValues(userID)
+			r.lastReloadSuccessfulTimestamp.DeleteLabelValues(userID)
+			r.configUpdatesTotal.DeleteLabelValues(userID)
 			level.Info(r.logger).Log("msg", "deleting rule manager", "user", userID)
 		}
 	}
@@ -100,18 +121,19 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 	// have been updated
 	update, files, err := r.mapper.MapRules(user, groups.Formatted())
 	if err != nil {
+		r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 		level.Error(r.logger).Log("msg", "unable to map rule files", "user", user, "err", err)
 		return
 	}
 
 	if update {
 		level.Debug(r.logger).Log("msg", "updating rules", "user", "user")
-		configUpdatesTotal.WithLabelValues(user).Inc()
+		r.configUpdatesTotal.WithLabelValues(user).Inc()
 		manager, exists := r.userManagers[user]
 		if !exists {
 			manager, err = r.newManager(ctx, user)
 			if err != nil {
-				configUpdateFailuresTotal.WithLabelValues(user, "rule-manager-creation-failure").Inc()
+				r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 				level.Error(r.logger).Log("msg", "unable to create rule manager", "user", user, "err", err)
 				return
 			}
@@ -122,10 +144,13 @@ func (r *DefaultMultiTenantManager) syncRulesToManager(ctx context.Context, user
 		}
 		err = manager.Update(r.cfg.EvaluationInterval, files, nil)
 		if err != nil {
-			configUpdateFailuresTotal.WithLabelValues(user, "rules-update-failure").Inc()
+			r.lastReloadSuccessful.WithLabelValues(user).Set(0)
 			level.Error(r.logger).Log("msg", "unable to update rule manager", "user", user, "err", err)
 			return
 		}
+
+		r.lastReloadSuccessful.WithLabelValues(user).Set(1)
+		r.lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
 	}
 }
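The new metrics follow a simple lifecycle: every failure path (mapping the rule files, creating the manager, updating it) sets the boolean gauge to 0; a fully successful sync sets it to 1 and stamps the timestamp; and both series are deleted together with the tenant's manager. A runnable sketch of that lifecycle — the `reload` helper and its failure condition are hypothetical stand-ins for the real mapping/update steps:

```go
package main

import (
	"errors"
	"log"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// reload stands in for the real mapper.MapRules + manager.Update path;
// its name and failure condition are hypothetical.
func reload(user string) error {
	if user == "user2" {
		return errors.New("invalid rule group")
	}
	return nil
}

func main() {
	reg := prometheus.NewRegistry()
	lastReloadSuccessful := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "cortex",
		Name:      "ruler_config_last_reload_successful",
		Help:      "Boolean set to 1 whenever the last configuration reload attempt was successful.",
	}, []string{"user"})
	lastReloadSuccessfulTimestamp := promauto.With(reg).NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "cortex",
		Name:      "ruler_config_last_reload_successful_seconds",
		Help:      "Timestamp of the last successful configuration reload.",
	}, []string{"user"})

	for _, user := range []string{"user1", "user2"} {
		if err := reload(user); err != nil {
			// Any failure flips the gauge to 0; the success timestamp is left
			// untouched, so staleness can also be alerted on.
			lastReloadSuccessful.WithLabelValues(user).Set(0)
			log.Printf("reload failed for user %s: %v", user, err)
			continue
		}
		lastReloadSuccessful.WithLabelValues(user).Set(1)
		lastReloadSuccessfulTimestamp.WithLabelValues(user).SetToCurrentTime()
	}

	// On tenant removal, both series are deleted so stale values do not linger.
	lastReloadSuccessful.DeleteLabelValues("user2")
	lastReloadSuccessfulTimestamp.DeleteLabelValues("user2")
}
```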

pkg/ruler/ruler.go

Lines changed: 0 additions & 10 deletions

@@ -38,16 +38,6 @@ var (
 		Name: "ruler_ring_check_errors_total",
 		Help: "Number of errors that have occurred when checking the ring for ownership",
 	})
-	configUpdatesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_updates_total",
-		Help:      "Total number of config updates triggered by a user",
-	}, []string{"user"})
-	configUpdateFailuresTotal = promauto.NewCounterVec(prometheus.CounterOpts{
-		Namespace: "cortex",
-		Name:      "ruler_config_update_failures_total",
-		Help:      "Total number of config update failures triggered by a user",
-	}, []string{"user", "reason"})
 )
 
 // Config is the configuration for the recording rules server.