Skip to content

Commit 4457537

Browse files
authored
Add a cortex_slow_queries_total to track # of slow queries (#6859)
* Add a cortex_slow_queries_total to track # of slow queries

  Signed-off-by: SungJin1212 <[email protected]>

* Make cortex_slow_queries_total init lazily

  Signed-off-by: SungJin1212 <[email protected]>

---------

Signed-off-by: SungJin1212 <[email protected]>
1 parent 91b1377 commit 4457537

File tree

3 files changed

+57
-3
lines changed

3 files changed

+57
-3
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
* [FEATURE] Compactor: Add support for percentage based sharding for compactors. #6738
2020
* [FEATURE] Querier: Allow choosing PromQL engine via header. #6777
2121
* [ENHANCEMENT] Tenant Federation: Add a # of query result limit logic when the `-tenant-federation.regex-matcher-enabled` is enabled. #6845
22+
* [ENHANCEMENT] Query Frontend: Add a `cortex_slow_queries_total` metric to track # of slow queries per user. #6859
2223
* [ENHANCEMENT] Query Frontend: Change to return 400 when the tenant resolving fail. #6715
2324
* [ENHANCEMENT] Querier: Support query parameters to metadata api (/api/v1/metadata) to allow user to limit metadata to return. Add a `-ingester.return-all-metadata` flag to make the metadata API run when the deployment. Please set this flag to `false` to use the metadata API with the limits later. #6681 #6744
2425
* [ENHANCEMENT] Ingester: Add a `cortex_ingester_active_native_histogram_series` metric to track # of active NH series. #6695

pkg/frontend/transport/handler.go

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
"net/url"
1212
"strconv"
1313
"strings"
14+
"sync"
1415
"syscall"
1516
"time"
1617

@@ -108,7 +109,11 @@ type Handler struct {
108109
queryChunkBytes *prometheus.CounterVec
109110
queryDataBytes *prometheus.CounterVec
110111
rejectedQueries *prometheus.CounterVec
112+
slowQueries *prometheus.CounterVec
111113
activeUsers *util.ActiveUsersCleanupService
114+
115+
initSlowQueryMetric sync.Once
116+
reg prometheus.Registerer
112117
}
113118

114119
// NewHandler creates a new frontend handler.
@@ -118,6 +123,7 @@ func NewHandler(cfg HandlerConfig, tenantFederationCfg tenantfederation.Config,
118123
tenantFederationCfg: tenantFederationCfg,
119124
log: log,
120125
roundTripper: roundTripper,
126+
reg: reg,
121127
}
122128

123129
if cfg.QueryStatsEnabled {
@@ -167,7 +173,6 @@ func NewHandler(cfg HandlerConfig, tenantFederationCfg tenantfederation.Config,
167173
},
168174
[]string{"reason", "source", "user"},
169175
)
170-
171176
h.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(h.cleanupMetricsForInactiveUser)
172177
// If cleaner stops or fail, we will simply not clean the metrics for inactive users.
173178
_ = h.activeUsers.StartAsync(context.Background())
@@ -176,6 +181,19 @@ func NewHandler(cfg HandlerConfig, tenantFederationCfg tenantfederation.Config,
176181
return h
177182
}
178183

184+
func (h *Handler) getOrCreateSlowQueryMetric() *prometheus.CounterVec {
185+
h.initSlowQueryMetric.Do(func() {
186+
h.slowQueries = promauto.With(h.reg).NewCounterVec(
187+
prometheus.CounterOpts{
188+
Name: "cortex_slow_queries_total",
189+
Help: "The total number of slow queries.",
190+
},
191+
[]string{"source", "user"},
192+
)
193+
})
194+
return h.slowQueries
195+
}
196+
179197
func (h *Handler) cleanupMetricsForInactiveUser(user string) {
180198
if !h.cfg.QueryStatsEnabled {
181199
return
@@ -209,6 +227,11 @@ func (h *Handler) cleanupMetricsForInactiveUser(user string) {
209227
if err := util.DeleteMatchingLabels(h.rejectedQueries, userLabel); err != nil {
210228
level.Warn(h.log).Log("msg", "failed to remove cortex_rejected_queries_total metric for user", "user", user, "err", err)
211229
}
230+
if h.slowQueries != nil {
231+
if err := util.DeleteMatchingLabels(h.slowQueries, userLabel); err != nil {
232+
level.Warn(h.log).Log("msg", "failed to remove cortex_slow_queries_total metric for user", "user", user, "err", err)
233+
}
234+
}
212235
}
213236

214237
func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
@@ -294,6 +317,9 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
294317
}
295318
if shouldReportSlowQuery {
296319
f.reportSlowQuery(r, queryString, queryResponseTime)
320+
if f.cfg.QueryStatsEnabled {
321+
f.getOrCreateSlowQueryMetric().WithLabelValues(source, userID).Inc()
322+
}
297323
}
298324

299325
if f.cfg.QueryStatsEnabled {

pkg/frontend/transport/handler_test.go

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,23 @@ func TestHandler_ServeHTTP(t *testing.T) {
398398
},
399399
expectedStatusCode: http.StatusUnprocessableEntity,
400400
},
401+
{
402+
name: "test cortex_slow_queries_total",
403+
cfg: HandlerConfig{QueryStatsEnabled: true, LogQueriesLongerThan: time.Second * 2},
404+
expectedMetrics: 7,
405+
roundTripperFunc: roundTripperFunc(func(req *http.Request) (*http.Response, error) {
406+
time.Sleep(time.Second * 4)
407+
return &http.Response{
408+
StatusCode: http.StatusOK,
409+
Body: io.NopCloser(strings.NewReader("mock")),
410+
}, nil
411+
}),
412+
additionalMetricsCheckFunc: func(h *Handler) {
413+
v := promtest.ToFloat64(h.slowQueries.WithLabelValues(tripperware.SourceAPI, userID))
414+
assert.Equal(t, float64(1), v)
415+
},
416+
expectedStatusCode: http.StatusOK,
417+
},
401418
} {
402419
t.Run(tt.name, func(t *testing.T) {
403420
reg := prometheus.NewPedanticRegistry()
@@ -420,6 +437,7 @@ func TestHandler_ServeHTTP(t *testing.T) {
420437
"cortex_query_fetched_chunks_bytes_total",
421438
"cortex_query_samples_scanned_total",
422439
"cortex_query_peak_samples",
440+
"cortex_slow_queries_total",
423441
)
424442

425443
assert.NoError(t, err)
@@ -713,6 +731,7 @@ func TestHandlerMetricsCleanup(t *testing.T) {
713731
handler.queryChunkBytes.WithLabelValues(source, user1).Add(1024)
714732
handler.queryDataBytes.WithLabelValues(source, user1).Add(2048)
715733
handler.rejectedQueries.WithLabelValues(reasonTooManySamples, source, user1).Add(5)
734+
handler.getOrCreateSlowQueryMetric().WithLabelValues(source, user1).Add(5)
716735

717736
// Simulate activity for user2
718737
handler.querySeconds.WithLabelValues(source, user2).Add(2.0)
@@ -723,6 +742,7 @@ func TestHandlerMetricsCleanup(t *testing.T) {
723742
handler.queryChunkBytes.WithLabelValues(source, user2).Add(2048)
724743
handler.queryDataBytes.WithLabelValues(source, user2).Add(4096)
725744
handler.rejectedQueries.WithLabelValues(reasonTooManySamples, source, user2).Add(10)
745+
handler.getOrCreateSlowQueryMetric().WithLabelValues(source, user2).Add(10)
726746

727747
// Verify initial state - both users should have metrics
728748
require.NoError(t, promtest.GatherAndCompare(reg, strings.NewReader(`
@@ -762,9 +782,13 @@ func TestHandlerMetricsCleanup(t *testing.T) {
762782
# TYPE cortex_rejected_queries_total counter
763783
cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user1"} 5
764784
cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user2"} 10
785+
# HELP cortex_slow_queries_total The total number of slow queries.
786+
# TYPE cortex_slow_queries_total counter
787+
cortex_slow_queries_total{source="api",user="user1"} 5
788+
cortex_slow_queries_total{source="api",user="user2"} 10
765789
`), "cortex_query_seconds_total", "cortex_query_fetched_series_total", "cortex_query_samples_total",
766790
"cortex_query_samples_scanned_total", "cortex_query_peak_samples", "cortex_query_fetched_chunks_bytes_total",
767-
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total"))
791+
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total", "cortex_slow_queries_total"))
768792

769793
// Clean up metrics for user1
770794
handler.cleanupMetricsForInactiveUser(user1)
@@ -797,7 +821,10 @@ func TestHandlerMetricsCleanup(t *testing.T) {
797821
# HELP cortex_rejected_queries_total The total number of queries that were rejected.
798822
# TYPE cortex_rejected_queries_total counter
799823
cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user2"} 10
824+
# HELP cortex_slow_queries_total The total number of slow queries.
825+
# TYPE cortex_slow_queries_total counter
826+
cortex_slow_queries_total{source="api",user="user2"} 10
800827
`), "cortex_query_seconds_total", "cortex_query_fetched_series_total", "cortex_query_samples_total",
801828
"cortex_query_samples_scanned_total", "cortex_query_peak_samples", "cortex_query_fetched_chunks_bytes_total",
802-
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total"))
829+
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total", "cortex_slow_queries_total"))
803830
}

0 commit comments

Comments
 (0)