diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1615f6328c..6ff1a4b2ce 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,6 +19,7 @@
 * [FEATURE] Compactor: Add support for percentage based sharding for compactors. #6738
 * [FEATURE] Querier: Allow choosing PromQL engine via header. #6777
 * [ENHANCEMENT] Tenant Federation: Add a # of query result limit logic when the `-tenant-federation.regex-matcher-enabled` is enabled. #6845
+* [ENHANCEMENT] Query Frontend: Add a `cortex_slow_queries_total` metric to track # of slow queries per user. #6859
 * [ENHANCEMENT] Query Frontend: Change to return 400 when the tenant resolving fail. #6715
 * [ENHANCEMENT] Querier: Support query parameters to metadata api (/api/v1/metadata) to allow user to limit metadata to return. Add a `-ingester.return-all-metadata` flag to make the metadata API run when the deployment. Please set this flag to `false` to use the metadata API with the limits later. #6681 #6744
 * [ENHANCEMENT] Ingester: Add a `cortex_ingester_active_native_histogram_series` metric to track # of active NH series. #6695
diff --git a/pkg/frontend/transport/handler.go b/pkg/frontend/transport/handler.go
index a111381f58..29ea818331 100644
--- a/pkg/frontend/transport/handler.go
+++ b/pkg/frontend/transport/handler.go
@@ -108,6 +108,7 @@ type Handler struct {
 	queryChunkBytes *prometheus.CounterVec
 	queryDataBytes  *prometheus.CounterVec
 	rejectedQueries *prometheus.CounterVec
+	slowQueries     *prometheus.CounterVec
 	activeUsers     *util.ActiveUsersCleanupService
 }
 
@@ -167,6 +168,13 @@ func NewHandler(cfg HandlerConfig, tenantFederationCfg tenantfederation.Config,
 		},
 		[]string{"reason", "source", "user"},
 	)
+	h.slowQueries = promauto.With(reg).NewCounterVec(
+		prometheus.CounterOpts{
+			Name: "cortex_slow_queries_total",
+			Help: "The total number of slow queries.",
+		},
+		[]string{"source", "user"},
+	)
 	h.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(h.cleanupMetricsForInactiveUser)
 	// If cleaner stops or fail, we will simply not clean the metrics for inactive users.
 
@@ -209,6 +217,9 @@ func (h *Handler) cleanupMetricsForInactiveUser(user string) {
 	if err := util.DeleteMatchingLabels(h.rejectedQueries, userLabel); err != nil {
 		level.Warn(h.log).Log("msg", "failed to remove cortex_rejected_queries_total metric for user", "user", user, "err", err)
 	}
+	if err := util.DeleteMatchingLabels(h.slowQueries, userLabel); err != nil {
+		level.Warn(h.log).Log("msg", "failed to remove cortex_slow_queries_total metric for user", "user", user, "err", err)
+	}
 }
 
 func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
@@ -294,6 +305,9 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
 	}
 	if shouldReportSlowQuery {
 		f.reportSlowQuery(r, queryString, queryResponseTime)
+		if f.cfg.QueryStatsEnabled {
+			f.slowQueries.WithLabelValues(source, userID).Inc()
+		}
 	}
 
 	if f.cfg.QueryStatsEnabled {
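For reviewers who want the pattern in isolation: the handler change is the usual per-tenant `CounterVec` recipe — register once with `source` and `user` labels, increment only when a query crosses the slow-query threshold, and gate the increment behind the query-stats flag so deployments with stats disabled pay no per-user series cost. Below is a minimal, self-contained sketch of that pattern, not the Cortex handler itself; `observeQuery`, `threshold`, and `statsEnabled` are illustrative names standing in for the real `ServeHTTP` plumbing.

```go
package main

import (
	"fmt"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// slowQueries mirrors the metric registered in NewHandler: one counter
// child per (source, user) combination.
var slowQueries = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "cortex_slow_queries_total",
		Help: "The total number of slow queries.",
	},
	[]string{"source", "user"},
)

// observeQuery is a hypothetical stand-in for the ServeHTTP logic: the
// counter moves only when stats are enabled and the query exceeded the
// configured threshold (LogQueriesLongerThan in the real handler).
func observeQuery(source, user string, took, threshold time.Duration, statsEnabled bool) {
	if statsEnabled && threshold > 0 && took > threshold {
		slowQueries.WithLabelValues(source, user).Inc()
	}
}

func main() {
	prometheus.MustRegister(slowQueries)

	observeQuery("api", "user1", 4*time.Second, 2*time.Second, true)  // slow: counted
	observeQuery("api", "user1", 1*time.Second, 2*time.Second, true)  // fast: skipped
	observeQuery("api", "user1", 4*time.Second, 2*time.Second, false) // stats off: skipped

	// Prints 1: only the first call incremented the counter.
	fmt.Println(testutil.ToFloat64(slowQueries.WithLabelValues("api", "user1")))
}
```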
diff --git a/pkg/frontend/transport/handler_test.go b/pkg/frontend/transport/handler_test.go
index 2e2ea8c3c3..c647ee78d6 100644
--- a/pkg/frontend/transport/handler_test.go
+++ b/pkg/frontend/transport/handler_test.go
@@ -398,6 +398,23 @@ func TestHandler_ServeHTTP(t *testing.T) {
 			},
 			expectedStatusCode: http.StatusUnprocessableEntity,
 		},
+		{
+			name:             "test cortex_slow_queries_total",
+			cfg:              HandlerConfig{QueryStatsEnabled: true, LogQueriesLongerThan: time.Second * 2},
+			expectedMetrics:  7,
+			roundTripperFunc: roundTripperFunc(func(req *http.Request) (*http.Response, error) {
+				time.Sleep(time.Second * 4)
+				return &http.Response{
+					StatusCode: http.StatusOK,
+					Body:       io.NopCloser(strings.NewReader("mock")),
+				}, nil
+			}),
+			additionalMetricsCheckFunc: func(h *Handler) {
+				v := promtest.ToFloat64(h.slowQueries.WithLabelValues(tripperware.SourceAPI, userID))
+				assert.Equal(t, float64(1), v)
+			},
+			expectedStatusCode: http.StatusOK,
+		},
 	} {
 		t.Run(tt.name, func(t *testing.T) {
 			reg := prometheus.NewPedanticRegistry()
@@ -420,6 +437,7 @@ func TestHandler_ServeHTTP(t *testing.T) {
 				"cortex_query_fetched_chunks_bytes_total",
 				"cortex_query_samples_scanned_total",
 				"cortex_query_peak_samples",
+				"cortex_slow_queries_total",
 			)
 			assert.NoError(t, err)
 
@@ -713,6 +731,7 @@ func TestHandlerMetricsCleanup(t *testing.T) {
 	handler.queryChunkBytes.WithLabelValues(source, user1).Add(1024)
 	handler.queryDataBytes.WithLabelValues(source, user1).Add(2048)
 	handler.rejectedQueries.WithLabelValues(reasonTooManySamples, source, user1).Add(5)
+	handler.slowQueries.WithLabelValues(source, user1).Add(5)
 
 	// Simulate activity for user2
 	handler.querySeconds.WithLabelValues(source, user2).Add(2.0)
@@ -723,6 +742,7 @@ func TestHandlerMetricsCleanup(t *testing.T) {
 	handler.queryChunkBytes.WithLabelValues(source, user2).Add(2048)
 	handler.queryDataBytes.WithLabelValues(source, user2).Add(4096)
 	handler.rejectedQueries.WithLabelValues(reasonTooManySamples, source, user2).Add(10)
+	handler.slowQueries.WithLabelValues(source, user2).Add(10)
 
 	// Verify initial state - both users should have metrics
 	require.NoError(t, promtest.GatherAndCompare(reg, strings.NewReader(`
@@ -762,9 +782,13 @@ func TestHandlerMetricsCleanup(t *testing.T) {
 		# TYPE cortex_rejected_queries_total counter
 		cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user1"} 5
 		cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user2"} 10
+		# HELP cortex_slow_queries_total The total number of slow queries.
+		# TYPE cortex_slow_queries_total counter
+		cortex_slow_queries_total{source="api",user="user1"} 5
+		cortex_slow_queries_total{source="api",user="user2"} 10
 	`), "cortex_query_seconds_total", "cortex_query_fetched_series_total", "cortex_query_samples_total",
 		"cortex_query_samples_scanned_total", "cortex_query_peak_samples", "cortex_query_fetched_chunks_bytes_total",
-		"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total"))
+		"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total", "cortex_slow_queries_total"))
 
 	// Clean up metrics for user1
 	handler.cleanupMetricsForInactiveUser(user1)
@@ -797,7 +821,10 @@ func TestHandlerMetricsCleanup(t *testing.T) {
 		# HELP cortex_rejected_queries_total The total number of queries that were rejected.
 		# TYPE cortex_rejected_queries_total counter
 		cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user2"} 10
+		# HELP cortex_slow_queries_total The total number of slow queries.
+		# TYPE cortex_slow_queries_total counter
+		cortex_slow_queries_total{source="api",user="user2"} 10
 	`), "cortex_query_seconds_total", "cortex_query_fetched_series_total", "cortex_query_samples_total",
 		"cortex_query_samples_scanned_total", "cortex_query_peak_samples", "cortex_query_fetched_chunks_bytes_total",
-		"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total"))
+		"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total", "cortex_slow_queries_total"))
 }
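A note on the cleanup path exercised by `TestHandlerMetricsCleanup`: without the added `DeleteMatchingLabels` call, any tenant that ever triggered a slow query would keep its `cortex_slow_queries_total` series in memory until restart. Cortex routes this through its own `util.DeleteMatchingLabels` helper; continuing the sketch above, a rough equivalent using only stock client_golang (an approximation, not the Cortex helper) is `DeletePartialMatch`:

```go
// Drop every cortex_slow_queries_total series whose labels include
// user="user1"; returns how many series were removed.
removed := slowQueries.DeletePartialMatch(prometheus.Labels{"user": "user1"})
fmt.Printf("removed %d slow-query series\n", removed)
```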