Add a cortex_slow_queries_total to track # of slow queries #6859

Open: wants to merge 1 commit into master
CHANGELOG.md: 1 addition, 0 deletions
@@ -19,6 +19,7 @@
* [FEATURE] Compactor: Add support for percentage based sharding for compactors. #6738
* [FEATURE] Querier: Allow choosing PromQL engine via header. #6777
* [ENHANCEMENT] Tenant Federation: Add a limit on the number of query results when `-tenant-federation.regex-matcher-enabled` is enabled. #6845
* [ENHANCEMENT] Query Frontend: Add a `cortex_slow_queries_total` metric to track # of slow queries per user. #6859
* [ENHANCEMENT] Query Frontend: Change to return 400 when tenant resolving fails. #6715
* [ENHANCEMENT] Querier: Support query parameters on the metadata API (/api/v1/metadata) to allow users to limit the metadata returned. Add a `-ingester.return-all-metadata` flag to preserve the existing behavior during deployment; set this flag to `false` to use the metadata API with the limits. #6681 #6744
* [ENHANCEMENT] Ingester: Add a `cortex_ingester_active_native_histogram_series` metric to track # of active NH series. #6695
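The new counter is exposed on the query frontend's `/metrics` endpoint with `source` and `user` labels. As a minimal illustration (not code from this PR, and with a hypothetical frontend address), the per-user slow-query counts could be read back with the Prometheus text parser:

```go
package main

import (
	"fmt"
	"log"
	"net/http"

	"github.com/prometheus/common/expfmt"
)

func main() {
	// Hypothetical query-frontend address; substitute your own deployment's.
	resp, err := http.Get("http://query-frontend:8080/metrics")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// Parse the Prometheus text exposition format served by the frontend.
	var parser expfmt.TextParser
	families, err := parser.TextToMetricFamilies(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	// Print the slow-query count for every (source, user) series.
	if mf, ok := families["cortex_slow_queries_total"]; ok {
		for _, m := range mf.GetMetric() {
			labels := map[string]string{}
			for _, lp := range m.GetLabel() {
				labels[lp.GetName()] = lp.GetValue()
			}
			fmt.Printf("source=%s user=%s slow_queries=%v\n",
				labels["source"], labels["user"], m.GetCounter().GetValue())
		}
	}
}
```

In practice the metric would simply be scraped by Prometheus like the frontend's other per-user query metrics, e.g. to alert on per-tenant slow-query rates.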
pkg/frontend/transport/handler.go: 14 additions, 0 deletions
@@ -108,6 +108,7 @@ type Handler struct {
queryChunkBytes *prometheus.CounterVec
queryDataBytes *prometheus.CounterVec
rejectedQueries *prometheus.CounterVec
slowQueries *prometheus.CounterVec
activeUsers *util.ActiveUsersCleanupService
}

@@ -167,6 +168,13 @@ func NewHandler(cfg HandlerConfig, tenantFederationCfg tenantfederation.Config,
},
[]string{"reason", "source", "user"},
)
h.slowQueries = promauto.With(reg).NewCounterVec(
prometheus.CounterOpts{
Name: "cortex_slow_queries_total",
Help: "The total number of slow queries.",
},
[]string{"source", "user"},
)

h.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(h.cleanupMetricsForInactiveUser)
// If the cleaner stops or fails, we will simply not clean the metrics for inactive users.
@@ -209,6 +217,9 @@ func (h *Handler) cleanupMetricsForInactiveUser(user string) {
if err := util.DeleteMatchingLabels(h.rejectedQueries, userLabel); err != nil {
level.Warn(h.log).Log("msg", "failed to remove cortex_rejected_queries_total metric for user", "user", user, "err", err)
}
if err := util.DeleteMatchingLabels(h.slowQueries, userLabel); err != nil {
level.Warn(h.log).Log("msg", "failed to remove cortex_slow_queries_total metric for user", "user", user, "err", err)
}
}

func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
@@ -294,6 +305,9 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
}
if shouldReportSlowQuery {
f.reportSlowQuery(r, queryString, queryResponseTime)
if f.cfg.QueryStatsEnabled {
f.slowQueries.WithLabelValues(source, userID).Inc()
}
}

if f.cfg.QueryStatsEnabled {
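To recap the handler.go changes above: ServeHTTP already measures each query's round-trip time and calls reportSlowQuery when it exceeds the configured threshold; this PR increments the new counter on that same path, gated on query stats being enabled. The sketch below isolates that decision. LogQueriesLongerThan and QueryStatsEnabled are the config fields used in the diff and tests, while the surrounding types and the exact threshold comparison are simplified assumptions, not the real handler code.

```go
package transport

import (
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

// slowQuerySettings is an illustrative stand-in for the relevant pieces of
// HandlerConfig; it is not the real type.
type slowQuerySettings struct {
	LogQueriesLongerThan time.Duration // slow-query threshold; 0 disables reporting
	QueryStatsEnabled    bool          // per-query stats (and the new counter) on/off
}

// countIfSlow isolates the decision made inside ServeHTTP: when the query ran
// longer than the threshold it is reported as slow and, if query stats are
// enabled, counted in cortex_slow_queries_total for its (source, user) pair.
// The threshold comparison here is a simplification of the real handler's check.
func countIfSlow(cfg slowQuerySettings, slowQueries *prometheus.CounterVec,
	source, userID string, queryResponseTime time.Duration) {

	shouldReportSlowQuery := cfg.LogQueriesLongerThan != 0 &&
		queryResponseTime > cfg.LogQueriesLongerThan
	if !shouldReportSlowQuery {
		return
	}

	// The real handler logs the slow query here (reportSlowQuery) ...
	if cfg.QueryStatsEnabled {
		// ... and, with this PR, also bumps the per-user counter.
		slowQueries.WithLabelValues(source, userID).Inc()
	}
}
```

As the diff shows, the increment sits inside the existing shouldReportSlowQuery branch and is additionally gated on QueryStatsEnabled, matching the other per-user query metrics in this handler.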
pkg/frontend/transport/handler_test.go: 29 additions, 2 deletions
@@ -398,6 +398,23 @@ func TestHandler_ServeHTTP(t *testing.T) {
},
expectedStatusCode: http.StatusUnprocessableEntity,
},
{
name: "test cortex_slow_queries_total",
cfg: HandlerConfig{QueryStatsEnabled: true, LogQueriesLongerThan: time.Second * 2},
expectedMetrics: 7,
roundTripperFunc: roundTripperFunc(func(req *http.Request) (*http.Response, error) {
time.Sleep(time.Second * 4)
return &http.Response{
StatusCode: http.StatusOK,
Body: io.NopCloser(strings.NewReader("mock")),
}, nil
}),
additionalMetricsCheckFunc: func(h *Handler) {
v := promtest.ToFloat64(h.slowQueries.WithLabelValues(tripperware.SourceAPI, userID))
assert.Equal(t, float64(1), v)
},
expectedStatusCode: http.StatusOK,
},
} {
t.Run(tt.name, func(t *testing.T) {
reg := prometheus.NewPedanticRegistry()
@@ -420,6 +437,7 @@
"cortex_query_fetched_chunks_bytes_total",
"cortex_query_samples_scanned_total",
"cortex_query_peak_samples",
"cortex_slow_queries_total",
)

assert.NoError(t, err)
@@ -713,6 +731,7 @@ func TestHandlerMetricsCleanup(t *testing.T) {
handler.queryChunkBytes.WithLabelValues(source, user1).Add(1024)
handler.queryDataBytes.WithLabelValues(source, user1).Add(2048)
handler.rejectedQueries.WithLabelValues(reasonTooManySamples, source, user1).Add(5)
handler.slowQueries.WithLabelValues(source, user1).Add(5)

// Simulate activity for user2
handler.querySeconds.WithLabelValues(source, user2).Add(2.0)
@@ -723,6 +742,7 @@
handler.queryChunkBytes.WithLabelValues(source, user2).Add(2048)
handler.queryDataBytes.WithLabelValues(source, user2).Add(4096)
handler.rejectedQueries.WithLabelValues(reasonTooManySamples, source, user2).Add(10)
handler.slowQueries.WithLabelValues(source, user2).Add(10)

// Verify initial state - both users should have metrics
require.NoError(t, promtest.GatherAndCompare(reg, strings.NewReader(`
@@ -762,9 +782,13 @@
# TYPE cortex_rejected_queries_total counter
cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user1"} 5
cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user2"} 10
# HELP cortex_slow_queries_total The total number of slow queries.
# TYPE cortex_slow_queries_total counter
cortex_slow_queries_total{source="api",user="user1"} 5
cortex_slow_queries_total{source="api",user="user2"} 10
`), "cortex_query_seconds_total", "cortex_query_fetched_series_total", "cortex_query_samples_total",
"cortex_query_samples_scanned_total", "cortex_query_peak_samples", "cortex_query_fetched_chunks_bytes_total",
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total"))
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total", "cortex_slow_queries_total"))

// Clean up metrics for user1
handler.cleanupMetricsForInactiveUser(user1)
@@ -797,7 +821,10 @@
# HELP cortex_rejected_queries_total The total number of queries that were rejected.
# TYPE cortex_rejected_queries_total counter
cortex_rejected_queries_total{reason="too_many_samples",source="api",user="user2"} 10
# HELP cortex_slow_queries_total The total number of slow queries.
# TYPE cortex_slow_queries_total counter
cortex_slow_queries_total{source="api",user="user2"} 10
`), "cortex_query_seconds_total", "cortex_query_fetched_series_total", "cortex_query_samples_total",
"cortex_query_samples_scanned_total", "cortex_query_peak_samples", "cortex_query_fetched_chunks_bytes_total",
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total"))
"cortex_query_fetched_data_bytes_total", "cortex_rejected_queries_total", "cortex_slow_queries_total"))
}