From b621e7872f94c7627b60a7c526fe56e74a6fafa4 Mon Sep 17 00:00:00 2001 From: Daniel Dao Date: Tue, 22 Mar 2022 13:23:32 +0000 Subject: [PATCH 1/7] Replace runc with dqminh/runc for psi support Signed-off-by: Daniel Dao --- cmd/go.mod | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/go.mod b/cmd/go.mod index 690bc52858..b4ee58df2b 100644 --- a/cmd/go.mod +++ b/cmd/go.mod @@ -129,3 +129,5 @@ require ( google.golang.org/protobuf v1.36.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) + +replace github.com/opencontainers/runc => github.com/dqminh/runc v0.0.0-20220513155811-6414629ada8a From d3fefb97867eab8518a4afe80dc902879b3b4659 Mon Sep 17 00:00:00 2001 From: Daniel Dao Date: Tue, 22 Mar 2022 16:56:43 +0000 Subject: [PATCH 2/7] Expose PSI stats in libcontainer handler This adds 2 new set of metrics: - `psi_total`: read total number of seconds a resource is under pressure - `psi_avg`: read ratio of time a resource is under pressure over a sliding time window. For more details about these definitions, see: - https://www.kernel.org/doc/html/latest/accounting/psi.html - https://facebookmicrosites.github.io/psi/docs/overview Signed-off-by: Daniel Dao --- cmd/cadvisor_test.go | 2 ++ container/factory.go | 4 ++++ container/libcontainer/handler.go | 20 ++++++++++++++++++ container/libcontainer/handler_test.go | 28 ++++++++++++++++++++++++++ info/v1/container.go | 18 +++++++++++++++++ 5 files changed, 72 insertions(+) diff --git a/cmd/cadvisor_test.go b/cmd/cadvisor_test.go index 58461ae182..7d3c04b9d3 100644 --- a/cmd/cadvisor_test.go +++ b/cmd/cadvisor_test.go @@ -112,6 +112,8 @@ func TestToIncludedMetrics(t *testing.T) { container.ResctrlMetrics: struct{}{}, container.CPUSetMetrics: struct{}{}, container.OOMMetrics: struct{}{}, + container.PSITotalMetrics: struct{}{}, + container.PSIAvgMetrics: struct{}{}, }, container.AllMetrics, {}, diff --git a/container/factory.go b/container/factory.go index c48a64e163..ca8e80ad38 100644 --- a/container/factory.go +++ b/container/factory.go @@ -66,6 +66,8 @@ const ( ResctrlMetrics MetricKind = "resctrl" CPUSetMetrics MetricKind = "cpuset" OOMMetrics MetricKind = "oom_event" + PSITotalMetrics MetricKind = "psi_total" + PSIAvgMetrics MetricKind = "psi_avg" ) // AllMetrics represents all kinds of metrics that cAdvisor supported. @@ -91,6 +93,8 @@ var AllMetrics = MetricSet{ ResctrlMetrics: struct{}{}, CPUSetMetrics: struct{}{}, OOMMetrics: struct{}{}, + PSITotalMetrics: struct{}{}, + PSIAvgMetrics: struct{}{}, } // AllNetworkMetrics represents all network metrics that cAdvisor supports. diff --git a/container/libcontainer/handler.go b/container/libcontainer/handler.go index 5bf1a4f997..f667313183 100644 --- a/container/libcontainer/handler.go +++ b/container/libcontainer/handler.go @@ -763,6 +763,20 @@ func (h *Handler) GetProcesses() ([]int, error) { return pids, nil } +// Convert libcontainer cgroups.PSIData to info.PSIData +func convertPSIData(from *cgroups.PSIData, to *info.PSIData) { + to.Avg10 = from.Avg10 + to.Avg60 = from.Avg60 + to.Avg300 = from.Avg300 + to.Total = from.Total +} + +// Convert libcontainer cgroups.PSIStats to info.PSIStats +func convertPSI(from *cgroups.PSIStats, to *info.PSIStats) { + convertPSIData(&from.Some, &to.Some) + convertPSIData(&from.Full, &to.Full) +} + // Convert libcontainer stats to info.ContainerStats. 
func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) { ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode @@ -772,6 +786,8 @@ func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) { ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime + convertPSI(&s.CpuStats.PSI, &ret.Cpu.PSI) + if !withPerCPU { return } @@ -792,6 +808,8 @@ func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) { ret.DiskIo.IoWaitTime = diskStatsCopy(s.BlkioStats.IoWaitTimeRecursive) ret.DiskIo.IoMerged = diskStatsCopy(s.BlkioStats.IoMergedRecursive) ret.DiskIo.IoTime = diskStatsCopy(s.BlkioStats.IoTimeRecursive) + + convertPSI(&s.BlkioStats.PSI, &ret.DiskIo.PSI) } func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) { @@ -800,6 +818,8 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) { ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage + convertPSI(&s.MemoryStats.PSI, &ret.Memory.PSI) + if cgroups.IsCgroup2UnifiedMode() { ret.Memory.Cache = s.MemoryStats.Stats["file"] ret.Memory.RSS = s.MemoryStats.Stats["anon"] diff --git a/container/libcontainer/handler_test.go b/container/libcontainer/handler_test.go index 82da0b3e67..92423b390b 100644 --- a/container/libcontainer/handler_test.go +++ b/container/libcontainer/handler_test.go @@ -110,6 +110,20 @@ func TestSetCPUStats(t *testing.T) { UsageInKernelmode: 734746 * nanosecondsInSeconds / clockTicks, UsageInUsermode: 2767637 * nanosecondsInSeconds / clockTicks, }, + PSI: cgroups.PSIStats{ + Some: cgroups.PSIData{ + Avg10: 0.1, + Avg60: 0.2, + Avg300: 0.3, + Total: 100, + }, + Full: cgroups.PSIData{ + Avg10: 0.4, + Avg60: 0.5, + Avg300: 0.6, + Total: 200, + }, + }, }, } var ret info.ContainerStats @@ -123,6 +137,20 @@ func TestSetCPUStats(t *testing.T) { System: s.CpuStats.CpuUsage.UsageInKernelmode, Total: 33802947350272, }, + PSI: info.PSIStats{ + Some: info.PSIData{ + Avg10: 0.1, + Avg60: 0.2, + Avg300: 0.3, + Total: 100, + }, + Full: info.PSIData{ + Avg10: 0.4, + Avg60: 0.5, + Avg300: 0.6, + Total: 200, + }, + }, }, } diff --git a/info/v1/container.go b/info/v1/container.go index ae1d9caecc..a7cf722a9e 100644 --- a/info/v1/container.go +++ b/info/v1/container.go @@ -261,6 +261,18 @@ func (ci *ContainerInfo) StatsEndTime() time.Time { return ret } +type PSIData struct { + Avg10 float64 `json:"avg10"` + Avg60 float64 `json:"avg60"` + Avg300 float64 `json:"avg300"` + Total uint64 `json:"total"` +} + +type PSIStats struct { + Some PSIData `json:"some,omitempty"` + Full PSIData `json:"full,omitempty"` +} + // This mirrors kernel internal structure. type LoadStats struct { // Number of sleeping tasks. 
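The two structs added above mirror the layout of the kernel's cgroup v2 pressure files (`cpu.pressure`, `memory.pressure`, `io.pressure`). As a rough standalone sketch, not part of this patch (in cAdvisor the parsing is done by the runc/libcontainer cgroups package, and `parsePressure` below is a hypothetical helper), this is how the fields line up with a pressure file:

```
// Standalone sketch (not part of this patch): shows how the new PSIData/PSIStats
// fields line up with a cgroup v2 pressure file. In cAdvisor the parsing is done
// by the runc/libcontainer cgroups package; parsePressure here is hypothetical.
package main

import (
	"fmt"
	"strings"
)

// Local stand-ins mirroring the types added to info/v1 above.
type PSIData struct {
	Avg10  float64
	Avg60  float64
	Avg300 float64
	Total  uint64
}

type PSIStats struct {
	Some PSIData
	Full PSIData
}

// parsePressure reads the lines of a pressure file, e.g. cpu.pressure:
//   some avg10=0.12 avg60=0.08 avg300=0.02 total=1234567
//   full avg10=0.00 avg60=0.00 avg300=0.00 total=23456
func parsePressure(content string) (PSIStats, error) {
	var stats PSIStats
	for _, line := range strings.Split(strings.TrimSpace(content), "\n") {
		var kind string
		var d PSIData
		if _, err := fmt.Sscanf(line, "%s avg10=%f avg60=%f avg300=%f total=%d",
			&kind, &d.Avg10, &d.Avg60, &d.Avg300, &d.Total); err != nil {
			return stats, err
		}
		switch kind {
		case "some":
			stats.Some = d
		case "full":
			stats.Full = d
		}
	}
	return stats, nil
}

func main() {
	stats, err := parsePressure("some avg10=0.12 avg60=0.08 avg300=0.02 total=1234567\nfull avg10=0.00 avg60=0.00 avg300=0.00 total=23456")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", stats)
}
```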
@@ -335,6 +347,8 @@ type CpuStats struct { LoadAverage int32 `json:"load_average"` // from LoadStats.NrUninterruptible LoadDAverage int32 `json:"load_d_average"` + + PSI PSIStats `json:"psi,omitempty"` } type PerDiskStats struct { @@ -353,6 +367,8 @@ type DiskIoStats struct { IoWaitTime []PerDiskStats `json:"io_wait_time,omitempty"` IoMerged []PerDiskStats `json:"io_merged,omitempty"` IoTime []PerDiskStats `json:"io_time,omitempty"` + + PSI PSIStats `json:"psi,omitempty"` } type HugetlbStats struct { @@ -411,6 +427,8 @@ type MemoryStats struct { ContainerData MemoryStatsMemoryData `json:"container_data,omitempty"` HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"` + + PSI PSIStats `json:"psi,omitempty"` } type CPUSetStats struct { From 6b23ac76025796c52fc153c26821d0116475ea80 Mon Sep 17 00:00:00 2001 From: Daniel Dao Date: Tue, 22 Mar 2022 16:57:52 +0000 Subject: [PATCH 3/7] Expose PSI metrics with prometheus This adds support for reading PSI metrics via prometheus. We exposes the following for `psi_total`: ``` container_cpu_psi_total_seconds container_memory_psi_total_seconds container_io_psi_total_seconds ``` And for `psi_avg`: ``` container_cpu_psi_avg10_ratio container_cpu_psi_avg60_ratio container_cpu_psi_avg300_ratio container_memory_psi_avg10_ratio container_memory_psi_avg60_ratio container_memory_psi_avg300_ratio container_io_psi_avg10_ratio container_io_psi_avg60_ratio container_io_psi_avg300_ratio ``` Signed-off-by: Daniel Dao --- metrics/prometheus.go | 78 +++++++++++++++++++ metrics/prometheus_fake.go | 42 ++++++++++ metrics/testdata/prometheus_metrics | 48 ++++++++++++ .../prometheus_metrics_whitelist_filtered | 48 ++++++++++++ 4 files changed, 216 insertions(+) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 86064819d3..5d796cbf8d 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -1746,6 +1746,64 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri }) } + if includedMetrics.Has(container.PSITotalMetrics) { + c.containerMetrics = append(c.containerMetrics, []containerMetric{ + { + name: "container_cpu_psi_total_seconds", + help: "Total time spent under cpu pressure in seconds.", + valueType: prometheus.CounterValue, + extraLabels: []string{"kind"}, + getValues: func(s *info.ContainerStats) metricValues { + return getPSIValues(s, &s.Cpu.PSI, "total") + }, + }, { + name: "container_memory_psi_total_seconds", + help: "Total container time spent under memory pressure in seconds.", + valueType: prometheus.CounterValue, + extraLabels: []string{"kind"}, + getValues: func(s *info.ContainerStats) metricValues { + return getPSIValues(s, &s.Memory.PSI, "total") + }, + }, { + name: "container_io_psi_total_seconds", + help: "Total time spent under io pressure in seconds.", + valueType: prometheus.CounterValue, + extraLabels: []string{"kind"}, + getValues: func(s *info.ContainerStats) metricValues { + return getPSIValues(s, &s.DiskIo.PSI, "total") + }, + }, + }...) 
+ } + + if includedMetrics.Has(container.PSIAvgMetrics) { + makePSIAvgMetric := func(controller, window string) containerMetric { + return containerMetric{ + name: fmt.Sprintf("container_%s_psi_avg%s_ratio", controller, window), + help: fmt.Sprintf("Ratio of time spent under %s pressure over time window of %s seconds", controller, window), + valueType: prometheus.GaugeValue, + extraLabels: []string{"kind"}, + getValues: func(s *info.ContainerStats) metricValues { + switch controller { + case "cpu": + return getPSIValues(s, &s.Cpu.PSI, "avg"+window) + case "memory": + return getPSIValues(s, &s.Memory.PSI, "avg"+window) + case "io": + return getPSIValues(s, &s.DiskIo.PSI, "avg"+window) + default: + return nil + } + }, + } + } + for _, controller := range []string{"cpu", "memory", "io"} { + for _, window := range []string{"10", "60", "300"} { + c.containerMetrics = append(c.containerMetrics, makePSIAvgMetric(controller, window)) + } + } + } + return c } @@ -2038,3 +2096,23 @@ func getMinCoreScalingRatio(s *info.ContainerStats) metricValues { } return values } + +func getPSIValues(s *info.ContainerStats, psi *info.PSIStats, psiMetric string) metricValues { + v := make(metricValues, 0, 2) + switch psiMetric { + case "avg10": + v = append(v, metricValue{value: psi.Some.Avg10, timestamp: s.Timestamp, labels: []string{"some"}}) + v = append(v, metricValue{value: psi.Full.Avg10, timestamp: s.Timestamp, labels: []string{"full"}}) + case "avg60": + v = append(v, metricValue{value: psi.Some.Avg60, timestamp: s.Timestamp, labels: []string{"some"}}) + v = append(v, metricValue{value: psi.Full.Avg60, timestamp: s.Timestamp, labels: []string{"full"}}) + case "avg300": + v = append(v, metricValue{value: psi.Some.Avg300, timestamp: s.Timestamp, labels: []string{"some"}}) + v = append(v, metricValue{value: psi.Full.Avg300, timestamp: s.Timestamp, labels: []string{"full"}}) + case "total": + // total is measured as microseconds + v = append(v, metricValue{value: float64(time.Duration(psi.Some.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"some"}}) + v = append(v, metricValue{value: float64(time.Duration(psi.Full.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"full"}}) + } + return v +} diff --git a/metrics/prometheus_fake.go b/metrics/prometheus_fake.go index fd43b78148..675de88ae4 100644 --- a/metrics/prometheus_fake.go +++ b/metrics/prometheus_fake.go @@ -328,6 +328,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req }, LoadAverage: 2, LoadDAverage: 2, + PSI: info.PSIStats{ + Some: info.PSIData{ + Avg10: 0.1, + Avg60: 0.2, + Avg300: 0.3, + Total: 100, + }, + Full: info.PSIData{ + Avg10: 0.4, + Avg60: 0.5, + Avg300: 0.6, + Total: 200, + }, + }, }, Memory: info.MemoryStats{ Usage: 8, @@ -358,6 +372,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req MappedFile: 16, KernelUsage: 17, Swap: 8192, + PSI: info.PSIStats{ + Some: info.PSIData{ + Avg10: 0.01, + Avg60: 0.02, + Avg300: 0.03, + Total: 1000, + }, + Full: info.PSIData{ + Avg10: 0.04, + Avg60: 0.05, + Avg300: 0.06, + Total: 2000, + }, + }, }, Hugetlb: map[string]info.HugetlbStats{ "2Mi": { @@ -550,6 +578,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req "Write": 6, }, }}, + PSI: info.PSIStats{ + Some: info.PSIData{ + Avg10: 0.11, + Avg60: 0.12, + Avg300: 0.13, + Total: 1111, + }, + Full: info.PSIData{ + Avg10: 0.14, + Avg60: 0.15, + Avg300: 0.16, + Total: 2222, + }, 
+ }, }, Filesystem: []info.FsStats{ { diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index a385e50689..f04521b67e 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",container_label_fo # TYPE container_memory_bandwidth_local_bytes gauge container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000 container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000 +# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds +# TYPE container_cpu_psi_avg10_ratio gauge +container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000 +container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000 +# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds +# TYPE container_cpu_psi_avg300_ratio gauge +container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000 +container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000 +# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds +# TYPE container_cpu_psi_avg60_ratio gauge +container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000 +container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000 +# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds. 
+# TYPE container_cpu_psi_total_seconds counter +container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000 +container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 +# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds +# TYPE container_io_psi_avg10_ratio gauge +container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000 +container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000 +# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds +# TYPE container_io_psi_avg300_ratio gauge +container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000 +container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000 +# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds +# TYPE container_io_psi_avg60_ratio gauge +container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000 +container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000 +# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds. 
+# TYPE container_io_psi_total_seconds counter +container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000 +container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000 +# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds +# TYPE container_memory_psi_avg10_ratio gauge +container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000 +container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000 +# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds +# TYPE container_memory_psi_avg300_ratio gauge +container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000 +container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000 +# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds +# TYPE container_memory_psi_avg60_ratio gauge +container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000 +container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000 +# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds. 
+# TYPE container_memory_psi_total_seconds counter +container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000 +container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000 diff --git a/metrics/testdata/prometheus_metrics_whitelist_filtered b/metrics/testdata/prometheus_metrics_whitelist_filtered index 921b2e1106..0f7f23f5ee 100644 --- a/metrics/testdata/prometheus_metrics_whitelist_filtered +++ b/metrics/testdata/prometheus_metrics_whitelist_filtered @@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",id="testcontainer" # TYPE container_memory_bandwidth_local_bytes gauge container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000 container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000 +# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds +# TYPE container_cpu_psi_avg10_ratio gauge +container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000 +container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000 +# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds +# TYPE container_cpu_psi_avg300_ratio gauge +container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000 +container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000 +# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds +# TYPE container_cpu_psi_avg60_ratio gauge +container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000 +container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000 +# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds. 
+# TYPE container_cpu_psi_total_seconds counter +container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000 +container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 +# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds +# TYPE container_io_psi_avg10_ratio gauge +container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000 +container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000 +# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds +# TYPE container_io_psi_avg300_ratio gauge +container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000 +container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000 +# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds +# TYPE container_io_psi_avg60_ratio gauge +container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000 +container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000 +# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds. 
+# TYPE container_io_psi_total_seconds counter +container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000 +container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000 +# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds +# TYPE container_memory_psi_avg10_ratio gauge +container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000 +container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000 +# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds +# TYPE container_memory_psi_avg300_ratio gauge +container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000 +container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000 +# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds +# TYPE container_memory_psi_avg60_ratio gauge +container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000 +container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000 +# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds. +# TYPE container_memory_psi_total_seconds counter +container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000 +container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000 From 103b4be9de8b723e69a51de4f96f7523e18252a3 Mon Sep 17 00:00:00 2001 From: Felix Ehrenpfort Date: Sun, 26 Jan 2025 15:08:44 +0100 Subject: [PATCH 4/7] Add pressure stall information metrics issues: #3052, #3083, kubernetes/enhancements#4205 This change adds metrics for pressure stall information, that indicate why some or all tasks of a cgroupv2 have waited due to resource congestion (cpu, memory, io). The change exposes this information by including the _PSIStats_ of each controller in it's stats, i.e. _CPUStats.PSI_, _MemoryStats.PSI_ and _DiskStats.PSI_. The information is additionally exposed as Prometheus metrics. The metrics follow the naming outlined by the prometheus/node-exporter, where stalled eq full and waiting eq some. 
``` container_pressure_cpu_stalled_seconds_total container_pressure_cpu_waiting_seconds_total container_pressure_memory_stalled_seconds_total container_pressure_memory_waiting_seconds_total container_pressure_io_stalled_seconds_total container_pressure_io_waiting_seconds_total ``` Signed-off-by: Felix Ehrenpfort --- cmd/cadvisor_test.go | 3 +- cmd/go.mod | 1 - container/factory.go | 6 +- container/libcontainer/handler.go | 39 ++++---- container/libcontainer/handler_test.go | 30 +++--- info/v1/container.go | 34 ++++--- metrics/prometheus.go | 98 +++++++------------ metrics/prometheus_fake.go | 50 +++++----- metrics/testdata/prometheus_metrics | 66 ++++--------- .../prometheus_metrics_whitelist_filtered | 66 ++++--------- 10 files changed, 152 insertions(+), 241 deletions(-) diff --git a/cmd/cadvisor_test.go b/cmd/cadvisor_test.go index 7d3c04b9d3..fc3a68966f 100644 --- a/cmd/cadvisor_test.go +++ b/cmd/cadvisor_test.go @@ -112,8 +112,7 @@ func TestToIncludedMetrics(t *testing.T) { container.ResctrlMetrics: struct{}{}, container.CPUSetMetrics: struct{}{}, container.OOMMetrics: struct{}{}, - container.PSITotalMetrics: struct{}{}, - container.PSIAvgMetrics: struct{}{}, + container.PressureMetrics: struct{}{}, }, container.AllMetrics, {}, diff --git a/cmd/go.mod b/cmd/go.mod index b4ee58df2b..41c5cc8ebd 100644 --- a/cmd/go.mod +++ b/cmd/go.mod @@ -130,4 +130,3 @@ require ( gopkg.in/yaml.v3 v3.0.1 // indirect ) -replace github.com/opencontainers/runc => github.com/dqminh/runc v0.0.0-20220513155811-6414629ada8a diff --git a/container/factory.go b/container/factory.go index ca8e80ad38..dfe6de6437 100644 --- a/container/factory.go +++ b/container/factory.go @@ -66,8 +66,7 @@ const ( ResctrlMetrics MetricKind = "resctrl" CPUSetMetrics MetricKind = "cpuset" OOMMetrics MetricKind = "oom_event" - PSITotalMetrics MetricKind = "psi_total" - PSIAvgMetrics MetricKind = "psi_avg" + PressureMetrics MetricKind = "pressure" ) // AllMetrics represents all kinds of metrics that cAdvisor supported. @@ -93,8 +92,7 @@ var AllMetrics = MetricSet{ ResctrlMetrics: struct{}{}, CPUSetMetrics: struct{}{}, OOMMetrics: struct{}{}, - PSITotalMetrics: struct{}{}, - PSIAvgMetrics: struct{}{}, + PressureMetrics: struct{}{}, } // AllNetworkMetrics represents all network metrics that cAdvisor supports. diff --git a/container/libcontainer/handler.go b/container/libcontainer/handler.go index f667313183..ece7559613 100644 --- a/container/libcontainer/handler.go +++ b/container/libcontainer/handler.go @@ -763,20 +763,6 @@ func (h *Handler) GetProcesses() ([]int, error) { return pids, nil } -// Convert libcontainer cgroups.PSIData to info.PSIData -func convertPSIData(from *cgroups.PSIData, to *info.PSIData) { - to.Avg10 = from.Avg10 - to.Avg60 = from.Avg60 - to.Avg300 = from.Avg300 - to.Total = from.Total -} - -// Convert libcontainer cgroups.PSIStats to info.PSIStats -func convertPSI(from *cgroups.PSIStats, to *info.PSIStats) { - convertPSIData(&from.Some, &to.Some) - convertPSIData(&from.Full, &to.Full) -} - // Convert libcontainer stats to info.ContainerStats. 
func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) { ret.Cpu.Usage.User = s.CpuStats.CpuUsage.UsageInUsermode @@ -785,8 +771,7 @@ func setCPUStats(s *cgroups.Stats, ret *info.ContainerStats, withPerCPU bool) { ret.Cpu.CFS.Periods = s.CpuStats.ThrottlingData.Periods ret.Cpu.CFS.ThrottledPeriods = s.CpuStats.ThrottlingData.ThrottledPeriods ret.Cpu.CFS.ThrottledTime = s.CpuStats.ThrottlingData.ThrottledTime - - convertPSI(&s.CpuStats.PSI, &ret.Cpu.PSI) + setPSIStats(s.CpuStats.PSI, &ret.Cpu.PSI) if !withPerCPU { return @@ -808,8 +793,7 @@ func setDiskIoStats(s *cgroups.Stats, ret *info.ContainerStats) { ret.DiskIo.IoWaitTime = diskStatsCopy(s.BlkioStats.IoWaitTimeRecursive) ret.DiskIo.IoMerged = diskStatsCopy(s.BlkioStats.IoMergedRecursive) ret.DiskIo.IoTime = diskStatsCopy(s.BlkioStats.IoTimeRecursive) - - convertPSI(&s.BlkioStats.PSI, &ret.DiskIo.PSI) + setPSIStats(s.BlkioStats.PSI, &ret.DiskIo.PSI) } func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) { @@ -817,8 +801,7 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) { ret.Memory.MaxUsage = s.MemoryStats.Usage.MaxUsage ret.Memory.Failcnt = s.MemoryStats.Usage.Failcnt ret.Memory.KernelUsage = s.MemoryStats.KernelUsage.Usage - - convertPSI(&s.MemoryStats.PSI, &ret.Memory.PSI) + setPSIStats(s.MemoryStats.PSI, &ret.Memory.PSI) if cgroups.IsCgroup2UnifiedMode() { ret.Memory.Cache = s.MemoryStats.Stats["file"] @@ -904,6 +887,22 @@ func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) { } } +func setPSIData(d *cgroups.PSIData, ret *info.PSIData) { + if d != nil { + ret.Total = d.Total + ret.Avg10 = d.Avg10 + ret.Avg60 = d.Avg60 + ret.Avg300 = d.Avg300 + } +} + +func setPSIStats(s *cgroups.PSIStats, ret *info.PSIStats) { + if s != nil { + setPSIData(&s.Full, &ret.Full) + setPSIData(&s.Some, &ret.Some) + } +} + // read from pids path not cpu func setThreadsStats(s *cgroups.Stats, ret *info.ContainerStats) { if s != nil { diff --git a/container/libcontainer/handler_test.go b/container/libcontainer/handler_test.go index 92423b390b..a74fc09831 100644 --- a/container/libcontainer/handler_test.go +++ b/container/libcontainer/handler_test.go @@ -110,17 +110,17 @@ func TestSetCPUStats(t *testing.T) { UsageInKernelmode: 734746 * nanosecondsInSeconds / clockTicks, UsageInUsermode: 2767637 * nanosecondsInSeconds / clockTicks, }, - PSI: cgroups.PSIStats{ - Some: cgroups.PSIData{ - Avg10: 0.1, + PSI: &cgroups.PSIStats{ + Full: cgroups.PSIData{ + Avg10: 0.3, Avg60: 0.2, - Avg300: 0.3, + Avg300: 0.1, Total: 100, }, - Full: cgroups.PSIData{ - Avg10: 0.4, - Avg60: 0.5, - Avg300: 0.6, + Some: cgroups.PSIData{ + Avg10: 0.6, + Avg60: 0.4, + Avg300: 0.2, Total: 200, }, }, @@ -138,16 +138,16 @@ func TestSetCPUStats(t *testing.T) { Total: 33802947350272, }, PSI: info.PSIStats{ - Some: info.PSIData{ - Avg10: 0.1, + Full: info.PSIData{ + Avg10: 0.3, Avg60: 0.2, - Avg300: 0.3, + Avg300: 0.1, Total: 100, }, - Full: info.PSIData{ - Avg10: 0.4, - Avg60: 0.5, - Avg300: 0.6, + Some: info.PSIData{ + Avg10: 0.6, + Avg60: 0.4, + Avg300: 0.2, Total: 200, }, }, diff --git a/info/v1/container.go b/info/v1/container.go index a7cf722a9e..5921783165 100644 --- a/info/v1/container.go +++ b/info/v1/container.go @@ -261,16 +261,24 @@ func (ci *ContainerInfo) StatsEndTime() time.Time { return ret } -type PSIData struct { - Avg10 float64 `json:"avg10"` - Avg60 float64 `json:"avg60"` - Avg300 float64 `json:"avg300"` - Total uint64 `json:"total"` -} - +// PSI statistics for an individual resource. 
type PSIStats struct { - Some PSIData `json:"some,omitempty"` + // PSI data for all tasks of in the cgroup. Full PSIData `json:"full,omitempty"` + // PSI data for some tasks in the cgroup. + Some PSIData `json:"some,omitempty"` +} + +type PSIData struct { + // Total time duration for tasks in the cgroup have waited due to congestion. + // Unit: nanoseconds. + Total uint64 `json:"total"` + // The average (in %) tasks have waited due to congestion over a 10 second window. + Avg10 float64 `json:"avg10"` + // The average (in %) tasks have waited due to congestion over a 60 second window. + Avg60 float64 `json:"avg60"` + // The average (in %) tasks have waited due to congestion over a 300 second window. + Avg300 float64 `json:"avg300"` } // This mirrors kernel internal structure. @@ -346,9 +354,8 @@ type CpuStats struct { // from LoadStats.NrRunning. LoadAverage int32 `json:"load_average"` // from LoadStats.NrUninterruptible - LoadDAverage int32 `json:"load_d_average"` - - PSI PSIStats `json:"psi,omitempty"` + LoadDAverage int32 `json:"load_d_average"` + PSI PSIStats `json:"psi"` } type PerDiskStats struct { @@ -367,8 +374,7 @@ type DiskIoStats struct { IoWaitTime []PerDiskStats `json:"io_wait_time,omitempty"` IoMerged []PerDiskStats `json:"io_merged,omitempty"` IoTime []PerDiskStats `json:"io_time,omitempty"` - - PSI PSIStats `json:"psi,omitempty"` + PSI PSIStats `json:"psi"` } type HugetlbStats struct { @@ -428,7 +434,7 @@ type MemoryStats struct { ContainerData MemoryStatsMemoryData `json:"container_data,omitempty"` HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"` - PSI PSIStats `json:"psi,omitempty"` + PSI PSIStats `json:"psi"` } type CPUSetStats struct { diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 5d796cbf8d..63a64ecfb7 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -1746,64 +1746,54 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri }) } - if includedMetrics.Has(container.PSITotalMetrics) { + if includedMetrics.Has(container.PressureMetrics) { c.containerMetrics = append(c.containerMetrics, []containerMetric{ { - name: "container_cpu_psi_total_seconds", - help: "Total time spent under cpu pressure in seconds.", - valueType: prometheus.CounterValue, - extraLabels: []string{"kind"}, + name: "container_pressure_cpu_stalled_seconds_total", + help: "Total time duration no tasks in the container could make progress due to CPU congestion.", + valueType: prometheus.CounterValue, getValues: func(s *info.ContainerStats) metricValues { - return getPSIValues(s, &s.Cpu.PSI, "total") + return metricValues{{value: float64(s.Cpu.PSI.Full.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} }, }, { - name: "container_memory_psi_total_seconds", - help: "Total container time spent under memory pressure in seconds.", - valueType: prometheus.CounterValue, - extraLabels: []string{"kind"}, + name: "container_pressure_cpu_waiting_seconds_total", + help: "Total time duration tasks in the container have waited due to CPU congestion.", + valueType: prometheus.CounterValue, getValues: func(s *info.ContainerStats) metricValues { - return getPSIValues(s, &s.Memory.PSI, "total") + return metricValues{{value: float64(s.Cpu.PSI.Some.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} }, }, { - name: "container_io_psi_total_seconds", - help: "Total time spent under io pressure in seconds.", - valueType: prometheus.CounterValue, - extraLabels: []string{"kind"}, + name: "container_pressure_memory_stalled_seconds_total", + help: 
"Total time duration no tasks in the container could make progress due to memory congestion.", + valueType: prometheus.CounterValue, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.Memory.PSI.Full.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} + }, + }, { + name: "container_pressure_memory_waiting_seconds_total", + help: "Total time duration tasks in the container have waited due to memory congestion.", + valueType: prometheus.CounterValue, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.Memory.PSI.Some.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} + }, + }, { + name: "container_pressure_io_stalled_seconds_total", + help: "Total time duration no tasks in the container could make progress due to IO congestion.", + valueType: prometheus.CounterValue, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: float64(s.DiskIo.PSI.Full.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} + }, + }, { + name: "container_pressure_io_waiting_seconds_total", + help: "Total time duration tasks in the container have waited due to IO congestion.", + valueType: prometheus.CounterValue, getValues: func(s *info.ContainerStats) metricValues { - return getPSIValues(s, &s.DiskIo.PSI, "total") + return metricValues{{value: float64(s.DiskIo.PSI.Some.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} }, }, }...) } - if includedMetrics.Has(container.PSIAvgMetrics) { - makePSIAvgMetric := func(controller, window string) containerMetric { - return containerMetric{ - name: fmt.Sprintf("container_%s_psi_avg%s_ratio", controller, window), - help: fmt.Sprintf("Ratio of time spent under %s pressure over time window of %s seconds", controller, window), - valueType: prometheus.GaugeValue, - extraLabels: []string{"kind"}, - getValues: func(s *info.ContainerStats) metricValues { - switch controller { - case "cpu": - return getPSIValues(s, &s.Cpu.PSI, "avg"+window) - case "memory": - return getPSIValues(s, &s.Memory.PSI, "avg"+window) - case "io": - return getPSIValues(s, &s.DiskIo.PSI, "avg"+window) - default: - return nil - } - }, - } - } - for _, controller := range []string{"cpu", "memory", "io"} { - for _, window := range []string{"10", "60", "300"} { - c.containerMetrics = append(c.containerMetrics, makePSIAvgMetric(controller, window)) - } - } - } - return c } @@ -2096,23 +2086,3 @@ func getMinCoreScalingRatio(s *info.ContainerStats) metricValues { } return values } - -func getPSIValues(s *info.ContainerStats, psi *info.PSIStats, psiMetric string) metricValues { - v := make(metricValues, 0, 2) - switch psiMetric { - case "avg10": - v = append(v, metricValue{value: psi.Some.Avg10, timestamp: s.Timestamp, labels: []string{"some"}}) - v = append(v, metricValue{value: psi.Full.Avg10, timestamp: s.Timestamp, labels: []string{"full"}}) - case "avg60": - v = append(v, metricValue{value: psi.Some.Avg60, timestamp: s.Timestamp, labels: []string{"some"}}) - v = append(v, metricValue{value: psi.Full.Avg60, timestamp: s.Timestamp, labels: []string{"full"}}) - case "avg300": - v = append(v, metricValue{value: psi.Some.Avg300, timestamp: s.Timestamp, labels: []string{"some"}}) - v = append(v, metricValue{value: psi.Full.Avg300, timestamp: s.Timestamp, labels: []string{"full"}}) - case "total": - // total is measured as microseconds - v = append(v, metricValue{value: float64(time.Duration(psi.Some.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"some"}}) - v 
= append(v, metricValue{value: float64(time.Duration(psi.Full.Total)*time.Microsecond) / float64(time.Second), timestamp: s.Timestamp, labels: []string{"full"}}) - } - return v -} diff --git a/metrics/prometheus_fake.go b/metrics/prometheus_fake.go index 675de88ae4..5e53a8d6de 100644 --- a/metrics/prometheus_fake.go +++ b/metrics/prometheus_fake.go @@ -329,16 +329,16 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req LoadAverage: 2, LoadDAverage: 2, PSI: info.PSIStats{ - Some: info.PSIData{ - Avg10: 0.1, + Full: info.PSIData{ + Avg10: 0.3, Avg60: 0.2, - Avg300: 0.3, + Avg300: 0.1, Total: 100, }, - Full: info.PSIData{ - Avg10: 0.4, - Avg60: 0.5, - Avg300: 0.6, + Some: info.PSIData{ + Avg10: 0.6, + Avg60: 0.4, + Avg300: 0.2, Total: 200, }, }, @@ -373,16 +373,16 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req KernelUsage: 17, Swap: 8192, PSI: info.PSIStats{ - Some: info.PSIData{ - Avg10: 0.01, - Avg60: 0.02, - Avg300: 0.03, + Full: info.PSIData{ + Avg10: 0.3, + Avg60: 0.2, + Avg300: 0.1, Total: 1000, }, - Full: info.PSIData{ - Avg10: 0.04, - Avg60: 0.05, - Avg300: 0.06, + Some: info.PSIData{ + Avg10: 0.6, + Avg60: 0.4, + Avg300: 0.2, Total: 2000, }, }, @@ -579,17 +579,17 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req }, }}, PSI: info.PSIStats{ - Some: info.PSIData{ - Avg10: 0.11, - Avg60: 0.12, - Avg300: 0.13, - Total: 1111, - }, Full: info.PSIData{ - Avg10: 0.14, - Avg60: 0.15, - Avg300: 0.16, - Total: 2222, + Avg10: 0.3, + Avg60: 0.2, + Avg300: 0.1, + Total: 1100, + }, + Some: info.PSIData{ + Avg10: 0.6, + Avg60: 0.4, + Avg300: 0.2, + Total: 2200, }, }, }, diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index f04521b67e..b0dc5c444b 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -381,6 +381,24 @@ container_perf_uncore_events_total{container_env_foo_env="prod",container_label_ # TYPE container_perf_uncore_events_scaling_ratio gauge container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000 +# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion. +# TYPE container_pressure_cpu_stalled_seconds_total counter +container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 +# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion. +# TYPE container_pressure_cpu_waiting_seconds_total counter +container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000 +# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion. 
+# TYPE container_pressure_io_stalled_seconds_total counter +container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000 +# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion. +# TYPE container_pressure_io_waiting_seconds_total counter +container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000 +# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion. +# TYPE container_pressure_memory_stalled_seconds_total counter +container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000 +# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion. +# TYPE container_pressure_memory_waiting_seconds_total counter +container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000 # HELP container_processes Number of processes running inside the container. # TYPE container_processes gauge container_processes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000 @@ -433,51 +451,3 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",container_label_fo # TYPE container_memory_bandwidth_local_bytes gauge container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000 container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000 -# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds -# TYPE container_cpu_psi_avg10_ratio gauge -container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000 -container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000 -# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds -# TYPE container_cpu_psi_avg300_ratio gauge -container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000 -container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000 -# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window 
of 60 seconds -# TYPE container_cpu_psi_avg60_ratio gauge -container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000 -container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000 -# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds. -# TYPE container_cpu_psi_total_seconds counter -container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000 -container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 -# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds -# TYPE container_io_psi_avg10_ratio gauge -container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000 -container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000 -# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds -# TYPE container_io_psi_avg300_ratio gauge -container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000 -container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000 -# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds -# TYPE container_io_psi_avg60_ratio gauge -container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000 -container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000 -# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds. 
-# TYPE container_io_psi_total_seconds counter -container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000 -container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000 -# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds -# TYPE container_memory_psi_avg10_ratio gauge -container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000 -container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000 -# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds -# TYPE container_memory_psi_avg300_ratio gauge -container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000 -container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000 -# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds -# TYPE container_memory_psi_avg60_ratio gauge -container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000 -container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000 -# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds. 
-# TYPE container_memory_psi_total_seconds counter -container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000 -container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000 diff --git a/metrics/testdata/prometheus_metrics_whitelist_filtered b/metrics/testdata/prometheus_metrics_whitelist_filtered index 0f7f23f5ee..8d1999815c 100644 --- a/metrics/testdata/prometheus_metrics_whitelist_filtered +++ b/metrics/testdata/prometheus_metrics_whitelist_filtered @@ -381,6 +381,24 @@ container_perf_uncore_events_total{container_env_foo_env="prod",event="cas_count # TYPE container_perf_uncore_events_scaling_ratio gauge container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000 +# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion. +# TYPE container_pressure_cpu_stalled_seconds_total counter +container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 +# HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion. +# TYPE container_pressure_cpu_waiting_seconds_total counter +container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000 +# HELP container_pressure_io_stalled_seconds_total Total time duration no tasks in the container could make progress due to IO congestion. +# TYPE container_pressure_io_stalled_seconds_total counter +container_pressure_io_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0011 1395066363000 +# HELP container_pressure_io_waiting_seconds_total Total time duration tasks in the container have waited due to IO congestion. +# TYPE container_pressure_io_waiting_seconds_total counter +container_pressure_io_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0022 1395066363000 +# HELP container_pressure_memory_stalled_seconds_total Total time duration no tasks in the container could make progress due to memory congestion. +# TYPE container_pressure_memory_stalled_seconds_total counter +container_pressure_memory_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000 +# HELP container_pressure_memory_waiting_seconds_total Total time duration tasks in the container have waited due to memory congestion. 
+# TYPE container_pressure_memory_waiting_seconds_total counter +container_pressure_memory_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000 # HELP container_processes Number of processes running inside the container. # TYPE container_processes gauge container_processes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1 1395066363000 @@ -433,51 +451,3 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",id="testcontainer" # TYPE container_memory_bandwidth_local_bytes gauge container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000 container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000 -# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds -# TYPE container_cpu_psi_avg10_ratio gauge -container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000 -container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000 -# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds -# TYPE container_cpu_psi_avg300_ratio gauge -container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000 -container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000 -# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds -# TYPE container_cpu_psi_avg60_ratio gauge -container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000 -container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000 -# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds. 
-# TYPE container_cpu_psi_total_seconds counter -container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000 -container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 -# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds -# TYPE container_io_psi_avg10_ratio gauge -container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000 -container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000 -# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds -# TYPE container_io_psi_avg300_ratio gauge -container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000 -container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000 -# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds -# TYPE container_io_psi_avg60_ratio gauge -container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000 -container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000 -# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds. 
-# TYPE container_io_psi_total_seconds counter -container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002222 1395066363000 -container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001111 1395066363000 -# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds -# TYPE container_memory_psi_avg10_ratio gauge -container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000 -container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000 -# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds -# TYPE container_memory_psi_avg300_ratio gauge -container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000 -container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000 -# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds -# TYPE container_memory_psi_avg60_ratio gauge -container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000 -container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000 -# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds. -# TYPE container_memory_psi_total_seconds counter -container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.002 1395066363000 -container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.001 1395066363000 From 94a027c7f3868582bd0e40181373e663ba695b67 Mon Sep 17 00:00:00 2001 From: Felix Ehrenpfort Date: Sun, 26 Jan 2025 20:45:09 +0100 Subject: [PATCH 5/7] Add minor improvements to PSI metrics Signed-off-by: Felix Ehrenpfort --- cmd/go.mod | 1 - metrics/prometheus.go | 22 ++++++++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/cmd/go.mod b/cmd/go.mod index 41c5cc8ebd..690bc52858 100644 --- a/cmd/go.mod +++ b/cmd/go.mod @@ -129,4 +129,3 @@ require ( google.golang.org/protobuf v1.36.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) - diff --git a/metrics/prometheus.go b/metrics/prometheus.go index 63a64ecfb7..ca0681cc3c 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -33,6 +33,11 @@ import ( // asFloat64 converts a uint64 into a float64. func asFloat64(v uint64) float64 { return float64(v) } +// asMicrosecondsToSeconds converts microseconds into a float64 representing seconds. +func asMicrosecondsToSeconds(v uint64) float64 { + return float64(v) / float64(time.Millisecond) +} + // asNanosecondsToSeconds converts nanoseconds into a float64 representing seconds.
func asNanosecondsToSeconds(v uint64) float64 { return float64(v) / float64(time.Second) @@ -1749,46 +1754,39 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri if includedMetrics.Has(container.PressureMetrics) { c.containerMetrics = append(c.containerMetrics, []containerMetric{ { - name: "container_pressure_cpu_stalled_seconds_total", - help: "Total time duration no tasks in the container could make progress due to CPU congestion.", - valueType: prometheus.CounterValue, - getValues: func(s *info.ContainerStats) metricValues { - return metricValues{{value: float64(s.Cpu.PSI.Full.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} - }, - }, { name: "container_pressure_cpu_waiting_seconds_total", help: "Total time duration tasks in the container have waited due to CPU congestion.", valueType: prometheus.CounterValue, getValues: func(s *info.ContainerStats) metricValues { - return metricValues{{value: float64(s.Cpu.PSI.Some.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} + return metricValues{{value: asMicrosecondsToSeconds(s.Cpu.PSI.Some.Total), timestamp: s.Timestamp}} }, }, { name: "container_pressure_memory_stalled_seconds_total", help: "Total time duration no tasks in the container could make progress due to memory congestion.", valueType: prometheus.CounterValue, getValues: func(s *info.ContainerStats) metricValues { - return metricValues{{value: float64(s.Memory.PSI.Full.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} + return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Full.Total), timestamp: s.Timestamp}} }, }, { name: "container_pressure_memory_waiting_seconds_total", help: "Total time duration tasks in the container have waited due to memory congestion.", valueType: prometheus.CounterValue, getValues: func(s *info.ContainerStats) metricValues { - return metricValues{{value: float64(s.Memory.PSI.Some.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} + return metricValues{{value: asMicrosecondsToSeconds(s.Memory.PSI.Some.Total), timestamp: s.Timestamp}} }, }, { name: "container_pressure_io_stalled_seconds_total", help: "Total time duration no tasks in the container could make progress due to IO congestion.", valueType: prometheus.CounterValue, getValues: func(s *info.ContainerStats) metricValues { - return metricValues{{value: float64(s.DiskIo.PSI.Full.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} + return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Full.Total), timestamp: s.Timestamp}} }, }, { name: "container_pressure_io_waiting_seconds_total", help: "Total time duration tasks in the container have waited due to IO congestion.", valueType: prometheus.CounterValue, getValues: func(s *info.ContainerStats) metricValues { - return metricValues{{value: float64(s.DiskIo.PSI.Some.Total) / 1000.0 / 1000.0, timestamp: s.Timestamp}} + return metricValues{{value: asMicrosecondsToSeconds(s.DiskIo.PSI.Some.Total), timestamp: s.Timestamp}} }, }, }...) 
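Note on the unit handling above: kernel PSI "total" counters are reported in microseconds, so the helper introduced in this patch divides by 1e6 (the follow-up patch below only swaps the time.Millisecond constant for the literal). A minimal standalone sketch, not part of the patch series, with purely illustrative values:

package main

import "fmt"

// asMicrosecondsToSeconds mirrors the helper added above: PSI "total" values
// are microsecond counters, so dividing by 1e6 yields seconds for the
// *_seconds_total Prometheus counters.
func asMicrosecondsToSeconds(v uint64) float64 {
	return float64(v) / 1e6
}

func main() {
	// Illustrative values only: 100 µs and 200 µs of accumulated stall time
	// surface as 0.0001 s and 0.0002 s on the exported counters.
	fmt.Println(asMicrosecondsToSeconds(100))
	fmt.Println(asMicrosecondsToSeconds(200))
}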
From e238b0806b47d4caef9654385ba74cfc11b234ec Mon Sep 17 00:00:00 2001 From: Felix Ehrenpfort Date: Sun, 26 Jan 2025 21:15:30 +0100 Subject: [PATCH 6/7] Use 1e6 and 1e9 instead of time constants for conversion Signed-off-by: Felix Ehrenpfort --- metrics/prometheus.go | 4 ++-- metrics/testdata/prometheus_metrics | 3 --- metrics/testdata/prometheus_metrics_whitelist_filtered | 3 --- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index ca0681cc3c..cf93018fea 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -35,12 +35,12 @@ func asFloat64(v uint64) float64 { return float64(v) } // asMicrosecondsToSeconds converts microseconds into a float64 representing seconds. func asMicrosecondsToSeconds(v uint64) float64 { - return float64(v) / float64(time.Millisecond) + return float64(v) / 1e6 } // asNanosecondsToSeconds converts nanoseconds into a float64 representing seconds. func asNanosecondsToSeconds(v uint64) float64 { - return float64(v) / float64(time.Second) + return float64(v) / 1e9 } // fsValues is a helper method for assembling per-filesystem stats. diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index b0dc5c444b..354e4109e7 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -381,9 +381,6 @@ container_perf_uncore_events_total{container_env_foo_env="prod",container_label_ # TYPE container_perf_uncore_events_scaling_ratio gauge container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000 -# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion. -# TYPE container_pressure_cpu_stalled_seconds_total counter -container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 # HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion.
# TYPE container_pressure_cpu_waiting_seconds_total counter container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000 diff --git a/metrics/testdata/prometheus_metrics_whitelist_filtered b/metrics/testdata/prometheus_metrics_whitelist_filtered index 8d1999815c..7c12bd4b72 100644 --- a/metrics/testdata/prometheus_metrics_whitelist_filtered +++ b/metrics/testdata/prometheus_metrics_whitelist_filtered @@ -381,9 +381,6 @@ container_perf_uncore_events_total{container_env_foo_env="prod",event="cas_count # TYPE container_perf_uncore_events_scaling_ratio gauge container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000 -# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion. -# TYPE container_pressure_cpu_stalled_seconds_total counter -container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 # HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion. # TYPE container_pressure_cpu_waiting_seconds_total counter container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000 From 20e5af2c49c44851bdb0cb8856ac765017a76cf0 Mon Sep 17 00:00:00 2001 From: Felix Ehrenpfort Date: Mon, 27 Jan 2025 21:51:19 +0100 Subject: [PATCH 7/7] Expose PSI metric for CPU full Signed-off-by: Felix Ehrenpfort --- metrics/prometheus.go | 7 +++++++ metrics/testdata/prometheus_metrics | 3 +++ metrics/testdata/prometheus_metrics_whitelist_filtered | 3 +++ 3 files changed, 13 insertions(+) diff --git a/metrics/prometheus.go b/metrics/prometheus.go index cf93018fea..aa6d53ceeb 100644 --- a/metrics/prometheus.go +++ b/metrics/prometheus.go @@ -1754,6 +1754,13 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri if includedMetrics.Has(container.PressureMetrics) { c.containerMetrics = append(c.containerMetrics, []containerMetric{ { + name: "container_pressure_cpu_stalled_seconds_total", + help: "Total time duration no tasks in the container could make progress due to CPU congestion.", + valueType: prometheus.CounterValue, + getValues: func(s *info.ContainerStats) metricValues { + return metricValues{{value: asMicrosecondsToSeconds(s.Cpu.PSI.Full.Total), timestamp: s.Timestamp}} + }, + }, { name: "container_pressure_cpu_waiting_seconds_total", help: "Total time duration tasks in the container have waited due to CPU congestion.", valueType: prometheus.CounterValue, diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics index 354e4109e7..b0dc5c444b 100644 --- a/metrics/testdata/prometheus_metrics +++ b/metrics/testdata/prometheus_metrics @@ -381,6 +381,9 @@ container_perf_uncore_events_total{container_env_foo_env="prod",container_label_ # TYPE container_perf_uncore_events_scaling_ratio gauge 
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000 +# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion. +# TYPE container_pressure_cpu_stalled_seconds_total counter +container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 # HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion. # TYPE container_pressure_cpu_waiting_seconds_total counter container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000 diff --git a/metrics/testdata/prometheus_metrics_whitelist_filtered b/metrics/testdata/prometheus_metrics_whitelist_filtered index 7c12bd4b72..8d1999815c 100644 --- a/metrics/testdata/prometheus_metrics_whitelist_filtered +++ b/metrics/testdata/prometheus_metrics_whitelist_filtered @@ -381,6 +381,9 @@ container_perf_uncore_events_total{container_env_foo_env="prod",event="cas_count # TYPE container_perf_uncore_events_scaling_ratio gauge container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000 container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000 +# HELP container_pressure_cpu_stalled_seconds_total Total time duration no tasks in the container could make progress due to CPU congestion. +# TYPE container_pressure_cpu_stalled_seconds_total counter +container_pressure_cpu_stalled_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0001 1395066363000 # HELP container_pressure_cpu_waiting_seconds_total Total time duration tasks in the container have waited due to CPU congestion. # TYPE container_pressure_cpu_waiting_seconds_total counter container_pressure_cpu_waiting_seconds_total{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.0002 1395066363000
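With the averaged *_psi_avg* gauges dropped in favour of the monotonic *_seconds_total counters, a scraper can still recover an average pressure ratio over any window by rating the counter, e.g. rate(container_pressure_cpu_waiting_seconds_total[1m]) in PromQL. A consumer-side sketch in Go with hypothetical sample values (pressureRatio is not part of this series):

package main

import (
	"fmt"
	"time"
)

// pressureRatio is a hypothetical helper: given two samples of a
// container_pressure_*_seconds_total counter taken `elapsed` apart, it returns
// the fraction of wall-clock time the container spent under pressure in that
// window.
func pressureRatio(prevTotal, currTotal float64, elapsed time.Duration) float64 {
	if elapsed <= 0 {
		return 0
	}
	return (currTotal - prevTotal) / elapsed.Seconds()
}

func main() {
	// Hypothetical samples taken 10s apart: 0.5s of additional waiting time
	// over a 10s window is an average pressure ratio of 0.05 (5%).
	fmt.Println(pressureRatio(1.25, 1.75, 10*time.Second))
}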