Skip to content

Commit 4c5c036

Browse files
committed
Expose PSI metrics with prometheus
This adds support for reading PSI metrics via Prometheus. We expose the following for `psi_total`:

```
container_cpu_psi_total_seconds
container_memory_psi_total_seconds
container_io_psi_total_seconds
```

And for `psi_avg`:

```
container_cpu_psi_avg10_ratio
container_cpu_psi_avg60_ratio
container_cpu_psi_avg300_ratio
container_memory_psi_avg10_ratio
container_memory_psi_avg60_ratio
container_memory_psi_avg300_ratio
container_io_psi_avg10_ratio
container_io_psi_avg60_ratio
container_io_psi_avg300_ratio
```

Signed-off-by: Daniel Dao <[email protected]>
1 parent ab9bb9e commit 4c5c036

File tree

4 files changed

+215
-0
lines changed

4 files changed

+215
-0
lines changed

metrics/prometheus.go

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1768,6 +1768,64 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
17681768
})
17691769
}
17701770

1771+
if includedMetrics.Has(container.PSITotalMetrics) {
1772+
c.containerMetrics = append(c.containerMetrics, []containerMetric{
1773+
{
1774+
name: "container_cpu_psi_total_seconds",
1775+
help: "Total time spent under cpu pressure in seconds.",
1776+
valueType: prometheus.CounterValue,
1777+
extraLabels: []string{"kind"},
1778+
getValues: func(s *info.ContainerStats) metricValues {
1779+
return getPSIValues(s, &s.Cpu.PSI, "total")
1780+
},
1781+
}, {
1782+
name: "container_memory_psi_total_seconds",
1783+
help: "Total container time spent under memory pressure in seconds.",
1784+
valueType: prometheus.CounterValue,
1785+
extraLabels: []string{"kind"},
1786+
getValues: func(s *info.ContainerStats) metricValues {
1787+
return getPSIValues(s, &s.Memory.PSI, "total")
1788+
},
1789+
}, {
1790+
name: "container_io_psi_total_seconds",
1791+
help: "Total time spent under io pressure in seconds.",
1792+
valueType: prometheus.CounterValue,
1793+
extraLabels: []string{"kind"},
1794+
getValues: func(s *info.ContainerStats) metricValues {
1795+
return getPSIValues(s, &s.DiskIo.PSI, "total")
1796+
},
1797+
},
1798+
}...)
1799+
}
1800+
1801+
if includedMetrics.Has(container.PSIAvgMetrics) {
1802+
makePSIAvgMetric := func(controller, window string) containerMetric {
1803+
return containerMetric{
1804+
name: fmt.Sprintf("container_%s_psi_avg%s_ratio", controller, window),
1805+
help: fmt.Sprintf("Ratio of time spent under %s pressure over time window of %s seconds", controller, window),
1806+
valueType: prometheus.GaugeValue,
1807+
extraLabels: []string{"kind"},
1808+
getValues: func(s *info.ContainerStats) metricValues {
1809+
switch controller {
1810+
case "cpu":
1811+
return getPSIValues(s, &s.Cpu.PSI, "avg"+window)
1812+
case "memory":
1813+
return getPSIValues(s, &s.Memory.PSI, "avg"+window)
1814+
case "io":
1815+
return getPSIValues(s, &s.DiskIo.PSI, "avg"+window)
1816+
default:
1817+
return nil
1818+
}
1819+
},
1820+
}
1821+
}
1822+
for _, controller := range []string{"cpu", "memory", "io"} {
1823+
for _, window := range []string{"10", "60", "300"} {
1824+
c.containerMetrics = append(c.containerMetrics, makePSIAvgMetric(controller, window))
1825+
}
1826+
}
1827+
}
1828+
17711829
return c
17721830
}
17731831

@@ -2060,3 +2118,22 @@ func getMinCoreScalingRatio(s *info.ContainerStats) metricValues {
20602118
}
20612119
return values
20622120
}
2121+
2122+
func getPSIValues(s *info.ContainerStats, psi *info.PSIStats, psiMetric string) metricValues {
2123+
v := make(metricValues, 0, 2)
2124+
switch psiMetric {
2125+
case "avg10":
2126+
v = append(v, metricValue{value: psi.Some.Avg10, timestamp: s.Timestamp, labels: []string{"some"}})
2127+
v = append(v, metricValue{value: psi.Full.Avg10, timestamp: s.Timestamp, labels: []string{"full"}})
2128+
case "avg60":
2129+
v = append(v, metricValue{value: psi.Some.Avg60, timestamp: s.Timestamp, labels: []string{"some"}})
2130+
v = append(v, metricValue{value: psi.Full.Avg60, timestamp: s.Timestamp, labels: []string{"full"}})
2131+
case "avg300":
2132+
v = append(v, metricValue{value: psi.Some.Avg300, timestamp: s.Timestamp, labels: []string{"some"}})
2133+
v = append(v, metricValue{value: psi.Full.Avg300, timestamp: s.Timestamp, labels: []string{"full"}})
2134+
case "total":
2135+
v = append(v, metricValue{value: float64(psi.Some.Total) / float64(1e9), timestamp: s.Timestamp, labels: []string{"some"}})
2136+
v = append(v, metricValue{value: float64(psi.Full.Total) / float64(1e9), timestamp: s.Timestamp, labels: []string{"full"}})
2137+
}
2138+
return v
2139+
}

metrics/prometheus_fake.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
319319
RunPeriods: 984285,
320320
},
321321
LoadAverage: 2,
322+
PSI: info.PSIStats{
323+
Some: info.PSIData{
324+
Avg10: 0.1,
325+
Avg60: 0.2,
326+
Avg300: 0.3,
327+
Total: 100,
328+
},
329+
Full: info.PSIData{
330+
Avg10: 0.4,
331+
Avg60: 0.5,
332+
Avg300: 0.6,
333+
Total: 200,
334+
},
335+
},
322336
},
323337
Memory: info.MemoryStats{
324338
Usage: 8,
@@ -346,6 +360,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
346360
RSS: 15,
347361
MappedFile: 16,
348362
Swap: 8192,
363+
PSI: info.PSIStats{
364+
Some: info.PSIData{
365+
Avg10: 0.01,
366+
Avg60: 0.02,
367+
Avg300: 0.03,
368+
Total: 1000,
369+
},
370+
Full: info.PSIData{
371+
Avg10: 0.04,
372+
Avg60: 0.05,
373+
Avg300: 0.06,
374+
Total: 2000,
375+
},
376+
},
349377
},
350378
Hugetlb: map[string]info.HugetlbStats{
351379
"2Mi": {
@@ -538,6 +566,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
538566
"Write": 6,
539567
},
540568
}},
569+
PSI: info.PSIStats{
570+
Some: info.PSIData{
571+
Avg10: 0.11,
572+
Avg60: 0.12,
573+
Avg300: 0.13,
574+
Total: 1111,
575+
},
576+
Full: info.PSIData{
577+
Avg10: 0.14,
578+
Avg60: 0.15,
579+
Avg300: 0.16,
580+
Total: 2222,
581+
},
582+
},
541583
},
542584
Filesystem: []info.FsStats{
543585
{

metrics/testdata/prometheus_metrics

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",container_label_fo
433433
# TYPE container_memory_bandwidth_local_bytes gauge
434434
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
435435
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
436+
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
437+
# TYPE container_cpu_psi_avg10_ratio gauge
438+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
439+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
440+
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
441+
# TYPE container_cpu_psi_avg300_ratio gauge
442+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
443+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
444+
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
445+
# TYPE container_cpu_psi_avg60_ratio gauge
446+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
447+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
448+
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
449+
# TYPE container_cpu_psi_total_seconds counter
450+
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 2e-07 1395066363000
451+
container_cpu_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 1e-07 1395066363000
452+
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
453+
# TYPE container_io_psi_avg10_ratio gauge
454+
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
455+
container_io_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
456+
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
457+
# TYPE container_io_psi_avg300_ratio gauge
458+
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
459+
container_io_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
460+
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
461+
# TYPE container_io_psi_avg60_ratio gauge
462+
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
463+
container_io_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
464+
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
465+
# TYPE container_io_psi_total_seconds counter
466+
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 2.222e-06 1395066363000
467+
container_io_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 1.111e-06 1395066363000
468+
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
469+
# TYPE container_memory_psi_avg10_ratio gauge
470+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
471+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
472+
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
473+
# TYPE container_memory_psi_avg300_ratio gauge
474+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
475+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
476+
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
477+
# TYPE container_memory_psi_avg60_ratio gauge
478+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
479+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
480+
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
481+
# TYPE container_memory_psi_total_seconds counter
482+
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 2e-06 1395066363000
483+
container_memory_psi_total_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 1e-06 1395066363000

metrics/testdata/prometheus_metrics_whitelist_filtered

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,3 +433,51 @@ container_memory_bandwidth_bytes{container_env_foo_env="prod",id="testcontainer"
433433
# TYPE container_memory_bandwidth_local_bytes gauge
434434
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="0",zone_name="hello"} 2.390393e+06 1395066363000
435435
container_memory_bandwidth_local_bytes{container_env_foo_env="prod",id="testcontainer",image="test",name="testcontaineralias",node_id="1",zone_name="hello"} 1.231233e+06 1395066363000
436+
# HELP container_cpu_psi_avg10_ratio Ratio of time spent under cpu pressure over time window of 10 seconds
437+
# TYPE container_cpu_psi_avg10_ratio gauge
438+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.4 1395066363000
439+
container_cpu_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.1 1395066363000
440+
# HELP container_cpu_psi_avg300_ratio Ratio of time spent under cpu pressure over time window of 300 seconds
441+
# TYPE container_cpu_psi_avg300_ratio gauge
442+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.6 1395066363000
443+
container_cpu_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.3 1395066363000
444+
# HELP container_cpu_psi_avg60_ratio Ratio of time spent under cpu pressure over time window of 60 seconds
445+
# TYPE container_cpu_psi_avg60_ratio gauge
446+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.5 1395066363000
447+
container_cpu_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.2 1395066363000
448+
# HELP container_cpu_psi_total_seconds Total time spent under cpu pressure in seconds.
449+
# TYPE container_cpu_psi_total_seconds counter
450+
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 2e-07 1395066363000
451+
container_cpu_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 1e-07 1395066363000
452+
# HELP container_io_psi_avg10_ratio Ratio of time spent under io pressure over time window of 10 seconds
453+
# TYPE container_io_psi_avg10_ratio gauge
454+
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.14 1395066363000
455+
container_io_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.11 1395066363000
456+
# HELP container_io_psi_avg300_ratio Ratio of time spent under io pressure over time window of 300 seconds
457+
# TYPE container_io_psi_avg300_ratio gauge
458+
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.16 1395066363000
459+
container_io_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.13 1395066363000
460+
# HELP container_io_psi_avg60_ratio Ratio of time spent under io pressure over time window of 60 seconds
461+
# TYPE container_io_psi_avg60_ratio gauge
462+
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.15 1395066363000
463+
container_io_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.12 1395066363000
464+
# HELP container_io_psi_total_seconds Total time spent under io pressure in seconds.
465+
# TYPE container_io_psi_total_seconds counter
466+
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 2.222e-06 1395066363000
467+
container_io_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 1.111e-06 1395066363000
468+
# HELP container_memory_psi_avg10_ratio Ratio of time spent under memory pressure over time window of 10 seconds
469+
# TYPE container_memory_psi_avg10_ratio gauge
470+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.04 1395066363000
471+
container_memory_psi_avg10_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.01 1395066363000
472+
# HELP container_memory_psi_avg300_ratio Ratio of time spent under memory pressure over time window of 300 seconds
473+
# TYPE container_memory_psi_avg300_ratio gauge
474+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.06 1395066363000
475+
container_memory_psi_avg300_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.03 1395066363000
476+
# HELP container_memory_psi_avg60_ratio Ratio of time spent under memory pressure over time window of 60 seconds
477+
# TYPE container_memory_psi_avg60_ratio gauge
478+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 0.05 1395066363000
479+
container_memory_psi_avg60_ratio{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 0.02 1395066363000
480+
# HELP container_memory_psi_total_seconds Total container time spent under memory pressure in seconds.
481+
# TYPE container_memory_psi_total_seconds counter
482+
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="full",name="testcontaineralias",zone_name="hello"} 2e-06 1395066363000
483+
container_memory_psi_total_seconds{container_env_foo_env="prod",id="testcontainer",image="test",kind="some",name="testcontaineralias",zone_name="hello"} 1e-06 1395066363000

0 commit comments

Comments
 (0)