Skip to content

Commit 707625d

Browse files
committed
feat(device): implement MSR fallback for CPU power meter
This commit implements the EP-002 MSR fallback power meter enhancement proposal.

Add MSR (Model Specific Register) support as a fallback when the powercap interface is unavailable. This enhancement improves Kepler's compatibility across different systems and kernel configurations.

Key changes:
- Add MSR reader implementation with Intel RAPL register support
- Create raplReader interface abstracting powercap and MSR backends
- Extract existing powercap logic into dedicated reader component
- Enhance RAPL power meter with automatic fallback detection
- Add MSR configuration with security-conscious opt-in defaults
- Implement comprehensive test coverage with mock MSR data

The MSR fallback is disabled by default due to PLATYPUS attack vectors (CVE-2020-8694/8695) and must be explicitly enabled via configuration. When enabled, the system automatically falls back to MSR if powercap is unavailable, maintaining transparent operation.

Signed-off-by: Sunil Thaha <[email protected]>
1 parent 9d8210b commit 707625d

13 files changed

+1646
-950
lines changed

cmd/kepler/main.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,9 +233,17 @@ func createCPUMeter(logger *slog.Logger, cfg *config.Config) (device.CPUPowerMet
233233
logger.Info("rapl zones are filtered", "zones-enabled", cfg.Rapl.Zones)
234234
}
235235

236+
// Convert config MSR settings to device MSRConfig
237+
msrConfig := device.MSRConfig{
238+
Enabled: cfg.MSR.Enabled,
239+
Force: cfg.MSR.Force,
240+
DevicePath: cfg.MSR.DevicePath,
241+
}
242+
236243
return device.NewCPUPowerMeter(
237244
cfg.Host.SysFS,
238245
device.WithRaplLogger(logger),
239246
device.WithZoneFilter(cfg.Rapl.Zones),
247+
device.WithMSRConfig(msrConfig),
240248
)
241249
}

config/config.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,16 @@ type (
3333
Zones []string `yaml:"zones"`
3434
}
3535

36+
// MSR configuration for fallback power reading
37+
MSR struct {
38+
// Enable automatic MSR fallback when powercap unavailable
39+
Enabled *bool `yaml:"enabled"`
40+
// Force MSR usage even if powercap available (testing)
41+
Force *bool `yaml:"force"`
42+
// MSR device path template
43+
DevicePath string `yaml:"devicePath"`
44+
}
45+
3646
// Development mode settings; disabled by default
3747
Dev struct {
3848
FakeCpuMeter struct {
@@ -98,6 +108,7 @@ type (
98108
Host Host `yaml:"host"`
99109
Monitor Monitor `yaml:"monitor"`
100110
Rapl Rapl `yaml:"rapl"`
111+
MSR MSR `yaml:"msr"`
101112
Exporter Exporter `yaml:"exporter"`
102113
Web Web `yaml:"web"`
103114
Debug Debug `yaml:"debug"`
@@ -168,6 +179,12 @@ const (
168179
// RAPL
169180
RaplZones = "rapl.zones" // not a flag
170181

182+
// MSR - NOTE: MSR settings are not exposed as CLI flags per proposal
183+
// They should only be configured via YAML files due to security implications
184+
MSREnabled = "msr.enabled" // not a flag
185+
MSRForce = "msr.force" // not a flag
186+
MSRDevicePath = "msr.devicePath" // not a flag
187+
171188
pprofEnabledFlag = "debug.pprof"
172189

173190
WebConfigFlag = "web.config-file"
@@ -203,6 +220,11 @@ func DefaultConfig() *Config {
203220
Rapl: Rapl{
204221
Zones: []string{},
205222
},
223+
MSR: MSR{
224+
Enabled: ptr.To(false), // Opt-in for security
225+
Force: ptr.To(false),
226+
DevicePath: "/dev/cpu/%d/msr",
227+
},
206228
Monitor: Monitor{
207229
Interval: 5 * time.Second,
208230
Staleness: 500 * time.Millisecond,
@@ -408,6 +430,9 @@ func (c *Config) sanitize() {
408430
c.Rapl.Zones[i] = strings.TrimSpace(c.Rapl.Zones[i])
409431
}
410432

433+
// MSR settings sanitization
434+
c.MSR.DevicePath = strings.TrimSpace(c.MSR.DevicePath)
435+
411436
for i := range c.Exporter.Prometheus.DebugCollectors {
412437
c.Exporter.Prometheus.DebugCollectors[i] = strings.TrimSpace(c.Exporter.Prometheus.DebugCollectors[i])
413438
}
@@ -488,6 +513,16 @@ func (c *Config) Validate(skips ...SkipValidation) error {
488513
errs = append(errs, fmt.Sprintf("invalid monitor min terminated energy threshold: %d can't be negative", c.Monitor.MinTerminatedEnergyThreshold))
489514
}
490515
}
516+
{ // MSR settings
517+
if c.MSR.DevicePath == "" {
518+
errs = append(errs, "MSR device path cannot be empty")
519+
} else {
520+
// Basic validation that device path is a template
521+
if !strings.Contains(c.MSR.DevicePath, "%d") {
522+
errs = append(errs, "MSR device path must contain '%d' placeholder for CPU ID")
523+
}
524+
}
525+
}
491526
{ // Kubernetes
492527
if ptr.Deref(c.Kube.Enabled, false) {
493528
if c.Kube.Config != "" {

internal/device/mock_cpu_power_meter.go

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,6 @@ package device
55

66
// TODO: Move this mock to a separate testutil package
77

8-
import (
9-
"slices"
10-
"testing"
11-
12-
"github.com/prometheus/procfs/sysfs"
13-
"github.com/stretchr/testify/require"
14-
)
15-
168
const (
179
validSysFSPath = "testdata/sys"
1810
badSysFSPath = "testdata/bad_sysfs"
@@ -67,27 +59,3 @@ func (m *MockRaplZone) OnEnergy(j Energy, err error) {
6759
func (m *MockRaplZone) Inc(delta Energy) {
6860
m.energy = (m.energy + delta) % m.maxMicroJoules
6961
}
70-
71-
func validSysFSFixtures(t *testing.T) sysfs.FS {
72-
t.Helper()
73-
fs, err := sysfs.NewFS(validSysFSPath)
74-
require.NoError(t, err, "Failed to create sysfs test FS")
75-
return fs
76-
}
77-
78-
func invalidSysFSFixtures(t *testing.T) sysfs.FS {
79-
t.Helper()
80-
fs, err := sysfs.NewFS(badSysFSPath)
81-
require.NoError(t, err, "Failed to create sysfs test FS")
82-
return fs
83-
}
84-
85-
func sortedZoneNames(zones []EnergyZone) []string {
86-
names := make([]string, len(zones))
87-
for i, zone := range zones {
88-
names[i] = zone.Name()
89-
}
90-
slices.Sort(names)
91-
92-
return names
93-
}

0 commit comments

Comments
 (0)