diff --git a/nodescraper/models/event.py b/nodescraper/models/event.py index 0a759b0..947be3d 100644 --- a/nodescraper/models/event.py +++ b/nodescraper/models/event.py @@ -50,7 +50,7 @@ class Event(BaseModel): timestamp: datetime.datetime = Field( default_factory=lambda: datetime.datetime.now(datetime.timezone.utc) ) - reporter: str = "ERROR_SCRAPER" + reporter: str = "NODE_SCRAPER" category: str description: str data: dict = Field(default_factory=dict) diff --git a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py index 78f84a2..6e79741 100644 --- a/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py +++ b/nodescraper/plugins/inband/dmesg/dmesg_analyzer.py @@ -53,7 +53,7 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]): event_category=EventCategory.SW_DRIVER, ), ErrorRegex( - regex=re.compile(r"[Kk]ernel panic.*"), + regex=re.compile(r"\bkernel panic\b.*", re.IGNORECASE), message="Kernel Panic", event_category=EventCategory.SW_DRIVER, ), @@ -294,6 +294,33 @@ class DmesgAnalyzer(RegexAnalyzer[DmesgData, DmesgAnalyzerArgs]): event_category=EventCategory.SW_DRIVER, event_priority=EventPriority.WARNING, ), + ErrorRegex( + regex=re.compile( + r"(?:\[[^\]]+\]\s*)?LNetError:.*ko2iblnd:\s*No matching interfaces", + re.IGNORECASE, + ), + message="LNet: ko2iblnd has no matching interfaces", + event_category=EventCategory.IO, + event_priority=EventPriority.WARNING, + ), + ErrorRegex( + regex=re.compile( + r"(?:\[[^\]]+\]\s*)?LNetError:\s*.*Error\s*-?\d+\s+starting up LNI\s+\w+", + re.IGNORECASE, + ), + message="LNet: Error starting up LNI", + event_category=EventCategory.IO, + event_priority=EventPriority.WARNING, + ), + ErrorRegex( + regex=re.compile( + r"LustreError:.*ptlrpc_init_portals\(\).*network initiali[sz]ation failed", + re.IGNORECASE, + ), + message="Lustre: network initialisation failed", + event_category=EventCategory.IO, + event_priority=EventPriority.WARNING, + ), ] @classmethod diff --git a/test/unit/plugin/test_dmesg_analyzer.py b/test/unit/plugin/test_dmesg_analyzer.py index f17057c..2a6844f 100644 --- a/test/unit/plugin/test_dmesg_analyzer.py +++ b/test/unit/plugin/test_dmesg_analyzer.py @@ -204,3 +204,33 @@ def test_page_fault(system_info): for event in res.events: assert event.priority == EventPriority.ERROR assert event.description == "amdgpu Page Fault" + + +def test_lnet_and_lustre_boot_errors_are_warning_events(system_info): + dmesg_log = "\n".join( + [ + "[ 548.063411] LNetError: 2719:0:(o2iblnd.c:3327:kiblnd_startup()) ko2iblnd: No matching interfaces", + "[ 548.073737] LNetError: 105-4: Error -100 starting up LNI o2ib", + "[Wed Jun 25 17:19:52 2025] LustreError: 2719:0:(events.c:639:ptlrpc_init_portals()) network initialisation failed", + ] + ) + + analyzer = DmesgAnalyzer( + system_info=system_info, + ) + data = DmesgData(dmesg_content=dmesg_log) + result = analyzer.analyze_data(data, DmesgAnalyzerArgs()) + + by_msg = {e.description: e for e in result.events} + + m1 = "LNet: ko2iblnd has no matching interfaces" + m2 = "LNet: Error starting up LNI" + m3 = "Lustre: network initialisation failed" + + assert m1 in by_msg, f"Missing event: {m1}" + assert m2 in by_msg, f"Missing event: {m2}" + assert m3 in by_msg, f"Missing event: {m3}" + + for m in (m1, m2, m3): + ev = by_msg[m] + assert ev.priority == EventPriority.WARNING, f"{m} should be WARNING"