Skip to content

Commit db0e35c

Browse files
committed
feat(heuristics): add Whitespace Check to detect excessive spacing and invisible characters
Signed-off-by: Amine <[email protected]>
1 parent 979b05b commit db0e35c

File tree

5 files changed

+179
-0
lines changed

5 files changed

+179
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,3 +181,4 @@ docs/_build
181181
bin/
182182
requirements.txt
183183
.macaron_env_file
184+
**/.DS_Store

src/macaron/malware_analyzer/pypi_heuristics/heuristics.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ class Heuristics(str, Enum):
3737
#: Indicates that the package has an unusually large version number for a single release.
3838
ANOMALOUS_VERSION = "anomalous_version"
3939

40+
#: Indicates that the package's code contains long runs of consecutive whitespace or invisible characters.
41+
WHITE_SPACES = "white_spaces"
42+
4043

4144
class HeuristicResult(str, Enum):
4245
"""Result type indicating the outcome of a heuristic."""
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""This analyzer checks if the package has long runs of white spaces or invisible characters in the code."""
5+
6+
import logging
7+
import re
8+
9+
from macaron.config.defaults import defaults
10+
from macaron.json_tools import JsonType
11+
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
12+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
13+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
14+
15+
logger: logging.Logger = logging.getLogger(__name__)
16+
17+
18+
class WhiteSpacesAnalyzer(BaseHeuristicAnalyzer):
    """Check whether the code has long runs of white spaces or invisible characters.

    A ``.py`` file is flagged when it contains at least ``repeated_spaces_threshold``
    consecutive characters drawn from ``INVISIBLE_CHARS``.
    """

    #: Threshold used when defaults.ini does not provide a usable value.
    DEFAULT_REPEATED_SPACES_THRESHOLD = 50

    INVISIBLE_CHARS = [
        "\u200b",  # Zero width space.
        "\u200c",  # Zero width non-joiner.
        "\u200d",  # Zero width joiner.
        "\ufeff",  # Zero width no-break space (byte order mark).
        "\u200e",  # Left-to-right mark.
        "\u200f",  # Right-to-left mark.
        "\u00a0",  # No-break space.
        "\u00ad",  # Soft hyphen.
        " ",  # Ordinary space.
    ]

    def __init__(self) -> None:
        super().__init__(
            name="white_spaces_analyzer",
            heuristic=Heuristics.WHITE_SPACES,
            depends_on=None,
        )

        self.repeated_spaces_threshold = self._load_defaults()

    def _load_defaults(self) -> int:
        """Load default settings from defaults.ini.

        The ``repeated_spaces_threshold`` option is read from the ``heuristic.pypi``
        section. A value below 1 is rejected because it would produce a regex
        quantifier that either matches every file (``{0,}``) or is not a quantifier
        at all (negative values); the built-in default is used instead.

        Returns
        -------
        int:
            The repeated spaces threshold.
        """
        section_name = "heuristic.pypi"
        default_threshold = self.DEFAULT_REPEATED_SPACES_THRESHOLD
        if not defaults.has_section(section_name):
            return default_threshold

        threshold = defaults[section_name].getint("repeated_spaces_threshold", fallback=default_threshold)
        if threshold < 1:
            logger.warning(
                "Invalid repeated_spaces_threshold %s in section [%s]; falling back to %s.",
                threshold,
                section_name,
                default_threshold,
            )
            return default_threshold

        return threshold

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            The result and related information collected during the analysis.
            ``SKIP`` with no details when the source code is unavailable, ``FAIL`` with
            the name of the first offending ``.py`` file, and ``PASS`` with no details
            otherwise.
        """
        scripts: dict[str, str] | None = pypi_package_json.get_sourcecode()
        if scripts is None:
            return HeuristicResult.SKIP, {}

        for file, content in scripts.items():
            # Only Python sources are inspected; the first hit is enough to fail the heuristic.
            if file.endswith(".py") and self.has_white_spaces(content):
                return HeuristicResult.FAIL, {
                    "file": file,
                }
        return HeuristicResult.PASS, {}

    def has_white_spaces(self, code_string: str) -> bool:
        """Check for excessive or invisible whitespace characters in a code string.

        Parameters
        ----------
        code_string: str
            The code string to check.

        Returns
        -------
        bool:
            True if the string contains a run of at least ``repeated_spaces_threshold``
            consecutive characters from ``INVISIBLE_CHARS``, False otherwise.
        """
        # Escape every character so that regex metacharacters added to INVISIBLE_CHARS
        # in the future cannot change the meaning of the character class.
        char_class = "".join(re.escape(char) for char in self.INVISIBLE_CHARS)
        regex_pattern = f"[{char_class}]{{{self.repeated_spaces_threshold},}}"
        # The re module caches compiled patterns, so this is not recompiled for every file.
        return re.search(regex_pattern, code_string) is not None

src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_absence import WheelAbsenceAnalyzer
2828
from macaron.malware_analyzer.pypi_heuristics.pypi_sourcecode_analyzer import PyPISourcecodeAnalyzer
2929
from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer
30+
from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer
3031
from macaron.slsa_analyzer.analyze_context import AnalyzeContext
3132
from macaron.slsa_analyzer.checks.base_check import BaseCheck
3233
from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType
@@ -332,6 +333,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
332333
SuspiciousSetupAnalyzer,
333334
WheelAbsenceAnalyzer,
334335
AnomalousVersionAnalyzer,
336+
WhiteSpacesAnalyzer,
335337
]
336338

337339
# name used to query the result of all problog rules, so it can be accessed outside the model.
@@ -381,6 +383,10 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
381383
failed({Heuristics.CLOSER_RELEASE_JOIN_DATE.value}),
382384
forceSetup.
383385
386+
% Package released with excessive whitespace in the code .
387+
{Confidence.HIGH.value}::trigger(malware_high_confidence_4) :-
388+
quickUndetailed, forceSetup, failed({Heuristics.WHITE_SPACES.value}).
389+
384390
% Package released recently with little detail, with multiple releases as a trust marker, but frequent and with
385391
% the same code.
386392
{Confidence.MEDIUM.value}::trigger(malware_medium_confidence_1) :-
@@ -401,6 +407,7 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
401407
{problog_result_access} :- trigger(malware_high_confidence_1).
402408
{problog_result_access} :- trigger(malware_high_confidence_2).
403409
{problog_result_access} :- trigger(malware_high_confidence_3).
410+
{problog_result_access} :- trigger(malware_high_confidence_4).
404411
{problog_result_access} :- trigger(malware_medium_confidence_2).
405412
{problog_result_access} :- trigger(malware_medium_confidence_1).
406413
query({problog_result_access}).
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved.
2+
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
3+
4+
"""Tests for the WhiteSpacesAnalyzer heuristic."""
5+
# pylint: disable=redefined-outer-name
6+
7+
8+
from unittest.mock import MagicMock
9+
10+
import pytest
11+
12+
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
13+
from macaron.malware_analyzer.pypi_heuristics.sourcecode.white_spaces import WhiteSpacesAnalyzer
14+
15+
16+
@pytest.fixture()
def analyzer() -> WhiteSpacesAnalyzer:
    """Provide a freshly constructed WhiteSpacesAnalyzer to each test."""
    return WhiteSpacesAnalyzer()
21+
22+
23+
def test_analyze_no_sourcecode(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """The analyzer must skip, with no details, when the package has no source code."""
    pypi_package_json.get_sourcecode.return_value = None

    outcome, details = analyzer.analyze(pypi_package_json)

    assert outcome == HeuristicResult.SKIP
    assert details == {}
29+
30+
31+
def test_analyze_pass(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """The analyzer must pass, with no details, for code without suspicious whitespace."""
    clean_sources = {"test.py": "print('hello')"}
    pypi_package_json.get_sourcecode.return_value = clean_sources

    outcome, details = analyzer.analyze(pypi_package_json)

    assert outcome == HeuristicResult.PASS
    assert details == {}
37+
38+
39+
def test_analyze_fail_long_spaces(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """The analyzer must fail and report the file when a run of spaces exceeds the threshold."""
    # One more space than the configured threshold.
    padding = " " * (analyzer.repeated_spaces_threshold + 1)
    source = "print('hello')\n" + padding + "print('world')"
    pypi_package_json.get_sourcecode.return_value = {"test.py": source}

    outcome, details = analyzer.analyze(pypi_package_json)

    assert outcome == HeuristicResult.FAIL
    assert details["file"] == "test.py"
47+
48+
49+
def test_analyze_fail_invisible_chars(analyzer: WhiteSpacesAnalyzer, pypi_package_json: MagicMock) -> None:
    """The analyzer must fail and report the file when a run of invisible characters is found."""
    zero_width_space = "\u200b"
    # Exactly as many invisible characters as the configured threshold.
    hidden_run = zero_width_space * analyzer.repeated_spaces_threshold
    source = "print('hello')" + hidden_run + "print('world')"
    pypi_package_json.get_sourcecode.return_value = {"test.py": source}

    outcome, details = analyzer.analyze(pypi_package_json)

    assert outcome == HeuristicResult.FAIL
    assert details["file"] == "test.py"
58+
59+
60+
def test_has_white_spaces_long_spaces(analyzer: WhiteSpacesAnalyzer) -> None:
    """has_white_spaces must report a run of spaces whose length equals the threshold."""
    padding = " " * analyzer.repeated_spaces_threshold
    source = "print('hello')\n" + padding + "print('world')"

    assert analyzer.has_white_spaces(source)
65+
66+
67+
def test_has_white_spaces_no_suspicious(analyzer: WhiteSpacesAnalyzer) -> None:
    """has_white_spaces must not report ordinary code without long whitespace runs."""
    source = "\n".join(["print('hello')", "print('world')"])

    assert not analyzer.has_white_spaces(source)

0 commit comments

Comments
 (0)