Skip to content

Commit d096a31

Browse files
committed
refactor: similar projects looks at pypi inspector links
Signed-off-by: Carl Flottmann <[email protected]>
1 parent b8347ad commit d096a31

File tree

3 files changed

+140
-236
lines changed

3 files changed

+140
-236
lines changed

src/macaron/malware_analyzer/pypi_heuristics/metadata/similar_projects.py

Lines changed: 106 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,12 @@
44
"""This analyzer checks if the package has a similar structure to other packages maintained by the same user."""
55

66
import hashlib
7-
import io
87
import logging
9-
import tarfile
108

119
from macaron.json_tools import JsonType
1210
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
1311
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
14-
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
15-
from macaron.util import send_get_http, send_get_http_raw
12+
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIInspectorAsset, PyPIPackageJsonAsset
1613

1714
logger: logging.Logger = logging.getLogger(__name__)
1815

@@ -24,20 +21,7 @@ def __init__(self) -> None:
2421
super().__init__(
2522
name="similar_project_analyzer",
2623
heuristic=Heuristics.SIMILAR_PROJECTS,
27-
# TODO: these dependencies are used as this heuristic currently downloads many package sourcecode
28-
# tarballs. Refactoring this heuristic to run more efficiently means this should have depends_on=None.
29-
depends_on=[
30-
(Heuristics.EMPTY_PROJECT_LINK, HeuristicResult.FAIL),
31-
(Heuristics.ONE_RELEASE, HeuristicResult.FAIL),
32-
(Heuristics.HIGH_RELEASE_FREQUENCY, HeuristicResult.FAIL),
33-
(Heuristics.UNCHANGED_RELEASE, HeuristicResult.FAIL),
34-
(Heuristics.CLOSER_RELEASE_JOIN_DATE, HeuristicResult.FAIL),
35-
(Heuristics.SUSPICIOUS_SETUP, HeuristicResult.FAIL),
36-
(Heuristics.WHEEL_ABSENCE, HeuristicResult.FAIL),
37-
(Heuristics.ANOMALOUS_VERSION, HeuristicResult.FAIL),
38-
(Heuristics.TYPOSQUATTING_PRESENCE, HeuristicResult.FAIL),
39-
(Heuristics.FAKE_EMAIL, HeuristicResult.FAIL),
40-
],
24+
depends_on=None,
4125
)
4226

4327
def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
    """Check whether other packages by the same maintainers share this package's file structure.

    The normalized structure of the target package is hashed and compared against the hashed normalized
    structure of every other package owned by each of the target package's maintainers.

    Parameters
    ----------
    pypi_package_json: PyPIPackageJsonAsset
        The PyPI package JSON asset object.

    Returns
    -------
    tuple[HeuristicResult, dict[str, JsonType]]:
        SKIP with empty details if the target structure or the maintainers could not be obtained.
        Otherwise FAIL if at least one package with an identical structure was found and PASS if none
        was, with the matching package names listed under the ``similar_projects`` key.

    Raises
    ------
    HeuristicAnalyzerValueError
        if the analysis fails.
    """

    def structure_hash(structure: set[str]) -> str:
        # Sets have no guaranteed iteration order, so the paths are sorted before joining. Without this,
        # two equal structures could produce different digests and a genuine match would be missed.
        return hashlib.sha256("\n".join(sorted(structure)).encode("utf-8")).hexdigest()

    target_structure = self.get_normalized_structure(pypi_package_json)
    if not target_structure:
        return HeuristicResult.SKIP, {}
    target_hash = structure_hash(target_structure)

    registry = pypi_package_json.pypi_registry
    maintainers = registry.get_maintainers_of_package(pypi_package_json.component_name)
    if not maintainers:
        # NOTE: This would ideally raise an error, identifying malformed package information, but issues with
        # obtaining maintainer information from the HTML page mean this will remain a SKIP for now.
        return HeuristicResult.SKIP, {}

    similar_projects: list[str] = []
    # Seeded with the target itself so it is never compared against its own structure; also prevents
    # re-analyzing a package that is shared between several maintainers.
    analyzed: set[str] = {pypi_package_json.component_name}

    for maintainer in maintainers:
        maintainer_packages = registry.get_packages_by_username(maintainer)
        if not maintainer_packages:
            continue
        for package in maintainer_packages:
            # skip if it is a package we have already analyzed
            if package in analyzed:
                continue
            analyzed.add(package)

            adjacent_pypi_json = PyPIPackageJsonAsset(
                package, None, False, registry, {}, "", PyPIInspectorAsset("", [], {})
            )
            if not adjacent_pypi_json.download(""):
                continue
            structure = self.get_normalized_structure(adjacent_pypi_json)
            if not structure:
                continue

            if structure_hash(structure) == target_hash:
                similar_projects.append(package)

    detail_info: dict = {"similar_projects": similar_projects}
    result = HeuristicResult.FAIL if similar_projects else HeuristicResult.PASS
    return result, detail_info
9489

95-
Returns
96-
-------
97-
str | None:
98-
The URL of the package's sdist or None if not found.
99-
"""
100-
json_url = f"https://pypi.org/pypi/{package_name}/json"
101-
data = send_get_http(json_url, headers={})
102-
if not data:
103-
logger.debug("Failed to fetch package data for %s.", package_name)
104-
return None
105-
106-
sdist = next((url for url in data["urls"] if url["packagetype"] == package_type and url.get("url")), None)
107-
return sdist["url"] if sdist else None
90+
def get_normalized_structure(self, pypi_package_json: PyPIPackageJsonAsset) -> set[str] | None:
    """Extract a normalized structure for a package.

    The normalized structure is the file tree structure of all Python files in the package, with the
    package's name removed, so it is comparable between packages with different names.

    The source distribution listing is tried first. The first wheel listing is used when there is no
    reachable sdist link or the sdist listing could not be retrieved.

    Parameters
    ----------
    pypi_package_json: PyPIPackageJsonAsset
        The PyPI package JSON asset object.

    Returns
    -------
    set[str] | None:
        The normalized structure of file paths in a set, or None if a problem was encountered.
    """
    if not pypi_package_json.get_inspector_links():
        return None

    # The version is needed to strip the "<package_name>-<version>" directory prefix.
    version = pypi_package_json.component_version
    if version is None:
        version = pypi_package_json.get_latest_version()
    if version is None:
        return None

    package_name = pypi_package_json.component_name
    inspector_asset = pypi_package_json.inspector_asset
    prefix = "./" + package_name + "-" + version
    normalized_structure: set[str] = set()

    # Try using the tarball first. A link with no recorded reachability is treated as unreachable
    # instead of raising KeyError.
    tarball_link = inspector_asset.package_sdist_link
    if tarball_link and inspector_asset.package_link_reachability.get(tarball_link):
        # All files are always prefixed with ./<package_name>-<version>/<...> in tarballs;
        # non-metadata files then have <package_name>/.
        structure = PyPIInspectorAsset.get_structure(tarball_link)
        if structure:
            for file_path in structure:
                # We only consider Python files. This avoids considering always package-specific files
                # like PKG_INFO, licenses, build metadata, etc.
                if not file_path.endswith(".py"):
                    continue

                # Remove the "/package_name" from the path as well, that way the structure between two
                # packages with different names will be the same.
                normalized_structure.add(file_path.removeprefix(prefix).removeprefix("/" + package_name))

            # We can't compare against wheel structures if we keep setup.py in there.
            normalized_structure.discard("/setup.py")
            return normalized_structure

    wheel_links = inspector_asset.package_whl_links
    if wheel_links:
        # Wheels have this extra directory for package metadata.
        dist_info_prefix = prefix + ".dist-info/"
        # The structure is generally going to be the same; platform-specific details may vary for
        # packages which have platform-specific wheels, so only the first wheel is inspected.
        structure = PyPIInspectorAsset.get_structure(wheel_links[0])
        if structure:
            for file_path in structure:
                # The .dist-info content is usually metadata.
                if file_path.startswith(dist_info_prefix) or not file_path.endswith(".py"):
                    continue

                # Remove the package name from the path as well, that way the structure between two
                # packages with different names will be the same.
                normalized_structure.add(
                    file_path.removeprefix(package_name + "/").removeprefix("./" + package_name)
                )

            return normalized_structure

    # No usable wheel or tarball listing. Not expected when the inspector links check at the top of
    # this function succeeded.
    return None

src/macaron/slsa_analyzer/package_registry/pypi_registry.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,40 @@ def __bool__(self) -> bool:
499499
return True
500500
return False
501501

502+
@staticmethod
def get_structure(pypi_inspector_url: str) -> list[str] | None:
    """Get the folder structure of a package from the inspector HTML.

    Parameters
    ----------
    pypi_inspector_url: str
        The URL to a pypi inspector package page.

    Returns
    -------
    list[str] | None
        A list containing the ``href`` value of every file link found in the page's first unordered
        list, or None if the page could not be fetched or contains no unordered list.
    """
    # TODO: may have to change this in the asset. Got a client challenge without the "/" appended.
    response = send_get_http_raw(pypi_inspector_url)
    if not response:
        return None

    # Undecodable bytes are replaced rather than raised on, so one bad byte does not abort the lookup.
    html = response.content.decode("utf-8", errors="replace")
    soup = BeautifulSoup(html, "html.parser")
    # The package structure is present on an inspector.pypi.io page inside an unordered list (<ul>). This
    # is the only unordered list on the page.
    if soup.ul is None:
        return None

    # All the file names sit inside <li> elements in our unordered list, as the 'href' attribute of <a> tags.
    files_list = []
    for element in soup.ul.find_all("li"):
        link = element.a
        if link is None:
            continue
        # .get() avoids a KeyError for anchors that carry no href attribute.
        href = link.get("href")
        if href:
            files_list.append(href)

    return files_list
535+
502536

503537
@dataclass
504538
class PyPIPackageJsonAsset:

0 commit comments

Comments
 (0)