4
4
"""This analyzer checks if the package has a similar structure to other packages maintained by the same user."""
5
5
6
6
import hashlib
7
- import io
8
7
import logging
9
- import tarfile
10
8
11
9
from macaron .json_tools import JsonType
12
10
from macaron .malware_analyzer .pypi_heuristics .base_analyzer import BaseHeuristicAnalyzer
13
11
from macaron .malware_analyzer .pypi_heuristics .heuristics import HeuristicResult , Heuristics
14
- from macaron .slsa_analyzer .package_registry .pypi_registry import PyPIPackageJsonAsset
15
- from macaron .util import send_get_http , send_get_http_raw
12
+ from macaron .slsa_analyzer .package_registry .pypi_registry import PyPIInspectorAsset , PyPIPackageJsonAsset
16
13
17
14
logger : logging .Logger = logging .getLogger (__name__ )
18
15
@@ -24,20 +21,7 @@ def __init__(self) -> None:
24
21
super ().__init__ (
25
22
name = "similar_project_analyzer" ,
26
23
heuristic = Heuristics .SIMILAR_PROJECTS ,
27
- # TODO: these dependencies are used as this heuristic currently downloads many package sourcecode
28
- # tarballs. Refactoring this heuristic to run more efficiently means this should have depends_on=None.
29
- depends_on = [
30
- (Heuristics .EMPTY_PROJECT_LINK , HeuristicResult .FAIL ),
31
- (Heuristics .ONE_RELEASE , HeuristicResult .FAIL ),
32
- (Heuristics .HIGH_RELEASE_FREQUENCY , HeuristicResult .FAIL ),
33
- (Heuristics .UNCHANGED_RELEASE , HeuristicResult .FAIL ),
34
- (Heuristics .CLOSER_RELEASE_JOIN_DATE , HeuristicResult .FAIL ),
35
- (Heuristics .SUSPICIOUS_SETUP , HeuristicResult .FAIL ),
36
- (Heuristics .WHEEL_ABSENCE , HeuristicResult .FAIL ),
37
- (Heuristics .ANOMALOUS_VERSION , HeuristicResult .FAIL ),
38
- (Heuristics .TYPOSQUATTING_PRESENCE , HeuristicResult .FAIL ),
39
- (Heuristics .FAKE_EMAIL , HeuristicResult .FAIL ),
40
- ],
24
+ depends_on = None ,
41
25
)
42
26
43
27
def analyze (self , pypi_package_json : PyPIPackageJsonAsset ) -> tuple [HeuristicResult , dict [str , JsonType ]]:
@@ -58,112 +42,127 @@ def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicRes
58
42
HeuristicAnalyzerValueError
59
43
if the analysis fails.
60
44
"""
61
- package_name = pypi_package_json .component_name
62
- target_hash = self .get_structure_hash (package_name )
63
- if not target_hash :
45
+ target_structure = self .get_normalized_structure (pypi_package_json )
46
+ if not target_structure :
64
47
return HeuristicResult .SKIP , {}
48
+ target_hash = hashlib .sha256 ("\n " .join (target_structure ).encode ("utf-8" )).hexdigest ()
49
+ detail_info : dict = {}
50
+ similar_projects : list [str ] = []
51
+ result : HeuristicResult = HeuristicResult .PASS
52
+
53
+ maintainers = pypi_package_json .pypi_registry .get_maintainers_of_package (pypi_package_json .component_name )
54
+ if not maintainers :
55
+ # NOTE: This would ideally raise an error, identifying malformed package information, but issues with
56
+ # obtaining maintainer information from the HTML page mean this will remain a SKIP for now.
57
+ return HeuristicResult .SKIP , {}
58
+
59
+ analyzed : set [str ] = {pypi_package_json .component_name }
65
60
66
- maintainers = pypi_package_json .pypi_registry .get_maintainers_of_package (package_name )
67
- if maintainers :
68
- for maintainer in maintainers :
69
- maintainer_packages = pypi_package_json .pypi_registry .get_packages_by_username (maintainer )
70
- if not maintainer_packages :
61
+ for maintainer in maintainers :
62
+ maintainer_packages = pypi_package_json .pypi_registry .get_packages_by_username (maintainer )
63
+ if not maintainer_packages :
64
+ continue
65
+ for package in maintainer_packages :
66
+ # skip if it is a package we have already analyzed
67
+ if package in analyzed :
71
68
continue
72
- for package in maintainer_packages :
73
- if package == package_name :
74
- continue
69
+ analyzed .add (package )
75
70
76
- hash_value = self .get_structure_hash (package )
77
- if target_hash == hash_value :
78
- return HeuristicResult .FAIL , {
79
- "message" : f"The package { package_name } has a similar structure to { package } ." ,
80
- "similar_package" : package ,
81
- }
71
+ adjacent_pypi_json = PyPIPackageJsonAsset (
72
+ package , None , False , pypi_package_json .pypi_registry , {}, "" , PyPIInspectorAsset ("" , [], {})
73
+ )
74
+ if not adjacent_pypi_json .download ("" ):
75
+ continue
76
+ structure = self .get_normalized_structure (adjacent_pypi_json )
77
+ if not structure :
78
+ continue
82
79
83
- return HeuristicResult .PASS , {}
80
+ hash_value = hashlib .sha256 ("\n " .join (structure ).encode ("utf-8" )).hexdigest ()
81
+ if target_hash == hash_value :
82
+ similar_projects .append (package )
84
83
85
- def get_url (self , package_name : str , package_type : str = "sdist" ) -> str | None :
86
- """Get the URL of the package's sdist.
84
+ detail_info ["similar_projects" ] = similar_projects
85
+ if similar_projects :
86
+ result = HeuristicResult .FAIL
87
87
88
- Parameters
89
- ----------
90
- package_name : str
91
- The name of the package.
92
- package_type: str
93
- The package type to retrieve the URL of.
88
+ return result , detail_info
94
89
95
- Returns
96
- -------
97
- str | None:
98
- The URL of the package's sdist or None if not found.
99
- """
100
- json_url = f"https://pypi.org/pypi/{ package_name } /json"
101
- data = send_get_http (json_url , headers = {})
102
- if not data :
103
- logger .debug ("Failed to fetch package data for %s." , package_name )
104
- return None
105
-
106
- sdist = next ((url for url in data ["urls" ] if url ["packagetype" ] == package_type and url .get ("url" )), None )
107
- return sdist ["url" ] if sdist else None
90
+ def get_normalized_structure (self , pypi_package_json : PyPIPackageJsonAsset ) -> set [str ] | None :
91
+ """Extract a normalized structure for a package.
108
92
109
- def get_structure ( self , package_name : str ) -> list [ str ]:
110
- """Get the file structure of the package's sdist .
93
+ The normalized structure is the file tree structure of all Python files in the package, with the package's
94
+ name removed, so it is comparable .
111
95
112
96
Parameters
113
97
----------
114
- package_name : str
115
- The name of the package .
98
+ pypi_package_json: PyPIPackageJsonAsset
99
+ The PyPI package JSON asset object .
116
100
117
101
Returns
118
102
-------
119
- list [str]:
120
- The list of files in the package's sdist .
103
+ set [str] | None :
104
+ The normalized structure of file paths in a set, or None if a problem was encountered .
121
105
"""
122
- # TODO: We should not download the source distributions for every package.
123
- # This is very inefficient. We should find a different way to extract the package
124
- # structure, e.g., the inspector service?
125
- sdist_url = self .get_url (package_name )
126
- if not sdist_url :
127
- logger .debug ("Package %s does not have a sdist." , package_name )
128
- return []
129
-
130
- response = send_get_http_raw (sdist_url )
131
- if not response :
132
- logger .debug ("Failed to download sdist for package %s." , package_name )
133
- return []
134
-
135
- buffer = io .BytesIO (response .content )
136
- try :
137
- with tarfile .open (fileobj = buffer , mode = "r:gz" ) as tf :
138
- members = [
139
- member .name
140
- for member in tf .getmembers ()
141
- if member .name and not member .name .startswith ("PAXHeaders/" )
142
- ]
143
- except (tarfile .TarError , OSError ) as error :
144
- logger .debug ("Error reading source code tar file: %s" , error )
145
- return []
146
-
147
- return members
148
-
149
- def get_structure_hash (self , package_name : str ) -> str :
150
- """Get the hash of the package's file structure.
106
+ if not pypi_package_json .get_inspector_links ():
107
+ return None
151
108
152
- Parameters
153
- ----------
154
- package_name : str
155
- The name of the package.
109
+ # for normalizing the structure
110
+ version = pypi_package_json .component_version
111
+ if version is None :
112
+ version = pypi_package_json .get_latest_version ()
113
+ if version is None :
114
+ return None
156
115
157
- Returns
158
- -------
159
- str:
160
- The hash of the package's file structure.
161
- """
162
- structure = self .get_structure (package_name )
163
- if not structure :
164
- return ""
116
+ prefix = "./" + pypi_package_json .component_name + "-" + version
117
+ normalized_structure = set ()
118
+
119
+ # try using the tarball first
120
+ tarball_link = pypi_package_json .inspector_asset .package_sdist_link
121
+ if tarball_link and pypi_package_json .inspector_asset .package_link_reachability [tarball_link ]:
122
+ # all files are always prefixed with ./<package_name>-<version>/<...> in tarballs
123
+ # non-metadata files then have <package_name>/
124
+ # prefix += "/" + pypi_package_json.component_name + "/"
125
+ structure = PyPIInspectorAsset .get_structure (tarball_link )
126
+ if structure :
127
+ for file_path in structure :
128
+ # we only consider python files. This avoids considering files that are always package-specific, like PKG_INFO, licenses,
129
+ # build metadata, etc.
130
+ if file_path [- 3 :] != ".py" :
131
+ continue
132
+
133
+ # remove the "/package_name" from the prefix as well, that way the structure between two packages with different
134
+ # names will be the same
135
+ normalized_structure .add (
136
+ file_path .removeprefix (prefix ).removeprefix ("/" + pypi_package_json .component_name )
137
+ )
138
+
139
+ # We can't compare against wheel structures if we keep setup.py in there
140
+ normalized_structure .discard ("/setup.py" )
141
+ return normalized_structure
142
+
143
+ wheel_links = pypi_package_json .inspector_asset .package_whl_links
144
+ if len (wheel_links ) > 0 :
145
+ # wheels have this extra field for package metadata
146
+ prefix += ".dist-info/"
147
+ # structure is generally going to be the same, platform-specific details may vary for packages
148
+ # which have platform-specific wheels
149
+ structure = PyPIInspectorAsset .get_structure (wheel_links [0 ])
150
+ if structure :
151
+ for file_path in structure :
152
+ # the .dist-info stuff is usually metadata
153
+ if file_path .startswith (prefix ) or file_path [- 3 :] != ".py" :
154
+ continue
155
+
156
+ # remove the "./package_name" from the prefix as well, that way the structure between
157
+ # two packages with different names will be the same
158
+ normalized_structure .add (
159
+ file_path .removeprefix (pypi_package_json .component_name + "/" ).removeprefix (
160
+ "./" + pypi_package_json .component_name
161
+ )
162
+ )
165
163
166
- normalized = sorted ([ p . replace ( package_name , "<ROOT>" ) for p in structure ])
164
+ return normalized_structure
167
165
168
- joined = "\n " .join (normalized ).encode ("utf-8" )
169
- return hashlib .sha256 (joined ).hexdigest ()
166
+ # the package has neither a tarball link nor any wheel links, so we shouldn't get here if the first line of this
167
+ # function worked.
168
+ return None
0 commit comments