Add features based on file paths in the title and description #4270
@@ -3,6 +3,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import mimetypes
import re
import sys
from collections import defaultdict

@@ -14,6 +15,8 @@
from dateutil import parser
from libmozdata import versions
from libmozdata.bugzilla import Bugzilla
from publicsuffix2 import PublicSuffixList
from pygments.lexers import get_all_lexers
from sklearn.base import BaseEstimator, TransformerMixin

from bugbug import bug_snapshot, bugzilla, repository, utils

@@ -905,3 +908,80 @@ class BugType(SingleBugFeature):

    def __call__(self, bug, **kwargs):
        return bug["type"]


class FilePaths(SingleBugFeature):
    """Extract file paths (partial and full) from bug data."""

    name = "Extract File Paths"

    def __init__(self):
        non_file_path_keywords = [
            "http://",
            "https://",
            "www.",
            "@",
        ]

        valid_extensions = set(ext.lstrip(".") for ext in mimetypes.types_map.keys())
Review thread on this line:
Reviewer: Why don't we focus only on code-related extensions? Is this improving the results?
Author: Can you elaborate on code-related extensions? I'm essentially extracting any instance of a file path in the title or description.
Reviewer: What value do extensions from […]
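As an illustrative aside on the thread above (not part of the diff): the two extension sources can be compared directly. The snippet below only assumes the standard-library mimetypes module and Pygments, both already used by this patch; the variable names are illustrative.

import mimetypes
from pygments.lexers import get_all_lexers

# Extensions known to the stdlib mimetypes table: mostly media/document
# formats such as "png", "pdf", "zip".
mime_exts = {ext.lstrip(".") for ext in mimetypes.types_map}

# Extensions contributed by Pygments lexers: mostly source-code suffixes
# such as "py", "cpp", "rs" (filename patterns are typically "*.py").
lexer_exts = {ext[2:] for (_, _, exts, *_) in get_all_lexers() for ext in exts}

# Code-oriented suffixes that mimetypes alone would miss.
print(sorted(lexer_exts - mime_exts)[:20])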

        valid_extensions.update(
            ext[2:] for (_, _, exts, *_) in get_all_lexers() for ext in exts
        )

        extension_pattern_string = "|".join(re.escape(ext) for ext in valid_extensions)

        self.extension_pattern = re.compile(
            rf"\.({extension_pattern_string})(?![a-zA-Z])"
        )

        psl = PublicSuffixList()
        tlds = set(f".{entry}" for entry in psl.tlds if "." not in entry)

        filtered_tlds = [tld for tld in tlds if tld[1:] not in valid_extensions]
        non_file_path_keywords.extend(filtered_tlds)

        keyword_pattern_string = "|".join(
            re.escape(keyword) for keyword in non_file_path_keywords
        )
        self.keyword_pattern = re.compile(rf"\S*({keyword_pattern_string})\S*")

    def is_valid_file_path_candidate(self, word: str) -> bool:
        return not self.keyword_pattern.search(word)

    def extract_valid_file_path(self, word: str) -> str:
        if not self.is_valid_file_path_candidate(word):
            return ""

        match = self.extension_pattern.search(word)
        if match:
            ext = match.group(1)
            ext_index = match.start()
            prefix = word[:ext_index]
            alphanumeric_sequence = re.findall(r"[a-zA-Z0-9/_]+", prefix)
            if alphanumeric_sequence:
                return f"{alphanumeric_sequence[-1]}.{ext}"
        return ""

    def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]:
        text = f"{bug.get('summary', '')} {bug['comments'][0]['text']}"

        file_paths = [
            path
            for word in text.split()
            if (path := self.extract_valid_file_path(word))
        ]

        all_paths: list[str] = []

        for path in file_paths:
            parts = path.split("/")
            all_paths.extend(part for part in parts if part)
            if len(parts) > 1:
                all_paths.extend(
                    subpath
                    for i in range(len(parts))
                    if (subpath := "/".join(parts[i:]))
                )

        return all_paths
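For a sense of what the feature produces, here is a minimal usage sketch. The bug dict is fabricated for illustration, and the import path assumes the class lands in bugbug's bug_features module alongside the other feature classes.

from bugbug.bug_features import FilePaths

feature = FilePaths()
bug = {
    "summary": "Crash in dom/media/MediaDecoder.cpp when seeking",
    "comments": [{"text": "Regression from toolkit/components/telemetry/app.js"}],
}
print(feature(bug))
# Each extracted path contributes its individual components plus every
# trailing sub-path, so duplicates like "MediaDecoder.cpp" are expected:
# ['dom', 'media', 'MediaDecoder.cpp',
#  'dom/media/MediaDecoder.cpp', 'media/MediaDecoder.cpp', 'MediaDecoder.cpp',
#  'toolkit', 'components', 'telemetry', 'app.js',
#  'toolkit/components/telemetry/app.js', 'components/telemetry/app.js',
#  'telemetry/app.js', 'app.js']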
@@ -22,6 +22,7 @@ orjson==3.10.16
ortools==9.12.4544
pandas==2.2.3
psutil==7.0.0
publicsuffix2==2.20191221
Review thread on this line:
Reviewer: I'm a bit skeptical about adding a non-actively maintained package.
Author: I understand your point, but in this case I'm only using the library for a list of public suffixes. If we want to avoid using a non-actively maintained package, we could also hard-code it, but I don't think that is any better.
(An illustrative sketch of that alternative follows the diff below.)
pydriller==1.12
pyOpenSSL>=0.14 # Could not find a version that satisfies the requirement pyOpenSSL>=0.14; extra == "security" (from requests[security]>=2.7.0->libmozdata==0.1.43)
python-dateutil==2.9.0.post0
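To make the trade-off in the thread above concrete, here is a minimal sketch of the hard-coding/fetching alternative: the feature only needs the set of single-label public suffixes, which Mozilla publishes at a stable URL. Fetching it at runtime (or vendoring a snapshot) is an assumption for illustration, not what this PR does.

import urllib.request

# Canonical public suffix list maintained by Mozilla.
PSL_URL = "https://publicsuffix.org/list/public_suffix_list.dat"

def fetch_single_label_tlds() -> set[str]:
    with urllib.request.urlopen(PSL_URL) as response:
        text = response.read().decode("utf-8")
    return {
        line.strip()
        for line in text.splitlines()
        # Skip comments, blanks, wildcard/exception rules, and multi-label suffixes.
        if line.strip()
        and not line.startswith("//")
        and "." not in line
        and not line.startswith(("*", "!"))
    }

# These entries could stand in for the `psl.tlds` attribute used in the patch, e.g.:
# tlds = {f".{entry}" for entry in fetch_single_label_tlds()}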