class FilePaths(SingleBugFeature):
    """Extract file paths (partial and full) from bug data.

    Every whitespace-separated word of the bug summary and of the first
    comment is inspected. Words containing a URL scheme, ``www.``, ``@`` or a
    top-level domain are skipped. In the remaining words the first occurrence
    of a known file extension is located, and the last run of
    ``[a-zA-Z0-9/_]`` characters preceding it is taken as the path.

    For each path found, the result contains every non-empty path component
    and, when the path has more than one component, every non-empty suffix of
    the path, e.g. ``a/b/c.py`` yields ``a``, ``b``, ``c.py``, ``a/b/c.py``,
    ``b/c.py``, ``c.py``. Duplicates are intentionally kept.
    """

    name = "Extract File Paths"

    def __init__(self) -> None:
        """Build the extension and non-file-path keyword regexes."""
        # Substrings that mark a word as a URL or an e-mail address rather
        # than a file path.
        non_file_path_keywords = ["http://", "https://", "www.", "@"]

        valid_extensions = {ext.lstrip(".") for ext in mimetypes.types_map}

        # Lexer filename patterns are globs. Only the "*.<ext>" form names an
        # extension; slicing other globs (e.g. "Makefile") would add
        # meaningless fragments such as "kefile".
        valid_extensions.update(
            pattern[2:]
            for _, _, patterns, *_ in get_all_lexers()
            for pattern in patterns
            if pattern.startswith("*.")
        )
        # An empty alternative would make the regex match any ".".
        valid_extensions.discard("")

        # Alternatives are tried left to right, so put longer extensions
        # first. Joining straight from the set would make the winner between
        # e.g. "m" and "m3u" depend on per-process hash ordering.
        ordered_extensions = sorted(
            valid_extensions, key=lambda ext: (-len(ext), ext)
        )
        extension_pattern_string = "|".join(
            re.escape(ext) for ext in ordered_extensions
        )

        # The negative lookahead rejects an extension that is only the prefix
        # of a longer alphabetic suffix (".c" inside ".cpp").
        self.extension_pattern = re.compile(
            rf"\.({extension_pattern_string})(?![a-zA-Z])"
        )

        psl = PublicSuffixList()
        # Single-label TLDs mark a word as a domain name. TLDs that are also
        # known file extensions are left out so such words stay eligible.
        filtered_tlds = sorted(
            {
                f".{entry}"
                for entry in psl.tlds
                if "." not in entry and entry not in valid_extensions
            }
        )
        non_file_path_keywords.extend(filtered_tlds)

        keyword_pattern_string = "|".join(
            re.escape(keyword) for keyword in non_file_path_keywords
        )
        # Only used through search() as a yes/no test, so no surrounding
        # "\S*" is needed; it would only add quadratic backtracking.
        self.keyword_pattern = re.compile(rf"({keyword_pattern_string})")

    def is_valid_file_path_candidate(self, word: str) -> bool:
        """Return True when *word* contains none of the non-file-path keywords."""
        return self.keyword_pattern.search(word) is None

    def extract_valid_file_path(self, word: str) -> str:
        """Return the file path found in *word*, or "" when there is none."""
        if not self.is_valid_file_path_candidate(word):
            return ""

        match = self.extension_pattern.search(word)
        if match is None:
            return ""

        # Keep only the last run of path characters before the extension,
        # dropping prefixes such as "-fplugin=.." or "stderr:..".
        prefix = word[: match.start()]
        path_runs = re.findall(r"[a-zA-Z0-9/_]+", prefix)
        if not path_runs:
            return ""

        return f"{path_runs[-1]}.{match.group(1)}"

    def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]:
        """Return the path components and path suffixes mentioned in *bug*.

        The text searched is the bug summary followed by the text of the first
        comment; a missing summary, comment list or comment text is treated as
        an empty string.
        """
        comments = bug.get("comments") or []
        first_comment = comments[0].get("text", "") if comments else ""
        text = f"{bug.get('summary', '')} {first_comment}"

        file_paths = [
            path
            for word in text.split()
            if (path := self.extract_valid_file_path(word))
        ]

        all_paths: list[str] = []

        for path in file_paths:
            parts = path.split("/")
            # A leading "/" produces an empty first component; skip it.
            all_paths.extend(part for part in parts if part)
            if len(parts) > 1:
                all_paths.extend(
                    subpath
                    for start in range(len(parts))
                    if (subpath := "/".join(parts[start:]))
                )

        return all_paths
def test_FilePaths(read) -> None:
    """Check FilePaths on a compiler-warning bug and a test-failure log bug."""
    # NOTE(review): `read` is not used in this body; it is kept so the
    # signature is unchanged.
    inline_data = [
        {
            # The summary must mention the file: the first expected entry is
            # a standalone "nsFrame.cpp" that can only come from it.
            "summary": "nsFrame.cpp cleanup",
            "comments": [
                {
                    "text": (
                        "Fix for\n{{ \nanthonyd (2 warnings)\n"
                        "1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n"
                        "\t`nsIFrame*thisBlock' might be used uninitialized in this function\n"
                        "2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n"
                        "\t`nsIFrame*thisBlock' might be used uninitialized in this function\n"
                        "}} (NB: lines should be lines - 2, due to checkin \"in progress\")\n"
                        "will be included."
                    )
                }
            ],
        },
        {
            "summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests",
            "comments": [
                {
                    "text": (
                        'Today I\'m trying to get callgraph stuff hooked into dxr, and I\'m unable to get a working treehydra. '
                        'I\'ve updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn\'t matter. '
                        'Running make check_treehydra fails like this:\n\n'
                        'Test Failure: \n'
                        ' Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus'
                        ' -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n'
                        ' Failure msg: Expected \'locks_bad3.cc:10: error: precondition not met\' in error output; not found. '
                        'stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n'
                        ':0: #0: Error("No case_val in this lazy object")\n'
                        '../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n'
                        '../libs/unstable/esp.js:481: #2: ()\n'
                        './esp_lock.js:41: #3: process_tree([object GCCNode])\n\n'
                        'Test Failure: \n'
                        ' Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus'
                        ' -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n'
                        ' Failure msg: Expected no error output, got error output '
                        ':../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n'
                        ':0: #0: Error("No case_val in this lazy object")\n'
                        '../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n'
                        '../libs/unstable/esp.js:481: #2: ()\n'
                        './esp_lock.js:41: #3: process_tree([object GCCNode])\n\n'
                        'Test Failure: \n'
                        ' Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus'
                        ' -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n'
                        ' Failure msg: Expected no error output, got error output '
                        ':../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n'
                        ':0: #0: Error("No case_val in this lazy object")\n'
                        '../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n'
                        '../libs/unstable/esp.js:481: #2: ()\n'
                        './esp_lock.js:41: #3: process_tree([object GCCNode])\n\n'
                        'Test Failure: \n'
                        ' Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus'
                        ' -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n'
                        ' Failure msg: Expected \'locks_bad4.cc:13: error: precondition not met\' in error output; not found. '
                        'stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n'
                        ':0: #0: Error("No case_val in this lazy object")\n'
                        '../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n'
                        '../libs/unstable/esp.js:481: #2: ()\n'
                        './esp_lock.js:41: #3: process_tree([object GCCNode])\n\n'
                        'Test Failure: \n'
                        ' Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus'
                        ' -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n'
                        ' Failure msg: Expected \'locks_bad2.cc:12: error: precondition not met\' in error output; not found. '
                        'stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n'
                        ':0: #0: Error("No case_val in this lazy object")\n'
                        '../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n'
                        '../libs/unstable/esp.js:481: #2: ()\n'
                        './esp_lock.js:41: #3: process_tree([object GCCNode])\n\n'
                        'Test Failure: \n'
                        ' Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus'
                        ' -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n'
                        ' Failure msg: Expected \'locks_bad1.cc:11: error: precondition not met\' in error output; not found. '
                        'stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n'
                        ':0: #0: Error("No case_val in this lazy object")\n'
                        '../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n'
                        '../libs/unstable/esp.js:481: #2: ()\n'
                        './esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\n'
                        'Unit Test Suite Summary:\n'
                        ' 32 passed\n'
                        ' 6 failed\n'
                        ' 0 error(s)\n'
                        'make[1]: *** [check_treehydra] Error 1\n'
                        'make[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test\'\n'
                        'make: *** [check] Error 2'
                    )
                }
            ],
        },
    ]

    # Expansion of one "layout/html/base/src/nsFrame.cpp" mention: every
    # component, then every suffix of the path.
    ns_frame = [
        "layout",
        "html",
        "base",
        "src",
        "nsFrame.cpp",
        "layout/html/base/src/nsFrame.cpp",
        "html/base/src/nsFrame.cpp",
        "base/src/nsFrame.cpp",
        "src/nsFrame.cpp",
        "nsFrame.cpp",
    ]

    # Expansions of the paths repeated in every "Test Failure" section. The
    # leading "/" variants come from "../" and "./" prefixes, whose dots are
    # dropped while the slash is kept.
    plugin = ["gcc_treehydra.so", "/gcc_treehydra.so", "gcc_treehydra.so"]
    treehydra = [
        "libs",
        "treehydra.js",
        "/libs/treehydra.js",
        "libs/treehydra.js",
        "treehydra.js",
    ]
    esp = [
        "libs",
        "unstable",
        "esp.js",
        "/libs/unstable/esp.js",
        "libs/unstable/esp.js",
        "unstable/esp.js",
        "esp.js",
    ]
    esp_lock = ["esp_lock.js", "/esp_lock.js", "esp_lock.js"]
    # treehydra.js shows up twice in each stack trace.
    stack_trace = treehydra + treehydra + esp + esp_lock

    def failure_section(case: str, cc_mentions: int) -> list[str]:
        # The .cc file is named once on the command line and, for the "bad"
        # cases, once more inside the expected-error message.
        return (
            plugin
            + [f"test_locks_{case}.js"]
            + [f"locks_{case}.cc"] * cc_mentions
            + stack_trace
        )

    expected_results = [
        ["nsFrame.cpp"] + ns_frame + ns_frame,
        failure_section("bad3", 2)
        + failure_section("good", 1)
        + failure_section("good2", 1)
        + failure_section("bad4", 2)
        + failure_section("bad2", 2)
        + failure_section("bad1", 2),
    ]

    feature_extractor = FilePaths()
    results = [feature_extractor(item) for item in inline_data]
    assert results == expected_results