mozilla · benjaminmah · Jun 20, 2024 · Jun 20, 2024 · Jun 25, 2024 · Jun 25, 2024
diff --git a/bugbug/bug_features.py b/bugbug/bug_features.py
@@ -3,6 +3,7 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this file,
 # You can obtain one at http://mozilla.org/MPL/2.0/.
 
+import mimetypes
 import re
 import sys
 from collections import defaultdict
@@ -14,6 +15,8 @@
 from dateutil import parser
 from libmozdata import versions
 from libmozdata.bugzilla import Bugzilla
+from publicsuffix2 import PublicSuffixList
+from pygments.lexers import get_all_lexers
 from sklearn.base import BaseEstimator, TransformerMixin
 
 from bugbug import bug_snapshot, bugzilla, repository, utils
@@ -905,3 +908,122 @@ class BugType(SingleBugFeature):
 
     def __call__(self, bug, **kwargs):
         return bug["type"]
+
+
+class FilePaths(SingleBugFeature):
+    """Extract file paths (partial and full) from bug data.
+
+    Each whitespace-separated word of the bug summary and of the first
+    comment is inspected. Words that look like URLs, e-mail addresses or
+    domain names are skipped; for the others, the text leading up to the
+    first recognised file extension is kept and expanded into its individual
+    components and every suffix sub-path.
+
+    Path text is limited to ASCII letters, digits, "/" and "_", so anything
+    before a "-" or an earlier "." is dropped from the reported path.
+    """
+
+    name = "Extract File Paths"
+
+    # Characters accepted inside the path that precedes the extension.
+    _PATH_SEGMENT_PATTERN = re.compile(r"[a-zA-Z0-9/_]+")
+
+    def __init__(self):
+        non_file_path_keywords = [
+            "http://",
+            "https://",
+            "www.",
+            "@",
+        ]
+
+        valid_extensions = {ext.lstrip(".") for ext in mimetypes.types_map}
+
+        # Pygments filename patterns are globs. Only the plain "*.ext" form
+        # names an extension; slicing two characters off patterns such as
+        # "Makefile" or "*.php[345]" yields fragments that are not extensions.
+        valid_extensions.update(
+            pattern[2:]
+            for (_, _, patterns, *_) in get_all_lexers()
+            for pattern in patterns
+            if pattern.startswith("*.")
+            and len(pattern) > 2
+            and not any(char in "*?[]" for char in pattern[2:])
+        )
+
+        # Set iteration order differs between processes. Trying the longest
+        # alternative first keeps matching reproducible, e.g. "foo.c++" is
+        # never cut down to "foo.c".
+        extension_pattern_string = "|".join(
+            re.escape(ext)
+            for ext in sorted(valid_extensions, key=lambda item: (-len(item), item))
+            if ext
+        )
+
+        # An extension only counts when it is not directly followed by a letter.
+        self.extension_pattern = re.compile(
+            rf"\.({extension_pattern_string})(?![a-zA-Z])"
+        )
+
+        psl = PublicSuffixList()
+        tlds = {f".{entry}" for entry in psl.tlds if "." not in entry}
+
+        # A top-level domain that doubles as a known file extension must not
+        # disqualify a word.
+        filtered_tlds = sorted(
+            tld for tld in tlds if tld[1:] not in valid_extensions
+        )
+        non_file_path_keywords.extend(filtered_tlds)
+
+        keyword_pattern_string = "|".join(
+            re.escape(keyword) for keyword in non_file_path_keywords
+        )
+        self.keyword_pattern = re.compile(rf"\S*({keyword_pattern_string})\S*")
+
+    def is_valid_file_path_candidate(self, word: str) -> bool:
+        """Return True when the word contains none of the non-path keywords."""
+        return not self.keyword_pattern.search(word)
+
+    def extract_valid_file_path(self, word: str) -> str:
+        """Return the path ending at the first known extension, or ""."""
+        if not self.is_valid_file_path_candidate(word):
+            return ""
+
+        match = self.extension_pattern.search(word)
+        if match is None:
+            return ""
+
+        # Keep only the last run of path characters before the extension so
+        # that leading punctuation and option prefixes are dropped.
+        segments = self._PATH_SEGMENT_PATTERN.findall(word[: match.start()])
+        if not segments:
+            return ""
+        return f"{segments[-1]}.{match.group(1)}"
+
+    def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]:
+        """Return every path, path component and suffix sub-path found.
+
+        Results are in order of appearance and duplicates are kept. A bug
+        without comments contributes only its summary.
+        """
+        comments = bug.get("comments") or []
+        first_comment = comments[0].get("text", "") if comments else ""
+        text = f"{bug.get('summary', '')} {first_comment}"
+
+        all_paths: list[str] = []
+
+        for word in text.split():
+            path = self.extract_valid_file_path(word)
+            if not path:
+                continue
+
+            parts = path.split("/")
+            all_paths.extend(part for part in parts if part)
+            if len(parts) > 1:
+                # "a/b/c.py" also yields "a/b/c.py", "b/c.py" and "c.py".
+                all_paths.extend(
+                    subpath
+                    for i in range(len(parts))
+                    if (subpath := "/".join(parts[i:]))
+                )
+
+        return all_paths
diff --git a/bugbug/models/component.py b/bugbug/models/component.py
@@ -84,6 +84,7 @@ def __init__(self, lemmatization=False):
             bug_features.Whiteboard(),
             bug_features.Patches(),
             bug_features.Landings(),
+            bug_features.FilePaths(),
         ]
 
         cleanup_functions = [

diff --git a/requirements.txt b/requirements.txt
@@ -22,6 +22,7 @@ orjson==3.10.9
 ortools==9.11.4210
 pandas==2.2.3
 psutil==6.1.0
+publicsuffix2==2.20191221
 pydriller==1.12
 pyOpenSSL>=0.14  # Could not find a version that satisfies the requirement pyOpenSSL>=0.14; extra == "security" (from requests[security]>=2.7.0->libmozdata==0.1.43)
 python-dateutil==2.9.0.post0

diff --git a/tests/test_bug_features.py b/tests/test_bug_features.py
@@ -17,6 +17,7 @@
     CommentLength,
     Component,
     DeltaNightlyRequestMerge,
+    FilePaths,
     HasCrashSignature,
     HasCVEInAlias,
     HasGithubURL,
@@ -187,3 +188,209 @@ def test_BugTypes(read) -> None:
         BugTypes,
         [["performance"], ["memory"], ["power"], ["security"], ["crash"]],
     )
+
+
+def test_FilePaths(read):  # NOTE(review): `read` is never referenced in the body — presumably a shared fixture; confirm
+    inline_data = [  # each entry mimics a bug: a summary plus one comment text
+        {
+            "summary": "<nsFrame.cpp> cleanup",
+            "comments": [
+                {
+                    "text": "Fix for\n{{ <http://tinderbox.mozilla.org/SeaMonkey/warn1082809200.7591.html>\nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included."
+                }
+            ],
+        },
+        {
+            "summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests",
+            "comments": [
+                {
+                    "text": 'Today I\'m trying to get callgraph stuff hooked into dxr, and I\'m unable to get a working treehydra.  I\'ve updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn\'t matter.  Running make check_treehydra fails like this:\n\nTest Failure: \n    Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n    Failure msg: Expected \'locks_bad3.cc:10: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0:     #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12:        #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481:    #2: ()\n./esp_lock.js:41:       #3: process_tree([object GCCNode])\n\nTest Failure: \n    Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n    Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0:     #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12:        #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481:    #2: ()\n./esp_lock.js:41:       #3: process_tree([object GCCNode])\n\nTest Failure: \n    Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n    Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0:     #0: Error("No case_val in this lazy 
object")\n../libs/treehydra.js:12:        #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481:    #2: ()\n./esp_lock.js:41:       #3: process_tree([object GCCNode])\n\nTest Failure: \n    Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n    Failure msg: Expected \'locks_bad4.cc:13: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0:     #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12:        #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481:    #2: ()\n./esp_lock.js:41:       #3: process_tree([object GCCNode])\n\nTest Failure: \n    Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n    Failure msg: Expected \'locks_bad2.cc:12: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0:     #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12:        #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481:    #2: ()\n./esp_lock.js:41:       #3: process_tree([object GCCNode])\n\nTest Failure: \n    Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n    Failure msg: Expected \'locks_bad1.cc:11: error: precondition not met\' in error output; not found. 
stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0:     #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12:        #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481:    #2: ()\n./esp_lock.js:41:       #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n     32 passed\n      6 failed\n      0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test\'\nmake: *** [check] Error 2'
+                }
+            ],
+        },
+    ]
+    expected_results = [  # one list per entry of inline_data, in extraction order with duplicates kept
+        [
+            "nsFrame.cpp",
+            "layout",
+            "html",
+            "base",
+            "src",
+            "nsFrame.cpp",
+            "layout/html/base/src/nsFrame.cpp",
+            "html/base/src/nsFrame.cpp",
+            "base/src/nsFrame.cpp",
+            "src/nsFrame.cpp",
+            "nsFrame.cpp",
+            "layout",
+            "html",
+            "base",
+            "src",
+            "nsFrame.cpp",
+            "layout/html/base/src/nsFrame.cpp",
+            "html/base/src/nsFrame.cpp",
+            "base/src/nsFrame.cpp",
+            "src/nsFrame.cpp",
+            "nsFrame.cpp",
+        ],
+        [
+            "gcc_treehydra.so",
+            "/gcc_treehydra.so",
+            "gcc_treehydra.so",
+            "test_locks_bad3.js",
+            "locks_bad3.cc",
+            "locks_bad3.cc",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "unstable",
+            "esp.js",
+            "/libs/unstable/esp.js",
+            "libs/unstable/esp.js",
+            "unstable/esp.js",
+            "esp.js",
+            "esp_lock.js",
+            "/esp_lock.js",
+            "esp_lock.js",
+            "gcc_treehydra.so",
+            "/gcc_treehydra.so",
+            "gcc_treehydra.so",
+            "test_locks_good.js",
+            "locks_good.cc",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "unstable",
+            "esp.js",
+            "/libs/unstable/esp.js",
+            "libs/unstable/esp.js",
+            "unstable/esp.js",
+            "esp.js",
+            "esp_lock.js",
+            "/esp_lock.js",
+            "esp_lock.js",
+            "gcc_treehydra.so",
+            "/gcc_treehydra.so",
+            "gcc_treehydra.so",
+            "test_locks_good2.js",
+            "locks_good2.cc",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "unstable",
+            "esp.js",
+            "/libs/unstable/esp.js",
+            "libs/unstable/esp.js",
+            "unstable/esp.js",
+            "esp.js",
+            "esp_lock.js",
+            "/esp_lock.js",
+            "esp_lock.js",
+            "gcc_treehydra.so",
+            "/gcc_treehydra.so",
+            "gcc_treehydra.so",
+            "test_locks_bad4.js",
+            "locks_bad4.cc",
+            "locks_bad4.cc",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "unstable",
+            "esp.js",
+            "/libs/unstable/esp.js",
+            "libs/unstable/esp.js",
+            "unstable/esp.js",
+            "esp.js",
+            "esp_lock.js",
+            "/esp_lock.js",
+            "esp_lock.js",
+            "gcc_treehydra.so",
+            "/gcc_treehydra.so",
+            "gcc_treehydra.so",
+            "test_locks_bad2.js",
+            "locks_bad2.cc",
+            "locks_bad2.cc",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "unstable",
+            "esp.js",
+            "/libs/unstable/esp.js",
+            "libs/unstable/esp.js",
+            "unstable/esp.js",
+            "esp.js",
+            "esp_lock.js",
+            "/esp_lock.js",
+            "esp_lock.js",
+            "gcc_treehydra.so",
+            "/gcc_treehydra.so",
+            "gcc_treehydra.so",
+            "test_locks_bad1.js",
+            "locks_bad1.cc",
+            "locks_bad1.cc",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "treehydra.js",
+            "/libs/treehydra.js",
+            "libs/treehydra.js",
+            "treehydra.js",
+            "libs",
+            "unstable",
+            "esp.js",
+            "/libs/unstable/esp.js",
+            "libs/unstable/esp.js",
+            "unstable/esp.js",
+            "esp.js",
+            "esp_lock.js",
+            "/esp_lock.js",
+            "esp_lock.js",
+        ],
+    ]
+
+    feature_extractor = FilePaths()  # the same extractor instance handles both bugs
+    results = [feature_extractor(item) for item in inline_data]
+    assert results == expected_results  # exact list equality, so order and repetition both matter