Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add features based on file paths in the title and description #4270

Open
wants to merge 41 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
9adba3f
Added file path feature extraction
benjaminmah Jun 20, 2024
4a43181
Improved regex for splitting filepaths
benjaminmah Jun 20, 2024
bdbf73d
Moved `/` and `.` check into regex
benjaminmah Jun 25, 2024
9ce9d76
Moved regex initialization to the constructor
benjaminmah Jun 25, 2024
802c9bd
Compiled regex using `re.compile` and move to constructor
benjaminmah Jul 2, 2024
165e68a
Renamed `ExtractFilePaths` to `FilePaths`
benjaminmah Jul 18, 2024
49601eb
Removed temporary list creation in `FilePaths` feature
benjaminmah Jul 18, 2024
9166307
Fixed `FilePaths` feature to accurately extract file paths and avoid …
benjaminmah Jul 18, 2024
8250977
Revised version to extract only file paths with valid file extensions
benjaminmah Jul 19, 2024
d1eb019
Initialized and compiled regex in compiler
benjaminmah Jul 22, 2024
f0f1118
Made code more Pythonic
benjaminmah Jul 24, 2024
fad7dae
Added 2 tests for `FilePaths` feature
benjaminmah Jul 24, 2024
fdd6123
Restructured `file_paths.json`
benjaminmah Jul 29, 2024
82a038a
Replaced hard-coding programming language extensions with `pygment.le…
benjaminmah Sep 4, 2024
bfd6334
Fixed tests to reflect more file extensions
benjaminmah Sep 4, 2024
5f4ec72
Added `publicsuffix2` to generate list of tlds
benjaminmah Sep 4, 2024
1f3921b
Replaced all addition strings with f-strings
benjaminmah Oct 18, 2024
0cf2482
Removed fixture from file path test
benjaminmah Oct 18, 2024
deadc18
Fixed test errors
benjaminmah Oct 18, 2024
a3f0ede
Added custom delimiter
benjaminmah Oct 18, 2024
a96a1e2
Fixed json input
benjaminmah Oct 18, 2024
176079c
Deleted fixture for file paths
benjaminmah Oct 18, 2024
19a289c
Pre-compile regex
benjaminmah Oct 18, 2024
4fd1f04
Removed comment
benjaminmah Oct 18, 2024
0d8d9dd
Changed default value of `inline_data` to `None`
benjaminmah Oct 21, 2024
c28ad97
Removed inline data boolean
benjaminmah Oct 21, 2024
24a5375
Removed `readlines()`
benjaminmah Oct 21, 2024
2bcfb18
Converted results into a list
benjaminmah Oct 21, 2024
9bed4a1
Moved FilePaths test to function
benjaminmah Oct 21, 2024
e76c0d0
Fixed indentation
benjaminmah Oct 21, 2024
1c00a10
Fixed assertion
benjaminmah Oct 21, 2024
f2a9d39
Changed `valid_extensions` to a local variable instead of an attribute
benjaminmah Oct 23, 2024
b677ccb
Converted `non_file_path_keywords` from attribute to local variable
benjaminmah Oct 23, 2024
70f72f5
Added comment explaining sorting `valid_extensions`
benjaminmah Oct 23, 2024
836d42d
Removed deletion of URLs from string
benjaminmah Oct 23, 2024
d6f8002
Removed sorting (test)
benjaminmah Oct 25, 2024
e11be5b
Removed sorting comment
benjaminmah Oct 25, 2024
38432cf
Simplified updating valid extensions set with lexers
benjaminmah Oct 25, 2024
9373c0b
Fixed ValueError
benjaminmah Oct 25, 2024
2022eb4
Fixed ValueError
benjaminmah Oct 25, 2024
64e5c3b
Removed tracking
benjaminmah Oct 28, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 80 additions & 0 deletions bugbug/bug_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import mimetypes
import re
import sys
from collections import defaultdict
Expand All @@ -14,6 +15,8 @@
from dateutil import parser
from libmozdata import versions
from libmozdata.bugzilla import Bugzilla
from publicsuffix2 import PublicSuffixList
from pygments.lexers import get_all_lexers
from sklearn.base import BaseEstimator, TransformerMixin

from bugbug import bug_snapshot, bugzilla, repository, utils
Expand Down Expand Up @@ -905,3 +908,80 @@ class BugType(SingleBugFeature):

def __call__(self, bug, **kwargs):
return bug["type"]


class FilePaths(SingleBugFeature):
benjaminmah marked this conversation as resolved.
Show resolved Hide resolved
"""Extract file paths (partial and full) from bug data."""

name = "Extract File Paths"

def __init__(self):
non_file_path_keywords = [
"http://",
"https://",
"www.",
"@",
]

valid_extensions = set(ext.lstrip(".") for ext in mimetypes.types_map.keys())
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't we focus only on code-related extensions? Is this improving the results?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you elaborate on code-related extensions? I'm essentially extracting any instance of a file path in the title or description.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What value do extensions from mime-types add over using extensions from lexers only?


valid_extensions.update(
ext[2:] for (_, _, exts, *_) in get_all_lexers() for ext in exts
)

extension_pattern_string = "|".join(re.escape(ext) for ext in valid_extensions)

self.extension_pattern = re.compile(
rf"\.({extension_pattern_string})(?![a-zA-Z])"
)

psl = PublicSuffixList()
tlds = set(f".{entry}" for entry in psl.tlds if "." not in entry)

filtered_tlds = [tld for tld in tlds if tld[1:] not in valid_extensions]
non_file_path_keywords.extend(filtered_tlds)

keyword_pattern_string = "|".join(
re.escape(keyword) for keyword in non_file_path_keywords
)
self.keyword_pattern = re.compile(rf"\S*({keyword_pattern_string})\S*")

def is_valid_file_path_candidate(self, word: str) -> bool:
    """Tell whether ``word`` may contain a file path.

    A word is rejected as soon as ``self.keyword_pattern`` (the
    non-file-path keyword pattern compiled in ``__init__``) matches
    anywhere inside it.

    Args:
        word: A single whitespace-delimited token.

    Returns:
        ``True`` when the keyword pattern does not match, ``False`` otherwise.
    """
    return self.keyword_pattern.search(word) is None

def extract_valid_file_path(self, word: str) -> str:
    """Return the file path embedded in ``word``, or ``""`` if there is none.

    A word rejected by ``is_valid_file_path_candidate`` yields ``""``.
    Otherwise the first match of ``self.extension_pattern`` marks where the
    path ends. The path stem is the last run of ``[a-zA-Z0-9/_]`` characters
    found in the text preceding that extension, and the result is that stem
    joined to the matched extension with a dot.

    Args:
        word: A single whitespace-delimited token.

    Returns:
        The extracted path, or an empty string when the word is rejected,
        has no recognised extension, or has no usable stem.
    """
    if not self.is_valid_file_path_candidate(word):
        return ""

    found = self.extension_pattern.search(word)
    if found is None:
        return ""

    # Text in front of the dot that introduces the matched extension.
    leading_text = word[: found.start()]
    runs = re.findall(r"[a-zA-Z0-9/_]+", leading_text)
    if not runs:
        return ""

    # Only the last run is kept, which drops surrounding punctuation such
    # as "<", "-fplugin=" or a leading "..".
    return f"{runs[-1]}.{found.group(1)}"

def __call__(self, bug: bugzilla.BugDict, **kwargs) -> list[str]:
text = f"{bug.get('summary', '')} {bug['comments'][0]['text']}"

file_paths = [
path
for word in text.split()
if (path := self.extract_valid_file_path(word))
]

all_paths: list[str] = []

for path in file_paths:
parts = path.split("/")
all_paths.extend(part for part in parts if part)
if len(parts) > 1:
all_paths.extend(
subpath
for i in range(len(parts))
if (subpath := "/".join(parts[i:]))
)

return all_paths
1 change: 1 addition & 0 deletions bugbug/models/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def __init__(self, lemmatization=False):
bug_features.Whiteboard(),
bug_features.Patches(),
bug_features.Landings(),
bug_features.FilePaths(),
]

cleanup_functions = [
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ orjson==3.10.9
ortools==9.11.4210
pandas==2.2.3
psutil==6.1.0
publicsuffix2==2.20191221
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a bit skeptical about adding a non-actively maintained package.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand your point, but in this case I'm only using the library for a list of public suffixes. If we want to avoid using a non-actively maintained package, we could also hard-code it, but I don't think that is any better.

pydriller==1.12
pyOpenSSL>=0.14 # Could not find a version that satisfies the requirement pyOpenSSL>=0.14; extra == "security" (from requests[security]>=2.7.0->libmozdata==0.1.43)
python-dateutil==2.9.0.post0
Expand Down
207 changes: 207 additions & 0 deletions tests/test_bug_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
CommentLength,
Component,
DeltaNightlyRequestMerge,
FilePaths,
HasCrashSignature,
HasCVEInAlias,
HasGithubURL,
Expand Down Expand Up @@ -187,3 +188,209 @@ def test_BugTypes(read) -> None:
BugTypes,
[["performance"], ["memory"], ["power"], ["security"], ["crash"]],
)


def test_FilePaths(read):
inline_data = [
suhaibmujahid marked this conversation as resolved.
Show resolved Hide resolved
{
"summary": "<nsFrame.cpp> cleanup",
"comments": [
{
"text": "Fix for\n{{ <http://tinderbox.mozilla.org/SeaMonkey/warn1082809200.7591.html>\nanthonyd (2 warnings)\n1.\tlayout/html/base/src/nsFrame.cpp:3879 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n2.\tlayout/html/base/src/nsFrame.cpp:3908 (See build log excerpt)\n\t`nsIFrame*thisBlock' might be used uninitialized in this function\n}} (NB: lines should be lines - 2, due to checkin \"in progress\")\nwill be included."
}
],
},
{
"summary": "Spidermonkey regression causes treehydra trunk to fail 6 tests",
"comments": [
{
"text": 'Today I\'m trying to get callgraph stuff hooked into dxr, and I\'m unable to get a working treehydra. I\'ve updated tried updating just dehydra, then I updated gcc w/plugins using the new stuff in the patch queue, and it doesn\'t matter. Running make check_treehydra fails like this:\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad3.js locks_bad3.cc\n Failure msg: Expected \'locks_bad3.cc:10: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good.js locks_good.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_good2.js locks_good2.cc\n Failure msg: Expected no error output, got error output :../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: 
()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad4.js locks_bad4.cc\n Failure msg: Expected \'locks_bad4.cc:13: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad2.js locks_bad2.cc\n Failure msg: Expected \'locks_bad2.cc:12: error: precondition not met\' in error output; not found. stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\nTest Failure: \n Test command: /var/www/html/dxr/tools/gcc-dehydra/installed/bin/../libexec/gcc/x86_64-unknown-linux-gnu/4.3.0/cc1plus -quiet -fplugin=../gcc_treehydra.so -o /dev/null -fplugin-arg=test_locks_bad1.js locks_bad1.cc\n Failure msg: Expected \'locks_bad1.cc:11: error: precondition not met\' in error output; not found. 
stderr:../libs/treehydra.js:12: JS Exception: No case_val in this lazy object\n:0: #0: Error("No case_val in this lazy object")\n../libs/treehydra.js:12: #1: unhandledLazyProperty("case_val")\n../libs/unstable/esp.js:481: #2: ()\n./esp_lock.js:41: #3: process_tree([object GCCNode])\n\n\nUnit Test Suite Summary:\n 32 passed\n 6 failed\n 0 error(s)\nmake[1]: *** [check_treehydra] Error 1\nmake[1]: Leaving directory `/var/www/html/dxr/tools/gcc-dehydra/dehydra/test\'\nmake: *** [check] Error 2'
}
],
},
]
expected_results = [
[
"nsFrame.cpp",
"layout",
"html",
"base",
"src",
"nsFrame.cpp",
"layout/html/base/src/nsFrame.cpp",
"html/base/src/nsFrame.cpp",
"base/src/nsFrame.cpp",
"src/nsFrame.cpp",
"nsFrame.cpp",
"layout",
"html",
"base",
"src",
"nsFrame.cpp",
"layout/html/base/src/nsFrame.cpp",
"html/base/src/nsFrame.cpp",
"base/src/nsFrame.cpp",
"src/nsFrame.cpp",
"nsFrame.cpp",
],
[
"gcc_treehydra.so",
"/gcc_treehydra.so",
"gcc_treehydra.so",
"test_locks_bad3.js",
"locks_bad3.cc",
"locks_bad3.cc",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"unstable",
"esp.js",
"/libs/unstable/esp.js",
"libs/unstable/esp.js",
"unstable/esp.js",
"esp.js",
"esp_lock.js",
"/esp_lock.js",
"esp_lock.js",
"gcc_treehydra.so",
"/gcc_treehydra.so",
"gcc_treehydra.so",
"test_locks_good.js",
"locks_good.cc",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"unstable",
"esp.js",
"/libs/unstable/esp.js",
"libs/unstable/esp.js",
"unstable/esp.js",
"esp.js",
"esp_lock.js",
"/esp_lock.js",
"esp_lock.js",
"gcc_treehydra.so",
"/gcc_treehydra.so",
"gcc_treehydra.so",
"test_locks_good2.js",
"locks_good2.cc",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"unstable",
"esp.js",
"/libs/unstable/esp.js",
"libs/unstable/esp.js",
"unstable/esp.js",
"esp.js",
"esp_lock.js",
"/esp_lock.js",
"esp_lock.js",
"gcc_treehydra.so",
"/gcc_treehydra.so",
"gcc_treehydra.so",
"test_locks_bad4.js",
"locks_bad4.cc",
"locks_bad4.cc",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"unstable",
"esp.js",
"/libs/unstable/esp.js",
"libs/unstable/esp.js",
"unstable/esp.js",
"esp.js",
"esp_lock.js",
"/esp_lock.js",
"esp_lock.js",
"gcc_treehydra.so",
"/gcc_treehydra.so",
"gcc_treehydra.so",
"test_locks_bad2.js",
"locks_bad2.cc",
"locks_bad2.cc",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"unstable",
"esp.js",
"/libs/unstable/esp.js",
"libs/unstable/esp.js",
"unstable/esp.js",
"esp.js",
"esp_lock.js",
"/esp_lock.js",
"esp_lock.js",
"gcc_treehydra.so",
"/gcc_treehydra.so",
"gcc_treehydra.so",
"test_locks_bad1.js",
"locks_bad1.cc",
"locks_bad1.cc",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"treehydra.js",
"/libs/treehydra.js",
"libs/treehydra.js",
"treehydra.js",
"libs",
"unstable",
"esp.js",
"/libs/unstable/esp.js",
"libs/unstable/esp.js",
"unstable/esp.js",
"esp.js",
"esp_lock.js",
"/esp_lock.js",
"esp_lock.js",
],
]

feature_extractor = FilePaths()
results = [feature_extractor(item) for item in inline_data]
assert results == expected_results