-
Notifications
You must be signed in to change notification settings - Fork 28
feat(security): Add package name typosquatting detection #1059
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
ad7c576
25fc7d0
f560201
a775b6d
4140160
4c506c2
28f9936
e0b538e
8d37e9b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -181,3 +181,4 @@ docs/_build | |
bin/ | ||
requirements.txt | ||
.macaron_env_file | ||
**/.DS_Store | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,296 @@ | ||
# Copyright (c) 2024 - 2025, Oracle and/or its affiliates. All rights reserved. | ||
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. | ||
|
||
"""Analyzer checks if there is typosquatting presence in the package name.""" | ||
import logging | ||
import os | ||
|
||
from macaron.config.defaults import defaults | ||
from macaron.config.global_config import global_config | ||
from macaron.errors import HeuristicAnalyzerValueError | ||
from macaron.json_tools import JsonType | ||
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer | ||
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics | ||
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class TyposquattingPresenceAnalyzer(BaseHeuristicAnalyzer): | ||
"""Check whether the PyPI package has typosquatting presence.""" | ||
|
||
# Maps every supported character to its (row, column) cell on a QWERTY grid.
# Rows are listed top to bottom and keys left to right, so the column of a key
# is simply its index inside its row string.
KEYBOARD_LAYOUT: dict[str, tuple[int, int]] = {
    key: (row_index, column_index)
    for row_index, row_keys in enumerate(("1234567890-", "qwertyuiop", "asdfghjkl", "zxcvbnm"))
    for column_index, key in enumerate(row_keys)
}
|
||
def __init__(self, popular_packages_path: str | None = None) -> None:
    """Initialize the analyzer and load its configuration.

    Parameters
    ----------
    popular_packages_path : str | None
        Path to the popular packages file. When omitted or empty,
        ``popular_packages.txt`` under ``global_config.resources_path`` is used.

    Raises
    ------
    HeuristicAnalyzerValueError
        Propagated from ``_load_defaults`` when the popular packages file
        cannot be found or read.
    """
    super().__init__(
        name="typosquatting_presence_analyzer", heuristic=Heuristics.TYPOSQUATTING_PRESENCE, depends_on=None
    )
    # An explicit, non-empty path wins over the bundled resource file.
    self.default_path = popular_packages_path or os.path.join(
        global_config.resources_path, "popular_packages.txt"
    )
    (
        self.popular_packages,
        self.distance_ratio_threshold,
        self.keyboard,
        self.scaling,
        self.cost,
    ) = self._load_defaults()
|
||
def _load_defaults(self) -> tuple[list[str], float, float, float, float]:
    """Load the popular packages list and the tuning parameters.

    Values are read from the ``heuristic.pypi`` section of ``defaults`` when
    that section exists; otherwise the built-in fallbacks below are used.
    A non-empty ``popular_packages_path`` option in that section takes
    precedence over ``self.default_path``.

    Returns
    -------
    tuple[list[str], float, float, float, float]:
        The popular packages list, distance ratio threshold,
        keyboard awareness factor, scaling factor, and cost factor.

    Raises
    ------
    HeuristicAnalyzerValueError
        If the popular packages file does not exist, or cannot be read or decoded.
    """
    section_name = "heuristic.pypi"
    path = self.default_path
    distance_ratio_threshold = 0.95
    keyboard = 0.8
    scaling = 0.15
    cost = 1.0

    if defaults.has_section(section_name):
        section = defaults[section_name]
        path_from_config = section.get("popular_packages_path", self.default_path)
        # Fall back to default if the path in defaults.ini is empty.
        if path_from_config.strip():
            path = path_from_config
        distance_ratio_threshold = section.getfloat("distance_ratio_threshold", 0.95)
        keyboard = section.getfloat("keyboard", 0.8)
        scaling = section.getfloat("scaling", 0.15)
        cost = section.getfloat("cost", 1.0)

    if not path or not os.path.exists(path):
        err_msg = "Popular packages file not found or path not configured"
        logger.debug(err_msg)
        raise HeuristicAnalyzerValueError(err_msg)

    try:
        with open(path, encoding="utf-8") as file:
            # Strip surrounding whitespace and drop blank lines so that a file
            # containing only newlines is reported as empty by the caller and
            # padded names still compare equal to real package names.
            popular_packages_list = [name for line in file.read().splitlines() if (name := line.strip())]
    except (OSError, UnicodeDecodeError) as error:
        # A file that is not valid UTF-8 is as unusable as an unreadable one.
        err_msg = "Could not read popular packages file"
        logger.debug(err_msg)
        raise HeuristicAnalyzerValueError(err_msg) from error

    return (
        popular_packages_list,
        distance_ratio_threshold,
        keyboard,
        scaling,
        cost,
    )
|
||
def are_neighbors(self, first_char: str, second_char: str) -> bool:
    """Check if two characters are adjacent on a QWERTY keyboard.

    Adjacent characters are those that are next to each other
    either horizontally, vertically, or diagonally, i.e. whose rows and
    columns in ``KEYBOARD_LAYOUT`` each differ by at most one. A character
    that is missing from ``KEYBOARD_LAYOUT`` is never a neighbor of anything.

    Parameters
    ----------
    first_char : str
        The first character.
    second_char : str
        The second character.

    Returns
    -------
    bool
        True if the characters are neighbors, False otherwise.
    """
    first_position = self.KEYBOARD_LAYOUT.get(first_char)
    second_position = self.KEYBOARD_LAYOUT.get(second_char)
    # Compare against None explicitly: a coordinate tuple is a valid value
    # and must not be confused with a missing key.
    if first_position is None or second_position is None:
        return False

    row_gap = abs(first_position[0] - second_position[0])
    column_gap = abs(first_position[1] - second_position[1])
    return row_gap <= 1 and column_gap <= 1
|
||
def substitution_func(self, first_char: str, second_char: str) -> float:
    """Calculate the substitution cost between two characters.

    Parameters
    ----------
    first_char : str
        The first character.
    second_char : str
        The second character.

    Returns
    -------
    float
        0.0 if the characters are the same, `self.keyboard` if they are
        neighbors on a QWERTY keyboard, otherwise `self.cost`. A falsy
        `self.keyboard` (e.g. 0) disables keyboard awareness, so every
        mismatch then costs `self.cost`.
    """
    if first_char == second_char:
        return 0.0
    # Only consult the keyboard layout when keyboard awareness is enabled.
    if self.keyboard and self.are_neighbors(first_char, second_char):
        return self.keyboard
    return self.cost
|
||
def jaro_distance(self, package_name: str, popular_package_name: str) -> float:
    """Calculate the Jaro similarity between two package names.

    Despite the name, the value grows with similarity: identical names
    score 1.0 and names without any matching character score 0.0.
    Matched characters that appear in a different order are penalized with
    ``self.substitution_func`` instead of a fixed cost of one.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The Jaro distance between the two package names.
    """
    if package_name == popular_package_name:
        return 1.0

    package_name_len = len(package_name)
    popular_package_name_len = len(popular_package_name)
    if package_name_len == 0 or popular_package_name_len == 0:
        return 0.0

    # Characters only match when their positions are at most this far apart.
    # Clamp at zero so one-character names do not produce a negative window.
    match_distance = max(0, max(package_name_len, popular_package_name_len) // 2 - 1)

    package_name_matches = [False] * package_name_len
    popular_package_name_matches = [False] * popular_package_name_len
    matches = 0

    # Count matches: each character may be paired with at most one partner.
    for first_index, char in enumerate(package_name):
        start = max(0, first_index - match_distance)
        end = min(first_index + match_distance + 1, popular_package_name_len)
        for second_index in range(start, end):
            if popular_package_name_matches[second_index]:
                continue
            if char == popular_package_name[second_index]:
                package_name_matches[first_index] = True
                popular_package_name_matches[second_index] = True
                matches += 1
                break

    if matches == 0:
        return 0.0

    # Walk both sequences of matched characters in order; every pair that
    # differs is an out-of-order match and is charged its substitution cost.
    matched_in_package = [char for char, hit in zip(package_name, package_name_matches) if hit]
    matched_in_popular = [char for char, hit in zip(popular_package_name, popular_package_name_matches) if hit]
    transpositions = 0.0  # A float to handle partial costs.
    for first_char, second_char in zip(matched_in_package, matched_in_popular):
        if first_char != second_char:
            transpositions += self.substitution_func(first_char, second_char)

    transpositions /= 2.0  # Adjust for transpositions being counted twice.

    return (
        matches / package_name_len + matches / popular_package_name_len + (matches - transpositions) / matches
    ) / 3.0
|
||
def ratio(self, package_name: str, popular_package_name: str) -> float:
    """Calculate the Jaro-Winkler distance ratio.

    The Jaro score is boosted by ``prefix_length * self.scaling * (1 - jaro)``,
    where ``prefix_length`` is the number of leading characters the two
    names share, capped at four.

    Parameters
    ----------
    package_name : str
        The name of the package being analyzed.
    popular_package_name : str
        The name of a popular package to compare against.

    Returns
    -------
    float
        The Jaro-Winkler distance ratio, incorporating a prefix bonus
        for common initial characters.
    """
    jaro_dist = self.jaro_distance(package_name, popular_package_name)

    max_prefix = 4
    prefix_length = 0
    # zip stops at the shorter slice, so no explicit length check is needed.
    for first_char, second_char in zip(package_name[:max_prefix], popular_package_name[:max_prefix]):
        if first_char != second_char:
            break
        prefix_length += 1

    return jaro_dist + prefix_length * self.scaling * (1 - jaro_dist)
|
||
def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
    """Analyze the package.

    The package name is compared against every entry in ``self.popular_packages``.
    A name that is itself on the list passes; otherwise the first popular
    package whose similarity ratio reaches ``self.distance_ratio_threshold``
    causes a failure.

    Parameters
    ----------
    pypi_package_json: PyPIPackageJsonAsset
        The PyPI package JSON asset object.

    Returns
    -------
    tuple[HeuristicResult, dict[str, JsonType]]:
        The result and related information collected during the analysis.
    """
    if not self.popular_packages:
        err_msg = "Popular packages file is empty"
        logger.warning(err_msg)
        return HeuristicResult.SKIP, {"error": err_msg}

    package_name = pypi_package_json.component_name

    # Rule out exact matches before any similarity comparison. Doing this
    # inside the loop below would flag a popular package as a typosquat
    # whenever a near-identical popular name (e.g. "boto" vs. "boto3")
    # happens to appear earlier in the list.
    if package_name in self.popular_packages:
        return HeuristicResult.PASS, {"package_name": package_name}

    for popular_package in self.popular_packages:
        distance_ratio = self.ratio(package_name, popular_package)
        if distance_ratio >= self.distance_ratio_threshold:
            logger.info(
                "Potential typosquatting detected: '%s' is similar to popular package '%s' (ratio: %.3f)",
                package_name,
                popular_package,
                distance_ratio,
            )
            return HeuristicResult.FAIL, {
                "package_name": package_name,
                "popular_package": popular_package,
                "similarity_ratio": distance_ratio,
            }

    return HeuristicResult.PASS, {"package_name": package_name}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This does not cover subdirectories such as `tests`, `parsers`, etc.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I changed it to `.DS_Store`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, I was wrong above. `**/.DS_Store` should be sufficient; you just need to remove all the matching files that were already added.