diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index f072df79e8..c7ded6e560 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -303,6 +303,7 @@ https hunspell hur hwloc +hyperscan i icecast icu @@ -606,6 +607,7 @@ pybabel pycon pycqa pypa +pyperscan pypi pytest pythex diff --git a/README.md b/README.md index f1c95371a0..36795121d2 100644 --- a/README.md +++ b/README.md @@ -541,6 +541,7 @@ Checkers: -s SKIPS, --skips SKIPS comma-separated list of checkers to disable -r RUNS, --runs RUNS comma-separated list of checkers to enable + --pyperscan use pyperscan for binary checkers (unsupported on Windows) Database Management: --import-json IMPORT_JSON diff --git a/cve_bin_tool/cli.py b/cve_bin_tool/cli.py index f7f520dc9c..3e978d1f8c 100644 --- a/cve_bin_tool/cli.py +++ b/cve_bin_tool/cli.py @@ -488,6 +488,12 @@ def main(argv=None): help="comma-separated list of checkers to enable", default="", ) + checker_group.add_argument( + "--pyperscan", + action="store_true", + help="Use pyperscan for binary checkers (unsupported on Windows)", + default=False, + ) database_group = parser.add_argument_group("Database Management") database_group.add_argument( @@ -1126,6 +1132,7 @@ def main(argv=None): validate=not args["disable_validation_check"], sources=enabled_sources, no_scan=args["no_scan"], + pyperscan=args["pyperscan"], ) version_scanner.remove_skiplist(skips) LOGGER.info(f"Number of checkers: {version_scanner.number_of_checkers()}") diff --git a/cve_bin_tool/version_scanner.py b/cve_bin_tool/version_scanner.py index 7940e51f96..907c78a23e 100644 --- a/cve_bin_tool/version_scanner.py +++ b/cve_bin_tool/version_scanner.py @@ -8,7 +8,7 @@ from pathlib import Path from typing import Iterator -from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker +from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker, VersionMatchInfo from cve_bin_tool.cvedb import CVEDB from cve_bin_tool.egg_updater import 
IS_DEVELOP, update_egg from cve_bin_tool.error_handler import ErrorMode @@ -19,6 +19,9 @@ from cve_bin_tool.strings import parse_strings from cve_bin_tool.util import DirWalk, ProductInfo, ScanInfo, inpath +if sys.platform != "win32": + from pyperscan import Pattern, Scan, StreamDatabase + if sys.version_info >= (3, 10): from importlib import metadata as importlib_metadata else: @@ -45,6 +48,7 @@ def __init__( validate: bool = True, sources=None, no_scan=False, + pyperscan=False, ): self.no_scan = no_scan self.logger = logger or LOGGER.getChild(self.__class__.__name__) @@ -73,6 +77,12 @@ def __init__( self.logger.info("Checkers loaded: %s" % (", ".join(self.checkers.keys()))) self.language_checkers = valid_files self.language_checkers_names = self.available_language_checkers() + if sys.platform == "win32" and pyperscan: + self.logger.error("pyperscan unsupported on Windows") + self.pyperscan = False + else: + self.pyperscan = pyperscan + self.pyperscan_db = None @classmethod def load_checkers(cls) -> dict[str, type[Checker]]: @@ -276,36 +286,79 @@ def scan_file(self, filename: str) -> Iterator[ScanInfo]: yield from self.run_checkers(filename, lines) + def build_pyperscan_database(self, checkers: Checker) -> None: + # Database built only once to improve performance + if self.pyperscan_db is None: + patterns = [] + for dummy_checker_name, checker in self.checkers.items(): + checker = checker() + checker.dummy_checker_name = dummy_checker_name + for pattern in checker.VERSION_PATTERNS + checker.CONTAINS_PATTERNS: + patterns.append(Pattern(pattern.pattern.encode(), tag=checker)) + if patterns: + self.pyperscan_db = StreamDatabase(*patterns) + + @staticmethod + def pyperscan_match( + pyperscan_matches: dict, checker: Checker, offset: int, end: int + ) -> Scan: + pyperscan_matches[checker.dummy_checker_name] = checker, offset, end + return Scan.Continue + def run_checkers(self, filename: str, lines: str) -> Iterator[ScanInfo]: """process a Set of checker objects, run 
them on file lines, and yield information about detected products and versions. It uses logging to provide debug and error information along the way.""" LOGGER.info(f"filename = {filename}") - # tko - for dummy_checker_name, checker in self.checkers.items(): - checker = checker() - version_results = checker.get_versions(lines, filename) - - if version_results.matched_filename or version_results.matched_contains: - for version in version_results.versions: - if version == "UNKNOWN": - file_path = "".join(self.file_stack) - self.logger.debug( - f"{dummy_checker_name} was detected with version UNKNOWN in file {file_path}" - ) - else: - file_path = "".join(self.file_stack) - self.logger.debug( - f"{file_path} matched {dummy_checker_name} {version} ({version_results.matched_filename=}, {version_results.matched_contains=})" - ) - for vendor, product in checker.VENDOR_PRODUCT: - yield ScanInfo( - ProductInfo(vendor, product, version), - file_path, - ) + if self.pyperscan: + self.build_pyperscan_database(self.checkers) + + pyperscan_matches = dict() + scanner = self.pyperscan_db.build(pyperscan_matches, self.pyperscan_match) + scanner.scan(lines.encode()) + + for dummy_checker_name, (checker, offset, end) in pyperscan_matches.items(): + # Confirm pyperscan match with get_versions as pyperscan doesn't support + # group capture. 
SOM_LEFTMOST is not enabled (offset is always 0) + version_results = checker.get_versions(lines[offset:end], filename) + yield from self.parse_version_match( + dummy_checker_name, checker, version_results + ) + else: + # tko + for dummy_checker_name, checker in self.checkers.items(): + checker = checker() + version_results = checker.get_versions(lines, filename) + yield from self.parse_version_match( + dummy_checker_name, checker, version_results + ) self.logger.debug(f"Done scanning file: {filename}") + def parse_version_match( + self, + dummy_checker_name: str, + checker: Checker, + version_results: VersionMatchInfo, + ): + if version_results.matched_filename or version_results.matched_contains: + for version in version_results.versions: + if version == "UNKNOWN": + file_path = "".join(self.file_stack) + self.logger.debug( + f"{dummy_checker_name} was detected with version UNKNOWN in file {file_path}" + ) + else: + file_path = "".join(self.file_stack) + self.logger.debug( + f"{file_path} matched {dummy_checker_name} {version} ({version_results.matched_filename=}, {version_results.matched_contains=})" + ) + for vendor, product in checker.VENDOR_PRODUCT: + yield ScanInfo( + ProductInfo(vendor, product, version), + file_path, + ) + @staticmethod def clean_file_path(filepath: str) -> str: """Returns a cleaner filepath by removing temp path from filepath""" diff --git a/doc/MANUAL.md b/doc/MANUAL.md index fc19de6e28..58b455c087 100644 --- a/doc/MANUAL.md +++ b/doc/MANUAL.md @@ -37,6 +37,7 @@ - [Checkers Arguments](#checkers-arguments) - [-s SKIPS, --skips SKIPS](#-s-skips---skips-skips) - [-r CHECKERS, --runs CHECKERS](#-r-checkers---runs-checkers) + - [--pyperscan](#--pyperscan) - [Input Arguments](#input-arguments) - [directory (positional argument)](#directory-positional-argument) - [-i INPUT\_FILE, --input-file INPUT\_FILE](#-i-input_file---input-file-input_file) @@ -214,6 +215,7 @@ which is useful if you're trying the latest code from -s SKIPS, --skips SKIPS 
comma-separated list of checkers to disable -r RUNS, --runs RUNS comma-separated list of checkers to enable + --pyperscan use pyperscan for binary checkers (unsupported on Windows) Database Management: --import-json IMPORT_JSON @@ -887,6 +889,17 @@ This option allows one to skip (disable) a comma-separated list of checkers and This option allows one to enable a comma-separated list of checkers. +### --pyperscan + +The pyperscan flag enables pyperscan support in the CVE Bin Tool. [pyperscan](https://github.com/vlaci/pyperscan) is an opinionated Python binding for [Hyperscan](https://www.hyperscan.io) focusing on ease of use and safety. + +When the pyperscan flag is enabled, the tool leverages the Hyperscan high-performance regular expression matching library to run all binary version checkers on a file simultaneously, which significantly reduces processing time. + +The pyperscan package is used instead of the better-known hyperscan package because pyperscan allows a tag to be attached to each pattern. This feature makes it easy to retrieve the binary checker associated with the matched pattern. + +> **Note**: pyperscan is unsupported on Windows. + + ## Input Arguments ### directory (positional argument) diff --git a/requirements.csv b/requirements.csv index 3ad34e857e..9d2aa64713 100644 --- a/requirements.csv +++ b/requirements.csv @@ -26,3 +26,4 @@ the_purl_authors_not_in_db,packageurl-python h2non,filetype python,setuptools jaraco,zipp +vlaci_not_in_db,pyperscan diff --git a/requirements.txt b/requirements.txt index 7b7279d611..f9dbea39c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,7 @@ lib4vex>=0.2.0 packageurl-python packaging>=22.0 plotly +pyperscan; sys_platform != 'win32' # pyperscan unsupported on Windows python-gnupg pyyaml>=5.4 requests>=2.32.2