diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt
index f072df79e8..c7ded6e560 100644
--- a/.github/actions/spelling/allow.txt
+++ b/.github/actions/spelling/allow.txt
@@ -303,6 +303,7 @@ https
hunspell
hur
hwloc
+hyperscan
i
icecast
icu
@@ -606,6 +607,7 @@ pybabel
pycon
pycqa
pypa
+pyperscan
pypi
pytest
pythex
diff --git a/README.md b/README.md
index f1c95371a0..36795121d2 100644
--- a/README.md
+++ b/README.md
@@ -541,6 +541,7 @@ Checkers:
-s SKIPS, --skips SKIPS
comma-separated list of checkers to disable
-r RUNS, --runs RUNS comma-separated list of checkers to enable
+ --pyperscan use pyperscan for binary checkers (unsupported on Windows)
Database Management:
--import-json IMPORT_JSON
diff --git a/cve_bin_tool/cli.py b/cve_bin_tool/cli.py
index f7f520dc9c..3e978d1f8c 100644
--- a/cve_bin_tool/cli.py
+++ b/cve_bin_tool/cli.py
@@ -488,6 +488,12 @@ def main(argv=None):
help="comma-separated list of checkers to enable",
default="",
)
+    # Opt-in flag; its value is forwarded to VersionScanner as ``pyperscan=``.
+    # Help text is lowercase to match the other checker options and the
+    # usage output reproduced in README.md and doc/MANUAL.md.
+    checker_group.add_argument(
+        "--pyperscan",
+        action="store_true",
+        help="use pyperscan for binary checkers (unsupported on Windows)",
+        default=False,
+    )
database_group = parser.add_argument_group("Database Management")
database_group.add_argument(
@@ -1126,6 +1132,7 @@ def main(argv=None):
validate=not args["disable_validation_check"],
sources=enabled_sources,
no_scan=args["no_scan"],
+ pyperscan=args["pyperscan"],
)
version_scanner.remove_skiplist(skips)
LOGGER.info(f"Number of checkers: {version_scanner.number_of_checkers()}")
diff --git a/cve_bin_tool/version_scanner.py b/cve_bin_tool/version_scanner.py
index 7940e51f96..907c78a23e 100644
--- a/cve_bin_tool/version_scanner.py
+++ b/cve_bin_tool/version_scanner.py
@@ -8,7 +8,7 @@
from pathlib import Path
from typing import Iterator
-from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker
+from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker, VersionMatchInfo
from cve_bin_tool.cvedb import CVEDB
from cve_bin_tool.egg_updater import IS_DEVELOP, update_egg
from cve_bin_tool.error_handler import ErrorMode
@@ -19,6 +19,9 @@
from cve_bin_tool.strings import parse_strings
from cve_bin_tool.util import DirWalk, ProductInfo, ScanInfo, inpath
+if sys.platform != "win32":
+ from pyperscan import Pattern, Scan, StreamDatabase
+
if sys.version_info >= (3, 10):
from importlib import metadata as importlib_metadata
else:
@@ -45,6 +48,7 @@ def __init__(
validate: bool = True,
sources=None,
no_scan=False,
+ pyperscan=False,
):
self.no_scan = no_scan
self.logger = logger or LOGGER.getChild(self.__class__.__name__)
@@ -73,6 +77,12 @@ def __init__(
self.logger.info("Checkers loaded: %s" % (", ".join(self.checkers.keys())))
self.language_checkers = valid_files
self.language_checkers_names = self.available_language_checkers()
+ if sys.platform == "win32" and pyperscan:
+ self.logger.error("pyperscan unsupported on Windows")
+ self.pyperscan = False
+ else:
+ self.pyperscan = pyperscan
+ self.pyperscan_db = None
@classmethod
def load_checkers(cls) -> dict[str, type[Checker]]:
@@ -276,36 +286,79 @@ def scan_file(self, filename: str) -> Iterator[ScanInfo]:
yield from self.run_checkers(filename, lines)
+    def build_pyperscan_database(self, checkers: dict[str, type[Checker]]) -> None:
+        """Compile the patterns of all given checkers into one pyperscan database.
+
+        Every checker class in ``checkers`` is instantiated and each of its
+        VERSION_PATTERNS and CONTAINS_PATTERNS is added as a pyperscan
+        ``Pattern`` tagged with that checker instance. The result is cached in
+        ``self.pyperscan_db``; once a database exists, later calls return
+        immediately and their ``checkers`` argument is not consulted.
+
+        ``self.pyperscan_db`` is left as ``None`` when no pattern is found.
+
+        checkers: mapping of checker name to checker class (the caller in
+        run_checkers passes ``self.checkers``).
+        """
+        # Database built only once to improve performance
+        if self.pyperscan_db is not None:
+            return
+
+        patterns = []
+        for checker_name, checker_cls in checkers.items():
+            checker = checker_cls()
+            # pyperscan_match() reads this attribute back from the tag to key
+            # its results by checker name.
+            checker.dummy_checker_name = checker_name
+            patterns.extend(
+                Pattern(regex.pattern.encode(), tag=checker)
+                for regex in checker.VERSION_PATTERNS + checker.CONTAINS_PATTERNS
+            )
+        if patterns:
+            self.pyperscan_db = StreamDatabase(*patterns)
+
+    @staticmethod
+    def pyperscan_match(
+        pyperscan_matches: dict, checker: Checker, offset: int, end: int
+    ) -> Scan:
+        """Match callback handed to the pyperscan scanner by run_checkers.
+
+        Stores ``(checker, offset, end)`` in ``pyperscan_matches`` under the
+        checker's ``dummy_checker_name``; a later match for the same checker
+        replaces the earlier entry. Always returns ``Scan.Continue``
+        (presumably asking pyperscan to keep scanning -- confirm against the
+        pyperscan documentation).
+        """
+        name = checker.dummy_checker_name
+        pyperscan_matches[name] = (checker, offset, end)
+        return Scan.Continue
+
def run_checkers(self, filename: str, lines: str) -> Iterator[ScanInfo]:
"""process a Set of checker objects, run them on file lines,
and yield information about detected products and versions.
It uses logging to provide debug and error information along the way."""
LOGGER.info(f"filename = {filename}")
- # tko
- for dummy_checker_name, checker in self.checkers.items():
- checker = checker()
- version_results = checker.get_versions(lines, filename)
-
- if version_results.matched_filename or version_results.matched_contains:
- for version in version_results.versions:
- if version == "UNKNOWN":
- file_path = "".join(self.file_stack)
- self.logger.debug(
- f"{dummy_checker_name} was detected with version UNKNOWN in file {file_path}"
- )
- else:
- file_path = "".join(self.file_stack)
- self.logger.debug(
- f"{file_path} matched {dummy_checker_name} {version} ({version_results.matched_filename=}, {version_results.matched_contains=})"
- )
- for vendor, product in checker.VENDOR_PRODUCT:
- yield ScanInfo(
- ProductInfo(vendor, product, version),
- file_path,
- )
+        if self.pyperscan:
+            self.build_pyperscan_database(self.checkers)
+
+            pyperscan_matches = dict()
+            # pyperscan_db stays None when no checker contributed a pattern
+            # (e.g. no binary checker is enabled); there is nothing to scan
+            # for in that case, so skip the scan instead of failing on None.
+            if self.pyperscan_db is not None:
+                scanner = self.pyperscan_db.build(
+                    pyperscan_matches, self.pyperscan_match
+                )
+                scanner.scan(lines.encode())
+
+            for dummy_checker_name, (checker, offset, end) in pyperscan_matches.items():
+                # Confirm pyperscan match with get_versions as pyperscan doesn't support
+                # group capture. SOM_LEFTMOST is not enabled (offset is always 0)
+                # NOTE(review): offset/end presumably index the encoded bytes,
+                # not the str being sliced; with offset == 0 the slice can only
+                # cover more text than the match, never less. Revisit if
+                # start-of-match reporting is ever enabled.
+                version_results = checker.get_versions(lines[offset:end], filename)
+                yield from self.parse_version_match(
+                    dummy_checker_name, checker, version_results
+                )
+        else:
+            # tko
+            for dummy_checker_name, checker in self.checkers.items():
+                checker = checker()
+                version_results = checker.get_versions(lines, filename)
+                yield from self.parse_version_match(
+                    dummy_checker_name, checker, version_results
+                )
self.logger.debug(f"Done scanning file: {filename}")
+    def parse_version_match(
+        self,
+        dummy_checker_name: str,
+        checker: Checker,
+        version_results: VersionMatchInfo,
+    ):
+        """Turn one checker's match result into ScanInfo records.
+
+        Nothing is produced unless the result matched on filename or on
+        contents. For every reported version a debug line is logged; versions
+        equal to "UNKNOWN" are only logged, all others yield one ScanInfo per
+        (vendor, product) pair in ``checker.VENDOR_PRODUCT``.
+        """
+        if not (version_results.matched_filename or version_results.matched_contains):
+            return
+
+        for version in version_results.versions:
+            # Evaluated per version (not once up front): this is a generator,
+            # so file_stack is read at the moment each version is processed.
+            file_path = "".join(self.file_stack)
+
+            if version == "UNKNOWN":
+                self.logger.debug(
+                    f"{dummy_checker_name} was detected with version UNKNOWN in file {file_path}"
+                )
+                continue
+
+            self.logger.debug(
+                f"{file_path} matched {dummy_checker_name} {version} ({version_results.matched_filename=}, {version_results.matched_contains=})"
+            )
+            for vendor, product in checker.VENDOR_PRODUCT:
+                product_info = ProductInfo(vendor, product, version)
+                yield ScanInfo(product_info, file_path)
+
@staticmethod
def clean_file_path(filepath: str) -> str:
"""Returns a cleaner filepath by removing temp path from filepath"""
diff --git a/doc/MANUAL.md b/doc/MANUAL.md
index fc19de6e28..58b455c087 100644
--- a/doc/MANUAL.md
+++ b/doc/MANUAL.md
@@ -37,6 +37,7 @@
- [Checkers Arguments](#checkers-arguments)
- [-s SKIPS, --skips SKIPS](#-s-skips---skips-skips)
- [-r CHECKERS, --runs CHECKERS](#-r-checkers---runs-checkers)
+ - [--pyperscan](#--pyperscan)
- [Input Arguments](#input-arguments)
- [directory (positional argument)](#directory-positional-argument)
- [-i INPUT\_FILE, --input-file INPUT\_FILE](#-i-input_file---input-file-input_file)
@@ -214,6 +215,7 @@ which is useful if you're trying the latest code from
-s SKIPS, --skips SKIPS
comma-separated list of checkers to disable
-r RUNS, --runs RUNS comma-separated list of checkers to enable
+ --pyperscan use pyperscan for binary checkers (unsupported on Windows)
Database Management:
--import-json IMPORT_JSON
@@ -887,6 +889,17 @@ This option allows one to skip (disable) a comma-separated list of checkers and
This option allows one to enable a comma-separated list of checkers.
+### --pyperscan
+
+The pyperscan flag enables pyperscan support in the CVE Bin Tool. [pyperscan](https://github.com/vlaci/pyperscan) is an opinionated Python binding for [Hyperscan](https://www.hyperscan.io) focusing on ease of use and safety.
+
+When the pyperscan flag is enabled, the tool leverages the Hyperscan high-performance regular expression matching library to run all binary version checkers on a file simultaneously, which significantly reduces processing time.
+
+The pyperscan package is used instead of the better-known hyperscan package because pyperscan allows a tag to be attached to each pattern. This makes it easy to retrieve the binary checker associated with a matched pattern.
+
+> **Note**: pyperscan is unsupported on Windows.
+
+
## Input Arguments
### directory (positional argument)
diff --git a/requirements.csv b/requirements.csv
index 3ad34e857e..9d2aa64713 100644
--- a/requirements.csv
+++ b/requirements.csv
@@ -26,3 +26,4 @@ the_purl_authors_not_in_db,packageurl-python
h2non,filetype
python,setuptools
jaraco,zipp
+vlaci_not_in_db,pyperscan
diff --git a/requirements.txt b/requirements.txt
index 7b7279d611..f9dbea39c4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -14,6 +14,7 @@ lib4vex>=0.2.0
packageurl-python
packaging>=22.0
plotly
+pyperscan; sys_platform != 'win32' # pyperscan unsupported on Windows
python-gnupg
pyyaml>=5.4
requests>=2.32.2