Skip to content

feat: add pyperscan support #5228

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/actions/spelling/allow.txt
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ https
hunspell
hur
hwloc
hyperscan
i
icecast
icu
Expand Down Expand Up @@ -606,6 +607,7 @@ pybabel
pycon
pycqa
pypa
pyperscan
pypi
pytest
pythex
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,7 @@ Checkers:
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#-s-skips---skips-skips">-s SKIPS, --skips SKIPS</a>
comma-separated list of checkers to disable
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#-r-checkers---runs-checkers">-r RUNS, --runs RUNS</a> comma-separated list of checkers to enable
<a href="https://github.com/intel/cve-bin-tool/blob/main/doc/MANUAL.md#--pyperscan">--pyperscan</a> use pyperscan for binary checkers (unsupported on Windows)

Database Management:
--import-json IMPORT_JSON
Expand Down
7 changes: 7 additions & 0 deletions cve_bin_tool/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,12 @@ def main(argv=None):
help="comma-separated list of checkers to enable",
default="",
)
checker_group.add_argument(
"--pyperscan",
action="store_true",
help="Use pyperscan for binary checkers (unsupported on Windows)",
default=False,
)

database_group = parser.add_argument_group("Database Management")
database_group.add_argument(
Expand Down Expand Up @@ -1126,6 +1132,7 @@ def main(argv=None):
validate=not args["disable_validation_check"],
sources=enabled_sources,
no_scan=args["no_scan"],
pyperscan=args["pyperscan"],
)
version_scanner.remove_skiplist(skips)
LOGGER.info(f"Number of checkers: {version_scanner.number_of_checkers()}")
Expand Down
99 changes: 76 additions & 23 deletions cve_bin_tool/version_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from pathlib import Path
from typing import Iterator

from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker
from cve_bin_tool.checkers import BUILTIN_CHECKERS, Checker, VersionMatchInfo
from cve_bin_tool.cvedb import CVEDB
from cve_bin_tool.egg_updater import IS_DEVELOP, update_egg
from cve_bin_tool.error_handler import ErrorMode
Expand All @@ -19,6 +19,9 @@
from cve_bin_tool.strings import parse_strings
from cve_bin_tool.util import DirWalk, ProductInfo, ScanInfo, inpath

if sys.platform != "win32":
from pyperscan import Pattern, Scan, StreamDatabase

if sys.version_info >= (3, 10):
from importlib import metadata as importlib_metadata
else:
Expand All @@ -45,6 +48,7 @@ def __init__(
validate: bool = True,
sources=None,
no_scan=False,
pyperscan=False,
):
self.no_scan = no_scan
self.logger = logger or LOGGER.getChild(self.__class__.__name__)
Expand Down Expand Up @@ -73,6 +77,12 @@ def __init__(
self.logger.info("Checkers loaded: %s" % (", ".join(self.checkers.keys())))
self.language_checkers = valid_files
self.language_checkers_names = self.available_language_checkers()
if sys.platform == "win32" and pyperscan:
self.logger.error("pyperscan unsupported on Windows")
self.pyperscan = False
else:
self.pyperscan = pyperscan
self.pyperscan_db = None

@classmethod
def load_checkers(cls) -> dict[str, type[Checker]]:
Expand Down Expand Up @@ -276,36 +286,79 @@ def scan_file(self, filename: str) -> Iterator[ScanInfo]:

yield from self.run_checkers(filename, lines)

def build_pyperscan_database(
    self, checkers: dict[str, type[Checker]] | None
) -> None:
    """Compile all checker patterns into a single pyperscan stream database.

    Every pattern listed in a checker's ``VERSION_PATTERNS`` and
    ``CONTAINS_PATTERNS`` is added, tagged with an instance of that checker
    so that a match can be traced back to the checker that owns the pattern.
    The result is stored in ``self.pyperscan_db``.

    Args:
        checkers: mapping of checker name to checker class. ``None`` falls
            back to ``self.checkers``.
    """
    # Database built only once to improve performance
    if self.pyperscan_db is not None:
        return

    if checkers is None:
        checkers = self.checkers

    patterns = []
    for dummy_checker_name, checker_class in checkers.items():
        checker = checker_class()
        # The match callback only receives the tag, so the checker's name
        # is carried on the instance itself.
        checker.dummy_checker_name = dummy_checker_name
        for pattern in checker.VERSION_PATTERNS + checker.CONTAINS_PATTERNS:
            patterns.append(Pattern(pattern.pattern.encode(), tag=checker))

    # With no patterns the database is left as None.
    if patterns:
        self.pyperscan_db = StreamDatabase(*patterns)

@staticmethod
def pyperscan_match(
    pyperscan_matches: dict, checker: Checker, offset: int, end: int
) -> Scan:
    """Record one pattern hit and tell the scanner to keep going.

    The hit is stored in ``pyperscan_matches`` under the checker's
    ``dummy_checker_name``, so a later hit for the same checker replaces
    the earlier one.

    Returns:
        ``Scan.Continue``.
    """
    match_record = (checker, offset, end)
    pyperscan_matches[checker.dummy_checker_name] = match_record
    return Scan.Continue

def run_checkers(self, filename: str, lines: str) -> Iterator[ScanInfo]:
    """Run the loaded checkers on a file's contents and yield what they detect.

    When ``self.pyperscan`` is set, all checker patterns are matched in one
    pass through the pyperscan database and only the checkers that produced
    a hit are then asked for versions. Otherwise every checker is run in
    turn. If no database could be built (no patterns were collected), the
    per-checker loop is used instead of failing.

    Args:
        filename: name of the file being scanned.
        lines: the file's contents as text.

    Yields:
        Whatever ``parse_version_match`` yields for each checker result.
    """
    LOGGER.info(f"filename = {filename}")

    if self.pyperscan:
        # Returns immediately once a database has been stored on the instance.
        self.build_pyperscan_database(self.checkers)

    if self.pyperscan and self.pyperscan_db is not None:
        pyperscan_matches: dict = {}
        scanner = self.pyperscan_db.build(pyperscan_matches, self.pyperscan_match)
        scanner.scan(lines.encode())

        for dummy_checker_name, (checker, offset, end) in pyperscan_matches.items():
            # Confirm pyperscan match with get_versions as pyperscan doesn't support
            # group capture. SOM_LEFTMOST is not enabled (offset is always 0)
            # NOTE(review): offsets come from the encoded bytes but slice the
            # str; with offset 0 the slice still reaches the match because
            # the encoded form is never shorter than the text - confirm.
            version_results = checker.get_versions(lines[offset:end], filename)
            yield from self.parse_version_match(
                dummy_checker_name, checker, version_results
            )
    else:
        # tko
        for dummy_checker_name, checker in self.checkers.items():
            checker = checker()
            version_results = checker.get_versions(lines, filename)
            yield from self.parse_version_match(
                dummy_checker_name, checker, version_results
            )

    self.logger.debug(f"Done scanning file: {filename}")

def parse_version_match(
    self,
    dummy_checker_name: str,
    checker: Checker,
    version_results: VersionMatchInfo,
):
    """Turn one checker's version results into ScanInfo records.

    Nothing is produced unless the checker matched on the file name or on
    the file contents. Otherwise each reported version is logged at debug
    level and one ScanInfo is yielded per vendor/product pair listed in the
    checker's ``VENDOR_PRODUCT``.
    """
    matched = version_results.matched_filename or version_results.matched_contains
    if not matched:
        return

    for version in version_results.versions:
        # Both log messages and the yielded records use the same path.
        file_path = "".join(self.file_stack)
        if version == "UNKNOWN":
            message = f"{dummy_checker_name} was detected with version UNKNOWN in file {file_path}"
        else:
            message = f"{file_path} matched {dummy_checker_name} {version} ({version_results.matched_filename=}, {version_results.matched_contains=})"
        self.logger.debug(message)

        for vendor, product in checker.VENDOR_PRODUCT:
            product_info = ProductInfo(vendor, product, version)
            yield ScanInfo(product_info, file_path)

@staticmethod
def clean_file_path(filepath: str) -> str:
"""Returns a cleaner filepath by removing temp path from filepath"""
Expand Down
13 changes: 13 additions & 0 deletions doc/MANUAL.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
- [Checkers Arguments](#checkers-arguments)
- [-s SKIPS, --skips SKIPS](#-s-skips---skips-skips)
- [-r CHECKERS, --runs CHECKERS](#-r-checkers---runs-checkers)
- [--pyperscan](#--pyperscan)
- [Input Arguments](#input-arguments)
- [directory (positional argument)](#directory-positional-argument)
- [-i INPUT\_FILE, --input-file INPUT\_FILE](#-i-input_file---input-file-input_file)
Expand Down Expand Up @@ -214,6 +215,7 @@ which is useful if you're trying the latest code from
-s SKIPS, --skips SKIPS
comma-separated list of checkers to disable
-r RUNS, --runs RUNS comma-separated list of checkers to enable
--pyperscan use pyperscan for binary checkers (unsupported on Windows)

Database Management:
--import-json IMPORT_JSON
Expand Down Expand Up @@ -887,6 +889,17 @@ This option allows one to skip (disable) a comma-separated list of checkers and

This option allows one to enable a comma-separated list of checkers.

### --pyperscan

The `--pyperscan` flag enables pyperscan support in the CVE Bin Tool. [pyperscan](https://github.com/vlaci/pyperscan) is an opinionated Python binding for [Hyperscan](https://www.hyperscan.io) focusing on ease of use and safety.

When the pyperscan flag is enabled, the tool leverages the Hyperscan high-performance regular expression matching library to run all binary version checkers on a file simultaneously, which significantly reduces processing time.

The pyperscan package is used instead of the better-known hyperscan package because pyperscan allows a tag to be attached to each pattern. This makes it easy to retrieve the binary checker associated with a matched pattern.

> **Note**: pyperscan is unsupported on Windows.


## Input Arguments

### directory (positional argument)
Expand Down
1 change: 1 addition & 0 deletions requirements.csv
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,4 @@ the_purl_authors_not_in_db,packageurl-python
h2non,filetype
python,setuptools
jaraco,zipp
vlaci_not_in_db,pyperscan
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ lib4vex>=0.2.0
packageurl-python
packaging>=22.0
plotly
pyperscan; sys_platform != 'win32' # pyperscan unsupported on Windows
python-gnupg
pyyaml>=5.4
requests>=2.32.2
Expand Down
Loading