From 1a2eff35bf7a27dc37b1c867c2cf83630c7cb31a Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 12 Mar 2025 21:29:32 +0500 Subject: [PATCH 01/19] feat: added new OSV data source class initial implementation --- cve_bin_tool/data_sources/new_osv_source.py | 38 +++++++++++++++++++++ test/test_new_osv_source.py | 27 +++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 cve_bin_tool/data_sources/new_osv_source.py create mode 100644 test/test_new_osv_source.py diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py new file mode 100644 index 0000000000..77dc1badc9 --- /dev/null +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -0,0 +1,38 @@ +from google.auth.credentials import AnonymousCredentials # type: ignore[import-untyped] +from google.cloud import storage # type: ignore[import-untyped] + +from cve_bin_tool.data_sources import Data_Source +from cve_bin_tool.log import LOGGER + + +class OSVDataSource(Data_Source): + def __init__(self, partial_update=False, bucket_name=None): + self._client = storage.Client(credentials=AnonymousCredentials(), project=None) + self.ecosystems_fetched = [] + self.download_url = [] + + self.bucket_name = bucket_name or "osv-vulnerabilities" + self.partial_update = partial_update + + async def update_ecosystem_info(self) -> None: + """Fetch OSV ecosystem information and download links""" + + LOGGER.debug("Started fetching OSV ecosystems info...") + blobs = self._client.list_blobs(self.bucket_name) + for blob in blobs: + if blob.name.endswith("all.zip"): + try: + ecosystem_name = blob.name.split("/")[-2] + if ecosystem_name.find(":") >= 0: + ecosystem_name = ecosystem_name.split(":")[0] + self.ecosystems_fetched.append(ecosystem_name) + self.download_url.append(blob.media_link) + except (ValueError, IndexError): + pass + + # remove repeating items + self.ecosystems_fetched = list(set(self.ecosystems_fetched)) + + async def get_cve_data(self): + """Returns OSV cve data to insert into db""" + await self.update_ecosystem_info() diff --git a/test/test_new_osv_source.py b/test/test_new_osv_source.py new file mode 100644 index 0000000000..c6ff4acdd1 --- /dev/null +++ b/test/test_new_osv_source.py @@ -0,0 +1,27 @@ +from test.utils import EXTERNAL_SYSTEM + +import pytest + +from cve_bin_tool.data_sources import new_osv_source +from cve_bin_tool.util import make_http_requests + + +class TestNewOSVSource: + data_source = new_osv_source.OSVDataSource() + ecosystems_url = "https://osv-vulnerabilities.storage.googleapis.com/ecosystems.txt" + + @pytest.mark.asyncio + @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") + async def test_update_ecosystem_info(self): + ecosystems_txt = make_http_requests( + "text", url=self.ecosystems_url, timeout=300 + ).strip("\n") + expected_ecosystems = set(ecosystems_txt.split("\n")) + + await self.data_source.update_ecosystem_info() + + # there may be more ecosystems fetched than provided in ecosystems.txt + assert all( + ecosystem in self.data_source.ecosystems_fetched + for ecosystem in expected_ecosystems + ) From c146c4522c7d698b4b21a7921afecb7ed5e8419e Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 14:15:56 +0500 Subject: [PATCH 02/19] feat: added file fetching to OSV datasource --- cve_bin_tool/data_sources/new_osv_source.py | 60 +++++++++++++++++---- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git 
a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 77dc1badc9..8038e45a24 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -1,38 +1,76 @@ +import asyncio +import os +import pathlib + +import aiohttp from google.auth.credentials import AnonymousCredentials # type: ignore[import-untyped] from google.cloud import storage # type: ignore[import-untyped] -from cve_bin_tool.data_sources import Data_Source +from cve_bin_tool.data_sources import DISK_LOCATION_DEFAULT, Data_Source from cve_bin_tool.log import LOGGER class OSVDataSource(Data_Source): - def __init__(self, partial_update=False, bucket_name=None): + """Slightly more memory efficient reimplementation of OSV datasource""" + + def __init__( + self, partial_update=False, bucket_name=None, max_parallel_downloads=5 + ): self._client = storage.Client(credentials=AnonymousCredentials(), project=None) - self.ecosystems_fetched = [] + self.ecosystems_fetched = set() self.download_url = [] + self._semaphore = asyncio.Semaphore(max_parallel_downloads) + self.osv_path = str(pathlib.Path(DISK_LOCATION_DEFAULT) / "osv") self.bucket_name = bucket_name or "osv-vulnerabilities" self.partial_update = partial_update + self.signed_link_expiration_time = 3600 * 2 # 2 hours async def update_ecosystem_info(self) -> None: - """Fetch OSV ecosystem information and download links""" - - LOGGER.debug("Started fetching OSV ecosystems info...") + """Fetch OSV ecosystem information and prepare download links""" + LOGGER.info("Started fetching OSV ecosystems info...") blobs = self._client.list_blobs(self.bucket_name) for blob in blobs: if blob.name.endswith("all.zip"): try: ecosystem_name = blob.name.split("/")[-2] + url = f"https://storage.googleapis.com/{self.bucket_name}/{ecosystem_name}/all.zip" + self.download_url.append(url) if ecosystem_name.find(":") >= 0: ecosystem_name = ecosystem_name.split(":")[0] - self.ecosystems_fetched.append(ecosystem_name) - self.download_url.append(blob.media_link) + self.ecosystems_fetched.add(ecosystem_name) except (ValueError, IndexError): pass - # remove repeating items - self.ecosystems_fetched = list(set(self.ecosystems_fetched)) + async def __fetch_single(self, url: str, download_to: str, session): + """ + Fetches single file while preventing downloading more than $max_parallel_downloads files simultaneously + """ + async with self._semaphore: + async with session.get(url) as response: + if response.status == 200: + try: + content = await response.read() + filename: str = f"{url.split("/")[-2]}.zip" + location = os.path.join(download_to, filename) + with open(location, "wb") as file: + file.write(content) + del content + LOGGER.debug(f"Fetched {url}") + except (ValueError, IndexError): + pass + + async def _fetch_all(self): + """Concurrently fetches all zip files from OSV""" + LOGGER.info("Started fetching OSV CVE files...") + async with aiohttp.ClientSession() as session: + tasks = [ + self.__fetch_single(url, self.osv_path, session) + for url in self.download_url + ] + await asyncio.gather(*tasks) - async def get_cve_data(self): + async def get_cve_data(self) -> None: """Returns OSV cve data to insert into db""" await self.update_ecosystem_info() + await self._fetch_all() From 0b5a52f45e06aa3493fd94c8957ed41f9909cfa0 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 15:51:34 +0500 Subject: [PATCH 03/19] feat: added zip extraction for osv --- 
cve_bin_tool/data_sources/new_osv_source.py | 23 +++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 8038e45a24..d9225fdcc4 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -1,6 +1,7 @@ import asyncio import os import pathlib +import zipfile import aiohttp from google.auth.credentials import AnonymousCredentials # type: ignore[import-untyped] @@ -28,7 +29,7 @@ def __init__( async def update_ecosystem_info(self) -> None: """Fetch OSV ecosystem information and prepare download links""" - LOGGER.info("Started fetching OSV ecosystems info...") + LOGGER.info("OSV: started fetching ecosystems info...") blobs = self._client.list_blobs(self.bucket_name) for blob in blobs: if blob.name.endswith("all.zip"): @@ -59,10 +60,12 @@ async def __fetch_single(self, url: str, download_to: str, session): LOGGER.debug(f"Fetched {url}") except (ValueError, IndexError): pass + else: + LOGGER.error(f"OSV: was not able to fetch {url}") async def _fetch_all(self): """Concurrently fetches all zip files from OSV""" - LOGGER.info("Started fetching OSV CVE files...") + LOGGER.info("OSV: started fetching OSV CVE files...") async with aiohttp.ClientSession() as session: tasks = [ self.__fetch_single(url, self.osv_path, session) @@ -70,7 +73,23 @@ async def _fetch_all(self): ] await asyncio.gather(*tasks) + async def _extract_all(self): + """Extract and delete all files in OSV cache dir""" + # .glob("zip") returns iterator so it is ok to process files in the loop + LOGGER.info("OSV: started extracting zip files...") + for file in pathlib.Path(self.osv_path).glob("*.zip"): + try: + with zipfile.ZipFile(file, "r") as zip_ref: + zip_ref.extractall(self.osv_path) + os.remove(file) + except zipfile.BadZipFile: + LOGGER.error(f"OSV: error while extracting {file}") + async def get_cve_data(self) -> None: """Returns OSV cve data to insert into db""" await self.update_ecosystem_info() await self._fetch_all() + await self._extract_all() + + # no need to keep links after download, there may be lots of them + del self.download_url From f076096dd291d3bf886bd9de469f0c9e096ae4e6 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 16:32:28 +0500 Subject: [PATCH 04/19] feat: added data formatting method --- cve_bin_tool/data_sources/new_osv_source.py | 125 +++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index d9225fdcc4..2fc3fd0f27 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -1,9 +1,11 @@ import asyncio +import json import os import pathlib import zipfile import aiohttp +from cvss import CVSS3 from google.auth.credentials import AnonymousCredentials # type: ignore[import-untyped] from google.cloud import storage # type: ignore[import-untyped] @@ -17,6 +19,7 @@ class OSVDataSource(Data_Source): def __init__( self, partial_update=False, bucket_name=None, max_parallel_downloads=5 ): + self.source_name = "OSV" self._client = storage.Client(credentials=AnonymousCredentials(), project=None) self.ecosystems_fetched = set() self.download_url = [] @@ -85,7 +88,124 @@ async def _extract_all(self): except zipfile.BadZipFile: LOGGER.error(f"OSV: error while extracting {file}") - async def get_cve_data(self) -> None: + def 
process_data_from_disk(self): + """Read data from disk and yield each instance in required format""" + for file in pathlib.Path(self.osv_path).glob("*.json"): + with open(file) as opened_file: + content = opened_file.read() + + json_data: dict = json.loads(content) + del content + + cve_id, severity, vector = ( + json_data.get("id"), + json_data.get("severity", None), + None, + ) + + severity: dict | None + if severity and "CVSS_V3" in [x["type"] for x in severity]: + try: + # Ensure CVSS vector is valid + if severity[0]["score"].endswith("/"): + cvss_data = CVSS3(severity[0]["score"][:-1]) + LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector") + else: + cvss_data = CVSS3(severity[0]["score"]) + # Now extract CVSS attributes + version = "3" + severity = cvss_data.severities()[0] + score = cvss_data.scores()[0] + vector = cvss_data.clean_vector() + + except Exception as e: + LOGGER.debug(e) + LOGGER.debug(f"{cve_id} : {severity}") + vector = None + + cve = { + "ID": cve_id, + "severity": severity if vector is not None else "unknown", + "description": json_data.get("summary", "unknown"), + "score": score if vector is not None else "unknown", # noqa + "CVSS_version": version if vector is not None else "unknown", # noqa + "CVSS_vector": vector if vector is not None else "unknown", + "last_modified": ( + json_data["modified"] + if json_data.get("modified", None) + else json_data["published"] + ), + } + + affected = None + + for package_data in json_data.get("affected", []): + package = package_data.get("package", {}) + if not package: + continue + + product = package.get("name") + vendor = ( + "unknown" # OSV Schema does not provide vendor names for packages + ) + + if product.startswith("github.com/"): + vendor = product.split("/")[-2] + product = product.split("/")[-1] + + _affected = { + "cve_id": cve_id, + "vendor": vendor, + "product": product, + "version": "*", + "versionStartIncluding": "", + "versionStartExcluding": "", + "versionEndIncluding": "", + "versionEndExcluding": "", + } + + events = None + for ranges in package_data.get("ranges", []): + if ranges["type"] == "SEMVER": + events = ranges["events"] + + if events is None and "versions" in package_data: + versions = package_data["versions"] + + if not versions: + continue + + version_affected = _affected.copy() + + version_affected["versionStartIncluding"] = versions[0] + version_affected["versionEndIncluding"] = versions[-1] + + affected = version_affected + elif events is not None: + introduced = None + fixed = None + + for event in events: + if event.get("introduced", None): + introduced = event.get("introduced") + if event.get("fixed", None): + fixed = event.get("fixed") + + if fixed is not None: + range_affected = _affected.copy() + + range_affected["versionStartIncluding"] = introduced + range_affected["versionEndExcluding"] = fixed + + fixed = None + affected = range_affected + + # delete unused json data before garbage collector does + del json_data + + yield cve, affected + + async def get_cve_data(self): """Returns OSV cve data to insert into db""" await self.update_ecosystem_info() await self._fetch_all() @@ -93,3 +213,6 @@ async def get_cve_data(self) -> None: # no need to keep links after download, there may be lots of them del self.download_url + + # to keep backwards compatibility convert iterator to list + return list(self.process_data_from_disk()), self.source_name From ec2cad4b5d5ea495b9250f99260d550b1bddd52d Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 
2025 16:37:33 +0500 Subject: [PATCH 05/19] chore: added google-cloud-storage to requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e6d8e62c47..45279ce6ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ cvss defusedxml distro filetype>=1.2.0 -gsutil +google-cloud-storage importlib_metadata>=3.6; python_version < "3.10" importlib_resources; python_version < "3.9" jinja2>=2.11.3 From 803ce56dd2fa08eef080206eedafc8509b4195dd Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 17:45:05 +0500 Subject: [PATCH 06/19] feat: replaced old OSV source --- cve_bin_tool/cli.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cve_bin_tool/cli.py b/cve_bin_tool/cli.py index bf93b0e483..cd8e936427 100644 --- a/cve_bin_tool/cli.py +++ b/cve_bin_tool/cli.py @@ -48,8 +48,8 @@ curl_source, epss_source, gad_source, + new_osv_source, nvd_source, - osv_source, purl2cpe_source, redhat_source, ) @@ -782,7 +782,9 @@ def main(argv=None): enabled_sources = [] if "OSV" not in disabled_sources: - source_osv = osv_source.OSV_Source(incremental_update=incremental_db_update) + source_osv = new_osv_source.OSVDataSource( + incremental_update=incremental_db_update + ) enabled_sources.append(source_osv) if "GAD" not in disabled_sources: From f9e7b9516c42195060bbb5a186703dee2ac193ab Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 17:45:49 +0500 Subject: [PATCH 07/19] feat: minor logging and naming improvements --- cve_bin_tool/data_sources/new_osv_source.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 2fc3fd0f27..1e15421d66 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -17,7 +17,7 @@ class OSVDataSource(Data_Source): """Slightly more memory efficient reimplementation of OSV datasource""" def __init__( - self, partial_update=False, bucket_name=None, max_parallel_downloads=5 + self, incremental_update=False, bucket_name=None, max_parallel_downloads=5 ): self.source_name = "OSV" self._client = storage.Client(credentials=AnonymousCredentials(), project=None) @@ -27,7 +27,7 @@ def __init__( self.osv_path = str(pathlib.Path(DISK_LOCATION_DEFAULT) / "osv") self.bucket_name = bucket_name or "osv-vulnerabilities" - self.partial_update = partial_update + self.incremental_update = incremental_update self.signed_link_expiration_time = 3600 * 2 # 2 hours async def update_ecosystem_info(self) -> None: @@ -61,10 +61,10 @@ async def __fetch_single(self, url: str, download_to: str, session): file.write(content) del content LOGGER.debug(f"Fetched {url}") - except (ValueError, IndexError): - pass + except (ValueError, IndexError, aiohttp.ClientPayloadError) as e: + LOGGER.warning(f"OSV: was not able to fetch {url}: {str(e)}") else: - LOGGER.error(f"OSV: was not able to fetch {url}") + LOGGER.warning(f"OSV: was not able to fetch {url}") async def _fetch_all(self): """Concurrently fetches all zip files from OSV""" @@ -82,11 +82,14 @@ async def _extract_all(self): LOGGER.info("OSV: started extracting zip files...") for file in pathlib.Path(self.osv_path).glob("*.zip"): try: + LOGGER.info(f"OSV: extracting: {file}") with zipfile.ZipFile(file, "r") as zip_ref: zip_ref.extractall(self.osv_path) - os.remove(file) except 
zipfile.BadZipFile: - LOGGER.error(f"OSV: error while extracting {file}") + LOGGER.warning(f"OSV: error while extracting {file}") + finally: + os.remove(file) + await asyncio.sleep(0.5) def process_data_from_disk(self): """Read data from disk and yield each instance in required format""" From d3a1e927cbba1f5e18a85b145bd8b27959e85665 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 19:58:17 +0500 Subject: [PATCH 08/19] feat: decompressed small files in memory --- cve_bin_tool/data_sources/new_osv_source.py | 32 ++++++++++++++++----- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 1e15421d66..2db9a895c3 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -1,4 +1,5 @@ import asyncio +import io import json import os import pathlib @@ -55,12 +56,25 @@ async def __fetch_single(self, url: str, download_to: str, session): if response.status == 200: try: content = await response.read() - filename: str = f"{url.split("/")[-2]}.zip" - location = os.path.join(download_to, filename) - with open(location, "wb") as file: - file.write(content) + content_size_mb = len(content) / (1024 * 1024) + + # if file is more than 50 MB download it to disk + if content_size_mb > 50: + filename: str = f"{url.split("/")[-2]}.zip" + location = os.path.join(download_to, filename) + with open(location, "wb") as file: + file.write(content) + LOGGER.debug(f"OSV: fetched {url}") + else: + file = io.BytesIO(content) + zip_file = zipfile.ZipFile(file) + zip_file.extractall(self.osv_path) + del file + del zip_file + LOGGER.debug(f"OSV: fetched and unzipped {url}") + del content - LOGGER.debug(f"Fetched {url}") + del content_size_mb except (ValueError, IndexError, aiohttp.ClientPayloadError) as e: LOGGER.warning(f"OSV: was not able to fetch {url}: {str(e)}") else: @@ -217,5 +231,9 @@ async def get_cve_data(self): # no need to keep links after download, there may be lots of them del self.download_url - # to keep backwards compatibility convert iterator to list - return list(self.process_data_from_disk()), self.source_name + severity_data, affected_data = [], [] + for cve, affected in self.process_data_from_disk(): + severity_data.append(cve) + affected_data.append(affected) + + return (severity_data, affected_data), self.source_name From a240bda8bff8894e03c83c406a6c112efbbd1789 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Mon, 17 Mar 2025 21:09:39 +0500 Subject: [PATCH 09/19] feat: minor code improvements --- cve_bin_tool/data_sources/new_osv_source.py | 101 +++++++++++--------- 1 file changed, 55 insertions(+), 46 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 2db9a895c3..f22dbde822 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -15,7 +15,7 @@ class OSVDataSource(Data_Source): - """Slightly more memory efficient reimplementation of OSV datasource""" + """A slightly more memory-efficient reimplementation of the OSV data source.""" def __init__( self, incremental_update=False, bucket_name=None, max_parallel_downloads=5 @@ -32,8 +32,8 @@ def __init__( self.signed_link_expiration_time = 3600 * 2 # 2 hours async def update_ecosystem_info(self) -> None: - """Fetch OSV ecosystem information and prepare download links""" - LOGGER.info("OSV: 
started fetching ecosystems info...") + """Fetch OSV ecosystem information and prepare download links.""" + LOGGER.info("OSV: Started fetching ecosystem info...") blobs = self._client.list_blobs(self.bucket_name) for blob in blobs: if blob.name.endswith("all.zip"): @@ -41,6 +41,7 @@ async def update_ecosystem_info(self) -> None: ecosystem_name = blob.name.split("/")[-2] url = f"https://storage.googleapis.com/{self.bucket_name}/{ecosystem_name}/all.zip" self.download_url.append(url) + LOGGER.debug(f"OSV: Download link for {ecosystem_name} added.") if ecosystem_name.find(":") >= 0: ecosystem_name = ecosystem_name.split(":")[0] self.ecosystems_fetched.add(ecosystem_name) @@ -49,40 +50,48 @@ async def update_ecosystem_info(self) -> None: async def __fetch_single(self, url: str, download_to: str, session): """ - Fetches single file while preventing downloading more than $max_parallel_downloads files simultaneously + Fetches a single file while preventing downloading more than $max_parallel_downloads files simultaneously. """ async with self._semaphore: - async with session.get(url) as response: - if response.status == 200: - try: - content = await response.read() - content_size_mb = len(content) / (1024 * 1024) - - # if file is more than 50 MB download it to disk - if content_size_mb > 50: - filename: str = f"{url.split("/")[-2]}.zip" - location = os.path.join(download_to, filename) - with open(location, "wb") as file: - file.write(content) - LOGGER.debug(f"OSV: fetched {url}") - else: - file = io.BytesIO(content) - zip_file = zipfile.ZipFile(file) - zip_file.extractall(self.osv_path) - del file - del zip_file - LOGGER.debug(f"OSV: fetched and unzipped {url}") - - del content - del content_size_mb - except (ValueError, IndexError, aiohttp.ClientPayloadError) as e: - LOGGER.warning(f"OSV: was not able to fetch {url}: {str(e)}") - else: - LOGGER.warning(f"OSV: was not able to fetch {url}") + try: + async with session.get(url, timeout=120) as response: + if response.status == 200: + try: + content: bytes = await response.read() + content_size_mb = len(content) / (1024 * 1024) + + # If the file is more than 512 MB, download it to disk + if content_size_mb > 512: + _fname = url.split("/")[-2] + filename: str = f"{_fname}.zip" + location = os.path.join(download_to, filename) + with open(location, "wb") as file: + file.write(content) + LOGGER.debug(f"OSV: Fetched {url}.") + else: + in_memory_file: io.BytesIO = io.BytesIO(content) + zip_file = zipfile.ZipFile(in_memory_file) + zip_file.extractall(self.osv_path) + del in_memory_file + del zip_file + LOGGER.debug(f"OSV: Fetched and unzipped {url}.") + + del content + del content_size_mb + except ( + ValueError, + IndexError, + aiohttp.ClientPayloadError, + ) as e: + LOGGER.warning(f"OSV: Unable to fetch {url}: {str(e)}") + else: + LOGGER.warning(f"OSV: Unable to fetch {url}.") + except (TimeoutError, asyncio.TimeoutError): + LOGGER.warning(f"OSV: Timeout error while fetching {url}.") async def _fetch_all(self): - """Concurrently fetches all zip files from OSV""" - LOGGER.info("OSV: started fetching OSV CVE files...") + """Concurrently fetch all zip files from OSV.""" + LOGGER.info("OSV: Started fetching OSV CVE files...") async with aiohttp.ClientSession() as session: tasks = [ self.__fetch_single(url, self.osv_path, session) @@ -91,27 +100,27 @@ async def _fetch_all(self): await asyncio.gather(*tasks) async def _extract_all(self): - """Extract and delete all files in OSV cache dir""" - # .glob("zip") returns iterator so it is ok to process files in the 
loop - LOGGER.info("OSV: started extracting zip files...") + """Extract and delete all files in the OSV cache directory.""" + # .glob("zip") returns an iterator, so it is okay to process files in the loop + LOGGER.info("OSV: Started extracting zip files...") for file in pathlib.Path(self.osv_path).glob("*.zip"): try: - LOGGER.info(f"OSV: extracting: {file}") + LOGGER.debug(f"OSV: Extracting {file}") with zipfile.ZipFile(file, "r") as zip_ref: zip_ref.extractall(self.osv_path) except zipfile.BadZipFile: - LOGGER.warning(f"OSV: error while extracting {file}") + LOGGER.warning(f"OSV: Error while extracting {file}.") finally: os.remove(file) await asyncio.sleep(0.5) def process_data_from_disk(self): - """Read data from disk and yield each instance in required format""" + """Read data from disk and yield each instance in the required format.""" for file in pathlib.Path(self.osv_path).glob("*.json"): with open(file) as opened_file: content = opened_file.read() - json_data: dict = json.loads(content) + json_data: dict = json.loads(content) # type: ignore del content cve_id, severity, vector = ( @@ -120,13 +129,13 @@ def process_data_from_disk(self): None, ) - severity: dict | None + severity: dict | None # type: ignore if severity and "CVSS_V3" in [x["type"] for x in severity]: try: - # Ensure CVSS vector is valid + # Ensure the CVSS vector is valid if severity[0]["score"].endswith("/"): cvss_data = CVSS3(severity[0]["score"][:-1]) - LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector") + LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector.") else: cvss_data = CVSS3(severity[0]["score"]) # Now extract CVSS attributes @@ -217,18 +226,18 @@ def process_data_from_disk(self): fixed = None affected = range_affected - # delete unused json data before garbage collector does + # Delete unused json data before the garbage collector does. del json_data yield cve, affected async def get_cve_data(self): - """Returns OSV cve data to insert into db""" + """Returns OSV CVE data to insert into the database.""" await self.update_ecosystem_info() await self._fetch_all() await self._extract_all() - # no need to keep links after download, there may be lots of them + # No need to keep links after download, as there may be a lot of them. 
del self.download_url severity_data, affected_data = [], [] From 3b403b4d51030e0c0667caa446fe91e6016b12ba Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 02:52:35 +0500 Subject: [PATCH 10/19] test: adapted tests for new data source --- test/test_new_osv_source.py | 27 ----------------- test/test_source_osv.py | 58 +++++++++++++++---------------------- 2 files changed, 23 insertions(+), 62 deletions(-) delete mode 100644 test/test_new_osv_source.py diff --git a/test/test_new_osv_source.py b/test/test_new_osv_source.py deleted file mode 100644 index c6ff4acdd1..0000000000 --- a/test/test_new_osv_source.py +++ /dev/null @@ -1,27 +0,0 @@ -from test.utils import EXTERNAL_SYSTEM - -import pytest - -from cve_bin_tool.data_sources import new_osv_source -from cve_bin_tool.util import make_http_requests - - -class TestNewOSVSource: - data_source = new_osv_source.OSVDataSource() - ecosystems_url = "https://osv-vulnerabilities.storage.googleapis.com/ecosystems.txt" - - @pytest.mark.asyncio - @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") - async def test_update_ecosystem_info(self): - ecosystems_txt = make_http_requests( - "text", url=self.ecosystems_url, timeout=300 - ).strip("\n") - expected_ecosystems = set(ecosystems_txt.split("\n")) - - await self.data_source.update_ecosystem_info() - - # there may be more ecosystems fetched than provided in ecosystems.txt - assert all( - ecosystem in self.data_source.ecosystems_fetched - for ecosystem in expected_ecosystems - ) diff --git a/test/test_source_osv.py b/test/test_source_osv.py index 9bb105b0ce..44643ab37d 100644 --- a/test/test_source_osv.py +++ b/test/test_source_osv.py @@ -2,24 +2,22 @@ # SPDX-License-Identifier: GPL-3.0-or-later -import io import shutil import tempfile -import zipfile from pathlib import Path from test.utils import EXTERNAL_SYSTEM import aiohttp import pytest -from cve_bin_tool.data_sources import osv_source +from cve_bin_tool.data_sources import new_osv_source from cve_bin_tool.util import make_http_requests class TestSourceOSV: @classmethod def setup_class(cls): - cls.osv = osv_source.OSV_Source() + cls.osv = new_osv_source.OSVDataSource() cls.osv.cachedir = tempfile.mkdtemp(prefix="cvedb-") cls.osv.osv_path = str(Path(cls.osv.cachedir) / "osv") @@ -168,37 +166,18 @@ def teardown_class(cls): @pytest.mark.asyncio @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") async def test_update_ecosystems(self): - await self.osv.update_ecosystems() - ecosystems_txt = make_http_requests( "text", url=self.ecosystems_url, timeout=300 ).strip("\n") expected_ecosystems = set(ecosystems_txt.split("\n")) - # Because ecosystems.txt does not contain the complete list, this must be - # manually fixed up. 
- expected_ecosystems.add("DWF") - expected_ecosystems.add("JavaScript") - - # Assert that there are no missing ecosystems - assert all(x in self.osv.ecosystems for x in expected_ecosystems) - # Assert that there are no extra ecosystems - assert all(x in expected_ecosystems for x in self.osv.ecosystems) - - @pytest.mark.asyncio - @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") - @pytest.mark.parametrize("ecosystem_url", [url for url in cve_file_data]) - async def test_get_ecosystem_00(self, ecosystem_url): - connector = aiohttp.TCPConnector(limit_per_host=19) - async with aiohttp.ClientSession( - connector=connector, trust_env=True - ) as session: - content = await self.osv.get_ecosystem(ecosystem_url, session) - - cve_data = self.cve_file_data[ecosystem_url] + await self.osv.update_ecosystem_info() - assert content["id"] == cve_data["id"] - assert content["published"] == cve_data["published"] + # there may be more ecosystems fetched than provided in ecosystems.txt + assert all( + ecosystem in self.osv.ecosystems_fetched + for ecosystem in expected_ecosystems + ) @pytest.mark.asyncio @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") @@ -209,19 +188,24 @@ async def test_get_ecosystem_01(self): async with aiohttp.ClientSession( connector=connector, trust_env=True ) as session: - content = await self.osv.get_ecosystem(eco_url, session, mode="bytes") + await self.osv.fetch_single(eco_url, self.osv.osv_path, session) - z = zipfile.ZipFile(io.BytesIO(content)) + p = Path(self.osv.osv_path).glob("**/*") + files = [x.name for x in p if x.is_file()] # Shouldn't be any files as DWF is no longer a valid ecosystems - assert len(z.namelist()) == 0 + assert len(files) == 0 @pytest.mark.asyncio @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") async def test_fetch_cves(self): - self.osv.ecosystems = ["PyPI"] + ecosystem_name = "PyPI" + self.osv.ecosystems_fetched = [ecosystem_name] + self.osv.download_url = [ + f"https://storage.googleapis.com/osv-vulnerabilities/{ecosystem_name}/all.zip" + ] - await self.osv.fetch_cves() + await self.osv.fetch_all() p = Path(self.osv.osv_path).glob("**/*") files = [x.name for x in p if x.is_file()] @@ -234,7 +218,11 @@ async def test_fetch_cves(self): @pytest.mark.parametrize("cve_entries", [[x] for _, x in cve_file_data.items()]) def test_format_data(self, cve_entries): - severity_data, affected_data = self.osv.format_data(cve_entries) + severity_data, affected_data = [], [] + for cve_entry in cve_entries: + severity, affected = self.osv.get_formatted_data_from_json(cve_entry) + severity_data.append(severity) + affected_data.append(affected) severity_data = severity_data[0] From 7aeca1b70e70701323db632a74ee3f115615154d Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 02:57:46 +0500 Subject: [PATCH 11/19] refactor: moved json parsing to separate function --- cve_bin_tool/data_sources/new_osv_source.py | 213 ++++++++++---------- 1 file changed, 107 insertions(+), 106 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index f22dbde822..47794a915a 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -48,7 +48,7 @@ async def update_ecosystem_info(self) -> None: except (ValueError, IndexError): pass - async def __fetch_single(self, url: str, download_to: str, session): + async def fetch_single(self, url: str, 
download_to: str, session): """ Fetches a single file while preventing downloading more than $max_parallel_downloads files simultaneously. """ @@ -71,7 +71,7 @@ async def __fetch_single(self, url: str, download_to: str, session): else: in_memory_file: io.BytesIO = io.BytesIO(content) zip_file = zipfile.ZipFile(in_memory_file) - zip_file.extractall(self.osv_path) + zip_file.extractall(download_to) del in_memory_file del zip_file LOGGER.debug(f"OSV: Fetched and unzipped {url}.") @@ -89,17 +89,17 @@ async def __fetch_single(self, url: str, download_to: str, session): except (TimeoutError, asyncio.TimeoutError): LOGGER.warning(f"OSV: Timeout error while fetching {url}.") - async def _fetch_all(self): + async def fetch_all(self): """Concurrently fetch all zip files from OSV.""" LOGGER.info("OSV: Started fetching OSV CVE files...") async with aiohttp.ClientSession() as session: tasks = [ - self.__fetch_single(url, self.osv_path, session) + self.fetch_single(url, self.osv_path, session) for url in self.download_url ] await asyncio.gather(*tasks) - async def _extract_all(self): + async def extract_all(self): """Extract and delete all files in the OSV cache directory.""" # .glob("zip") returns an iterator, so it is okay to process files in the loop LOGGER.info("OSV: Started extracting zip files...") @@ -114,128 +114,129 @@ async def _extract_all(self): os.remove(file) await asyncio.sleep(0.5) - def process_data_from_disk(self): - """Read data from disk and yield each instance in the required format.""" - for file in pathlib.Path(self.osv_path).glob("*.json"): - with open(file) as opened_file: - content = opened_file.read() + @staticmethod + def get_formatted_data_from_json(content: dict): + cve_id, severity, vector = ( + content.get("id"), + content.get("severity", None), + None, + ) - json_data: dict = json.loads(content) # type: ignore - del content - - cve_id, severity, vector = ( - json_data.get("id"), - json_data.get("severity", None), - None, - ) - - severity: dict | None # type: ignore - if severity and "CVSS_V3" in [x["type"] for x in severity]: - try: - # Ensure the CVSS vector is valid - if severity[0]["score"].endswith("/"): - cvss_data = CVSS3(severity[0]["score"][:-1]) - LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector.") - else: - cvss_data = CVSS3(severity[0]["score"]) - # Now extract CVSS attributes - version = "3" - severity = cvss_data.severities()[0] - score = cvss_data.scores()[0] - vector = cvss_data.clean_vector() - - except Exception as e: - LOGGER.debug(e) - LOGGER.debug(f"{cve_id} : {severity}") - vector = None - - cve = { - "ID": cve_id, - "severity": severity if vector is not None else "unknown", - "description": json_data.get("summary", "unknown"), - "score": score if vector is not None else "unknown", # noqa - "CVSS_version": version if vector is not None else "unknown", # noqa - "CVSS_vector": vector if vector is not None else "unknown", - "last_modified": ( - json_data["modified"] - if json_data.get("modified", None) - else json_data["published"] - ), + severity: dict | None # type: ignore + if severity and "CVSS_V3" in [x["type"] for x in severity]: + try: + # Ensure the CVSS vector is valid + if severity[0]["score"].endswith("/"): + cvss_data = CVSS3(severity[0]["score"][:-1]) + LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector.") + else: + cvss_data = CVSS3(severity[0]["score"]) + # Now extract CVSS attributes + version = "3" + severity = cvss_data.severities()[0] + score = cvss_data.scores()[0] + vector = cvss_data.clean_vector() + + except 
Exception as e: + LOGGER.debug(e) + LOGGER.debug(f"{cve_id} : {severity}") + vector = None + + cve = { + "ID": cve_id, + "severity": severity if vector is not None else "unknown", + "description": content.get("summary", "unknown"), + "score": score if vector is not None else "unknown", # noqa + "CVSS_version": version if vector is not None else "unknown", # noqa + "CVSS_vector": vector if vector is not None else "unknown", + "last_modified": ( + content["modified"] + if content.get("modified", None) + else content["published"] + ), + } + + affected = None + + for package_data in content.get("affected", []): + package = package_data.get("package", {}) + if not package: + continue + + product = package.get("name") + vendor = "unknown" # OSV Schema does not provide vendor names for packages + + if product.startswith("github.com/"): + vendor = product.split("/")[-2] + product = product.split("/")[-1] + + _affected = { + "cve_id": cve_id, + "vendor": vendor, + "product": product, + "version": "*", + "versionStartIncluding": "", + "versionStartExcluding": "", + "versionEndIncluding": "", + "versionEndExcluding": "", } - affected = None + events = None + for ranges in package_data.get("ranges", []): + if ranges["type"] == "SEMVER": + events = ranges["events"] - for package_data in json_data.get("affected", []): - package = package_data.get("package", {}) - if not package: - continue + if events is None and "versions" in package_data: + versions = package_data["versions"] - product = package.get("name") - vendor = ( - "unknown" # OSV Schema does not provide vendor names for packages - ) - - if product.startswith("github.com/"): - vendor = product.split("/")[-2] - product = product.split("/")[-1] - - _affected = { - "cve_id": cve_id, - "vendor": vendor, - "product": product, - "version": "*", - "versionStartIncluding": "", - "versionStartExcluding": "", - "versionEndIncluding": "", - "versionEndExcluding": "", - } - - events = None - for ranges in package_data.get("ranges", []): - if ranges["type"] == "SEMVER": - events = ranges["events"] + if not versions: + continue - if events is None and "versions" in package_data: - versions = package_data["versions"] + version_affected = _affected.copy() - if not versions: - continue + version_affected["versionStartIncluding"] = versions[0] + version_affected["versionEndIncluding"] = versions[-1] - version_affected = _affected.copy() + affected = version_affected + elif events is not None: + introduced = None + fixed = None - version_affected["versionStartIncluding"] = versions[0] - version_affected["versionEndIncluding"] = versions[-1] + for event in events: + if event.get("introduced", None): + introduced = event.get("introduced") + if event.get("fixed", None): + fixed = event.get("fixed") - affected = version_affected - elif events is not None: - introduced = None - fixed = None + if fixed is not None: + range_affected = _affected.copy() - for event in events: - if event.get("introduced", None): - introduced = event.get("introduced") - if event.get("fixed", None): - fixed = event.get("fixed") + range_affected["versionStartIncluding"] = introduced + range_affected["versionEndExcluding"] = fixed - if fixed is not None: - range_affected = _affected.copy() + fixed = None + affected = range_affected - range_affected["versionStartIncluding"] = introduced - range_affected["versionEndExcluding"] = fixed + return cve, affected - fixed = None - affected = range_affected + def process_data_from_disk(self): + """Read data from disk and yield each instance in the 
required format.""" + for file in pathlib.Path(self.osv_path).glob("*.json"): + with open(file) as opened_file: + content = opened_file.read() + json_data: dict = json.loads(content) # type: ignore + del content + data = self.get_formatted_data_from_json(json_data) # Delete unused json data before the garbage collector does. del json_data - - yield cve, affected + yield data async def get_cve_data(self): """Returns OSV CVE data to insert into the database.""" await self.update_ecosystem_info() - await self._fetch_all() - await self._extract_all() + await self.fetch_all() + await self.extract_all() # No need to keep links after download, as there may be a lot of them. del self.download_url From 77dd8e52f4a735ed1841d15049cef1c8790a571d Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 03:03:49 +0500 Subject: [PATCH 12/19] docs: added some comments --- cve_bin_tool/data_sources/new_osv_source.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 47794a915a..ed56500c21 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -42,6 +42,7 @@ async def update_ecosystem_info(self) -> None: url = f"https://storage.googleapis.com/{self.bucket_name}/{ecosystem_name}/all.zip" self.download_url.append(url) LOGGER.debug(f"OSV: Download link for {ecosystem_name} added.") + # exclude ecosystem versions from appending, e.g. Debian:10 should not be included if ecosystem_name.find(":") >= 0: ecosystem_name = ecosystem_name.split(":")[0] self.ecosystems_fetched.add(ecosystem_name) @@ -101,7 +102,7 @@ async def fetch_all(self): async def extract_all(self): """Extract and delete all files in the OSV cache directory.""" - # .glob("zip") returns an iterator, so it is okay to process files in the loop + # .glob("zip") returns an iterator, so it is memory efficient to process files in the loop LOGGER.info("OSV: Started extracting zip files...") for file in pathlib.Path(self.osv_path).glob("*.zip"): try: @@ -226,10 +227,11 @@ def process_data_from_disk(self): content = opened_file.read() json_data: dict = json.loads(content) # type: ignore - del content data = self.get_formatted_data_from_json(json_data) # Delete unused json data before the garbage collector does. 
del json_data + del content + yield data async def get_cve_data(self): From 38fd375ad369472df05a61d0fc3e1644fb6be5d1 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 03:05:59 +0500 Subject: [PATCH 13/19] refactor: useless code line removed --- cve_bin_tool/data_sources/new_osv_source.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index ed56500c21..a3bcd5362d 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -29,7 +29,6 @@ def __init__( self.bucket_name = bucket_name or "osv-vulnerabilities" self.incremental_update = incremental_update - self.signed_link_expiration_time = 3600 * 2 # 2 hours async def update_ecosystem_info(self) -> None: """Fetch OSV ecosystem information and prepare download links.""" From 8fc8aa77f43251fe041059d53c6ce4481e345f60 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 03:10:09 +0500 Subject: [PATCH 14/19] docs: docstring changed --- cve_bin_tool/data_sources/new_osv_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index a3bcd5362d..45ea4640eb 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -15,7 +15,7 @@ class OSVDataSource(Data_Source): - """A slightly more memory-efficient reimplementation of the OSV data source.""" + """Reimplementation of the OSV data source.""" def __init__( self, incremental_update=False, bucket_name=None, max_parallel_downloads=5 From 3f29e1b3893999d2ea00deb10b2d485044af230e Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 12:34:39 +0500 Subject: [PATCH 15/19] fix: ignored None values --- cve_bin_tool/data_sources/new_osv_source.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 45ea4640eb..fb3bb45a85 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -244,7 +244,9 @@ async def get_cve_data(self): severity_data, affected_data = [], [] for cve, affected in self.process_data_from_disk(): - severity_data.append(cve) - affected_data.append(affected) + if cve: + severity_data.append(cve) + if affected: + affected_data.append(affected) return (severity_data, affected_data), self.source_name From 28eb7acd2fcc69ba2a03e3d1d1f10c1e0643ccb7 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:02:38 +0500 Subject: [PATCH 16/19] chore: updated requirements --- requirements.csv | 2 +- test/language_data/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.csv b/requirements.csv index 3ad34e857e..dff46f5cec 100644 --- a/requirements.csv +++ b/requirements.csv @@ -15,7 +15,7 @@ sissaschool_not_in_db,xmlschema python_not_in_db,importlib_metadata python,requests python,urllib3 -google,gsutil +google,google-cloud-storage skontar,cvss python_not_in_db,packaging python_not_in_db,importlib_resources diff --git a/test/language_data/requirements.txt b/test/language_data/requirements.txt index 1d4aa9a090..2077ff76e5 100644 --- a/test/language_data/requirements.txt +++ b/test/language_data/requirements.txt @@ 
-14,6 +14,6 @@ xmlschema importlib_metadata; python_version < "3.8" requests urllib3>=1.26.5 # dependency of requests added explictly to avoid CVEs -gsutil +google-cloud-storage cvss packaging From 79a3d8198fe3d2a90ba9df1940e7da21803fd6ec Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 20:10:05 +0500 Subject: [PATCH 17/19] Revert "chore: updated requirements" This reverts commit 28eb7acd2fcc69ba2a03e3d1d1f10c1e0643ccb7. --- requirements.csv | 2 +- test/language_data/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.csv b/requirements.csv index dff46f5cec..3ad34e857e 100644 --- a/requirements.csv +++ b/requirements.csv @@ -15,7 +15,7 @@ sissaschool_not_in_db,xmlschema python_not_in_db,importlib_metadata python,requests python,urllib3 -google,google-cloud-storage +google,gsutil skontar,cvss python_not_in_db,packaging python_not_in_db,importlib_resources diff --git a/test/language_data/requirements.txt b/test/language_data/requirements.txt index 2077ff76e5..1d4aa9a090 100644 --- a/test/language_data/requirements.txt +++ b/test/language_data/requirements.txt @@ -14,6 +14,6 @@ xmlschema importlib_metadata; python_version < "3.8" requests urllib3>=1.26.5 # dependency of requests added explictly to avoid CVEs -google-cloud-storage +gsutil cvss packaging From 256009ebd050bdff576f138e55a92859dc06db17 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 20 Mar 2025 01:26:12 +0500 Subject: [PATCH 18/19] chore: updated --- requirements.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.csv b/requirements.csv index 3ad34e857e..dff46f5cec 100644 --- a/requirements.csv +++ b/requirements.csv @@ -15,7 +15,7 @@ sissaschool_not_in_db,xmlschema python_not_in_db,importlib_metadata python,requests python,urllib3 -google,gsutil +google,google-cloud-storage skontar,cvss python_not_in_db,packaging python_not_in_db,importlib_resources From e43d7457d3ce1a98b389f5ef7dc9ea4cdcf0bc82 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Sun, 23 Mar 2025 23:56:15 +0500 Subject: [PATCH 19/19] refactor: removed unused field --- cve_bin_tool/cli.py | 4 +--- cve_bin_tool/data_sources/new_osv_source.py | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cve_bin_tool/cli.py b/cve_bin_tool/cli.py index 2f77c4888a..a4c78d40dd 100644 --- a/cve_bin_tool/cli.py +++ b/cve_bin_tool/cli.py @@ -782,9 +782,7 @@ def main(argv=None): enabled_sources = [] if "OSV" not in disabled_sources: - source_osv = new_osv_source.OSVDataSource( - incremental_update=incremental_db_update - ) + source_osv = new_osv_source.OSVDataSource() enabled_sources.append(source_osv) if "GAD" not in disabled_sources: diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index fb3bb45a85..0d71450d4b 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -17,9 +17,7 @@ class OSVDataSource(Data_Source): """Reimplementation of the OSV data source.""" - def __init__( - self, incremental_update=False, bucket_name=None, max_parallel_downloads=5 - ): + def __init__(self, bucket_name=None, max_parallel_downloads=5): self.source_name = "OSV" self._client = storage.Client(credentials=AnonymousCredentials(), project=None) self.ecosystems_fetched = set() @@ -28,7 +26,6 @@ def __init__( self.osv_path = str(pathlib.Path(DISK_LOCATION_DEFAULT) / "osv") 
self.bucket_name = bucket_name or "osv-vulnerabilities" - self.incremental_update = incremental_update async def update_ecosystem_info(self) -> None: """Fetch OSV ecosystem information and prepare download links."""
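
A quick offline sanity check of the parsing step introduced in this series (a sketch only; the OSV record below is invented for illustration and is not real advisory data): the get_formatted_data_from_json helper added in PATCH 11 can be exercised directly, without the Google Cloud Storage client or any network access, by feeding it a minimal OSV-style dictionary. Module path and function name are taken from the patches above; everything else here is assumed sample input.

# Illustrative sketch: exercises OSVDataSource.get_formatted_data_from_json
# with a hand-written, hypothetical OSV record (no network access needed).
from cve_bin_tool.data_sources.new_osv_source import OSVDataSource

sample_record = {
    "id": "GHSA-xxxx-xxxx-xxxx",  # hypothetical identifier
    "summary": "Example vulnerability in examplepkg",
    "published": "2025-01-01T00:00:00Z",
    "modified": "2025-02-01T00:00:00Z",
    "severity": [
        {
            "type": "CVSS_V3",
            "score": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H",
        }
    ],
    "affected": [
        {
            "package": {"ecosystem": "PyPI", "name": "examplepkg"},
            "ranges": [
                {
                    "type": "SEMVER",
                    "events": [{"introduced": "0"}, {"fixed": "1.2.3"}],
                }
            ],
        }
    ],
}

cve, affected = OSVDataSource.get_formatted_data_from_json(sample_record)

# This CVSS vector evaluates to severity "Critical" with a base score of 9.8.
print(cve["ID"], cve["severity"], cve["score"], cve["CVSS_vector"])
# The SEMVER range maps to an inclusive introduced version and an exclusive fixed version.
print(affected["product"], affected["versionStartIncluding"], affected["versionEndExcluding"])

The full network path remains the one wired into cli.py: awaiting OSVDataSource().get_cve_data() lists the per-ecosystem all.zip archives in the osv-vulnerabilities bucket anonymously, downloads and extracts them, and returns (severity_data, affected_data) together with the source name "OSV" for insertion into the database.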