From 1a2eff35bf7a27dc37b1c867c2cf83630c7cb31a Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 12 Mar 2025 21:29:32 +0500 Subject: [PATCH 01/19] feat: added new OSV data source class initial implementation --- cve_bin_tool/data_sources/new_osv_source.py | 38 +++++++++++++++++++++ test/test_new_osv_source.py | 27 +++++++++++++++ 2 files changed, 65 insertions(+) create mode 100644 cve_bin_tool/data_sources/new_osv_source.py create mode 100644 test/test_new_osv_source.py diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py new file mode 100644 index 0000000000..77dc1badc9 --- /dev/null +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -0,0 +1,38 @@ +from google.auth.credentials import AnonymousCredentials # type: ignore[import-untyped] +from google.cloud import storage # type: ignore[import-untyped] + +from cve_bin_tool.data_sources import Data_Source +from cve_bin_tool.log import LOGGER + + +class OSVDataSource(Data_Source): + def __init__(self, partial_update=False, bucket_name=None): + self._client = storage.Client(credentials=AnonymousCredentials(), project=None) + self.ecosystems_fetched = [] + self.download_url = [] + + self.bucket_name = bucket_name or "osv-vulnerabilities" + self.partial_update = partial_update + + async def update_ecosystem_info(self) -> None: + """Fetch OSV ecosystem information and download links""" + + LOGGER.debug("Started fetching OSV ecosystems info...") + blobs = self._client.list_blobs(self.bucket_name) + for blob in blobs: + if blob.name.endswith("all.zip"): + try: + ecosystem_name = blob.name.split("/")[-2] + if ecosystem_name.find(":") >= 0: + ecosystem_name = ecosystem_name.split(":")[0] + self.ecosystems_fetched.append(ecosystem_name) + self.download_url.append(blob.media_link) + except (ValueError, IndexError): + pass + + # remove repeating items + self.ecosystems_fetched = list(set(self.ecosystems_fetched)) + + async def get_cve_data(self): + """Returns OSV cve data to insert into db""" + await self.update_ecosystem_info() diff --git a/test/test_new_osv_source.py b/test/test_new_osv_source.py new file mode 100644 index 0000000000..c6ff4acdd1 --- /dev/null +++ b/test/test_new_osv_source.py @@ -0,0 +1,27 @@ +from test.utils import EXTERNAL_SYSTEM + +import pytest + +from cve_bin_tool.data_sources import new_osv_source +from cve_bin_tool.util import make_http_requests + + +class TestNewOSVSource: + data_source = new_osv_source.OSVDataSource() + ecosystems_url = "https://osv-vulnerabilities.storage.googleapis.com/ecosystems.txt" + + @pytest.mark.asyncio + @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") + async def test_update_ecosystem_info(self): + ecosystems_txt = make_http_requests( + "text", url=self.ecosystems_url, timeout=300 + ).strip("\n") + expected_ecosystems = set(ecosystems_txt.split("\n")) + + await self.data_source.update_ecosystem_info() + + # there may be more ecosystems fetched than provided in ecosystems.txt + assert all( + ecosystem in self.data_source.ecosystems_fetched + for ecosystem in expected_ecosystems + ) From c146c4522c7d698b4b21a7921afecb7ed5e8419e Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 14:15:56 +0500 Subject: [PATCH 02/19] feat: added file fetching to OSV datasource --- cve_bin_tool/data_sources/new_osv_source.py | 60 +++++++++++++++++---- 1 file changed, 49 insertions(+), 11 deletions(-) diff --git 
a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 77dc1badc9..8038e45a24 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -1,38 +1,76 @@ +import asyncio +import os +import pathlib + +import aiohttp from google.auth.credentials import AnonymousCredentials # type: ignore[import-untyped] from google.cloud import storage # type: ignore[import-untyped] -from cve_bin_tool.data_sources import Data_Source +from cve_bin_tool.data_sources import DISK_LOCATION_DEFAULT, Data_Source from cve_bin_tool.log import LOGGER class OSVDataSource(Data_Source): - def __init__(self, partial_update=False, bucket_name=None): + """Slightly more memory efficient reimplementation of OSV datasource""" + + def __init__( + self, partial_update=False, bucket_name=None, max_parallel_downloads=5 + ): self._client = storage.Client(credentials=AnonymousCredentials(), project=None) - self.ecosystems_fetched = [] + self.ecosystems_fetched = set() self.download_url = [] + self._semaphore = asyncio.Semaphore(max_parallel_downloads) + self.osv_path = str(pathlib.Path(DISK_LOCATION_DEFAULT) / "osv") self.bucket_name = bucket_name or "osv-vulnerabilities" self.partial_update = partial_update + self.signed_link_expiration_time = 3600 * 2 # 2 hours async def update_ecosystem_info(self) -> None: - """Fetch OSV ecosystem information and download links""" - - LOGGER.debug("Started fetching OSV ecosystems info...") + """Fetch OSV ecosystem information and prepare download links""" + LOGGER.info("Started fetching OSV ecosystems info...") blobs = self._client.list_blobs(self.bucket_name) for blob in blobs: if blob.name.endswith("all.zip"): try: ecosystem_name = blob.name.split("/")[-2] + url = f"https://storage.googleapis.com/{self.bucket_name}/{ecosystem_name}/all.zip" + self.download_url.append(url) if ecosystem_name.find(":") >= 0: ecosystem_name = ecosystem_name.split(":")[0] - self.ecosystems_fetched.append(ecosystem_name) - self.download_url.append(blob.media_link) + self.ecosystems_fetched.add(ecosystem_name) except (ValueError, IndexError): pass - # remove repeating items - self.ecosystems_fetched = list(set(self.ecosystems_fetched)) + async def __fetch_single(self, url: str, download_to: str, session): + """ + Fetches single file while preventing downloading more than $max_parallel_downloads files simultaneously + """ + async with self._semaphore: + async with session.get(url) as response: + if response.status == 200: + try: + content = await response.read() + filename: str = f"{url.split("/")[-2]}.zip" + location = os.path.join(download_to, filename) + with open(location, "wb") as file: + file.write(content) + del content + LOGGER.debug(f"Fetched {url}") + except (ValueError, IndexError): + pass + + async def _fetch_all(self): + """Concurrently fetches all zip files from OSV""" + LOGGER.info("Started fetching OSV CVE files...") + async with aiohttp.ClientSession() as session: + tasks = [ + self.__fetch_single(url, self.osv_path, session) + for url in self.download_url + ] + await asyncio.gather(*tasks) - async def get_cve_data(self): + async def get_cve_data(self) -> None: """Returns OSV cve data to insert into db""" await self.update_ecosystem_info() + await self._fetch_all() From 0b5a52f45e06aa3493fd94c8957ed41f9909cfa0 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 15:51:34 +0500 Subject: [PATCH 03/19] feat: added zip extraction for osv --- 
cve_bin_tool/data_sources/new_osv_source.py | 23 +++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 8038e45a24..d9225fdcc4 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -1,6 +1,7 @@ import asyncio import os import pathlib +import zipfile import aiohttp from google.auth.credentials import AnonymousCredentials # type: ignore[import-untyped] @@ -28,7 +29,7 @@ def __init__( async def update_ecosystem_info(self) -> None: """Fetch OSV ecosystem information and prepare download links""" - LOGGER.info("Started fetching OSV ecosystems info...") + LOGGER.info("OSV: started fetching ecosystems info...") blobs = self._client.list_blobs(self.bucket_name) for blob in blobs: if blob.name.endswith("all.zip"): @@ -59,10 +60,12 @@ async def __fetch_single(self, url: str, download_to: str, session): LOGGER.debug(f"Fetched {url}") except (ValueError, IndexError): pass + else: + LOGGER.error(f"OSV: was not able to fetch {url}") async def _fetch_all(self): """Concurrently fetches all zip files from OSV""" - LOGGER.info("Started fetching OSV CVE files...") + LOGGER.info("OSV: started fetching OSV CVE files...") async with aiohttp.ClientSession() as session: tasks = [ self.__fetch_single(url, self.osv_path, session) @@ -70,7 +73,23 @@ async def _fetch_all(self): ] await asyncio.gather(*tasks) + async def _extract_all(self): + """Extract and delete all files in OSV cache dir""" + # .glob("zip") returns iterator so it is ok to process files in the loop + LOGGER.info("OSV: started extracting zip files...") + for file in pathlib.Path(self.osv_path).glob("*.zip"): + try: + with zipfile.ZipFile(file, "r") as zip_ref: + zip_ref.extractall(self.osv_path) + os.remove(file) + except zipfile.BadZipFile: + LOGGER.error(f"OSV: error while extracting {file}") + async def get_cve_data(self) -> None: """Returns OSV cve data to insert into db""" await self.update_ecosystem_info() await self._fetch_all() + await self._extract_all() + + # no need to keep links after download, there may be lots of them + del self.download_url From f076096dd291d3bf886bd9de469f0c9e096ae4e6 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 16:32:28 +0500 Subject: [PATCH 04/19] feat: added data formatting method --- cve_bin_tool/data_sources/new_osv_source.py | 125 +++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index d9225fdcc4..2fc3fd0f27 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -1,9 +1,11 @@ import asyncio +import json import os import pathlib import zipfile import aiohttp +from cvss import CVSS3 from google.auth.credentials import AnonymousCredentials # type: ignore[import-untyped] from google.cloud import storage # type: ignore[import-untyped] @@ -17,6 +19,7 @@ class OSVDataSource(Data_Source): def __init__( self, partial_update=False, bucket_name=None, max_parallel_downloads=5 ): + self.source_name = "OSV" self._client = storage.Client(credentials=AnonymousCredentials(), project=None) self.ecosystems_fetched = set() self.download_url = [] @@ -85,7 +88,124 @@ async def _extract_all(self): except zipfile.BadZipFile: LOGGER.error(f"OSV: error while extracting {file}") - async def get_cve_data(self) -> None: + def 
process_data_from_disk(self): + """Read data from disk and yield each instance in required format""" + for file in pathlib.Path(self.osv_path).glob("*.json"): + with open(file) as opened_file: + content = opened_file.read() + + json_data: dict = json.loads(content) + del content + + cve_id, severity, vector = ( + json_data.get("id"), + json_data.get("severity", None), + None, + ) + + severity: dict | None + if severity and "CVSS_V3" in [x["type"] for x in severity]: + try: + # Ensure CVSS vector is valid + if severity[0]["score"].endswith("/"): + cvss_data = CVSS3(severity[0]["score"][:-1]) + LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector") + else: + cvss_data = CVSS3(severity[0]["score"]) + # Now extract CVSS attributes + version = "3" + severity = cvss_data.severities()[0] + score = cvss_data.scores()[0] + vector = cvss_data.clean_vector() + + except Exception as e: + LOGGER.debug(e) + LOGGER.debug(f"{cve_id} : {severity}") + vector = None + + cve = { + "ID": cve_id, + "severity": severity if vector is not None else "unknown", + "description": json_data.get("summary", "unknown"), + "score": score if vector is not None else "unknown", # noqa + "CVSS_version": version if vector is not None else "unknown", # noqa + "CVSS_vector": vector if vector is not None else "unknown", + "last_modified": ( + json_data["modified"] + if json_data.get("modified", None) + else json_data["published"] + ), + } + + affected = None + + for package_data in json_data.get("affected", []): + package = package_data.get("package", {}) + if not package: + continue + + product = package.get("name") + vendor = ( + "unknown" # OSV Schema does not provide vendor names for packages + ) + + if product.startswith("github.com/"): + vendor = product.split("/")[-2] + product = product.split("/")[-1] + + _affected = { + "cve_id": cve_id, + "vendor": vendor, + "product": product, + "version": "*", + "versionStartIncluding": "", + "versionStartExcluding": "", + "versionEndIncluding": "", + "versionEndExcluding": "", + } + + events = None + for ranges in package_data.get("ranges", []): + if ranges["type"] == "SEMVER": + events = ranges["events"] + + if events is None and "versions" in package_data: + versions = package_data["versions"] + + if not versions: + continue + + version_affected = _affected.copy() + + version_affected["versionStartIncluding"] = versions[0] + version_affected["versionEndIncluding"] = versions[-1] + + affected = version_affected + elif events is not None: + introduced = None + fixed = None + + for event in events: + if event.get("introduced", None): + introduced = event.get("introduced") + if event.get("fixed", None): + fixed = event.get("fixed") + + if fixed is not None: + range_affected = _affected.copy() + + range_affected["versionStartIncluding"] = introduced + range_affected["versionEndExcluding"] = fixed + + fixed = None + affected = range_affected + + # delete unused json data before garbage collector does + del json_data + + yield cve, affected + + async def get_cve_data(self): """Returns OSV cve data to insert into db""" await self.update_ecosystem_info() await self._fetch_all() @@ -93,3 +213,6 @@ async def get_cve_data(self) -> None: # no need to keep links after download, there may be lots of them del self.download_url + + # to keep backwards compatibility convert iterator to list + return list(self.process_data_from_disk()), self.source_name From ec2cad4b5d5ea495b9250f99260d550b1bddd52d Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 
2025 16:37:33 +0500 Subject: [PATCH 05/19] chore: added google-cloud-storage to requirements --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e6d8e62c47..45279ce6ce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ cvss defusedxml distro filetype>=1.2.0 -gsutil +google-cloud-storage importlib_metadata>=3.6; python_version < "3.10" importlib_resources; python_version < "3.9" jinja2>=2.11.3 From 803ce56dd2fa08eef080206eedafc8509b4195dd Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 17:45:05 +0500 Subject: [PATCH 06/19] feat: replaced old OSV source --- cve_bin_tool/cli.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cve_bin_tool/cli.py b/cve_bin_tool/cli.py index bf93b0e483..cd8e936427 100644 --- a/cve_bin_tool/cli.py +++ b/cve_bin_tool/cli.py @@ -48,8 +48,8 @@ curl_source, epss_source, gad_source, + new_osv_source, nvd_source, - osv_source, purl2cpe_source, redhat_source, ) @@ -782,7 +782,9 @@ def main(argv=None): enabled_sources = [] if "OSV" not in disabled_sources: - source_osv = osv_source.OSV_Source(incremental_update=incremental_db_update) + source_osv = new_osv_source.OSVDataSource( + incremental_update=incremental_db_update + ) enabled_sources.append(source_osv) if "GAD" not in disabled_sources: From f9e7b9516c42195060bbb5a186703dee2ac193ab Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 17:45:49 +0500 Subject: [PATCH 07/19] feat: minor logging and naming improvements --- cve_bin_tool/data_sources/new_osv_source.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 2fc3fd0f27..1e15421d66 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -17,7 +17,7 @@ class OSVDataSource(Data_Source): """Slightly more memory efficient reimplementation of OSV datasource""" def __init__( - self, partial_update=False, bucket_name=None, max_parallel_downloads=5 + self, incremental_update=False, bucket_name=None, max_parallel_downloads=5 ): self.source_name = "OSV" self._client = storage.Client(credentials=AnonymousCredentials(), project=None) @@ -27,7 +27,7 @@ def __init__( self.osv_path = str(pathlib.Path(DISK_LOCATION_DEFAULT) / "osv") self.bucket_name = bucket_name or "osv-vulnerabilities" - self.partial_update = partial_update + self.incremental_update = incremental_update self.signed_link_expiration_time = 3600 * 2 # 2 hours async def update_ecosystem_info(self) -> None: @@ -61,10 +61,10 @@ async def __fetch_single(self, url: str, download_to: str, session): file.write(content) del content LOGGER.debug(f"Fetched {url}") - except (ValueError, IndexError): - pass + except (ValueError, IndexError, aiohttp.ClientPayloadError) as e: + LOGGER.warning(f"OSV: was not able to fetch {url}: {str(e)}") else: - LOGGER.error(f"OSV: was not able to fetch {url}") + LOGGER.warning(f"OSV: was not able to fetch {url}") async def _fetch_all(self): """Concurrently fetches all zip files from OSV""" @@ -82,11 +82,14 @@ async def _extract_all(self): LOGGER.info("OSV: started extracting zip files...") for file in pathlib.Path(self.osv_path).glob("*.zip"): try: + LOGGER.info(f"OSV: extracting: {file}") with zipfile.ZipFile(file, "r") as zip_ref: zip_ref.extractall(self.osv_path) - os.remove(file) except 
zipfile.BadZipFile: - LOGGER.error(f"OSV: error while extracting {file}") + LOGGER.warning(f"OSV: error while extracting {file}") + finally: + os.remove(file) + await asyncio.sleep(0.5) def process_data_from_disk(self): """Read data from disk and yield each instance in required format""" From d3a1e927cbba1f5e18a85b145bd8b27959e85665 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 13 Mar 2025 19:58:17 +0500 Subject: [PATCH 08/19] feat: decompressed small files in memory --- cve_bin_tool/data_sources/new_osv_source.py | 32 ++++++++++++++++----- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 1e15421d66..2db9a895c3 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -1,4 +1,5 @@ import asyncio +import io import json import os import pathlib @@ -55,12 +56,25 @@ async def __fetch_single(self, url: str, download_to: str, session): if response.status == 200: try: content = await response.read() - filename: str = f"{url.split("/")[-2]}.zip" - location = os.path.join(download_to, filename) - with open(location, "wb") as file: - file.write(content) + content_size_mb = len(content) / (1024 * 1024) + + # if file is more than 50 MB download it to disk + if content_size_mb > 50: + filename: str = f"{url.split("/")[-2]}.zip" + location = os.path.join(download_to, filename) + with open(location, "wb") as file: + file.write(content) + LOGGER.debug(f"OSV: fetched {url}") + else: + file = io.BytesIO(content) + zip_file = zipfile.ZipFile(file) + zip_file.extractall(self.osv_path) + del file + del zip_file + LOGGER.debug(f"OSV: fetched and unzipped {url}") + del content - LOGGER.debug(f"Fetched {url}") + del content_size_mb except (ValueError, IndexError, aiohttp.ClientPayloadError) as e: LOGGER.warning(f"OSV: was not able to fetch {url}: {str(e)}") else: @@ -217,5 +231,9 @@ async def get_cve_data(self): # no need to keep links after download, there may be lots of them del self.download_url - # to keep backwards compatibility convert iterator to list - return list(self.process_data_from_disk()), self.source_name + severity_data, affected_data = [], [] + for cve, affected in self.process_data_from_disk(): + severity_data.append(cve) + affected_data.append(affected) + + return (severity_data, affected_data), self.source_name From a240bda8bff8894e03c83c406a6c112efbbd1789 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Mon, 17 Mar 2025 21:09:39 +0500 Subject: [PATCH 09/19] feat: minor code improvements --- cve_bin_tool/data_sources/new_osv_source.py | 101 +++++++++++--------- 1 file changed, 55 insertions(+), 46 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 2db9a895c3..f22dbde822 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -15,7 +15,7 @@ class OSVDataSource(Data_Source): - """Slightly more memory efficient reimplementation of OSV datasource""" + """A slightly more memory-efficient reimplementation of the OSV data source.""" def __init__( self, incremental_update=False, bucket_name=None, max_parallel_downloads=5 @@ -32,8 +32,8 @@ def __init__( self.signed_link_expiration_time = 3600 * 2 # 2 hours async def update_ecosystem_info(self) -> None: - """Fetch OSV ecosystem information and prepare download links""" - LOGGER.info("OSV: 
started fetching ecosystems info...") + """Fetch OSV ecosystem information and prepare download links.""" + LOGGER.info("OSV: Started fetching ecosystem info...") blobs = self._client.list_blobs(self.bucket_name) for blob in blobs: if blob.name.endswith("all.zip"): @@ -41,6 +41,7 @@ async def update_ecosystem_info(self) -> None: ecosystem_name = blob.name.split("/")[-2] url = f"https://storage.googleapis.com/{self.bucket_name}/{ecosystem_name}/all.zip" self.download_url.append(url) + LOGGER.debug(f"OSV: Download link for {ecosystem_name} added.") if ecosystem_name.find(":") >= 0: ecosystem_name = ecosystem_name.split(":")[0] self.ecosystems_fetched.add(ecosystem_name) @@ -49,40 +50,48 @@ async def update_ecosystem_info(self) -> None: async def __fetch_single(self, url: str, download_to: str, session): """ - Fetches single file while preventing downloading more than $max_parallel_downloads files simultaneously + Fetches a single file while preventing downloading more than $max_parallel_downloads files simultaneously. """ async with self._semaphore: - async with session.get(url) as response: - if response.status == 200: - try: - content = await response.read() - content_size_mb = len(content) / (1024 * 1024) - - # if file is more than 50 MB download it to disk - if content_size_mb > 50: - filename: str = f"{url.split("/")[-2]}.zip" - location = os.path.join(download_to, filename) - with open(location, "wb") as file: - file.write(content) - LOGGER.debug(f"OSV: fetched {url}") - else: - file = io.BytesIO(content) - zip_file = zipfile.ZipFile(file) - zip_file.extractall(self.osv_path) - del file - del zip_file - LOGGER.debug(f"OSV: fetched and unzipped {url}") - - del content - del content_size_mb - except (ValueError, IndexError, aiohttp.ClientPayloadError) as e: - LOGGER.warning(f"OSV: was not able to fetch {url}: {str(e)}") - else: - LOGGER.warning(f"OSV: was not able to fetch {url}") + try: + async with session.get(url, timeout=120) as response: + if response.status == 200: + try: + content: bytes = await response.read() + content_size_mb = len(content) / (1024 * 1024) + + # If the file is more than 512 MB, download it to disk + if content_size_mb > 512: + _fname = url.split("/")[-2] + filename: str = f"{_fname}.zip" + location = os.path.join(download_to, filename) + with open(location, "wb") as file: + file.write(content) + LOGGER.debug(f"OSV: Fetched {url}.") + else: + in_memory_file: io.BytesIO = io.BytesIO(content) + zip_file = zipfile.ZipFile(in_memory_file) + zip_file.extractall(self.osv_path) + del in_memory_file + del zip_file + LOGGER.debug(f"OSV: Fetched and unzipped {url}.") + + del content + del content_size_mb + except ( + ValueError, + IndexError, + aiohttp.ClientPayloadError, + ) as e: + LOGGER.warning(f"OSV: Unable to fetch {url}: {str(e)}") + else: + LOGGER.warning(f"OSV: Unable to fetch {url}.") + except (TimeoutError, asyncio.TimeoutError): + LOGGER.warning(f"OSV: Timeout error while fetching {url}.") async def _fetch_all(self): - """Concurrently fetches all zip files from OSV""" - LOGGER.info("OSV: started fetching OSV CVE files...") + """Concurrently fetch all zip files from OSV.""" + LOGGER.info("OSV: Started fetching OSV CVE files...") async with aiohttp.ClientSession() as session: tasks = [ self.__fetch_single(url, self.osv_path, session) @@ -91,27 +100,27 @@ async def _fetch_all(self): await asyncio.gather(*tasks) async def _extract_all(self): - """Extract and delete all files in OSV cache dir""" - # .glob("zip") returns iterator so it is ok to process files in the 
loop - LOGGER.info("OSV: started extracting zip files...") + """Extract and delete all files in the OSV cache directory.""" + # .glob("zip") returns an iterator, so it is okay to process files in the loop + LOGGER.info("OSV: Started extracting zip files...") for file in pathlib.Path(self.osv_path).glob("*.zip"): try: - LOGGER.info(f"OSV: extracting: {file}") + LOGGER.debug(f"OSV: Extracting {file}") with zipfile.ZipFile(file, "r") as zip_ref: zip_ref.extractall(self.osv_path) except zipfile.BadZipFile: - LOGGER.warning(f"OSV: error while extracting {file}") + LOGGER.warning(f"OSV: Error while extracting {file}.") finally: os.remove(file) await asyncio.sleep(0.5) def process_data_from_disk(self): - """Read data from disk and yield each instance in required format""" + """Read data from disk and yield each instance in the required format.""" for file in pathlib.Path(self.osv_path).glob("*.json"): with open(file) as opened_file: content = opened_file.read() - json_data: dict = json.loads(content) + json_data: dict = json.loads(content) # type: ignore del content cve_id, severity, vector = ( @@ -120,13 +129,13 @@ def process_data_from_disk(self): None, ) - severity: dict | None + severity: dict | None # type: ignore if severity and "CVSS_V3" in [x["type"] for x in severity]: try: - # Ensure CVSS vector is valid + # Ensure the CVSS vector is valid if severity[0]["score"].endswith("/"): cvss_data = CVSS3(severity[0]["score"][:-1]) - LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector") + LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector.") else: cvss_data = CVSS3(severity[0]["score"]) # Now extract CVSS attributes @@ -217,18 +226,18 @@ def process_data_from_disk(self): fixed = None affected = range_affected - # delete unused json data before garbage collector does + # Delete unused json data before the garbage collector does. del json_data yield cve, affected async def get_cve_data(self): - """Returns OSV cve data to insert into db""" + """Returns OSV CVE data to insert into the database.""" await self.update_ecosystem_info() await self._fetch_all() await self._extract_all() - # no need to keep links after download, there may be lots of them + # No need to keep links after download, as there may be a lot of them. 
del self.download_url severity_data, affected_data = [], [] From 3b403b4d51030e0c0667caa446fe91e6016b12ba Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 02:52:35 +0500 Subject: [PATCH 10/19] test: adapted tests for new data source --- test/test_new_osv_source.py | 27 ----------------- test/test_source_osv.py | 58 +++++++++++++++---------------------- 2 files changed, 23 insertions(+), 62 deletions(-) delete mode 100644 test/test_new_osv_source.py diff --git a/test/test_new_osv_source.py b/test/test_new_osv_source.py deleted file mode 100644 index c6ff4acdd1..0000000000 --- a/test/test_new_osv_source.py +++ /dev/null @@ -1,27 +0,0 @@ -from test.utils import EXTERNAL_SYSTEM - -import pytest - -from cve_bin_tool.data_sources import new_osv_source -from cve_bin_tool.util import make_http_requests - - -class TestNewOSVSource: - data_source = new_osv_source.OSVDataSource() - ecosystems_url = "https://osv-vulnerabilities.storage.googleapis.com/ecosystems.txt" - - @pytest.mark.asyncio - @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") - async def test_update_ecosystem_info(self): - ecosystems_txt = make_http_requests( - "text", url=self.ecosystems_url, timeout=300 - ).strip("\n") - expected_ecosystems = set(ecosystems_txt.split("\n")) - - await self.data_source.update_ecosystem_info() - - # there may be more ecosystems fetched than provided in ecosystems.txt - assert all( - ecosystem in self.data_source.ecosystems_fetched - for ecosystem in expected_ecosystems - ) diff --git a/test/test_source_osv.py b/test/test_source_osv.py index 9bb105b0ce..44643ab37d 100644 --- a/test/test_source_osv.py +++ b/test/test_source_osv.py @@ -2,24 +2,22 @@ # SPDX-License-Identifier: GPL-3.0-or-later -import io import shutil import tempfile -import zipfile from pathlib import Path from test.utils import EXTERNAL_SYSTEM import aiohttp import pytest -from cve_bin_tool.data_sources import osv_source +from cve_bin_tool.data_sources import new_osv_source from cve_bin_tool.util import make_http_requests class TestSourceOSV: @classmethod def setup_class(cls): - cls.osv = osv_source.OSV_Source() + cls.osv = new_osv_source.OSVDataSource() cls.osv.cachedir = tempfile.mkdtemp(prefix="cvedb-") cls.osv.osv_path = str(Path(cls.osv.cachedir) / "osv") @@ -168,37 +166,18 @@ def teardown_class(cls): @pytest.mark.asyncio @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") async def test_update_ecosystems(self): - await self.osv.update_ecosystems() - ecosystems_txt = make_http_requests( "text", url=self.ecosystems_url, timeout=300 ).strip("\n") expected_ecosystems = set(ecosystems_txt.split("\n")) - # Because ecosystems.txt does not contain the complete list, this must be - # manually fixed up. 
- expected_ecosystems.add("DWF") - expected_ecosystems.add("JavaScript") - - # Assert that there are no missing ecosystems - assert all(x in self.osv.ecosystems for x in expected_ecosystems) - # Assert that there are no extra ecosystems - assert all(x in expected_ecosystems for x in self.osv.ecosystems) - - @pytest.mark.asyncio - @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") - @pytest.mark.parametrize("ecosystem_url", [url for url in cve_file_data]) - async def test_get_ecosystem_00(self, ecosystem_url): - connector = aiohttp.TCPConnector(limit_per_host=19) - async with aiohttp.ClientSession( - connector=connector, trust_env=True - ) as session: - content = await self.osv.get_ecosystem(ecosystem_url, session) - - cve_data = self.cve_file_data[ecosystem_url] + await self.osv.update_ecosystem_info() - assert content["id"] == cve_data["id"] - assert content["published"] == cve_data["published"] + # there may be more ecosystems fetched than provided in ecosystems.txt + assert all( + ecosystem in self.osv.ecosystems_fetched + for ecosystem in expected_ecosystems + ) @pytest.mark.asyncio @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") @@ -209,19 +188,24 @@ async def test_get_ecosystem_01(self): async with aiohttp.ClientSession( connector=connector, trust_env=True ) as session: - content = await self.osv.get_ecosystem(eco_url, session, mode="bytes") + await self.osv.fetch_single(eco_url, self.osv.osv_path, session) - z = zipfile.ZipFile(io.BytesIO(content)) + p = Path(self.osv.osv_path).glob("**/*") + files = [x.name for x in p if x.is_file()] # Shouldn't be any files as DWF is no longer a valid ecosystems - assert len(z.namelist()) == 0 + assert len(files) == 0 @pytest.mark.asyncio @pytest.mark.skipif(not EXTERNAL_SYSTEM(), reason="Needs network connection.") async def test_fetch_cves(self): - self.osv.ecosystems = ["PyPI"] + ecosystem_name = "PyPI" + self.osv.ecosystems_fetched = [ecosystem_name] + self.osv.download_url = [ + f"https://storage.googleapis.com/osv-vulnerabilities/{ecosystem_name}/all.zip" + ] - await self.osv.fetch_cves() + await self.osv.fetch_all() p = Path(self.osv.osv_path).glob("**/*") files = [x.name for x in p if x.is_file()] @@ -234,7 +218,11 @@ async def test_fetch_cves(self): @pytest.mark.parametrize("cve_entries", [[x] for _, x in cve_file_data.items()]) def test_format_data(self, cve_entries): - severity_data, affected_data = self.osv.format_data(cve_entries) + severity_data, affected_data = [], [] + for cve_entry in cve_entries: + severity, affected = self.osv.get_formatted_data_from_json(cve_entry) + severity_data.append(severity) + affected_data.append(affected) severity_data = severity_data[0] From 7aeca1b70e70701323db632a74ee3f115615154d Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 02:57:46 +0500 Subject: [PATCH 11/19] refactor: moved json parsing to separate function --- cve_bin_tool/data_sources/new_osv_source.py | 213 ++++++++++---------- 1 file changed, 107 insertions(+), 106 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index f22dbde822..47794a915a 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -48,7 +48,7 @@ async def update_ecosystem_info(self) -> None: except (ValueError, IndexError): pass - async def __fetch_single(self, url: str, download_to: str, session): + async def fetch_single(self, url: str, 
download_to: str, session): """ Fetches a single file while preventing downloading more than $max_parallel_downloads files simultaneously. """ @@ -71,7 +71,7 @@ async def __fetch_single(self, url: str, download_to: str, session): else: in_memory_file: io.BytesIO = io.BytesIO(content) zip_file = zipfile.ZipFile(in_memory_file) - zip_file.extractall(self.osv_path) + zip_file.extractall(download_to) del in_memory_file del zip_file LOGGER.debug(f"OSV: Fetched and unzipped {url}.") @@ -89,17 +89,17 @@ async def __fetch_single(self, url: str, download_to: str, session): except (TimeoutError, asyncio.TimeoutError): LOGGER.warning(f"OSV: Timeout error while fetching {url}.") - async def _fetch_all(self): + async def fetch_all(self): """Concurrently fetch all zip files from OSV.""" LOGGER.info("OSV: Started fetching OSV CVE files...") async with aiohttp.ClientSession() as session: tasks = [ - self.__fetch_single(url, self.osv_path, session) + self.fetch_single(url, self.osv_path, session) for url in self.download_url ] await asyncio.gather(*tasks) - async def _extract_all(self): + async def extract_all(self): """Extract and delete all files in the OSV cache directory.""" # .glob("zip") returns an iterator, so it is okay to process files in the loop LOGGER.info("OSV: Started extracting zip files...") @@ -114,128 +114,129 @@ async def _extract_all(self): os.remove(file) await asyncio.sleep(0.5) - def process_data_from_disk(self): - """Read data from disk and yield each instance in the required format.""" - for file in pathlib.Path(self.osv_path).glob("*.json"): - with open(file) as opened_file: - content = opened_file.read() + @staticmethod + def get_formatted_data_from_json(content: dict): + cve_id, severity, vector = ( + content.get("id"), + content.get("severity", None), + None, + ) - json_data: dict = json.loads(content) # type: ignore - del content - - cve_id, severity, vector = ( - json_data.get("id"), - json_data.get("severity", None), - None, - ) - - severity: dict | None # type: ignore - if severity and "CVSS_V3" in [x["type"] for x in severity]: - try: - # Ensure the CVSS vector is valid - if severity[0]["score"].endswith("/"): - cvss_data = CVSS3(severity[0]["score"][:-1]) - LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector.") - else: - cvss_data = CVSS3(severity[0]["score"]) - # Now extract CVSS attributes - version = "3" - severity = cvss_data.severities()[0] - score = cvss_data.scores()[0] - vector = cvss_data.clean_vector() - - except Exception as e: - LOGGER.debug(e) - LOGGER.debug(f"{cve_id} : {severity}") - vector = None - - cve = { - "ID": cve_id, - "severity": severity if vector is not None else "unknown", - "description": json_data.get("summary", "unknown"), - "score": score if vector is not None else "unknown", # noqa - "CVSS_version": version if vector is not None else "unknown", # noqa - "CVSS_vector": vector if vector is not None else "unknown", - "last_modified": ( - json_data["modified"] - if json_data.get("modified", None) - else json_data["published"] - ), + severity: dict | None # type: ignore + if severity and "CVSS_V3" in [x["type"] for x in severity]: + try: + # Ensure the CVSS vector is valid + if severity[0]["score"].endswith("/"): + cvss_data = CVSS3(severity[0]["score"][:-1]) + LOGGER.debug(f"{cve_id} : Correcting malformed CVSS3 vector.") + else: + cvss_data = CVSS3(severity[0]["score"]) + # Now extract CVSS attributes + version = "3" + severity = cvss_data.severities()[0] + score = cvss_data.scores()[0] + vector = cvss_data.clean_vector() + + except 
Exception as e: + LOGGER.debug(e) + LOGGER.debug(f"{cve_id} : {severity}") + vector = None + + cve = { + "ID": cve_id, + "severity": severity if vector is not None else "unknown", + "description": content.get("summary", "unknown"), + "score": score if vector is not None else "unknown", # noqa + "CVSS_version": version if vector is not None else "unknown", # noqa + "CVSS_vector": vector if vector is not None else "unknown", + "last_modified": ( + content["modified"] + if content.get("modified", None) + else content["published"] + ), + } + + affected = None + + for package_data in content.get("affected", []): + package = package_data.get("package", {}) + if not package: + continue + + product = package.get("name") + vendor = "unknown" # OSV Schema does not provide vendor names for packages + + if product.startswith("github.com/"): + vendor = product.split("/")[-2] + product = product.split("/")[-1] + + _affected = { + "cve_id": cve_id, + "vendor": vendor, + "product": product, + "version": "*", + "versionStartIncluding": "", + "versionStartExcluding": "", + "versionEndIncluding": "", + "versionEndExcluding": "", } - affected = None + events = None + for ranges in package_data.get("ranges", []): + if ranges["type"] == "SEMVER": + events = ranges["events"] - for package_data in json_data.get("affected", []): - package = package_data.get("package", {}) - if not package: - continue + if events is None and "versions" in package_data: + versions = package_data["versions"] - product = package.get("name") - vendor = ( - "unknown" # OSV Schema does not provide vendor names for packages - ) - - if product.startswith("github.com/"): - vendor = product.split("/")[-2] - product = product.split("/")[-1] - - _affected = { - "cve_id": cve_id, - "vendor": vendor, - "product": product, - "version": "*", - "versionStartIncluding": "", - "versionStartExcluding": "", - "versionEndIncluding": "", - "versionEndExcluding": "", - } - - events = None - for ranges in package_data.get("ranges", []): - if ranges["type"] == "SEMVER": - events = ranges["events"] + if not versions: + continue - if events is None and "versions" in package_data: - versions = package_data["versions"] + version_affected = _affected.copy() - if not versions: - continue + version_affected["versionStartIncluding"] = versions[0] + version_affected["versionEndIncluding"] = versions[-1] - version_affected = _affected.copy() + affected = version_affected + elif events is not None: + introduced = None + fixed = None - version_affected["versionStartIncluding"] = versions[0] - version_affected["versionEndIncluding"] = versions[-1] + for event in events: + if event.get("introduced", None): + introduced = event.get("introduced") + if event.get("fixed", None): + fixed = event.get("fixed") - affected = version_affected - elif events is not None: - introduced = None - fixed = None + if fixed is not None: + range_affected = _affected.copy() - for event in events: - if event.get("introduced", None): - introduced = event.get("introduced") - if event.get("fixed", None): - fixed = event.get("fixed") + range_affected["versionStartIncluding"] = introduced + range_affected["versionEndExcluding"] = fixed - if fixed is not None: - range_affected = _affected.copy() + fixed = None + affected = range_affected - range_affected["versionStartIncluding"] = introduced - range_affected["versionEndExcluding"] = fixed + return cve, affected - fixed = None - affected = range_affected + def process_data_from_disk(self): + """Read data from disk and yield each instance in the 
required format.""" + for file in pathlib.Path(self.osv_path).glob("*.json"): + with open(file) as opened_file: + content = opened_file.read() + json_data: dict = json.loads(content) # type: ignore + del content + data = self.get_formatted_data_from_json(json_data) # Delete unused json data before the garbage collector does. del json_data - - yield cve, affected + yield data async def get_cve_data(self): """Returns OSV CVE data to insert into the database.""" await self.update_ecosystem_info() - await self._fetch_all() - await self._extract_all() + await self.fetch_all() + await self.extract_all() # No need to keep links after download, as there may be a lot of them. del self.download_url From 77dd8e52f4a735ed1841d15049cef1c8790a571d Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 03:03:49 +0500 Subject: [PATCH 12/19] docs: added some comments --- cve_bin_tool/data_sources/new_osv_source.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 47794a915a..ed56500c21 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -42,6 +42,7 @@ async def update_ecosystem_info(self) -> None: url = f"https://storage.googleapis.com/{self.bucket_name}/{ecosystem_name}/all.zip" self.download_url.append(url) LOGGER.debug(f"OSV: Download link for {ecosystem_name} added.") + # exclude ecosystem versions from appending, e.g. Debian:10 should not be included if ecosystem_name.find(":") >= 0: ecosystem_name = ecosystem_name.split(":")[0] self.ecosystems_fetched.add(ecosystem_name) @@ -101,7 +102,7 @@ async def fetch_all(self): async def extract_all(self): """Extract and delete all files in the OSV cache directory.""" - # .glob("zip") returns an iterator, so it is okay to process files in the loop + # .glob("zip") returns an iterator, so it is memory efficient to process files in the loop LOGGER.info("OSV: Started extracting zip files...") for file in pathlib.Path(self.osv_path).glob("*.zip"): try: @@ -226,10 +227,11 @@ def process_data_from_disk(self): content = opened_file.read() json_data: dict = json.loads(content) # type: ignore - del content data = self.get_formatted_data_from_json(json_data) # Delete unused json data before the garbage collector does. 
del json_data + del content + yield data async def get_cve_data(self): From 38fd375ad369472df05a61d0fc3e1644fb6be5d1 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 03:05:59 +0500 Subject: [PATCH 13/19] refactor: useless code line removed --- cve_bin_tool/data_sources/new_osv_source.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index ed56500c21..a3bcd5362d 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -29,7 +29,6 @@ def __init__( self.bucket_name = bucket_name or "osv-vulnerabilities" self.incremental_update = incremental_update - self.signed_link_expiration_time = 3600 * 2 # 2 hours async def update_ecosystem_info(self) -> None: """Fetch OSV ecosystem information and prepare download links.""" From 8fc8aa77f43251fe041059d53c6ce4481e345f60 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 03:10:09 +0500 Subject: [PATCH 14/19] docs: docstring changed --- cve_bin_tool/data_sources/new_osv_source.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index a3bcd5362d..45ea4640eb 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -15,7 +15,7 @@ class OSVDataSource(Data_Source): - """A slightly more memory-efficient reimplementation of the OSV data source.""" + """Reimplementation of the OSV data source.""" def __init__( self, incremental_update=False, bucket_name=None, max_parallel_downloads=5 From 3f29e1b3893999d2ea00deb10b2d485044af230e Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 12:34:39 +0500 Subject: [PATCH 15/19] fix: ignored None values --- cve_bin_tool/data_sources/new_osv_source.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index 45ea4640eb..fb3bb45a85 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -244,7 +244,9 @@ async def get_cve_data(self): severity_data, affected_data = [], [] for cve, affected in self.process_data_from_disk(): - severity_data.append(cve) - affected_data.append(affected) + if cve: + severity_data.append(cve) + if affected: + affected_data.append(affected) return (severity_data, affected_data), self.source_name From 28eb7acd2fcc69ba2a03e3d1d1f10c1e0643ccb7 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 17:02:38 +0500 Subject: [PATCH 16/19] chore: updated requirements --- requirements.csv | 2 +- test/language_data/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.csv b/requirements.csv index 3ad34e857e..dff46f5cec 100644 --- a/requirements.csv +++ b/requirements.csv @@ -15,7 +15,7 @@ sissaschool_not_in_db,xmlschema python_not_in_db,importlib_metadata python,requests python,urllib3 -google,gsutil +google,google-cloud-storage skontar,cvss python_not_in_db,packaging python_not_in_db,importlib_resources diff --git a/test/language_data/requirements.txt b/test/language_data/requirements.txt index 1d4aa9a090..2077ff76e5 100644 --- a/test/language_data/requirements.txt +++ b/test/language_data/requirements.txt @@ 
-14,6 +14,6 @@ xmlschema importlib_metadata; python_version < "3.8" requests urllib3>=1.26.5 # dependency of requests added explictly to avoid CVEs -gsutil +google-cloud-storage cvss packaging From 79a3d8198fe3d2a90ba9df1940e7da21803fd6ec Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Wed, 19 Mar 2025 20:10:05 +0500 Subject: [PATCH 17/19] Revert "chore: updated requirements" This reverts commit 28eb7acd2fcc69ba2a03e3d1d1f10c1e0643ccb7. --- requirements.csv | 2 +- test/language_data/requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.csv b/requirements.csv index dff46f5cec..3ad34e857e 100644 --- a/requirements.csv +++ b/requirements.csv @@ -15,7 +15,7 @@ sissaschool_not_in_db,xmlschema python_not_in_db,importlib_metadata python,requests python,urllib3 -google,google-cloud-storage +google,gsutil skontar,cvss python_not_in_db,packaging python_not_in_db,importlib_resources diff --git a/test/language_data/requirements.txt b/test/language_data/requirements.txt index 2077ff76e5..1d4aa9a090 100644 --- a/test/language_data/requirements.txt +++ b/test/language_data/requirements.txt @@ -14,6 +14,6 @@ xmlschema importlib_metadata; python_version < "3.8" requests urllib3>=1.26.5 # dependency of requests added explictly to avoid CVEs -google-cloud-storage +gsutil cvss packaging From 256009ebd050bdff576f138e55a92859dc06db17 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Thu, 20 Mar 2025 01:26:12 +0500 Subject: [PATCH 18/19] chore: updated --- requirements.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.csv b/requirements.csv index 3ad34e857e..dff46f5cec 100644 --- a/requirements.csv +++ b/requirements.csv @@ -15,7 +15,7 @@ sissaschool_not_in_db,xmlschema python_not_in_db,importlib_metadata python,requests python,urllib3 -google,gsutil +google,google-cloud-storage skontar,cvss python_not_in_db,packaging python_not_in_db,importlib_resources From e43d7457d3ce1a98b389f5ef7dc9ea4cdcf0bc82 Mon Sep 17 00:00:00 2001 From: fil1n <22854425+fil1n@users.noreply.github.com> Date: Sun, 23 Mar 2025 23:56:15 +0500 Subject: [PATCH 19/19] refactor: removed unused field --- cve_bin_tool/cli.py | 4 +--- cve_bin_tool/data_sources/new_osv_source.py | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cve_bin_tool/cli.py b/cve_bin_tool/cli.py index 2f77c4888a..a4c78d40dd 100644 --- a/cve_bin_tool/cli.py +++ b/cve_bin_tool/cli.py @@ -782,9 +782,7 @@ def main(argv=None): enabled_sources = [] if "OSV" not in disabled_sources: - source_osv = new_osv_source.OSVDataSource( - incremental_update=incremental_db_update - ) + source_osv = new_osv_source.OSVDataSource() enabled_sources.append(source_osv) if "GAD" not in disabled_sources: diff --git a/cve_bin_tool/data_sources/new_osv_source.py b/cve_bin_tool/data_sources/new_osv_source.py index fb3bb45a85..0d71450d4b 100644 --- a/cve_bin_tool/data_sources/new_osv_source.py +++ b/cve_bin_tool/data_sources/new_osv_source.py @@ -17,9 +17,7 @@ class OSVDataSource(Data_Source): """Reimplementation of the OSV data source.""" - def __init__( - self, incremental_update=False, bucket_name=None, max_parallel_downloads=5 - ): + def __init__(self, bucket_name=None, max_parallel_downloads=5): self.source_name = "OSV" self._client = storage.Client(credentials=AnonymousCredentials(), project=None) self.ecosystems_fetched = set() @@ -28,7 +26,6 @@ def __init__( self.osv_path = str(pathlib.Path(DISK_LOCATION_DEFAULT) / "osv") 
self.bucket_name = bucket_name or "osv-vulnerabilities" - self.incremental_update = incremental_update async def update_ecosystem_info(self) -> None: """Fetch OSV ecosystem information and prepare download links."""
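
A quick offline sanity check of the parsing step introduced in this series (a sketch only; the OSV record below is invented for illustration and is not real advisory data): the get_formatted_data_from_json helper added in PATCH 11 can be exercised directly, without the Google Cloud Storage client or any network access, by feeding it a minimal OSV-style dictionary. Module path and function name are taken from the patches above; everything else here is assumed sample input.

# Illustrative sketch: exercises OSVDataSource.get_formatted_data_from_json
# with a hand-written, hypothetical OSV record (no network access needed).
from cve_bin_tool.data_sources.new_osv_source import OSVDataSource

sample_record = {
    "id": "GHSA-xxxx-xxxx-xxxx",  # hypothetical identifier
    "summary": "Example vulnerability in examplepkg",
    "published": "2025-01-01T00:00:00Z",
    "modified": "2025-02-01T00:00:00Z",
    "severity": [
        {
            "type": "CVSS_V3",
            "score": "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H",
        }
    ],
    "affected": [
        {
            "package": {"ecosystem": "PyPI", "name": "examplepkg"},
            "ranges": [
                {
                    "type": "SEMVER",
                    "events": [{"introduced": "0"}, {"fixed": "1.2.3"}],
                }
            ],
        }
    ],
}

cve, affected = OSVDataSource.get_formatted_data_from_json(sample_record)

# This CVSS vector evaluates to severity "Critical" with a base score of 9.8.
print(cve["ID"], cve["severity"], cve["score"], cve["CVSS_vector"])
# The SEMVER range maps to an inclusive introduced version and an exclusive fixed version.
print(affected["product"], affected["versionStartIncluding"], affected["versionEndExcluding"])

The full network path remains the one wired into cli.py: awaiting OSVDataSource().get_cve_data() lists the per-ecosystem all.zip archives in the osv-vulnerabilities bucket anonymously, downloads and extracts them, and returns (severity_data, affected_data) together with the source name "OSV" for insertion into the database.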