Skip to content

Commit 69099a8

Browse files
committed
chore: add support for PyPI PURLs
Signed-off-by: Ben Selwyn-Smith <[email protected]>
1 parent 599d261 commit 69099a8

File tree

3 files changed

+146
-24
lines changed

3 files changed

+146
-24
lines changed

src/macaron/slsa_analyzer/analyzer.py

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,8 @@
7676
from macaron.slsa_analyzer.git_service import GIT_SERVICES, BaseGitService, GitHub
7777
from macaron.slsa_analyzer.git_service.base_git_service import NoneGitService
7878
from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR
79-
from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, MavenCentralRegistry
79+
from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES, MavenCentralRegistry, PyPIRegistry
80+
from macaron.slsa_analyzer.package_registry.pypi_registry import find_or_create_pypi_asset
8081
from macaron.slsa_analyzer.provenance.expectations.expectation_registry import ExpectationRegistry
8182
from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV01Payload
8283
from macaron.slsa_analyzer.provenance.intoto.errors import LoadIntotoAttestationError
@@ -510,7 +511,9 @@ def run_single(
510511
except TypeError as error:
511512
logger.debug("Failed to parse repository path as URL: %s", error)
512513
if url and url.hostname == "github.com":
513-
artifact_hash = self.get_artifact_hash(parsed_purl, local_artifact_dirs, hashlib.sha256())
514+
artifact_hash = self.get_artifact_hash(
515+
parsed_purl, local_artifact_dirs, hashlib.sha256(), all_package_registries
516+
)
514517
if artifact_hash:
515518
git_attestation_dict = git_service.api_client.get_attestation(
516519
analyze_ctx.component.repository.full_name, artifact_hash
@@ -983,7 +986,11 @@ def create_analyze_ctx(self, component: Component) -> AnalyzeContext:
983986
return analyze_ctx
984987

985988
def get_artifact_hash(
986-
self, purl: PackageURL, cached_artifacts: list[str] | None, hash_algorithm: Any
989+
self,
990+
purl: PackageURL,
991+
cached_artifacts: list[str] | None,
992+
hash_algorithm: Any,
993+
all_package_registries: list[PackageRegistryInfo],
987994
) -> str | None:
988995
"""Get the hash of the artifact found from the passed PURL using local or remote files.
989996
@@ -995,6 +1002,8 @@ def get_artifact_hash(
9951002
The list of local files that match the PURL.
9961003
hash_algorithm: Any
9971004
The hash algorithm to use.
1005+
all_package_registries: list[PackageRegistryInfo]
1006+
The list of package registry information.
9981007
9991008
Returns
10001009
-------
@@ -1024,8 +1033,43 @@ def get_artifact_hash(
10241033
return maven_registry.get_artifact_hash(purl, hash_algorithm)
10251034

10261035
if purl.type == "pypi":
1027-
# TODO implement
1028-
return None
1036+
pypi_registry = next(
1037+
(
1038+
package_registry
1039+
for package_registry in PACKAGE_REGISTRIES
1040+
if isinstance(package_registry, PyPIRegistry)
1041+
),
1042+
None,
1043+
)
1044+
if not pypi_registry:
1045+
logger.debug("Missing registry for PyPI")
1046+
return None
1047+
1048+
registry_info = next(
1049+
(
1050+
info
1051+
for info in all_package_registries
1052+
if info.package_registry == pypi_registry and info.build_tool_name in {"pip", "poetry"}
1053+
),
1054+
None,
1055+
)
1056+
if not registry_info:
1057+
logger.debug("Missing registry information for PyPI")
1058+
return None
1059+
1060+
pypi_asset = find_or_create_pypi_asset(purl.name, purl.version, registry_info)
1061+
if not pypi_asset:
1062+
return None
1063+
1064+
pypi_asset.has_repository = True
1065+
if not pypi_asset.download(""):
1066+
return None
1067+
1068+
source_url = pypi_asset.get_sourcecode_url("bdist_wheel")
1069+
if not source_url:
1070+
return None
1071+
1072+
return pypi_registry.get_artifact_hash(source_url, hash_algorithm)
10291073

10301074
logger.debug("Purl type '%s' not yet supported for GitHub attestation discovery.", purl.type)
10311075
return None

src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@
3232
from macaron.slsa_analyzer.checks.base_check import BaseCheck
3333
from macaron.slsa_analyzer.checks.check_result import CheckResultData, CheckResultType, Confidence, JustificationType
3434
from macaron.slsa_analyzer.package_registry.deps_dev import APIAccessError, DepsDevService
35-
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry
35+
from macaron.slsa_analyzer.package_registry.pypi_registry import (
36+
PyPIPackageJsonAsset,
37+
PyPIRegistry,
38+
find_or_create_pypi_asset,
39+
)
3640
from macaron.slsa_analyzer.registry import registry
3741
from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
3842
from macaron.util import send_post_http_raw
@@ -261,23 +265,16 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
261265
case PackageRegistryInfo(
262266
build_tool_name="pip" | "poetry",
263267
build_tool_purl_type="pypi",
264-
package_registry=PyPIRegistry() as pypi_registry,
268+
package_registry=PyPIRegistry(),
265269
) as pypi_registry_info:
266-
267-
# Retrieve the pre-existing AssetLocator object for the PyPI package JSON object, if it exists.
268-
pypi_package_json = next(
269-
(asset for asset in pypi_registry_info.metadata if isinstance(asset, PyPIPackageJsonAsset)),
270-
None,
270+
# Retrieve the pre-existing asset, or create a new one.
271+
pypi_package_json = find_or_create_pypi_asset(
272+
ctx.component.name, ctx.component.version, pypi_registry_info
271273
)
272-
if not pypi_package_json:
273-
# Create an AssetLocator object for the PyPI package JSON object.
274-
pypi_package_json = PyPIPackageJsonAsset(
275-
component_name=ctx.component.name,
276-
component_version=ctx.component.version,
277-
has_repository=ctx.component.repository is not None,
278-
pypi_registry=pypi_registry,
279-
package_json={},
280-
)
274+
if pypi_package_json is None:
275+
return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN)
276+
277+
pypi_package_json.has_repository = ctx.component.repository is not None
281278

282279
pypi_registry_info.metadata.append(pypi_package_json)
283280

src/macaron/slsa_analyzer/package_registry/pypi_registry.py

Lines changed: 84 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import zipfile
1212
from dataclasses import dataclass
1313
from datetime import datetime
14+
from typing import Any
1415

1516
import requests
1617
from bs4 import BeautifulSoup, Tag
@@ -21,6 +22,7 @@
2122
from macaron.json_tools import json_extract
2223
from macaron.malware_analyzer.datetime_parser import parse_datetime
2324
from macaron.slsa_analyzer.package_registry.package_registry import PackageRegistry
25+
from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
2426
from macaron.util import send_get_http_raw
2527

2628
logger: logging.Logger = logging.getLogger(__name__)
@@ -231,6 +233,45 @@ def fetch_sourcecode(self, src_url: str) -> dict[str, str] | None:
231233
logger.debug("Successfully fetch the source code from PyPI")
232234
return py_files_content
233235

236+
def get_artifact_hash(self, artifact_url: str, hash_algorithm: Any) -> str | None:
    """Return the hash of the artifact found at the passed URL.

    The artifact is streamed and fed to ``hash_algorithm`` chunk by chunk, so the
    whole file is never held in memory. Note that ``hash_algorithm`` is updated in
    place: if streaming fails part-way, the passed object has already consumed the
    chunks received so far and should not be reused by the caller.

    Parameters
    ----------
    artifact_url
        The URL of the artifact.
    hash_algorithm: Any
        The hash algorithm to use. It must provide ``update`` and ``hexdigest``.

    Returns
    -------
    str | None
        The hash of the artifact, or None if the artifact could not be retrieved.
    """
    try:
        response = requests.get(artifact_url, stream=True, timeout=40)
    except requests.exceptions.RequestException as error:
        # Connection failures and timeouts are not HTTPError instances, so they must be
        # handled here to honour the "return None on failure" contract.
        logger.debug("Error while requesting artifact: %s", error)
        return None

    # The response is used as a context manager so that the streamed connection is
    # released on every exit path, including the early returns below.
    with response:
        try:
            response.raise_for_status()
        except requests.exceptions.HTTPError as http_err:
            logger.debug("HTTP error occurred: %s", http_err)
            return None

        if response.status_code != 200:
            logger.debug("Invalid response: %s", response.status_code)
            return None

        try:
            # An explicit chunk size is required: the default for iter_content is 1 byte.
            for chunk in response.iter_content(chunk_size=8192):
                hash_algorithm.update(chunk)
        except requests.exceptions.RequestException as error:
            # Something went wrong with the request, abort.
            logger.debug("Error while streaming source file: %s", error)
            return None

    artifact_hash: str = hash_algorithm.hexdigest()
    logger.debug("Computed artifact hash: %s", artifact_hash)
    return artifact_hash
274+
234275
def get_package_page(self, package_name: str) -> str | None:
235276
"""Implement custom API to get package main page.
236277
@@ -430,15 +471,19 @@ def get_latest_version(self) -> str | None:
430471
"""
431472
return json_extract(self.package_json, ["info", "version"], str)
432473

433-
def get_sourcecode_url(self) -> str | None:
474+
def get_sourcecode_url(self, package_type: str = "sdist") -> str | None:
434475
"""Get the url of the source distribution.
435476
477+
Parameters
478+
----------
479+
package_type: str
480+
The package type to retrieve the URL of.
481+
436482
Returns
437483
-------
438484
str | None
439485
The URL of the source distribution.
440486
"""
441-
urls: list | None = None
442487
if self.component_version:
443488
urls = json_extract(self.package_json, ["releases", self.component_version], list)
444489
else:
@@ -447,7 +492,7 @@ def get_sourcecode_url(self) -> str | None:
447492
if not urls:
448493
return None
449494
for distribution in urls:
450-
if distribution.get("packagetype") != "sdist":
495+
if distribution.get("packagetype") != package_type:
451496
continue
452497
# We intentionally check if the url is None and use empty string if that's the case.
453498
source_url: str = distribution.get("url") or ""
@@ -497,3 +542,39 @@ def get_sourcecode(self) -> dict[str, str] | None:
497542
source_code: dict[str, str] | None = self.pypi_registry.fetch_sourcecode(url)
498543
return source_code
499544
return None
545+
546+
547+
def find_or_create_pypi_asset(
    asset_name: str, asset_version: str | None, pypi_registry_info: PackageRegistryInfo
) -> PyPIPackageJsonAsset | None:
    """Return the PyPI package JSON asset held by the registry information, creating it if absent.

    The first ``PyPIPackageJsonAsset`` found in ``pypi_registry_info.metadata`` is reused
    as-is; the passed name and version are only used when a new asset has to be created.
    A newly created asset starts with ``has_repository`` set to False and an empty
    ``package_json``, and is appended to ``pypi_registry_info.metadata`` before being returned.

    Parameters
    ----------
    asset_name: str
        The name of the asset.
    asset_version: str | None
        The version of the asset.
    pypi_registry_info: PackageRegistryInfo
        The package registry information to search, and to store a new asset in.

    Returns
    -------
    PyPIPackageJsonAsset | None
        The existing or newly created asset, or None if the registry information
        does not hold a PyPI registry.
    """
    # Look for the first asset of the right type that was stored previously.
    existing: PyPIPackageJsonAsset | None = None
    for entry in pypi_registry_info.metadata:
        if isinstance(entry, PyPIPackageJsonAsset):
            existing = entry
            break
    if existing:
        return existing

    registry = pypi_registry_info.package_registry
    if isinstance(registry, PyPIRegistry):
        created = PyPIPackageJsonAsset(
            component_name=asset_name,
            component_version=asset_version,
            has_repository=False,
            pypi_registry=registry,
            package_json={},
        )
        pypi_registry_info.metadata.append(created)
        return created

    # A new asset cannot be built without a PyPI registry to attach it to.
    logger.debug("Failed to create PyPIPackageJson asset.")
    return None

0 commit comments

Comments
 (0)