Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

merge-cachi2-sboms: re-generate test data, fix uncovered bugs #204

Merged
merged 6 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sbom-utility-scripts/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ WORKDIR /scripts

COPY scripts/merge_syft_sboms.py /scripts
COPY scripts/merge-cachi2-sboms-script/merge_cachi2_sboms.py /scripts
COPY scripts/merge-cachi2-sboms-script/requirements.txt /scripts/merge-cachi2-sboms-script-requirements.txt
COPY scripts/base-images-sbom-script/app/base_images_sbom_script.py /scripts
COPY scripts/base-images-sbom-script/app/requirements.txt /scripts/base-images-sbom-script-requirements.txt
COPY scripts/index-image-sbom-script/requirements.txt /scripts/index-image-sbom-script-requirements.txt
Expand All @@ -13,6 +14,7 @@ COPY scripts/add-image-reference-script/add_image_reference.py /scripts
COPY scripts/add-image-reference-script/requirements.txt /scripts/add-image-reference-requirements.txt

RUN pip3 install --no-cache-dir \
-r merge-cachi2-sboms-script-requirements.txt \
-r base-images-sbom-script-requirements.txt \
-r index-image-sbom-script-requirements.txt \
-r add-image-reference-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,23 +1,73 @@
#!/usr/bin/env python3
import json
from argparse import ArgumentParser
from typing import Any, Callable
from urllib.parse import quote_plus, urlsplit
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Protocol, Sequence
from urllib.parse import quote_plus

tnevrlka marked this conversation as resolved.
Show resolved Hide resolved
from packageurl import PackageURL

def _is_syft_local_golang_component(component: dict) -> bool:

def try_parse_purl(s: str) -> PackageURL | None:
    """Parse *s* as a package URL.

    Returns the parsed ``PackageURL``, or ``None`` when ``PackageURL.from_string``
    rejects the string with a ``ValueError``.
    """
    try:
        parsed = PackageURL.from_string(s)
    except ValueError:
        parsed = None
    return parsed


class SBOMItem(Protocol):
    """Structural interface for an SBOM entry exposing a name, a version and a purl."""

    def name(self) -> str:
        """Return the item's name."""
        ...

    def version(self) -> str:
        """Return the item's version string."""
        ...

    def purl(self) -> PackageURL | None:
        """Return the item's package URL, or None when none is available."""
        ...


@dataclass
class CDXComponent:
    """Wrap a raw component dictionary and expose it through the SBOMItem accessors."""

    # The underlying component dictionary; it is read, never copied.
    data: dict[str, Any]

    def name(self) -> str:
        """Return the ``name`` entry (a missing key raises ``KeyError``)."""
        return self.data["name"]

    def version(self) -> str:
        """Return the ``version`` entry, or an empty string when it is missing or falsy."""
        declared = self.data.get("version")
        return declared if declared else ""

    def purl(self) -> PackageURL | None:
        """Return the parsed ``purl`` entry, or None when it is absent, empty or unparseable."""
        raw = self.data.get("purl")
        if not raw:
            return None
        return try_parse_purl(raw)


def wrap_as_cdx(items: list[dict[str, Any]]) -> list[CDXComponent]:
    """Wrap each raw component dictionary in a ``CDXComponent``, preserving order."""
    return [CDXComponent(item) for item in items]


def unwrap_from_cdx(items: list[CDXComponent]) -> list[dict[str, Any]]:
    """Return the underlying ``data`` dictionaries of *items*, preserving order."""
    raw_components = []
    for wrapped in items:
        raw_components.append(wrapped.data)
    return raw_components


def _subpath_is_version(subpath: str) -> bool:
# pkg:golang/github.com/cachito-testing/[email protected]#terminaltor -> subpath is a subpath
# pkg:golang/github.com/cachito-testing/[email protected]#v2 -> subpath is a version. Thanks, Syft.
return subpath.startswith("v") and subpath.removeprefix("v").isdecimal()


def _is_syft_local_golang_component(component: SBOMItem) -> bool:
"""
Check if a Syft Golang reported component is a local replacement.

Local replacements are reported in a very different way by Cachi2, which is why the same
reports by Syft should be removed.
"""
return component.get("purl", "").startswith("pkg:golang") and (
component.get("name", "").startswith(".") or component.get("version", "") == "(devel)"
)
purl = component.purl()
if not purl or purl.type != "golang":
return False
if (subpath := purl.subpath) and not _subpath_is_version(subpath):
return True
return component.name().startswith(".") or component.version() == "(devel)"


def _is_cachi2_non_registry_dependency(component: dict) -> bool:
def _is_cachi2_non_registry_dependency(component: SBOMItem) -> bool:
"""
Check if Cachi2 component was fetched from a VCS or a direct file location.

Expand All @@ -30,26 +80,29 @@ def _is_cachi2_non_registry_dependency(component: dict) -> bool:

Note that this function is only applicable for PyPI or NPM components.
"""
purl = component.get("purl", "")
purl = component.purl()
if not purl:
return False

return (purl.startswith("pkg:pypi") or purl.startswith("pkg:npm")) and (
"vcs_url=" in purl or "download_url=" in purl
)
qualifiers = purl.qualifiers or {}
return purl.type in ("pypi", "npm") and ("vcs_url" in qualifiers or "download_url" in qualifiers)


def _unique_key_cachi2(component: dict) -> str:
def _unique_key_cachi2(component: SBOMItem) -> str:
"""
Create a unique key from Cachi2 reported components.

This is done by taking a purl and removing any qualifiers and subpaths.

See https://github.com/package-url/purl-spec/tree/master#purl for more info on purls.
"""
url = urlsplit(component["purl"])
return url.scheme + ":" + url.path
purl = component.purl()
if not purl:
raise ValueError(f"cachi2 component with no purl? name={component.name()}, version={component.version()}")
return purl._replace(qualifiers=None, subpath=None).to_string()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just FYI: NamedTuple methods don't follow the same naming scheme as most other Python classes. Despite starting with an underscore, the `_replace` method is public API: https://docs.python.org/3/library/collections.html#collections.somenamedtuple._replace

(PackageURL is a NamedTuple)



def _unique_key_syft(component: dict) -> str:
def _unique_key_syft(component: SBOMItem) -> str:
"""
Create a unique key for Syft reported components.

Expand All @@ -60,24 +113,29 @@ def _unique_key_syft(component: dict) -> str:

If a Syft component lacks a purl (e.g. type OS), we'll use its name and version instead.
"""
if "purl" not in component:
return component.get("name", "") + "@" + component.get("version", "")
purl = component.purl()
if not purl:
return component.name() + "@" + component.version()

if "@" in component["purl"]:
name, version = component["purl"].split("@")
name = purl.name
version = purl.version
subpath = purl.subpath

if name.startswith("pkg:pypi"):
name = name.lower()
if purl.type == "pypi":
name = name.lower()

if name.startswith("pkg:golang"):
if purl.type == "golang":
if version:
version = quote_plus(version)
if subpath and _subpath_is_version(subpath):
# put the module version where it belongs (in the module name)
name = f"{name}/{subpath}"
subpath = None

return f"{name}@{version}"
else:
return component["purl"]
return purl._replace(name=name, version=version, subpath=subpath).to_string()


def _get_syft_component_filter(cachi_sbom_components: list[dict[str, Any]]) -> Callable:
def _get_syft_component_filter(cachi_sbom_components: Sequence[SBOMItem]) -> Callable[[SBOMItem], bool]:
"""
Get a function that filters out Syft components for the merged SBOM.

Expand All @@ -94,20 +152,32 @@ def _get_syft_component_filter(cachi_sbom_components: list[dict[str, Any]]) -> C
given that it scans all the source code properly and the image is built hermetically.
"""
cachi2_non_registry_components = [
component["name"] for component in cachi_sbom_components if _is_cachi2_non_registry_dependency(component)
component.name() for component in cachi_sbom_components if _is_cachi2_non_registry_dependency(component)
]
cachi2_local_paths = {
Path(subpath) for component in cachi_sbom_components if (purl := component.purl()) and (subpath := purl.subpath)
}

cachi2_indexed_components = {_unique_key_cachi2(component): component for component in cachi_sbom_components}

def is_duplicate_non_registry_component(component: dict[str, Any]) -> bool:
return component["name"] in cachi2_non_registry_components
def is_duplicate_non_registry_component(component: SBOMItem) -> bool:
return component.name() in cachi2_non_registry_components

def is_duplicate_npm_localpath_component(component: SBOMItem) -> bool:
    """Check whether an npm component's namespace/name matches a Cachi2 purl subpath."""
    parsed = component.purl()
    if parsed and parsed.type == "npm":
        # Instead of reporting path dependencies as pkg:npm/name@version?...#subpath,
        # Syft reports them as pkg:npm/subpath@version, so the local path ends up
        # in the purl's namespace and name.
        reported_path = Path(parsed.namespace or "", parsed.name)
        return reported_path in cachi2_local_paths
    return False

def component_is_duplicated(component: dict[str, Any]) -> bool:
def component_is_duplicated(component: SBOMItem) -> bool:
key = _unique_key_syft(component)

return (
_is_syft_local_golang_component(component)
or is_duplicate_non_registry_component(component)
or is_duplicate_npm_localpath_component(component)
or key in cachi2_indexed_components.keys()
)

Expand Down Expand Up @@ -148,6 +218,13 @@ def _merge_tools_metadata(syft_sbom: dict[Any, Any], cachi2_sbom: dict[Any, Any]
)


def merge_components[T: SBOMItem](cachi2_components: Sequence[T], syft_components: Sequence[T]) -> list[T]:
is_duplicate_component = _get_syft_component_filter(cachi2_components)
merged = [c for c in syft_components if not is_duplicate_component(c)]
merged += cachi2_components
return merged


def merge_sboms(cachi2_sbom_path: str, syft_sbom_path: str) -> str:
"""Merge Cachi2 components into the Syft SBOM while removing duplicates."""
with open(cachi2_sbom_path) as file:
Expand All @@ -156,11 +233,11 @@ def merge_sboms(cachi2_sbom_path: str, syft_sbom_path: str) -> str:
with open(syft_sbom_path) as file:
syft_sbom = json.load(file)

is_duplicate_component = _get_syft_component_filter(cachi2_sbom["components"])

filtered_syft_components = [c for c in syft_sbom.get("components", []) if not is_duplicate_component(c)]
cachi2_components = wrap_as_cdx(cachi2_sbom["components"])
syft_components = wrap_as_cdx(syft_sbom.get("components", []))
merged = merge_components(cachi2_components, syft_components)

syft_sbom["components"] = filtered_syft_components + cachi2_sbom["components"]
syft_sbom["components"] = unwrap_from_cdx(merged)

_merge_tools_metadata(syft_sbom, cachi2_sbom)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
packageurl-python
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#
# This file is autogenerated by pip-compile with Python 3.11
# by the following command:
#
# pip-compile --generate-hashes --output-file=requirements.txt requirements.in
#
packageurl-python==0.15.0 \
--hash=sha256:cdc6bd42dc30c4fc7f8f0ccb721fc31f8c33985dbffccb6e6be4c72874de48ca \
--hash=sha256:f219b2ce6348185a27bd6a72e6fdc9f984e6c9fa157effa7cb93e341c49cdcc2
# via -r requirements.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/bin/bash
set -o errexit -o nounset -o pipefail -o xtrace

# This script was used to generate the input SBOMs in this directory:
#   - cachi2.bom.json
#   - syft.bom.json
#
# Hopefully you won't need to run this script again, but if you do, you need:
#   - cachi2 (https://github.com/containerbuildsystem/cachi2/blob/main/CONTRIBUTING.md#virtual-environment)
#   - syft (https://github.com/anchore/syft/releases)
#       - preferably at the version used by the tasks in https://github.com/konflux-ci/build-definitions
#   - git and jq
#
# It will generate cachi2 and syft SBOMs for a few sample repositories (and one
# container image, for syft) and assemble them into a merged cachi2 SBOM and a
# merged syft SBOM. You can then test the merge_cachi2_sboms.py script by merging
# the cachi2 SBOM with the syft SBOM.

# Directory containing this script; the final SBOMs are written here.
testdata_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)

# This can't actually be in /tmp! Until v1.6.0, syft had a bug where directory scanning
# didn't work at all if the directory was in /tmp
temp_workdir=$(realpath ./assemble-sboms)
mkdir -p "$temp_workdir"
# Remove the scratch directory (clones and intermediate SBOMs) on any exit.
trap 'rm -rf "$temp_workdir"' EXIT

cd "$temp_workdir"
mkdir cachi2-sboms
mkdir syft-sboms

# NOTE(review): the syft output format below is assumed to be CycloneDX JSON 1.5;
# confirm it matches the format used by the build-definitions tasks.

git clone https://github.com/cachito-testing/gomod-pandemonium
(
    cd gomod-pandemonium

    syft dir:. -o cyclonedx-json@1.5 > "$temp_workdir/syft-sboms/gomod-pandemonium.bom.json"

    cachi2 fetch-deps '[
        {"type": "gomod"},
        {"type": "gomod", "path": "terminaltor"},
        {"type": "gomod", "path": "weird"}
    ]'
    cp cachi2-output/bom.json "$temp_workdir/cachi2-sboms/gomod-pandemonium.bom.json"
)

git clone https://github.com/cachito-testing/pip-e2e-test
(
    cd pip-e2e-test

    syft dir:. -o cyclonedx-json@1.5 > "$temp_workdir/syft-sboms/pip-e2e-test.bom.json"

    cachi2 fetch-deps pip
    cp cachi2-output/bom.json "$temp_workdir/cachi2-sboms/pip-e2e-test.bom.json"
)

git clone https://github.com/cachito-testing/npm-cachi2-smoketest --branch lockfile-v3
(
    cd npm-cachi2-smoketest

    syft dir:. -o cyclonedx-json@1.5 > "$temp_workdir/syft-sboms/npm-cachi2-smoketest.bom.json"

    cachi2 fetch-deps npm
    cp cachi2-output/bom.json "$temp_workdir/cachi2-sboms/npm-cachi2-smoketest.bom.json"
)

# Container image scanned by syft only; pinned by digest.
ubi_micro=registry.access.redhat.com/ubi9/ubi-micro:9.5@sha256:a22fffe0256af00176c8b4f22eec5d8ecb1cb1684d811c33b1f2832fd573260f
syft image:"$ubi_micro" -o cyclonedx-json@1.5 > "$temp_workdir/syft-sboms/ubi-micro.bom.json"

# Re-emit the cachi2 CycloneDX SBOM read from stdin with object keys sorted.
postprocess_cachi2_cyclonedx() {
    # Pass the identity filter explicitly: jq's implicit default program depends
    # on the jq version and on whether stdin/stdout are terminals.
    jq --sort-keys .
}

# Re-emit the syft CycloneDX SBOM read from stdin with object keys sorted and
# with .metadata.timestamp and .serialNumber overwritten by fixed values.
postprocess_syft_cyclonedx() {
# These change every time. Set them to a hardcoded value to avoid unnecessary changes
# when re-running this script.
jq --sort-keys '
.metadata.timestamp = "2024-12-18T11:08:00+01:00" |
.serialNumber = "urn:uuid:1d823647-6b64-41b3-a29b-1d09cfb3ba8a"
'
}

# Merge the per-repository cachi2 SBOMs into a single normalized document.
cachi2 merge-sboms "$temp_workdir/cachi2-sboms"/* |
    postprocess_cachi2_cyclonedx > "$testdata_dir/cachi2.bom.json"

# Have syft's sbom-cataloger read the per-source syft SBOMs and emit one combined SBOM.
# NOTE(review): output format assumed to be CycloneDX JSON 1.5; confirm it matches
# the format used when generating the per-source SBOMs.
syft ./syft-sboms --select-catalogers=+sbom-cataloger -o cyclonedx-json@1.5 |
    postprocess_syft_cyclonedx > "$testdata_dir/syft.bom.json"
Loading
Loading