Skip to content

Commit

Permalink
Deduplicate sources
Browse files Browse the repository at this point in the history
STONEBLD-1831

If a parent image is built by Konflux, the sources it introduces may
duplicate the prefetched sources gathered during the local source
build. The deduplication process compares the parent sources and the
local build sources layer by layer. Once two layers have the same blob
file and artifact name, they are treated as the same, and the layer is
removed from the local source build. The same layer introduced by the
parent will eventually be merged into the source container image.

Major changes:

* Add a new class BSILayer that wraps Layer to handle the layer archive
  generated by BuildSourceImage and implements a few methods for
  duplication check.
* Add a new method to deduplicate the sources.
* Tests are updated with real tar archives.
* Code shared by tests is moved to test_utils.py. This file is intended
  to hold shared code only and must not include any test code.

Side effect:

* Extra source tar archives with the same name may remain. For instance,
  parent sources have extra-src-[0-3].tar and local source build has
  extra-src-[01].tar. When extra-src-1.tar duplicates the parent source
  extra-src-2.tar and is removed, then the final source image will have
  two layers including extra-src-0.tar.

  Whether this is a real problem in practice depends on how the sources
  are extracted. It should work well for extracting sources one after
  another. For the parallel extraction, separate working directory per
  layer should also work.

Signed-off-by: Chenxiong Qi <[email protected]>
  • Loading branch information
tkdchen committed Apr 22, 2024
1 parent 1244277 commit 3f61574
Show file tree
Hide file tree
Showing 4 changed files with 729 additions and 73 deletions.
233 changes: 221 additions & 12 deletions source-container-build/app/source_build.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,28 @@
#!/usr/bin/python3.11

import argparse
import filetype
import functools
import json
import hashlib
import itertools
import json
import logging
import os
import re
import shutil
import logging
import stat
import sys
import tarfile
import tempfile
import filetype
import hashlib

from dataclasses import dataclass, field
from pathlib import Path
from subprocess import run
from tarfile import TarInfo
from typing import Any, TypedDict, NotRequired, Literal, Final
from urllib.parse import urlparse


"""
Requires: git, skopeo, tar, BuildSourceImage
"""
Expand All @@ -40,6 +45,8 @@

MAX_RETRIES: Final = 5

StrPath = str | os.PathLike


class BuildResult(TypedDict):
status: Literal["failure", "success"]
Expand Down Expand Up @@ -575,7 +582,7 @@ def save(self) -> "Blob":


class Layer(Blob):
"""Represent an image layer. Currently, no operation on the layers."""
"""Represent an image layer"""


class JSONBlob(Blob):
Expand Down Expand Up @@ -639,6 +646,43 @@ def prepend_layer(self, layer: Layer) -> None:
layers: list[DescriptorT] = self.to_python["layers"]
layers.insert(0, layer.descriptor)

def _find_layer(self, layer: Layer) -> int:
    """Locate a layer's descriptor in the raw JSON manifest.

    The lookup compares the layer's descriptor against every entry of the
    manifest's ``layers`` list and stops at the first equal entry.

    :param layer: the layer to look up.
    :type layer: Layer
    :return: the position of the first matching entry in ``layers``, or
        ``-1`` when no entry matches.
    :rtype: int
    """
    matching_positions = (
        position
        for position, descriptor in enumerate(self.to_python["layers"])
        if descriptor == layer.descriptor
    )
    # -1 is the "not found" sentinel callers test for with ``idx < 0``.
    return next(matching_positions, -1)

def remove_layer(self, layer: Layer) -> tuple[DescriptorT, str, HistoryT]:
    """Remove a layer from this manifest and from disk.

    The layer descriptor is removed from this manifest, the diff_id and
    history entry at the same position are removed from the config, and
    the layer's blob file is deleted.

    Everything that can fail on a lookup is read before the first
    destructive step, so a malformed config (fewer diff_ids or history
    entries than layers) is reported without the blob file having been
    deleted.

    :param layer: remove this layer.
    :type layer: Layer
    :return: a 3-element tuple describing the removed layer: the
        descriptor, the diff_id and the history entry.
    :rtype: tuple[dict, str, dict]
    :raises ValueError: if the layer's descriptor is not in this manifest.
    :raises IndexError: if the config has no diff_id or history entry at
        the layer's position.
    """
    idx = self._find_layer(layer)
    if idx < 0:
        digest = layer.descriptor["digest"]
        raise ValueError(f"Layer with digest {digest} does not exist")

    # Gather the return values first; these lookups may raise and must do
    # so before anything is deleted.
    # NOTE(review): this assumes history entries line up one-to-one with
    # layers (no ``empty_layer`` entries) -- confirm for non-BSI images.
    descriptor = layer.descriptor
    diff_id = self.config.diff_ids[idx]
    history = self.config.history[idx]

    # Delete the blob before touching in-memory state, so a failed unlink
    # leaves manifest and config untouched.
    layer.path.unlink()

    del self.to_python["layers"][idx]
    del self.config.diff_ids[idx]
    del self.config.history[idx]
    return descriptor, diff_id, history

def save(self) -> Blob:
"""Save this manifest"""

Expand All @@ -648,15 +692,16 @@ def save(self) -> Blob:

layer_descriptors: list[DescriptorT] = self.to_python["layers"]
for layer in self.layers:
idx = self._find_layer(layer)
if idx < 0:
# deleted already, do nothing.
continue
if not layer.path.exists():
raise ValueError(f"layer {str(layer.path)} does not exist.")
new_layer = layer.save()
if new_layer == layer:
continue
for idx, d in enumerate(layer_descriptors):
if d["digest"] == layer.descriptor["digest"]:
layer_descriptors[idx] = new_layer.descriptor
layer.delete()
if new_layer != layer:
layer_descriptors[idx] = new_layer.descriptor
layer.delete()

return super().save()

Expand Down Expand Up @@ -701,7 +746,7 @@ def save(self) -> None:
class OCIImage:
"""Represent an OCI image"""

def __init__(self, path: str | os.PathLike):
def __init__(self, path: StrPath):
"""Initialize this OCI image object
:param path: a path to an OCI image.
Expand All @@ -720,6 +765,126 @@ def index(self) -> Index:
return self._index


class BSILayer:
"""Wrapper of a layer generated by BuildSourceImage for equality determination"""

def __init__(self, layer: Layer) -> None:
self._layer = layer
self._symlink_member: TarInfo | None = None
self._blob_member: TarInfo | None = None
self._extra_source: TarInfo | None = None
self._extract()
self._check()

@property
def symlink_member(self) -> TarInfo:
"""Return symlink member
:return: a ``tarfile.TarInfo`` representing the symlink member. None is
returned, if no such member is found from a layer.
"""
return self._symlink_member # type: ignore

@property
def blob_member(self) -> TarInfo:
"""Return blob member
:return: a ``tarfile.TarInfo`` representing the blob member. None is
returned, if no such member is found from a layer.
"""
return self._blob_member # type: ignore

@property
def extra_source(self) -> TarInfo:
"""Return included extra source"""
return self._extra_source # type: ignore

@staticmethod
def is_extra_src(member: TarInfo) -> bool:
"""Check if an archive member is a link of extra source archive
Example arcname: ./extra_src_dir/extra-src-100.tar
"""
dirname, basename = os.path.split(member.name)
regex: Final = r"^extra-src-\d+\.tar$"
return (
member.issym()
and dirname == "./extra_src_dir"
and re.match(regex, basename) is not None
)

@staticmethod
def is_rpm_src(member: TarInfo) -> bool:
"""Check if an archive member is a link of RPM source
Example arcname: ./rpm_dir/foo-1.0.src.rpm
"""
dirname, basename = os.path.split(member.name)
return member.issym() and dirname == "./rpm_dir" and basename.endswith(".src.rpm")

def _is_blob_file(self, member: TarInfo) -> bool:
"""Check if an archive member is a blob file"""
regex: Final = r"\./blobs/sha256/[0-9a-f]+"
return member.isreg() and re.fullmatch(regex, member.name) is not None

def _extract(self) -> None:
"""Extract symlink and blob members"""
with tarfile.open(self._layer.path, "r") as tar:
for member in tar:
if self.is_rpm_src(member):
self._symlink_member = member
elif self.is_extra_src(member):
self._symlink_member = member
fo = tar.extractfile(member)
try:
with tarfile.open(fileobj=fo, mode="r") as extra_src_tar:
files = [m for m in extra_src_tar.getmembers() if m.isreg()]
self._extra_source = files[0]
finally:
fo.close() # type: ignore
elif self._is_blob_file(member):
self._blob_member = member

def _check(self) -> None:
"""Check if expected members are found
A layer generated by BSI must have symlink and blob members, and the
symlink member links to the blob one.
"""
err_prefix = "Invalid layer generated by BuildSourceImage."
if self.symlink_member is None:
raise ValueError(f"{err_prefix} No symlink member is found.")
if self.is_extra_src(self.symlink_member) and self.extra_source is None:
raise ValueError(f"{err_prefix} Missing extra source.")
if self.blob_member is None:
raise ValueError(f"{err_prefix} No blob member is found.")

dir_name, _ = os.path.split(self.symlink_member.name)
normalized_link_path = os.path.normpath(
os.path.join(dir_name, self.symlink_member.linkname)
)
if normalized_link_path != os.path.normpath(self.blob_member.name):
raise ValueError(
f"{err_prefix} Symlink {self.symlink_member.name} does not link to the blob."
)

def hash_key(self):
if self.extra_source:
artifact_name = self.extra_source.name
else:
artifact_name = self.symlink_member.name
return self.blob_member.name, artifact_name

def __hash__(self):
return hash(self.hash_key())

def __eq__(self, other: object) -> bool:
"""Check if this layer contains same content of the other"""
if not isinstance(other, BSILayer):
return False
return hash(self) == hash(other)


def merge_image(parent_sources_dir: str, local_source_build: str) -> None:
"""Merge parent sources into the local source build
Expand Down Expand Up @@ -761,6 +926,47 @@ def merge_image(parent_sources_dir: str, local_source_build: str) -> None:
local_build.index.save()


def deduplicate_sources(parent_sources_dir: StrPath, image_output_dir: StrPath) -> None:
    """Remove duplicate sources from local build

    BuildSourceImage generates a layer blob from a tar archive that consists
    of a symlink and the real SRPM or extra source tar archive it links to.
    For example:

        ./blobs/sha256/checksum-computed-from-SRPM-file
        ./rpm_dir/requests-version.src.rpm

    If a local layer and a parent layer have the same artifact name and the
    same blob filename, they are treated as the same and the local layer is
    removed. Note, the comparison is not based on the layer digest.

    Every local layer that duplicates a parent layer is removed, not just
    the first one found.

    :param parent_sources_dir: parent sources are downloaded into this directory.
    :param image_output_dir: local source build output directory.
    """

    parent_source_image = OCIImage(parent_sources_dir)
    local_source_build = OCIImage(image_output_dir)

    # Only the first manifest of each image index is considered.
    parent_image_manifest = parent_source_image.index.manifests()[0]
    local_build_manifest = local_source_build.index.manifests()[0]

    parent_bsi_layers = {BSILayer(layer) for layer in parent_image_manifest.layers}

    # Iterate over a snapshot because remove_layer() mutates the manifest
    # while the loop is running.
    for local_build_layer in list(local_build_manifest.layers):
        if BSILayer(local_build_layer) not in parent_bsi_layers:
            continue
        descriptor, diff_id, history = local_build_manifest.remove_layer(local_build_layer)
        logger.debug(
            "parent sources include source %r, remove it from local source build. "
            "diff_id: %s, history: %r",
            descriptor,
            diff_id,
            history,
        )

    local_source_build.index.save()


def build(args) -> BuildResult:
build_result: BuildResult = {
"status": "success",
Expand Down Expand Up @@ -820,8 +1026,11 @@ def build(args) -> BuildResult:

image_output_dir = build_source_image_in_local(args.bsi, work_dir, sib_dirs)
if parent_sources_dir:
if build_result["dependencies_included"]:
deduplicate_sources(parent_sources_dir, image_output_dir)
merge_image(parent_sources_dir, image_output_dir)
build_result["base_image_source_included"] = True

image_digest = push_to_registry(image_output_dir, dest_images)
build_result["image_digest"] = image_digest
return build_result
Expand Down
Loading

0 comments on commit 3f61574

Please sign in to comment.