Skip to content

Commit

Permalink
Merge pull request #922 from ATIX-AG/fix_package_repo_uniqueness
Browse files Browse the repository at this point in the history
Fix package repo uniqueness

(cherry picked from commit 7ede5be)
  • Loading branch information
quba42 authored and patchback[bot] committed Nov 16, 2023
1 parent 5104382 commit ef1f458
Show file tree
Hide file tree
Showing 9 changed files with 246 additions and 39 deletions.
3 changes: 3 additions & 0 deletions CHANGES/921.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fixed repo uniqueness constraints.
Duplicate packages with identical checksums are now allowed.
In addition, duplicates are now also handled for the set of incoming content.
2 changes: 0 additions & 2 deletions pulp_deb/app/models/content/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,6 @@ def filename(self, component=""):
"{}.{}".format(self.name, self.SUFFIX),
)

repo_key_fields = ("package", "version", "architecture")

class Meta:
default_related_name = "%(app_label)s_%(model_name)s"
unique_together = (("relative_path", "sha256"),)
Expand Down
7 changes: 0 additions & 7 deletions pulp_deb/app/models/content/structure_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,6 @@ class PackageReleaseComponent(Content):
package = models.ForeignKey(Package, on_delete=models.CASCADE)
release_component = models.ForeignKey(ReleaseComponent, on_delete=models.CASCADE)

repo_key_fields = (
"release_component",
"package__package",
"package__version",
"package__architecture",
)

class Meta:
default_related_name = "%(app_label)s_%(model_name)s"
unique_together = (("package", "release_component"),)
Expand Down
117 changes: 107 additions & 10 deletions pulp_deb/app/models/repository.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
from django.db import models
from pulpcore.plugin.models import BaseModel, Repository
from pulpcore.plugin.repo_version_utils import remove_duplicates, validate_version_paths
from pulpcore.plugin.repo_version_utils import (
remove_duplicates,
validate_version_paths,
validate_duplicate_content,
)

from pulpcore.plugin.models import Content

from pulp_deb.app.models import (
AptReleaseSigningService,
Expand All @@ -20,6 +26,11 @@
SourcePackageReleaseComponent,
)

import logging
from gettext import gettext as _

log = logging.getLogger(__name__)


class AptRepository(Repository):
"""
Expand Down Expand Up @@ -91,17 +102,10 @@ def finalize_new_version(self, new_version):
finalize.
"""
from pulp_deb.app.tasks.exceptions import DuplicateDistributionException

handle_duplicate_packages(new_version)
validate_duplicate_content(new_version)
remove_duplicates(new_version)
validate_version_paths(new_version)
releases = new_version.get_content(Release.objects.all())
distributions = []
for release in releases:
distribution = release.distribution
if distribution in distributions:
raise DuplicateDistributionException(distribution)
distributions.append(distribution)


class AptRepositoryReleaseServiceOverride(BaseModel):
Expand All @@ -117,3 +121,96 @@ class AptRepositoryReleaseServiceOverride(BaseModel):

class Meta:
unique_together = (("repository", "release_distribution"),)


def handle_duplicate_packages(new_version):
    """
    pulpcore's remove_duplicates does not work for .deb packages, since identical duplicate
    packages (same sha256) are rare, but allowed, while duplicates with different sha256 are
    forbidden. As such we need our own version of this function for .deb packages. Since we are
    already building our own function, we will also be combining the functionality of pulpcore's
    remove_duplicates and validate_duplicate_content within this function.

    :param new_version: The RepositoryVersion being finalized. Raises ValueError if the content
        newly added to this version already contains forbidden duplicates among itself.
    """
    # Content newly added in this version, relative to its base version (if any).
    content_qs_added = new_version.added(base_version=new_version.base_version)
    if new_version.base_version:
        content_qs_existing = new_version.base_version.content
    else:
        try:
            content_qs_existing = new_version.previous().content
        except new_version.DoesNotExist:
            # This is the very first version of the repository; nothing pre-exists.
            content_qs_existing = Content.objects.none()
    # Both regular and installer packages are subject to the duplicate rules.
    package_types = {
        Package.get_pulp_type(): Package,
        InstallerPackage.get_pulp_type(): InstallerPackage,
    }
    # Two packages are "duplicates" if these fields all match but the sha256 differs.
    repo_key_fields = ("package", "version", "architecture")

    for pulp_type, package_obj in package_types.items():
        # First handle duplicates within the packages added to new_version
        package_qs_added = package_obj.objects.filter(
            pk__in=content_qs_added.filter(pulp_type=pulp_type)
        )
        added_unique = package_qs_added.distinct(*repo_key_fields)
        added_checksum_unique = package_qs_added.distinct(*repo_key_fields, "sha256")

        # If including sha256 in the distinct yields more rows, then some added packages share
        # the key fields while differing in checksum => forbidden duplicates were added.
        if added_unique.count() < added_checksum_unique.count():
            package_qs_added_duplicates = added_checksum_unique.difference(added_unique)
            if log.isEnabledFor(logging.DEBUG):
                message = _(
                    "New repository version contains multiple packages with '{}', but differing "
                    "checksum!"
                )
                for package_fields in package_qs_added_duplicates.values(*repo_key_fields):
                    log.debug(message.format(package_fields))

            message = _(
                "Cannot create repository version since there are newly added packages with the "
                "same name, version, and architecture, but a different checksum. If the log level "
                "is DEBUG, you can find a list of affected packages in the Pulp log."
            )
            raise ValueError(message)

        # Now remove existing packages that are duplicates of any packages added to new_version
        if package_qs_added.count() and content_qs_existing.count():
            # Batch the added packages to keep the size of the OR'ed lookup query bounded.
            for batch in batch_qs(package_qs_added.values(*repo_key_fields, "sha256")):
                find_dup_qs = models.Q()

                for content_dict in batch:
                    sha256 = content_dict.pop("sha256")
                    # Match packages with the same key fields but a different checksum.
                    item_query = models.Q(**content_dict) & ~models.Q(sha256=sha256)
                    find_dup_qs |= item_query

                package_qs_duplicates = (
                    package_obj.objects.filter(pk__in=content_qs_existing)
                    .filter(find_dup_qs)
                    .only("pk")
                )
                # Also remove the PackageReleaseComponents referencing the removed packages.
                # NOTE(review): PRC.package is a FK to Package; during the InstallerPackage
                # iteration this filter presumably matches nothing — confirm.
                prc_qs_duplicates = (
                    PackageReleaseComponent.objects.filter(pk__in=content_qs_existing)
                    .filter(package__in=package_qs_duplicates)
                    .only("pk")
                )
                if package_qs_duplicates.count():
                    message = _("Removing duplicates for type {} from new repo version.")
                    log.warning(message.format(pulp_type))
                    new_version.remove_content(package_qs_duplicates)
                    new_version.remove_content(prc_qs_duplicates)


# The following helper function is copy and pasted from pulpcore. As soon as
# https://github.com/pulp/pulpcore/issues/4607 is ready, we should replace it with an import!
def batch_qs(qs, batch_size=1000):
    """
    Yield successive slices of the given queryset, each at most batch_size entries long.

    Usage:
        # Make sure to order your queryset!
        article_qs = Article.objects.order_by('id')
        for qs in batch_qs(article_qs):
            for article in qs:
                print(article.body)
    """
    total = qs.count()
    start = 0
    while start < total:
        yield qs[start : min(start + batch_size, total)]
        start += batch_size
18 changes: 0 additions & 18 deletions pulp_deb/app/tasks/exceptions.py

This file was deleted.

132 changes: 132 additions & 0 deletions pulp_deb/tests/functional/api/test_duplicate_packages.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
"""Tests relating to duplicate package handling.
By "duplicate package" we mean two packages with the same Package, Version, and Architecture fields,
but different checksums. By contrast we refer to two packages with the same checksum (but stored at
a different relative_path within some repo) as two "identical packages". So defined, an APT repo may
contain identical packages, but may not contain any duplicates.
To ensure this is the case we use the handle_duplicate_packages function. As such, these tests are
primarily intended to test this function.
"""
import pytest
from uuid import uuid4

from pulpcore.tests.functional.utils import PulpTaskError

from pulp_deb.tests.functional.constants import DEB_PACKAGE_RELPATH
from pulp_deb.tests.functional.utils import (
get_counts_from_content_summary,
get_local_package_absolute_path,
)

DUPLICATE_PACKAGE_DIR = "data/packages/duplicates/" # below pulp_deb/tests/functional/


def test_upload_package_and_duplicate(
    apt_package_api,
    deb_get_content_summary,
    deb_get_repository_by_href,
    deb_package_factory,
    deb_repository_factory,
):
    """Test uploading a package to a repo, and then uploading its duplicate.

    The expectation is that uploading the duplicate will kick the older duplicate (along with
    the associated PackageReleaseComponent) out of the repo. Only the newer duplicate and its
    PRC will remain.
    """

    def _latest_added_sha256(repo):
        # Return the sha256 of the single package added in the repo's latest version.
        results = apt_package_api.list(
            repository_version_added=repo.latest_version_href, fields=["sha256"]
        ).results
        return results[0].sha256

    # Start from an empty repository.
    repository = deb_repository_factory()
    assert repository.latest_version_href.endswith("/0/")
    repository_href = repository.pulp_href

    # Upload a test package into a random distribution/component of the repo.
    upload_kwargs = {
        "file": get_local_package_absolute_path(DEB_PACKAGE_RELPATH),
        "relative_path": DEB_PACKAGE_RELPATH,
        "distribution": str(uuid4()),
        "component": str(uuid4()),
        "repository": repository_href,
    }
    deb_package_factory(**upload_kwargs)

    # The upload must have produced version 1, containing the package plus structure content.
    repository = deb_get_repository_by_href(repository_href)
    assert repository.latest_version_href.endswith("/1/")
    added_counts = get_counts_from_content_summary(deb_get_content_summary(repository).added)
    assert added_counts == {
        "deb.package": 1,
        "deb.package_release_component": 1,
        "deb.release_architecture": 1,
        "deb.release_component": 1,
    }
    package1_sha256 = _latest_added_sha256(repository)

    # Upload a duplicate of the first package (same fields, different checksum) into the repo.
    upload_kwargs["file"] = get_local_package_absolute_path(
        package_name=DEB_PACKAGE_RELPATH, relative_path=DUPLICATE_PACKAGE_DIR
    )
    deb_package_factory(**upload_kwargs)

    # Version 2 must swap the older duplicate (and its PRC) for the newer one.
    repository = deb_get_repository_by_href(repository_href)
    assert repository.latest_version_href.endswith("/2/")
    content_summary = deb_get_content_summary(repository)
    assert get_counts_from_content_summary(content_summary.added) == {
        "deb.package": 1,
        "deb.package_release_component": 1,
    }
    assert get_counts_from_content_summary(content_summary.removed) == {
        "deb.package": 1,
        "deb.package_release_component": 1,
    }
    package2_sha256 = _latest_added_sha256(repository)
    assert package1_sha256 != package2_sha256


def test_add_duplicates_to_repo(
    deb_modify_repository,
    deb_package_factory,
    deb_repository_factory,
):
    """Test adding two duplicate packages to a repository in a single modify action.

    The expectation is that this will raise a ValueError.
    """
    # Upload two packages sharing name/version/architecture, but with differing checksums.
    duplicate_file = get_local_package_absolute_path(
        package_name=DEB_PACKAGE_RELPATH, relative_path=DUPLICATE_PACKAGE_DIR
    )
    original_file = get_local_package_absolute_path(DEB_PACKAGE_RELPATH)
    content_hrefs = [
        deb_package_factory(file=path, relative_path=DEB_PACKAGE_RELPATH).pulp_href
        for path in (duplicate_file, original_file)
    ]

    # Start from an empty repository.
    repository = deb_repository_factory()
    assert repository.latest_version_href.endswith("/0/")

    # Adding both duplicates in one modify action must fail the new version's validation.
    with pytest.raises(PulpTaskError) as exception:
        deb_modify_repository(repository, {"add_content_units": content_hrefs})

    # Assert the error message.
    assert "Cannot create repository version since there are newly added packages with" in str(
        exception.value
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
This is a duplicate of the package with a different checksum!
Binary file not shown.
5 changes: 3 additions & 2 deletions pulp_deb/tests/functional/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,16 @@ def gen_deb_remote_verbose(url=None, remove_policy=False):
return data


def get_local_package_absolute_path(package_name, relative_path="data/packages/"):
    """Look up a local test package and return its absolute path.

    The package is expected at relative_path below the pulp_deb/tests/functional/ directory
    (i.e. the directory containing this module); by default 'data/packages/'.

    :param package_name: Name of the package to look up.
    :param relative_path: The relative path of the directory below pulp_deb/tests/functional/.
    :returns: The absolute path to the package.
    """
    p = Path(__file__).parent.absolute()
    return p.joinpath(relative_path).joinpath(package_name)


def gen_distribution(**kwargs):
Expand Down

0 comments on commit ef1f458

Please sign in to comment.