From 45098d95c988450883419d7a3a805878c048b101 Mon Sep 17 00:00:00 2001 From: Quirin Pamp Date: Mon, 23 Oct 2023 15:54:04 +0200 Subject: [PATCH] Fix duplicate package handling closes #921 As a result of the behaviour fixes, we can also drop the now superfluous duplicate distribution checking. We are using validate_duplicate_content to catch incoming duplicates, and we are removing old duplicates, so it is not possible to run into this error. As a result it is not worth the performance cost to check for it. --- CHANGES/921.bugfix | 3 + pulp_deb/app/models/content/content.py | 2 - .../app/models/content/structure_content.py | 7 -- pulp_deb/app/models/repository.py | 116 ++++++++++++++++-- pulp_deb/app/tasks/exceptions.py | 18 --- 5 files changed, 109 insertions(+), 37 deletions(-) create mode 100644 CHANGES/921.bugfix delete mode 100644 pulp_deb/app/tasks/exceptions.py diff --git a/CHANGES/921.bugfix b/CHANGES/921.bugfix new file mode 100644 index 000000000..0a539c9df --- /dev/null +++ b/CHANGES/921.bugfix @@ -0,0 +1,3 @@ +Fixed repo uniqueness constraints. +Duplicate packages with identical checksums are now allowed. +In addition, duplicates are now also handled for the set of incoming content. diff --git a/pulp_deb/app/models/content/content.py b/pulp_deb/app/models/content/content.py index 9f7cf5a59..ab58790c5 100644 --- a/pulp_deb/app/models/content/content.py +++ b/pulp_deb/app/models/content/content.py @@ -89,8 +89,6 @@ def filename(self, component=""): "{}.{}".format(self.name, self.SUFFIX), ) - repo_key_fields = ("package", "version", "architecture") - class Meta: default_related_name = "%(app_label)s_%(model_name)s" unique_together = (("relative_path", "sha256"),) diff --git a/pulp_deb/app/models/content/structure_content.py b/pulp_deb/app/models/content/structure_content.py index 30616fc30..7e3c095f6 100644 --- a/pulp_deb/app/models/content/structure_content.py +++ b/pulp_deb/app/models/content/structure_content.py @@ -91,13 +91,6 @@ class PackageReleaseComponent(Content): package = models.ForeignKey(Package, on_delete=models.CASCADE) release_component = models.ForeignKey(ReleaseComponent, on_delete=models.CASCADE) - repo_key_fields = ( - "release_component", - "package__package", - "package__version", - "package__architecture", - ) - class Meta: default_related_name = "%(app_label)s_%(model_name)s" unique_together = (("package", "release_component"),) diff --git a/pulp_deb/app/models/repository.py b/pulp_deb/app/models/repository.py index 7ed0ff8af..2d289be37 100644 --- a/pulp_deb/app/models/repository.py +++ b/pulp_deb/app/models/repository.py @@ -1,6 +1,12 @@ from django.db import models from pulpcore.plugin.models import BaseModel, Repository -from pulpcore.plugin.repo_version_utils import remove_duplicates, validate_version_paths +from pulpcore.plugin.repo_version_utils import ( + remove_duplicates, + validate_version_paths, + validate_duplicate_content, +) + +from pulpcore.plugin.models import Content from pulp_deb.app.models import ( AptReleaseSigningService, @@ -20,6 +26,11 @@ SourcePackageReleaseComponent, ) +import logging +from gettext import gettext as _ + +log = logging.getLogger(__name__) + class AptRepository(Repository): """ @@ -91,17 +102,10 @@ def finalize_new_version(self, new_version): finalize. """ - from pulp_deb.app.tasks.exceptions import DuplicateDistributionException - + handle_duplicate_packages(new_version) + validate_duplicate_content(new_version) remove_duplicates(new_version) validate_version_paths(new_version) - releases = new_version.get_content(Release.objects.all()) - distributions = [] - for release in releases: - distribution = release.distribution - if distribution in distributions: - raise DuplicateDistributionException(distribution) - distributions.append(distribution) class AptRepositoryReleaseServiceOverride(BaseModel): @@ -117,3 +121,95 @@ class AptRepositoryReleaseServiceOverride(BaseModel): class Meta: unique_together = (("repository", "release_distribution"),) + + +def handle_duplicate_packages(new_version): + """ + pulpcore's remove_duplicates does not work for .deb packages, since identical duplicate + packages (same sha256) are rare, but allowed, while duplicates with different sha256 are + forbidden. As such we need our own version of this function for .deb packages. Since we are + already building our own function, we will also be combining the functionality of pulpcore's + remove_duplicates and validate_duplicate_content within this function. + """ + package_types = { + Package.get_pulp_type(): Package, + InstallerPackage.get_pulp_type(): InstallerPackage, + } + repo_key_fields = ("package", "version", "architecture") + content_qs_added = new_version.added(base_version=new_version.base_version) + if new_version.base_version: + content_qs_existing = new_version.base_version.content + else: + try: + content_qs_existing = new_version.previous().content + except new_version.DoesNotExist: + content_qs_existing = Content.objects.none() + + for pulp_type, package_obj in package_types.items(): + # Prepare some qerry sets + content_qs_added_packages = content_qs_added.filter(pulp_type=pulp_type) + package_qs_added = package_obj.objects.filter(pk__in=content_qs_added_packages) + added_packages_count = package_qs_added.count() + + # First handle duplicates within the packages added to new_version + added_unique = package_qs_added.distinct(*repo_key_fields) + added_checksum_unique = package_qs_added.distinct(*repo_key_fields, "sha256") + + if added_unique.count() < added_checksum_unique.count(): + package_qs_added_duplicates = added_checksum_unique.difference(added_unique) + if log.isEnabledFor(logging.DEBUG): + message = _( + "New repository version contains multiple packages with '{}', but differing " + "checksum!" + ) + for package_fields in package_qs_added_duplicates.values(*repo_key_fields): + log.debug(message.format(pulp_type, package_fields)) + + message = _( + "Cannot create repository version since there are newly added packages with the " + "same name, version, and architecture, but a different checksum. If the log level " + "is DEBUG, you can find a list of affected packages in the Pulp log." + ) + raise ValueError(message) + + # Now remove existing packages that are duplicates of any packages added to new_version + if added_packages_count and content_qs_existing.count(): + log.debug(_("Removing duplicates for type: {}".format(pulp_type))) + + for batch in batch_qs(package_qs_added.values(*repo_key_fields, "sha256")): + find_dup_qs = models.Q() + + for content_dict in batch: + sha256 = content_dict.pop("sha256") + item_query = models.Q(**content_dict) & models.Q(sha256 != sha256) + find_dup_qs |= item_query + + package_qs_duplicates = ( + package_obj.objects.filter(pk__in=content_qs_existing) + .filter(find_dup_qs) + .only("pk") + ) + prc_qs_duplicates = PackageReleaseComponent.objects.filter( + package__in=package_qs_duplicates + ).only("pk") + new_version.remove_content(package_qs_duplicates) + new_version.remove_content(prc_qs_duplicates) + + +# The following helper function is copy and pasted from pulpcore. As soon as +# https://github.com/pulp/pulpcore/issues/4607 is ready, we should replace it with an import! +def batch_qs(qs, batch_size=1000): + """ + Returns a queryset batch in the given queryset. + + Usage: + # Make sure to order your querset + article_qs = Article.objects.order_by('id') + for qs in batch_qs(article_qs): + for article in qs: + print article.body + """ + total = qs.count() + for start in range(0, total, batch_size): + end = min(start + batch_size, total) + yield qs[start:end] diff --git a/pulp_deb/app/tasks/exceptions.py b/pulp_deb/app/tasks/exceptions.py deleted file mode 100644 index 470bfab60..000000000 --- a/pulp_deb/app/tasks/exceptions.py +++ /dev/null @@ -1,18 +0,0 @@ -from gettext import gettext as _ - - -class DuplicateDistributionException(Exception): - """ - Exception to signal, that we are creating a repository versions containing multiple Release - content with the same distribution. - """ - - def __init__(self, distribution, *args, **kwargs): - message = ( - "Cannot create the new repository version, since it contains multiple Release content " - "with the same distribution '{}'.\n" - "This known issue is tracked here: https://github.com/pulp/pulp_deb/issues/599\n" - "You can check the issue for known workarounds. Please also consider posting what you " - "did before getting this error, to help us to fix the underlying problem more quickly." - ) - super().__init__(_(message).format(distribution), *args, **kwargs)