Skip to content

Commit

Permalink
Fix duplicate package handling
Browse files Browse the repository at this point in the history
closes #921

As a result of the behaviour fixes, we can also drop the now superfluous
duplicate distribution checking. We are using validate_duplicate_content
to catch incoming duplicates, and we are removing old duplicates, so it
is not possible to run into this error. As a result it is not worth the
performance cost to check for it.
  • Loading branch information
quba42 committed Oct 30, 2023
1 parent 5b17ad2 commit 45098d9
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 37 deletions.
3 changes: 3 additions & 0 deletions CHANGES/921.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
Fixed repo uniqueness constraints.
Duplicate packages with identical checksums are now allowed.
In addition, duplicates are now also handled for the set of incoming content.
2 changes: 0 additions & 2 deletions pulp_deb/app/models/content/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,6 @@ def filename(self, component=""):
"{}.{}".format(self.name, self.SUFFIX),
)

repo_key_fields = ("package", "version", "architecture")

class Meta:
default_related_name = "%(app_label)s_%(model_name)s"
unique_together = (("relative_path", "sha256"),)
Expand Down
7 changes: 0 additions & 7 deletions pulp_deb/app/models/content/structure_content.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,13 +91,6 @@ class PackageReleaseComponent(Content):
package = models.ForeignKey(Package, on_delete=models.CASCADE)
release_component = models.ForeignKey(ReleaseComponent, on_delete=models.CASCADE)

repo_key_fields = (
"release_component",
"package__package",
"package__version",
"package__architecture",
)

class Meta:
default_related_name = "%(app_label)s_%(model_name)s"
unique_together = (("package", "release_component"),)
Expand Down
116 changes: 106 additions & 10 deletions pulp_deb/app/models/repository.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
from django.db import models
from pulpcore.plugin.models import BaseModel, Repository
from pulpcore.plugin.repo_version_utils import remove_duplicates, validate_version_paths
from pulpcore.plugin.repo_version_utils import (
remove_duplicates,
validate_version_paths,
validate_duplicate_content,
)

from pulpcore.plugin.models import Content

from pulp_deb.app.models import (
AptReleaseSigningService,
Expand All @@ -20,6 +26,11 @@
SourcePackageReleaseComponent,
)

import logging
from gettext import gettext as _

log = logging.getLogger(__name__)


class AptRepository(Repository):
"""
Expand Down Expand Up @@ -91,17 +102,10 @@ def finalize_new_version(self, new_version):
finalize.
"""
from pulp_deb.app.tasks.exceptions import DuplicateDistributionException

handle_duplicate_packages(new_version)
validate_duplicate_content(new_version)
remove_duplicates(new_version)
validate_version_paths(new_version)
releases = new_version.get_content(Release.objects.all())
distributions = []
for release in releases:
distribution = release.distribution
if distribution in distributions:
raise DuplicateDistributionException(distribution)
distributions.append(distribution)


class AptRepositoryReleaseServiceOverride(BaseModel):
Expand All @@ -117,3 +121,95 @@ class AptRepositoryReleaseServiceOverride(BaseModel):

class Meta:
unique_together = (("repository", "release_distribution"),)


def handle_duplicate_packages(new_version):
    """
    Remove duplicate packages from a new repository version and validate incoming content.

    pulpcore's remove_duplicates does not work for .deb packages, since identical duplicate
    packages (same sha256) are rare, but allowed, while duplicates with different sha256 are
    forbidden. As such we need our own version of this function for .deb packages. Since we are
    already building our own function, we will also be combining the functionality of pulpcore's
    remove_duplicates and validate_duplicate_content within this function.

    Args:
        new_version: The (unsaved) RepositoryVersion being finalized.

    Raises:
        ValueError: If the content newly added to new_version contains packages that share
            name, version, and architecture, but differ in their sha256 checksum.
    """
    package_types = {
        Package.get_pulp_type(): Package,
        InstallerPackage.get_pulp_type(): InstallerPackage,
    }
    # Fields that identify "the same package" within a repo; sha256 is deliberately excluded.
    repo_key_fields = ("package", "version", "architecture")
    content_qs_added = new_version.added(base_version=new_version.base_version)
    if new_version.base_version:
        content_qs_existing = new_version.base_version.content
    else:
        try:
            content_qs_existing = new_version.previous().content
        except new_version.DoesNotExist:
            # This is the first version of the repository; nothing pre-existing to de-duplicate.
            content_qs_existing = Content.objects.none()

    for pulp_type, package_obj in package_types.items():
        # Prepare some query sets
        content_qs_added_packages = content_qs_added.filter(pulp_type=pulp_type)
        package_qs_added = package_obj.objects.filter(pk__in=content_qs_added_packages)
        added_packages_count = package_qs_added.count()

        # First handle duplicates within the packages added to new_version.
        # If distinct-by-key yields fewer rows than distinct-by-key-plus-checksum, then at
        # least one key appears with more than one checksum, which is forbidden.
        added_unique = package_qs_added.distinct(*repo_key_fields)
        added_checksum_unique = package_qs_added.distinct(*repo_key_fields, "sha256")

        if added_unique.count() < added_checksum_unique.count():
            package_qs_added_duplicates = added_checksum_unique.difference(added_unique)
            if log.isEnabledFor(logging.DEBUG):
                message = _(
                    "New repository version contains multiple packages with '{}', but differing "
                    "checksum!"
                )
                for package_fields in package_qs_added_duplicates.values(*repo_key_fields):
                    # Log the identifying fields of each offending package, as promised by the
                    # ValueError message below.
                    log.debug(message.format(package_fields))

            message = _(
                "Cannot create repository version since there are newly added packages with the "
                "same name, version, and architecture, but a different checksum. If the log level "
                "is DEBUG, you can find a list of affected packages in the Pulp log."
            )
            raise ValueError(message)

        # Now remove existing packages that are duplicates of any packages added to new_version
        if added_packages_count and content_qs_existing.count():
            # Translate the template first, then format, so translated catalogs apply.
            log.debug(_("Removing duplicates for type: {}").format(pulp_type))

            for batch in batch_qs(package_qs_added.values(*repo_key_fields, "sha256")):
                find_dup_qs = models.Q()

                for content_dict in batch:
                    sha256 = content_dict.pop("sha256")
                    # Same repo key fields but a *different* checksum is a duplicate to remove;
                    # identical duplicates (same sha256) are explicitly allowed to remain.
                    item_query = models.Q(**content_dict) & ~models.Q(sha256=sha256)
                    find_dup_qs |= item_query

                package_qs_duplicates = (
                    package_obj.objects.filter(pk__in=content_qs_existing)
                    .filter(find_dup_qs)
                    .only("pk")
                )
                # Also drop the PackageReleaseComponents that reference the removed packages.
                prc_qs_duplicates = PackageReleaseComponent.objects.filter(
                    package__in=package_qs_duplicates
                ).only("pk")
                new_version.remove_content(package_qs_duplicates)
                new_version.remove_content(prc_qs_duplicates)


# The following helper function is copy and pasted from pulpcore. As soon as
# https://github.com/pulp/pulpcore/issues/4607 is ready, we should replace it with an import!
def batch_qs(qs, batch_size=1000):
    """
    Yield successive slices of at most batch_size elements from the given queryset.

    Usage:
        # Make sure to order your queryset
        article_qs = Article.objects.order_by('id')
        for qs in batch_qs(article_qs):
            for article in qs:
                print(article.body)
    """
    length = qs.count()
    start = 0
    while start < length:
        stop = min(start + batch_size, length)
        yield qs[start:stop]
        start = stop
18 changes: 0 additions & 18 deletions pulp_deb/app/tasks/exceptions.py

This file was deleted.

0 comments on commit 45098d9

Please sign in to comment.