Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Unaccent with dandiset search filter #2142

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions dandiapi/api/migrations/0014_unaccent_extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Generated by Django 4.2.17 on 2025-01-17 17:45
from __future__ import annotations

from django.contrib.postgres.operations import UnaccentExtension
from django.db import migrations


class Migration(migrations.Migration):
dependencies = [
('api', '0013_remove_assetpath_consistent_slash_and_more'),
]

operations = [
UnaccentExtension(),
]
109 changes: 68 additions & 41 deletions dandiapi/api/views/dandiset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,10 @@

from allauth.socialaccount.models import SocialAccount
from django.contrib.auth.models import User
from django.contrib.postgres.lookups import Unaccent
from django.db import transaction
from django.db.models import Count, Max, OuterRef, QuerySet, Subquery, Sum
from django.db.models.functions import Coalesce
from django.db.models import Count, Max, OuterRef, QuerySet, Subquery, Sum, TextField
from django.db.models.functions import Cast, Coalesce
from django.db.models.query_utils import Q
from django.http import Http404
from drf_yasg.utils import no_body, swagger_auto_schema
Expand All @@ -17,6 +18,7 @@
from rest_framework.generics import get_object_or_404
from rest_framework.response import Response
from rest_framework.serializers import ValidationError
from rest_framework.settings import api_settings as drf_settings
from rest_framework.viewsets import ReadOnlyModelViewSet

from dandiapi.api.asset_paths import get_root_paths_many
Expand Down Expand Up @@ -56,11 +58,12 @@

if TYPE_CHECKING:
from rest_framework.request import Request
from rest_framework.views import APIView

from dandiapi.api.models.upload import Upload


class DandisetFilterBackend(filters.OrderingFilter):
class DandisetOrderingFilter(filters.OrderingFilter):
ordering_fields = ['id', 'name', 'modified', 'size']
ordering_description = (
'Which field to use when ordering the results. '
Expand All @@ -69,51 +72,75 @@ class DandisetFilterBackend(filters.OrderingFilter):

def filter_queryset(self, request, queryset, view):
orderings = self.get_ordering(request, queryset, view)
if orderings:
ordering = orderings[0]
# ordering can be either 'created' or '-created', so test for both
if ordering.endswith('id'):
return queryset.order_by(ordering)
if ordering.endswith('name'):
# name refers to the name of the most recent version, so a subquery is required
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by(
'-created'
)[:1]
queryset = queryset.annotate(name=Subquery(latest_version.values('metadata__name')))
return queryset.order_by(ordering)
if ordering.endswith('modified'):
# modified refers to the modification timestamp of the most
# recent version, so a subquery is required
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by(
'-created'
)[:1]
# get the `modified` field of the most recent version.
# '_version' is appended because the Dandiset model already has a `modified` field
queryset = queryset.annotate(
modified_version=Subquery(latest_version.values('modified'))
)
return queryset.order_by(f'{ordering}_version')
if ordering.endswith('size'):
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by(
'-created'
)[:1]
queryset = queryset.annotate(
size=Subquery(
latest_version.annotate(
size=Coalesce(Sum('assets__blob__size'), 0)
+ Coalesce(Sum('assets__zarr__size'), 0)
).values('size')
)
if not orderings:
return queryset
ordering = orderings[0]

# ordering can be either 'created' or '-created', so test for both
if ordering.endswith('id'):
return queryset.order_by(ordering)

if ordering.endswith('name'):
# name refers to the name of the most recent version, so a subquery is required
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[
:1
]
queryset = queryset.annotate(name=Subquery(latest_version.values('metadata__name')))
return queryset.order_by(ordering)

if ordering.endswith('modified'):
# modified refers to the modification timestamp of the most
# recent version, so a subquery is required
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[
:1
]
# get the `modified` field of the most recent version.
# '_version' is appended because the Dandiset model already has a `modified` field
queryset = queryset.annotate(
modified_version=Subquery(latest_version.values('modified'))
)
return queryset.order_by(f'{ordering}_version')

if ordering.endswith('size'):
latest_version = Version.objects.filter(dandiset=OuterRef('pk')).order_by('-created')[
:1
]
queryset = queryset.annotate(
size=Subquery(
latest_version.annotate(
size=Coalesce(Sum('assets__blob__size'), 0)
+ Coalesce(Sum('assets__zarr__size'), 0)
).values('size')
)
return queryset.order_by(ordering)
)
return queryset.order_by(ordering)

waxlamp marked this conversation as resolved.
Show resolved Hide resolved
return queryset


class DandisetSearchFilter(filters.BaseFilterBackend):
search_param = drf_settings.SEARCH_PARAM

def get_search_term(self, request):
param = request.query_params.get(self.search_param, '')
param = param.replace('\x00', '') # strip null characters

return param # noqa: RET504
Comment on lines +126 to +128
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of the noqa directive here, why not just follow the recommendation to eliminate the unnecessary assignment?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This function is parsing the query param, and performing a number of actions on it. In this case, it's just that one .replace call, but realistically, you could see us adding to it in the future. It just felt a bit awkward/arbitrary to just return param.replace('\x00', '') 🤷‍♂️ .

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think just go with ruff's recommendation here. "We might change the code in the future" doesn't seem compelling enough to override its style advice.


def filter_queryset(self, request: Request, queryset: QuerySet, view: APIView) -> QuerySet:
search_term = self.get_search_term(request=request)
if not search_term:
return queryset

return queryset.alias(
search_field=Unaccent(Cast('versions__metadata', TextField()))
).filter(search_field__icontains=search_term)


class DandisetViewSet(ReadOnlyModelViewSet):
serializer_class = DandisetDetailSerializer
pagination_class = DandiPagination
filter_backends = [filters.SearchFilter, DandisetFilterBackend]
search_fields = ['versions__metadata']
filter_backends = [DandisetSearchFilter, DandisetOrderingFilter]

lookup_value_regex = Dandiset.IDENTIFIER_REGEX
# This is to maintain consistency with the auto-generated names shown in swagger.
Expand Down