-
Notifications
You must be signed in to change notification settings - Fork 11
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #618 from projectcaluma/celery
feat: add celery to allow indexing in background
- Loading branch information
Showing
20 changed files
with
815 additions
and
239 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# This will make sure the app is always imported when | ||
# Django starts so that shared_task will use this app. | ||
from .celery import app as celery_app | ||
|
||
# Declare `celery_app` as the only public name exported by this package. | ||
__all__ = ("celery_app",) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
import os | ||
|
||
from celery import Celery | ||
|
||
# Set the default Django settings module for the 'celery' program. | ||
# `setdefault` only applies when DJANGO_SETTINGS_MODULE is not already set | ||
# in the environment, so an explicit value still wins. | ||
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "alexandria.settings") | ||
|
||
# The Celery application instance for this project, named "alexandria". | ||
app = Celery("alexandria") | ||
|
||
# Using a string here means the worker doesn't have to serialize | ||
# the configuration object to child processes. | ||
# - namespace='CELERY' means all celery-related configuration keys | ||
# should have a `CELERY_` prefix. | ||
app.config_from_object("django.conf:settings", namespace="CELERY") | ||
|
||
|
||
# Load task modules from all registered Django apps. | ||
app.autodiscover_tasks() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
import hashlib | ||
from pathlib import Path | ||
|
||
import tika.language | ||
import tika.parser | ||
from django.conf import settings | ||
from django.contrib.postgres.search import SearchVector | ||
from django.db.models import Value | ||
|
||
from alexandria.core.models import File | ||
from celery import shared_task | ||
|
||
|
||
@shared_task(soft_time_limit=301)
def set_content_vector(file_pk: str):
    """Build and store the full-text search vector for a single file.

    The file name (without its extension) is always indexed with weight "A".
    If tika extracts any non-whitespace text from the file, that text is
    indexed with weight "B" using the PostgreSQL search configuration mapped
    to the detected language, and the detected language is stored as well.

    Args:
        file_pk: Primary key of the ``File`` to index.

    Raises:
        File.DoesNotExist: If no ``File`` with ``file_pk`` exists.
    """
    file = File.objects.get(pk=file_pk)
    file.content.file.file.seek(0)

    # tika has an internal time limit of 300s, set the request limit to match
    # that; different values should be set in tika as well
    # https://github.com/CogStack/tika-service/blob/master/README.md#tika-parsers-configuration
    parsed_content = tika.parser.from_buffer(
        file.content.file.file, requestOptions={"timeout": 300}
    )

    name_vector = SearchVector(Value(Path(file.name).stem), weight="A")

    # Strip once, up front: whitespace-only output must count as "no content"
    # (otherwise an empty string would be indexed and a language stored for
    # it), and the language sample below should not be spent on padding.
    content = (parsed_content["content"] or "").strip()
    if not content:
        # Update only content_vector, to avoid race conditions
        File.objects.filter(pk=file.pk).update(content_vector=name_vector)
        return

    # use part of content for language detection, because metadata is not
    # reliable
    language = tika.language.from_buffer(content[:1000])
    # Languages without a mapped configuration fall back to "simple".
    config = settings.ALEXANDRIA_ISO_639_TO_PSQL_SEARCH_CONFIG.get(language, "simple")
    content_vector = name_vector + SearchVector(
        Value(content),
        config=config,
        weight="B",
    )

    # Update only the needed fields, to avoid race conditions
    File.objects.filter(pk=file.pk).update(
        content_vector=content_vector, language=language
    )
|
||
|
||
@shared_task
def set_checksum(file_pk: str):
    """Compute the checksum of a file's content and store it on the record.

    Args:
        file_pk: Primary key of the ``File`` whose content is hashed.
    """
    record = File.objects.get(pk=file_pk)

    # Rewind first so the whole content is read, wherever the handle was left.
    stream = record.content.file.file
    stream.seek(0)
    digest = make_checksum(stream.read())

    # Write just the checksum column, to avoid race conditions
    File.objects.filter(pk=record.pk).update(checksum=digest)
|
||
|
||
def make_checksum(bytes_: bytes) -> str:
    """Return the SHA-256 hex digest of ``bytes_`` prefixed with ``sha256:``."""
    digest = hashlib.sha256(bytes_).hexdigest()
    return "sha256:" + digest
Oops, something went wrong.