Skip to content

Commit

Permalink
Merge pull request #115 from JSv4/JSv4/add-nlm-ingestor
Browse files Browse the repository at this point in the history
Add Nlm-ingestor
  • Loading branch information
JSv4 authored May 21, 2024
2 parents f105b90 + 51f05f2 commit 9d4fe16
Show file tree
Hide file tree
Showing 16 changed files with 36,985 additions and 12 deletions.
1 change: 1 addition & 0 deletions .idea/OpenContracts.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions config/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,13 @@
# Constants for Permissioning
DEFAULT_PERMISSIONS_GROUP = "Public Objects Access"

# Nlm-ingestor settings
# -----------------------------------------------------------------------------
# Every value is read from the environment so a deployment can repoint or secure
# the ingestor without editing this file; the defaults are the previous
# hard-coded values.

# Use nlm-ingestor where this is True; otherwise fall back to the PAWLs parser.
NLM_INGESTOR_ACTIVE = env.bool("NLM_INGESTOR_ACTIVE", False)

# If True, always tell nlm-ingestor to use OCR (Tesseract).
NLM_INGEST_USE_OCR = env.bool("NLM_INGEST_USE_OCR", False)

# Base URL that nlm-ingestor REST requests are sent to.
NLM_INGEST_HOSTNAME = env.str("NLM_INGEST_HOSTNAME", default="http://nlm-ingestor:5001")

# API key for a secured endpoint. Unset or empty means None, i.e. no API_KEY
# header is sent.
NLM_INGEST_API_KEY = env.str("NLM_INGEST_API_KEY", default=None) or None

# CORS
# ------------------------------------------------------------------------------
CORS_ORIGIN_WHITELIST = [
Expand Down
5 changes: 5 additions & 0 deletions frontend/src/graphql/queries.ts
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,7 @@ export const GET_ANNOTATION_LABELS = gql`
text
description
labelType
readOnly
isPublic
myPermissions
analyzer {
Expand Down Expand Up @@ -525,6 +526,7 @@ export const GET_LABELSET_WITH_ALL_LABELS = gql`
id
icon
labelType
readOnly
text
description
color
Expand Down Expand Up @@ -767,6 +769,7 @@ export const REQUEST_ANNOTATOR_DATA_FOR_DOCUMENT = gql`
color
description
text
readOnly
labelType
analyzer {
id
Expand All @@ -790,6 +793,7 @@ export const REQUEST_ANNOTATOR_DATA_FOR_DOCUMENT = gql`
description
text
labelType
readOnly
}
boundingBox
page
Expand Down Expand Up @@ -886,6 +890,7 @@ export const REQUEST_ANNOTATOR_DATA_FOR_DOCUMENT = gql`
description
text
labelType
readOnly
analyzer {
id
}
Expand Down
1 change: 1 addition & 0 deletions frontend/src/graphql/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ export type AnnotationLabelType = Node & {
created?: Scalars["DateTime"];
modified?: Scalars["DateTime"];
isPublic?: Scalars["Boolean"];
readonly?: Scalars["Boolean"];
myPermissions?: Scalars["String"][];
relationshipSet?: RelationshipTypeConnection;
annotationSet?: AnnotationTypeConnection;
Expand Down
4 changes: 4 additions & 0 deletions local.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ services:
image: redis:6
container_name: redis

# nlm-ingestor document parsing service. The default NLM_INGEST_HOSTNAME in
# config/settings/base.py (http://nlm-ingestor:5001) refers to this service name.
nlm-ingestor:
image: jscrudato/nlm-ingestor-opencontracts
container_name: nlm-ingestor

celeryworker:
<<: *django
image: opencontractserver_local_celeryworker
Expand Down
6 changes: 6 additions & 0 deletions local_deploy_with_gremlin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@ services:
image: redis:6
container_name: redis

nlm-ingestor:
  image: jscrudato/nlm-ingestor-opencontracts
  # Container names must be unique; this previously reused "redis", which
  # collides with the redis service's container_name in this same file.
  container_name: nlm-ingestor
  environment:
    # NOTE(review): Django only sends this key if NLM_INGEST_API_KEY is set to
    # the same value in its settings - confirm the two are kept in sync, and
    # consider moving the secret out of version control.
    - API_KEY=8j4t9kjdfgmdfpomd

gremlinengine: &gremlinengine
image: opensourcelegal/gremlin-engine:latest
container_name: gremlinengine
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2.9 on 2024-05-19 03:31

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the boolean ``read_only`` field (default ``False``) to ``annotationlabel``."""

    # Must be applied after the previous migration of the annotations app.
    dependencies = [("annotations", "0003_auto_20230202_0604")]

    operations = [
        migrations.AddField(
            model_name="annotationlabel",
            name="read_only",
            field=models.BooleanField(default=False),
        ),
    ]
4 changes: 4 additions & 0 deletions opencontractserver/annotations/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,10 @@ class AnnotationLabel(BaseOCModel):
"analyzer.Analyzer", on_delete=django.db.models.SET_NULL, null=True, blank=True
)

# If this is meant to be a 'built-in' label and be used across corpuses without being explicitly added to a
# labelset, set this value to True
read_only = django.db.models.BooleanField(default=False)

color = django.db.models.CharField(
max_length=12, blank=False, null=False, default="#ffff00"
)
Expand Down
33 changes: 24 additions & 9 deletions opencontractserver/documents/signals.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from celery import chain
from django.db import transaction
from django.conf import settings

from opencontractserver.tasks.doc_tasks import (
extract_thumbnail,
set_doc_lock_state,
split_pdf_for_processing,
nlm_ingest_pdf,
)


Expand All @@ -14,14 +16,27 @@ def process_doc_on_create_atomic(sender, instance, created, **kwargs):
# run OCR and token extract. Sometimes a doc will be created with tokens preloaded,
# such as when we do an import.
if created and not instance.pawls_parse_file:

# USE NLM Ingestor if NLM_INGESTOR_ACTIVE is set to True
if settings.NLM_INGESTOR_ACTIVE:
ingest_tasks = [
extract_thumbnail.s(doc_id=instance.id),
nlm_ingest_pdf.si(
user_id=instance.creator.id, doc_id=instance.id
),
set_doc_lock_state.si(locked=False, doc_id=instance.id),
]
# Otherwise fall back to PAWLs parser
else:
ingest_tasks = [
extract_thumbnail.s(doc_id=instance.id),
split_pdf_for_processing.si(
user_id=instance.creator.id, doc_id=instance.id
),
set_doc_lock_state.si(locked=False, doc_id=instance.id),
]

# Send tasks to celery for async execution
transaction.on_commit(
lambda: chain(
*[
extract_thumbnail.s(doc_id=instance.id),
split_pdf_for_processing.si(
user_id=instance.creator.id, doc_id=instance.id
),
set_doc_lock_state.si(locked=False, doc_id=instance.id),
]
).apply_async()
lambda: chain(*ingest_tasks).apply_async()
)
100 changes: 98 additions & 2 deletions opencontractserver/tasks/doc_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,8 @@
import logging
import pathlib
import uuid
from typing import Any
import requests
from typing import Any, Optional

from celery import chord, group
from django.conf import settings
Expand All @@ -16,10 +17,11 @@
from pydantic import validate_arguments

from config import celery_app
from config.graphql.serializers import AnnotationLabelSerializer
from opencontractserver.annotations.models import (
METADATA_LABEL,
TOKEN_LABEL,
Annotation,
Annotation, AnnotationLabel,
)
from opencontractserver.documents.models import Document
from opencontractserver.types.dicts import (
Expand All @@ -30,11 +32,13 @@
PawlsPagePythonType,
PawlsTokenPythonType,
)
from opencontractserver.types.enums import PermissionTypes
from opencontractserver.utils.etl import build_document_export, pawls_bbox_to_funsd_box
from opencontractserver.utils.pdf import (
extract_pawls_from_pdfs_bytes,
split_pdf_into_images,
)
from opencontractserver.utils.permissioning import set_permissions_for_obj_to_user
from opencontractserver.utils.text import __consolidate_common_equivalent_chars

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -219,6 +223,98 @@ def set_doc_lock_state(*args, locked: bool, doc_id: int):
document.backend_lock = locked
document.save()

def _get_or_create_nlm_label(label_name: str, user_id: int) -> AnnotationLabel:
    """Return the user's read-only token label named *label_name*, creating it if absent."""
    label_obj = AnnotationLabel.objects.filter(
        text=label_name,
        creator_id=user_id,
        label_type=TOKEN_LABEL,
        read_only=True,
    ).first()
    if label_obj is not None:
        return label_obj

    label_serializer = AnnotationLabelSerializer(
        data={
            "label_type": "TOKEN_LABEL",
            "color": "grey",
            "description": "NLM Structural Label",
            "icon": "expand",
            "text": label_name,
            "creator_id": user_id,
        }
    )
    label_serializer.is_valid(raise_exception=True)
    label_obj = label_serializer.save()

    # The lookup above only matches read_only labels. Without this flag a newly
    # created label would never be found again and every ingested document
    # would create a fresh duplicate of each label.
    if not label_obj.read_only:
        label_obj.read_only = True
        label_obj.save(update_fields=["read_only"])

    set_permissions_for_obj_to_user(user_id, label_obj, [PermissionTypes.ALL])
    return label_obj


@celery_app.task(
    autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={"max_retries": 5}
)
def nlm_ingest_pdf(user_id: int, doc_id: int) -> None:
    """Parse a document's PDF with nlm-ingestor and store the results.

    The PDF is posted to ``<NLM_INGEST_HOSTNAME>/api/parseDocument/``. When the
    response carries ``return_dict.opencontracts_data``, the PAWLS layer and the
    extracted text are saved onto the document, the page count is set, and one
    ``Annotation`` is created for every entry of ``labelled_text``.

    Args:
        user_id: Id of the user recorded as creator of new labels and
            annotations, and who is granted all permissions on them.
        doc_id: Primary key of the ``Document`` to ingest.

    Raises:
        Document.DoesNotExist: If no document has primary key ``doc_id``.
        requests.HTTPError: If the ingestor answers with a 4xx/5xx status.

    Any exception triggers the task's automatic retry (up to 5 times).
    """

    logger.info(f"nlm_ingest_pdf() - split doc {doc_id} for user {user_id}")

    document = Document.objects.get(pk=doc_id)

    headers = {}
    if settings.NLM_INGEST_API_KEY is not None:
        headers["API_KEY"] = settings.NLM_INGEST_API_KEY

    # calculate_opencontracts_data asks the ingestor to return the
    # OpenContracts export alongside its regular output.
    params = {
        "calculate_opencontracts_data": "yes",
        "applyOcr": "yes" if settings.NLM_INGEST_USE_OCR else "no",
    }

    # Open the file in a context manager so the storage handle is always
    # released, including when the request raises and the task is retried.
    # NOTE(review): the request has no timeout, so an unresponsive ingestor
    # blocks this worker indefinitely - consider adding a configurable one.
    with default_storage.open(document.pdf_file.name, mode="rb") as doc_file:
        response = requests.post(
            settings.NLM_INGEST_HOSTNAME + "/api/parseDocument/",
            headers=headers,
            files={"file": doc_file},
            params=params,
        )

    # No-op for successful responses; raises HTTPError on 4xx/5xx.
    response.raise_for_status()

    response_data = response.json()
    # "or {}" also covers a response whose return_dict is present but null.
    open_contracts_data: Optional[OpenContractDocExport] = (
        response_data.get("return_dict") or {}
    ).get("opencontracts_data", None)

    if open_contracts_data is None:
        logger.warning(
            f"nlm_ingest_pdf() - no opencontracts_data returned for doc {doc_id}"
        )
    else:
        # Get PAWLS layer and text contents
        pawls_content = open_contracts_data["pawls_file_content"]
        pawls_file = ContentFile(json.dumps(pawls_content).encode("utf-8"))
        txt_file = ContentFile(open_contracts_data["content"].encode("utf-8"))

        document.txt_extract_file.save(f"doc_{doc_id}.txt", txt_file)
        document.pawls_parse_file.save(f"doc_{doc_id}.pawls", pawls_file)
        document.page_count = len(pawls_content)

        # Per-run cache so each distinct label name hits the database once.
        labels_by_name: dict[str, AnnotationLabel] = {}

        # Now, annotate the document with any annotations that bubbled up from parser.
        for label_data in open_contracts_data["labelled_text"]:
            label_name = label_data["annotationLabel"]

            label_obj = labels_by_name.get(label_name)
            if label_obj is None:
                label_obj = _get_or_create_nlm_label(label_name, user_id)
                labels_by_name[label_name] = label_obj

            # objects.create() already persists the row; no extra save() needed.
            annot_obj = Annotation.objects.create(
                raw_text=label_data["rawText"],
                page=label_data["page"],
                json=label_data["annotation_json"],
                annotation_label=label_obj,
                document=document,
                creator_id=user_id,
            )
            set_permissions_for_obj_to_user(user_id, annot_obj, [PermissionTypes.ALL])

    document.save()


@celery_app.task(
autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={"max_retries": 5}
Expand Down
4 changes: 4 additions & 0 deletions opencontractserver/tests/fixtures/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
"Agreement_ZrZJLLv.pdf"
)

# Fixture files for the nlm-ingestor pipeline test; both sit next to this module.
NLM_INGESTOR_SAMPLE_PDF = pathlib.Path(__file__).with_name("sample.pdf")
NLM_INGESTOR_EXPECTED_JSON = pathlib.Path(__file__).with_name(
    "nlm_ingestor_output_for_sample_pdf.json"
)

SAMPLE_PDF_FILE_TWO_PATH = pathlib.Path(__file__).parent / "USC Title 1 - CHAPTER 1.pdf"

PUBLIC_PDF_URL_LIST = pathlib.Path(__file__).parent / "test_pdf_file_urls.txt"
Expand Down
Loading

0 comments on commit 9d4fe16

Please sign in to comment.