Merge pull request #115 from JSv4/JSv4/add-nlm-ingestor

Add Nlm-ingestor
JSv4 · May 21, 2024 · 9d4fe16 · 9d4fe16
2 parents f105b90 + 51f05f2
commit 9d4fe16
Show file tree

Hide file tree

Showing 16 changed files with 36,985 additions and 12 deletions.
diff --git a/.idea/OpenContracts.iml b/.idea/OpenContracts.iml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/config/settings/base.py b/config/settings/base.py
@@ -451,6 +451,13 @@
 # Constants for Permissioning
 DEFAULT_PERMISSIONS_GROUP = "Public Objects Access"
 
+# Nlm-ingestor settings
+# -----------------------------------------------------------------------------
+NLM_INGESTOR_ACTIVE = env.bool('NLM_INGESTOR_ACTIVE', False)  # Use nlm-ingestor where this is True... otherwise PAWLs
+NLM_INGEST_USE_OCR = False  # IF True, always tell nlm-ingestor to use OCR (Tesseract)
+NLM_INGEST_HOSTNAME = "http://nlm-ingestor:5001"  # Hostname to send nlm-ingestor REST requests to
+NLM_INGEST_API_KEY = None  # If the endpoint is secured with an API_KEY, specify it here, otherwise use None
+
 # CORS
 # ------------------------------------------------------------------------------
 CORS_ORIGIN_WHITELIST = [

diff --git a/frontend/src/graphql/queries.ts b/frontend/src/graphql/queries.ts
@@ -492,6 +492,7 @@ export const GET_ANNOTATION_LABELS = gql`
           text
           description
           labelType
+          readOnly
           isPublic
           myPermissions
           analyzer {
@@ -525,6 +526,7 @@ export const GET_LABELSET_WITH_ALL_LABELS = gql`
         id
         icon
         labelType
+        readOnly
         text
         description
         color
@@ -767,6 +769,7 @@ export const REQUEST_ANNOTATOR_DATA_FOR_DOCUMENT = gql`
           color
           description
           text
+          readOnly
           labelType
           analyzer {
             id
@@ -790,6 +793,7 @@ export const REQUEST_ANNOTATOR_DATA_FOR_DOCUMENT = gql`
         description
         text
         labelType
+        readOnly
       }
       boundingBox
       page
@@ -886,6 +890,7 @@ export const REQUEST_ANNOTATOR_DATA_FOR_DOCUMENT = gql`
           description
           text
           labelType
+          readOnly
           analyzer {
             id
           }

diff --git a/frontend/src/graphql/types.ts b/frontend/src/graphql/types.ts
@@ -60,6 +60,7 @@ export type AnnotationLabelType = Node & {
   created?: Scalars["DateTime"];
   modified?: Scalars["DateTime"];
   isPublic?: Scalars["Boolean"];
+  readOnly?: Scalars["Boolean"];
   myPermissions?: Scalars["String"][];
   relationshipSet?: RelationshipTypeConnection;
   annotationSet?: AnnotationTypeConnection;

diff --git a/local.yml b/local.yml
@@ -39,6 +39,10 @@ services:
     image: redis:6
     container_name: redis
 
+  # Document parsing service. Django addresses it through NLM_INGEST_HOSTNAME, whose
+  # value in config/settings/base.py is http://nlm-ingestor:5001.
+  nlm-ingestor:
+    image: jscrudato/nlm-ingestor-opencontracts
+    container_name: nlm-ingestor
+
   celeryworker:
     <<: *django
     image: opencontractserver_local_celeryworker

diff --git a/local_deploy_with_gremlin.yml b/local_deploy_with_gremlin.yml
@@ -63,6 +63,12 @@ services:
     image: redis:6
     container_name: redis
 
+  nlm-ingestor:
+    image: jscrudato/nlm-ingestor-opencontracts
+    # Container names must be unique; the redis service above already uses "redis".
+    container_name: nlm-ingestor
+    environment:
+      # NOTE(review): secret is committed in plain text - consider moving it to an env file.
+      # Django presumably has to send the same value (settings.NLM_INGEST_API_KEY) - confirm.
+      - API_KEY=8j4t9kjdfgmdfpomd
+
   gremlinengine: &gremlinengine
     image: opensourcelegal/gremlin-engine:latest
     container_name: gremlinengine

diff --git a/opencontractserver/annotations/migrations/0004_annotationlabel_read_only.py b/opencontractserver/annotations/migrations/0004_annotationlabel_read_only.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.2.9 on 2024-05-19 03:31
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('annotations', '0003_auto_20230202_0604'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='annotationlabel',
+            name='read_only',
+            field=models.BooleanField(default=False),
+        ),
+    ]
diff --git a/opencontractserver/annotations/models.py b/opencontractserver/annotations/models.py
@@ -45,6 +45,10 @@ class AnnotationLabel(BaseOCModel):
         "analyzer.Analyzer", on_delete=django.db.models.SET_NULL, null=True, blank=True
     )
 
+    # If this is meant to be a 'built-in' label and be used across corpuses without being explicitly added to a
+    # labelset, set this value to True
+    read_only = django.db.models.BooleanField(default=False)
+
     color = django.db.models.CharField(
         max_length=12, blank=False, null=False, default="#ffff00"
     )

diff --git a/opencontractserver/documents/signals.py b/opencontractserver/documents/signals.py
@@ -1,10 +1,12 @@
 from celery import chain
 from django.db import transaction
+from django.conf import settings
 
 from opencontractserver.tasks.doc_tasks import (
     extract_thumbnail,
     set_doc_lock_state,
     split_pdf_for_processing,
+    nlm_ingest_pdf,
 )
 
 
@@ -14,14 +16,27 @@ def process_doc_on_create_atomic(sender, instance, created, **kwargs):
     # run OCR and token extract. Sometimes a doc will be created with tokens preloaded,
     # such as when we do an import.
     if created and not instance.pawls_parse_file:
+
+        # USE NLM Ingestor if NLM_INGESTOR_ACTIVE is set to True
+        if settings.NLM_INGESTOR_ACTIVE:
+            ingest_tasks = [
+                extract_thumbnail.s(doc_id=instance.id),
+                nlm_ingest_pdf.si(
+                    user_id=instance.creator.id, doc_id=instance.id
+                ),
+                set_doc_lock_state.si(locked=False, doc_id=instance.id),
+            ]
+        # Otherwise fall back to PAWLs parser
+        else:
+            ingest_tasks = [
+                extract_thumbnail.s(doc_id=instance.id),
+                split_pdf_for_processing.si(
+                    user_id=instance.creator.id, doc_id=instance.id
+                ),
+                set_doc_lock_state.si(locked=False, doc_id=instance.id),
+            ]
+
+        # Send tasks to celery for async execution
         transaction.on_commit(
-            lambda: chain(
-                *[
-                    extract_thumbnail.s(doc_id=instance.id),
-                    split_pdf_for_processing.si(
-                        user_id=instance.creator.id, doc_id=instance.id
-                    ),
-                    set_doc_lock_state.si(locked=False, doc_id=instance.id),
-                ]
-            ).apply_async()
+            lambda: chain(*ingest_tasks).apply_async()
         )
diff --git a/opencontractserver/tasks/doc_tasks.py b/opencontractserver/tasks/doc_tasks.py
@@ -6,7 +6,8 @@
 import logging
 import pathlib
 import uuid
-from typing import Any
+import requests
+from typing import Any, Optional
 
 from celery import chord, group
 from django.conf import settings
@@ -16,10 +17,11 @@
 from pydantic import validate_arguments
 
 from config import celery_app
+from config.graphql.serializers import AnnotationLabelSerializer
 from opencontractserver.annotations.models import (
     METADATA_LABEL,
     TOKEN_LABEL,
-    Annotation,
+    Annotation, AnnotationLabel,
 )
 from opencontractserver.documents.models import Document
 from opencontractserver.types.dicts import (
@@ -30,11 +32,13 @@
     PawlsPagePythonType,
     PawlsTokenPythonType,
 )
+from opencontractserver.types.enums import PermissionTypes
 from opencontractserver.utils.etl import build_document_export, pawls_bbox_to_funsd_box
 from opencontractserver.utils.pdf import (
     extract_pawls_from_pdfs_bytes,
     split_pdf_into_images,
 )
+from opencontractserver.utils.permissioning import set_permissions_for_obj_to_user
 from opencontractserver.utils.text import __consolidate_common_equivalent_chars
 
 logger = logging.getLogger(__name__)
@@ -219,6 +223,98 @@ def set_doc_lock_state(*args, locked: bool, doc_id: int):
     document.backend_lock = locked
     document.save()
 
+@celery_app.task(
+    autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={"max_retries": 5}
+)
+def nlm_ingest_pdf(user_id: int, doc_id: int) -> list[tuple[int, str]]:
+
+    logger.info(f"nlm_ingest_pdf() - split doc {doc_id} for user {user_id}")
+
+    doc = Document.objects.get(pk=doc_id)
+    doc_path = doc.pdf_file.name
+    doc_file = default_storage.open(doc_path, mode="rb")
+
+    if settings.NLM_INGEST_API_KEY is not None:
+        headers = {'API_KEY': settings.NLM_INGEST_API_KEY}
+    else:
+        headers = {}
+
+    files = {'file': doc_file}
+    params = {
+        'calculate_opencontracts_data': 'yes',
+        'applyOcr': "yes" if settings.NLM_INGEST_USE_OCR else 'no'
+    }  # Ensures calculate_opencontracts_data is set to True
+
+    response = requests.post(settings.NLM_INGEST_HOSTNAME + "/api/parseDocument/", headers=headers, files=files, params=params)
+
+    if not response.status_code == 200:
+        response.raise_for_status()
+
+    response_data = response.json()
+    open_contracts_data: Optional[OpenContractDocExport] = response_data.get('return_dict', {}).get('opencontracts_data', None)
+
+    document = Document.objects.get(pk=doc_id)
+
+    # Create new labels if needed
+    if open_contracts_data is not None:
+
+        # Get PAWLS layer and text contents
+        pawls_string = json.dumps(open_contracts_data['pawls_file_content'])
+        pawls_file = ContentFile(pawls_string.encode("utf-8"))
+        txt_file = ContentFile(open_contracts_data['content'].encode("utf-8"))
+
+        document.txt_extract_file.save(f"doc_{doc_id}.txt", txt_file)
+        document.pawls_parse_file.save(f"doc_{doc_id}.pawls", pawls_file)
+        document.page_count = len(open_contracts_data['pawls_file_content'])
+
+        existing_text_labels: dict[str, AnnotationLabel] = {}
+
+        # Now, annotate the document with any annotations that bubbled up from parser.
+        for label_data in open_contracts_data['labelled_text']:
+
+            label_name = label_data['annotationLabel']
+
+            if label_name not in existing_text_labels:
+                label_obj = AnnotationLabel.objects.filter(
+                    text=label_name,
+                    creator_id=user_id,
+                    label_type=TOKEN_LABEL,
+                    read_only=True
+                )
+                if label_obj.count() > 0:
+                    label_obj = label_obj[0]
+                    existing_text_labels[label_name] = label_obj
+                else:
+                    label_serializer = AnnotationLabelSerializer(data={
+                        "label_type": "TOKEN_LABEL",
+                        "color":"grey",
+                        "description": "NLM Structural Label",
+                        "icon": "expand",
+                        "text": label_name,
+                        "creator_id": user_id
+                    })
+                    label_serializer.is_valid(raise_exception=True)
+                    label_obj = label_serializer.save()
+                    set_permissions_for_obj_to_user(
+                        user_id, label_obj, [PermissionTypes.ALL]
+                    )
+                    existing_text_labels[label_name] = label_obj
+            else:
+                label_obj = existing_text_labels[label_name]
+
+            annot_obj = Annotation.objects.create(
+                raw_text=label_data["rawText"],
+                page=label_data["page"],
+                json=label_data["annotation_json"],
+                annotation_label=label_obj,
+                document=doc,
+                creator_id=user_id,
+            )
+            annot_obj.save()
+            set_permissions_for_obj_to_user(user_id, annot_obj, [PermissionTypes.ALL])
+
+    document.save()
+
 
 @celery_app.task(
     autoretry_for=(Exception,), retry_backoff=True, retry_kwargs={"max_retries": 5}

diff --git a/opencontractserver/tests/fixtures/__init__.py b/opencontractserver/tests/fixtures/__init__.py
@@ -26,6 +26,10 @@
     "Agreement_ZrZJLLv.pdf"
 )
 
+# files for nlm ingestor pipeline test
+NLM_INGESTOR_SAMPLE_PDF = pathlib.Path(__file__).parent / "sample.pdf"
+NLM_INGESTOR_EXPECTED_JSON = pathlib.Path(__file__).parent / "nlm_ingestor_output_for_sample_pdf.json"
+
 SAMPLE_PDF_FILE_TWO_PATH = pathlib.Path(__file__).parent / "USC Title 1 - CHAPTER 1.pdf"
 
 PUBLIC_PDF_URL_LIST = pathlib.Path(__file__).parent / "test_pdf_file_urls.txt"