Worked through test_extract_tasks not actually fully testing extract …

…anymore. This was due in part to an incompatibility between urllib3 (which is NOT locked and clearly was upgraded in my env) and the version of vcr.py I had locked to. That is fixed. Also fixed a syntax error in how we were loading the SentenceTransformer embed model.
JSv4 · Jan 2, 2025 · 8dadd14 · 8dadd14
1 parent bb018ae
commit 8dadd14
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 9 deletions.
diff --git a/model_preloaders/download_docling_models.py b/model_preloaders/download_docling_models.py
@@ -1,6 +1,7 @@
 import logging
 from pathlib import Path
 
+import easyocr
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 
 # Configure logging
@@ -37,8 +38,15 @@ def download_docling_models(artifacts_path: str) -> None:
 
     # Explicitly prefetch and download the models
     StandardPdfPipeline.download_models_hf(local_dir=artifacts_path, force=True)
+    logger.info(
+        f"Docling LAYOUT models have been downloaded and saved to '{artifacts_path}'."
+    )
 
-    logger.info(f"Docling models have been downloaded and saved to '{artifacts_path}'.")
+    logger.info("Proceed to attempt to download the EasyOCR models...")
+    # Hoping this works to force download of the EasyOCR models
+    easyocr.Reader(["ch_tra", "en"], model_storage_directory=artifacts_path)
+
+    logger.info(f"EasyOCR models have been downloaded and saved to '{artifacts_path}'.")
 
 
 if __name__ == "__main__":

diff --git a/opencontractserver/pipeline/parsers/docling_parser.py b/opencontractserver/pipeline/parsers/docling_parser.py
@@ -11,7 +11,7 @@
 from django.conf import settings
 from django.core.files.storage import default_storage
 from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
-from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
 from docling_core.types.doc import (
@@ -185,11 +185,17 @@ def __init__(self):
         # Log the contents of the models directory
         logger.info(f"Docling models directory contents: {os.listdir(artifacts_path)}")
 
+        # TODO - expose some settings from here - like GPU acceleration
+        ocr_options = EasyOcrOptions(
+            model_storage_directory=artifacts_path  # We want to preload this to avoid SLOW download at runtime
+        )
+
         pipeline_options = PdfPipelineOptions(
             artifacts_path=artifacts_path,
             do_ocr=True,
             do_table_structure=True,
             generate_page_images=True,
+            ocr_options=ocr_options,
         )
         self.doc_converter = DocumentConverter(
             format_options={

diff --git a/opencontractserver/tasks/data_extract_tasks.py b/opencontractserver/tasks/data_extract_tasks.py
@@ -83,7 +83,7 @@ def oc_llama_index_doc_query(
         document = datacell.document
 
         embed_model = HuggingFaceEmbedding(
-            model_name="multi-qa-MiniLM-L6-cos-v1", cache_folder="/models"
+            "/models/sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
         )  # Using our pre-load cache path where the model was stored on container build
         Settings.embed_model = embed_model
 
@@ -223,7 +223,7 @@ def oc_llama_index_doc_query(
         retrieved_annotation_ids = [
             n.node.extra_info["annotation_id"] for n in retrieved_nodes
         ]
-        if retrieved_annotation_ids:
+        if len(retrieved_annotation_ids) > 0:
             datacell.sources.add(*retrieved_annotation_ids)
 
         raw_retrieved_text = "\n".join(
@@ -248,7 +248,7 @@ def oc_llama_index_doc_query(
         logger.info(f"Retrieved {len(relationships)} relationships")
 
         relationship_sections = []
-        if relationships:
+        if relationships.count() > 0:
             relationship_sections.append(
                 "\n========== Sections Related to Nodes Most Semantically Similar to Query =========="
             )
@@ -592,7 +592,7 @@ def llama_index_react_agent_query(cell_id):
 
         document = datacell.document
         embed_model = HuggingFaceEmbedding(
-            model_name="multi-qa-MiniLM-L6-cos-v1", cache_folder="/models"
+            "/models/sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
         )  # Using our pre-load cache path where the model was stored on container build
         Settings.embed_model = embed_model
 

diff --git a/opencontractserver/tests/test_extract_tasks.py b/opencontractserver/tests/test_extract_tasks.py
@@ -109,8 +109,7 @@ def setUp(self):
             backend_lock=True,
         )
 
-        # Run ingest pipeline SYNCHRONOUS and, with @responses.activate decorator, no API call ought to go out to
-        # nlm-ingestor host
+        # Run ingest pipeline SYNCHRONOUS
         ingest_doc.delay(user_id=self.user.id, doc_id=self.doc2.id)
 
         # Manually run the calcs for the embeddings as post_save hook is hard
@@ -162,6 +161,9 @@ def test_run_extract_task(self):
         rows = DocumentAnalysisRow.objects.filter(extract=self.extract)
         self.assertEqual(3, rows.count())
 
+        # TODO - this does not actually verify that the extract WORKED.
+        # Looking at the codecov report, the tests seem to keep failing when setting up the embedder:
+
         for cell in Datacell.objects.all():
             print(f"Cell data: {cell.data}")
             print(f"Cell started: {cell.started}")

diff --git a/requirements/local.txt b/requirements/local.txt
@@ -15,7 +15,7 @@ pytest-cov==6.0.0  # https://github.com/pytest-dev/pytest-cov
 pytest-sugar==1.0.0  # https://github.com/Frozenball/pytest-sugar
 djangorestframework-stubs==1.8.0  # https://github.com/typeddjango/djangorestframework-stubs
 responses==0.22.0  # https://github.com/getsentry/responses
-git+https://github.com/kevin1024/vcrpy.git@35650b141b5689eed84eac05c23b48412c76dd52 # VCR.py 6.0.* has deprecated setuptools which broke recently.
+vcrpy==7.0.0
 
 # Profiling
 # ------------------------------------------------------------------------------