Skip to content

Commit

Permalink
Worked through test_extract_tasks not actually fully testing extract …
Browse files Browse the repository at this point in the history
…anymore, due in part to an incompatibility between urllib3 (which is NOT locked and was clearly upgraded in my env) and the version of vcr.py I had locked to. That is fixed. Also fixed a syntax error in how we were loading the SentenceTransformer embed model.
  • Loading branch information
JSv4 committed Jan 2, 2025
1 parent bb018ae commit 8dadd14
Show file tree
Hide file tree
Showing 5 changed files with 25 additions and 9 deletions.
10 changes: 9 additions & 1 deletion model_preloaders/download_docling_models.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from pathlib import Path

import easyocr
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Configure logging
Expand Down Expand Up @@ -37,8 +38,15 @@ def download_docling_models(artifacts_path: str) -> None:

# Explicitly prefetch and download the models
StandardPdfPipeline.download_models_hf(local_dir=artifacts_path, force=True)
logger.info(
f"Docling LAYOUT models have been downloaded and saved to '{artifacts_path}'."
)

logger.info(f"Docling models have been downloaded and saved to '{artifacts_path}'.")
logger.info("Proceed to attempt to download the EasyOCR models...")
# Hoping this works to force download of the EasyOCR models
easyocr.Reader(["ch_tra", "en"], model_storage_directory=artifacts_path)

logger.info(f"EasyOCR models have been downloaded and saved to '{artifacts_path}'.")


if __name__ == "__main__":
Expand Down
8 changes: 7 additions & 1 deletion opencontractserver/pipeline/parsers/docling_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from django.conf import settings
from django.core.files.storage import default_storage
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.types.doc import (
Expand Down Expand Up @@ -185,11 +185,17 @@ def __init__(self):
# Log the contents of the models directory
logger.info(f"Docling models directory contents: {os.listdir(artifacts_path)}")

# TODO - expose some settings from here - like GPU acceleration
ocr_options = EasyOcrOptions(
model_storage_directory=artifacts_path # We want to preload this to avoid SLOW download at runtime
)

pipeline_options = PdfPipelineOptions(
artifacts_path=artifacts_path,
do_ocr=True,
do_table_structure=True,
generate_page_images=True,
ocr_options=ocr_options,
)
self.doc_converter = DocumentConverter(
format_options={
Expand Down
8 changes: 4 additions & 4 deletions opencontractserver/tasks/data_extract_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def oc_llama_index_doc_query(
document = datacell.document

embed_model = HuggingFaceEmbedding(
model_name="multi-qa-MiniLM-L6-cos-v1", cache_folder="/models"
"/models/sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
) # Using our pre-load cache path where the model was stored on container build
Settings.embed_model = embed_model

Expand Down Expand Up @@ -223,7 +223,7 @@ def oc_llama_index_doc_query(
retrieved_annotation_ids = [

Check warning on line 223 in opencontractserver/tasks/data_extract_tasks.py

View check run for this annotation

Codecov / codecov/patch

opencontractserver/tasks/data_extract_tasks.py#L223

Added line #L223 was not covered by tests
n.node.extra_info["annotation_id"] for n in retrieved_nodes
]
if retrieved_annotation_ids:
if len(retrieved_annotation_ids) > 0:

Check warning on line 226 in opencontractserver/tasks/data_extract_tasks.py

View check run for this annotation

Codecov / codecov/patch

opencontractserver/tasks/data_extract_tasks.py#L226

Added line #L226 was not covered by tests
datacell.sources.add(*retrieved_annotation_ids)

raw_retrieved_text = "\n".join(

Check warning on line 229 in opencontractserver/tasks/data_extract_tasks.py

View check run for this annotation

Codecov / codecov/patch

opencontractserver/tasks/data_extract_tasks.py#L229

Added line #L229 was not covered by tests
Expand All @@ -248,7 +248,7 @@ def oc_llama_index_doc_query(
logger.info(f"Retrieved {len(relationships)} relationships")

Check warning on line 248 in opencontractserver/tasks/data_extract_tasks.py

View check run for this annotation

Codecov / codecov/patch

opencontractserver/tasks/data_extract_tasks.py#L248

Added line #L248 was not covered by tests

relationship_sections = []
if relationships:
if relationships.count() > 0:
relationship_sections.append(

Check warning on line 252 in opencontractserver/tasks/data_extract_tasks.py

View check run for this annotation

Codecov / codecov/patch

opencontractserver/tasks/data_extract_tasks.py#L250-L252

Added lines #L250 - L252 were not covered by tests
"\n========== Sections Related to Nodes Most Semantically Similar to Query =========="
)
Expand Down Expand Up @@ -592,7 +592,7 @@ def llama_index_react_agent_query(cell_id):

document = datacell.document
embed_model = HuggingFaceEmbedding(
model_name="multi-qa-MiniLM-L6-cos-v1", cache_folder="/models"
"/models/sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
) # Using our pre-load cache path where the model was stored on container build
Settings.embed_model = embed_model

Expand Down
6 changes: 4 additions & 2 deletions opencontractserver/tests/test_extract_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,7 @@ def setUp(self):
backend_lock=True,
)

# Run ingest pipeline SYNCHRONOUS and, with @responses.activate decorator, no API call ought to go out to
# nlm-ingestor host
# Run ingest pipeline SYNCHRONOUS
ingest_doc.delay(user_id=self.user.id, doc_id=self.doc2.id)

# Manually run the calcs for the embeddings as post_save hook is hard
Expand Down Expand Up @@ -162,6 +161,9 @@ def test_run_extract_task(self):
rows = DocumentAnalysisRow.objects.filter(extract=self.extract)
self.assertEqual(3, rows.count())

# TODO - this is not actually testing the extract WORKED.
# looking at the codecov, seems tests keep failing when setting up embedder:

for cell in Datacell.objects.all():
print(f"Cell data: {cell.data}")
print(f"Cell started: {cell.started}")
Expand Down
2 changes: 1 addition & 1 deletion requirements/local.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ pytest-cov==6.0.0 # https://github.com/pytest-dev/pytest-cov
pytest-sugar==1.0.0 # https://github.com/Frozenball/pytest-sugar
djangorestframework-stubs==1.8.0 # https://github.com/typeddjango/djangorestframework-stubs
responses==0.22.0 # https://github.com/getsentry/responses
git+https://github.com/kevin1024/vcrpy.git@35650b141b5689eed84eac05c23b48412c76dd52 # VCR.py 6.0.* has deprecated setuptools which broke recently.
vcrpy==7.0.0

# Profiling
# ------------------------------------------------------------------------------
Expand Down

0 comments on commit 8dadd14

Please sign in to comment.