Merge pull request #139 from pymupdf/v0.0.15

Version 0.0.15
pymupdf · Sep 16, 2024 · b86b33f · b86b33f
2 parents 65130d2 + 8578c49
commit b86b33f
Show file tree

Hide file tree

Showing 8 changed files with 318 additions and 162 deletions.
diff --git a/docs/src/changes.rst b/docs/src/changes.rst
@@ -4,6 +4,27 @@
 Change Log
 ===========================================================================
 
+Changes in version 0.0.15
+--------------------------
+
+Fixes:
+~~~~~~~
+
+* `138 <https://github.com/pymupdf/RAG/issues/138>`_ "Table is not extracted and some text order was wrong."
+* `135 <https://github.com/pymupdf/RAG/issues/135>`_ "Problem with multiple columns in simple text."
+* `134 <https://github.com/pymupdf/RAG/issues/134>`_ "Exclude images based on size threshold parameter."
+* `132 <https://github.com/pymupdf/RAG/issues/132>`_ "Optionally embed images as base64 string."
+* `128 <https://github.com/pymupdf/RAG/issues/128>`_ "Enhanced image embedding format."
+
+
+Improvements:
+~~~~~~~~~~~~~~
+* New parameter `embed_images` (bool) **embeds** images and vector graphics in the markdown text as base64-encoded strings. Ignores `write_images` and `image_path` parameters.
+* New parameter `image_size_limit` which is a float between 0 and 1, default is 0.05 (5%). Causes images to be ignored if their width or height values are smaller than the corresponding fraction of the page's width or height.
+* The algorithm that determines the sequence of the text rectangles on multi-column pages has been improved.
+* Change of the header identification algorithm: If more than six header levels are required for a document, then all text with a font size larger than body text is assumed to be a header of level 6 (i.e. HTML "h6" = "###### ").
+
+
 Changes in version 0.0.13
 --------------------------
 
@@ -19,7 +40,6 @@ Improvements:
 * New parameter `extract_words` enforces `page_chunks=True` and adds a "words" list to each page dictionary.
 
 
-
 Changes in version 0.0.11
 --------------------------
 

diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py
@@ -1,6 +1,6 @@
 from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown
 
-__version__ = "0.0.14"
+__version__ = "0.0.15"
 version = __version__
 version_tuple = tuple(map(int, version.split(".")))
 

diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py
@@ -34,7 +34,8 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
     Result is a sorted list of line objects that consist of the recomputed line
     boundary box and the sorted list of spans in that line.
 
-    This result can then easily be converted e.g. to plain or markdown text.
+    This result can then easily be converted e.g. to plain text and other
+    formats like Markdown or JSON.
 
     Args:
         textpage: (mandatory) TextPage object
@@ -45,7 +46,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3):
 
     Returns:
         A sorted list of items (rect, [spans]), each representing one line. The
-        spans are sorted left to right, Span dictionaries have been changed:
+        spans are sorted left to right. Span dictionaries have been changed:
         - "bbox" has been converted to a Rect object
         - "line" (new) the line number in TextPage.extractDICT
         - "block" (new) the block number in TextPage.extractDICT
@@ -98,7 +99,7 @@ def sanitize_spans(line):
     spans = []  # all spans in TextPage here
     for bno, b in enumerate(blocks):  # the numbered blocks
         for lno, line in enumerate(b["lines"]):  # the numbered lines
-            if abs(1-line["dir"][0]) > 1e-3:  # only accept horizontal text
+            if abs(1 - line["dir"][0]) > 1e-3:  # only accept horizontal text
                 continue
             for sno, s in enumerate(line["spans"]):  # the numbered spans
                 sbbox = pymupdf.Rect(s["bbox"])  # span bbox as a Rect
@@ -131,7 +132,10 @@ def sanitize_spans(line):
         sbbox = s["bbox"]  # this bbox
         sbbox0 = line[-1]["bbox"]  # previous bbox
         # if any of top or bottom coordinates are close enough, join...
-        if abs(sbbox.y1 - sbbox0.y1) <= y_delta or abs(sbbox.y0 - sbbox0.y0) <= y_delta:
+        if (
+            abs(sbbox.y1 - sbbox0.y1) <= y_delta
+            or abs(sbbox.y0 - sbbox0.y0) <= y_delta
+        ):
             line.append(s)  # append to this line
             lrect |= sbbox  # extend line rectangle
             continue
@@ -152,7 +156,9 @@ def sanitize_spans(line):
     return nlines
 
 
-def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False):
+def get_text_lines(
+    page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False
+):
     """Extract text by line keeping natural reading sequence.
 
     Notes: