diff --git a/docs/src/changes.rst b/docs/src/changes.rst index d9bb793e..5335df39 100644 --- a/docs/src/changes.rst +++ b/docs/src/changes.rst @@ -4,6 +4,27 @@ Change Log =========================================================================== +Changes in version 0.0.15 +-------------------------- + +Fixes: +~~~~~~~ + +* `138 `_ "Table is not extracted and some text order was wrong." +* `135 `_ "Problem with multiple columns in simple text." +* `134 `_ "Exclude images based on size threshold parameter." +* `132 `_ "Optionally embed images as base64 string." +* `128 `_ "Enhanced image embedding format." + + +Improvements: +~~~~~~~~~~~~~~ +* New parameter `embed_images` (bool) **embeds** images and vector graphics in the markdown text as base64-encoded strings. Ignores `write_images` and `image_path` parameters. +* New parameter `image_size_limit` which is a float between 0 and 1, default is 0.05 (5%). Causes images to be ignored if their width or height values are smaller than the corresponding fraction of the page's width or height. +* The algorithm that determines the sequence of the text rectangles on multi-column pages has been improved. +* Change of the header identification algorithm: If more than six header levels are required for a document, then all text with a font size larger than body text is assumed to be a header of level 6 (i.e. HTML "h6" = "###### "). + + Changes in version 0.0.13 -------------------------- @@ -19,7 +40,6 @@ Improvements: * New parameter `extract_words` enforces `page_chunks=True` and adds a "words" list to each page dictionary. 
- Changes in version 0.0.11 -------------------------- diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index 75f6156c..512fbeb8 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,6 +1,6 @@ from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown -__version__ = "0.0.14" +__version__ = "0.0.15" version = __version__ version_tuple = tuple(map(int, version.split("."))) diff --git a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py index fb134b15..8af6a1e8 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py +++ b/pymupdf4llm/pymupdf4llm/helpers/get_text_lines.py @@ -34,7 +34,8 @@ def get_raw_lines(textpage, clip=None, tolerance=3): Result is a sorted list of line objects that consist of the recomputed line boundary box and the sorted list of spans in that line. - This result can then easily be converted e.g. to plain or markdown text. + This result can then easily be converted e.g. to plain text and other + formats like Markdown or JSON. Args: textpage: (mandatory) TextPage object @@ -45,7 +46,7 @@ def get_raw_lines(textpage, clip=None, tolerance=3): Returns: A sorted list of items (rect, [spans]), each representing one line. The - spans are sorted left to right, Span dictionaries have been changed: + spans are sorted left to right. 
Span dictionaries have been changed: - "bbox" has been converted to a Rect object - "line" (new) the line number in TextPage.extractDICT - "block" (new) the block number in TextPage.extractDICT @@ -98,7 +99,7 @@ def sanitize_spans(line): spans = [] # all spans in TextPage here for bno, b in enumerate(blocks): # the numbered blocks for lno, line in enumerate(b["lines"]): # the numbered lines - if abs(1-line["dir"][0]) > 1e-3: # only accept horizontal text + if abs(1 - line["dir"][0]) > 1e-3: # only accept horizontal text continue for sno, s in enumerate(line["spans"]): # the numered spans sbbox = pymupdf.Rect(s["bbox"]) # span bbox as a Rect @@ -131,7 +132,10 @@ def sanitize_spans(line): sbbox = s["bbox"] # this bbox sbbox0 = line[-1]["bbox"] # previous bbox # if any of top or bottom coordinates are close enough, join... - if abs(sbbox.y1 - sbbox0.y1) <= y_delta or abs(sbbox.y0 - sbbox0.y0) <= y_delta: + if ( + abs(sbbox.y1 - sbbox0.y1) <= y_delta + or abs(sbbox.y0 - sbbox0.y0) <= y_delta + ): line.append(s) # append to this line lrect |= sbbox # extend line rectangle continue @@ -152,7 +156,9 @@ def sanitize_spans(line): return nlines -def get_text_lines(page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False): +def get_text_lines( + page, *, textpage=None, clip=None, sep="\t", tolerance=3, ocr=False +): """Extract text by line keeping natural reading sequence. 
Notes: diff --git a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py index 8580b892..3c96bcb0 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/multi_column.py +++ b/pymupdf4llm/pymupdf4llm/helpers/multi_column.py @@ -64,6 +64,8 @@ import pymupdf +pymupdf.TOOLS.set_small_glyph_heights(True) + def column_boxes( page, @@ -91,36 +93,11 @@ def is_white(text): """Check for relevant text.""" return WHITE.issuperset(text) - # compute relevant page area - clip = +page.rect - clip.y1 -= footer_margin # Remove footer area - clip.y0 += header_margin # Remove header area - - if paths is None: - paths = page.get_drawings() - - if textpage is None: - textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT) - - bboxes = [] - - # path rectangles - path_rects = [] - - # image bboxes - img_bboxes = [] - if avoid is not None: - img_bboxes.extend(avoid) - - # bboxes of non-horizontal text - # avoid when expanding horizontal text boxes - vert_bboxes = [] - def in_bbox(bb, bboxes): """Return 1-based number if a bbox contains bb, else return 0.""" - for i, bbox in enumerate(bboxes): + for i, bbox in enumerate(bboxes, start=1): if bb in bbox: - return i + 1 + return i return 0 def intersects_bboxes(bb, bboxes): @@ -132,7 +109,8 @@ def intersects_bboxes(bb, bboxes): def can_extend(temp, bb, bboxlist, vert_bboxes): """Determines whether rectangle 'temp' can be extended by 'bb' - without intersecting any of the rectangles contained in 'bboxlist'. + without intersecting any of the rectangles contained in 'bboxlist' + or 'vert_bboxes'. Items of bboxlist may be None if they have been removed. @@ -148,6 +126,42 @@ def can_extend(temp, bb, bboxlist, vert_bboxes): return True + def clean_nblocks(nblocks): + """Do some elementary cleaning.""" + + # 1. remove any duplicate blocks. 
+ blen = len(nblocks) + if blen < 2: + return nblocks + start = blen - 1 + for i in range(start, -1, -1): + bb1 = nblocks[i] + bb0 = nblocks[i - 1] + if bb0 == bb1: + del nblocks[i] + + # 2. repair sequence in special cases: + # consecutive bboxes with almost same bottom value are sorted ascending + # by x-coordinate. + y1 = nblocks[0].y1 # first bottom coordinate + i0 = 0 # its index + i1 = -1 # index of last bbox with same bottom + + # Iterate over bboxes, identifying segments with approx. same bottom value. + # Replace every segment by its sorted version. + for i in range(1, len(nblocks)): + b1 = nblocks[i] + if abs(b1.y1 - y1) > 3: # different bottom + if i1 > i0: # segment length > 1? Sort it! + nblocks[i0 : i1 + 1] = sorted( + nblocks[i0 : i1 + 1], key=lambda b: b.x0 + ) + y1 = b1.y1 # store new bottom value + i0 = i # store its start index + i1 = i # store current index + if i1 > i0: # segment waiting to be sorted + nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0) + return nblocks def join_rects_phase1(bboxes): """Postprocess identified text blocks, phase 1. @@ -156,7 +170,7 @@ def join_rects_phase1(bboxes): This means that their intersection is valid (but may be empty). To prefer vertical joins, we will ignore small horizontal gaps. 
""" - delta=(0,-3,0,3) # allow thid gap above and below + delta = (0, 0, 0, 2) # allow this gap below prects = bboxes[:] new_rects = [] while prects: @@ -165,7 +179,7 @@ def join_rects_phase1(bboxes): while repeat: repeat = False for i in range(len(prects) - 1, 0, -1): - if ((prect0+delta) & (prects[i]+delta)).is_valid: + if not ((prect0 + delta) & prects[i]).is_empty: prect0 |= prects[i] del prects[i] repeat = True @@ -211,10 +225,10 @@ def join_rects_phase2(bboxes): new_rects.append(r) return new_rects - def join_rects_phase3(bboxes): + def join_rects_phase3(bboxes, path_rects): prects = bboxes[:] - prects.sort(key=lambda b: (b.x0, b.y0)) new_rects = [] + while prects: prect0 = prects[0] repeat = True @@ -222,15 +236,15 @@ def join_rects_phase3(bboxes): repeat = False for i in range(len(prects) - 1, 0, -1): prect1 = prects[i] + # do not join across columns if prect1.x0 > prect0.x1 or prect1.x1 < prect0.x0: continue - temp = prect0 | prects[i] + # do not join different backgrounds + if in_bbox(prect0, path_rects) != in_bbox(prect1, path_rects): + continue + temp = prect0 | prect1 test = set( - [ - tuple(b) - for b in prects + new_rects - if b.intersects(temp) - ] + [tuple(b) for b in prects + new_rects if b.intersects(temp)] ) if test == set((tuple(prect0), tuple(prect1))): prect0 |= prect1 @@ -238,55 +252,101 @@ def join_rects_phase3(bboxes): repeat = True new_rects.append(prect0) del prects[0] - new_rects.sort(key=lambda b: (b.y0, b.x0)) - return new_rects - def clean_nblocks(nblocks): - """Do some elementary cleaning.""" + """ + Hopefully the most reasonable sorting sequence: + At this point we have finished identifying blocks that wrap text. + We now need to determine the SEQUENCE by which text extraction from + these blocks should take place. This is hardly possible with 100% + certainty. Our sorting approach is guided by the following thought: + 1. Extraction should start with the block whose top-left corner is the + left-most and top-most. + 2. 
Any blocks further to the right should be extracted later - even if + their top-left corner is higher up on the page. + 3. Sorting the identified rectangles must therefore happen using a + tuple (y, x) as key, where y is not smaller (= higher up) than that + of the left-most block with a non-empty vertical overlap. + 4. To continue "left block" with "next is ...", its sort key must be + Q +---------+ tuple (P.y, Q.x). + | next is | + P +-------+ | this | + | left | | block | + | block | +---------+ + +-------+ + """ + sort_rects = [] # copy of "new_rects" with a computed sort key + for box in new_rects: + # search for the left-most rect that overlaps like "P" above + # candidates must have the same background + background = in_bbox(box, path_rects) # this background + left_rects = sorted( + [ + r + for r in new_rects + if r.x1 < box.x0 + and (box.y0 <= r.y0 <= box.y1 or box.y0 <= r.y1 <= box.y1) + # and in_bbox(r, path_rects) == background + ], + key=lambda r: r.x1, + ) + if left_rects: # if a "P" rectangle was found ... + key = (left_rects[-1].y0, box.x0) # use this key + else: + key = (box.y0, box.x0) # else use the original (Q.y, Q.x). + sort_rects.append((box, key)) + sort_rects.sort(key=lambda sr: sr[1]) # by computed key + new_rects = [sr[0] for sr in sort_rects] # extract sorted rectangles + + # move shaded text rects into a separate list + shadow_rects = [] + # for i in range(len(new_rects) - 1, 0, -1): + # r = +new_rects[i] + # if in_bbox(r, path_rects): # text with shaded background + # shadow_rects.insert(0, r) # put in front to keep sequence + # del new_rects[i] + return new_rects + shadow_rects - # 1. remove any duplicate blocks. 
- blen = len(nblocks) - if blen < 2: - return nblocks - start = blen - 1 - for i in range(start, -1, -1): - bb1 = nblocks[i] - bb0 = nblocks[i - 1] - if bb0 == bb1: - del nblocks[i] + # compute relevant page area + clip = +page.rect + clip.y1 -= footer_margin # Remove footer area + clip.y0 += header_margin # Remove header area - # 2. repair sequence in special cases: - # consecutive bboxes with almost same bottom value are sorted ascending - # by x-coordinate. - y1 = nblocks[0].y1 # first bottom coordinate - i0 = 0 # its index - i1 = -1 # index of last bbox with same bottom + paths = [ + p + for p in page.get_drawings() + if p["rect"].width < clip.width and p["rect"].height < clip.height + ] - # Iterate over bboxes, identifying segments with approx. same bottom value. - # Replace every segment by its sorted version. - for i in range(1, len(nblocks)): - b1 = nblocks[i] - if abs(b1.y1 - y1) > 10: # different bottom - if i1 > i0: # segment length > 1? Sort it! - nblocks[i0 : i1 + 1] = sorted( - nblocks[i0 : i1 + 1], key=lambda b: b.x0 - ) - y1 = b1.y1 # store new bottom value - i0 = i # store its start index - i1 = i # store current index - if i1 > i0: # segment waiting to be sorted - nblocks[i0 : i1 + 1] = sorted( - nblocks[i0 : i1 + 1], key=lambda b: b.x0 - ) - return nblocks + if textpage is None: + textpage = page.get_textpage(clip=clip, flags=pymupdf.TEXTFLAGS_TEXT) + + bboxes = [] + + # image bboxes + img_bboxes = [] + if avoid is not None: + img_bboxes.extend(avoid) + + # non-horizontal text boxes, avoid when expanding other text boxes + vert_bboxes = [] - # extract vector graphics + # path rectangles + path_rects = [] for p in paths: - path_rects.append(p["rect"].irect) - path_bboxes = path_rects + # give empty path rectangles some small width or height + prect = p["rect"] + lwidth = 0.5 if (_ := p["width"]) is None else _ * 0.5 + + if prect.width == 0: + prect.x0 -= lwidth + prect.x1 += lwidth + if prect.height == 0: + prect.y0 -= lwidth + prect.y1 += lwidth 
+ path_rects.append(prect) # sort path bboxes by ascending top, then left coordinates - path_bboxes.sort(key=lambda b: (b.y0, b.x0)) + path_rects.sort(key=lambda b: (b.y0, b.x0)) # bboxes of images on page, no need to sort them for item in page.get_images(): @@ -297,7 +357,7 @@ def clean_nblocks(nblocks): # Make block rectangles, ignoring non-horizontal text for b in blocks: - bbox = pymupdf.IRect(b["bbox"]) # bbox of the block + bbox = pymupdf.Rect(b["bbox"]) # bbox of the block # ignore text written upon images if no_image_text and in_bbox(bbox, img_bboxes): @@ -309,15 +369,15 @@ def clean_nblocks(nblocks): except IndexError: continue - if line0["dir"] != (1, 0): # only accept horizontal text - vert_bboxes.append(bbox) + if abs(1 - line0["dir"][0]) > 1e-3: # only (almost) horizontal text + vert_bboxes.append(bbox) # a block with non-horizontal text continue - srect = pymupdf.EMPTY_IRECT() + srect = pymupdf.EMPTY_RECT() for line in b["lines"]: - lbbox = pymupdf.IRect(line["bbox"]) - text = "".join([s["text"].strip() for s in line["spans"]]) - if len(text) > 1: + lbbox = pymupdf.Rect(line["bbox"]) + text = "".join([s["text"] for s in line["spans"]]) + if not is_white(text): srect |= lbbox bbox = +srect @@ -325,12 +385,7 @@ def clean_nblocks(nblocks): bboxes.append(bbox) # Sort text bboxes by ascending background, top, then left coordinates - bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0)) - - # Extend bboxes to the right where possible - # bboxes = extend_right( - # bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes - # ) + bboxes.sort(key=lambda k: (in_bbox(k, path_rects), k.y0, k.x0)) # immediately return of no text found if bboxes == []: @@ -351,16 +406,16 @@ def clean_nblocks(nblocks): nbb = nblocks[j] # a new block # never join across columns - if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0: + if bb is None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0: continue # never join across different background colors - if in_bbox(nbb, 
path_bboxes) != in_bbox(bb, path_bboxes): + if in_bbox(nbb, path_rects) != in_bbox(bb, path_rects): continue temp = bb | nbb # temporary extension of new block check = can_extend(temp, nbb, nblocks, vert_bboxes) - if check == True: + if check is True: break if not check: # bb cannot be used to extend any of the new bboxes @@ -370,7 +425,7 @@ def clean_nblocks(nblocks): # check if some remaining bbox is contained in temp check = can_extend(temp, bb, bboxes, vert_bboxes) - if check == False: + if check is False: nblocks.append(bb) else: nblocks[j] = temp @@ -378,10 +433,11 @@ def clean_nblocks(nblocks): # do some elementary cleaning nblocks = clean_nblocks(nblocks) - # final joining of overlapping rectangles + + # several phases of rectangle joining nblocks = join_rects_phase1(nblocks) nblocks = join_rects_phase2(nblocks) - nblocks = join_rects_phase3(nblocks) + nblocks = join_rects_phase3(nblocks, path_rects) # return identified text bboxes return nblocks diff --git a/pymupdf4llm/pymupdf4llm/helpers/progress.py b/pymupdf4llm/pymupdf4llm/helpers/progress.py index db671de6..e71e601a 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/progress.py +++ b/pymupdf4llm/pymupdf4llm/helpers/progress.py @@ -13,7 +13,7 @@ """ import sys -from typing import List, Any +from typing import Any, List class _ProgressBar: @@ -29,9 +29,13 @@ def __init__(self, items: List[Any], progress_width: int = 40): self._increment = self._progress_width / self._len if self._len else 1 # Init progress bar - sys.stdout.write("[%s] (0/%d)" % (" " * self._progress_width, self._len)) + sys.stdout.write( + "[%s] (0/%d)" % (" " * self._progress_width, self._len) + ) sys.stdout.flush() - sys.stdout.write("\b" * (self._progress_width + len(str(self._len)) + 6)) + sys.stdout.write( + "\b" * (self._progress_width + len(str(self._len)) + 6) + ) def __iter__(self): return self @@ -57,7 +61,9 @@ def __next__(self): # Update the numerical progress padded_index = str(self._current_index).rjust(self._len_digits) 
progress_info = f" ({padded_index}/{self._len})" - sys.stdout.write("\b" * (self._progress_width + len(progress_info) + 1)) + sys.stdout.write( + "\b" * (self._progress_width + len(progress_info) + 1) + ) sys.stdout.write("[") sys.stdout.write( "=" * int(self._current_index * self._progress_width / self._len) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 03e12fc6..a0778b4d 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -28,9 +28,8 @@ import os import string - +from binascii import b2a_base64 import pymupdf - from pymupdf4llm.helpers.get_text_lines import get_raw_lines, is_white from pymupdf4llm.helpers.multi_column import column_boxes from pymupdf4llm.helpers.progress import ProgressBar @@ -106,13 +105,13 @@ def __init__( reverse=True, ) if temp: - b_limit = max(body_limit, temp[0][0]) + self.body_limit = min(body_limit, temp[0][0]) else: - b_limit = body_limit + self.body_limit = body_limit # identify up to 6 font sizes as header candidates sizes = sorted( - [f for f in fontsizes.keys() if f > b_limit], + [f for f in fontsizes.keys() if f > self.body_limit], reverse=True, )[:6] @@ -128,6 +127,8 @@ def get_header_id(self, span: dict, page=None) -> str: """ fontsize = round(span["size"]) # compute fontsize hdr_id = self.header_id.get(fontsize, "") + if not hdr_id and fontsize > self.body_limit: + hdr_id = "###### " return hdr_id @@ -136,15 +137,17 @@ def poly_area(points): We are using the "shoelace" algorithm (Gauss) for this. 
""" + # make a local copy of points (avoid changing the original) + pts = points[:] # remove duplicated connector points first - for i in range(len(points) - 1, 0, -1): - if points[i] == points[i - 1]: - del points[i] + for i in range(len(pts) - 1, 0, -1): + if pts[i] == pts[i - 1]: + del pts[i] area = 0 - for i in range(len(points) - 1): - p0 = pymupdf.Point(points[i]) - p1 = pymupdf.Point(points[i + 1]) + for i in range(len(pts) - 1): + p0 = pymupdf.Point(pts[i]) + p1 = pymupdf.Point(pts[i + 1]) area += p0.x * p1.y - p1.x * p0.y return abs(area) / 2 @@ -177,7 +180,7 @@ def is_significant(box, paths): """Check whether the rectangle "box" contains 'signifiant' drawings. For this to be true, at least one path must cover an area, - which is less than 90% of box. Otherwise we assume + which is smaller than 90% of box. Otherwise we assume that the graphic is decoration (highlighting, border-only etc.). """ box_area = abs(box) * 0.9 # 90% of area of box @@ -216,8 +219,10 @@ def to_markdown( pages: list = None, hdr_info=None, write_images=False, + embed_images=False, image_path="", image_format="png", + image_size_limit=0.05, force_text=True, page_chunks=False, margins=(0, 50, 0, 50), @@ -226,6 +231,7 @@ def to_markdown( page_height=None, table_strategy="lines_strict", graphics_limit=None, + fontsize_limit=3, ignore_code=False, extract_words=False, show_progress=True, @@ -237,6 +243,7 @@ def to_markdown( pages: list of page numbers to consider (0-based). hdr_info: callable or object having a method named 'get_hdr_info'. write_images: (bool) whether to save images / drawing as files. + embed_images: (bool) embed images as base64 encoded strings image_path: (str) folder into which images should be stored. image_format: (str) desired image format. Choose a supported one. force_text: (bool) output text despite of background. @@ -252,19 +259,27 @@ def to_markdown( show_progress: (bool) print progress as each page is processed. 
""" - if write_images is False and force_text is False: - raise ValueError("Image and text output cannot both be suppressed.") + if write_images is False and embed_images is False and force_text is False: + raise ValueError("Image and text on images cannot both be suppressed.") + if embed_images is True: + write_images = False + image_path = "" + if not 0 < image_size_limit < 1: + raise ValueError("'image_size_limit' must be positive and less than 1.") DPI = dpi IGNORE_CODE = ignore_code IMG_EXTENSION = image_format EXTRACT_WORDS = extract_words if EXTRACT_WORDS is True: page_chunks = True + ignore_code = True IMG_PATH = image_path if IMG_PATH and write_images is True and not os.path.exists(IMG_PATH): os.mkdir(IMG_PATH) GRAPHICS_LIMIT = graphics_limit + FONTSIZE_LIMIT = fontsize_limit + if not isinstance(doc, pymupdf.Document): doc = pymupdf.open(doc) @@ -327,19 +342,32 @@ def resolve_links(links, span): def save_image(page, rect, i): """Optionally render the rect part of a page. - We will always ignore images with an edge smaller than 5% - of the corresponding page edge.""" - if rect.width < page.rect.width * 0.05 or rect.height < page.rect.height * 0.05: + We will ignore images that are empty or that have an edge smaller + than x% of the corresponding page edge.""" + + if ( + rect.width < page.rect.width * image_size_limit + or rect.height < page.rect.height * image_size_limit + ): return "" - filename = os.path.basename(page.parent.name) - image_filename = os.path.join( - image_path, f"{filename}-{page.number}-{i}.{IMG_EXTENSION}" - ) - if write_images is True: + if write_images is True or embed_images is True: pix = page.get_pixmap(clip=rect, dpi=DPI) - if pix.height > 0 and pix.width > 0: - pix.save(image_filename) - return image_filename.replace("\\", "/") + else: + return "" + if pix.height <= 0 or pix.width <= 0: + return "" + + if write_images is True: + filename = os.path.basename(page.parent.name).replace(" ", "-") + image_filename = os.path.join( + 
IMG_PATH, f"{filename}-{page.number}-{i}.{IMG_EXTENSION}" + ) + return image_filename.replace("\\", "/") + elif embed_images is True: + # make a bas64 encoded string of the image + data = b2a_base64(pix.tobytes(IMG_EXTENSION)).decode() + data = f"data:image/{IMG_EXTENSION};base64," + data + return data return "" def write_text( @@ -380,6 +408,9 @@ def write_text( tab_rects0 = list(tab_rects.values()) img_rects0 = list(img_rects.values()) + line_rects.extend( + [l[0] for l in nlines if not intersects_rects(l[0], tab_rects0)] + ) # store line rectangles prev_lrect = None # previous line rectangle prev_bno = -1 # previous block number of line @@ -405,13 +436,19 @@ def write_text( key=lambda j: (j[1].y1, j[1].x0), ): out_string += "\n" + tabs[i].to_markdown(clean=False) + "\n" - if EXTRACT_WORDS: # determine raw line rects within this table - line_rects.extend( - [ - pymupdf.Rect(rl[0]) - for rl in get_raw_lines(textpage, clip=tab_rects[i]) - ] + if EXTRACT_WORDS: + # for "words" extraction, add table cells as line rects + cells = sorted( + set( + [ + pymupdf.Rect(c) + for c in tabs[i].header.cells + tabs[i].cells + if c is not None + ] + ), + key=lambda c: (c.y1, c.x0), ) + line_rects.extend(cells) del tab_rects[i] # ------------------------------------------------------------ @@ -566,28 +603,40 @@ def output_tables(tabs, text_rect, tab_rects, line_rects, textpage): key=lambda j: (j[1].y1, j[1].x0), ): this_md += tabs[i].to_markdown(clean=False) - if EXTRACT_WORDS: # determine raw line rects within this table - line_rects.extend( - [ - pymupdf.Rect(rl[0]) - for rl in get_raw_lines(textpage, clip=tab_rects[i]) - ] + if EXTRACT_WORDS: + # for "words" extraction, add table cells as line rects + cells = sorted( + set( + [ + pymupdf.Rect(c) + for c in tabs[i].header.cells + tabs[i].cells + if c is not None + ] + ), + key=lambda c: (c.y1, c.x0), ) + line_rects.extend(cells) del tab_rects[i] # do not touch this table twice - else: # output all remaining table + else: # 
output all remaining tables for i, trect in sorted( tab_rects.items(), key=lambda j: (j[1].y1, j[1].x0), ): this_md += tabs[i].to_markdown(clean=False) - if EXTRACT_WORDS: # determine raw line rects within this table - line_rects.extend( - [ - pymupdf.Rect(rl[0]) - for rl in get_raw_lines(textpage, clip=tab_rects[i]) - ] + if EXTRACT_WORDS: + # for "words" extraction, add table cells as line rects + cells = sorted( + set( + [ + pymupdf.Rect(c) + for c in tabs[i].header.cells + tabs[i].cells + if c is not None + ] + ), + key=lambda c: (c.y1, c.x0), ) + line_rects.extend(cells) del tab_rects[i] # do not touch this table twice return this_md @@ -653,6 +702,23 @@ def get_metadata(doc, pno): meta["page"] = pno + 1 return meta + def sort_words(words): + nwords = [] + line = [words[0]] + lrect = pymupdf.Rect(words[0][:4]) + for w in words[1:]: + if abs(w[1] - lrect.y0) <= 3 or abs(w[3] - lrect.y1) <= 3: + line.append(w) + lrect |= w[:4] + else: + line.sort(key=lambda w: w[0]) + nwords.extend(line) + line = [w] + lrect = pymupdf.Rect(w[:4]) + line.sort(key=lambda w: w[0]) + nwords.extend(line) + return nwords + def get_page_output(doc, pno, margins, textflags): """Process one page. 
@@ -801,17 +867,17 @@ def get_page_output(doc, pno, margins, textflags): if EXTRACT_WORDS is True: # output words in sequence compliant with Markdown text rawwords = textpage.extractWORDS() + rawwords.sort(key=lambda w: (w[3], w[0])) words = [] for lrect in line_rects: lwords = [] for w in rawwords: wrect = pymupdf.Rect(w[:4]) if wrect in lrect: - wrect.y0 = lrect.y0 # set upper coord to line - wrect.y1 = lrect.y1 # set lower coord to line - lwords.append(list(wrect) + list(w[4:])) + lwords.append(w) # append sorted words of this line - words.extend(sorted(lwords, key=lambda w: w[0])) + # words.extend(sorted(lwords, key=lambda w: w[0])) + words.extend(sort_words(lwords)) # remove word duplicates without spoiling the sequence # duplicates may occur for multiple reasons diff --git a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py index ec85dc7a..48c40894 100644 --- a/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py +++ b/pymupdf4llm/pymupdf4llm/llama/pdf_markdown_reader.py @@ -2,9 +2,7 @@ from typing import Any, Callable, Dict, List, Optional, Union import pymupdf - from pymupdf import Document as FitzDocument - from pymupdf4llm import IdentifyHeaders, to_markdown try: @@ -23,7 +21,9 @@ class PDFMarkdownReader(BaseReader): def __init__( self, - meta_filter: Optional[Callable[[Dict[str, Any]], Dict[str, Any]]] = None, + meta_filter: Optional[ + Callable[[Dict[str, Any]], Dict[str, Any]] + ] = None, ): self.meta_filter = meta_filter @@ -78,7 +78,9 @@ def _process_doc_page( hdr_info: IdentifyHeaders, ): """Processes a single page of a PDF document.""" - extra_info = self._process_doc_meta(doc, file_path, page_number, extra_info) + extra_info = self._process_doc_meta( + doc, file_path, page_number, extra_info + ) if self.meta_filter: extra_info = self.meta_filter(extra_info) diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 6b4fe33a..af047b36 100644 --- a/pymupdf4llm/setup.py +++ 
b/pymupdf4llm/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="pymupdf4llm", - version="0.0.14", + version="0.0.15", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG",