From 52ed7aa440704d029f0f2bf392a0e95fa8af6024 Mon Sep 17 00:00:00 2001 From: "Jorj X. McKie" Date: Sun, 21 Jul 2024 07:23:09 -0400 Subject: [PATCH] Changes for v0.0.10 --- docs/src/api.rst | 10 +- docs/src/changes.rst | 22 ++ pymupdf4llm/pymupdf4llm/__init__.py | 2 +- .../pymupdf4llm/helpers/pymupdf_rag.py | 303 ++++++++++++------ pymupdf4llm/setup.py | 2 +- 5 files changed, 245 insertions(+), 94 deletions(-) diff --git a/docs/src/api.rst b/docs/src/api.rst index e67e64b4..7a62f77b 100644 --- a/docs/src/api.rst +++ b/docs/src/api.rst @@ -10,7 +10,7 @@ API Prints the version of the library. -.. method:: to_markdown(doc: pymupdf.Document | str, *, pages: list | range | None = None, hdr_info: Any = None, write_images: bool = False, dpi: int = 150, margins=(0, 50, 0, 50), page_chunks: bool = False, page_width: float = 612, page_height: float = None, table_strategy="lines_strict", graphics_limit: int = None) -> str | list[dict] +.. method:: to_markdown(doc: pymupdf.Document | str, *, pages: list | range | None = None, hdr_info: Any = None, write_images: bool = False, dpi: int = 150, image_path="", image_format="png", force_text=True, margins=(0, 50, 0, 50), page_chunks: bool = False, page_width: float = 612, page_height: float = None, table_strategy="lines_strict", graphics_limit: int = None) -> str | list[dict] Read the pages of the file and outputs the text of its pages in |Markdown| format. How this should happen in detail can be influenced by a number of parameters. Please note that there exists **support for building page chunks** from the |Markdown| text. @@ -20,10 +20,16 @@ API :arg hdr_info: optional. Use this if you want to provide your own header detection logic. This may be a callable or an object having a method named `get_header_id`. It must accept a text span (a span dictionary as contained in `extractDict `_) and a keyword parameter "page" (which is the owning `Page `_ object). It must return a string "" or up to 6 "#" characters followed by 1 space. 
If omitted, a full document scan will be performed to find the most popular font sizes and derive header levels based on them. To completely avoid this behavior specify `hdr_info=lambda s, page=None: ""` or `hdr_info=False`. - :arg bool write_images: when encountering images or vector graphics, PNG images will be created from the respective page area and stored in the folder of the document. Markdown references will be generated pointing to these images. Any text contained in these areas will not be included in the text output (but appear as part of the images). Therefore, if your document has text written on full page images, make sure to set this parameter to `False`. + :arg bool write_images: when encountering images or vector graphics, images will be created from the respective page area and stored in the specified folder. Markdown references will be generated pointing to these images. Any text contained in these areas appears as part of the images; whether it is also included in the text output is controlled by `force_text`. Therefore, if for instance your document has text written on full page images, make sure to set this parameter to `False`. :arg int dpi: specify the desired image resolution in dots per inch. Relevant only if `write_images=True`. Default value is 150. + :arg str image_path: store images in this folder. Relevant only if `write_images=True`. Default is the empty string, which stores the images in the current working directory. + + :arg str image_format: specify the desired image format via its extension. Default is "png" (portable network graphics). Another popular format may be "jpg". Possible values are all `supported output formats <https://pymupdf.readthedocs.io/en/latest/pixmap.html#supported-output-image-formats>`_. + + :arg bool force_text: generate text output even for text overlapping images / graphics. This text then appears after the respective image. If `write_images=True` this parameter may be set to `False` to suppress repetition of text on images. It must not be `False` if `write_images=False`, because image and text output cannot both be suppressed (a `ValueError` is raised). + :arg float,list margins: a float or a sequence of 2 or 4 floats specifying page borders. Only objects inside the margins will be considered for output. 
* `margin=f` yields `(f, f, f, f)` for `(left, top, right, bottom)`. diff --git a/docs/src/changes.rst b/docs/src/changes.rst index b8436151..1fb9436a 100644 --- a/docs/src/changes.rst +++ b/docs/src/changes.rst @@ -4,6 +4,28 @@ Change Log =========================================================================== +Changes in version 0.0.10 +-------------------------- + +Fixes: +~~~~~~~ + +* `73 <https://github.com/pymupdf/RAG/issues/73>`_ "bug in to_markdown internal function" +* `74 <https://github.com/pymupdf/RAG/issues/74>`_ "minimum area for images & vector graphics" +* `75 <https://github.com/pymupdf/RAG/issues/75>`_ "Poor Markdown Generation for Particular PDF" +* `76 <https://github.com/pymupdf/RAG/issues/76>`_ "suggestion on useful api parameters" + + +Improvements: +~~~~~~~~~~~~~~ +* Improved recognition of "insignificant" vector graphics. Graphics like text highlights or borders will be ignored. +* The format of saved images can now be controlled via the new parameter `image_format`. +* Images can be stored in a specific folder via the new parameter `image_path`. +* Images are **not stored if contained** in another image on the same page. +* Images are **not stored if too small:** if width or height are less than 5% of the corresponding page dimension. +* By default, all text is written. If `write_images=True`, text on images / graphics can be suppressed by setting `force_text=False`. 
+ + Changes in version 0.0.9 -------------------------- diff --git a/pymupdf4llm/pymupdf4llm/__init__.py b/pymupdf4llm/pymupdf4llm/__init__.py index fd721dca..0d3e7fb2 100644 --- a/pymupdf4llm/pymupdf4llm/__init__.py +++ b/pymupdf4llm/pymupdf4llm/__init__.py @@ -1,6 +1,6 @@ from .helpers.pymupdf_rag import IdentifyHeaders, to_markdown -__version__ = "0.0.9" +__version__ = "0.0.10" version = __version__ version_tuple = tuple(map(int, version.split("."))) diff --git a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py index 78c8712a..d60d0f7b 100644 --- a/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py +++ b/pymupdf4llm/pymupdf4llm/helpers/pymupdf_rag.py @@ -49,7 +49,7 @@ chr(8226), chr(9679), ) -GRAPHICS_TEXT = "\n![%s](%s)\n" +GRAPHICS_TEXT = "\n![](%s)\n" class IdentifyHeaders: @@ -131,13 +131,95 @@ def get_header_id(self, span: dict, page=None) -> str: return hdr_id +def poly_area(points): + """Compute the area of the polygon represented by the given points. + + We are using the shoelace algorithm (Gauss) for this. 
+ """ + # remove duplicated connector points first + for i in range(len(points) - 1, 0, -1): + if points[i] == points[i - 1]: + del points[i] + + area = 0 + for i in range(len(points) - 1): + p0 = fitz.Point(points[i]) + p1 = fitz.Point(points[i + 1]) + area += p0.x * p1.y - p1.x * p0.y + return abs(area) / 2 + + +def refine_boxes(boxes): + """Join any rectangles with a pairwise non-empty overlap.""" + new_rects = [] + # list of all vector graphic rectangles + prects = boxes[:] + + while prects: # the algorithm will empty this list + r = +prects[0] # copy of first rectangle + repeat = True # initialize condition + while repeat: + repeat = False # set false as default + for i in range(len(prects) - 1, 0, -1): # from back to front + if r.intersects(prects[i]): # enlarge first rect with this + r |= prects[i] + del prects[i] # delete this rect + repeat = True # indicate we must try again + + new_rects.append(r) + del prects[0] + + new_rects = sorted(set(new_rects), key=lambda r: (r.x0, r.y0)) + return new_rects + + +def is_significant(box, paths): + """Check whether the rectangle "box" contains 'significant' drawings. + + For this to be true, at least one path must cover an area + that is less than 90% of the area of box. Otherwise we assume + that the graphic is decoration (highlighting, border-only etc.). + """ + box_area = abs(box) * 0.9 # 90% of area of box + + for p in paths: + if p["rect"] not in box: + continue + if p["type"] == "f" and set([i[0] for i in p["items"]]) == {"re"}: + # only borderless rectangles are contained: ignore this path + continue + points = [] # list of points represented by the items. + # We are going to append all the points as they occur. + for itm in p["items"]: + if itm[0] in ("l", "c"): # line or curve + points.extend(itm[1:]) # append all the points + elif itm[0] == "q": # quad + q = itm[1] + # follow corners anti-clockwise + points.extend([q.ul, q.ll, q.lr, q.ur, q.ul]) + else: # rectangles come in two flavors. 
+ # starting point is always top-left + r = itm[1] + if itm[-1] == 1: # anti-clockwise (the standard) + points.extend([r.tl, r.bl, r.br, r.tr, r.tl]) + else: # clockwise: area counts as negative + points.extend([r.tl, r.tr, r.br, r.bl, r.tl]) + area = poly_area(points) # compute area of polygon + if area < box_area: # less than threshold: graphic is significant + return True + return False + + def to_markdown( doc, *, pages: list = None, hdr_info=None, - write_images: bool = False, - page_chunks: bool = False, + write_images=False, + image_path="", + image_format="png", + force_text=True, + page_chunks=False, margins=(0, 50, 0, 50), dpi=150, page_width=612, @@ -148,10 +230,13 @@ def to_markdown( """Process the document and return the text of the selected pages. Args: - doc: pymupdf.Document or string. + doc: fitz.Document or string. pages: list of page numbers to consider (0-based). hdr_info: callable or object having a method named 'get_hdr_info'. write_images: (bool) whether to save images / drawing as files. + image_path: (str) folder into which images should be stored. + image_format: (str) desired image format. Choose a supported one. + force_text: (bool) output text even if it overlaps images / graphics. page_chunks: (bool) whether to segment output by page. margins: do not consider content overlapping margin areas. dpi: (int) desired resolution for generated images. @@ -161,8 +246,14 @@ def to_markdown( graphics_limit: (int) ignore page with too many vector graphics. 
""" - + if write_images is False and force_text is False: + raise ValueError("Image and text output cannot both be suppressed.") DPI = dpi + IMG_EXTENSION = image_format + IMG_PATH = image_path + if IMG_PATH and write_images is True and not os.path.exists(IMG_PATH): + os.mkdir(IMG_PATH) + GRAPHICS_LIMIT = graphics_limit if not isinstance(doc, fitz.Document): doc = fitz.open(doc) @@ -187,19 +278,15 @@ def to_markdown( if len(margins) == 2: margins = (0, margins[0], 0, margins[1]) if len(margins) != 4: - raise ValueError( - "margins must be one, two or four floats" - ) + raise ValueError("margins must be one, two or four floats") elif not all([hasattr(m, "__float__") for m in margins]): raise ValueError("margin values must be floats") - # If "hdr_info" is not an object having method "get_header_id", scan the + # If "hdr_info" is not an object with a method "get_header_id", scan the # document and use font sizes as header level indicators. if callable(hdr_info): get_header_id = hdr_info - elif hasattr(hdr_info, "get_header_id") and callable( - hdr_info.get_header_id - ): + elif hasattr(hdr_info, "get_header_id") and callable(hdr_info.get_header_id): get_header_id = hdr_info.get_header_id elif hdr_info is False: get_header_id = lambda s, page=None: "" @@ -228,12 +315,19 @@ def resolve_links(links, span): return text def save_image(page, rect, i): - """Optionally render the rect part of a page.""" - filename = page.parent.name.replace("\\", "/") - image_path = f"{filename}-{page.number}-{i}.png" + """Optionally render the rect part of a page. 
+ + We will always ignore images with an edge smaller than 5% + of the corresponding page edge.""" + if rect.width < page.rect.width * 0.05 or rect.height < page.rect.height * 0.05: + return "" + filename = os.path.basename(page.parent.name) + image_filename = os.path.join( + image_path, f"{filename}-{page.number}-{i}.{IMG_EXTENSION}" + ) if write_images is True: - page.get_pixmap(clip=rect, dpi=DPI).save(image_path) - return os.path.basename(image_path) + page.get_pixmap(clip=rect, dpi=DPI).save(image_filename) + return image_filename.replace("\\", "/") return "" def write_text( @@ -244,6 +338,7 @@ def write_text( tab_rects: dict = None, img_rects: dict = None, links: list = None, + force_text=force_text, ) -> string: """Output the text found inside the given clip. @@ -260,6 +355,7 @@ def write_text( via their own 'to_markdown' method. Images and vector graphics are optionally saved as files and pointed to by respective markdown text. """ + if clip is None: clip = textpage.rect out_string = "" @@ -276,14 +372,16 @@ def write_text( prev_hdr_string = None for lrect, spans in nlines: - # there may tables or images inside the text block: skip them + # there may be tables or images inside the text block: skip them if intersects_rects(lrect, tab_rects0) or intersects_rects( lrect, img_rects0 ): continue - # Pick up tables intersecting this text block - for i, tab_rect in sorted( + # ------------------------------------------------------------ + # Pick up tables ABOVE this text block + # ------------------------------------------------------------ + for i, _ in sorted( [ j for j in tab_rects.items() @@ -294,8 +392,10 @@ def write_text( out_string += "\n" + tabs[i].to_markdown(clean=False) + "\n" del tab_rects[i] - # Pick up images / graphics intersecting this text block - for i, img_rect in sorted( + # ------------------------------------------------------------ + # Pick up images / graphics ABOVE this text block + # 
------------------------------------------------------------ + for i, temp_rect in sorted( [ j for j in img_rects.items() @@ -303,9 +403,25 @@ def write_text( ], key=lambda j: (j[1].y1, j[1].x0), ): - pathname = save_image(page, img_rect, i) + pathname = save_image(page, temp_rect, i) if pathname: - out_string += GRAPHICS_TEXT % (pathname, pathname) + out_string += GRAPHICS_TEXT % pathname + + # recursive invocation + if force_text: + img_txt = write_text( + page, + textpage, + clip=temp_rect, + tabs=None, + tab_rects={}, + img_rects={}, + links=links, + force_text=True, + ) + + if not is_white(img_txt): + out_string += img_txt del img_rects[i] text = " ".join([s["text"] for s in spans]) @@ -384,9 +500,7 @@ def write_text( if ltext: text = f"{hdr_string}{prefix}{ltext}{suffix} " else: - text = ( - f"{hdr_string}{prefix}{s['text'].strip()}{suffix} " - ) + text = f"{hdr_string}{prefix}{s['text'].strip()}{suffix} " if text.startswith(bullet): text = "- " + text[1:] @@ -399,9 +513,7 @@ def write_text( code = False return ( - out_string.replace(" \n", "\n") - .replace(" ", " ") - .replace("\n\n\n", "\n\n") + out_string.replace(" \n", "\n").replace(" ", " ").replace("\n\n\n", "\n\n") ) def is_in_rects(rect, rect_list): @@ -438,29 +550,56 @@ def output_tables(tabs, text_rect, tab_rects): del tab_rects[i] # do not touch this table twice return this_md - def output_images(page, text_rect, img_rects): + def output_images(page, textpage, text_rect, img_rects): """Output images and graphics above text rectangle.""" if img_rects is None: return "" this_md = "" # markdown string - if text_rect is not None: # select tables above the text block + if text_rect is not None: # select images above the text block for i, img_rect in sorted( [j for j in img_rects.items() if j[1].y1 <= text_rect.y0], key=lambda j: (j[1].y1, j[1].x0), ): pathname = save_image(page, img_rect, i) if pathname: - this_md += GRAPHICS_TEXT % (pathname, pathname) + this_md += GRAPHICS_TEXT % pathname + if 
force_text: + img_txt = write_text( + page, + textpage, + clip=img_rect, + tabs=None, + tab_rects={}, # we have no tables here + img_rects={}, # we have no other images here + links=[], # rely on explicit HTML syntax + force_text=True, + ) + if not is_white(img_txt): # was there text at all? + this_md += img_txt + del img_rects[i] # do not touch this image twice - else: # output all remaining table + else: # output all remaining images for i, img_rect in sorted( img_rects.items(), key=lambda j: (j[1].y1, j[1].x0), ): pathname = save_image(page, img_rect, i) if pathname: - this_md += GRAPHICS_TEXT % (pathname, pathname) + this_md += GRAPHICS_TEXT % pathname + if force_text: + img_txt = write_text( + page, + textpage, + clip=img_rect, + tabs=None, + tab_rects={}, # we have no tables here + img_rects={}, # we have no other images here + links=[], # rely on explicit HTML syntax + force_text=True, + ) + if not is_white(img_txt): + this_md += img_txt del img_rects[i] # do not touch this image twice return this_md @@ -502,10 +641,21 @@ def get_page_output(doc, pno, margins, textflags): # make a TextPage for all later extractions textpage = page.get_textpage(flags=textflags, clip=clip) - img_info = [ - img for img in page.get_image_info() if img["bbox"] in clip - ] - images = img_info[:] + # extract images on page + # ignore images contained in another one (simplified mechanism) + img_info = page.get_image_info()[:] + # sort descending by image area size + img_info.sort(key=lambda i: abs(fitz.Rect(i["bbox"])), reverse=True) + # run from back to front (= small to large) + for i in range(len(img_info) - 1, 0, -1): + img1 = img_info[i] + img0 = img_info[i - 1] + if ( + fitz.Rect(img1["bbox"]) & page.rect + in fitz.Rect(img0["bbox"]) & page.rect + ): + del img_info[i] # contained in some larger image + images = img_info tables = [] graphics = [] @@ -527,8 +677,9 @@ def get_page_output(doc, pno, margins, textflags): # list of table rectangles tab_rects0 = 
list(tab_rects.values()) - # Select paths that are not contained in any table - page_clip = page.rect + (36, 36, -36, -36) # ignore full page graphics + # Select paths not contained in any table + # ignore full page graphics + page_clip = page.rect + (36, 36, -36, -36) paths = [ p for p in page.get_drawings() @@ -538,61 +689,32 @@ def get_page_output(doc, pno, margins, textflags): and p["rect"].height < page_clip.height ] - # We also ignore vector graphics that only represent "background - # sugar". - vg_clusters = [] # worthwhile vector graphics go here - - # walk through all vector graphics not belonging to a table - for bbox in page.cluster_drawings(drawings=paths): - subbox = bbox + (3, 3, -3, -3) # sub rect without any border - box_area = abs(bbox) - include = False - for p in paths: - mp = (p["rect"].tl + p["rect"].br) / 2 # center point of rect - - # fill-only paths or being part of the border will not - # make this a worthwhile vector grahic - if mp not in subbox or p["type"] == "f": - continue - - # this checks if all items are part of the bbox border - near_border = set() - for itm in p["items"]: # walk through path items - if itm[0] == "re": # a full-sized rectangle - if abs(itm[1]) / box_area < 1e-3: - near_border.add(True) # is part of the border - elif itm[0] in ("c", "l"): # curves and lines - for temp in itm[1:]: - # if their points are on the border - near_border.add(temp not in subbox) - # if any stroked path has a point inside bbox (i.e. not on its - # border then this vector graphic is treated as significant - if not near_border == {True}: - include = True - break - if include is True: # this box is a significant vector graphic - vg_clusters.append(bbox) - - actual_paths = [ - p for p in paths if is_in_rects(p["rect"], vg_clusters) - ] + # We also ignore vector graphics that only represent "text + # emphasizing sugar". 
+ vg_clusters0 = [] # worthwhile vector graphics go here - vg_clusters0 = [ - r - for r in vg_clusters - if not intersects_rects(r, tab_rects0) and r.height > 20 - ] + # walk through all vector graphics outside any table + for bbox in refine_boxes(page.cluster_drawings(drawings=paths)): + if is_significant(bbox, paths): + vg_clusters0.append(bbox) - if write_images is True: - vg_clusters0 += [fitz.Rect(i["bbox"]) for i in img_info] + # remove paths that are not in some relevant graphic + actual_paths = [p for p in paths if is_in_rects(p["rect"], vg_clusters0)] + + # also add image rectangles to the list + vg_clusters0 += [fitz.Rect(i["bbox"]) for i in img_info] + + # these may no longer be pairwise disjoint: + # remove area overlaps by joining into larger rects + vg_clusters0 = refine_boxes(vg_clusters0) vg_clusters = dict((i, r) for i, r in enumerate(vg_clusters0)) - # Determine text column bboxes on page, avoiding tables and graphics + # identify text bboxes on page, avoiding tables, images and graphics text_rects = column_boxes( page, paths=actual_paths, - no_image_text=write_images, + no_image_text=True, textpage=textpage, avoid=tab_rects0 + vg_clusters0, footer_margin=margins[3], @@ -606,7 +728,7 @@ def get_page_output(doc, pno, margins, textflags): for text_rect in text_rects: # output tables above this block of text md_string += output_tables(tabs, text_rect, tab_rects) - md_string += output_images(page, text_rect, vg_clusters) + md_string += output_images(page, textpage, text_rect, vg_clusters) # output text inside this rectangle md_string += write_text( @@ -617,12 +739,13 @@ def get_page_output(doc, pno, margins, textflags): tab_rects=tab_rects, img_rects=vg_clusters, links=links, + force_text=force_text, ) md_string = md_string.replace(" ,", ",").replace("-\n", "") # write any remaining tables and images md_string += output_tables(tabs, None, tab_rects) - md_string += output_images(None, tab_rects, None) + md_string += output_images(page, textpage, None, 
vg_clusters) md_string += "\n-----\n\n" while md_string.startswith("\n"): md_string = md_string[1:] diff --git a/pymupdf4llm/setup.py b/pymupdf4llm/setup.py index 0f27b2db..40141947 100644 --- a/pymupdf4llm/setup.py +++ b/pymupdf4llm/setup.py @@ -17,7 +17,7 @@ setuptools.setup( name="pymupdf4llm", - version="0.0.9", + version="0.0.10", author="Artifex", author_email="support@artifex.com", description="PyMuPDF Utilities for LLM/RAG",