diff --git a/nb.ipynb b/nb.ipynb index 17cd362..6940f1a 100644 --- a/nb.ipynb +++ b/nb.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -23,14 +23,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "import pdfplumber\n", "\n", - "page_idx = 0\n", - "with pdfplumber.open(\"/Users/johnathanchiu/Downloads/san-jose-pd-firearm-sample.pdf\") as pdf:\n", + "page_idx = 15\n", + "with pdfplumber.open(\"/Users/johnathanchiu/Downloads/21-11-1156.pdf\") as pdf:\n", " crops = segment_pdf_page(pdf.pages[page_idx])\n", "\n", " im = pdf.pages[page_idx].to_image()\n", @@ -42,16 +42,20 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from PIL import Image, ImageDraw\n", "\n", - "ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00014.jpg\"\n", - "ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00016.jpg\"\n", - "ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00066.jpg\"\n", - "ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00071.jpg\"\n", + "# ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00014.jpg\"\n", + "# ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00016.jpg\"\n", + "# ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00066.jpg\"\n", + "# ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00071.jpg\"\n", + "ifile = \"/Users/johnathanchiu/Downloads/PM209/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e_00008.jpg\"\n", + "ifile = \"/Users/johnathanchiu/Downloads/PM209/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e_00022.jpg\"\n", + "ifile = \"/Users/johnathanchiu/Downloads/PM209/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e_00024.jpg\"\n", + "ifile = \"/Users/johnathanchiu/Downloads/PM209/images/sony_speaker_ca7086d3-effb-4230-8f16-26066834c1e3/images/sony_speaker_ca7086d3-effb-4230-8f16-26066834c1e3_00009.jpg\"\n", "img = Image.open(ifile)\n", "\n", "draw = ImageDraw.Draw(img, \"RGBA\")\n", diff --git a/segmentor/document/pdf.py b/segmentor/document/pdf.py index 3f3ea55..023fce0 100644 --- a/segmentor/document/pdf.py +++ b/segmentor/document/pdf.py @@ -1,15 +1,12 @@ from dataclasses import dataclass -from typing import List, Tuple +from typing import List -from pdfplumber.page import CroppedPage, Page +from pdfplumber.page import CroppedPage import numpy as np from .div import div_intersections -OBJECT_TYPES = ["line", "curve", "rect", "char", "image"] - - @dataclass class Section: page_crop: CroppedPage @@ -17,31 +14,31 @@ class Section: seg_depth: int = 0 -def pdf_page_scan( - page: Page, - line_spacing: float = 5.0, - vertical_scan: bool = True, - debug: bool = False, -): - page_objs = page.objects - page_bbox = page.bbox +def pdf_page_scan(page: CroppedPage, line_spacing=5.0, vertical_scan=True, debug=False): # vertical scan implies the lines are going across the page dropped from top to bottom - if not vertical_scan: + page_bbox = page.bbox + page_objs = page.objects + if vertical_scan: + p0, p1 = "top", "bottom" page_dim = (page_bbox[1], page_bbox[3]) - p0, p1 = "x0", "x1" + line_spacing = 5.0 # arbitrary hyperparameters else: + p0, p1 = "x0", "x1" page_dim = (page_bbox[0], page_bbox[2]) - p0, p1 = "top", "bottom" + line_spacing = 8.0 # arbitrary hyperparameters scan_intersects = [] scan_lines = list(np.arange(*page_dim, line_spacing)) for scan_line in scan_lines: is_crossed = False - for obj_type in OBJECT_TYPES: + for obj_type in page_objs: + # We only check objects that fall into these categories + if obj_type not in {"line", "curve", "rect", "char", "image"}: + continue for obj in page_objs[obj_type]: if obj[p0] < scan_line < obj[p1]: - scan_intersects.append(True) is_crossed = True + scan_intersects.append(True) break if is_crossed: break @@ -57,10 +54,7 @@ def pdf_page_scan( def section_page( - page: Section, - page_breaks: List[Tuple[int, int]], - vertical_div: bool = True, - debug_info: List[Tuple[bool, int]] = None, + page: Section, page_breaks, vertical_div=True, debug_info=None ) -> List[CroppedPage]: if debug_info: im = page.page_crop.to_image()