Skip to content

Commit

Permalink
fix bug in pdf scan
Browse files Browse the repository at this point in the history
  • Loading branch information
johnathanchiu committed Sep 13, 2024
1 parent db7e51b commit 1108c1d
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 31 deletions.
22 changes: 13 additions & 9 deletions nb.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"outputs": [
{
Expand All @@ -23,14 +23,14 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"import pdfplumber\n",
"\n",
"page_idx = 0\n",
"with pdfplumber.open(\"/Users/johnathanchiu/Downloads/san-jose-pd-firearm-sample.pdf\") as pdf:\n",
"page_idx = 15\n",
"with pdfplumber.open(\"/Users/johnathanchiu/Downloads/21-11-1156.pdf\") as pdf:\n",
" crops = segment_pdf_page(pdf.pages[page_idx])\n",
"\n",
" im = pdf.pages[page_idx].to_image()\n",
Expand All @@ -42,16 +42,20 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from PIL import Image, ImageDraw\n",
"\n",
"ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00014.jpg\"\n",
"ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00016.jpg\"\n",
"ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00066.jpg\"\n",
"ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00071.jpg\"\n",
"# ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00014.jpg\"\n",
"# ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00016.jpg\"\n",
"# ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00066.jpg\"\n",
"# ifile = \"/Users/johnathanchiu/Downloads/PM209/images/Apple_iphone-13-pro-max-07300325A-repair/images/Apple_iphone-13-pro-max-07300325A-repair_00071.jpg\"\n",
"ifile = \"/Users/johnathanchiu/Downloads/PM209/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e_00008.jpg\"\n",
"ifile = \"/Users/johnathanchiu/Downloads/PM209/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e_00022.jpg\"\n",
"ifile = \"/Users/johnathanchiu/Downloads/PM209/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e/images/toshiba_air conditioner manuals_fa6db8bc-5435-4363-a3e2-e6b85ab5ad6e_00024.jpg\"\n",
"ifile = \"/Users/johnathanchiu/Downloads/PM209/images/sony_speaker_ca7086d3-effb-4230-8f16-26066834c1e3/images/sony_speaker_ca7086d3-effb-4230-8f16-26066834c1e3_00009.jpg\"\n",
"img = Image.open(ifile)\n",
"\n",
"draw = ImageDraw.Draw(img, \"RGBA\")\n",
Expand Down
38 changes: 16 additions & 22 deletions segmentor/document/pdf.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,44 @@
from dataclasses import dataclass
from typing import List, Tuple
from typing import List

from pdfplumber.page import CroppedPage, Page
from pdfplumber.page import CroppedPage
import numpy as np

from .div import div_intersections


OBJECT_TYPES = ["line", "curve", "rect", "char", "image"]


@dataclass
class Section:
    """A cropped PDF page region together with two segmentation-related fields.

    Instances are passed around as the unit of work for page sectioning; the
    wrapped crop is reachable through ``page_crop``.
    """

    # The pdfplumber crop this section wraps.
    page_crop: CroppedPage
    # NOTE(review): presumably indicates whether this section is associated
    # with a vertical segmentation pass -- confirm against the callers.
    vertical_seg: bool
    # NOTE(review): looks like a segmentation nesting/recursion depth counter
    # -- confirm. Defaults to 0 when not supplied.
    seg_depth: int = 0


def pdf_page_scan(
page: Page,
line_spacing: float = 5.0,
vertical_scan: bool = True,
debug: bool = False,
):
page_objs = page.objects
page_bbox = page.bbox
def pdf_page_scan(page: CroppedPage, line_spacing=5.0, vertical_scan=True, debug=False):
# vertical scan implies the lines are going across the page dropped from top to bottom
if not vertical_scan:
page_bbox = page.bbox
page_objs = page.objects
if vertical_scan:
p0, p1 = "top", "bottom"
page_dim = (page_bbox[1], page_bbox[3])
p0, p1 = "x0", "x1"
line_spacing = 5.0 # arbitrary hyperparameters
else:
p0, p1 = "x0", "x1"
page_dim = (page_bbox[0], page_bbox[2])
p0, p1 = "top", "bottom"
line_spacing = 8.0 # arbitrary hyperparameters

scan_intersects = []
scan_lines = list(np.arange(*page_dim, line_spacing))
for scan_line in scan_lines:
is_crossed = False
for obj_type in OBJECT_TYPES:
for obj_type in page_objs:
# We only check objects that fall into these categories
if obj_type not in {"line", "curve", "rect", "char", "image"}:
continue
for obj in page_objs[obj_type]:
if obj[p0] < scan_line < obj[p1]:
scan_intersects.append(True)
is_crossed = True
scan_intersects.append(True)
break
if is_crossed:
break
Expand All @@ -57,10 +54,7 @@ def pdf_page_scan(


def section_page(
page: Section,
page_breaks: List[Tuple[int, int]],
vertical_div: bool = True,
debug_info: List[Tuple[bool, int]] = None,
page: Section, page_breaks, vertical_div=True, debug_info=None
) -> List[CroppedPage]:
if debug_info:
im = page.page_crop.to_image()
Expand Down

0 comments on commit 1108c1d

Please sign in to comment.