-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathOCRifExist.py
60 lines (49 loc) · 2.21 KB
/
OCRifExist.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import os
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
# Function to check if a corresponding docx file exists
def docx_exists(pdf_path):
docx_path = os.path.splitext(pdf_path)[0] + '.docx'
return os.path.exists(docx_path)
# Function to convert PDF to Word document only if a corresponding docx file doesn't exist
def convert_pdf_to_word(pdf_path):
if not docx_exists(pdf_path):
# Convert the PDF to images
images = convert_from_path(pdf_path)
# Create a new Word document
doc = Document()
paragraph_format = doc.styles['Normal'].paragraph_format
paragraph_format.alignment = WD_ALIGN_PARAGRAPH.RIGHT
# Loop through each image and extract text using OCR
for i, image in enumerate(images):
# Save the image as a temporary file
image_path = f'temp_image_{i}.jpg'
image.save(image_path, 'JPEG')
# Extract text using OCR
text = pytesseract.image_to_string(Image.open(image_path), lang='fas+equ')
# Add a new page to the Word document
if i > 0:
doc.add_page_break()
# Add the extracted text to the Word document
paragraph = doc.add_paragraph(text)
paragraph.style = doc.styles['Normal']
# Save the Word document with the same name as the PDF file
docx_path = os.path.splitext(pdf_path)[0] + '.docx'
doc.save(docx_path)
# Set the path to the folder containing the PDF files
pdf_folder_path = input('Please enter the folder address: ')
# Traverse through the directory and subdirectories
for root, dirs, files in os.walk(pdf_folder_path):
for file in files:
# Check if the file is a PDF file
if file.endswith('.pdf'):
pdf_path = os.path.join(root, file)
# Check if a corresponding docx file already exists
if not docx_exists(pdf_path):
print(f'Converting {pdf_path} to Word document...')
convert_pdf_to_word(pdf_path)
else:
print(f'Docx file already exists for {pdf_path}. Skipping OCR.')