Skip to content

Commit

Permalink
Merge pull request #13 from moheladwy/AddLanguageDetectionSupport
Browse files Browse the repository at this point in the history
Add language detection support, Improved Error Handling, and Improved Code Quality.
  • Loading branch information
moheladwy authored Dec 16, 2024
2 parents a634b3e + ed58f58 commit ce3759e
Show file tree
Hide file tree
Showing 5 changed files with 195 additions and 129 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,9 @@ cython_debug/
# VS Code
# Add .vscode/ to the .gitignore file if you are using Visual Studio Code
.vscode/

*.jpg
*.jpeg
*.png
*.gif
output*.txt
4 changes: 3 additions & 1 deletion .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,6 @@ disable=
C0411, # Wrong import order
E1101, # No member
W0612, # Unused variable
W0718 # Broad exception caught
W0718, # Broad exception caught
R1705, # Unnecessary "else" after "return"
R0902, # Too many instance attributes
310 changes: 184 additions & 126 deletions OCR4Linux.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# ========================================================================================================================
# Author:
# Mohamed Hussein Al-Adawy
# Version: 1.1.0
# Version: 1.2.0
# Description:
# OCR4Linux.py is a Python script that handles image preprocessing and text extraction using Tesseract OCR.
# The script takes an input image, processes it for optimal OCR accuracy, and extracts text while preserving
Expand Down Expand Up @@ -38,182 +38,240 @@
import os
from PIL import Image
import pytesseract
import cv2
import numpy as np


class TesseractConfig:
"""
TesseractConfig is a class that provides functionality to preprocess images,
and extract text from them using Tesseract OCR.
TesseractConfig is a class that configures and uses Tesseract OCR to extract text from images.
langs (str): The languages to be used by Tesseract for OCR.
custom_config (str): Custom configuration string for Tesseract.
ouput_encoding (str): The encoding to be used for the output file.
Methods:
__init__():
Initializes the TesseractConfig instance with command line arguments.
preprocess_image(image):
Preprocesses the given image to improve OCR accuracy.
Args:
image (PIL.Image): The image to preprocess.
Returns:
PIL.Image: The preprocessed image.
extract_text_with_lines(image):
Extracts text from the given image while preserving line breaks.
Args:
image (PIL.Image): The image from which to extract text.
Returns:
str: The extracted text with line breaks preserved.
help():
Prints the usage information for the script.
main():
The main method that processes the image and extracts text.
Returns:
int: 0 if successful, 1 otherwise.
__init__(self, image_path: str, output_path: str):
Initializes the TesseractConfig class with the provided image and output file paths.
extract_text_with_lines(image: Image) -> str:
Uses Tesseract OCR to extract text from the provided image, preserving line breaks.
main() -> int:
Main function to process the image and extract text. Opens the image, extracts its text,
and saves the extracted text to an output file. Returns 0 if successful, 1 otherwise.
"""

def __init__(self):
def __init__(self, image_path: str, output_path: str):
"""
Initializes the TesseractConfig class with the provided image and output file paths.
Attributes:
args_num (int): The number of expected command-line arguments.
script_name (str): The name of the script being executed.
image_path (str): The path to the input image file.
output_path (str): The path to the output file where results will be saved.
oem_mode (int): The OCR Engine Mode (OEM) for Tesseract.
psm_mode (int): The Page Segmentation Mode (PSM) for Tesseract.
langs (str): The languages to be used by Tesseract for OCR.
custom_config (str): Custom configuration string for Tesseract.
ouput_encoding (str): The encoding to be used for the output file.
"""
self.args_num = 3
self.script_name = sys.argv[0]
self.image_path = sys.argv[1]
self.output_path = sys.argv[2]
self.image_path = image_path
self.output_path = output_path
self.oem_mode = 3 # Default LSTM engine
self.psm_mode = 6 # Uniform block of text
self.available_langs = pytesseract.get_languages()
self.langs = '+'.join(filter(None, self.available_langs)
) if self.available_langs else 'eng'
self.custom_config = f'--oem {self.oem_mode} --psm {self.psm_mode}'
self.ouput_encoding = 'utf-8'

def preprocess_image(self, image) -> Image:
def extract_text_with_lines(self, image: Image) -> str:
"""
Preprocess image for better OCR accuracy.
This function converts the input image to grayscale, applies thresholding
to binarize the image, and removes noise using a median blur filter.
This method uses Tesseract OCR to extract text from the provided image.
Args:
image (PIL.Image.Image): The input image to preprocess.
image: The image from which to extract text. This should be a format
supported by the pytesseract library.
Returns:
PIL.Image.Image: The preprocessed image.
A string containing the extracted text with line breaks preserved.
"""
# Convert to grayscale
gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
# Apply thresholding
thresh = cv2.threshold(
gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
# Noise removal
denoised = cv2.medianBlur(thresh, 3)
return Image.fromarray(denoised)
return pytesseract.image_to_string(
image=image, lang=self.langs, config=self.custom_config)

def extract_text_with_lines(self, image: Image) -> str:
def main(self) -> int:
"""
Extract text from an image while preserving line breaks.
This method uses Tesseract OCR to extract text from the provided image,
preserving the layout and line breaks. It filters out low-confidence
results to improve the accuracy of the extracted text.
Main function to process the image and extract text.
Args:
image: The image from which to extract text. This should be a format
supported by the pytesseract library.
This function performs the following steps:
1. Opens the image and extracts its text while preserving line breaks.
2. Saves the extracted text to an output file.
Returns:
A string containing the extracted text with line breaks preserved.
int: 0 if text extraction is successful, 1 otherwise.
"""
# Get image dimensions
custom_config = r'--oem 3 --psm 6'
# Extract text with layout preservation
data = pytesseract.image_to_data(
image, config=custom_config, output_type=pytesseract.Output.DICT)

# Group text by line
lines = {}
for i, _ in enumerate(data['level']):
if int(data['conf'][i]) > 60: # Filter low confidence results
page_num = data['page_num'][i]
block_num = data['block_num'][i]
par_num = data['par_num'][i]
line_num = data['line_num'][i]

key = f"{page_num}_{block_num}_{par_num}_{line_num}"
if key not in lines:
lines[key] = []
lines[key].append(data['text'][i])

# Join text preserving line breaks
return '\n'.join(' '.join(line).strip() for line in lines.values() if ''.join(line).strip())
try:
# Open and process the image
with Image.open(self.image_path) as image:
# Extract text with line preservation
extracted_text = self.extract_text_with_lines(image)

# Save the extracted text to a file
with open(self.output_path, 'w', encoding=self.ouput_encoding) as file:
file.write(extracted_text)

return 0

except Exception as e:
print(f"Error processing image because: {str(e)}")
return 1


class Program:
def __init__(self):
"""
Initializes the Program class with the following attributes:
- args_num: Number of arguments expected by the script.
- author: Author of the script.
- email: Author's email address.
- github: URL to the GitHub repository.
- version: Version of the script.
- description: Brief description of the script's functionality.
- useges: List of usage examples for the script.
- examples: List of example commands for using the script.
- arguments: List of arguments that the script accepts with their descriptions.
"""
self.args_num = 3
self.author = "Mohamed Hussein Al-Adawy"
self.email = "[email protected]"
self.github = "https://github.com/moheladwy/OCR4Linux"
self.version = "1.2.0"
self.description = \
" OCR4Linux.py is a Python script that handles image preprocessing\n" + \
" and text extraction using Tesseract OCR. The script takes an input\n" + \
" based on the language in the image."
self.useges = [
"python OCR4Linux.py <image_path> <output_path>",
"python OCR4Linux.py [-l | --list-langs]",
"python OCR4Linux.py [-h | --help]"
]
self.examples = [
"python OCR4Linux.py screenshot.png output.txt",
"python OCR4Linux.py -l",
"python OCR4Linux.py -h"
]
self.arguments = [
"file_path: Path to the python script",
"image_path: Path to the image file",
"output_path: Path to the output text file",
"-l, --list-langs: List all available languages for OCR in the system",
"-h, --help: Display this help message, then exit"
]

def help(self) -> None:
"""
Prints the usage instructions for the OCR4Linux script.
This method displays the correct way to run the script, including the required
arguments and their descriptions.
Usage:
python <script_name> <image_path> <output_path>
Arguments:
file_path: Path to the python script
image_path: Path to the image file
output_path: Path to the output text file
arguments and their descriptions. It also provides examples of how to use the script.
"""
print(f"Usage: python {self.script_name} <image_path> <output_path>")
print("OCR4Linux - OCR script for Linux using Tesseract")
print(f"Version: {self.version}")
print(f"Author: {self.author}")
print(f"Email: {self.email}")
print(f"GitHub: {self.github}")
print()
print("Description:")
print(self.description)
print()
print("Usage:")
for usege in self.useges:
print(f" - {usege}")
print()
print("Example:")
for example in self.examples:
print(f" - {example}")
print()
print("Arguments:")
print(" file_path: Path to the python script")
print(" image_path: Path to the image file")
print(" output_path: Path to the output text file")
for argument in self.arguments:
print(f" {argument}")

def main(self) -> int:
def check_arguments(self) -> int:
"""
Main function to process the image and extract text.
Checks the command line arguments for validity.
This function performs the following steps:
1. Checks command line arguments for validity.
2. Verifies if the specified image file exists.
3. Opens and processes the image.
4. Extracts text from the processed image while preserving line breaks.
5. Saves the extracted text to an output file.
Handles the following options:
- Standard usage: <image_path> <output_path>
- Help: -h or --help
- List languages: -l or --list-langs
Returns:
int: 0 if text extraction is successful, 1 otherwise.
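int: 0 if help or the language list was displayed, 1 if the arguments are invalid (help is printed), 2 if the arguments are valid and OCR processing should proceed.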
"""
# Check command line arguments
if len(sys.argv) != self.args_num or sys.argv[1] in ['-h', '--help']:
if len(sys.argv) == 2 and sys.argv[1] in ['-l', '--list-langs']:
self.list_available_languages()
return 0
elif len(sys.argv) == 2 and sys.argv[1] in ['-h', '--help']:
self.help()
return 0
elif len(sys.argv) != self.args_num:
self.help()
return 1
return 2

# Check if file exists
if not os.path.exists(self.image_path):
print(f"Error: File '{self.image_path}' not found")
return 1
def list_available_languages(self) -> None:
"""
Displays all available languages for Tesseract OCR.
"""
langs = pytesseract.get_languages()
if not langs:
print("Error: No languages found")
return

try:
# Open and process the image
with Image.open(self.image_path) as image:
# Preprocess the image
processed_image = self.preprocess_image(image)
print("Available languages for OCR:")
for lang in langs:
print(f" - {lang}")

# Extract text with line preservation
extracted_text = self.extract_text_with_lines(processed_image)
def check_image_path(self, image_path: str) -> bool:
"""
Checks if the specified image file exists.
# Save the extracted text to a file
with open(self.output_path, 'w', encoding='utf-8') as file:
file.write(extracted_text)
Args:
image_path: The path to the image file to be checked.
print("Text extraction completed successfully")
return 0
Returns:
bool: True if the image file exists, False otherwise.
"""
if not os.path.exists(image_path):
print(f"Error: File '{image_path}' not found")
return False
return True

except Exception as e:
print(f"Error processing image because: {str(e)}")
def main(self):
"""
Main function to execute the OCR process.
This function performs the following steps:
1. Checks if the correct number of arguments is provided.
2. Verifies if the image file exists.
3. Creates an instance of the TesseractConfig class and runs the OCR process.
Returns:
int: Returns 1 if there is an error with the arguments or image path, 0 if only the help message or the language list was requested, otherwise returns the result of the TesseractConfig main function.
"""
# Check if the correct number of arguments is provided
result = self.check_arguments()
if result == 1:
return 1
elif result == 0:
return 0

# Check if the image file exists
if not self.check_image_path(sys.argv[1]):
return 1

# Create an instance of the TesseractConfig class
tesseract = TesseractConfig(sys.argv[1], sys.argv[2])
return tesseract.main()


if __name__ == "__main__":
sys.exit(TesseractConfig().main())
sys.exit(Program().main())
Loading

0 comments on commit ce3759e

Please sign in to comment.