-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #13 from moheladwy/AddLanguageDetectionSupport
Add language detection support, Improved Error Handling, and Improved Code Quality.
- Loading branch information
Showing
5 changed files
with
195 additions
and
129 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
# ======================================================================================================================== | ||
# Author: | ||
# Mohamed Hussein Al-Adawy | ||
# Version: 1.1.0 | ||
# Version: 1.2.0 | ||
# Description: | ||
# OCR4Linux.py is a Python script that handles image preprocessing and text extraction using Tesseract OCR. | ||
# The script takes an input image, processes it for optimal OCR accuracy, and extracts text while preserving | ||
|
@@ -38,182 +38,240 @@ | |
import os | ||
from PIL import Image | ||
import pytesseract | ||
import cv2 | ||
import numpy as np | ||
|
||
|
||
class TesseractConfig: | ||
""" | ||
TesseractConfig is a class that provides functionality to preprocess images, | ||
and extract text from them using Tesseract OCR. | ||
TesseractConfig is a class that configures and uses Tesseract OCR to extract text from images. | ||
langs (str): The languages to be used by Tesseract for OCR. | ||
custom_config (str): Custom configuration string for Tesseract. | ||
ouput_encoding (str): The encoding to be used for the output file. | ||
Methods: | ||
__init__(): | ||
Initializes the TesseractConfig instance with command line arguments. | ||
preprocess_image(image): | ||
Preprocesses the given image to improve OCR accuracy. | ||
Args: | ||
image (PIL.Image): The image to preprocess. | ||
Returns: | ||
PIL.Image: The preprocessed image. | ||
extract_text_with_lines(image): | ||
Extracts text from the given image while preserving line breaks. | ||
Args: | ||
image (PIL.Image): The image from which to extract text. | ||
Returns: | ||
str: The extracted text with line breaks preserved. | ||
help(): | ||
Prints the usage information for the script. | ||
main(): | ||
The main method that processes the image and extracts text. | ||
Returns: | ||
int: 0 if successful, 1 otherwise. | ||
__init__(self, image_path: str, output_path: str): | ||
Initializes the TesseractConfig class with the provided image and output file paths. | ||
extract_text_with_lines(image: Image) -> str: | ||
Uses Tesseract OCR to extract text from the provided image, preserving line breaks. | ||
main() -> int: | ||
Main function to process the image and extract text. Performs validation, image processing, | ||
text extraction, and saves the extracted text to an output file. Returns 0 if successful, 1 otherwise. | ||
""" | ||
|
||
def __init__(self): | ||
def __init__(self, image_path: str, output_path: str): | ||
""" | ||
Initializes the OCR4Linux class with command-line arguments. | ||
Attributes: | ||
args_num (int): The number of expected command-line arguments. | ||
script_name (str): The name of the script being executed. | ||
image_path (str): The path to the input image file. | ||
output_path (str): The path to the output file where results will be saved. | ||
oem_mode (int): The OCR Engine Mode (OEM) for Tesseract. | ||
psm_mode (int): The Page Segmentation Mode (PSM) for Tesseract. | ||
langs (str): The languages to be used by Tesseract for OCR. | ||
custom_config (str): Custom configuration string for Tesseract. | ||
ouput_encoding (str): The encoding to be used for the output file. | ||
""" | ||
self.args_num = 3 | ||
self.script_name = sys.argv[0] | ||
self.image_path = sys.argv[1] | ||
self.output_path = sys.argv[2] | ||
self.image_path = image_path | ||
self.output_path = output_path | ||
self.oem_mode = 3 # Default LSTM engine | ||
self.psm_mode = 6 # Uniform block of text | ||
self.available_langs = pytesseract.get_languages() | ||
self.langs = '+'.join(filter(None, self.available_langs) | ||
) if self.available_langs else 'eng' | ||
self.custom_config = f'--oem {self.oem_mode} --psm {self.psm_mode}' | ||
self.ouput_encoding = 'utf-8' | ||
|
||
def preprocess_image(self, image) -> Image: | ||
def extract_text_with_lines(self, image: Image) -> str: | ||
""" | ||
Preprocess image for better OCR accuracy. | ||
This function converts the input image to grayscale, applies thresholding | ||
to binarize the image, and removes noise using a median blur filter. | ||
This method uses Tesseract OCR to extract text from the provided image. | ||
Args: | ||
image (PIL.Image.Image): The input image to preprocess. | ||
image: The image from which to extract text. This should be a format | ||
supported by the pytesseract library. | ||
Returns: | ||
PIL.Image.Image: The preprocessed image. | ||
A string containing the extracted text with line breaks preserved. | ||
""" | ||
# Convert to grayscale | ||
gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY) | ||
# Apply thresholding | ||
thresh = cv2.threshold( | ||
gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1] | ||
# Noise removal | ||
denoised = cv2.medianBlur(thresh, 3) | ||
return Image.fromarray(denoised) | ||
return pytesseract.image_to_string( | ||
image=image, lang=self.langs, config=self.custom_config) | ||
|
||
def extract_text_with_lines(self, image: Image) -> str: | ||
def main(self) -> int: | ||
""" | ||
Extract text from an image while preserving line breaks. | ||
This method uses Tesseract OCR to extract text from the provided image, | ||
preserving the layout and line breaks. It filters out low-confidence | ||
results to improve the accuracy of the extracted text. | ||
Main function to process the image and extract text. | ||
Args: | ||
image: The image from which to extract text. This should be a format | ||
supported by the pytesseract library. | ||
This function performs the following steps: | ||
1. Extracts text from the processed image while preserving line breaks. | ||
2. Saves the extracted text to an output file. | ||
Returns: | ||
A string containing the extracted text with line breaks preserved. | ||
int: 0 if text extraction is successful, 1 otherwise. | ||
""" | ||
# Get image dimensions | ||
custom_config = r'--oem 3 --psm 6' | ||
# Extract text with layout preservation | ||
data = pytesseract.image_to_data( | ||
image, config=custom_config, output_type=pytesseract.Output.DICT) | ||
|
||
# Group text by line | ||
lines = {} | ||
for i, _ in enumerate(data['level']): | ||
if int(data['conf'][i]) > 60: # Filter low confidence results | ||
page_num = data['page_num'][i] | ||
block_num = data['block_num'][i] | ||
par_num = data['par_num'][i] | ||
line_num = data['line_num'][i] | ||
|
||
key = f"{page_num}_{block_num}_{par_num}_{line_num}" | ||
if key not in lines: | ||
lines[key] = [] | ||
lines[key].append(data['text'][i]) | ||
|
||
# Join text preserving line breaks | ||
return '\n'.join(' '.join(line).strip() for line in lines.values() if ''.join(line).strip()) | ||
try: | ||
# Open and process the image | ||
with Image.open(self.image_path) as image: | ||
# Extract text with line preservation | ||
extracted_text = self.extract_text_with_lines(image) | ||
|
||
# Save the extracted text to a file | ||
with open(self.output_path, 'w', encoding=self.ouput_encoding) as file: | ||
file.write(extracted_text) | ||
|
||
return 0 | ||
|
||
except Exception as e: | ||
print(f"Error processing image because: {str(e)}") | ||
return 1 | ||
|
||
|
||
class Program: | ||
def __init__(self): | ||
""" | ||
Initializes the Program class with the following attributes: | ||
- args_num: Number of arguments expected by the script. | ||
- author: Author of the script. | ||
- email: Author's email address. | ||
- github: URL to the GitHub repository. | ||
- version: Version of the script. | ||
- description: Brief description of the script's functionality. | ||
- useges: List of usage examples for the script. | ||
- examples: List of example commands for using the script. | ||
- arguments: List of arguments that the script accepts with their descriptions. | ||
""" | ||
self.args_num = 3 | ||
self.author = "Mohamed Hussein Al-Adawy" | ||
self.email = "[email protected]" | ||
self.github = "https://github.com/moheladwy/OCR4Linux" | ||
self.version = "1.2.0" | ||
self.description = \ | ||
" OCR4Linux.py is a Python script that handles image preprocessing\n" + \ | ||
" and text extraction using Tesseract OCR. The script takes an input\n" + \ | ||
" based on the language in the image." | ||
self.useges = [ | ||
"python OCR4Linux.py <image_path> <output_path>", | ||
"python OCR4Linux.py [-l | --list-langs]", | ||
"python OCR4Linux.py [-h | --help]" | ||
] | ||
self.examples = [ | ||
"python OCR4Linux.py screenshot.png output.txt", | ||
"python OCR4Linux.py -l", | ||
"python OCR4Linux.py -h" | ||
] | ||
self.arguments = [ | ||
"file_path: Path to the python script", | ||
"image_path: Path to the image file", | ||
"output_path: Path to the output text file", | ||
"-l, --list-langs: List all available languages for OCR in the system", | ||
"-h, --help: Display this help message, then exit" | ||
] | ||
|
||
def help(self) -> None: | ||
""" | ||
Prints the usage instructions for the OCR4Linux script. | ||
This method displays the correct way to run the script, including the required | ||
arguments and their descriptions. | ||
Usage: | ||
python <script_name> <image_path> <output_path> | ||
Arguments: | ||
file_path: Path to the python script | ||
image_path: Path to the image file | ||
output_path: Path to the output text file | ||
arguments and their descriptions. It also provides examples of how to use the script. | ||
""" | ||
print(f"Usage: python {self.script_name} <image_path> <output_path>") | ||
print("OCR4Linux - OCR script for Linux using Tesseract") | ||
print(f"Version: {self.version}") | ||
print(f"Author: {self.author}") | ||
print(f"Email: {self.email}") | ||
print(f"GitHub: {self.github}") | ||
print() | ||
print("Description:") | ||
print(self.description) | ||
print() | ||
print("Usage:") | ||
for usege in self.useges: | ||
print(f" - {usege}") | ||
print() | ||
print("Example:") | ||
for example in self.examples: | ||
print(f" - {example}") | ||
print() | ||
print("Arguments:") | ||
print(" file_path: Path to the python script") | ||
print(" image_path: Path to the image file") | ||
print(" output_path: Path to the output text file") | ||
for argument in self.arguments: | ||
print(f" {argument}") | ||
|
||
def main(self) -> int: | ||
def check_arguments(self) -> int: | ||
""" | ||
Main function to process the image and extract text. | ||
Checks the command line arguments for validity. | ||
This function performs the following steps: | ||
1. Checks command line arguments for validity. | ||
2. Verifies if the specified image file exists. | ||
3. Opens and processes the image. | ||
4. Extracts text from the processed image while preserving line breaks. | ||
5. Saves the extracted text to an output file. | ||
Handles the following options: | ||
- Standard usage: <image_path> <output_path> | ||
- Help: -h or --help | ||
- List languages: -l or --list-langs | ||
Returns: | ||
int: 0 if text extraction is successful, 1 otherwise. | ||
int: 0 if the help message or the language list was displayed, 1 if the number of arguments is invalid, 2 if the arguments are valid for running OCR. | ||
""" | ||
# Check command line arguments | ||
if len(sys.argv) != self.args_num or sys.argv[1] in ['-h', '--help']: | ||
if len(sys.argv) == 2 and sys.argv[1] in ['-l', '--list-langs']: | ||
self.list_available_languages() | ||
return 0 | ||
elif len(sys.argv) == 2 and sys.argv[1] in ['-h', '--help']: | ||
self.help() | ||
return 0 | ||
elif len(sys.argv) != self.args_num: | ||
self.help() | ||
return 1 | ||
return 2 | ||
|
||
# Check if file exists | ||
if not os.path.exists(self.image_path): | ||
print(f"Error: File '{self.image_path}' not found") | ||
return 1 | ||
def list_available_languages(self) -> None: | ||
""" | ||
Displays all available languages for Tesseract OCR. | ||
""" | ||
langs = pytesseract.get_languages() | ||
if not langs: | ||
print("Error: No languages found") | ||
return | ||
|
||
try: | ||
# Open and process the image | ||
with Image.open(self.image_path) as image: | ||
# Preprocess the image | ||
processed_image = self.preprocess_image(image) | ||
print("Available languages for OCR:") | ||
for lang in langs: | ||
print(f" - {lang}") | ||
|
||
# Extract text with line preservation | ||
extracted_text = self.extract_text_with_lines(processed_image) | ||
def check_image_path(self, image_path: str) -> bool: | ||
""" | ||
Checks if the specified image file exists. | ||
# Save the extracted text to a file | ||
with open(self.output_path, 'w', encoding='utf-8') as file: | ||
file.write(extracted_text) | ||
Args: | ||
image_path: The path to the image file to be checked. | ||
print("Text extraction completed successfully") | ||
return 0 | ||
Returns: | ||
bool: True if the image file exists, False otherwise. | ||
""" | ||
if not os.path.exists(image_path): | ||
print(f"Error: File '{image_path}' not found") | ||
return False | ||
return True | ||
|
||
except Exception as e: | ||
print(f"Error processing image because: {str(e)}") | ||
def main(self): | ||
""" | ||
Main function to execute the OCR process. | ||
This function performs the following steps: | ||
1. Checks if the correct number of arguments is provided. | ||
2. Verifies if the image file exists. | ||
3. Creates an instance of the TesseractConfig class and runs the OCR process. | ||
Returns: | ||
int: Returns 0 after displaying the help message or the language list, 1 if there is an error with the arguments or image path, otherwise returns the result of the TesseractConfig main function. | ||
""" | ||
# Check if the correct number of arguments is provided | ||
result = self.check_arguments() | ||
if result == 1: | ||
return 1 | ||
elif result == 0: | ||
return 0 | ||
|
||
# Check if the image file exists | ||
if not self.check_image_path(sys.argv[1]): | ||
return 1 | ||
|
||
# Create an instance of the TesseractConfig class | ||
tesseract = TesseractConfig(sys.argv[1], sys.argv[2]) | ||
return tesseract.main() | ||
|
||
|
||
if __name__ == "__main__": | ||
sys.exit(TesseractConfig().main()) | ||
sys.exit(Program().main()) |
Oops, something went wrong.