Merge pull request #13 from moheladwy/AddLanguageDetectionSupport

Add language detection support, Improved Error Handling, and Improved Code Quality.
moheladwy · Dec 16, 2024 · ce3759e · ce3759e
2 parents a634b3e + ed58f58
commit ce3759e
Show file tree

Hide file tree

Showing 5 changed files with 195 additions and 129 deletions.
diff --git a/.gitignore b/.gitignore
@@ -164,3 +164,9 @@ cython_debug/
 # VS Code
 #  Add .vscode/ to the .gitignore file if you are using Visual Studio Code
 .vscode/
+
+*.jpg
+*.jpeg
+*.png
+*.gif
+output*.txt
diff --git a/.pylintrc b/.pylintrc
@@ -7,4 +7,6 @@ disable=
     C0411,  # Wrong import order
     E1101,  # No member
     W0612,  # Unused variable
-    W0718   # Broad exception caught
+    W0718,  # Broad exception caught
+    R1705,  # Unnecessary "else" after "return"
+    R0902,  # Too many instance attributes
diff --git a/OCR4Linux.py b/OCR4Linux.py
@@ -1,7 +1,7 @@
 # ========================================================================================================================
 # Author:
 #     Mohamed Hussein Al-Adawy
-# Version: 1.1.0
+# Version: 1.2.0
 # Description:
 #     OCR4Linux.py is a Python script that handles image preprocessing and text extraction using Tesseract OCR.
 #     The script takes an input image, processes it for optimal OCR accuracy, and extracts text while preserving
@@ -38,182 +38,240 @@
 import os
 from PIL import Image
 import pytesseract
-import cv2
-import numpy as np
 
 
 class TesseractConfig:
     """
-    TesseractConfig is a class that provides functionality to preprocess images,
-    and extract text from them using Tesseract OCR.
+    TesseractConfig is a class that configures and uses Tesseract OCR to extract text from images.
+
+        langs (str): The languages to be used by Tesseract for OCR.
+        custom_config (str): Custom configuration string for Tesseract.
+        ouput_encoding (str): The encoding to be used for the output file.
 
     Methods:
-        __init__():
-            Initializes the TesseractConfig instance with command line arguments.
-
-        preprocess_image(image):
-            Preprocesses the given image to improve OCR accuracy.
-            Args:
-                image (PIL.Image): The image to preprocess.
-            Returns:
-                PIL.Image: The preprocessed image.
-
-        extract_text_with_lines(image):
-            Extracts text from the given image while preserving line breaks.
-            Args:
-                image (PIL.Image): The image from which to extract text.
-            Returns:
-                str: The extracted text with line breaks preserved.
-
-        help():
-            Prints the usage information for the script.
-
-        main():
-            The main method that processes the image and extracts text.
-            Returns:
-                int: 0 if successful, 1 otherwise.
+        __init__(self, image_path: str, output_path: str):
+            Initializes the TesseractConfig class with the provided image and output file paths.
+
+        extract_text_with_lines(image: Image) -> str:
+            Uses Tesseract OCR to extract text from the provided image, preserving line breaks.
+
+        main() -> int:
+            Main function to process the image and extract text. Opens the image, extracts its text
+            with Tesseract, and saves the extracted text to an output file. Returns 0 if successful, 1 otherwise.
     """
 
-    def __init__(self):
+    def __init__(self, image_path: str, output_path: str):
         """
         Initializes the OCR4Linux class with command-line arguments.
 
         Attributes:
-            args_num (int): The number of expected command-line arguments.
-            script_name (str): The name of the script being executed.
             image_path (str): The path to the input image file.
             output_path (str): The path to the output file where results will be saved.
+            oem_mode (int): The OCR Engine Mode (OEM) for Tesseract.
+            psm_mode (int): The Page Segmentation Mode (PSM) for Tesseract.
+            langs (str): The languages to be used by Tesseract for OCR.
+            custom_config (str): Custom configuration string for Tesseract.
+            ouput_encoding (str): The encoding to be used for the output file.
         """
-        self.args_num = 3
-        self.script_name = sys.argv[0]
-        self.image_path = sys.argv[1]
-        self.output_path = sys.argv[2]
+        self.image_path = image_path
+        self.output_path = output_path
+        self.oem_mode = 3  # Default LSTM engine
+        self.psm_mode = 6  # Uniform block of text
+        self.available_langs = pytesseract.get_languages()
+        self.langs = '+'.join(filter(None, self.available_langs)
+                              ) if self.available_langs else 'eng'
+        self.custom_config = f'--oem {self.oem_mode} --psm {self.psm_mode}'
+        self.ouput_encoding = 'utf-8'
 
-    def preprocess_image(self, image) -> Image:
+    def extract_text_with_lines(self, image: Image) -> str:
         """
-        Preprocess image for better OCR accuracy.
-
-        This function converts the input image to grayscale, applies thresholding 
-        to binarize the image, and removes noise using a median blur filter.
+        This method uses Tesseract OCR to extract text from the provided image.
 
         Args:
-            image (PIL.Image.Image): The input image to preprocess.
+            image: The image from which to extract text. This should be a format
+                   supported by the pytesseract library.
 
         Returns:
-            PIL.Image.Image: The preprocessed image.
+            A string containing the extracted text with line breaks preserved.
         """
-        # Convert to grayscale
-        gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
-        # Apply thresholding
-        thresh = cv2.threshold(
-            gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
-        # Noise removal
-        denoised = cv2.medianBlur(thresh, 3)
-        return Image.fromarray(denoised)
+        return pytesseract.image_to_string(
+            image=image, lang=self.langs, config=self.custom_config)
 
-    def extract_text_with_lines(self, image: Image) -> str:
+    def main(self) -> int:
         """
-        Extract text from an image while preserving line breaks.
-
-        This method uses Tesseract OCR to extract text from the provided image,
-        preserving the layout and line breaks. It filters out low-confidence
-        results to improve the accuracy of the extracted text.
+        Main function to process the image and extract text.
 
-        Args:
-            image: The image from which to extract text. This should be a format
-                   supported by the pytesseract library.
+        This function performs the following steps:
+        1. Opens the image and extracts its text while preserving line breaks.
+        2. Saves the extracted text to an output file.
 
         Returns:
-            A string containing the extracted text with line breaks preserved.
+            int: 0 if text extraction is successful, 1 otherwise.
         """
-        # Get image dimensions
-        custom_config = r'--oem 3 --psm 6'
-        # Extract text with layout preservation
-        data = pytesseract.image_to_data(
-            image, config=custom_config, output_type=pytesseract.Output.DICT)
-
-        # Group text by line
-        lines = {}
-        for i, _ in enumerate(data['level']):
-            if int(data['conf'][i]) > 60:  # Filter low confidence results
-                page_num = data['page_num'][i]
-                block_num = data['block_num'][i]
-                par_num = data['par_num'][i]
-                line_num = data['line_num'][i]
-
-                key = f"{page_num}_{block_num}_{par_num}_{line_num}"
-                if key not in lines:
-                    lines[key] = []
-                lines[key].append(data['text'][i])
-
-        # Join text preserving line breaks
-        return '\n'.join(' '.join(line).strip() for line in lines.values() if ''.join(line).strip())
+        try:
+            # Open and process the image
+            with Image.open(self.image_path) as image:
+                # Extract text with line preservation
+                extracted_text = self.extract_text_with_lines(image)
+
+                # Save the extracted text to a file
+                with open(self.output_path, 'w', encoding=self.ouput_encoding) as file:
+                    file.write(extracted_text)
+
+                return 0
+
+        except Exception as e:
+            print(f"Error processing image because: {str(e)}")
+            return 1
+
+
+class Program:
+    def __init__(self):
+        """
+        Initializes the Program class with the following attributes:
+        - args_num: Number of arguments expected by the script.
+        - author: Author of the script.
+        - email: Author's email address.
+        - github: URL to the GitHub repository.
+        - version: Version of the script.
+        - description: Brief description of the script's functionality.
+        - useges: List of usage examples for the script.
+        - examples: List of example commands for using the script.
+        - arguments: List of arguments that the script accepts with their descriptions.
+        """
+        self.args_num = 3
+        self.author = "Mohamed Hussein Al-Adawy"
+        self.email = "[email protected]"
+        self.github = "https://github.com/moheladwy/OCR4Linux"
+        self.version = "1.2.0"
+        self.description = \
+            "    OCR4Linux.py is a Python script that handles image preprocessing\n" + \
+            "    and text extraction using Tesseract OCR. The script takes an input\n" + \
+            "    based on the language in the image."
+        self.useges = [
+            "python OCR4Linux.py <image_path> <output_path>",
+            "python OCR4Linux.py [-l | --list-langs]",
+            "python OCR4Linux.py [-h | --help]"
+        ]
+        self.examples = [
+            "python OCR4Linux.py screenshot.png output.txt",
+            "python OCR4Linux.py -l",
+            "python OCR4Linux.py -h"
+        ]
+        self.arguments = [
+            "file_path:         Path to the python script",
+            "image_path:        Path to the image file",
+            "output_path:       Path to the output text file",
+            "-l, --list-langs:  List all available languages for OCR in the system",
+            "-h, --help:        Display this help message, then exit"
+        ]
 
     def help(self) -> None:
         """
         Prints the usage instructions for the OCR4Linux script.
 
         This method displays the correct way to run the script, including the required
-        arguments and their descriptions.
-
-        Usage:
-            python <script_name> <image_path> <output_path>
-
-        Arguments:
-            file_path: Path to the python script
-            image_path: Path to the image file
-            output_path: Path to the output text file
+        arguments and their descriptions. It also provides examples of how to use the script.
         """
-        print(f"Usage: python {self.script_name} <image_path> <output_path>")
+        print("OCR4Linux - OCR script for Linux using Tesseract")
+        print(f"Version: {self.version}")
+        print(f"Author:  {self.author}")
+        print(f"Email:   {self.email}")
+        print(f"GitHub:  {self.github}")
+        print()
+        print("Description:")
+        print(self.description)
+        print()
+        print("Usage:")
+        for usege in self.useges:
+            print(f"    - {usege}")
+        print()
+        print("Example:")
+        for example in self.examples:
+            print(f"    - {example}")
+        print()
         print("Arguments:")
-        print("  file_path: Path to the python script")
-        print("  image_path: Path to the image file")
-        print("  output_path: Path to the output text file")
+        for argument in self.arguments:
+            print(f"    {argument}")
 
-    def main(self) -> int:
+    def check_arguments(self) -> int:
         """
-        Main function to process the image and extract text.
+        Checks the command line arguments for validity.
 
-        This function performs the following steps:
-        1. Checks command line arguments for validity.
-        2. Verifies if the specified image file exists.
-        3. Opens and processes the image.
-        4. Extracts text from the processed image while preserving line breaks.
-        5. Saves the extracted text to an output file.
+        Handles the following options:
+        - Standard usage: <image_path> <output_path>
+        - Help: -h or --help
+        - List languages: -l or --list-langs
 
         Returns:
-            int: 0 if text extraction is successful, 1 otherwise.
+            int: 0 if help or the language list was shown, 1 if the arguments are invalid, 2 if OCR should proceed.
         """
-        # Check command line arguments
-        if len(sys.argv) != self.args_num or sys.argv[1] in ['-h', '--help']:
+        if len(sys.argv) == 2 and sys.argv[1] in ['-l', '--list-langs']:
+            self.list_available_languages()
+            return 0
+        elif len(sys.argv) == 2 and sys.argv[1] in ['-h', '--help']:
+            self.help()
+            return 0
+        elif len(sys.argv) != self.args_num:
             self.help()
             return 1
+        return 2
 
-        # Check if file exists
-        if not os.path.exists(self.image_path):
-            print(f"Error: File '{self.image_path}' not found")
-            return 1
+    def list_available_languages(self) -> None:
+        """
+        Displays all available languages for Tesseract OCR.
+        """
+        langs = pytesseract.get_languages()
+        if not langs:
+            print("Error: No languages found")
+            return
 
-        try:
-            # Open and process the image
-            with Image.open(self.image_path) as image:
-                # Preprocess the image
-                processed_image = self.preprocess_image(image)
+        print("Available languages for OCR:")
+        for lang in langs:
+            print(f"  - {lang}")
 
-                # Extract text with line preservation
-                extracted_text = self.extract_text_with_lines(processed_image)
+    def check_image_path(self, image_path: str) -> bool:
+        """
+        Checks if the specified image file exists.
 
-                # Save the extracted text to a file
-                with open(self.output_path, 'w', encoding='utf-8') as file:
-                    file.write(extracted_text)
+        Args:
+            image_path: The path to the image file to be checked.
 
-                print("Text extraction completed successfully")
-                return 0
+        Returns:
+            bool: True if the image file exists, False otherwise.
+        """
+        if not os.path.exists(image_path):
+            print(f"Error: File '{image_path}' not found")
+            return False
+        return True
 
-        except Exception as e:
-            print(f"Error processing image because: {str(e)}")
+    def main(self):
+        """
+        Main function to execute the OCR process.
+
+        This function performs the following steps:
+        1. Checks if the correct number of arguments is provided.
+        2. Verifies if the image file exists.
+        3. Creates an instance of the TesseractConfig class and runs the OCR process.
+
+        Returns:
+            int: Returns 0 if help or the language list was shown, 1 if there is an error with the arguments or image path, otherwise returns the result of the TesseractConfig main function.
+        """
+        # Check if the correct number of arguments is provided
+        result = self.check_arguments()
+        if result == 1:
+            return 1
+        elif result == 0:
+            return 0
+
+        # Check if the image file exists
+        if not self.check_image_path(sys.argv[1]):
             return 1
 
+        # Create an instance of the TesseractConfig class
+        tesseract = TesseractConfig(sys.argv[1], sys.argv[2])
+        return tesseract.main()
+
 
 if __name__ == "__main__":
-    sys.exit(TesseractConfig().main())
+    sys.exit(Program().main())