
Commit

Fixed pre-commit issues
Cyber-Var committed Jan 28, 2025
1 parent 3b016df commit 33f87c1
Showing 135 changed files with 684 additions and 624 deletions.
12 changes: 12 additions & 0 deletions .github/workflows/causal_lm_cpp.yml
@@ -22,6 +22,18 @@ env:
w_ov_link: https://storage.openvinotoolkit.org/repositories/openvino/packages/nightly/2025.1.0-17911-83c047443de/w_openvino_toolkit_windows_2025.1.0.dev20250116_x86_64.zip

jobs:
  code-quality-checks:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install pre-commit
        run: pip install pre-commit
      - name: Run pre-commit (checks for trailing whitespaces, and non-ASCII symbols in filenames and file content)
        run: pre-commit run --all-files --show-diff-on-failure

cpp-multinomial-greedy_causal_lm-ubuntu:
runs-on: ubuntu-20.04-8-cores
defaults:
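For reference, the same checks can be reproduced locally before pushing; this is a minimal sketch using the two commands from the code-quality-checks job above (assumes Python and pip are available):

# Install pre-commit and run every configured hook against the whole repository,
# mirroring the CI job defined in the workflow above.
pip install pre-commit
pre-commit run --all-files --show-diff-on-failure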
20 changes: 20 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,20 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace # checks for files with trailing whitespaces, excluding .md and Git-related hidden files
        exclude: '\.md$|.*\.git.*'
      - id: check-merge-conflict # checks for files that contain merge conflict strings (such as <<<<<<<, =======, and >>>>>>>)
      - id: check-json # Ensures that JSON files are syntactically correct
      - id: end-of-file-fixer # ensures that each file ends with one blank line, excluding Git-related hidden files
        exclude: '.*\.git.*'
  - repo: local
    hooks:
      - id: forbid-non-ascii-filenames # runs the script that prohibits non-ASCII characters in file names
        name: Prohibit non-ASCII characters in file names
        entry: ./pre_commit_scripts/check_non_ascii_filenames.sh
        language: script
      - id: forbid-non-ascii-in-files # checks for non-ASCII characters in files (excluding Markdown and hidden files), with characters ± and ? allowed
        name: Check for non-ASCII characters in files (excluding Markdown and hidden files), with characters ± and ? allowed
        entry: ./pre_commit_scripts/check_non_ascii_in_files.sh
        language: script
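As a usage note (not part of this diff), hooks declared in a .pre-commit-config.yaml like the one above are typically wired into a local clone with pre-commit's standard workflow:

# Register the hooks as a git pre-commit hook so they run on every local commit:
pre-commit install
# Optionally check the entire tree once, not just the staged files:
pre-commit run --all-files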
20 changes: 20 additions & 0 deletions pre_commit_scripts/check_non_ascii_filenames.sh
@@ -0,0 +1,20 @@
#!/bin/bash

# Store the command output:
empty_tree=$(git hash-object -t tree /dev/null)

# Get a list of new files that might have non-ASCII characters:
problem_files=$(git diff --name-only --diff-filter=A -z "$empty_tree" | LC_ALL=C grep -P "[^\x00-\x7F]")

# Count the number of problematic files:
count=$(echo "$problem_files" | wc -w)

# Print necessary info based on the result:
if [ "$count" -ne 0 ]; then
    echo "Error: Non-ASCII characters found in filenames of new files:"
    echo "$problem_files"
    exit 1
else
    echo "Success: No non-ASCII filenames found."
fi
exit 0
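A hypothetical way to exercise this hook locally, assuming the configuration above is installed in the repository (the file name below is illustrative only):

# Stage a file whose name contains a non-ASCII character, then run just this hook:
touch "résumé_example.txt" && git add "résumé_example.txt"
pre-commit run forbid-non-ascii-filenames --all-files
# Expected outcome: the hook exits non-zero and lists the offending file name.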
17 changes: 17 additions & 0 deletions pre_commit_scripts/check_non_ascii_in_files.sh
@@ -0,0 +1,17 @@
#!/bin/bash

# Define the list of files to check, excluding .md, hidden, and a number of specific files:
files_to_check=$(git ls-files | grep -vE "^\." | grep -vE "\.md$" | grep -vE "^(tests/python_tests|tools/who_what_benchmark/(tests|whowhatbench))" | grep -v "tools/llm_bench/llm_bench_utils/ov_model_classes.py")

# Run git grep to find non-ASCII characters in the selected files and store the results:
results=$(LC_ALL=C git grep -n "[^ -~±�“”]" -- $files_to_check)

# Print the results:
if [ -n "$results" ]; then
    echo "Error: Non-ASCII characters found in files:"
    echo "$results"
    exit 1
else
    echo "Success: No non-ASCII characters found in files."
fi
exit 0
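Similarly, an illustrative way to run the content check, either through the hook id defined above or with a roughly equivalent manual git grep query (the pathspec excluding Markdown is an assumption; the script itself applies more exclusions and allows a few extra characters):

# Run only the file-content check:
pre-commit run forbid-non-ascii-in-files --all-files
# Roughly equivalent manual query over tracked files, excluding Markdown:
LC_ALL=C git grep -n "[^ -~]" -- ':!*.md'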
2 changes: 1 addition & 1 deletion requirements-build.txt
@@ -1,3 +1,3 @@
cmake~=3.23.0; platform_system != 'Darwin' or platform_machine == 'x86_64'
cmake~=3.24.0; platform_system == 'Darwin' and platform_machine == 'arm64'
pybind11-stubgen==2.5.1
pybind11-stubgen==2.5.1
2 changes: 1 addition & 1 deletion samples/cpp/image_generation/CMakeLists.txt
@@ -107,4 +107,4 @@ set_target_properties(inpainting PROPERTIES
install(TARGETS inpainting
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
EXCLUDE_FROM_ALL)
2 changes: 1 addition & 1 deletion samples/cpp/text_generation/CMakeLists.txt
@@ -58,4 +58,4 @@ set_target_properties(benchmark_genai PROPERTIES
install(TARGETS benchmark_genai
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
EXCLUDE_FROM_ALL)
2 changes: 1 addition & 1 deletion samples/cpp/text_generation/beam_search_causal_lm.cpp
@@ -19,7 +19,7 @@ int main(int argc, char* argv[]) try {
config.num_beams = 15;
config.diversity_penalty = 1.0f;
config.num_return_sequences = config.num_beams;

// Since the streamer is set, the results will
// be printed each time a new token is generated.
auto beams = pipe.generate(prompts, config);
6 changes: 3 additions & 3 deletions samples/cpp/text_generation/chat_sample.cpp
@@ -12,14 +12,14 @@ int main(int argc, char* argv[]) try {

std::string device = "CPU"; // GPU, NPU can be used as well
ov::genai::LLMPipeline pipe(models_path, device);

ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
std::function<bool(std::string)> streamer = [](std::string word) {
std::function<bool(std::string)> streamer = [](std::string word) {
std::cout << word << std::flush;
// Return flag corresponds whether generation should be stopped.
// false means continue generation.
return false;
return false;
};

pipe.start_chat();
2 changes: 1 addition & 1 deletion samples/cpp/text_generation/encrypted_model_causal_lm.cpp
@@ -41,7 +41,7 @@ int main(int argc, char* argv[]) try {

auto [model_str, model_weights] = decrypt_model(models_path + "/openvino_model.xml", models_path + "/openvino_model.bin");
ov::genai::Tokenizer tokenizer = decrypt_tokenizer(models_path);

ov::genai::LLMPipeline pipe(model_str, model_weights, tokenizer, device);

std::string result = pipe.generate(prompt, ov::genai::max_new_tokens(100));
2 changes: 1 addition & 1 deletion samples/cpp/text_generation/prompt_lookup_decoding_lm.cpp
@@ -19,7 +19,7 @@ int main(int argc, char* argv[]) try {

std::string model_path = argv[1];
std::string prompt = argv[2];

std::string device = "CPU";

ov::genai::LLMPipeline pipe(
2 changes: 1 addition & 1 deletion samples/cpp/visual_language_chat/CMakeLists.txt
@@ -42,4 +42,4 @@ set_target_properties(benchmark_vlm PROPERTIES
install(TARGETS benchmark_vlm
RUNTIME DESTINATION samples_bin/
COMPONENT samples_bin
EXCLUDE_FROM_ALL)
EXCLUDE_FROM_ALL)
6 changes: 3 additions & 3 deletions samples/cpp/visual_language_chat/benchmark_vlm.cpp
@@ -42,15 +42,15 @@ int main(int argc, char* argv[]) try {
size_t num_warmup = result["num_warmup"].as<size_t>();
size_t num_iter = result["num_iter"].as<size_t>();
ov::Tensor image = utils::load_image(image_path);

ov::genai::GenerationConfig config;
config.max_new_tokens = result["max_new_tokens"].as<size_t>();

ov::genai::VLMPipeline pipe(models_path, device);

for (size_t i = 0; i < num_warmup; i++)
pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config));

auto res = pipe.generate(prompt, ov::genai::image(image), ov::genai::generation_config(config));
auto metrics = res.perf_metrics;
for (size_t i = 0; i < num_iter - 1; i++) {
2 changes: 1 addition & 1 deletion samples/export-requirements.txt
@@ -10,4 +10,4 @@ diffusers==0.32.2 # For image generation pipelines
timm==1.0.14 # For exporting InternVL2
torchvision # For visual language models
transformers>=4.43 # For Whisper
hf_transfer # for faster models download, should used with env var HF_HUB_ENABLE_HF_TRANSFER=1
hf_transfer # for faster models download, should used with env var HF_HUB_ENABLE_HF_TRANSFER=1
2 changes: 1 addition & 1 deletion samples/python/image_generation/text2image.py
@@ -29,4 +29,4 @@ def main():


if '__main__' == __name__:
main()
main()
12 changes: 6 additions & 6 deletions samples/python/text_generation/benchmark_genai.py
@@ -12,31 +12,31 @@ def main():
parser.add_argument("-n", "--num_iter", type=int, default=2, help="Number of iterations")
parser.add_argument("-mt", "--max_new_tokens", type=int, default=20, help="Maximal number of new tokens")
parser.add_argument("-d", "--device", type=str, default="CPU", help="Device")

args = parser.parse_args()

# Perf metrics is stored in DecodedResults.
# Perf metrics is stored in DecodedResults.
# In order to get DecodedResults instead of a string input should be a list.
prompt = [args.prompt]
models_path = args.model
device = args.device
num_warmup = args.num_warmup
num_iter = args.num_iter

config = ov_genai.GenerationConfig()
config.max_new_tokens = args.max_new_tokens

pipe = ov_genai.LLMPipeline(models_path, device)

for _ in range(num_warmup):
pipe.generate(prompt, config)

res = pipe.generate(prompt, config)
perf_metrics = res.perf_metrics
for _ in range(num_iter - 1):
res = pipe.generate(prompt, config)
perf_metrics += res.perf_metrics

print(f"Load time: {perf_metrics.get_load_time():.2f} ms")
print(f"Generate time: {perf_metrics.get_generate_duration().mean:.2f} ± {perf_metrics.get_generate_duration().std:.2f} ms")
print(f"Tokenization time: {perf_metrics.get_tokenization_duration().mean:.2f} ± {perf_metrics.get_tokenization_duration().std:.2f} ms")
26 changes: 13 additions & 13 deletions samples/python/text_generation/multinomial_causal_lm.py
@@ -11,18 +11,18 @@
class IterableStreamer(openvino_genai.StreamerBase):
"""
A custom streamer class for handling token streaming and detokenization with buffering.
Attributes:
tokenizer (Tokenizer): The tokenizer used for encoding and decoding tokens.
tokens_cache (list): A buffer to accumulate tokens for detokenization.
text_queue (Queue): A synchronized queue for storing decoded text chunks.
print_len (int): The length of the printed text to manage incremental decoding.
"""

def __init__(self, tokenizer):
"""
Initializes the IterableStreamer with the given tokenizer.
Args:
tokenizer (Tokenizer): The tokenizer to use for encoding and decoding tokens.
"""
@@ -38,35 +38,35 @@ def __iter__(self):
Returns the iterator object itself.
"""
return self

def __next__(self):
"""
Returns the next value from the text queue.
Returns:
str: The next decoded text chunk.
Raises:
StopIteration: If there are no more elements in the queue.
"""
value = self.text_queue.get() # get() will be blocked until a token is available.
if value is None:
raise StopIteration
return value

def get_stop_flag(self):
"""
Checks whether the generation process should be stopped.
Returns:
bool: Always returns False in this implementation.
"""
return False

def put_word(self, word: str):
"""
Puts a word into the text queue.
Args:
word (str): The word to put into the queue.
"""
Expand All @@ -75,10 +75,10 @@ def put_word(self, word: str):
def put(self, token_id: int) -> bool:
"""
Processes a token and manages the decoding buffer. Adds decoded text to the queue.
Args:
token_id (int): The token_id to process.
Returns:
bool: True if generation should be stopped, False otherwise.
"""
@@ -168,7 +168,7 @@ def token_printer():
config.top_p = 0.9
config.top_k = 30

# Since the streamer is set, the results will be printed
# Since the streamer is set, the results will be printed
# every time a new token is generated and put into the streamer queue.
pipe.generate(args.prompt, config, text_print_streamer)
printer_thread.join()
12 changes: 6 additions & 6 deletions samples/python/text_generation/prompt_lookup_decoding_lm.py
@@ -5,10 +5,10 @@
import argparse
import openvino_genai

def streamer(subword):
print(subword, end='', flush=True)
# Return flag corresponds whether generation should be stopped.
# False means continue generation.
def streamer(subword):
print(subword, end='', flush=True)
# Return flag corresponds whether generation should be stopped.
# False means continue generation.
return False

def main():
@@ -20,15 +20,15 @@ def main():
device = 'CPU'

pipe = openvino_genai.LLMPipeline(args.model_dir, device, prompt_lookup=True)

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100
# add parameter to enable prompt lookup decoding to generate `num_assistant_tokens` candidates per iteration
config.num_assistant_tokens = 5
# Define max_ngram_size
config.max_ngram_size = 3

# Since the streamer is set, the results will be printed
# Since the streamer is set, the results will be printed
# every time a new token is generated and put into the streamer queue.
pipe.generate(args.prompt, config, streamer)

6 changes: 3 additions & 3 deletions samples/python/text_generation/speculative_decoding_lm.py
@@ -8,7 +8,7 @@

def streamer(subword):
print(subword, end='', flush=True)
# Return flag corresponds whether generation should be stopped.
# Return flag corresponds whether generation should be stopped.
# False means continue generation.
return False

@@ -27,7 +27,7 @@ def main():
draft_model = openvino_genai.draft_model(args.draft_model_dir, draft_device)

pipe = openvino_genai.LLMPipeline(args.model_dir, main_device, draft_model=draft_model)

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100
# Speculative decoding generation parameters like `num_assistant_tokens` and `assistant_confidence_threshold` are mutually excluded
@@ -36,7 +36,7 @@
# add parameter to enable speculative decoding to generate candidates by draft_model while candidate probability is higher than `assistant_confidence_threshold`
# config.assistant_confidence_threshold = 0.4

# Since the streamer is set, the results will be printed
# Since the streamer is set, the results will be printed
# every time a new token is generated and put into the streamer queue.
pipe.generate(args.prompt, config, streamer)

4 changes: 2 additions & 2 deletions samples/python/whisper_speech_recognition/recorder.py
@@ -15,7 +15,7 @@
sample_format = pyaudio.paInt16 # 16 bits per sample
channels = 1
fs = 16000 # Record at 16k samples per second
seconds = 5
seconds = 5
filename = "output.wav"

p = pyaudio.PyAudio() # Create an interface to PortAudio
@@ -34,7 +34,7 @@
data = stream.read(chunk)
frames.append(data)

# Stop and close the stream
# Stop and close the stream
stream.stop_stream()
stream.close()
# Terminate the PortAudio interface
@@ -95,7 +95,7 @@ class OPENVINO_GENAI_EXPORTS ContinuousBatchingPipeline {

/**
* @brief Constructs a ContinuousBatchingPipeline from already existing model and tokenizer.
*
*
* This constructor allows for the creation of a ContinuousBatchingPipeline using an existing model
* represented as a string and a weights tensor, along with a manually initialized tokenizer.
* This is useful when the model and tokenizer are already loaded or created in memory and do not

0 comments on commit 33f87c1