From 385c0d8efeaa0715966fe0f4bee4ca86fce0a971 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@gmail.com>
Date: Fri, 13 Dec 2024 07:28:27 +0200
Subject: [PATCH] output with colorization

---
 credsweeper/__main__.py                       |  2 ++
 credsweeper/app.py                            | 19 +++++++++++
 credsweeper/credentials/line_data.py          | 34 +++++++++++++++++++
 .../file_handler/diff_content_provider.py     |  2 +-
 docs/source/guide.rst                         |  3 +-
 pyproject.toml                                |  1 +
 requirements.txt                              |  1 +
 tests/test_app.py                             | 15 ++++++++
 tests/test_main.py                            |  5 +--
 9 files changed, 78 insertions(+), 4 deletions(-)

diff --git a/credsweeper/__main__.py b/credsweeper/__main__.py
index f025e5d51..b607a40a5 100644
--- a/credsweeper/__main__.py
+++ b/credsweeper/__main__.py
@@ -224,6 +224,7 @@ def get_arguments() -> Namespace:
                         const="output.xlsx",
                         dest="xlsx_filename",
                         metavar="PATH")
+    parser.add_argument("--color", "-C", help="print results with colorization", action="store_const", const=True)
     parser.add_argument("--hashed",
                         help="line, variable, value will be hashed in output",
                         action="store_const",
@@ -299,6 +300,7 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
                                   api_validation=args.api_validation,
                                   json_filename=json_filename,
                                   xlsx_filename=xlsx_filename,
+                                  color=args.color,
                                   hashed=args.hashed,
                                   subtext=args.subtext,
                                   sort_output=args.sort_output,
diff --git a/credsweeper/app.py b/credsweeper/app.py
index f60b28394..9373529b7 100644
--- a/credsweeper/app.py
+++ b/credsweeper/app.py
@@ -5,6 +5,8 @@
 from typing import Any, List, Optional, Union, Dict, Sequence, Tuple
 
 import pandas as pd
+from colorama import Fore
+from colorama.ansi import AnsiStyle, Style
 
 # Directory of credsweeper sources MUST be placed before imports to avoid circular import error
 APP_PATH = Path(__file__).resolve().parent
@@ -42,6 +44,7 @@ def __init__(self,
                  api_validation: bool = False,
                  json_filename: Union[None, str, Path] = None,
                  xlsx_filename: Union[None, str, Path] = None,
+                 color: bool = False,
                  hashed: bool = False,
                  subtext: bool = False,
                  sort_output: bool = False,
@@ -73,6 +76,7 @@ def __init__(self,
                 to json
             xlsx_filename: optional string variable, path to save result
                 to xlsx
+            color: print results to stdout with colorization
             hashed: use hash of line, value and variable instead plain text
             subtext: use subtext of line near variable-value like it performed in ML
             use_filters: boolean variable, specifying the need of rule filters
@@ -112,6 +116,7 @@ def __init__(self,
         self.credential_manager = CredentialManager()
         self.json_filename: Union[None, str, Path] = json_filename
         self.xlsx_filename: Union[None, str, Path] = xlsx_filename
+        self.color = color
         self.hashed = hashed
         self.subtext = subtext
         self.sort_output = sort_output
@@ -427,6 +432,20 @@ def export_results(self) -> None:
             df = pd.DataFrame(data=data_list)
             df.to_excel(self.xlsx_filename, index=False)
 
+        if self.color:
+            is_exported = True
+            for credential in credentials:
+                for line_data in credential.line_data_list:
+                    print(Style.BRIGHT + credential.rule_name \
+                          + f" {line_data.info or line_data.path}:{line_data.line_num}"
+                          + Style.RESET_ALL)
+                    if self.hashed:
+                        print(Fore.LIGHTGREEN_EX \
+                              + line_data.get_hash_or_subtext(line_data.line, self.hashed) \
+                              + Style.RESET_ALL)
+                    else:
+                        print(f"{line_data.get_colored_line(self.subtext)}")
+
         if is_exported is False:
             for credential in credentials:
                 print(credential.to_str(hashed=self.hashed, subtext=self.subtext))
diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py
index 92f801484..761724d8f 100644
--- a/credsweeper/credentials/line_data.py
+++ b/credsweeper/credentials/line_data.py
@@ -5,6 +5,8 @@
 from functools import cached_property
 from typing import Any, Dict, Optional, Tuple
 
+from colorama import Fore, Style, Back
+
 from credsweeper.common.constants import MAX_LINE_LENGTH, UTF_8, StartEnd, ML_HUNK
 from credsweeper.config import Config
 from credsweeper.utils import Util
@@ -414,3 +416,35 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:
         }
         reported_output = {k: v for k, v in full_output.items() if k in self.config.line_data_output}
         return reported_output
+
+    def get_colored_line(self, subtext: bool = False) -> str:
+        """Wrap the value, separator and variable of the line in ANSI color codes.
+
+        Args:
+            subtext: cut the colored line with Util.subtext near the value instead of returning it whole
+
+        Return:
+            colored line; with subtext a style reset is appended to the cut text
+
+        """
+        # Spans are colored from right to left, so every insertion lands after the spans
+        # that are still pending and their offsets in the original line stay valid.
+        # At least, the value must be present.
+        line = self.line[:self.value_start] \
+               + Fore.LIGHTYELLOW_EX \
+               + self.line[self.value_start:self.value_end] \
+               + Style.RESET_ALL \
+               + self.line[self.value_end:]
+        # leftmost offset that is already colored - optional spans must end before it
+        bound = self.value_start
+        for start, end, color in ((self.separator_start, self.separator_end, Fore.LIGHTGREEN_EX),
+                                  (self.variable_start, self.variable_end, Fore.LIGHTBLUE_EX)):
+            # separator or variable may be missing; an invalid or overlapping span is skipped
+            if 0 <= start < end <= bound:
+                line = line[:start] + color + line[start:end] + Style.RESET_ALL + line[end:]
+                bound = start
+        if not subtext:
+            return line
+        # display part of the text, centered around the start of the value
+        line = Util.subtext(line, self.value_start + len(line) - len(self.line), ML_HUNK)
+        return f"{line}{Style.RESET_ALL}"  # put style reset at the end as a fallback
diff --git a/credsweeper/file_handler/diff_content_provider.py b/credsweeper/file_handler/diff_content_provider.py
index f4dc1cb93..9669cdbde 100644
--- a/credsweeper/file_handler/diff_content_provider.py
+++ b/credsweeper/file_handler/diff_content_provider.py
@@ -31,7 +31,7 @@ def __init__(
             file_path: str,  #
             change_type: DiffRowType,  #
             diff: List[DiffDict]) -> None:
-        super().__init__(file_path=file_path, info=change_type.value)
+        super().__init__(file_path=file_path, info=f"{file_path}:{change_type.value}")
         self.change_type = change_type
         self.diff = diff
 
diff --git a/docs/source/guide.rst b/docs/source/guide.rst
index ebebbc67f..62c41d7bf 100644
--- a/docs/source/guide.rst
+++ b/docs/source/guide.rst
@@ -18,7 +18,7 @@ Get all argument list:
                              [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR]
                              [--ml_batch_size POSITIVE_INT] [--ml_config PATH] [--ml_model PATH] [--ml_providers STR]
                              [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]]
-                             [--save-xlsx [PATH]] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL]
+                             [--save-xlsx [PATH]] [--color] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL]
                              [--size_limit SIZE_LIMIT]
                              [--banner] [--version]
     options:
@@ -54,6 +54,7 @@ Get all argument list:
       --skip_ignored        parse .gitignore files and skip credentials from ignored objects
       --save-json [PATH]    save result to json file (default: output.json)
       --save-xlsx [PATH]    save result to xlsx file (default: output.xlsx)
+      --color, -C           print results with colorization
       --hashed              line, variable, value will be hashed in output
       --subtext             line text will be stripped in 160 symbols but value and variable are kept
       --sort                enable output sorting
diff --git a/pyproject.toml b/pyproject.toml
index f7adfec16..9898d0fcf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ authors = [
 dependencies = [
     "base58",
     "beautifulsoup4>=4.11.0",
+    "colorama",
     "cryptography",
     "GitPython",
     "google_auth_oauthlib",
diff --git a/requirements.txt b/requirements.txt
index a503eb483..cc23fc162 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,7 @@ hatchling==1.26.3
 # Common requirements
 base58==2.1.1
 beautifulsoup4==4.12.3
+colorama==0.4.6
 cryptography==43.0.3
 GitPython==3.1.43
 google-auth-oauthlib==1.2.1
diff --git a/tests/test_app.py b/tests/test_app.py
index 1f1654936..b5b4c153c 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -212,6 +212,20 @@ def test_it_works_with_api_p(self) -> None:
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
+    def test_it_works_with_patch_color_p(self) -> None:
+        target_path = str(SAMPLES_PATH / "password.patch")
+        _stdout, _stderr = self._m_credsweeper(["--diff_path", target_path, "--log", "silence", "--color"])
+        output = " ".join(_stdout.split()[:-1])  # collapse whitespace, drop the last token (after "Time Elapsed:")
+        expected = """
+                    \x1b[1mPassword .changes/1.16.98.json:added:3\x1b[0m 
+                    "\x1b[94mpassword\x1b[0m"\x1b[92m:\x1b[0m "\x1b[93mdkajco1\x1b[0m"
+                    Added File Credentials: 1 Deleted File Credentials: 0 Time Elapsed:
+                    """
+        expected = " ".join(expected.split())  # collapse whitespace the same way as for the output
+        self.assertEqual(expected, output)
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
     def test_it_works_n(self) -> None:
         _stdout, _stderr = self._m_credsweeper([])
 
@@ -243,6 +257,7 @@ def test_it_works_n(self) -> None:
                    " [--skip_ignored]" \
                    " [--save-json [PATH]]" \
                    " [--save-xlsx [PATH]]" \
+                   " [--color]" \
                    " [--hashed]" \
                    " [--subtext]" \
                    " [--sort]" \
diff --git a/tests/test_main.py b/tests/test_main.py
index 7cfb7b25c..a077e0187 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -163,6 +163,7 @@ def test_main_path_p(self, mock_get_arguments) -> None:
                              diff_path=[str(target_path)],
                              json_filename=os.path.join(tmp_dir, f"{__name__}.json"),
                              xlsx_filename=None,
+                             color=False,
                              subtext=False,
                              hashed=False,
                              rule_path=None,
@@ -450,7 +451,7 @@ def test_tar_n(self) -> None:
 
     def test_aws_multi_p(self) -> None:
         content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "aws_multi.md"])
-        cred_sweeper = CredSweeper(ml_threshold=0)
+        cred_sweeper = CredSweeper(ml_threshold=0, color=True, hashed=True)
         cred_sweeper.run(content_provider=content_provider)
         for i in cred_sweeper.credential_manager.get_credentials():
             if "AWS Multi" == i.rule_name:
@@ -609,7 +610,7 @@ def test_yaml_n(self) -> None:
     def test_encoded_p(self) -> None:
         # test for finding credentials in ENCODED data
         content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "encoded_data"])
-        cred_sweeper = CredSweeper(depth=5, ml_threshold=0)
+        cred_sweeper = CredSweeper(depth=5, ml_threshold=0, color=True)
         cred_sweeper.run(content_provider=content_provider)
         found_credentials = cred_sweeper.credential_manager.get_credentials()
         self.assertEqual(2, len(found_credentials))