From 3938c7dda74f1767cee3973d214ad84af770c3f8 Mon Sep 17 00:00:00 2001
From: Roman Babenko <babenek@users.noreply.github.com>
Date: Fri, 13 Dec 2024 10:20:53 +0200
Subject: [PATCH] Credential colorization in stdout export (#632)

* output with colorization

* missed types lib, import optimization, extend test

* fix

* fix

* fix
---
 credsweeper/__main__.py                       |  2 +
 credsweeper/app.py                            | 13 +++++++
 credsweeper/credentials/line_data.py          | 37 +++++++++++++++++++
 credsweeper/deep_scanner/gzip_scanner.py      |  2 +-
 .../file_handler/diff_content_provider.py     |  2 +-
 credsweeper/filters/value_camel_case_check.py |  2 +-
 credsweeper/filters/value_file_path_check.py  |  2 +-
 credsweeper/ml_model/features/__init__.py     |  2 +-
 credsweeper/ml_model/ml_validator.py          |  2 +-
 docs/source/guide.rst                         |  3 +-
 pyproject.toml                                |  1 +
 requirements.txt                              |  2 +
 tests/test_app.py                             | 15 ++++++++
 tests/test_main.py                            |  5 ++-
 14 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/credsweeper/__main__.py b/credsweeper/__main__.py
index f025e5d51..b607a40a5 100644
--- a/credsweeper/__main__.py
+++ b/credsweeper/__main__.py
@@ -224,6 +224,7 @@ def get_arguments() -> Namespace:
                         const="output.xlsx",
                         dest="xlsx_filename",
                         metavar="PATH")
+    parser.add_argument("--color", "-C", help="print results with colorization", action="store_const", const=True)
     parser.add_argument("--hashed",
                         help="line, variable, value will be hashed in output",
                         action="store_const",
@@ -299,6 +300,7 @@ def scan(args: Namespace, content_provider: AbstractProvider, json_filename: Opt
                                   api_validation=args.api_validation,
                                   json_filename=json_filename,
                                   xlsx_filename=xlsx_filename,
+                                  color=args.color,
                                   hashed=args.hashed,
                                   subtext=args.subtext,
                                   sort_output=args.sort_output,
diff --git a/credsweeper/app.py b/credsweeper/app.py
index f60b28394..dfa8f3782 100644
--- a/credsweeper/app.py
+++ b/credsweeper/app.py
@@ -5,6 +5,7 @@
 from typing import Any, List, Optional, Union, Dict, Sequence, Tuple
 
 import pandas as pd
+from colorama import Style
 
 # Directory of credsweeper sources MUST be placed before imports to avoid circular import error
 APP_PATH = Path(__file__).resolve().parent
@@ -42,6 +43,7 @@ def __init__(self,
                  api_validation: bool = False,
                  json_filename: Union[None, str, Path] = None,
                  xlsx_filename: Union[None, str, Path] = None,
+                 color: bool = False,
                  hashed: bool = False,
                  subtext: bool = False,
                  sort_output: bool = False,
@@ -73,6 +75,7 @@ def __init__(self,
                 to json
             xlsx_filename: optional string variable, path to save result
                 to xlsx
+            color: print results to stdout with colorization
             hashed: use hash of line, value and variable instead plain text
             subtext: use subtext of line near variable-value like it performed in ML
             use_filters: boolean variable, specifying the need of rule filters
@@ -112,6 +115,7 @@ def __init__(self,
         self.credential_manager = CredentialManager()
         self.json_filename: Union[None, str, Path] = json_filename
         self.xlsx_filename: Union[None, str, Path] = xlsx_filename
+        self.color = color
         self.hashed = hashed
         self.subtext = subtext
         self.sort_output = sort_output
@@ -427,6 +431,15 @@ def export_results(self) -> None:
             df = pd.DataFrame(data=data_list)
             df.to_excel(self.xlsx_filename, index=False)
 
+        if self.color:
+            is_exported = True
+            for credential in credentials:
+                for line_data in credential.line_data_list:
+                    # bright style for the rule name, then info (or path if info is empty) and the line number
+                    print(Style.BRIGHT + credential.rule_name +
+                          f" {line_data.info or line_data.path}:{line_data.line_num}" + Style.RESET_ALL)
+                    print(line_data.get_colored_line(hashed=self.hashed, subtext=self.subtext))
+
         if is_exported is False:
             for credential in credentials:
                 print(credential.to_str(hashed=self.hashed, subtext=self.subtext))
diff --git a/credsweeper/credentials/line_data.py b/credsweeper/credentials/line_data.py
index 92f801484..7fba85e5d 100644
--- a/credsweeper/credentials/line_data.py
+++ b/credsweeper/credentials/line_data.py
@@ -5,6 +5,8 @@
 from functools import cached_property
 from typing import Any, Dict, Optional, Tuple
 
+from colorama import Fore, Style
+
 from credsweeper.common.constants import MAX_LINE_LENGTH, UTF_8, StartEnd, ML_HUNK
 from credsweeper.config import Config
 from credsweeper.utils import Util
@@ -414,3 +416,38 @@ def to_json(self, hashed: bool, subtext: bool) -> Dict:
         }
         reported_output = {k: v for k, v in full_output.items() if k in self.config.line_data_output}
         return reported_output
+
+    def get_colored_line(self, hashed: bool, subtext: bool = False) -> str:
+        """Returns the line with colorized variable, separator and value, or a colorized hash when hashed"""
+        if hashed:
+            # return colored hash
+            return Fore.LIGHTGREEN_EX \
+                + self.get_hash_or_subtext(self.line, hashed,
+                                           StartEnd(self.value_start, self.value_end) if subtext else None) \
+                + Style.RESET_ALL
+        # the value is always present, so it is colorized unconditionally
+        line = self.line[:self.value_start] \
+               + Fore.LIGHTYELLOW_EX \
+               + self.line[self.value_start:self.value_end] \
+               + Style.RESET_ALL \
+               + self.line[self.value_end:]  # noqa: E127
+        # separator may be missing
+        if 0 <= self.separator_start < self.separator_end <= self.value_start:
+            line = line[:self.separator_start] \
+                   + Fore.LIGHTGREEN_EX \
+                   + line[self.separator_start:self.separator_end] \
+                   + Style.RESET_ALL \
+                   + line[self.separator_end:]
+        # variable may be missing
+        if 0 <= self.separator_start \
+                and 0 <= self.variable_start < self.variable_end <= self.separator_end <= self.value_start \
+                or 0 <= self.variable_start < self.variable_end <= self.value_start:
+            line = line[:self.variable_start] \
+                   + Fore.LIGHTBLUE_EX \
+                   + line[self.variable_start:self.variable_end] \
+                   + Style.RESET_ALL \
+                   + line[self.variable_end:]
+        if subtext:
+            # display part of the text, centered around the start of the value, style reset at the end as a fallback
+            line = f"{Util.subtext(line, self.value_start + len(line) - len(self.line), ML_HUNK)}{Style.RESET_ALL}"
+        return line
diff --git a/credsweeper/deep_scanner/gzip_scanner.py b/credsweeper/deep_scanner/gzip_scanner.py
index 06e2321ca..1f8ec39ee 100644
--- a/credsweeper/deep_scanner/gzip_scanner.py
+++ b/credsweeper/deep_scanner/gzip_scanner.py
@@ -5,10 +5,10 @@
 from pathlib import Path
 from typing import List
 
-from credsweeper.utils import Util
 from credsweeper.credentials import Candidate
 from credsweeper.deep_scanner.abstract_scanner import AbstractScanner
 from credsweeper.file_handler.data_content_provider import DataContentProvider
+from credsweeper.utils import Util
 
 logger = logging.getLogger(__name__)
 
diff --git a/credsweeper/file_handler/diff_content_provider.py b/credsweeper/file_handler/diff_content_provider.py
index f4dc1cb93..9669cdbde 100644
--- a/credsweeper/file_handler/diff_content_provider.py
+++ b/credsweeper/file_handler/diff_content_provider.py
@@ -31,7 +31,7 @@ def __init__(
             file_path: str,  #
             change_type: DiffRowType,  #
             diff: List[DiffDict]) -> None:
-        super().__init__(file_path=file_path, info=change_type.value)
+        super().__init__(file_path=file_path, info=f"{file_path}:{change_type.value}")
         self.change_type = change_type
         self.diff = diff
 
diff --git a/credsweeper/filters/value_camel_case_check.py b/credsweeper/filters/value_camel_case_check.py
index 0fad1df90..7faa7d15e 100644
--- a/credsweeper/filters/value_camel_case_check.py
+++ b/credsweeper/filters/value_camel_case_check.py
@@ -1,7 +1,7 @@
 import re
 
-from credsweeper.config import Config
 from credsweeper.common import static_keyword_checklist
+from credsweeper.config import Config
 from credsweeper.credentials import LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
 from credsweeper.filters import Filter
diff --git a/credsweeper/filters/value_file_path_check.py b/credsweeper/filters/value_file_path_check.py
index b871547dd..81bd3dae4 100644
--- a/credsweeper/filters/value_file_path_check.py
+++ b/credsweeper/filters/value_file_path_check.py
@@ -1,5 +1,5 @@
-from credsweeper.common.constants import Chars
 from credsweeper.common import static_keyword_checklist
+from credsweeper.common.constants import Chars
 from credsweeper.config import Config
 from credsweeper.credentials import LineData
 from credsweeper.file_handler.analysis_target import AnalysisTarget
diff --git a/credsweeper/ml_model/features/__init__.py b/credsweeper/ml_model/features/__init__.py
index 95b705eca..480c3ecb4 100644
--- a/credsweeper/ml_model/features/__init__.py
+++ b/credsweeper/ml_model/features/__init__.py
@@ -3,9 +3,9 @@
 from credsweeper.ml_model.features.hartley_entropy import HartleyEntropy
 from credsweeper.ml_model.features.has_html_tag import HasHtmlTag
 from credsweeper.ml_model.features.is_secret_numeric import IsSecretNumeric
-from credsweeper.ml_model.features.search_in_attribute import SearchInAttribute
 from credsweeper.ml_model.features.reny_entropy import RenyiEntropy
 from credsweeper.ml_model.features.rule_name import RuleName
+from credsweeper.ml_model.features.search_in_attribute import SearchInAttribute
 from credsweeper.ml_model.features.shannon_entropy import ShannonEntropy
 from credsweeper.ml_model.features.word_in_line import WordInLine
 from credsweeper.ml_model.features.word_in_path import WordInPath
diff --git a/credsweeper/ml_model/ml_validator.py b/credsweeper/ml_model/ml_validator.py
index 589a4bb60..1617111b1 100644
--- a/credsweeper/ml_model/ml_validator.py
+++ b/credsweeper/ml_model/ml_validator.py
@@ -7,9 +7,9 @@
 import numpy as np
 import onnxruntime as ort
 
+import credsweeper.ml_model.features as features
 from credsweeper.common.constants import ThresholdPreset, ML_HUNK
 from credsweeper.credentials import Candidate, CandidateKey
-import credsweeper.ml_model.features as features
 from credsweeper.utils import Util
 
 logger = logging.getLogger(__name__)
diff --git a/docs/source/guide.rst b/docs/source/guide.rst
index ebebbc67f..62c41d7bf 100644
--- a/docs/source/guide.rst
+++ b/docs/source/guide.rst
@@ -18,7 +18,7 @@ Get all argument list:
                              [--find-by-ext] [--depth POSITIVE_INT] [--no-filters] [--doc] [--ml_threshold FLOAT_OR_STR]
                              [--ml_batch_size POSITIVE_INT] [--ml_config PATH] [--ml_model PATH] [--ml_providers STR]
                              [--api_validation] [--jobs POSITIVE_INT] [--skip_ignored] [--save-json [PATH]]
-                             [--save-xlsx [PATH]] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL]
+                             [--save-xlsx [PATH]] [--color] [--hashed] [--subtext] [--sort] [--log LOG_LEVEL]
                              [--size_limit SIZE_LIMIT]
                              [--banner] [--version]
     options:
@@ -54,6 +54,7 @@ Get all argument list:
       --skip_ignored        parse .gitignore files and skip credentials from ignored objects
       --save-json [PATH]    save result to json file (default: output.json)
       --save-xlsx [PATH]    save result to xlsx file (default: output.xlsx)
+      --color, -C           print results with colorization
       --hashed              line, variable, value will be hashed in output
       --subtext             line text will be stripped in 160 symbols but value and variable are kept
       --sort                enable output sorting
diff --git a/pyproject.toml b/pyproject.toml
index f7adfec16..9898d0fcf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -7,6 +7,7 @@ authors = [
 dependencies = [
     "base58",
     "beautifulsoup4>=4.11.0",
+    "colorama",
     "cryptography",
     "GitPython",
     "google_auth_oauthlib",
diff --git a/requirements.txt b/requirements.txt
index a503eb483..379b364f8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,6 +8,7 @@ hatchling==1.26.3
 # Common requirements
 base58==2.1.1
 beautifulsoup4==4.12.3
+colorama==0.4.6
 cryptography==43.0.3
 GitPython==3.1.43
 google-auth-oauthlib==1.2.1
@@ -45,6 +46,7 @@ pytest-cov
 pytest-html
 pytest-random-order
 types-beautifulsoup4
+types-colorama
 types-PyYAML
 types-requests
 types-oauthlib
diff --git a/tests/test_app.py b/tests/test_app.py
index 1f1654936..b5b4c153c 100644
--- a/tests/test_app.py
+++ b/tests/test_app.py
@@ -212,6 +212,20 @@ def test_it_works_with_api_p(self) -> None:
 
     # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
 
+    def test_it_works_with_patch_color_p(self) -> None:
+        target_path = str(SAMPLES_PATH / "password.patch")
+        _stdout, _stderr = self._m_credsweeper(["--diff_path", target_path, "--log", "silence", "--color"])
+        output = " ".join(_stdout.split()[:-1])
+        expected = """
+                    \x1b[1mPassword .changes/1.16.98.json:added:3\x1b[0m 
+                    "\x1b[94mpassword\x1b[0m"\x1b[92m:\x1b[0m "\x1b[93mdkajco1\x1b[0m"
+                    Added File Credentials: 1 Deleted File Credentials: 0 Time Elapsed:
+                    """
+        expected = " ".join(expected.split())
+        self.assertEqual(expected, output)
+
+    # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
+
     def test_it_works_n(self) -> None:
         _stdout, _stderr = self._m_credsweeper([])
 
@@ -243,6 +257,7 @@ def test_it_works_n(self) -> None:
                    " [--skip_ignored]" \
                    " [--save-json [PATH]]" \
                    " [--save-xlsx [PATH]]" \
+                   " [--color]" \
                    " [--hashed]" \
                    " [--subtext]" \
                    " [--sort]" \
diff --git a/tests/test_main.py b/tests/test_main.py
index 7cfb7b25c..8961f59ee 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -163,6 +163,7 @@ def test_main_path_p(self, mock_get_arguments) -> None:
                              diff_path=[str(target_path)],
                              json_filename=os.path.join(tmp_dir, f"{__name__}.json"),
                              xlsx_filename=None,
+                             color=False,
                              subtext=False,
                              hashed=False,
                              rule_path=None,
@@ -450,7 +451,7 @@ def test_tar_n(self) -> None:
 
     def test_aws_multi_p(self) -> None:
         content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "aws_multi.md"])
-        cred_sweeper = CredSweeper(ml_threshold=0)
+        cred_sweeper = CredSweeper(ml_threshold=0, color=True, hashed=True)
         cred_sweeper.run(content_provider=content_provider)
         for i in cred_sweeper.credential_manager.get_credentials():
             if "AWS Multi" == i.rule_name:
@@ -609,7 +610,7 @@ def test_yaml_n(self) -> None:
     def test_encoded_p(self) -> None:
         # test for finding credentials in ENCODED data
         content_provider: AbstractProvider = FilesProvider([SAMPLES_PATH / "encoded_data"])
-        cred_sweeper = CredSweeper(depth=5, ml_threshold=0)
+        cred_sweeper = CredSweeper(depth=5, ml_threshold=0, color=True, subtext=True)
         cred_sweeper.run(content_provider=content_provider)
         found_credentials = cred_sweeper.credential_manager.get_credentials()
         self.assertEqual(2, len(found_credentials))