[kerastuner] 2024-11-11T16:32:30+02:00

Samsung · Nov 11, 2024 · 4f66b07 · 4f66b07
1 parent 5cc9685
commit 4f66b07
Show file tree

Hide file tree

Showing 22 changed files with 1,533 additions and 605 deletions.
diff --git a/.ci/benchmark.txt b/.ci/benchmark.txt
@@ -226,19 +226,19 @@ TOTAL:                  10232      16342283        12255        49690         51
 credsweeper result_cnt : 11521, lost_cnt : 0, true_cnt : 11342, false_cnt : 179
 Rules                             Positives    Negatives    Templates    Reported     TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  -----  ----  -----  ----  --------  --------  --------  --------  --------  --------
-API                                     130         3166          188         125    123     2   3352     7  0.000596  0.053846  0.997417  0.984000  0.946154  0.964706
+API                                     130         3166          188         124    122     2   3352     8  0.000596  0.061538  0.997130  0.983871  0.938462  0.960630
 AWS Client ID                           168           21            0         160    160     0     21     8  0.000000  0.047619  0.957672  1.000000  0.952381  0.975610
 AWS Multi                                82           10            0          88     82     5      5     0  0.500000  0.000000  0.945652  0.942529  1.000000  0.970414
 AWS S3 Bucket                            67           23            0          92     67    23      0     0  1.000000  0.000000  0.744444  0.744444  1.000000  0.853503
 Atlassian Old PAT token                  27          308            3          12      3     8    303    24  0.025723  0.888889  0.905325  0.272727  0.111111  0.157895
-Auth                                    414         2739           82         390    387     3   2818    27  0.001063  0.065217  0.990726  0.992308  0.934783  0.962687
+Auth                                    414         2739           82         391    385     6   2815    29  0.002127  0.070048  0.989181  0.984655  0.929952  0.956522
 Azure Access Token                       19            0            0          12     12     0      0     7            0.368421  0.631579  1.000000  0.631579  0.774194
 BASE64 Private Key                        7            4            0           7      7     0      4     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 BASE64 encoded PEM Private Key            7            0            0           5      5     0      0     2            0.285714  0.714286  1.000000  0.714286  0.833333
 Bitbucket Client ID                     143         2095            9          48     28    19   2085   115  0.009030  0.804196  0.940365  0.595745  0.195804  0.294737
 Bitbucket Client Secret                 301          807           10          40     29    11    806   272  0.013464  0.903654  0.746869  0.725000  0.096346  0.170088
-CMD ConvertTo-SecureString               13            4            0          13     13     0      4     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-CMD Password                             21          128            6          18     18     0    134     3  0.000000  0.142857  0.980645  1.000000  0.857143  0.923077
+CMD ConvertTo-SecureString               13            4            0          12     12     0      4     1  0.000000  0.076923  0.941176  1.000000  0.923077  0.960000
+CMD Password                             21          128            6          20     20     0    134     1  0.000000  0.047619  0.993548  1.000000  0.952381  0.975610
 CMD Secret                                1            1            0           1      1     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 CMD Token                                 6            0            0           6      6     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
 Certificate                              24          471            0          20     20     0    471     4  0.000000  0.166667  0.991919  1.000000  0.833333  0.909091
@@ -257,18 +257,18 @@ Grafana Provisioned API Key              22            1            0
 JSON Web Token                          170           61            0         131    131     0     61    39  0.000000  0.229412  0.831169  1.000000  0.770588  0.870432
 Jira / Confluence PAT token               0            4            0                  0     0      4     0  0.000000            1.000000
 Jira 2FA                                 15            6            1          12     12     0      7     3  0.000000  0.200000  0.863636  1.000000  0.800000  0.888889
-Key                                    3909        15717          485        3944   3893    51  16151    16  0.003148  0.004093  0.996668  0.987069  0.995907  0.991468
-Nonce                                    91           49            0          89     88     1     48     3  0.020408  0.032967  0.971429  0.988764  0.967033  0.977778
+Key                                    3909        15717          485        3930   3889    41  16161    20  0.002531  0.005116  0.996967  0.989567  0.994884  0.992218
+Nonce                                    91           49            0          88     88     0     49     3  0.000000  0.032967  0.978571  1.000000  0.967033  0.983240
 Other                                     8         7445            1                  0     0   7446     8  0.000000  1.000000  0.998927            0.000000
 PEM Private Key                        1019         1483            0        1023   1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1869         7535         2680        1776   1758    18  10197   111  0.001762  0.059390  0.989325  0.989865  0.940610  0.964609
+Password                               1869         7535         2680        1778   1763    15  10200   106  0.001468  0.056715  0.989987  0.991564  0.943285  0.966822
 Salt                                     47           76            1          44     44     0     77     3  0.000000  0.063830  0.975806  1.000000  0.936170  0.967033
-Secret                                 1297         1576          802        1288   1283     5   2373    14  0.002103  0.010794  0.994830  0.996118  0.989206  0.992650
+Secret                                 1297         1576          802        1291   1286     5   2373    11  0.002103  0.008481  0.995646  0.996127  0.991519  0.993818
 Seed                                      1            6            0                  0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4      4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 Tencent WeChat API App ID                 6            0            0           6      6     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
-Token                                   643         4170          454         616    614     2   4622    29  0.000433  0.045101  0.994114  0.996753  0.954899  0.975377
+Token                                   643         4170          454         619    617     2   4622    26  0.000433  0.040435  0.994684  0.996769  0.959565  0.977813
 Twilio Credentials                       30           39            0          30     30     0     39     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
-URL Credentials                         210          156          216         205    205     0    372     5  0.000000  0.023810  0.991409  1.000000  0.976190  0.987952
+URL Credentials                         210          156          216         209    207     2    370     3  0.005376  0.014286  0.991409  0.990431  0.985714  0.988067
 UUID                                   1069          265            0        1068   1067     1    264     2  0.003774  0.001871  0.997751  0.999064  0.998129  0.998596
                                       12255        49690         5102       11528  11342   179  49511   913  0.003602  0.074500  0.982371  0.984463  0.925500  0.954071
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
@@ -41,7 +41,7 @@ jobs:
       if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
       run: |
         md5sum --binary credsweeper/ml_model/ml_config.json | grep 49c4352ae9ec82ad432d49d7e51c27f1
-        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep ff66e97c446d0f2bbd8d37b7dfff7361
+        md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 4abed2705dcb5565cfd3c80580d17f2a
 
     # # # line ending
 

diff --git a/credsweeper/common/constants.py b/credsweeper/common/constants.py
@@ -1,3 +1,4 @@
+import string
 import typing
 from enum import Enum
 from typing import Optional, Union
@@ -79,21 +80,27 @@ class Chars(Enum):
     """
 
     # set of characters, hexadecimal numeral system (Base16). Upper- and lowercase
-    HEX_CHARS = "0123456789ABCDEFabcdef"
+    HEX_CHARS = string.digits + "ABCDEFabcdef"
     # set of characters, hexadecimal numeral system (Base16). Uppercase
-    BASE16UPPER = "0123456789ABCDEF"
+    BASE16UPPER = string.digits + "ABCDEF"
     # set of characters, hexadecimal numeral system (Base16). Lowercase
-    BASE16LOWER = "0123456789abcdef"
+    BASE16LOWER = string.digits + "abcdef"
     # set of 32 characters, used in Base32 encoding
-    BASE32_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"
+    BASE32_CHARS = string.ascii_uppercase + "234567"
     # set of 36 characters, used in Base36 encoding
-    BASE36_CHARS = "abcdefghijklmnopqrstuvwxyz1234567890"
-    # standard base64 with padding sign
-    BASE64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
+    BASE36_CHARS = string.digits + string.ascii_lowercase
+    # base62 set https://en.wikipedia.org/wiki/Base62
+    BASE62_CHARS = string.digits +  string.ascii_letters
     # URL- and filename-safe standard
-    BASE64URL_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
+    BASE64URL_CHARS = BASE62_CHARS + "-_"
     # standard base64
-    BASE64STD_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
+    BASE64STD_CHARS =  BASE62_CHARS + "+/"
+    # standard base64 with padding sign
+    BASE64_CHARS = BASE64STD_CHARS + "="
+    # except whitespaces
+    ASCII_VISIBLE = BASE62_CHARS + string.punctuation
+    # all printable symbols
+    ASCII_PRINTABLE = string.printable
 
 
 ENTROPY_LIMIT_BASE64 = 4.5

diff --git a/credsweeper/deep_scanner/pdf_scanner.py b/credsweeper/deep_scanner/pdf_scanner.py
@@ -38,7 +38,7 @@ def data_scan(
                                 pdf_content_provider = DataContentProvider(
                                     data=element_text.encode(),
                                     file_path=data_provider.file_path,
-                                    file_type=".xml",
+                                    file_type=data_provider.file_type,
                                     info=f"{data_provider.info}|PDF:{page.pageid}")
                                 new_limit = recursive_limit_size - len(pdf_content_provider.data)
                                 element_candidates = self.recursive_scan(pdf_content_provider, depth, new_limit)

diff --git a/credsweeper/filters/value_file_path_check.py b/credsweeper/filters/value_file_path_check.py
@@ -1,3 +1,5 @@
+import re
+
 from credsweeper.common.constants import Chars
 from credsweeper.common import static_keyword_checklist
 from credsweeper.config import Config
@@ -14,8 +16,8 @@ class ValueFilePathCheck(Filter):
     and do not have any special characters ( !$@`&*()+)
     """
     base64_possible_set = set(Chars.BASE64_CHARS.value) | set(Chars.BASE64URL_CHARS.value)
-    unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~^"
-    unusual_linux_symbols_in_path = unusual_windows_symbols_in_path + ":\\"
+    unusual_windows_symbols_in_path = "\t\n\r!$@`&*(){}<>+=;,~^"
+    unusual_linux_symbols_in_path = "\t\n\r!@`&*<>+=;,~^:\\"
 
     def __init__(self, config: Config = None) -> None:
         pass

diff --git a/credsweeper/ml_model/features/__init__.py b/credsweeper/ml_model/features/__init__.py
@@ -1,12 +1,11 @@
-from credsweeper.ml_model.features.char_set import CharSet
+from credsweeper.ml_model.features.entropy_evaluation import EntropyEvaluation
 from credsweeper.ml_model.features.file_extension import FileExtension
-from credsweeper.ml_model.features.hartley_entropy import HartleyEntropy
 from credsweeper.ml_model.features.has_html_tag import HasHtmlTag
 from credsweeper.ml_model.features.is_secret_numeric import IsSecretNumeric
+from credsweeper.ml_model.features.length_of_attribute import LengthOfAttribute
+from credsweeper.ml_model.features.morpheme_dense import MorphemeDense
 from credsweeper.ml_model.features.search_in_attribute import SearchInAttribute
-from credsweeper.ml_model.features.reny_entropy import RenyiEntropy
 from credsweeper.ml_model.features.rule_name import RuleName
-from credsweeper.ml_model.features.shannon_entropy import ShannonEntropy
 from credsweeper.ml_model.features.word_in_line import WordInLine
 from credsweeper.ml_model.features.word_in_path import WordInPath
 from credsweeper.ml_model.features.word_in_value import WordInValue

diff --git a/credsweeper/ml_model/features/char_set.py b/credsweeper/ml_model/features/char_set.py
diff --git a/credsweeper/ml_model/features/entropy_evaluation.py b/credsweeper/ml_model/features/entropy_evaluation.py
@@ -0,0 +1,81 @@
+import math
+from typing import Dict, List, Set
+
+import numpy as np
+
+from credsweeper.common.constants import Base, Chars, ML_HUNK
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.features.feature import Feature
+
+
+class EntropyEvaluation(Feature):
+    """
+    Renyi and Shannon entropy evaluation with Hartley entropy normalization.
+    The result is augmented with flags of possible sets of chars (hex, base64, etc.)
+    Only the beginning of the value is analysed.
+
+    See the next link for details:
+    https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf
+
+    """
+
+    def __init__(self) -> None:
+        """Class initializer"""
+        super().__init__()
+        # Max size of ML analyzed value is ML_HUNK but value may be bigger
+        self.hunk_size = 4 * ML_HUNK
+        # log2 for every length the entropy is evaluated for: 4 .. hunk_size
+        self.log2_cache: Dict[int, float] = {x: math.log2(x) for x in range(4, self.hunk_size + 1)}
+        # one set of symbols per member of Chars, in the order of the enumeration
+        self.char_sets: List[Set[str]] = [set(x.value) for x in Chars]
+
+    def extract(self, candidate: Candidate) -> np.ndarray:
+        """Returns normalized entropies and flags of possible sets of characters
+
+        Args:
+            candidate: only the value of the first LineData item is analysed
+
+        Return:
+            float32 array with 3 + len(self.char_sets) items.
+            Items 0, 1, 2 are Renyi (alpha=0.5), Shannon and Renyi (alpha=2) entropy
+            divided by log2 of the analysed length; they are 0 for a value shorter than 4.
+            Each next item is 1.0 when all symbols of a non-empty value belong to the set.
+
+        """
+        result = np.zeros(shape=3 + len(self.char_sets), dtype=np.float32)
+        # only head of value will be analyzed
+        value = candidate.line_data_list[0].value[:self.hunk_size]
+        size = len(value)
+        if 0 == size:
+            # nothing to evaluate: all features are zero for an empty value
+            return result
+
+        uniq, counts = np.unique(list(value), return_counts=True)
+        if 4 <= size:
+            # evaluate the entropy for a value of at least 4
+            probabilities = counts / size
+            # size is limited by the slice, so the cache has it; log2 fallback is for safety
+            hartley_entropy = self.log2_cache.get(size) or math.log2(size)
+
+            # renyi_entropy alpha=0.5
+            sum_prob_05 = np.sum(probabilities ** 0.5)
+            renyi_entropy_05 = 2 * np.log2(sum_prob_05)
+            result[0] = renyi_entropy_05 / hartley_entropy
+
+            # shannon_entropy or renyi_entropy alpha=1
+            shannon_entropy = -np.sum(probabilities * np.log2(probabilities))
+            result[1] = shannon_entropy / hartley_entropy
+
+            # renyi_entropy alpha=2
+            sum_prob_2 = np.sum(probabilities ** 2)
+            renyi_entropy_2 = -1 * np.log2(sum_prob_2)
+            result[2] = renyi_entropy_2 / hartley_entropy
+
+        # check charset for every non-empty value, regardless of its length
+        # use the new variable to deal with mypy
+        uniq_set = set(uniq)
+        for n, char_set in enumerate(self.char_sets):
+            if not uniq_set.difference(char_set):
+                result[3 + n] = 1.0
+
+        return result
diff --git a/credsweeper/ml_model/features/length_of_attribute.py b/credsweeper/ml_model/features/length_of_attribute.py
@@ -0,0 +1,49 @@
+import numpy as np
+
+from credsweeper.common.constants import ML_HUNK
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.features.feature import Feature
+
+
+class LengthOfAttribute(Feature):
+    """Feature returns the length of a LineData attribute normalized by the ML hunk size"""
+
+    def __init__(self, attribute: str):
+        """Class initializer
+
+        Args:
+            attribute: name of the measured attribute - "line", "value" or "variable"
+
+        Raises:
+            ValueError: the attribute is not supported
+
+        """
+        super().__init__()
+        if "line" == attribute:
+            # a line gets the doubled hunk
+            limit = 2 * ML_HUNK + 1
+        elif attribute in ("value", "variable"):
+            limit = ML_HUNK + 1
+        else:
+            raise ValueError(f"Not supported attribute '{attribute}'")
+        self.hunk_plus = limit
+        self.attribute = attribute
+
+    def extract(self, candidate: Candidate) -> np.ndarray:
+        """Returns the normalized length of the attribute of the first LineData member
+
+        Return:
+            single item array: 0.0 for an empty or missing attribute,
+            length / hunk_plus while the length is less than hunk_plus, otherwise 1.0
+
+        """
+        attribute = getattr(candidate.line_data_list[0], self.attribute, None)
+        if not attribute:
+            # the attribute is empty
+            return np.array([0.0])
+        length = len(attribute)
+        if self.hunk_plus <= length:
+            # 1.0 means the attribute is oversize
+            return np.array([1.0])
+        # should be in (0, 1)
+        return np.array([length / self.hunk_plus])
diff --git a/credsweeper/ml_model/features/morpheme_dense.py b/credsweeper/ml_model/features/morpheme_dense.py
@@ -0,0 +1,35 @@
+import string
+from typing import Dict, Set
+
+import numpy as np
+
+from credsweeper.common import KeywordChecklist, static_keyword_checklist
+from credsweeper.common.constants import Base, Chars
+from credsweeper.credentials import Candidate
+from credsweeper.ml_model.features.feature import Feature
+
+
+class MorphemeDense(Feature):
+    """Feature estimates how densely a value is filled with known morphemes"""
+
+    def __init__(self) -> None:
+        """Class initializer"""
+        super().__init__()
+
+    def extract(self, candidate: Candidate) -> float:
+        """Returns the number of found morphemes related to the length of the value
+
+        The lowercased value of the first LineData member is checked against each
+        item of static_keyword_checklist.morpheme_set; an item is counted once.
+
+        Return:
+            3.0 * found morphemes / length of the value, or 0.0 for an empty value
+
+        """
+        value = candidate.line_data_list[0].value.lower()
+        if not value:
+            # empty value case
+            return 0.0
+        found = sum(1 for morpheme in static_keyword_checklist.morpheme_set if morpheme in value)
+        # normalization: minimal morpheme length is 3
+        return 3.0 * found / len(value)