Samsung · babenek · Dec 9, 2024 · Dec 5, 2024 · Dec 5, 2024 · Dec 9, 2024
@@ -1,6 +1,6 @@
-META MD5 414228344bac7e55c5127be7b244e460
-DATA MD5 abd9c025d5c323af814fbeb33f469c90
-DATA: 16342283 interested lines. MARKUP: 62020 items
+META MD5 5bb0a05fd77c2761b8414bba41103939
+DATA MD5 9e77a2d9f718f175264ab5a386ae86c4
+DATA: 16342283 interested lines. MARKUP: 62022 items
 FileType           FileNumber    ValidLines    Positives    Negatives    Templates
 ---------------  ------------  ------------  -----------  -----------  -----------
                           194         28318           71          418           90
@@ -82,7 +82,7 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .ipynb                      1           134                         5
 .j                          1           241            4
 .j2                        30          5530            6          186           10
-.java                     621        134132          362         1365          171
+.java                     621        134132          368         1365          171
 .jenkinsfile                1            58            2            6
 .jinja2                     1            64                         2
 .js                       659        536413          531         2497          331
@@ -222,16 +222,16 @@ FileType           FileNumber    ValidLines    Positives    Negatives    Templat
 .yml                      419         36169          559          889          376
 .zsh                        6           872                        12
 .zsh-theme                  1            97                         1
-TOTAL:                  10232      16342283        12255        49692         5101
-credsweeper result_cnt : 11517, lost_cnt : 0, true_cnt : 11342, false_cnt : 175
+TOTAL:                  10232      16342283        12261        49692         5101
+credsweeper result_cnt : 11521, lost_cnt : 0, true_cnt : 11346, false_cnt : 175
 Rules                             Positives    Negatives    Templates    Reported     TP    FP     TN    FN       FPR       FNR       ACC       PRC       RCL        F1
 ------------------------------  -----------  -----------  -----------  ----------  -----  ----  -----  ----  --------  --------  --------  --------  --------  --------
 API                                     130         3166          188         125    123     2   3352     7  0.000596  0.053846  0.997417  0.984000  0.946154  0.964706
 AWS Client ID                           168           21            0         160    160     0     21     8  0.000000  0.047619  0.957672  1.000000  0.952381  0.975610
 AWS Multi                                82           10            0          84     82     1      9     0  0.100000  0.000000  0.989130  0.987952  1.000000  0.993939
 AWS S3 Bucket                            67           23            0          92     67    23      0     0  1.000000  0.000000  0.744444  0.744444  1.000000  0.853503
 Atlassian Old PAT token                  27          308            3          12      3     8    303    24  0.025723  0.888889  0.905325  0.272727  0.111111  0.157895
-Auth                                    414         2739           82         390    387     3   2818    27  0.001063  0.065217  0.990726  0.992308  0.934783  0.962687
+Auth                                    417         2739           82         393    390     3   2818    27  0.001063  0.064748  0.990735  0.992366  0.935252  0.962963
 Azure Access Token                       19            0            0          12     12     0      0     7            0.368421  0.631579  1.000000  0.631579  0.774194
 BASE64 Private Key                        7            4            0           7      7     0      4     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 BASE64 encoded PEM Private Key            7            0            0           5      5     0      0     2            0.285714  0.714286  1.000000  0.714286  0.833333
@@ -258,17 +258,17 @@ JSON Web Token                          170           61            0         13
 Jira / Confluence PAT token               0            4            0                  0     0      4     0  0.000000            1.000000
 Jira 2FA                                 15            6            1          12     12     0      7     3  0.000000  0.200000  0.863636  1.000000  0.800000  0.888889
 Key                                    3909        15717          485        3944   3893    51  16151    16  0.003148  0.004093  0.996668  0.987069  0.995907  0.991468
-Nonce                                    91           49            0          89     88     1     48     3  0.020408  0.032967  0.971429  0.988764  0.967033  0.977778
+Nonce                                    93           49            0          91     90     1     48     3  0.020408  0.032258  0.971831  0.989011  0.967742  0.978261
 Other                                     8         7445            1                  0     0   7446     8  0.000000  1.000000  0.998927            0.000000
 PEM Private Key                        1019         1483            0        1023   1019     4   1479     0  0.002697  0.000000  0.998401  0.996090  1.000000  0.998041
-Password                               1869         7536         2680        1776   1758    18  10198   111  0.001762  0.059390  0.989326  0.989865  0.940610  0.964609
+Password                               1869         7536         2680        1774   1756    18  10198   113  0.001762  0.060460  0.989160  0.989853  0.939540  0.964041
 Salt                                     47           76            1          44     44     0     77     3  0.000000  0.063830  0.975806  1.000000  0.936170  0.967033
 Secret                                 1297         1576          802        1288   1283     5   2373    14  0.002103  0.010794  0.994830  0.996118  0.989206  0.992650
 Seed                                      1            6            0                  0     0      6     1  0.000000  1.000000  0.857143            0.000000
 Slack Token                               4            1            0           4      4     0      1     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 Tencent WeChat API App ID                 6            0            0           6      6     0      0     0            0.000000  1.000000  1.000000  1.000000  1.000000
-Token                                   643         4170          454         616    614     2   4622    29  0.000433  0.045101  0.994114  0.996753  0.954899  0.975377
+Token                                   644         4170          454         617    615     2   4622    29  0.000433  0.045031  0.994115  0.996759  0.954969  0.975416
 Twilio Credentials                       30           39            0          30     30     0     39     0  0.000000  0.000000  1.000000  1.000000  1.000000  1.000000
 URL Credentials                         210          157          215         205    205     0    372     5  0.000000  0.023810  0.991409  1.000000  0.976190  0.987952
 UUID                                   1069          265            0        1068   1067     1    264     2  0.003774  0.001871  0.997751  0.999064  0.998129  0.998596
-                                      12255        49692         5101       11524  11342   175  49517   913  0.003522  0.074500  0.982437  0.984805  0.925500  0.954232
+                                      12261        49692         5101       11528  11346   175  49517   915  0.003522  0.074627  0.982406  0.984810  0.925373  0.954167
@@ -3,14 +3,14 @@
 
 class KeywordPattern:
     """Pattern set of keyword types"""
-    key_left = r"(\\[nrt])?"\
-               r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,]*)" \
+    key_left = r"(\\[nrt]|%[0-9a-f]{2})?"\
+               r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
                r"(?P<keyword>"
     # a keyword will be inserted here
     key_right = r")" \
-                r"[^:='\"`<>{?!&]*)[`'\"]*)"  # <variable>
+                r"[^%:='\"`<>{?!&]*)[`'\"]*)"  # <variable>
     separator = r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*" \
-                r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=)" \
+                r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=|%3d)" \
                 r"(\s|\\+[tnr])*"
     # might be curly, square or parenthesis with words before
     wrap = r"(?P<wrap>(" \

@@ -33,7 +33,10 @@ class LineData:
     comment_starts = ("//", "* ", "#", "/*", "<!––", "%{", "%", "...", "(*", "--", "--[[", "#=")
     bash_param_split = re.compile("\\s+(\\-|\\||\\>|\\w+?\\>|\\&)")
     line_endings = re.compile(r"\\{1,8}[nr]")
-    url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE)
+    # https://en.wikipedia.org/wiki/Percent-encoding
+    url_percent_split = re.compile(r"%(21|23|24|26|27|28|29|2a|2b|2c|2f|3a|3b|3d|3f|40|5b|5d)", flags=re.IGNORECASE)
+    url_unicode_split = re.compile(r"\\u00(0000)?(21|23|24|26|27|28|29|2a|2b|2c|2f|3a|3b|3d|3f|40|5b|5d)",
+                                   flags=re.IGNORECASE)
     # some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt
     # \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML
     url_scheme_part_regex = re.compile(r"[0-9A-Za-z.-]{3}")
@@ -81,6 +84,7 @@ def __init__(
         # is set when variable & value are in URL for any source type
         self.url_part = False
         self.wrap = None
+        self._3d_escaped_separator = False
 
         self.initialize(match_obj)
 
@@ -124,6 +128,8 @@ def get_span_from_match_obj(_match_obj: re.Match, group: str) -> Tuple[int, int]
         self.value_leftquote = get_group_from_match_obj(match_obj, "value_leftquote")
         self.value_rightquote = get_group_from_match_obj(match_obj, "value_rightquote")
         self.wrap = get_group_from_match_obj(match_obj, "wrap")
+        # percent encoded '=' in url
+        self._3d_escaped_separator = bool(self.separator) and "%3D" == self.separator.upper()
         self.sanitize_value()
         self.sanitize_variable()
 
@@ -159,6 +165,7 @@ def check_url_part(self) -> bool:
         self.url_part &= not self.url_chars_not_allowed_pattern.search(line_before_value, pos=url_pos + 3)
         self.url_part |= self.line[self.variable_start - 1] in "?&" if 0 < self.variable_start else False
         self.url_part |= bool(self.url_value_pattern.match(self.value))
+        self.url_part |= self._3d_escaped_separator
         return self.url_part
 
     def clean_url_parameters(self) -> None:
@@ -173,9 +180,9 @@ def clean_url_parameters(self) -> None:
             self.value = self.value.split('&', maxsplit=1)[0].split(';', maxsplit=1)[0].split('#', maxsplit=1)[0]
             if not self.variable.endswith("://"):
                 # skip sanitize in case of URL credential rule
-                value_spl = self.url_param_split.split(self.value)
-                if len(value_spl) > 1:
-                    self.value = value_spl[0]
+                self.value = self.url_unicode_split.split(self.value)[0]
+                if self._3d_escaped_separator:
+                    self.value = self.url_percent_split.split(self.value)[0]
 
     def clean_bash_parameters(self) -> None:
         """Split variable and value by bash special characters, if line assumed to be CLI command."""
@@ -198,7 +205,7 @@ def clean_toml_parameters(self) -> None:
             cleaning_required = False
             for left, right in [('{', '}'), ('[', ']'), ('(', ')')]:
                 if self.value.endswith(right) and left not in self.value \
-                      and line_before_value.count(left) > line_before_value.count(right):
+                        and line_before_value.count(left) > line_before_value.count(right):
                     # a full bracket match is not reasonable to implement because the opening character may be on another line
                     self.value = self.value[:-1]
                     cleaning_required = True

@@ -1,24 +1,27 @@
 from pathlib import Path
 
 # total number of files in test samples
-SAMPLES_FILES_COUNT: int = 140
+SAMPLES_FILES_COUNT = 140
 
 # the lowest value of ML threshold is used to display possible lowest values
 NEGLIGIBLE_ML_THRESHOLD = 0.0001
 
-# credentials count after scan
-SAMPLES_CRED_COUNT: int = 412
-SAMPLES_CRED_LINE_COUNT: int = 430
+# credentials count after scan with negligible ML threshold
+SAMPLES_CRED_COUNT = 418
+SAMPLES_CRED_LINE_COUNT = 437
+
+# number of credentials filtered out by ML
+ML_FILTERED = 43
 
 # credentials count after post-processing
-SAMPLES_POST_CRED_COUNT: int = 369
+SAMPLES_POST_CRED_COUNT = SAMPLES_CRED_COUNT - ML_FILTERED
 
 # with option --doc
-SAMPLES_IN_DOC = 448
+SAMPLES_IN_DOC = 451
 
 # archived credentials that are not found without --depth
 SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 29
-SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 53
+SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 54
 SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1
 
 # well known string with all latin letters

@@ -4,15 +4,14 @@
     NEGLIGIBLE_ML_THRESHOLD
 
 DATA_TEST_CFG: List[Dict[str, Any]] = [{
-    "__cred_count": SAMPLES_POST_CRED_COUNT,
-    "sort_output": True,
-    "json_filename": "output.json"
-}, {
     "__cred_count": SAMPLES_CRED_COUNT,
     "sort_output": True,
-    "hashed": True,
     "json_filename": "ml_threshold.json",
     "ml_threshold": NEGLIGIBLE_ML_THRESHOLD
+}, {
+    "__cred_count": SAMPLES_POST_CRED_COUNT,
+    "sort_output": True,
+    "json_filename": "output.json"
 }, {
     "__cred_count": SAMPLES_IN_DOC,
     "sort_output": True,