Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

%3D separator for keywords #628

Merged
merged 4 commits into from
Dec 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions .ci/benchmark.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
META MD5 414228344bac7e55c5127be7b244e460
DATA MD5 abd9c025d5c323af814fbeb33f469c90
DATA: 16342283 interested lines. MARKUP: 62020 items
META MD5 5bb0a05fd77c2761b8414bba41103939
DATA MD5 9e77a2d9f718f175264ab5a386ae86c4
DATA: 16342283 interested lines. MARKUP: 62022 items
FileType FileNumber ValidLines Positives Negatives Templates
--------------- ------------ ------------ ----------- ----------- -----------
194 28318 71 418 90
Expand Down Expand Up @@ -82,7 +82,7 @@ FileType FileNumber ValidLines Positives Negatives Templat
.ipynb 1 134 5
.j 1 241 4
.j2 30 5530 6 186 10
.java 621 134132 362 1365 171
.java 621 134132 368 1365 171
.jenkinsfile 1 58 2 6
.jinja2 1 64 2
.js 659 536413 531 2497 331
Expand Down Expand Up @@ -222,16 +222,16 @@ FileType FileNumber ValidLines Positives Negatives Templat
.yml 419 36169 559 889 376
.zsh 6 872 12
.zsh-theme 1 97 1
TOTAL: 10232 16342283 12255 49692 5101
credsweeper result_cnt : 11517, lost_cnt : 0, true_cnt : 11342, false_cnt : 175
TOTAL: 10232 16342283 12261 49692 5101
credsweeper result_cnt : 11521, lost_cnt : 0, true_cnt : 11346, false_cnt : 175
Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1
------------------------------ ----------- ----------- ----------- ---------- ----- ---- ----- ---- -------- -------- -------- -------- -------- --------
API 130 3166 188 125 123 2 3352 7 0.000596 0.053846 0.997417 0.984000 0.946154 0.964706
AWS Client ID 168 21 0 160 160 0 21 8 0.000000 0.047619 0.957672 1.000000 0.952381 0.975610
AWS Multi 82 10 0 84 82 1 9 0 0.100000 0.000000 0.989130 0.987952 1.000000 0.993939
AWS S3 Bucket 67 23 0 92 67 23 0 0 1.000000 0.000000 0.744444 0.744444 1.000000 0.853503
Atlassian Old PAT token 27 308 3 12 3 8 303 24 0.025723 0.888889 0.905325 0.272727 0.111111 0.157895
Auth 414 2739 82 390 387 3 2818 27 0.001063 0.065217 0.990726 0.992308 0.934783 0.962687
Auth 417 2739 82 393 390 3 2818 27 0.001063 0.064748 0.990735 0.992366 0.935252 0.962963
Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194
BASE64 Private Key 7 4 0 7 7 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333
Expand All @@ -258,17 +258,17 @@ JSON Web Token 170 61 0 13
Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000
Jira 2FA 15 6 1 12 12 0 7 3 0.000000 0.200000 0.863636 1.000000 0.800000 0.888889
Key 3909 15717 485 3944 3893 51 16151 16 0.003148 0.004093 0.996668 0.987069 0.995907 0.991468
Nonce 91 49 0 89 88 1 48 3 0.020408 0.032967 0.971429 0.988764 0.967033 0.977778
Nonce 93 49 0 91 90 1 48 3 0.020408 0.032258 0.971831 0.989011 0.967742 0.978261
Other 8 7445 1 0 0 7446 8 0.000000 1.000000 0.998927 0.000000
PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041
Password 1869 7536 2680 1776 1758 18 10198 111 0.001762 0.059390 0.989326 0.989865 0.940610 0.964609
Password 1869 7536 2680 1774 1756 18 10198 113 0.001762 0.060460 0.989160 0.989853 0.939540 0.964041
Salt 47 76 1 44 44 0 77 3 0.000000 0.063830 0.975806 1.000000 0.936170 0.967033
Secret 1297 1576 802 1288 1283 5 2373 14 0.002103 0.010794 0.994830 0.996118 0.989206 0.992650
Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000
Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
Tencent WeChat API App ID 6 0 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Token 643 4170 454 616 614 2 4622 29 0.000433 0.045101 0.994114 0.996753 0.954899 0.975377
Token 644 4170 454 617 615 2 4622 29 0.000433 0.045031 0.994115 0.996759 0.954969 0.975416
Twilio Credentials 30 39 0 30 30 0 39 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
URL Credentials 210 157 215 205 205 0 372 5 0.000000 0.023810 0.991409 1.000000 0.976190 0.987952
UUID 1069 265 0 1068 1067 1 264 2 0.003774 0.001871 0.997751 0.999064 0.998129 0.998596
12255 49692 5101 11524 11342 175 49517 913 0.003522 0.074500 0.982437 0.984805 0.925500 0.954232
12261 49692 5101 11528 11346 175 49517 915 0.003522 0.074627 0.982406 0.984810 0.925373 0.954167
8 changes: 4 additions & 4 deletions credsweeper/common/keyword_pattern.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@

class KeywordPattern:
"""Pattern set of keyword types"""
key_left = r"(\\[nrt])?"\
r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,]*)" \
key_left = r"(\\[nrt]|%[0-9a-f]{2})?"\
r"(?P<variable>(([`'\"]+[^:='\"`}<>\\/&?]*|[^:='\"`}<>\s()\\/&?;,%]*)" \
r"(?P<keyword>"
# there will be inserted a keyword
key_right = r")" \
r"[^:='\"`<>{?!&]*)[`'\"]*)" # <variable>
r"[^%:='\"`<>{?!&]*)[`'\"]*)" # <variable>
separator = r"(\s|\\+[tnr])*\]?(\s|\\+[tnr])*" \
r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=)" \
r"(?P<separator>:( [a-z]{3,9}[?]? )?=|:|=(>|&gt;|\\u0026gt;)|!=|===|==|=|%3d)" \
r"(\s|\\+[tnr])*"
# might be curly, square or parenthesis with words before
wrap = r"(?P<wrap>(" \
Expand Down
17 changes: 12 additions & 5 deletions credsweeper/credentials/line_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ class LineData:
comment_starts = ("//", "* ", "#", "/*", "<!––", "%{", "%", "...", "(*", "--", "--[[", "#=")
bash_param_split = re.compile("\\s+(\\-|\\||\\>|\\w+?\\>|\\&)")
line_endings = re.compile(r"\\{1,8}[nr]")
url_param_split = re.compile(r"(%|\\u(00){0,2})(26|3f)", flags=re.IGNORECASE)
# https://en.wikipedia.org/wiki/Percent-encoding
url_percent_split = re.compile(r"%(21|23|24|26|27|28|29|2a|2b|2c|2f|3a|3b|3d|3f|40|5b|5d)", flags=re.IGNORECASE)
url_unicode_split = re.compile(r"\\u00(0000)?(21|23|24|26|27|28|29|2a|2b|2c|2f|3a|3b|3d|3f|40|5b|5d)",
flags=re.IGNORECASE)
# some symbols e.g. double quotes cannot be in URL string https://www.ietf.org/rfc/rfc1738.txt
# \ - was added for case of url in escaped string \u0026amp; - means escaped & in HTML
url_scheme_part_regex = re.compile(r"[0-9A-Za-z.-]{3}")
Expand Down Expand Up @@ -81,6 +84,7 @@ def __init__(
# is set when variable & value are in URL for any source type
self.url_part = False
self.wrap = None
self._3d_escaped_separator = False

self.initialize(match_obj)

Expand Down Expand Up @@ -124,6 +128,8 @@ def get_span_from_match_obj(_match_obj: re.Match, group: str) -> Tuple[int, int]
self.value_leftquote = get_group_from_match_obj(match_obj, "value_leftquote")
self.value_rightquote = get_group_from_match_obj(match_obj, "value_rightquote")
self.wrap = get_group_from_match_obj(match_obj, "wrap")
# percent encoded '=' in url
self._3d_escaped_separator = bool(self.separator) and "%3D" == self.separator.upper()
self.sanitize_value()
self.sanitize_variable()

Expand Down Expand Up @@ -159,6 +165,7 @@ def check_url_part(self) -> bool:
self.url_part &= not self.url_chars_not_allowed_pattern.search(line_before_value, pos=url_pos + 3)
self.url_part |= self.line[self.variable_start - 1] in "?&" if 0 < self.variable_start else False
self.url_part |= bool(self.url_value_pattern.match(self.value))
self.url_part |= self._3d_escaped_separator
return self.url_part

def clean_url_parameters(self) -> None:
Expand All @@ -173,9 +180,9 @@ def clean_url_parameters(self) -> None:
self.value = self.value.split('&', maxsplit=1)[0].split(';', maxsplit=1)[0].split('#', maxsplit=1)[0]
if not self.variable.endswith("://"):
# skip sanitize in case of URL credential rule
value_spl = self.url_param_split.split(self.value)
if len(value_spl) > 1:
self.value = value_spl[0]
self.value = self.url_unicode_split.split(self.value)[0]
if self._3d_escaped_separator:
self.value = self.url_percent_split.split(self.value)[0]

def clean_bash_parameters(self) -> None:
"""Split variable and value by bash special characters, if line assumed to be CLI command."""
Expand All @@ -198,7 +205,7 @@ def clean_toml_parameters(self) -> None:
cleaning_required = False
for left, right in [('{', '}'), ('[', ']'), ('(', ')')]:
if self.value.endswith(right) and left not in self.value \
and line_before_value.count(left) > line_before_value.count(right):
and line_before_value.count(left) > line_before_value.count(right):
# a full match is not reasonable to implement because the opening character may be on another line
self.value = self.value[:-1]
cleaning_required = True
Expand Down
17 changes: 10 additions & 7 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,27 @@
from pathlib import Path

# total number of files in test samples
SAMPLES_FILES_COUNT: int = 140
SAMPLES_FILES_COUNT = 140

# the lowest value of ML threshold is used to display possible lowest values
NEGLIGIBLE_ML_THRESHOLD = 0.0001

# credentials count after scan
SAMPLES_CRED_COUNT: int = 412
SAMPLES_CRED_LINE_COUNT: int = 430
# credentials count after scan with negligible ML threshold
SAMPLES_CRED_COUNT = 418
SAMPLES_CRED_LINE_COUNT = 437

# Number of filtered credentials with ML
ML_FILTERED = 43

# credentials count after post-processing
SAMPLES_POST_CRED_COUNT: int = 369
SAMPLES_POST_CRED_COUNT = SAMPLES_CRED_COUNT - ML_FILTERED

# with option --doc
SAMPLES_IN_DOC = 448
SAMPLES_IN_DOC = 451

# archived credentials that are not found without --depth
SAMPLES_IN_DEEP_1 = SAMPLES_POST_CRED_COUNT + 29
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 53
SAMPLES_IN_DEEP_2 = SAMPLES_IN_DEEP_1 + 54
SAMPLES_IN_DEEP_3 = SAMPLES_IN_DEEP_2 + 1

# well known string with all latin letters
Expand Down
9 changes: 4 additions & 5 deletions tests/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,14 @@
NEGLIGIBLE_ML_THRESHOLD

DATA_TEST_CFG: List[Dict[str, Any]] = [{
"__cred_count": SAMPLES_POST_CRED_COUNT,
"sort_output": True,
"json_filename": "output.json"
}, {
"__cred_count": SAMPLES_CRED_COUNT,
"sort_output": True,
"hashed": True,
"json_filename": "ml_threshold.json",
"ml_threshold": NEGLIGIBLE_ML_THRESHOLD
}, {
"__cred_count": SAMPLES_POST_CRED_COUNT,
"sort_output": True,
"json_filename": "output.json"
}, {
"__cred_count": SAMPLES_IN_DOC,
"sort_output": True,
Expand Down
Loading
Loading