Skip to content

Commit

Permalink
[kerastuner] 2024-11-11T16:32:30+02:00
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Nov 11, 2024
1 parent 5cc9685 commit 4f66b07
Show file tree
Hide file tree
Showing 22 changed files with 1,533 additions and 605 deletions.
20 changes: 10 additions & 10 deletions .ci/benchmark.txt
Original file line number Diff line number Diff line change
Expand Up @@ -226,19 +226,19 @@ TOTAL: 10232 16342283 12255 49690 51
credsweeper result_cnt : 11521, lost_cnt : 0, true_cnt : 11342, false_cnt : 179
Rules Positives Negatives Templates Reported TP FP TN FN FPR FNR ACC PRC RCL F1
------------------------------ ----------- ----------- ----------- ---------- ----- ---- ----- ---- -------- -------- -------- -------- -------- --------
API 130 3166 188 125 123 2 3352 7 0.000596 0.053846 0.997417 0.984000 0.946154 0.964706
API 130 3166 188 124 122 2 3352 8 0.000596 0.061538 0.997130 0.983871 0.938462 0.960630
AWS Client ID 168 21 0 160 160 0 21 8 0.000000 0.047619 0.957672 1.000000 0.952381 0.975610
AWS Multi 82 10 0 88 82 5 5 0 0.500000 0.000000 0.945652 0.942529 1.000000 0.970414
AWS S3 Bucket 67 23 0 92 67 23 0 0 1.000000 0.000000 0.744444 0.744444 1.000000 0.853503
Atlassian Old PAT token 27 308 3 12 3 8 303 24 0.025723 0.888889 0.905325 0.272727 0.111111 0.157895
Auth 414 2739 82 390 387 3 2818 27 0.001063 0.065217 0.990726 0.992308 0.934783 0.962687
Auth 414 2739 82 391 385 6 2815 29 0.002127 0.070048 0.989181 0.984655 0.929952 0.956522
Azure Access Token 19 0 0 12 12 0 0 7 0.368421 0.631579 1.000000 0.631579 0.774194
BASE64 Private Key 7 4 0 7 7 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
BASE64 encoded PEM Private Key 7 0 0 5 5 0 0 2 0.285714 0.714286 1.000000 0.714286 0.833333
Bitbucket Client ID 143 2095 9 48 28 19 2085 115 0.009030 0.804196 0.940365 0.595745 0.195804 0.294737
Bitbucket Client Secret 301 807 10 40 29 11 806 272 0.013464 0.903654 0.746869 0.725000 0.096346 0.170088
CMD ConvertTo-SecureString 13 4 0 13 13 0 4 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
CMD Password 21 128 6 18 18 0 134 3 0.000000 0.142857 0.980645 1.000000 0.857143 0.923077
CMD ConvertTo-SecureString 13 4 0 12 12 0 4 1 0.000000 0.076923 0.941176 1.000000 0.923077 0.960000
CMD Password 21 128 6 20 20 0 134 1 0.000000 0.047619 0.993548 1.000000 0.952381 0.975610
CMD Secret 1 1 0 1 1 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
CMD Token 6 0 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Certificate 24 471 0 20 20 0 471 4 0.000000 0.166667 0.991919 1.000000 0.833333 0.909091
Expand All @@ -257,18 +257,18 @@ Grafana Provisioned API Key 22 1 0
JSON Web Token 170 61 0 131 131 0 61 39 0.000000 0.229412 0.831169 1.000000 0.770588 0.870432
Jira / Confluence PAT token 0 4 0 0 0 4 0 0.000000 1.000000
Jira 2FA 15 6 1 12 12 0 7 3 0.000000 0.200000 0.863636 1.000000 0.800000 0.888889
Key 3909 15717 485 3944 3893 51 16151 16 0.003148 0.004093 0.996668 0.987069 0.995907 0.991468
Nonce 91 49 0 89 88 1 48 3 0.020408 0.032967 0.971429 0.988764 0.967033 0.977778
Key 3909 15717 485 3930 3889 41 16161 20 0.002531 0.005116 0.996967 0.989567 0.994884 0.992218
Nonce 91 49 0 88 88 0 49 3 0.000000 0.032967 0.978571 1.000000 0.967033 0.983240
Other 8 7445 1 0 0 7446 8 0.000000 1.000000 0.998927 0.000000
PEM Private Key 1019 1483 0 1023 1019 4 1479 0 0.002697 0.000000 0.998401 0.996090 1.000000 0.998041
Password 1869 7535 2680 1776 1758 18 10197 111 0.001762 0.059390 0.989325 0.989865 0.940610 0.964609
Password 1869 7535 2680 1778 1763 15 10200 106 0.001468 0.056715 0.989987 0.991564 0.943285 0.966822
Salt 47 76 1 44 44 0 77 3 0.000000 0.063830 0.975806 1.000000 0.936170 0.967033
Secret 1297 1576 802 1288 1283 5 2373 14 0.002103 0.010794 0.994830 0.996118 0.989206 0.992650
Secret 1297 1576 802 1291 1286 5 2373 11 0.002103 0.008481 0.995646 0.996127 0.991519 0.993818
Seed 1 6 0 0 0 6 1 0.000000 1.000000 0.857143 0.000000
Slack Token 4 1 0 4 4 0 1 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
Tencent WeChat API App ID 6 0 0 6 6 0 0 0 0.000000 1.000000 1.000000 1.000000 1.000000
Token 643 4170 454 616 614 2 4622 29 0.000433 0.045101 0.994114 0.996753 0.954899 0.975377
Token 643 4170 454 619 617 2 4622 26 0.000433 0.040435 0.994684 0.996769 0.959565 0.977813
Twilio Credentials 30 39 0 30 30 0 39 0 0.000000 0.000000 1.000000 1.000000 1.000000 1.000000
URL Credentials 210 156 216 205 205 0 372 5 0.000000 0.023810 0.991409 1.000000 0.976190 0.987952
URL Credentials 210 156 216 209 207 2 370 3 0.005376 0.014286 0.991409 0.990431 0.985714 0.988067
UUID 1069 265 0 1068 1067 1 264 2 0.003774 0.001871 0.997751 0.999064 0.998129 0.998596
12255 49690 5102 11528 11342 179 49511 913 0.003602 0.074500 0.982371 0.984463 0.925500 0.954071
2 changes: 1 addition & 1 deletion .github/workflows/check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
if: ${{ always() && steps.code_checkout.conclusion == 'success' }}
run: |
md5sum --binary credsweeper/ml_model/ml_config.json | grep 49c4352ae9ec82ad432d49d7e51c27f1
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep ff66e97c446d0f2bbd8d37b7dfff7361
md5sum --binary credsweeper/ml_model/ml_model.onnx | grep 4abed2705dcb5565cfd3c80580d17f2a
# # # line ending

Expand Down
25 changes: 16 additions & 9 deletions credsweeper/common/constants.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import string
import typing
from enum import Enum
from typing import Optional, Union
Expand Down Expand Up @@ -79,21 +80,27 @@ class Chars(Enum):
"""

# set of characters, hexadecimal numeral system (Base16). Upper- and lowercase
HEX_CHARS = "0123456789ABCDEFabcdef"
HEX_CHARS = string.digits + "ABCDEFabcdef"
# set of characters, hexadecimal numeral system (Base16). Uppercase
BASE16UPPER = "0123456789ABCDEF"
BASE16UPPER = string.digits + "ABCDEF"
# set of characters, hexadecimal numeral system (Base16). Lowercase
BASE16LOWER = "0123456789abcdef"
BASE16LOWER = string.digits + "abcdef"
# set of 32 characters, used in Base32 encoding
BASE32_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"
BASE32_CHARS = string.ascii_uppercase + "234567"
# set of 36 characters, used in Base36 encoding
BASE36_CHARS = "abcdefghijklmnopqrstuvwxyz1234567890"
# standard base64 with padding sign
BASE64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
BASE36_CHARS = string.digits + string.ascii_lowercase
# base62 set https://en.wikipedia.org/wiki/Base62
BASE62_CHARS = string.digits + string.ascii_letters
# URL- and filename-safe standard
BASE64URL_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
BASE64URL_CHARS = BASE62_CHARS + "-_"
# standard base64
BASE64STD_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
BASE64STD_CHARS = BASE62_CHARS + "+/"
# standard base64 with padding sign
BASE64_CHARS = BASE64STD_CHARS + "="
# except whitespaces
ASCII_VISIBLE = BASE62_CHARS + string.punctuation
# all printable symbols
ASCII_PRINTABLE = string.printable


ENTROPY_LIMIT_BASE64 = 4.5
Expand Down
2 changes: 1 addition & 1 deletion credsweeper/deep_scanner/pdf_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def data_scan(
pdf_content_provider = DataContentProvider(
data=element_text.encode(),
file_path=data_provider.file_path,
file_type=".xml",
file_type=data_provider.file_type,
info=f"{data_provider.info}|PDF:{page.pageid}")
new_limit = recursive_limit_size - len(pdf_content_provider.data)
element_candidates = self.recursive_scan(pdf_content_provider, depth, new_limit)
Expand Down
6 changes: 4 additions & 2 deletions credsweeper/filters/value_file_path_check.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re

from credsweeper.common.constants import Chars
from credsweeper.common import static_keyword_checklist
from credsweeper.config import Config
Expand All @@ -14,8 +16,8 @@ class ValueFilePathCheck(Filter):
and do not have any special characters ( !$@`&*()+)
"""
base64_possible_set = set(Chars.BASE64_CHARS.value) | set(Chars.BASE64URL_CHARS.value)
unusual_windows_symbols_in_path = "\t\n\r !$@`&*()[]{}<>+=;,~^"
unusual_linux_symbols_in_path = unusual_windows_symbols_in_path + ":\\"
unusual_windows_symbols_in_path = "\t\n\r!$@`&*(){}<>+=;,~^"
unusual_linux_symbols_in_path = "\t\n\r!@`&*<>+=;,~^:\\"

def __init__(self, config: Config = None) -> None:
pass
Expand Down
7 changes: 3 additions & 4 deletions credsweeper/ml_model/features/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
from credsweeper.ml_model.features.char_set import CharSet
from credsweeper.ml_model.features.entropy_evaluation import EntropyEvaluation
from credsweeper.ml_model.features.file_extension import FileExtension
from credsweeper.ml_model.features.hartley_entropy import HartleyEntropy
from credsweeper.ml_model.features.has_html_tag import HasHtmlTag
from credsweeper.ml_model.features.is_secret_numeric import IsSecretNumeric
from credsweeper.ml_model.features.length_of_attribute import LengthOfAttribute
from credsweeper.ml_model.features.morpheme_dense import MorphemeDense
from credsweeper.ml_model.features.search_in_attribute import SearchInAttribute
from credsweeper.ml_model.features.reny_entropy import RenyiEntropy
from credsweeper.ml_model.features.rule_name import RuleName
from credsweeper.ml_model.features.shannon_entropy import ShannonEntropy
from credsweeper.ml_model.features.word_in_line import WordInLine
from credsweeper.ml_model.features.word_in_path import WordInPath
from credsweeper.ml_model.features.word_in_value import WordInValue
Expand Down
41 changes: 0 additions & 41 deletions credsweeper/ml_model/features/char_set.py

This file was deleted.

65 changes: 65 additions & 0 deletions credsweeper/ml_model/features/entropy_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import math
from typing import Dict, List, Set

import numpy as np

from credsweeper.common.constants import Base, Chars, ML_HUNK
from credsweeper.credentials import Candidate
from credsweeper.ml_model.features.feature import Feature


class EntropyEvaluation(Feature):
    """
    Renyi, Shannon entropy evaluation with Hartley entropy normalization.
    Augmentation with possible set of chars (hex, base64, etc.)
    Analyse only begin of the value
    See next link for details:
    https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf

    Layout of the extracted vector:
        [0]     Renyi entropy (alpha=0.5) divided by Hartley entropy
        [1]     Shannon entropy (Renyi alpha=1) divided by Hartley entropy
        [2]     Renyi entropy (alpha=2) divided by Hartley entropy
        [3 + n] 1.0 when every character of the value belongs to the n-th
                character set built from ``Chars``, otherwise 0.0
    """

    def __init__(self) -> None:
        """Class initializer"""
        super().__init__()
        # Max size of ML analyzed value is ML_HUNK but value may be bigger
        self.hunk_size = 4 * ML_HUNK
        # log2(size) is the Hartley entropy used for normalization; cached for every analysable size
        self.log2_cache: Dict[int, float] = {x: math.log2(x) for x in range(4, self.hunk_size + 1)}
        self.char_sets: List[Set[str]] = [set(x.value) for x in Chars]

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns real entropy and possible sets of characters

        Args:
            candidate: the candidate whose first LineData value is analysed

        Return:
            float32 array of ``3 + len(self.char_sets)`` items; all zeros for an empty value,
            entropy items stay zero for a value shorter than 4 symbols

        """
        result = np.zeros(shape=3 + len(self.char_sets), dtype=np.float32)
        # only head of value will be analyzed
        value = candidate.line_data_list[0].value[:self.hunk_size]
        size = len(value)
        if 0 == size:
            # nothing to evaluate: an empty set would falsely match every charset
            return result

        uniq, counts = np.unique(list(value), return_counts=True)

        if 4 <= size:
            # evaluate the entropy for a value of at least 4
            probabilities = counts / size
            # size is within [4, hunk_size] due to the slice above, so the key is always cached
            hartley_entropy = self.log2_cache[size]

            # renyi_entropy alpha=0.5
            sum_prob_05 = np.sum(probabilities ** 0.5)
            renyi_entropy_05 = 2 * np.log2(sum_prob_05)
            result[0] = renyi_entropy_05 / hartley_entropy

            # shannon_entropy or renyi_entropy alpha=1
            shannon_entropy = -np.sum(probabilities * np.log2(probabilities))
            result[1] = shannon_entropy / hartley_entropy

            # renyi_entropy alpha=2
            sum_prob_2 = np.sum(probabilities ** 2)
            renyi_entropy_2 = -1 * np.log2(sum_prob_2)
            result[2] = renyi_entropy_2 / hartley_entropy

        # check charset for every non-empty value - it augments the entropy, not replaces it
        # use the new variable to deal with mypy
        uniq_set = set(uniq)
        for n, char_set in enumerate(self.char_sets):
            if not uniq_set.difference(char_set):
                result[3 + n] = 1.0

        return result
31 changes: 31 additions & 0 deletions credsweeper/ml_model/features/length_of_attribute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import numpy as np

from credsweeper.common.constants import ML_HUNK
from credsweeper.credentials import Candidate
from credsweeper.ml_model.features.feature import Feature


class LengthOfAttribute(Feature):
    """Feature that yields the normalized length of one attribute of the first LineData"""

    def __init__(self, attribute: str):
        """Class initializer

        Args:
            attribute: name of the LineData member to measure: "line", "value" or "variable"

        Raises:
            ValueError: the attribute name is not one of the supported names

        """
        super().__init__()
        if attribute in ("value", "variable"):
            self.hunk_plus = ML_HUNK + 1
        elif attribute == "line":
            # a line gets twice the limit of a value or a variable
            self.hunk_plus = 2 * ML_HUNK + 1
        else:
            raise ValueError(f"Not supported attribute '{attribute}'")
        self.attribute = attribute

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns a single item array with the normalized length of the attribute

        0.0 - the attribute is missing or empty
        1.0 - the attribute length reaches or exceeds the limit
        otherwise the length divided by the limit

        """
        text = getattr(candidate.line_data_list[0], self.attribute, None)
        if not text:
            # the attribute is empty
            return np.array([0.0])
        length = len(text)
        if self.hunk_plus <= length:
            # 1.0 means the attribute is oversize
            return np.array([1.0])
        # should be in (0, 1)
        return np.array([length / self.hunk_plus])
29 changes: 29 additions & 0 deletions credsweeper/ml_model/features/morpheme_dense.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import string
from typing import Dict, Set

import numpy as np

from credsweeper.common import KeywordChecklist, static_keyword_checklist
from credsweeper.common.constants import Base, Chars
from credsweeper.credentials import Candidate
from credsweeper.ml_model.features.feature import Feature


class MorphemeDense(Feature):
    """Feature that relates the number of known morphemes found in a value to the value length"""

    def __init__(self) -> None:
        """Class initializer"""
        super().__init__()

    def extract(self, candidate: Candidate) -> float:
        """Returns the tripled number of morphemes found in the lowered value divided by its length

        Each item of static_keyword_checklist.morpheme_set is counted once when it is a substring
        of the value of the first LineData. An empty value gives 0.0

        """
        lowered = candidate.line_data_list[0].value.lower()
        if not lowered:
            # empty value case
            return 0.0
        found = sum(1 for item in static_keyword_checklist.morpheme_set if item in lowered)
        # normalization: minimal morpheme length is 3
        return 3.0 * found / len(lowered)
Loading

0 comments on commit 4f66b07

Please sign in to comment.