-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[kerastuner] 2024-11-11T16:32:30+02:00
- Loading branch information
Showing
22 changed files
with
1,533 additions
and
605 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import math | ||
from typing import Dict, List, Set | ||
|
||
import numpy as np | ||
|
||
from credsweeper.common.constants import Base, Chars, ML_HUNK | ||
from credsweeper.credentials import Candidate | ||
from credsweeper.ml_model.features.feature import Feature | ||
|
||
|
||
class EntropyEvaluation(Feature):
    """
    Renyi and Shannon entropy evaluation with Hartley entropy normalization.

    The produced vector has ``3 + len(self.char_sets)`` items:
      * [0] Renyi entropy (alpha=0.5) divided by Hartley entropy
      * [1] Shannon entropy (Renyi, alpha=1) divided by Hartley entropy
      * [2] Renyi entropy (alpha=2) divided by Hartley entropy
      * [3:] one flag per character set built from ``Chars`` (hex, base64, etc.)

    Only the head of the value (``self.hunk_size`` characters) is analysed.
    See next link for details:
    https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf
    """

    def __init__(self) -> None:
        """Class initializer"""
        super().__init__()
        # Max size of ML analyzed value is ML_HUNK but value may be bigger
        self.hunk_size = 4 * ML_HUNK
        # Hartley entropy (log2 of the length) precalculated for every length that gets entropy evaluation
        self.log2_cache: Dict[int, float] = {x: math.log2(x) for x in range(4, self.hunk_size + 1)}
        # one set of allowed characters per member of Chars
        self.char_sets: List[Set[str]] = [set(x.value) for x in Chars]

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns normalized entropies and flags of matching character sets.

        Args:
            candidate: the candidate; only the value of its first LineData is used

        Return:
            float32 array of ``3 + len(self.char_sets)`` items, zero-filled for an empty value

        """
        # only head of value will be analyzed
        result = np.zeros(shape=3 + len(self.char_sets), dtype=np.float32)
        value = candidate.line_data_list[0].value[:self.hunk_size]
        size = len(value)
        uniq, counts = np.unique(list(value), return_counts=True)
        if 4 <= size:
            # evaluate the entropy for a value of at least 4
            probabilities = counts / size
            hartley_entropy = self.log2_cache.get(size)
            if hartley_entropy is None:
                # The cache covers 4..hunk_size, so a miss is possible only when hunk_size was changed
                # after initialization. Calculate the value instead of dividing by a wrong sentinel.
                hartley_entropy = math.log2(size)

            # renyi_entropy alpha=0.5
            sum_prob_05 = np.sum(probabilities ** 0.5)
            renyi_entropy_05 = 2 * np.log2(sum_prob_05)
            result[0] = renyi_entropy_05 / hartley_entropy

            # shannon_entropy or renyi_entropy alpha=1
            shannon_entropy = -np.sum(probabilities * np.log2(probabilities))
            result[1] = shannon_entropy / hartley_entropy

            # renyi_entropy alpha=2
            sum_prob_2 = np.sum(probabilities ** 2)
            renyi_entropy_2 = -1 * np.log2(sum_prob_2)
            result[2] = renyi_entropy_2 / hartley_entropy

        elif 0 < size:
            # check charset for non-zero value
            # NOTE(review): this branch is reached only for values of 1..3 symbols, so the charset flags
            # stay zero for longer values - confirm this is intended
            # use the new variable to deal with mypy
            uniq_set = set(uniq)
            for n, i in enumerate(self.char_sets):
                # the flag is set when every symbol of the value belongs to the charset
                if not uniq_set.difference(i):
                    result[3 + n] = 1.0

        return result
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
import numpy as np | ||
|
||
from credsweeper.common.constants import ML_HUNK | ||
from credsweeper.credentials import Candidate | ||
from credsweeper.ml_model.features.feature import Feature | ||
|
||
|
||
class LengthOfAttribute(Feature):
    """Abstract class to obtain a normalized length of a LineData attribute"""

    def __init__(self, attribute: str):
        """Class initializer

        Args:
            attribute: name of the LineData attribute - "line", "value" or "variable"

        Raises:
            ValueError: the attribute name is not supported

        """
        super().__init__()
        if attribute == "line":
            # a line gets a doubled limit
            limit = 2 * ML_HUNK + 1
        elif attribute in ("value", "variable"):
            limit = ML_HUNK + 1
        else:
            raise ValueError(f"Not supported attribute '{attribute}'")
        self.hunk_plus = limit
        self.attribute = attribute

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns normalized length of the attribute of first LineData member

        Return:
            one-item array: 0.0 for an empty or missing attribute, 1.0 for an oversize one,
            otherwise the length divided by the limit

        """
        content = getattr(candidate.line_data_list[0], self.attribute, None)
        if not content:
            # the attribute is empty
            return np.array([0.0])
        length = len(content)
        if self.hunk_plus <= length:
            # 1.0 means the attribute is oversize
            return np.array([1.0])
        # should be in (0, 1)
        return np.array([length / self.hunk_plus])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import string | ||
from typing import Dict, Set | ||
|
||
import numpy as np | ||
|
||
from credsweeper.common import KeywordChecklist, static_keyword_checklist | ||
from credsweeper.common.constants import Base, Chars | ||
from credsweeper.credentials import Candidate | ||
from credsweeper.ml_model.features.feature import Feature | ||
|
||
|
||
class MorphemeDense(Feature):
    """Feature calculates morphemes density for a value"""

    def __init__(self) -> None:
        """Class initializer"""
        super().__init__()

    def extract(self, candidate: Candidate) -> float:
        """Returns the number of known morphemes found in the lowercased value, scaled by 3 / length

        Return:
            0.0 for an empty value, otherwise 3.0 * found morphemes / length of the value

        """
        value = candidate.line_data_list[0].value.lower()
        if not value:
            # empty value case
            return 0.0
        # each morpheme is counted once, no matter how many times it occurs in the value
        found = sum(1 for morpheme in static_keyword_checklist.morpheme_set if morpheme in value)
        # normalization: minimal morpheme length is 3
        return 3.0 * found / len(value)
Oops, something went wrong.