Skip to content

Commit

Permalink
missed
Browse files Browse the repository at this point in the history
  • Loading branch information
babenek committed Oct 17, 2024
1 parent 727f64d commit f92c06a
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 0 deletions.
65 changes: 65 additions & 0 deletions credsweeper/ml_model/features/entropy_evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import math
from typing import Dict, List, Set

import numpy as np

from credsweeper.common.constants import Base, Chars, ML_HUNK
from credsweeper.credentials import Candidate
from credsweeper.ml_model.features.feature import Feature


class EntropyEvaluation(Feature):
    """
    Renyi and Shannon entropy evaluation with Hartley entropy normalization.

    The feature is augmented with flags for the possible sets of chars
    (hex, base64, etc.) that the value fits into.
    Only the beginning of the value is analysed.
    See next link for details:
    https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf
    """

    def __init__(self) -> None:
        """Precompute the log2 lookup table and the character sets to test against."""
        super().__init__()
        # Max size of ML analyzed value is ML_HUNK but value may be bigger
        self.hunk_size = 4 * ML_HUNK
        # log2(size) for every size the entropy branch can see: 4..hunk_size inclusive
        self.log2_cache: Dict[int, float] = {x: math.log2(x) for x in range(4, self.hunk_size + 1)}
        # one set of characters per member of Chars, in the enumeration order
        self.char_sets: List[Set[str]] = [set(x.value) for x in Chars]

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns normalized entropies and flags of the possible sets of characters.

        Layout of the returned float32 vector of length 3 + len(self.char_sets):
            [0]     Renyi entropy (alpha=0.5) divided by log2(size)
            [1]     Shannon entropy (Renyi alpha=1) divided by log2(size)
            [2]     Renyi entropy (alpha=2) divided by log2(size)
            [3 + n] 1.0 when every character of the value belongs to self.char_sets[n]

        The entropy items stay 0.0 for values shorter than 4 characters and
        all items stay 0.0 for an empty value.
        """
        # only head of value will be analyzed
        result = np.zeros(shape=3 + len(self.char_sets), dtype=np.float32)
        value = candidate.line_data_list[0].value[:self.hunk_size]
        size = len(value)
        uniq, counts = np.unique(list(value), return_counts=True)

        if 4 <= size:
            # evaluate the entropy for a value of at least 4
            probabilities = counts / size
            hartley_entropy = self.log2_cache.get(size)
            if hartley_entropy is None:
                # The cache covers 4..hunk_size, so a miss is not expected; compute the
                # real value instead of dividing by a bogus placeholder.
                hartley_entropy = math.log2(size)

            # renyi_entropy alpha=0.5
            sum_prob_05 = np.sum(probabilities ** 0.5)
            renyi_entropy_05 = 2 * np.log2(sum_prob_05)
            result[0] = renyi_entropy_05 / hartley_entropy

            # shannon_entropy or renyi_entropy alpha=1
            shannon_entropy = -np.sum(probabilities * np.log2(probabilities))
            result[1] = shannon_entropy / hartley_entropy

            # renyi_entropy alpha=2
            sum_prob_2 = np.sum(probabilities ** 2)
            renyi_entropy_2 = -1 * np.log2(sum_prob_2)
            result[2] = renyi_entropy_2 / hartley_entropy

        # The charset flags augment the entropy items, so they must be evaluated
        # independently of the entropy branch (not as its `elif` alternative).
        if 0 < size:
            # check charset for non-zero value
            # use the new variable to deal with mypy
            uniq_set = set(uniq)
            for n, i in enumerate(self.char_sets):
                if not uniq_set.difference(i):
                    result[3 + n] = 1.0

        return result
31 changes: 31 additions & 0 deletions credsweeper/ml_model/features/length_of_attribute.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import numpy as np

from credsweeper.common.constants import ML_HUNK
from credsweeper.credentials import Candidate
from credsweeper.ml_model.features.feature import Feature


class LengthOfAttribute(Feature):
    """Base feature that reports the length of a LineData attribute scaled to [0, 1]."""

    def __init__(self, attribute: str):
        """Select the attribute to measure and the length that counts as oversize.

        Args:
            attribute: one of "line", "value" or "variable"

        Raises:
            ValueError: when the attribute name is not one of the supported ones
        """
        super().__init__()
        if "value" == attribute or "variable" == attribute:
            limit = ML_HUNK + 1
        elif "line" == attribute:
            # a line gets twice the hunk size
            limit = 2 * ML_HUNK + 1
        else:
            raise ValueError(f"Not supported attribute '{attribute}'")
        self.hunk_plus = limit
        self.attribute = attribute

    def extract(self, candidate: Candidate) -> np.ndarray:
        """Returns the normalized length of the attribute of the first LineData member.

        The single-item array holds 0.0 for a missing or empty attribute,
        1.0 for an oversize one and len / hunk_plus otherwise.
        """
        content = getattr(candidate.line_data_list[0], self.attribute, None)
        if not content:
            # the attribute is absent or empty
            return np.array([0.0])
        length = len(content)
        if length >= self.hunk_plus:
            # 1.0 means the attribute is oversize
            return np.array([1.0])
        # strictly between 0 and 1
        return np.array([length / self.hunk_plus])

0 comments on commit f92c06a

Please sign in to comment.