
Commit

fix&tests
babenek committed Nov 24, 2024
1 parent d8f7e06 commit e11cd05
Showing 7 changed files with 74 additions and 26 deletions.
2 changes: 1 addition & 1 deletion credsweeper/utils/entropy_validator.py
@@ -8,7 +8,7 @@
class EntropyValidator:
"""Verifies data entropy with base64, base36 and base16(hex)"""
CHARS_LIMIT_MAP = {
Chars.BASE64STD_CHARS: ENTROPY_LIMIT_BASE64,
Chars.BASE64STDPAD_CHARS: ENTROPY_LIMIT_BASE64,
Chars.BASE36_CHARS: ENTROPY_LIMIT_BASE3x,
Chars.HEX_CHARS: ENTROPY_LIMIT_BASE3x
}
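The mapping now keys the base64 entropy limit on the padded standard alphabet (BASE64STDPAD_CHARS). As a rough illustration of what a charset-plus-limit check does, here is a minimal sketch assuming a Shannon-entropy measure and an illustrative threshold; `looks_random_base64` and the 4.5 cutoff are hypothetical, not CredSweeper's API (the real ENTROPY_LIMIT_BASE64 is defined elsewhere in the package).

```python
import math
from collections import Counter


def shannon_entropy(data: str) -> float:
    """Shannon entropy of the string, in bits per symbol."""
    if not data:
        return 0.0
    counts = Counter(data)
    total = len(data)
    return -sum((n / total) * math.log2(n / total) for n in counts.values())


ENTROPY_LIMIT_BASE64 = 4.5  # illustrative threshold only


def looks_random_base64(candidate: str) -> bool:
    """Hypothetical helper: charset membership check plus entropy threshold."""
    base64_alphabet = set(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=")
    return set(candidate) <= base64_alphabet and shannon_entropy(candidate) > ENTROPY_LIMIT_BASE64


print(looks_random_base64("dNJKHBD34534928DRFCsnkjBUygtrd+32sd/uy"))  # True, entropy ~4.68
print(looks_random_base64("/home"))                                   # False, entropy ~2.32
```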
17 changes: 11 additions & 6 deletions experiment/main.py
@@ -54,7 +54,6 @@ def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray


def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:

print(f"Memory at start: {LogCallback.get_memory_info()}")

current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -140,8 +139,10 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
max_epochs = 100
# ^^^ the line is patched in GitHub action to speed-up test train
batch_size = 4096
log_callback = LogCallback()
patience = 5
#return

log_callback = LogCallback()
if use_tuner:
tuner = kt.GridSearch(
hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape),
@@ -150,16 +151,16 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
project_name='ml_tuning',
)
search_early_stopping = EarlyStopping(monitor="val_loss",
patience=2,
patience=patience,
mode="min",
restore_best_weights=True,
verbose=1)
tuner.search(
x=[x_train_line, x_train_variable, x_train_value, x_train_features],
y=y_train,
epochs=3,
epochs=max_epochs,
batch_size=batch_size,
callbacks=[search_early_stopping],
callbacks=[search_early_stopping, log_callback],
validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
verbose=2,
)
@@ -172,7 +173,11 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape,
x_full_features.shape).build()

early_stopping = EarlyStopping(monitor="val_loss", patience=7, mode="min", restore_best_weights=True, verbose=1)
early_stopping = EarlyStopping(monitor="val_loss",
patience=patience,
mode="min",
restore_best_weights=True,
verbose=1)
model_checkpoint = ModelCheckpoint(filepath=str(dir_path / f"{current_time}.best_model"),
monitor="val_loss",
save_best_only=True,
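The training script now shares one `patience` value between the tuner search and the final fit, lets the tuner run up to `max_epochs` under early stopping, and attaches the new `LogCallback` in both places. A minimal, self-contained sketch of that callback wiring, using a toy model and random data rather than the experiment's real inputs:

```python
import numpy as np
from keras import Input, Model
from keras.layers import Dense
from keras.callbacks import Callback, EarlyStopping


class MemoryLogger(Callback):
    """Stand-in for LogCallback: print the epoch metrics at the end of every epoch."""
    def on_epoch_end(self, epoch, logs=None):
        print(f"epoch {epoch + 1}: {logs}", flush=True)


patience = 5
max_epochs = 100
early_stopping = EarlyStopping(monitor="val_loss",
                               patience=patience,
                               mode="min",
                               restore_best_weights=True,
                               verbose=1)

inputs = Input(shape=(8,))
outputs = Dense(1, activation="sigmoid")(Dense(16, activation="relu")(inputs))
model = Model(inputs, outputs)
model.compile(optimizer="adam", loss="binary_crossentropy")

x = np.random.rand(256, 8).astype("float32")
y = (np.random.rand(256) > 0.5).astype("float32")
# The same callbacks list can be handed to keras-tuner's tuner.search(...).
model.fit(x, y, validation_split=0.2, epochs=max_epochs, batch_size=32,
          callbacks=[early_stopping, MemoryLogger()], verbose=2)
```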
4 changes: 3 additions & 1 deletion experiment/main.sh
@@ -12,7 +12,9 @@ now=$(date +%Y%m%d_%H%M%S)
RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results
mkdir -vp ${RESULT_DIR}

${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData-master --jobs $(nproc) | tee ${RESULT_DIR}/${now}.train.log
# set env TUNER to use keras-tuner
TUNER=--tuner
${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData-master --jobs $(nproc) ${TUNER} | tee ${RESULT_DIR}/${now}.train.log
error_code=${PIPESTATUS}
if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

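main.sh now forwards an optional `--tuner` switch to main.py, while still propagating the Python exit code through PIPESTATUS despite the pipe into tee. A hypothetical argparse sketch of how such a switch could map onto `use_tuner`; the real option handling in experiment/main.py may differ:

```python
import argparse

# Toy stand-in for the argument parsing in experiment/main.py (names assumed).
parser = argparse.ArgumentParser()
parser.add_argument("--data", dest="cred_data_location", required=True)
parser.add_argument("--jobs", type=int, default=4)
parser.add_argument("--tuner", action="store_true", help="enable keras-tuner search")

# Simulates: main.py --data ~/w/CredData-master --jobs 8 --tuner
args = parser.parse_args(["--data", "~/w/CredData-master", "--jobs", "8", "--tuner"])
print(args.cred_data_location, args.jobs, args.tuner)  # ... 8 True -> main(..., use_tuner=True)
```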
2 changes: 1 addition & 1 deletion experiment/src/data_loader.py
@@ -150,7 +150,7 @@ def read_text(path) -> list[str]:
with open(path, "r", encoding="utf8") as f:
return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')

positive_lines = set((x[0], x[1]) for x, y in meta_data if 'T' == y["GroundTruth"])
positive_lines = set((x[0], x[1]) for x, y in meta_data.items() if 'T' == y["GroundTruth"])
values = []
detected_rules: Set[str] = set()
for index, line_data in detected_data.items():
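The fix adds `.items()`: iterating a dict directly yields only its keys, so the previous `for x, y in meta_data` unpacked each key instead of a (key, value) pair. A tiny reproduction with an assumed toy layout of `meta_data`:

```python
# Toy layout: keys are (path, line) pairs, values are metadata dicts (assumed for illustration).
meta_data = {
    ("src/a.py", 10): {"GroundTruth": "T"},
    ("src/b.py", 20): {"GroundTruth": "F"},
}

# Fixed form: .items() yields ((path, line), metadata) pairs.
positive_lines = set((x[0], x[1]) for x, y in meta_data.items() if 'T' == y["GroundTruth"])
print(positive_lines)  # {('src/a.py', 10)}

# Broken form: iterating the dict itself yields keys, so x, y = ("src/a.py", 10),
# y becomes the line number and y["GroundTruth"] raises a TypeError.
```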
20 changes: 20 additions & 0 deletions experiment/src/log_callback.py
@@ -0,0 +1,20 @@
import datetime

from keras.src.callbacks import Callback
import psutil


class LogCallback(Callback):
def __init__(self):
super().__init__()

@staticmethod
def get_memory_info():
process = psutil.Process()
memory_info = process.memory_info()
return str(memory_info)

def on_epoch_end(self, epoch, logs=None):
print(str(datetime.datetime.now()), flush=True)
print(f"{epoch + 1}:{self.get_memory_info()}", flush=True)
print(logs, flush=True)
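For reference, `psutil.Process().memory_info()` returns an OS-dependent named tuple of byte counts (rss, vms, ...), which is what LogCallback prints after each epoch. A quick standalone check:

```python
import psutil

# Memory statistics of the current process, in bytes.
info = psutil.Process().memory_info()
print(info)                                   # e.g. pmem(rss=..., vms=..., ...)
print(f"{info.rss / 2**20:.1f} MiB resident")
```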
31 changes: 17 additions & 14 deletions experiment/src/lstm_model.py
@@ -12,11 +12,11 @@ class MlModel(kt.HyperModel):
d_type = "float32"

def __init__(
self,
line_shape: tuple,
variable_shape: tuple,
value_shape: tuple,
feature_shape: tuple,
self,
line_shape: tuple,
variable_shape: tuple,
value_shape: tuple,
feature_shape: tuple,
):
self.line_shape = line_shape
self.variable_shape = variable_shape
@@ -25,16 +25,19 @@ def __init__(

def build(self, hp=None) -> Model:
"""Get keras model with string and feature input and single binary out"""
min_val = 0.11
max_val = 0.44
step_val = (max_val - min_val) / 3
if hp:
dropout_line = hp.Float('dropout_line', min_value=0.271828, max_value=0.314159, step=0.042331)
dropout_variable = hp.Float('dropout_variable', min_value=0.271828, max_value=0.314159, step=0.042331)
dropout_value = hp.Float('dropout_value', min_value=0.271828, max_value=0.314159, step=0.042331)
dropout_dense = hp.Float('dropout_dense', min_value=0.271828, max_value=0.314159, step=0.042331)
dropout_line = hp.Float('dropout_line', min_value=min_val, max_value=max_val, step=step_val)
dropout_variable = hp.Float('dropout_variable', min_value=min_val, max_value=max_val, step=step_val)
dropout_value = hp.Float('dropout_value', min_value=min_val, max_value=max_val, step=step_val)
dropout_dense = hp.Float('dropout_dense', min_value=min_val, max_value=max_val, step=step_val)
else:
dropout_line = 0.314159
dropout_variable = 0.271828
dropout_value = 0.271828
dropout_dense = 0.271828
dropout_line = min_val
dropout_variable = max_val
dropout_value = max_val
dropout_dense = max_val

line_input = Input(shape=(None, self.line_shape[2]), name="line_input", dtype=self.d_type)
line_lstm = LSTM(units=self.line_shape[1], dtype=self.d_type)
@@ -52,7 +55,7 @@ def build(self, hp=None) -> Model:
value_bidirectional = Bidirectional(layer=value_lstm, name="value_bidirectional")
value_lstm_branch = Dropout(dropout_value, name="value_dropout")(value_bidirectional(value_input))

feature_input = Input(shape=(self.feature_shape[1], ), name="feature_input", dtype=self.d_type)
feature_input = Input(shape=(self.feature_shape[1],), name="feature_input", dtype=self.d_type)

joined_features = Concatenate()([line_lstm_branch, variable_lstm_branch, value_lstm_branch, feature_input])

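The dropout search space is now expressed through `min_val`, `max_val`, and a step that splits the range into three equal increments, so each `hp.Float(...)` can take four values during the grid search. A quick check of the resulting grid (plain Python, no keras-tuner required):

```python
min_val = 0.11
max_val = 0.44
step_val = (max_val - min_val) / 3  # 0.11

# Values a grid search over hp.Float(min_value=min_val, max_value=max_val, step=step_val) can visit.
grid = [round(min_val + i * step_val, 6) for i in range(4)]
print(grid)  # [0.11, 0.22, 0.33, 0.44]
```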
24 changes: 21 additions & 3 deletions tests/utils/test_entropy_validator.py
@@ -1,5 +1,6 @@
import unittest

from credsweeper.common.constants import Chars
from credsweeper.utils.entropy_validator import EntropyValidator


@@ -8,11 +9,28 @@ class TestUtils(unittest.TestCase):
def test_validator_n(self):
self.assertEqual("None None None", str(EntropyValidator(None)))
self.assertEqual("HEX_CHARS 0 False", str(EntropyValidator("")))
self.assertEqual("BASE64STD_CHARS 2.321928 False", str(EntropyValidator("12345")))
self.assertEqual("BASE64STD_CHARS 2.321928 False", str(EntropyValidator("/home")))
self.assertEqual("BASE64STDPAD_CHARS 2.321928 False", str(EntropyValidator("12345")))
self.assertEqual("BASE64STDPAD_CHARS 2.321928 False", str(EntropyValidator("/home")))

def test_validator_p(self):
self.assertEqual("HEX_CHARS 3.584963 True", str(EntropyValidator("abcdefABCDEF")))
self.assertEqual("BASE36_CHARS 3.169925 True", str(EntropyValidator("123456789")))
self.assertEqual("BASE64STD_CHARS 4.681881 True",
self.assertEqual("BASE64STDPAD_CHARS 4.681881 True",
str(EntropyValidator("dNJKHBD34534928DRFCsnkjBUygtrd+32sd/uy")))

def test_validator_max_n(self):
entropy_validator = EntropyValidator(Chars.BASE64URL_CHARS.value, Chars.BASE64URL_CHARS)
self.assertFalse(entropy_validator.valid)

def test_validator_max_p(self):
entropy_validator = EntropyValidator(Chars.BASE64STDPAD_CHARS.value, Chars.BASE64STDPAD_CHARS)
self.assertTrue(entropy_validator.valid)

def test_validator_min_n(self):
# not mentioned iterator
entropy_validator = EntropyValidator(Chars.HEX_CHARS.value, Chars.ASCII_PRINTABLE)
self.assertFalse(entropy_validator.valid)

def test_validator_min_p(self):
entropy_validator = EntropyValidator(Chars.HEX_CHARS.value, Chars.HEX_CHARS)
self.assertTrue(entropy_validator.valid)
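The new tests pass a `Chars` member as the second constructor argument, which, as the assertions suggest, pins the character set instead of letting the validator infer one from the data. A small usage sketch, assuming only what the tests above show:

```python
from credsweeper.common.constants import Chars
from credsweeper.utils.entropy_validator import EntropyValidator

detected = EntropyValidator("abcdefABCDEF")                          # charset inferred -> HEX_CHARS
pinned = EntropyValidator(Chars.HEX_CHARS.value, Chars.HEX_CHARS)    # charset given explicitly
print(str(detected))   # "HEX_CHARS 3.584963 True" per test_validator_p
print(pinned.valid)    # True per test_validator_min_p
```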
