diff --git a/credsweeper/utils/entropy_validator.py b/credsweeper/utils/entropy_validator.py index f9a82a259..bbe6aa503 100644 --- a/credsweeper/utils/entropy_validator.py +++ b/credsweeper/utils/entropy_validator.py @@ -8,7 +8,7 @@ class EntropyValidator: """Verifies data entropy with base64, base36 and base16(hex)""" CHARS_LIMIT_MAP = { - Chars.BASE64STD_CHARS: ENTROPY_LIMIT_BASE64, + Chars.BASE64STDPAD_CHARS: ENTROPY_LIMIT_BASE64, Chars.BASE36_CHARS: ENTROPY_LIMIT_BASE3x, Chars.HEX_CHARS: ENTROPY_LIMIT_BASE3x } diff --git a/experiment/main.py b/experiment/main.py index 14dc7a5b7..5d9515150 100644 --- a/experiment/main.py +++ b/experiment/main.py @@ -54,7 +54,6 @@ def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str: - print(f"Memory at start: {LogCallback.get_memory_info()}") current_time = datetime.now().strftime("%Y%m%d_%H%M%S") @@ -140,8 +139,10 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str: max_epochs = 100 # ^^^ the line is patched in GitHub action to speed-up test train batch_size = 4096 - log_callback = LogCallback() + patience = 5 + #return + log_callback = LogCallback() if use_tuner: tuner = kt.GridSearch( hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape), @@ -150,16 +151,16 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str: project_name='ml_tuning', ) search_early_stopping = EarlyStopping(monitor="val_loss", - patience=2, + patience=patience, mode="min", restore_best_weights=True, verbose=1) tuner.search( x=[x_train_line, x_train_variable, x_train_value, x_train_features], y=y_train, - epochs=3, + epochs=max_epochs, batch_size=batch_size, - callbacks=[search_early_stopping], + callbacks=[search_early_stopping, log_callback], validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test), verbose=2, ) @@ -172,7 +173,11 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str: keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape).build() - early_stopping = EarlyStopping(monitor="val_loss", patience=7, mode="min", restore_best_weights=True, verbose=1) + early_stopping = EarlyStopping(monitor="val_loss", + patience=patience, + mode="min", + restore_best_weights=True, + verbose=1) model_checkpoint = ModelCheckpoint(filepath=str(dir_path / f"{current_time}.best_model"), monitor="val_loss", save_best_only=True, diff --git a/experiment/main.sh b/experiment/main.sh index 2f972e916..cc4d9eec3 100755 --- a/experiment/main.sh +++ b/experiment/main.sh @@ -12,7 +12,9 @@ now=$(date +%Y%m%d_%H%M%S) RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results mkdir -vp ${RESULT_DIR} -${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData-master --jobs $(nproc) | tee ${RESULT_DIR}/${now}.train.log +# set env TUNER to use keras-tuner +TUNER=--tuner +${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData-master --jobs $(nproc) ${TUNER} | tee ${RESULT_DIR}/${now}.train.log error_code=${PIPESTATUS} if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi diff --git a/experiment/src/data_loader.py b/experiment/src/data_loader.py index 37a611a48..8d2373b22 100644 --- a/experiment/src/data_loader.py +++ b/experiment/src/data_loader.py @@ -150,7 +150,7 @@ def read_text(path) -> list[str]: with open(path, "r", encoding="utf8") as f: return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n') - positive_lines = set((x[0], x[1]) for x, y in meta_data if 'T' == y["GroundTruth"]) + positive_lines = set((x[0], x[1]) for x, y in meta_data.items() if 'T' == y["GroundTruth"]) values = [] detected_rules: Set[str] = set() for index, line_data in detected_data.items(): diff --git a/experiment/src/log_callback.py b/experiment/src/log_callback.py new file mode 100644 index 000000000..49a47c780 --- /dev/null +++ b/experiment/src/log_callback.py @@ -0,0 +1,20 @@ +import datetime + +from keras.src.callbacks import Callback +import psutil + + +class LogCallback(Callback): + def __init__(self): + super().__init__() + + @staticmethod + def get_memory_info(): + process = psutil.Process() + memory_info = process.memory_info() + return str(memory_info) + + def on_epoch_end(self, epoch, logs=None): + print(str(datetime.datetime.now()), flush=True) + print(f"{epoch + 1}:{self.get_memory_info()}", flush=True) + print(logs, flush=True) diff --git a/experiment/src/lstm_model.py b/experiment/src/lstm_model.py index 18397bd10..f00ba82c0 100644 --- a/experiment/src/lstm_model.py +++ b/experiment/src/lstm_model.py @@ -12,11 +12,11 @@ class MlModel(kt.HyperModel): d_type = "float32" def __init__( - self, - line_shape: tuple, - variable_shape: tuple, - value_shape: tuple, - feature_shape: tuple, + self, + line_shape: tuple, + variable_shape: tuple, + value_shape: tuple, + feature_shape: tuple, ): self.line_shape = line_shape self.variable_shape = variable_shape @@ -25,16 +25,19 @@ def __init__( def build(self, hp=None) -> Model: """Get keras model with string and feature input and single binary out""" + min_val = 0.11 + max_val = 0.44 + step_val = (max_val - min_val) / 3 if hp: - dropout_line = hp.Float('dropout_line', min_value=0.271828, max_value=0.314159, step=0.042331) - dropout_variable = hp.Float('dropout_variable', min_value=0.271828, max_value=0.314159, step=0.042331) - dropout_value = hp.Float('dropout_value', min_value=0.271828, max_value=0.314159, step=0.042331) - dropout_dense = hp.Float('dropout_dense', min_value=0.271828, max_value=0.314159, step=0.042331) + dropout_line = hp.Float('dropout_line', min_value=min_val, max_value=max_val, step=step_val) + dropout_variable = hp.Float('dropout_variable', min_value=min_val, max_value=max_val, step=step_val) + dropout_value = hp.Float('dropout_value', min_value=min_val, max_value=max_val, step=step_val) + dropout_dense = hp.Float('dropout_dense', min_value=min_val, max_value=max_val, step=step_val) else: - dropout_line = 0.314159 - dropout_variable = 0.271828 - dropout_value = 0.271828 - dropout_dense = 0.271828 + dropout_line = min_val + dropout_variable = max_val + dropout_value = max_val + dropout_dense = max_val line_input = Input(shape=(None, self.line_shape[2]), name="line_input", dtype=self.d_type) line_lstm = LSTM(units=self.line_shape[1], dtype=self.d_type) @@ -52,7 +55,7 @@ def build(self, hp=None) -> Model: value_bidirectional = Bidirectional(layer=value_lstm, name="value_bidirectional") value_lstm_branch = Dropout(dropout_value, name="value_dropout")(value_bidirectional(value_input)) - feature_input = Input(shape=(self.feature_shape[1], ), name="feature_input", dtype=self.d_type) + feature_input = Input(shape=(self.feature_shape[1],), name="feature_input", dtype=self.d_type) joined_features = Concatenate()([line_lstm_branch, variable_lstm_branch, value_lstm_branch, feature_input]) diff --git a/tests/utils/test_entropy_validator.py b/tests/utils/test_entropy_validator.py index 45d76121d..fd552919f 100644 --- a/tests/utils/test_entropy_validator.py +++ b/tests/utils/test_entropy_validator.py @@ -1,5 +1,6 @@ import unittest +from credsweeper.common.constants import Chars from credsweeper.utils.entropy_validator import EntropyValidator @@ -8,11 +9,28 @@ class TestUtils(unittest.TestCase): def test_validator_n(self): self.assertEqual("None None None", str(EntropyValidator(None))) self.assertEqual("HEX_CHARS 0 False", str(EntropyValidator(""))) - self.assertEqual("BASE64STD_CHARS 2.321928 False", str(EntropyValidator("12345"))) - self.assertEqual("BASE64STD_CHARS 2.321928 False", str(EntropyValidator("/home"))) + self.assertEqual("BASE64STDPAD_CHARS 2.321928 False", str(EntropyValidator("12345"))) + self.assertEqual("BASE64STDPAD_CHARS 2.321928 False", str(EntropyValidator("/home"))) def test_validator_p(self): self.assertEqual("HEX_CHARS 3.584963 True", str(EntropyValidator("abcdefABCDEF"))) self.assertEqual("BASE36_CHARS 3.169925 True", str(EntropyValidator("123456789"))) - self.assertEqual("BASE64STD_CHARS 4.681881 True", + self.assertEqual("BASE64STDPAD_CHARS 4.681881 True", str(EntropyValidator("dNJKHBD34534928DRFCsnkjBUygtrd+32sd/uy"))) + + def test_validator_max_n(self): + entropy_validator = EntropyValidator(Chars.BASE64URL_CHARS.value, Chars.BASE64URL_CHARS) + self.assertFalse(entropy_validator.valid) + + def test_validator_max_p(self): + entropy_validator = EntropyValidator(Chars.BASE64STDPAD_CHARS.value, Chars.BASE64STDPAD_CHARS) + self.assertTrue(entropy_validator.valid) + + def test_validator_min_n(self): + # not mentioned iterator + entropy_validator = EntropyValidator(Chars.HEX_CHARS.value, Chars.ASCII_PRINTABLE) + self.assertFalse(entropy_validator.valid) + + def test_validator_min_p(self): + entropy_validator = EntropyValidator(Chars.HEX_CHARS.value, Chars.HEX_CHARS) + self.assertTrue(entropy_validator.valid)