
Commit

fix&tests
babenek committed Nov 24, 2024
1 parent d8f7e06 commit e11cd05
Showing 7 changed files with 74 additions and 26 deletions.
2 changes: 1 addition & 1 deletion credsweeper/utils/entropy_validator.py
@@ -8,7 +8,7 @@
class EntropyValidator:
"""Verifies data entropy with base64, base36 and base16(hex)"""
CHARS_LIMIT_MAP = {
Chars.BASE64STD_CHARS: ENTROPY_LIMIT_BASE64,
Chars.BASE64STDPAD_CHARS: ENTROPY_LIMIT_BASE64,
Chars.BASE36_CHARS: ENTROPY_LIMIT_BASE3x,
Chars.HEX_CHARS: ENTROPY_LIMIT_BASE3x
}
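The mapping now keys the base64 entropy limit on the padded standard alphabet (BASE64STDPAD_CHARS). As a rough illustration of what a charset-plus-limit check does, here is a minimal sketch assuming a Shannon-entropy measure and an illustrative threshold; `looks_random_base64` and the 4.5 cutoff are hypothetical, not CredSweeper's API (the real ENTROPY_LIMIT_BASE64 is defined elsewhere in the package).

```python
import math
from collections import Counter


def shannon_entropy(data: str) -> float:
    """Shannon entropy of the string, in bits per symbol."""
    if not data:
        return 0.0
    counts = Counter(data)
    total = len(data)
    return -sum((n / total) * math.log2(n / total) for n in counts.values())


ENTROPY_LIMIT_BASE64 = 4.5  # illustrative threshold only


def looks_random_base64(candidate: str) -> bool:
    """Hypothetical helper: charset membership check plus entropy threshold."""
    base64_alphabet = set(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=")
    return set(candidate) <= base64_alphabet and shannon_entropy(candidate) > ENTROPY_LIMIT_BASE64


print(looks_random_base64("dNJKHBD34534928DRFCsnkjBUygtrd+32sd/uy"))  # True, entropy ~4.68
print(looks_random_base64("/home"))                                   # False, entropy ~2.32
```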
17 changes: 11 additions & 6 deletions experiment/main.py
@@ -54,7 +54,6 @@ def evaluate_model(thresholds: dict, keras_model: Model, x_data: List[np.ndarray


def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:

print(f"Memory at start: {LogCallback.get_memory_info()}")

current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
@@ -140,8 +139,10 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
max_epochs = 100
# ^^^ the line is patched in GitHub action to speed-up test train
batch_size = 4096
log_callback = LogCallback()
patience = 5
#return

log_callback = LogCallback()
if use_tuner:
tuner = kt.GridSearch(
hypermodel=MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape, x_full_features.shape),
@@ -150,16 +151,16 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
project_name='ml_tuning',
)
search_early_stopping = EarlyStopping(monitor="val_loss",
patience=2,
patience=patience,
mode="min",
restore_best_weights=True,
verbose=1)
tuner.search(
x=[x_train_line, x_train_variable, x_train_value, x_train_features],
y=y_train,
epochs=3,
epochs=max_epochs,
batch_size=batch_size,
callbacks=[search_early_stopping],
callbacks=[search_early_stopping, log_callback],
validation_data=([x_test_line, x_test_variable, x_test_value, x_test_features], y_test),
verbose=2,
)
@@ -172,7 +173,11 @@ def main(cred_data_location: str, jobs: int, use_tuner: bool = False) -> str:
keras_model = MlModel(x_full_line.shape, x_full_variable.shape, x_full_value.shape,
x_full_features.shape).build()

early_stopping = EarlyStopping(monitor="val_loss", patience=7, mode="min", restore_best_weights=True, verbose=1)
early_stopping = EarlyStopping(monitor="val_loss",
patience=patience,
mode="min",
restore_best_weights=True,
verbose=1)
model_checkpoint = ModelCheckpoint(filepath=str(dir_path / f"{current_time}.best_model"),
monitor="val_loss",
save_best_only=True,
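The training script now shares one `patience` value between the tuner search and the final fit, lets the tuner run up to `max_epochs` under early stopping, and attaches the new `LogCallback` in both places. A minimal, self-contained sketch of that callback wiring, using a toy model and random data rather than the experiment's real inputs:

```python
import numpy as np
from keras import Input, Model
from keras.layers import Dense
from keras.callbacks import Callback, EarlyStopping


class MemoryLogger(Callback):
    """Stand-in for LogCallback: print the epoch metrics at the end of every epoch."""
    def on_epoch_end(self, epoch, logs=None):
        print(f"epoch {epoch + 1}: {logs}", flush=True)


patience = 5
max_epochs = 100
early_stopping = EarlyStopping(monitor="val_loss",
                               patience=patience,
                               mode="min",
                               restore_best_weights=True,
                               verbose=1)

inputs = Input(shape=(8,))
outputs = Dense(1, activation="sigmoid")(Dense(16, activation="relu")(inputs))
model = Model(inputs, outputs)
model.compile(optimizer="adam", loss="binary_crossentropy")

x = np.random.rand(256, 8).astype("float32")
y = (np.random.rand(256) > 0.5).astype("float32")
# The same callbacks list can be handed to keras-tuner's tuner.search(...).
model.fit(x, y, validation_split=0.2, epochs=max_epochs, batch_size=32,
          callbacks=[early_stopping, MemoryLogger()], verbose=2)
```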
4 changes: 3 additions & 1 deletion experiment/main.sh
@@ -12,7 +12,9 @@ now=$(date +%Y%m%d_%H%M%S)
RESULT_DIR=${CREDSWEEPER_DIR}/experiment/results
mkdir -vp ${RESULT_DIR}

${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData-master --jobs $(nproc) | tee ${RESULT_DIR}/${now}.train.log
# set env TUNER to use keras-tuner
TUNER=--tuner
${CREDSWEEPER_DIR}/.venv/bin/python main.py --data ~/w/CredData-master --jobs $(nproc) ${TUNER} | tee ${RESULT_DIR}/${now}.train.log
error_code=${PIPESTATUS}
if [ 0 -ne ${error_code} ]; then exit ${error_code}; fi

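main.sh now forwards an optional `--tuner` switch to main.py, while still propagating the Python exit code through PIPESTATUS despite the pipe into tee. A hypothetical argparse sketch of how such a switch could map onto `use_tuner`; the real option handling in experiment/main.py may differ:

```python
import argparse

# Toy stand-in for the argument parsing in experiment/main.py (names assumed).
parser = argparse.ArgumentParser()
parser.add_argument("--data", dest="cred_data_location", required=True)
parser.add_argument("--jobs", type=int, default=4)
parser.add_argument("--tuner", action="store_true", help="enable keras-tuner search")

# Simulates: main.py --data ~/w/CredData-master --jobs 8 --tuner
args = parser.parse_args(["--data", "~/w/CredData-master", "--jobs", "8", "--tuner"])
print(args.cred_data_location, args.jobs, args.tuner)  # ... 8 True -> main(..., use_tuner=True)
```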
2 changes: 1 addition & 1 deletion experiment/src/data_loader.py
@@ -150,7 +150,7 @@ def read_text(path) -> list[str]:
with open(path, "r", encoding="utf8") as f:
return f.read().replace("\r\n", '\n').replace('\r', '\n').split('\n')

positive_lines = set((x[0], x[1]) for x, y in meta_data if 'T' == y["GroundTruth"])
positive_lines = set((x[0], x[1]) for x, y in meta_data.items() if 'T' == y["GroundTruth"])
values = []
detected_rules: Set[str] = set()
for index, line_data in detected_data.items():
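The fix adds `.items()`: iterating a dict directly yields only its keys, so the previous `for x, y in meta_data` unpacked each key instead of a (key, value) pair. A tiny reproduction with an assumed toy layout of `meta_data`:

```python
# Toy layout: keys are (path, line) pairs, values are metadata dicts (assumed for illustration).
meta_data = {
    ("src/a.py", 10): {"GroundTruth": "T"},
    ("src/b.py", 20): {"GroundTruth": "F"},
}

# Fixed form: .items() yields ((path, line), metadata) pairs.
positive_lines = set((x[0], x[1]) for x, y in meta_data.items() if 'T' == y["GroundTruth"])
print(positive_lines)  # {('src/a.py', 10)}

# Broken form: iterating the dict itself yields keys, so x, y = ("src/a.py", 10),
# y becomes the line number and y["GroundTruth"] raises a TypeError.
```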
20 changes: 20 additions & 0 deletions experiment/src/log_callback.py
@@ -0,0 +1,20 @@
import datetime

from keras.src.callbacks import Callback
import psutil


class LogCallback(Callback):
def __init__(self):
super().__init__()

@staticmethod
def get_memory_info():
process = psutil.Process()
memory_info = process.memory_info()
return str(memory_info)

def on_epoch_end(self, epoch, logs=None):
print(str(datetime.datetime.now()), flush=True)
print(f"{epoch + 1}:{self.get_memory_info()}", flush=True)
print(logs, flush=True)
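For reference, `psutil.Process().memory_info()` returns an OS-dependent named tuple of byte counts (rss, vms, ...), which is what LogCallback prints after each epoch. A quick standalone check:

```python
import psutil

# Memory statistics of the current process, in bytes.
info = psutil.Process().memory_info()
print(info)                                   # e.g. pmem(rss=..., vms=..., ...)
print(f"{info.rss / 2**20:.1f} MiB resident")
```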
31 changes: 17 additions & 14 deletions experiment/src/lstm_model.py
@@ -12,11 +12,11 @@ class MlModel(kt.HyperModel):
d_type = "float32"

def __init__(
self,
line_shape: tuple,
variable_shape: tuple,
value_shape: tuple,
feature_shape: tuple,
self,
line_shape: tuple,
variable_shape: tuple,
value_shape: tuple,
feature_shape: tuple,
):
self.line_shape = line_shape
self.variable_shape = variable_shape
@@ -25,16 +25,19 @@ def __init__(

def build(self, hp=None) -> Model:
"""Get keras model with string and feature input and single binary out"""
min_val = 0.11
max_val = 0.44
step_val = (max_val - min_val) / 3
if hp:
dropout_line = hp.Float('dropout_line', min_value=0.271828, max_value=0.314159, step=0.042331)
dropout_variable = hp.Float('dropout_variable', min_value=0.271828, max_value=0.314159, step=0.042331)
dropout_value = hp.Float('dropout_value', min_value=0.271828, max_value=0.314159, step=0.042331)
dropout_dense = hp.Float('dropout_dense', min_value=0.271828, max_value=0.314159, step=0.042331)
dropout_line = hp.Float('dropout_line', min_value=min_val, max_value=max_val, step=step_val)
dropout_variable = hp.Float('dropout_variable', min_value=min_val, max_value=max_val, step=step_val)
dropout_value = hp.Float('dropout_value', min_value=min_val, max_value=max_val, step=step_val)
dropout_dense = hp.Float('dropout_dense', min_value=min_val, max_value=max_val, step=step_val)
else:
dropout_line = 0.314159
dropout_variable = 0.271828
dropout_value = 0.271828
dropout_dense = 0.271828
dropout_line = min_val
dropout_variable = max_val
dropout_value = max_val
dropout_dense = max_val

line_input = Input(shape=(None, self.line_shape[2]), name="line_input", dtype=self.d_type)
line_lstm = LSTM(units=self.line_shape[1], dtype=self.d_type)
@@ -52,7 +55,7 @@ def build(self, hp=None) -> Model:
value_bidirectional = Bidirectional(layer=value_lstm, name="value_bidirectional")
value_lstm_branch = Dropout(dropout_value, name="value_dropout")(value_bidirectional(value_input))

feature_input = Input(shape=(self.feature_shape[1], ), name="feature_input", dtype=self.d_type)
feature_input = Input(shape=(self.feature_shape[1],), name="feature_input", dtype=self.d_type)

joined_features = Concatenate()([line_lstm_branch, variable_lstm_branch, value_lstm_branch, feature_input])

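The dropout search space is now expressed through `min_val`, `max_val`, and a step that splits the range into three equal increments, so each `hp.Float(...)` can take four values during the grid search. A quick check of the resulting grid (plain Python, no keras-tuner required):

```python
min_val = 0.11
max_val = 0.44
step_val = (max_val - min_val) / 3  # 0.11

# Values a grid search over hp.Float(min_value=min_val, max_value=max_val, step=step_val) can visit.
grid = [round(min_val + i * step_val, 6) for i in range(4)]
print(grid)  # [0.11, 0.22, 0.33, 0.44]
```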
24 changes: 21 additions & 3 deletions tests/utils/test_entropy_validator.py
@@ -1,5 +1,6 @@
import unittest

from credsweeper.common.constants import Chars
from credsweeper.utils.entropy_validator import EntropyValidator


@@ -8,11 +9,28 @@ class TestUtils(unittest.TestCase):
def test_validator_n(self):
self.assertEqual("None None None", str(EntropyValidator(None)))
self.assertEqual("HEX_CHARS 0 False", str(EntropyValidator("")))
self.assertEqual("BASE64STD_CHARS 2.321928 False", str(EntropyValidator("12345")))
self.assertEqual("BASE64STD_CHARS 2.321928 False", str(EntropyValidator("/home")))
self.assertEqual("BASE64STDPAD_CHARS 2.321928 False", str(EntropyValidator("12345")))
self.assertEqual("BASE64STDPAD_CHARS 2.321928 False", str(EntropyValidator("/home")))

def test_validator_p(self):
self.assertEqual("HEX_CHARS 3.584963 True", str(EntropyValidator("abcdefABCDEF")))
self.assertEqual("BASE36_CHARS 3.169925 True", str(EntropyValidator("123456789")))
self.assertEqual("BASE64STD_CHARS 4.681881 True",
self.assertEqual("BASE64STDPAD_CHARS 4.681881 True",
str(EntropyValidator("dNJKHBD34534928DRFCsnkjBUygtrd+32sd/uy")))

def test_validator_max_n(self):
entropy_validator = EntropyValidator(Chars.BASE64URL_CHARS.value, Chars.BASE64URL_CHARS)
self.assertFalse(entropy_validator.valid)

def test_validator_max_p(self):
entropy_validator = EntropyValidator(Chars.BASE64STDPAD_CHARS.value, Chars.BASE64STDPAD_CHARS)
self.assertTrue(entropy_validator.valid)

def test_validator_min_n(self):
# not mentioned iterator
entropy_validator = EntropyValidator(Chars.HEX_CHARS.value, Chars.ASCII_PRINTABLE)
self.assertFalse(entropy_validator.valid)

def test_validator_min_p(self):
entropy_validator = EntropyValidator(Chars.HEX_CHARS.value, Chars.HEX_CHARS)
self.assertTrue(entropy_validator.valid)
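The new tests pass a `Chars` member as the second constructor argument, which, as the assertions suggest, pins the character set instead of letting the validator infer one from the data. A small usage sketch, assuming only what the tests above show:

```python
from credsweeper.common.constants import Chars
from credsweeper.utils.entropy_validator import EntropyValidator

detected = EntropyValidator("abcdefABCDEF")                          # charset inferred -> HEX_CHARS
pinned = EntropyValidator(Chars.HEX_CHARS.value, Chars.HEX_CHARS)    # charset given explicitly
print(str(detected))   # "HEX_CHARS 3.584963 True" per test_validator_p
print(pinned.valid)    # True per test_validator_min_p
```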
