GH-7118 - add possibility to disable training metrics via API
valenad1 committed Nov 16, 2023
1 parent 4a016c7 commit db3a710
Showing 6 changed files with 42 additions and 6 deletions.
@@ -25,6 +25,7 @@ public static final class ExtendedIsolationForestParametersV3 extends ModelParam
"sample_size",
"extension_level",
"seed",
"disable_training_metrics"
};

@API(help = "Number of Extended Isolation Forest trees.", gridable = true)
@@ -42,5 +43,8 @@ public static final class ExtendedIsolationForestParametersV3 extends ModelParam

@API(help="Score the model after every so many trees. Disabled if set to 0.", level = API.Level.secondary, gridable = false)
public int score_tree_interval;

@API(help = "Disable calculating training metrics (expensive on large datasets)")
public boolean disable_training_metrics;
}
}
2 changes: 2 additions & 0 deletions h2o-docs/src/product/data-science/eif.rst
@@ -53,6 +53,8 @@ Algorithm-specific parameters

- `sample_size <algo-params/sample_size.html>`__: The number of randomly sampled observations used to train each Extended Isolation Forest tree. This option defaults to ``256``.

- **disable_training_metrics**: Disable calculating training metrics (expensive on large datasets). This option defaults to ``True``, meaning training metrics are not calculated by default; see the sketch below for how to re-enable them.

Shared tree-algorithm parameters
''''''''''''''''''''''''''''''''

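A minimal Python sketch of the documented behavior: with the new default (``disable_training_metrics=True``) no training metrics are computed and ``scoring_history()`` returns ``None``, while passing ``False`` restores the previous behavior. The ``h2o.init()`` call and the dataset path are illustrative assumptions, not part of this commit.

```python
import h2o
from h2o.estimators import H2OExtendedIsolationForestEstimator

h2o.init()
train = h2o.import_file("single_blob.csv")  # hypothetical path to a small anomaly dataset

# New default: training metrics are skipped, so no scoring history is kept.
fast_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=1234)
fast_model.train(training_frame=train)
print(fast_model.scoring_history())  # expected: None

# Opt back in to training metrics (pre-commit behavior).
scored_model = H2OExtendedIsolationForestEstimator(
    ntrees=10, seed=1234, score_each_iteration=True, disable_training_metrics=False
)
scored_model.train(training_frame=train)
print(scored_model.scoring_history())  # one row per tree plus the initial empty row
```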
19 changes: 19 additions & 0 deletions h2o-py/h2o/estimators/extended_isolation_forest.py
@@ -42,6 +42,7 @@ def __init__(self,
sample_size=256, # type: int
extension_level=0, # type: int
seed=-1, # type: int
disable_training_metrics=True, # type: bool
):
"""
:param model_id: Destination id for this model; auto-generated if not specified.
@@ -79,6 +80,9 @@ def __init__(self,
:param seed: Seed for pseudo random number generator (if applicable)
Defaults to ``-1``.
:type seed: int
:param disable_training_metrics: Disable calculating training metrics (expensive on large datasets)
Defaults to ``True``.
:type disable_training_metrics: bool
"""
super(H2OExtendedIsolationForestEstimator, self).__init__()
self._parms = {}
@@ -93,6 +97,7 @@ def __init__(self,
self.sample_size = sample_size
self.extension_level = extension_level
self.seed = seed
self.disable_training_metrics = disable_training_metrics

@property
def training_frame(self):
@@ -314,4 +319,18 @@ def seed(self, seed):
assert_is_type(seed, None, int)
self._parms["seed"] = seed

@property
def disable_training_metrics(self):
"""
Disable calculating training metrics (expensive on large datasets)
Type: ``bool``, defaults to ``True``.
"""
return self._parms.get("disable_training_metrics")

@disable_training_metrics.setter
def disable_training_metrics(self, disable_training_metrics):
assert_is_type(disable_training_metrics, None, bool)
self._parms["disable_training_metrics"] = disable_training_metrics


@@ -11,15 +11,19 @@ def extended_isolation_forest_scoring_history():

train = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/single_blob.csv"))

eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1,
score_each_iteration=True)
eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1)
eif_model.train(training_frame=train)
print(eif_model.scoring_history())
assert_equals(None, eif_model.scoring_history(), "No scoring history by default")

eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1,
score_each_iteration=True, disable_training_metrics=False)
eif_model.train(training_frame=train)
print(eif_model.scoring_history())
assert_equals(11, len(eif_model.scoring_history()), "There should be one empty row and one row for each tree")

eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1,
score_tree_interval=3)
score_tree_interval=3, disable_training_metrics=False)
eif_model.train(training_frame=train)
print(eif_model.scoring_history())
assert_equals(5, len(eif_model.scoring_history()), "There should be one empty row and one row for each interval")
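The assertions above follow a simple row-count rule: one leading empty row, one row per scoring event, and the final model apparently scored as well when the tree interval does not divide ``ntrees`` evenly. The helper below is a hypothetical restatement of that arithmetic for readability; it is not an H2O API and only models the cases the test exercises.

```python
def expected_history_rows(ntrees, score_tree_interval=0, score_each_iteration=False):
    """Rows expected in scoring_history() under the rule the test above relies on."""
    if score_each_iteration:
        events = ntrees                       # scored after every tree
    elif score_tree_interval > 0:
        events = ntrees // score_tree_interval
        if ntrees % score_tree_interval:
            events += 1                       # the final model is scored as well
    else:
        return None                           # no periodic scoring; mirrors the default (disabled) case
    return events + 1                         # plus the initial empty row

assert expected_history_rows(10, score_each_iteration=True) == 11
assert expected_history_rows(10, score_tree_interval=3) == 5
```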
9 changes: 8 additions & 1 deletion h2o-r/h2o-package/R/extendedisolationforest.R
@@ -19,6 +19,7 @@
#' Isolation Forest. Defaults to 0.
#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
#' Defaults to -1 (time-based random number).
#' @param disable_training_metrics \code{Logical}. Disable calculating training metrics (expensive on large datasets). Defaults to TRUE.
#' @examples
#' \dontrun{
#' library(h2o)
@@ -60,7 +61,8 @@ h2o.extendedIsolationForest <- function(training_frame,
ntrees = 100,
sample_size = 256,
extension_level = 0,
seed = -1)
seed = -1,
disable_training_metrics = TRUE)
{
# Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
@@ -89,6 +91,8 @@ h2o.extendedIsolationForest <- function(training_frame,
parms$extension_level <- extension_level
if (!missing(seed))
parms$seed <- seed
if (!missing(disable_training_metrics))
parms$disable_training_metrics <- disable_training_metrics

# Error check and build model
model <- .h2o.modelJob('extendedisolationforest', parms, h2oRestApiVersion=3, verbose=FALSE)
@@ -104,6 +108,7 @@ h2o.extendedIsolationForest <- function(training_frame,
sample_size = 256,
extension_level = 0,
seed = -1,
disable_training_metrics = TRUE,
segment_columns = NULL,
segment_models_id = NULL,
parallelism = 1)
@@ -137,6 +142,8 @@ h2o.extendedIsolationForest <- function(training_frame,
parms$extension_level <- extension_level
if (!missing(seed))
parms$seed <- seed
if (!missing(disable_training_metrics))
parms$disable_training_metrics <- disable_training_metrics

# Build segment-models specific parameters
segment_parms <- list()
@@ -8,11 +8,11 @@ test.ExtendedIsolationForest.scoring_history <- function() {
h2o.importFile(path = locate("smalldata/anomaly/single_blob.csv"),
destination_frame = "single_blob.hex")

exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_each_iteration=TRUE, ntrees=10)
exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_each_iteration=TRUE, ntrees=10, disable_training_metrics=FALSE)
print(exisofor.model)
expect_equal(nrow(h2o.scoreHistory(exisofor.model)), 11)

exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_tree_interval=3, ntrees=10)
exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_tree_interval=3, ntrees=10, disable_training_metrics=FALSE)
print(exisofor.model)
expect_equal(nrow(h2o.scoreHistory(exisofor.model)), 5)
}