diff --git a/h2o-algos/src/main/java/hex/schemas/ExtendedIsolationForestV3.java b/h2o-algos/src/main/java/hex/schemas/ExtendedIsolationForestV3.java index 4bc73ffdd6d7..46c56310eb11 100644 --- a/h2o-algos/src/main/java/hex/schemas/ExtendedIsolationForestV3.java +++ b/h2o-algos/src/main/java/hex/schemas/ExtendedIsolationForestV3.java @@ -25,6 +25,7 @@ public static final class ExtendedIsolationForestParametersV3 extends ModelParam "sample_size", "extension_level", "seed", + "disable_training_metrics" }; @API(help = "Number of Extended Isolation Forest trees.", gridable = true) @@ -42,5 +43,8 @@ public static final class ExtendedIsolationForestParametersV3 extends ModelParam @API(help="Score the model after every so many trees. Disabled if set to 0.", level = API.Level.secondary, gridable = false) public int score_tree_interval; + + @API(help = "Disable calculating training metrics (expensive on large datasets)") + public boolean disable_training_metrics; } } diff --git a/h2o-docs/src/product/data-science/eif.rst b/h2o-docs/src/product/data-science/eif.rst index 9cbad3f69dc7..8b9253264aa7 100644 --- a/h2o-docs/src/product/data-science/eif.rst +++ b/h2o-docs/src/product/data-science/eif.rst @@ -53,6 +53,8 @@ Algorithm-specific parameters - `sample_size `__: The number of randomly sampled observations used to train each Extended Isolation Forest tree. This option defaults to ``256``. +- **disable_training_metrics**: Disable calculating training metrics (expensive on large datasets). This option defaults to ``True`` (training metrics are not calculated by default). 
+ Shared tree-algorithm parameters '''''''''''''''''''''''''''''''' diff --git a/h2o-py/h2o/estimators/extended_isolation_forest.py b/h2o-py/h2o/estimators/extended_isolation_forest.py index 8873a096ea7a..5be8ae966918 100644 --- a/h2o-py/h2o/estimators/extended_isolation_forest.py +++ b/h2o-py/h2o/estimators/extended_isolation_forest.py @@ -42,6 +42,7 @@ def __init__(self, sample_size=256, # type: int extension_level=0, # type: int seed=-1, # type: int + disable_training_metrics=True, # type: bool ): """ :param model_id: Destination id for this model; auto-generated if not specified. @@ -79,6 +80,9 @@ def __init__(self, :param seed: Seed for pseudo random number generator (if applicable) Defaults to ``-1``. :type seed: int + :param disable_training_metrics: Disable calculating training metrics (expensive on large datasets) + Defaults to ``True``. + :type disable_training_metrics: bool """ super(H2OExtendedIsolationForestEstimator, self).__init__() self._parms = {} @@ -93,6 +97,7 @@ def __init__(self, self.sample_size = sample_size self.extension_level = extension_level self.seed = seed + self.disable_training_metrics = disable_training_metrics @property def training_frame(self): @@ -314,4 +319,18 @@ def seed(self, seed): assert_is_type(seed, None, int) self._parms["seed"] = seed + @property + def disable_training_metrics(self): + """ + Disable calculating training metrics (expensive on large datasets) + + Type: ``bool``, defaults to ``True``. 
+ """ + return self._parms.get("disable_training_metrics") + + @disable_training_metrics.setter + def disable_training_metrics(self, disable_training_metrics): + assert_is_type(disable_training_metrics, None, bool) + self._parms["disable_training_metrics"] = disable_training_metrics + diff --git a/h2o-py/tests/testdir_algos/isoforextended/pyunit_isoforextended_scoring_history.py b/h2o-py/tests/testdir_algos/isoforextended/pyunit_isoforextended_scoring_history.py index 4b4ee10db57d..518c6097b2f9 100644 --- a/h2o-py/tests/testdir_algos/isoforextended/pyunit_isoforextended_scoring_history.py +++ b/h2o-py/tests/testdir_algos/isoforextended/pyunit_isoforextended_scoring_history.py @@ -11,15 +11,19 @@ def extended_isolation_forest_scoring_history(): train = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/single_blob.csv")) - eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1, - score_each_iteration=True) + eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1) eif_model.train(training_frame=train) + print(eif_model.scoring_history()) + assert_equals(None, eif_model.scoring_history(), "No scoring history by default") + eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1, + score_each_iteration=True, disable_training_metrics=False) + eif_model.train(training_frame=train) print(eif_model.scoring_history()) assert_equals(11, len(eif_model.scoring_history()), "There should be one empty row and one row for each tree") eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1, - score_tree_interval=3) + score_tree_interval=3, disable_training_metrics=False) eif_model.train(training_frame=train) print(eif_model.scoring_history()) assert_equals(5, len(eif_model.scoring_history()), "There should be one empty row and one row for each interval") diff --git 
a/h2o-r/h2o-package/R/extendedisolationforest.R b/h2o-r/h2o-package/R/extendedisolationforest.R index e542813549b9..cd278885f660 100644 --- a/h2o-r/h2o-package/R/extendedisolationforest.R +++ b/h2o-r/h2o-package/R/extendedisolationforest.R @@ -19,6 +19,7 @@ #' Isolation Forest. Defaults to 0. #' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default). #' Defaults to -1 (time-based random number). +#' @param disable_training_metrics \code{Logical}. Disable calculating training metrics (expensive on large datasets) Defaults to TRUE. #' @examples #' \dontrun{ #' library(h2o) @@ -60,7 +61,8 @@ h2o.extendedIsolationForest <- function(training_frame, ntrees = 100, sample_size = 256, extension_level = 0, - seed = -1) + seed = -1, + disable_training_metrics = TRUE) { # Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object training_frame <- .validate.H2OFrame(training_frame, required=TRUE) @@ -89,6 +91,8 @@ h2o.extendedIsolationForest <- function(training_frame, parms$extension_level <- extension_level if (!missing(seed)) parms$seed <- seed + if (!missing(disable_training_metrics)) + parms$disable_training_metrics <- disable_training_metrics # Error check and build model model <- .h2o.modelJob('extendedisolationforest', parms, h2oRestApiVersion=3, verbose=FALSE) @@ -104,6 +108,7 @@ h2o.extendedIsolationForest <- function(training_frame, sample_size = 256, extension_level = 0, seed = -1, + disable_training_metrics = TRUE, segment_columns = NULL, segment_models_id = NULL, parallelism = 1) @@ -137,6 +142,8 @@ h2o.extendedIsolationForest <- function(training_frame, parms$extension_level <- extension_level if (!missing(seed)) parms$seed <- seed + if (!missing(disable_training_metrics)) + parms$disable_training_metrics <- disable_training_metrics # Build segment-models specific parameters segment_parms <- list() diff --git 
a/h2o-r/tests/testdir_algos/isoforextended/runit_isoforextended_scoring_history.R b/h2o-r/tests/testdir_algos/isoforextended/runit_isoforextended_scoring_history.R index a8c0d24c6678..91b45f51ac8c 100644 --- a/h2o-r/tests/testdir_algos/isoforextended/runit_isoforextended_scoring_history.R +++ b/h2o-r/tests/testdir_algos/isoforextended/runit_isoforextended_scoring_history.R @@ -8,11 +8,11 @@ test.ExtendedIsolationForest.scoring_history <- function() { h2o.importFile(path = locate("smalldata/anomaly/single_blob.csv"), destination_frame = "single_blob.hex") - exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_each_iteration=TRUE, ntrees=10) + exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_each_iteration=TRUE, ntrees=10, disable_training_metrics=FALSE) print(exisofor.model) expect_equal(nrow(h2o.scoreHistory(exisofor.model)), 11) - exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_tree_interval=3, ntrees=10) + exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_tree_interval=3, ntrees=10, disable_training_metrics=FALSE) print(exisofor.model) expect_equal(nrow(h2o.scoreHistory(exisofor.model)), 5) }