GH-7118 - add possibility to disable training metrics via API
valenad1 committed Nov 16, 2023
1 parent 4a016c7 commit db3a710
Showing 6 changed files with 42 additions and 6 deletions.
@@ -25,6 +25,7 @@ public static final class ExtendedIsolationForestParametersV3 extends ModelParam
"sample_size",
"extension_level",
"seed",
"disable_training_metrics"
};

@API(help = "Number of Extended Isolation Forest trees.", gridable = true)
@@ -42,5 +43,8 @@ public static final class ExtendedIsolationForestParametersV3 extends ModelParam

@API(help="Score the model after every so many trees. Disabled if set to 0.", level = API.Level.secondary, gridable = false)
public int score_tree_interval;

@API(help = "Disable calculating training metrics (expensive on large datasets)")
public boolean disable_training_metrics;
}
}
2 changes: 2 additions & 0 deletions h2o-docs/src/product/data-science/eif.rst
@@ -53,6 +53,8 @@ Algorithm-specific parameters

- `sample_size <algo-params/sample_size.html>`__: The number of randomly sampled observations used to train each Extended Isolation Forest tree. This option defaults to ``256``.

- **disable_training_metrics**: Disable calculating training metrics (expensive on large datasets). This option defaults to ``True``, meaning training metrics are not calculated by default; see the sketch below for how to re-enable them.

Shared tree-algorithm parameters
''''''''''''''''''''''''''''''''

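A minimal Python sketch of the documented behavior: with the new default (``disable_training_metrics=True``) no training metrics are computed and ``scoring_history()`` returns ``None``, while passing ``False`` restores the previous behavior. The ``h2o.init()`` call and the dataset path are illustrative assumptions, not part of this commit.

```python
import h2o
from h2o.estimators import H2OExtendedIsolationForestEstimator

h2o.init()
train = h2o.import_file("single_blob.csv")  # hypothetical path to a small anomaly dataset

# New default: training metrics are skipped, so no scoring history is kept.
fast_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=1234)
fast_model.train(training_frame=train)
print(fast_model.scoring_history())  # expected: None

# Opt back in to training metrics (pre-commit behavior).
scored_model = H2OExtendedIsolationForestEstimator(
    ntrees=10, seed=1234, score_each_iteration=True, disable_training_metrics=False
)
scored_model.train(training_frame=train)
print(scored_model.scoring_history())  # one row per tree plus the initial empty row
```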
19 changes: 19 additions & 0 deletions h2o-py/h2o/estimators/extended_isolation_forest.py
@@ -42,6 +42,7 @@ def __init__(self,
sample_size=256, # type: int
extension_level=0, # type: int
seed=-1, # type: int
disable_training_metrics=True, # type: bool
):
"""
:param model_id: Destination id for this model; auto-generated if not specified.
@@ -79,6 +80,9 @@ def __init__(self,
:param seed: Seed for pseudo random number generator (if applicable)
Defaults to ``-1``.
:type seed: int
:param disable_training_metrics: Disable calculating training metrics (expensive on large datasets)
Defaults to ``True``.
:type disable_training_metrics: bool
"""
super(H2OExtendedIsolationForestEstimator, self).__init__()
self._parms = {}
@@ -93,6 +97,7 @@ def __init__(self,
self.sample_size = sample_size
self.extension_level = extension_level
self.seed = seed
self.disable_training_metrics = disable_training_metrics

@property
def training_frame(self):
@@ -314,4 +319,18 @@ def seed(self, seed):
assert_is_type(seed, None, int)
self._parms["seed"] = seed

@property
def disable_training_metrics(self):
"""
Disable calculating training metrics (expensive on large datasets)
Type: ``bool``, defaults to ``True``.
"""
return self._parms.get("disable_training_metrics")

@disable_training_metrics.setter
def disable_training_metrics(self, disable_training_metrics):
assert_is_type(disable_training_metrics, None, bool)
self._parms["disable_training_metrics"] = disable_training_metrics


@@ -11,15 +11,19 @@ def extended_isolation_forest_scoring_history():

train = h2o.import_file(pyunit_utils.locate("smalldata/anomaly/single_blob.csv"))

eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1,
score_each_iteration=True)
eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1)
eif_model.train(training_frame=train)
print(eif_model.scoring_history())
assert_equals(None, eif_model.scoring_history(), "No scoring history by default")

eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1,
score_each_iteration=True, disable_training_metrics=False)
eif_model.train(training_frame=train)
print(eif_model.scoring_history())
assert_equals(11, len(eif_model.scoring_history()), "There should be one empty row and one row for each tree")

eif_model = H2OExtendedIsolationForestEstimator(ntrees=10, seed=0xBEEF, sample_size=255, extension_level=1,
score_tree_interval=3)
score_tree_interval=3, disable_training_metrics=False)
eif_model.train(training_frame=train)
print(eif_model.scoring_history())
assert_equals(5, len(eif_model.scoring_history()), "There should be one empty row and one row for each interval")
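The assertions above follow a simple row-count rule: one leading empty row, one row per scoring event, and the final model apparently scored as well when the tree interval does not divide ``ntrees`` evenly. The helper below is a hypothetical restatement of that arithmetic for readability; it is not an H2O API and only models the cases the test exercises.

```python
def expected_history_rows(ntrees, score_tree_interval=0, score_each_iteration=False):
    """Rows expected in scoring_history() under the rule the test above relies on."""
    if score_each_iteration:
        events = ntrees                       # scored after every tree
    elif score_tree_interval > 0:
        events = ntrees // score_tree_interval
        if ntrees % score_tree_interval:
            events += 1                       # the final model is scored as well
    else:
        return None                           # no periodic scoring; mirrors the default (disabled) case
    return events + 1                         # plus the initial empty row

assert expected_history_rows(10, score_each_iteration=True) == 11
assert expected_history_rows(10, score_tree_interval=3) == 5
```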
9 changes: 8 additions & 1 deletion h2o-r/h2o-package/R/extendedisolationforest.R
@@ -19,6 +19,7 @@
#' Isolation Forest. Defaults to 0.
#' @param seed Seed for random numbers (affects certain parts of the algo that are stochastic and those might or might not be enabled by default).
#' Defaults to -1 (time-based random number).
#' @param disable_training_metrics \code{Logical}. Disable calculating training metrics (expensive on large datasets). Defaults to TRUE.
#' @examples
#' \dontrun{
#' library(h2o)
@@ -60,7 +61,8 @@ h2o.extendedIsolationForest <- function(training_frame,
ntrees = 100,
sample_size = 256,
extension_level = 0,
seed = -1)
seed = -1,
disable_training_metrics = TRUE)
{
# Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
@@ -89,6 +91,8 @@ h2o.extendedIsolationForest <- function(training_frame,
parms$extension_level <- extension_level
if (!missing(seed))
parms$seed <- seed
if (!missing(disable_training_metrics))
parms$disable_training_metrics <- disable_training_metrics

# Error check and build model
model <- .h2o.modelJob('extendedisolationforest', parms, h2oRestApiVersion=3, verbose=FALSE)
@@ -104,6 +108,7 @@ h2o.extendedIsolationForest <- function(training_frame,
sample_size = 256,
extension_level = 0,
seed = -1,
disable_training_metrics = TRUE,
segment_columns = NULL,
segment_models_id = NULL,
parallelism = 1)
@@ -137,6 +142,8 @@ h2o.extendedIsolationForest <- function(training_frame,
parms$extension_level <- extension_level
if (!missing(seed))
parms$seed <- seed
if (!missing(disable_training_metrics))
parms$disable_training_metrics <- disable_training_metrics

# Build segment-models specific parameters
segment_parms <- list()
@@ -8,11 +8,11 @@ test.ExtendedIsolationForest.scoring_history <- function() {
h2o.importFile(path = locate("smalldata/anomaly/single_blob.csv"),
destination_frame = "single_blob.hex")

exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_each_iteration=TRUE, ntrees=10)
exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_each_iteration=TRUE, ntrees=10, disable_training_metrics=FALSE)
print(exisofor.model)
expect_equal(nrow(h2o.scoreHistory(exisofor.model)), 11)

exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_tree_interval=3, ntrees=10)
exisofor.model <- h2o.extendedIsolationForest(training_frame = single_blob.hex, score_tree_interval=3, ntrees=10, disable_training_metrics=FALSE)
print(exisofor.model)
expect_equal(nrow(h2o.scoreHistory(exisofor.model)), 5)
}