Skip to content

Commit

Permalink
added rule_example (#16415)
Browse files Browse the repository at this point in the history
added predict_rules example.
added example for algorithm.
added max_categorical_levels example.
added max_num_rules example.
added min_rule_length example.
added model_type example.
added distribution example.
added rule_generation_ntrees example.
  • Loading branch information
shaunyogeshwaran authored Oct 11, 2024
1 parent 733c496 commit 99aafb0
Show file tree
Hide file tree
Showing 2 changed files with 318 additions and 2 deletions.
158 changes: 157 additions & 1 deletion h2o-bindings/bin/custom/python/gen_rulefit.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,21 @@ def rule_importance(self):
Retrieve rule importances for a Rulefit model
:return: H2OTwoDimTable
:examples:
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
>>> y = "survived"
>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
... max_num_rules=100,
... seed=1)
>>> rfit.train(training_frame=df, x=x, y=y)
>>> rule_importance = rfit.rule_importance()
>>> print(rfit.rule_importance())
"""
if self._model_json["algo"] != "rulefit":
raise H2OValueError("This function is available for Rulefit models only")
Expand All @@ -18,11 +33,29 @@ def rule_importance(self):

def predict_rules(self, frame, rule_ids):
"""
Evaluates validity of the given rules on the given data.
Evaluates validity of the given rules on the given data.
:param frame: H2OFrame on which rule validity is to be evaluated
:param rule_ids: string array of rule ids to be evaluated against the frame
:return: H2OFrame with a column per each input ruleId, representing a flag whether given rule is applied to the observation or not.
:examples:
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv"
>>> df = h2o.import_file(path=f, col_types={'species': "enum"})
>>> x = df.columns
>>> y = "species"
>>> x.remove(y)
>>> train, test = df.split_frame(ratios=[.8], seed=1234)
>>> rfit = H2ORuleFitEstimator(min_rule_length=4,
... max_rule_length=5,
... max_num_rules=3,
... seed=1234,
... model_type="rules")
>>> rfit.train(training_frame=train, x=x, y=y, validation_frame=test)
>>> print(rfit.predict_rules(train, ['M0T38N5_Iris-virginica']))
"""
from h2o.frame import H2OFrame
from h2o.utils.typechecks import assert_is_type
Expand Down Expand Up @@ -52,3 +85,126 @@ def predict_rules(self, frame, rule_ids):
"""
),
)

examples = dict(
algorithm="""
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
>>> y = "survived"
>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
... max_num_rules=100,
... algorithm="gbm",
... seed=1)
>>> rfit.train(training_frame=df, x=x, y=y)
>>> print(rfit.rule_importance())
""",
max_categorical_levels="""
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
>>> y = "survived"
>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
... max_num_rules=100,
... max_categorical_levels=11,
... seed=1)
>>> rfit.train(training_frame=df, x=x, y=y)
>>> print(rfit.rule_importance())
""",
max_num_rules="""
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
>>> y = "survived"
>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
... max_num_rules=3,
... seed=1)
>>> rfit.train(training_frame=df, x=x, y=y)
>>> print(rfit.rule_importance())
""",
min_rule_length="""
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
>>> y = "survived"
>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
... max_num_rules=100,
... min_rule_length=4,
... seed=1)
>>> rfit.train(training_frame=df, x=x, y=y)
>>> print(rfit.rule_importance())
""",
max_rule_length="""
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
>>> y = "survived"
>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
... max_num_rules=100,
... min_rule_length=3,
... seed=1)
>>> rfit.train(training_frame=df, x=x, y=y)
>>> print(rfit.rule_importance())
""",
model_type="""
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
>>> y = "survived"
>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
... max_num_rules=100,
... model_type="rules",
... seed=1)
>>> rfit.train(training_frame=df, x=x, y=y)
>>> print(rfit.rule_importance())
""",
distribution="""
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
>>> y = "survived"
>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
... max_num_rules=100,
... distribution="bernoulli",
... seed=1)
>>> rfit.train(training_frame=df, x=x, y=y)
>>> print(rfit.rule_importance())
""",
rule_generation_ntrees="""
>>> import h2o
>>> h2o.init()
>>> from h2o.estimators import H2ORuleFitEstimator
>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
>>> y = "survived"
>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
... max_num_rules=100,
... rule_generation_ntrees=60,
... seed=1)
>>> rfit.train(training_frame=df, x=x, y=y)
>>> print(rfit.rule_importance())
"""
)
Loading

0 comments on commit 99aafb0

Please sign in to comment.