From 99aafb037de55f0dd3de992ee1c121212a3f574f Mon Sep 17 00:00:00 2001
From: Shaun <124687868+shaunyogeshwaran@users.noreply.github.com>
Date: Fri, 11 Oct 2024 21:04:28 +0530
Subject: [PATCH] added rule_example (#16415)

added predict_rules example.
added algorithm example.
added max_categorical_levels example.
added max_num_rules example.
added min_rule_length example.
added model_type example.
added distribution example.
added rule_generation_ntrees example.
---
 h2o-bindings/bin/custom/python/gen_rulefit.py | 158 ++++++++++++++++-
 h2o-py/h2o/estimators/rulefit.py              | 162 +++++++++++++++++-
 2 files changed, 318 insertions(+), 2 deletions(-)

diff --git a/h2o-bindings/bin/custom/python/gen_rulefit.py b/h2o-bindings/bin/custom/python/gen_rulefit.py
index 55a01c4057db..9ac14b842743 100644
--- a/h2o-bindings/bin/custom/python/gen_rulefit.py
+++ b/h2o-bindings/bin/custom/python/gen_rulefit.py
@@ -6,6 +6,21 @@ def rule_importance(self):
         Retrieve rule importances for a Rulefit model
 
         :return: H2OTwoDimTable
+
+        :examples:
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=100,
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> rule_importance = rfit.rule_importance()
+        >>> print(rfit.rule_importance())
         """
         if self._model_json["algo"] != "rulefit":
             raise H2OValueError("This function is available for Rulefit models only")
@@ -18,11 +33,29 @@ def rule_importance(self):
 
     def predict_rules(self, frame, rule_ids):
         """
-        Evaluates validity of the given rules on the given data. 
+        Evaluates validity of the given rules on the given data.
 
         :param frame: H2OFrame on which rule validity is to be evaluated
         :param rule_ids: string array of rule ids to be evaluated against the frame
         :return: H2OFrame with a column per each input ruleId, representing a flag whether given rule is applied to the observation or not.
+
+        :examples:
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv"
+        >>> df = h2o.import_file(path=f, col_types={'species': "enum"})
+        >>> x = df.columns
+        >>> y = "species"
+        >>> x.remove(y)
+        >>> train, test = df.split_frame(ratios=[.8], seed=1234)
+        >>> rfit = H2ORuleFitEstimator(min_rule_length=4,
+        ...                            max_rule_length=5,
+        ...                            max_num_rules=3,
+        ...                            seed=1234,
+        ...                            model_type="rules")
+        >>> rfit.train(training_frame=train, x=x, y=y, validation_frame=test)
+        >>> print(rfit.predict_rules(train, ['M0T38N5_Iris-virginica']))
         """
         from h2o.frame import H2OFrame
         from h2o.utils.typechecks import assert_is_type
@@ -52,3 +85,126 @@ def predict_rules(self, frame, rule_ids):
  """
     ),
 )
+
+examples = dict(
+    algorithm="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2ORuleFitEstimator
+>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+>>> y = "survived"
+>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+...                            max_num_rules=100,
+...                            algorithm="gbm",
+...                            seed=1)
+>>> rfit.train(training_frame=df, x=x, y=y)
+>>> print(rfit.rule_importance())
+
+""",
+    max_categorical_levels="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2ORuleFitEstimator
+>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+>>> y = "survived"
+>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+...                            max_num_rules=100,
+...                            max_categorical_levels=11,
+...                            seed=1)
+>>> rfit.train(training_frame=df, x=x, y=y)
+>>> print(rfit.rule_importance())
+""",
+    max_num_rules="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2ORuleFitEstimator
+>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+>>> y = "survived"
+>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+...                            max_num_rules=3,
+...                            seed=1)
+>>> rfit.train(training_frame=df, x=x, y=y)
+>>> print(rfit.rule_importance())
+""",
+    min_rule_length="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2ORuleFitEstimator
+>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+>>> y = "survived"
+>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+...                            max_num_rules=100,
+...                            min_rule_length=4,
+...                            seed=1)
+>>> rfit.train(training_frame=df, x=x, y=y)
+>>> print(rfit.rule_importance())
+""",
+    max_rule_length="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2ORuleFitEstimator
+>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+>>> y = "survived"
+>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+...                            max_num_rules=100,
+...                            min_rule_length=3,
+...                            seed=1)
+>>> rfit.train(training_frame=df, x=x, y=y)
+>>> print(rfit.rule_importance())
+""",
+    model_type="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2ORuleFitEstimator
+>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+>>> y = "survived"
+>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+...                            max_num_rules=100,
+...                            model_type="rules",
+...                            seed=1)
+>>> rfit.train(training_frame=df, x=x, y=y)
+>>> print(rfit.rule_importance())
+""",
+    distribution="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2ORuleFitEstimator
+>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+>>> y = "survived"
+>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+...                            max_num_rules=100,
+...                            distribution="bernoulli",
+...                            seed=1)
+>>> rfit.train(training_frame=df, x=x, y=y)
+>>> print(rfit.rule_importance())
+""",
+    rule_generation_ntrees="""
+>>> import h2o
+>>> h2o.init()
+>>> from h2o.estimators import H2ORuleFitEstimator
+>>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+>>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+>>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+>>> y = "survived"
+>>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+...                            max_num_rules=100,
+...                            rule_generation_ntrees=60,
+...                            seed=1)
+>>> rfit.train(training_frame=df, x=x, y=y)
+>>> print(rfit.rule_importance())
+"""
+)
diff --git a/h2o-py/h2o/estimators/rulefit.py b/h2o-py/h2o/estimators/rulefit.py
index 529b371780ea..be80309794b0 100644
--- a/h2o-py/h2o/estimators/rulefit.py
+++ b/h2o-py/h2o/estimators/rulefit.py
@@ -206,6 +206,22 @@ def algorithm(self):
         The algorithm to use to generate rules.
 
         Type: ``Literal["auto", "drf", "gbm"]``, defaults to ``"auto"``.
+
+        :examples:
+
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=100,
+        ...                            algorithm="gbm",
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> print(rfit.rule_importance())
         """
         return self._parms.get("algorithm")
 
@@ -220,6 +236,22 @@ def min_rule_length(self):
         Minimum length of rules. Defaults to 3.
 
         Type: ``int``, defaults to ``3``.
+
+        :examples:
+
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=100,
+        ...                            min_rule_length=4,
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> print(rfit.rule_importance())
         """
         return self._parms.get("min_rule_length")
 
@@ -234,6 +266,22 @@ def max_rule_length(self):
         Maximum length of rules. Defaults to 3.
 
         Type: ``int``, defaults to ``3``.
+
+        :examples:
+
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=100,
+        ...                            min_rule_length=3,
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> print(rfit.rule_importance())
         """
         return self._parms.get("max_rule_length")
 
@@ -249,6 +297,21 @@ def max_num_rules(self):
         by diminishing returns in model deviance.
 
         Type: ``int``, defaults to ``-1``.
+
+        :examples:
+
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=3,
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> print(rfit.rule_importance())
         """
         return self._parms.get("max_num_rules")
 
@@ -263,6 +326,22 @@ def model_type(self):
         Specifies type of base learners in the ensemble.
 
         Type: ``Literal["rules_and_linear", "rules", "linear"]``, defaults to ``"rules_and_linear"``.
+
+        :examples:
+
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=100,
+        ...                            model_type="rules",
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> print(rfit.rule_importance())
         """
         return self._parms.get("model_type")
 
@@ -298,6 +377,22 @@ def distribution(self):
 
         Type: ``Literal["auto", "bernoulli", "multinomial", "gaussian", "poisson", "gamma", "tweedie", "laplace",
         "quantile", "huber"]``, defaults to ``"auto"``.
+
+        :examples:
+
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=100,
+        ...                            distribution="bernoulli",
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> print(rfit.rule_importance())
         """
         return self._parms.get("distribution")
 
@@ -312,6 +407,22 @@ def rule_generation_ntrees(self):
         Specifies the number of trees to build in the tree model. Defaults to 50.
 
         Type: ``int``, defaults to ``50``.
+
+        :examples:
+
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=100,
+        ...                            rule_generation_ntrees=60,
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> print(rfit.rule_importance())
         """
         return self._parms.get("rule_generation_ntrees")
 
@@ -370,6 +481,22 @@ def max_categorical_levels(self):
         for categorical_encoding == EnumLimited.
 
         Type: ``int``, defaults to ``10``.
+
+        :examples:
+
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=100,
+        ...                            max_categorical_levels=11,
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> print(rfit.rule_importance())
         """
         return self._parms.get("max_categorical_levels")
 
@@ -385,6 +512,21 @@ def rule_importance(self):
         Retrieve rule importances for a Rulefit model
 
         :return: H2OTwoDimTable
+
+        :examples:
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/gbm_test/titanic.csv"
+        >>> df = h2o.import_file(path=f, col_types={'pclass': "enum", 'survived': "enum"})
+        >>> x = ["age", "sibsp", "parch", "fare", "sex", "pclass"]
+        >>> y = "survived"
+        >>> rfit = H2ORuleFitEstimator(max_rule_length=10,
+        ...                            max_num_rules=100,
+        ...                            seed=1)
+        >>> rfit.train(training_frame=df, x=x, y=y)
+        >>> rule_importance = rfit.rule_importance()
+        >>> print(rfit.rule_importance())
         """
         if self._model_json["algo"] != "rulefit":
             raise H2OValueError("This function is available for Rulefit models only")
@@ -397,11 +539,29 @@ def rule_importance(self):
 
     def predict_rules(self, frame, rule_ids):
         """
-        Evaluates validity of the given rules on the given data. 
+        Evaluates validity of the given rules on the given data.
 
         :param frame: H2OFrame on which rule validity is to be evaluated
         :param rule_ids: string array of rule ids to be evaluated against the frame
         :return: H2OFrame with a column per each input ruleId, representing a flag whether given rule is applied to the observation or not.
+
+        :examples:
+        >>> import h2o
+        >>> h2o.init()
+        >>> from h2o.estimators import H2ORuleFitEstimator
+        >>> f = "https://s3.amazonaws.com/h2o-public-test-data/smalldata/iris/iris_train.csv"
+        >>> df = h2o.import_file(path=f, col_types={'species': "enum"})
+        >>> x = df.columns
+        >>> y = "species"
+        >>> x.remove(y)
+        >>> train, test = df.split_frame(ratios=[.8], seed=1234)
+        >>> rfit = H2ORuleFitEstimator(min_rule_length=4,
+        ...                            max_rule_length=5,
+        ...                            max_num_rules=3,
+        ...                            seed=1234,
+        ...                            model_type="rules")
+        >>> rfit.train(training_frame=train, x=x, y=y, validation_frame=test)
+        >>> print(rfit.predict_rules(train, ['M0T38N5_Iris-virginica']))
         """
         from h2o.frame import H2OFrame
         from h2o.utils.typechecks import assert_is_type