Skip to content

Commit

Permalink
added more python tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
wendycwong committed Oct 20, 2024
1 parent 2181a64 commit 8df305d
Show file tree
Hide file tree
Showing 6 changed files with 298 additions and 20 deletions.
20 changes: 15 additions & 5 deletions h2o-algos/src/main/java/hex/hglm/HGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
import java.util.List;
import java.util.stream.Collectors;

import static hex.ModelMetricsRegressionHGLM.calHGLMllg;
import static hex.glm.GLMModel.GLMParameters.Family.gaussian;
import static hex.glm.GLMModel.GLMParameters.MissingValuesHandling.*;
import static hex.hglm.HGLMModel.HGLMParameters.Method.EM;
import static hex.hglm.HGLMUtils.*;
import static hex.hglm.MetricBuilderHGLM.calHGLMllg;
import static water.util.ArrayUtils.*;

public class HGLM extends ModelBuilder<HGLMModel, HGLMModel.HGLMParameters, HGLMModel.HGLMModelOutput> {
Expand Down Expand Up @@ -183,6 +183,12 @@ else if (!trainFrame.vec(_parms._group_column).isCategorical())
if (_parms._tau_e_var_init <= 0)
error("tau_e_var_init", "If gen_syn_data is true, tau_e_var_init must be > 0.");
}

if (!_parms._random_intercept && _parms._standardize)
warn("random_intercept and standardize",
"If random_intercept is false and standardize is true, model building process can be unstable" +
" due to the denormalization process which can create singular T matrix. If encounter singlar" +
" T matrix problem, set standardize to false in this case to ensure model building can finish.");
}
}

Expand Down Expand Up @@ -292,9 +298,9 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa
if (_parms._max_iterations > 0) {
// grab current value of fixed beta, tauEVar, tauUVar
double[] beta = _state.get_beta().clone();
double[][] ubeta = copy2DArray(_state.get_ubeta()); // keep to generate synthetic data.
double[][] ubeta;
double tauEVarE10 = _state.get_tauEVarE10();
double tauEVarE17 = _state.get_tauEVarE17();
double tauEVarE17;
double[][] tMat = copy2DArray(_state.get_T());
double[][][] cjInv;
double[][] tMatInv;
Expand All @@ -307,7 +313,8 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa
ubeta = estimateNewRandomEffects(cjInv, engineTask._ArjTYj, engineTask._ArjTAfj, beta);// new random coefficients
// M step
beta = estimateFixedCoeff(engineTask._AfTAftInv, engineTask._AfjTYjSum, engineTask._AfjTArj, ubeta);// new fixed coeficients
tMat = estimateNewtMat(ubeta, tauEVarE10, cjInv, engineTask._oneOverJ); // new tMat
tMat = estimateNewtMat(ubeta, tauEVarE10, cjInv, engineTask._oneOverJ); // provide better estimate of tauEVar
//tMat = generateNewTmat(ubeta);
// estimate new tauEVar,
HGLMTask.ResidualLLHTask rLlhE17 = new HGLMTask.ResidualLLHTask(_job, _parms, _dinfo, ubeta, _state.get_beta(),
engineTask); // use equation 17 of the doc
Expand All @@ -334,7 +341,10 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa
}
}
} catch(Exception ex) { // will catch matrix singular during loglikelihood calculation
return;
if (iteration > 1) // some coefficients are valid, just return
return;
else
throw new RuntimeException(ex); // bad matrix from the start, no model is built.
}
}

Expand Down
5 changes: 2 additions & 3 deletions h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public ModelMetricsRegressionHGLM(Model model, Frame frame, long nobs, double si
*
* This method calculates the log-likelihood as described in section II.V of the doc.
*/
public static double calHGLMllg(long nobs, double[][] tmat, double varResidual, double[][][] zjTTimesZj,
/* public static double calHGLMllg(long nobs, double[][] tmat, double varResidual, double[][][] zjTTimesZj,
double yMinsXFixSquared, double[][] yMinusXFixTimesZ) {
int numLevel2 = zjTTimesZj.length;
double[][] tmatInv = new Matrix(tmat).inverse().getArray();
Expand All @@ -65,7 +65,7 @@ public static double calHGLMllg(long nobs, double[][] tmat, double varResidual,
public static double[][] calInvTPZjTZ(double[][] tmatInv, double[][] zjTTimesZj, double oneOVar) {
return new Matrix(tmatInv).plus(new Matrix(zjTTimesZj).times(oneOVar)).getArray();
}
}*/

/***
*
Expand All @@ -89,7 +89,6 @@ public static double calHGLMllg2(long nobs, double[][] tmat, double varResidual,

/**
* See the doc section II.V, calculates G inverse + transpose(Z)*Z/var_e.
* @return
*/
public static double[][] calInnverV(double[][] gmat, double[][] zTTimesZ, double oneOVar) {
try {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import sys
sys.path.insert(1,"../../../")
import h2o
from tests import pyunit_utils
from h2o.estimators.hglm import H2OHGLMEstimator as hglm
from tests.pyunit_utils import utils_for_glm_hglm_tests

# in this test, we want to check the following WITHOUT standardization and WITHOUT a random
# intercept (the model below is built with random_intercept = False and standardize = False):
# 1. scoring history (both training and valid)
# 2. the model summary
# 3. fixed effect coefficients, normal and standardized
# 4. icc
# 5. residual variance
def test_scoring_history_model_summary():
h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_123R_6enum_5num_1p5oise_p08T_woIntercept_standardize.gz"))
train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234)
y = "response"
x = h2o_data.names
x.remove("response")
x.remove("C1")
random_columns = ["C2", "C3", "C4"]
hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345,
max_iterations = 20, random_intercept = False, standardize=False)
hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid)
# grab various metrics
model_metrics = hglm_model.training_model_metrics()
scoring_history = hglm_model.scoring_history(as_data_frame=False)
scoring_history_valid = hglm_model.scoring_history_valid(as_data_frame=False)
model_summary = hglm_model.summary()
coef_random_names = hglm_model.coefs_random_names()
t_mat = hglm_model.matrix_T()
residual_var = hglm_model.residual_variance()
mse = hglm_model.mse()
mse_fixed = hglm_model.mean_residual_fixed()
mse_fixed_valid = hglm_model.mean_residual_fixed(train=False)
icc = hglm_model.icc()
level2_names = hglm_model.level_2_names()

# check to make sure metrics/coefficients make sense
residual_var_true = 1.5
assert abs(residual_var-residual_var_true) < 0.05, \
"Expected variance: {1}, actual: {0}. The difference is too big.".format(residual_var, residual_var_true)
# residual error taking account into only fixed effect coefficients should be greater than mse, mse_valid
assert mse < mse_fixed, "residual error with only fixed effects {0} should exceed that of mse {1} but is" \
" not.".format(mse_fixed, mse)
assert mse < mse_fixed_valid, "residual error with only fixed effects from validation frames {0} should exceed that" \
" of mse {1} but is not.".format(mse_fixed_valid, mse)
# make sure level 2 values are captured correctly
group2_value = train["C1"].unique()
utils_for_glm_hglm_tests.compare_list_h2o_frame(level2_names, group2_value, "C1.")
# assert icc is calculated correctly.
assert len(t_mat) == len(coef_random_names), "expected T matrix size: {0}, actual: {1} and they are not " \
"equal.".format(len(coef_random_names), len(t_mat))
utils_for_glm_hglm_tests.check_icc_calculation(t_mat, residual_var, icc)
# check model summary and model metrics if contain the same information should equal to each other
model_iterations = model_metrics["iterations"]
assert model_iterations == model_summary.cell_values[0][1], \
"model metrics iterations {0} should equal model_summary iterations {1}".format(model_iterations, model_summary.cell_values[0][1])
last_mse = model_metrics["MSE"]
assert abs(last_mse - model_summary.cell_values[0][3]) < 1e-6, \
"model metrics MSE {0} should equal to model summary MSE {1}.".format(last_mse, model_summary.cell_values[0][3])
last_llg = model_metrics["log_likelihood"]
assert abs(last_llg - model_summary.cell_values[0][2]) < 1e-6,\
"model metrics llg {0} should equal to model summary llg {1}.".format(last_llg, model_summary.cell_values[0][2])
# check scoring history last entry with model metric values
assert len(scoring_history.cell_values) == model_iterations, \
"length of scoring history {0} should equal to number of model iterations {1}".format(len(scoring_history.cell_values), model_iterations)
last_sc_index = model_iterations-1
assert abs(scoring_history.cell_values[last_sc_index][3] - last_llg) < 1e-6, \
"last scoring history llg {0} should equal to model metrics llg {1}".format(scoring_history.cell_values[last_sc_index][3], last_llg)
assert abs(scoring_history.cell_values[last_sc_index][4] - last_mse) < 1e-6, \
"last scoring history MSE {0} should equal to model metrics MSE {1}.".format(scoring_history.cell_values[last_sc_index][4], last_mse)
# check and make sure the llg from training and validation frame should be increasing in values
# this is only true when the true residual variance is high. For low true residual variance, it is only
# true for the last few iterations when the residual variance estimate is close to the true residual variance
if (residual_var_true >= 2):
for ind in list(range(1, model_iterations)):
p_ind = ind-1
assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \
"training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \
"{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind)
assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \
"validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \
"{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind)

if __name__ == "__main__":
pyunit_utils.standalone_test(test_scoring_history_model_summary)
else:
test_scoring_history_model_summary()
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,17 @@
# 4. icc
# 5. residual variance
def test_scoring_history_model_summary():
h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_1267R_6enum_5num_p08oise_p08T_wIntercept_standardize.gz"))
h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_allRC_2enum2numeric_3noise_p08T_wIntercept_standardize.gz"))
train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234)
y = "response"
x = h2o_data.names
x.remove("response")
x.remove("C1")
random_columns = ["C2", "C3", "C10", "C20"]
hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345,
max_iterations = 10, random_intercept = True)
random_intercept = True, standardize = False, max_iterations=10)
hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid)
print(hglm_model) # make sure this one works.
# grab various metrics
model_metrics = hglm_model.training_model_metrics()
scoring_history = hglm_model.scoring_history(as_data_frame=False)
Expand All @@ -36,8 +37,9 @@ def test_scoring_history_model_summary():
icc = hglm_model.icc()
level2_names = hglm_model.level_2_names()
# check to make sure metrics/coefficients make sense
# residual_var = 0.05
assert abs(residual_var-0.05) < 1.0e-3, "Expected variance: 0.05, actual: {0}. The difference is too big."
residual_var_true = 3.0
assert abs(residual_var-residual_var_true) < 0.05, \
"Expected variance: {1}}, actual: {0}. The difference is too big.".format(residual_var, residual_var_true)
# residual error taking account into only fixed effect coefficients should be greater than mse, mse_valid
assert mse < mse_fixed, "residual error with only fixed effects {0} should exceed that of mse {1} but is" \
" not.".format(mse_fixed, mse)
Expand Down Expand Up @@ -69,14 +71,14 @@ def test_scoring_history_model_summary():
assert abs(scoring_history.cell_values[last_sc_index][4] - last_mse) < 1e-6, \
"last scoring history MSE {0} should equal to model metrics MSE {1}.".format(scoring_history.cell_values[last_sc_index][4], last_mse)
# check and make sure the llg from training and validation frame should be increasing in values
# for ind in list(range(1, model_iterations)):
# p_ind = ind-1
# assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \
# "training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \
# "{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind)
# assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \
# "validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \
# "{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind)
for ind in list(range(1, model_iterations)):
p_ind = ind-1
assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \
"training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \
"{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind)
assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \
"validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \
"{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind)

if __name__ == "__main__":
pyunit_utils.standalone_test(test_scoring_history_model_summary)
Expand Down
Loading

0 comments on commit 8df305d

Please sign in to comment.