Skip to content

Commit

Permalink
Fixed bugs in R client utility functions.
Browse files Browse the repository at this point in the history
  • Loading branch information
wendycwong committed Oct 16, 2024
1 parent 6450082 commit 2b09a4e
Show file tree
Hide file tree
Showing 7 changed files with 65 additions and 75 deletions.
56 changes: 28 additions & 28 deletions h2o-algos/src/main/java/hex/hglm/HGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa
HGLMTask.ResidualLLHTask rLlh = new HGLMTask.ResidualLLHTask(_job, _parms, _dinfo, ubeta, _state.get_beta(),
engineTask); // use equation 17 of the doc
rLlh.doAll(_dinfo._adaptedFrame);
// equation 17
tauEVar2 = calTauEvarEq17(rLlh._residualSquare, _state.get_tauEVar2(), cjInv, engineTask._ArjTArj, engineTask._oneOverN);

HGLMTask.ResidualLLHTask rLlh2 = new HGLMTask.ResidualLLHTask(_job, _parms, _dinfo, ubeta, beta, engineTask);
Expand All @@ -317,8 +318,14 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa
// check to make sure determinant of V is positive, see section II.V of the doc
if (!checkPositiveG(engineTask._numLevel2Units, tMat))
Log.info("HGLM model building is stopped due to matrix G in section II.V of the doc is no longer PSD");
double logLikelihood = calHGLMllg(_state._nobs, tMat, tauEVar, model._output._arjtarj, rLlh2._sse_fixed,
rLlh2._yMinusXTimesZ); // equation 10
double logLikelihood2 = calHGLMllg(_state._nobs, tMat, tauEVar2, model._output._arjtarj, rLlh._sse_fixed,
rLlh2._yMinusXTimesZ); // equation 17
Log.info("likelihood use tauEVar from equation 10 " + logLikelihood + ", likelihood use tauEVar" +
" from equation 17 "+logLikelihood2);
// check if stopping conditions are satisfied
if (!progress(beta, ubeta, tMat, tauEVar2, scTrain, scValid, model, rLlh2))
if (!progress(beta, ubeta, tMat, tauEVar2, scTrain, scValid, model, rLlh))
return;
}
}
Expand All @@ -332,43 +339,36 @@ public boolean progress(double[] beta, double[][] ubeta, double[][] tmat, double
_state._iter++;
double[] betaDiff = new double[beta.length];
minus(betaDiff, beta, _state.get_beta());
double maxBetaDiff = maxMag(betaDiff)/maxMag(beta);
double maxBetaDiff = maxMag(betaDiff) / maxMag(beta);
double[][] tmatDiff = new double[tmat.length][tmat[0].length];
minus(tmatDiff, tmat, _state.get_T());
double maxTmatDiff = maxMag(tmatDiff)/maxMag(tmat);
double maxTmatDiff = maxMag(tmatDiff) / maxMag(tmat);
double[][] ubetaDiff = new double[ubeta.length][ubeta[0].length];
minus(ubetaDiff, ubeta, _state.get_ubeta());
double maxUBetaDiff = maxMag(ubetaDiff)/maxMag(ubeta);
double tauEVarDiff = Math.abs(tauEVar - _state.get_tauEVar())/tauEVar;
double maxUBetaDiff = maxMag(ubetaDiff) / maxMag(ubeta);
double tauEVarDiff = Math.abs(tauEVar - _state.get_tauEVar()) / tauEVar;
// calculate log likelihood with current parameter settings, standardize if parms._standardize and vice versa
double logLikelihood = calHGLMllg(_state._nobs, tmat, tauEVar, model._output._arjtarj, rLlh2._sse_fixed,
double logLikelihood = calHGLMllg(_state._nobs, tmat, tauEVar, model._output._arjtarj, rLlh2._sse_fixed,
rLlh2._yMinusXTimesZ);
boolean converged = ((maxBetaDiff < _parms._em_epsilon) && (maxTmatDiff < _parms._em_epsilon) && (maxUBetaDiff
< _parms._em_epsilon) && (tauEVarDiff < _parms._em_epsilon) && (logLikelihood < model._output._log_likelihood));
boolean converged = ((maxBetaDiff <= _parms._em_epsilon) && (maxTmatDiff <= _parms._em_epsilon) && (maxUBetaDiff
<= _parms._em_epsilon) && (tauEVarDiff <= _parms._em_epsilon));
ComputationStateHGLM.ComputationStateSimple simpleState = new ComputationStateHGLM.ComputationStateSimple(_state.get_beta(), _state.get_ubeta(),
_state.get_T(), _state.get_tauEVar());
if (!converged) { // update values in _state
try {
_state.set_beta(beta);
_state.set_ubeta(ubeta);
_state.set_T(tmat);
_state.set_tauEVar(tauEVar);
model._output._log_likelihood = logLikelihood;
if (_parms._score_each_iteration || _parms._score_iteration_interval / _state._iter == 0) {
model._output.setModelOutputFields(_state);
scoreAndUpdateModel(model, true, scTrain); // perform scoring and updating scoring history
if (_parms.valid() != null)
scoreAndUpdateModel(model, false, scValid);
} else {
scTrain.addIterationScore(_state._iter, model._output._log_likelihood, tauEVar);
}
} catch(Exception ex) {
_state.set_beta(simpleState._beta);
_state.set_ubeta(simpleState._ubeta);
_state.set_T(simpleState._tmat);
_state.set_tauEVar(tauEVar);
return false; // stop execution when calculation of loglikelihood is bad due to matrix inverse failure
_state.set_beta(beta);
_state.set_ubeta(ubeta);
_state.set_T(tmat);
_state.set_tauEVar(tauEVar);
model._output._log_likelihood = logLikelihood;
if (_parms._score_each_iteration || _parms._score_iteration_interval / _state._iter == 0) {
model._output.setModelOutputFields(_state);
scoreAndUpdateModel(model, true, scTrain); // perform scoring and updating scoring history
if (_parms.valid() != null)
scoreAndUpdateModel(model, false, scValid);
} else {
scTrain.addIterationScore(_state._iter, model._output._log_likelihood, tauEVar);
}

}
return !stop_requested() && !converged && (_state._iter < _parms._max_iterations);
}
Expand Down
2 changes: 1 addition & 1 deletion h2o-bindings/bin/custom/R/gen_hglm.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
} else {
stop("random_columns is required.")
}
if (!missing(group_column) {
if (!missing(group_column)) {
parms$group_column <- group_column
} else {
stop("group_column is required.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,9 +90,10 @@ def generate_dataset(family, trainData, group_column, random_columns):
startVal = [-1.4313612, 0.6795744, 1.9795154, -3.1187255, 0.2058840, -1.6596187, 0.3460812, -0.7809777,
1.6617960, -0.5174034, 1.8273497, -2.4161541, 0.9474324, 2.3616221, 0.7710148, 0.2706556, 1.0541668]
# hglm_test/gaussian_0GC_allRC_2enum2numeric_p05oise_p08T_wIntercept_standardize
# hglm_test/gaussian_0GC_allRC_2enum2numeric_2p0noise_p5T_wIntercept
m = hglm(family=family, max_iterations=0, random_columns=random_columns, group_column=group_column,
tau_u_var_init = 0.08, tau_e_var_init = 0.05, random_intercept = True, gen_syn_data=True,
seed = 12345, standardize = True, initial_fixed_effects=startVal)
tau_u_var_init = 0.5, tau_e_var_init = 2, random_intercept = True, gen_syn_data=True,
seed = 12345, standardize = False, initial_fixed_effects=startVal)
m.train(training_frame=trainData, y = "response", x =myX)
f2 = m.predict(trainData)
finalDataset = trainData[names_without_response]
Expand Down
4 changes: 2 additions & 2 deletions h2o-r/h2o-package/R/hglm.R
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ h2o.hglm <- function(x,
} else {
stop("random_columns is required.")
}
if (!missing(group_column) {
if (!missing(group_column)) {
parms$group_column <- group_column
} else {
stop("group_column is required.")
Expand Down Expand Up @@ -256,7 +256,7 @@ h2o.hglm <- function(x,
} else {
stop("random_columns is required.")
}
if (!missing(group_column) {
if (!missing(group_column)) {
parms$group_column <- group_column
} else {
stop("group_column is required.")
Expand Down
11 changes: 11 additions & 0 deletions h2o-r/h2o-package/pkgdown/_pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ reference:
- h2o.coef
- h2o.coef_names
- h2o.coef_norm
- h2o.coef_random
- h2o.coefs_random_names
- h2o.coefs_random_names_norm
- h2o.coef_random_norm
- h2o.coef_with_p_values
- h2o.colnames
- h2o.columns_by_type
Expand Down Expand Up @@ -172,6 +176,7 @@ reference:
- h2o.hist
- h2o.hit_ratio_table
- h2o.hour
- h2o.icc
- h2o.ice_plot
- h2o.ifelse
- h2o.import_hive_table
Expand Down Expand Up @@ -199,6 +204,7 @@ reference:
- h2o.kolmogorov_smirnov
- h2o.kurtosis
- h2o.learning_curve_plot
- h2o.level_2_names
- h2o.levels
- h2o.list_all_extensions
- h2o.list_api_extensions
Expand All @@ -222,10 +228,12 @@ reference:
- h2o.make_metrics
- h2o.makeGLMModel
- h2o.match
- h2o.matrix_T
- h2o.max
- h2o.modelSelection
- h2o.mean_per_class_error
- h2o.mean_residual_deviance
- h2o.mean_residual_fixed
- h2o.mean
- h2o.median
- h2o.melt
Expand Down Expand Up @@ -288,6 +296,7 @@ reference:
- h2o.residual_deviance
- h2o.residual_dof
- h2o.residual_analysis_plot
- h2o.residual_variance
- h2o.result
- h2o.rm
- h2o.rmse
Expand All @@ -304,6 +313,8 @@ reference:
- h2o.save_frame
- h2o.scale
- h2o.scoreHistory
- h2o.scoring_history
- h2o.scoring_history_valid
- h2o.scoreHistoryGAM
- h2o.sd
- h2o.sdev
Expand Down

This file was deleted.

20 changes: 20 additions & 0 deletions h2o-r/tests/testdir_algos/hglm/runit_GH_8487_coefs_check.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source("../../../scripts/h2o-r-test-setup.R")
##
# Build HGLM Model and check against models from model outputs.
##

test.HGLMData1 <- function() {
  # Numeric tolerance for coefficient comparisons.
  tol <- 1e-4
  # Synthetic HGLM dataset: gaussian family, all random columns,
  # 2 enum + 2 numeric predictors, noise var 2.0, T init 0.5, with intercept.
  h2odata <- h2o.importFile(locate("smalldata/hglm_test/gaussian_0GC_allRC_2enum2numeric_2p0noise_p5T_wIntercept.gz"))
  yresp <- "response"
  random_columns <- c("C2", "C3", "C10", "C20")
  group_column <- "C1"
  # NOTE(review): test body is incomplete — no model is trained and no
  # expect_* assertions are made yet; the HGLM model build and coefficient
  # checks against reference outputs still need to be added.
}

doTest("Check HGLM model building and coefficient retrieval.", test.HGLMData1)


0 comments on commit 2b09a4e

Please sign in to comment.