continue to fix metrics passing to clients
wendycwong committed Oct 3, 2024
1 parent d6a969c commit 952baff
Showing 9 changed files with 94 additions and 25 deletions.
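
This commit threads a new `gen_syn_data` flag from the Java backend (HGLM, HGLMScore) through the REST schema (HGLMV3) and the Python/R clients, alongside fixes to the coefficient tables passed back to clients. A minimal sketch of the intended client-side workflow, assuming a running H2O cluster; the file path and column names below are illustrative, only the parameter names come from this diff:

```python
import h2o
from h2o.estimators import H2OHGLMEstimator

h2o.init()
train = h2o.import_file("path/to/hglm_data.csv")   # illustrative path
train["group"] = train["group"].asfactor()         # illustrative column names

# With gen_syn_data=True the backend forces max_iterations to 0 and predict()
# returns X*beta + Z*ubeta plus a Gaussian draw scaled by tau_e_var_init,
# i.e. synthetic responses rather than fitted values.
hglm = H2OHGLMEstimator(group_column="group", random_columns=["x1"],
                        tau_u_var_init=0.08, tau_e_var_init=0.06,
                        random_intercept=False, gen_syn_data=True)
hglm.train(x=["x1", "x2"], y="response", training_frame=train)
synthetic_response = hglm.predict(train)
```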
8 changes: 7 additions & 1 deletion h2o-algos/src/main/java/hex/hglm/HGLM.java
@@ -180,6 +180,12 @@ else if (!trainFrame.vec(_parms._group_column).isCategorical())
if (!goodRandomColumns)
error("random_columns", " can only contain columns in the training frame.");
}

if (_parms._gen_syn_data) {
_parms._max_iterations = 0;
if (_parms._tau_e_var_init <= 0)
error("tau_e_var_init", "If gen_syn_data is true, tau_e_var_init must be > 0.");
}
}
}

@@ -232,7 +238,7 @@ public void computeImpl() {
}

private TwoDimTable generateSummary(HGLMModel.HGLMModelOutput modelOutput) {
String[] names = new String[]{"iteration", "loglikelihood", "loglikelihood_valid"};
String[] names = new String[]{"iteration", "loglikelihood", "loglikelihood2"};
String[] types = new String[]{"int", "double", "double"};
String[] formats = new String[]{"%d", "%.5f", "%.5f"};
TwoDimTable summary = new TwoDimTable("HGLM Model", "summary", new String[]{""}, names, types, formats, "");
21 changes: 21 additions & 0 deletions h2o-algos/src/main/java/hex/hglm/HGLMModel.java
@@ -10,6 +10,7 @@
import water.udf.CFuncRef;

import java.io.Serializable;
import java.util.Arrays;

import static hex.glm.GLMModel.GLMParameters.Family.gaussian;
import static hex.hglm.HGLMModel.HGLMParameters.Method.EM;
@@ -102,6 +103,7 @@ public static class HGLMParameters extends Model.Parameters {
public boolean _showFixedMatVecs = false; // internal parameter, if true, will show AfjTY, ArjTY, ArjTArj, AfjTAfj, AfjTArj
public int _score_iteration_interval = 5;
public boolean _score_each_iteration = false;
public boolean _gen_syn_data = false;

@Override
public String algoName() {
@@ -317,4 +319,23 @@ protected AutoBuffer writeAll_impl(AutoBuffer ab) {
protected Keyed readAll_impl(AutoBuffer ab, Futures fs) {
return super.readAll_impl(ab, fs);
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append(super.toString());
sb.append(" loglikelihood: "+this._output._log_likelihood);
sb.append(" fixed effect coefficients: "+ Arrays.toString(this._output._beta));
int numLevel2 = this._output._ubeta.length;
for (int index=0; index<numLevel2; index++)
sb.append(" standard error of random effects for level 2 index " + index + ": "+this._output._tmat[index][index]);
/* sb.append(" standard error of residual error: "+_var_residual);
sb.append(" ICC: "+ Arrays.toString(_icc));
sb.append(" loglikelihood: "+_log_likelihood);
sb.append(" iterations taken to build model: " + _iterations);
sb.append(" coefficients for fixed effect: "+Arrays.toString(_beta));
for (int index=0; index<numLevel2; index++)
sb.append(" coefficients for random effect for level 2 index: "+index+": "+Arrays.toString(_ubeta[index]));*/
return sb.toString();
}
}
6 changes: 3 additions & 3 deletions h2o-algos/src/main/java/hex/hglm/HGLMScore.java
@@ -50,8 +50,8 @@ public HGLMScore(final Job j, final HGLMModel model, DataInfo dinfo, final Strin
_dinfo = dinfo;
_computeMetrics = computeMetrics; // can be true only if the response column is available to calculate the loglikelihood
_makePredictions = makePredictions;
_beta = model._output._beta;
_ubeta = model._output._ubeta;
_beta = model._output._beta; // non-standardized/non-normalized coefficients
_ubeta = model._output._ubeta; // non-standardized/non-normalized coefficients
_predDomains = respDomain;
_nclass = model._output.nclasses();
_parms = model._parms;
@@ -130,7 +130,7 @@ public double[] scoreRow(DataInfo.Row r, double[] preds, double[] xji, double[]
fillInRandomRowValues(r, zji, _parms, _randomCatIndices, _randomNumIndices, _randomCatArrayStartIndices,
_predStartIndexRandom, _dinfo, _randomSlopeToo, _randomIntercept);
preds[0] = innerProduct(xji, _beta) + innerProduct(zji, _ubeta[level2Index]) + r.offset;
preds[0] = _parms._max_iterations == 0 ? preds[0]+randomObj.nextGaussian() : preds[0];
preds[0] = _parms._gen_syn_data ? preds[0]+randomObj.nextGaussian()*_parms._tau_e_var_init : preds[0];
return preds;
}
}
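
The scoring change above is the core of the synthetic-data path: the row prediction is the fixed-effect inner product plus the random-effect inner product for the row's level-2 group (plus offset), and, when `gen_syn_data` is set, a standard-normal draw multiplied by `_tau_e_var_init`. A rough NumPy sketch of the same arithmetic, assuming pre-expanded design rows (H2O's DataInfo row handling is omitted):

```python
import numpy as np

def score_row(xji, beta, zji, ubeta_level2, offset=0.0,
              gen_syn_data=False, tau_e_var_init=0.0,
              rng=np.random.default_rng()):
    # fixed effects + random effects for this row's level-2 group + offset
    pred = float(np.dot(xji, beta) + np.dot(zji, ubeta_level2) + offset)
    if gen_syn_data:
        # the diff multiplies the standard-normal draw by tau_e_var_init directly
        pred += rng.standard_normal() * tau_e_var_init
    return pred
```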
21 changes: 13 additions & 8 deletions h2o-algos/src/main/java/hex/schemas/HGLMModelV3.java
@@ -5,6 +5,7 @@
import water.api.schemas3.ModelOutputSchemaV3;
import water.api.schemas3.ModelSchemaV3;
import water.api.schemas3.TwoDimTableV3;
import water.util.Log;
import water.util.TwoDimTable;

import java.util.Arrays;
@@ -63,8 +64,10 @@ public HGLMModelOutputV3 fillFromImpl(HGLMModel.HGLMModelOutput impl) {
beta_normalized = impl._beta_normalized;
ubeta = impl._ubeta;
ubeta_normalized = impl._ubeta_normalized;
fixed_coefficients_table = new TwoDimTableV3();
fixed_coefficients_table.fillFromImpl(generateCoeffTable("fixed effect coefficients",
"HGLM fixed effect coefficients", beta, beta_normalized, fixed_coefficient_names));
random_coefficients_table = new TwoDimTableV3();
random_coefficients_table.fillFromImpl(generate2DCoeffTable("random effect coefficients",
"HGLM random effect coefficients", ubeta, ubeta_normalized, random_coefficient_names,
random_coefficient_names_normalized, impl._group_column_names));
@@ -75,8 +78,8 @@ public HGLMModelOutputV3 fillFromImpl(HGLMModel.HGLMModelOutput impl) {
public static TwoDimTable generateCoeffTable(String title1, String title2, double[] coeffs, double[] coeffs_normalized,
String[] coeffNames) {
String[] colnames = new String[] {"coefficient values", "standardized coefficient values"};
String[] colTypes = new String[] {"%.5f", "%.5f"};
String[] colFormats = new String[] {"double", "double"};
String[] colFormats = new String[] {"%.5f", "%.5f"};
String[] colTypes = new String[] {"double", "double"};
TwoDimTable tdt = new TwoDimTable(title1, title2, coeffNames, colnames, colTypes, colFormats, "names");
int tableLen = coeffs.length;
for (int index=0; index<tableLen; index++) {
@@ -110,14 +113,15 @@ public static TwoDimTable generate2DCoeffTable(String title1, String title2, dou
double[] fCoeffValues = flattenArray(coeffsUsed);
double[] fCoeffValuesNormalized = flattenArray(coeffsNormalizedUsed);
String[] fCoeffNames = extendCoeffNames(coeffNamesused, numLevel2Index);
String[] fLevel2Vals = extendLevel2Ind(level2Domain, coeffsUsed.length);
String[] fLevel2Vals = extendLevel2Ind(level2Domain, coeffsUsed[0].length);

String[] colnames = new String[] {"coefficient names", "coefficient values", "standardized coefficient values"};
String[] colTypes = new String[] {"%s", "%.5f", "%.5f"};
String[] colFormats = new String[] {"string", "double", "double"};
String[] colFormats = new String[] {"%s", "%.5f", "%.5f"};
String[] colTypes = new String[] {"string", "double", "double"};
TwoDimTable tdt = new TwoDimTable(title1, title2, fLevel2Vals, colnames, colTypes, colFormats, "names");
int tableLen = fCoeffValues.length;
int tableLen = fCoeffNames.length;
for (int index=0; index<tableLen; index++) {
Log.info("index "+index);
tdt.set(index, 0, fCoeffNames[index]);
tdt.set(index, 1, fCoeffValues[index]);
tdt.set(index, 2, fCoeffValuesNormalized[index]);
@@ -128,7 +132,8 @@ public static TwoDimTable generate2DCoeffTable(String title1, String title2, dou
public static String[] extendLevel2Ind(String[] level2Domain, int numCoeff) {
int levelIndNum = level2Domain.length;
String[][] extendedDomain = new String[levelIndNum][numCoeff];
for (int index=0; index<levelIndNum; index++) {
int extendLen = extendedDomain.length;
for (int index=0; index<extendLen; index++) {
Arrays.fill(extendedDomain[index], level2Domain[index]);
}
return flattenArray(extendedDomain);
@@ -139,7 +144,7 @@ public static String[] extendCoeffNames(String[] coeffNames, int numLevel2Ind) {
String[] extendedCoeffNames = new String[numCoeff*numLevel2Ind];
int indexStart;
for (int index=0; index<numLevel2Ind; index++) {
indexStart = index*numLevel2Ind;
indexStart = index*numCoeff;
System.arraycopy(coeffNames, 0, extendedCoeffNames, indexStart, numCoeff);
}
return extendedCoeffNames;
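
The fixes above correct how per-group random coefficients are flattened into a single table: level-2 labels are repeated once per coefficient, coefficient names are repeated once per level-2 group with a copy offset of `index * numCoeff` (not `index * numLevel2Ind`), and the row count follows the flattened names. A small Python sketch of the corrected layout, with hypothetical names and groups:

```python
coeff_names = ["intercept", "x1"]               # random coefficients per group
level2_domain = ["groupA", "groupB", "groupC"]  # level-2 groups

# extendCoeffNames: one block of names per level-2 group,
# each block written at offset index * len(coeff_names)
extended_names = coeff_names * len(level2_domain)

# extendLevel2Ind: each level-2 label repeated once per coefficient
extended_levels = [lvl for lvl in level2_domain for _ in coeff_names]

# the table has one row per flattened coefficient name (the corrected tableLen)
for row, (lvl, name) in enumerate(zip(extended_levels, extended_names)):
    print(row, lvl, name)  # 0 groupA intercept, 1 groupA x1, 2 groupB intercept, ...
```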
7 changes: 6 additions & 1 deletion h2o-algos/src/main/java/hex/schemas/HGLMV3.java
@@ -41,7 +41,8 @@ public static final class HGLMParametersV3 extends ModelParametersSchemaV3<HGLMM
"method",
"em_epsilon",
"random_intercept",
"group_column"
"group_column",
"gen_syn_data"
};

@API(help = "Perform scoring for every score_iteration_interval iterations.", level = Level.secondary)
@@ -123,5 +124,9 @@

@API(help="group_column is the column that is categorical and used to generate the groups in HGLM")
public String group_column;

@API(help="if true, add gaussian noise with variance specified in parms._tau_e_var_init.",
direction=Direction.INPUT, gridable=true)
public boolean gen_syn_data;
}
}
18 changes: 9 additions & 9 deletions h2o-algos/src/test/java/hex/hglm/HGLMBasicTest.java
@@ -732,7 +732,7 @@ public void testCoeffDeNNormalizationWithRandomIntercept() {
.build();
Scope.track(ubetaFrameStandardize);

Frame fr = parseTestFile("smalldata/glm_test/semiconductor.csv");
Frame fr = parseTestFile("smalldata/hglm_test/semiconductor.csv");
fr.replace(0, fr.vec(0).toCategoricalVec()).remove();
DKV.put(fr);
Scope.track(fr);
@@ -811,7 +811,7 @@ public void testCoeffDeNNormalizationWORandomIntercept() {
.build();
Scope.track(ubetaFrameStandardize);

Frame fr = parseTestFile("smalldata/glm_test/semiconductor.csv");
Frame fr = parseTestFile("smalldata/hglm_test/semiconductor.csv");
fr.replace(0, fr.vec(0).toCategoricalVec()).remove();
DKV.put(fr);
Scope.track(fr);
@@ -892,7 +892,7 @@ public void testRandomInterceptOnly() {
public void testSemiconductor() {
try {
Scope.enter();
Frame fr = parseTestFile("smalldata/glm_test/semiconductor.csv");
Frame fr = parseTestFile("smalldata/hglm_test/semiconductor.csv");
fr.replace(0, fr.vec(0).toCategoricalVec()).remove();
DKV.put(fr);
Scope.track(fr);
@@ -947,15 +947,15 @@ public void testPredictionMetricsSumaryScoringHistoryWRIntercept() {
.build();
Scope.track(ubetaInitFrame);

Frame fr = parseTestFile("smalldata/glm_test/semiconductor.csv");
Frame validFr = parseTestFile("smalldata/glm_test/semiconductor.csv");
Frame fr = parseTestFile("smalldata/hglm_test/semiconductor.csv");
Frame validFr = parseTestFile("smalldata/hglm_test/semiconductor.csv");
fr.replace(0, fr.vec(0).toCategoricalVec()).remove();
validFr.replace(0, validFr.vec(0).toCategoricalVec()).remove();
DKV.put(fr);
DKV.put(validFr);
Scope.track(fr);
Scope.track(validFr);
SplitFrame sf = new SplitFrame(validFr, new double[]{0.5, 0.5}, new Key[]{Key.make("train.hex"), Key.make("test.hex")});
SplitFrame sf = new SplitFrame(validFr, new double[]{0.1, 0.9}, new Key[]{Key.make("train.hex"), Key.make("test.hex")});
sf.exec().get();
Key[] ksplits = sf._destination_frames;
Frame tr = DKV.get(ksplits[0]).get();
@@ -1017,8 +1017,8 @@ public void testPredictionMetricsSumaryScoringHistoryWORIntercept() {
.build();
Scope.track(ubetaInitFrame);

Frame fr = parseTestFile("smalldata/glm_test/semiconductor.csv");
Frame validFr = parseTestFile("smalldata/glm_test/semiconductor.csv");
Frame fr = parseTestFile("smalldata/hglm_test/semiconductor.csv");
Frame validFr = parseTestFile("smalldata/hglm_test/semiconductor.csv");
fr.replace(0, fr.vec(0).toCategoricalVec()).remove();
validFr.replace(0, validFr.vec(0).toCategoricalVec()).remove();
DKV.put(fr);
@@ -1110,7 +1110,7 @@ public void fillDataRows(Frame fr, int rowInd, String[] coefNames, String[] rCoe
public void testMultiChunkData(){
/* try {
Scope.enter();
Frame fr = parseTestFile("smalldata/glm_test/HGLM_5KRows_100Z.csv");
Frame fr = parseTestFile("smalldata/hglm_test/HGLM_5KRows_100Z.csv");
fr.replace(0, fr.vec(0).toCategoricalVec()).remove();
fr.replace(1, fr.vec(1).toCategoricalVec()).remove();
fr.replace(2, fr.vec(2).toCategoricalVec()).remove();
19 changes: 19 additions & 0 deletions h2o-py/h2o/estimators/hglm.py
@@ -51,6 +51,7 @@ def __init__(self,
em_epsilon=1e-06, # type: float
random_intercept=True, # type: bool
group_column=None, # type: Optional[str]
gen_syn_data=False, # type: bool
):
"""
:param model_id: Destination id for this model; auto-generated if not specified.
@@ -159,6 +160,9 @@ def __init__(self,
:param group_column: group_column is the column that is categorical and used to generate the groups in HGLM
Defaults to ``None``.
:type group_column: str, optional
:param gen_syn_data: if true, add gaussian noise with variance specified in parms._tau_e_var_init.
Defaults to ``False``.
:type gen_syn_data: bool
"""
super(H2OHGLMEstimator, self).__init__()
self._parms = {}
@@ -191,6 +195,7 @@ def __init__(self,
self.em_epsilon = em_epsilon
self.random_intercept = random_intercept
self.group_column = group_column
self.gen_syn_data = gen_syn_data

@property
def training_frame(self):
@@ -606,4 +611,18 @@ def group_column(self, group_column):
assert_is_type(group_column, None, str)
self._parms["group_column"] = group_column

@property
def gen_syn_data(self):
"""
if true, add gaussian noise with variance specified in parms._tau_e_var_init.
Type: ``bool``, defaults to ``False``.
"""
return self._parms.get("gen_syn_data")

@gen_syn_data.setter
def gen_syn_data(self, gen_syn_data):
assert_is_type(gen_syn_data, None, bool)
self._parms["gen_syn_data"] = gen_syn_data


@@ -21,7 +21,7 @@ def test_define_dataset():
family = 'gaussian' # can be any valid GLM families
nrow = 40000
nenum = 6
nreal = 6 # last one is the response
nreal = 1 # last one is the response
enum_columns = pyunit_utils.random_dataset_enums_only(nrow, nenum, factorL=8, misFrac=0.0)
real_columns = pyunit_utils.random_dataset_real_only(nrow, nreal, realR = 2, misFrac=0.0)
dataset = enum_columns.cbind(real_columns)
@@ -43,8 +43,13 @@ def generate_dataset(family, trainData, group_column, random_columns):
names_without_response = trainData.names
names_without_response.remove(myY)

startVal = [1.90118665, -1.26168122, 0.42931675, 0.98026587, 0.76808271, -0.63595311, 0.36327481, -0.30597578,
1.54715533, 1.39872770, -1.75658816, 1.31031573, -1.85003284, 1.33139505, 1.17457290, 1.84407102,
-0.07955216, 0.87777599, 0.07614022, 1.96488429, -0.52619981, -1.11206544, -0.55910850, -0.68860274,
0.61111377, 1.48083252, -0.46079518, -1.10481602, -1.32406489, 1.47580376, 0.66306257, -0.40125219,
0.70811714, -0.93184588, 1.52309741, -0.65421192, 0.83816616]
m = hglm(family=family, max_iterations=10, random_columns=random_columns, group_column=group_column,
tau_u_var_init = 0.5, tau_e_var_init = 0.6, random_intercept = True)
tau_u_var_init = 0.08, tau_e_var_init = 0.06, random_intercept = False, gen_syn_data=True, initial_fixed_effects=startVal)
m.train(training_frame=trainData, y = "response", x =myX)
f2 = m.predict(trainData)
finalDataset = trainData[names_without_response]
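
The edited test drives the generator end to end: a single real column stands in for the response, the fixed effects are pinned with `initial_fixed_effects=startVal`, `gen_syn_data=True` skips the EM iterations, and `predict()` supplies the synthetic responses in `f2`. A hedged sketch of the step that presumably follows, binding those predictions back as the response column (the frame methods are standard h2o-py calls, not taken from this diff):

```python
# f2[0] is the single prediction column, i.e. the synthetic responses
finalDataset = finalDataset.cbind(f2[0])
finalDataset.set_name(col=finalDataset.ncols - 1, name="response")
```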
10 changes: 9 additions & 1 deletion h2o-r/h2o-package/R/hglm.R
@@ -60,6 +60,8 @@
#' @param em_epsilon Converge if beta/ubeta/tmat/tauEVar changes less (using L-infinity norm) than em epsilon. ONLY applies to EM
#' method. Defaults to 1e-06.
#' @param random_intercept \code{Logical}. if true, will allow random component to the GLM coefficients. Defaults to TRUE.
#' @param gen_syn_data \code{Logical}. if true, add gaussian noise with variance specified in parms._tau_e_var_init. Defaults to
#' FALSE.
#' @examples
#' \dontrun{
#' library(h2o)
@@ -101,7 +103,8 @@ h2o.hglm <- function(x,
tau_e_var_init = 0,
method = c("EM"),
em_epsilon = 1e-06,
random_intercept = TRUE)
random_intercept = TRUE,
gen_syn_data = FALSE)
{
# Validate required training_frame first and other frame args: should be a valid key or an H2OFrame object
training_frame <- .validate.H2OFrame(training_frame, required=TRUE)
@@ -186,6 +189,8 @@ h2o.hglm <- function(x,
parms$random_intercept <- random_intercept
if (!missing(group_column))
parms$group_column <- group_column
if (!missing(gen_syn_data))
parms$gen_syn_data <- gen_syn_data

# Error check and build model
model <- .h2o.modelJob('hglm', parms, h2oRestApiVersion=3, verbose=FALSE)
@@ -219,6 +224,7 @@ h2o.hglm <- function(x,
method = c("EM"),
em_epsilon = 1e-06,
random_intercept = TRUE,
gen_syn_data = FALSE,
segment_columns = NULL,
segment_models_id = NULL,
parallelism = 1)
@@ -308,6 +314,8 @@ h2o.hglm <- function(x,
parms$random_intercept <- random_intercept
if (!missing(group_column))
parms$group_column <- group_column
if (!missing(gen_syn_data))
parms$gen_syn_data <- gen_syn_data

# Build segment-models specific parameters
segment_parms <- list()
