Skip to content

Commit

Permalink
clean out more bugs in coefficients de-normalization/normalization and model metrics calculation.
Browse files Browse the repository at this point in the history
  • Loading branch information
wendycwong committed Oct 5, 2024
1 parent ff87aa9 commit d883ece
Show file tree
Hide file tree
Showing 7 changed files with 112 additions and 49 deletions.
16 changes: 7 additions & 9 deletions h2o-algos/src/main/java/hex/hglm/HGLMModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -260,35 +260,33 @@ public void setModelOutputFields(ComputationStateHGLM state) {
_numLevel2Units = state.get_numLevel2Units();
_level2UnitIndex = state.get_level2UnitIndex();
_nobs = state._nobs;
if (state._parms._standardize) {
if (state._parms._standardize) { // for random coefficients, the names of random coefficients names may change
_beta_normalized = state.get_beta();
_ubeta_normalized = state.get_ubeta();
_beta = denormalizedOneBeta(_beta_normalized, _fixed_coefficient_names, _dinfo._adaptedFrame.names(),
state._parms.train(), true);
_ubeta = denormalizedUBeta(_ubeta_normalized, _random_coefficient_names, state._parms._random_columns,
state._parms.train(), state._parms._random_intercept);
if (state._parms._random_intercept)
_random_coefficient_names_normalized = _random_coefficient_names;
else
_random_coefficient_names_normalized = copyCoeffNames(_random_coefficient_names);

_random_coefficient_names_normalized = _random_coefficient_names.clone();
if (_ubeta_normalized[0].length < _ubeta[0].length) // added intercept term, need to add name to random coeff names
_random_coefficient_names = copyCoefAddIntercept(_random_coefficient_names_normalized);
} else {
_beta = state.get_beta();
_beta_normalized = normalizedOneBeta(_beta, _fixed_coefficient_names, _dinfo._adaptedFrame.names(),
state._parms.train(), true);
_ubeta = state.get_ubeta();
_ubeta_normalized = normalizedUBeta(_ubeta, _random_coefficient_names, state._parms._random_columns,
state._parms.train(), state._parms._random_intercept);
if (state._parms._random_intercept)
if (_ubeta[0].length == _ubeta_normalized[0].length)
_random_coefficient_names_normalized = _random_coefficient_names;
else
_random_coefficient_names_normalized = copyCoeffNames(_random_coefficient_names);
_random_coefficient_names_normalized = copyCoefAddIntercept(_random_coefficient_names);
}
_num_random_coeffs_normalized = _ubeta_normalized[0].length;
_num_random_coeffs = _ubeta[0].length;
}

public static String[] copyCoeffNames(String[] originalNames) {
public static String[] copyCoefAddIntercept(String[] originalNames) {
int nameLen = originalNames.length;
String[] longerNames = new String[nameLen+1];
System.arraycopy(originalNames, 0, longerNames, 0, nameLen);
Expand Down
60 changes: 46 additions & 14 deletions h2o-algos/src/main/java/hex/hglm/HGLMUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -141,23 +141,24 @@ public static double calTauEvar(double residualSquare, double tauEVar, double[][
public static double[] denormalizedOneBeta(double[] beta, String[] coeffNames, String[] colNames,
Frame train, boolean interceptPresent) {
int numRandomCoeff = beta.length;
int interceptIndex = interceptPresent ? numRandomCoeff - 1 : numRandomCoeff;
List<String> randomColList = Arrays.stream(colNames).collect(Collectors.toList());
double[] denormalizedUBeta = new double[interceptIndex + 1];
Map<String, Double> coefMean = new HashMap<>();
Map<String, Double> coefOStd = new HashMap<>();
Map<String, Double> coefMeanOStd = new HashMap<>();
List<String> randomColList = Arrays.stream(colNames).collect(Collectors.toList());
genMeanStd(coeffNames, randomColList, train, coefMean, coefOStd, coefMeanOStd, true);
int interceptIndex = interceptPresent ? numRandomCoeff - 1 : numRandomCoeff;
double[] denormalizedUBeta = new double[interceptIndex + 1];
if (interceptPresent)
denormalizedUBeta[interceptIndex] = beta[interceptIndex];

String coefName;

for (int coefInd = 0; coefInd < numRandomCoeff; coefInd++) {
coefName = coeffNames[coefInd];
if (randomColList.contains(coefName)) {
denormalizedUBeta[coefInd] = beta[coefInd] * coefOStd.get(coefName);
denormalizedUBeta[interceptIndex] -= denormalizedUBeta[coefInd] * coefMeanOStd.get(coefName);
if (randomColList.contains(coefName)) { // pick out the numerical columns
denormalizedUBeta[coefInd] = beta[coefInd] * coefOStd.get(coefName);
denormalizedUBeta[interceptIndex] -= denormalizedUBeta[coefInd] * coefMeanOStd.get(coefName);
} else {
denormalizedUBeta[coefInd] = beta[coefInd];
}
}
return denormalizedUBeta;
Expand All @@ -167,9 +168,14 @@ public static double[][] denormalizedUBeta(double[][] ubeta, String[] randomCoef
Frame train, boolean randomIntercept) {
int numLevel2 = ubeta.length;
double[][] denormalizedBeta = new double[numLevel2][];
for (int index=0; index<numLevel2; index++)
denormalizedBeta[index] = denormalizedOneBeta(ubeta[index], randomCoeffNames, randomColNames, train, randomIntercept);

boolean onlyEnumRandomCols = randomColAllEnum(train, randomColNames);
for (int index=0; index<numLevel2; index++) {
if (onlyEnumRandomCols)
denormalizedBeta[index] = ubeta[index].clone();
else
denormalizedBeta[index] = denormalizedOneBeta(ubeta[index], randomCoeffNames, randomColNames, train,
randomIntercept);
}
return denormalizedBeta;
}

Expand All @@ -190,9 +196,11 @@ public static double[] normalizedOneBeta(double[] beta, String[] coeffNames, Str
String coefName;
for (int coefInd=0; coefInd < numCoeff; coefInd++) {
coefName = coeffNames[coefInd];
if (colNamesList.contains(coefName)) {
normalizedBeta[coefInd] = beta[coefInd]*coefStd.get(coefName);
normalizedBeta[interceptIndex] += normalizedBeta[coefInd]*coefMeanOStd.get(coefName);
if (colNamesList.contains(coefName)) { // pick out numerical columns
normalizedBeta[coefInd] = beta[coefInd] * coefStd.get(coefName);
normalizedBeta[interceptIndex] += normalizedBeta[coefInd] * coefMeanOStd.get(coefName);
} else { // no change to enum columns
normalizedBeta[coefInd] = beta[coefInd];
}
}
return normalizedBeta;
Expand All @@ -205,8 +213,12 @@ public static double[][] normalizedUBeta(double[][] ubeta, String[] randomCoeffN
Frame train, boolean randomIntercept) {
int numLevel2 = ubeta.length;
double[][] normalizedUBeta = new double[numLevel2][];
boolean onlyEnumRandomCols = randomColAllEnum(train, randomColNames);
for (int index=0; index<numLevel2; index++) {
normalizedUBeta[index] = normalizedOneBeta(ubeta[index], randomCoeffNames, randomColNames, train, randomIntercept);
if (onlyEnumRandomCols)
normalizedUBeta[index] = ubeta[index].clone();
else
normalizedUBeta[index] = normalizedOneBeta(ubeta[index], randomCoeffNames, randomColNames, train, randomIntercept);
}
return normalizedUBeta;
}
Expand Down Expand Up @@ -257,6 +269,8 @@ public static boolean checkPositiveG(int numLevel2Units, double[][] tMat) {

public static void generateNonStandardizeZTZArjTArs(HGLMModel.HGLMParameters parms, HGLMModel model) {
if (parms._standardize) {
boolean orignalRandomIntercept = parms._random_intercept;
parms._random_intercept = true && !randomColAllEnum(parms.train(), parms._random_columns);
List<String> colNames = Arrays.asList(parms.train().names());
boolean hasWeights = model._parms._weights_column != null && colNames.contains(model._parms._weights_column);
boolean hasOffsets = model._parms._offset_column != null && colNames.contains(model._parms._offset_column);
Expand All @@ -269,9 +283,27 @@ public static void generateNonStandardizeZTZArjTArs(HGLMModel.HGLMParameters par
engineTask.doAll(dinfo._adaptedFrame);
model._output._arjtarj_score = engineTask._ArjTArj;
model._output._zttimesz_score = engineTask._zTTimesZ;
parms._random_intercept = orignalRandomIntercept;
} else {
model._output._arjtarj_score = model._output._arjtarj;
model._output._zttimesz_score = model._output._zttimesz;
}
}

/**
 * Re-estimates the T matrix (covariance of the random effects) from the current random
 * coefficient estimates: T = (1/J) * sum_j ubeta_j * ubeta_j^T, where J is the number of
 * level-2 units (rows of ubeta).
 *
 * @param ubeta random coefficients, one row per level-2 unit; rows must all have the same length
 * @return square matrix of size numRandCoeff x numRandCoeff
 * @throws IllegalArgumentException if ubeta is null or has no rows
 */
public static double[][] generateNewTmat(double[][] ubeta) {
  if (ubeta == null || ubeta.length == 0)
    throw new IllegalArgumentException("ubeta must contain at least one level-2 unit.");
  int numIndex2 = ubeta.length;
  double oneOverJ = 1.0 / numIndex2;
  int numRandCoeff = ubeta[0].length;
  double[][] newTmat = new double[numRandCoeff][numRandCoeff];
  for (int index = 0; index < numIndex2; index++) {
    // accumulate the outer product ubeta[index] * ubeta[index]^T into newTmat
    for (int r = 0; r < numRandCoeff; r++)
      for (int c = 0; c < numRandCoeff; c++)
        newTmat[r][c] += ubeta[index][r] * ubeta[index][c];
  }
  // scale the accumulated sum by 1/J to form the average
  for (int r = 0; r < numRandCoeff; r++)
    for (int c = 0; c < numRandCoeff; c++)
      newTmat[r][c] *= oneOverJ;
  return newTmat;
}

/**
 * Returns true if every random-effect column in the training frame is categorical (enum).
 * Vacuously true for an empty randomColumns array, matching the original count-based check.
 *
 * @param train frame containing the random columns
 * @param randomColumns names of the random-effect columns to check
 * @return true when all named columns are categorical
 */
public static boolean randomColAllEnum(Frame train, String[] randomColumns) {
  // allMatch short-circuits on the first non-categorical column, unlike
  // filter(...).count() == length which always scans every column
  return Arrays.stream(randomColumns).allMatch(x -> train.vec(x).isCategorical());
}
}
10 changes: 8 additions & 2 deletions h2o-algos/src/main/java/hex/hglm/MetricBuilderHGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import java.util.List;

import static hex.glm.GLMModel.GLMParameters.MissingValuesHandling.*;
import static hex.hglm.HGLMUtils.generateNewTmat;

public class MetricBuilderHGLM extends ModelMetricsSupervised.MetricBuilderSupervised<MetricBuilderHGLM> {
// the doc = document attached to https://github.com/h2oai/h2o-3/issues/8487, title HGLM_H2O_Implementation.pdf
Expand Down Expand Up @@ -63,10 +64,15 @@ public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame
HGLMModel hglmM = (HGLMModel) m;
ModelMetrics mm;
boolean forTraining = m._parms.train().getKey().equals(f.getKey());
double[][] tmat = hglmM._output._tmat;
if (hglmM._parms._standardize) {
tmat = generateNewTmat(hglmM._output._ubeta);
}
double mse = this._sse / hglmM._output._nobs;
if (forTraining) {
mm = new ModelMetricsRegressionHGLM(m, f, hglmM._output._nobs, this._domain, this.weightedSigma(),
this._customMetric, hglmM._output._iterations, hglmM._output._beta, hglmM._output._ubeta,
hglmM._output._tmat, hglmM._output._tauEVar, hglmM._output._zttimesz_score, this._sse / hglmM._output._nobs,
tmat, mse, hglmM._output._zttimesz_score, mse,
this._yMinusfixPredSquare / hglmM._output._nobs, hglmM._output._yminusxtimesz,
hglmM._output._arjtarj_score);
} else {
Expand All @@ -83,7 +89,7 @@ public ModelMetrics makeModelMetrics(Model m, Frame f, Frame adaptedFrame, Frame
engineTask.doAll(dinfo._adaptedFrame);
mm = new ModelMetricsRegressionHGLM(m, f, engineTask._nobs, this._domain, this.weightedSigma(),
this._customMetric, hglmM._output._iterations, hglmM._output._beta, hglmM._output._ubeta,
hglmM._output._tmat, hglmM._output._tauEVar, engineTask._zTTimesZ, this._sse/engineTask._nobs,
tmat, mse, engineTask._zTTimesZ, this._sse/engineTask._nobs,
this._yMinusfixPredSquare/engineTask._nobs, hglmM._output._yminusxtimesz_valid, engineTask._ArjTArj);
hglmM._output._nobs_valid = engineTask._nobs;
}
Expand Down
14 changes: 6 additions & 8 deletions h2o-algos/src/main/java/hex/schemas/HGLMModelV3.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import water.api.schemas3.ModelOutputSchemaV3;
import water.api.schemas3.ModelSchemaV3;
import water.api.schemas3.TwoDimTableV3;
import water.util.Log;
import water.util.TwoDimTable;

import java.util.Arrays;
Expand Down Expand Up @@ -91,17 +90,17 @@ public static TwoDimTable generateCoeffTable(String title1, String title2, doubl

public static TwoDimTable generate2DCoeffTable(String title1, String title2, double[][] coeffs, double[][] coeffs_normalized,
String[] coeffNames, String[] coeffNamesNormalized, String[] level2Domain) {
int coeffNamesLen = coeffNames.length;
int coefNameLenUsed = Math.max(coeffNamesLen, coeffNamesNormalized.length);
int randomCoeffNamesLen = coeffNames.length;
int randomCoefNameLenNorm = coeffNamesNormalized.length;
int numLevel2Index = level2Domain.length;
String[] coeffNamesused;
double[][] coeffsUsed;
double[][] coeffsNormalizedUsed;
if (coefNameLenUsed > coeffNamesLen) { // model building, added intercept to coeffs_normalized, extend coeffs
if (randomCoefNameLenNorm > randomCoeffNamesLen) { // model building, added intercept to coeffs_normalized, extend coeffs
coeffNamesused = coeffNamesNormalized;
coeffsUsed = addInterceptValue(coeffs);
coeffsUsed = addInterceptValue(coeffs);
coeffsNormalizedUsed = coeffs_normalized;
} else if (coefNameLenUsed < coeffNamesLen) { // model building with standardization, added intercept to coeffs, extend coeffsNormalized
} else if (randomCoefNameLenNorm < randomCoeffNamesLen) { // model building with standardization, added intercept to coeffs, extend coeffsNormalized
coeffNamesused = coeffNames;
coeffsUsed = coeffs;
coeffsNormalizedUsed = addInterceptValue(coeffs_normalized);
Expand All @@ -121,7 +120,6 @@ public static TwoDimTable generate2DCoeffTable(String title1, String title2, dou
TwoDimTable tdt = new TwoDimTable(title1, title2, fLevel2Vals, colnames, colTypes, colFormats, "names");
int tableLen = fCoeffNames.length;
for (int index=0; index<tableLen; index++) {
Log.info("index "+index);
tdt.set(index, 0, fCoeffNames[index]);
tdt.set(index, 1, fCoeffValues[index]);
tdt.set(index, 2, fCoeffValuesNormalized[index]);
Expand Down Expand Up @@ -157,7 +155,7 @@ public static double[][] addInterceptValue(double[][] coeffs) {
double[][] coeffsExt = new double[coeffs.length][coefLenNew];
for (int index=0; index<numLevel2Index; index++) {
System.arraycopy(coeffs[index], 0, coeffsExt[index], 0, coefLen);
coeffsExt[index][coefLen] = Double.NaN;
coeffsExt[index][coefLen] = 0.0;
}
return coeffsExt;
}
Expand Down
21 changes: 21 additions & 0 deletions h2o-algos/src/test/java/hex/hglm/HGLMUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,27 @@ public void checkCalTauEvar(int numLevel2, int numRandomCoeffs) {
assertEquals(newTauEVar, newTauEVarMat, 1e-6);
}

@Test
public void testGenerateNewTmat() {
  // exercise several (numLevel2, numRandCoeff) shapes: more units than coefficients,
  // square, and more coefficients than units
  int[][] shapes = {{2, 1}, {5, 5}, {10, 8}, {8, 15}};
  for (int[] shape : shapes)
    checkGenerateNewTmat(shape[0], shape[1]);
}

// Verifies generateNewTmat against a manual computation of (1/J) * sum_j u_j * u_j^T
// on a randomly generated coefficient matrix.
public void checkGenerateNewTmat(int numLevel2, int numRandCoeff) {
  double[][] randomEffects = genRandomMatrix(numLevel2, numRandCoeff, 123);
  double[][] actual = generateNewTmat(randomEffects);
  // accumulate the outer products u_j * u_j^T one level-2 unit at a time
  Matrix expected = new Matrix(new double[numRandCoeff][numRandCoeff]);
  for (int unit = 0; unit < numLevel2; unit++) {
    Matrix rowVec = new Matrix(new double[][]{randomEffects[unit]});
    expected = expected.plus(rowVec.transpose().times(rowVec));
  }
  // average over the number of level-2 units and compare
  double[][] expectedArr = expected.times(1.0 / numLevel2).getArray();
  TestUtil.checkDoubleArrays(expectedArr, actual, 1e-6);
}

@Test
public void testCumSum() {
checkCumSum(10, 5);
Expand Down
14 changes: 9 additions & 5 deletions h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,15 @@ public static double calHGLMllg2(long nobs, double[][] tmat, double varResidual,
* @return
*/
/**
 * Forms the inner V component: inv(G) + oneOVar * Z^T*Z.
 *
 * @param gmat the G (T) matrix; must be non-singular
 * @param zTTimesZ the Z^T*Z matrix; not mutated (a copy is scaled)
 * @param oneOVar one over the residual variance
 * @return inv(gmat) + oneOVar * zTTimesZ
 * @throws RuntimeException when gmat cannot be inverted (singular T matrix)
 */
public static double[][] calInnverV(double[][] gmat, double[][] zTTimesZ, double oneOVar) {
  try {
    double[][] gmatInv = new Matrix(gmat).inverse().getArray();
    double[][] tempzTTimesZ = copy2DArray(zTTimesZ); // copy so the caller's array is untouched
    ArrayUtils.mult(tempzTTimesZ, oneOVar);
    ArrayUtils.add(gmatInv, tempzTTimesZ);
    return gmatInv;
  } catch (Exception ex) {
    // chain the original exception as the cause instead of discarding it,
    // so the real inversion failure stays visible in the stack trace
    throw new RuntimeException("Tmat matrix is singular.", ex);
  }
}

public static ModelMetricsRegressionHGLM getFromDKV(Model model, Frame frame) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
def test_define_dataset():
family = 'gaussian' # can be any valid GLM families
nrow = 40000
nenum = 1
nreal = 6 # last one is the response
nenum = 6
nreal = 1 # last one is the response
enum_columns = pyunit_utils.random_dataset_enums_only(nrow, nenum, factorL=8, misFrac=0.0)
real_columns = pyunit_utils.random_dataset_real_only(nrow, nreal, realR = 2, misFrac=0.0)
dataset = enum_columns.cbind(real_columns)
Expand All @@ -44,20 +44,24 @@ def generate_dataset(family, trainData, group_column, random_columns):
names_without_response.remove(myY)
# to generate data in hglm_test/gaussian_0GC_123R_allNumeric_p05oise_p08T_wIntercept_standardize.csv, 1 cat, 5 numeric
# 1 response, seed = 12345
startVal = [1.9011867, -1.2616812, 0.4293167, 0.9802659, 0.7680827, -0.6359531]
# startVal = [1.9011867, -1.2616812, 0.4293167, 0.9802659, 0.7680827, -0.6359531]
# gaussian_0GC_123R_allNumeric_p05oise_p08T_wIntercept.csv for non-standardization
# gaussian_0GC_123R_allNumeric_p05oise_p08T_woIntercept_standardize.csv
# gaussian_0GC_123R_allNumeric_p05oise_p08T_woIntercept.csv

# used to generate
# startVal = [1.90118665, -1.26168122, 0.42931675, 0.98026587, 0.76808271, -0.63595311, 0.36327481, -0.30597578,
# 1.54715533, 1.39872770 -1.75658816, 1.31031573, -1.85003284, 1.33139505, 1.17457290, 1.84407102,
# -0.07955216, 0.87777599, 0.07614022, 1.96488429, -0.52619981, -1.11206544, -0.55910850, -0.68860274,
# 0.61111377, 1.48083252, -0.46079518, -1.10481602, -1.32406489, 1.47580376, 0.66306257, -0.40125219,
# 0.70811714, -0.93184588, 1.52309741, -0.65421192, 0.83816616]

# to generate data in hglm_test/gaussian_0GC_123R_allEnum_p05oise_p08T_wIntercept_standardize.csv, 6 cat, 0 numeric
# 1 response, seed = 12345
startVal = [0.7906251, 1.8005780, -3.5665564, -0.8804172, -1.5809320, 1.5188019, -1.6089287, 1.7509011,
-0.5286826, -1.1203812, -2.3159930, 0.1674759, -0.9065857, -0.7587694, -0.8578529, 0.3007900,
1.5765745, 1.1725489, -0.6935900, -1.1467158, 1.3960304, -1.7078175, -2.8960526, 0.9847858,
-1.0951275, 0.1393349, -0.6782085, 3.3711444, -2.0059428, 1.3293327, -0.5083064, 2.7324153,
0.2036385, -1.6967069, 0.699569, -0.4288891]
# hglm_test/gaussian_0GC_123R_allEnum_p05oise_p08T_wIntercept.csv
# hglm_test/gaussian_0GC_123R_allEnum_p05oise_p08T_woIntercept_standardize.csv
# hglm_test/gaussian_0GC_123R_allEnum_p05oise_p08T_woIntercept.csv
m = hglm(family=family, max_iterations=0, random_columns=random_columns, group_column=group_column,
tau_u_var_init = 0.08, tau_e_var_init = 0.05, random_intercept = False, gen_syn_data=True,
seed = 12345, standardize = True, initial_fixed_effects=startVal)
seed = 12345, standardize = False, initial_fixed_effects=startVal)
m.train(training_frame=trainData, y = "response", x =myX)
f2 = m.predict(trainData)
finalDataset = trainData[names_without_response]
Expand Down

0 comments on commit d883ece

Please sign in to comment.