Skip to content

Commit

Permalink
added more python tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
wendycwong committed Oct 20, 2024
1 parent 2181a64 commit 8df305d
Show file tree
Hide file tree
Showing 6 changed files with 298 additions and 20 deletions.
20 changes: 15 additions & 5 deletions h2o-algos/src/main/java/hex/hglm/HGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@
import java.util.List;
import java.util.stream.Collectors;

import static hex.ModelMetricsRegressionHGLM.calHGLMllg;
import static hex.glm.GLMModel.GLMParameters.Family.gaussian;
import static hex.glm.GLMModel.GLMParameters.MissingValuesHandling.*;
import static hex.hglm.HGLMModel.HGLMParameters.Method.EM;
import static hex.hglm.HGLMUtils.*;
import static hex.hglm.MetricBuilderHGLM.calHGLMllg;
import static water.util.ArrayUtils.*;

public class HGLM extends ModelBuilder<HGLMModel, HGLMModel.HGLMParameters, HGLMModel.HGLMModelOutput> {
Expand Down Expand Up @@ -183,6 +183,12 @@ else if (!trainFrame.vec(_parms._group_column).isCategorical())
if (_parms._tau_e_var_init <= 0)
error("tau_e_var_init", "If gen_syn_data is true, tau_e_var_init must be > 0.");
}

if (!_parms._random_intercept && _parms._standardize)
warn("random_intercept and standardize",
"If random_intercept is false and standardize is true, model building process can be unstable" +
" due to the denormalization process which can create singular T matrix. If encounter singlar" +
" T matrix problem, set standardize to false in this case to ensure model building can finish.");
}
}

Expand Down Expand Up @@ -292,9 +298,9 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa
if (_parms._max_iterations > 0) {
// grab current value of fixed beta, tauEVar, tauUVar
double[] beta = _state.get_beta().clone();
double[][] ubeta = copy2DArray(_state.get_ubeta()); // keep to generate synthetic data.
double[][] ubeta;
double tauEVarE10 = _state.get_tauEVarE10();
double tauEVarE17 = _state.get_tauEVarE17();
double tauEVarE17;
double[][] tMat = copy2DArray(_state.get_T());
double[][][] cjInv;
double[][] tMatInv;
Expand All @@ -307,7 +313,8 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa
ubeta = estimateNewRandomEffects(cjInv, engineTask._ArjTYj, engineTask._ArjTAfj, beta);// new random coefficients
// M step
beta = estimateFixedCoeff(engineTask._AfTAftInv, engineTask._AfjTYjSum, engineTask._AfjTArj, ubeta);// new fixed coeficients
tMat = estimateNewtMat(ubeta, tauEVarE10, cjInv, engineTask._oneOverJ); // new tMat
tMat = estimateNewtMat(ubeta, tauEVarE10, cjInv, engineTask._oneOverJ); // provide better estimate of tauEVar
//tMat = generateNewTmat(ubeta);
// estimate new tauEVar,
HGLMTask.ResidualLLHTask rLlhE17 = new HGLMTask.ResidualLLHTask(_job, _parms, _dinfo, ubeta, _state.get_beta(),
engineTask); // use equation 17 of the doc
Expand All @@ -334,7 +341,10 @@ void fitEM(HGLMModel model, Job job, ScoringHistory scTrain, ScoringHistory scVa
}
}
} catch(Exception ex) { // will catch matrix singular during loglikelihood calculation
return;
if (iteration > 1) // some coefficients are valid, just return
return;
else
throw new RuntimeException(ex); // bad matrix from the start, no model is built.
}
}

Expand Down
5 changes: 2 additions & 3 deletions h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ public ModelMetricsRegressionHGLM(Model model, Frame frame, long nobs, double si
*
* This method calculates the log-likelihood as described in section II.V of the doc.
*/
public static double calHGLMllg(long nobs, double[][] tmat, double varResidual, double[][][] zjTTimesZj,
/* public static double calHGLMllg(long nobs, double[][] tmat, double varResidual, double[][][] zjTTimesZj,
double yMinsXFixSquared, double[][] yMinusXFixTimesZ) {
int numLevel2 = zjTTimesZj.length;
double[][] tmatInv = new Matrix(tmat).inverse().getArray();
Expand All @@ -65,7 +65,7 @@ public static double calHGLMllg(long nobs, double[][] tmat, double varResidual,
public static double[][] calInvTPZjTZ(double[][] tmatInv, double[][] zjTTimesZj, double oneOVar) {
return new Matrix(tmatInv).plus(new Matrix(zjTTimesZj).times(oneOVar)).getArray();
}
}*/

/***
*
Expand All @@ -89,7 +89,6 @@ public static double calHGLMllg2(long nobs, double[][] tmat, double varResidual,

/**
* See the doc section II.V, calculates G inverse + transpose(Z)*Z/var_e.
* @return
*/
public static double[][] calInnverV(double[][] gmat, double[][] zTTimesZ, double oneOVar) {
try {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import sys
sys.path.insert(1,"../../../")
import h2o
from tests import pyunit_utils
from h2o.estimators.hglm import H2OHGLMEstimator as hglm
from tests.pyunit_utils import utils_for_glm_hglm_tests

# in this test, we want to check the following WITHOUT standardization and WITHOUT a random
# intercept (the model below is built with random_intercept = False and standardize = False):
# 1. scoring history (both training and valid)
# 2. the model summary
# 3. fixed effect coefficients, normal and standardized
# 4. icc
# 5. residual variance
def test_scoring_history_model_summary():
h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_123R_6enum_5num_1p5oise_p08T_woIntercept_standardize.gz"))
train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234)
y = "response"
x = h2o_data.names
x.remove("response")
x.remove("C1")
random_columns = ["C2", "C3", "C4"]
hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345,
max_iterations = 20, random_intercept = False, standardize=False)
hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid)
# grab various metrics
model_metrics = hglm_model.training_model_metrics()
scoring_history = hglm_model.scoring_history(as_data_frame=False)
scoring_history_valid = hglm_model.scoring_history_valid(as_data_frame=False)
model_summary = hglm_model.summary()
coef_random_names = hglm_model.coefs_random_names()
t_mat = hglm_model.matrix_T()
residual_var = hglm_model.residual_variance()
mse = hglm_model.mse()
mse_fixed = hglm_model.mean_residual_fixed()
mse_fixed_valid = hglm_model.mean_residual_fixed(train=False)
icc = hglm_model.icc()
level2_names = hglm_model.level_2_names()

# check to make sure metrics/coefficients make sense
residual_var_true = 1.5
assert abs(residual_var-residual_var_true) < 0.05, \
"Expected variance: {1}, actual: {0}. The difference is too big.".format(residual_var, residual_var_true)
# residual error taking account into only fixed effect coefficients should be greater than mse, mse_valid
assert mse < mse_fixed, "residual error with only fixed effects {0} should exceed that of mse {1} but is" \
" not.".format(mse_fixed, mse)
assert mse < mse_fixed_valid, "residual error with only fixed effects from validation frames {0} should exceed that" \
" of mse {1} but is not.".format(mse_fixed_valid, mse)
# make sure level 2 values are captured correctly
group2_value = train["C1"].unique()
utils_for_glm_hglm_tests.compare_list_h2o_frame(level2_names, group2_value, "C1.")
# assert icc is calculated correctly.
assert len(t_mat) == len(coef_random_names), "expected T matrix size: {0}, actual: {1} and they are not " \
"equal.".format(len(coef_random_names), len(t_mat))
utils_for_glm_hglm_tests.check_icc_calculation(t_mat, residual_var, icc)
# check model summary and model metrics if contain the same information should equal to each other
model_iterations = model_metrics["iterations"]
assert model_iterations == model_summary.cell_values[0][1], \
"model metrics iterations {0} should equal model_summary iterations {1}".format(model_iterations, model_summary.cell_values[0][1])
last_mse = model_metrics["MSE"]
assert abs(last_mse - model_summary.cell_values[0][3]) < 1e-6, \
"model metrics MSE {0} should equal to model summary MSE {1}.".format(last_mse, model_summary.cell_values[0][3])
last_llg = model_metrics["log_likelihood"]
assert abs(last_llg - model_summary.cell_values[0][2]) < 1e-6,\
"model metrics llg {0} should equal to model summary llg {1}.".format(last_llg, model_summary.cell_values[0][2])
# check scoring history last entry with model metric values
assert len(scoring_history.cell_values) == model_iterations, \
"length of scoring history {0} should equal to number of model iterations {1}".format(len(scoring_history.cell_values), model_iterations)
last_sc_index = model_iterations-1
assert abs(scoring_history.cell_values[last_sc_index][3] - last_llg) < 1e-6, \
"last scoring history llg {0} should equal to model metrics llg {1}".format(scoring_history.cell_values[last_sc_index][3], last_llg)
assert abs(scoring_history.cell_values[last_sc_index][4] - last_mse) < 1e-6, \
"last scoring history MSE {0} should equal to model metrics MSE {1}.".format(scoring_history.cell_values[last_sc_index][4], last_mse)
# check and make sure the llg from training and validation frame should be increasing in values
# this is only true when the true residual variance is high. For low true residual variance, it is only
# true for the last few iterations when the residual variance estimate is close to the true residual variance
if (residual_var_true >= 2):
for ind in list(range(1, model_iterations)):
p_ind = ind-1
assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \
"training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \
"{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind)
assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \
"validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \
"{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind)

if __name__ == "__main__":
pyunit_utils.standalone_test(test_scoring_history_model_summary)
else:
test_scoring_history_model_summary()
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,17 @@
# 4. icc
# 5. residual variance
def test_scoring_history_model_summary():
h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_1267R_6enum_5num_p08oise_p08T_wIntercept_standardize.gz"))
h2o_data = h2o.import_file(path=pyunit_utils.locate("smalldata/hglm_test/gaussian_0GC_allRC_2enum2numeric_3noise_p08T_wIntercept_standardize.gz"))
train, valid = h2o_data.split_frame(ratios = [.8], seed = 1234)
y = "response"
x = h2o_data.names
x.remove("response")
x.remove("C1")
random_columns = ["C2", "C3", "C10", "C20"]
hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345,
max_iterations = 10, random_intercept = True)
random_intercept = True, standardize = False, max_iterations=10)
hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid)
print(hglm_model) # make sure this one works.
# grab various metrics
model_metrics = hglm_model.training_model_metrics()
scoring_history = hglm_model.scoring_history(as_data_frame=False)
Expand All @@ -36,8 +37,9 @@ def test_scoring_history_model_summary():
icc = hglm_model.icc()
level2_names = hglm_model.level_2_names()
# check to make sure metrics/coefficients make sense
# residual_var = 0.05
assert abs(residual_var-0.05) < 1.0e-3, "Expected variance: 0.05, actual: {0}. The difference is too big."
residual_var_true = 3.0
assert abs(residual_var-residual_var_true) < 0.05, \
"Expected variance: {1}}, actual: {0}. The difference is too big.".format(residual_var, residual_var_true)
# residual error taking account into only fixed effect coefficients should be greater than mse, mse_valid
assert mse < mse_fixed, "residual error with only fixed effects {0} should exceed that of mse {1} but is" \
" not.".format(mse_fixed, mse)
Expand Down Expand Up @@ -69,14 +71,14 @@ def test_scoring_history_model_summary():
assert abs(scoring_history.cell_values[last_sc_index][4] - last_mse) < 1e-6, \
"last scoring history MSE {0} should equal to model metrics MSE {1}.".format(scoring_history.cell_values[last_sc_index][4], last_mse)
# check and make sure the llg from training and validation frame should be increasing in values
# for ind in list(range(1, model_iterations)):
# p_ind = ind-1
# assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \
# "training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \
# "{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind)
# assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \
# "validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \
# "{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind)
for ind in list(range(1, model_iterations)):
p_ind = ind-1
assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \
"training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \
"{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind)
assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \
"validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \
"{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind)

if __name__ == "__main__":
pyunit_utils.standalone_test(test_scoring_history_model_summary)
Expand Down
Loading

0 comments on commit 8df305d

Please sign in to comment.