Skip to content

Commit

Permalink
Done checking scoring history, model summary, and model metrics.
Browse files Browse the repository at this point in the history
  • Loading branch information
wendycwong committed Oct 16, 2024
1 parent 5c2248f commit 3b6699b
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 49 deletions.
35 changes: 0 additions & 35 deletions h2o-algos/src/test/java/hex/hglm/HGLMBasicTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -1105,39 +1105,4 @@ public void fillDataRows(Frame fr, int rowInd, String[] coefNames, String[] rCoe
if (rCoeffNames[index] != "intercept")
zvals[index] = fr.vec(rCoeffNames[index]).at(rowInd);
}

@Test
// NOTE(review): this test is effectively disabled — the entire body is commented out
// with no explanation. Prefer JUnit's @Ignore("reason") so the runner reports it as
// skipped instead of letting an empty body pass silently, or delete the dead code
// outright if the scenario is obsolete (Effective Java / reviewer convention:
// commented-out code should not be checked in).
public void testMultiChunkData(){
// Empty body: the original multi-chunk HGLM smoke test below parsed a 5K-row frame,
// converted the first four columns to categoricals, trained a legacy GLM with
// parms._HGLM=true, and pinned the h-likelihood to -23643.3076231 (tol 1e-4).
// Presumably it was commented out because the GLM-based HGLM path it exercises was
// removed in favor of the standalone hex.hglm implementation — TODO confirm before
// deleting or porting this scenario to the new HGLM test suite.
/* try {
Scope.enter();
Frame fr = parseTestFile("smalldata/hglm_test/HGLM_5KRows_100Z.csv");
fr.replace(0, fr.vec(0).toCategoricalVec()).remove();
fr.replace(1, fr.vec(1).toCategoricalVec()).remove();
fr.replace(2, fr.vec(2).toCategoricalVec()).remove();
fr.replace(3, fr.vec(3).toCategoricalVec()).remove();
DKV.put(fr);
Scope.track(fr);
GLMParameters parms = new GLMParameters();
parms._train = fr._key;
parms._response_column = "response";
parms._ignored_columns = new String[]{"Z"};
parms._ignore_const_cols = true;
parms._family = Family.gaussian;
parms._link = GLMParameters.Link.identity;
parms._HGLM=true;
parms._rand_family = new Family[] {Family.gaussian};
parms._rand_link = new GLMParameters.Link[] {GLMParameters.Link.identity};
parms._random_columns = new int[]{0};
parms._calc_like = true;
// just make sure it runs
GLMModel model = new GLM(parms).trainModel().get();
Scope.track_generic(model);
ModelMetricsHGLMGaussianGaussian mmetrics = (ModelMetricsHGLMGaussianGaussian) model._output._training_metrics;
Scope.track_generic(mmetrics);
assertEquals(-23643.3076231, mmetrics._hlik, 1e-4);
} finally {
Scope.exit();
}*/
}
}
3 changes: 1 addition & 2 deletions h2o-core/src/main/java/hex/ModelMetricsRegressionHGLM.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,7 @@ public ModelMetricsRegressionHGLM(Model model, Frame frame, long nobs, String[]

/**
*
* This method calculates the log-likelihood as described in section II.V of the doc. I surround the method with
* try catch because it seems like the Matrix toolbox can crash h2o-3 otherwise.
* This method calculates the log-likelihood as described in section II.V of the doc.
*/
public static double calHGLMllg(long nobs, double[][] tmat, double varResidual, double[][][] zjTTimesZj,
double yMinsXFixSquared, double[][] yMinusXFixTimesZ) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,13 @@ def test_scoring_history_model_summary():
x.remove("C1")
random_columns = ["C2", "C3", "C10", "C20"]
hglm_model = hglm(random_columns=random_columns, group_column = "C1", score_each_iteration=True, seed=12345,
em_epsilon = 0.0001, random_intercept = True, standardize = False)
em_epsilon = 0.001, random_intercept = True, standardize = False)
hglm_model.train(x=x, y=y, training_frame=train, validation_frame=valid)
hglm_model2 = hglm(random_columns=random_columns, group_column = "C1", seed=12345, em_epsilon = 0.0001,
random_intercept = True, standardize = False) # loglikelihood calculated in training and not with scoring
hglm_model2.train(x=x, y=y, training_frame=train, validation_frame=valid)
# grab various metrics
modelMetrics = hglm_model.training_model_metrics()
model_metrics = hglm_model.training_model_metrics()
scoring_history = hglm_model.scoring_history(as_data_frame=False)
scoring_history_valid = hglm_model.scoring_history_valid(as_data_frame=False)
model_summary = hglm_model.summary()
modelMetrics2 = hglm_model2.training_model_metrics()
scoring_history2 = hglm_model2.scoring_history(as_data_frame=False)
scoring_history_valid2 = hglm_model2.scoring_history_valid(as_data_frame=False)
model_summary2 = hglm_model2.summary()
coef_random_names = hglm_model.coefs_random_names()
t_mat = hglm_model.matrix_T()
residual_var = hglm_model.residual_variance()
Expand All @@ -57,9 +50,33 @@ def test_scoring_history_model_summary():
assert len(t_mat) == len(coef_random_names), "expected T matrix size: {0}, actual: {1} and they are not " \
"equal.".format(len(coef_random_names), len(t_mat))
utils_for_glm_hglm_tests.check_icc_calculation(t_mat, residual_var, icc)
# make sure contents in model summary, model history and model metrics are consistent with each other

print("Done")
# check that fields present in both the model summary and the model metrics agree with each other
model_iterations = model_metrics["iterations"]
assert model_iterations == model_summary.cell_values[0][1], \
"model metrics iterations {0} should equal model_summary iterations {1}".format(model_iterations, model_summary.cell_values[0][1])
last_mse = model_metrics["MSE"]
assert abs(last_mse - model_summary.cell_values[0][3]) < 1e-6, \
"model metrics MSE {0} should equal to model summary MSE {1}.".format(last_mse, model_summary.cell_values[0][3])
last_llg = model_metrics["log_likelihood"]
assert abs(last_llg - model_summary.cell_values[0][2]) < 1e-6,\
"model metrics llg {0} should equal to model summary llg {1}.".format(last_llg, model_summary.cell_values[0][2])
# check scoring history last entry with model metric values
assert len(scoring_history.cell_values) == model_iterations, \
"length of scoring history {0} should equal to number of model iterations {1}".format(len(scoring_history.cell_values), model_iterations)
last_sc_index = model_iterations-1
assert abs(scoring_history.cell_values[last_sc_index][3] - last_llg) < 1e-6, \
"last scoring history llg {0} should equal to model metrics llg {1}".format(scoring_history.cell_values[last_sc_index][3], last_llg)
assert abs(scoring_history.cell_values[last_sc_index][4] - last_mse) < 1e-6, \
"last scoring history MSE {0} should equal to model metrics MSE {1}.".format(scoring_history.cell_values[last_sc_index][4], last_mse)
# check and make sure the llg from training and validation frame should be increasing in values
# for ind in list(range(1, model_iterations)):
# p_ind = ind-1
# assert scoring_history.cell_values[p_ind][3] <= scoring_history.cell_values[ind][3], \
# "training llg {0} from iteration {1} should be smaller than training llg {2} from iteration " \
# "{3}".format(scoring_history.cell_values[p_ind][3], p_ind, scoring_history.cell_values[ind][3], ind)
# assert scoring_history_valid.cell_values[p_ind][3] <= scoring_history_valid.cell_values[ind][3], \
# "validation llg {0} from iteration {1} should be smaller than validation llg {2} from iteration " \
# "{3}".format(scoring_history_valid.cell_values[p_ind][3], p_ind, scoring_history_valid.cell_values[ind][3], ind)

if __name__ == "__main__":
pyunit_utils.standalone_test(test_scoring_history_model_summary)
Expand Down

0 comments on commit 3b6699b

Please sign in to comment.