diff --git a/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java b/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java
index 5bcb85e6238f..e6418a0ba5ea 100644
--- a/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java
+++ b/h2o-algos/src/main/java/hex/glm/GLMMetricBuilder.java
@@ -189,7 +189,7 @@ public final long resDOF() {
 
   protected void computeAIC(GLMModel gm) {
     if (gm._parms._calc_like && gm._finalScoring) { // uses likelihood which is calculated for the final scoring
-      _aic = -2 * _log_likelihood + 2 * Arrays.stream(gm.beta()).filter(b -> b != 0).count();
+      _aic = 2 * _log_likelihood + 2 * Arrays.stream(gm.beta()).filter(b -> b != 0).count();
     } else { // original calculation for the model build
       _aic = 0;
       switch (_glmf._family) {
diff --git a/h2o-algos/src/test/java/hex/glm/GLMTestAICLikelihood.java b/h2o-algos/src/test/java/hex/glm/GLMTestAICLikelihood.java
index 40197ea0a5dd..99d864ee2490 100644
--- a/h2o-algos/src/test/java/hex/glm/GLMTestAICLikelihood.java
+++ b/h2o-algos/src/test/java/hex/glm/GLMTestAICLikelihood.java
@@ -61,7 +61,7 @@ public void testGaussianAICLikelihood() {
       }
       assertTrue("Dispersion parameter estimation from model: "+model._parms._dispersion_estimated+". Manual dispersion estimation: "+dispersion_estimated_manual+" and they are different.", Math.abs(dispersion_estimated_manual-model._parms._dispersion_estimated)<1e-6);
       assertTrue("Log likelihood from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood+". Manual loglikelihood: "+logLike+" and they are different.", Math.abs(logLike-((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood)<1e-6);
-      double aic = -2*logLike + 2*model._output.rank();
+      double aic = 2*logLike + 2*model._output.rank();
       assertTrue("AIC from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC+". Manual AIC: "+aic+" and they are different.", Math.abs(aic-((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC)<1e-6);
       System.out.println(((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood + " " + ((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC);
       System.out.println(logLike + " " + aic);
@@ -99,7 +99,7 @@ public void testBinomialAICLikelihood() {
         logLike += w * (yr * log(probabilityOf1) + (1-yr) * log(1 - probabilityOf1));
       }
       assertTrue("Log likelihood from model: "+((ModelMetricsBinomialGLM) model._output._training_metrics)._loglikelihood+". Manual loglikelihood: "+logLike+" and they are different.", Math.abs(logLike-((ModelMetricsBinomialGLM) model._output._training_metrics)._loglikelihood)<1e-6);
-      double aic = -2*logLike + 2*model._output.rank();
+      double aic = 2*logLike + 2*model._output.rank();
       assertTrue("AIC from model: "+((ModelMetricsBinomialGLM) model._output._training_metrics)._AIC+". Manual AIC: "+aic+" and they are different.", Math.abs(aic-((ModelMetricsBinomialGLM) model._output._training_metrics)._AIC)<1e-6);
       System.out.println(((ModelMetricsBinomialGLM) model._output._training_metrics)._loglikelihood + " " + ((ModelMetricsBinomialGLM) model._output._training_metrics)._AIC);
       System.out.println(logLike + " " + aic);
@@ -144,7 +144,7 @@ public void testQuasibinomialAICLikelihood() {
         }
       }
       assertTrue("Log likelihood from model: "+((ModelMetricsBinomialGLM) model._output._training_metrics)._loglikelihood+". Manual loglikelihood: "+logLike+" and they are different.", Math.abs(logLike-((ModelMetricsBinomialGLM) model._output._training_metrics)._loglikelihood)<1e-6);
-      double aic = -2*logLike + 2*model._output.rank();
+      double aic = 2*logLike + 2*model._output.rank();
       assertTrue("AIC from model: "+((ModelMetricsBinomialGLM) model._output._training_metrics)._AIC+". Manual AIC: "+aic+" and they are different.", Math.abs(aic-((ModelMetricsBinomialGLM) model._output._training_metrics)._AIC)<1e-6);
       System.out.println(((ModelMetricsBinomialGLM) model._output._training_metrics)._loglikelihood + " " + ((ModelMetricsBinomialGLM) model._output._training_metrics)._AIC);
       System.out.println(logLike + " " + aic);
@@ -186,7 +186,7 @@ public void testFractionalbinomialAICLikelihood() {
         }
       }
       assertTrue("Log likelihood from model: "+((ModelMetricsBinomialGLM) model._output._training_metrics)._loglikelihood+". Manual loglikelihood: "+logLike+" and they are different.", Math.abs(logLike-((ModelMetricsBinomialGLM) model._output._training_metrics)._loglikelihood)<1e-6);
-      double aic = -2*logLike + 2*model._output.rank();
+      double aic = 2*logLike + 2*model._output.rank();
       assertTrue("AIC from model: "+((ModelMetricsBinomialGLM) model._output._training_metrics)._AIC+". Manual AIC: "+aic+" and they are different.", Math.abs(aic-((ModelMetricsBinomialGLM) model._output._training_metrics)._AIC)<1e-6);
       System.out.println(((ModelMetricsBinomialGLM) model._output._training_metrics)._loglikelihood + " " + ((ModelMetricsBinomialGLM) model._output._training_metrics)._AIC);
       System.out.println(logLike + " " + aic);
@@ -225,7 +225,7 @@ public void testPoissonAICLikelihood() {
         }
       }
       assertTrue("Log likelihood from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood+". Manual loglikelihood: "+logLike+" and they are different.", Math.abs(logLike-((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood)<1e-6);
-      double aic = -2*logLike + 2*model._output.rank();
+      double aic = 2*logLike + 2*model._output.rank();
       assertTrue("AIC from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC+". Manual AIC: "+aic+" and they are different.", Math.abs(aic-((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC)<1e-6);
       System.out.println(((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood + " " + ((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC);
       System.out.println(logLike + " " + aic);
@@ -264,7 +264,7 @@ public void testNegativeBinomialAICLikelihood() {
             + log(Gamma.gamma(yr + 1/inv_theta_estimated) / (Gamma.gamma(yr + 1) * Gamma.gamma(1/inv_theta_estimated)));
       }
       assertTrue("Log likelihood from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood+". Manual loglikelihood: "+logLike+" and they are different.", Math.abs(logLike-((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood)<1e-6);
-      double aic = -2*logLike + 2*model._output.rank();
+      double aic = 2*logLike + 2*model._output.rank();
       assertTrue("AIC from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC+". Manual AIC: "+aic+" and they are different.", Math.abs(aic-((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC)<1e-6);
       System.out.println(((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood + " " + ((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC);
       System.out.println(logLike + " " + aic);
@@ -306,7 +306,7 @@ public void testGammaAICLikelihood() {
         }
       }
       assertTrue("Log likelihood from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood+". Manual loglikelihood: "+logLike+" and they are different.", Math.abs(logLike-((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood)<1e-6);
-      double aic = -2*logLike + 2*model._output.rank();
+      double aic = 2*logLike + 2*model._output.rank();
       assertTrue("AIC from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC+". Manual AIC: "+aic+" and they are different.", Math.abs(aic-((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC)<1e-6);
       System.out.println(((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood + " " + ((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC);
       System.out.println(logLike + " " + aic);
@@ -346,7 +346,7 @@ public void testMultinomialAICLikelihood() {
         logLike += log(predictedProbabilityOfActualClass);
       }
       assertTrue("Log likelihood from model: "+((ModelMetricsBinomialGLM.ModelMetricsMultinomialGLM) model._output._training_metrics)._loglikelihood+". Manual loglikelihood: "+logLike+" and they are different.", Math.abs(logLike-((ModelMetricsBinomialGLM.ModelMetricsMultinomialGLM) model._output._training_metrics)._loglikelihood)<1e-6);
-      double aic = -2*logLike + 2*model._output.rank();
+      double aic = 2*logLike + 2*model._output.rank();
       assertTrue("AIC from model: "+((ModelMetricsBinomialGLM.ModelMetricsMultinomialGLM) model._output._training_metrics)._AIC+". Manual AIC: "+aic+" and they are different.", Math.abs(aic-((ModelMetricsBinomialGLM.ModelMetricsMultinomialGLM) model._output._training_metrics)._AIC)<1e-6);
       System.out.println(((ModelMetricsBinomialGLM.ModelMetricsMultinomialGLM) model._output._training_metrics)._loglikelihood + " " + ((ModelMetricsBinomialGLM.ModelMetricsMultinomialGLM) model._output._training_metrics)._AIC);
       System.out.println(logLike + " " + aic);
@@ -378,7 +378,7 @@ public void testTweedieAICLikelihood() {
       // only check that loglikelihood is calculated
       double logLike = ((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood;
       assertNotEquals("Log likelihood from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood, 0.0, logLike);
-      double aic = -2*logLike + 2*model._output.rank();
+      double aic = 2*logLike + 2*model._output.rank();
       assertTrue("AIC from model: "+((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC+". Manual AIC: "+aic+" and they are different.", Math.abs(aic-((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC)<1e-6);
       System.out.println(((ModelMetricsRegressionGLM) model._output._training_metrics)._loglikelihood + " " + ((ModelMetricsRegressionGLM) model._output._training_metrics)._AIC);
       System.out.println(logLike + " " + aic);
diff --git a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8947_GLM_likelihoods_aic.R b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8947_GLM_likelihoods_aic.R
index d4ed423950f8..dc139c1fbedb 100644
--- a/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8947_GLM_likelihoods_aic.R
+++ b/h2o-r/tests/testdir_algos/glm/runit_PUBDEV_8947_GLM_likelihoods_aic.R
@@ -16,7 +16,7 @@ test.glm.aic.likelihood <- function() {
   dev <- 157465
   rRank <- 9
   loglikeR <- -0.5*((nobs-1) + nobs*log(2*pi*dev/(nobs-1)))
-  aicR <- -2*loglikeR+2*rRank
+  aicR <- 2*loglikeR+2*rRank
   perf <- h2o.performance(model.h2o.gaussian.identity)
   print("GLM Gaussian")
   print("H2O AIC")
diff --git a/h2o-r/tests/testdir_algos/glm/runit_gh_15891_tweedie_aic.R b/h2o-r/tests/testdir_algos/glm/runit_gh_15891_tweedie_aic.R
new file mode 100644
index 000000000000..40e09578e130
--- /dev/null
+++ b/h2o-r/tests/testdir_algos/glm/runit_gh_15891_tweedie_aic.R
@@ -0,0 +1,85 @@
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source("../../../scripts/h2o-r-test-setup.R")
+
+test_glm_tweedies <- function() {
+  require(statmod)
+  require(tweedie)
+  if (requireNamespace("tweedie")) {
+    num_rows <- 1000
+    num_cols <- 5
+    f1 <- random_dataset_real_only(num_rows, num_cols) # generate dataset containing the predictors.
+    f1R <- as.data.frame(h2o.abs(f1))
+    weights <- c(0.1, 0.2, 0.3, 0.4, 0.5, 1) # weights to generate the mean
+    mu <- generate_mean(f1R, num_rows, num_cols, weights)
+    pow <- c(1.1) # variance power range
+    phi <- c(1) # dispersion factor range
+    y <- "resp" # response column
+    x <- c("abs.C1.", "abs.C2.", "abs.C3.", "abs.C4.", "abs.C5.")
+    for (ind in c(1:length(pow))) { # generate dataset with each variance power and dispersion factor
+      trainF <- generate_dataset(f1R, num_rows, num_cols, pow[ind], phi[ind], mu)
+      print(paste("Compare H2O, R GLM model coefficients and standard error for var_power=", pow[ind], "link_power=", 1 - pow[ind], sep = " "))
+      compareH2ORGLM(pow[ind], 1 - pow[ind], x, y, trainF, as.data.frame(trainF), phi[ind])
+    }
+  } else {
+    print("test_glm_tweedies is skipped. Need to install tweedie package.")
+  }
+}
+
+generate_dataset <- function(f1R, numRows, numCols, pow, phi, mu) {
+  resp <- tweedie::rtweedie(numRows, xi = pow, mu, phi, power = pow)
+  f1h2o <- as.h2o(f1R)
+  resph2o <- as.h2o(as.data.frame(resp))
+  finalFrame <- h2o.cbind(f1h2o, resph2o)
+  return(finalFrame)
+}
+
+generate_mean <- function(f1R, numRows, numCols, weights) {
+  y <- c(1:numRows)
+  for (rowIndex in c(1:numRows)) {
+    tempResp = 0.0
+    for (colIndex in c(1:numCols)) {
+      tempResp = tempResp + weights[colIndex] * f1R[rowIndex, colIndex]
+    }
+    y[rowIndex] = tempResp
+  }
+  return(y)
+}
+
+compareH2ORGLM <-
+  function(vpower, lpower, x, y, hdf, df, truedisp, tolerance = 2e-4) {
+    print("Define formula for R")
+    formula <- (df[, "resp"] ~ .)
+    rmodel <- glm(
+      formula = formula,
+      data = df[, x],
+      family = tweedie(var.power = vpower, link.power =
+                         lpower),
+      na.action = na.omit
+    )
+    rAIC <- AICtweedie(rmodel)
+    h2omodel <-
+      h2o.glm(
+        x = x,
+        y = y,
+        training_frame = hdf,
+        family = "tweedie",
+        link = "tweedie",
+        tweedie_variance_power = vpower,
+        tweedie_link_power = lpower,
+        alpha = 0.5,
+        lambda = 0,
+        nfolds = 0,
+        compute_p_values = TRUE,
+        calc_like = TRUE,
+        fix_tweedie_variance_power = TRUE
+      )
+    h2oAIC <- h2o.aic(h2omodel)
+    print("Comparing H2O and R GLM model AIC.")
+    print("R AIC")
+    print(rAIC)
+    print("h2o model AIC")
+    print(h2oAIC)
+    expect_true(abs(rAIC - h2oAIC) / h2oAIC < 1e-2)
+  }
+
+doTest("Comparison of H2O to R TWEEDIE family AIC with tweedie dataset", test_glm_tweedies)
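
For reference, the new runit test reduces to a single check: h2o.aic() on a Tweedie GLM fit with calc_like = TRUE should agree with tweedie::AICtweedie() on the equivalent R glm fit, to within 1% relative error. Below is a condensed, standalone sketch of that comparison; the simulated data, seed, and coefficients are illustrative only and are not taken from the test's random_dataset_real_only helper.

# Condensed sketch of the H2O-vs-R Tweedie AIC comparison (illustrative data)
library(h2o)
library(statmod)   # tweedie() GLM family
library(tweedie)   # rtweedie(), AICtweedie()
h2o.init()

set.seed(42)
n  <- 500
x1 <- runif(n)
x2 <- runif(n)
mu <- exp(0.5 + 0.3 * x1 + 0.7 * x2)
df <- data.frame(x1 = x1, x2 = x2,
                 resp = tweedie::rtweedie(n, xi = 1.1, mu = mu, phi = 1))

# R reference fit: statmod's tweedie family, AIC via tweedie::AICtweedie()
rfit <- glm(resp ~ x1 + x2, data = df,
            family = statmod::tweedie(var.power = 1.1, link.power = 1 - 1.1))
rAIC <- tweedie::AICtweedie(rfit)

# H2O fit with the same variance/link powers; calc_like = TRUE requests the
# likelihood-based AIC that computeAIC() produces at final scoring
hfit <- h2o.glm(x = c("x1", "x2"), y = "resp", training_frame = as.h2o(df),
                family = "tweedie", link = "tweedie",
                tweedie_variance_power = 1.1, tweedie_link_power = 1 - 1.1,
                lambda = 0, calc_like = TRUE)
print(c(R = rAIC, h2o = h2o.aic(hfit)))

The full runit test additionally fixes the variance power (fix_tweedie_variance_power = TRUE) and requests p-values; those options are omitted here to keep the sketch minimal.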