From 09454399f99f74685449a1b401760e0a22d1464e Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Thu, 15 Aug 2024 11:11:30 +0200 Subject: [PATCH 1/3] Fixes response leakage --- NEWS.md | 4 ++++ R/methods.R | 5 ++--- R/missRanger.R | 7 ++++--- cran-comments.md | 4 +++- tests/testthat/test-missRanger.R | 3 +++ vignettes/multiple_imputation.Rmd | 14 +++++++------- vignettes/working_with_censoring.Rmd | 18 +++++++++--------- 7 files changed, 32 insertions(+), 23 deletions(-) diff --git a/NEWS.md b/NEWS.md index 6a1cfc5..cebf0ad 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # missRanger 2.6.0 +### Major bug fix + +- Fixes a major bug introduced in version 2.3.0, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (Issue #77). + ### Major feature Out-of-sample application is now possible! Thanks to [@jeandigitale](https://github.com/jeandigitale) for pushing the idea in [#58](https://github.com/mayer79/missRanger/issues/58). diff --git a/R/methods.R b/R/methods.R index 4cadd45..df619fa 100644 --- a/R/methods.R +++ b/R/methods.R @@ -194,9 +194,8 @@ predict.missRanger <- function( stop("No random forests in 'object'. Use missRanger(, keep_forests = TRUE).") } - # Do we have a random forest for all variables with missings? - # This can fire only if the first iteration in missRanger() was the best, and only - # for maximal one variable. It is a rare case. + # Do we have a random forest for all variables with missings? If no, we don't repeat + # its univariate imputation. 
forests_missing <- setdiff(to_impute, names(object$forests)) if (length(forests_missing) > 0L) { if (verbose >= 1L) { diff --git a/R/missRanger.R b/R/missRanger.R index 64416f6..e94febe 100644 --- a/R/missRanger.R +++ b/R/missRanger.R @@ -295,8 +295,9 @@ missRanger <- function( for (v in to_impute) { v.na <- data_NA[, v] + xvars <- setdiff(completed, v) - if (length(completed) == 0L) { + if (length(xvars) == 0L) { data[[v]] <- imputeUnivariate(data[[v]]) } else { y <- data[[v]][!v.na] @@ -316,13 +317,13 @@ missRanger <- function( case.weights = if (!is.null(case.weights)) case.weights[!v.na], num.threads = num.threads, save.memory = save.memory, - x = data[!v.na, completed, drop = FALSE], + x = data[!v.na, xvars, drop = FALSE], y = y, verbose = verbose >= 1, ... ) - pred <- stats::predict(fit, data[v.na, completed, drop = FALSE])$predictions + pred <- stats::predict(fit, data[v.na, xvars, drop = FALSE])$predictions if (pmm.k >= 1L) { pred <- pmm(xtrain = fit$predictions, xtest = pred, ytrain = y, k = pmm.k) diff --git a/cran-comments.md b/cran-comments.md index 0f5eb9e..21aa02e 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -2,7 +2,9 @@ Hello CRAN -This update brings a long awaited feature of out-of-sample application of the imputation models. +This update +- fixes a major bug introduced in 2.3.0 +- and brings a long awaited feature of out-of-sample application of the imputation models. 
## R CMD check (local) diff --git a/tests/testthat/test-missRanger.R b/tests/testthat/test-missRanger.R index e2d50aa..711ca5a 100644 --- a/tests/testthat/test-missRanger.R +++ b/tests/testthat/test-missRanger.R @@ -245,6 +245,9 @@ test_that("formula interface works with unspecified left side", { expect_true(!anyNA(imp$data)) expect_true(.setequal(imp$to_impute, colnames(X_NA))) expect_equal(imp$impute_by, "int") + + # Prediction error is on its default 1 for int (issue #77) + expect_equal(imp$pred_errors[, "int"], rep(1, length(imp$mean_pred_errors))) }) test_that("dropping columns on left side leaves missing values", { diff --git a/vignettes/multiple_imputation.Rmd b/vignettes/multiple_imputation.Rmd index 63df68f..537446f 100644 --- a/vignettes/multiple_imputation.Rmd +++ b/vignettes/multiple_imputation.Rmd @@ -50,13 +50,13 @@ models <- lapply(filled, function(x) lm(Sepal.Length ~ ., x)) # Pool the results by mice summary(pooled_fit <- pool(models)) -# term estimate std.error statistic df p.value -# 1 (Intercept) 2.4600621 0.33998737 7.235746 86.25283 1.785004e-10 -# 2 Sepal.Width 0.4454417 0.10405609 4.280785 96.17676 4.406215e-05 -# 3 Petal.Length 0.7394242 0.08393401 8.809590 77.63584 2.620202e-13 -# 4 Petal.Width -0.1937151 0.17905818 -1.081856 80.36361 2.825524e-01 -# 5 Speciesversicolor -0.6785451 0.26812613 -2.530694 116.18041 1.272124e-02 -# 6 Speciesvirginica -0.8737822 0.37086417 -2.356071 110.15525 2.023735e-02 +# term estimate std.error statistic df p.value +# 1 (Intercept) 2.3343548 0.3244342 7.195157 97.08106 1.314353e-10 +# 2 Sepal.Width 0.4715273 0.1041384 4.527891 88.55776 1.848669e-05 +# 3 Petal.Length 0.7700316 0.0768588 10.018783 122.02953 1.321441e-17 +# 4 Petal.Width -0.2506538 0.1739537 -1.440922 88.10220 1.531513e-01 +# 5 Speciesversicolor -0.6648375 0.2940828 -2.260715 81.17797 2.645368e-02 +# 6 Speciesvirginica -0.9065327 0.4055137 -2.235517 79.87581 2.817491e-02 # Compare with model on original data summary(lm(Sepal.Length ~ ., data = 
iris)) diff --git a/vignettes/working_with_censoring.Rmd b/vignettes/working_with_censoring.Rmd index ab45129..6e5b83e 100644 --- a/vignettes/working_with_censoring.Rmd +++ b/vignettes/working_with_censoring.Rmd @@ -81,15 +81,15 @@ models <- lapply(filled, function(x) coxph(Surv(time, status) ~ . - surv, x)) # 4. Pool the results by mice summary(pooled_fit <- pool(models)) -# term estimate std.error statistic df p.value -# 1 trt 0.264601452 0.212828712 1.24326013 110.30869 2.164079e-01 -# 2 celltypesmallcell 0.789909124 0.284989547 2.77171262 113.68488 6.516937e-03 -# 3 celltypeadeno 1.114851697 0.306765748 3.63421179 113.01225 4.210453e-04 -# 4 celltypelarge 0.356374858 0.289111314 1.23265621 112.81945 2.202666e-01 -# 5 karno -0.031939872 0.005678831 -5.62437418 111.96501 1.388135e-07 -# 6 diagtime 0.003620720 0.008929001 0.40550108 99.98576 6.859756e-01 -# 7 age -0.007503755 0.009199070 -0.81570798 108.97973 4.164464e-01 -# 8 prior 0.002002572 0.023640459 0.08470952 112.81848 9.326425e-01 +# term estimate std.error statistic df p.value +# 1 trt 0.231154077 0.214672763 1.0767741 105.1514 2.840454e-01 +# 2 celltypesmallcell 0.805824737 0.285571376 2.8217980 114.1273 5.634607e-03 +# 3 celltypeadeno 1.130585762 0.306698637 3.6863084 113.3636 3.506786e-04 +# 4 celltypelarge 0.340627347 0.296740520 1.1478963 103.4753 2.536583e-01 +# 5 karno -0.030623274 0.005653790 -5.4164149 106.3603 3.806255e-07 +# 6 diagtime 0.001273007 0.009102230 0.1398566 108.7518 8.890320e-01 +# 7 age -0.005587627 0.009379064 -0.5957554 105.3053 5.526170e-01 +# 8 prior 0.005174395 0.023433186 0.2208148 112.4847 8.256369e-01 # Compare with the results on the original data summary(coxph(Surv(time, status) ~ ., veteran))$coefficients From acb4a0e5c321c71c112acbee680acd8557522ccb Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Thu, 15 Aug 2024 11:26:48 +0200 Subject: [PATCH 2/3] update 2024-08-15 11:26:48.786605 --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md 
b/NEWS.md index cebf0ad..3e06044 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ ### Major bug fix -- Fixes a major bug introduced in version 2.3.0, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (Issue #77). +- Fixes a major bug introduced in version 2.3.0, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (PR #78). ### Major feature From 65a87abe7fb89f84e68176da4e9a97b9ec757ede Mon Sep 17 00:00:00 2001 From: Michael Mayer Date: Thu, 15 Aug 2024 11:52:12 +0200 Subject: [PATCH 3/3] Update readme, news, revdep --- NEWS.md | 10 +++++++++- README.md | 14 +++++++------- revdep/README.md | 2 +- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/NEWS.md b/NEWS.md index 3e06044..51b873c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,15 @@ ### Major bug fix -- Fixes a major bug introduced in version 2.3.0, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (PR #78). +Fixes a major bug, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (PR #78). + +You can expect to get + +- different imputations, +- better and more logical imputations, +- less favorable reported OOB prediction errors, +- higher variability in multiple imputation settings.
+ ### Major feature diff --git a/README.md b/README.md index 732fa9b..47cf9f8 100644 --- a/README.md +++ b/README.md @@ -45,13 +45,13 @@ head(iris_NA) iris_filled <- missRanger(iris_NA, pmm.k = 5, num.trees = 100) head(iris_filled) -# Sepal.Length Sepal.Width Petal.Length Petal.Width Species -# 5.1 3.5 1.4 0.2 setosa -# 4.9 3.0 1.4 0.2 setosa -# 4.7 3.2 1.3 0.2 setosa -# 4.6 3.1 1.5 0.2 setosa -# 5.7 3.6 1.4 0.2 setosa -# 5.4 3.9 1.7 0.4 setosa +# Sepal.Length Sepal.Width Petal.Length Petal.Width Species +# 1 5.1 3.5 1.4 0.2 setosa +# 2 4.9 3.0 1.4 0.2 setosa +# 3 4.7 3.2 1.3 0.2 setosa +# 4 4.6 3.1 1.5 0.2 setosa +# 5 5.2 3.6 1.4 0.2 setosa +# 6 5.4 3.9 1.7 0.4 setosa ``` ## How it works diff --git a/revdep/README.md b/revdep/README.md index 9e5ca3b..8639072 100644 --- a/revdep/README.md +++ b/revdep/README.md @@ -10,7 +10,7 @@ |collate |German_Switzerland.utf8 | |ctype |German_Switzerland.utf8 | |tz |Europe/Zurich | -|date |2024-08-02 | +|date |2024-08-15 | |rstudio |2024.04.2+764 Chocolate Cosmos (desktop) | |pandoc |3.1.6 @ C:\Users\Michael\AppData\Local\Pandoc\pandoc.exe |