From 09454399f99f74685449a1b401760e0a22d1464e Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Thu, 15 Aug 2024 11:11:30 +0200
Subject: [PATCH 1/3] Fixes response leakage

---
 NEWS.md                              |  4 ++++
 R/methods.R                          |  5 ++---
 R/missRanger.R                       |  7 ++++---
 cran-comments.md                     |  4 +++-
 tests/testthat/test-missRanger.R     |  3 +++
 vignettes/multiple_imputation.Rmd    | 14 +++++++-------
 vignettes/working_with_censoring.Rmd | 18 +++++++++---------
 7 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 6a1cfc5..cebf0ad 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,9 @@
 # missRanger 2.6.0
 
+### Major bug fix
+
+- Fixes a major bug introduced in version 2.3.0, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (Issue #77).
+
 ### Major feature
 
 Out-of-sample application is now possible! Thanks to [@jeandigitale](https://github.com/jeandigitale) for pushing the idea in [#58](https://github.com/mayer79/missRanger/issues/58).
diff --git a/R/methods.R b/R/methods.R
index 4cadd45..df619fa 100644
--- a/R/methods.R
+++ b/R/methods.R
@@ -194,9 +194,8 @@ predict.missRanger <- function(
     stop("No random forests in 'object'. Use missRanger(, keep_forests = TRUE).")
   }
   
-  # Do we have a random forest for all variables with missings?
-  # This can fire only if the first iteration in missRanger() was the best, and only
-  # for maximal one variable. It is a rare case.
+  # Do we have a random forest for every variable with missing values? If not, the
+  # univariate imputation of such a variable is not repeated.
   forests_missing <- setdiff(to_impute, names(object$forests))
   if (length(forests_missing) > 0L) {
     if (verbose >= 1L) {
diff --git a/R/missRanger.R b/R/missRanger.R
index 64416f6..e94febe 100644
--- a/R/missRanger.R
+++ b/R/missRanger.R
@@ -295,8 +295,9 @@ missRanger <- function(
 
     for (v in to_impute) {
       v.na <- data_NA[, v]
+      xvars <- setdiff(completed, v)
 
-      if (length(completed) == 0L) {
+      if (length(xvars) == 0L) {
         data[[v]] <- imputeUnivariate(data[[v]])
       } else {
         y <- data[[v]][!v.na]
@@ -316,13 +317,13 @@ missRanger <- function(
           case.weights = if (!is.null(case.weights)) case.weights[!v.na],
           num.threads = num.threads,
           save.memory = save.memory,
-          x = data[!v.na, completed, drop = FALSE],
+          x = data[!v.na, xvars, drop = FALSE],
           y = y,
           verbose = verbose >= 1,
           ...
         )
 
-        pred <- stats::predict(fit, data[v.na, completed, drop = FALSE])$predictions
+        pred <- stats::predict(fit, data[v.na, xvars, drop = FALSE])$predictions
         
         if (pmm.k >= 1L) {
           pred <- pmm(xtrain = fit$predictions, xtest = pred, ytrain = y, k = pmm.k)
diff --git a/cran-comments.md b/cran-comments.md
index 0f5eb9e..21aa02e 100644
--- a/cran-comments.md
+++ b/cran-comments.md
@@ -2,7 +2,9 @@
 
 Hello CRAN
 
-This update brings a long awaited feature of out-of-sample application of the imputation models.
+This update
+- fixes a major bug introduced in 2.3.0
+- and brings a long awaited feature of out-of-sample application of the imputation models.
 
 ## R CMD check (local)
 
diff --git a/tests/testthat/test-missRanger.R b/tests/testthat/test-missRanger.R
index e2d50aa..711ca5a 100644
--- a/tests/testthat/test-missRanger.R
+++ b/tests/testthat/test-missRanger.R
@@ -245,6 +245,9 @@ test_that("formula interface works with unspecified left side", {
   expect_true(!anyNA(imp$data))
   expect_true(.setequal(imp$to_impute, colnames(X_NA)))
   expect_equal(imp$impute_by, "int")
+  
+  # Prediction error is on its default 1 for int (issue #77)
+  expect_equal(imp$pred_errors[, "int"], rep(1, length(imp$mean_pred_errors)))
 })
 
 test_that("dropping columns on left side leaves missing values", {
diff --git a/vignettes/multiple_imputation.Rmd b/vignettes/multiple_imputation.Rmd
index 63df68f..537446f 100644
--- a/vignettes/multiple_imputation.Rmd
+++ b/vignettes/multiple_imputation.Rmd
@@ -50,13 +50,13 @@ models <- lapply(filled, function(x) lm(Sepal.Length ~ ., x))
 # Pool the results by mice
 summary(pooled_fit <- pool(models))
 
-#                term   estimate  std.error statistic        df      p.value
-# 1       (Intercept)  2.4600621 0.33998737  7.235746  86.25283 1.785004e-10
-# 2       Sepal.Width  0.4454417 0.10405609  4.280785  96.17676 4.406215e-05
-# 3      Petal.Length  0.7394242 0.08393401  8.809590  77.63584 2.620202e-13
-# 4       Petal.Width -0.1937151 0.17905818 -1.081856  80.36361 2.825524e-01
-# 5 Speciesversicolor -0.6785451 0.26812613 -2.530694 116.18041 1.272124e-02
-# 6  Speciesvirginica -0.8737822 0.37086417 -2.356071 110.15525 2.023735e-02
+#                term   estimate std.error statistic        df      p.value
+# 1       (Intercept)  2.3343548 0.3244342  7.195157  97.08106 1.314353e-10
+# 2       Sepal.Width  0.4715273 0.1041384  4.527891  88.55776 1.848669e-05
+# 3      Petal.Length  0.7700316 0.0768588 10.018783 122.02953 1.321441e-17
+# 4       Petal.Width -0.2506538 0.1739537 -1.440922  88.10220 1.531513e-01
+# 5 Speciesversicolor -0.6648375 0.2940828 -2.260715  81.17797 2.645368e-02
+# 6  Speciesvirginica -0.9065327 0.4055137 -2.235517  79.87581 2.817491e-02
 
 # Compare with model on original data
 summary(lm(Sepal.Length ~ ., data = iris))
diff --git a/vignettes/working_with_censoring.Rmd b/vignettes/working_with_censoring.Rmd
index ab45129..6e5b83e 100644
--- a/vignettes/working_with_censoring.Rmd
+++ b/vignettes/working_with_censoring.Rmd
@@ -81,15 +81,15 @@ models <- lapply(filled, function(x) coxph(Surv(time, status) ~ . - surv, x))
 # 4. Pool the results by mice
 summary(pooled_fit <- pool(models))
 
-#                term     estimate   std.error   statistic        df      p.value
-# 1               trt  0.264601452 0.212828712  1.24326013 110.30869 2.164079e-01
-# 2 celltypesmallcell  0.789909124 0.284989547  2.77171262 113.68488 6.516937e-03
-# 3     celltypeadeno  1.114851697 0.306765748  3.63421179 113.01225 4.210453e-04
-# 4     celltypelarge  0.356374858 0.289111314  1.23265621 112.81945 2.202666e-01
-# 5             karno -0.031939872 0.005678831 -5.62437418 111.96501 1.388135e-07
-# 6          diagtime  0.003620720 0.008929001  0.40550108  99.98576 6.859756e-01
-# 7               age -0.007503755 0.009199070 -0.81570798 108.97973 4.164464e-01
-# 8             prior  0.002002572 0.023640459  0.08470952 112.81848 9.326425e-01
+#                term     estimate   std.error  statistic       df      p.value
+# 1               trt  0.231154077 0.214672763  1.0767741 105.1514 2.840454e-01
+# 2 celltypesmallcell  0.805824737 0.285571376  2.8217980 114.1273 5.634607e-03
+# 3     celltypeadeno  1.130585762 0.306698637  3.6863084 113.3636 3.506786e-04
+# 4     celltypelarge  0.340627347 0.296740520  1.1478963 103.4753 2.536583e-01
+# 5             karno -0.030623274 0.005653790 -5.4164149 106.3603 3.806255e-07
+# 6          diagtime  0.001273007 0.009102230  0.1398566 108.7518 8.890320e-01
+# 7               age -0.005587627 0.009379064 -0.5957554 105.3053 5.526170e-01
+# 8             prior  0.005174395 0.023433186  0.2208148 112.4847 8.256369e-01
 
 # Compare with the results on the original data
 summary(coxph(Surv(time, status) ~ ., veteran))$coefficients

From acb4a0e5c321c71c112acbee680acd8557522ccb Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Thu, 15 Aug 2024 11:26:48 +0200
Subject: [PATCH 2/3] Reference PR #78 instead of issue #77 in NEWS

---
 NEWS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index cebf0ad..3e06044 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,7 +2,7 @@
 
 ### Major bug fix
 
-- Fixes a major bug introduced in version 2.3.0, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (Issue #77).
+- Fixes a major bug introduced in version 2.3.0, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (PR #78).
 
 ### Major feature
 

From 65a87abe7fb89f84e68176da4e9a97b9ec757ede Mon Sep 17 00:00:00 2001
From: Michael Mayer <mayermichael79@gmail.com>
Date: Thu, 15 Aug 2024 11:52:12 +0200
Subject: [PATCH 3/3] Update readme, news, revdep

---
 NEWS.md          | 10 +++++++++-
 README.md        | 14 +++++++-------
 revdep/README.md |  2 +-
 3 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 3e06044..51b873c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -2,7 +2,15 @@
 
 ### Major bug fix
 
-- Fixes a major bug introduced in version 2.3.0, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (PR #78).
+Fixes a major bug, by which responses would be used as covariates in the random forests. Thanks for reporting @flystar233 (PR #78).
+
+You can expect to get
+
+- different imputations,
+- better and more logical imputations,
+- higher (less optimistic) reported OOB prediction errors,
+- higher variability in multiple imputation settings.
+
 
 ### Major feature
 
diff --git a/README.md b/README.md
index 732fa9b..47cf9f8 100644
--- a/README.md
+++ b/README.md
@@ -45,13 +45,13 @@ head(iris_NA)
 iris_filled <- missRanger(iris_NA, pmm.k = 5, num.trees = 100)
 head(iris_filled)
 
-# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
-#          5.1         3.5          1.4         0.2  setosa
-#          4.9         3.0          1.4         0.2  setosa
-#          4.7         3.2          1.3         0.2  setosa
-#          4.6         3.1          1.5         0.2  setosa
-#          5.7         3.6          1.4         0.2  setosa
-#          5.4         3.9          1.7         0.4  setosa
+#   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+# 1          5.1         3.5          1.4         0.2  setosa
+# 2          4.9         3.0          1.4         0.2  setosa
+# 3          4.7         3.2          1.3         0.2  setosa
+# 4          4.6         3.1          1.5         0.2  setosa
+# 5          5.2         3.6          1.4         0.2  setosa
+# 6          5.4         3.9          1.7         0.4  setosa
 ```
 
 ## How it works
diff --git a/revdep/README.md b/revdep/README.md
index 9e5ca3b..8639072 100644
--- a/revdep/README.md
+++ b/revdep/README.md
@@ -10,7 +10,7 @@
 |collate  |German_Switzerland.utf8                                  |
 |ctype    |German_Switzerland.utf8                                  |
 |tz       |Europe/Zurich                                            |
-|date     |2024-08-02                                               |
+|date     |2024-08-15                                               |
 |rstudio  |2024.04.2+764 Chocolate Cosmos (desktop)                 |
 |pandoc   |3.1.6 @ C:\Users\Michael\AppData\Local\Pandoc\pandoc.exe |