mayer79 · mayer79 · Aug 15, 2024 · Aug 15, 2024 · Aug 15, 2024 · Aug 15, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,17 @@
 # missRanger 2.6.0
 
+### Major bug fix
+
+Fixes a major bug by which responses would be used as covariates in their own random forests. Thanks to @flystar233 for reporting (PR #78).
+
+You can expect to get
+
+- different imputations,
+- better and more logical imputations,
+- worse reported OOB prediction errors,
+- higher variability in multiple imputation settings.
+
+
 ### Major feature
 
 Out-of-sample application is now possible! Thanks to [@jeandigitale](https://github.com/jeandigitale) for pushing the idea in [#58](https://github.com/mayer79/missRanger/issues/58).

diff --git a/R/methods.R b/R/methods.R
@@ -194,9 +194,8 @@ predict.missRanger <- function(
     stop("No random forests in 'object'. Use missRanger(, keep_forests = TRUE).")
   }
 
-  # Do we have a random forest for all variables with missings?
-  # This can fire only if the first iteration in missRanger() was the best, and only
-  # for maximal one variable. It is a rare case.
+  # Do we have a random forest for all variables with missings? If not, we don't repeat
+  # their univariate imputation.
   forests_missing <- setdiff(to_impute, names(object$forests))
   if (length(forests_missing) > 0L) {
     if (verbose >= 1L) {

diff --git a/R/missRanger.R b/R/missRanger.R
@@ -295,8 +295,9 @@ missRanger <- function(
 
     for (v in to_impute) {
       v.na <- data_NA[, v]
+      xvars <- setdiff(completed, v)
 
-      if (length(completed) == 0L) {
+      if (length(xvars) == 0L) {
         data[[v]] <- imputeUnivariate(data[[v]])
       } else {
         y <- data[[v]][!v.na]
@@ -316,13 +317,13 @@ missRanger <- function(
           case.weights = if (!is.null(case.weights)) case.weights[!v.na],
           num.threads = num.threads,
           save.memory = save.memory,
-          x = data[!v.na, completed, drop = FALSE],
+          x = data[!v.na, xvars, drop = FALSE],
           y = y,
           verbose = verbose >= 1,
           ...
         )
 
-        pred <- stats::predict(fit, data[v.na, completed, drop = FALSE])$predictions
+        pred <- stats::predict(fit, data[v.na, xvars, drop = FALSE])$predictions
 
         if (pmm.k >= 1L) {
           pred <- pmm(xtrain = fit$predictions, xtest = pred, ytrain = y, k = pmm.k)

diff --git a/README.md b/README.md
@@ -45,13 +45,13 @@ head(iris_NA)
 iris_filled <- missRanger(iris_NA, pmm.k = 5, num.trees = 100)
 head(iris_filled)
 
-# Sepal.Length Sepal.Width Petal.Length Petal.Width Species
-#          5.1         3.5          1.4         0.2  setosa
-#          4.9         3.0          1.4         0.2  setosa
-#          4.7         3.2          1.3         0.2  setosa
-#          4.6         3.1          1.5         0.2  setosa
-#          5.7         3.6          1.4         0.2  setosa
-#          5.4         3.9          1.7         0.4  setosa
+#   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+# 1          5.1         3.5          1.4         0.2  setosa
+# 2          4.9         3.0          1.4         0.2  setosa
+# 3          4.7         3.2          1.3         0.2  setosa
+# 4          4.6         3.1          1.5         0.2  setosa
+# 5          5.2         3.6          1.4         0.2  setosa
+# 6          5.4         3.9          1.7         0.4  setosa
 ```
 
 ## How it works

diff --git a/cran-comments.md b/cran-comments.md
@@ -2,7 +2,9 @@
 
 Hello CRAN
 
-This update brings a long awaited feature of out-of-sample application of the imputation models.
+This update
+- fixes a major bug introduced in 2.3.0
+- and brings a long-awaited feature of out-of-sample application of the imputation models.
 
 ## R CMD check (local)
 

diff --git a/revdep/README.md b/revdep/README.md
@@ -10,7 +10,7 @@
 |collate  |German_Switzerland.utf8                                  |
 |ctype    |German_Switzerland.utf8                                  |
 |tz       |Europe/Zurich                                            |
-|date     |2024-08-02                                               |
+|date     |2024-08-15                                               |
 |rstudio  |2024.04.2+764 Chocolate Cosmos (desktop)                 |
 |pandoc   |3.1.6 @ C:\Users\Michael\AppData\Local\Pandoc\pandoc.exe |
 

diff --git a/tests/testthat/test-missRanger.R b/tests/testthat/test-missRanger.R
@@ -245,6 +245,9 @@ test_that("formula interface works with unspecified left side", {
   expect_true(!anyNA(imp$data))
   expect_true(.setequal(imp$to_impute, colnames(X_NA)))
   expect_equal(imp$impute_by, "int")
+
+  # Prediction error for "int" stays at its default value of 1 (issue #77)
+  expect_equal(imp$pred_errors[, "int"], rep(1, length(imp$mean_pred_errors)))
 })
 
 test_that("dropping columns on left side leaves missing values", {

diff --git a/vignettes/multiple_imputation.Rmd b/vignettes/multiple_imputation.Rmd
@@ -50,13 +50,13 @@ models <- lapply(filled, function(x) lm(Sepal.Length ~ ., x))
 # Pool the results by mice
 summary(pooled_fit <- pool(models))
 
-#                term   estimate  std.error statistic        df      p.value
-# 1       (Intercept)  2.4600621 0.33998737  7.235746  86.25283 1.785004e-10
-# 2       Sepal.Width  0.4454417 0.10405609  4.280785  96.17676 4.406215e-05
-# 3      Petal.Length  0.7394242 0.08393401  8.809590  77.63584 2.620202e-13
-# 4       Petal.Width -0.1937151 0.17905818 -1.081856  80.36361 2.825524e-01
-# 5 Speciesversicolor -0.6785451 0.26812613 -2.530694 116.18041 1.272124e-02
-# 6  Speciesvirginica -0.8737822 0.37086417 -2.356071 110.15525 2.023735e-02
+#                term   estimate std.error statistic        df      p.value
+# 1       (Intercept)  2.3343548 0.3244342  7.195157  97.08106 1.314353e-10
+# 2       Sepal.Width  0.4715273 0.1041384  4.527891  88.55776 1.848669e-05
+# 3      Petal.Length  0.7700316 0.0768588 10.018783 122.02953 1.321441e-17
+# 4       Petal.Width -0.2506538 0.1739537 -1.440922  88.10220 1.531513e-01
+# 5 Speciesversicolor -0.6648375 0.2940828 -2.260715  81.17797 2.645368e-02
+# 6  Speciesvirginica -0.9065327 0.4055137 -2.235517  79.87581 2.817491e-02
 
 # Compare with model on original data
 summary(lm(Sepal.Length ~ ., data = iris))

diff --git a/vignettes/working_with_censoring.Rmd b/vignettes/working_with_censoring.Rmd
@@ -81,15 +81,15 @@ models <- lapply(filled, function(x) coxph(Surv(time, status) ~ . - surv, x))
 # 4. Pool the results by mice
 summary(pooled_fit <- pool(models))
 
-#                term     estimate   std.error   statistic        df      p.value
-# 1               trt  0.264601452 0.212828712  1.24326013 110.30869 2.164079e-01
-# 2 celltypesmallcell  0.789909124 0.284989547  2.77171262 113.68488 6.516937e-03
-# 3     celltypeadeno  1.114851697 0.306765748  3.63421179 113.01225 4.210453e-04
-# 4     celltypelarge  0.356374858 0.289111314  1.23265621 112.81945 2.202666e-01
-# 5             karno -0.031939872 0.005678831 -5.62437418 111.96501 1.388135e-07
-# 6          diagtime  0.003620720 0.008929001  0.40550108  99.98576 6.859756e-01
-# 7               age -0.007503755 0.009199070 -0.81570798 108.97973 4.164464e-01
-# 8             prior  0.002002572 0.023640459  0.08470952 112.81848 9.326425e-01
+#                term     estimate   std.error  statistic       df      p.value
+# 1               trt  0.231154077 0.214672763  1.0767741 105.1514 2.840454e-01
+# 2 celltypesmallcell  0.805824737 0.285571376  2.8217980 114.1273 5.634607e-03
+# 3     celltypeadeno  1.130585762 0.306698637  3.6863084 113.3636 3.506786e-04
+# 4     celltypelarge  0.340627347 0.296740520  1.1478963 103.4753 2.536583e-01
+# 5             karno -0.030623274 0.005653790 -5.4164149 106.3603 3.806255e-07
+# 6          diagtime  0.001273007 0.009102230  0.1398566 108.7518 8.890320e-01
+# 7               age -0.005587627 0.009379064 -0.5957554 105.3053 5.526170e-01
+# 8             prior  0.005174395 0.023433186  0.2208148 112.4847 8.256369e-01
 
 # Compare with the results on the original data
 summary(coxph(Surv(time, status) ~ ., veteran))$coefficients