mayer79 · mayer79 · Oct 20, 2023 · Oct 20, 2023 · Oct 20, 2023 · Oct 20, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: missRanger
 Title: Fast Imputation of Missing Values
-Version: 2.2.2
+Version: 2.3.0
 Authors@R: 
     person(given = "Michael",
            family = "Mayer",

diff --git a/NEWS.md b/NEWS.md
@@ -1,6 +1,14 @@
-# missRanger 2.2.2
+# missRanger 2.3.0
 
-- Documentation improvement
+## Major improvements
+
+- `missRanger()` now works with non-syntactic variable names like "1bad:variable". This solves an [old issue](https://github.com/mayer79/missRanger/issues/19), which recently resurfaced in [this new issue](https://github.com/mayer79/missRanger/issues/51).
+- `missRanger()` now works with any number of features, as long as the formula is left at its default, i.e., `.~.`. This solves this [issue](https://github.com/mayer79/missRanger/issues/50).
+
+## Other changes
+
+- Documentation improvement.
+- `ranger()` is now called via its x/y interface instead of the formula interface.
 
 # missRanger 2.2.1
 
@@ -92,4 +100,4 @@ This is a summary of all changes since version 1.x.x.
 
 ## Minor bug fix
 
-* The argument `returnOOB` is now effectively controlling if out-of-bag errors are attached as attribute "oob" to the resulting data frame or not. So far, it was always attached.
+* The argument `returnOOB` is now effectively controlling if out-of-bag errors are attached as attribute "oob" to the resulting data frame or not. So far, it was always attached.
diff --git a/R/missRanger.R b/R/missRanger.R
@@ -1,6 +1,6 @@
 #' Fast Imputation of Missing Values by Chained Random Forests
 #' 
-#' Uses the {ranger} package (Wright & Ziegler) to do fast missing value imputation by 
+#' Uses the "ranger" package (Wright & Ziegler) to do fast missing value imputation by 
 #' chained random forests, see Stekhoven & Buehlmann and Van Buuren & Groothuis-Oudshoorn.
 #' Between the iterative model fitting, it offers the option of predictive mean matching. 
 #' This firstly avoids imputation with values not present in the original data 
@@ -108,8 +108,13 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
   # 2) SELECT AND CONVERT VARIABLES TO IMPUTE
 
   # Extract lhs and rhs from formula
-  relevantVars <- lapply(formula[2:3], function(z) attr(stats::terms.formula(
-    stats::reformulate(z), data = data[1L, ]), "term.labels"))
+  parsef <- function(z) {
+    if (z == ".") {
+      return(colnames(data))
+    }
+    all.vars(stats::terms.formula(stats::reformulate(z), data = data[1L, ]))
+  }
+  relevantVars <- lapply(formula[2:3], parsef)
 
   # Pick variables from lhs with some but not all missings
   toImpute <- relevantVars[[1L]][vapply(data[, relevantVars[[1L]], drop = FALSE], 
@@ -189,9 +194,9 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
         data[[v]] <- imputeUnivariate(data[[v]])
       } else {
         fit <- ranger::ranger(
-          formula = stats::reformulate(completed, response = v),
-          data = data[!v.na, union(v, completed), drop = FALSE],
-          case.weights = case.weights[!v.na], 
+          y = data[[v]][!v.na],
+          x = data[!v.na, completed, drop = FALSE],
+          case.weights = case.weights[!v.na],
           ...
         )
         pred <- stats::predict(fit, data[v.na, completed, drop = FALSE])$predictions

diff --git a/cran-comments.md b/cran-comments.md
@@ -1,27 +1,31 @@
-# missRanger 2.2.1
+# missRanger 2.3.0
 
 Dear CRAN team
 
-This is a small update, mainly aiming at replacing "importFrom" by "::" logic, plus
-some documentation improvement.
+This update fixes two issues: non-syntactic column names, and too many features.
+
+I have checked the reverse dependencies and found no problems.
 
 ## R CMD check
 
 - WARNING: 'qpdf' is needed for checks on size reduction of PDFs
-- NOTE: unable to verify current time
 - NOTE: no command 'tidy' found
 
 ## RHub 
 
-Note: lastMiKTeXException
+* checking HTML version of manual ... NOTE
+Skipping checking HTML validation: no command 'tidy' found
 
 ## Winbuilder
 
-Status: OK
+Status: 1 NOTE
+R Under development (unstable) (2023-10-18 r85349 ucrt)
+
+## revdepcheck results
+
+We checked 7 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package.
+
+ * We saw 0 new problems
+ * We failed to check 0 packages
 
-## REVDEP
 
-- OK: 7                                                                 
-- BROKEN: 0
-
-## Reverse dependency check of 7 packages
diff --git a/man/missRanger.Rd b/man/missRanger.Rd
diff --git a/packaging.R b/packaging.R
@@ -15,7 +15,7 @@ library(usethis)
 use_description(
   fields = list(
     Title = "Fast Imputation of Missing Values",
-    Version = "2.2.2",
+    Version = "2.3.0",
     Description = "Alternative implementation of the beautiful 'MissForest' algorithm used to impute 
     mixed-type data sets by chaining random forests, introduced by Stekhoven, D.J. and 
     Buehlmann, P. (2012) <doi:10.1093/bioinformatics/btr597>. Under the hood, it uses the 
@@ -92,7 +92,7 @@ document()
 test()
 # build_vignettes()
 check(manual = TRUE, cran = TRUE)
-build()
+build(vignettes = FALSE)
 # build(binary = TRUE)
 install()
 

diff --git a/revdep/README.md b/revdep/README.md
@@ -1,34 +1,28 @@
 # Platform
 
-|field    |value                                                |
-|:--------|:----------------------------------------------------|
-|version  |R version 4.3.0 (2023-04-21 ucrt)                    |
-|os       |Windows 11 x64 (build 22621)                         |
-|system   |x86_64, mingw32                                      |
-|ui       |RStudio                                              |
-|language |(EN)                                                 |
-|collate  |German_Switzerland.utf8                              |
-|ctype    |German_Switzerland.utf8                              |
-|tz       |Europe/Zurich                                        |
-|date     |2023-04-28                                           |
-|rstudio  |2023.03.0+386 Cherry Blossom (desktop)               |
-|pandoc   |2.12 @ C:\Users\Michael\anaconda3\Scripts\pandoc.exe |
+|field    |value                                                    |
+|:--------|:--------------------------------------------------------|
+|version  |R version 4.3.0 (2023-04-21 ucrt)                        |
+|os       |Windows 11 x64 (build 22621)                             |
+|system   |x86_64, mingw32                                          |
+|ui       |RStudio                                                  |
+|language |(EN)                                                     |
+|collate  |German_Switzerland.utf8                                  |
+|ctype    |German_Switzerland.utf8                                  |
+|tz       |Europe/Zurich                                            |
+|date     |2023-10-20                                               |
+|rstudio  |2023.06.1+524 Mountain Hydrangea (desktop)               |
+|pandoc   |3.1.6 @ C:\Users\Michael\AppData\Local\Pandoc\pandoc.exe |
 
 # Dependencies
 
 |package    |old       |new       |Δ  |
 |:----------|:---------|:---------|:--|
-|missRanger |2.2.0     |2.2.1     |*  |
+|missRanger |2.2.1     |2.3.0     |*  |
 |FNN        |1.1.3.2   |1.1.3.2   |   |
 |ranger     |0.15.1    |0.15.1    |   |
-|Rcpp       |1.0.10    |1.0.10    |   |
+|Rcpp       |1.0.11    |1.0.11    |   |
 |RcppEigen  |0.3.3.9.3 |0.3.3.9.3 |   |
 
 # Revdeps
 
-## Failed to check (1)
-
-|package         |version |error |warning |note |
-|:---------------|:-------|:-----|:-------|:----|
-|marginaleffects |?       |      |        |     |
-
diff --git a/revdep/cran.md b/revdep/cran.md
@@ -1,6 +1,6 @@
 ## revdepcheck results
 
-We checked 7 reverse dependencies (6 from CRAN + 1 from Bioconductor), comparing R CMD check results across CRAN and dev versions of this package.
+We checked 7 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package.
 
  * We saw 0 new problems
  * We failed to check 0 packages

diff --git a/revdep/failures.md b/revdep/failures.md
@@ -1,45 +1 @@
-# marginaleffects
-
-<details>
-
-* Version: 
-* GitHub: https://github.com/mayer79/missRanger
-* Source code: NA
-* Number of recursive dependencies: 0
-
-</details>
-
-## Error before installation
-
-### Devel
-
-```
-
-  Es gibt Binärversionen, jedoch sind die Quelltexte neuer:
-          binary source needs_compilation
-checkmate  2.1.0  2.2.0              TRUE
-xml2       1.3.3  1.3.4              TRUE
-
-  Binaries will be installed
-
-
-
-
-
-```
-### CRAN
-
-```
-
-  Es gibt Binärversionen, jedoch sind die Quelltexte neuer:
-          binary source needs_compilation
-checkmate  2.1.0  2.2.0              TRUE
-xml2       1.3.3  1.3.4              TRUE
-
-  Binaries will be installed
-
-
-
-
-
-```
+*Wow, no problems at all. :)*
diff --git a/tests/testthat/test-missRanger.R b/tests/testthat/test-missRanger.R
@@ -31,14 +31,56 @@ X <- data.frame(
   x5 = seq_len(n) > n %/% 3
 )
 X_NA <- generateNA(X, p = seq(0.2, 0.8, length.out = ncol(X)), seed = 13L)
+imp <- missRanger(X_NA, maxiter = 3L, num.trees = 20L, verbose = 0L, seed = 1L)
 
 test_that("variable type is respected (integer might get double)", {
-  imp <- missRanger(X_NA, maxiter = 3L, num.trees = 20L, verbose = 0L)
   expect_true(!anyNA(imp))
   expect_equal(sapply(imp[-1L], class), sapply(X_NA[-1L], class))
   expect_true(class(imp[, 1L]) %in% c("integer", "numeric"))
 })
 
+test_that("non-syntactic column names work", {
+  X_NA2 <- X_NA
+  colnames(X_NA2) <- paste(1:5, colnames(X_NA))
+  imp2 <- missRanger(X_NA2, maxiter = 3L, num.trees = 20L, verbose = 0L, seed = 1L)
+  imp3 <- missRanger(
+    `1 x1` + `2 x2` + `3 x3` + `4 x4` + `5 x5` ~ `1 x1` + `2 x2` + `3 x3` + `4 x4` + `5 x5`,
+    data = X_NA2, 
+    maxiter = 3L, 
+    num.trees = 20L, 
+    verbose = 0L, 
+    seed = 1L
+  )
+
+  imp4 <- missRanger(
+    . ~ `1 x1` + `2 x2` + `3 x3` + `4 x4` + `5 x5`,
+    data = X_NA2, 
+    maxiter = 3L, 
+    num.trees = 20L, 
+    verbose = 0L, 
+    seed = 1L
+  )
+
+  imp5 <- missRanger(
+    `1 x1` + `2 x2` + `3 x3` + `4 x4` + `5 x5` ~ .,
+    data = X_NA2, 
+    maxiter = 3L, 
+    num.trees = 20L, 
+    verbose = 0L, 
+    seed = 1L
+  )
+  expect_equal(colnames(X_NA2), colnames(imp2))
+  expect_equal(imp, setNames(imp2, colnames(imp)))
+  expect_equal(imp2, imp3)
+  expect_equal(imp2, imp4)
+  expect_equal(imp2, imp5)
+
+  # https://github.com/mayer79/missRanger/issues/51
+  ir3 <- iris
+  colnames(ir3)[2L] <- "IGHV3-43D;IGHV3-9"
+  expect_no_error(missRanger(ir3, verbose = 0L))
+})
+
 test_that("pmm.k works regarding value range in double columns", {
   imp <- missRanger(X_NA, maxiter = 3L, num.trees = 20L, verbose = 0L, pmm.k = 3L)
   expect_true(all(imp$x2 %in% X$x2))
@@ -158,3 +200,11 @@ test_that("Too few case.weights give an error", {
     )
   )
 })
+
+test_that("Extremely wide datasets are handled", {
+  # https://github.com/mayer79/missRanger/issues/50
+  set.seed(1L)
+  data <- matrix(rnorm(385 * 20000), nrow = 385L, ncol = 20000L)
+  data[5L, 5L] <- NA
+  expect_no_error(missRanger(as.data.frame(data), num.trees = 3L, verbose = 0L))
+})
diff --git a/vignettes/missRanger.Rmd b/vignettes/missRanger.Rmd
@@ -182,7 +182,7 @@ system.time(
   m <- missRanger(diamonds_with_NA, pmm.k = 3, num.trees = 50)
 )
 
-# Takes 7 seconds
+# Takes 6 seconds
 system.time(
   m <- missRanger(diamonds_with_NA, pmm.k = 3, num.trees = 1)
 )

diff --git a/vignettes/working_with_censoring.Rmd b/vignettes/working_with_censoring.Rmd
@@ -93,14 +93,14 @@ models <- lapply(filled, function(x) coxph(Surv(time, status) ~ . - surv, x))
 summary(pooled_fit <- pool(models))
 
 #                term     estimate   std.error  statistic        df      p.value
-# 1               trt  0.238408881 0.213416156  1.1171079 108.30303 2.664203e-01
-# 2 celltypesmallcell  0.801088770 0.286383107  2.7972626 112.17712 6.066665e-03
-# 3     celltypeadeno  1.134351839 0.308977998  3.6713030 110.65791 3.731780e-04
-# 4     celltypelarge  0.327092592 0.291069423  1.1237614 109.29555 2.635765e-01
-# 5             karno -0.031250557 0.005786704 -5.4004073  99.60711 4.529695e-07
-# 6          diagtime  0.002889092 0.009020319  0.3202872 106.17585 7.493801e-01
-# 7               age -0.007620985 0.009632902 -0.7911411  97.27459 4.307864e-01
-# 8             prior  0.003954604 0.023537476  0.1680131 111.98360 8.668760e-01
+# 1               trt  0.245855250 0.212810467  1.1552780 108.72929 2.505091e-01
+# 2 celltypesmallcell  0.805233656 0.284424937  2.8310937 114.17088 5.483657e-03
+# 3     celltypeadeno  1.110172771 0.307570269  3.6094931 111.91588 4.603422e-04
+# 4     celltypelarge  0.328227283 0.291163500  1.1272954 109.30510 2.620862e-01
+# 5             karno -0.031838682 0.005663349 -5.6218824 112.60325 1.390333e-07
+# 6          diagtime  0.002775351 0.009382270  0.2958081  86.61582 7.680847e-01
+# 7               age -0.007843577 0.009293988 -0.8439410 107.86917 4.005701e-01
+# 8             prior  0.003165245 0.023501821  0.1346809 111.84783 8.931063e-01
 
 # Compare with the results on the original data
 summary(coxph(Surv(time, status) ~ ., veteran))$coefficients