diff --git a/DESCRIPTION b/DESCRIPTION index 3cb0872..d9f2069 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: missRanger Title: Fast Imputation of Missing Values -Version: 2.2.2 +Version: 2.3.0 Authors@R: person(given = "Michael", family = "Mayer", diff --git a/NEWS.md b/NEWS.md index 0c305b3..3c1d031 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,14 @@ -# missRanger 2.2.2 +# missRanger 2.3.0 -- Documentation improvement +## Major improvements + +- `missRanger()` now works with non-syntactic variable names like "1bad:variable". This solves an [old issue](https://github.com/mayer79/missRanger/issues/19) that recently resurfaced in [this new issue](https://github.com/mayer79/missRanger/issues/51). +- `missRanger()` now works with any number of features, as long as the formula is left at its default, i.e., `.~.`. This resolves [this issue](https://github.com/mayer79/missRanger/issues/50). + +## Other changes + +- Documentation improvement. +- `ranger()` is now called via the x/y interface instead of the formula interface. # missRanger 2.2.1 @@ -92,4 +100,4 @@ This is a summary of all changes since version 1.x.x. ## Minor bug fix -* The argument `returnOOB` is now effectively controlling if out-of-bag errors are attached as attribute "oob" to the resulting data frame or not. So far, it was always attached. \ No newline at end of file +* The argument `returnOOB` is now effectively controlling if out-of-bag errors are attached as attribute "oob" to the resulting data frame or not. So far, it was always attached. 
diff --git a/R/missRanger.R b/R/missRanger.R index 83b5028..afbd9d9 100644 --- a/R/missRanger.R +++ b/R/missRanger.R @@ -1,6 +1,6 @@ #' Fast Imputation of Missing Values by Chained Random Forests #' -#' Uses the {ranger} package (Wright & Ziegler) to do fast missing value imputation by +#' Uses the "ranger" package (Wright & Ziegler) to do fast missing value imputation by #' chained random forests, see Stekhoven & Buehlmann and Van Buuren & Groothuis-Oudshoorn. #' Between the iterative model fitting, it offers the option of predictive mean matching. #' This firstly avoids imputation with values not present in the original data @@ -108,8 +108,13 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, # 2) SELECT AND CONVERT VARIABLES TO IMPUTE # Extract lhs and rhs from formula - relevantVars <- lapply(formula[2:3], function(z) attr(stats::terms.formula( - stats::reformulate(z), data = data[1L, ]), "term.labels")) + parsef <- function(z) { + if (z == ".") { + return(colnames(data)) + } + all.vars(stats::terms.formula(stats::reformulate(z), data = data[1L, ])) + } + relevantVars <- lapply(formula[2:3], parsef) # Pick variables from lhs with some but not all missings toImpute <- relevantVars[[1L]][vapply(data[, relevantVars[[1L]], drop = FALSE], @@ -189,9 +194,9 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L, data[[v]] <- imputeUnivariate(data[[v]]) } else { fit <- ranger::ranger( - formula = stats::reformulate(completed, response = v), - data = data[!v.na, union(v, completed), drop = FALSE], - case.weights = case.weights[!v.na], + y = data[[v]][!v.na], + x = data[!v.na, completed, drop = FALSE], + case.weights = case.weights[!v.na], ... 
) pred <- stats::predict(fit, data[v.na, completed, drop = FALSE])$predictions diff --git a/cran-comments.md b/cran-comments.md index 292dd29..8fe7288 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,27 +1,31 @@ -# missRanger 2.2.1 +# missRanger 2.3.0 Dear CRAN team -This is a small update, mainly aiming at replacing "importFrom" by "::" logic, plus -some documentation improvement. +This update fixes two issues: non-syntactic column names, and too many features. + +I have checked the reverse dependencies and found no problems. ## R CMD check - WARNING: 'qpdf' is needed for checks on size reduction of PDFs -- NOTE: unable to verify current time - NOTE: no command 'tidy' found ## RHub -Note: lastMiKTeXException +* checking HTML version of manual ... NOTE +Skipping checking HTML validation: no command 'tidy' found ## Winbuilder -Status: OK +Status: 1 NOTE +R Under development (unstable) (2023-10-18 r85349 ucrt) + +## revdepcheck results + +We checked 7 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. + + * We saw 0 new problems + * We failed to check 0 packages -## REVDEP -- OK: 7 -- BROKEN: 0 - -## Reverse dependency check of 7 packages diff --git a/man/missRanger.Rd b/man/missRanger.Rd index f76c8c7..9d4451b 100644 --- a/man/missRanger.Rd +++ b/man/missRanger.Rd @@ -58,7 +58,7 @@ better use less trees (e.g. \code{num.trees = 20}) and/or a low value of An imputed \code{data.frame}. } \description{ -Uses the {ranger} package (Wright & Ziegler) to do fast missing value imputation by +Uses the "ranger" package (Wright & Ziegler) to do fast missing value imputation by chained random forests, see Stekhoven & Buehlmann and Van Buuren & Groothuis-Oudshoorn. Between the iterative model fitting, it offers the option of predictive mean matching. 
This firstly avoids imputation with values not present in the original data diff --git a/packaging.R b/packaging.R index 2bdc46b..9041981 100644 --- a/packaging.R +++ b/packaging.R @@ -15,7 +15,7 @@ library(usethis) use_description( fields = list( Title = "Fast Imputation of Missing Values", - Version = "2.2.2", + Version = "2.3.0", Description = "Alternative implementation of the beautiful 'MissForest' algorithm used to impute mixed-type data sets by chaining random forests, introduced by Stekhoven, D.J. and Buehlmann, P. (2012) . Under the hood, it uses the @@ -92,7 +92,7 @@ document() test() # build_vignettes() check(manual = TRUE, cran = TRUE) -build() +build(vignettes = FALSE) # build(binary = TRUE) install() diff --git a/revdep/README.md b/revdep/README.md index 6418fb6..92d239e 100644 --- a/revdep/README.md +++ b/revdep/README.md @@ -1,34 +1,28 @@ # Platform -|field |value | -|:--------|:----------------------------------------------------| -|version |R version 4.3.0 (2023-04-21 ucrt) | -|os |Windows 11 x64 (build 22621) | -|system |x86_64, mingw32 | -|ui |RStudio | -|language |(EN) | -|collate |German_Switzerland.utf8 | -|ctype |German_Switzerland.utf8 | -|tz |Europe/Zurich | -|date |2023-04-28 | -|rstudio |2023.03.0+386 Cherry Blossom (desktop) | -|pandoc |2.12 @ C:\Users\Michael\anaconda3\Scripts\pandoc.exe | +|field |value | +|:--------|:--------------------------------------------------------| +|version |R version 4.3.0 (2023-04-21 ucrt) | +|os |Windows 11 x64 (build 22621) | +|system |x86_64, mingw32 | +|ui |RStudio | +|language |(EN) | +|collate |German_Switzerland.utf8 | +|ctype |German_Switzerland.utf8 | +|tz |Europe/Zurich | +|date |2023-10-20 | +|rstudio |2023.06.1+524 Mountain Hydrangea (desktop) | +|pandoc |3.1.6 @ C:\Users\Michael\AppData\Local\Pandoc\pandoc.exe | # Dependencies |package |old |new |Δ | |:----------|:---------|:---------|:--| -|missRanger |2.2.0 |2.2.1 |* | +|missRanger |2.2.1 |2.3.0 |* | |FNN |1.1.3.2 |1.1.3.2 | | |ranger 
|0.15.1 |0.15.1 | | -|Rcpp |1.0.10 |1.0.10 | | +|Rcpp |1.0.11 |1.0.11 | | |RcppEigen |0.3.3.9.3 |0.3.3.9.3 | | # Revdeps -## Failed to check (1) - -|package |version |error |warning |note | -|:---------------|:-------|:-----|:-------|:----| -|marginaleffects |? | | | | - diff --git a/revdep/cran.md b/revdep/cran.md index 162754f..7bdf5de 100644 --- a/revdep/cran.md +++ b/revdep/cran.md @@ -1,6 +1,6 @@ ## revdepcheck results -We checked 7 reverse dependencies (6 from CRAN + 1 from Bioconductor), comparing R CMD check results across CRAN and dev versions of this package. +We checked 7 reverse dependencies, comparing R CMD check results across CRAN and dev versions of this package. * We saw 0 new problems * We failed to check 0 packages diff --git a/revdep/failures.md b/revdep/failures.md index f02e3d8..9a20736 100644 --- a/revdep/failures.md +++ b/revdep/failures.md @@ -1,45 +1 @@ -# marginaleffects - -
- -* Version: -* GitHub: https://github.com/mayer79/missRanger -* Source code: NA -* Number of recursive dependencies: 0 - -
- -## Error before installation - -### Devel - -``` - - Es gibt Binärversionen, jedoch sind die Quelltexte neuer: - binary source needs_compilation -checkmate 2.1.0 2.2.0 TRUE -xml2 1.3.3 1.3.4 TRUE - - Binaries will be installed - - - - - -``` -### CRAN - -``` - - Es gibt Binärversionen, jedoch sind die Quelltexte neuer: - binary source needs_compilation -checkmate 2.1.0 2.2.0 TRUE -xml2 1.3.3 1.3.4 TRUE - - Binaries will be installed - - - - - -``` +*Wow, no problems at all. :)* \ No newline at end of file diff --git a/tests/testthat/test-missRanger.R b/tests/testthat/test-missRanger.R index 96af82b..06e9bf6 100644 --- a/tests/testthat/test-missRanger.R +++ b/tests/testthat/test-missRanger.R @@ -31,14 +31,56 @@ X <- data.frame( x5 = seq_len(n) > n %/% 3 ) X_NA <- generateNA(X, p = seq(0.2, 0.8, length.out = ncol(X)), seed = 13L) +imp <- missRanger(X_NA, maxiter = 3L, num.trees = 20L, verbose = 0L, seed = 1L) test_that("variable type is respected (integer might get double)", { - imp <- missRanger(X_NA, maxiter = 3L, num.trees = 20L, verbose = 0L) expect_true(!anyNA(imp)) expect_equal(sapply(imp[-1L], class), sapply(X_NA[-1L], class)) expect_true(class(imp[, 1L]) %in% c("integer", "numeric")) }) +test_that("non-syntactic column names work", { + X_NA2 <- X_NA + colnames(X_NA2) <- paste(1:5, colnames(X_NA)) + imp2 <- missRanger(X_NA2, maxiter = 3L, num.trees = 20L, verbose = 0L, seed = 1L) + imp3 <- missRanger( + `1 x1` + `2 x2` + `3 x3` + `4 x4` + `5 x5` ~ `1 x1` + `2 x2` + `3 x3` + `4 x4` + `5 x5`, + data = X_NA2, + maxiter = 3L, + num.trees = 20L, + verbose = 0L, + seed = 1L + ) + + imp4 <- missRanger( + . 
~ `1 x1` + `2 x2` + `3 x3` + `4 x4` + `5 x5`, + data = X_NA2, + maxiter = 3L, + num.trees = 20L, + verbose = 0L, + seed = 1L + ) + + imp5 <- missRanger( + `1 x1` + `2 x2` + `3 x3` + `4 x4` + `5 x5` ~ ., + data = X_NA2, + maxiter = 3L, + num.trees = 20L, + verbose = 0L, + seed = 1L + ) + expect_equal(colnames(X_NA2), colnames(imp2)) + expect_equal(imp, setNames(imp2, colnames(imp))) + expect_equal(imp2, imp3) + expect_equal(imp2, imp4) + expect_equal(imp2, imp5) + + # https://github.com/mayer79/missRanger/issues/51 + ir3 <- iris + colnames(ir3)[2L] <- "IGHV3-43D;IGHV3-9" + expect_no_error(missRanger(ir3, verbose = 0L)) +}) + test_that("pmm.k works regarding value range in double columns", { imp <- missRanger(X_NA, maxiter = 3L, num.trees = 20L, verbose = 0L, pmm.k = 3L) expect_true(all(imp$x2 %in% X$x2)) @@ -158,3 +200,11 @@ test_that("Too few case.weights give an error", { ) ) }) + +test_that("Extremely wide datasets are handled", { + # https://github.com/mayer79/missRanger/issues/50 + set.seed(1L) + data <- matrix(rnorm(385 * 20000), nrow = 385L, ncol = 20000L) + data[5L, 5L] <- NA + expect_no_error(missRanger(as.data.frame(data), num.trees = 3L, verbose = 0L)) +}) diff --git a/vignettes/missRanger.Rmd b/vignettes/missRanger.Rmd index 4106885..d6db1d8 100644 --- a/vignettes/missRanger.Rmd +++ b/vignettes/missRanger.Rmd @@ -182,7 +182,7 @@ system.time( m <- missRanger(diamonds_with_NA, pmm.k = 3, num.trees = 50) ) -# Takes 7 seconds +# Takes 6 seconds system.time( m <- missRanger(diamonds_with_NA, pmm.k = 3, num.trees = 1) ) diff --git a/vignettes/working_with_censoring.Rmd b/vignettes/working_with_censoring.Rmd index 1c9d776..a9aeb7d 100644 --- a/vignettes/working_with_censoring.Rmd +++ b/vignettes/working_with_censoring.Rmd @@ -93,14 +93,14 @@ models <- lapply(filled, function(x) coxph(Surv(time, status) ~ . 
- surv, x)) summary(pooled_fit <- pool(models)) # term estimate std.error statistic df p.value -# 1 trt 0.238408881 0.213416156 1.1171079 108.30303 2.664203e-01 -# 2 celltypesmallcell 0.801088770 0.286383107 2.7972626 112.17712 6.066665e-03 -# 3 celltypeadeno 1.134351839 0.308977998 3.6713030 110.65791 3.731780e-04 -# 4 celltypelarge 0.327092592 0.291069423 1.1237614 109.29555 2.635765e-01 -# 5 karno -0.031250557 0.005786704 -5.4004073 99.60711 4.529695e-07 -# 6 diagtime 0.002889092 0.009020319 0.3202872 106.17585 7.493801e-01 -# 7 age -0.007620985 0.009632902 -0.7911411 97.27459 4.307864e-01 -# 8 prior 0.003954604 0.023537476 0.1680131 111.98360 8.668760e-01 +# 1 trt 0.245855250 0.212810467 1.1552780 108.72929 2.505091e-01 +# 2 celltypesmallcell 0.805233656 0.284424937 2.8310937 114.17088 5.483657e-03 +# 3 celltypeadeno 1.110172771 0.307570269 3.6094931 111.91588 4.603422e-04 +# 4 celltypelarge 0.328227283 0.291163500 1.1272954 109.30510 2.620862e-01 +# 5 karno -0.031838682 0.005663349 -5.6218824 112.60325 1.390333e-07 +# 6 diagtime 0.002775351 0.009382270 0.2958081 86.61582 7.680847e-01 +# 7 age -0.007843577 0.009293988 -0.8439410 107.86917 4.005701e-01 +# 8 prior 0.003165245 0.023501821 0.1346809 111.84783 8.931063e-01 # Compare with the results on the original data summary(coxph(Surv(time, status) ~ ., veteran))$coefficients