Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Some small aesthetic changes #67

Merged
merged 6 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
# missRanger 2.6.0

## Minor changes
## Minor changes in output object

- Add original data as `data_raw`.
- Renamed `visit_seq` to `to_impute`.

## Other changes

- Add original data to output.
- Improvement in the vignette on censored data.

# missRanger 2.5.0

Expand Down
47 changes: 22 additions & 25 deletions R/missRanger.R
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@
#' `write.forest`, `probability`, `split.select.weights`,
#' `dependent.variable.name`, and `classification`.
#' @returns
#' If `data_only` an imputed `data.frame`. Otherwise, a "missRanger" object with
#' the following elements that can be extracted via `$`:
#' If `data_only = TRUE` an imputed `data.frame`. Otherwise, a "missRanger" object
#' with the following elements:
#' - `data`: The imputed data.
#' - `data_raw`: The original data provided.
#' - `forests`: When `keep_forests = TRUE`, a list of "ranger" models used to
#' generate the imputed data. `NULL` otherwise.
#' - `visit_seq`: Variables to be imputed (in this order).
#' - `to_impute`: Variables to be imputed (in this order).
#' - `impute_by`: Variables used for imputation.
#' - `best_iter`: Best iteration.
#' - `pred_errors`: Per-iteration OOB prediction errors (1 - R^2 for regression,
Expand All @@ -92,19 +92,17 @@
#' head(irisImputed)
#' head(irisWithNA)
#'
#' \dontrun{
#' # Extended output
#' imp <- missRanger(irisWithNA, pmm.k = 3, num.trees = 100, data_only = FALSE)
#' imp <- missRanger(irisWithNA, pmm.k = 3, num.trees = 50, data_only = FALSE)
#' head(imp$data)
#' imp$pred_errors
#'
#' # If you even want to keep the random forests of the best iteration
#' imp <- missRanger(
#' irisWithNA, pmm.k = 3, num.trees = 100, data_only = FALSE, keep_forests = TRUE
#' irisWithNA, pmm.k = 3, num.trees = 50, data_only = FALSE, keep_forests = TRUE
#' )
#' imp$forests$Sepal.Width
#' imp$pred_errors[imp$best_iter, "Sepal.Width"] # 1 - R-squared
#' }
missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
seed = NULL, verbose = 1, returnOOB = FALSE, case.weights = NULL,
data_only = TRUE, keep_forests = FALSE, ...) {
Expand Down Expand Up @@ -153,7 +151,7 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
stats::terms.formula(stats::reformulate(z), data = data[1L, ]),
"term.labels"
)
trimws(out, whitespace = "`") # Remove enclosing backticks
return(trimws(out, whitespace = "`")) # Remove enclosing backticks
}
relevant_vars <- lapply(formula[2:3], parsef)

Expand All @@ -171,14 +169,14 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
data[, to_impute] <- converted$X

# Remove variables that cannot be safely converted
visit_seq <- setdiff(to_impute, converted$bad)
to_impute <- setdiff(to_impute, converted$bad)

if (verbose) {
cat("\n Variables to impute:\t\t")
cat(visit_seq, sep = ", ")
cat(to_impute, sep = ", ")
}

if (!length(visit_seq)) {
if (!length(to_impute)) {
if (verbose) {
cat("\n")
}
Expand All @@ -188,8 +186,9 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
out <- structure(
list(
data = data,
data_raw = data_raw,
forests = NULL,
visit_seq = c(),
to_impute = c(),
impute_by = c(),
best_iter = 0L,
pred_errors = NULL,
Expand All @@ -202,16 +201,16 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
}

# Get missing indicators and order variables by number of missings
data_NA <- is.na(data[, visit_seq, drop = FALSE])
visit_seq <- names(sort(colSums(data_NA)))
data_NA <- is.na(data[, to_impute, drop = FALSE])
to_impute <- names(sort(colSums(data_NA)))

# 3) SELECT VARIABLES USED TO IMPUTE

# Variables on the rhs should either appear in "visit_seq"
# Variables on the rhs should either appear in "to_impute"
# or do not contain any missings
impute_by <- relevant_vars[[2L]][relevant_vars[[2L]] %in% visit_seq |
impute_by <- relevant_vars[[2L]][relevant_vars[[2L]] %in% to_impute |
!vapply(data[, relevant_vars[[2L]], drop = FALSE], anyNA, TRUE)]
completed <- setdiff(impute_by, visit_seq)
completed <- setdiff(impute_by, to_impute)

if (verbose) {
cat("\n Variables used to impute:\t")
Expand All @@ -225,25 +224,23 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
j <- 1L
crit <- TRUE
dig <- 4L
pred_error <- stats::setNames(rep(1, length(visit_seq)), visit_seq)
pred_error <- stats::setNames(rep(1, length(to_impute)), to_impute)
pred_errors <- list()
if (keep_forests) {
forests <- list()
}

if (verbose >= 2) {
cat("\n", abbreviate(visit_seq, minlength = dig + 2L), sep = "\t")
cat("\n", abbreviate(to_impute, minlength = dig + 2L), sep = "\t")
}

# Looping over iterations and variables to impute
while (crit && j <= maxiter) {
if (verbose) {
if (verbose == 1) {
i <- 1L
cat("\n")
cat(paste("iter", j))
cat("\n")
pb <- utils::txtProgressBar(0, length(visit_seq), style = 3)
cat("\niter", j, "\n")
pb <- utils::txtProgressBar(0, length(to_impute), style = 3)
} else if (verbose >= 2) {
cat("\niter ", j, ":\t", sep = "")
}
Expand All @@ -255,7 +252,7 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
forests_last <- forests
}

for (v in visit_seq) {
for (v in to_impute) {
v.na <- data_NA[, v]

if (length(completed) == 0L) {
Expand Down Expand Up @@ -338,7 +335,7 @@ missRanger <- function(data, formula = . ~ ., pmm.k = 0L, maxiter = 10L,
data = data_last,
data_raw = data_raw,
forests = if (keep_forests) forests_last,
visit_seq = visit_seq,
to_impute = to_impute,
impute_by = impute_by,
best_iter = best_iter,
pred_errors = do.call(rbind, pred_errors),
Expand Down
10 changes: 5 additions & 5 deletions R/pmm.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
#' @returns Vector of the same length as `xtest` with values from `xtrain`.
#' @export
#' @examples
#' pmm(xtrain = c(0.2, 0.2, 0.8), xtest = 0.3, ytrain = c(0, 0, 1)) # 0
#' pmm(xtrain = c(TRUE, FALSE, TRUE), xtest = FALSE, ytrain = c(2, 0, 1)) # 0
#' pmm(xtrain = c(0.2, 0.8), xtest = 0.3, ytrain = c("A", "B"), k = 2) # "A" or "B"
#' pmm(xtrain = c("A", "A", "B"), xtest = "A", ytrain = c(2, 2, 4), k = 2) # 2
#' pmm(xtrain = factor(c("A", "B")), xtest = factor("C"), ytrain = 1:2) # 2
#' pmm(xtrain = c(0.2, 0.2, 0.8), xtest = 0.3, ytrain = c(0, 0, 1))
#' pmm(xtrain = c(TRUE, FALSE, TRUE), xtest = FALSE, ytrain = c(2, 0, 1))
#' pmm(xtrain = c(0.2, 0.8), xtest = 0.3, ytrain = c("A", "B"), k = 2)
#' pmm(xtrain = c("A", "A", "B"), xtest = "A", ytrain = c(2, 2, 4), k = 2)
#' pmm(xtrain = factor(c("A", "B")), xtest = factor("C"), ytrain = 1:2)
pmm <- function(xtrain, xtest, ytrain, k = 1L, seed = NULL) {
stopifnot(
length(xtrain) == length(ytrain),
Expand Down
12 changes: 5 additions & 7 deletions man/missRanger.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 5 additions & 5 deletions man/pmm.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 27 additions & 0 deletions tests/testthat/test-ranger.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
test_that("x/y API of ranger does not care about feature order in x at prediction", {
fit <- ranger::ranger(y = iris[[1L]], x = iris[, 2:5], num.trees = 10L)
pred1 <- predict(fit, iris[1:5, 2:5])
pred2 <- predict(fit, iris[1:5, 5:2])
expect_equal(pred1, pred2)
})

test_that("x/y API of ranger does not care about extra columns during prediction", {
fit <- ranger::ranger(y = iris[[1L]], x = iris[, 2:5], num.trees = 10L)
pred1 <- predict(fit, iris[1:5, 2:5])
pred2 <- predict(fit, iris[1:5, ])
expect_equal(pred1, pred2)
})


test_that("ranger can safely predict on character feature", {
ir <- transform(iris, Species = as.character(Species))
fit <- ranger::ranger(Sepal.Length ~ ., data = ir)

pred1 <- predict(fit, ir[c(1, 51, 101), ])$predictions
pred2 <- c(
predict(fit, ir[c(1), ])$predictions,
predict(fit, ir[c(51), ])$predictions,
predict(fit, ir[c(101), ])$predictions
)
expect_equal(pred1, pred2)
})
58 changes: 28 additions & 30 deletions vignettes/working_with_censoring.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ knitr::opts_chunk$set(

## How to deal with censored variables?

There is no obvious way of how to deal with survival variables as covariables in imputation models.
There is no obvious way of how to deal with survival variables as covariates in imputation models.

Options discussed in [@white] include:

Expand All @@ -31,16 +31,16 @@ Options discussed in [@white] include:

- $\text{surv}(t)$, and, optionally $s$

By $\text{surv}(t)$, we denote the Nelson-Aalen survival estimate at each value of $t$. The third option is the most elegant one as it explicitly deals with censoring information. We provide some additional details on it in the example.
By $\text{surv}(t)$, we denote the Nelson-Aalen survival estimate at each value of $t$. The third option seems attractive as it explicitly deals with censoring information. We provide some additional details on it in the example.

### Example

For illustration, we use data from a randomized two-arm trial about lung cancer. The aim is to estimate the treatment effect of "trt" with reliable inference using Cox regression. Unfortunately, we have added missing values in the covariables "age", "karno", and "diagtime".
For illustration, we use data from a randomized two-arm trial about lung cancer. The aim is to estimate the treatment effect of "trt" with reliable inference using Cox regression. We add missing values in the covariates "age", "karno", and "diagtime".

A reasonable way to estimate the covariable adjusted treatment effect is the following:
Let's estimate the covariate adjusted treatment effect using the following steps:

1. Add Nelson-Aalen survival estimates "surv" to the dataset.
2. Use "surv" as well as the covariables to impute missing values in the covariables multiple times.
2. Use "surv" as well as the covariates to impute missing values in the covariates multiple times.
3. Perform the intended Cox regression for each of the imputed data sets.
4. Pool their results by Rubin's rule [@rubin], using package {mice} [@buuren].

Expand All @@ -62,14 +62,12 @@ head(veteran)
# 6 1 squamous 10 1 20 5 49 0

# 1. Calculate Nelson-Aalen survival probabilities for each time point
nelson_aalen <- summary(
survfit(Surv(time, status) ~ 1, data = veteran),
times = unique(veteran$time)
)[c("time", "surv")]
nelson_aalen <- data.frame(nelson_aalen)
nelson_aalen <- survfit(Surv(time, status) ~ 1, data = veteran) |>
summary(times = unique(veteran$time))
nelson_aalen <- nelson_aalen[c("time", "surv")]

# Add it to the original data set
veteran2 <- merge(veteran, nelson_aalen, all.x = TRUE)
veteran2 <- merge(veteran, nelson_aalen, all.x = TRUE, by = "time")

# Add missing values to make things tricky
veteran2 <- generateNA(veteran2, p = c(age = 0.1, karno = 0.1, diagtime = 0.1))
Expand All @@ -80,7 +78,7 @@ filled <- replicate(
missRanger(
veteran2, . ~ . - time - status,
verbose = 0,
pmm.k = 3,
pmm.k = 10,
num.trees = 25
),
simplify = FALSE
Expand All @@ -92,28 +90,28 @@ models <- lapply(filled, function(x) coxph(Surv(time, status) ~ . - surv, x))
# 4. Pool the results by mice
summary(pooled_fit <- pool(models))

# term estimate std.error statistic df p.value
# 1 trt 0.245855250 0.212810467 1.1552780 108.72929 2.505091e-01
# 2 celltypesmallcell 0.805233656 0.284424937 2.8310937 114.17088 5.483657e-03
# 3 celltypeadeno 1.110172771 0.307570269 3.6094931 111.91588 4.603422e-04
# 4 celltypelarge 0.328227283 0.291163500 1.1272954 109.30510 2.620862e-01
# 5 karno -0.031838682 0.005663349 -5.6218824 112.60325 1.390333e-07
# 6 diagtime 0.002775351 0.009382270 0.2958081 86.61582 7.680847e-01
# 7 age -0.007843577 0.009293988 -0.8439410 107.86917 4.005701e-01
# 8 prior 0.003165245 0.023501821 0.1346809 111.84783 8.931063e-01
term estimate std.error statistic df p.value
1 trt 0.253538955 0.212912636 1.1908122 108.60558 2.363235e-01
2 celltypesmallcell 0.815455716 0.285715626 2.8540816 113.48112 5.132019e-03
3 celltypeadeno 1.110530627 0.311566058 3.5643505 108.55506 5.436043e-04
4 celltypelarge 0.346762041 0.288497586 1.2019582 113.27005 2.318866e-01
5 karno -0.030754195 0.005758037 -5.3410901 99.68585 5.837902e-07
6 diagtime 0.001835002 0.009233708 0.1987286 107.07024 8.428519e-01
7 age -0.006032505 0.009447786 -0.6385099 101.88961 5.245745e-01
8 prior 0.002983767 0.023275501 0.1281935 115.27919 8.982192e-01

# Compare with the results on the original data
summary(coxph(Surv(time, status) ~ ., veteran))$coefficients

# coef exp(coef) se(coef) z Pr(>|z|)
# trt 2.946028e-01 1.3425930 0.207549604 1.419433313 1.557727e-01
# celltypesmallcell 8.615605e-01 2.3668512 0.275284474 3.129709606 1.749792e-03
# celltypeadeno 1.196066e+00 3.3070825 0.300916994 3.974738536 7.045662e-05
# celltypelarge 4.012917e-01 1.4937529 0.282688638 1.419553530 1.557377e-01
# karno -3.281533e-02 0.9677173 0.005507757 -5.958020093 2.553121e-09
# diagtime 8.132051e-05 1.0000813 0.009136062 0.008901046 9.928981e-01
# age -8.706475e-03 0.9913313 0.009300299 -0.936149992 3.491960e-01
# prior 7.159360e-03 1.0071850 0.023230538 0.308187441 7.579397e-01
coef exp(coef) se(coef) z Pr(>|z|)
trt 2.946028e-01 1.3425930 0.207549604 1.419433313 1.557727e-01
celltypesmallcell 8.615605e-01 2.3668512 0.275284474 3.129709606 1.749792e-03
celltypeadeno 1.196066e+00 3.3070825 0.300916994 3.974738536 7.045662e-05
celltypelarge 4.012917e-01 1.4937529 0.282688638 1.419553530 1.557377e-01
karno -3.281533e-02 0.9677173 0.005507757 -5.958020093 2.553121e-09
diagtime 8.132051e-05 1.0000813 0.009136062 0.008901046 9.928981e-01
age -8.706475e-03 0.9913313 0.009300299 -0.936149992 3.491960e-01
prior 7.159360e-03 1.0071850 0.023230538 0.308187441 7.579397e-01
```

## References
Loading