From 73cf3e48e14e16cfd828b1daece6905c0f51cbc3 Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Fri, 27 Sep 2024 11:03:37 -0500 Subject: [PATCH 1/5] remove `make_inner_split()` and its tests workflows will no longer take an `add_tailor(prop)` or `add_tailor(method)` argument, instead taking a `fit.workflow(calibration)` argument that supersedes both of them. first, remove machinery that relates specifically to those arguments. --- R/fit.R | 41 +++------------------ tests/testthat/test-fit.R | 34 ------------------ tests/testthat/test-post-action-tailor.R | 46 ------------------------ 3 files changed, 5 insertions(+), 116 deletions(-) diff --git a/R/fit.R b/R/fit.R index dcea1f9..54b4250 100644 --- a/R/fit.R +++ b/R/fit.R @@ -62,23 +62,12 @@ fit.workflow <- function(object, data, ..., control = control_workflow()) { data <- sparsevctrs::coerce_to_sparse_tibble(data) } - # If `calibration` is not overwritten in the following `if` statement, then the - # the postprocessor doesn't actually require training and the dataset - # passed to `.fit_post()` will have no effect. - calibration <- data - if (.should_inner_split(object)) { - inner_split <- make_inner_split(object, data) - - data <- rsample::analysis(inner_split) - calibration <- rsample::assessment(inner_split) - } - workflow <- object workflow <- .fit_pre(workflow, data) workflow <- .fit_model(workflow, control) - if (has_postprocessor(workflow)) { - workflow <- .fit_post(workflow, calibration) - } + # if (has_postprocessor(workflow)) { + # workflow <- .fit_post(workflow, calibration) + # } workflow <- .fit_finalize(workflow) workflow @@ -89,29 +78,9 @@ fit.workflow <- function(object, data, ..., control = control_workflow()) { #' @keywords internal .should_inner_split <- function(workflow) { has_postprocessor(workflow) && - tailor::tailor_requires_fit( - extract_postprocessor(workflow, estimated = FALSE) - ) -} - -make_inner_split <- function(object, data) { - validate_rsample_available() - - method <- object$post$actions$tailor$method - mocked_split <- - rsample::make_splits( - list(analysis = seq_len(nrow(data)), assessment = integer()), - data = data, - class = if (is.null(method)) "mc_split" else method + tailor::tailor_requires_fit( + extract_postprocessor(workflow, estimated = FALSE) ) - - # add_tailor(prop) is the proportion to train the postprocessor, while - # rsample::mc_cv(prop) is the proportion to train the model (#247) - prop <- object$post$actions$tailor$prop - rsample::inner_split( - mocked_split, - list(prop = if (is.null(prop)) 2/3 else 1 - prop) - ) } # ------------------------------------------------------------------------------ diff --git a/tests/testthat/test-fit.R b/tests/testthat/test-fit.R index a4c157f..02b861f 100644 --- a/tests/testthat/test-fit.R +++ b/tests/testthat/test-fit.R @@ -218,40 +218,6 @@ test_that(".should_inner_split works", { )) }) -test_that("`make_inner_split()` works", { - tlr <- - tailor::tailor() %>% - tailor::adjust_numeric_calibration() %>% - tailor::adjust_numeric_range(lower_limit = 1) - - wflow <- - workflow() %>% - add_formula(mpg ~ .) 
%>% - add_model(parsnip::linear_reg()) %>% - add_tailor(tlr) - - # defaults to 1/3 allotted to calibration via `mc_split`s - inner_splt <- make_inner_split(wflow, data.frame(x = 1:36)) - expect_s3_class(inner_splt, c("mc_split_inner", "mc_split")) - expect_equal(nrow(rsample::analysis(inner_splt)), 24) - expect_equal(nrow(rsample::assessment(inner_splt)), 12) - - # respects `add_tailor(prop)` - wflow <- wflow %>% remove_tailor() %>% add_tailor(tlr, prop = 1/2) - inner_splt <- make_inner_split(wflow, data.frame(x = 1:36)) - expect_s3_class(inner_splt, c("mc_split_inner", "mc_split")) - expect_equal(nrow(rsample::analysis(inner_splt)), 18) - expect_equal(nrow(rsample::assessment(inner_splt)), 18) - - # respects `add_tailor(method)` - skip("currently errors re: passing `prop` to `bootstraps()`") - wflow <- wflow %>% remove_tailor() %>% add_tailor(tlr, method = "boot_split") - inner_splt <- make_inner_split(wflow, data.frame(x = 1:36)) - expect_s3_class(inner_splt, "boot_split") - expect_equal(nrow(rsample::analysis(inner_splt)), 18) - expect_equal(nrow(rsample::assessment(inner_splt)), 18) -}) - # ------------------------------------------------------------------------------ # .fit_finalize() diff --git a/tests/testthat/test-post-action-tailor.R b/tests/testthat/test-post-action-tailor.R index 4955472..238f2b9 100644 --- a/tests/testthat/test-post-action-tailor.R +++ b/tests/testthat/test-post-action-tailor.R @@ -152,49 +152,3 @@ test_that("postprocessor fit aligns with manually fitted version (with calibrati expect_equal(wflow_manual_preds[".pred"], wflow_post_preds) expect_false(all(wflow_simple_preds[".pred"] == wflow_manual_preds[".pred"])) }) - -test_that("postprocessor fit uses correct data (with calibration, non-default `prop`)", { - skip_if_not_installed("modeldata") - skip_if_not_installed("rsample") - - # create example data - y <- seq(0, 7, .1) - dat <- data.frame(y = y, x = y + (y-3)^2) - - # construct workflows - post <- tailor::tailor() - post <- tailor::adjust_numeric_calibration(post, "linear") - - wflow_simple <- workflow(y ~ ., parsnip::linear_reg()) - wflow_post <- add_tailor(wflow_simple, post, prop = 1/2) - - # train workflow - - # first, separate out the same data that workflows ought to internally - # when training with a postprocessor that needs estimation - mocked_split <- - rsample::make_splits( - list(analysis = seq_len(nrow(dat)), assessment = integer()), - data = dat, - class = "mc_split" - ) - set.seed(1) - inner_split <- rsample::inner_split(mocked_split, list(prop = 1/2)) - - wf_simple_fit <- fit(wflow_simple, rsample::analysis(inner_split)) - - # the following fit will do all of this internally - set.seed(1) - wf_post_fit <- fit(wflow_post, dat) - - # ...verify predictions are the same as training the post-proc separately. - # note that this test naughtily re-predicts on the calibration set. 
- wflow_simple_preds <- augment(wf_simple_fit, rsample::assessment(inner_split)) - post_trained <- fit(post, wflow_simple_preds, y, .pred) - wflow_manual_preds <- predict(post_trained, wflow_simple_preds) - - wflow_post_preds <- predict(wf_post_fit, rsample::assessment(inner_split)) - - expect_equal(wflow_manual_preds[".pred"], wflow_post_preds) - expect_false(all(wflow_simple_preds[".pred"] == wflow_manual_preds[".pred"])) -}) From d7a9797e8209cdf6664ac43f7690d2b7e496624c Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Fri, 27 Sep 2024 11:32:48 -0500 Subject: [PATCH 2/5] transition from `add_tailor(prop)` and `method` to `fit.workflow(calibration)` * removes `add_tailor(prop)` and `add_tailor(method)` * adds `fit.workflow(calibration)` * various documentation updates --- R/fit.R | 39 ++++++++++++-- R/post-action-tailor.R | 68 +++++++----------------- man/add_tailor.Rd | 51 ++++++------------ man/fit-workflow.Rd | 8 ++- tests/testthat/_snaps/fit.md | 16 ++++++ tests/testthat/test-fit.R | 23 ++++++++ tests/testthat/test-post-action-tailor.R | 28 +++------- vignettes/stages.Rmd | 3 +- 8 files changed, 124 insertions(+), 112 deletions(-) diff --git a/R/fit.R b/R/fit.R index 54b4250..42e54b2 100644 --- a/R/fit.R +++ b/R/fit.R @@ -16,10 +16,14 @@ #' @param object A workflow #' #' @param data A data frame of predictors and outcomes to use when fitting the -#' workflow +#' preprocessor and model. #' #' @param ... Not used #' +#' @param calibration A data frame of predictors and outcomes to use when +#' fitting the postprocessor. See the "Data Usage" section of [add_tailor()] +#' for more information. +#' #' @param control A [control_workflow()] object #' #' @return @@ -51,13 +55,15 @@ #' add_recipe(recipe) #' #' fit(recipe_wf, mtcars) -fit.workflow <- function(object, data, ..., control = control_workflow()) { +fit.workflow <- function(object, data, ..., calibration = NULL, control = control_workflow()) { check_dots_empty() if (is_missing(data)) { cli_abort("{.arg data} must be provided to fit a workflow.") } + validate_has_calibration(object, calibration) + if (is_sparse_matrix(data)) { data <- sparsevctrs::coerce_to_sparse_tibble(data) } @@ -65,9 +71,11 @@ fit.workflow <- function(object, data, ..., control = control_workflow()) { workflow <- object workflow <- .fit_pre(workflow, data) workflow <- .fit_model(workflow, control) - # if (has_postprocessor(workflow)) { - # workflow <- .fit_post(workflow, calibration) - # } + if (has_postprocessor(workflow)) { + # if (is.null(calibration)), then the tailor doesn't have a calibrator + # and training the tailor on `data` will not leak data + workflow <- .fit_post(workflow, calibration %||% data) + } workflow <- .fit_finalize(workflow) workflow @@ -218,6 +226,27 @@ validate_has_model <- function(x, ..., call = caller_env()) { invisible(x) } +validate_has_calibration <- function(x, calibration, + x_arg = caller_arg(x), call = caller_env()) { + if (.should_inner_split(x) && is.null(calibration)) { + cli::cli_abort( + "{.arg {x_arg}} requires a {.arg calibration} set to train but none + was supplied.", + call = call + ) + } + + if (!.should_inner_split(x) && !is.null(calibration)) { + cli::cli_abort( + "{.arg {x_arg}} does not require a {.arg calibration} set to train + but one was supplied.", + call = call + ) + } + + invisible(x) +} + # ------------------------------------------------------------------------------ finalize_blueprint <- function(workflow) { diff --git a/R/post-action-tailor.R b/R/post-action-tailor.R index 882e4c1..e6382a2 100644 --- 
a/R/post-action-tailor.R +++ b/R/post-action-tailor.R @@ -17,28 +17,16 @@ #' should not have been trained already with [tailor::fit()]; workflows #' will handle training internally. #' -#' @param prop The proportion of the data in [fit.workflow()] that should be -#' held back specifically for estimating the postprocessor. Only relevant for -#' postprocessors that require estimation---see section Data Usage below to -#' learn more. Defaults to 1/3. -#' -#' @param method The method with which to split the data in [fit.workflow()], -#' as a character vector. Only relevant for postprocessors that -#' require estimation and not required when resampling the workflow with -#' tune. If `fit.workflow(data)` arose as `training(split_object)`, this argument can -#' usually be supplied as `class(split_object)`. Defaults to `"mc_split"`, which -#' randomly samples `fit.workflow(data)` into two sets, similarly to -#' [rsample::initial_split()]. See section Data Usage below to learn more. -#' #' @section Data Usage: #' #' While preprocessors and models are trained on data in the usual sense, #' postprocessors are training on _predictions_ on data. When a workflow -#' is fitted, the user supplies training data with the `data` argument. +#' is fitted, the user typically supplies training data with the `data` argument. #' When workflows don't contain a postprocessor that requires training, -#' they can use all of the supplied `data` to train the preprocessor and model. -#' However, in the case where a postprocessor must be trained as well, -#' training the preprocessor and model on all of `data` would leave no data +#' users can pass all of the available data to the `data` argument to train the +#' preprocessor and model. However, in the case where a postprocessor must be +#' trained as well, allotting all of the available data to the `data` argument +#' to train the preprocessor and model would leave no data #' left to train the postprocessor with---if that were the case, workflows #' would need to `predict()` from the preprocessor and model on the same `data` #' that they were trained on, with the postprocessor then training on those @@ -49,22 +37,15 @@ #' is passed to that trained postprocessor and model to generate predictions, #' which then form the training data for the postprocessor. #' -#' The arguments `prop` and `method` parameterize how that data is split up. -#' `prop` determines the proportion of rows in `fit.workflow(data)` that are -#' allotted to training the preprocessor and model, while the rest are used to -#' train the postprocessor. `method` determines how that split occurs; since -#' `fit.workflow()` just takes in a data frame, the function doesn't have -#' any information on how that dataset came to be. For example, `data` could -#' have been created as: -#' -#' ``` -#' split <- rsample::initial_split(some_other_data) -#' data <- rsample::training(split) -#' ``` +#' When fitting a workflow with a postprocessor that requires training +#' (i.e. one that returns `TRUE` in `.should_inner_split(workflow)`), users +#' must pass two data arguments--the usual `fit.workflow(data)` will be used +#' to train the preprocessor and model while `fit.workflow(calibration)` will +#' be used to train the postprocessor. #' -#' ...in which case it's okay to randomly allot some rows of `data` to train the -#' preprocessor and model and the rest to train the postprocessor. 
However, -#' `data` could also have arisen as: +#' In some situations, randomly splitting `fit.workflow(data)` (with +#' [rsample::initial_split()], for example) is sufficient to prevent data +#' leakage. However, `fit.workflow(data)` could also have arisen as: #' #' ``` #' boots <- rsample::bootstraps(some_other_data) @@ -78,8 +59,9 @@ #' datasets, resulting in the preprocessor and model generating predictions on #' rows they've seen before. Similarly problematic situations could arise in the #' context of other resampling situations, like time-based splits. -#' The `method` argument ensures that data is allotted properly (and is -#' internally handled by the tune package when resampling workflows). +#' In general, use the [rsample::inner_split()] function to prevent data +#' leakage when resampling; when workflows with postprocessors that require +#' training are passed to the tune package, this is handled internally. #' #' @param ... Not used. #' @@ -102,14 +84,11 @@ #' remove_tailor(workflow) #' #' update_tailor(workflow, adjust_probability_threshold(tailor, .2)) -add_tailor <- function(x, tailor, prop = NULL, method = NULL, ...) { +add_tailor <- function(x, tailor, ...) { check_dots_empty() validate_tailor_available() - action <- new_action_tailor(tailor, prop = prop, method = method) + action <- new_action_tailor(tailor) res <- add_action(x, action, "tailor") - if (.should_inner_split(res)) { - validate_rsample_available() - } res } @@ -185,7 +164,7 @@ mock_trained_workflow <- function(workflow) { # ------------------------------------------------------------------------------ -new_action_tailor <- function(tailor, prop, method, ..., call = caller_env()) { +new_action_tailor <- function(tailor, ..., call = caller_env()) { check_dots_empty() if (!is_tailor(tailor)) { @@ -196,17 +175,8 @@ new_action_tailor <- function(tailor, prop, method, ..., call = caller_env()) { cli_abort("Can't add a trained tailor to a workflow.", call = call) } - if (!is.null(prop) && - (!rlang::is_double(prop, n = 1) || prop <= 0 || prop >= 1)) { - cli_abort("{.arg prop} must be a numeric on (0, 1).", call = call) - } - - # todo: test method - new_action_post( tailor = tailor, - prop = prop, - method = method, subclass = "action_tailor" ) } diff --git a/man/add_tailor.Rd b/man/add_tailor.Rd index 75e970d..97eba8f 100644 --- a/man/add_tailor.Rd +++ b/man/add_tailor.Rd @@ -6,7 +6,7 @@ \alias{update_tailor} \title{Add a tailor to a workflow} \usage{ -add_tailor(x, tailor, prop = NULL, method = NULL, ...) +add_tailor(x, tailor, ...) remove_tailor(x) @@ -19,19 +19,6 @@ update_tailor(x, tailor, ...) should not have been trained already with \code{\link[tailor:reexports]{tailor::fit()}}; workflows will handle training internally.} -\item{prop}{The proportion of the data in \code{\link[=fit.workflow]{fit.workflow()}} that should be -held back specifically for estimating the postprocessor. Only relevant for -postprocessors that require estimation---see section Data Usage below to -learn more. Defaults to 1/3.} - -\item{method}{The method with which to split the data in \code{\link[=fit.workflow]{fit.workflow()}}, -as a character vector. Only relevant for postprocessors that -require estimation and not required when resampling the workflow with -tune. If \code{fit.workflow(data)} arose as \code{training(split_object)}, this argument can -usually be supplied as \code{class(split_object)}. 
Defaults to \code{"mc_split"}, which -randomly samples \code{fit.workflow(data)} into two sets, similarly to -\code{\link[rsample:initial_split]{rsample::initial_split()}}. See section Data Usage below to learn more.} - \item{...}{Not used.} } \value{ @@ -53,11 +40,12 @@ tailor with the new one. While preprocessors and models are trained on data in the usual sense, postprocessors are training on \emph{predictions} on data. When a workflow -is fitted, the user supplies training data with the \code{data} argument. +is fitted, the user typically supplies training data with the \code{data} argument. When workflows don't contain a postprocessor that requires training, -they can use all of the supplied \code{data} to train the preprocessor and model. -However, in the case where a postprocessor must be trained as well, -training the preprocessor and model on all of \code{data} would leave no data +users can pass all of the available data to the \code{data} argument to train the +preprocessor and model. However, in the case where a postprocessor must be +trained as well, allotting all of the available data to the \code{data} argument +to train the preprocessor and model would leave no data left to train the postprocessor with---if that were the case, workflows would need to \code{predict()} from the preprocessor and model on the same \code{data} that they were trained on, with the postprocessor then training on those @@ -68,21 +56,15 @@ train the preprocessor and model and the second, called the "calibration set," is passed to that trained postprocessor and model to generate predictions, which then form the training data for the postprocessor. -The arguments \code{prop} and \code{method} parameterize how that data is split up. -\code{prop} determines the proportion of rows in \code{fit.workflow(data)} that are -allotted to training the preprocessor and model, while the rest are used to -train the postprocessor. \code{method} determines how that split occurs; since -\code{fit.workflow()} just takes in a data frame, the function doesn't have -any information on how that dataset came to be. For example, \code{data} could -have been created as: - -\if{html}{\out{
}}\preformatted{split <- rsample::initial_split(some_other_data) -data <- rsample::training(split) -}\if{html}{\out{
}} +When fitting a workflow with a postprocessor that requires training +(i.e. one that returns \code{TRUE} in \code{.should_inner_split(workflow)}), users +must pass two data arguments--the usual \code{fit.workflow(data)} will be used +to train the preprocessor and model while \code{fit.workflow(calibration)} will +be used to train the postprocessor. -...in which case it's okay to randomly allot some rows of \code{data} to train the -preprocessor and model and the rest to train the postprocessor. However, -\code{data} could also have arisen as: +In some situations, randomly splitting \code{fit.workflow(data)} (with +\code{\link[rsample:initial_split]{rsample::initial_split()}}, for example) is sufficient to prevent data +leakage. However, \code{fit.workflow(data)} could also have arisen as: \if{html}{\out{
}}\preformatted{boots <- rsample::bootstraps(some_other_data) split <- rsample::get_rsplit(boots, 1) @@ -95,8 +77,9 @@ the preprocessor would likely result in the same rows appearing in both datasets, resulting in the preprocessor and model generating predictions on rows they've seen before. Similarly problematic situations could arise in the context of other resampling situations, like time-based splits. -The \code{method} argument ensures that data is allotted properly (and is -internally handled by the tune package when resampling workflows). +In general, use the \code{\link[rsample:inner_split]{rsample::inner_split()}} function to prevent data +leakage when resampling; when workflows with postprocessors that require +training are passed to the tune package, this is handled internally. } \examples{ diff --git a/man/fit-workflow.Rd b/man/fit-workflow.Rd index be434fd..3c2a0a0 100644 --- a/man/fit-workflow.Rd +++ b/man/fit-workflow.Rd @@ -5,16 +5,20 @@ \alias{fit.workflow} \title{Fit a workflow object} \usage{ -\method{fit}{workflow}(object, data, ..., control = control_workflow()) +\method{fit}{workflow}(object, data, ..., calibration = NULL, control = control_workflow()) } \arguments{ \item{object}{A workflow} \item{data}{A data frame of predictors and outcomes to use when fitting the -workflow} +preprocessor and model.} \item{...}{Not used} +\item{calibration}{A data frame of predictors and outcomes to use when +fitting the postprocessor. See the "Data Usage" section of \code{\link[=add_tailor]{add_tailor()}} +for more information.} + \item{control}{A \code{\link[=control_workflow]{control_workflow()}} object} } \value{ diff --git a/tests/testthat/_snaps/fit.md b/tests/testthat/_snaps/fit.md index caa386e..cc6658e 100644 --- a/tests/testthat/_snaps/fit.md +++ b/tests/testthat/_snaps/fit.md @@ -32,6 +32,22 @@ ! The workflow must have a model. i Provide one with `add_model()`. +# fit.workflow confirms compatibility of object and calibration + + Code + fit(workflow, mtcars, calibration = mtcars) + Condition + Error in `fit()`: + ! `object` does not require a `calibration` set to train but one was supplied. + +--- + + Code + fit(workflow, mtcars) + Condition + Error in `fit()`: + ! `object` requires a `calibration` set to train but none was supplied. 
+ # can `predict()` from workflow fit from individual pieces Code diff --git a/tests/testthat/test-fit.R b/tests/testthat/test-fit.R index 02b861f..5df66c2 100644 --- a/tests/testthat/test-fit.R +++ b/tests/testthat/test-fit.R @@ -85,6 +85,29 @@ test_that("cannot fit without a fit stage", { }) }) +test_that("fit.workflow confirms compatibility of object and calibration", { + skip_if_not_installed("tailor") + + mod <- parsnip::linear_reg() + mod <- parsnip::set_engine(mod, "lm") + + workflow <- workflow() + workflow <- add_formula(workflow, mpg ~ cyl) + workflow <- add_model(workflow, mod) + + expect_snapshot(error = TRUE, { + fit(workflow, mtcars, calibration = mtcars) + }) + + tailor <- tailor::tailor() + tailor <- tailor::adjust_numeric_calibration(tailor) + workflow <- add_tailor(workflow, tailor) + + expect_snapshot(error = TRUE, { + fit(workflow, mtcars) + }) +}) + # ------------------------------------------------------------------------------ # .fit_pre() diff --git a/tests/testthat/test-post-action-tailor.R b/tests/testthat/test-post-action-tailor.R index 238f2b9..7698bab 100644 --- a/tests/testthat/test-post-action-tailor.R +++ b/tests/testthat/test-post-action-tailor.R @@ -115,6 +115,9 @@ test_that("postprocessor fit aligns with manually fitted version (with calibrati y <- seq(0, 7, .1) dat <- data.frame(y = y, x = y + (y-3)^2) + dat_data <- dat[1:40, ] + dat_calibration <- dat[41:71, ] + # construct workflows post <- tailor::tailor() post <- tailor::adjust_numeric_calibration(post, "linear") @@ -122,32 +125,17 @@ test_that("postprocessor fit aligns with manually fitted version (with calibrati wflow_simple <- workflow(y ~ ., parsnip::linear_reg()) wflow_post <- add_tailor(wflow_simple, post) - # train workflow - - # first, separate out the same data that workflows ought to internally - # when training with a postprocessor that needs estimation - mocked_split <- - rsample::make_splits( - list(analysis = seq_len(nrow(dat)), assessment = integer()), - data = dat, - class = "mc_split" - ) - set.seed(1) - inner_split <- rsample::inner_split(mocked_split, list(prop = 2/3)) - - wf_simple_fit <- fit(wflow_simple, rsample::analysis(inner_split)) - - # the following fit will do all of this internally - set.seed(1) - wf_post_fit <- fit(wflow_post, dat) + # train workflows + wf_simple_fit <- fit(wflow_simple, dat_data) + wf_post_fit <- fit(wflow_post, dat_data, calibration = dat_calibration) # ...verify predictions are the same as training the post-proc separately. # note that this test naughtily re-predicts on the calibration set. - wflow_simple_preds <- augment(wf_simple_fit, rsample::assessment(inner_split)) + wflow_simple_preds <- augment(wf_simple_fit, dat_calibration) post_trained <- fit(post, wflow_simple_preds, y, .pred) wflow_manual_preds <- predict(post_trained, wflow_simple_preds) - wflow_post_preds <- predict(wf_post_fit, rsample::assessment(inner_split)) + wflow_post_preds <- predict(wf_post_fit, dat_calibration) expect_equal(wflow_manual_preds[".pred"], wflow_post_preds) expect_false(all(wflow_simple_preds[".pred"] == wflow_manual_preds[".pred"])) diff --git a/vignettes/stages.Rmd b/vignettes/stages.Rmd index 36ea57e..57af49b 100644 --- a/vignettes/stages.Rmd +++ b/vignettes/stages.Rmd @@ -39,5 +39,4 @@ When using a preprocessor, you may need an additional formula for special model ## Post-processing -`tailor` post-processors are the only option here, specified via `add_tailor()`. 
Some examples of post-processing model predictions could include adding a probability threshold for two-class problems, calibration of probability estimates, truncating the possible range of predictions, and so on. `add_tailor()` takes two arguments, `prop` and `method`, that control how data will be internally partitioned into two training sets, one for the preprocessor and model and one for the post-processor---see the Data Usage section in `?add_tailor()` to learn more. - +`tailor` post-processors are the only option here, specified via `add_tailor()`. Some examples of post-processing model predictions could include adding a probability threshold for two-class problems, calibration of probability estimates, truncating the possible range of predictions, and so on. From 47eda7beb5dc8a02c6bd0ec40ea7ed5d78b6f5a4 Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Fri, 27 Sep 2024 11:34:40 -0500 Subject: [PATCH 3/5] remove rsample Suggests --- DESCRIPTION | 2 -- R/post-action-tailor.R | 4 ++-- R/utils.R | 14 -------------- man/add_tailor.Rd | 4 ++-- tests/testthat/test-post-action-tailor.R | 1 - 5 files changed, 4 insertions(+), 21 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e99ff4c..f497af1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -42,7 +42,6 @@ Suggests: methods, modeldata (>= 1.0.0), recipes (>= 1.0.10.9000), - rsample (>= 1.2.1.9000), rmarkdown, testthat (>= 3.0.0) VignetteBuilder: @@ -54,7 +53,6 @@ Config/Needs/website: tidyverse/tidytemplate, yardstick Remotes: - tidymodels/rsample, tidymodels/recipes, tidymodels/parsnip, tidymodels/tailor, diff --git a/R/post-action-tailor.R b/R/post-action-tailor.R index e6382a2..bee3d78 100644 --- a/R/post-action-tailor.R +++ b/R/post-action-tailor.R @@ -44,7 +44,7 @@ #' be used to train the postprocessor. #' #' In some situations, randomly splitting `fit.workflow(data)` (with -#' [rsample::initial_split()], for example) is sufficient to prevent data +#' `rsample::initial_split()`, for example) is sufficient to prevent data #' leakage. However, `fit.workflow(data)` could also have arisen as: #' #' ``` @@ -59,7 +59,7 @@ #' datasets, resulting in the preprocessor and model generating predictions on #' rows they've seen before. Similarly problematic situations could arise in the #' context of other resampling situations, like time-based splits. -#' In general, use the [rsample::inner_split()] function to prevent data +#' In general, use the `rsample::inner_split()` function to prevent data #' leakage when resampling; when workflows with postprocessors that require #' training are passed to the tune package, this is handled internally. 
#' diff --git a/R/utils.R b/R/utils.R index 692a13c..7d60933 100644 --- a/R/utils.R +++ b/R/utils.R @@ -40,20 +40,6 @@ validate_tailor_available <- function(..., call = caller_env()) { invisible() } -validate_rsample_available <- function(..., call = caller_env()) { - check_dots_empty() - - if (!requireNamespace("rsample", quietly = TRUE)) { - cli_abort( - "The {.pkg rsample} package must be available to add a tailor that - requires training.", - call = call - ) - } - - invisible() -} - # ------------------------------------------------------------------------------ # https://github.com/r-lib/tidyselect/blob/10e00cea2fff3585fc827b6a7eb5e172acadbb2f/R/utils.R#L109 diff --git a/man/add_tailor.Rd b/man/add_tailor.Rd index 97eba8f..8942368 100644 --- a/man/add_tailor.Rd +++ b/man/add_tailor.Rd @@ -63,7 +63,7 @@ to train the preprocessor and model while \code{fit.workflow(calibration)} will be used to train the postprocessor. In some situations, randomly splitting \code{fit.workflow(data)} (with -\code{\link[rsample:initial_split]{rsample::initial_split()}}, for example) is sufficient to prevent data +\code{rsample::initial_split()}, for example) is sufficient to prevent data leakage. However, \code{fit.workflow(data)} could also have arisen as: \if{html}{\out{
}}\preformatted{boots <- rsample::bootstraps(some_other_data) @@ -77,7 +77,7 @@ the preprocessor would likely result in the same rows appearing in both datasets, resulting in the preprocessor and model generating predictions on rows they've seen before. Similarly problematic situations could arise in the context of other resampling situations, like time-based splits. -In general, use the \code{\link[rsample:inner_split]{rsample::inner_split()}} function to prevent data +In general, use the \code{rsample::inner_split()} function to prevent data leakage when resampling; when workflows with postprocessors that require training are passed to the tune package, this is handled internally. } diff --git a/tests/testthat/test-post-action-tailor.R b/tests/testthat/test-post-action-tailor.R index 7698bab..74579b7 100644 --- a/tests/testthat/test-post-action-tailor.R +++ b/tests/testthat/test-post-action-tailor.R @@ -109,7 +109,6 @@ test_that("postprocessor fit aligns with manually fitted version (no calibration test_that("postprocessor fit aligns with manually fitted version (with calibration)", { skip_if_not_installed("modeldata") - skip_if_not_installed("rsample") # create example data y <- seq(0, 7, .1) From 0a38a86b9db0f3ccb62d7ca4080a66f0578dbe70 Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Mon, 30 Sep 2024 09:44:19 -0500 Subject: [PATCH 4/5] apply suggestions from review * edits to `validate_has_calibration()`: * refer to "The workflow" rather than `caller_arg()` * Warn rather than error on unneeded calibration set * All arguments on one line in function signature * comment on expectation re: predictions * doc tweaks --- R/fit.R | 9 ++++----- R/post-action-tailor.R | 2 +- man/add_tailor.Rd | 2 +- tests/testthat/_snaps/fit.md | 8 ++++---- tests/testthat/test-fit.R | 6 +++--- tests/testthat/test-post-action-tailor.R | 2 ++ 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/R/fit.R b/R/fit.R index 42e54b2..e31fb2e 100644 --- a/R/fit.R +++ b/R/fit.R @@ -226,19 +226,18 @@ validate_has_model <- function(x, ..., call = caller_env()) { invisible(x) } -validate_has_calibration <- function(x, calibration, - x_arg = caller_arg(x), call = caller_env()) { +validate_has_calibration <- function(x, calibration, call = caller_env()) { if (.should_inner_split(x) && is.null(calibration)) { cli::cli_abort( - "{.arg {x_arg}} requires a {.arg calibration} set to train but none + "The workflow requires a {.arg calibration} set to train but none was supplied.", call = call ) } if (!.should_inner_split(x) && !is.null(calibration)) { - cli::cli_abort( - "{.arg {x_arg}} does not require a {.arg calibration} set to train + cli::cli_warn( + "The workflow does not require a {.arg calibration} set to train but one was supplied.", call = call ) diff --git a/R/post-action-tailor.R b/R/post-action-tailor.R index bee3d78..ac28c35 100644 --- a/R/post-action-tailor.R +++ b/R/post-action-tailor.R @@ -27,7 +27,7 @@ #' preprocessor and model. However, in the case where a postprocessor must be #' trained as well, allotting all of the available data to the `data` argument #' to train the preprocessor and model would leave no data -#' left to train the postprocessor with---if that were the case, workflows +#' to train the postprocessor with---if that were the case, workflows #' would need to `predict()` from the preprocessor and model on the same `data` #' that they were trained on, with the postprocessor then training on those #' predictions. 
Predictions on data that a model was trained on likely follow diff --git a/man/add_tailor.Rd b/man/add_tailor.Rd index 8942368..69643e7 100644 --- a/man/add_tailor.Rd +++ b/man/add_tailor.Rd @@ -46,7 +46,7 @@ users can pass all of the available data to the \code{data} argument to train th preprocessor and model. However, in the case where a postprocessor must be trained as well, allotting all of the available data to the \code{data} argument to train the preprocessor and model would leave no data -left to train the postprocessor with---if that were the case, workflows +to train the postprocessor with---if that were the case, workflows would need to \code{predict()} from the preprocessor and model on the same \code{data} that they were trained on, with the postprocessor then training on those predictions. Predictions on data that a model was trained on likely follow diff --git a/tests/testthat/_snaps/fit.md b/tests/testthat/_snaps/fit.md index cc6658e..4a9590b 100644 --- a/tests/testthat/_snaps/fit.md +++ b/tests/testthat/_snaps/fit.md @@ -35,10 +35,10 @@ # fit.workflow confirms compatibility of object and calibration Code - fit(workflow, mtcars, calibration = mtcars) + res <- fit(workflow, mtcars, calibration = mtcars) Condition - Error in `fit()`: - ! `object` does not require a `calibration` set to train but one was supplied. + Warning in `fit()`: + The workflow does not require a `calibration` set to train but one was supplied. --- @@ -46,7 +46,7 @@ fit(workflow, mtcars) Condition Error in `fit()`: - ! `object` requires a `calibration` set to train but none was supplied. + ! The workflow requires a `calibration` set to train but none was supplied. # can `predict()` from workflow fit from individual pieces diff --git a/tests/testthat/test-fit.R b/tests/testthat/test-fit.R index 5df66c2..ac43889 100644 --- a/tests/testthat/test-fit.R +++ b/tests/testthat/test-fit.R @@ -95,9 +95,9 @@ test_that("fit.workflow confirms compatibility of object and calibration", { workflow <- add_formula(workflow, mpg ~ cyl) workflow <- add_model(workflow, mod) - expect_snapshot(error = TRUE, { - fit(workflow, mtcars, calibration = mtcars) - }) + expect_snapshot( + res <- fit(workflow, mtcars, calibration = mtcars) + ) tailor <- tailor::tailor() tailor <- tailor::adjust_numeric_calibration(tailor) diff --git a/tests/testthat/test-post-action-tailor.R b/tests/testthat/test-post-action-tailor.R index 74579b7..ce5a0db 100644 --- a/tests/testthat/test-post-action-tailor.R +++ b/tests/testthat/test-post-action-tailor.R @@ -137,5 +137,7 @@ test_that("postprocessor fit aligns with manually fitted version (with calibrati wflow_post_preds <- predict(wf_post_fit, dat_calibration) expect_equal(wflow_manual_preds[".pred"], wflow_post_preds) + + # okay if some predictions are the same, but we wouldn't expect all of them to be expect_false(all(wflow_simple_preds[".pred"] == wflow_manual_preds[".pred"])) }) From b28a6c4bfb6531cd431f41f889dd9ce37cc67b56 Mon Sep 17 00:00:00 2001 From: simonpcouch Date: Mon, 30 Sep 2024 13:57:12 -0500 Subject: [PATCH 5/5] rephrase "inner split" * `.should_inner_split()` -> `.workflow_includes_calibration()` * refactor conditional in `fit.workflow()` into two --- NAMESPACE | 2 +- R/fit.R | 16 ++++++++++------ R/post-action-tailor.R | 2 +- man/add_tailor.Rd | 2 +- man/workflows-internals.Rd | 6 +++--- tests/testthat/test-fit.R | 20 ++++++++++---------- 6 files changed, 26 insertions(+), 22 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index d14bbca..ca42029 100644 --- a/NAMESPACE +++ b/NAMESPACE 
@@ -38,7 +38,7 @@ export(.fit_finalize) export(.fit_model) export(.fit_post) export(.fit_pre) -export(.should_inner_split) +export(.workflow_includes_calibration) export(add_case_weights) export(add_formula) export(add_model) diff --git a/R/fit.R b/R/fit.R index e31fb2e..1691408 100644 --- a/R/fit.R +++ b/R/fit.R @@ -71,11 +71,15 @@ fit.workflow <- function(object, data, ..., calibration = NULL, control = contro workflow <- object workflow <- .fit_pre(workflow, data) workflow <- .fit_model(workflow, control) + + if (!.workflow_includes_calibration(workflow)) { + # in this case, training the tailor on `data` will not leak data (#262) + calibration <- data + } if (has_postprocessor(workflow)) { - # if (is.null(calibration)), then the tailor doesn't have a calibrator - # and training the tailor on `data` will not leak data - workflow <- .fit_post(workflow, calibration %||% data) + workflow <- .fit_post(workflow, calibration) } + workflow <- .fit_finalize(workflow) workflow @@ -84,7 +88,7 @@ fit.workflow <- function(object, data, ..., calibration = NULL, control = contro #' @export #' @rdname workflows-internals #' @keywords internal -.should_inner_split <- function(workflow) { +.workflow_includes_calibration <- function(workflow) { has_postprocessor(workflow) && tailor::tailor_requires_fit( extract_postprocessor(workflow, estimated = FALSE) @@ -227,7 +231,7 @@ validate_has_model <- function(x, ..., call = caller_env()) { } validate_has_calibration <- function(x, calibration, call = caller_env()) { - if (.should_inner_split(x) && is.null(calibration)) { + if (.workflow_includes_calibration(x) && is.null(calibration)) { cli::cli_abort( "The workflow requires a {.arg calibration} set to train but none was supplied.", @@ -235,7 +239,7 @@ validate_has_calibration <- function(x, calibration, call = caller_env()) { ) } - if (!.should_inner_split(x) && !is.null(calibration)) { + if (!.workflow_includes_calibration(x) && !is.null(calibration)) { cli::cli_warn( "The workflow does not require a {.arg calibration} set to train but one was supplied.", diff --git a/R/post-action-tailor.R b/R/post-action-tailor.R index ac28c35..f8d9787 100644 --- a/R/post-action-tailor.R +++ b/R/post-action-tailor.R @@ -38,7 +38,7 @@ #' which then form the training data for the postprocessor. #' #' When fitting a workflow with a postprocessor that requires training -#' (i.e. one that returns `TRUE` in `.should_inner_split(workflow)`), users +#' (i.e. one that returns `TRUE` in `.workflow_includes_calibration(workflow)`), users #' must pass two data arguments--the usual `fit.workflow(data)` will be used #' to train the preprocessor and model while `fit.workflow(calibration)` will #' be used to train the postprocessor. diff --git a/man/add_tailor.Rd b/man/add_tailor.Rd index 69643e7..85450f1 100644 --- a/man/add_tailor.Rd +++ b/man/add_tailor.Rd @@ -57,7 +57,7 @@ is passed to that trained postprocessor and model to generate predictions, which then form the training data for the postprocessor. When fitting a workflow with a postprocessor that requires training -(i.e. one that returns \code{TRUE} in \code{.should_inner_split(workflow)}), users +(i.e. one that returns \code{TRUE} in \code{.workflow_includes_calibration(workflow)}), users must pass two data arguments--the usual \code{fit.workflow(data)} will be used to train the preprocessor and model while \code{fit.workflow(calibration)} will be used to train the postprocessor. 
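For reference, a minimal sketch of the interface documented above, adapted from the updated calibration test in `tests/testthat/test-post-action-tailor.R`; the `dat_data`/`dat_calibration` names and the 40/31 row split are illustrative only, not part of the package API:

```r
library(workflows)

# example data with a nonlinear signal, mirroring the updated calibration test
y <- seq(0, 7, .1)
dat <- data.frame(y = y, x = y + (y - 3)^2)

# hold back rows for the postprocessor; this particular split is illustrative only
dat_data        <- dat[1:40, ]
dat_calibration <- dat[41:71, ]

# a tailor with a calibrator requires training, so fit() needs a `calibration` set
post <- tailor::adjust_numeric_calibration(tailor::tailor(), "linear")

wflow <- workflow(y ~ ., parsnip::linear_reg())
wflow <- add_tailor(wflow, post)

# the preprocessor and model train on `data`; the calibrator trains on `calibration`
wflow_fit <- fit(wflow, data = dat_data, calibration = dat_calibration)

predict(wflow_fit, dat_calibration)
```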
diff --git a/man/workflows-internals.Rd b/man/workflows-internals.Rd index e83a4ef..3a23442 100644 --- a/man/workflows-internals.Rd +++ b/man/workflows-internals.Rd @@ -1,7 +1,7 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/fit.R -\name{.should_inner_split} -\alias{.should_inner_split} +\name{.workflow_includes_calibration} +\alias{.workflow_includes_calibration} \alias{workflows-internals} \alias{.fit_pre} \alias{.fit_model} @@ -9,7 +9,7 @@ \alias{.fit_finalize} \title{Internal workflow functions} \usage{ -.should_inner_split(workflow) +.workflow_includes_calibration(workflow) .fit_pre(workflow, data) diff --git a/tests/testthat/test-fit.R b/tests/testthat/test-fit.R index ac43889..127d7dd 100644 --- a/tests/testthat/test-fit.R +++ b/tests/testthat/test-fit.R @@ -197,31 +197,31 @@ test_that("`.fit_pre()` doesn't modify user supplied recipe blueprint", { # ------------------------------------------------------------------------------ # .fit_post() -test_that(".should_inner_split works", { +test_that(".workflow_includes_calibration works", { skip_if_not_installed("tailor") - expect_false(.should_inner_split(workflow())) - expect_false(.should_inner_split(workflow() %>% add_model(parsnip::linear_reg()))) - expect_false(.should_inner_split(workflow() %>% add_formula(mpg ~ .))) - expect_false(.should_inner_split( + expect_false(.workflow_includes_calibration(workflow())) + expect_false(.workflow_includes_calibration(workflow() %>% add_model(parsnip::linear_reg()))) + expect_false(.workflow_includes_calibration(workflow() %>% add_formula(mpg ~ .))) + expect_false(.workflow_includes_calibration( workflow() %>% add_formula(mpg ~ .) %>% add_model(parsnip::linear_reg()) )) - expect_false(.should_inner_split( + expect_false(.workflow_includes_calibration( workflow() %>% add_tailor(tailor::tailor()) )) - expect_false(.should_inner_split( + expect_false(.workflow_includes_calibration( workflow() %>% add_tailor(tailor::tailor() %>% tailor::adjust_probability_threshold(.4)) )) - expect_true(.should_inner_split( + expect_true(.workflow_includes_calibration( workflow() %>% add_tailor(tailor::tailor() %>% tailor::adjust_numeric_calibration()) )) - expect_true(.should_inner_split( + expect_true(.workflow_includes_calibration( workflow() %>% add_tailor( tailor::tailor() %>% @@ -229,7 +229,7 @@ test_that(".should_inner_split works", { tailor::adjust_numeric_range(lower_limit = 1) ) )) - expect_true(.should_inner_split( + expect_true(.workflow_includes_calibration( workflow() %>% add_formula(mpg ~ .) %>% add_model(parsnip::linear_reg()) %>%