% man/step_discretize_cart.Rd

% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/discretize_cart.R
\name{step_discretize_cart}
\alias{step_discretize_cart}
\alias{tidy.step_discretize_cart}
\title{Discretize numeric variables with CART}
\usage{
step_discretize_cart(
  recipe,
  ...,
  role = NA,
  trained = FALSE,
  outcome = NULL,
  cost_complexity = 0.01,
  tree_depth = 10,
  min_n = 20,
  rules = NULL,
  skip = FALSE,
  id = rand_id("discretize_cart")
)
}
\arguments{
\item{recipe}{A recipe object. The step will be added to the sequence of
operations for this recipe.}

\item{...}{One or more selector functions to choose which variables are
affected by the step. See \link[recipes:selections]{recipes::selections} for more details.}

\item{role}{Not used by this step since no new variables are created.}

\item{trained}{A logical to indicate if the quantities for preprocessing have
been estimated.}

\item{outcome}{A call to \code{vars} or a character string (e.g. \code{"Class"}) naming the variable used as the
outcome to train CART models in order to discretize explanatory variables.}

\item{cost_complexity}{The regularization parameter. Any split that does not
decrease the overall lack of fit by a factor of \code{cost_complexity} is not
attempted. Corresponds to \code{cp} in \code{\link[rpart:rpart]{rpart::rpart()}}. Defaults to 0.01.}

\item{tree_depth}{The \emph{maximum} depth in the final tree. Corresponds to
\code{maxdepth} in \code{\link[rpart:rpart]{rpart::rpart()}}. Defaults to 10.}

\item{min_n}{The number of data points in a node required to continue
splitting. Corresponds to \code{minsplit} in \code{\link[rpart:rpart]{rpart::rpart()}}. Defaults to 20.}

\item{rules}{The splitting rules of the best CART tree to retain for each
variable. If length zero, splitting could not be used on that column.}

\item{skip}{A logical. Should the step be skipped when the recipe is baked by
\code{\link[recipes:bake]{recipes::bake()}}? While all operations are baked when \code{\link[recipes:prep]{recipes::prep()}} is
run, some operations may not be able to be conducted on new data (e.g.
processing the outcome variable(s)). Care should be taken when using \code{skip = TRUE} as it may affect the computations for subsequent operations.}

\item{id}{A character string that is unique to this step to identify it.}
}
\value{
An updated version of \code{recipe} with the new step added to the
sequence of any existing operations.
}
\description{
\code{step_discretize_cart()} creates a \emph{specification} of a recipe step that will
discretize numeric data (e.g. integers or doubles) into bins in a supervised
way using a CART model.
}
\details{
\code{step_discretize_cart()} creates non-uniform bins from numerical variables by
utilizing the information about the outcome variable and applying a CART
model.

The best set of buckets for each variable is chosen using the
standard cost-complexity pruning of CART, which makes this discretization
method resistant to overfitting.

This step requires the \pkg{rpart} package. If not installed, the step will
stop with a note about installing the package.

Note that the original data will be replaced with the new bins.
}
\section{Tidying}{
When you \code{\link[recipes:tidy.recipe]{tidy()}} this step, a tibble is returned with
columns \code{terms}, \code{value}, and \code{id}:

\describe{
\item{terms}{character, the selectors or variables selected}
\item{value}{numeric, location of the splits}
\item{id}{character, id of this step}
}
}

\section{Tuning Parameters}{
This step has 3 tuning parameters:
\itemize{
\item \code{cost_complexity}: Cost-Complexity Parameter (type: double, default: 0.01)
\item \code{tree_depth}: Tree Depth (type: integer, default: 10)
\item \code{min_n}: Minimal Node Size (type: integer, default: 20)
}
}

\section{Case weights}{


This step performs a supervised operation that can utilize case weights.
To use them, see the documentation in \link[recipes:case_weights]{recipes::case_weights} and the examples on
\code{tidymodels.org}.
}

\examples{
\dontshow{if (rlang::is_installed("modeldata")) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf}
# Load the example data set, plus rsample for the train/test split helpers
library(modeldata)
data(ad_data)
library(rsample)

# Split the data into training and test sets, stratified on the Class column
split <- initial_split(ad_data, strata = "Class")

ad_data_tr <- training(split)
ad_data_te <- testing(split)

# Specify the recipe: bin four predictors with CART, using Class as the
# outcome; the explicit id lets tidy() look this step up below
cart_rec <-
  recipe(Class ~ ., data = ad_data_tr) \%>\%
  step_discretize_cart(
    tau, age, p_tau, Ab_42,
    outcome = "Class", id = "cart splits"
  )

# Estimate the split points from the training set
cart_rec <- prep(cart_rec, training = ad_data_tr)

# The splits:
tidy(cart_rec, id = "cart splits")

# Apply the trained step to the test set, returning only the tau column
bake(cart_rec, ad_data_te, tau)
\dontshow{\}) # examplesIf}
}
\seealso{
\code{\link[=step_discretize_xgb]{step_discretize_xgb()}}, \code{\link[recipes:recipe]{recipes::recipe()}},
\code{\link[recipes:prep]{recipes::prep()}}, \code{\link[recipes:bake]{recipes::bake()}}
}