From 4db8ca6bca948c4537ecb54f9d199eb42cfe0215 Mon Sep 17 00:00:00 2001 From: topepo Date: Thu, 10 Oct 2024 09:00:27 -0400 Subject: [PATCH] added documentation on model --- DESCRIPTION | 4 +- R/install_packages.R | 4 +- R/linear_reg_quantreg.R | 11 ++ inst/models.tsv | 1 + man/details_linear_reg_quantreg.Rd | 173 +++++++++++++++++++ man/other_predict.Rd | 4 +- man/rmd/linear_reg_lm.Rmd | 2 +- man/rmd/linear_reg_quantreg.Rmd | 79 +++++++++ man/rmd/linear_reg_quantreg.md | 154 +++++++++++++++++ vignettes/articles/Examples.Rmd | 45 +++++ vignettes/articles/template-reg-sacramento.R | 11 ++ 11 files changed, 481 insertions(+), 7 deletions(-) create mode 100644 R/linear_reg_quantreg.R create mode 100644 man/details_linear_reg_quantreg.Rd create mode 100644 man/rmd/linear_reg_quantreg.Rmd create mode 100644 man/rmd/linear_reg_quantreg.md create mode 100644 vignettes/articles/template-reg-sacramento.R diff --git a/DESCRIPTION b/DESCRIPTION index 335c4e43e..d6fd03340 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -74,8 +74,8 @@ Remotes: tidymodels/hardhat ByteCompile: true Config/Needs/website: C50, dbarts, earth, glmnet, keras, kernlab, kknn, - LiblineaR, mgcv, nnet, parsnip, randomForest, ranger, rpart, rstanarm, - tidymodels/tidymodels, tidyverse/tidytemplate, rstudio/reticulate, + LiblineaR, mgcv, nnet, parsnip, quantreg, randomForest, ranger, rpart, + rstanarm, tidymodels/tidymodels, tidyverse/tidytemplate, rstudio/reticulate, xgboost Config/rcmdcheck/ignore-inconsequential-notes: true Config/testthat/edition: 3 diff --git a/R/install_packages.R b/R/install_packages.R index fdd682634..89f409ad6 100644 --- a/R/install_packages.R +++ b/R/install_packages.R @@ -26,8 +26,8 @@ install_engine_packages <- function(extension = TRUE, extras = TRUE, } if (extras) { - rmd_pkgs <- c("tidymodels", "broom.mixed", "glmnet", "Cubist", "xrf", "ape", - "rmarkdown") + rmd_pkgs <- c("ape", "broom.mixed", "Cubist", "glmnet", "quantreg", + "rmarkdown", "tidymodels", "xrf") engine_packages <- 
unique(c(engine_packages, rmd_pkgs)) } diff --git a/R/linear_reg_quantreg.R b/R/linear_reg_quantreg.R new file mode 100644 index 000000000..d8c0824c9 --- /dev/null +++ b/R/linear_reg_quantreg.R @@ -0,0 +1,11 @@ +#' Linear quantile regression via the quantreg package +#' +#' [quantreg::rq()] optimizes quantile loss to fit models with numeric outcomes. +#' +#' @includeRmd man/rmd/linear_reg_quantreg.md details +#' +#' @name details_linear_reg_quantreg +#' @keywords internal +NULL + +# See inst/README-DOCS.md for a description of how these files are processed diff --git a/inst/models.tsv b/inst/models.tsv index e77757b0b..1ef2a8505 100644 --- a/inst/models.tsv +++ b/inst/models.tsv @@ -55,6 +55,7 @@ "linear_reg" "regression" "lm" NA "linear_reg" "regression" "lme" "multilevelmod" "linear_reg" "regression" "lmer" "multilevelmod" +"linear_reg" "quantile regression" "quantreg" NA "linear_reg" "regression" "spark" NA "linear_reg" "regression" "stan" NA "linear_reg" "regression" "stan_glmer" "multilevelmod" diff --git a/man/details_linear_reg_quantreg.Rd b/man/details_linear_reg_quantreg.Rd new file mode 100644 index 000000000..1256bada8 --- /dev/null +++ b/man/details_linear_reg_quantreg.Rd @@ -0,0 +1,173 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/linear_reg_quantreg.R +\name{details_linear_reg_quantreg} +\alias{details_linear_reg_quantreg} +\title{Linear quantile regression via the quantreg package} +\description{ +\code{\link[quantreg:rq]{quantreg::rq()}} optimizes quantile loss to fit models with numeric outcomes. +} +\details{ +For this engine, there is a single mode: quantile regression + +This model has the same structure as the model fit by \code{lm()}, but +instead of optimizing the sum of squared errors, it optimizes “quantile +loss” in order to produce better estimates of the predictive +distribution. +\subsection{Tuning Parameters}{ + +This engine has no tuning parameters. 
+} + +\subsection{Translation from parsnip to the original package}{ + +This model only works with the \code{"quantile regression"} mode and +requires users to specify which areas of the distribution to predict via +the \code{quantile_levels} argument. For example: + +\if{html}{\out{
}}\preformatted{linear_reg() \%>\% + set_engine("quantreg") \%>\% + set_mode("quantile regression", quantile_levels = (1:3) / 4) \%>\% + translate() +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## Linear Regression Model Specification (quantile regression) +## +## Computational engine: quantreg +## +## Model fit template: +## quantreg::rq(formula = missing_arg(), data = missing_arg(), weights = missing_arg(), +## tau = quantile_levels) + +## Quantile levels: 0.25, 0.5, and 0.75. +}\if{html}{\out{
}} +} + +\subsection{Output format}{ + +When multiple quantile levels are predicted, there are multiple +predicted values for each row of new data. The \code{predict()} method for +this mode produces a column named \code{.pred_quantile} that has a special +class of \code{"quantile_pred"}, and it contains the predictions for each +row. + +For example: + +\if{html}{\out{
}}\preformatted{library(modeldata) +rlang::check_installed("quantreg") + +n <- nrow(Chicago) +Chicago <- Chicago \%>\% select(ridership, Clark_Lake) + +Chicago_train <- Chicago[1:(n - 7), ] +Chicago_test <- Chicago[(n - 6):n, ] + +qr_fit <- + linear_reg() \%>\% + set_engine("quantreg") \%>\% + set_mode("quantile regression", quantile_levels = (1:3) / 4) \%>\% + fit(ridership ~ Clark_Lake, data = Chicago_train) +qr_fit +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## parsnip model object +## +## Call: +## quantreg::rq(formula = ridership ~ Clark_Lake, tau = quantile_levels, +## data = data) +## +## Coefficients: +## tau= 0.25 tau= 0.50 tau= 0.75 +## (Intercept) -0.2064189 0.2051549 0.8112286 +## Clark_Lake 0.9820582 0.9862306 0.9777820 +## +## Degrees of freedom: 5691 total; 5689 residual +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{qr_pred <- predict(qr_fit, Chicago_test) +qr_pred +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## # A tibble: 7 x 1 +## .pred_quantile +## +## 1 [21.1] +## 2 [21.4] +## 3 [21.7] +## 4 [21.4] +## 5 [19.5] +## 6 [6.88] +## # i 1 more row +}\if{html}{\out{
}} + +We can unnest these values and/or convert them to a rectangular format: + +\if{html}{\out{
}}\preformatted{as_tibble(qr_pred$.pred_quantile) +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## # A tibble: 21 x 3 +## .pred_quantile .quantile_levels .row +## +## 1 20.6 0.25 1 +## 2 21.1 0.5 1 +## 3 21.5 0.75 1 +## 4 20.9 0.25 2 +## 5 21.4 0.5 2 +## 6 21.8 0.75 2 +## # i 15 more rows +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{as.matrix(qr_pred$.pred_quantile) +}\if{html}{\out{
}} + +\if{html}{\out{
}}\preformatted{## [,1] [,2] [,3] +## [1,] 20.590627 21.090561 21.517717 +## [2,] 20.863639 21.364733 21.789541 +## [3,] 21.190665 21.693148 22.115142 +## [4,] 20.879352 21.380513 21.805185 +## [5,] 19.047814 19.541193 19.981622 +## [6,] 6.435241 6.875033 7.423968 +## [7,] 6.062058 6.500265 7.052411 +}\if{html}{\out{
}} +} + +\subsection{Preprocessing requirements}{ + +Factor/categorical predictors need to be converted to numeric values +(e.g., dummy or indicator variables) for this engine. When using the +formula method via \code{\link[=fit.model_spec]{fit()}}, parsnip will +convert factor columns to indicators. +} + +\subsection{Case weights}{ + +This model can utilize case weights during model fitting. To use them, +see the documentation in \link{case_weights} and the examples +on \code{tidymodels.org}. + +The \code{fit()} and \code{fit_xy()} arguments have arguments called +\code{case_weights} that expect vectors of case weights. +} + +\subsection{Saving fitted model objects}{ + +This model object contains data that are not required to make +predictions. When saving the model for the purpose of prediction, the +size of the saved object might be substantially reduced by using +functions from the \href{https://butcher.tidymodels.org}{butcher} package. +} + +\subsection{Examples}{ + +The “Fitting and Predicting with parsnip” article contains +\href{https://parsnip.tidymodels.org/articles/articles/Examples.html#linear-reg-quantreg}{examples} +for \code{linear_reg()} with the \code{"quantreg"} engine. +} + +\subsection{References}{ +\itemize{ +\item Waldmann, E. (2018). Quantile regression: a short story on how and +why. \emph{Statistical Modelling}, 18(3-4), 203-218. +} +} +} +\keyword{internal} diff --git a/man/other_predict.Rd b/man/other_predict.Rd index f65d71446..81f09e120 100644 --- a/man/other_predict.Rd +++ b/man/other_predict.Rd @@ -103,8 +103,8 @@ interval estimates.} \item{std_error}{A single logical for whether the standard error should be returned (assuming that the model can compute it).} -\item{quantile_levels}{A vector of values between zero and one for the -quantile to be predicted. If the model has a \code{"censored regression"} mode, +\item{quantile_levels}{A vector of values between 0 and 1 for the +quantile to be predicted. 
If the model has a \code{"quantile regression"} mode, this value should be \code{NULL}. For other modes, the default is \code{(1:9)/10}.} } \description{ diff --git a/man/rmd/linear_reg_lm.Rmd b/man/rmd/linear_reg_lm.Rmd index f07068d18..a904b724d 100644 --- a/man/rmd/linear_reg_lm.Rmd +++ b/man/rmd/linear_reg_lm.Rmd @@ -27,7 +27,7 @@ linear_reg() %>% _However_, the documentation in [stats::lm()] assumes that is specific type of case weights are being used: "Non-NULL weights can be used to indicate that different observations have different variances (with the values in weights being inversely proportional to the variances); or equivalently, when the elements of weights are positive integers `w_i`, that each response `y_i` is the mean of `w_i` unit-weight observations (including the case that there are w_i observations equal to `y_i` and the data have been summarized). However, in the latter case, notice that within-group variation is not used. Therefore, the sigma estimate and residual degrees of freedom may be suboptimal; in the case of replication weights, **even wrong**. Hence, standard errors and analysis of variance tables should be treated with care" (emphasis added) -Depending on your application, the degrees of freedown for the model (and other statistics) might be incorrect. +Depending on your application, the degrees of freedom for the model (and other statistics) might be incorrect. ## Saving fitted model objects diff --git a/man/rmd/linear_reg_quantreg.Rmd b/man/rmd/linear_reg_quantreg.Rmd new file mode 100644 index 000000000..a3cc386e7 --- /dev/null +++ b/man/rmd/linear_reg_quantreg.Rmd @@ -0,0 +1,79 @@ +```{r, child = "aaa.Rmd", include = FALSE} +``` + +`r descr_models("linear_reg", "quantreg")` + +This model has the same structure as the model fit by `lm()`, but instead of optimizing the sum of squared errors, it optimizes "quantile loss" in order to produce better estimates of the predictive distribution. 
+ +## Tuning Parameters + +This engine has no tuning parameters. + +## Translation from parsnip to the original package + +This model only works with the `"quantile regression"` mode and requires users to specify which areas of the distribution to predict via the `quantile_levels` argument. For example: + +```{r quantreg-reg} +linear_reg() %>% + set_engine("quantreg") %>% + set_mode("quantile regression", quantile_levels = (1:3) / 4) %>% + translate() +``` + +## Output format + +When multiple quantile levels are predicted, there are multiple predicted values for each row of new data. The `predict()` method for this mode produces a column named `.pred_quantile` that has a special class of `"quantile_pred"`, and it contains the predictions for each row. + +For example: + +```{r example} +library(modeldata) +rlang::check_installed("quantreg") + +n <- nrow(Chicago) +Chicago <- Chicago %>% select(ridership, Clark_Lake) + +Chicago_train <- Chicago[1:(n - 7), ] +Chicago_test <- Chicago[(n - 6):n, ] + +qr_fit <- + linear_reg() %>% + set_engine("quantreg") %>% + set_mode("quantile regression", quantile_levels = (1:3) / 4) %>% + fit(ridership ~ Clark_Lake, data = Chicago_train) +qr_fit + +qr_pred <- predict(qr_fit, Chicago_test) +qr_pred +``` + +We can unnest these values and/or convert them to a rectangular format: + +```{r example-format} +as_tibble(qr_pred$.pred_quantile) + +as.matrix(qr_pred$.pred_quantile) +``` + +## Preprocessing requirements + +```{r child = "template-makes-dummies.Rmd"} +``` + +## Case weights + +```{r child = "template-uses-case-weights.Rmd"} +``` + +## Saving fitted model objects + +```{r child = "template-butcher.Rmd"} +``` + +## Examples + +The "Fitting and Predicting with parsnip" article contains [examples](https://parsnip.tidymodels.org/articles/articles/Examples.html#linear-reg-quantreg) for `linear_reg()` with the `"quantreg"` engine. + +## References + + - Waldmann, E. (2018). Quantile regression: a short story on how and why.
_Statistical Modelling_, 18(3-4), 203-218. diff --git a/man/rmd/linear_reg_quantreg.md b/man/rmd/linear_reg_quantreg.md new file mode 100644 index 000000000..61b7e4e06 --- /dev/null +++ b/man/rmd/linear_reg_quantreg.md @@ -0,0 +1,154 @@ + + + +For this engine, there is a single mode: quantile regression + +This model has the same structure as the model fit by `lm()`, but instead of optimizing the sum of squared errors, it optimizes "quantile loss" in order to produce better estimates of the predictive distribution. + +## Tuning Parameters + +This engine has no tuning parameters. + +## Translation from parsnip to the original package + +This model only works with the `"quantile regression"` mode and requires users to specify which areas of the distribution to predict via the `quantile_levels` argument. For example: + + +``` r +linear_reg() %>% + set_engine("quantreg") %>% + set_mode("quantile regression", quantile_levels = (1:3) / 4) %>% + translate() +``` + +``` +## Linear Regression Model Specification (quantile regression) +## +## Computational engine: quantreg +## +## Model fit template: +## quantreg::rq(formula = missing_arg(), data = missing_arg(), weights = missing_arg(), +## tau = quantile_levels) +``` + +``` +## Quantile levels: 0.25, 0.5, and 0.75. +``` + +## Output format + +When multiple quantile levels are predicted, there are multiple predicted values for each row of new data. The `predict()` method for this mode produces a column named `.pred_quantile` that has a special class of `"quantile_pred"`, and it contains the predictions for each row.
+ +For example: + + +``` r +library(modeldata) +rlang::check_installed("quantreg") + +n <- nrow(Chicago) +Chicago <- Chicago %>% select(ridership, Clark_Lake) + +Chicago_train <- Chicago[1:(n - 7), ] +Chicago_test <- Chicago[(n - 6):n, ] + +qr_fit <- + linear_reg() %>% + set_engine("quantreg") %>% + set_mode("quantile regression", quantile_levels = (1:3) / 4) %>% + fit(ridership ~ Clark_Lake, data = Chicago_train) +qr_fit +``` + +``` +## parsnip model object +## +## Call: +## quantreg::rq(formula = ridership ~ Clark_Lake, tau = quantile_levels, +## data = data) +## +## Coefficients: +## tau= 0.25 tau= 0.50 tau= 0.75 +## (Intercept) -0.2064189 0.2051549 0.8112286 +## Clark_Lake 0.9820582 0.9862306 0.9777820 +## +## Degrees of freedom: 5691 total; 5689 residual +``` + +``` r +qr_pred <- predict(qr_fit, Chicago_test) +qr_pred +``` + +``` +## # A tibble: 7 x 1 +## .pred_quantile +## +## 1 [21.1] +## 2 [21.4] +## 3 [21.7] +## 4 [21.4] +## 5 [19.5] +## 6 [6.88] +## # i 1 more row +``` + +We can unnest these values and/or convert them to a rectangular format: + + +``` r +as_tibble(qr_pred$.pred_quantile) +``` + +``` +## # A tibble: 21 x 3 +## .pred_quantile .quantile_levels .row +## +## 1 20.6 0.25 1 +## 2 21.1 0.5 1 +## 3 21.5 0.75 1 +## 4 20.9 0.25 2 +## 5 21.4 0.5 2 +## 6 21.8 0.75 2 +## # i 15 more rows +``` + +``` r +as.matrix(qr_pred$.pred_quantile) +``` + +``` +## [,1] [,2] [,3] +## [1,] 20.590627 21.090561 21.517717 +## [2,] 20.863639 21.364733 21.789541 +## [3,] 21.190665 21.693148 22.115142 +## [4,] 20.879352 21.380513 21.805185 +## [5,] 19.047814 19.541193 19.981622 +## [6,] 6.435241 6.875033 7.423968 +## [7,] 6.062058 6.500265 7.052411 +``` + +## Preprocessing requirements + + +Factor/categorical predictors need to be converted to numeric values (e.g., dummy or indicator variables) for this engine. When using the formula method via \\code{\\link[=fit.model_spec]{fit()}}, parsnip will convert factor columns to indicators. 
+ +## Case weights + + +This model can utilize case weights during model fitting. To use them, see the documentation in [case_weights] and the examples on `tidymodels.org`. + +The `fit()` and `fit_xy()` arguments have arguments called `case_weights` that expect vectors of case weights. + +## Saving fitted model objects + + +This model object contains data that are not required to make predictions. When saving the model for the purpose of prediction, the size of the saved object might be substantially reduced by using functions from the [butcher](https://butcher.tidymodels.org) package. + +## Examples + +The "Fitting and Predicting with parsnip" article contains [examples](https://parsnip.tidymodels.org/articles/articles/Examples.html#linear-reg-quantreg) for `linear_reg()` with the `"quantreg"` engine. + +## References + + - Waldmann, E. (2018). Quantile regression: a short story on how and why. _Statistical Modelling_, 18(3-4), 203-218. diff --git a/vignettes/articles/Examples.Rmd b/vignettes/articles/Examples.Rmd index 1a95daadc..705014dce 100644 --- a/vignettes/articles/Examples.Rmd +++ b/vignettes/articles/Examples.Rmd @@ -586,7 +586,52 @@ The following examples use consistent data sets throughout. For regression, we u +
+ + With the `"quantreg"` engine + +

Quantile regression Example (`quantreg`)

+ + ```{r echo=FALSE} + knitr::spin_child("template-reg-sacramento.R") + ``` + + We can define the model but should set the model mode. Also, for these models the levels of the distribution that we would like to predict need to be specified with the mode using the `quantile_levels` argument. Let's predict the 0.25, 0.50, and 0.75 quantiles: + + ```{r} + linreg_quant_spec <- + linear_reg() %>% + set_engine("quantreg") %>% + set_mode("quantile regression", quantile_levels = (1:3) / 4) + linreg_quant_spec + ``` + + Now we create the model fit object: + + ```{r} + set.seed(1) + linreg_quant_fit <- linreg_quant_spec %>% fit(price ~ sqft, data = sac_train) + linreg_quant_fit + ``` + + The holdout data can be predicted: + ```{r} + quant_pred <- predict(linreg_quant_fit, sac_test) + quant_pred + ``` + + `.pred_quantile` is a vector type that contains all of the quartile predictions for each row. You can convert this to a rectangular data set using either of: + + ```{r} + as.matrix(quant_pred$.pred_quantile) + + # or + as_tibble(quant_pred$.pred_quantile) + ``` + +
+ ## `logistic_reg()` models
diff --git a/vignettes/articles/template-reg-sacramento.R b/vignettes/articles/template-reg-sacramento.R new file mode 100644 index 000000000..bdb5fc01e --- /dev/null +++ b/vignettes/articles/template-reg-sacramento.R @@ -0,0 +1,11 @@ +#' We'll model the relationship between the cost of a house in Sacramento CA and the square footage of a property. + +#' A few rows were randomly held out for illustrating prediction. + +#+ results = "hide", message = FALSE +library(tidymodels) +tidymodels_prefer() + +sac_holdout <- c(90L, 203L, 264L, 733L, 771L) +sac_train <- Sacramento[-sac_holdout, ] +sac_test <- Sacramento[ sac_holdout, ]