From e22bf710bc53bf55b2807e9d197f2f0ea58a8e62 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Wed, 18 Dec 2024 21:40:58 -0500 Subject: [PATCH] Initial separation of `tabyl` --- DESCRIPTION | 7 +- NAMESPACE | 23 - R/adorn_ns.R | 164 ------ R/adorn_pct_formatting.R | 104 ---- R/adorn_percentages.R | 119 ----- R/adorn_rounding.R | 92 ---- R/adorn_title.R | 113 ---- R/adorn_totals.R | 174 ------ R/as_and_untabyl.R | 117 ---- R/get_level_groups.R | 28 - R/janitor_deprecated.R | 86 --- R/print_tabyl.R | 4 - R/statistical_tests.R | 205 ------- R/tabyl.R | 346 ------------ R/top_levels.R | 65 --- README.Rmd | 50 -- man/add_totals_col.Rd | 20 - man/add_totals_row.Rd | 22 - man/adorn_crosstab.Rd | 35 -- man/adorn_ns.Rd | 82 --- man/adorn_pct_formatting.Rd | 71 --- man/adorn_percentages.Rd | 57 -- man/adorn_rounding.Rd | 67 --- man/adorn_title.Rd | 58 -- man/adorn_totals.Rd | 49 -- man/as_tabyl.Rd | 55 -- man/chisq.test.Rd | 40 -- man/crosstab.Rd | 15 - man/fisher.test.Rd | 34 -- man/janitor_deprecated.Rd | 4 - man/remove_empty.Rd | 4 +- man/tabyl.Rd | 70 --- man/top_levels.Rd | 28 - man/untabyl.Rd | 24 - tests/testthat/test-adorn-ns.R | 244 --------- tests/testthat/test-adorn-pct-formatting.R | 196 ------- tests/testthat/test-adorn-percentages.R | 242 --------- tests/testthat/test-adorn-rounding.R | 120 ----- tests/testthat/test-adorn-title.R | 165 ------ tests/testthat/test-adorn-totals.R | 595 --------------------- tests/testthat/test-get-level-groups.R | 24 - tests/testthat/test-statistical-tests.R | 126 ----- tests/testthat/test-tabyl-classifiers.R | 119 ----- tests/testthat/test-tabyl.R | 507 ------------------ tests/testthat/test-top-levels.R | 63 --- vignettes/tabyls.Rmd | 271 ---------- vignettes/tabyls.md | 454 ---------------- 47 files changed, 3 insertions(+), 5555 deletions(-) delete mode 100644 R/adorn_ns.R delete mode 100644 R/adorn_pct_formatting.R delete mode 100644 R/adorn_percentages.R delete mode 100644 R/adorn_rounding.R delete mode 100644 R/adorn_title.R delete mode 100644 R/adorn_totals.R delete mode 100644 R/as_and_untabyl.R delete mode 100644 R/get_level_groups.R delete mode 100644 R/print_tabyl.R delete mode 100644 R/statistical_tests.R delete mode 100644 R/tabyl.R delete mode 100644 R/top_levels.R delete mode 100644 man/add_totals_col.Rd delete mode 100644 man/add_totals_row.Rd delete mode 100644 man/adorn_crosstab.Rd delete mode 100644 man/adorn_ns.Rd delete mode 100644 man/adorn_pct_formatting.Rd delete mode 100644 man/adorn_percentages.Rd delete mode 100644 man/adorn_rounding.Rd delete mode 100644 man/adorn_title.Rd delete mode 100644 man/adorn_totals.Rd delete mode 100644 man/as_tabyl.Rd delete mode 100644 man/chisq.test.Rd delete mode 100644 man/crosstab.Rd delete mode 100644 man/fisher.test.Rd delete mode 100644 man/tabyl.Rd delete mode 100644 man/top_levels.Rd delete mode 100644 man/untabyl.Rd delete mode 100644 tests/testthat/test-adorn-ns.R delete mode 100644 tests/testthat/test-adorn-pct-formatting.R delete mode 100644 tests/testthat/test-adorn-percentages.R delete mode 100644 tests/testthat/test-adorn-rounding.R delete mode 100644 tests/testthat/test-adorn-title.R delete mode 100644 tests/testthat/test-adorn-totals.R delete mode 100644 tests/testthat/test-get-level-groups.R delete mode 100644 tests/testthat/test-statistical-tests.R delete mode 100644 tests/testthat/test-tabyl-classifiers.R delete mode 100644 tests/testthat/test-tabyl.R delete mode 100644 tests/testthat/test-top-levels.R delete mode 100644 vignettes/tabyls.Rmd delete mode 100644 vignettes/tabyls.md diff --git a/DESCRIPTION b/DESCRIPTION index 715ff2f7..b581b4ce 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,11 +12,7 @@ Authors@R: c( person("Josep", family = "Pueyo-Ros", email = "josep.pueyo@udg.edu", role = "ctb") ) Description: The main janitor functions can: perfectly format data.frame - column names; provide quick counts of variable combinations (i.e., - frequency tables and crosstabs); and explore duplicate records. Other - janitor functions nicely format the tabulation results. These - tabulate-and-report functions approximate popular features of SPSS and - Microsoft Excel. This package follows the principles of the + column names and explore duplicate records. This package follows the principles of the "tidyverse" and works well with the pipe function %>%. janitor was built with beginning-to-intermediate R users in mind and is optimized for user-friendliness. @@ -50,7 +46,6 @@ Suggests: VignetteBuilder: knitr Config/testthat/edition: 3 -Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index 1461f757..c373ceab 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,5 @@ # Generated by roxygen2: do not edit by hand -S3method(chisq.test,default) -S3method(chisq.test,tabyl) S3method(clean_names,default) S3method(clean_names,sf) S3method(clean_names,tbl_graph) @@ -13,35 +11,17 @@ S3method(excel_time_to_numeric,POSIXlt) S3method(excel_time_to_numeric,character) S3method(excel_time_to_numeric,logical) S3method(excel_time_to_numeric,numeric) -S3method(fisher.test,default) -S3method(fisher.test,tabyl) -S3method(print,tabyl) -S3method(tabyl,data.frame) -S3method(tabyl,default) export("%>%") -export(add_totals_col) -export(add_totals_row) -export(adorn_crosstab) -export(adorn_ns) -export(adorn_pct_formatting) -export(adorn_percentages) -export(adorn_rounding) -export(adorn_title) -export(adorn_totals) -export(as_tabyl) -export(chisq.test) export(clean_names) export(compare_df_cols) export(compare_df_cols_same) export(convert_to_NA) export(convert_to_date) export(convert_to_datetime) -export(crosstab) export(describe_class) export(excel_numeric_to_date) export(excel_time_to_numeric) export(find_header) -export(fisher.test) export(get_dupes) export(get_one_to_one) export(make_clean_names) @@ -56,9 +36,6 @@ export(row_to_names) export(sas_numeric_to_date) export(signif_half_up) export(single_value) -export(tabyl) -export(top_levels) -export(untabyl) export(use_first_valid_of) importFrom(lubridate,as_date) importFrom(lubridate,as_datetime) diff --git a/R/adorn_ns.R b/R/adorn_ns.R deleted file mode 100644 index 36ed4285..00000000 --- a/R/adorn_ns.R +++ /dev/null @@ -1,164 +0,0 @@ -#' Add underlying Ns to a tabyl displaying percentages. -#' -#' This function adds back the underlying Ns to a `tabyl` whose percentages were -#' calculated using [adorn_percentages()], to display the Ns and percentages together. -#' You can also call it on a non-tabyl data.frame to which you wish to append Ns. -#' -#' @param dat A data.frame of class `tabyl` that has had `adorn_percentages` and/or -#' `adorn_pct_formatting` called on it. If given a list of data.frames, -#' this function will apply itself to each data.frame in the list (designed for 3-way `tabyl` lists). -#' @param position Should the N go in the front, or in the rear, of the percentage? -#' @param ns The Ns to append. The default is the "core" attribute of the input tabyl -#' `dat`, where the original Ns of a two-way `tabyl` are stored. However, if your Ns -#' are stored somewhere else, or you need to customize them beyond what can be done -#' with `format_func`, you can supply them here. -#' @param format_func A formatting function to run on the Ns. Consider defining -#' with [base::format()]. -#' @param ... Columns to adorn. This takes a tidyselect specification. By default, -#' all columns are adorned except for the first column and columns not of class -#' `numeric`, but this allows you to manually specify which columns should be adorned, -#' for use on a data.frame that does not result from a call to `tabyl`. -#' -#' @return A `data.frame` with Ns appended -#' @export -#' @examples -#' mtcars %>% -#' tabyl(am, cyl) %>% -#' adorn_percentages("col") %>% -#' adorn_pct_formatting() %>% -#' adorn_ns(position = "front") -#' -#' # Format the Ns with a custom format_func: -#' set.seed(1) -#' bigger_dat <- data.frame( -#' sex = rep(c("m", "f"), 3000), -#' age = round(runif(3000, 1, 102), 0) -#' ) -#' bigger_dat$age_group <- cut(bigger_dat$age, quantile(bigger_dat$age, c(0, 1 / 3, 2 / 3, 1))) -#' -#' bigger_dat %>% -#' tabyl(age_group, sex, show_missing_levels = FALSE) %>% -#' adorn_totals(c("row", "col")) %>% -#' adorn_percentages("col") %>% -#' adorn_pct_formatting(digits = 1) %>% -#' adorn_ns(format_func = function(x) format(x, big.mark = ".", decimal.mark = ",")) - -#' # Control the columns to be adorned with the ... variable selection argument -#' # If using only the ... argument, you can use empty commas as shorthand -#' # to supply the default values to the preceding arguments: -#' -#' cases <- data.frame( -#' region = c("East", "West"), -#' year = 2015, -#' recovered = c(125, 87), -#' died = c(13, 12) -#' ) -#' -#' cases %>% -#' adorn_percentages("col",,recovered:died) %>% -#' adorn_pct_formatting(,,,,,recovered:died) %>% -#' adorn_ns(,,,recovered:died) -#' -adorn_ns <- function(dat, position = "rear", ns = attr(dat, "core"), format_func = function(x) { - format(x, big.mark = ",") - }, ...) { - # if input is a list, call purrr::map to recursively apply this function to each data.frame - if (is.list(dat) && !is.data.frame(dat)) { - purrr::map(dat, adorn_ns, position) # okay not to pass ns and allow for static Ns, b/c one size fits all for each list entry doesn't make sense for Ns. - } else { - ns_provided <- !missing(ns) - - # catch bad inputs - if (!is.data.frame(dat)) { - stop("adorn_ns() must be called on a data.frame or list of data.frames") - } - - rlang::arg_match0(position, c("front", "rear")) - - if (is.null(ns)) { - stop("argument \"ns\" cannot be null; if not calling adorn_ns() on a data.frame of class \"tabyl\", pass your own value for ns") - } - # If ns argument is not the default "core" attribute, validate that it's a data.frame and has correct right dimensions - if (!is.data.frame(ns)) { - stop("if supplying a value to the ns argument, it must be of class data.frame") - } - if ("one_way" %in% attr(dat, "tabyl_type")) { - warning("adorn_ns() is meant to be called on a two_way tabyl; consider combining columns of a one_way tabyl with tidyr::unite()") - } - - attrs <- attributes(dat) # save these to re-append later - custom_ns_supplied <- !(identical(ns, attr(dat, "core"))) - - if (custom_ns_supplied & !identical(dim(ns), dim(dat))) { # user-supplied Ns must include values for totals row/col if present - stop("if supplying your own data.frame of Ns to append, its dimensions must match those of the data.frame in the \"dat\" argument") - } - - # If appending the default Ns from the core, and there are totals rows/cols, append those values to the Ns table - # Custom inputs to ns argument will need to calculate & format their own totals row/cols - if (!custom_ns_supplied) { - if (!is.null(attr(dat, "totals"))) { # add totals row/col to core for pasting, if applicable - ns <- adorn_totals(ns, attr(dat, "totals")) - ns <- ns[order(match(ns[, 1], dat[, 1])), ] # from #407 - in rare event Totals row has been sorted off the bottom, sort to match - } - numeric_cols <- which(vapply(ns, is.numeric, logical(1))) - ns[] <- lapply(ns, format_func) - ns[] <- lapply(ns, stringr::str_trim) - } - - if (position == "rear") { - result <- paste_matrices(dat, ns %>% - dplyr::mutate( - dplyr::across(dplyr::everything(), purrr::compose(as.character, wrap_parens, standardize_col_width, .dir = "forward")) - )) - } else if (position == "front") { - result <- paste_matrices(ns, dat %>% - dplyr::mutate( - dplyr::across(dplyr::everything(), purrr::compose(as.character, wrap_parens, standardize_col_width, .dir = "forward")) - )) - } - attributes(result) <- attrs - - if (custom_ns_supplied & rlang::dots_n(...) == 0) { - dont_adorn <- 1L - } else if (rlang::dots_n(...) == 0) { - cols_to_adorn <- numeric_cols - dont_adorn <- setdiff(1:ncol(dat), cols_to_adorn) - dont_adorn <- unique(c(1, dont_adorn)) # always don't-append first column - } else { - expr <- rlang::expr(c(...)) - cols_to_adorn <- tidyselect::eval_select(expr, data = dat) - dont_adorn <- setdiff(1:ncol(dat), cols_to_adorn) - } - - for (i in dont_adorn) { - result[[i]] <- dat[[i]] - } - result - } -} - -### Helper functions called by adorn_ns - -# takes two matrices, pastes them together, keeps spacing of the two columns aligned -paste_matrices <- function(front, rear) { - front_matrix <- as.matrix(front) - rear_matrix <- as.matrix(rear) - - # paste the results together - pasted <- paste(front_matrix, " ", rear_matrix, sep = "") %>% # paste the matrices - matrix(., nrow = nrow(front_matrix), dimnames = dimnames(rear_matrix)) %>% # cast as matrix, then data.frame - dplyr::as_tibble() - pasted -} - - -# Padding function to standardize a column's width by pre-pending whitespace -standardize_col_width <- function(x) { - width <- max(nchar(x)) - sprintf(paste0("%", width, "s"), x) -} - -# Wrap a string in parentheses -wrap_parens <- function(x) { - paste0("(", x, ")") -} diff --git a/R/adorn_pct_formatting.R b/R/adorn_pct_formatting.R deleted file mode 100644 index d2d6652d..00000000 --- a/R/adorn_pct_formatting.R +++ /dev/null @@ -1,104 +0,0 @@ -#' Format a `data.frame` of decimals as percentages. -#' -#' @description -#' Numeric columns get multiplied by 100 and formatted as -#' percentages according to user specifications. This function defaults to -#' excluding the first column of the input data.frame, assuming that it contains -#' a descriptive variable, but this can be overridden by specifying the columns -#' to adorn in the `...` argument. Non-numeric columns are always excluded. -#' -#' The decimal separator character is the result of `getOption("OutDec")`, which -#' is based on the user's locale. If the default behavior is undesirable, -#' change this value ahead of calling the function, either by changing locale or -#' with `options(OutDec = ",")`. This aligns the decimal separator character -#' with that used in `base::print()`. -#' -#' @param dat a data.frame with decimal values, typically the result of a call -#' to `adorn_percentages` on a `tabyl`. If given a list of data.frames, this -#' function will apply itself to each data.frame in the list (designed for -#' 3-way `tabyl` lists). -#' @param digits how many digits should be displayed after the decimal point? -#' @param rounding method to use for rounding - either "half to even", the base -#' R default method, or "half up", where 14.5 rounds up to 15. -#' @param affix_sign should the % sign be affixed to the end? -#' @param ... columns to adorn. This takes a tidyselect specification. By -#' default, all numeric columns (besides the initial column, if numeric) are -#' adorned, but this allows you to manually specify which columns should be -#' adorned, for use on a data.frame that does not result from a call to -#' `tabyl`. -#' @return a data.frame with formatted percentages -#' @export -#' @examples -#' mtcars %>% -#' tabyl(am, cyl) %>% -#' adorn_percentages("col") %>% -#' adorn_pct_formatting() -#' -#' # Control the columns to be adorned with the ... variable selection argument -#' # If using only the ... argument, you can use empty commas as shorthand -#' # to supply the default values to the preceding arguments: -#' -#' cases <- data.frame( -#' region = c("East", "West"), -#' year = 2015, -#' recovered = c(125, 87), -#' died = c(13, 12) -#' ) -#' -#' cases %>% -#' adorn_percentages("col", , recovered:died) %>% -#' adorn_pct_formatting(, , , recovered:died) -#' -adorn_pct_formatting <- function(dat, digits = 1, rounding = "half to even", affix_sign = TRUE, ...) { - # if input is a list, call purrr::map to recursively apply this function to each data.frame - if (is.list(dat) && !is.data.frame(dat)) { - purrr::map(dat, adorn_pct_formatting, digits, rounding, affix_sign) - } else { - # catch bad inputs - if (!is.data.frame(dat)) { - stop("adorn_pct_formatting() must be called on a data.frame or list of data.frames") - } - rlang::arg_match0(rounding, c("half to even", "half up")) - - original <- dat # used below to record original instances of NA and NaN - - numeric_cols <- which(vapply(dat, is.numeric, logical(1))) - non_numeric_cols <- setdiff(1:ncol(dat), numeric_cols) - numeric_cols <- setdiff(numeric_cols, 1) # assume 1st column should not be included so remove it from numeric_cols. Moved up to this line so that if only 1st col is numeric, the function errors - - if (rlang::dots_n(...) == 0) { - cols_to_adorn <- numeric_cols - } else { - expr <- rlang::expr(c(...)) - cols_to_adorn <- tidyselect::eval_select(expr, data = dat) - if (any(cols_to_adorn %in% non_numeric_cols)) { - # don't need to print a message, adorn_rounding will - cols_to_adorn <- setdiff(cols_to_adorn, non_numeric_cols) - } - } - - - if ("one_way" %in% attr(dat, "tabyl_type")) { - cols_to_adorn <- setdiff(numeric_cols, 2) # so that it works on a one-way tabyl - } - - if (length(cols_to_adorn) == 0) { - stop("at least one targeted column must be of class numeric") - } - - dat[cols_to_adorn] <- lapply(dat[cols_to_adorn], function(x) x * 100) - dat <- adorn_rounding(dat, digits = digits, rounding = rounding, ...) - dat[cols_to_adorn] <- lapply(dat[cols_to_adorn], function(x) { - format(x, - nsmall = digits, - decimal.mark = getOption("OutDec"), - trim = TRUE - ) - }) # so that 0% prints as 0.0% or 0.00% etc. - if (affix_sign) { - dat[cols_to_adorn] <- lapply(dat[cols_to_adorn], function(x) paste0(x, "%")) - } - dat[cols_to_adorn][is.na(original[cols_to_adorn])] <- "-" # NA and NaN values in the original should be simply "-" for printing of results - dat - } -} diff --git a/R/adorn_percentages.R b/R/adorn_percentages.R deleted file mode 100644 index 6536325f..00000000 --- a/R/adorn_percentages.R +++ /dev/null @@ -1,119 +0,0 @@ -#' Convert a data.frame of counts to percentages. -#' -#' This function defaults to excluding the first column of the input data.frame, -#' assuming that it contains a descriptive variable, but this can be overridden -#' by specifying the columns to adorn in the `...` argument. -#' -#' @param dat A `tabyl` or other data.frame with a tabyl-like layout. -#' If given a list of data.frames, this function will apply itself to each -#' `data.frame` in the list (designed for 3-way `tabyl` lists). -#' @param denominator The direction to use for calculating percentages. -#' One of "row", "col", or "all". -#' @param na.rm should missing values (including `NaN`) be omitted from the calculations? -#' @param ... columns to adorn. This takes a <[`tidy-select`][dplyr::dplyr_tidy_select]> -#' specification. By default, all numeric columns (besides the initial column, if numeric) -#' are adorned, but this allows you to manually specify which columns should -#' be adorned, for use on a `data.frame` that does not result from a call to [tabyl()]. -#' -#' @return A `data.frame` of percentages, expressed as numeric values between 0 and 1. -#' @export -#' @examples -#' -#' mtcars %>% -#' tabyl(am, cyl) %>% -#' adorn_percentages("col") -#' -#' # calculates correctly even with totals column and/or row: -#' mtcars %>% -#' tabyl(am, cyl) %>% -#' adorn_totals("row") %>% -#' adorn_percentages() -#' -#' # Control the columns to be adorned with the ... variable selection argument -#' # If using only the ... argument, you can use empty commas as shorthand -#' # to supply the default values to the preceding arguments: -#' -#' cases <- data.frame( -#' region = c("East", "West"), -#' year = 2015, -#' recovered = c(125, 87), -#' died = c(13, 12) -#' ) -#' -#' cases %>% -#' adorn_percentages(, , recovered:died) -adorn_percentages <- function(dat, denominator = "row", na.rm = TRUE, ...) { - # if input is a list, call purrr::map to recursively apply this function to each data.frame - if (is.list(dat) && !is.data.frame(dat)) { - purrr::map(dat, adorn_percentages, denominator, na.rm, ...) - } else { - # catch bad inputs - if (!is.data.frame(dat)) { - stop("adorn_percentages() must be called on a data.frame or list of data.frames") - } - rlang::arg_match0(denominator, c("row", "col", "all")) - - dat <- as_tabyl(dat) - - numeric_cols <- which(vapply(dat, is.numeric, logical(1))) - non_numeric_cols <- setdiff(1:ncol(dat), numeric_cols) - numeric_cols <- setdiff(numeric_cols, 1) # assume 1st column should not be included so remove it from numeric_cols. Moved up to this line so that if only 1st col is numeric, the function errors - explicitly_exempt_totals <- FALSE - - if (rlang::dots_n(...) == 0) { - cols_to_tally <- numeric_cols - } else { - expr <- rlang::expr(c(...)) - cols_to_tally <- tidyselect::eval_select(expr, data = dat) - explicitly_exempt_totals <- !(ncol(dat) %in% cols_to_tally) # if not present, it's b/c user explicitly exempted it - if (any(cols_to_tally %in% non_numeric_cols)) { - message("At least one non-numeric column was specified. All non-numeric columns will be removed from percentage calculations.") - cols_to_tally <- setdiff(cols_to_tally, non_numeric_cols) - } - } - - if ("col" %in% attr(dat, "totals")) { - # if there's a totals col, don't use it to calculate the %s - cols_to_tally <- setdiff(cols_to_tally, ncol(dat)) - } - - if (denominator == "row") { - # if row-wise percentages and a totals column, need to exempt totals col and make it all 1s - if ("col" %in% attr(dat, "totals") & !explicitly_exempt_totals) { - dat[[ncol(dat)]] <- rep(1, nrow(dat)) - } - row_sum <- rowSums(dat[cols_to_tally], na.rm = na.rm) - dat[, cols_to_tally] <- dat[cols_to_tally] / row_sum - } else if (denominator == "col") { - # if col-wise percentages and a row column, need to exempt totals row and make it all 1s - if ("row" %in% attr(dat, "totals")) { - col_sum <- colSums(dat[-nrow(dat), ][cols_to_tally], na.rm = na.rm) - } else { - col_sum <- colSums(dat[cols_to_tally], na.rm = na.rm) - } - # add totals col back to be tallied, #357 - if ("col" %in% attr(dat, "totals") & !explicitly_exempt_totals) { - cols_to_tally <- c(cols_to_tally, ncol(dat)) - if ("row" %in% attr(dat, "totals")) { - col_sum <- c(col_sum, sum(dat[-nrow(dat), ncol(dat)])) - } else { - col_sum <- c(col_sum, sum(dat[, ncol(dat)])) - } - } - dat[cols_to_tally] <- sweep(dat[cols_to_tally], 2, col_sum, `/`) # from http://stackoverflow.com/questions/9447801/dividing-columns-by-colsums-in-r - } else if (denominator == "all") { - # if all-wise percentages, need to exempt any totals col or row - if ("row" %in% attr(dat, "totals")) { - complete_n <- sum(dat[-nrow(dat), cols_to_tally], na.rm = TRUE) - } else { - complete_n <- sum(dat[, cols_to_tally], na.rm = TRUE) - } - # add totals col back to be tallied, #357 - if ("col" %in% attr(dat, "totals") & !explicitly_exempt_totals) { - cols_to_tally <- c(cols_to_tally, ncol(dat)) - } - dat[cols_to_tally] <- dat[cols_to_tally] / complete_n - } - dat - } -} diff --git a/R/adorn_rounding.R b/R/adorn_rounding.R deleted file mode 100644 index 60059ed5..00000000 --- a/R/adorn_rounding.R +++ /dev/null @@ -1,92 +0,0 @@ -#' Round the numeric columns in a data.frame. -#' -#' @description -#' Can run on any `data.frame` with at least one numeric column. -#' This function defaults to excluding the first column of the input data.frame, -#' assuming that it contains a descriptive variable, but this can be overridden by -#' specifying the columns to round in the `...` argument. -#' -#' If you're formatting percentages, e.g., the result of [adorn_percentages()], -#' use [adorn_pct_formatting()] instead. This is a more flexible variant for ad-hoc usage. -#' Compared to `adorn_pct_formatting()`, it does not multiply by 100 or pad the -#' numbers with spaces for alignment in the results `data.frame`. -#' This function retains the class of numeric input columns. -#' -#' @param dat A `tabyl` or other `data.frame` with similar layout. -#' If given a list of data.frames, this function will apply itself to each -#' `data.frame` in the list (designed for 3-way `tabyl` lists). -#' @param digits How many digits should be displayed after the decimal point? -#' @param rounding Method to use for rounding - either "half to even" -#' (the base R default method), or "half up", where 14.5 rounds up to 15. -#' @param ... Columns to adorn. This takes a tidyselect specification. -#' By default, all numeric columns (besides the initial column, if numeric) -#' are adorned, but this allows you to manually specify which columns should -#' be adorned, for use on a data.frame that does not result from a call to `tabyl`. -#' -#' @return The `data.frame` with rounded numeric columns. -#' @export -#' @examples -#' -#' mtcars %>% -#' tabyl(am, cyl) %>% -#' adorn_percentages() %>% -#' adorn_rounding(digits = 2, rounding = "half up") -#' -#' # tolerates non-numeric columns: -#' library(dplyr) -#' mtcars %>% -#' tabyl(am, cyl) %>% -#' adorn_percentages("all") %>% -#' mutate(dummy = "a") %>% -#' adorn_rounding() -#' -#' # Control the columns to be adorned with the ... variable selection argument -#' # If using only the ... argument, you can use empty commas as shorthand -#' # to supply the default values to the preceding arguments: -#' cases <- data.frame( -#' region = c("East", "West"), -#' year = 2015, -#' recovered = c(125, 87), -#' died = c(13, 12) -#' ) -#' -#' cases %>% -#' adorn_percentages(, , ends_with("ed")) %>% -#' adorn_rounding(, , all_of(c("recovered", "died"))) -adorn_rounding <- function(dat, digits = 1, rounding = "half to even", ...) { - # if input is a list, call purrr::map to recursively apply this function to each data.frame - if (is.list(dat) && !is.data.frame(dat)) { - purrr::map(dat, adorn_rounding, digits, rounding, ...) - } else { - # catch bad inputs - if (!is.data.frame(dat)) { - stop("adorn_rounding() must be called on a data.frame or list of data.frames") - } - if (!rounding %in% c("half to even", "half up")) { - stop("'rounding' must be one of 'half to even' or 'half up'") - } - numeric_cols <- which(vapply(dat, is.numeric, logical(1))) - non_numeric_cols <- setdiff(1:ncol(dat), numeric_cols) - # assume 1st column should not be included so remove it from numeric_cols. - # Moved up to this line so that if only 1st col is numeric, the function errors - numeric_cols <- setdiff(numeric_cols, 1) - - if (rlang::dots_n(...) == 0) { - cols_to_round <- numeric_cols - } else { - expr <- rlang::expr(c(...)) - cols_to_round <- tidyselect::eval_select(expr, data = dat) - if (any(cols_to_round %in% non_numeric_cols)) { - message("At least one non-numeric column was specified and will not be modified.") - cols_to_round <- setdiff(cols_to_round, non_numeric_cols) - } - } - - if (rounding == "half to even") { - dat[cols_to_round] <- lapply(dat[cols_to_round], function(x) round(x, digits = digits)) - } else { - dat[cols_to_round] <- lapply(dat[cols_to_round], function(x) round_half_up(x, digits = digits)) - } - dat - } -} diff --git a/R/adorn_title.R b/R/adorn_title.R deleted file mode 100644 index fa5a5e8a..00000000 --- a/R/adorn_title.R +++ /dev/null @@ -1,113 +0,0 @@ -#' Add column name to the top of a two-way tabyl. -#' -#' This function adds the column variable name to the top of a `tabyl` for a -#' complete display of information. This makes the tabyl prettier, but renders -#' the `data.frame` less useful for further manipulation. -#' -#' The `placement` argument indicates whether the column name should be added to -#' the `top` of the tabyl in an otherwise-empty row `"top"` or appended to the -#' already-present row name variable (`"combined"`). The formatting in the `"top"` -#' option has the look of base R's `table()`; it also wipes out the other column -#' names, making it hard to further use the `data.frame` besides formatting it for reporting. -#' The `"combined"` option is more conservative in this regard. -#' -#' @param dat A `data.frame` of class `tabyl` or other `data.frame` with a tabyl-like layout. -#' If given a list of data.frames, this function will apply itself to each `data.frame` -#' in the list (designed for 3-way `tabyl` lists). -#' @param placement The title placement, one of `"top"`, or `"combined"`. -#' See **Details** for more information. -#' @param row_name (optional) default behavior is to pull the row name from the -#' attributes of the input `tabyl` object. If you wish to override that text, -#' or if your input is not a `tabyl`, supply a string here. -#' @param col_name (optional) default behavior is to pull the column_name from -#' the attributes of the input `tabyl` object. If you wish to override that text, -#' or if your input is not a `tabyl`, supply a string here. -#' @return The input `tabyl`, augmented with the column title. Non-tabyl inputs -#' that are of class `tbl_df` are downgraded to basic data.frames so that the -#' title row prints correctly. -#' -#' @export -#' @examples -#' -#' mtcars %>% -#' tabyl(am, cyl) %>% -#' adorn_title(placement = "top") -#' -#' # Adding a title to a non-tabyl -#' library(tidyr) -#' library(dplyr) -#' mtcars %>% -#' group_by(gear, am) %>% -#' summarise(avg_mpg = mean(mpg), .groups = "drop") %>% -#' pivot_wider(names_from = am, values_from = avg_mpg) %>% -#' adorn_rounding() %>% -#' adorn_title("top", row_name = "Gears", col_name = "Cylinders") -adorn_title <- function(dat, placement = "top", row_name, col_name) { - # if input is a list, call purrr::map to recursively apply this function to each data.frame - if (is.list(dat) && !is.data.frame(dat)) { - purrr::map(dat, adorn_title, placement, row_name, col_name) - } else { - if (!is.data.frame(dat)) { - stop("\"dat\" must be a data.frame") - } - - rlang::arg_match0(placement, c("top", "combined")) - - if (inherits(dat, "tabyl")) { - if (attr(dat, "tabyl_type") == "one_way") { - warning( - "adorn_title is meant for two-way tabyls, calling it on a one-way tabyl may not yield a meaningful result" - ) - } - } - if (missing(col_name)) { - if (!inherits(dat, "tabyl")) { - stop("When input is not a data.frame of class tabyl, a value must be specified for the col_name argument.") - } - col_var <- attr(dat, "var_names")$col - } else { - if (!is.character(col_name)) { - stop("col_name must be a string") - } - col_var <- col_name - } - - if (!missing(row_name)) { - if (!is.character(row_name)) { - stop("row_name must be a string") - } - names(dat)[1] <- row_name - row_var <- row_name - } else { - if (inherits(dat, "tabyl")) { - row_var <- attr(dat, "var_names")$row - } else { - # for non-tabyl input, if no row_name supplied, use first existing name - row_var <- names(dat)[1] - } - } - - - if (placement == "top") { - # to handle factors, problematic in first column and at bind_rows. - dat[, ] <- lapply(dat[, ], as.character) - # Can't use mutate_all b/c it strips attributes - top <- dat[1, ] - - top[1, ] <- as.list(names(top)) - - out <- dplyr::bind_rows(top, dat) - out <- stats::setNames(out, c("", col_var, rep("", ncol(out) - 2))) - } - if (placement == "combined") { - out <- dat - names(out)[1] <- paste(row_var, col_var, sep = "/") - } - # "top" text doesn't print if input (and thus the output) is a tibble - if (inherits(out, "tbl_df")) { - # but this prints row numbers, so don't apply to non-tbl_dfs like tabyls - out <- as.data.frame(out) - } - out - } -} diff --git a/R/adorn_totals.R b/R/adorn_totals.R deleted file mode 100644 index b4db52f3..00000000 --- a/R/adorn_totals.R +++ /dev/null @@ -1,174 +0,0 @@ -#' Append a totals row and/or column to a data.frame -#' -#' This function defaults to excluding the first column of the input data.frame, -#' assuming that it contains a descriptive variable, but this can be overridden -#' by specifying the columns to be totaled in the `...` argument. Non-numeric -#' columns are converted to character class and have a user-specified fill character -#' inserted in the totals row. -#' -#' @param dat An input `data.frame` with at least one numeric column. If given a -#' list of data.frames, this function will apply itself to each `data.frame` -#' in the list (designed for 3-way `tabyl` lists). -#' @param where One of "row", "col", or `c("row", "col")` -#' @param fill If there are non-numeric columns, what should fill the bottom row -#' of those columns? If a string, relevant columns will be coerced to character. -#' If `NA` then column types are preserved. -#' @param na.rm Should missing values (including `NaN`) be omitted from the calculations? -#' @param name Name of the totals row and/or column. If both are created, and -#' `name` is a single string, that name is applied to both. If both are created -#' and `name` is a vector of length 2, the first element of the vector will be -#' used as the row name (in column 1), and the second element will be used as the -#' totals column name. Defaults to "Total". -#' @param ... Columns to total. This takes a tidyselect specification. By default, -#' all numeric columns (besides the initial column, if numeric) are included in -#' the totals, but this allows you to manually specify which columns should be -#' included, for use on a data.frame that does not result from a call to `tabyl`. -#' @return A `data.frame` augmented with a totals row, column, or both. -#' The `data.frame` is now also of class `tabyl` and stores information about -#' the attached totals and underlying data in the tabyl attributes. -#' @export -#' @examples -#' mtcars %>% -#' tabyl(am, cyl) %>% -#' adorn_totals() -adorn_totals <- function(dat, where = "row", fill = "-", na.rm = TRUE, name = "Total", ...) { - if ("both" %in% where) { - where <- c("row", "col") - } - # if input is a list, call purrr::map to recursively apply this function to each data.frame - if (is.list(dat) && !is.data.frame(dat)) { - purrr::map(dat, adorn_totals, where, fill, na.rm, name) - } else { - if (!is.data.frame(dat)) { - stop("adorn_totals() must be called on a data.frame or list of data.frames") - } - - numeric_cols <- which(vapply(dat, is.numeric, logical(1))) - non_numeric_cols <- setdiff(1:ncol(dat), numeric_cols) - - if (rlang::dots_n(...) == 0) { - # by default 1st column is not totaled so remove it from numeric_cols and add to non_numeric_cols - numeric_cols <- setdiff(numeric_cols, 1) - non_numeric_cols <- unique(c(1, non_numeric_cols)) - cols_to_total <- numeric_cols - } else { - expr <- rlang::expr(c(...)) - cols_to_total <- tidyselect::eval_select(expr, data = dat) - if (any(cols_to_total %in% non_numeric_cols)) { - cols_to_total <- setdiff(cols_to_total, non_numeric_cols) - } - } - - if (length(cols_to_total) == 0) { - stop("at least one targeted column must be of class numeric. Control target variables with the ... argument. adorn_totals should be called before other adorn_ functions.") - } - - if (sum(where %in% c("row", "col")) != length(where)) { - stop("\"where\" must be one of \"row\", \"col\", or c(\"row\", \"col\")") - } - - if (length(name) == 1) name <- rep(name, 2) - - - # grouped_df causes problems, #97 - if (inherits(dat, "grouped_df")) { - dat <- dplyr::ungroup(dat) - } - - dat <- as_tabyl(dat) # even a tabyl needs to be recast as a tabyl to reset the core in case it's been sorted - - # set totals attribute - if (sum(where %in% attr(dat, "totals")) > 0) { # if either of the values of "where" are already in totals attribute - stop("trying to re-add a totals dimension that is already been added") - } else if (length(attr(dat, "totals")) == 1) { - # if totals row OR col has already been adorned, append new axis to the current attribute - attr(dat, "totals") <- c(attr(dat, "totals"), where) - } else { - attr(dat, "totals") <- where - } - - if ("row" %in% where) { - # capture factor levels if relevant, #494 - factor_input <- is.factor(dat[[1]]) - if (factor_input) { - col1_backup <- dat[[1]][1] - } - # creates the totals row to be appended - col_sum <- function(a_col, na_rm = na.rm) { - # can't do this with if_else because it doesn't like the sum() of a character vector, - # even if that clause is not reached - if (is.numeric(a_col)) { - sum(a_col, na.rm = na_rm) - } else { - if (!is.character(fill)) { # if fill isn't a character string, use NA consistent with data types - switch(typeof(a_col), - "character" = NA_character_, - "integer" = NA_integer_, - "double" = if (inherits(a_col, "Date") || inherits(a_col, "POSIXt")) { - as.Date(NA_real_, origin = "1970-01-01") - } else { - NA_real_ - }, - "complex" = NA_complex_, - NA - ) - } else { - fill # otherwise just use the string provided - } - } - } - - if (is.character(fill)) { # if fill is a string, keep original implementation - col_totals <- purrr::map_df(dat, col_sum) - not_totaled_cols <- setdiff(1:length(col_totals), cols_to_total) - col_totals[not_totaled_cols] <- fill # reset numeric columns that weren't to be totaled - dat[not_totaled_cols] <- lapply(dat[not_totaled_cols], as.character) # type compatibility for bind_rows - } else { - cols_idx <- seq_along(dat) # get col indexes - names(cols_idx) <- names(dat) # name them using dat names - - col_totals <- purrr::map_df(cols_idx, function(i) { - if (is.numeric(dat[[i]]) && !i %in% cols_to_total) { # check if numeric and not to be totaled - switch(typeof(dat[[i]]), # and set to NA - "integer" = NA_integer_, - "double" = NA_real_, - NA - ) - } else { # otherwise run col_sum on the rest - col_sum(dat[[i]]) - } - }) - - if (!is.character(dat[[1]]) && !1 %in% cols_to_total) { - # convert first col to character so that name can be appended - dat[[1]] <- as.character(dat[[1]]) - col_totals[[1]] <- as.character(col_totals[[1]]) - } - } - - if (!1 %in% cols_to_total) { # give users the option to total the first column?? Up to them I guess - col_totals[1, 1] <- name[1] # replace first column value with name argument - } else { - message("Because the first column was specified to be totaled, it does not contain the label 'Total' (or user-specified name) in the totals row") - } - dat[(nrow(dat) + 1), ] <- col_totals[1, ] # insert totals_col as last row in dat - if (factor_input) { # restore factor/ordered info, #494 - dat[[1]] <- factor(dat[[1]], - levels = c(setdiff(levels(col1_backup), name[1]), name[1]), # don't add if level is present - ordered = is.ordered(col1_backup) - ) - } - } - - if ("col" %in% where) { - # Add totals col - row_totals <- dat %>% - dplyr::select(dplyr::all_of(cols_to_total) & dplyr::where(is.numeric)) %>% - dplyr::transmute(Total = rowSums(., na.rm = na.rm)) - - dat[[name[2]]] <- row_totals$Total - } - - dat - } -} diff --git a/R/as_and_untabyl.R b/R/as_and_untabyl.R deleted file mode 100644 index 235b3758..00000000 --- a/R/as_and_untabyl.R +++ /dev/null @@ -1,117 +0,0 @@ -#' Add `tabyl` attributes to a data.frame -#' -#' @description -#' A `tabyl` is a `data.frame` containing counts of a variable or -#' co-occurrences of two variables (a.k.a., a contingency table or crosstab). -#' This specialized kind of data.frame has attributes that enable `adorn_` -#' functions to be called for precise formatting and presentation of results. -#' E.g., display results as a mix of percentages, Ns, add totals rows or -#' columns, rounding options, in the style of Microsoft Excel PivotTable. -#' -#' A `tabyl` can be the result of a call to `janitor::tabyl()`, in which case -#' these attributes are added automatically. This function adds `tabyl` class -#' attributes to a data.frame that isn't the result of a call to `tabyl` but -#' meets the requirements of a two-way tabyl: 1) First column contains values of -#' variable 1 2) Column names 2:n are the values of variable 2 3) Numeric values -#' in columns 2:n are counts of the co-occurrences of the two variables.* -#' -#' * = this is the ideal form of a `tabyl`, but janitor's `adorn_` functions tolerate -#' and ignore non-numeric columns in positions 2:n. -#' -#' For instance, the result of [dplyr::count()] followed by [tidyr::pivot_wider()] -#' can be treated as a `tabyl`. -#' -#' The result of calling [tabyl()] on a single variable is a special class of -#' one-way tabyl; this function only pertains to the two-way tabyl. -#' -#' @param dat a data.frame with variable values in the first column and numeric -#' values in all other columns. -#' @param axes is this a two_way tabyl or a one_way tabyl? If this function is -#' being called by a user, this should probably be "2". One-way tabyls are -#' created by `tabyl` but are a special case. -#' @param row_var_name (optional) the name of the variable in the row dimension; -#' used by `adorn_title()`. -#' @param col_var_name (optional) the name of the variable in the column -#' dimension; used by `adorn_title()`. -#' @return Returns the same data.frame, but with the additional class of "tabyl" -#' and the attribute "core". -#' @export -#' @examples -#' as_tabyl(mtcars) -#' -as_tabyl <- function(dat, axes = 2, row_var_name = NULL, col_var_name = NULL) { - if (!axes %in% 1:2) { - stop("axes must be either 1 or 2") - } - - # check whether input meets requirements - if (!is.data.frame(dat)) { - stop("input must be a data.frame") - } - if (sum(unlist(lapply(dat, is.numeric))[-1]) == 0) { - stop("at least one one of columns 2:n must be of class numeric") - } - - # assign core attribute and classes - if (inherits(dat, "tabyl")) { - # if already a tabyl, may have totals row. - # Safest play is to simply reorder the core rows to match the dat rows - attr(dat, "core") <- attr(dat, "core")[order(match( - attr(dat, "core")[, 1], - dat[, 1] - )), ] - row.names(attr(dat, "core")) <- 1:nrow(attr(dat, "core")) # if they're sorted in the prior step above, this resets - } else { - attr(dat, "core") <- as.data.frame(dat) # core goes first so dat does not yet have attributes attached to it - } - - attr(dat, "tabyl_type") <- ifelse( - !is.null(attr(dat, "tabyl_type")), - attr(dat, "tabyl_type"), # if a one_way tabyl has as_tabyl called on it, it should stay a one_way #523 - dplyr::case_when( - axes == 1 ~ "one_way", - axes == 2 ~ "two_way" - ) - ) - class(dat) <- c("tabyl", setdiff(class(dat), "tabyl")) - - if (!missing(row_var_name) | !missing(col_var_name)) { - if (axes != 2) { - stop("variable names are only meaningful for two-way tabyls") - } - attr(dat, "var_names") <- list(row = row_var_name, col = col_var_name) - } - - dat -} - -#' Remove `tabyl` attributes from a data.frame. -#' -#' Strips away all `tabyl`-related attributes from a data.frame. -#' -#' @param dat a `data.frame` of class `tabyl`. -#' @return the same `data.frame`, but without the `tabyl` class and attributes. -#' @export -#' @examples -#' -#' mtcars %>% -#' tabyl(am) %>% -#' untabyl() %>% -#' attributes() # tabyl-specific attributes are gone -untabyl <- function(dat) { - # if input is a list, call purrr::map to recursively apply this function to each data.frame - if (is.list(dat) && !is.data.frame(dat)) { - purrr::map(dat, untabyl) - } else { - if (!inherits(dat, "tabyl")) { - warning("untabyl() called on a non-tabyl") - } - class(dat) <- class(dat)[!class(dat) %in% "tabyl"] - attr(dat, "core") <- NULL - # These attributes may not exist, but simpler to declare them NULL regardless than to check to see if they exist: - attr(dat, "totals") <- NULL - attr(dat, "tabyl_type") <- NULL # may not exist, but simpler to declare it NULL regardless than to check to see if it exists - attr(dat, "var_names") <- NULL # may not exist, but simpler to declare it NULL regardless than to check to see if it exists - dat - } -} diff --git a/R/get_level_groups.R b/R/get_level_groups.R deleted file mode 100644 index 0f3e6a37..00000000 --- a/R/get_level_groups.R +++ /dev/null @@ -1,28 +0,0 @@ -# Return groupings for a factor variable in the top_levels() function - -get_level_groups <- function(vec, n, num_levels_in_var) { - top_n_lvls <- paste(levels(vec)[1:n], collapse = ", ") - bot_n_lvls <- paste(levels(vec)[(num_levels_in_var - n + 1):num_levels_in_var], collapse = ", ") - - # Identify middle combinations, if needed - if (num_levels_in_var > 2 * n) { - mid_lvls <- paste(levels(vec)[(n + 1):(num_levels_in_var - n)], collapse = ", ") - } else { - mid_lvls <- NA - } - - # Truncate strings if needed - ## Middle groups are variable size, so displaying the N there is useful; - ## Top/Bottom are user-specified size, so just truncate the labels - if (!is.na(mid_lvls) & nchar(mid_lvls) > 30) { - mid_lvls <- paste0("<<< Middle Group (", num_levels_in_var - 2 * n, " categories) >>>") - } - if (nchar(top_n_lvls) > 30) { - top_n_lvls <- paste0(substr(top_n_lvls, 1, 27), "...") - } - if (nchar(bot_n_lvls) > 30) { - bot_n_lvls <- paste0(substr(bot_n_lvls, 1, 27), "...") - } - - list(top = top_n_lvls, mid = mid_lvls, bot = bot_n_lvls) -} diff --git a/R/janitor_deprecated.R b/R/janitor_deprecated.R index 946a2924..6a558ed6 100644 --- a/R/janitor_deprecated.R +++ b/R/janitor_deprecated.R @@ -2,12 +2,8 @@ #' #' These functions have already become defunct or may be defunct as soon as the next release. #' -#' * [adorn_crosstab()] -> `adorn_` -#' * [crosstab()] -> [tabyl()] #' * [use_first_valid_of()] -> [dplyr::coalesce()] #' * [convert_to_NA()] -> [dplyr::na_if()] -#' * [add_totals_col()] -> [`adorn_totals(where = "col")`][adorn_totals()] -#' * [add_totals_row()] -> [adorn_totals()] #' * [remove_empty_rows()] -> [`remove_empty("rows")`][remove_empty()] #' * [remove_empty_cols()] -> [`remove_empty("cols")`][remove_empty()] #' @@ -16,88 +12,6 @@ # EXCLUDE COVERAGE START NULL - - - -#' @title Generate a crosstabulation of two vectors. -#' @param ... arguments -#' @keywords internal -#' @description -#' This function is deprecated, use [`tabyl(dat, var1, var2)`][tabyl()] instead. -#' @export - -crosstab <- function(...) { - lifecycle::deprecate_stop( - when = "2.0.0", - what = "janitor::crosstab()", - with = "tabyl()", - details = "See the guide to tabyl(): https://cran.r-project.org/web/packages/janitor/vignettes/tabyls.html" - ) -} - -#' @title Add presentation formatting to a crosstabulation table. -#' @description -#' This function is deprecated, use [tabyl()] with the `adorn_` family of functions instead. -#' @param dat a data.frame with row names in the first column and numeric values in all other columns. Usually the piped-in result of a call to `crosstab` that included the argument `percent = "none"`. -#' @param denom the denominator to use for calculating percentages. One of "row", "col", or "all". -#' @param show_n should counts be displayed alongside the percentages? -#' @param digits how many digits should be displayed after the decimal point? -#' @param show_totals display a totals summary? Will be a row, column, or both depending on the value of `denom`. -#' @param rounding method to use for truncating percentages - either "half to even", the base R default method, or "half up", where 14.5 rounds up to 15. -#' @return Returns a data.frame. -#' @keywords internal -#' @export - -adorn_crosstab <- function(dat, denom = "row", show_n = TRUE, digits = 1, show_totals = FALSE, rounding = "half to even") { - lifecycle::deprecate_stop( - when = "2.0.0", - what = "janitor::adorn_crosstab()", - with = "tabyl()", - details = "See the adorn_* functions for formatting a tabyl: https://cran.r-project.org/web/packages/janitor/vignettes/tabyls.html" - ) -} - -#' @title Append a totals row to a data.frame. -#' -#' @description -#' This function is deprecated, use [adorn_totals()] instead. -#' -#' @param dat an input data.frame with at least one numeric column. -#' @param fill if there are more than one non-numeric columns, what string should fill the bottom row of those columns? -#' @param na.rm should missing values (including NaN) be omitted from the calculations? -#' @return Returns a data.frame with a totals row, consisting of "Total" in the first column and column sums in the others. -#' @keywords internal -#' @export -add_totals_row <- function(dat, fill = "-", na.rm = TRUE) { - lifecycle::deprecate_stop( - when = "2.0.0", - what = "janitor::add_totals_row()", - with = "adorn_totals()", - details = "See the adorn_* functions for formatting a tabyl or data.frame: https://cran.r-project.org/web/packages/janitor/vignettes/tabyls.html" - ) -} - -#' @title Append a totals column to a data.frame. -#' -#' @description -#' This function is deprecated, use [`adorn_totals(where = "col")`][adorn_totals()] instead. -#' -#' @param dat an input data.frame with at least one numeric column. -#' @param na.rm should missing values (including NaN) be omitted from the calculations? -#' @keywords internal -#' @return Returns a data.frame with a totals column containing row-wise sums. -#' @export - -add_totals_col <- function(dat, na.rm = TRUE) { - lifecycle::deprecate_stop( - when = "2.0.0", - what = "janitor::add_totals_cols()", - with = "adorn_totals()", - details = "See the adorn_* functions for formatting a tabyl or data.frame: https://cran.r-project.org/web/packages/janitor/vignettes/tabyls.html" - ) -} - - #' @title Returns first non-`NA` value from a set of vectors. #' #' @description diff --git a/R/print_tabyl.R b/R/print_tabyl.R deleted file mode 100644 index b971bc40..00000000 --- a/R/print_tabyl.R +++ /dev/null @@ -1,4 +0,0 @@ -#' @export -print.tabyl <- function(x, ...) { - print.data.frame(x, row.names = FALSE) -} diff --git a/R/statistical_tests.R b/R/statistical_tests.R deleted file mode 100644 index b0fe5b51..00000000 --- a/R/statistical_tests.R +++ /dev/null @@ -1,205 +0,0 @@ -#' Apply `stats::chisq.test()` to a two-way tabyl -#' -#' @description -#' This generic function overrides `stats::chisq.test`. If the passed table -#' is a two-way tabyl, it runs it through janitor::chisq.test.tabyl, otherwise -#' it just calls `stats::chisq.test()`. -#' -#' @param x a two-way tabyl, a numeric vector or a factor -#' @param ... other parameters passed to [stats::chisq.test()] -#' @return The result is the same as the one of `stats::chisq.test()`. -#' If `tabyl_results` is `TRUE`, the returned tables `observed`, `expected`, -#' `residuals` and `stdres` are converted to tabyls. -#' -#' @examples -#' tab <- tabyl(mtcars, gear, cyl) -#' chisq.test(tab) -#' chisq.test(tab)$residuals -#' -#' @export - -chisq.test <- function(x, ...) { - UseMethod("chisq.test") -} - - -#' @rdname chisq.test -#' @method chisq.test default -#' @param y if x is a vector, must be another vector or factor of the same length -#' @export - -chisq.test.default <- function(x, y = NULL, ...) { - # keep track of object names to keep `data.name` attribute - if (!is.null(y)) { - dname_x <- deparse(substitute(x)) - dname_y <- deparse(substitute(y)) - dname <- paste(dname_x, "and", dname_y) - } else { - dname <- deparse(substitute(x)) - } - - result <- stats::chisq.test(x, y, ...) - - # Replace object name in result for strict equality with stats::chisq.test - result$data.name <- dname - if (!is.null(y)) { - names(attr(result$observed, "dimnames")) <- c(dname_x, dname_y) - names(attr(result$expected, "dimnames")) <- c(dname_x, dname_y) - names(attr(result$residuals, "dimnames")) <- c(dname_x, dname_y) - names(attr(result$stdres, "dimnames")) <- c(dname_x, dname_y) - } - - result -} - - -#' @rdname chisq.test -#' @method chisq.test tabyl -#' @param tabyl_results If `TRUE` and `x` is a tabyl object, -#' also return `observed`, `expected`, `residuals` and `stdres` as tabyl. -#' @export - -chisq.test.tabyl <- function(x, tabyl_results = TRUE, ...) { - # keep track of object name to keep `data.name` attribute - dname <- deparse(substitute(x)) - - # check if table is a two-way tabyl - if (!(inherits(x, "tabyl") && attr(x, "tabyl_type") == "two_way")) { - stop("chisq.test.tabyl() must be applied to a two-way tabyl object") - } - - # check for and remove totals row / column, if present - if (!is.null(attr(x, "totals"))) { - if ("row" %in% attr(x, "totals")) { - x <- x[-nrow(x), ] - } - if ("col" %in% attr(x, "totals")) { - # this causes the var_names attribute to become NULL, not sure why - x[ncol(x)] <- NULL - } - warning("janitor::chisq.test.tabyl() detected a totals row and/or column. The totals were removed from the tabyl before the test was run. - If you intend to include the totals row and/or column in the test, first call untabyl() on the data.frame, then proceed from there.") - } - - rownames(x) <- x[[1]] - - result <- x %>% - dplyr::select(-1) %>% - as.matrix() %>% - as.table() %>% - stats::chisq.test(...) - - # Replace values and attributes for strict object equality - result$data.name <- dname - names(attr(result$observed, "dimnames")) <- c("", "") - names(attr(result$expected, "dimnames")) <- c("", "") - names(attr(result$residuals, "dimnames")) <- c("", "") - names(attr(result$stdres, "dimnames")) <- c("", "") - - # Return results tables as tabyl - if (tabyl_results) { - # Keep track of row names column name and var_names attributes - rownames_column <- names(x)[1] - var_names <- attr(x, "var_names") - - # For each returned table, convert it to a two-way tabyl - tables <- c("observed", "expected", "residuals", "stdres") - for (table in tables) { - tab <- result[[table]] - ttab <- as.data.frame.matrix(tab) - ttab[[rownames_column]] <- rownames(tab) - ttab <- ttab %>% dplyr::select(!!rownames_column, dplyr::everything()) - ttab <- as_tabyl(ttab) - attr(ttab, "var_names") <- var_names - result[[table]] <- ttab - } - } - - result -} - - - -#' Apply `stats::fisher.test()` to a two-way tabyl -#' -#' This generic function overrides [stats::fisher.test()]. If the passed table -#' is a two-way tabyl, it runs it through `janitor::fisher.test.tabyl`, otherwise -#' it just calls `stats::fisher.test()`. -#' -#' @return -#' The same as the one of `stats::fisher.test()`. -#' -#' @param x A two-way tabyl, a numeric vector or a factor -#' @param ... Parameters passed to [stats::fisher.test()] -#' -#' @examples -#' tab <- tabyl(mtcars, gear, cyl) -#' fisher.test(tab) -#' -#' @export - -fisher.test <- function(x, ...) { - UseMethod("fisher.test") -} - - -#' @rdname fisher.test -#' @method fisher.test default -#' @param y if x is a vector, must be another vector or factor of the same length -#' @export - -fisher.test.default <- function(x, y = NULL, ...) { - # keep track of object names to keep `data.name` attribute - if (!is.null(y)) { - dname_x <- deparse(substitute(x)) - dname_y <- deparse(substitute(y)) - dname <- paste(dname_x, "and", dname_y) - } else { - dname <- deparse(substitute(x)) - } - - result <- stats::fisher.test(x, y, ...) - result$data.name <- dname - - result -} - - -#' @rdname fisher.test -#' @method fisher.test tabyl -#' @export - -fisher.test.tabyl <- function(x, ...) { - # keep track of object name to keep `data.name` attribute - dname <- deparse(substitute(x)) - - # check if table is a two-way tabyl - if (!(inherits(x, "tabyl") && attr(x, "tabyl_type") == "two_way")) { - stop("fisher.test.tabyl() must be applied to a two-way tabyl object") - } - - # check for and remove totals row / column, if present - if (!is.null(attr(x, "totals"))) { - if ("row" %in% attr(x, "totals")) { - x <- x[-nrow(x), ] - } - if ("col" %in% attr(x, "totals")) { - x[ncol(x)] <- NULL - } - warning("janitor::fisher.test.tabyl() detected a totals row and/or column. The totals were removed from the tabyl before the test was run. - If you intend to include the totals row and/or column in the test, first call untabyl() on the data.frame, then proceed from there.") - } - - rownames(x) <- x[[1]] - - result <- x %>% - dplyr::select(-1) %>% - as.matrix() %>% - as.table() %>% - stats::fisher.test(...) - - # Replace values and attributes for strict object equality - result$data.name <- dname - - result -} diff --git a/R/tabyl.R b/R/tabyl.R deleted file mode 100644 index 9e534473..00000000 --- a/R/tabyl.R +++ /dev/null @@ -1,346 +0,0 @@ -#' Generate a frequency table (1-, 2-, or 3-way). -#' -#' @description -#' A fully-featured alternative to `table()`. Results are data.frames and can be -#' formatted and enhanced with janitor's family of `adorn_` functions. -#' -#' Specify a `data.frame` and the one, two, or three unquoted column names you -#' want to tabulate. Three variables generates a list of 2-way tabyls, -#' split by the third variable. -#' -#' Alternatively, you can tabulate a single variable that isn't in a `data.frame` -#' by calling `tabyl()` on a vector, e.g., `tabyl(mtcars$gear)`. -#' -#' @param dat A `data.frame` containing the variables you wish to count. -#' Or, a vector you want to tabulate. -#' @param var1 The column name of the first variable. -#' @param var2 (optional) the column name of the second variable -#' (its values become the column names in a 2-way tabulation). -#' @param var3 (optional) the column name of the third variable -#' (a 3-way tabulation is split into a list on its values). -#' @param show_na Should counts of `NA` values be displayed? In a one-way tabyl, -#' the presence of `NA` values triggers an additional column showing valid percentages -#' (calculated excluding `NA` values). -#' @param show_missing_levels Should counts of missing levels of factors be displayed? -#' These will be rows and/or columns of zeroes. Useful for keeping consistent -#' output dimensions even when certain factor levels may not be present in the data. -#' @param ... Additional arguments passed to methods. -#' @return A `data.frame` with frequencies and percentages of the tabulated variable(s). -#' A 3-way tabulation returns a list of data frames. -#' @export -#' @examples -#' -#' tabyl(mtcars, cyl) -#' tabyl(mtcars, cyl, gear) -#' tabyl(mtcars, cyl, gear, am) -#' -#' # or using the %>% pipe -#' mtcars %>% -#' tabyl(cyl, gear) -#' -#' # illustrating show_na functionality: -#' my_cars <- rbind(mtcars, rep(NA, 11)) -#' my_cars %>% tabyl(cyl) -#' my_cars %>% tabyl(cyl, show_na = FALSE) -#' -#' # Calling on a single vector not in a data.frame: -#' val <- c("hi", "med", "med", "lo") -#' tabyl(val) -tabyl <- function(dat, ...) UseMethod("tabyl") - - - -#' @export -#' @rdname tabyl -# this method runs when tabyl() is called on plain vectors; tabyl_1way -# also reverts to this method - -tabyl.default <- function(dat, show_na = TRUE, show_missing_levels = TRUE, ...) { - if (is.list(dat) && !"data.frame" %in% class(dat)) { - stop("tabyl() is meant to be called on vectors and data.frames; convert non-data.frame lists to one of these types") - } - # catch and adjust input variable name. - if (is.null(names(dat)) || is.vector(dat)) { - var_name <- deparse(substitute(dat)) - } else { - var_name <- names(dat) - } - - # useful error message if input vector doesn't exist - if (is.null(dat)) { - stop(paste0("object ", var_name, " not found")) - } - # an odd variable name can be deparsed into a vector of length >1, rare but throws warning, see issue #87 - if (length(var_name) > 1) { - var_name <- paste(var_name, collapse = "") - } - - # Try to retrieve label - if (is.data.frame(dat)) { - var_label <- attr(dat[, var_name], "label", exact = TRUE) %||% var_name - } else { - var_label <- attr(dat, "label", exact = TRUE) %||% var_name - } - - # if show_na is not length-1 logical, error helpfully (#377) - if (length(show_na) > 1 || !inherits(show_na, "logical")) { - stop("The value supplied to the \"show_na\" argument must be TRUE or FALSE.\n\nDid you try to call tabyl on two vectors, like tabyl(data$var1, data$var2) ? To create a two-way tabyl, the two vectors must be in the same data.frame, and the function should be called like this: \n - tabyl(data, var1, var2) - or - data %>% tabyl(var1, var2). \n\nSee ?tabyl for more.") - } - - # calculate initial counts table - # convert vector to a 1 col data.frame - if (mode(dat) %in% c("logical", "numeric", "character", "list") && !is.matrix(dat)) { - # to preserve factor properties when vec is passed in as a list from data.frame method: - if (is.list(dat)) { - dat <- dat[[1]] - } - dat_df <- data.frame(dat, stringsAsFactors = is.factor(dat)) - names(dat_df)[1] <- "dat" - result <- dat_df %>% dplyr::count(dat) - - if (is.factor(dat) && show_missing_levels) { - expanded <- tidyr::expand(result, dat) - result <- merge( # can't use left_join b/c NA matching changed in 0.6.0 - x = expanded, - y = result, - by = "dat", - all.x = TRUE, - all.y = TRUE - ) - result <- dplyr::arrange(result, dat) # restore sorting by factor level - } - } else { - stop("input must be a vector of type logical, numeric, character, list, or factor") - } - - # calculate percent, move NA row to bottom - result <- result %>% - dplyr::mutate(percent = n / sum(n, na.rm = TRUE)) - - # sort the NA row to the bottom, necessary to retain factor sorting - result <- result[order(is.na(result$dat)), ] - result$is_na <- NULL - - # replace all NA values with 0 - only applies to missing factor levels - result <- tidyr::replace_na(result, replace = list(n = 0, percent = 0)) - - ## NA handling: - # if there are NA values & show_na = T, calculate valid % as a new column - if (show_na && sum(is.na(result[[1]])) > 0) { - valid_total <- sum(result$n[!is.na(result[[1]])], na.rm = TRUE) - result$valid_percent <- result$n / valid_total - result$valid_percent[is.na(result[[1]])] <- NA - } else { # don't show NA values, which necessitates adjusting the %s - result <- result %>% - dplyr::filter(!is.na(.[, 1])) %>% - dplyr::mutate(percent = n / sum(n, na.rm = TRUE)) # recalculate % without NAs - } - - # reassign correct variable name (or label if it exists) - names(result)[1] <- var_label - - # in case input var name was "n" or "percent", call helper function to set unique names - result <- handle_if_special_names_used(result) - - data.frame(result, check.names = FALSE) %>% - as_tabyl(axes = 1) -} - - - -#' @export -#' @rdname tabyl -# Main dispatching function to underlying functions depending on whether "..." contains 1, 2, or 3 variables -tabyl.data.frame <- function(dat, var1, var2, var3, show_na = TRUE, show_missing_levels = TRUE, ...) { - if (missing(var1) && missing(var2) && missing(var3)) { - stop("if calling on a data.frame, specify unquoted column names(s) to tabulate. Did you mean to call tabyl() on a vector?") - } - if (dplyr::is_grouped_df(dat)) { - dat <- dplyr::ungroup(dat) - } - - if (missing(var2) && missing(var3) && !missing(var1)) { - tabyl_1way(dat, rlang::enquo(var1), show_na = show_na, show_missing_levels = show_missing_levels) - } else if (missing(var3) && !missing(var1) && !missing(var2)) { - tabyl_2way(dat, rlang::enquo(var1), rlang::enquo(var2), show_na = show_na, show_missing_levels = show_missing_levels) - } else if (!missing(var1) && - !missing(var2) && - !missing(var3)) { - tabyl_3way(dat, rlang::enquo(var1), rlang::enquo(var2), rlang::enquo(var3), show_na = show_na, show_missing_levels = show_missing_levels) - } else { - stop("please specify var1 OR var1 & var2 OR var1 & var2 & var3") - } -} - -# a one-way frequency table; this was called "tabyl" in janitor <= 0.3.0 -tabyl_1way <- function(dat, var1, show_na = TRUE, show_missing_levels = TRUE) { - x <- dplyr::select(dat, !!var1) - - # gather up arguments, pass them to tabyl.default - arguments <- list() - arguments$dat <- x[1] - arguments$show_na <- show_na - arguments$show_missing_levels <- show_missing_levels - do.call(tabyl.default, - args = arguments - ) -} - - -# a two-way frequency table; this was called "crosstab" in janitor <= 0.3.0 -tabyl_2way <- function(dat, var1, var2, show_na = TRUE, show_missing_levels = TRUE) { - dat <- dplyr::select(dat, !!var1, !!var2) - - if (!show_na) { - dat <- dat[!is.na(dat[[1]]) & !is.na(dat[[2]]), ] - } - if (nrow(dat) == 0) { # if passed a zero-length input, or an entirely NA input, return a zero-row data.frame - message("No records to count so returning a zero-row tabyl") - return(dat %>% - dplyr::select(1) %>% - dplyr::slice(0)) - } - - tabl <- dat %>% - dplyr::count(!!var1, !!var2, name = "tabyl_2way_n") - - # Optionally expand missing factor levels. - if (show_missing_levels) { - tabl <- tidyr::complete(tabl, !!var1, !!var2) - } - - # replace NA with string NA_ in vec2 to avoid invalid col name after spreading - # if this col is a factor, need to add that level to the factor - if (is.numeric(tabl[[2]])) { # have numerics treated like factors to not spread alphabetically - tabl[[2]] <- ordered(tabl[[2]], levels = unique(tabl[[2]])) - } - if (is.factor(tabl[[2]])) { - levels(tabl[[2]]) <- c(levels(tabl[[2]]), "emptystring_", "NA_") - } else { - tabl[2] <- as.character(tabl[[2]]) - } - tabl[2][is.na(tabl[2])] <- "NA_" - tabl[2][tabl[2] == ""] <- "emptystring_" - result <- tabl %>% - tidyr::pivot_wider( - names_from = !!var2, - values_from = "tabyl_2way_n", - values_fn = ~ dplyr::coalesce(.x, 0L), - values_fill = 0L, - names_sort = TRUE - ) - if ("emptystring_" %in% names(result)) { - result <- result[c(setdiff(names(result), "emptystring_"), "emptystring_")] - if (getOption("tabyl.emptystring", TRUE) & interactive()) { - message("The tabyl's column variable contained the empty string value, \"\". This is not a legal column name and has been converted to \"emptystring_\".\nConsider converting \"\" to NA if appropriate.\nThis message is shown once per session and may be disabled by setting options(\"tabyl.emptystring\" = FALSE).") # nocov - options("tabyl.emptystring" = FALSE) # nocov - } - } - if ("NA_" %in% names(result)) { - # move NA_ column to end, from http://stackoverflow.com/a/18339562 - result <- result[c(setdiff(names(result), "NA_"), "NA_")] - } - - row_var_name <- names(dat)[1] - col_var_name <- names(dat)[2] - names(result)[1] <- attr(dat[, 1], "label", exact = TRUE) %||% names(result)[1] - data.frame(result, check.names = FALSE) %>% - as_tabyl(axes = 2, row_var_name = row_var_name, col_var_name = col_var_name) -} - - -# a list of two-way frequency tables, split into a list on a third variable -tabyl_3way <- function(dat, var1, var2, var3, show_na = TRUE, show_missing_levels = TRUE) { - dat <- dplyr::select(dat, !!var1, !!var2, !!var3) - var3_numeric <- is.numeric(dat[[3]]) - - # Preserve labels, as attributes are sometimes dropped during transformations. - var1_label <- attr(dat[, 1], "label", exact = TRUE) - var2_label <- attr(dat[, 2], "label", exact = TRUE) - - # Keep factor levels for ordering the list at the end - if (is.factor(dat[[3]])) { - third_levels_for_sorting <- levels(dat[[3]]) - } - dat[[3]] <- as.character(dat[[3]]) # don't want empty factor levels in the result list - they would be empty data.frames - - # grab class of 1st variable to restore it later - col1_class <- class(dat[[1]]) - col1_levels <- NULL - if (is.factor(dat[[1]])) { - col1_levels <- levels(dat[[1]]) - } - - # print NA level as its own data.frame, and make it appear last - if (show_na && sum(is.na(dat[[3]])) > 0) { - dat[[3]] <- factor(dat[[3]], levels = c(sort(unique(dat[[3]])), "NA_")) - dat[[3]][is.na(dat[[3]])] <- "NA_" - if (exists("third_levels_for_sorting")) { - third_levels_for_sorting <- c(third_levels_for_sorting, "NA_") - } - } - - if (show_missing_levels) { # needed to have each crosstab in the list aware of all values in the pre-split variables - dat[[1]] <- as.factor(dat[[1]]) - dat[[2]] <- as.factor(dat[[2]]) - } - - result <- split(dat, dat[[rlang::quo_name(var3)]]) - # split() drops attributes, so we manually add back the label attributes. - result <- lapply(result, function(x) { - attr(x[[1]], "label") <- var1_label - attr(x[[2]], "label") <- var2_label - x - }) - result <- result %>% - purrr::map(tabyl_2way, var1, var2, show_na = show_na, show_missing_levels = show_missing_levels) %>% - purrr::map(reset_1st_col_status, col1_class, col1_levels) # reset class of var in 1st col to its input class, #168 - - # reorder when var 3 is a factor, per #250 - if (exists("third_levels_for_sorting")) { - result <- result[order(third_levels_for_sorting[third_levels_for_sorting %in% unique(dat[[3]])])] - } - - if (var3_numeric) { - result <- result[order(suppressWarnings(as.numeric(names(result))), na.last = TRUE)] - } - - result -} - -### Helper functions called by tabyl() ------------ - -# function that checks if col 1 name is "n" or "percent", -## if so modifies the appropriate other column name to avoid duplicates -handle_if_special_names_used <- function(dat) { - if (names(dat)[1] == "n") { - names(dat)[2] <- "n_n" - } else if (names(dat)[1] == "percent") { - names(dat)[3] <- "percent_percent" - } - dat -} - -# reset the 1st col's class of a data.frame to a provided class -# also reset in tabyl's core -reset_1st_col_status <- function(dat, new_class, lvls) { - if ("factor" %in% new_class) { - dat[[1]] <- factor(dat[[1]], - levels = lvls, - ordered = ("ordered" %in% new_class) - ) - attr(dat, "core")[[1]] <- factor(attr(dat, "core")[[1]], - levels = lvls, - ordered = ("ordered" %in% new_class) - ) - } else { - dat[[1]] <- as.character(dat[[1]]) # first do as.character in case eventual class is numeric - class(dat[[1]]) <- new_class - attr(dat, "core")[[1]] <- as.character(attr(dat, "core")[[1]]) - class(attr(dat, "core")[[1]]) <- new_class - } - dat -} diff --git a/R/top_levels.R b/R/top_levels.R deleted file mode 100644 index 87320c11..00000000 --- a/R/top_levels.R +++ /dev/null @@ -1,65 +0,0 @@ -#' Generate a frequency table of a factor grouped into top-n, bottom-n, and all -#' other levels. -#' -#' Get a frequency table of a factor variable, grouped into categories by level. -#' -#' @param input_vec The factor variable to tabulate. -#' @param n Number of levels to include in top and bottom groups -#' @param show_na Should cases where the variable is `NA` be shown? -#' @return A `data.frame` (actually a `tbl_df`) with the frequencies of the -#' grouped, tabulated variable. Includes counts and percentages, and valid -#' percentages (calculated omitting `NA` values, if present in the vector and -#' `show_na = TRUE`.) -#' @export -#' @examples -#' top_levels(as.factor(mtcars$hp), 2) -top_levels <- function(input_vec, n = 2, show_na = FALSE) { - # Initial type error catching - if (!is.factor(input_vec)) { - stop("factor_vec is not of type 'factor'") - } - - num_levels_in_var <- nlevels(input_vec) - - # handle bad inputs - if (!num_levels_in_var > 2) { - stop("input factor variable must have at least 3 levels") - } - if (num_levels_in_var < 2 * n) { - stop(paste0( - "there are ", num_levels_in_var, " levels in the variable and ", - n, " levels in each of the top and bottom groups.\nSince 2 * ", n, " = ", 2 * n, - " is greater than ", num_levels_in_var, ", - there would be overlap in the top and bottom groups and some records will be double-counted." - )) - } - if (n < 1 || n %% 1 != 0) { - stop("n must be a whole number at least 1") - } - - var_name <- deparse(substitute(input_vec)) - - # Identify top/mid/bottom group labels for printing - groups <- get_level_groups(input_vec, n, num_levels_in_var) - - # convert input vector into grouped variable - new_vec <- ifelse(as.numeric(input_vec) <= n, - groups$top, - ifelse(as.numeric(input_vec) > (num_levels_in_var - n), - groups$bot, - groups$mid - ) - ) - - # recode variable as hi-med-lo factor so table prints w/ correct sorting - if (!is.na(groups$mid)) { - new_vec <- factor(new_vec, levels = c(groups$top, groups$mid, groups$bot)) - } else { - new_vec <- factor(new_vec, levels = c(groups$top, groups$bot)) - } - - # tabulate grouped variable, then reset name to match input variable name - result <- tabyl(new_vec, show_na = show_na) - names(result)[1] <- var_name - result -} diff --git a/README.Rmd b/README.Rmd index ed66eec6..fba821cc 100644 --- a/README.Rmd +++ b/README.Rmd @@ -143,56 +143,6 @@ roster %>% get_dupes(contains("name")) Yes, some teachers appear twice. We ought to address this before counting employees. -#### Tabulating tools -A variable (or combinations of two or three variables) can be tabulated with `tabyl()`. The resulting data.frame can be tweaked and formatted -with the suite of `adorn_` functions for quick analysis and printing of pretty results in a report. `adorn_` functions can be helpful with non-tabyls, too. - -#### `tabyl()` - -Like `table()`, but pipe-able, data.frame-based, and fully featured. - -`tabyl()` can be called two ways: - -* On a vector, when tabulating a single variable: `tabyl(roster$subject)` -* On a data.frame, specifying 1, 2, or 3 variable names to tabulate: `roster %>% tabyl(subject, employee_status)`. - * Here the data.frame is passed in with the `%>%` pipe; this allows `tabyl` to be used in an analysis pipeline - -One variable: -```{r} -roster %>% - tabyl(subject) -``` - -Two variables: -```{r} -roster %>% - filter(hire_date > as.Date("1950-01-01")) %>% - tabyl(employee_status, full_time) -``` - -Three variables: -```{r} -roster %>% - tabyl(full_time, subject, employee_status, show_missing_levels = FALSE) -``` - -#### Adorning tabyls -The `adorn_` functions dress up the results of these tabulation calls for fast, basic reporting. Here are some of the functions that augment a summary table for reporting: - -```{r} -roster %>% - tabyl(employee_status, full_time) %>% - adorn_totals("row") %>% - adorn_percentages("row") %>% - adorn_pct_formatting() %>% - adorn_ns() %>% - adorn_title("combined") -``` - -Pipe that right into `knitr::kable()` in your RMarkdown report. - -These modular adornments can be layered to reduce R's deficit against Excel and SPSS when it comes to quick, informative counts. Learn more about `tabyl()` and the `adorn_` functions from the [tabyls vignette](https://sfirke.github.io/janitor/articles/tabyls.html). - ## Contact me You are welcome to: diff --git a/man/add_totals_col.Rd b/man/add_totals_col.Rd deleted file mode 100644 index c0c0c38c..00000000 --- a/man/add_totals_col.Rd +++ /dev/null @@ -1,20 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/janitor_deprecated.R -\name{add_totals_col} -\alias{add_totals_col} -\title{Append a totals column to a data.frame.} -\usage{ -add_totals_col(dat, na.rm = TRUE) -} -\arguments{ -\item{dat}{an input data.frame with at least one numeric column.} - -\item{na.rm}{should missing values (including NaN) be omitted from the calculations?} -} -\value{ -Returns a data.frame with a totals column containing row-wise sums. -} -\description{ -This function is deprecated, use \code{\link[=adorn_totals]{adorn_totals(where = "col")}} instead. -} -\keyword{internal} diff --git a/man/add_totals_row.Rd b/man/add_totals_row.Rd deleted file mode 100644 index d6dc8883..00000000 --- a/man/add_totals_row.Rd +++ /dev/null @@ -1,22 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/janitor_deprecated.R -\name{add_totals_row} -\alias{add_totals_row} -\title{Append a totals row to a data.frame.} -\usage{ -add_totals_row(dat, fill = "-", na.rm = TRUE) -} -\arguments{ -\item{dat}{an input data.frame with at least one numeric column.} - -\item{fill}{if there are more than one non-numeric columns, what string should fill the bottom row of those columns?} - -\item{na.rm}{should missing values (including NaN) be omitted from the calculations?} -} -\value{ -Returns a data.frame with a totals row, consisting of "Total" in the first column and column sums in the others. -} -\description{ -This function is deprecated, use \code{\link[=adorn_totals]{adorn_totals()}} instead. -} -\keyword{internal} diff --git a/man/adorn_crosstab.Rd b/man/adorn_crosstab.Rd deleted file mode 100644 index f3a94e42..00000000 --- a/man/adorn_crosstab.Rd +++ /dev/null @@ -1,35 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/janitor_deprecated.R -\name{adorn_crosstab} -\alias{adorn_crosstab} -\title{Add presentation formatting to a crosstabulation table.} -\usage{ -adorn_crosstab( - dat, - denom = "row", - show_n = TRUE, - digits = 1, - show_totals = FALSE, - rounding = "half to even" -) -} -\arguments{ -\item{dat}{a data.frame with row names in the first column and numeric values in all other columns. Usually the piped-in result of a call to \code{crosstab} that included the argument \code{percent = "none"}.} - -\item{denom}{the denominator to use for calculating percentages. One of "row", "col", or "all".} - -\item{show_n}{should counts be displayed alongside the percentages?} - -\item{digits}{how many digits should be displayed after the decimal point?} - -\item{show_totals}{display a totals summary? Will be a row, column, or both depending on the value of \code{denom}.} - -\item{rounding}{method to use for truncating percentages - either "half to even", the base R default method, or "half up", where 14.5 rounds up to 15.} -} -\value{ -Returns a data.frame. -} -\description{ -This function is deprecated, use \code{\link[=tabyl]{tabyl()}} with the \code{adorn_} family of functions instead. -} -\keyword{internal} diff --git a/man/adorn_ns.Rd b/man/adorn_ns.Rd deleted file mode 100644 index aeaa0b43..00000000 --- a/man/adorn_ns.Rd +++ /dev/null @@ -1,82 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/adorn_ns.R -\name{adorn_ns} -\alias{adorn_ns} -\title{Add underlying Ns to a tabyl displaying percentages.} -\usage{ -adorn_ns( - dat, - position = "rear", - ns = attr(dat, "core"), - format_func = function(x) { - format(x, big.mark = ",") - }, - ... -) -} -\arguments{ -\item{dat}{A data.frame of class \code{tabyl} that has had \code{adorn_percentages} and/or -\code{adorn_pct_formatting} called on it. If given a list of data.frames, -this function will apply itself to each data.frame in the list (designed for 3-way \code{tabyl} lists).} - -\item{position}{Should the N go in the front, or in the rear, of the percentage?} - -\item{ns}{The Ns to append. The default is the "core" attribute of the input tabyl -\code{dat}, where the original Ns of a two-way \code{tabyl} are stored. However, if your Ns -are stored somewhere else, or you need to customize them beyond what can be done -with \code{format_func}, you can supply them here.} - -\item{format_func}{A formatting function to run on the Ns. Consider defining -with \code{\link[base:format]{base::format()}}.} - -\item{...}{Columns to adorn. This takes a tidyselect specification. By default, -all columns are adorned except for the first column and columns not of class -\code{numeric}, but this allows you to manually specify which columns should be adorned, -for use on a data.frame that does not result from a call to \code{tabyl}.} -} -\value{ -A \code{data.frame} with Ns appended -} -\description{ -This function adds back the underlying Ns to a \code{tabyl} whose percentages were -calculated using \code{\link[=adorn_percentages]{adorn_percentages()}}, to display the Ns and percentages together. -You can also call it on a non-tabyl data.frame to which you wish to append Ns. -} -\examples{ -mtcars \%>\% - tabyl(am, cyl) \%>\% - adorn_percentages("col") \%>\% - adorn_pct_formatting() \%>\% - adorn_ns(position = "front") - -# Format the Ns with a custom format_func: -set.seed(1) -bigger_dat <- data.frame( - sex = rep(c("m", "f"), 3000), - age = round(runif(3000, 1, 102), 0) -) -bigger_dat$age_group <- cut(bigger_dat$age, quantile(bigger_dat$age, c(0, 1 / 3, 2 / 3, 1))) - -bigger_dat \%>\% - tabyl(age_group, sex, show_missing_levels = FALSE) \%>\% - adorn_totals(c("row", "col")) \%>\% - adorn_percentages("col") \%>\% - adorn_pct_formatting(digits = 1) \%>\% - adorn_ns(format_func = function(x) format(x, big.mark = ".", decimal.mark = ",")) -# Control the columns to be adorned with the ... variable selection argument -# If using only the ... argument, you can use empty commas as shorthand -# to supply the default values to the preceding arguments: - -cases <- data.frame( - region = c("East", "West"), - year = 2015, - recovered = c(125, 87), - died = c(13, 12) -) - -cases \%>\% - adorn_percentages("col",,recovered:died) \%>\% - adorn_pct_formatting(,,,,,recovered:died) \%>\% - adorn_ns(,,,recovered:died) - -} diff --git a/man/adorn_pct_formatting.Rd b/man/adorn_pct_formatting.Rd deleted file mode 100644 index 13a49c6a..00000000 --- a/man/adorn_pct_formatting.Rd +++ /dev/null @@ -1,71 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/adorn_pct_formatting.R -\name{adorn_pct_formatting} -\alias{adorn_pct_formatting} -\title{Format a \code{data.frame} of decimals as percentages.} -\usage{ -adorn_pct_formatting( - dat, - digits = 1, - rounding = "half to even", - affix_sign = TRUE, - ... -) -} -\arguments{ -\item{dat}{a data.frame with decimal values, typically the result of a call -to \code{adorn_percentages} on a \code{tabyl}. If given a list of data.frames, this -function will apply itself to each data.frame in the list (designed for -3-way \code{tabyl} lists).} - -\item{digits}{how many digits should be displayed after the decimal point?} - -\item{rounding}{method to use for rounding - either "half to even", the base -R default method, or "half up", where 14.5 rounds up to 15.} - -\item{affix_sign}{should the \% sign be affixed to the end?} - -\item{...}{columns to adorn. This takes a tidyselect specification. By -default, all numeric columns (besides the initial column, if numeric) are -adorned, but this allows you to manually specify which columns should be -adorned, for use on a data.frame that does not result from a call to -\code{tabyl}.} -} -\value{ -a data.frame with formatted percentages -} -\description{ -Numeric columns get multiplied by 100 and formatted as -percentages according to user specifications. This function defaults to -excluding the first column of the input data.frame, assuming that it contains -a descriptive variable, but this can be overridden by specifying the columns -to adorn in the \code{...} argument. Non-numeric columns are always excluded. - -The decimal separator character is the result of \code{getOption("OutDec")}, which -is based on the user's locale. If the default behavior is undesirable, -change this value ahead of calling the function, either by changing locale or -with \code{options(OutDec = ",")}. This aligns the decimal separator character -with that used in \code{base::print()}. -} -\examples{ -mtcars \%>\% - tabyl(am, cyl) \%>\% - adorn_percentages("col") \%>\% - adorn_pct_formatting() - -# Control the columns to be adorned with the ... variable selection argument -# If using only the ... argument, you can use empty commas as shorthand -# to supply the default values to the preceding arguments: - -cases <- data.frame( - region = c("East", "West"), - year = 2015, - recovered = c(125, 87), - died = c(13, 12) -) - -cases \%>\% - adorn_percentages("col", , recovered:died) \%>\% - adorn_pct_formatting(, , , recovered:died) - -} diff --git a/man/adorn_percentages.Rd b/man/adorn_percentages.Rd deleted file mode 100644 index 0d6b8714..00000000 --- a/man/adorn_percentages.Rd +++ /dev/null @@ -1,57 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/adorn_percentages.R -\name{adorn_percentages} -\alias{adorn_percentages} -\title{Convert a data.frame of counts to percentages.} -\usage{ -adorn_percentages(dat, denominator = "row", na.rm = TRUE, ...) -} -\arguments{ -\item{dat}{A \code{tabyl} or other data.frame with a tabyl-like layout. -If given a list of data.frames, this function will apply itself to each -\code{data.frame} in the list (designed for 3-way \code{tabyl} lists).} - -\item{denominator}{The direction to use for calculating percentages. -One of "row", "col", or "all".} - -\item{na.rm}{should missing values (including \code{NaN}) be omitted from the calculations?} - -\item{...}{columns to adorn. This takes a <\code{\link[dplyr:dplyr_tidy_select]{tidy-select}}> -specification. By default, all numeric columns (besides the initial column, if numeric) -are adorned, but this allows you to manually specify which columns should -be adorned, for use on a \code{data.frame} that does not result from a call to \code{\link[=tabyl]{tabyl()}}.} -} -\value{ -A \code{data.frame} of percentages, expressed as numeric values between 0 and 1. -} -\description{ -This function defaults to excluding the first column of the input data.frame, -assuming that it contains a descriptive variable, but this can be overridden -by specifying the columns to adorn in the \code{...} argument. -} -\examples{ - -mtcars \%>\% - tabyl(am, cyl) \%>\% - adorn_percentages("col") - -# calculates correctly even with totals column and/or row: -mtcars \%>\% - tabyl(am, cyl) \%>\% - adorn_totals("row") \%>\% - adorn_percentages() - -# Control the columns to be adorned with the ... variable selection argument -# If using only the ... argument, you can use empty commas as shorthand -# to supply the default values to the preceding arguments: - -cases <- data.frame( - region = c("East", "West"), - year = 2015, - recovered = c(125, 87), - died = c(13, 12) -) - -cases \%>\% - adorn_percentages(, , recovered:died) -} diff --git a/man/adorn_rounding.Rd b/man/adorn_rounding.Rd deleted file mode 100644 index 409ce000..00000000 --- a/man/adorn_rounding.Rd +++ /dev/null @@ -1,67 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/adorn_rounding.R -\name{adorn_rounding} -\alias{adorn_rounding} -\title{Round the numeric columns in a data.frame.} -\usage{ -adorn_rounding(dat, digits = 1, rounding = "half to even", ...) -} -\arguments{ -\item{dat}{A \code{tabyl} or other \code{data.frame} with similar layout. -If given a list of data.frames, this function will apply itself to each -\code{data.frame} in the list (designed for 3-way \code{tabyl} lists).} - -\item{digits}{How many digits should be displayed after the decimal point?} - -\item{rounding}{Method to use for rounding - either "half to even" -(the base R default method), or "half up", where 14.5 rounds up to 15.} - -\item{...}{Columns to adorn. This takes a tidyselect specification. -By default, all numeric columns (besides the initial column, if numeric) -are adorned, but this allows you to manually specify which columns should -be adorned, for use on a data.frame that does not result from a call to \code{tabyl}.} -} -\value{ -The \code{data.frame} with rounded numeric columns. -} -\description{ -Can run on any \code{data.frame} with at least one numeric column. -This function defaults to excluding the first column of the input data.frame, -assuming that it contains a descriptive variable, but this can be overridden by -specifying the columns to round in the \code{...} argument. - -If you're formatting percentages, e.g., the result of \code{\link[=adorn_percentages]{adorn_percentages()}}, -use \code{\link[=adorn_pct_formatting]{adorn_pct_formatting()}} instead. This is a more flexible variant for ad-hoc usage. -Compared to \code{adorn_pct_formatting()}, it does not multiply by 100 or pad the -numbers with spaces for alignment in the results \code{data.frame}. -This function retains the class of numeric input columns. -} -\examples{ - -mtcars \%>\% - tabyl(am, cyl) \%>\% - adorn_percentages() \%>\% - adorn_rounding(digits = 2, rounding = "half up") - -# tolerates non-numeric columns: -library(dplyr) -mtcars \%>\% - tabyl(am, cyl) \%>\% - adorn_percentages("all") \%>\% - mutate(dummy = "a") \%>\% - adorn_rounding() - -# Control the columns to be adorned with the ... variable selection argument -# If using only the ... argument, you can use empty commas as shorthand -# to supply the default values to the preceding arguments: -cases <- data.frame( - region = c("East", "West"), - year = 2015, - recovered = c(125, 87), - died = c(13, 12) -) - -cases \%>\% - adorn_percentages(, , ends_with("ed")) \%>\% - adorn_rounding(, , all_of(c("recovered", "died"))) -} diff --git a/man/adorn_title.Rd b/man/adorn_title.Rd deleted file mode 100644 index 0bdd54f3..00000000 --- a/man/adorn_title.Rd +++ /dev/null @@ -1,58 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/adorn_title.R -\name{adorn_title} -\alias{adorn_title} -\title{Add column name to the top of a two-way tabyl.} -\usage{ -adorn_title(dat, placement = "top", row_name, col_name) -} -\arguments{ -\item{dat}{A \code{data.frame} of class \code{tabyl} or other \code{data.frame} with a tabyl-like layout. -If given a list of data.frames, this function will apply itself to each \code{data.frame} -in the list (designed for 3-way \code{tabyl} lists).} - -\item{placement}{The title placement, one of \code{"top"}, or \code{"combined"}. -See \strong{Details} for more information.} - -\item{row_name}{(optional) default behavior is to pull the row name from the -attributes of the input \code{tabyl} object. If you wish to override that text, -or if your input is not a \code{tabyl}, supply a string here.} - -\item{col_name}{(optional) default behavior is to pull the column_name from -the attributes of the input \code{tabyl} object. If you wish to override that text, -or if your input is not a \code{tabyl}, supply a string here.} -} -\value{ -The input \code{tabyl}, augmented with the column title. Non-tabyl inputs -that are of class \code{tbl_df} are downgraded to basic data.frames so that the -title row prints correctly. -} -\description{ -This function adds the column variable name to the top of a \code{tabyl} for a -complete display of information. This makes the tabyl prettier, but renders -the \code{data.frame} less useful for further manipulation. -} -\details{ -The \code{placement} argument indicates whether the column name should be added to -the \code{top} of the tabyl in an otherwise-empty row \code{"top"} or appended to the -already-present row name variable (\code{"combined"}). The formatting in the \code{"top"} -option has the look of base R's \code{table()}; it also wipes out the other column -names, making it hard to further use the \code{data.frame} besides formatting it for reporting. -The \code{"combined"} option is more conservative in this regard. -} -\examples{ - -mtcars \%>\% - tabyl(am, cyl) \%>\% - adorn_title(placement = "top") - -# Adding a title to a non-tabyl -library(tidyr) -library(dplyr) -mtcars \%>\% - group_by(gear, am) \%>\% - summarise(avg_mpg = mean(mpg), .groups = "drop") \%>\% - pivot_wider(names_from = am, values_from = avg_mpg) \%>\% - adorn_rounding() \%>\% - adorn_title("top", row_name = "Gears", col_name = "Cylinders") -} diff --git a/man/adorn_totals.Rd b/man/adorn_totals.Rd deleted file mode 100644 index 2f6a55ef..00000000 --- a/man/adorn_totals.Rd +++ /dev/null @@ -1,49 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/adorn_totals.R -\name{adorn_totals} -\alias{adorn_totals} -\title{Append a totals row and/or column to a data.frame} -\usage{ -adorn_totals(dat, where = "row", fill = "-", na.rm = TRUE, name = "Total", ...) -} -\arguments{ -\item{dat}{An input \code{data.frame} with at least one numeric column. If given a -list of data.frames, this function will apply itself to each \code{data.frame} -in the list (designed for 3-way \code{tabyl} lists).} - -\item{where}{One of "row", "col", or \code{c("row", "col")}} - -\item{fill}{If there are non-numeric columns, what should fill the bottom row -of those columns? If a string, relevant columns will be coerced to character. -If \code{NA} then column types are preserved.} - -\item{na.rm}{Should missing values (including \code{NaN}) be omitted from the calculations?} - -\item{name}{Name of the totals row and/or column. If both are created, and -\code{name} is a single string, that name is applied to both. If both are created -and \code{name} is a vector of length 2, the first element of the vector will be -used as the row name (in column 1), and the second element will be used as the -totals column name. Defaults to "Total".} - -\item{...}{Columns to total. This takes a tidyselect specification. By default, -all numeric columns (besides the initial column, if numeric) are included in -the totals, but this allows you to manually specify which columns should be -included, for use on a data.frame that does not result from a call to \code{tabyl}.} -} -\value{ -A \code{data.frame} augmented with a totals row, column, or both. -The \code{data.frame} is now also of class \code{tabyl} and stores information about -the attached totals and underlying data in the tabyl attributes. -} -\description{ -This function defaults to excluding the first column of the input data.frame, -assuming that it contains a descriptive variable, but this can be overridden -by specifying the columns to be totaled in the \code{...} argument. Non-numeric -columns are converted to character class and have a user-specified fill character -inserted in the totals row. -} -\examples{ -mtcars \%>\% - tabyl(am, cyl) \%>\% - adorn_totals() -} diff --git a/man/as_tabyl.Rd b/man/as_tabyl.Rd deleted file mode 100644 index 635898d1..00000000 --- a/man/as_tabyl.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_and_untabyl.R -\name{as_tabyl} -\alias{as_tabyl} -\title{Add \code{tabyl} attributes to a data.frame} -\usage{ -as_tabyl(dat, axes = 2, row_var_name = NULL, col_var_name = NULL) -} -\arguments{ -\item{dat}{a data.frame with variable values in the first column and numeric -values in all other columns.} - -\item{axes}{is this a two_way tabyl or a one_way tabyl? If this function is -being called by a user, this should probably be "2". One-way tabyls are -created by \code{tabyl} but are a special case.} - -\item{row_var_name}{(optional) the name of the variable in the row dimension; -used by \code{adorn_title()}.} - -\item{col_var_name}{(optional) the name of the variable in the column -dimension; used by \code{adorn_title()}.} -} -\value{ -Returns the same data.frame, but with the additional class of "tabyl" -and the attribute "core". -} -\description{ -A \code{tabyl} is a \code{data.frame} containing counts of a variable or -co-occurrences of two variables (a.k.a., a contingency table or crosstab). -This specialized kind of data.frame has attributes that enable \code{adorn_} -functions to be called for precise formatting and presentation of results. -E.g., display results as a mix of percentages, Ns, add totals rows or -columns, rounding options, in the style of Microsoft Excel PivotTable. - -A \code{tabyl} can be the result of a call to \code{janitor::tabyl()}, in which case -these attributes are added automatically. This function adds \code{tabyl} class -attributes to a data.frame that isn't the result of a call to \code{tabyl} but -meets the requirements of a two-way tabyl: 1) First column contains values of -variable 1 2) Column names 2:n are the values of variable 2 3) Numeric values -in columns 2:n are counts of the co-occurrences of the two variables.* -\itemize{ -\item = this is the ideal form of a \code{tabyl}, but janitor's \code{adorn_} functions tolerate -and ignore non-numeric columns in positions 2:n. -} - -For instance, the result of \code{\link[dplyr:count]{dplyr::count()}} followed by \code{\link[tidyr:pivot_wider]{tidyr::pivot_wider()}} -can be treated as a \code{tabyl}. - -The result of calling \code{\link[=tabyl]{tabyl()}} on a single variable is a special class of -one-way tabyl; this function only pertains to the two-way tabyl. -} -\examples{ -as_tabyl(mtcars) - -} diff --git a/man/chisq.test.Rd b/man/chisq.test.Rd deleted file mode 100644 index 675d948b..00000000 --- a/man/chisq.test.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/statistical_tests.R -\name{chisq.test} -\alias{chisq.test} -\alias{chisq.test.default} -\alias{chisq.test.tabyl} -\title{Apply \code{stats::chisq.test()} to a two-way tabyl} -\usage{ -chisq.test(x, ...) - -\method{chisq.test}{default}(x, y = NULL, ...) - -\method{chisq.test}{tabyl}(x, tabyl_results = TRUE, ...) -} -\arguments{ -\item{x}{a two-way tabyl, a numeric vector or a factor} - -\item{...}{other parameters passed to \code{\link[stats:chisq.test]{stats::chisq.test()}}} - -\item{y}{if x is a vector, must be another vector or factor of the same length} - -\item{tabyl_results}{If \code{TRUE} and \code{x} is a tabyl object, -also return \code{observed}, \code{expected}, \code{residuals} and \code{stdres} as tabyl.} -} -\value{ -The result is the same as the one of \code{stats::chisq.test()}. -If \code{tabyl_results} is \code{TRUE}, the returned tables \code{observed}, \code{expected}, -\code{residuals} and \code{stdres} are converted to tabyls. -} -\description{ -This generic function overrides \code{stats::chisq.test}. If the passed table -is a two-way tabyl, it runs it through janitor::chisq.test.tabyl, otherwise -it just calls \code{stats::chisq.test()}. -} -\examples{ -tab <- tabyl(mtcars, gear, cyl) -chisq.test(tab) -chisq.test(tab)$residuals - -} diff --git a/man/crosstab.Rd b/man/crosstab.Rd deleted file mode 100644 index 32f44354..00000000 --- a/man/crosstab.Rd +++ /dev/null @@ -1,15 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/janitor_deprecated.R -\name{crosstab} -\alias{crosstab} -\title{Generate a crosstabulation of two vectors.} -\usage{ -crosstab(...) -} -\arguments{ -\item{...}{arguments} -} -\description{ -This function is deprecated, use \code{\link[=tabyl]{tabyl(dat, var1, var2)}} instead. -} -\keyword{internal} diff --git a/man/fisher.test.Rd b/man/fisher.test.Rd deleted file mode 100644 index dd54d990..00000000 --- a/man/fisher.test.Rd +++ /dev/null @@ -1,34 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/statistical_tests.R -\name{fisher.test} -\alias{fisher.test} -\alias{fisher.test.default} -\alias{fisher.test.tabyl} -\title{Apply \code{stats::fisher.test()} to a two-way tabyl} -\usage{ -fisher.test(x, ...) - -\method{fisher.test}{default}(x, y = NULL, ...) - -\method{fisher.test}{tabyl}(x, ...) -} -\arguments{ -\item{x}{A two-way tabyl, a numeric vector or a factor} - -\item{...}{Parameters passed to \code{\link[stats:fisher.test]{stats::fisher.test()}}} - -\item{y}{if x is a vector, must be another vector or factor of the same length} -} -\value{ -The same as the one of \code{stats::fisher.test()}. -} -\description{ -This generic function overrides \code{\link[stats:fisher.test]{stats::fisher.test()}}. If the passed table -is a two-way tabyl, it runs it through \code{janitor::fisher.test.tabyl}, otherwise -it just calls \code{stats::fisher.test()}. -} -\examples{ -tab <- tabyl(mtcars, gear, cyl) -fisher.test(tab) - -} diff --git a/man/janitor_deprecated.Rd b/man/janitor_deprecated.Rd index 6b9cb5c2..c0906722 100644 --- a/man/janitor_deprecated.Rd +++ b/man/janitor_deprecated.Rd @@ -8,12 +8,8 @@ These functions have already become defunct or may be defunct as soon as the nex } \details{ \itemize{ -\item \code{\link[=adorn_crosstab]{adorn_crosstab()}} -> \code{adorn_} -\item \code{\link[=crosstab]{crosstab()}} -> \code{\link[=tabyl]{tabyl()}} \item \code{\link[=use_first_valid_of]{use_first_valid_of()}} -> \code{\link[dplyr:coalesce]{dplyr::coalesce()}} \item \code{\link[=convert_to_NA]{convert_to_NA()}} -> \code{\link[dplyr:na_if]{dplyr::na_if()}} -\item \code{\link[=add_totals_col]{add_totals_col()}} -> \code{\link[=adorn_totals]{adorn_totals(where = "col")}} -\item \code{\link[=add_totals_row]{add_totals_row()}} -> \code{\link[=adorn_totals]{adorn_totals()}} \item \code{\link[=remove_empty_rows]{remove_empty_rows()}} -> \code{\link[=remove_empty]{remove_empty("rows")}} \item \code{\link[=remove_empty_cols]{remove_empty_cols()}} -> \code{\link[=remove_empty]{remove_empty("cols")}} } diff --git a/man/remove_empty.Rd b/man/remove_empty.Rd index 66652c27..f9d14dce 100644 --- a/man/remove_empty.Rd +++ b/man/remove_empty.Rd @@ -13,8 +13,8 @@ remove_empty(dat, which = c("rows", "cols"), cutoff = 1, quiet = TRUE) value of which is provided, defaults to removing both empty rows and empty columns, declaring the behavior with a printed message.} -\item{cutoff}{Under what fraction (>0 to <=1) of non-empty rows or columns should -\code{which} be removed? Lower values keep more rows/columns, higher values drop more.} +\item{cutoff}{a row/col should have more than this fraction of non-NA values to be +retained. E.g., \code{cutoff = 0.8} means that rows/cols that are 20\% or more missing will be dropped.} \item{quiet}{Should messages be suppressed (\code{TRUE}) or printed (\code{FALSE}) indicating the summary of empty columns or rows removed?} diff --git a/man/tabyl.Rd b/man/tabyl.Rd deleted file mode 100644 index 9b4569fe..00000000 --- a/man/tabyl.Rd +++ /dev/null @@ -1,70 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/tabyl.R -\name{tabyl} -\alias{tabyl} -\alias{tabyl.default} -\alias{tabyl.data.frame} -\title{Generate a frequency table (1-, 2-, or 3-way).} -\usage{ -tabyl(dat, ...) - -\method{tabyl}{default}(dat, show_na = TRUE, show_missing_levels = TRUE, ...) - -\method{tabyl}{data.frame}(dat, var1, var2, var3, show_na = TRUE, show_missing_levels = TRUE, ...) -} -\arguments{ -\item{dat}{A \code{data.frame} containing the variables you wish to count. -Or, a vector you want to tabulate.} - -\item{...}{Additional arguments passed to methods.} - -\item{show_na}{Should counts of \code{NA} values be displayed? In a one-way tabyl, -the presence of \code{NA} values triggers an additional column showing valid percentages -(calculated excluding \code{NA} values).} - -\item{show_missing_levels}{Should counts of missing levels of factors be displayed? -These will be rows and/or columns of zeroes. Useful for keeping consistent -output dimensions even when certain factor levels may not be present in the data.} - -\item{var1}{The column name of the first variable.} - -\item{var2}{(optional) the column name of the second variable -(its values become the column names in a 2-way tabulation).} - -\item{var3}{(optional) the column name of the third variable -(a 3-way tabulation is split into a list on its values).} -} -\value{ -A \code{data.frame} with frequencies and percentages of the tabulated variable(s). -A 3-way tabulation returns a list of data frames. -} -\description{ -A fully-featured alternative to \code{table()}. Results are data.frames and can be -formatted and enhanced with janitor's family of \code{adorn_} functions. - -Specify a \code{data.frame} and the one, two, or three unquoted column names you -want to tabulate. Three variables generates a list of 2-way tabyls, -split by the third variable. - -Alternatively, you can tabulate a single variable that isn't in a \code{data.frame} -by calling \code{tabyl()} on a vector, e.g., \code{tabyl(mtcars$gear)}. -} -\examples{ - -tabyl(mtcars, cyl) -tabyl(mtcars, cyl, gear) -tabyl(mtcars, cyl, gear, am) - -# or using the \%>\% pipe -mtcars \%>\% - tabyl(cyl, gear) - -# illustrating show_na functionality: -my_cars <- rbind(mtcars, rep(NA, 11)) -my_cars \%>\% tabyl(cyl) -my_cars \%>\% tabyl(cyl, show_na = FALSE) - -# Calling on a single vector not in a data.frame: -val <- c("hi", "med", "med", "lo") -tabyl(val) -} diff --git a/man/top_levels.Rd b/man/top_levels.Rd deleted file mode 100644 index 821c02c9..00000000 --- a/man/top_levels.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/top_levels.R -\name{top_levels} -\alias{top_levels} -\title{Generate a frequency table of a factor grouped into top-n, bottom-n, and all -other levels.} -\usage{ -top_levels(input_vec, n = 2, show_na = FALSE) -} -\arguments{ -\item{input_vec}{The factor variable to tabulate.} - -\item{n}{Number of levels to include in top and bottom groups} - -\item{show_na}{Should cases where the variable is \code{NA} be shown?} -} -\value{ -A \code{data.frame} (actually a \code{tbl_df}) with the frequencies of the -grouped, tabulated variable. Includes counts and percentages, and valid -percentages (calculated omitting \code{NA} values, if present in the vector and -\code{show_na = TRUE}.) -} -\description{ -Get a frequency table of a factor variable, grouped into categories by level. -} -\examples{ -top_levels(as.factor(mtcars$hp), 2) -} diff --git a/man/untabyl.Rd b/man/untabyl.Rd deleted file mode 100644 index aa8da644..00000000 --- a/man/untabyl.Rd +++ /dev/null @@ -1,24 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/as_and_untabyl.R -\name{untabyl} -\alias{untabyl} -\title{Remove \code{tabyl} attributes from a data.frame.} -\usage{ -untabyl(dat) -} -\arguments{ -\item{dat}{a \code{data.frame} of class \code{tabyl}.} -} -\value{ -the same \code{data.frame}, but without the \code{tabyl} class and attributes. -} -\description{ -Strips away all \code{tabyl}-related attributes from a data.frame. -} -\examples{ - -mtcars \%>\% - tabyl(am) \%>\% - untabyl() \%>\% - attributes() # tabyl-specific attributes are gone -} diff --git a/tests/testthat/test-adorn-ns.R b/tests/testthat/test-adorn-ns.R deleted file mode 100644 index ebc28770..00000000 --- a/tests/testthat/test-adorn-ns.R +++ /dev/null @@ -1,244 +0,0 @@ -source_an <- tibble::tibble( - x = c(rep("a", 500), "b", "b", "c", "d"), - y = rep(c(0, 0, 0, 0, 0, 1), 84) -) %>% - tabyl(x, y) - - -test_that("spacing is correct", { - expect_equal( - source_an %>% - adorn_totals() %>% - adorn_percentages("all") %>% - adorn_pct_formatting() %>% - adorn_ns() %>% - untabyl(), - data.frame( - x = c(letters[1:4], "Total"), - `0` = c("82.7% (417)", "0.4% (2)", "0.2% (1)", "0.0% (0)", "83.3% (420)"), - `1` = c("16.5% (83)", "0.0% (0)", "0.0% (0)", "0.2% (1)", "16.7% (84)"), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) -}) - -test_that("front parameter works", { - expect_equal( - source_an %>% - adorn_totals() %>% - adorn_percentages("all") %>% - adorn_pct_formatting() %>% - adorn_ns("front") %>% - untabyl(), - data.frame( - x = c(letters[1:4], "Total"), - `0` = c("417 (82.7%)", "2 (0.4%)", "1 (0.2%)", "0 (0.0%)", "420 (83.3%)"), - `1` = c("83 (16.5%)", "0 (0.0%)", "0 (0.0%)", "1 (0.2%)", "84 (16.7%)"), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) -}) - -test_that("bad inputs are caught", { - expect_error(mtcars %>% adorn_ns(), - "argument \"ns\" cannot be null; if not calling adorn_ns() on a data.frame of class \"tabyl\", pass your own value for ns", - fixed = TRUE - ) - expect_error( - mtcars %>% tabyl(am, cyl) %>% adorn_ns("huh"), - "`position` must be one of \"front\" or \"rear\", not \"huh\"" - ) - expect_error( - mtcars %>% tabyl(am, cyl) %>% adorn_ns(ns = mtcars$mpg), - "if supplying a value to the ns argument, it must be of class data.frame" - ) - reg_df <- mtcars %>% tabyl(am, cyl) - wide_df <- mtcars %>% tabyl(am, cyl) - wide_df$extra <- c(10, 20) - expect_error( - adorn_ns(reg_df, ns = wide_df), - "if supplying your own data.frame of Ns to append, its dimensions must match those of the data.frame in the \"dat\" argument" - ) - expect_warning(mtcars %>% tabyl(cyl) %>% adorn_ns(), - "adorn_ns() is meant to be called on a two_way tabyl; consider combining columns of a one_way tabyl with tidyr::unite()", - fixed = TRUE - ) -}) - -test_that("attributes make it through unaltered", { - expect_equal( - attributes( - source_an %>% - adorn_totals() %>% - adorn_percentages("all") %>% - adorn_pct_formatting() %>% - adorn_ns("front") # with adorn_ns - ), - attributes( - source_an %>% - adorn_totals() %>% - adorn_percentages("all") %>% - adorn_pct_formatting() # without adorn_ns - ) - ) -}) - -test_that("works on smallest tabyls", { - expect_equal( - mtcars %>% - dplyr::slice(1) %>% - tabyl(am, cyl) %>% - dplyr::rename(new_var_name = `6`) %>% - adorn_percentages() %>% - adorn_pct_formatting() %>% - adorn_ns() %>% - untabyl(), - data.frame( - am = 1, - new_var_name = "100.0% (1)", stringsAsFactors = FALSE - ) - ) -}) - - -test_that("users can supply own Ns", { - # make tabyl with thousands, convert to Ks to append - big_tabyl <- data.frame( - a = rep(c("x", rep("y", 9)), 999), - b = rep(c("big", "big", "big", "small", "small"), 1998), - stringsAsFactors = FALSE - ) %>% - tabyl(a, b) - - custom_Ns <- big_tabyl %>% - dplyr::mutate( - big = paste0(round(big / 1000, 1), "k"), - small = paste0(round(small / 1000, 1), "k") - ) - - expect_equal( - big_tabyl %>% - adorn_percentages("col") %>% - adorn_pct_formatting() %>% - adorn_ns(ns = custom_Ns) %>% - untabyl(), - data.frame( - a = c("x", "y"), - big = c("16.7% (1k)", "83.3% (5k)"), - small = c("0.0% (0k)", "100.0% (4k)"), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("automatically invokes purrr::map when called on a 3-way tabyl", { - three <- tabyl(mtcars, cyl, am, gear) %>% - adorn_percentages() %>% - adorn_pct_formatting() - expect_equal( - adorn_ns(three), # vanilla call - purrr::map(three, adorn_ns) - ) - - # with arguments passing through - expect_equal( - adorn_ns(three, "front"), - purrr::map(three, adorn_ns, "front") - ) -}) - -test_that("non-data.frame inputs are handled", { - expect_error(adorn_ns(1:5), "adorn_ns() must be called on a data.frame or list of data.frames", fixed = TRUE) -}) - -test_that("multiple character columns in a tabyl are left untouched", { - small_with_char <- data.frame( - x = letters[1:2], - a = 1:2, - b = 3:4, - text = "text", - stringsAsFactors = FALSE - ) - expect_equal( - small_with_char %>% - adorn_percentages() %>% - dplyr::pull(text), - c("text", "text") - ) -}) - -test_that("works with tidyselect", { - simple_percs <- source_an %>% adorn_percentages() - one_adorned <- simple_percs %>% adorn_ns(, , , `1`) - expect_equal( - simple_percs[, 1:2], - one_adorned[, 1:2] - ) - expect_equal( - one_adorned[[3]], - c("0.166 (83)", "0.000 (0)", "0.000 (0)", "1.000 (1)") - ) -}) - -test_that("no message thrown on grouped df input", { - expect_silent(source_an %>% - adorn_percentages() %>% - adorn_ns()) -}) - -test_that("adorn_ns works on single column data.frame with custom Ns if tidyselect is used, #456", { - adorned_single <- mtcars %>% - tabyl(am, cyl) %>% - adorn_percentages() - adorned_single <- - adorned_single %>% - dplyr::select(a = `4`) %>% - adorn_ns(ns = dplyr::select(attr(adorned_single, "core"), a = `4`), , , , a) - expect_equal(stringr::str_sub(adorned_single$a, -4, -1), c(" (3)", " (8)")) -}) - -# This tests the display of the decimal.mark by forcing a decimal into a tabyl -# Can't happen with a natural table, but maybe someone will use adorn_ns on a homespun data.frame -test_that("formatting function works, (#444)", { - set.seed(1) - bigger_dat <- data.frame( - sex = rep(c("m", "f"), 3000), - age = round(runif(3000, 1, 102), 0) - ) - bigger_dat$age_group <- cut(bigger_dat$age, quantile(bigger_dat$age, c(0, 1 / 3, 2 / 3, 1))) - - bigger_tab <- bigger_dat %>% - tabyl(age_group, sex, show_missing_levels = FALSE) - - standard_output <- bigger_tab %>% - adorn_percentages("col") %>% - adorn_pct_formatting(digits = 1) %>% - adorn_ns(position = "front") - - # test commas in thousands place by default - expect_equal( - standard_output$f, - c("1,018 (33.9%)", "990 (33.0%)", "980 (32.7%)", "12 (0.4%)") - ) - - # Test decimal mark - bigger_tab$f[1] <- 1018.5 # makes no sense in a tabyl but need for testing decimal mark display - - bigger_result <- bigger_tab %>% - untabyl() %>% # to get the decimal into the core - as_tabyl() %>% - adorn_totals(c("row", "col")) %>% - adorn_percentages("col") %>% - adorn_pct_formatting(digits = 1) %>% - adorn_ns(position = "rear", format_func = function(x) format(x, big.mark = ".", decimal.mark = ",")) - - expect_equal( - bigger_result$f, - c( - "33.9% (1.018,5)", "33.0% (990,0)", "32.7% (980,0)", "0.4% (12,0)", - "100.0% (3.000,5)" - ) - ) -}) diff --git a/tests/testthat/test-adorn-pct-formatting.R b/tests/testthat/test-adorn-pct-formatting.R deleted file mode 100644 index 2bc461c9..00000000 --- a/tests/testthat/test-adorn-pct-formatting.R +++ /dev/null @@ -1,196 +0,0 @@ -source1 <- mtcars %>% - tabyl(cyl, am) %>% - adorn_percentages() - -test_that("calculations are accurate", { - expect_equal( - untabyl(adorn_pct_formatting(source1)), # default parameter is denom = "row" - data.frame( - cyl = c(4, 6, 8), - `0` = c("27.3%", "57.1%", "85.7%"), - `1` = c("72.7%", "42.9%", "14.3%"), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) -}) - -test_that("data.frames with no numeric columns beyond the first cause failure", { - expect_error( - adorn_pct_formatting(data.frame(a = 1:2, b = c("hi", "lo"))), - "at least one targeted column must be of class numeric", - fixed = TRUE - ) -}) - -dat <- data.frame(Operation = c("Login", "Posted", "Deleted"), `Total Count` = c(5, 25, 40), check.names = FALSE) - -test_that("works with a single numeric column per #89", { - expect_equal( - dat %>% adorn_percentages("col") %>% untabyl(), - data.frame( - Operation = c("Login", "Posted", "Deleted"), - `Total Count` = c(5 / 70, 25 / 70, 40 / 70), - check.names = FALSE - ) - ) -}) - -test_that("works with totals row", { - expect_equal( - dat %>% adorn_totals("row") %>% adorn_percentages("col") %>% untabyl(), - data.frame( - Operation = c("Login", "Posted", "Deleted", "Total"), - `Total Count` = c(5 / 70, 25 / 70, 40 / 70, 1), - check.names = FALSE, stringsAsFactors = FALSE - ) - ) -}) - -test_that("works with one-way tabyl", { - expect_equal( - mtcars %>% - tabyl(carb) %>% - adorn_pct_formatting(digits = 0) %>% - untabyl(), - data.frame( - carb = c(1:4, 6, 8), - n = c(7, 10, 3, 10, 1, 1), - percent = c("22%", "31%", "9%", "31%", "3%", "3%"), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("NAs are replaced with dashes when percentage signs are affixed", { - # NaNs from adorn_percentages, the more common case (still uncommon) - has_nans <- mtcars %>% - tabyl(carb, cyl) %>% - .[5:6, ] %>% - adorn_percentages("col") %>% - adorn_pct_formatting() %>% - untabyl() - row.names(has_nans) <- NULL - expect_equal( - has_nans, - data.frame( - carb = c(6, 8), - `4` = c("-", "-"), - `6` = c("100.0%", "0.0%"), - `8` = c("0.0%", "100.0%"), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) - - # NAs convert to - - has_nas <- data.frame(a = c("big", "little"), x = c(0.1, 0.123), y = c(0.98, NA), stringsAsFactors = FALSE) - expect_equal( - adorn_pct_formatting(has_nas), - data.frame(a = c("big", "little"), x = c("10.0%", "12.3%"), y = c("98.0%", "-"), stringsAsFactors = FALSE) - ) -}) - -test_that("NAs are replaced with dashes - no percentage signs affixed", { - # NaNs from adorn_percentages, the more common case (still uncommon) - has_nans <- mtcars %>% - tabyl(carb, cyl) %>% - .[5:6, ] %>% - adorn_percentages("col") %>% - adorn_pct_formatting(affix_sign = FALSE) %>% - untabyl() - row.names(has_nans) <- NULL - expect_equal( - has_nans, - data.frame( - carb = c(6, 8), - `4` = c("-", "-"), - `6` = c("100.0", "0.0"), - `8` = c("0.0", "100.0"), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) - - # NAs convert to - symbol - has_nas <- data.frame(a = c("big", "little"), x = c(0.1, 0.123), y = c(0.98, NA), stringsAsFactors = FALSE) - expect_equal( - adorn_pct_formatting(has_nas, affix_sign = FALSE), - data.frame(a = c("big", "little"), x = c("10.0", "12.3"), y = c("98.0", "-"), stringsAsFactors = FALSE) - ) -}) - - -test_that("bad rounding argument caught", { - expect_error( - dat %>% - adorn_percentages() %>% - adorn_pct_formatting(rounding = "blargh"), - "`rounding` must be one of \"half to even\" or \"half up\", not \"blargh\".", - fixed = TRUE - ) -}) - -test_that("automatically invokes purrr::map when called on a 3-way tabyl", { - three <- tabyl(mtcars, cyl, am, gear) - expect_equal( - adorn_pct_formatting(three), # vanilla call - purrr::map(three, adorn_pct_formatting) - ) - - # with arguments passing through - expect_equal( - adorn_pct_formatting(three, 2, "half up", affix_sign = FALSE), - purrr::map(three, adorn_pct_formatting, 2, "half up", FALSE) - ) -}) - -test_that("non-data.frame inputs are handled", { - expect_error(adorn_pct_formatting(1:5), "adorn_pct_formatting() must be called on a data.frame or list of data.frames", fixed = TRUE) -}) - -test_that("tidyselecting works", { - target <- data.frame( - color = c("green", "blue", "red"), - first_wave = c(1:3), - second_wave = c(4:6), - third_wave = c(3, 3, 3), - size = c("small", "medium", "large"), - stringsAsFactors = FALSE - ) %>% - adorn_percentages() - - two_cols <- target %>% - adorn_pct_formatting(, , , first_wave:second_wave) - expect_equal(two_cols$first_wave, c("12.5%", "20.0%", "25.0%")) - expect_equal(two_cols$third_wave, c(3 / 8, 3 / 10, 3 / 12)) - - expect_message( - target %>% - adorn_pct_formatting(, , , third_wave:size), - "At least one non-numeric column was specified and will not be modified." - ) - # correct behavior occurs when text columns are skipped - expect_message( - text_skipped <- target %>% - adorn_pct_formatting(., , , , c(first_wave, size)), - "At least one non-numeric column was specified and will not be modified." - ) - - expect_equal(text_skipped$first_wave, c("12.5%", "20.0%", "25.0%")) - expect_equal( - text_skipped %>% dplyr::select(-first_wave), - target %>% dplyr::select(-first_wave), - ignore_attr = TRUE - ) -}) - -test_that("decimal.mark works", { - locale_decimal_sep <- getOption("OutDec") # not sure if it's necessary to save and restore this, - # but seems safe for locale-independent testing processes - options(OutDec = ",") - expect_true( - all(grepl(",", unlist(adorn_pct_formatting(source1)[2]))) - ) - options(OutDec = locale_decimal_sep) -}) diff --git a/tests/testthat/test-adorn-percentages.R b/tests/testthat/test-adorn-percentages.R deleted file mode 100644 index 6e3caf0e..00000000 --- a/tests/testthat/test-adorn-percentages.R +++ /dev/null @@ -1,242 +0,0 @@ -source1 <- mtcars %>% - tabyl(cyl, am) - -test_that("bad input to denominator arg is caught", { - expect_error( - mtcars %>% - adorn_percentages("blargh"), - "`denominator` must be one of \"row\", \"col\", or \"all\"", - fixed = TRUE - ) -}) - -test_that("calculations are accurate", { - expect_equal( - untabyl(adorn_percentages(source1)), # default parameter is denominator = "row" - data.frame( - cyl = c(4, 6, 8), - `0` = c(3 / 11, 4 / 7, 12 / 14), - `1` = c(8 / 11, 3 / 7, 2 / 14), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) - expect_equal( - untabyl(adorn_percentages(source1, denominator = "col")), - data.frame( - cyl = c(4, 6, 8), - `0` = c(3 / 19, 4 / 19, 12 / 19), - `1` = c(8 / 13, 3 / 13, 2 / 13), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) - expect_equal( - untabyl(adorn_percentages(source1, denominator = "all")), - data.frame( - cyl = c(4, 6, 8), - `0` = c(3 / 32, 4 / 32, 12 / 32), - `1` = c(8 / 32, 3 / 32, 2 / 32), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) -}) - -source2 <- source1 %>% - adorn_totals(c("row", "col")) -test_that("calculations are correct when totals row/col doesn't match axis of computation", { - expect_equal( - untabyl(adorn_percentages(source2, denominator = "row")), - data.frame( - cyl = c(4, 6, 8, "Total"), - `0` = c(3 / 11, 4 / 7, 12 / 14, 19 / 32), - `1` = c(8 / 11, 3 / 7, 2 / 14, 13 / 32), - Total = c(1, 1, 1, 1), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) -}) - -test_that("works with totals row/col when denominator = col or all, #357", { - col_percs <- source1 %>% - adorn_totals(where = c("col", "row")) %>% - adorn_percentages(denominator = "col") - expect_equal(col_percs$Total, c(11, 7, 14, 32) / 32) - expect_equal(unname(unlist(col_percs[4, ])), c("Total", rep(1, 3))) - - # Same but for denominator = all - all_percs <- source1 %>% - adorn_totals(where = c("col", "row")) %>% - adorn_percentages(denominator = "all") - expect_equal(all_percs$Total, c(11, 7, 14, 32) / 32) - expect_equal(unname(unlist(all_percs[4, ])), unname(c("Total", colSums(source1)[2:3] / 32, 32 / 32))) - - # Now with no totals row, same two tests as preceding - col_percs_no_row <- source1 %>% - adorn_totals(where = c("col")) %>% - adorn_percentages(denominator = "col") - expect_equal(col_percs_no_row$Total, c(11, 7, 14) / 32) - - # Same but for denominator = all - all_percs_no_row <- source1 %>% - adorn_totals(where = c("col")) %>% - adorn_percentages(denominator = "all") - expect_equal(all_percs_no_row$Total, c(11, 7, 14) / 32) - - # And try one where we exempt the totals col - expect_message( - col_percs_exempted <- source1 %>% - adorn_totals(where = c("col", "row")) %>% - adorn_percentages(denominator = "col", , -Total), - regexp = "At least one non-numeric column was specified. All non-numeric columns will be removed from percentage calculations." - ) - expect_equal(col_percs_exempted$Total, c(11, 7, 14, 32)) - expect_equal(unname(unlist(col_percs_exempted[4, ])), c("Total", 1, 1, 32)) - - expect_message( - all_percs_exempted <- source1 %>% - adorn_totals(where = c("col", "row")) %>% - adorn_percentages(denominator = "all", , -Total), - regexp = "At least one non-numeric column was specified. All non-numeric columns will be removed from percentage calculations." - ) - expect_equal(all_percs_exempted$Total, c(11, 7, 14, 32)) - expect_equal(unname(unlist(all_percs_exempted[4, ])), unname(c("Total", colSums(source1)[2:3] / 32, 32))) -}) - -source2 <- source1 -source2[2, 2] <- NA - -test_that("NAs handled correctly with na.rm = TRUE", { - expect_equal( - untabyl(adorn_percentages(source2)), # row - data.frame( - cyl = c(4, 6, 8), - `0` = c(3 / 11, NA, 12 / 14), - `1` = c(8 / 11, 1, 2 / 14), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) - expect_equal( - untabyl(adorn_percentages(source2, denominator = "col")), - data.frame( - cyl = c(4, 6, 8), - `0` = c(3 / 15, NA, 12 / 15), - `1` = c(8 / 13, 3 / 13, 2 / 13), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) -}) - -test_that("NAs handled correctly with na.rm = FALSE", { - expect_equal( - untabyl(adorn_percentages(source2, na.rm = FALSE)), # row - data.frame( - cyl = c(4, 6, 8), - `0` = c(3 / 11, NA, 12 / 14), - `1` = c(8 / 11, NA, 2 / 14), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) - expect_equal( - untabyl(adorn_percentages(source2, denominator = "col", na.rm = FALSE)), - data.frame( - cyl = c(4, 6, 8), - `0` = as.numeric(c(NA, NA, NA)), - `1` = c(8 / 13, 3 / 13, 2 / 13), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) -}) - -test_that("data.frames with no numeric columns beyond the first cause failure", { - expect_error( - adorn_percentages(data.frame(a = 1:2, b = c("hi", "lo"))), - "at least one one of columns 2:n must be of class numeric" - ) -}) - -test_that("works with a single numeric column per #89", { - dat <- data.frame(Operation = c("Login", "Posted", "Deleted"), `Total Count` = c(5, 25, 40), check.names = FALSE) - expect_equal( - dat %>% adorn_percentages("col") %>% untabyl(), - data.frame( - Operation = c("Login", "Posted", "Deleted"), - `Total Count` = c(5 / 70, 25 / 70, 40 / 70), - check.names = FALSE - ) - ) -}) - -test_that("works with totals row", { - dat <- data.frame(Operation = c("Login", "Posted", "Deleted"), `Total Count` = c(5, 25, 40), check.names = FALSE) - expect_equal( - dat %>% adorn_totals("row") %>% adorn_percentages("col") %>% untabyl(), - data.frame( - Operation = c("Login", "Posted", "Deleted", "Total"), - `Total Count` = c(5 / 70, 25 / 70, 40 / 70, 1), - check.names = FALSE, stringsAsFactors = FALSE - ) - ) -}) - -test_that("automatically invokes purrr::map when called on a 3-way tabyl", { - three <- tabyl(mtcars, cyl, am, gear) - expect_equal( - adorn_percentages(three), # vanilla call - purrr::map(three, adorn_percentages) - ) - - # with arguments passing through - expect_equal( - adorn_percentages(three, "col", na.rm = FALSE), - purrr::map(three, adorn_percentages, "col", FALSE) - ) -}) - -test_that("non-data.frame inputs are handled", { - expect_error(adorn_percentages(1:5), "adorn_percentages() must be called on a data.frame or list of data.frames", fixed = TRUE) -}) - -test_that("tidyselecting works", { - target <- data.frame( - color = c("green", "blue", "red"), - first_wave = c(1:3), - second_wave = c(4:6), - third_wave = c(3, 3, 3), - size = c("small", "medium", "large"), - stringsAsFactors = FALSE - ) - two_cols <- target %>% - adorn_percentages(, , , first_wave:second_wave) - expect_equal(two_cols$first_wave, c(1 / 5, 2 / 7, 3 / 9)) - expect_equal(two_cols$third_wave, rep(3, 3)) - - expect_message( - target %>% - adorn_percentages(., "col", , c(first_wave, size)), - "At least one non-numeric column was specified. All non-numeric columns will be removed from percentage calculations." - ) - expect_message( - text_skipped <- target %>% - adorn_percentages(., "col", , c(first_wave, size)), - regexp = "At least one non-numeric column was specified. All non-numeric columns will be removed from percentage calculations." - ) - expect_equal(text_skipped$first_wave, target$first_wave / sum(target$first_wave)) - expect_equal( - text_skipped %>% dplyr::select(-first_wave), - target %>% dplyr::select(-first_wave), - ignore_attr = TRUE - ) - - # Check combination of totals and tidyselecting does not modify totals col - totaled <- target %>% - adorn_totals("col", , , , second_wave:third_wave) %>% - adorn_percentages(, , , second_wave:third_wave) - expect_equal(totaled$Total, 7:9) -}) diff --git a/tests/testthat/test-adorn-rounding.R b/tests/testthat/test-adorn-rounding.R deleted file mode 100644 index b98a025e..00000000 --- a/tests/testthat/test-adorn-rounding.R +++ /dev/null @@ -1,120 +0,0 @@ -x <- data.frame( - a = c(rep("x", 55), rep("y", 45)), - b = c(rep("x", 50), rep("y", 50)), - stringsAsFactors = FALSE -) - -# Crosstab with decimal values ending in .5 -y <- x %>% - tabyl(a, b) %>% - adorn_percentages("all") - -test_that("rounding parameter works", { - expect_equal( - y %>% - adorn_rounding(digits = 1, rounding = "half up") %>% - untabyl(), - data.frame( - a = c("x", "y"), - x = c(0.5, 0.0), - y = c(0.1, 0.5), - stringsAsFactors = FALSE - ) - ) - # Test failing on CRAN and only there - skip_on_cran() - expect_equal( - y %>% - adorn_rounding(digits = 1) %>% # default rounding: "half to even" - untabyl(), - data.frame( - a = c("x", "y"), - x = c(0.5, 0.0), - y = c(0.0, 0.4), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("digit control succeeds", { - expect_equal( - y %>% - adorn_rounding(digits = 0, rounding = "half up") %>% - untabyl(), - data.frame( - a = c("x", "y"), - x = c(1, 0), - y = c(0, 0), - stringsAsFactors = FALSE - ) - ) - expect_equal( - y %>% - adorn_rounding(digits = 2, rounding = "half up"), # shouldn't do anything given the input only having 2 decimal places - y - ) -}) - -test_that("bad rounding argument caught", { - expect_error( - y %>% - adorn_rounding(rounding = "blargh"), - "'rounding' must be one of 'half to even' or 'half up'", - fixed = TRUE - ) -}) - -test_that("works when called on a 3-way tabyl", { - triple <- mtcars %>% - tabyl(am, cyl, vs) %>% - adorn_percentages("row") - - triple_rounded_manual <- triple - triple_rounded_manual[[1]] <- adorn_rounding(triple[[1]]) - triple_rounded_manual[[2]] <- adorn_rounding(triple[[2]]) - - expect_equal( - triple %>% - adorn_rounding(), - triple_rounded_manual - ) -}) - - -test_that("tidyselecting works", { - target <- data.frame( - color = c("green", "blue", "red"), - first_wave = c(1:3), - second_wave = c(4:6), - third_wave = c(3, 3, 3), - size = c("small", "medium", "large"), - stringsAsFactors = FALSE - ) %>% - adorn_percentages() - - two_cols <- target %>% - adorn_rounding(, "half up", first_wave:second_wave) - expect_equal(two_cols$first_wave, c(.1, .2, .3)) - expect_equal(two_cols$third_wave, c(3 / 8, 3 / 10, 3 / 12)) - - expect_message( - target %>% - adorn_rounding(, , third_wave:size), - "At least one non-numeric column was specified and will not be modified." - ) - expect_message( - text_skipped <- target %>% - adorn_rounding(, , c(first_wave, size)), - "At least one non-numeric column was specified and will not be modified." - ) - expect_equal(text_skipped$first_wave, c(.1, .2, .2)) - expect_equal( - text_skipped %>% dplyr::select(-first_wave), - target %>% dplyr::select(-first_wave), - ignore_attr = TRUE - ) -}) - -test_that("non-data.frame inputs are handled", { - expect_error(adorn_rounding(1:5), "adorn_rounding() must be called on a data.frame or list of data.frames", fixed = TRUE) -}) diff --git a/tests/testthat/test-adorn-title.R b/tests/testthat/test-adorn-title.R deleted file mode 100644 index 6dab98bb..00000000 --- a/tests/testthat/test-adorn-title.R +++ /dev/null @@ -1,165 +0,0 @@ -source1 <- mtcars %>% - tabyl(gear, cyl) - -test_that("placement is correct", { - # Top - expect_equal( - source1 %>% - adorn_title() %>% - names(), - c("", "cyl", rep("", 2)) - ) - expect_equal( - source1 %>% - adorn_title() %>% - .[1, ] %>% - unlist() %>% - unname(), - c("gear", "4", "6", "8") - ) - # Combined - expect_equal( - source1 %>% - adorn_title("combined") %>% - names(), - c("gear/cyl", "4", "6", "8") - ) -}) - -test_that("name overrides work", { - expect_equal( - source1 %>% - adorn_title(row_name = "R", col_name = "C") %>% - names(), - c("", "C", rep("", 2)) - ) -}) - -test_that("non-tabyls are treated correctly", { - non_tab <- mtcars %>% - dplyr::count(gear, cyl) %>% - tidyr::pivot_wider(names_from = gear, values_from = n) - expect_error(adorn_title(non_tab), "When input is not a data.frame of class tabyl, a value must be specified for the col_name argument") - - expect_equal( - non_tab %>% adorn_title(col_name = "col") %>% names(), - c("", "col", rep("", 2)) - ) - - expect_equal( - non_tab %>% adorn_title(placement = "combined", col_name = "col") %>% names(), - c("cyl/col", 3, 4, 5) - ) - - expect_equal( - non_tab %>% adorn_title(placement = "combined", row_name = "row!", col_name = "col") %>% names(), - c("row!/col", 3, 4, 5) - ) -}) -test_that("bad inputs are caught", { - expect_error(adorn_title(1:2), - "\"dat\" must be a data.frame", - fixed = TRUE - ) - expect_error( - adorn_title(source1, - placement = "blargh" - ), - "`placement` must be one of \"top\" or \"combined\"", - fixed = TRUE - ) - expect_error( - adorn_title(source1, - row_name = 1:4 - ), - "row_name must be a string" - ) - expect_error( - adorn_title(source1, - col_name = mtcars - ), - "col_name must be a string" - ) - - # Doesn't make sense with a one-way tabyl - expect_warning( - mtcars %>% tabyl(cyl) %>% adorn_title(), - "adorn_title is meant for two-way tabyls, calling it on a one-way tabyl may not yield a meaningful result" - ) -}) - -test_that("works with non-count inputs", { - source2_base <- data.frame(sector = c("North", "South"), units = 1:2, group = c("a", "b")) - source2_tibble <- dplyr::as_tibble(source2_base) - expect_equal( - adorn_title(source2_base, col_name = "Characteristics") %>% names(), - c("", "Characteristics", "") - ) - expect_equal( - adorn_title(source2_base, col_name = "Characteristics"), - adorn_title(source2_tibble, col_name = "Characteristics") - ) -}) - -test_that("for printing purposes: tabyl class stays tabyl, data.frame stays data.frame, tibble is downgraded to data.frame", { - # right output classes with tabyl inputs - expect_equal(class(mtcars %>% tabyl(cyl, am) %>% adorn_title()), c("tabyl", "data.frame")) - expect_equal(class(mtcars %>% tabyl(gear, carb) %>% adorn_title(., "combined")), c("tabyl", "data.frame")) - - # Create tibble input: - mpg_by_cyl_and_am <- - mtcars %>% - dplyr::group_by(cyl, am) %>% - dplyr::summarise(mean_mpg = mean(mpg)) %>% - tidyr::pivot_wider(names_from = am, values_from = mean_mpg) - - # handles tibble input - expect_s3_class( - mpg_by_cyl_and_am %>% adorn_title("top", "Cylinders", "Automatic?"), - "data.frame" - ) - - # Convert columns 2:n to strings - expect_s3_class( - mpg_by_cyl_and_am %>% adorn_pct_formatting() %>% # nonsense command here, just want to convert cols 2:n into character - adorn_title("top", "Cylinders", "Automatic?"), - "data.frame" - ) - - # handles data.frame non-tabyl input - expect_s3_class( - mtcars %>% adorn_title("top", col_name = "hey look ma I'm a title"), - "data.frame" - ) -}) - -test_that("works with factors in input", { - facts <- data.frame(a = "high", large = "1", stringsAsFactors = TRUE) - # first with "top" then "combined" - expect_equal( - facts %>% adorn_title(col_name = "col"), - data.frame(a = c("a", "high"), col = c("large", "1"), stringsAsFactors = FALSE) %>% - setNames(., c("", "col")) - ) - # with combined the original column types are preserved - expect_equal( - facts %>% adorn_title("combined", col_name = "col"), - data.frame(`a/col` = "high", large = "1", stringsAsFactors = TRUE, check.names = FALSE) - ) -}) - -test_that("automatically invokes purrr::map when called on a 3-way tabyl", { - three <- tabyl(mtcars, cyl, am, gear) %>% - adorn_percentages() %>% - adorn_pct_formatting() - expect_equal( - adorn_title(three), # vanilla call - purrr::map(three, adorn_title) - ) - - # with arguments passing through, incl. custom row and col names - expect_equal( - adorn_title(three, "combined", "cyl", "am"), - purrr::map(three, adorn_title, "combined", "cyl", "am") - ) -}) diff --git a/tests/testthat/test-adorn-totals.R b/tests/testthat/test-adorn-totals.R deleted file mode 100644 index 045460be..00000000 --- a/tests/testthat/test-adorn-totals.R +++ /dev/null @@ -1,595 +0,0 @@ -dat <- data.frame( - a = factor(c(rep(c("big", "small", "big"), 3)), levels = c("small", "big")), - b = c(1:3, 1:3, 1, 1, 1) -) -ct <- dat %>% - tabyl(a, b) - -mixed <- data.frame( - a = 1:3, - b = c("x", "y", "z"), - c = 5:7, - d = c("big", "med", "small"), - stringsAsFactors = FALSE -) - -test_that("totals row is correct", { - expect_equal( - untabyl(adorn_totals(ct, "row")), - data.frame( - a = factor(c("small", "big", "Total"), levels = c("small", "big", "Total")), - `1` = c(1, 4, 5), - `2` = c(2, 0, 2), - `3` = c(0, 2, 2), - check.names = FALSE - ) - ) -}) - -test_that("totals col is correct", { - expect_equal( - untabyl(adorn_totals(ct, "col")), - data.frame( - a = factor(c("small", "big"), levels = c("small", "big")), - `1` = c(1, 4), - `2` = c(2, 0), - `3` = c(0, 2), - Total = c(3, 6), - check.names = FALSE - ) - ) -}) - -test_that("totals row and col produce correct results when called together", { - expect_equal( - ct %>% - adorn_totals(c("row", "col")) %>% - untabyl(), - data.frame( - a = factor(c("small", "big", "Total"), levels = c("small", "big", "Total")), - `1` = c(1, 4, 5), - `2` = c(2, 0, 2), - `3` = c(0, 2, 2), - Total = c(3, 6, 9), - check.names = FALSE - ) - ) -}) - -test_that("totals where='both' produce equivalent results to c('row','col')", { - expect_equal( - ct %>% - adorn_totals("both") %>% - untabyl(), - ct %>% - adorn_totals(c("row", "col")) %>% - untabyl() - ) -}) - -test_that("order doesn't matter when row and col are called together", { - expect_equal( - ct %>% - adorn_totals(c("row", "col")) %>% - untabyl(), - ct %>% - adorn_totals(c("col", "row")) %>% - untabyl() - ) -}) - -test_that("both functions work with a single column", { - single_col <- tibble::tibble( - a = c(as.Date("2016-01-01"), as.Date("2016-02-03")), - b = c(1, 2) - ) - expect_error(single_col %>% adorn_totals("row"), NA) # this method of testing passage is from http://stackoverflow.com/a/30068233 - expect_error(single_col %>% adorn_totals("col"), NA) - expect_error(single_col %>% adorn_totals(c("col", "row")), NA) -}) - -dat <- data.frame( - a = c("hi", "lo"), - b = c(1, 2), - c = c(5, 10), - d = c("big", "small"), - e = c(20, NA), - stringsAsFactors = FALSE -) - -test_that("numeric first column is ignored", { - expect_equal( - mtcars %>% - tabyl(cyl, gear) %>% - adorn_totals("col") %>% - untabyl(), - data.frame( - cyl = c(4, 6, 8), - `3` = c(1, 2, 12), - `4` = c(8, 4, 0), - `5` = c(2, 1, 2), - Total = c(11, 7, 14), - check.names = FALSE, - stringsAsFactors = FALSE - ) - ) -}) - -# create input tables for subsequent testing -ct_2 <- - mtcars %>% - dplyr::group_by(cyl, gear) %>% - dplyr::tally() %>% - tidyr::pivot_wider(names_from = gear, values_from = n) -df1 <- data.frame(x = c(1, 2), y = c(NA, 4)) - -test_that("grouped_df gets ungrouped and succeeds", { - ct_2 <- - mtcars %>% - dplyr::group_by(cyl, gear) %>% - dplyr::tally() %>% - tidyr::pivot_wider(names_from = gear, values_from = n) - expect_equal( - ct_2 %>% adorn_totals(), - ct_2 %>% dplyr::ungroup() %>% adorn_totals() - ) -}) - -test_that("na.rm value works correctly", { - expect_equal( - df1 %>% adorn_totals(c("row", "col"), na.rm = FALSE) %>% untabyl(), - data.frame( - x = c("1", "2", "Total"), - y = c(NA, 4, NA), - Total = c(NA, 4, NA), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("add_totals respects if input was data.frame", { - expect_equal( - class(df1), - class(df1 %>% adorn_totals() %>% untabyl()) - ) -}) - -test_that("add_totals respects if input was tibble", { - expect_equal( - class(df1 %>% tibble::as_tibble()), - class(df1 %>% tibble::as_tibble() %>% adorn_totals() %>% untabyl()) - ) -}) - -test_that("error thrown if no columns past first are numeric", { - df2 <- data.frame( - x = c("big", "small"), - y = c("hi", "lo") - ) - expect_error( - adorn_totals(df2, "col"), - "at least one targeted column must be of class numeric. Control target variables with the ... argument. adorn_totals should be called before other adorn_ functions." - ) - expect_error( - mixed %>% - adorn_totals("row", "-", TRUE, "Totals", d), - "at least one targeted column must be of class numeric. Control target variables with the ... argument. adorn_totals should be called before other adorn_ functions." - ) - - # Add a test where only the first column is numeric - df3 <- data.frame( - x = 1:2, - y = c("hi", "lo") - ) - expect_error( - adorn_totals(df3), - "at least one targeted column must be of class numeric. Control target variables with the ... argument. adorn_totals should be called before other adorn_ functions." - ) -}) - -test_that("bad input to where arg is caught", { - expect_error( - mtcars %>% - adorn_totals("blargh"), - paste0('"where" must be one of "row", "col", or c("row", "col")'), - fixed = TRUE - ) -}) - - -test_that("works with non-numeric columns mixed in; fill character specification", { - expect_equal( - mixed %>% adorn_totals(where = c("row", "col"), fill = "*") %>% untabyl(), - data.frame( - a = c("1", "2", "3", "Total"), - b = c("x", "y", "z", "*"), - c = c(5, 6, 7, 18), - d = c("big", "med", "small", "*"), - Total = c(5, 6, 7, 18), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("fill works with multiple factor and date columns", { - has_facs <- data.frame( - a = c("hi", "low"), - b = c("big", "small"), - c = c(as.Date("2000-01-01"), as.Date("2000-01-02")), - d = 1:2 - ) - expect_equal( - adorn_totals(has_facs, "row") %>% untabyl(), - data.frame( - a = c("hi", "low", "Total"), - b = c("big", "small", "-"), - c = c("2000-01-01", "2000-01-02", "-"), - d = 1:3, - stringsAsFactors = FALSE - ) - ) -}) - -test_that("totals attributes are assigned correctly", { - post <- adorn_totals(ct, c("row", "col")) - expect_equal(attr(post, "totals"), c("row", "col")) - expect_equal(class(post), c("tabyl", "data.frame")) - expect_equal(attr(post, "tabyl_type"), "two_way") - expect_equal(attr(post, "core"), untabyl(ct)) - - post_col <- adorn_totals(ct, "col") - expect_equal(attr(post_col, "totals"), "col") - expect_equal(class(post_col), c("tabyl", "data.frame")) - expect_equal(attr(post_col, "tabyl_type"), "two_way") - expect_equal(attr(post_col, "core"), untabyl(ct)) - - post_sequential_both <- adorn_totals(ct, "col") %>% - adorn_totals("row") - expect_equal(post_sequential_both, post, ignore_attr = TRUE) - expect_equal( - sort(attr(post, "totals")), - sort(attr(post_sequential_both, "totals")) - ) -}) - -test_that("trying to re-adorn a dimension fails", { - expect_error( - ct %>% adorn_totals("col") %>% adorn_totals("col"), - "trying to re-add a totals dimension that is already been added" - ) - expect_error( - ct %>% adorn_totals() %>% adorn_totals(), - "trying to re-add a totals dimension that is already been added" - ) -}) - -test_that("automatically invokes purrr::map when called on a 3-way tabyl", { - three <- tabyl(mtcars, cyl, am, gear) - expect_equal( - adorn_totals(three), # vanilla call - purrr::map(three, adorn_totals) - ) - - # with arguments passing through - expect_equal( - adorn_totals(three, c("row", "col"), fill = "---", na.rm = FALSE, name = "dummy_name"), - purrr::map(three, adorn_totals, c("row", "col"), fill = "---", FALSE, name = "dummy_name") - ) -}) - -test_that("non-data.frame inputs are handled", { - expect_error(adorn_totals(1:5), "adorn_totals() must be called on a data.frame or list of data.frames", fixed = TRUE) -}) - -test_that("row total name is changed", { - expect_equal( - as.character(adorn_totals(ct, name = "NewTitle")[nrow(ct) + 1, 1]), - "NewTitle" - ) -}) - -test_that("column total name is changed", { - expect_equal( - colnames(adorn_totals(ct, where = "col", name = "NewTitle"))[(ncol(ct) + 1)], - "NewTitle" - ) -}) - -test_that("tidyselecting works", { - cyl_gear <- mtcars %>% - adorn_totals(c("row", "col"), "-", TRUE, "cylgear", c(cyl, gear)) - expect_equal(cyl_gear$cylgear, c(mtcars$cyl + mtcars$gear, (sum(mtcars$cyl) + sum(mtcars$gear)))) - expect_equal( - unname(unlist(cyl_gear[33, ])), - c("cylgear", "198", rep("-", 7), "118", "-", "316") - ) - - # Can override the first column not being included - # adorn_totals() still fails if ONLY the first column is numeric, that's fine - it's a nonsensical operation - simple <- data.frame( - x = 1:2, - y = 3:4, - z = c("hi", "lo") - ) - - expect_message( - simple %>% - adorn_totals(c("row", "col"), "-", TRUE, "Total", x), - "Because the first column was specified to be totaled, it does not contain the label 'Total' (or user-specified name) in the totals row", - fixed = TRUE - ) - - expect_message( - simple_total <- simple %>% - adorn_totals(c("row", "col"), "-", TRUE, "Total", x), - regexp = "Because the first column was specified to be totaled, it does not contain the label 'Total' (or user-specified name) in the totals row", - fixed = TRUE - ) - - expect_equal(unname(unlist(simple_total[3, ])), c("3", "-", "-", "3")) - expect_equal(simple_total$Total, 1:3) - - # test that leaving out a numeric column of a tibble succeeds, #388 - expect_equal( - simple %>% - adorn_totals(, , , , y) %>% - as.data.frame(), - simple %>% - tibble::tibble() %>% - adorn_totals() %>% - as.data.frame() - ) -}) - -test_that("supplying NA to fill preserves column types", { - test_df <- data.frame( - a = c("hi", "low", "med"), - b = factor(c("big", "small", "regular")), - c = c(as.Date("2000-01-01"), as.Date("2000-01-02"), as.Date("2000-01-03")), - d = c(as.POSIXct("2000-01-01", tz = "ROK"), as.POSIXct("2000-01-02"), as.POSIXct("2000-01-03")), - e = 1:3, - f = 4:6, - g = c(TRUE, FALSE, TRUE), - h = c(7.2, 8.2, 9.2), - stringsAsFactors = FALSE - ) - - out <- adorn_totals(test_df, fill = NA) - - # expect types to be preserved - expect_type(out[["a"]], "character") - expect_s3_class(out[["b"]], "factor") - expect_s3_class(out[["c"]], "Date") - expect_s3_class(out[["d"]], "POSIXct") - expect_type(out[["g"]], "logical") - # expect factor levels to be preserved - expect_equal(levels(out[["b"]]), levels(test_df[["b"]])) - # expect NAs in total rows for non-numerics - expect_true(is.na(out[4, "b"])) - expect_true(is.na(out[4, "c"])) - expect_true(is.na(out[4, "d"])) - expect_true(is.na(out[4, "g"])) - # test values of totals - expect_equal(out[4, "a"], "Total") - expect_equal(out[4, "e"], 6) - expect_equal(out[4, "f"], 15) - expect_equal(out[4, "h"], 24.6) - # expect original df intact - expect_equal(test_df, out[1:3, ], ignore_attr = TRUE) -}) - -test_that("supplying NA as fill still works with non-character first col and numeric non-totaled cols", { - test_df <- data.frame( - a = factor(c("hi", "low", "med"), levels = c("low", "med", "hi")), - b = factor(c("big", "small", "regular")), - c = c(as.Date("2000-01-01"), as.Date("2000-01-02"), as.Date("2000-01-03")), - d = 1:3, - e = 4:6, - f = c(TRUE, FALSE, TRUE), - g = c(7.2, 8.2, 9.2), - stringsAsFactors = FALSE - ) - - out <- adorn_totals(test_df, - where = "row", - fill = NA, - na.rm = TRUE, - name = "Total", - d, e - ) - - expect_equal(out[["a"]], factor(c("hi", "low", "med", "Total"), levels = c("low", "med", "hi", "Total"))) - expect_equal(out[["g"]], c(7.2, 8.2, 9.2, NA_real_)) - expect_equal(out[4, "d"], 6) - expect_equal(out[4, "e"], 15) - expect_equal(test_df[1:3, 2:7], out[1:3, 2:7], ignore_attr = TRUE) -}) - -test_that("one_way tabyl inputs retain that class", { - expect_equal( - attr(mtcars %>% tabyl(am) %>% adorn_totals("both"), "tabyl_type"), - "one_way" - ) -}) - - -# Tests from #413, different values for row and col names -test_that("long vectors are trimmed", { - expect_equal( - mixed %>% - adorn_totals( - where = "row", - name = c("total", "something_else"), - fill = "-" - ) %>% - untabyl(), - data.frame( - a = c(as.character(1:3), "total"), - b = c("x", "y", "z", "-"), - c = c(5:7, 18), - d = c("big", "med", "small", "-"), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("row and column names are taken correctly from a vector", { - expect_equal( - mixed %>% - adorn_totals( - where = "both", - name = c("row_name", "col_name"), - fill = "-" - ) %>% - untabyl(), - data.frame( - a = c(as.character(1:3), "row_name"), - b = c("x", "y", "z", "-"), - c = c(5, 6, 7, 18), - d = c("big", "med", "small", "-"), - col_name = c(5, 6, 7, 18), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("row and column names are taken correctly from a single name", { - expect_equal( - mixed %>% - adorn_totals( - where = "both", - name = "totals", - fill = "-" - ) %>% - untabyl(), - data.frame( - a = c(as.character(1:3), "totals"), - b = c("x", "y", "z", "-"), - c = c(5, 6, 7, 18), - d = c("big", "med", "small", "-"), - totals = c(5, 6, 7, 18), - stringsAsFactors = FALSE - ) - ) -}) - -dat <- data.frame( - a = c(rep(c("big", "small", "big"), 3)), - b = c(1:3, 1:3, 1, 1, 1), - stringsAsFactors = TRUE -) -ct <- dat %>% - tabyl(a, b) - -mixed <- data.frame( - a = 1:3, - b = c("x", "y", "z"), - c = 5:7, - d = c("big", "med", "small"), - stringsAsFactors = FALSE -) - -test_that("long vectors are trimmed", { - expect_equal( - mixed %>% - adorn_totals( - where = "row", - name = c("total", "row_total"), - fill = "-" - ) %>% - untabyl(), - data.frame( - a = c(as.character(1:3), "total"), - b = c("x", "y", "z", "-"), - c = c(5:7, 18), - d = c("big", "med", "small", "-"), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("row and column names are taken correctly from a vector", { - expect_equal( - mixed %>% - adorn_totals( - where = "both", - name = c("column_totals", "row_totals"), - fill = "-" - ) %>% - untabyl(), - data.frame( - a = c(as.character(1:3), "column_totals"), - b = c("x", "y", "z", "-"), - c = c(5, 6, 7, 18), - d = c("big", "med", "small", "-"), - row_totals = c(5, 6, 7, 18), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("row and column names are taken correctly from a single name", { - expect_equal( - mixed %>% - adorn_totals( - where = "both", - name = "totals", - fill = "-" - ) %>% - untabyl(), - data.frame( - a = c(as.character(1:3), "totals"), - b = c("x", "y", "z", "-"), - c = c(5, 6, 7, 18), - d = c("big", "med", "small", "-"), - totals = c(5, 6, 7, 18), - stringsAsFactors = FALSE - ) - ) -}) - -test_that("order is maintained when first column is a factor, #494", { - o <- data.frame( - a = 1:5, - fac = factor(c("orange", "blue", "orange", "orange", "blue")), - ord = ordered( - c("huge", "medium", "small", "medium", "medium"), - levels = c("small", "medium", "huge") - ) - ) - - o_tabyl_totaled <- o %>% - tabyl(ord, a) %>% - adorn_totals("both") - - expect_equal( - attr(o_tabyl_totaled$ord, "levels"), - c("small", "medium", "huge", "Total") - ) - expect_equal( - class(o_tabyl_totaled$ord), - c("ordered", "factor") - ) - f_tabyl_totaled <- o %>% - tabyl(fac, a) %>% - adorn_totals("both") - - expect_equal( - attr(f_tabyl_totaled$fac, "levels"), - c("blue", "orange", "Total") - ) - expect_equal( - class(f_tabyl_totaled$fac), - "factor" - ) -}) - -test_that("if factor level already present, adorn_totals() still works, #529", { - factor_present <- mtcars %>% - tabyl(am, cyl) - factor_present$am <- factor(factor_present$am, levels = c("0", "1", "Total")) - expect_equal( - levels(adorn_totals(factor_present, "row")$am), - c("0", "1", "Total") - ) -}) diff --git a/tests/testthat/test-get-level-groups.R b/tests/testthat/test-get-level-groups.R deleted file mode 100644 index b95c30fd..00000000 --- a/tests/testthat/test-get-level-groups.R +++ /dev/null @@ -1,24 +0,0 @@ -# Tests the get_level_groups helper function called by top_levels() - -shorts <- factor(c("a", "b", "c", "d", "e", "f"), levels = rev(letters[1:6])) -longs <- factor(c("aaaaaaaaaaaaaaaa", "bbbbbbbbbbbbbbbbb", "cccccccccccccccccccc", "dddddddddddddddd", NA, "hhhhhhhhhhhhhhhh", "bbbbbbbbbbbbbbbbb"), levels = c("dddddddddddddddd", "aaaaaaaaaaaaaaaa", "cccccccccccccccccccc", "bbbbbbbbbbbbbbbbb", "hhhhhhhhhhhhhhhh")) - -short1 <- get_level_groups(shorts, 1, max(as.numeric(shorts), na.rm = TRUE)) -short2 <- get_level_groups(shorts, 2, max(as.numeric(shorts), na.rm = TRUE)) -short3 <- get_level_groups(shorts, 3, max(as.numeric(shorts), na.rm = TRUE)) - -test_that("names are grouped properly and groups are ordered correctly", { - expect_equal(short1, list(top = "f", mid = "e, d, c, b", bot = "a")) - expect_equal(short2, list(top = "f, e", mid = c("d, c"), bot = "b, a")) - expect_equal(short3, list(top = "f, e, d", mid = NA, bot = "c, b, a")) -}) - -long1 <- get_level_groups(longs, 1, max(as.numeric(longs), na.rm = TRUE)) -long2 <- get_level_groups(longs, 2, max(as.numeric(longs), na.rm = TRUE)) - -test_that("truncation works correctly", { - expect_equal(long1, list(top = "dddddddddddddddd", mid = "<<< Middle Group (3 categories) >>>", bot = "hhhhhhhhhhhhhhhh")) - expect_equal(long2, list(top = "dddddddddddddddd, aaaaaaaaa...", mid = "cccccccccccccccccccc", bot = "bbbbbbbbbbbbbbbbb, hhhhhhhh...")) - expect_equal(nchar(long2$top), 30) - expect_equal(nchar(long2$bot), 30) -}) diff --git a/tests/testthat/test-statistical-tests.R b/tests/testthat/test-statistical-tests.R deleted file mode 100644 index 4f0c6ddd..00000000 --- a/tests/testthat/test-statistical-tests.R +++ /dev/null @@ -1,126 +0,0 @@ -# Tests for two-way statistical tests - -# Duplicate mtcars rows to avoid chis.test warnings -mtcars3 <- rbind(mtcars, mtcars, mtcars) -tab <- table(mtcars3$am, mtcars3$cyl) -ttab <- tabyl(mtcars3, am, cyl) -ow_tab <- tabyl(mtcars3, am) - -test_that("one-way tabyl is rejected by chisq.test and fisher.test", { - expect_error(chisq.test(ow_tab)) - expect_error(fisher.test(ow_tab)) -}) - -test_that("janitor::chisq.test on a table is correct", { - res <- stats::chisq.test(tab) - jres <- janitor::chisq.test(tab) - expect_equal(jres, res) -}) - -test_that("janitor::chisq.test on a matrix is correct", { - mat <- matrix(c(151, 434, 345, 221, 145, 167), ncol = 3) - res <- stats::chisq.test(mat) - jres <- janitor::chisq.test(mat) - expect_equal(jres, res) -}) - -test_that("janitor::chisq.test on two factors is correct", { - res <- stats::chisq.test(mtcars3$am, mtcars3$cyl) - jres <- janitor::chisq.test(mtcars3$am, mtcars3$cyl) - expect_equal(jres, res) -}) - -test_that("janitor::chisq.test with a numeric vector and p is correct", { - v1 <- round(runif(10, 200, 1000)) - v2 <- round(runif(10, 200, 1000)) - res <- stats::chisq.test(v1, p = v2 / sum(v2)) - jres <- janitor::chisq.test(v1, p = v2 / sum(v2)) - expect_equal(jres, res) -}) - -test_that("janitor::fisher.test on a table is correct", { - res <- stats::fisher.test(tab) - jres <- janitor::fisher.test(tab) - expect_equal(jres, res) -}) - -test_that("janitor::fisher.test on a matrix is correct", { - mat <- matrix(c(151, 434, 345, 221, 145, 167), ncol = 3) - res <- stats::fisher.test(mat) - jres <- janitor::fisher.test(mat) - expect_equal(jres, res) -}) - -test_that("janitor::fisher.test on two vectors is correct", { - res <- stats::fisher.test(mtcars3$am, mtcars3$cyl) - jres <- janitor::fisher.test(mtcars3$am, mtcars3$cyl) - expect_equal(jres, res) -}) - -test_that("janitor::chisq.test on a two-way tabyl is identical to stats::chisq.test", { - tab <- tabyl(mtcars3, am, cyl) - tres <- chisq.test(tab, tabyl_results = FALSE) - tab <- table(mtcars3$am, mtcars3$cyl) - res <- chisq.test(tab) - expect_equal(tres, res) -}) - -test_that("janitor::fisher.test on a two-way tabyl is identical to stats::fisher.test", { - tab <- tabyl(mtcars3, am, cyl) - tres <- fisher.test(tab) - tab <- table(mtcars3$am, mtcars3$cyl) - res <- fisher.test(tab) - expect_equal(tres, res) -}) - -test_that("janitor::chisq.test returns tabyl tables", { - tres <- chisq.test(ttab, tabyl_results = TRUE) - expect_s3_class(tres$observed, "tabyl") - expect_s3_class(tres$expected, "tabyl") - expect_s3_class(tres$residuals, "tabyl") - expect_s3_class(tres$stdres, "tabyl") -}) - -test_that("returned tabyls have correct names and attributes", { - tres <- chisq.test(ttab, tabyl_results = TRUE) - expect_named(tres$observed, c("am", "4", "6", "8")) - expect_named(tres$expected, c("am", "4", "6", "8")) - expect_named(tres$residuals, c("am", "4", "6", "8")) - expect_named(tres$stdres, c("am", "4", "6", "8")) - expect_equal(tres$observed[[1]], c("0", "1")) - expect_equal(tres$expected[[1]], c("0", "1")) - expect_equal(tres$residuals[[1]], c("0", "1")) - expect_equal(tres$stdres[[1]], c("0", "1")) - expect_equal(attr(tres$observed, "var_names"), list(row = "am", col = "cyl")) - expect_equal(attr(tres$expected, "var_names"), list(row = "am", col = "cyl")) - expect_equal(attr(tres$residuals, "var_names"), list(row = "am", col = "cyl")) - expect_equal(attr(tres$stdres, "var_names"), list(row = "am", col = "cyl")) -}) - -test_that("totals are excluded from the statistical tests, #385", { - # Chi-Square - cx <- chisq.test(ttab) - cx_totals <- suppressWarnings(chisq.test(adorn_totals(ttab, "both"))) - cx_totals$data.name <- "ttab" # otherwise the test shows a mismatch, as the inputs had different names - expect_equal( - cx, - cx_totals - ) - expect_warning( - chisq.test(ttab %>% adorn_totals()), - "detected a totals row" - ) - - # Fisher - fisher <- fisher.test(ttab) - fisher_totals <- suppressWarnings(fisher.test(adorn_totals(ttab, "both"))) - fisher_totals$data.name <- "ttab" # otherwise the test shows a mismatch, as the inputs had different names - expect_equal( - fisher, - fisher_totals - ) - expect_warning( - fisher.test(ttab %>% adorn_totals()), - "detected a totals row" - ) -}) diff --git a/tests/testthat/test-tabyl-classifiers.R b/tests/testthat/test-tabyl-classifiers.R deleted file mode 100644 index f682d9e4..00000000 --- a/tests/testthat/test-tabyl-classifiers.R +++ /dev/null @@ -1,119 +0,0 @@ -# Tests tabyl class functions - -a <- mtcars %>% - tabyl(cyl, carb) - -b <- mtcars %>% - dplyr::count(cyl, carb) %>% - tidyr::pivot_wider( - names_from = carb, - values_from = n, - values_fill = 0, - names_sort = TRUE - ) %>% - as.data.frame() # for comparison purposes, remove the tbl_df aspect - - -test_that("as_tabyl works on result of a non-janitor count/pivot_wider", { - expect_equal( - as_tabyl(a), - as_tabyl(b, 2, "cyl", "carb") - ) -}) - -test_that("as_tabyl sets attributes correctly", { - d <- as_tabyl(a) - expect_equal(class(d), class(a)) - expect_equal(attr(d, "core"), untabyl(a)) - expect_equal(attr(d, "tabyl_type"), "two_way") -}) - -test_that("untabyl puts back to original form", { - expect_equal(mtcars, untabyl(as_tabyl(mtcars))) -}) - -test_that("untabyl warns if called on non-tabyl", { - expect_warning( - untabyl(mtcars), - "untabyl\\(\\) called on a non-tabyl" - ) -}) - -test_that("untabyl automatically invokes purrr::map when called on a 3-way tabyl", { - three <- tabyl(mtcars, cyl, am, gear) - expect_equal( - untabyl(three), # vanilla call - purrr::map(three, untabyl) - ) -}) - -test_that("as_tabyl is okay with non-numeric columns", { - e <- b %>% - dplyr::mutate(extra = "val") - expect_equal(attr(as_tabyl(e), "core"), e) # implied success of as_tabyl -}) - -test_that("as_tabyl fails if no numeric columns in 2:n", { - bad <- data.frame( - a = 1:2, - b = c("x", "y") - ) - expect_error(as_tabyl(bad), "at least one one of columns 2:n must be of class numeric") -}) - -test_that("bad inputs are caught", { - expect_error(as_tabyl(mtcars, 3), - "axes must be either 1 or 2", - fixed = TRUE - ) - - expect_error(as_tabyl(1:10), - "input must be a data.frame", - fixed = TRUE - ) - - # don't pass names to a 1-way tabyl - expect_error( - as_tabyl(mtcars, axes = 1, row_var_name = "foo"), - "variable names are only meaningful for two-way tabyls" - ) -}) - -test_that("adorn_totals and adorn_percentages reset the tabyl's core to reflect sorting, #407", { - unsorted <- mtcars %>% tabyl(am, cyl) - sorted <- dplyr::arrange(unsorted, desc(`4`)) - expect_equal( - sorted %>% - adorn_totals() %>% - attr(., "core"), - sorted %>% - untabyl() - ) - expect_equal( - sorted %>% - adorn_percentages() %>% - attr(., "core"), - sorted %>% - untabyl() - ) - # both: - expect_equal( - sorted %>% - adorn_totals() %>% - adorn_percentages() %>% - attr(., "core"), - sorted %>% - untabyl() - ) - # Ns with "Total" row sorted to top - the Total N should be up there too: - expect_equal( - sorted %>% - adorn_totals() %>% - adorn_percentages("col") %>% - dplyr::arrange(desc(`4`)) %>% - adorn_ns() %>% - dplyr::pull(`4`) %>% - dplyr::first(), - "1.0000000 (11)" - ) -}) diff --git a/tests/testthat/test-tabyl.R b/tests/testthat/test-tabyl.R deleted file mode 100644 index 4226e987..00000000 --- a/tests/testthat/test-tabyl.R +++ /dev/null @@ -1,507 +0,0 @@ -# Tests for data.frame renaming function - -cyl_tbl <- tabyl(mtcars$cyl) - -test_that("counts are accurate", { - expect_equal(cyl_tbl$`mtcars$cyl`, c(4, 6, 8)) - expect_equal(cyl_tbl$n, c(11, 7, 14)) -}) - -test_that("percentages are accurate", { - expect_equal(cyl_tbl$percent, c(11 / 32, 7 / 32, 14 / 32)) -}) - -# Character input, with and without NA -test_df <- data.frame(grp = c("a", "b", "b", "c"), stringsAsFactors = FALSE) -test_df_na <- data.frame(grp = c("a", "b", "b", "c", NA), stringsAsFactors = FALSE) -test_res <- tabyl(test_df$grp) -test_res_na <- tabyl(test_df_na$grp) - -test_that("names are right", { - expect_equal(names(cyl_tbl), c("mtcars$cyl", "n", "percent")) - expect_equal(names(test_res_na), c("test_df_na$grp", "n", "percent", "valid_percent")) -}) - -test_that("named vectors are handled properly", { # issue 144 - x <- c(a = "x", b = "y", c = "z") - expect_equal(names(tabyl(x))[1], "x") -}) - -test_that("NAs handled correctly", { - expect_equal(test_res_na$percent, c(0.2, 0.4, 0.2, 0.2)) - expect_equal(test_res_na$valid_percent, c(0.25, 0.5, 0.25, NA)) -}) - -test_that("show_NA = FALSE parameter works, incl. with piped input", { - resss <- test_res - names(resss)[1] <- "test_df_na$grp" - names(attr(resss, "core"))[1] <- "test_df_na$grp" - expect_equal( - resss, - tabyl(test_df_na$grp, show_na = FALSE) - ) - names(attr(resss, "core"))[1] <- "grp" - names(resss)[1] <- "grp" # for this next instance, col name changes - expect_equal( - resss, - test_df_na %>% tabyl(grp, show_na = FALSE) - ) -}) - -test_that("ordering of result by factor levels is preserved for factors", { - expect_equal(tabyl(factor(c("x", "y", "z"), levels = c("y", "z", "x")))[[1]], factor(c("y", "z", "x"), levels = c("y", "z", "x"))) -}) - -# missing factor levels shown, with and without NA -fac <- iris[["Species"]][70:80] # to get versicolor, not the first alphabetically -fac_na <- fac -fac_na[1:2] <- NA - - -test_that("missing factor levels are displayed without NA values", { - expect_equal(tabyl(fac)[[1]], factor(c("setosa", "versicolor", "virginica"), levels = c("setosa", "versicolor", "virginica"))) - expect_equal(tabyl(fac)[[2]], c(0, 11, 0)) - expect_equal(tabyl(fac)[[3]], c(0, 1, 0)) -}) - -test_that("missing factor levels are displayed with NA values", { - expect_equal(tabyl(fac_na)[[1]], factor(c("setosa", "versicolor", "virginica", NA), levels = c("setosa", "versicolor", "virginica"))) - expect_equal(tabyl(fac_na)[[2]], c(0, 9, 0, 2)) - expect_equal(tabyl(fac_na)[[3]], c(0, 9 / 11, 0, 2 / 11)) - expect_equal(tabyl(fac_na)[[4]], c(0, 1, 0, NA)) -}) - -# piping -test_that("piping in a data.frame works", { - x <- tabyl(mtcars$cyl) - names(x)[1] <- "cyl" - names(attr(x, "core"))[1] <- "cyl" - expect_equal( - x, - mtcars %>% tabyl(cyl) - ) -}) - - -test_that("column1 stays its original data type per #168, in both resulting tabyl and core", { - # test character, logical, numeric, factor X both values for show_missing_levels; confirm class in core and in main result - # do those 8 tests in a loop? - loop_df <- data.frame( - a = c(TRUE, FALSE, TRUE), - b = c("x", "y", "y"), - c = c(1, 1, 2), stringsAsFactors = FALSE - ) - for (i in c("logical", "numeric", "character")) { - for (j in c(TRUE, FALSE)) { - loop_df_temp <- loop_df - class(loop_df_temp$a) <- i - loop_tab <- loop_df_temp %>% tabyl(a, b, c, show_missing_levels = j) - expect_equal(class(loop_tab[[1]]$a), class(loop_df_temp$a)) - expect_equal(class(attr(loop_tab[[1]], "core")$a), class(loop_df_temp$a)) # check core class - } - } - loop_df$a <- factor(c("hi", "lo", "hi")) - for (j in c(TRUE, FALSE)) { - loop_df_temp <- loop_df - loop_tab <- loop_df_temp %>% tabyl(a, b, c, show_missing_levels = j) - expect_equal(class(loop_tab[[1]]$a), class(loop_df_temp$a)) - expect_equal(levels(loop_tab[[1]]$a), levels(loop_df_temp$a)) - expect_equal(class(attr(loop_tab[[1]], "core")$a), class(loop_df_temp$a)) # check core class and levels - expect_equal(levels(attr(loop_tab[[1]], "core")$a), levels(loop_df_temp$a)) - } -}) - -# bad inputs - -test_that("failure occurs when passed unsupported types", { - expect_error(tabyl(matrix(1:10, nrow = 5)), "input must be a vector of type logical, numeric, character, list, or factor") - expect_error(tabyl(complex(10)), "input must be a vector of type logical, numeric, character, list, or factor") -}) - -test_that("bad input variable name is preserved", { - expect_equal( - mtcars %>% dplyr::mutate(`bad name` = cyl) %>% tabyl(`bad name`) %>% names() %>% .[[1]], - "bad name" - ) - k <- mtcars %>% dplyr::mutate(`bad name` = cyl) - expect_equal( - tabyl(k$`bad name`) %>% names() %>% .[[1]], - "k$`bad name`" - ) -}) - - -test_that("input variable names 'percent' and 'n' are handled", { - a <- mtcars %>% tabyl(mpg) - expect_equal( - a %>% tabyl(percent), - as_tabyl( - data.frame( - percent = c(1 / 32, 2 / 32), - n = c(18, 7), - percent_percent = c(18 / 25, 7 / 25) - ), - 1 - ) - ) - expect_equal( - a %>% tabyl(n), - as_tabyl( - data.frame( - n = 1:2, - n_n = c(18, 7), - percent = c(18 / 25, 7 / 25) - ), - 1 - ) - ) -}) - -test_that("bizarre combination of %>%, quotes, and spaces in names is handled", { - dat <- data.frame( - `The candidate(s) applied directly to my school` = c("a", "b", "a", "b"), - check.names = FALSE, - stringsAsFactors = FALSE - ) - - expect_equal( - tabyl(dat$`The candidate(s) applied directly to my school` %>% gsub("hi", "there", .)) %>% - names() %>% - .[1], - "dat$`The candidate(s) applied directly to my school` %>% gsub(\"hi\", \"there\", .)" - ) -}) - -test_that("grouped data.frame inputs are handled (#125)", { - expect_equal( - mtcars %>% dplyr::group_by(cyl) %>% tabyl(carb, gear), - mtcars %>% tabyl(carb, gear) - ) -}) - - -test_that("if called on non-existent vector, returns useful error message", { - expect_error(tabyl(mtcars$moose), "object mtcars\\$moose not found") - expect_error(tabyl(moose), "object 'moose' not found") - expect_error(mtcars %>% tabyl(moose)) -}) - -test_that("if called on data.frame with no or irregular columns specified, returns informative error message", { - expect_error(tabyl(mtcars), "if calling on a data.frame, specify unquoted column names(s) to tabulate. Did you mean to call tabyl() on a vector?", - fixed = TRUE - ) - expect_error(tabyl(mtcars, var2 = am), - "please specify var1 OR var1 & var2 OR var1 & var2 & var3", - fixed = TRUE - ) -}) - -test_that("fails if called on a non-data.frame list", { # it's not meant to do this and result will likely be garbage, so fail - L <- list(a = 1, b = "rstats") - expect_error(tabyl(L), - "tabyl() is meant to be called on vectors and data.frames; convert non-data.frame lists to one of these types", - fixed = TRUE - ) -}) - -# showing missing factor levels - -test_that("show_missing_levels parameter works", { - z <- structure( - list( - a = structure(1, .Label = c("hi", "lo"), class = "factor"), - b = structure(2, .Label = c("big", "small"), class = "factor"), - new = structure(1, .Label = c("lvl1", "lvl2"), class = "factor") - ), - row.names = c(NA, -1L), class = c("tbl_df", "tbl", "data.frame"), - .Names = c("a", "b", "new") - ) - - expect_equal( - z %>% tabyl(a, b, new, show_missing_levels = TRUE), - list(lvl1 = data.frame( - a = c("hi", "lo"), - big = c(0, 0), - small = c(1, 0), - stringsAsFactors = TRUE - ) %>% as_tabyl(2, "a", "b")) - ) - expect_equal( - z %>% tabyl(a, b, new, show_missing_levels = FALSE) %>% .[[1]], - data.frame( - a = factor("hi", levels = c("hi", "lo")), - small = c(1) - ) %>% as_tabyl(2, "a", "b") - ) - - # Works with numerics - expect_equal( - mtcars %>% tabyl(cyl, am), - data.frame( - cyl = c(4, 6, 8), - `0` = c(3, 4, 12), - `1` = c(8, 3, 2), - check.names = FALSE - ) %>% as_tabyl(2, "cyl", "am") - ) -}) - -# NA handling - position and removal -# Putting this outside the following test block for later re-use -x <- data.frame( - a = c(1, 2, 2, 2, 1, 1, 1, NA, NA, 1), - b = c(rep("up", 4), rep("down", 4), NA, NA), - c = 10, - d = c(NA, 10:2), - stringsAsFactors = FALSE -) - -test_that("NA levels get moved to the last column in the data.frame, are suppressed properly", { - y <- tabyl(x, a, b) %>% - untabyl() - expect_equal( - y, - data.frame( - a = c(1, 2, NA), - down = c(3, 0, 1), - up = c(1, 3, 0), - NA_ = c(1, 0, 1) - ) - ) - - expect_equal( - tabyl(x, a, b, show_na = FALSE) %>% - untabyl(), - data.frame( - a = c(1, 2), - down = c(3, 0), - up = c(1, 3) - ) - ) - - # one-way suppression - expect_equal( - tabyl(x$a, show_na = FALSE) %>% - untabyl(), - data.frame( - `x$a` = 1:2, - n = c(5, 3), - percent = c(0.625, 0.375), - check.names = FALSE - ) - ) - - # NA level is shown in 3 way split - y <- x %>% tabyl(c, a, b, show_missing_levels = FALSE) - expect_equal(length(y), 3) - expect_equal(names(y), c("down", "up", "NA_")) - expect_equal( - y[["NA_"]], # column c remains numeric - x %>% - dplyr::filter(is.na(b)) %>% - tabyl(c, a) - ) - - y_with_missing <- x %>% tabyl(c, a, b, show_missing_levels = TRUE) - expect_equal(length(y_with_missing), 3) - expect_equal(names(y_with_missing), c("down", "up", "NA_")) - expect_equal( - y_with_missing[["NA_"]] %>% untabyl(), # column c remains numeric - data.frame(c = 10, `1` = 1, `2` = 0, NA_ = 1, check.names = FALSE) - ) - # If no NA in 3rd variable, it doesn't appear in split list - expect_equal(length(dplyr::starwars %>% - dplyr::filter(species == "Human") %>% - tabyl(eye_color, skin_color, gender, show_missing_levels = TRUE)), 2) - - # If there is NA, it does appear in split list - expect_equal(length(dplyr::starwars %>% - tabyl(eye_color, skin_color, gender, show_missing_levels = TRUE)), 3) - expect_equal(length(dplyr::starwars %>% - tabyl(eye_color, skin_color, gender, show_missing_levels = FALSE)), 3) - - # NA level in the list gets suppressed if show_na = FALSE. Should have one less level if NA is suppressed. - expect_equal(length(dplyr::starwars %>% - tabyl(eye_color, skin_color, gender, show_na = TRUE)), 3) - expect_equal(length(dplyr::starwars %>% - tabyl(eye_color, skin_color, gender, show_na = FALSE)), 2) -}) - -test_that("tabyl fill 0s with show_missing_levels = FALSE", { - res <- x %>% tabyl(a, b, show_missing_levels = FALSE) - got <- data.frame(a = c(1, 2, NA), down = c(3L, 0L, 1L), up = c(1L, 3L, 0L), NA_ = c(1L, 0L, 1L)) %>% - structure( - class = c("tabyl", "data.frame"), - core = data.frame(a = c(1, 2, NA), down = c(3L, 0L, 1L), up = c(1L, 3L, 0L), NA_ = c(1L, 0L, 1L)), - tabyl_type = "two_way", - var_names = list(row = "a", col = "b") - ) - expect_equal(res, got) -}) - -test_that("zero-row and fully-NA inputs are handled", { - zero_vec <- character(0) - expect_equal(nrow(tabyl(zero_vec)), 0) - expect_equal(names(tabyl(zero_vec)), c("zero_vec", "n", "percent")) - - zero_df <- data.frame(a = character(0), b = character(0)) - expect_message( - expect_equal(nrow(tabyl(zero_df, a, b)), 0) - ) - expect_message( - expect_equal(names(tabyl(zero_df, a, b)), "a"), - "No records to count so returning a zero-row tabyl" - ) - - all_na_df <- data.frame(a = c(NA, NA), b = c(NA_character_, NA_character_)) - expect_message( - expect_equal(tabyl(all_na_df, a, b, show_na = FALSE) %>% nrow(), 0) - ) - expect_message( - expect_equal(tabyl(all_na_df, a, b, show_na = FALSE) %>% names(), "a"), - "No records to count so returning a zero-row tabyl" - ) -}) - -test_that("print.tabyl prints without row numbers", { - expect_equal( - mtcars %>% tabyl(am, cyl) %>% capture.output(), - c(" am 4 6 8", " 0 3 4 12", " 1 8 3 2") - ) -}) - -test_that("the dplyr warning suggesting forcats::fct_explicit_na that is generated by a tabyl of a factor with NA values is caught ", { - # leaving this in as I'd want to know if it ever gets loud again, but the warning seems to be gone in - # dplyr 1.0.0 and I have removed the withCallingHandlers({}) code in tabyl() that this was testing - expect_silent( - tabyl(factor(c("a", "b", NA))) - ) - xx <- data.frame( - a = factor(c("a", "b", NA)), - b = 1:3 - ) - expect_silent(xx %>% - tabyl(a, b)) -}) - -test_that("3-way tabyl with 3rd var factor is listed in right order, #250", { - z <- mtcars - z$cyl <- factor(z$cyl, levels = c(4, 8, 6)) - expect_equal(names(tabyl(z, am, gear, cyl)), c("4", "8", "6")) - z$cyl[32] <- NA - expect_equal(names(tabyl(z, am, gear, cyl)), c("4", "8", "6", "NA_")) - expect_equal(names(tabyl(z, am, gear, cyl, show_na = FALSE)), c("4", "8", "6")) - z <- z %>% dplyr::filter(!cyl %in% "4") - expect_equal(names(tabyl(z, am, gear, cyl)), c("8", "6", "NA_")) -}) - -test_that("tabyl works with label attributes (#394)", { - mt_label <- mtcars - attr(mt_label$cyl, "label") <- "Number of cyl" - tab <- tabyl(mt_label, cyl) - expect_named(tab, c("Number of cyl", "n", "percent")) - tab2 <- tabyl(mt_label, cyl, am) - expect_named(tab2, c("Number of cyl", "0", "1")) - tab3 <- tabyl(mt_label, cyl, am, vs) - expect_equal(names(tab3[[1]])[1], "Number of cyl") -}) - -test_that("tabyl works with ordered 1st variable, #386", { - mt_ordered <- mtcars - mt_ordered$cyl <- ordered(mt_ordered$cyl, levels = c("4", "8", "6")) - - ordered_3way <- mt_ordered %>% - tabyl(cyl, gear, am) - expect_equal(class(ordered_3way[[1]]$cyl), c("ordered", "factor")) # 1st col in resulting tabyl - expect_equal(class(attr(ordered_3way[[1]], "core")$cyl), c("ordered", "factor")) # 1st col in tabyl core -}) - -test_that("factor ordering of columns is correct in 2-way tabyl", { - two_factors <- data.frame( - x = factor(c("big", "small", "medium", "small"), - levels = c("small", "medium", "big") - ), - y = factor(c("hi", "hi", "hi", "lo"), - levels = c("lo", "hi") - ) - ) - expect_equal( - two_factors %>% - tabyl(x, y) %>% - names(), - c("x", "lo", "hi") - ) -}) - -test_that("empty strings converted to _emptystring", { - mt_empty <- mtcars - mt_empty$cyl[1:2] <- c("", NA_character_) - expect_equal( - mt_empty %>% - tabyl(am, cyl) %>% - names(), - c("am", "4", "6", "8", "emptystring_", "NA_") - ) -}) - -test_that("3way tabyls with factors in cols 1-2 are arranged correctly, #379", { - dat_3wayfactors <- data.frame( - gender = c("f", "m", "m", "f", "m"), - age_group = factor( - c("18-35", "46-55", "46-55", "36-45", ">55"), - levels = c("18-35", "36-45", "46-55", ">55") - ), - bmi_group = factor( - c("18.5 - 25", "25 - 30", "18.5 - 25", ">30", "<18.5"), - levels = c("<18.5", "18.5 - 25", "25 - 30", ">30") - ), - stringsAsFactors = TRUE - ) - - tabyl_3wf <- dat_3wayfactors %>% - tabyl(bmi_group, age_group, gender, show_missing_levels = FALSE) - - expect_equal(names(tabyl_3wf$m), c("bmi_group", "46-55", ">55")) - expect_equal( - tabyl_3wf$f[[1]], - factor( - c("18.5 - 25", ">30"), - levels = c("<18.5", "18.5 - 25", "25 - 30", ">30") - ) - ) -}) - -test_that("tabyl errors informatively called like tabyl(mtcars$cyl, mtcars$gear), #377", { - expect_error( - tabyl(mtcars$cyl, mtcars$am), - regexp = "Did you try to call tabyl on two vectors" - ) - has_logicals <- data.frame(x = 1:2, y = c(TRUE, FALSE)) - expect_error( - tabyl(has_logicals$x, has_logicals$y), - regexp = "Did you try to call tabyl on two vectors" - ) - expect_type( - has_logicals %>% - tabyl(x, y), - "list" - ) -}) - -test_that("2-way tabyl with numeric column names is sorted numerically", { - df <- data.frame(var1 = c(1:11), var2 = c(NA, 10:1)) - expect_equal(colnames(df %>% tabyl(var1, var2)), c("var1", 1:10, "NA_")) -}) - -test_that("3-way tabyl with numeric names is sorted numerically", { - expect_equal( - names(mtcars %>% tabyl(gear, cyl, hp)), - as.character(sort(unique(mtcars$hp))) - ) - - # Check putting NA last - data.frame "x" is created way above - expect_equal( - names(x %>% tabyl(a, c, d)), - c(2:10, "NA_") - ) -}) diff --git a/tests/testthat/test-top-levels.R b/tests/testthat/test-top-levels.R deleted file mode 100644 index f90b0c19..00000000 --- a/tests/testthat/test-top-levels.R +++ /dev/null @@ -1,63 +0,0 @@ -fac <- factor(c("a", "b", "c", "d", "e", "f", "f"), levels = rev(letters[1:6])) -fac_odd_lvls <- factor(fac, levels = rev(letters[1:5])) - -# more tests - group names and ordering - are in test-get-level-groups.R -test_that("top_levels values are correct", { - expect_equal(top_levels(fac)[[3]], c(3 / 7, 2 / 7, 2 / 7)) # default n = 2, num_levels = 6 - expect_equal(top_levels(fac)[[2]], c(3, 2, 2)) - expect_equal(top_levels(fac, 3)[[3]], c(4 / 7, 3 / 7)) # n = 3, num_levels = 6 - expect_equal(top_levels(fac, 3)[[2]], c(4, 3)) - expect_equal(top_levels(fac_odd_lvls)[[2]], c(2, 1, 2)) # default n = 2, num_levels = 5 - expect_equal(top_levels(fac_odd_lvls)[[3]], c(0.4, 0.2, 0.4)) - expect_equal(top_levels(fac_odd_lvls, 1)[[2]], c(1, 3, 1)) # n = 1, num_levels = 5 - expect_equal(top_levels(fac_odd_lvls, 1)[[3]], c(0.2, 0.6, 0.2)) -}) - -test_that("top_levels missing levels are represented", { - x <- as.factor(letters[1:5])[1:3] - expect_equal( - top_levels(x)[[1]], - structure(1:3, .Label = c("a, b", "c", "d, e"), class = "factor") - ) - expect_equal( - top_levels(x)[[2]], - c(2, 1, 0) - ) -}) - - -test_that("top_levels NA results are treated appropriately", { - fac_na <- fac - fac_na[7] <- NA - expect_equal(top_levels(fac_na)[[2]], rep(2, 3)) - expect_equal(top_levels(fac_na, show_na = TRUE)[[2]], c(2, 2, 2, 1)) - expect_equal(top_levels(fac_na, show_na = TRUE)[[3]], c(2 / 7, 2 / 7, 2 / 7, 1 / 7)) - expect_equal(top_levels(fac_na, show_na = TRUE)[[4]], c(1 / 3, 1 / 3, 1 / 3, NA)) -}) - -test_that("top_levels default n parameter works", { - expect_equal(top_levels(fac), top_levels(fac, 2)) -}) - -test_that("top_levels missing levels are treated appropriately", { - fac_missing_lvl <- fac - fac_missing_lvl[2] <- NA - expect_equal(top_levels(fac_missing_lvl)[[2]], c(3, 2, 1)) -}) - -test_that("top_levels bad type inputs are handled", { - expect_error(top_levels(c(0, 1), "factor_vec is not of type 'factor'")) - expect_error(top_levels(c("hi", "lo"), "factor_vec is not of type 'factor'")) - expect_error(top_levels(mtcars, "factor_vec is not of type 'factor'")) -}) - -test_that("top_levels bad n value is handled", { - expect_error(top_levels(fac, 4)) - expect_error(top_levels(fac_odd_lvls, 3)) - expect_error(top_levels(fac, 0)) - expect_error(top_levels(factor(c("a", "b"))), "input factor variable must have at least 3 levels") -}) - -test_that("top_levels correct variable name assigned to first column of result", { - expect_equal(names(top_levels(fac))[1], "fac") -}) diff --git a/vignettes/tabyls.Rmd b/vignettes/tabyls.Rmd deleted file mode 100644 index 31908572..00000000 --- a/vignettes/tabyls.Rmd +++ /dev/null @@ -1,271 +0,0 @@ ---- -title: "tabyls: a tidy, fully-featured approach to counting things" -date: '`r Sys.Date()`' -output: - rmarkdown::github_document -vignette: > - %\VignetteIndexEntry{tabyls} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteEncoding{UTF-8} ---- - -```{r chunk_options, include = FALSE} -knitr::opts_chunk$set(collapse = TRUE, comment = "#>") -``` - -## Motivation: why tabyl? - -Analysts do a lot of counting. Indeed, it's been said that "data science is mostly counting things." But the base R function for counting, `table()`, leaves much to be desired: - -- It doesn't accept data.frame inputs (and thus doesn't play nicely with the `%>%` pipe) -- It doesn't output data.frames -- Its results are hard to format. Compare the look and formatting choices of an R table to a Microsoft Excel PivotTable or even the table formatting provided by SPSS. - -`tabyl()` is an approach to tabulating variables that addresses these shortcomings. It's part of the janitor package because counting is such a fundamental part of data cleaning and exploration. - -`tabyl()` is tidyverse-aligned and is primarily built upon the dplyr and tidyr packages. - -## How it works - -On its surface, `tabyl()` produces frequency tables using 1, 2, or 3 variables. Under the hood, `tabyl()` also attaches a copy of these counts as an attribute of the resulting data.frame. - -The result looks like a basic data.frame of counts, but because it's also a `tabyl` containing this metadata, you can use `adorn_` functions to add additional information and pretty formatting. - -The `adorn_` functions are built to work on `tabyls`, but have been adapted to work with similar, non-tabyl data.frames that need formatting. - -# Examples -This vignette demonstrates `tabyl` in the context of studying humans in the `starwars` dataset from dplyr: -```{r clean_starwars, warning = FALSE, message = FALSE} -library(dplyr) -humans <- starwars %>% - filter(species == "Human") -``` - - -## One-way tabyl - -Tabulating a single variable is the simplest kind of tabyl: - -```{r one_way, message=FALSE} -library(janitor) - -t1 <- humans %>% - tabyl(eye_color) - -t1 -``` - - -When `NA` values are present, `tabyl()` also displays "valid" percentages, i.e., with missing values removed from the denominator. And while `tabyl()` is built to take a data.frame and column names, you can also produce a one-way tabyl by calling it directly on a vector: - -```{r one_way_vector} -x <- c("big", "big", "small", "small", "small", NA) -tabyl(x) -``` - - -Most `adorn_` helper functions are built for 2-way tabyls, but those that make sense for a 1-way tabyl do work: -```{r one_way_adorns} -t1 %>% - adorn_totals("row") %>% - adorn_pct_formatting() -``` - - -## Two-way tabyl - -This is often called a "crosstab" or "contingency" table. Calling `tabyl` on two columns of a data.frame produces the same result as the common combination of `dplyr::count()`, followed by `tidyr::pivot_wider()` to wide form: - -```{r two_way} -t2 <- humans %>% - tabyl(gender, eye_color) - -t2 -``` - -Since it's a `tabyl`, we can enhance it with `adorn_` helper functions. For instance: - -```{r two_way_adorns} -t2 %>% - adorn_percentages("row") %>% - adorn_pct_formatting(digits = 2) %>% - adorn_ns() -``` - -Adornments have options to control axes, rounding, and other relevant formatting choices (more on that below). - -## Three-way tabyl - -Just as `table()` accepts three variables, so does `tabyl()`, producing a list of tabyls: - -```{r three_Way} -t3 <- humans %>% - tabyl(eye_color, skin_color, gender) - -# the result is a tabyl of eye color x skin color, split into a list by gender -t3 -``` - -If the `adorn_` helper functions are called on a list of data.frames - like the output of a three-way `tabyl` call - they will call `purrr::map()` to apply themselves to each data.frame in the list: - -```{r three_way_adorns, warning = FALSE, message = FALSE} -library(purrr) -humans %>% - tabyl(eye_color, skin_color, gender, show_missing_levels = FALSE) %>% - adorn_totals("row") %>% - adorn_percentages("all") %>% - adorn_pct_formatting(digits = 1) %>% - adorn_ns() %>% - adorn_title() -``` - -This automatic mapping supports interactive data analysis that switches between combinations of 2 and 3 variables. That way, if a user starts with `humans %>% tabyl(eye_color, skin_color)`, adds some `adorn_` calls, then decides to split the tabulation by gender and modifies their first line to `humans %>% tabyl(eye_color, skin_color, gender`), they don't have to rewrite the subsequent adornment calls to use `map()`. - -However, if feels more natural to call these with `map()` or `lapply()`, that is still supported. For instance, `t3 %>% lapply(adorn_percentages)` would produce the same result as `t3 %>% adorn_percentages`. - -### Other features of tabyls - -+ When called on a factor, `tabyl` will show missing levels (levels not present in the data) in the result - + This can be suppressed if not desired -+ `NA` values can be displayed or suppressed -+ `tabyls` print without displaying row numbers - -You can call `chisq.test()` and `fisher.test()` on a two-way tabyl to perform those statistical tests, just like on a base R `table()` object. - -## The `adorn_*` functions - -These modular functions build on a `tabyl` to approximate the functionality of a PivotTable in Microsoft Excel. They print elegant results for interactive analysis or for sharing in a report, e.g., with `knitr::kable()`. For example: - -```{r} -humans %>% - tabyl(gender, eye_color) %>% - adorn_totals(c("row", "col")) %>% - adorn_percentages("row") %>% - adorn_pct_formatting(rounding = "half up", digits = 0) %>% - adorn_ns() %>% - adorn_title("combined") %>% - knitr::kable() -``` - -### The adorn functions are: - -+ **`adorn_totals()`**: Add totals row, column, or both. -+ **`adorn_percentages()`**: Calculate percentages along either axis or over the entire tabyl -+ **`adorn_pct_formatting()`**: Format percentage columns, controlling the number of digits to display and whether to append the `%` symbol -+ **`adorn_rounding()`**: Round a data.frame of numbers (usually the result of `adorn_percentages`), either using the base R `round()` function or using janitor's `round_half_up()` to round all ties up ([thanks, StackOverflow](https://stackoverflow.com/a/12688836/4470365)). - + e.g., round 10.5 up to 11, consistent with Excel's tie-breaking behavior. - + This contrasts with rounding 10.5 down to 10 as in base R's `round(10.5)`. - + `adorn_rounding()` returns columns of class `numeric`, allowing for graphing, sorting, etc. It's a less-aggressive substitute for `adorn_pct_formatting()`; these two functions should not be called together. -+ **`adorn_ns()`**: add Ns to a tabyl. These can be drawn from the tabyl's underlying counts, which are attached to the tabyl as metadata, or they can be supplied by the user. -+ **`adorn_title()`**: add a title to a tabyl (or other data.frame). Options include putting the column title in a new row on top of the data.frame or combining the row and column titles in the data.frame's first name slot. - - -These adornments should be called in a logical order, e.g., you probably want to add totals before percentages are calculated. In general, call them in the order they appear above. - -## BYOt (Bring Your Own tabyl) - -You can also call `adorn_` functions on other data.frames, not only the results of calls to `tabyl()`. E.g., `mtcars %>% adorn_totals("col") %>% adorn_percentages("col")` performs as expected, despite `mtcars` not being a `tabyl`. - -This can be handy when you have a data.frame that is not a simple tabulation generated by `tabyl` but would still benefit from the `adorn_` formatting functions. - -A simple example: calculate the proportion of records meeting a certain condition, then format the results. - -```{r first_non_tabyl} -percent_above_165_cm <- humans %>% - group_by(gender) %>% - summarise(pct_above_165_cm = mean(height > 165, na.rm = TRUE), .groups = "drop") - -percent_above_165_cm %>% - adorn_pct_formatting() -``` - -You can control which columns are adorned by using the `...` argument. It accepts the [tidyselect helpers](https://r4ds.had.co.nz/transform.html#select). That is, you can specify columns the same way you would using `dplyr::select()`. - -For instance, say you have a numeric column that should not be included in percentage formatting and you wish to exempt it. Here, only the `proportion` column is adorned: - -```{r tidyselect, warning = FALSE, message = FALSE} -mtcars %>% - count(cyl, gear) %>% - rename(proportion = n) %>% - adorn_percentages("col", na.rm = TRUE, proportion) %>% - adorn_pct_formatting(, , , proportion) # the commas say to use the default values of the other arguments -``` - -Here we specify that only two consecutive numeric columns should be totaled (`year` is numeric but should not be included): - -```{r dont_total, warning = FALSE, message = FALSE} -cases <- data.frame( - region = c("East", "West"), - year = 2015, - recovered = c(125, 87), - died = c(13, 12) -) - -cases %>% - adorn_totals(c("col", "row"), fill = "-", na.rm = TRUE, name = "Total Cases", recovered:died) -``` - -Here's a more complex example that uses a data.frame of means, not counts. We create a table containing the mean of a 3rd variable when grouped by two other variables, then use `adorn_` functions to round the values and append Ns. The first part is pretty straightforward: - -```{r more_non_tabyls, warning = FALSE, message = FALSE} -library(tidyr) # for pivot_wider() -mpg_by_cyl_and_am <- mtcars %>% - group_by(cyl, am) %>% - summarise(mpg = mean(mpg), .groups = "drop") %>% - pivot_wider(names_from = am, values_from = mpg) - -mpg_by_cyl_and_am -``` - -Now to `adorn_` it. Since this is not the result of a `tabyl()` call, it doesn't have the underlying Ns stored in the `core` attribute, so we'll have to supply them: -```{r add_the_Ns} -mpg_by_cyl_and_am %>% - adorn_rounding() %>% - adorn_ns( - ns = mtcars %>% # calculate the Ns on the fly by calling tabyl on the original data - tabyl(cyl, am) - ) %>% - adorn_title("combined", row_name = "Cylinders", col_name = "Is Automatic") -``` - -If needed, Ns can be manipulated in their own data.frame before they are appended. Here a tabyl with values in the thousands has its Ns formatted to include the separating character `,` as typically seen in American numbers, e.g., `3,000`. - -First we create the tabyl to adorn: - -```{r formatted_Ns_thousands_prep} -set.seed(1) -raw_data <- data.frame( - sex = rep(c("m", "f"), 3000), - age = round(runif(3000, 1, 102), 0) -) -raw_data$agegroup <- cut(raw_data$age, quantile(raw_data$age, c(0, 1 / 3, 2 / 3, 1))) - -comparison <- raw_data %>% - tabyl(agegroup, sex, show_missing_levels = FALSE) %>% - adorn_totals(c("row", "col")) %>% - adorn_percentages("col") %>% - adorn_pct_formatting(digits = 1) - -comparison -``` - -At this point, the Ns are unformatted: -```{r adorn_ns_unformatted} -comparison %>% - adorn_ns() -``` - -Now we format them to insert the thousands commas. A tabyl's raw Ns are stored in its `"core"` attribute. Here we retrieve those with `attr()`, then apply the base R function `format()` to all numeric columns. Lastly, we append these Ns using `adorn_ns()`. - -```{r formatted_Ns_thousands} -formatted_ns <- attr(comparison, "core") %>% # extract the tabyl's underlying Ns - adorn_totals(c("row", "col")) %>% # to match the data.frame we're appending to - dplyr::mutate(across(where(is.numeric), ~ format(.x, big.mark = ","))) - -comparison %>% - adorn_ns(position = "rear", ns = formatted_ns) -``` - -### Questions? Comments? - -File [an issue on GitHub](https://github.com/sfirke/janitor/issues) if you have suggestions related to `tabyl()` and its `adorn_` helpers or encounter problems while using them. diff --git a/vignettes/tabyls.md b/vignettes/tabyls.md deleted file mode 100644 index ea526931..00000000 --- a/vignettes/tabyls.md +++ /dev/null @@ -1,454 +0,0 @@ -tabyls: a tidy, fully-featured approach to counting things -================ -2023-02-03 - -## Motivation: why tabyl? - -Analysts do a lot of counting. Indeed, it’s been said that “data science -is mostly counting things.” But the base R function for counting, -`table()`, leaves much to be desired: - -- It doesn’t accept data.frame inputs (and thus doesn’t play nicely with - the `%>%` pipe) -- It doesn’t output data.frames -- Its results are hard to format. Compare the look and formatting - choices of an R table to a Microsoft Excel PivotTable or even the - table formatting provided by SPSS. - -`tabyl()` is an approach to tabulating variables that addresses these -shortcomings. It’s part of the janitor package because counting is such -a fundamental part of data cleaning and exploration. - -`tabyl()` is tidyverse-aligned and is primarily built upon the dplyr and -tidyr packages. - -## How it works - -On its surface, `tabyl()` produces frequency tables using 1, 2, or 3 -variables. Under the hood, `tabyl()` also attaches a copy of these -counts as an attribute of the resulting data.frame. - -The result looks like a basic data.frame of counts, but because it’s -also a `tabyl` containing this metadata, you can use `adorn_` functions -to add additional information and pretty formatting. - -The `adorn_` functions are built to work on `tabyls`, but have been -adapted to work with similar, non-tabyl data.frames that need -formatting. - -# Examples - -This vignette demonstrates `tabyl` in the context of studying humans in -the `starwars` dataset from dplyr: - -``` r -library(dplyr) -humans <- starwars %>% - filter(species == "Human") -``` - -## One-way tabyl - -Tabulating a single variable is the simplest kind of tabyl: - -``` r -library(janitor) - -t1 <- humans %>% - tabyl(eye_color) - -t1 -#> eye_color n percent -#> blue 12 0.34285714 -#> blue-gray 1 0.02857143 -#> brown 17 0.48571429 -#> dark 1 0.02857143 -#> hazel 2 0.05714286 -#> yellow 2 0.05714286 -``` - -When `NA` values are present, `tabyl()` also displays “valid” -percentages, i.e., with missing values removed from the denominator. And -while `tabyl()` is built to take a data.frame and column names, you can -also produce a one-way tabyl by calling it directly on a vector: - -``` r -x <- c("big", "big", "small", "small", "small", NA) -tabyl(x) -#> x n percent valid_percent -#> big 2 0.3333333 0.4 -#> small 3 0.5000000 0.6 -#> 1 0.1666667 NA -``` - -Most `adorn_` helper functions are built for 2-way tabyls, but those -that make sense for a 1-way tabyl do work: - -``` r -t1 %>% - adorn_totals("row") %>% - adorn_pct_formatting() -#> eye_color n percent -#> blue 12 34.3% -#> blue-gray 1 2.9% -#> brown 17 48.6% -#> dark 1 2.9% -#> hazel 2 5.7% -#> yellow 2 5.7% -#> Total 35 100.0% -``` - -## Two-way tabyl - -This is often called a “crosstab” or “contingency” table. Calling -`tabyl` on two columns of a data.frame produces the same result as the -common combination of `dplyr::count()`, followed by -`tidyr::pivot_wider()` to wide form: - -``` r -t2 <- humans %>% - tabyl(gender, eye_color) - -t2 -#> gender blue blue-gray brown dark hazel yellow -#> feminine 3 0 5 0 1 0 -#> masculine 9 1 12 1 1 2 -``` - -Since it’s a `tabyl`, we can enhance it with `adorn_` helper functions. -For instance: - -``` r - -t2 %>% - adorn_percentages("row") %>% - adorn_pct_formatting(digits = 2) %>% - adorn_ns() -#> gender blue blue-gray brown dark hazel yellow -#> feminine 33.33% (3) 0.00% (0) 55.56% (5) 0.00% (0) 11.11% (1) 0.00% (0) -#> masculine 34.62% (9) 3.85% (1) 46.15% (12) 3.85% (1) 3.85% (1) 7.69% (2) -``` - -Adornments have options to control axes, rounding, and other relevant -formatting choices (more on that below). - -## Three-way tabyl - -Just as `table()` accepts three variables, so does `tabyl()`, producing -a list of tabyls: - -``` r -t3 <- humans %>% - tabyl(eye_color, skin_color, gender) - -# the result is a tabyl of eye color x skin color, split into a list by gender -t3 -#> $feminine -#> eye_color dark fair light pale tan white -#> blue 0 2 1 0 0 0 -#> blue-gray 0 0 0 0 0 0 -#> brown 0 1 4 0 0 0 -#> dark 0 0 0 0 0 0 -#> hazel 0 0 1 0 0 0 -#> yellow 0 0 0 0 0 0 -#> -#> $masculine -#> eye_color dark fair light pale tan white -#> blue 0 7 2 0 0 0 -#> blue-gray 0 1 0 0 0 0 -#> brown 3 4 3 0 2 0 -#> dark 1 0 0 0 0 0 -#> hazel 0 1 0 0 0 0 -#> yellow 0 0 0 1 0 1 -``` - -If the `adorn_` helper functions are called on a list of data.frames - -like the output of a three-way `tabyl` call - they will call -`purrr::map()` to apply themselves to each data.frame in the list: - -``` r -library(purrr) -humans %>% - tabyl(eye_color, skin_color, gender, show_missing_levels = FALSE) %>% - adorn_totals("row") %>% - adorn_percentages("all") %>% - adorn_pct_formatting(digits = 1) %>% - adorn_ns %>% - adorn_title -#> $feminine -#> skin_color -#> eye_color fair light -#> blue 22.2% (2) 11.1% (1) -#> brown 11.1% (1) 44.4% (4) -#> hazel 0.0% (0) 11.1% (1) -#> Total 33.3% (3) 66.7% (6) -#> -#> $masculine -#> skin_color -#> eye_color dark fair light pale tan white -#> blue 0.0% (0) 26.9% (7) 7.7% (2) 0.0% (0) 0.0% (0) 0.0% (0) -#> blue-gray 0.0% (0) 3.8% (1) 0.0% (0) 0.0% (0) 0.0% (0) 0.0% (0) -#> brown 11.5% (3) 15.4% (4) 11.5% (3) 0.0% (0) 7.7% (2) 0.0% (0) -#> dark 3.8% (1) 0.0% (0) 0.0% (0) 0.0% (0) 0.0% (0) 0.0% (0) -#> hazel 0.0% (0) 3.8% (1) 0.0% (0) 0.0% (0) 0.0% (0) 0.0% (0) -#> yellow 0.0% (0) 0.0% (0) 0.0% (0) 3.8% (1) 0.0% (0) 3.8% (1) -#> Total 15.4% (4) 50.0% (13) 19.2% (5) 3.8% (1) 7.7% (2) 3.8% (1) -``` - -This automatic mapping supports interactive data analysis that switches -between combinations of 2 and 3 variables. That way, if a user starts -with `humans %>% tabyl(eye_color, skin_color)`, adds some `adorn_` -calls, then decides to split the tabulation by gender and modifies their -first line to `humans %>% tabyl(eye_color, skin_color, gender`), they -don’t have to rewrite the subsequent adornment calls to use `map()`. - -However, if feels more natural to call these with `map()` or `lapply()`, -that is still supported. For instance, -`t3 %>% lapply(adorn_percentages)` would produce the same result as -`t3 %>% adorn_percentages`. - -### Other features of tabyls - -- When called on a factor, `tabyl` will show missing levels (levels not - present in the data) in the result - - This can be suppressed if not desired -- `NA` values can be displayed or suppressed -- `tabyls` print without displaying row numbers - -You can call `chisq.test()` and `fisher.test()` on a two-way tabyl to -perform those statistical tests, just like on a base R `table()` object. - -## The `adorn_*` functions - -These modular functions build on a `tabyl` to approximate the -functionality of a PivotTable in Microsoft Excel. They print elegant -results for interactive analysis or for sharing in a report, e.g., with -`knitr::kable()`. For example: - -``` r -humans %>% - tabyl(gender, eye_color) %>% - adorn_totals(c("row", "col")) %>% - adorn_percentages("row") %>% - adorn_pct_formatting(rounding = "half up", digits = 0) %>% - adorn_ns() %>% - adorn_title("combined") %>% - knitr::kable() -``` - -| gender/eye_color | blue | blue-gray | brown | dark | hazel | yellow | Total | -|:-----------------|:---------|:----------|:---------|:-------|:--------|:-------|:----------| -| feminine | 33% (3) | 0% (0) | 56% (5) | 0% (0) | 11% (1) | 0% (0) | 100% (9) | -| masculine | 35% (9) | 4% (1) | 46% (12) | 4% (1) | 4% (1) | 8% (2) | 100% (26) | -| Total | 34% (12) | 3% (1) | 49% (17) | 3% (1) | 6% (2) | 6% (2) | 100% (35) | - -### The adorn functions are: - -- **`adorn_totals()`**: Add totals row, column, or both. -- **`adorn_percentages()`**: Calculate percentages along either axis or - over the entire tabyl -- **`adorn_pct_formatting()`**: Format percentage columns, controlling - the number of digits to display and whether to append the `%` symbol -- **`adorn_rounding()`**: Round a data.frame of numbers (usually the - result of `adorn_percentages`), either using the base R `round()` - function or using janitor’s `round_half_up()` to round all ties up - ([thanks, - StackOverflow](https://stackoverflow.com/a/12688836/4470365)). - - e.g., round 10.5 up to 11, consistent with Excel’s tie-breaking - behavior. - - This contrasts with rounding 10.5 down to 10 as in base R’s - `round(10.5)`. - - `adorn_rounding()` returns columns of class `numeric`, allowing for - graphing, sorting, etc. It’s a less-aggressive substitute for - `adorn_pct_formatting()`; these two functions should not be called - together. -- **`adorn_ns()`**: add Ns to a tabyl. These can be drawn from the - tabyl’s underlying counts, which are attached to the tabyl as - metadata, or they can be supplied by the user. -- **`adorn_title()`**: add a title to a tabyl (or other data.frame). - Options include putting the column title in a new row on top of the - data.frame or combining the row and column titles in the data.frame’s - first name slot. - -These adornments should be called in a logical order, e.g., you probably -want to add totals before percentages are calculated. In general, call -them in the order they appear above. - -## BYOt (Bring Your Own tabyl) - -You can also call `adorn_` functions on other data.frames, not only the -results of calls to `tabyl()`. E.g., -`mtcars %>% adorn_totals("col") %>% adorn_percentages("col")` performs -as expected, despite `mtcars` not being a `tabyl`. - -This can be handy when you have a data.frame that is not a simple -tabulation generated by `tabyl` but would still benefit from the -`adorn_` formatting functions. - -A simple example: calculate the proportion of records meeting a certain -condition, then format the results. - -``` r -percent_above_165_cm <- humans %>% - group_by(gender) %>% - summarise(pct_above_165_cm = mean(height > 165, na.rm = TRUE), .groups = "drop") - -percent_above_165_cm %>% - adorn_pct_formatting() -#> # A tibble: 2 × 2 -#> gender pct_above_165_cm -#> -#> 1 feminine 12.5% -#> 2 masculine 100.0% -``` - -You can control which columns are adorned by using the `...` argument. -It accepts the [tidyselect -helpers](https://r4ds.had.co.nz/transform.html#select). That is, you can -specify columns the same way you would using `dplyr::select()`. - -For instance, say you have a numeric column that should not be included -in percentage formatting and you wish to exempt it. Here, only the -`proportion` column is adorned: - -``` r -mtcars %>% - count(cyl, gear) %>% - rename(proportion = n) %>% - adorn_percentages("col", na.rm = TRUE, proportion) %>% - adorn_pct_formatting(,,,proportion) # the commas say to use the default values of the other arguments -#> cyl gear proportion -#> 4 3 3.1% -#> 4 4 25.0% -#> 4 5 6.2% -#> 6 3 6.2% -#> 6 4 12.5% -#> 6 5 3.1% -#> 8 3 37.5% -#> 8 5 6.2% -``` - -Here we specify that only two consecutive numeric columns should be -totaled (`year` is numeric but should not be included): - -``` r -cases <- data.frame( - region = c("East", "West"), - year = 2015, - recovered = c(125, 87), - died = c(13, 12) -) - -cases %>% - adorn_totals(c("col", "row"), fill = "-", na.rm = TRUE, name = "Total Cases", recovered:died) -#> region year recovered died Total Cases -#> East 2015 125 13 138 -#> West 2015 87 12 99 -#> Total Cases - 212 25 237 -``` - -Here’s a more complex example that uses a data.frame of means, not -counts. We create a table containing the mean of a 3rd variable when -grouped by two other variables, then use `adorn_` functions to round the -values and append Ns. The first part is pretty straightforward: - -``` r -library(tidyr) # for pivot_wider() -mpg_by_cyl_and_am <- mtcars %>% - group_by(cyl, am) %>% - summarise(mpg = mean(mpg), .groups = "drop") %>% - pivot_wider(names_from = am, values_from = mpg) - -mpg_by_cyl_and_am -#> # A tibble: 3 × 3 -#> cyl `0` `1` -#> -#> 1 4 22.9 28.1 -#> 2 6 19.1 20.6 -#> 3 8 15.0 15.4 -``` - -Now to `adorn_` it. Since this is not the result of a `tabyl()` call, it -doesn’t have the underlying Ns stored in the `core` attribute, so we’ll -have to supply them: - -``` r -mpg_by_cyl_and_am %>% - adorn_rounding() %>% - adorn_ns( - ns = mtcars %>% # calculate the Ns on the fly by calling tabyl on the original data - tabyl(cyl, am) - ) %>% - adorn_title("combined", row_name = "Cylinders", col_name = "Is Automatic") -#> Cylinders/Is Automatic 0 1 -#> 1 4 22.9 (3) 28.1 (8) -#> 2 6 19.1 (4) 20.6 (3) -#> 3 8 15.1 (12) 15.4 (2) -``` - -If needed, Ns can be manipulated in their own data.frame before they are -appended. Here a tabyl with values in the thousands has its Ns formatted -to include the separating character `,` as typically seen in American -numbers, e.g., `3,000`. - -First we create the tabyl to adorn: - -``` r -set.seed(1) -raw_data <- data.frame(sex = rep(c("m", "f"), 3000), - age = round(runif(3000, 1, 102), 0)) -raw_data$agegroup = cut(raw_data$age, quantile(raw_data$age, c(0, 1/3, 2/3, 1))) - -comparison <- raw_data %>% - tabyl(agegroup, sex, show_missing_levels = FALSE) %>% - adorn_totals(c("row", "col")) %>% - adorn_percentages("col") %>% - adorn_pct_formatting(digits = 1) - -comparison -#> agegroup f m Total -#> (1,34] 33.9% 32.3% 33.1% -#> (34,68] 33.0% 33.7% 33.4% -#> (68,102] 32.7% 33.3% 33.0% -#> 0.4% 0.6% 0.5% -#> Total 100.0% 100.0% 100.0% -``` - -At this point, the Ns are unformatted: - -``` r -comparison %>% - adorn_ns() -#> agegroup f m Total -#> (1,34] 33.9% (1,018) 32.3% (970) 33.1% (1,988) -#> (34,68] 33.0% (990) 33.7% (1,012) 33.4% (2,002) -#> (68,102] 32.7% (980) 33.3% (1,000) 33.0% (1,980) -#> 0.4% (12) 0.6% (18) 0.5% (30) -#> Total 100.0% (3,000) 100.0% (3,000) 100.0% (6,000) -``` - -Now we format them to insert the thousands commas. A tabyl’s raw Ns are -stored in its `"core"` attribute. Here we retrieve those with `attr()`, -then apply the base R function `format()` to all numeric columns. -Lastly, we append these Ns using `adorn_ns()`. - -``` r -formatted_ns <- attr(comparison, "core") %>% # extract the tabyl's underlying Ns - adorn_totals(c("row", "col")) %>% # to match the data.frame we're appending to - dplyr::mutate(across(where(is.numeric), ~ format(.x, big.mark = ","))) - -comparison %>% - adorn_ns(position = "rear", ns = formatted_ns) -#> agegroup f m Total -#> (1,34] 33.9% (1,018) 32.3% ( 970) 33.1% (1,988) -#> (34,68] 33.0% ( 990) 33.7% (1,012) 33.4% (2,002) -#> (68,102] 32.7% ( 980) 33.3% (1,000) 33.0% (1,980) -#> 0.4% ( 12) 0.6% ( 18) 0.5% ( 30) -#> Total 100.0% (3,000) 100.0% (3,000) 100.0% (6,000) -``` - -### Questions? Comments? - -File [an issue on GitHub](https://github.com/sfirke/janitor/issues) if -you have suggestions related to `tabyl()` and its `adorn_` helpers or -encounter problems while using them.