diff --git a/.gitignore b/.gitignore index 2fbc330c..070849d6 100644 --- a/.gitignore +++ b/.gitignore @@ -2,15 +2,9 @@ .Rhistory .RData *.docx - - ~WRL0005\.tmp doc docs Meta docs/ janitor.Rproj - - -revdep/* -revdep \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 43586ce0..86e12892 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,6 +44,7 @@ Suggests: rmarkdown, RSQLite, sf, + spelling, testthat (>= 3.0.0), tibble, tidygraph @@ -54,3 +55,4 @@ Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 +Language: en-US diff --git a/NAMESPACE b/NAMESPACE index 1461f757..a1969864 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,6 +29,7 @@ export(adorn_rounding) export(adorn_title) export(adorn_totals) export(as_tabyl) +export(assert_count) export(chisq.test) export(clean_names) export(compare_df_cols) diff --git a/NEWS.md b/NEWS.md index 061954c7..4fbb85cf 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,8 +16,7 @@ These are all minor breaking changes resulting from enhancements and are not exp * The new function `excel_time_to_numeric()` converts times from Excel that do not have accompanying dates into a number of seconds. (#245, thanks to **@billdenney** for the feature.) -* A new argument `set_labels` to `clean_names()` stores the old names as labels in each column. Variable labels are visualized in Rstudio's data viewer or used by default by some packages such as `gt` instead of variable names. Labels can also be used in ggplot labels thanks to the function `easy_labs()` in the `ggeasy` package. Read this wonderful [post](https://www.pipinghotdata.com/posts/2022-09-13-the-case-for-variable-labels-in-r/) for more info about column labels. (#563, thanks to **@jospueyo** for the feature). 
- +* The new function `assert_count()` verifies that an expected number of values are `TRUE` for quality checks in data pipelines. ## Bug fixes * `adorn_totals("row")` now succeeds if the new `name` of the totals row is already a factor level of the input data.frame (#529, thanks @egozoglu for reporting). @@ -113,7 +112,7 @@ These are all minor breaking changes resulting from enhancements and are not exp ## New features -* The `adorn_totals()` function now accepts the special argument `fill = NA`, which will insert a class-appropriate `NA` value into each column that isn't being totaled. This preserves the class of each column; previously they were all convered to character. (thanks **@hamstr147** for implementing in #404 and **@ymer** for reporting in #298). +* The `adorn_totals()` function now accepts the special argument `fill = NA`, which will insert a class-appropriate `NA` value into each column that isn't being totaled. This preserves the class of each column; previously they were all converted to character. (thanks **@hamstr147** for implementing in #404 and **@ymer** for reporting in #298). * `adorn_totals()` now takes the value of `"both"` for the `where` argument. That is, `adorn_totals("both")` is a shorter version of `adorn_totals(c("col", "row"))`. (#362, thanks to **@svgsstats** for implementing and **@sfd99** for suggesting). @@ -137,7 +136,7 @@ These are all minor breaking changes resulting from enhancements and are not exp * A call to make a 3-way `tabyl()` now succeeds when the first variable is of class `ordered` (#386) -* If a totals row and/or column is present on a tabyl as a result of `adorn_totals()`, the functions `chisq.test()` and `fisher.test()` drop the totals and print a warning before proceding with the calculations (#385). 
+* If a totals row and/or column is present on a tabyl as a result of `adorn_totals()`, the functions `chisq.test()` and `fisher.test()` drop the totals and print a warning before proceeding with the calculations (#385). # janitor 2.0.1 (2020-04-12) @@ -283,7 +282,7 @@ This builds on the original functionality of janitor, with similar-but-improved ### A fully-overhauled `tabyl` -`tabyl()` is now a single function that can count combinations of one, two, or three variables, ala base R's `table()`. The resulting `tabyl` data.frames can be manipulated and formatted using a family of `adorn_` functions. See the [tabyls vignette](https://sfirke.github.io/janitor/articles/tabyls.html) for more. +`tabyl()` is now a single function that can count combinations of one, two, or three variables, a la base R's `table()`. The resulting `tabyl` data.frames can be manipulated and formatted using a family of `adorn_` functions. See the [tabyls vignette](https://sfirke.github.io/janitor/articles/tabyls.html) for more. The now-redundant legacy functions `crosstab()` and `adorn_crosstab()` have been deprecated, but remain in the package for now. Existing code that relies on the version of `tabyl` present in janitor versions <= 0.3.1 will break if the `sort` argument was used, as that argument no longer exists in `tabyl` (use `dplyr::arrange()` instead). @@ -299,7 +298,7 @@ No further changes are planned to `clean_names()` and its results should be stab ## Major features -- `clean_names()` transliterates accented letters, e.g., `çãüœ` becomes `cauoe` [(#120)](https://github.com/sfirke/janitor/issues/120). Thanks to **@fernandovmacedo**. 
+- `clean_names()` transliterates accented letters, e.g., `C'C#C% +#' dplyr::mutate( +#' big_values = assert_count(A > 2, n = 3) +#' ) +#' +#' my_data <- data.frame(name = c("Bill", "Sam"), birthdate = c("2024-05-22", "2024-05-22")) +#' my_data |> +#' dplyr::mutate( +#' birthdate = +#' dplyr::case_when( +#' assert_count(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23", +#' TRUE ~ birthdate +#' ) +#' ) +#' @export +assert_count <- function(x, n = 1) { + stopifnot(is.logical(x)) + if (any(is.na(x))) { + stop(deparse(substitute(x)), " has NA values") + } + if (sum(x) != n) { + stop_message <- + sprintf( + "`%s` expected %g `TRUE` %s but %g %s found.", + deparse(substitute(x)), + n, + ngettext(n, "value", "values"), + sum(x), + ngettext(sum(x), "was", "were") + ) + stop(stop_message) + } + x +} diff --git a/R/clean_names.R b/R/clean_names.R index 388144d8..abdcd73c 100644 --- a/R/clean_names.R +++ b/R/clean_names.R @@ -33,7 +33,7 @@ #' support using `clean_names()` on `sf` and `tbl_graph` (from #' `tidygraph`) objects as well as on database connections through #' `dbplyr`. For cleaning other named objects like named lists -#' and vectors, use `make_clean_names()`. When `set_labels` is set to `TRUE`, the old names, +#' and vectors, use `make_clean_names()`. When `set_labels` is set to `TRUE`, the old names, #' stored as column labels, can be restored using `sjlabelled::label_to_colnames()`. #' #' @export @@ -83,14 +83,13 @@ clean_names.default <- function(dat, ..., set_labels = FALSE) { if (is.null(names(dat))) { dimnames(dat) <- lapply(dimnames(dat), make_clean_names, ...) } else { - if (set_labels){ + if (set_labels) { old_names <- names(dat) - for (i in seq_along(old_names)){ + for (i in seq_along(old_names)) { attr(dat[[i]], "label") <- old_names[[i]] } } names(dat) <- make_clean_names(names(dat), ...) - } dat } @@ -112,9 +111,9 @@ clean_names.sf <- function(dat, ..., set_labels = FALSE) { sf_cleaned <- make_clean_names(sf_names[cols_to_rename], ...) 
# rename original df names(dat)[cols_to_rename] <- sf_cleaned - - if(set_labels){ - for (i in seq_along(sf_names[cols_to_rename])){ + + if (set_labels) { + for (i in seq_along(sf_names[cols_to_rename])) { attr(dat[[i]], "label") <- sf_names[[i]] } } @@ -131,7 +130,7 @@ clean_names.tbl_graph <- function(dat, ...) { call. = FALSE ) } # nocov end - + dplyr::rename_all(dat, .funs = make_clean_names, ...) } diff --git a/R/round_half_up.R b/R/round_half_up.R index c4f63006..a7ba9524 100644 --- a/R/round_half_up.R +++ b/R/round_half_up.R @@ -1,4 +1,4 @@ -#' Round a numeric vector; halves will be rounded up, ala Microsoft Excel. +#' Round a numeric vector; halves will be rounded up, a la Microsoft Excel. #' #' @description #' In base R `round()`, halves are rounded to even, e.g., 12.5 and diff --git a/_pkgdown.yml b/_pkgdown.yml index 965f4647..2038dddf 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -5,11 +5,11 @@ template: reference: - title: Cleaning data - + - subtitle: Cleaning variable names contents: - contains("clean_names") - + - title: Exploring data desc: > tabyls are an enhanced version of tables. See `vignette("tabyls")` @@ -19,7 +19,7 @@ reference: - starts_with("adorn") - contains("tabyl") - -contains('.test') - + - subtitle: Change order contents: - row_to_names @@ -30,6 +30,7 @@ reference: Compare data frames columns contents: - starts_with("compare_df_cols") + - assert_count - title: Removing unnecessary columns / rows contents: @@ -38,9 +39,9 @@ reference: - get_one_to_one - top_levels - single_value - + - title: Rounding / dates helpers - desc: > + desc: > Help to mimic some behaviour from Excel or SAS. These should be used on vector. 
contents: diff --git a/man/assert_count.Rd b/man/assert_count.Rd new file mode 100644 index 00000000..0a88a61a --- /dev/null +++ b/man/assert_count.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assertions.R +\name{assert_count} +\alias{assert_count} +\title{Verify that a vector of values has the expected number of \code{TRUE} values} +\usage{ +assert_count(x, n = 1) +} +\arguments{ +\item{x}{A logical vector without \code{NA} values} + +\item{n}{The expected number of \code{TRUE} values} +} +\value{ +\code{x} if \code{sum(x) == n} or an informative error message otherwise +} +\description{ +Verify that a vector of values has the expected number of \code{TRUE} values +} +\examples{ +data.frame(A = 1:5) \%>\% + dplyr::mutate( + big_values = assert_count(A > 2, n = 3) + ) + +my_data <- data.frame(name = c("Bill", "Sam"), birthdate = c("2024-05-22", "2024-05-22")) +my_data |> + dplyr::mutate( + birthdate = + dplyr::case_when( + assert_count(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23", + TRUE ~ birthdate + ) + ) +} diff --git a/tests/testthat/test-assertions.R b/tests/testthat/test-assertions.R new file mode 100644 index 00000000..3d10287d --- /dev/null +++ b/tests/testthat/test-assertions.R @@ -0,0 +1,36 @@ +test_that("assert_count", { + expect_equal( + assert_count(TRUE, 1), + TRUE + ) + expect_equal( + assert_count(rep(TRUE, 3), 3), + rep(TRUE, 3) + ) + my_vector <- c(rep(TRUE, 3), FALSE) + expect_equal( + assert_count(my_vector, 3), + my_vector + ) + expect_error( + assert_count(NA), + regexp = "NA has NA values" + ) + # more informative errors + my_vector <- c(NA, TRUE) + expect_error( + assert_count(my_vector), + regexp = "my_vector has NA values" + ) + my_vector <- c(FALSE, TRUE) + expect_error( + assert_count(my_vector, n = 2), + regexp = "`my_vector` expected 2 `TRUE` values but 1 was found." 
+ ) + # Check grammar of error message + my_vector <- c(TRUE, TRUE) + expect_error( + assert_count(my_vector, n = 1), + regexp = "`my_vector` expected 1 `TRUE` value but 2 were found." + ) +}) diff --git a/tests/testthat/test-clean-names.R b/tests/testthat/test-clean-names.R index 15bb942a..bc4e704d 100644 --- a/tests/testthat/test-clean-names.R +++ b/tests/testthat/test-clean-names.R @@ -190,14 +190,14 @@ test_that("labels are created in default method (feature request #563)", { dat_df <- dplyr::tibble(`a a` = c(11, 22), `b b` = c(2, 3)) dat_df_clean_labels <- clean_names(dat_df, set_labels = TRUE) dat_df_clean <- clean_names(dat_df) - - for (i in seq_along(names(dat_df))){ + + for (i in seq_along(names(dat_df))) { # check that old names are saved as labels when set_labels is TRUE expect_equal(attr(dat_df_clean_labels[[i]], "label"), names(dat_df)[[i]]) # check that old names are not stored if set_labels is not TRUE expect_null(attr(dat_df_clean[[i]], "label")) - } - + } + # expect names are always cleaned expect_equal(names(dat_df_clean), c("a_a", "b_b")) expect_equal(names(dat_df_clean_labels), c("a_a", "b_b")) @@ -605,19 +605,19 @@ test_that("Tests for cases beyond default snake for sf objects", { test_that("labels are created in sf method (feature request #563)", { skip_if_not_installed("sf") - + dat_df <- dplyr::tibble(`a a` = c(11, 22), `b b` = c(2, 3)) dat_sf <- dat_df - dat_sf$x <- c(1,2) - dat_sf$y <- c(1,2) + dat_sf$x <- c(1, 2) + dat_sf$y <- c(1, 2) dat_sf <- sf::st_as_sf(dat_sf, coords = c("x", "y")) dat_sf_clean_labels <- clean_names(dat_sf, set_labels = TRUE) dat_sf_clean <- clean_names(dat_sf) - - for (i in seq_along(names(dat_df))){ + + for (i in seq_along(names(dat_df))) { # check that old names are saved as labels when set_labels is TRUE expect_equal(attr(dat_sf_clean_labels[[i]], "label"), names(dat_sf)[[i]]) - + # check that old names are not stored if set_labels is not TRUE expect_null(attr(dat_sf_clean[[i]], "label")) } diff --git 
a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 00000000..097b2416 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/assertions.Rmd b/vignettes/assertions.Rmd new file mode 100644 index 00000000..7099cf98 --- /dev/null +++ b/vignettes/assertions.Rmd @@ -0,0 +1,137 @@ +--- +title: "Assertions for cleaning data" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Assertions for cleaning data} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Assertions for cleaning data + +Part of cleaning data includes assertions to make sure that data are as expected +before changing the values. `janitor` provides an assertion to enable data +verification before making changes; more assertions may be added in the future. + +```{r setup} +library(janitor) +library(dplyr) +``` + +## `assert_count()` - Verify the number of `TRUE` values + +`assert_count()` will verify that the number of `TRUE` values is the expected +number. It is useful when data may change over time and you want to be sure that +you are changing only data that you intend to change. + +For example, you are given a data set with test scores for several students. +Some of the scores are missing. + +```{r raw-v1} +raw <- + data.frame( + student_id = c(123, 124, 125, 126), + test_score = c(NA, 93, NA, 82) + ) +``` + +When you first receive the data, you're told separately that student 123 has a +score of 84 and 125 has a score of 91. You want to verify that you are finding +the right rows to replace and that you replace them. + +```{r clean-v1-mistake} +clean_mistake <- + raw %>% + mutate( + test_score = + case_when( + student_id == 124 & is.na(test_score) ~ 84, + student_id == 125 & is.na(test_score) ~ 91, + TRUE ~ test_score + ) + ) +``` + +Because of a bug in the code, the score for `student_id == 123` was not replaced. 
+ +```{r clean-v1-mistake-table} +clean_mistake +``` + +With `assert_count()`, you would catch this mistake because `assert_count()` +raises an error in the pipeline. + +```{r clean_assert} +try({ + clean_assert <- + raw %>% + mutate( + test_score = + case_when( + assert_count(student_id == 124 & is.na(test_score)) ~ 84, + assert_count(student_id == 125 & is.na(test_score)) ~ 91, + TRUE ~ test_score + ) + ) +}) +``` + +After fixing the bug so that the first condition is `student_id == 123` instead +of `student_id == 124`, you now get the expected result. + +```{r clean_assert_fixed} +clean_assert <- + raw %>% + mutate( + test_score = + case_when( + assert_count(student_id == 123 & is.na(test_score)) ~ 84, + assert_count(student_id == 125 & is.na(test_score)) ~ 91, + TRUE ~ test_score + ) + ) + +# New result +clean_assert + +# Original data +raw +``` + +### Changing data + +Another way that `assert_count()` can help is verifying that your code notifies +you if your data changes in an important way. Using the example before, you may +get a new raw data set (`raw_v2`) that has some of the `test_score` values +added. They may be different than what you were told before. + +Running the same code on the new data will give you an informative error telling +you what to look into. + +```{r raw_v2} +raw_v2 <- + data.frame( + student_id = c(123, 124, 125, 126), + test_score = c(90, 93, NA, 82) + ) + +try({ + clean_assert <- + raw_v2 %>% + mutate( + test_score = + case_when( + assert_count(student_id == 123 & is.na(test_score)) ~ 84, + assert_count(student_id == 125 & is.na(test_score)) ~ 91, + TRUE ~ test_score + ) + ) +}) +``` diff --git a/vignettes/janitor.Rmd b/vignettes/janitor.Rmd index 5e64c70d..4d76deff 100644 --- a/vignettes/janitor.Rmd +++ b/vignettes/janitor.Rmd @@ -124,7 +124,7 @@ Smaller functions for use in particular situations. 
More human-readable than th ### Manipulate vectors of names with `make_clean_names()` -Like base R's `make.names()`, but with the stylings and case choice of the long-time janitor function `clean_names()`. While `clean_names()` is still offered for use in data.frame pipeline with `%>%`, `make_clean_names()` allows for more general usage, e.g., on a vector. +Like base R's `make.names()`, but with the styling and case choice of the long-time janitor function `clean_names()`. While `clean_names()` is still offered for use in data.frame pipeline with `%>%`, `make_clean_names()` allows for more general usage, e.g., on a vector. It can also be used as an argument to `.name_repair` in the newest version of `tibble::as_tibble`: ```{r} diff --git a/vignettes/janitor.md b/vignettes/janitor.md index 3627d7b3..2ee3b61f 100644 --- a/vignettes/janitor.md +++ b/vignettes/janitor.md @@ -1,68 +1,45 @@ Overview of janitor functions ================ -2023-02-02 - -- Major functions - - Cleaning - - Clean data.frame names - with clean_names() - - Do those - data.frames actually contain the same columns? 
- - Exploring - - tabyl() - a - better version of table() - - Explore - records with duplicated values for specific combinations of variables - with get_dupes() - - Explore - relationships between columns with get_one_to_one() -- Minor functions - - Cleaning - - Manipulate - vectors of names with make_clean_names() - - Validate - that a column has a single_value() per group - - remove_empty() rows - and columns - - remove_constant() - columns - - Directionally-consistent - rounding behavior with round_half_up() - - Round - decimals to precise fractions of a given denominator with - round_to_fraction() - - Fix - dates stored as serial numbers with - excel_numeric_to_date() - - Convert a - mix of date and datetime formats to date - - Elevate column - names stored in a data.frame row - - Find the - header row buried within a messy data.frame - - Exploring - - Count - factor levels in groups of high, medium, and low with - top_levels() +2024-12-18 + +- [Major functions](#major-functions) + - [Cleaning](#cleaning) + - [Clean data.frame names with + `clean_names()`](#clean-dataframe-names-with-clean_names) + - [Do those data.frames actually contain the same + columns?](#do-those-dataframes-actually-contain-the-same-columns) + - [Exploring](#exploring) + - [`tabyl()` - a better version of + `table()`](#tabyl---a-better-version-of-table) + - [Explore records with duplicated values for specific combinations + of variables with + `get_dupes()`](#explore-records-with-duplicated-values-for-specific-combinations-of-variables-with-get_dupes) + - [Explore relationships between columns with + `get_one_to_one()`](#explore-relationships-between-columns-with-get_one_to_one) +- [Minor functions](#minor-functions) + - [Cleaning](#cleaning-1) + - [Manipulate vectors of names with + `make_clean_names()`](#manipulate-vectors-of-names-with-make_clean_names) + - [Validate that a column has a `single_value()` per + group](#validate-that-a-column-has-a-single_value-per-group) + - 
[`remove_empty()` rows and + columns](#remove_empty-rows-and-columns) + - [`remove_constant()` columns](#remove_constant-columns) + - [Directionally-consistent rounding behavior with + `round_half_up()`](#directionally-consistent-rounding-behavior-with-round_half_up) + - [Round decimals to precise fractions of a given denominator with + `round_to_fraction()`](#round-decimals-to-precise-fractions-of-a-given-denominator-with-round_to_fraction) + - [Fix dates stored as serial numbers with + `excel_numeric_to_date()`](#fix-dates-stored-as-serial-numbers-with-excel_numeric_to_date) + - [Convert a mix of date and datetime formats to + date](#convert-a-mix-of-date-and-datetime-formats-to-date) + - [Elevate column names stored in a data.frame + row](#elevate-column-names-stored-in-a-dataframe-row) + - [Find the header row buried within a messy + data.frame](#find-the-header-row-buried-within-a-messy-dataframe) + - [Exploring](#exploring-1) + - [Count factor levels in groups of high, medium, and low with + `top_levels()`](#count-factor-levels-in-groups-of-high-medium-and-low-with-top_levels) The janitor functions expedite the initial data exploration and cleaning that comes with any new data set. This catalog describes the usage for @@ -78,7 +55,7 @@ Functions for everyday use. Call this function every time you read data. -It works in a `%>%` pipeline, and handles problematic variable names, +It works in a `%>%` pipeline and handles problematic variable names, especially those that are so well-preserved by `readxl::read_excel()` and `readr::read_csv()`. @@ -94,8 +71,10 @@ and `readr::read_csv()`. 
``` r # Create a data.frame with dirty names test_df <- as.data.frame(matrix(ncol = 6)) -names(test_df) <- c("firstName", "ábc@!*", "% successful (2009)", - "REPEAT VALUE", "REPEAT VALUE", "") +names(test_df) <- c( + "firstName", "ábc@!*", "% successful (2009)", + "REPEAT VALUE", "REPEAT VALUE", "" +) ``` Clean the variable names, returning a data.frame: @@ -111,8 +90,8 @@ Compare to what base R produces: ``` r make.names(names(test_df)) -#> [1] "firstName" "ábc..." "X..successful..2009." "REPEAT.VALUE" "REPEAT.VALUE" -#> [6] "X" +#> [1] "firstName" "ábc..." "X..successful..2009." +#> [4] "REPEAT.VALUE" "REPEAT.VALUE" "X" ``` This function is powered by the underlying exported function @@ -229,10 +208,11 @@ sets of one-to-one clusters: ``` r library(dplyr) -starwars[1:4,] %>% +starwars[1:4, ] %>% get_one_to_one() #> [[1]] -#> [1] "name" "height" "mass" "skin_color" "birth_year" "films" +#> [1] "name" "height" "mass" "skin_color" "birth_year" +#> [6] "films" #> #> [[2]] #> [1] "hair_color" "starships" @@ -250,7 +230,7 @@ than the equivalent code they replace. ### Manipulate vectors of names with `make_clean_names()` -Like base R’s `make.names()`, but with the stylings and case choice of +Like base R’s `make.names()`, but with the styling and case choice of the long-time janitor function `clean_names()`. While `clean_names()` is still offered for use in data.frame pipeline with `%>%`, `make_clean_names()` allows for more general usage, e.g., on a vector. 
@@ -273,7 +253,7 @@ tibble::as_tibble(iris, .name_repair = janitor::make_clean_names) #> 8 5 3.4 1.5 0.2 setosa #> 9 4.4 2.9 1.4 0.2 setosa #> 10 4.9 3.1 1.5 0.1 setosa -#> # … with 140 more rows +#> # ℹ 140 more rows ``` ### Validate that a column has a `single_value()` per group @@ -290,7 +270,8 @@ where it should not: ``` r not_one_to_one <- data.frame( X = rep(1:3, each = 2), - Y = c(rep(1:2, each = 2), 1:2)) + Y = c(rep(1:2, each = 2), 1:2) +) not_one_to_one #> X Y @@ -303,12 +284,13 @@ not_one_to_one # throws informative error: try(not_one_to_one %>% - dplyr::group_by(X) %>% - dplyr::mutate( - Z = single_value(Y, info = paste("Calculating Z for group X =", X))) - ) + dplyr::group_by(X) %>% + dplyr::mutate( + Z = single_value(Y, info = paste("Calculating Z for group X =", X)) + )) #> Error in dplyr::mutate(., Z = single_value(Y, info = paste("Calculating Z for group X =", : -#> ℹ In argument: `Z = single_value(Y, info = paste("Calculating Z for group X =", X))`. +#> ℹ In argument: `Z = single_value(Y, info = paste("Calculating Z for +#> group X =", X))`. #> ℹ In group 3: `X = 3`. #> Caused by error in `single_value()`: #> ! More than one (2) value found (1, 2): Calculating Z for group X = 3: Calculating Z for group X = 3 @@ -320,9 +302,11 @@ Does what it says. For cases like cleaning Excel files that contain empty rows and columns after being read into R. ``` r -q <- data.frame(v1 = c(1, NA, 3), - v2 = c(NA, NA, NA), - v3 = c("a", NA, "b")) +q <- data.frame( + v1 = c(1, NA, 3), + v2 = c(NA, NA, NA), + v3 = c("a", NA, "b") +) q %>% remove_empty(c("rows", "cols")) #> v1 v3 @@ -419,8 +403,10 @@ names of the data.frame and optionally (by default) remove the row in which names were stored and/or the rows above it. ``` r -dirt <- data.frame(X_1 = c(NA, "ID", 1:3), - X_2 = c(NA, "Value", 4:6)) +dirt <- data.frame( + X_1 = c(NA, "ID", 1:3), + X_2 = c(NA, "Value", 4:6) +) row_to_names(dirt, 2) #> ID Value @@ -454,7 +440,8 @@ grouped into head/middle/tail groups. 
``` r f <- factor(c("strongly agree", "agree", "neutral", "neutral", "disagree", "strongly agree"), - levels = c("strongly agree", "agree", "neutral", "disagree", "strongly disagree")) + levels = c("strongly agree", "agree", "neutral", "disagree", "strongly disagree") +) top_levels(f) #> f n percent #> strongly agree, agree 3 0.5000000 diff --git a/vignettes/tabyls.md b/vignettes/tabyls.md index ea526931..262a1801 100644 --- a/vignettes/tabyls.md +++ b/vignettes/tabyls.md @@ -254,7 +254,7 @@ humans %>% function or using janitor’s `round_half_up()` to round all ties up ([thanks, StackOverflow](https://stackoverflow.com/a/12688836/4470365)). - - e.g., round 10.5 up to 11, consistent with Excel’s tie-breaking + - e.g., round 10.5 up to 11, consistent with Excel's tie-breaking behavior. - This contrasts with rounding 10.5 down to 10 as in base R’s `round(10.5)`. @@ -263,7 +263,7 @@ humans %>% `adorn_pct_formatting()`; these two functions should not be called together. - **`adorn_ns()`**: add Ns to a tabyl. These can be drawn from the - tabyl’s underlying counts, which are attached to the tabyl as + tabyl's underlying counts, which are attached to the tabyl as metadata, or they can be supplied by the user. - **`adorn_title()`**: add a title to a tabyl (or other data.frame). Options include putting the column title in a new row on top of the @@ -427,7 +427,7 @@ comparison %>% #> Total 100.0% (3,000) 100.0% (3,000) 100.0% (6,000) ``` -Now we format them to insert the thousands commas. A tabyl’s raw Ns are +Now we format them to insert the thousands commas. A tabyl's raw Ns are stored in its `"core"` attribute. Here we retrieve those with `attr()`, then apply the base R function `format()` to all numeric columns. Lastly, we append these Ns using `adorn_ns()`.