From 7a8e651d65ff781679ec9a75d9a2c9c478864ac0 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Wed, 22 May 2024 10:00:55 -0400 Subject: [PATCH 1/6] Add `assert_count_true()` to verify that an expected number of values are `TRUE` --- NAMESPACE | 1 + NEWS.md | 2 ++ R/assert_count_true.R | 30 +++++++++++++++++++++ man/assert_count_true.Rd | 25 +++++++++++++++++ tests/testthat/test-assert_count_true.R | 36 +++++++++++++++++++++++++ 5 files changed, 94 insertions(+) create mode 100644 R/assert_count_true.R create mode 100644 man/assert_count_true.Rd create mode 100644 tests/testthat/test-assert_count_true.R diff --git a/NAMESPACE b/NAMESPACE index 27073345..37444fca 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,6 +29,7 @@ export(adorn_rounding) export(adorn_title) export(adorn_totals) export(as_tabyl) +export(assert_count_true) export(chisq.test) export(clean_names) export(compare_df_cols) diff --git a/NEWS.md b/NEWS.md index 5d715ce2..0526177f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,6 +14,8 @@ These are all minor breaking changes resulting from enhancements and are not exp * The new function `excel_time_to_numeric()` converts times from Excel that do not have accompanying dates into a number of seconds. (#245, thanks to **@billdenney** for the feature.) +* The new function `assert_count_true()` verifies that an expected number of values are `TRUE` for quality checks in data pipelines + ## Bug fixes * `adorn_totals("row")` now succeeds if the new `name` of the totals row is already a factor level of the input data.frame (#529, thanks @egozoglu for reporting). 
diff --git a/R/assert_count_true.R b/R/assert_count_true.R new file mode 100644 index 00000000..bd212568 --- /dev/null +++ b/R/assert_count_true.R @@ -0,0 +1,30 @@ +#' Verify that a vector of values has the expected number of `TRUE` values +#' +#' @param x A logical vecotor without `NA` values +#' @param n The expected number of `TRUE` values +#' @returns `x` if `sum(x) == n` or an informative error message otherwise +#' @examples +#' data.frame(A = 1:5) %>% +#' dplyr::mutate( +#' big_values = assert_count_true(A > 2, n = 3) +#' ) +#' @export +assert_count_true <- function(x, n = 1) { + stopifnot(is.logical(x)) + if (any(is.na(x))) { + stop(deparse(substitute(x)), " has NA values") + } + if (sum(x) != n) { + stop_message <- + sprintf( + "`%s` expected %g `TRUE` %s but %g %s found.", + deparse(substitute(x)), + n, + ngettext(n, "value", "values"), + sum(x), + ngettext(sum(x), "was", "were") + ) + stop(stop_message) + } + x +} diff --git a/man/assert_count_true.Rd b/man/assert_count_true.Rd new file mode 100644 index 00000000..46db9697 --- /dev/null +++ b/man/assert_count_true.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/assert_count_true.R +\name{assert_count_true} +\alias{assert_count_true} +\title{Verify that a vector of values has the expected number of \code{TRUE} values} +\usage{ +assert_count_true(x, n = 1) +} +\arguments{ +\item{x}{A logical vecotor without \code{NA} values} + +\item{n}{The expected number of \code{TRUE} values} +} +\value{ +\code{x} if \code{sum(x) == n} or an informative error message otherwise +} +\description{ +Verify that a vector of values has the expected number of \code{TRUE} values +} +\examples{ +data.frame(A = 1:5) \%>\% + dplyr::mutate( + big_values = assert_count_true(A > 2, n = 3) + ) +} diff --git a/tests/testthat/test-assert_count_true.R b/tests/testthat/test-assert_count_true.R new file mode 100644 index 00000000..7b477070 --- /dev/null +++ 
b/tests/testthat/test-assert_count_true.R @@ -0,0 +1,36 @@ +test_that("assert_count_true", { + expect_equal( + assert_count_true(TRUE, 1), + TRUE + ) + expect_equal( + assert_count_true(rep(TRUE, 3), 3), + rep(TRUE, 3) + ) + my_vector <- c(rep(TRUE, 3), FALSE) + expect_equal( + assert_count_true(my_vector, 3), + my_vector + ) + expect_error( + assert_count_true(NA), + regexp = "NA has NA values" + ) + # more informative errors + my_vector <- c(NA, TRUE) + expect_error( + assert_count_true(my_vector), + regexp = "my_vector has NA values" + ) + my_vector <- c(FALSE, TRUE) + expect_error( + assert_count_true(my_vector, n = 2), + regexp = "`my_vector` expected 2 `TRUE` values but 1 was found." + ) + # Check grammar of error message + my_vector <- c(TRUE, TRUE) + expect_error( + assert_count_true(my_vector, n = 1), + regexp = "`my_vector` expected 1 `TRUE` value but 2 were found." + ) +}) From 030a03db7801ec25058d5d7ee5e80bf079916ab7 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Wed, 22 May 2024 10:16:14 -0400 Subject: [PATCH 2/6] Fix pkgdown; add another example --- R/assert_count_true.R | 10 ++++++++++ _pkgdown.yml | 11 ++++++----- man/assert_count_true.Rd | 10 ++++++++++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/R/assert_count_true.R b/R/assert_count_true.R index bd212568..37779315 100644 --- a/R/assert_count_true.R +++ b/R/assert_count_true.R @@ -8,6 +8,16 @@ #' dplyr::mutate( #' big_values = assert_count_true(A > 2, n = 3) #' ) +#' +#' my_data <- data.frame(name = c("Bill", "Sam"), birthdate = c("2024-05-22", "2024-05-22")) +#' my_data |> +#' dplyr::mutate( +#' birthdate = +#' dplyr::case_when( +#' assert_count_true(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23", +#' TRUE ~ birthdate +#' ) +#' ) #' @export assert_count_true <- function(x, n = 1) { stopifnot(is.logical(x)) diff --git a/_pkgdown.yml b/_pkgdown.yml index 965f4647..7d96cb04 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -5,11 +5,11 @@ template: reference: - title: 
Cleaning data - + - subtitle: Cleaning variable names contents: - contains("clean_names") - + - title: Exploring data desc: > tabyls are an enhanced version of tables. See `vignette("tabyls")` @@ -19,7 +19,7 @@ reference: - starts_with("adorn") - contains("tabyl") - -contains('.test') - + - subtitle: Change order contents: - row_to_names @@ -30,6 +30,7 @@ reference: Compare data frames columns contents: - starts_with("compare_df_cols") + - assert_count_true - title: Removing unnecessary columns / rows contents: @@ -38,9 +39,9 @@ reference: - get_one_to_one - top_levels - single_value - + - title: Rounding / dates helpers - desc: > + desc: > Help to mimic some behaviour from Excel or SAS. These should be used on vector. contents: diff --git a/man/assert_count_true.Rd b/man/assert_count_true.Rd index 46db9697..c158760e 100644 --- a/man/assert_count_true.Rd +++ b/man/assert_count_true.Rd @@ -22,4 +22,14 @@ data.frame(A = 1:5) \%>\% dplyr::mutate( big_values = assert_count_true(A > 2, n = 3) ) + +my_data <- data.frame(name = c("Bill", "Sam"), birthdate = c("2024-05-22", "2024-05-22")) +my_data |> + dplyr::mutate( + birthdate = + dplyr::case_when( + assert_count_true(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23", + TRUE ~ birthdate + ) + ) } From 5b4c1fe79ea539a555c8262015e07d9d5afafdb9 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Wed, 18 Dec 2024 16:43:45 -0500 Subject: [PATCH 3/6] Move function name to `assert_count()`; add assertions vignette --- .gitignore | 1 + DESCRIPTION | 2 +- NAMESPACE | 2 +- NEWS.md | 2 +- R/{assert_count_true.R => assertions.R} | 8 +- _pkgdown.yml | 2 +- man/{assert_count_true.Rd => assert_count.Rd} | 14 +- tests/testthat/test-assert_count_true.R | 16 +- vignettes/.gitignore | 2 + vignettes/assertions.Rmd | 137 ++++++++++++++++++ 10 files changed, 163 insertions(+), 23 deletions(-) rename R/{assert_count_true.R => assertions.R} (79%) rename man/{assert_count_true.Rd => assert_count.Rd} (66%) create mode 100644 
vignettes/.gitignore create mode 100644 vignettes/assertions.Rmd diff --git a/.gitignore b/.gitignore index 9591447d..8e6f9acb 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ docs Meta docs/ janitor.Rproj +inst/doc diff --git a/DESCRIPTION b/DESCRIPTION index 7fd04e02..e0d9ac4a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -52,4 +52,4 @@ Config/testthat/edition: 3 Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index 37444fca..c6fcc4c1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -29,7 +29,7 @@ export(adorn_rounding) export(adorn_title) export(adorn_totals) export(as_tabyl) -export(assert_count_true) +export(assert_count) export(chisq.test) export(clean_names) export(compare_df_cols) diff --git a/NEWS.md b/NEWS.md index 0526177f..a235f0fa 100644 --- a/NEWS.md +++ b/NEWS.md @@ -14,7 +14,7 @@ These are all minor breaking changes resulting from enhancements and are not exp * The new function `excel_time_to_numeric()` converts times from Excel that do not have accompanying dates into a number of seconds. (#245, thanks to **@billdenney** for the feature.) 
-* The new function `assert_count_true()` verifies that an expected number of values are `TRUE` for quality checks in data pipelines +* The new function `assert_count()` verifies that an expected number of values are `TRUE` for quality checks in data pipelines ## Bug fixes diff --git a/R/assert_count_true.R b/R/assertions.R similarity index 79% rename from R/assert_count_true.R rename to R/assertions.R index 37779315..f007024f 100644 --- a/R/assert_count_true.R +++ b/R/assertions.R @@ -1,12 +1,12 @@ #' Verify that a vector of values has the expected number of `TRUE` values #' -#' @param x A logical vecotor without `NA` values +#' @param x A logical vector without `NA` values #' @param n The expected number of `TRUE` values #' @returns `x` if `sum(x) == n` or an informative error message otherwise #' @examples #' data.frame(A = 1:5) %>% #' dplyr::mutate( -#' big_values = assert_count_true(A > 2, n = 3) +#' big_values = assert_count(A > 2, n = 3) #' ) #' #' my_data <- data.frame(name = c("Bill", "Sam"), birthdate = c("2024-05-22", "2024-05-22")) @@ -14,12 +14,12 @@ #' dplyr::mutate( #' birthdate = #' dplyr::case_when( -#' assert_count_true(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23", +#' assert_count(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23", #' TRUE ~ birthdate #' ) #' ) #' @export -assert_count_true <- function(x, n = 1) { +assert_count <- function(x, n = 1) { stopifnot(is.logical(x)) if (any(is.na(x))) { stop(deparse(substitute(x)), " has NA values") diff --git a/_pkgdown.yml b/_pkgdown.yml index 7d96cb04..2038dddf 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -30,7 +30,7 @@ reference: Compare data frames columns contents: - starts_with("compare_df_cols") - - assert_count_true + - assert_count - title: Removing unnecessary columns / rows contents: diff --git a/man/assert_count_true.Rd b/man/assert_count.Rd similarity index 66% rename from man/assert_count_true.Rd rename to man/assert_count.Rd index c158760e..0a88a61a 100644 --- 
a/man/assert_count_true.Rd +++ b/man/assert_count.Rd @@ -1,13 +1,13 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/assert_count_true.R -\name{assert_count_true} -\alias{assert_count_true} +% Please edit documentation in R/assertions.R +\name{assert_count} +\alias{assert_count} \title{Verify that a vector of values has the expected number of \code{TRUE} values} \usage{ -assert_count_true(x, n = 1) +assert_count(x, n = 1) } \arguments{ -\item{x}{A logical vecotor without \code{NA} values} +\item{x}{A logical vector without \code{NA} values} \item{n}{The expected number of \code{TRUE} values} } @@ -20,7 +20,7 @@ Verify that a vector of values has the expected number of \code{TRUE} values \examples{ data.frame(A = 1:5) \%>\% dplyr::mutate( - big_values = assert_count_true(A > 2, n = 3) + big_values = assert_count(A > 2, n = 3) ) my_data <- data.frame(name = c("Bill", "Sam"), birthdate = c("2024-05-22", "2024-05-22")) @@ -28,7 +28,7 @@ my_data |> dplyr::mutate( birthdate = dplyr::case_when( - assert_count_true(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23", + assert_count(name == "Bill" & birthdate == "2024-05-22") ~ "2024-05-23", TRUE ~ birthdate ) ) diff --git a/tests/testthat/test-assert_count_true.R b/tests/testthat/test-assert_count_true.R index 7b477070..3d10287d 100644 --- a/tests/testthat/test-assert_count_true.R +++ b/tests/testthat/test-assert_count_true.R @@ -1,36 +1,36 @@ -test_that("assert_count_true", { +test_that("assert_count", { expect_equal( - assert_count_true(TRUE, 1), + assert_count(TRUE, 1), TRUE ) expect_equal( - assert_count_true(rep(TRUE, 3), 3), + assert_count(rep(TRUE, 3), 3), rep(TRUE, 3) ) my_vector <- c(rep(TRUE, 3), FALSE) expect_equal( - assert_count_true(my_vector, 3), + assert_count(my_vector, 3), my_vector ) expect_error( - assert_count_true(NA), + assert_count(NA), regexp = "NA has NA values" ) # more informative errors my_vector <- c(NA, TRUE) expect_error( - 
assert_count_true(my_vector), + assert_count(my_vector), regexp = "my_vector has NA values" ) my_vector <- c(FALSE, TRUE) expect_error( - assert_count_true(my_vector, n = 2), + assert_count(my_vector, n = 2), regexp = "`my_vector` expected 2 `TRUE` values but 1 was found." ) # Check grammar of error message my_vector <- c(TRUE, TRUE) expect_error( - assert_count_true(my_vector, n = 1), + assert_count(my_vector, n = 1), regexp = "`my_vector` expected 1 `TRUE` value but 2 were found." ) }) diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 00000000..097b2416 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/assertions.Rmd b/vignettes/assertions.Rmd new file mode 100644 index 00000000..6ee31f97 --- /dev/null +++ b/vignettes/assertions.Rmd @@ -0,0 +1,137 @@ +--- +title: "Assertions for cleaning data" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Assertions for cleaning data} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +``` + +# Assertions for cleaning data + +Part of cleaning data is asserting that the data are as expected +before changing any values. `janitor` provides an assertion to enable data +verification before making changes; more assertions may be added in the future. + +```{r setup} +library(janitor) +library(dplyr) +``` + +## `assert_count()` - Verify the number of `TRUE` values + +`assert_count()` will verify that the number of `TRUE` values is the expected +number. It is useful when data may change over time and you want to be sure that +you are changing only data that you intend to change. + +For example, you are given a data set with test scores for several students. +Some of the scores are missing. 
+ +```{r raw-v1} +raw <- + data.frame( + student_id = c(123, 124, 125, 126), + test_score = c(NA, 93, NA, 82) + ) +``` + +When you first receive the data, you're told separately that student 123 has a +score of 84 and student 125 has a score of 91. You want to verify that you are finding +the right rows to replace and that you replace them. + +```{r clean-v1-mistake} +clean_mistake <- + raw %>% + mutate( + test_score = + case_when( + student_id == 124 & is.na(test_score) ~ 84, + student_id == 125 & is.na(test_score) ~ 91, + TRUE ~ test_score + ) + ) +``` + +Because of a bug in the code, the score for `student_id == 123` was not replaced. + +```{r clean-v1-mistake-table} +clean_mistake +``` + +Using `assert_count()`, you would find this error because of an error raised by +`assert_count()` in the pipeline. + +```{r clean_assert} +try({ +clean_assert <- + raw %>% + mutate( + test_score = + case_when( + assert_count(student_id == 124 & is.na(test_score)) ~ 84, + assert_count(student_id == 125 & is.na(test_score)) ~ 91, + TRUE ~ test_score + ) + ) +}) +``` + +After fixing the code bug so that the first condition uses `student_id == 123` instead of +`student_id == 124`, you now get the expected result. + +```{r clean_assert_fixed} +clean_assert <- + raw %>% + mutate( + test_score = + case_when( + assert_count(student_id == 123 & is.na(test_score)) ~ 84, + assert_count(student_id == 125 & is.na(test_score)) ~ 91, + TRUE ~ test_score + ) + ) + +# New result +clean_assert + +# Original data +raw +``` + +### Changing data + +Another way that `assert_count()` can help is verifying that your code notifies +you if your data changes in an important way. Using the example before, you may +get a new raw data set (`raw_v2`) that has some of the `test_score` values +added. They may be different from what you were told before. + +Running the same code on the new data will give you an informative error telling +you what to look into. 
+ +```{r raw_v2} +raw_v2 <- + data.frame( + student_id = c(123, 124, 125, 126), + test_score = c(90, 93, NA, 82) + ) + +try({ +clean_assert <- + raw_v2 %>% + mutate( + test_score = + case_when( + assert_count(student_id == 123 & is.na(test_score)) ~ 84, + assert_count(student_id == 125 & is.na(test_score)) ~ 91, + TRUE ~ test_score + ) + ) +}) +``` From 140d619f63b98c2309ea0f1258fdfcca8c5256f1 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Wed, 18 Dec 2024 16:46:50 -0500 Subject: [PATCH 4/6] Clean up filename --- tests/testthat/{test-assert_count_true.R => test-assertions.R} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/testthat/{test-assert_count_true.R => test-assertions.R} (100%) diff --git a/tests/testthat/test-assert_count_true.R b/tests/testthat/test-assertions.R similarity index 100% rename from tests/testthat/test-assert_count_true.R rename to tests/testthat/test-assertions.R From 0edd037ddba5313158e1bb655febd5dd31a6f8bc Mon Sep 17 00:00:00 2001 From: billdenney Date: Wed, 18 Dec 2024 21:48:42 +0000 Subject: [PATCH 5/6] Style code (GHA) --- R/clean_names.R | 15 ++++++------ tests/testthat/test-clean-names.R | 20 ++++++++-------- vignettes/assertions.Rmd | 40 +++++++++++++++---------------- 3 files changed, 37 insertions(+), 38 deletions(-) diff --git a/R/clean_names.R b/R/clean_names.R index 388144d8..abdcd73c 100644 --- a/R/clean_names.R +++ b/R/clean_names.R @@ -33,7 +33,7 @@ #' support using `clean_names()` on `sf` and `tbl_graph` (from #' `tidygraph`) objects as well as on database connections through #' `dbplyr`. For cleaning other named objects like named lists -#' and vectors, use `make_clean_names()`. When `set_labels` is set to `TRUE`, the old names, +#' and vectors, use `make_clean_names()`. When `set_labels` is set to `TRUE`, the old names, #' stored as column labels, can be restored using `sjlabelled::label_to_colnames()`. 
#' #' @export @@ -83,14 +83,13 @@ clean_names.default <- function(dat, ..., set_labels = FALSE) { if (is.null(names(dat))) { dimnames(dat) <- lapply(dimnames(dat), make_clean_names, ...) } else { - if (set_labels){ + if (set_labels) { old_names <- names(dat) - for (i in seq_along(old_names)){ + for (i in seq_along(old_names)) { attr(dat[[i]], "label") <- old_names[[i]] } } names(dat) <- make_clean_names(names(dat), ...) - } dat } @@ -112,9 +111,9 @@ clean_names.sf <- function(dat, ..., set_labels = FALSE) { sf_cleaned <- make_clean_names(sf_names[cols_to_rename], ...) # rename original df names(dat)[cols_to_rename] <- sf_cleaned - - if(set_labels){ - for (i in seq_along(sf_names[cols_to_rename])){ + + if (set_labels) { + for (i in seq_along(sf_names[cols_to_rename])) { attr(dat[[i]], "label") <- sf_names[[i]] } } @@ -131,7 +130,7 @@ clean_names.tbl_graph <- function(dat, ...) { call. = FALSE ) } # nocov end - + dplyr::rename_all(dat, .funs = make_clean_names, ...) } diff --git a/tests/testthat/test-clean-names.R b/tests/testthat/test-clean-names.R index 15bb942a..bc4e704d 100644 --- a/tests/testthat/test-clean-names.R +++ b/tests/testthat/test-clean-names.R @@ -190,14 +190,14 @@ test_that("labels are created in default method (feature request #563)", { dat_df <- dplyr::tibble(`a a` = c(11, 22), `b b` = c(2, 3)) dat_df_clean_labels <- clean_names(dat_df, set_labels = TRUE) dat_df_clean <- clean_names(dat_df) - - for (i in seq_along(names(dat_df))){ + + for (i in seq_along(names(dat_df))) { # check that old names are saved as labels when set_labels is TRUE expect_equal(attr(dat_df_clean_labels[[i]], "label"), names(dat_df)[[i]]) # check that old names are not stored if set_labels is not TRUE expect_null(attr(dat_df_clean[[i]], "label")) - } - + } + # expect names are always cleaned expect_equal(names(dat_df_clean), c("a_a", "b_b")) expect_equal(names(dat_df_clean_labels), c("a_a", "b_b")) @@ -605,19 +605,19 @@ test_that("Tests for cases beyond default snake for sf 
objects", { test_that("labels are created in sf method (feature request #563)", { skip_if_not_installed("sf") - + dat_df <- dplyr::tibble(`a a` = c(11, 22), `b b` = c(2, 3)) dat_sf <- dat_df - dat_sf$x <- c(1,2) - dat_sf$y <- c(1,2) + dat_sf$x <- c(1, 2) + dat_sf$y <- c(1, 2) dat_sf <- sf::st_as_sf(dat_sf, coords = c("x", "y")) dat_sf_clean_labels <- clean_names(dat_sf, set_labels = TRUE) dat_sf_clean <- clean_names(dat_sf) - - for (i in seq_along(names(dat_df))){ + + for (i in seq_along(names(dat_df))) { # check that old names are saved as labels when set_labels is TRUE expect_equal(attr(dat_sf_clean_labels[[i]], "label"), names(dat_sf)[[i]]) - + # check that old names are not stored if set_labels is not TRUE expect_null(attr(dat_sf_clean[[i]], "label")) } diff --git a/vignettes/assertions.Rmd b/vignettes/assertions.Rmd index 6ee31f97..7099cf98 100644 --- a/vignettes/assertions.Rmd +++ b/vignettes/assertions.Rmd @@ -70,16 +70,16 @@ Using `assert_count()`, you would find this error because of an error raised by ```{r clean_assert} try({ -clean_assert <- - raw %>% - mutate( - test_score = - case_when( - assert_count(student_id == 124 & is.na(test_score)) ~ 84, - assert_count(student_id == 125 & is.na(test_score)) ~ 91, - TRUE ~ test_score - ) - ) + clean_assert <- + raw %>% + mutate( + test_score = + case_when( + assert_count(student_id == 124 & is.na(test_score)) ~ 84, + assert_count(student_id == 125 & is.na(test_score)) ~ 91, + TRUE ~ test_score + ) + ) }) ``` @@ -123,15 +123,15 @@ raw_v2 <- ) try({ -clean_assert <- - raw_v2 %>% - mutate( - test_score = - case_when( - assert_count(student_id == 123 & is.na(test_score)) ~ 84, - assert_count(student_id == 125 & is.na(test_score)) ~ 91, - TRUE ~ test_score - ) - ) + clean_assert <- + raw_v2 %>% + mutate( + test_score = + case_when( + assert_count(student_id == 123 & is.na(test_score)) ~ 84, + assert_count(student_id == 125 & is.na(test_score)) ~ 91, + TRUE ~ test_score + ) + ) }) ``` From 
f9ec0eb03b705b56dbe8de9afa533e4a6e57b6e6 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Wed, 18 Dec 2024 16:58:54 -0500 Subject: [PATCH 6/6] Fix spelling issues, update janitor.md --- DESCRIPTION | 2 + NEWS.md | 8 +-- R/round_half_up.R | 2 +- vignettes/janitor.Rmd | 2 +- vignettes/janitor.md | 155 +++++++++++++++++++----------------------- vignettes/tabyls.md | 6 +- 6 files changed, 82 insertions(+), 93 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 715ff2f7..ff84bb41 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,6 +44,7 @@ Suggests: rmarkdown, RSQLite, sf, + spelling, testthat (>= 3.0.0), tibble, tidygraph @@ -54,3 +55,4 @@ Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 +Language: en-US diff --git a/NEWS.md b/NEWS.md index b6a2844f..26f469ee 100644 --- a/NEWS.md +++ b/NEWS.md @@ -106,7 +106,7 @@ These are all minor breaking changes resulting from enhancements and are not exp ## New features -* The `adorn_totals()` function now accepts the special argument `fill = NA`, which will insert a class-appropriate `NA` value into each column that isn't being totaled. This preserves the class of each column; previously they were all convered to character. (thanks **@hamstr147** for implementing in #404 and **@ymer** for reporting in #298). +* The `adorn_totals()` function now accepts the special argument `fill = NA`, which will insert a class-appropriate `NA` value into each column that isn't being totaled. This preserves the class of each column; previously they were all converted to character. (thanks **@hamstr147** for implementing in #404 and **@ymer** for reporting in #298). * `adorn_totals()` now takes the value of `"both"` for the `where` argument. That is, `adorn_totals("both")` is a shorter version of `adorn_totals(c("col", "row"))`. (#362, thanks to **@svgsstats** for implementing and **@sfd99** for suggesting). 
@@ -130,7 +130,7 @@ These are all minor breaking changes resulting from enhancements and are not exp * A call to make a 3-way `tabyl()` now succeeds when the first variable is of class `ordered` (#386) -* If a totals row and/or column is present on a tabyl as a result of `adorn_totals()`, the functions `chisq.test()` and `fisher.test()` drop the totals and print a warning before proceding with the calculations (#385). +* If a totals row and/or column is present on a tabyl as a result of `adorn_totals()`, the functions `chisq.test()` and `fisher.test()` drop the totals and print a warning before proceeding with the calculations (#385). # janitor 2.0.1 (2020-04-12) @@ -276,7 +276,7 @@ This builds on the original functionality of janitor, with similar-but-improved ### A fully-overhauled `tabyl` -`tabyl()` is now a single function that can count combinations of one, two, or three variables, ala base R's `table()`. The resulting `tabyl` data.frames can be manipulated and formatted using a family of `adorn_` functions. See the [tabyls vignette](https://sfirke.github.io/janitor/articles/tabyls.html) for more. +`tabyl()` is now a single function that can count combinations of one, two, or three variables, a la base R's `table()`. The resulting `tabyl` data.frames can be manipulated and formatted using a family of `adorn_` functions. See the [tabyls vignette](https://sfirke.github.io/janitor/articles/tabyls.html) for more. The now-redundant legacy functions `crosstab()` and `adorn_crosstab()` have been deprecated, but remain in the package for now. Existing code that relies on the version of `tabyl` present in janitor versions <= 0.3.1 will break if the `sort` argument was used, as that argument no longer exists in `tabyl` (use `dplyr::arrange()` instead). 
@@ -292,7 +292,7 @@ No further changes are planned to `clean_names()` and its results should be stab ## Major features -- `clean_names()` transliterates accented letters, e.g., `çãüœ` becomes `cauoe` [(#120)](https://github.com/sfirke/janitor/issues/120). Thanks to **@fernandovmacedo**. +- `clean_names()` transliterates accented letters, e.g., `C'C#C%`, `make_clean_names()` allows for more general usage, e.g., on a vector. +Like base R's `make.names()`, but with the styling and case choice of the long-time janitor function `clean_names()`. While `clean_names()` is still offered for use in data.frame pipeline with `%>%`, `make_clean_names()` allows for more general usage, e.g., on a vector. It can also be used as an argument to `.name_repair` in the newest version of `tibble::as_tibble`: ```{r} diff --git a/vignettes/janitor.md b/vignettes/janitor.md index 3627d7b3..2ee3b61f 100644 --- a/vignettes/janitor.md +++ b/vignettes/janitor.md @@ -1,68 +1,45 @@ Overview of janitor functions ================ -2023-02-02 - -- Major functions - - Cleaning - - Clean data.frame names - with clean_names() - - Do those - data.frames actually contain the same columns? 
- - Exploring - - tabyl() - a - better version of table() - - Explore - records with duplicated values for specific combinations of variables - with get_dupes() - - Explore - relationships between columns with get_one_to_one() -- Minor functions - - Cleaning - - Manipulate - vectors of names with make_clean_names() - - Validate - that a column has a single_value() per group - - remove_empty() rows - and columns - - remove_constant() - columns - - Directionally-consistent - rounding behavior with round_half_up() - - Round - decimals to precise fractions of a given denominator with - round_to_fraction() - - Fix - dates stored as serial numbers with - excel_numeric_to_date() - - Convert a - mix of date and datetime formats to date - - Elevate column - names stored in a data.frame row - - Find the - header row buried within a messy data.frame - - Exploring - - Count - factor levels in groups of high, medium, and low with - top_levels() +2024-12-18 + +- [Major functions](#major-functions) + - [Cleaning](#cleaning) + - [Clean data.frame names with + `clean_names()`](#clean-dataframe-names-with-clean_names) + - [Do those data.frames actually contain the same + columns?](#do-those-dataframes-actually-contain-the-same-columns) + - [Exploring](#exploring) + - [`tabyl()` - a better version of + `table()`](#tabyl---a-better-version-of-table) + - [Explore records with duplicated values for specific combinations + of variables with + `get_dupes()`](#explore-records-with-duplicated-values-for-specific-combinations-of-variables-with-get_dupes) + - [Explore relationships between columns with + `get_one_to_one()`](#explore-relationships-between-columns-with-get_one_to_one) +- [Minor functions](#minor-functions) + - [Cleaning](#cleaning-1) + - [Manipulate vectors of names with + `make_clean_names()`](#manipulate-vectors-of-names-with-make_clean_names) + - [Validate that a column has a `single_value()` per + group](#validate-that-a-column-has-a-single_value-per-group) + - 
[`remove_empty()` rows and + columns](#remove_empty-rows-and-columns) + - [`remove_constant()` columns](#remove_constant-columns) + - [Directionally-consistent rounding behavior with + `round_half_up()`](#directionally-consistent-rounding-behavior-with-round_half_up) + - [Round decimals to precise fractions of a given denominator with + `round_to_fraction()`](#round-decimals-to-precise-fractions-of-a-given-denominator-with-round_to_fraction) + - [Fix dates stored as serial numbers with + `excel_numeric_to_date()`](#fix-dates-stored-as-serial-numbers-with-excel_numeric_to_date) + - [Convert a mix of date and datetime formats to + date](#convert-a-mix-of-date-and-datetime-formats-to-date) + - [Elevate column names stored in a data.frame + row](#elevate-column-names-stored-in-a-dataframe-row) + - [Find the header row buried within a messy + data.frame](#find-the-header-row-buried-within-a-messy-dataframe) + - [Exploring](#exploring-1) + - [Count factor levels in groups of high, medium, and low with + `top_levels()`](#count-factor-levels-in-groups-of-high-medium-and-low-with-top_levels) The janitor functions expedite the initial data exploration and cleaning that comes with any new data set. This catalog describes the usage for @@ -78,7 +55,7 @@ Functions for everyday use. Call this function every time you read data. -It works in a `%>%` pipeline, and handles problematic variable names, +It works in a `%>%` pipeline and handles problematic variable names, especially those that are so well-preserved by `readxl::read_excel()` and `readr::read_csv()`. @@ -94,8 +71,10 @@ and `readr::read_csv()`. 
``` r # Create a data.frame with dirty names test_df <- as.data.frame(matrix(ncol = 6)) -names(test_df) <- c("firstName", "ábc@!*", "% successful (2009)", - "REPEAT VALUE", "REPEAT VALUE", "") +names(test_df) <- c( + "firstName", "ábc@!*", "% successful (2009)", + "REPEAT VALUE", "REPEAT VALUE", "" +) ``` Clean the variable names, returning a data.frame: @@ -111,8 +90,8 @@ Compare to what base R produces: ``` r make.names(names(test_df)) -#> [1] "firstName" "ábc..." "X..successful..2009." "REPEAT.VALUE" "REPEAT.VALUE" -#> [6] "X" +#> [1] "firstName" "ábc..." "X..successful..2009." +#> [4] "REPEAT.VALUE" "REPEAT.VALUE" "X" ``` This function is powered by the underlying exported function @@ -229,10 +208,11 @@ sets of one-to-one clusters: ``` r library(dplyr) -starwars[1:4,] %>% +starwars[1:4, ] %>% get_one_to_one() #> [[1]] -#> [1] "name" "height" "mass" "skin_color" "birth_year" "films" +#> [1] "name" "height" "mass" "skin_color" "birth_year" +#> [6] "films" #> #> [[2]] #> [1] "hair_color" "starships" @@ -250,7 +230,7 @@ than the equivalent code they replace. ### Manipulate vectors of names with `make_clean_names()` -Like base R’s `make.names()`, but with the stylings and case choice of +Like base R’s `make.names()`, but with the styling and case choice of the long-time janitor function `clean_names()`. While `clean_names()` is still offered for use in data.frame pipeline with `%>%`, `make_clean_names()` allows for more general usage, e.g., on a vector. 
@@ -273,7 +253,7 @@ tibble::as_tibble(iris, .name_repair = janitor::make_clean_names) #> 8 5 3.4 1.5 0.2 setosa #> 9 4.4 2.9 1.4 0.2 setosa #> 10 4.9 3.1 1.5 0.1 setosa -#> # … with 140 more rows +#> # ℹ 140 more rows ``` ### Validate that a column has a `single_value()` per group @@ -290,7 +270,8 @@ where it should not: ``` r not_one_to_one <- data.frame( X = rep(1:3, each = 2), - Y = c(rep(1:2, each = 2), 1:2)) + Y = c(rep(1:2, each = 2), 1:2) +) not_one_to_one #> X Y @@ -303,12 +284,13 @@ not_one_to_one # throws informative error: try(not_one_to_one %>% - dplyr::group_by(X) %>% - dplyr::mutate( - Z = single_value(Y, info = paste("Calculating Z for group X =", X))) - ) + dplyr::group_by(X) %>% + dplyr::mutate( + Z = single_value(Y, info = paste("Calculating Z for group X =", X)) + )) #> Error in dplyr::mutate(., Z = single_value(Y, info = paste("Calculating Z for group X =", : -#> ℹ In argument: `Z = single_value(Y, info = paste("Calculating Z for group X =", X))`. +#> ℹ In argument: `Z = single_value(Y, info = paste("Calculating Z for +#> group X =", X))`. #> ℹ In group 3: `X = 3`. #> Caused by error in `single_value()`: #> ! More than one (2) value found (1, 2): Calculating Z for group X = 3: Calculating Z for group X = 3 @@ -320,9 +302,11 @@ Does what it says. For cases like cleaning Excel files that contain empty rows and columns after being read into R. ``` r -q <- data.frame(v1 = c(1, NA, 3), - v2 = c(NA, NA, NA), - v3 = c("a", NA, "b")) +q <- data.frame( + v1 = c(1, NA, 3), + v2 = c(NA, NA, NA), + v3 = c("a", NA, "b") +) q %>% remove_empty(c("rows", "cols")) #> v1 v3 @@ -419,8 +403,10 @@ names of the data.frame and optionally (by default) remove the row in which names were stored and/or the rows above it. ``` r -dirt <- data.frame(X_1 = c(NA, "ID", 1:3), - X_2 = c(NA, "Value", 4:6)) +dirt <- data.frame( + X_1 = c(NA, "ID", 1:3), + X_2 = c(NA, "Value", 4:6) +) row_to_names(dirt, 2) #> ID Value @@ -454,7 +440,8 @@ grouped into head/middle/tail groups. 
``` r f <- factor(c("strongly agree", "agree", "neutral", "neutral", "disagree", "strongly agree"), - levels = c("strongly agree", "agree", "neutral", "disagree", "strongly disagree")) + levels = c("strongly agree", "agree", "neutral", "disagree", "strongly disagree") +) top_levels(f) #> f n percent #> strongly agree, agree 3 0.5000000 diff --git a/vignettes/tabyls.md b/vignettes/tabyls.md index ea526931..262a1801 100644 --- a/vignettes/tabyls.md +++ b/vignettes/tabyls.md @@ -254,7 +254,7 @@ humans %>% function or using janitor’s `round_half_up()` to round all ties up ([thanks, StackOverflow](https://stackoverflow.com/a/12688836/4470365)). - - e.g., round 10.5 up to 11, consistent with Excel’s tie-breaking + - e.g., round 10.5 up to 11, consistent with Excel's tie-breaking behavior. - This contrasts with rounding 10.5 down to 10 as in base R’s `round(10.5)`. @@ -263,7 +263,7 @@ humans %>% `adorn_pct_formatting()`; these two functions should not be called together. - **`adorn_ns()`**: add Ns to a tabyl. These can be drawn from the - tabyl’s underlying counts, which are attached to the tabyl as + tabyl's underlying counts, which are attached to the tabyl as metadata, or they can be supplied by the user. - **`adorn_title()`**: add a title to a tabyl (or other data.frame). Options include putting the column title in a new row on top of the @@ -427,7 +427,7 @@ comparison %>% #> Total 100.0% (3,000) 100.0% (3,000) 100.0% (6,000) ``` -Now we format them to insert the thousands commas. A tabyl’s raw Ns are +Now we format them to insert the thousands commas. A tabyl's raw Ns are stored in its `"core"` attribute. Here we retrieve those with `attr()`, then apply the base R function `format()` to all numeric columns. Lastly, we append these Ns using `adorn_ns()`.