Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a fast approximation for simple surveys #55

Merged
merged 1 commit into from
Oct 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,17 @@ Imports:
survey
Suggests:
testthat
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
Collate:
'anthro-package.R'
'utils.R'
'prevalence-survey.R'
'prevalence-simple.R'
'assertions.R'
'prevalence.R'
'z-score-helper.R'
'api.R'
'prevalence-compute-api.R'
'z-score-arm-circumference-for-age.R'
'z-score-bmi-for-age.R'
'z-score-head-circumference-for-age.R'
Expand Down
16 changes: 16 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,14 +1,30 @@
# Generated by roxygen2: do not edit by hand

S3method(column_names,simple_design)
S3method(column_names,survey_design)
S3method(column_values,simple_design)
S3method(column_values,survey_design)
S3method(compute_prevalence_estimates_for_column_by,simple_design)
S3method(compute_prevalence_estimates_for_column_by,survey_design)
S3method(compute_prevalence_sample_size_by,simple_design)
S3method(compute_prevalence_sample_size_by,survey_design)
S3method(compute_prevalence_zscore_summaries_by,simple_design)
S3method(compute_prevalence_zscore_summaries_by,survey_design)
export(anthro_api_compute_prevalence)
export(anthro_api_compute_zscore)
export(anthro_api_compute_zscore_adjusted)
export(anthro_api_standardize_oedema_var)
export(anthro_api_standardize_sex_var)
export(anthro_prevalence)
export(anthro_zscores)
importFrom(stats,aggregate)
importFrom(stats,as.formula)
importFrom(stats,confint)
importFrom(stats,glm)
importFrom(stats,plogis)
importFrom(stats,qt)
importFrom(stats,quasibinomial)
importFrom(stats,sd)
importFrom(stats,setNames)
importFrom(survey,degf)
importFrom(survey,svyby)
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# anthro (development version)

## Performance

* `anthro_prevalence` now uses a faster computation method in the special
  case where `cluster` and `strata` are `NULL` and `sw` is `NULL` or 1.

## Bugfix

* Fixed a bug during prevalence computation when a value used as a group
Expand Down
40 changes: 40 additions & 0 deletions R/prevalence-compute-api.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#' In order to support different types of computation for special cases of the
#' data we define here a compute API that different backends can hook into.
#' @noRd
NULL

#' S3 entry point: computes the sample size per indicator and subset.
#' Backends implement this for their design class.
#' @noRd
compute_prevalence_sample_size_by <- function(data,
                                              indicator,
                                              subset_col_name) {
  UseMethod("compute_prevalence_sample_size_by")
}

#' S3 entry point: computes prevalence rate estimates for a given
#' indicator, subset and cutoff column.
#' @noRd
compute_prevalence_estimates_for_column_by <- function(data,
                                                       indicator_name,
                                                       subset_col_name,
                                                       prev_col_name) {
  UseMethod("compute_prevalence_estimates_for_column_by")
}

#' S3 entry point: computes z-score summary statistics per indicator
#' and subset.
#' @noRd
compute_prevalence_zscore_summaries_by <- function(data,
                                                   indicator,
                                                   subset_col_name) {
  UseMethod("compute_prevalence_zscore_summaries_by")
}

#' S3 entry point: returns the values of a single column of the
#' design's underlying data.
#' @noRd
column_values <- function(x, col) {
  UseMethod("column_values")
}

#' S3 entry point: returns all column names of the design's
#' underlying data.
#' @noRd
column_names <- function(x) {
  UseMethod("column_names")
}
168 changes: 168 additions & 0 deletions R/prevalence-simple.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#' The following is a fast approximation of what the survey package
#' computes given cluster/strata/sw is NULL.
#'
#' T. Lumley (2024) "survey: analysis of complex survey samples".
#' R package version 4.4.
#'
#' The speedup and memory savings can be substantial, in particular for very
#' large surveys with more than 100,000 observations.
#'
#' The "simple" here refers to the absence of cluster/strata/sw, which makes
#' the structure of the computation simpler.
#' @noRd

#' Column names of the data stored in a simple design.
#' @export
column_names.simple_design <- function(x) {
  underlying_data <- x$data
  colnames(underlying_data)
}

#' Values of a single column of the data stored in a simple design.
#' @export
column_values.simple_design <- function(x, col) {
  underlying_data <- x$data
  underlying_data[[col]]
}

#' Z-score summaries (mean, se, CI, sd) per subset for a simple design.
#' @export
compute_prevalence_zscore_summaries_by.simple_design <- function(
    data, indicator, subset_col_name) {
  # A single row of NAs is used for subsets without any observations
  na_prototype <- data.frame(
    r = NA_real_,
    se = NA_real_,
    ll = NA_real_,
    ul = NA_real_,
    stdev = NA_real_
  )
  compute_and_aggregate(
    data,
    prev_zscore_value_column(indicator),
    subset_col_name,
    compute = zscore_estimate,
    empty_data_prototype = na_prototype
  )
}

#' Prevalence rate estimates (with logit-scale CI) per subset for a
#' simple design.
#' @export
compute_prevalence_estimates_for_column_by.simple_design <- function(
    data, indicator_name, subset_col_name, prev_col_name) {
  # A single row of NAs is used for subsets without any observations
  na_prototype <- data.frame(
    r = NA_real_,
    se = NA_real_,
    ll = NA_real_,
    ul = NA_real_
  )
  compute_and_aggregate(
    data,
    prev_col_name,
    subset_col_name,
    compute = logit_rate_estimate,
    empty_data_prototype = na_prototype
  )
}

#' Sample sizes per subset for a simple design.
#' @export
compute_prevalence_sample_size_by.simple_design <- function(
    data, indicator, subset_col_name) {
  # Empty subsets count as zero observations, weighted and unweighted
  zero_prototype <- data.frame(
    pop = 0,
    unwpop = 0
  )
  compute_and_aggregate(
    data,
    prev_prevalence_column_name(indicator),
    subset_col_name,
    compute = sample_size,
    empty_data_prototype = zero_prototype
  )
}

#' Applies `compute` to `value_column` within each level of `subset_column`
#' and binds the per-group result data frames into one data frame with a
#' leading `Group` column. Groups with no data yield `empty_data_prototype`.
#' @noRd
#' @importFrom stats aggregate
compute_and_aggregate <- function(
    data, value_column, subset_column, compute, empty_data_prototype) {
  observed <- column_values(data, value_column)
  total_n <- length(observed)
  # drop = FALSE keeps factor levels without observations; their result is
  # NULL and replaced by the prototype below
  aggregated <- aggregate(
    observed,
    by = list(Group = column_values(data, subset_column)),
    FUN = compute,
    simplify = FALSE,
    drop = FALSE,
    N = total_n,
    empty_data_prototype = empty_data_prototype
  )
  Group <- aggregated$Group
  per_group <- lapply(aggregated$x, function(result) {
    if (is.null(result)) {
      return(empty_data_prototype)
    }
    result
  })
  cbind(Group, do.call(rbind, per_group))
}

#' Estimates a prevalence rate (in percent) together with a confidence
#' interval built on the logit scale, approximating the survey package
#' result for a design without cluster/strata/weights.
#'
#' @param x vector of 0/1 indicator values; NAs are dropped for the point
#'   estimate, but the unfiltered vector is handed to glm (which applies
#'   its default NA handling)
#' @param N total number of observations in the full data set; used for
#'   the N / (N - 1) scale factor and the t degrees of freedom
#' @param empty_data_prototype returned unchanged when all values are NA
#' @return one-row data.frame with r, se, ll, ul in percentage points
#' @noRd
#' @importFrom stats glm plogis qt quasibinomial
logit_rate_estimate <- function(x, N, empty_data_prototype) {
  # keep the unfiltered input: the glm call below receives it as-is
  x_orig <- x
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(empty_data_prototype)
  }
  n <- length(x)
  # proportion of 1s among the non-missing observations
  r <- mean(x == 1)
  se <- sample_se(x, r, n, N)

  # Degenerate rates (all 0 or all 1) get a point-mass interval; otherwise
  # the interval is computed on the logit scale and mapped back through
  # plogis so the bounds stay within [0, 1].
  cis <- if (r == 1) {
    c(1, 1)
  } else if (r == 0) {
    c(0, 0)
  } else {
    # intercept-only quasibinomial model: the intercept is logit(r)
    glm_model <- glm(x_orig ~ 1, family = quasibinomial(link = "logit"))
    summary_model <- summary(glm_model)
    # unscaled variance of the intercept estimate
    r_var <- summary_model$cov.unscaled[1, 1]
    logit_r <- summary_model$coefficients[1, "Estimate"]
    # finite-sample correction factor
    scale <- N / (N - 1)
    se_logit <- sqrt(scale * r_var)
    # prevalence_significance_level is a package-level constant defined
    # elsewhere in the package
    t_value <- qt(1 - prevalence_significance_level / 2, df = N - 1)
    cis <- plogis(
      logit_r + se_logit * c(-t_value, t_value)
    )
    as.numeric(cis)
  }

  # results are reported in percentage points
  data.frame(
    r = r * 100,
    se = se * 100,
    ll = cis[1L] * 100,
    ul = cis[2L] * 100
  )
}

#' Mean z-score with standard deviation and a symmetric t-based
#' confidence interval; NAs are dropped first.
#' @noRd
#' @importFrom stats sd plogis qt
zscore_estimate <- function(x, N, empty_data_prototype) {
  observed <- x[!is.na(x)]
  if (length(observed) == 0) {
    return(empty_data_prototype)
  }
  sample_mean <- mean(observed)
  sample_sd <- sd(observed)
  std_err <- sample_se(observed, sample_mean, length(observed), N)
  t_value <- qt(1 - prevalence_significance_level / 2, N - 1)
  half_width <- std_err * t_value
  data.frame(
    r = sample_mean,
    se = std_err,
    ll = sample_mean - half_width,
    ul = sample_mean + half_width,
    stdev = sample_sd
  )
}

#' Number of non-missing observations, reported as both the weighted and
#' the unweighted population size (identical here: no sampling weights).
#' N and empty_data_prototype are accepted for interface compatibility
#' with the other compute functions but are unused.
#' @noRd
sample_size <- function(x, N, empty_data_prototype) {
  non_missing <- as.numeric(sum(!is.na(x)))
  data.frame(pop = non_missing, unwpop = non_missing)
}

#' Standard error of the sample mean of x, using the already-computed
#' mean x_mean and sample size n, with an N / (N - 1) finite-sample
#' correction based on the full data size N.
#' @noRd
sample_se <- function(x, x_mean, n, N) {
  correction <- sqrt(N / (N - 1))
  deviations <- (x - x_mean) / n * correction
  sqrt(sum(deviations * deviations))
}
Loading
Loading