Import API with pseudobulk #150

Closed
wants to merge 5 commits into from
9 changes: 5 additions & 4 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: CuratedAtlasQueryR
Title: Queries the Human Cell Atlas
Version: 1.3.7
Version: 1.4.8
Authors@R: c(
person(
"Stefano",
@@ -95,7 +95,9 @@ Imports:
duckdb,
stringr,
checkmate,
stringfish
stringfish,
tidySingleCellExperiment,
tidybulk
Suggests:
zellkonverter,
rmarkdown,
@@ -107,7 +109,6 @@ Suggests:
spelling,
forcats,
ggplot2,
tidySingleCellExperiment,
rprojroot,
openssl,
scuttle,
@@ -126,7 +127,7 @@ biocViews:
Transcription,
Transcriptomics
Encoding: UTF-8
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
LazyDataCompression: xz
URL: https://github.com/stemangiola/CuratedAtlasQueryR
BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -2,6 +2,7 @@

S3method(as.sparse,DelayedMatrix)
export(SAMPLE_DATABASE_URL)
export(calculate_pseudobulk)
export(get_SingleCellExperiment)
export(get_database_url)
export(get_default_cache_dir)
@@ -26,6 +27,7 @@ importFrom(SingleCellExperiment,"reducedDims<-")
importFrom(SingleCellExperiment,SingleCellExperiment)
importFrom(SingleCellExperiment,reducedDims)
importFrom(SingleCellExperiment,rowData)
importFrom(SummarizedExperiment,"assay<-")
importFrom(SummarizedExperiment,"assayNames<-")
importFrom(SummarizedExperiment,"assays<-")
importFrom(SummarizedExperiment,"colData<-")
@@ -83,6 +85,8 @@ importFrom(stringr,str_detect)
importFrom(stringr,str_remove_all)
importFrom(stringr,str_replace_all)
importFrom(tibble,column_to_rownames)
importFrom(tidySingleCellExperiment,aggregate_cells)
importFrom(tidybulk,quantile_normalise_abundance)
importFrom(tools,R_user_dir)
importFrom(utils,head)
importFrom(utils,packageName)
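For context, these NAMESPACE entries are not edited by hand: roxygen2 generates them (hence the RoxygenNote bump in DESCRIPTION) from tags in the R sources. A minimal sketch of the tags that yield the four new lines, mirroring the roxygen header of calculate_pseudobulk() shown further down:

#' @export
#' @importFrom SummarizedExperiment assay<-
#' @importFrom tidySingleCellExperiment aggregate_cells
#' @importFrom tidybulk quantile_normalise_abundance
calculate_pseudobulk <- function(sce_data, cache_dir = get_default_cache_dir()) {
  # body as in R/import_metadata_and_counts.R below
}
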
7 changes: 4 additions & 3 deletions R/counts_per_million.R
@@ -12,15 +12,16 @@
get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {

# Save SCE to the cache directory counts folder
input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir)
input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir, replace=TRUE)

data <- input_sce_obj

# Avoid completely empty cells
col_sums <- colSums(as.matrix(assay(data)))
selected_cols <- which(col_sums >0 & col_sums < Inf)

sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,selected_cols ,drop=FALSE ], assay.type = "X")))
assay_name <- assays(data) |> names()
sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[, selected_cols, drop = FALSE], assay.type = assay_name)))
rownames(sce) <- rownames(data[,selected_cols ])
colnames(sce) <- colnames(data[,selected_cols ])

@@ -38,6 +39,6 @@ get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {
# Check if there is a memory issue
assays(sce) <- assays(sce) |> map(DelayedArray::realize)

sce |> saveHDF5SummarizedExperiment(output_dir)
sce |> saveHDF5SummarizedExperiment(output_dir, replace = TRUE)
}
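
The fix above replaces the hardcoded assay.type = "X" with whatever assay name the input object actually carries. A minimal sketch of the pattern on a toy object (the object and names here are illustrative, not package code):

library(SingleCellExperiment)

# Toy SCE whose single assay is named "X", as in objects converted from AnnData
counts <- matrix(rpois(20, lambda = 5), nrow = 4,
                 dimnames = list(paste0("gene", 1:4), paste0("cell", 1:5)))
sce <- SingleCellExperiment(list(X = counts))

# Detect the assay name instead of assuming "X"
assay_name <- assays(sce) |> names()
cpm <- scuttle::calculateCPM(sce, assay.type = assay_name)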

108 changes: 97 additions & 11 deletions R/import_metadata_and_counts.R
@@ -5,17 +5,21 @@
#' @param cache_dir Optional character vector of length 1. A file path on
#' your local system to a directory (not a file) that will be used to store
#' `metadata.parquet`
#' @param pseudobulk Optional logical. Set to TRUE to generate and import pseudobulk counts;
#' the metadata slot of the object must contain `file_id`, `cell_type_harmonised` and `sample_`
#'
#' @export
#' @return A `metadata.parquet` file stripped from the SingleCellExperiment object.
#' Counts and counts-per-million directories are written to the provided cache directory.
#' @importFrom checkmate check_true check_character check_subset assert
#' @importFrom dplyr select distinct pull
#' @importFrom cli cli_alert_info
#' @importFrom cli cli_alert_info cli_alert_warning
#' @importFrom rlang .data
#' @importFrom SingleCellExperiment reducedDims rowData reducedDims<-
#' @importFrom S4Vectors metadata metadata<-
#' @importFrom SummarizedExperiment assay
#' @importFrom SummarizedExperiment assay assay<-
#' @importFrom stringr str_detect
#' @importFrom HDF5Array saveHDF5SummarizedExperiment
#' @examples
#' data(sample_sce_obj)
#' import_one_sce(sample_sce_obj,
@@ -27,7 +31,8 @@
#' @source [Mangiola et al.,2023](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3)
import_one_sce <- function(
sce_obj,
cache_dir = get_default_cache_dir()
cache_dir = get_default_cache_dir(),
pseudobulk = FALSE
) {
original_dir <- file.path(cache_dir, "original")

@@ -63,6 +68,18 @@ import_one_sce <- function(
reducedDims(sce_obj) <- NULL
}

# Pseudobulk checkpoint
pseudobulk_sample <- c("sample_", "cell_type_harmonised")
if (isTRUE(pseudobulk)) {
assert(
all(pseudobulk_sample %in% (colData(sce_obj) |> colnames())),
"sample_ and cell_type_harmonised columns must be in the SingleCellExperiment colData")

assert(
all(c(pseudobulk_sample, "file_id") %in% names(metadata_tbl)),
"SingleCellExperiment metadata must at least contain sample_, cell_type_harmonised and
file_id for pseudobulk generation"
)
}

# Create original and cpm folders in the cache directory if not exist
if (!dir.exists(original_dir)) {
cache_dir |> file.path("original") |> dir.create(recursive = TRUE)
@@ -73,10 +90,14 @@
}

# Check whether count H5 directory has been generated
all(!file_id_db %in% dir(original_dir)) |>
check_true() |>
assert("The filename for count assay (file_id_db) already exists in the cache directory.")

if (any(file_id_db %in% dir(original_dir))) {
cli_alert_warning(
single_line_str(
"Import API says: The filename for count assay (file_id_db) already exists in the cache directory. "
)
)
}

# Check the metadata contains cell_, file_id_db, sample_ with correct types
check_true("cell_" %in% colnames(metadata_tbl))
check_true("file_id_db" %in% names(metadata_tbl))
@@ -88,9 +109,14 @@

# Check cell_ values are not duplicated when join with parquet
cells <- select(get_metadata(cache_directory = cache_dir), .data$cell_) |> as_tibble()
(!any(metadata_tbl$cell_ %in% cells$cell_)) |>
assert("Cell names (cell_) should not clash with cells that already exist in the atlas.")

if (any(metadata_tbl$cell_ %in% cells$cell_))
cli_alert_warning(
single_line_str(
"Import API says:
Cells in your SingleCellExperiment already exist in the atlas."
)
)

# Check age_days is either -99 or greater than 365
if (any(colnames(metadata_tbl) == "age_days")) {
assert(all(metadata_tbl$age_days==-99 | metadata_tbl$age_days> 365),
@@ -114,9 +140,69 @@

# check metadata sample file ID match the count file ID in cache directory
all(metadata_tbl |> pull(.data$file_id_db) %in% dir(original_dir)) |>
assert("The filename for count assay, which matches the file_id_db column in the metadata, already exists in the cache directory.")
assert("The filename for count assay, which matches the file_id_db column in
the metadata, must exist in the cache directory.")

# convert metadata_tbl to parquet if above checkpoints pass
arrow::write_parquet(metadata_tbl, file.path(cache_dir, "metadata.parquet"))

# Generate pseudobulk
if (isTRUE(pseudobulk)) sce_obj |> calculate_pseudobulk(cache_dir = cache_dir)
}


#' Generate pseudobulk counts and quantile_normalised counts
#' @param sce_data A SingleCellExperiment object loaded from RDS, the metadata slot of which
#' must contain `sample_`, `cell_type_harmonised` and `file_id`
#' @param cache_dir Optional character vector of length 1. A file path on
#' your local system to a directory (not a file) that will be used to store pseudobulk counts
#' @return Pseudobulk counts in `HDF5` format stored in the cache directory
#' @export
#' @importFrom S4Vectors metadata
#' @importFrom dplyr select distinct pull
#' @importFrom cli cli_alert_info cli_alert_warning
#' @importFrom SummarizedExperiment assay assay<- assays
#' @importFrom tidySingleCellExperiment aggregate_cells
#' @importFrom tidybulk quantile_normalise_abundance
#' @importFrom HDF5Array saveHDF5SummarizedExperiment

calculate_pseudobulk <- function(sce_data,
cache_dir = get_default_cache_dir()) {
metadata_tbl <- metadata(sce_data)$data
file_id <- metadata_tbl$file_id |> unique() |> as.character()

if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) {
cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE)
}

if (!dir.exists(file.path(cache_dir, "pseudobulk/quantile_normalised"))) {
cache_dir |> file.path("pseudobulk/quantile_normalised") |> dir.create(recursive = TRUE)
}

cli_alert_info("Generating pseudobulk counts from {file_id}. ")
pseudobulk_counts <- sce_data |> aggregate_cells(c(sample_, cell_type_harmonised))

assay_name <- pseudobulk_counts |> assays() |> names()
normalised_counts_best_distribution <- assay(pseudobulk_counts, assay_name) |> as.matrix() |>
preprocessCore::normalize.quantiles.determine.target()

normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance(
method="preprocesscore_normalize_quantiles_use_target",
target_distribution = normalised_counts_best_distribution
)

assay(normalised_counts, assay_name) <- NULL
names(assays(normalised_counts)) <- "quantile_normalised"

path <- file.path(cache_dir, "pseudobulk")
pseudobulk_counts_path <- file.path(path, "original", basename(file_id))
pseudobulk_qnorm_path <- file.path(path, "quantile_normalised", basename(file_id))

saveHDF5SummarizedExperiment(pseudobulk_counts, pseudobulk_counts_path, replace = TRUE )
saveHDF5SummarizedExperiment(normalised_counts, pseudobulk_qnorm_path , replace = TRUE)

cli_alert_info("Pseudobulk counts are generated in {.path {path}}.")
}
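
For intuition: quantile normalisation maps every sample's counts onto a shared target distribution, which calculate_pseudobulk() derives from the pseudobulk matrix itself. A standalone sketch of the same two preprocessCore steps on a toy matrix (tidybulk's method string above wraps the second call; toy data, not package code):

library(preprocessCore)

# Toy matrix: 4 genes x 3 samples on different scales
m <- matrix(c(1, 5, 9, 20,
              2, 10, 18, 40,
              3, 15, 27, 60), nrow = 4)

# Derive a target distribution, then map each column onto it
target <- normalize.quantiles.determine.target(m)
normalised <- normalize.quantiles.use.target(m, target)

# After normalisation every column shares the same sorted values
apply(normalised, 2, sort)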


Binary file modified data/sample_sce_obj.rda
Binary file not shown.
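
Putting the pieces together, a usage sketch based on the bundled sample object (it mirrors the updated test below; the cache layout is read off the directory paths created in the code above):

library(CuratedAtlasQueryR)

data(sample_sce_obj)
cache <- tempfile()

# Import metadata and counts, then generate pseudobulk in the same call
import_one_sce(sce_obj = sample_sce_obj, cache_dir = cache, pseudobulk = TRUE)

# Expected cache layout:
# <cache>/metadata.parquet                            imported cell metadata
# <cache>/original/<file_id_db>/                      counts (HDF5)
# <cache>/cpm/<file_id_db>/                           counts per million (HDF5)
# <cache>/pseudobulk/original/<file_id>/              pseudobulk counts (HDF5)
# <cache>/pseudobulk/quantile_normalised/<file_id>/   quantile-normalised pseudobulk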
9 changes: 8 additions & 1 deletion man/import_one_sce.Rd


15 changes: 12 additions & 3 deletions tests/testthat/test-query.R
@@ -203,17 +203,27 @@ test_that("get_metadata() expect a unique cell_type `b` is present, which comes
expect_true(n_cell > 0)
})

test_that("import_one_sce() loads metadata from a SingleCellExperiment object into a parquet file", {
test_that("import_one_sce() loads metadata from a SingleCellExperiment object into a parquet file and generates pseudobulk", {
# Test both functionalities together because, if we import them independently,
# the sample data is loaded into the cache, which causes the second import to fail the unique file check
data(sample_sce_obj)
temp <- tempfile()
dataset_id <- "GSE122999"
import_one_sce(sce_obj = sample_sce_obj,
cache_dir = temp)
cache_dir = temp,
pseudobulk = TRUE)

dataset_id %in% (get_metadata(cache_directory = temp) |>
dplyr::distinct(dataset_id) |>
dplyr::pull()) |>
expect(failure_message = "The correct metadata was not created")

sme <- get_metadata(cache_directory = temp) |> filter(file_id == "id1") |>
get_pseudobulk(cache_directory = file.path(temp, "pseudobulk"))
sme |>
row.names() |>
length() |>
expect_gt(1)
})

test_that("get_pseudobulk() syncs appropriate files", {
@@ -253,4 +263,3 @@ test_that("get_pseudobulk() syncs appropriate fixed file", {
expect_gt(1)
})
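
Outside the test suite, a stored pseudobulk object can be read back with HDF5Array's loading counterpart to saveHDF5SummarizedExperiment(); a quick sketch, assuming the cache written in the test above:

library(HDF5Array)

# Load the pseudobulk counts for file_id "id1" straight from the cache
pb <- loadHDF5SummarizedExperiment(
  file.path(temp, "pseudobulk", "original", "id1")
)

# One column per sample_ x cell_type_harmonised combination
dim(pb)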

