Import API with pseudobulk #150

Closed
wants to merge 5 commits into from
9 changes: 5 additions & 4 deletions DESCRIPTION
@@ -1,7 +1,7 @@
Type: Package
Package: CuratedAtlasQueryR
Title: Queries the Human Cell Atlas
Version: 1.3.7
Version: 1.4.8
Authors@R: c(
person(
"Stefano",
@@ -95,7 +95,9 @@ Imports:
duckdb,
stringr,
checkmate,
stringfish
stringfish,
tidySingleCellExperiment,
tidybulk
Suggests:
zellkonverter,
rmarkdown,
@@ -107,7 +109,6 @@ Suggests:
spelling,
forcats,
ggplot2,
tidySingleCellExperiment,
rprojroot,
openssl,
scuttle,
@@ -126,7 +127,7 @@ biocViews:
Transcription,
Transcriptomics
Encoding: UTF-8
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
LazyDataCompression: xz
URL: https://github.com/stemangiola/CuratedAtlasQueryR
BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues
4 changes: 4 additions & 0 deletions NAMESPACE
@@ -2,6 +2,7 @@

S3method(as.sparse,DelayedMatrix)
export(SAMPLE_DATABASE_URL)
export(calculate_pseudobulk)
export(get_SingleCellExperiment)
export(get_database_url)
export(get_default_cache_dir)
@@ -26,6 +27,7 @@ importFrom(SingleCellExperiment,"reducedDims<-")
importFrom(SingleCellExperiment,SingleCellExperiment)
importFrom(SingleCellExperiment,reducedDims)
importFrom(SingleCellExperiment,rowData)
importFrom(SummarizedExperiment,"assay<-")
importFrom(SummarizedExperiment,"assayNames<-")
importFrom(SummarizedExperiment,"assays<-")
importFrom(SummarizedExperiment,"colData<-")
@@ -83,6 +85,8 @@ importFrom(stringr,str_detect)
importFrom(stringr,str_remove_all)
importFrom(stringr,str_replace_all)
importFrom(tibble,column_to_rownames)
importFrom(tidySingleCellExperiment,aggregate_cells)
importFrom(tidybulk,quantile_normalise_abundance)
importFrom(tools,R_user_dir)
importFrom(utils,head)
importFrom(utils,packageName)
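For context, these NAMESPACE entries are not edited by hand: roxygen2 generates them (hence the RoxygenNote bump in DESCRIPTION) from tags in the R sources. A minimal sketch of the tags that yield the four new lines, mirroring the roxygen header of calculate_pseudobulk() shown further down:

#' @export
#' @importFrom SummarizedExperiment assay<-
#' @importFrom tidySingleCellExperiment aggregate_cells
#' @importFrom tidybulk quantile_normalise_abundance
calculate_pseudobulk <- function(sce_data, cache_dir = get_default_cache_dir()) {
  # body as in R/import_metadata_and_counts.R below
}
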
7 changes: 4 additions & 3 deletions R/counts_per_million.R
@@ -12,15 +12,16 @@
get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {

# Save SCE to the cache directory counts folder
input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir)
input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir, replace=TRUE)

data <- input_sce_obj

# Avoid completely empty cells
col_sums <- colSums(as.matrix(assay(data)))
selected_cols <- which(col_sums >0 & col_sums < Inf)

sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[,selected_cols ,drop=FALSE ], assay.type = "X")))
assay_name <- assays(data) |> names()
sce <- SingleCellExperiment(list(counts_per_million = scuttle::calculateCPM(data[, selected_cols, drop = FALSE], assay.type = assay_name)))
rownames(sce) <- rownames(data[,selected_cols ])
colnames(sce) <- colnames(data[,selected_cols ])

@@ -38,6 +39,6 @@ get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {
# Check if there is a memory issue
assays(sce) <- assays(sce) |> map(DelayedArray::realize)

sce |> saveHDF5SummarizedExperiment(output_dir)
sce |> saveHDF5SummarizedExperiment(output_dir, replace = TRUE)
}
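
The fix above replaces the hardcoded assay.type = "X" with whatever assay name the input object actually carries. A minimal sketch of the pattern on a toy object (the object and names here are illustrative, not package code):

library(SingleCellExperiment)

# Toy SCE whose single assay is named "X", as in objects converted from AnnData
counts <- matrix(rpois(20, lambda = 5), nrow = 4,
                 dimnames = list(paste0("gene", 1:4), paste0("cell", 1:5)))
sce <- SingleCellExperiment(list(X = counts))

# Detect the assay name instead of assuming "X"
assay_name <- assays(sce) |> names()
cpm <- scuttle::calculateCPM(sce, assay.type = assay_name)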

108 changes: 97 additions & 11 deletions R/import_metadata_and_counts.R
@@ -5,17 +5,21 @@
#' @param cache_dir Optional character vector of length 1. A file path on
#' your local system to a directory (not a file) that will be used to store
#' `metadata.parquet`
#' @param pseudobulk Optional logical. Set to TRUE to generate and import pseudobulk counts;
#' the metadata slot of the object must contain `file_id`, `cell_type_harmonised` and `sample_`
#'
#' @export
#' @return A `metadata.parquet` file stripped from the SingleCellExperiment object.
#' Counts and counts-per-million directories are written to the provided cache directory.
#' @importFrom checkmate check_true check_character check_subset assert
#' @importFrom dplyr select distinct pull
#' @importFrom cli cli_alert_info
#' @importFrom cli cli_alert_info cli_alert_warning
#' @importFrom rlang .data
#' @importFrom SingleCellExperiment reducedDims rowData reducedDims<-
#' @importFrom S4Vectors metadata metadata<-
#' @importFrom SummarizedExperiment assay
#' @importFrom SummarizedExperiment assay assay<-
#' @importFrom stringr str_detect
#' @importFrom HDF5Array saveHDF5SummarizedExperiment
#' @examples
#' data(sample_sce_obj)
#' import_one_sce(sample_sce_obj,
@@ -27,7 +31,8 @@
#' @source [Mangiola et al.,2023](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3)
import_one_sce <- function(
sce_obj,
cache_dir = get_default_cache_dir()
cache_dir = get_default_cache_dir(),
pseudobulk = FALSE
) {
original_dir <- file.path(cache_dir, "original")

@@ -63,6 +68,18 @@ import_one_sce <- function(
reducedDims(sce_obj) <- NULL
}

# Pseudobulk checkpoint
pseudobulk_sample <- c("sample_", "cell_type_harmonised")
if (isTRUE(pseudobulk)) {
assert(
all(pseudobulk_sample %in% (colData(sce_obj) |> colnames())),
"sample_ and cell_type_harmonised columns must be in the SingleCellExperiment colData")

assert(
all(c(pseudobulk_sample, "file_id") %in% names(metadata_tbl)),
"SingleCellExperiment metadata must at least contain sample_, cell_type_harmonised and
file_id for pseudobulk generation"
)
}

# Create original and cpm folders in the cache directory if not exist
if (!dir.exists(original_dir)) {
cache_dir |> file.path("original") |> dir.create(recursive = TRUE)
@@ -73,10 +90,14 @@
}

# Check whether count H5 directory has been generated
all(!file_id_db %in% dir(original_dir)) |>
check_true() |>
assert("The filename for count assay (file_id_db) already exists in the cache directory.")

if (any(file_id_db %in% dir(original_dir))) {
cli_alert_warning(
single_line_str(
"Import API says: The filename for count assay (file_id_db) already exists in the cache directory. "
)
)
}

# Check the metadata contains cell_, file_id_db, sample_ with correct types
check_true("cell_" %in% colnames(metadata_tbl))
check_true("file_id_db" %in% names(metadata_tbl))
@@ -88,9 +109,14 @@

# Check cell_ values are not duplicated when join with parquet
cells <- select(get_metadata(cache_directory = cache_dir), .data$cell_) |> as_tibble()
(!any(metadata_tbl$cell_ %in% cells$cell_)) |>
assert("Cell names (cell_) should not clash with cells that already exist in the atlas.")

if (any(metadata_tbl$cell_ %in% cells$cell_))
cli_alert_warning(
single_line_str(
"Import API says:
Cells in your SingleCellExperiment already exist in the atlas."
)
)

# Check age_days is either -99 or greater than 365
if (any(colnames(metadata_tbl) == "age_days")) {
assert(all(metadata_tbl$age_days==-99 | metadata_tbl$age_days> 365),
@@ -114,9 +140,69 @@

# check metadata sample file ID match the count file ID in cache directory
all(metadata_tbl |> pull(.data$file_id_db) %in% dir(original_dir)) |>
assert("The filename for count assay, which matches the file_id_db column in the metadata, already exists in the cache directory.")
assert("The filename for count assay, which matches the file_id_db column in
the metadata, must exist in the cache directory.")

# convert metadata_tbl to parquet if above checkpoints pass
arrow::write_parquet(metadata_tbl, file.path(cache_dir, "metadata.parquet"))

# Generate pseudobulk
if (isTRUE(pseudobulk)) sce_obj |> calculate_pseudobulk(cache_dir = cache_dir)
}


#' Generate pseudobulk counts and quantile_normalised counts
#' @param sce_data A SingleCellExperiment object loaded from RDS, the metadata slot of which
#' must contain `sample_`, `cell_type_harmonised` and `file_id`
#' @param cache_dir Optional character vector of length 1. A file path on
#' your local system to a directory (not a file) that will be used to store pseudobulk counts
#' @return Pseudobulk counts in `HDF5` format stored in the cache directory
#' @export
#' @importFrom S4Vectors metadata
#' @importFrom dplyr select distinct pull
#' @importFrom cli cli_alert_info cli_alert_warning
#' @importFrom SummarizedExperiment assay assay<- assays
#' @importFrom tidySingleCellExperiment aggregate_cells
#' @importFrom tidybulk quantile_normalise_abundance
#' @importFrom HDF5Array saveHDF5SummarizedExperiment

calculate_pseudobulk <- function(sce_data,
cache_dir = get_default_cache_dir()) {
metadata_tbl <- metadata(sce_data)$data
file_id <- metadata_tbl$file_id |> unique() |> as.character()

if (!dir.exists(file.path(cache_dir, "pseudobulk/original"))) {
cache_dir |> file.path("pseudobulk/original") |> dir.create(recursive = TRUE)
}

if (!dir.exists(file.path(cache_dir, "pseudobulk/quantile_normalised"))) {
cache_dir |> file.path("pseudobulk/quantile_normalised") |> dir.create(recursive = TRUE)
}

cli_alert_info("Generating pseudobulk counts from {file_id}. ")
pseudobulk_counts <- sce_data |> aggregate_cells(c(sample_, cell_type_harmonised))

assay_name <- pseudobulk_counts |> assays() |> names()
normalised_counts_best_distribution <- assay(pseudobulk_counts, assay_name) |> as.matrix() |>
preprocessCore::normalize.quantiles.determine.target()

normalised_counts <- pseudobulk_counts |> quantile_normalise_abundance(
method="preprocesscore_normalize_quantiles_use_target",
target_distribution = normalised_counts_best_distribution
)

assay(normalised_counts, assay_name) <- NULL
names(assays(normalised_counts)) <- "quantile_normalised"

path <- file.path(cache_dir, "pseudobulk")
pseudobulk_counts_path <- file.path(path, "original", basename(file_id))
pseudobulk_qnorm_path <- file.path(path, "quantile_normalised", basename(file_id))

saveHDF5SummarizedExperiment(pseudobulk_counts, pseudobulk_counts_path, replace = TRUE )
saveHDF5SummarizedExperiment(normalised_counts, pseudobulk_qnorm_path , replace = TRUE)

cli_alert_info("Pseudobulk counts are generated in {.path {path}}.")
}
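
For intuition: quantile normalisation maps every sample's counts onto a shared target distribution, which calculate_pseudobulk() derives from the pseudobulk matrix itself. A standalone sketch of the same two preprocessCore steps on a toy matrix (tidybulk's method string above wraps the second call; toy data, not package code):

library(preprocessCore)

# Toy matrix: 4 genes x 3 samples on different scales
m <- matrix(c(1, 5, 9, 20,
              2, 10, 18, 40,
              3, 15, 27, 60), nrow = 4)

# Derive a target distribution, then map each column onto it
target <- normalize.quantiles.determine.target(m)
normalised <- normalize.quantiles.use.target(m, target)

# After normalisation every column shares the same sorted values
apply(normalised, 2, sort)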


Binary file modified data/sample_sce_obj.rda
Binary file not shown.
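
Putting the pieces together, a usage sketch based on the bundled sample object (it mirrors the updated test below; the cache layout is read off the directory paths created in the code above):

library(CuratedAtlasQueryR)

data(sample_sce_obj)
cache <- tempfile()

# Import metadata and counts, then generate pseudobulk in the same call
import_one_sce(sce_obj = sample_sce_obj, cache_dir = cache, pseudobulk = TRUE)

# Expected cache layout:
# <cache>/metadata.parquet                            imported cell metadata
# <cache>/original/<file_id_db>/                      counts (HDF5)
# <cache>/cpm/<file_id_db>/                           counts per million (HDF5)
# <cache>/pseudobulk/original/<file_id>/              pseudobulk counts (HDF5)
# <cache>/pseudobulk/quantile_normalised/<file_id>/   quantile-normalised pseudobulk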
9 changes: 8 additions & 1 deletion man/import_one_sce.Rd


15 changes: 12 additions & 3 deletions tests/testthat/test-query.R
@@ -203,17 +203,27 @@ test_that("get_metadata() expect a unique cell_type `b` is present, which comes
expect_true(n_cell > 0)
})

test_that("import_one_sce() loads metadata from a SingleCellExperiment object into a parquet file", {
test_that("import_one_sce() loads metadata from a SingleCellExperiment object into a parquet file and generates pseudobulk", {
# Test both functionalities together because, if we import them independently,
# the sample data is loaded into the cache, which causes the second import to fail the unique file check
data(sample_sce_obj)
temp <- tempfile()
dataset_id <- "GSE122999"
import_one_sce(sce_obj = sample_sce_obj,
cache_dir = temp)
cache_dir = temp,
pseudobulk = TRUE)

dataset_id %in% (get_metadata(cache_directory = temp) |>
dplyr::distinct(dataset_id) |>
dplyr::pull()) |>
expect(failure_message = "The correct metadata was not created")

sme <- get_metadata(cache_directory = temp) |> filter(file_id == "id1") |>
get_pseudobulk(cache_directory = file.path(temp, "pseudobulk"))
sme |>
row.names() |>
length() |>
expect_gt(1)
})

test_that("get_pseudobulk() syncs appropriate files", {
@@ -253,4 +263,3 @@ test_that("get_pseudobulk() syncs appropriate fixed file", {
expect_gt(1)
})
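
Outside the test suite, a stored pseudobulk object can be read back with HDF5Array's loading counterpart to saveHDF5SummarizedExperiment(); a quick sketch, assuming the cache written in the test above:

library(HDF5Array)

# Load the pseudobulk counts for file_id "id1" straight from the cache
pb <- loadHDF5SummarizedExperiment(
  file.path(temp, "pseudobulk", "original", "id1")
)

# One column per sample_ x cell_type_harmonised combination
dim(pb)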

