Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

import api #134

Merged
merged 44 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
9a2b287
export default_cache function, add clear_cache function
myushen Feb 28, 2024
94a6edc
check metadata, gene names, positive counts, file_id
myushen Mar 1, 2024
670c10f
generate cpm from original, check subdir number match
myushen Mar 1, 2024
c816770
check metadata input has minimum set of column names and types
myushen Mar 1, 2024
1b0d608
add check age, sex, minimum sets of input columns
myushen Mar 4, 2024
7d264f3
upload meta, counts to cloud
myushen Mar 4, 2024
47f237a
add documentation
myushen Mar 4, 2024
fd38b9e
Revert "upload meta, counts to cloud"
myushen Mar 4, 2024
0e4123f
add function `import_metadata_counts`
myushen Mar 4, 2024
0e5e27c
fix
myushen Mar 5, 2024
02d4511
try fix code suggested by knitr error
myushen Mar 5, 2024
e802374
Revert "try fix code suggested by knitr error"
myushen Mar 5, 2024
1969796
remove meta_output argument, add counts_path argument
myushen Mar 6, 2024
bf8aa68
add counts_per_million function
myushen Mar 6, 2024
56c32ea
fix error
myushen Mar 6, 2024
593dcc7
fix memory issue in counts_per_million
myushen Mar 6, 2024
815648c
fix documentation
myushen Mar 6, 2024
e7e9047
add check: cells_ cannot be duplicates when join with api
myushen Mar 6, 2024
62525e5
add checkpoint, cell_ values in metadata must be unique
myushen Mar 7, 2024
1fdea73
optimise code and comments
myushen Mar 8, 2024
6aa3380
fix tiny issue
myushen Mar 8, 2024
7ba0f25
assert_that
myushen Mar 8, 2024
c9e0d8e
read rds, assign matrix for counts data, save HDF5
myushen Mar 12, 2024
944535d
accept sce RDS as input
myushen Mar 12, 2024
1c4aa79
accept count_path_df.rds as an input
myushen Mar 13, 2024
f8bda0b
roxygenise
myushen Mar 13, 2024
8e34280
fix a missing variable
myushen Mar 13, 2024
bdf294b
tiny fix
myushen Mar 13, 2024
65b8975
checking whether gene set of new count data and old count data match
myushen Mar 13, 2024
77d0b68
change function input, accept sce rds and cache dir only
myushen Mar 14, 2024
8101da2
fix a variable
myushen Mar 14, 2024
2ab8a45
save created file_id_db column in metadata slot to rds
myushen Mar 14, 2024
97cbfa5
fix variable
myushen Mar 14, 2024
731e877
accept a loaded sce_obj as input
myushen Mar 15, 2024
90fef3a
checking whether genes in a list of SCEs overlap
myushen Mar 18, 2024
5a6dad3
use gene intersection if genes across SCEs do not completely overlap
myushen Mar 18, 2024
15628a4
move check_gene_overlap function to counts.R
myushen Mar 19, 2024
8e2c89c
optimise counts_per_million.R. interact import API with get_SCE
myushen Mar 19, 2024
2792f11
change function documentation
myushen Mar 19, 2024
1209f7c
fix assert_that
myushen Mar 19, 2024
1ddec5e
update Rd and local checks
myushen Mar 19, 2024
883402b
export necessary functions. create sample_sce_obj for checking purpose
myushen Mar 20, 2024
506ee5a
Apply suggestions from code review
myushen Mar 22, 2024
b6ee0b2
add suggestions and optimise code from code review
myushen Mar 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ Imports:
utils,
dbplyr (>= 2.3.0),
duckdb,
stringr
stringr,
checkmate
Suggests:
zellkonverter,
rmarkdown,
Expand All @@ -106,7 +107,10 @@ Suggests:
forcats,
ggplot2,
tidySingleCellExperiment,
rprojroot
rprojroot,
openssl,
scuttle,
DelayedArray
Biarch: true
biocViews:
AssayDomain,
Expand All @@ -130,9 +134,13 @@ Roxygen: list(markdown = TRUE)
Collate:
'utils.R'
'counts.R'
'counts_per_million.R'
'data.R'
'dev.R'
'import_metadata_and_counts.R'
'metadata.R'
'seurat.R'
'unharmonised.R'
'zzz.R'
Language: en-US
LazyData: true
17 changes: 17 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,40 @@ S3method(as.sparse,DelayedMatrix)
export(SAMPLE_DATABASE_URL)
export(get_SingleCellExperiment)
export(get_database_url)
export(get_default_cache_dir)
export(get_metadata)
export(get_seurat)
export(get_single_cell_experiment)
export(get_unharmonised_metadata)
export(import_metadata_counts)
importFrom(BiocGenerics,cbind)
importFrom(DBI,dbConnect)
importFrom(DBI,dbDisconnect)
importFrom(HDF5Array,HDF5RealizationSink)
importFrom(HDF5Array,loadHDF5SummarizedExperiment)
importFrom(HDF5Array,saveHDF5SummarizedExperiment)
importFrom(S4Vectors,"metadata<-")
importFrom(S4Vectors,DataFrame)
importFrom(S4Vectors,metadata)
importFrom(Seurat,as.SingleCellExperiment)
importFrom(SeuratObject,as.Seurat)
importFrom(SeuratObject,as.sparse)
importFrom(SingleCellExperiment,"reducedDims<-")
importFrom(SingleCellExperiment,SingleCellExperiment)
importFrom(SingleCellExperiment,combineCols)
importFrom(SingleCellExperiment,reducedDims)
importFrom(SingleCellExperiment,rowData)
importFrom(SummarizedExperiment,"assayNames<-")
importFrom(SummarizedExperiment,"assays<-")
importFrom(SummarizedExperiment,"colData<-")
importFrom(SummarizedExperiment,assay)
importFrom(SummarizedExperiment,assays)
importFrom(SummarizedExperiment,colData)
importFrom(assertthat,assert_that)
importFrom(assertthat,has_name)
importFrom(checkmate,check_character)
importFrom(checkmate,check_subset)
importFrom(checkmate,check_true)
importFrom(cli,cli_abort)
importFrom(cli,cli_alert_info)
importFrom(cli,cli_alert_success)
Expand All @@ -33,11 +47,13 @@ importFrom(dbplyr,remote_con)
importFrom(dbplyr,sql)
importFrom(dplyr,as_tibble)
importFrom(dplyr,collect)
importFrom(dplyr,distinct)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,inner_join)
importFrom(dplyr,mutate)
importFrom(dplyr,pull)
importFrom(dplyr,select)
importFrom(dplyr,summarise)
importFrom(dplyr,tbl)
importFrom(dplyr,transmute)
Expand All @@ -64,6 +80,7 @@ importFrom(purrr,set_names)
importFrom(purrr,walk)
importFrom(rlang,.data)
importFrom(stats,setNames)
importFrom(stringr,str_detect)
importFrom(stringr,str_remove_all)
importFrom(stringr,str_replace_all)
importFrom(tibble,column_to_rownames)
Expand Down
202 changes: 113 additions & 89 deletions R/counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,94 +85,99 @@ get_single_cell_experiment <- function(
repository = COUNTS_URL,
features = NULL
) {
# Parameter validation

assays %in% names(assay_map) |>
all() |>
assert_that(
msg = 'assays must be a character vector containing "counts" and/or
"cpm"'
)
assert_that(
!anyDuplicated(assays),
inherits(cache_directory, "character"),
is.null(repository) || is.character(repository),
is.null(features) || is.character(features)
)

# Data parameter validation (last, because it's slower)
## Evaluate the promise now so that we get a sensible error message
force(data)
## We have to convert to an in-memory table here, or some of the dplyr
## operations will fail when passed a database connection
cli_alert_info("Realising metadata.")
raw_data <- collect(data)
# Parameter validation
assays %in% names(assay_map) |>
all() |>
assert_that(
inherits(raw_data, "tbl"),
has_name(raw_data, c("cell_", "file_id_db"))
)

versioned_cache_directory <- cache_directory
versioned_cache_directory |> dir.create(
showWarnings = FALSE,
recursive = TRUE
msg = 'assays must be a character vector containing "counts" and/or
"cpm"'
)

subdirs <- assay_map[assays]

# The repository is optional. If not provided we load only from the cache
if (!is.null(repository)) {
cli_alert_info("Synchronising files")
parsed_repo <- parse_url(repository)
parsed_repo$scheme |>
`%in%`(c("http", "https")) |>
assert_that()

files_to_read <-
raw_data |>
pull(.data$file_id_db) |>
unique() |>
as.character() |>
sync_assay_files(
url = parsed_repo,
cache_dir = versioned_cache_directory,
files = _,
subdirs = subdirs
)
}

cli_alert_info("Reading files.")
sces <- subdirs |>
imap(function(current_subdir, current_assay) {
# Build up an SCE for each assay
dir_prefix <- file.path(
versioned_cache_directory,
current_subdir
)

raw_data |>
dplyr::group_by(.data$file_id_db) |>
# Load each file and attach metadata
dplyr::summarise(sces = list(group_to_sce(
dplyr::cur_group_id(),
dplyr::cur_data_all(),
dir_prefix,
features
))) |>
dplyr::pull(sces) |>
# Combine each sce by column, since each sce has a different set
# of cells
do.call(cbind, args = _)
})

cli_alert_info("Compiling Single Cell Experiment.")
# Combine all the assays
sce <- sces[[1]]
SummarizedExperiment::assays(sce) <- map(sces, function(sce) {
SummarizedExperiment::assays(sce)[[1]]
assert_that(
!anyDuplicated(assays),
inherits(cache_directory, "character"),
is.null(repository) || is.character(repository),
is.null(features) || is.character(features)
)

# Data parameter validation (last, because it's slower)
## Evaluate the promise now so that we get a sensible error message
force(data)
## We have to convert to an in-memory table here, or some of the dplyr
## operations will fail when passed a database connection
cli_alert_info("Realising metadata.")
raw_data <- collect(data)
assert_that(
inherits(raw_data, "tbl"),
has_name(raw_data, c("cell_", "file_id_db"))
)

versioned_cache_directory <- cache_directory
versioned_cache_directory |> dir.create(
showWarnings = FALSE,
recursive = TRUE
)

subdirs <- assay_map[assays]

# The repository is optional. If not provided we load only from the cache
if (!is.null(repository)) {
cli_alert_info("Synchronising files")
parsed_repo <- parse_url(repository)
parsed_repo$scheme |>
`%in%`(c("http", "https")) |>
assert_that()

files_to_read <-
raw_data |>
pull(.data$file_id_db) |>
unique() |>
as.character() |>
sync_assay_files(
url = parsed_repo,
cache_dir = versioned_cache_directory,
files = _,
subdirs = subdirs
)
}

cli_alert_info("Reading files.")
sces <- subdirs |>
imap(function(current_subdir, current_assay) {
# Build up an SCE for each assay
dir_prefix <- file.path(
versioned_cache_directory,
current_subdir
)

sce_list <- raw_data |>
dplyr::group_by(.data$file_id_db) |>
# Load each file and attach metadata
dplyr::summarise(sces = list(
group_to_sce(
dplyr::cur_group_id(),
dplyr::cur_data_all(),
dir_prefix,
features
)
)) |>
dplyr::pull(sces)

# Check whether genes in a list of SCEs overlap, use gene intersection if overlap
commonGenes <- sce_list |> check_gene_overlap()
sce_list <- map(sce_list, function(sce) {
sce[commonGenes,]
}) |>
do.call(cbind, args = _)
})

sce

cli_alert_info("Compiling Single Cell Experiment.")
# Combine all the assays
sce <- sces[[1]]
SummarizedExperiment::assays(sce) <- map(sces, function(sce) {
SummarizedExperiment::assays(sce)[[1]]
})

sce
}

#' Converts a data frame into a single SCE
Expand Down Expand Up @@ -214,10 +219,10 @@ group_to_sce <- function(i, df, dir_prefix, features) {

if (length(cells) < nrow(df)){
single_line_str(
"Some cells were filtered out while loading {head(df$file_id_db, 1)}
because of extremely low counts. The
number of cells in the SingleCellExperiment will be less than the
number of cells you have selected from the metadata."
"The number of cells in the SingleCellExperiment will be less than the
number of cells you have selected from the metadata.
Are cell IDs duplicated? Or, do cell IDs correspond to the counts file?
"
) |> cli_alert_warning()
df <- filter(df, .data$cell_ %in% cells)
}
Expand Down Expand Up @@ -319,3 +324,22 @@ sync_assay_files <- function(
output_file
}, .progress = list(name = "Downloading files"))
}

#' Checks whether genes in a list of SCE objects overlap
#' @param sce_list A list of SingleCellExperiment objects
#' @return A character vector of genes intersection across SCE objects
#' @importFrom purrr map reduce map_int
#' @importFrom cli cli_alert_warning
check_gene_overlap <- function(sce_list) {
    # Gene identifiers (rownames) of each SCE object in the list
    per_sce_genes <- lapply(sce_list, rownames)

    # Genes present in every object: pairwise intersection folded over the list
    shared_genes <- Reduce(intersect, per_sce_genes)

    # If any object carries genes outside the intersection, tell the user
    # that counts will be restricted to the shared gene set
    if (any(lengths(per_sce_genes) != length(shared_genes))) {
        single_line_str(
            "CuratedAtlasQuery reports: Not all genes completely overlap across the provided SCE objects.
      Counts are generated by genes intersection"
        ) |> cli_alert_warning()
    }

    shared_genes
}
43 changes: 43 additions & 0 deletions R/counts_per_million.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#' Generating counts per million from a SingleCellExperiment object
#'
#' Saves the raw counts as an HDF5-backed experiment, computes a
#' counts-per-million assay for all cells with a usable (positive, finite)
#' library size, passes zero-count cells through unscaled, and saves the
#' resulting CPM experiment to `output_dir`.
#'
#' @param input_sce_obj A SingleCellExperiment object read from RDS
#' @param output_dir A character scalar: directory where the counts-per-million
#'   HDF5-backed experiment is written
#' @param hd5_file_dir A character scalar: directory where the original counts
#'   are written as an HDF5-backed experiment
#' @return The value of the final `saveHDF5SummarizedExperiment()` call; used
#'   for its side effect of populating `output_dir`
#' @importFrom SingleCellExperiment SingleCellExperiment
#' @importFrom SummarizedExperiment assay assays assays<-
#' @importFrom HDF5Array saveHDF5SummarizedExperiment
#' @importFrom purrr map
#' @noRd
get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {

    # Save the raw-counts SCE to the cache directory counts folder
    input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir)

    data <- input_sce_obj

    # Identify cells with a usable library size. Keep this as a LOGICAL mask:
    # the previous code stored `which(...)` (positive integer positions) and
    # later negated it with `!`, but `!` coerces non-zero integers to FALSE,
    # so `[, !selected_cols]` selected no columns at all and the zero-count
    # cells were silently dropped, breaking the reorder step below.
    col_sums <- colSums(as.matrix(assay(data)))
    selected_cols <- is.finite(col_sums) & col_sums > 0

    # CPM-normalise only the non-empty cells
    # NOTE(review): `assay.type = "X"` assumes the input assay is named "X"
    # (AnnData convention) — confirm against the importer that produces it
    sce <- SingleCellExperiment(list(
        counts_per_million = scuttle::calculateCPM(
            data[, selected_cols, drop = FALSE], assay.type = "X"
        )
    ))
    rownames(sce) <- rownames(data)
    colnames(sce) <- colnames(data[, selected_cols, drop = FALSE])

    # Carry the empty cells through unscaled, to avoid scaling zeros
    sce_zero <- SingleCellExperiment(list(
        counts_per_million = assay(data)[, !selected_cols, drop = FALSE]
    ))
    rownames(sce_zero) <- rownames(data)
    colnames(sce_zero) <- colnames(data[, !selected_cols, drop = FALSE])

    # Recombine, then restore the original cell order
    sce <- sce |> cbind(sce_zero)
    sce <- sce[, colnames(data)]

    rm(data)

    # Realise the delayed assay eagerly so memory problems surface here,
    # before the (expensive) HDF5 write
    assays(sce) <- assays(sce) |> map(DelayedArray::realize)

    sce |> saveHDF5SummarizedExperiment(output_dir)
}

8 changes: 8 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#' Sample SingleCellExperiment Object
#'
#' A sample SCE object from HeOrganAtlasData with transformation in metadata and assay
#'
#' @format An object of class \code{SingleCellExperiment}
#' @source HeOrganAtlasData Liver
#' @keywords datasets
"sample_sce_obj"
Loading
Loading