Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

import api #134

Merged
merged 44 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
9a2b287
export default_cache function, add clear_cache function
myushen Feb 28, 2024
94a6edc
check metadata, gene names, positive counts, file_id
myushen Mar 1, 2024
670c10f
generate cpm from original, check subdir number match
myushen Mar 1, 2024
c816770
check metadata input has minimum set of column names and types
myushen Mar 1, 2024
1b0d608
add check age, sex, minimum sets of input columns
myushen Mar 4, 2024
7d264f3
upload meta, counts to cloud
myushen Mar 4, 2024
47f237a
add documentation
myushen Mar 4, 2024
fd38b9e
Revert "upload meta, counts to cloud"
myushen Mar 4, 2024
0e4123f
add function `import_metadata_counts`
myushen Mar 4, 2024
0e5e27c
fix
myushen Mar 5, 2024
02d4511
try fix code suggested by knitr error
myushen Mar 5, 2024
e802374
Revert "try fix code suggested by knitr error"
myushen Mar 5, 2024
1969796
remove meta_output argument, add counts_path argument
myushen Mar 6, 2024
bf8aa68
add counts_per_million function
myushen Mar 6, 2024
56c32ea
fix error
myushen Mar 6, 2024
593dcc7
fix memory issue in counts_per_million
myushen Mar 6, 2024
815648c
fix documentation
myushen Mar 6, 2024
e7e9047
add check: cells_ cannot be duplicates when join with api
myushen Mar 6, 2024
62525e5
add checkpoint, cell_ values in metadata must be unique
myushen Mar 7, 2024
1fdea73
optimise code and comments
myushen Mar 8, 2024
6aa3380
fix tiny issue
myushen Mar 8, 2024
7ba0f25
assert_that
myushen Mar 8, 2024
c9e0d8e
read rds, assign matrix for counts data, save HDF5
myushen Mar 12, 2024
944535d
accept sce RDS as input
myushen Mar 12, 2024
1c4aa79
accept count_path_df.rds as an input
myushen Mar 13, 2024
f8bda0b
roxygenise
myushen Mar 13, 2024
8e34280
fix a missing variable
myushen Mar 13, 2024
bdf294b
tiny fix
myushen Mar 13, 2024
65b8975
checking whether gene set of new count data and old count data match
myushen Mar 13, 2024
77d0b68
change function input, accept sce rds and cache dir only
myushen Mar 14, 2024
8101da2
fix a variable
myushen Mar 14, 2024
2ab8a45
save created file_id_db column in metadata slot to rds
myushen Mar 14, 2024
97cbfa5
fix variable
myushen Mar 14, 2024
731e877
accept a loaded sce_obj as input
myushen Mar 15, 2024
90fef3a
checking whether genes in a list of SCEs overlap
myushen Mar 18, 2024
5a6dad3
use gene intersection if genes across SCEs do not completely overlap
myushen Mar 18, 2024
15628a4
move check_gene_overlap function to counts.R
myushen Mar 19, 2024
8e2c89c
optimise counts_per_million.R. interact import API with get_SCE
myushen Mar 19, 2024
2792f11
change function documentation
myushen Mar 19, 2024
1209f7c
fix assert_that
myushen Mar 19, 2024
1ddec5e
update Rd and local checks
myushen Mar 19, 2024
883402b
export necessary functions. create sample_sce_obj for checking purpose
myushen Mar 20, 2024
506ee5a
Apply suggestions from code review
myushen Mar 22, 2024
b6ee0b2
add suggestions and optimise code from code review
myushen Mar 26, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ Imports:
utils,
dbplyr (>= 2.3.0),
duckdb,
stringr
stringr,
checkmate
Suggests:
zellkonverter,
rmarkdown,
Expand All @@ -106,7 +107,10 @@ Suggests:
forcats,
ggplot2,
tidySingleCellExperiment,
rprojroot
rprojroot,
openssl,
scuttle,
DelayedArray
Biarch: true
biocViews:
AssayDomain,
Expand All @@ -130,9 +134,13 @@ Roxygen: list(markdown = TRUE)
Collate:
'utils.R'
'counts.R'
'counts_per_million.R'
'data.R'
'dev.R'
'import_metadata_and_counts.R'
'metadata.R'
'seurat.R'
'unharmonised.R'
'zzz.R'
Language: en-US
LazyData: true
17 changes: 17 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,40 @@ S3method(as.sparse,DelayedMatrix)
export(SAMPLE_DATABASE_URL)
export(get_SingleCellExperiment)
export(get_database_url)
export(get_default_cache_dir)
export(get_metadata)
export(get_seurat)
export(get_single_cell_experiment)
export(get_unharmonised_metadata)
export(import_metadata_counts)
importFrom(BiocGenerics,cbind)
importFrom(DBI,dbConnect)
importFrom(DBI,dbDisconnect)
importFrom(HDF5Array,HDF5RealizationSink)
importFrom(HDF5Array,loadHDF5SummarizedExperiment)
importFrom(HDF5Array,saveHDF5SummarizedExperiment)
importFrom(S4Vectors,"metadata<-")
importFrom(S4Vectors,DataFrame)
importFrom(S4Vectors,metadata)
importFrom(Seurat,as.SingleCellExperiment)
importFrom(SeuratObject,as.Seurat)
importFrom(SeuratObject,as.sparse)
importFrom(SingleCellExperiment,"reducedDims<-")
importFrom(SingleCellExperiment,SingleCellExperiment)
importFrom(SingleCellExperiment,combineCols)
importFrom(SingleCellExperiment,reducedDims)
importFrom(SingleCellExperiment,rowData)
importFrom(SummarizedExperiment,"assayNames<-")
importFrom(SummarizedExperiment,"assays<-")
importFrom(SummarizedExperiment,"colData<-")
importFrom(SummarizedExperiment,assay)
importFrom(SummarizedExperiment,assays)
importFrom(SummarizedExperiment,colData)
importFrom(assertthat,assert_that)
importFrom(assertthat,has_name)
importFrom(checkmate,check_character)
importFrom(checkmate,check_subset)
importFrom(checkmate,check_true)
importFrom(cli,cli_abort)
importFrom(cli,cli_alert_info)
importFrom(cli,cli_alert_success)
Expand All @@ -33,11 +47,13 @@ importFrom(dbplyr,remote_con)
importFrom(dbplyr,sql)
importFrom(dplyr,as_tibble)
importFrom(dplyr,collect)
importFrom(dplyr,distinct)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,inner_join)
importFrom(dplyr,mutate)
importFrom(dplyr,pull)
importFrom(dplyr,select)
importFrom(dplyr,summarise)
importFrom(dplyr,tbl)
importFrom(dplyr,transmute)
Expand All @@ -64,6 +80,7 @@ importFrom(purrr,set_names)
importFrom(purrr,walk)
importFrom(rlang,.data)
importFrom(stats,setNames)
importFrom(stringr,str_detect)
importFrom(stringr,str_remove_all)
importFrom(stringr,str_replace_all)
importFrom(tibble,column_to_rownames)
Expand Down
202 changes: 113 additions & 89 deletions R/counts.R
Original file line number Diff line number Diff line change
Expand Up @@ -85,94 +85,99 @@ get_single_cell_experiment <- function(
repository = COUNTS_URL,
features = NULL
) {
# Parameter validation

assays %in% names(assay_map) |>
all() |>
assert_that(
msg = 'assays must be a character vector containing "counts" and/or
"cpm"'
)
assert_that(
!anyDuplicated(assays),
inherits(cache_directory, "character"),
is.null(repository) || is.character(repository),
is.null(features) || is.character(features)
)

# Data parameter validation (last, because it's slower)
## Evaluate the promise now so that we get a sensible error message
force(data)
## We have to convert to an in-memory table here, or some of the dplyr
## operations will fail when passed a database connection
cli_alert_info("Realising metadata.")
raw_data <- collect(data)
# Parameter validation
assays %in% names(assay_map) |>
all() |>
assert_that(
inherits(raw_data, "tbl"),
has_name(raw_data, c("cell_", "file_id_db"))
)

versioned_cache_directory <- cache_directory
versioned_cache_directory |> dir.create(
showWarnings = FALSE,
recursive = TRUE
msg = 'assays must be a character vector containing "counts" and/or
"cpm"'
)

subdirs <- assay_map[assays]

# The repository is optional. If not provided we load only from the cache
if (!is.null(repository)) {
cli_alert_info("Synchronising files")
parsed_repo <- parse_url(repository)
parsed_repo$scheme |>
`%in%`(c("http", "https")) |>
assert_that()

files_to_read <-
raw_data |>
pull(.data$file_id_db) |>
unique() |>
as.character() |>
sync_assay_files(
url = parsed_repo,
cache_dir = versioned_cache_directory,
files = _,
subdirs = subdirs
)
}

cli_alert_info("Reading files.")
sces <- subdirs |>
imap(function(current_subdir, current_assay) {
# Build up an SCE for each assay
dir_prefix <- file.path(
versioned_cache_directory,
current_subdir
)

raw_data |>
dplyr::group_by(.data$file_id_db) |>
# Load each file and attach metadata
dplyr::summarise(sces = list(group_to_sce(
dplyr::cur_group_id(),
dplyr::cur_data_all(),
dir_prefix,
features
))) |>
dplyr::pull(sces) |>
# Combine each sce by column, since each sce has a different set
# of cells
do.call(cbind, args = _)
})

cli_alert_info("Compiling Single Cell Experiment.")
# Combine all the assays
sce <- sces[[1]]
SummarizedExperiment::assays(sce) <- map(sces, function(sce) {
SummarizedExperiment::assays(sce)[[1]]
assert_that(
!anyDuplicated(assays),
inherits(cache_directory, "character"),
is.null(repository) || is.character(repository),
is.null(features) || is.character(features)
)

# Data parameter validation (last, because it's slower)
## Evaluate the promise now so that we get a sensible error message
force(data)
## We have to convert to an in-memory table here, or some of the dplyr
## operations will fail when passed a database connection
cli_alert_info("Realising metadata.")
raw_data <- collect(data)
assert_that(
inherits(raw_data, "tbl"),
has_name(raw_data, c("cell_", "file_id_db"))
)

versioned_cache_directory <- cache_directory
versioned_cache_directory |> dir.create(
showWarnings = FALSE,
recursive = TRUE
)

subdirs <- assay_map[assays]

# The repository is optional. If not provided we load only from the cache
if (!is.null(repository)) {
cli_alert_info("Synchronising files")
parsed_repo <- parse_url(repository)
parsed_repo$scheme |>
`%in%`(c("http", "https")) |>
assert_that()

files_to_read <-
raw_data |>
pull(.data$file_id_db) |>
unique() |>
as.character() |>
sync_assay_files(
url = parsed_repo,
cache_dir = versioned_cache_directory,
files = _,
subdirs = subdirs
)
}

cli_alert_info("Reading files.")
sces <- subdirs |>
imap(function(current_subdir, current_assay) {
# Build up an SCE for each assay
dir_prefix <- file.path(
versioned_cache_directory,
current_subdir
)

sce_list <- raw_data |>
dplyr::group_by(.data$file_id_db) |>
# Load each file and attach metadata
dplyr::summarise(sces = list(
group_to_sce(
dplyr::cur_group_id(),
dplyr::cur_data_all(),
dir_prefix,
features
)
)) |>
dplyr::pull(sces)

# Check whether genes in a list of SCEs overlap, use gene intersection if overlap
commonGenes <- sce_list |> check_gene_overlap()
sce_list <- map(sce_list, function(sce) {
sce[commonGenes,]
}) |>
do.call(cbind, args = _)
})

sce

cli_alert_info("Compiling Single Cell Experiment.")
# Combine all the assays
sce <- sces[[1]]
SummarizedExperiment::assays(sce) <- map(sces, function(sce) {
SummarizedExperiment::assays(sce)[[1]]
})

sce
}

#' Converts a data frame into a single SCE
Expand Down Expand Up @@ -214,10 +219,10 @@ group_to_sce <- function(i, df, dir_prefix, features) {

if (length(cells) < nrow(df)){
single_line_str(
"Some cells were filtered out while loading {head(df$file_id_db, 1)}
because of extremely low counts. The
number of cells in the SingleCellExperiment will be less than the
number of cells you have selected from the metadata."
"The number of cells in the SingleCellExperiment will be less than the
number of cells you have selected from the metadata.
Are cell IDs duplicated? Or, do cell IDs correspond to the counts file?
"
) |> cli_alert_warning()
df <- filter(df, .data$cell_ %in% cells)
}
Expand Down Expand Up @@ -319,3 +324,22 @@ sync_assay_files <- function(
output_file
}, .progress = list(name = "Downloading files"))
}

#' Checks whether genes in a list of SCE objects overlap
#' @param sce_list A list of SingleCellExperiment objects
#' @return A character vector of genes intersection across SCE objects
#' @importFrom purrr map reduce map_int
#' @importFrom cli cli_alert_warning
check_gene_overlap <- function(sce_list) {
    # Gene identifiers (rownames) of each SCE object in the list
    per_sce_genes <- lapply(sce_list, rownames)

    # Genes present in every object: pairwise intersection folded over the list
    shared_genes <- Reduce(intersect, per_sce_genes)

    # If any object carries genes outside the intersection, tell the user
    # that counts will be restricted to the shared gene set
    if (any(lengths(per_sce_genes) != length(shared_genes))) {
        single_line_str(
            "CuratedAtlasQuery reports: Not all genes completely overlap across the provided SCE objects.
      Counts are generated by genes intersection"
        ) |> cli_alert_warning()
    }

    shared_genes
}
43 changes: 43 additions & 0 deletions R/counts_per_million.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#' Generating counts per million from a SingleCellExperiment object
#'
#' Saves the raw counts as an HDF5-backed experiment, computes a
#' counts-per-million assay for all cells with a usable (positive, finite)
#' library size, passes zero-count cells through unscaled, and saves the
#' resulting CPM experiment to `output_dir`.
#'
#' @param input_sce_obj A SingleCellExperiment object read from RDS
#' @param output_dir A character scalar: directory where the counts-per-million
#'   HDF5-backed experiment is written
#' @param hd5_file_dir A character scalar: directory where the original counts
#'   are written as an HDF5-backed experiment
#' @return The value of the final `saveHDF5SummarizedExperiment()` call; used
#'   for its side effect of populating `output_dir`
#' @importFrom SingleCellExperiment SingleCellExperiment
#' @importFrom SummarizedExperiment assay assays assays<-
#' @importFrom HDF5Array saveHDF5SummarizedExperiment
#' @importFrom purrr map
#' @noRd
get_counts_per_million <- function(input_sce_obj, output_dir, hd5_file_dir) {

    # Save the raw-counts SCE to the cache directory counts folder
    input_sce_obj |> saveHDF5SummarizedExperiment(hd5_file_dir)

    data <- input_sce_obj

    # Identify cells with a usable library size. Keep this as a LOGICAL mask:
    # the previous code stored `which(...)` (positive integer positions) and
    # later negated it with `!`, but `!` coerces non-zero integers to FALSE,
    # so `[, !selected_cols]` selected no columns at all and the zero-count
    # cells were silently dropped, breaking the reorder step below.
    col_sums <- colSums(as.matrix(assay(data)))
    selected_cols <- is.finite(col_sums) & col_sums > 0

    # CPM-normalise only the non-empty cells
    # NOTE(review): `assay.type = "X"` assumes the input assay is named "X"
    # (AnnData convention) — confirm against the importer that produces it
    sce <- SingleCellExperiment(list(
        counts_per_million = scuttle::calculateCPM(
            data[, selected_cols, drop = FALSE], assay.type = "X"
        )
    ))
    rownames(sce) <- rownames(data)
    colnames(sce) <- colnames(data[, selected_cols, drop = FALSE])

    # Carry the empty cells through unscaled, to avoid scaling zeros
    sce_zero <- SingleCellExperiment(list(
        counts_per_million = assay(data)[, !selected_cols, drop = FALSE]
    ))
    rownames(sce_zero) <- rownames(data)
    colnames(sce_zero) <- colnames(data[, !selected_cols, drop = FALSE])

    # Recombine, then restore the original cell order
    sce <- sce |> cbind(sce_zero)
    sce <- sce[, colnames(data)]

    rm(data)

    # Realise the delayed assay eagerly so memory problems surface here,
    # before the (expensive) HDF5 write
    assays(sce) <- assays(sce) |> map(DelayedArray::realize)

    sce |> saveHDF5SummarizedExperiment(output_dir)
}

8 changes: 8 additions & 0 deletions R/data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#' Sample SingleCellExperiment Object
#'
#' A sample SCE object from HeOrganAtlasData with transformation in metadata and assay
#'
#' @format An object of class \code{SingleCellExperiment}
#' @source HeOrganAtlasData Liver
#' @keywords datasets
"sample_sce_obj"
Loading
Loading