✅ Update R package

raevskymichail · Dec 7, 2020 · db46f08 · db46f08
1 parent 54fdb5b
commit db46f08
Show file tree

Hide file tree

Showing 11 changed files with 173 additions and 12,595 deletions.
diff --git a/README.md b/README.md
@@ -1 +1,62 @@
-# epi-impute
+# Epi-Impute
+
+[![](https://img.shields.io/github/languages/code-size/raevskymichail/epi-impute)](https://img.shields.io/github/languages/code-size/raevskymichail/epi-impute)
+[![](https://img.shields.io/github/languages/top/raevskymichail/epi-impute)](https://img.shields.io/github/languages/top/raevskymichail/epi-impute)
+[![](https://img.shields.io/github/issues/raevskymichail/epi-impute)](https://img.shields.io/github/issues/raevskymichail/epi-impute)
+[![](https://img.shields.io/github/license/raevskymichail/epi-impute)](https://img.shields.io/github/license/raevskymichail/epi-impute)
+
+## Introduction
+
+This repository contains primary source code for *"Epi-Impute: single-cell RNA-seq imputation via integration with single-cell ATAC-seq"* that is a computational tool for imputing scRNA-seq data from DNA accessibility data (scATAC-seq) from consistent cell-type populations.
+
+Epi-Impute exploits the idea of open chromatin in active *cis*-regulatory elements of the genes and estimates average accessibility of gene regulatory elements, e.g., promoters and enhancers, in order to add a binarized pseudo-count to the gene expression values reflecting the potential for transcription activation observed at the epigenetic level.
+
+## Preprocessing
+
+For preprocessing of scRNA-seq data, please follow the standard processing pipeline to get the expression count matrix, where each row represents a gene and each column represents a cell.
+
+For preprocessing of scATAC-seq data, please first put all the `.bam` files for each cell into a dedicated folder. Then run the provided preprocessing script to obtain a count matrix and annotation files.
+
+## Requirements
+
+* reshape2
+* feather
+* rlist
+* data.table
+* MACS
+* UMAP (https://github.com/lmcinnes/umap)
+
+## Installation
+
+### Install from Github
+```r
+library(devtools)
+install_github("raevskymichail/epi-impute", subdir = "epi.impute")
+```
+
+### Install from source
+
+Download the source archive [here](https://github.com/raevskymichail/epi-impute/blob/master/epi-impute.tar.gz?raw=true) and then run the following in an R session:
+
+```r
+install.packages(path_to_archive, type = 'source', repos = NULL)
+```
+
+Where `path_to_archive` would represent the full path and file name:
+- On Windows it will look something like this: `C:\\Downloads\\epi-impute.tar.gz`.
+- On UNIX machines it will look like this: `~/Downloads/epi-impute.tar.gz`.
+
+## Quick start
+
+```r
+library("epi.impute")
+
+data_sc_exp <- demo_data[[1]]
+data_sc_atac <- demo_data[[2]]
+
+result <- epi_impute(demo_data)
+```
+
+## Help
+
+Please feel free to contact Mikhail Raevskiy ([email protected]) if you have any questions about the software.
diff --git a/epi-impute.R → epi-impute-ex.R b/epi-impute.R → epi-impute-ex.R
diff --git a/epi-impute.tar.gz b/epi-impute.tar.gz
diff --git a/epi.impute/DESCRIPTION b/epi.impute/DESCRIPTION
@@ -1,13 +1,23 @@
 Package: epi.impute
-Title: What the Package Does (One Line, Title Case)
-Version: 0.0.0.9000
+Title: Epi-Impute: single-cell RNA-seq imputation via integration with single-cell ATAC-seq
+Version: 0.0.1
 Authors@R: 
-    person(given = "First",
-           family = "Last",
+    person(given = "Mikhail",
+           family = "Raevskiy",
            role = c("aut", "cre"),
-           email = "[email protected]",
+           email = "[email protected]",
            comment = c(ORCID = "YOUR-ORCID-ID"))
-Description: What the package does (one paragraph).
-License: What license it uses
+Description: Epi-Impute, a computational tool for imputing scRNA-seq data from DNA accessibility data (scATAC-seq) from consistent cell-type populations.
+Depends: R (>= 3.2.0)
+License: MIT
 Encoding: UTF-8
 LazyData: true
+Imports:
+    rlist,
+    reshape2,
+    data.table,
+    dplyr
+Suggests:
+    ggplot2,
+    feather,
+    ggvis
diff --git a/epi.impute/R/epi-impute.R b/epi.impute/R/epi-impute.R
@@ -0,0 +1,94 @@
+#!/usr/bin/env Rscript
+
+#' Epi-Impute: single-cell RNA-seq imputation via integration with single-cell ATAC-seq
+#'
+#' Epi-Impute, a computational tool for imputing scRNA-seq data from DNA
+#' accessibility data (scATAC-seq) from consistent cell-type populations.
+#'
+#' Promoter/TSS peak counts are summed per gene and then per cell type. A gene
+#' is marked accessible (1) in a cell type when that sum is greater than
+#' \code{atac_bin_thrld}, otherwise 0. Expression is binarized (count > 0) and
+#' the accessibility flag of the cell's type is added to it, so the returned
+#' values are 0, 1 or 2.
+#'
+#' @param sc_exp_data a single-cell RNA-seq count matrix. The matrix is
+#'   transposed before use and the row names of the transposed matrix are
+#'   taken as gene symbols (HGNC/HUGO), so as written the function expects
+#'   cell ids as rownames and genes as colnames. The part of a cell id before
+#'   the first underscore is used as its cell type label.
+#'   NOTE(review): the previous documentation stated the opposite orientation
+#'   (genes as rownames) -- confirm which one is intended.
+#' @param sc_atac_data a single-cell ATAC-seq count table with a
+#'   \code{PeakFile_Peak_ID} column (peak ids, e.g. from a peak caller such as
+#'   MACS2) and one count column per cell id.
+#' @param sc_atac_cell_names a data frame annotating the scATAC-seq cells. The
+#'   columns \code{cell_id} (matching the cell columns of
+#'   \code{sc_atac_data}) and \code{cell_types} are used.
+#' @param sc_atac_peaks_ann a data frame describing the euchromatin peaks. The
+#'   columns \code{PeakFile_Peak_ID}, \code{X7} (searched for
+#'   \code{"promoter"} or \code{"TSS"}) and \code{X13} (matched against the
+#'   expression gene symbols) are used.
+#' @param cell_types a character vector with the names of the cell types to
+#'   impute. They are matched against \code{sc_atac_cell_names$cell_types} and
+#'   against the cell type labels derived from the expression cell ids.
+#' @param atac_bin_thrld a numeric accessibility threshold used for the
+#'   binarization of the per-cell-type summed scATAC-seq counts.
+#' @return a data frame containing the imputed single-cell RNA-seq matrix, with
+#'   one row per cell and one column per gene shared by both data sets (genes
+#'   in sorted order).
+#'   NOTE(review): row names are derived from the expression cell ids by
+#'   \code{sapply}/\code{as.data.frame} and may carry a suffix such as
+#'   \code{.V1} -- confirm before relying on them.
+#'
+#' @export
+epi_impute <- function(sc_exp_data, sc_atac_data, sc_atac_cell_names,
+                       sc_atac_peaks_ann, cell_types,
+                       atac_bin_thrld = 100) {
+  # After the transposition rows are treated as genes and columns as cells.
+  sc_exp_data <- as.data.frame(t(as.matrix(sc_exp_data)))
+  sc_exp_data$gene <- rownames(sc_exp_data)
+
+  # Keep promoter/TSS peaks only and attach their gene name (X13) to the
+  # accessibility counts.
+  promoters <- sc_atac_peaks_ann[
+    grepl('promoter|TSS', sc_atac_peaks_ann$X7),
+  ]
+  promoters_genes <- promoters[order(promoters$X13), ]
+  sc_atac_data <- merge(
+    sc_atac_data,
+    promoters_genes[c('X13', 'PeakFile_Peak_ID')],
+    by = 'PeakFile_Peak_ID'
+  )
+
+  # Sum the counts of all peaks that belong to the same gene, cell by cell.
+  sc_atac_data <- data.table::setDT(sc_atac_data)
+  sc_atac_data <- sc_atac_data[, lapply(.SD, sum), by = "X13",
+                               .SDcols = - "PeakFile_Peak_ID"]
+  sc_atac_genes <- as.character(sc_atac_data$X13)
+  sc_atac_data$X13 <- NULL
+
+  # One sorted gene vector is the single source of truth for row order: both
+  # tables are aligned to it with match(), and it labels the result at the end.
+  genes_common <- sort(intersect(sc_atac_genes, sc_exp_data$gene))
+  if (length(genes_common) == 0) {
+    stop("no genes are shared between the scATAC-seq promoter peaks and ",
+         "the scRNA-seq matrix", call. = FALSE)
+  }
+  sc_atac_data <- sc_atac_data[match(genes_common, sc_atac_genes), ]
+  sc_exp_data_genes <- sc_exp_data[match(genes_common, sc_exp_data$gene), ]
+
+  # Restrict the scATAC-seq cells to the requested cell types.
+  sc_atac_cell_names <- sc_atac_cell_names[
+    grepl(paste(cell_types, collapse = "|"), sc_atac_cell_names$cell_types),
+  ]
+  # Back to a plain data frame so that columns can be selected by a
+  # logical/character vector with data.frame semantics.
+  sc_atac_data <- as.data.frame(sc_atac_data)
+  sc_atac_data <- sc_atac_data[
+    , colnames(sc_atac_data) %in% sc_atac_cell_names$cell_id, drop = FALSE
+  ]
+  sc_atac_cell_names <- sc_atac_cell_names[
+    sc_atac_cell_names$cell_id %in% colnames(sc_atac_data),
+  ]
+
+  celltype_list <- split(sc_atac_cell_names, sc_atac_cell_names$cell_types)
+  celltype_list <- celltype_list[cell_types]
+
+  # Pseudo-bulk per cell type: sum over its cells, then binarize.
+  bulk_peaks_list <- lapply(celltype_list, function(celltype) {
+    cell_ids <- as.character(celltype$cell_id)
+    # drop = FALSE keeps a data frame even when the type has a single cell.
+    bulk_counts <- rowSums(sc_atac_data[, cell_ids, drop = FALSE])
+    as.data.frame(as.matrix((bulk_counts > atac_bin_thrld) + 0))
+  })
+  names(bulk_peaks_list) <- cell_types
+
+  print('Binirize the matrix')
+  is_value_col <- colnames(sc_exp_data_genes) != "gene"
+  sc_exp_data_genes_values <- sc_exp_data_genes[, is_value_col, drop = FALSE]
+  sc_exp_data_genes_values <- as.data.frame(
+    as.matrix((sc_exp_data_genes_values > 0) + 0)
+  )
+  sc_exp_data_genes <- data.frame(
+    gene = sc_exp_data_genes$gene,
+    sc_exp_data_genes_values,
+    check.names = FALSE,
+    stringsAsFactors = FALSE
+  )
+
+  # Reshape to one row per cell and one column per gene.
+  sc_exp_data_genes <- reshape2::dcast(
+    reshape2::melt(sc_exp_data_genes, id.vars = "gene"),
+    variable ~ gene,
+    value.var = "value"
+  )
+  rownames(sc_exp_data_genes) <- sc_exp_data_genes$variable
+  # Cell type label = cell id up to the first underscore.
+  sc_exp_data_genes$cells <- gsub('_.*', '', sc_exp_data_genes$variable)
+  sc_exp_data_genes <- sc_exp_data_genes[order(sc_exp_data_genes$cells), ]
+  sc_exp_data_genes$variable <- NULL
+
+  sc_data_list <- split(sc_exp_data_genes, f = sc_exp_data_genes$cells)
+  sc_data_list <- lapply(sc_data_list, function(x) {
+    x$cells <- NULL
+    # Genes x cells again; index by name so that the gene order does not
+    # depend on how dcast() ordered its columns.
+    as.data.frame(t(x))[genes_common, , drop = FALSE]
+  })
+
+  print('Imputation...')
+  # Add the binarized accessibility of the cell type to every cell of it.
+  scRNAseq_imputed_list <- lapply(unname(cell_types), function(i) {
+    as.data.frame(sapply(as.data.frame(sc_data_list[[i]]), function(x) {
+      x + bulk_peaks_list[[i]]
+    }))
+  })
+
+  scRNAseq_imputed <- as.data.frame(scRNAseq_imputed_list)
+  rownames(scRNAseq_imputed) <- genes_common
+
+  as.data.frame(t(scRNAseq_imputed))
+}
diff --git a/epi.impute/dca_results/dispersion.tsv b/epi.impute/dca_results/dispersion.tsv
diff --git a/epi.impute/dca_results/dropout.tsv b/epi.impute/dca_results/dropout.tsv