Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New version of GRaNIE integration #6

Merged
merged 10 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 8 additions & 23 deletions dockerfiles/granie/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,6 @@
FROM bioconductor/bioconductor_docker:devel-R-4.4.1


# Install required dependencies for the R packages
#RUN apt-get update && apt-get install -y \
# libcurl4-openssl-dev \
# libxml2-dev \
# libssl-dev \
# libcairo2-dev
# libxt-dev \
# libopenblas-dev


#Install R packages
RUN R -e "install.packages(c('devtools'))"

Expand All @@ -21,19 +11,14 @@ WORKDIR /workspace
# Default command
CMD ["R"]

RUN R -e "remotes::install_version('Matrix', version = '1.6-3')"

#Version 1.9.4 of GRaNIE
RUN R -e "devtools::install_gitlab('grp-zaugg/GRaNIE@855bf3def33ad7af9353db79c7e21c9279035fb8', host = 'git.embl.de', subdir = 'src/GRaNIE', dependencies = TRUE, upgrade = 'never')"

#Version 0.2.1 of GRaNIEverse
RUN R -e "devtools::install_gitlab('grp-zaugg/GRaNIEverse@3224b042f25f0085eeaf9194042bcb89103ea962', host = 'git.embl.de', upgrade = 'never', dependencies = TRUE)"

#There seems to be an incompatibility with the Matrix and irlba package, downgrading Matrix seems to work

RUN R -e "install.packages('irlba',type='source')"

RUN R -e "BiocManager::install(c('BSgenome.Hsapiens.UCSC.hg38', 'EnsDb.Hsapiens.v86', 'EnsDb.Mmusculus.v79', 'BSgenome.Mmusculus.UCSC.mm39'))"
#Version 1.9.4 of GRaNIE.
RUN R -e "devtools::install_gitlab('grp-zaugg/[email protected]', host = 'git.embl.de', subdir = 'src/GRaNIE', dependencies = TRUE, upgrade = 'never')"

#Version 0.3.1 of GRaNIEverse
RUN R -e "devtools::install_gitlab('grp-zaugg/[email protected]', host = 'git.embl.de', upgrade = 'never', dependencies = TRUE)"

#There is an incompatibility with the Matrix and irlba package, see https://github.com/satijalab/seurat/issues/8000
RUN R -e "install.packages('irlba',type='source', rebuild = TRUE)"

# biovizbase is only needed to create the ATAC assay along with annotation
RUN R -e "BiocManager::install(c('BSgenome.Hsapiens.UCSC.hg38', 'EnsDb.Hsapiens.v86', 'EnsDb.Mmusculus.v79', 'BSgenome.Mmusculus.UCSC.mm39', 'biovizBase', 'Signac', 'glmGamPoi'))"
79 changes: 68 additions & 11 deletions src/methods/multi_omics/granie/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,63 @@ functionality:
required: false
direction: input
description: "Path to the RNA data file (e.g., rna.rds)."
default: resources_test/grn-benchmark/multiomics_rna.rds
default: resources_test/grn-benchmark/multiomics_r/rna.rds

- name: --file_atac
type: file
required: false
direction: input
description: "Path to the ATAC data file (e.g., atac.rds)."
default: resources_test/grn-benchmark/multiomics_atac.rds
default: resources_test/grn-benchmark/multiomics_r/atac.rds

- name: --normRNA
type: string
required: false
direction: input
description: Normalization method for RNA data.
default: "SCT"

- name: --normATAC
type: string
required: false
direction: input
description: Normalization method for ATAC data.
default: "LSI"

- name: --LSI_featureCutoff
type: string
required: false
direction: input
description: Feature cutoff for LSI normalization.
default: "q0"

- name: --nDimensions_ATAC
type: integer
required: false
direction: input
description: Number of dimensions for ATAC modality
default: 50

- name: --integrationMethod
type: string
required: false
direction: input
description: Method used for data integration.
default: "WNN"

- name: --WNN_knn
type: integer
required: false
direction: input
description: Number of nearest neighbors for WNN integration.
default: 20

- name: --minCellsPerCluster
type: integer
required: false
direction: input
description: Minimum number of cells required per cluster.
default: 25


- name: --preprocessing_clusteringMethod
Expand All @@ -40,12 +89,12 @@ functionality:
description: Resolution for clustering, typically between 5 and 20.
default: 14

- name: --preprocessing_SCT_nDimensions
- name: --preprocessing_RNA_nDimensions
type: integer
required: false
direction: input
default: 50
description: Number of dimensions for SCT, default is 50.
description: Number of dimensions for RNA reduction, default is 50.

- name: --genomeAssembly
type: string
Expand Down Expand Up @@ -89,12 +138,13 @@ functionality:
direction: input
description: FDR threshold for peak-gene connections (default is 0.2).

- name: --peak_gene
type: file
required: false
direction: output
description: Path to the peak-gene output file (e.g., peak_gene.csv). Not yet implemented.
default: output/granie/peak_gene.csv
# Existance of output files is checked, not yet implemented
# - name: --peak_gene
# type: file
# required: false
# direction: output
# description: Path to the peak-gene output file (e.g., peak_gene.csv). Not yet implemented.
# default: output/granie/peak_gene.csv

- name: --useWeightingLinks
type: boolean
Expand All @@ -110,14 +160,21 @@ functionality:
direction: input
description: "Flag to force rerun of the analysis regardless of existing results."

- name: --subset
type: boolean
required: false
default: false
direction: input
description: "Flag for testing purposes to subset the data for faster running times"

resources:
- type: r_script
path: script.R


platforms:
- type: docker
image: chrarnold84/granieverse:latest
image: chrarnold84/granieverse:v1.3

- type: native
- type: nextflow
Expand Down
125 changes: 70 additions & 55 deletions src/methods/multi_omics/granie/script.R
Original file line number Diff line number Diff line change
@@ -1,48 +1,55 @@

## VIASH START
# par <- list(
# file_rna = "resources_test/grn-benchmark/multiomics_r/rna.rds",
# file_atac = "resources_test/grn-benchmark/multiomics_r/atac.rds",
# preprocessing_clusteringMethod = 1, # Seurat::FindClusters: (1 = original Louvain algorithm, 2 = Louvain algorithm with multilevel refinement, 3 = SLM algorithm, 4 = Leiden algorithm)
# preprocessing_clusterResolution = 14, # Typically between 5 and 20
# preprocessing_RNA_nDimensions = 50, # Default 50
# genomeAssembly = "hg38",
# GRaNIE_corMethod = "spearman",
# GRaNIE_includeSexChr = TRUE,
# GRaNIE_promoterRange = 250000,
# GRaNIE_TF_peak_fdr_threshold = 0.2,
# GGRaNIE_peak_gene_fdr_threshold = 0.2,
# num_workers = 4,
# peak_gene = "output/granie/peak_gene.csv", # not yet implemented, should I?
# prediction= "output/granie/prediction.csv",
# useWeightingLinks = FALSE,
# forceRerun = FALSE
# )

# meta <- list(
# functionality_name = "my_method_r"
# )
## VIASH END

set.seed(42)
suppressPackageStartupMessages(library(Seurat))

library(Seurat)
library(Signac)
library(Matrix)
suppressPackageStartupMessages(library(Signac))
suppressPackageStartupMessages(library(Matrix))
library(GRaNIEverse)
library(GRaNIE)
library(qs)
library(BSgenome.Hsapiens.UCSC.hg38)
library(EnsDb.Hsapiens.v86)
library(EnsDb.Mmusculus.v79)
library(BSgenome.Mmusculus.UCSC.mm39)
library(dplyr)
suppressPackageStartupMessages(library(qs))
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg38))
suppressPackageStartupMessages(library(EnsDb.Hsapiens.v86))
suppressPackageStartupMessages(library(EnsDb.Mmusculus.v79))
suppressPackageStartupMessages(library(BSgenome.Mmusculus.UCSC.mm39))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(SummarizedExperiment))


cat("Content of par list:")
str(par)

#qs::qsave(par, "/home/carnold/par.qs")

## VIASH START
par <- list(
file_rna = "resources_test/grn-benchmark/multiomics_r/rna.rds",
file_atac = "resources_test/grn-benchmark/multiomics_r/atac.rds",
temp_dir = "output/granie/",
preprocessing_clusteringMethod = 1, # Seurat::FindClusters: (1 = original Louvain algorithm, 2 = Louvain algorithm with multilevel refinement, 3 = SLM algorithm, 4 = Leiden algorithm)
preprocessing_clusterResolution = 14, # Typically between 5 and 20
preprocessing_SCT_nDimensions = 50, # Default 50
genomeAssembly = "hg38",
GRaNIE_corMethod = "spearman",
GRaNIE_includeSexChr = TRUE,
GRaNIE_promoterRange = 250000,
GRaNIE_TF_peak_fdr_threshold = 0.2,
GRaNIE_peak_gene_fdr_threshold = 0.2,
num_workers = 4,
peak_gene = "output/granie/peak_gene.csv", # not yet implemented, should I?
prediction= "output/granie/prediction.csv",
useWeightingLinks = FALSE,
forceRerun = FALSE
)
## VIASH END
print(par)
# meta <- list(
# functionality_name = "my_method_r"
# )


#### STANDARD ASSIGNMENTS ###
file_seurat = "seurat_granie.qs"
outputDir = par$temp_dir
outputDir = dirname(par$prediction)

if (!dir.exists(outputDir)) {
dir.create(outputDir, recursive = TRUE)
Expand All @@ -67,7 +74,7 @@ if (!file.exists(destfile)) {
# Define the directory to extract the files to
exdir <- "PWMScan_HOCOMOCOv12_H12INVIVO"

GRaNIE_TFBSFolder = paste0(exdir, "/PWMScan_HOCOMOCOv12/H12INVIVO")
GRaNIE_TFBSFolder = paste0(exdir, "/H12INVIVO")

if (!file.exists(GRaNIE_TFBSFolder)) {
untar(destfile, exdir = exdir)
Expand All @@ -91,7 +98,7 @@ if (!file.exists(file_RNA)) {
# Preprocess data #
###################

if (par.l$forceRerun | !file.exists(file_seurat)) {
if (par$forceRerun | !file.exists(file_seurat)) {

# Sparse matrix
rna.m = readRDS(par$file_rna)
Expand Down Expand Up @@ -146,12 +153,6 @@ if (par.l$forceRerun | !file.exists(file_seurat)) {
}

output_seuratProcessed = paste0(outputDir, "/seuratObject.qs")
if (!file.exists(output_seuratProcessed)) {
prepareData = TRUE
} else {
prepareData = FALSE
}


###################
# Preprocess data #
Expand All @@ -162,24 +163,38 @@ GRaNIE_file_peaks = paste0(outputDir, "/atac.pseudobulkFromClusters_res", par$pr
GRaNIE_file_rna = paste0(outputDir, "/rna.pseudobulkFromClusters_res", par$preprocessing_clusterResolution, "_mean.tsv.gz")
GRaNIE_file_metadata = paste0(outputDir, "/metadata_res", par$preprocessing_clusterResolution, "_mean.tsv.gz")

if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.exists(GRaNIE_file_rna) & !par.l$forceRerun) {
if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.exists(GRaNIE_file_rna) & !par$forceRerun) {

cat("Preprocessing skipped because all files alreadx exist anf forceRerun = FALSE.")
cat("Preprocessing skipped because all files already exist anf forceRerun = FALSE.")

} else {


# Subset for testing purposes
if (par$subset == TRUE) {
cat("SUBSET cells\n")
random_cells <- sample(Cells(seurat_object), size = 5000, replace = FALSE)
seurat_object = subset(seurat_object, cells = random_cells)
cat("SUBSET peaks\n")
peak_names <- rownames(seurat_object[["peaks"]])
selected_peaks <- sample(peak_names, size = 50000, replace = FALSE)
seurat_object[["peaks"]] = subset(seurat_object[["peaks"]], features = selected_peaks)
}

seurat_object = prepareSeuratData_GRaNIE(seurat_object,
outputDir = par$outputDir,
file_RNA_features = file_RNA,
assayName_RNA_raw = "RNA", assayName_ATAC = "peaks",
prepareData = prepareData,
SCT_nDimensions = par$preprocessing_SCT_nDimensions,
dimensionsToIgnore_LSI_ATAC = 1,
outputDir = outputDir,
saveSeuratObject = TRUE,
genome = par$genomeAssembly,
assayName_RNA = "RNA", normRNA = "SCT", nDimensions_RNA = par$preprocessing_RNA_nDimensions, recalculateVariableFeatures = NULL,
assayName_ATAC_raw = "peaks",
normATAC = "LSI", LSI_featureCutoff = "q0", nDimensions_ATAC = 50, dimensionsToIgnore_LSI_ATAC = 1,
integrationMethod = "WNN", WNN_knn = 20,
pseudobulk_source = "cluster",
countAggregation = "mean",
clusteringAlgorithm = par$preprocessing_clusteringMethod,
clusterResolutions = par$preprocessing_clusterResolution,
saveSeuratObject = TRUE)
clusterResolutions = par$preprocessing_clusterResolution,
minCellsPerCluster = 25,
forceRerun = FALSE
)

}

Expand All @@ -190,7 +205,7 @@ if (file.exists(GRaNIE_file_peaks) & file.exists(GRaNIE_file_metadata) & file.ex
##############

GRN = runGRaNIE(
dir_output = par$temp_dir,
dir_output = outputDir,
datasetName = "undescribed",
GRaNIE_file_peaks,
GRaNIE_file_rna,
Expand Down
Loading