added createPairs from enhancer

slrvv · Nov 8, 2024 · 8289df4 · 8289df4
1 parent 25ed97e
commit 8289df4
Show file tree

Hide file tree

Showing 13 changed files with 212 additions and 111 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -2,17 +2,20 @@ Package: CENTRE
 Type: Package
 Title: an R package to predict tissue specific enhancer target interactions
 Version: 0.99.0
-Authors@R: c(person("Sara", "Lopez Ruiz de Vargas", role = c("cre"),email = "[email protected]"),
+Authors@R: c(person("Sara", "Lopez Ruiz de Vargas", role = c("cre"), 
+           email = "[email protected]", 
+           comment=c(ORCID="0000-0002-1290-4774")),
            person("Trisevgeni", "Rapakoulia", role = c("aut"), email = ""),
-           person("Martin", "Vingron", role = c("aut"), email = "[email protected]"))
+           person("Martin", "Vingron", role = c("aut"), 
+           email = "[email protected]"))
 Description: An R package to predict Enhancer Promoter Interactions in specific
              tissues. The package has two main use cases : the user can give a
              file with genes of interest or enhancer target pairs of interest. 
              In the first case CENTRE will generate the enhancer target pairs
              and predict how likely they are to interact in the tissue and in
              the second case it will predict how likely the given pairs are to
              interact. 
-License: MIT + file LICENSE
+License: Artistic-2.0
 Encoding: UTF-8
 LazyData: false 
 Imports: 
@@ -26,9 +29,9 @@ Imports:
     utils,
     stats
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Depends: 
-    R (>= 4.2.0)
+    R (>= 4.4.0)
 Suggests: 
     knitr,
     testthat (>= 3.0.0),

diff --git a/R/computeGenericFeatures.R b/R/computeGenericFeatures.R
@@ -14,7 +14,7 @@
 #' @examples
 #' #Create gene enhancer pairs
 #' genes <- as.data.frame(c("ENSG00000130203.10",
-#' "ENSG00000171119.3"))
+#' "ENSG00000280087.1"))
 #' colnames(genes) <- c("gene_id") #It is important to name the column gene_id
 #' pairs <- CENTRE::createPairs(genes)
 #'
@@ -25,7 +25,6 @@
 #' @import utils
 #' @importFrom metapod combineParallelPValues
 #' @importFrom RSQLite dbConnect dbGetQuery dbDisconnect
-#'
 computeGenericFeatures <- function(pairs) {
   startTime <- Sys.time()
   ## Pre-eliminary checks and computations
@@ -46,12 +45,12 @@ computeGenericFeatures <- function(pairs) {
   featuresDistances$pair <- paste(featuresDistances$enhancer_id,
                                    featuresDistances$gene_id2,
                                    sep = "_")
-
+  #connect to the precomputed values database
   conn <- RSQLite::dbConnect(RSQLite::SQLite(),
                              system.file("extdata",
                                          "PrecomputedDataLight.db",
                                          package = "CENTRE"))
-
+  
   combinedTestDf <- getPrecomputedValues("combinedTestData",
                                          "combined_tests",
                                          featuresDistances,

diff --git a/R/createPairs.R b/R/createPairs.R
@@ -1,81 +1,48 @@
 #' Create Pairs
 #'
 #' Creates all of the possible gene enhancer pairs at 500kb distance
-#' from the given genes
-#' transcription start sites.
+#' from the given genes transcription start sites. The pairs can be also computed
+#' from enhancers, in that mode we collect all gene enhancer pairs at 500kb distance
+#' of the enhancer middle point.
 #'
-#' @param gene One column dataframe with gene ENSEMBL id's
+#' @param ids One column dataframe with gene ENSEMBL id's or enhancer cCREs ids
+#' @param enhancerCentered Boolean value. If true the pairs are computed from 
+#' enhancers. In the default setting (false) pairs are computed from genes.
 #' @return dataframe with two columns the ENSEMBL id's without their version and
-#' the enhancer id's
+#' the enhancer id's (ENCODE cCREs)
 #'
 #'
 #' @examples
 #' #Create gene enhancer pairs
-#' genes <- as.data.frame(c("ENSG00000130203.10",
+#' ids <- as.data.frame(c("ENSG00000130203.10",
 #' "ENSG00000171119.3"))
-#' colnames(genes) <- c("gene_id") #It is important to name the column gene_id
-#' pairs <- CENTRE::createPairs(genes)
+#' colnames(ids) <- c("gene_id") #It is important to name the column gene_id or 
+#' # enhancer_id
+#' pairs <- CENTRE::createPairs(ids)
 #' @export
 #' @import utils
 #' @importFrom GenomicRanges GRanges findOverlaps
 #' @importFrom IRanges IRanges
 #' @importFrom RSQLite dbConnect dbGetQuery dbDisconnect
 #' @importFrom regioneR extendRegions
-createPairs <- function(gene) {
+createPairs <- function(ids, enhancerCentered = FALSE) {
   startTime <- Sys.time()
-
-  #check that the user named the column correctly
-  stopifnot("The column needs to be named gene_id" =
-              any(names(gene) == "gene_id"))
-
-  ## remove the "." version id of ENSEMBL ids
-  gene$gene_id1 <- gsub("\\..*", "", gene$gene_id)
-
-
-  ## connect to our GENCODE v40 database to get tts of the genes
-  conn <- RSQLite::dbConnect(RSQLite::SQLite(),
-                             system.file("extdata",
-                            "Annotation.db",
-                            package = "CENTRE"))
-  #get chromosome and tts of our genes
-
-  query <- paste("SELECT  gene_id1, chr, transcription_start FROM gencode WHERE gene_id1 in (",
-  paste0(sprintf("'%s'", gene$gene_id1), collapse = ", "), ")", sep = "")
-  gene <- RSQLite::dbGetQuery(conn, query)
-
-  #Select all of the annotation for ccres v3
-  ccresEnhancer <- RSQLite::dbGetQuery(conn, "SELECT * FROM ccres_enhancer")
-  RSQLite::dbDisconnect(conn)
-
-  genesRange <- with(gene,
-                     GenomicRanges::GRanges(chr,
-                                            IRanges::IRanges(start = transcription_start,
-                                                             end = transcription_start)))
-
-  #extend the gene region 500Kb to the left of TTS and to the right
-  genesRange <- regioneR::extendRegions(genesRange,
-                                        extend.start = 500000,
-                                        extend.end = 500000)
-
-  enhancerRange <-  with(ccresEnhancer,
-                         GenomicRanges::GRanges(V1,
-                                                IRanges::IRanges(start = new_start,
-                                                                 end = new_end)))
-
-
-  # find the enhancers that overlap the extended gene region
-  overlaps <- GenomicRanges::findOverlaps(genesRange, enhancerRange,
-                                          ignore.strand = TRUE)
-
-  ccresOverlapping <- data.frame(gene = overlaps@from, enhancer = overlaps@to)
-  ccresOverlapping$gene_id1 <- gene$gene_id1[ccresOverlapping$gene]
-  ccresOverlapping$enhancer_id <- ccresEnhancer$V5[ccresOverlapping$enhancer]
-
-  #select the gene and enhancer ids
-  ccresOverlapping <- ccresOverlapping[, c("gene_id1", "enhancer_id")]
-
-  ### add a function to exclude any pairs that are not in the same chromosome
-
+
+  if (enhancerCentered == TRUE){
+    stopifnot("The column needs to be named enhancer_id" =
+                any(names(ids) == "enhancer_id"))
+    cat("Pairs are computed from the enhancer IDs\n")
+    ccresOverlapping <- enhancerCenteredPairs(ids)
+
+  } else {
+    #check that the user named the column correctly
+    stopifnot("The column needs to be named gene_id" =
+                any(names(ids) == "gene_id"))
+    cat("Pairs are computed from the gene IDs\n")
+    ccresOverlapping <- geneCenteredPairs(ids)
+
+  }
+
   cat(paste0("time: ", format(Sys.time() - startTime), "\n"))
   return(ccresOverlapping)
 }
diff --git a/R/functions.R b/R/functions.R
@@ -377,3 +377,106 @@ downloader <- function(file, method) {
   }
 }
 
+################################################################################
+# function : gene centered pairs
+################################################################################
+
+geneCenteredPairs <- function(gene){
+  ## remove the "." version id of ENSEMBL ids
+  gene$gene_id1 <- gsub("\\..*", "", gene$gene_id)
+
+
+  ## connect to our GENCODE v40 database to get tts of the genes
+  conn <- RSQLite::dbConnect(RSQLite::SQLite(),
+                             system.file("extdata",
+                                         "Annotation.db",
+                                         package = "CENTRE"))
+  #get chromosome and tts of our genes
+
+  query <- paste("SELECT  gene_id1, chr, transcription_start FROM gencode WHERE gene_id1 in (",
+                 paste0(sprintf("'%s'", gene$gene_id1), collapse = ", "), ")", sep = "")
+  gene <- RSQLite::dbGetQuery(conn, query)
+
+  #Select all of the annotation for ccres v3
+  ccresEnhancer <- RSQLite::dbGetQuery(conn, "SELECT * FROM ccres_enhancer")
+  RSQLite::dbDisconnect(conn)
+
+
+  genesRange <- with(gene,
+                     GenomicRanges::GRanges(chr,
+                                            IRanges::IRanges(start = transcription_start,
+                                                             end = transcription_start),
+                                            gene_id1 = gene_id1))
+
+  #extend the gene region 500Kb to the left of TTS and to the right
+  genesRange <- regioneR::extendRegions(genesRange,
+                                        extend.start = 500000,
+                                        extend.end = 500000)
+
+  enhancerRange <-  with(ccresEnhancer,
+                         GenomicRanges::GRanges(V1,
+                                                IRanges::IRanges(start = new_start,
+                                                                 end = new_end),
+                                                enhancer_id = V5))
+
+
+  # find the enhancers that overlap the extended gene region
+  overlaps <- GenomicRanges::findOverlaps(genesRange, enhancerRange,
+                                          ignore.strand = TRUE)
+
+  ccresOverlapping <- data.frame(gene_id1 = GenomicRanges::elementMetadata(genesRange)$gene_id1[overlaps@from],
+                                 enhancer_id = GenomicRanges::elementMetadata(enhancerRange)$enhancer_id[overlaps@to])
+
+  return(ccresOverlapping)
+
+}
+
+
+
+################################################################################
+# function : enhancer centered pairs
+################################################################################
+
+enhancerCenteredPairs <- function(enhancer){
+
+  conn <- RSQLite::dbConnect(RSQLite::SQLite(),
+                             system.file("extdata",
+                                         "Annotation.db",
+                                         package = "CENTRE"))
+  #get chromosome and tts of our genes
+
+  query <- paste("SELECT  V5, V1, middle_point FROM ccres_enhancer WHERE V5 in (",
+                 paste0(sprintf("'%s'", enhancer$enhancer_id), collapse = ", "), ")", sep = "")
+  enhancer <- RSQLite::dbGetQuery(conn, query)
+
+  #Select all of the annotation for ccres v3
+  gene <- RSQLite::dbGetQuery(conn, "SELECT * FROM gencode")
+  RSQLite::dbDisconnect(conn)
+  gene$gene_id1 <- gsub("\\..*", "", gene$gene_id)
+  enhancerRange <- with(enhancer,
+                     GenomicRanges::GRanges(V1,
+                                            IRanges::IRanges(start = middle_point,
+                                                             end = middle_point),
+                                            enhancer_id = V5))
+
+  #extend the gene region 500Kb to the left of TTS and to the right
+  enhancerRange <- regioneR::extendRegions(enhancerRange,
+                                        extend.start = 500000,
+                                        extend.end = 500000)
+
+  genesRange <-  with(gene,
+                         GenomicRanges::GRanges(chr,
+                                                IRanges::IRanges(start = new_start,
+                                                                 end = new_end),
+                                                gene_id1 = gene_id1))
+
+
+  # find the enhancers that overlap the extended gene region
+  overlaps <- GenomicRanges::findOverlaps(enhancerRange,
+                                          genesRange,
+                                          ignore.strand = TRUE)
+
+  ccresOverlapping <- data.frame(gene_id1 = GenomicRanges::elementMetadata(genesRange)$gene_id1[overlaps@to],
+                        enhancer_id = GenomicRanges::elementMetadata(enhancerRange)$enhancer_id[overlaps@from])
+  return(ccresOverlapping)
+}
diff --git a/inst/extdata/output_enh_pairs.rds b/inst/extdata/output_enh_pairs.rds
diff --git a/man/centrePrediction.Rd b/man/centrePrediction.Rd
diff --git a/man/computeCellTypeFeatures.Rd b/man/computeCellTypeFeatures.Rd
diff --git a/man/computeGenericFeatures.Rd b/man/computeGenericFeatures.Rd