-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_gene_lengths_brightness.R
68 lines (51 loc) · 3.11 KB
/
get_gene_lengths_brightness.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#### -------------------------------------------------------------------------------
#### created on 03 oct 2022, 07:12pm
#### author: dhrubas2
#### -------------------------------------------------------------------------------
.wpath. <- "/Users/dhrubas2/OneDrive - National Institutes of Health/Projects/TMEcontribution/analysis/submission/Code/analysis/"
.mpath. <- "miscellaneous/r/miscellaneous.R"
setwd(.wpath.) # current path
source(.mpath.)
library(rtracklayer)
library(GenomicFeatures)
pcat <- function(...) cat(paste0(..., "\n"))
cat("\014") # clears console
#### -------------------------------------------------------------------------------
## import gtf file for BrighTNess: https://www.nature.com/articles/s41523-021-00349-y#Sec7
## Illumina HiSeq3000 with 50-bp single-end mode, STAR v2.0.4b,
## Ensembl release 76 (GRCh38.76)
## download: http://ftp.ensembl.org/pub/release-76/gtf/homo_sapiens/Homo_sapiens.GRCh38.76.gtf.gz
## cmd: curl -o filename URL (change "http" to "ftp")
gtf.file <- "../../data/BrighTNess/Homo_sapiens.GRCh38.76.gtf"
gtf <- import.gff(gtf.file, format = "gtf")
## extract total non-overlapping exon length per gene: https://www.biostars.org/p/83901/
## 1. import the GTF-file that you have also used as input for htseq-count
## 2. collect the exons per gene id
## 3. for each gene, reduce all the exons to a set of non overlapping exons, calculate their lengths (widths) and sum
gtf.txdb <- makeTxDbFromGRanges(gtf, drop.stop.codons = T)
exon.list.gene <- exonsBy(gtf.txdb, by = "gene", use.names = F)
exon.length.gene <- reduce(exon.list.gene) %>% width %>% sum # reduce() & width() are from GenomicRanges/GenomicFeatures
#### -------------------------------------------------------------------------------
## combine gene lengths with annotations.
gene.annot <- as.data.frame(gtf)[, c("gene_id", "gene_name", "gene_biotype")] %>%
(function(x) x[!(x$gene_id %>% duplicated), ])
gene.annot <- gene.annot[
order(gene.annot$gene_id, gene.annot$gene_name, decreasing = T), ]
rownames(gene.annot) <- gene.annot$gene_id
use.genes <- intersect(gene.annot$gene_id, exon.length.gene %>% names) # genes with annotation
gene.annot[use.genes, "gene_exon_length"] <- exon.length.gene[use.genes]
pcat("\ntotal #genes = ", gene.annot %>% nrow)
## keep only genes with unique ensembl IDs.
gene.annot <- gene.annot[-(gene.annot$gene_name %>% duplicated %>% which), ] %>%
`rownames<-`(.$gene_name)
pcat("\ntotal #genes kept = ", gene.annot %>% nrow)
#### -------------------------------------------------------------------------------
## save annotation.
svdat <- F # set T to save data
if (svdat){
datestamp <- DateTime()
annot.file <- sprintf("../../data/BrighTNess/gene_length_ensembl.grch38.76_SRD_%s.txt",
datestamp)
write.table(gene.annot, file = annot.file, sep = "\t", col.names = T,
row.names = F)
}