-
Notifications
You must be signed in to change notification settings - Fork 719
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1128 from nf-core/dev
Dev -> master for 3.13.2 release
- Loading branch information
Showing
10 changed files
with
167 additions
and
88 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,94 +1,143 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
# Written by Lorena Pantano and released under the MIT license. | ||
# Script for importing and processing transcript-level quantifications. | ||
# Written by Lorena Pantano, later modified by Jonathan Manning, and released | ||
# under the MIT license. | ||
|
||
# Loading required libraries | ||
library(SummarizedExperiment) | ||
library(tximport) | ||
|
||
args = commandArgs(trailingOnly=TRUE) | ||
# Parsing command line arguments | ||
args <- commandArgs(trailingOnly=TRUE) | ||
if (length(args) < 4) { | ||
stop("Usage: tximport.r <coldata> <path> <sample_name> <quant_type> <tx2gene_path>", call.=FALSE) | ||
stop("Usage: tximport.r <coldata_path> <path> <prefix> <quant_type> <tx2gene_path>", | ||
call.=FALSE) | ||
} | ||
|
||
coldata = args[1] | ||
path = args[2] | ||
sample_name = args[3] | ||
quant_type = args[4] | ||
tx2gene_path = args[5] | ||
|
||
prefix = sample_name | ||
|
||
info = file.info(tx2gene_path) | ||
if (info$size == 0) { | ||
tx2gene = NULL | ||
} else { | ||
rowdata = read.csv(tx2gene_path, sep="\t", header = FALSE) | ||
colnames(rowdata) = c("tx", "gene_id", "gene_name") | ||
tx2gene = rowdata[,1:2] | ||
# Assigning command line arguments to variables | ||
coldata_path <- args[1] | ||
path <- args[2] | ||
prefix <- args[3] | ||
quant_type <- args[4] | ||
tx2gene_path <- args[5] | ||
|
||
## Functions | ||
|
||
# Build a table from a SummarizedExperiment object | ||
build_table <- function(se.obj, slot) { | ||
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) | ||
} | ||
|
||
pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") | ||
fns = list.files(path, pattern = pattern, recursive = T, full.names = T) | ||
names = basename(dirname(fns)) | ||
names(fns) = names | ||
|
||
if (file.exists(coldata)) { | ||
coldata = read.csv(coldata, sep="\t") | ||
coldata = coldata[match(names, coldata[,1]),] | ||
coldata = cbind(files = fns, coldata) | ||
} else { | ||
message("ColData not available: ", coldata) | ||
coldata = data.frame(files = fns, names = names) | ||
# Write a table to a file with given parameters | ||
write_se_table <- function(params) { | ||
file_name <- paste0(prefix, ".", params$suffix) | ||
write.table(build_table(params$obj, params$slot), file_name, | ||
sep="\t", quote=FALSE, row.names = FALSE) | ||
} | ||
|
||
dropInfReps = quant_type == "kallisto" | ||
# Read transcript metadata from a given path | ||
read_transcript_info <- function(tinfo_path){ | ||
info <- file.info(tinfo_path) | ||
if (info$size == 0) { | ||
stop("tx2gene file is empty") | ||
} | ||
|
||
transcript_info <- read.csv(tinfo_path, sep="\t", header = FALSE, | ||
col.names = c("tx", "gene_id", "gene_name")) | ||
|
||
txi = tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) | ||
rownames(coldata) = coldata[["names"]] | ||
extra = setdiff(rownames(txi[[1]]), as.character(rowdata[["tx"]])) | ||
if (length(extra) > 0) { | ||
rowdata = rbind(rowdata, data.frame(tx=extra, gene_id=extra, gene_name=extra)) | ||
extra <- setdiff(rownames(txi[[1]]), as.character(transcript_info[["tx"]])) | ||
transcript_info <- rbind(transcript_info, data.frame(tx=extra, gene_id=extra, gene_name=extra)) | ||
transcript_info <- transcript_info[match(rownames(txi[[1]]), transcript_info[["tx"]]), ] | ||
rownames(transcript_info) <- transcript_info[["tx"]] | ||
|
||
list(transcript = transcript_info, | ||
gene = unique(transcript_info[,2:3]), | ||
tx2gene = transcript_info[,1:2]) | ||
} | ||
rowdata = rowdata[match(rownames(txi[[1]]), as.character(rowdata[["tx"]])),] | ||
rownames(rowdata) = rowdata[["tx"]] | ||
se = SummarizedExperiment(assays = list(counts = txi[["counts"]], abundance = txi[["abundance"]], length = txi[["length"]]), | ||
colData = DataFrame(coldata), | ||
rowData = rowdata) | ||
if (!is.null(tx2gene)) { | ||
gi = summarizeToGene(txi, tx2gene = tx2gene) | ||
gi.ls = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") | ||
gi.s = summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") | ||
growdata = unique(rowdata[,2:3]) | ||
growdata = growdata[match(rownames(gi[[1]]), growdata[["gene_id"]]),] | ||
rownames(growdata) = growdata[["tx"]] | ||
gse = SummarizedExperiment(assays = list(counts = gi[["counts"]], abundance = gi[["abundance"]], length = gi[["length"]]), | ||
colData = DataFrame(coldata), | ||
rowData = growdata) | ||
gse.ls = SummarizedExperiment(assays = list(counts = gi.ls[["counts"]], abundance = gi.ls[["abundance"]], length = gi.ls[["length"]]), | ||
colData = DataFrame(coldata), | ||
rowData = growdata) | ||
gse.s = SummarizedExperiment(assays = list(counts = gi.s[["counts"]], abundance = gi.s[["abundance"]], length = gi.s[["length"]]), | ||
colData = DataFrame(coldata), | ||
rowData = growdata) | ||
|
||
# Read and process sample/column data from a given path | ||
read_coldata <- function(coldata_path){ | ||
if (file.exists(coldata_path)) { | ||
coldata <- read.csv(coldata_path, sep="\t") | ||
coldata <- coldata[match(names, coldata[,1]),] | ||
coldata <- cbind(files = fns, coldata) | ||
} else { | ||
message("ColData not available: ", coldata_path) | ||
coldata <- data.frame(files = fns, names = names) | ||
} | ||
rownames(coldata) <- coldata[["names"]] | ||
} | ||
|
||
build_table = function(se.obj, slot) { | ||
cbind(rowData(se.obj)[,1:2], assays(se.obj)[[slot]]) | ||
# Create a SummarizedExperiment object with given data | ||
create_summarized_experiment <- function(counts, abundance, length, col_data, row_data) { | ||
SummarizedExperiment(assays = list(counts = counts, abundance = abundance, length = length), | ||
colData = col_data, | ||
rowData = row_data) | ||
} | ||
|
||
if(exists("gse")){ | ||
write.table(build_table(gse, "abundance"), paste(c(prefix, "gene_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) | ||
write.table(build_table(gse, "counts"), paste(c(prefix, "gene_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) | ||
write.table(build_table(gse.ls, "abundance"), paste(c(prefix, "gene_tpm_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) | ||
write.table(build_table(gse.ls, "counts"), paste(c(prefix, "gene_counts_length_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) | ||
write.table(build_table(gse.s, "abundance"), paste(c(prefix, "gene_tpm_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) | ||
write.table(build_table(gse.s, "counts"), paste(c(prefix, "gene_counts_scaled.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) | ||
# Main script starts here | ||
|
||
# Define pattern for file names based on quantification type | ||
pattern <- ifelse(quant_type == "kallisto", "abundance.tsv", "quant.sf") | ||
fns <- list.files(path, pattern = pattern, recursive = T, full.names = T) | ||
names <- basename(dirname(fns)) | ||
names(fns) <- names | ||
dropInfReps <- quant_type == "kallisto" | ||
|
||
# Import transcript-level quantifications | ||
txi <- tximport(fns, type = quant_type, txOut = TRUE, dropInfReps = dropInfReps) | ||
|
||
# Read transcript and sample data | ||
transcript_info <- read_transcript_info(tx2gene_path) | ||
coldata <- read_coldata(coldata_path) | ||
|
||
# Create initial SummarizedExperiment object | ||
se <- create_summarized_experiment(txi[["counts"]], txi[["abundance"]], txi[["length"]], | ||
DataFrame(coldata), transcript_info$transcript) | ||
|
||
# Setting parameters for writing tables | ||
params <- list( | ||
list(obj = se, slot = "abundance", suffix = "transcript_tpm.tsv"), | ||
list(obj = se, slot = "counts", suffix = "transcript_counts.tsv"), | ||
list(obj = se, slot = "length", suffix = "transcript_lengths.tsv") | ||
) | ||
|
||
# Process gene-level data if tx2gene mapping is available | ||
if ("tx2gene" %in% names(transcript_info) && !is.null(transcript_info$tx2gene)) { | ||
tx2gene <- transcript_info$tx2gene | ||
gi <- summarizeToGene(txi, tx2gene = tx2gene) | ||
gi.ls <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "lengthScaledTPM") | ||
gi.s <- summarizeToGene(txi, tx2gene = tx2gene, countsFromAbundance = "scaledTPM") | ||
|
||
gene_info <- transcript_info$gene[match(rownames(gi[[1]]), transcript_info$gene[["gene_id"]]),] | ||
rownames(gene_info) <- gene_info[["tx"]] | ||
|
||
col_data_frame <- DataFrame(coldata) | ||
|
||
# Create gene-level SummarizedExperiment objects | ||
gse <- create_summarized_experiment(gi[["counts"]], gi[["abundance"]], gi[["length"]], | ||
col_data_frame, gene_info) | ||
gse.ls <- create_summarized_experiment(gi.ls[["counts"]], gi.ls[["abundance"]], gi.ls[["length"]], | ||
col_data_frame, gene_info) | ||
gse.s <- create_summarized_experiment(gi.s[["counts"]], gi.s[["abundance"]], gi.s[["length"]], | ||
col_data_frame, gene_info) | ||
|
||
params <- c(params, list( | ||
list(obj = gse, slot = "length", suffix = "gene_lengths.tsv"), | ||
list(obj = gse, slot = "abundance", suffix = "gene_tpm.tsv"), | ||
list(obj = gse, slot = "counts", suffix = "gene_counts.tsv"), | ||
list(obj = gse.ls, slot = "abundance", suffix = "gene_tpm_length_scaled.tsv"), | ||
list(obj = gse.ls, slot = "counts", suffix = "gene_counts_length_scaled.tsv"), | ||
list(obj = gse.s, slot = "abundance", suffix = "gene_tpm_scaled.tsv"), | ||
list(obj = gse.s, slot = "counts", suffix = "gene_counts_scaled.tsv") | ||
)) | ||
} | ||
|
||
write.table(build_table(se, "abundance"), paste(c(prefix, "transcript_tpm.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) | ||
write.table(build_table(se, "counts"), paste(c(prefix, "transcript_counts.tsv"), collapse="."), sep="\t", quote=FALSE, row.names = FALSE) | ||
# Writing tables for each set of parameters | ||
done <- lapply(params, write_se_table) | ||
|
||
# Print sessioninfo to standard out | ||
# Output session information and citations | ||
citation("tximeta") | ||
sessionInfo() | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters