From 2778580fbe150537b912f6b08ca52301cf149892 Mon Sep 17 00:00:00 2001 From: subwaystation Date: Tue, 23 Feb 2021 16:13:11 +0100 Subject: [PATCH] user can decide display names in PNG --- README.md | 22 +++++++++++--- pgge | 74 ++++++++++++++++++++++++++++++++++++++--------- scripts/beehave.R | 52 ++++++++++++++++++++++++++------- 3 files changed, 119 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 42e439e..cad6060 100644 --- a/README.md +++ b/README.md @@ -53,10 +53,24 @@ in your given FASTA file, the results will only contain one line of metrics. In you have contig sequences in your FASTA and want to summarize by sample name. _`pgge`_ always splits by `.` and takes the first entry in the resulting split as sample name. -:warning: In addition, _`pgge`_ assumes that there is *@NUMBER* in each name of the input GAFs. This *NUMBER* is extracted -later and appears in the table and output plot as *cons.jump*, because _`pgge`_ was designed for processing the results -of _`pggb`_. If you are evaluating your own data not originating from _`pggb`_ it is recommended to add a different *@NUMBER* -for each of your input GAFs in order to better understand the resulting output. +:warning: _`pgge`_ was designed for processing the results +of _`pggb`_. If you are evaluating your own data not originating from _`pggb`_ it is recommended to set the `-n/--input-graph-names` parameter to ensure the final PNG is labeled correctly. This parameters requires a TSV with 2 rows: + +1. The name of the original input graph. +2. The name to display in the PNG. + +In the following an example for the yeast data set: + +``` +cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10000::y:0:1000000.gfa 10k::y:0:1000k +cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@1000::y:0:1000000.gfa 1k::y:0:1000k +cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@100::y:0:1000000.gfa 0.1k::y:0:1000k +cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10::y:0:1000000.gfa 0.01k::y:0:1000k +cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10000:y.gfa 10k:y +cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@1000:y.gfa 1k:y +cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@100:y.gfa 0.1k:y +cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10:y.gfa 0.01k:y +``` ### output diff --git a/pgge b/pgge index 98d7996..17a2867 100755 --- a/pgge +++ b/pgge @@ -6,6 +6,7 @@ set -eo pipefail input_gfa=false input_fasta=false input_gaf=false +input_graph_names=false output_dir="." splitfa=false seq_length=false @@ -14,16 +15,19 @@ beehave_R=false peanut_bed=false threads=1 +num_gfas=0 +num_input_graph_names=0 + if [ $# -eq 0 ]; then show_help=true -fi +fi ## TODO add options for GraphAligner # read the options cmd=$0" "$@ -TEMP=`getopt -o g:f:i:r:o:l:s:t:hb --long input-gfa:,input-fasta:,input-gaf:,beehave-r:,output-dir:,seq-length:,step:,threads:,help,peanut-bed -n 'pgge' -- "$@"` +TEMP=`getopt -o g:f:i:n:r:o:l:s:t:hb --long input-gfa:,input-fasta:,input-gaf:,input-graph-names:,beehave-r:,output-dir:,seq-length:,step:,threads:,help,peanut-bed -n 'pgge' -- "$@"` eval set -- "$TEMP" # extract options and their arguments into variables. @@ -32,6 +36,7 @@ while true ; do -g|--input-gfa) input_gfa=$2 ; shift 2 ;; -f|--input-fasta) input_fasta=$2 ; shift 2 ;; -i|--input-gaf) input_gaf=$2 ; shift 2 ;; + -n|--input-graph-names) input_graph_names=$2 ; shift 2 ;; -r|--beehave-r) beehave_R=$2 ; shift 2 ;; -o|--output-dir) output_dir=$2 ; shift 2 ;; -l|--seq-length) seq_length=$2 ; shift 2 ;; @@ -118,7 +123,8 @@ then echo " -f, --input-fasta FILE input FASTA file (uncompressed or gzipped)" echo " -o, --output-dir FILE output directory" echo " -r, --beehave-r PATH path to beehave.R" - echo " -b, --peanut-bed PATH output BED file" + echo " -b, --peanut-bed PATH output BED file" + echo " -n, --input-graph-names TSV input graph name file: first row is the name of the original input file, second row is the display name in the PNG" echo " [splitfa]" echo " -l, --seq-length N length of the splits" echo " -s, --step N step size between splits" @@ -159,6 +165,7 @@ evaluation: output-dir: $output_dir beehave-r: $beehave_R peanut-bed: $peanut_bed + input-graph-names: $input_graph_names splitfa: seq-length: $seq_length step: $step @@ -169,6 +176,18 @@ general: EOT echo -e "\nRunning pgge\n" >> "$log_file" +# does the given input-graph-names exist? +if [[ $input_graph_names != false ]]; +then + if [[ + ! -f $input_graph_names + ]] + then + >&2 echo "'$input_graph_names' graph names file does not exist! Please correct and re-run pgge." + exit + fi +fi + ## START gfa_input if [[ $input_gaf == false ]]; then @@ -180,11 +199,26 @@ do ! -f $gfa ]]; then - >&2 echo "$gfa GFA does not exist! Please correct and re-run pgge." + >&2 echo "'$gfa' GFA does not exist! Please correct and re-run pgge." exit + else + ((num_gfas=num_gfas+1)) fi done +# TODO find out if #lines == #GFAs +# echo $num_gfas +#echo $(wc -l $input_graph_names | cut -f 1 -d " ") +if [[ $input_graph_names != false ]]; +then + num_input_graph_names=$(wc -l $input_graph_names | cut -f 1 -d " ") + if [[ $num_gfas != $num_input_graph_names ]]; + then + >&2 echo "Number of given graph names in '$input_graph_names' does not match given number of GFAs. Please correct and re-run pgge." + exit + fi +fi + # Manage uncompressed or gzip compressed FASTA files if (file "$input_fasta" | grep -q 'gzip compressed data' ) ; then input_fasta_uncompressed="$(echo "$input_fasta" | sed -e 's/\.[^./]*$//')" @@ -272,7 +306,7 @@ if [[ ! -f $input_gaf ]]; then - >&2 echo "$input_gaf GAF does not exist! Please correct and re-run pgge." + >&2 echo "'$input_gaf' GAF does not exist! Please correct and re-run pgge." exit fi @@ -291,12 +325,24 @@ input_gaf_base="$(basename -- "$input_gaf")" fi ## END gaf_input -(echo sample.name cons.jump aln.id qsc uniq multi nonaln; ls "$output_dir" | grep gaf | grep pgge$ | while read f; \ -do - echo "$(echo "$output_dir"/"$f" | cut -f 2 -d .)" "$(echo "$output_dir"/"$f" | cut -f 2 -d @ | cut -f 1 -d. )" "$(cat "$output_dir"/"$f")"; \ -done) | tr ' ' '\t' > "$prefix_pgge".tsv - -$timer -f "$fmt" Rscript \ - "$beehave_R" "$prefix_pgge".tsv \ - "$prefix_pgge".tsv.png \ - 2> >(tee -a "$log_file") +if [[ $input_graph_names == false ]]; +then + (echo sample.name cons.jump aln.id qsc uniq multi nonaln; ls "$output_dir" | grep gaf | grep pgge$ | while read f; \ + do + echo "$(echo "$output_dir"/"$f" | cut -f 2 -d .)" "$(echo "$output_dir"/"$f" | cut -f 2 -d @ | cut -f 1 -d.)" "$(cat "$output_dir"/"$f")"; \ + done) | tr ' ' '\t' > "$prefix_pgge".tsv + $timer -f "$fmt" Rscript \ + "$beehave_R" "$prefix_pgge".tsv \ + "$prefix_pgge".tsv.png \ + 2> >(tee -a "$log_file") +else + (echo sample.name cons.jump aln.id qsc uniq multi nonaln; ls "$output_dir" | grep gaf | grep pgge$ | while read f; \ + do + echo "$(echo "$output_dir"/"$f" | cut -f 2 -d .)" "$(basename $(echo "$output_dir"/"$f"))" "$(cat "$output_dir"/"$f")"; \ + done) | tr ' ' '\t' > "$prefix_pgge".tsv + $timer -f "$fmt" Rscript \ + "$beehave_R" "$prefix_pgge".tsv \ + "$prefix_pgge".tsv.png \ + "$input_graph_names" \ + 2> >(tee -a "$log_file") +fi \ No newline at end of file diff --git a/scripts/beehave.R b/scripts/beehave.R index 2c899b2..fb85cae 100644 --- a/scripts/beehave.R +++ b/scripts/beehave.R @@ -7,47 +7,77 @@ require(gridExtra) args <- commandArgs(trailingOnly = T) input.pgge <- read.table(args[1], sep = '\t', header = T) +output.png <- args[2] + +## +#input.pgge <- read.table("/home/heumos/Downloads/yeast/pgge_graph_names/pgge-l100000-s50000.tsv", sep = '\t', header = T) +## +#output.png <- "/home/heumos/Downloads/yeast/pgge-l100000-s50000.png" +## +#graph_names <- read.delim("/home/heumos/Downloads/yeast/graph_names.tsv", header = F) + +if (!is.na(args[3])) { + graph_names <- read.delim(args[3], header = F) + for (row in 1:dim(input.pgge)[1]) { + name <- input.pgge$cons.jump[row] + split <- strsplit(name, "\\.") + end <- length(split[[1]]) - 2 + sub_split <- split[[1]][3:end] + final_name <- paste(sub_split, collapse = ".") + input.pgge$cons.jump[row] <- final_name + } + graph_display_names <- graph_names$V2 + names(graph_display_names) <- graph_names$V1 + for (row in 1:dim(input.pgge)[1]) { + input.pgge$cons.jump[row] <- graph_display_names[input.pgge$cons.jump[row]] + } +} aln.id <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = aln.id, label = sample.name)) + geom_violin() + geom_point() + geom_text_repel(box.padding = 0.1, max.overlaps = 20) + - xlab("consensus jump-max") + + xlab("graph name") + theme(text = element_text(size = 16)) + - scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) + scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) + + scale_x_discrete(guide = guide_axis(angle = 90)) qsc <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = qsc, label = sample.name)) + geom_violin() + geom_point() + geom_text_repel(box.padding = 0.1, max.overlaps = 20) + - xlab("consensus jump-max") + + xlab("graph name") + theme(text = element_text(size = 16)) + - scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) + scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) + + scale_x_discrete(guide = guide_axis(angle = 90)) uniq <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = uniq, label = sample.name)) + geom_violin() + geom_point() + geom_text_repel(box.padding = 0.1, max.overlaps = 20) + - xlab("consensus jump-max") + + xlab("graph name") + theme(text = element_text(size = 16)) + - scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) + scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) + + scale_x_discrete(guide = guide_axis(angle = 90)) multi <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = multi, label = sample.name)) + geom_violin() + geom_point() + geom_text_repel(box.padding = 0.1, max.overlaps = 20) + - xlab("consensus jump-max") + + xlab("graph name") + theme(text = element_text(size = 16)) + - scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) + scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) + + scale_x_discrete(guide = guide_axis(angle = 90)) nonaln <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = nonaln, label = sample.name)) + geom_violin() + geom_point() + geom_text_repel(box.padding = 0.1, max.overlaps = 20) + - xlab("consensus jump-max") + + xlab("graph name") + theme(text = element_text(size = 16)) + - scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) + scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))+ + scale_x_discrete(guide = guide_axis(angle = 90)) -png(args[2], width = 2000, height = 500, pointsize = 25) +png(output.png, width = 2000, height = 500, pointsize = 25) g <- grid.arrange(aln.id, qsc, uniq, multi, nonaln, nrow = 1) dev.off()