Merge pull request #7 from pangenome/user_friendliness

user can decide display names in PNG
pangenome · Feb 23, 2021 · 07b82c9 · 07b82c9
2 parents ae14af4 + 2778580
commit 07b82c9
Show file tree

Hide file tree

Showing 3 changed files with 119 additions and 29 deletions.
diff --git a/README.md b/README.md
@@ -53,10 +53,24 @@ in your given FASTA file, the results will only contain one line of metrics. In
 you have contig sequences in your FASTA and want to summarize by sample name. _`pgge`_ always splits by `.` and takes the
 first entry in the resulting split as sample name. 
 
-:warning: In addition, _`pgge`_ assumes that there is *@NUMBER* in each name of the input GAFs. This *NUMBER* is extracted
-later and appears in the table and output plot as *cons.jump*, because _`pgge`_ was designed for processing the results
-of _`pggb`_. If you are evaluating your own data not originating from _`pggb`_ it is recommended to add a different *@NUMBER*
-for each of your input GAFs in order to better understand the resulting output.
+:warning: _`pgge`_ was designed for processing the results
+of _`pggb`_. If you are evaluating your own data not originating from _`pggb`_ it is recommended to set the `-n/--input-graph-names` parameter to ensure the final PNG is labeled correctly. This parameters requires a TSV with 2 rows: 
+
+1. The name of the original input graph.
+2. The name to display in the PNG.
+
+In the following an example for the yeast data set:
+
+```
+cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10000::y:0:1000000.gfa	10k::y:0:1000k
+cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@1000::y:0:1000000.gfa	1k::y:0:1000k
+cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@100::y:0:1000000.gfa	0.1k::y:0:1000k
+cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10::y:0:1000000.gfa	0.01k::y:0:1000k
+cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10000:y.gfa	10k:y
+cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@1000:y.gfa	1k:y
+cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@100:y.gfa	0.1k:y
+cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10:y.gfa	0.01k:y
+```
 
 ### output
 

diff --git a/pgge b/pgge
@@ -6,6 +6,7 @@ set -eo pipefail
 input_gfa=false
 input_fasta=false
 input_gaf=false
+input_graph_names=false
 output_dir="."
 splitfa=false
 seq_length=false
@@ -14,16 +15,19 @@ beehave_R=false
 peanut_bed=false
 threads=1
 
+num_gfas=0
+num_input_graph_names=0
+
 if [ $# -eq 0 ];
 then
     show_help=true
-fi
+fi 
 
 ## TODO add options for GraphAligner
 
 # read the options
 cmd=$0" "$@
-TEMP=`getopt -o g:f:i:r:o:l:s:t:hb --long input-gfa:,input-fasta:,input-gaf:,beehave-r:,output-dir:,seq-length:,step:,threads:,help,peanut-bed -n 'pgge' -- "$@"`
+TEMP=`getopt -o g:f:i:n:r:o:l:s:t:hb --long input-gfa:,input-fasta:,input-gaf:,input-graph-names:,beehave-r:,output-dir:,seq-length:,step:,threads:,help,peanut-bed -n 'pgge' -- "$@"`
 eval set -- "$TEMP"
 
 # extract options and their arguments into variables.
@@ -32,6 +36,7 @@ while true ; do
         -g|--input-gfa) input_gfa=$2 ; shift 2 ;;
         -f|--input-fasta) input_fasta=$2 ; shift 2 ;;
         -i|--input-gaf) input_gaf=$2 ; shift 2 ;;
+        -n|--input-graph-names) input_graph_names=$2 ; shift 2 ;;
         -r|--beehave-r) beehave_R=$2 ; shift 2 ;;
         -o|--output-dir) output_dir=$2 ; shift 2 ;;
         -l|--seq-length) seq_length=$2 ; shift 2 ;;
@@ -118,7 +123,8 @@ then
     echo "    -f, --input-fasta FILE      input FASTA file (uncompressed or gzipped)"
     echo "    -o, --output-dir FILE       output directory"
     echo "    -r, --beehave-r PATH        path to beehave.R"
-    echo "    -b, --peanut-bed PATH       output BED file"    
+    echo "    -b, --peanut-bed PATH       output BED file"  
+    echo "    -n, --input-graph-names TSV input graph name file: first row is the name of the original input file, second row is the display name in the PNG"    
     echo "   [splitfa]"
     echo "    -l, --seq-length N          length of the splits"
     echo "    -s, --step N                step size between splits"
@@ -159,6 +165,7 @@ evaluation:
   output-dir:         $output_dir
   beehave-r:          $beehave_R
   peanut-bed:         $peanut_bed
+  input-graph-names:  $input_graph_names
 splitfa:
   seq-length:         $seq_length
   step:               $step
@@ -169,6 +176,18 @@ general:
 EOT
 echo -e "\nRunning pgge\n" >> "$log_file"
 
+# does the given input-graph-names exist?
+if [[ $input_graph_names != false ]];
+then
+    if [[
+        ! -f $input_graph_names
+    ]]
+    then
+        >&2 echo "'$input_graph_names' graph names file does not exist! Please correct and re-run pgge."
+        exit
+    fi
+fi
+
 ## START gfa_input
 if [[ $input_gaf == false ]];
 then
@@ -180,11 +199,26 @@ do
         ! -f $gfa
         ]];
     then
-        >&2 echo "$gfa GFA does not exist! Please correct and re-run pgge."
+        >&2 echo "'$gfa' GFA does not exist! Please correct and re-run pgge."
         exit
+    else 
+        ((num_gfas=num_gfas+1))
     fi
 done
 
+# TODO find out if #lines == #GFAs
+# echo $num_gfas
+#echo $(wc -l $input_graph_names | cut -f 1 -d " ")
+if [[ $input_graph_names != false ]];
+then
+    num_input_graph_names=$(wc -l $input_graph_names | cut -f 1 -d " ")
+    if [[ $num_gfas != $num_input_graph_names ]];
+    then
+        >&2 echo "Number of given graph names in '$input_graph_names' does not match given number of GFAs. Please correct and re-run pgge."
+            exit
+    fi
+fi
+
 # Manage uncompressed or gzip compressed FASTA files
 if (file "$input_fasta" | grep -q 'gzip compressed data' ) ; then
   input_fasta_uncompressed="$(echo "$input_fasta" | sed -e 's/\.[^./]*$//')"
@@ -272,7 +306,7 @@ if [[
     ! -f $input_gaf
     ]];
 then
-    >&2 echo "$input_gaf GAF does not exist! Please correct and re-run pgge."
+    >&2 echo "'$input_gaf' GAF does not exist! Please correct and re-run pgge."
     exit
 fi
 
@@ -291,12 +325,24 @@ input_gaf_base="$(basename -- "$input_gaf")"
 fi
 ## END gaf_input
 
-(echo sample.name cons.jump aln.id qsc uniq multi nonaln; ls "$output_dir" | grep gaf | grep pgge$ | while read f; \
-do
-    echo "$(echo "$output_dir"/"$f" | cut -f 2 -d .)" "$(echo "$output_dir"/"$f" | cut -f 2 -d @ | cut -f 1 -d. )" "$(cat "$output_dir"/"$f")"; \
-done) | tr ' ' '\t' > "$prefix_pgge".tsv
-
-$timer -f "$fmt" Rscript \
-    "$beehave_R" "$prefix_pgge".tsv \
-    "$prefix_pgge".tsv.png \
-    2> >(tee -a "$log_file")
+if [[ $input_graph_names == false ]];
+then
+    (echo sample.name cons.jump aln.id qsc uniq multi nonaln; ls "$output_dir" | grep gaf | grep pgge$ | while read f; \
+    do
+    echo "$(echo "$output_dir"/"$f" | cut -f 2 -d .)" "$(echo "$output_dir"/"$f" | cut -f 2 -d @ | cut -f 1 -d.)" "$(cat "$output_dir"/"$f")"; \
+    done) | tr ' ' '\t' > "$prefix_pgge".tsv
+    $timer -f "$fmt" Rscript \
+        "$beehave_R" "$prefix_pgge".tsv \
+        "$prefix_pgge".tsv.png \
+        2> >(tee -a "$log_file")
+else 
+    (echo sample.name cons.jump aln.id qsc uniq multi nonaln; ls "$output_dir" | grep gaf | grep pgge$ | while read f; \
+    do
+    echo "$(echo "$output_dir"/"$f" | cut -f 2 -d .)" "$(basename $(echo "$output_dir"/"$f"))" "$(cat "$output_dir"/"$f")"; \
+    done) | tr ' ' '\t' > "$prefix_pgge".tsv
+    $timer -f "$fmt" Rscript \
+        "$beehave_R" "$prefix_pgge".tsv \
+        "$prefix_pgge".tsv.png \
+        "$input_graph_names" \
+        2> >(tee -a "$log_file")
+fi
diff --git a/scripts/beehave.R b/scripts/beehave.R
@@ -7,47 +7,77 @@ require(gridExtra)
 args <- commandArgs(trailingOnly = T)
 
 input.pgge <- read.table(args[1], sep = '\t', header = T)
+output.png <- args[2]
+
+##
+#input.pgge <- read.table("/home/heumos/Downloads/yeast/pgge_graph_names/pgge-l100000-s50000.tsv", sep = '\t', header = T)
+##
+#output.png <- "/home/heumos/Downloads/yeast/pgge-l100000-s50000.png"
+##
+#graph_names <- read.delim("/home/heumos/Downloads/yeast/graph_names.tsv", header = F)
+
+if (!is.na(args[3])) {
+  graph_names <- read.delim(args[3], header = F)
+  for (row in 1:dim(input.pgge)[1]) {
+    name <- input.pgge$cons.jump[row]
+    split <- strsplit(name, "\\.")
+    end <- length(split[[1]]) - 2
+    sub_split <- split[[1]][3:end]
+    final_name <- paste(sub_split, collapse = ".")
+    input.pgge$cons.jump[row] <- final_name
+  }
+  graph_display_names <- graph_names$V2
+  names(graph_display_names) <- graph_names$V1
+  for (row in 1:dim(input.pgge)[1]) {
+    input.pgge$cons.jump[row] <- graph_display_names[input.pgge$cons.jump[row]]
+  }
+}
 
 aln.id <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = aln.id, label = sample.name)) +
   geom_violin() +
   geom_point() +
   geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
-  xlab("consensus jump-max") +
+  xlab("graph name") +
   theme(text = element_text(size = 16)) +
-  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
+  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) +
+  scale_x_discrete(guide = guide_axis(angle = 90))
 
 qsc <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = qsc, label = sample.name)) +
   geom_violin() +
   geom_point() +
   geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
-  xlab("consensus jump-max") +
+  xlab("graph name") +
   theme(text = element_text(size = 16)) +
-  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
+  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) +
+  scale_x_discrete(guide = guide_axis(angle = 90))
 
 uniq <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = uniq, label = sample.name)) +
   geom_violin() +
   geom_point() +
   geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
-  xlab("consensus jump-max") +
+  xlab("graph name") +
   theme(text = element_text(size = 16)) +
-  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
+  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) +
+  scale_x_discrete(guide = guide_axis(angle = 90))
 
 multi <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = multi, label = sample.name)) +
   geom_violin() +
   geom_point() +
   geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
-  xlab("consensus jump-max") +
+  xlab("graph name") +
   theme(text = element_text(size = 16)) +
-  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
+  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) +
+  scale_x_discrete(guide = guide_axis(angle = 90))
 
 nonaln <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = nonaln, label = sample.name)) +
   geom_violin() +
   geom_point() +
   geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
-  xlab("consensus jump-max") +
+  xlab("graph name") +
   theme(text = element_text(size = 16)) +
-  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
+  scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))+
+  scale_x_discrete(guide = guide_axis(angle = 90))
 
-png(args[2], width = 2000, height = 500, pointsize = 25)
+png(output.png, width = 2000, height = 500, pointsize = 25)
 g <- grid.arrange(aln.id, qsc, uniq, multi, nonaln, nrow = 1)
 dev.off()