Skip to content

Commit

Permalink
Merge pull request #7 from pangenome/user_friendliness
Browse files Browse the repository at this point in the history
user can decide display names in PNG
  • Loading branch information
subwaystation authored Feb 23, 2021
2 parents ae14af4 + 2778580 commit 07b82c9
Show file tree
Hide file tree
Showing 3 changed files with 119 additions and 29 deletions.
22 changes: 18 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,24 @@ in your given FASTA file, the results will only contain one line of metrics. In
you have contig sequences in your FASTA and want to summarize by sample name. _`pgge`_ always splits by `.` and takes the
first entry in the resulting split as sample name.

:warning: In addition, _`pgge`_ assumes that there is *@NUMBER* in each name of the input GAFs. This *NUMBER* is extracted
later and appears in the table and output plot as *cons.jump*, because _`pgge`_ was designed for processing the results
of _`pggb`_. If you are evaluating your own data not originating from _`pggb`_ it is recommended to add a different *@NUMBER*
for each of your input GAFs in order to better understand the resulting output.
:warning: _`pgge`_ was designed for processing the results
of _`pggb`_. If you are evaluating your own data not originating from _`pggb`_ it is recommended to set the `-n/--input-graph-names` parameter to ensure the final PNG is labeled correctly. This parameters requires a TSV with 2 rows:

1. The name of the original input graph.
2. The name to display in the PNG.

In the following an example for the yeast data set:

```
cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10000::y:0:1000000.gfa 10k::y:0:1000k
cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@1000::y:0:1000000.gfa 1k::y:0:1000k
cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@100::y:0:1000000.gfa 0.1k::y:0:1000k
cerevisiae.pan.fa.pggb-W-s50000-l150000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10::y:0:1000000.gfa 0.01k::y:0:1000k
cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10000:y.gfa 10k:y
cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@1000:y.gfa 1k:y
cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@100:y.gfa 0.1k:y
cerevisiae.pan.fa.pggb-W-s50000-p90-n5-a0-K16-k8.seqwish-w30000-j5000-e5000-I0.7.smooth.consensus@10:y.gfa 0.01k:y
```

### output

Expand Down
74 changes: 60 additions & 14 deletions pgge
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ set -eo pipefail
input_gfa=false
input_fasta=false
input_gaf=false
input_graph_names=false
output_dir="."
splitfa=false
seq_length=false
Expand All @@ -14,16 +15,19 @@ beehave_R=false
peanut_bed=false
threads=1

num_gfas=0
num_input_graph_names=0

if [ $# -eq 0 ];
then
show_help=true
fi
fi

## TODO add options for GraphAligner

# read the options
cmd=$0" "$@
TEMP=`getopt -o g:f:i:r:o:l:s:t:hb --long input-gfa:,input-fasta:,input-gaf:,beehave-r:,output-dir:,seq-length:,step:,threads:,help,peanut-bed -n 'pgge' -- "$@"`
TEMP=`getopt -o g:f:i:n:r:o:l:s:t:hb --long input-gfa:,input-fasta:,input-gaf:,input-graph-names:,beehave-r:,output-dir:,seq-length:,step:,threads:,help,peanut-bed -n 'pgge' -- "$@"`
eval set -- "$TEMP"

# extract options and their arguments into variables.
Expand All @@ -32,6 +36,7 @@ while true ; do
-g|--input-gfa) input_gfa=$2 ; shift 2 ;;
-f|--input-fasta) input_fasta=$2 ; shift 2 ;;
-i|--input-gaf) input_gaf=$2 ; shift 2 ;;
-n|--input-graph-names) input_graph_names=$2 ; shift 2 ;;
-r|--beehave-r) beehave_R=$2 ; shift 2 ;;
-o|--output-dir) output_dir=$2 ; shift 2 ;;
-l|--seq-length) seq_length=$2 ; shift 2 ;;
Expand Down Expand Up @@ -118,7 +123,8 @@ then
echo " -f, --input-fasta FILE input FASTA file (uncompressed or gzipped)"
echo " -o, --output-dir FILE output directory"
echo " -r, --beehave-r PATH path to beehave.R"
echo " -b, --peanut-bed PATH output BED file"
echo " -b, --peanut-bed PATH output BED file"
echo " -n, --input-graph-names TSV input graph name file: first row is the name of the original input file, second row is the display name in the PNG"
echo " [splitfa]"
echo " -l, --seq-length N length of the splits"
echo " -s, --step N step size between splits"
Expand Down Expand Up @@ -159,6 +165,7 @@ evaluation:
output-dir: $output_dir
beehave-r: $beehave_R
peanut-bed: $peanut_bed
input-graph-names: $input_graph_names
splitfa:
seq-length: $seq_length
step: $step
Expand All @@ -169,6 +176,18 @@ general:
EOT
echo -e "\nRunning pgge\n" >> "$log_file"

# does the given input-graph-names exist?
if [[ $input_graph_names != false ]];
then
if [[
! -f $input_graph_names
]]
then
>&2 echo "'$input_graph_names' graph names file does not exist! Please correct and re-run pgge."
exit
fi
fi

## START gfa_input
if [[ $input_gaf == false ]];
then
Expand All @@ -180,11 +199,26 @@ do
! -f $gfa
]];
then
>&2 echo "$gfa GFA does not exist! Please correct and re-run pgge."
>&2 echo "'$gfa' GFA does not exist! Please correct and re-run pgge."
exit
else
((num_gfas=num_gfas+1))
fi
done

# TODO find out if #lines == #GFAs
# echo $num_gfas
#echo $(wc -l $input_graph_names | cut -f 1 -d " ")
if [[ $input_graph_names != false ]];
then
num_input_graph_names=$(wc -l $input_graph_names | cut -f 1 -d " ")
if [[ $num_gfas != $num_input_graph_names ]];
then
>&2 echo "Number of given graph names in '$input_graph_names' does not match given number of GFAs. Please correct and re-run pgge."
exit
fi
fi

# Manage uncompressed or gzip compressed FASTA files
if (file "$input_fasta" | grep -q 'gzip compressed data' ) ; then
input_fasta_uncompressed="$(echo "$input_fasta" | sed -e 's/\.[^./]*$//')"
Expand Down Expand Up @@ -272,7 +306,7 @@ if [[
! -f $input_gaf
]];
then
>&2 echo "$input_gaf GAF does not exist! Please correct and re-run pgge."
>&2 echo "'$input_gaf' GAF does not exist! Please correct and re-run pgge."
exit
fi

Expand All @@ -291,12 +325,24 @@ input_gaf_base="$(basename -- "$input_gaf")"
fi
## END gaf_input

(echo sample.name cons.jump aln.id qsc uniq multi nonaln; ls "$output_dir" | grep gaf | grep pgge$ | while read f; \
do
echo "$(echo "$output_dir"/"$f" | cut -f 2 -d .)" "$(echo "$output_dir"/"$f" | cut -f 2 -d @ | cut -f 1 -d. )" "$(cat "$output_dir"/"$f")"; \
done) | tr ' ' '\t' > "$prefix_pgge".tsv

$timer -f "$fmt" Rscript \
"$beehave_R" "$prefix_pgge".tsv \
"$prefix_pgge".tsv.png \
2> >(tee -a "$log_file")
if [[ $input_graph_names == false ]];
then
(echo sample.name cons.jump aln.id qsc uniq multi nonaln; ls "$output_dir" | grep gaf | grep pgge$ | while read f; \
do
echo "$(echo "$output_dir"/"$f" | cut -f 2 -d .)" "$(echo "$output_dir"/"$f" | cut -f 2 -d @ | cut -f 1 -d.)" "$(cat "$output_dir"/"$f")"; \
done) | tr ' ' '\t' > "$prefix_pgge".tsv
$timer -f "$fmt" Rscript \
"$beehave_R" "$prefix_pgge".tsv \
"$prefix_pgge".tsv.png \
2> >(tee -a "$log_file")
else
(echo sample.name cons.jump aln.id qsc uniq multi nonaln; ls "$output_dir" | grep gaf | grep pgge$ | while read f; \
do
echo "$(echo "$output_dir"/"$f" | cut -f 2 -d .)" "$(basename $(echo "$output_dir"/"$f"))" "$(cat "$output_dir"/"$f")"; \
done) | tr ' ' '\t' > "$prefix_pgge".tsv
$timer -f "$fmt" Rscript \
"$beehave_R" "$prefix_pgge".tsv \
"$prefix_pgge".tsv.png \
"$input_graph_names" \
2> >(tee -a "$log_file")
fi
52 changes: 41 additions & 11 deletions scripts/beehave.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,47 +7,77 @@ require(gridExtra)
args <- commandArgs(trailingOnly = T)

input.pgge <- read.table(args[1], sep = '\t', header = T)
output.png <- args[2]

##
#input.pgge <- read.table("/home/heumos/Downloads/yeast/pgge_graph_names/pgge-l100000-s50000.tsv", sep = '\t', header = T)
##
#output.png <- "/home/heumos/Downloads/yeast/pgge-l100000-s50000.png"
##
#graph_names <- read.delim("/home/heumos/Downloads/yeast/graph_names.tsv", header = F)

if (!is.na(args[3])) {
graph_names <- read.delim(args[3], header = F)
for (row in 1:dim(input.pgge)[1]) {
name <- input.pgge$cons.jump[row]
split <- strsplit(name, "\\.")
end <- length(split[[1]]) - 2
sub_split <- split[[1]][3:end]
final_name <- paste(sub_split, collapse = ".")
input.pgge$cons.jump[row] <- final_name
}
graph_display_names <- graph_names$V2
names(graph_display_names) <- graph_names$V1
for (row in 1:dim(input.pgge)[1]) {
input.pgge$cons.jump[row] <- graph_display_names[input.pgge$cons.jump[row]]
}
}

aln.id <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = aln.id, label = sample.name)) +
geom_violin() +
geom_point() +
geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
xlab("consensus jump-max") +
xlab("graph name") +
theme(text = element_text(size = 16)) +
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) +
scale_x_discrete(guide = guide_axis(angle = 90))

qsc <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = qsc, label = sample.name)) +
geom_violin() +
geom_point() +
geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
xlab("consensus jump-max") +
xlab("graph name") +
theme(text = element_text(size = 16)) +
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) +
scale_x_discrete(guide = guide_axis(angle = 90))

uniq <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = uniq, label = sample.name)) +
geom_violin() +
geom_point() +
geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
xlab("consensus jump-max") +
xlab("graph name") +
theme(text = element_text(size = 16)) +
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) +
scale_x_discrete(guide = guide_axis(angle = 90))

multi <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = multi, label = sample.name)) +
geom_violin() +
geom_point() +
geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
xlab("consensus jump-max") +
xlab("graph name") +
theme(text = element_text(size = 16)) +
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001)) +
scale_x_discrete(guide = guide_axis(angle = 90))

nonaln <- ggplot(input.pgge, aes(x = as.factor(cons.jump), y = nonaln, label = sample.name)) +
geom_violin() +
geom_point() +
geom_text_repel(box.padding = 0.1, max.overlaps = 20) +
xlab("consensus jump-max") +
xlab("graph name") +
theme(text = element_text(size = 16)) +
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))
scale_y_continuous(labels = scales::number_format(accuracy = 0.0001))+
scale_x_discrete(guide = guide_axis(angle = 90))

png(args[2], width = 2000, height = 500, pointsize = 25)
png(output.png, width = 2000, height = 500, pointsize = 25)
g <- grid.arrange(aln.id, qsc, uniq, multi, nonaln, nrow = 1)
dev.off()

0 comments on commit 07b82c9

Please sign in to comment.