Skip to content

Commit

Permalink
NEW: pangenome analysis:
Browse files Browse the repository at this point in the history
  • Loading branch information
boopthesnoot committed Sep 21, 2020
1 parent 38b65ac commit 5147257
Show file tree
Hide file tree
Showing 9 changed files with 2,423 additions and 62 deletions.
145 changes: 83 additions & 62 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
import glob
import os


bacteria_name = glob.glob("*.fasta")[0][:-6]
storage_prefix = "/scratch/mlebedev"

rule all:
input: "unaligned.fa",
expand("hists/{sample}.png", sample=bacteria_name),
"viz.done",
f"hists/{bacteria_name}.png",


rule download_db:
output: targz = expand("{storage_prefix}/downloads/bacteria.tar.gz", storage_prefix=storage_prefix)
params:
dir = expand("{storage_prefix}/downloads/", storage_prefix=storage_prefix)
dir = storage_prefix + "/downloads/"
shell: "wget https://zenodo.org/record/3725706/files/bacteria.tar.gz -P {params.dir}"


rule unzip:
input: "{storage_prefix}/downloads/bacteria.tar.gz"
output: "{storage_prefix}/bacteria/"
shell: "mkdir {output} | tar -zxf {input} --directory {output}"
shell: "mkdir -p {output} && tar -zxf {input} --directory {output}"


rule referenceseeker:
input:
genome = "{bacteria}.fasta",
bacteria_db = expand("{storage_prefix}/bacteria/", storage_prefix=storage_prefix)
bacteria_db = storage_prefix + "/bacteria/"
output:
"{bacteria}.refseq"
conda:
Expand All @@ -37,8 +37,8 @@ rule referenceseeker:

rule path_to_relatives:
input:
reference = expand("{bacteria}.refseq", bacteria=bacteria_name),
bacteria_db = expand("{storage_prefix}/bacteria/", storage_prefix=storage_prefix)
reference = bacteria_name + ".refseq",
bacteria_db = storage_prefix + "/bacteria/"
output:
"path_to_refs.txt"
run:
Expand All @@ -49,56 +49,39 @@ rule path_to_relatives:
out.write(f"{input.bacteria_db}{i}.fna\n")


rule cp_relatives:
input: "path_to_refs.txt"
output: "references/"
shell:
'mkdir -p references && '
'cp `cat {input}` references && '
'mv references/* {output}'


IDS = glob_wildcards("references/{id}.fna")

## TODO: messed up the wildcards
rule mashmap:
input:
reference = expand("references/{ref_genomes}.fna", ref_genomes=IDS),
genome = expand("{bacteria}.fasta", bacteria=bacteria_name)
reference = "path_to_refs.txt",
genome = "{bacteria_name}.fasta"
conda:
"envs/mashmap.yml"
threads: 10
output:
expand("mashmap/{bacteria}_{ref_genomes}_out.mashmap", ref_genomes=IDS, bacteria=bacteria_name)
"mashmap/{bacteria_name}_out.mashmap"
shell:
'mkdir -p mashmap &&'
'mashmap -r {input.reference} -q {input.genome} -s 500b -t {threads} --perc_identity 95 -o mashmap/{output}'
'''
mashmap --rl {input.reference} -q {input.genome} -s 500 -t {threads} --perc_identity 95 -o {output}
'''


rule get_nonaligned:
input:
mashmaps = expand("mashmap/{bacteria}_{ref_genomes}_out.mashmap", ref_genomes=IDS, bacteria=bacteria_name)
params:
mashmaps_dir = "mashmap/"
input: f"mashmap/{bacteria_name}_out.mashmap"
output: "instruction_to_cut.bed"
run:
for root, dirs, files in os.walk(f"{params.mashmaps_dir}"):
for file in files:
ranges = []
name = None
with open(file) as m:
for line in m.readlines():
ranges.append([int(i) for i in line.split()[2:4]])
name = line.split()[0]

ranges.sort(key=lambda x: x[0])

for num, i in enumerate(ranges[1::]):
num_to_append = ranges[num][1]
if i[0] - ranges[num][1] > 1:
print(ranges[num][1], i[0])
with open(f"{output}", "a") as out:
out.write("\t".join((name, str(ranges[num][1]), str(i[0]))) + "\n")
ranges = []
name = None
with open(f"{input}") as m:
for line in m.readlines():
ranges.append([int(i) for i in line.split()[2:4]])
name = line.split()[0]

ranges.sort(key=lambda x: x[0])
for num, i in enumerate(ranges[1::]):
num_to_append = ranges[num][1]
if i[0] - ranges[num][1] > 1:
print(ranges[num][1], i[0])
with open(f"{output}", "a") as out:
out.write("\t".join((name, str(ranges[num][1]), str(i[0]))) + "\n")


rule bedtools:
Expand All @@ -108,39 +91,77 @@ rule bedtools:
shell: "bedtools getfasta -fi {input.genome} -bed {input.cut} -fo {output}"




rule prodigal:
input: expand("{bacteria}.fasta", bacteria=bacteria_name)
output: "proteins/{bacteria}.faa"
shell: "prodigal -a {output} -q -i {input}"
input: expand("{bacteria}.fasta", bacteria=bacteria_name)
output: "proteins/{bacteria}.faa"
shell: "prodigal -a {output} -q -i {input}"


rule diamond:
input: "proteins/{bacteria}.faa"
output: "lengths/{bacteria}.data"
threads: 10
params:
db="/scratch/mlebedev/uniprot_trembl.diamond.dmnd",
of="6 qlen slen"
shell: "diamond blastp --threads {threads} --max-target-seqs 1 --db {params.db} --query {input} --outfmt {params.of} --out {output}"
db = "/scratch/mlebedev/uniprot_trembl.diamond.dmnd",
of = "6 qlen slen"
shell:
'''
diamond blastp --threads {threads} --max-target-seqs 1 --db {params.db} \
--query {input} --outfmt {params.of} --out {output}
'''


rule hist:
input: "lengths/{bacteria}.data"
output: "hists/{bacteria}.png"
shell: "Rscript ../ideel/scripts/hist.R {input} {output}"

## TODO Change to make it work with multiple genomes

rule make_prokka_input:
input: refs="path_to_refs.txt"
params: bacteria_name
output: directory("prokka_inp/")
shell:
'''
mkdir -p {output} &&
cp `cat {input.refs}` {output} &&
cp {params}.fasta {output}
mv {output}{params}.fasta {output}{params}.fna
'''


rule prokka:
input: expand("{bacteria}.fasta", bacteria=bacteria_name)
output: "prokka/{bacteria_name}.gff"
threads: 4
shell: "prokka --outdir {output} --prefix {bacteria_name} {input} -cpus {threads}"
input: "prokka_inp/"
output: directory("prokka/")
threads: 10
shell: '''
for file in $(ls {input} | rev | cut -c5- | rev)
do prokka --outdir prokka --prefix $file {input}$file.fna -cpus {threads} --force
done
'''

IDS_roary_in = glob_wildcards("prokka/{id}.gff")

rule roary:
input: expand("prokka/{bacteria}.gff", bacteria=bacteria_name)
output: "roary/"
threads: 4
shell: "roary -p 4 -f {output}"
input: expand("prokka/{id}.gff", id=IDS_roary_in)
output: directory("roary_out/")
params: "prokka/*.gff"
threads: 10
shell: "roary -p {threads} -e -n -v -f {output} {params}"

IDS_roary_out = glob_wildcards("roary_out/{dirname}/core_gene_alignment.aln")[0]

rule fast_tree:
input:
f"roary_out/{IDS_roary_out[0]}/core_gene_alignment.aln"
output:
f"roary_out/{IDS_roary_out[0]}/core_gene_alignment.newick"
shell:
"FastTree -nt -gtr {input} > {output}"

rule roary_grahs:
input:
tree=f"roary_out/{IDS_roary_out[0]}/core_gene_alignment.newick",
csv=f"roary_out/{IDS_roary_out[0]}/gene_presence_absence.csv"
output: "viz.done"
shell: "python roary_plots/roary_plots.py {input.tree} {input.csv} && touch {output}"
20 changes: 20 additions & 0 deletions roary_plots/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#Roary plots
Marco Galardini ([email protected]) has prepared an ipython notebook which can take in a tree (newick) and the gene presence and absence spreadsheet, and generate some nice plots.

The dependancies are:
- python (2 or 3)
- Biopython
- numpy
- pandas
- matplotlib
- seaborn

Usage:
```
python roary_plots.py my_tree.tre gene_presence_absence.csv
```

The images it produces are:
* A pan genome frequency plot
* A presence and absence matrix against a tree
* A piechart of the pan genome, breaking down the core, soft core, shell and cloud.
59 changes: 59 additions & 0 deletions roary_plots/create_pan_genome_plots.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env Rscript
# ABSTRACT: Create R plots
# PODNAME: create_plots.R
# Take the output files from the pan genome pipeline and create nice plots.
library(ggplot2)


mydata = read.table("number_of_new_genes.Rtab")
boxplot(mydata, data=mydata, main="Number of new genes",
xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)

mydata = read.table("number_of_conserved_genes.Rtab")
boxplot(mydata, data=mydata, main="Number of conserved genes",
xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)

mydata = read.table("number_of_genes_in_pan_genome.Rtab")
boxplot(mydata, data=mydata, main="No. of genes in the pan-genome",
xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)

mydata = read.table("number_of_unique_genes.Rtab")
boxplot(mydata, data=mydata, main="Number of unique genes",
xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)

mydata = read.table("blast_identity_frequency.Rtab")
plot(mydata,main="Number of blastp hits with different percentage identity", xlab="Blast percentage identity", ylab="No. blast results")


library(ggplot2)
conserved = colMeans(read.table("number_of_conserved_genes.Rtab"))
total = colMeans(read.table("number_of_genes_in_pan_genome.Rtab"))

genes = data.frame( genes_to_genomes = c(conserved,total),
genomes = c(c(1:length(conserved)),c(1:length(conserved))),
Key = c(rep("Conserved genes",length(conserved)), rep("Total genes",length(total))) )

ggplot(data = genes, aes(x = genomes, y = genes_to_genomes, group = Key, linetype=Key)) +geom_line()+
theme_classic() +
ylim(c(1,max(total)))+
xlim(c(1,length(total)))+
xlab("No. of genomes") +
ylab("No. of genes")+ theme_bw(base_size = 16) + theme(legend.justification=c(0,1),legend.position=c(0,1))+
ggsave(filename="conserved_vs_total_genes.png", scale=1)

######################

unique_genes = colMeans(read.table("number_of_unique_genes.Rtab"))
new_genes = colMeans(read.table("number_of_new_genes.Rtab"))

genes = data.frame( genes_to_genomes = c(unique_genes,new_genes),
genomes = c(c(1:length(unique_genes)),c(1:length(unique_genes))),
Key = c(rep("Unique genes",length(unique_genes)), rep("New genes",length(new_genes))) )

ggplot(data = genes, aes(x = genomes, y = genes_to_genomes, group = Key, linetype=Key)) +geom_line()+
theme_classic() +
ylim(c(1,max(unique_genes)))+
xlim(c(1,length(unique_genes)))+
xlab("No. of genomes") +
ylab("No. of genes")+ theme_bw(base_size = 16) + theme(legend.justification=c(1,1),legend.position=c(1,1))+
ggsave(filename="unique_vs_new_genes.png", scale=1)
600 changes: 600 additions & 0 deletions roary_plots/roary.html

Large diffs are not rendered by default.

19 changes: 19 additions & 0 deletions roary_plots/roary_files/MathJax.js

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions roary_plots/roary_files/jquery.min.js

Large diffs are not rendered by default.

Loading

0 comments on commit 5147257

Please sign in to comment.