NEW: pangenome analysis:

boopthesnoot · Sep 21, 2020 · 5147257 · 5147257
1 parent 38b65ac
commit 5147257
Show file tree

Hide file tree

Showing 9 changed files with 2,423 additions and 62 deletions.
diff --git a/Snakefile b/Snakefile
@@ -1,32 +1,32 @@
 import glob
-import os
 
 
 bacteria_name = glob.glob("*.fasta")[0][:-6]
 storage_prefix = "/scratch/mlebedev"
 
 rule all:
     input: "unaligned.fa",
-            expand("hists/{sample}.png", sample=bacteria_name),
+           "viz.done",
+           f"hists/{bacteria_name}.png",
 
 
 rule download_db:
     output: targz = expand("{storage_prefix}/downloads/bacteria.tar.gz", storage_prefix=storage_prefix)
     params:
-        dir = expand("{storage_prefix}/downloads/", storage_prefix=storage_prefix)
+        dir = storage_prefix + "/downloads/"
     shell: "wget https://zenodo.org/record/3725706/files/bacteria.tar.gz -P {params.dir}"
 
 
 rule unzip:
     input: "{storage_prefix}/downloads/bacteria.tar.gz"
     output: "{storage_prefix}/bacteria/"
-    shell: "mkdir {output} | tar -zxf {input} --directory {output}"
+    shell: "mkdir -p {output} && tar -zxf {input} --directory {output}"
 
 
 rule referenceseeker:
     input:
          genome = "{bacteria}.fasta",
-         bacteria_db = expand("{storage_prefix}/bacteria/", storage_prefix=storage_prefix)
+         bacteria_db = storage_prefix + "/bacteria/"
     output:
         "{bacteria}.refseq"
     conda:
@@ -37,8 +37,8 @@ rule referenceseeker:
 
 rule path_to_relatives:
     input:
-         reference = expand("{bacteria}.refseq", bacteria=bacteria_name),
-         bacteria_db = expand("{storage_prefix}/bacteria/", storage_prefix=storage_prefix)
+         reference = bacteria_name + ".refseq",
+         bacteria_db = storage_prefix + "/bacteria/"
     output:
          "path_to_refs.txt"
     run:
@@ -49,56 +49,39 @@ rule path_to_relatives:
                 out.write(f"{input.bacteria_db}{i}.fna\n")
 
 
-rule cp_relatives:
-    input: "path_to_refs.txt"
-    output: "references/"
-    shell:
-        'mkdir -p references && '
-        'cp `cat {input}` references && '
-        'mv references/* {output}'
-
-
-IDS = glob_wildcards("references/{id}.fna")
-
-## TODO: messed up the wildcards
 rule mashmap:
     input:
-        reference = expand("references/{ref_genomes}.fna", ref_genomes=IDS),
-        genome = expand("{bacteria}.fasta", bacteria=bacteria_name)
+        reference = "path_to_refs.txt",
+        genome = "{bacteria_name}.fasta"
     conda:
         "envs/mashmap.yml"
     threads: 10
     output:
-        expand("mashmap/{bacteria}_{ref_genomes}_out.mashmap", ref_genomes=IDS, bacteria=bacteria_name)
+        "mashmap/{bacteria_name}_out.mashmap"
     shell:
-        'mkdir -p mashmap &&'
-        'mashmap -r {input.reference} -q {input.genome} -s 500b -t {threads} --perc_identity 95 -o mashmap/{output}'
+        '''
+        mashmap --rl {input.reference} -q {input.genome} -s 500 -t {threads} --perc_identity 95 -o {output}
+        '''
 
 
 rule get_nonaligned:
-    input:
-        mashmaps = expand("mashmap/{bacteria}_{ref_genomes}_out.mashmap", ref_genomes=IDS, bacteria=bacteria_name)
-    params:
-        mashmaps_dir = "mashmap/"
+    input: f"mashmap/{bacteria_name}_out.mashmap"
     output: "instruction_to_cut.bed"
     run:
-        for root, dirs, files in os.walk(f"{params.mashmaps_dir}"):
-            for file in files:
-                ranges = []
-                name = None
-                with open(file) as m:
-                    for line in m.readlines():
-                        ranges.append([int(i) for i in line.split()[2:4]])
-                        name = line.split()[0]
-
-                ranges.sort(key=lambda x: x[0])
-
-                for num, i in enumerate(ranges[1::]):
-                    num_to_append = ranges[num][1]
-                    if i[0] - ranges[num][1] > 1:
-                        print(ranges[num][1], i[0])
-                        with open(f"{output}", "a") as out:
-                            out.write("\t".join((name, str(ranges[num][1]), str(i[0]))) +  "\n")
+        ranges = []
+        name = None
+        with open(f"{input}") as m:
+            for line in m.readlines():
+                ranges.append([int(i) for i in line.split()[2:4]])
+                name = line.split()[0]
+
+        ranges.sort(key=lambda x: x[0])
+        for num, i in enumerate(ranges[1::]):
+            num_to_append = ranges[num][1]
+            if i[0] - ranges[num][1] > 1:
+                print(ranges[num][1], i[0])
+                with open(f"{output}", "a") as out:
+                    out.write("\t".join((name, str(ranges[num][1]), str(i[0]))) +  "\n")
 
 
 rule bedtools:
@@ -108,39 +91,77 @@ rule bedtools:
     shell: "bedtools getfasta -fi {input.genome} -bed {input.cut} -fo {output}"
 
 
-
-
 rule prodigal:
-	input: expand("{bacteria}.fasta", bacteria=bacteria_name)
-	output: "proteins/{bacteria}.faa"
-	shell: "prodigal -a {output} -q -i {input}"
+    input: expand("{bacteria}.fasta", bacteria=bacteria_name)
+    output: "proteins/{bacteria}.faa"
+    shell: "prodigal -a {output} -q -i {input}"
 
 
 rule diamond:
 	input: "proteins/{bacteria}.faa"
 	output: "lengths/{bacteria}.data"
 	threads: 10
 	params:
-		db="/scratch/mlebedev/uniprot_trembl.diamond.dmnd",
-		of="6 qlen slen"
-	shell: "diamond blastp --threads {threads} --max-target-seqs 1 --db {params.db} --query {input} --outfmt {params.of} --out {output}"
+		db = "/scratch/mlebedev/uniprot_trembl.diamond.dmnd",
+		of = "6 qlen slen"
+	shell:
+            '''
+            diamond blastp --threads {threads} --max-target-seqs 1 --db {params.db} \
+            --query {input} --outfmt {params.of} --out {output}
+            '''
 
 
 rule hist:
 	input: "lengths/{bacteria}.data"
 	output: "hists/{bacteria}.png"
 	shell: "Rscript ../ideel/scripts/hist.R {input} {output}"
 
-## TODO Change to make it work with multiple genomes
+
+rule make_prokka_input:
+    input: refs="path_to_refs.txt"
+    params: bacteria_name
+    output: directory("prokka_inp/")
+    shell:
+        '''
+        mkdir -p {output} && 
+        cp `cat {input.refs}` {output} && 
+        cp {params}.fasta {output}
+        mv {output}{params}.fasta {output}{params}.fna
+        '''
+
+
 rule prokka:
-    input: expand("{bacteria}.fasta", bacteria=bacteria_name)
-    output: "prokka/{bacteria_name}.gff"
-    threads: 4
-    shell: "prokka --outdir {output} --prefix {bacteria_name} {input} -cpus {threads}"
+    input: "prokka_inp/"
+    output: directory("prokka/")
+    threads: 10
+    shell: '''
+            for file in $(ls {input} | rev | cut -c5- | rev)
+            do prokka --outdir prokka --prefix $file {input}$file.fna -cpus {threads} --force
+            done
+           '''
 
+IDS_roary_in = glob_wildcards("prokka/{id}.gff")
 
 rule roary:
-    input: expand("prokka/{bacteria}.gff", bacteria=bacteria_name)
-    output: "roary/"
-    threads: 4
-    shell: "roary -p 4 -f {output}"
+    input: expand("prokka/{id}.gff", id=IDS_roary_in)
+    output: directory("roary_out/")
+    params: "prokka/*.gff"
+    threads: 10
+    shell: "roary -p {threads} -e -n -v -f {output} {params}"
+
+IDS_roary_out = glob_wildcards("roary_out/{dirname}/core_gene_alignment.aln")[0]
+
+rule fast_tree:
+    input:
+         f"roary_out/{IDS_roary_out[0]}/core_gene_alignment.aln"
+    output:
+         f"roary_out/{IDS_roary_out[0]}/core_gene_alignment.newick"
+    shell:
+         "FastTree -nt -gtr {input} > {output}"
+
+rule roary_grahs:
+    input:
+         tree=f"roary_out/{IDS_roary_out[0]}/core_gene_alignment.newick",
+         csv=f"roary_out/{IDS_roary_out[0]}/gene_presence_absence.csv"
+    output: "viz.done"
+    shell: "python roary_plots/roary_plots.py {input.tree} {input.csv} && touch {output}"
diff --git a/roary_plots/README.md b/roary_plots/README.md
@@ -0,0 +1,20 @@
+#Roary plots
+Marco Galardini ([email protected]) has prepared an ipython notebook which can take in a tree (newick) and the gene presence and absence spreadsheet, and generate some nice plots.
+
+The dependancies are:
+- python (2 or 3)
+- Biopython
+- numpy
+- pandas
+- matplotlib
+- seaborn
+
+Usage:
+```
+python roary_plots.py my_tree.tre gene_presence_absence.csv
+```
+
+The images it produces are:
+* A pan genome frequency plot
+* A presence and absence matrix against a tree
+* A piechart of the pan genome, breaking down the core, soft core, shell and cloud.
diff --git a/roary_plots/create_pan_genome_plots.R b/roary_plots/create_pan_genome_plots.R
@@ -0,0 +1,59 @@
+#!/usr/bin/env Rscript
+# ABSTRACT: Create R plots
+# PODNAME: create_plots.R
+# Take the output files from the pan genome pipeline and create nice plots.
+library(ggplot2)
+
+
+mydata = read.table("number_of_new_genes.Rtab")
+boxplot(mydata, data=mydata, main="Number of new genes",
+         xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)
+
+mydata = read.table("number_of_conserved_genes.Rtab")
+boxplot(mydata, data=mydata, main="Number of conserved genes",
+          xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)
+
+mydata = read.table("number_of_genes_in_pan_genome.Rtab")
+boxplot(mydata, data=mydata, main="No. of genes in the pan-genome",
+          xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)
+
+mydata = read.table("number_of_unique_genes.Rtab")
+boxplot(mydata, data=mydata, main="Number of unique genes",
+         xlab="No. of genomes", ylab="No. of genes",varwidth=TRUE, ylim=c(0,max(mydata)), outline=FALSE)
+
+mydata = read.table("blast_identity_frequency.Rtab")
+plot(mydata,main="Number of blastp hits with different percentage identity",  xlab="Blast percentage identity", ylab="No. blast results")
+
+
+library(ggplot2)
+conserved = colMeans(read.table("number_of_conserved_genes.Rtab"))
+total = colMeans(read.table("number_of_genes_in_pan_genome.Rtab"))
+
+genes = data.frame( genes_to_genomes = c(conserved,total),
+                    genomes = c(c(1:length(conserved)),c(1:length(conserved))),
+                    Key = c(rep("Conserved genes",length(conserved)), rep("Total genes",length(total))) )
+
+ggplot(data = genes, aes(x = genomes, y = genes_to_genomes, group = Key, linetype=Key)) +geom_line()+
+theme_classic() +
+ylim(c(1,max(total)))+
+xlim(c(1,length(total)))+
+xlab("No. of genomes") +
+ylab("No. of genes")+ theme_bw(base_size = 16) +  theme(legend.justification=c(0,1),legend.position=c(0,1))+
+ggsave(filename="conserved_vs_total_genes.png", scale=1)
+
+######################
+
+unique_genes = colMeans(read.table("number_of_unique_genes.Rtab"))
+new_genes = colMeans(read.table("number_of_new_genes.Rtab"))
+
+genes = data.frame( genes_to_genomes = c(unique_genes,new_genes),
+                    genomes = c(c(1:length(unique_genes)),c(1:length(unique_genes))),
+                    Key = c(rep("Unique genes",length(unique_genes)), rep("New genes",length(new_genes))) )
+
+ggplot(data = genes, aes(x = genomes, y = genes_to_genomes, group = Key, linetype=Key)) +geom_line()+
+theme_classic() +
+ylim(c(1,max(unique_genes)))+
+xlim(c(1,length(unique_genes)))+
+xlab("No. of genomes") +
+ylab("No. of genes")+ theme_bw(base_size = 16) +  theme(legend.justification=c(1,1),legend.position=c(1,1))+
+ggsave(filename="unique_vs_new_genes.png", scale=1)
diff --git a/roary_plots/roary.html b/roary_plots/roary.html
diff --git a/roary_plots/roary_files/MathJax.js b/roary_plots/roary_files/MathJax.js
diff --git a/roary_plots/roary_files/jquery.min.js b/roary_plots/roary_files/jquery.min.js