commands for assembly of V. inaequalis isolate 172 genome from PacBio platform
==========
Scripts used for the assembly of venturia isolate 172 Note - all this work was performed in the directory: /home/groups/harrisonlab/project_files/venturia
###Building of directory structure
RawDatDir=/home/harrir/projects/pacbio_test/v_inaeq/ENQ-933_Richard_Harrison_EMR.RH_PPBFX-340_S6_05-172
mkdir -p raw_dna/pacbio/v.inaequalis/172
cp -r $RawDatDir/A03_1 raw_dna/pacbio/v.inaequalis/172
cp -r $RawDatDir/B03_1 raw_dna/pacbio/v.inaequalis/172
cp -r $RawDatDir/E06_1 raw_dna/pacbio/v.inaequalis/172
OutDir=raw_dna/pacbio/v.inaequalis/172/extracted
mkdir -p $OutDir
cat raw_dna/pacbio/v.inaequalis/172/*/Analysis_Results/*.subreads.fastq > $OutDir/concatenated_pacbio.fastq
Genome size based on MiSeq assembled genome of the same isolate
Reads=$(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq)
GenomeSz="70m"
Strain=$(echo $Reads | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Reads | rev | cut -f4 -d '/' | rev)
Prefix="$Strain"_canu
OutDir="assembly/canu/$Organism/$Strain/70m"
ProgDir=~/git_repos/tools/seq_tools/assemblers/canu
qsub $ProgDir/submit_canu.sh $Reads $GenomeSz $Prefix $OutDir
for Assembly in $(ls assembly/canu/v.inaequalis/172/70m/172_canu.contigs.fasta); do
Organism=v.inaequalis
Strain=172
IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
TrimF1_Read=$IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz
TrimR1_Read=$IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz
OutDir=assembly/canu/$Organism/$Strain/polished
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/pilon
qsub $ProgDir/sub_pilon.sh $Assembly $TrimF1_Read $TrimR1_Read $OutDir
done
quast for stats
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/canu/*/*/polished/pilon.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/canu/$Organism/$Strain/polished
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
for PacBioDat in $(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq); do
Organism=$(echo $PacBioDat | rev | cut -f4 -d '/' | rev)
Strain=$(echo $PacBioDat | rev | cut -f3 -d '/' | rev)
IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
TrimF1_Read=$(ls $IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz);
TrimR1_Read=$(ls $IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz);
OutDir=assembly/spades_pacbio/$Organism/"$Strain"
echo $TrimF1_Read
echo $TrimR1_Read
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/spades
qsub $ProgDir/sub_spades_pacbio.sh $PacBioDat $TrimF1_Read $TrimR1_Read $OutDir 10
done
Filter out contigs < 500bp from assembly
for Contigs in $(ls assembly/spades_pacbio/*/*/contigs.fasta); do
AssemblyDir=$(dirname $Contigs)
mkdir $AssemblyDir/filtered_contigs
FilterDir=/home/passet/git_repos/tools/seq_tools/assemblers/abyss
$FilterDir/filter_abyss_contigs.py $Contigs 500 > $AssemblyDir/filtered_contigs/contigs_min_500bp.fasta
done
Quast
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/spades_pacbio/*/*/filtered_contigs/contigs_min_500bp.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/spades_pacbio/$Organism/$Strain/filtered_contigs
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
#Merge pacbio and hybrid assemblies
for PacBioAssembly in $(ls assembly/canu/*/*/polished/pilon.fasta); do
Organism=$(echo $PacBioAssembly | rev | cut -f4 -d '/' | rev)
Strain=$(echo $PacBioAssembly | rev | cut -f3 -d '/' | rev)
HybridAssembly=$(ls assembly/spades_pacbio/$Organism/$Strain/contigs.fasta)
OutDir=assembly/merged_canu_spades/$Organism/$Strain
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/quickmerge
qsub $ProgDir/sub_quickmerge.sh $PacBioAssembly $HybridAssembly $OutDir
done
Merged assembly polished with Pilon
for Assembly in $(ls assembly/merged_canu_spades/*/*/merged.fasta); do
Organism=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Strain=$(echo $Assembly | rev | cut -f2 -d '/' | rev)
IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
TrimF1_Read=$(ls $IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz);
TrimR1_Read=$(ls $IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz);
OutDir=assembly/merged_canu_spades/$Organism/$Strain/polished
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/pilon
qsub $ProgDir/sub_pilon.sh $Assembly $TrimF1_Read $TrimR1_Read $OutDir
done
Contigs renamed in accordance with ncbi recommendations
ProgDir=~/git_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
touch tmp.csv
for Assembly in $(ls assembly/merged_canu_spades/*/*/polished/pilon.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/merged_canu_spades/$Organism/$Strain/filtered_contigs
mkdir -p $OutDir
$ProgDir/remove_contaminants.py --inp $Assembly --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file tmp.csv
done
rm tmp.csv
Quast of polished merged assembly
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/merged_canu_spades/*/*/filtered_contigs/contigs_min_500bp_renamed.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/merged_canu_spades/$Organism/$Strain/filtered_contigs
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
###Re-assembly of Pacbio isolate 172 with extra reads #Building of directory structure
RawDatDir=/home/harrir/projects/pacbio_test/v_inaeq/
cp -r $RawDatDir/ENQ-933_Richard_Harrison_EMR.RH_PPBFX-340_S6_05-172_repeat.tar.gz raw_dna/pacbio/v.inaequalis/172
tar -C /raw_dna/pacbio/v.inaequalis/172 -zxvf raw_dna/pacbio/v.inaequalis/172/ENQ-933_Richard_Harrison_EMR.RH_PPBFX-340_S6_05-172_repeat.tar.gz
mv raw_dna/pacbio/v.inaequalis/172/ENQ-933_Richard_Harrison_EMR.RH_PPBFX-340_S6_05-172_repeat/A03_1/ raw_dna/pacbio/v.inaequalis/172/repeat
OutDir=raw_dna/pacbio/v.inaequalis/172/extracted
cat raw_dna/pacbio/v.inaequalis/172/*/Analysis_Results/*.subreads.fastq > $OutDir/concatenated_pacbio.fastq
Genome size based on MiSeq assembled genome of the same isolate
Reads=$(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq)
GenomeSz="70m"
Strain=$(echo $Reads | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Reads | rev | cut -f4 -d '/' | rev)
Prefix="$Strain"_canu
OutDir="assembly/canu/$Organism/$Strain/repeat"
ProgDir=~/git_repos/tools/seq_tools/assemblers/canu
qsub $ProgDir/submit_canu.sh $Reads $GenomeSz $Prefix $OutDir
###Another Re-assembly of Pacbio isolate 172 with further reads #Building of directory structure
3rd data set
RawDatDir=/home/harrir/projects/pacbio_test/v_inaeq
cp -r $RawDatDir/Richard_Harrison_NEMR.RH.ENQ-933.C.02_S6_SAM22709_PRO1095_S6_HMW_05-172.tar.gz raw_dna/pacbio/v.inaequalis/172
cd raw_dna/pacbio/v.inaequalis/172
tar -zxvf raw_dna/pacbio/v.inaequalis/172/Richard_Harrison_NEMR.RH.ENQ-933.C.02_S6_SAM22709_PRO1095_S6_HMW_05-172.tar.gz
cd ../../../..
mv raw_dna/pacbio/v.inaequalis/172/tgac/pnp/raw/pacbio-1/n56105/2016_07_20_PSEQ1128_342/G03_1/ raw_dna/pacbio/v.inaequalis/172/
mv raw_dna/pacbio/v.inaequalis/172/tgac/pnp/raw/pacbio-1/n56105/2016_07_20_PSEQ1128_342/H03_1/ raw_dna/pacbio/v.inaequalis/172/
4th data set
RawDatDir=/home/harrir/projects/pacbio_test/v_inaeq
cp -r $RawDatDir/S6_Richard_Harrison_extra_data_NEMR.RH.ENQ-933.C.02_Raw_reads.tar.gz raw_dna/pacbio/v.inaequalis/172
cd raw_dna/pacbio/v.inaequalis/172
tar -zxvf raw_dna/pacbio/v.inaequalis/172/S6_Richard_Harrison_extra_data_NEMR.RH.ENQ-933.C.02_Raw_reads.tar.gz
cd ../../../..
mv raw_dna/pacbio/v.inaequalis/172/Raw_reads/A01_1 raw_dna/pacbio/v.inaequalis/172/
mv raw_dna/pacbio/v.inaequalis/172/Raw_reads/B01_1 raw_dna/pacbio/v.inaequalis/172/
mv raw_dna/pacbio/v.inaequalis/172/Raw_reads/C01_1 raw_dna/pacbio/v.inaequalis/172/
Concatenated Pacbio data from all 4 data sets (9 pacbio cells in total)
OutDir=raw_dna/pacbio/v.inaequalis/172/extracted
cat raw_dna/pacbio/v.inaequalis/172/*/Analysis_Results/*.subreads.fastq > $OutDir/concatenated_pacbio.fastq
#Canu Assembly
Genome size based on MiSeq assembled genome of the same isolate
Reads=$(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq)
GenomeSz="70m"
Strain=$(echo $Reads | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Reads | rev | cut -f4 -d '/' | rev)
Prefix="$Strain"_canu
OutDir="assembly/canu/$Organism/$Strain/Repeat_2"
ProgDir=~/git_repos/tools/seq_tools/assemblers/canu
qsub $ProgDir/submit_canu.sh $Reads $GenomeSz $Prefix $OutDir
quast of canu
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/canu/v.inaequalis/172/Repeat_2/172_canu.contigs.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/canu/$Organism/$Strain/Repeat_2
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
for Assembly in $(ls assembly/canu/v.inaequalis/172/Repeat_2/172_canu.contigs.fasta); do
Organism=v.inaequalis
Strain=172
IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
TrimF1_Read=$IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz
TrimR1_Read=$IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz
OutDir=assembly/canu/$Organism/$Strain/polished_repeat
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/pilon
qsub $ProgDir/sub_pilon.sh $Assembly $TrimF1_Read $TrimR1_Read $OutDir
done
quast for stats
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/canu/*/*/polished_repeat/pilon.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/canu/$Organism/$Strain/polished_repeat
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
bwa alignment of canu contigs to pacbio
Assembly=assembly/canu/v.inaequalis/172/polished_repeat/pilon.fasta
Reads=raw_dna/pacbio/v.inaequalis/172/extracted/concatenated_pacbio.fastq
OutDir=analysis/genome_alignment/bwa/v.inaequalis/172/vs_172
ProgDir=/home/passet/git_repos/tools/seq_tools/genome_alignment/bwa
qsub $ProgDir/sub_bwa_pacbio.sh $Assembly $Reads $OutDir
for PacBioDat in $(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq); do
Organism=$(echo $PacBioDat | rev | cut -f4 -d '/' | rev)
Strain=$(echo $PacBioDat | rev | cut -f3 -d '/' | rev)
IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
TrimF1_Read=$(ls $IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz);
TrimR1_Read=$(ls $IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz);
OutDir=assembly/spades_pacbio/$Organism/"$Strain"/
echo $TrimF1_Read
echo $TrimR1_Read
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/spades
qsub $ProgDir/sub_spades_pacbio.sh $PacBioDat $TrimF1_Read $TrimR1_Read $OutDir 10
done
Filter out contigs < 500bp from assembly
for Contigs in $(ls assembly/spades_pacbio/*/*/contigs.fasta); do
AssemblyDir=$(dirname $Contigs)
FilterDir=/home/passet/git_repos/tools/seq_tools/assemblers/abyss
$FilterDir/filter_abyss_contigs.py $Contigs 500 > $AssemblyDir/filtered_contigs/contigs_min_500bp.fasta
done
Quast
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/spades_pacbio/*/*/filtered_contigs/contigs_min_500bp.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/spades_pacbio/$Organism/$Strain/filtered_contigs_repeat
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
bwa alignment of spades contigs to pacbio
Assembly=assembly/spades_pacbio/v.inaequalis/172/filtered_contigs/contigs_min_500bp.fasta
Reads=raw_dna/pacbio/v.inaequalis/172/extracted/concatenated_pacbio.fastq
OutDir=analysis/genome_alignment/bwa/v.inaequalis/172/vs_172_spades
ProgDir=/home/passet/git_repos/tools/seq_tools/genome_alignment/bwa
qsub $ProgDir/sub_bwa_pacbio.sh $Assembly $Reads $OutDir
#Merge pacbio and hybrid assemblies
for PacBioAssembly in $(ls assembly/canu/*/*/polished_repeat/pilon.fasta); do
Organism=$(echo $PacBioAssembly | rev | cut -f4 -d '/' | rev)
Strain=$(echo $PacBioAssembly | rev | cut -f3 -d '/' | rev)
HybridAssembly=$(ls assembly/spades_pacbio/$Organism/$Strain/contigs.fasta)
AnchorLength=500000
OutDir=assembly/merged_canu_spades/$Organism/$Strain/
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/quickmerge
qsub $ProgDir/sub_quickmerge.sh $PacBioAssembly $HybridAssembly $OutDir $AnchorLength
done
Merged assembly polished with Pilon
for Assembly in $(ls assembly/merged_canu_spades/*/*/merged.fasta); do
Organism=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Strain=$(echo $Assembly | rev | cut -f2 -d '/' | rev)
IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
TrimF1_Read=$(ls $IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz);
TrimR1_Read=$(ls $IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz);
OutDir=assembly/merged_canu_spades/$Organism/$Strain/polished
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/pilon
qsub $ProgDir/sub_pilon.sh $Assembly $TrimF1_Read $TrimR1_Read $OutDir
done
Contigs renamed in accordance with ncbi recommendations
ProgDir=~/git_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
touch tmp.csv
for Assembly in $(ls assembly/merged_canu_spades/*/*/polished/pilon.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/merged_canu_spades/$Organism/$Strain/filtered_contigs
mkdir -p $OutDir
$ProgDir/remove_contaminants.py --inp $Assembly --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file tmp.csv
done
rm tmp.csv
Quast of polished merged assembly
ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/merged_canu_spades/*/*/filtered_contigs/contigs_min_500bp_renamed.fasta); do
Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)
OutDir=assembly/merged_canu_spades/$Organism/$Strain/filtered_contigs
qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done
bwa alignment of merged contigs to pacbio
Assembly=ls assembly/merged_canu_spades/*/*/filtered_contigs/contigs_min_500bp_renamed.fasta
Reads=raw_dna/pacbio/v.inaequalis/172/extracted/concatenated_pacbio.fastq
OutDir=analysis/genome_alignment/bwa/v.inaequalis/172/vs_172_merged
ProgDir=/home/passet/git_repos/tools/seq_tools/genome_alignment/bwa
qsub $ProgDir/sub_bwa_pacbio.sh $Assembly $Reads $OutDir
To avoid any potential overwritting of Isolate 172 miseq assembly analysis renamed pacbio assembly folders named "172" to "172_pacbio"
mv raw_dna/pacbio/*/* raw_dna/pacbio/v.inaequalis/172_pacbio
mv assembly/canu/v.inaequalis/172/ assembly/canu/v.inaequalis/172_pacbio
mv assembly/spades_pacbio/v.inaequalis/172/ assembly/spades_pacbio/v.inaequalis/172_pacbio
mv assembly/merged_canu_spades/v.inaequalis/172/ assembly/merged_canu_spades/v.inaequalis/172_pacbio
mv analysis/genome_alignment/bwa/v.inaequalis/172/ analysis/genome_alignment/bwa/v.inaequalis/172_pacbio
Repeat masking was performed and used the following programs: Repeatmasker Repeatmodeler
Repeat masking of canu assembly
ProgDir=/home/passet/git_repos/tools/seq_tools/repeat_masking
for Ass in $(ls assembly/canu/*/*/polished_repeat/pilon.fasta); do
qsub $ProgDir/rep_modeling.sh $Ass
qsub $ProgDir/transposonPSI.sh $Ass
done
Ran repeatmasking of merged assembly too
ProgDir=/home/passet/git_repos/tools/seq_tools/repeat_masking
for Ass in $(ls assembly/merged_canu_spades/*/*/filtered_contigs/contigs_min_500bp_renamed.fasta); do
qsub $ProgDir/rep_modeling.sh $Ass
qsub $ProgDir/transposonPSI.sh $Ass
done
The number of bases masked by transposonPSI and Repeatmasker were summarised using the following commands:
for RepDir in $(ls -d repeat_masked/v.*/172_pacbio/*); do
Strain=$(echo $RepDir | rev | cut -f2 -d '/' | rev)
Organism=$(echo $RepDir | rev | cut -f3 -d '/' | rev)
RepMaskGff=$(ls $RepDir/*_contigs_hardmasked.gff)
TransPSIGff=$(ls $RepDir/*_contigs_unmasked.fa.TPSI.allHits)
printf "$Organism\t$Strain\n"
printf "The number of bases masked by RepeatMasker:\t"
sortBed -i $RepMaskGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
printf "The number of bases masked by TransposonPSI:\t"
sortBed -i $TransPSIGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
printf "The total number of masked bases are:\t"
cat $RepMaskGff $TransPSIGff | sortBed | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
echo
done
v.inaequalis 172_pacbio The number of bases masked by RepeatMasker: 33982597 The number of bases masked by TransposonPSI: 10254 The total number of masked bases are: Differing number of GFF fields encountered at line: 32896.
v.inaequalis 172_pacbio The number of bases masked by RepeatMasker: 34180552 The number of bases masked by TransposonPSI: 10254 The total number of masked bases are: Differing number of GFF fields encountered at line: 34454.
Genome announcement review wanted to know proportion of repeatmasked due to TEs
for RepDir in $(ls -d repeat_masked/v.*/172_pacbio/*); do
Strain=$(echo $RepDir | rev | cut -f2 -d '/' | rev)
Organism=$(echo $RepDir | rev | cut -f3 -d '/' | rev)
RepMaskGff=$(ls $RepDir/172_pacbio_contigs_transposonmasked.gff)
TransPSIGff=$(ls $RepDir/*_contigs_unmasked.fa.TPSI.allHits)
printf "$Organism\t$Strain\n"
printf "The number of bases masked by RepeatMasker:\t"
sortBed -i $RepMaskGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
printf "The number of bases masked by TransposonPSI:\t"
sortBed -i $TransPSIGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
printf "The total number of masked bases are:\t"
cat $RepMaskGff $TransPSIGff | sortBed | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
echo
done
v.inaequalis 172_pacbio The number of bases masked by RepeatMasker: 33537704 The number of bases masked by TransposonPSI: 10254 The total number of masked bases are: Differing number of GFF fields encountered at line: 21779. Exiting... 0
v.inaequalis 172_pacbio The number of bases masked by RepeatMasker: 33733448 The number of bases masked by TransposonPSI: 10254 The total number of masked bases are: Differing number of GFF fields encountered at line: 23362. Exiting... 0
Gene prediction followed three steps: Pre-gene prediction - Quality of merged PacBio genome assembly assessed using Cegma to see how many core eukaryotic genes can be identified. Gene model training - Gene models were trained using assembled RNAseq data as part of the Braker1 pipeline Gene prediction - Gene models were used to predict genes in genomes as part of the the Braker1 pipeline. This used RNAseq data as hints for gene models.
Quality of genome assembly assessed by looking for the gene space in the assemblies.
ProgDir=/home/passet/git_repos/tools/gene_prediction/cegma
cd /home/groups/harrisonlab/project_files/venturia
for Genome in $(ls repeat_masked/v.*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -w "172_pacbio"); do
echo $Genome;
qsub $ProgDir/sub_cegma.sh $Genome dna;
done
Outputs were summarised using the commands:
for File in $(ls gene_pred/cegma/v*/*/*_dna_cegma.completeness_report | grep "172_pacbio"); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev);
Species=$(echo $File | rev | cut -f3 -d '/' | rev);
printf "$Species\t$Strain\n";
cat $File | head -n18 | tail -n+4;printf "\n";
done > gene_pred/cegma/cegma_results_dna_summary.txt
less gene_pred/cegma/cegma_results_dna_summary.txt
Output:
v.inaequalis 172_pacbio #Prots %Completeness - #Total Average %Ortho
Complete 238 95.97 - 260 1.09 7.98
Group 1 61 92.42 - 66 1.08 8.20 Group 2 54 96.43 - 59 1.09 7.41 Group 3 59 96.72 - 68 1.15 11.86 Group 4 64 98.46 - 67 1.05 4.69
Partial 245 98.79 - 273 1.11 9.80
Group 1 64 96.97 - 70 1.09 7.81 Group 2 55 98.21 - 61 1.11 9.09 Group 3 61 100.00 - 73 1.20 16.39 Group 4 65 100.00 - 69 1.06 6.15
Busco has replaced CEGMA and was run to check gene space in assemblies
for Assembly in $(ls repeat_masked/v.*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -w "172_pacbio"); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
ProgDir=/home/passet/git_repos/tools/gene_prediction/busco
# BuscoDB="Fungal"
BuscoDB=$(ls -d /home/groups/harrisonlab/dbBusco/ascomycota_odb9)
OutDir=gene_pred/busco/$Organism/$Strain/assembly
qsub $ProgDir/sub_busco3.sh $Assembly $BuscoDB $OutDir
done
Output:
BUSCO version is: 3.0.2
# The lineage dataset is: ascomycota_odb9 (Creation date: 2016-02-13, number of
# To reproduce this run: python /home/armita/prog/busco/busco/scripts/run_BUSCO.
#
# Summarized benchmarking in BUSCO notation for file 172_pacbio_contigs_unmasked
# BUSCO was run in mode: genome
C:97.8%[S:97.7%,D:0.1%],F:0.5%,M:1.7%,n:1315
1286 Complete BUSCOs (C)
1285 Complete and single-copy BUSCOs (S)
1 Complete and duplicated BUSCOs (D)
7 Fragmented BUSCOs (F)
22 Missing BUSCOs (M)
1315 Total BUSCO groups searched
for File in $(ls gene_pred/busco/v*/*/assembly/*/short_summary_*.txt); do
echo $File;
cat $File | grep -e '(C)' -e 'Total';
done
Output:
1286 Complete BUSCOs (C) 1315 Total BUSCO groups searched
#Gene prediction
Gene prediction was performed for V. inaequalis PacBio genome.
Gene prediction was performed using Braker1.
First, RNAseq data from Thakur et al was aligned to V. inaequalis PacBio genome.
- qc of RNA seq data is detailed in README.md
Insert sizes of the RNA seq library were unknown until a draft alignment could be made. To do this tophat and cufflinks were run, aligning the reads against the genome. The fragment length and stdev were printed to stdout while cufflinks was running.
for Assembly in $(ls repeat_masked/v.*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -w "172_pacbio"); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
Paired=$(echo $Assembly | rev | cut -d '/' -f5 | rev)
echo "$Organism - $Strain"
for rna_file in $(ls qc_rna/*/*/*/*/*.gz | grep -w 'paired'); do
Timepoint=$(echo $rna_file | rev | cut -f3 -d '/' | rev)
echo "$Timepoint"
OutDir=alignment/$Paired/$Organism/$Strain/$Timepoint
ProgDir=/home/passet/git_repos/tools/seq_tools/RNAseq
qsub $ProgDir/tophat_alignment_unpaired.sh $Assembly $rna_file $OutDir
done
done
Alignments were concatenated prior to running cufflinks: Cufflinks was run to produce the fragment length and stdev statistics:
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_softmasked.fa | grep -w "172_pacbio"); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
mkdir -p alignment/repeat_masked/$Organism/$Strain/concatenated_prelim
AcceptedHits=alignment/repeat_masked/$Organism/$Strain/concatenated_prelim/concatenated.bam
samtools merge -f $AcceptedHits \
alignment/repeat_masked/$Organism/$Strain/V0/accepted_hits.bam \
alignment/repeat_masked/$Organism/$Strain/V2/accepted_hits.bam \
alignment/repeat_masked/$Organism/$Strain/V5/accepted_hits.bam
OutDir=gene_pred/cufflinks/$Organism/$Strain/concatenated_prelim
mkdir -p $OutDir
cufflinks -o $OutDir/cufflinks -p 8 --max-intron-length 4000 $AcceptedHits 2>&1 | tee $OutDir/cufflinks/cufflinks.log
done
Output from stdout included:
You are using Cufflinks v2.2.1, which is the most recent release.
[08:31:38] Inspecting reads and determining fragment length distribution.
> Processed 47186 loci. [*************************] 100%
> Map Properties:
> Normalized Map Mass: 10994414.77
> Raw Map Mass: 10994414.77
> Fragment Length Distribution: Truncated Gaussian (default)
> Default Mean: 200
> Default Std Dev: 80
[08:32:48] Assembling transcripts and estimating abundances.
> Processed 47219 loci. [*************************] 100%
The Estimated Mean: 200 allowed calculation of of the mean insert gap to be -88 bp 200-(144*2) where 144 was the mean read length. This was provided to tophat on a second run (as the -r option) along with the fragment length stdev to increase the accuracy of mapping.
Then RNaseq data was aligned to the PacBio genome assembly:
InsertGap='-88'
InsertStdDev='80'
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -w "172_pacbio"); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
Paired=$(echo $Assembly | rev | cut -d '/' -f5 | rev)
echo "$Organism - $Strain"
for rna_file in $(ls qc_rna/*/*/*/*/*.gz | grep -w 'paired'); do
Timepoint=$(echo $rna_file | rev | cut -f3 -d '/' | rev)
echo "$Timepoint"
OutDir=alignment/$Paired/$Organism/$Strain/"$Timepoint"_paired
ProgDir=/home/passet/git_repos/tools/seq_tools/RNAseq
qsub $ProgDir/tophat_alignment_interlevered.sh $Assembly $rna_file $OutDir $InsertGap $InsertStdDev
done
for rna_file in $(ls qc_rna/*/*/*/*/*.gz | grep -w 'unpaired'); do
Timepoint=$(echo $rna_file | rev | cut -f3 -d '/' | rev)
echo "$Timepoint"
OutDir=alignment/$Paired/$Organism/$Strain/"$Timepoint"_unpaired
ProgDir=/home/passet/git_repos/tools/seq_tools/RNAseq
qsub $ProgDir/tophat_alignment_unpaired.sh $Assembly $rna_file $OutDir
done
done
Gene prediction using Braker1 Prediction of all putative ORFs in the genome using the ORF finder (atg.pl) approach.
Before braker predictiction was performed, I double checked that I had the genemark key in my user area and copied it over from the genemark install directory:
ls ~/.gm_key
cp /home/armita/prog/genemark/gm_key_64 ~/.gm_key
Ran braker prediction on PacBio genome
for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_softmasked.fa | grep "172_pacbio"); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
mkdir -p alignment/repeat_masked/$Organism/$Strain/concatenated
samtools merge -f alignment/repeat_masked/$Organism/$Strain/concatenated/concatenated.bam \
alignment/repeat_masked/$Organism/$Strain/V0_unpaired/accepted_hits.bam \
alignment/repeat_masked/$Organism/$Strain/V2_unpaired/accepted_hits.bam \
alignment/repeat_masked/$Organism/$Strain/V5_unpaired/accepted_hits.bam \
alignment/repeat_masked/$Organism/$Strain/V0_paired/accepted_hits.bam \
alignment/repeat_masked/$Organism/$Strain/V2_paired/accepted_hits.bam \
alignment/repeat_masked/$Organism/$Strain/V5_paired/accepted_hits.bam
OutDir=gene_pred/braker/$Organism/"$Strain"_braker_new
AcceptedHits=alignment/repeat_masked/$Organism/$Strain/concatenated/concatenated.bam
GeneModelName="$Organism"_"$Strain"_braker_new
rm -r /home/armita/prog/augustus-3.1/config/species/"$Organism"_"$Strain"_braker_new
ProgDir=/home/passet/git_repos/tools/gene_prediction/braker1
qsub $ProgDir/sub_braker_fungi.sh $Assembly $OutDir $AcceptedHits $GeneModelName
done
Fasta and gff files were extracted from Braker1 output.
for File in $(ls gene_pred/braker/v.*/172_pacbio_braker_new/*/augustus.gff); do
getAnnoFasta.pl $File
OutDir=$(dirname $File)
echo "##gff-version 3" > $OutDir/augustus_extracted.gff
cat $File | grep -v '#' >> $OutDir/augustus_extracted.gff
done
Additional genes were added to Braker gene predictions, using CodingQuary in pathogen mode to predict additional regions.
Fistly, aligned RNAseq data was assembled into transcripts using Cufflinks.
Note - cufflinks doesn't always predict direction of a transcript and therefore features can not be restricted by strand when they are intersected.
for Assembly in $(ls repeat_masked/*/*/filtered_*/*_contigs_unmasked.fa | grep -w "172_pacbio"); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
OutDir=gene_pred/cufflinks/$Organism/$Strain/concatenated
mkdir -p $OutDir
AcceptedHits=alignment/repeat_masked/$Organism/$Strain/concatenated/concatenated.bam
ProgDir=/home/passet/git_repos/tools/seq_tools/RNAseq
qsub $ProgDir/sub_cufflinks.sh $AcceptedHits $OutDir
done
Secondly, genes were predicted using CodingQuary:
for Assembly in $(ls repeat_masked/*/*/filtered_*/*_contigs_softmasked.fa | grep -w "172_pacbio"); do
Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
OutDir=gene_pred/codingquary/$Organism/$Strain
CufflinksGTF=gene_pred/cufflinks/$Organism/$Strain/concatenated/transcripts.gtf
ProgDir=/home/passet/git_repos/tools/gene_prediction/codingquary
qsub $ProgDir/sub_CodingQuary.sh $Assembly $CufflinksGTF $OutDir
done
Then, additional transcripts were added to Braker gene models, when CodingQuary genes were predicted in regions of the genome, not containing Braker gene models:
#The following is edited after NCBI found duplicated genes
for BrakerGff in $(ls gene_pred/braker/v.*/172_pacbio_braker_new/*/augustus.gff3); do
Strain=$(echo $BrakerGff| rev | cut -d '/' -f3 | rev | sed 's/_braker_new//g')
Organism=$(echo $BrakerGff | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
Assembly=$(ls repeat_masked/$Organism/$Strain/filtered_*/"$Strain"_contigs_softmasked.fa)
CodingQuaryGff=gene_pred/codingquary/$Organism/$Strain/out/PredictedPass.gff3
PGNGff=gene_pred/codingquary/$Organism/$Strain/out/PGN_predictedPass.gff3
AddDir=gene_pred/codingquary/$Organism/$Strain/additional
FinalDir=gene_pred/final/$Organism/$Strain/final
AddGenesList=$AddDir/additional_genes.txt
AddGenesGff=$AddDir/additional_genes.gff
FinalGff=$AddDir/combined_genes.gff
mkdir -p $AddDir
mkdir -p $FinalDir
bedtools intersect -v -a $CodingQuaryGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' > $AddGenesList
bedtools intersect -v -a $PGNGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' >> $AddGenesList
ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation
$ProgDir/gene_list_to_gff.pl $AddGenesList $CodingQuaryGff CodingQuarry_v2.0 ID CodingQuary > $AddGenesGff
$ProgDir/gene_list_to_gff.pl $AddGenesList $PGNGff PGNCodingQuarry_v2.0 ID CodingQuary >> $AddGenesGff
ProgDir=/home/passet/git_repos/tools/gene_prediction/codingquary
# -
# This section is edited
$ProgDir/add_CodingQuary_features.pl $AddGenesGff $Assembly > $AddDir/add_genes_CodingQuary_unspliced.gff3
$ProgDir/correct_CodingQuary_splicing.py --inp_gff $AddDir/add_genes_CodingQuary_unspliced.gff3 > $FinalDir/final_genes_CodingQuary.gff3
# -
$ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_CodingQuary.gff3 $FinalDir/final_genes_CodingQuary
cp $BrakerGff $FinalDir/final_genes_Braker.gff3
$ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_Braker.gff3 $FinalDir/final_genes_Braker
cat $FinalDir/final_genes_Braker.pep.fasta $FinalDir/final_genes_CodingQuary.pep.fasta | sed -r 's/\*/X/g' > $FinalDir/final_genes_combined.pep.fasta
cat $FinalDir/final_genes_Braker.cdna.fasta $FinalDir/final_genes_CodingQuary.cdna.fasta > $FinalDir/final_genes_combined.cdna.fasta
cat $FinalDir/final_genes_Braker.gene.fasta $FinalDir/final_genes_CodingQuary.gene.fasta > $FinalDir/final_genes_combined.gene.fasta
cat $FinalDir/final_genes_Braker.upstream3000.fasta $FinalDir/final_genes_CodingQuary.upstream3000.fasta > $FinalDir/final_genes_combined.upstream3000.fasta
GffBraker=$FinalDir/final_genes_Braker.gff3
GffQuary=$FinalDir/final_genes_CodingQuary.gff3
GffAppended=$FinalDir/final_genes_appended.gff3
cat $GffBraker $GffQuary > $GffAppended
done
The final number of genes per isolate was observed using:
for DirPath in $(ls -d gene_pred/codingquary/v.*/*/final | grep -w "172_pacbio"); do
echo $DirPath;
cat $DirPath/final_genes_Braker.pep.fasta | grep '>' | wc -l;
cat $DirPath/final_genes_CodingQuary.pep.fasta | grep '>' | wc -l;
cat $DirPath/final_genes_combined.pep.fasta | grep '>' | wc -l;
echo "";
done
Output:
gene_pred/codingquary/v.inaequalis/172_pacbio/final
12198
1673
13871
After submission to ncbi and identification of duplicate genes, gene models were renamed and duplicate gene features were identified and removed.
for GffAppended in $(ls gene_pred/final/*/*/final/final_genes_appended.gff3); do
Strain=$(echo $GffAppended | rev | cut -d '/' -f3 | rev)
Organism=$(echo $GffAppended | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
FinalDir=gene_pred/final/$Organism/$Strain/final
GffFiltered=$FinalDir/filtered_duplicates.gff
ProgDir=/home/passet/git_repos/tools/gene_prediction/codingquary
$ProgDir/remove_dup_features.py --inp_gff $GffAppended --out_gff $GffFiltered
GffRenamed=$FinalDir/final_genes_appended_renamed.gff3
LogFile=$FinalDir/final_genes_appended_renamed.log
ProgDir=/home/passet/git_repos/tools/gene_prediction/codingquary
$ProgDir/gff_rename_genes.py --inp_gff $GffFiltered --conversion_log $LogFile > $GffRenamed
rm $GffFiltered
Assembly=$(ls repeat_masked/$Organism/$Strain/filtered_*/"$Strain"_contigs_softmasked.fa)
$ProgDir/gff2fasta.pl $Assembly $GffRenamed gene_pred/final/$Organism/$Strain/final/final_genes_appended_renamed
# The proteins fasta file contains * instead of Xs for stop codons, these should
# be changed
sed -i 's/\*/X/g' gene_pred/final/$Organism/$Strain/final/final_genes_appended_renamed.pep.fasta
done
Output:
v.inaequalis - 172_pacbio
Identifiied the following duplicated transcripts:
NOTE - if any of these represent the first transcript of a gene (.t1) then an entire gene may be duplicated
if so the gene feature will need to be stripped out of the gff file seperately.
Genome fasta parsed
The genome was searched in six reading frames for any start codon and following translated identification of a start codon translating sequence until a stop codon was found. This is based upon the atg.pl script used in paper describing the P. infestans genome. Additional functionality was added to this script by also printing ORFs in .gff format.
ProgDir=/home/passet/git_repos/tools/gene_prediction/ORF_finder
for Genome in $(ls repeat_masked/v.*/*/*/*_contigs_unmasked.fa | grep "172_pacbio"); do
qsub $ProgDir/run_ORF_finder.sh $Genome
done
The Gff files from the the ORF finder are not in true Gff3 format. These were corrected using the following commands:
ProgDir=~/git_repos/tools/seq_tools/feature_annotation
for ORF_Gff in $(ls gene_pred/ORF_finder/*/*/*_ORF.gff | grep -v '_F_atg_' | grep -v '_R_atg_'); do
ORF_Gff_mod=$(echo $ORF_Gff | sed 's/_ORF.gff/_ORF_corrected.gff3/g')
echo ""
echo "Correcting the following file:"
echo $ORF_Gff
echo "Redirecting to:"
echo $ORF_Gff_mod
$ProgDir/gff_corrector.pl $ORF_Gff > $ORF_Gff_mod
done
#Functional annotation
Interproscan was used to give gene models functional annotations. Annotation was run using the commands below:
Note: This is a long-running script. As such, these commands were run using 'screen' to allow jobs to be submitted and monitored in the background. This allows the session to be disconnected and reconnected over time.
Screen ouput detailing the progress of submission of interproscan jobs was redirected to a temporary output file named interproscan_submission.log .
screen -a
cd /home/groups/harrisonlab/project_files/venturia
ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/interproscan
for Genes in $(ls gene_pred/final/v.inaequalis/172_pacbio/final/final_genes_appended_renamed.pep.fasta | grep -w "172_pacbio"); do
echo $Genes
$ProgDir/sub_interproscan.sh $Genes
done 2>&1 | tee -a interproscan_submisison.log
Following interproscan annotation split files were combined using the following commands:
ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/interproscan
for Proteins in $(ls gene_pred/final/v.inaequalis/172_pacbio/final/final_genes_appended_renamed.pep.fasta | grep -w "172_pacbio"); do
Strain=$(echo $Proteins | rev | cut -d '/' -f3 | rev)
Organism=$(echo $Proteins | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
echo $Strain
InterProRaw=gene_pred/interproscan/$Organism/$Strain/raw
$ProgDir/append_interpro.sh $Proteins $InterProRaw
done
Annotations with SwissProt
for Proteome in $(ls gene_pred/final/v.inaequalis/172_pacbio/final/final_genes_appended_renamed.pep.fasta | grep -w "172_pacbio"); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
OutDir=gene_pred/swissprot/$Organism/$Strain
SwissDbDir=../../uniprot/swissprot
SwissDbName=uniprot_sprot
ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/swissprot
qsub $ProgDir/sub_swissprot.sh $Proteome $OutDir $SwissDbDir $SwissDbName
done
Carbohydrte active enzymes were identified using CAZYfollowing recomendations at http://csbl.bmb.uga.edu/dbCAN/download/readme.txt :
for Proteome in $(ls gene_pred/codingquary/v.*/*/*/final_genes_combined.pep.fasta | grep -w "172_pacbio"); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
OutDir=gene_pred/CAZY/$Organism/$Strain
mkdir -p $OutDir
Prefix="$Strain"_CAZY
CazyHmm=../../dbCAN/dbCAN-fam-HMMs.txt
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/HMMER
qsub $ProgDir/sub_hmmscan.sh $CazyHmm $Proteome $Prefix $OutDir
done
The Hmm parser was used to filter hits by an E-value of E1x10-5 or E1x10-e3 if they had a hit over a length of X %.
Those proteins with a signal peptide were extracted from the list and gff files representing these proteins made.
for File in $(ls gene_pred/CAZY/*/*/*CAZY.out.dm); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
OutDir=$(dirname $File)
echo "$Organism - $Strain"
ProgDir=/home/groups/harrisonlab/dbCAN
$ProgDir/hmmscan-parser.sh $OutDir/"$Strain"_CAZY.out.dm > $OutDir/"$Strain"_CAZY.out.dm.ps
CazyHeaders=$(echo $File | sed 's/.out.dm/_headers.txt/g')
cat $OutDir/"$Strain"_CAZY.out.dm.ps | cut -f3 | sort | uniq > $CazyHeaders
echo "number of CAZY proteins identified:"
cat $CazyHeaders | wc -l
Gff=$(ls gene_pred/codingquary/$Organism/$Strain/final/final_genes_appended.gff3)
CazyGff=$OutDir/"$Strain"_CAZY.gff
ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_gff_for_sigP_hits.pl $CazyHeaders $Gff CAZyme ID > $CazyGff
echo "number of CAZY genes identified:"
cat $CazyGff | grep -w 'gene' | wc -l
SecretedProts=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/"$Strain"_final_sp_no_trans_mem.aa)
SecretedHeaders=$(echo $SecretedProts | sed 's/.aa/_headers.txt/g')
cat $SecretedProts | grep '>' | tr -d '>' > $SecretedHeaders
CazyGffSecreted=$OutDir/"$Strain"_CAZY_secreted.gff
$ProgDir/extract_gff_for_sigP_hits.pl $SecretedHeaders $CazyGff Secreted_CAZyme ID > $CazyGffSecreted
echo "number of Secreted CAZY proteins identified:"
cat $CazyGffSecreted | grep -w 'mRNA' | cut -f9 | tr -d 'ID=' | cut -f1 -d ';' > $OutDir/"$Strain"_CAZY_secreted_headers.txt
cat $OutDir/"$Strain"_CAZY_secreted_headers.txt | wc -l
echo "number of Secreted CAZY genes identified:"
cat $CazyGffSecreted | grep -w 'gene' | wc -l
# cat $OutDir/"$Strain"_CAZY_secreted_headers.txt | cut -f1 -d '.' | sort | uniq | wc -l
done > tmp.txt
v.inaequalis - 172_pacbio
number of CAZY proteins identified:
578
number of CAZY genes identified:
578
number of Secreted CAZY proteins identified:
262
number of Secreted CAZY genes identified:
262
Note - the CAZY genes identified may need further filtering based on e value and cuttoff length - see below:
Cols in yourfile.out.dm.ps:
- Family HMM
- HMM length
- Query ID
- Query length
- E-value (how similar to the family HMM)
- HMM start
- HMM end
- Query start
- Query end
- Coverage
-
For fungi, use E-value < 1e-17 and coverage > 0.45
-
The best threshold varies for different CAZyme classes (please see http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4132414/ for details). Basically to annotate GH proteins, one should use a very relax coverage cutoff or the sensitivity will be low (Supplementary Tables S4 and S9); (ii) to annotate CE families a very stringent E-value cutoff and coverage cutoff should be used; otherwise the precision will be very low due to a very high false positive rate (Supplementary Tables S5 and S10)
for CAZY in $(ls gene_pred/CAZY/*/*/*_CAZY.out.dm.ps); do
Strain=$(echo $CAZY | rev | cut -f2 -d '/' | rev)
Organism=$(echo $CAZY | rev | cut -f3 -d '/' | rev)
OutDir=$(dirname $CAZY)
echo "$Organism - $Strain"
Secreted=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp_no_trans_mem_headers.txt)
Gff=gene_pred/codingquary/$Organism/$Strain/final/final_genes_appended.gff3
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/CAZY
$ProgDir/summarise_CAZY.py --cazy $CAZY --inp_secreted $Secreted --inp_gff $Gff --summarise_family --trim_gene_id 2 --kubicek_2014
done | less -S
v.inaequalis - 172_pacbio
B-Galactosidases - 2
B-Glucuronidases - 1
Polygalacturonase - 14
A-Arabinosidases - 11
Xylanases - 4
Polygalacturonate lyases - 10
A-Galactosidases - 3
B-Glycosidases - 7
Cellulases - 10
Antismash was run to identify clusters of secondary metabolite genes within the genome. Antismash was run using the weserver at: http://fungismash.secondarymetabolites.org/#!/start
Uploaded sequence file: repeat_masked/v.inaequalis/172_pacbio/filtered_contigs_repmask/172_pacbio_contigs_unmasked.fa Uploaded feature annotations (optional): gene_pred/codingquary/v.inaequalis/172_pacbio/final/final_genes_appended.gff3 Options selected (as default except CASSIS):
Cluster finder - off
Extra features selected:
KnownClusterBlast
smCoG analysis
Cluster-border prediction based on transcription factor binding sites (CASSIS)
ActiveSiteFinder
SubClusterBlast
Results of web-annotation of gene clusters within the assembly were downloaded to the following directories:
analysis/secondary_metabolites/antismash/v.inaequalis/172_pacbio
for Zip in $(ls analysis/secondary_metabolites/antismash/*/*/*.zip); do
OutDir=$(dirname $Zip)
unzip -d $OutDir $Zip
done
for AntiSmash in $(ls analysis/secondary_metabolites/antismash/*/*/*/*.final.gbk); do
Organism=$(echo $AntiSmash | rev | cut -f4 -d '/' | rev)
Strain=$(echo $AntiSmash | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/secondary_metabolites/antismash/$Organism/$Strain
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/secondary_metabolites
$ProgDir/antismash2gff.py --inp_antismash $AntiSmash > $OutDir/"$Strain"_secondary_metabolite_regions.gff
printf "Number of clusters detected:\t"
cat $OutDir/"$Strain"_secondary_metabolite_regions.gff | grep 'antismash_cluster' | wc -l
GeneGff=gene_pred/codingquary/$Organism/$Strain/final/final_genes_appended.gff3
bedtools intersect -u -a $GeneGff -b $OutDir/"$Strain"_secondary_metabolite_regions.gff > $OutDir/metabolite_cluster_genes.gff
cat $OutDir/metabolite_cluster_genes.gff | grep -w 'mRNA' | cut -f9 | cut -f2 -d '=' | cut -f1 -d ';' > $OutDir/metabolite_cluster_gene_headers.txt
printf "Number of predicted proteins in clusters:\t"
cat $OutDir/metabolite_cluster_gene_headers.txt | wc -l
printf "Number of predicted genes in clusters:\t"
cat $OutDir/metabolite_cluster_genes.gff | grep -w 'gene' | wc -l
done
These clusters represented the following genes. Note that these numbers just show the number of intersected genes with gff clusters and are not confirmed by function
v.inaequalis - 172_pacbio
Number of clusters detected: 22
Number of predicted proteins in clusters: 269
Number of predicted genes in clusters: 264
for Antismash in $(ls analysis/secondary_metabolites/antismash/v.*/*/*_secondary_metabolite_regions.gff); do
Organism=$(echo $Antismash | rev | cut -f3 -d '/' | rev)
Strain=$(echo $Antismash | rev | cut -f2 -d '/' | rev)
echo "$Organism - $Strain"
cat $Antismash | cut -f3 | sort | uniq -c
done
v.inaequalis - 172_pacbio
2 indole
5 nrps
6 other
1 siderophore
5 t1pks
3 terpene
SMURF was also run to identify secondary metabolite gene clusters.
Genes needed to be parsed into a specific tsv format prior to submission on the SMURF webserver.
Gff=gene_pred/codingquary/v.inaequalis/172_pacbio/final/final_genes_appended.gff3
OutDir=analysis/secondary_metabolites/smurf/v.inaequalis/172_pacbio
mkdir -p $OutDir
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/secondary_metabolites
$ProgDir/gff2smurf.py --gff $Gff > $OutDir/172_pacbio_genes_smurf.tsv
Submitted protein fasta file: gene_pred/codingquary/v.inaequalis/172_pacbio/final/final_genes_combined.pep.fasta Submitted above tsv file: analysis/secondary_metabolites/smurf/v.inaequalis/172_pacbio/172_pacbio_genes_smurf.tsv
SMURF output was received by email and downloaded to the cluster in the output directory above.
Output files were parsed into gff format:
OutDir=analysis/secondary_metabolites/smurf/v.inaequalis/172_pacbio
SmurfClusters=$OutDir/Secondary-Metabolite-Clusters.txt
SmurfBackbone=$OutDir/Backbone-genes.txt
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/secondary_metabolites
$ProgDir/smurf2gff.py --smurf_clusters $SmurfClusters --smurf_backbone $SmurfBackbone > $OutDir/Smurf_clusters.gff
#Genomic analysis
Putative pathogenicity and effector related genes were identified within Braker gene models using a number of approaches:
- A) From Augustus gene models - Identifying secreted proteins
- B) From Augustus gene models - Effector identification using EffectorP
Required programs:
- SignalP-4.1
- TMHMM
Proteins that were predicted to contain signal peptides were identified using the following commands:
SplitfileDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/signal_peptides
ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/signal_peptides
CurPath=$PWD
for Proteome in $(ls gene_pred/codingquary/v.*/*/*/final_genes_combined.pep.fasta | grep -w "172_pacbio"); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
SplitDir=gene_pred/final_genes_split/$Organism/$Strain
mkdir -p $SplitDir
BaseName="$Organism""_$Strain"_final_preds
$SplitfileDir/splitfile_500.py --inp_fasta $Proteome --out_dir $SplitDir --out_base $BaseName
for File in $(ls $SplitDir/*_final_preds_*); do
Jobs=$(qstat | grep 'pred_sigP' | wc -l)
while [ $Jobs -gt 20 ]; do
sleep 10
printf "."
Jobs=$(qstat | grep 'pred_sigP' | wc -l)
done
printf "\n"
echo $File
qsub $ProgDir/pred_sigP.sh $File signalp-4.1
done
done
The batch files of predicted secreted proteins needed to be combined into a single file for each strain. This was done with the following commands:
for SplitDir in $(ls -d gene_pred/final_genes_split/*/* | grep -w "172_pacbio"); do
Strain=$(echo $SplitDir | rev |cut -d '/' -f1 | rev)
Organism=$(echo $SplitDir | rev |cut -d '/' -f2 | rev)
InStringAA=''
InStringNeg=''
InStringTab=''
InStringTxt=''
SigpDir=final_genes_signalp-4.1
for GRP in $(ls -l $SplitDir/*_final_preds_*.fa | rev | cut -d '_' -f1 | rev | sort -n); do
InStringAA="$InStringAA gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp.aa";
InStringNeg="$InStringNeg gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp_neg.aa";
InStringTab="$InStringTab gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp.tab";
InStringTxt="$InStringTxt gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp.txt";
done
cat $InStringAA > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_sp.aa
cat $InStringNeg > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_neg_sp.aa
tail -n +2 -q $InStringTab > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_sp.tab
cat $InStringTxt > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_sp.txt
done
Some proteins that are incorporated into the cell membrane require secretion. Therefore proteins with a transmembrane domain are not likely to represent cytoplasmic or apoplastic effectors.
Proteins containing a transmembrane domain were identified:
for Proteome in $(ls gene_pred/codingquary/v.*/*/*/final_genes_combined.pep.fasta | grep -w "172_pacbio"); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/transmembrane_helices
qsub $ProgDir/submit_TMHMM.sh $Proteome
done
Those proteins with transmembrane domains were removed from lists of Signal peptide containing proteins
for File in $(ls gene_pred/trans_mem/*/*/*_TM_genes_neg.txt | grep -w "172_pacbio"); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
TmHeaders=$(echo "$File" | sed 's/neg.txt/neg_headers.txt/g')
cat $File | cut -f1 > $TmHeaders
SigP=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp.aa)
OutDir=$(dirname $SigP)
ProgDir=/home/passet/git_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $SigP --headers $TmHeaders > $OutDir/"$Strain"_final_sp_no_trans_mem.aa
cat $OutDir/"$Strain"_final_sp_no_trans_mem.aa | grep '>' | wc -l
done
v.inaequalis - 172_pacbio
1496
Required programs:
- EffectorP.py
for Proteome in $(ls gene_pred/codingquary/v.*/*/*/final_genes_combined.pep.fasta | grep -w "172_pacbio"); do
Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
BaseName="$Organism"_"$Strain"_EffectorP
OutDir=analysis/effectorP/$Organism/$Strain
ProgDir=~/git_repos/tools/seq_tools/feature_annotation/fungal_effectors
qsub $ProgDir/pred_effectorP.sh $Proteome $BaseName $OutDir
done
Those genes that were predicted as secreted and tested positive by effectorP were identified:
for File in $(ls analysis/effectorP/*/*/*_EffectorP.txt | grep -w "172_pacbio"); do
Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
Headers=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_headers.txt/g')
cat $File | grep 'Effector' | cut -f1 > $Headers
Secretome=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp_no_trans_mem.aa)
OutFile=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_secreted.aa/g')
ProgDir=/home/passet/git_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_from_fasta.py --fasta $Secretome --headers $Headers > $OutFile
OutFileHeaders=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_secreted_headers.txt/g')
cat $OutFile | grep '>' | tr -d '>' > $OutFileHeaders
cat $OutFileHeaders | wc -l
Gff=$(ls gene_pred/codingquary/$Organism/$Strain/*/final_genes_appended.gff3)
EffectorP_Gff=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_secreted.gff/g')
ProgDir=/home/passet/git_repos/tools/gene_prediction/ORF_finder
$ProgDir/extract_gff_for_sigP_hits.pl $OutFileHeaders $Gff effectorP ID > $EffectorP_Gff
done
v.inaequalis - 172_pacbio
629
Small secreted cysteine rich proteins were identified within secretomes. These proteins may be identified by EffectorP, but this approach allows direct control over what constitutes a SSCP.
for Secretome in $(ls gene_pred/final_genes_signalp-4.1/*/*/172_pacbio_final_sp_no_trans_mem.aa); do
Strain=$(echo $Secretome| rev | cut -f2 -d '/' | rev)
Organism=$(echo $Secretome | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/sscp/$Organism/$Strain
mkdir -p $OutDir
ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/sscp
$ProgDir/sscp_filter.py --inp_fasta $Secretome --max_length 300 --threshold 3 --out_fasta $OutDir/"$Strain"_sscp_all_results.fa
cat $OutDir/"$Strain"_sscp_all_results.fa | grep 'Yes' > $OutDir/"$Strain"_sscp.fa
printf "number of SSC-rich genes:\t"
cat $OutDir/"$Strain"_sscp.fa | grep '>' | tr -d '>' | cut -f1 -d '.' | sort | uniq | wc -l
printf "Number of effectors predicted by EffectorP:\t"
EffectorP=$(ls analysis/effectorP/$Organism/$Strain/*_EffectorP_secreted_headers.txt)
cat $EffectorP | wc -l
printf "Number of SSCPs predicted by both effectorP and this approach: \t"
cat $OutDir/"$Strain"_sscp.fa | grep '>' | tr -d '>' > $OutDir/"$Strain"_sscp_headers.txt
cat $OutDir/"$Strain"_sscp_headers.txt $EffectorP | cut -f1 | sort | uniq -d | wc -l
echo ""
done > tmp.txt
v.inaequalis - 172_pacbio
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 507
number of SSC-rich genes: 507
Number of effectors predicted by EffectorP: 629
Number of SSCPs predicted by both effectorP and this approach: 463