Skip to content

Latest commit

 

History

History
1487 lines (1219 loc) · 60.5 KB

PacBio_assembly.mdown

File metadata and controls

1487 lines (1219 loc) · 60.5 KB

Venturia inaequalis PacBio

commands for assembly of V. inaequalis isolate 172 genome from PacBio platform

==========

Scripts used for the assembly of venturia isolate 172 Note - all this work was performed in the directory: /home/groups/harrisonlab/project_files/venturia

###Building of directory structure

RawDatDir=/home/harrir/projects/pacbio_test/v_inaeq/ENQ-933_Richard_Harrison_EMR.RH_PPBFX-340_S6_05-172
mkdir -p raw_dna/pacbio/v.inaequalis/172
cp -r $RawDatDir/A03_1 raw_dna/pacbio/v.inaequalis/172
cp -r $RawDatDir/B03_1 raw_dna/pacbio/v.inaequalis/172
cp -r $RawDatDir/E06_1 raw_dna/pacbio/v.inaequalis/172
OutDir=raw_dna/pacbio/v.inaequalis/172/extracted
mkdir -p $OutDir
cat raw_dna/pacbio/v.inaequalis/172/*/Analysis_Results/*.subreads.fastq > $OutDir/concatenated_pacbio.fastq

Canu Assembly

Genome size based on MiSeq assembled genome of the same isolate

Reads=$(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq)
GenomeSz="70m"
Strain=$(echo $Reads | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Reads | rev | cut -f4 -d '/' | rev)
Prefix="$Strain"_canu
OutDir="assembly/canu/$Organism/$Strain/70m"
ProgDir=~/git_repos/tools/seq_tools/assemblers/canu
qsub $ProgDir/submit_canu.sh $Reads $GenomeSz $Prefix $OutDir

Assembly polished with Pilon

for Assembly in $(ls assembly/canu/v.inaequalis/172/70m/172_canu.contigs.fasta); do
    Organism=v.inaequalis
    Strain=172
    IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
    TrimF1_Read=$IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz
    TrimR1_Read=$IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz
    OutDir=assembly/canu/$Organism/$Strain/polished
    ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/pilon
    qsub $ProgDir/sub_pilon.sh $Assembly $TrimF1_Read $TrimR1_Read $OutDir
done

quast for stats

ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/canu/*/*/polished/pilon.fasta); do
    Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)  
    OutDir=assembly/canu/$Organism/$Strain/polished
    qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done

Spades Assembly

for PacBioDat in $(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq); do
  Organism=$(echo $PacBioDat | rev | cut -f4 -d '/' | rev)
  Strain=$(echo $PacBioDat | rev | cut -f3 -d '/' | rev)
  IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
  TrimF1_Read=$(ls $IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz);
  TrimR1_Read=$(ls $IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz);
  OutDir=assembly/spades_pacbio/$Organism/"$Strain"
  echo $TrimF1_Read
  echo $TrimR1_Read
  ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/spades
  qsub $ProgDir/sub_spades_pacbio.sh $PacBioDat $TrimF1_Read $TrimR1_Read $OutDir 10
done

Filter out contigs < 500bp from assembly

  for Contigs in $(ls assembly/spades_pacbio/*/*/contigs.fasta); do
    AssemblyDir=$(dirname $Contigs)
    mkdir $AssemblyDir/filtered_contigs
    FilterDir=/home/passet/git_repos/tools/seq_tools/assemblers/abyss
    $FilterDir/filter_abyss_contigs.py $Contigs 500 > $AssemblyDir/filtered_contigs/contigs_min_500bp.fasta
  done

Quast

  ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
  for Assembly in $(ls assembly/spades_pacbio/*/*/filtered_contigs/contigs_min_500bp.fasta); do
    Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)  
    OutDir=assembly/spades_pacbio/$Organism/$Strain/filtered_contigs
    qsub $ProgDir/sub_quast.sh $Assembly $OutDir
  done

#Merge pacbio and hybrid assemblies

  for PacBioAssembly in $(ls assembly/canu/*/*/polished/pilon.fasta); do
    Organism=$(echo $PacBioAssembly | rev | cut -f4 -d '/' | rev)
    Strain=$(echo $PacBioAssembly | rev | cut -f3 -d '/' | rev)
    HybridAssembly=$(ls assembly/spades_pacbio/$Organism/$Strain/contigs.fasta)
    OutDir=assembly/merged_canu_spades/$Organism/$Strain
    ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/quickmerge
    qsub $ProgDir/sub_quickmerge.sh $PacBioAssembly $HybridAssembly $OutDir
  done

Merged assembly polished with Pilon

  for Assembly in $(ls assembly/merged_canu_spades/*/*/merged.fasta); do
    Organism=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
    Strain=$(echo $Assembly | rev | cut -f2 -d '/' | rev)
    IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
    TrimF1_Read=$(ls $IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz);
    TrimR1_Read=$(ls $IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz);
    OutDir=assembly/merged_canu_spades/$Organism/$Strain/polished
    ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/pilon
    qsub $ProgDir/sub_pilon.sh $Assembly $TrimF1_Read $TrimR1_Read $OutDir
  done

Contigs renamed in accordance with ncbi recommendations

  ProgDir=~/git_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
  touch tmp.csv
  for Assembly in $(ls assembly/merged_canu_spades/*/*/polished/pilon.fasta); do
    Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)  
    OutDir=assembly/merged_canu_spades/$Organism/$Strain/filtered_contigs
    mkdir -p $OutDir
    $ProgDir/remove_contaminants.py --inp $Assembly --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file tmp.csv
  done
  rm tmp.csv

Quast of polished merged assembly

  ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
  for Assembly in $(ls assembly/merged_canu_spades/*/*/filtered_contigs/contigs_min_500bp_renamed.fasta); do
    Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)  
    OutDir=assembly/merged_canu_spades/$Organism/$Strain/filtered_contigs
    qsub $ProgDir/sub_quast.sh $Assembly $OutDir
  done

###Re-assembly of Pacbio isolate 172 with extra reads #Building of directory structure

RawDatDir=/home/harrir/projects/pacbio_test/v_inaeq/
cp -r $RawDatDir/ENQ-933_Richard_Harrison_EMR.RH_PPBFX-340_S6_05-172_repeat.tar.gz raw_dna/pacbio/v.inaequalis/172
tar -C /raw_dna/pacbio/v.inaequalis/172 -zxvf raw_dna/pacbio/v.inaequalis/172/ENQ-933_Richard_Harrison_EMR.RH_PPBFX-340_S6_05-172_repeat.tar.gz
 mv raw_dna/pacbio/v.inaequalis/172/ENQ-933_Richard_Harrison_EMR.RH_PPBFX-340_S6_05-172_repeat/A03_1/ raw_dna/pacbio/v.inaequalis/172/repeat
OutDir=raw_dna/pacbio/v.inaequalis/172/extracted
cat raw_dna/pacbio/v.inaequalis/172/*/Analysis_Results/*.subreads.fastq > $OutDir/concatenated_pacbio.fastq

Canu Assembly

Genome size based on MiSeq assembled genome of the same isolate

Reads=$(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq)
GenomeSz="70m"
Strain=$(echo $Reads | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Reads | rev | cut -f4 -d '/' | rev)
Prefix="$Strain"_canu
OutDir="assembly/canu/$Organism/$Strain/repeat"
ProgDir=~/git_repos/tools/seq_tools/assemblers/canu
qsub $ProgDir/submit_canu.sh $Reads $GenomeSz $Prefix $OutDir

###Another Re-assembly of Pacbio isolate 172 with further reads #Building of directory structure

3rd data set

RawDatDir=/home/harrir/projects/pacbio_test/v_inaeq
cp -r $RawDatDir/Richard_Harrison_NEMR.RH.ENQ-933.C.02_S6_SAM22709_PRO1095_S6_HMW_05-172.tar.gz raw_dna/pacbio/v.inaequalis/172
cd raw_dna/pacbio/v.inaequalis/172
  tar -zxvf raw_dna/pacbio/v.inaequalis/172/Richard_Harrison_NEMR.RH.ENQ-933.C.02_S6_SAM22709_PRO1095_S6_HMW_05-172.tar.gz 
  cd ../../../..
mv raw_dna/pacbio/v.inaequalis/172/tgac/pnp/raw/pacbio-1/n56105/2016_07_20_PSEQ1128_342/G03_1/ raw_dna/pacbio/v.inaequalis/172/
mv raw_dna/pacbio/v.inaequalis/172/tgac/pnp/raw/pacbio-1/n56105/2016_07_20_PSEQ1128_342/H03_1/ raw_dna/pacbio/v.inaequalis/172/

4th data set

RawDatDir=/home/harrir/projects/pacbio_test/v_inaeq
cp -r $RawDatDir/S6_Richard_Harrison_extra_data_NEMR.RH.ENQ-933.C.02_Raw_reads.tar.gz raw_dna/pacbio/v.inaequalis/172
cd raw_dna/pacbio/v.inaequalis/172
  tar -zxvf raw_dna/pacbio/v.inaequalis/172/S6_Richard_Harrison_extra_data_NEMR.RH.ENQ-933.C.02_Raw_reads.tar.gz 
  cd ../../../..
mv raw_dna/pacbio/v.inaequalis/172/Raw_reads/A01_1 raw_dna/pacbio/v.inaequalis/172/
mv raw_dna/pacbio/v.inaequalis/172/Raw_reads/B01_1 raw_dna/pacbio/v.inaequalis/172/
mv raw_dna/pacbio/v.inaequalis/172/Raw_reads/C01_1 raw_dna/pacbio/v.inaequalis/172/

Concatenated Pacbio data from all 4 data sets (9 pacbio cells in total)

OutDir=raw_dna/pacbio/v.inaequalis/172/extracted
cat raw_dna/pacbio/v.inaequalis/172/*/Analysis_Results/*.subreads.fastq > $OutDir/concatenated_pacbio.fastq

#Canu Assembly

Genome size based on MiSeq assembled genome of the same isolate

Reads=$(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq)
GenomeSz="70m"
Strain=$(echo $Reads | rev | cut -f3 -d '/' | rev)
Organism=$(echo $Reads | rev | cut -f4 -d '/' | rev)
Prefix="$Strain"_canu
OutDir="assembly/canu/$Organism/$Strain/Repeat_2"
ProgDir=~/git_repos/tools/seq_tools/assemblers/canu
qsub $ProgDir/submit_canu.sh $Reads $GenomeSz $Prefix $OutDir

quast of canu

  ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
  for Assembly in $(ls assembly/canu/v.inaequalis/172/Repeat_2/172_canu.contigs.fasta); do
    Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)  
    OutDir=assembly/canu/$Organism/$Strain/Repeat_2
    qsub $ProgDir/sub_quast.sh $Assembly $OutDir
  done

Assembly polished with Pilon

  for Assembly in $(ls assembly/canu/v.inaequalis/172/Repeat_2/172_canu.contigs.fasta); do
    Organism=v.inaequalis
    Strain=172
    IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
    TrimF1_Read=$IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz
    TrimR1_Read=$IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz
    OutDir=assembly/canu/$Organism/$Strain/polished_repeat
    ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/pilon
    qsub $ProgDir/sub_pilon.sh $Assembly $TrimF1_Read $TrimR1_Read $OutDir
  done

quast for stats

ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
for Assembly in $(ls assembly/canu/*/*/polished_repeat/pilon.fasta); do
  Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
  Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)  
  OutDir=assembly/canu/$Organism/$Strain/polished_repeat
  qsub $ProgDir/sub_quast.sh $Assembly $OutDir
done

bwa alignment of canu contigs to pacbio

  Assembly=assembly/canu/v.inaequalis/172/polished_repeat/pilon.fasta
  Reads=raw_dna/pacbio/v.inaequalis/172/extracted/concatenated_pacbio.fastq
  OutDir=analysis/genome_alignment/bwa/v.inaequalis/172/vs_172
  ProgDir=/home/passet/git_repos/tools/seq_tools/genome_alignment/bwa
  qsub $ProgDir/sub_bwa_pacbio.sh $Assembly $Reads $OutDir

Spades Assembly

  for PacBioDat in $(ls raw_dna/pacbio/*/*/extracted/concatenated_pacbio.fastq); do
    Organism=$(echo $PacBioDat | rev | cut -f4 -d '/' | rev)
    Strain=$(echo $PacBioDat | rev | cut -f3 -d '/' | rev)
    IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
    TrimF1_Read=$(ls $IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz);
    TrimR1_Read=$(ls $IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz);
    OutDir=assembly/spades_pacbio/$Organism/"$Strain"/
    echo $TrimF1_Read
    echo $TrimR1_Read
    ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/spades
    qsub $ProgDir/sub_spades_pacbio.sh $PacBioDat $TrimF1_Read $TrimR1_Read $OutDir 10
  done

Filter out contigs < 500bp from assembly

  for Contigs in $(ls assembly/spades_pacbio/*/*/contigs.fasta); do
    AssemblyDir=$(dirname $Contigs)
    FilterDir=/home/passet/git_repos/tools/seq_tools/assemblers/abyss
    $FilterDir/filter_abyss_contigs.py $Contigs 500 > $AssemblyDir/filtered_contigs/contigs_min_500bp.fasta
  done

Quast

  ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
  for Assembly in $(ls assembly/spades_pacbio/*/*/filtered_contigs/contigs_min_500bp.fasta); do
    Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)  
    OutDir=assembly/spades_pacbio/$Organism/$Strain/filtered_contigs_repeat
    qsub $ProgDir/sub_quast.sh $Assembly $OutDir
  done

bwa alignment of spades contigs to pacbio

  Assembly=assembly/spades_pacbio/v.inaequalis/172/filtered_contigs/contigs_min_500bp.fasta
  Reads=raw_dna/pacbio/v.inaequalis/172/extracted/concatenated_pacbio.fastq
  OutDir=analysis/genome_alignment/bwa/v.inaequalis/172/vs_172_spades
  ProgDir=/home/passet/git_repos/tools/seq_tools/genome_alignment/bwa
  qsub $ProgDir/sub_bwa_pacbio.sh $Assembly $Reads $OutDir

#Merge pacbio and hybrid assemblies

  for PacBioAssembly in $(ls assembly/canu/*/*/polished_repeat/pilon.fasta); do
    Organism=$(echo $PacBioAssembly | rev | cut -f4 -d '/' | rev)
    Strain=$(echo $PacBioAssembly | rev | cut -f3 -d '/' | rev)
    HybridAssembly=$(ls assembly/spades_pacbio/$Organism/$Strain/contigs.fasta)
    AnchorLength=500000
    OutDir=assembly/merged_canu_spades/$Organism/$Strain/
    ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/quickmerge
    qsub $ProgDir/sub_quickmerge.sh $PacBioAssembly $HybridAssembly $OutDir $AnchorLength
  done

Merged assembly polished with Pilon

  for Assembly in $(ls assembly/merged_canu_spades/*/*/merged.fasta); do
    Organism=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
    Strain=$(echo $Assembly | rev | cut -f2 -d '/' | rev)
    IlluminaDir=$(ls -d qc_dna/paired/$Organism/$Strain)
    TrimF1_Read=$(ls $IlluminaDir/F/172_S4_L001_R1_001_trim.fq.gz);
    TrimR1_Read=$(ls $IlluminaDir/R/172_S4_L001_R2_001_trim.fq.gz);
    OutDir=assembly/merged_canu_spades/$Organism/$Strain/polished
    ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/pilon
    qsub $ProgDir/sub_pilon.sh $Assembly $TrimF1_Read $TrimR1_Read $OutDir
  done

Contigs renamed in accordance with ncbi recommendations

  ProgDir=~/git_repos/tools/seq_tools/assemblers/assembly_qc/remove_contaminants
  touch tmp.csv
    for Assembly in $(ls assembly/merged_canu_spades/*/*/polished/pilon.fasta); do
      Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
      Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)  
      OutDir=assembly/merged_canu_spades/$Organism/$Strain/filtered_contigs
      mkdir -p $OutDir
      $ProgDir/remove_contaminants.py --inp $Assembly --out $OutDir/contigs_min_500bp_renamed.fasta --coord_file tmp.csv
    done
  rm tmp.csv

Quast of polished merged assembly

  ProgDir=/home/passet/git_repos/tools/seq_tools/assemblers/assembly_qc/quast
  for Assembly in $(ls assembly/merged_canu_spades/*/*/filtered_contigs/contigs_min_500bp_renamed.fasta); do
    Strain=$(echo $Assembly | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Assembly | rev | cut -f4 -d '/' | rev)  
    OutDir=assembly/merged_canu_spades/$Organism/$Strain/filtered_contigs
    qsub $ProgDir/sub_quast.sh $Assembly $OutDir
  done

bwa alignment of merged contigs to pacbio

  Assembly=ls assembly/merged_canu_spades/*/*/filtered_contigs/contigs_min_500bp_renamed.fasta
  Reads=raw_dna/pacbio/v.inaequalis/172/extracted/concatenated_pacbio.fastq
  OutDir=analysis/genome_alignment/bwa/v.inaequalis/172/vs_172_merged
  ProgDir=/home/passet/git_repos/tools/seq_tools/genome_alignment/bwa
  qsub $ProgDir/sub_bwa_pacbio.sh $Assembly $Reads $OutDir

Folder tidying

To avoid any potential overwritting of Isolate 172 miseq assembly analysis renamed pacbio assembly folders named "172" to "172_pacbio"

  mv raw_dna/pacbio/*/* raw_dna/pacbio/v.inaequalis/172_pacbio
  mv assembly/canu/v.inaequalis/172/ assembly/canu/v.inaequalis/172_pacbio
  mv assembly/spades_pacbio/v.inaequalis/172/ assembly/spades_pacbio/v.inaequalis/172_pacbio
  mv assembly/merged_canu_spades/v.inaequalis/172/ assembly/merged_canu_spades/v.inaequalis/172_pacbio
  mv analysis/genome_alignment/bwa/v.inaequalis/172/ analysis/genome_alignment/bwa/v.inaequalis/172_pacbio

Analysis of assemblies

Repeatmasking

Repeat masking was performed and used the following programs: Repeatmasker Repeatmodeler

Repeat masking of canu assembly

  ProgDir=/home/passet/git_repos/tools/seq_tools/repeat_masking
  for Ass in $(ls assembly/canu/*/*/polished_repeat/pilon.fasta); do
    qsub $ProgDir/rep_modeling.sh $Ass
    qsub $ProgDir/transposonPSI.sh $Ass
  done

Ran repeatmasking of merged assembly too

  ProgDir=/home/passet/git_repos/tools/seq_tools/repeat_masking
  for Ass in $(ls assembly/merged_canu_spades/*/*/filtered_contigs/contigs_min_500bp_renamed.fasta); do
    qsub $ProgDir/rep_modeling.sh $Ass
    qsub $ProgDir/transposonPSI.sh $Ass
done

The number of bases masked by transposonPSI and Repeatmasker were summarised using the following commands:

  for RepDir in $(ls -d repeat_masked/v.*/172_pacbio/*); do
    Strain=$(echo $RepDir | rev | cut -f2 -d '/' | rev)
    Organism=$(echo $RepDir | rev | cut -f3 -d '/' | rev)  
    RepMaskGff=$(ls $RepDir/*_contigs_hardmasked.gff)
    TransPSIGff=$(ls $RepDir/*_contigs_unmasked.fa.TPSI.allHits)
    printf "$Organism\t$Strain\n"
    printf "The number of bases masked by RepeatMasker:\t"
    sortBed -i $RepMaskGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
    printf "The number of bases masked by TransposonPSI:\t"
    sortBed -i $TransPSIGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
    printf "The total number of masked bases are:\t"
    cat $RepMaskGff $TransPSIGff | sortBed | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
    echo
  done

v.inaequalis 172_pacbio The number of bases masked by RepeatMasker: 33982597 The number of bases masked by TransposonPSI: 10254 The total number of masked bases are: Differing number of GFF fields encountered at line: 32896.

v.inaequalis 172_pacbio The number of bases masked by RepeatMasker: 34180552 The number of bases masked by TransposonPSI: 10254 The total number of masked bases are: Differing number of GFF fields encountered at line: 34454.

Proportion of Transposable elements

Genome announcement review wanted to know proportion of repeatmasked due to TEs

for RepDir in $(ls -d repeat_masked/v.*/172_pacbio/*); do
Strain=$(echo $RepDir | rev | cut -f2 -d '/' | rev)
Organism=$(echo $RepDir | rev | cut -f3 -d '/' | rev)  
RepMaskGff=$(ls $RepDir/172_pacbio_contigs_transposonmasked.gff)
TransPSIGff=$(ls $RepDir/*_contigs_unmasked.fa.TPSI.allHits)
printf "$Organism\t$Strain\n"
printf "The number of bases masked by RepeatMasker:\t"
sortBed -i $RepMaskGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
printf "The number of bases masked by TransposonPSI:\t"
sortBed -i $TransPSIGff | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
printf "The total number of masked bases are:\t"
cat $RepMaskGff $TransPSIGff | sortBed | bedtools merge | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'
echo
done

v.inaequalis 172_pacbio The number of bases masked by RepeatMasker: 33537704 The number of bases masked by TransposonPSI: 10254 The total number of masked bases are: Differing number of GFF fields encountered at line: 21779. Exiting... 0

v.inaequalis 172_pacbio The number of bases masked by RepeatMasker: 33733448 The number of bases masked by TransposonPSI: 10254 The total number of masked bases are: Differing number of GFF fields encountered at line: 23362. Exiting... 0

Gene Prediction

Gene prediction followed three steps: Pre-gene prediction - Quality of merged PacBio genome assembly assessed using Cegma to see how many core eukaryotic genes can be identified. Gene model training - Gene models were trained using assembled RNAseq data as part of the Braker1 pipeline Gene prediction - Gene models were used to predict genes in genomes as part of the the Braker1 pipeline. This used RNAseq data as hints for gene models.

Pre-gene prediction

Quality of genome assembly assessed by looking for the gene space in the assemblies.

  ProgDir=/home/passet/git_repos/tools/gene_prediction/cegma
    cd /home/groups/harrisonlab/project_files/venturia
    for Genome in $(ls repeat_masked/v.*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -w "172_pacbio"); do
    echo $Genome;
    qsub $ProgDir/sub_cegma.sh $Genome dna;
  done

Outputs were summarised using the commands:

  for File in $(ls gene_pred/cegma/v*/*/*_dna_cegma.completeness_report | grep "172_pacbio"); do
    Strain=$(echo $File | rev | cut -f2 -d '/' | rev);
    Species=$(echo $File | rev | cut -f3 -d '/' | rev);
    printf "$Species\t$Strain\n";
    cat $File | head -n18 | tail -n+4;printf "\n";
  done > gene_pred/cegma/cegma_results_dna_summary.txt

less gene_pred/cegma/cegma_results_dna_summary.txt

Output:

v.inaequalis 172_pacbio #Prots %Completeness - #Total Average %Ortho

Complete 238 95.97 - 260 1.09 7.98

Group 1 61 92.42 - 66 1.08 8.20 Group 2 54 96.43 - 59 1.09 7.41 Group 3 59 96.72 - 68 1.15 11.86 Group 4 64 98.46 - 67 1.05 4.69

Partial 245 98.79 - 273 1.11 9.80

Group 1 64 96.97 - 70 1.09 7.81 Group 2 55 98.21 - 61 1.11 9.09 Group 3 61 100.00 - 73 1.20 16.39 Group 4 65 100.00 - 69 1.06 6.15

Busco has replaced CEGMA and was run to check gene space in assemblies

  for Assembly in $(ls repeat_masked/v.*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -w "172_pacbio"); do
    Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
    Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
    echo "$Organism - $Strain"
    ProgDir=/home/passet/git_repos/tools/gene_prediction/busco
    # BuscoDB="Fungal"
    BuscoDB=$(ls -d /home/groups/harrisonlab/dbBusco/ascomycota_odb9)
    OutDir=gene_pred/busco/$Organism/$Strain/assembly
    qsub $ProgDir/sub_busco3.sh $Assembly $BuscoDB $OutDir
  done

Output:

 BUSCO version is: 3.0.2
# The lineage dataset is: ascomycota_odb9 (Creation date: 2016-02-13, number of
# To reproduce this run: python /home/armita/prog/busco/busco/scripts/run_BUSCO.
#
# Summarized benchmarking in BUSCO notation for file 172_pacbio_contigs_unmasked
# BUSCO was run in mode: genome

        C:97.8%[S:97.7%,D:0.1%],F:0.5%,M:1.7%,n:1315

        1286    Complete BUSCOs (C)
        1285    Complete and single-copy BUSCOs (S)
        1       Complete and duplicated BUSCOs (D)
        7       Fragmented BUSCOs (F)
        22      Missing BUSCOs (M)
        1315    Total BUSCO groups searched
  for File in $(ls gene_pred/busco/v*/*/assembly/*/short_summary_*.txt); do  
    echo $File;
    cat $File | grep -e '(C)' -e 'Total';
  done

Output:

1286 Complete BUSCOs (C) 1315 Total BUSCO groups searched

#Gene prediction

Gene prediction was performed for V. inaequalis PacBio genome.

Gene prediction 1 - Braker1 gene model training and prediction

Gene prediction was performed using Braker1.

First, RNAseq data from Thakur et al was aligned to V. inaequalis PacBio genome.

  • qc of RNA seq data is detailed in README.md

Aligning

Insert sizes of the RNA seq library were unknown until a draft alignment could be made. To do this tophat and cufflinks were run, aligning the reads against the genome. The fragment length and stdev were printed to stdout while cufflinks was running.

  for Assembly in $(ls repeat_masked/v.*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -w "172_pacbio"); do
    Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
    Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
    Paired=$(echo $Assembly | rev | cut -d '/' -f5 | rev)
    echo "$Organism - $Strain"
    for rna_file in $(ls qc_rna/*/*/*/*/*.gz | grep -w 'paired'); do
      Timepoint=$(echo $rna_file | rev | cut -f3 -d '/' | rev)
      echo "$Timepoint"
      OutDir=alignment/$Paired/$Organism/$Strain/$Timepoint
      ProgDir=/home/passet/git_repos/tools/seq_tools/RNAseq
      qsub $ProgDir/tophat_alignment_unpaired.sh $Assembly $rna_file $OutDir
    done
  done

Alignments were concatenated prior to running cufflinks: Cufflinks was run to produce the fragment length and stdev statistics:

  for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_softmasked.fa | grep -w "172_pacbio"); do
    Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
    Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
    echo "$Organism - $Strain"
    mkdir -p alignment/repeat_masked/$Organism/$Strain/concatenated_prelim
    AcceptedHits=alignment/repeat_masked/$Organism/$Strain/concatenated_prelim/concatenated.bam
    samtools merge -f $AcceptedHits \
    alignment/repeat_masked/$Organism/$Strain/V0/accepted_hits.bam \
    alignment/repeat_masked/$Organism/$Strain/V2/accepted_hits.bam \
    alignment/repeat_masked/$Organism/$Strain/V5/accepted_hits.bam
    OutDir=gene_pred/cufflinks/$Organism/$Strain/concatenated_prelim
    mkdir -p $OutDir
    cufflinks -o $OutDir/cufflinks -p 8 --max-intron-length 4000 $AcceptedHits 2>&1 | tee $OutDir/cufflinks/cufflinks.log
  done

Output from stdout included:

You are using Cufflinks v2.2.1, which is the most recent release.
[08:31:38] Inspecting reads and determining fragment length distribution.
> Processed 47186 loci.                        [*************************] 100%
> Map Properties:
>       Normalized Map Mass: 10994414.77
>       Raw Map Mass: 10994414.77
>       Fragment Length Distribution: Truncated Gaussian (default)
>                     Default Mean: 200
>                  Default Std Dev: 80
[08:32:48] Assembling transcripts and estimating abundances.
> Processed 47219 loci.                        [*************************] 100%

The Estimated Mean: 200 allowed calculation of of the mean insert gap to be -88 bp 200-(144*2) where 144 was the mean read length. This was provided to tophat on a second run (as the -r option) along with the fragment length stdev to increase the accuracy of mapping.

Then RNaseq data was aligned to the PacBio genome assembly:

InsertGap='-88'
InsertStdDev='80'

  for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_unmasked.fa | grep -w "172_pacbio"); do
      Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
      Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
      Paired=$(echo $Assembly | rev | cut -d '/' -f5 | rev)
      echo "$Organism - $Strain"
      for rna_file in $(ls qc_rna/*/*/*/*/*.gz | grep -w 'paired'); do
      Timepoint=$(echo $rna_file | rev | cut -f3 -d '/' | rev)
      echo "$Timepoint"
      OutDir=alignment/$Paired/$Organism/$Strain/"$Timepoint"_paired
      ProgDir=/home/passet/git_repos/tools/seq_tools/RNAseq
      qsub $ProgDir/tophat_alignment_interlevered.sh $Assembly $rna_file $OutDir $InsertGap $InsertStdDev
    done
    for rna_file in $(ls qc_rna/*/*/*/*/*.gz | grep -w 'unpaired'); do
      Timepoint=$(echo $rna_file | rev | cut -f3 -d '/' | rev)
      echo "$Timepoint"
      OutDir=alignment/$Paired/$Organism/$Strain/"$Timepoint"_unpaired
      ProgDir=/home/passet/git_repos/tools/seq_tools/RNAseq
      qsub $ProgDir/tophat_alignment_unpaired.sh $Assembly $rna_file $OutDir
    done
  done

Braker prediction

Gene prediction using Braker1 Prediction of all putative ORFs in the genome using the ORF finder (atg.pl) approach.

Before braker predictiction was performed, I double checked that I had the genemark key in my user area and copied it over from the genemark install directory:

ls ~/.gm_key
cp /home/armita/prog/genemark/gm_key_64 ~/.gm_key

Ran braker prediction on PacBio genome

  for Assembly in $(ls repeat_masked/*/*/filtered_contigs_repmask/*_contigs_softmasked.fa | grep "172_pacbio"); do
    Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
    Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
    echo "$Organism - $Strain"
    mkdir -p alignment/repeat_masked/$Organism/$Strain/concatenated
    samtools merge -f alignment/repeat_masked/$Organism/$Strain/concatenated/concatenated.bam \
    alignment/repeat_masked/$Organism/$Strain/V0_unpaired/accepted_hits.bam \
    alignment/repeat_masked/$Organism/$Strain/V2_unpaired/accepted_hits.bam \
    alignment/repeat_masked/$Organism/$Strain/V5_unpaired/accepted_hits.bam \
    alignment/repeat_masked/$Organism/$Strain/V0_paired/accepted_hits.bam \
    alignment/repeat_masked/$Organism/$Strain/V2_paired/accepted_hits.bam \
    alignment/repeat_masked/$Organism/$Strain/V5_paired/accepted_hits.bam
    OutDir=gene_pred/braker/$Organism/"$Strain"_braker_new
    AcceptedHits=alignment/repeat_masked/$Organism/$Strain/concatenated/concatenated.bam
    GeneModelName="$Organism"_"$Strain"_braker_new
    rm -r /home/armita/prog/augustus-3.1/config/species/"$Organism"_"$Strain"_braker_new
    ProgDir=/home/passet/git_repos/tools/gene_prediction/braker1
    qsub $ProgDir/sub_braker_fungi.sh $Assembly $OutDir $AcceptedHits $GeneModelName
  done

Fasta and gff files were extracted from Braker1 output.

  for File in $(ls gene_pred/braker/v.*/172_pacbio_braker_new/*/augustus.gff); do
    getAnnoFasta.pl $File
    OutDir=$(dirname $File)
    echo "##gff-version 3" > $OutDir/augustus_extracted.gff
    cat $File | grep -v '#' >> $OutDir/augustus_extracted.gff
  done

Supplimenting Braker gene models with CodingQuary genes

Additional genes were added to Braker gene predictions, using CodingQuary in pathogen mode to predict additional regions.

Fistly, aligned RNAseq data was assembled into transcripts using Cufflinks.

Note - cufflinks doesn't always predict direction of a transcript and therefore features can not be restricted by strand when they are intersected.

  for Assembly in $(ls repeat_masked/*/*/filtered_*/*_contigs_unmasked.fa | grep -w "172_pacbio"); do 
    Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
    Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
    echo "$Organism - $Strain"
    OutDir=gene_pred/cufflinks/$Organism/$Strain/concatenated
    mkdir -p $OutDir
    AcceptedHits=alignment/repeat_masked/$Organism/$Strain/concatenated/concatenated.bam
    ProgDir=/home/passet/git_repos/tools/seq_tools/RNAseq
    qsub $ProgDir/sub_cufflinks.sh $AcceptedHits $OutDir
  done

Secondly, genes were predicted using CodingQuary:

  for Assembly in $(ls repeat_masked/*/*/filtered_*/*_contigs_softmasked.fa | grep -w "172_pacbio"); do
    Strain=$(echo $Assembly| rev | cut -d '/' -f3 | rev)
    Organism=$(echo $Assembly | rev | cut -d '/' -f4 | rev)
    echo "$Organism - $Strain"
    OutDir=gene_pred/codingquary/$Organism/$Strain
    CufflinksGTF=gene_pred/cufflinks/$Organism/$Strain/concatenated/transcripts.gtf
    ProgDir=/home/passet/git_repos/tools/gene_prediction/codingquary
    qsub $ProgDir/sub_CodingQuary.sh $Assembly $CufflinksGTF $OutDir
  done

Then, additional transcripts were added to Braker gene models, when CodingQuary genes were predicted in regions of the genome, not containing Braker gene models:

#The following is edited after NCBI found duplicated genes

  for BrakerGff in $(ls gene_pred/braker/v.*/172_pacbio_braker_new/*/augustus.gff3); do
    Strain=$(echo $BrakerGff| rev | cut -d '/' -f3 | rev | sed 's/_braker_new//g')
    Organism=$(echo $BrakerGff | rev | cut -d '/' -f4 | rev)
    echo "$Organism - $Strain"
    Assembly=$(ls repeat_masked/$Organism/$Strain/filtered_*/"$Strain"_contigs_softmasked.fa)
    CodingQuaryGff=gene_pred/codingquary/$Organism/$Strain/out/PredictedPass.gff3
    PGNGff=gene_pred/codingquary/$Organism/$Strain/out/PGN_predictedPass.gff3
    AddDir=gene_pred/codingquary/$Organism/$Strain/additional
    FinalDir=gene_pred/final/$Organism/$Strain/final
    AddGenesList=$AddDir/additional_genes.txt
    AddGenesGff=$AddDir/additional_genes.gff
    FinalGff=$AddDir/combined_genes.gff
    mkdir -p $AddDir
    mkdir -p $FinalDir

    bedtools intersect -v -a $CodingQuaryGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' > $AddGenesList
    bedtools intersect -v -a $PGNGff -b $BrakerGff | grep 'gene'| cut -f2 -d'=' | cut -f1 -d';' >> $AddGenesList
    ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation
    $ProgDir/gene_list_to_gff.pl $AddGenesList $CodingQuaryGff CodingQuarry_v2.0 ID CodingQuary > $AddGenesGff
    $ProgDir/gene_list_to_gff.pl $AddGenesList $PGNGff PGNCodingQuarry_v2.0 ID CodingQuary >> $AddGenesGff
    ProgDir=/home/passet/git_repos/tools/gene_prediction/codingquary
    # -
    # This section is edited
    $ProgDir/add_CodingQuary_features.pl $AddGenesGff $Assembly > $AddDir/add_genes_CodingQuary_unspliced.gff3
    $ProgDir/correct_CodingQuary_splicing.py --inp_gff $AddDir/add_genes_CodingQuary_unspliced.gff3 > $FinalDir/final_genes_CodingQuary.gff3
    # -
    $ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_CodingQuary.gff3 $FinalDir/final_genes_CodingQuary
    cp $BrakerGff $FinalDir/final_genes_Braker.gff3
    $ProgDir/gff2fasta.pl $Assembly $FinalDir/final_genes_Braker.gff3 $FinalDir/final_genes_Braker
    cat $FinalDir/final_genes_Braker.pep.fasta $FinalDir/final_genes_CodingQuary.pep.fasta | sed -r 's/\*/X/g' > $FinalDir/final_genes_combined.pep.fasta
    cat $FinalDir/final_genes_Braker.cdna.fasta $FinalDir/final_genes_CodingQuary.cdna.fasta > $FinalDir/final_genes_combined.cdna.fasta
    cat $FinalDir/final_genes_Braker.gene.fasta $FinalDir/final_genes_CodingQuary.gene.fasta > $FinalDir/final_genes_combined.gene.fasta
    cat $FinalDir/final_genes_Braker.upstream3000.fasta $FinalDir/final_genes_CodingQuary.upstream3000.fasta > $FinalDir/final_genes_combined.upstream3000.fasta


    GffBraker=$FinalDir/final_genes_Braker.gff3
    GffQuary=$FinalDir/final_genes_CodingQuary.gff3
    GffAppended=$FinalDir/final_genes_appended.gff3
    cat $GffBraker $GffQuary > $GffAppended
  done

The final number of genes per isolate was observed using:

for DirPath in $(ls -d gene_pred/codingquary/v.*/*/final | grep -w "172_pacbio"); do
echo $DirPath;
cat $DirPath/final_genes_Braker.pep.fasta | grep '>' | wc -l;
cat $DirPath/final_genes_CodingQuary.pep.fasta | grep '>' | wc -l;
cat $DirPath/final_genes_combined.pep.fasta | grep '>' | wc -l;
echo "";
done

Output:

gene_pred/codingquary/v.inaequalis/172_pacbio/final
12198
1673
13871

After submission to ncbi and identification of duplicate genes, gene models were renamed and duplicate gene features were identified and removed.

for GffAppended in $(ls gene_pred/final/*/*/final/final_genes_appended.gff3); do
Strain=$(echo $GffAppended | rev | cut -d '/' -f3 | rev)
Organism=$(echo $GffAppended | rev | cut -d '/' -f4 | rev)
echo "$Organism - $Strain"
FinalDir=gene_pred/final/$Organism/$Strain/final
GffFiltered=$FinalDir/filtered_duplicates.gff
ProgDir=/home/passet/git_repos/tools/gene_prediction/codingquary
$ProgDir/remove_dup_features.py --inp_gff $GffAppended --out_gff $GffFiltered
GffRenamed=$FinalDir/final_genes_appended_renamed.gff3
LogFile=$FinalDir/final_genes_appended_renamed.log
ProgDir=/home/passet/git_repos/tools/gene_prediction/codingquary
$ProgDir/gff_rename_genes.py --inp_gff $GffFiltered --conversion_log $LogFile > $GffRenamed
rm $GffFiltered
Assembly=$(ls repeat_masked/$Organism/$Strain/filtered_*/"$Strain"_contigs_softmasked.fa)
$ProgDir/gff2fasta.pl $Assembly $GffRenamed gene_pred/final/$Organism/$Strain/final/final_genes_appended_renamed
# The proteins fasta file contains * instead of Xs for stop codons, these should
# be changed
sed -i 's/\*/X/g' gene_pred/final/$Organism/$Strain/final/final_genes_appended_renamed.pep.fasta
done

Output:

v.inaequalis - 172_pacbio
Identifiied the following duplicated transcripts:

NOTE - if any of these represent the first transcript of a gene (.t1) then an entire gene may be duplicated
if so the gene feature will need to be stripped out of the gff file seperately.
Genome fasta parsed

ORF finder

The genome was searched in six reading frames for any start codon and following translated identification of a start codon translating sequence until a stop codon was found. This is based upon the atg.pl script used in paper describing the P. infestans genome. Additional functionality was added to this script by also printing ORFs in .gff format.

  ProgDir=/home/passet/git_repos/tools/gene_prediction/ORF_finder
  for Genome in $(ls repeat_masked/v.*/*/*/*_contigs_unmasked.fa | grep "172_pacbio"); do
    qsub $ProgDir/run_ORF_finder.sh $Genome
  done

The Gff files from the the ORF finder are not in true Gff3 format. These were corrected using the following commands:

  ProgDir=~/git_repos/tools/seq_tools/feature_annotation
  for ORF_Gff in $(ls gene_pred/ORF_finder/*/*/*_ORF.gff | grep -v '_F_atg_' | grep -v '_R_atg_'); do
    ORF_Gff_mod=$(echo $ORF_Gff | sed 's/_ORF.gff/_ORF_corrected.gff3/g')
    echo ""
    echo "Correcting the following file:"
    echo $ORF_Gff
    echo "Redirecting to:"
    echo $ORF_Gff_mod
    $ProgDir/gff_corrector.pl $ORF_Gff > $ORF_Gff_mod
  done

#Functional annotation

A) Interproscan

Interproscan was used to give gene models functional annotations. Annotation was run using the commands below:

Note: This is a long-running script. As such, these commands were run using 'screen' to allow jobs to be submitted and monitored in the background. This allows the session to be disconnected and reconnected over time.

Screen ouput detailing the progress of submission of interproscan jobs was redirected to a temporary output file named interproscan_submission.log .

  screen -a
  cd /home/groups/harrisonlab/project_files/venturia
  ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/interproscan
  for Genes in $(ls gene_pred/final/v.inaequalis/172_pacbio/final/final_genes_appended_renamed.pep.fasta | grep -w "172_pacbio"); do
    echo $Genes
    $ProgDir/sub_interproscan.sh $Genes
  done 2>&1 | tee -a interproscan_submisison.log

Following interproscan annotation split files were combined using the following commands:

  ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/interproscan
  for Proteins in $(ls gene_pred/final/v.inaequalis/172_pacbio/final/final_genes_appended_renamed.pep.fasta | grep -w "172_pacbio"); do
    Strain=$(echo $Proteins | rev | cut -d '/' -f3 | rev)
    Organism=$(echo $Proteins | rev | cut -d '/' -f4 | rev)
    echo "$Organism - $Strain"
    echo $Strain
    InterProRaw=gene_pred/interproscan/$Organism/$Strain/raw
    $ProgDir/append_interpro.sh $Proteins $InterProRaw
  done

B) SwissProt

Annotations with SwissProt

  for Proteome in $(ls gene_pred/final/v.inaequalis/172_pacbio/final/final_genes_appended_renamed.pep.fasta | grep -w "172_pacbio"); do
    Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
    OutDir=gene_pred/swissprot/$Organism/$Strain
    SwissDbDir=../../uniprot/swissprot
    SwissDbName=uniprot_sprot
    ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/swissprot
    qsub $ProgDir/sub_swissprot.sh $Proteome $OutDir $SwissDbDir $SwissDbName
  done

C) CAZY proteins

Carbohydrte active enzymes were identified using CAZYfollowing recomendations at http://csbl.bmb.uga.edu/dbCAN/download/readme.txt :

  for Proteome in $(ls gene_pred/codingquary/v.*/*/*/final_genes_combined.pep.fasta | grep -w "172_pacbio"); do
    Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
    OutDir=gene_pred/CAZY/$Organism/$Strain
    mkdir -p $OutDir
    Prefix="$Strain"_CAZY
    CazyHmm=../../dbCAN/dbCAN-fam-HMMs.txt
    ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/HMMER
    qsub $ProgDir/sub_hmmscan.sh $CazyHmm $Proteome $Prefix $OutDir
  done

The Hmm parser was used to filter hits by an E-value of E1x10-5 or E1x10-e3 if they had a hit over a length of X %.

Those proteins with a signal peptide were extracted from the list and gff files representing these proteins made.

  for File in $(ls gene_pred/CAZY/*/*/*CAZY.out.dm); do
    Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
    Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
    OutDir=$(dirname $File)
    echo "$Organism - $Strain"
    ProgDir=/home/groups/harrisonlab/dbCAN
    $ProgDir/hmmscan-parser.sh $OutDir/"$Strain"_CAZY.out.dm > $OutDir/"$Strain"_CAZY.out.dm.ps
    CazyHeaders=$(echo $File | sed 's/.out.dm/_headers.txt/g')
    cat $OutDir/"$Strain"_CAZY.out.dm.ps | cut -f3 | sort | uniq > $CazyHeaders
    echo "number of CAZY proteins identified:"
    cat $CazyHeaders | wc -l
    Gff=$(ls gene_pred/codingquary/$Organism/$Strain/final/final_genes_appended.gff3)
    CazyGff=$OutDir/"$Strain"_CAZY.gff
    ProgDir=/home/armita/git_repos/emr_repos/tools/gene_prediction/ORF_finder
    $ProgDir/extract_gff_for_sigP_hits.pl $CazyHeaders $Gff CAZyme ID > $CazyGff
    echo "number of CAZY genes identified:"
    cat $CazyGff | grep -w 'gene' | wc -l

    SecretedProts=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/"$Strain"_final_sp_no_trans_mem.aa)
    SecretedHeaders=$(echo $SecretedProts | sed 's/.aa/_headers.txt/g')
    cat $SecretedProts | grep '>' | tr -d '>' > $SecretedHeaders
    CazyGffSecreted=$OutDir/"$Strain"_CAZY_secreted.gff
    $ProgDir/extract_gff_for_sigP_hits.pl $SecretedHeaders $CazyGff Secreted_CAZyme ID > $CazyGffSecreted
    echo "number of Secreted CAZY proteins identified:"
    cat $CazyGffSecreted | grep -w 'mRNA' | cut -f9 | tr -d 'ID=' | cut -f1 -d ';' > $OutDir/"$Strain"_CAZY_secreted_headers.txt
    cat $OutDir/"$Strain"_CAZY_secreted_headers.txt | wc -l
    echo "number of Secreted CAZY genes identified:"
    cat $CazyGffSecreted | grep -w 'gene' | wc -l
    # cat $OutDir/"$Strain"_CAZY_secreted_headers.txt | cut -f1 -d '.' | sort | uniq | wc -l
  done > tmp.txt
v.inaequalis - 172_pacbio
number of CAZY proteins identified:
578
number of CAZY genes identified:
578
number of Secreted CAZY proteins identified:
262
number of Secreted CAZY genes identified:
262

Note - the CAZY genes identified may need further filtering based on e value and cuttoff length - see below:

Cols in yourfile.out.dm.ps:

  1. Family HMM
  2. HMM length
  3. Query ID
  4. Query length
  5. E-value (how similar to the family HMM)
  6. HMM start
  7. HMM end
  8. Query start
  9. Query end
  10. Coverage
  • For fungi, use E-value < 1e-17 and coverage > 0.45

  • The best threshold varies for different CAZyme classes (please see http://www.ncbi.nlm.nih.gov/pmc/articles/PMC4132414/ for details). Basically to annotate GH proteins, one should use a very relax coverage cutoff or the sensitivity will be low (Supplementary Tables S4 and S9); (ii) to annotate CE families a very stringent E-value cutoff and coverage cutoff should be used; otherwise the precision will be very low due to a very high false positive rate (Supplementary Tables S5 and S10)

Summary of CAZY families

  for CAZY in $(ls gene_pred/CAZY/*/*/*_CAZY.out.dm.ps); do
    Strain=$(echo $CAZY | rev | cut -f2 -d '/' | rev)
    Organism=$(echo $CAZY | rev | cut -f3 -d '/' | rev)
    OutDir=$(dirname $CAZY)
    echo "$Organism - $Strain"
    Secreted=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp_no_trans_mem_headers.txt)
    Gff=gene_pred/codingquary/$Organism/$Strain/final/final_genes_appended.gff3
    ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/CAZY
    $ProgDir/summarise_CAZY.py --cazy $CAZY --inp_secreted $Secreted --inp_gff $Gff --summarise_family --trim_gene_id 2 --kubicek_2014
  done | less -S
v.inaequalis - 172_pacbio
B-Galactosidases - 2
B-Glucuronidases - 1
Polygalacturonase - 14
A-Arabinosidases - 11
Xylanases - 4
Polygalacturonate lyases - 10
A-Galactosidases - 3
B-Glycosidases - 7
Cellulases - 10

D) Secondary Metabolites (AntiSMASH + SMURF)

Antismash was run to identify clusters of secondary metabolite genes within the genome. Antismash was run using the weserver at: http://fungismash.secondarymetabolites.org/#!/start

Uploaded sequence file: repeat_masked/v.inaequalis/172_pacbio/filtered_contigs_repmask/172_pacbio_contigs_unmasked.fa Uploaded feature annotations (optional): gene_pred/codingquary/v.inaequalis/172_pacbio/final/final_genes_appended.gff3 Options selected (as default except CASSIS):

Cluster finder - off
Extra features selected:
KnownClusterBlast
smCoG analysis
Cluster-border prediction based on transcription factor binding sites (CASSIS)
ActiveSiteFinder
SubClusterBlast

Results of web-annotation of gene clusters within the assembly were downloaded to the following directories:

analysis/secondary_metabolites/antismash/v.inaequalis/172_pacbio
  for Zip in $(ls analysis/secondary_metabolites/antismash/*/*/*.zip); do
    OutDir=$(dirname $Zip)
    unzip -d $OutDir $Zip
  done
for AntiSmash in $(ls analysis/secondary_metabolites/antismash/*/*/*/*.final.gbk); do
Organism=$(echo $AntiSmash | rev | cut -f4 -d '/' | rev)
Strain=$(echo $AntiSmash | rev | cut -f3 -d '/' | rev)
echo "$Organism - $Strain"
OutDir=analysis/secondary_metabolites/antismash/$Organism/$Strain
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/secondary_metabolites
$ProgDir/antismash2gff.py --inp_antismash $AntiSmash > $OutDir/"$Strain"_secondary_metabolite_regions.gff
printf "Number of clusters detected:\t"
cat $OutDir/"$Strain"_secondary_metabolite_regions.gff | grep 'antismash_cluster' | wc -l
GeneGff=gene_pred/codingquary/$Organism/$Strain/final/final_genes_appended.gff3
bedtools intersect -u -a $GeneGff -b $OutDir/"$Strain"_secondary_metabolite_regions.gff > $OutDir/metabolite_cluster_genes.gff
cat $OutDir/metabolite_cluster_genes.gff | grep -w 'mRNA' | cut -f9 | cut -f2 -d '=' | cut -f1 -d ';' > $OutDir/metabolite_cluster_gene_headers.txt
printf "Number of predicted proteins in clusters:\t"
cat $OutDir/metabolite_cluster_gene_headers.txt | wc -l
printf "Number of predicted genes in clusters:\t"
cat $OutDir/metabolite_cluster_genes.gff | grep -w 'gene' | wc -l
done

These clusters represented the following genes. Note that these numbers just show the number of intersected genes with gff clusters and are not confirmed by function

v.inaequalis - 172_pacbio
Number of clusters detected:    22
Number of predicted proteins in clusters:       269
Number of predicted genes in clusters:  264
  for Antismash in $(ls analysis/secondary_metabolites/antismash/v.*/*/*_secondary_metabolite_regions.gff); do
    Organism=$(echo $Antismash | rev | cut -f3 -d '/' | rev)
    Strain=$(echo $Antismash | rev | cut -f2 -d '/' | rev)
    echo "$Organism - $Strain"
    cat $Antismash | cut -f3 | sort | uniq -c
  done
v.inaequalis - 172_pacbio
      2 indole
      5 nrps
      6 other
      1 siderophore
      5 t1pks
      3 terpene

SMURF was also run to identify secondary metabolite gene clusters.

Genes needed to be parsed into a specific tsv format prior to submission on the SMURF webserver.

  Gff=gene_pred/codingquary/v.inaequalis/172_pacbio/final/final_genes_appended.gff3
  OutDir=analysis/secondary_metabolites/smurf/v.inaequalis/172_pacbio
  mkdir -p $OutDir
  ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/secondary_metabolites
  $ProgDir/gff2smurf.py --gff $Gff > $OutDir/172_pacbio_genes_smurf.tsv

Submitted protein fasta file: gene_pred/codingquary/v.inaequalis/172_pacbio/final/final_genes_combined.pep.fasta Submitted above tsv file: analysis/secondary_metabolites/smurf/v.inaequalis/172_pacbio/172_pacbio_genes_smurf.tsv

SMURF output was received by email and downloaded to the cluster in the output directory above.

Output files were parsed into gff format:

OutDir=analysis/secondary_metabolites/smurf/v.inaequalis/172_pacbio
SmurfClusters=$OutDir/Secondary-Metabolite-Clusters.txt
SmurfBackbone=$OutDir/Backbone-genes.txt
ProgDir=/home/armita/git_repos/emr_repos/tools/seq_tools/feature_annotation/secondary_metabolites
$ProgDir/smurf2gff.py --smurf_clusters $SmurfClusters --smurf_backbone $SmurfBackbone > $OutDir/Smurf_clusters.gff

#Genomic analysis

Effector genes

Putative pathogenicity and effector related genes were identified within Braker gene models using a number of approaches:

  • A) From Augustus gene models - Identifying secreted proteins
  • B) From Augustus gene models - Effector identification using EffectorP

A) From Augustus gene models - Identifying secreted proteins

Required programs:

  • SignalP-4.1
  • TMHMM

Proteins that were predicted to contain signal peptides were identified using the following commands:

  SplitfileDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/signal_peptides
  ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/signal_peptides
  CurPath=$PWD
  for Proteome in $(ls gene_pred/codingquary/v.*/*/*/final_genes_combined.pep.fasta | grep -w "172_pacbio"); do
    Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
    SplitDir=gene_pred/final_genes_split/$Organism/$Strain
    mkdir -p $SplitDir
    BaseName="$Organism""_$Strain"_final_preds
    $SplitfileDir/splitfile_500.py --inp_fasta $Proteome --out_dir $SplitDir --out_base $BaseName
    for File in $(ls $SplitDir/*_final_preds_*); do
      Jobs=$(qstat | grep 'pred_sigP' | wc -l)
      while [ $Jobs -gt 20 ]; do
      sleep 10
      printf "."
      Jobs=$(qstat | grep 'pred_sigP' | wc -l)
      done
      printf "\n"
      echo $File
      qsub $ProgDir/pred_sigP.sh $File signalp-4.1
    done
  done

The batch files of predicted secreted proteins needed to be combined into a single file for each strain. This was done with the following commands:

  for SplitDir in $(ls -d gene_pred/final_genes_split/*/* | grep -w "172_pacbio"); do
    Strain=$(echo $SplitDir | rev |cut -d '/' -f1 | rev)
    Organism=$(echo $SplitDir | rev |cut -d '/' -f2 | rev)
    InStringAA=''
    InStringNeg=''
    InStringTab=''
    InStringTxt=''
    SigpDir=final_genes_signalp-4.1
    for GRP in $(ls -l $SplitDir/*_final_preds_*.fa | rev | cut -d '_' -f1 | rev | sort -n); do  
      InStringAA="$InStringAA gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp.aa";  
      InStringNeg="$InStringNeg gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp_neg.aa";  
      InStringTab="$InStringTab gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp.tab";
      InStringTxt="$InStringTxt gene_pred/$SigpDir/$Organism/$Strain/split/"$Organism"_"$Strain"_final_preds_$GRP""_sp.txt";  
    done
    cat $InStringAA > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_sp.aa
    cat $InStringNeg > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_neg_sp.aa
    tail -n +2 -q $InStringTab > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_sp.tab
    cat $InStringTxt > gene_pred/$SigpDir/$Organism/$Strain/"$Strain"_final_sp.txt
  done

Some proteins that are incorporated into the cell membrane require secretion. Therefore proteins with a transmembrane domain are not likely to represent cytoplasmic or apoplastic effectors.

Proteins containing a transmembrane domain were identified:

  for Proteome in $(ls gene_pred/codingquary/v.*/*/*/final_genes_combined.pep.fasta | grep -w "172_pacbio"); do
    Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
    ProgDir=/home/passet/git_repos/tools/seq_tools/feature_annotation/transmembrane_helices
    qsub $ProgDir/submit_TMHMM.sh $Proteome
  done

Those proteins with transmembrane domains were removed from lists of Signal peptide containing proteins

  for File in $(ls gene_pred/trans_mem/*/*/*_TM_genes_neg.txt | grep -w "172_pacbio"); do
    Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
    Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
    echo "$Organism - $Strain"
    TmHeaders=$(echo "$File" | sed 's/neg.txt/neg_headers.txt/g')
    cat $File | cut -f1 > $TmHeaders
    SigP=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp.aa)
    OutDir=$(dirname $SigP)
    ProgDir=/home/passet/git_repos/tools/gene_prediction/ORF_finder
    $ProgDir/extract_from_fasta.py --fasta $SigP --headers $TmHeaders > $OutDir/"$Strain"_final_sp_no_trans_mem.aa
    cat $OutDir/"$Strain"_final_sp_no_trans_mem.aa | grep '>' | wc -l
  done
v.inaequalis - 172_pacbio
1496

B) From Augustus gene models - Effector identification using EffectorP

Required programs:

  • EffectorP.py
  for Proteome in $(ls gene_pred/codingquary/v.*/*/*/final_genes_combined.pep.fasta | grep -w "172_pacbio"); do
    Strain=$(echo $Proteome | rev | cut -f3 -d '/' | rev)
    Organism=$(echo $Proteome | rev | cut -f4 -d '/' | rev)
    BaseName="$Organism"_"$Strain"_EffectorP
    OutDir=analysis/effectorP/$Organism/$Strain
    ProgDir=~/git_repos/tools/seq_tools/feature_annotation/fungal_effectors
    qsub $ProgDir/pred_effectorP.sh $Proteome $BaseName $OutDir
  done

Those genes that were predicted as secreted and tested positive by effectorP were identified:

  for File in $(ls analysis/effectorP/*/*/*_EffectorP.txt | grep -w "172_pacbio"); do
    Strain=$(echo $File | rev | cut -f2 -d '/' | rev)
    Organism=$(echo $File | rev | cut -f3 -d '/' | rev)
    echo "$Organism - $Strain"
    Headers=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_headers.txt/g')
    cat $File | grep 'Effector' | cut -f1 > $Headers
    Secretome=$(ls gene_pred/final_genes_signalp-4.1/$Organism/$Strain/*_final_sp_no_trans_mem.aa)
    OutFile=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_secreted.aa/g')
    ProgDir=/home/passet/git_repos/tools/gene_prediction/ORF_finder
    $ProgDir/extract_from_fasta.py --fasta $Secretome --headers $Headers > $OutFile
    OutFileHeaders=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_secreted_headers.txt/g')
    cat $OutFile | grep '>' | tr -d '>' > $OutFileHeaders
    cat $OutFileHeaders | wc -l
    Gff=$(ls gene_pred/codingquary/$Organism/$Strain/*/final_genes_appended.gff3)
    EffectorP_Gff=$(echo "$File" | sed 's/_EffectorP.txt/_EffectorP_secreted.gff/g')
    ProgDir=/home/passet/git_repos/tools/gene_prediction/ORF_finder
    $ProgDir/extract_gff_for_sigP_hits.pl $OutFileHeaders $Gff effectorP ID > $EffectorP_Gff
  done
v.inaequalis - 172_pacbio
629

SSCP

Small secreted cysteine rich proteins were identified within secretomes. These proteins may be identified by EffectorP, but this approach allows direct control over what constitutes a SSCP.

  for Secretome in $(ls gene_pred/final_genes_signalp-4.1/*/*/172_pacbio_final_sp_no_trans_mem.aa); do
    Strain=$(echo $Secretome| rev | cut -f2 -d '/' | rev)
    Organism=$(echo $Secretome | rev | cut -f3 -d '/' | rev)
    echo "$Organism - $Strain"
    OutDir=analysis/sscp/$Organism/$Strain
    mkdir -p $OutDir
    ProgDir=/home/armita/git_repos/emr_repos/tools/pathogen/sscp
    $ProgDir/sscp_filter.py --inp_fasta $Secretome --max_length 300 --threshold 3 --out_fasta $OutDir/"$Strain"_sscp_all_results.fa
    cat $OutDir/"$Strain"_sscp_all_results.fa | grep 'Yes' > $OutDir/"$Strain"_sscp.fa
    printf "number of SSC-rich genes:\t"
    cat $OutDir/"$Strain"_sscp.fa | grep '>' | tr -d '>' | cut -f1 -d '.' | sort | uniq | wc -l
    printf "Number of effectors predicted by EffectorP:\t"
    EffectorP=$(ls analysis/effectorP/$Organism/$Strain/*_EffectorP_secreted_headers.txt)
    cat $EffectorP | wc -l
    printf "Number of SSCPs predicted by both effectorP and this approach: \t"
    cat $OutDir/"$Strain"_sscp.fa | grep '>' | tr -d '>' > $OutDir/"$Strain"_sscp_headers.txt
    cat $OutDir/"$Strain"_sscp_headers.txt $EffectorP | cut -f1 | sort | uniq -d | wc -l
    echo ""
  done > tmp.txt
v.inaequalis - 172_pacbio
% cysteine content threshold set to: 3
maximum length set to: 300
No. short-cysteine rich proteins in input fasta: 507
number of SSC-rich genes: 507
Number of effectors predicted by EffectorP: 629
Number of SSCPs predicted by both effectorP and this approach: 463