CBrev.txt

## start over, see if we can redo our original biom table construction. 


## we already rearranged the split barcodes on the leaf reads, is that 
## still laying around?

## they're here:

ls -lh /home/daniel/Documents/taiwan/taiwan_dada2/rearranged_leafR1.fastq
ls -lh /home/daniel/Documents/taiwan/taiwan_dada2/rearranged_leafR2.fastq

## let's make a link to these:


ln -s /home/daniel/Documents/taiwan/taiwan_dada2/rearranged_leafR1.fastq reLeafR1.fastq
ln -s /home/daniel/Documents/taiwan/taiwan_dada2/rearranged_leafR2.fastq reLeafR2.fastq

rm reLeafR*.fastq

## confused - it looks like I used to have an aggregate wood reads file, 
## all the R1s in one file, R2s in the other. 

## whatever it was, I don't have it anymore. 

## without this, do we just loop through the wood reads, trim as we go?

## wood directory

wooddir='/home/daniel/Documents/taiwan/woodreads/'

i='lane1-s257-index-GAGGACTT-CCTAAGTCNNNN-PosI_S257_L001_R1_001.fastq'

cut <(echo $i) -d "-" -f 1

echo $i 

for i in $wooddir*"_R1_"*; do 
    echo $i
done

fastx_trimmer -l 255 -i $i -o woodR1_trimmed.fastq
fastx_trimmer -l 210 -i woodR2.fastq -o woodR2_trimmed.fastq


## okay, but we don't have single wood forward and reverse files, 
## we have a forward and a reverse for each sample 

## so apply the trim across all reads in the wood files. 

## where are these?
wooddir=/home/daniel/Documents/taiwan/woodreads/


## -p creates the parent as needed and does not fail if the
## directories are already there from an earlier attempt
mkdir -p trimmed_wood/R1 trimmed_wood/R2

## trims.sh
########################################

#!/usr/bin/env bash
## trims.sh: cut every per-sample wood read file to a fixed length with
## fastx_trimmer. R1 files are kept to 255 bases, R2 files to 210 bases.
## Each output is named after its input, with "_001.fastq" replaced by
## "_trimmed.fastq", and written under trimmed_wood/R1 or trimmed_wood/R2.

wooddir=/home/daniel/Documents/taiwan/woodreads/
biomdir=/home/daniel/Documents/taiwan/taiwan_combined_biom

cd "$biomdir" || exit 1

## an unmatched glob should give an empty loop, not a literal "*_R1_*" name
shopt -s nullglob

## trim_reads TAG LENGTH OUTDIR
##   TAG     read-direction tag found in the file names (R1 or R2)
##   LENGTH  value handed to fastx_trimmer -l
##   OUTDIR  directory (with trailing slash) that receives the trimmed files
trim_reads() {
    local tag=$1 len=$2 outdir=$3
    local reads name out
    mkdir -p "$outdir" || return 1
    for reads in "$wooddir"*"_${tag}_"*; do
        echo "$reads"
        name=$(basename "$reads")
        out=$outdir${name/_001.fastq/_trimmed.fastq}
        ## only report the output name when the trim succeeded
        fastx_trimmer -l "$len" -i "$reads" -o "$out" && echo "$out"
    done
}

R1trimdir="$biomdir/trimmed_wood/R1/"
R2trimdir="$biomdir/trimmed_wood/R2/"

trim_reads R1 255 "$R1trimdir"
trim_reads R2 210 "$R2trimdir"

########################################


time ./trims.sh

echo "zoop"

## and the leaf reads?

mkdir  trimmed_leaf
mkdir trimmed_leaf/R{1..2}

### okay, but I think we gotta do this on Talapas. 
## I recall it's sometimes weird using outputs from
## the 32-bit home version of usearch with the 64-bit
## software, etc, so best to stick with the 64bit
## version on Talapas...

scp reLeafR1.fastq dthomas@talapas-login.uoregon.edu:/projects/xylaria/dthomas 

scp reLeafR2.fastq dthomas@talapas-login.uoregon.edu:/projects/xylaria/dthomas 

## gotta get the wood reads (trimmed) there too:

## options go before the operands; the second copy of this line was a
## mistyped "cp", which cannot reach a remote user@host: target
scp -r trimmed_wood dthomas@talapas-login.uoregon.edu:/projects/xylaria/dthomas

## merging is next....

## on the new cluster (Talapas) the 
## batch info is included as comments
## in the script:

## merge_leaves.sh
###########################
#! /usr/bin/env bash

#SBATCH --job-name=merge_leaves
#SBATCH --output=merge_leaves.out
#SBATCH --error=merge_leaves.err
#SBATCH --time=0-04:00:00
#SBATCH --nodes=1

leafdir=/projects/xylaria/dthomas/leaf/

module load usearch/8.0

usearch -fastq_mergepairs $leafdir"reLeafR2.fastq" -reverse $leafdir"reLeafR1.fastq" -fastqout $leafdir"leafmerged.fastq"
usearch -fastq_mergepairs $leafdir"Roo_R2_trimmed.fastq" -reverse $leafdir"Roo_R1_trimmed.fastq" -fastqout $leafdir"leaftrimmedmerged.fastq"


################################

## didn't work. they don't have the license for
## usearch 64bit, just 32bit. 

## what file size can usearch 32-bit handle?
## our wood reads are demultiplexed, much 
## smaller, let's try these:

## merge_wood.sh
################################
#!/usr/bin/env bash

#SBATCH --job-name=merge_wood
#SBATCH --output=merge_wood.out
#SBATCH --error=merge_wood.err
#SBATCH --time=0-05:00:00
#SBATCH --nodes=1

## merge_wood.sh: merge each trimmed wood R1 file with its R2 partner using
## usearch -fastq_mergepairs, one merged fastq per sample.

module load usearch/8.0

## absolute path: the job's start directory is not guaranteed to be /
cd /projects/xylaria/dthomas/ || exit 1

R1d=/projects/xylaria/dthomas/trimmed_wood/R1/
R2d=/projects/xylaria/dthomas/trimmed_wood/R2/
outdir=/projects/xylaria/dthomas/merged_wood/

mkdir -p "$outdir" || exit 1

for forward in "$R1d"*; do
    echo "$forward"
    aa=$(basename "$forward")
    ## build the partner path from the file name only, so a stray "R1"
    ## elsewhere in the path or sample name cannot be rewritten
    reverse=$R2d${aa/_R1_/_R2_}
    output=$outdir${aa/_R1_trimmed.fastq/_merged.fastq}
    usearch -fastq_mergepairs "$forward" -reverse "$reverse" -fastqout "$output"
done

###########################

## did that work?

## looks good. So what was our largest file size?

ls -Slhr

## 51 meg. 

## our leaf files after trimming are 10 and 6.8 gig. 

## this largest file size is 374560 lines 

## let's split up the leaves in to files with line
## numbers in multiples of four.

## pipeline: 

## 1) split leaf and wood reads down to ~1 gig files
## 2) run usearch on all
## 3) combine with cat

## 1) split leaf into 1 gig files:

## trimmed, unpaired leaves are here in talapas:

cd /projects/xylaria/dthomas/leaf

## how many lines in these leaf read files?

wc -l Roo_R1_trimmed.fastq ## 72806460

wc -l Roo_R2_trimmed.fastq ## same, 72806460

expr 72806460 / 4 ## 18201615 reads... is that about right? 

## anyway, multiple of four

## we should probably write a script for this, get off the 
## head node:

## split_leaf.sh
################################

#!/bin/bash

#SBATCH --job-name=split_leaf
#SBATCH --output=split_leaf.out
#SBATCH --error=split_leaf.err
#SBATCH --time=0-05:00:00
#SBATCH --nodes=1

## split_leaf.sh: cut the trimmed leaf read files into 500000-line chunks
## (a multiple of 4, so no fastq record is split across two chunks).

cd /projects/xylaria/dthomas/leaf/split_leaf || exit 1

for i in ../Roo_R*_*; do
    ls "$i"
    ## use the bare file name as the prefix: with the "../" left on, the
    ## chunks are written to the parent directory instead of split_leaf/
    name=$(basename "$i")
    split -d -l 500000 "$i" "${name/fastq/split.fastq}"
done


###########################


## Submitted batch job 1731251

## worked?:


cd /projects/xylaria/dthomas/leaf/split_leaf


expr 72806460 / 500000 ## should be 145 or so files:

for i in *; do
wc -l $i
done

## looks good, matches up
## can we cycle through these and merge?

######################

#!/bin/bash

#SBATCH --job-name=leaf_merge
#SBATCH --output=leaf_merge.out
#SBATCH --error=leaf_merge.err
#SBATCH --time=0-05:00:00
#SBATCH --nodes=1

## leaf_merge: run usearch -fastq_mergepairs on every pair of split leaf
## chunks. The R2 chunk is passed as the first read and the matching R1
## chunk as -reverse; merged output goes to merged/.

module load usearch/8.0

cd /projects/xylaria/dthomas/leaf/split_leaf/ || exit 1

## usearch does not create the output directory itself
mkdir -p merged || exit 1

for i in Roo_R2_trimmed.split.fastq*; do
    usearch -fastq_mergepairs "$i" -reverse "${i/_R2_/_R1_}" -fastqout "merged/${i/_R2_/_merged_}"
done

###################################

## seems like it worked...

scp dthomas@talapas-login.uoregon.edu:/home/dthomas/leaf_merge.out ./
scp dthomas@talapas-login.uoregon.edu:/home/dthomas/leaf_merge.err ./

## okay, can we recombine these? or is there an order issue now?

## match only the numbered chunks: a bare "*" also matches the combined
## file itself once it exists (i.e. on any rerun)
cat Roo_merged_trimmed.split.fastq?* > Roo_merged_trimmed.split.fastq

## that may be a bit much for the head node ...

srun --pty --partition=short --mem=1024M --time=60 bash

## rerun

ls -l Roo_merged_trimmed.split.fastq

wc -l Roo_merged_trimmed.split.fastq ## 65858336

expr 65858336 / 4 ## 16,464,584 reads. 

echo $((18201615 - 16464584)) ## 1,737,031 reads lost. 

## anyway, what happens next? 
## I think we next demultiplex our leaf reads. 

## but let's catch up the jupyter notebook...

## it looks like more than half of our wood reads didn't pair. 
## check this:

cd /projects/xylaria/dthomas/merged_wood

tot=0
for i in *; do
    tot=$(( $tot + $(wc -l $i | cut -f 1 -d " ") ))
done

echo $(( tot / 4 )) reads

## 1.86 million reads. Something is fucked. 

## so... what happened?

## if we rerun just one of our merges, do we get the same result?

## for instance, we got really bad results from our second file, 33% matched
## this is ... 

## lane1-s161-index-AAGCACTG-TTCGTACGNNNN-Dc-PosG_S161_L001_R1_trimmed.fastq

## to rerun this:

forward="/projects/xylaria/dthomas/trimmed_wood/R1/lane1-s161-index-AAGCACTG-TTCGTACGNNNN-Dc-PosG_S161_L001_R1_trimmed.fastq"
reverse="/projects/xylaria/dthomas/trimmed_wood/R2/lane1-s161-index-AAGCACTG-TTCGTACGNNNN-Dc-PosG_S161_L001_R2_trimmed.fastq"

module load usearch/8.0

usearch -fastq_mergepairs $forward -reverse $reverse -fastqout ./test.fastq

usearch -fastq_mergepairs $forward -reverse $reverse -fastqout ./test.fastq -notrunclabels

## get some error reports:

usearch -fastq_mergepairs $forward -reverse $reverse  -alnout aln.txt

usearch -fastq_mergepairs $forward -reverse $reverse -fastqout ./test.fastq -alnout aln.txt

## it looks like most of our reads aligned, ~95%, and they look pretty good on visual 
## inspection, but we need to relax our permitted number of mismatches. 
## in newer versions, this is the "-fastq_maxdiffs" parameter, default = 5. 
## True for 8.0?

usearch -fastq_mergepairs $forward -reverse $reverse -fastq_maxdiffs 10 -alnout aln_10.txt

usearch -fastq_mergepairs $forward -reverse $reverse -fastq_maxdiffs 20 -alnout aln_20.txt

## none of this is helping. "mismatches" and "differences" are different types of error in 
## this version, though not in the more recent versions. And I can't find documentation 
## for older versions. What is the difference between these errors? Most of our reads aligned
## very well.  

## let's try this on my own computer, with v8.1.x of usearch

cd mergetest 

forward='lane1-s161-index-AAGCACTG-TTCGTACGNNNN-Dc-PosG_S161_L001_R1_trimmed.fastq'
reverse='lane1-s161-index-AAGCACTG-TTCGTACGNNNN-Dc-PosG_S161_L001_R2_trimmed.fastq'

usearch -fastq_mergepairs $forward -reverse $reverse -fastqout ./test.fastq

## better results. Still not perfect (67%) but maybe that's okay. 

usearch -fastq_mergepairs $forward -reverse $reverse -fastq_maxdiffs 10 -fastqout ./test2.fastq

## even better, 

usearch -fastq_mergepairs $forward \
    -reverse $reverse \
    -fastq_maxdiffs 10 \
    -alnout aln_2.txt \
    -report test2.report.txt 

## maybe wiser to use a percentage?

usearch -fastq_mergepairs $forward \
    -reverse $reverse \
    -fastq_maxdiffpct 40 \
    -alnout aln_3.txt \
    -report test3.report.txt \
    -fastqout ./test3.fastq

## drops us back to 67.7%

## sleepy. Time for bed. 

## so in the future, need to rerun the merging for wood and split up leaf reads,
## because usearch 8.1 seems so much more forgiving than 8.0. 

## is there something I can set to running tonight?

## split up leaf files on the optiplex?
## run the merge on all of them?

## we need to 

## 1 get all trimmed wood onto the optiplex
## 2 and all trimmed leaves
## 3 install usearch 8.1 onto optiplex
## 4 split up the leaves
## 5 run mergepairs on all wood
## 6 run mergepairs on all leaf
## 7 recombine leaves
 

## or just sleep. Let's do that. 

##### try again #########

## get trimmed wood onto optiplex ...

scp -r ./trimmed_wood/ daniel@192.168.1.7:/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/

## trimmed leaves - better to do the trimming on site? or copy?

## where are our leaves? put them on optiplex

scp dthomas@talapas-login.uoregon.edu:/projects/xylaria/dthomas/leaf/Roo_R{1..2}_trimmed.fastq ./

## so we're loading the trimmed wood and leaves onto the optiplex 

## split up the leaves:


## split_leaf.sh
################################

#!/usr/bin/env bash

## split_leaf.sh (optiplex): cut the two trimmed leaf read files into
## 500000-line chunks (a multiple of 4, so fastq records stay whole).

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves || exit 1

## chunks go into split/, which is where the merge step reads them from
mkdir -p split || exit 1

## name the two inputs exactly: "Roo_R*_*" would also match chunks left
## over from an earlier run
for i in Roo_R[12]_trimmed.fastq; do
    ls "$i"
    split -d -l 500000 "$i" "split/${i/fastq/split.fastq}"
done


###########################

## can we record our outputs from usearch?

cd mergetest 

forward='lane1-s161-index-AAGCACTG-TTCGTACGNNNN-Dc-PosG_S161_L001_R1_trimmed.fastq'
reverse='lane1-s161-index-AAGCACTG-TTCGTACGNNNN-Dc-PosG_S161_L001_R2_trimmed.fastq'

## name the report after the file being merged ($forward); $i is just
## whatever an earlier loop happened to leave behind
mkdir -p reports

usearch -fastq_mergepairs "$forward" \
    -reverse "$reverse" \
    -fastq_maxdiffpct 40 \
    -alnout aln_3.txt \
    -report "reports/$forward.report.txt" \
    -fastqout ./test3.fastq

## merge wood on optiplex

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/R1

## merge every trimmed wood R1 file with its R2 partner.
## Reports go to ../reports/ and merged reads to ../merged/, so a rerun of
## this loop does not pick up its own outputs from the R1 directory.
mkdir -p ../reports ../merged

for forward in *_trimmed.fastq; do
    ls -l "$forward"
    reverse="../R2/${forward/_R1_/_R2_}"
    ## nothing may follow a line-continuation backslash: "\ ## comment" ends
    ## the command right there, and -report / -fastqout never reach usearch.
    ## -alnout is left out: a single aln_3.txt was overwritten on every pass.
    usearch -fastq_mergepairs "$forward" \
        -reverse "$reverse" \
        -fastq_maxdiffpct 40 \
        -report "../reports/$forward.report.txt" \
        -fastqout "../merged/$forward.merged.fastq"
    echo "$reverse"
done

## oops, deleted a file - redo just this:

forward=lane1-s257-index-GAGGACTT-CCTAAGTCNNNN-PosI_S257_L001_R1_trimmed.fastq
ls -l $forward
reverse="../R2/${forward/R1/R2}"
ls -l $reverse
usearch -fastq_mergepairs $forward \
    -reverse $reverse \
    -fastq_maxdiffpct 40 \
    -report ../reports/$forward.report.txt \
    -fastqout ./$forward.merged.fastq

## look, this publication and the others need to get out. 
## the expedition needs to happen. 
## then I can move on the next los Cedros project... some
## thing beautiful for Rosa in this dying world. 


## I think we're ready to merge wood reads ...

## and the leaves, still on optiplex:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/split

## did the splitting mess up order?
head -n 1 Roo_R1_trimmed.split.fastq9054
head -n 1 Roo_R2_trimmed.split.fastq9054

tail -n 4 Roo_R1_trimmed.split.fastq72
tail -n 4 Roo_R2_trimmed.split.fastq72

## looks good

## merge every split leaf R1 chunk with its R2 partner; reports go to
## ../reports/ and merged reads to ../merged/
mkdir -p ../reports ../merged

for forward in *_R1_*; do
    #ls $forward
    reverse=${forward/_R1_/_R2_}
    ## nothing may follow a line-continuation backslash: "\ ## comment" ends
    ## the command right there, and -report / -fastqout never reach usearch.
    ## -alnout is left out: a single aln_3.txt was overwritten on every pass.
    usearch -fastq_mergepairs "$forward" \
        -reverse "$reverse" \
        -fastq_maxdiffpct 40 \
        -report "../reports/$forward.report.txt" \
        -fastqout "../merged/$forward.merged.fastq"
    #ls $reverse
done

## seems to be working ...

## okay, so to revisit the wood

## did we lose a lot?

cat lane1-s160-index-AAGCACTG-GTGATCCANNNN-Dc-X_S160_L001_R1_trimmed.fastq.report.txt

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/merged

clear
wc -l lane1-s257-index-GAGGACTT-CCTAAGTCNNNN-PosI_S257_L001_R1_trimmed.fastq.merged.fastq 

grep "^@M" lane1-s257-index-GAGGACTT-CCTAAGTCNNNN-PosI_S257_L001_R1_trimmed.fastq.merged.fastq  | wc -l

head lane1-s257-index-GAGGACTT-CCTAAGTCNNNN-PosI_S257_L001_R1_trimmed.fastq.merged.fastq 

bb=$(wc -l lane1-s257-index-GAGGACTT-CCTAAGTCNNNN-PosI_S257_L001_R1_trimmed.fastq.merged.fastq | cut -f 1 -d " ")
echo "bb="$bb
aa=$(( $bb / 4 ))
echo aa=$aa
echo aaX4=$(( $aa * 4 ))

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/merged

lines=0
for i in *; do
    bb=$(wc -l $i | cut -f 1 -d " ")
    aa=$(( $bb / 4 ))
    lines=$(( aa + lines ))
done

echo $lines ## 3745085

## or maybe just 
grep "^@M" -r | wc -l ## yeah. 3745085

## vs..
cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/R1
grep "^@M" -r | wc -l ## 5608274

## 3745085 / 5608274 = ## 66%. Hmm, not great, but what can I do? Why is this so different 
## from when we did this last year? What did I do differently?

## And the leaves?

## merged
cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/merged
(grep "^@HWI" -r | wc -l &)
## 14813290
## 14,813,290

## unmerged:
cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/
(grep "^@HWI" Roo_R2_trimmed.fastq | wc -l &)
## 18201615
## 18,201,615

## 81%, a lot better. Why? The wood library has generally better qscores. And when we
## merged them before with the 64-bit version of usearch, we had nearly 100% merging
## with the wood. Something seems off. 

## Oh well, let's march through to the biom table, and see how biom tables compare. 
## If they are drastically different from last years results, we'll see what can be 
## tweaked. 

## onward

## cat the leaf files:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/merged

## take only the per-chunk merge outputs (*.merged.fastq): a bare "*" also
## matches the combined file itself as soon as it exists
cat *.merged.fastq > leaf_trimmed_merged.fastq

## cat the wood files, for quality charts:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/merged

## same here: per-sample files only, never the aggregate
cat *.merged.fastq > wood_trimmed_merged.fastq


## then what? 
## get trimmed, merged files onto laptop
## visualize qualities
## quality filter
## go to fasta
## demultiplex leaves
## remove leaf primers
## remove floating primers
## chimera check
## combine 
## trim to ITS1 region
## combine 

## oh, lotsa shit. I'm far from finished. Trek on. 


## tonight
## run fastq stats on optiplex
## get all files onto laptop

## after visualization, organize files, update github and notebook

## stats, on optiplex. What do we want to look at? Unmerged forward 
## and reverse, both trimmed and untrimmed. And the merged, trimmed reads. 

## leaves:

## leafQcharts.sh
#####################################

#!/usr/bin/env bash

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/qCharts/leaf

rawLeafReadsR1="/home/daniel/Documents/taiwan_supp/roo_reads/TaiwanFA_R1.fastq"
rawLeafReadsR2="/home/daniel/Documents/taiwan_supp/roo_reads/TaiwanFA_R2.fastq"
trimmedLeafReadsR1="/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/Roo_R1_trimmed.fastq"
trimmedLeafReadsR2="/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/Roo_R2_trimmed.fastq"
leafmerg="/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/merged/leaf_trimmed_merged.fastq"

## leaf quality stats:
fastx_quality_stats -i $rawLeafReadsR1 -o rawLeafReadsR1_fastxstats.txt
fastx_quality_stats -i $rawLeafReadsR2 -o rawLeafReadsR2_fastxstats.txt
fastx_quality_stats -i $trimmedLeafReadsR1 -o trimmedLeafReadsR1_fastxstats.txt
fastx_quality_stats -i $trimmedLeafReadsR2 -o trimmedLeafReadsR2_fastxstats.txt
fastx_quality_stats -i $leafmerg -o leafmerged_fastxstats.txt


#####################################

(./leafQcharts.sh &) &

## wood reads

## make a combined files for the various prep stages so far:

cat /home/daniel/Documents/taiwan_supp/wood_reads/*R1* > /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/rawWoodReadsR1.fastq

cat /home/daniel/Documents/taiwan_supp/wood_reads/*R2* > /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/rawWoodReadsR2.fastq

cat /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/R1/* > /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmedWoodR1.fastq

cat /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/R2/* > /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmedWoodR2.fastq

## per-sample merge outputs only: the merged/ directory also holds the
## wood_trimmed_merged.fastq aggregate, and "*" would count every read twice
cat /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/merged/*.merged.fastq > /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/woodMerged.fastq

## how?


## woodQcharts.sh
############################
#!/usr/bin/env bash

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom

fastx_quality_stats -i rawWoodReadsR1.fastq -o qCharts/wood/rawWoodReadsR1_fastxstats.txt
fastx_quality_stats -i rawWoodReadsR2.fastq -o qCharts/wood/rawWoodReadsR2_fastxstats.txt
fastx_quality_stats -i trimmedWoodR1.fastq -o qCharts/wood/trimmedWoodR1_fastxstats.txt
fastx_quality_stats -i trimmedWoodR2.fastq -o qCharts/wood/trimmedWoodR2_fastxstats.txt
fastx_quality_stats -i woodMerged.fastq -o qCharts/wood/woodMerged_fastxstats.txt

##########################

(./woodQcharts.sh &) &


#################### quality filtering ######################

## get an example file
scp daniel@192.168.1.7:/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/merged/lane1-s255-index-TGCGAACT-CCTAAGTCNNNN-Neg_S255_L001_R1_trimmed.fastq.merged.fastq ./

exFastq='lane1-s255-index-TGCGAACT-CCTAAGTCNNNN-Neg_S255_L001_R1_trimmed.fastq.merged.fastq'

## let's try quality filtering 
## and what if we want to pipe out the screen messages to a file?

usearch -fastq_filter $exFastq -fastq_maxee_rate .01 -fastqout exampleOut.fastq -notrunclabels 2> exError.txt

## doesn't catch the licensing, but everything else. 

## can we do both - see the sterr on the screen and in file?

usearch -fastq_filter $exFastq -fastq_maxee_rate .01 -fastqout exampleOut.fastq -notrunclabels 1> exStdout.txt 2> exError.txt  

## works, but can we put them on the same file?

usearch -fastq_filter $exFastq -fastq_maxee_rate .01 -fastqout exampleOut.fastq -notrunclabels &> exStdout.txt   

## yeah, that's what I need. To remember all this: 0 is stdin, 1 is stdout, 2 is stderr, and & means both 1 and 2. 

## anyway, do this on the optiplex, all wood files. Where are they?

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/merged

## quality-filter each per-sample merged wood file.
## The glob names the per-sample merge outputs only; a bare "*" also feeds
## the wood_trimmed_merged.fastq aggregate and the log file to usearch.
## Filtered reads go to ../filtered/, where the fasta conversion reads them.
mkdir -p ../filtered

for i in *.fastq.merged.fastq; do
    out=${i/.fastq.merged.fastq/.merge.filt.fastq}
    usearch -fastq_filter "$i" -fastq_maxee_rate .01 -fastqout "../filtered/$out" -notrunclabels &>> mergeStdout.txt
done

## looks like we kept almost all, so maybe the hard filtering happened in the pairing step. 

## host and path are separated by ":" -- without it scp treats the whole
## string as a local path
scp daniel@192.168.1.7:/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/merged/mergeStdout.txt ./
## do we need to use our split up leaves? probably.

## try the single merged file first:

usearch -fastq_filter leaf_trimmed_merged.fastq -fastq_maxee_rate 0.01 -fastqout leaf_filtered.fastq

## yeah, too big. So go through the split files:


mkdir ../filtered

#!/usr/bin/env bash

## filterleaves.sh: quality-filter every merged leaf chunk with
## usearch -fastq_filter; filtered chunks and the combined usearch log
## go to ../filtered/.

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/merged || exit 1

mkdir -p ../filtered || exit 1

## per-chunk files only: "*" would also pick up leaf_trimmed_merged.fastq,
## the single combined file that was too big for usearch
for i in *.merged.fastq; do
    out=${i/.merged.fastq/.merged.filt.fastq}
    #echo $i $out
    usearch -fastq_filter "$i" -fastq_maxee_rate .01 -fastqout "../filtered/$out" -notrunclabels &>> ../filtered/leaf_mergeStdout.txt
done

(./filterleaves.sh &) &


## so  as I understand it, we should have quality filtered reads, for both leaves and wood. 

## check out these reports:

## leaves:

lR=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/filtered/leaf_mergeStdout.txt

less $lR


grep "....\%.passed" $lR 

## looks generally successful 

## and the wood?
lL=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/merged/mergeStdout.txt

less $lL

grep "....\%.passed" $lL

## whoah, way higher...

grep "....\%.passed" $lL | cut -d " " -f 8

## can we convert these to fasta, then update the notebook?

## let's do this with bbmap:

## got to bbmap directory on my laptop, put it on the optiplex:
scp -r ./bbmap daniel@192.168.1.7:/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom 

bb=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/bbmap

## let's do leaves first. Recombine the leaf reads:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/filtered

## filtered chunks only: "*fastq" also matches leaf_merged_filt.fastq
## itself once it exists
cat *.merged.filt.fastq > leaf_merged_filt.fastq

$bb/reformat.sh in=leaf_merged_filt.fastq out=leaf_merged_filt.fasta

## worked, but can we get rid of the line breaks? maybe by setting 
## "fastawrap" to 0? Or 600?  

head -n 20 leaf_merged_filt.fastq > submergedLeaf.fastq
$bb/reformat.sh in=submergedLeaf.fastq out=test.fasta fastawrap=600

## that worked. Just curious, how about 0?

$bb/reformat.sh in=submergedLeaf.fastq out=test.fasta fastawrap=0

## also works, let's go with this.

$bb/reformat.sh in=leaf_merged_filt.fastq out=leaf_merged_filt.fasta fastawrap=0 &> makeLeafFasta.txt

## this worked?
head -n 1 leaf_merged_filt.fastq 
head -n 1 leaf_merged_filt.fasta

tail -n 20 leaf_merged_filt.fasta | grep ">HWI"
tail -n 40 leaf_merged_filt.fastq | grep "@HWI"

## looks okay. Onward...

## let's demultiplex and update the notebook if there is still time:

## to demultiplex - which script, if any, will do the job?

## on the dada pipeline I had to split up the file, apply one of the
## python scripts, and reassemble...

## huh. confused. Looks like I should have done this a while ago, before 
## merging...

## shit. can we do this now? Were these barcodes lost in trimming or 
## merging?

## how to tell?

## look at a few?

head leafread_fastx_map.txt

head 

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/filtered

sed -n '4000000,4000001p' leaf_merged_filt.fasta

head leaf_merged_filt.fasta 


grep ^ACCCAT leaf_merged_filt.fasta -B 1 | grep ">HWI" | wc -l ## 780,301 reads, makes sense. 


## do the other ends of these reads still have the other half of these barcodes?

grep ">HWI" leaf_merged_filt.fasta -B 1 | grep ATATCC$ 

grep ">HWI" leaf_merged_filt.fasta -B 1 | grep ATATCC$ | wc -l ## 301, not many. 

## Reverse complement?

grep ">HWI" leaf_merged_filt.fasta -B 1 | grep GGATAT$ | wc -l ## 499. Fuck. 

grep ">HWI" leaf_merged_filt.fasta -B 1 | grep TATAGG$ | wc -l ## 184. 


## oh, shoot, it looks like I did do this, already
## I'm going nuts. 

## check that really quick:

grep ">HWI" leaf_merged_filt.fasta -A 1 | grep ^ACCCATATATCC | wc -l ## 227,705 seems reasonable. Maybe low, but possible. 

## okay, so we're good to start the demultiplexing then. We can use fastx to do this:

ld="/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/demult/"

(cat leaf_merged_filt.fasta | fastx_barcode_splitter.pl \
    --bcfile leafread_fastx_map.txt \
    --prefix $ld"leaf_"  \
    --suffix ".fa"  \
    --bol --mismatches 1 --partial 1 \
    &>> "leaf_demult_log.txt" &) &

## don't need to do this for wood. But I think we do need the FASTA versions
## of the wood...

bb=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/bbmap
wfd=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodFasta/

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/filtered

for i in *; do 
$bb/reformat.sh in=$i \
    out=$wfd${i/_R1_trimmed.merge.filt.fastq/.fasta} \
    fastawrap=0 \
    &>> ../../woodFastaStdout.txt
done


## so where are we now? 

## next step is to trim off the primers from the leaves. But the 
## demultiplexing is taking a while. So let's catch up our notebook.

tail leaf_merged_filt.fasta

head leaf_merged_filt.fasta

## did the barcodes on the opposite ends of our merged pairs 
## remain?


## did we get reasonable results from the demultiplexing?

cd $ld

less leaf_104.fa

## now what?

## trim primers off of leaf reads. 

## did the barcodes on the opposite ends of our merged pairs 
## remain?

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/demult


## sample 94 should have a barcode of CGTGATAAGACG

cd demult

wc leaf_94.fa -l

## quote the pattern: an unquoted ">HWI" is a redirection to a file named
## HWI, which leaves grep searching stdin for the text "leaf_94.fa"
grep -c '>HWI' leaf_94.fa ## 91439 reads

grep CGTGATAAGACG leaf_94.fa | wc -l ## 84480

grep CGTCTTATCACG leaf_94.fa | wc -l ## 84462

grep CGTCTTATCACG$ leaf_94.fa  | wc -l ## 84462, all at the end

## so these plus primers will need to be cut. Are the primers still
## on there? 

## check, primers still there? ITS2 was our linker primer for 
## the leaf run, so it should be our forward primer:

grep GCTGCGTTCTTCATCGATGC leaf_94.fa | wc -l ## 87084 yup

grep GCTGCGTTCTTCATCGATGC <(head leaf_94.fa)

## sample 94 barcode followed by the forward primer; print the combined length
aa='CGTGATAAGACGGCTGCGTTCTTCATCGATGC'
printf '%s\n' "${#aa}"


grep TTACTTCCTCTAAATGACCAAG leaf_94.fa | wc -l

grep TTACTTCCTCTAAATGACCAAG <(head -n 1000 leaf_94.fa)

bb=TTACTTCCTCTAAATGACCAAGCGTCTTATCACG
echo ${#bb}

## -E so that {1} is a repeat count; in grep's default (basic) syntax
## the braces are literal characters
grep -E 'TTACTTCCTCTAAATGACCAAG.{1}' <(head leaf_94.fa)


## anyway, trim the primers off the leaves:
## strip barcode + primer from both ends of every demultiplexed leaf file:
## -f 33 keeps from base 33 on (drops the first 32 bases), then -t 34
## drops the last 34 bases.
mkdir -p ../leafNoPrim

for i in leaf_*.fa; do
    ## reads that matched no barcode belong to no sample; leave them out
    [[ $i == leaf_unmatched.fa ]] && continue
    fastx_trimmer -i "$i" -f 33 | fastx_trimmer -t 34 -o "../leafNoPrim/${i/leaf/leafNoPrim}"
done

## wood has no primers, don't need to do this. 

## the next step would be? 

## chimera checks.

wget https://unite.ut.ee/sh_files/uchime_reference_dataset_28.06.2017.zip

## where to put this...

ITS1_ref='/home/daniel/Documents/taiwan/uchime_reference_dataset_28.06.2017/'\
'ITS1_ITS2_datasets/uchime_reference_dataset_ITS1_28.06.2017.fasta'

## do the check. Try one.

cd /home/daniel/Documents/taiwan/taiwan_combined_biom/leafNoPrim 

usearch -uchime_ref leafNoPrim_89.fa \
-db $ITS1_ref \
-nonchimeras test_notchim.fasta \
-uchimeout test.log \
-strand plus \
-notrunclabels \
&> uchime_stdout.txt


less uchime_stdout.txt

## works. do with all files, leaf reads:

cd /home/daniel/Documents/taiwan/taiwan_combined_biom/leafNoPrim 

## reference-based chimera check on every primer-trimmed leaf file.
## The glob names the sample files explicitly: this directory also holds
## test_notchim.fasta, test.log and uchime_stdout.txt from the trial run,
## and "*" would feed those to usearch as well.
mkdir -p ../leaf_notchim

for i in leafNoPrim_*.fa; do
    echo "$i"
    j="../leaf_notchim/"${i/NoPrim/NotChim}
    k=${j%.fa}.log
    #echo $j
    #echo $k
    usearch -uchime_ref "$i" \
        -db "$ITS1_ref" \
        -nonchimeras "$j" \
        -uchimeout "$k" \
        -strand plus \
        -notrunclabels \
        &>> uchime_stdout.txt
done

## huh, probably should have done that on the optiplex.

## onto the wood reads. Where are they again?

## on optiplex:

ITS1_ref='/home/daniel/Documents/submissions/taibioinfo/UNITE/uchime_reference_dataset_28.06.2017/ITS1_ITS2_datasets/uchime_reference_dataset_ITS1_28.06.2017.fasta'


cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodFasta

(for i in *; do
echo $i
j="../woodNotChim/"${i/\.fasta/\.notChim\.fasta}
k=${j/\.fasta/\.log}
echo $j
echo $k
usearch -uchime_ref $i \
-db $ITS1_ref \
-nonchimeras $j \
-uchimeout $k \
-strand plus \
-notrunclabels \
&>> uchime_stdout.txt
done &) &

## leaf reads crashed my laptop. not sure if I can trust them. Rerun on the optiplex.

ITS1_ref='/home/daniel/Documents/submissions/taibioinfo/UNITE/uchime_reference_dataset_28.06.2017/ITS1_ITS2_datasets/uchime_reference_dataset_ITS1_28.06.2017.fasta'

## chimera-check the primer-trimmed reads (leafNoPrim/), not the raw
## demultiplexed files in demult/: those still carry 32 bases of
## barcode + primer at the start and 34 bases at the end.
## NOTE(review): assumes leafNoPrim/ sits next to demult/ on the optiplex
## (the primer-trimming loop writes to ../leafNoPrim/ from demult/) -- confirm.
cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNoPrim

mkdir -p ../leafNotChim

( for i in leafNoPrim_*.fa; do
    echo "$i"
    ## outputs are named leaf_<sample>.fa, the names the later steps use
    j="../leafNotChim/"${i/leafNoPrim/leaf}
    k=${j%.fa}.log
    echo "$j"
    echo "$k"
    usearch -uchime_ref "$i" \
        -db "$ITS1_ref" \
        -nonchimeras "$j" \
        -uchimeout "$k" \
        -strand plus \
        -notrunclabels \
        &>> ../leafNotChim/leafUchime_stdout.txt
done &) &

## how much did we lose?

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/ 

## leaves
grep -r '>' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/ | wc -l
## 14372161
grep -r '>' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/demult/ | wc -l
## 14372164


cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim

## 
grep -r '>' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodFasta | wc -l

grep -r '>' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim | wc -l


## then what?

## update notebook...

## so we need to think about clustering...

## first, fix labels to prep for the biom table

## second, get ITS1 out of the leaves and wood reads

## what do we need to do to fix labels?

## I think for OTU clustering, we may have problems. We're going to combine all reads,
## resulting in a pool too large for the 32bit version. 

## but let's try. 

##############

## combine all reads into one big file?

## let's homogenize the labels of the leaves and the wood. 
## we need them to be +/- the same. 
 
cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/ 

cat *log > ../leafChim.log

head -n 1 leaf_94.fa

head -n 1 leaf_43.fa

## can we create new labels?  

## copy all files to a new directory

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/ 
cp *.fa /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim
cp *.fasta /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/

## for the leaves:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/

## for the leaves:
for i in *fa; do
sed -i "s/>HWI.*/>${i%\.fa}/" $i
done

## check differences:
ls /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/ 

head -n 1 /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/leaf_100.fa
head -n 1 leaf_100.fa

head -n 1 /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/leaf_27.fa
head -n 1 leaf_27.fa

tail -n 10 /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/leaf_27.fa
tail -n 10 leaf_27.fa

## seems to work

## change read labels for the wood...

for i in *fasta; do 
j=${i#*NN-}
k=${j%%_*}
sed -i "s/>M.*/>$k/" $i
done

## check this:

ls *fasta

head lane1-s223-index-TGCGAACT-CTCCTGAANNNN-82w_S223_L001.notChim.fasta

head /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim/lane1-s223-index-TGCGAACT-CTCCTGAANNNN-82w_S223_L001.notChim.fasta

## looks good. Now concatenate...

######## ITSx ###########

## get ITSx going... pain in the ass...

## let's do all this on the optiplex, 

## make soft links to binaries in the usr/bin

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/hmmer-3.1b2-linux-intel-x86_64/binaries

for i in *; do
echo $i
sudo ln -s $PWD/$i /usr/bin/$i
done

## and repeat for optiplex

## and ITSx?

## for both comps:

cd /home/daniel/ITSx_1.0.11

sudo ln -s $PWD/ITSx /usr/bin/ITSx 
sudo ln -s $PWD/ITSx_db/ /usr/bin/ITSx_db

## seems to work, but let's try it out on something:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust

ITSx -i <(head leaf_9.fa) -o ITSx_test.txt

## nope. Is this the old database issue from before? Where are our 
## 

ITSx -i <(head leaf_9.fa) -p /home/daniel/ITSx_1.0.11/ITSx_db -o ITSx_test.txt --reset T

## doesn't work. Is it looking for the hmm directory?

## clean out binaries, let's just do this locally

cd /usr/bin

find ./ -type l -name "*hmm*"

## list, then delete, only the links that point into the hmmer and ITSx
## directories. Selecting by age alone (-ctime -2) would also remove any
## unrelated symlink that happened to change in /usr/bin in the last two days.
find ./ -maxdepth 1 -type l \
    \( -lname '*/hmmer-3.1b2-linux-intel-x86_64/*' -o -lname '*/ITSx_1.0.11/*' \)

find ./ -maxdepth 1 -type l \
    \( -lname '*/hmmer-3.1b2-linux-intel-x86_64/*' -o -lname '*/ITSx_1.0.11/*' \) \
    -exec sudo rm -- '{}' ';'

## mv ITSx to where we want it:

mv /home/daniel/hmmer-3.1b2-linux-intel-x86_64/ /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust
mv /home/daniel/ITSx_1.0.11/ /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust


../ITSx_1.0.11/ITSx -i <(head leaf_9.fa) -o ITSx_test.txt


## shit. go back and relink hmm binaries... ugh...

## try again:

../ITSx_1.0.11/ITSx -i <(head leaf_9.fa) -o ITSx_test.txt

find -type f -cmin -10

../ITSx_1.0.11/ITSx -i <(head -n 100 leaf_9.fa) -o ITSx_test.txt

## eh? no errors but outputs empty. Something about outdated hmms...

## this is supposed to fix the problem
../ITSx_1.0.11/ITSx -i <(head leaf_9.fa) --reset T -o ITSx_test.txt

## yup, that worked. Now a bigger subset:

head -n 200 leaf_9.fa

../ITSx_1.0.11/ITSx -i <(head -n 200 leaf_9.fa) --reset T -o ITSx_test.txt

less ITSx_test.txt.positions.txt

## oops, only read first line

../ITSx_1.0.11/ITSx -i <(head -n 200 leaf_9.fa) --preserve T --allow_single_domain -t F -o ITSx_test.txt

## huh. still only first line... can ITSx handle line breaks in the fasta?
## or is it the identical labels? Check:

leaf9old=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/leaf_9.fa

../ITSx_1.0.11/ITSx -i <(head -n 200 $leaf9old) --preserve T --allow_single_domain -t F -o ITSx_test2.txt

## yup, that worked. It's the labels...
## but it's also the line breaks - this is saying that 
## the SSU is 1-80 on every read, which happens to be
## how many characters are in a line. 

## how to get rid of line breaks? I know I wrote a script for this...

wget https://raw.githubusercontent.com/danchurch/taiwan_combined_biom/master/scripts/fasta_remove_linebreaks.py

## a freshly downloaded file is not executable; without this the
## ./ call fails with "Permission denied"
chmod +x fasta_remove_linebreaks.py

./fasta_remove_linebreaks.py "$leaf9old" leaf9noLB.fasta

clear

head $leaf9old 
head -n 4 leaf9noLB.fasta

clear
tail $leaf9old 
tail -n 4 leaf9noLB.fasta

## looks okay. So... 

../ITSx_1.0.11/ITSx -i <(head -n 100 leaf9noLB.fasta) --preserve T --allow_single_domain -t F -o ITSx_test3.txt

#rm leaf9noLB.fasta

## still getting 80. Coincidence? Why doesn't this match our old results of ITS starting at BP 47?

## try another:

leaf90old=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/leaf_90.fa

./fasta_remove_linebreaks.py $leaf90old leaf90noLB.fasta

../ITSx_1.0.11/ITSx -i <(head -n 100 leaf90noLB.fasta) --preserve T --allow_single_domain -t F -o ITSx_test4.txt

## same, 80 bp for the ssu.
## also, 62 bp of 5.8s

ls /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim 

## yup. 


leaf23old=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/leaf_23.fa
./fasta_remove_linebreaks.py $leaf23old leaf23noLB.fasta
../ITSx_1.0.11/ITSx -i <(head -n 100 leaf23noLB.fasta) --preserve T --allow_single_domain -t F -o ITSx_test6.txt

## same.

## hmmm. Why would this be different? The old pipeline showed us 46 bp into the ssu, and 30 bp
## into the 5.8s. ITS1 was usually ~180 bp long. 

## why would this be 80 bp instead of 46? where did we get the extra 34 bp? 

## not sure. Check wood:

ls /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim/

w49=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim/lane1-s199-index-TGCGAACT-TGTTCCGTNNNN-49w_S199_L001.notChim.fasta

./fasta_remove_linebreaks.py $w49 w49_noLB.fasta

../ITSx_1.0.11/ITSx -i <(head -n 200 w49_noLB.fasta) --preserve T --allow_single_domain -t F -o ITSx_w49.txt

less ITSx_w49.txt.positions.txt

## ah. Wood is back to the old numbers - 46 bp of ssu, 30 bp into 5.8s

## why not the leaves? Should be the same primers... does this have
## something with how we demultiplexed? 

## other differences?:  demultiplexing, previous script was qiime, current 
## is from 

## beh. not sure if it matters. 

head w49_noLB.fasta

## we have to go with what the ITSx extractor says. So chop off the first 80 bases and
## the last 62? seems odd.

## let's go with it, and return to this spot if there are problems downstream.

## clip off 80 bp from the 5' end and 62 bp from the 3' end of the leaf reads?

## ugh, something just seems weird. Just to check, do we know that our reads 
## are pointing the right way? Do we get a different answer from ITSx if they 
## are? Let's check.... 

## one read, the first from leaf_94:

## run ITSx on the first read of leaf_94, in the orientation it was sequenced
cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/demult
head -n 2 leaf_94.fa

../../ITSx_1.0.11/ITSx -i <(head -n 2 leaf_94.fa) --preserve T --allow_single_domain -t F -o ../../OTUclust/aa.txt

## -o pointed at OTUclust/aa.txt, so the extracted ITS1 is OTUclust/aa.txt.ITS1.fasta
less /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/aa.txt.ITS1.fasta

## the rc of this read, written to the file that ITSx is pointed at below.
## (Left bare in the notebook, the ">" header line would act as a shell
## redirect and the sequence line as a command.)

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust

cat > leaf9line1_RC.fasta <<'EOF'
>leaf9line1_RC
CGTGATAAGACGCTTGGTNATTTAGAGGAAGTAAAAGTCGTAACAAGGTCTCCGTTGGTGAACCAGCGGAGGGATCATTACTGAGTTACCGCTCTATAACCCTTTGTGAACGTACCTAACCGTTGCTTCGGCGGGCAGGGGAAGCCTCTCGCGGGCCTCCCCTCCCGGCGCCGGCCCCCACCACGGGGACGGGGCGCCCGCCGGAGGAAACCAAACTCTATTTACACGACGTCTCTTCTGAGTGGCACAAGCAAATAATTAAAACTTTTAACAACGGATCTCTTGGTTCTGGCATCGATGAAGAACGCAGCCGTCTTATCACG
EOF

../ITSx_1.0.11/ITSx -i leaf9line1_RC.fasta --preserve T --allow_single_domain -t F -o bb.txt

## compare the ITS1 that ITSx extracted from the read as sequenced (aa) with
## the one from its reverse complement (bb); line 2 of each file is the sequence
cat aa.txt.ITS1.fasta
aaa=$(sed -n '2p' aa.txt.ITS1.fasta)

cat bb.txt.ITS1.fasta
bbb=$(sed -n '2p' bb.txt.ITS1.fasta)

echo "$aaa"
echo "$bbb"

## two empty strings also compare equal, so check that both extractions
## actually produced a sequence before calling it a match
if [[ -z "$aaa" || -z "$bbb" ]]; then
  echo "missing ITS1 sequence in aa.txt.ITS1.fasta and/or bb.txt.ITS1.fasta" >&2
elif [[ "$aaa" == "$bbb" ]]; then
  echo matches!
else
  echo zoop!
fi


## seems able to find ITS with 5-3 or RC. 

## but oops, I think I found the bug - the 
## wrong reads were used to check for ITS. 
## I used reads from before the chimera check
## for the leaves...it looks like we did not 
## have the noprimer leaves on the optiplex...

## let's see if using the newer files fixes things:
find ~ -type f -name "uchime_reference_dataset_ITS1_28.06.2017.fasta"

ITS1_ref='/home/daniel/Documents/submissions/taibioinfo/UNITE/uchime_reference_dataset_28.06.2017/ITS1_ITS2_datasets/uchime_reference_dataset_ITS1_28.06.2017.fasta'

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNoPrim

## reference-based chimera check (uchime_ref) on every primer-trimmed leaf
## file; non-chimeric reads and a per-sample log are written to ../leafNotChim
mkdir -p ../leafNotChim

for i in *; do
  j="../leafNotChim/${i/NoPrim/NotChim}"
  k="${j%.fa}.log"
  usearch -uchime_ref "$i" \
    -db "$ITS1_ref" \
    -nonchimeras "$j" \
    -uchimeout "$k" \
    -strand plus \
    -notrunclabels \
    &>> ../leafNotChim/leafUchime_stdout.txt
done &
uchime_pid=$!

## the loop runs in the background: wait for it before touching the logs,
## otherwise only the logs finished so far get concatenated and deleted
wait "$uchime_pid"

## the per-sample logs are in ../leafNotChim, not in the current directory
cat ../leafNotChim/*.log > ../leafNotChim/leafChimlog.txt && rm ../leafNotChim/*.log

## where were we? We need to check these new chimera-checked
## leaf reads with ITSx:


## on optiplex
cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim

## on laptop
cd /home/daniel/Documents/taiwan/taiwan_combined_biom/leaf_notchim

## run ITSx on the first read of every chimera-checked leaf file.
## checkITS is not set anywhere in the visible notebook, which made -o expand
## to "/<name>.ITSx"; define it and create the directory.
checkITS=checkITS
mkdir -p "$checkITS"

## *.fa rather than *, so logs and earlier stdout captures are skipped
for i in *.fa; do
  echo "$i"
  ../../ITSx_1.0.11/ITSx \
    -i <(head -n 2 "$i") \
    --preserve T \
    --allow_single_domain \
    -t F \
    -o "$checkITS/${i%.fa}.ITSx" \
    &>> ITSx.stdout.txt
done

## okay, that didn't work. Why don't we add the first read of all fasta files into one:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim


## get this onto the laptop 

## leaves (only copy if the cd worked, so files never land in the wrong place)
cd /home/daniel/Documents/taiwan/taiwan_combined_biom/leaf_notchim &&
  scp 'daniel@192.168.1.7:/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim/*' .

## wood -- pull from the wood directory (woodNotChim), not the leaf one again
cd /home/daniel/Documents/taiwan/taiwan_combined_biom/wood_notchim &&
  scp 'daniel@192.168.1.7:/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim/*' .


## we gotta get rid of the linebreaks:
#wget https://raw.githubusercontent.com/danchurch/taiwan_combined_biom/master/scripts/fasta_remove_linebreaks.py

## strip line breaks from every leaf fasta and collect the first read of each
## into checkITS/allFirstReads.fa
mkdir -p checkITS
: > checkITS/allFirstReads.fa   # start empty so a re-run does not append duplicates

## *.fa rather than *, so the checkITS directory itself is not treated as input
for i in *.fa; do
  echo "$i"
  j="checkITS/${i%.fa}_noLB.fa"
  fasta_remove_linebreaks.py "$i" "$j"
  head -n 2 "$j" >> checkITS/allFirstReads.fa
done

## getting sleepy. 

## what's next? check ITS for all of these:


../../ITSx_1.0.11/ITSx \
-i checkITS/allFirstReads.fa \
--preserve T \
--allow_single_domain \
-t F \
-o checkITS/allFirstLeafReads

## cool, we are back in business. All of these reads tell us to 
## trim 46 bp off of the 5' of our leaf reads, and 30 bp off the 3'.

## can we do something like this for wood?

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim

## same as for the leaves: strip line breaks from every wood fasta and collect
## the first read of each into checkITS/allFirstWoodReads.fa
mkdir -p checkITS
: > checkITS/allFirstWoodReads.fa   # start empty so a re-run does not append duplicates

## *.fasta rather than *, so the checkITS directory itself is not treated as input
for i in *.fasta; do
  echo "$i"
  j="checkITS/${i%.fasta}_noLB.fa"
  fasta_remove_linebreaks.py "$i" "$j"
  head -n 2 "$j" >> checkITS/allFirstWoodReads.fa
done

../../ITSx_1.0.11/ITSx \
-i checkITS/allFirstWoodReads.fa \
--preserve T \
--allow_single_domain \
-t F \
-o checkITS/allFirstWoodReads

## not perfect, there are exceptions to the 46/30 bp, but mostly there.

## assuming this works, how do we clip these?

## we have two pools of reads with no line breaks...

## can fastx handle line breaks?

## try. Clip notchimera reads. 

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodNotChim

## trim each non-chimeric wood file to the ITS1 region: drop the first 46 bp
## (keep from base 47 on), then drop the last 30 bp
mkdir -p ../woodITSonly

for i in *fasta; do
  out="../woodITSonly/${i/notChim/ITSonly}"
  echo "${out##*/}"
  # fastx_trimmer does not allow -t together with -f, hence two passes;
  # fasta_formatter supplies the input because of the line-break question above
  fasta_formatter -i "$i" | fastx_trimmer -f 47 | fastx_trimmer -t 30 -o "$out"
done

## and leaves:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafNotChim

## Trim each leaf file to the ITS1 region: fasta_formatter feeds the reads to
## fastx_trimmer, which keeps from base 47 onward (-f 47); a second
## fastx_trimmer then removes 30 bases from the 3' end (-t 30).
## NOTE(review): ${i/notChim/ITSonly} is case-sensitive. If these files carry
## "NotChim" in their names (as the uchime loop's ${i/NoPrim/NotChim} suggests),
## nothing is substituted and the outputs keep their NotChim names -- confirm
## before relying on an "ITSonly" file name downstream.
for i in *fa; do
echo $i
#echo ../leafITSonly/${i/notChim/ITSonly}
fastx_trimmer -i <(fasta_formatter -i $i)  -f 47 | fastx_trimmer -t 30 -o ../leafITSonly/${i/notChim/ITSonly}
done

## ok, where did I leave off?

for i in *fa; do 
mv $i ${i/notChim/ITSonly}
done
## looks like all reads have been clipped to +/- the ITS1 region. 

## now we need to change the labels, right?


cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafITSonly

## for the leaves: replace every read header (they start with ">HWI") with the
## sample name taken from the file name. -i edits the files in place, as the
## wood loop does; without it the relabelled reads only scroll past on stdout.
for i in *fa; do
  echo "$i"
  sed -i "s/>HWI.*/>${i%.fa}/" "$i"
done

## seems to work

## change read labels for the wood...

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodITSonly

for i in *fasta; do 
j=${i#*NN-}
k=${j%%_*}
#echo $i
#echo $k
sed -i "s/>M.*/>$k/" $i
done

## looks good. Now what?

## combine them. derep, sort, 

## combine:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust

leafITSonly=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trim_leaves/leafITSonly
woodITSonly=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/trimmed_wood/woodITSonly

cat $leafITSonly/* $woodITSonly/* > allReads.fasta

## did they all make it in there?
wc -l allReads.fasta ## 36203910
echo $(( 36203910 / 2 )) ## 18101955

## 18,101,955

## is this all the reads we have right now?

## this includes the unmatched reads, I think that's 
## okay...

## derep:

usearch -derep_fulllength allReads.fasta -fastaout allReads_derep.fasta -sizeout 

usearch -derep_fulllength allReads.fasta -fastaout allReads_derep.fasta -sizeout &> derep.log

## doesn't work. too big

## can we break it up?

w61=lane1-s208-index-TCTCCGAT-GATACCTGNNNN-61w_S208_L001.ITSonly.fasta

echo $woodITSonly/$w61

ls $woodITSonly/$w61


usearch -derep_fulllength  $woodITSonly/$w61 -fastaout test.fasta -sizeout 

usearch -derep_fulllength allReads.fasta -fastaout allReads_derep.fasta -sizeout &> derep.log

## what if we split up the reads:

wc -l allReads.fasta

split -l 18101956 allReads.fasta splitReads

tail splitReadsaa
head splitReadsaa

## dereplicate each half separately
usearch -derep_fulllength splitReadsaa -fastaout splitReadsaa_derep -sizeout

## second half: the input must be splitReadsab, not splitReadsaa again
usearch -derep_fulllength splitReadsab -fastaout splitReadsab_derep -sizeout

## can we now cat this and use derep to check for reads that are in both halves:

cat splitReadsaa_derep splitReadsab_derep > splitReads_recom


usearch -derep_fulllength splitReads_recom -fastaout ReadsDupes -sizeout 

head splitReadsaa_derep

head ReadsDupes

grep "^>" ReadsDupes 

grep "^>" splitReadsaa_derep | wc -l ## 9050978

grep "^>" splitReadsab_derep | wc -l ## 9050977


grep "^>" ReadsDupes | wc -l ## 1114379 reads occur in both halves

## how do we use this?

## the quantity of each one of these should be combined from both files


## ah fuck it. Let's try vsearch:

vsearch --derep_fulllength allReads.fasta --output allReads_derep.fasta --sizeout 
## seems to be a bug, sets default minseqlength to 32?

## get rid of singletons for clustering
vsearch --derep_fulllength allReads.fasta \
--output allReads_derep.fasta \
--sizeout \
--minseqlength 1 \
--minuniquesize 2 \
&> derep_stdout.log

## whoah, that was fast. Did it work?

wc -l allReads_derep.fasta

head allReads_derep.fasta

tail allReads_derep.fasta

## looks good, except the damn line breaks
## why do some programs insert these, others
## can't handle them?

## anyway, next step? will usearch work for us now?
## singletons have already been removed

usearch -sortbysize allReads_derep.fasta -fastaout allReads_sorted.fasta &> usearch_sort_stdout.log

## now try clustering? 

usearch -cluster_smallmem allReads_sorted.fasta \
-id 0.95 \
-centroids otus_95_combo.fasta \
-sizein \
-sizeout \
-sortedby size \
|& tee clust_stdout.log

##  did this work?

head otus_95_combo.fasta

wc -l otus_95_combo.fasta

grep "^>" otus_95_combo.fasta | wc -l

## 12601 OTUs


## okay, seems comparable to what we saw previously. Not exactly the same
## (found 10,000 previously), but close enough.

## okay, let's update the notebook. 

## now what?


## get utax unite database 
## remove anything less than class id?
## assign tax
## make biom table 

## but let's be done tonight. 

########## Assign taxonomy  ############

## we didn't have a lot of luck last time
## with UTAX's id assignments, never figured out
## how to train the algorithm, etc. 

## but let's try again. First, we need to get the 
## UTAX UNITE database:

## we'll follow instructions here: https://drive5.com/usearch/manual/utax_its.html

## download the  UNITE UTAX database version 7, see if it has
## backwards compatibility with usearch 8.x:

cd /home/daniel/Documents/submissions/taibioinfo/UNITE

wget https://drive5.com/utax/data/utax_unite_v7.tar.gz

tar -xzvf utax_unite_v7.tar.gz

## get the db and taxconfs file for ITS1

## some useful shortcuts:
ITS1db=/home/daniel/Documents/submissions/taibioinfo/UNITE/utaxref/unite_v7/fasta/refdb.fa
ITS1tf=/home/daniel/Documents/submissions/taibioinfo/UNITE/utaxref/unite_v7/taxconfs/its1.tc
OTUs=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/otus_95_combo.fasta

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/assTax

## now make the UDB... do we have memory for this?

usearch -makeudb_utax $ITS1db  -taxconfsin $ITS1tf -output ITS1tax.udb |& tee assTax.log

## looks okay. Now run UTAX:

usearch -utax $OTUs -db ITS1tax.udb -utaxout OTUtax.txt -strand plus 

usearch -utax $OTUs -db ITS1tax.udb -fastaout OTUs_95_assTaxed.fasta -strand plus 

## same run, keeping a copy of usearch's messages; "|&" needs a command to
## pipe into, so tee writes the log
usearch -utax "$OTUs" -db ITS1tax.udb -fastaout OTUs_95_assTaxed.fasta -strand plus |& tee OtuAssTaxed.log

## ugh, these are horrible. 

## let's not invest too much into these. Throw them on there, 
## but if we want to look closer at them, blast them individually

## so what next? can we make a biom table out of it?

## NOTE(review): early attempts, kept for the record. The first command gives
## -usearch_global no query file (the next token is -db); the second names
## combo.fasta and otus_97_combo_asstax.fasta, which do not appear elsewhere
## in the visible notebook. The biom table is built further down with
## -usearch_global $allReads -db OTUs_95_assTaxed.fasta.
usearch -usearch_global -db OTUs_95_assTaxed.fasta -strand both -id 0.97 -biomout combo_otu_97.biom

usearch -usearch_global combo.fasta -db otus_97_combo_asstax.fasta -strand both -id 0.97 -biomout combo_otu_97.biom

ls -l /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/otus_95_combo.fasta 

head /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/otus_95_combo.fasta 

## hmm. don't our OTUs need unique labels? 

## oh yea, we wrote a script for this:
#wget https://raw.githubusercontent.com/danchurch/taiwan_combined_biom/master/scripts/addOTUtag.py

## we are still in assTax here, but the OTU fasta is in OTUclust and the
## relabelled copy is read back from OTUclust below, so give full paths
OTUdir=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust

./addOTUtag.py "$OTUdir/otus_95_combo.fasta" OTU "$OTUdir/otus_95_combo_relab.fasta"

## now redo the taxonomy assignments, even though they suck:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/assTax

OTUs=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/otus_95_combo_relab.fasta

usearch -utax $OTUs -db ITS1tax.udb -strand plus -fastaout OTUs_95_assTaxed.fasta

## now try to make a biom table:

cd /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/assTax

allReads=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/allReads.fasta

usearch -usearch_global $allReads -db OTUs_95_assTaxed.fasta -strand both -id 0.95 -biomout combo_otu.biom |& tee makebiom.log

## that's gotta sit for a while...

chmod 444 combo_otu.biom

########### biom metadata and phyloseq ###############

## okay, how's it look? What will it take to get phyloseq to digest this?

## check it with the biom python package:

biom validate-table -i combo_otu.biom 

## the usual, weird small problems.

grep rows -A 5 combo_otu.biom 

grep "g:Pesotum(0.0202)" combo_otu.biom

grep "g:Pesotum(0.0202)" combo_otu.biom -A 5 -B 5

## not sure why, but this otu isn't identified. Can we find it otherwise?
 
grep "g:Pesotum(0.0202)" /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/otus_95_combo_relab.fasta
## oops, no classifications there. 

## actually we want to look here
grep "g:Pesotum(0.0202)" /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/assTax/OTUs_95_assTaxed.fasta
## OTU6797:leafNotChim_111

grep '"id":""' combo_otu.biom 

## can we fix this with SED?

sed '/"id":""/ s/"id":"",/"id":"OTU6797:leafNotChim_111",/' combo_otu.biom > combo_otu_corrected.biom

grep '"id":""' combo_otu_corrected.biom 

grep "g:Pesotum(0.0202)" OTUs_95_assTaxed.fasta

grep "g:Pesotum(0.0202)" combo_otu_corrected.biom 
## okay, looks good. let's just do this with the original file: 

sed '/"id":""/ s/"id":"",/"id":"OTU6797:leafNotChim_111",/' combo_otu.biom -i

grep "g:Pesotum(0.0202)" combo_otu.biom 

## time stamp.. who cares?

## let's see what phyloseq does with it. I think I recall that some 
## sort of modifications of the tax info are necessary, but let's see.

R 

library('phyloseq')

## ugh, install phyloseq...20 minutes later...

## first try: parse the taxonomy strings as greengenes-style (k__, p__, ...)
Ubiom <- import_biom('combo_otu.biom', parseFunction=parse_taxonomy_greengenes)

## second try: parseFunction takes the function itself; writing
## parse_taxonomy_default() calls it with no arguments instead of passing it
Ubiom <- import_biom('combo_otu.biom', parseFunction=parse_taxonomy_default)

## neither works. 

## we had a script for reformatting our taxonomy. Does this still work?

less ../scripts/format_tax.py

../scripts/format_tax.py combo_otu.biom combo_otu_reformatted.biom

grep rows combo_otu_reformatted.biom -A 10

## not working. Why not? 

## not sure. Can SED do the trick for us? It's late and I don't want to debug:

echo "<b>foo</b>bar" 

echo "<b>foo</b>bar" | sed 's/<.*>//g'

echo "<b>foo</b>bar" | sed 's/<[^>]*>//g'

echo "James Bond" | sed -E 's/(.*) (.*)/The name is \2, \1 \2./'

clear

grep rows combo_otu.biom -A 5

#######################################
# Rewrite the UTAX taxonomy strings of a usearch-written biom (JSON) file as
# the greengenes-style list that parse_taxonomy_greengenes is given below:
#   "taxonomy":"d:Fungi(1.0000),p:Ascomycota(0.9900)"
#     -> "taxonomy":["k__Fungi","p__Ascomycota"]
# The expressions run in order on each line: strip the "(confidence)" values,
# open the list, add a temporary ",]" before the closing braces so that every
# rank ends in a comma, quote and re-prefix each rank (d: becomes k__), then
# remove the temporary comma. Only the first taxonomy entry on a line is
# converted, so the input needs one row per line.
# Arguments: input file(s); reads stdin when none are given
# Outputs:   the converted text on stdout
#######################################
utax_biom_to_greengenes() {
  sed -E \
    -e '/taxonomy/ s/\([0-1]\.[0-9]*\)//g' \
    -e 's/("taxonomy")(:")/\1:[/' \
    -e 's/"}}/,]}}/' \
    -e '/taxonomy/ s/(d:)([^,]*)/"k__\2"/' \
    -e '/taxonomy/ s/(p:)([^,]*)/"p__\2"/' \
    -e '/taxonomy/ s/(c:)([^,]*)/"c__\2"/' \
    -e '/taxonomy/ s/(o:)([^,]*)/"o__\2"/' \
    -e '/taxonomy/ s/(f:)([^,]*)/"f__\2"/' \
    -e '/taxonomy/ s/(g:)([^,]*)/"g__\2"/' \
    -e '/taxonomy/ s/(s:)([^,]*)/"s__\2"/' \
    -e '/taxonomy/ s/,]}}/]}}/' \
    "$@"
}

utax_biom_to_greengenes combo_otu.biom > combo_otu_relab.biom


grep rows combo_otu.biom -A 5

grep rows combo_otu_relab.biom -A 5


## work in R?


#### R

R 

library("phyloseq")

biom95 <- import_biom("combo_otu_relab.biom", parseFunction=parse_taxonomy_greengenes)

colnames(tax_table(biom95))

## okay... worked... now sample metadata

##########

## we have a metadata table in the folder - does it work for us?

biom add-metadata -i combo_otu_relab.biom -o combo_otu_wMeta.biom -m meta_2016.1.2.tsv --output-as-json

biom validate-table -i combo_otu_wMeta.biom

vim combo_otu_relab.biom

R 
library("phyloseq")

biom95 <- import_biom("combo_otu_wMeta.biom", parseFunction=parse_taxonomy_greengenes)

colnames(tax_table(biom95))

########

grep '{"id":' combo_otu_relab.biom

grep columns combo_otu_relab.biom -A 3 -B 3

grep columns combo_otu_relab.biom -A 200

## we need the 

## we can get our new ids off our biom table, put into a separate file, and clean it up a little:

cut newIDs.txt -d : -f 2 | cut -d , -f 1 

cut newIDs.txt -d : -f 2 | cut -d , -f 1 > IDlabels.txt


wc -l IDlabels.txt ## 230

## what do we do with this?

cp meta_2016.1.2.tsv meta_2018.06.14.tsv

head meta_2018.06.14.tsv

## ugh. We got some serious qc to do here. 

cut -f 1 meta_2018.06.14.tsv  | wc -l
cut -f 1 meta_2018.06.14.tsv  | wc -l

## we need to correct our meta table with new sample names,
## or the reverse, change the sample names in our biom table.

## chance for propagating errors seems lower if we leave 
## our biom table alone. 

## So how do we change our metatable ids to match our new biom table?

## well regardless, we do need to get rid of the "NotChim" in our 
## leaf labels in the biom table...


grep 'NotChim' combo_otu_relab.biom

## only the first 20 matching lines ('NotChim' had been left in the head
## command, where it is taken as a file name)
grep 'NotChim' combo_otu_relab.biom | head -n 20

## right, our OTU centroids are named with this also. Does this 
## matter? Don't think so, the unique OTU number should be all 
## we need if we need to come back to these....

head -n 20 combo_otu_relab.biom | sed -E 's/(leaf)(NotChim_)([0-9]*)/\3\1/g'

head -n 2000 combo_otu_relab.biom | sed -E 's/(leaf)(NotChim_)([0-9]*)/\3\1/g'

sed -E -i 's/(leaf)(NotChim_)([0-9]*)/\3\1/g' combo_otu_relab.biom

## actually, that should take care of the main leaf id issue. 

## the wood is probably more easily taken care of on the metadata form, 
## the old one had the illumina sample name as the id instead of our 
## experimental sample id. Should be as simple as moving columns around 
## to use the experimental sample id. 

## wood sample ids look like "49w". Quote the pattern so the shell cannot glob
## it, and require at least one digit ("[0-9]*w" matches any bare "w").
grep -E '[0-9]+w' combo_otu_relab.biom

head meta_2018.06.14.tsv  

tail meta_2018.06.14.tsv  

grep "rows" combo_otu_relab.biom -A 10

grep "columns" combo_otu_relab.biom -A 250 | less

grep '"id":"[0-9]*w"' combo_otu_relab.biom | wc -l

## weird, somewhere we lost Graham's controls. Don't really need it, but 
## we should either include them or redo the clustering. 

## where did we lose them? 

grep 'Dc-X' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/assTax/combo_otu.biom ## they're not in the biom

vim /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/assTax/combo_otu.biom ## they're not in the biom

grep 'Dc-X' combo_otu_relab.biom ## they're not in the biom

grep 'Dc-X' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/allReads.fasta

grep 'Dc-X' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/allReads_sorted.fasta
## they're in there

grep 'Dc-X' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/allReads_derep.fasta
## and there

grep 'Dc-X' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/allReads_sorted.fasta
## yup, there too. 

grep 'Dc-X' /home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/otus_95_combo.fasta
## yeah, otus are named with these samples

grep 'Dc-X' OTUclust/otus_95_combo_relab.fasta
## in there...

grep 'Dc-X' ./assTax/OTUs_95_assTaxed.fasta

## it looks like it is the formation of the table itself, using the usearch_global command. What if we 
## sub out "-" for "_" in these negative controls?

## it is probably getting the sample ids from the headers of the massive, all-read fasta file. Change them here:

allReads=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/OTUclust/allReads.fasta

tail $allReads

grep 'Dc-X' $allReads -A 1 | head 

grep 'Dc-X' $allReads -A 1 | tail

## print the Dc-X header lines; sed needs the file, otherwise it waits on stdin
sed -n '/Dc-X/p' "$allReads"


grep 'Dc-PosG' $allReads -A 1 | wc -l

sed -E '/>Dc-X/ s/(Dc)-(X)/\1_\2/g' <(grep 'Dc-X' $allReads -A 1 | head )
sed -E '/>Dc-PosG/ s/(Dc)-(PosG)/\1_\2/' <(grep 'Dc-PosG' $allReads -A 1 | head )
sed -E '/>Dc-PosI/ s/(Dc)-(PosI)/\1_\2/' <(grep 'Dc-PosI' $allReads -A 1 | head )
sed -E '/>Dc-Neg/ s/(Dc)-(Neg)/\1_\2/' <(grep 'Dc-Neg' $allReads -A 1 | head )

## works

## okay, gotta go. But the plan - sed out the hyphens in these
## graham controls to underscores. Then rerun the usearch global
## command, see if they retain their individual samples instead 
## of getting lumped. 

sed -E '/>Dc-X/ s/(Dc)-(X)/\1_\2/g' $allReads |\
sed -E '/>Dc-PosG/ s/(Dc)-(PosG)/\1_\2/' |\
sed -E '/>Dc-PosI/ s/(Dc)-(PosI)/\1_\2/' |\
sed -E '/>Dc-Neg/ s/(Dc)-(Neg)/\1_\2/' > allReads_con.fasta

## did that work?

grep 'Dc-X' allReads_con.fasta

grep 'Dc_X' allReads_con.fasta

## yup

## does that improve our biom table?

allReadsCon=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/assTax/allReads_con.fasta

usearch -usearch_global $allReadsCon -db OTUs_95_assTaxed.fasta -strand plus -id 0.95 -biomout combo_otu_test.biom |& tee makebiom_test.log

grep 'Dc_X' combo_otu_test.biom 
grep 'Dc-X' combo_otu_test.biom ## OTUs only, not IDs

grep 'Dc-Neg' combo_otu_test.biom 

grep 'Dc_Neg' combo_otu_test.biom 

## seems to have worked. Not going to worry about the OTU names. 

## now push it through the pipeline, replace the old one:

mv combo_otu_test.biom combo_otu.biom
 
#######################################
# Rewrite the UTAX taxonomy strings of a usearch-written biom (JSON) file as
# a greengenes-style list, e.g.
#   "taxonomy":"d:Fungi(1.0000),p:Ascomycota(0.9900)"
#     -> "taxonomy":["k__Fungi","p__Ascomycota"]
# The expressions run in order on each line: strip the "(confidence)" values,
# open the list, add a temporary ",]" before the closing braces so that every
# rank ends in a comma, quote and re-prefix each rank (d: becomes k__), then
# remove the temporary comma. Only the first taxonomy entry on a line is
# converted, so the input needs one row per line.
# Arguments: input file(s); reads stdin when none are given
# Outputs:   the converted text on stdout
#######################################
utax_biom_to_greengenes() {
  sed -E \
    -e '/taxonomy/ s/\([0-1]\.[0-9]*\)//g' \
    -e 's/("taxonomy")(:")/\1:[/' \
    -e 's/"}}/,]}}/' \
    -e '/taxonomy/ s/(d:)([^,]*)/"k__\2"/' \
    -e '/taxonomy/ s/(p:)([^,]*)/"p__\2"/' \
    -e '/taxonomy/ s/(c:)([^,]*)/"c__\2"/' \
    -e '/taxonomy/ s/(o:)([^,]*)/"o__\2"/' \
    -e '/taxonomy/ s/(f:)([^,]*)/"f__\2"/' \
    -e '/taxonomy/ s/(g:)([^,]*)/"g__\2"/' \
    -e '/taxonomy/ s/(s:)([^,]*)/"s__\2"/' \
    -e '/taxonomy/ s/,]}}/]}}/' \
    "$@"
}

utax_biom_to_greengenes combo_otu.biom > combo_otu_relab.biom

## look okay?

head -n 20 combo_otu_relab.biom

vim  combo_otu_relab.biom


## get this to the laptop

## okay.. so back to getting the metadata into shape:

## strategy? there's only ~ 230 samples... get both and put them 
## side-by-side, something will hit me...

vim combo_otu_relab.biom

head newIDs.txt

cut -d : -f 2 newIDs.txt | cut -d , -f 1 | sed 's/"//g' > change_these_IDs.txt


## these names need to go into the new biom table at the correct spots, so 
## that the sample metadata can be assigned. 

## not sure what to do with the unmatched reads. Probably should get rid of them 
## before making the biom table?...

## meh. Keep them in there, remove them with phyloseq. 

## so place the new names into the new metadata file, see if it works:

biom add-metadata -i combo_otu_relab.biom -o combo_otu_wMeta.biom --sample-metadata-fp  meta_2018.06.14.tsv --output-as-json

## did this work?

grep columns combo_otu_wMeta.biom  -A 15

head -n 100 combo_otu_relab.biom  


head -n 20 combo_otu_wMeta.biom | sed "s/\[\}\}\,/\[\}\}\,\n/"

head -n 100 combo_otu_relab.biom | sed "s/\[\}\}\,/\[\}\}\,\n/"

## right, it's all one line now, grep sucks for this.

## sooo.... does phyloseq like it?

vim combo_otu_wMeta.biom

R 
library("phyloseq")

biom95 <- import_biom("combo_otu_wMeta.biom", parseFunction=parse_taxonomy_greengenes)

## nope

## does R still like the file, pre-metadata?


biom95 <- import_biom("combo_otu_relab.biom", parseFunction=parse_taxonomy_greengenes)

## nope. What happened? Did we do all of our cleaning steps?

biom validate-table -i combo_otu_relab.biom


grep "g__Pesotum" combo_otu_relab.biom -A 2 -B 2

grep "g:Pesotum" combo_otu.biom -A 2 -B 2

## try again to add metadata:
biom add-metadata -i combo_otu_relab.biom -o combo_otu_wMeta.biom --sample-metadata-fp  meta_2018.06.14.tsv --output-as-json

biom validate-table -i combo_otu_wMeta.biom

## see if this helped with R:
R 
library("phyloseq")

biom95 <- import_biom("combo_otu_wMeta.biom", parseFunction=parse_taxonomy_greengenes)
## doesn't work

biom95 <- import_biom("combo_otu_wMeta.biom")

## still not working. Would like to look at the file but it is a single-line 
## mess. Can we fix this?

sudo pip install jsbeautifier

js-beautify combo_otu_wMeta.biom > test.biom

biom validate-table -i test.biom

## looks okay. Why won't phyloseq accept this?

R 

library("phyloseq")
library("biomformat")

biom95 <- import_biom("combo_otu_wMeta.biom", parseFunction=parse_taxonomy_greengenes)
## don't work 

aa <- read_biom("combo_otu_wMeta.biom")

sample_metadata(aa)
## read_biom finds the metadata just fine...

## maybe it's our unmatched reads? They have no metadata, could be screwing 
## with phyloseq's parsing?

## back up to...?

## I think we want to get rid of anything with the unmatched header in our allReads_con.fasta:

## back on optiplex:
allReadsCon=/home/daniel/Documents/submissions/taibioinfo/taiwan_combined_biom/assTax/allReads_con.fasta

head $allReadsCon

tail $allReadsCon

grep "^>" $allReadsCon | uniq | grep unmatched

## how to delete all of these with the >leafNotChim_unmatched name?

cat <(head $allReadsCon) <(tail $allReadsCon) 

cat <(head $allReadsCon) <(tail $allReadsCon) | sed '/>leafNotChim_100/,+1d'

cat <(head $allReadsCon) <(tail $allReadsCon) | sed '/>PosI/,+1d'

cat <(head $allReadsCon) <(tail $allReadsCon) | sed -e '/>leafNotChim_100/,+1d'

## so if we want to delete reads that start with  "leafNotChim_unmatched"


sed '/>leafNotChim_unmatched/,+1d' $allReadsCon > test.fasta

grep "^>" test.fasta | uniq | wc -l

grep "^>" $allReadsCon | uniq | wc -l

## seems to work. do it for real:

sed -i '/>leafNotChim_unmatched/,+1d' $allReadsCon 

## redo biom table:

usearch -usearch_global $allReadsCon -db ./assTax/OTUs_95_assTaxed.fasta -strand plus -id 0.95 -biomout combo_otu.biom |& tee makebiom.log

scp combo_otu.biom makebiom.log daniel@192.168.1.4:/home/daniel/Documents/taiwan/taiwan_combined_biom/

## did that help at all?


R 
library("phyloseq")
library("biomformat")

biom95_meta <- import_biom("combo_otu_wMeta.biom", parseFunction=parse_taxonomy_greengenes)
## nope


## not sure. maybe a better approach is to add the metadata in phyloseq?

## I think we can use the merge_phyloseq command, if we can get phyloseq
## to see our metadata....

biom95 <- import_biom("combo_otu_relab.biom", parseFunction=parse_taxonomy_greengenes)

aa <- read_biom("combo_otu_wMeta.biom")

bb <- sample_metadata(aa)

sample_data(biom95) <- bb

## nope. Try importing the metadata table directly?

dd <- read.csv('meta_2018.06.14.tsv', sep = '\t') ## read in the metadata
colnames(dd)[1] <- "SampleID" ## clean up column names

## do our rownames in this metadata dataframe match our sample names?
all(rownames(dd) %in% sample_names(biom95)) 
## nope. Why not?

rownames(dd) ## ah. our new dataframe has the default numeric rownames

## change our rownames to our sample names:
rownames(dd) <- dd$SampleID

rownames(dd) 

sample_names(biom95)

rownames(dd) %in% sample_names(biom95)


rownames(dd)[208]

"112A" %in% sample_names(biom95)

## did this help?

sample_data(biom95) <- dd

sample_data(biom95)

## looks good to me. Update notebook. 

## then look at reviewers again, and eco notebook. 

## You have ~12 days. come up with a strategy.


sample_names(biom95)