exploreReads.txt

## let's take a look at Sulari and Arne's reads

### set up ssh key for this. ###
git remote set-url origin git@github.com:danchurch/fichtelgebirgeSoils.git

## might need to grab files quickly between computers. a template

laptopLoc="/home/daniel/Documents/projects/fichtelgebirge_project/sulariArneSoils/fichtelgebirgeSoils"
nanoCompLoc="/media/vol1/daniel/sulariArne/illuminaReads/sulariPhyloseqObject.rda"
scp -r -i ~/.ssh/id_ed25519 test@132.180.112.115:$nanoCompLoc $laptopLoc

## or something like that


###########################################

## now, much of this analysis will likely have to be done in R,
## on a computer with sufficient memory.

## the goal would be to contain this analysis to R
## so we'll use dada2 in R

## for the moment, let's use the lab computer,

## let's look in our files for primers, etc:

conda activate 

conda create -n readQC -c bioconda cutadapt

readDir="/media/vol1/daniel/sulariArne/illuminaReads/goodReads"

## let's do this old-school:

cd $readDir

gunzip -k *

cat *fq > allSulariArneReads.fq

mkdir /media/vol1/daniel/sulariArne/illuminaReads/fastqcOut

## set variables
file="/media/vol1/daniel/sulariArne/illuminaReads/goodReads/allSulariArneReads.fq"
outdir="/media/vol1/daniel/sulariArne/illuminaReads/fastqcOut"
fastqc -t 10 -o  $outdir $file  &


path2put=/home/daniel/Documents/projects/fichtelgebirge_project/sulariArneSoils/readReport/
path2get="/media/vol1/daniel/sulariArne/illuminaReads/fastqcOut/"
scp -r -i ~/.ssh/id_ed25519 test@132.180.112.115:$path2get $path2put

## quality looks great. 

## split by direction and check again

## on nanoComp

conda activate 

cd /media/vol1/daniel/sulariArne/illuminaReads

readDir="/media/vol1/daniel/sulariArne/illuminaReads/goodReads"

cat $readDir/*good_1.fq > R1_SulariArneReads.fq &
cat $readDir/*good_2.fq > R2_SulariArneReads.fq

r1=/media/vol1/daniel/sulariArne/illuminaReads/R1_SulariArneReads.fq
r2=/media/vol1/daniel/sulariArne/illuminaReads/R2_SulariArneReads.fq
outdir="/media/vol1/daniel/sulariArne/illuminaReads/fastqcOut"
fastqc -t 10 -o $outdir $r1 &
fastqc -t 10 -o $outdir $r2 &

## local machine:
path2put=/home/daniel/Documents/projects/fichtelgebirge_project/sulariArneSoils/fichtelgebirgeSoils
path2get="/media/vol1/daniel/sulariArne/illuminaReads/Envt_Matrix.csv"
scp -r -i ~/.ssh/id_ed25519 test@132.180.112.115:$path2get $path2put

firefox *html

## all looks pretty amazing. For each (R1 and R2) We have 15510344/195 = 79540 reads per sample. 
## after pairing. 
## pretty much what they reported. 

## look for primers...the first ten or so basepairs are highly conserved in both
## are these remnants of the primers? 

## back on nanoComp
 
head R1_SulariArneReads.fq

wc -l R1_SulariArneReads.fq

## here are the primers used as reported by the company.
GTGYCAGCMGCCGCGGTAA 

len('GTGYCAGCMGCCGCGGTAA') ##19

GGACTACNVGGGTWTCTAAT

len('GGACTACNVGGGTWTCTAAT') ##20

## these look like the latest earth microbiome, parada primers

## 515 forward, parada, in R1?:
grep -c "^GTG.CAGC.GCCGCGGTAA" R1_SulariArneReads.fq ## 14903512 reads
grep -c GTG.CAGC.GCCGCGGTAA R1_SulariArneReads.fq ## again 14903512 reads, out of 
grep -c "@A01720" R1_SulariArneReads.fq ## 15510344
wc -l R1_SulariArneReads.fq ## 15510344
## 806R, in R1 reads?:
grep -c "GGACTAC..GGGT.TCTAAT" R1_SulariArneReads.fq ## 0 reverse primers, that's good
## reverse complement:
## reverse complement of GGACTACNVGGGTWTCTAAT is ATTAGAWACCCBNGTAGTCC; every
## degenerate base (W, B, N) has to be a "." wildcard in the pattern. The earlier
## pattern kept a literal N, so it only matched reads containing an actual "N".
grep -c "ATTAGA.ACCC..GTAGTCC" R1_SulariArneReads.fq ## expect 0 reverse RC primers; re-run to confirm

## 515 forward, parada, in R2?:
grep -c "^GTG.CAGC.GCCGCGGTAA" R2_SulariArneReads.fq ## 0, good

## 806R, in R2 reads?:
grep -c "GGACTAC..GGGT.TCTAAT" R2_SulariArneReads.fq ## 15170340, out of 15510344
## reverse complement:
## reverse complement of GGACTACNVGGGTWTCTAAT is ATTAGAWACCCBNGTAGTCC; every
## degenerate base (W, B, N) has to be a "." wildcard in the pattern. The earlier
## pattern kept a literal N, so it only matched reads containing an actual "N".
grep -c "ATTAGA.ACCC..GTAGTCC" R2_SulariArneReads.fq ## expect 0; re-run to confirm

## this looks like really good data. 

## let's clip these primers, and get on to dada2

## make directories for the uncompressed, separated R1/R2 reads

readDir="/media/vol1/daniel/sulariArne/illuminaReads/goodReads"
allReadsPrimersClippedDir="/media/vol1/daniel/sulariArne/illuminaReads/goodReads_primerClipped"
#mkdir $allReadsPrimersClippedDir
readsPrimersClippedDir="/media/vol1/daniel/sulariArne/illuminaReads/goodReads_primerClipped"

conda activate readQC

cd $readDir

cd $allReadsPrimersClippedDir

## something like this?

## clip the forward primer from R1 (-g) and the reverse primer from R2 (-G)
## for every read pair in the current directory. Outputs are written next to
## the inputs as *_1_trimmed.fq / *_2_trimmed.fq.
for iR1 in *good_1.fq; do
  ## if the glob matched nothing, bash hands us the literal pattern: skip it
  [ -e "$iR1" ] || continue
  ## derive the mate and output names by stripping/appending suffixes
  iR2="${iR1%_1.fq}_2.fq"
  echo "$iR1"
  outR1="${iR1%.fq}_trimmed.fq"
  outR2="${iR2%.fq}_trimmed.fq"
  ## quote every expansion so unusual characters in file names cannot split words
  cutadapt -g GTGYCAGCMGCCGCGGTAA -G GGACTACNVGGGTWTCTAAT \
    -o "$outR1" -p "$outR2" "$iR1" "$iR2"
done


## did that work? 

cd /media/vol1/daniel/sulariArne/illuminaReads/uncompressedReads

head -n4 Bacteria_BM663-01M0087_good_1.fq

head -n4 Bacteria_BM663-01M0087_good_1_trimmed.fq

clear
tail -n4 Bacteria_BM663-01M0087_good_1.fq
tail -n4 Bacteria_BM663-01M0087_good_1_trimmed.fq

tail -n4 Bacteria_BM663-01M0087_good_2.fq
tail -n4 Bacteria_BM663-01M0087_good_2_trimmed.fq

grep "^GTG.CAGC.GCCGCGGTAA" Bacteria_BM663-01M0087_good_1.fq ## lots
grep "^GTG.CAGC.GCCGCGGTAA" Bacteria_BM663-01M0087_good_1_trimmed.fq ## nada

## looks pretty good, move them to their own directory:
#mv *trimmed* /media/vol1/daniel/sulariArne/illuminaReads/goodReads_primerClipped/

cd /media/vol1/daniel/sulariArne/illuminaReads/goodReads_primerClipped

######################

## with a specialized environment for dada2.

conda activate 

## in R
install.packages("BiocManager")


library("BiocManager")

BiocManager::install("dada2")

BiocManager::install("phyloseq")


BiocManager::install("DESeq2")


library(DESeq2)

library("phyloseq")


## that takes forever.

## following these tutorials from the dada2 site:
http://benjjneb.github.io/dada2/bigdata_paired.html
http://benjjneb.github.io/dada2/bigdata.html

## but start with this:
http://benjjneb.github.io/dada2/tutorial.html


#############

## try to get Arne and Sulari to divide up their read directories, to reduce 
## memory usage?

nanoComp

conda activate dada2
R

library(dada2)
library(phyloseq)
library(ggplot2)
library(scales)
library(grid)

packageVersion("dada2")

## will dada2 take our data as is?


## on lab comp
setwd("/media/vol1/daniel/sulariArne/illuminaReads/")

## on personal comp:
#setwd("/home/daniel/Documents/projects/fichtelgebirge_project/sulariArneSoils/soilExtractions")

## for clipped: 
#path <- "/media/vol1/daniel/sulariArne/illuminaReads/goodReads_primerClipped"

## sample names are here:

sampleNames <- read.csv('sampleName_clientId.txt', 
                        sep='\t', 
                        col.names= c('sampleName', 'clientId'))

## on my personal computer:
#sampleNames <- read.csv('/home/daniel/Documents/projects/fichtelgebirge_project/sulariArneSoils/readReport/sampleName_clientId.txt',
#                        sep='\t', 
#                        col.names= c('sampleName', 'clientId'))

getwd()


head(sampleNames)

## for raw reads: 
path <- "/media/vol1/daniel/sulariArne/illuminaReads/uncompressedReads"

## on personal computer: ##
#path <- "/home/daniel/Documents/projects/fichtelgebirge_project/sulariArneSoils/readsUncompressed"

list.files(path)


# make two sorted lists, for R1 and R2
fnFs <- sort(list.files(path, pattern="_1.fq", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_2.fq", full.names = TRUE))


plotQualityProfile(fnFs[1:2]) ## kicks error. is this because of the lab computer setup?
## yes, works fine on my local setup

## does the order of our sampleNames DF fit the order of our file name vectors?

head(sampleNames)

fnFs[1:6]

tail(sampleNames)

fnFs[190:195]

## looks good
## so this should be the vector of sample names:

sample.names <- sampleNames$clientId
filtFs <- file.path(path, "filtered", paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(path, "filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names


#### can we subset to just Sulari's reads? ###

## use grep to get just samples that start with "S", plus the controls (C1.x, C2.x).
## Match against the sample names attached to the file vectors (names(filtFs)),
## not against the full file paths: a path match would pick up any "S" in a
## directory or file name, and could not be anchored to the start of the name.
onlySulari <- grep("^S", names(filtFs))
controls <- grep("^C[1-2]", names(filtFs))
sulAndCon <- c(onlySulari, controls)
justSulariSamplesF <- filtFs[sulAndCon]


## repeat with reverse files
## (controls has to be recomputed BEFORE it is combined into sulAndCon,
## otherwise the indices from the forward files are silently reused)
onlySulari <- grep("^S", names(filtRs))
controls <- grep("^C[1-2]", names(filtRs))
sulAndCon <- c(onlySulari, controls)
justSulariSamplesR <- filtRs[sulAndCon]

length(justSulariSamplesF)
head(justSulariSamplesF)
tail(justSulariSamplesF)

length(justSulariSamplesR)
head(justSulariSamplesR)
tail(justSulariSamplesR)

#################################################


out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, trimLeft=c(19,20), 
                     maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE, 
                     compress=TRUE, multithread=TRUE) # On Windows set multithread=FALSE

## that uses pretty much all of my resources on my laptop
## run it on the lab computer.

## for me, I import them here:

## NOTE(review): this statement was cut off after "trimLeft=c(19,20)," which
## left the parenthesis open and swallowed the head(out) below. It is completed
## here with the same arguments as the filterAndTrim() call above -- confirm
## that a re-run (rather than loading a saved object) is what was intended.
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, trimLeft=c(19,20), 
                     maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE, 
                     compress=TRUE, multithread=TRUE)

head(out) ## gives you a report of the results.

## to 
## now dada2 needs to model the errors from the sequencing run:

## now this may be too much for laptops:
errF <- learnErrors(filtFs, multithread=TRUE)

## interruptions, so save this to be recovered for tomorrow
#save(errF, file='errF.rda')

load('errF.rda')


errR <- learnErrors(filtRs, multithread=TRUE)
#save(errR, file='errR.rda')

load('errR.rda')


plotErrors(errF, nominalQ=TRUE)

## to run dada2 on ALL the reads (Arne, Sulari, Controls)
## each read direction is denoised with its own error model:
## forward reads with errF, reverse reads with errR
dadaFs <- dada(filtFs, err=errF, multithread=TRUE)
#save(dadaFs, file='dadaFs.rda')
dadaRs <- dada(filtRs, err=errR, multithread=TRUE)

## can we do this with just Sulari's reads (and controls)?
dadaFs <- dada(justSulariSamplesF, err=errF, multithread=TRUE)
#save(dadaFs, file="dadaFsSulariOnly.rda")

dadaRs <- dada(justSulariSamplesR, err=errR, multithread=TRUE)
#save(dadaRs, file="dadaRsSulariOnly.rda")

load("sulariData/dadaFsSulariOnly.rda")

## the object is here:
dadaFs[[1]]
dadaRs[[1]]


## for just sulari
mergers <- mergePairs(dadaFs, justSulariSamplesF, dadaRs, justSulariSamplesR, verbose=TRUE)

#save(mergers, file="sulariMergers.rda")

load("sulariMergers.rda")

## for all samples
#mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)


# Inspect the merger data.frame from the first sample

head(mergers[[1]])

## so now we should have fully denoised, merged sequences
## this means we can construct ASVs, or zero-radius-OTUs (zOTUs)

## after merging, keep following the same tutorial, here:

## http://benjjneb.github.io/dada2/tutorial.html

seqtab <- makeSequenceTable(mergers)

?makeSequenceTable

dim(seqtab)

# Inspect distribution of sequence lengths
# (tabulates how many inferred sequences there are of each length)

table(nchar(getSequences(seqtab)))

## get rid of chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)

#save(seqtab.nochim, file ="sulariOnlyseqtab.nochim")

load("sulariOnlyseqtab.nochim")

dim(seqtab.nochim)

## how many were chimeras?

sum(seqtab.nochim)/sum(seqtab) ## 99%, good stuff

## let's get the latest tax training dataset:

wget https://zenodo.org/record/4587955/files/silva_nr99_v138.1_train_set.fa.gz

taxa <- assignTaxonomy(seqtab.nochim, "silva_nr99_v138.1_train_set.fa.gz", multithread=TRUE)

#save(taxa, file ="sulariData/sulariOnlyTaxa.rda")

load("sulariOnlyTaxa.rda")


head(taxa)

taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)


## checking mock communities:

## first mock community

unqs.mock <- seqtab.nochim['S53',]

unqs.mock <- seqtab.nochim['C1.1',]

unqs.mock <- seqtab.nochim['C1.2',]

unqs.mock <- seqtab.nochim['C2.1',]

unqs.mock <- seqtab.nochim['C2.2',]

unqs.mock <- sort(unqs.mock[unqs.mock>0], decreasing=TRUE) # Drop ASVs absent in the Mock
cat("DADA2 inferred", length(unqs.mock), "sample sequences present in the Mock community.\n")

dim(seqtab.nochim)

seqtab.nochim['C1.1',2:10]


tail(seqtab.nochim)[,1:2]


## try these out on phyloseq!

library(phyloseq)

ps <- phyloseq(otu_table(seqtab.nochim, taxa_are_rows=FALSE), 
               tax_table(taxa))

########## step1 get data into phyloseq ###############

## on lab computer
#conda activate dada2

R

library(dada2)

library(phyloseq)
library(ggplot2)
library(scales)
library(grid)
library(vegan)

library(DESeq2)

library(patchwork) ## good for ggplot
library(BiocGenerics) ## might not need to load in newer versions of deseq2?
library(rsq)

packageVersion("DESeq2") ## I'm working with deseq2 version 1.34.0

## try  making a phyloseq object out of just our  
## otu table and our taxonomic object:

## on dan's lab computer
#setwd("/media/vol1/daniel/sulariArne/illuminaReads/")

## dan's personal computer
setwd("/home/daniel/Documents/projects/fichtelgebirge_project/sulariArneSoils/fichtelgebirgeSoils")

## we can move directly from the dada2 code block above
dna <- Biostrings::DNAStringSet(taxa_names(ps))
names(dna) <- taxa_names(ps)
ps <- merge_phyloseq(ps, dna)
taxa_names(ps) <- paste0("ASV", seq(ntaxa(ps)))
#save(ps, file="sulariData/sulariPhyloseqObject.rda")

## or if we are coming back from a break  we can start here:

load("sulariData/sulariPhyloseqObject.rda")

## sulari's environmental data should be here:

## if you need to update...
## get the latest:

download.file(
    "https://raw.githubusercontent.com/danchurch/fichtelgebirgeSoils/main/sulariData/Envt_Matrix.csv",
    destfile = "sulariData/Envt_Matrix.csv", 
    method="wget") ## remove this last argument if using windows

env_data = read.csv('sulariData/Envt_Matrix.csv', row.names=1)
## add this dataframe to our phyloseq object as sample_data 
sample_data(ps) <- env_data
## and our transformed, min-cutoff, etc ps created below:
sample_data(logMin50ps) <- env_data

## save the R data object
#save(ps, file="sulariData/sulariPhyloseqObject.rda")
#save(logMin50ps, file= "sulariData/sularilogMin50ps.rda")

head(env_data) 

##### side note, Sulari found a typo: ####
grep("w1", env_data) ## column 4...
grep("w1", env_data$season) ## row 25

## correct it:
## select the bad entries by value instead of by position [25,4], so the fix
## still hits the right cell if rows or columns of the matrix get reordered.
## %in% never returns NA, so missing seasons are left untouched.
env_data$season[env_data$season %in% "w1"] <- "W1"
## save it, back to the same file env_data was read from:
write.csv(env_data, 'sulariData/Envt_Matrix.csv')
## also correct it in the ps phyloseq object
sample_data(ps) <- env_data
## and resave
#save(ps, file="sulariData/sulariPhyloseqObject.rda")

load("sulariData/sulariPhyloseqObject.rda")
##### side note, Sulari found a typo^^^ ####


###### another side note this is how dan got lat/lon data from betty's spreadsheet ####

## we also have geographic data we can add to our 
## environmental matrix

## some horrible data wrangling here follows:

## get our geographic data, a csv made from betty's google doc
## I have kept a slightly cleaned up version here:

download.file(
    "https://raw.githubusercontent.com/danchurch/fichtelgebirgeSoils/main/carb4D_cleaned.csv", 
    destfile = "carb4D_cleaned.csv",
    method="wget") ## for windows users, don't add this last "method" line, leave it out and use default
geoDat <- read.csv("carb4D_cleaned.csv")

## read in the environmental data we have that doesn't yet have lat/long:
env_data <- read.csv('Envt Matrix.csv')

## make functions to get latitude and longitude out of the geoDat df:
## Look up the latitude of the plot a sample belongs to.
##   samp: sample / plot identifier; only its first five characters are used
##   geo:  data frame with columns plotID and lat (defaults to the geoDat
##         table read in above, so existing one-argument calls still work)
## Returns a single numeric latitude, or NA when samp is missing/empty or no
## plotID contains the five-character key. If several plotIDs match, the first
## one is used.
getLat <- function(samp, geo = geoDat){
  plotKey <- substring(samp, 1, 5)
  ## nothing usable to search with
  if (is.na(plotKey) || !nzchar(plotKey)) return(NA_real_)
  ## fixed = TRUE: treat the key as literal text, not as a regular expression
  hits <- which(grepl(plotKey, geo$plotID, fixed = TRUE))
  if (length(hits) == 0) return(NA_real_)
  as.numeric(geo$lat[hits[1]])
}

## Look up the longitude of the plot a sample belongs to.
##   samp: sample / plot identifier; only its first five characters are used
##   geo:  data frame with columns plotID and lon (defaults to the geoDat
##         table read in above, so existing one-argument calls still work)
## Returns a single numeric longitude, or NA when samp is missing/empty or no
## plotID contains the five-character key. If several plotIDs match, the first
## one is used.
getLon <- function(samp, geo = geoDat){
  plotKey <- substring(samp, 1, 5)
  ## nothing usable to search with
  if (is.na(plotKey) || !nzchar(plotKey)) return(NA_real_)
  ## fixed = TRUE: treat the key as literal text, not as a regular expression
  hits <- which(grepl(plotKey, geo$plotID, fixed = TRUE))
  if (length(hits) == 0) return(NA_real_)
  as.numeric(geo$lon[hits[1]])
}

## look up one latitude per plot ID in a single pass
lats <- vapply(as.character(env_data$Plot.ID), getLat, numeric(1), USE.NAMES = FALSE)
## the four controls have no coordinates: pad with NAs
lats <- c(lats, rep(NA, 4))
env_data["Latitude"] <- lats

## same again for longitude
lons <- vapply(as.character(env_data$Plot.ID), getLon, numeric(1), USE.NAMES = FALSE)
## the four controls have no coordinates: pad with NAs
lons <- c(lons, rep(NA, 4))
env_data["Longitude"] <- lons

## update the environmental matrix csv file 
write.csv(env_data, file='Envt_Matrix.csv', row.names=FALSE)

## use this now as our latest environmental data
## from now on, let's avoid spaces in file names

## and back to the tutorial and data exploration

############## step2 exploring data, especially controls ################

## let's start with this introductory tutorial,


http://joey711.github.io/phyloseq-demo/phyloseq-demo.html

## since we've already imported data to create our phyloseq object,
## start with the section "Basic Interaction with phyloseq Data"

## some useful phyloseq commands:

ntaxa(ps)

nsamples(ps)

sample_names(ps)

sample_names(ps)[1:10]

taxa_names(ps)[1:10]

sample_variables(ps)

rank_names(ps)

sample_sums(ps)


(p = plot_richness(ps, x = "pH"))

(p = plot_richness(ps, x = "Land.type"))

## overlay one boxplot per land type. Land.type is passed to aes() as a bare
## column name: a quoted "Land.type" would be a constant string, which puts
## every sample into a single box labelled "Land.type".
p + geom_boxplot(data = p$data, aes(x = Land.type, y = value, color = NULL), alpha = 0.1)


## we can plot abundances with their ggplot-based functions:

TopNOTUs = names(sort(taxa_sums(ps), TRUE)[1:10])
ps10 = prune_taxa(TopNOTUs, ps)

plot_bar(ps10, "Land.type",  facet_grid = ~Phylum)

## or try:
plot_bar(ps10, "Land.type",  facet_grid = ~Genus)

## etc, etc

## this actually looks very good. only the first sample is lower than the rest, essentially

## I tend to use base-R plotter, I find it simpler to use:

summary(sample_sums(ps))

quantile(sample_sums(ps))

quantile(sample_sums(ps), probs=c(.50,.60,.70,.80,.90,1))

quantile(sample_sums(ps), probs=c(0.05,.1,.2,.3,.4,.50,.60,.70,.80,.90,0.95,1))

summary(sample_sums(ps), quantile.type=90)

png(file='sulariReadDepths.png')
barplot(sample_sums(ps), main="Read depth by sample")
dev.off()


barplot(sort(sample_sums(ps), decreasing=TRUE))

## so maybe not a lot of correction for mean depth/variance relationship
## especially if we drop singletons.

barplot(get_taxa(ps, "C1.1" ))

## that figure is hard to see/understand. customize a bit.
## We'll use the following customized barplotting function a lot:

## Rank-abundance barplot for a single sample.
##   phyObj:     phyloseq object to read abundances and taxonomy from
##   sampleName: name of the sample to plot
##   ylimit:     upper limit of the y axis (taller bars are cut off)
##   ntax:       number of most-abundant taxa to show; NULL (default) shows
##               every taxon present in the sample
##   textatX, textatY: position of the "number of unique ASVs" label
## Prints the number of taxa present in the sample, draws the barplot (bars
## labelled by Genus), and invisibly returns the plotted abundances.
rankAb <- function(phyObj, sampleName, ylimit=500, ntax=NULL, textatX=100, textatY=(ylimit-40)){
    abund <- get_taxa(phyObj, sampleName)
    ## keep only taxa actually observed in this sample, most abundant first
    abund <- sort(abund[abund > 0], decreasing=TRUE)
    nPresent <- length(abund)
    print(nPresent) ## let user know how many unique taxa are in sample
    if (nPresent == 0) {
        warning("no taxa with non-zero abundance in sample ", sampleName, call. = FALSE)
        return(invisible(abund))
    }
    if (is.null(ntax)) ntax <- nPresent
    ## head() never returns more than is there, so asking for more taxa than
    ## are present no longer pads the plot with NA bars
    top <- head(abund, ntax)
    taxaNames <- tax_table(phyObj)[names(top), "Genus"]
    nuASV <- paste0("number of unique ASVs = ", nPresent)
    ## wide bottom margin for the rotated genus labels; put the caller's
    ## settings back when we are done
    oldPar <- par(cex.axis = .75, mar=c(10,4,4,2))
    on.exit(par(oldPar), add = TRUE)
    barplot(top, 
        ylim = c(0,ylimit), 
        main=sampleName, 
        cex.main=2, 
        las=2, 
        names.arg=taxaNames) 
    text(textatX, textatY, nuASV, cex = 2)
    invisible(top)
}

## this function currently takes six arguments:
## phyObj=the phyloseq object of interest
## sampleName=obvious
## ylimit= maximum abundance you want to cut the graph off at, default 500
## ntax= number of taxa you want to look at
## textatX=if you need to adjust the message about number of ASVs
## textatY=if you need to adjust the message about number of ASVs

## this lets us look a little closer:

rankAb(ps,"C1.1")

rankAb(ps,"C1.1", 50000, textatX=100, textatY=10000)

rankAb(ps,"S1", 1000, 10, 5, 800)

## first control, cutoff graph at 100 observations, 10 most abundant species
rankAb(ps,"C1.1", 100,10, 5)

## E. coli has at least two ASVs!
## that is called "OTU-splitting", also in the mock community (C2 controls)
## also see that a lot of the members in our Mock community (C2 controls)  ended up
## here in the single species (C1 controls). 
## the reverse also happens, see below: E. coli is in our mock community??
## This is called tag-switching!! 

## or taller:
rankAb(ps,"C1.1", 100,10, 5)

## look at the single species control side-by-side:

par(mfrow=c(2,1))
rankAb(ps,"C1.1", 10000, 12)
rankAb(ps,"C1.2", 10000, 12)
par(mfrow=c(1,1))

## looking at the mock communities side by side:
par(mfrow=c(2,1))
rankAb(ps,"C2.1", 10000,15, 50)
rankAb(ps,"C2.2", 10000,15, 50)
par(mfrow=c(1,1))

## consistency is good, indicates low sequencer error,
## but lots of "weeds" in there. 
## Also E. coli! What is that doing in there?? 
## it leaked over from our single-species positive control (C1)
##  = tag switching!! 

## we can explore actual samples, too:
rankAb(ps, "S2", 10000, 15)

############# filtering out errors with abundance #############

## we see from our controls that there are a lot of low abundance
## "weeds", contaminants of some kind

rankAb(ps, "C1.1", 10000, 15)

## putting aside OTU-splitting, let's try transforming our 
## samples to reduce the big artificial differences in abundances
## and make samples a little more comparable. 

## let's say that we don't trust any ASV that isn't 
## observed at least 50 times in a sample

## set a cutoff
#minCutoff <- 20
#minCutoff <- 30
minCutoff <- 50

#minCutoff <- 150 ## etc
## make a copy of our otu_table from our original phyloseq object:
ot <- otu_table(ps)
## change any ASV's abundance to zero 
## if it doesn't meet this cutoff:
ot[ot < minCutoff] <- 0
## insert this into a new phyloseq object
psMinCutoff <- ps
otu_table(psMinCutoff) <- ot
## how many reads do we lose with this?

sum(otu_table(ps)) ## 7944340 originally, before cutoff
sum(otu_table(psMinCutoff)) ## 6943873 after 20 minimum read cutoff
                            ## 5144801 after 50 minimum read cutoff

## and check, does this change our controls much?
par(mfrow=c(2,1))
rankAb(ps,"C1.1", 50000, textatX=100, textatY=20000)
rankAb(psMinCutoff,"C1.1", 80000, textatX=10, textatY=30000)
par(mfrow=c(1,1))

## this C1 control should only have 1 ASV
## a minimum cutoff of 20 reads leaves 73 ASVs out 216 
## a minimum cutoff of 30 reads leaves 51 ASVs out 216 
## a minimum cutoff of 50 reads leaves 29 ASVs out 216 

## and our mock community controls?
par(mfrow=c(2,1))
rankAb(ps,"C2.1", 10000, textatX=50, textatY=8000)
rankAb(psMinCutoff,"C2.1", 10000, textatX=10, textatY=8000)
par(mfrow=c(1,1))
## this C2 control should have 11 ASVs
## a minimum cutoff of 20 reads leaves 36 ASVs out 94 
## a minimum cutoff of 30 reads leaves 26 ASVs out 94 
## a minimum cutoff of 50 reads leaves 18 ASVs out 94 

## check real samples too!

par(mfrow=c(2,1))
rankAb(ps,"S10", ylimit=1000, textatX=500, textatY=300)
rankAb(psMinCutoff,"S10", ylimit=1000, textatX=200, textatY=300)
par(mfrow=c(1,1))

## a minimum cutoff of 20 reads leaves 848 ASVs out of 1796
## a minimum cutoff of 30 reads leaves 569 ASVs out of 1796
## a minimum cutoff of 50 reads leaves 322 ASVs out of 1796

## pretty powerful stuff. We can't really know how many of these
## are "really there" and how many are contaminants of some kind.
## our mock community helps us a bit with this, but in ecological
## samples there are many "real" species that are present in 
## much lower amounts than the ASVs in our mock controls.
## the best we can do is try to find a minimum that removes 
## most of the noise from our single species control with as 
## little loss overall in reads and ASVs. It is a somewhat 
## subjective decision, but record what cutoff you use, and why,
## for readers.

## For the moment, I will use a minimum cutoff of 50 reads, 
## it retains all mock community members, and leaves some
## room for appearance of semi-rare species.

## so my phyloseq object going forward is:

minCutoff <- 50
ot <- otu_table(ps)
ot[ot < minCutoff] <- 0
psMinCutoff <- ps
otu_table(psMinCutoff) <- ot
## (code explained above at beginning of this section)

psMinCutoff

###### side note - mystery ASV in C2 controls #####
## side note - unexpected mystery ASV in our positive control
## interestingly, with Sulari's data  we do not see an ASV with the 
## taxonomy assigned as Enterobacter (which was in 
## the mock community). We do see a Klebsiella species...

## maybe this is the Enterobacter? 

## the NCBI accession for the intended Enterobacter is here:
https://www.ncbi.nlm.nih.gov/nuccore/EU721605.2/

## does our reference sequence for this Klebsiella BLAST to this?
## what is it called?

## look at the barplots:
par(mfrow = c(1,2))
rankAb(ps, "C2.1", 10000, 15)

rankAb(ps, "C2.2", 10000, 15)

par(mfrow = c(1,1))

## it's the 4th/5th most common taxa in these controls samples
## which asv is that?
names(sort(get_taxa(ps, "C2.1"), TRUE)[4]) ## ASV62
names(sort(get_taxa(ps, "C2.2"), TRUE)[5]) ## ASV62

## get the reference 16sV4 sequence for this asv:
as.character(refseq(ps)['ASV62'])

## we can blast this sequence quickly on NCBI:
https://blast.ncbi.nlm.nih.gov/Blast.cgi?PAGE=MegaBlast&PROGRAM=blastn&PAGE_TYPE=BlastSearch&BLAST_SPEC=
## this blasts with 100% to a million different 
## klebsiellas, enterobacter, and Leclercia species
## so this is very probably the Enterobacter that 
## Nico included in our positive controls

############# step3 transformation of data #########


barplot(sort(sample_sums(ps), TRUE))

## our sample sizes for this run are actually pretty 
## even, most of them fall between 60-70,000 reads,
## with a few exceptions. All are between 40,000 and 
## 70,000, I think.

## the sample-mean-variance problem is therefore
## not much of an issue here. Especially since 
## we are filtering a lot of the low-abundance
## observations out. 

## so let's do a simple transformation, as in
## 1- normalization of ASV abundances by sample read total, 
## 2 - log-transform of abundances to reduce the effect of
## high abundance reads. 

## this would look like this, just using our raw, original phyloseq object:

## transform to normalize by abundances
aa <- transform_sample_counts(ps, function(x) x/sum(x))

## now log transform. Add 1 first, so that zero abundances stay at 
## zero (log(1) = 0) instead of becoming -Inf (log(0))
otu_table(aa) <- otu_table(aa) + 1
logTransformedPS <- transform_sample_counts(aa, log)

## how does this affect our controls?

## use the custom rankAb plotting function from above
## if R-studio can't handle two plots at a time, 
## ignore the "par" command and run one plot a time.
## since we log transformed, our y-values
## are now much smaller (<15?), so adjust the settings
## accordingly


## this hopefully dampens radical differences in abundances
## for the mock community samples

## check our mock community
par(mfrow=c(1,2))
rankAb(ps,"C2.1", 10000,ntax=NULL,30, 6000)
rankAb(logTransformedPS,"C2.1", 0.2,ntax=NULL,30, 0.1)
par(mfrow=c(1,1))

## single species:
par(mfrow=c(2,1))
rankAb(ps,"C1.1", 10000, ntax=NULL, 100, 2000)
rankAb(logTransformedPS,"C1.1", 1, ntax=NULL, 100,0.5 )
par(mfrow=c(1,1))


## we can combine these two steps 
## (minimum cutoffs + transformation) 
 
## we start with this as our phyloseq object
psMinCutoff ## created in step above, 50 read minimum cutoff

## transform to normalize by abundances then log to flatten abundance differences
aa <- transform_sample_counts(psMinCutoff, function(x) x/sum(x))
## now log transform. Add 1 first, so that zero abundances stay at 
## zero (log(1) = 0) instead of becoming -Inf (log(0))
otu_table(aa) <- otu_table(aa) + 1
logMin50ps <- transform_sample_counts(aa, log)

## check out the differences.
## use the "pdf" and "dev.off" commands to save these to files for easier viewing

## single species
pdf("filteringStepsSingleSpComparison.pdf")
par(mfrow=c(3,1))
rankAb(ps,"C1.1", 10000, ntax=NULL, 150, 7000)
rankAb(psMinCutoff,"C1.1", 10000, ntax=NULL, 20, 7000)
rankAb(logMin50ps,"C1.1", 1, ntax=NULL, 25, 0.7)
dev.off()

## mock community
pdf("filteringStepsMockComComparison.pdf")
par(mfrow=c(3,1))
rankAb(ps,"C2.1", 10000, ntax=NULL, 50, 7000)
rankAb(psMinCutoff,"C2.1", 10000, ntax=NULL, 15, 7000)
rankAb(logMin50ps,"C2.1", 0.2, ntax=NULL, 14, 0.1)
dev.off()

## real sample
sample="S100"
filename=paste0("filteringStepsSample",sample,"Comparison.pdf")
pdf(filename)
par(mfrow=c(3,1))
rankAb(ps, sample, 1000, ntax=NULL, 1000, 500)
rankAb(psMinCutoff, sample, 1000, ntax=NULL, 200, 500)
rankAb(logMin50ps, sample, 0.03, ntax=NULL, 200, 0.015)
dev.off()

## save out the filtered, transformed phyloseq object

############# ordination ####################
## let's try an ordination:

## or just run it as is:
plot_ordination(ps, ordinate(ps, "MDS"), color = "Land.type") + geom_point(size = 5)

## changing symbols
plot_ordination(ps, ordinate(ps, "MDS"), color = "pH") + geom_point(aes(shape = Land.type), size=5)

## how does this compare to our phyloseq object that has 
## undergone some filtering?:

aa <- plot_ordination(ps, ordinate(ps, "MDS"), color = "pH") 
      aa <- aa + geom_point(aes(shape = Land.type), size=5)
      aa <- aa + labs(title="ps")

bb <- plot_ordination(logMin50ps, ordinate(logMin50ps, "MDS"), color = "pH")  
      bb <- bb + geom_point(aes(shape = Land.type), size=5)
      bb <- bb + labs(title="logmin50ps")

####

## the following only works if you have the patchwork package installed:

aa + bb

## what if we use a different method, such as non-metric multidimensional scaling?

## ordinate the same object that is being plotted (logMin50ps), so that this
## panel differs from bb only in the ordination method (NMDS instead of MDS)
cc <- plot_ordination(logMin50ps, ordinate(logMin50ps, "NMDS"), color = "pH") 
      cc <- cc + geom_point(aes(shape = Land.type), size=5)
      cc <- cc + labs(title="logmin50ps, NMDS")

bb + cc 

## that looks pretty different at first glance, but
## the groupings by land type stay the same
logMin50ps

######### statistical tests ############

## there are different ways to test the effect 
## of environmental variables on multivariate
## community matrices

## probably the easiest to use is the 
## permanova test. it is not the most 
## sensitive test, and it is not a true
## linear model of any kind, so it has
## severe limitations.

## but it is non-parametric, instead based on 
## permutations (random rearrangements) of  
## your data to create null hypothesis.
## because it has so few statistical assumptions,
## it is the easiest test to implement and often
## the first test we try.

## so let's try a statistical test, permanova.
## make sure the vegan package is loaded

env_data <- as(sample_data(logMin50ps), "data.frame")
## get rid of controls:
env_data <- env_data[!row.names(env_data) %in% c("C1.1", "C1.2","C2.1", "C2.2"),]


comm_data <- as.data.frame(otu_table(logMin50ps))
## get rid of controls:
comm_data <- comm_data[!row.names(comm_data) %in% c("C1.1", "C1.2","C2.1", "C2.2"),]

## make our distance matrix
vegDistTable <- vegdist(comm_data, method="bray")

## run the test:
permaNOVAland <- adonis2(vegDistTable ~ Land.type,
                 data = env_data, na.action = na.omit)

permaNOVAland <- adonis2(vegDistTable ~ Land.type*pH,
                 data = env_data, na.action = na.omit)

permaNOVAland <- adonis2(vegDistTable ~ pH*Land.type,
                 data = env_data, na.action = na.omit)

permaNOVAland

############################


############### differential abundances #################

## if we want to look at what species might be most important 
## for explaining the community-level differences that we 
## see with our ordinations, we need to do statistically 
## sound checks. This is called checking for differential abundance

## we'll use deseq2, following this tutorial: 
https://bioconductor.org/packages/devel/bioc/vignettes/phyloseq/inst/doc/phyloseq-mixture-models.html

##There are other packages that can be 
## used. Other good options include ANCOM-II, and ALDEx2. 
## A good study to look at for comparison is:
https://www.nature.com/articles/s41467-022-28034-z
## despite theoretical critiques (like Gloor et al. 2017),
## deSeq2 seems to perform fairly well (see Fig. 1 and Fig 3 
## from above nature paper)

## deseq2 acts on raw abundances, not our transformed data
## (we use the transformed data for other statistical tests)

## in this case, I will focus on land type as a experimental factor

## let's add a new "land type" to our data, so we can keep our controls:

load("sulariData/sulariPhyloseqObject.rda")


psLandCont <- ps ## make a duplicate phyloseq obj to play with
sample_data(psLandCont)[c('C1.1','C1.2','C2.1','C2.2'),'Land.type']  <- "control" ## add info

tail(sample_data(psLandCont)) ## looks okay

## okay, now following the tutorial above:

diagdds = phyloseq_to_deseq2(psLandCont, ~ Land.type)
## geometric-mean helper used for the DESeq2 size factors: sums the logs of
## the strictly positive entries of x, divides by the full length of x
## (zeros included), and exponentiates.
gm_mean <- function(x, na.rm = TRUE) {
  positives <- x[x > 0]
  logTotal <- sum(log(positives), na.rm = na.rm)
  exp(logTotal / length(x))
}
## one geometric mean per row of the count matrix
geoMeans <- apply(counts(diagdds), MARGIN = 1, FUN = gm_mean)
## size factors from those geometric means, then the DESeq2 fit
diagdds <- estimateSizeFactors(diagdds, geoMeans = geoMeans)
diagdds <- DESeq(diagdds, fitType = "local")

## so that should have adjusted our abundances somewhat
## to account for the accidental abundance differences
## that occur with high-throughput sequencers, PCR, etc...

## check out the possible contrasts
## (Forest vs. Grassland is not mentioned, but is still possible):

resultsNames(diagdds)

## let's see how the results look for Forest against Arable Land:
res <- results(
  diagdds,
  contrast = c("Land.type", "Forest", "Arable Land")
)

## this gives a results table; check its size and first rows:

dim(res)

head(res)

## the log2FoldChange column is the log2 of the ratio of counts in
## the second element of the "contrast" vector above (in this case
## "Forest") over the third element ("Arable Land"):

## log2(forest/Arable land)

## so a positive number in this column indicates that this 
## ASV is more abundant in the Forest than in Arable Land.

## we can flip the positive/negative sign by changing the 
## order of the groups in our "results" command above

## this matrix gives us all results, which is huge. We can 
## summarize:

summary(res)

## in this run, this tells me that 542 ASVs are significantly more
## abundant in Forest than in Arable Land, and 1669 are
## significantly more abundant in Arable Land than in Forest
## (these are fold-change counts; they do not by themselves mean
## the ASVs are absent from the other land type)


alpha <- 0.1 ## significance cutoff
## keep only rows without any NA (necessary because not all ASVs
## are in all landtypes)
res <- res[complete.cases(res), ]
## keep rows whose adjusted p-value is below alpha
sigtab <- res[res$padj < alpha, ]
## add taxonomy:
sigtab <- cbind(
  as(sigtab, "data.frame"),
  as(tax_table(psLandCont)[rownames(sigtab), ], "matrix")
)


## still following their tutorial, we can visualize a bit:
theme_set(theme_bw())

## say we just want to look at Acidobacteriota:
## (%in% rather than ==, so that ASVs with an NA phylum are simply
## left out instead of coming back as all-NA rows)
sigtabAcido <- sigtab[sigtab$Phylum %in% "Acidobacteriota", ]
## keep only the ASVs that were identified to genus
sigtabgen <- subset(sigtabAcido, !is.na(Genus))

## to sort by Phylum order, meaningless if we have subsetted
## already by phylum to Acidobacteriota...but it's in the tutorial.
## levels are ordered by the largest log2 fold change in each phylum
x <- tapply(sigtabgen$log2FoldChange, sigtabgen$Phylum, function(x) max(x))
x <- sort(x, TRUE)
sigtabgen$Phylum <- factor(as.character(sigtabgen$Phylum), levels = names(x))

## or we can sort by Genus, makes more sense for us when subsetted
## to Acidobacteriota
x <- tapply(sigtabgen$log2FoldChange, sigtabgen$Genus, function(x) max(x))
x <- sort(x, TRUE)
sigtabgen$Genus <- factor(as.character(sigtabgen$Genus), levels = names(x))

## one point per significant ASV; the gray line marks no change
ggplot(sigtabgen, aes(y=Genus, x=log2FoldChange, color=Phylum)) + 
  geom_vline(xintercept = 0.0, color = "gray", linewidth = 0.5) +
  geom_point(size=6) + 
  theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))


## if we are interested in a particular land.type versus all others,
## I think we need to reduce the treatment to a binary variable (yes/no), 
## then rerun deseq as above.

## for instance, if I want to find ASVs that are unique to forests:

## start from a fresh copy of the phyloseq object
psForestVsEverybodyElse <- ps
## and give the four control samples "control" as their land type:
sample_data(psForestVsEverybodyElse)[c("C1.1", "C1.2", "C2.1", "C2.2"), "Land.type"] <- "control"

## make a function that relabels Arable land and grassland to notForest.
##
## landt : land-type label(s); a single value or a whole vector
##         (character or factor)
## returns a character vector of the same length in which
## "Arable Land" and "Grassland" are replaced by "notForest" and
## every other label (including NA) is passed through unchanged
lumpNonForest <- function(landt) {
  ## work on plain strings so factor input behaves like character
  landt <- as.character(landt)
  landt[landt %in% c("Arable Land", "Grassland")] <- "notForest"
  landt
}

## apply the relabelling to every sample's land type:
newComboLandTypes <- sapply(
  X = sample_data(psForestVsEverybodyElse)$Land.type,
  FUN = lumpNonForest,
  USE.NAMES = TRUE
)

## check it:
print(newComboLandTypes)

## overwrite the land type column with the lumped labels:
sample_data(psForestVsEverybodyElse)$Land.type <- newComboLandTypes

sample_data(psForestVsEverybodyElse)$Land.type

## rerun the above pipeline to get asvs that were
## generally important to forests vs. notForest:

diagdds <- phyloseq_to_deseq2(psForestVsEverybodyElse, ~ Land.type)
## geometric mean in which only the positive entries contribute to
## the log sum, while the divisor is still the full length of x
gm_mean <- function(x, na.rm = TRUE) {
  positiveVals <- x[x > 0]
  logTotal <- sum(log(positiveVals), na.rm = na.rm)
  exp(logTotal / length(x))
}
## one geometric mean per row of the count matrix, then size
## factors from those means and the DESeq2 fit
geoMeans <- apply(counts(diagdds), 1, gm_mean)
diagdds <- estimateSizeFactors(diagdds, geoMeans = geoMeans)
diagdds <- DESeq(diagdds, fitType = "local")

res <- results(diagdds, contrast = c("Land.type", "Forest", "notForest"))

head(res)

summary(res)

## very similar to our contrast above, between forests and croplands

alpha <- 0.1 ## significance cutoff
res <- res[complete.cases(res), ] ## necessary because not all ASVs in all landtypes
sigtab <- res[(res$padj < alpha), ] ## cut off below the alpha using adjusted pvalues
## let's get just the species that seem to be more abundant
## in forests:

forestASVs <- sigtab[(sigtab$log2FoldChange > 0), ]

## add taxonomy, taken from the same phyloseq object this section
## fitted, so the block does not depend on psLandCont still being
## in the workspace from the earlier section:
forestASVs <- cbind(
  as(forestASVs, "data.frame"),
  as(tax_table(psForestVsEverybodyElse)[rownames(forestASVs), ], "matrix")
)


## which phyla seem to be most important for forests?

forestASVs$Phylum

## counts of forest-associated ASVs per phylum and per genus
table(forestASVs$Phylum)

table(forestASVs$Genus)

## most look like Acidobacteriota and Actinobacteriota

## plot it

## a bigger bottom margin leaves room for the rotated names
par(mar = c(10, 4, 4, 2))

barplot(sort(table(forestASVs$Phylum), decreasing = TRUE), las = 2)

barplot(sort(table(forestASVs$Genus), decreasing = TRUE), las = 2)

## if we want to track the (normalized) counts of a particular ASV:

## order the table from largest to smallest log fold change
forestASVsorted <- forestASVs[order(forestASVs$log2FoldChange, decreasing = TRUE), ]

## the last rows of the ascending ordering, i.e. the largest changes
tail(forestASVs[order(forestASVs$log2FoldChange), ])

plotCounts(diagdds, gene = "ASV1", intgroup = "Land.type")

## you can repeat the above for the other land types (grassland, arable land, etc.)


#########################################################################33

## and if we want to look at a single, non-categorical (continuous) variable?
## let's try looking for ASVs that seem important for predicting 
## respiration. 

## this is going to give us the logfoldchange per unit of
## of basal respiration as per here: 
## https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#how-can-i-include-a-continuous-covariate-in-the-design-formula


## I will look for ASVs whose abundances seem to be correlated with 
## increasing respiration.

## packages needed for this section
library(phyloseq)
library(DESeq2)

load("../sulariData/sulariPhyloseqObject.rda") ## sulari original "raw" phyloseq output from dada2
load("../sulariData/sularilogMin50ps.rda")  ## sulari's transformed, cutoffs enforced, etc

ps

logMin50ps

## just like before make a duplicate phyloseq obj to play with
## and get rid of controls while we're at it
psNoControl <- prune_samples(!(rownames(sample_data(ps)) %in% c("C1.1","C1.2","C2.1","C2.2")), ps)

## get rid of NAs in basalResp

## list the sample variables to find the respiration column
sample_variables(psNoControl)

sample_data(psNoControl)$Basal.respiration

## TRUE for samples that have a basal respiration value
basalRespNotNA <- !is.na(sample_data(psNoControl)$Basal.respiration)

psNoControl <- prune_samples(basalRespNotNA, psNoControl)

## okay, same old code as before:
diagdds <- phyloseq_to_deseq2(psNoControl, ~ Basal.respiration) ## set "treatment" of interest
## geometric mean in which only the positive entries contribute to
## the log sum, while the divisor is still the full length of x
gm_mean <- function(x, na.rm = TRUE) {
  positiveVals <- x[x > 0]
  logTotal <- sum(log(positiveVals), na.rm = na.rm)
  exp(logTotal / length(x))
}
## one geometric mean per row of the count matrix, then size
## factors from those means and the DESeq2 fit
geoMeans <- apply(counts(diagdds), MARGIN = 1, FUN = gm_mean)
diagdds <- estimateSizeFactors(diagdds, geoMeans = geoMeans)
diagdds <- DESeq(diagdds, fitType = "local")
resultsNames(diagdds)
res <- results(diagdds)
alpha <- 0.1 ## significance cutoff
## drop rows that have an NA in any column
res <- res[complete.cases(res), ]
## keep rows whose adjusted p-value is below alpha
sigtab <- res[res$padj < alpha, ]

## of interest to us are those species that have the positive log fold change:
posRespASVs <- sigtab[sigtab$log2FoldChange > 0, ]

## order this table from largest to smallest log fold change
posRespASVsorted <- posRespASVs[order(posRespASVs$log2FoldChange, decreasing = TRUE), ]
## this gives us a list of species whose abundances seem to be 
## very positively correlated with respiration...

## if you like, you can attach taxonomy data and look at the phyla or genus names
## associated with these ASVs as we did above. 

## do any of these match with the ASVs that were associated with grasslands?

## if we want to pull these out of our data to use as a variable in our 
## model of respiration: 

## taking the 5 ASVs with the largest positive log2 fold change
## (head() rather than [1:5], so that a table with fewer than 5
## rows gives a shorter list instead of NA entries):

posRespASVtop5 <- head(rownames(posRespASVsorted), 5)

## what are these? taxonomy:

tax_table(ps)[posRespASVtop5,]

## let's get the abundances from our phyloseq object that has been 
## normalized, log transformed, and had minimum cutoffs enforced,
## in the code above:

logMin50ps

## drop the four control samples by name:
psLogMin50NoControl <- prune_samples(
  !(rownames(sample_data(logMin50ps)) %in% c("C1.1", "C1.2", "C2.1", "C2.2")),
  logMin50ps
)

## look at our list again:
print(posRespASVtop5)

## abundances of the most responsive ASV:
otu_table(psLogMin50NoControl)[, "ASV621"]


## or of all 5:
otu_table(psLogMin50NoControl)[, posRespASVtop5]

## how do we incorporate these into our models?
## keep that single ASV's abundances, to use below as one of our
## predictor variables in a GLM
mostRespyAsv <- otu_table(psLogMin50NoControl)[, "ASV621"]


mostRespyAsv
####################################################

## let's make a regression model of respiration 

load("sulariData/sulariPhyloseqObject.rda") ## sulari original "raw" phyloseq output from dada2
load("sulariData/sularilogMin50ps.rda")  ## sulari's transformed, cutoffs enforced, etc

## to remove the controls:
psLogMin50NoControl <- prune_samples(!(rownames(sample_data(logMin50ps)) %in% c("C1.1","C1.2","C2.1","C2.2")), logMin50ps)

head(sample_data(psLogMin50NoControl))
tail(sample_data(psLogMin50NoControl))

## we also have, for instance, our most sensitive ASV, in terms of 
## increasing abundance with increasing respiration

## NOTE(review): the bare symbol below errors unless an object
## called Plot.ID exists in the workspace -- confirm what was intended
Plot.ID

otu_table(psLogMin50NoControl)[,"ASV621"]


## because phyloseq won't let us export a clean data frame 
## from their phyloseq object, I'll pull out the variables
## one-by-one for use with the model.

## the controls were already dropped by name above, so the sample
## data is used as it is (no positional row range); its rows then
## correspond to the samples of psLogMin50NoControl, the same object
## mostRespyAsv is taken from
env_data <- sample_data(psLogMin50NoControl)

soil.respiration <- env_data$soil.respiration
pH <- env_data$pH
MBC <- env_data$MBC
CNR <- env_data$CNR
Land.type <- env_data$Land.type

## a few candidate models, differing in which predictors they include:

aa <- glm(soil.respiration ~ pH + MBC + CNR + Land.type + mostRespyAsv)
bb <- glm(soil.respiration ~ pH + MBC + Land.type)
cc <- glm(soil.respiration ~ pH + MBC + CNR)
dd <- glm(soil.respiration ~ pH + CNR + Land.type)
ee <- glm(soil.respiration ~ pH + CNR )

## any one of these can be examined using the summary command:
summary(ee)

## if we want to include interaction terms:

## land type and pH
ff <- glm(soil.respiration ~ pH + MBC + CNR + Land.type*pH + mostRespyAsv)
## land type and CNR
gg <- glm(soil.respiration ~ pH + MBC + CNR + Land.type*CNR + mostRespyAsv)
## etc

## particularly interesting is when we examine the interaction of
## our two strong predictors, microbial biomass and Land.type

bigModel <- glm(soil.respiration ~ pH + MBC + CNR + Land.type*MBC + mostRespyAsv)

## we lose some degrees of freedom, so our pvalues in our main predictors 
## drop a bit, but we see that something interesting is going on within 
## forests - there is a strong interaction between forest land type and 
## microbial biomass; taken together they are a strong predictor. 
## To me this implies that forest soils are particularly sensitive to 
## microbial biomass as an indicator of respiration. 

## stepwise simplification of the full model
simplifyModel <- step(bigModel)

## if you look at the results with summary(), you see that the model
## starts with all terms (top) and is reduced stepwise until it gets
## to the simplest models at the bottom. The last model keeps MBC,
## Land.type and the interaction between these two; removing CNR
## doesn't seem to change the deviance or AIC much at all, meaning
## that it's not that useful to the model.

## so the simplest possible model would be

interactLTmbc <- glm(soil.respiration ~ Land.type*MBC )

## from the rsq library, we can get an r-squared. It is somewhat
## controversial, as r-squared values of generalized models with
## categorical variables, etc, are a little weird, but it is the
## best available metric I see:

library(rsq)
rsq(aa) ## 0.53, pretty strong, lots of variance explained. some warnings, probably due to missing values.

## and with the interaction effect
rsq(bigModel) ## 0.617, stronger, more variance explained. still some warnings, probably due to missing values.

rsq(interactLTmbc) ## 0.606, just a little weaker than the full model, but much simpler, which is usually a better model

##########################################

## to review, the total pipeline from sulari's
## dada2 results so far:

library(dada2)
library(phyloseq)
library(ggplot2)
library(scales)
library(grid)
library(vegan)
library(DESeq2)
library(patchwork) ## good for ggplot
library(BiocGenerics) ## might not need to load in newer versions of deseq2?
library(rsq)

load("sulariData/sulariPhyloseqObject.rda")
## institute minimum cutoff: abundances below it are set to zero
minCutoff <- 50
ot <- otu_table(ps)
ot[ot < minCutoff] <- 0
psMinCutoff <- ps
otu_table(psMinCutoff) <- ot
## transform to normalize by abundances then log to flatten abundance differences
aa <- transform_sample_counts(psMinCutoff, function(x) x/sum(x))
## now log transform. Add 1 first, so that zero abundances map to
## log(1) = 0 rather than log(0) = -Inf
otu_table(aa) <- otu_table(aa) + 1
logMin50ps <- transform_sample_counts(aa, log)
#save(logMin50ps, file= "sulariData/sularilogMin50ps.rda")


## taxonomy of the top respiration ASVs: select them as rows and keep
## all columns, the same way as in the respiration section
tax_table(ps)[posRespASVtop5, ]

## %in% rather than ==, so that samples with a missing land type
## count as FALSE instead of NA
psOnlyGrass <- prune_samples(sample_data(ps)$Land.type %in% "Grassland", ps)

psLogMin50NoControl <- prune_samples(!(rownames(sample_data(logMin50ps)) %in% c("C1.1","C1.2","C2.1","C2.2")), logMin50ps)


#### looking for index bleed ####

## there are some hints that index bleed may be occuring
## let's check

library(phyloseq)

## work from the spatialAnalysis folder of the project checkout
setwd("/home/daniel/Documents/projects/fichtelSoils/fichtelgebirgeSoils/spatialAnalysis")

## the raw phyloseq object, then the log-transformed one with the
## minimum cutoff applied
load("/home/daniel/Documents/projects/fichtelSoils/fichtelgebirgeSoils/sulariData/sulariPhyloseqObject.rda")
load("/home/daniel/Documents/projects/fichtelSoils/fichtelgebirgeSoils/sulariData/sularilogMin50ps.rda")

## Rank-abundance barplot for one sample of a phyloseq object.
##
## phyObj     : phyloseq object with abundance and taxonomy tables
## sampleName : name of the sample to plot
## ylimit     : upper limit of the y axis
## ntax       : number of most-abundant taxa to draw; NULL (the
##              default) draws every taxon with abundance > 0.
##              Values larger than the number of taxa present are
##              treated as "all of them".
## textatX, textatY : plot coordinates of the "number of unique ASVs"
##              label
##
## Prints the number of taxa with abundance > 0 and draws the barplot
## with bars labelled by Genus. Note that par(cex.axis, mar) is set
## here and left changed after the call.
rankAb <- function(phyObj, sampleName, ylimit=500, ntax=NULL, textatX=100, textatY=(ylimit-40)){
    sampleAbund <- get_taxa(phyObj, sampleName)
    sampleNo0filter <- sampleAbund > 0
    nPresent <- sum(sampleNo0filter)
    if(is.null(ntax)) ntax <- nPresent
    print(nPresent) ## let user know how many unique taxa are in sample
    sampleNo0 <- sort(sampleAbund[sampleNo0filter], decreasing=TRUE)
    ## head() stops at the end of the vector, so asking for more taxa
    ## than the sample contains no longer pads the result with NA
    ## entries that then have no row in the taxonomy table
    sampleNo0 <- head(sampleNo0, ntax)
    taxaNames <- tax_table(phyObj)[ names(sampleNo0), "Genus"]
    nuASV <- paste0("number of unique ASVs = ", nPresent)
    par(cex.axis = .75, mar=c(10,4,4,2))
    barplot(sampleNo0, 
        ylim = c(0,ylimit), 
        main=sampleName, 
        cex.main=2, 
        las=2, 
        names.arg=taxaNames) 
    text(textatX, textatY, nuASV, cex = 2)
}

sample_variables(ps)

## the two controls, 30 most abundant taxa each
rankAb(phyObj = ps, sampleName = "C1.1", ntax = 30)

rankAb(phyObj = ps, sampleName = "C2.1", ntax = 30)

## one regular sample, all taxa
rankAb(phyObj = ps, sampleName = "S5")

## top 20 taxa with a much smaller y axis and the label moved
rankAb(logMin50ps, "S5", ylimit = 0.03, ntax = 20, textatX = 200, textatY = 0.015)

rankAb(ps, "C1.1", ylimit = 0.03, ntax = 20, textatX = 200, textatY = 0.015)

rankAb(logMin50ps, "C1.1", ylimit = 0.03, ntax = 20, textatX = 200, textatY = 0.015)

rankAb(logMin50ps, "S120", ylimit = 0.03, ntax = 20, textatX = 200, textatY = 0.015)

## S120 is a forest sample. the second most common bacteria is E. coli.
## seems likely that there was cross-contamination. Ugh.
sample_data(ps)["S120", ]

## ponder this for a bit. It doesn't seem to be affecting our community-level
## analysis, these otus aren't coming up as important for respiration, etc.