From 7ca5efcdeef72468946ee432194ba4e1e92e3680 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Sat, 19 Jun 2021 14:31:06 +0600
Subject: [PATCH 01/18] hardcoded smartseq settings

---
 zUMIs-dge2.R    | 2 +-
 zUMIs-mapping.R | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/zUMIs-dge2.R b/zUMIs-dge2.R
index 5c222c9..a26c8e9 100644
--- a/zUMIs-dge2.R
+++ b/zUMIs-dge2.R
@@ -33,7 +33,7 @@ if(UMIcheck == "nonUMI"){
   opt$counting_opts$Ham_Dist <- 0
 }
 #is the data Smart-seq3?
-smart3_flag <- ifelse(any(grepl(pattern = "ATTGCGCAATG",x = unlist(opt$sequence_files))), TRUE, FALSE)
+smart3_flag <- TRUE #ifelse(any(grepl(pattern = "ATTGCGCAATG",x = unlist(opt$sequence_files))), TRUE, FALSE)
 
 #######################################################################
 ##### Barcode handling & chunking
diff --git a/zUMIs-mapping.R b/zUMIs-mapping.R
index 9f5db0d..504132d 100755
--- a/zUMIs-mapping.R
+++ b/zUMIs-mapping.R
@@ -29,7 +29,7 @@ if(inp$which_Stage == "Filtering"){
 
 genome_size <- system(command = paste("du -sh",inp$reference$STAR_index,"| cut -f1"), intern = TRUE)
 genome_size <- as.numeric(gsub(pattern = "G",replacement = "", x = genome_size))
-num_star_instances <- floor(inp$mem_limit/genome_size)
+num_star_instances <- 16 #floor(inp$mem_limit/genome_size)
 
 # GTF file setup ----------------------------------------------------------
 #in case of additional sequences, we need to create a custom GTF
@@ -81,12 +81,12 @@ getmode <- function(v) {
   uniqv[which.max(tabulate(match(v, uniqv)))]
 }
 
-cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
+cDNA_read_length <- 150 #getmode(nchar(cDNA_peek$V1))
 
 
 # Setup STAR mapping ------------------------------------------------------
 samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
-avail_cores <- inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
+avail_cores <- 16 # inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
 if(inp$which_Stage == "Filtering"){
   avail_cores <- floor(avail_cores / num_star_instances)
 }

From fbe994ab2e1be35b87d0528935a1291d247a956d Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Mon, 21 Jun 2021 08:13:00 +0600
Subject: [PATCH 02/18] set up smartseq run with STAR 2.7.9a

---
 mergeBAM.sh          |  2 +-
 runfeatureCountFUN.R |  4 ++--
 zUMIs-dge2.R         |  3 ++-
 zUMIs-mapping.R      | 32 +++++++++++++++++++-------------
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/mergeBAM.sh b/mergeBAM.sh
index 7c2909f..7d48323 100755
--- a/mergeBAM.sh
+++ b/mergeBAM.sh
@@ -13,7 +13,7 @@ yaml=$6
 samtoolsexc=$7
 
 ls $tmpMerge/$project.*.filtered.tagged.bam > $tmpMerge/$project.bamlist.txt
-#samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
+samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
 
 cat $tmpMerge/$project.*.BCstats.txt > $outdir/$project.BCstats.txt
 
diff --git a/runfeatureCountFUN.R b/runfeatureCountFUN.R
index 2ca4fd9..9e2a32d 100755
--- a/runfeatureCountFUN.R
+++ b/runfeatureCountFUN.R
@@ -348,8 +348,8 @@ get_gr <- function(y){GenomicRanges::makeGRangesFromDataFrame(y,
 
 .runFeatureCount<-function(abamfile,RG,saf,strand,type,primaryOnly,cpu,mem,fcounts_clib,multi_overlap_var){
   print(paste0("Assigning reads to features (",type,")"))
-  #  fc.stat<-Rsubread::featureCounts(files=abamfile,
-     fc.stat <- featureCounts(files=abamfile,
+    fc.stat<-Rsubread::featureCounts(files=abamfile,
+  #   fc.stat <- featureCounts(files=abamfile,
                                    annot.ext=saf,
                                    isGTFAnnotationFile=F,
                                    primaryOnly=primaryOnly,
diff --git a/zUMIs-dge2.R b/zUMIs-dge2.R
index eb3d052..43b97fd 100644
--- a/zUMIs-dge2.R
+++ b/zUMIs-dge2.R
@@ -66,7 +66,8 @@ try(gene_name_mapping <- .get_gene_names(gtf = paste0(opt$out_dir,"/",opt$projec
 try(data.table::fwrite(gene_name_mapping, file = paste0(opt$out_dir,"/zUMIs_output/expression/",opt$project,".gene_names.txt"), sep ="\t", quote = FALSE), silent = TRUE)
 ##
 
-if(smart3_flag & opt$counting_opts$strand == 1){
+#if(smart3_flag & opt$counting_opts$strand == 1){
+if(TRUE){
   #split bam in UMU ends and internal
   print("Preparing Smart-seq3 data for stranded gene assignment...")
   print(Sys.time())
diff --git a/zUMIs-mapping.R b/zUMIs-mapping.R
index 35b44ec..1196ed8 100755
--- a/zUMIs-mapping.R
+++ b/zUMIs-mapping.R
@@ -25,7 +25,7 @@ if(inp$which_Stage == "Filtering"){
 }else{
   filtered_bams <- paste0(inp$out_dir,"/",inp$project,".filtered.tagged.unmapped.bam") # for resuming from mapping state using the merged unmapped bam
 }
-
+print("1")
 
 # check if multiple STAR instances can be run -----------------------------
 
@@ -34,14 +34,16 @@ genome_size <- as.numeric(gsub(pattern = "G",replacement = "", x = genome_size))
 if(is.na(genome_size)){
   genome_size <- 25 #set average genome size if there was a problem detecting
 }
-num_star_instances <- 16 #floor(inp$mem_limit/genome_size)
+print(genome_size)
+num_star_instances <- floor(inp$mem_limit/genome_size)
 if(num_star_instances < 1){
   num_star_instances = 1 #set the number of STAR instances to 1 if it is 0
 }
 if(num_star_instances > inp$num_threads){
   num_star_instances = inp$num_threads
 } 
-
+print(num_star_instances)
+print("2")
 # GTF file setup ----------------------------------------------------------
 #in case of additional sequences, we need to create a custom GTF
 
@@ -77,13 +79,13 @@ if ( is.null(additional_fq[1]) | length(additional_fq)==0 ) {
   gtf_to_use <- paste(inp$out_dir,"/",inp$project,".final_annot.gtf",sep="")
   param_additional_fa <- paste("--genomeFastaFiles",paste(inp$reference$additional_files,collapse = " "))
 }
-
-#inp$reference$GTF_file_final <- gtf_to_use
-#yaml::write_yaml(inp,file = paste(inp$out_dir,"/",inp$project,".postmap.yaml",sep=""))
+print("3")
+inp$reference$GTF_file_final <- gtf_to_use
+yaml::write_yaml(inp,file = paste(inp$out_dir,"/",inp$project,".postmap.yaml",sep=""))
 
 # Detect read length ------------------------------------------------------
 #check the first 100 reads to detect the read length of the cDNA read
-#filtered_bam <- paste(inp$out_dir,"/",inp$project,".filtered.tagged.bam",sep="")
+filtered_bam <- paste(inp$out_dir,"/",inp$project,".filtered.tagged.bam",sep="")
 
 cDNA_peek <- data.table::fread(cmd = paste(samtools,"view",filtered_bams[1],"| cut -f10 | head -n 1000"),stringsAsFactors = F,data.table = T, header = F)
 
@@ -92,9 +94,9 @@ getmode <- function(v) {
   uniqv[which.max(tabulate(match(v, uniqv)))]
 }
 
-cDNA_read_length <- 150 #getmode(nchar(cDNA_peek$V1))
-
-
+cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
+print(cDNA_read_length)
+print("4")
 # Setup STAR mapping ------------------------------------------------------
 samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
 avail_cores <- 16 # inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
@@ -112,13 +114,17 @@ param_misc <- paste("--genomeDir",inp$reference$STAR_index,
                     "--sjdbGTFfile",gtf_to_use,
                     "--runThreadN",avail_cores,
                     "--sjdbOverhang", cDNA_read_length-1,
-                    "--readFilesType SAM",inp$read_layout)
+#                    "--sjdbOverhang 149",
+                    "--readFilesType SAM PE",
+	            "--genomeSAindexNbases 11",
+		    "--limitOutSJcollapsed 5000000",
+                    inp$read_layout)
 
 STAR_command <- paste(STAR_exec,param_defaults,param_misc,inp$reference$additional_STAR_params,param_additional_fa)
 if(inp$counting_opts$twoPass==TRUE){
   STAR_command <- paste(STAR_command,"--twopassMode Basic")
 }
-
+print("5")
 #finally, run STAR
 if(num_star_instances>1 & inp$which_Stage == "Filtering"){
   map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
@@ -154,7 +160,7 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
     system(STAR_command)
   }
 }
-
+print("6")
 
 #clean up chunked bam files
 if(inp$which_Stage == "Filtering"){

From eb7646317f00b0223f427b4531b33c79aadc7d85 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Tue, 22 Jun 2021 06:09:12 +0600
Subject: [PATCH 03/18] correct counting by excluding unmapped reads from
 chromosomes

---
 UMIstuffFUN.R         | 14 ++++++++++++++
 mergeBAM.sh           |  2 +-
 misc/countUMIfrags.py |  2 +-
 runfeatureCountFUN.R  |  4 ++--
 zUMIs-dge2.R          | 11 +++++++----
 zUMIs-mapping.R       |  9 ++++-----
 6 files changed, 29 insertions(+), 13 deletions(-)

diff --git a/UMIstuffFUN.R b/UMIstuffFUN.R
index b35aba0..cbd967e 100644
--- a/UMIstuffFUN.R
+++ b/UMIstuffFUN.R
@@ -68,6 +68,17 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
     taglist <- c(taglist, "GI")
   }
 
+  idxstats <- idxstats[grep("\\*", idxstats[,1], invert = TRUE), ]
+print(idxstats)
+#print(str(idxstats))
+#print(levels(idxstats[,"seqnames"]))
+
+idxstats[,1] <- droplevels(idxstats[,1])
+print(levels(idxstats[,"seqnames"]))
+
+idxstats[,1] <- factor(as.character(idxstats[,1]), levels = seqnames(seqinfo(BamFile(featfile))))
+
+#print(idxstats)
   rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
     if(opt$read_layout == "PE"){
       parms <- ScanBamParam(tag=taglist,
@@ -83,6 +94,9 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
     }
 
     dat <- scanBam(file = featfile, param = parms)
+#print(head(dat))
+#print(dim(dat))
+#print(str(dat))
     if(inex){
       dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE, GEin = dat[[1]]$tag$GI)
     }else{
diff --git a/mergeBAM.sh b/mergeBAM.sh
index 7d48323..7c2909f 100755
--- a/mergeBAM.sh
+++ b/mergeBAM.sh
@@ -13,7 +13,7 @@ yaml=$6
 samtoolsexc=$7
 
 ls $tmpMerge/$project.*.filtered.tagged.bam > $tmpMerge/$project.bamlist.txt
-samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
+#samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
 
 cat $tmpMerge/$project.*.BCstats.txt > $outdir/$project.BCstats.txt
 
diff --git a/misc/countUMIfrags.py b/misc/countUMIfrags.py
index aa7e1d5..775529f 100755
--- a/misc/countUMIfrags.py
+++ b/misc/countUMIfrags.py
@@ -24,7 +24,7 @@ def count_UMItags(inpath, bcs, chr):
         bc = read.get_tag('BC')
         if bc in bcs:
             ub = read.get_tag('UB')
-            if ub is '':
+            if ub == '':
                 bccounts[bc]['int'] += 1
             else:
                  bccounts[bc]['umi'] += 1
diff --git a/runfeatureCountFUN.R b/runfeatureCountFUN.R
index 9e2a32d..26e6e08 100755
--- a/runfeatureCountFUN.R
+++ b/runfeatureCountFUN.R
@@ -348,8 +348,8 @@ get_gr <- function(y){GenomicRanges::makeGRangesFromDataFrame(y,
 
 .runFeatureCount<-function(abamfile,RG,saf,strand,type,primaryOnly,cpu,mem,fcounts_clib,multi_overlap_var){
   print(paste0("Assigning reads to features (",type,")"))
-    fc.stat<-Rsubread::featureCounts(files=abamfile,
-  #   fc.stat <- featureCounts(files=abamfile,
+  #  fc.stat<-Rsubread::featureCounts(files=abamfile,
+    fc.stat <- featureCounts(files=abamfile,
                                    annot.ext=saf,
                                    isGTFAnnotationFile=F,
                                    primaryOnly=primaryOnly,
diff --git a/zUMIs-dge2.R b/zUMIs-dge2.R
index 43b97fd..737d2ba 100644
--- a/zUMIs-dge2.R
+++ b/zUMIs-dge2.R
@@ -8,7 +8,7 @@ suppressPackageStartupMessages(library(Rsamtools))
 ##########################
 myYaml <- commandArgs(trailingOnly = T)
 
-opt   <-read_yaml(myYaml)
+opt   <- read_yaml(myYaml)
 setwd(opt$out_dir)
 #try(unixtools::set.tempdir(opt$out_dir))
 source(paste0(opt$zUMIs_directory,"/runfeatureCountFUN.R"))
@@ -26,6 +26,7 @@ data.table::setDTthreads(threads=1)
 fcounts_clib <- paste0(opt$zUMIs_directory,"/misc/fcountsLib2")
 
 opt <- fixMissingOptions(opt)
+print(opt)
 #######################################################################
 ########################## double check for non-UMI method
 UMIcheck <- check_nonUMIcollapse(opt$sequence_files)
@@ -33,7 +34,7 @@ if(UMIcheck == "nonUMI"){
   opt$counting_opts$Ham_Dist <- 0
 }
 #is the data Smart-seq3?
-smart3_flag <- TRUE #ifelse(any(grepl(pattern = "ATTGCGCAATG",x = unlist(opt$sequence_files))), TRUE, FALSE)
+smart3_flag <- ifelse(any(grepl(pattern = "ATTGCGCAATG", x = unlist(opt$sequence_files))), TRUE, FALSE)
 
 #######################################################################
 ##### Barcode handling & chunking
@@ -66,8 +67,7 @@ try(gene_name_mapping <- .get_gene_names(gtf = paste0(opt$out_dir,"/",opt$projec
 try(data.table::fwrite(gene_name_mapping, file = paste0(opt$out_dir,"/zUMIs_output/expression/",opt$project,".gene_names.txt"), sep ="\t", quote = FALSE), silent = TRUE)
 ##
 
-#if(smart3_flag & opt$counting_opts$strand == 1){
-if(TRUE){
+if(smart3_flag & opt$counting_opts$strand == 1){
   #split bam in UMU ends and internal
   print("Preparing Smart-seq3 data for stranded gene assignment...")
   print(Sys.time())
@@ -210,6 +210,9 @@ if( opt$counting_opts$introns ){
 for(i in unique(bccount$chunkID)){
      print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
      print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
+print(sortbamfile)
+print(bccount)
+print(opt$counting_opts$introns)
      reads <- reads2genes_new(featfile = sortbamfile,
                               bccount  = bccount,
                               inex     = opt$counting_opts$introns,
diff --git a/zUMIs-mapping.R b/zUMIs-mapping.R
index 1196ed8..c9b6dae 100755
--- a/zUMIs-mapping.R
+++ b/zUMIs-mapping.R
@@ -113,12 +113,11 @@ param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_
 param_misc <- paste("--genomeDir",inp$reference$STAR_index,
                     "--sjdbGTFfile",gtf_to_use,
                     "--runThreadN",avail_cores,
-                    "--sjdbOverhang", cDNA_read_length-1,
+#                    "--sjdbOverhang", cDNA_read_length-1,
 #                    "--sjdbOverhang 149",
-                    "--readFilesType SAM PE",
+                    "--readFilesType SAM", inp$read_layout,
 	            "--genomeSAindexNbases 11",
-		    "--limitOutSJcollapsed 5000000",
-                    inp$read_layout)
+		    "--limitOutSJcollapsed 5000000")
 
 STAR_command <- paste(STAR_exec,param_defaults,param_misc,inp$reference$additional_STAR_params,param_additional_fa)
 if(inp$counting_opts$twoPass==TRUE){
@@ -143,7 +142,7 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
   #after parallel instance STAR, collect output data in the usual file places
   out_logs <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Log.final.out"), full = TRUE)
   merge_logs <- paste("cat",paste(out_logs, collapse = " "),">",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Log.final.out"))
-  out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.out.bam"), full = TRUE)
+  out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*Aligned.out.bam"), full = TRUE)
   merge_bams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam"),paste(out_bams, collapse = " "))
   out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
   merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))

From 37ec456e7e61904b4282aae3007e1598cbf37d72 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 23 Jun 2021 14:42:24 +0900
Subject: [PATCH 04/18] remove verbose logs for debugging

---
 UMIstuffFUN.R        | 16 ++++------------
 runfeatureCountFUN.R |  3 +--
 zUMIs-dge2.R         |  4 +---
 zUMIs-mapping.R      | 11 -----------
 4 files changed, 6 insertions(+), 28 deletions(-)
 mode change 100755 => 100644 zUMIs-mapping.R

diff --git a/UMIstuffFUN.R b/UMIstuffFUN.R
index cbd967e..441126b 100644
--- a/UMIstuffFUN.R
+++ b/UMIstuffFUN.R
@@ -68,17 +68,12 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
     taglist <- c(taglist, "GI")
   }
 
+  # remove special characters for unmapped reads from list of chromosomes
   idxstats <- idxstats[grep("\\*", idxstats[,1], invert = TRUE), ]
-print(idxstats)
-#print(str(idxstats))
-#print(levels(idxstats[,"seqnames"]))
+  idxstats[,1] <- droplevels(idxstats[,1])
+  idxstats[,1] <- factor(as.character(idxstats[,1]), levels = seqnames(seqinfo(BamFile(featfile))))
+  print(idxstats)
 
-idxstats[,1] <- droplevels(idxstats[,1])
-print(levels(idxstats[,"seqnames"]))
-
-idxstats[,1] <- factor(as.character(idxstats[,1]), levels = seqnames(seqinfo(BamFile(featfile))))
-
-#print(idxstats)
   rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
     if(opt$read_layout == "PE"){
       parms <- ScanBamParam(tag=taglist,
@@ -94,9 +89,6 @@ idxstats[,1] <- factor(as.character(idxstats[,1]), levels = seqnames(seqinfo(Bam
     }
 
     dat <- scanBam(file = featfile, param = parms)
-#print(head(dat))
-#print(dim(dat))
-#print(str(dat))
     if(inex){
       dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE, GEin = dat[[1]]$tag$GI)
     }else{
diff --git a/runfeatureCountFUN.R b/runfeatureCountFUN.R
index 26e6e08..336ade1 100755
--- a/runfeatureCountFUN.R
+++ b/runfeatureCountFUN.R
@@ -348,8 +348,7 @@ get_gr <- function(y){GenomicRanges::makeGRangesFromDataFrame(y,
 
 .runFeatureCount<-function(abamfile,RG,saf,strand,type,primaryOnly,cpu,mem,fcounts_clib,multi_overlap_var){
   print(paste0("Assigning reads to features (",type,")"))
-  #  fc.stat<-Rsubread::featureCounts(files=abamfile,
-    fc.stat <- featureCounts(files=abamfile,
+     fc.stat <- featureCounts(files=abamfile,
                                    annot.ext=saf,
                                    isGTFAnnotationFile=F,
                                    primaryOnly=primaryOnly,
diff --git a/zUMIs-dge2.R b/zUMIs-dge2.R
index 737d2ba..34ebbd0 100644
--- a/zUMIs-dge2.R
+++ b/zUMIs-dge2.R
@@ -210,9 +210,7 @@ if( opt$counting_opts$introns ){
 for(i in unique(bccount$chunkID)){
      print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
      print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
-print(sortbamfile)
-print(bccount)
-print(opt$counting_opts$introns)
+     print(sortbamfile)
      reads <- reads2genes_new(featfile = sortbamfile,
                               bccount  = bccount,
                               inex     = opt$counting_opts$introns,
diff --git a/zUMIs-mapping.R b/zUMIs-mapping.R
old mode 100755
new mode 100644
index c9b6dae..1bbc47c
--- a/zUMIs-mapping.R
+++ b/zUMIs-mapping.R
@@ -25,7 +25,6 @@ if(inp$which_Stage == "Filtering"){
 }else{
   filtered_bams <- paste0(inp$out_dir,"/",inp$project,".filtered.tagged.unmapped.bam") # for resuming from mapping state using the merged unmapped bam
 }
-print("1")
 
 # check if multiple STAR instances can be run -----------------------------
 
@@ -34,7 +33,6 @@ genome_size <- as.numeric(gsub(pattern = "G",replacement = "", x = genome_size))
 if(is.na(genome_size)){
   genome_size <- 25 #set average genome size if there was a problem detecting
 }
-print(genome_size)
 num_star_instances <- floor(inp$mem_limit/genome_size)
 if(num_star_instances < 1){
   num_star_instances = 1 #set the number of STAR instances to 1 if it is 0
@@ -42,8 +40,6 @@ if(num_star_instances < 1){
 if(num_star_instances > inp$num_threads){
   num_star_instances = inp$num_threads
 } 
-print(num_star_instances)
-print("2")
 # GTF file setup ----------------------------------------------------------
 #in case of additional sequences, we need to create a custom GTF
 
@@ -79,7 +75,6 @@ if ( is.null(additional_fq[1]) | length(additional_fq)==0 ) {
   gtf_to_use <- paste(inp$out_dir,"/",inp$project,".final_annot.gtf",sep="")
   param_additional_fa <- paste("--genomeFastaFiles",paste(inp$reference$additional_files,collapse = " "))
 }
-print("3")
 inp$reference$GTF_file_final <- gtf_to_use
 yaml::write_yaml(inp,file = paste(inp$out_dir,"/",inp$project,".postmap.yaml",sep=""))
 
@@ -95,8 +90,6 @@ getmode <- function(v) {
 }
 
 cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
-print(cDNA_read_length)
-print("4")
 # Setup STAR mapping ------------------------------------------------------
 samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
 avail_cores <- 16 # inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
@@ -113,8 +106,6 @@ param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_
 param_misc <- paste("--genomeDir",inp$reference$STAR_index,
                     "--sjdbGTFfile",gtf_to_use,
                     "--runThreadN",avail_cores,
-#                    "--sjdbOverhang", cDNA_read_length-1,
-#                    "--sjdbOverhang 149",
                     "--readFilesType SAM", inp$read_layout,
 	            "--genomeSAindexNbases 11",
 		    "--limitOutSJcollapsed 5000000")
@@ -123,7 +114,6 @@ STAR_command <- paste(STAR_exec,param_defaults,param_misc,inp$reference$addition
 if(inp$counting_opts$twoPass==TRUE){
   STAR_command <- paste(STAR_command,"--twopassMode Basic")
 }
-print("5")
 #finally, run STAR
 if(num_star_instances>1 & inp$which_Stage == "Filtering"){
   map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
@@ -159,7 +149,6 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
     system(STAR_command)
   }
 }
-print("6")
 
 #clean up chunked bam files
 if(inp$which_Stage == "Filtering"){

From b7e3ba6be09ffa810c5ca7b86f44fceea4608d6f Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 23 Jun 2021 14:50:52 +0900
Subject: [PATCH 05/18] update version and documentation

---
 README.md | 2 ++
 zUMIs.sh  | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d21d381..2eb98b6 100755
--- a/README.md
+++ b/README.md
@@ -31,6 +31,8 @@ We provide a script to convert zUMIs output into loom file automatically based o
 zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.
 
 ## Changelog
+23 June 2021: zUMIs2.9.7: merged code from @TomKellyGenetics. Update for compatbility with STAR 2.7.9a and samtools 1.3.1-3. Prevent crash when unmapped reads are counted as a special character for chromosomes from samtools idxstats is not permitted as a factor levels. Excludes unmapped reads for counting.
+
 07 Apr 2021: zUMIs2.9.6: Fixed a potential crash when the input file to the barcode sharing feature does not contain an equal number of columns.
 
 18 Feb 2021: zUMIs2.9.5: Mismatches for detecting Smart-seq3 UMI-read pattern are now user-settable in the YAML file as follows: `find_pattern: ATTGCGCAATG;2` would allow 2 mismatches. Usage as prior to this version (ie `find_pattern: ATTGCGCAATG`) will default to the previous value of 1 mismatch allowed.
diff --git a/zUMIs.sh b/zUMIs.sh
index aa7238f..dd54654 100755
--- a/zUMIs.sh
+++ b/zUMIs.sh
@@ -3,7 +3,7 @@
 # Pipeline to run UMI-seq analysis from fastq to read count tables.
 # Authors: Swati Parekh, Christoph Ziegenhain, Beate Vieth & Ines Hellmann
 # Contact: sparekh@age.mpg.de or christoph.ziegenhain@ki.se
-vers=2.9.6
+vers=2.9.7
 currentv=$(curl -s https://raw.githubusercontent.com/sdparekh/zUMIs/main/zUMIs.sh | grep '^vers=' | cut -f2 -d "=")
 if [ "$currentv" != "$vers" ] ; then
     echo -e "------------- \n\n Good news! A newer version of zUMIs is available at https://github.com/sdparekh/zUMIs \n\n-------------";

From 3cc73a1d6ffa28168418bab37cfcc334bfa5afab Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 23 Jun 2021 14:56:11 +0900
Subject: [PATCH 06/18] remove hardcoded variables

---
 zUMIs-mapping.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zUMIs-mapping.R b/zUMIs-mapping.R
index 1bbc47c..3332873 100644
--- a/zUMIs-mapping.R
+++ b/zUMIs-mapping.R
@@ -92,7 +92,7 @@ getmode <- function(v) {
 cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
 # Setup STAR mapping ------------------------------------------------------
 samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
-avail_cores <- 16 # inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
+avail_cores <- inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
 if(inp$which_Stage == "Filtering"){
   avail_cores <- floor(avail_cores / num_star_instances)
 }

From d4b5c85f69ee6cb39ee8c138f6b260f71699fcd5 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 30 Jun 2021 10:18:35 +0900
Subject: [PATCH 07/18] add Docker configuration

---
 Dockerfile      | 82 +++++++++++++++++++++++++++++++++++++++++++++++++
 podman_build.sh |  6 ++++
 2 files changed, 88 insertions(+)
 create mode 100644 Dockerfile
 create mode 100644 podman_build.sh

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..fb99b29
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,82 @@
+FROM ubuntu:bionic
+
+#RUN /bin/sh -c [ -z "$(apt-get indextargets)" ]
+
+RUN set -xe 		&& echo '#!/bin/sh' > /usr/sbin/policy-rc.d 	&& echo 'exit 101' >> /usr/sbin/policy-rc.d 	&& chmod +x /usr/sbin/policy-rc.d 		&& dpkg-divert --local --rename --add /sbin/initctl 	&& cp -a /usr/sbin/policy-rc.d /sbin/initctl 	&& sed -i 's/^exit.*/exit 0/' /sbin/initctl 		&& echo 'force-unsafe-io' > /etc/dpkg/dpkg.cfg.d/docker-apt-speedup 		&& echo 'DPkg::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };' > /etc/apt/apt.conf.d/docker-clean 	&& echo 'APT::Update::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };' >> /etc/apt/apt.conf.d/docker-clean 	&& echo 'Dir::Cache::pkgcache ""; Dir::Cache::srcpkgcache "";' >> /etc/apt/apt.conf.d/docker-clean 		&& echo 'Acquire::Languages "none";' > /etc/apt/apt.conf.d/docker-no-languages 		&& echo 'Acquire::GzipIndexes "true"; Acquire::CompressionTypes::Order:: "gz";' > /etc/apt/apt.conf.d/docker-gzip-indexes 		&& echo 'Apt::AutoRemove::SuggestsImportant "false";' > /etc/apt/apt.conf.d/docker-autoremove-suggests
+
+RUN mkdir -p /run/systemd && echo 'docker' > /run/systemd/container
+
+RUN apt-get update \
+ && apt-get install -y \
+ software-properties-common \
+ build-essential \
+ && apt-get update
+
+ENV DEBIAN_FRONTEND="noninteractive" TZ="Asia/Tokyo"
+
+ENV PATH="/root/miniconda3/bin:${PATH}"
+ARG PATH="/root/miniconda3/bin:${PATH}"
+
+RUN apt-get update \
+  && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
+
+RUN wget \
+  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+  && mkdir /root/.conda \
+  && bash Miniconda3-latest-Linux-x86_64.sh -b \
+  && rm -f Miniconda3-latest-Linux-x86_64.sh 
+RUN conda --version
+
+RUN apt-get update \
+ && apt-get upgrade -y \
+ && apt-get install -y --fix-missing \
+ git \
+ nano \
+ r-base r-recommended \
+ samtools \
+ htop \
+ pigz \
+ wget \
+ curl \
+ libcurl4-openssl-dev \
+ libcairo2-dev cairo-dock cairo-dock-plug-ins \
+ libxt-dev \
+ libharfbuzz-dev libfribidi-dev \
+ libhdf5-dev \
+ libgl1-mesa-glx libegl1-mesa \
+ libxrandr2 libxrandr2 \
+ libxss1 libxcursor1 libxcomposite1 \
+ libasound2 libxi6 libxtst6 \
+ libssl-dev \
+ python-pip
+
+RUN conda install numpy scipy cython numba matplotlib scikit-learn h5py click
+
+RUN pip install pysam velocyto
+
+RUN apt update -qq \
+  && apt install --no-install-recommends software-properties-common dirmngr \
+  && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+  && add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/" \
+  && apt install -y --no-install-recommends r-base
+
+RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' \
+  && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+  && apt update \
+  && gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+  && gpg -a --export E298A3A825C0D65DFD57CBB651716619E084DAB9 | apt-key add - \
+  && apt-get install -y r-base r-base-core r-recommended r-base-dev
+
+RUN R -e 'install.packages(c("yaml", "BiocManager", "Cairo", "ggplot2", "cowplot", "data.table", "devtools", "dplyr", "ggraster", "ggrastr", "hdf5r", "inflection", "ragg", "textshaping"), dependencies = TRUE)'
+
+RUN R -e 'BiocManager::install(c("GenomicFeatures", "plyranges", "Rhtslib", "Rsamtools"), ask = FALSE)'
+
+RUN R -e 'install.packages("devtools"); devtools::install_github(repo = "hhoeflin/hdf5r"); devtools::install_github(repo = "mojaveazure/loomR", ref = "develop")'
+
+RUN mkdir -p home && cd home && git clone https://github.com/TomKellyGenetics/zUMIs.git
+
+RUN cd /home  && wget https://github.com/alexdobin/STAR/archive/2.7.9a.tar.gz && tar -xzf 2.7.9a.tar.gz && cd STAR-2.7.9a/source && make
+
+CMD ["/bin/bash"]
+
+#bash
diff --git a/podman_build.sh b/podman_build.sh
new file mode 100644
index 0000000..e5fa956
--- /dev/null
+++ b/podman_build.sh
@@ -0,0 +1,6 @@
+#sleep 86400
+podman build -t zumis:v2.9.7 .
+podman tag zumis:v2.9.7 tomkellygenetics/zumis:v2.9.7
+podman tag zumis:v2.9.7 tomkellygenetics/zumis:latest
+podman push tomkellygenetics/zumis:v2.9.7
+podman push tomkellygenetics/zumis:latest

From 3c3782d053fdac0f0145873acc2df2bcf29436b8 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 30 Jun 2021 15:56:22 +0900
Subject: [PATCH 08/18] update docker build to install dependencies

---
 Dockerfile | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index fb99b29..41f79db 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -40,6 +40,7 @@ RUN apt-get update \
  curl \
  libcurl4-openssl-dev \
  libcairo2-dev cairo-dock cairo-dock-plug-ins \
+ libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev \
  libxt-dev \
  libharfbuzz-dev libfribidi-dev \
  libhdf5-dev \
@@ -71,11 +72,20 @@ RUN R -e 'install.packages(c("yaml", "BiocManager", "Cairo", "ggplot2", "cowplot
 
 RUN R -e 'BiocManager::install(c("GenomicFeatures", "plyranges", "Rhtslib", "Rsamtools"), ask = FALSE)'
 
+RUN echo 'PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"' > /root/.Renviron
+
 RUN R -e 'install.packages("devtools"); devtools::install_github(repo = "hhoeflin/hdf5r"); devtools::install_github(repo = "mojaveazure/loomR", ref = "develop")'
 
 RUN mkdir -p home && cd home && git clone https://github.com/TomKellyGenetics/zUMIs.git
 
-RUN cd /home  && wget https://github.com/alexdobin/STAR/archive/2.7.9a.tar.gz && tar -xzf 2.7.9a.tar.gz && cd STAR-2.7.9a/source && make
+RUN cd /home  && wget https://github.com/alexdobin/STAR/archive/2.7.9a.tar.gz \
+ && tar xf 2.7.9a.tar.gz \
+ && rm 2.7.9a.tar.gz \
+ && cd STAR-2.7.9a/source \
+ && make \
+ && mv ./STAR* /usr/bin \
+ && cd ../.. \
+ && rm -rf STAR-2.7.9a
 
 CMD ["/bin/bash"]
 

From 8fd3c6ba246324db1ac520f0e89390b8a3e21287 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 30 Jun 2021 16:25:14 +0900
Subject: [PATCH 09/18] Argument list too long for ls
 https://unix.stackexchange.com/questions/38955/argument-list-too-long-for-ls

---
 splitfq.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/splitfq.sh b/splitfq.sh
index 709eeb2..be6b984 100755
--- a/splitfq.sh
+++ b/splitfq.sh
@@ -38,7 +38,7 @@ function splitfqgz() {
   d=`dirname $fqfile`
   $pexc -dc -p $nthreads $d/$pref.gz | split --lines=$nl --filter=''$pexc' -p '$nthreads' > $FILE.gz' - $t$pref$project
 
-	ls $t$pref$project* | sed "s|$t$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
+	find -type f -name $t$pref$project* | sed "s|$t$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
 
 	exit 1
 }

From 10d91c09a42e4716742ff3024e28e976662d8862 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 30 Jun 2021 16:26:41 +0900
Subject: [PATCH 10/18] Argument list too long for ls
 https://unix.stackexchange.com/questions/38955/argument-list-too-long-for-ls

---
 splitfq.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/splitfq.sh b/splitfq.sh
index be6b984..7b2bc64 100755
--- a/splitfq.sh
+++ b/splitfq.sh
@@ -38,7 +38,7 @@ function splitfqgz() {
   d=`dirname $fqfile`
   $pexc -dc -p $nthreads $d/$pref.gz | split --lines=$nl --filter=''$pexc' -p '$nthreads' > $FILE.gz' - $t$pref$project
 
-	find -type f -name $t$pref$project* | sed "s|$t$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
+	find -type f -name '$t$pref$project*' | sed "s|$t$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
 
 	exit 1
 }

From f3e1cc28864757108987ce6ea7005b40cb84c75f Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 30 Jun 2021 16:48:03 +0900
Subject: [PATCH 11/18] find arguments too long

---
 splitfq.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/splitfq.sh b/splitfq.sh
index 7b2bc64..f0e9bf6 100755
--- a/splitfq.sh
+++ b/splitfq.sh
@@ -38,7 +38,9 @@ function splitfqgz() {
   d=`dirname $fqfile`
   $pexc -dc -p $nthreads $d/$pref.gz | split --lines=$nl --filter=''$pexc' -p '$nthreads' > $FILE.gz' - $t$pref$project
 
-	find -type f -name '$t$pref$project*' | sed "s|$t$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
+	ls $t$pref$project* > $t/$project.listPrefix.txt
+        sed -i "s|$t$pref$project||" $t/$project.listPrefix.txt
+        sed -i's/.gz//' $t/$project.listPrefix.txt
 
 	exit 1
 }

From 4937d8c4058852466c49ee8abd56164682f3a2ca Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 30 Jun 2021 16:50:01 +0900
Subject: [PATCH 12/18] ls without arguments

---
 splitfq.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/splitfq.sh b/splitfq.sh
index f0e9bf6..27e9bef 100755
--- a/splitfq.sh
+++ b/splitfq.sh
@@ -38,9 +38,9 @@ function splitfqgz() {
   d=`dirname $fqfile`
   $pexc -dc -p $nthreads $d/$pref.gz | split --lines=$nl --filter=''$pexc' -p '$nthreads' > $FILE.gz' - $t$pref$project
 
-	ls $t$pref$project* > $t/$project.listPrefix.txt
-        sed -i "s|$t$pref$project||" $t/$project.listPrefix.txt
-        sed -i's/.gz//' $t/$project.listPrefix.txt
+        cd  $t$pref$project
+        ls | sed 's/.gz//' > $t/$project.listPrefix.txt
+        cd -
 
 	exit 1
 }

From 153b53b171ee9e868b84a62ee465de2a7697f8cf Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Wed, 30 Jun 2021 22:35:55 +0900
Subject: [PATCH 13/18] resolve bug for merge temp files for large inputs

---
 Dockerfile | 2 +-
 splitfq.sh | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 41f79db..8844500 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -53,7 +53,7 @@ RUN apt-get update \
 
 RUN conda install numpy scipy cython numba matplotlib scikit-learn h5py click
 
-RUN pip install pysam velocyto
+RUN pip install pysam velocyto; pip2 install pysam
 
 RUN apt update -qq \
   && apt install --no-install-recommends software-properties-common dirmngr \
diff --git a/splitfq.sh b/splitfq.sh
index 27e9bef..523e681 100755
--- a/splitfq.sh
+++ b/splitfq.sh
@@ -38,9 +38,7 @@ function splitfqgz() {
   d=`dirname $fqfile`
   $pexc -dc -p $nthreads $d/$pref.gz | split --lines=$nl --filter=''$pexc' -p '$nthreads' > $FILE.gz' - $t$pref$project
 
-        cd  $t$pref$project
-        ls | sed 's/.gz//' > $t/$project.listPrefix.txt
-        cd -
+        ls $t| grep $pref$project | sed "s|$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
 
 	exit 1
 }

From 2757a7fcc35faf0b1526e34b33fffc91e13bec8f Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Thu, 1 Jul 2021 10:27:19 +0900
Subject: [PATCH 14/18] Argument list too long for rm tmpMerge

---
 mergeBAM.sh | 2 +-
 zUMIs.sh    | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/mergeBAM.sh b/mergeBAM.sh
index 7c2909f..7d48323 100755
--- a/mergeBAM.sh
+++ b/mergeBAM.sh
@@ -13,7 +13,7 @@ yaml=$6
 samtoolsexc=$7
 
 ls $tmpMerge/$project.*.filtered.tagged.bam > $tmpMerge/$project.bamlist.txt
-#samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
+samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
 
 cat $tmpMerge/$project.*.BCstats.txt > $outdir/$project.BCstats.txt
 
diff --git a/zUMIs.sh b/zUMIs.sh
index dd54654..8e40812 100755
--- a/zUMIs.sh
+++ b/zUMIs.sh
@@ -261,10 +261,6 @@ if [[ "${whichStage}" == "Filtering" ]] ; then
   for x in ${l} ; do perl ${zumisdir}/fqfilter_v2.pl ${yaml} ${samtoolsexc} ${Rexc} ${pigzexc} ${zumisdir} ${x} & done
   wait
   bash ${zumisdir}/mergeBAM.sh ${zumisdir} ${tmpMerge} ${num_threads} ${project} ${outdir} ${yaml} ${samtoolsexc}
-  for i in ${fqfiles} ; do
-      pref=$(basename ${i} | sed 's/.fastq.gz//' | sed 's/.fq.gz//')
-      rm ${tmpMerge}${pref}*gz
-  done
   date
 
   #run barcode detection
@@ -292,6 +288,8 @@ if
   date
 fi
 
+rm -rf ${tmpMerge}
+
 if
 [[ "${whichStage}" == "Filtering" ]] ||
 [[ "${whichStage}" == "Mapping" ]] ||

From 8f7c841a0fa87ef6636585bd6f1d2cf1c8c317db Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Thu, 1 Jul 2021 10:39:40 +0900
Subject: [PATCH 15/18] add script for building docker container

---
 docker_build.sh | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 docker_build.sh

diff --git a/docker_build.sh b/docker_build.sh
new file mode 100644
index 0000000..e5fa956
--- /dev/null
+++ b/docker_build.sh
@@ -0,0 +1,6 @@
+#sleep 86400
+podman build -t zumis:v2.9.7 .
+podman tag zumis:v2.9.7 tomkellygenetics/zumis:v2.9.7
+podman tag zumis:v2.9.7 tomkellygenetics/zumis:latest
+podman push tomkellygenetics/zumis:v2.9.7
+podman push tomkellygenetics/zumis:latest

From e3e3cccd87d8a02c05f45168c44a3bc761e68e72 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Thu, 1 Jul 2021 10:40:44 +0900
Subject: [PATCH 16/18] update docker build

---
 docker_build.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docker_build.sh b/docker_build.sh
index e5fa956..ca323ab 100644
--- a/docker_build.sh
+++ b/docker_build.sh
@@ -1,6 +1,6 @@
 #sleep 86400
-podman build -t zumis:v2.9.7 .
-podman tag zumis:v2.9.7 tomkellygenetics/zumis:v2.9.7
-podman tag zumis:v2.9.7 tomkellygenetics/zumis:latest
-podman push tomkellygenetics/zumis:v2.9.7
-podman push tomkellygenetics/zumis:latest
+podman build -t zumis2:v2.9.7 .
+podman tag zumis2:v2.9.7 chrzie/zumis2:v2.9.7
+podman tag zumis2:v2.9.7 chrzie/zumis2:latest
+podman push chrzie/zumis2:v2.9.7
+podman push chrzie/zumis2:latest

From fbfd41874ef441e087129e246fcccde3074872b4 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Fri, 2 Jul 2021 16:20:37 +0900
Subject: [PATCH 17/18] Argument list too long for rm

---
 mergeBAM.sh | 2 +-
 splitfq.sh  | 2 +-
 zUMIs.sh    | 6 ++----
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/mergeBAM.sh b/mergeBAM.sh
index 7c2909f..7d48323 100755
--- a/mergeBAM.sh
+++ b/mergeBAM.sh
@@ -13,7 +13,7 @@ yaml=$6
 samtoolsexc=$7
 
 ls $tmpMerge/$project.*.filtered.tagged.bam > $tmpMerge/$project.bamlist.txt
-#samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
+samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
 
 cat $tmpMerge/$project.*.BCstats.txt > $outdir/$project.BCstats.txt
 
diff --git a/splitfq.sh b/splitfq.sh
index 709eeb2..523e681 100755
--- a/splitfq.sh
+++ b/splitfq.sh
@@ -38,7 +38,7 @@ function splitfqgz() {
   d=`dirname $fqfile`
   $pexc -dc -p $nthreads $d/$pref.gz | split --lines=$nl --filter=''$pexc' -p '$nthreads' > $FILE.gz' - $t$pref$project
 
-	ls $t$pref$project* | sed "s|$t$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
+        ls $t| grep $pref$project | sed "s|$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
 
 	exit 1
 }
diff --git a/zUMIs.sh b/zUMIs.sh
index dd54654..d97942f 100755
--- a/zUMIs.sh
+++ b/zUMIs.sh
@@ -261,10 +261,6 @@ if [[ "${whichStage}" == "Filtering" ]] ; then
   for x in ${l} ; do perl ${zumisdir}/fqfilter_v2.pl ${yaml} ${samtoolsexc} ${Rexc} ${pigzexc} ${zumisdir} ${x} & done
   wait
   bash ${zumisdir}/mergeBAM.sh ${zumisdir} ${tmpMerge} ${num_threads} ${project} ${outdir} ${yaml} ${samtoolsexc}
-  for i in ${fqfiles} ; do
-      pref=$(basename ${i} | sed 's/.fastq.gz//' | sed 's/.fq.gz//')
-      rm ${tmpMerge}${pref}*gz
-  done
   date
 
   #run barcode detection
@@ -322,3 +318,5 @@ fi
 if [[ ${conda} = true ]] ; then
   source ${zumisenv}/bin/deactivate
 fi
+
+rm -rf ${tmpMerge}

From 71b5a0c872953ee29306c16474e0a52fea835bc9 Mon Sep 17 00:00:00 2001
From: TomKellyGenetics <tomkellygenetics@gmail.com>
Date: Fri, 2 Jul 2021 16:22:24 +0900
Subject: [PATCH 18/18] Argument list too long for rm

---
 zUMIs.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/zUMIs.sh b/zUMIs.sh
index c7d8f8f..d97942f 100755
--- a/zUMIs.sh
+++ b/zUMIs.sh
@@ -288,8 +288,6 @@ if
   date
 fi
 
-rm -rf ${tmpMerge}
-
 if
 [[ "${whichStage}" == "Filtering" ]] ||
 [[ "${whichStage}" == "Mapping" ]] ||