sdparekh · TomKellyGenetics · Jun 19, 2021 · Jun 19, 2021 · Jun 21, 2021 · Jun 22, 2021
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,92 @@
+FROM ubuntu:bionic
+
+#RUN /bin/sh -c [ -z "$(apt-get indextargets)" ]
+
+RUN set -xe 		&& echo '#!/bin/sh' > /usr/sbin/policy-rc.d 	&& echo 'exit 101' >> /usr/sbin/policy-rc.d 	&& chmod +x /usr/sbin/policy-rc.d 		&& dpkg-divert --local --rename --add /sbin/initctl 	&& cp -a /usr/sbin/policy-rc.d /sbin/initctl 	&& sed -i 's/^exit.*/exit 0/' /sbin/initctl 		&& echo 'force-unsafe-io' > /etc/dpkg/dpkg.cfg.d/docker-apt-speedup 		&& echo 'DPkg::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };' > /etc/apt/apt.conf.d/docker-clean 	&& echo 'APT::Update::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };' >> /etc/apt/apt.conf.d/docker-clean 	&& echo 'Dir::Cache::pkgcache ""; Dir::Cache::srcpkgcache "";' >> /etc/apt/apt.conf.d/docker-clean 		&& echo 'Acquire::Languages "none";' > /etc/apt/apt.conf.d/docker-no-languages 		&& echo 'Acquire::GzipIndexes "true"; Acquire::CompressionTypes::Order:: "gz";' > /etc/apt/apt.conf.d/docker-gzip-indexes 		&& echo 'Apt::AutoRemove::SuggestsImportant "false";' > /etc/apt/apt.conf.d/docker-autoremove-suggests
+
+RUN mkdir -p /run/systemd && echo 'docker' > /run/systemd/container
+
+RUN apt-get update \
+ && apt-get install -y \
+ software-properties-common \
+ build-essential \
+ && apt-get update
+
+ENV DEBIAN_FRONTEND="noninteractive" TZ="Asia/Tokyo"
+
+ENV PATH="/root/miniconda3/bin:${PATH}"
+ARG PATH="/root/miniconda3/bin:${PATH}"
+
+RUN apt-get update \
+  && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
+
+RUN wget \
+  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+  && mkdir /root/.conda \
+  && bash Miniconda3-latest-Linux-x86_64.sh -b \
+  && rm -f Miniconda3-latest-Linux-x86_64.sh 
+RUN conda --version
+
+RUN apt-get update \
+ && apt-get upgrade -y \
+ && apt-get install -y --fix-missing \
+ git \
+ nano \
+ r-base r-recommended \
+ samtools \
+ htop \
+ pigz \
+ wget \
+ curl \
+ libcurl4-openssl-dev \
+ libcairo2-dev cairo-dock cairo-dock-plug-ins \
+ libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev \
+ libxt-dev \
+ libharfbuzz-dev libfribidi-dev \
+ libhdf5-dev \
+ libgl1-mesa-glx libegl1-mesa \
+ libxrandr2 libxrandr2 \
+ libxss1 libxcursor1 libxcomposite1 \
+ libasound2 libxi6 libxtst6 \
+ libssl-dev \
+ python-pip
+
+RUN conda install numpy scipy cython numba matplotlib scikit-learn h5py click
+
+RUN pip install pysam velocyto; pip2 install pysam
+
+RUN apt update -qq \
+  && apt install --no-install-recommends software-properties-common dirmngr \
+  && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+  && add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/" \
+  && apt install -y --no-install-recommends r-base
+
+RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' \
+  && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+  && apt update \
+  && gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+  && gpg -a --export E298A3A825C0D65DFD57CBB651716619E084DAB9 | apt-key add - \
+  && apt-get install -y r-base r-base-core r-recommended r-base-dev
+
+RUN R -e 'install.packages(c("yaml", "BiocManager", "Cairo", "ggplot2", "cowplot", "data.table", "devtools", "dplyr", "ggraster", "ggrastr", "hdf5r", "inflection", "ragg", "textshaping"), dependencies = TRUE)'
+
+RUN R -e 'BiocManager::install(c("GenomicFeatures", "plyranges", "Rhtslib", "Rsamtools"), ask = FALSE)'
+
+RUN echo 'PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"' > /root/.Renviron
+
+RUN R -e 'install.packages("devtools"); devtools::install_github(repo = "hhoeflin/hdf5r"); devtools::install_github(repo = "mojaveazure/loomR", ref = "develop")'
+
+RUN mkdir -p home && cd home && git clone https://github.com/TomKellyGenetics/zUMIs.git
+
+RUN cd /home  && wget https://github.com/alexdobin/STAR/archive/2.7.9a.tar.gz \
+ && tar xf 2.7.9a.tar.gz \
+ && rm 2.7.9a.tar.gz \
+ && cd STAR-2.7.9a/source \
+ && make \
+ && mv ./STAR* /usr/bin \
+ && cd ../.. \
+ && rm -rf STAR-2.7.9a
+
+CMD ["/bin/bash"]
+
+#bash
diff --git a/README.md b/README.md
@@ -31,8 +31,11 @@ We provide a script to convert zUMIs output into loom file automatically based o
 zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.
 
 ## Changelog
+DD MMM YYYY: zUMIs2.9.8: merged code from @TomKellyGenetics. Update for compatbility with STAR 2.7.9a and samtools 1.3.1-3. Prevent crash when unmapped reads are counted as a special character for chromosomes from samtools idxstats is not permitted as a factor levels. Excludes unmapped reads for counting.
+
 16 Jul 2021: zUMIs2.9.7: Change perl [shebang line](https://github.com/sdparekh/zUMIs/issues/261). Change featureCounts settings to assign reads to genes with the longest overlap in case of ambiguous/overlapping annotations. Allow to control the minimum required overlap of reads to genes with a setting `fraction_overlap:` in the counting section.
 
+
 07 Apr 2021: zUMIs2.9.6: Fixed a potential crash when the input file to the barcode sharing feature does not contain an equal number of columns.
 
 18 Feb 2021: zUMIs2.9.5: Mismatches for detecting Smart-seq3 UMI-read pattern are now user-settable in the YAML file as follows: `find_pattern: ATTGCGCAATG;2` would allow 2 mismatches. Usage as prior to this version (ie `find_pattern: ATTGCGCAATG`) will default to the previous value of 1 mismatch allowed.

diff --git a/UMIstuffFUN.R b/UMIstuffFUN.R
@@ -68,6 +68,12 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
     taglist <- c(taglist, "GI")
   }
 
+  # remove special characters for unmapped reads from list of chromosomes
+  idxstats <- idxstats[grep("\\*", idxstats[,1], invert = TRUE), ]
+  idxstats[,1] <- droplevels(idxstats[,1])
+  idxstats[,1] <- factor(as.character(idxstats[,1]), levels = seqnames(seqinfo(BamFile(featfile))))
+  print(idxstats)
+
   rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
     if(opt$read_layout == "PE"){
       parms <- ScanBamParam(tag=taglist,

diff --git a/docker_build.sh b/docker_build.sh
@@ -0,0 +1,6 @@
+#sleep 86400
+podman build -t zumis2:v2.9.7 .
+podman tag zumis2:v2.9.7 chrzie/zumis2:v2.9.7
+podman tag zumis2:v2.9.7 chrzie/zumis2:latest
+podman push chrzie/zumis2:v2.9.7
+podman push chrzie/zumis2:latest
diff --git a/mergeBAM.sh b/mergeBAM.sh
@@ -13,7 +13,7 @@ yaml=$6
 samtoolsexc=$7
 
 ls $tmpMerge/$project.*.filtered.tagged.bam > $tmpMerge/$project.bamlist.txt
-#samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
+samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
 
 cat $tmpMerge/$project.*.BCstats.txt > $outdir/$project.BCstats.txt
 

diff --git a/misc/countUMIfrags.py b/misc/countUMIfrags.py
@@ -24,7 +24,7 @@ def count_UMItags(inpath, bcs, chr):
         bc = read.get_tag('BC')
         if bc in bcs:
             ub = read.get_tag('UB')
-            if ub is '':
+            if ub == '':
                 bccounts[bc]['int'] += 1
             else:
                  bccounts[bc]['umi'] += 1

diff --git a/podman_build.sh b/podman_build.sh
@@ -0,0 +1,6 @@
+#sleep 86400
+podman build -t zumis:v2.9.7 .
+podman tag zumis:v2.9.7 tomkellygenetics/zumis:v2.9.7
+podman tag zumis:v2.9.7 tomkellygenetics/zumis:latest
+podman push tomkellygenetics/zumis:v2.9.7
+podman push tomkellygenetics/zumis:latest
diff --git a/runfeatureCountFUN.R b/runfeatureCountFUN.R
@@ -348,7 +348,6 @@ get_gr <- function(y){GenomicRanges::makeGRangesFromDataFrame(y,
 
 .runFeatureCount<-function(abamfile,RG,saf,strand,type,primaryOnly,cpu,mem,fcounts_clib,multi_overlap_var,fraction_overlap){
   print(paste0("Assigning reads to features (",type,")"))
-  #  fc.stat<-Rsubread::featureCounts(files=abamfile,
      fc.stat <- featureCounts(files=abamfile,
                                    annot.ext=saf,
                                    isGTFAnnotationFile=F,

diff --git a/splitfq.sh b/splitfq.sh
@@ -38,7 +38,7 @@ function splitfqgz() {
   d=`dirname $fqfile`
   $pexc -dc -p $nthreads $d/$pref.gz | split --lines=$nl --filter=''$pexc' -p '$nthreads' > $FILE.gz' - $t$pref$project
 
-	ls $t$pref$project* | sed "s|$t$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
+        ls $t| grep $pref$project | sed "s|$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
 
 	exit 1
 }

diff --git a/zUMIs-dge2.R b/zUMIs-dge2.R
@@ -8,7 +8,7 @@ suppressPackageStartupMessages(library(Rsamtools))
 ##########################
 myYaml <- commandArgs(trailingOnly = T)
 
-opt   <-read_yaml(myYaml)
+opt   <- read_yaml(myYaml)
 setwd(opt$out_dir)
 #try(unixtools::set.tempdir(opt$out_dir))
 source(paste0(opt$zUMIs_directory,"/runfeatureCountFUN.R"))
@@ -26,14 +26,15 @@ data.table::setDTthreads(threads=1)
 fcounts_clib <- paste0(opt$zUMIs_directory,"/misc/fcountsLib2")
 
 opt <- fixMissingOptions(opt)
+print(opt)
 #######################################################################
 ########################## double check for non-UMI method
 UMIcheck <- check_nonUMIcollapse(opt$sequence_files)
 if(UMIcheck == "nonUMI"){
   opt$counting_opts$Ham_Dist <- 0
 }
 #is the data Smart-seq3?
-smart3_flag <- ifelse(any(grepl(pattern = "ATTGCGCAATG",x = unlist(opt$sequence_files))), TRUE, FALSE)
+smart3_flag <- ifelse(any(grepl(pattern = "ATTGCGCAATG", x = unlist(opt$sequence_files))), TRUE, FALSE)
 
 #######################################################################
 ##### Barcode handling & chunking
@@ -211,6 +212,7 @@ if( opt$counting_opts$introns ){
 for(i in unique(bccount$chunkID)){
      print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
      print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
+     print(sortbamfile)
      reads <- reads2genes_new(featfile = sortbamfile,
                               bccount  = bccount,
                               inex     = opt$counting_opts$introns,

diff --git a/zUMIs-mapping.R b/zUMIs-mapping.R
@@ -26,7 +26,6 @@ if(inp$which_Stage == "Filtering"){
   filtered_bams <- paste0(inp$out_dir,"/",inp$project,".filtered.tagged.unmapped.bam") # for resuming from mapping state using the merged unmapped bam
 }
 
-
 # check if multiple STAR instances can be run -----------------------------
 
 genome_size <- system(command = paste("du -sh",inp$reference$STAR_index,"| cut -f1"), intern = TRUE)
@@ -41,7 +40,6 @@ if(num_star_instances < 1){
 if(num_star_instances > inp$num_threads){
   num_star_instances = inp$num_threads
 } 
-
 # GTF file setup ----------------------------------------------------------
 #in case of additional sequences, we need to create a custom GTF
 
@@ -77,13 +75,12 @@ if ( is.null(additional_fq[1]) | length(additional_fq)==0 ) {
   gtf_to_use <- paste(inp$out_dir,"/",inp$project,".final_annot.gtf",sep="")
   param_additional_fa <- paste("--genomeFastaFiles",paste(inp$reference$additional_files,collapse = " "))
 }
-
-#inp$reference$GTF_file_final <- gtf_to_use
-#yaml::write_yaml(inp,file = paste(inp$out_dir,"/",inp$project,".postmap.yaml",sep=""))
+inp$reference$GTF_file_final <- gtf_to_use
+yaml::write_yaml(inp,file = paste(inp$out_dir,"/",inp$project,".postmap.yaml",sep=""))
 
 # Detect read length ------------------------------------------------------
 #check the first 100 reads to detect the read length of the cDNA read
-#filtered_bam <- paste(inp$out_dir,"/",inp$project,".filtered.tagged.bam",sep="")
+filtered_bam <- paste(inp$out_dir,"/",inp$project,".filtered.tagged.bam",sep="")
 
 cDNA_peek <- data.table::fread(cmd = paste(samtools,"view",filtered_bams[1],"| cut -f10 | head -n 1000"),stringsAsFactors = F,data.table = T, header = F)
 
@@ -93,8 +90,6 @@ getmode <- function(v) {
 }
 
 cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
-
-
 # Setup STAR mapping ------------------------------------------------------
 samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
 avail_cores <- inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
@@ -111,14 +106,14 @@ param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_
 param_misc <- paste("--genomeDir",inp$reference$STAR_index,
                     "--sjdbGTFfile",gtf_to_use,
                     "--runThreadN",avail_cores,
-                    "--sjdbOverhang", cDNA_read_length-1,
-                    "--readFilesType SAM",inp$read_layout)
+                    "--readFilesType SAM", inp$read_layout,
+	            "--genomeSAindexNbases 11",
+		    "--limitOutSJcollapsed 5000000")
 
 STAR_command <- paste(STAR_exec,param_defaults,param_misc,inp$reference$additional_STAR_params,param_additional_fa)
 if(inp$counting_opts$twoPass==TRUE){
   STAR_command <- paste(STAR_command,"--twopassMode Basic")
 }
-
 #finally, run STAR
 if(num_star_instances>1 & inp$which_Stage == "Filtering"){
   map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
@@ -137,7 +132,7 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
   #after parallel instance STAR, collect output data in the usual file places
   out_logs <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Log.final.out"), full = TRUE)
   merge_logs <- paste("cat",paste(out_logs, collapse = " "),">",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Log.final.out"))
-  out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.out.bam"), full = TRUE)
+  out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*Aligned.out.bam"), full = TRUE)
   merge_bams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam"),paste(out_bams, collapse = " "))
   out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
   merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))
@@ -155,7 +150,6 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
   }
 }
 
-
 #clean up chunked bam files
 if(inp$which_Stage == "Filtering"){
   system(paste0("rm ",tmpfolder,"/",inp$project,".*"))

diff --git a/zUMIs.sh b/zUMIs.sh
@@ -261,10 +261,6 @@ if [[ "${whichStage}" == "Filtering" ]] ; then
   for x in ${l} ; do perl ${zumisdir}/fqfilter_v2.pl ${yaml} ${samtoolsexc} ${Rexc} ${pigzexc} ${zumisdir} ${x} & done
   wait
   bash ${zumisdir}/mergeBAM.sh ${zumisdir} ${tmpMerge} ${num_threads} ${project} ${outdir} ${yaml} ${samtoolsexc}
-  for i in ${fqfiles} ; do
-      pref=$(basename ${i} | sed 's/.fastq.gz//' | sed 's/.fq.gz//')
-      rm ${tmpMerge}${pref}*gz
-  done
   date
 
   #run barcode detection
@@ -322,3 +318,5 @@ fi
 if [[ ${conda} = true ]] ; then
   source ${zumisenv}/bin/deactivate
 fi
+
+rm -rf ${tmpMerge}