diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..8844500
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,92 @@
+FROM ubuntu:bionic
+
+#RUN /bin/sh -c [ -z "$(apt-get indextargets)" ]
+
+RUN set -xe && echo '#!/bin/sh' > /usr/sbin/policy-rc.d && echo 'exit 101' >> /usr/sbin/policy-rc.d && chmod +x /usr/sbin/policy-rc.d && dpkg-divert --local --rename --add /sbin/initctl && cp -a /usr/sbin/policy-rc.d /sbin/initctl && sed -i 's/^exit.*/exit 0/' /sbin/initctl && echo 'force-unsafe-io' > /etc/dpkg/dpkg.cfg.d/docker-apt-speedup && echo 'DPkg::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };' > /etc/apt/apt.conf.d/docker-clean && echo 'APT::Update::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };' >> /etc/apt/apt.conf.d/docker-clean && echo 'Dir::Cache::pkgcache ""; Dir::Cache::srcpkgcache "";' >> /etc/apt/apt.conf.d/docker-clean && echo 'Acquire::Languages "none";' > /etc/apt/apt.conf.d/docker-no-languages && echo 'Acquire::GzipIndexes "true"; Acquire::CompressionTypes::Order:: "gz";' > /etc/apt/apt.conf.d/docker-gzip-indexes && echo 'Apt::AutoRemove::SuggestsImportant "false";' > /etc/apt/apt.conf.d/docker-autoremove-suggests
+
+RUN mkdir -p /run/systemd && echo 'docker' > /run/systemd/container
+
+RUN apt-get update \
+  && apt-get install -y \
+  software-properties-common \
+  build-essential \
+  && apt-get update
+
+ENV DEBIAN_FRONTEND="noninteractive" TZ="Asia/Tokyo"
+
+ENV PATH="/root/miniconda3/bin:${PATH}"
+ARG PATH="/root/miniconda3/bin:${PATH}"
+
+RUN apt-get update \
+  && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
+
+RUN wget \
+  https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
+  && mkdir /root/.conda \
+  && bash Miniconda3-latest-Linux-x86_64.sh -b \
+  && rm -f Miniconda3-latest-Linux-x86_64.sh
+RUN conda --version
+
+RUN apt-get update \
+  && apt-get upgrade -y \
+  && apt-get install -y --fix-missing \
+  git \
+  nano \
+  r-base r-recommended \
+  samtools \
+  htop \
+  pigz \
+  wget \
+  curl \
+  libcurl4-openssl-dev \
+  libcairo2-dev cairo-dock cairo-dock-plug-ins \
+  libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev \
+  libxt-dev \
+  libharfbuzz-dev libfribidi-dev \
+  libhdf5-dev \
+  libgl1-mesa-glx libegl1-mesa \
+  libxrandr2 libxrandr2 \
+  libxss1 libxcursor1 libxcomposite1 \
+  libasound2 libxi6 libxtst6 \
+  libssl-dev \
+  python-pip
+
+RUN conda install numpy scipy cython numba matplotlib scikit-learn h5py click
+
+RUN pip install pysam velocyto; pip2 install pysam
+
+RUN apt update -qq \
+  && apt install --no-install-recommends software-properties-common dirmngr \
+  && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+  && add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/" \
+  && apt install -y --no-install-recommends r-base
+
+RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' \
+  && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+  && apt update \
+  && gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
+  && gpg -a --export E298A3A825C0D65DFD57CBB651716619E084DAB9 | apt-key add - \
+  && apt-get install -y r-base r-base-core r-recommended r-base-dev
+
+RUN R -e 'install.packages(c("yaml", "BiocManager", "Cairo", "ggplot2", "cowplot", "data.table", "devtools", "dplyr", "ggraster", "ggrastr", "hdf5r", "inflection", "ragg", "textshaping"), dependencies = TRUE)'
+
+RUN R -e 'BiocManager::install(c("GenomicFeatures", "plyranges", "Rhtslib", "Rsamtools"), ask = FALSE)'
+
+RUN echo 'PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"' > /root/.Renviron
+
+RUN R -e 'install.packages("devtools"); devtools::install_github(repo = "hhoeflin/hdf5r"); devtools::install_github(repo = "mojaveazure/loomR", ref = "develop")'
+
+RUN mkdir -p home && cd home && git clone https://github.com/TomKellyGenetics/zUMIs.git
+
+RUN cd /home && wget https://github.com/alexdobin/STAR/archive/2.7.9a.tar.gz \
+  && tar xf 2.7.9a.tar.gz \
+  && rm 2.7.9a.tar.gz \
+  && cd STAR-2.7.9a/source \
+  && make \
+  && mv ./STAR* /usr/bin \
+  && cd ../.. \
+  && rm -rf STAR-2.7.9a
+
+CMD ["/bin/bash"]
+
+#bash
"dplyr", "ggraster", "ggrastr", "hdf5r", "inflection", "ragg", "textshaping"), dependencies = TRUE)' + +RUN R -e 'BiocManager::install(c("GenomicFeatures", "plyranges", "Rhtslib", "Rsamtools"), ask = FALSE)' + +RUN echo 'PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"' > /root/.Renviron + +RUN R -e 'install.packages("devtools"); devtools::install_github(repo = "hhoeflin/hdf5r"); devtools::install_github(repo = "mojaveazure/loomR", ref = "develop")' + +RUN mkdir -p home && cd home && git clone https://github.com/TomKellyGenetics/zUMIs.git + +RUN cd /home && wget https://github.com/alexdobin/STAR/archive/2.7.9a.tar.gz \ + && tar xf 2.7.9a.tar.gz \ + && rm 2.7.9a.tar.gz \ + && cd STAR-2.7.9a/source \ + && make \ + && mv ./STAR* /usr/bin \ + && cd ../.. \ + && rm -rf STAR-2.7.9a + +CMD ["/bin/bash"] + +#bash diff --git a/README.md b/README.md index fb2b975..cbfc334 100755 --- a/README.md +++ b/README.md @@ -31,8 +31,11 @@ We provide a script to convert zUMIs output into loom file automatically based o zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`. ## Changelog +DD MMM YYYY: zUMIs2.9.8: merged code from @TomKellyGenetics. Update for compatbility with STAR 2.7.9a and samtools 1.3.1-3. Prevent crash when unmapped reads are counted as a special character for chromosomes from samtools idxstats is not permitted as a factor levels. Excludes unmapped reads for counting. + 16 Jul 2021: zUMIs2.9.7: Change perl [shebang line](https://github.com/sdparekh/zUMIs/issues/261). Change featureCounts settings to assign reads to genes with the longest overlap in case of ambiguous/overlapping annotations. Allow to control the minimum required overlap of reads to genes with a setting `fraction_overlap:` in the counting section. + 07 Apr 2021: zUMIs2.9.6: Fixed a potential crash when the input file to the barcode sharing feature does not contain an equal number of columns. 18 Feb 2021: zUMIs2.9.5: Mismatches for detecting Smart-seq3 UMI-read pattern are now user-settable in the YAML file as follows: `find_pattern: ATTGCGCAATG;2` would allow 2 mismatches. Usage as prior to this version (ie `find_pattern: ATTGCGCAATG`) will default to the previous value of 1 mismatch allowed. diff --git a/UMIstuffFUN.R b/UMIstuffFUN.R index 356cec5..b08e88c 100644 --- a/UMIstuffFUN.R +++ b/UMIstuffFUN.R @@ -68,6 +68,12 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne taglist <- c(taglist, "GI") } + # remove special characters for unmapped reads from list of chromosomes + idxstats <- idxstats[grep("\\*", idxstats[,1], invert = TRUE), ] + idxstats[,1] <- droplevels(idxstats[,1]) + idxstats[,1] <- factor(as.character(idxstats[,1]), levels = seqnames(seqinfo(BamFile(featfile)))) + print(idxstats) + rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) { if(opt$read_layout == "PE"){ parms <- ScanBamParam(tag=taglist, diff --git a/docker_build.sh b/docker_build.sh new file mode 100644 index 0000000..ca323ab --- /dev/null +++ b/docker_build.sh @@ -0,0 +1,6 @@ +#sleep 86400 +podman build -t zumis2:v2.9.7 . 
+podman tag zumis2:v2.9.7 chrzie/zumis2:v2.9.7
+podman tag zumis2:v2.9.7 chrzie/zumis2:latest
+podman push chrzie/zumis2:v2.9.7
+podman push chrzie/zumis2:latest
diff --git a/mergeBAM.sh b/mergeBAM.sh
index 7c2909f..7d48323 100755
--- a/mergeBAM.sh
+++ b/mergeBAM.sh
@@ -13,7 +13,7 @@
 yaml=$6
 samtoolsexc=$7
 
 ls $tmpMerge/$project.*.filtered.tagged.bam > $tmpMerge/$project.bamlist.txt
-#samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
+samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
 
 cat $tmpMerge/$project.*.BCstats.txt > $outdir/$project.BCstats.txt
diff --git a/misc/countUMIfrags.py b/misc/countUMIfrags.py
index aa7e1d5..775529f 100755
--- a/misc/countUMIfrags.py
+++ b/misc/countUMIfrags.py
@@ -24,7 +24,7 @@ def count_UMItags(inpath, bcs, chr):
         bc = read.get_tag('BC')
         if bc in bcs:
             ub = read.get_tag('UB')
-            if ub is '':
+            if ub == '':
                 bccounts[bc]['int'] += 1
             else:
                 bccounts[bc]['umi'] += 1
diff --git a/podman_build.sh b/podman_build.sh
new file mode 100644
index 0000000..e5fa956
--- /dev/null
+++ b/podman_build.sh
@@ -0,0 +1,6 @@
+#sleep 86400
+podman build -t zumis:v2.9.7 .
+podman tag zumis:v2.9.7 tomkellygenetics/zumis:v2.9.7
+podman tag zumis:v2.9.7 tomkellygenetics/zumis:latest
+podman push tomkellygenetics/zumis:v2.9.7
+podman push tomkellygenetics/zumis:latest
diff --git a/runfeatureCountFUN.R b/runfeatureCountFUN.R
index f2b748c..241381b 100755
--- a/runfeatureCountFUN.R
+++ b/runfeatureCountFUN.R
@@ -348,7 +348,6 @@ get_gr <- function(y){GenomicRanges::makeGRangesFromDataFrame(y,
 
 .runFeatureCount<-function(abamfile,RG,saf,strand,type,primaryOnly,cpu,mem,fcounts_clib,multi_overlap_var,fraction_overlap){
   print(paste0("Assigning reads to features (",type,")"))
-  # fc.stat<-Rsubread::featureCounts(files=abamfile,
   fc.stat <- featureCounts(files=abamfile,
                            annot.ext=saf,
                            isGTFAnnotationFile=F,
diff --git a/splitfq.sh b/splitfq.sh
index 709eeb2..523e681 100755
--- a/splitfq.sh
+++ b/splitfq.sh
@@ -38,7 +38,7 @@ function splitfqgz() {
     d=`dirname $fqfile`
 
    $pexc -dc -p $nthreads $d/$pref.gz | split --lines=$nl --filter=''$pexc' -p '$nthreads' > $FILE.gz' - $t$pref$project
-    ls $t$pref$project* | sed "s|$t$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
+    ls $t| grep $pref$project | sed "s|$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
 
     exit 1
 }
diff --git a/zUMIs-dge2.R b/zUMIs-dge2.R
index efe5fec..fc6b7e1 100644
--- a/zUMIs-dge2.R
+++ b/zUMIs-dge2.R
@@ -8,7 +8,7 @@
 suppressPackageStartupMessages(library(Rsamtools))
 ##########################
 myYaml <- commandArgs(trailingOnly = T)
-opt <-read_yaml(myYaml)
+opt <- read_yaml(myYaml)
 setwd(opt$out_dir)
 #try(unixtools::set.tempdir(opt$out_dir))
 source(paste0(opt$zUMIs_directory,"/runfeatureCountFUN.R"))
@@ -26,6 +26,7 @@
 data.table::setDTthreads(threads=1)
 fcounts_clib <- paste0(opt$zUMIs_directory,"/misc/fcountsLib2")
 opt <- fixMissingOptions(opt)
+print(opt)
 #######################################################################
 ########################## double check for non-UMI method
 UMIcheck <- check_nonUMIcollapse(opt$sequence_files)
@@ -33,7 +34,7 @@
 if(UMIcheck == "nonUMI"){
   opt$counting_opts$Ham_Dist <- 0
 }
 #is the data Smart-seq3?
-smart3_flag <- ifelse(any(grepl(pattern = "ATTGCGCAATG",x = unlist(opt$sequence_files))), TRUE, FALSE)
+smart3_flag <- ifelse(any(grepl(pattern = "ATTGCGCAATG", x = unlist(opt$sequence_files))), TRUE, FALSE)
 #######################################################################
 ##### Barcode handling & chunking
@@ -211,6 +212,7 @@ if( opt$counting_opts$introns ){
 for(i in unique(bccount$chunkID)){
   print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
   print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
+  print(sortbamfile)
   reads <- reads2genes_new(featfile = sortbamfile,
                            bccount = bccount,
                            inex = opt$counting_opts$introns,
diff --git a/zUMIs-mapping.R b/zUMIs-mapping.R
old mode 100755
new mode 100644
index 3479eb3..3332873
--- a/zUMIs-mapping.R
+++ b/zUMIs-mapping.R
@@ -26,7 +26,6 @@ if(inp$which_Stage == "Filtering"){
   filtered_bams <- paste0(inp$out_dir,"/",inp$project,".filtered.tagged.unmapped.bam") # for resuming from mapping state using the merged unmapped bam
 }
 
-
 # check if multiple STAR instances can be run -----------------------------
 genome_size <- system(command = paste("du -sh",inp$reference$STAR_index,"| cut -f1"), intern = TRUE)
@@ -41,7 +40,6 @@ if(num_star_instances < 1){
 if(num_star_instances > inp$num_threads){
   num_star_instances = inp$num_threads
 }
 
-
 # GTF file setup ----------------------------------------------------------
 #in case of additional sequences, we need to create a custom GTF
@@ -77,13 +75,12 @@ if ( is.null(additional_fq[1]) | length(additional_fq)==0 ) {
   gtf_to_use <- paste(inp$out_dir,"/",inp$project,".final_annot.gtf",sep="")
   param_additional_fa <- paste("--genomeFastaFiles",paste(inp$reference$additional_files,collapse = " "))
 }
 
-
-#inp$reference$GTF_file_final <- gtf_to_use
-#yaml::write_yaml(inp,file = paste(inp$out_dir,"/",inp$project,".postmap.yaml",sep=""))
+inp$reference$GTF_file_final <- gtf_to_use
+yaml::write_yaml(inp,file = paste(inp$out_dir,"/",inp$project,".postmap.yaml",sep=""))
 
 # Detect read length ------------------------------------------------------
 #check the first 100 reads to detect the read length of the cDNA read
-#filtered_bam <- paste(inp$out_dir,"/",inp$project,".filtered.tagged.bam",sep="")
+filtered_bam <- paste(inp$out_dir,"/",inp$project,".filtered.tagged.bam",sep="")
 cDNA_peek <- data.table::fread(cmd = paste(samtools,"view",filtered_bams[1],"| cut -f10 | head -n 1000"),stringsAsFactors = F,data.table = T, header = F)
@@ -93,8 +90,6 @@ getmode <- function(v) {
 }
 cDNA_read_length <- getmode(nchar(cDNA_peek$V1))
 
-
-
 # Setup STAR mapping ------------------------------------------------------
 samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
 avail_cores <- inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
@@ -111,14 +106,14 @@ param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_
 param_misc <- paste("--genomeDir",inp$reference$STAR_index,
                     "--sjdbGTFfile",gtf_to_use,
                     "--runThreadN",avail_cores,
-                    "--sjdbOverhang", cDNA_read_length-1,
-                    "--readFilesType SAM",inp$read_layout)
+                    "--readFilesType SAM", inp$read_layout,
+                    "--genomeSAindexNbases 11",
+                    "--limitOutSJcollapsed 5000000")
 
 STAR_command <- paste(STAR_exec,param_defaults,param_misc,inp$reference$additional_STAR_params,param_additional_fa)
 if(inp$counting_opts$twoPass==TRUE){
   STAR_command <- paste(STAR_command,"--twopassMode Basic")
 }
-
 #finally, run STAR
 if(num_star_instances>1 & inp$which_Stage == "Filtering"){
   map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
@@ -137,7 +132,7 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
   #after parallel instance STAR, collect output data in the usual file places
   out_logs <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Log.final.out"), full = TRUE)
   merge_logs <- paste("cat",paste(out_logs, collapse = " "),">",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Log.final.out"))
-  out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.out.bam"), full = TRUE)
+  out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*Aligned.out.bam"), full = TRUE)
   merge_bams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam"),paste(out_bams, collapse = " "))
   out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
   merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))
@@ -155,7 +150,6 @@
   }
 }
 
-
 #clean up chunked bam files
 if(inp$which_Stage == "Filtering"){
   system(paste0("rm ",tmpfolder,"/",inp$project,".*"))
diff --git a/zUMIs.sh b/zUMIs.sh
index 2907733..43fb3ad 100755
--- a/zUMIs.sh
+++ b/zUMIs.sh
@@ -261,10 +261,6 @@ if [[ "${whichStage}" == "Filtering" ]] ; then
   for x in ${l} ; do perl ${zumisdir}/fqfilter_v2.pl ${yaml} ${samtoolsexc} ${Rexc} ${pigzexc} ${zumisdir} ${x} & done
   wait
   bash ${zumisdir}/mergeBAM.sh ${zumisdir} ${tmpMerge} ${num_threads} ${project} ${outdir} ${yaml} ${samtoolsexc}
-  for i in ${fqfiles} ; do
-    pref=$(basename ${i} | sed 's/.fastq.gz//' | sed 's/.fq.gz//')
-    rm ${tmpMerge}${pref}*gz
-  done
   date
 
   #run barcode detection
@@ -322,3 +318,5 @@ fi
 if [[ ${conda} = true ]] ; then
   source ${zumisenv}/bin/deactivate
 fi
+
+rm -rf ${tmpMerge}