Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix: corrects for errors in counting with special characters #266

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
7ca5efc
hardcoded smartseq settings
TomKellyGenetics Jun 19, 2021
55df69f
Merge branch 'main' of https://github.com/sdparekh/zUMIs into main
TomKellyGenetics Jun 19, 2021
fbe994a
set up smartseq run with STAR 2.7.9a
TomKellyGenetics Jun 21, 2021
eb76463
correct counting by excluding unmapped reads from chromosomes
TomKellyGenetics Jun 22, 2021
37ec456
remove verbose logs for debugging
TomKellyGenetics Jun 23, 2021
b7e3ba6
update version and documentation
TomKellyGenetics Jun 23, 2021
3cc73a1
remove hardcoded variables
TomKellyGenetics Jun 23, 2021
d4b5c85
add Docker configuration
TomKellyGenetics Jun 30, 2021
3c3782d
update docker build to install dependencies
TomKellyGenetics Jun 30, 2021
8fd3c6b
Argument list too long for ls
TomKellyGenetics Jun 30, 2021
10d91c0
Argument list too long for ls
TomKellyGenetics Jun 30, 2021
f3e1cc2
find arguments too long
TomKellyGenetics Jun 30, 2021
4937d8c
ls without arguments
TomKellyGenetics Jun 30, 2021
153b53b
resolve bug for merge temp files for large inputs
TomKellyGenetics Jun 30, 2021
2757a7f
Argument list too long for rm tmpMerge
TomKellyGenetics Jul 1, 2021
8f7c841
add script for building docker container
TomKellyGenetics Jul 1, 2021
e3e3ccc
update docker build
TomKellyGenetics Jul 1, 2021
fbfd418
Argument list too long for rm
TomKellyGenetics Jul 2, 2021
6df70eb
Merge branch 'main' of https://github.com/TomKellyGenetics/zUMIs into…
TomKellyGenetics Jul 2, 2021
71b5a0c
Argument list too long for rm
TomKellyGenetics Jul 2, 2021
1ee22a4
Merge branch 'main' into main
TomKellyGenetics Nov 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
FROM ubuntu:bionic

#RUN /bin/sh -c [ -z "$(apt-get indextargets)" ]

RUN set -xe && echo '#!/bin/sh' > /usr/sbin/policy-rc.d && echo 'exit 101' >> /usr/sbin/policy-rc.d && chmod +x /usr/sbin/policy-rc.d && dpkg-divert --local --rename --add /sbin/initctl && cp -a /usr/sbin/policy-rc.d /sbin/initctl && sed -i 's/^exit.*/exit 0/' /sbin/initctl && echo 'force-unsafe-io' > /etc/dpkg/dpkg.cfg.d/docker-apt-speedup && echo 'DPkg::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };' > /etc/apt/apt.conf.d/docker-clean && echo 'APT::Update::Post-Invoke { "rm -f /var/cache/apt/archives/*.deb /var/cache/apt/archives/partial/*.deb /var/cache/apt/*.bin || true"; };' >> /etc/apt/apt.conf.d/docker-clean && echo 'Dir::Cache::pkgcache ""; Dir::Cache::srcpkgcache "";' >> /etc/apt/apt.conf.d/docker-clean && echo 'Acquire::Languages "none";' > /etc/apt/apt.conf.d/docker-no-languages && echo 'Acquire::GzipIndexes "true"; Acquire::CompressionTypes::Order:: "gz";' > /etc/apt/apt.conf.d/docker-gzip-indexes && echo 'Apt::AutoRemove::SuggestsImportant "false";' > /etc/apt/apt.conf.d/docker-autoremove-suggests

RUN mkdir -p /run/systemd && echo 'docker' > /run/systemd/container

RUN apt-get update \
&& apt-get install -y \
software-properties-common \
build-essential \
&& apt-get update

ENV DEBIAN_FRONTEND="noninteractive" TZ="Asia/Tokyo"

ENV PATH="/root/miniconda3/bin:${PATH}"
ARG PATH="/root/miniconda3/bin:${PATH}"

RUN apt-get update \
&& apt-get install -y wget && rm -rf /var/lib/apt/lists/*

RUN wget \
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
&& mkdir /root/.conda \
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
&& rm -f Miniconda3-latest-Linux-x86_64.sh
RUN conda --version

RUN apt-get update \
&& apt-get upgrade -y \
&& apt-get install -y --fix-missing \
git \
nano \
r-base r-recommended \
samtools \
htop \
pigz \
wget \
curl \
libcurl4-openssl-dev \
libcairo2-dev cairo-dock cairo-dock-plug-ins \
libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev \
libxt-dev \
libharfbuzz-dev libfribidi-dev \
libhdf5-dev \
libgl1-mesa-glx libegl1-mesa \
libxrandr2 libxrandr2 \
libxss1 libxcursor1 libxcomposite1 \
libasound2 libxi6 libxtst6 \
libssl-dev \
python-pip

RUN conda install numpy scipy cython numba matplotlib scikit-learn h5py click

RUN pip install pysam velocyto; pip2 install pysam

RUN apt update -qq \
&& apt install --no-install-recommends software-properties-common dirmngr \
&& apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
&& add-apt-repository "deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/" \
&& apt install -y --no-install-recommends r-base

RUN add-apt-repository 'deb https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/' \
&& apt-key adv --keyserver keyserver.ubuntu.com --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
&& apt update \
&& gpg --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 \
&& gpg -a --export E298A3A825C0D65DFD57CBB651716619E084DAB9 | apt-key add - \
&& apt-get install -y r-base r-base-core r-recommended r-base-dev

RUN R -e 'install.packages(c("yaml", "BiocManager", "Cairo", "ggplot2", "cowplot", "data.table", "devtools", "dplyr", "ggraster", "ggrastr", "hdf5r", "inflection", "ragg", "textshaping"), dependencies = TRUE)'

RUN R -e 'BiocManager::install(c("GenomicFeatures", "plyranges", "Rhtslib", "Rsamtools"), ask = FALSE)'

RUN echo 'PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"' > /root/.Renviron

RUN R -e 'install.packages("devtools"); devtools::install_github(repo = "hhoeflin/hdf5r"); devtools::install_github(repo = "mojaveazure/loomR", ref = "develop")'

RUN mkdir -p home && cd home && git clone https://github.com/TomKellyGenetics/zUMIs.git

RUN cd /home && wget https://github.com/alexdobin/STAR/archive/2.7.9a.tar.gz \
&& tar xf 2.7.9a.tar.gz \
&& rm 2.7.9a.tar.gz \
&& cd STAR-2.7.9a/source \
&& make \
&& mv ./STAR* /usr/bin \
&& cd ../.. \
&& rm -rf STAR-2.7.9a

CMD ["/bin/bash"]

#bash
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,11 @@ We provide a script to convert zUMIs output into loom file automatically based o
zUMIs will try to automatically do this, otherwise convert zUMIs output to loom by simply running `Rscript rds2loom.R myRun.yaml`.

## Changelog
DD MMM YYYY: zUMIs2.9.8: merged code from @TomKellyGenetics. Update for compatbility with STAR 2.7.9a and samtools 1.3.1-3. Prevent crash when unmapped reads are counted as a special character for chromosomes from samtools idxstats is not permitted as a factor levels. Excludes unmapped reads for counting.

16 Jul 2021: zUMIs2.9.7: Change perl [shebang line](https://github.com/sdparekh/zUMIs/issues/261). Change featureCounts settings to assign reads to genes with the longest overlap in case of ambiguous/overlapping annotations. Allow to control the minimum required overlap of reads to genes with a setting `fraction_overlap:` in the counting section.


07 Apr 2021: zUMIs2.9.6: Fixed a potential crash when the input file to the barcode sharing feature does not contain an equal number of columns.

18 Feb 2021: zUMIs2.9.5: Mismatches for detecting Smart-seq3 UMI-read pattern are now user-settable in the YAML file as follows: `find_pattern: ATTGCGCAATG;2` would allow 2 mismatches. Usage as prior to this version (ie `find_pattern: ATTGCGCAATG`) will default to the previous value of 1 mismatch allowed.
Expand Down
6 changes: 6 additions & 0 deletions UMIstuffFUN.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
taglist <- c(taglist, "GI")
}

# remove special characters for unmapped reads from list of chromosomes
idxstats <- idxstats[grep("\\*", idxstats[,1], invert = TRUE), ]
idxstats[,1] <- droplevels(idxstats[,1])
idxstats[,1] <- factor(as.character(idxstats[,1]), levels = seqnames(seqinfo(BamFile(featfile))))
print(idxstats)

rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
if(opt$read_layout == "PE"){
parms <- ScanBamParam(tag=taglist,
Expand Down
6 changes: 6 additions & 0 deletions docker_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#sleep 86400
podman build -t zumis2:v2.9.7 .
podman tag zumis2:v2.9.7 chrzie/zumis2:v2.9.7
podman tag zumis2:v2.9.7 chrzie/zumis2:latest
podman push chrzie/zumis2:v2.9.7
podman push chrzie/zumis2:latest
2 changes: 1 addition & 1 deletion mergeBAM.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ yaml=$6
samtoolsexc=$7

ls $tmpMerge/$project.*.filtered.tagged.bam > $tmpMerge/$project.bamlist.txt
#samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1

cat $tmpMerge/$project.*.BCstats.txt > $outdir/$project.BCstats.txt

Expand Down
2 changes: 1 addition & 1 deletion misc/countUMIfrags.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def count_UMItags(inpath, bcs, chr):
bc = read.get_tag('BC')
if bc in bcs:
ub = read.get_tag('UB')
if ub is '':
if ub == '':
bccounts[bc]['int'] += 1
else:
bccounts[bc]['umi'] += 1
Expand Down
6 changes: 6 additions & 0 deletions podman_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#sleep 86400
podman build -t zumis:v2.9.7 .
podman tag zumis:v2.9.7 tomkellygenetics/zumis:v2.9.7
podman tag zumis:v2.9.7 tomkellygenetics/zumis:latest
podman push tomkellygenetics/zumis:v2.9.7
podman push tomkellygenetics/zumis:latest
1 change: 0 additions & 1 deletion runfeatureCountFUN.R
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,6 @@ get_gr <- function(y){GenomicRanges::makeGRangesFromDataFrame(y,

.runFeatureCount<-function(abamfile,RG,saf,strand,type,primaryOnly,cpu,mem,fcounts_clib,multi_overlap_var,fraction_overlap){
print(paste0("Assigning reads to features (",type,")"))
# fc.stat<-Rsubread::featureCounts(files=abamfile,
fc.stat <- featureCounts(files=abamfile,
annot.ext=saf,
isGTFAnnotationFile=F,
Expand Down
2 changes: 1 addition & 1 deletion splitfq.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ function splitfqgz() {
d=`dirname $fqfile`
$pexc -dc -p $nthreads $d/$pref.gz | split --lines=$nl --filter=''$pexc' -p '$nthreads' > $FILE.gz' - $t$pref$project

ls $t$pref$project* | sed "s|$t$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt
ls $t| grep $pref$project | sed "s|$pref$project||" | sed 's/.gz//' > $t/$project.listPrefix.txt

exit 1
}
Expand Down
6 changes: 4 additions & 2 deletions zUMIs-dge2.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ suppressPackageStartupMessages(library(Rsamtools))
##########################
myYaml <- commandArgs(trailingOnly = T)

opt <-read_yaml(myYaml)
opt <- read_yaml(myYaml)
setwd(opt$out_dir)
#try(unixtools::set.tempdir(opt$out_dir))
source(paste0(opt$zUMIs_directory,"/runfeatureCountFUN.R"))
Expand All @@ -26,14 +26,15 @@ data.table::setDTthreads(threads=1)
fcounts_clib <- paste0(opt$zUMIs_directory,"/misc/fcountsLib2")

opt <- fixMissingOptions(opt)
print(opt)
#######################################################################
########################## double check for non-UMI method
UMIcheck <- check_nonUMIcollapse(opt$sequence_files)
if(UMIcheck == "nonUMI"){
opt$counting_opts$Ham_Dist <- 0
}
#is the data Smart-seq3?
smart3_flag <- ifelse(any(grepl(pattern = "ATTGCGCAATG",x = unlist(opt$sequence_files))), TRUE, FALSE)
smart3_flag <- ifelse(any(grepl(pattern = "ATTGCGCAATG", x = unlist(opt$sequence_files))), TRUE, FALSE)

#######################################################################
##### Barcode handling & chunking
Expand Down Expand Up @@ -211,6 +212,7 @@ if( opt$counting_opts$introns ){
for(i in unique(bccount$chunkID)){
print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
print(sortbamfile)
reads <- reads2genes_new(featfile = sortbamfile,
bccount = bccount,
inex = opt$counting_opts$introns,
Expand Down
20 changes: 7 additions & 13 deletions zUMIs-mapping.R
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ if(inp$which_Stage == "Filtering"){
filtered_bams <- paste0(inp$out_dir,"/",inp$project,".filtered.tagged.unmapped.bam") # for resuming from mapping state using the merged unmapped bam
}


# check if multiple STAR instances can be run -----------------------------

genome_size <- system(command = paste("du -sh",inp$reference$STAR_index,"| cut -f1"), intern = TRUE)
Expand All @@ -41,7 +40,6 @@ if(num_star_instances < 1){
if(num_star_instances > inp$num_threads){
num_star_instances = inp$num_threads
}

# GTF file setup ----------------------------------------------------------
#in case of additional sequences, we need to create a custom GTF

Expand Down Expand Up @@ -77,13 +75,12 @@ if ( is.null(additional_fq[1]) | length(additional_fq)==0 ) {
gtf_to_use <- paste(inp$out_dir,"/",inp$project,".final_annot.gtf",sep="")
param_additional_fa <- paste("--genomeFastaFiles",paste(inp$reference$additional_files,collapse = " "))
}

#inp$reference$GTF_file_final <- gtf_to_use
#yaml::write_yaml(inp,file = paste(inp$out_dir,"/",inp$project,".postmap.yaml",sep=""))
inp$reference$GTF_file_final <- gtf_to_use
yaml::write_yaml(inp,file = paste(inp$out_dir,"/",inp$project,".postmap.yaml",sep=""))

# Detect read length ------------------------------------------------------
#check the first 100 reads to detect the read length of the cDNA read
#filtered_bam <- paste(inp$out_dir,"/",inp$project,".filtered.tagged.bam",sep="")
filtered_bam <- paste(inp$out_dir,"/",inp$project,".filtered.tagged.bam",sep="")

cDNA_peek <- data.table::fread(cmd = paste(samtools,"view",filtered_bams[1],"| cut -f10 | head -n 1000"),stringsAsFactors = F,data.table = T, header = F)

Expand All @@ -93,8 +90,6 @@ getmode <- function(v) {
}

cDNA_read_length <- getmode(nchar(cDNA_peek$V1))


# Setup STAR mapping ------------------------------------------------------
samtools_load_cores <- ifelse(inp$num_threads>8,2,1)
avail_cores <- inp$num_threads - samtools_load_cores #reserve threads for samtools file opening
Expand All @@ -111,14 +106,14 @@ param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_
param_misc <- paste("--genomeDir",inp$reference$STAR_index,
"--sjdbGTFfile",gtf_to_use,
"--runThreadN",avail_cores,
"--sjdbOverhang", cDNA_read_length-1,
"--readFilesType SAM",inp$read_layout)
"--readFilesType SAM", inp$read_layout,
"--genomeSAindexNbases 11",
"--limitOutSJcollapsed 5000000")

STAR_command <- paste(STAR_exec,param_defaults,param_misc,inp$reference$additional_STAR_params,param_additional_fa)
if(inp$counting_opts$twoPass==TRUE){
STAR_command <- paste(STAR_command,"--twopassMode Basic")
}

#finally, run STAR
if(num_star_instances>1 & inp$which_Stage == "Filtering"){
map_tmp_dir <- paste0(inp$out_dir,"/zUMIs_output/.tmpMap/")
Expand All @@ -137,7 +132,7 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
#after parallel instance STAR, collect output data in the usual file places
out_logs <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Log.final.out"), full = TRUE)
merge_logs <- paste("cat",paste(out_logs, collapse = " "),">",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Log.final.out"))
out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.out.bam"), full = TRUE)
out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*Aligned.out.bam"), full = TRUE)
merge_bams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam"),paste(out_bams, collapse = " "))
out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))
Expand All @@ -155,7 +150,6 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
}
}


#clean up chunked bam files
if(inp$which_Stage == "Filtering"){
system(paste0("rm ",tmpfolder,"/",inp$project,".*"))
Expand Down
6 changes: 2 additions & 4 deletions zUMIs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -261,10 +261,6 @@ if [[ "${whichStage}" == "Filtering" ]] ; then
for x in ${l} ; do perl ${zumisdir}/fqfilter_v2.pl ${yaml} ${samtoolsexc} ${Rexc} ${pigzexc} ${zumisdir} ${x} & done
wait
bash ${zumisdir}/mergeBAM.sh ${zumisdir} ${tmpMerge} ${num_threads} ${project} ${outdir} ${yaml} ${samtoolsexc}
for i in ${fqfiles} ; do
pref=$(basename ${i} | sed 's/.fastq.gz//' | sed 's/.fq.gz//')
rm ${tmpMerge}${pref}*gz
done
date

#run barcode detection
Expand Down Expand Up @@ -322,3 +318,5 @@ fi
if [[ ${conda} = true ]] ; then
source ${zumisenv}/bin/deactivate
fi

rm -rf ${tmpMerge}