Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bugfix: corrects for errors in counting with special characters #266

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Changes from 1 commit
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
7ca5efc
hardcoded smartseq settings
TomKellyGenetics Jun 19, 2021
55df69f
Merge branch 'main' of https://github.com/sdparekh/zUMIs into main
TomKellyGenetics Jun 19, 2021
fbe994a
set up smartseq run with STAR 2.7.9a
TomKellyGenetics Jun 21, 2021
eb76463
correct counting by excluding unmapped reads from chromosomes
TomKellyGenetics Jun 22, 2021
37ec456
remove verbose logs for debugging
TomKellyGenetics Jun 23, 2021
b7e3ba6
update version and documentation
TomKellyGenetics Jun 23, 2021
3cc73a1
remove hardcoded variables
TomKellyGenetics Jun 23, 2021
d4b5c85
add Docker configuration
TomKellyGenetics Jun 30, 2021
3c3782d
update docker build to install dependencies
TomKellyGenetics Jun 30, 2021
8fd3c6b
Argument list too long for ls
TomKellyGenetics Jun 30, 2021
10d91c0
Argument list too long for ls
TomKellyGenetics Jun 30, 2021
f3e1cc2
find arguments too long
TomKellyGenetics Jun 30, 2021
4937d8c
ls without arguments
TomKellyGenetics Jun 30, 2021
153b53b
resolve bug for merge temp files for large inputs
TomKellyGenetics Jun 30, 2021
2757a7f
Argument list too long for rm tmpMerge
TomKellyGenetics Jul 1, 2021
8f7c841
add script for building docker container
TomKellyGenetics Jul 1, 2021
e3e3ccc
update docker build
TomKellyGenetics Jul 1, 2021
fbfd418
Argument list too long for rm
TomKellyGenetics Jul 2, 2021
6df70eb
Merge branch 'main' of https://github.com/TomKellyGenetics/zUMIs into…
TomKellyGenetics Jul 2, 2021
71b5a0c
Argument list too long for rm
TomKellyGenetics Jul 2, 2021
1ee22a4
Merge branch 'main' into main
TomKellyGenetics Nov 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
correct counting by excluding unmapped reads from chromosomes
TomKellyGenetics committed Jun 22, 2021
commit eb7646317f00b0223f427b4531b33c79aadc7d85
14 changes: 14 additions & 0 deletions UMIstuffFUN.R
Original file line number Diff line number Diff line change
@@ -68,6 +68,17 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
taglist <- c(taglist, "GI")
}

idxstats <- idxstats[grep("\\*", idxstats[,1], invert = TRUE), ]
print(idxstats)
#print(str(idxstats))
#print(levels(idxstats[,"seqnames"]))

idxstats[,1] <- droplevels(idxstats[,1])
print(levels(idxstats[,"seqnames"]))

idxstats[,1] <- factor(as.character(idxstats[,1]), levels = seqnames(seqinfo(BamFile(featfile))))

#print(idxstats)
rsamtools_reads <- mclapply(1:nrow(idxstats), function(x) {
if(opt$read_layout == "PE"){
parms <- ScanBamParam(tag=taglist,
@@ -83,6 +94,9 @@ reads2genes_new <- function(featfile, bccount, inex, chunk, cores, keepUnassigne
}

dat <- scanBam(file = featfile, param = parms)
#print(head(dat))
#print(dim(dat))
#print(str(dat))
if(inex){
dt <- data.table(RG = dat[[1]]$tag$BC, UB = dat[[1]]$tag$UB, GE = dat[[1]]$tag$GE, GEin = dat[[1]]$tag$GI)
}else{
2 changes: 1 addition & 1 deletion mergeBAM.sh
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@ yaml=$6
samtoolsexc=$7

ls $tmpMerge/$project.*.filtered.tagged.bam > $tmpMerge/$project.bamlist.txt
samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1
#samtools merge -f -@ $nthreads -b $tmpMerge/$project.bamlist.txt $outdir/$project.filtered.tagged.bam > /dev/null 2>&1

cat $tmpMerge/$project.*.BCstats.txt > $outdir/$project.BCstats.txt

2 changes: 1 addition & 1 deletion misc/countUMIfrags.py
Original file line number Diff line number Diff line change
@@ -24,7 +24,7 @@ def count_UMItags(inpath, bcs, chr):
bc = read.get_tag('BC')
if bc in bcs:
ub = read.get_tag('UB')
if ub is '':
if ub == '':
bccounts[bc]['int'] += 1
else:
bccounts[bc]['umi'] += 1
4 changes: 2 additions & 2 deletions runfeatureCountFUN.R
Original file line number Diff line number Diff line change
@@ -348,8 +348,8 @@ get_gr <- function(y){GenomicRanges::makeGRangesFromDataFrame(y,

.runFeatureCount<-function(abamfile,RG,saf,strand,type,primaryOnly,cpu,mem,fcounts_clib,multi_overlap_var){
print(paste0("Assigning reads to features (",type,")"))
fc.stat<-Rsubread::featureCounts(files=abamfile,
# fc.stat <- featureCounts(files=abamfile,
# fc.stat<-Rsubread::featureCounts(files=abamfile,
fc.stat <- featureCounts(files=abamfile,
annot.ext=saf,
isGTFAnnotationFile=F,
primaryOnly=primaryOnly,
11 changes: 7 additions & 4 deletions zUMIs-dge2.R
Original file line number Diff line number Diff line change
@@ -8,7 +8,7 @@ suppressPackageStartupMessages(library(Rsamtools))
##########################
myYaml <- commandArgs(trailingOnly = T)

opt <-read_yaml(myYaml)
opt <- read_yaml(myYaml)
setwd(opt$out_dir)
#try(unixtools::set.tempdir(opt$out_dir))
source(paste0(opt$zUMIs_directory,"/runfeatureCountFUN.R"))
@@ -26,14 +26,15 @@ data.table::setDTthreads(threads=1)
fcounts_clib <- paste0(opt$zUMIs_directory,"/misc/fcountsLib2")

opt <- fixMissingOptions(opt)
print(opt)
#######################################################################
########################## double check for non-UMI method
UMIcheck <- check_nonUMIcollapse(opt$sequence_files)
if(UMIcheck == "nonUMI"){
opt$counting_opts$Ham_Dist <- 0
}
#is the data Smart-seq3?
smart3_flag <- TRUE #ifelse(any(grepl(pattern = "ATTGCGCAATG",x = unlist(opt$sequence_files))), TRUE, FALSE)
smart3_flag <- ifelse(any(grepl(pattern = "ATTGCGCAATG", x = unlist(opt$sequence_files))), TRUE, FALSE)

#######################################################################
##### Barcode handling & chunking
@@ -66,8 +67,7 @@ try(gene_name_mapping <- .get_gene_names(gtf = paste0(opt$out_dir,"/",opt$projec
try(data.table::fwrite(gene_name_mapping, file = paste0(opt$out_dir,"/zUMIs_output/expression/",opt$project,".gene_names.txt"), sep ="\t", quote = FALSE), silent = TRUE)
##

#if(smart3_flag & opt$counting_opts$strand == 1){
if(TRUE){
if(smart3_flag & opt$counting_opts$strand == 1){
#split bam in UMU ends and internal
print("Preparing Smart-seq3 data for stranded gene assignment...")
print(Sys.time())
@@ -210,6 +210,9 @@ if( opt$counting_opts$introns ){
for(i in unique(bccount$chunkID)){
print( paste( "Working on barcode chunk", i, "out of",length(unique(bccount$chunkID)) ))
print( paste( "Processing",length(bccount[chunkID==i]$XC), "barcodes in this chunk..." ))
print(sortbamfile)
print(bccount)
print(opt$counting_opts$introns)
reads <- reads2genes_new(featfile = sortbamfile,
bccount = bccount,
inex = opt$counting_opts$introns,
9 changes: 4 additions & 5 deletions zUMIs-mapping.R
Original file line number Diff line number Diff line change
@@ -113,12 +113,11 @@ param_defaults <- paste("--readFilesCommand ",samtools," view -@",samtools_load_
param_misc <- paste("--genomeDir",inp$reference$STAR_index,
"--sjdbGTFfile",gtf_to_use,
"--runThreadN",avail_cores,
"--sjdbOverhang", cDNA_read_length-1,
# "--sjdbOverhang", cDNA_read_length-1,
# "--sjdbOverhang 149",
"--readFilesType SAM PE",
"--readFilesType SAM", inp$read_layout,
"--genomeSAindexNbases 11",
"--limitOutSJcollapsed 5000000",
inp$read_layout)
"--limitOutSJcollapsed 5000000")

STAR_command <- paste(STAR_exec,param_defaults,param_misc,inp$reference$additional_STAR_params,param_additional_fa)
if(inp$counting_opts$twoPass==TRUE){
@@ -143,7 +142,7 @@ if(num_star_instances>1 & inp$which_Stage == "Filtering"){
#after parallel instance STAR, collect output data in the usual file places
out_logs <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Log.final.out"), full = TRUE)
merge_logs <- paste("cat",paste(out_logs, collapse = " "),">",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Log.final.out"))
out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.out.bam"), full = TRUE)
out_bams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*Aligned.out.bam"), full = TRUE)
merge_bams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.out.bam"),paste(out_bams, collapse = " "))
out_txbams <- list.files(map_tmp_dir, pattern = paste0("tmp.",inp$project,".*.Aligned.toTranscriptome.out.bam"), full = TRUE)
merge_txbams <- paste(inp$samtools_exec,"cat -o",paste0(inp$out_dir,"/",inp$project,".filtered.tagged.Aligned.toTranscriptome.out.bam"),paste(out_txbams, collapse = " "))