diff --git a/.circleci/config.yml b/.circleci/config.yml index df69598e..7454e22e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -51,7 +51,7 @@ jobs: name: build image command: | source ${BASH_ENV} - export DOCKER_CACHE_TAG=v1.1.5 + export DOCKER_CACHE_TAG=v1.1.6 echo "pulling ${DOCKER_CACHE_TAG}!" docker pull quay.io/encode-dcc/chip-seq-pipeline:${DOCKER_CACHE_TAG} docker login -u=${QUAY_ROBOT_USER} -p=${QUAY_ROBOT_USER_TOKEN} quay.io diff --git a/README.md b/README.md index f7aaa0bf..3871b131 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ This ChIP-Seq pipeline is based off the ENCODE (phase-3) transcription factor an * **Flexibility**: Support for `docker`, `singularity` and `Conda`. * **Portability**: Support for many cloud platforms (Google/DNAnexus) and cluster engines (SLURM/SGE/PBS). +* **Resumability**: [Resume](utils/qc_jsons_to_tsv/README.md) a failed workflow from where it left off. * **User-friendly HTML report**: tabulated quality metrics including alignment/peak statistics and FRiP along with many useful plots (IDR/cross-correlation measures). - Examples: [HTML](https://storage.googleapis.com/encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/example_output/qc.html), [JSON](docs/example_output/v1.1.5/qc.json) * **Genomes**: Pre-built database for GRCh38, hg19, mm10, mm9 and additional support for custom genomes. @@ -49,3 +50,11 @@ There are some useful tools to post-process outputs of the pipeline. ### qc_jsons_to_tsv [This tool](utils/qc_jsons_to_tsv/README.md) recursively finds and parses all `qc.json` (pipeline's [final output](docs/example_output/v1.1.5/qc.json)) found from a specified root directory. It generates a TSV file that has all quality metrics tabulated in rows for each experiment and replicate. This tool also estimates overall quality of a sample by [a criteria definition JSON file](utils/qc_jsons_to_tsv/criteria.default.json) which can be a good guideline for QC'ing experiments. + +### resumer + +[This tool](utils/resumer/README.md) parses a metadata JSON file from a previous failed workflow and generates a new input JSON file to start a pipeline from where it left off. + +### ENCODE downloader + +[This tool](https://github.com/kundajelab/ENCODE_downloader) downloads any type (FASTQ, BAM, PEAK, ...) of data from the ENCODE portal. It also generates a metadata JSON file per experiment which will be very useful to make an input JSON file for the pipeline. \ No newline at end of file diff --git a/chip.wdl b/chip.wdl index 2d01e998..c3cd6583 100644 --- a/chip.wdl +++ b/chip.wdl @@ -2,7 +2,7 @@ # Author: Jin Lee (leepc12@gmail.com) workflow chip { - String pipeline_ver = 'v1.1.5' + String pipeline_ver = 'v1.1.6' ### sample name, description String title = 'Untitled' String description = 'No description' @@ -30,10 +30,11 @@ workflow chip { Boolean no_dup_removal = false # no dupe reads removal when filtering BAM # dup.qc and pbc.qc will be empty files # and nodup_bam in the output is - # filtered bam with dupes + # filtered bam with dupes + String mito_chr_name = 'chrM' # name of mito chromosome. THIS IS NOT A REG-EX! you can define only one chromosome name for mito. String regex_filter_reads = 'chrM' # Perl-style regular expression pattern for chr name to filter out reads - # to remove matching reads from TAGALIGN + # those reads will be excluded from peak calling Int subsample_reads = 0 # number of reads to subsample TAGALIGN # 0 for no subsampling. 
this affects all downstream analysis Int ctl_subsample_reads = 0 # number of reads to subsample control TAGALIGN @@ -149,6 +150,37 @@ workflow chip { File? peak_ppr2 # do not define if you have a single replicate or true_rep=true File? peak_pooled # do not define if you have a single replicate or true_rep=true + ### other inputs used for resuming pipelines (QC/txt/log/png files, ...) + File? ta_pooled + File? ctl_ta_pooled + Array[File] flagstat_qcs = [] + Array[File] pbc_qcs = [] + Array[File] dup_qcs = [] + Array[File] nodup_flagstat_qcs = [] + Array[File] ctl_flagstat_qcs = [] + Array[File] ctl_pbc_qcs = [] + Array[File] ctl_dup_qcs = [] + Array[File] ctl_nodup_flagstat_qcs = [] + Array[File] sig_pvals = [] + Array[File] xcor_plots = [] + Array[File] xcor_scores = [] + + Array[File] macs2_frip_qcs = [] + Array[File] macs2_pr1_frip_qcs = [] + Array[File] macs2_pr2_frip_qcs = [] + File? macs2_pooled_frip_qc_ + File? macs2_ppr1_frip_qc_ + File? macs2_ppr2_frip_qc_ + Array[File] spp_frip_qcs = [] + Array[File] spp_pr1_frip_qcs = [] + Array[File] spp_pr2_frip_qcs = [] + File? spp_pooled_frip_qc_ + File? spp_ppr1_frip_qc_ + File? spp_ppr2_frip_qc_ + + Array[File] jsd_qcs = [] + File? jsd_plot + ### temp vars (do not define these) String peak_caller_ = if pipeline_type=='tf' then select_first([peak_caller,'spp']) else select_first([peak_caller,'macs2']) @@ -189,7 +221,13 @@ workflow chip { else if length(fastqs_rep6)<1 then [fastqs_rep1,fastqs_rep2,fastqs_rep3,fastqs_rep4,fastqs_rep5] else [fastqs_rep1,fastqs_rep2,fastqs_rep3,fastqs_rep4,fastqs_rep5,fastqs_rep6] - scatter(fastq_set in fastqs_) { + ## temp vars for resuming pipelines + Boolean need_to_process_ta = length(peaks_pr1)==0 && length(peaks)==0 + Boolean need_to_process_nodup_bam = need_to_process_ta && length(tas)==0 + Boolean need_to_process_bam = need_to_process_nodup_bam && length(nodup_bams)==0 + Boolean need_to_process_fastq = need_to_process_bam && length(bams)==0 + + scatter(fastq_set in if need_to_process_fastq then fastqs_ else []) { # merge fastqs call merge_fastq { input : fastqs = fastq_set, @@ -234,6 +272,7 @@ workflow chip { paired_end = false, subsample = 0, regex_grep_v_ta = regex_filter_reads, + mito_chr_name = mito_chr_name, cpu = bam2ta_cpu, mem_mb = bam2ta_mem_mb, @@ -243,7 +282,7 @@ workflow chip { } Array[File] bams_ = flatten([bwa.bam, bams]) - scatter(bam in bams_) { + scatter(bam in if need_to_process_bam then bams_ else []) { # filter/dedup bam call filter { input : bam = bam, @@ -251,6 +290,7 @@ workflow chip { dup_marker = dup_marker, mapq_thresh = mapq_thresh, no_dup_removal = no_dup_removal, + mito_chr_name = mito_chr_name, cpu = filter_cpu, mem_mb = filter_mem_mb, @@ -263,6 +303,7 @@ workflow chip { paired_end = paired_end, subsample = 0, regex_grep_v_ta = regex_filter_reads, + mito_chr_name = mito_chr_name, cpu = bam2ta_cpu, mem_mb = bam2ta_mem_mb, @@ -272,13 +313,14 @@ workflow chip { } Array[File] nodup_bams_ = flatten([filter.nodup_bam, nodup_bams]) - scatter(bam in nodup_bams_) { + scatter(bam in if need_to_process_nodup_bam then nodup_bams_ else []) { # convert bam to tagalign and subsample it if necessary call bam2ta { input : bam = bam, paired_end = paired_end, subsample = subsample_reads, regex_grep_v_ta = regex_filter_reads, + mito_chr_name = mito_chr_name, cpu = bam2ta_cpu, mem_mb = bam2ta_mem_mb, @@ -287,37 +329,17 @@ workflow chip { } } - Array[File] tas_xcor = if length(bam2ta_no_filt_R1.ta)>0 then bam2ta_no_filt_R1.ta - else if length(bam2ta_no_filt.ta)>0 then bam2ta_no_filt.ta - else 
flatten([bam2ta.ta, tas]) - Boolean paired_end_xcor = paired_end && length(bam2ta_no_filt_R1.ta)<1 - scatter(ta in tas_xcor) { - # use trimmed/unfilitered R1 tagAlign for paired end dataset - # if not starting from fastqs, keep using old method - # (mapping with both ends for tag-aligns to be used for xcor) - # subsample tagalign (non-mito) and cross-correlation analysis - call xcor { input : - ta = ta, - paired_end = paired_end_xcor, - subsample = xcor_subsample_reads, - - cpu = xcor_cpu, - mem_mb = xcor_mem_mb, - time_hr = xcor_time_hr, - disks = xcor_disks, - } - } - Array[File] tas_ = if align_only then [] else flatten([bam2ta.ta, tas]) - if ( length(tas_)>1 ) { + Array[File] tas__ = if need_to_process_ta then tas_ else [] + if ( length(tas__)>1 ) { # pool tagaligns from true replicates call pool_ta { input : - tas = tas_, + tas = tas__, } } if ( !true_rep_only ) { - scatter( ta in tas_ ) { + scatter( ta in tas__ ) { # make two self pseudo replicates per true replicate call spr { input : ta = ta, @@ -326,7 +348,7 @@ workflow chip { } } } - if ( !true_rep_only && length(tas_)>1 ) { + if ( !true_rep_only && length(tas__)>1 ) { # pool tagaligns from pseudo replicates call pool_ta as pool_ta_pr1 { input : tas = spr.ta_pr1, @@ -336,6 +358,29 @@ workflow chip { } } + Array[File] tas_xcor = if length(xcor_scores)>0 then [] + else if length(bam2ta_no_filt_R1.ta)>0 then bam2ta_no_filt_R1.ta + else if length(bam2ta_no_filt.ta)>0 then bam2ta_no_filt.ta + else flatten([bam2ta.ta, tas__]) + Boolean paired_end_xcor = paired_end && length(bam2ta_no_filt_R1.ta)<1 + scatter(ta in tas_xcor) { + # use trimmed/unfilitered R1 tagAlign for paired end dataset + # if not starting from fastqs, keep using old method + # (mapping with both ends for tag-aligns to be used for xcor) + # subsample tagalign (non-mito) and cross-correlation analysis + call xcor { input : + ta = ta, + paired_end = paired_end_xcor, + subsample = xcor_subsample_reads, + mito_chr_name = mito_chr_name, + + cpu = xcor_cpu, + mem_mb = xcor_mem_mb, + time_hr = xcor_time_hr, + disks = xcor_disks, + } + } + # align controls Array[Array[File]] ctl_fastqs_rep1 = if length(ctl_fastqs_rep1_R2)>0 then transpose([ctl_fastqs_rep1_R1,ctl_fastqs_rep1_R2]) else transpose([ctl_fastqs_rep1_R1]) @@ -357,7 +402,12 @@ workflow chip { else if length(ctl_fastqs_rep6)<1 then [ctl_fastqs_rep1,ctl_fastqs_rep2,ctl_fastqs_rep3,ctl_fastqs_rep4,ctl_fastqs_rep5] else [ctl_fastqs_rep1,ctl_fastqs_rep2,ctl_fastqs_rep3,ctl_fastqs_rep4,ctl_fastqs_rep5,ctl_fastqs_rep6] - scatter(fastq_set in ctl_fastqs_) { + ## temp vars for resuming pipelines + Boolean need_to_process_ctl_nodup_bam = length(ctl_tas)==0 + Boolean need_to_process_ctl_bam = need_to_process_ctl_nodup_bam && length(ctl_nodup_bams)==0 + Boolean need_to_process_ctl_fastq = need_to_process_ctl_bam && length(ctl_bams)==0 + + scatter(fastq_set in if need_to_process_ctl_fastq then ctl_fastqs_ else []) { # merge fastqs call merge_fastq as merge_fastq_ctl { input : fastqs = fastq_set, @@ -376,7 +426,7 @@ workflow chip { } Array[File] ctl_bams_ = flatten([bwa_ctl.bam, ctl_bams]) - scatter(bam in ctl_bams_) { + scatter(bam in if need_to_process_ctl_bam then ctl_bams_ else []) { # filter/dedup bam call filter as filter_ctl { input : bam = bam, @@ -384,6 +434,7 @@ workflow chip { dup_marker = dup_marker, mapq_thresh = mapq_thresh, no_dup_removal = no_dup_removal, + mito_chr_name = mito_chr_name, cpu = filter_cpu, mem_mb = filter_mem_mb, @@ -393,13 +444,14 @@ workflow chip { } Array[File] ctl_nodup_bams_ = 
flatten([filter_ctl.nodup_bam, ctl_nodup_bams]) - scatter(bam in ctl_nodup_bams_) { + scatter(bam in if need_to_process_ctl_nodup_bam then ctl_nodup_bams_ else []) { # convert bam to tagalign and subsample it if necessary call bam2ta as bam2ta_ctl { input : bam = bam, paired_end = paired_end, subsample = ctl_subsample_reads, regex_grep_v_ta = regex_filter_reads, + mito_chr_name = mito_chr_name, cpu = bam2ta_cpu, mem_mb = bam2ta_mem_mb, @@ -408,15 +460,15 @@ workflow chip { } } - Array[String] ctl_tas_ = if align_only then [] else flatten([bam2ta_ctl.ta, ctl_tas]) - if ( length(ctl_tas_)>0 ) { + Array[String] ctl_tas_ = if align_only then [] else flatten([bam2ta_ctl.ta, ctl_tas]) + if ( length(ctl_tas_)>0 && !defined(ctl_ta_pooled) ) { # pool tagaligns from true replicates call pool_ta as pool_ta_ctl { input : tas = ctl_tas_, } } - if ( !disable_fingerprint && length(nodup_bams_)>0 && length(ctl_nodup_bams_)>0 && basename(blacklist)!='null' ) { + if ( !disable_fingerprint && length(nodup_bams_)>0 && length(ctl_nodup_bams_)>0 && basename(blacklist)!='null' && length(jsd_qcs)<1 ) { # fingerprint and JS-distance plot call fingerprint { input : nodup_bams = nodup_bams_, @@ -430,16 +482,16 @@ workflow chip { } } - if ( length(tas_)>0 && length(ctl_tas_)>0 ) { + if ( length(tas__)>0 && length(ctl_tas_)>0 ) { # choose appropriate control for each exp IP replicate # outputs: # choose_ctl.idx : control replicate index for each exp replicate # -1 means pooled ctl replicate call choose_ctl { input: - tas = tas_, + tas = tas__, ctl_tas = ctl_tas_, ta_pooled = pool_ta.ta_pooled, - ctl_ta_pooled = pool_ta_ctl.ta_pooled, + ctl_ta_pooled = if !defined(ctl_ta_pooled) then pool_ta_ctl.ta_pooled else ctl_ta_pooled, always_use_pooled_ctl = always_use_pooled_ctl, ctl_depth_ratio = ctl_depth_ratio, } @@ -451,15 +503,15 @@ workflow chip { else xcor.fraglen # make control ta array [[1,2,3,4]] -> [[1],[2],[3],[4]], will be zipped with exp ta array latter - Array[Array[File]] chosen_ctl_tas = if length(tas_)<1 || length(ctl_tas_)<1 then [[],[],[],[],[],[]] + Array[Array[File]] chosen_ctl_tas = if length(tas__)<1 || length(ctl_tas_)<1 then [[],[],[],[],[],[]] else transpose(select_all([choose_ctl.chosen_ctl_tas])) # we have all tas and ctl_tas (optional for histone chipseq) ready, let's call peaks - scatter(i in range(length(tas_))) { + scatter(i in range(length(tas__))) { # always call MACS2 peaks for true replicates to get signal tracks # call peaks on tagalign call macs2 { input : - tas = flatten([[tas_[i]], chosen_ctl_tas[i]]), + tas = flatten([[tas__[i]], chosen_ctl_tas[i]]), gensz = gensz, chrsz = chrsz, cap_num_peak = macs2_cap_num_peak, @@ -477,10 +529,10 @@ workflow chip { # SPP cannot call peaks without controls if ( peak_caller_=='spp' ) { - scatter(i in range(length(tas_))) { + scatter(i in range(length(tas__))) { # call peaks on tagalign call spp { input : - tas = flatten([[tas_[i]], chosen_ctl_tas[i]]), + tas = flatten([[tas__[i]], chosen_ctl_tas[i]]), chrsz = chrsz, cap_num_peak = spp_cap_num_peak, fraglen = fraglen_[i], @@ -495,8 +547,8 @@ workflow chip { } } - if ( peak_caller_=='macs2' ) { - scatter(i in range(length(tas_))) { + if ( peak_caller_=='macs2' && !true_rep_only ) { + scatter(i in range(length(tas__))) { # call peaks on 1st pseudo replicated tagalign call macs2 as macs2_pr1 { input : tas = flatten([[select_first([spr.ta_pr1])[i]], chosen_ctl_tas[i]]), @@ -531,8 +583,8 @@ workflow chip { } } - if ( peak_caller_=='spp' ) { - scatter(i in range(length(tas_))) { + if ( peak_caller_=='spp' && 
!true_rep_only ) { + scatter(i in range(length(tas__))) { # call peaks on 1st pseudo replicated tagalign call spp as spp_pr1 { input : tas = flatten([[select_first([spr.ta_pr1])[i]], chosen_ctl_tas[i]]), @@ -572,11 +624,12 @@ workflow chip { } # actually not an array - Array[File] chosen_ctl_ta_pooled = if length(tas_)<2 || length(ctl_tas_)<1 then [] + Array[File] chosen_ctl_ta_pooled = if length(tas__)<2 || length(ctl_tas_)<1 then [] else if length(ctl_tas_)<2 then [ctl_tas_[0]] # choose first (only) control + else if defined(ctl_ta_pooled) then select_all([ctl_ta_pooled]) # choose pooled control else select_all([pool_ta_ctl.ta_pooled]) # choose pooled control - if ( length(tas_)>1 ) { + if ( length(tas__)>1 ) { # call peaks on pooled replicate # always call MACS2 peaks for pooled replicate to get signal tracks call macs2 as macs2_pooled { input : @@ -595,7 +648,7 @@ workflow chip { time_hr = macs2_time_hr, } } - if ( length(tas_)>1 && peak_caller_=='spp' ) { + if ( length(tas__)>1 && peak_caller_=='spp' ) { # call peaks on pooled replicate call spp as spp_pooled { input : tas = flatten([select_all([pool_ta.ta_pooled]), chosen_ctl_ta_pooled]), @@ -612,7 +665,7 @@ workflow chip { } } - if ( !true_rep_only && length(tas_)>1 && peak_caller_=='macs2' ) { + if ( !true_rep_only && length(tas__)>1 && peak_caller_=='macs2' ) { # call peaks on 1st pooled pseudo replicates call macs2 as macs2_ppr1 { input : tas = flatten([select_all([pool_ta_pr1.ta_pooled]), chosen_ctl_ta_pooled]), @@ -630,7 +683,7 @@ workflow chip { time_hr = macs2_time_hr, } } - if ( !true_rep_only && length(tas_)>1 && peak_caller_=='macs2' ) { + if ( !true_rep_only && length(tas__)>1 && peak_caller_=='macs2' ) { # call peaks on 2nd pooled pseudo replicates call macs2 as macs2_ppr2 { input : tas = flatten([select_all([pool_ta_pr2.ta_pooled]), chosen_ctl_ta_pooled]), @@ -648,7 +701,7 @@ workflow chip { time_hr = macs2_time_hr, } } - if ( !true_rep_only && length(tas_)>1 && peak_caller_=='spp' ) { + if ( !true_rep_only && length(tas__)>1 && peak_caller_=='spp' ) { # call peaks on 1st pooled pseudo replicates call spp as spp_ppr1 { input : tas = flatten([select_all([pool_ta_pr1.ta_pooled]), chosen_ctl_ta_pooled]), @@ -664,7 +717,7 @@ workflow chip { time_hr = spp_time_hr, } } - if ( !true_rep_only && length(tas_)>1 && peak_caller_=='spp' ) { + if ( !true_rep_only && length(tas__)>1 && peak_caller_=='spp' ) { # call peaks on 2nd pooled pseudo replicates call spp as spp_ppr2 { input : tas = flatten([select_all([pool_ta_pr2.ta_pooled]), chosen_ctl_ta_pooled]), @@ -682,7 +735,10 @@ workflow chip { } # make peak arrays - Array[File] peaks_ = if align_only then [] else select_first([spp.rpeak ,flatten([macs2.npeak, peaks])]) + Array[File] peaks_ = if align_only then [] + else if peak_caller_=='spp' then flatten(select_all([spp.rpeak, peaks])) + else if peak_caller_=='macs2' then flatten([macs2.npeak, peaks]) + else [] # generate all possible pairs of true replicates (pair: left=prefix, right=[peak1,peak2]) Array[Pair[String,Array[File]]] peak_pairs = @@ -707,22 +763,24 @@ workflow chip { ('rep3-rep4',[peaks_[2],peaks_[3]]), ('rep3-rep5',[peaks_[2],peaks_[4]]), ('rep3-rep6',[peaks_[2],peaks_[5]]), ('rep4-rep5',[peaks_[3],peaks_[4]]), ('rep4-rep6',[peaks_[3],peaks_[5]]), ('rep5-rep6',[peaks_[4],peaks_[5]])] - scatter( pair in peak_pairs ) { - # Naive overlap on every pair of true replicates - call overlap { input : - prefix = pair.left, - peak1 = pair.right[0], - peak2 = pair.right[1], - peak_pooled = select_first([spp_pooled.rpeak, 
macs2_pooled.npeak, peak_pooled]), - fraglen = fraglen_mean.rounded_mean, - peak_type = peak_type, - blacklist = blacklist, - chrsz = chrsz, - keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = pool_ta.ta_pooled, + if ( length(peaks_)>0 ) { + scatter( pair in peak_pairs ) { + # Naive overlap on every pair of true replicates + call overlap { input : + prefix = pair.left, + peak1 = pair.right[0], + peak2 = pair.right[1], + peak_pooled = select_first([spp_pooled.rpeak, macs2_pooled.npeak, peak_pooled]), + fraglen = fraglen_mean.rounded_mean, + peak_type = peak_type, + blacklist = blacklist, + chrsz = chrsz, + keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, + ta = if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, + } } } - if ( enable_idr ) { + if ( length(peaks_)>0 && enable_idr ) { scatter( pair in peak_pairs ) { # IDR on every pair of true replicates call idr { input : @@ -737,13 +795,23 @@ workflow chip { blacklist = blacklist, chrsz = chrsz, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = pool_ta.ta_pooled, + ta = if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, } } } - Array[File] peaks_pr1_ = select_first([spp_pr1.rpeak, macs2_pr1.npeak, peaks_pr1]) - Array[File] peaks_pr2_ = select_first([spp_pr2.rpeak, macs2_pr2.npeak, peaks_pr2]) + Array[File] peaks_pr1_ = flatten(select_all([spp_pr1.rpeak, macs2_pr1.npeak, peaks_pr1])) + Array[File] peaks_pr2_ = flatten(select_all([spp_pr2.rpeak, macs2_pr2.npeak, peaks_pr2])) + + #Array[File] peaks_pr1_ = if align_only then [] + # else if peak_caller=='spp' then flatten(select_all([spp_pr1.rpeak, peaks_pr1])) + # else if peak_caller=='macs2' then flatten(select_all([macs2_pr1.npeak, peaks_pr1])) + # else [] + #Array[File] peaks_pr2_ = if align_only then [] + # else if peak_caller=='spp' then flatten(select_all([spp_pr2.rpeak, peaks_pr2])) + # else if peak_caller=='macs2' then flatten(select_all([macs2_pr2.npeak, peaks_pr2])) + # else [] + scatter( i in range(length(peaks_pr1_)) ) { # Naive overlap on pseduo replicates call overlap as overlap_pr { input : @@ -756,7 +824,7 @@ workflow chip { blacklist = blacklist, chrsz = chrsz, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = if length(tas_)>0 then tas_[i] else pool_ta.ta_pooled, + ta = if length(tas_)>0 then tas_[i] else if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, } } if ( enable_idr ) { @@ -774,11 +842,10 @@ workflow chip { blacklist = blacklist, chrsz = chrsz, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = if length(tas_)>0 then tas_[i] else pool_ta.ta_pooled, + ta = if length(tas_)>0 then tas_[i] else if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, } } } - if ( length(peaks_pr1_)>1 ) { # Naive overlap on pooled pseudo replicates call overlap as overlap_ppr { input : @@ -791,7 +858,7 @@ workflow chip { blacklist = blacklist, chrsz = chrsz, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = pool_ta.ta_pooled, + ta = if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, } } if ( enable_idr && length(peaks_pr1_)>1 ) { @@ -808,7 +875,7 @@ workflow chip { blacklist = blacklist, chrsz = chrsz, keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, - ta = pool_ta.ta_pooled, + ta = if defined(ta_pooled) then ta_pooled else pool_ta.ta_pooled, } } @@ -836,6 +903,28 @@ workflow chip { keep_irregular_chr_in_bfilt_peak = keep_irregular_chr_in_bfilt_peak, } } + + Array[File] 
flagstat_qcs_ = flatten([flagstat_qcs, bwa.flagstat_qc]) + Array[File] pbc_qcs_ = flatten([pbc_qcs, filter.pbc_qc]) + Array[File] dup_qcs_ = flatten([dup_qcs, filter.dup_qc]) + Array[File] nodup_flagstat_qcs_ = flatten([nodup_flagstat_qcs, filter.flagstat_qc]) + + Array[File] ctl_flagstat_qcs_ = flatten([ctl_flagstat_qcs, bwa_ctl.flagstat_qc]) + Array[File] ctl_pbc_qcs_ = flatten([ctl_pbc_qcs, filter_ctl.pbc_qc]) + Array[File] ctl_dup_qcs_ = flatten([ctl_dup_qcs, filter_ctl.dup_qc]) + Array[File] ctl_nodup_flagstat_qcs_ = flatten([ctl_nodup_flagstat_qcs, filter_ctl.flagstat_qc]) + + Array[File] xcor_plots_ = flatten([xcor_plots, xcor.plot_png]) + Array[File] xcor_scores_ = flatten([xcor_scores, xcor.score]) + Array[File] sig_pvals_ = flatten([sig_pvals, macs2.sig_pval]) + + Array[File] macs2_frip_qcs_ = flatten([macs2_frip_qcs, macs2.frip_qc]) + Array[File] macs2_pr1_frip_qcs_ = flatten(select_all([macs2_pr1_frip_qcs, macs2_pr1.frip_qc])) + Array[File] macs2_pr2_frip_qcs_ = flatten(select_all([macs2_pr2_frip_qcs, macs2_pr2.frip_qc])) + Array[File] spp_frip_qcs_ = flatten(select_all([spp_frip_qcs, spp.frip_qc])) + Array[File] spp_pr1_frip_qcs_ = flatten(select_all([spp_pr1_frip_qcs, spp_pr1.frip_qc])) + Array[File] spp_pr2_frip_qcs_ = flatten(select_all([spp_pr2_frip_qcs, spp_pr2.frip_qc])) + # Generate final QC report and JSON call qc_report { input : pipeline_ver = pipeline_ver, @@ -848,32 +937,34 @@ workflow chip { macs2_cap_num_peak = macs2_cap_num_peak, macs2_cap_num_peak = spp_cap_num_peak, idr_thresh = idr_thresh, - flagstat_qcs = bwa.flagstat_qc, - nodup_flagstat_qcs = filter.flagstat_qc, - dup_qcs = filter.dup_qc, - pbc_qcs = filter.pbc_qc, - ctl_flagstat_qcs = bwa_ctl.flagstat_qc, - ctl_nodup_flagstat_qcs = filter_ctl.flagstat_qc, - ctl_dup_qcs = filter_ctl.dup_qc, - ctl_pbc_qcs = filter_ctl.pbc_qc, - xcor_plots = xcor.plot_png, - xcor_scores = xcor.score, - - jsd_plot = fingerprint.plot, - jsd_qcs = select_first([fingerprint.jsd_qcs,[]]), - - frip_macs2_qcs = macs2.frip_qc, - frip_macs2_qcs_pr1 = macs2_pr1.frip_qc, - frip_macs2_qcs_pr2 = macs2_pr2.frip_qc, - frip_macs2_qc_pooled = macs2_pooled.frip_qc, - frip_macs2_qc_ppr1 = macs2_ppr1.frip_qc, - frip_macs2_qc_ppr2 = macs2_ppr2.frip_qc, - frip_spp_qcs = spp.frip_qc, - frip_spp_qcs_pr1 = spp_pr1.frip_qc, - frip_spp_qcs_pr2 = spp_pr2.frip_qc, - frip_spp_qc_pooled = spp_pooled.frip_qc, - frip_spp_qc_ppr1 = spp_ppr1.frip_qc, - frip_spp_qc_ppr2 = spp_ppr2.frip_qc, + + flagstat_qcs = flagstat_qcs_, + nodup_flagstat_qcs = nodup_flagstat_qcs_, + dup_qcs = dup_qcs_, + pbc_qcs = pbc_qcs_, + ctl_flagstat_qcs = ctl_flagstat_qcs_, + ctl_nodup_flagstat_qcs = ctl_nodup_flagstat_qcs_, + ctl_dup_qcs = ctl_dup_qcs_, + ctl_pbc_qcs = ctl_pbc_qcs_, + xcor_plots = xcor_plots_, + xcor_scores = xcor_scores_, + + jsd_plot = if length(jsd_qcs)>0 then jsd_plot else fingerprint.plot, + jsd_qcs = if length(jsd_qcs)>0 then jsd_qcs else select_first([fingerprint.jsd_qcs,[]]), + + frip_macs2_qcs = macs2_frip_qcs_, + frip_macs2_qcs_pr1 = macs2_pr1_frip_qcs_, + frip_macs2_qcs_pr2 = macs2_pr2_frip_qcs_, + frip_macs2_qc_pooled = if defined(macs2_pooled_frip_qc_) then macs2_pooled_frip_qc_ else macs2_pooled.frip_qc, + frip_macs2_qc_ppr1 = if defined(macs2_ppr1_frip_qc_) then macs2_ppr1_frip_qc_ else macs2_ppr1.frip_qc, + frip_macs2_qc_ppr2 = if defined(macs2_ppr2_frip_qc_) then macs2_ppr2_frip_qc_ else macs2_ppr2.frip_qc, + + frip_spp_qcs = spp_frip_qcs_, + frip_spp_qcs_pr1 = spp_pr1_frip_qcs_, + frip_spp_qcs_pr2 = spp_pr2_frip_qcs_, + frip_spp_qc_pooled = if 
defined(spp_pooled_frip_qc_) then spp_pooled_frip_qc_ else spp_pooled.frip_qc, + frip_spp_qc_ppr1 = if defined(spp_ppr1_frip_qc_) then spp_ppr1_frip_qc_ else spp_ppr1.frip_qc, + frip_spp_qc_ppr2 = if defined(spp_ppr2_frip_qc_) then spp_ppr2_frip_qc_ else spp_ppr2.frip_qc, idr_plots = idr.idr_plot, idr_plots_pr = idr_pr.idr_plot, @@ -983,6 +1074,7 @@ task filter { # dup.qc and pbc.qc will be empty files # and nodup_bam in the output is # filtered bam with dupes + String mito_chr_name Int cpu Int mem_mb Int time_hr @@ -998,6 +1090,7 @@ task filter { ${"--dup-marker " + dup_marker} \ ${"--mapq-thresh " + mapq_thresh} \ ${if no_dup_removal then "--no-dup-removal" else ""} \ + ${"--mito-chr-name " + mito_chr_name} \ ${"--nth " + cpu} } output { @@ -1020,6 +1113,7 @@ task bam2ta { Boolean paired_end String regex_grep_v_ta # Perl-style regular expression pattern # to remove matching reads from TAGALIGN + String mito_chr_name # mito chromosome name Int subsample # number of reads to subsample TAGALIGN # this affects all downstream analysis Int cpu @@ -1033,6 +1127,7 @@ task bam2ta { --disable-tn5-shift \ ${if paired_end then "--paired-end" else ""} \ ${if regex_grep_v_ta!="" then "--regex-grep-v-ta '"+regex_grep_v_ta+"'" else ""} \ + ${"--mito-chr-name " + mito_chr_name} \ ${"--subsample " + subsample} \ ${"--nth " + cpu} } @@ -1091,6 +1186,7 @@ task pool_ta { task xcor { File ta Boolean paired_end + String mito_chr_name Int subsample # number of reads to subsample TAGALIGN # this will be used for xcor only # will not affect any downstream analysis @@ -1103,6 +1199,7 @@ task xcor { python $(which encode_xcor.py) \ ${ta} \ ${if paired_end then "--paired-end" else ""} \ + ${"--mito-chr-name " + mito_chr_name} \ ${"--subsample " + subsample} \ ${"--nth " + cpu} } diff --git a/conda/requirements.txt b/conda/requirements.txt index 7fd45785..423cfd41 100644 --- a/conda/requirements.txt +++ b/conda/requirements.txt @@ -44,3 +44,4 @@ pybigwig==0.3.11 phantompeakqualtools ==1.2 tabix==0.2.6 openssl==1.0.2p +readline==6.2 diff --git a/conda/requirements_py3.txt b/conda/requirements_py3.txt index 21f0cb7c..08bf6c43 100644 --- a/conda/requirements_py3.txt +++ b/conda/requirements_py3.txt @@ -11,3 +11,4 @@ libgcc==5.2.0 # this does not work with MacOS... 
matplotlib #==1.5.1 ncurses ==6.1 tabix==0.2.6 +readline==6.2 diff --git a/docker_image/Dockerfile b/docker_image/Dockerfile index be1edc65..73c84171 100644 --- a/docker_image/Dockerfile +++ b/docker_image/Dockerfile @@ -9,7 +9,8 @@ # then install with --no-dependencies # Set the base image to Ubuntu 16.04 -FROM ubuntu:16.04 +#FROM ubuntu:16.04 +FROM ubuntu@sha256:e4a134999bea4abb4a27bc437e6118fdddfb172e1b9d683129b74d254af51675 # File Author / Maintainer MAINTAINER Jin Lee @@ -78,7 +79,7 @@ RUN wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedGraphToBigWig # Install samtools 1.2 RUN git clone --branch 1.2 --single-branch https://github.com/samtools/samtools.git && \ - git clone --branch 1.2 --single-branch git://github.com/samtools/htslib.git && \ + git clone --branch 1.2 --single-branch https://github.com/samtools/htslib.git && \ cd samtools && make && make install && cd ../ && rm -rf samtools* htslib* # Install bedtools 2.26.0 diff --git a/docs/dev.md b/docs/dev.md index ce94ec54..2cc80ea3 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -2,12 +2,18 @@ ## Command line for version change ```bash -PREV_VER=v1.1.4 -NEW_VER=v1.1.5 -for f in $(grep -rl ${PREV_VER} --include=*.{wdl,json,md,sh,yml}) +PREV_VER=v1.1.5 +NEW_VER=v1.1.6 +for f in $(grep -rl ${PREV_VER} --include=*.{wdl,md,sh,yml}) do sed -i "s/${PREV_VER}/${NEW_VER}/g" ${f} done +cd workflow_opts +for f in $(grep -rl ${PREV_VER} --include=*.json) +do + sed -i "s/${PREV_VER}/${NEW_VER}/g" ${f} +done +cd .. ``` ## Building templates on DX for each genome @@ -18,7 +24,7 @@ Run the following command line locally to build out DX workflows for this pipeli ```bash # version -VER=v1.1.5 +VER=v1.1.6 # general java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/general -defaults examples/dx/template_general.json @@ -41,11 +47,11 @@ java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing # test sample SE ENCSR000DYI (full) java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR000DYI -defaults examples/dx/ENCSR000DYI_dx.json -# test sample PE ENCSR936XTK (subsampled) -java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR936XTK_subsampled -defaults examples/dx/ENCSR936XTK_subsampled_dx.json +# test sample PE ENCSR936XTK (subsampled, chr19/chrM only) +java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR936XTK_subsampled_chr19_only -defaults examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json -# test sample SE ENCSR000DYI (subsampled) -java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR000DYI_subsampled -defaults examples/dx/ENCSR000DYI_subsampled_dx.json +# test sample SE ENCSR000DYI (subsampled, chr19/chrM only) +java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR000DYI_subsampled_chr19_only -defaults examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json ## DX Azure @@ -70,9 +76,9 @@ java -jar ~/dxWDL-0.77.jar 
compile chip.wdl -project "ENCODE Uniform Processing # test sample SE ENCSR000DYI (full) java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines Azure" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR000DYI -defaults examples/dx_azure/ENCSR000DYI_dx_azure.json -# test sample PE ENCSR936XTK (subsampled) -java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines Azure" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR936XTK_subsampled -defaults examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json +# test sample PE ENCSR936XTK (subsampled, chr19/chrM only) +java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines Azure" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR936XTK_subsampled_chr19_only -defaults examples/dx_azure/ENCSR936XTK_subsampled_chr19_only_dx_azure.json -# test sample SE ENCSR000DYI (subsampled) -java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines Azure" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR000DYI_subsampled -defaults examples/dx_azure/ENCSR000DYI_subsampled_dx_azure.json +# test sample SE ENCSR000DYI (subsampled, chr19/chrM only) +java -jar ~/dxWDL-0.77.jar compile chip.wdl -project "ENCODE Uniform Processing Pipelines Azure" -extras workflow_opts/docker.json -f -folder /ChIP-seq2/workflows/$VER/test_ENCSR000DYI_subsampled_chr19_only -defaults examples/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json ``` diff --git a/docs/input.md b/docs/input.md index 7c4b7b2a..360d1c95 100644 --- a/docs/input.md +++ b/docs/input.md @@ -129,8 +129,13 @@ Let us take a close look at the following template JSON. Comments are not allowe // Skip dup removal in a BAM filtering stage. "chip.no_dup_removal" : false, - // Regular expression to filter out reads - // Any read that matches with this reg-ex pattern will be removed from outputs + // Name of mito chromosome. THIS IS NOT A REG-EX! you can define only one chromosome name for mito. + "chip.mito_chr_name" : "chrM", + + // Regular expression to filter out reads with given chromosome name (1st column of BED/TAG-ALIGN) + // Any read with chr name that matches with this reg-ex pattern will be removed from outputs + // If your have changed the above parameter "chip.mito_chr_name" and still want to filter out mito reads, + // then make sure that "chip.mito_chr_name" and "chip.regex_filter_reads" are the same. "chip.regex_filter_reads" : "chrM", // Subsample reads (0: no subsampling) diff --git a/docs/tutorial_dx_cli.md b/docs/tutorial_dx_cli.md index cc5c90b5..0303e557 100644 --- a/docs/tutorial_dx_cli.md +++ b/docs/tutorial_dx_cli.md @@ -34,24 +34,24 @@ This document describes instruction for the item 1). 6. Choose an appropriate input for your project (AWS or Azure): * AWS ```bash - $ INPUT=examples/dx/ENCSR936XTK_subsampled_dx.json + $ INPUT=examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json ``` * Azure ```bash - $ INPUT=examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json + $ INPUT=examples/dx_azure/ENCSR936XTK_subsampled_chr19_only_dx_azure.json ``` 7. Compile `chip.wdl` with an input JSON for the SUBSAMPLED paired-end sample of [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/). 
```bash $ PROJECT=[YOUR_PROJECT_NAME] - $ OUT_FOLDER=/test_sample_chip_ENCSR936XTK_subsampled + $ OUT_FOLDER=/test_sample_chip_ENCSR936XTK_subsampled_chr19_only $ java -jar dxWDL-0.77.jar compile chip.wdl -project ${PROJECT} -f -folder ${OUT_FOLDER} -defaults ${INPUT} -extras workflow_opts/docker.json ``` 8. Go to DNAnexus [project page](https://platform.DNAnexus.com/projects) and click on your project. -9. Move to the directory `/test_sample_chip_ENCSR936XTK_subsampled`. +9. Move to the directory `/test_sample_chip_ENCSR936XTK_subsampled_chr19_only`. 10. You will find a DX workflow `chip` with all parameters pre-defined. Click on it. diff --git a/docs/tutorial_dx_web.md b/docs/tutorial_dx_web.md index 87296f05..e3097212 100644 --- a/docs/tutorial_dx_web.md +++ b/docs/tutorial_dx_web.md @@ -15,8 +15,8 @@ This document describes instruction for the item 2). 3. Move to one of the following workflow directories according to the platform you have chosen for your project (AWS or Azure). These DX workflows are pre-built with all parameters defined. -* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/test_ENCSR936XTK_subsampled) -* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/test_ENCSR936XTK_subsampled) +* [AWS test workflow](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/test_ENCSR936XTK_subsampled_chr19_only) +* [Azure test workflow](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/test_ENCSR936XTK_subsampled_chr19_only) 4. Copy it to your project by right-clicking on the DX workflow `chip` and choose "Copy". @@ -40,16 +40,16 @@ This document describes instruction for the item 2). 1. DNAnexus allows only one copy of a workflow per project. The example workflow in the previous section is pre-built for the subsampled test sample [ENCSR936XTK](https://www.encodeproject.org/experiments/ENCSR936XTK/) with all parameters defined already. 2. Copy one of the following workflows according to the platform you have chosen for your project (AWS or Azure). -* [AWS general](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.5/general) without pre-defined reference genome. -* [AWS hg38](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.5/hg38) with pre-defined hg38 reference genome. -* [AWS hg19](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.5/hg19) with pre-defined hg19 reference genome. -* [AWS mm10](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.5/mm10) with pre-defined mm10 reference genome. -* [AWS mm9](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.5/mm9) with pre-defined mm9 reference genome. -* [Azure general](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.5/general) without pre-defined reference genome. -* [Azure hg38](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.5/hg38) with pre-defined hg38 reference genome. -* [Azure hg19](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.5/hg19) with pre-defined hg19 reference genome. 
-* [Azure mm10](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.5/mm10) with pre-defined mm10 reference genome. -* [Azure mm9](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.5/mm9) with pre-defined mm9 reference genome. +* [AWS general](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/general) without pre-defined reference genome. +* [AWS hg38](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/hg38) with pre-defined hg38 reference genome. +* [AWS hg19](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/hg19) with pre-defined hg19 reference genome. +* [AWS mm10](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/mm10) with pre-defined mm10 reference genome. +* [AWS mm9](https://platform.DNAnexus.com/projects/BKpvFg00VBPV975PgJ6Q03v6/data/ChIP-seq2/workflows/v1.1.6/mm9) with pre-defined mm9 reference genome. +* [Azure general](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/general) without pre-defined reference genome. +* [Azure hg38](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/hg38) with pre-defined hg38 reference genome. +* [Azure hg19](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/hg19) with pre-defined hg19 reference genome. +* [Azure mm10](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/mm10) with pre-defined mm10 reference genome. +* [Azure mm9](https://platform.DNAnexus.com/projects/F6K911Q9xyfgJ36JFzv03Z5J/data/ChIP-seq2/workflows/v1.1.6/mm9) with pre-defined mm9 reference genome. 3. Click on the DX workflow `chip`. diff --git a/docs/tutorial_google.md b/docs/tutorial_google.md index d7ad7e8e..e596aac5 100644 --- a/docs/tutorial_google.md +++ b/docs/tutorial_google.md @@ -43,11 +43,11 @@ All test samples and genome data are shared on our public Google Cloud buckets. $ cd chip-seq-pipeline2 ``` -10. Run a pipeline for a SUBSAMPLED (1/400) paired-end sample of [ENCSR356KRQ](https://www.encodeproject.org/experiments/ENCSR356KRQ/). +10. Run a pipeline for the test sample. ```bash $ PROJECT=[YOUR_PROJECT_NAME] - $ BUCKET=gs://[YOUR_BUCKET_NAME]/ENCSR356KRQ_subsampled - $ INPUT=examples/google/ENCSR356KRQ_subsampled.json + $ BUCKET=gs://[YOUR_BUCKET_NAME]/ENCSR936XTK_subsampled + $ INPUT=examples/google/ENCSR936XTK_subsampled_chr19_only.json $ java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=google -Dbackend.providers.google.config.project=${PROJECT} -Dbackend.providers.google.config.root=${BUCKET} cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/docker.json ``` @@ -56,6 +56,8 @@ All test samples and genome data are shared on our public Google Cloud buckets. 12. See full specification for [input JSON file](input.md). +13. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=examples/google/ENCSR936XTK_subsampled_chr19_only.json`. + ## Extras for advanced users 1. 
Set quota for [Google Compute Engine API](https://console.cloud.google.com/iam-admin/quotas) per region. Increase quota for SSD/HDD storage, number of vCPUs to process more samples faster simulateneouly. diff --git a/docs/tutorial_local_conda.md b/docs/tutorial_local_conda.md index fb8de490..f982700e 100644 --- a/docs/tutorial_local_conda.md +++ b/docs/tutorial_local_conda.md @@ -20,9 +20,9 @@ $ tar xvf ENCSR936XTK_fastq_subsampled.tar ``` -4. Download pre-built genome database for hg38. +4. Download pre-built chr19/chrM-only genome database for hg38. ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chip.tar + $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar $ tar xvf test_genome_database_hg38_chip.tar ``` @@ -41,10 +41,13 @@ 7. Run a pipeline for the test sample. ```bash $ source activate encode-chip-seq-pipeline # IMPORTANT! - $ INPUT=examples/local/ENCSR936XTK_subsampled.json - $ java -jar -Dconfig.file=backends/backend.conf cromwell-34.jar run chip.wdl -i ${INPUT} + $ INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json + $ PIPELINE_METADATA=metadata.json + $ java -jar -Dconfig.file=backends/backend.conf cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} ``` 8. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. 9. See full specification for [input JSON file](input.md). + +10. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json`. \ No newline at end of file diff --git a/docs/tutorial_local_docker.md b/docs/tutorial_local_docker.md index d8f8bf98..0d290092 100644 --- a/docs/tutorial_local_docker.md +++ b/docs/tutorial_local_docker.md @@ -18,18 +18,21 @@ $ tar xvf ENCSR936XTK_fastq_subsampled.tar ``` -4. Download pre-built genome database for hg38. +4. Download pre-built chr19/chrM-only genome database for hg38. ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chip.tar + $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar $ tar xvf test_genome_database_hg38_chip.tar ``` 5. Run a pipeline for the test sample. ```bash - $ INPUT=examples/local/ENCSR936XTK_subsampled.json - $ java -jar -Dconfig.file=backends/backend.conf cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/docker.json + $ INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json + $ PIPELINE_METADATA=metadata.json + $ java -jar -Dconfig.file=backends/backend.conf cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/docker.json -m ${PIPELINE_METADATA} ``` 6. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. 7. See full specification for [input JSON file](input.md). + +8. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. 
Once you get a new input JSON file from the resumer, use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json`. \ No newline at end of file diff --git a/docs/tutorial_local_singularity.md b/docs/tutorial_local_singularity.md index 8f2a90b5..93aab633 100644 --- a/docs/tutorial_local_singularity.md +++ b/docs/tutorial_local_singularity.md @@ -20,9 +20,9 @@ $ tar xvf ENCSR936XTK_fastq_subsampled.tar ``` -4. Download pre-built genome database for hg38. +4. Download pre-built chr19/chrM-only genome database for hg38. ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chip.tar + $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar $ tar xvf test_genome_database_hg38_chip.tar ``` @@ -33,24 +33,27 @@ 6. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.5.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.5 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.6.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.6 ``` 7. Run a pipeline for the test sample. ```bash - $ INPUT=examples/local/ENCSR936XTK_subsampled.json - $ java -jar -Xmx1G -Dconfig.file=backends/backend.conf -Dbackend.default=singularity cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json + $ INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json + $ PIPELINE_METADATA=metadata.json + $ java -jar -Xmx1G -Dconfig.file=backends/backend.conf -Dbackend.default=singularity cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json -m ${PIPELINE_METADATA} ``` 8. It will take about 6 hours. You will be able to find all outputs on `cromwell-executions/chip/[RANDOM_HASH_STRING]/`. See [output directory structure](output.md) for details. 9. See full specification for [input JSON file](input.md). -10. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/singularity.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. +10. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json`. + +11. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/singularity.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. 
```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." } } diff --git a/docs/tutorial_scg.md b/docs/tutorial_scg.md index ab04edc4..1d58919e 100644 --- a/docs/tutorial_scg.md +++ b/docs/tutorial_scg.md @@ -41,14 +41,14 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 6. Run a pipeline for the test sample. You must have a paid account on SCG4. ```bash - $ sbatch --account [YOUR_PAID_ACCOUNT_ON_SCG4] examples/scg/ENCSR936XTK_subsampled_scg_conda.sh + $ sbatch --account [YOUR_PAID_ACCOUNT_ON_SCG4] examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_conda.sh ``` ## For singularity users 6. Run a pipeline for the test sample. You must have a paid account on SCG4. ```bash - $ sbatch --account [YOUR_PAID_ACCOUNT_ON_SCG4] examples/scg/ENCSR936XTK_subsampled_scg_singularity.sh + $ sbatch --account [YOUR_PAID_ACCOUNT_ON_SCG4] examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_singularity.sh ``` ## For all users @@ -60,13 +60,15 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 8. See full specification for [input JSON file](input.md). +9. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, then edit your shell script (`examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_*.sh`) to use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=examples/...`. + ## For singularity users -9. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/scg.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. +10. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/scg.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/reference/ENCODE,/scratch,/srv/gsfs0,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." } } diff --git a/docs/tutorial_scg_backend.md b/docs/tutorial_scg_backend.md index 8c905691..b78248c9 100644 --- a/docs/tutorial_scg_backend.md +++ b/docs/tutorial_scg_backend.md @@ -59,7 +59,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 5. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. 
```bash $ sdev # SCG cluster does not allow building a container on login node - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.5.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.5 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.6.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.6 $ exit ``` @@ -78,7 +78,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/scratch/users,/srv/gsfs0,/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." } } diff --git a/docs/tutorial_sge.md b/docs/tutorial_sge.md index f42e68fc..11d374fa 100644 --- a/docs/tutorial_sge.md +++ b/docs/tutorial_sge.md @@ -20,9 +20,9 @@ $ tar xvf ENCSR936XTK_fastq_subsampled.tar ``` -4. Download pre-built genome database for hg38. +4. Download pre-built chr19/chrM-only genome database for hg38. ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chip.tar + $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar $ tar xvf test_genome_database_hg38_chip.tar ``` @@ -49,7 +49,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 8. Run a pipeline for the test sample. If your parallel environment (PE) found from step 5) has a different name from `shm` then edit the following shell script to change the PE name. ```bash - $ qsub examples/local/ENCSR936XTK_subsampled_sge_conda.sh + $ qsub examples/local/ENCSR936XTK_subsampled_chr19_only_sge_conda.sh ``` ## For singularity users @@ -61,7 +61,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.5.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.5 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.6.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.6 ``` 8. Run a pipeline for the test sample. If your parallel environment (PE) found from step 5) has a different name from `shm` then edit the following shell script to change the PE name. @@ -75,13 +75,15 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 10. See full specification for [input JSON file](input.md). +11. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, then edit your shell script (`examples/local/ENCSR936XTK_subsampled_chr19_only_sge_*.sh`) to use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=examples/...`. 
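A minimal sketch of the resume cycle referred to in the step above, assuming the local Conda commands shown in `docs/tutorial_local_conda.md`; the exact resumer invocation is not reproduced here (see `utils/resumer/README.md`), and `[FAILED_WORKFLOW_ID]` is a placeholder.

```bash
# 1) Run the pipeline and ask Cromwell to write workflow metadata (-m),
#    which the resumer will later parse.
INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json
PIPELINE_METADATA=metadata.json
java -jar -Dconfig.file=backends/backend.conf cromwell-34.jar run chip.wdl \
  -i ${INPUT} -m ${PIPELINE_METADATA}

# 2) If the workflow fails, run the resumer on metadata.json as described in
#    utils/resumer/README.md; it writes resume.[FAILED_WORKFLOW_ID].json.

# 3) Re-run with the generated input JSON instead of the original one.
INPUT=resume.[FAILED_WORKFLOW_ID].json
java -jar -Dconfig.file=backends/backend.conf cromwell-34.jar run chip.wdl \
  -i ${INPUT} -m ${PIPELINE_METADATA}
```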
+ ## For singularity users -11. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/sge.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. +12. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/sge.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." } } diff --git a/docs/tutorial_sge_backend.md b/docs/tutorial_sge_backend.md index 1475e1b0..9f9fb951 100644 --- a/docs/tutorial_sge_backend.md +++ b/docs/tutorial_sge_backend.md @@ -68,7 +68,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.5.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.6.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1 ``` 8. Run a pipeline for the test sample. diff --git a/docs/tutorial_sherlock.md b/docs/tutorial_sherlock.md index 5b7bf3ab..9e794931 100644 --- a/docs/tutorial_sherlock.md +++ b/docs/tutorial_sherlock.md @@ -41,14 +41,14 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 6. Run a pipeline for the test sample. ```bash - $ sbatch --partition normal examples/sherlock/ENCSR936XTK_subsampled_sherlock_conda.sh + $ sbatch --partition normal examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_conda.sh ``` ## For singularity users 6. Run a pipeline for the test sample. ```bash - $ sbatch --partition normal examples/sherlock/ENCSR936XTK_subsampled_sherlock_singularity.sh + $ sbatch --partition normal examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_singularity.sh ``` ## For all users @@ -60,13 +60,15 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 8. See full specification for [input JSON file](input.md). +9. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, then edit your shell script (`examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_*.sh`) to use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=examples/...`. + ## For singularity users -9. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/sherlock.json`. 
For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. +10. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/sherlock.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/scratch,/lscratch,/oak/stanford,/home/groups/cherry/encode,/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." } } diff --git a/docs/tutorial_sherlock_backend.md b/docs/tutorial_sherlock_backend.md index 40eef031..7d2cd8df 100644 --- a/docs/tutorial_sherlock_backend.md +++ b/docs/tutorial_sherlock_backend.md @@ -63,7 +63,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 6. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. Stanford Sherlock does not allow building a container on login nodes. Wait until you get a command prompt after `sdev`. ```bash $ sdev # sherlock cluster does not allow building a container on login node - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.5.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.5 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.6.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.6 $ exit # exit from an interactive node ``` @@ -82,7 +82,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/scratch,/oak/stanford,/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR1,..." } } diff --git a/docs/tutorial_slurm.md b/docs/tutorial_slurm.md index acd6062f..eb6d77af 100644 --- a/docs/tutorial_slurm.md +++ b/docs/tutorial_slurm.md @@ -20,9 +20,9 @@ $ tar xvf ENCSR936XTK_fastq_subsampled.tar ``` -4. Download pre-built genome database for hg38. +4. Download pre-built chr19/chrM-only genome database for hg38. ```bash - $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chip.tar + $ wget https://storage.googleapis.com/encode-pipeline-genome-data/test_genome_database_hg38_chr19_chrM_chip.tar $ tar xvf test_genome_database_hg38_chip.tar ``` @@ -44,7 +44,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 7. Run a pipeline for the test sample. Try without partition and account settings first. If your cluster requires to specify any of them then add one to the command line. 
```bash - $ sbatch --partition [YOUR_PARTITION] --account [YOUR_ACCOUNT] examples/local/ENCSR936XTK_subsampled_slurm_conda.sh + $ sbatch --partition [YOUR_PARTITION] --account [YOUR_ACCOUNT] examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_conda.sh ``` ## For singularity users @@ -56,12 +56,12 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. ```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.5.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.5 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.6.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.6 ``` 8. Run a pipeline for the test sample. If your cluster requires to specify any of them then add one to the command line. ```bash - $ sbatch --partition [YOUR_PARTITION] --account [YOUR_ACCOUNT] examples/local/ENCSR936XTK_subsampled_slurm_singularity.sh + $ sbatch --partition [YOUR_PARTITION] --account [YOUR_ACCOUNT] examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_singularity.sh ``` ## For all users @@ -70,13 +70,15 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 9. See full specification for [input JSON file](input.md). +10. You can resume a failed pipeline from where it left off by using `PIPELINE_METADATA`(`metadata.json`) file. This file is created for each pipeline run. See [here](../utils/resumer/README.md) for details. Once you get a new input JSON file from the resumer, then edit your shell script (`examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_*.sh`) to use it `INPUT=resume.[FAILED_WORKFLOW_ID].json` instead of `INPUT=examples/...`. + ## For singularity users -10. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/slurm.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. +11. IF YOU WANT TO RUN PIPELINES WITH YOUR OWN INPUT DATA/GENOME DATABASE, PLEASE ADD THEIR DIRECTORIES TO `workflow_opts/slurm.json`. For example, you have input FASTQs on `/your/input/fastqs/` and genome database installed on `/your/genome/database/` then add `/your/` to `singularity_bindpath`. You can also define multiple directories there. It's comma-separated. ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." } } diff --git a/docs/tutorial_slurm_backend.md b/docs/tutorial_slurm_backend.md index f0be2ea4..a62a4d7c 100644 --- a/docs/tutorial_slurm_backend.md +++ b/docs/tutorial_slurm_backend.md @@ -68,7 +68,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt 7. Pull a singularity container for the pipeline. This will pull pipeline's docker container first and build a singularity one on `~/.singularity`. 
```bash - $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.5.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.5 + $ mkdir -p ~/.singularity && cd ~/.singularity && SINGULARITY_CACHEDIR=~/.singularity SINGULARITY_PULLFOLDER=~/.singularity singularity pull --name chip-seq-pipeline-v1.1.6.simg -F docker://quay.io/encode-dcc/chip-seq-pipeline:v1.1.6 ``` 8. Run a pipeline for the test sample. @@ -86,7 +86,7 @@ Our pipeline supports both [Conda](https://conda.io/docs/) and [Singularity](htt ```javascript { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/your/,YOUR_OWN_DATA_DIR1,YOUR_OWN_DATA_DIR2,..." } } diff --git a/examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json b/examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json index 12947996..fe39b590 100644 --- a/examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json +++ b/examples/dx/ENCSR000DYI_subsampled_chr19_only_dx.json @@ -1,6 +1,6 @@ { "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_chr19_only_dx.tsv", + "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_chr19_chrM_dx.tsv", "chip.fastqs" : [ [["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz"]], [["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.20.fastq.gz"]] diff --git a/examples/dx/ENCSR000DYI_subsampled_chr19_only_old_fraglen_dx.json b/examples/dx/ENCSR000DYI_subsampled_chr19_only_old_fraglen_dx.json index eec2527f..4d3bce6e 100644 --- a/examples/dx/ENCSR000DYI_subsampled_chr19_only_old_fraglen_dx.json +++ b/examples/dx/ENCSR000DYI_subsampled_chr19_only_old_fraglen_dx.json @@ -1,6 +1,6 @@ { "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_chr19_only_dx.tsv", + "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_chr19_chrM_dx.tsv", "chip.fastqs" : [ [["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz"]], [["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.20.fastq.gz"]] diff --git a/examples/dx/ENCSR000DYI_subsampled_dx.json b/examples/dx/ENCSR000DYI_subsampled_dx.json index 9a125752..f4af1231 100644 --- a/examples/dx/ENCSR000DYI_subsampled_dx.json +++ b/examples/dx/ENCSR000DYI_subsampled_dx.json @@ -1,6 +1,6 @@ { "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_chr19_only_dx.tsv", + "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_dx.tsv", "chip.fastqs" : [ [["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz"]], [["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.15.fastq.gz"]] diff --git a/examples/dx/ENCSR000DYI_subsampled_rep1_dx.json 
b/examples/dx/ENCSR000DYI_subsampled_rep1_dx.json new file mode 100644 index 00000000..0da8fb52 --- /dev/null +++ b/examples/dx/ENCSR000DYI_subsampled_rep1_dx.json @@ -0,0 +1,16 @@ +{ + "chip.pipeline_type" : "tf", + "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_dx.tsv", + "chip.fastqs" : [ + [["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz"]] + ], + "chip.ctl_fastqs" : [ + [["dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl1.subsampled.25.fastq.gz"]] + ], + + "chip.paired_end" : false, + "chip.always_use_pooled_ctl" : true, + + "chip.title" : "ENCSR000DYI (unreplicated, subsampled 1/25)", + "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" +} diff --git a/examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json b/examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json index c883e1f0..7550f8b3 100644 --- a/examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json +++ b/examples/dx/ENCSR936XTK_subsampled_chr19_only_dx.json @@ -1,6 +1,6 @@ { "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_chr19_only_dx.tsv", + "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_chr19_chrM_dx.tsv", "chip.fastqs_rep1_R1" : [ "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" diff --git a/examples/dx/ENCSR936XTK_subsampled_chr19_only_old_fraglen_dx.json b/examples/dx/ENCSR936XTK_subsampled_chr19_only_old_fraglen_dx.json index ec1f79a6..8c2a9c62 100644 --- a/examples/dx/ENCSR936XTK_subsampled_chr19_only_old_fraglen_dx.json +++ b/examples/dx/ENCSR936XTK_subsampled_chr19_only_old_fraglen_dx.json @@ -1,6 +1,6 @@ { "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_chr19_only_dx.tsv", + "chip.genome_tsv" : "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-genome-data/hg38_chr19_chrM_dx.tsv", "chip.fastqs_rep1_R1" : [ "dx://project-BKpvFg00VBPV975PgJ6Q03v6:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" diff --git a/examples/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json b/examples/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json new file mode 100644 index 00000000..d8bdabbb --- /dev/null +++ b/examples/dx_azure/ENCSR000DYI_subsampled_chr19_only_dx_azure.json @@ -0,0 +1,18 @@ +{ + "chip.pipeline_type" : "tf", + "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/hg38_chr19_chrM_dx_azure.tsv", + "chip.fastqs" : [ + [["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz"]], + [["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.15.fastq.gz"]] + ], + "chip.ctl_fastqs" : [ + [["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl1.subsampled.25.fastq.gz"]], + [["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/ctl2.subsampled.25.fastq.gz"]] + ], + + "chip.paired_end" : false, + "chip.always_use_pooled_ctl" : true, + + "chip.title" : "ENCSR000DYI 
(subsampled 1/25, chr19/chrM only)", + "chip.description" : "CEBPB ChIP-seq on human A549 produced by the Snyder lab" +} diff --git a/examples/dx_azure/ENCSR000DYI_subsampled_dx_azure.json b/examples/dx_azure/ENCSR000DYI_subsampled_dx_azure.json index c17f240f..195d0ab2 100644 --- a/examples/dx_azure/ENCSR000DYI_subsampled_dx_azure.json +++ b/examples/dx_azure/ENCSR000DYI_subsampled_dx_azure.json @@ -1,6 +1,6 @@ { "chip.pipeline_type" : "tf", - "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/hg38_chr19_only_dx_azure.tsv", + "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/hg38_dx_azure.tsv", "chip.fastqs" : [ [["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep1.subsampled.25.fastq.gz"]], [["dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/fastq_subsampled/rep2.subsampled.15.fastq.gz"]] diff --git a/examples/dx_azure/ENCSR936XTK_subsampled_chr19_only_dx_azure.json b/examples/dx_azure/ENCSR936XTK_subsampled_chr19_only_dx_azure.json new file mode 100644 index 00000000..46437f62 --- /dev/null +++ b/examples/dx_azure/ENCSR936XTK_subsampled_chr19_only_dx_azure.json @@ -0,0 +1,35 @@ +{ + "chip.pipeline_type" : "tf", + "chip.genome_tsv" : "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-genome-data/hg38_chr19_chrM_dx_azure.tsv", + + "chip.fastqs_rep1_R1" : [ + "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz" + ], + "chip.fastqs_rep1_R2" : [ + "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz" + ], + "chip.fastqs_rep2_R1" : [ + "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz" + ], + "chip.fastqs_rep2_R2" : [ + "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz" + ], + "chip.ctl_fastqs_rep1_R1" : [ + "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz" + ], + "chip.ctl_fastqs_rep1_R2" : [ + "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz" + ], + "chip.ctl_fastqs_rep2_R1" : [ + "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz" + ], + "chip.ctl_fastqs_rep2_R2" : [ + "dx://project-F6K911Q9xyfgJ36JFzv03Z5J:pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz" + ], + + "chip.paired_end" : true, + + "chip.always_use_pooled_ctl" : true, + "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", + "chip.description" : "ZNF143 ChIP-seq on human GM12878" +} diff --git a/examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json b/examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json index 8c23510b..a0e7e144 100644 --- a/examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json +++ b/examples/dx_azure/ENCSR936XTK_subsampled_dx_azure.json @@ -30,6 +30,6 @@ "chip.paired_end" : true, "chip.always_use_pooled_ctl" : true, - "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19 and chrM Only)", + "chip.title" : 
"ENCSR936XTK (subsampled 1/67)", "chip.description" : "ZNF143 ChIP-seq on human GM12878" } diff --git a/examples/google/ENCSR936XTK_subsampled_chr19_only.json b/examples/google/ENCSR936XTK_subsampled_chr19_only.json new file mode 120000 index 00000000..846d8d87 --- /dev/null +++ b/examples/google/ENCSR936XTK_subsampled_chr19_only.json @@ -0,0 +1 @@ +../../test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json \ No newline at end of file diff --git a/examples/local/ENCSR936XTK_subsampled_chr19_only.json b/examples/local/ENCSR936XTK_subsampled_chr19_only.json new file mode 100644 index 00000000..ea7e4fea --- /dev/null +++ b/examples/local/ENCSR936XTK_subsampled_chr19_only.json @@ -0,0 +1,51 @@ +{ + "chip.pipeline_type" : "tf", + "chip.genome_tsv" : "test_genome_database/hg38_chr19_chrM_local.tsv", + "chip.fastqs" : [ + [["test_sample/ENCSR936XTK/fastq_subsampled/rep1-R1.subsampled.67.fastq.gz", + "test_sample/ENCSR936XTK/fastq_subsampled/rep1-R2.subsampled.67.fastq.gz"]], + [["test_sample/ENCSR936XTK/fastq_subsampled/rep2-R1.subsampled.67.fastq.gz", + "test_sample/ENCSR936XTK/fastq_subsampled/rep2-R2.subsampled.67.fastq.gz"]] + ], + "chip.ctl_fastqs" : [ + [["test_sample/ENCSR936XTK/fastq_subsampled/ctl1-R1.subsampled.80.fastq.gz", + "test_sample/ENCSR936XTK/fastq_subsampled/ctl1-R2.subsampled.80.fastq.gz"]], + [["test_sample/ENCSR936XTK/fastq_subsampled/ctl2-R1.subsampled.80.fastq.gz", + "test_sample/ENCSR936XTK/fastq_subsampled/ctl2-R2.subsampled.80.fastq.gz"]] + ], + + "chip.paired_end" : true, + + "chip.always_use_pooled_ctl" : true, + "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19_chrM only)", + "chip.description" : "ZNF143 ChIP-seq on human GM12878", + + "chip.bwa_cpu" : 1, + "chip.bwa_mem_mb" : 4000, + "chip.bwa_time_hr" : 4, + + "chip.filter_cpu" : 1, + "chip.filter_mem_mb" : 4000, + "chip.filter_time_hr" : 4, + + "chip.bam2ta_cpu" : 1, + "chip.bam2ta_mem_mb" : 4000, + "chip.bam2ta_time_hr" : 4, + + "chip.spr_mem_mb" : 4000, + + "chip.fingerprint_cpu" : 1, + "chip.fingerprint_mem_mb" : 4000, + "chip.fingerprint_time_hr" : 6, + + "chip.xcor_cpu" : 1, + "chip.xcor_mem_mb" : 4000, + "chip.xcor_time_hr" : 4, + + "chip.macs2_mem_mb" : 4000, + "chip.macs2_time_hr" : 4, + + "chip.spp_cpu" : 1, + "chip.spp_mem_mb" : 4000, + "chip.spp_time_hr" : 4 +} diff --git a/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_conda.sh b/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_conda.sh new file mode 100644 index 00000000..f59acf2a --- /dev/null +++ b/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_conda.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# do not touch these settings +# number of tasks and nodes are fixed at 1 +#$ -S /bin/sh +#$ -terse +#$ -V + +# job name for pipeline +# this name will appear when you monitor jobs with "squeue -u $USER" +#$ -N ENCSR936XTK_subsampled_chr19_only + +# walltime for your job +# give long time enough to finish your pipeline +# <12 hr: small/test samples +# >24 hr: large samples +#$ -l h_rt=12:00:00 +#$ -l s_rt=12:00:00 + +# total amount of memory +# depends on the size of your FASTQs +# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples +# or <= NUM_CONCURRENT_TASK x 10GB for small samples +# do not request too much memory +# cluster will not accept your job +#$ -l h_vmem=20G +#$ -l s_vmem=20G + +# max number of cpus for each pipeline +# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file +# since bwa is a bottlenecking task in the pipeline +# "chip.bwa_cpu" is a number of cpus per replicate +# SGE has a parallel 
environment (PE). +# ask your admin to add a new PE named "shm" +# or use your cluster's own PE instead of "shm" +# 2 means number of cpus per pipeline +#$ -pe shm 2 + +# load java module if it exists +module load java || true + +# activate pipeline's Conda environment if Conda env exists +source activate encode-chip-seq-pipeline + +# use input JSON for a small test sample +# you make an input JSON for your own sample +# start from any of two templates for single-ended and paired-ended samples +# (examples/template_se.json, examples/template_pe.json) +# do not use an input JSON file for a test sample (ENCSR936XTK) +# it's a sample with multimapping reads +INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json + +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + +# limit number of concurrent tasks +# we recommend to use a number of replicates here +# so that all replicates are processed in parellel at the same time. +# make sure that resource settings in your input JSON file +# are consistent with SBATCH resource settings (--mem, --cpus-per-task) +# in this script +NUM_CONCURRENT_TASK=2 + +# run pipeline +# you can monitor your jobs with "squeue -u $USER" +java -jar -Dconfig.file=backends/backend.conf \ +-Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_singularity.sh b/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_singularity.sh new file mode 100644 index 00000000..6078933a --- /dev/null +++ b/examples/local/ENCSR936XTK_subsampled_chr19_only_sge_singularity.sh @@ -0,0 +1,66 @@ +#!/bin/bash + +# do not touch these settings +# number of tasks and nodes are fixed at 1 +#$ -S /bin/sh +#$ -terse +#$ -V + +# job name for pipeline +# this name will appear when you monitor jobs with "squeue -u $USER" +#$ -N ENCSR936XTK_subsampled_chr19_only + +# walltime for your job +# give long time enough to finish your pipeline +# <12 hr: small/test samples +# >24 hr: large samples +#$ -l h_rt=12:00:00 +#$ -l s_rt=12:00:00 + +# total amount of memory +# depends on the size of your FASTQs +# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples +# or <= NUM_CONCURRENT_TASK x 10GB for small samples +# do not request too much memory +# cluster will not accept your job +#$ -l h_vmem=20G +#$ -l s_vmem=20G + +# max number of cpus for each pipeline +# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file +# since bwa is a bottlenecking task in the pipeline +# "chip.bwa_cpu" is a number of cpus per replicate +# SGE has a parallel environment (PE). 
+# ask your admin to add a new PE named "shm" +# or use your cluster's own PE instead of "shm" +# 2 means number of cpus per pipeline +#$ -pe shm 2 + +# load java module if it exists +module load java || true + +# use input JSON for a small test sample +# you make an input JSON for your own sample +# start from any of two templates for single-ended and paired-ended samples +# (examples/template_se.json, examples/template_pe.json) +# do not use an input JSON file for a test sample (ENCSR936XTK) +# it's a sample with multimapping reads +INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json + +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + +# limit number of concurrent tasks +# we recommend to use a number of replicates here +# so that all replicates are processed in parellel at the same time. +# make sure that resource settings in your input JSON file +# are consistent with SBATCH resource settings (--mem, --cpus-per-task) +# in this script +NUM_CONCURRENT_TASK=2 + +# run pipeline +# you can monitor your jobs with "squeue -u $USER" +java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ +-Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_conda.sh b/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_conda.sh new file mode 100644 index 00000000..9993ea58 --- /dev/null +++ b/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_conda.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# do not touch these settings +# number of tasks and nodes are fixed at 1 +#SBATCH -n 1 +#SBATCH --ntasks-per-node=1 + +# job name for pipeline +# this name will appear when you monitor jobs with "squeue -u $USER" +#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only + +# walltime for your job +# give long time enough to finish your pipeline +# <12 hr: small/test samples +# >24 hr: large samples +#SBATCH --time=12:00:00 + +# total amount of memory +# depends on the size of your FASTQs +# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples +# or <= NUM_CONCURRENT_TASK x 10GB for small samples +# do not request too much memory +# cluster will not accept your job +#SBATCH --mem=20G + +# max number of cpus for each pipeline +# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file +# since bwa is a bottlenecking task in the pipeline +# "chip.bwa_cpu" is a number of cpus per replicate +#SBATCH --cpus-per-task=2 + +# email notification for job status +#SBATCH --mail-type=END,FAIL + +# load java module if it exists +module load java || true + +# activate pipeline's Conda environment if Conda env exists +source activate encode-chip-seq-pipeline + +# use input JSON for a small test sample +# you make an input JSON for your own sample +# start from any of two templates for single-ended and paired-ended samples +# (examples/template_se.json, examples/template_pe.json) +# do not use an input JSON file for a test sample (ENCSR936XTK) +# it's a sample with multimapping reads +INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json + +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + +# 
limit number of concurrent tasks +# we recommend to use a number of replicates here +# so that all replicates are processed in parellel at the same time. +# make sure that resource settings in your input JSON file +# are consistent with SBATCH resource settings (--mem, --cpus-per-task) +# in this script +NUM_CONCURRENT_TASK=2 + +# run pipeline +# you can monitor your jobs with "squeue -u $USER" +java -jar -Dconfig.file=backends/backend.conf \ +-Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_singularity.sh b/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_singularity.sh new file mode 100644 index 00000000..221a2b74 --- /dev/null +++ b/examples/local/ENCSR936XTK_subsampled_chr19_only_slurm_singularity.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# do not touch these settings +# number of tasks and nodes are fixed at 1 +#SBATCH -n 1 +#SBATCH --ntasks-per-node=1 + +# job name for pipeline +# this name will appear when you monitor jobs with "squeue -u $USER" +#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only + +# walltime for your job +# give long time enough to finish your pipeline +# <12 hr: small/test samples +# >24 hr: large samples +#SBATCH --time=12:00:00 + +# total amount of memory +# depends on the size of your FASTQs +# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples +# or <= NUM_CONCURRENT_TASK x 10GB for small samples +# do not request too much memory +# cluster will not accept your job +#SBATCH --mem=20G + +# max number of cpus for each pipeline +# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file +# since bwa is a bottlenecking task in the pipeline +# "chip.bwa_cpu" is a number of cpus per replicate +#SBATCH --cpus-per-task=2 + +# email notification for job status +#SBATCH --mail-type=END,FAIL + +# load java module if it exists +module load java || true + +# use input JSON for a small test sample +# you make an input JSON for your own sample +# start from any of two templates for single-ended and paired-ended samples +# (examples/template_se.json, examples/template_pe.json) +# do not use an input JSON file for a test sample (ENCSR936XTK) +# it's a sample with multimapping reads +INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json + +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + +# limit number of concurrent tasks +# we recommend to use a number of replicates here +# so that all replicates are processed in parellel at the same time. 
+# make sure that resource settings in your input JSON file +# are consistent with SBATCH resource settings (--mem, --cpus-per-task) +# in this script +NUM_CONCURRENT_TASK=2 + +# run pipeline +# you can monitor your jobs with "squeue -u $USER" +java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ +-Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/local/ENCSR936XTK_subsampled_sge_conda.sh b/examples/local/ENCSR936XTK_subsampled_sge_conda.sh index ef2e682b..90f146a4 100644 --- a/examples/local/ENCSR936XTK_subsampled_sge_conda.sh +++ b/examples/local/ENCSR936XTK_subsampled_sge_conda.sh @@ -50,6 +50,10 @@ source activate encode-chip-seq-pipeline # it's a sample with multimapping reads INPUT=examples/local/ENCSR936XTK_subsampled.json +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + # limit number of concurrent tasks # we recommend to use a number of replicates here # so that all replicates are processed in parellel at the same time. @@ -62,4 +66,4 @@ NUM_CONCURRENT_TASK=2 # you can monitor your jobs with "squeue -u $USER" java -jar -Dconfig.file=backends/backend.conf \ -Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} \ No newline at end of file +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/local/ENCSR936XTK_subsampled_sge_singularity.sh b/examples/local/ENCSR936XTK_subsampled_sge_singularity.sh index 15e7358b..e57163aa 100644 --- a/examples/local/ENCSR936XTK_subsampled_sge_singularity.sh +++ b/examples/local/ENCSR936XTK_subsampled_sge_singularity.sh @@ -47,6 +47,10 @@ module load java || true # it's a sample with multimapping reads INPUT=examples/local/ENCSR936XTK_subsampled.json +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + # limit number of concurrent tasks # we recommend to use a number of replicates here # so that all replicates are processed in parellel at the same time. 
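These launcher scripts differ only in their scheduler headers; the Cromwell call at the bottom follows one pattern, and the piece added in this change is the `-m` flag, which tells Cromwell to write the run's metadata JSON to the given path so that the resumer can read it after a failure. A condensed sketch of that shared pattern (local/Conda form; the singularity variants also pass `-o workflow_opts/<platform>.json` and select the `singularity` backend):

```bash
INPUT=examples/local/ENCSR936XTK_subsampled_chr19_only.json
PIPELINE_METADATA=metadata.json   # Cromwell writes workflow metadata here; utils/resumer reads it
NUM_CONCURRENT_TASK=2             # typically the number of replicates

java -jar -Dconfig.file=backends/backend.conf \
  -Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \
  $HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA}
```

Note that a rerun from the same directory overwrites `metadata.json`, so copy it aside first if you want to keep the record of the failed run.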
@@ -59,4 +63,4 @@ NUM_CONCURRENT_TASK=2 # you can monitor your jobs with "squeue -u $USER" java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ -Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json \ No newline at end of file +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/local/ENCSR936XTK_subsampled_slurm_conda.sh b/examples/local/ENCSR936XTK_subsampled_slurm_conda.sh index ec36fbe6..f44ca45a 100644 --- a/examples/local/ENCSR936XTK_subsampled_slurm_conda.sh +++ b/examples/local/ENCSR936XTK_subsampled_slurm_conda.sh @@ -46,6 +46,10 @@ source activate encode-chip-seq-pipeline # it's a sample with multimapping reads INPUT=examples/local/ENCSR936XTK_subsampled.json +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + # limit number of concurrent tasks # we recommend to use a number of replicates here # so that all replicates are processed in parellel at the same time. @@ -58,4 +62,4 @@ NUM_CONCURRENT_TASK=2 # you can monitor your jobs with "squeue -u $USER" java -jar -Dconfig.file=backends/backend.conf \ -Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} \ No newline at end of file +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/local/ENCSR936XTK_subsampled_slurm_singularity.sh b/examples/local/ENCSR936XTK_subsampled_slurm_singularity.sh index 141b3f9f..542544f1 100644 --- a/examples/local/ENCSR936XTK_subsampled_slurm_singularity.sh +++ b/examples/local/ENCSR936XTK_subsampled_slurm_singularity.sh @@ -43,6 +43,10 @@ module load java || true # it's a sample with multimapping reads INPUT=examples/local/ENCSR936XTK_subsampled.json +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + # limit number of concurrent tasks # we recommend to use a number of replicates here # so that all replicates are processed in parellel at the same time. 
@@ -55,4 +59,4 @@ NUM_CONCURRENT_TASK=2 # you can monitor your jobs with "squeue -u $USER" java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ -Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json \ No newline at end of file +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/singularity.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json b/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json new file mode 100644 index 00000000..62229e7f --- /dev/null +++ b/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json @@ -0,0 +1,51 @@ +{ + "chip.pipeline_type" : "tf", + "chip.genome_tsv" : "/reference/ENCODE/pipeline_genome_data/hg38_chr19_chrM_scg.tsv", + "chip.fastqs" : [ + [["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz", + "/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz"]], + [["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.subsampled.67.fastq.gz", + "/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.subsampled.67.fastq.gz"]] + ], + "chip.ctl_fastqs" : [ + [["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz", + "/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz"]], + [["/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.subsampled.80.fastq.gz", + "/reference/ENCODE/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.subsampled.80.fastq.gz"]] + ], + + "chip.paired_end" : true, + + "chip.always_use_pooled_ctl" : true, + "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19_chrM only)", + "chip.description" : "ZNF143 ChIP-seq on human GM12878", + + "chip.bwa_cpu" : 1, + "chip.bwa_mem_mb" : 4000, + "chip.bwa_time_hr" : 4, + + "chip.filter_cpu" : 1, + "chip.filter_mem_mb" : 4000, + "chip.filter_time_hr" : 4, + + "chip.bam2ta_cpu" : 1, + "chip.bam2ta_mem_mb" : 4000, + "chip.bam2ta_time_hr" : 4, + + "chip.spr_mem_mb" : 4000, + + "chip.fingerprint_cpu" : 1, + "chip.fingerprint_mem_mb" : 4000, + "chip.fingerprint_time_hr" : 6, + + "chip.xcor_cpu" : 1, + "chip.xcor_mem_mb" : 4000, + "chip.xcor_time_hr" : 4, + + "chip.macs2_mem_mb" : 4000, + "chip.macs2_time_hr" : 4, + + "chip.spp_cpu" : 1, + "chip.spp_mem_mb" : 4000, + "chip.spp_time_hr" : 4 +} diff --git a/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_conda.sh b/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_conda.sh new file mode 100644 index 00000000..2dca54a1 --- /dev/null +++ b/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_conda.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# do not touch these settings +# number of tasks and nodes are fixed at 1 +#SBATCH -n 1 +#SBATCH --ntasks-per-node=1 + +# job name for pipeline +# this name will appear when you monitor jobs with "squeue -u $USER" +#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only + +# walltime for your job +# give long time enough to finish your pipeline +# <12 hr: small/test samples +# >24 hr: large samples +#SBATCH --time=12:00:00 + +# total amount of memory +# depends on the size of your FASTQs +# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples 
+# or <= NUM_CONCURRENT_TASK x 10GB for small samples +# do not request too much memory +# cluster will not accept your job +#SBATCH --mem=20G + +# max number of cpus for each pipeline +# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file +# since bwa is a bottlenecking task in the pipeline +# "chip.bwa_cpu" is a number of cpus per replicate +#SBATCH --cpus-per-task=2 + +# email notification for job status +#SBATCH --mail-type=END,FAIL + +# load java module if it exists +module load java + +# activate pipeline's Conda environment if Conda env exists +source activate encode-chip-seq-pipeline + +# use input JSON for a small test sample +# you make an input JSON for your own sample +# start from any of two templates for single-ended and paired-ended samples +# (examples/template_se.json, examples/template_pe.json) +# do not use an input JSON file for a test sample (ENCSR936XTK) +# it's a sample with multimapping reads +INPUT=examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json + +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + +# limit number of concurrent tasks +# we recommend to use a number of replicates here +# so that all replicates are processed in parellel at the same time. +# make sure that resource settings in your input JSON file +# are consistent with SBATCH resource settings (--mem, --cpus-per-task) +# in this script +NUM_CONCURRENT_TASK=2 + +# run pipeline +# you can monitor your jobs with "squeue -u $USER" +java -jar -Dconfig.file=backends/backend.conf \ +-Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_singularity.sh b/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_singularity.sh new file mode 100644 index 00000000..bb0775b5 --- /dev/null +++ b/examples/scg/ENCSR936XTK_subsampled_chr19_only_scg_singularity.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# do not touch these settings +# number of tasks and nodes are fixed at 1 +#SBATCH -n 1 +#SBATCH --ntasks-per-node=1 + +# job name for pipeline +# this name will appear when you monitor jobs with "squeue -u $USER" +#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only + +# walltime for your job +# give long time enough to finish your pipeline +# <12 hr: small/test samples +# >24 hr: large samples +#SBATCH --time=12:00:00 + +# total amount of memory +# depends on the size of your FASTQs +# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples +# or <= NUM_CONCURRENT_TASK x 10GB for small samples +# do not request too much memory +# cluster will not accept your job +#SBATCH --mem=20G + +# max number of cpus for each pipeline +# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file +# since bwa is a bottlenecking task in the pipeline +# "chip.bwa_cpu" is a number of cpus per replicate +#SBATCH --cpus-per-task=2 + +# email notification for job status +#SBATCH --mail-type=END,FAIL + +# load java module if it exists +module load java + +# use input JSON for a small test sample +# you make an input JSON for your own sample +# start from any of two templates for single-ended and paired-ended samples +# (examples/template_se.json, examples/template_pe.json) +# do not use an input JSON file for a test sample (ENCSR936XTK) +# it's a sample with multimapping reads 
+INPUT=examples/scg/ENCSR936XTK_subsampled_chr19_only_scg.json + +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + +# limit number of concurrent tasks +# we recommend to use a number of replicates here +# so that all replicates are processed in parellel at the same time. +# make sure that resource settings in your input JSON file +# are consistent with SBATCH resource settings (--mem, --cpus-per-task) +# in this script +NUM_CONCURRENT_TASK=2 + +# run pipeline +# you can monitor your jobs with "squeue -u $USER" +java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ +-Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/scg.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/scg/ENCSR936XTK_subsampled_scg_conda.sh b/examples/scg/ENCSR936XTK_subsampled_scg_conda.sh index 82d8b18c..bbea348c 100644 --- a/examples/scg/ENCSR936XTK_subsampled_scg_conda.sh +++ b/examples/scg/ENCSR936XTK_subsampled_scg_conda.sh @@ -46,6 +46,10 @@ source activate encode-chip-seq-pipeline # it's a sample with multimapping reads INPUT=examples/scg/ENCSR936XTK_subsampled_scg.json +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + # limit number of concurrent tasks # we recommend to use a number of replicates here # so that all replicates are processed in parellel at the same time. @@ -58,4 +62,4 @@ NUM_CONCURRENT_TASK=2 # you can monitor your jobs with "squeue -u $USER" java -jar -Dconfig.file=backends/backend.conf \ -Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} \ No newline at end of file +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/scg/ENCSR936XTK_subsampled_scg_singularity.sh b/examples/scg/ENCSR936XTK_subsampled_scg_singularity.sh index b2c51ea4..09b1dce3 100644 --- a/examples/scg/ENCSR936XTK_subsampled_scg_singularity.sh +++ b/examples/scg/ENCSR936XTK_subsampled_scg_singularity.sh @@ -43,6 +43,10 @@ module load java # it's a sample with multimapping reads INPUT=examples/scg/ENCSR936XTK_subsampled_scg.json +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + # limit number of concurrent tasks # we recommend to use a number of replicates here # so that all replicates are processed in parellel at the same time. 
@@ -55,4 +59,4 @@ NUM_CONCURRENT_TASK=2 # you can monitor your jobs with "squeue -u $USER" java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ -Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/scg.json \ No newline at end of file +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/scg.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json b/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json new file mode 100644 index 00000000..a91bfb45 --- /dev/null +++ b/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json @@ -0,0 +1,51 @@ +{ + "chip.pipeline_type" : "tf", + "chip.genome_tsv" : "/home/groups/cherry/encode/pipeline_genome_data/hg38_chr19_chrM_sherlock.tsv", + "chip.fastqs" : [ + [["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R1.subsampled.67.fastq.gz", + "/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep1-R2.subsampled.67.fastq.gz"]], + [["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R1.subsampled.67.fastq.gz", + "/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/rep2-R2.subsampled.67.fastq.gz"]] + ], + "chip.ctl_fastqs" : [ + [["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R1.subsampled.80.fastq.gz", + "/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl1-R2.subsampled.80.fastq.gz"]], + [["/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R1.subsampled.80.fastq.gz", + "/home/groups/cherry/encode/pipeline_test_samples/encode-chip-seq-pipeline/ENCSR936XTK/fastq/ctl2-R2.subsampled.80.fastq.gz"]] + ], + + "chip.paired_end" : true, + + "chip.always_use_pooled_ctl" : true, + "chip.title" : "ENCSR936XTK (subsampled 1/67, chr19_chrM only)", + "chip.description" : "ZNF143 ChIP-seq on human GM12878", + + "chip.bwa_cpu" : 1, + "chip.bwa_mem_mb" : 4000, + "chip.bwa_time_hr" : 4, + + "chip.filter_cpu" : 1, + "chip.filter_mem_mb" : 4000, + "chip.filter_time_hr" : 4, + + "chip.bam2ta_cpu" : 1, + "chip.bam2ta_mem_mb" : 4000, + "chip.bam2ta_time_hr" : 4, + + "chip.spr_mem_mb" : 4000, + + "chip.fingerprint_cpu" : 1, + "chip.fingerprint_mem_mb" : 4000, + "chip.fingerprint_time_hr" : 6, + + "chip.xcor_cpu" : 1, + "chip.xcor_mem_mb" : 4000, + "chip.xcor_time_hr" : 4, + + "chip.macs2_mem_mb" : 4000, + "chip.macs2_time_hr" : 4, + + "chip.spp_cpu" : 1, + "chip.spp_mem_mb" : 4000, + "chip.spp_time_hr" : 4 +} diff --git a/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_conda.sh b/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_conda.sh new file mode 100644 index 00000000..916cd3c9 --- /dev/null +++ b/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_conda.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +# do not touch these settings +# number of tasks and nodes are fixed at 1 +#SBATCH -n 1 +#SBATCH --ntasks-per-node=1 + +# job name for pipeline +# this name will appear when you monitor jobs with "squeue -u $USER" +#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only + +# walltime for your job +# give long time enough to finish your pipeline +# <12 hr: small/test samples +# >24 hr: large samples +#SBATCH 
--time=24:00:00 + +# total amount of memory +# depends on the size of your FASTQs +# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples +# or <= NUM_CONCURRENT_TASK x 10GB for small samples +# do not request too much memory +# cluster will not accept your job +#SBATCH --mem=20G + +# max number of cpus for each pipeline +# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file +# since bwa is a bottlenecking task in the pipeline +# "chip.bwa_cpu" is a number of cpus per replicate +#SBATCH --cpus-per-task=2 + +# email notification for job status +#SBATCH --mail-type=END,FAIL + +# load java module if it exists +module load java + +# activate pipeline's Conda environment if Conda env exists +source activate encode-chip-seq-pipeline + +# use input JSON for a small test sample +# you make an input JSON for your own sample +# start from any of two templates for single-ended and paired-ended samples +# (examples/template_se.json, examples/template_pe.json) +# do not use an input JSON file for a test sample (ENCSR936XTK) +# it's a sample with multimapping reads +INPUT=examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json + +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + +# limit number of concurrent tasks +# we recommend to use a number of replicates here +# so that all replicates are processed in parellel at the same time. +# make sure that resource settings in your input JSON file +# are consistent with SBATCH resource settings (--mem, --cpus-per-task) +# in this script +NUM_CONCURRENT_TASK=2 + +# run pipeline +# you can monitor your jobs with "squeue -u $USER" +java -jar -Dconfig.file=backends/backend.conf \ +-Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_singularity.sh b/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_singularity.sh new file mode 100644 index 00000000..decb9da1 --- /dev/null +++ b/examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock_singularity.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# do not touch these settings +# number of tasks and nodes are fixed at 1 +#SBATCH -n 1 +#SBATCH --ntasks-per-node=1 + +# job name for pipeline +# this name will appear when you monitor jobs with "squeue -u $USER" +#SBATCH --job-name=ENCSR936XTK_subsampled_chr19_only + +# walltime for your job +# give long time enough to finish your pipeline +# <12 hr: small/test samples +# >24 hr: large samples +#SBATCH --time=24:00:00 + +# total amount of memory +# depends on the size of your FASTQs +# but should be <= NUM_CONCURRENT_TASK x 20GB for big samples +# or <= NUM_CONCURRENT_TASK x 10GB for small samples +# do not request too much memory +# cluster will not accept your job +#SBATCH --mem=20G + +# max number of cpus for each pipeline +# should be <= NUM_CONCURRENT_TASK x "chip.bwa_cpu" in input JSON file +# since bwa is a bottlenecking task in the pipeline +# "chip.bwa_cpu" is a number of cpus per replicate +#SBATCH --cpus-per-task=2 + +# email notification for job status +#SBATCH --mail-type=END,FAIL + +# load java module if it exists +module load java + +# use input JSON for a small test sample +# you make an input JSON for your own sample +# start from any of two templates for single-ended and paired-ended samples 
+# (examples/template_se.json, examples/template_pe.json) +# do not use an input JSON file for a test sample (ENCSR936XTK) +# it's a sample with multimapping reads +INPUT=examples/sherlock/ENCSR936XTK_subsampled_chr19_only_sherlock.json + +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + +# limit number of concurrent tasks +# we recommend to use a number of replicates here +# so that all replicates are processed in parellel at the same time. +# make sure that resource settings in your input JSON file +# are consistent with SBATCH resource settings (--mem, --cpus-per-task) +# in this script +NUM_CONCURRENT_TASK=2 + +# run pipeline +# you can monitor your jobs with "squeue -u $USER" +java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ +-Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/sherlock.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/sherlock/ENCSR936XTK_subsampled_sherlock_conda.sh b/examples/sherlock/ENCSR936XTK_subsampled_sherlock_conda.sh index e327f7bc..6b68c1eb 100644 --- a/examples/sherlock/ENCSR936XTK_subsampled_sherlock_conda.sh +++ b/examples/sherlock/ENCSR936XTK_subsampled_sherlock_conda.sh @@ -13,7 +13,7 @@ # give long time enough to finish your pipeline # <12 hr: small/test samples # >24 hr: large samples -#SBATCH --time=12:00:00 +#SBATCH --time=24:00:00 # total amount of memory # depends on the size of your FASTQs @@ -46,6 +46,10 @@ source activate encode-chip-seq-pipeline # it's a sample with multimapping reads INPUT=examples/sherlock/ENCSR936XTK_subsampled_sherlock.json +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + # limit number of concurrent tasks # we recommend to use a number of replicates here # so that all replicates are processed in parellel at the same time. 
@@ -58,4 +62,4 @@ NUM_CONCURRENT_TASK=2 # you can monitor your jobs with "squeue -u $USER" java -jar -Dconfig.file=backends/backend.conf \ -Dbackend.providers.Local.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} \ No newline at end of file +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/sherlock/ENCSR936XTK_subsampled_sherlock_singularity.sh b/examples/sherlock/ENCSR936XTK_subsampled_sherlock_singularity.sh index 3fdfceaa..c087c9eb 100644 --- a/examples/sherlock/ENCSR936XTK_subsampled_sherlock_singularity.sh +++ b/examples/sherlock/ENCSR936XTK_subsampled_sherlock_singularity.sh @@ -13,7 +13,7 @@ # give long time enough to finish your pipeline # <12 hr: small/test samples # >24 hr: large samples -#SBATCH --time=12:00:00 +#SBATCH --time=24:00:00 # total amount of memory # depends on the size of your FASTQs @@ -43,6 +43,10 @@ module load java # it's a sample with multimapping reads INPUT=examples/sherlock/ENCSR936XTK_subsampled_sherlock.json +# If this pipeline fails, then use this metadata JSON file to resume a failed pipeline from where it left +# See details in /utils/resumer/README.md +PIPELINE_METADATA=metadata.json + # limit number of concurrent tasks # we recommend to use a number of replicates here # so that all replicates are processed in parellel at the same time. @@ -55,4 +59,4 @@ NUM_CONCURRENT_TASK=2 # you can monitor your jobs with "squeue -u $USER" java -jar -Dconfig.file=backends/backend.conf -Dbackend.default=singularity \ -Dbackend.providers.singularity.config.concurrent-job-limit=${NUM_CONCURRENT_TASK} \ -$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/sherlock.json \ No newline at end of file +$HOME/cromwell-34.jar run chip.wdl -i ${INPUT} -o workflow_opts/sherlock.json -m ${PIPELINE_METADATA} \ No newline at end of file diff --git a/examples/template_pe.json b/examples/template_pe.json index aa093740..51916828 100644 --- a/examples/template_pe.json +++ b/examples/template_pe.json @@ -39,8 +39,8 @@ "ctl2.tagAlign.gz" ], - "chip.title" : "Example (single-ended)", - "chip.description" : "This is an template input JSON for single-ended sample.", + "chip.title" : "Example (paired-end)", + "chip.description" : "This is an template input JSON for paired-end sample.", "chip.pipeline_type" : "tf", "chip.peak_caller" : "spp", @@ -57,6 +57,7 @@ "chip.mapq_thresh" : 30, "chip.no_dup_removal" : false, + "chip.mito_chr_name" : "chrM", "chip.regex_filter_reads" : "chrM", "chip.subsample_reads" : 0, "chip.ctl_subsample_reads" : 0, @@ -106,5 +107,5 @@ "chip.spp_cpu" : 2, "chip.spp_mem_mb" : 16000, "chip.spp_time_hr" : 72, - "chip.spp_disks" : "local-disk 100 HDD", + "chip.spp_disks" : "local-disk 100 HDD" } \ No newline at end of file diff --git a/examples/template_se.json b/examples/template_se.json index 2653e044..3b70f26a 100644 --- a/examples/template_se.json +++ b/examples/template_se.json @@ -53,6 +53,7 @@ "chip.mapq_thresh" : 30, "chip.no_dup_removal" : false, + "chip.mito_chr_name" : "chrM", "chip.regex_filter_reads" : "chrM", "chip.subsample_reads" : 0, "chip.ctl_subsample_reads" : 0, @@ -102,5 +103,5 @@ "chip.spp_cpu" : 2, "chip.spp_mem_mb" : 16000, "chip.spp_time_hr" : 72, - "chip.spp_disks" : "local-disk 100 HDD", + "chip.spp_disks" : "local-disk 100 HDD" } \ No newline at end of file diff --git a/genome/dx/hg38_chr19_only_dx.tsv b/genome/dx/hg38_chr19_chrM_dx.tsv similarity index 100% rename from genome/dx/hg38_chr19_only_dx.tsv 
rename to genome/dx/hg38_chr19_chrM_dx.tsv diff --git a/genome/dx/mm10_chr19_only_dx.tsv b/genome/dx/mm10_chr19_chrM_dx.tsv similarity index 100% rename from genome/dx/mm10_chr19_only_dx.tsv rename to genome/dx/mm10_chr19_chrM_dx.tsv diff --git a/genome/dx_azure/hg38_chr19_only_dx_azure.tsv b/genome/dx_azure/hg38_chr19_chrM_dx_azure.tsv similarity index 100% rename from genome/dx_azure/hg38_chr19_only_dx_azure.tsv rename to genome/dx_azure/hg38_chr19_chrM_dx_azure.tsv diff --git a/genome/dx_azure/mm10_chr19_only_dx_azure.tsv b/genome/dx_azure/mm10_chr19_chrM_dx_azure.tsv similarity index 100% rename from genome/dx_azure/mm10_chr19_only_dx_azure.tsv rename to genome/dx_azure/mm10_chr19_chrM_dx_azure.tsv diff --git a/src/encode_bam2ta.py b/src/encode_bam2ta.py index 00547a91..26d028ec 100755 --- a/src/encode_bam2ta.py +++ b/src/encode_bam2ta.py @@ -15,6 +15,8 @@ def parse_arguments(): help='Path for BAM file.') parser.add_argument('--disable-tn5-shift', action="store_true", help='Disable TN5 shifting for DNase-Seq.') + parser.add_argument('--mito-chr-name', default='chrM', + help='Mito chromosome name.') parser.add_argument('--regex-grep-v-ta', default='chrM', help='Perl-style regular expression pattern \ to remove matching reads from TAGALIGN.') @@ -45,7 +47,7 @@ def bam2ta_se(bam, regex_grep_v_ta, out_dir): cmd = 'bedtools bamtobed -i {} | ' cmd += 'awk \'BEGIN{{OFS="\\t"}}{{$4="N";$5="1000";print $0}}\' | ' if regex_grep_v_ta: - cmd += 'grep -P -v \'{}\' | '.format(regex_grep_v_ta) + cmd += 'grep -P -v \'^{}\\b\' | '.format(regex_grep_v_ta) cmd += 'gzip -nc > {}' cmd = cmd.format( bam, @@ -77,7 +79,7 @@ def bam2ta_pe(bam, regex_grep_v_ta, nth, out_dir): cmd2 += '%s\\t%s\\t%s\\tN\\t1000\\t%s\\n",' cmd2 += '$1,$2,$3,$9,$4,$5,$6,$10}}\' | ' if regex_grep_v_ta: - cmd2 += 'grep -P -v \'{}\' | '.format(regex_grep_v_ta) + cmd2 += 'grep -P -v \'^{}\\b\' | '.format(regex_grep_v_ta) cmd2 += 'gzip -nc > {}' cmd2 = cmd2.format( bedpe, @@ -124,10 +126,10 @@ def main(): log.info('Subsampling TAGALIGN...') if args.paired_end: subsampled_ta = subsample_ta_pe( - ta, args.subsample, False, False, args.out_dir) + ta, args.subsample, False, args.mito_chr_name, False, args.out_dir) else: subsampled_ta = subsample_ta_se( - ta, args.subsample, False, args.out_dir) + ta, args.subsample, False, args.mito_chr_name, args.out_dir) temp_files.append(ta) else: subsampled_ta = ta diff --git a/src/encode_common_genomic.py b/src/encode_common_genomic.py index 9f08a057..bdf23713 100755 --- a/src/encode_common_genomic.py +++ b/src/encode_common_genomic.py @@ -129,19 +129,20 @@ def locate_picard(): raise Exception(msg) -def subsample_ta_se(ta, subsample, non_mito, out_dir): +def subsample_ta_se(ta, subsample, non_mito, mito_chr_name, out_dir): prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta))) - ta_subsampled = '{}.{}{}.tagAlign.gz'.format( + ta_subsampled = '{}.{}{}tagAlign.gz'.format( prefix, - 'no_chrM' if non_mito else '', - '.{}'.format(human_readable_number(subsample)) if subsample>0 else '' + 'no_chrM.' 
if non_mito else '', + '{}.'.format(human_readable_number(subsample)) if subsample>0 else '' ) # use bash cmd = 'bash -c "zcat -f {} | ' if non_mito: - cmd += 'grep -v chrM | ' + # cmd += 'awk \'{{if ($1!="'+mito_chr_name+'") print $0}}\' | ' + cmd += 'grep -v \'^'+mito_chr_name+'\\b\' | ' if subsample>0: cmd += 'shuf -n {} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt /dev/null) | ' cmd += 'gzip -nc > {}"' @@ -159,20 +160,21 @@ def subsample_ta_se(ta, subsample, non_mito, out_dir): run_shell_cmd(cmd) return ta_subsampled -def subsample_ta_pe(ta, subsample, non_mito, r1_only, out_dir): +def subsample_ta_pe(ta, subsample, non_mito, mito_chr_name, r1_only, out_dir): prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta))) - ta_subsampled = '{}.{}{}{}.tagAlign.gz'.format( + ta_subsampled = '{}.{}{}{}tagAlign.gz'.format( prefix, 'no_chrM.' if non_mito else '', - 'R1' if r1_only else '', - '.{}'.format(human_readable_number(subsample)) if subsample>0 else '' + 'R1.' if r1_only else '', + '{}.'.format(human_readable_number(subsample)) if subsample>0 else '' ) ta_tmp = '{}.tagAlign.tmp'.format(prefix) cmd0 = 'bash -c "zcat -f {} | ' if non_mito: - cmd0 += 'grep -v chrM | ' + # cmd0 += 'awk \'{{if ($1!="'+mito_chr_name+'") print $0}}\' | ' + cmd0 += 'grep -v \'^'+mito_chr_name+'\\b\' | ' cmd0 += 'sed \'N;s/\\n/\\t/\' ' if subsample>0: cmd0 += '| shuf -n {} --random-source=<(openssl enc -aes-256-ctr -pass pass:$(zcat -f {} | wc -c) -nosalt /dev/null) > {}"' @@ -344,7 +346,7 @@ def peak_to_bigbed(peak, peak_type, chrsz, keep_irregular_chr, out_dir): with open(as_file,'w') as fp: fp.write(as_file_contents) if not keep_irregular_chr: - cmd1 = "cat {} | grep -P 'chr[\dXY]+[ \\t]' > {}".format(chrsz, chrsz_tmp) + cmd1 = "cat {} | grep -P 'chr[\\dXY]+\\b' > {}".format(chrsz, chrsz_tmp) else: cmd1 = "cat {} > {}".format(chrsz, chrsz_tmp) run_shell_cmd(cmd1) diff --git a/src/encode_filter.py b/src/encode_filter.py index c2aaf8ec..993f540e 100755 --- a/src/encode_filter.py +++ b/src/encode_filter.py @@ -25,6 +25,8 @@ def parse_arguments(): help='Paired-end BAM.') parser.add_argument('--multimapping', default=0, type=int, help='Multimapping reads.') + parser.add_argument('--mito-chr-name', default='chrM', + help='Mito chromosome name.') parser.add_argument('--nth', type=int, default=1, help='Number of threads to parallelize.') parser.add_argument('--out-dir', default='', type=str, @@ -228,7 +230,7 @@ def rm_dup_pe(dupmark_bam, nth, out_dir): run_shell_cmd(cmd1) return nodup_bam -def pbc_qc_se(bam, out_dir): +def pbc_qc_se(bam, mito_chr_name, out_dir): prefix = os.path.join(out_dir, os.path.basename(strip_ext_bam(bam))) # strip extension appended in the previous step @@ -237,7 +239,7 @@ def pbc_qc_se(bam, out_dir): cmd2 = 'bedtools bamtobed -i {} | ' cmd2 += 'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$3,$6}}\' | ' - cmd2 += 'grep -v "chrM" | sort | uniq -c | ' + cmd2 += 'grep -v "^{}\\b" | sort | uniq -c | ' cmd2 += 'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ' cmd2 += '($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; ' cmd2 += 'if(m2>0) m1_m2=m1/m2; m0_mt=0; if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; ' @@ -245,11 +247,12 @@ def pbc_qc_se(bam, out_dir): cmd2 += 'mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {}' cmd2 = cmd2.format( bam, + mito_chr_name, pbc_qc) run_shell_cmd(cmd2) return pbc_qc -def pbc_qc_pe(bam, nth, out_dir): +def pbc_qc_pe(bam, mito_chr_name, nth, out_dir): prefix = os.path.join(out_dir, 
os.path.basename(strip_ext_bam(bam))) pbc_qc = '{}.pbc.qc'.format(prefix) @@ -258,7 +261,7 @@ def pbc_qc_pe(bam, nth, out_dir): nmsrt_bam = sambamba_name_sort(bam, nth, out_dir) cmd3 = 'bedtools bamtobed -bedpe -i {} | ' cmd3 += 'awk \'BEGIN{{OFS="\\t"}}{{print $1,$2,$4,$6,$9,$10}}\' | ' - cmd3 += 'grep -v "chrM" | sort | uniq -c | ' + cmd3 += 'grep -v "^{}\\b" | sort | uniq -c | ' cmd3 += 'awk \'BEGIN{{mt=0;m0=0;m1=0;m2=0}} ($1==1){{m1=m1+1}} ' cmd3 += '($1==2){{m2=m2+1}} {{m0=m0+1}} {{mt=mt+$1}} END{{m1_m2=-1.0; ' cmd3 += 'if(m2>0) m1_m2=m1/m2; m0_mt=0; if (mt>0) m0_mt=m0/mt; m1_m0=0; if (m0>0) m1_m0=m1/m0; ' @@ -266,6 +269,7 @@ def pbc_qc_pe(bam, nth, out_dir): cmd3 += ',mt,m0,m1,m2,m0_mt,m1_m0,m1_m2}}\' > {}' cmd3 = cmd3.format( nmsrt_bam, + mito_chr_name, pbc_qc) run_shell_cmd(cmd3) rm_f(nmsrt_bam) @@ -362,12 +366,12 @@ def main(): if not args.no_dup_removal: if args.paired_end: ret_val_3 = pool.apply_async(pbc_qc_pe, - (dupmark_bam, + (dupmark_bam, args.mito_chr_name, max(1,args.nth-2), args.out_dir)) else: ret_val_3 = pool.apply_async(pbc_qc_se, - (dupmark_bam, args.out_dir)) + (dupmark_bam, args.mito_chr_name, args.out_dir)) # gather nodup_bai = ret_val_1.get(BIG_INT) diff --git a/src/encode_trim_fastq.py b/src/encode_trim_fastq.py index 1cfbc383..5c48bc69 100755 --- a/src/encode_trim_fastq.py +++ b/src/encode_trim_fastq.py @@ -35,6 +35,13 @@ def trim_fastq(fastq, trim_bp, out_dir): cmd = 'python $(which trimfastq.py) {} {} | gzip -nc > {}'.format( fastq, trim_bp, trimmed) run_shell_cmd(cmd) + + # if shorter than trim_bp + cmd2 = 'zcat -f {} | grep \'sequences shorter than desired length\' | wc -l'.format( + trimmed) + if int(run_shell_cmd(cmd2))>0: + copy_f_to_f(fastq, trimmed) + return trimmed def main(): diff --git a/src/encode_xcor.py b/src/encode_xcor.py index f115cc0c..7f98adc8 100755 --- a/src/encode_xcor.py +++ b/src/encode_xcor.py @@ -14,6 +14,8 @@ def parse_arguments(): description='') parser.add_argument('ta', type=str, help='Path for TAGALIGN file.') + parser.add_argument('--mito-chr-name', default='chrM', + help='Mito chromosome name.') parser.add_argument('--subsample', type=int, default=0, help='Subsample TAGALIGN.') parser.add_argument('--speak', type=int, default=-1, @@ -35,7 +37,7 @@ def parse_arguments(): log.info(sys.argv) return args -def xcor(ta, speak, nth, out_dir): +def xcor(ta, speak, mito_chr_name, nth, out_dir): prefix = os.path.join(out_dir, os.path.basename(strip_ext_ta(ta))) xcor_plot_pdf = '{}.cc.plot.pdf'.format(prefix) @@ -43,10 +45,11 @@ def xcor(ta, speak, nth, out_dir): fraglen_txt = '{}.cc.fraglen.txt'.format(prefix) cmd1 = 'Rscript --max-ppsize=500000 $(which run_spp.R) -rf -c={} -p={} ' - cmd1 += '-filtchr=chrM -savp={} -out={} {}' + cmd1 += '-filtchr="{}" -savp={} -out={} {}' cmd1 = cmd1.format( ta, nth, + mito_chr_name, xcor_plot_pdf, xcor_score, '-speak={}'.format(speak) if speak>=0 else '') @@ -77,15 +80,15 @@ def main(): log.info('Subsampling TAGALIGN for xcor...') if args.paired_end: ta_subsampled = subsample_ta_pe( - args.ta, args.subsample, True, True, args.out_dir) + args.ta, args.subsample, True, args.mito_chr_name, True, args.out_dir) else: ta_subsampled = subsample_ta_se( - args.ta, args.subsample, True, args.out_dir) + args.ta, args.subsample, True, args.mito_chr_name, args.out_dir) temp_files.append(ta_subsampled) log.info('Cross-correlation analysis...') xcor_plot_pdf, xcor_plot_png, xcor_score, fraglen_txt = xcor( - ta_subsampled, args.speak, args.nth, args.out_dir) + ta_subsampled, args.speak, args.mito_chr_name, args.nth, 
args.out_dir) log.info('Removing temporary files...') rm_f(temp_files) diff --git a/test/test_task/test.sh b/test/test_task/test.sh index 292c65a6..4c9124a7 100755 --- a/test/test_task/test.sh +++ b/test/test_task/test.sh @@ -12,7 +12,7 @@ INPUT=$2 if [ $# -gt 2 ]; then DOCKER_IMAGE=$3 else - DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.1.5 + DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.1.6 fi if [ $# -gt 3 ]; then NUM_TASK=$4 diff --git a/test/test_task/test_bam2ta.wdl b/test/test_task/test_bam2ta.wdl index 93c179ea..06209c83 100644 --- a/test/test_task/test_bam2ta.wdl +++ b/test/test_task/test_bam2ta.wdl @@ -13,6 +13,7 @@ workflow test_bam2ta { String ref_pe_ta_subsample String ref_se_ta String ref_se_ta_subsample + String mito_chr_name = 'chrM' Int bam2ta_cpu = 1 Int bam2ta_mem_mb = 10000 @@ -24,6 +25,7 @@ workflow test_bam2ta { subsample = 0, regex_grep_v_ta = regex_filter_reads, paired_end = true, + mito_chr_name = mito_chr_name, cpu = bam2ta_cpu, mem_mb = bam2ta_mem_mb, @@ -35,6 +37,7 @@ workflow test_bam2ta { subsample = bam2ta_subsample, regex_grep_v_ta = regex_filter_reads, paired_end = true, + mito_chr_name = mito_chr_name, cpu = bam2ta_cpu, mem_mb = bam2ta_mem_mb, @@ -46,6 +49,7 @@ workflow test_bam2ta { subsample = 0, regex_grep_v_ta = regex_filter_reads, paired_end = false, + mito_chr_name = mito_chr_name, cpu = bam2ta_cpu, mem_mb = bam2ta_mem_mb, @@ -57,6 +61,7 @@ workflow test_bam2ta { subsample = bam2ta_subsample, regex_grep_v_ta = regex_filter_reads, paired_end = false, + mito_chr_name = mito_chr_name, cpu = bam2ta_cpu, mem_mb = bam2ta_mem_mb, diff --git a/test/test_task/test_filter.wdl b/test/test_task/test_filter.wdl index 6e4b6ca5..3c8e5a96 100644 --- a/test/test_task/test_filter.wdl +++ b/test/test_task/test_filter.wdl @@ -17,6 +17,7 @@ workflow test_filter { String ref_pe_filt_bam String ref_se_nodup_bam String ref_se_filt_bam + String mito_chr_name = 'chrM' Int filter_cpu = 1 Int filter_mem_mb = 20000 @@ -29,6 +30,7 @@ workflow test_filter { paired_end = true, dup_marker = dup_marker, mapq_thresh = mapq_thresh, + mito_chr_name = mito_chr_name, cpu = filter_cpu, mem_mb = filter_mem_mb, @@ -41,6 +43,7 @@ workflow test_filter { paired_end = true, dup_marker = dup_marker, mapq_thresh = mapq_thresh, + mito_chr_name = mito_chr_name, cpu = filter_cpu, mem_mb = filter_mem_mb, @@ -53,6 +56,7 @@ workflow test_filter { paired_end = false, dup_marker = dup_marker, mapq_thresh = mapq_thresh, + mito_chr_name = mito_chr_name, cpu = filter_cpu, mem_mb = filter_mem_mb, @@ -65,6 +69,7 @@ workflow test_filter { paired_end = false, dup_marker = dup_marker, mapq_thresh = mapq_thresh, + mito_chr_name = mito_chr_name, cpu = filter_cpu, mem_mb = filter_mem_mb, diff --git a/test/test_task/test_xcor.wdl b/test/test_task/test_xcor.wdl index 42befe92..2f570f0e 100644 --- a/test/test_task/test_xcor.wdl +++ b/test/test_task/test_xcor.wdl @@ -13,6 +13,7 @@ workflow test_xcor { String ref_pe_xcor_log_subsample String ref_se_xcor_log String ref_se_xcor_log_subsample + String mito_chr_name = 'chrM' Int xcor_cpu = 1 Int xcor_mem_mb = 16000 @@ -23,6 +24,7 @@ workflow test_xcor { ta = pe_ta, subsample = xcor_subsample_default, paired_end = true, + mito_chr_name = mito_chr_name, cpu = xcor_cpu, mem_mb = xcor_mem_mb, @@ -33,6 +35,7 @@ workflow test_xcor { ta = pe_ta, subsample = xcor_subsample, paired_end = true, + mito_chr_name = mito_chr_name, cpu = xcor_cpu, mem_mb = xcor_mem_mb, @@ -43,6 +46,7 @@ workflow test_xcor { ta = se_ta, subsample = xcor_subsample_default, paired_end = false, + 
mito_chr_name = mito_chr_name, cpu = xcor_cpu, mem_mb = xcor_mem_mb, @@ -53,6 +57,7 @@ workflow test_xcor { ta = se_ta, subsample = xcor_subsample, paired_end = false, + mito_chr_name = mito_chr_name, cpu = xcor_cpu, mem_mb = xcor_mem_mb, diff --git a/test/test_workflow/ENCSR000DYI.json b/test/test_workflow/ENCSR000DYI.json index 1a441495..f63fa8a2 100644 --- a/test/test_workflow/ENCSR000DYI.json +++ b/test/test_workflow/ENCSR000DYI.json @@ -1,5 +1,5 @@ { - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/ref_output_v1.1.5/ENCSR000DYI/qc.json", + "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR000DYI/qc.json", "chip.pipeline_type" : "tf", "chip.genome_tsv" : "gs://encode-pipeline-genome-data/hg38_google.tsv", "chip.fastqs" : [ diff --git a/test/test_workflow/ENCSR000DYI_subsampled.json b/test/test_workflow/ENCSR000DYI_subsampled.json index 215ce31f..652b5e68 100644 --- a/test/test_workflow/ENCSR000DYI_subsampled.json +++ b/test/test_workflow/ENCSR000DYI_subsampled.json @@ -1,5 +1,5 @@ { - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/ref_output_v1.1.5/ENCSR000DYI_subsampled/qc.json", + "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR000DYI_subsampled/qc.json", "chip.pipeline_type" : "tf", "chip.genome_tsv" : "gs://encode-pipeline-genome-data/hg38_google.tsv", "chip.fastqs" : [ diff --git a/test/test_workflow/ENCSR000DYI_subsampled_chr19_only.json b/test/test_workflow/ENCSR000DYI_subsampled_chr19_only.json index 4db09f06..0a10b32c 100644 --- a/test/test_workflow/ENCSR000DYI_subsampled_chr19_only.json +++ b/test/test_workflow/ENCSR000DYI_subsampled_chr19_only.json @@ -1,5 +1,5 @@ { - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR000DYI/ref_output_v1.1.5/ENCSR000DYI_subsampled_chr19_only/qc.json", + "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR000DYI_subsampled_chr19_only/qc.json", "chip.pipeline_type" : "tf", "chip.genome_tsv" : "gs://encode-pipeline-genome-data/hg38_chr19_chrM_google.tsv", "chip.fastqs" : [ diff --git a/test/test_workflow/ENCSR936XTK.json b/test/test_workflow/ENCSR936XTK.json index b9d143a9..c162d552 100644 --- a/test/test_workflow/ENCSR936XTK.json +++ b/test/test_workflow/ENCSR936XTK.json @@ -1,5 +1,5 @@ { - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ref_output_v1.1.5/ENCSR936XTK/qc.json", + "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR936XTK/qc.json", "chip.pipeline_type" : "tf", "chip.genome_tsv" : "gs://encode-pipeline-genome-data/hg38_google.tsv", "chip.fastqs" : [ diff --git a/test/test_workflow/ENCSR936XTK_subsampled.json b/test/test_workflow/ENCSR936XTK_subsampled.json index f95ee869..034b470a 100644 --- a/test/test_workflow/ENCSR936XTK_subsampled.json +++ b/test/test_workflow/ENCSR936XTK_subsampled.json @@ -1,5 +1,5 @@ { - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ref_output_v1.1.5/ENCSR936XTK_subsampled/qc.json", + "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR936XTK_subsampled/qc.json", "chip.pipeline_type" : "tf", "chip.genome_tsv" : 
"gs://encode-pipeline-genome-data/hg38_google.tsv", "chip.fastqs" : [ diff --git a/test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json b/test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json index 8a98bdc3..f1e463bf 100644 --- a/test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json +++ b/test/test_workflow/ENCSR936XTK_subsampled_chr19_only.json @@ -1,5 +1,5 @@ { - "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ENCSR936XTK/ref_output_v1.1.5/ENCSR936XTK_subsampled_chr19_only/qc.json", + "chip.qc_report.qc_json_ref" : "gs://encode-pipeline-test-samples/encode-chip-seq-pipeline/ref_output/v1.1.5/ENCSR936XTK_subsampled_chr19_only/qc.json", "chip.pipeline_type" : "tf", "chip.genome_tsv" : "gs://encode-pipeline-genome-data/hg38_chr19_chrM_google.tsv", "chip.fastqs" : [ diff --git a/test/test_workflow/ref_output/v1.1.5/ENCSR936XTK/qc.json b/test/test_workflow/ref_output/v1.1.5/ENCSR936XTK/qc.json new file mode 100644 index 00000000..fc3f021b --- /dev/null +++ b/test/test_workflow/ref_output/v1.1.5/ENCSR936XTK/qc.json @@ -0,0 +1,439 @@ +{ + "general": { + "date": "2019-01-21 11:40:20", + "pipeline_ver": "v1.1.6", + "peak_caller": "spp", + "genome": "hg38_google.tsv", + "description": "ZNF143 ChIP-seq on human GM12878", + "title": "ENCSR936XTK", + "paired_end": true + }, + "flagstat_qc": { + "rep1": { + "total": 69122852, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 68063712, + "mapped_qc_failed": 0, + "mapped_pct": 98.47, + "paired": 69122852, + "paired_qc_failed": 0, + "read1": 34561426, + "read1_qc_failed": 0, + "read2": 34561426, + "read2_qc_failed": 0, + "paired_properly": 67691510, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 97.93, + "with_itself": 67846541, + "with_itself_qc_failed": 0, + "singletons": 217171, + "singletons_qc_failed": 0, + "singletons_pct": 0.31, + "diff_chroms": 28970, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 86590386, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 83658387, + "mapped_qc_failed": 0, + "mapped_pct": 96.61, + "paired": 86590386, + "paired_qc_failed": 0, + "read1": 43295193, + "read1_qc_failed": 0, + "read2": 43295193, + "read2_qc_failed": 0, + "paired_properly": 83126958, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 96.0, + "with_itself": 83375973, + "with_itself_qc_failed": 0, + "singletons": 282414, + "singletons_qc_failed": 0, + "singletons_pct": 0.33, + "diff_chroms": 44892, + "diff_chroms_qc_failed": 0 + } + }, + "ctl_flagstat_qc": { + "rep1": { + "total": 90753054, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 89164565, + "mapped_qc_failed": 0, + "mapped_pct": 98.25, + "paired": 90753054, + "paired_qc_failed": 0, + "read1": 45376527, + "read1_qc_failed": 0, + "read2": 45376527, + "read2_qc_failed": 0, + "paired_properly": 88383742, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 97.39, + "with_itself": 88810063, + "with_itself_qc_failed": 0, + "singletons": 354502, + "singletons_qc_failed": 0, + "singletons_pct": 0.39, + "diff_chroms": 123377, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 86825372, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 85203319, + "mapped_qc_failed": 0, + "mapped_pct": 98.13, + "paired": 86825372, + "paired_qc_failed": 0, + "read1": 43412686, + "read1_qc_failed": 0, + "read2": 43412686, + "read2_qc_failed": 0, + "paired_properly": 84455733, + 
"paired_properly_qc_failed": 0, + "paired_properly_pct": 97.27, + "with_itself": 84838815, + "with_itself_qc_failed": 0, + "singletons": 364504, + "singletons_qc_failed": 0, + "singletons_pct": 0.42, + "diff_chroms": 113675, + "diff_chroms_qc_failed": 0 + } + }, + "dup_qc": { + "rep1": { + "unpaired_reads": 0, + "paired_reads": 30291259, + "unmapped_reads": 0, + "unpaired_dupes": 0, + "paired_dupes": 3664057, + "paired_opt_dupes": 8743, + "dupes_pct": 0.120961 + }, + "rep2": { + "unpaired_reads": 0, + "paired_reads": 37072602, + "unmapped_reads": 0, + "unpaired_dupes": 0, + "paired_dupes": 2833119, + "paired_opt_dupes": 24855, + "dupes_pct": 0.076421 + } + }, + "ctl_dup_qc": { + "rep1": { + "unpaired_reads": 0, + "paired_reads": 39854681, + "unmapped_reads": 0, + "unpaired_dupes": 0, + "paired_dupes": 4092825, + "paired_opt_dupes": 8937, + "dupes_pct": 0.102694 + }, + "rep2": { + "unpaired_reads": 0, + "paired_reads": 38118550, + "unmapped_reads": 0, + "unpaired_dupes": 0, + "paired_dupes": 4128511, + "paired_opt_dupes": 8553, + "dupes_pct": 0.108307 + } + }, + "pbc_qc": { + "rep1": { + "total_read_pairs": 30099108, + "distinct_read_pairs": 26466206, + "one_read_pair": 23226034, + "two_read_pair": 2888067, + "NRF": 0.879302, + "PBC1": 0.877573, + "PBC2": 8.042069 + }, + "rep2": { + "total_read_pairs": 36950601, + "distinct_read_pairs": 34130627, + "one_read_pair": 31510426, + "two_read_pair": 2434584, + "NRF": 0.923683, + "PBC1": 0.92323, + "PBC2": 12.942838 + } + }, + "ctl_pbc_qc": { + "rep1": { + "total_read_pairs": 39400720, + "distinct_read_pairs": 35389661, + "one_read_pair": 31987615, + "two_read_pair": 2993615, + "NRF": 0.898198, + "PBC1": 0.903869, + "PBC2": 10.68528 + }, + "rep2": { + "total_read_pairs": 37604471, + "distinct_read_pairs": 33580171, + "one_read_pair": 29976868, + "two_read_pair": 3230526, + "NRF": 0.892983, + "PBC1": 0.892696, + "PBC2": 9.279253 + } + }, + "nodup_flagstat_qc": { + "rep1": { + "total": 53254404, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 53254404, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 53254404, + "paired_qc_failed": 0, + "read1": 26627202, + "read1_qc_failed": 0, + "read2": 26627202, + "read2_qc_failed": 0, + "paired_properly": 53254404, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 100.0, + "with_itself": 53254404, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 68478966, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 68478966, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 68478966, + "paired_qc_failed": 0, + "read1": 34239483, + "read1_qc_failed": 0, + "read2": 34239483, + "read2_qc_failed": 0, + "paired_properly": 68478966, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 100.0, + "with_itself": 68478966, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + } + }, + "ctl_nodup_flagstat_qc": { + "rep1": { + "total": 71523712, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 71523712, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 71523712, + "paired_qc_failed": 0, + "read1": 35761856, + "read1_qc_failed": 0, + "read2": 35761856, + "read2_qc_failed": 0, + "paired_properly": 71523712, + "paired_properly_qc_failed": 0, + 
"paired_properly_pct": 100.0, + "with_itself": 71523712, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + }, + "rep2": { + "total": 67980078, + "total_qc_failed": 0, + "duplicates": 0, + "duplicates_qc_failed": 0, + "mapped": 67980078, + "mapped_qc_failed": 0, + "mapped_pct": 100.0, + "paired": 67980078, + "paired_qc_failed": 0, + "read1": 33990039, + "read1_qc_failed": 0, + "read2": 33990039, + "read2_qc_failed": 0, + "paired_properly": 67980078, + "paired_properly_qc_failed": 0, + "paired_properly_pct": 100.0, + "with_itself": 67980078, + "with_itself_qc_failed": 0, + "singletons": 0, + "singletons_qc_failed": 0, + "singletons_pct": 0.0, + "diff_chroms": 0, + "diff_chroms_qc_failed": 0 + } + }, + "overlap_reproducibility_qc": { + "Nt": 94395, + "N1": 83370, + "N2": 66706, + "Np": 97950, + "N_opt": 97950, + "N_consv": 94395, + "opt_set": "ppr", + "consv_set": "rep1-rep2", + "rescue_ratio": 1.03766089306, + "self_consistency_ratio": 1.24981261056, + "reproducibility": "pass" + }, + "idr_reproducibility_qc": { + "Nt": 27396, + "N1": 20968, + "N2": 13849, + "Np": 29253, + "N_opt": 29253, + "N_consv": 27396, + "opt_set": "ppr", + "consv_set": "rep1-rep2", + "rescue_ratio": 1.06778361805, + "self_consistency_ratio": 1.51404433533, + "reproducibility": "pass" + }, + "xcor_score": { + "rep1": { + "num_reads": 15000000, + "est_frag_len": 225, + "corr_est_frag_len": 0.25498953123155, + "phantom_peak": 50, + "corr_phantom_peak": 0.2034027, + "argmin_corr": 1500, + "min_corr": 0.1469003, + "NSC": 1.735799, + "RSC": 1.913004 + }, + "rep2": { + "num_reads": 15000000, + "est_frag_len": 210, + "corr_est_frag_len": 0.238009871543598, + "phantom_peak": 50, + "corr_phantom_peak": 0.2127357, + "argmin_corr": 1500, + "min_corr": 0.1698985, + "NSC": 1.400895, + "RSC": 1.590003 + } + }, + "frip_macs2_qc": { + "rep1": { + "FRiP": 0.195798121295 + }, + "rep2": { + "FRiP": 0.138519505793 + }, + "pooled": { + "FRiP": 0.166707202334 + } + }, + "frip_spp_qc": { + "rep1": { + "FRiP": 0.245311389327 + }, + "rep2": { + "FRiP": 0.179917818687 + }, + "rep1-pr1": { + "FRiP": 0.262707429587 + }, + "rep2-pr1": { + "FRiP": 0.194224015203 + }, + "rep1-pr2": { + "FRiP": 0.262833043555 + }, + "rep2-pr2": { + "FRiP": 0.194158619398 + }, + "pooled": { + "FRiP": 0.20014044792 + }, + "ppr1": { + "FRiP": 0.209443627754 + }, + "ppr2": { + "FRiP": 0.209594533629 + } + }, + "overlap_frip_qc": { + "rep1-rep2": { + "FRiP": 0.131240096534 + }, + "rep1-pr": { + "FRiP": 0.158421933417 + }, + "rep2-pr": { + "FRiP": 0.0954740214288 + }, + "ppr": { + "FRiP": 0.133249615545 + } + }, + "idr_frip_qc": { + "rep1-rep2": { + "FRiP": 0.0928734665516 + }, + "rep1-pr": { + "FRiP": 0.111877734359 + }, + "rep2-pr": { + "FRiP": 0.0611730430994 + }, + "ppr": { + "FRiP": 0.0946995519024 + } + }, + "jsd_qc": { + "rep1": { + "pct_gen_enrich": 0.253084537232, + "auc": 0.495213192034, + "ch_div": 0.109013409253, + "elbow_pt": 0.0, + "jsd": 0.658133020911, + "syn_auc": 0.507396253608, + "syn_elbow_pt": 0.218110068613, + "syn_jsd": 0.351742728062 + }, + "rep2": { + "pct_gen_enrich": 0.28284376826, + "auc": 0.495774226111, + "ch_div": 0.100348476639, + "elbow_pt": 0.0, + "jsd": 0.616301829152, + "syn_auc": 0.504944745786, + "syn_elbow_pt": 0.171867247167, + "syn_jsd": 0.303263760136 + } + } +} diff --git a/test/test_workflow/test_chip.sh b/test/test_workflow/test_chip.sh index bde5b218..d806d28a 100755 --- a/test/test_workflow/test_chip.sh +++ 
b/test/test_workflow/test_chip.sh @@ -8,7 +8,7 @@ fi if [ $# -gt 2 ]; then DOCKER_IMAGE=$3 else - DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.1.5 + DOCKER_IMAGE=quay.io/encode-dcc/chip-seq-pipeline:v1.1.6 fi INPUT=$1 GCLOUD_SERVICE_ACCOUNT_SECRET_JSON_FILE=$2 diff --git a/utils/qc_jsons_to_tsv/README.md b/utils/qc_jsons_to_tsv/README.md index 93dfa68a..d56466c8 100644 --- a/utils/qc_jsons_to_tsv/README.md +++ b/utils/qc_jsons_to_tsv/README.md @@ -49,4 +49,4 @@ optional arguments: ``` python qc_jsons_to_tsv.py --search-dir test/v1.1.4 --criteria-def-json-file criteria.default.json > test_v1.1.4.tsv python qc_jsons_to_tsv.py --search-dir test/v1.1.5 --criteria-def-json-file criteria.default.json > test_v1.1.5.tsv -``` \ No newline at end of file +``` diff --git a/utils/resumer/README.md b/utils/resumer/README.md new file mode 100644 index 00000000..31ed960f --- /dev/null +++ b/utils/resumer/README.md @@ -0,0 +1,187 @@ +# resumer + +## Introduction + +This Python script parses a metadata JSON file from a previous failed workflow and generates a new input JSON file to start a pipeline from where it left off. + +## How to use it + +Before running this script, you should have a metadata JSON file for a previous failed workflow. You can get it by adding the parameter `-m metadata.json` to the Cromwell Java command line. If you stop a workflow (CTRL+C or kill), the metadata JSON file will not be generated. +```bash +$ java -jar ... cromwell-34.jar run chip.wdl -i original_input.json ... -m metadata.json +``` + +Suppose your workflow failed for some reason; you have fixed the problem and want to resume it from where it left off. +```bash +$ python resumer.py metadata.json +``` + +You will get a new input JSON file `resume.FAILED_WORKFLOW_ID.json`. Run Cromwell with it instead of the original `original_input.json`. +```bash +$ java -jar ... cromwell-34.jar run chip.wdl -i resume.FAILED_WORKFLOW_ID.json ... +``` + +## Usage + +```bash +usage: Resumer for ENCODE ATAC/Chip-Seq pipelines [-h] + [--output-def-json-file OUTPUT_DEF_JSON_FILE] + metadata_json_file + +Parse cromwell's metadata JSON file and generate a new input JSON file to +resume a pipeline from where it left off. + +positional arguments: + metadata_json_file Cromwell metadata JSON file from a previous failed + run. + +optional arguments: + -h, --help show this help message and exit + --output-def-json-file OUTPUT_DEF_JSON_FILE + Output definition JSON file for your pipeline. If not + specified, it will look for a valid JSON file on + script's directory. You can use your own JSON file for + your pipeline. Entries in "Array[Object]" is for + Array[Object] in an input JSON. This is useful to take + outputs from a scatter block. For example, the 1st + entry of "Array[Object]" in chip.json is "chip.bwa" : + {"bam" : "chip.bams", "flagstat_qc" : + "chip.flagstat_qcs"}. chip.flagstat_qcs : [...(taken + from an output of chip.bwa.flagstat_qc)...] will be + added to your new input JSON. For example, the 1st + entry of "Object" in chip.json is "chip.pool_ta" : + {"ta_pooled" : "chip.ta_pooled"}. chip.ta_pooled : + "(taken from an output of chip.pool_ta.ta_pooled)" + will be added to your new input JSON. +``` + +## Examples + +```bash +$ python resumer.py metadata.json +``` + +## How it works (for developers) + +In order to use this script, your pipeline should be able to start from any type of input (e.g. FASTQ, BAM, ...), and inputs to a previous task (e.g. map_fastq) should be ignored if the input of the next step (e.g. 
filter_bam) is already given in the input JSON file. + +``` +# example toy_chip workflow that processes through FASTQ->BAM->FILT_BAM->PEAK->REPORT +# this pipeline can start from any type of input: FASTQ, BAM, FILT_BAM or PEAK +# the key idea of resuming a workflow is to skip a previous step +# if the next step's input is already given in the input JSON file +# this is controlled by `Boolean` variables (`need_to_process_XXX`). + +workflow toy_chip { + # input definition + Array[File] fastqs = [] # per replicate + Array[File] bams = [] # per replicate + Array[File] filt_bams = [] # per replicate + Array[File] peaks = [] # per replicate + + Boolean need_to_process_peak = true # trivial + Boolean need_to_process_filt_bam = need_to_process_peak && length(peaks)==0 + Boolean need_to_process_bam = need_to_process_filt_bam && length(filt_bams)==0 + Boolean need_to_process_fastq = need_to_process_bam && length(bams)==0 + + scatter(fastq in if need_to_process_fastq then fastqs else []) { + call map_fastq { input: fastq = fastq } + } + + # temporary array to deal with outputs from either the previous step or from an input JSON file + Array[File] bams_ = flatten([map_fastq.bam, bams]) + scatter(bam in if need_to_process_bam then bams_ else []) { + call filter_bam { input: bam = bam } + } + + Array[File] filt_bams_ = flatten([filter_bam.filt_bam, filt_bams]) # temporary array again + scatter(filt_bam in if need_to_process_filt_bam then filt_bams_ else []) { + call call_peak { input: filt_bam = filt_bam } + } + + Array[File] peaks_ = flatten([call_peak.peak, peaks]) # temporary array again + if (need_to_process_peak) { + call generate_report { input: peaks = peaks_ } + } +} +``` + +The output definition JSON file `toy_chip.json` for the above example workflow should look like: +```javascript +{ + "Array[Object]" : { + "toy_chip.map_fastq" : { + "bam" : "toy_chip.bams" + }, + "toy_chip.filter_bam" : { + "filt_bam" : "toy_chip.filt_bams" + }, + "toy_chip.call_peak" : { + "peak" : "toy_chip.peaks" + } + } +} +``` + +An original input JSON file to start from FASTQs: +```javascript +{ + "toy_chip.fastqs" : ["rep1.fastq.gz", "rep2.fastq.gz"] +} +``` + +Run the pipeline with this original input JSON. +```bash +$ java -jar cromwell-34.jar run toy_chip.wdl -i org_input.json -m metadata.json +``` + +Suppose the pipeline fails due to an error in the `call_peak` task. Run `resumer.py` to make a new input JSON file to resume it. +```bash +$ python resumer.py metadata.json --output-def-json-file toy_chip.json +``` + +Then `resume.WORKFLOW_ID.json` will be generated. +```javascript +{ + "toy_chip.fastqs" : ["rep1.fastq.gz", "rep2.fastq.gz"], + "toy_chip.bams" : ["rep1.bam", "rep2.bam"], + "toy_chip.filt_bams" : ["rep1.filt.bam", "rep2.filt.bam"] +} +``` + +Feed it to the Cromwell Java command line after fixing the problem. Then the pipeline will start from the `scatter` block for the `call_peak` tasks. +```bash +$ java -jar cromwell-34.jar run toy_chip.wdl -i resume.WORKFLOW_ID.json +``` + +## Output definition JSON file (for developers) + +An output definition JSON file must have at least one of the two objects `"Array[Object]"` and `"Object"`; it can have both. The following JSON is a simplified version of the output definition JSON file for the ChIP-Seq pipeline (`chip.json`). 
+```javascript +{ + "Array[Object]" : { + "chip.bwa" : { + "bam" : "chip.bams", + "flagstat_qc" : "chip.flagstat_qcs" + } + }, + + "Object" : { + "chip.pool_ta" : { + "ta_pooled" : "chip.ta_pooled" + } + } +} +``` + +`"Array[Object]"` is useful to take an array of outputs from a `scatter` block and `"Object"` is good for taking a single value from any tasks. + +Using this JSON file for `resumer.py` will add the following extra input data definitions to the original input JSON file. +```javascript +{ + "chip.bams" : [...(an array of values taken from chip.bwa.bam)...], + "chip.flagstat_qcs" : [...(an array of values taken from chip.bwa.flagstat_qc)...], + "chip.ta_pooled" : "...(a value taken from chip.pool_ta.ta_pooled)..." +} + + diff --git a/utils/resumer/chip.json b/utils/resumer/chip.json new file mode 100644 index 00000000..545d8520 --- /dev/null +++ b/utils/resumer/chip.json @@ -0,0 +1,97 @@ +{ + "Array[Object]" : { + "chip.bwa" : { + "bam" : "chip.bams", + "flagstat_qc" : "chip.flagstat_qcs" + }, + "chip.filter" : { + "nodup_bam" : "chip.nodup_bams", + "flagstat_qc" : "chip.nodup_flagstat_qcs", + "dup_qc" : "chip.dup_qcs", + "pbc_qc" : "chip.pbc_qcs" + }, + "chip.bam2ta" : { + "ta" : "chip.tas" + }, + "chip.bwa_ctl" : { + "bam" : "chip.ctl_bams", + "flagstat_qc" : "chip.ctl_flagstat_qcs" + }, + "chip.filter_ctl" : { + "nodup_bam" : "chip.ctl_nodup_bams", + "flagstat_qc" : "chip.ctl_nodup_flagstat_qcs", + "dup_qc" : "chip.ctl_dup_qcs", + "pbc_qc" : "chip.ctl_pbc_qcs" + }, + "chip.bam2ta_ctl" : { + "ta" : "chip.ctl_tas" + }, + "chip.xcor" : { + "plot_png" : "chip.xcor_plots", + "score" : "chip.xcor_scores", + "fraglen" : "chip.fraglen" + }, + "chip.macs2" : { + "npeak" : "chip.peaks", + "frip_qc" : "chip.macs2_frip_qcs", + "sig_pval" : "chip.sig_pvals" + }, + "chip.macs2_pr1" : { + "npeak" : "chip.peaks_pr1", + "frip_qc" : "chip.macs2_pr1_frip_qcs" + }, + "chip.macs2_pr2" : { + "npeak" : "chip.peaks_pr2", + "frip_qc" : "chip.macs2_pr2_frip_qcs" + }, + "chip.spp" : { + "rpeak" : "chip.peaks", + "frip_qc" : "chip.spp_frip_qcs" + }, + "chip.spp_pr1" : { + "rpeak" : "chip.peaks_pr1", + "frip_qc" : "chip.spp_pr1_frip_qcs" + }, + "chip.spp_pr2" : { + "rpeak" : "chip.peaks_pr2", + "frip_qc" : "chip.spp_pr2_frip_qcs" + } + }, + + "Object" : { + "chip.pool_ta" : { + "ta_pooled" : "chip.ta_pooled" + }, + "chip.pool_ta_ctl" : { + "ta_pooled" : "chip.ctl_ta_pooled" + }, + "chip.macs2_pooled" : { + "npeak" : "chip.peak_pooled", + "frip_qc" : "chip.macs2_pooled_frip_qc_" + }, + "chip.macs2_ppr1" : { + "npeak" : "chip.peak_ppr1", + "frip_qc" : "chip.macs2_ppr1_frip_qc_" + }, + "chip.macs2_ppr2" : { + "npeak" : "chip.peak_ppr2", + "frip_qc" : "chip.macs2_ppr2_frip_qc_" + }, + "chip.spp_pooled" : { + "rpeak" : "chip.peak_pooled", + "frip_qc" : "chip.spp_pooled_frip_qc_" + }, + "chip.spp_ppr1" : { + "rpeak" : "chip.peak_ppr1", + "frip_qc" : "chip.spp_ppr1_frip_qc_" + }, + "chip.spp_ppr2" : { + "rpeak" : "chip.peak_ppr2", + "frip_qc" : "chip.spp_ppr2_frip_qc_" + }, + "chip.fingerprint" : { + "jsd_qcs" : "chip.jsd_qcs", + "plot" : "chip.jsd_plot" + } + } +} diff --git a/utils/resumer/default.json b/utils/resumer/default.json new file mode 120000 index 00000000..e1156357 --- /dev/null +++ b/utils/resumer/default.json @@ -0,0 +1 @@ +chip.json \ No newline at end of file diff --git a/utils/resumer/resumer.py b/utils/resumer/resumer.py new file mode 100755 index 00000000..5b8709ca --- /dev/null +++ b/utils/resumer/resumer.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python2 + +# written by Jin Lee, 2019 + +import os 
+import argparse +import json +from collections import OrderedDict + +def parse_arguments(): + parser = argparse.ArgumentParser(prog='Resumer for ENCODE ATAC/Chip-Seq pipelines', + description='Parse cromwell\'s metadata JSON file and generate a new input JSON file ' + 'to resume a pipeline from where it left off.') + parser.add_argument('metadata_json_file', type=str, help='Cromwell metadata JSON file from a previous failed run.') + parser.add_argument('--output-def-json-file', type=str, help='Output definition JSON file for your pipeline. ' + 'If not specified, it will look for a valid JSON file on script\'s directory. ' + 'You can use your own JSON file for your pipeline. ' + 'Entries in "Array[Object]" is for Array[Object] in an input JSON. This is useful to take outputs from a scatter block. ' + 'For example, the 1st entry of "Array[Object]" in chip.json is "chip.bwa" : {"bam" : "chip.bams", "flagstat_qc" : "chip.flagstat_qcs"}. ' + 'chip.flagstat_qcs : [...(taken from an output of chip.bwa.flagstat_qc)...] will be added to your new input JSON. ' + 'For example, the 1st entry of "Object" in chip.json is "chip.pool_ta" : {"ta_pooled" : "chip.ta_pooled"}. ' + 'chip.ta_pooled : "(taken from an output of chip.pool_ta.ta_pooled)" will be added to your new input JSON. ') + args = parser.parse_args() + + # if not specified by user, look into this array on script's directory + if not args.output_def_json_file: + script_dir = os.path.dirname(os.path.realpath(__file__)) + default_output_def_json_files = ['default.json'] + for f in default_output_def_json_files: + json_file = os.path.join(script_dir, f) + if os.path.exists(json_file): + args.output_def_json_file = json_file + break + return args + +def read_json_file(json_file): + with open(json_file,'r') as fp: + return json.load(fp, object_pairs_hook=OrderedDict) + +def parse_cromwell_metadata_json_file(json_file): + metadata_json = read_json_file(json_file) + + workflow_id = metadata_json['labels']['cromwell-workflow-id'].replace('cromwell-','') + org_input_json = json.loads(metadata_json['submittedFiles']['inputs'], object_pairs_hook=OrderedDict) + calls = metadata_json['calls'] + + return workflow_id, org_input_json, calls + +def find_output_of_successful_calls(calls, output_def_json): + result = OrderedDict() + + if 'Array[Object]' in output_def_json: + for call_name in output_def_json['Array[Object]']: + if call_name in calls: + call = calls[call_name] # call is a list of the same task for multiple replicates + failed = False + for i, c in enumerate(call): # i = 0-based replicate id + if c['executionStatus']!='Done': + failed = True + break + if not failed: + for key in output_def_json['Array[Object]'][call_name]: + wdl_var_name = output_def_json['Array[Object]'][call_name][key] + result[wdl_var_name] = [call[i]['outputs'][key] for i, _ in enumerate(call)] + + if 'Object' in output_def_json: + for call_name in output_def_json['Object']: + if call_name in calls: + call = calls[call_name] # call is a list of the same task for multiple replicates + failed = False + for i, c in enumerate(call): # i = 0-based replicate id + if c['executionStatus']!='Done': + failed = True + break + if not failed: + assert(len(call)==1) + for key in output_def_json['Object'][call_name]: + wdl_var_name = output_def_json['Object'][call_name][key] + result[wdl_var_name] = call[0]['outputs'][key] + + return result + +def main(): + args = parse_arguments() + + workflow_id, org_input_json, calls = parse_cromwell_metadata_json_file(args.metadata_json_file) + + 
output_def_json = read_json_file(args.output_def_json_file) + + new_input_json = find_output_of_successful_calls(calls, output_def_json) + + # merge new input json over original input json + for key in new_input_json: + org_input_json[key] = new_input_json[key] + + with open('resume.{}.json'.format(workflow_id),'w') as fp: + fp.write(json.dumps(org_input_json, indent=4)) + +if __name__=='__main__': + main() diff --git a/workflow_opts/docker.json b/workflow_opts/docker.json index 8c0f68c8..6881aa99 100644 --- a/workflow_opts/docker.json +++ b/workflow_opts/docker.json @@ -1,6 +1,6 @@ { "default_runtime_attributes" : { - "docker" : "quay.io/encode-dcc/chip-seq-pipeline:v1.1.5", + "docker" : "quay.io/encode-dcc/chip-seq-pipeline:v1.1.6", "zones": "us-west1-a us-west1-b us-west1-c us-central1-c us-central1-b", "failOnStderr" : false, "continueOnReturnCode" : 0, diff --git a/workflow_opts/scg.json b/workflow_opts/scg.json index 9ca47e48..09b76364 100644 --- a/workflow_opts/scg.json +++ b/workflow_opts/scg.json @@ -1,7 +1,7 @@ { "default_runtime_attributes" : { "slurm_account" : "YOUR_SLURM_ACCOUNT", - "singularity_container" : "/reference/ENCODE/pipeline_singularity_images/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "/reference/ENCODE/pipeline_singularity_images/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/reference/ENCODE,/scratch,/srv/gsfs0" } } diff --git a/workflow_opts/sge.json b/workflow_opts/sge.json index 8defdd4a..45b1e442 100644 --- a/workflow_opts/sge.json +++ b/workflow_opts/sge.json @@ -1,6 +1,6 @@ { "default_runtime_attributes" : { "sge_pe" : "shm", - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg" + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg" } } diff --git a/workflow_opts/sherlock.json b/workflow_opts/sherlock.json index 8c696179..580b83f7 100644 --- a/workflow_opts/sherlock.json +++ b/workflow_opts/sherlock.json @@ -1,7 +1,7 @@ { "default_runtime_attributes" : { "slurm_partition" : "normal", - "singularity_container" : "/home/groups/cherry/encode/pipeline_singularity_images/chip-seq-pipeline-v1.1.5.simg", + "singularity_container" : "/home/groups/cherry/encode/pipeline_singularity_images/chip-seq-pipeline-v1.1.6.simg", "singularity_bindpath" : "/scratch,/lscratch,/oak/stanford,/home/groups/cherry/encode" } } diff --git a/workflow_opts/singularity.json b/workflow_opts/singularity.json index 2ab4295f..612eaec6 100644 --- a/workflow_opts/singularity.json +++ b/workflow_opts/singularity.json @@ -1,5 +1,5 @@ { "default_runtime_attributes" : { - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg" + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg" } } diff --git a/workflow_opts/slurm.json b/workflow_opts/slurm.json index 5268aae0..a00288cd 100644 --- a/workflow_opts/slurm.json +++ b/workflow_opts/slurm.json @@ -2,6 +2,6 @@ "default_runtime_attributes" : { "slurm_partition" : "YOUR_SLURM_PARTITION", "slurm_account" : "YOUR_SLURM_ACCOUNT", - "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.5.simg" + "singularity_container" : "~/.singularity/chip-seq-pipeline-v1.1.6.simg" } }