diff --git a/pipeline/Makefile b/pipeline/Makefile
index 894183e71..5d1b02316 100644
--- a/pipeline/Makefile
+++ b/pipeline/Makefile
@@ -1,48 +1,35 @@
 # set the working directory where the makefile will be created and analysis will be done.
-WORKDIR=./test
+TEMP_DIR = ~/tmp
 
 # analysis-specific files.
-SPEC=templates/metabarcode_qc/metabarcode_spec.hjson
-TEMPLATE=templates/metabarcode_qc/metabarcode_makefile.html
+SPEC=templates/qc/qc_spec.hjson
+TEMPLATE=templates/qc/qc_makefile.html
 COMMAND=all
 
-# check if spec file exists.
+TEST_DATA=$(TEMP_DIR)/data.tar.gz
+SAMPLE_DATA=$(TEMP_DIR)/sampleinfo.txt
 
-ifeq ($(wildcard $(SPEC)),)
-$(error $(SPEC) not found.)
-endif
-
-# check if template file exists.
-
-ifeq ($(wildcard $(TEMPLATE)),)
-$(error $(TEMPLATE) not found.)
-endif
+run: setup
+	@echo runs all commands to set up the pipeline and then runs it.
 
+$(TEMP_DIR):
+	mkdir -p $(TEMP_DIR)
 
-workdir:
-	#
-	# creates work directory.
-	#
-	mkdir -p $(WORKDIR)
+$(TEST_DATA): $(TEMP_DIR)
+	curl -o $(TEST_DATA) http://iris.bx.psu.edu/projects/metabarcode-data/data.tar.gz
 
-get_data: workdir
-	#
-	# Download data and sampleinfo to workdir.
-	#
-	wget -P $(WORKDIR) http://iris.bx.psu.edu/projects/metabarcode-data/data.tar.gz
-	wget -P $(WORKDIR) http://iris.bx.psu.edu/projects/metabarcode-data/sampleinfo.txt
+$(SAMPLE_DATA): $(TEMP_DIR)
+	curl -o $(SAMPLE_DATA) http://iris.bx.psu.edu/projects/metabarcode-data/sampleinfo.txt
 
-setup:
+setup: $(TEST_DATA) $(SAMPLE_DATA)
 	#
 	# creates the makefile and runs the pipeline.
 	#
-	python make.py $(SPEC) $(TEMPLATE) > $(WORKDIR)/Makefile
+	#python make.py $(SPEC) $(TEMPLATE) > $(WORKDIR)/Makefile
 	#
 	# runs make
 	#
-	cd $(WORKDIR); make $(COMMAND)
-
-run: workdir get_data setup
-	@echo runs all commands to setup the pipeline and then runs it.
+	#cd $(WORKDIR); make $(COMMAND)
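The rewritten Makefile models the two downloads as file targets (`$(TEST_DATA)`, `$(SAMPLE_DATA)`), so re-running `make run` skips anything already on disk. A minimal Python sketch of the same download-once behavior, with paths and URLs copied from the Makefile above (the script itself is illustrative, not part of the repo):

```python
# Illustrative only: mirrors the file-target semantics of the Makefile above.
import os
import urllib.request

TEMP_DIR = os.path.expanduser("~/tmp")

URLS = {
    "data.tar.gz": "http://iris.bx.psu.edu/projects/metabarcode-data/data.tar.gz",
    "sampleinfo.txt": "http://iris.bx.psu.edu/projects/metabarcode-data/sampleinfo.txt",
}

os.makedirs(TEMP_DIR, exist_ok=True)           # the $(TEMP_DIR) target
for name, url in URLS.items():
    dest = os.path.join(TEMP_DIR, name)
    if not os.path.exists(dest):               # make only rebuilds missing targets
        urllib.request.urlretrieve(url, dest)
```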
diff --git a/pipeline/classify/__init__.py b/pipeline/classify/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/pipeline/classify/test.py b/pipeline/classify/test.py
new file mode 100644
index 000000000..02aca1c9f
--- /dev/null
+++ b/pipeline/classify/test.py
@@ -0,0 +1 @@
+print("HELLO")
diff --git a/pipeline/data/__init__.py b/pipeline/data/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/pipeline/samplesheet.py b/pipeline/data/samplesheet.py
similarity index 96%
rename from pipeline/samplesheet.py
rename to pipeline/data/samplesheet.py
index c0193352c..234bd94bd 100644
--- a/pipeline/samplesheet.py
+++ b/pipeline/data/samplesheet.py
@@ -69,14 +69,14 @@ def get_fnames(sname , datadir):
 
 if __name__ == "__main__":
     samplesheet = sys.argv[1]
-    datadir =sys.argv[2]
+    datadir = sys.argv[2]
 
     #samplesheet ="./test/sampleinfo.txt"
     #datadir ="test/data"
 
     data = update_sampleinfo(samplesheet, datadir)
 
-    outfile = open("updated_sampleinfo.txt", "w")
+    outfile = sys.stdout
 
     header ="\t".join(['sample_name','sample_group','target_gene','barcode','fwd_primer','rev_primer','file1','file2'])
     outfile.write(header + "\n")
diff --git a/pipeline/make.py b/pipeline/make.py
deleted file mode 100644
index c69ef0d68..000000000
--- a/pipeline/make.py
+++ /dev/null
@@ -1,98 +0,0 @@
-import django
-from django.conf import settings
-from django.template.loader import get_template
-import sys, os, json, hjson
-
-
-def path_join(*args):
-    return os.path.abspath(os.path.join(*args))
-
-
-CURR_DIR = os.path.dirname(os.path.realpath(__file__))
-TEMPLATE_DIR = CURR_DIR
-#TEMPLATE_DIR = path_join(CURR_DIR, "templates")
-
-
-def set_env():
-    '''
-    Specify template engine and template directory.
-    Pass the settings variable to configure.
-    '''
-
-    TEMPLATES = [
-        {
-            'BACKEND': 'django.template.backends.django.DjangoTemplates',
-            'DIRS': [TEMPLATE_DIR],
-        }
-    ]
-    settings.configure(TEMPLATES=TEMPLATES)
-    return
-
-
-def render(data, template):
-    set_env()
-    django.setup()
-    temp = get_template(template)
-    html = temp.render(data)
-    return html
-
-
-def name_val_pair(element):
-    '''
-    input is a dictionary object and it returns a name, value pair.
-    '''
-
-    try:
-        value = element['default'] if not element['value'] else element['value']
-    except KeyError:
-        print("{0} raises an error".format(element['name']) )
-    return element['name'], value
-
-
-def parse_configs(configs):
-    '''
-    parse config file and returns a dictionary to be rendered.
-    '''
-
-    specs = dict()
-    for element in configs:
-        if not element['name'] == "analysis_spec":
-            name, value = name_val_pair(element)
-            specs[name] = value
-    return specs
-
-
-def load_specs(fname):
-    try:
-        data = open(fname).read()
-    except OSError as e:
-        print("File not found: {0}".format(e))
-        sys.stderr.close()
-    #data = json.loads(data)
-    data = hjson.loads(data)
-    return data
-
-
-def run():
-    config = sys.argv[1]
-    template = sys.argv[2]
-
-    #config = "./templates/metabarcode_qc/metabarcode_spec.hjson"
-    #template = "./templates/metabarcode_qc/metabarcode_makefile.html"
-
-
-    # loads sepc file.
-    configs = load_specs(config)
-
-    # parse configs to render in template.
-    specs = parse_configs(configs)
-
-    context = {'specs': specs, 'jobid': 'job0' }
-
-    # render template.
-    html = render(context, template)
-    print(html)
-
-
-if __name__ == "__main__":
-    run()
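Two things happen above: `samplesheet.py` moves into the new `pipeline.data` package and now writes to stdout instead of a hard-coded `updated_sampleinfo.txt`, and the old `make.py` renderer is deleted (its replacement, `pipeline/render.py`, appears below). A sketch of the new invocation, matching the `python -m pipeline.data.samplesheet` recipe the makefile template uses (the file paths are placeholders):

```python
# Illustrative: the caller now chooses where the updated sample sheet goes.
import subprocess

with open("updated_sampleinfo.txt", "w") as out:
    subprocess.run(
        ["python", "-m", "pipeline.data.samplesheet", "sampleinfo.txt", "data"],
        stdout=out,
        check=True,
    )
```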
diff --git a/pipeline/qc/__init__.py b/pipeline/qc/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/pipeline/qc/qc_spec.hjson b/pipeline/qc/qc_spec.hjson
new file mode 100644
index 000000000..b66ae7547
--- /dev/null
+++ b/pipeline/qc/qc_spec.hjson
@@ -0,0 +1,124 @@
+{
+sampleinfo :
+    {
+    value : ~/tmp/sampleinfo.txt
+    help :
+        '''
+Enter a tab delimited text file. Order of the fields should be
+sample name, sample group, barcode, forward primer sequence, reverse primer sequence, target gene.
+Sample name should match the sample name in the dataset.
+        '''
+    label : Specify the sample file.
+    css: red
+    visible: 1
+    display_type: UPLOAD
+    }
+data :
+    {
+    value: ~/tmp/data.tar.gz
+    visible: 1
+    origin: PROJECT
+    data_type: TAR_FASTQ_GZ
+    display_type: DROPDOWN
+
+    }
+library_type :
+    {
+    value : true
+    label : Data entered is paired end.
+    visible: 1
+    display_type: CHECKBOX
+    }
+trim_primer:
+    {
+    label : Trims barcode and primer sequences.
+    value : true
+    help : Barcode and primer sequences should be in the sample sheet.
+    visible: 1
+    display_type: CHECKBOX
+
+    }
+trim_quality:
+    {
+    label : Specify average quality value for trimming.
+    help : Leave empty for no trimming.
+    value : 30
+    range : [20,32]
+    visible : 1
+    display_type : INTEGER
+    }
+merge_mismatch:
+    {
+    label : Maximum mismatch allowed. Default is 3.0.
+    value : 3.0
+    range : [0.9,5.1]
+    visible : 1
+    display_type : FLOAT
+    }
+merge_overlap:
+    {
+    label : Minimum number of overlapping bases to allow merging. Default is 12.
+    value : 12
+    range : [8,16]
+    visible : 1
+    display_type : INTEGER
+    }
+action :
+    {
+    choices : {
+        fastqc : Create fastqc report
+        merge : Merge paired end dataset
+    }
+
+    value : merge
+    visible : 1
+    display_type: RADIO
+    }
+// scripts info
+input_check :
+    {
+    value : ../../check_input.py
+    display_type : SCRIPT
+    }
+report_template :
+    {
+    value : metabarcode_report.html
+    display_type : SCRIPT
+    }
+report_script :
+    {
+    value : metabarcode_results.py
+    display_type: SCRIPT
+    }
+analysis_spec :
+    {
+    value: Quality control of sequencing data
+    description:
+        '''
+# Purpose
+
+This analysis takes a paired end dataset and produces a merged, improved dataset.
+It can optionally trim the data before merging.
+MultiQC quality reports of each step are also produced.
+
+# Parameters
+
+**Input:**
+
+1. Paired end sequence data as a tar archived gzip file.
+2. Sample information sheet as a tab delimited text file.
+
+# Results
+
+This analysis produces
+
+1. MultiQC report - an aggregate fastqc report of the input data.
+2. Trimmed dataset and its quality report (optional).
+3. Merged dataset and its quality report.
+
+        '''
+    display_type:MODEL
+    }
+}
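Each top-level key in the spec is a dict whose `value` field is what the makefile template interpolates (`{{trim_quality.value}}`, `{{data.value}}`, and so on). A quick sketch of loading the spec and reading a field with hjson (the relative path is an assumption about the working directory):

```python
# Illustrative: how render.render_string sees the spec after hjson parsing.
import hjson

spec = hjson.load(open("pipeline/qc/qc_spec.hjson"))

print(spec["trim_quality"]["value"])   # 30
print(spec["data"]["value"])           # ~/tmp/data.tar.gz
print(spec["action"]["choices"])       # {'fastqc': ..., 'merge': ...}
```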
diff --git a/pipeline/qc/templates/qc_makefile.html b/pipeline/qc/templates/qc_makefile.html
new file mode 100644
index 000000000..555125dbe
--- /dev/null
+++ b/pipeline/qc/templates/qc_makefile.html
@@ -0,0 +1,96 @@
+
+INPUT_DATA = {{data.value}}
+INPUT_SAMPLE_INFO := {{sampleinfo.value}}
+
+
+#
+# Internal parameters after this.
+#
+
+DATA_DIR = data
+RESULT_DIR = results
+
+PTRIM_DIR = $(DATA_DIR)/trim/trimp
+QTRIM_DIR = $(DATA_DIR)/trim/trimq
+UPDATED_SAMPLE_INFO = updated_sampleinfo.txt
+
+UNPACKED_FASTQ = $(DATA_DIR)/*.fastq.gz
+FASTQC_REPORT = $(DATA_DIR)/*_fastqc.html
+TRIMMED_PRIMER = $(PTRIM_DIR)/*.gz
+TRIMMED_QUALITY = $(QTRIM_DIR)/*.gz
+
+all: $(FASTQC_REPORT) $(TRIMMED_QUALITY)
+	@echo "executes all commands"
+
+$(UNPACKED_FASTQ): $(INPUT_DATA)
+	tar -xzvmf $(INPUT_DATA)
+
+$(FASTQC_REPORT): $(UNPACKED_FASTQ)
+	ls $(DATA_DIR)/*.gz | parallel --verbose --progress -j 8 fastqc --nogroup {} -o $(DATA_DIR)
+
+ERR_LOG = stderr.txt
+
+$(UPDATED_SAMPLE_INFO): $(INPUT_SAMPLE_INFO)
+	#
+	# Make new sample sheet named $(UPDATED_SAMPLE_INFO)
+	#
+	python -m pipeline.data.samplesheet $(INPUT_SAMPLE_INFO) $(DATA_DIR) > $(UPDATED_SAMPLE_INFO)
+
+
+multiqc: $(FASTQC_REPORT)
+	#
+	# run multiqc to create an aggregate report
+	#
+	mkdir -p $(RESULT_DIR)
+	multiqc -f -n input_multiqc -o $(DATA_DIR) $(DATA_DIR)
+	cp $(DATA_DIR)/input_multiqc.html $(RESULT_DIR)
+	#
+	# clean
+	#
+	#rm -f $(DATA_DIR)/*fastqc.zip
+
+$(TRIMMED_PRIMER): $(UPDATED_SAMPLE_INFO) $(INPUT_DATA)
+	#
+	mkdir -p $(PTRIM_DIR)/logs
+	#
+	# trim left primer
+	#
+	mkdir -p $(RESULT_DIR)
+	cat $(UPDATED_SAMPLE_INFO) | parallel --verbose --progress --header : --colsep '\t' bbduk.sh \
+	in1=$(DATA_DIR)/{file1} in2=$(DATA_DIR)/{file2} \
+	out1=$(PTRIM_DIR)/{sample_name}_fwd1_trim.fq.gz \
+	out2=$(PTRIM_DIR)/{sample_name}_fwd2_trim.fq.gz \
+	literal={barcode}{fwd_primer} ktrim=l k=23 hdist=1 tpe tbo \
+	overwrite=t skipr2=t stats=$(PTRIM_DIR)/logs/{sample_name}_fwd_stats.txt
+
+	#
+	# trim right primer
+	#
+	cat $(UPDATED_SAMPLE_INFO) | parallel --verbose --progress --header : --colsep '\t' bbduk.sh \
+	in1=$(PTRIM_DIR)/{sample_name}_fwd1_trim.fq.gz \
+	in2=$(PTRIM_DIR)/{sample_name}_fwd2_trim.fq.gz \
+	out1=$(PTRIM_DIR)/{sample_name}_R1_trimp.fq.gz \
+	out2=$(PTRIM_DIR)/{sample_name}_R2_trimp.fq.gz \
+	literal={barcode}{rev_primer} ktrim=l k=28 hdist=1 tpe tbo overwrite=t \
+	skipr1=t stats=$(PTRIM_DIR)/logs/{sample_name}_rev_stats.txt
+	#
+	# clean
+	#
+	cat $(PTRIM_DIR)/logs/*stats.txt > $(RESULT_DIR)/trimp_stats.txt
+	#rm -f $(PTRIM_DIR)/*trimfwd*
+
+
+$(TRIMMED_QUALITY): $(TRIMMED_PRIMER)
+	mkdir -p $(QTRIM_DIR)/logs
+	cat $(UPDATED_SAMPLE_INFO) | parallel --verbose --progress --header : --colsep '\t' bbduk.sh \
+	in1=$(PTRIM_DIR)/{sample_name}_R1_trimp.fq.gz in2=$(PTRIM_DIR)/{sample_name}_R2_trimp.fq.gz \
+	qtrim=rl trimq={{trim_quality.value}} minlength=35 overwrite=t \
+	out1=$(QTRIM_DIR)/{sample_name}_R1_trim.fq.gz \
+	out2=$(QTRIM_DIR)/{sample_name}_R2_trim.fq.gz stats=$(QTRIM_DIR)/logs/{sample_name}_stats.txt
+	cat $(QTRIM_DIR)/logs/*stats.txt > $(RESULT_DIR)/trimq_stats.txt
+	#
+	# quality report
+	#
+	ls $(QTRIM_DIR)/*gz | parallel --verbose --progress -j 8 fastqc --nogroup {} -o $(QTRIM_DIR)
+	multiqc -n trim_multiqc --force -o $(QTRIM_DIR) $(QTRIM_DIR)
+	cp $(QTRIM_DIR)/trim_multiqc.html $(RESULT_DIR)
diff --git a/pipeline/qc/test.py b/pipeline/qc/test.py
new file mode 100644
index 000000000..86a8517b7
--- /dev/null
+++ b/pipeline/qc/test.py
@@ -0,0 +1,15 @@
+from pipeline import render
+
+import os
+
+CURR_DIR = os.path.dirname(os.path.realpath(__file__))
+
+TEMPLATE_FILE = os.path.join(CURR_DIR, 'templates', 'qc_makefile.html')
+SPEC_FILE = os.path.join(CURR_DIR, 'qc_spec.hjson')
+
+if __name__ == "__main__":
+    spec = open(SPEC_FILE).read()
+    tmpl = open(TEMPLATE_FILE).read()
+    html = render.render_string(spec, tmpl)
+    print(html)
+
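`qc/test.py` renders the spec through the template and prints the resulting makefile to stdout. One way to materialize it into a work directory, equivalent to `python -m pipeline.qc.test > work/Makefile` (the `work/` path is a placeholder):

```python
# Illustrative: capture the rendered makefile that pipeline/qc/test.py prints.
import os
import subprocess

os.makedirs("work", exist_ok=True)
with open("work/Makefile", "w") as fp:
    subprocess.run(["python", "-m", "pipeline.qc.test"], stdout=fp, check=True)
```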
diff --git a/pipeline/render.py b/pipeline/render.py
new file mode 100644
index 000000000..214c77d99
--- /dev/null
+++ b/pipeline/render.py
@@ -0,0 +1,56 @@
+import django
+from django.conf import settings
+from django.template import Template, Context
+import hjson
+
+
+def set_env():
+    '''
+    Specify the template engine settings.
+    Pass the settings variable to configure.
+    '''
+
+    # settings.configure() may only be called once per process.
+    if settings.configured:
+        return
+
+    TEMPLATES = [
+        {
+            'BACKEND': 'django.template.backends.django.DjangoTemplates',
+            'DIRS': [],
+            'OPTIONS': {
+                'string_if_invalid': "***MISSING***"
+            },
+        }
+    ]
+    settings.configure(TEMPLATES=TEMPLATES)
+    django.setup()
+    return
+
+
+def render_file(spec_file, template_file):
+    spec_txt = open(spec_file).read()
+    template_txt = open(template_file).read()
+    return render_string(spec_txt, template_txt)
+
+
+def render_string(data, template_txt):
+    set_env()
+
+    spec = hjson.loads(data)
+
+    template = Template(template_txt)
+    context = Context(spec)
+    html = template.render(context)
+    return html
+
+
+if __name__ == "__main__":
+
+    config = "./templates/metabarcode_qc/metabarcode_spec.hjson"
+    template = "./templates/metabarcode_qc/metabarcode_makefile.html"
+
+    html = render_file(config, template)
+    print(html)
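`render.py` replaces `make.py`: rather than resolving template names through a loader, it renders a template string directly against the parsed spec, and `string_if_invalid` makes unresolved variables visible in the output instead of silently disappearing. A minimal usage sketch with an inline spec and template (both made up for this example):

```python
# Illustrative: inline spec/template, not files from the repo.
from pipeline.render import render_string

spec = "{ data: { value: ~/tmp/data.tar.gz } }"
template = "INPUT_DATA = {{data.value}}\nOOPS = {{missing.value}}"

print(render_string(spec, template))
# INPUT_DATA = ~/tmp/data.tar.gz
# OOPS = ***MISSING***
```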
spec_file = "templates/qc/qc_spec.hjson" + template_file = "templates/qc/qc_makefile.html" - # these won't be needed later. - ############# - spec = "templates/metabarcode_qc/metabarcode_spec.hjson" - template = "templates/metabarcode_qc/metabarcode_makefile.html" - out = "./workout" - jobid = "job0" + + spec = get_spec(specfile=spec_file) + + spec['data']['value'] = data + spec['sampleinfo']['value'] = sinfo + + new_spec = hjson.dumps(spec) + template_txt = open(template_file).read() + + #html=render_string(new_spec,template_txt) + #print(html) + #1/0 job = Job() - job.jobid= jobid - job.spec = spec - job.template = template - job.outdir = out - ########## + job.jobid= "job0" + job.spec = new_spec + job.template = template_txt + job.outdir = "./work" + job.status ="" + run(job) + print (job.status) + print (job.log) + + +class Job: + pass + + +if __name__ == "__main__": + + test_job(data="~/work/web-dev/biostar-engine/pipeline/testdata/data.tar.gz",sinfo="~/work/web-dev/biostar-engine/pipeline/testdata/sampleinfo.txt") + #run(job) + diff --git a/pipeline/templates/qc/qc_makefile.html b/pipeline/templates/qc/qc_makefile.html index 2e9846d3b..6a3e417e1 100644 --- a/pipeline/templates/qc/qc_makefile.html +++ b/pipeline/templates/qc/qc_makefile.html @@ -1,9 +1,8 @@ -JOBID = {{jobid}} DATADIR = data RESDIR = results -INPUTDATA = {{specs.data}} -INPUT_SAMPLEINFO := {{specs.sampleinfo}} +INPUTDATA = {{data.value}} +INPUT_SAMPLEINFO = {{sampleinfo.value}} PTRIM_DIR = $(DATADIR)/trim/trimp QTRIM_DIR = $(DATADIR)/trim/trimq @@ -32,13 +31,13 @@ # # Make new sample sheet named $(UPDATED_SAMPLE_SHEET) # - python ../samplesheet.py $(INPUT_SAMPLEINFO) $(DATADIR) 2>>$(ERR_LOG) + python -m pipeline.qc.samplesheet.py $(INPUT_SAMPLEINFO) $(DATADIR) fastqc: extract # # run fastqc on all files # - ls $(DATADIR)/*gz | parallel --verbose --progress -j 8 fastqc --nogroup {} -o $(DATADIR) 2>>$(ERR_LOG) + ls $(DATADIR)/*gz | parallel --verbose --progress -j 8 fastqc --nogroup {} -o $(DATADIR) multiqc: fastqc $(RESDIR) # @@ -63,7 +62,7 @@ out1=$(PTRIM_DIR)/{sample_name}_fwd1_trim.fq.gz \ out2=$(PTRIM_DIR)/{sample_name}_fwd2_trim.fq.gz \ literal={barcode}{fwd_primer} ktrim=l k=23 hdist=1 tpe tbo \ - overwrite=t skipr2=t stats=$(PTRIM_DIR)/logs/{sample_name}_fwd_stats.txt 2>>$(ERR_LOG) + overwrite=t skipr2=t stats=$(PTRIM_DIR)/logs/{sample_name}_fwd_stats.txt # # trim right primer @@ -74,7 +73,7 @@ out1=$(PTRIM_DIR)/{sample_name}_R1_trimp.fq.gz \ out2=$(PTRIM_DIR)/{sample_name}_R2_trimp.fq.gz \ literal={barcode}{rev_primer} ktrim=l k=28 hdist=1 tpe tbo \ - skipr1=t stats=$(PTRIM_DIR)/logs/{sample_name}_rev_stats.txt 2>>$(ERR_LOG) + skipr1=t stats=$(PTRIM_DIR)/logs/{sample_name}_rev_stats.txt # # clean # @@ -88,7 +87,7 @@ in1=$(PTRIM_DIR)/{sample_name}_R1_trimp.fq.gz in2=$(PTRIM_DIR)/{sample_name}_R2_trimp.fq.gz \ qtrim=rl trimq={{specs.trim_quality}} minlength=35 \ out1=$(QTRIM_DIR)/{sample_name}_R1_trim.fq.gz \ - out2=$(QTRIM_DIR)/{sample_name}_R2_trim.fq.gz stats=$(QTRIM_DIR)/logs/{sample_name}_stats.txt 2>>$(ERR_LOG) + out2=$(QTRIM_DIR)/{sample_name}_R2_trim.fq.gz stats=$(QTRIM_DIR)/logs/{sample_name}_stats.txt cat $(QTRIM_DIR)/logs/*stats.txt > $(RESDIR)/trimq_stats.txt # # quality report diff --git a/pipeline/templates/qc/qc_spec.hjson b/pipeline/templates/qc/qc_spec.hjson index 96533995f..9271137dd 100644 --- a/pipeline/templates/qc/qc_spec.hjson +++ b/pipeline/templates/qc/qc_spec.hjson @@ -1,7 +1,7 @@ { sampleinfo : { - value : test/sampleinfo.txt + value : sampleinfo.txt help : ''' Enter a tab delimited text 
file. Order of the fields should be @@ -15,7 +15,7 @@ Sample name should match the sample name in the dataset. } data : { - value: test/data.tar.gz + value: data.tar.gz visible: 1 origin: PROJECT data_type: TAR_FASTQ_GZ @@ -121,4 +121,4 @@ This analysis produces ''' display_type:MODEL } -} \ No newline at end of file +} diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..1fcae46cb --- /dev/null +++ b/setup.py @@ -0,0 +1,6 @@ +from setuptools import setup, find_packages +setup( + name="Biostar Pipelines", + version="0.1", + packages= [ "pipeline" ], +)
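With `setup.py` in place the repo can be installed in development mode (e.g. `pip install -e .`), which is what lets the absolute `pipeline.*` imports and the `python -m pipeline.data.samplesheet` recipes resolve. A hedged smoke test for the new job runner, assuming local copies of the test data (the paths are placeholders; note that `test_job` resolves its spec and template relative to the current directory, so run it from `pipeline/`):

```python
# Illustrative smoke test for the new job runner.
from pipeline.run_job import test_job

test_job(
    data="/path/to/data.tar.gz",
    sinfo="/path/to/sampleinfo.txt",
)
```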