Mb add fusion #14 (Draft)

wants to merge 9 commits into base: master
3 changes: 2 additions & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -36,4 +36,5 @@ Please describe the tests that you ran to verify your changes. Provide instructi
- [ ] I have added tests that prove my fix is effective or that my feature works
- [ ] New and existing unit tests pass locally with my changes
- [ ] Any dependent changes have been merged and published in downstream modules
- [ ] I have checked my code and corrected any misspellings
- [ ] I have ensured that commits used for testing were pushed before PR approval
2 changes: 1 addition & 1 deletion LICENSE
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright [yyyy] [name of copyright owner]
Copyright 2020 Center for Data-Driven Discovery in Biomedicine

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
10 changes: 5 additions & 5 deletions README.md
@@ -221,6 +221,7 @@ outputs:
[Variant Effect Predictor](https://useast.ensembl.org/info/docs/tools/vep/index.html) is an Ensembl tool for annotating variants.
The tool built for this repo has very basic and rigid functionality, but it can be run on any of the VCF outputs from the workflows:
`tools/variant_effect_predictor.cwl`.
It can be run using a cache or a GTF; in this case a GTF was used so that gene models could be generated for both human and SARS-CoV-2.
Run time on the COV-IRT dataset (n=782): ~6 minutes; cost on Cavatica: ~$0.22 per sample.
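A minimal invocation sketch in GTF mode (file names here are placeholders, and `cwltool` stands in for whatever CWL runner you use; secondary files like `.fai` and `.tbi` must sit next to their primaries):

```sh
# hypothetical inputs shown; swap in your own reference, VCF, and bgzipped GTF
cwltool tools/variant_effect_predictor.cwl \
  --reference GRCh38.primary_assembly.genome.fa \
  --input_vcf sample1.strelka2.vcf.gz \
  --output_basename sample1 \
  --tool_name strelka2 \
  --bgzipped_gtf gene_models.gtf.gz
```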


@@ -235,13 +236,12 @@
```yaml
inputs:
reference: {type: File, secondaryFiles: [.fai], label: Fasta genome assembly with index}
input_vcf: {type: File, secondaryFiles: [.tbi]}
output_basename: string
merged_cache: {type: boolean?, doc: "If merged cache being used", default: true}
tool_name: {type: string, doc: "Name of tool used to generate calls"}
cache: {type: File?, label: tar gzipped cache from ensembl/local converted cache, doc: "Use this if not using a gtf for gene models"}
bgzipped_gtf: {type: File?, secondaryFiles: ['.tbi'], doc: "Use this if not using a cache, but a gtf instead for gene models"}
```

106 changes: 106 additions & 0 deletions fq_split_rg_instructions.md
@@ -0,0 +1,106 @@
# Get code and install in python env

```sh
git clone https://github.com/kmhernan/gdc-fastq-splitter
cd gdc-fastq-splitter
```
*If you don't have it yet*, set up a virtual env for Python 3:

`pip3 install virtualenv # depends on your setup`

```sh
virtualenv venv --python /usr/bin/python3 # again, wherever your python3 (should be version 3.5+) is installed
./venv/bin/pip install .
venv/bin/gdc-fastq-splitter -h # if successful, you should get the help menu
```

# Fun time run time
Note that processing paired-end files will use two cores instead of one (likely a good thing).

```text
inputs:
-rw-rw-r-- 1 ubuntu ubuntu 84M May 7 04:31 COVHA-P1-D06-N.filtered.human.R1.fastq.gz
-rw-rw-r-- 1 ubuntu ubuntu 88M May 7 04:31 COVHA-P1-D06-N.filtered.human.R2.fastq.gz
```

```sh
gdc-fastq-splitter/venv/bin/gdc-fastq-splitter COVHA-P1-D06-N.filtered.human.R1.fastq.gz COVHA-P1-D06-N.filtered.human.R2.fastq.gz -o COVHA-P1-D06-N.filtered.human_ 2> errs.log
```


## Result outputs:

```text
-rw-rw-r-- 1 ubuntu ubuntu 34M Jul 1 18:38 COVHA-P1-D06-N.filtered.human_H5NKGDSXY_1_R1.fq.gz
-rw-rw-r-- 1 ubuntu ubuntu 8.9K Jul 1 18:38 COVHA-P1-D06-N.filtered.human_H5NKGDSXY_1_R1.report.json
-rw-rw-r-- 1 ubuntu ubuntu 36M Jul 1 18:38 COVHA-P1-D06-N.filtered.human_H5NKGDSXY_1_R2.fq.gz
-rw-rw-r-- 1 ubuntu ubuntu 8.9K Jul 1 18:38 COVHA-P1-D06-N.filtered.human_H5NKGDSXY_1_R2.report.json
-rw-rw-r-- 1 ubuntu ubuntu 34M Jul 1 18:38 COVHA-P1-D06-N.filtered.human_H5NKGDSXY_2_R1.fq.gz
-rw-rw-r-- 1 ubuntu ubuntu 9.6K Jul 1 18:38 COVHA-P1-D06-N.filtered.human_H5NKGDSXY_2_R1.report.json
-rw-rw-r-- 1 ubuntu ubuntu 36M Jul 1 18:38 COVHA-P1-D06-N.filtered.human_H5NKGDSXY_2_R2.fq.gz
-rw-rw-r-- 1 ubuntu ubuntu 9.6K Jul 1 18:38 COVHA-P1-D06-N.filtered.human_H5NKGDSXY_2_R2.report.json
```

The JSON outputs have the relevant RG info; the `metadata` field is the most relevant, and the early barcode field is informative.

From `COVHA-P1-D06-N.filtered.human_H5NKGDSXY_1_R1.report.json`:

```json
"metadata": {
"fastq_filename": "COVHA-P1-D06-N.filtered.human_H5NKGDSXY_1_R1.fq.gz",
"flowcell_barcode": "H5NKGDSXY",
"lane_number": 1,
"multiplex_barcode": "TGAACACC+CGCCTAGG",
"record_count": 494598
}
```

From `COVHA-P1-D06-N.filtered.human_H5NKGDSXY_2_R1.report.json`:

```json
"metadata": {
"fastq_filename": "COVHA-P1-D06-N.filtered.human_H5NKGDSXY_2_R1.fq.gz",
"flowcell_barcode": "H5NKGDSXY",
"lane_number": 2,
"multiplex_barcode": "TGAACACC+CGCCTAGG",
"record_count": 488895
}
```

Recommended RGs based on these two files:

```text
@RG ID:H5NKGDSXY.1 PL:ILLUMINA PU:H5NKGDSXY.1.TGAACACC+CGCCTAGG LB:COVHA-P1-D06-N.filtered.human SM:COVHA-P1-D06-N.filtered.human
@RG ID:H5NKGDSXY.2 PL:ILLUMINA PU:H5NKGDSXY.2.TGAACACC+CGCCTAGG LB:COVHA-P1-D06-N.filtered.human SM:COVHA-P1-D06-N.filtered.human
```
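These lines can also be derived straight from the report JSONs; a quick sketch with `jq` (assuming `jq` is installed, and hard-coding the sample basename for LB/SM):

```sh
# one @RG line per lane report; PU is flowcell.lane.barcode per the SAM spec
for rpt in COVHA-P1-D06-N.filtered.human_H5NKGDSXY_*_R1.report.json; do
  jq -r '.metadata | "@RG\tID:\(.flowcell_barcode).\(.lane_number)\tPL:ILLUMINA\tPU:\(.flowcell_barcode).\(.lane_number).\(.multiplex_barcode)\tLB:COVHA-P1-D06-N.filtered.human\tSM:COVHA-P1-D06-N.filtered.human"' "$rpt"
done
```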

# Updated STAR Command, based on [Google Drive doc](https://drive.google.com/file/d/1z7euPSE77jbthKi8ty7dscq-oNx4wgF3/view?usp=sharing) following instructions from [STAR manual](https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf), page 8, section 3.2 (the `--outSAMattributes` list below is equivalent to `All`):

```sh
STAR --twopassMode Basic \
--limitBAMsortRAM 105000000000 \
--genomeDir /path/to/STAR/genome/directory \
--outSAMunmapped Within \
--outFilterType BySJout \
--outSAMattributes NH HI AS nM NM MD jM jI MC ch \
--outSAMattrRGline ID:H5NKGDSXY.1 PL:ILLUMINA PU:H5NKGDSXY.1.TGAACACC+CGCCTAGG LB:COVHA-P1-D06-N.filtered.human SM:COVHA-P1-D06-N.filtered.human , ID:H5NKGDSXY.2 PL:ILLUMINA PU:H5NKGDSXY.2.TGAACACC+CGCCTAGG LB:COVHA-P1-D06-N.filtered.human SM:COVHA-P1-D06-N.filtered.human \
--outFilterMultimapNmax 20 \
--outFilterMismatchNmax 999 \
--outFilterMismatchNoverReadLmax 0.04 \
--alignIntronMin 20 \
--alignIntronMax 1000000 \
--alignMatesGapMax 1000000 \
--alignSJoverhangMin 8 \
--alignSJDBoverhangMin 1 \
--sjdbScore 1 \
--readFilesCommand zcat \
--runThreadN 24 \
--chimOutType Junctions \
--chimOutJunctionFormat 1 \
--chimSegmentMin 20 \
--outSAMtype BAM SortedByCoordinate \
--quantMode TranscriptomeSAM GeneCounts \
--outSAMheaderHD @HD VN:1.4 SO:coordinate \
--outFileNamePrefix /path/to/output/directory/${sample} \
--readFilesIn COVHA-P1-D06-N.filtered.human_H5NKGDSXY_1_R1.fq.gz,COVHA-P1-D06-N.filtered.human_H5NKGDSXY_2_R1.fq.gz COVHA-P1-D06-N.filtered.human_H5NKGDSXY_1_R2.fq.gz,COVHA-P1-D06-N.filtered.human_H5NKGDSXY_2_R2.fq.gz
```
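Once alignment finishes, a quick header check confirms both read groups made it into the BAM (a sketch assuming `samtools` is available and the same `${sample}` prefix as above):

```sh
# STAR appends "Aligned.sortedByCoord.out.bam" to --outFileNamePrefix
samtools view -H /path/to/output/directory/${sample}Aligned.sortedByCoord.out.bam | grep '^@RG'
```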
@@ -0,0 +1,145 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sevenbridges as sbg\n",
"from sevenbridges.errors import SbgError\n",
"from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper\n",
"import sys\n",
"import re\n",
"import pdb\n",
"import concurrent.futures\n",
"from requests import request\n",
"import json\n",
"config = sbg.Config(profile='turbo')\n",
"api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"project = \"kfdrc-harmonization/kf-reference-pipeline\"\n",
"task_id = \"731be06c-87b2-45d0-aed9-f55330240462\"\n",
"out_file = open(\"/Users/brownm28/Documents/playground/strelka2_subwf.tsv\", \"w\")\n",
"# task_id = \"3c20cc8e-18d7-43f2-bc2c-4a76d38a88f8\"\n",
"task = api.tasks.get(task_id)\n",
"jobs = {}\n",
"temp = {}\n",
"for job in task.get_execution_details().jobs:\n",
" if job.status == \"COMPLETED\":\n",
" check = job.name.split('_')\n",
" cmd = job.command_line\n",
" if job.command_line == None:\n",
" # pdb.set_trace()\n",
" cmd = \"embedded script or task retry\"\n",
" sys.stderr.write(\"WARN: Job \" + job.name + \" had null cmd\\n\")\n",
" if check[-1] == \"s\":\n",
" key = \"_\".join(check[:-2])\n",
" if key not in temp:\n",
" jobs[job.start_time] = {}\n",
" jobs[job.start_time][key] = cmd\n",
" temp[key] = 1\n",
" else:\n",
" temp[key] += 1\n",
" else:\n",
" jobs[job.start_time] = {}\n",
" jobs[job.start_time][job.name] = cmd\n",
"out_file.write(\"Step\\tType\\tNum scatter\\tCommand\\n\")\n",
"for rtime in sorted(jobs.keys()):\n",
" for key in jobs[rtime]:\n",
" rtype = \"run step\"\n",
" sct = \"NA\"\n",
" if key in temp and temp[key] > 1:\n",
" rtype = \"scatter\"\n",
" sct = str(temp[key])\n",
" cmds = jobs[rtime][key].split('\\n')\n",
" for cmd in cmds:\n",
" out_file.write(key + \"\\t\" + rtype + \"\\t\" + sct + \"\\t\" + cmd + \"\\n\")\n",
"out_file.close()\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"\n",
"# max desired col width\n",
"max_w = 200\n",
"tsv_in = open(\"/Users/brownm28/Documents/playground/strelka2_subwf.tsv\")\n",
"out_md = open(\"/Users/brownm28/Documents/playground/strelka2_subwf.md\", \"w\")\n",
"data = []\n",
"max_len = []\n",
"\n",
"\n",
"for line in tsv_in:\n",
" info = line.rstrip('\\n').split('\\t')\n",
" data.append(info)\n",
" if len(max_len) == 0:\n",
" for item in info:\n",
" max_len.append(len(item))\n",
" else:\n",
" for i in range(len(max_len)):\n",
" if len(info[i]) > max_w:\n",
" max_len[i] = max_w\n",
" elif len(info[i]) > max_len[i]:\n",
" max_len[i] = len(info[i])\n",
"# print header first\n",
"d_ct = []\n",
"for i in range(len(data[0])):\n",
" d_ct.append(len(data[0][i]))\n",
" out_md.write(\" | \" + data[0][i] + \"\".join([\" \"] * max_len[i]))\n",
" d_ct[i] += max_len[i]\n",
"out_md.write(\" |\\n\")\n",
"for i in range(len(data[0])):\n",
" out_md.write(\" | \" + \"\".join([\"-\"] * d_ct[i]))\n",
"out_md.write(\" |\\n\")\n",
"# pdb.set_trace()\n",
"for i in range(1, len(data), 1):\n",
" for j in range(len(data[i])):\n",
" d_ct = len(data[i][j]) + 2\n",
" out_md.write(\" | \" + data[i][j] + \"\".join([\" \"] * max_len[j]))\n",
" d_ct += max_len[j]\n",
" out_md.write(\" |\\n\")\n",
"out_md.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.7"
}
},
"nbformat": 4,
"nbformat_minor": 4
}