Merge pull request #28 from icgc-argo-workflows/[email protected]
RaqManzano authored Jun 16, 2022
2 parents b9dbfa1 + 3e5bf17 commit f4ede1b
Showing 12 changed files with 166 additions and 46 deletions.
70 changes: 70 additions & 0 deletions fastqc/README.md
@@ -0,0 +1,70 @@
# Nextflow Package `fastqc`
A simple `fastqc` wrapper written in `nextflow`. The outputs include the FastQC output files along with a `qc_metrics.json` file that contains the key metrics.

For a set of FASTQ/BAM files (Illumina PE or SE), run it with:
```
nextflow run main.nf --seq 'path/to/bam_or_fastq'
```

## Package development

The initial version of this package was created by the WorkFlow Package Manager CLI tool; please refer to
the [documentation](https://wfpm.readthedocs.io) for details on the development procedure, including
versioning, updating, CI testing and releasing.

## Inputs
### Required
- `seq`: input FASTQ/BAM file(s)

### Optional
- `cpus`: number of CPUs to use for running the tool
- `mem`: memory (in GB) allocated for running the tool
- `publish_dir`: directory to which outputs are published
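
For example, an invocation that sets the optional parameters explicitly might look like the following sketch (the input glob and the values shown are placeholders, not recommendations):
```
nextflow run main.nf --seq 'path/to/*.fq.gz' --cpus 2 --mem 4 --publish_dir outdir
```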

## Outputs
- `fastqc_results`: FastQC results matching the pattern `*_fastqc.{zip,html}`
- `fastqc_tar`: a gzipped tarball containing the FastQC results and the `qc_metrics.json` file

## Example Metrics in `qc_metrics.json`
```
{
"read_group_id": "C0HVY.2",
"basic_statistics": "PASS",
"per_base_sequence_quality": "PASS",
"per_tile_sequence_quality": "WARN",
"per_sequence_quality_scores": "PASS",
"per_base_sequence_content": "FAIL",
"per_sequence_gc_content": "FAIL",
"per_base_n_content": "PASS",
"sequence_length_distribution": "PASS",
"sequence_duplication_levels": "PASS",
"overrepresented_sequences": "FAIL",
"adapter_content": "PASS",
"file_name": "C0HVY.2_r1.fq.gz",
"total_sequences": "28",
"poor_sequences": "0",
"average_sequence_length": "76",
"%GC": "65",
"duplicated_percentage": "100.0"
}
```

## Usage
### Run the package directly

With inputs prepared, you should be able to run the package directly using the following command.
Please replace the params file with a real one that includes all required parameters and input files.
Example params files can be found in the `tests` folder; a minimal sketch is shown after the command below.

```
nextflow run icgc-argo-workflows/argo-qc-tools/fastqc/main.nf -r fastqc.v0.2.0 -params-file <your-params-json-file>
```
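
A minimal params file sketch, modelled on `tests/test-job-2.json` from this package (the paths are examples relative to the `tests` folder):
```
{
  "seq": [
    "input/C0HVY.2_r1.fq.gz",
    "input/C0HVY.2_r2.fq.gz"
  ],
  "publish_dir": "outdir",
  "cpus": 1,
  "mem": 0.5
}
```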

### Import the package as a dependency

To import this package into another package as a dependency, follow these steps on the importing package's side:

1. add this package's URI `github.com/icgc-argo-workflows/argo-qc-tools/[email protected]` to the `dependencies` list in the `pkg.json` file
2. run `wfpm install` to install the dependency
3. add the `include` statement in the main Nextflow script to import the dependent package from this path: `./wfpr_modules/github.com/icgc-argo-workflows/argo-qc-tools/[email protected]/main.nf` (see the sketch below)
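
A minimal sketch of that `include` statement, assuming the importing script sits at the package root and only needs this package's `fastqc` process:
```
include { fastqc } from './wfpr_modules/github.com/icgc-argo-workflows/argo-qc-tools/[email protected]/main.nf'
```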
23 changes: 17 additions & 6 deletions fastqc/main.nf
@@ -29,7 +29,7 @@
/* this block is auto-generated based on info from pkg.json where */
/* changes can be made if needed, do NOT modify this block manually */
nextflow.enable.dsl = 2
version = '0.1.0.1'
version = '0.2.0'

container = [
'ghcr.io': 'ghcr.io/icgc-argo-workflows/argo-qc-tools.fastqc'
@@ -51,26 +51,28 @@ params.publish_dir = "" // set to empty string will disable publishDir
// tool specific params go here, add / change as needed
params.seq = ""


process fastqc {
container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir
publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false

cpus params.cpus
memory "${params.mem} GB"

input: // input, make update as needed
path seq
tuple val(name), path(seq)

output: // output, make update as needed
path "${seq}.fastqc.tgz", emit: qc_tar
path "*fastqc.tgz", emit: fastqc_tar
path "output/*_fastqc.{zip,html}", emit: fastqc_results


script:
// add and initialize variables here as needed

"""
main.py \
-s ${seq} \
-n ${name} \
-t ${params.cpus}
"""
@@ -80,7 +82,16 @@ process fastqc {
// this provides an entry point for this main script, so it can be run directly without cloning the repo
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
workflow {
Channel
.fromPath(params.seq)
.map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple()
.view()
.set{ groups_ch }
fastqc(
file(params.seq)
groups_ch
)
}
75 changes: 51 additions & 24 deletions fastqc/main.py
@@ -36,6 +36,8 @@
from glob import glob
import tarfile
import io
import hashlib
import re

def run_cmd(cmd):
proc = subprocess.Popen(
@@ -61,40 +63,59 @@ def get_tool_version(toolname):

return stdout.strip().split(' ')[-1].strip('v')

def prep_qc_metrics(output_dir, tool_ver):
def prep_qc_metrics(name, output_dir, tool_ver):
qc_metrics = {
'tool': {
'name': 'FastQC',
'version': tool_ver
},
'metrics': {},
'metrics': [],
'description': 'High level sequencing reads QC metrics generated by FastQC.'
}

fastqc_zip = glob(os.path.join(output_dir, "*_fastqc.zip"))[0]
fastqc_data = os.path.join(os.path.basename(fastqc_zip).rstrip('.zip'), "summary.txt")
with ZipFile(fastqc_zip) as myzip:
with myzip.open(fastqc_data) as myfile:
with io.TextIOWrapper(myfile, encoding="utf-8") as mytext:
for line in mytext:
cols = line.rstrip().split('\t')
qc_metrics['metrics'].update({
cols[1]: cols[0]
})
for fastqc_zip in sorted(glob(os.path.join(output_dir, "*_fastqc.zip"))):
metric = {"read_group_id": name}
fastqc_summary = os.path.join(os.path.basename(fastqc_zip).rstrip('.zip'), "summary.txt")
fastqc_data = os.path.join(os.path.basename(fastqc_zip).rstrip('.zip'), "fastqc_data.txt")
with ZipFile(fastqc_zip) as myzip:
with myzip.open(fastqc_summary) as sumfile:
with io.TextIOWrapper(sumfile, encoding="utf-8") as sumtext:
for line in sumtext:
cols = line.rstrip().split('\t')
metric.update({
cols[1].lower().replace(" ", "_"): cols[0]
})
metric.update({"file_name": cols[2]})
            # get metrics from data file
with myzip.open(fastqc_data) as datafile:
with io.TextIOWrapper(datafile, encoding="utf-8") as datatext:
for line in datatext:
if line.startswith("Total Sequences"):
metric.update({"total_sequences": line.strip().split("\t")[1]})
elif line.startswith("Sequences flagged as poor quality"):
metric.update({"poor_sequences": line.strip().split("\t")[1]})
elif line.startswith("Sequence length"):
metric.update({"average_sequence_length": line.strip().split("\t")[1]})
elif line.startswith("#Total Deduplicated Percentage"):
metric.update({"duplicated_percentage": line.strip().split("\t")[1]})
elif line.startswith("%GC"):
metric.update({"%GC": line.strip().split("\t")[1]})
# End of chunk
qc_metrics['metrics'].append(metric)

qc_metrics_file = 'qc_metrics.json'
with open(qc_metrics_file, "w") as j:
j.write(json.dumps(qc_metrics, indent=2))

return qc_metrics_file

def prepare_tarball(seq, qc_metrics, output_dir):
def prepare_tarball(name, qc_metrics, output_dir):

files_to_tar = [qc_metrics]
for f in sorted(glob(output_dir+'/*')):
files_to_tar.append(f)

tarfile_name = f"{os.path.basename(seq)}.fastqc.tgz"
tarfile_name = f"{name}.fastqc.tgz"
with tarfile.open(tarfile_name, "w:gz") as tar:
for f in files_to_tar:
tar.add(f, arcname=os.path.basename(f))
@@ -106,15 +127,14 @@ def main():
"""

parser = argparse.ArgumentParser(description='Tool: fastqc')
parser.add_argument('-s', '--seq', type=str,
help='Input seq', required=True)
parser.add_argument('-t', '--threads', type=int, default=cpu_count(),
help='Number of threads')
parser.add_argument('-s', '--seq', type=str, nargs="+", help='Input seq', required=True)
parser.add_argument('-n', '--rg_id', type=str, help='Read_group name', required=True)
parser.add_argument('-t', '--threads', type=int, default=cpu_count(), help='Number of threads')
args = parser.parse_args()

if not os.path.isfile(args.seq):
sys.exit('Error: specified seq file %s does not exist or is not accessible!' % args.seq)

for fn in args.seq:
if not os.path.isfile(fn):
sys.exit('Error: specified seq file %s does not exist or is not accessible!' % fn)

# get tool version info
tool_ver = get_tool_version('fastqc')
@@ -125,21 +145,28 @@

# run fastqc
fastqc_args = [
'-q',
'-t', str(args.threads),
'-o', output_dir
]

cmd = ['fastqc'] + fastqc_args + [args.seq]
cmd = ['fastqc'] + fastqc_args + args.seq
stdout, stderr, returncode = run_cmd(" ".join(cmd))
if returncode:
sys.exit(f"Error: 'fastqc' failed.\nStdout: {stdout}\nStderr: {stderr}\n")


# parse fastqc output and put it in qc_metrics.json
qc_metrics_file = prep_qc_metrics(output_dir, tool_ver)
qc_metrics_file = prep_qc_metrics(args.rg_id, output_dir, tool_ver)

# in case the rg_id contains filename not friendly characters
friendly_rgid = "".join([ c if re.match(r"[a-zA-Z0-9\.\-_]", c) else "_" for c in args.rg_id ])
    # calculate md5 and add it to the output file name to avoid name collision
md5sum = hashlib.md5((args.rg_id).encode('utf-8')).hexdigest()
prefix_name = '.'.join([friendly_rgid, md5sum])

# prepare tarball to include output files and qc_metrics.json
prepare_tarball(args.seq, qc_metrics_file, output_dir)
prepare_tarball(prefix_name, qc_metrics_file, output_dir)

if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion fastqc/pkg.json
@@ -1,6 +1,6 @@
{
"name": "fastqc",
"version": "0.1.0.1",
"version": "0.2.0",
"description": "FastQC tool",
"main": "main.nf",
"deprecated": false,
33 changes: 21 additions & 12 deletions fastqc/tests/checker.nf
@@ -34,7 +34,7 @@
/* this block is auto-generated based on info from pkg.json where */
/* changes can be made if needed, do NOT modify this block manually */
nextflow.enable.dsl = 2
version = '0.1.0.1'
version = '0.2.0'

container = [
'ghcr.io': 'ghcr.io/icgc-argo-workflows/argo-qc-tools.fastqc'
@@ -69,10 +69,14 @@ process file_smart_diff {
mkdir output expected
tar xzf ${output_file} -C output
unzip output/*.zip -d output
for f in `ls output/*.zip`; do
unzip \$f -d output
done
tar xzf ${expected_file} -C expected
unzip expected/*.zip -d expected
for f in `ls expected/*.zip`; do
unzip \$f -d expected
done
cd output
# compare all types of files
for f in `find . -type f`; do
@@ -81,8 +85,8 @@
echo "Test FAILED, found unexpected file: \$f in the output tarball" && exit 1
fi
echo diff \$f ../expected/\$f
EFFECTIVE_DIFF=`diff <( cat \$f | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
<( cat ../expected/\$f | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
EFFECTIVE_DIFF=`diff <( cat \$f | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' | sort ) \
<( cat ../expected/\$f | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' | sort ) \
| egrep '<|>' || true`
if [ ! -z "\$EFFECTIVE_DIFF" ]
then
@@ -100,20 +104,25 @@ workflow checker {
expected_output

main:
Channel.fromPath(seq).map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple()
.set{ groups_ch }
fastqc(
seq
groups_ch
)

file_smart_diff(
fastqc.out.qc_tar,
expected_output
fastqc.out.fastqc_tar,
file(expected_output)
)
}


workflow {
checker(
file(params.seq),
file(params.expected_output)
params.seq,
params.expected_output
)
}
Binary file added fastqc/tests/input/C0HVY.2_r2.fq.gz
2 changes: 1 addition & 1 deletion fastqc/tests/test-job-1.json
@@ -1,6 +1,6 @@
{
"seq": "input/test_rg_3.bam",
"expected_output": "expected/expected.test_rg_3.bam.fastqc.tgz",
"expected_output": "expected/expected.test-job-1.fastqc.tgz",
"publish_dir": "outdir",
"cpus": 1,
"mem": 0.5
7 changes: 5 additions & 2 deletions fastqc/tests/test-job-2.json
@@ -1,6 +1,9 @@
{
"seq": "input/C0HVY.2_r1.fq.gz",
"expected_output": "expected/expected.C0HVY.2_r1.fq.gz.fastqc.tgz",
"seq": [
"input/C0HVY.2_r1.fq.gz",
"input/C0HVY.2_r2.fq.gz"
],
"expected_output": "expected/expected.test-job-2.fastqc.tgz",
"publish_dir": "outdir",
"cpus": 1,
"mem": 0.5