Merge pull request #28 from icgc-argo-workflows/[email protected]
RaqManzano authored Jun 16, 2022
2 parents b9dbfa1 + 3e5bf17 commit f4ede1b
Showing 12 changed files with 166 additions and 46 deletions.
70 changes: 70 additions & 0 deletions fastqc/README.md
@@ -0,0 +1,70 @@
# Nextflow Package `fastqc`
A simple `fastqc` wrapper written in `nextflow`. The outputs include the FastQC output files along with a `qc_metrics.json` file that contains the key metrics.

For a set of FASTQ/BAM files (Illumina PE or SE), run it with:
```
nextflow run main.nf --seq 'path/to/bam_or_fastq'
```

## Package development

The initial version of this package was created by the WorkFlow Package Manager CLI tool; please refer to
the [documentation](https://wfpm.readthedocs.io) for details on the development procedure, including
versioning, updating, CI testing and releasing.

## Inputs
### Required
- `seq`: input FASTQ/BAM file(s)

### Optional
- `cpus`: number of CPUs to use for running the tool
- `mem`: memory (in GB) allocated for running the tool
- `publish_dir`: directory to which outputs are published
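
For example, an invocation that sets the optional parameters explicitly might look like the following sketch (the input glob and the values shown are placeholders, not recommendations):
```
nextflow run main.nf --seq 'path/to/*.fq.gz' --cpus 2 --mem 4 --publish_dir outdir
```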

## Outputs
- `fastqc_results`: FastQC results matching the pattern `*_fastqc.{zip,html}`
- `fastqc_tar`: a gzipped tarball containing the FastQC results and the `qc_metrics.json` file

## Example Metrics in `qc_metrics.json`
```
{
"read_group_id": "C0HVY.2",
"basic_statistics": "PASS",
"per_base_sequence_quality": "PASS",
"per_tile_sequence_quality": "WARN",
"per_sequence_quality_scores": "PASS",
"per_base_sequence_content": "FAIL",
"per_sequence_gc_content": "FAIL",
"per_base_n_content": "PASS",
"sequence_length_distribution": "PASS",
"sequence_duplication_levels": "PASS",
"overrepresented_sequences": "FAIL",
"adapter_content": "PASS",
"file_name": "C0HVY.2_r1.fq.gz",
"total_sequences": "28",
"poor_sequences": "0",
"average_sequence_length": "76",
"%GC": "65",
"duplicated_percentage": "100.0"
}
```

## Usage
### Run the package directly

With inputs prepared, you should be able to run the package directly using the following command.
Please replace the params file with a real one that includes all required parameters and input files.
Example params files can be found in the `tests` folder; a minimal sketch is shown after the command below.

```
nextflow run icgc-argo-workflows/argo-qc-tools/fastqc/main.nf -r fastqc.v0.2.0 -params-file <your-params-json-file>
```
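
A minimal params file sketch, modelled on `tests/test-job-2.json` from this package (the paths are examples relative to the `tests` folder):
```
{
  "seq": [
    "input/C0HVY.2_r1.fq.gz",
    "input/C0HVY.2_r2.fq.gz"
  ],
  "publish_dir": "outdir",
  "cpus": 1,
  "mem": 0.5
}
```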

### Import the package as a dependency

To import this package into another package as a dependency, follow these steps on the importing package's side:

1. add this package's URI `github.com/icgc-argo-workflows/argo-qc-tools/[email protected]` to the `dependencies` list in the `pkg.json` file
2. run `wfpm install` to install the dependency
3. add the `include` statement in the main Nextflow script to import the dependent package from this path: `./wfpr_modules/github.com/icgc-argo-workflows/argo-qc-tools/[email protected]/main.nf` (see the sketch below)
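
A minimal sketch of that `include` statement, assuming the importing script sits at the package root and only needs this package's `fastqc` process:
```
include { fastqc } from './wfpr_modules/github.com/icgc-argo-workflows/argo-qc-tools/[email protected]/main.nf'
```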
23 changes: 17 additions & 6 deletions fastqc/main.nf
@@ -29,7 +29,7 @@
/* this block is auto-generated based on info from pkg.json where */
/* changes can be made if needed, do NOT modify this block manually */
nextflow.enable.dsl = 2
version = '0.1.0.1'
version = '0.2.0'

container = [
'ghcr.io': 'ghcr.io/icgc-argo-workflows/argo-qc-tools.fastqc'
@@ -51,26 +51,28 @@ params.publish_dir = "" // set to empty string will disable publishDir
// tool specific params go here, add / change as needed
params.seq = ""


process fastqc {
container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir
publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false

cpus params.cpus
memory "${params.mem} GB"

input: // input, make update as needed
path seq
tuple val(name), path(seq)

output: // output, make update as needed
path "${seq}.fastqc.tgz", emit: qc_tar
path "*fastqc.tgz", emit: fastqc_tar
path "output/*_fastqc.{zip,html}", emit: fastqc_results


script:
// add and initialize variables here as needed

"""
main.py \
-s ${seq} \
-n ${name} \
-t ${params.cpus}
"""
@@ -80,7 +82,16 @@ process fastqc {
// this provides an entry point for this main script, so it can be run directly without cloning the repo
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
workflow {
Channel
.fromPath(params.seq)
.map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple()
.view()
.set{ groups_ch }
fastqc(
file(params.seq)
groups_ch
)
}
75 changes: 51 additions & 24 deletions fastqc/main.py
@@ -36,6 +36,8 @@
from glob import glob
import tarfile
import io
import hashlib
import re

def run_cmd(cmd):
proc = subprocess.Popen(
@@ -61,40 +63,59 @@ def get_tool_version(toolname):

return stdout.strip().split(' ')[-1].strip('v')

def prep_qc_metrics(output_dir, tool_ver):
def prep_qc_metrics(name, output_dir, tool_ver):
qc_metrics = {
'tool': {
'name': 'FastQC',
'version': tool_ver
},
'metrics': {},
'metrics': [],
'description': 'High level sequencing reads QC metrics generated by FastQC.'
}

fastqc_zip = glob(os.path.join(output_dir, "*_fastqc.zip"))[0]
fastqc_data = os.path.join(os.path.basename(fastqc_zip).rstrip('.zip'), "summary.txt")
with ZipFile(fastqc_zip) as myzip:
with myzip.open(fastqc_data) as myfile:
with io.TextIOWrapper(myfile, encoding="utf-8") as mytext:
for line in mytext:
cols = line.rstrip().split('\t')
qc_metrics['metrics'].update({
cols[1]: cols[0]
})
for fastqc_zip in sorted(glob(os.path.join(output_dir, "*_fastqc.zip"))):
metric = {"read_group_id": name}
fastqc_summary = os.path.join(os.path.basename(fastqc_zip).rstrip('.zip'), "summary.txt")
fastqc_data = os.path.join(os.path.basename(fastqc_zip).rstrip('.zip'), "fastqc_data.txt")
with ZipFile(fastqc_zip) as myzip:
with myzip.open(fastqc_summary) as sumfile:
with io.TextIOWrapper(sumfile, encoding="utf-8") as sumtext:
for line in sumtext:
cols = line.rstrip().split('\t')
metric.update({
cols[1].lower().replace(" ", "_"): cols[0]
})
metric.update({"file_name": cols[2]})
            # get metrics from data file
with myzip.open(fastqc_data) as datafile:
with io.TextIOWrapper(datafile, encoding="utf-8") as datatext:
for line in datatext:
if line.startswith("Total Sequences"):
metric.update({"total_sequences": line.strip().split("\t")[1]})
elif line.startswith("Sequences flagged as poor quality"):
metric.update({"poor_sequences": line.strip().split("\t")[1]})
elif line.startswith("Sequence length"):
metric.update({"average_sequence_length": line.strip().split("\t")[1]})
elif line.startswith("#Total Deduplicated Percentage"):
metric.update({"duplicated_percentage": line.strip().split("\t")[1]})
elif line.startswith("%GC"):
metric.update({"%GC": line.strip().split("\t")[1]})
# End of chunk
qc_metrics['metrics'].append(metric)

qc_metrics_file = 'qc_metrics.json'
with open(qc_metrics_file, "w") as j:
j.write(json.dumps(qc_metrics, indent=2))

return qc_metrics_file

def prepare_tarball(seq, qc_metrics, output_dir):
def prepare_tarball(name, qc_metrics, output_dir):

files_to_tar = [qc_metrics]
for f in sorted(glob(output_dir+'/*')):
files_to_tar.append(f)

tarfile_name = f"{os.path.basename(seq)}.fastqc.tgz"
tarfile_name = f"{name}.fastqc.tgz"
with tarfile.open(tarfile_name, "w:gz") as tar:
for f in files_to_tar:
tar.add(f, arcname=os.path.basename(f))
@@ -106,15 +127,14 @@ def main():
"""

parser = argparse.ArgumentParser(description='Tool: fastqc')
parser.add_argument('-s', '--seq', type=str,
help='Input seq', required=True)
parser.add_argument('-t', '--threads', type=int, default=cpu_count(),
help='Number of threads')
parser.add_argument('-s', '--seq', type=str, nargs="+", help='Input seq', required=True)
parser.add_argument('-n', '--rg_id', type=str, help='Read_group name', required=True)
parser.add_argument('-t', '--threads', type=int, default=cpu_count(), help='Number of threads')
args = parser.parse_args()

if not os.path.isfile(args.seq):
sys.exit('Error: specified seq file %s does not exist or is not accessible!' % args.seq)

for fn in args.seq:
if not os.path.isfile(fn):
sys.exit('Error: specified seq file %s does not exist or is not accessible!' % fn)

# get tool version info
tool_ver = get_tool_version('fastqc')
@@ -125,21 +145,28 @@

# run fastqc
fastqc_args = [
'-q',
'-t', str(args.threads),
'-o', output_dir
]

cmd = ['fastqc'] + fastqc_args + [args.seq]
cmd = ['fastqc'] + fastqc_args + args.seq
stdout, stderr, returncode = run_cmd(" ".join(cmd))
if returncode:
sys.exit(f"Error: 'fastqc' failed.\nStdout: {stdout}\nStderr: {stderr}\n")


# parse fastqc output and put it in qc_metrics.json
qc_metrics_file = prep_qc_metrics(output_dir, tool_ver)
qc_metrics_file = prep_qc_metrics(args.rg_id, output_dir, tool_ver)

# in case the rg_id contains filename not friendly characters
friendly_rgid = "".join([ c if re.match(r"[a-zA-Z0-9\.\-_]", c) else "_" for c in args.rg_id ])
    # calculate md5 and add it to the output file name to avoid name collision
md5sum = hashlib.md5((args.rg_id).encode('utf-8')).hexdigest()
prefix_name = '.'.join([friendly_rgid, md5sum])

# prepare tarball to include output files and qc_metrics.json
prepare_tarball(args.seq, qc_metrics_file, output_dir)
prepare_tarball(prefix_name, qc_metrics_file, output_dir)

if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion fastqc/pkg.json
@@ -1,6 +1,6 @@
{
"name": "fastqc",
"version": "0.1.0.1",
"version": "0.2.0",
"description": "FastQC tool",
"main": "main.nf",
"deprecated": false,
33 changes: 21 additions & 12 deletions fastqc/tests/checker.nf
@@ -34,7 +34,7 @@
/* this block is auto-generated based on info from pkg.json where */
/* changes can be made if needed, do NOT modify this block manually */
nextflow.enable.dsl = 2
version = '0.1.0.1'
version = '0.2.0'

container = [
'ghcr.io': 'ghcr.io/icgc-argo-workflows/argo-qc-tools.fastqc'
@@ -69,10 +69,14 @@ process file_smart_diff {
mkdir output expected
tar xzf ${output_file} -C output
unzip output/*.zip -d output
for f in `ls output/*.zip`; do
unzip \$f -d output
done
tar xzf ${expected_file} -C expected
unzip expected/*.zip -d expected
for f in `ls expected/*.zip`; do
unzip \$f -d expected
done
cd output
# compare all types of files
for f in `find . -type f`; do
@@ -81,8 +85,8 @@
echo "Test FAILED, found unexpected file: \$f in the output tarball" && exit 1
fi
echo diff \$f ../expected/\$f
EFFECTIVE_DIFF=`diff <( cat \$f | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
<( cat ../expected/\$f | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
EFFECTIVE_DIFF=`diff <( cat \$f | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' | sort ) \
<( cat ../expected/\$f | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' | sort ) \
| egrep '<|>' || true`
if [ ! -z "\$EFFECTIVE_DIFF" ]
then
@@ -100,20 +104,25 @@ workflow checker {
expected_output

main:
Channel.fromPath(seq).map { file ->
def key = file.name.toString().tokenize('_').get(0)
return tuple(key, file)
}
.groupTuple()
.set{ groups_ch }
fastqc(
seq
groups_ch
)

file_smart_diff(
fastqc.out.qc_tar,
expected_output
fastqc.out.fastqc_tar,
file(expected_output)
)
}


workflow {
checker(
file(params.seq),
file(params.expected_output)
params.seq,
params.expected_output
)
}
Binary file added fastqc/tests/input/C0HVY.2_r2.fq.gz
2 changes: 1 addition & 1 deletion fastqc/tests/test-job-1.json
@@ -1,6 +1,6 @@
{
"seq": "input/test_rg_3.bam",
"expected_output": "expected/expected.test_rg_3.bam.fastqc.tgz",
"expected_output": "expected/expected.test-job-1.fastqc.tgz",
"publish_dir": "outdir",
"cpus": 1,
"mem": 0.5
7 changes: 5 additions & 2 deletions fastqc/tests/test-job-2.json
@@ -1,6 +1,9 @@
{
"seq": "input/C0HVY.2_r1.fq.gz",
"expected_output": "expected/expected.C0HVY.2_r1.fq.gz.fastqc.tgz",
"seq": [
"input/C0HVY.2_r1.fq.gz",
"input/C0HVY.2_r2.fq.gz"
],
"expected_output": "expected/expected.test-job-2.fastqc.tgz",
"publish_dir": "outdir",
"cpus": 1,
"mem": 0.5