Automatically infer sex if unknown (genomic-medicine-sweden#148)

fellen31 · May 17, 2024 · 55b319f · 55b319f
1 parent f374bc9
commit 55b319f
Show file tree

Hide file tree

Showing 21 changed files with 521 additions and 45 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,13 +7,27 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### `Added`
 
+- Automatically infer sex if unknown [#148](https://github.com/genomic-medicine-sweden/nallo/pull/148)
+- Add read group tag to aligned BAM [#148](https://github.com/genomic-medicine-sweden/nallo/pull/148)
+
 ### `Changed`
 
 - Template merge for nf-core/tools v2.14.1 [#146](https://github.com/genomic-medicine-sweden/nallo/pull/146)
 - Bump to new dev version [#145](https://github.com/genomic-medicine-sweden/nallo/pull/145)
 
 ### `Fixed`
 
+### Parameters
+
+| Old parameter | New parameter      |
+| ------------- | ------------------ |
+|               | `--somalier_sites` |
+
+> [!NOTE]
+> Parameter has been updated if both old and new parameter information is present.
+> Parameter has been added if just the new parameter information is present.
+> Parameter has been removed if new parameter information isn't present.
+
 ## v0.1.0 - [2024-05-08]
 
 Initial release of genomic-medicine-sweden/nallo, created with the [nf-core](https://nf-co.re/) template.

diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv
@@ -1,3 +1,3 @@
 sample,file,family_id,paternal_id,maternal_id,sex,phenotype
-sample_1,/path/to/fastq_or_bam/files/sample_1.fastq.gz,FAM,PAT,MAT,1,1
+sample_1,/path/to/fastq_or_bam/files/sample_1.fastq.gz,FAM,PAT,MAT,0,1
 sample_2,/path/to/fastq_or_bam/files/sample_2.bam,FAM,PAT,MAT,1,1
diff --git a/assets/schema_input.json b/assets/schema_input.json
@@ -39,8 +39,8 @@
             },
             "sex": {
                 "type": "integer",
-                "enum": [1, 2],
-                "errorMessage": "Sex must be provided and cannot contain spaces",
+                "enum": [0, 1, 2],
+                "errorMessage": "Sex must be provided as 0 (missing), 1 (male) or 2 (female).",
                 "meta": ["sex"]
             },
             "phenotype": {

diff --git a/conf/modules/align_reads.config b/conf/modules/align_reads.config
@@ -30,9 +30,21 @@ process {
 
     withName: '.*:ALIGN_READS:MINIMAP2_ALIGN_UNSPLIT' {
         if(params.preset == 'revio' | params.preset == 'pacbio') {
-            ext.args = "-y -x map-hifi --secondary=no -Y"
-        } else if(params.preset == 'ONT_R9' | params.preset == 'ONT_R10') {
-            ext.args = "-y -x map-ont --secondary=no -Y"
+            ext.args = { [
+                "-y",
+                "-x map-hifi",
+                "--secondary=no",
+                "-Y",
+                "-R @RG\\\\tID:${meta.id}\\\\tSM:${meta.id}"
+            ].join(' ') }
+        } else if(params.preset == 'ONT_R10') {
+            ext.args = { [
+                "-y",
+                "-x map-ont",
+                "--secondary=no",
+                "-Y",
+                "-R @RG\\\\tID:${meta.id}\\\\tSM:${meta.id}"
+            ].join(' ') }
         }
 
         publishDir = [
@@ -53,9 +65,21 @@ process {
 
     withName: '.*:ALIGN_READS:MINIMAP2_ALIGN_SPLIT' {
         if(params.preset == 'revio' | params.preset == 'pacbio') {
-            ext.args = "-y -x map-hifi --secondary=no -Y"
-        } else if(params.preset == 'ONT_R9' | params.preset == 'ONT_R10') {
-            ext.args = "-y -x map-ont --secondary=no -Y"
+            ext.args = { [
+                "-y",
+                "-x map-hifi",
+                "--secondary=no",
+                "-Y",
+                "-R @RG\\\\tID:${meta.id}\\\\tSM:${meta.id}"
+            ].join(' ') }
+        } else if(params.preset == 'ONT_R10') {
+            ext.args = { [
+                "-y",
+                "-x map-ont",
+                "--secondary=no",
+                "-Y",
+                "-R @RG\\\\tID:${meta.id}\\\\tSM:${meta.id}"
+            ].join(' ') }
         }
     }
 

diff --git a/conf/modules/bam_infer_sex.config b/conf/modules/bam_infer_sex.config
@@ -0,0 +1,37 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Config file for defining DSL2 per module options and publishing paths
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Available keys to override module options:
+        ext.args   = Additional arguments appended to command in module.
+        ext.args2  = Second set of arguments appended to command in module (multi-tool modules).
+        ext.args3  = Third set of arguments appended to command in module (multi-tool modules).
+        ext.prefix = File name prefix for output files.
+----------------------------------------------------------------------------------------
+*/
+
+process {
+
+    /*
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Infer sex with somalier (extract and relate)
+    ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    */
+
+    withName: '.*:BAM_INFER_SEX:.*' {
+        publishDir = [
+            enabled: false,
+        ]
+    }
+
+    withName: '.*:BAM_INFER_SEX:SOMALIER_RELATE' {
+
+        ext.args = '--infer'
+
+        publishDir = [
+            path: { "${params.outdir}/qc_aligned_reads/somalier/relate/${meta.id}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+}
diff --git a/conf/test.config b/conf/test.config
@@ -43,6 +43,9 @@ params {
     vep_cache = "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/reference/vep_cache_test_data.tar.gz"
     snp_db    = "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/nallo/testdata/snp_dbs.csv"
 
+    // Somalier
+    somalier_sites = "https://raw.github.com/genomic-medicine-sweden/test-datasets/nallo/reference/somalier_sites.vcf.gz"
+
     parallel_snv = 3 // Create 3 parallel DeepVariant processes
     preset = "revio"
 

diff --git a/docs/output.md b/docs/output.md
@@ -32,6 +32,7 @@ This document roughly describes the output structure produced by the pipeline. T
 | &emsp;└── stats           | Directory containing statistics related to phased reads.                      |
 | pipeline_info             | Directory containing information and reports about the pipeline.              |
 | qc_aligned_reads          | Directory for quality control results of aligned reads.                       |
+| ├── somalier              | Directory containing sample control, relatedness etc. from somalier.          |
 | ├── cramino               | Directory containing QC results using the cramino tool.                       |
 | &nbsp;│&emsp;└── unphased | Directory containing unphased QC results.                                     |
 | └── mosdepth              | Directory containing QC results using the mosdepth tool.                      |

diff --git a/docs/usage.md b/docs/usage.md
@@ -59,26 +59,25 @@ You will need to create a samplesheet with information about the samples you wou
 
 It has to be a comma-separated file with 7 columns, and a header row as shown in the examples below.
 `file` can either be a gzipped-fastq file or an aligned or unaligned BAM file (BAM files will be converted to FASTQ and aligned again).
-`phenotype` is not used at the moment but still required, set it to `1`. If you don't have related samples, set `family_id`, `paternal_id` and `maternal_id` to something of your liking which is not a `sample` name.
+`phenotype` is not used at the moment but still required, set it to `1`. If you don't have related samples, `family_id` could be set to sample name, and `paternal_id` and `maternal_id` to a value that is not another `sample` name.
+
+If sex is unknown, a VCF of known polymorphic sites (e.g. [sites.hg38.vcf.gz](https://github.com/brentp/somalier/files/3412456/sites.hg38.vcf.gz)) needs to be supplied with `--somalier_sites`, from which sex will be inferred if possible.
 
 ```console
 sample,file,family_id,paternal_id,maternal_id,sex,phenotype
 HG002,/path/to/HG002.fastq.gz,FAM,HG003,HG004,1,1
 HG005,/path/to/HG005.bam,FAM,HG003,HG004,2,1
 ```
 
-| Fields                                     | Description                                                                                                |
-| ------------------------------------------ | ---------------------------------------------------------------------------------------------------------- |
-| `sample`                                   | Custom sample name, cannot contain spaces.                                                                 |
-| `file`                                     | Absolute path to gzipped FASTQ or BAM file. File has to have the extension ".fastq.gz", .fq.gz" or ".bam". |
-| `family_id`                                | "Family ID must be provided and cannot contain spaces. If no family ID is avail                            |
-| able, use the same ID as the sample.       |
-| `paternal_id`                              | Paternal ID must be provided and cannot contain spaces. If no paternal ID is a                             |
-| vailable, use any ID not in sample column. |
-| `maternal_id`                              | Maternal ID must be provided and cannot contain spaces. If no maternal ID is a                             |
-| vailable, use any ID not in sample column. |
-| `sex`                                      | Sex (1=male; 2=female).                                                                                    |
-| `phenotype`                                | Affected status of patient (0 = missing; 1=unaffected; 2=affected).                                        |
+| Fields        | Description                                                                                                               |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------- |
+| `sample`      | Custom sample name, cannot contain spaces.                                                                                |
+| `file`        | Absolute path to gzipped FASTQ or BAM file. File has to have the extension ".fastq.gz", ".fq.gz" or ".bam".               |
+| `family_id`   | Family ID must be provided and cannot contain spaces. If no family ID is available, use the same ID as the sample.        |
+| `paternal_id` | Paternal ID must be provided and cannot contain spaces. If no paternal ID is available, use any ID not in sample column.  |
+| `maternal_id` | Maternal ID must be provided and cannot contain spaces. If no maternal ID is available, use any ID not in sample column.  |
+| `sex`         | Sex (0=unknown; 1=male; 2=female).                                                                                        |
+| `phenotype`   | Affected status of patient (0 = missing; 1=unaffected; 2=affected).                                                       |
 
 An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline.
 
@@ -102,14 +101,14 @@ The typical command example above requires no additional files except the refere
 Nallo has the ability to skip certain parts of the pipeline, for example `--skip_repeat_wf`.
 Some workflows require additional files:
 
-If running without `--skip_assembly_wf`, download a BED file with PAR regions ([hg38](https://raw.githubusercontent.com/lh3/dipcall/master/data/hs38.PAR.bed)) to supply with `--dipcall_par`.
+- If running without `--skip_assembly_wf`, download a BED file with PAR regions ([hg38](https://raw.githubusercontent.com/lh3/dipcall/master/data/hs38.PAR.bed)) to supply with `--dipcall_par`.
 
 > [!NOTE]
 > Make sure chrY PAR is hard masked in reference.
 
-If running without `--skip_repeat_wf`, download a BED file with tandem repeats ([TRGT](https://github.com/PacificBiosciences/trgt/tree/main/repeats)) matching your reference genome to supply with `--trgt_repeats`.
+- If running without `--skip_repeat_wf`, download a BED file with tandem repeats ([TRGT](https://github.com/PacificBiosciences/trgt/tree/main/repeats)) matching your reference genome to supply with `--trgt_repeats`.
 
-If running without `--skip_snv_annotation`, download [VEP cache](https://ftp.ensembl.org/pub/release-110/variation/vep/homo_sapiens_vep_110_GRCh38.tar.gz) to supply with `--vep_cache` and prepare a samplesheet with annotation databases ([`echtvar encode`](https://github.com/brentp/echtvar)) to supply with `--snp_db`:
+- If running without `--skip_snv_annotation`, download [VEP cache](https://ftp.ensembl.org/pub/release-110/variation/vep/homo_sapiens_vep_110_GRCh38.tar.gz) to supply with `--vep_cache` and prepare a samplesheet with annotation databases ([`echtvar encode`](https://github.com/brentp/echtvar)) to supply with `--snp_db`:
 
 `snp_dbs.csv`
 
@@ -119,9 +118,9 @@ gnomad,/path/to/gnomad.v3.1.2.echtvar.popmax.v2.zip
 cadd,/path/to/cadd.v1.6.hg38.zip
 ```
 
-If running without `--skip_cnv_calling`, expected CN regions for your reference genome can be downloaded from [HiFiCNV GitHub](https://github.com/PacificBiosciences/HiFiCNV/tree/main/data) to supply with `--hificnv_xy`, `--hificnv_xx` (expected_cn) and `--hificnv_exclude` (excluded_regions).
+- If running without `--skip_cnv_calling`, expected CN regions for your reference genome can be downloaded from [HiFiCNV GitHub](https://github.com/PacificBiosciences/HiFiCNV/tree/main/data) to supply with `--hificnv_xy`, `--hificnv_xx` (expected_cn) and `--hificnv_exclude` (excluded_regions).
 
-If you want to include extra samples for mili-sample calling of SVs - prepare a samplesheet with .snf files from Sniffles to supply with `--extra_snfs`:
+- If you want to include extra samples for multi-sample calling of SVs - prepare a samplesheet with .snf files from Sniffles to supply with `--extra_snfs`:
 
 `extra_snfs.csv`
 
@@ -131,7 +130,7 @@ HG01123,/path/to/HG01123_sniffles.snf
 HG01124,/path/to/HG01124_sniffles.snf
 ```
 
-and for SNVs - prepare a samplesheet with gVCF files from DeepVariant to supply with `--extra_gvcfs`:
+- For SNVs - prepare a samplesheet with gVCF files from DeepVariant to supply with `--extra_gvcfs`:
 
 > [!NOTE]
 > These have to have been generated with the same version of the reference genome.
@@ -266,6 +265,7 @@ Different processes may need extra input files
 | `hificnv_xy`                       |                                                                                                                                                                                                                                                                           | `string`  |         |          |        |
 | `hificnv_xx`                       |                                                                                                                                                                                                                                                                           | `string`  |         |          |        |
 | `hificnv_exclude`                  | HiFiCNV BED file specifying regions to exclude                                                                                                                                                                                                                            | `string`  |         |          |        |
+| `somalier_sites`                   | A VCF of known polymorphic sites                                                                                                                                                                                                                                          | `string`  |         |          |        |
 | `validationFailUnrecognisedParams` | Validation of parameters fails when an unrecognised parameter is found. <details><summary>Help</summary><small>By default, when an unrecognised parameter is found, it returns a warning.</small></details>                                                               | `boolean` |         |          | True   |
 | `validationLenientMode`            | Validation of parameters in lenient mode. <details><summary>Help</summary><small>Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode).</small></details> | `boolean` |         |          | True   |
 

diff --git a/lib/CustomFunctions.groovy b/lib/CustomFunctions.groovy
@@ -0,0 +1,21 @@
+import nextflow.Nextflow
+
+class CustomFunctions {
+
+    /**
+     * Write a tab-separated pedigree (PED) file listing every distinct sample once.
+     *
+     * The file is written to "<outdir>/pipeline_info/multisample.ped". It starts with a
+     * header line, followed by one line per unique sample id (first occurrence wins).
+     * Lines are separated by '\n' and the file has no trailing newline.
+     *
+     * @param samples collection of entries whose first element is the sample meta map,
+     *                read for the keys id, family_id, paternal_id, maternal_id, sex and
+     *                phenotype. The collection is only read, never modified.
+     * @param outdir  output directory, concatenated as a string with the relative file path
+     * @return        the written pedigree file
+     */
+    public static File makePed(samples, outdir) {
+        def case_name = "multisample"
+        def outfile   = new File(outdir + "/pipeline_info/${case_name}" + '.ped')
+
+        // The pipeline_info directory may not exist yet when this is called.
+        outfile.getParentFile().mkdirs()
+
+        def lines      = [['#family_id', 'sample_id', 'father', 'mother', 'sex', 'phenotype'].join('\t')]
+        def seen_names = [] as LinkedHashSet
+
+        for (entry in samples) {
+            // Read the meta map without overwriting the caller's list element.
+            def meta = entry[0]
+            // Set.add returns false for duplicates, so each sample is written only once.
+            if (seen_names.add(meta.id)) {
+                lines << [meta.family_id, meta.id, meta.paternal_id, meta.maternal_id, meta.sex, meta.phenotype].join('\t')
+            }
+        }
+
+        // Single write instead of reopening the file for every sample.
+        outfile.text = lines.join('\n')
+        return outfile
+    }
+}
diff --git a/modules.json b/modules.json
@@ -135,6 +135,16 @@
                         "installed_by": ["modules"],
                         "patch": "modules/nf-core/sniffles/sniffles.diff"
                     },
+                    "somalier/extract": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
+                    },
+                    "somalier/relate": {
+                        "branch": "master",
+                        "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5",
+                        "installed_by": ["modules"]
+                    },
                     "tabix/bgziptabix": {
                         "branch": "master",
                         "git_sha": "5e7b1ef9a5a2d9258635bcbf70fcf37dacd1b247",

diff --git a/modules/nf-core/somalier/extract/environment.yml b/modules/nf-core/somalier/extract/environment.yml
diff --git a/modules/nf-core/somalier/extract/main.nf b/modules/nf-core/somalier/extract/main.nf