From 55bc77d8c563cab8721de878e2a4fbd398a887b1 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 11 Dec 2023 14:29:49 -0600 Subject: [PATCH 1/6] fix sanity check --- sandbox/anvi-script-filter-hmm-hits-table | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/sandbox/anvi-script-filter-hmm-hits-table b/sandbox/anvi-script-filter-hmm-hits-table index c09b6a88b4..357b1d19d8 100755 --- a/sandbox/anvi-script-filter-hmm-hits-table +++ b/sandbox/anvi-script-filter-hmm-hits-table @@ -109,7 +109,7 @@ class FilterHmmHitsTable(object): filesnpaths.is_file_exists(self.domtblout) self.run.info("Domtblout Path", self.domtblout) - if not self.min_model_coverage or self.min_gene_coverage: + if not self.min_model_coverage and not self.min_gene_coverage: raise ConfigError("You didn't provide anvi-script-filter-hmm-hits-table with either a " "--min-model-coverage or --min-gene-coverage. Please provide at least one " "so anvi'o can filter hmm_hits for you :)") @@ -130,6 +130,15 @@ class FilterHmmHitsTable(object): raise ConfigError(f"--min-gene-coverage must be a percentage between 0 and 1 " f"and you put a a value larger than 100%: {self.min_gene_coverage}") + if self.min_gene_coverage and self.min_model_coverage: + self.run.info("Minimum gene coverage", self.min_gene_coverage) + self.run.info("Minimum model coverage", self.min_model_coverage) + else: + if self.min_gene_coverage: + self.run.info("Minimum gene coverage", self.min_gene_coverage) + if self.min_model_coverage: + self.run.info("Minimum model coverage", self.min_model_coverage) + if not self.hmm_source: raise ConfigError("Please provide a hmm-source :)") From 157a5fbf812d87061de01afab6a6d29983c84224 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 11 Dec 2023 14:30:02 -0600 Subject: [PATCH 2/6] update component test --- anvio/tests/run_component_tests_for_metagenomics.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git 
a/anvio/tests/run_component_tests_for_metagenomics.sh b/anvio/tests/run_component_tests_for_metagenomics.sh index 6e93f56de3..16b8852ec2 100755 --- a/anvio/tests/run_component_tests_for_metagenomics.sh +++ b/anvio/tests/run_component_tests_for_metagenomics.sh @@ -173,7 +173,7 @@ anvi-run-hmms -c $output_dir/CONTIGS.db \ --no-progress \ $thread_controller -INFO "Filtering hmm_hits using target coverage" +INFO "Filtering hmm_hits using query coverage" anvi-script-filter-hmm-hits-table -c $output_dir/CONTIGS.db \ --domain-hits-table $output_dir/hmm.domtable \ --hmm-source Bacteria_71 \ @@ -181,6 +181,14 @@ anvi-script-filter-hmm-hits-table -c $output_dir/CONTIGS.db \ --no-progress \ --filter-out-partial-gene-calls +INFO "Filtering hmm_hits using target coverage" +anvi-script-filter-hmm-hits-table -c $output_dir/CONTIGS.db \ + --domain-hits-table $output_dir/hmm.domtable \ + --hmm-source Bacteria_71 \ + --min-gene-coverage 0.5 \ + --no-progress \ + --filter-out-partial-gene-calls + INFO "Listing all available HMM sources in the contigs database" anvi-delete-hmms -c $output_dir/CONTIGS.db \ --list \ From 4770fdd66e08615afe567962c4867d3da68288b3 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 11 Dec 2023 14:42:09 -0600 Subject: [PATCH 3/6] update ecophylo docs with HMM coverage suggestions --- anvio/docs/workflows/ecophylo.md | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/anvio/docs/workflows/ecophylo.md b/anvio/docs/workflows/ecophylo.md index a2634042d5..af479feb3c 100644 --- a/anvio/docs/workflows/ecophylo.md +++ b/anvio/docs/workflows/ecophylo.md @@ -28,11 +28,33 @@ anvi-run-workflow -w ecophylo \ ### HMM alignment coverage filtering -The first step to removing bad %(hmm-hits)s is to filter out hits with low quality alignment coverage. This is done with the rule `filter_hmm_hits_by_model_coverage` which leverages %(anvi-script-filter-hmm-hits-table)s. We recommend 80%% model coverage filter for most cases. 
However, it is always recommended to explore the distribution of model coverage with any new HMM which will help you determine a proper cutoff (citation). To adjust this parameter, go to the `filter_hmm_hits_by_model_coverage` rule and change the parameter `--min-model-coverage`. +The first step to removing bad %(hmm-hits)s is to filter out hits with low quality alignment coverage. This is done with the rule `filter_hmm_hits_by_model_coverage` which leverages %(anvi-script-filter-hmm-hits-table)s. This tool uses the output of hmmsearch to filter out hits based on the model and/or gene coverage. We recommend 80%% model coverage filter for most cases. However, it is always recommended to explore the distribution of model coverage with any new HMM which will help you determine a proper cutoff (citation). To adjust this parameter, go to the `filter_hmm_hits_by_model_coverage` rule and change the parameter `--min-model-coverage`. You can also adjust the gene coverage by changing the parameter `--min-gene-coverage`. This can help remove ORFs with outlier lengths but completely depends on the HMM you are using. + +{:.notice} +Please consider exploring the distribution of alignment coverage values before choosing HMM alignment coverage filtering values. [Interproscan](https://www.ebi.ac.uk/interpro/) is a great way to visualize how publicly available HMMs align to proteins. Additionally, you can parse the domaintblout files from hmmsearch to explore these values in high throughput. + +```bash +{ + "filter_hmm_hits_by_model_coverage": { + "threads": 5, + "--min-model-coverage": 0.8, + "--min-gene-coverage": 0.5, + "additional_params": "" + }, +} +``` {:.notice} Some full gene length HMM models align to a single hmm-hit independently at different coordinates when there should only be one annotation. 
To merge these independent alignment into one HMM alignment coverage stat, set `--merge-partial-hits-within-X-nts` to any distance between the hits for which you would like to merge and add it to the rule `filter_hmm_hits_by_model_coverage` under `additional_params`. +```bash +{ + "filter_hmm_hits_by_model_coverage": { + "additional_params": "--merge-partial-hits-within-X-nts" + }, +} +``` + ### conservative-mode: complete open-reading frames only Genes predicted from genomes and metagenomes can be partial or complete depending on whether a stop and stop codon is detected. Even if you filter out %(hmm-hits)s with bad alignment coverage as discussed above, HMMs can still detect low quality hits with good alignment coverage and homology statistics due to partial genes. Unfortunately, partial genes can lead to spurious phylogenetic branches and/or inflate the number of observed populations or functions in a given set of genomes/metagenomes. From 6baff86b60c748944ba32f301207c2c604b30db6 Mon Sep 17 00:00:00 2001 From: mschechter Date: Mon, 11 Dec 2023 16:38:48 -0600 Subject: [PATCH 4/6] fix --- anvio/docs/workflows/ecophylo.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anvio/docs/workflows/ecophylo.md b/anvio/docs/workflows/ecophylo.md index af479feb3c..e97993cd79 100644 --- a/anvio/docs/workflows/ecophylo.md +++ b/anvio/docs/workflows/ecophylo.md @@ -31,7 +31,7 @@ anvi-run-workflow -w ecophylo \ The first step to removing bad %(hmm-hits)s is to filter out hits with low quality alignment coverage. This is done with the rule `filter_hmm_hits_by_model_coverage` which leverages %(anvi-script-filter-hmm-hits-table)s. This tool uses the output of hmmsearch to filter out hits based on the model and/or gene coverage. We recommend 80%% model coverage filter for most cases. However, it is always recommended to explore the distribution of model coverage with any new HMM which will help you determine a proper cutoff (citation). 
To adjust this parameter, go to the `filter_hmm_hits_by_model_coverage` rule and change the parameter `--min-model-coverage`. You can also adjust the gene coverage by changing the parameter `--min-gene-coverage`. This can help remove ORFs with outlier lengths but completely depends on the HMM you are using. {:.notice} -Please consider exploring the distribution of alignment coverage values before choosing HMM alignment coverage filtering values. [Interproscan](https://www.ebi.ac.uk/interpro/) is a great way to visualize how publicly available HMMs align to proteins. Additionally, you can parse the domaintblout files from hmmsearch to explore these values in high throughput. +Please consider exploring the distribution of alignment coverages before choosing HMM alignment coverage filtering values. [Interproscan](https://www.ebi.ac.uk/interpro/) is a great way to visualize how publicly available HMMs align to proteins. Additionally, you can parse the domtblout files from hmmsearch to explore these values in high throughput. ```bash { From 3d673e8dfb91bd135c4d0ad286a9cd8d5f125248 Mon Sep 17 00:00:00 2001 From: mschechter Date: Tue, 12 Dec 2023 09:24:49 -0600 Subject: [PATCH 5/6] update --- .../programs/anvi-script-filter-hmm-hits-table.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/anvio/docs/programs/anvi-script-filter-hmm-hits-table.md b/anvio/docs/programs/anvi-script-filter-hmm-hits-table.md index 068fc3b2c8..fd275bc385 100644 --- a/anvio/docs/programs/anvi-script-filter-hmm-hits-table.md +++ b/anvio/docs/programs/anvi-script-filter-hmm-hits-table.md @@ -2,7 +2,7 @@ This program allows you to remove low quality HMM alignments from a %(hmm-source ## Filter with HMM alignment parameters -Similar to query coverage in BLAST, we can also use HMM alignment coverage to help determine if an hmm-hit is homologous. A small coverage value means only a small proportion of the query/target is aligning. 
Before anvi'o can filter out %(hmm-hits)s with alignment coverage, you must run %(anvi-run-hmms)s and report a domain hits table by including `--domain-hits-table` flag in your command: +Similar to query coverage in BLAST, we can also use HMM alignment coverage to help determine if an hmm-hit is homologous. A small alignment coverage value means only a small proportion of the query/target is aligning. Before anvi'o can filter out %(hmm-hits)s with alignment coverage, you must run %(anvi-run-hmms)s and report a domain hits table by including `--domain-hits-table` flag in your command. This will write the [domtblout](http://eddylab.org/software/hmmer3/3.1b2/Userguide.pdf) file from hmmsearch: {{ codestart }} anvi-run-hmms -c %(contigs-db)s \ @@ -11,15 +11,16 @@ anvi-run-hmms -c %(contigs-db)s \ --domain-hits-table {{ codestop }} -After the command above, your HMM hits will be stored in your %(contigs-db)s as usual. However, with the domain hits table, you can filter out hits from your %(contigs-db)s using thresholds for model or gene coverage of each hit i.e. you can filter out %(hmm-hits)s where the profile HMM and gene align well to each other. +After the command above, your %(hmm-hits)s will be stored in your %(contigs-db)s as usual. However, with the domain hits table, you can filter out hits from your %(contigs-db)s using thresholds for `--min-model-coverage` or `--min-gene-coverage` of each hit i.e. you can filter out %(hmm-hits)s where the profile HMM and gene align well to each other. 
-For example, following the command above, the command below will remove %(hmm-hits)s from your %(contigs-db)s for profile HMMs that had less than 90%% coverage of the target genes: +For example, following the command above, the command below will remove %(hmm-hits)s from your %(contigs-db)s for profile HMMs that had less than 90%% model coverage and 50%% gene coverage: {{ codestart }} anvi-script-filter-hmm-hits-table -c %(contigs-db)s \ --hmm-source Bacteria_71 \ --domain-hits-table path/to/dir/hmm.domtable \ - --min-model-coverage 0.9 + --min-model-coverage 0.9 \ + --min-gene-coverage 0.5 {{ codestop }} ### HMMs with multiple hits to one gene @@ -39,7 +40,7 @@ The input domtblout file for %(anvi-script-filter-hmm-hits-table)s will be saved ## Filter out hmm-hits from partial genes -HMMs are able to detect partial genes (i.e., genes that are not partial and that start with a start codon and end with a stop codon) with good alignment coverage and homology statistics. However, partial genes can lead to spurious phylogenetic branches and/or inflate the number of observed populations or functions in a given set of genomes/metagenomes. Using `--filter-out-partial-gene-calls`, you can remove partial gene hmm-hits. +HMMs are able to detect partial genes (i.e., genes that do not contain start and/or stop codons) with good alignment coverage and homology statistics. However, partial genes can lead to spurious phylogenetic branches and/or inflate the number of observed populations or functions in a given set of genomes/metagenomes. Using `--filter-out-partial-gene-calls`, you can remove partial gene hmm-hits. 
{{ codestart }} anvi-script-filter-hmm-hits-table -c %(contigs-db)s \ From 523423011feb1d8f9bc807a86b48f5ddc1c4b24a Mon Sep 17 00:00:00 2001 From: mschechter Date: Tue, 12 Dec 2023 10:07:08 -0600 Subject: [PATCH 6/6] more clear --- anvio/docs/programs/anvi-script-filter-hmm-hits-table.md | 1 - 1 file changed, 1 deletion(-) diff --git a/anvio/docs/programs/anvi-script-filter-hmm-hits-table.md b/anvio/docs/programs/anvi-script-filter-hmm-hits-table.md index fd275bc385..13d027e391 100644 --- a/anvio/docs/programs/anvi-script-filter-hmm-hits-table.md +++ b/anvio/docs/programs/anvi-script-filter-hmm-hits-table.md @@ -31,7 +31,6 @@ Some HMM profiles align multiple times to the same gene at different coordinates anvi-script-filter-hmm-hits-table -c %(contigs-db)s \ --hmm-source Bacteria_71 \ --domain-hits-table path/to/dir/hmm.domtable \ - --min-model-coverage 0.9 \ --merge-partial-hits-within-X-nts {{ codestop }}