Skip to content

Commit

Permalink
Pre-calculated query results for test db
Browse files Browse the repository at this point in the history
  • Loading branch information
jlumpe committed Aug 25, 2021
1 parent 47cbdea commit 4589256
Show file tree
Hide file tree
Showing 8 changed files with 218 additions and 74 deletions.
38 changes: 38 additions & 0 deletions gambit/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

from gambit.kmers import KmerSpec, KmerSignature, dense_to_sparse, kmer_to_index, reverse_complement
from gambit.signatures import SignatureArray
from gambit.query import QueryResultItem
from gambit.classify import ClassifierResult, GenomeMatch


def bernoulli(size: Union[int, tuple], p: float) -> np.ndarray:
Expand Down Expand Up @@ -159,3 +161,39 @@ def make_kmer_seq(kspec: KmerSpec, seqlen: int, kmer_interval: int, n_interval:
seq_array[p:p + kspec.total_len] = match

return bytes(seq_array), dense_to_sparse(vec)


def compare_genome_matches(match1: Optional[GenomeMatch], match2: Optional[GenomeMatch]) -> bool:
    """Compare two ``GenomeMatch`` instances for equality.

    The ``genome`` and ``matched_taxon`` attributes are compared with ``==``. The values for the
    ``distance`` attribute are only checked for approximate equality, to support instances where
    one was loaded from a results archive (saving and loading a float in JSON is lossy).

    Also allows one or both values to be None. Two None values compare equal; None never
    compares equal to an actual match.

    Always returns a plain ``bool`` (never a ``numpy.bool_``).
    """
    if match1 is None or match2 is None:
        return match1 is None and match2 is None

    # np.isclose() returns a numpy.bool_, so normalise the whole expression to a built-in bool
    # to honour the declared return type.
    return bool(
        match1.genome == match2.genome
        and match1.matched_taxon == match2.matched_taxon
        and np.isclose(match1.distance, match2.distance)
    )


def compare_classifier_results(result1: ClassifierResult, result2: ClassifierResult) -> bool:
    """Compare two ``ClassifierResult`` instances for equality.

    The ``success``, ``predicted_taxon`` and ``error`` attributes are compared with ``==``. The
    ``primary_match`` and ``closest_match`` attributes are compared using
    :func:`compare_genome_matches`. The ``warnings`` attributes are compared as sets, so their
    order and any duplicates are ignored.

    Always returns a plain ``bool``.
    """
    # The chain may otherwise evaluate to whatever non-bool value one of the operands produced,
    # so normalise it to a built-in bool to honour the declared return type.
    return bool(
        result1.success == result2.success
        and result1.predicted_taxon == result2.predicted_taxon
        and compare_genome_matches(result1.primary_match, result2.primary_match)
        and compare_genome_matches(result1.closest_match, result2.closest_match)
        and set(result1.warnings) == set(result2.warnings)
        and result1.error == result2.error
    )


def compare_result_items(item1: QueryResultItem, item2: QueryResultItem) -> bool:
    """Compare two ``QueryResultItem`` instances for equality.

    Checks that the ``report_taxon`` attributes are equal and that the ``classifier_result``
    attributes match according to :func:`compare_classifier_results`.

    Does not compare the value of the ``input`` attributes.

    Always returns a plain ``bool``. The result must be checked by the caller (e.g. with
    ``assert``); this function never raises on a mismatch.
    """
    # Normalise to a built-in bool to honour the declared return type.
    return bool(
        item1.report_taxon == item2.report_taxon
        and compare_classifier_results(item1.classifier_result, item2.classifier_result)
    )
85 changes: 53 additions & 32 deletions tests/cli/test_cli_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,75 +4,96 @@

import json
from csv import DictReader
from io import StringIO
from pathlib import Path

import pytest
import numpy as np

from gambit.cli import cli
from gambit.util.misc import zip_strict
from gambit.io.export.json import JSONResultsExporter
from gambit.io.export.csv import CSVResultsExporter
from gambit import __version__ as GAMBIT_VERSION


@pytest.mark.parametrize('out_fmt', ['csv', 'json'])
@pytest.mark.parametrize('classify_strict', [False, True])
def test_query_cmd(testdb_files, testdb, testdb_queries, out_fmt, classify_strict, tmp_path):
def test_query_cmd(testdb_files, testdb, testdb_queries, testdb_results, out_fmt, tmp_path):
"""Run a full query using the command line interface."""
if not classify_strict:
pytest.skip() # TODO

results_file = tmp_path / 'results.json'
results_file = tmp_path / ('results.' + out_fmt)
query_files = [query['file'] for query in testdb_queries]
params = testdb_results.params

args = [
f'--db={testdb_files["root"]}',
'query',
f'--output={results_file}',
f'--outfmt={out_fmt}',
'--strict' if classify_strict else '--no-strict',
'--strict' if params.classify_strict else '--no-strict',
*(str(f.path) for f in query_files),
]

cli.main(args, standalone_mode=False)

# Detailed checks of output format are already present in tests for exporter classes, just need
# to check that the results themselves seem correct
# Detailed checks of output format are already present in tests for exporter classes, just check
# that the exported data matches an export of the reference results
if out_fmt == 'json':
_check_results_json(results_file, testdb, testdb_queries)
_check_results_json(results_file, query_files, testdb_results)
elif out_fmt == 'csv':
_check_results_csv(results_file, testdb_queries)
_check_results_csv(results_file, query_files, testdb_results)
else:
assert False


def _check_results_json(results_file, testdb, queries):
def _check_results_json(results_file, query_files, ref_results):
with results_file.open() as f:
results = json.load(f)
data = json.load(f)

# Equivalent data for reference results
exporter = JSONResultsExporter()
buf = StringIO()
exporter.export(buf, ref_results)
buf.seek(0)
ref_data = json.load(buf)

assert results['genomeset']['key'] == testdb.genomeset.key
assert results['signaturesmeta']['id'] == testdb.signatures.meta.id
assert results['gambit_version'] == GAMBIT_VERSION
assert data['gambit_version'] == GAMBIT_VERSION
assert len(data['items']) == len(query_files)

items = results['items']
assert len(items) == len(queries)
for key in ['genomeset', 'signaturesmeta', 'extra']:
assert data[key] == ref_data[key]

for item, query in zip_strict(items, queries):
assert item['query']['path'] == str(query['file'].path)
for item, ref_item, query_file in zip_strict(data['items'], ref_data['items'], query_files):
assert Path(item['query']['path']).name == query_file.path.name
assert item['query']['format'] == query_file.format

if query['predicted']:
assert item['predicted_taxon']['name'] == query['predicted']
else:
assert item['predicted_taxon'] is None
assert item['predicted_taxon'] == ref_item['predicted_taxon']
assert item['closest_genome'] == ref_item['closest_genome']
assert np.isclose(item['closest_genome_distance'], ref_item['closest_genome_distance'])


def _check_results_csv(results_file, queries):
def _check_results_csv(results_file, query_files, ref_results):
with results_file.open() as f:
rows = list(DictReader(f))

assert len(rows) == len(queries)
exporter = CSVResultsExporter()
buf = StringIO()
exporter.export(buf, ref_results)
buf.seek(0)
ref_rows = list(DictReader(buf))

assert len(rows) == len(ref_rows)

cmp_cols = [
'predicted.name',
'predicted.rank',
'predicted.ncbi_id',
'predicted.threshold',
'closest.description',
]

for row, query in zip_strict(rows, queries):
assert row['query.path'] == str(query['file'].path)
for row, ref_row, file in zip_strict(rows, ref_rows, query_files):
assert row['query.path'] == str(file.path)
assert np.isclose(float(row['closest.distance']), float(ref_row['closest.distance']))

if query['predicted']:
assert row['predicted.name'] == query['predicted']
else:
assert row['predicted.name'] == ''
for key in cmp_cols:
assert row[key] == ref_row[key]
16 changes: 16 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,19 @@ def testdb_queries(testdb, testdb_files):
)

return rows

@pytest.fixture(scope='session', params=['non_strict', 'strict'])
def testdb_results(request, testdb_files, testdb_session):
    """Pre-calculated query results, loaded from a results archive file.

    Parametrized over the ``non_strict`` and ``strict`` result sets; the parameter selects
    which ``<param>.json`` file under ``testdb_files['results']`` is read.

    This is a generator fixture on purpose: yielding (rather than returning) keeps the local
    reference to the session object alive until teardown. Otherwise the session may be garbage
    collected, which would render any ORM instances in the results object invalid.
    """
    from gambit.io.export.archive import ResultsArchiveReader

    # Bound to a local name so the session outlives the yield below.
    db_session = testdb_session()
    archive_reader = ResultsArchiveReader(db_session)

    archive_path = testdb_files['results'] / f'{request.param}.json'
    yield archive_reader.read(archive_path)
4 changes: 3 additions & 1 deletion tests/data/testdb_210818/Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,6 @@ version control so nothing needs to be downloaded from an external data reposito
* `queries.csv` - table listing all query files and expected results.
* `genomes/` - contains query genome files in FASTA format.
* `query-signatures.h5` - precalculated signatures for query genomes.

* `results/` - pre-calculated results using query files in `queries`.
* `generate-results.py` - script which generates result files in `results/`. Verifies against
expected result attributes in `queries.csv`.
98 changes: 98 additions & 0 deletions tests/data/testdb_210818/generate-results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#!/usr/bin/env python3

"""
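Runs queries against the testdb_210818 database using its included query sequences and saves the
results in archive format.
All file paths used are relative, so run this script from the directory that contains it.
This script should be re-run whenever the expected results change.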
"""

from pathlib import Path
from csv import DictReader

from gambit.io.seq import SequenceFile
from gambit.db import load_database_from_dir
from gambit.query import QueryParams, query_parse
from gambit.io.export.archive import ResultsArchiveWriter
from gambit.util.misc import zip_strict


# Parameter sets to generate results for. Each key is the label used for the name of the
# corresponding output file (results/<label>.json).
PARAMS = dict(
    non_strict=QueryParams(classify_strict=False),
    strict=QueryParams(classify_strict=True),
)


def load_query_data():
    """Load the table of query genomes and their expected results.

    Reads ``queries/queries.csv`` (relative to the working directory) and returns a list with one
    dict per row. In each dict the ``warnings`` value is converted from text to a bool (true when
    the text equals "true", ignoring case) and a ``file`` key is added, holding a ``SequenceFile``
    in FASTA format pointing at ``queries/genomes/<name>.fasta``.
    """
    with open('queries/queries.csv', newline='') as csv_file:
        records = list(DictReader(csv_file))

    fasta_dir = Path('queries/genomes')

    for record in records:
        # CSV values are all strings, convert the flag to a real boolean
        record['warnings'] = record['warnings'].lower() == 'true'

        fasta_path = fasta_dir / (record['name'] + '.fasta')
        record['file'] = SequenceFile(path=fasta_path, format='fasta')

    return records


def check_results(queries, results):
    """Verify query results against the expected attributes in the query table.

    ``queries`` is the list of row dicts from :func:`load_query_data` and ``results`` is the
    results object for a query run over the same files, in the same order. Raises
    ``AssertionError`` on the first mismatch (or if the two have different lengths, via
    ``zip_strict``).
    """
    for expected, item in zip_strict(queries, results.items):
        cls_result = item.classifier_result
        taxon = cls_result.predicted_taxon
        strict = results.params.classify_strict

        # Checks common to every item
        assert item.input.file == expected['file']
        assert cls_result.success
        assert cls_result.error is None

        if expected['predicted']:
            if strict:
                assert taxon is not None
                assert taxon.name == expected['predicted']
                assert cls_result.primary_match is not None
                assert cls_result.primary_match.genome.description == expected['primary']
            else:
                # In non-strict mode the primary match is the closest match
                assert cls_result.primary_match == cls_result.closest_match
                assert taxon is cls_result.primary_match.matched_taxon

            # Reported taxon is the prediction itself if flagged for reporting, else its parent
            assert item.report_taxon is (taxon if taxon.report else taxon.parent)

        else:
            # No prediction expected (same checks in both modes)
            assert taxon is None
            assert cls_result.primary_match is None
            assert item.report_taxon is None

        assert cls_result.closest_match.genome.description == expected['closest']

        # Expected warnings are only checked in strict mode
        if strict:
            assert bool(cls_result.warnings) == expected['warnings']


def main():
    """Run the queries once per parameter set in ``PARAMS`` and write each result archive.

    Results are verified with :func:`check_results` before being written to
    ``results/<label>.json``, so a failed check aborts before that file is created or overwritten.
    """
    query_rows = load_query_data()
    seq_files = [row['file'] for row in query_rows]
    database = load_database_from_dir('')

    archive_writer = ResultsArchiveWriter()

    for label in PARAMS:
        results = query_parse(database, seq_files, PARAMS[label])
        check_results(query_rows, results)

        with open(f'results/{label}.json', 'wt') as out:
            archive_writer.export(out, results)


if __name__ == '__main__':
    main()
1 change: 1 addition & 0 deletions tests/data/testdb_210818/results/non_strict.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions tests/data/testdb_210818/results/strict.json

Large diffs are not rendered by default.

49 changes: 8 additions & 41 deletions tests/test_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

import pytest

from gambit.query import QueryInput, QueryParams, query_parse
from gambit.query import QueryInput, query_parse
from gambit.io.seq import SequenceFile
from gambit.util.misc import zip_strict
from gambit.test import compare_result_items
from gambit import __version__ as GAMBIT_VERSION


Expand All @@ -23,12 +24,11 @@ def test_convert(self):
QueryInput.convert(3.4)


@pytest.mark.parametrize('classify_strict', [False, True])
def test_query_python(testdb, testdb_queries, classify_strict):
def test_query_python(testdb, testdb_queries, testdb_results):
"""Run a full query using the Python API."""

ref_results = testdb_results
params = ref_results.params
query_files = [item['file'] for item in testdb_queries]
params = QueryParams(classify_strict=classify_strict)

results = query_parse(testdb, query_files, params)

Expand All @@ -37,39 +37,6 @@ def test_query_python(testdb, testdb_queries, classify_strict):
assert results.signaturesmeta == testdb.signatures.meta
assert results.gambit_version == GAMBIT_VERSION

for query, item in zip_strict(testdb_queries, results.items):
clsresult = item.classifier_result
predicted = clsresult.predicted_taxon

assert item.input.file == query['file']
assert clsresult.success
assert clsresult.error is None

if classify_strict:
if query['predicted']:
assert predicted is not None
assert predicted.name == query['predicted']
assert clsresult.primary_match is not None
assert clsresult.primary_match.genome.description == query['primary']
assert item.report_taxon is (predicted if predicted.report else predicted.parent)

else:
assert predicted is None
assert clsresult.primary_match is None
assert item.report_taxon is None

assert clsresult.closest_match.genome.description == query['closest']
assert bool(clsresult.warnings) == query['warnings']

else:
if query['predicted']:
assert clsresult.primary_match == clsresult.closest_match
assert predicted is clsresult.primary_match.matched_taxon
assert item.report_taxon is (predicted if predicted.report else predicted.parent)

else:
assert predicted is None
assert clsresult.primary_match is None
assert item.report_taxon is None

assert clsresult.closest_match.genome.description == query['closest']
for file, item, ref_item in zip_strict(query_files, results.items, ref_results.items):
    assert item.input.file == file
    # compare_result_items() reports a mismatch through its boolean return value only, so the
    # result must be asserted or differences from the reference results go unnoticed.
    assert compare_result_items(item, ref_item)

0 comments on commit 4589256

Please sign in to comment.