From 366f6d6319638b88370a0746acaa9e704576fa25 Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Thu, 25 Jul 2024 01:26:36 -0700 Subject: [PATCH] Remove gambit.sigs.convert module --- src/gambit/sigs/calc.py | 45 ++++++++++++ src/gambit/sigs/convert.py | 144 ------------------------------------- tests/common.py | 2 +- tests/sigs/test_calc.py | 28 +++++++- tests/sigs/test_convert.py | 92 ------------------------ tests/test_metric.py | 2 +- tests/test_tests_common.py | 2 +- 7 files changed, 75 insertions(+), 240 deletions(-) delete mode 100644 src/gambit/sigs/convert.py delete mode 100644 tests/sigs/test_convert.py diff --git a/src/gambit/sigs/calc.py b/src/gambit/sigs/calc.py index c4c4614..74b0eef 100644 --- a/src/gambit/sigs/calc.py +++ b/src/gambit/sigs/calc.py @@ -278,3 +278,48 @@ def calc_file_signatures(kspec: KmerSpec, assert all(sig is not None for sig in sigs) return SignatureList(sigs, kspec) + + +def dense_to_sparse(vec: Sequence[bool]) -> KmerSignature: + """Convert k-mer set from dense bit vector to sparse coordinate representation. + + Parameters + ---------- + vec + Boolean vector indicating which k-mers are present. + + Returns + ------- + numpy.ndarray + Sorted array of coordinates of k-mers present in vector. Data type will be ``numpy.intp``. + + See Also + -------- + .sparse_to_dense + """ + return np.flatnonzero(vec) + + +def sparse_to_dense(k_or_kspec: Union[int, KmerSpec], coords: KmerSignature) -> np.ndarray: + """Convert k-mer set from sparse coordinate representation back to dense bit vector. + + Parameters + ---------- + k_or_kspec + Value of k or a :class:`.KmerSpec` instance. + coords + Sparse coordinate array. + + Returns + ------- + numpy.ndarray + Dense k-mer bit vector. + + See Also + -------- + .dense_to_sparse + """ + idx_len = k_or_kspec.nkmers if isinstance(k_or_kspec, KmerSpec) else nkmers(k_or_kspec) + vec = np.zeros(idx_len, dtype=np.bool_) + vec[coords] = 1 + return vec diff --git a/src/gambit/sigs/convert.py b/src/gambit/sigs/convert.py deleted file mode 100644 index f0308b5..0000000 --- a/src/gambit/sigs/convert.py +++ /dev/null @@ -1,144 +0,0 @@ -"""Convert signatures between representations or from one ``KmerSpec`` to another.""" - -from typing import Sequence, Union - -import numpy as np - -from .base import KmerSignature -from gambit.kmers import KmerSpec, nkmers, kmer_to_index - - -def dense_to_sparse(vec: Sequence[bool]) -> KmerSignature: - """Convert k-mer set from dense bit vector to sparse coordinate representation. - - Parameters - ---------- - vec - Boolean vector indicating which k-mers are present. - - Returns - ------- - numpy.ndarray - Sorted array of coordinates of k-mers present in vector. Data type will be ``numpy.intp``. - - See Also - -------- - .sparse_to_dense - """ - return np.flatnonzero(vec) - - -def sparse_to_dense(k_or_kspec: Union[int, KmerSpec], coords: KmerSignature) -> np.ndarray: - """Convert k-mer set from sparse coordinate representation back to dense bit vector. - - Parameters - ---------- - k_or_kspec - Value of k or a :class:`.KmerSpec` instance. - coords - Sparse coordinate array. - - Returns - ------- - numpy.ndarray - Dense k-mer bit vector. - - See Also - -------- - .dense_to_sparse - """ - idx_len = k_or_kspec.nkmers if isinstance(k_or_kspec, KmerSpec) else nkmers(k_or_kspec) - vec = np.zeros(idx_len, dtype=np.bool_) - vec[coords] = 1 - return vec - - -def can_convert(from_kspec: KmerSpec, to_kspec: KmerSpec) -> bool: - """Check if signatures from one KmerSpec can be converted to another. - - Conversion is possible if ``to_kspec.prefix`` is equal to or starts with ``from_kspec.prefix`` - and ``to_kspec.total_len <= from_kspec.total_len``. - """ - return to_kspec.prefix.startswith(from_kspec.prefix) and to_kspec.total_len <= from_kspec.total_len - - -def check_can_convert(from_kspec: KmerSpec, to_kspec: KmerSpec): - """ - Check that signatures can be converted from one KmerSpec to another or raise an error with an - informative message. - - Raises - ------ - ValueError - If conversion is not possible. - """ - if not to_kspec.prefix.startswith(from_kspec.prefix): - raise ValueError('Destination prefix must start with source prefix.') - if to_kspec.total_len > from_kspec.total_len: - raise ValueError('Cannot convert to KmerSpec with longer total length.') - - -def _convert_params(from_kspec: KmerSpec, to_kspec: KmerSpec): - extra_prefix = to_kspec.prefix[from_kspec.prefix_len:] - extra_ind = kmer_to_index(extra_prefix) - extra_len = len(extra_prefix) - - range_ = nkmers(from_kspec.k - extra_len) - start = extra_ind * range_ - stop = (extra_ind + 1) * range_ - reduce = from_kspec.k - to_kspec.k - extra_len - - return start, stop, reduce - - -def convert_dense(from_kspec: KmerSpec, to_kspec: KmerSpec, vec: np.ndarray) -> np.ndarray: - """Convert a k-mer signature in dense format from one ``KmerSpec`` to another. - - In the ideal case, if ``vec`` is the result of ``calc_signature(from_kspec, seq, sparse=False)`` - the output of this function should be identical to ``calc_signature(to_kspec, seq, sparse=False)``. - In reality this may not hold if any potential matches of ``from_kspec`` in ``seq`` are discarded - due to an invalid nucleotide which is not included in the corresponding ``to_kspec`` match. - """ - check_can_convert(from_kspec, to_kspec) - start, stop, reduce = _convert_params(from_kspec, to_kspec) - block_size = nkmers(reduce) - - out = np.zeros(to_kspec.nkmers, dtype=bool) - - for i in range(block_size): - out |= vec[start+i:stop:block_size] - - return out - - -def convert_sparse(from_kspec: KmerSpec, to_kspec: KmerSpec, sig: KmerSignature) -> KmerSignature: - """Convert a k-mer signature in sparse format from one ``KmerSpec`` to another. - - In the ideal case, if ``sig`` is the result of ``calc_signature(from_kspec, seq)`` - the output of this function should be identical to ``calc_signature(to_kspec, seq)``. - In reality this may not hold if any potential matches of ``from_kspec`` in ``seq`` are discarded - due to an invalid nucleotide which is not included in the corresponding ``to_kspec`` match. - """ - assert can_convert(from_kspec, to_kspec) - start, stop, reduce = _convert_params(from_kspec, to_kspec) - reduce_bits = 2 * reduce - - out = np.empty(len(sig), dtype=to_kspec.index_dtype) - i = 0 - next_ = start - - for from_idx in sig: - if from_idx < next_: - continue - if from_idx >= stop: - break - - to_idx = (from_idx - start) >> reduce_bits - out[i] = to_idx - i += 1 - - # Next possible input index that won't reduce to the same output - next_ = ((to_idx + 1) << reduce_bits) + start - - out.resize(i) - return out diff --git a/tests/common.py b/tests/common.py index 998963a..4eb294e 100644 --- a/tests/common.py +++ b/tests/common.py @@ -7,7 +7,7 @@ from gambit.kmers import KmerSpec, kmer_to_index from gambit.seq import seq_to_bytes, revcomp from gambit.sigs import KmerSignature, SignatureArray -from gambit.sigs.convert import dense_to_sparse, sparse_to_dense +from gambit.sigs.calc import dense_to_sparse, sparse_to_dense from gambit.db import Taxon diff --git a/tests/sigs/test_calc.py b/tests/sigs/test_calc.py index efed455..52924d2 100644 --- a/tests/sigs/test_calc.py +++ b/tests/sigs/test_calc.py @@ -7,7 +7,8 @@ from Bio import SeqIO from Bio.Seq import Seq -from gambit.sigs.calc import calc_signature, calc_file_signature, calc_file_signatures +from gambit.sigs.calc import calc_signature, calc_file_signature, calc_file_signatures, \ + dense_to_sparse, sparse_to_dense from gambit.kmers import KmerSpec, index_to_kmer from gambit.seq import SEQ_TYPES, revcomp, SequenceFile import gambit.util.io as ioutil @@ -171,3 +172,28 @@ def test_calc_file_signatures(self, record_sets, files, concurrency): sigs2 = calc_file_signatures(KSPEC, files, progress=pconf, concurrency=concurrency) assert sigarray_eq(sigs, sigs2) + + +def test_dense_sparse_conversion(): + """Test conversion between dense and sparse representations of k-mer coordinates.""" + + for k in range(1, 10): + + kspec = KmerSpec(k, 'ATGAC') + + # Create dense signature with every 3rd k-mer + vec = np.zeros(kspec.nkmers, dtype=bool) + vec[np.arange(vec.size) % 3 == 0] = True + + # Convert to sparse + sig = dense_to_sparse(vec) + + assert len(sig) == vec.sum() + for index in sig: + assert vec[index] + + # Check sorted + assert np.all(np.diff(sig) > 0) + + # Check converting back + assert np.array_equal(vec, sparse_to_dense(kspec, sig)) diff --git a/tests/sigs/test_convert.py b/tests/sigs/test_convert.py deleted file mode 100644 index 964cc15..0000000 --- a/tests/sigs/test_convert.py +++ /dev/null @@ -1,92 +0,0 @@ -"""Test the gambit.sigs.convert module.""" - -import pytest -import numpy as np - -from gambit.sigs.convert import dense_to_sparse, sparse_to_dense, can_convert, \ - check_can_convert, convert_dense, convert_sparse -from gambit.kmers import KmerSpec -from ..common import random_seq - - -def test_dense_sparse_conversion(): - """Test conversion between dense and sparse representations of k-mer coordinates.""" - - for k in range(1, 10): - - kspec = KmerSpec(k, 'ATGAC') - - # Create dense signature with every 3rd k-mer - vec = np.zeros(kspec.nkmers, dtype=bool) - vec[np.arange(vec.size) % 3 == 0] = True - - # Convert to sparse - sig = dense_to_sparse(vec) - - assert len(sig) == vec.sum() - for index in sig: - assert vec[index] - - # Check sorted - assert np.all(np.diff(sig) > 0) - - # Check converting back - assert np.array_equal(vec, sparse_to_dense(kspec, sig)) - - -class TestKmerSpecConversion: - """Test converting signatures from one KmerSpec to another.""" - - def test_can_convert(self): - from_kspec = KmerSpec(11, 'ATGAC') - - compatible = [ - KmerSpec(11, 'ATGAC'), - KmerSpec(8, 'ATGAC'), - KmerSpec(10, 'ATGACA'), - KmerSpec(8, 'ATGACA'), - ] - - for to_kspec in compatible: - assert can_convert(from_kspec, to_kspec) - check_can_convert(from_kspec, to_kspec) - - incompatible = [ - KmerSpec(11, 'CAGTA'), - KmerSpec(12, 'ATGAC'), - KmerSpec(11, 'ATGA'), - KmerSpec(11, 'ATGACT'), - ] - - for to_kspec in incompatible: - assert not can_convert(from_kspec, to_kspec) - with pytest.raises(ValueError): - check_can_convert(from_kspec, to_kspec) - - @pytest.fixture(scope='class') - def seqs(self): - np.random.seed(0) - return [random_seq(100_000) for _ in range(100)] - - @pytest.mark.parametrize('to_kspec', [ - KmerSpec(10, 'ATGAC'), # Reduce k - KmerSpec(8, 'ATGAC'), # Reduce k - KmerSpec(9, 'ATGACGT'), # Extend prefix - KmerSpec(7, 'ATGACGT'), # Extend prefix and reduce k further - ]) - def test_convert(self, seqs, to_kspec): - from gambit.sigs.calc import calc_signature - - from_kspec = KmerSpec(11, 'ATGAC') - - for seq in seqs: - from_sig = calc_signature(from_kspec, seq) - from_vec = sparse_to_dense(from_kspec.k, from_sig) - - to_vec = convert_dense(from_kspec, to_kspec, from_vec) - to_sig = convert_sparse(from_kspec, to_kspec, from_sig) - - found_sig = calc_signature(to_kspec, seq) - - assert np.array_equal(to_sig, found_sig) - assert np.array_equal(to_vec, sparse_to_dense(to_kspec.k, found_sig)) diff --git a/tests/test_metric.py b/tests/test_metric.py index 2784804..25b775d 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -7,7 +7,7 @@ from gambit.metric import jaccard, jaccarddist, jaccard_bits, jaccard_generic, jaccarddist_array, \ jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE, BOUNDS_DTYPE -from gambit.sigs.convert import sparse_to_dense +from gambit.sigs.calc import sparse_to_dense from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures from gambit.kmers import KmerSpec from gambit.util.progress import check_progress diff --git a/tests/test_tests_common.py b/tests/test_tests_common.py index 899f45e..9ec7f7a 100644 --- a/tests/test_tests_common.py +++ b/tests/test_tests_common.py @@ -5,7 +5,7 @@ from gambit.kmers import KmerSpec, kmer_to_index, nkmers from gambit.seq import revcomp -from gambit.sigs.convert import dense_to_sparse +from gambit.sigs.calc import dense_to_sparse from gambit.util.progress import get_progress from . import common