Type aliases

jlumpe · Aug 4, 2024 · f1f718a
1 parent 2d5675f
commit f1f718a
Show file tree

Hide file tree

Showing 15 changed files with 62 additions and 42 deletions.
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -88,6 +88,11 @@
 autodoc_member_order = 'groupwise'
 autodoc_typehints = 'description'
 
+autodoc_type_aliases = {
+    'FilePath': 'FilePath',
+    'DNASeq': 'DNASeq',
+}
+
 intersphinx_mapping = {
 	'python': ('https://docs.python.org/3', None),
 	'numpy': ('https://numpy.org/doc/stable/', None),

diff --git a/setup.cfg b/setup.cfg
@@ -28,6 +28,7 @@ install_requires =
 	click>=7.0
 	h5py~=3.0
 	scipy~=1.7
+	typing-extensions>=4.0
 
 tests_require =
 	pytest

diff --git a/src/gambit/cli/common.py b/src/gambit/cli/common.py
@@ -282,7 +282,7 @@ def strip_seq_file_ext(filename: str) -> str:
 	return filename
 
 
-def get_file_id(path: FilePath, strip_dir: bool = True, strip_ext: bool = True) -> str:
+def get_file_id(path: 'FilePath', strip_dir: bool = True, strip_ext: bool = True) -> str:
 	"""Get sequence file ID derived from file path.
 
 	Parameters

diff --git a/src/gambit/cluster.py b/src/gambit/cluster.py
@@ -119,7 +119,7 @@ def check_clade(clade):
 	assert root_i == nleaves * 2 - 2
 
 
-def dump_dmat_csv(file: Union[FilePath, TextIO],
+def dump_dmat_csv(file: Union['FilePath', TextIO],
                   dmat: np.ndarray,
                   row_ids: Sequence,
                   col_ids: Sequence,
@@ -136,7 +136,7 @@ def dump_dmat_csv(file: Union[FilePath, TextIO],
 			writer.writerow([str(row_id), *values_str])
 
 
-def load_dmat_csv(file: Union[FilePath, TextIO]) -> tuple[np.ndarray, list[str], list[str]]:
+def load_dmat_csv(file: Union['FilePath', TextIO]) -> tuple[np.ndarray, list[str], list[str]]:
 	"""Load distance matrix from CSV file.
 
 	Returns

diff --git a/src/gambit/db/refdb.py b/src/gambit/db/refdb.py
@@ -38,7 +38,7 @@ def __init__(self, msg, directory=None, genomes_file=None, signatures_file=None)
 		self.signatures_file = signatures_file
 
 
-def load_genomeset(db_file: FilePath) -> tuple[Session, ReferenceGenomeSet]:
+def load_genomeset(db_file: 'FilePath') -> tuple[Session, ReferenceGenomeSet]:
 	"""Get the only :class:`gambit.db.models.ReferenceGenomeSet` from a genomes database file."""
 	session = file_sessionmaker(db_file)()
 	gset = only_genomeset(session)
@@ -211,7 +211,7 @@ def __init__(self, genomeset: ReferenceGenomeSet, signatures: ReferenceSignature
 			raise ValueError(f'{missing} of {n} genomes not matched to signature IDs. Is the id_attr attribute of the signatures metadata correct?')
 
 	@classmethod
-	def locate_files(cls, path: FilePath) -> tuple[Path, Path]:
+	def locate_files(cls, path: 'FilePath') -> tuple[Path, Path]:
 		"""Locate an SQLite genome database file and HDF5 signatures file in a directory.
 
 		Files are located by extension, ``.gdb`` or ``.db`` for SQLite file and ``.gs`` or ``.h5``
@@ -258,14 +258,14 @@ def check_single_match(matches, desc: str):
 		return genomes_file, signatures_file
 
 	@classmethod
-	def load(cls, genomes_file: FilePath, signatures_file: FilePath) -> 'ReferenceDatabase':
+	def load(cls, genomes_file: 'FilePath', signatures_file: 'FilePath') -> 'ReferenceDatabase':
 		"""Load complete database given paths to SQLite genomes database file and HDF5 signatures file."""
 		session, gset = load_genomeset(genomes_file)
 		sigs = load_signatures(signatures_file)
 		return cls(gset, sigs)
 
 	@classmethod
-	def load_from_dir(cls, path: FilePath) -> 'ReferenceDatabase':
+	def load_from_dir(cls, path: 'FilePath') -> 'ReferenceDatabase':
 		"""
 		Load complete database given directory containing SQLite genomes database file and HDF5
 		signatures file.

diff --git a/src/gambit/db/sqla.py b/src/gambit/db/sqla.py
@@ -37,7 +37,7 @@ def process_result_value(self, value, dialect):
 		return None if value is None else gjson.loads(value)
 
 
-def file_sessionmaker(path: FilePath, readonly: bool = True, cls: type = None, **kw) -> sessionmaker:
+def file_sessionmaker(path: 'FilePath', readonly: bool = True, cls: type = None, **kw) -> sessionmaker:
 	"""Get an SQLAlchemy ``sessionmaker`` for an sqlite database file.
 
 	Parameters

diff --git a/src/gambit/kmers.py b/src/gambit/kmers.py
@@ -30,7 +30,7 @@ def index_dtype(k: int) -> Optional[np.dtype]:
 		return None
 
 
-def kmer_to_index(kmer: DNASeq) -> int:
+def kmer_to_index(kmer: 'DNASeq') -> int:
 	"""Convert a k-mer to its integer index.
 
 	Raises
@@ -41,7 +41,7 @@ def kmer_to_index(kmer: DNASeq) -> int:
 	return ckmers.kmer_to_index(seq_to_bytes(kmer))
 
 
-def kmer_to_index_rc(kmer: DNASeq) -> int:
+def kmer_to_index_rc(kmer: 'DNASeq') -> int:
 	"""Get the integer index of a k-mer's reverse complement.
 
 	Raises
@@ -84,7 +84,7 @@ class KmerSpec(Jsonable):
 	nkmers: int = attrib(eq=False)
 	index_dtype: np.dtype = attrib(eq=False)
 
-	def __init__(self, k: int, prefix: DNASeq):
+	def __init__(self, k: int, prefix: 'DNASeq'):
 		"""
 		Parameters
 		----------
@@ -143,7 +143,7 @@ class KmerMatch:
 		If the match is on the reverse strand.
 	"""
 	kmerspec: KmerSpec = attrib()
-	seq: DNASeq = attrib()
+	seq: 'DNASeq' = attrib()
 	pos: int = attrib()
 	reverse: bool = attrib()
 
@@ -178,7 +178,7 @@ def kmer_index(self) -> int:
 		return kmer_to_index_rc(kmer) if self.reverse else kmer_to_index(kmer)
 
 
-def find_kmers(kmerspec: KmerSpec, seq: DNASeq) -> Iterator[KmerMatch]:
+def find_kmers(kmerspec: KmerSpec, seq: 'DNASeq') -> Iterator[KmerMatch]:
 	"""Locate k-mers with the given prefix in a DNA sequence.
 
 	Searches sequence both backwards and forwards (reverse complement). The sequence may contain

diff --git a/src/gambit/results.py b/src/gambit/results.py
@@ -22,7 +22,7 @@ class AbstractResultsExporter(ABC):
 	"""
 
 	@abstractmethod
-	def export(self, file_or_path: Union[FilePath, IO], results: QueryResults):
+	def export(self, file_or_path: Union['FilePath', IO], results: QueryResults):
 		"""Write query results to file.
 
 		Parameters
@@ -55,7 +55,7 @@ def to_json(self, obj):
 		"""Convert object to JSON-compatible format (need not work recursively)."""
 		return gjson.to_json(obj)
 
-	def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults):
+	def export(self, file_or_path: Union['FilePath', TextIO], results: QueryResults):
 		opts = dict(indent=4, sort_keys=True) if self.pretty else dict()
 		with maybe_open(file_or_path, 'w') as f:
 			json.dump(results, f, default=self.to_json, **opts)
@@ -112,7 +112,7 @@ def get_row(self, item: QueryResultItem) -> list:
 		"""Get row values for single result item."""
 		return [getattr_nested(item, attrs, pass_none=True) for _, attrs in self.COLUMNS]
 
-	def export(self, file_or_path: Union[FilePath, TextIO], results: QueryResults):
+	def export(self, file_or_path: Union['FilePath', TextIO], results: QueryResults):
 		with maybe_open(file_or_path, 'w') as f:
 			writer = csv.writer(f, **self.format_opts)
 
@@ -229,7 +229,7 @@ def _init_converter(self):
 		self._converter.register_structure_hook(AnnotatedGenome, self._structure_genome)
 		self._converter.register_structure_hook(Taxon, self._structure_taxon)
 
-	def read(self, file_or_path: Union[FilePath, IO]) -> QueryResults:
+	def read(self, file_or_path: Union['FilePath', IO]) -> QueryResults:
 		"""Read query results from JSON file.
 
 		Parameters

diff --git a/src/gambit/seq.py b/src/gambit/seq.py
@@ -3,20 +3,28 @@
 Note that all code in this package operates on DNA sequences as sequences of
 bytes containing ascii-encoded nucleotide codes.
 
+
 .. data:: NUCLEOTIDES
 
 	``bytes`` corresponding to the four DNA nucleotides. Ascii-encoded upper
 	case letters ``ACGT``. Note that the order, while arbitrary, is important
 	in this variable as it defines how unique indices are assigned to k-mer
 	sequences.
+
+.. class:: DNASeq
+
+	Type alias for DNA sequence types accepted for k-mer search / signature calculation
+	(``str``, ``bytes``, ``bytearray``, or :class:`Bio.Seq.Seq`).
 """
+
 from pathlib import Path
 from typing import Union, Optional, IO, Iterable
 from os import PathLike
 
 from Bio import SeqIO
 from Bio.Seq import Seq
 from attr import attrs, attrib
+from typing_extensions import TypeAlias
 
 from gambit._cython.kmers import revcomp
 from gambit.util.io import FilePath
@@ -29,14 +37,12 @@
 
 SEQ_TYPES = (str, bytes, bytearray, Seq)
 
-#: Union of DNA sequence types accepted for k-mer search / signature calculation.
-DNASeq = Union[SEQ_TYPES]
-
-#: Sequence types accepted directly by native (Cython) code.
-DNASeqBytes = Union[bytes, bytearray]
+DNASeq: TypeAlias = Union[SEQ_TYPES]
+# Type alias for sequence types accepted directly by native (Cython) code.
+DNASeqBytes: TypeAlias = Union[bytes, bytearray]
 
 
-def seq_to_bytes(seq: DNASeq) -> DNASeqBytes:
+def seq_to_bytes(seq: 'DNASeq') -> 'DNASeqBytes':
 	"""Convert generic DNA sequence to byte string representation.
 
 	This is for passing sequence data to Cython functions.
@@ -52,7 +58,7 @@ def seq_to_bytes(seq: DNASeq) -> DNASeqBytes:
 	raise TypeError(f'Expected sequence type, got {type(seq)}')
 
 
-def validate_dna_seq_bytes(seq : bytes):
+def validate_dna_seq_bytes(seq: DNASeqBytes):
 	"""Check that a sequence contains only valid nucleotide codes (upper case).
 
 	Parameters
@@ -171,7 +177,7 @@ def absolute(self) -> 'SequenceFile':
 
 	@classmethod
 	def from_paths(cls,
-	               paths: Iterable[FilePath],
+	               paths: Iterable['FilePath'],
 	               format: str,
 	               compression: Optional[str] = None,
 	               ) -> list['SequenceFile']:

diff --git a/src/gambit/sigs/base.py b/src/gambit/sigs/base.py
@@ -409,7 +409,7 @@ class SignaturesFileError(Exception):
 	filename: str
 	format: str
 
-	def __init__(self, message: str, filename: Optional[FilePath], format: Optional[str]):
+	def __init__(self, message: str, filename: Optional['FilePath'], format: Optional[str]):
 		self.message = message
 		self.filename = str(filename)
 		self.format = format
@@ -418,7 +418,7 @@ def __str__(self):
 		return self.message
 
 
-def load_signatures(path: FilePath, **kw) -> AbstractSignatureArray:
+def load_signatures(path: 'FilePath', **kw) -> AbstractSignatureArray:
 	"""Load signatures from file.
 
 	Currently the only format used to store signatures is the one in :mod:`gambit.sigs.hdf5`, but
@@ -435,7 +435,7 @@ def load_signatures(path: FilePath, **kw) -> AbstractSignatureArray:
 	return load_signatures_hdf5(path, **kw)
 
 
-def dump_signatures(path: FilePath,
+def dump_signatures(path: 'FilePath',
                     signatures: AbstractSignatureArray,
                     format: str = 'hdf5',
                     **kw,

diff --git a/src/gambit/sigs/calc.py b/src/gambit/sigs/calc.py
@@ -127,7 +127,7 @@ def default_accumulator(k: int) -> KmerAccumulator:
 	return SetAccumulator(k) if k > 11 else ArrayAccumulator(k)
 
 
-def accumulate_kmers(accumulator: KmerAccumulator, kmerspec: KmerSpec, seq: DNASeq):
+def accumulate_kmers(accumulator: KmerAccumulator, kmerspec: KmerSpec, seq: 'DNASeq'):
 	"""Find k-mer matches in sequence and add their indices to an accumulator."""
 	for match in find_kmers(kmerspec, seq):
 		try:
@@ -138,7 +138,7 @@ def accumulate_kmers(accumulator: KmerAccumulator, kmerspec: KmerSpec, seq: DNAS
 
 
 def calc_signature(kmerspec: KmerSpec,
-                   seqs: Union[DNASeq, Iterable[DNASeq]],
+                   seqs: Union['DNASeq', Iterable['DNASeq']],
                    *,
                    accumulator: Optional[KmerAccumulator] = None,
                    ) -> KmerSignature:

diff --git a/src/gambit/sigs/hdf5.py b/src/gambit/sigs/hdf5.py
@@ -218,7 +218,7 @@ def create(cls,
 		return cls(group)
 
 
-def load_signatures_hdf5(path: FilePath, **kw) -> HDF5Signatures:
+def load_signatures_hdf5(path: 'FilePath', **kw) -> HDF5Signatures:
 	"""Open HDF5 signature file.
 
 	Parameters
@@ -254,7 +254,7 @@ def load_signatures_hdf5(path: FilePath, **kw) -> HDF5Signatures:
 		raise
 
 
-def dump_signatures_hdf5(path: FilePath, signatures: AbstractSignatureArray, **kw):
+def dump_signatures_hdf5(path: 'FilePath', signatures: AbstractSignatureArray, **kw):
 	"""Write k-mer signatures and associated metadata to an HDF5 file.
 
 	Parameters

diff --git a/src/gambit/util/io.py b/src/gambit/util/io.py
@@ -1,12 +1,20 @@
-"""Utility code for reading/writing data files."""
+"""Utility code for reading/writing data files.
+
+
+.. class:: FilePath
+
+	Alias for types which can represent a file system path (``str`` or :class:`os.PathLike`).
+"""
 
 import os
 from io import TextIOWrapper
 from typing import Union, Optional, IO, TextIO, BinaryIO, ContextManager, Iterable, TypeVar
 from contextlib import nullcontext
 
-#: Alias for types which can represent a file system path
-FilePath = Union[str, os.PathLike]
+from typing_extensions import TypeAlias
+
+
+FilePath: TypeAlias = Union[str, os.PathLike]
 
 T = TypeVar('T')
 
@@ -69,7 +77,7 @@ def guess_compression(fobj: BinaryIO) -> Optional[str]:
 
 
 def open_compressed(compression: Optional[str],
-                    path: FilePath,
+                    path: 'FilePath',
                     mode: str = 'rt',
                     **kwargs,
                     ) -> IO:
@@ -172,7 +180,7 @@ def __exit__(self, *args):
 		self.close()
 
 
-def maybe_open(file_or_path: Union[FilePath, IO], mode: str = 'r', **open_kw) -> ContextManager[IO]:
+def maybe_open(file_or_path: Union['FilePath', IO], mode: str = 'r', **open_kw) -> ContextManager[IO]:
 	"""Open a file given a file path as an argument, but pass existing file objects through.
 
 	Intended to be used by API functions that take either type as an argument. If a file path is
@@ -208,7 +216,7 @@ def maybe_open(file_or_path: Union[FilePath, IO], mode: str = 'r', **open_kw) ->
 		return open(path, mode, **open_kw)
 
 
-def read_lines(file_or_path: Union[FilePath, TextIO], strip: bool=True, skip_empty: bool=False) -> Iterable[str]:
+def read_lines(file_or_path: Union['FilePath', TextIO], strip: bool=True, skip_empty: bool=False) -> Iterable[str]:
 	"""Iterate over lines in text file.
 
 	Parameters
@@ -232,7 +240,7 @@ def read_lines(file_or_path: Union[FilePath, TextIO], strip: bool=True, skip_emp
 				yield line
 
 
-def write_lines(lines: Iterable, file_or_path: Union[FilePath, TextIO]):
+def write_lines(lines: Iterable, file_or_path: Union['FilePath', TextIO]):
 	"""Write strings to text file, one per line.
 
 	Parameters

diff --git a/tests/cli/test_common.py b/tests/cli/test_common.py
@@ -99,7 +99,7 @@ def test_strip_seq_file_ext():
 class TestGetSequenceFiles:
 	"""Test the get_sequence_files() function."""
 
-	def check_ids(self, ids: Iterable[str], paths: Iterable[FilePath], strip_dir: bool, strip_ext: bool):
+	def check_ids(self, ids: Iterable[str], paths: Iterable['FilePath'], strip_dir: bool, strip_ext: bool):
 		for id_, path in zip_strict(ids, paths):
 			if strip_dir:
 				expected = Path(path).name

diff --git a/tests/cli/test_query.py b/tests/cli/test_query.py
@@ -21,9 +21,9 @@
 
 def make_args(testdb: TestDB, *,
 			  positional_files: Optional[Iterable[SequenceFile]] = None,
-			  list_file: Optional[FilePath] = None,
+			  list_file: Optional['FilePath'] = None,
 			  sig_file: bool = False,
-			  output: Optional[FilePath] = None,
+			  output: Optional['FilePath'] = None,
 			  outfmt: Optional[str] = None,
 			  strict: bool=False,
 			  ) -> list[str]: