From 84c363fe1a04a458510cd774e8d3fcc57ced16fa Mon Sep 17 00:00:00 2001 From: Jared Lumpe Date: Sat, 3 Aug 2024 23:22:29 -0700 Subject: [PATCH] Remove Cython dependency on Numpy --- pyproject.toml | 3 --- setup.py | 3 --- src/gambit/_cython/kmers.pxd | 8 +++---- src/gambit/_cython/kmers.pyx | 14 ++++++------- src/gambit/_cython/metric.pxd | 1 + src/gambit/_cython/metric.pyx | 17 +++++---------- src/gambit/_cython/threads.pyx | 38 +++++++++++++++++++++++++--------- src/gambit/_cython/types.pxd | 31 +++++++++++++-------------- src/gambit/metric.py | 11 +++++++--- src/gambit/sigs/__init__.py | 2 +- src/gambit/sigs/base.py | 7 ++++++- src/gambit/sigs/hdf5.py | 3 +-- tests/test_metric.py | 4 ++-- 13 files changed, 79 insertions(+), 63 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a040ee9..b685eb4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,8 +3,5 @@ requires = [ "setuptools", "wheel", "Cython >= 3.0", - # If the Numpy version is different at runtime than build time, the build version should be - # lower as the ABI is forward- but not backwards-compatible. 
- "oldest-supported-numpy", ] build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 01bbbad..de06f94 100644 --- a/setup.py +++ b/setup.py @@ -3,15 +3,12 @@ from setuptools import setup from distutils.extension import Extension from Cython.Build import cythonize -import numpy # Cython extensions -np_include = numpy.get_include() extensions = [Extension( 'gambit._cython.*', ['src/gambit/_cython/*.pyx'], - include_dirs=[np_include], extra_compile_args=['-fopenmp', '-Wno-sign-compare'], extra_link_args=['-fopenmp'], )] diff --git a/src/gambit/_cython/kmers.pxd b/src/gambit/_cython/kmers.pxd index 87dcfab..c3f57be 100644 --- a/src/gambit/_cython/kmers.pxd +++ b/src/gambit/_cython/kmers.pxd @@ -1,9 +1,9 @@ -cimport numpy as np +from libc.stdint cimport uint64_t, intptr_t ctypedef unsigned char CHAR -cdef np.uint64_t c_kmer_to_index(const CHAR[:], bint*) nogil -cdef np.uint64_t c_kmer_to_index_rc(const CHAR[:], bint*) nogil -cdef void c_index_to_kmer(np.uint64_t, CHAR[:]) nogil +cdef uint64_t c_kmer_to_index(const CHAR[:], bint*) nogil +cdef uint64_t c_kmer_to_index_rc(const CHAR[:], bint*) nogil +cdef void c_index_to_kmer(uint64_t, CHAR[:]) nogil cdef void c_revcomp(const CHAR[:], CHAR[:]) nogil diff --git a/src/gambit/_cython/kmers.pyx b/src/gambit/_cython/kmers.pyx index 3dacc70..022853d 100644 --- a/src/gambit/_cython/kmers.pyx +++ b/src/gambit/_cython/kmers.pyx @@ -14,7 +14,7 @@ def kmer_to_index(const CHAR[:] kmer): Convert k-mer byte string to its integer index. 
""" cdef: - np.uint64_t idx + uint64_t idx bint exc = False if kmer.shape[0] > 32: @@ -28,9 +28,9 @@ def kmer_to_index(const CHAR[:] kmer): return idx -cdef np.uint64_t c_kmer_to_index(const CHAR[:] kmer, bint *exc) nogil: +cdef uint64_t c_kmer_to_index(const CHAR[:] kmer, bint *exc) nogil: cdef: - np.uint64_t idx = 0 + uint64_t idx = 0 int i, k = kmer.shape[0] CHAR nuc @@ -61,7 +61,7 @@ def kmer_to_index_rc(const CHAR[:] kmer): Get the integer index of the reverse complement of a k-mer byte string. """ cdef: - np.uint64_t idx + uint64_t idx bint exc = False if kmer.shape[0] > 32: @@ -75,9 +75,9 @@ def kmer_to_index_rc(const CHAR[:] kmer): return idx -cdef np.uint64_t c_kmer_to_index_rc(const CHAR[:] kmer, bint *exc) nogil: +cdef uint64_t c_kmer_to_index_rc(const CHAR[:] kmer, bint *exc) nogil: cdef: - np.uint64_t idx = 0 + uint64_t idx = 0 int i, k = kmer.shape[0] CHAR nuc @@ -112,7 +112,7 @@ def index_to_kmer(index, int k): return bytes(buf) -cdef void c_index_to_kmer(np.uint64_t index, CHAR[:] out) nogil: +cdef void c_index_to_kmer(uint64_t index, CHAR[:] out) nogil: """Convert k-mer index to sequence.""" cdef: int k = out.shape[0] diff --git a/src/gambit/_cython/metric.pxd b/src/gambit/_cython/metric.pxd index 62d3f8e..125ec64 100644 --- a/src/gambit/_cython/metric.pxd +++ b/src/gambit/_cython/metric.pxd @@ -1,3 +1,4 @@ +from libc.stdint cimport intptr_t from .types cimport SCORE_T, BOUNDS_T, COORDS_T, COORDS_T_2 cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil diff --git a/src/gambit/_cython/metric.pyx b/src/gambit/_cython/metric.pyx index caebced..45f9074 100644 --- a/src/gambit/_cython/metric.pyx +++ b/src/gambit/_cython/metric.pyx @@ -1,15 +1,8 @@ """Cython functions for calculating k-mer distance metrics""" -cimport numpy as np -import numpy as np from cython.parallel import prange, parallel -# Numpy dtypes equivalent to SCORE_T and BOUNDS_T -SCORE_DTYPE = np.dtype(np.float32) -BOUNDS_DTYPE = np.dtype(np.intp) - - def 
jaccard(COORDS_T[:] coords1, COORDS_T_2[:] coords2): """Compute the Jaccard index between two k-mer sets in sparse coordinate format. @@ -76,15 +69,15 @@ cdef SCORE_T c_jaccarddist(COORDS_T[:] coords1, COORDS_T_2[:] coords2) nogil: cdef: # Lengths of the two arrays - np.intp_t N = coords1.shape[0] - np.intp_t M = coords2.shape[0] + intptr_t N = coords1.shape[0] + intptr_t M = coords2.shape[0] # Index and value of items in each array as we are iterating - np.intp_t i = 0, j = 0 + intptr_t i = 0, j = 0 COORDS_T a COORDS_T_2 b - np.intp_t u = 0 # Size of union + intptr_t u = 0 # Size of union # Iterate through both arrays simultaneously, advance index for the array # with the smaller value. Advance both if they are equal. Increment the @@ -136,7 +129,7 @@ def _jaccarddist_parallel(COORDS_T[:] query, COORDS_T_2[:] ref_coords, BOUNDS_T[ out : numpy.ndarray Pre-allocated array to write distances to. """ - cdef np.intp_t N = ref_bounds.shape[0] - 1 + cdef intptr_t N = ref_bounds.shape[0] - 1 cdef BOUNDS_T begin, end cdef int i diff --git a/src/gambit/_cython/threads.pyx b/src/gambit/_cython/threads.pyx index dd3c49d..d663efa 100644 --- a/src/gambit/_cython/threads.pyx +++ b/src/gambit/_cython/threads.pyx @@ -1,9 +1,10 @@ """OpenMP stuff.""" from cython import parallel +import array -import numpy as np -cimport numpy as np +cimport cython +from cpython cimport array cimport openmp @@ -25,18 +26,35 @@ def omp_get_max_threads(): return openmp.omp_get_max_threads() -def get_thread_ids(int num_threads): - """Run a multithreaded loop and get the thread ID running in each iteration.""" +@cython.boundscheck(True) +def get_thread_ids(int n): + """Run a multithreaded loop and get the thread ID running in each iteration. + + Used to check that Cython code parallelization is working correctly. Result should contain + integers from 0 to ``num_threads - 1``, repeated up to length ``n``. + + Parameters + ---------- + n: int + Size of loop. 
Make this at least as large as the expected number of threads. + + Returns + ------- + array.array + Array of size ``n`` containing the thread ID running in each loop iteration. + """ cdef: - np.ndarray[np.intp_t, ndim=1] thread_ids - np.intp_t thread_id = -1 + array.array thread_ids_arr = array.array('i') + int[:] thread_ids int i - thread_ids = np.full(num_threads, -1, dtype=np.intp) + for i in range(n): + thread_ids_arr.append(-1) + + thread_ids = thread_ids_arr - for i in parallel.prange(num_threads, nogil=True, schedule='static', chunksize=1): - thread_id = parallel.threadid() - thread_ids[i] = thread_id + for i in parallel.prange(n, nogil=True, schedule='static', chunksize=1): + thread_ids[i] = parallel.threadid() return thread_ids diff --git a/src/gambit/_cython/types.pxd b/src/gambit/_cython/types.pxd index 56992bb..43e9ad7 100644 --- a/src/gambit/_cython/types.pxd +++ b/src/gambit/_cython/types.pxd @@ -1,28 +1,29 @@ """Shared typedefs.""" -cimport numpy as np +from libc.stdint cimport int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, intptr_t # Type for similarity scores -ctypedef np.float32_t SCORE_T +ctypedef float SCORE_T # Type for bounds on c_jaccard_coords_col -ctypedef np.intp_t BOUNDS_T +# This should be equal to Numpy's intp dtype +ctypedef intptr_t BOUNDS_T # Fused type for storing k-mer coordinates/indices ctypedef fused COORDS_T: - np.int16_t - np.uint16_t - np.int32_t - np.uint32_t - np.int64_t - np.uint64_t + int16_t + uint16_t + int32_t + uint32_t + int64_t + uint64_t # Copy of COORDS_T, used when two arguments have types in this set but may be different than each other. 
ctypedef fused COORDS_T_2: - np.int16_t - np.uint16_t - np.int32_t - np.uint32_t - np.int64_t - np.uint64_t + int16_t + uint16_t + int32_t + uint32_t + int64_t + uint64_t diff --git a/src/gambit/metric.py b/src/gambit/metric.py index 2b0f292..b7027c3 100644 --- a/src/gambit/metric.py +++ b/src/gambit/metric.py @@ -5,13 +5,18 @@ import numpy as np -from gambit._cython.metric import BOUNDS_DTYPE, SCORE_DTYPE, jaccard, jaccarddist, \ - _jaccarddist_parallel -from gambit.sigs.base import KmerSignature, SignatureArray, AbstractSignatureArray, SignatureList +from gambit._cython.metric import jaccard, jaccarddist, _jaccarddist_parallel +from gambit.sigs.base import KmerSignature, SignatureArray, AbstractSignatureArray, SignatureList, \ + BOUNDS_DTYPE from gambit.util.misc import chunk_slices from gambit.util.progress import get_progress +#: Numpy dtype for output of Cython Jaccard distance calculation code +# Equivalent to SCORE_T in types.pxd +SCORE_DTYPE = np.dtype(np.float32) + + def jaccard_generic(set1: Iterable, set2: Iterable) -> float: """Get the Jaccard index of of two arbitrary sets. 
diff --git a/src/gambit/sigs/__init__.py b/src/gambit/sigs/__init__.py index 017470b..fd43484 100644 --- a/src/gambit/sigs/__init__.py +++ b/src/gambit/sigs/__init__.py @@ -1,4 +1,4 @@ """Calculate and store collections of k-mer signatures.""" from .base import KmerSignature, SignatureArray, SignatureList, sigarray_eq, SignaturesMeta,\ - AnnotatedSignatures, dump_signatures, load_signatures + AnnotatedSignatures, dump_signatures, load_signatures, BOUNDS_DTYPE diff --git a/src/gambit/sigs/base.py b/src/gambit/sigs/base.py index 5ca383c..0712edf 100644 --- a/src/gambit/sigs/base.py +++ b/src/gambit/sigs/base.py @@ -5,7 +5,6 @@ from attr import attrs, attrib from gambit.kmers import KmerSpec -from gambit._cython.metric import BOUNDS_DTYPE from gambit.util.indexing import AdvancedIndexingMixin from gambit.util.io import FilePath @@ -15,6 +14,12 @@ # TODO - use nptyping package to specify dimensions and data type? +#: Preferred Numpy dtype for :attr:`.ConcatenatedSignatureArray.bounds`. Can be used in parallelized +#: Cython metric calculation code without conversion. +# Equivalent to BOUNDS_T in types.pxd +BOUNDS_DTYPE = np.dtype(np.intp) + + def sigarray_eq(a1: Sequence[KmerSignature], a2: Sequence[KmerSignature]) -> bool: """Check two sequences of sparse k-mer signatures for equality. 
diff --git a/src/gambit/sigs/hdf5.py b/src/gambit/sigs/hdf5.py index c095f40..b52f9da 100644 --- a/src/gambit/sigs/hdf5.py +++ b/src/gambit/sigs/hdf5.py @@ -7,9 +7,8 @@ import h5py as h5 from .base import SignatureArray, ConcatenatedSignatureArray, AbstractSignatureArray, SignaturesMeta,\ - ReferenceSignatures, SignaturesFileError + ReferenceSignatures, SignaturesFileError, BOUNDS_DTYPE from gambit.kmers import KmerSpec -from gambit._cython.metric import BOUNDS_DTYPE from gambit.util.io import FilePath diff --git a/tests/test_metric.py b/tests/test_metric.py index 25b775d..1b16e1b 100644 --- a/tests/test_metric.py +++ b/tests/test_metric.py @@ -6,9 +6,9 @@ import numpy as np from gambit.metric import jaccard, jaccarddist, jaccard_bits, jaccard_generic, jaccarddist_array, \ - jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE, BOUNDS_DTYPE + jaccarddist_matrix, jaccarddist_pairwise, num_pairs, SCORE_DTYPE from gambit.sigs.calc import sparse_to_dense -from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures +from gambit.sigs import SignatureArray, SignatureList, dump_signatures, load_signatures, BOUNDS_DTYPE from gambit.kmers import KmerSpec from gambit.util.progress import check_progress from .common import make_signatures