diff --git a/adelie/data.py b/adelie/data.py index 80f44418..7c802da7 100644 --- a/adelie/data.py +++ b/adelie/data.py @@ -151,7 +151,7 @@ def create_snp_unphased( n : int Number of data points. p : int - Number of features. + Number of SNPs. sparsity : float, optional Proportion of :math:`\\beta` entries to be zeroed out. Default is ``0.95``. @@ -227,4 +227,124 @@ def create_snp_unphased( "groups": groups, "group_sizes": group_sizes, "penalty": penalty, + } + + +def create_snp_phased_ancestry( + n: int, + s: int, + A: int, + *, + sparsity: float =0.95, + one_ratio: float =0.25, + two_ratio: float =0.05, + zero_penalty: float =0, + snr: float =1, + seed: int =0, +): + """Creates a SNP Unphased dataset. + + The data matrix ``X`` is a phased version of a matrix with sparsity ratio ``1 - one_ratio - two_ratio`` + where ``one_ratio`` of the entries are randomly set to ``1`` + and ``two_ratio`` are randomly set to ``2``. + The response ``y`` is generated from a linear model :math:`y = X\\beta + \\epsilon` + with :math:`\\epsilon \\sim N(0, \\sigma^2 I_n)` + and :math:`\\beta` such that ``sparsity`` proportion of the entries are set to :math:`0`. + We compute :math:`\\sigma^2` such that the signal-to-noise ratio is given by ``snr``. + The penalty factors are by default set to ``np.sqrt(group_sizes)``, + however if ``zero_penalty > 0``, a random set of penalties will be set to zero, + in which case, ``penalty`` is rescaled such that the :math:`\\ell_2` norm squared is ``p``. + + Parameters + ---------- + n : int + Number of data points. + s : int + Number of SNPs. + A : int + Number of ancestries. + sparsity : float, optional + Proportion of :math:`\\beta` entries to be zeroed out. + Default is ``0.95``. + one_ratio : float, optional + Proportion of the entries of ``X`` that is set to ``1``. + Default is ``0.25``. + two_ratio : float, optional + Proportion of the entries of ``X`` that is set to ``2``. + Default is ``0.05``. + zero_penalty : float, optional + Proportion of ``penalty`` entries to be zeroed out. + Default is ``0``. + snr : float, optional + Signal-to-noise ratio. + Default is ``1``. + seed : int, optional + Random seed. + Default is ``0``. + + Returns + ------- + data : dict + A dictionary containing the generated data: + + - ``"X"``: feature matrix. + - ``"y"``: response vector. + - ``"groups"``: mapping of group index to the starting column index of ``X``. + - ``"group_sizes"``: mapping of group index to the group size. + - ``"penalty"``: penalty factor for each group index. + """ + assert n >= 1 + assert s >= 1 + assert A >= 1 + assert sparsity >= 0 and sparsity <= 1 + assert one_ratio >= 0 and one_ratio <= 1 + assert two_ratio >= 0 and two_ratio <= 1 + assert zero_penalty >= 0 and zero_penalty <= 1 + assert snr > 0 + assert seed >= 0 + + np.random.seed(seed) + + nnz_ratio = one_ratio + two_ratio + one_ratio = one_ratio / nnz_ratio + two_ratio = two_ratio / nnz_ratio + nnz = int(nnz_ratio * n * s) + nnz_indices = np.random.choice(n * s, nnz, replace=False) + nnz_indices = np.random.permutation(nnz_indices) + one_indices = nnz_indices[:int(one_ratio * nnz)] + two_indices = nnz_indices[int(one_ratio * nnz):] + one_indices = 2 * one_indices + np.random.binomial(1, 0.5, len(one_indices)) + two_indices *= 2 + X = np.zeros((n, 2 * s), dtype=np.int8) + X.ravel()[one_indices] = 1 + X.ravel()[two_indices] = 1 + X.ravel()[two_indices + 1] = 1 + ancestries = np.zeros((n, 2 * s), dtype=np.int8) + ancestries.ravel()[one_indices] = np.random.choice(A, len(one_indices), replace=True) + ancestries.ravel()[two_indices] = np.random.choice(A, len(two_indices), replace=True) + ancestries.ravel()[two_indices + 1] = np.random.choice(A, len(two_indices), replace=True) + + groups = np.arange(s) + group_sizes = np.full(s, A) + + penalty = np.sqrt(group_sizes) + penalty[np.random.choice(s, int(zero_penalty * s), replace=False)] = 0 + penalty /= np.linalg.norm(penalty) / np.sqrt(s * A) + + beta = np.random.normal(0, 1, 2 * s) + beta_nnz_indices = np.random.choice(2 * s, int((1-sparsity) * s * 2), replace=False) + X_sub = X[:, beta_nnz_indices] + beta_sub = beta[beta_nnz_indices] + + signal_var = np.dot(beta_sub, np.dot(np.cov(X_sub.T), beta_sub)) + noise_scale = np.sqrt(signal_var / snr) + y = X_sub @ beta_sub + noise_scale * np.random.normal(0, 1, n) + + return { + "X": X, + "ancestries": ancestries, + "y": y, + "groups": groups, + "group_sizes": group_sizes, + "penalty": penalty, } \ No newline at end of file diff --git a/adelie/io.py b/adelie/io.py index 827e931a..a663be04 100644 --- a/adelie/io.py +++ b/adelie/io.py @@ -2,20 +2,10 @@ from .adelie_core import io as core_io -class snp_unphased: - """IO handler for SNP Unphased data. - - Parameters - ---------- - filename : str - File name to either read from or write to related to the SNP data. - """ - def __init__( - self, - filename, - ): - self._core = core_io.IOSNPUnphased(filename) - +class snp_base: + def __init__(self, core): + self._core = core + def endian(self): """Gets the endianness used in the file. @@ -50,6 +40,47 @@ def cols(self): """ return self._core.cols() + def read(self): + """Read and load the matrix from file. + + Returns + ------- + total_bytes : int + Number of bytes read. + """ + return self._core.read() + + def to_dense(self, n_threads: int =1): + """Creates a dense matrix. + + Parameters + ---------- + n_threads : int, optional + Number of threads. + Default is ``1``. + + Returns + ------- + dense : (n, p) np.ndarray + Dense matrix. + """ + return self._core.to_dense(n_threads) + + +class snp_unphased(snp_base): + """IO handler for SNP Unphased data. + + Parameters + ---------- + filename : str + File name to either read from or write to related to the SNP data. + """ + def __init__( + self, + filename, + ): + super().__init__(core_io.IOSNPUnphased(filename)) + def outer(self): """Gets the outer indexing vector. @@ -105,43 +136,121 @@ def value(self, j: int): """ return self._core.value(j) - def to_dense(self, n_threads: int =1): - """Creates a dense matrix. + def write( + self, + calldata: np.ndarray, + n_threads: int =1, + ): + """Write dense array to the file in special format. Parameters ---------- + calldata : (n, p) np.ndarray + SNP unphased calldata in dense format. n_threads : int, optional Number of threads. Default is ``1``. Returns ------- - dense : (n, p) np.ndarray - Dense matrix. + total_bytes : int + Number of bytes written. """ - return self._core.to_dense(n_threads) + return self._core.write(calldata, n_threads) - def read(self): - """Read and load the matrix from file. + +class snp_phased_ancestry(snp_base): + """IO handler for SNP Phased Ancestry data. + + Parameters + ---------- + filename : str + File name to either read from or write to related to the SNP data. + """ + def __init__( + self, + filename, + ): + super().__init__(core_io.IOSNPPhasedAncestry(filename)) + + def outer(self): + """Gets the outer indexing vector. Returns ------- - total_bytes : int - Number of bytes read. + outer : (2*s+1,) np.ndarray + Outer indexing vector. """ - return self._core.read() + return self._core.outer() + + def nnz(self, j: int, hap: int): + """Gets the number of non-zero entries at SNP and haplotype. + + Parameters + ---------- + j : int + SNP index. + hap : int + Haplotype for the corresponding SNP. + + Returns + ------- + nnz : int + Number of non-zero entries at SNP ``j`` and haplotype ``hap``. + """ + return self._core.nnz(j, hap) + + def inner(self, j: int, hap: int): + """Gets the inner indexing vector at SNP and haplotype. + + Parameters + ---------- + j : int + SNP index. + hap : int + Haplotype for the corresponding SNP. + + Returns + ------- + inner : np.ndarray + Inner indexing vector at SNP ``j`` and haplotype ``hap``. + """ + return self._core.inner(j, hap) + + def ancestry(self, j: int, hap: int): + """Gets the value vector at a SNP and haplotype. + + Parameters + ---------- + j : int + SNP index. + hap : int + Haplotype for the corresponding SNP. + + Returns + ------- + v : np.ndarray + Ancestry vector at SNP ``j`` and haplotype ``hap``. + """ + return self._core.ancestry(j, hap) def write( self, calldata: np.ndarray, + ancestries: np.ndarray, + A: int, n_threads: int =1, ): - """Write dense array to the file in special format. + """Write dense arrays to the file in special format. Parameters ---------- - calldata : (n, p) np.ndarray - SNP unphased calldata in dense format. + calldata : (n, 2*s) np.ndarray + SNP phased calldata in dense format. + ancestries : (n, 2*s) np.ndarray + Ancestry in dense format. + A : int + Number of ancestries. n_threads : int, optional Number of threads. Default is ``1``. @@ -151,4 +260,5 @@ def write( total_bytes : int Number of bytes written. """ - return self._core.write(calldata, n_threads) + return self._core.write(calldata, ancestries, A, n_threads) + diff --git a/adelie/matrix.py b/adelie/matrix.py index 178abea9..3e647866 100644 --- a/adelie/matrix.py +++ b/adelie/matrix.py @@ -132,6 +132,53 @@ def __init__( return _snp_unphased(filenames, n_threads) +def snp_phased_ancestry( + filenames: list, + *, + n_threads: int =1, + dtype: np.float32 | np.float64 =np.float64, +): + """Creates a SNP phased ancestry matrix. + + .. note:: + This matrix only works for naive method! + + Parameters + ---------- + filenames : list + List of file names that contain column-block slices of the matrix. + n_threads : int, optional + Number of threads. + Default is ``1``. + dtype : Union[np.float32, np.float64], optional + Underlying value type. + Default is ``np.float64``. + + Returns + ------- + wrap + Wrapper matrix object. + """ + if n_threads < 1: + raise ValueError("Number of threads must be >= 1.") + + dispatcher = { + np.float64: core.matrix.MatrixNaiveSNPPhasedAncestry64, + np.float32: core.matrix.MatrixNaiveSNPPhasedAncestry32, + } + core_base = dispatcher[dtype] + + class _snp_phased_ancestry(core_base): + def __init__( + self, + filenames: list, + n_threads: int, + ): + core_base.__init__(self, filenames, n_threads) + + return _snp_phased_ancestry(filenames, n_threads) + + def cov_lazy( mat: np.ndarray, *, diff --git a/adelie/src/include/adelie_core/io/io_snp_base.hpp b/adelie/src/include/adelie_core/io/io_snp_base.hpp new file mode 100644 index 00000000..0666cd1c --- /dev/null +++ b/adelie/src/include/adelie_core/io/io_snp_base.hpp @@ -0,0 +1,105 @@ +#pragma once +#include +#include +#include +#include +#include + +namespace adelie_core { +namespace io { + +class IOSNPBase +{ +public: + using string_t = std::string; + using file_unique_ptr_t = std::unique_ptr< + std::FILE, + std::function + >; + using bool_t = bool; + using buffer_t = util::rowvec_type; + +protected: + const string_t _filename; + buffer_t _buffer; + bool_t _is_read; + + static void throw_no_read() + { + throw std::runtime_error( + "File is not read yet. Call read() first." + ); + } + + static auto fopen_safe( + const char* filename, + const char* mode + ) + { + file_unique_ptr_t file_ptr( + std::fopen(filename, mode), + [](std::FILE* fp) { std::fclose(fp); } + ); + auto fp = file_ptr.get(); + if (!fp) { + throw std::runtime_error("Cannot open file " + std::string(filename)); + } + return file_ptr; + } + + static bool is_big_endian() + { + union { + uint32_t i; + char c[4]; + } _bint = {0x01020304}; + + return _bint.c[0] == 1; + } + +public: + IOSNPBase( + const string_t& filename + ): + _filename(filename), + _buffer(), + _is_read(false) + {} + + bool_t endian() const { + if (!_is_read) throw_no_read(); + return reinterpret_cast(_buffer[0]); + } + + size_t read() + { + _is_read = true; + + auto file_ptr = fopen_safe(_filename.c_str(), "rb"); + auto fp = file_ptr.get(); + std::fseek(fp, 0, SEEK_END); + const size_t total_bytes = std::ftell(fp); + + _buffer.resize(total_bytes); + std::fseek(fp, 0, SEEK_SET); + const size_t read = std::fread(_buffer.data(), sizeof(char), _buffer.size(), fp); + if (read != _buffer.size()) { + throw std::runtime_error( + "Could not read the whole file into buffer." + ); + } + + if (endian() != is_big_endian()) { + throw std::runtime_error( + "Endianness is inconsistent! " + "Regenerate the file on a machine with the same endianness." + ); + } + + return total_bytes; + } + +}; + +} // namespace io +} // namespace adelie_core \ No newline at end of file diff --git a/adelie/src/include/adelie_core/io/io_snp_phased_ancestry.hpp b/adelie/src/include/adelie_core/io/io_snp_phased_ancestry.hpp new file mode 100644 index 00000000..0f0844dd --- /dev/null +++ b/adelie/src/include/adelie_core/io/io_snp_phased_ancestry.hpp @@ -0,0 +1,203 @@ +#pragma once +#include + +namespace adelie_core { +namespace io { + +class IOSNPPhasedAncestry : public IOSNPBase +{ +public: + using base_t = IOSNPBase; + using outer_t = uint64_t; + using inner_t = uint32_t; + using value_t = int8_t; + using vec_outer_t = util::rowvec_type; + using vec_inner_t = util::rowvec_type; + using vec_value_t = util::rowvec_type; + using rowarr_value_t = util::rowarr_type; + +protected: + static constexpr size_t _multiplier = ( + sizeof(inner_t) + + sizeof(value_t) + ); + + using base_t::_buffer; + using base_t::_filename; + using base_t::_is_read; + +public: + using base_t::base_t; + using base_t::read; + + inner_t rows() const { + if (!_is_read) throw_no_read(); + constexpr size_t idx = sizeof(bool_t); + return reinterpret_cast(_buffer[idx]); + } + + inner_t snps() const { + if (!_is_read) throw_no_read(); + constexpr size_t idx = sizeof(bool_t) + sizeof(inner_t); + return reinterpret_cast(_buffer[idx]); + } + + value_t ancestries() const + { + if (!_is_read) throw_no_read(); + constexpr size_t idx = sizeof(bool_t) + 2 * sizeof(inner_t); + return reinterpret_cast(_buffer[idx]); + } + + inner_t cols() const + { + return snps() * ancestries(); + } + + Eigen::Ref outer() const + { + if (!_is_read) throw_no_read(); + constexpr size_t idx = sizeof(bool_t) + 2 * sizeof(inner_t) + sizeof(value_t); + return Eigen::Map( + reinterpret_cast(&_buffer[idx]), + 2 * snps() + 1 + ); + } + + inner_t nnz(int j, bool haplotype) const + { + if (!_is_read) throw_no_read(); + const auto _outer = outer(); + j = 2 * j + haplotype; + return (_outer[j+1] - _outer[j]) / _multiplier; + } + + Eigen::Ref inner(int j, bool haplotype) const + { + if (!_is_read) throw_no_read(); + const auto _outer = outer(); + return Eigen::Map( + reinterpret_cast(&_buffer[_outer[2 * j + haplotype]]), + nnz(j, haplotype) + ); + } + + Eigen::Ref ancestry(int j, bool haplotype) const + { + if (!_is_read) throw_no_read(); + const auto _outer = outer(); + const auto _nnz = nnz(j, haplotype); + return Eigen::Map( + reinterpret_cast(&_buffer[ + _outer[2 * j + haplotype] + sizeof(inner_t) * _nnz + ]), + _nnz + ); + } + + rowarr_value_t to_dense( + size_t n_threads + ) const + { + if (!_is_read) throw_no_read(); + const auto n = rows(); + const auto s = snps(); + const auto A = ancestries(); + rowarr_value_t dense(n, s * A); + + #pragma omp parallel for schedule(auto) num_threads(n_threads) + for (inner_t j = 0; j < s; ++j) { + auto dense_j = dense.middleCols(A * j, A); + dense_j.setZero(); + for (char hap = 0; hap < 2; ++hap) { + const auto _inner = inner(j, hap); + const auto _ancestry = ancestry(j, hap); + for (inner_t i = 0; i < _inner.size(); ++i) { + dense_j(_inner[i], _ancestry[i]) += 1; + } + } + } + + return dense; + } + + size_t write( + const Eigen::Ref& calldata, + const Eigen::Ref& ancestries, + size_t A, + size_t n_threads + ) + { + const bool_t endian = is_big_endian(); + const inner_t n = calldata.rows(); + const inner_t s = calldata.cols() / 2; + + // outer[i] = number of bytes to jump from beginning of file + // to start reading column i information. + // outer[i+1] - outer[i] = total number of bytes for column i. + vec_outer_t outer(2*s + 1); + outer[0] = 0; + outer.tail(2*s) = (calldata != 0).colwise().count().template cast(); + for (int i = 1; i < outer.size(); ++i) { + outer[i] += outer[i-1]; + } + outer *= _multiplier; + outer += ( + sizeof(bool_t) + + 2 * sizeof(inner_t) + + sizeof(value_t) + + outer.size() * sizeof(outer_t) + ); + + auto& buffer = _buffer; + buffer.resize(outer[2*s]); + + size_t idx = 0; + reinterpret_cast(buffer[idx]) = endian; idx += sizeof(bool_t); + reinterpret_cast(buffer[idx]) = n; idx += sizeof(inner_t); + reinterpret_cast(buffer[idx]) = s; idx += sizeof(inner_t); + reinterpret_cast(buffer[idx]) = A; idx += sizeof(value_t); + Eigen::Map( + reinterpret_cast(&buffer[idx]), + outer.size() + ) = outer; + + #pragma omp parallel for schedule(auto) num_threads(n_threads) + for (inner_t j = 0; j < calldata.cols(); ++j) { + const auto col_j = calldata.col(j); + const auto ancestry_j = ancestries.col(j); + const auto nnz_bytes = outer[j+1] - outer[j]; + const auto nnz = nnz_bytes / _multiplier; + Eigen::Map inner( + reinterpret_cast(&buffer[outer[j]]), + nnz + ); + Eigen::Map ancestry( + reinterpret_cast(&buffer[outer[j] + sizeof(inner_t) * nnz]), + nnz + ); + + size_t count = 0; + for (int i = 0; i < n; ++i) { + if (col_j[i] == 0) continue; + inner[count] = i; + ancestry[count] = ancestry_j(i); + ++count; + } + } + + auto file_ptr = fopen_safe(_filename.c_str(), "wb"); + auto fp = file_ptr.get(); + auto total_bytes = std::fwrite(buffer.data(), sizeof(char), buffer.size(), fp); + if (total_bytes != buffer.size()) { + throw std::runtime_error( + "Could not write the full buffer." + ); + } + + return total_bytes; + } +}; + +} // namespace io +} // namespace adelie_core \ No newline at end of file diff --git a/adelie/src/include/adelie_core/io/io_snp_unphased.hpp b/adelie/src/include/adelie_core/io/io_snp_unphased.hpp index 331ed9c7..121ae9bb 100644 --- a/adelie/src/include/adelie_core/io/io_snp_unphased.hpp +++ b/adelie/src/include/adelie_core/io/io_snp_unphased.hpp @@ -1,31 +1,20 @@ #pragma once -#include -#include -#include -#include -#include -#include +#include namespace adelie_core { namespace io { -class IOSNPUnphased +class IOSNPUnphased : public IOSNPBase { public: - using string_t = std::string; - using file_unique_ptr_t = std::unique_ptr< - std::FILE, - std::function - >; - using bool_t = bool; + using base_t = IOSNPBase; using outer_t = uint64_t; using inner_t = uint32_t; using value_t = int8_t; using vec_outer_t = util::rowvec_type; using vec_inner_t = util::rowvec_type; using vec_value_t = util::rowvec_type; - using buffer_t = util::rowvec_type; - using rowarr_value_t = util::rowarr_type; + using rowarr_value_t = util::rowarr_type; protected: static constexpr size_t _multiplier = ( @@ -33,68 +22,26 @@ class IOSNPUnphased sizeof(value_t) ); - const string_t _filename; - buffer_t _buffer; - bool_t _is_read; - - static void throw_no_read() - { - throw std::runtime_error( - "File is not read yet. Call read() first." - ); - } - - static auto fopen_safe( - const char* filename, - const char* mode - ) - { - file_unique_ptr_t file_ptr( - std::fopen(filename, mode), - [](std::FILE* fp) { std::fclose(fp); } - ); - auto fp = file_ptr.get(); - if (!fp) { - throw std::runtime_error("Cannot open file " + std::string(filename)); - } - return file_ptr; - } - - static bool is_big_endian() - { - union { - uint32_t i; - char c[4]; - } _bint = {0x01020304}; - - return _bint.c[0] == 1; - } + using base_t::_buffer; + using base_t::_filename; + using base_t::_is_read; public: - IOSNPUnphased( - const string_t& filename - ): - _filename(filename), - _buffer(), - _is_read(false) - {} - - bool_t endian() const { - if (!_is_read) throw_no_read(); - return reinterpret_cast(_buffer[0]); - } + using base_t::base_t; + using base_t::read; inner_t rows() const { if (!_is_read) throw_no_read(); return reinterpret_cast(_buffer[sizeof(bool_t)]); } - inner_t cols() const - { + inner_t snps() const { if (!_is_read) throw_no_read(); return reinterpret_cast(_buffer[sizeof(bool_t) + sizeof(inner_t)]); } + inner_t cols() const { return snps(); } + Eigen::Ref outer() const { if (!_is_read) throw_no_read(); @@ -155,35 +102,6 @@ class IOSNPUnphased return dense; } - size_t read() - { - _is_read = true; - - auto file_ptr = fopen_safe(_filename.c_str(), "rb"); - auto fp = file_ptr.get(); - std::fseek(fp, 0, SEEK_END); - const size_t total_bytes = std::ftell(fp); - - _buffer.resize(total_bytes); - std::fseek(fp, 0, SEEK_SET); - const size_t read = std::fread(_buffer.data(), sizeof(char), _buffer.size(), fp); - if (read != _buffer.size()) { - throw std::runtime_error( - "Could not read the whole file into buffer." - ); - } - - bool endian = _buffer[0]; - if (endian != is_big_endian()) { - throw std::runtime_error( - "Endianness is inconsistent! " - "Regenerate the file on a machine with the same endianness." - ); - } - - return total_bytes; - } - size_t write( const Eigen::Ref& calldata, size_t n_threads diff --git a/adelie/src/include/adelie_core/matrix/matrix_naive_snp_base.hpp b/adelie/src/include/adelie_core/matrix/matrix_naive_snp_base.hpp new file mode 100644 index 00000000..b10af2ab --- /dev/null +++ b/adelie/src/include/adelie_core/matrix/matrix_naive_snp_base.hpp @@ -0,0 +1,99 @@ +#pragma once +#include + +namespace adelie_core { +namespace matrix { + +class MatrixNaiveSNPBase +{ +public: + using vec_index_t = util::rowvec_type; + using string_t = std::string; + using dyn_vec_string_t = std::vector; + +protected: + const dyn_vec_string_t _filenames; // (F,) array of file names + const size_t _n_threads; // number of threads + + template + static auto init_ios( + const dyn_vec_string_t& filenames + ) + { + VecIOType ios; + ios.reserve(filenames.size()); + for (int i = 0; i < filenames.size(); ++i) { + ios.emplace_back(filenames[i]); + ios.back().read(); + } + return ios; + } + + template + static auto init_snps( + const VecIOType& ios + ) + { + size_t snps = 0; + for (const auto& io : ios) { + snps += io.snps(); + } + return snps; + } + + template + static auto init_io_slice_map( + const VecIOType& ios, + size_t snps + ) + { + vec_index_t io_slice_map(snps); + size_t begin = 0; + for (int i = 0; i < ios.size(); ++i) { + const auto& io = ios[i]; + const auto si = io.snps(); + for (int j = 0; j < si; ++j) { + io_slice_map[begin + j] = i; + } + begin += si; + } + return io_slice_map; + } + + template + static auto init_io_index_map( + const VecIOType& ios, + size_t snps + ) + { + vec_index_t io_index_map(snps); + size_t begin = 0; + for (int i = 0; i < ios.size(); ++i) { + const auto& io = ios[i]; + const auto si = io.snps(); + for (int j = 0; j < si; ++j) { + io_index_map[begin + j] = j; + } + begin += si; + } + return io_index_map; + } + +public: + MatrixNaiveSNPBase( + const dyn_vec_string_t& filenames, + size_t n_threads + ): + _filenames(filenames), + _n_threads(n_threads) + { + if (filenames.size() == 0) { + throw std::runtime_error( + "filenames must be non-empty!" + ); + } + } +}; + +} // namespace matrix +} // namespace adelie_core \ No newline at end of file diff --git a/adelie/src/include/adelie_core/matrix/matrix_naive_snp_phased_ancestry.hpp b/adelie/src/include/adelie_core/matrix/matrix_naive_snp_phased_ancestry.hpp new file mode 100644 index 00000000..bd36fc3b --- /dev/null +++ b/adelie/src/include/adelie_core/matrix/matrix_naive_snp_phased_ancestry.hpp @@ -0,0 +1,240 @@ +#pragma once +#include +#include +#include +#include + +namespace adelie_core { +namespace matrix { + +template +class MatrixNaiveSNPPhasedAncestry : + public MatrixNaiveBase, + public MatrixNaiveSNPBase +{ +public: + using base_t = MatrixNaiveBase; + using snp_base_t = MatrixNaiveSNPBase; + using typename base_t::value_t; + using typename base_t::vec_value_t; + using typename base_t::vec_index_t; + using typename base_t::colmat_value_t; + using typename base_t::rowmat_value_t; + using typename base_t::sp_mat_value_t; + using typename snp_base_t::string_t; + using typename snp_base_t::dyn_vec_string_t; + using io_t = io::IOSNPPhasedAncestry; + using dyn_vec_io_t = std::vector; + +protected: + using snp_base_t::init_ios; + using snp_base_t::init_snps; + using snp_base_t::init_io_slice_map; + using snp_base_t::init_io_index_map; + + const dyn_vec_io_t _ios; // (F,) array of IO handlers + const size_t _snps; // total number of SNPs across all slices + const vec_index_t _io_slice_map; // (s,) array mapping to matrix slice + const vec_index_t _io_index_map; // (s,) array mapping to (relative) index of the slice + + static void throw_bad_start_index(int j, int A) + { + throw std::runtime_error( + "Bad starting index " + std::to_string(j) + + " with ancestries " + std::to_string(A) + ); + } + + static void throw_bad_size(int q, int A) + { + throw std::runtime_error( + "Bad size " + std::to_string(q) + + " with ancestries " + std::to_string(A) + ); + } + +public: + MatrixNaiveSNPPhasedAncestry( + const dyn_vec_string_t& filenames, + size_t n_threads + ): + snp_base_t(filenames, n_threads), + _ios(init_ios(filenames)), + _snps(init_snps(_ios)), + _io_slice_map(init_io_slice_map(_ios, _snps)), + _io_index_map(init_io_index_map(_ios, _snps)) + { + // make sure every file has the same number of rows. + const size_t rows = _ios[0].rows(); + for (const auto& io : _ios) { + if (io.rows() != rows) { + throw std::runtime_error( + "Every slice must have same number of rows." + ); + } + } + + const size_t A = _ios[0].ancestries(); + for (const auto& io : _ios) { + if (io.ancestries() != A) { + throw std::runtime_error( + "Every slice must have same number of ancestries." + ); + } + } + } + + auto ancestries() const { return _ios[0].ancestries(); } + + value_t cmul( + int j, + const Eigen::Ref& v + ) const override + { + const auto A = ancestries(); + const auto snp = j / A; + const auto anc = j % A; + const auto slice = _io_slice_map[snp]; + const auto& io = _ios[slice]; + const auto index = _io_index_map[snp]; + + value_t sum = 0; + for (int hap = 0; hap < 2; ++hap) { + const auto inner = io.inner(index, hap); + const auto ancestry = io.ancestry(index, hap); + for (int i = 0; i < inner.size(); ++i) { + if (ancestry[i] != anc) continue; + sum += v[inner[i]]; + } + } + + return sum; + } + + void ctmul( + int j, + value_t v, + const Eigen::Ref& weights, + Eigen::Ref out + ) const override + { + const auto A = ancestries(); + const auto snp = j / A; + const auto anc = j % A; + const auto slice = _io_slice_map[snp]; + const auto& io = _ios[slice]; + const auto index = _io_index_map[snp]; + + dvzero(out, _n_threads); + for (int hap = 0; hap < 2; ++hap) { + const auto inner = io.inner(index, hap); + const auto ancestry = io.ancestry(index, hap); + for (int i = 0; i < inner.size(); ++i) { + if (ancestry[i] != anc) continue; + out[inner[i]] += v * weights[inner[i]]; + } + } + } + + void bmul( + int j, int q, + const Eigen::Ref& v, + Eigen::Ref out + ) override + { + const auto A = ancestries(); + out.setZero(); + + int begin = 0; + while (begin < q) + { + const auto snp = (j + begin) / A; + const auto slice = _io_slice_map[snp]; + const auto& io = _ios[slice]; + const auto index = _io_index_map[snp]; + const auto ancestry_lower = (j + begin) % A; + const auto ancestry_upper = std::min(ancestry_lower + q, A); + + if (ancestry_lower == 0 && ancestry_upper == A) { + for (int hap = 0; hap < 2; ++hap) { + const auto inner = io.inner(index, hap); + const auto ancestry = io.ancestry(index, hap); + for (int i = 0; i < inner.size(); ++i) { + out[begin+ancestry[i]] += v[inner[i]]; + } + } + } else { + for (int hap = 0; hap < 2; ++hap) { + const auto inner = io.inner(index, hap); + const auto ancestry = io.ancestry(index, hap); + for (int i = 0; i < inner.size(); ++i) { + if (ancestry[i] < ancestry_lower || ancestry[i] >= ancestry_upper) continue; + out[begin+ancestry[i]-ancestry_lower] += v[inner[i]]; + } + } + } + + begin += ancestry_upper - ancestry_lower; + } + } + + void btmul( + int j, int q, + const Eigen::Ref& v, + const Eigen::Ref& weights, + Eigen::Ref out + ) override + { + // TODO + } + + void mul( + const Eigen::Ref& v, + Eigen::Ref out + ) override + { + // TODO + } + + void cov( + int j, int q, + const Eigen::Ref& sqrt_weights, + Eigen::Ref out, + Eigen::Ref buffer + ) const override + { + // TODO + } + + int rows() const override { return _ios[0].rows(); } + int cols() const override { return _snps * ancestries(); } + + void sp_btmul( + int j, int q, + const sp_mat_value_t& v, + const Eigen::Ref& weights, + Eigen::Ref out + ) const override + { + // TODO + } + + void to_dense( + int j, int q, + Eigen::Ref out + ) const override + { + // TODO + } + + void means( + const Eigen::Ref& weights, + Eigen::Ref out + ) const override + { + // TODO + } +}; + +} // namespace matrix +} // namespace adelie_core \ No newline at end of file diff --git a/adelie/src/include/adelie_core/matrix/matrix_naive_snp_unphased.hpp b/adelie/src/include/adelie_core/matrix/matrix_naive_snp_unphased.hpp index cc11a573..80d40e24 100644 --- a/adelie/src/include/adelie_core/matrix/matrix_naive_snp_unphased.hpp +++ b/adelie/src/include/adelie_core/matrix/matrix_naive_snp_unphased.hpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include @@ -10,107 +10,46 @@ namespace adelie_core { namespace matrix { template -class MatrixNaiveSNPUnphased : public MatrixNaiveBase +class MatrixNaiveSNPUnphased : + public MatrixNaiveBase, + public MatrixNaiveSNPBase { public: using base_t = MatrixNaiveBase; + using snp_base_t = MatrixNaiveSNPBase; using typename base_t::value_t; using typename base_t::vec_value_t; using typename base_t::vec_index_t; using typename base_t::colmat_value_t; using typename base_t::rowmat_value_t; using typename base_t::sp_mat_value_t; + using typename snp_base_t::string_t; + using typename snp_base_t::dyn_vec_string_t; using io_t = io::IOSNPUnphased; - using string_t = std::string; - using dyn_vec_string_t = std::vector; using dyn_vec_io_t = std::vector; protected: - const dyn_vec_string_t _filenames; // (F,) array of file names - const dyn_vec_io_t _ios; // (F,) array of IO handlers - const size_t _p; // total number of feature across all slices - const vec_index_t _io_slice_map; // (p,) array mapping to matrix slice - const vec_index_t _io_index_map; // (p,) array mapping to (relative) index of the slice - const size_t _n_threads; // number of threads - - static auto init_ios( - const dyn_vec_string_t& filenames - ) - { - dyn_vec_io_t ios; - ios.reserve(filenames.size()); - for (int i = 0; i < filenames.size(); ++i) { - ios.emplace_back(filenames[i]); - ios.back().read(); - } - return ios; - } + using snp_base_t::init_ios; + using snp_base_t::init_snps; + using snp_base_t::init_io_slice_map; + using snp_base_t::init_io_index_map; - static auto init_p( - const dyn_vec_io_t& ios - ) - { - size_t p = 0; - for (const auto& io : ios) { - p += io.cols(); - } - return p; - } - - static auto init_io_slice_map( - const dyn_vec_io_t& ios, - size_t p - ) - { - vec_index_t io_slice_map(p); - size_t begin = 0; - for (int i = 0; i < ios.size(); ++i) { - const auto& io = ios[i]; - const auto pi = io.cols(); - for (int j = 0; j < pi; ++j) { - io_slice_map[begin + j] = i; - } - begin += pi; - } - return io_slice_map; - } - - static auto init_io_index_map( - const dyn_vec_io_t& ios, - size_t p - ) - { - vec_index_t io_index_map(p); - size_t begin = 0; - for (int i = 0; i < ios.size(); ++i) { - const auto& io = ios[i]; - const auto pi = io.cols(); - for (int j = 0; j < pi; ++j) { - io_index_map[begin + j] = j; - } - begin += pi; - } - return io_index_map; - } + const dyn_vec_io_t _ios; // (F,) array of IO handlers + const size_t _snps; // total number of SNPs across all slices + const vec_index_t _io_slice_map; // (s,) array mapping to matrix slice + const vec_index_t _io_index_map; // (s,) array mapping to (relative) index of the slice public: MatrixNaiveSNPUnphased( const dyn_vec_string_t& filenames, size_t n_threads ): - _filenames(filenames), - _ios(init_ios(filenames)), - _p(init_p(_ios)), - _io_slice_map(init_io_slice_map(_ios, _p)), - _io_index_map(init_io_index_map(_ios, _p)), - _n_threads(n_threads) + snp_base_t(filenames, n_threads), + _ios(init_ios(filenames)), + _snps(init_snps(_ios)), + _io_slice_map(init_io_slice_map(_ios, _snps)), + _io_index_map(init_io_index_map(_ios, _snps)) { - if (filenames.size() == 0) { - throw std::runtime_error( - "filenames must be non-empty!" - ); - } - // make sure every file has the same number of rows. const size_t rows = _ios[0].rows(); for (const auto& io : _ios) { @@ -249,7 +188,7 @@ class MatrixNaiveSNPUnphased : public MatrixNaiveBase } int rows() const override { return _ios[0].rows(); } - int cols() const override { return _p; } + int cols() const override { return _snps; } void sp_btmul( int j, int q, diff --git a/adelie/src/io.cpp b/adelie/src/io.cpp index 4f9a94d8..647af788 100644 --- a/adelie/src/io.cpp +++ b/adelie/src/io.cpp @@ -1,25 +1,40 @@ #include "decl.hpp" #include +#include namespace py = pybind11; namespace ad = adelie_core; +void io_snp_base(py::module_& m) +{ + using io_t = ad::io::IOSNPBase; + using string_t = typename io_t::string_t; + py::class_(m, "IOSNPBase") + .def(py::init(), + py::arg("filename") + ) + .def("endian", &io_t::endian) + .def("read", &io_t::read) + ; +} + void io_snp_unphased(py::module_& m) { using io_t = ad::io::IOSNPUnphased; - py::class_(m, "IOSNPUnphased") - .def(py::init(), + using base_t = typename io_t::base_t; + using string_t = typename io_t::string_t; + py::class_(m, "IOSNPUnphased") + .def(py::init(), py::arg("filename") ) - .def("endian", &io_t::endian) .def("rows", &io_t::rows) + .def("snps", &io_t::snps) .def("cols", &io_t::cols) .def("outer", &io_t::outer) .def("nnz", &io_t::nnz) .def("inner", &io_t::inner) .def("value", &io_t::value) .def("to_dense", &io_t::to_dense) - .def("read", &io_t::read) .def("write", &io_t::write, py::arg("calldata").noconvert(), py::arg("n_threads") @@ -27,7 +42,36 @@ void io_snp_unphased(py::module_& m) ; } +void io_snp_phased_ancestry(py::module_& m) +{ + using io_t = ad::io::IOSNPPhasedAncestry; + using base_t = typename io_t::base_t; + using string_t = typename io_t::string_t; + py::class_(m, "IOSNPPhasedAncestry") + .def(py::init(), + py::arg("filename") + ) + .def("rows", &io_t::rows) + .def("snps", &io_t::snps) + .def("ancestries", &io_t::ancestries) + .def("cols", &io_t::cols) + .def("outer", &io_t::outer) + .def("nnz", &io_t::nnz) + .def("inner", &io_t::inner) + .def("ancestry", &io_t::ancestry) + .def("to_dense", &io_t::to_dense) + .def("write", &io_t::write, + py::arg("calldata").noconvert(), + py::arg("ancestries").noconvert(), + py::arg("A"), + py::arg("n_threads") + ) + ; +} + void register_io(py::module_& m) { + io_snp_base(m); io_snp_unphased(m); + io_snp_phased_ancestry(m); } \ No newline at end of file diff --git a/adelie/src/matrix.cpp b/adelie/src/matrix.cpp index 1da25801..924f36f9 100644 --- a/adelie/src/matrix.cpp +++ b/adelie/src/matrix.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace py = pybind11; namespace ad = adelie_core; @@ -475,6 +476,24 @@ void matrix_naive_snp_unphased(py::module_& m, const char* name) ; } +template +void matrix_naive_snp_phased_ancestry(py::module_& m, const char* name) +{ + using internal_t = ad::matrix::MatrixNaiveSNPPhasedAncestry; + using base_t = typename internal_t::base_t; + using dyn_vec_string_t = typename internal_t::dyn_vec_string_t; + py::class_(m, name) + .def( + py::init< + const dyn_vec_string_t&, + size_t + >(), + py::arg("filenames").noconvert(), + py::arg("n_threads") + ) + ; +} + template void matrix_cov_dense(py::module_& m, const char* name) { @@ -524,6 +543,8 @@ void register_matrix(py::module_& m) matrix_naive_snp_unphased(m, "MatrixNaiveSNPUnphased64"); matrix_naive_snp_unphased(m, "MatrixNaiveSNPUnphased32"); + matrix_naive_snp_phased_ancestry(m, "MatrixNaiveSNPPhasedAncestry64"); + matrix_naive_snp_phased_ancestry(m, "MatrixNaiveSNPPhasedAncestry32"); /* cov matrices */ matrix_cov_dense>(m, "MatrixCovDense64C"); diff --git a/adelie/state.py b/adelie/state.py index 949816a8..9765f29f 100644 --- a/adelie/state.py +++ b/adelie/state.py @@ -781,8 +781,7 @@ def pin_naive( See Also -------- - adelie.state.pin_naive_64 - adelie.state.pin_naive_32 + adelie.adelie_core.state.StatePinNaive64 """ if not ( isinstance(X, matrix.MatrixNaiveBase64) or @@ -1062,8 +1061,7 @@ def pin_cov( See Also -------- - adelie.state.pin_cov_64 - adelie.state.pin_cov_32 + adelie.adelie_core.state.StatePinCov64 """ if not ( isinstance(A, matrix.MatrixCovBase64) or @@ -1819,8 +1817,7 @@ def basil_naive( See Also -------- - adelie.state.basil_naive_64 - adelie.state.basil_naive_32 + adelie.adelie_core.state.StateBasilNaive64 """ if not ( isinstance(X, matrix.MatrixNaiveBase64) or diff --git a/docs/sphinx/api_reference.rst b/docs/sphinx/api_reference.rst index a834752a..50859b19 100644 --- a/docs/sphinx/api_reference.rst +++ b/docs/sphinx/api_reference.rst @@ -72,6 +72,7 @@ adelie.io snp_unphased + snp_phased_ancestry adelie.matrix @@ -135,12 +136,10 @@ Internal :toctree: generated/ - matrix.MatrixCovBase32 + adelie_core.state.StateBasilNaive64 + adelie_core.state.StatePinBase64 + adelie_core.state.StatePinCov64 + adelie_core.state.StatePinNaive64 matrix.MatrixCovBase64 - matrix.MatrixNaiveBase32 matrix.MatrixNaiveBase64 - state.base - adelie_core.state.StateBasilBase32 - adelie_core.state.StateBasilBase64 - adelie_core.state.StatePinBase32 - adelie_core.state.StatePinBase64 + state.base \ No newline at end of file diff --git a/docs/sphinx/notebooks/snp_io.ipynb b/docs/sphinx/notebooks/snp_io.ipynb index e4d46576..e8146312 100644 --- a/docs/sphinx/notebooks/snp_io.ipynb +++ b/docs/sphinx/notebooks/snp_io.ipynb @@ -222,26 +222,158 @@ "# os.remove(filename)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## __SNP Phased Ancestry__" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "n = 100\n", + "s = 2000\n", + "A = 8\n", + "n_threads = os.cpu_count() // 2\n", + "data = ad.data.create_snp_phased_ancestry(n, s, A)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "filename = \"/tmp/dummy_spa.snpdat\"\n", + "handler = ad.io.snp_phased_ancestry(filename)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.91 s, sys: 241 ms, total: 2.15 s\n", + "Wall time: 679 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "353200018" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "handler.write(data[\"X\"], data[\"ancestries\"], n_threads=n_threads)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 0 ns, sys: 98.1 ms, total: 98.1 ms\n", + "Wall time: 97.9 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "353200018" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%%time\n", + "handler.read()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'little-endian'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "handler.endian()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0],\n", + " ...,\n", + " [0, 0, 0, ..., 1, 0, 0],\n", + " [0, 0, 1, ..., 0, 0, 0],\n", + " [0, 0, 0, ..., 0, 0, 0]], dtype=int8)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "handler.to_dense()" + ] + }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([0, 0, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4])" + "160" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "x = np.arange(5)\n", - "r = np.array([2, 1, 3, 5, 1])\n", - "np.repeat(x, r)" + "handler.cols()" ] } ], diff --git a/tests/test_io.py b/tests/test_io.py index 15f25469..4fd784c4 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -58,3 +58,59 @@ def _test(n, p, seed=0): _test(200, 32) _test(2000, 3000) _test(1421, 927) + + +def test_io_snp_phased_ancestry(): + def create_dense(calldata, ancestries, A): + n, s = calldata.shape[0], calldata.shape[1] // 2 + dense = np.zeros((n, s * A), dtype=np.int8) + base_indices = A * np.arange(n * s, dtype=int)[None] + dense.ravel()[ + base_indices + + ancestries.reshape(n, s, 2)[:,:,0].ravel() + ] += calldata.reshape(n, s, 2)[:,:,0].ravel() + dense.ravel()[ + base_indices + + ancestries.reshape(n, s, 2)[:,:,1].ravel() + ] += calldata.reshape(n, s, 2)[:,:,1].ravel() + return dense + + def _test(n, s, A, seed=0): + data = ad.data.create_snp_phased_ancestry(n, s, A, seed=seed) + calldata = data["X"] + ancestries = data["ancestries"] + dense = create_dense(calldata, ancestries, A) + + filename = "/tmp/dummy_snp_phased_ancestry.snpdat" + handler = ad.io.snp_phased_ancestry(filename) + w_bytes = handler.write(calldata, ancestries, A, n_threads=8) + r_bytes = handler.read() + os.remove(filename) + + assert w_bytes == r_bytes + assert handler.rows() == n + assert handler.cols() == s * A + + outer = handler.outer() + nnzs = np.array([handler.nnz(j, h) for j in range(s) for h in range(2)]) + inners = [handler.inner(j, h) for j in range(s) for h in range(2)] + my_ancestries = [handler.ancestry(j, h) for j in range(s) for h in range(2)] + + assert np.allclose((outer[1:] - outer[:-1]) / 5, nnzs) + for j in range(calldata.shape[-1]): + assert np.allclose( + np.arange(n)[calldata[:, j] != 0], + inners[j], + ) + assert np.allclose( + ancestries[:, j][calldata[:, j] != 0], + my_ancestries[j], + ) + + my_dense = handler.to_dense() + assert np.allclose(my_dense, dense) + + _test(1, 1, 1) + _test(200, 32, 4) + _test(2000, 3000, 7) + _test(1421, 927, 8) \ No newline at end of file diff --git a/tests/test_matrix.py b/tests/test_matrix.py index 10c52497..9dd169b4 100644 --- a/tests/test_matrix.py +++ b/tests/test_matrix.py @@ -200,4 +200,55 @@ def _test(n, p, n_files, dtype, seed=0): for dtype in dtypes: _test(10, 20, 3, dtype) _test(1, 13, 3, dtype) - _test(144, 1, 3, dtype) \ No newline at end of file + _test(144, 1, 3, dtype) + + +def test_snp_phased_ancestry(): + # TODO: this was copied from test_io.py + def create_dense(calldata, ancestries, A): + n, s = calldata.shape[0], calldata.shape[1] // 2 + dense = np.zeros((n, s * A), dtype=np.int8) + base_indices = A * np.arange(n * s, dtype=int)[None] + dense.ravel()[ + base_indices + + ancestries.reshape(n, s, 2)[:,:,0].ravel() + ] += calldata.reshape(n, s, 2)[:,:,0].ravel() + dense.ravel()[ + base_indices + + ancestries.reshape(n, s, 2)[:,:,1].ravel() + ] += calldata.reshape(n, s, 2)[:,:,1].ravel() + return dense + + def _test(n, s, A, n_files, dtype, seed=0): + np.random.seed(seed) + datas = [ + ad.data.create_snp_phased_ancestry(n, s, A, seed=seed+i) + for i in range(n_files) + ] + filenames = [ + f"/tmp/test_snp_phased_ancestry_{i}.snpdat" + for i in range(n_files) + ] + for i in range(n_files): + handler = ad.io.snp_phased_ancestry(filenames[i]) + handler.write(datas[i]["X"], datas[i]["ancestries"], A) + cX = mod.snp_phased_ancestry( + filenames=filenames, + dtype=dtype, + n_threads=15, + ) + for f in filenames: + os.remove(f) + + X = np.concatenate([ + create_dense(data["X"], data["ancestries"], A) + for data in datas + ], axis=-1, dtype=np.int8) + run_naive(X, cX, dtype) + + + dtypes = [np.float64, np.float32] + for dtype in dtypes: + _test(10, 20, 4, 3, dtype) + _test(1, 13, 3, 3, dtype) + _test(144, 1, 2, 3, dtype) \ No newline at end of file