Finish SNP Unphased

JamesYang007 · Nov 8, 2023 · 7f74499 · 7f74499
1 parent 6bdbc7b
commit 7f74499
Showing 24 changed files with 1,379 additions and 522 deletions.
diff --git a/adelie/__init__.py b/adelie/__init__.py
@@ -2,6 +2,7 @@
 from . import bcd
 from . import data
 from . import diagnostic
+from . import io
 from . import matrix
 from . import optimization
 from . import research

diff --git a/adelie/data.py b/adelie/data.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 
-def create_test_data_basil(
+def create_dense(
     n: int, 
     p: int, 
     G: int,
@@ -13,7 +13,7 @@ def create_test_data_basil(
     snr: float = 1,
     seed: int =0,
 ):
-    """Creates a test dataset for BASIL method.
+    """Creates a dense dataset.
 
     The groups and group sizes are generated randomly
     such that ``G`` groups are created and the sum of the group sizes is ``p``.
@@ -71,6 +71,7 @@ def create_test_data_basil(
     assert rho >= -1 / (p-1)
     assert (0 <= sparsity) & (sparsity <= 1)
     assert (0 <= zero_penalty) & (zero_penalty <= 1)
+    assert snr > 0
     assert seed >= 0
 
     np.random.seed(seed)
@@ -99,13 +100,125 @@ def create_test_data_basil(
     X = np.asfortranarray(X)
 
     beta = np.random.normal(0, 1, p)
-    beta[np.random.choice(p, int(sparsity * p), replace=False)] = 0
+    beta_zero_indices = np.random.choice(p, int(sparsity * p), replace=False)
+    beta_nnz_indices = np.array(list(set(np.arange(p)) - set(beta_zero_indices)))
+    X_sub = X[:, beta_nnz_indices]
+    beta_sub = beta[beta_nnz_indices]
 
     noise_scale = np.sqrt(
-        (rho * np.sum(beta) ** 2 + (1-rho) * np.sum(beta ** 2))
+        (rho * np.sum(beta_sub) ** 2 + (1-rho) * np.sum(beta_sub ** 2))
         / snr
     )
-    y = X @ beta + noise_scale * np.random.normal(0, 1, n)
+    y = X_sub @ beta_sub + noise_scale * np.random.normal(0, 1, n)
+
+    return {
+        "X": X, 
+        "y": y,
+        "groups": groups,
+        "group_sizes": group_sizes,
+        "penalty": penalty,
+    }
+
+
+def create_snp_unphased(
+    n: int, 
+    p: int, 
+    *,
+    sparsity: float =0.95,
+    one_ratio: float =0.25,
+    two_ratio: float =0.05,
+    zero_penalty: float =0,
+    snr: float =1,
+    seed: int =0,
+):
+    """Creates a SNP Unphased dataset.
+
+    This dataset is only used for lasso, so ``groups`` is simply each individual feature
+    and ``group_sizes`` is a vector of ones.
+    The data matrix ``X`` has sparsity ratio ``1 - one_ratio - two_ratio``
+    where ``one_ratio`` of the entries are randomly set to ``1``
+    and ``two_ratio`` are randomly set to ``2``.
+    The response ``y`` is generated from a linear model :math:`y = X\\beta + \\epsilon`
+    with :math:`\\epsilon \\sim N(0, \\sigma^2 I_n)` 
+    and :math:`\\beta` such that ``sparsity`` proportion of the entries are set to :math:`0`.
+    We compute :math:`\\sigma^2` such that the signal-to-noise ratio is given by ``snr``.
+    The penalty factors are by default set to ``np.sqrt(group_sizes)``,
+    however if ``zero_penalty > 0``, a random set of penalties will be set to zero,
+    in which case, ``penalty`` is rescaled such that the :math:`\\ell_2` norm squared is ``p``.
+
+    Parameters
+    ----------
+    n : int
+        Number of data points.
+    p : int
+        Number of features.
+    sparsity : float, optional
+        Proportion of :math:`\\beta` entries to be zeroed out.
+        Default is ``0.95``.
+    one_ratio : float, optional
+        Proportion of the entries of ``X`` that is set to ``1``.
+        Default is ``0.25``.
+    two_ratio : float, optional
+        Proportion of the entries of ``X`` that is set to ``2``.
+        Default is ``0.05``.
+    zero_penalty : float, optional
+        Proportion of ``penalty`` entries to be zeroed out.
+        Default is ``0``.
+    snr : float, optional
+        Signal-to-noise ratio.
+        Default is ``1``.
+    seed : int, optional
+        Random seed.
+        Default is ``0``.
+
+    Returns
+    -------
+    data : dict
+        A dictionary containing the generated data:
+        
+            - ``"X"``: feature matrix.
+            - ``"y"``: response vector.
+            - ``"groups"``: mapping of group index to the starting column index of ``X``.
+            - ``"group_sizes"``: mapping of group index to the group size.
+            - ``"penalty"``: penalty factor for each group index.
+    """
+    assert n >= 1
+    assert p >= 1
+    assert sparsity >= 0 and sparsity <= 1
+    assert one_ratio >= 0 and one_ratio <= 1
+    assert two_ratio >= 0 and two_ratio <= 1
+    assert zero_penalty >= 0 and zero_penalty <= 1
+    assert snr > 0
+    assert seed >= 0
+
+    np.random.seed(seed)
+
+    nnz_ratio = one_ratio + two_ratio
+    one_ratio = one_ratio / nnz_ratio
+    two_ratio = two_ratio / nnz_ratio
+    nnz = int(nnz_ratio * n * p)
+    nnz_indices = np.random.choice(n * p, nnz, replace=False)
+    one_indices = np.random.choice(nnz_indices, int(one_ratio * nnz), replace=False)
+    two_indices = np.array(list(set(nnz_indices) - set(one_indices)))
+    X = np.zeros((n, p), dtype=np.int8)
+    X.ravel()[one_indices] = 1
+    X.ravel()[two_indices] = 2
+
+    groups = np.arange(p)
+    group_sizes = np.ones(p)
+
+    penalty = np.sqrt(group_sizes)
+    penalty[np.random.choice(p, int(zero_penalty * p), replace=False)] = 0
+    penalty /= np.linalg.norm(penalty) / np.sqrt(p)
+
+    beta = np.random.normal(0, 1, p)
+    beta_nnz_indices = np.random.choice(p, int((1-sparsity) * p), replace=False)
+    X_sub = X[:, beta_nnz_indices]
+    beta_sub = beta[beta_nnz_indices]
+
+    signal_var = np.dot(beta_sub, np.dot(np.cov(X_sub.T), beta_sub))
+    noise_scale = np.sqrt(signal_var / snr)
+    y = X_sub @ beta_sub + noise_scale * np.random.normal(0, 1, n)
 
     return {
         "X": X, 

diff --git a/adelie/io.py b/adelie/io.py
@@ -0,0 +1,154 @@
+import numpy as np
+from .adelie_core import io as core_io
+
+
+class snp_unphased:
+    """IO handler for SNP Unphased data.
+
+    Parameters
+    ----------
+    filename : str
+        File name to either read from or write to related to the SNP data.
+    """
+    def __init__(
+        self,
+        filename,
+    ):
+        self._core = core_io.IOSNPUnphased(filename)
+
+    def endian(self):
+        """Gets the endianness used in the file.
+
+        Returns
+        -------
+        endian : str
+            ``"big-endian"`` if big-endian and ``"little-endian"`` if little-endian.
+        """
+        return (
+            "big-endian" 
+            if self._core.endian() else
+            "little-endian"
+        )
+
+    def rows(self):
+        """Gets the number of rows of the matrix.
+
+        Returns
+        -------
+        rows : int
+            Number of rows.
+        """
+        return self._core.rows()
+
+    def cols(self):
+        """Gets the number of columns of the matrix.
+
+        Returns
+        -------
+        cols : int
+            Number of columns.
+        """
+        return self._core.cols()
+
+    def outer(self):
+        """Gets the outer indexing vector.
+
+        Returns
+        -------
+        outer : (p+1,) np.ndarray
+            Outer indexing vector.
+        """
+        return self._core.outer()
+
+    def nnz(self, j: int):
+        """Gets the number of non-zero entries at a column.
+
+        Parameters
+        ----------
+        j : int
+            Column index.
+
+        Returns
+        -------
+        nnz : int
+            Number of non-zero entries column ``j``.
+        """
+        return self._core.nnz(j)
+
+    def inner(self, j: int):
+        """Gets the inner indexing vector at a column.
+
+        Parameters
+        ----------
+        j : int
+            Column index.
+
+        Returns
+        -------
+        inner : np.ndarray
+            Inner indexing vector at column ``j``.
+        """
+        return self._core.inner(j)
+
+    def value(self, j: int):
+        """Gets the value vector at a column.
+
+        Parameters
+        ----------
+        j : int
+            Column index.
+
+        Returns
+        -------
+        v : np.ndarray
+            Value vector at column ``j``.
+        """
+        return self._core.value(j)
+
+    def to_dense(self, n_threads: int =1):
+        """Creates a dense matrix.
+
+        Parameters
+        ----------
+        n_threads : int, optional
+            Number of threads.
+            Default is ``1``.
+
+        Returns
+        -------
+        dense : (n, p) np.ndarray
+            Dense matrix.
+        """
+        return self._core.to_dense(n_threads)
+
+    def read(self):
+        """Read and load the matrix from file.
+
+        Returns
+        -------
+        total_bytes : int
+            Number of bytes read.
+        """
+        return self._core.read()
+
+    def write(
+        self, 
+        calldata: np.ndarray,
+        n_threads: int =1,
+    ):
+        """Write dense array to the file in special format.
+
+        Parameters
+        ----------
+        calldata : (n, p) np.ndarray
+            SNP unphased calldata in dense format.
+        n_threads : int, optional
+            Number of threads.
+            Default is ``1``.
+
+        Returns
+        -------
+        total_bytes : int
+            Number of bytes written.
+        """
+        return self._core.write(calldata, n_threads)
diff --git a/adelie/matrix.py b/adelie/matrix.py
@@ -8,17 +8,24 @@
 import numpy as np
 
 
-def naive_dense(
+def dense(
     mat: np.ndarray,
     *,
+    method: str,
     n_threads: int =1,
 ):
-    """Creates a viewer of a dense matrix for naive method.
+    """Creates a viewer of a dense matrix.
     
     Parameters
     ----------
     mat : np.ndarray
         The matrix to view.
+    method : str
+        Method type. It must be one of the following:
+
+            - ``"naive"``: naive method.
+            - ``"cov"``: covariance method.
+
     n_threads : int, optional
         Number of threads.
         Default is ``1``.
@@ -31,7 +38,7 @@ def naive_dense(
     if n_threads < 1:
         raise ValueError("Number of threads must be >= 1.")
 
-    dispatcher = {
+    naive_dispatcher = {
         np.dtype("float64"): {
             "C": core.matrix.MatrixNaiveDense64C,
             "F": core.matrix.MatrixNaiveDense64F,
@@ -41,15 +48,32 @@ def naive_dense(
             "F": core.matrix.MatrixNaiveDense32F,
         },
     }
+
+    cov_dispatcher = {
+        np.dtype("float64"): {
+            "C": core.matrix.MatrixCovDense64C,
+            "F": core.matrix.MatrixCovDense64F,
+        },
+        np.dtype("float32"): {
+            "C": core.matrix.MatrixCovDense32C,
+            "F": core.matrix.MatrixCovDense32F,
+        },
+    }
+
+    dispatcher = {
+        "naive" : naive_dispatcher,
+        "cov" : cov_dispatcher,
+    }
+
     dtype = mat.dtype
     order = (
         "C"
         if mat.flags.c_contiguous else
         "F"
     )
-    core_base = dispatcher[dtype][order]
+    core_base = dispatcher[method][dtype][order]
 
-    class _naive_dense(core_base):
+    class _dense(core_base):
         def __init__(
             self,
             mat: np.ndarray,
@@ -58,23 +82,30 @@ def __init__(
             self.mat = mat
             core_base.__init__(self, self.mat, n_threads)
 
-    return _naive_dense(mat, n_threads)
+    return _dense(mat, n_threads)
 
 
-def cov_dense(
-    mat: np.ndarray,
+def snp_unphased(
+    filenames: list,
     *,
     n_threads: int =1,
+    dtype: np.float32 | np.float64 =np.float64,
 ):
-    """Creates a viewer of a dense matrix for covariance method.
+    """Creates a SNP unphased matrix.
+
+    .. note::
+        This matrix only works for naive method!
     
     Parameters
     ----------
-    mat : np.ndarray
-        The matrix to view.
+    filenames : list
+        List of file names that contain column-block slices of the matrix.
     n_threads : int, optional
         Number of threads.
         Default is ``1``.
+    dtype : Union[np.float32, np.float64], optional
+        Underlying value type.
+        Default is ``np.float64``.
 
     Returns
     -------
@@ -85,46 +116,35 @@ def cov_dense(
         raise ValueError("Number of threads must be >= 1.")
 
     dispatcher = {
-        np.dtype("float64"): {
-            "C": core.matrix.MatrixCovDense64C,
-            "F": core.matrix.MatrixCovDense64F,
-        },
-        np.dtype("float32"): {
-            "C": core.matrix.MatrixCovDense32C,
-            "F": core.matrix.MatrixCovDense32F,
-        },
+        np.float64: core.matrix.MatrixNaiveSNPUnphased64,
+        np.float32: core.matrix.MatrixNaiveSNPUnphased32,
     }
+    core_base = dispatcher[dtype]
 
-    dtype = mat.dtype
-    order = (
-        "C"
-        if mat.flags.c_contiguous else
-        "F"
-    )
-    core_base = dispatcher[dtype][order]
-
-    class _cov_dense(core_base):
+    class _snp_unphased(core_base):
         def __init__(
             self,
-            mat: np.ndarray,
-            n_threads: int =1,
+            filenames: list,
+            n_threads: int,
         ):
-            self.mat = mat
-            core_base.__init__(self, self.mat, n_threads)
+            core_base.__init__(self, filenames, n_threads)
 
-    return _cov_dense(mat, n_threads)
+    return _snp_unphased(filenames, n_threads)
 
 
 def cov_lazy(
     mat: np.ndarray,
     *,
     n_threads: int =1,
 ):
-    """Creates a viewer of a lazy matrix for covariance method.
+    """Creates a viewer of a lazy covariance matrix.
     
+    .. note::
+        This matrix only works for covariance method!
+
     Parameters
     ----------
-    mat : np.ndarray
+    mat : (n, p) np.ndarray
         The data matrix from which to lazily compute the covariance.
     n_threads : int, optional
         Number of threads.

diff --git a/adelie/solver.py b/adelie/solver.py
@@ -76,10 +76,7 @@ def objective(
     )
 
 
-def solve_pin(
-    state,
-    logger=logger.logger,
-):
+def solve_pin(state):
     """Solves the pinned group elastic net problem.
 
     The pinned group elastic net problem is given by
@@ -119,7 +116,7 @@ def solve_pin(
 
     # raise any errors
     if out["error"] != "":
-        logger.warning(RuntimeError(out["error"]))
+        logger.logger.warning(RuntimeError(out["error"]))
 
     # return a subsetted Python result object
     core_state = out["state"]
@@ -128,10 +125,7 @@ def solve_pin(
     return state
 
 
-def solve_basil(
-    state,
-    logger=logger.logger,
-):
+def solve_basil(state):
     """Solves the group elastic net problem using BASIL.
 
     Parameters
@@ -162,7 +156,7 @@ def solve_basil(
 
     # raise any errors
     if out["error"] != "":
-        logger.error(RuntimeError(out["error"]))
+        logger.logger.warning(RuntimeError(out["error"]))
 
     # return a subsetted Python result object
     core_state = out["state"]
@@ -311,8 +305,7 @@ def grpnet(
         The type is the same as that of ``state``.
     """
     if isinstance(X, np.ndarray):
-        X_raw = X
-        X = ad.matrix.naive_dense(X_raw, n_threads=n_threads)
+        X = ad.matrix.dense(X, method="naive", n_threads=n_threads)
 
     assert (
         isinstance(X, matrix.MatrixNaiveBase64) or

diff --git a/adelie/src/adelie_core.cpp b/adelie/src/adelie_core.cpp
@@ -10,6 +10,9 @@ PYBIND11_MODULE(adelie_core, m) {
 
     auto m_bcd = m.def_submodule("bcd", "BCD submodule.");
     register_bcd(m_bcd);
+
+    auto m_io = m.def_submodule("io", "IO submodule.");
+    register_io(m_io);
 
     auto m_matrix = m.def_submodule("matrix", "Matrix submodule.");
     register_matrix(m_matrix);

diff --git a/adelie/src/decl.hpp b/adelie/src/decl.hpp
@@ -14,4 +14,5 @@ void register_bcd(py::module_&);
 void register_matrix(py::module_&);
 void register_optimization(py::module_&);
 void register_state(py::module_&);
-void register_solver(py::module_&);
+void register_solver(py::module_&);
+void register_io(py::module_&);
diff --git a/adelie/src/include/adelie_core/bcd.hpp b/adelie/src/include/adelie_core/bcd.hpp
@@ -201,9 +201,9 @@ auto root_lower_bound(
  * @param vbuffer1      vector containing L + l2.
  * @param v             any vector of same length as vbuffer1.
  * @param zero_tol      if a float is <= zero_tol, it is considered to be 0.
- * @return (h_max, vbuffer1_min_nzn)
+ * @return (h_max, vbuffer1_min_nnz)
  *  h_max: the upper bound
- *  vbuffer1_min_nzn:   smallest value in vbuffer1 among non-zero values based on zero_tol.
+ *  vbuffer1_min_nnz:   smallest value in vbuffer1 among non-zero values based on zero_tol.
  */
 template <class DiagType, class VType, class ValueType>
 inline
@@ -217,7 +217,7 @@ auto root_upper_bound(
 
     const value_t vbuffer1_min = vbuffer1.minCoeff();
 
-    value_t vbuffer1_min_nzn = std::numeric_limits<value_t>::infinity();
+    value_t vbuffer1_min_nnz = std::numeric_limits<value_t>::infinity();
     value_t h_max = 0;
 
     // If L+l2 have entries <= threshold,
@@ -230,15 +230,15 @@ auto root_upper_bound(
             const bool is_nonzero = vbuffer1[i] > zero_tol;
             const auto vi2 = v[i] * v[i];
             h_max += is_nonzero ? vi2 / (vbuffer1[i] * vbuffer1[i]) : 0;
-            vbuffer1_min_nzn = is_nonzero ? std::min(vbuffer1_min_nzn, vbuffer1[i]) : vbuffer1_min_nzn;
+            vbuffer1_min_nnz = is_nonzero ? std::min(vbuffer1_min_nnz, vbuffer1[i]) : vbuffer1_min_nnz;
         }
         h_max = std::sqrt(h_max);
     } else {
-        vbuffer1_min_nzn = vbuffer1_min;
+        vbuffer1_min_nnz = vbuffer1_min;
         h_max = (v / vbuffer1).matrix().norm();
     }
 
-    return std::make_pair(h_max, vbuffer1_min_nzn);
+    return std::make_pair(h_max, vbuffer1_min_nnz);
 }
 
 template <class ValueType, class DiagType, class VType>
@@ -453,7 +453,7 @@ void newton_abs_solver(
         const value_t h_min = root_lower_bound(vbuffer1, v, l1);
         const auto h_max_out = root_upper_bound(vbuffer1, v, l1);
         const value_t h_max = std::get<0>(h_max_out);
-        const value_t vbuffer1_min_nzn = std::get<1>(h_max_out);
+        const value_t vbuffer1_min_nnz = std::get<1>(h_max_out);
 
         value_t h;
 
@@ -471,7 +471,7 @@ void newton_abs_solver(
 
             const auto ada_bisect = [&]() {
                 // enforce some movement towards h_min for safety.
-                w = std::max<value_t>(l1 / (vbuffer1_min_nzn * h_cand + l1), 0.05);
+                w = std::max<value_t>(l1 / (vbuffer1_min_nnz * h_cand + l1), 0.05);
                 h_cand = w * h_min + (1-w) * h_cand;
                 fh = root_function(h_cand, vbuffer1, v, l1);
             };
@@ -567,21 +567,21 @@ void newton_abs_debug_solver(
 
         // compute h_max
         const value_t vbuffer1_min = vbuffer1.minCoeff();
-        value_t vbuffer1_min_nzn = vbuffer1_min;
+        value_t vbuffer1_min_nnz = vbuffer1_min;
 
         // If L+l2 have entries <= threshold,
         // find h_max with more numerically-stable routine.
         // There is NO guarantee that f(h_max) <= 0, 
         // but we will use this to bisect and find an h where f(h) >= 0,
         // so we don't necessarily need h_max to be f(h_max) <= 0.
         if (vbuffer1_min <= 1e-10) {
-            vbuffer1_min_nzn = std::numeric_limits<value_t>::infinity();
+            vbuffer1_min_nnz = std::numeric_limits<value_t>::infinity();
             h_max = 0;
             for (int i = 0; i < vbuffer1.size(); ++i) {
                 const bool is_nonzero = vbuffer1[i] > 1e-10;
                 const auto vi2 = v[i] * v[i];
                 h_max += is_nonzero ? vi2 / (vbuffer1[i] * vbuffer1[i]) : 0;
-                vbuffer1_min_nzn = is_nonzero ? std::min<value_t>(vbuffer1_min_nzn, vbuffer1[i]) : vbuffer1_min_nzn;
+                vbuffer1_min_nnz = is_nonzero ? std::min<value_t>(vbuffer1_min_nnz, vbuffer1[i]) : vbuffer1_min_nnz;
             }
             h_max = std::sqrt(h_max);
         } else {
@@ -598,14 +598,14 @@ void newton_abs_debug_solver(
             //// NEW METHOD: bisection
             // Adaptive method enforces some movement towards h_min for safety.
 
-            value_t w = std::max<value_t>(l1 / (vbuffer1_min_nzn * h_max + l1), 0.05);
+            value_t w = std::max<value_t>(l1 / (vbuffer1_min_nnz * h_max + l1), 0.05);
             h = w * h_min + (1-w) * h_max;
             value_t fh = (v / (vbuffer1 * h + l1)).matrix().squaredNorm() - 1;
 
             smart_iters.push_back(h);
 
             while ((fh < 0) && std::abs(fh) > tol) {
-                w = std::max<value_t>(l1 / (vbuffer1_min_nzn * h + l1), 0.05);
+                w = std::max<value_t>(l1 / (vbuffer1_min_nnz * h + l1), 0.05);
                 h = w * h_min + (1-w) * h;
                 fh = (v / (vbuffer1 * h + l1)).matrix().squaredNorm() - 1;
                 smart_iters.push_back(h);

diff --git a/adelie/src/include/adelie_core/io/io_snp_unphased.hpp b/adelie/src/include/adelie_core/io/io_snp_unphased.hpp
@@ -0,0 +1,263 @@
+#pragma once
+#include <string>
+#include <cstdio>
+#include <functional>
+#include <memory>
+#include <iostream>
+#include <adelie_core/util/types.hpp>
+
+namespace adelie_core {
+namespace io {
+
+class IOSNPUnphased
+{
+public:
+    using string_t = std::string;
+    using file_unique_ptr_t = std::unique_ptr<
+        std::FILE, 
+        std::function<void(std::FILE*)>
+    >;
+    using bool_t = bool;
+    using outer_t = uint64_t;
+    using inner_t = uint32_t;
+    using value_t = int8_t;
+    using vec_outer_t = util::rowvec_type<outer_t>;
+    using vec_inner_t = util::rowvec_type<inner_t>;
+    using vec_value_t = util::rowvec_type<value_t>;
+    using buffer_t = util::rowvec_type<char>;
+    using rowarr_value_t = util::rowarr_type<int8_t>;
+
+protected:
+    static constexpr size_t _multiplier = (
+        sizeof(inner_t) + 
+        sizeof(value_t)
+    );
+
+    const string_t _filename;
+    buffer_t _buffer;
+    bool_t _is_read;
+
+    static void throw_no_read() 
+    {
+        throw std::runtime_error(
+            "File is not read yet. Call read() first."
+        );
+    }
+
+    static auto fopen_safe(
+        const char* filename,
+        const char* mode
+    )
+    {
+        file_unique_ptr_t file_ptr(
+            std::fopen(filename, mode),
+            [](std::FILE* fp) { std::fclose(fp); }
+        );
+        auto fp = file_ptr.get();
+        if (!fp) {
+            throw std::runtime_error("Cannot open file " + std::string(filename));
+        }
+        return file_ptr;
+    }
+
+    static bool is_big_endian() 
+    {
+        union {
+            uint32_t i;
+            char c[4];
+        } _bint = {0x01020304};
+
+        return _bint.c[0] == 1;
+    }
+
+public:
+    IOSNPUnphased(
+        const string_t& filename
+    ):
+        _filename(filename),
+        _buffer(),
+        _is_read(false)
+    {}
+
+    bool_t endian() const { 
+        if (!_is_read) throw_no_read();
+        return reinterpret_cast<const bool_t&>(_buffer[0]); 
+    }
+
+    inner_t rows() const {
+        if (!_is_read) throw_no_read();
+        return reinterpret_cast<const inner_t&>(_buffer[sizeof(bool_t)]);
+    }
+
+    inner_t cols() const
+    {
+        if (!_is_read) throw_no_read();
+        return reinterpret_cast<const inner_t&>(_buffer[sizeof(bool_t) + sizeof(inner_t)]);
+    }
+
+    Eigen::Ref<const vec_outer_t> outer() const
+    {
+        if (!_is_read) throw_no_read();
+        return Eigen::Map<const vec_outer_t>(
+            reinterpret_cast<const outer_t*>(&_buffer[sizeof(bool_t) + 2 * sizeof(inner_t)]),
+            cols() + 1
+        );
+    }
+
+    inner_t nnz(int j) const
+    {
+        if (!_is_read) throw_no_read();
+        const auto _outer = outer();
+        return (_outer[j+1] - _outer[j]) / _multiplier;
+    }
+
+    Eigen::Ref<const vec_inner_t> inner(int j) const 
+    {
+        if (!_is_read) throw_no_read();
+        const auto _outer = outer();
+        return Eigen::Map<const vec_inner_t>(
+            reinterpret_cast<const inner_t*>(&_buffer[_outer[j]]),
+            nnz(j)
+        );
+    }
+
+    Eigen::Ref<const vec_value_t> value(int j) const 
+    {
+        if (!_is_read) throw_no_read();
+        const auto _outer = outer();
+        const auto _nnz = nnz(j);
+        return Eigen::Map<const vec_value_t>(
+            reinterpret_cast<const value_t*>(&_buffer[_outer[j] + sizeof(inner_t) * _nnz]),
+            _nnz
+        );
+    }
+
+    rowarr_value_t to_dense(
+        size_t n_threads
+    ) const
+    {
+        if (!_is_read) throw_no_read();
+        const auto n = rows();
+        const auto p = cols();
+        rowarr_value_t dense(n, p);
+
+        n_threads = std::min<size_t>(n_threads, p);
+        #pragma omp parallel for schedule(auto) num_threads(n_threads)
+        for (inner_t j = 0; j < p; ++j) {
+            const auto _inner = inner(j);
+            const auto _value = value(j);
+            auto dense_j = dense.col(j);
+            dense_j.setZero();
+            for (inner_t i = 0; i < _inner.size(); ++i) {
+                dense_j[_inner[i]] = _value[i];
+            }
+        }
+
+        return dense;
+    }
+
+    size_t read()
+    {
+        _is_read = true;
+
+        auto file_ptr = fopen_safe(_filename.c_str(), "rb");
+        auto fp = file_ptr.get();
+        std::fseek(fp, 0, SEEK_END);
+        const size_t total_bytes = std::ftell(fp);
+
+        _buffer.resize(total_bytes);
+        std::fseek(fp, 0, SEEK_SET);
+        const size_t read = std::fread(_buffer.data(), sizeof(char), _buffer.size(), fp);
+        if (read != _buffer.size()) {
+            throw std::runtime_error(
+                "Could not read the whole file into buffer."
+            );
+        }
+
+        bool endian = _buffer[0];
+        if (endian != is_big_endian()) {
+            throw std::runtime_error(
+                "Endianness is inconsistent! "
+                "Regenerate the file on a machine with the same endianness."
+            );
+        }
+
+        return total_bytes;
+    }
+
+    size_t write(
+        const Eigen::Ref<const rowarr_value_t>& calldata,
+        size_t n_threads
+    )
+    {
+        const bool_t endian = is_big_endian();
+        const inner_t n = calldata.rows();
+        const inner_t p = calldata.cols();
+
+        // outer[i] = number of bytes to jump from beginning of file 
+        // to start reading column i information.
+        // outer[i+1] - outer[i] = total number of bytes for column i. 
+        vec_outer_t outer(p+1); 
+        outer[0] = 0;
+        outer.tail(p) = (calldata != 0).colwise().count().template cast<outer_t>();
+        for (int i = 1; i < outer.size(); ++i) {
+            outer[i] += outer[i-1];
+        }
+        outer *= _multiplier;
+        outer += (
+            sizeof(bool_t) +
+            2 * sizeof(inner_t) +
+            outer.size() * sizeof(outer_t)
+        );
+
+        auto& buffer = _buffer;
+        buffer.resize(outer[p]);
+
+        size_t idx = 0;
+        reinterpret_cast<bool_t&>(buffer[idx]) = endian; idx += sizeof(bool_t);
+        reinterpret_cast<inner_t&>(buffer[idx]) = n; idx += sizeof(inner_t);
+        reinterpret_cast<inner_t&>(buffer[idx]) = p; idx += sizeof(inner_t);
+        Eigen::Map<vec_outer_t>(
+            reinterpret_cast<outer_t*>(&buffer[idx]),
+            outer.size()
+        ) = outer;
+
+        n_threads = std::min<size_t>(n_threads, p);
+        #pragma omp parallel for schedule(auto) num_threads(n_threads)
+        for (inner_t j = 0; j < p; ++j) {
+            const auto col_j = calldata.col(j);
+            const auto nnz_bytes = outer[j+1] - outer[j];
+            const auto nnz = nnz_bytes / _multiplier;
+            Eigen::Map<vec_inner_t> inner(
+                reinterpret_cast<inner_t*>(&buffer[outer[j]]),
+                nnz
+            );
+            Eigen::Map<vec_value_t> value(
+                reinterpret_cast<value_t*>(&buffer[outer[j] + sizeof(inner_t) * nnz]),
+                nnz
+            );
+
+            size_t count = 0;
+            for (int i = 0; i < n; ++i) {
+                if (col_j[i] == 0) continue;
+                inner[count] = i;
+                value[count] = col_j[i];
+                ++count;
+            }
+        }
+
+        auto file_ptr = fopen_safe(_filename.c_str(), "wb");
+        auto fp = file_ptr.get();
+        auto total_bytes = std::fwrite(buffer.data(), sizeof(char), buffer.size(), fp);
+        if (total_bytes != buffer.size()) {
+            throw std::runtime_error(
+                "Could not write the full buffer."
+            );
+        }
+
+        return total_bytes;
+    }
+};
+
+} // namespace io
+} // namespace adelie_core
diff --git a/adelie/src/include/adelie_core/matrix/matrix_naive_snp_unphased.hpp b/adelie/src/include/adelie_core/matrix/matrix_naive_snp_unphased.hpp
@@ -4,6 +4,7 @@
 #include <vector>
 #include <adelie_core/matrix/matrix_naive_base.hpp>
 #include <adelie_core/matrix/utils.hpp>
+#include <adelie_core/io/io_snp_unphased.hpp>
 
 namespace adelie_core {
 namespace matrix {
@@ -19,40 +20,78 @@ class MatrixNaiveSNPUnphased : public MatrixNaiveBase<ValueType>
     using typename base_t::colmat_value_t;
     using typename base_t::rowmat_value_t;
     using typename base_t::sp_mat_value_t;
+    using io_t = io::IOSNPUnphased;
+    using string_t = std::string;
     using vec_vec_index_t = util::rowvec_type<vec_index_t>;
-    using dyn_vec_string_t = std::vector<std::string>;
+    using dyn_vec_string_t = std::vector<string_t>;
+    using dyn_vec_io_t = std::vector<io_t>;
 
 protected:
     const dyn_vec_string_t _filenames;  // (F,) array of file names
-    const vec_index_t _file_begins;     // (F+1,) array of file begin indices.
-                                        // _file_begins[i] == starting feature index for file i.
-    const size_t _n_threads;
+    const dyn_vec_io_t _ios;            // (F,) array of IO handlers
+    const size_t _p;                    // total number of feature across all slices
+    const vec_index_t _io_slice_map;    // (p,) array mapping to matrix slice
+    const vec_index_t _io_index_map;    // (p,) array mapping to (relative) index of the slice
+    const size_t _n_threads;            // number of threads
 
-    auto init_file_begins(
+    static auto init_ios(
         const dyn_vec_string_t& filenames
     )
     {
-        vec_index_t file_begins(filenames.size() + 1);
-        file_begins[0] = 0;
-        for (int i = 1; i < file_begins.size(); ++i) {
-            const auto& name = filenames[i-1];
-            FILE* fp = fopen(name.c_str(), "rb");
-            if (!fp) {
-                throw std::runtime_error("Cannot open file " + name);
-            }
-            int32_t n_cols;
-            size_t status = fread(&n_cols, sizeof(int32_t), 1, fp);
-            if (status < 1) {
-                throw std::runtime_error("Could not read the first byte of file " + name);
-            }
-            file_begins[i] = n_cols;
-            fclose(fp);
+        dyn_vec_io_t ios;
+        ios.reserve(filenames.size());
+        for (int i = 0; i < filenames.size(); ++i) {
+            ios.emplace_back(filenames[i]);
+            ios.back().read();
         }
+        return ios;
+    }
 
-        for (int i = 1; i < file_begins.size(); ++i) {
-            file_begins[i] += file_begins[i-1];
+    static auto init_p(
+        const dyn_vec_io_t& ios
+    )
+    {
+        size_t p = 0;
+        for (const auto& io : ios) {
+            p += io.cols();
         }
-        return file_begins;
+        return p;
+    }
+
+    static auto init_io_slice_map(
+        const dyn_vec_io_t& ios,
+        size_t p
+    )
+    {
+        vec_index_t io_slice_map(p);
+        size_t begin = 0;
+        for (int i = 0; i < ios.size(); ++i) {
+            const auto& io = ios[i];
+            const auto pi = io.cols();
+            for (int j = 0; j < pi; ++j) {
+                io_slice_map[begin + j] = i;
+            }
+            begin += pi;
+        } 
+        return io_slice_map;
+    }
+
+    static auto init_io_index_map(
+        const dyn_vec_io_t& ios,
+        size_t p
+    )
+    {
+        vec_index_t io_index_map(p);
+        size_t begin = 0;
+        for (int i = 0; i < ios.size(); ++i) {
+            const auto& io = ios[i];
+            const auto pi = io.cols();
+            for (int j = 0; j < pi; ++j) {
+                io_index_map[begin + j] = j;
+            }
+            begin += pi;
+        } 
+        return io_index_map;
     }
 
 public:
@@ -61,28 +100,192 @@ class MatrixNaiveSNPUnphased : public MatrixNaiveBase<ValueType>
         size_t n_threads
     ): 
         _filenames(filenames),
-        _file_begins(init_file_begins(filenames)),
+        _ios(init_ios(filenames)),
+        _p(init_p(_ios)),
+        _io_slice_map(init_io_slice_map(_ios, _p)),
+        _io_index_map(init_io_index_map(_ios, _p)),
         _n_threads(n_threads)
-    {}
+    {
+        if (filenames.size() == 0) {
+            throw std::runtime_error(
+                "filenames must be non-empty!"
+            );
+        }
+
+        // make sure every file has the same number of rows.
+        const size_t rows = _ios[0].rows();
+        for (const auto& io : _ios) {
+            if (io.rows() != rows) {
+                throw std::runtime_error(
+                    "Every slice must have same number of rows."
+                );
+            }
+        }
+    }
 
     value_t cmul(
         int j, 
         const Eigen::Ref<const vec_value_t>& v
     ) const override
     {
-        if (is_cached(j)) {
+        const auto slice = _io_slice_map[j];
+        const auto& io = _ios[slice];
+        const auto index = _io_index_map[j];
+        const auto inner = io.inner(index);
+        const auto value = io.value(index);
 
+        value_t sum = 0;
+        #pragma omp simd reduction(+:sum)
+        for (int i = 0; i < inner.size(); ++i) {
+            sum += v[inner[i]] * value[i];
         }
+
+        return sum;
     }
 
-    bool is_cached(int j) const
+    void ctmul(
+        int j, 
+        value_t v, 
+        const Eigen::Ref<const vec_value_t>& weights,
+        Eigen::Ref<vec_value_t> out
+    ) const override
     {
+        const auto slice = _io_slice_map[j];
+        const auto& io = _ios[slice];
+        const auto index = _io_index_map[j];
+        const auto inner = io.inner(index);
+        const auto value = io.value(index);
 
+        dvzero(out, _n_threads);
+
+        for (int i = 0; i < inner.size(); ++i) {
+            out[inner[i]] = v * value[i] * weights[inner[i]];
+        }
     }
 
-    void cache()
+    void bmul(
+        int j, int q, 
+        const Eigen::Ref<const vec_value_t>& v, 
+        Eigen::Ref<vec_value_t> out
+    ) override
     {
+        const auto n_threads = std::min<size_t>(_n_threads, q);
+        #pragma omp parallel for schedule(static) num_threads(n_threads)
+        for (int t = 0; t < q; ++t) 
+        {
+            const auto slice = _io_slice_map[j+t];
+            const auto& io = _ios[slice];
+            const auto index = _io_index_map[j+t];
+            const auto inner = io.inner(index);
+            const auto value = io.value(index);
 
+            value_t sum = 0;
+            #pragma omp simd reduction(+:sum)
+            for (int i = 0; i < inner.size(); ++i) {
+                sum += v[inner[i]] * value[i];
+            }
+            out[t] = sum;
+        }
+    }
+
+    void btmul(
+        int j, int q, 
+        const Eigen::Ref<const vec_value_t>& v, 
+        const Eigen::Ref<const vec_value_t>& weights,
+        Eigen::Ref<vec_value_t> out
+    ) override
+    {
+        dvzero(out, _n_threads);
+        for (int t = 0; t < q; ++t) 
+        {
+            const auto slice = _io_slice_map[j+t];
+            const auto& io = _ios[slice];
+            const auto index = _io_index_map[j+t];
+            const auto inner = io.inner(index);
+            const auto value = io.value(index);
+            for (int i = 0; i < inner.size(); ++i) {
+                out[inner[i]] += value[i] * weights[inner[i]] * v[t];
+            } 
+        }
+    }
+
+    void mul(
+        const Eigen::Ref<const vec_value_t>& v, 
+        Eigen::Ref<vec_value_t> out
+    ) override
+    {
+        bmul(0, cols(), v, out);
+    }
+
+    int rows() const override { return _ios[0].rows(); }
+    int cols() const override { return _p; }
+
+    void sp_btmul(
+        int j, int q,
+        const sp_mat_value_t& v,
+        const Eigen::Ref<const vec_value_t>& weights,
+        Eigen::Ref<rowmat_value_t> out
+    ) const override
+    {
+        const auto n_threads = std::min<size_t>(_n_threads, v.outerSize());
+        #pragma omp parallel for schedule(static) num_threads(n_threads)
+        for (int k = 0; k < v.outerSize(); ++k) {
+            typename sp_mat_value_t::InnerIterator it(v, k);
+            auto out_k = out.row(k);
+            out_k.setZero();
+            for (; it; ++it) 
+            {
+                const auto t = it.index();
+                const auto slice = _io_slice_map[j+t];
+                const auto& io = _ios[slice];
+                const auto index = _io_index_map[j+t];
+                const auto inner = io.inner(index);
+                const auto value = io.value(index);
+                for (int i = 0; i < inner.size(); ++i) {
+                    out_k[inner[i]] += value[i] * weights[inner[i]] * it.value();
+                } 
+            }
+        }
+    }
+
+    void to_dense(
+        int j, int q,
+        Eigen::Ref<colmat_value_t> out
+    ) const override
+    {
+        auto begin = 0;
+        while (begin < q) {
+            const auto slice = _io_slice_map[j+begin];
+            const auto& io = _ios[slice];
+            const auto index = _io_index_map[j+begin];
+            const auto size = std::min<size_t>(q - begin, io.cols() - index);
+            out.middleCols(begin, size) = io.to_dense(1).middleCols(index, size).template cast<value_t>();
+            begin += size;
+        }
+    }
+
+    void means(
+        const Eigen::Ref<const vec_value_t>& weights,
+        Eigen::Ref<vec_value_t> out
+    ) const override
+    {
+        const auto p = cols();
+        const auto n_threads = std::min<size_t>(_n_threads, p);
+        #pragma omp parallel for schedule(static) num_threads(n_threads)
+        for (int j = 0; j < p; ++j) 
+        {
+            const auto slice = _io_slice_map[j];
+            const auto& io = _ios[slice];
+            const auto index = _io_index_map[j];
+            const auto inner = io.inner(index);
+            const auto value = io.value(index);
+            value_t sum = 0;
+            #pragma omp simd reduction(+:sum)
+            for (int i = 0; i < inner.size(); ++i) {
+                sum += weights[inner[i]] * value[i];
+            }
+            out[j] = sum;
+        }
     }
 };
 

diff --git a/adelie/src/include/adelie_core/matrix/utils.hpp b/adelie/src/include/adelie_core/matrix/utils.hpp
@@ -184,5 +184,30 @@ void dgemv(
     }
 }
 
+template <class OutType>
+ADELIE_CORE_STRONG_INLINE
+void dvzero(
+    OutType& out,
+    size_t n_threads
+)
+{
+    assert(n_threads > 0);
+    const size_t n = out.size();
+    const int n_blocks = std::min(n_threads, n);
+    const int block_size = n / n_blocks;
+    const int remainder = n % n_blocks;
+
+    #pragma omp parallel for schedule(static) num_threads(n_blocks)
+    for (int t = 0; t < n_blocks; ++t) 
+    {
+        const auto begin = (
+            std::min<int>(t, remainder) * (block_size + 1) 
+            + std::max<int>(t-remainder, 0) * block_size
+        );
+        const auto size = block_size + (t < remainder);
+        out.segment(begin, size).setZero();
+    }
+}
+
 } // namespace matrix
 } // namespace adelie_core
diff --git a/adelie/src/include/adelie_core/solver/solve_basil_naive.hpp b/adelie/src/include/adelie_core/solver/solve_basil_naive.hpp
@@ -281,7 +281,6 @@ size_t kkt(
     const auto& groups = state.groups;
     const auto alpha = state.alpha;
     const auto& penalty = state.penalty;
-    const auto& weights = state.weights;
     const auto intercept = state.intercept;
     const auto n_threads = state.n_threads;
     const auto& screen_hashset = state.screen_hashset;
@@ -330,7 +329,6 @@ inline void solve_basil(
     const auto& X_means = state.X_means;
     const auto alpha = state.alpha;
     const auto& penalty = state.penalty;
-    const auto& weights = state.weights;
     const auto& screen_set = state.screen_set;
     const auto& rsqs = state.rsqs;
     const auto early_exit = state.early_exit;

diff --git a/adelie/src/io.cpp b/adelie/src/io.cpp
@@ -0,0 +1,33 @@
+#include "decl.hpp"
+#include <adelie_core/io/io_snp_unphased.hpp>
+
+namespace py = pybind11;
+namespace ad = adelie_core;
+
+void io_snp_unphased(py::module_& m)
+{
+    using io_t = ad::io::IOSNPUnphased;
+    py::class_<io_t>(m, "IOSNPUnphased")
+        .def(py::init<std::string>(),
+            py::arg("filename")
+        )
+        .def("endian", &io_t::endian)
+        .def("rows", &io_t::rows)
+        .def("cols", &io_t::cols)
+        .def("outer", &io_t::outer)
+        .def("nnz", &io_t::nnz)
+        .def("inner", &io_t::inner)
+        .def("value", &io_t::value)
+        .def("to_dense", &io_t::to_dense)
+        .def("read", &io_t::read)
+        .def("write", &io_t::write,
+            py::arg("calldata").noconvert(),
+            py::arg("n_threads")
+        )
+        ;
+}
+
+void register_io(py::module_& m)
+{
+    io_snp_unphased(m);
+}
diff --git a/adelie/src/matrix.cpp b/adelie/src/matrix.cpp
@@ -4,6 +4,7 @@
 #include <adelie_core/matrix/matrix_cov_lazy.hpp>
 #include <adelie_core/matrix/matrix_naive_base.hpp>
 #include <adelie_core/matrix/matrix_naive_dense.hpp>
+#include <adelie_core/matrix/matrix_naive_snp_unphased.hpp>
 
 namespace py = pybind11;
 namespace ad = adelie_core;
@@ -422,6 +423,24 @@ void matrix_naive_dense(py::module_& m, const char* name)
         ;
 }
 
+template <class ValueType>
+void matrix_naive_snp_unphased(py::module_& m, const char* name)
+{
+    using internal_t = ad::matrix::MatrixNaiveSNPUnphased<ValueType>;
+    using base_t = typename internal_t::base_t;
+    using dyn_vec_string_t = typename internal_t::dyn_vec_string_t;
+    py::class_<internal_t, base_t>(m, name)
+        .def(
+            py::init<
+                const dyn_vec_string_t&,
+                size_t
+            >(), 
+            py::arg("filenames").noconvert(),
+            py::arg("n_threads")
+        )
+        ;
+}
+
 template <class DenseType>
 void matrix_cov_dense(py::module_& m, const char* name)
 {
@@ -469,6 +488,9 @@ void register_matrix(py::module_& m)
     matrix_naive_dense<dense_type<float, Eigen::RowMajor>>(m, "MatrixNaiveDense32C");
     matrix_naive_dense<dense_type<float, Eigen::ColMajor>>(m, "MatrixNaiveDense32F");
 
+    matrix_naive_snp_unphased<double>(m, "MatrixNaiveSNPUnphased64");
+    matrix_naive_snp_unphased<float>(m, "MatrixNaiveSNPUnphased32");
+
     /* cov matrices */
     matrix_cov_dense<dense_type<double, Eigen::RowMajor>>(m, "MatrixCovDense64C");
     matrix_cov_dense<dense_type<double, Eigen::ColMajor>>(m, "MatrixCovDense64F");

diff --git a/adelie/state.py b/adelie/state.py
@@ -125,9 +125,6 @@ def create_from_core(
 
 
 class pin_base(base):
-    def __init__(self):
-        self.solver = "pin"
-
     def check(
         self, 
         method: str =None, 
@@ -1487,7 +1484,7 @@ def check(
         # This one is tricky! Since we keep track of ever-active set,
         # some coefficients may have once been active but now zero'ed out.
         # We can only check that if the non-zero coefficient blocks are active.
-        nzn_idxs = np.array([
+        nnz_idxs = np.array([
             i 
             for i, sb, gs in zip(
                 np.arange(len(self.screen_set)),
@@ -1497,7 +1494,7 @@ def check(
             if np.any(self.screen_beta[sb:sb+gs] != 0)
         ], dtype=int)
         self._check(
-            np.all(self.screen_is_active[nzn_idxs]),
+            np.all(self.screen_is_active[nnz_idxs]),
             "check screen_is_active is only active on non-zeros of screen_beta",
             method, logger,
         )

diff --git a/docs/sphinx/api_reference.rst b/docs/sphinx/api_reference.rst
@@ -32,7 +32,8 @@ adelie.data
     :toctree: generated/
 
 
-    create_test_data_basil
+    create_dense
+    create_snp_unphased
 
 
 adelie.diagnostic
@@ -59,6 +60,20 @@ adelie.diagnostic
     Diagnostic
 
 
+adelie.io
+---------
+
+
+.. currentmodule:: adelie.io
+
+
+.. autosummary::
+    :toctree: generated/
+
+
+    snp_unphased
+
+
 adelie.matrix
 -------------
 
@@ -70,9 +85,9 @@ adelie.matrix
     :toctree: generated/
 
 
-    naive_dense
-    cov_dense
+    dense
     cov_lazy
+    snp_unphased
 
 
 adelie.state

diff --git a/docs/sphinx/notebooks/edpp_study.ipynb b/docs/sphinx/notebooks/edpp_study.ipynb
diff --git a/docs/sphinx/notebooks/snp_io.ipynb b/docs/sphinx/notebooks/snp_io.ipynb
@@ -0,0 +1,176 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# __SNP IO Examples__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import adelie as ad\n",
+    "import numpy as np\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## __SNP Unphased__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n = 400000\n",
+    "ps = [10, 20, 30]\n",
+    "seed = 0\n",
+    "\n",
+    "filenames = [\n",
+    "    f\"/tmp/dummy_{i}.snpdat\"\n",
+    "    for i in range(len(ps))\n",
+    "]\n",
+    "\n",
+    "np.random.seed(seed)\n",
+    "calldatas = [\n",
+    "    ad.data.create_snp_unphased(n, pi, seed=i)[\"X\"]\n",
+    "    for i, pi in enumerate(ps)\n",
+    "]\n",
+    "calldata = np.concatenate(calldatas, axis=-1)\n",
+    "\n",
+    "for name, dat in zip(filenames, calldatas):\n",
+    "    handler = ad.io.snp_unphased(name)\n",
+    "    handler.write(dat, n_threads=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 114,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mat = ad.matrix.snp_unphased(filenames, n_threads=8)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 115,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "w = np.random.uniform(1, 2, n)\n",
+    "w /= np.sum(w)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 127,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "j = 0\n",
+    "q = 1\n",
+    "v = np.random.normal(0, 1, q)\n",
+    "out = np.empty(n)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 132,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 374 µs, sys: 0 ns, total: 374 µs\n",
+      "Wall time: 381 µs\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "mat.btmul(j, q, v, w, out)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 29.6 ms, sys: 0 ns, total: 29.6 ms\n",
+      "Wall time: 12.2 ms\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "expected = (v.T @ (w[:, None] * calldata[:, j:j+q]).T)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 101,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.allclose(out, expected)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "adelie",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/docs/sphinx/user_guide.rst b/docs/sphinx/user_guide.rst
@@ -6,5 +6,4 @@ User Guide
     :maxdepth: 1
 
 
-    notebooks/edpp_study
     notebooks/pivot_rule
diff --git a/docs/tex/pivot_rule/main.pdf b/docs/tex/pivot_rule/main.pdf
diff --git a/tests/test_io.py b/tests/test_io.py
@@ -0,0 +1,60 @@
+import adelie as ad
+import numpy as np
+import os
+
+
+def create_calldata(
+    n, p, seed
+):
+    np.random.seed(seed)
+    calldata = np.zeros((n, p), dtype=np.int8)
+    calldata.ravel()[
+        np.random.choice(np.arange(n * p), int(0.25 * n * p), replace=False)
+    ] = 1
+    calldata.ravel()[
+        np.random.choice(np.arange(n * p), int(0.05 * n * p), replace=False)
+    ] = 2
+    return calldata
+
+
+def test_io_snp_unphased():
+    def _test(n, p, seed=0):
+        calldata = create_calldata(n, p, seed)
+
+        filename = "/tmp/dummy_snp_unphased.snpdat"
+        handler = ad.io.snp_unphased(filename)
+        w_bytes = handler.write(calldata)
+        r_bytes = handler.read()
+        os.remove(filename)
+
+        total_bytes_exp = (
+            1 + 2 * 4 + 8 * (p + 1) + 5 * np.sum(calldata != 0)
+        )
+        assert w_bytes == total_bytes_exp
+        assert w_bytes == r_bytes
+        assert handler.rows() == n
+        assert handler.cols() == p
+
+        outer = handler.outer()
+        nnzs = np.array([handler.nnz(j) for j in range(p)])
+        inners = [handler.inner(j) for j in range(p)]
+        values = [handler.value(j) for j in range(p)]
+
+        assert np.allclose((outer[1:] - outer[:-1]) / 5, nnzs)
+        for j in range(p):
+            assert np.allclose(
+                np.arange(n)[calldata[:, j] != 0],
+                inners[j],
+            )
+            assert np.allclose(
+                calldata[:, j][calldata[:, j] != 0],
+                values[j],
+            )
+
+        dense = handler.to_dense()
+        assert np.allclose(dense, calldata)
+
+    _test(1, 1)
+    _test(200, 32)
+    _test(2000, 3000)
+    _test(1421, 927)
diff --git a/tests/test_matrix.py b/tests/test_matrix.py
@@ -1,71 +1,78 @@
+import adelie as ad
 import adelie.matrix as mod
 import numpy as np
 import scipy
+import os
 
 
 def test_naive_dense():
     def _test(n, p, dtype, order, seed=0):
         np.random.seed(seed)
         X = np.random.normal(0, 1, (n, p))
 
-        for dtype in dtypes:
-            atol = 1e-6 if dtype == np.float32 else 1e-14
-
-            X = np.array(X, dtype=dtype, order=order)
-            cX = mod.naive_dense(X, n_threads=1)
-            w = np.random.uniform(1, 2, n).astype(dtype)
-            w = w / np.sum(w)
-
-            # test cmul
-            v = np.random.normal(0, 1, n).astype(dtype)
-            out = cX.cmul(p//2, v)
-            assert np.allclose(v @ X[:, p//2], out, atol=atol)
-
-            # test ctmul
-            v = np.random.normal(0, 1)
-            out = np.empty(n, dtype=dtype)
-            cX.ctmul(p//2, v, w, out)
-            assert np.allclose(v * (w * X[:, p//2]), out, atol=atol)
-
-            # test bmul
-            v = np.random.normal(0, 1, n).astype(dtype)
-            out = np.empty(p // 2, dtype=dtype)
-            cX.bmul(0, p // 2, v, out)
-            assert np.allclose(v.T @ X[:, :p//2], out, atol=atol)
-
-            # test btmul
-            v = np.random.normal(0, 1, p//2).astype(dtype)
-            out = np.empty(n, dtype=dtype)
-            cX.btmul(0, p // 2, v, w, out)
-            assert np.allclose(v.T @ (w[:, None] * X[:, :p//2]).T, out, atol=atol)
-
-            # test mul
-            v = np.random.normal(0, 1, n).astype(dtype)
-            out = np.empty(p, dtype=dtype)
-            cX.mul(v, out)
-            assert np.allclose(v.T @ X, out, atol=atol)
-
-            # test sp_btmul
-            v = np.random.normal(0, 1, (2, p // 2)).astype(dtype)
-            v[:, :p//4] = 0
-            expected = v @ (w[:, None] * X[:, :p//2]).T
+        atol = 1e-6 if dtype == np.float32 else 1e-14
+
+        X = np.array(X, dtype=dtype, order=order)
+        cX = mod.dense(X, method="naive", n_threads=1)
+        w = np.random.uniform(1, 2, n).astype(dtype)
+        w = w / np.sum(w)
+
+        # test cmul
+        v = np.random.normal(0, 1, n).astype(dtype)
+        for i in range(p):
+            out = cX.cmul(i, v)
+            assert np.allclose(v @ X[:, i], out, atol=atol)
+
+        # test ctmul
+        v = np.random.normal(0, 1)
+        out = np.empty(n, dtype=dtype)
+        for i in range(p):
+            cX.ctmul(i, v, w, out)
+            assert np.allclose(v * (w * X[:, i]), out, atol=atol)
+
+        # test bmul
+        v = np.random.normal(0, 1, n).astype(dtype)
+        for i in range(1, p+1):
+            out = np.empty(i, dtype=dtype)
+            cX.bmul(0, i, v, out)
+            assert np.allclose(v.T @ X[:, :i], out, atol=atol)
+
+        # test btmul
+        out = np.empty(n, dtype=dtype)
+        for i in range(1, p+1):
+            v = np.random.normal(0, 1, i).astype(dtype)
+            cX.btmul(0, i, v, w, out)
+            assert np.allclose(v.T @ (w[:, None] * X[:, :i]).T, out, atol=atol)
+
+        # test mul
+        v = np.random.normal(0, 1, n).astype(dtype)
+        out = np.empty(p, dtype=dtype)
+        cX.mul(v, out)
+        assert np.allclose(v.T @ X, out, atol=atol)
+
+        # test sp_btmul
+        out = np.empty((2, n), dtype=dtype)
+        for i in range(1, p+1):
+            v = np.random.normal(0, 1, (2, i)).astype(dtype)
+            v[:, :i//2] = 0
+            expected = v @ (w[:, None] * X[:, :i]).T
             v = scipy.sparse.csr_matrix(v)
-            out = np.empty((2, n), dtype=dtype)
-            cX.sp_btmul(0, p // 2, v, w, out)
+            cX.sp_btmul(0, i, v, w, out)
             assert np.allclose(expected, out, atol=atol)
 
-            # test to_dense
-            out = np.empty((n, p // 2), dtype=dtype, order="F")
-            cX.to_dense(0, p // 2, out)
-            assert np.allclose(X[:, :p//2], out)
+        # test to_dense
+        for i in range(1, p+1):
+            out = np.empty((n, i), dtype=dtype, order="F")
+            cX.to_dense(0, i, out)
+            assert np.allclose(X[:, :i], out)
 
-            # test means
-            X_means = np.empty(p, dtype=dtype)
-            cX.means(w, X_means)
-            assert np.allclose(np.sum(w[:, None] * X, axis=0), X_means)
+        # test means
+        X_means = np.empty(p, dtype=dtype)
+        cX.means(w, X_means)
+        assert np.allclose(np.sum(w[:, None] * X, axis=0), X_means)
 
-            assert cX.rows() == n
-            assert cX.cols() == p
+        assert cX.rows() == n
+        assert cX.cols() == p
 
     dtypes = [np.float32, np.float64]
     orders = ["C", "F"]
@@ -82,25 +89,24 @@ def _test(n, p, dtype, order, seed=0):
         X = np.random.normal(0, 1, (n, p))
         A = X.T @ X / n
 
-        for dtype in dtypes:
-            atol = 1e-6 if dtype == np.float32 else 1e-14
+        atol = 1e-6 if dtype == np.float32 else 1e-14
 
-            A = np.array(A, dtype=dtype, order=order)
-            cA = mod.cov_dense(A, n_threads=4)
+        A = np.array(A, dtype=dtype, order=order)
+        cA = mod.dense(A, method="cov", n_threads=4)
 
-            # test bmul
-            v = np.random.normal(0, 1, p // 2).astype(dtype)
-            out = np.empty(p, dtype=dtype)
-            cA.bmul(0, 0, p // 2, p, v, out)
-            assert np.allclose(v.T @ A[:p//2, :], out, atol=atol)
+        # test bmul
+        v = np.random.normal(0, 1, p // 2).astype(dtype)
+        out = np.empty(p, dtype=dtype)
+        cA.bmul(0, 0, p // 2, p, v, out)
+        assert np.allclose(v.T @ A[:p//2, :], out, atol=atol)
 
-            # test to_dense
-            out = np.empty((p // 2, p // 2), dtype=dtype, order="F")
-            cA.to_dense(0, 0, p//2, p//2, out)
-            assert np.allclose(A[:p//2, :p//2], out, atol=atol)
+        # test to_dense
+        out = np.empty((p // 2, p // 2), dtype=dtype, order="F")
+        cA.to_dense(0, 0, p//2, p//2, out)
+        assert np.allclose(A[:p//2, :p//2], out, atol=atol)
 
-            assert cA.rows() == p
-            assert cA.cols() == p
+        assert cA.rows() == p
+        assert cA.cols() == p
 
     dtypes = [np.float32, np.float64]
     orders = ["C", "F"]
@@ -118,33 +124,32 @@ def _test(n, p, dtype, order, seed=0):
         X /= np.sqrt(n)
         A = X.T @ X
 
-        for dtype in dtypes:
-            atol = 1e-6 if dtype == np.float32 else 1e-7
+        atol = 1e-6 if dtype == np.float32 else 1e-7
 
-            X = np.array(X, dtype=dtype, order=order)
-            A = np.array(A, dtype=dtype, order=order)
-            cA = mod.cov_lazy(X, n_threads=4)
+        X = np.array(X, dtype=dtype, order=order)
+        A = np.array(A, dtype=dtype, order=order)
+        cA = mod.cov_lazy(X, n_threads=4)
 
-            # test bmul
-            v = np.random.normal(0, 1, p // 2).astype(dtype)
-            out = np.empty(p, dtype=dtype)
-            cA.bmul(0, 0, p // 2, p, v, out)
-            assert np.allclose(v.T @ A[:p//2, :], out, atol=atol)
+        # test bmul
+        v = np.random.normal(0, 1, p // 2).astype(dtype)
+        out = np.empty(p, dtype=dtype)
+        cA.bmul(0, 0, p // 2, p, v, out)
+        assert np.allclose(v.T @ A[:p//2, :], out, atol=atol)
 
-            # check that cache occured properly
-            if p > 2:
-                v = np.random.normal(0, 1, p // 2 - 1).astype(dtype)
-                out = np.empty(p // 2, dtype=dtype)
-                cA.bmul(1, 0, p // 2-1, p // 2, v, out)
-                assert np.allclose(v.T @ A[1:p//2, :p//2], out, atol=atol)
+        # check that cache occured properly
+        if p > 2:
+            v = np.random.normal(0, 1, p // 2 - 1).astype(dtype)
+            out = np.empty(p // 2, dtype=dtype)
+            cA.bmul(1, 0, p // 2-1, p // 2, v, out)
+            assert np.allclose(v.T @ A[1:p//2, :p//2], out, atol=atol)
 
-            # test to_dense
-            out = np.empty((p // 2, p // 2), dtype=dtype, order="F")
-            cA.to_dense(0, 0, p//2, p//2, out)
-            assert np.allclose(A[:p//2, :p//2], out, atol=atol)
+        # test to_dense
+        out = np.empty((p // 2, p // 2), dtype=dtype, order="F")
+        cA.to_dense(0, 0, p//2, p//2, out)
+        assert np.allclose(A[:p//2, :p//2], out, atol=atol)
 
-            assert cA.rows() == p
-            assert cA.cols() == p
+        assert cA.rows() == p
+        assert cA.cols() == p
 
     dtypes = [np.float32, np.float64]
     orders = ["C", "F"]
@@ -153,3 +158,98 @@ def _test(n, p, dtype, order, seed=0):
             _test(2, 2, dtype, order)
             _test(100, 20, dtype, order)
             _test(20, 100, dtype, order)
+
+
+def test_snp_unphased():
+    def _test(n, p, n_files, dtype, seed=0):
+        np.random.seed(seed)
+        datas = [
+            ad.data.create_snp_unphased(n, p, seed=seed+i)
+            for i in range(n_files)
+        ]
+        filenames = [
+            f"/tmp/test_snp_unphased_{i}.snpdat"
+            for i in range(n_files)
+        ]
+        for i in range(n_files):
+            handler = ad.io.snp_unphased(filenames[i])
+            handler.write(datas[i]["X"])
+        cX = mod.snp_unphased(
+            filenames=filenames,
+            dtype=dtype,
+        )
+        for f in filenames:
+            os.remove(f)
+
+        X = np.concatenate([data["X"] for data in datas], axis=-1, dtype=np.int8)
+
+        atol = 1e-6 if dtype == np.float32 else 1e-14
+
+        # total number of features
+        p_total = p * n_files
+
+        # test cmul
+        v = np.random.normal(0, 1, n).astype(dtype)
+        for idx in range(p_total):
+            actual = cX.cmul(idx, v)
+            expected = X[:, idx] @ v
+            assert np.allclose(actual, expected, atol=atol) 
+
+        # test ctmul
+        v = np.random.normal(0, 1)
+        w = np.random.uniform(0, 1, n).astype(dtype)
+        actual = np.empty(n, dtype=dtype)
+        for idx in range(p_total):
+            cX.ctmul(idx, v, w, actual)
+            expected = v * w * X[:, idx]
+            assert np.allclose(actual, expected, atol=atol)
+
+        # test bmul
+        v = np.random.normal(0, 1, n).astype(dtype)
+        for i in range(1, p+1):
+            actual = np.empty(i, dtype=dtype)
+            cX.bmul(0, i, v, actual)
+            expected = v.T @ X[:, :i]
+            assert np.allclose(actual, expected, atol=atol)
+
+        # test btmul
+        actual = np.empty(n, dtype=dtype)
+        for i in range(1, p+1):
+            v = np.random.normal(0, 1, i).astype(dtype)
+            cX.btmul(0, i, v, w, actual)
+            expected = (v.T @ X[:, :i].T) * w[None]
+            assert np.allclose(actual, expected, atol=atol)
+
+        # test sp_btmul
+        actual = np.empty((2, n), dtype=dtype)
+        for i in range(1, p+1):
+            v = np.random.normal(0, 1, (2, i)).astype(dtype)
+            v = scipy.sparse.csr_matrix(v)
+            cX.sp_btmul(0, i, v, w, actual)
+            expected = (v @ X[:, :i].T) * w[None]
+            assert np.allclose(actual, expected, atol=atol)
+
+        # test to_dense
+        for i in range(1, p+1):
+            actual = np.empty((n, i), dtype=dtype, order="F")
+            cX.to_dense(0, i, actual)
+            expected = X[:, :i]
+            assert np.allclose(actual, expected, atol=atol)
+
+        # test means
+        actual = np.empty(p_total, dtype=dtype)
+        cX.means(w, actual)
+        expected = np.concatenate([
+            np.sum(data["X"] * w[:, None], axis=0)
+            for data in datas
+        ], dtype=dtype)
+        assert np.allclose(actual, expected, atol=atol)
+
+        assert cX.rows() == n
+        assert cX.cols() == p_total
+
+    dtypes = [np.float64, np.float32]
+    for dtype in dtypes:
+        _test(10, 20, 3, dtype)
+        _test(1, 13, 3, dtype)
+        _test(144, 1, 3, dtype)
diff --git a/tests/test_solver.py b/tests/test_solver.py
@@ -195,7 +195,7 @@ def _test(n, p, G, S, intercept=True, alpha=1, sparsity=0.95, seed=0):
         resid = weights * (y - intercept * y_mean)
         y_var = np.sum(resid ** 2 / weights)
         Xs = [
-            ad.matrix.naive_dense(X, n_threads=2)
+            ad.matrix.dense(X, method="naive", n_threads=2)
         ]
         for Xpy in Xs:
             state = ad.state.pin_naive(
@@ -272,7 +272,7 @@ def _test(n, p, G, S, alpha=1, sparsity=0.95, seed=0):
 
         # list of different types of cov matrices to test
         As = [
-            ad.matrix.cov_dense(A, n_threads=3),
+            ad.matrix.dense(A, method="cov", n_threads=3),
             ad.matrix.cov_lazy(WsqrtX, n_threads=3),
         ]
 
@@ -320,7 +320,7 @@ def _test(n, p, G, S, alpha=1, sparsity=0.95, seed=0):
 # ========================================================================
 
 
-def create_test_data_basil(
+def create_dense(
     n, p, G,
     intercept=True,
     alpha=1,
@@ -457,13 +457,13 @@ def run_solve_basil(state, X, y):
 
 def test_solve_basil():
     def _test(n, p, G, intercept=True, alpha=1, sparsity=0.95, seed=0):
-        test_data = create_test_data_basil(
+        test_data = create_dense(
             n, p, G, intercept, alpha, sparsity, seed,
         )
         X, y = test_data["X"], test_data["y"]
         test_data.pop("y")
         Xs = [
-            ad.matrix.naive_dense(X, n_threads=2)
+            ad.matrix.dense(X, method="naive", n_threads=2)
         ]
         for Xpy in Xs:
             test_data["X"] = Xpy

diff --git a/tests/test_state.py b/tests/test_state.py
@@ -12,7 +12,7 @@ def test_state_pin_naive():
     p = 100
     G = 2
 
-    X = matrix.naive_dense(np.random.normal(0, 1, (n, p)), n_threads=4)
+    X = matrix.dense(np.random.normal(0, 1, (n, p)), method="naive", n_threads=4)
     groups = np.array([0, 1])
     group_sizes = np.array([1, p-1])
     alpha = 1.0
@@ -66,7 +66,7 @@ def test_state_pin_cov():
     G = 2
 
     X = np.random.normal(0, 1, (n, p))
-    A = matrix.cov_dense(X.T @ X / n, n_threads=4)
+    A = matrix.dense(X.T @ X / n, method="cov", n_threads=4)
     groups = np.array([0, 1])
     group_sizes = np.array([1, p-1])
     alpha = 1.0
@@ -114,7 +114,7 @@ def test_state_basil_naive():
     G = 2
 
     _X = np.random.normal(0, 1, (n, p))
-    X = matrix.naive_dense(_X, n_threads=4)
+    X = matrix.dense(_X, method="naive", n_threads=4)
     X_means = np.mean(_X, axis=0)
     groups = np.array([0, 1])
     group_sizes = np.array([1, p-1])
Original file line number	Diff line number	Diff line change
		@@ -6,5 +6,4 @@ User Guide
		:maxdepth: 1


		notebooks/edpp_study
		notebooks/pivot_rule