Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simple PTC ratios imputer #39

Merged
merged 2 commits into from
Jan 22, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions traffic_prophet/countmatch/derivedvals.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
cczhu marked this conversation as resolved.
Show resolved Hide resolved
from sklearn import impute as skimp


DV_REGISTRY = {}
Expand Down Expand Up @@ -194,8 +196,9 @@ class DerivedValsStandard(DerivedValsBase):

_dv_type = 'Standard'

def __init__(self, impute_ratios=False, **kwargs):
    """Store the imputation settings for this calculator.

    Parameters
    ----------
    impute_ratios : bool, optional
        If True, `get_derived_vals` calls `impute_ratios` on the count
        it processes.  Default: False.
    **kwargs
        Keyword arguments kept verbatim and passed to
        `sklearn.impute.IterativeImputer` when `impute_ratios` runs.

    """
    self._impute_ratios = impute_ratios
    self._imputer_args = kwargs

def get_derived_vals(self, ptc):
"""Get derived values, including ADTs and ratios between them.
Expand All @@ -221,5 +224,26 @@ def get_derived_vals(self, ptc):
if self._impute_ratios:
self.impute_ratios(ptc)

@staticmethod
def fill_nans(df, imp):
    """Fill NaN values in a data frame with imputed ones, in place.

    Only cells of `df` that are null are overwritten; every other cell
    keeps its original value.  Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Original data, with NaNs.  Modified in place.
    imp : numpy.ndarray
        Data array with imputed values.  Must have the same shape as
        `df`, since values are matched by position.

    Raises
    ------
    ValueError
        If `imp` does not have the same shape as `df`.

    """
    imp = np.asarray(imp)
    # Values are matched positionally, so a differently shaped array
    # would put imputed values in the wrong cells.
    if imp.shape != df.shape:
        raise ValueError(
            "imputed array has shape {0}, expected {1}".format(
                imp.shape, df.shape))
    # One vectorized replacement instead of a per-cell `iloc` loop.
    df.mask(df.isnull(), imp, inplace=True)

def impute_ratios(self, ptc):
    """Impute missing 'DoM_ijd' and 'D_ijd' ratio values in place.

    An `IterativeImputer` is built from the keyword arguments stored at
    construction, fitted to each ratio table, and the imputed values
    are written into the NaN cells of that table via `fill_nans`.

    Parameters
    ----------
    ptc : object
        Count whose `ratios` mapping holds the 'DoM_ijd' and 'D_ijd'
        tables (presumably pandas.DataFrame, as `fill_nans` expects;
        confirm against the caller).  The tables are modified in place.

    """
    imp = skimp.IterativeImputer(**self._imputer_args)

    # `fit_transform` refits the estimator on each call, so the same
    # imputer instance is reused for both tables.
    dom_ijd_imputed = imp.fit_transform(ptc.ratios['DoM_ijd'])
    d_ijd_imputed = imp.fit_transform(ptc.ratios['D_ijd'])

    self.fill_nans(ptc.ratios['DoM_ijd'], dom_ijd_imputed)
    self.fill_nans(ptc.ratios['D_ijd'], d_ijd_imputed)
48 changes: 47 additions & 1 deletion traffic_prophet/countmatch/tests/test_derivedvals.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import pytest
import hypothesis as hyp
import numpy as np
import pandas as pd

Expand Down Expand Up @@ -141,6 +142,7 @@ class TestDerivedValsStandard:

def setup(self):
    """Create the calculator and the reference array used by the tests."""
    # 12 x 11 float grid holding 0.0 .. 131.0 in row-major order.
    n_rows, n_cols = 12, 11
    self.imparray = np.arange(
        n_rows * n_cols, dtype=float).reshape(n_rows, n_cols)
    self.dvc = dv.DerivedValsStandard()

@pytest.mark.parametrize('count_id', [-890, -104870])
def test_get_derived_vals(self, sample_counts, cfgcm_test, count_id):
Expand All @@ -151,8 +153,52 @@ def test_get_derived_vals(self, sample_counts, cfgcm_test, count_id):
assert sorted(list(ptc.ratios.keys())) == [
'D_ijd', 'DoM_ijd', 'N_avail_days']

@hyp.given(n_nan=hyp.strategies.integers(min_value=5, max_value=10))
@hyp.settings(max_examples=30)
def test_fill_nans(self, n_nan):
    """Blank out `n_nan` random cells and check `fill_nans` restores them."""
    reference = self.imparray
    # `flatten` always returns a copy, so the reference array is untouched.
    flat_with_gaps = reference.flatten()

    # Pick `n_nan` distinct flat positions and set them to NaN.
    candidate_positions = np.arange(flat_with_gaps.shape[0], dtype=int)
    blanked = np.random.choice(candidate_positions, n_nan, replace=False)
    flat_with_gaps[blanked] = np.nan
    assert np.sum(np.isnan(flat_with_gaps)) == n_nan

    # The frame differs from the reference before filling...
    frame_with_gaps = pd.DataFrame(flat_with_gaps.reshape(reference.shape))
    assert not np.array_equal(frame_with_gaps.values, reference)

    # ...and matches it exactly afterwards.
    self.dvc.fill_nans(frame_with_gaps, reference)
    assert np.array_equal(frame_with_gaps.values, reference)

def test_imputer(self, sample_counts, cfgcm_test):
    """Check that imputation fills NaN ratios and changes nothing else."""
    ptc = get_single_ptc(sample_counts, cfgcm_test, -104870)
    ptc_imp = get_single_ptc(sample_counts, cfgcm_test, -104870)

    dvc_imp = dv.DerivedVals('Standard', impute_ratios=True, max_iter=10)

    self.dvc.get_derived_vals(ptc)
    dvc_imp.get_derived_vals(ptc_imp)

    tols = {'rtol': 1e-10}
    for key in ('DoM_ijd', 'D_ijd'):
        raw = ptc.ratios[key]
        imputed = ptc_imp.ratios[key]

        # Check that NaNs have been filled.
        assert np.isnan(raw.at[(2010, 5), 4])
        assert not np.isnan(imputed.at[(2010, 5), 4])

        # Check that non-NaN values are untouched.  The mask is built
        # from the table being compared rather than borrowed from the
        # other ratio table.
        notnulls = ~np.isnan(raw.values)
        assert np.allclose(raw.values[notnulls],
                           imputed.values[notnulls], **tols)

    # Check nothing else is different.
    assert ptc.adts['MADT'].equals(ptc_imp.adts['MADT'])
    assert ptc.adts['AADT'].equals(ptc_imp.adts['AADT'])
    assert ptc.data.equals(ptc_imp.data)


class TestDerivedVals:
Expand Down