-
Notifications
You must be signed in to change notification settings - Fork 14
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* started work on #107 * refactors imputer tests * reworks imputers * reworks imputers * finishes baseline imputer and adds joint-marginal-dist. to MarginalImputer * adds test for joint marginal prediction and closes #261 * updated documentation
- Loading branch information
Showing
16 changed files
with
612 additions
and
208 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,9 @@ | ||
"""Game objects for the shapiq package.""" | ||
|
||
# from . import benchmark | ||
# from . import benchmark # not imported here to avoid circular imports and long import times | ||
from .base import Game | ||
from .imputer import ConditionalImputer, MarginalImputer | ||
from .imputer import BaselineImputer, ConditionalImputer, MarginalImputer | ||
|
||
__all__ = ["Game", "MarginalImputer", "ConditionalImputer"] # + benchmark.__all__ | ||
__all__ = ["Game", "MarginalImputer", "ConditionalImputer", "BaselineImputer"] | ||
|
||
# Path: shapiq/games/__init__.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
"""Imputer objects for the shapiq package.""" | ||
|
||
from .baseline_imputer import BaselineImputer | ||
from .conditional_imputer import ConditionalImputer | ||
from .marginal_imputer import MarginalImputer | ||
|
||
__all__ = ["MarginalImputer", "ConditionalImputer"] | ||
__all__ = ["MarginalImputer", "ConditionalImputer", "BaselineImputer"] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
"""Implementation of the baseline imputer.""" | ||
|
||
import warnings | ||
from typing import Optional | ||
|
||
import numpy as np | ||
|
||
from shapiq.games.imputer.base import Imputer | ||
|
||
|
||
class BaselineImputer(Imputer):
    """The baseline imputer for the shapiq package.

    The baseline imputer replaces the missing features of a data point with predefined values
    (baseline values). If a background data matrix is given instead of a baseline vector, the
    baseline values are computed per feature as the mean (numerical features) or the mode
    (categorical features) of the background data.

    Args:
        model: The model to explain as a callable function expecting data points as input and
            returning the model's predictions.
        data: The background data to use for the explainer as either a vector of baseline values
            or a two-dimensional array with shape ``(n_samples, n_features)``. If data is a
            matrix, the baseline values are calculated from the data.
        x: The explanation point to use the imputer on. Defaults to ``None``.
        categorical_features: A list of indices of the categorical features in the background
            data. Features that are not listed are treated as numerical; if ``np.mean`` fails
            for such a feature (e.g. string values) it is summarized by its mode instead and
            added to the categorical features. Defaults to ``None``.
        normalize: A flag to normalize the game values. If ``True``, the normalization value is
            set to the empty prediction so that the game values are centered to be zero for the
            empty set of features. Defaults to ``True``.
        random_state: The random state, forwarded to the base imputer. Defaults to ``None``.

    Attributes:
        baseline_values: The baseline values to use for imputation with shape
            ``(1, n_features)``.
        empty_prediction: The model's prediction on an empty data point (all features missing).
            It is recomputed whenever ``init_background`` is called.

    Examples:
        >>> model = lambda x: np.sum(x, axis=1)  # some dummy model
        >>> data = np.random.rand(1000, 4)  # some background data
        >>> x_to_impute = np.array([[1, 1, 1, 1]])  # some data point to impute
        >>> imputer = BaselineImputer(model=model, data=data, x=x_to_impute)
        >>> # get the baseline values
        >>> imputer.baseline_values
        array([[0.5, 0.5, 0.5, 0.5]])  # computed from data (approximately)
        >>> # set new baseline values
        >>> baseline_vector = np.array([0, 0, 0, 0])
        >>> imputer.init_background(baseline_vector)
        >>> imputer.baseline_values
        array([[0, 0, 0, 0]])  # given as input
        >>> # get the model prediction with missing values
        >>> imputer(np.array([[True, False, True, False]]))
        np.array([2.])  # model prediction with two features replaced by the baseline values
    """

    def __init__(
        self,
        model,
        data: np.ndarray,
        x: Optional[np.ndarray] = None,
        categorical_features: Optional[list[int]] = None,
        normalize: bool = True,
        random_state: Optional[int] = None,
    ) -> None:
        # NOTE(review): the positional ``1`` is presumably the base imputer's sample size (one
        # imputed data point per coalition) -- confirm against ``Imputer.__init__``.
        super().__init__(model, data, x, 1, categorical_features, random_state)

        # setup attributes
        self.baseline_values: np.ndarray = np.zeros((1, self.n_features))  # will be overwritten
        self.empty_prediction: float = 0.0  # will be overwritten by init_background

        # computes the baseline values and the matching empty prediction
        self.init_background(self.data)

        # set normalization
        if normalize:
            self.normalization_value = self.empty_prediction

    def value_function(self, coalitions: np.ndarray) -> np.ndarray:
        """Imputes the missing values of a data point and calls the model.

        Args:
            coalitions: A boolean array indicating which features are present (``True``) and
                which are missing (``False``). The shape of the array must be
                ``(n_subsets, n_features)``.

        Returns:
            The model's predictions on the imputed data points. The shape of the array is
            ``(n_subsets, n_outputs)``.
        """
        coalitions = np.asarray(coalitions, dtype=bool)
        n_coalitions = coalitions.shape[0]
        # one copy of the explanation point per coalition (np.tile always returns a new array)
        data = np.tile(self._x, (n_coalitions, 1))
        # overwrite every missing entry with the baseline value of its feature; the assignment
        # keeps the dtype of ``data`` exactly like a row-by-row assignment would
        missing = ~coalitions
        baseline = np.broadcast_to(self.baseline_values, data.shape)
        data[missing] = baseline[missing]
        return self.predict(data)

    def init_background(self, data: np.ndarray) -> "BaselineImputer":
        """Initializes the imputer to the background data.

        Besides setting the baseline values, this also recomputes ``empty_prediction`` (and,
        if the game is normalized, the normalization value) so that they always match the
        current baseline values.

        Args:
            data: The background data to use for the imputer. Either a vector of baseline values
                of shape ``(n_features,)`` or a matrix of shape ``(n_samples, n_features)``.
                If the data is a matrix, the baseline values are calculated from the data.

        Returns:
            The initialized imputer.

        Examples:
            >>> import numpy as np
            >>> from shapiq.games.imputer import BaselineImputer
            >>> data = np.array([[1, 2, "a"], [2, 3, "a"], [2, 4, "b"]], dtype=object)
            >>> x = np.array([1, 2, "b"], dtype=object)
            >>> model = lambda x: np.zeros(x.shape[0])  # dummy model accepting mixed-type input
            >>> imputer = BaselineImputer(model=model, data=data, x=x)
            >>> imputer.baseline_values
            array([[1.6666666666666667, 3.0, 'a']], dtype=object)  # computed from data
            >>> baseline_vector = np.array([0, 0, 0])
            >>> imputer.init_background(baseline_vector)
            >>> imputer.baseline_values
            array([[0, 0, 0]])  # given as input
        """
        if data.ndim == 1 or data.shape[0] == 1:  # data is a vector -> use as baseline values
            self.baseline_values = data.reshape(1, self.n_features)
        else:  # data is a matrix -> calculate baseline values as mean or mode
            # build the values locally so a failure midway leaves the old baseline untouched
            baseline = np.zeros((1, self.n_features), dtype=object)
            for feature in range(self.n_features):
                feature_column = data[:, feature]
                if feature in self._cat_features:  # get mode for categorical features
                    baseline[0, feature] = self._mode(feature_column)
                    continue
                try:  # try to use mean for numerical features
                    baseline[0, feature] = np.mean(feature_column)
                except TypeError:  # fallback to mode for potentially string features
                    baseline[0, feature] = self._mode(feature_column)
                    warnings.warn(
                        f"Feature {feature} is not numerical. Adding it to categorical features.",
                        stacklevel=2,
                    )
                    # add feature to categorical features
                    self._cat_features.append(feature)
            self.baseline_values = baseline

        # keep the empty prediction in sync with the new baseline values
        self.empty_prediction = self._calc_empty_prediction()
        return self

    @staticmethod
    def _mode(feature_column: np.ndarray):
        """Returns the most frequent value of a feature column.

        Ties are resolved in favor of the smallest value because ``np.unique`` returns the
        unique values sorted and ``np.argmax`` returns the first maximum.

        Args:
            feature_column: The values of a single feature.

        Returns:
            The most frequent value in the column.
        """
        values, counts = np.unique(feature_column, return_counts=True)
        return values[np.argmax(counts)]

    def _calc_empty_prediction(self) -> float:
        """Runs the model on empty data points (all features missing) to get the empty prediction.

        If the game is normalized (``self.normalize``), the normalization value is reset to the
        new empty prediction as well.

        Returns:
            The empty prediction.
        """
        empty_predictions = self.predict(self.baseline_values)
        empty_prediction = float(empty_predictions[0])
        if self.normalize:  # reset the normalization value
            self.normalization_value = empty_prediction
        return empty_prediction
Oops, something went wrong.