Skip to content

Commit

Permalink
Adds BaselineImputer and Reworks MarginalImputer (#262)
Browse files Browse the repository at this point in the history
* started work on #107

* refactors imputer tests

* reworks imputers

* reworks imputers

* finishes baseline imputer and adds joint-marginal-dist. to MarginalImputer

* adds test for joint marginal prediction and closes #261

* updated documentation
  • Loading branch information
mmschlk authored Oct 30, 2024
1 parent 15532ff commit 320518a
Show file tree
Hide file tree
Showing 16 changed files with 612 additions and 208 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
- renames explanation graph to `si_graph`
- `get_n_order` now has optional lower/upper limits for the order
- computing metrics now tries to resolve not-matching interaction indices and will throw a warning instead of a ValueError [#179](https://github.com/mmschlk/shapiq/issues/179)
- ...
- removed the `sample_replacements` parameter from `MarginalImputer` which is now handled by the `BaselineImputer`. Added a DeprecationWarning for the parameter, which will be removed in the next release.
- adds `BaselineImputer` [#107](https://github.com/mmschlk/shapiq/issues/107)
- adds `joint_marginal_distribution` parameter to `MarginalImputer` [#261](https://github.com/mmschlk/shapiq/issues/261)

### v1.0.1 (2024-06-05)

Expand Down
4 changes: 3 additions & 1 deletion shapiq/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,8 @@
from .explainer import Explainer, TabularExplainer, TreeExplainer

# game classes
from .games import ConditionalImputer, Game, MarginalImputer
# imputer classes
from .games import BaselineImputer, ConditionalImputer, Game, MarginalImputer

# base classes
from .interaction_values import InteractionValues
Expand Down Expand Up @@ -96,6 +97,7 @@
"TreeExplainer",
# imputers
"MarginalImputer",
"BaselineImputer",
"ConditionalImputer",
# plots
"network_plot",
Expand Down
6 changes: 3 additions & 3 deletions shapiq/games/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
"""Game objects for the shapiq package."""

# from . import benchmark
# from . import benchmark # not imported here to avoid circular imports and long import times
from .base import Game
from .imputer import ConditionalImputer, MarginalImputer
from .imputer import BaselineImputer, ConditionalImputer, MarginalImputer

__all__ = ["Game", "MarginalImputer", "ConditionalImputer"] # + benchmark.__all__
__all__ = ["Game", "MarginalImputer", "ConditionalImputer", "BaselineImputer"]

# Path: shapiq/games/__init__.py
5 changes: 3 additions & 2 deletions shapiq/games/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,8 @@ def _check_coalitions(
>>> coalitions = [("Alice", "Bob"), ("Bob", "Charlie")]
Wrong format:
>>> coalitions = [1, 0, 0, 0]
>>> coalitions = [(1,"Alice")]
>>> coalitions = np.array([1,-1,2])
>>> coalitions = [(1, "Alice")]
>>> coalitions = np.array([1, -1, 2])
"""
Expand Down Expand Up @@ -220,6 +220,7 @@ def _check_coalitions(
f"the number of players in the game ({self.n_players})."
)

# TODO maybe remove this, as it might increase runtime unnecessarily
# Check that values of numpy array are either 0 or 1
if not np.all(np.logical_or(coalitions == 0, coalitions == 1)):
raise TypeError("The values in the array of coalitions are not binary.")
Expand Down
3 changes: 2 additions & 1 deletion shapiq/games/imputer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Imputer objects for the shapiq package."""

from .baseline_imputer import BaselineImputer
from .conditional_imputer import ConditionalImputer
from .marginal_imputer import MarginalImputer

__all__ = ["MarginalImputer", "ConditionalImputer"]
__all__ = ["MarginalImputer", "ConditionalImputer", "BaselineImputer"]
66 changes: 59 additions & 7 deletions shapiq/games/imputer/base.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Base class for all imputers."""
"""Base class for all Imputers."""

from abc import abstractmethod
from typing import Optional
Expand All @@ -10,39 +10,91 @@


class Imputer(Game):
"""Base class for imputers.
"""Base class for Imputers.
Args:
model: The model to explain as a callable function expecting data points as input and
returning the model's predictions.
data: The background data to use for the explainer as a 2-dimensional array
with shape ``(n_samples, n_features)``.
x: The explanation point to use the imputer on either as a 2-dimensional array with
shape ``(1, n_features)`` or as a vector with shape ``(n_features,)``.
sample_size: The number of samples to draw from the background data. Defaults to ``100`` but
is usually overwritten in the subclasses.
categorical_features: A list of indices of the categorical features in the background data.
random_state: The random state to use for sampling. Defaults to ``None``.
Attributes:
n_features: The number of features in the data (equals the number of players in the game).
data: The background data to use for the imputer.
model: The model to impute missing values for as a callable function.
sample_size: The number of samples to draw from the background data.
Properties:
x: The explanation point to use the imputer on.
"""

@abstractmethod
def __init__(
self,
model,
data: np.ndarray,
x: Optional[np.ndarray] = None,
sample_size: int = 100,
categorical_features: list[int] = None,
random_state: Optional[int] = None,
) -> None:
if callable(model):
self._predict_function = utils.predict_callable
else: # shapiq.Explainer
else: # shapiq.Explainer adds a predict function to the model to make it callable
self._predict_function = model._predict_function
self.model = model
# check if data is a vector
if data.ndim == 1:
data = data.reshape(1, data.shape[0])
self.data = data
self._n_features = self.data.shape[1]
self.sample_size = sample_size
self.n_features = self.data.shape[1]
self._cat_features: list = [] if categorical_features is None else categorical_features
self._random_state = random_state
self._rng = np.random.default_rng(self._random_state)

# the normalization_value needs to be set in the subclass
super().__init__(n_players=self._n_features, normalize=False)
# fit x
self._x: Optional[np.ndarray] = None # will be overwritten @ fit
if x is not None:
self.fit(x)

# init the game
# developer note: the normalization_value needs to be set in the subclass
super().__init__(n_players=self.n_features, normalize=False)

@property
def x(self) -> Optional[np.ndarray]:
"""Returns the explanation point if it is set."""
return self._x.copy() if self._x is not None else None

def predict(self, x: np.ndarray) -> np.ndarray:
"""Provides a unified prediction interface."""
"""Provides a unified prediction interface.
Args:
x: The data point to predict the model's output for.
Returns:
The model's prediction for the given data point as a vector.
"""
return self._predict_function(self.model, x)

def fit(self, x: np.ndarray) -> "Imputer":
"""Fits the imputer to the explanation point.
Args:
x: The explanation point to use the imputer on either as a 2-dimensional array with
shape ``(1, n_features)`` or as a vector with shape ``(n_features,)``.
Returns:
The fitted imputer.
"""
self._x = x.copy()
if self._x.ndim == 1:
self._x = self._x.reshape(1, x.shape[0])
return self
150 changes: 150 additions & 0 deletions shapiq/games/imputer/baseline_imputer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
"""Implementation of the baseline imputer."""

import warnings
from typing import Optional

import numpy as np

from shapiq.games.imputer.base import Imputer


class BaselineImputer(Imputer):
    """The baseline imputer for the shapiq package.

    The baseline imputer is used to impute the missing values of a data point by using predefined
    values (baseline values). If no baseline values are given, the imputer uses the mean (for
    numerical features) or the mode (for categorical features) of the background data.

    Args:
        model: The model to explain as a callable function expecting data points as input and
            returning the model's predictions.
        data: The background data to use for the explainer as either a vector of baseline values
            or a two-dimensional array with shape ``(n_samples, n_features)``. If data is a matrix,
            the baseline values are calculated from the data.
        x: The explanation point to use the imputer on.
        categorical_features: A list of indices of the categorical features in the background data.
            If no categorical features are given, all features are assumed to be numerical or in
            string format (where ``np.mean`` fails) features. Defaults to ``None``.
        normalize: A flag to normalize the game values. If ``True``, then the game values are
            normalized and centered to be zero for the empty set of features. Defaults to ``True``.
        random_state: The random state to use for sampling. Defaults to ``None``.

    Attributes:
        baseline_values: The baseline values to use for imputation.
        empty_prediction: The model's prediction on an empty data point (all features missing).
            It is refreshed whenever the baseline values are re-initialized.

    Examples:
        >>> model = lambda x: np.sum(x, axis=1)  # some dummy model
        >>> data = np.random.rand(1000, 4)  # some background data
        >>> x_to_impute = np.array([[1, 1, 1, 1]])  # some data point to impute
        >>> imputer = BaselineImputer(model=model, data=data, x=x_to_impute)
        >>> # get the baseline values
        >>> imputer.baseline_values
        array([[0.5, 0.5, 0.5, 0.5]])  # computed from data
        >>> # set new baseline values
        >>> baseline_vector = np.array([0, 0, 0, 0])
        >>> imputer.init_background(baseline_vector)
        >>> imputer.baseline_values
        array([[0, 0, 0, 0]])  # given as input
        >>> # get the model prediction with missing values
        >>> imputer(np.array([[True, False, True, False]]))
        np.array([2.])  # model prediction with the new baseline values
    """

    def __init__(
        self,
        model,
        data: np.ndarray,
        x: Optional[np.ndarray] = None,
        categorical_features: Optional[list[int]] = None,
        normalize: bool = True,
        random_state: Optional[int] = None,
    ) -> None:
        super().__init__(model, data, x, 1, categorical_features, random_state)

        # Work on a private copy: ``init_background`` may append to this list and must not
        # modify the ``categorical_features`` list object owned by the caller.
        self._cat_features = list(self._cat_features)

        # setup attributes (both are overwritten by ``init_background``)
        self.baseline_values: np.ndarray = np.zeros((1, self.n_features))
        self.empty_prediction: float = 0.0
        self.init_background(self.data)

        # set normalization to the empty prediction computed in ``init_background``
        if normalize:
            self.normalization_value = self.empty_prediction

    def value_function(self, coalitions: np.ndarray) -> np.ndarray:
        """Imputes the missing values of a data point and calls the model.

        Args:
            coalitions: A boolean array indicating which features are present (``True``) and which
                are missing (``False``). The shape of the array must be
                ``(n_subsets, n_features)``. Arrays of ``0``/``1`` integers are interpreted as
                booleans.

        Returns:
            The model's predictions on the imputed data points. The shape of the array is
            ``(n_subsets, n_outputs)``.
        """
        # Coerce to bool first: ``~`` on an integer array is a bitwise NOT that yields negative
        # integers, which would be used as (wrong) column indices instead of a mask.
        missing = ~np.asarray(coalitions, dtype=bool)
        n_coalitions = missing.shape[0]
        data = np.tile(np.copy(self._x), (n_coalitions, 1))
        # Overwrite every missing entry with the baseline value of its feature in one step.
        data[missing] = np.broadcast_to(self.baseline_values, data.shape)[missing]
        return self.predict(data)

    def init_background(self, data: np.ndarray) -> "BaselineImputer":
        """Initializes the imputer to the background data.

        After the baseline values are set, the empty prediction (and, if the game is normalized,
        the normalization value) is recomputed so that it matches the new baseline.

        Args:
            data: The background data to use for the imputer. Either a vector of baseline values
                of shape ``(n_features,)`` or a matrix of shape ``(n_samples, n_features)``.
                If the data is a matrix, the baseline values are calculated from the data.

        Returns:
            The initialized imputer.

        Examples:
            >>> import numpy as np
            >>> from shapiq.games.imputer import BaselineImputer
            >>> data = np.array([[1, 2, "a"], [2, 3, "a"], [2, 4, "b"]], dtype=object)
            >>> x = np.array([1, 2, 3])
            >>> imputer = BaselineImputer(model=lambda x: np.sum(x, axis=1), data=data, x=x)
            >>> imputer.baseline_values
            array([[1.66, 3, 'a']], dtype=object)  # computed from data
            >>> baseline_vector = np.array([0, 0, 0])
            >>> imputer.init_background(baseline_vector)
            >>> imputer.baseline_values
            array([[0, 0, 0]])  # given as input
        """
        if data.ndim == 1 or data.shape[0] == 1:  # data is a vector -> use as baseline values
            self.baseline_values = data.reshape(1, self.n_features)
        else:  # data is a matrix -> calculate baseline values as mean or mode
            baseline = np.zeros((1, self.n_features), dtype=object)
            for feature in range(self.n_features):
                feature_column = data[:, feature]
                if feature in self._cat_features:  # get mode for categorical features
                    summarized_feature = self._most_frequent(feature_column)
                else:
                    try:  # try to use mean for numerical features
                        summarized_feature = np.mean(feature_column)
                    except TypeError:  # fallback to mode for potentially string features
                        summarized_feature = self._most_frequent(feature_column)
                        warnings.warn(
                            f"Feature {feature} is not numerical. Adding it to categorical features.",
                            stacklevel=2,
                        )
                        # remember the feature so later calls treat it as categorical directly
                        self._cat_features.append(feature)
                baseline[0, feature] = summarized_feature
            self.baseline_values = baseline

        # The baseline changed, so the cached empty prediction (and the normalization value that
        # is derived from it) would be stale otherwise.
        self.empty_prediction = self._calc_empty_prediction()
        return self

    @staticmethod
    def _most_frequent(feature_column: np.ndarray):
        """Returns the most frequent value (mode) of a feature column."""
        values, counts = np.unique(feature_column, return_counts=True)
        return values[np.argmax(counts)]

    def _calc_empty_prediction(self) -> float:
        """Runs the model on empty data points (all features missing) to get the empty prediction.

        If the game is currently normalized, the normalization value is reset to the newly
        computed empty prediction.

        Returns:
            The empty prediction.
        """
        empty_predictions = self.predict(self.baseline_values)
        empty_prediction = float(empty_predictions[0])
        if self.normalize:  # reset the normalization value
            self.normalization_value = empty_prediction
        return empty_prediction
Loading

0 comments on commit 320518a

Please sign in to comment.