resolve conflicts

mmschlk · Apr 8, 2024 · 6f4c8ef · 6f4c8ef
2 parents b3920ca + d25809e
commit 6f4c8ef
Show file tree

Hide file tree

Showing 19 changed files with 21,786 additions and 58 deletions.
diff --git a/data/california.csv b/data/california.csv
diff --git a/docs/source/notebooks/language_sentiment_analysis_game.ipynb b/docs/source/notebooks/language_sentiment_analysis_game.ipynb
diff --git a/requirements-dev.txt b/requirements-dev.txt
diff --git a/requirements.txt b/requirements.txt
@@ -7,3 +7,5 @@ scikit-learn
 pandas
 ruff
 black
+transformers
+torch
diff --git a/shapiq/games/__init__.py b/shapiq/games/__init__.py
@@ -3,9 +3,15 @@
 from .base import Game
 from .dummy import DummyGame
 from .imputer import MarginalImputer
+from .sentiment_language import SentimentClassificationGame
+from .tabular import CaliforniaHousing, FeatureSelectionGame, LocalExplanation
 
 __all__ = [
     "DummyGame",
     "Game",
     "MarginalImputer",
+    "SentimentClassificationGame",
+    "LocalExplanation",
+    "FeatureSelectionGame",
+    "CaliforniaHousing",
 ]
diff --git a/shapiq/games/base.py b/shapiq/games/base.py
@@ -22,6 +22,8 @@ class Game(ABC):
             value for the empty coalition is zero. Defaults to `None`.  If `normalization` is set
             to `False` this value is not required. Otherwise, the value is needed to normalize and
             center the game. If no value is provided, the game raises a warning.
+        path_to_values: The path to load the game values from. If the path is provided, the game
+            values are loaded from the given path. Defaults to `None`.
 
     Note:
         This class is an abstract base class and should not be instantiated directly. All games
@@ -31,16 +33,18 @@ class Game(ABC):
     @abstractmethod
     def __init__(
         self,
-        n_players: int,
+        n_players: Optional[int] = None,
         normalize: bool = True,
         normalization_value: Optional[float] = None,
+        path_to_values: Optional[str] = None,
     ) -> None:
         # define storage variables
         self.value_storage: np.ndarray = np.zeros(0, dtype=float)
         self.coalition_lookup: dict[tuple[int, ...], int] = {}
+        self.n_players: int = n_players  # if path_to_values is provided, this will be overwritten
 
-        # define some handy variables describing the game
-        self.n_players: int = n_players
+        if path_to_values is not None:
+            self.load_values(path_to_values, precomputed=True)
 
         # setup normalization of the game
         self.normalization_value: float = 0.0
@@ -74,7 +78,7 @@ def precomputed(self) -> bool:
     @property
     def normalize(self) -> bool:
         """Indication whether the game values are normalized."""
-        return int(self.normalization_value) != 0
+        return self.normalization_value != 0
 
     def __call__(self, coalitions: np.ndarray) -> np.ndarray:
         """Calls the game's value function with the given coalitions and returns the output of the
@@ -226,11 +230,12 @@ def load_values(self, path: str, precomputed: bool = False) -> None:
 
         data = np.load(path)
         n_players = data["n_players"]
-        if n_players != self.n_players:
+        if self.n_players is not None and n_players != self.n_players:
             raise ValueError(
                 f"The number of players in the game ({self.n_players}) does not match the number "
                 f"of players in the saved game ({n_players})."
             )
+        self.n_players = n_players
         self.value_storage = data["values"]
         self.coalition_lookup = transform_array_to_coalitions(data["coalitions"])
         self.precompute_flag = precomputed

diff --git a/shapiq/games/imputer/marginal_imputer.py b/shapiq/games/imputer/marginal_imputer.py
@@ -61,7 +61,7 @@ def __init__(
         if normalize:
             self.normalization_value = self.empty_prediction
 
-    def value_function(self, coalitions: np.ndarray) -> np.ndarray:
+    def value_function(self, coalitions: np.ndarray[bool]) -> np.ndarray[float]:
         """Imputes the missing values of a data point and calls the model.
 
         Args:

diff --git a/shapiq/games/precomputed/benchmarks/california_local_xai_sklearn_gbt_id_1.npz b/shapiq/games/precomputed/benchmarks/california_local_xai_sklearn_gbt_id_1.npz
diff --git a/shapiq/games/precomputed/benchmarks/california_local_xai_torch_nn_id_1.npz b/shapiq/games/precomputed/benchmarks/california_local_xai_torch_nn_id_1.npz
diff --git a/shapiq/games/precomputed/models/california_nn_0.812511_0.076331.weights b/shapiq/games/precomputed/models/california_nn_0.812511_0.076331.weights
diff --git a/shapiq/games/sentiment_language.py b/shapiq/games/sentiment_language.py
@@ -0,0 +1,133 @@
+"""This module contains the Sentiment Classification Game, a subclass of the Game base class."""
+
+import numpy as np
+
+from .base import Game
+
+
+class SentimentClassificationGame(Game):
+    """Sentiment Classification Game.
+
+    The Sentiment Classification Game uses a sentiment classification model from huggingface to
+    classify the sentiment of a given text. The game is defined by the number of players, which is
+    equal to the number of tokens in the input text. The worth of a coalition is the sentiment of
+    the coalition's text. The sentiment is encoded as a number between -1 (strong negative
+    sentiment) and 1 (strong positive sentiment).
+
+    Note:
+        This benchmark game requires the `transformers` package to be installed. You can install it
+        via pip:
+        ```bash
+        pip install transformers
+        ```
+
+    Args:
+        input_text: The input text to be classified.
+        normalize: Whether to normalize the game. Defaults to True.
+        mask_strategy: The strategy to handle the tokens not in the coalition. Either 'remove' or
+            'mask'. Defaults to 'mask'. With 'remove', the tokens not in the coalition are removed
+            from the text. With 'mask', the tokens not in the coalition are replaced by the
+            mask_token_id.
+
+    Attributes:
+        n_players: The number of players in the game.
+        original_input_text: The original input text (as given in the constructor).
+        input_text: The input text after tokenization took place (may differ from the original).
+        original_model_output: The raw classifier score of the original text (label sign not applied).
+        normalization_value: The score used for normalization.
+
+    Properties:
+        normalize: Whether the game is normalized.
+
+    Examples:
+        >>> game = SentimentClassificationGame("This is a six word sentence")
+        >>> game.n_players
+        6
+        >>> game.original_input_text
+        'This is a six word sentence'
+        >>> game.input_text
+        'this is a six word sentence'
+        >>> game.original_model_output
+        0.6615
+        >>> game(np.asarray([1, 1, 1, 1, 1, 1], dtype=bool))
+        0.6615
+    """
+
+    def __init__(self, input_text: str, normalize: bool = True, mask_strategy: str = "mask"):
+        # import the required modules locally (so they are only needed when this game is used)
+        from transformers import pipeline
+
+        if mask_strategy not in ["remove", "mask"]:
+            raise ValueError(
+                f"'mask_strategy' must be either 'remove' or 'mask' and not {mask_strategy}"
+            )
+        self.mask_strategy = mask_strategy
+
+        # get the model
+        self._classifier = pipeline(model="lvwerra/distilbert-imdb", task="sentiment-analysis")
+        self._tokenizer = self._classifier.tokenizer
+        self._mask_toke_id = self._tokenizer.mask_token_id
+        # for this model: {0: [PAD], 100: [UNK], 101: [CLS], 102: [SEP], 103: [MASK]}
+
+        # get the text
+        self.original_input_text: str = input_text
+        self._tokenized_input = np.asarray(
+            self._tokenizer(self.original_input_text)["input_ids"][1:-1]
+        )
+        self.input_text: str = str(self._tokenizer.decode(self._tokenized_input))
+
+        # setup players
+        n_players = len(self._tokenized_input)
+
+        # get original sentiment
+        self.original_model_output = float(self._classifier(self.original_input_text)[0]["score"])
+        self._full_output = float(self.value_function(np.ones((1, n_players), dtype=bool)))
+        self._empty_output = float(self.value_function(np.zeros((1, n_players), dtype=bool)))
+
+        # setup game object
+        super().__init__(n_players, normalize=normalize, normalization_value=self._empty_output)
+
+    def value_function(self, coalitions: np.ndarray[bool]) -> np.ndarray[float]:
+        """Returns the sentiment of the coalition's text.
+
+        Args:
+            coalitions: The coalition as a binary matrix of shape `(n_coalitions, n_players)`.
+
+        Returns:
+            The sentiment of the coalition's text as a vector of length `n_coalitions`.
+        """
+        # get the texts of the coalitions
+        texts = []
+        for coalition in coalitions:
+            if self.mask_strategy == "remove":
+                tokenized_coalition = self._tokenized_input[coalition]
+            else:  # mask_strategy == "mask"
+                tokenized_coalition = self._tokenized_input.copy()
+                # all tokens not in the coalition are set to mask_token_id
+                tokenized_coalition[~coalition] = self._mask_toke_id
+            coalition_text = self._tokenizer.decode(tokenized_coalition)
+            texts.append(coalition_text)
+
+        # get the sentiment of the texts
+        sentiments = self._model_call(texts)
+
+        return sentiments
+
+    def _model_call(self, input_texts: list[str]) -> np.ndarray[float]:
+        """Calls the sentiment classification model with a list of texts.
+
+        Args:
+            input_texts: A list of input texts.
+
+        Returns:
+            The sentiment of the input texts as a vector of length `n_coalitions`.
+        """
+        # get the sentiment of the input texts
+        outputs = self._classifier(input_texts)
+        outputs = [
+            output["score"] * 1 if output["label"] == "POSITIVE" else output["score"] * -1
+            for output in outputs
+        ]
+        sentiments = np.array(outputs, dtype=float)
+
+        return sentiments
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,3 +7,5 @@ scikit-learn @@
     pandas
     ruff
     black
+    transformers
+    torch