Skip to content

Commit

Permalink
resolve conflicts
Browse files Browse the repository at this point in the history
  • Loading branch information
hbaniecki committed Apr 8, 2024
2 parents b3920ca + d25809e commit 6f4c8ef
Show file tree
Hide file tree
Showing 19 changed files with 21,786 additions and 58 deletions.
20,641 changes: 20,641 additions & 0 deletions data/california.csv

Large diffs are not rendered by default.

631 changes: 631 additions & 0 deletions docs/source/notebooks/language_sentiment_analysis_game.ipynb

Large diffs are not rendered by default.

Binary file modified requirements-dev.txt
Binary file not shown.
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ scikit-learn
pandas
ruff
black
transformers
torch
6 changes: 6 additions & 0 deletions shapiq/games/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,15 @@
from .base import Game
from .dummy import DummyGame
from .imputer import MarginalImputer
from .sentiment_language import SentimentClassificationGame
from .tabular import CaliforniaHousing, FeatureSelectionGame, LocalExplanation

# Public API of the `shapiq.games` package: the names exported by `from shapiq.games import *`.
__all__ = [
"DummyGame",
"Game",
"MarginalImputer",
"SentimentClassificationGame",
"LocalExplanation",
"FeatureSelectionGame",
"CaliforniaHousing",
]
15 changes: 10 additions & 5 deletions shapiq/games/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class Game(ABC):
value for the empty coalition is zero. Defaults to `None`. If `normalization` is set
to `False` this value is not required. Otherwise, the value is needed to normalize and
center the game. If no value is provided, the game raises a warning.
path_to_values: The path to load the game values from. If the path is provided, the game
values are loaded from the given path. Defaults to `None`.
Note:
This class is an abstract base class and should not be instantiated directly. All games
Expand All @@ -31,16 +33,18 @@ class Game(ABC):
@abstractmethod
def __init__(
self,
n_players: int,
n_players: Optional[int] = None,
normalize: bool = True,
normalization_value: Optional[float] = None,
path_to_values: Optional[str] = None,
) -> None:
# define storage variables
self.value_storage: np.ndarray = np.zeros(0, dtype=float)
self.coalition_lookup: dict[tuple[int, ...], int] = {}
self.n_players: int = n_players # if path_to_values is provided, this will be overwritten

# define some handy variables describing the game
self.n_players: int = n_players
if path_to_values is not None:
self.load_values(path_to_values, precomputed=True)

# setup normalization of the game
self.normalization_value: float = 0.0
Expand Down Expand Up @@ -74,7 +78,7 @@ def precomputed(self) -> bool:
@property
def normalize(self) -> bool:
"""Indication whether the game values are normalized."""
return int(self.normalization_value) != 0
return self.normalization_value != 0

def __call__(self, coalitions: np.ndarray) -> np.ndarray:
"""Calls the game's value function with the given coalitions and returns the output of the
Expand Down Expand Up @@ -226,11 +230,12 @@ def load_values(self, path: str, precomputed: bool = False) -> None:

data = np.load(path)
n_players = data["n_players"]
if n_players != self.n_players:
if self.n_players is not None and n_players != self.n_players:
raise ValueError(
f"The number of players in the game ({self.n_players}) does not match the number "
f"of players in the saved game ({n_players})."
)
self.n_players = n_players
self.value_storage = data["values"]
self.coalition_lookup = transform_array_to_coalitions(data["coalitions"])
self.precompute_flag = precomputed
Expand Down
2 changes: 1 addition & 1 deletion shapiq/games/imputer/marginal_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ def __init__(
if normalize:
self.normalization_value = self.empty_prediction

def value_function(self, coalitions: np.ndarray) -> np.ndarray:
def value_function(self, coalitions: np.ndarray[bool]) -> np.ndarray[float]:
"""Imputes the missing values of a data point and calls the model.
Args:
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
133 changes: 133 additions & 0 deletions shapiq/games/sentiment_language.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""This module contains the Sentiment Classification Game class, which is a subclass of the Game"""

import numpy as np

from .base import Game


class SentimentClassificationGame(Game):
    """Sentiment Classification Game.

    The Sentiment Classification Game uses a sentiment classification model from huggingface to
    classify the sentiment of a given text. The game is defined by the number of players, which is
    equal to the number of tokens in the input text. The worth of a coalition is the sentiment of
    the coalition's text. The sentiment is encoded as a number between -1 (strong negative
    sentiment) and 1 (strong positive sentiment).

    Note:
        This benchmark game requires the `transformers` package to be installed. You can install it
        via pip:
        ```bash
        pip install transformers
        ```

    Args:
        input_text: The input text to be classified.
        normalize: Whether to normalize the game. Defaults to True.
        mask_strategy: The strategy to handle the tokens not in the coalition. Either 'remove' or
            'mask'. Defaults to 'mask'. With 'remove', the tokens not in the coalition are removed
            from the text. With 'mask', the tokens not in the coalition are replaced by the
            mask_token_id.

    Raises:
        ValueError: If `mask_strategy` is neither 'remove' nor 'mask'.

    Attributes:
        n_players: The number of players in the game.
        original_input_text: The original input text (as given in the constructor).
        input_text: The input text after tokenization took place (may differ from the original).
        original_model_output: The signed sentiment of the original input text in the range
            [-1, 1] (negative for a 'NEGATIVE' label, positive for a 'POSITIVE' label).
        normalization_value: The score used for normalization.

    Properties:
        normalize: Whether the game is normalized.

    Examples:
        >>> game = SentimentClassificationGame("This is a six word sentence")
        >>> game.n_players
        6
        >>> game.original_input_text
        'This is a six word sentence'
        >>> game.input_text
        'this is a six word sentence'
        >>> game.original_model_output
        0.6615
        >>> game(np.asarray([1, 1, 1, 1, 1, 1], dtype=bool))
        0.6615
    """

    def __init__(self, input_text: str, normalize: bool = True, mask_strategy: str = "mask"):
        # import the required modules locally (to avoid having to install them for all)
        from transformers import pipeline

        if mask_strategy not in ["remove", "mask"]:
            raise ValueError(
                f"'mask_strategy' must be either 'remove' or 'mask' and not {mask_strategy}"
            )
        self.mask_strategy = mask_strategy

        # get the model
        self._classifier = pipeline(model="lvwerra/distilbert-imdb", task="sentiment-analysis")
        self._tokenizer = self._classifier.tokenizer
        self._mask_token_id = self._tokenizer.mask_token_id
        # for this model: {0: [PAD], 100: [UNK], 101: [CLS], 102: [SEP], 103: [MASK]}

        # get the text; [1:-1] drops the first and last ids added by the tokenizer
        # (presumably [CLS] and [SEP] for this model) so that only the text's tokens are players
        self.original_input_text: str = input_text
        self._tokenized_input = np.asarray(
            self._tokenizer(self.original_input_text)["input_ids"][1:-1]
        )
        self.input_text: str = str(self._tokenizer.decode(self._tokenized_input))

        # setup players
        n_players = len(self._tokenized_input)

        # get original sentiment; routed through `_model_call` so the value is signed by the
        # predicted label and really lies in [-1, 1] (the raw pipeline score is always positive)
        self.original_model_output = float(self._model_call([self.original_input_text])[0])

        # value_function returns a vector; index the single entry explicitly because converting
        # a non-0-d array with float() is deprecated in recent NumPy versions
        self._full_output = float(self.value_function(np.ones((1, n_players), dtype=bool))[0])
        self._empty_output = float(self.value_function(np.zeros((1, n_players), dtype=bool))[0])

        # setup game object
        super().__init__(n_players, normalize=normalize, normalization_value=self._empty_output)

    def value_function(self, coalitions: np.ndarray[bool]) -> np.ndarray[float]:
        """Returns the sentiment of the coalition's text.

        Args:
            coalitions: The coalition as a binary matrix of shape `(n_coalitions, n_players)`. A
                single coalition given as a vector of length `n_players` is treated as one row.

        Returns:
            The sentiment of the coalition's text as a vector of length `n_coalitions`.
        """
        # coerce to a 2-d boolean matrix: `~` and mask indexing below are only correct for bool
        coalitions = np.atleast_2d(np.asarray(coalitions, dtype=bool))

        # get the texts of the coalitions
        texts = []
        for coalition in coalitions:
            if self.mask_strategy == "remove":
                tokenized_coalition = self._tokenized_input[coalition]
            else:  # mask_strategy == "mask"
                tokenized_coalition = self._tokenized_input.copy()
                # all tokens not in the coalition are set to mask_token_id
                tokenized_coalition[~coalition] = self._mask_token_id
            texts.append(self._tokenizer.decode(tokenized_coalition))

        # get the sentiment of the texts
        return self._model_call(texts)

    def _model_call(self, input_texts: list[str]) -> np.ndarray[float]:
        """Calls the sentiment classification model with a list of texts.

        Args:
            input_texts: A list of input texts.

        Returns:
            The sentiment of the input texts as a vector with one entry per input text. Each entry
            is the model's score, negated unless the predicted label is 'POSITIVE'.
        """
        outputs = self._classifier(input_texts)
        signed_scores = [
            output["score"] if output["label"] == "POSITIVE" else -output["score"]
            for output in outputs
        ]
        return np.array(signed_scores, dtype=float)
Loading

0 comments on commit 6f4c8ef

Please sign in to comment.