From 622d96c479b66e82e8d774d6221a16265958c7ff Mon Sep 17 00:00:00 2001 From: afallah Date: Fri, 21 Jun 2024 11:53:33 -0400 Subject: [PATCH 1/9] Refactor dataset.py to use standard practices. --- odyssey/data/dataset.py | 633 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 625 insertions(+), 8 deletions(-) diff --git a/odyssey/data/dataset.py b/odyssey/data/dataset.py index be63162..043ca10 100644 --- a/odyssey/data/dataset.py +++ b/odyssey/data/dataset.py @@ -1,6 +1,7 @@ """Data module for pretraining and finetuning the model.""" import random +from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Tuple, Union import pandas as pd @@ -10,18 +11,634 @@ from odyssey.data.tokenizer import ConceptTokenizer, truncate_and_pad -TASK_INDEX = 1 -LABEL_INDEX = 2 -CUTOFF_INDEX = 3 +TASK_INDEX, LABEL_INDEX, CUTOFF_INDEX = 1, 2, 3 -TASK2INDEX = { +TASK_TO_INDEX = { "mortality_1month": 0, "readmission_1month": 1, - "los_1week": 2, - "c0": 3, + "los_1week": 2, # Length of stay + "c0": 3, # Condition 0 "c1": 4, "c2": 5, } + + +class BaseDataset(Dataset, ABC): + """Base class for datasets used in pretraining and finetuning. + + Parameters + ---------- + data : pd.DataFrame + The input data containing sequences to be tokenized. + tokenizer : ConceptTokenizer + An instance of the ConceptTokenizer class used for tokenizing sequences. + max_len : int, optional + The maximum length of the tokenized sequences, by default 2048. + """ + def __init__( + self, + data: pd.DataFrame, + tokenizer: ConceptTokenizer, + max_len: int = 2048, + ): + self.data = data + self.tokenizer = tokenizer + self.max_len = max_len + self.cutoff_col = next((col for col in self.data.columns if "cutoff" in col), None) + + def __len__(self) -> int: + """Return the length of the dataset.""" + return len(self.data) + + def tokenize_data(self, sequence: Union[str, List[str]]) -> Any: + """Tokenize the sequence. + + Parameters + ---------- + sequence : Union[str, List[str]] + The sequence to be tokenized. + + Returns + ------- + Any + A dictionary containing input_ids and additional information. + """ + return self.tokenizer(sequence, max_length=self.max_len) + + @abstractmethod + def __getitem__(self, idx: int) -> Dict[str, Any]: + """Get data at corresponding index. + + Parameters + ---------- + idx : int + The index of the data to be retrieved. + + Returns + ------- + Dict[str, Any] + A dictionary containing the tokenized data. + """ + pass + + +class TokenizationMixin: + """Mixin class for adding additional token types to the dataset.""" + + def add_additional_tokens(self, data: pd.Series) -> Dict[str, torch.Tensor]: + """Add additional token types to the dataset. + + Parameters + ---------- + data : pd.Series + A series containing token sequences and additional information. + + Returns + ------- + Dict[str, torch.Tensor] + A dictionary containing tensors for each additional token type. 
+ """ + type_tokens = torch.tensor(data[f"type_tokens_{self.max_len}"]) + age_tokens = torch.tensor(data[f"age_tokens_{self.max_len}"]) + time_tokens = torch.tensor(data[f"time_tokens_{self.max_len}"]) + visit_tokens = torch.tensor(data[f"visit_tokens_{self.max_len}"]) + position_tokens = torch.tensor(data[f"position_tokens_{self.max_len}"]) + + return { + "type_ids": type_tokens, + "ages": age_tokens, + "time_stamps": time_tokens, + "visit_orders": position_tokens, + "visit_segments": visit_tokens, + } + +class MaskingMixin: + """Mixin class for masking tokens in the dataset.""" + + def mask_tokens(self, sequence: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Mask the tokens in the sequence using vectorized operations. + + Parameters + ---------- + sequence : torch.Tensor + The sequence of tokens to be masked. + + Returns + ------- + Tuple[torch.Tensor, torch.Tensor] + A tuple containing masked sequence and labels. + + """ + mask_token_id = self.tokenizer.get_mask_token_id() + masked_sequence = sequence.clone() + + # Ignore [PAD], [UNK], [MASK] tokens + prob_matrix = torch.full(masked_sequence.shape, self.mask_prob) + prob_matrix[torch.where(masked_sequence <= mask_token_id)] = 0 + selected = torch.bernoulli(prob_matrix).bool() + + # 80% of the time, replace masked input tokens with respective mask tokens + replaced = torch.bernoulli(torch.full(selected.shape, 0.8)).bool() & selected + masked_sequence[replaced] = mask_token_id + + # 10% of the time, we replace masked input tokens with random vector. + randomized = ( + torch.bernoulli(torch.full(selected.shape, 0.1)).bool() + & selected + & ~replaced + ) + random_idx = torch.randint( + low=self.tokenizer.get_first_token_index(), + high=self.tokenizer.get_last_token_index(), + size=prob_matrix.shape, + dtype=torch.long, + ) + masked_sequence[randomized] = random_idx[randomized] + labels = torch.where(selected, sequence, -100) + + return masked_sequence, labels + +class MultiTaskMixin: + """Mixin class for handling multi-task datasets. + + Parameters + ---------- + tasks : List[str] + A list of tasks for which the model is being trained. + + Attributes + ---------- + tasks : List[str] + A list of tasks for which the model is being trained. + task_to_index : Dict[str, List[Tuple[int, str, int, Optional[int]]]] + A dictionary mapping each task to a list of tuples containing the + index, task, label, and cutoff. + index_mapper : List[Tuple[int, str, int, Optional[int]]] + A list of all datapoints to be used by __getitem__. + """ + def __init__(self, tasks: List[str]): + self.tasks = tasks + self.task_to_index = {task: [] for task in self.tasks} + self.index_mapper = [] + + def prepare_multi_task_data(self) -> None: + """Prepare multi-task data by mapping tasks to corresponding indices. + + This method precomputes indices for quick mapping in __getitem__ that + exclude missing labels. It helps in filtering out entries where the + label is missing for the specified tasks. + """ + self.data.reset_index(drop=True, inplace=True) + + for patient in self.data.itertuples(): + index = patient.Index + + for task in self.tasks: + label_col = f"label_{task}" + # Skip this task for the current patient if the label is missing. + if getattr(patient, label_col) == self.nan_indicator: + continue + + label = getattr(patient, label_col) + # Check for the existence of a task-specific cutoff in the data, + # else use None. 
+ if f"cutoff_{task}" in self.data.columns: + cutoff = getattr(patient, f"cutoff_{task}") + else: + cutoff = None + # Append a tuple containing the necessary information + # for training to index_mapper. + datapoint = (index, task, label, cutoff) + self.task_to_index[task].append(datapoint) + + # Create a list of all datapoints to be used by __getitem__ + self.index_mapper = [ + datapoints + for task_data in self.task_to_index.values() + for datapoints in task_data + ] + + +class LabelBalanceMixin: + """Mixin class for balancing labels in the dataset.""" + + def balance_labels(self, balance_guide: Optional[Dict[str, float]] = None) -> None: + """Balance the labels for the specified tasks in the dataset. + + Parameters + ---------- + balance_guide : Optional[Dict[str, float]] + A dictionary containing the desired positive ratios for each task. + """ + if not balance_guide: return; + + for task, positive_ratio in balance_guide.items(): + # Separate positive and negative datapoints + datapoints = self.task_to_index[task] + positives = [data for data in datapoints if data[LABEL_INDEX] == 1] + negatives = [data for data in datapoints if data[LABEL_INDEX] == 0] + + # Calculate the total number of samples needed to achieve the + # desired positive ratio + num_positives = len(positives) + total_needed = int(num_positives / positive_ratio) - num_positives + num_negatives_to_keep = min(len(negatives), total_needed) + + # Randomly select the negatives to keep + negatives_kept = random.sample(negatives, num_negatives_to_keep) + + # Combine the kept negatives with all positives + self.task_to_index[task] = positives + negatives_kept + + +class PretrainDataset(BaseDataset, TokenizationMixin, MaskingMixin): + """Dataset for pretraining the model. + + Parameters + ---------- + data : pd.DataFrame + The input data containing sequences to be tokenized and masked. + tokenizer : ConceptTokenizer + An instance of the ConceptTokenizer class used for tokenizing sequences. + max_len : int, optional + The maximum length of the tokenized sequences, by default 2048. + mask_prob : float, optional + The probability of masking a token in the sequence, by default 0.15. + + Attributes + ---------- + data : pd.DataFrame + Stores the input data. + tokenizer : ConceptTokenizer + Tokenizer used for tokenizing sequences. + max_len : int + Maximum length of the tokenized sequences. + mask_prob : float + Probability of masking a token in the sequence. + """ + def __init__( + self, + data: pd.DataFrame, + tokenizer: ConceptTokenizer, + max_len: int = 2048, + mask_prob: float = 0.15, + ): + super().__init__(data, tokenizer, max_len) + self.mask_prob = mask_prob + + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + """Get data at corresponding index. + + Parameters + ---------- + idx : int + The index of the data to be retrieved. + + Returns + ------- + Dict[str, torch.Tensor] + A dictionary containing all different token sequences along with + attention mask and labels. 
+ """ + data = self.data.iloc[idx] + cutoff = data[self.cutoff_col] if self.cutoff_col else None + data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) + + # Tokenize and mask the input data + tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) + concept_tokens = tokenized_input["input_ids"].squeeze() + attention_mask = tokenized_input["attention_mask"].squeeze() + masked_tokens, labels = self.mask_tokens(concept_tokens) + + # Prepare model input + tokens = self.add_additional_tokens(data) + tokens["concept_ids"] = masked_tokens + tokens["labels"] = labels + tokens["attention_mask"] = attention_mask + + return tokens + +class PretrainDatasetDecoder(BaseDataset, TokenizationMixin): + """Dataset for pretraining a decoder-based model (e.g. Mamba). + + The decoder is trained using the next token prediction task. + + Parameters + ---------- + data : pd.DataFrame + The input data containing sequences to be tokenized. + tokenizer : ConceptTokenizer + An instance of the ConceptTokenizer class used for tokenizing sequences. + max_len : int, optional + The maximum length of the tokenized sequences, by default 2048. + + Attributes + ---------- + data : pd.DataFrame + Stores the input data. + tokenizer : ConceptTokenizer + Tokenizer used for tokenizing sequences. + max_len : int + Maximum length of the tokenized sequences. + """ + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + """Get data at corresponding index. + + Parameters + ---------- + idx : int + The index of the data to be retrieved. + + Returns + ------- + Dict[str, torch.Tensor] + A dictionary containing all different token sequences along with + attention mask and labels. + """ + data = self.data.iloc[idx] + cutoff = data[self.cutoff_col] if self.cutoff_col else None + data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) + tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) + + # Prepare model input + tokens = self.add_additional_tokens(data) + tokens["concept_ids"] = tokenized_input["input_ids"].squeeze() + tokens["labels"] = tokens["concept_ids"] + + return tokens + +class FinetuneDataset(BaseDataset, TokenizationMixin): + """Dataset for finetuning the model. + + Parameters + ---------- + data : pd.DataFrame + The input data containing sequences to be tokenized. + tokenizer : ConceptTokenizer + An instance of the ConceptTokenizer class used for tokenizing sequences. + max_len : int, optional + The maximum length of the tokenized sequences, by default 2048. + + Attributes + ---------- + data : pd.DataFrame + Stores the input data. + tokenizer : ConceptTokenizer + Tokenizer used for tokenizing sequences. + max_len : int + Maximum length of the tokenized sequences. + """ + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: + """Get data at corresponding index. + + Parameters + ---------- + idx : int + The index of the data to be retrieved. + + Returns + ------- + Dict[str, torch.Tensor] + A dictionary containing all different token sequences along with + attention mask and labels. 
+ """ + data = self.data.iloc[idx] + cutoff = data[self.cutoff_col] if self.cutoff_col else None + data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) + tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) + + # Prepare model input + tokens = self.add_additional_tokens(data) + tokens["concept_ids"] = tokenized_input["input_ids"].squeeze() + tokens["attention_mask"] = tokenized_input["attention_mask"].squeeze() + tokens["labels"] = torch.tensor(data["label"]) + + return tokens + +class FinetuneMultiDataset(BaseDataset, TokenizationMixin, MultiTaskMixin, LabelBalanceMixin): + """Dataset for finetuning the model on multiple tasks. + + Parameters + ---------- + data : pd.DataFrame + The input data containing sequences to be tokenized. + tokenizer : ConceptTokenizer + An instance of the ConceptTokenizer class used for tokenizing sequences. + tasks : List[str] + A list of tasks (labels) for which the model is being finetuned. + balance_guide : Optional[Dict[str, float]], optional + A dictionary containing the desired positive ratios for each task, + by default None. + max_len : int, optional + The maximum length of the tokenized sequences, by default 2048. + nan_indicator : int, optional + Value used to represent missing labels in the dataset, by default -1. + + Attributes + ---------- + data : pd.DataFrame + Stores the input data. + tokenizer : ConceptTokenizer + Tokenizer used for tokenizing sequences. + tasks : List[str] + A list of tasks (labels) for which the model is being finetuned. + balance_guide : Optional[Dict[str, float]] + A dictionary containing the desired positive ratios for each task. + max_len : int + Maximum length of the tokenized sequences. + nan_indicator : int + Value used to represent missing labels in the dataset. + task_to_index : Dict[str, List[Tuple[int, str, int, Optional[int]]]] + A dictionary mapping each task to a list of tuples containing the + index, task, label, and cutoff. + index_mapper : List[Tuple[int, str, int, Optional[int]]] + A list of all datapoints to be used by __getitem__. + """ + def __init__( + self, + data: pd.DataFrame, + tokenizer: ConceptTokenizer, + tasks: List[str], + balance_guide: Optional[Dict[str, float]] = None, + max_len: int = 2048, + nan_indicator: int = -1, + ): + BaseDataset.__init__(self, data, tokenizer, max_len) + MultiTaskMixin.__init__(self, tasks) + self.nan_indicator = nan_indicator + self.balance_guide = balance_guide + self.prepare_multi_task_data() + self.balance_labels(self.balance_guide) + + def __len__(self) -> int: + """Return the length of the dataset.""" + return len(self.index_mapper) + + def __getitem__(self, idx: int) -> Dict[str, Any]: + """Get data at corresponding index. + + Parameters + ---------- + idx : int + The index of the data to be retrieved. + + Returns + ------- + Dict[str, Any] + A dictionary containing all different token sequences along with + attention mask and labels. + """ + index, task, label, cutoff = self.index_mapper[idx] + data = self.data.iloc[index] + + # Swap the first token with the task token. 
+ data[f"event_tokens_{self.max_len}"][0] = self.tokenizer.task_to_token(task) + data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) + tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) + + # Prepare model input + tokens = self.add_additional_tokens(data) + tokens["concept_ids"] = tokenized_input["input_ids"].squeeze() + tokens["attention_mask"] = tokenized_input["attention_mask"].squeeze() + tokens["labels"] = torch.tensor(label) + tokens["task"] = task + + return tokens + +class FinetuneDatasetDecoder(BaseDataset, TokenizationMixin, MultiTaskMixin, LabelBalanceMixin): + """Dataset for finetuning a decoder-based model. + + Parameters + ---------- + data : pd.DataFrame + The input data containing sequences to be tokenized. + tokenizer : ConceptTokenizer + An instance of the ConceptTokenizer class used for tokenizing sequences. + tasks : List[str] + A list of tasks (labels) for which the model is being finetuned. + balance_guide : Optional[Dict[str, float]], optional + A dictionary containing the desired positive ratios for each task, + by default None. + max_len : int, optional + The maximum length of the tokenized sequences, by default 2048. + nan_indicator : int, optional + Value used to represent missing labels in the dataset, by default -1. + is_single_head : bool, optional + Indicating if the model uses one head for all classifications or not. + + Attributes + ---------- + data : pd.DataFrame + Stores the input data. + tokenizer : ConceptTokenizer + Tokenizer used for tokenizing sequences. + tasks : List[str] + A list of tasks (labels) for which the model is being finetuned. + balance_guide : Optional[Dict[str, float]] + A dictionary containing the desired positive ratios for each task. + max_len : int + Maximum length of the tokenized sequences. + nan_indicator : int + Value used to represent missing labels in the dataset. + is_single_head : bool + Indicating if the model uses one head for all classifications or not. + task_to_index : Dict[str, List[Tuple[int, str, int, Optional[int]]]] + A dictionary mapping each task to a list of tuples containing the + index, task, label, and cutoff. + index_mapper : List[Tuple[int, str, int, Optional[int]]] + A list of all datapoints to be used by __getitem__. + """ + def __init__( + self, + data: pd.DataFrame, + tokenizer: ConceptTokenizer, + tasks: List[str], + balance_guide: Optional[Dict[str, float]] = None, + max_len: int = 2048, + nan_indicator: int = -1, + is_single_head: bool = True, + ): + BaseDataset.__init__(self, data, tokenizer, max_len) + MultiTaskMixin.__init__(self, tasks) + self.nan_indicator = nan_indicator + self.is_single_head = is_single_head + self.balance_guide = balance_guide + self.prepare_multi_task_data() + self.balance_labels(balance_guide) + + def __len__(self) -> int: + """Return the length of the dataset.""" + return len(self.index_mapper) + + def __getitem__(self, idx: int) -> Dict[str, Any]: + """Get data at corresponding index. + + Parameters + ---------- + idx : int + The index of the data to be retrieved. + + Returns + ------- + Dict[str, Any] + A dictionary containing all different token sequences along with labels. + """ + index, task, label, cutoff = self.index_mapper[idx] + data = self.data.iloc[index] + + # Swap the first and last token with the task token. 
+ if self.is_single_head: + data[f"event_tokens_{self.max_len}"][0] = self.tokenizer.task_to_token(task) + data[f"event_tokens_{self.max_len}"][-1] = self.tokenizer.task_to_token(task) + else: + data[f"event_tokens_{self.max_len}"][-1] = data[f"event_tokens_{self.max_len}"][0] + + data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) + tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) + + # Prepare model input + tokens = self.add_additional_tokens(data) + tokens["concept_ids"] = tokenized_input["input_ids"].squeeze() + tokens["labels"] = torch.tensor(label) + tokens["task"] = task + tokens["task_indices"] = torch.tensor(TASK_TO_INDEX[task]) + + return tokens + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + class PretrainDataset(Dataset): @@ -264,7 +881,7 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: cutoff = data[self.cutoff_col] if self.cutoff_col else None # Truncate and pad the data to the specified cutoff. - data = truncate_and_pad(data, cutoff, self.max_len) + data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) # Prepare model input tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) @@ -789,7 +1406,7 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: visit_tokens = torch.tensor(visit_tokens) position_tokens = torch.tensor(position_tokens) labels = torch.tensor(labels) - task_indices = torch.tensor(TASK2INDEX[task]) + task_indices = torch.tensor(TASK_TO_INDEX[task]) return { "concept_ids": concept_ids, From 4bcab3a5f6aa577212607419ee632e05a76df406 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 21 Jun 2024 15:55:06 +0000 Subject: [PATCH 2/9] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- odyssey/data/dataset.py | 85 ++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 48 deletions(-) diff --git a/odyssey/data/dataset.py b/odyssey/data/dataset.py index 043ca10..943171b 100644 --- a/odyssey/data/dataset.py +++ b/odyssey/data/dataset.py @@ -16,8 +16,8 @@ TASK_TO_INDEX = { "mortality_1month": 0, "readmission_1month": 1, - "los_1week": 2, # Length of stay - "c0": 3, # Condition 0 + "los_1week": 2, # Length of stay + "c0": 3, # Condition 0 "c1": 4, "c2": 5, } @@ -35,6 +35,7 @@ class BaseDataset(Dataset, ABC): max_len : int, optional The maximum length of the tokenized sequences, by default 2048. """ + def __init__( self, data: pd.DataFrame, @@ -44,7 +45,9 @@ def __init__( self.data = data self.tokenizer = tokenizer self.max_len = max_len - self.cutoff_col = next((col for col in self.data.columns if "cutoff" in col), None) + self.cutoff_col = next( + (col for col in self.data.columns if "cutoff" in col), None + ) def __len__(self) -> int: """Return the length of the dataset.""" @@ -112,6 +115,7 @@ def add_additional_tokens(self, data: pd.Series) -> Dict[str, torch.Tensor]: "visit_segments": visit_tokens, } + class MaskingMixin: """Mixin class for masking tokens in the dataset.""" @@ -158,6 +162,7 @@ def mask_tokens(self, sequence: torch.Tensor) -> Tuple[torch.Tensor, torch.Tenso return masked_sequence, labels + class MultiTaskMixin: """Mixin class for handling multi-task datasets. @@ -176,6 +181,7 @@ class MultiTaskMixin: index_mapper : List[Tuple[int, str, int, Optional[int]]] A list of all datapoints to be used by __getitem__. 
""" + def __init__(self, tasks: List[str]): self.tasks = tasks self.task_to_index = {task: [] for task in self.tasks} @@ -230,7 +236,8 @@ def balance_labels(self, balance_guide: Optional[Dict[str, float]] = None) -> No balance_guide : Optional[Dict[str, float]] A dictionary containing the desired positive ratios for each task. """ - if not balance_guide: return; + if not balance_guide: + return for task, positive_ratio in balance_guide.items(): # Separate positive and negative datapoints @@ -276,6 +283,7 @@ class PretrainDataset(BaseDataset, TokenizationMixin, MaskingMixin): mask_prob : float Probability of masking a token in the sequence. """ + def __init__( self, data: pd.DataFrame, @@ -303,7 +311,7 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: data = self.data.iloc[idx] cutoff = data[self.cutoff_col] if self.cutoff_col else None data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) - + # Tokenize and mask the input data tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) concept_tokens = tokenized_input["input_ids"].squeeze() @@ -318,6 +326,7 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: return tokens + class PretrainDatasetDecoder(BaseDataset, TokenizationMixin): """Dataset for pretraining a decoder-based model (e.g. Mamba). @@ -341,6 +350,7 @@ class PretrainDatasetDecoder(BaseDataset, TokenizationMixin): max_len : int Maximum length of the tokenized sequences. """ + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: """Get data at corresponding index. @@ -359,7 +369,7 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: cutoff = data[self.cutoff_col] if self.cutoff_col else None data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) - + # Prepare model input tokens = self.add_additional_tokens(data) tokens["concept_ids"] = tokenized_input["input_ids"].squeeze() @@ -367,6 +377,7 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: return tokens + class FinetuneDataset(BaseDataset, TokenizationMixin): """Dataset for finetuning the model. @@ -388,6 +399,7 @@ class FinetuneDataset(BaseDataset, TokenizationMixin): max_len : int Maximum length of the tokenized sequences. """ + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: """Get data at corresponding index. @@ -406,7 +418,7 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: cutoff = data[self.cutoff_col] if self.cutoff_col else None data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) - + # Prepare model input tokens = self.add_additional_tokens(data) tokens["concept_ids"] = tokenized_input["input_ids"].squeeze() @@ -415,7 +427,10 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: return tokens -class FinetuneMultiDataset(BaseDataset, TokenizationMixin, MultiTaskMixin, LabelBalanceMixin): + +class FinetuneMultiDataset( + BaseDataset, TokenizationMixin, MultiTaskMixin, LabelBalanceMixin +): """Dataset for finetuning the model on multiple tasks. Parameters @@ -454,6 +469,7 @@ class FinetuneMultiDataset(BaseDataset, TokenizationMixin, MultiTaskMixin, Label index_mapper : List[Tuple[int, str, int, Optional[int]]] A list of all datapoints to be used by __getitem__. 
""" + def __init__( self, data: pd.DataFrame, @@ -505,7 +521,10 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: return tokens -class FinetuneDatasetDecoder(BaseDataset, TokenizationMixin, MultiTaskMixin, LabelBalanceMixin): + +class FinetuneDatasetDecoder( + BaseDataset, TokenizationMixin, MultiTaskMixin, LabelBalanceMixin +): """Dataset for finetuning a decoder-based model. Parameters @@ -548,6 +567,7 @@ class FinetuneDatasetDecoder(BaseDataset, TokenizationMixin, MultiTaskMixin, Lab index_mapper : List[Tuple[int, str, int, Optional[int]]] A list of all datapoints to be used by __getitem__. """ + def __init__( self, data: pd.DataFrame, @@ -585,17 +605,21 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: """ index, task, label, cutoff = self.index_mapper[idx] data = self.data.iloc[index] - + # Swap the first and last token with the task token. if self.is_single_head: data[f"event_tokens_{self.max_len}"][0] = self.tokenizer.task_to_token(task) - data[f"event_tokens_{self.max_len}"][-1] = self.tokenizer.task_to_token(task) + data[f"event_tokens_{self.max_len}"][-1] = self.tokenizer.task_to_token( + task + ) else: - data[f"event_tokens_{self.max_len}"][-1] = data[f"event_tokens_{self.max_len}"][0] + data[f"event_tokens_{self.max_len}"][-1] = data[ + f"event_tokens_{self.max_len}" + ][0] data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) - + # Prepare model input tokens = self.add_additional_tokens(data) tokens["concept_ids"] = tokenized_input["input_ids"].squeeze() @@ -606,41 +630,6 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: return tokens - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - class PretrainDataset(Dataset): """Dataset for pretraining the model. From 2243fbf1d8ab821228f46cae6067855b15dfcc1a Mon Sep 17 00:00:00 2001 From: afallah Date: Fri, 21 Jun 2024 14:49:08 -0400 Subject: [PATCH 3/9] Add TODOs --- odyssey/data/dataset.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/odyssey/data/dataset.py b/odyssey/data/dataset.py index 043ca10..d934eaf 100644 --- a/odyssey/data/dataset.py +++ b/odyssey/data/dataset.py @@ -1,4 +1,4 @@ -"""Data module for pretraining and finetuning the model.""" +"""Data module for pretraining and finetuning models.""" import random from abc import ABC, abstractmethod @@ -23,6 +23,12 @@ } +# TODO: Make sure the order of keys in the dict is correct +# TODO: Make sure code functionality stays the same +# TODO: Makse sure code passes some tests +# TODO: Make sure code is well documented + + class BaseDataset(Dataset, ABC): """Base class for datasets used in pretraining and finetuning. From d7b799f945690167dde0d8feecc91ff42f8c2bda Mon Sep 17 00:00:00 2001 From: afallah Date: Fri, 21 Jun 2024 23:01:50 -0400 Subject: [PATCH 4/9] Fix bugs and ensure refactor functionality is valid. 
--- odyssey/data/dataset.py | 820 +--------------------------------------- 1 file changed, 2 insertions(+), 818 deletions(-) diff --git a/odyssey/data/dataset.py b/odyssey/data/dataset.py index 286b0f8..354cc0d 100644 --- a/odyssey/data/dataset.py +++ b/odyssey/data/dataset.py @@ -11,8 +11,10 @@ from odyssey.data.tokenizer import ConceptTokenizer, truncate_and_pad +# Index of features in tuples used in multi-task datasets TASK_INDEX, LABEL_INDEX, CUTOFF_INDEX = 1, 2, 3 +# Mapping of tasks to indices for multi-task datasets TASK_TO_INDEX = { "mortality_1month": 0, "readmission_1month": 1, @@ -23,12 +25,6 @@ } -# TODO: Make sure the order of keys in the dict is correct -# TODO: Make sure code functionality stays the same -# TODO: Makse sure code passes some tests -# TODO: Make sure code is well documented - - class BaseDataset(Dataset, ABC): """Base class for datasets used in pretraining and finetuning. @@ -634,815 +630,3 @@ def __getitem__(self, idx: int) -> Dict[str, Any]: tokens["task_indices"] = torch.tensor(TASK_TO_INDEX[task]) return tokens - - -class PretrainDataset(Dataset): - """Dataset for pretraining the model. - - Parameters - ---------- - data : pd.DataFrame - The input data containing sequences to be tokenized and masked. - tokenizer : ConceptTokenizer - An instance of the ConceptTokenizer class used for tokenizing sequences. - max_len : int, optional - The maximum length of the tokenized sequences, by default 2048. - mask_prob : float, optional - The probability of masking a token in the sequence, by default 0.15. - - Attributes - ---------- - data : pd.DataFrame - Stores the input data. - tokenizer : ConceptTokenizer - Tokenizer used for tokenizing sequences. - max_len : int - Maximum length of the tokenized sequences. - mask_prob : float - Probability of masking a token in the sequence. - """ - - def __init__( - self, - data: pd.DataFrame, - tokenizer: ConceptTokenizer, - max_len: int = 2048, - mask_prob: float = 0.15, - ): - """Initiate the class.""" - super(PretrainDataset, self).__init__() - self.data = data - self.tokenizer = tokenizer - self.max_len = max_len - self.mask_prob = mask_prob - # Find the cutoff column in the data if it exists. - for column in self.data.columns: - if "cutoff" in column: - self.cutoff_col = column - else: - self.cutoff_col = None - - def __len__(self) -> int: - """Return the length of the dataset.""" - return len(self.data) - - def tokenize_data(self, sequence: Union[str, List[str]]) -> Any: - """Tokenize the sequence and return input_ids and attention mask. - - Parameters - ---------- - sequence : Union[str, List[str]] - The sequence to be tokenized. - - Returns - ------- - Any - A dictionary containing input_ids and attention_mask. - - """ - return self.tokenizer(sequence, max_length=self.max_len) - - def mask_tokens(self, sequence: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - """Mask the tokens in the sequence using vectorized operations. - - Parameters - ---------- - sequence : torch.Tensor - The sequence of tokens to be masked. - - Returns - ------- - Tuple[torch.Tensor, torch.Tensor] - A tuple containing masked sequence and labels. 
- - """ - mask_token_id = self.tokenizer.get_mask_token_id() - masked_sequence = sequence.clone() - - # Ignore [PAD], [UNK], [MASK] tokens - prob_matrix = torch.full(masked_sequence.shape, self.mask_prob) - prob_matrix[torch.where(masked_sequence <= mask_token_id)] = 0 - selected = torch.bernoulli(prob_matrix).bool() - - # 80% of the time, replace masked input tokens with respective mask tokens - replaced = torch.bernoulli(torch.full(selected.shape, 0.8)).bool() & selected - masked_sequence[replaced] = mask_token_id - - # 10% of the time, we replace masked input tokens with random vector. - randomized = ( - torch.bernoulli(torch.full(selected.shape, 0.1)).bool() - & selected - & ~replaced - ) - random_idx = torch.randint( - low=self.tokenizer.get_first_token_index(), - high=self.tokenizer.get_last_token_index(), - size=prob_matrix.shape, - dtype=torch.long, - ) - masked_sequence[randomized] = random_idx[randomized] - labels = torch.where(selected, sequence, -100) - - return masked_sequence, labels - - def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: - """Get data at corresponding index. - - Parameters - ---------- - idx : int - The index of the data to be retrieved. - - Returns - ------- - Dict[str, torch.Tensor] - A dictionary containing all different token sequences along with - attention mask and labels. - - """ - data = self.data.iloc[idx] - cutoff = data[self.cutoff_col] if self.cutoff_col else None - data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) - tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) - concept_tokens = tokenized_input["input_ids"].squeeze() - attention_mask = tokenized_input["attention_mask"].squeeze() - - type_tokens = data[f"type_tokens_{self.max_len}"] - age_tokens = data[f"age_tokens_{self.max_len}"] - time_tokens = data[f"time_tokens_{self.max_len}"] - visit_tokens = data[f"visit_tokens_{self.max_len}"] - position_tokens = data[f"position_tokens_{self.max_len}"] - - masked_tokens, labels = self.mask_tokens(concept_tokens) - - type_tokens = torch.tensor(type_tokens) - age_tokens = torch.tensor(age_tokens) - time_tokens = torch.tensor(time_tokens) - visit_tokens = torch.tensor(visit_tokens) - position_tokens = torch.tensor(position_tokens) - - return { - "concept_ids": masked_tokens, - "type_ids": type_tokens, - "ages": age_tokens, - "time_stamps": time_tokens, - "visit_orders": position_tokens, - "visit_segments": visit_tokens, - "labels": labels, - "attention_mask": attention_mask, - } - - -class PretrainDatasetDecoder(Dataset): - """Dataset for pretraining a decoder-based model (e.g. Mamba). - - The decoder is trained using the next token prediction task. - - Parameters - ---------- - data : pd.DataFrame - The input data containing sequences to be tokenized and masked. - tokenizer : ConceptTokenizer - An instance of the ConceptTokenizer class used for tokenizing sequences. - max_len : int, optional - The maximum length of the tokenized sequences, by default 2048. - - Attributes - ---------- - data : pd.DataFrame - Stores the input data. - tokenizer : ConceptTokenizer - Tokenizer used for tokenizing sequences. - max_len : int - Maximum length of the tokenized sequences. - - """ - - def __init__( - self, - data: pd.DataFrame, - tokenizer: ConceptTokenizer, - max_len: int = 2048, - ): - """Initiate the class.""" - super().__init__() - self.data = data - self.tokenizer = tokenizer - self.max_len = max_len - - # Find the cutoff column in the data if it exists. 
- for column in self.data.columns: - if "cutoff" in column: - self.cutoff_col = column - else: - self.cutoff_col = None - - def __len__(self) -> int: - """Return the length of the dataset.""" - return len(self.data) - - def tokenize_data(self, sequence: Union[str, List[str]]) -> Any: - """Tokenize the sequence and return input_ids and attention mask. - - Parameters - ---------- - sequence : Union[str, List[str]] - The sequence to be tokenized. - - Returns - ------- - Any - A dictionary containing input_ids and attention_mask. - - """ - return self.tokenizer(sequence, max_length=self.max_len) - - def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: - """Get data at corresponding index. - - Parameters - ---------- - idx : int - The index of the data to be retrieved. - - Returns - ------- - Dict[str, torch.Tensor] - A dictionary containing all different token sequences along with - attention mask and labels. - - """ - data = self.data.iloc[idx] - cutoff = data[self.cutoff_col] if self.cutoff_col else None - - # Truncate and pad the data to the specified cutoff. - data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) - - # Prepare model input - tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) - concept_ids = tokenized_input["input_ids"].squeeze() - - type_tokens = data[f"type_tokens_{self.max_len}"] - age_tokens = data[f"age_tokens_{self.max_len}"] - time_tokens = data[f"time_tokens_{self.max_len}"] - visit_tokens = data[f"visit_tokens_{self.max_len}"] - position_tokens = data[f"position_tokens_{self.max_len}"] - - type_tokens = torch.tensor(type_tokens) - age_tokens = torch.tensor(age_tokens) - time_tokens = torch.tensor(time_tokens) - visit_tokens = torch.tensor(visit_tokens) - position_tokens = torch.tensor(position_tokens) - - return { - "concept_ids": concept_ids, - "type_ids": type_tokens, - "ages": age_tokens, - "time_stamps": time_tokens, - "visit_orders": position_tokens, - "visit_segments": visit_tokens, - "labels": concept_ids, - } - - -class FinetuneDataset(Dataset): - """Dataset for finetuning the model. - - Parameters - ---------- - data : pd.DataFrame - The input data containing sequences to be tokenized and masked. - tokenizer : ConceptTokenizer - An instance of the ConceptTokenizer class used for tokenizing sequences. - max_len : int, optional - The maximum length of the tokenized sequences, by default 2048. - - Attributes - ---------- - data : pd.DataFrame - Stores the input data. - tokenizer : ConceptTokenizer - Tokenizer used for tokenizing sequences. - max_len : int - Maximum length of the tokenized sequences. - - """ - - def __init__( - self, - data: pd.DataFrame, - tokenizer: ConceptTokenizer, - max_len: int = 2048, - ): - """Initiate the class.""" - super(FinetuneDataset, self).__init__() - self.data = data - self.tokenizer = tokenizer - self.max_len = max_len - # Find the cutoff column in the data if it exists. - for column in self.data.columns: - if "cutoff" in column: - self.cutoff_col = column - else: - self.cutoff_col = None - - def __len__(self) -> int: - """Return the length of dataset.""" - return len(self.data) - - def tokenize_data(self, sequence: Union[str, List[str]]) -> Any: - """Tokenize the sequence and return input_ids and attention mask. - - Parameters - ---------- - sequence : Union[str, List[str]] - The sequence to be tokenized. - - Returns - ------- - Any - A dictionary containing input_ids and attention_mask. 
- - """ - return self.tokenizer(sequence, max_length=self.max_len) - - def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: - """Get data at corresponding index. - - Parameters - ---------- - idx : int - The index of the data to be retrieved. - - Returns - ------- - Dict[str, torch.Tensor] - A dictionary containing all different token sequences along with - attention mask and labels. - - """ - data = self.data.iloc[idx] - cutoff = data[self.cutoff_col] if self.cutoff_col else None - data = truncate_and_pad(data, cutoff=cutoff, max_len=self.max_len) - tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) - concept_tokens = tokenized_input["input_ids"].squeeze() - attention_mask = tokenized_input["attention_mask"].squeeze() - - type_tokens = data[f"type_tokens_{self.max_len}"] - age_tokens = data[f"age_tokens_{self.max_len}"] - time_tokens = data[f"time_tokens_{self.max_len}"] - visit_tokens = data[f"visit_tokens_{self.max_len}"] - position_tokens = data[f"position_tokens_{self.max_len}"] - labels = data["label"] - - type_tokens = torch.tensor(type_tokens) - age_tokens = torch.tensor(age_tokens) - time_tokens = torch.tensor(time_tokens) - visit_tokens = torch.tensor(visit_tokens) - position_tokens = torch.tensor(position_tokens) - labels = torch.tensor(labels) - - return { - "concept_ids": concept_tokens, - "type_ids": type_tokens, - "ages": age_tokens, - "time_stamps": time_tokens, - "visit_orders": position_tokens, - "visit_segments": visit_tokens, - "labels": labels, - "attention_mask": attention_mask, - } - - def __iter__(self) -> Any: - """Return an iterator over the dataset.""" - for i in range(len(self)): - yield self[i] - - -class FinetuneMultiDataset(Dataset): - """Dataset for finetuning the model on multi dataset. - - Parameters - ---------- - data : pd.DataFrame - The input data containing sequences to be tokenized and masked. - tokenizer : ConceptTokenizer - An instance of the ConceptTokenizer class used for tokenizing sequences. - tasks : List[str] - A list of tasks (labels) that need to be predicted. - balance_guide : Optional[Dict[str, float]], optional - A dictionary containing the desired positive ratios for each task, - by default None. - max_len : int, optional - The maximum length of the tokenized sequences, by default 2048. - nan_indicator : int, optional - Value used to represent missing labels in the dataset, by default -1. - - Attributes - ---------- - data : pd.DataFrame - Stores the input data. - tokenizer : ConceptTokenizer - Tokenizer used for tokenizing sequences. - tasks : List[str] - A list of tasks (labels) that need to be predicted. - balance_guide : Optional[Dict[str, float]] - A dictionary containing the desired positive ratios for each task. - max_len : int - Maximum length of the tokenized sequences. - nan_indicator : int - Value used to represent missing labels in the dataset. - task_to_index : Dict[str, List[Tuple[int, str, int, Optional[int]]]] - A dictionary mapping each task to a list of tuples containing the - index, task, label, and cutoff. - index_mapper : List[Tuple[int, str, int, Optional[int]]] - A list of all datapoints to be used by __getitem__. 
- - """ - - def __init__( - self, - data: pd.DataFrame, - tokenizer: ConceptTokenizer, - tasks: List[str], - balance_guide: Optional[Dict[str, float]] = None, - max_len: int = 2048, - nan_indicator: int = -1, - ): - """Initiate the class.""" - super(FinetuneMultiDataset, self).__init__() - - self.data = data - self.tokenizer = tokenizer - self.tasks = tasks # List of tasks for which the model is being finetuned. - self.balance_guide = balance_guide - self.max_len = max_len - self.nan_indicator = ( - nan_indicator # Value used to indicate missing data in labels. - ) - - # Precompute indices for quick mapping in __getitem__ that - # exclude missing labels. - # This helps in filtering out entries where the label is missing - # for the specified tasks. - self.task_to_index = {task: [] for task in self.tasks} - self.data.reset_index(drop=True, inplace=True) - - for patient in self.data.itertuples(): - index = patient.Index - - for task in self.tasks: - label_col = f"label_{task}" - # Skip this task for the current patient if the label is missing. - if getattr(patient, label_col) == self.nan_indicator: - continue - - label = getattr(patient, label_col) - # Check for the existence of a task-specific cutoff in the data, - # else use None. - if f"cutoff_{task}" in self.data.columns: - cutoff = getattr(patient, f"cutoff_{task}") - else: - cutoff = None - # Append a tuple containing the necessary information - # for training to index_mapper. - datapoint = (index, task, label, cutoff) - self.task_to_index[task].append(datapoint) - - # Balance labels for specified tasks - if self.balance_guide: - for task in self.balance_guide: - self.balance_labels(task=task, positive_ratio=self.balance_guide[task]) - - # Create a list of all datapoints to be used by __getitem__ - self.index_mapper = [ - datapoints - for task_data in self.task_to_index.values() - for datapoints in task_data - ] - - def __len__(self) -> int: - """Return the length of dataset.""" - return len(self.index_mapper) - - def __getitem__(self, idx: int) -> Dict[str, Any]: - """Get data at corresponding index. - - Parameters - ---------- - idx : int - The index of the data to be retrieved. - - Returns - ------- - Dict[str, Any] - A dictionary containing all different token sequences along with - attention mask and labels. - - """ - index, task, labels, cutoff = self.index_mapper[idx] - data = self.data.iloc[index] - - # Swap the first token with the task token. - data[f"event_tokens_{self.max_len}"][0] = self.tokenizer.task_to_token(task) - - # Truncate and pad the data to the specified cutoff. 
- data = truncate_and_pad(data, cutoff, self.max_len) - - # Prepare model input - tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) - concept_tokens = tokenized_input["input_ids"].squeeze() - attention_mask = tokenized_input["attention_mask"].squeeze() - - type_tokens = data[f"type_tokens_{self.max_len}"] - age_tokens = data[f"age_tokens_{self.max_len}"] - time_tokens = data[f"time_tokens_{self.max_len}"] - visit_tokens = data[f"visit_tokens_{self.max_len}"] - position_tokens = data[f"position_tokens_{self.max_len}"] - - type_tokens = torch.tensor(type_tokens) - age_tokens = torch.tensor(age_tokens) - time_tokens = torch.tensor(time_tokens) - visit_tokens = torch.tensor(visit_tokens) - position_tokens = torch.tensor(position_tokens) - labels = torch.tensor(labels) - - return { - "concept_ids": concept_tokens, - "type_ids": type_tokens, - "ages": age_tokens, - "time_stamps": time_tokens, - "visit_orders": position_tokens, - "visit_segments": visit_tokens, - "labels": labels, - "attention_mask": attention_mask, - "task": task, - } - - def tokenize_data(self, sequence: Union[str, List[str]]) -> Any: - """Tokenize the sequence and return input_ids and attention mask. - - Parameters - ---------- - sequence : Union[str, List[str]] - The sequence to be tokenized. - - Returns - ------- - Any - A dictionary containing input_ids and attention_mask. - - """ - return self.tokenizer(sequence, max_length=self.max_len) - - def balance_labels(self, task: str, positive_ratio: float) -> None: - """Balance the labels for the specified task in the dataset. - - This function modifies the dataset to ensure that the ratio of positive samples - to the total number of samples matches the specified positive_ratio, - while keeping all positive data points. - - Parameters - ---------- - task : str - The task for which the labels need to be balanced. - positive_ratio : float - The desired positive ratio for the task. - - """ - # Separate positive and negative datapoints - datapoints = self.task_to_index[task] - positives = [data for data in datapoints if data[LABEL_INDEX] == 1] - negatives = [data for data in datapoints if data[LABEL_INDEX] == 0] - - # Calculate the total number of samples needed to achieve the - # desired positive ratio - num_positives = len(positives) - total_needed = int(num_positives / positive_ratio) - num_positives - num_negatives_to_keep = min(len(negatives), total_needed) - - # Randomly select the negatives to keep - negatives_kept = random.sample(negatives, num_negatives_to_keep) - - # Combine the kept negatives with all positives - self.task_to_index[task] = positives + negatives_kept - - -class FinetuneDatasetDecoder(Dataset): - """Dataset for finetuning a decoder-based model. - - Parameters - ---------- - data : pd.DataFrame - The input data containing sequences to be tokenized and masked. - tokenizer : ConceptTokenizer - An instance of the ConceptTokenizer class used for tokenizing sequences. - tasks : List[str] - A list of tasks (labels) that need to be predicted. - balance_guide : Optional[Dict[str, float]], optional - A dictionary containing the desired positive ratios for each task, - by default None. - max_len : int, optional - The maximum length of the tokenized sequences, by default 2048. - nan_indicator : int, optional - Value used to represent missing labels in the dataset, by default -1. - is_single_head : bool, optional - Indicating if the model uses one head for all classifications or not. 
- - Attributes - ---------- - data : pd.DataFrame - Stores the input data. - tokenizer : ConceptTokenizer - Tokenizer used for tokenizing sequences. - tasks : List[str] - A list of tasks (labels) that need to be predicted. - balance_guide : Optional[Dict[str, float]] - A dictionary containing the desired positive ratios for each task. - max_len : int - Maximum length of the tokenized sequences. - nan_indicator : int - Value used to represent missing labels in the dataset. - is_single_head : bool, optional - Indicating if the model uses one head for all classifications or not. - task_to_index : Dict[str, List[Tuple[int, str, int, Optional[int]]]] - A dictionary mapping each task to a list of tuples containing the - index, task, label, and cutoff. - index_mapper : List[Tuple[int, str, int, Optional[int]]] - A list of all datapoints to be used by __getitem__. - """ - - def __init__( - self, - data: pd.DataFrame, - tokenizer: ConceptTokenizer, - tasks: List[str], - balance_guide: Optional[Dict[str, float]] = None, - max_len: int = 2048, - nan_indicator: int = -1, - is_single_head: bool = True, - ): - """Initiate the class.""" - super().__init__() - - self.data = data - self.tokenizer = tokenizer - self.tasks = tasks # List of tasks for which the model is being finetuned. - self.balance_guide = balance_guide - self.max_len = max_len - self.nan_indicator = ( - nan_indicator # Value used to indicate missing data in labels. - ) - self.is_single_head = is_single_head - - # Precompute indices for quick mapping in __getitem__ that - # exclude missing labels. - # This helps in filtering out entries where the label is missing - # for the specified tasks. - self.task_to_index = {task: [] for task in self.tasks} - self.data.reset_index(drop=True, inplace=True) - - for patient in self.data.itertuples(): - index = patient.Index - - for task in self.tasks: - label_col = f"label_{task}" - # Skip this task for the current patient if the label is missing. - if getattr(patient, label_col) == self.nan_indicator: - continue - - label = getattr(patient, label_col) - # Check for the existence of a task-specific cutoff in the data, - # else use None. - if f"cutoff_{task}" in self.data.columns: - cutoff = getattr(patient, f"cutoff_{task}") - else: - cutoff = None - # Append a tuple containing the necessary information - # for training to index_mapper. - datapoint = (index, task, label, cutoff) - self.task_to_index[task].append(datapoint) - - # Balance labels for specified tasks - if self.balance_guide: - for task in self.balance_guide: - self.balance_labels(task=task, positive_ratio=self.balance_guide[task]) - - # Create a list of all datapoints to be used by __getitem__ - self.index_mapper = [ - datapoints - for task_data in self.task_to_index.values() - for datapoints in task_data - ] - - def __len__(self) -> int: - """Return the length of dataset.""" - return len(self.index_mapper) - - def tokenize_data(self, sequence: Union[str, List[str]]) -> Any: - """Tokenize the sequence and return input_ids and attention mask. - - Parameters - ---------- - sequence : Union[str, List[str]] - The sequence to be tokenized. - - Returns - ------- - Any - A dictionary containing input_ids and attention_mask. - - """ - return self.tokenizer(sequence, max_length=self.max_len) - - def __getitem__(self, idx: int) -> Dict[str, Any]: - """Get data at corresponding index. - - Parameters - ---------- - idx : int - The index of the data to be retrieved. 
- - Returns - ------- - Dict[str, Any] - A dictionary containing all different token sequences along with labels. - """ - index, task, labels, cutoff = self.index_mapper[idx] - data = self.data.iloc[index] - - # Swap the first and last token with the task token. - if self.is_single_head: - data[f"event_tokens_{self.max_len}"][0] = self.tokenizer.task_to_token(task) - data[f"event_tokens_{self.max_len}"][-1] = self.tokenizer.task_to_token( - task - ) - else: - data[f"event_tokens_{self.max_len}"][-1] = data[ - f"event_tokens_{self.max_len}" - ][0] - - # Truncate and pad the data to the specified cutoff. - data = truncate_and_pad(data, cutoff, self.max_len) - - # Prepare model input - tokenized_input = self.tokenize_data(data[f"event_tokens_{self.max_len}"]) - concept_ids = tokenized_input["input_ids"].squeeze() - - type_tokens = data[f"type_tokens_{self.max_len}"] - age_tokens = data[f"age_tokens_{self.max_len}"] - time_tokens = data[f"time_tokens_{self.max_len}"] - visit_tokens = data[f"visit_tokens_{self.max_len}"] - position_tokens = data[f"position_tokens_{self.max_len}"] - - type_tokens = torch.tensor(type_tokens) - age_tokens = torch.tensor(age_tokens) - time_tokens = torch.tensor(time_tokens) - visit_tokens = torch.tensor(visit_tokens) - position_tokens = torch.tensor(position_tokens) - labels = torch.tensor(labels) - task_indices = torch.tensor(TASK_TO_INDEX[task]) - - return { - "concept_ids": concept_ids, - "type_ids": type_tokens, - "ages": age_tokens, - "time_stamps": time_tokens, - "visit_orders": position_tokens, - "visit_segments": visit_tokens, - "labels": labels, - "task": task, - "task_indices": task_indices, - } - - def balance_labels(self, task: str, positive_ratio: float) -> None: - """Balance the labels for the specified task in the dataset. - - This function modifies the dataset to ensure that the ratio of positive samples - to the total number of samples matches the specified positive_ratio, - while keeping all positive data points. - - Parameters - ---------- - task : str - The task for which the labels need to be balanced. - positive_ratio : float - The desired positive ratio for the task. - - """ - # Separate positive and negative datapoints - datapoints = self.task_to_index[task] - positives = [data for data in datapoints if data[LABEL_INDEX] == 1] - negatives = [data for data in datapoints if data[LABEL_INDEX] == 0] - - # Calculate the total number of samples needed to achieve the - # desired positive ratio - num_positives = len(positives) - total_needed = int(num_positives / positive_ratio) - num_positives - num_negatives_to_keep = min(len(negatives), total_needed) - - # Randomly select the negatives to keep - negatives_kept = random.sample(negatives, num_negatives_to_keep) - - # Combine the kept negatives with all positives - self.task_to_index[task] = positives + negatives_kept From 1ecbc4eea573e006e1042ea45bff1d8c9e82c1f4 Mon Sep 17 00:00:00 2001 From: afallah Date: Fri, 21 Jun 2024 23:02:56 -0400 Subject: [PATCH 5/9] Add tests for dataset.py and modularize the tests subdirectory. 
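
The tests are plain unittest cases and every test directory ships an __init__.py, so
they can be run with the standard library runner; pytest will also collect them if it
is installed (pytest itself is not added by this patch):

    python -m unittest tests.odyssey.data.test_dataset -v
    python -m pytest tests/odyssey/data/test_dataset.py
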
--- tests/__init__.py | 0 tests/odyssey/__init__.py | 0 tests/odyssey/data/__init__.py | 0 tests/odyssey/data/mimiciv/__init__.py | 0 tests/odyssey/data/test_dataset.py | 192 +++++++++++++++++++++++++ tests/odyssey/utils/__init__.py | 0 6 files changed, 192 insertions(+) create mode 100644 tests/__init__.py create mode 100644 tests/odyssey/__init__.py create mode 100644 tests/odyssey/data/__init__.py create mode 100644 tests/odyssey/data/mimiciv/__init__.py create mode 100644 tests/odyssey/data/test_dataset.py create mode 100644 tests/odyssey/utils/__init__.py diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/odyssey/__init__.py b/tests/odyssey/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/odyssey/data/__init__.py b/tests/odyssey/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/odyssey/data/mimiciv/__init__.py b/tests/odyssey/data/mimiciv/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/odyssey/data/test_dataset.py b/tests/odyssey/data/test_dataset.py new file mode 100644 index 0000000..12e70c9 --- /dev/null +++ b/tests/odyssey/data/test_dataset.py @@ -0,0 +1,192 @@ +"""Test dataset classes.""" + +import unittest +from unittest.mock import MagicMock + +import pandas as pd +import torch + +from odyssey.data.tokenizer import ConceptTokenizer, truncate_and_pad +from odyssey.data.dataset import ( + BaseDataset, + TokenizationMixin, + MaskingMixin, + MultiTaskMixin, + LabelBalanceMixin, + PretrainDataset, + PretrainDatasetDecoder, + FinetuneDataset, + FinetuneMultiDataset, + FinetuneDatasetDecoder, +) + + +class TestDatasets(unittest.TestCase): + def setUp(self): + # Set up mock data and tokenizer + self.data = pd.DataFrame({ + 'event_tokens_2048': [['token1', 'token2'], ['token3', 'token4']], + 'type_tokens_2048': [[1, 2], [3, 4]], + 'age_tokens_2048': [[30, 40], [50, 60]], + 'time_tokens_2048': [[100, 200], [300, 400]], + 'visit_tokens_2048': [[10, 20], [30, 40]], + 'position_tokens_2048': [[1, 1], [2, 2]], + 'elapsed_tokens_2048': [[5, 10], [15, 20]], + 'label': [0, 1], + 'cutoff': [2, 2], + 'label_mortality_1month': [0, 1], + 'label_readmission_1month': [-1, 0] + }) + self.tokenizer = ConceptTokenizer() + self.tokenizer.tokenizer_object = MagicMock() + self.tokenizer.tokenizer = MagicMock() + self.tokenizer.tokenizer.return_value = { + 'input_ids': torch.tensor([[100, 200], [300, 400]]), + 'attention_mask': torch.tensor([[1, 1], [1, 1]]) + } + self.tokenizer.task_to_token = MagicMock(side_effect=lambda x: f"[{x.upper()}]") + self.tokenizer.get_mask_token_id = MagicMock(return_value=103) + self.tokenizer.get_first_token_index = MagicMock(return_value=0) + self.tokenizer.get_last_token_index = MagicMock(return_value=999) + self.tokenizer.token_to_id = MagicMock(return_value=101) + + def test_base_dataset(self): + class DummyDataset(BaseDataset): + def __getitem__(self, idx: int): + return {'dummy_key': 'dummy_value'} + + dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer) + self.assertEqual(len(dataset), len(self.data)) + self.assertEqual(dataset[0], {'dummy_key': 'dummy_value'}) + + def test_tokenization_mixin(self): + class DummyDataset(BaseDataset, TokenizationMixin): + def __getitem__(self, idx: int): + return self.add_additional_tokens(self.data.iloc[idx]) + + dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer) + result = dataset[0] + self.assertIn('type_ids', result) + self.assertIn('ages', result) + 
self.assertIn('time_stamps', result) + self.assertIn('visit_orders', result) + self.assertIn('visit_segments', result) + self.assertEqual(result['type_ids'].size(0), 2) + self.assertEqual(result['ages'].size(0), 2) + self.assertEqual(result['time_stamps'].size(0), 2) + self.assertEqual(result['visit_orders'].size(0), 2) + self.assertEqual(result['visit_segments'].size(0), 2) + + def test_masking_mixin(self): + class DummyDataset(BaseDataset, MaskingMixin): + def __getitem__(self, idx: int): + return self.mask_tokens(torch.tensor([10, 20, 30])) + + dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer) + dataset.mask_prob = 0.15 + masked_sequence, labels = dataset[0] + self.assertEqual(len(masked_sequence), 3) + self.assertEqual(len(labels), 3) + self.assertTrue((masked_sequence <= 103).all()) + self.assertTrue((labels == -100).any()) + + def test_multi_task_mixin(self): + class DummyDataset(BaseDataset, MultiTaskMixin): + def __init__(self, data, tokenizer, tasks): + BaseDataset.__init__(self, data, tokenizer) + MultiTaskMixin.__init__(self, tasks) + self.nan_indicator = -1 + + def __getitem__(self, idx: int): + return self.index_mapper[idx] + + dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 'readmission_1month']) + dataset.prepare_multi_task_data() + self.assertEqual(len(dataset.index_mapper), 3) + + def test_label_balance_mixin(self): + class DummyDataset(BaseDataset, MultiTaskMixin, LabelBalanceMixin): + def __init__(self, data, tokenizer, tasks): + BaseDataset.__init__(self, data, tokenizer) + MultiTaskMixin.__init__(self, tasks) + self.nan_indicator = -1 + + def __getitem__(self, idx: int): + return self.index_mapper[idx] + + dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 'readmission_1month']) + dataset.prepare_multi_task_data() + dataset.balance_labels({'mortality_1month': 0.5}) + task_counts = {task: 0 for task in dataset.tasks} + for i in range(len(dataset)): + task = dataset[i][1] + task_counts[task] += 1 + self.assertEqual(task_counts['mortality_1month'], 2) + self.assertEqual(task_counts['readmission_1month'], 0) # If not provided in balance_guide, task is removed + + def test_pretrain_dataset(self): + dataset = PretrainDataset(data=self.data, tokenizer=self.tokenizer) + tokens = dataset[0] + self.assertIn('concept_ids', tokens) + self.assertIn('labels', tokens) + self.assertIn('attention_mask', tokens) + self.assertEqual(tokens['concept_ids'].size(0), 2) + self.assertEqual(tokens['labels'].size(0), 2) + self.assertEqual(tokens['attention_mask'].size(0), 2) + + def test_pretrain_dataset_decoder(self): + dataset = PretrainDatasetDecoder(data=self.data, tokenizer=self.tokenizer) + tokens = dataset[0] + self.assertIn('concept_ids', tokens) + self.assertIn('labels', tokens) + self.assertEqual(tokens['concept_ids'].size(0), 2) + self.assertEqual(tokens['labels'].size(0), 2) + self.assertIs(tokens['labels'], tokens['concept_ids']) + + def test_finetune_dataset(self): + dataset = FinetuneDataset(data=self.data, tokenizer=self.tokenizer) + tokens = dataset[1] + self.assertIn('concept_ids', tokens) + self.assertIn('labels', tokens) + self.assertIn('attention_mask', tokens) + self.assertEqual(tokens['concept_ids'].size(0), 2) + self.assertEqual(tokens['labels'], torch.tensor(1)) + self.assertEqual(tokens['attention_mask'].size(0), 2) + + def test_finetune_multi_dataset(self): + dataset = FinetuneMultiDataset(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 
'readmission_1month']) + tokens = dataset[0] + self.assertIn('concept_ids', tokens) + self.assertIn('labels', tokens) + self.assertIn('attention_mask', tokens) + self.assertIn('task', tokens) + self.assertEqual(tokens['concept_ids'].size(0), 2) + self.assertEqual(tokens['labels'], torch.tensor(0)) + self.assertEqual(tokens['attention_mask'].size(0), 2) + + def test_finetune_dataset_decoder(self): + dataset = FinetuneDatasetDecoder(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 'readmission_1month']) + tokens = dataset[2] + self.assertIn('concept_ids', tokens) + self.assertIn('labels', tokens) + self.assertIn('task', tokens) + self.assertIn('task_indices', tokens) + self.assertEqual(tokens['concept_ids'].size(0), 2) + self.assertEqual(tokens['labels'], torch.tensor(0)) + + def test_dataset_length(self): + dataset = PretrainDataset(data=self.data, tokenizer=self.tokenizer) + self.assertEqual(len(dataset), 2) + + def test_multitask_balance(self): + dataset = FinetuneMultiDataset(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 'readmission_1month'], balance_guide={'mortality_1month': 0.5}) + task_counts = {task: 0 for task in dataset.tasks} + for i in range(len(dataset)): + task = dataset[i]['task'] + task_counts[task] += 1 + self.assertEqual(task_counts['mortality_1month'], 2) + self.assertEqual(task_counts['readmission_1month'], 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/odyssey/utils/__init__.py b/tests/odyssey/utils/__init__.py new file mode 100644 index 0000000..e69de29 From dee22ae7f3625f4ec27ad059245ec9efe8192898 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 22 Jun 2024 03:03:56 +0000 Subject: [PATCH 6/9] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/odyssey/data/test_dataset.py | 175 ++++++++++++++++------------- 1 file changed, 100 insertions(+), 75 deletions(-) diff --git a/tests/odyssey/data/test_dataset.py b/tests/odyssey/data/test_dataset.py index 12e70c9..be8355f 100644 --- a/tests/odyssey/data/test_dataset.py +++ b/tests/odyssey/data/test_dataset.py @@ -6,43 +6,45 @@ import pandas as pd import torch -from odyssey.data.tokenizer import ConceptTokenizer, truncate_and_pad from odyssey.data.dataset import ( BaseDataset, - TokenizationMixin, + FinetuneDataset, + FinetuneDatasetDecoder, + FinetuneMultiDataset, + LabelBalanceMixin, MaskingMixin, MultiTaskMixin, - LabelBalanceMixin, PretrainDataset, PretrainDatasetDecoder, - FinetuneDataset, - FinetuneMultiDataset, - FinetuneDatasetDecoder, + TokenizationMixin, ) +from odyssey.data.tokenizer import ConceptTokenizer class TestDatasets(unittest.TestCase): def setUp(self): # Set up mock data and tokenizer - self.data = pd.DataFrame({ - 'event_tokens_2048': [['token1', 'token2'], ['token3', 'token4']], - 'type_tokens_2048': [[1, 2], [3, 4]], - 'age_tokens_2048': [[30, 40], [50, 60]], - 'time_tokens_2048': [[100, 200], [300, 400]], - 'visit_tokens_2048': [[10, 20], [30, 40]], - 'position_tokens_2048': [[1, 1], [2, 2]], - 'elapsed_tokens_2048': [[5, 10], [15, 20]], - 'label': [0, 1], - 'cutoff': [2, 2], - 'label_mortality_1month': [0, 1], - 'label_readmission_1month': [-1, 0] - }) + self.data = pd.DataFrame( + { + "event_tokens_2048": [["token1", "token2"], ["token3", "token4"]], + "type_tokens_2048": [[1, 2], [3, 4]], + "age_tokens_2048": [[30, 40], [50, 60]], + "time_tokens_2048": [[100, 200], [300, 400]], + "visit_tokens_2048": 
[[10, 20], [30, 40]], + "position_tokens_2048": [[1, 1], [2, 2]], + "elapsed_tokens_2048": [[5, 10], [15, 20]], + "label": [0, 1], + "cutoff": [2, 2], + "label_mortality_1month": [0, 1], + "label_readmission_1month": [-1, 0], + } + ) self.tokenizer = ConceptTokenizer() self.tokenizer.tokenizer_object = MagicMock() self.tokenizer.tokenizer = MagicMock() self.tokenizer.tokenizer.return_value = { - 'input_ids': torch.tensor([[100, 200], [300, 400]]), - 'attention_mask': torch.tensor([[1, 1], [1, 1]]) + "input_ids": torch.tensor([[100, 200], [300, 400]]), + "attention_mask": torch.tensor([[1, 1], [1, 1]]), } self.tokenizer.task_to_token = MagicMock(side_effect=lambda x: f"[{x.upper()}]") self.tokenizer.get_mask_token_id = MagicMock(return_value=103) @@ -53,11 +55,11 @@ def setUp(self): def test_base_dataset(self): class DummyDataset(BaseDataset): def __getitem__(self, idx: int): - return {'dummy_key': 'dummy_value'} + return {"dummy_key": "dummy_value"} dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer) self.assertEqual(len(dataset), len(self.data)) - self.assertEqual(dataset[0], {'dummy_key': 'dummy_value'}) + self.assertEqual(dataset[0], {"dummy_key": "dummy_value"}) def test_tokenization_mixin(self): class DummyDataset(BaseDataset, TokenizationMixin): @@ -66,16 +68,16 @@ def __getitem__(self, idx: int): dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer) result = dataset[0] - self.assertIn('type_ids', result) - self.assertIn('ages', result) - self.assertIn('time_stamps', result) - self.assertIn('visit_orders', result) - self.assertIn('visit_segments', result) - self.assertEqual(result['type_ids'].size(0), 2) - self.assertEqual(result['ages'].size(0), 2) - self.assertEqual(result['time_stamps'].size(0), 2) - self.assertEqual(result['visit_orders'].size(0), 2) - self.assertEqual(result['visit_segments'].size(0), 2) + self.assertIn("type_ids", result) + self.assertIn("ages", result) + self.assertIn("time_stamps", result) + self.assertIn("visit_orders", result) + self.assertIn("visit_segments", result) + self.assertEqual(result["type_ids"].size(0), 2) + self.assertEqual(result["ages"].size(0), 2) + self.assertEqual(result["time_stamps"].size(0), 2) + self.assertEqual(result["visit_orders"].size(0), 2) + self.assertEqual(result["visit_segments"].size(0), 2) def test_masking_mixin(self): class DummyDataset(BaseDataset, MaskingMixin): @@ -100,7 +102,11 @@ def __init__(self, data, tokenizer, tasks): def __getitem__(self, idx: int): return self.index_mapper[idx] - dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 'readmission_1month']) + dataset = DummyDataset( + data=self.data, + tokenizer=self.tokenizer, + tasks=["mortality_1month", "readmission_1month"], + ) dataset.prepare_multi_task_data() self.assertEqual(len(dataset.index_mapper), 3) @@ -114,79 +120,98 @@ def __init__(self, data, tokenizer, tasks): def __getitem__(self, idx: int): return self.index_mapper[idx] - dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 'readmission_1month']) + dataset = DummyDataset( + data=self.data, + tokenizer=self.tokenizer, + tasks=["mortality_1month", "readmission_1month"], + ) dataset.prepare_multi_task_data() - dataset.balance_labels({'mortality_1month': 0.5}) + dataset.balance_labels({"mortality_1month": 0.5}) task_counts = {task: 0 for task in dataset.tasks} for i in range(len(dataset)): task = dataset[i][1] task_counts[task] += 1 - self.assertEqual(task_counts['mortality_1month'], 2) - 
self.assertEqual(task_counts['readmission_1month'], 0) # If not provided in balance_guide, task is removed + self.assertEqual(task_counts["mortality_1month"], 2) + self.assertEqual( + task_counts["readmission_1month"], 0 + ) # If not provided in balance_guide, task is removed def test_pretrain_dataset(self): dataset = PretrainDataset(data=self.data, tokenizer=self.tokenizer) tokens = dataset[0] - self.assertIn('concept_ids', tokens) - self.assertIn('labels', tokens) - self.assertIn('attention_mask', tokens) - self.assertEqual(tokens['concept_ids'].size(0), 2) - self.assertEqual(tokens['labels'].size(0), 2) - self.assertEqual(tokens['attention_mask'].size(0), 2) + self.assertIn("concept_ids", tokens) + self.assertIn("labels", tokens) + self.assertIn("attention_mask", tokens) + self.assertEqual(tokens["concept_ids"].size(0), 2) + self.assertEqual(tokens["labels"].size(0), 2) + self.assertEqual(tokens["attention_mask"].size(0), 2) def test_pretrain_dataset_decoder(self): dataset = PretrainDatasetDecoder(data=self.data, tokenizer=self.tokenizer) tokens = dataset[0] - self.assertIn('concept_ids', tokens) - self.assertIn('labels', tokens) - self.assertEqual(tokens['concept_ids'].size(0), 2) - self.assertEqual(tokens['labels'].size(0), 2) - self.assertIs(tokens['labels'], tokens['concept_ids']) + self.assertIn("concept_ids", tokens) + self.assertIn("labels", tokens) + self.assertEqual(tokens["concept_ids"].size(0), 2) + self.assertEqual(tokens["labels"].size(0), 2) + self.assertIs(tokens["labels"], tokens["concept_ids"]) def test_finetune_dataset(self): dataset = FinetuneDataset(data=self.data, tokenizer=self.tokenizer) tokens = dataset[1] - self.assertIn('concept_ids', tokens) - self.assertIn('labels', tokens) - self.assertIn('attention_mask', tokens) - self.assertEqual(tokens['concept_ids'].size(0), 2) - self.assertEqual(tokens['labels'], torch.tensor(1)) - self.assertEqual(tokens['attention_mask'].size(0), 2) + self.assertIn("concept_ids", tokens) + self.assertIn("labels", tokens) + self.assertIn("attention_mask", tokens) + self.assertEqual(tokens["concept_ids"].size(0), 2) + self.assertEqual(tokens["labels"], torch.tensor(1)) + self.assertEqual(tokens["attention_mask"].size(0), 2) def test_finetune_multi_dataset(self): - dataset = FinetuneMultiDataset(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 'readmission_1month']) + dataset = FinetuneMultiDataset( + data=self.data, + tokenizer=self.tokenizer, + tasks=["mortality_1month", "readmission_1month"], + ) tokens = dataset[0] - self.assertIn('concept_ids', tokens) - self.assertIn('labels', tokens) - self.assertIn('attention_mask', tokens) - self.assertIn('task', tokens) - self.assertEqual(tokens['concept_ids'].size(0), 2) - self.assertEqual(tokens['labels'], torch.tensor(0)) - self.assertEqual(tokens['attention_mask'].size(0), 2) + self.assertIn("concept_ids", tokens) + self.assertIn("labels", tokens) + self.assertIn("attention_mask", tokens) + self.assertIn("task", tokens) + self.assertEqual(tokens["concept_ids"].size(0), 2) + self.assertEqual(tokens["labels"], torch.tensor(0)) + self.assertEqual(tokens["attention_mask"].size(0), 2) def test_finetune_dataset_decoder(self): - dataset = FinetuneDatasetDecoder(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 'readmission_1month']) + dataset = FinetuneDatasetDecoder( + data=self.data, + tokenizer=self.tokenizer, + tasks=["mortality_1month", "readmission_1month"], + ) tokens = dataset[2] - self.assertIn('concept_ids', tokens) - self.assertIn('labels', 
tokens) - self.assertIn('task', tokens) - self.assertIn('task_indices', tokens) - self.assertEqual(tokens['concept_ids'].size(0), 2) - self.assertEqual(tokens['labels'], torch.tensor(0)) + self.assertIn("concept_ids", tokens) + self.assertIn("labels", tokens) + self.assertIn("task", tokens) + self.assertIn("task_indices", tokens) + self.assertEqual(tokens["concept_ids"].size(0), 2) + self.assertEqual(tokens["labels"], torch.tensor(0)) def test_dataset_length(self): dataset = PretrainDataset(data=self.data, tokenizer=self.tokenizer) self.assertEqual(len(dataset), 2) def test_multitask_balance(self): - dataset = FinetuneMultiDataset(data=self.data, tokenizer=self.tokenizer, tasks=['mortality_1month', 'readmission_1month'], balance_guide={'mortality_1month': 0.5}) + dataset = FinetuneMultiDataset( + data=self.data, + tokenizer=self.tokenizer, + tasks=["mortality_1month", "readmission_1month"], + balance_guide={"mortality_1month": 0.5}, + ) task_counts = {task: 0 for task in dataset.tasks} for i in range(len(dataset)): - task = dataset[i]['task'] + task = dataset[i]["task"] task_counts[task] += 1 - self.assertEqual(task_counts['mortality_1month'], 2) - self.assertEqual(task_counts['readmission_1month'], 1) + self.assertEqual(task_counts["mortality_1month"], 2) + self.assertEqual(task_counts["readmission_1month"], 1) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() From 3c3fe5c81f8dca20656ae7057ec74b3e72b491e5 Mon Sep 17 00:00:00 2001 From: afallah Date: Fri, 21 Jun 2024 23:28:48 -0400 Subject: [PATCH 7/9] Fix bug with one test case about balance guide. --- tests/odyssey/data/test_dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/odyssey/data/test_dataset.py b/tests/odyssey/data/test_dataset.py index be8355f..0c92885 100644 --- a/tests/odyssey/data/test_dataset.py +++ b/tests/odyssey/data/test_dataset.py @@ -116,6 +116,9 @@ def __init__(self, data, tokenizer, tasks): BaseDataset.__init__(self, data, tokenizer) MultiTaskMixin.__init__(self, tasks) self.nan_indicator = -1 + + def __len__(self) -> int: + return len(self.index_mapper) def __getitem__(self, idx: int): return self.index_mapper[idx] @@ -133,8 +136,8 @@ def __getitem__(self, idx: int): task_counts[task] += 1 self.assertEqual(task_counts["mortality_1month"], 2) self.assertEqual( - task_counts["readmission_1month"], 0 - ) # If not provided in balance_guide, task is removed + task_counts["readmission_1month"], 1 + ) def test_pretrain_dataset(self): dataset = PretrainDataset(data=self.data, tokenizer=self.tokenizer) From 725b962e9c2a0603adaff3c0362d76b3dbe1dd93 Mon Sep 17 00:00:00 2001 From: afallah Date: Fri, 21 Jun 2024 23:30:54 -0400 Subject: [PATCH 8/9] Document test code. 
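The convention applied throughout is a one-line docstring plus explicit type
annotations on every test method and nested helper class. An illustrative
sketch of that style (the class and method names below are hypothetical and
not taken from the diff):

    import unittest
    from typing import Dict

    import torch


    class ExampleStyle(unittest.TestCase):
        def test_returns_tensor_dict(self) -> None:
            """Test that a helper returns a dictionary of tensors."""
            # Annotated local plus standard unittest assertions, mirroring the
            # documentation style applied to the real tests in this patch.
            result: Dict[str, torch.Tensor] = {"type_ids": torch.tensor([1, 2])}
            self.assertIn("type_ids", result)
            self.assertEqual(result["type_ids"].size(0), 2)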
--- tests/odyssey/data/test_dataset.py | 59 ++++++++++++++++++------------ 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/tests/odyssey/data/test_dataset.py b/tests/odyssey/data/test_dataset.py index 0c92885..e8bc9d9 100644 --- a/tests/odyssey/data/test_dataset.py +++ b/tests/odyssey/data/test_dataset.py @@ -2,6 +2,7 @@ import unittest from unittest.mock import MagicMock +from typing import Dict, List, Tuple, Optional import pandas as pd import torch @@ -22,8 +23,8 @@ class TestDatasets(unittest.TestCase): - def setUp(self): - # Set up mock data and tokenizer + def setUp(self) -> None: + """Set up mock data and tokenizer for testing.""" self.data = pd.DataFrame( { "event_tokens_2048": [["token1", "token2"], ["token3", "token4"]], @@ -52,18 +53,20 @@ def setUp(self): self.tokenizer.get_last_token_index = MagicMock(return_value=999) self.tokenizer.token_to_id = MagicMock(return_value=101) - def test_base_dataset(self): + def test_base_dataset(self) -> None: + """Test the BaseDataset class.""" class DummyDataset(BaseDataset): - def __getitem__(self, idx: int): + def __getitem__(self, idx: int) -> Dict[str, str]: return {"dummy_key": "dummy_value"} dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer) self.assertEqual(len(dataset), len(self.data)) self.assertEqual(dataset[0], {"dummy_key": "dummy_value"}) - def test_tokenization_mixin(self): + def test_tokenization_mixin(self) -> None: + """Test the TokenizationMixin class.""" class DummyDataset(BaseDataset, TokenizationMixin): - def __getitem__(self, idx: int): + def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: return self.add_additional_tokens(self.data.iloc[idx]) dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer) @@ -79,9 +82,10 @@ def __getitem__(self, idx: int): self.assertEqual(result["visit_orders"].size(0), 2) self.assertEqual(result["visit_segments"].size(0), 2) - def test_masking_mixin(self): + def test_masking_mixin(self) -> None: + """Test the MaskingMixin class.""" class DummyDataset(BaseDataset, MaskingMixin): - def __getitem__(self, idx: int): + def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: return self.mask_tokens(torch.tensor([10, 20, 30])) dataset = DummyDataset(data=self.data, tokenizer=self.tokenizer) @@ -92,14 +96,15 @@ def __getitem__(self, idx: int): self.assertTrue((masked_sequence <= 103).all()) self.assertTrue((labels == -100).any()) - def test_multi_task_mixin(self): + def test_multi_task_mixin(self) -> None: + """Test the MultiTaskMixin class.""" class DummyDataset(BaseDataset, MultiTaskMixin): - def __init__(self, data, tokenizer, tasks): + def __init__(self, data: pd.DataFrame, tokenizer: ConceptTokenizer, tasks: List[str]) -> None: BaseDataset.__init__(self, data, tokenizer) MultiTaskMixin.__init__(self, tasks) self.nan_indicator = -1 - def __getitem__(self, idx: int): + def __getitem__(self, idx: int) -> Tuple[int, str, int, Optional[int]]: return self.index_mapper[idx] dataset = DummyDataset( @@ -110,9 +115,10 @@ def __getitem__(self, idx: int): dataset.prepare_multi_task_data() self.assertEqual(len(dataset.index_mapper), 3) - def test_label_balance_mixin(self): + def test_label_balance_mixin(self) -> None: + """Test the LabelBalanceMixin class.""" class DummyDataset(BaseDataset, MultiTaskMixin, LabelBalanceMixin): - def __init__(self, data, tokenizer, tasks): + def __init__(self, data: pd.DataFrame, tokenizer: ConceptTokenizer, tasks: List[str]) -> None: BaseDataset.__init__(self, data, tokenizer) MultiTaskMixin.__init__(self, tasks) 
self.nan_indicator = -1 @@ -120,7 +126,7 @@ def __init__(self, data, tokenizer, tasks): def __len__(self) -> int: return len(self.index_mapper) - def __getitem__(self, idx: int): + def __getitem__(self, idx: int) -> Tuple[int, str, int, Optional[int]]: return self.index_mapper[idx] dataset = DummyDataset( @@ -135,11 +141,10 @@ def __getitem__(self, idx: int): task = dataset[i][1] task_counts[task] += 1 self.assertEqual(task_counts["mortality_1month"], 2) - self.assertEqual( - task_counts["readmission_1month"], 1 - ) + self.assertEqual(task_counts["readmission_1month"], 1) - def test_pretrain_dataset(self): + def test_pretrain_dataset(self) -> None: + """Test the PretrainDataset class.""" dataset = PretrainDataset(data=self.data, tokenizer=self.tokenizer) tokens = dataset[0] self.assertIn("concept_ids", tokens) @@ -149,7 +154,8 @@ def test_pretrain_dataset(self): self.assertEqual(tokens["labels"].size(0), 2) self.assertEqual(tokens["attention_mask"].size(0), 2) - def test_pretrain_dataset_decoder(self): + def test_pretrain_dataset_decoder(self) -> None: + """Test the PretrainDatasetDecoder class.""" dataset = PretrainDatasetDecoder(data=self.data, tokenizer=self.tokenizer) tokens = dataset[0] self.assertIn("concept_ids", tokens) @@ -158,7 +164,8 @@ def test_pretrain_dataset_decoder(self): self.assertEqual(tokens["labels"].size(0), 2) self.assertIs(tokens["labels"], tokens["concept_ids"]) - def test_finetune_dataset(self): + def test_finetune_dataset(self) -> None: + """Test the FinetuneDataset class.""" dataset = FinetuneDataset(data=self.data, tokenizer=self.tokenizer) tokens = dataset[1] self.assertIn("concept_ids", tokens) @@ -168,7 +175,8 @@ def test_finetune_dataset(self): self.assertEqual(tokens["labels"], torch.tensor(1)) self.assertEqual(tokens["attention_mask"].size(0), 2) - def test_finetune_multi_dataset(self): + def test_finetune_multi_dataset(self) -> None: + """Test the FinetuneMultiDataset class.""" dataset = FinetuneMultiDataset( data=self.data, tokenizer=self.tokenizer, @@ -183,7 +191,8 @@ def test_finetune_multi_dataset(self): self.assertEqual(tokens["labels"], torch.tensor(0)) self.assertEqual(tokens["attention_mask"].size(0), 2) - def test_finetune_dataset_decoder(self): + def test_finetune_dataset_decoder(self) -> None: + """Test the FinetuneDatasetDecoder class.""" dataset = FinetuneDatasetDecoder( data=self.data, tokenizer=self.tokenizer, @@ -197,11 +206,13 @@ def test_finetune_dataset_decoder(self): self.assertEqual(tokens["concept_ids"].size(0), 2) self.assertEqual(tokens["labels"], torch.tensor(0)) - def test_dataset_length(self): + def test_dataset_length(self) -> None: + """Test the length of the dataset.""" dataset = PretrainDataset(data=self.data, tokenizer=self.tokenizer) self.assertEqual(len(dataset), 2) - def test_multitask_balance(self): + def test_multitask_balance(self) -> None: + """Test the balancing of tasks in the dataset.""" dataset = FinetuneMultiDataset( data=self.data, tokenizer=self.tokenizer, From 3787a7d8187713bd4bea378cec329910e5e48e99 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 22 Jun 2024 03:31:06 +0000 Subject: [PATCH 9/9] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/odyssey/data/test_dataset.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/odyssey/data/test_dataset.py b/tests/odyssey/data/test_dataset.py index e8bc9d9..184651b 100644 --- 
a/tests/odyssey/data/test_dataset.py +++ b/tests/odyssey/data/test_dataset.py @@ -1,8 +1,8 @@ """Test dataset classes.""" import unittest +from typing import Dict, List, Optional, Tuple from unittest.mock import MagicMock -from typing import Dict, List, Tuple, Optional import pandas as pd import torch @@ -55,6 +55,7 @@ def setUp(self) -> None: def test_base_dataset(self) -> None: """Test the BaseDataset class.""" + class DummyDataset(BaseDataset): def __getitem__(self, idx: int) -> Dict[str, str]: return {"dummy_key": "dummy_value"} @@ -65,6 +66,7 @@ def __getitem__(self, idx: int) -> Dict[str, str]: def test_tokenization_mixin(self) -> None: """Test the TokenizationMixin class.""" + class DummyDataset(BaseDataset, TokenizationMixin): def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: return self.add_additional_tokens(self.data.iloc[idx]) @@ -84,6 +86,7 @@ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]: def test_masking_mixin(self) -> None: """Test the MaskingMixin class.""" + class DummyDataset(BaseDataset, MaskingMixin): def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: return self.mask_tokens(torch.tensor([10, 20, 30])) @@ -98,8 +101,11 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]: def test_multi_task_mixin(self) -> None: """Test the MultiTaskMixin class.""" + class DummyDataset(BaseDataset, MultiTaskMixin): - def __init__(self, data: pd.DataFrame, tokenizer: ConceptTokenizer, tasks: List[str]) -> None: + def __init__( + self, data: pd.DataFrame, tokenizer: ConceptTokenizer, tasks: List[str] + ) -> None: BaseDataset.__init__(self, data, tokenizer) MultiTaskMixin.__init__(self, tasks) self.nan_indicator = -1 @@ -117,12 +123,15 @@ def __getitem__(self, idx: int) -> Tuple[int, str, int, Optional[int]]: def test_label_balance_mixin(self) -> None: """Test the LabelBalanceMixin class.""" + class DummyDataset(BaseDataset, MultiTaskMixin, LabelBalanceMixin): - def __init__(self, data: pd.DataFrame, tokenizer: ConceptTokenizer, tasks: List[str]) -> None: + def __init__( + self, data: pd.DataFrame, tokenizer: ConceptTokenizer, tasks: List[str] + ) -> None: BaseDataset.__init__(self, data, tokenizer) MultiTaskMixin.__init__(self, tasks) self.nan_indicator = -1 - + def __len__(self) -> int: return len(self.index_mapper)