From d075a443d5a1734877e7ebf6a5e49c8d4797eb39 Mon Sep 17 00:00:00 2001 From: Antonio Carta Date: Wed, 14 Feb 2024 11:24:04 +0100 Subject: [PATCH 1/2] fix issue #1597 --- avalanche/benchmarks/scenarios/__init__.py | 1 + .../benchmarks/scenarios/dataset_scenario.py | 86 ------------- avalanche/benchmarks/scenarios/supervised.py | 12 +- .../scenarios/validation_scenario.py | 116 ++++++++++++++++++ .../scenarios/test_dataset_scenario.py | 10 +- tests/training/test_plugins.py | 2 + 6 files changed, 135 insertions(+), 92 deletions(-) create mode 100644 avalanche/benchmarks/scenarios/validation_scenario.py diff --git a/avalanche/benchmarks/scenarios/__init__.py b/avalanche/benchmarks/scenarios/__init__.py index f84a965c2..83db82055 100644 --- a/avalanche/benchmarks/scenarios/__init__.py +++ b/avalanche/benchmarks/scenarios/__init__.py @@ -8,3 +8,4 @@ from .dataset_scenario import * from .exmodel_scenario import * from .online import * +from .validation_scenario import * diff --git a/avalanche/benchmarks/scenarios/dataset_scenario.py b/avalanche/benchmarks/scenarios/dataset_scenario.py index df0848ef0..cf5f7afa3 100644 --- a/avalanche/benchmarks/scenarios/dataset_scenario.py +++ b/avalanche/benchmarks/scenarios/dataset_scenario.py @@ -14,7 +14,6 @@ import random from avalanche.benchmarks.utils.data import AvalancheDataset import torch -from itertools import tee from typing import ( Callable, Generator, @@ -253,94 +252,9 @@ def __iter__( yield self.split_strategy(new_experience.dataset) -def benchmark_with_validation_stream( - benchmark: CLScenario, - validation_size: Union[int, float] = 0.5, - shuffle: bool = False, - seed: Optional[int] = None, - split_strategy: Optional[ - Callable[[AvalancheDataset], Tuple[AvalancheDataset, AvalancheDataset]] - ] = None, -) -> CLScenario: - """Helper to obtain a benchmark with a validation stream. - - This generator accepts an existing benchmark instance and returns a version - of it in which the train stream has been split into training and validation - streams. - - Each train/validation experience will be by splitting the original training - experiences. Patterns selected for the validation experience will be removed - from the training experiences. - - The default splitting strategy is a random split as implemented by `split_validation_random`. - If you want to use class balancing you can use `split_validation_class_balanced`, or - use a custom `split_strategy`, as shown in the following example:: - - validation_size = 0.2 - foo = lambda exp: split_dataset_class_balanced(validation_size, exp) - bm = benchmark_with_validation_stream(bm, custom_split_strategy=foo) - - :param benchmark: The benchmark to split. - :param validation_size: The size of the validation experience, as an int - or a float between 0 and 1. Ignored if `custom_split_strategy` is used. - :param shuffle: If True, patterns will be allocated to the validation - stream randomly. This will use the default PyTorch random number - generator at its current state. Defaults to False. Ignored if - `custom_split_strategy` is used. If False, the first instances will be - allocated to the training dataset by leaving the last ones to the - validation dataset. - :param split_strategy: A function that implements a custom splitting - strategy. The function must accept an AvalancheDataset and return a tuple - containing the new train and validation dataset. By default, the splitting - strategy will split the data according to `validation_size` and `shuffle`). 
- A good starting to understand the mechanism is to look at the - implementation of the standard splitting function - :func:`random_validation_split_strategy`. - - :return: A benchmark instance in which the validation stream has been added. - """ - - if split_strategy is None: - if seed is None: - seed = random.randint(0, 1000000) - - # functools.partial is a more compact option - # However, MyPy does not understand what a partial is -_- - def random_validation_split_strategy_wrapper(data): - return split_validation_random(validation_size, shuffle, seed, data) - - split_strategy = random_validation_split_strategy_wrapper - else: - split_strategy = split_strategy - - stream = benchmark.streams["train"] - if isinstance(stream, EagerCLStream): # eager split - train_exps, valid_exps = [], [] - - exp: DatasetExperience - for exp in stream: - train_data, valid_data = split_strategy(exp.dataset) - train_exps.append(DatasetExperience(dataset=train_data)) - valid_exps.append(DatasetExperience(dataset=valid_data)) - else: # Lazy splitting (based on a generator) - split_generator = LazyTrainValSplitter(split_strategy, stream) - train_exps = (DatasetExperience(dataset=a) for a, _ in split_generator) - valid_exps = (DatasetExperience(dataset=b) for _, b in split_generator) - - train_stream = make_stream(name="train", exps=train_exps) - valid_stream = make_stream(name="valid", exps=valid_exps) - other_streams = benchmark.streams - - del other_streams["train"] - return CLScenario( - streams=[train_stream, valid_stream] + list(other_streams.values()) - ) - - __all__ = [ "_split_dataset_by_attribute", "benchmark_from_datasets", "DatasetExperience", "split_validation_random", - "benchmark_with_validation_stream", ] diff --git a/avalanche/benchmarks/scenarios/supervised.py b/avalanche/benchmarks/scenarios/supervised.py index 66c0ecc8a..830b37c02 100644 --- a/avalanche/benchmarks/scenarios/supervised.py +++ b/avalanche/benchmarks/scenarios/supervised.py @@ -31,7 +31,7 @@ from avalanche.benchmarks.utils.data import AvalancheDataset from avalanche.benchmarks.utils.data_attribute import DataAttribute from .dataset_scenario import _split_dataset_by_attribute, DatasetExperience -from .. import CLScenario, CLStream, EagerCLStream +from .generic_scenario import CLScenario, CLStream, EagerCLStream def class_incremental_benchmark( @@ -399,12 +399,14 @@ def _decorate_stream(obj: CLStream): new_exp = copy(exp) curr_cls = exp.dataset.targets.uniques - new_exp.classes_in_this_experience = curr_cls - new_exp.previous_classes = set(prev_cls) - new_exp.classes_seen_so_far = curr_cls.union(prev_cls) + new_exp.classes_in_this_experience = list(curr_cls) + new_exp.previous_classes = list(set(prev_cls)) + new_exp.classes_seen_so_far = list(curr_cls.union(prev_cls)) # TODO: future_classes ignores repetitions right now... 
        # implement and test scenario with repetitions
-        new_exp.future_classes = all_cls.difference(new_exp.classes_seen_so_far)
+        new_exp.future_classes = list(
+            all_cls.difference(new_exp.classes_seen_so_far)
+        )
 
         new_stream.append(new_exp)
         prev_cls = prev_cls.union(curr_cls)
 
diff --git a/avalanche/benchmarks/scenarios/validation_scenario.py b/avalanche/benchmarks/scenarios/validation_scenario.py
new file mode 100644
index 000000000..b3cb08eb3
--- /dev/null
+++ b/avalanche/benchmarks/scenarios/validation_scenario.py
@@ -0,0 +1,116 @@
+from typing import (
+    Callable,
+    Generator,
+    Generic,
+    List,
+    Sequence,
+    TypeVar,
+    Union,
+    Tuple,
+    Optional,
+    Iterable,
+    Dict,
+)
+
+import random
+from avalanche.benchmarks.utils.data import AvalancheDataset
+from .generic_scenario import EagerCLStream, CLScenario, CLExperience, make_stream
+from .dataset_scenario import (
+    LazyTrainValSplitter,
+    DatasetExperience,
+    split_validation_random,
+)
+from .supervised import with_classes_timeline
+
+
+def benchmark_with_validation_stream(
+    benchmark: CLScenario,
+    validation_size: Union[int, float] = 0.5,
+    shuffle: bool = False,
+    seed: Optional[int] = None,
+    split_strategy: Optional[
+        Callable[[AvalancheDataset], Tuple[AvalancheDataset, AvalancheDataset]]
+    ] = None,
+) -> CLScenario:
+    """Helper to obtain a benchmark with a validation stream.
+
+    This generator accepts an existing benchmark instance and returns a version
+    of it in which the train stream has been split into training and validation
+    streams.
+
+    Each train/validation experience will be created by splitting the original
+    training experiences. Patterns selected for the validation experience will
+    be removed from the training experiences.
+
+    The default splitting strategy is a random split as implemented by
+    `split_validation_random`. For class balancing, use
+    `split_validation_class_balanced` or a custom `split_strategy`::
+
+        validation_size = 0.2
+        foo = lambda exp: split_validation_class_balanced(validation_size, exp)
+        bm = benchmark_with_validation_stream(bm, split_strategy=foo)
+
+    :param benchmark: The benchmark to split.
+    :param validation_size: The size of the validation experience, as an int
+        or a float between 0 and 1. Ignored if `split_strategy` is used.
+    :param shuffle: If True, patterns will be allocated to the validation
+        stream randomly. This will use the default PyTorch random number
+        generator at its current state. Defaults to False. Ignored if
+        `split_strategy` is used. If False, the first instances will be
+        allocated to the training dataset, leaving the last ones to the
+        validation dataset.
+    :param split_strategy: A function that implements a custom splitting
+        strategy. The function must accept an AvalancheDataset and return a
+        tuple containing the new train and validation datasets. By default,
+        the data is split according to `validation_size` and `shuffle`.
+        A good starting point for understanding the mechanism is the
+        implementation of the standard splitting function
+        :func:`split_validation_random`.
+
+    :return: A benchmark instance in which the validation stream has been added.
+    """
+
+    if split_strategy is None:
+        if seed is None:
+            seed = random.randint(0, 1000000)
+
+        # functools.partial is a more compact option
+        # However, MyPy does not understand what a partial is -_-
+        def random_validation_split_strategy_wrapper(data):
+            return split_validation_random(validation_size, shuffle, seed, data)
+
+        split_strategy = random_validation_split_strategy_wrapper
+    else:
+        split_strategy = split_strategy
+
+    stream = benchmark.streams["train"]
+    if isinstance(stream, EagerCLStream):  # eager split
+        train_exps, valid_exps = [], []
+
+        exp: DatasetExperience
+        for exp in stream:
+            train_data, valid_data = split_strategy(exp.dataset)
+            train_exps.append(DatasetExperience(dataset=train_data))
+            valid_exps.append(DatasetExperience(dataset=valid_data))
+    else:  # Lazy splitting (based on a generator)
+        split_generator = LazyTrainValSplitter(split_strategy, stream)
+        train_exps = (DatasetExperience(dataset=a) for a, _ in split_generator)
+        valid_exps = (DatasetExperience(dataset=b) for _, b in split_generator)
+
+    train_stream = make_stream(name="train", exps=train_exps)
+    valid_stream = make_stream(name="valid", exps=valid_exps)
+    other_streams = benchmark.streams
+
+    # re-add the classes-timeline attributes for compatibility with the old API
+    e0 = next(iter(train_stream))
+    if hasattr(e0, "dataset") and hasattr(e0.dataset, "targets"):
+        train_stream = with_classes_timeline(train_stream)
+        valid_stream = with_classes_timeline(valid_stream)
+
+    del other_streams["train"]
+    return CLScenario(
+        streams=[train_stream, valid_stream] + list(other_streams.values())
+    )
+
+
+__all__ = ["benchmark_with_validation_stream"]
diff --git a/tests/benchmarks/scenarios/test_dataset_scenario.py b/tests/benchmarks/scenarios/test_dataset_scenario.py
index d5c11eabd..dbb86ab77 100644
--- a/tests/benchmarks/scenarios/test_dataset_scenario.py
+++ b/tests/benchmarks/scenarios/test_dataset_scenario.py
@@ -6,12 +6,14 @@
 
 from avalanche.benchmarks import (
     benchmark_from_datasets,
-    benchmark_with_validation_stream,
     CLScenario,
     CLStream,
     split_validation_random,
     task_incremental_benchmark,
 )
+from avalanche.benchmarks.scenarios.validation_scenario import (
+    benchmark_with_validation_stream,
+)
 from avalanche.benchmarks.scenarios.dataset_scenario import (
     DatasetExperience,
     split_validation_class_balanced,
@@ -383,3 +385,9 @@ def test_gen():
             mb = get_mbatch(dd, len(dd))
             self.assertTrue(torch.equal(test_x, mb[0]))
             self.assertTrue(torch.equal(test_y, mb[1]))
+
+    def test_regression_1597(self):
+        # regression test for issue #1597
+        bm = get_fast_benchmark()
+        for exp in bm.train_stream:
+            assert hasattr(exp, "classes_in_this_experience")
diff --git a/tests/training/test_plugins.py b/tests/training/test_plugins.py
index 6c223c73e..9ba6dda98 100644
--- a/tests/training/test_plugins.py
+++ b/tests/training/test_plugins.py
@@ -16,6 +16,8 @@
 from avalanche.benchmarks import (
     nc_benchmark,
     GenericCLScenario,
+)
+from avalanche.benchmarks.scenarios.validation_scenario import (
     benchmark_with_validation_stream,
 )
 from avalanche.benchmarks.utils.data_loader import TaskBalancedDataLoader

From ec3f3c6b27884065f25212aa79da46a13fd986fe Mon Sep 17 00:00:00 2001
From: Antonio Carta
Date: Wed, 14 Feb 2024 11:34:34 +0100
Subject: [PATCH 2/2] revert classes timeline values to sets

---
 avalanche/benchmarks/scenarios/supervised.py | 11 ++++-------
 avalanche/models/dynamic_modules.py          |  2 +-
 2 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/avalanche/benchmarks/scenarios/supervised.py
b/avalanche/benchmarks/scenarios/supervised.py index 830b37c02..8e46446db 100644 --- a/avalanche/benchmarks/scenarios/supervised.py +++ b/avalanche/benchmarks/scenarios/supervised.py @@ -26,7 +26,6 @@ from avalanche.benchmarks.utils.classification_dataset import ( ClassificationDataset, _as_taskaware_supervised_classification_dataset, - TaskAwareSupervisedClassificationDataset, ) from avalanche.benchmarks.utils.data import AvalancheDataset from avalanche.benchmarks.utils.data_attribute import DataAttribute @@ -399,14 +398,12 @@ def _decorate_stream(obj: CLStream): new_exp = copy(exp) curr_cls = exp.dataset.targets.uniques - new_exp.classes_in_this_experience = list(curr_cls) - new_exp.previous_classes = list(set(prev_cls)) - new_exp.classes_seen_so_far = list(curr_cls.union(prev_cls)) + new_exp.classes_in_this_experience = curr_cls + new_exp.previous_classes = set(prev_cls) + new_exp.classes_seen_so_far = curr_cls.union(prev_cls) # TODO: future_classes ignores repetitions right now... # implement and test scenario with repetitions - new_exp.future_classes = list( - all_cls.difference(new_exp.classes_seen_so_far) - ) + new_exp.future_classes = all_cls.difference(new_exp.classes_seen_so_far) new_stream.append(new_exp) prev_cls = prev_cls.union(curr_cls) diff --git a/avalanche/models/dynamic_modules.py b/avalanche/models/dynamic_modules.py index 1cf580983..3086681fe 100644 --- a/avalanche/models/dynamic_modules.py +++ b/avalanche/models/dynamic_modules.py @@ -246,7 +246,7 @@ def adaptation(self, experience: CLExperience): self.active_units[: old_act_units.shape[0]] = old_act_units # update with new active classes if self.training: - self.active_units[curr_classes] = 1 + self.active_units[list(curr_classes)] = 1 # update classifier weights if old_nclasses == new_nclasses:
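
Note for reviewers: below is a minimal usage sketch of the helper at its new
location, not part of the patch itself. It assumes `train_ds` and `test_ds`
are pre-built AvalancheDataset instances (illustrative names only); the import
path and stream names ("train"/"valid") come from the changes above.

    # Minimal sketch (assumed setup): split an existing benchmark's train
    # stream into train/valid streams using the relocated helper.
    from avalanche.benchmarks import benchmark_from_datasets
    from avalanche.benchmarks.scenarios.validation_scenario import (
        benchmark_with_validation_stream,
    )

    # `train_ds` / `test_ds` are hypothetical AvalancheDatasets built elsewhere.
    bm = benchmark_from_datasets(train=[train_ds], test=[test_ds])
    bm = benchmark_with_validation_stream(bm, validation_size=0.2, shuffle=True)

    # When the datasets expose `targets`, the classes-timeline attributes are
    # re-attached to both new streams (the regression covered by issue #1597).
    for exp in bm.streams["train"]:
        assert hasattr(exp, "classes_in_this_experience")

    # Each "valid" experience holds the patterns removed from the
    # corresponding train experience (20% per experience in this example).
    for exp in bm.streams["valid"]:
        print(len(exp.dataset))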