Add early stop module #301

Merged Jan 25, 2025 · 31 commits · Changes from 22 commits

Commits
d0c454f  add early stopper (sanaAyrml, Dec 5, 2024)
8938bf5  Merge branch 'main' into sa_early_stop (sanaAyrml, Dec 5, 2024)
6136a90  test smoke tests (sanaAyrml, Dec 5, 2024)
6f20445  Separate early_stopper and snapshotter (sanaAyrml, Dec 5, 2024)
52546e9  Temporary commit (sanaAyrml, Jan 2, 2025)
fe610db  Merge branch 'sa_early_stop' of https://github.com/VectorInstitute/FL… (sanaAyrml, Jan 2, 2025)
92fe751  Firx extra early stopper implementation (sanaAyrml, Jan 2, 2025)
c73d04c  Merge branch 'main' into sa_early_stop (sanaAyrml, Jan 2, 2025)
fe57eea  add seriazable snapshotter tests (sanaAyrml, Jan 2, 2025)
2d939b3  [pre-commit.ci] Add auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 2, 2025)
5627c72  Merge branch 'main' into sa_early_stop (sanaAyrml, Jan 8, 2025)
311de1c  add metric manager snappshotter test (sanaAyrml, Jan 8, 2025)
0471905  Merge branch 'main' into sa_early_stop (sanaAyrml, Jan 8, 2025)
27bf361  Resolve conflict with main (sanaAyrml, Jan 8, 2025)
3142889  Merge branch 'main' into sa_early_stop (sanaAyrml, Jan 8, 2025)
f4ad580  update precommit type changes (sanaAyrml, Jan 8, 2025)
9d09429  Merge branch 'main' into sa_early_stop (sanaAyrml, Jan 8, 2025)
00320fc  add snappshotter other tests (sanaAyrml, Jan 9, 2025)
5c48474  Add docstrings (sanaAyrml, Jan 9, 2025)
bf2504d  Add docstring (sanaAyrml, Jan 9, 2025)
476343e  Update doc strings (sanaAyrml, Jan 9, 2025)
dd3e964  Ignoring a vulnerability without a fix yet (emersodb, Jan 9, 2025)
080fd90  Address review comments (sanaAyrml, Jan 23, 2025)
a9fe020  Merge branch 'main' into sa_early_stop (sanaAyrml, Jan 23, 2025)
0a4d6b8  Update memory issues (sanaAyrml, Jan 23, 2025)
fda2946  Add str to logging mode (sanaAyrml, Jan 23, 2025)
14da12d  Address some comments (sanaAyrml, Jan 24, 2025)
a1aecfd  [pre-commit.ci] Add auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Jan 24, 2025)
a0a8128  Bring some checks into should_stop function of early_stopper (sanaAyrml, Jan 24, 2025)
da842cd  Merge branch 'main' into sa_early_stop (emersodb, Jan 24, 2025)
171fcc1  Small typo fix (emersodb, Jan 24, 2025)
2 changes: 2 additions & 0 deletions .github/workflows/static_code_checks.yaml
@@ -43,7 +43,9 @@ jobs:
virtual-environment: .venv/
# Ignoring vulnerability in cryptography
# Fix is 43.0.1 but flwr 1.9 depends on < 43
# GHSA-cjgq-5qmw-rcj6 is a Keras vulnerability that has no fix yet
ignore-vulns: |
GHSA-h4gh-qq45-vh27
GHSA-q34m-jh98-gwm2
GHSA-f9vj-2wh5-fj8j
GHSA-cjgq-5qmw-rcj6
38 changes: 34 additions & 4 deletions fl4health/clients/basic_client.py
@@ -11,7 +11,7 @@
from flwr.common.typing import Config, NDArrays, Scalar
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
-from torch.optim.lr_scheduler import _LRScheduler
+from torch.optim.lr_scheduler import LRScheduler
from torch.utils.data import DataLoader

from fl4health.checkpointing.client_module import CheckpointMode, ClientCheckpointAndStateModule
@@ -159,6 +159,9 @@ def get_parameters(self, config: Config) -> NDArrays:
            # Need all parameters even if normally exchanging partial
            return FullParameterExchanger().push_parameters(self.model, config=config)
        else:
            if hasattr(self, "early_stopper") and self.early_stopper.patience == 0:
                log(INFO, "Loading saved best model's state before sending model to server.")
                self.early_stopper.load_snapshot(["model"])
            assert self.model is not None and self.parameter_exchanger is not None
            return self.parameter_exchanger.push_parameters(self.model, config=config)

@@ -641,6 +644,10 @@ def train_by_epochs(
                self.reports_manager.report(report_data, current_round, self.total_epochs, self.total_steps)
                self.total_steps += 1
                steps_this_round += 1
                if hasattr(self, "early_stopper"):
                    if self.total_steps % self.early_stopper.interval_steps == 0 and self.early_stopper.should_stop():
                        log(INFO, "Early stopping criterion met. Stopping training.")
                        break

            # Log and report results
            metrics = self.train_metric_manager.compute()
@@ -709,6 +716,10 @@ def train_by_steps(
            report_data.update(self.get_client_specific_reports())
            self.reports_manager.report(report_data, current_round, None, self.total_steps)
            self.total_steps += 1
            if hasattr(self, "early_stopper"):
                if self.total_steps % self.early_stopper.interval_steps == 0 and self.early_stopper.should_stop():
                    log(INFO, "Early stopping criterion met. Stopping training.")
                    break

        loss_dict = self.train_loss_meter.compute().as_dict()
        metrics = self.train_metric_manager.compute()
@@ -879,9 +890,28 @@ def setup_client(self, config: Config) -> None:
        self.parameter_exchanger = self.get_parameter_exchanger(config)

        self.reports_manager.report({"host_type": "client", "initialized": str(datetime.datetime.now())})

        try:
            self.set_early_stopper()
        except NotImplementedError:
            log(
                INFO,
                "Early stopping not implemented for this client. "
                "Override set_early_stopper to activate early stopper.",
            )
        self.initialized = True

    def set_early_stopper(self) -> None:
        """
        User-defined method that sets the early stopper for the client. An override must set
        self.early_stopper to an instance of EarlyStopper, which is defined in
        fl4health.utils.early_stopper. Example implementation:

        from fl4health.utils.early_stopper import EarlyStopper
        self.early_stopper = EarlyStopper(client=self, patience=3, interval_steps=100)
        """
        raise NotImplementedError
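
For illustration, a minimal sketch of such an override in a hypothetical subclass (the subclass name and the patience/interval values here are assumptions for the example, not part of this diff):

from fl4health.utils.early_stopper import EarlyStopper

class MyClient(BasicClient):
    def set_early_stopper(self) -> None:
        # Check the validation loss every 100 training steps and stop after
        # 3 consecutive checks without improvement.
        self.early_stopper = EarlyStopper(client=self, patience=3, interval_steps=100)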

    def get_parameter_exchanger(self, config: Config) -> ParameterExchanger:
        """
        Returns Full Parameter Exchangers. Subclasses that require custom Parameter Exchangers can override this.
@@ -1113,7 +1143,7 @@ def get_model(self, config: Config) -> nn.Module:
"""
raise NotImplementedError

-    def get_lr_scheduler(self, optimizer_key: str, config: Config) -> _LRScheduler | None:
+    def get_lr_scheduler(self, optimizer_key: str, config: Config) -> LRScheduler | None:
"""
Optional user defined method that returns learning rate scheduler
to be used throughout training for the given optimizer. Defaults to None.
@@ -1125,7 +1155,7 @@ def get_lr_scheduler(self, optimizer_key: str, config: Config) -> _LRScheduler |
            config (Config): The config from the server.

        Returns:
-            _LRScheduler | None: Client learning rate schedulers.
+            LRScheduler | None: Client learning rate schedulers.
        """
        return None
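
As a sketch, an override could look like the following; StepLR and its parameters are illustrative assumptions, and the optimizer is assumed to live in the client's optimizers dict under optimizer_key:

from torch.optim.lr_scheduler import LRScheduler, StepLR

def get_lr_scheduler(self, optimizer_key: str, config: Config) -> LRScheduler | None:
    # Decay the learning rate of the selected optimizer by 10x every 30 steps (illustrative schedule).
    return StepLR(self.optimizers[optimizer_key], step_size=30, gamma=0.1)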

162 changes: 162 additions & 0 deletions fl4health/utils/early_stopper.py
@@ -0,0 +1,162 @@
from collections.abc import Callable
from logging import INFO
from pathlib import Path
from typing import Any

import torch.nn as nn
from flwr.common.logger import log
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LRScheduler

from fl4health.checkpointing.checkpointer import PerRoundStateCheckpointer
from fl4health.clients.basic_client import BasicClient
from fl4health.reporting.reports_manager import ReportsManager
from fl4health.utils.logging import LoggingMode
from fl4health.utils.losses import TrainingLosses
from fl4health.utils.metrics import MetricManager
from fl4health.utils.snapshotter import (
    LRSchedulerSnapshotter,
    NumberSnapshotter,
    OptimizerSnapshotter,
    SerizableObjectSnapshotter,
    Snapshotter,
    T,
    TorchModuleSnapshotter,
)


class EarlyStopper:
    def __init__(
        self,
        client: BasicClient,
        patience: int = 0,
        interval_steps: int = 5,
        snapshot_dir: Path | None = None,
    ) -> None:
"""
Early stopping class is an plugin for the client that allows to stop local training based on the validation
sanaAyrml marked this conversation as resolved.
Show resolved Hide resolved
loss. At each training step this class saves the best state of the client and restores it if the client is
stopped. If the client starts to overfit, the early stopper will stop the training process and restore the best
state of the client before sending the model to the server.

        Args:
            client (BasicClient): The client to be monitored.
            patience (int, optional): Number of consecutive validation checks without improvement to
                tolerate before stopping training. If set to 0, the client never stops early, but it still
                loads the best state before sending the model to the server. Defaults to 0.
            interval_steps (int, optional): Determines how often, in training steps, the early stopper
                checks the validation loss. Defaults to 5.
            snapshot_dir (Path | None, optional): Rather than keeping the best state in memory, it can be
                checkpointed to this directory. If None, the best state is kept in memory. Defaults to None.
        """

        self.client = client

        self.patience = patience
        self.count_down = patience
        self.interval_steps = interval_steps

        self.best_score: float | None = None
        self.snapshot_ckpt: dict[str, Any] = {}

        self.default_snapshot_attrs: dict = {
Collaborator: I think we can be more specific with the dict type here. At the very least, I think we can annotate it as dict[str, tuple[Any, type[T]]]. It's possible that we can also do dict[str, tuple[Snapshotter[T], type[T]]] but I'm less certain that will come out right.
Collaborator: Is T generic?
Collaborator Author: Yes, T is generic, and because of that I actually get an error even when I write tuple[Any, type[T]].
Collaborator Author: I can make it more specific with tuple[Snapshotter, Any].
Collaborator: Interesting. I'm good with the slightly stricter typing. Certain things with generics are always trickier.
"model": (TorchModuleSnapshotter(self.client), nn.Module),
"optimizers": (OptimizerSnapshotter(self.client), Optimizer),
"lr_schedulers": (
LRSchedulerSnapshotter(self.client),
LRScheduler,
),
"learning_rate": (NumberSnapshotter(self.client), float),
"total_steps": (NumberSnapshotter(self.client), int),
"total_epochs": (NumberSnapshotter(self.client), int),
"reports_manager": (
SerizableObjectSnapshotter(self.client),
ReportsManager,
),
"train_loss_meter": (
SerizableObjectSnapshotter(self.client),
TrainingLosses,
),
"train_metric_manager": (
SerizableObjectSnapshotter(self.client),
MetricManager,
),
}

        # Fall back to keeping snapshots in memory when no snapshot_dir is given.
        self.checkpointer = PerRoundStateCheckpointer(snapshot_dir) if snapshot_dir is not None else None

    def add_default_snapshot_attr(
        self, name: str, snapshot_class: Callable[[BasicClient], Snapshotter], input_type: type[T]
    ) -> None:
        """Register an additional client attribute to be captured in snapshots."""
        self.default_snapshot_attrs.update({name: (snapshot_class(self.client), input_type)})

    def delete_default_snapshot_attr(self, name: str) -> None:
        """Remove an attribute from the set of snapshotted attributes."""
        del self.default_snapshot_attrs[name]

    def save_snapshot(self) -> None:
        """
        Creates a snapshot of the client state and, if a checkpointer is configured, saves it to disk;
        otherwise the snapshot is kept in memory in self.snapshot_ckpt.
        """
        for attr, (snapshotter_function, expected_type) in self.default_snapshot_attrs.items():
            self.snapshot_ckpt.update(snapshotter_function.save(attr, expected_type))

        if self.checkpointer is not None:
            self.checkpointer.save_checkpoint(f"temp_{self.client.client_name}.pt", self.snapshot_ckpt)
            self.snapshot_ckpt.clear()

            log(
                INFO,
                f"Saving client best state to checkpoint at {self.checkpointer.checkpoint_dir} "
                f"with name temp_{self.client.client_name}.pt",
            )

    def load_snapshot(self, attrs: list[str]) -> None:
        """
        Load the snapshotted state for the given attributes from the checkpoint (or from memory).

        Args:
            attrs (list[str]): List of attributes to load from the snapshot.
        """
        checkpoint_name = f"temp_{self.client.client_name}.pt"
        assert (
            self.checkpointer is not None and self.checkpointer.checkpoint_exists(checkpoint_name)
        ) or self.snapshot_ckpt != {}, "No checkpoint to load"

        if self.checkpointer is not None and self.checkpointer.checkpoint_exists(checkpoint_name):
            self.snapshot_ckpt = self.checkpointer.load_checkpoint(checkpoint_name)

        for attr in attrs:
            snapshotter_function, expected_type = self.default_snapshot_attrs[attr]
            snapshotter_function.load(self.snapshot_ckpt, attr, expected_type)

    def should_stop(self) -> bool:
        """
        Determine if the client should stop training based on the early stopping criterion.

        Returns:
            bool: True if training should stop, otherwise False.
        """
        val_loss, _ = self.client._validate_or_test(
            loader=self.client.val_loader,
            loss_meter=self.client.val_loss_meter,
            metric_manager=self.client.val_metric_manager,
            logging_mode=LoggingMode.EARLY_STOP_VALIDATION,
            include_losses_in_metrics=False,
        )

        if val_loss is None:
            return False

        if self.best_score is None or val_loss < self.best_score:
            # New best validation loss: reset the patience countdown and snapshot this state.
            self.best_score = val_loss
            self.count_down = self.patience
            self.save_snapshot()
            return False

        self.count_down -= 1
        if self.count_down == 0:
            # Patience exhausted: restore the best state and signal the client to stop.
            self.load_snapshot(list(self.default_snapshot_attrs.keys()))
            return True

        return False
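
To make the snapshot API above concrete, a minimal usage sketch follows; my_client and the "num_checks" attribute are hypothetical, and only methods defined in this file are called:

from fl4health.utils.early_stopper import EarlyStopper
from fl4health.utils.snapshotter import NumberSnapshotter

early_stopper = EarlyStopper(client=my_client, patience=3, interval_steps=100)
# Track an extra numeric client attribute alongside the default snapshot set.
early_stopper.add_default_snapshot_attr("num_checks", NumberSnapshotter, int)
# Drop an attribute that does not need to be restored on early stop.
early_stopper.delete_default_snapshot_attr("reports_manager")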
1 change: 1 addition & 0 deletions fl4health/utils/logging.py
@@ -3,5 +3,6 @@

class LoggingMode(Enum):
    TRAIN = "Training"
    EARLY_STOP_VALIDATION = "Early_Stop_Validation"
    VALIDATION = "Validation"
    TEST = "Testing"