Commit f70230e

remove fp8 parameters
xrsrke committed Jan 17, 2025
1 parent 222bd00 commit f70230e
Showing 7 changed files with 0 additions and 286 deletions.
10 changes: 0 additions & 10 deletions src/nanotron/constants.py
@@ -11,14 +11,4 @@
CHECKPOINT_FILE_NAME = "checkpoint_metadata.json"
MODEL_CONFIG_FILE_NAME = "model_config.json"


# TODO(xrsrke): remove this shit
ITERATION_STEP = 1
# TODO(xrsrke): refactor to training stage,
# keep it in the same class as iteration_step

is_ready_to_log = False

# TODO(xrsrke): refactor
CPU_WEIGHTS = {}
ACCUM_GRADS = {}
1 change: 0 additions & 1 deletion src/nanotron/fp8/__init__.py
@@ -2,7 +2,6 @@

from nanotron.fp8.dtypes import DTypes # noqa
from nanotron.fp8.linear import FP8Linear # noqa
from nanotron.fp8.parameter import FP8Parameter # noqa
from nanotron.fp8.tensor import FP8Tensor # noqa

try:
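
With the FP8Parameter re-export dropped above (and the class itself deleted in the next file), downstream code that imported it from the package now fails at import time, while the remaining re-exports are unaffected. A small illustration, assuming nanotron is installed at this commit:

# Removed by this commit -- importing it now raises ImportError:
# from nanotron.fp8 import FP8Parameter

# Still re-exported by src/nanotron/fp8/__init__.py after this change:
from nanotron.fp8 import FP8Linear, FP8Tensor
from nanotron.fp8.dtypes import DTypes
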
115 changes: 0 additions & 115 deletions src/nanotron/fp8/parameter.py

This file was deleted.

1 change: 0 additions & 1 deletion src/nanotron/parallel/parameters.py
@@ -243,7 +243,6 @@ def __repr__(self):

@property
def data(self):
# from nanotron.fp8.parameter import FP8Parameter
return self._data

@data.setter
7 changes: 0 additions & 7 deletions src/nanotron/trainer.py
@@ -216,12 +216,6 @@ def __init__(
if hasattr(p, "_is_future_fp8") and p._is_future_fp8 is True:
constants.CPU_WEIGHTS[n.replace("module.", "")] = p.data.cpu().clone()

# NOTE: sanity check all hash are different
# param_hash = []
# for p in self.model.parameters():
# assert hash(p) not in param_hash
# param_hash.append(hash(p))

# NOTE: if we cast model to FP8 before wrapping it with NanotronParameter,
# then we can create a NanotronParameter that has dtype=[torch.int8, torch.uint8]
# which then it allows us to assign [torch.int8, torch.uint8] gradients to the parameter
@@ -231,7 +225,6 @@
# Please ensure that the gradient and the tensor have the same dtype"
# NOTE: the reason that we cast after initializing the optimizer is that
# we want to create some master weights for fp8 parameters, before quantizing them

if self.config.model.dtype == torch.int8:
self.model = convert_model_to_fp8(self.model, config=self.config)

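The NOTE comments kept in the trainer.py hunk above describe the ordering constraint: the optimizer is created while the parameters are still full precision, so master weights can be made for the FP8 parameters, and only afterwards is the model cast via convert_model_to_fp8. A minimal sketch of that ordering in plain PyTorch -- downcast_parameters is a hypothetical stand-in for nanotron's convert_model_to_fp8, which additionally stores FP8/int8 data and scaling metadata:

import torch
from torch import nn

model = nn.Linear(16, 16)

# 1. Create master weights and the optimizer while parameters are still full precision.
master_weights = {name: p.detach().clone().float() for name, p in model.named_parameters()}
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 2. Only then downcast the parameters in place (stand-in for the FP8 cast).
def downcast_parameters(module: nn.Module) -> nn.Module:
    for p in module.parameters():
        p.data = p.data.to(torch.bfloat16)
    return module

model = downcast_parameters(model)

Only the ordering matters here; the actual code path is the convert_model_to_fp8(self.model, config=self.config) call shown in the hunk above.
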
151 changes: 0 additions & 151 deletions tests/fp8/_test_fp8_parameter.py

This file was deleted.

1 change: 0 additions & 1 deletion tests/fp8/_test_linear.py
@@ -6,7 +6,6 @@
from nanotron.fp8.constants import FP8_DTYPES, QTYPE_TO_DTYPE
from nanotron.fp8.dtypes import DTypes
from nanotron.fp8.linear import FP8Linear, FP8LinearMeta
from nanotron.fp8.parameter import FP8Parameter
from nanotron.fp8.recipe import FP8LinearRecipe
from nanotron.fp8.tensor import FP8Tensor, convert_tensor_from_fp8
from nanotron.fp8.utils import convert_linear_to_fp8, convert_to_fp8_module, is_overflow_underflow_nan
