perf: faster and less memory-intensive model [re]quantization
latentCall145 authored and dacorvo committed Aug 28, 2024
1 parent 739309f · commit 4c4bff5
Showing 2 changed files with 5 additions and 5 deletions.
optimum/quanto/nn/qmodule.py (5 changes: 3 additions & 2 deletions)
@@ -216,9 +216,10 @@ def from_module(
             qmodule.input_scale = torch.ones_like(qmodule.input_scale)
             qmodule.output_scale = torch.ones_like(qmodule.output_scale)
         with torch.no_grad():
-            qmodule.weight.copy_(module.weight)
+            qmodule.weight = module.weight
             if module.bias is not None:
-                qmodule.bias.copy_(module.bias)
+                qmodule.bias = module.bias
+
         return qmodule.to(module.weight.device)
 
     @classmethod
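
Why the qmodule.py change helps: Tensor.copy_ writes the source weights element by element into a second, already-allocated tensor, so two full copies of each parameter are alive at once and a copy kernel runs for every layer. Plain assignment instead rebinds the module attribute to the existing tensor. Below is a minimal sketch of the difference, using a stand-in pair of nn.Linear modules rather than the library's actual QModuleMixin:

import torch
import torch.nn as nn

module = nn.Linear(1024, 1024)   # source module holding the trained weights
qmodule = nn.Linear(1024, 1024)  # stand-in for the freshly built quantized module

# Before: copy_ fills qmodule's already-allocated weight with module's values,
# so both full-size tensors coexist and an elementwise copy runs.
with torch.no_grad():
    qmodule.weight.copy_(module.weight)

# After: assignment rebinds the attribute to module's existing Parameter
# (nn.Module.__setattr__ re-registers it); no new allocation, no copy.
qmodule.weight = module.weight
assert qmodule.weight.data_ptr() == module.weight.data_ptr()  # storage is shared

Sharing storage with the source module is acceptable in this flow: the float weights only serve as input to quantization and are replaced once the model is frozen.
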
optimum/quanto/quantize.py (5 changes: 2 additions & 3 deletions)
@@ -133,10 +133,9 @@ def move_tensor(t, device):
             setattr(m, name, torch.nn.Parameter(move_tensor(param, "cpu")))
         for name, param in m.named_buffers(recurse=False):
             setattr(m, name, move_tensor(param, "cpu"))
-    # Freeze model and move to target device
-    freeze(model)
-    model.to(device)
 
+    # Move to target device
+    model.to(device)
     # Load the quantized model weights
     model.load_state_dict(state_dict, strict=False)

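Why the quantize.py change helps: freeze(model) quantizes every weight in place, paying for quantization kernels and fresh allocations, but the load_state_dict call that follows immediately overwrites those weights with the serialized quantized ones, so during a reload the freeze was pure waste. Below is a toy sketch of the ordering issue; toy_freeze is a hypothetical stand-in for quanto's freeze(), not the library's code:

import torch
import torch.nn as nn

def toy_freeze(model: nn.Module) -> None:
    # Stand-in for freeze(): transforms every weight in place, which costs
    # a kernel launch and a temporary allocation per parameter.
    with torch.no_grad():
        for p in model.parameters():
            p.copy_(p.round())  # pretend "quantization"

model = nn.Linear(8, 8)
state_dict = {k: v.clone() for k, v in model.state_dict().items()}  # saved weights

# Before: freezing first does work that the very next call throws away.
toy_freeze(model)                                # wasted compute and memory
model.load_state_dict(state_dict, strict=False)  # overwrites the frozen weights

# After: skip the freeze and load the saved quantized weights directly.
model.load_state_dict(state_dict, strict=False)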
