perf: faster and less memory-intensive model [re]quantization
latentCall145 authored and dacorvo committed Aug 28, 2024
1 parent 739309f · commit 4c4bff5
Showing 2 changed files with 5 additions and 5 deletions.
optimum/quanto/nn/qmodule.py (5 changes: 3 additions & 2 deletions)
@@ -216,9 +216,10 @@ def from_module(
             qmodule.input_scale = torch.ones_like(qmodule.input_scale)
             qmodule.output_scale = torch.ones_like(qmodule.output_scale)
         with torch.no_grad():
-            qmodule.weight.copy_(module.weight)
+            qmodule.weight = module.weight
             if module.bias is not None:
-                qmodule.bias.copy_(module.bias)
+                qmodule.bias = module.bias
+
         return qmodule.to(module.weight.device)
 
     @classmethod
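
Why the qmodule.py change helps: Tensor.copy_ writes the source weights element by element into a second, already-allocated tensor, so two full copies of each parameter are alive at once and a copy kernel runs for every layer. Plain assignment instead rebinds the module attribute to the existing tensor. Below is a minimal sketch of the difference, using a stand-in pair of nn.Linear modules rather than the library's actual QModuleMixin:

import torch
import torch.nn as nn

module = nn.Linear(1024, 1024)   # source module holding the trained weights
qmodule = nn.Linear(1024, 1024)  # stand-in for the freshly built quantized module

# Before: copy_ fills qmodule's already-allocated weight with module's values,
# so both full-size tensors coexist and an elementwise copy runs.
with torch.no_grad():
    qmodule.weight.copy_(module.weight)

# After: assignment rebinds the attribute to module's existing Parameter
# (nn.Module.__setattr__ re-registers it); no new allocation, no copy.
qmodule.weight = module.weight
assert qmodule.weight.data_ptr() == module.weight.data_ptr()  # storage is shared

Sharing storage with the source module is acceptable in this flow: the float weights only serve as input to quantization and are replaced once the model is frozen.
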
optimum/quanto/quantize.py (5 changes: 2 additions & 3 deletions)
@@ -133,10 +133,9 @@ def move_tensor(t, device):
             setattr(m, name, torch.nn.Parameter(move_tensor(param, "cpu")))
         for name, param in m.named_buffers(recurse=False):
             setattr(m, name, move_tensor(param, "cpu"))
-    # Freeze model and move to target device
-    freeze(model)
-    model.to(device)
 
+    # Move to target device
+    model.to(device)
     # Load the quantized model weights
     model.load_state_dict(state_dict, strict=False)

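Why the quantize.py change helps: freeze(model) quantizes every weight in place, paying for quantization kernels and fresh allocations, but the load_state_dict call that follows immediately overwrites those weights with the serialized quantized ones, so during a reload the freeze was pure waste. Below is a toy sketch of the ordering issue; toy_freeze is a hypothetical stand-in for quanto's freeze(), not the library's code:

import torch
import torch.nn as nn

def toy_freeze(model: nn.Module) -> None:
    # Stand-in for freeze(): transforms every weight in place, which costs
    # a kernel launch and a temporary allocation per parameter.
    with torch.no_grad():
        for p in model.parameters():
            p.copy_(p.round())  # pretend "quantization"

model = nn.Linear(8, 8)
state_dict = {k: v.clone() for k, v in model.state_dict().items()}  # saved weights

# Before: freezing first does work that the very next call throws away.
toy_freeze(model)                                # wasted compute and memory
model.load_state_dict(state_dict, strict=False)  # overwrites the frozen weights

# After: skip the freeze and load the saved quantized weights directly.
model.load_state_dict(state_dict, strict=False)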
