fix(qlinear): speed up weights transfer

Instantiating the QLinear weights directly on the device saves two copies: from device to CPU and back.
huggingface · Feb 13, 2024 · f56d0df · f56d0df
1 parent f9e7779
commit f56d0df
Showing 1 changed file with 2 additions and 1 deletion.
diff --git a/bench/generation/benchmark.py b/bench/generation/benchmark.py
@@ -173,8 +173,9 @@ def main():
                 # Very simple calibration to avoid completely off results
                 with Calibration():
                     generate(model, tokenizer, device, prompt=CALIBRATION_PROMPT)
+            print("Freezing")
             freeze(model)
-            print(f"Finished: {time.time()-start}")
+            print(f"Finished: {time.time()-start:.2f}")
 
     memory = get_device_memory(device)
     if memory is not None: