huggingface · dacorvo · Sep 3, 2024 · Jun 24, 2024 · Sep 5, 2024 · Sep 13, 2024
@@ -5,6 +5,8 @@
 import torch
 
 from optimum.quanto.tensor.weights.awq import AWQPackedTensor, AWQPacking
+from optimum.quanto.tensor.weights.marlin import marlin_permute
+from optimum.quanto.tensor.weights.marlin.int4 import MarlinInt4PackedTensor
 
 
 def benchmark(f, warmup=1, iter=10):
@@ -28,12 +30,15 @@ def get_problem(m, n, k, groupsize=128):
     A = torch.rand((m, k), dtype=torch.half, device=dev)
     B_4bit = torch.randint(0, 2**4, (n, k), dtype=torch.uint8, device=dev)
     B_awq = AWQPackedTensor.pack(B_4bit, packing=AWQPacking.V2)._data
+    B_marlin = MarlinInt4PackedTensor.pack(B_4bit)._data
     B_ref = torch.rand((k, n), dtype=torch.half, device=dev)
     s = torch.rand((k // groupsize, n), dtype=torch.half, device=dev) / 2**4
+    s_marlin = marlin_permute(s)
     z = torch.randint(-(2 ** (4 - 1)), 2 ** (4 - 1), (k // groupsize, n), dtype=torch.int8, device=dev)
     sz = -z * s
+    sz_marlin = marlin_permute(sz)
     torch.cuda.synchronize()
-    return A, B_ref, B_awq, s, sz
+    return A, B_ref, B_awq, B_marlin, s, s_marlin, sz, sz_marlin
 
 
 def benchmark_dense(A, B, m, n, k):
@@ -56,6 +61,16 @@ def benchmark_awq(A, B, s, sz, m, n, k):
     }
 
 
+def benchmark_marlin(A, B, s, sz, m, n, k):
+    workspace = torch.zeros(n // 128 * 16, dtype=torch.int, device=torch.device("cuda:0"))
+    res = benchmark(lambda: torch.ops.quanto.gemm_f16i4_marlin(A, B, s, sz, workspace))
+    return {
+        "s": res,
+        "TFLOP/s": 2 * (m * k) * n / res / 10**12,
+        "GB/s": (2 * A.numel() + 4 * B.numel() + 2 * (m * n) + 2 * s.numel() + 2 * sz.numel()) / res / 10**9,
+    }
+
+
 MODELS = {
     "Llama7B": [(4096, 3 * 4096), (4096, 4096), (4096, 2 * 10752), (10752, 4096)],
     "Llama13B": [(5120, 3 * 5120), (5120, 5120), (5120, 2 * 13568), (13568, 5120)],
@@ -79,23 +94,37 @@ def run_benchmark(model, tokens=None):
     print(model)
     for m in tokens:
         tot_awq = {"s": 0, "TFLOP/s": 0, "GB/s": 0, "speedup": 0}
+        tot_marlin = {"s": 0, "TFLOP/s": 0, "GB/s": 0, "speedup": 0}
         for layer in layers:
             k, n = layer
-            A, B_ref, B_awq, s, sz = get_problem(m, n, k, groupsize)
+            A, B_ref, B_awq, B_marlin, s, s_marlin, sz, sz_marlin = get_problem(m, n, k, groupsize)
             res_d = benchmark_dense(A, B_ref, m, n, k)
             res_awq = benchmark_awq(A, B_awq, s, sz, m, n, k)
             res_awq["speedup"] = res_d["s"] / res_awq["s"]
             tot_awq["s"] += res_awq["s"]
             for key in tot_awq:
                 if key != "s":
                     tot_awq[key] += res_awq[key] * res_awq["s"]
+            res_marlin = benchmark_marlin(A, B_marlin, s_marlin, sz_marlin, m, n, k)
+            res_marlin["speedup"] = res_d["s"] / res_marlin["s"]
+            tot_marlin["s"] += res_marlin["s"]
+            for key in tot_marlin:
+                if key != "s":
+                    tot_marlin[key] += res_marlin[key] * res_marlin["s"]
         for key in tot_awq:
             if key != "s":
                 tot_awq[key] /= tot_awq["s"]
+        for key in tot_marlin:
+            if key != "s":
+                tot_marlin[key] /= tot_marlin["s"]
         print(
             "AWQ, tokens=%04d: s=%.5f, TFLOP/s=%07.3f, GB/s=%08.3f, speedup=%.2f"
             % (m, tot_awq["s"], tot_awq["TFLOP/s"], tot_awq["GB/s"], tot_awq["speedup"])
         )
+        print(
+            "Marlin, batch=%04d: s=%.5f, TFLOP/s=%07.3f, GB/s=%08.3f, speedup=%.2f"
+            % (m, tot_marlin["s"], tot_marlin["TFLOP/s"], tot_marlin["GB/s"], tot_marlin["speedup"])
+        )
 
 
 def main():

@@ -60,6 +60,8 @@ def get_max_cuda_arch():
     "awq/v2/gemv_cuda.cu",
     "marlin/fp8_marlin.cu",
     "marlin/gptq_marlin_repack.cu",
+    "marlin/marlin_cuda.cpp",
+    "marlin/marlin_cuda_kernel.cu",
     "pybind_module.cpp",
 ]
 ext = Extension(
@@ -142,3 +144,38 @@ def gptq_marlin_repack(
     assert b_q_weight.dim() == 2
     assert b_q_weight.dtype == torch.int32
     return ext.lib.gptq_marlin_repack(b_q_weight, perm, size_k, size_n, num_bits)
+
+
+# Declare the schema of the `quanto::gemm_f16i4_marlin` custom op: five tensor
+# arguments in, one tensor out. The CUDA implementation is registered separately.
+torch.library.define(
+    "quanto::gemm_f16i4_marlin",
+    "(Tensor input, Tensor other, Tensor other_scale, Tensor other_shift, Tensor workspace) -> Tensor",
+)
+
+
+@torch.library.impl("quanto::gemm_f16i4_marlin", ["CUDA"])
+def gemm_f16i4_marlin(
+    input: torch.Tensor, other: torch.Tensor, scales: torch.Tensor, zeropoint: torch.Tensor, workspace: torch.Tensor
+) -> torch.Tensor:
+    assert input.dtype == torch.float16
+    assert other.dtype == torch.int32
+    assert scales.dtype == torch.float16
+    assert zeropoint.dtype == torch.float16
+    assert workspace.dtype == torch.int32
+    output = torch.empty(
+        input.shape[:-1] + (scales.shape[1],),
+        dtype=input.dtype,
+        device=input.device,
+    )
+    ext.lib.marlin_gemm_f16i4(
+        input.view((-1, input.shape[-1])),
+        other,
+        output.view((-1, output.shape[-1])),
+        scales,
+        zeropoint,
+        workspace,
+        -1,
+        -1,
+        -1,
+        16,
+    )
+    return output
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "marlin_cuda.h"
+
+#include <torch/all.h>
+#include <torch/python.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda_runtime.h>
+
+#include "marlin_cuda_kernel.cuh"
+
+// Status codes that mul() compares against the value returned by marlin_cuda().
+const int ERR_PROB_SHAPE = 1; // reported as: problem shape not compatible with thread_k / thread_n
+const int ERR_KERN_SHAPE = 2; // reported as: no kernel for the thread_k / thread_n / groupsize combination
+
+// Host-side entry point for the Marlin fp16 x int4 matrix multiplication.
+//
+// Derives the problem sizes from the tensor shapes (m = A.size(0),
+// k = A.size(1), n = C.size(1)), validates the group count and the workspace
+// size, then launches marlin_cuda() on the current CUDA stream of A's device.
+// The result is written into C. Throws c10::Error (via TORCH_CHECK) when the
+// arguments are inconsistent or when marlin_cuda() reports a shape/kernel error.
+void mul(
+  const torch::Tensor& A,
+  const torch::Tensor& B,
+        torch::Tensor& C,
+  const torch::Tensor& s,
+  const torch::Tensor& sz, // ADDED: add scaled zero point
+        torch::Tensor& workspace,
+  int thread_k,
+  int thread_n,
+  int sms,
+  int max_par
+) {
+  int prob_m = A.size(0);
+  int prob_n = C.size(1);
+  int prob_k = A.size(1);
+  // A single row of scales is passed on as groupsize -1; otherwise k is split
+  // evenly across the s.size(0) groups.
+  int groupsize = (s.size(0) == 1) ? -1 : prob_k / s.size(0);
+  // TORCH_CHECK replaces the deprecated AT_ERROR macro; messages are unchanged.
+  TORCH_CHECK(
+    groupsize == -1 || groupsize * s.size(0) == prob_k,
+    "k=", prob_k, " not compatible with ", s.size(0), " groups."
+  );
+  TORCH_CHECK(
+    workspace.numel() >= prob_n / 128 * max_par,
+    "workspace must be of size at least ", prob_n / 128 * max_par, "."
+  );
+  int dev = A.get_device();
+  int err = marlin_cuda(
+    A.data_ptr(),
+    B.data_ptr(),
+    C.data_ptr(),
+    s.data_ptr(),
+    sz.data_ptr(), // ADDED: add scaled zero point
+    prob_m, prob_n, prob_k,
+    workspace.data_ptr(),
+    groupsize,
+    dev,
+    at::cuda::getCurrentCUDAStream(dev),
+    thread_k,
+    thread_n,
+    sms,
+    max_par
+  );
+  // Translate the two known status codes into exceptions; any other value is
+  // left unreported, as before.
+  TORCH_CHECK(
+    err != ERR_PROB_SHAPE,
+    "Problem (m=", prob_m, ", n=", prob_n, ", k=", prob_k, ")",
+    " not compatible with thread_k=", thread_k, ", thread_n=", thread_n, "."
+  );
+  TORCH_CHECK(
+    err != ERR_KERN_SHAPE,
+    "No kernel implementation for thread_k=", thread_k, ", thread_n=", thread_n, ", groupsize=", groupsize, "."
+  );
+}
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *         http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <torch/extension.h>
+
+// Marlin fp16 x int4 matrix multiplication entry point.
+//
+//   A          left-hand operand; its two sizes give the m and k of the problem.
+//   B          right-hand operand (presumably the Marlin-packed int4 weights -- confirm).
+//   C          output tensor, written in place; C.size(1) gives n.
+//   s          scales; s.size(0) is the number of groups along k.
+//   sz         scaled zero point.
+//   workspace  scratch buffer; must hold at least n / 128 * max_par elements.
+//   thread_k, thread_n, sms, max_par
+//              forwarded unchanged to the CUDA launcher (-1 presumably selects
+//              an automatic value -- confirm against the kernel).
+void mul(
+  const torch::Tensor& A,
+  const torch::Tensor& B,
+        torch::Tensor& C,
+  const torch::Tensor& s,
+  const torch::Tensor& sz,
+        torch::Tensor& workspace,
+  int thread_k = -1,
+  int thread_n = -1,
+  int sms = -1,
+  int max_par = 8
+);