feat(library): add CUDA unpack kernel

huggingface · Mar 13, 2024 · 247b148 · 247b148
1 parent eafdb20
commit 247b148
Show file tree

Hide file tree

Showing 7 changed files with 144 additions and 1 deletion.
diff --git a/quanto/library/ext/__init__.py b/quanto/library/ext/__init__.py
@@ -3,5 +3,8 @@
 from .cpp import *
 
 
+if torch.cuda.is_available():
+    from .cuda import *
+
 if torch.backends.mps.is_available():
     from .mps import *
diff --git a/quanto/library/ext/cpp/__init__.py b/quanto/library/ext/cpp/__init__.py
@@ -38,6 +38,6 @@ def quantize_symmetric_cpp(t: torch.Tensor, scale: torch.Tensor, dtype: torch.Te
     return ext().quantize_symmetric(t, scale, dtype)
 
 
-@torch.library.impl("quanto_ext::unpack", ["CPU", "CUDA"])
+@torch.library.impl("quanto_ext::unpack", ["CPU"])
 def unpack_cpp(t: torch.Tensor, bits: int):
     return ext().unpack(t, bits)
diff --git a/quanto/library/ext/cuda/README.md b/quanto/library/ext/cuda/README.md
@@ -0,0 +1,11 @@
+# Quanto generic CUDA extension
+
+Kernels in this extension can use both the C++ and CUDA syntax.
+
+They can use any pytorch operation defined under `aten::` or `c10::`.
+
+To add a new implementation for an operation defined in `library./ops.py`:
+
+- add the corresponding `.cpp` or `.cu` file to the list of sources in `__init__.py`,
+- add a binding to `pybind_module.cpp`,
+- provide an implementation calling the binding in `__init__.py`.
diff --git a/quanto/library/ext/cuda/__init__.py b/quanto/library/ext/cuda/__init__.py
@@ -0,0 +1,30 @@
+import os
+
+import torch
+from torch.utils.cpp_extension import load
+
+
+__all__ = []
+
+
+_ext = None
+
+
+def ext():
+    """Helper to load the CUDA ext only when it is required"""
+    global _ext
+    if _ext is None:
+        module_path = os.path.dirname(__file__)
+        _ext = load(
+            name="quanto_cuda",
+            sources=[
+                f"{module_path}/unpack.cu",
+                f"{module_path}/pybind_module.cpp",
+            ],
+        )
+    return _ext
+
+
+@torch.library.impl("quanto_ext::unpack", ["CUDA"])
+def unpack_cuda(t: torch.Tensor, bits: int):
+    return ext().unpack(t, bits)
diff --git a/quanto/library/ext/cuda/pybind_module.cpp b/quanto/library/ext/cuda/pybind_module.cpp
@@ -0,0 +1,13 @@
+#include <torch/extension.h>
+#include "unpack.h"
+
+// !IMPORTANT! Some python objects such as dtype, device, are not mapped to C++ types,
+// and need to be explicitly converted using dedicated helpers before calling a C++ method.
+// As a consequence, when an operation takes such an object as parameter, instead
+// of creating a binding directly to the C++ method, you must create a binding to a
+// lambda method that converts the unmapped types and calls the C++ method.
+// See the binding of quantize_symmetric for instance.
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("unpack", &unpack, "unpack");
+}
diff --git a/quanto/library/ext/cuda/unpack.cu b/quanto/library/ext/cuda/unpack.cu
@@ -0,0 +1,83 @@
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <c10/cuda/CUDAException.h>
+
+inline  unsigned int cdiv(unsigned int a, unsigned int b) { return (a + b - 1) / b;}
+#define BLOCK_SIZE 256
+
+using namespace at;
+
+
+static torch::Tensor allocate_output(const torch::Tensor& input, int bits) {
+    int n_packed = 8 / bits;
+    auto output_shape = input.sizes().vec();
+    output_shape[0] = output_shape[0] * n_packed;
+    return torch::empty(output_shape, input.options());
+}
+
+__global__ void unpack_4bit_kernel(unsigned char* input, unsigned char* output, int n) {
+	int i = blockIdx.x*blockDim.x + threadIdx.x;
+	if(i>=n) return;
+
+	output[i]     = (input[i] & 0x0F);
+	output[i + n] = (input[i] & 0xF0) >> 4;
+}
+
+static torch::Tensor unpack_4bit(const torch::Tensor& input){
+
+	auto output = allocate_output(input, 4);
+
+    const auto numel = input.numel();
+	int blocks = cdiv(numel, BLOCK_SIZE);
+	unpack_4bit_kernel<<<blocks, BLOCK_SIZE>>>(
+        input.data_ptr<unsigned char>(),
+        output.data_ptr<unsigned char>(),
+        numel
+    );
+
+	C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+	return output;
+}
+
+__global__ void unpack_2bit_kernel(unsigned char* input, unsigned char* output, int n) {
+	int i = blockIdx.x*blockDim.x + threadIdx.x;
+	if(i>=n) return;
+
+	output[i]       = (input[i] & 0x03);
+	output[i + n]   = (input[i] & 0x0C) >> 2;
+	output[i + n*2] = (input[i] & 0x30) >> 4;
+	output[i + n*3] = (input[i] & 0xC0) >> 6;
+}
+
+static torch::Tensor unpack_2bit(const torch::Tensor& input){
+
+	auto output = allocate_output(input, 2);
+
+    const auto numel = input.numel();
+	int blocks = cdiv(numel, BLOCK_SIZE);
+	unpack_2bit_kernel<<<blocks, BLOCK_SIZE>>>(
+        input.data_ptr<unsigned char>(),
+        output.data_ptr<unsigned char>(),
+        numel
+    );
+
+	C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+	return output;
+}
+
+torch::Tensor unpack(torch::Tensor &t, int bits) {
+    TORCH_CHECK(t.scalar_type() == torch::kUInt8, "Unsupported data type: ", t.scalar_type());
+    TORCH_CHECK(t.device().is_cuda(), "t must be a CUDA tensor.");
+    TORCH_CHECK(t.is_contiguous(), "t must be contiguous.");
+    switch(bits) {
+      case 4:
+        return unpack_4bit(t);
+      case 2:
+        return unpack_2bit(t);
+      default:
+        throw std::invalid_argument("Can only unpack 2-bit or 4-bit tensors.");
+    }
+}
diff --git a/quanto/library/ext/cuda/unpack.h b/quanto/library/ext/cuda/unpack.h
@@ -0,0 +1,3 @@
+#include <torch/extension.h>
+
+torch::Tensor unpack(torch::Tensor &t, int bits);
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		#include <torch/extension.h>

		torch::Tensor unpack(torch::Tensor &t, int bits);