Merge branch 'debug' into cuda-bin-switch-and-cli
TimDettmers committed Aug 4, 2022
2 parents 96bc209 + ab72a12 commit 758c717
Showing 5 changed files with 180 additions and 199 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -58,7 +58,7 @@ CC_cublasLt111 += -gencode arch=compute_86,code=sm_86


all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)

17 changes: 15 additions & 2 deletions bitsandbytes/autograd/_functions.py
@@ -1,7 +1,7 @@
from dataclasses import dataclass

import torch

import math
import bitsandbytes as bnb
import bitsandbytes.functional as F
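
The new math import backs the empty-input check added further down: math.prod(A.shape) is zero whenever any dimension of A is zero. A small illustrative sketch (hypothetical shapes; math.prod needs Python 3.8+):

import math
import torch

A = torch.empty(0, 4, dtype=torch.float16)
print(math.prod(A.shape))   # 0 -> the forward pass below treats this as an empty input
print(A.numel())            # also 0; an equivalent emptiness check on the tensor itself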

@@ -199,6 +199,17 @@ def reset_grads(self):
class MatMul8bitLt(torch.autograd.Function):
    @staticmethod
    def forward(ctx, A, B, out=None, state=MatmulLtState()):
        # default to pytorch behavior if inputs are empty
        ctx.is_empty = False
        if math.prod(A.shape) == 0:
            ctx.is_empty = True
            ctx.A = A
            ctx.B = B
            if A.shape[-1] == B.shape[0]:
                return torch.empty(A.shape[:-1]+B.shape[1:], dtype=torch.float16, device=A.device)
            else:
                return torch.empty(A.shape[:-1]+B.shape[:1], dtype=torch.float16, device=A.device)

        # 1. Quantize A
        # 2. Quantize B
        # 3. Matmul
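
For context, a minimal usage sketch of the empty-input shortcut added above, assuming a CUDA build of bitsandbytes is importable and a CUDA device is available (illustrative only; the shortcut itself allocates just an empty tensor and launches no kernels):

import torch
from bitsandbytes.autograd._functions import matmul  # matmul = MatMul8bitLt.apply, defined at the end of this file

A = torch.empty(0, 4, dtype=torch.float16, device="cuda")   # zero rows -> math.prod(A.shape) == 0
B = torch.empty(4, 8, dtype=torch.float16, device="cuda")
out = matmul(A, B)                 # returns before any 8-bit quantization happens
print(out.shape, out.dtype)        # torch.Size([0, 8]) torch.float16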
@@ -339,6 +350,8 @@ def forward(ctx, A, B, out=None, state=MatmulLtState()):

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.is_empty:
            return torch.zeros_like(ctx.A), torch.zeros_like(ctx.B), None, None
        req_gradA, req_gradB = ctx.req_grads
        CAt, subA = ctx.tensors
        SCAt, idx = ctx.tensor_states
@@ -375,7 +388,7 @@ def backward(ctx, grad_output):
ctx.grad_shape
)

        return grad_A, grad_B, None, None, None, None, None
        return grad_A, grad_B, None, None


matmul = MatMul8bitLt.apply
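
A hedged sketch of the gradient path these hunks add, under the same assumed setup as the earlier sketch: for empty inputs, backward returns zero gradients shaped like the saved A and B, and the shortened return tuple lines up with forward's four arguments (A, B, out, state):

A = torch.empty(0, 4, dtype=torch.float16, device="cuda", requires_grad=True)
B = torch.randn(4, 8, dtype=torch.float16, device="cuda", requires_grad=True)
out = matmul(A, B)         # forward sets ctx.is_empty and stashes A and B on ctx
out.sum().backward()       # takes the early return in backward
assert A.grad.shape == A.shape     # zeros shaped (0, 4)
assert torch.all(B.grad == 0)      # zeros shaped (4, 8)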
(Diffs for the remaining three changed files are not shown here.)
