Added compilation from source instructions; easier compilation.

bitsandbytes-foundation · Oct 22, 2021 · 0fb378b · 0fb378b
1 parent d2f1672
commit 0fb378b
Show file tree

Hide file tree

Showing 6 changed files with 84 additions and 40 deletions.
diff --git a/Makefile b/Makefile
@@ -2,7 +2,11 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
 ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH)))
 
 GPP:= /usr/bin/g++
+ifeq ($(CUDA_HOME),)
+	CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev)
+endif
 NVCC := $(CUDA_HOME)/bin/nvcc
+
 ###########################################
 
 CSRC := $(ROOT_DIR)/csrc
@@ -15,58 +19,66 @@ INCLUDE :=  -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/inclu
 LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcuda -lcublas -lcurand -lcusparse -L $(CONDA_PREFIX)/lib
 
 # NVIDIA NVCC compilation flags
-#COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler 
-#COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler 
-#COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
-#COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
-#COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
-#COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
-#COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
-#COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta 
-#COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta 
-#
-## CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not
-#CC_CUDA92 := -gencode arch=compute_30,code=sm_30
-#
-## Later versions of CUDA support the new architectures
-#CC_CUDA10x := -gencode arch=compute_30,code=sm_30
-#CC_CUDA10x += -gencode arch=compute_75,code=sm_75
-#
-#CC_CUDA110 := -gencode arch=compute_75,code=sm_75
-#CC_CUDA110 += -gencode arch=compute_80,code=sm_80
-#
-#CC_CUDA11x := -gencode arch=compute_75,code=sm_75
-#CC_CUDA11x += -gencode arch=compute_80,code=sm_80
-#CC_CUDA11x += -gencode arch=compute_86,code=sm_86
-
-COMPUTE_CAPABILITY := -gencode arch=compute_70,code=sm_70 # Volta
-
-
-all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR)
+COMPUTE_CAPABILITY := -gencode arch=compute_35,code=sm_35 # Kepler 
+COMPUTE_CAPABILITY += -gencode arch=compute_37,code=sm_37 # Kepler 
+COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
+COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
+COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
+COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
+COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
+COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta 
+COMPUTE_CAPABILITY += -gencode arch=compute_72,code=sm_72 # Volta 
+
+# CUDA 9.2 supports CC 3.0, but CUDA >= 11.0 does not
+CC_CUDA92 := -gencode arch=compute_30,code=sm_30
+
+# Later versions of CUDA support the new architectures
+CC_CUDA10x := -gencode arch=compute_30,code=sm_30
+CC_CUDA10x += -gencode arch=compute_75,code=sm_75
+
+CC_CUDA110 := -gencode arch=compute_75,code=sm_75
+CC_CUDA110 += -gencode arch=compute_80,code=sm_80
+
+CC_CUDA11x := -gencode arch=compute_75,code=sm_75
+CC_CUDA11x += -gencode arch=compute_80,code=sm_80
+CC_CUDA11x += -gencode arch=compute_86,code=sm_86
+
+all: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
-cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR)
+cuda92: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA92) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
-cuda10x: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR)
+cuda10x: $(ROOT_DIR)/dependencies/cub $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
-cuda110: $(BUILD_DIR)
+cuda110: $(BUILD_DIR) env
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
 	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
-cuda11x: $(BUILD_DIR)
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA10x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
+cuda11x: $(BUILD_DIR) env
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
+	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o 
 	$(GPP) -std=c++11 -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes.so $(LIB)
 
+env:
+	@echo "ENVIRONMENT"
+	@echo "============================"
+	@echo "NVCC path: $(NVCC)"
+	@echo "GPP path: $(GPP)"
+	@echo "CUDA_HOME: $(CUDA_HOME)"
+	@echo "CONDA_PREFIX: $(CONDA_PREFIX)"
+	@echo "PATH: $(PATH)"
+	@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
+	@echo "============================"
+
 $(BUILD_DIR):
 	mkdir -p cuda_build
 	mkdir -p dependencies

diff --git a/README.md b/README.md
@@ -84,6 +84,10 @@ For upcoming features and changes and full history see [Patch Notes](CHANGELOG.m
 
 1. RuntimeError: CUDA error: no kernel image is available for execution on the device. [Solution](errors_and_solutions.md#No-kernel-image-available)
 
+## Compile from source
+
+To compile from source, please follow the [compile_from_source.md](compile_from_source.md) instructions.
+
 ## License
 
 The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: Pytorch is licensed under the BSD license.

diff --git a/compile_from_source.md b/compile_from_source.md
@@ -0,0 +1,20 @@
+# Compiling from source
+
+Basic steps.
+1. `make cudaXXX` where `cudaXXX` is among `cuda92, cuda10x, cuda110, cuda11x`
+2. `CUDA_VERSION=XXX python setup.py install`
+
+To run these steps you will need to have the nvcc compiler installed that comes with a CUDA installation. If you use anaconda (recommended) then you can figure out which version of CUDA you are using with PyTorch via the command `conda list | grep cudatoolkit`. Then you can install the nvcc compiler by downloading and installing the same CUDA version from the [CUDA toolkit archive](https://developer.nvidia.com/cuda-toolkit-archive). 
+
+For your convenience, there is an install script in the root directory that installs CUDA 11.1 locally and configures it automatically. After installing, you should add the `bin` sub-directory to the `$PATH` variable to make the compiler visible to your system. To do this, you can add it to your `.bashrc` by executing these commands:
+```bash
+echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64/" >> ~/.bashrc
+echo "export PATH=$PATH:/usr/local/cuda/bin/" >> ~/.bashrc
+source ~/.bashrc
+```
+
+By default, the Makefile will look at your `CUDA_HOME` environment variable to find your CUDA version for compiling the library. If this variable is not set, it is inferred from the path of your `nvcc` compiler.
+
+Either `nvcc` needs to be in your `PATH` or the `CUDA_HOME` variable needs to be set to the CUDA directory root (e.g. `/usr/local/cuda`) in order for compilation to succeed.
+
+If you have problems compiling the library with these instructions from source, please open an issue.
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
@@ -715,7 +715,7 @@ __global__ void kOptimizer32bit2State(T* g, T* p,
           switch(OPTIMIZER)
           {
               case ADAM: 
-									if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+									if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 									{
 										s1_vals[j] = s1_vals[j]*beta1 + ((1.0f -beta1)*((float)g_vals[j]));
 										s2_vals[j] = s2_vals[j]*beta2 + ((1.0f -beta2)*(((float)g_vals[j])*((float)g_vals[j])));
@@ -868,7 +868,7 @@ __global__ void kOptimizer32bit1State(T *g, T *p,
       # pragma unroll 4
       for(unsigned int j = 0; j < NUM_PER_THREAD; j++)
       {
-					if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+					if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 					{
 						switch(OPTIMIZER)
 						{
@@ -1475,7 +1475,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
         {
             g_val = float(g_vals[j]);
             g_val *= gnorm_scale;
-						if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+						if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 						{
 							s1_vals[j] = smem_quantiles1[lane_id][c1s[j]]*absmax1[i/BLOCK_SIZE];
 							s1_vals[j] = (s1_vals[j]*beta1) + (((1.0f-beta1)*g_val));
@@ -1518,7 +1518,7 @@ kOptimizerStatic8bit2StateBlockwise(T* p, T* __restrict__ const g, unsigned char
         # pragma unroll N_PER_TH
         for(unsigned int j = 0; j < N_PER_TH; j++)
         {
-						if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+						if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 						{
 							g_vals[j] = (T)(((float)g_vals[j]) + ((step_size*(__fdividef(s1_vals[j],(sqrtf(s2_vals[j])+(correction2*eps)))))));
 							if(weight_decay > 0.0f)
@@ -1635,7 +1635,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char
         {
             g_val = float(g_vals[j]);
             g_val *= gnorm_scale;
-						if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+						if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 						{
 							if(weight_decay > 0.0f)
 								g_val += ((float)p_vals[j])*weight_decay;
@@ -1677,7 +1677,7 @@ kOptimizerStatic8bit1StateBlockwise(T* p, T* __restrict__ const g, unsigned char
         # pragma unroll N_PER_TH
         for(unsigned int j = 0; j < N_PER_TH; j++)
 				{
-						if(!skip_zeros || (skip_zeros && g_vals[j] != (T)0.0))
+						if(!skip_zeros || (skip_zeros && ((float)g_vals[j] != 0.0f)))
 						{
 							switch(OPTIMIZER)
 							{

diff --git a/errors_and_solutions.md b/errors_and_solutions.md
@@ -3,3 +3,6 @@
 This problem arises when the CUDA version loaded by bitsandbytes is not supported by your GPU, or if your PyTorch CUDA version mismatches. To solve this problem you need to debug ``$LD_LIBRARY_PATH``, ``$CUDA_HOME``, ``$PATH``. You can print these via ``echo $PATH``. You should look for multiple paths to different CUDA versions. This can include versions in your anaconda path, for example ``$HOME/anaconda3/lib``. You can check those versions via ``ls -l $HOME/anaconda3/lib/*cuda*`` or equivalent paths. Look at the CUDA versions of files in these paths. Do they match the version reported by ``nvidia-smi``?
 
 If you are feeling lucky, you can also try to compile the library from source. This can still be problematic if your PATH variables have multiple CUDA versions. As such, it is recommended to figure out path conflicts before you proceed with compilation.
+
+
+__If you encounter any other error not listed here, please create an issue. This will help resolve your problem and will help out others in the future.__
diff --git a/install_cuda.sh b/install_cuda.sh
@@ -0,0 +1,5 @@
+wget https://developer.download.nvidia.com/compute/cuda/11.1.1/local_installers/cuda_11.1.1_455.32.00_linux.run
+bash cuda_11.1.1_455.32.00_linux.run --no-drm --no-man-page --override --installpath=~/local --librarypath=~/local/lib --toolkitpath=~/local/cuda-11.1/ --toolkit --silent
+echo "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:~/local/cuda-11.1/lib64/" >> ~/.bashrc
+echo "export PATH=$PATH:~/local/cuda-11.1/bin/" >> ~/.bashrc
+source ~/.bashrc
Original file line number	Diff line number	Diff line change
Expand Up		@@ -3,3 +3,6 @@
		This problem arises when the CUDA version loaded by bitsandbytes is not supported by your GPU, or if your PyTorch CUDA version mismatches. To solve this problem you need to debug ``$LD_LIBRARY_PATH``, ``$CUDA_HOME``, ``$PATH``. You can print these via ``echo $PATH``. You should look for multiple paths to different CUDA versions. This can include versions in your anaconda path, for example ``$HOME/anaconda3/lib``. You can check those versions via ``ls -l $HOME/anaconda3/lib/*cuda*`` or equivalent paths. Look at the CUDA versions of files in these paths. Do they match the version reported by ``nvidia-smi``?

		If you are feeling lucky, you can also try to compile the library from source. This can still be problematic if your PATH variables have multiple CUDA versions. As such, it is recommended to figure out path conflicts before you proceed with compilation.


		__If you encounter any other error not listed here, please create an issue. This will help resolve your problem and will help out others in the future.__