Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TTran - bitsandbytes 0.43 PR #16

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
293 changes: 293 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,293 @@
# This CMake config hopefully makes it easier to compile.
# Ensure the CUDA Toolkit is available on your path. Then run:
# For GCC: `cmake -B build . && cmake --build build`
# For MSVC: `cmake -B build . && cmake --build build --config Release`
# You can also use the following options and variables
# - COMPUTE_BACKEND: Set to `cpu`, `cuda`, or `mps` to select the backend
# - NO_CUBLASLT: Default OFF, will skip building/linking CUBLASLT support
# - CUDA_VERSION: The expected CUDA version, for sanity checking. The actual version
# is whatever CMake finds on your path.
# - COMPUTE_CAPABILITY: Which GPU Arch/Compute codes to provide to NVCC.
# Separate by semicolons, i.e. `-DCOMPUTE_CAPABILITY=89;90`
# Check your compute capability here: https://developer.nvidia.com/cuda-gpus
# - PTXAS_VERBOSE: Pass the `-v` option to the PTX Assembler
cmake_minimum_required(VERSION 3.22.1)

project(bitsandbytes LANGUAGES CXX)

# If run without specifying a build type, default to using the Release configuration:
# optimizing the generated binaries for performance and also adds the `-DNDEBUG` flag,
# which turns off a bunch of asserts which seem to link to new symbols in libstdc++,
# worsening our many_linux compliance..
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release)
endif()

# Define included source files
set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.c)
#set(CUDA_FILES csrc/ops.cu csrc/kernels.cu)
set(CUDA_FILES csrc/ops.hip.cpp csrc/kernels.hip.cpp)
#set(CUDA_FILES obsolete/ops.cu obsolete/kernels.cu)
set(MPS_FILES csrc/mps_ops.mm)
set(METAL_FILES csrc/mps_kernels.metal)
# C++ sources are always included
list(APPEND SRC_FILES ${CPP_FILES})

set(COMPUTE_BACKEND "hip" CACHE STRING "The compute backend to use (cpu, cuda, mps, hip)")
set_property(CACHE COMPUTE_BACKEND PROPERTY STRINGS cpu cuda mps hip)
option(PTXAS_VERBOSE "Pass through -v flag to PTX Assembler" OFF)

if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
endif()
endif()
message("HIP_PATH: " ${HIP_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED)
if (HIP_FOUND)
message(STATUS "Found HIP: " ${HIP_VERSION})
else()
message(FATAL_ERROR "Could not find HIP")
endif()
find_package(rocthrust REQUIRED)
find_package(hipblas REQUIRED)
find_package(hipsparse REQUIRED)
find_package(rocrand REQUIRED)
# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm /opt/rocm)
list(APPEND HIP_PATH /opt/rocm/llvm/bin/)
# Find HIP.
# The user may override AMDGPU_TARGETS defined in the HIP config file
# to select the AMDGPU archs to compile for.
# ex. set(AMDGPU_TARGETS "gfx803;gfx900;gfx906")
# Find OpenMP.
#find_package(OpenMP REQUIRED)
# Set compiler and linker.
if(NOT WIN32)
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXXFLAGS -D__HIP_PLATFORM_AMD__)
set(CMAKE_CFLAGS -D__HIP_PLATFORM_AMD__)
endif()
message("Current CMAKE_CXX_COMPILER (should show hipcc): " ${CMAKE_CXX_COMPILER})
message("Current CMAKE_CXX_LINKER (should show hipcc): " ${CMAKE_CXX_LINKER})

if(APPLE)
set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
endif()

set(BNB_OUTPUT_NAME "bitsandbytes")

message(STATUS "Configuring ${PROJECT_NAME} (Backend: ${COMPUTE_BACKEND})")

if(${COMPUTE_BACKEND} STREQUAL "cuda")
if(APPLE)
message(FATAL_ERROR "CUDA is not supported on macOS" )
endif()
option(NO_CUBLASLT "Disable CUBLAS" OFF)
set(BUILD_CUDA ON)
set(BUILD_MPS OFF)
message(STATUS "NO_CUBLASLT := ${NO_CUBLASLT}")
elseif(${COMPUTE_BACKEND} STREQUAL "mps")
if(NOT APPLE)
message(FATAL_ERROR "MPS is only supported on macOS" )
endif()
set(BUILD_CUDA OFF)
set(BUILD_MPS ON)
elseif(${COMPUTE_BACKEND} STREQUAL "hip")
set(BUILD_HIP on)
set(BUILD_CUDA OFF)
set(BUILD_MPS OFF)
set(NO_CUBLASLT ON)
else()
set(BUILD_CUDA OFF)
set(BUILD_MPS OFF)
endif()


if(BUILD_CUDA)
enable_language(CUDA) # This will fail if CUDA is not found
find_package(CUDAToolkit REQUIRED)

# Convert the CUDA version from X.Y.z to XY. There's probably a shorter way of doing this
string(REGEX MATCH "^[0-9]+.[0-9]+" _CUDA_VERSION_FIRST_TWO "${CMAKE_CUDA_COMPILER_VERSION}")
string(REPLACE "." "" CUDA_VERSION_SHORT "${_CUDA_VERSION_FIRST_TWO}")

# Expose a cache variable that the user can set to ensure the correct version of CUDA is found
set(CUDA_VERSION "${CUDA_VERSION_SHORT}" CACHE STRING "Expected CUDA Version Shortcode")

message(STATUS "CUDA Version: ${CUDA_VERSION_SHORT} (${CMAKE_CUDA_COMPILER_VERSION})")
message(STATUS "CUDA Compiler: ${CMAKE_CUDA_COMPILER}")

# It should match the discovered version
if(NOT CUDA_VERSION STREQUAL "${CUDA_VERSION_SHORT}")
message(FATAL_ERROR "You've specified CUDA version ${CUDA_VERSION} however the CUDA compiler found is ${CUDA_VERSION_SHORT}."
" Ensure the desired CUDA compiler is the first one available on your PATH."
)
endif()

if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS "11.0")
message(FATAL_ERROR "CUDA Version < 11 is not supported")
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
message(FATAL_ERROR "CUDA Version > 12 is not supported")
endif()

# CMake < 3.23.0 does not define CMAKE_CUDA_ARCHITECTURES_ALL.
if(CMAKE_VERSION VERSION_LESS "3.23.0")
message(STATUS "CMake < 3.23.0; determining CUDA architectures supported...")

# 11.x and 12.x both support these at a minimum.
set(CMAKE_CUDA_ARCHITECTURES_ALL 50 52 53 60 61 62 70 72 75 80)
set(CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 50 60 70 80)

# CUDA 11.1 adds Ampere support for GA102-GA107.
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.1")
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 86)
endif()

# CUDA 11.4 adds Ampere support for GA10B.
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.4")
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 87)
endif()

# CUDA 11.8 adds support for Ada and Hopper.
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "11.8")
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL 89 90)
list(APPEND CMAKE_CUDA_ARCHITECTURES_ALL_MAJOR 90)
endif()
endif()

string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math")

if(PTXAS_VERBOSE)
# Verbose? Outputs register usage information, and other things...
string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v")
endif()

foreach(capability ${CMAKE_CUDA_ARCHITECTURES_ALL})
# Most of the items here are like: `xx-real`, so we just extract the `xx` portion
string(REGEX MATCH "[0-9]+" capability_id "${capability}")
if(capability_id GREATER 0)
list(APPEND POSSIBLE_CAPABILITIES ${capability_id})
endif()
endforeach()

# This can be changed via -D argument to CMake
# By default all possible capabilities are compiled
set(COMPUTE_CAPABILITY "${POSSIBLE_CAPABILITIES}" CACHE STRING "Compute Capabilities Targeted")

message(STATUS "CUDA Capabilities Available: ${POSSIBLE_CAPABILITIES}")
message(STATUS "CUDA Capabilities Selected: ${COMPUTE_CAPABILITY}")

# Use the "real" option to build native cubin for all selections.
# Ensure we build the PTX for the latest version.
# This behavior of adding a PTX (virtual) target for the highest architecture
# is similar to how the "all" and "all-major" options would behave in CMake >= 3.23.
# TODO: Consider bumping CMake requirement and using CMAKE_CUDA_ARCHITECTURES=[all | native] by default
list(REMOVE_DUPLICATES COMPUTE_CAPABILITY)
list(SORT COMPUTE_CAPABILITY COMPARE NATURAL)
list(POP_BACK COMPUTE_CAPABILITY _LATEST_CAPABILITY)
list(TRANSFORM COMPUTE_CAPABILITY APPEND "-real" OUTPUT_VARIABLE CMAKE_CUDA_ARCHITECTURES)
list(APPEND CMAKE_CUDA_ARCHITECTURES ${_LATEST_CAPABILITY})

message(STATUS "CUDA Targets: ${CMAKE_CUDA_ARCHITECTURES}")
message(STATUS "CUDA NVCC Flags: ${CMAKE_CUDA_FLAGS}")

list(APPEND SRC_FILES ${CUDA_FILES})

string(APPEND BNB_OUTPUT_NAME "_cuda${CUDA_VERSION_SHORT}")
if(NO_CUBLASLT)
string(APPEND BNB_OUTPUT_NAME "_cuda110_nocublaslt")
endif()
add_compile_definitions(BUILD_CUDA)
elseif(BUILD_MPS)
if(NOT APPLE)
message(FATAL_ERROR "MPS is only supported on macOS" )
endif()

enable_language(OBJCXX)

list(APPEND SRC_FILES ${MPS_FILES})

string(APPEND BNB_OUTPUT_NAME "_mps")
add_compile_definitions(BUILD_MPS)
file(MAKE_DIRECTORY "build")
add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib"
COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_FILES}
COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib"
DEPENDS "${METAL_FILES}"
COMMENT "Compiling Metal kernels"
VERBATIM)
add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
elseif(BUILD_HIP)
list(APPEND SRC_FILES ${CUDA_FILES})
# real name
string(APPEND BNB_OUTPUT_NAME "_hip_nohipblaslt")
add_compile_definitions(BUILD_HIP)
else()
string(APPEND BNB_OUTPUT_NAME "_cpu")
set(GPU_SOURCES)
endif()


if(WIN32)
# Export all symbols
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
endif()

# Weird MSVC hacks
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2 /fp:fast")
endif()

if (BUILD_HIP)
set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
message("Working on: " ${CPP_FILES})
add_library(bitsandbytes SHARED ${SRC_FILES})
target_include_directories(bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/include /opt/rocm/include/hipblas /opt/rocm/include/rocblas /opt/rocm/include/hipblaslt /opt/rocm/include/hipsparse /opt/rocm/include/hipcub /opt/rocm/include/rocwmma)
target_compile_features(bitsandbytes PUBLIC cxx_std_14)
target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
target_include_directories(bitsandbytes PUBLIC csrc include)
target_link_libraries(bitsandbytes PUBLIC hip::device roc::rocthrust roc::hipblas roc::hipsparse roc::rocrand roc::rocprim)
else()
set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX)
add_library(bitsandbytes SHARED ${SRC_FILES})
target_compile_features(bitsandbytes PUBLIC cxx_std_14)
target_include_directories(bitsandbytes PUBLIC csrc include)
target_link_libraries(bitsandbytes PUBLIC hip::device)
endif()

if(BUILD_CUDA)
target_include_directories(bitsandbytes PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
target_link_libraries(bitsandbytes PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse)
if(NO_CUBLASLT)
target_compile_definitions(bitsandbytes PUBLIC NO_CUBLASLT)
else()
target_link_libraries(bitsandbytes PUBLIC CUDA::cublasLt)
endif()

set_target_properties(bitsandbytes
PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
)
endif()
if(BUILD_MPS)
add_dependencies(bitsandbytes metallib)
target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
endif()

if(WIN32)
set_target_properties(bitsandbytes PROPERTIES PREFIX "lib")
endif()
set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME ${BNB_OUTPUT_NAME})
if(MSVC)
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE "${PROJECT_SOURCE_DIR}/bitsandbytes")
set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG "${PROJECT_SOURCE_DIR}/bitsandbytes")
endif()

set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${PROJECT_SOURCE_DIR}/bitsandbytes")
70 changes: 70 additions & 0 deletions CMakeLists_cpu.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#this method follows "Consuming the HIP API in C++ code"
#https://rocm.docs.amd.com/en/latest/conceptual/cmake-packages.html#consuming-the-hip-api-in-c-code

project(bitsandbytes)

cmake_minimum_required(VERSION 3.21)

message("Is it Window?: " ${CMAKE_HOST_WIN32})
message("Is it Linux?: " ${CMAKE_HOST_UNIX})

if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
endif()
endif()
message("HIP_PATH: " ${HIP_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})

find_package(HIP REQUIRED)
if (HIP_FOUND)
message(STATUS "Found HIP: " ${HIP_VERSION})
else()
message(FATAL_ERROR "Could not find HIP")
endif()

find_package(rocThrust REQUIRED)
find_package(MIOpen REQUIRED)

# Search for rocm in common locations
list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm-6.0.2 /opt/rocm)
list(APPEND HIP_PATH /opt/rocm-6.0.2/llvm/bin/)
# Find HIP.
# The user may override AMDGPU_TARGETS defined in the HIP config file
# to select the AMDGPU archs to compile for.
# ex. set(AMDGPU_TARGETS "gfx803;gfx900;gfx906")

# Find OpenMP.
#find_package(OpenMP REQUIRED)

# Set compiler and linker.
if(NOT WIN32)
set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE})
set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE})
endif()
message("Current CMAKE_CXX_COMPILER (should show hipcc): " ${CMAKE_CXX_COMPILER})
message("Current CMAKE_CXX_LINKER (should show hipcc): " ${CMAKE_CXX_LINKER})

set(CMAKE_BUILD_TYPE Release)
set(BUILD_CUDA ON)
set(HIPCC_VERBOSE 1)

# Source files.
# all file
#set(CPP_SOURCES ${CMAKE_SOURCE_DIR}/kernels.hip.cpp ${CMAKE_SOURCE_DIR}/kernels.hip.h ${CMAKE_SOURCE_DIR}/common.hip.cpp ${CMAKE_SOURCE_DIR}/common.hip.h ${CMAKE_SOURCE_DIR}/cpu_ops.hip.cpp ${CMAKE_SOURCE_DIR}/cpu_ops.hip.h ${CMAKE_SOURCE_DIR}/ops.hip.cpp ${CMAKE_SOURCE_DIR}/ops.hip.h ${CMAKE_SOURCE_DIR}/pythonInterface.hip.cpp)
set(CPP_SOURCES ${CMAKE_SOURCE_DIR}/csrc/cpu_ops.hip.cpp ${CMAKE_SOURCE_DIR}/csrc/cpu_ops.hip.h ${CMAKE_SOURCE_DIR}/csrc/common.hip.cpp ${CMAKE_SOURCE_DIR}/csrc/common.hip.h)
message("CMAKE_SOURCE_DIR: " ${CMAKE_SOURCE_DIR})

# Preparing the executable.
#Add an executable target called "test_openmp_helloworld" to be built from the source files listed in the command invocation.
add_executable(test_bitsandbytes ${CPP_SOURCES})

# Link Libraries - HIP Device and OpenMP.
#target_compile_options(test_vector_add PRIVATE ${OpenMP_CXX_FLAGS})
target_link_libraries(test_bitsandbytes PUBLIC hip::device)
target_include_directories(test_bitsandbytes PRIVATE ${CMAKE_SOURCE_DIR} ${CMAKE_SOURCE_DIR}/csrc ${CMAKE_SOURCE_DIR}/include /opt/rocm-6.0.2/include/hipblas /opt/rocm-6.0.2/include/hipblaslt /opt/rocm-6.0.2/include/hipsparse /opt/rocm-6.0.2/include/hipcub /opt/rocm-6.0.2/include/rocwmma)
target_compile_features(test_bitsandbytes PUBLIC cxx_std_14)
#set(CMAKE_CXX_FLAGS_INIT -nostartfiles)
#set(CMAKE_SHARED_LINKER_FLAGS "-nostartfiles")
Loading