From 1f1c53f5188c0aa8ed0c97487421d99e239ae419 Mon Sep 17 00:00:00 2001 From: Justine Tunney Date: Wed, 3 Jan 2024 08:11:29 -0800 Subject: [PATCH] Get AMD GPU support working on Windows Compilation of the ggml-cuda.cu module will happen automatically for AMD users when the $HIP_PATH environment variable is defined pointing to the HIP SDK which lets us link hipBLAS and rocBLAS. This change also lets us bundle a prebuilt DLL for Windows users that will work on stock installs however its batched performance is much slower. Linux support might work however it hasn't been tested yet. See #122 --- llama.cpp/common.cpp | 5 + llama.cpp/ggml-cuda.cu | 93 ++++++- llama.cpp/llava/llava-cli.cpp | 5 +- llama.cpp/main/main.1 | 63 +++-- llama.cpp/main/main.1.asc | 3 + llama.cpp/main/main.cpp | 6 +- llama.cpp/server/server.cpp | 4 + llamafile/copy.sh | 1 + llamafile/cuda.c | 467 ++++++++++++++++++++++++++++------ llamafile/gpu.c | 9 +- llamafile/llamafile.h | 2 + llamafile/rocm.bat | 56 ++++ llamafile/tinyblas.h | 27 +- llamafile/x.c | 42 +++ llamafile/x.h | 14 + 15 files changed, 659 insertions(+), 138 deletions(-) create mode 100644 llamafile/rocm.bat create mode 100644 llamafile/x.c create mode 100644 llamafile/x.h diff --git a/llama.cpp/common.cpp b/llama.cpp/common.cpp index fb1d923444..49b311cb5d 100644 --- a/llama.cpp/common.cpp +++ b/llama.cpp/common.cpp @@ -523,6 +523,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.unsecure = true; } else if (arg == "--nocompile") { FLAG_nocompile = true; + } else if (arg == "--recompile") { + FLAG_recompile = true; } else if (arg == "--tinyblas") { FLAG_tinyblas = true; // undocumented } else if (arg == "--gpu") { @@ -560,6 +562,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.n_gpu_layers = std::stoi(argv[i]); + if (params.n_gpu_layers == 0) { + FLAG_gpu = LLAMAFILE_GPU_DISABLED; + } } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { passed_gpu_flags = true; if (++i >= argc) { diff --git a/llama.cpp/ggml-cuda.cu b/llama.cpp/ggml-cuda.cu index 37ab1d8657..9af0e4922b 100644 --- a/llama.cpp/ggml-cuda.cu +++ b/llama.cpp/ggml-cuda.cu @@ -17,14 +17,10 @@ #error "you need to use a 64-bit compiler for llamafile" #endif -#if defined(GGML_USE_HIPBLAS) +#if defined(GGML_USE_TINYBLAS) && defined(GGML_USE_HIPBLAS) #include #include #include -#ifdef __HIP_PLATFORM_AMD__ -// for rocblas_initialize() -#include "rocblas/rocblas.h" -#endif // __HIP_PLATFORM_AMD__ #define CUBLAS_COMPUTE_16F HIPBLAS_R_16F #define CUBLAS_COMPUTE_32F HIPBLAS_R_32F #define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F @@ -33,19 +29,18 @@ #define CUBLAS_OP_N HIPBLAS_OP_N #define CUBLAS_OP_T HIPBLAS_OP_T #define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED #define CUBLAS_TF32_TENSOR_OP_MATH 0 #define CUDA_R_16F HIPBLAS_R_16F #define CUDA_R_32F HIPBLAS_R_32F #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasGemmAlgo_t hipblasGemmAlgo_t +#define cublasOperation_t hipblasOperation_t #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 #define cublasCreate hipblasCreate -#define cublasGemmEx hipblasGemmEx -#define cublasGemmBatchedEx hipblasGemmBatchedEx -#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx #define cublasHandle_t hipblasHandle_t #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS #define 
cublasSetStream hipblasSetStream -#define cublasSgemm hipblasSgemm #define cublasStatus_t hipblasStatus_t #define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 #define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer @@ -86,16 +81,92 @@ #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) #define cudaStream_t hipStream_t #define cudaSuccess hipSuccess +#include "tinyblas.cu" +#define cublasSgemm tinyblasSgemm +#define cublasGemmEx tinyblasGemmEx +#define cublasGemmBatchedEx tinyblasGemmBatchedEx +#define cublasGemmStridedBatchedEx tinyblasGemmStridedBatchedEx +#define cublasGetStatusString(x) "REDACTED!cublasGetStatusString" #elif defined(GGML_USE_TINYBLAS) - #include "tinyblas.cu" +#define cublasHandle_t cudaStream_t #define cublasSgemm tinyblasSgemm #define cublasGemmEx tinyblasGemmEx #define cublasGemmBatchedEx tinyblasGemmBatchedEx #define cublasGemmStridedBatchedEx tinyblasGemmStridedBatchedEx #define cublasGetStatusString(x) "REDACTED!cublasGetStatusString" +#elif defined(GGML_USE_HIPBLAS) +#include +#include +#include +#ifdef __HIP_PLATFORM_AMD__ +// for rocblas_initialize() +#include "rocblas/rocblas.h" +#endif // __HIP_PLATFORM_AMD__ +#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F +#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F +#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F +#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT +#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT +#define CUBLAS_OP_N HIPBLAS_OP_N +#define CUBLAS_OP_T HIPBLAS_OP_T +#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS +#define CUBLAS_TF32_TENSOR_OP_MATH 0 +#define CUDA_R_16F HIPBLAS_R_16F +#define CUDA_R_32F HIPBLAS_R_32F +#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) +#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 +#define cublasCreate hipblasCreate +#define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx +#define cublasHandle_t hipblasHandle_t +#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS +#define cublasSetStream hipblasSetStream +#define cublasSgemm hipblasSgemm +#define cublasStatus_t hipblasStatus_t +#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6 +#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer +#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess +#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaError_t hipError_t +#define cudaEventCreateWithFlags hipEventCreateWithFlags +#define cudaEventDisableTiming hipEventDisableTiming +#define cudaEventRecord hipEventRecord +#define cudaEvent_t hipEvent_t +#define cudaEventDestroy hipEventDestroy +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaMalloc hipMalloc +#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault) +#define cudaMemcpy hipMemcpy +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define 
cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync +#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize +#define cudaSetDevice hipSetDevice +#define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamFireAndForget hipStreamFireAndForget +#define cudaStreamNonBlocking hipStreamNonBlocking +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags) +#define cudaStream_t hipStream_t +#define cudaSuccess hipSuccess + #else #include #include @@ -6961,9 +7032,11 @@ void ggml_init_cublas() { if (!initialized) { #ifdef __HIP_PLATFORM_AMD__ +#ifndef GGML_USE_TINYBLAS // Workaround for a rocBLAS bug when using multiple graphics cards: // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 rocblas_initialize(); +#endif CUDA_CHECK(cudaDeviceSynchronize()); #endif diff --git a/llama.cpp/llava/llava-cli.cpp b/llama.cpp/llava/llava-cli.cpp index 9d9a117a7d..347eb22268 100644 --- a/llama.cpp/llava/llava-cli.cpp +++ b/llama.cpp/llava/llava-cli.cpp @@ -231,8 +231,9 @@ int llava_cli(int argc, char ** argv) { show_additional_info(argc, argv); return 1; } - if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - fprintf(stderr, "%s: fatal error: --image flag missing\n", argv[0]); + + if (params.mmproj.empty()) { + fprintf(stderr, "%s: fatal error: --mmproj must also be passed when an --image is specified in cli mode\n", argv[0]); return 1; } diff --git a/llama.cpp/main/main.1 b/llama.cpp/main/main.1 index 66847e106d..f0c86b91a7 100644 --- a/llama.cpp/main/main.1 +++ b/llama.cpp/main/main.1 @@ -353,15 +353,19 @@ Force system to keep model in RAM rather than swapping or compressing. Do not memory-map model (slower load but may reduce pageouts if not using mlock). .It Fl Fl numa Attempt optimizations that help on some NUMA systems if run without this previously, it is recommended to drop the system page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437. +.It Fl Fl recompile +Force GPU support to be recompiled at runtime if possible. .It Fl Fl nocompile Never compile GPU support at runtime. .Pp -If -.Pa ~/.llamafile/ggml-cuda.dll -already exists on the file system (or .so for UNIX and .dylib for -MacOS), then it'll be linked as-is without question. Otherwise, +If the appropriate DSO file already exists under +.Pa ~/.llamafile/ +then it'll be linked as-is without question. If a prebuilt DSO is +present in the PKZIP content of the executable, then it'll be extracted +and linked if possible. Otherwise, .Nm -will fall back to CPU inference. +will skip any attempt to compile GPU support and simply fall back to +using CPU inference. .It Fl Fl gpu Ar GPU Specifies which brand of GPU should be used. Valid choices are: .Pp @@ -370,25 +374,39 @@ Specifies which brand of GPU should be used. Valid choices are: .Ar AUTO : Use any GPU if possible, otherwise fall back to CPU inference (default) .It -.Ar AMD : -Use AMD GPU. The AMD ROCm SDK must be installed and the HIP_PATH -environment variable must be defined. If an AMD GPU could not be used -for any reason, then a fatal error will be raised. -.It .Ar APPLE : Use Apple Metal GPU. This is only available on MacOS ARM64. If Metal could not be used for any reason, then a fatal error will be raised. .It +.Ar AMD : +Use AMD GPUs. 
The AMD HIP ROCm SDK should be installed in which case we +assume the HIP_PATH environment variable has been defined. The set of +gfx microarchitectures needed to run on the host machine is determined +automatically based on the output of the hipInfo command. On Windows, +.Nm +release binaries are distributed with a tinyBLAS DLL so it'll work out +of the box without requiring the HIP SDK to be installed. However, +tinyBLAS is slower than rocBLAS for batch and image processing, so it's +recommended that the SDK be installed anyway. If an AMD GPU could not be +used for any reason, then a fatal error will be raised. +.It .Ar NVIDIA : -Use NVIDIA GPU. If an NVIDIA GPU could not be used for any reason, a +Use NVIDIA GPUs. If an NVIDIA GPU could not be used for any reason, a fatal error will be raised. On Windows, NVIDIA GPU support will use our -tinyBLAS library, since it works on stock Windows installs. If both MSVC -and CUDA are installed beforehand, and +tinyBLAS library, since it works on stock Windows installs. However, +tinyBLAS goes slower for batch and image processing. It's possible to +use NVIDIA's closed-source cuBLAS library instead. To do that, both MSVC +and CUDA need to be installed and the .Nm -is run for the first time on the x64 command prompt, then llamafile will -use NVIDIA's faster cuBLAS library instead. On Linux and other systems, -the CUDA SDK must always be installed, so that native GPU support can be -compiled on the fly. +command should be run once from the x64 MSVC command prompt with the +.Fl Fl recompile +flag passed. The GGML library will then be compiled and saved to +.Pa ~/.llamafile/ +so the special process only needs to happen a single time. +.It +.Ar DISABLED : +Never use GPU and instead use CPU inference. This setting is implied by +.Fl ngl Ar 0 . .El .Pp .It Fl ngl Ar N , Fl Fl n-gpu-layers Ar N @@ -588,8 +606,7 @@ llama.cpp command line interface, utilizing WizardCoder-Python-13B weights: .Bd -literal -offset indent llamafile \[rs] - -m wizardcoder-python-13b-v1.0.Q8_0.gguf \[rs] - --temp 0 -r '}\[rs]n' -r '\`\`\`\[rs]n' \[rs] + -m wizardcoder-python-13b-v1.0.Q8_0.gguf --temp 0 -r '}\[rs]n' -r '\`\`\`\[rs]n' \[rs] -e -p '\`\`\`c\[rs]nvoid *memcpy(void *dst, const void *src, size_t size) {\[rs]n' .Ed .Pp @@ -692,10 +709,12 @@ work to be a production worthy component of a public-facing service. For example, C++ exceptions caused by JSON parsing errors will make it abort and print a backtrace. .Sh PROTIP -NVIDIA users need to pass the +The .Fl ngl Ar 35 -flag to enable GPU acceleration. It's not enabled by default since it -sometimes needs to be tuned for system hardware and model architecture. +flag needs to be passed in order to use GPUs made by NVIDIA and AMD. +It's not enabled by default since it sometimes needs to be tuned based +on the system hardware and model architecture, in order to achieve +optimal performance, and avoid compromising a shared display. .Sh SEE ALSO .Xr llamafile-quantize 1 , .Xr llamafile-perplexity 1 , diff --git a/llama.cpp/main/main.1.asc b/llama.cpp/main/main.1.asc index 4910a75b96..8dffb10239 100644 --- a/llama.cpp/main/main.1.asc +++ b/llama.cpp/main/main.1.asc @@ -324,6 +324,9 @@ OOPPTTIIOONNSS page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/1437. + ----rreeccoommppiillee + Force GPU support to be recompiled at runtime if possible. + ----nnooccoommppiillee Never compile GPU support at runtime. 
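[Editor's note: taken together, the flag behavior documented in the man page hunks above amounts to the following typical invocations. This is an illustrative sketch only; the model filename is a placeholder and is not part of this change.]

    # Offload 35 layers to an AMD GPU. With the HIP SDK installed and HIP_PATH
    # set, GPU support is compiled once and the DSO is cached under ~/.llamafile/.
    llamafile -m model.Q4_K_M.gguf -ngl 35 --gpu amd

    # Force the cached GGML DSO to be rebuilt, e.g. after upgrading the driver or SDK.
    llamafile -m model.Q4_K_M.gguf -ngl 35 --gpu amd --recompile

    # Skip GPU offload entirely; per the common.cpp hunk above, -ngl 0 now implies --gpu disabled.
    llamafile -m model.Q4_K_M.gguf -ngl 0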
diff --git a/llama.cpp/main/main.cpp b/llama.cpp/main/main.cpp index 1befa6cc65..e663c543b4 100644 --- a/llama.cpp/main/main.cpp +++ b/llama.cpp/main/main.cpp @@ -132,7 +132,7 @@ int main(int argc, char ** argv) { return server_cli(argc, argv); } - if (has_argument(argc, argv, "--mmproj")) { + if (has_argument(argc, argv, "--image")) { return llava_cli(argc, argv); } @@ -142,10 +142,6 @@ int main(int argc, char ** argv) { if (!gpt_params_parse(argc, argv, params)) { return 1; } - if (!params.image.empty()) { - fprintf(stderr, "%s: fatal error: --mmproj must also be passed if --image is passed\n", argv[0]); - return 1; - } llama_sampling_params & sparams = params.sparams; #ifndef LOG_DISABLE_LOGS diff --git a/llama.cpp/server/server.cpp b/llama.cpp/server/server.cpp index 118c3be729..af438dc213 100644 --- a/llama.cpp/server/server.cpp +++ b/llama.cpp/server/server.cpp @@ -2331,6 +2331,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, { FLAG_nocompile = true; } + else if (arg == "--recompile") + { + FLAG_recompile = true; + } else if (arg == "--gpu") { if (++i >= argc) diff --git a/llamafile/copy.sh b/llamafile/copy.sh index 66b4040d90..4f0031fd5a 100755 --- a/llamafile/copy.sh +++ b/llamafile/copy.sh @@ -13,6 +13,7 @@ scp llama.cpp/ggml-cuda.cu \ llamafile/tinyblas.h \ llamafile/tinyblas.cu \ llamafile/llamafile.h \ + llamafile/rocm.bat \ llamafile/cuda.bat \ llamafile/cuda.sh \ $HOST:lfbuild/ diff --git a/llamafile/cuda.c b/llamafile/cuda.c index b741000c46..1432e30334 100644 --- a/llamafile/cuda.c +++ b/llamafile/cuda.c @@ -15,6 +15,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "x.h" #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -43,9 +45,12 @@ __static_yoink("llama.cpp/ggml-cuda.cu"); __static_yoink("llama.cpp/ggml-backend.h"); __static_yoink("llama.cpp/ggml-backend-impl.h"); -#define NVCC_LIBS (FLAG_tinyblas \ - ? "-DIGNORE" \ - : "-lcublas") +#define THESTRING(x) #x +#define STRINGIFY(x) THESTRING(x) +#define WIND_ONLY(x) (!IsWindows() ? "-DIGNORE" STRINGIFY(__COUNTER__) : x) +#define BLAS_ONLY(x) (FLAG_tinyblas ? "-DIGNORE" STRINGIFY(__COUNTER__) : x) + +#define NVCC_LIBS BLAS_ONLY("-lcublas") #define NVCC_FLAGS "--shared", \ "--forward-unknown-to-host-compiler", \ @@ -203,6 +208,137 @@ static bool Compile(const char *src, return true; } +static bool GetRocmBinPath(char path[static PATH_MAX], const char *bin) { + const char *hip_path = getenv("HIP_PATH"); + if (!hip_path) return false; + strlcpy(path, hip_path, PATH_MAX); + strlcat(path, "/bin/", PATH_MAX); + strlcat(path, bin, PATH_MAX); + if (IsWindows()) { + strlcat(path, ".exe", PATH_MAX); + } + return FileExists(path); +} + +// Returns word-encoded array of 16-bit gfxXXXX gcnArchName numbers. +static bool GetAmdOffloadArchFlag(char out[static 64]) { + + // Get hipInfo executable path. + char hip_info_path[PATH_MAX]; + if (!GetRocmBinPath(hip_info_path, "hipInfo")) { + return false; + } + + // Create pipe. + int pipefds[2]; + if (pipe2(pipefds, O_CLOEXEC)) { + perror("pipe2"); + return false; + } + + // Run HIP info program. 
+ int pid; + char *args[] = {hip_info_path, 0}; + posix_spawn_file_actions_t fa; + posix_spawn_file_actions_init(&fa); + posix_spawn_file_actions_adddup2(&fa, pipefds[1], 1); + errno_t err = posix_spawn(&pid, args[0], &fa, NULL, args, environ); + posix_spawn_file_actions_destroy(&fa); + close(pipefds[1]); + if (err) { + errno = err; + perror(args[0]); + close(pipefds[0]); + return false; + } + + // Parse program output to word-encoded array. + int rc; + int a = 0; + int t = 0; + char buf[512]; + unsigned long archs = 0; + while ((rc = read(pipefds[0], buf, sizeof(buf))) > 0) { + for (int i = 0; i < rc; ++i) { + switch (t) { + case 0: + if (buf[i] == 'g') { + t = 1; + } + break; + case 1: + if (buf[i] == 'f') { + t = 2; + } else { + t = 0; + } + break; + case 2: + if (buf[i] == 'x') { + t = 3; + a = 0; + } else { + t = 0; + } + break; + case 3: + if (isdigit(buf[i])) { + a *= 10; + a += buf[i] - '0'; + } else { + t = 0; + if ((a & 0xffff) && (a & 0xffff) == a) { + a &= 0xffff; + bool dupe = false; + for (int j = 0; j < 4; ++j) { + if (((archs >> (j * 16)) & 0xffff) == a) { + dupe = true; + } + } + if (!dupe) { + archs <<= 16; + archs |= a; + } + } + } + break; + default: + __builtin_unreachable(); + } + } + } + close(pipefds[0]); + + // Wait for program to exit. + int ws; + while (waitpid(pid, &ws, 0) == -1) { + if (errno != EINTR) { + perror(args[0]); + return false; + } + } + if (ws) { + tinyprint(2, "error: hipInfo returned non-zero exit status\n", NULL); + return false; + } + + // Serialize value for --offload-arch=LIST flag. + if (!archs) { + tinyprint(2, "warning: hipInfo output didn't list any graphics cards\n", NULL); + return false; + } + bool gotsome = false; + char *p = stpcpy(out, "--offload-arch="); + do { + if (gotsome) *p++ = ','; + p += sprintf(p, "gfx%d", archs & 0xffff); + gotsome = true; + } while ((archs >>= 16)); + + // woot + return true; +} + // finds nvidia compiler // // 1. nvcc on $PATH environ @@ -231,7 +367,7 @@ static bool GetNvccPath(char path[static PATH_MAX]) { return IsExecutable(path); } -static dontinline bool GetNvccArchFlag(char *nvcc, char flag[static 32]) { +static dontinline bool GetNvccArchFlag(const char *nvcc, char flag[static 32]) { // create path of exe char exe[PATH_MAX]; @@ -261,7 +397,7 @@ static dontinline bool GetNvccArchFlag(char *nvcc, char flag[static 32]) { // run than compiling / running this script and (2) the nvidia-smi // command isn't available on Jetson devices. tinyprint(2, "building nvidia compute capability detector...\n", NULL); - if (!Compile(src, tmp, exe, (char *[]){nvcc, "-o", tmp, src, 0})) { + if (!Compile(src, tmp, exe, (char *[]){(char *)nvcc, "-o", tmp, src, 0})) { return false; } @@ -314,56 +450,77 @@ static dontinline bool GetNvccArchFlag(char *nvcc, char flag[static 32]) { return true; } -static bool CompileNativeCuda(const char *dso) { +static bool CompileAmd(const char *clangxx, const char *dso, const char *src) { + const char *lib = IsWindows() ? 
"lib" : GetDsoExtension(); + const char *hip_path = getenv("HIP_PATH"); - // extract source code - char src[PATH_MAX]; - bool needs_rebuild = false; - for (int i = 0; i < sizeof(srcs) / sizeof(*srcs); ++i) { - llamafile_get_app_dir(src, sizeof(src)); - if (!i && mkdir(src, 0755) && errno != EEXIST) { - perror(src); - return false; - } - strlcat(src, srcs[i].name, sizeof(src)); - switch (llamafile_is_file_newer_than(srcs[i].zip, src)) { - case -1: - return false; - case false: - break; - case true: - needs_rebuild = true; - if (!llamafile_extract(srcs[i].zip, src)) { - return false; - } - break; - default: - __builtin_unreachable(); - } + // get set of microarchitectures for all installed graphics cards + char offload_arch[64]; + if (!GetAmdOffloadArchFlag(offload_arch)) { + return false; } - // check if dso is already compiled - if (!needs_rebuild) { - switch (llamafile_is_file_newer_than(src, dso)) { - case -1: - return false; - case false: - return true; - case true: - break; - default: - __builtin_unreachable(); - } + // create temporary output path for atomicity + char tmpdso[PATH_MAX]; + if (!CreateTempPath(dso, tmpdso)) { + return false; } - // find full path of nvidia compiler - char nvcc[PATH_MAX]; - if (!GetNvccPath(nvcc)) { - tinyprint(2, "warning: couldn't find nvcc (nvidia c compiler) " - "try setting $CUDA_PATH if it's installed\n", NULL); - return false; + // run the compiler to create a native build + // + // there's a higher level program called hipcc, but we can't use it, + // since it's a perl script and rocm doesn't bundle perl on windows. + // + // TODO(jart): test this on linux computer + if (Compile(src, tmpdso, dso, + (char *[]){ + (char *)clangxx, + "-fuse-ld=lld", + "-shared", + "-nostartfiles", + "-nostdlib", + "-DGGML_BUILD=1", + "-DGGML_SHARED=1", + "-Wno-ignored-attributes", + "-DGGML_CUDA_DMMV_X=32", + "-DGGML_CUDA_MMV_Y=1", + "-DGGML_USE_HIPBLAS", + (FLAG_tinyblas + ? 
"-DGGML_USE_TINYBLAS" + : "-DGGML_USE_CUBLAS"), + "-DK_QUANTS_PER_ITERATION=2", + "-D_CRT_SECURE_NO_WARNINGS", + "-D_XOPEN_SOURCE=600", + "-D__HIP_PLATFORM_AMD__=1", + "-D__HIP_PLATFORM_HCC__=1", + "-isystem", _gc(xasprintf("%s/include", hip_path)), + "-O3", + "-DNDEBUG", + "-D_DLL", + "-D_MT", + WIND_ONLY("-Xclang"), WIND_ONLY("--dependent-lib=msvcrt"), + "-std=gnu++14", + "-mllvm", "-amdgpu-early-inline-all=true", + "-mllvm", "-amdgpu-function-calls=false", + "-x", "hip", + "--hip-link", + (char *)offload_arch, + "-o", tmpdso, + (char *)src, + BLAS_ONLY("-l"), BLAS_ONLY(_gc(xasprintf("%s/lib/hipblas.%s", hip_path, lib))), + BLAS_ONLY("-l"), BLAS_ONLY(_gc(xasprintf("%s/lib/rocblas.%s", hip_path, lib))), + "-l", _gc(xasprintf("%s/lib/amdhip64.%s", hip_path, lib)), + WIND_ONLY("-lkernel32"), + 0})) { + return true; } + // oh no + return false; +} + +static bool CompileNvidia(const char *nvcc, const char *dso, const char *src) { + // create temporary output path for atomicity char tmpdso[PATH_MAX]; if (!CreateTempPath(dso, tmpdso)) { @@ -373,8 +530,8 @@ static bool CompileNativeCuda(const char *dso) { // try building dso with host nvidia microarchitecture tinyprint(2, "building ggml-cuda with nvcc -arch=native...\n", NULL); if (Compile(src, tmpdso, dso, (char *[]){ - nvcc, "-arch=native", NVCC_FLAGS, "-o", tmpdso, - src, NVCC_LIBS, NULL})) { + (char *)nvcc, "-arch=native", NVCC_FLAGS, "-o", tmpdso, + (char *)src, NVCC_LIBS, NULL})) { return true; } @@ -385,8 +542,8 @@ static bool CompileNativeCuda(const char *dso) { } tinyprint(2, "building ggml-cuda with nvcc ", archflag, "...\n", NULL); if (Compile(src, tmpdso, dso, (char *[]){ - nvcc, archflag, NVCC_FLAGS, "-o", tmpdso, - src, NVCC_LIBS, NULL})) { + (char *)nvcc, archflag, NVCC_FLAGS, "-o", tmpdso, + (char *)src, NVCC_LIBS, NULL})) { return true; } @@ -394,11 +551,13 @@ static bool CompileNativeCuda(const char *dso) { return false; } -static bool ExtractCudaDso(const char *dso) { +static bool ExtractCudaDso(const char *dso, const char *name) { // see if prebuilt dso is bundled in zip assets char zip[80]; - strlcpy(zip, "/zip/ggml-cuda.", sizeof(zip)); + strlcpy(zip, "/zip/", sizeof(zip)); + strlcat(zip, name, sizeof(zip)); + strlcat(zip, ".", sizeof(zip)); strlcat(zip, GetDsoExtension(), sizeof(zip)); if (!FileExists(zip)) { tinyprint(2, "prebuilt binary ", zip, " not found\n", NULL); @@ -409,14 +568,26 @@ static bool ExtractCudaDso(const char *dso) { return llamafile_extract(zip, dso); } -static bool LinkCudaDso(char *dso) { +static bool LinkCudaDso(const char *dso, const char *dir) { + + // Change directory so BLAS library is more likely to be linked. + char cwd[PATH_MAX]; + if (dir) { + getcwd(cwd, sizeof(cwd)); + chdir(dir); + } // runtime link dynamic shared object void *lib; + tinyprint(2, "dynamically linking ", dso, "\n", NULL); lib = cosmo_dlopen(dso, RTLD_LAZY); + if (dir) { + chdir(cwd); + } if (!lib) { + char cc[PATH_MAX]; tinyprint(2, Dlerror(), ": failed to load library\n", NULL); - if ((IsLinux() || IsBsd()) && !commandv("cc", dso, PATH_MAX)) { + if ((IsLinux() || IsBsd()) && !commandv("cc", cc, PATH_MAX)) { tinyprint(2, "you need to install cc for gpu support\n", NULL); } return false; @@ -451,12 +622,18 @@ static bool LinkCudaDso(char *dso) { return false; } - // we're good - return true; + // ask the library if actual gpu devices exist + ggml_cuda.init(); + return ggml_cuda.loaded(); } static bool ImportCudaImpl(void) { + // No dynamic linking support on OpenBSD yet. 
+ if (IsOpenbsd()) { + return false; + } + // Check if we're allowed to even try. switch (FLAG_gpu) { case LLAMAFILE_GPU_AUTO: @@ -466,29 +643,169 @@ static bool ImportCudaImpl(void) { default: return false; } + tinyprint(2, "initializing gpu module...\n", NULL); - // No dynamic linking support on OpenBSD yet. - if (IsOpenbsd()) { - return false; + // extract source code + char src[PATH_MAX]; + bool needs_rebuild = FLAG_recompile; + for (int i = 0; i < sizeof(srcs) / sizeof(*srcs); ++i) { + llamafile_get_app_dir(src, sizeof(src)); + if (!i && mkdir(src, 0755) && errno != EEXIST) { + perror(src); + return false; + } + strlcat(src, srcs[i].name, sizeof(src)); + switch (llamafile_is_file_newer_than(srcs[i].zip, src)) { + case -1: + return false; + case false: + break; + case true: + needs_rebuild = true; + if (!llamafile_extract(srcs[i].zip, src)) { + return false; + } + break; + default: + __builtin_unreachable(); + } } - // Get path of CUDA support DSO. char dso[PATH_MAX]; - llamafile_get_app_dir(dso, PATH_MAX); - strlcat(dso, "ggml-cuda.", PATH_MAX); - strlcat(dso, GetDsoExtension(), PATH_MAX); - if (FLAG_nocompile) { - return LinkCudaDso(dso); - } + char bindir[PATH_MAX]; + const char *compiler_path; + char compiler_path_buf[PATH_MAX]; + const char *library_path; + char library_path_buf[PATH_MAX]; + + // Attempt to load AMD GPU support. + // We favor the underdog on AMD + NVIDIA chimeras. + switch (FLAG_gpu) { + case LLAMAFILE_GPU_AMD: + case LLAMAFILE_GPU_AUTO: - // Try building CUDA from source with mighty cuBLAS. - if (CompileNativeCuda(dso)) { - return LinkCudaDso(dso); + // Get some essential paths. + // ROCm SDK puts BLAS DLLs in same folder as clang++ + if (GetRocmBinPath(compiler_path_buf, "clang++")) { + strcpy(library_path_buf, compiler_path_buf); + dirname(library_path_buf); + compiler_path = compiler_path_buf; + library_path = library_path_buf; + } else { + compiler_path = 0; + library_path = 0; + } + + // Get path of GGML DSO for AMD. + llamafile_get_app_dir(dso, PATH_MAX); + strlcat(dso, "ggml-rocm.", PATH_MAX); + strlcat(dso, GetDsoExtension(), PATH_MAX); + if (FLAG_nocompile) { + if (LinkCudaDso(dso, library_path)) { + return true; + } else { + goto TryNvidia; + } + } + + // Check if DSO is already compiled. + if (!needs_rebuild && !FLAG_recompile) { + switch (llamafile_is_file_newer_than(src, dso)) { + case -1: + return false; + case false: + if (LinkCudaDso(dso, library_path)) { + return true; + } else { + goto TryNvidia; + } + case true: + break; + default: + __builtin_unreachable(); + } + } + + // Try building CUDA with ROCm SDK. + if (compiler_path) { + if (CompileAmd(compiler_path, dso, src)) { + if (LinkCudaDso(dso, library_path)) { + return true; + } else { + goto TryNvidia; + } + } + } else { + tinyprint(2, "note: won't compile AMD GPU support because $HIP_PATH/bin/clang++ is missing\n", NULL); + } + + // Try extracting prebuilt tinyBLAS DSO from PKZIP. + if (ExtractCudaDso(dso, "ggml-rocm")) { + if (LinkCudaDso(dso, library_path)) { + return true; + } else { + goto TryNvidia; + } + } + + break; + default: + break; } - // Try extracting prebuilt tinyBLAS DSO from PKZIP. - if (ExtractCudaDso(dso)) { - return LinkCudaDso(dso); +TryNvidia: + // Attempt to load NVIDIA GPU support. + switch (FLAG_gpu) { + case LLAMAFILE_GPU_AUTO: + case LLAMAFILE_GPU_NVIDIA: + + // Get some essential paths. 
+ // CUDA SDK puts cuBLAS DLL in same folder as NVCC + if (GetNvccPath(compiler_path_buf)) { + strcpy(library_path_buf, compiler_path_buf); + dirname(library_path_buf); + compiler_path = compiler_path_buf; + library_path = library_path_buf; + } else { + compiler_path = 0; + library_path = 0; + } + + // Get path of GGML DSO for NVIDIA. + llamafile_get_app_dir(dso, PATH_MAX); + strlcat(dso, "ggml-cuda.", PATH_MAX); + strlcat(dso, GetDsoExtension(), PATH_MAX); + if (FLAG_nocompile) { + return LinkCudaDso(dso, library_path); + } + + // Check if DSO is already compiled. + if (!needs_rebuild && !FLAG_recompile) { + switch (llamafile_is_file_newer_than(src, dso)) { + case -1: + return false; + case false: + return LinkCudaDso(dso, library_path); + case true: + break; + default: + __builtin_unreachable(); + } + } + + // Try building CUDA from source with mighty cuBLAS. + if (compiler_path && CompileNvidia(compiler_path, dso, src)) { + return LinkCudaDso(dso, library_path); + } + + // Try extracting prebuilt tinyBLAS DSO from PKZIP. + if (ExtractCudaDso(dso, "ggml-cuda")) { + return LinkCudaDso(dso, library_path); + } + + break; + default: + break; } // too bad @@ -497,8 +814,8 @@ static bool ImportCudaImpl(void) { static void ImportCuda(void) { if (!ggml_cuda.disabled && ImportCudaImpl()) { + tinyprint(2, "GPU support successfully linked and loaded\n", NULL); ggml_cuda.supported = true; - tinyprint(2, "NVIDIA cuBLAS GPU support successfully loaded\n", NULL); } } diff --git a/llamafile/gpu.c b/llamafile/gpu.c index 17e8a6a0c9..e45562a5d3 100644 --- a/llamafile/gpu.c +++ b/llamafile/gpu.c @@ -23,8 +23,10 @@ #include "llama.cpp/ggml-metal.h" int FLAG_gpu; +bool FLAG_nogpu; bool FLAG_tinyblas; bool FLAG_nocompile; +bool FLAG_recompile; static const char *describe_required_gpu(void) { switch (FLAG_gpu) { @@ -36,6 +38,8 @@ static const char *describe_required_gpu(void) { return "apple"; case LLAMAFILE_GPU_NVIDIA: return "nvidia"; + case LLAMAFILE_GPU_DISABLED: + return "disabled"; default: __builtin_unreachable(); } @@ -52,12 +56,12 @@ int llamafile_gpu_supported(void) { } // Auto-configure AMD or NVIDIA GPU support. - if (ggml_cublas_loaded()) { + if (ggml_cuda_supported()) { return LLAMAFILE_GPU_NVIDIA; } // Abort if user wants specific GPU but it's unavailable. - if (FLAG_gpu != LLAMAFILE_GPU_AUTO || FLAG_tinyblas) { + if (FLAG_gpu > 0 || FLAG_tinyblas) { tinyprint(2, "fatal error: support for --gpu ", describe_required_gpu(), FLAG_tinyblas ? " --tinyblas" : "", " was explicitly requested, but it wasn't available\n", NULL); @@ -74,6 +78,7 @@ int llamafile_gpu_supported(void) { int llamafile_gpu_parse(const char *s) { // Parse canonical names for GPUs. 
+ if (!strcasecmp(s, "disabled")) return LLAMAFILE_GPU_DISABLED; if (!strcasecmp(s, "auto")) return LLAMAFILE_GPU_AUTO; if (!strcasecmp(s, "amd")) return LLAMAFILE_GPU_AMD; if (!strcasecmp(s, "apple")) return LLAMAFILE_GPU_APPLE; diff --git a/llamafile/llamafile.h b/llamafile/llamafile.h index 24c4f0572b..f5a8b36e40 100644 --- a/llamafile/llamafile.h +++ b/llamafile/llamafile.h @@ -27,6 +27,7 @@ void llamafile_schlep(const void *, size_t); void llamafile_get_app_dir(char *, size_t); bool llamafile_launch_browser(const char *); +#define LLAMAFILE_GPU_DISABLED -1 #define LLAMAFILE_GPU_AUTO 0 #define LLAMAFILE_GPU_AMD 1 #define LLAMAFILE_GPU_APPLE 2 @@ -34,6 +35,7 @@ bool llamafile_launch_browser(const char *); extern int FLAG_gpu; extern bool FLAG_tinyblas; extern bool FLAG_nocompile; +extern bool FLAG_recompile; int llamafile_gpu_supported(void); int llamafile_gpu_parse(const char *); diff --git a/llamafile/rocm.bat b/llamafile/rocm.bat new file mode 100644 index 0000000000..5a4b039b18 --- /dev/null +++ b/llamafile/rocm.bat @@ -0,0 +1,56 @@ +:: Compiles distributable DLL for AMD GPU support +:: +:: The following microarchitectures are supported: +:: +:: - gfx1010 c. 2019 +:: - gfx1012 c. 2019 +:: - gfx906 c. 2020 +:: - gfx1032 c. 2021 +:: - gfx1030 c. 2022 +:: - gfx1031 c. 2022 +:: - gfx1100 c. 2022 +:: - gfx1101 (unreleased) +:: - gfx1102 (unreleased) +:: - gfx1103 (unreleased) +:: +:: The ROCm SDK won't need to be installed on the user's machine. +:: There will be a dependency on AMDHIP64.DLL, but unlike hipBLAS +:: and rocBLAS, that DLL comes with the AMD graphics driver. +:: +:: TODO(jart): How do we get this to not depend on VCRUNTIME140? + +%HIP_PATH%\bin\clang++.exe ^ + -fuse-ld=lld ^ + -shared ^ + -nostartfiles ^ + -nostdlib ^ + -DGGML_BUILD=1 ^ + -DGGML_SHARED=1 ^ + -Wno-ignored-attributes ^ + -DGGML_CUDA_DMMV_X=32 ^ + -DGGML_CUDA_MMV_Y=1 ^ + -DGGML_USE_HIPBLAS ^ + -DGGML_USE_TINYBLAS ^ + -DK_QUANTS_PER_ITERATION=2 ^ + -D_CRT_SECURE_NO_WARNINGS ^ + -D_XOPEN_SOURCE=600 ^ + -D__HIP_PLATFORM_AMD__=1 ^ + -D__HIP_PLATFORM_HCC__=1 ^ + -isystem %HIP_PATH%\include ^ + -O3 ^ + -DNDEBUG ^ + -D_DLL ^ + -D_MT ^ + -Xclang --dependent-lib=msvcrt ^ + -std=gnu++14 ^ + -mllvm -amdgpu-early-inline-all=true ^ + -mllvm -amdgpu-function-calls=false ^ + -x hip ^ + --hip-link ^ + --offload-arch=gfx1010,gfx1012,gfx906,gfx1030,gfx1031,gfx1032,gfx1100,gfx1101,gfx1102,gfx1103 ^ + -o ggml-rocm.dll ^ + ggml-cuda.cu ^ + -l%HIP_PATH%\lib\hipblas.lib ^ + -l%HIP_PATH%\lib\rocblas.lib ^ + -l%HIP_PATH%\lib\amdhip64.lib ^ + -lkernel32 diff --git a/llamafile/tinyblas.h b/llamafile/tinyblas.h index 17ce6ade20..a278e35d48 100644 --- a/llamafile/tinyblas.h +++ b/llamafile/tinyblas.h @@ -1,29 +1,12 @@ #pragma once +#ifdef GGML_USE_HIPBLAS +#include +#include +#else #include #include - -enum cublasOperation_t { - CUBLAS_OP_N = 111, - CUBLAS_OP_T = 112, - CUBLAS_OP_C = 113, -}; - -enum cublasStatus_t { - CUBLAS_STATUS_SUCCESS = 0, - CUBLAS_STATUS_NOT_SUPPORTED = 7, -}; - -enum cublasComputeType_t { - CUBLAS_COMPUTE_16F = 150, - CUBLAS_COMPUTE_32F = 151, -}; - -enum cublasGemmAlgo_t { - CUBLAS_GEMM_DEFAULT_TENSOR_OP = 160, -}; - -#define cublasHandle_t cudaStream_t +#endif cublasStatus_t tinyblasSgemm(cublasHandle_t handle, cublasOperation_t transa, diff --git a/llamafile/x.c b/llamafile/x.c new file mode 100644 index 0000000000..0899532b0f --- /dev/null +++ b/llamafile/x.c @@ -0,0 +1,42 @@ +// -*- mode:c;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*- +// vi: set et ft=c ts=4 sts=4 sw=4 fenc=utf-8 :vi +// +// 
Copyright 2023 Mozilla Foundation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "x.h" +#include +#include +#include +#include + +static wontreturn void oom(void) { + tinyprint(2, program_invocation_name, ": out of memory\n", NULL); + exit(1); +} + +char *xvasprintf(const char *fmt, va_list va) { + char *buf; + if (vasprintf(&buf, fmt, va) == -1) oom(); + return buf; +} + +char *xasprintf(const char *fmt, ...) { + char *res; + va_list va; + va_start(va, fmt); + res = xvasprintf(fmt, va); + va_end(va); + return res; +} diff --git a/llamafile/x.h b/llamafile/x.h new file mode 100644 index 0000000000..443c7b803e --- /dev/null +++ b/llamafile/x.h @@ -0,0 +1,14 @@ +#ifndef LLAMAFILE_X_H_ +#define LLAMAFILE_X_H_ +#include +#ifdef __cplusplus +extern "C" { +#endif + +char *xasprintf(const char *, ...); +char *xvasprintf(const char *, va_list); + +#ifdef __cplusplus +} +#endif +#endif /* LLAMAFILE_X_H_ */
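[Editor's note: the xasprintf()/xvasprintf() helpers added above are what cuda.c uses to build compiler and linker arguments on the fly, wrapping each result in Cosmopolitan's _gc() so the buffer is freed automatically when the calling function returns. A minimal sketch of that usage follows; it assumes only the declarations above, frees explicitly instead of using _gc(), and the include path and printed flags are illustrative.]

    // Illustrative only; mirrors how CompileAmd() builds "-isystem <HIP_PATH>/include".
    #include <stdio.h>
    #include <stdlib.h>
    #include "x.h"   // assumes compilation from the llamafile/ directory, as cuda.c does

    int main(void) {
        const char *hip_path = getenv("HIP_PATH");   // set by the ROCm SDK installer
        if (!hip_path) {
            fprintf(stderr, "HIP_PATH is not set; AMD GPU support won't be compiled\n");
            return 1;
        }
        // xasprintf() never returns NULL: on out-of-memory it prints a message and
        // exits, which keeps the argv-building code in cuda.c free of error branches.
        char *isystem_dir = xasprintf("%s/include", hip_path);
        char *amdhip64 = xasprintf("%s/lib/amdhip64.lib", hip_path);  // Windows-style, as in rocm.bat
        printf("-isystem %s -l %s\n", isystem_dir, amdhip64);
        free(isystem_dir);
        free(amdhip64);
        return 0;
    }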