diff --git a/CMakeLists.txt b/CMakeLists.txt
index 782a893e4..06de0d58b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,6 +28,7 @@ option(SD_CUDA "sd: cuda backend" OFF)
 option(SD_HIPBLAS "sd: rocm backend" OFF)
 option(SD_METAL "sd: metal backend" OFF)
 option(SD_VULKAN "sd: vulkan backend" OFF)
+option(SD_OPENCL "sd: opencl backend" OFF)
 option(SD_SYCL "sd: sycl backend" OFF)
 option(SD_MUSA "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
@@ -52,6 +53,12 @@ if (SD_VULKAN)
     add_definitions(-DSD_USE_VULKAN)
 endif ()
 
+if (SD_OPENCL)
+    message("-- Use OpenCL as backend stable-diffusion")
+    set(GGML_OPENCL ON)
+    add_definitions(-DSD_USE_OPENCL)
+endif ()
+
 if (SD_HIPBLAS)
     message("-- Use HIPBLAS as backend stable-diffusion")
     set(GGML_HIP ON)
diff --git a/README.md b/README.md
index 553fb7f8f..1833e281d 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ Inference of Stable Diffusion and Flux in pure C/C++
 - Accelerated memory-efficient CPU inference
     - Only requires ~2.3GB when using txt2img with fp16 precision to generate a 512x512 image, enabling Flash Attention just requires ~1.8GB.
 - AVX, AVX2 and AVX512 support for x86 architectures
-- Full CUDA, Metal, Vulkan and SYCL backend for GPU acceleration.
+- Full CUDA, Metal, Vulkan, OpenCL and SYCL backends for GPU acceleration.
 - Can load ckpt, safetensors and diffusers models/checkpoints. Standalone VAEs models
     - No need to convert to `.ggml` or `.gguf` anymore!
 - Flash Attention for memory usage optimization
@@ -159,6 +159,73 @@ cmake .. -DSD_VULKAN=ON
 cmake --build . --config Release
 ```
 
+##### Using OpenCL (for Adreno GPU)
+
+Currently, the OpenCL backend supports only Adreno GPUs and is primarily optimized for the Q4_0 type.
+
+To build for Windows on ARM, please refer to [Windows 11 Arm64](https://github.com/ggml-org/llama.cpp/blob/master/docs/backend/OPENCL.md#windows-11-arm64).
+
+Building for Android:
+
+Android NDK:
+  Download and install the Android NDK from the [official Android developer site](https://developer.android.com/ndk/downloads).
+
+Set up the OpenCL dependencies for the NDK:
+
+You need to provide OpenCL headers and the ICD loader library to your NDK sysroot.
+
+* OpenCL Headers:
+  ```bash
+  # In a temporary working directory
+  git clone https://github.com/KhronosGroup/OpenCL-Headers
+  cd OpenCL-Headers
+  # Replace <YOUR_NDK_PATH> with your actual NDK installation path
+  # e.g., cp -r CL /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+  sudo cp -r CL <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include
+  cd ..
+  ```
+
+* OpenCL ICD Loader:
+  ```bash
+  # In the same temporary working directory
+  git clone https://github.com/KhronosGroup/OpenCL-ICD-Loader
+  cd OpenCL-ICD-Loader
+  mkdir build_ndk && cd build_ndk
+
+  # Replace <YOUR_NDK_PATH> in CMAKE_TOOLCHAIN_FILE and OPENCL_ICD_LOADER_HEADERS_DIR below
+  cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+    -DOPENCL_ICD_LOADER_HEADERS_DIR=<YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/include \
+    -DANDROID_ABI=arm64-v8a \
+    -DANDROID_PLATFORM=24 \
+    -DANDROID_STL=c++_shared
+
+  ninja
+  # Replace <YOUR_NDK_PATH> with your actual NDK installation path
+  # e.g., cp libOpenCL.so /path/to/android-ndk-r26c/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+  sudo cp libOpenCL.so <YOUR_NDK_PATH>/toolchains/llvm/prebuilt/linux-x86_64/sysroot/usr/lib/aarch64-linux-android
+  cd ../..
+  ```
+
+Build `stable-diffusion.cpp` for Android with OpenCL:
+
+```bash
+mkdir build-android && cd build-android
+
+# Replace <YOUR_NDK_PATH> with your actual NDK installation path
+# e.g., -DCMAKE_TOOLCHAIN_FILE=/path/to/android-ndk-r26c/build/cmake/android.toolchain.cmake
+cmake .. -G Ninja \
+  -DCMAKE_TOOLCHAIN_FILE=<YOUR_NDK_PATH>/build/cmake/android.toolchain.cmake \
+  -DANDROID_ABI=arm64-v8a \
+  -DANDROID_PLATFORM=android-28 \
+  -DGGML_OPENMP=OFF \
+  -DSD_OPENCL=ON
+
+ninja
+```
+*(Note: Don't forget to prepend `LD_LIBRARY_PATH=/vendor/lib64` to your command line when running the binary on the device.)*
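+
+For example, the resulting binary can be pushed to the device and run over `adb` (the build output location, model file name and prompt below are only placeholders):
+
+```bash
+adb push build-android/bin/sd /data/local/tmp/
+adb push sd_model_q4_0.gguf /data/local/tmp/
+adb shell 'cd /data/local/tmp && LD_LIBRARY_PATH=/vendor/lib64 ./sd -m sd_model_q4_0.gguf -p "a lovely cat"'
+```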
+
 
 ##### Using SYCL
 
 Using SYCL makes the computation run on the Intel GPU. Please make sure you have installed the related driver and [Intel® oneAPI Base toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html) before start. More details and steps can refer to [llama.cpp SYCL backend](https://github.com/ggerganov/llama.cpp/blob/master/docs/backend/SYCL.md#linux).
diff --git a/common.hpp b/common.hpp
index 337b4a0c4..b20c60ff1 100644
--- a/common.hpp
+++ b/common.hpp
@@ -56,7 +56,7 @@ class UpSampleBlock : public GGMLBlock {
         // x: [N, channels, h, w]
         auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
 
-        x = ggml_upscale(ctx, x, 2);  // [N, channels, h*2, w*2]
+        x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
         x = conv->forward(ctx, x);  // [N, out_channels, h*2, w*2]
         return x;
     }
diff --git a/esrgan.hpp b/esrgan.hpp
index 989d15fee..5cbb4ad8f 100644
--- a/esrgan.hpp
+++ b/esrgan.hpp
@@ -130,8 +130,8 @@ class RRDBNet : public GGMLBlock {
         body_feat = conv_body->forward(ctx, body_feat);
         feat      = ggml_add(ctx, feat, body_feat);
         // upsample
-        feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
-        feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
+        feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+        feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
         auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
         return out;
     }
diff --git a/ggml b/ggml
index ff9052988..0e07f5c15 160000
--- a/ggml
+++ b/ggml
@@ -1 +1 @@
-Subproject commit ff9052988b76e137bcf92bb335733933ca196ac0
+Subproject commit 0e07f5c1552fada22de01c8a92c5aa56b4765f4e
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index c5913be4d..f18e67142 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -39,6 +39,10 @@
 #include "ggml-vulkan.h"
 #endif
 
+#ifdef SD_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
 #ifdef SD_USE_SYCL
 #include "ggml-sycl.h"
 #endif
@@ -113,7 +117,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b) {
                                      a->ne[0] * b->ne[0],
                                      a->ne[1] * b->ne[1],
                                      a->ne[2] * b->ne[2],
-                                     a->ne[3] * b->ne[3]),
+                                     a->ne[3] * b->ne[3],
+                                     GGML_SCALE_MODE_NEAREST),
                     b);
 }
 
@@ -1064,12 +1069,13 @@ struct GGMLRunner {
     struct ggml_context* params_ctx     = NULL;
     ggml_backend_buffer_t params_buffer = NULL;
 
-    struct ggml_context* compute_ctx    = NULL;
-    struct ggml_gallocr* compute_allocr = NULL;
+    struct ggml_context* compute_ctx   = NULL;
+    ggml_backend_sched_t compute_sched = NULL;
 
     std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
 
-    ggml_backend_t backend = NULL;
+    ggml_backend_t backend     = NULL;
+    ggml_backend_t cpu_backend = NULL;
 
     void alloc_params_ctx() {
         struct ggml_init_params params;
@@ -1090,7 +1096,7 @@ struct GGMLRunner {
 
     void alloc_compute_ctx() {
         struct ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE + ggml_graph_overhead());
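+        // Sizing note: the tensor overhead is doubled to leave room for input tensors that
+        // to_backend() duplicates into this context, and the graph overhead is sized
+        // explicitly for MAX_GRAPH_SIZE nodes (without gradients).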
+        params.mem_size   = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE * 2 + ggml_graph_overhead_custom(MAX_GRAPH_SIZE, false));
         params.mem_buffer = NULL;
         params.no_alloc   = true;
 
@@ -1106,47 +1112,72 @@ struct GGMLRunner {
     }
 
     bool alloc_compute_buffer(get_graph_cb_t get_graph) {
-        if (compute_allocr != NULL) {
+        if (compute_sched != NULL) {
             return true;
         }
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         backend_tensor_data_map.clear();
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-        if (!ggml_gallocr_reserve(compute_allocr, gf)) {
-            // failed to allocate the compute buffer
-            LOG_ERROR("%s: failed to allocate the compute buffer\n", get_desc().c_str());
-            free_compute_buffer();
+        ggml_backend_t backends_list[2];
+        int n_backends_for_sched = 0;
+
+        backends_list[n_backends_for_sched++] = this->backend;
+        if (this->cpu_backend) {
+            backends_list[n_backends_for_sched++] = this->cpu_backend;
+        }
+
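+        // The scheduler splits the graph across the listed backends; ops that the primary
+        // backend cannot run are assigned to the CPU backend (when present) as a fallback.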
+        compute_sched = ggml_backend_sched_new(backends_list, NULL, n_backends_for_sched, MAX_GRAPH_SIZE, false, false);
+        if (!compute_sched) {
+            LOG_ERROR("%s: failed to create backend scheduler\n", get_desc().c_str());
             return false;
         }
 
-        // compute the required memory
-        size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
-        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
-                  get_desc().c_str(),
-                  compute_buffer_size / 1024.0 / 1024.0,
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
+        if (!ggml_backend_sched_reserve(compute_sched, gf)) {
+            LOG_ERROR("%s: failed to reserve memory with backend scheduler for graph\n", get_desc().c_str());
+            ggml_backend_sched_free(compute_sched);
+            compute_sched = NULL;
+            return false;
+        }
+
+        for (int i = 0; i < n_backends_for_sched; ++i) {
+            size_t buffer_size = ggml_backend_sched_get_buffer_size(compute_sched, backends_list[i]);
+            LOG_DEBUG("%s compute buffer size for %s: %.2f MB",
+                      get_desc().c_str(),
+                      ggml_backend_name(backends_list[i]),
+                      buffer_size / 1024.0 / 1024.0);
+        }
 
         return true;
     }
 
     void cpy_data_to_backend_tensor() {
         for (auto& kv : backend_tensor_data_map) {
             auto tensor = kv.first;
-            auto data   = kv.second;
+            auto data_src = kv.second;
 
-            ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
+            if (tensor->data == NULL && tensor->buffer == NULL) {
+                continue;
+            }
+            ggml_backend_tensor_set(tensor, data_src, 0, ggml_nbytes(tensor));
         }
-        backend_tensor_data_map.clear();
     }
 
 public:
     virtual std::string get_desc() = 0;
 
-    GGMLRunner(ggml_backend_t backend)
-        : backend(backend) {
+    GGMLRunner(ggml_backend_t backend_in)
+        : backend(backend_in) {
         alloc_params_ctx();
+        if (!ggml_backend_is_cpu(this->backend)) {
+            this->cpu_backend = ggml_backend_cpu_init();
+            if (!this->cpu_backend) {
+                // Avoid calling pure virtual get_desc() here.
+                LOG_ERROR("FATAL: Failed to initialize CPU backend for fallback.");
+            }
+        } else {
+            this->cpu_backend = NULL;
+        }
     }
 
     virtual ~GGMLRunner() {
@@ -1154,6 +1185,10 @@ struct GGMLRunner {
         free_compute_buffer();
         free_params_ctx();
         free_compute_ctx();
+        if (cpu_backend) {
+            ggml_backend_free(cpu_backend);
+            cpu_backend = NULL;
+        }
     }
 
     void reset_compute_ctx() {
@@ -1165,22 +1200,17 @@ struct GGMLRunner {
         size_t num_tensors = ggml_tensor_num(params_ctx);
         params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
         if (params_buffer == NULL) {
-            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
+            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %zu",
                       get_desc().c_str(),
                       num_tensors);
             return false;
         }
         size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
-        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
+        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%zu tensors)",
                   get_desc().c_str(),
                   params_buffer_size / (1024.0 * 1024.0),
                   ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
                   num_tensors);
-        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
-        //        get_desc().c_str(),
-        //        params_buffer_size / (1024.0 * 1024.0),
-        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        //        num_tensors);
         return true;
     }
 
@@ -1199,13 +1229,12 @@ struct GGMLRunner {
     }
 
     void free_compute_buffer() {
-        if (compute_allocr != NULL) {
-            ggml_gallocr_free(compute_allocr);
-            compute_allocr = NULL;
+        if (compute_sched != NULL) {
+            ggml_backend_sched_free(compute_sched);
+            compute_sched = NULL;
         }
     }
 
-    // do copy after alloc graph
     void set_backend_tensor_data(struct ggml_tensor* tensor, const void* data) {
         backend_tensor_data_map[tensor] = data;
     }
@@ -1215,11 +1244,12 @@ struct GGMLRunner {
         if (tensor == NULL) {
             return NULL;
         }
-        // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
-            // pass input tensors to gpu memory
-            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
+        bool tensor_on_host_or_unmanaged = tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer);
+        bool is_param_tensor             = false;
+        if (tensor_on_host_or_unmanaged && !is_param_tensor) {
+            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
+            ggml_set_name(backend_tensor, tensor->name);
             set_backend_tensor_data(backend_tensor, tensor->data);
             return backend_tensor;
         } else {
@@ -1232,26 +1262,56 @@ struct GGMLRunner {
                  bool free_compute_buffer_immediately = true,
                  struct ggml_tensor** output          = NULL,
                  struct ggml_context* output_ctx      = NULL) {
-        alloc_compute_buffer(get_graph);
-        reset_compute_ctx();
-        struct ggml_cgraph* gf = get_graph();
-        GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
+
+        if (!alloc_compute_buffer(get_graph)) {
+            LOG_ERROR("%s: Failed to allocate/reserve compute buffer with scheduler.", get_desc().c_str());
+            return;
+        }
+
+        reset_compute_ctx();
+        struct ggml_cgraph* gf = get_graph();
+
+        GGML_ASSERT(compute_sched != NULL);
+        ggml_backend_sched_reset(compute_sched);
+
+        if (!ggml_backend_sched_alloc_graph(compute_sched, gf)) {
+            LOG_ERROR("%s: ggml_backend_sched_alloc_graph failed\n", get_desc().c_str());
+            return;
+        }
+
         cpy_data_to_backend_tensor();
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
+
+        if (ggml_backend_is_cpu(this->backend)) {
+            ggml_backend_cpu_set_n_threads(this->backend, n_threads);
+        } else if (this->cpu_backend) {
+            ggml_backend_cpu_set_n_threads(this->cpu_backend, n_threads);
+        }
+
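+        // The scheduler runs each graph split on the backend it was assigned to,
+        // inserting the tensor copies needed to move data between backends.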
+        enum ggml_status status = ggml_backend_sched_graph_compute(compute_sched, gf);
+        if (status != GGML_STATUS_SUCCESS) {
+            LOG_ERROR("%s: ggml_backend_sched_graph_compute failed with status %d (%s)\n",
+                      get_desc().c_str(), status, ggml_status_to_string(status));
+            return;
         }
-        ggml_backend_graph_compute(backend, gf);
 #ifdef GGML_PERF
-        ggml_graph_print(gf);
+        // ggml_graph_print(gf);
 #endif
-        if (output != NULL) {
-            auto result = ggml_graph_node(gf, -1);
+        if (output != NULL && ggml_graph_n_nodes(gf) > 0) {
+            struct ggml_tensor* result_tensor_in_graph = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
+
             if (*output == NULL && output_ctx != NULL) {
-                *output = ggml_dup_tensor(output_ctx, result);
+                *output = ggml_dup_tensor(output_ctx, result_tensor_in_graph);
             }
             if (*output != NULL) {
-                ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
+                ggml_backend_t result_backend = ggml_backend_sched_get_tensor_backend(compute_sched, result_tensor_in_graph);
+                if (result_backend == NULL) {
+                    LOG_ERROR("%s: Could not determine backend for result tensor %s\n", get_desc().c_str(), result_tensor_in_graph->name);
+                } else {
+                    ggml_backend_tensor_get_and_sync(result_backend,
+                                                     result_tensor_in_graph,
+                                                     (*output)->data, 0, ggml_nbytes(*output));
+                }
             }
         }
diff --git a/model.cpp b/model.cpp
index 24da39f6d..3e0bde77b 100644
--- a/model.cpp
+++ b/model.cpp
@@ -26,6 +26,10 @@
 #include "ggml-vulkan.h"
 #endif
 
+#ifdef SD_USE_OPENCL
+#include "ggml-opencl.h"
+#endif
+
 #define ST_HEADER_SIZE_LEN 8
 
 uint64_t read_u64(uint8_t* buffer) {
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e38a6101f..cdd62f100 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -179,6 +179,14 @@ class StableDiffusionGGML {
             LOG_WARN("Failed to initialize Vulkan backend");
         }
 #endif
+#ifdef SD_USE_OPENCL
+        LOG_DEBUG("Using OpenCL backend");
+        // ggml_log_set(ggml_log_callback_default, nullptr);  // Optional ggml logs
+        backend = ggml_backend_opencl_init();
+        if (!backend) {
+            LOG_WARN("Failed to initialize OpenCL backend");
+        }
+#endif
 #ifdef SD_USE_SYCL
         LOG_DEBUG("Using SYCL backend");
         backend = ggml_backend_sycl_init(0);
diff --git a/tae.hpp b/tae.hpp
index c458b87d2..678c44c57 100644
--- a/tae.hpp
+++ b/tae.hpp
@@ -149,7 +149,7 @@ class TinyDecoder : public UnaryBlock {
             if (i == 1) {
                 h = ggml_relu_inplace(ctx, h);
             } else {
-                h = ggml_upscale(ctx, h, 2);
+                h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
             }
             continue;
         }
diff --git a/upscaler.cpp b/upscaler.cpp
index 0c11b666e..137213496 100644
--- a/upscaler.cpp
+++ b/upscaler.cpp
@@ -28,6 +28,10 @@ struct UpscalerGGML {
         LOG_DEBUG("Using Vulkan backend");
         backend = ggml_backend_vk_init(0);
 #endif
+#ifdef SD_USE_OPENCL
+        LOG_DEBUG("Using OpenCL backend");
+        backend = ggml_backend_opencl_init();
+#endif
 #ifdef SD_USE_SYCL
         LOG_DEBUG("Using SYCL backend");
         backend = ggml_backend_sycl_init(0);