Vulkan k-quant mmq and ggml-backend offload functionality (#6155)

* Fix Vulkan no kv offload incoherence
* Add k-quant mul mat mat shaders
* Rework working buffer allocation, reduces vram use noticeably
  Clean up cpu assist code, replaced with ggml-backend offload function
* Default to all dedicated GPUs
* Add fallback for integrated GPUs if no dedicated GPUs are found
* Add debug info which device is allocating memory
* Fix Intel dequant issue
  Fix validation issue
* Fix Vulkan GGML_OP_GET_ROWS implementation
* Clean up merge artifacts
* Remove Vulkan warning
ggerganov · Mar 29, 2024 · ba0c7c7 · ba0c7c7
1 parent d48ccf3
commit ba0c7c7
Show file tree

Hide file tree

Showing 7 changed files with 37,776 additions and 15,352 deletions.
diff --git a/README.md b/README.md
@@ -636,15 +636,6 @@ Building the program with BLAS support may lead to some performance improvements
 
 - #### Vulkan
 
-> [!WARNING]
->
-> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
-> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
-> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
-> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
->
-> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
-
   **With docker**:
 
   You don't need to install Vulkan SDK. It will be installed inside the container.

diff --git a/ggml-vulkan-shaders.hpp b/ggml-vulkan-shaders.hpp
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
diff --git a/ggml-vulkan.h b/ggml-vulkan.h
@@ -11,17 +11,6 @@ extern "C" {
 #define GGML_VK_MAX_DEVICES 16
 
 GGML_API void ggml_vk_instance_init(void);
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
 
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);

diff --git a/ggml.c b/ggml.c
@@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #endif
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 
 // floating point type used to accumulate sums
@@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
 
 #if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-        ggml_vk_init_cpu_assist();
 #endif
 
         ggml_setup_op_has_task_pass();
@@ -16128,20 +16122,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
         return;
     }
 
-#if defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_VULKAN
-
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -18617,17 +18597,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -18684,10 +18653,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
     // performance stats (graph)
     {
         int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;