
Vulkan k-quant mmq and ggml-backend offload functionality #6155

Merged: 12 commits, Mar 29, 2024
9 changes: 0 additions & 9 deletions README.md
@@ -632,15 +632,6 @@ Building the program with BLAS support may lead to some performance improvements

 - #### Vulkan
 
-> [!WARNING]
->
-> Vulkan support has been broken in https://github.com/ggerganov/llama.cpp/pull/6122
-> due to relying on `GGML_OP_GET_ROWS` which is not yet properly supported by the Vulkan backend,
-> but should be fixed relatively soon (possibly in https://github.com/ggerganov/llama.cpp/pull/6155
-> (ref: https://github.com/ggerganov/llama.cpp/pull/6122#issuecomment-2015327635)).
->
-> Meanwhile, if you want to use the Vulkan backend, you should use the commit right before the breaking change, https://github.com/ggerganov/llama.cpp/commit/55c1b2a3bbd470e9e2a3a0618b92cf64a885f806
-
 **With docker**:
 
 You don't need to install Vulkan SDK. It will be installed inside the container.
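The docker instructions themselves are cut off in this view. For orientation, a minimal sketch of the usual flow; the `.devops/main-vulkan.Dockerfile` path, image tag, and render-node name are assumptions for illustration, not taken from this diff:

```sh
# Build the Vulkan-enabled image (Dockerfile path is an assumption)
docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .

# Run it with the host GPU passed through; the exact /dev/dri node
# (renderD128 here) varies per machine
docker run -it --rm \
    -v "$(pwd)/models:/models" \
    --device /dev/dri/renderD128:/dev/dri/renderD128 \
    llama-cpp-vulkan -m /models/your-model.gguf -p "Hello" -n 64
```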
52,138 changes: 37,199 additions & 14,939 deletions ggml-vulkan-shaders.hpp

Large diffs are not rendered by default.

635 changes: 328 additions & 307 deletions ggml-vulkan.cpp

Large diffs are not rendered by default.

11 changes: 0 additions & 11 deletions ggml-vulkan.h
@@ -11,17 +11,6 @@ extern "C" {
 #define GGML_VK_MAX_DEVICES 16
 
 GGML_API void ggml_vk_instance_init(void);
-GGML_API void ggml_vk_init_cpu_assist(void);
-
-GGML_API void ggml_vk_preallocate_buffers_graph_cpu_assist(struct ggml_tensor * node);
-GGML_API void ggml_vk_preallocate_buffers_cpu_assist(void);
-GGML_API void ggml_vk_build_graph_cpu_assist(struct ggml_tensor * node, bool last_node);
-GGML_API bool ggml_vk_compute_forward_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-void ggml_vk_check_results_1_cpu_assist(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-#endif
-GGML_API void ggml_vk_graph_cleanup_cpu_assist(void);
-GGML_API void ggml_vk_free_cpu_assist(void);
 
 // backend API
 GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
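With the cpu-assist entry points removed, everything the header still exports goes through the ggml-backend API. A minimal sketch of bringing the Vulkan backend up through that API; the device index 0 is an arbitrary choice, and error handling is reduced to a single check:

```c
#include <stdio.h>

#include "ggml-backend.h"
#include "ggml-vulkan.h"

int main(void) {
    // Initialize Vulkan device 0 as a ggml backend.
    ggml_backend_t backend = ggml_backend_vk_init(0);
    if (backend == NULL) {
        fprintf(stderr, "failed to initialize the Vulkan backend\n");
        return 1;
    }
    printf("backend: %s\n", ggml_backend_name(backend));
    ggml_backend_free(backend);
    return 0;
}
```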
35 changes: 0 additions & 35 deletions ggml.c
@@ -278,8 +278,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #include <Accelerate/Accelerate.h>
 #if defined(GGML_USE_CLBLAST) // allow usage of CLBlast alongside Accelerate functions
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 #elif defined(GGML_USE_OPENBLAS)
 #if defined(GGML_BLAS_USE_MKL)
@@ -289,8 +287,6 @@ inline static void * ggml_calloc(size_t num, size_t size) {
 #endif
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
-#elif defined(GGML_USE_VULKAN)
-#include "ggml-vulkan.h"
 #endif
 
 // floating point type used to accumulate sums
@@ -2717,8 +2713,6 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {

 #if defined(GGML_USE_CLBLAST)
         ggml_cl_init();
-#elif defined(GGML_USE_VULKAN)
-        ggml_vk_init_cpu_assist();
 #endif
 
         ggml_setup_op_has_task_pass();
@@ -16128,20 +16122,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
         return;
     }
 
-#if defined(GGML_USE_VULKAN)
-    const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor);
-#ifdef GGML_VULKAN_CHECK_RESULTS
-    if (skip_cpu) {
-        ggml_vk_check_results_1_cpu_assist(params, tensor);
-    }
-#endif
-    if (skip_cpu) {
-        return;
-    }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU);
-#endif // GGML_USE_VULKAN
-
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -18617,17 +18597,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_preallocate_buffers_graph_cpu_assist(cgraph->nodes[i]);
-    }
-    ggml_vk_preallocate_buffers_cpu_assist();
-
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_vk_build_graph_cpu_assist(cgraph->nodes[i], i == cgraph->n_nodes - 1);
-    }
-#endif
-
     const int n_threads = cplan->n_threads;
 
     struct ggml_compute_state_shared state_shared = {
@@ -18684,10 +18653,6 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan) {
         }
     }
 
-#ifdef GGML_USE_VULKAN
-    ggml_vk_graph_cleanup_cpu_assist();
-#endif
-
     // performance stats (graph)
     {
         int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
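With these graph-level hooks gone, Vulkan offload no longer runs through ggml_graph_compute at all; a graph is executed on the backend via the ggml-backend API instead. A minimal sketch of that path, assuming the post-PR API surface; the tensor shapes, device index, and use of `ggml_backend_alloc_ctx_tensors` are illustrative, not taken from this PR:

```c
// Sketch: running a graph on the Vulkan backend through ggml-backend,
// which replaces the removed ggml_vk_*_cpu_assist hooks above.
#include <stdio.h>

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-vulkan.h"

int main(void) {
    // no_alloc: tensor data lives in a backend buffer, not in the context.
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8 + ggml_graph_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * graph = ggml_new_graph(ctx);
    ggml_build_forward_expand(graph, c);

    ggml_backend_t backend = ggml_backend_vk_init(0); // Vulkan device 0
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    const float a_data[4] = {1, 2, 3, 4};
    const float b_data[4] = {10, 20, 30, 40};
    ggml_backend_tensor_set(a, a_data, 0, sizeof(a_data));
    ggml_backend_tensor_set(b, b_data, 0, sizeof(b_data));

    // The backend builds and submits its own command buffers; ggml.c no
    // longer makes any per-op or per-graph Vulkan calls.
    ggml_backend_graph_compute(backend, graph);

    float out[4];
    ggml_backend_tensor_get(c, out, 0, sizeof(out));
    printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}
```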