fixed bad memory access exception on iOS 17 #3527

Closed
wants to merge 9 commits
llama.cpp (52 changes: 44 additions & 8 deletions)
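Why this change is needed: `ggml_cgraph gf{};` creates the entire graph struct as a stack local. With the graph-size constants of this era of ggml, `sizeof(ggml_cgraph)` runs to several hundred kilobytes, which plausibly exceeds the 512 KB default stack that iOS gives secondary threads, so entering this code off the main thread faults with the bad-access exception in the title. The fix below moves the graph to the heap. A minimal stand-alone sketch of the failure mode and the fix; the struct and constants here are illustrative assumptions, not the real `ggml_cgraph`:

// stand-alone sketch (not llama.cpp code): a graph-sized struct on a small
// thread stack vs. on the heap; sizes are illustrative assumptions
#include <cstdio>
#include <cstdlib>
#include <cstring>

constexpr int MAX_NODES = 16384; // stand-in for GGML_MAX_NODES of this era
constexpr int HASH_SIZE = 32771; // stand-in for the visited hash table size

struct fake_cgraph {              // rough stand-in for ggml_cgraph
    int   n_nodes;
    int   n_leafs;
    void *nodes[MAX_NODES];
    void *grads[MAX_NODES];
    void *leafs[MAX_NODES];
    void *visited_hash_table[HASH_SIZE];
};

int main() {
    // roughly 640 KiB with the constants above: fine on a desktop main
    // thread, but larger than the 512 KB default stack of secondary
    // threads on iOS, where `fake_cgraph gf{};` would overflow the stack
    printf("sizeof(fake_cgraph) = %zu KiB\n", sizeof(fake_cgraph) / 1024);

    // the fix: allocate on the heap, zero-initialise, free when done
    fake_cgraph *gf = (fake_cgraph *) malloc(sizeof(fake_cgraph));
    memset(gf, 0, sizeof(*gf));
    free(gf);
    return 0;
}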
@@ -9001,7 +9001,21 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     const size_t elt_size = ggml_element_size(kv_self.k);
 
     ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
-    ggml_cgraph gf{};
+
+    // create a temporary cgraph without initialising ggml objects, code inspired from `ggml.c:ggml_new_graph`
+    struct ggml_cgraph * gf = (struct ggml_cgraph *) malloc(sizeof(ggml_cgraph));
+
+    gf->n_nodes = 0;
+    gf->n_leafs = 0;
+    gf->order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT;
+    gf->perf_runs = 0;
+    gf->perf_cycles = 0;
+    gf->perf_time_us = 0;
+
+    memset(gf->nodes, 0, sizeof(gf->nodes));
+    memset(gf->grads, 0, sizeof(gf->grads));
+    memset(gf->leafs, 0, sizeof(gf->leafs));
+    memset(gf->visited_hash_table, 0, sizeof(gf->visited_hash_table));
 
     ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
     std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
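The field-by-field setup in this hunk mirrors what `ggml.c:ggml_new_graph` does, but writes into a plain `malloc` block, presumably because the 4096-byte `no_alloc` `cpy_ctx` has no room to carry the graph object itself. As a sketch only (assuming the `ggml_cgraph` layout this PR targets, with `<cstdlib>` and `ggml.h` available as they are in llama.cpp), the same initialisation can be collapsed into a `calloc` helper:

// sketch, not part of the PR: calloc zeroes n_nodes, n_leafs, the perf
// counters, and every pointer array in one step, replacing the memsets
static struct ggml_cgraph * new_heap_cgraph(void) {
    struct ggml_cgraph * gf = (struct ggml_cgraph *) calloc(1, sizeof(*gf));
    if (gf == NULL) {
        return NULL; // unlike the PR's unchecked malloc, report failure
    }
    // set the eval order explicitly rather than relying on its enum value
    gf->order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT;
    return gf;
}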
@@ -9019,16 +9033,20 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         kv_head, n_embd, n_layer,
         elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-    ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
-    ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
-    ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+    ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
+    ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
+    ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
     ggml_free(cpy_ctx);
 
     // our data is now in the kout3d_data and vout3d_data buffers
     // write them to file
     data_ctx->write(kout3d_data.data(), kout3d_data.size());
     data_ctx->write(vout3d_data.data(), vout3d_data.size());
+
+    // free our allocated graph
+    free(gf);
+    gf = NULL;
 }
 
 for (uint32_t i = 0; i < kv_size; ++i) {
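With `gf` now a raw heap pointer, every path out of the block needs the explicit `free(gf)` added above (the `gf = NULL;` just before the scope ends is harmless but redundant). A design-note sketch, not what the PR does: tying the lifetime to the scope with `std::unique_ptr` and a `free`-calling deleter would drop the manual cleanup while keeping behaviour identical:

// sketch only: RAII variant of the allocation used in this PR
#include <cstdlib>
#include <memory>

struct free_deleter { void operator()(void * p) const { std::free(p); } };

std::unique_ptr<ggml_cgraph, free_deleter> gf(
        (ggml_cgraph *) std::calloc(1, sizeof(ggml_cgraph)));

// ggml_* calls then take the raw pointer:
//   ggml_build_forward_expand(gf.get(), ggml_cpy(cpy_ctx, k3d, kout3d));
//   ggml_graph_compute_helper(ctx->work_buffer, gf.get(), /*n_threads*/ 1);
// and no free(gf) is needed: the deleter runs when gf leaves scope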
@@ -9129,7 +9147,21 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     const size_t elt_size = ggml_element_size(kv_self.k);
 
     ggml_context * cpy_ctx = ggml_init({ 4096, NULL, /* no_alloc */ true });
-    ggml_cgraph gf{};
+
+    // create a temporary cgraph without initialising ggml objects, code inspired from `ggml.c:ggml_new_graph`
+    struct ggml_cgraph * gf = (struct ggml_cgraph *) malloc(sizeof(ggml_cgraph));
+
+    gf->n_nodes = 0;
+    gf->n_leafs = 0;
+    gf->order = GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT;
+    gf->perf_runs = 0;
+    gf->perf_cycles = 0;
+    gf->perf_time_us = 0;
+
+    memset(gf->nodes, 0, sizeof(gf->nodes));
+    memset(gf->grads, 0, sizeof(gf->grads));
+    memset(gf->leafs, 0, sizeof(gf->leafs));
+    memset(gf->visited_hash_table, 0, sizeof(gf->visited_hash_table));
 
     ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
     kin3d->data = (void *) inp;
@@ -9147,11 +9179,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         kv_head, n_embd, n_layer,
         elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
 
-    ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
-    ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
-    ggml_graph_compute_helper(ctx->work_buffer, &gf, /*n_threads*/ 1);
+    ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
+    ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
+    ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
     ggml_free(cpy_ctx);
+
+    // free our allocated graph
+    free(gf);
+    gf = NULL;
 }
 
 ctx->kv_self.head = kv_head;
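For context, both patched functions back the public state save/restore API, which is what an iOS app hits when persisting a conversation across launches. A hedged usage sketch against the llama.h of this era (error handling omitted; `save_and_restore` is an illustrative name, not a llama.cpp function):

#include <vector>
#include "llama.h"

// round-trips the full context state (KV cache included) through the
// code paths this PR fixes
static void save_and_restore(llama_context * ctx) {
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, state.data()); // patched copy path above

    // ... later, on a context built with the same model and params:
    llama_set_state_data(ctx, state.data()); // patched set path above
}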