From f4518b1ebafe7537df804a2a2916457c3285c9ff Mon Sep 17 00:00:00 2001
From: Oleksandr Kuvshynov <661042+okuvshynov@users.noreply.github.com>
Date: Fri, 26 Apr 2024 20:38:14 -0400
Subject: [PATCH] print out logits to get some training data

---
 Makefile                           |   4 +
 examples/async_spec/async_spec.cpp |   2 +-
 examples/async_spec/out_logits.cpp | 119 +++++++++++++++++++++++++++++
 3 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 examples/async_spec/out_logits.cpp

diff --git a/Makefile b/Makefile
index 20647540d0bf4..8ee23501785c6 100644
--- a/Makefile
+++ b/Makefile
@@ -760,6 +760,10 @@ async_spec: examples/async_spec/async_spec.cpp ggml.o
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+out_logits: examples/async_spec/out_logits.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 draft_rank: examples/async_spec/draft_rank.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
diff --git a/examples/async_spec/async_spec.cpp b/examples/async_spec/async_spec.cpp
index 5403bc534b124..9d9d6020b89e4 100644
--- a/examples/async_spec/async_spec.cpp
+++ b/examples/async_spec/async_spec.cpp
@@ -73,7 +73,7 @@ static int main_loop(
         llama_model *model,
         linear_speculative_context *spec_ctx,
         llama_context *ctx,
-        std::vector<llama_token> tokens_list /* copy here */) {
+        std::vector<llama_token> tokens_list /* making copy here */) {
     const int n_len = 1024;
 
     int input_len_dist[1024] = {0};
diff --git a/examples/async_spec/out_logits.cpp b/examples/async_spec/out_logits.cpp
new file mode 100644
index 0000000000000..869d8472e6f75
--- /dev/null
+++ b/examples/async_spec/out_logits.cpp
@@ -0,0 +1,119 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// greedy sampling; also dumps the full logit vector for this position to stdout
+static llama_token greedy_token(llama_model *model, llama_context *ctx, int idx) {
+    auto n_vocab = llama_n_vocab(model);
+    std::vector<llama_token_data> candidates;
+    candidates.resize(n_vocab);
+
+    auto *logits = llama_get_logits_ith(ctx, idx);
+    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+        std::cout << logits[token_id] << " ";
+        candidates[token_id] = llama_token_data{ token_id, logits[token_id], 0.0f };
+    }
+
+    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+    // sample the most likely token
+    return llama_sample_token_greedy(ctx, &candidates_p);
+}
+
+static int main_loop(
+        llama_model *model,
+        llama_context *ctx,
+        std::vector<llama_token> tokens_list /* copy here */) {
+    const int n_len = 1024;
+
+    llama_batch batch = llama_batch_init(1024, 0, 1);
+
+    // evaluate the initial prompt
+    for (size_t i = 0; i < tokens_list.size(); i++) {
+        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+    }
+
+    // llama_decode will output logits only for the last token of the prompt
+    batch.logits[batch.n_tokens - 1] = true;
+
+    if (llama_decode(ctx, batch) != 0) {
+        LOG_TEE("%s: llama_decode() failed\n", __func__);
+        return 1;
+    }
+
+    // how many tokens are currently accepted
+    int n_cur = batch.n_tokens;
+
+    while (n_cur <= n_len) {
+        llama_token new_token_id = greedy_token(model, ctx, batch.n_tokens - 1);
+        // this is where next_tokens start
+        if (new_token_id == llama_token_eos(model)) {
+            break;
+        }
+        if (n_cur >= n_len) {
+            break;
+        }
+        std::cout << llama_token_to_piece(ctx, new_token_id) << std::flush;
+
+        llama_batch_clear(batch);
+        llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
+        if (llama_decode(ctx, batch)) {
+            fprintf(stderr, "%s : failed to eval, return code %d\n", __func__, 1);
+            return 1;
+        }
+        n_cur += 1;
+    }
+
+    llama_batch_free(batch);
+    return 0;
+}
+
+int main(int argc, char ** argv) {
+    gpt_params params;
+
+    llama_backend_init();
+    llama_numa_init(params.numa);
+
+    // init context params
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.seed            = 1234;
+    ctx_params.n_ctx           = 2048;
+    ctx_params.n_threads       = params.n_threads;
+    ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
+
+    // Init main model and context
+    if (argc >= 2) {
+        params.model = argv[1];
+    }
+    llama_model_params model_params = llama_model_default_params();
+    model_params.n_gpu_layers = 99;
+    llama_model *main_model = llama_load_model_from_file(params.model.c_str(), model_params);
+
+    llama_context *main_ctx = llama_new_context_with_model(main_model, ctx_params);
+    std::ifstream t(argv[2]);
+    std::stringstream buffer;
+    buffer << t.rdbuf();
+    params.prompt = buffer.str();
+
+    if (params.prompt.empty()) {
+        params.prompt = "What's the difference between instruction cache and data cache?";
+    }
+    std::cout << params.prompt << std::flush;
+    std::vector<llama_token> tokens_list = llama_tokenize(main_ctx, params.prompt, true);
+
+    main_loop(main_model, main_ctx, tokens_list);
+
+    llama_free(main_ctx);
+    llama_free_model(main_model);
+    llama_backend_free();
+
+    return 0;
+}
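
Notes (not part of the patch): a minimal usage sketch, assuming the usual llama.cpp Makefile build; the model and output file names below are placeholders. Per main(), argv[1] is the model path and argv[2] is a file containing the prompt; the prompt text, the per-position logit dumps, and the greedily generated pieces all go to stdout, so redirecting to a file is the simplest way to capture the training data:

    make out_logits
    ./out_logits ./models/model.gguf prompt.txt > logits.txt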