From 0f56a0a287be97ce80c47dccbd75709489c28653 Mon Sep 17 00:00:00 2001
From: Antonis Makropoulos
Date: Thu, 5 Sep 2024 13:03:10 +0300
Subject: [PATCH] allow to set the slot of LLMCharacter

---
 README.md               |  4 +++-
 Runtime/LLM.cs          |  4 +++-
 Runtime/LLMCharacter.cs | 13 ++++++++-----
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index c4792950..6228bd6f 100644
--- a/README.md
+++ b/README.md
@@ -376,7 +376,8 @@ If the user's GPU is not supported, the LLM will fall back to the CPU
   - `Debug` select to log the output of the model in the Unity Editor
   - Advanced options
-    - `Parallel Prompts` number of prompts that can happen in parallel (default: -1 = number of LLMCharacter objects)
+    - `Parallel Prompts` number of prompts / slots that can happen in parallel (default: -1 = number of LLMCharacter objects). Note that the context size is divided among the slots. If you want to retain as much context as possible for the LLM and don't need all characters present at the same time, you can set this number and specify the slot for each LLMCharacter object (see the sketch below).
+      e.g. setting `Parallel Prompts` to 1 and slot 0 for all LLMCharacter objects will use the full context, but the entire prompt will need to be computed (no caching) whenever an LLMCharacter object is used for chat.
     - `Dont Destroy On Load` select to not destroy the LLM GameObject when loading a new Scene
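To make the slot sharing concrete, here is a minimal sketch (not part of the patch). In practice these values would normally be set in the Unity Inspector; the script name, field wiring, and the assumption that it runs before the LLM and LLMCharacter objects initialize are all hypothetical. Only the `parallelPrompts` and `slot` fields come from this change.

```csharp
using UnityEngine;
using LLMUnity;

// Hypothetical setup: one LLM serving two characters that share a single slot,
// so the full context is available, but the prompt cache cannot be reused when
// switching between characters (the whole prompt is recomputed each time).
public class SharedSlotSetup : MonoBehaviour
{
    public LLM llm;
    public LLMCharacter characterA;
    public LLMCharacter characterB;

    void Awake()
    {
        llm.parallelPrompts = 1; // a single slot keeps the full context size
        characterA.slot = 0;     // both characters target slot 0
        characterB.slot = 0;
    }
}
```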
@@ -441,6 +442,7 @@ If it is not selected, the full reply from the model is received in one go
   - `Load grammar` click to load a grammar in .gbnf format
   - `Grammar` the path of the grammar being used (relative to the Assets/StreamingAssets folder)
   - `Cache Prompt` save the ongoing prompt from the chat (default: true). Saves the prompt while it is being created by the chat to avoid reprocessing the entire prompt every time
+  - `Slot` slot of the server to use for computation. Value can be set from 0 to `Parallel Prompts`-1 (default: -1 = new slot for each character); see the sketch below
   - `Seed` seed for reproducibility. For random results every time use -1
   - `Num Predict` maximum number of tokens to predict (default: 256, -1 = infinity, -2 = until context filled). This is the maximum number of tokens the model will predict; when it is reached the model stops generating, so words / sentences might not get finished if the value is too low.
   - `Temperature` LLM temperature, lower values give more deterministic answers (default: 0.2). The temperature setting adjusts how random the generated responses are: turning it up makes the choices more varied and unpredictable, turning it down makes them more predictable and focused on the most likely options.
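As a usage note for the settings above, here is a minimal sketch (not from the patch; the component name and values are illustrative). Leaving `Slot` at its default of -1 gives each character its own server slot, so `Cache Prompt` can keep reusing the processed prompt; `cachePrompt`, `seed`, and `temperature` are the public fields shown in LLMCharacter.cs below.

```csharp
using UnityEngine;
using LLMUnity;

// Illustrative configuration of a single character; with slot left at its default (-1)
// the character gets its own server slot, so cachePrompt can avoid reprocessing the
// ongoing prompt on every chat call.
public class CharacterSettings : MonoBehaviour
{
    public LLMCharacter npc;

    void Start()
    {
        npc.cachePrompt = true;   // reuse the prompt built up by the chat (default: true)
        npc.seed = 42;            // fixed seed for reproducible answers (-1 = random every time)
        npc.temperature = 0.2f;   // lower temperature -> more deterministic replies
    }
}
```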
diff --git a/Runtime/LLM.cs b/Runtime/LLM.cs
index 95c51cf2..13f9ae19 100644
--- a/Runtime/LLM.cs
+++ b/Runtime/LLM.cs
@@ -449,7 +449,9 @@ private void StartService()
     public int Register(LLMCharacter llmCharacter)
     {
         clients.Add(llmCharacter);
-        return clients.IndexOf(llmCharacter);
+        int index = clients.IndexOf(llmCharacter);
+        if (parallelPrompts != -1) return index % parallelPrompts;
+        return index;
     }

     protected int GetNumClients()
diff --git a/Runtime/LLMCharacter.cs b/Runtime/LLMCharacter.cs
index c81c620c..99a5bf43 100644
--- a/Runtime/LLMCharacter.cs
+++ b/Runtime/LLMCharacter.cs
@@ -45,6 +45,8 @@ public class LLMCharacter : MonoBehaviour
     [ModelAdvanced] public string grammar = null;
     /// <summary> option to cache the prompt as it is being created by the chat to avoid reprocessing the entire prompt every time (default: true) </summary>
     [ModelAdvanced] public bool cachePrompt = true;
+    /// <summary> specify which slot of the server to use for computation (affects caching) </summary>
+    [ModelAdvanced] public int slot = -1;
     /// <summary> seed for reproducibility. For random results every time set to -1. </summary>
     [ModelAdvanced] public int seed = 0;
     /// <summary> number of tokens to predict (-1 = infinity, -2 = until context filled). </summary>
@@ -123,7 +125,6 @@ public class LLMCharacter : MonoBehaviour
     private string chatTemplate;
     private ChatTemplate template = null;
     public string grammarString;
-    protected int id_slot = -1;
     private List<(string, string)> requestHeaders = new List<(string, string)> { ("Content-Type", "application/json") };
     private List<UnityWebRequest> WIPRequests = new List<UnityWebRequest>();
     /// \endcond
@@ -149,7 +150,8 @@ public void Awake()
             LLMUnitySetup.LogError($"No LLM assigned or detected for LLMCharacter {name}!");
             return;
         }
-        id_slot = llm.Register(this);
+        int slotFromServer = llm.Register(this);
+        if (slot == -1) slot = slotFromServer;
     }

     InitGrammar();
@@ -159,6 +161,7 @@
     void OnValidate()
     {
         AssignLLM();
+        if (llm != null && llm.parallelPrompts > -1 && (slot < -1 || slot >= llm.parallelPrompts)) LLMUnitySetup.LogError($"The slot needs to be between 0 and {llm.parallelPrompts-1}, or -1 to be automatically set");
     }

     void Reset()
@@ -358,7 +361,7 @@ ChatRequest GenerateRequest(string prompt)
     ChatRequest chatRequest = new ChatRequest();
     if (debugPrompt) LLMUnitySetup.Log(prompt);
     chatRequest.prompt = prompt;
-    chatRequest.id_slot = id_slot;
+    chatRequest.id_slot = slot;
     chatRequest.temperature = temperature;
     chatRequest.top_k = topK;
     chatRequest.top_p = topP;
@@ -613,7 +616,7 @@ public async Task<List<float>> Embeddings(string query, Callback<List<float>> callback = null)
     private async Task<string> Slot(string filepath, string action)
     {
         SlotRequest slotRequest = new SlotRequest();
-        slotRequest.id_slot = id_slot;
+        slotRequest.id_slot = slot;
         slotRequest.filepath = filepath;
         slotRequest.action = action;
         string json = JsonUtility.ToJson(slotRequest);
@@ -683,7 +686,7 @@ protected Ret ConvertContent<Res, Ret>(string response, ContentCallback<Res, Ret> getContent = null)
-        if (id_slot >= 0) llm.CancelRequest(id_slot);
+        if (slot >= 0) llm.CancelRequest(slot);
     }

     protected void CancelRequestsRemote()
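To spell out the new Register behaviour in LLM.cs: characters are handed out slots round-robin, wrapping at `parallelPrompts`, while -1 keeps one slot per character. Below is a small standalone sketch of the same mapping (class and method names are hypothetical; only the modulo logic mirrors the patch).

```csharp
using System;

// Standalone illustration of the slot assignment introduced in LLM.Register:
// registration indices wrap around the number of parallel prompts.
class SlotMappingDemo
{
    static int AssignSlot(int registrationIndex, int parallelPrompts)
    {
        // -1 means "one slot per character", mirroring the default in the patch
        if (parallelPrompts == -1) return registrationIndex;
        return registrationIndex % parallelPrompts;
    }

    static void Main()
    {
        // With 2 parallel prompts, characters 0..3 map to slots 0, 1, 0, 1
        for (int i = 0; i < 4; i++)
            Console.WriteLine($"character {i} -> slot {AssignSlot(i, 2)}");
    }
}
```

Note that a character can still override the assigned value, since LLMCharacter.Awake only adopts the server-provided slot when its own `slot` field is left at -1, and OnValidate warns if a manually set slot falls outside 0 to `parallelPrompts`-1.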