[Bugfix] Add warmup for prefix caching example (vllm-project#5235)
zhuohan123 authored and jimpang committed Jul 8, 2024
1 parent 7e2e3e4 commit 1a44ece
Showing 1 changed file with 4 additions and 2 deletions.
examples/offline_inference_with_prefix.py (4 additions, 2 deletions)

@@ -51,8 +51,10 @@
 
 print("-" * 80)
 
-# The llm.generate call will batch all prompts and send the batch at once
-# if resources allow.
+# Warmup so that the shared prompt's KV cache is computed.
+prefix_cached_llm.generate(generating_prompts[0], sampling_params)
+
+# Generate with prefix caching.
 start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
 duration_cached = time() - start_time_cached
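The diff moves the one-time cost of computing the shared prefix's KV cache out of the timed region: a single warmup generation populates the cache, so the subsequent timed batch only reuses it. Below is a minimal sketch of that pattern, assuming vLLM's offline LLM API with enable_prefix_caching; the model name, prefix, and prompts are illustrative placeholders rather than the example's actual values.

# Sketch of the warmup-then-time pattern from this commit (placeholder data).
from time import time

from vllm import LLM, SamplingParams

# A shared prefix followed by per-request questions (illustrative only).
prefix = "You are a helpful assistant. Answer the question below concisely.\n"
questions = ["What is prefix caching?", "Why warm up before timing?"]
generating_prompts = [prefix + q for q in questions]

sampling_params = SamplingParams(temperature=0.0, max_tokens=64)
prefix_cached_llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)

# Warmup: generating on one prompt computes and stores the KV cache of the
# shared prefix, so the timed run below measures cache reuse, not cache fill.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)

# Timed run with the prefix already cached.
start_time_cached = time()
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
duration_cached = time() - start_time_cached
print(f"Generation with prefix caching took {duration_cached:.2f} s")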
