Commit
Add examples/beam_search/beam_search.cpp for testing.
1 parent 4b20567 · commit 1ffbc52
Showing 6 changed files with 265 additions and 31 deletions.
@@ -0,0 +1,8 @@
set(TARGET beam_search)
add_executable(${TARGET} beam_search.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)
if(TARGET BUILD_INFO)
  add_dependencies(${TARGET} BUILD_INFO)
endif()
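(Not shown in this excerpt: the new example presumably also has to be registered with the parent examples build. A minimal sketch, assuming the repository's usual one-directory-per-example CMake layout; the exact file touched by this commit is not visible here:)

add_subdirectory(beam_search)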
@@ -0,0 +1,192 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "common.h"
#include "llama.h"
#include "build-info.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif

int main(int argc, char ** argv)
{
    gpt_params params;

    //---------------------------------
    // Print help :
    //---------------------------------

    if ( argc < 2 || argv[1][0] == '-' )
    {
        printf( "Usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
        return 1 ;
    }

    //---------------------------------
    // Load parameters :
    //---------------------------------

    params.model = argv[1];

    params.n_beams = 2; // Hard-code 2 until we can calculate how much memory is required

    if ( argc > 2 )
    {
        params.prompt = argv[2];
    }

    if ( params.prompt.empty() )
    {
        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
    }

    //---------------------------------
    // Init LLM :
    //---------------------------------

    llama_backend_init(params.numa);

    llama_model * model;
    llama_context * ctx;

    std::tie(model, ctx) = llama_init_from_gpt_params( params );

    if ( model == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
        return 1;
    }

    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------

    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

    const size_t max_context_size     = llama_n_ctx( ctx );
    const size_t max_tokens_list_size = max_context_size - 4 ;

    if (tokens_list.size() > max_tokens_list_size)
    {
        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
             __func__ , tokens_list.size() , max_tokens_list_size );
        return 1;
    }

    fprintf( stderr, "\n\n" );

    // Print the tokens from the prompt :

    for( auto id : tokens_list )
    {
        printf( "%s" , llama_token_to_str( ctx , id ) );
    }

    fflush(stdout);

#if 1
    // Evaluate the prompt tokens first, then let beam search generate the continuation :
    int n_past = llama_get_kv_cache_token_count(ctx);
    if (llama_eval(ctx, tokens_list.data(), tokens_list.size(), n_past, params.n_threads))
    {
        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
        return 1;
    }
    n_past += tokens_list.size();

    int const n_predict = 1024;
    char const* response = llama_beam_search(ctx, params.n_beams, n_past, n_predict, params.n_threads);
    printf("\nDone:\n\n%s%s\n", params.prompt.c_str(), response);
#else
    //---------------------------------
    // Main prediction loop :
    //---------------------------------
    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
    while ( llama_get_kv_cache_token_count( ctx ) < max_context_size )
    {
        //---------------------------------
        // Evaluate the tokens :
        //---------------------------------
        if ( llama_eval( ctx , tokens_list.data() , tokens_list.size() , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
        {
            fprintf( stderr, "%s : failed to eval\n" , __func__ );
            return 1;
        }
        tokens_list.clear();
        //---------------------------------
        // Select the best prediction :
        //---------------------------------
        llama_token new_token_id = 0;
        auto logits  = llama_get_logits( ctx );
        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
        std::vector<llama_token_data> candidates;
        candidates.reserve( n_vocab );
        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
        {
            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
        // is it an end of stream ?
        if ( new_token_id == llama_token_eos() )
        {
            fprintf(stderr, " [end of text]\n");
            break;
        }
        // Print the new token :
        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
        fflush( stdout );
        // Push this new token for next evaluation :
        tokens_list.push_back( new_token_id );
    } // wend of main loop
#endif

    llama_free( ctx );
    llama_free_model( model );

    llama_backend_free();

    return 0;
}

// EOF
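
For reference, a hypothetical invocation of the resulting binary, following the usage string printed by the program above (the executable's location depends on the build setup):

./beam_search MODEL_PATH
./beam_search MODEL_PATH "your prompt here"

When no prompt is given, the example falls back to the built-in "### Request: ... ### Response:" prompt shown in the code.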