Skip to content

Commit

Permalink
Add examples/beam_search/beam_search.cpp for testing.
Browse files Browse the repository at this point in the history
  • Loading branch information
mattpulver committed Jul 31, 2023
1 parent 4b20567 commit 1ffbc52
Show file tree
Hide file tree
Showing 6 changed files with 265 additions and 31 deletions.
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ else()
add_subdirectory(train-text-from-scratch)
add_subdirectory(simple)
add_subdirectory(embd-input)
add_subdirectory(beam_search)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()
Expand Down
8 changes: 8 additions & 0 deletions examples/beam_search/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Build script for the beam_search example executable.
# The target name is kept in one variable so every command below stays in sync.
set(TARGET beam_search)
# Single-source executable built from beam_search.cpp.
add_executable(${TARGET} beam_search.cpp)
# Install the resulting binary as a runtime artifact.
install(TARGETS ${TARGET} RUNTIME)
# Link against the "common" and "llama" targets plus the platform thread library.
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
# Require at least C++11 when compiling this target.
target_compile_features(${TARGET} PRIVATE cxx_std_11)
# When a BUILD_INFO target exists, make sure it is built before this one
# (presumably it generates build-info.h, which beam_search.cpp includes -- confirm).
if(TARGET BUILD_INFO)
add_dependencies(${TARGET} BUILD_INFO)
endif()
192 changes: 192 additions & 0 deletions examples/beam_search/beam_search.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include "common.h"
#include "llama.h"
#include "build-info.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
#include <signal.h>
#include <unistd.h>
#elif defined (_WIN32)
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX
#include <windows.h>
#include <signal.h>
#endif



/**
 * Entry point of the beam-search example.
 *
 * Usage: beam_search MODEL_PATH [PROMPT]
 *
 * Loads the model given as first argument, tokenizes the prompt (a built-in
 * default is used when none is supplied), evaluates the prompt and then asks
 * llama_beam_search() for a continuation, which is printed after the prompt.
 *
 * @param argc number of command-line arguments
 * @param argv argv[1] is the model path, optional argv[2] is the prompt
 * @return 0 on success, 1 on a usage error or any failure
 */
int main(int argc, char ** argv)
{
    gpt_params params;

    //---------------------------------
    // Print help :
    //---------------------------------

    if ( argc < 2 || argv[1][0] == '-' )
    {
        printf( "Usage: %s MODEL_PATH [PROMPT]\n" , argv[0] );
        return 1 ;
    }

    //---------------------------------
    // Load parameters :
    //---------------------------------

    params.model = argv[1];

    params.n_beams = 2; // Hard-code 2 until we can calculate how much memory is required

    if ( argc > 2 )
    {
        params.prompt = argv[2];
    }

    if ( params.prompt.empty() )
    {
        params.prompt = "### Request:\nHow many countries are there?\n\n### Response:\n";
    }

    //---------------------------------
    // Init LLM :
    //---------------------------------

    llama_backend_init(params.numa);

    llama_model * model = NULL;
    llama_context * ctx = NULL;

    // Single tear-down path shared by the success and every failure exit, so
    // that no early return leaks the context, the model or the backend.
    const auto release_all = [&]()
    {
        if ( ctx != NULL )
        {
            llama_free( ctx );
        }
        if ( model != NULL )
        {
            llama_free_model( model );
        }
        llama_backend_free();
    };

    std::tie(model, ctx) = llama_init_from_gpt_params( params );

    // Both handles are required below; treat a missing context like a missing model.
    if ( model == NULL || ctx == NULL )
    {
        fprintf( stderr , "%s: error: unable to load model\n" , __func__ );
        release_all();
        return 1;
    }

    //---------------------------------
    // Tokenize the prompt :
    //---------------------------------

    std::vector<llama_token> tokens_list = llama_tokenize(ctx, params.prompt, true);

    const size_t max_context_size = llama_n_ctx( ctx );
    // Keep 4 slots of headroom; guard the subtraction so a tiny context cannot wrap around.
    const size_t max_tokens_list_size = max_context_size > 4 ? max_context_size - 4 : 0 ;

    if (tokens_list.size() > max_tokens_list_size)
    {
        // %zu is the portable conversion for size_t (%lu is wrong where long is 32-bit).
        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
                 __func__ , tokens_list.size() , max_tokens_list_size );
        release_all();
        return 1;
    }

    fprintf( stderr, "\n\n" );

    // Print the tokens from the prompt :

    for( auto id : tokens_list )
    {
        printf( "%s" , llama_token_to_str( ctx , id ) );
    }

    fflush(stdout);

    // The size check above bounds the token count by the context size, so the
    // narrowing to int is safe.
    const int n_prompt_tokens = static_cast<int>( tokens_list.size() );

#if 1
    int n_past = llama_get_kv_cache_token_count(ctx);
    if (llama_eval(ctx, tokens_list.data(), n_prompt_tokens, n_past, params.n_threads))
    {
        fprintf(stderr, "%s : failed to eval prompt.\n" , __func__ );
        release_all();
        return 1;
    }
    n_past += n_prompt_tokens;

    int const n_predict = 1024;
    char const* response = llama_beam_search(ctx, params.n_beams, n_past, n_predict, params.n_threads);
    // NOTE(review): ownership/lifetime of `response` is not visible from here; it is only read.
    // Passing a null pointer to %s is undefined behaviour, so substitute an empty string.
    printf("\nDone:\n\n%s%s\n", params.prompt.c_str(), response != NULL ? response : "");
#else
    (void) n_prompt_tokens;
    //---------------------------------
    // Main prediction loop :
    //---------------------------------
    // The LLM keeps a contextual cache memory of previous token evaluation.
    // Usually, once this cache is full, it is required to recompute a compressed context based on previous
    // tokens (see "infinite text generation via context swapping" in the main example), but in this minimalist
    // example, we will just stop the loop once this cache is full or once an end of stream is detected.
    while ( static_cast<size_t>( llama_get_kv_cache_token_count( ctx ) ) < max_context_size )
    {
        //---------------------------------
        // Evaluate the tokens :
        //---------------------------------
        if ( llama_eval( ctx , tokens_list.data() , static_cast<int>( tokens_list.size() ) , llama_get_kv_cache_token_count( ctx ) , params.n_threads ) )
        {
            fprintf( stderr, "%s : failed to eval\n" , __func__ );
            release_all();
            return 1;
        }
        tokens_list.clear();
        //---------------------------------
        // Select the best prediction :
        //---------------------------------
        llama_token new_token_id = 0;
        auto logits = llama_get_logits( ctx );
        auto n_vocab = llama_n_vocab( ctx ); // the size of the LLM vocabulary (in tokens)
        std::vector<llama_token_data> candidates;
        candidates.reserve( n_vocab );
        for( llama_token token_id = 0 ; token_id < n_vocab ; token_id++ )
        {
            candidates.emplace_back( llama_token_data{ token_id , logits[ token_id ] , 0.0f } );
        }
        llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
        // Select it using the "Greedy sampling" method :
        new_token_id = llama_sample_token_greedy( ctx , &candidates_p );
        // is it an end of stream ?
        if ( new_token_id == llama_token_eos() )
        {
            fprintf(stderr, " [end of text]\n");
            break;
        }
        // Print the new token :
        printf( "%s" , llama_token_to_str( ctx , new_token_id ) );
        fflush( stdout );
        // Push this new token for next evaluation :
        tokens_list.push_back( new_token_id );
    } // end of main loop
#endif

    release_all();

    return 0;
}

// EOF
1 change: 1 addition & 0 deletions examples/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ struct gpt_params {
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
int32_t n_beams = 0; // Used in mem allocation if > 0 and by llama_beam_search().
float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor
Expand Down
34 changes: 18 additions & 16 deletions llama-util.h
Original file line number Diff line number Diff line change
Expand Up @@ -436,25 +436,27 @@ struct llama_buffer {
}

// Resize the buffer to `len` bytes: the current allocation is released and,
// when `len` is non-zero, a fresh one is made. `size` is set to `len` only
// after a successful allocation; otherwise it is left at 0.
// NOTE(review): this listing appears to be a rendered diff in which removed and
// added lines are interleaved (the free/allocate sequence shows up twice in each
// preprocessor branch, and `size = 0;` appears both before and inside the
// `size != len` check). Confirm the real statement sequence against the file.
void resize(size_t len) {
size = 0;
// Skip the reallocation when the requested length equals the current size.
if (size != len) {
size = 0;
#ifdef GGML_USE_METAL
// Metal build: allocate with posix_memalign aligned to getpagesize() and zero-fill.
free(addr);
if (len) {
int result = posix_memalign((void **) &addr, getpagesize(), len);
if (result == 0) {
memset(addr, 0, len);
size = len;
} else {
// Allocation failed: clear the pointer; size stays 0.
addr = NULL;
}
}
free(addr);
if (len) {
int result = posix_memalign((void **) &addr, getpagesize(), len);
if (result == 0) {
memset(addr, 0, len);
size = len;
} else {
addr = NULL;
}
}
#else
// Default build: plain new[]/delete[] of uint8_t; the new memory is not zeroed here.
delete[] addr;
if (len) {
addr = new uint8_t[len];
size = len;
}
delete[] addr;
if (len) {
addr = new uint8_t[len];
size = len;
}
#endif
}
}
};

Expand Down
Loading

0 comments on commit 1ffbc52

Please sign in to comment.