diff --git a/.gitignore b/.gitignore
index f785746..38a2532 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,5 @@
 build/
 .vscode
+.cache/
+*.swp
+models/*.bin
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f8b8f3b..81babee 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,14 +10,21 @@ endif()
 
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    set(CLIP_STANDALONE ON)
+else()
+    set(CLIP_STANDALONE OFF)
+endif()
+
 #
 # Option list
 #
 
 # general
 option(CLIP_STATIC "CLIP: static link libraries" OFF)
-option(CLIP_NO_EXAMPLES "CLIP: do not build examples" OFF)
-option(CLIP_NO_TESTS "CLIP: do not build tests" OFF)
+option(CLIP_BUILD_TEST "CLIP: build tests" ${CLIP_STANDALONE})
+option(CLIP_BUILD_EXAMPLES "CLIP: build examples" ${CLIP_STANDALONE})
+option(CLIP_BUILD_IMAGE_SEARCH "CLIP: build image-search" OFF)
 option(CLIP_NATIVE "CLIP: enable -march=native flag" ON)
 option(CLIP_LTO "CLIP: enable link time optimization" OFF)
 
@@ -48,6 +55,7 @@ endif()
 option(CLIP_ACCELERATE "CLIP: enable Accelerate framework" ON)
 option(CLIP_OPENBLAS "CLIP: use OpenBLAS" OFF)
 
+
 #
 # Compile flags
 #
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 61650f9..c9b89f5 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,8 +1,11 @@
-
 add_library(common-clip STATIC common-clip.cpp)
 target_link_libraries(common-clip PRIVATE ggml)
 target_include_directories(common-clip PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
 
+if (CLIP_BUILD_IMAGE_SEARCH)
+    add_subdirectory(./image-search)
+endif()
+
 add_executable(main main.cpp)
 target_link_libraries(main PRIVATE clip common-clip ggml)
 
diff --git a/examples/image-search/CMakeLists.txt b/examples/image-search/CMakeLists.txt
new file mode 100644
index 0000000..243b4e9
--- /dev/null
+++ b/examples/image-search/CMakeLists.txt
@@ -0,0 +1,25 @@
+project(image-search)
+
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+include(FetchContent)
+FetchContent_Declare(usearch
+    GIT_REPOSITORY https://github.com/unum-cloud/usearch.git
+    GIT_TAG v0.19.3
+)
+FetchContent_MakeAvailable(usearch)
+
+add_executable(image-search-build
+    build.cpp
+)
+
+target_link_libraries(image-search-build PRIVATE clip ggml usearch)
+target_compile_features(image-search-build PUBLIC cxx_std_17)
+
+add_executable(image-search
+    search.cpp
+)
+
+target_link_libraries(image-search PRIVATE clip ggml usearch)
+target_compile_features(image-search PUBLIC cxx_std_17)
+
diff --git a/examples/image-search/README.md b/examples/image-search/README.md
new file mode 100644
index 0000000..8f54c46
--- /dev/null
+++ b/examples/image-search/README.md
@@ -0,0 +1,56 @@
+# Image search
+
+This example implements basic semantic image search, using [usearch](https://github.com/unum-cloud/usearch) as a vector database for accelerated similarity search.
+
+Use `image-search-build` to build the database of images and their embeddings beforehand; updating an existing database is currently not supported.
+
+Use `image-search` to search the indexed images by semantic similarity.
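+
+### Building
+
+The tools are gated behind the `CLIP_BUILD_IMAGE_SEARCH` CMake option, which is OFF by default. A typical standalone configure-and-build sequence might look like this (the build directory name is just one possible choice):
+
+```sh
+cmake -B build -DCLIP_BUILD_IMAGE_SEARCH=ON
+cmake --build build
+# binaries are placed in build/bin (CMAKE_RUNTIME_OUTPUT_DIRECTORY)
+```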
+
+### Examples
+
+#### Build the database
+
+Help:
+```sh
+./image-search-build -h
+Usage: ./image-search-build [options] dir/with/pictures [more/dirs]
+
+Options:
+ -h, --help: Show this message and exit
+ -m <path>, --model <path>: path to model. Default: ../models/ggml-model-f16.bin
+ -t N, --threads N: Number of threads to use for inference. Default: 4
+ -v <level>, --verbose <level>: Control the level of verbosity. 0 = minimum, 2 = maximum. Default: 1
+```
+
+Creating a database for `tests/`:
+```sh
+./image-search-build -m models/openai_clip-vit-base-patch32.ggmlv0.f16.bin ./tests/
+```
+
+#### Search by text
+
+Help:
+```sh
+./image-search -h
+Usage: ./image-search [options] <search string>
+
+Options:
+ -h, --help: Show this message and exit
+ -m <path>, --model <path>: override the model path. Read from images.paths by default.
+ -t N, --threads N: Number of threads to use for inference. Default: 4
+ -v <level>, --verbose <level>: Control the level of verbosity. 0 = minimum, 2 = maximum. Default: 1
+ -n N, --results N: Number of results to display. Default: 5
+```
+
+Searching for `apple` in the database in the current directory:
+```sh
+./image-search apple
+clip_model_load: loading model from 'models/openai_clip-vit-base-patch32.ggmlv0.f16.bin' - please wait....................................................clip_model_load: model size = 288.93 MB / num tensors = 397
+clip_model_load: model loaded
+
+search results:
+distance path
+ 0.674587 /home/xxxx/tests/red_apple.jpg
+ 0.785591 /home/xxxx/tests/white.jpg
+```
+
+Note: lower scores are better, since the value is a distance, not a similarity.
diff --git a/examples/image-search/build.cpp b/examples/image-search/build.cpp
new file mode 100644
index 0000000..97a22ac
--- /dev/null
+++ b/examples/image-search/build.cpp
@@ -0,0 +1,167 @@
+#include "clip.h"
+#include "usearch/index.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <filesystem>
+#include <fstream>
+#include <string>
+#include <string_view>
+#include <vector>
+
+struct my_app_params {
+    int32_t n_threads {4};
+    std::string model {"../models/ggml-model-f16.bin"};
+    int32_t verbose {1};
+    std::vector<std::string> image_directories;
+};
+
+void my_print_help(int argc, char **argv, my_app_params &params) {
+    printf("Usage: %s [options] dir/with/pictures [more/dirs]\n", argv[0]);
+    printf("\nOptions:\n");
+    printf(" -h, --help: Show this message and exit\n");
+    printf(" -m <path>, --model <path>: path to model. Default: %s\n", params.model.c_str());
+    printf(" -t N, --threads N: Number of threads to use for inference. Default: %d\n", params.n_threads);
+    printf(" -v <level>, --verbose <level>: Control the level of verbosity. 0 = minimum, 2 = maximum. Default: %d\n", params.verbose);
+}
+
+// returns success
+bool my_app_params_parse(int argc, char **argv, my_app_params &params) {
+    bool invalid_param = false;
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-v" || arg == "--verbose") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.verbose = std::stoi(argv[i]);
+        } else if (arg == "-h" || arg == "--help") {
+            my_print_help(argc, argv, params);
+            exit(0);
+        } else if (!arg.empty() && arg[0] == '-') {
+            printf("%s: unrecognized argument: %s\n", __func__, arg.c_str());
+            return false;
+        } else {
+            // assume image directory
+            params.image_directories.push_back(argv[i]);
+        }
+    }
+
+    return !(invalid_param || params.image_directories.empty());
+}
+
+bool is_image_file_extension(std::string_view ext) {
+    if (ext == ".jpg") return true;
+    if (ext == ".JPG") return true;
+
+    if (ext == ".jpeg") return true;
+    if (ext == ".JPEG") return true;
+
+    if (ext == ".gif") return true;
+    if (ext == ".GIF") return true;
+
+    if (ext == ".png") return true;
+    if (ext == ".PNG") return true;
+
+    // TODO(green-sky): determine if we should add more formats from stbi. tga/hdr/pnm seem kinda niche.
+
+    return false;
+}
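+
+// Scan every given directory recursively, compute a CLIP embedding for each
+// supported image, and save the embeddings (images.usearch) together with the
+// corresponding file paths (images.paths).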
+int main(int argc, char** argv) {
+    my_app_params params;
+    if (!my_app_params_parse(argc, argv, params)) {
+        my_print_help(argc, argv, params);
+        return 1;
+    }
+
+    auto clip_ctx = clip_model_load(params.model.c_str(), params.verbose);
+    if (!clip_ctx) {
+        printf("%s: Unable to load model from %s\n", __func__, params.model.c_str());
+        return 1;
+    }
+
+    std::vector<std::string> image_file_index;
+    unum::usearch::index_gt<unum::usearch::cos_gt<float>> embd_index;
+
+    const size_t vec_dim = clip_ctx->vision_model.hparams.projection_dim;
+
+    size_t label = 0;
+
+    std::vector<float> vec(vec_dim);
+
+    // search for images in the given paths and write their embeddings to the database
+    for (const auto& base_dir : params.image_directories) {
+        fprintf(stdout, "%s: starting base dir scan of '%s'\n", __func__, base_dir.c_str());
+
+        for (const auto& dir_entry : std::filesystem::recursive_directory_iterator(base_dir)) {
+            if (!dir_entry.is_regular_file()) {
+                continue;
+            }
+
+            // only process files with a known image extension
+            const std::string ext = dir_entry.path().extension().string();
+            if (ext.empty()) {
+                continue;
+            }
+            if (!is_image_file_extension(ext)) {
+                continue;
+            }
+
+            std::string img_path = dir_entry.path().string();
+            if (params.verbose >= 1) {
+                fprintf(stdout, "%s: found image file '%s'\n", __func__, img_path.c_str());
+            }
+
+            clip_image_u8 img0;
+            if (!clip_image_load_from_file(img_path, img0)) {
+                fprintf(stderr, "%s: failed to load image from '%s'\n", __func__, img_path.c_str());
+                continue;
+            }
+
+            clip_image_f32 img_res;
+            clip_image_preprocess(clip_ctx, &img0, &img_res);
+
+            if (!clip_image_encode(clip_ctx, params.n_threads, img_res, vec.data())) {
+                fprintf(stderr, "%s: failed to encode image from '%s'\n", __func__, img_path.c_str());
+                continue;
+            }
+
+            // usearch does not grow on its own, so reserve in chunks
+            if (embd_index.capacity() == embd_index.size()) {
+                embd_index.reserve(embd_index.size() + 32);
+            }
+
+            // add the image to the database
+            embd_index.add(label++, {vec.data(), vec.size()});
+            image_file_index.push_back(std::filesystem::canonical(dir_entry.path()).string());
+        }
+    }
+
+    clip_free(clip_ctx);
+
+    // save to disk
+    embd_index.save("images.usearch");
+
+    std::ofstream image_file_index_file("images.paths", std::ios::binary | std::ios::trunc);
+    // the first line is the model path
+    image_file_index_file << params.model << "\n";
+    for (const auto& i_path : image_file_index) {
+        image_file_index_file << i_path << "\n";
+    }
+
+    return 0;
+}
diff --git a/examples/image-search/search.cpp b/examples/image-search/search.cpp
new file mode 100644
index 0000000..d975e69
--- /dev/null
+++ b/examples/image-search/search.cpp
@@ -0,0 +1,143 @@
+#include "clip.h"
+#include "usearch/index.hpp"
+
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <vector>
+
+struct my_app_params {
+    int32_t n_threads {4};
+    std::string model;
+    int32_t verbose {1};
+    // TODO: index dir
+
+    // TODO: search by image
+    std::string search_text;
+
+    int32_t n_results {5};
+};
+
+void my_print_help(int argc, char **argv, my_app_params &params) {
+    printf("Usage: %s [options] <search string>\n", argv[0]);
+    printf("\nOptions:\n");
+    printf(" -h, --help: Show this message and exit\n");
+    printf(" -m <path>, --model <path>: override the model path. Read from images.paths by default.\n");
+    printf(" -t N, --threads N: Number of threads to use for inference. Default: %d\n", params.n_threads);
+    printf(" -v <level>, --verbose <level>: Control the level of verbosity. 0 = minimum, 2 = maximum. Default: %d\n", params.verbose);
+    printf(" -n N, --results N: Number of results to display. Default: %d\n", params.n_results);
+}
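+
+// Returns success. The first non-flag argument and everything after it is
+// joined with spaces and used as the search string.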
+bool my_app_params_parse(int argc, char **argv, my_app_params &params) {
+    bool invalid_param = false;
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-m" || arg == "--model") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model = argv[i];
+        } else if (arg == "-t" || arg == "--threads") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_threads = std::stoi(argv[i]);
+        } else if (arg == "-n" || arg == "--results") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_results = std::stoi(argv[i]);
+        } else if (arg == "-v" || arg == "--verbose") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.verbose = std::stoi(argv[i]);
+        } else if (arg == "-h" || arg == "--help") {
+            my_print_help(argc, argv, params);
+            exit(0);
+        } else if (!arg.empty() && arg[0] == '-') {
+            printf("%s: unrecognized argument: %s\n", __func__, arg.c_str());
+            return false;
+        } else {
+            // assume the search string starts here
+            params.search_text = arg;
+            for (++i; i < argc; i++) {
+                params.search_text += " ";
+                params.search_text += argv[i];
+            }
+        }
+    }
+
+    return !(invalid_param || params.search_text.empty());
+}
+
+int main(int argc, char** argv) {
+    my_app_params params;
+    if (!my_app_params_parse(argc, argv, params)) {
+        my_print_help(argc, argv, params);
+        return 1;
+    }
+
+    // the first line of images.paths is the model used for indexing
+    std::ifstream image_file_index_file("images.paths", std::ios::binary);
+    std::string line;
+    std::getline(image_file_index_file, line);
+    if (params.model.empty()) {
+        params.model = line;
+    } else {
+        printf("%s: using alternative model from %s. Make sure you use the same model you used for indexing, or the embeddings won't match.\n", __func__, params.model.c_str());
+    }
+
+    // load model
+    auto clip_ctx = clip_model_load(params.model.c_str(), params.verbose);
+    if (!clip_ctx) {
+        printf("%s: Unable to load model from %s\n", __func__, params.model.c_str());
+        return 1;
+    }
+
+    // load the embeddings database
+    std::vector<std::string> image_file_index;
+    unum::usearch::index_gt<unum::usearch::cos_gt<float>> embd_index;
+
+    embd_index.view("images.usearch");
+
+    // load image paths
+    do {
+        std::getline(image_file_index_file, line);
+        if (line.empty()) {
+            break;
+        }
+        image_file_index.push_back(line);
+    } while (image_file_index_file.good());
+
+    if (image_file_index.size() != embd_index.size()) {
+        printf("%s: index files size mismatch\n", __func__);
+    }
+
+    const size_t vec_dim = clip_ctx->vision_model.hparams.projection_dim;
+
+    // embed the search text
+    auto tokens = clip_tokenize(clip_ctx, params.search_text);
+
+    std::vector<float> txt_vec(vec_dim);
+
+    clip_text_encode(clip_ctx, params.n_threads, tokens, txt_vec.data());
+
+    // query the index for the nearest neighbors of the text embedding
+    auto results = embd_index.search({txt_vec.data(), txt_vec.size()}, params.n_results);
+
+    printf("search results:\n");
+    printf("distance path\n");
+    for (std::size_t i = 0; i != results.size(); ++i) {
+        printf(" %f %s\n", results[i].distance, image_file_index.at(results[i].element.label).c_str());
+    }
+
+    clip_free(clip_ctx);
+
+    return 0;
+}