diff --git a/examples/LLM/llama/chat_app/docker/Dockerfile b/examples/LLM/llama/chat_app/docker/Dockerfile
index fd3435b9039..fa4f21dd7bc 100644
--- a/examples/LLM/llama/chat_app/docker/Dockerfile
+++ b/examples/LLM/llama/chat_app/docker/Dockerfile
@@ -3,13 +3,10 @@ ARG BASE_IMAGE=pytorch/torchserve:latest-gpu
 FROM $BASE_IMAGE as server
 ARG BASE_IMAGE
 ARG EXAMPLE_DIR
-ARG MODEL_NAME
 ARG HUGGINGFACE_TOKEN
 USER root
-ENV MODEL_NAME=$MODEL_NAME
-
 RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
     apt-get update && \
     apt-get install libopenmpi-dev git -y
@@ -17,6 +14,12 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
 COPY $EXAMPLE_DIR/requirements.txt /home/model-server/chat_bot/requirements.txt
 RUN pip install -r /home/model-server/chat_bot/requirements.txt && huggingface-cli login --token $HUGGINGFACE_TOKEN
 
+WORKDIR /home/model-server/chat_bot
+RUN git clone https://github.com/ggerganov/llama.cpp.git build && \
+    cd build && \
+    make && \
+    python -m pip install -r requirements.txt
+
 COPY $EXAMPLE_DIR /home/model-server/chat_bot
 COPY $EXAMPLE_DIR/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
 COPY $EXAMPLE_DIR/config.properties /home/model-server/config.properties
diff --git a/examples/LLM/llama/chat_app/docker/Download_model.py b/examples/LLM/llama/chat_app/docker/Download_model.py
new file mode 100644
index 00000000000..94836924aba
--- /dev/null
+++ b/examples/LLM/llama/chat_app/docker/Download_model.py
@@ -0,0 +1,52 @@
+import argparse
+import os
+
+from huggingface_hub import HfApi, snapshot_download
+
+
+def dir_path(path_str):
+    if os.path.isdir(path_str):
+        return path_str
+    elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y":
+        os.makedirs(path_str)
+        return path_str
+    else:
+        raise NotADirectoryError(path_str)
+
+
+class HFModelNotFoundError(Exception):
+    def __init__(self, model_str):
+        super().__init__(f"HuggingFace model not found: '{model_str}'")
+
+
+def hf_model(model_str):
+    api = HfApi()
+    models = [m.modelId for m in api.list_models()]
+    if model_str in models:
+        return model_str
+    else:
+        raise HFModelNotFoundError(model_str)
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--model_path",
+    "-o",
+    type=dir_path,
+    default="model",
+    help="Output directory for downloaded model files",
+)
+parser.add_argument(
+    "--model_name", "-m", type=hf_model, required=True, help="HuggingFace model name"
+)
+parser.add_argument("--revision", "-r", type=str, default="main", help="Revision")
+args = parser.parse_args()
+
+snapshot_path = snapshot_download(
+    repo_id=args.model_name,
+    revision=args.revision,
+    cache_dir=args.model_path,
+    use_auth_token=True,
+    ignore_patterns=["original/*", "pytorch_model*.bin"]
+)
+print(f"Files for '{args.model_name}' are downloaded to '{snapshot_path}'")
diff --git a/examples/LLM/llama/chat_app/docker/build_image.sh b/examples/LLM/llama/chat_app/docker/build_image.sh
index 7fefc63aa7a..f7fdf51976b 100755
--- a/examples/LLM/llama/chat_app/docker/build_image.sh
+++ b/examples/LLM/llama/chat_app/docker/build_image.sh
@@ -1,17 +1,8 @@
 #!/bin/bash
-# Check if there are enough arguments
-if [ "$#" -eq 0 ] || [ "$#" -gt 1 ]; then
-    echo "Usage: $0 "
-    exit 1
-fi
-
-MODEL_NAME=$(echo "$1" | sed 's/\//---/g')
-echo "Model: " $MODEL_NAME
-
 BASE_IMAGE="pytorch/torchserve:latest-cpu"
-DOCKER_TAG="pytorch/torchserve:${MODEL_NAME}"
+DOCKER_TAG="pytorch/torchserve:chat_bot"
 
 # Get relative path of example dir
 EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
@@ -20,9 +11,10 @@ ROOT_DIR=$(realpath "$ROOT_DIR")
"$ROOT_DIR") EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|") # Build docker image for the application -DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --file ${EXAMPLE_DIR}/Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" --build-arg MODEL_NAME="${MODEL_NAME}" --build-arg HUGGINGFACE_TOKEN -t "${DOCKER_TAG}" . +DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --file ${EXAMPLE_DIR}/Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" --build-arg HUGGINGFACE_TOKEN -t "${DOCKER_TAG}" . echo "Run the following command to start the chat bot" echo "" -echo docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v $(pwd)/model_store_1:/home/model-server/model-store $DOCKER_TAG +echo docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v $(pwd)/model_store_1:/home/model-server/model-store -e MODEL_NAME="meta-llama/Llama-2-7b-chat-hf" $DOCKER_TAG echo "" +echo "Note: You can replace the model identifier as needed" diff --git a/examples/LLM/llama/chat_app/docker/client_app.py b/examples/LLM/llama/chat_app/docker/client_app.py index fdd8a6d4449..19c07d1cf1e 100644 --- a/examples/LLM/llama/chat_app/docker/client_app.py +++ b/examples/LLM/llama/chat_app/docker/client_app.py @@ -6,6 +6,7 @@ import streamlit as st MODEL_NAME = os.environ["MODEL_NAME"] +MODEL_NAME = MODEL_NAME.replace('/',"---") # App title st.set_page_config(page_title="TorchServe Chatbot") diff --git a/examples/LLM/llama/chat_app/docker/dockerd-entrypoint.sh b/examples/LLM/llama/chat_app/docker/dockerd-entrypoint.sh index 11396d3be74..cd20558fad9 100755 --- a/examples/LLM/llama/chat_app/docker/dockerd-entrypoint.sh +++ b/examples/LLM/llama/chat_app/docker/dockerd-entrypoint.sh @@ -1,75 +1,75 @@ #!/bin/bash set -e -export LLAMA2_Q4_MODEL=/home/model-server/model-store/$MODEL_NAME/model/ggml-model-q4_0.gguf +MODEL_DIR=$(echo "$MODEL_NAME" | sed 's/\//---/g') + +export LLAMA_Q4_MODEL=/home/model-server/model-store/$MODEL_DIR/model/ggml-model-q4_0.gguf create_model_cfg_yaml() { # Define the YAML content with a placeholder for the model name yaml_content="# TorchServe frontend parameters\nminWorkers: 1\nmaxWorkers: 1\nresponseTimeout: 1200\n#deviceType: \"gpu\"\n#deviceIds: [0,1]\n#torchrun:\n# nproc-per-node: 1\n\nhandler:\n model_name: \"${2}\"\n manual_seed: 40" - # Create the YAML file with the specified model name - echo -e "$yaml_content" > "model-config-${1}.yaml" + # Create the YAML file + echo -e "$yaml_content" > "model-config.yaml" } create_model_archive() { - MODEL_NAME=$1 - MODEL_CFG=$2 - echo "Create model archive for ${MODEL_NAME} if it doesn't already exist" - if [ -d "/home/model-server/model-store/$MODEL_NAME" ]; then - echo "Model archive for $MODEL_NAME exists." + MODEL_DIR=$1 + echo "Create model archive for ${MODEL_DIR} if it doesn't already exist" + if [ -d "/home/model-server/model-store/$MODEL_DIR" ]; then + echo "Model archive for $MODEL_DIR exists." 
     fi
-    if [ -d "/home/model-server/model-store/$MODEL_NAME/model" ]; then
+    if [ -d "/home/model-server/model-store/$MODEL_DIR/model" ]; then
         echo "Model already download"
-        mv /home/model-server/model-store/$MODEL_NAME/model /home/model-server/model-store/
+        mv /home/model-server/model-store/$MODEL_DIR/model /home/model-server/model-store/
     else
         echo "Model needs to be downloaded"
     fi
-    torch-model-archiver --model-name "$MODEL_NAME" --version 1.0 --handler llama_cpp_handler.py --config-file $MODEL_CFG -r requirements.txt --archive-format no-archive --export-path /home/model-server/model-store -f
+    torch-model-archiver --model-name "$MODEL_DIR" --version 1.0 --handler llama_cpp_handler.py --config-file "model-config.yaml" -r requirements.txt --archive-format no-archive --export-path /home/model-server/model-store -f
     if [ -d "/home/model-server/model-store/model" ]; then
-        mv /home/model-server/model-store/model /home/model-server/model-store/$MODEL_NAME/
+        mv /home/model-server/model-store/model /home/model-server/model-store/$MODEL_DIR/
     fi
 }
 
 download_model() {
-    MODEL_NAME=$1
-    HF_MODEL_NAME=$2
-    if [ -d "/home/model-server/model-store/$MODEL_NAME/model" ]; then
-        echo "Model $HF_MODEL_NAME already downloaded"
+    MODEL_DIR=$1
+    MODEL_NAME=$2
+    if [ -d "/home/model-server/model-store/$MODEL_DIR/model" ]; then
+        echo "Model $MODEL_NAME already downloaded"
     else
-        echo "Downloading model $HF_MODEL_NAME"
-        python Download_model.py --model_path /home/model-server/model-store/$MODEL_NAME/model --model_name $HF_MODEL_NAME
+        echo "Downloading model $MODEL_NAME"
+        python Download_model.py --model_path /home/model-server/model-store/$MODEL_DIR/model --model_name $MODEL_NAME
     fi
 }
 
 quantize_model() {
-    if [ ! -f "$LLAMA2_Q4_MODEL" ]; then
-        tmp_model_name=$(echo "$MODEL_NAME" | sed 's/---/--/g')
-        directory_path=/home/model-server/model-store/$MODEL_NAME/model/models--$tmp_model_name/snapshots/
+    if [ ! -f "$LLAMA_Q4_MODEL" ]; then
+        tmp_model_name=$(echo "$MODEL_DIR" | sed 's/---/--/g')
+        directory_path=/home/model-server/model-store/$MODEL_DIR/model/models--$tmp_model_name/snapshots/
         HF_MODEL_SNAPSHOT=$(find $directory_path -type d -mindepth 1)
-        echo "Cleaning up previous build of llama-cpp"
-        git clone https://github.com/ggerganov/llama.cpp.git build
         cd build
-        make
-        python -m pip install -r requirements.txt
-        echo "Convert the 7B model to ggml FP16 format"
-        python convert.py $HF_MODEL_SNAPSHOT --outfile ggml-model-f16.gguf
+        echo "Convert the model to ggml FP16 format"
+        if [[ $MODEL_NAME == *"Meta-Llama-3"* ]]; then
+            python convert.py $HF_MODEL_SNAPSHOT --vocab-type bpe,hfft --outfile ggml-model-f16.gguf
+        else
+            python convert.py $HF_MODEL_SNAPSHOT --outfile ggml-model-f16.gguf
+        fi
         echo "Quantize the model to 4-bits (using q4_0 method)"
-        ./quantize ggml-model-f16.gguf $LLAMA2_Q4_MODEL q4_0
+        ./quantize ggml-model-f16.gguf $LLAMA_Q4_MODEL q4_0
         cd ..
- echo "Saved quantized model weights to $LLAMA2_Q4_MODEL" + echo "Saved quantized model weights to $LLAMA_Q4_MODEL" fi } -HF_MODEL_NAME=$(echo "$MODEL_NAME" | sed 's/---/\//g') if [[ "$1" = "serve" ]]; then shift 1 - create_model_cfg_yaml $MODEL_NAME $HF_MODEL_NAME - create_model_archive $MODEL_NAME "model-config-$MODEL_NAME.yaml" - download_model $MODEL_NAME $HF_MODEL_NAME + create_model_cfg_yaml $MODEL_DIR $MODEL_NAME + create_model_archive $MODEL_DIR + download_model $MODEL_DIR $MODEL_NAME quantize_model streamlit run torchserve_server_app.py --server.port 8084 & streamlit run client_app.py --server.port 8085 diff --git a/examples/LLM/llama/chat_app/docker/llama_cpp_handler.py b/examples/LLM/llama/chat_app/docker/llama_cpp_handler.py index c607d8e81a2..09de9b503d9 100644 --- a/examples/LLM/llama/chat_app/docker/llama_cpp_handler.py +++ b/examples/LLM/llama/chat_app/docker/llama_cpp_handler.py @@ -23,7 +23,7 @@ def initialize(self, ctx): ctx (context): It is a JSON Object containing information pertaining to the model artifacts parameters. """ - model_path = os.environ["LLAMA2_Q4_MODEL"] + model_path = os.environ["LLAMA_Q4_MODEL"] model_name = ctx.model_yaml_config["handler"]["model_name"] seed = int(ctx.model_yaml_config["handler"]["manual_seed"]) torch.manual_seed(seed) diff --git a/examples/LLM/llama/chat_app/docker/torchserve_server_app.py b/examples/LLM/llama/chat_app/docker/torchserve_server_app.py index c485bf871c7..b7f276df539 100644 --- a/examples/LLM/llama/chat_app/docker/torchserve_server_app.py +++ b/examples/LLM/llama/chat_app/docker/torchserve_server_app.py @@ -7,6 +7,7 @@ import streamlit as st MODEL_NAME = os.environ["MODEL_NAME"] +MODEL_NAME = MODEL_NAME.replace('/',"---") MODEL = MODEL_NAME.split("---")[1] # App title diff --git a/examples/LLM/llama/chat_app/package_llama.sh b/examples/LLM/llama/chat_app/package_llama.sh index b7f2f0f1713..a58d3a3991a 100755 --- a/examples/LLM/llama/chat_app/package_llama.sh +++ b/examples/LLM/llama/chat_app/package_llama.sh @@ -2,12 +2,12 @@ # Check if the argument is empty or unset if [ -z "$1" ]; then echo "Missing Mandatory argument: Path to llama weights" - echo "Usage: ./package_llama.sh ./model/models--meta-llama--Llama-2-7b-chat-hf/snapshots/08751db2aca9bf2f7f80d2e516117a53d7450235" + echo "Usage: ./package_llama.sh ./models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298" exit 1 fi MODEL_GENERATION="true" -LLAMA2_WEIGHTS="$1" +LLAMA_WEIGHTS="$1" if [ -n "$2" ]; then MODEL_GENERATION="$2" @@ -20,18 +20,22 @@ if [ "$MODEL_GENERATION" = "true" ]; then rm -rf build git clone https://github.com/ggerganov/llama.cpp.git build cd build - make + make python -m pip install -r requirements.txt - - echo "Convert the 7B model to ggml FP16 format" - python convert.py $LLAMA2_WEIGHTS --outfile ggml-model-f16.gguf - + + echo "Convert the model to ggml FP16 format" + if [[ $MODEL_NAME == *"Meta-Llama-3"* ]]; then + python convert.py $HF_MODEL_SNAPSHOT --vocab-type bpe,hfft --outfile ggml-model-f16.gguf + else + python convert.py $HF_MODEL_SNAPSHOT --outfile ggml-model-f16.gguf + fi + echo "Quantize the model to 4-bits (using q4_0 method)" ./quantize ggml-model-f16.gguf ../ggml-model-q4_0.gguf q4_0 - + cd .. 
-    export LLAMA2_Q4_MODEL=$PWD/ggml-model-q4_0.gguf
-    echo "Saved quantized model weights to $LLAMA2_Q4_MODEL"
+    export LLAMA_Q4_MODEL=$PWD/ggml-model-q4_0.gguf
+    echo "Saved quantized model weights to $LLAMA_Q4_MODEL"
 fi
 
 echo "Creating torchserve model archive"
@@ -43,4 +47,3 @@ if [ "$MODEL_GENERATION" = "true" ]; then
     echo "Cleaning up build of llama-cpp"
     rm -rf build
 fi
-
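Usage sketch (not part of the diff; commands assembled from the scripts above, and the model identifier is only illustrative): with these changes the Hugging Face model is selected at run time through the MODEL_NAME environment variable instead of being baked into the image at build time, and Download_model.py can also be run standalone with the arguments it defines.

# Build the image (tagged pytorch/torchserve:chat_bot) and start the chat bot,
# passing the model id at run time via -e MODEL_NAME.
./build_image.sh
docker run --rm -it --platform linux/amd64 \
    -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 \
    -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 \
    -v $(pwd)/model_store_1:/home/model-server/model-store \
    -e MODEL_NAME="meta-llama/Meta-Llama-3-8B-Instruct" \
    pytorch/torchserve:chat_bot

# Standalone download with the new script (expects Hugging Face credentials,
# e.g. from a prior huggingface-cli login, since it passes use_auth_token=True).
python Download_model.py --model_path model --model_name meta-llama/Meta-Llama-3-8B-Instruct --revision main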