Commit c1db5c8: Updated chat bot example to support llama3
mreso committed May 3, 2024 (1 parent: 8dacf69)

Showing 8 changed files with 112 additions and 60 deletions.
9 changes: 6 additions & 3 deletions examples/LLM/llama/chat_app/docker/Dockerfile
@@ -3,20 +3,23 @@ ARG BASE_IMAGE=pytorch/torchserve:latest-gpu
FROM $BASE_IMAGE as server
ARG BASE_IMAGE
ARG EXAMPLE_DIR
-ARG MODEL_NAME
ARG HUGGINGFACE_TOKEN

USER root

-ENV MODEL_NAME=$MODEL_NAME
-
RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \
    apt-get update && \
    apt-get install libopenmpi-dev git -y

COPY $EXAMPLE_DIR/requirements.txt /home/model-server/chat_bot/requirements.txt
RUN pip install -r /home/model-server/chat_bot/requirements.txt && huggingface-cli login --token $HUGGINGFACE_TOKEN

+WORKDIR /home/model-server/chat_bot
+RUN git clone https://github.com/ggerganov/llama.cpp.git build && \
+    cd build && \
+    make && \
+    python -m pip install -r requirements.txt
+
COPY $EXAMPLE_DIR /home/model-server/chat_bot
COPY $EXAMPLE_DIR/dockerd-entrypoint.sh /usr/local/bin/dockerd-entrypoint.sh
COPY $EXAMPLE_DIR/config.properties /home/model-server/config.properties
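With llama.cpp now cloned and compiled at image-build time, container startup only has to download and quantize the selected model. A minimal build sketch, mirroring the build_image.sh invocation below (the token value is a placeholder; leaving the value off --build-arg makes docker read it from the caller's environment):

    export HUGGINGFACE_TOKEN=hf_xxx   # placeholder token
    DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 \
        --file Dockerfile \
        --build-arg BASE_IMAGE="pytorch/torchserve:latest-cpu" \
        --build-arg EXAMPLE_DIR="." \
        --build-arg HUGGINGFACE_TOKEN \
        -t pytorch/torchserve:chat_bot .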
52 changes: 52 additions & 0 deletions examples/LLM/llama/chat_app/docker/Download_model.py
@@ -0,0 +1,52 @@
+import argparse
+import os
+
+from huggingface_hub import HfApi, snapshot_download
+
+
+def dir_path(path_str):
+    if os.path.isdir(path_str):
+        return path_str
+    elif input(f"{path_str} does not exist, create directory? [y/n]").lower() == "y":
+        os.makedirs(path_str)
+        return path_str
+    else:
+        raise NotADirectoryError(path_str)
+
+
+class HFModelNotFoundError(Exception):
+    def __init__(self, model_str):
+        super().__init__(f"HuggingFace model not found: '{model_str}'")
+
+
+def hf_model(model_str):
+    api = HfApi()
+    models = [m.modelId for m in api.list_models()]
+    if model_str in models:
+        return model_str
+    else:
+        raise HFModelNotFoundError(model_str)
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--model_path",
+    "-o",
+    type=dir_path,
+    default="model",
+    help="Output directory for downloaded model files",
+)
+parser.add_argument(
+    "--model_name", "-m", type=hf_model, required=True, help="HuggingFace model name"
+)
+parser.add_argument("--revision", "-r", type=str, default="main", help="Revision")
+args = parser.parse_args()
+
+snapshot_path = snapshot_download(
+    repo_id=args.model_name,
+    revision=args.revision,
+    cache_dir=args.model_path,
+    use_auth_token=True,
+    ignore_patterns=["original/*", "pytorch_model*.bin"],
+)
+print(f"Files for '{args.model_name}' are downloaded to '{snapshot_path}'")
16 changes: 4 additions & 12 deletions examples/LLM/llama/chat_app/docker/build_image.sh
@@ -1,17 +1,8 @@
#!/bin/bash

-# Check if there are enough arguments
-if [ "$#" -eq 0 ] || [ "$#" -gt 1 ]; then
-    echo "Usage: $0 <HF Model>"
-    exit 1
-fi
-
-MODEL_NAME=$(echo "$1" | sed 's/\//---/g')
-echo "Model: " $MODEL_NAME
-
BASE_IMAGE="pytorch/torchserve:latest-cpu"

-DOCKER_TAG="pytorch/torchserve:${MODEL_NAME}"
+DOCKER_TAG="pytorch/torchserve:chat_bot"

# Get relative path of example dir
EXAMPLE_DIR=$(dirname "$(readlink -f "$0")")
@@ -20,9 +11,10 @@ ROOT_DIR=$(realpath "$ROOT_DIR")
EXAMPLE_DIR=$(echo "$EXAMPLE_DIR" | sed "s|$ROOT_DIR|./|")

# Build docker image for the application
-DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --file ${EXAMPLE_DIR}/Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" --build-arg MODEL_NAME="${MODEL_NAME}" --build-arg HUGGINGFACE_TOKEN -t "${DOCKER_TAG}" .
+DOCKER_BUILDKIT=1 docker buildx build --platform=linux/amd64 --file ${EXAMPLE_DIR}/Dockerfile --build-arg BASE_IMAGE="${BASE_IMAGE}" --build-arg EXAMPLE_DIR="${EXAMPLE_DIR}" --build-arg HUGGINGFACE_TOKEN -t "${DOCKER_TAG}" .

echo "Run the following command to start the chat bot"
echo ""
-echo docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v $(pwd)/model_store_1:/home/model-server/model-store $DOCKER_TAG
+echo docker run --rm -it --platform linux/amd64 -p 127.0.0.1:8080:8080 -p 127.0.0.1:8081:8081 -p 127.0.0.1:8082:8082 -p 127.0.0.1:8084:8084 -p 127.0.0.1:8085:8085 -v $(pwd)/model_store_1:/home/model-server/model-store -e MODEL_NAME="meta-llama/Llama-2-7b-chat-hf" $DOCKER_TAG
echo ""
echo "Note: You can replace the model identifier as needed"
1 change: 1 addition & 0 deletions examples/LLM/llama/chat_app/docker/client_app.py
@@ -6,6 +6,7 @@
import streamlit as st

MODEL_NAME = os.environ["MODEL_NAME"]
+MODEL_NAME = MODEL_NAME.replace('/',"---")

# App title
st.set_page_config(page_title="TorchServe Chatbot")
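This mirrors the sed 's/\//---/g' mangling in dockerd-entrypoint.sh: the HuggingFace repo id contains a slash, which cannot appear in the TorchServe model and directory names, so both Streamlit apps now flatten it the same way. A quick check of the mapping:

    python -c 'print("meta-llama/Meta-Llama-3-8B-Instruct".replace("/", "---"))'
    # meta-llama---Meta-Llama-3-8B-Instruct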
66 changes: 33 additions & 33 deletions examples/LLM/llama/chat_app/docker/dockerd-entrypoint.sh
@@ -1,75 +1,75 @@
#!/bin/bash
set -e

-export LLAMA2_Q4_MODEL=/home/model-server/model-store/$MODEL_NAME/model/ggml-model-q4_0.gguf
+MODEL_DIR=$(echo "$MODEL_NAME" | sed 's/\//---/g')
+
+export LLAMA_Q4_MODEL=/home/model-server/model-store/$MODEL_DIR/model/ggml-model-q4_0.gguf


create_model_cfg_yaml() {
    # Define the YAML content with a placeholder for the model name
    yaml_content="# TorchServe frontend parameters\nminWorkers: 1\nmaxWorkers: 1\nresponseTimeout: 1200\n#deviceType: \"gpu\"\n#deviceIds: [0,1]\n#torchrun:\n# nproc-per-node: 1\n\nhandler:\n model_name: \"${2}\"\n manual_seed: 40"

-    # Create the YAML file with the specified model name
-    echo -e "$yaml_content" > "model-config-${1}.yaml"
+    # Create the YAML file
+    echo -e "$yaml_content" > "model-config.yaml"
}

create_model_archive() {
-    MODEL_NAME=$1
-    MODEL_CFG=$2
-    echo "Create model archive for ${MODEL_NAME} if it doesn't already exist"
-    if [ -d "/home/model-server/model-store/$MODEL_NAME" ]; then
-        echo "Model archive for $MODEL_NAME exists."
+    MODEL_DIR=$1
+    echo "Create model archive for ${MODEL_DIR} if it doesn't already exist"
+    if [ -d "/home/model-server/model-store/$MODEL_DIR" ]; then
+        echo "Model archive for $MODEL_DIR exists."
    fi
-    if [ -d "/home/model-server/model-store/$MODEL_NAME/model" ]; then
+    if [ -d "/home/model-server/model-store/$MODEL_DIR/model" ]; then
        echo "Model already downloaded"
-        mv /home/model-server/model-store/$MODEL_NAME/model /home/model-server/model-store/
+        mv /home/model-server/model-store/$MODEL_DIR/model /home/model-server/model-store/
    else
        echo "Model needs to be downloaded"
    fi
-    torch-model-archiver --model-name "$MODEL_NAME" --version 1.0 --handler llama_cpp_handler.py --config-file $MODEL_CFG -r requirements.txt --archive-format no-archive --export-path /home/model-server/model-store -f
+    torch-model-archiver --model-name "$MODEL_DIR" --version 1.0 --handler llama_cpp_handler.py --config-file "model-config.yaml" -r requirements.txt --archive-format no-archive --export-path /home/model-server/model-store -f
    if [ -d "/home/model-server/model-store/model" ]; then
-        mv /home/model-server/model-store/model /home/model-server/model-store/$MODEL_NAME/
+        mv /home/model-server/model-store/model /home/model-server/model-store/$MODEL_DIR/
    fi
}

download_model() {
-    MODEL_NAME=$1
-    HF_MODEL_NAME=$2
-    if [ -d "/home/model-server/model-store/$MODEL_NAME/model" ]; then
-        echo "Model $HF_MODEL_NAME already downloaded"
+    MODEL_DIR=$1
+    MODEL_NAME=$2
+    if [ -d "/home/model-server/model-store/$MODEL_DIR/model" ]; then
+        echo "Model $MODEL_NAME already downloaded"
    else
-        echo "Downloading model $HF_MODEL_NAME"
-        python Download_model.py --model_path /home/model-server/model-store/$MODEL_NAME/model --model_name $HF_MODEL_NAME
+        echo "Downloading model $MODEL_NAME"
+        python Download_model.py --model_path /home/model-server/model-store/$MODEL_DIR/model --model_name $MODEL_NAME
    fi
}

quantize_model() {
-    if [ ! -f "$LLAMA2_Q4_MODEL" ]; then
-        tmp_model_name=$(echo "$MODEL_NAME" | sed 's/---/--/g')
-        directory_path=/home/model-server/model-store/$MODEL_NAME/model/models--$tmp_model_name/snapshots/
+    if [ ! -f "$LLAMA_Q4_MODEL" ]; then
+        tmp_model_name=$(echo "$MODEL_DIR" | sed 's/---/--/g')
+        directory_path=/home/model-server/model-store/$MODEL_DIR/model/models--$tmp_model_name/snapshots/
        HF_MODEL_SNAPSHOT=$(find $directory_path -type d -mindepth 1)
        echo "Cleaning up previous build of llama-cpp"
-        git clone https://github.com/ggerganov/llama.cpp.git build
        cd build
-        make
-        python -m pip install -r requirements.txt

-        echo "Convert the 7B model to ggml FP16 format"
-        python convert.py $HF_MODEL_SNAPSHOT --outfile ggml-model-f16.gguf
+        echo "Convert the model to ggml FP16 format"
+        if [[ $MODEL_NAME == *"Meta-Llama-3"* ]]; then
+            python convert.py $HF_MODEL_SNAPSHOT --vocab-type bpe,hfft --outfile ggml-model-f16.gguf
+        else
+            python convert.py $HF_MODEL_SNAPSHOT --outfile ggml-model-f16.gguf
+        fi

        echo "Quantize the model to 4-bits (using q4_0 method)"
-        ./quantize ggml-model-f16.gguf $LLAMA2_Q4_MODEL q4_0
+        ./quantize ggml-model-f16.gguf $LLAMA_Q4_MODEL q4_0

        cd ..
-        echo "Saved quantized model weights to $LLAMA2_Q4_MODEL"
+        echo "Saved quantized model weights to $LLAMA_Q4_MODEL"
    fi
}

-HF_MODEL_NAME=$(echo "$MODEL_NAME" | sed 's/---/\//g')
if [[ "$1" = "serve" ]]; then
    shift 1
-    create_model_cfg_yaml $MODEL_NAME $HF_MODEL_NAME
-    create_model_archive $MODEL_NAME "model-config-$MODEL_NAME.yaml"
-    download_model $MODEL_NAME $HF_MODEL_NAME
+    create_model_cfg_yaml $MODEL_DIR $MODEL_NAME
+    create_model_archive $MODEL_DIR
+    download_model $MODEL_DIR $MODEL_NAME
    quantize_model
    streamlit run torchserve_server_app.py --server.port 8084 &
    streamlit run client_app.py --server.port 8085
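Net effect: the entrypoint derives everything from the runtime MODEL_NAME instead of a baked-in build argument. A sketch of the serve path for a llama3 id (values illustrative):

    MODEL_NAME="meta-llama/Meta-Llama-3-8B-Instruct"
    MODEL_DIR=$(echo "$MODEL_NAME" | sed 's/\//---/g')   # meta-llama---Meta-Llama-3-8B-Instruct
    create_model_cfg_yaml $MODEL_DIR $MODEL_NAME   # model-config.yaml carries the HF id
    create_model_archive $MODEL_DIR                # archive is named after the mangled id
    download_model $MODEL_DIR $MODEL_NAME          # unmangled id is the HF repo to fetch
    quantize_model                                 # Meta-Llama-3 ids take the bpe,hfft vocab branch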
2 changes: 1 addition & 1 deletion examples/LLM/llama/chat_app/docker/llama_cpp_handler.py
@@ -23,7 +23,7 @@ def initialize(self, ctx):
        ctx (context): It is a JSON Object containing information
        pertaining to the model artifacts parameters.
        """
-        model_path = os.environ["LLAMA2_Q4_MODEL"]
+        model_path = os.environ["LLAMA_Q4_MODEL"]
        model_name = ctx.model_yaml_config["handler"]["model_name"]
        seed = int(ctx.model_yaml_config["handler"]["manual_seed"])
        torch.manual_seed(seed)
1 change: 1 addition & 0 deletions examples/LLM/llama/chat_app/docker/torchserve_server_app.py
@@ -7,6 +7,7 @@
import streamlit as st

MODEL_NAME = os.environ["MODEL_NAME"]
+MODEL_NAME = MODEL_NAME.replace('/',"---")
MODEL = MODEL_NAME.split("---")[1]

# App title
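Here the mangled name is also split back apart: MODEL_NAME.split("---")[1] recovers the bare model name (presumably for display in the server app). For example:

    python -c 'print("meta-llama---Meta-Llama-3-8B-Instruct".split("---")[1])'
    # Meta-Llama-3-8B-Instruct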
25 changes: 14 additions & 11 deletions examples/LLM/llama/chat_app/package_llama.sh
@@ -2,12 +2,12 @@
# Check if the argument is empty or unset
if [ -z "$1" ]; then
    echo "Missing Mandatory argument: Path to llama weights"
-    echo "Usage: ./package_llama.sh ./model/models--meta-llama--Llama-2-7b-chat-hf/snapshots/08751db2aca9bf2f7f80d2e516117a53d7450235"
+    echo "Usage: ./package_llama.sh ./models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298"
    exit 1
fi

MODEL_GENERATION="true"
-LLAMA2_WEIGHTS="$1"
+LLAMA_WEIGHTS="$1"

if [ -n "$2" ]; then
    MODEL_GENERATION="$2"
@@ -20,18 +20,22 @@ if [ "$MODEL_GENERATION" = "true" ]; then
    rm -rf build
    git clone https://github.com/ggerganov/llama.cpp.git build
    cd build
    make
    python -m pip install -r requirements.txt

-    echo "Convert the 7B model to ggml FP16 format"
-    python convert.py $LLAMA2_WEIGHTS --outfile ggml-model-f16.gguf
+    echo "Convert the model to ggml FP16 format"
+    if [[ $LLAMA_WEIGHTS == *"Meta-Llama-3"* ]]; then
+        python convert.py $LLAMA_WEIGHTS --vocab-type bpe,hfft --outfile ggml-model-f16.gguf
+    else
+        python convert.py $LLAMA_WEIGHTS --outfile ggml-model-f16.gguf
+    fi

    echo "Quantize the model to 4-bits (using q4_0 method)"
    ./quantize ggml-model-f16.gguf ../ggml-model-q4_0.gguf q4_0

    cd ..
-    export LLAMA2_Q4_MODEL=$PWD/ggml-model-q4_0.gguf
-    echo "Saved quantized model weights to $LLAMA2_Q4_MODEL"
+    export LLAMA_Q4_MODEL=$PWD/ggml-model-q4_0.gguf
+    echo "Saved quantized model weights to $LLAMA_Q4_MODEL"
fi

echo "Creating torchserve model archive"
@@ -43,4 +47,3 @@ if [ "$MODEL_GENERATION" = "true" ]; then
echo "Cleaning up build of llama-cpp"
rm -rf build
fi
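The standalone packaging script keeps its path-based interface; a usage sketch (the snapshot hash is illustrative, and the optional second argument toggles weight generation as shown above):

    # Convert, quantize, and archive:
    ./package_llama.sh ./models--meta-llama--Meta-Llama-3-8B-Instruct/snapshots/e5e23bbe8e749ef0efcf16cad411a7d23bd23298
    # Skip weight generation and only rebuild the archive:
    ./package_llama.sh <path_to_weights> false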
