Benchmark #51
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Benchmark workflow: started manually, by pushes / pull requests that touch
# the listed paths, or by the nightly schedule.
name: Benchmark

on:
  # Manual run: the caller has to pick one of the Azure GPU series below.
  workflow_dispatch:
    inputs:
      gpu-series:
        description: 'Azure GPU series to run with'
        required: true
        type: choice
        options:
          - Standard_NC4as_T4_v3
          - Standard_NC64as_T4_v3
          - Standard_NC24ads_A100_v4
          - Standard_NC48ads_A100_v4
          - Standard_ND96asr_A100_v4
          - Standard_NC40ads_H100_v5
          - Standard_NC80adis_H100_v5

  # Pushes to the listed branches, limited to build/source/bench files.
  # NOTE(review): the first path names server.yml - confirm that it is meant
  # to match this workflow's own file.
  push:
    branches:
      - master
      - hp/server/bench/workflow # FIXME remove
    paths:
      - '.github/workflows/server.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'examples/server/bench/**.*'

  # Pull requests use the same path filter as pushes.
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
    paths:
      - '.github/workflows/server.yml'
      - '**/CMakeLists.txt'
      - '**/Makefile'
      - '**/*.h'
      - '**/*.hpp'
      - '**/*.c'
      - '**/*.cpp'
      - '**/*.cu'
      - '**/*.swift'
      - '**/*.m'
      - 'examples/server/bench/**.*'

  # Every day at minute 04 of hour 2.
  schedule:
    - cron: '04 2 * * *'
# Runs are grouped by workflow name + git ref; starting a new run in a group
# cancels the run of that group that is still in progress.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}
jobs:
  # Baseline server benchmark, pinned to the Standard_NC4as_T4_v3 runner.
  bench-server-baseline:
    runs-on: Standard_NC4as_T4_v3
    # Run for: scheduled events, manual dispatch that selected this GPU series,
    # pull requests, and pushes to master.
    # The push payload has no `push` object (its ref is exposed as
    # github.event.ref / github.ref), so the former
    # `github.event.push.ref == 'refs/heads/master'` test was never true.
    if: ${{ github.event.schedule || github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.pull_request || (github.event_name == 'push' && github.ref == 'refs/heads/master') }}
    steps:
# Check out the repository; fetch-depth 0 requests the complete history
# instead of a shallow clone.
# checkout@v3 runs on the deprecated Node 16 action runtime, hence v4.
# NOTE(review): confirm the self-hosted runner is recent enough for Node 20
# actions.
- name: Clone
  id: checkout
  uses: actions/checkout@v4
  with:
    fetch-depth: 0
# Download Prometheus 2.51.0, unpack it into the workspace root and start it
# in the background with the bench configuration.
- name: Prometheus
  id: install_prometheus
  run: |
    set -eux
    wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
    tar xzf prometheus*.tar.gz --strip-components=1
    ./prometheus --config.file=examples/server/bench/prometheus.yml &
    prometheus_pid=$!

    # Wait until port 9090 accepts connections, but fail the step if the
    # process died or 60 s passed, rather than polling forever.
    deadline=$((SECONDS + 60))
    while ! nc -z localhost 9090; do
      if ! kill -0 "$prometheus_pid" 2>/dev/null; then
        echo "prometheus exited before listening on port 9090" >&2
        exit 1
      fi
      if [ "$SECONDS" -ge "$deadline" ]; then
        echo "timed out waiting for prometheus on port 9090" >&2
        exit 1
      fi
      sleep 0.1
    done
# Configure a Release build in ./build and compile only the `server` target.
- name: Build
  id: cmake_build
  run: |
    set -eux

    # Configure-time options, kept one per line.
    cmake_flags=(
      -DLLAMA_NATIVE=OFF
      -DLLAMA_BUILD_SERVER=ON
      -DLLAMA_CURL=ON
      -DLLAMA_CUBLAS=ON
      -DCUDAToolkit_ROOT=/usr/local/cuda
      -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc
      # NOTE(review): 75 presumably matches the T4 runner's GPU - confirm.
      -DCMAKE_CUDA_ARCHITECTURES=75
      -DLLAMA_FATAL_WARNINGS=OFF
      -DLLAMA_ALL_WARNINGS=OFF
      -DCMAKE_BUILD_TYPE=Release
    )

    mkdir build
    cd build
    cmake .. "${cmake_flags[@]}"
    # One build job per available processor.
    cmake --build . --config Release -j "$(nproc)" --target server
# Fetch the k6 release archive and unpack its contents into the workspace
# root (the top-level archive directory is stripped).
- name: Install k6
  id: k6_installation
  run: |
    k6_version=v0.49.0
    wget --quiet "https://github.com/grafana/k6/releases/download/${k6_version}/k6-${k6_version}-linux-amd64.tar.gz"
    tar xzf k6*.tar.gz --strip-components=1
# Place the ShareGPT dataset file in examples/server/bench, the directory the
# benchmark step later runs from.
- name: Download the dataset
  id: download_dataset
  run: |
    wget --quiet --directory-prefix=examples/server/bench \
      https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
# Start the server in the background, wait until it answers HTTP 200, then
# drive it with the k6 script from examples/server/bench.
- name: Server bench
  id: server_bench
  run: |
    set -eux

    # Abort the step when the deadline (a bash $SECONDS value, $1) has passed,
    # so a service that never comes up fails the job instead of hanging it.
    # $2 is a label used in the error message.
    check_deadline() {
      if [ "$SECONDS" -ge "$1" ]; then
        echo "timed out waiting for $2" >&2
        exit 1
      fi
    }

    # Abort the step when the backgrounded server process is gone; otherwise
    # the wait loops below would poll a dead server.
    check_server_alive() {
      if ! kill -0 "$server_pid" 2>/dev/null; then
        echo "server process exited unexpectedly" >&2
        exit 1
      fi
    }

    # ensure prometheus is started
    deadline=$((SECONDS + 60))
    while ! nc -z localhost 9090; do
      check_deadline "$deadline" "prometheus on port 9090"
      sleep 0.1
    done

    # --log-format is passed once; the original repeated the same flag.
    ./build/bin/server \
      --host 0.0.0.0 \
      --port 8080 \
      --hf-repo ggml-org/models \
      --hf-file phi-2/ggml-model-q4_0.gguf \
      --model /models/phi-2/ggml-model-q4_0.gguf \
      --metrics \
      --parallel 8 \
      --batch-size 2048 \
      --ubatch-size 256 \
      --n-predict 2048 \
      --ctx-size 16384 \
      --defrag-thold 0.1 \
      --log-format text \
      -ngl 33 &
    server_pid=$!

    # One shared budget for start-up (listen + model load).
    # NOTE(review): 20 min is a guess that leaves room for a first-time model
    # download - tune as needed.
    deadline=$((SECONDS + 1200))

    # wait for the server to listen
    while ! nc -z localhost 8080; do
      check_server_alive
      check_deadline "$deadline" "the server to listen on port 8080"
      sleep 0.1
    done

    # wait for the server to load the model
    while [[ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8080)" != "200" ]]; do
      check_server_alive
      check_deadline "$deadline" "the server to answer HTTP 200"
      sleep 0.5
    done

    cd examples/server/bench
    SERVER_BENCH_N_PROMPTS=1000 \
    SERVER_BENCH_MAX_PROMPT_TOKENS=1024 \
    SERVER_BENCH_MAX_CONTEXT=2048 \
    SERVER_BENCH_MAX_TOKENS=1024 \
    ../../../k6 run script.js --duration 10m --iterations 1000 --vus 8