server: bench: create a script to wrap all operations #52

Workflow file for this run

	# Benchmark
	#
	# Runs the server benchmark on manual dispatch (with a choice of Azure GPU
	# series), on pushes and pull requests that touch build, source or bench
	# files, and on a daily schedule.
	name: Benchmark

	on:
	  workflow_dispatch:
	    inputs:
	      gpu-series:
	        description: 'Azure GPU series to run with'
	        required: true
	        type: choice
	        options:
	          - Standard_NC4as_T4_v3
	          - Standard_NC64as_T4_v3
	          - Standard_NC24ads_A100_v4
	          - Standard_NC48ads_A100_v4
	          - Standard_ND96asr_A100_v4
	          - Standard_NC40ads_H100_v5
	          - Standard_NC80adis_H100_v5
	  push:
	    branches:
	      - master
	      - hp/server/bench/workflow  # FIXME remove
	    # Path filters are relative to the repository root and must not start
	    # with '/'; '**/' matches the file at any directory depth.
	    # NOTE(review): confirm 'server.yml' is this workflow's own file name.
	    paths:
	      - '.github/workflows/server.yml'
	      - '**/CMakeLists.txt'
	      - '**/Makefile'
	      - '**/*.h'
	      - '**/*.hpp'
	      - '**/*.c'
	      - '**/*.cpp'
	      - '**/*.cu'
	      - '**/*.swift'
	      - '**/*.m'
	      - 'examples/server/bench/**'
	  pull_request:
	    types: [opened, synchronize, reopened]
	    # Keep this list in sync with the push filter above.
	    paths:
	      - '.github/workflows/server.yml'
	      - '**/CMakeLists.txt'
	      - '**/Makefile'
	      - '**/*.h'
	      - '**/*.hpp'
	      - '**/*.c'
	      - '**/*.cpp'
	      - '**/*.cu'
	      - '**/*.swift'
	      - '**/*.m'
	      - 'examples/server/bench/**'
	  schedule:
	    # Daily at 02:04 (GitHub evaluates cron schedules in UTC).
	    - cron: '04 2 * * *'

	# At most one active run per workflow and ref: starting a new run cancels
	# the one already in progress for the same group.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	jobs:
	  # Builds the server with CUDA, runs the k6-driven benchmark against it and,
	  # on pull requests, posts the measured request duration as a PR comment.
	  bench-server-baseline:
	    runs-on: Standard_NC4as_T4_v3
	    # Run on schedule, on a manual dispatch that selected this GPU series,
	    # on pull requests, and on pushes to master. The push payload has no
	    # `push` object, so the branch is checked through `github.ref`.
	    if: >-
	      ${{ github.event.schedule
	      || github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3'
	      || github.event.pull_request
	      || (github.event_name == 'push' && github.ref == 'refs/heads/master') }}
	    steps:
	      - name: Clone
	        id: checkout
	        uses: actions/checkout@v3
	        with:
	          # Fetch the full history instead of a shallow clone.
	          fetch-depth: 0

	      - name: Install python env
	        id: pipenv
	        run: |
	          cd examples/server/bench
	          python -m venv venv
	          source venv/bin/activate
	          pip install -r requirements.txt

	      - name: Prometheus
	        id: install_prometheus
	        run: |
	          wget --quiet https://github.com/prometheus/prometheus/releases/download/v2.51.0/prometheus-2.51.0.linux-amd64.tar.gz
	          tar xzf prometheus*.tar.gz --strip-components=1
	          ./prometheus --config.file=examples/server/bench/prometheus.yml &
	          # Block until Prometheus accepts connections on its port.
	          while ! nc -z localhost 9090; do
	            sleep 0.1
	          done

	      - name: Build
	        id: cmake_build
	        run: |
	          set -eux
	          mkdir build
	          cd build
	          cmake .. \
	            -DLLAMA_NATIVE=OFF \
	            -DLLAMA_BUILD_SERVER=ON \
	            -DLLAMA_CURL=ON \
	            -DLLAMA_CUBLAS=ON \
	            -DCUDAToolkit_ROOT=/usr/local/cuda \
	            -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \
	            -DCMAKE_CUDA_ARCHITECTURES=75 \
	            -DLLAMA_FATAL_WARNINGS=OFF \
	            -DLLAMA_ALL_WARNINGS=OFF \
	            -DCMAKE_BUILD_TYPE=Release;
	          cmake --build . --config Release -j "$(nproc)" --target server

	      - name: Install k6
	        id: k6_installation
	        run: |
	          wget --quiet https://github.com/grafana/k6/releases/download/v0.49.0/k6-v0.49.0-linux-amd64.tar.gz
	          tar xzf k6*.tar.gz --strip-components=1

	      - name: Download the dataset
	        id: download_dataset
	        run: |
	          cd examples/server/bench
	          wget --quiet https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

	      # Every `run` step starts a new shell, so the virtualenv created in
	      # "Install python env" has to be activated again before bench.py runs.
	      # NOTE(review): k6 is unpacked in the repository root by "Install k6",
	      # while this step runs from examples/server/bench; confirm that
	      # bench.py resolves BENCH_K6_BIN_PATH=./k6 to the right binary.
	      - name: Server bench
	        id: server_bench
	        run: |
	          set -eux

	          cd examples/server/bench
	          source venv/bin/activate
	          BENCH_K6_BIN_PATH=./k6 python bench.py \
	            --runner-label Standard_NC4as_T4_v3 \
	            --scenario script.js \
	            --duration 5m \
	            --hf-repo ggml-org/models \
	            --hf-file phi-2/ggml-model-q4_0.gguf \
	            --model-path-prefix /models/phi-2/ggml-model-q4_0.gguf \
	            --parallel 8 \
	            -ngl 33 \
	            --batch-size 2048 \
	            --ubatch-size 256 \
	            --ctx-size 16384 \
	            --n-prompts 1000 \
	            --max-prompt-tokens 1024 \
	            --max-tokens 2048

	          # Export the benchmark results to the following steps.
	          cat results.github.env >> "$GITHUB_ENV"

	      # Only pull request runs have a PR to comment on. Action inputs are not
	      # shell-expanded, so the value is read through the `env` context.
	      - name: Comment PR
	        if: ${{ github.event_name == 'pull_request' }}
	        uses: thollander/actions-comment-pull-request@v2
	        with:
	          message: |
	            Request duration: ${{ env.HTTP_REQ_DURATION_AVG }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

server: bench: create a script to wrap all operations #52

Workflow file

server: bench: create a script to wrap all operations #52

Jobs

Run details

Workflow file for this run