forked from johncordeiro/worker-runpod-vllm
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Dockerfile
103 lines (80 loc) · 3.28 KB
/
Dockerfile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# Base image
# The following docker base image is recommended by VLLM:
FROM runpod/pytorch:2.1.0-py3.10-cuda11.8.0-devel-ubuntu22.04

# Use bash with pipefail so a failure anywhere in a piped RUN aborts the build (hadolint DL4006)
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Set the working directory
WORKDIR /

# Keep apt non-interactive during the build only; ARG does not leak into the runtime env
ARG DEBIAN_FRONTEND=noninteractive

# Install supported GCC version
#RUN apt-get update && \
#    apt-get install -y gcc-11 g++-11 && \
#    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 50 --slave /usr/bin/g++ g++ /usr/bin/g++-11

# Reinstall torch pinned against the CUDA 11.8 wheel index.
# --no-cache-dir keeps pip's download cache out of the image layers (hadolint DL3042).
# NOTE(review): `-f`/`--find-links` expects a links page; the cu118 wheels are
# normally consumed via `--index-url https://download.pytorch.org/whl/cu118` —
# confirm the CUDA (not CPU) build of torch actually gets installed here.
RUN pip install --no-cache-dir --upgrade pip
RUN pip uninstall torch -y
RUN pip install --no-cache-dir torch==2.1.0 -f https://download.pytorch.org/whl/cu118

# Run the one-off environment setup script; chmod, execute, and remove in a
# single layer, using the absolute path it was copied to.
COPY builder/setup.sh /setup.sh
RUN chmod +x /setup.sh && \
    bash /setup.sh && \
    rm /setup.sh
# Set CUDA environment variables
# NOTE(review): the conventional name is CUDA_HOME (uppercase); the lowercase
# `cuda_home` is kept as-is because downstream scripts may already read it.
ENV cuda_home=/usr/local/cuda-11.8
ENV PATH=${cuda_home}/bin:$PATH
ENV LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

# Build-time diagnostics: record the installed torch version and its CUDA build
RUN echo "$(pip list | grep torch)"
RUN echo "$(python -c 'import torch; print(torch.version.cuda)')"

# TODO: change huggingface_hub when hotfix is released
# NOTE(review): `github.com` looks like a mirror/garbled host — verify it
# should not be `github.com` before building.
# --no-cache-dir keeps pip's download cache out of the image layer (hadolint DL3042)
RUN pip install --no-cache-dir fastapi==0.110.0 \
    vllm==0.4.0 \
    git+https://github.com/huggingface/huggingface_hub@2186-fix-safetensors-info \
    runpod==1.6.2 \
    flash-attn==2.5.6

# Re-check torch after the dependency install: vllm / flash-attn can pull in a
# different torch build and silently break the CUDA pairing established above.
RUN echo "$(pip list | grep torch)"
RUN echo "$(python -c 'import torch; print(torch.version.cuda)')"
# Add src files (Worker Template)
# COPY is preferred over ADD for plain local files (hadolint DL3020)
COPY src .

# Mark the worker scripts executable in a single chmod invocation
RUN chmod +x \
    ./benchmark.py \
    ./download_model.py \
    ./handler.py \
    ./metrics.py \
    ./templates.py \
    ./entrypoint.sh

# Prepare the models inside the docker image
# WARNING(security): a token passed via --build-arg and re-exported as ENV is
# baked into the image (`docker history` and the runtime env). Prefer a BuildKit
# secret mount (RUN --mount=type=secret,...) so it never lands in a layer.
ARG HUGGING_FACE_HUB_TOKEN=
ENV HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN
ENV HF_TOKEN=$HUGGING_FACE_HUB_TOKEN

# Prepare argument for the model and tokenizer; each ARG is mirrored into ENV
# so the values are available both at build time and to the running handler.
ARG MODEL_NAME='Weni/WeniGPT-QA-Mixstral-7B-5.0.0-KTO-AWQ'
ENV MODEL_NAME=$MODEL_NAME
ARG MODEL_REVISION="main"
ENV MODEL_REVISION=$MODEL_REVISION
ARG MODEL_BASE_PATH="/runpod-volume/"
ENV MODEL_BASE_PATH=$MODEL_BASE_PATH
ARG TOKENIZER='Weni/WeniGPT-QA-Mixstral-7B-5.0.0-KTO-AWQ'
ENV TOKENIZER=$TOKENIZER
ARG STREAMING='false'
ENV STREAMING=$STREAMING
ARG DOWNLOAD_MODEL

# Point all HuggingFace caches at the RunPod network volume so downloaded
# weights persist across container restarts.
# NOTE(review): TRANSFORMERS_CACHE is deprecated in recent transformers
# releases in favor of HF_HOME; kept for compatibility with older code.
ENV HF_DATASETS_CACHE="/runpod-volume/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="/runpod-volume/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="/runpod-volume/huggingface-cache/hub"
# Download the models
RUN mkdir -p /model

# Install the container entrypoint at an absolute path and make it executable.
# The previous `ENTRYPOINT ["docker-entrypoint.sh"]` (exec form, no slash) is
# resolved via PATH — not via WORKDIR — so the container could never find the
# script in `/`; COPY also does not guarantee the execute bit.
COPY docker-entrypoint.sh /docker-entrypoint.sh
RUN chmod +x /docker-entrypoint.sh

# Set environment variables (runtime defaults for the serving process)
ENV PORT=80 \
    MODEL_NAME=$MODEL_NAME \
    MODEL_REVISION=$MODEL_REVISION \
    MODEL_BASE_PATH=$MODEL_BASE_PATH \
    HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN

# Conditionally download the model weights based on DOWNLOAD_MODEL
RUN if [ "$DOWNLOAD_MODEL" = "1" ]; then \
        python -u /download_model.py; \
    fi

# EXPOSE is documentation only (ports are not published automatically);
# presumably 8000 = vLLM API, 6379 = redis, 80 = HTTP — confirm with the entrypoint.
EXPOSE 8000 6379 80

ENTRYPOINT ["/docker-entrypoint.sh"]

# Start the handler
#CMD STREAMING=$STREAMING MODEL_NAME=$MODEL_NAME MODEL_BASE_PATH=$MODEL_BASE_PATH TOKENIZER=$TOKENIZER python -u /handler.py
#ENTRYPOINT ["./entrypoint.sh"]
#CMD ["--model", "KaleDivergence/WeniGPT-L-70-AWQ-NO-SAFETENSORS", "--host", "0.0.0.0", "--port", "8000", "--gpu-memory-utilization", "0.95", "--tensor-parallel-size", "1", "--tokenizer-mode", "auto", "--seed", "0", "--quantization", "awq", "--dtype", "half"]