Skip to content

Commit

Permalink
Merge pull request #550 from rhatdan/nvidia
Browse files Browse the repository at this point in the history
Make more options optional
  • Loading branch information
rhatdan committed Jun 12, 2024
2 parents 7e17de4 + 7f07935 commit 178f2c1
Show file tree
Hide file tree
Showing 7 changed files with 46 additions and 49 deletions.
12 changes: 11 additions & 1 deletion training/amd-bootc/Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,17 @@ RUN sed -i -e '/additionalimage.*/a "/usr/lib/containers/storage",' \
cp /run/.input/ilab /usr/local/bin/ilab

ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-amd:latest"
ARG VLLM_IMAGE
ARG VLLM_IMAGE="quay.io/ai-lab/vllm:latest"

ARG SSHPUBKEY

# Optional: the --build-arg "SSHPUBKEY=$(cat ~/.ssh/id_rsa.pub)" option inserts
# your public key into the image, allowing root access via ssh. When SSHPUBKEY
# is empty or unset this step does nothing.
# The key is written with printf and a quoted expansion so the value reaches
# root.keys verbatim (no word splitting, globbing, or collapsed newlines).
RUN if [ -n "${SSHPUBKEY}" ]; then \
set -eu; mkdir -p /usr/ssh && \
echo 'AuthorizedKeysFile /usr/ssh/%u.keys .ssh/authorized_keys .ssh/authorized_keys2' >> /etc/ssh/sshd_config.d/30-auth-system.conf && \
printf '%s\n' "${SSHPUBKEY}" > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
fi

RUN sed -i 's/__REPLACE_TRAIN_DEVICE__/cuda/' /usr/local/bin/ilab
RUN sed -i 's/__REPLACE_CONTAINER_DEVICE__/nvidia.com\/gpu=all/' /usr/local/bin/ilab
Expand Down
5 changes: 3 additions & 2 deletions training/amd-bootc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ bootc: prepare-files growfs
$(ARCH:%=--platform linux/%) \
$(EXTRA_RPM_PACKAGES:%=--build-arg EXTRA_RPM_PACKAGES=%) \
$(FROM:%=--from=%) \
$(INSTRUCTLAB_IMAGE:%=--build-arg INSTRUCTLAB_IMAGE=%) \
$(SOURCE_DATE_EPOCH:%=--timestamp=%) \
--build-arg "INSTRUCTLAB_IMAGE=$(INSTRUCTLAB_IMAGE)" \
--build-arg "VLLM_IMAGE=$(VLLM_IMAGE)" \
$(VLLM_IMAGE:%=--build-arg VLLM_IMAGE=%) \
$(if $(strip $(SSH_PUBKEY)),--build-arg "SSHPUBKEY=$(SSH_PUBKEY)") \
--cap-add SYS_ADMIN \
--file Containerfile \
--security-opt label=disable \
Expand Down
26 changes: 2 additions & 24 deletions training/common/Makefile.common
Original file line number Diff line number Diff line change
Expand Up @@ -32,39 +32,17 @@ KERNEL_VERSION ?=
INSTRUCTLAB_IMAGE = $(REGISTRY)/$(REGISTRY_ORG)/instructlab-$(VENDOR):$(IMAGE_TAG)
VLLM_IMAGE = $(REGISTRY)/$(REGISTRY_ORG)/vllm:$(IMAGE_TAG)
TRAIN_IMAGE = $(REGISTRY)/$(REGISTRY_ORG)/deepspeed-trainer:$(IMAGE_TAG)
INSTRUCTLAB_IMAGE_ID = $(shell $(CONTAINER_TOOL) image inspect $(INSTRUCTLAB_IMAGE) --format {{.Id}})
VLLM_IMAGE_ID = $(shell $(CONTAINER_TOOL) image inspect $(VLLM_IMAGE) --format {{.Id}})
TRAIN_IMAGE_ID = $(shell $(CONTAINER_TOOL) image inspect $(TRAIN_IMAGE) --format {{.Id}})
WRAPPER = $(CURDIR)/../ilab-wrapper/ilab
QLORA_WRAPPER = $(CURDIR)/../ilab-wrapper/ilab-qlora
TRAIN_WRAPPER = $(CURDIR)/../ilab-wrapper/ilab-training-launcher
OUTDIR = $(CURDIR)/../build

# Default public key: the contents of ~/.ssh/id_rsa.pub, or empty when that
# file cannot be read (cat's error output is discarded). Only applied when
# SSH_PUBKEY is not already set, e.g. on the command line or in the environment.
SSH_PUBKEY ?= $(shell cat ${HOME}/.ssh/id_rsa.pub 2> /dev/null)

.PHONY: prepare-files
prepare-files: $(OUTDIR)/$(WRAPPER) $(OUTDIR)/$(QLORA_WRAPPER) $(OUTDIR)/$(TRAIN_WRAPPER) $(OUTDIR)/$(INSTRUCTLAB_IMAGE_ID) $(OUTDIR)/$(VLLM_IMAGE_ID) $(OUTDIR)/$(TRAIN_IMAGE_ID)
prepare-files: $(OUTDIR)

# Create the shared build output directory. Declared phony, so the recipe
# runs on every invocation; mkdir -p makes that harmless when it exists.
.PHONY: $(OUTDIR)
$(OUTDIR):
	mkdir -p $@

$(OUTDIR)/$(WRAPPER): $(OUTDIR)
cp -pf $(WRAPPER) $(OUTDIR)
$(OUTDIR)/$(QLORA_WRAPPER): $(OUTDIR)
cp -pf $(QLORA_WRAPPER) $(OUTDIR)
$(OUTDIR)/$(TRAIN_WRAPPER): $(OUTDIR)
cp -pf $(TRAIN_WRAPPER) $(OUTDIR)

$(OUTDIR)/$(INSTRUCTLAB_IMAGE_ID):
@mkdir -p $(OUTDIR)/$(INSTRUCTLAB_IMAGE_ID)
$(CONTAINER_TOOL) push --compress=false $(INSTRUCTLAB_IMAGE) oci:$(OUTDIR)/$(INSTRUCTLAB_IMAGE_ID)/
$(OUTDIR)/$(VLLM_IMAGE_ID):
@mkdir -p $(OUTDIR)/$(VLLM_IMAGE_ID)
$(CONTAINER_TOOL) push --compress=false $(VLLM_IMAGE) oci:$(OUTDIR)/$(VLLM_IMAGE_ID)/
$(OUTDIR)/$(TRAIN_IMAGE_ID):
@mkdir -p $(OUTDIR)/$(TRAIN_IMAGE_ID)
$(CONTAINER_TOOL) push --compress=false $(TRAIN_IMAGE) oci:$(OUTDIR)/$(TRAIN_IMAGE_ID)/

.PHONY: check-sshkey
check-sshkey:
@test -n "$(SSH_PUBKEY)" || \
Expand Down
15 changes: 13 additions & 2 deletions training/intel-bootc/Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,19 @@ RUN depmod -a ${KERNEL_VERSION}
# Include growfs service
COPY build/usr /usr

ARG INSTRUCTLAB_IMAGE
ARG VLLM_IMAGE
ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-intel:latest"
ARG VLLM_IMAGE="quay.io/ai-lab/vllm:latest"


ARG SSHPUBKEY

# Optional: the --build-arg "SSHPUBKEY=$(cat ~/.ssh/id_rsa.pub)" option inserts
# your public key into the image, allowing root access via ssh. When SSHPUBKEY
# is empty or unset this step does nothing.
# The key is written with printf and a quoted expansion so the value reaches
# root.keys verbatim (no word splitting, globbing, or collapsed newlines).
RUN if [ -n "${SSHPUBKEY}" ]; then \
set -eu; mkdir -p /usr/ssh && \
echo 'AuthorizedKeysFile /usr/ssh/%u.keys .ssh/authorized_keys .ssh/authorized_keys2' >> /etc/ssh/sshd_config.d/30-auth-system.conf && \
printf '%s\n' "${SSHPUBKEY}" > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
fi

# Prepull the instructlab image
RUN IID=$(podman --root /usr/lib/containers/storage pull oci:/run/.input/vllm) && \
Expand Down
5 changes: 3 additions & 2 deletions training/intel-bootc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@ bootc: growfs prepare-files
$(DRIVER_VERSION:%=--build-arg DRIVER_VERSION=%) \
$(EXTRA_RPM_PACKAGES:%=--build-arg EXTRA_RPM_PACKAGES=%) \
$(FROM:%=--build-arg BASEIMAGE=%) \
$(INSTRUCTLAB_IMAGE:%=--build-arg INSTRUCTLAB_IMAGE=%) \
$(KERNEL_VERSION:%=--build-arg KERNEL_VERSION=%) \
$(SOURCE_DATE_EPOCH:%=--timestamp=%) \
--build-arg "INSTRUCTLAB_IMAGE=$(INSTRUCTLAB_IMAGE)" \
--build-arg "VLLM_IMAGE=$(VLLM_IMAGE)" \
$(VLLM_IMAGE:%=--build-arg VLLM_IMAGE=%) \
$(if $(strip $(SSH_PUBKEY)),--build-arg "SSHPUBKEY=$(SSH_PUBKEY)") \
--cap-add SYS_ADMIN \
--file Containerfile \
--security-opt label=disable \
Expand Down
17 changes: 8 additions & 9 deletions training/nvidia-bootc/Containerfile
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,11 @@ ARG SSHPUBKEY

# The --build-arg "SSHPUBKEY=$(cat ~/.ssh/id_rsa.pub)" option inserts your
# public key into the image, allowing root access via ssh.
RUN set -eu; mkdir -p /usr/ssh && \
echo 'AuthorizedKeysFile /usr/ssh/%u.keys .ssh/authorized_keys .ssh/authorized_keys2' >> /etc/ssh/sshd_config.d/30-auth-system.conf && \
echo ${SSHPUBKEY} > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys
# Install the root ssh key only when SSHPUBKEY is non-empty. The key is
# written with printf and a quoted expansion so the value reaches root.keys
# verbatim (no word splitting, globbing, or collapsed newlines).
RUN if [ -n "${SSHPUBKEY}" ]; then \
set -eu; mkdir -p /usr/ssh && \
echo 'AuthorizedKeysFile /usr/ssh/%u.keys .ssh/authorized_keys .ssh/authorized_keys2' >> /etc/ssh/sshd_config.d/30-auth-system.conf && \
printf '%s\n' "${SSHPUBKEY}" > /usr/ssh/root.keys && chmod 0600 /usr/ssh/root.keys; \
fi

# Setup /usr/lib/containers/storage as an additional store for images.
# Remove once the base images have this set by default.
Expand All @@ -153,12 +155,9 @@ RUN grep -q /usr/lib/containers/storage /etc/containers/storage.conf || \
cp /run/.input/ilab* /usr/local/bin/


ARG INSTRUCTLAB_IMAGE
ARG INSTRUCTLAB_IMAGE_ID
ARG VLLM_IMAGE
ARG VLLM_IMAGE_ID
ARG TRAIN_IMAGE
ARG TRAIN_IMAGE_ID
ARG INSTRUCTLAB_IMAGE="quay.io/ai-lab/instructlab-nvidia:latest"
ARG VLLM_IMAGE="quay.io/ai-lab/vllm:latest"
ARG TRAIN_IMAGE="quay.io/ai-lab/deepspeed-trainer:latest"
ARG GPU_COUNT_COMMAND="nvidia-ctk --quiet cdi list | grep -P nvidia.com/gpu='\\\\d+' | wc -l"

RUN for i in /usr/local/bin/ilab*; do \
Expand Down
15 changes: 6 additions & 9 deletions training/nvidia-bootc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ VENDOR ?= nvidia
IMAGE_NAME ?= $(VENDOR)-bootc
DTK_IMAGE_NAME ?= $(VENDOR)-builder
DTK_IMAGE_TAG ?= latest
DRIVER_TOOLKIT_IMAGE = "${REGISTRY}/${REGISTRY_ORG}/${DTK_IMAGE_NAME}:${DTK_IMAGE_TAG}"
DRIVER_TOOLKIT_IMAGE = ${REGISTRY}/${REGISTRY_ORG}/${DTK_IMAGE_NAME}:${DTK_IMAGE_TAG}

CUDA_VERSION ?=
OS_VERSION_MAJOR ?=
Expand All @@ -29,21 +29,18 @@ bootc: dtk check-sshkey prepare-files growfs
"${CONTAINER_TOOL}" build \
$(ARCH:%=--platform linux/%) \
$(CUDA_VERSION:%=--build-arg CUDA_VERSION=%) \
$(DRIVER_TOOLKIT_IMAGE:%=--build-arg DRIVER_TOOLKIT_IMAGE=%) \
$(DRIVER_VERSION:%=--build-arg DRIVER_VERSION=%) \
$(DRIVER_VERSION:%=--label driver-version=%) \
$(EXTRA_RPM_PACKAGES:%=--build-arg EXTRA_RPM_PACKAGES=%) \
$(FROM:%=--build-arg BASEIMAGE=%) \
$(INSTRUCTLAB_IMAGE:%=--build-arg INSTRUCTLAB_IMAGE=%) \
$(KERNEL_VERSION:%=--build-arg KERNEL_VERSION=%) \
$(OS_VERSION_MAJOR:%=--build-arg OS_VERSION_MAJOR=%) \
$(SOURCE_DATE_EPOCH:%=--timestamp=%) \
--build-arg "INSTRUCTLAB_IMAGE=$(INSTRUCTLAB_IMAGE)" \
--build-arg "INSTRUCTLAB_IMAGE_ID=$(INSTRUCTLAB_IMAGE_ID)" \
--build-arg "SSHPUBKEY=$(SSH_PUBKEY)" \
--build-arg "TRAIN_IMAGE=$(TRAIN_IMAGE)" \
--build-arg "TRAIN_IMAGE_ID=$(TRAIN_IMAGE_ID)" \
--build-arg "VLLM_IMAGE=$(VLLM_IMAGE)" \
--build-arg "VLLM_IMAGE_ID=$(VLLM_IMAGE_ID)" \
--build-arg DRIVER_TOOLKIT_IMAGE=${DRIVER_TOOLKIT_IMAGE} \
$(TRAIN_IMAGE:%=--build-arg TRAIN_IMAGE=%) \
$(VLLM_IMAGE:%=--build-arg VLLM_IMAGE=%) \
$(if $(strip $(SSH_PUBKEY)),--build-arg "SSHPUBKEY=$(SSH_PUBKEY)") \
--cap-add SYS_ADMIN \
--file Containerfile \
--security-opt label=disable \
Expand Down

0 comments on commit 178f2c1

Please sign in to comment.