Skip to content

Commit

Permalink
Merge pull request #5884 from rbberger/amd_rocm_hpcbind
Browse files Browse the repository at this point in the history
Add AMD ROCm support to hpcbind
  • Loading branch information
dalg24 committed Feb 23, 2023
2 parents a2181fc + 4ec9fb6 commit bc1138f
Showing 1 changed file with 67 additions and 18 deletions.
85 changes: 67 additions & 18 deletions bin/hpcbind
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,45 @@ fi
################################################################################
declare -i HPCBIND_HAS_NVIDIA=0
type nvidia-smi >/dev/null 2>&1
HPCBIND_HAS_NVIDIA=$((!$?))
HPCBIND_HAS_NVIDIA=$((! $?))

################################################################################
# Check if rocm-smi exists
################################################################################
# HPCBIND_HAS_AMD is 1 when a rocm-smi command can be resolved by the shell,
# 0 otherwise. Only the presence of the tool is tested here, not the hardware.
declare -i HPCBIND_HAS_AMD=0
if type rocm-smi >/dev/null 2>&1; then
  HPCBIND_HAS_AMD=1
fi

################################################################################
# Get visible GPUs
################################################################################
declare -i NUM_GPUS=0
HPCBIND_VISIBLE_GPUS=""
if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
NUM_GPUS=$(nvidia-smi -L | wc -l);
HPCBIND_HAS_NVIDIA=$((!$?))
nvidia-smi >/dev/null 2>&1
HPCBIND_HAS_NVIDIA=$((! $?))
if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
NUM_GPUS=$(nvidia-smi -L | wc -l);
HPCBIND_HAS_NVIDIA=$((! $?))
if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}}
fi
fi
fi

################################################################################
# Detect usable AMD GPUs.
#
# Does nothing unless HPCBIND_HAS_AMD is 1. HPCBIND_HAS_AMD is cleared to 0
# when the amdgpu module marker is missing or when rocm-smi lists no device,
# so the AMD path is never taken with NUM_GPUS=0 (which would later be used
# as a modulo divisor). On success NUM_GPUS, GPU_LIST and HPCBIND_VISIBLE_GPUS
# are set; ROCR_VISIBLE_DEVICES, when non-empty, takes precedence over the
# detected list. NUM_GPUS is left untouched when no AMD device is found.
#
# Arguments:
#   $1 - file whose presence indicates a loaded amdgpu kernel module
#        (optional, default: /sys/module/amdgpu/initstate)
################################################################################
function hpcbind_detect_amd_gpus {
  local initstate="${1:-/sys/module/amdgpu/initstate}"
  local -i device_count=0

  if [[ ${HPCBIND_HAS_AMD} -ne 1 ]]; then
    return 0
  fi

  # rocm-smi doesn't return an error code when there is no hardware, so
  # check for the amdgpu kernel module marker instead.
  if ! stat "${initstate}" >/dev/null 2>&1; then
    HPCBIND_HAS_AMD=0
    return 0
  fi

  # Drop blank lines and the CSV header; the remaining line count is taken as
  # the number of devices.
  device_count=$(rocm-smi -i --csv | sed '/^$/d' | tail -n +2 | wc -l)

  # The exit status of the pipeline above is the one of wc and says nothing
  # about rocm-smi, so decide on the count itself.
  if [[ ${device_count} -le 0 ]]; then
    HPCBIND_HAS_AMD=0
    return 0
  fi

  NUM_GPUS=${device_count}
  GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )"
  HPCBIND_VISIBLE_GPUS=${ROCR_VISIBLE_DEVICES:-${GPU_LIST}}
}

hpcbind_detect_amd_gpus

Expand Down Expand Up @@ -101,8 +126,8 @@ fi
function show_help {
local cmd=$(basename "$0")
echo "Usage: ${cmd} <options> -- command ..."
echo " Set the process mask, OMP environment variables and CUDA environment"
echo " variables to sane values if possible. Uses hwloc and nvidia-smi if"
echo " Set the process mask, OMP environment variables and CUDA/ROCm environment"
echo " variables to sane values if possible. Uses hwloc and nvidia-smi/rocm-smi if"
echo " available. Will preserve the current process binding, so it is safe"
echo " to use with a queuing system or mpiexec."
echo ""
Expand All @@ -116,10 +141,10 @@ function show_help {
echo " --distribute-partition=I"
echo " Use the i'th partition (zero based)"
echo " --visible-gpus=<L> Comma separated list of gpu ids"
echo " Default: CUDA_VISIBLE_DEVICES or all gpus in"
echo " Default: CUDA_VISIBLE_DEVICES/ROCR_VISIBLE_DEVICES or all gpus in"
echo " sequential order"
echo " --ignore-queue Ignore queue job id when choosing visible GPU and partition"
echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES"
echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES/ROCR_VISIBLE_DEVICES"
echo " --openmp=M.m Set env variables for the given OpenMP version"
echo " Default: 4.0"
echo " --openmp-ratio=N/D Ratio of the cpuset to use for OpenMP"
Expand Down Expand Up @@ -525,13 +550,24 @@ fi
################################################################################

if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
else
declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
################################################################################
# Export the vendor specific "visible devices" variable for this task.
#
# NVIDIA takes precedence over AMD: CUDA_VISIBLE_DEVICES is exported when
# HPCBIND_HAS_NVIDIA is 1, otherwise ROCR_VISIBLE_DEVICES when HPCBIND_HAS_AMD
# is 1, otherwise nothing is exported.
#
# The entry of the HPCBIND_VISIBLE_GPUS array is chosen round-robin:
#   HPCBIND_QUEUE_MAPPING == 0: HPCBIND_PARTITION % NUM_GPUS
#   otherwise: (HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)
#              % NUM_GPUS
#
# Globals written: GPU_ID, MY_TASK_ID (queue mapping only), and the exported
# CUDA_VISIBLE_DEVICES or ROCR_VISIBLE_DEVICES.
################################################################################
function hpcbind_export_gpu_mapping {
  local visible_var=""

  if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then
    visible_var="CUDA_VISIBLE_DEVICES"
  elif [[ ${HPCBIND_HAS_AMD} -eq 1 ]]; then
    visible_var="ROCR_VISIBLE_DEVICES"
  else
    return 0
  fi

  # NUM_GPUS is the divisor below; with 0 the arithmetic expansion would
  # abort with "division by 0", so leave the environment untouched instead.
  if [[ ${NUM_GPUS} -le 0 ]]; then
    echo "hpcbind: no GPUs detected, not setting ${visible_var}" >&2
    return 0
  fi

  # Plain assignments keep GPU_ID and MY_TASK_ID global, as they were when
  # this code ran at top level (declare inside a function would localize them).
  if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
    GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
  else
    MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
    GPU_ID=$((MY_TASK_ID % NUM_GPUS))
  fi

  export "${visible_var}=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
}

hpcbind_export_gpu_mapping
fi

Expand All @@ -541,6 +577,7 @@ fi
export HPCBIND_HWLOC_VERSION=${HPCBIND_HWLOC_VERSION}
export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
export HPCBIND_HAS_AMD=${HPCBIND_HAS_AMD}
export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
export HPCBIND_NUM_CORES=${HPCBIND_NUM_CORES}
export HPCBIND_NUM_NUMAS=${HPCBIND_NUM_NUMAS}
Expand All @@ -555,8 +592,14 @@ else
export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
fi
export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
# Publish the GPU mapping switch and the comma separated visible GPU list
# under a vendor specific prefix, once per detected vendor.
if (( HPCBIND_HAS_NVIDIA == 1 )); then
  HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
  HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
  export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING HPCBIND_NVIDIA_VISIBLE_GPUS
fi
if (( HPCBIND_HAS_AMD == 1 )); then
  HPCBIND_AMD_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
  HPCBIND_AMD_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
  export HPCBIND_AMD_ENABLE_GPU_MAPPING HPCBIND_AMD_VISIBLE_GPUS
fi
export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
export HPCBIND_QUEUE_RANK=${HPCBIND_QUEUE_RANK}
Expand All @@ -580,6 +623,9 @@ if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
echo "${TMP_ENV}" | grep -E "^HWLOC_" >> ${HPCBIND_LOG}
echo "[CUDA]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
echo "[ROCM]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^ROCM_" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^ROCR_" >> ${HPCBIND_LOG}
echo "[OPENMP]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}
echo "[GOMP] (gcc, g++, and gfortran)" >> ${HPCBIND_LOG}
Expand All @@ -602,6 +648,9 @@ else
echo "${TMP_ENV}" | grep -E "^HWLOC_" > >(tee -a ${HPCBIND_LOG})
echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
echo "[ROCM]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^ROCM_" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^ROCR_" > >(tee -a ${HPCBIND_LOG})
echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})
echo "[GOMP] (gcc, g++, and gfortran)" > >(tee -a ${HPCBIND_LOG})
Expand Down

0 comments on commit bc1138f

Please sign in to comment.