Skip to content

Commit

Permalink
Add CI on AMD GPUs (#1117)
Browse files Browse the repository at this point in the history
* Update env vars for amd ci

* Fix edge case if parthenon is located at /parthenon

* First attempt at AMD GPU CI

* Fix formatting

* Fix userid

* Attempt to debug GPU visibility in container

* Update deprecated Kokkos option

* use local user in docker img

* Enable extended pipelines

* Change default user only for custom runners
  • Loading branch information
pgrete committed Jul 4, 2024
1 parent f891c02 commit 2e0e981
Show file tree
Hide file tree
Showing 7 changed files with 110 additions and 6 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/ci-extended.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ env:
CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
MACHINE_CFG: cmake/machinecfg/CI.cmake
OMPI_MCA_mpi_common_cuda_event_max: 1000
# https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
OMPI_MCA_btl_vader_single_copy_mechanism: none

jobs:
perf-and-regression:
Expand Down Expand Up @@ -121,3 +123,59 @@ jobs:
example/advection/ascent_render_57.png
retention-days: 3

perf-and-regression-amdgpu:
  # Performance and regression suite on the self-hosted runner carrying the
  # `navi1030` label.
  runs-on:
    - self-hosted
    - navi1030
  strategy:
    matrix:
      # Picks the `hip-<parallel>` machine variant at configure time and the
      # ctest label used to select the regression tests.
      parallel:
        - 'serial'
        - 'mpi'
  container:
    image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
    # Run as the CI machine's local user id so the build cache stays
    # writable, and pass the device handles through so the AMD GPU is
    # reachable from inside the container.
    options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
  env:
    CMAKE_GENERATOR: Ninja
    # Number of threads used for the build.
    CMAKE_BUILD_PARALLEL_LEVEL: 8
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: 'true'

    - name: Setup cache for gold standard
      uses: actions/cache@v3
      with:
        path: tst/regression/gold_standard/
        key: gold-standard

    - name: Configure
      run: |
        cmake -B build \
          -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
          -DCMAKE_BUILD_TYPE=Release \
          -DMACHINE_VARIANT=hip-${{ matrix.parallel }} \
          -DCMAKE_CXX_COMPILER=hipcc

    - name: Build
      run: cmake --build build

    # The performance "unit" tests do not use MPI, so they only run for the
    # serial matrix entry.
    - name: Performance tests
      if: ${{ matrix.parallel == 'serial' }}
      run: |
        cd build
        ctest -L performance -LE perf-reg

    # Regression tests carrying the label of the current matrix entry.
    - name: Regression tests
      run: |
        cd build
        ctest -L regression -L ${{ matrix.parallel }} -LE perf-reg --timeout 3600

    # Keep the configure log and the advection convergence results for 3 days.
    - uses: actions/upload-artifact@v3
      with:
        name: log-and-convergence-${{ matrix.parallel }}
        path: |
          build/CMakeFiles/CMakeOutput.log
          build/tst/regression/outputs/advection_convergence*/advection-errors.dat
          build/tst/regression/outputs/advection_convergence*/advection-errors.png
        retention-days: 3
43 changes: 43 additions & 0 deletions .github/workflows/ci-short.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ env:
CMAKE_BUILD_PARALLEL_LEVEL: 5 # num threads for build
MACHINE_CFG: cmake/machinecfg/CI.cmake
OMPI_MCA_mpi_common_cuda_event_max: 1000
# https://github.com/open-mpi/ompi/issues/4948#issuecomment-395468231
OMPI_MCA_btl_vader_single_copy_mechanism: none

jobs:
style:
Expand Down Expand Up @@ -130,3 +132,44 @@ jobs:
build/profile.txt
retention-days: 3

integration-amdgpu:
  # Short integration check (HIP + MPI build) on the self-hosted runner
  # carrying the `navi1030` label.
  runs-on:
    - self-hosted
    - navi1030
  container:
    image: ghcr.io/parthenon-hpc-lab/rocm5.4.3-mpi-hdf5
    # Run as the CI machine's local user id so the build cache stays
    # writable, and pass the device handles through so the AMD GPU is
    # reachable from inside the container.
    options: --user 1000 -w /home/ci --device /dev/kfd --device /dev/dri --security-opt seccomp=unconfined
  env:
    CMAKE_GENERATOR: Ninja
    # Number of threads used for the build.
    CMAKE_BUILD_PARALLEL_LEVEL: 8
  steps:
    - uses: actions/checkout@v3
      with:
        submodules: 'true'

    - name: Configure
      run: |
        cmake -B build \
          -DMACHINE_CFG=${PWD}/cmake/machinecfg/GitHubActions.cmake \
          -DCMAKE_BUILD_TYPE=Release \
          -DMACHINE_VARIANT=hip-mpi \
          -DCMAKE_CXX_COMPILER=hipcc

    # Example exercising "variables" and output.
    - name: advection
      run: |
        cmake --build build -t advection-example
        cd build
        ctest -R regression_mpi_test:output_hdf5

    # Example exercising swarms.
    - name: particle-leapfrog
      run: |
        cmake --build build -t particle-leapfrog
        cd build
        ctest -R regression_mpi_test:particle_leapfrog

    # Keep the configure log for 3 days.
    - uses: actions/upload-artifact@v3
      with:
        name: configure-log-integration-amdgpu
        path: |
          build/CMakeFiles/CMakeOutput.log
        retention-days: 3

1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
- [[PR 1031]](https://github.com/parthenon-hpc-lab/parthenon/pull/1031) Fix bug in non-cell centered AMR

### Infrastructure (changes irrelevant to downstream codes)
- [[PR 1117]](https://github.com/parthenon-hpc-lab/parthenon/pull/1117) Enable CI pipelines on AMD GPUs with ROCM/HIP
- [[PR 1114]](https://github.com/parthenon-hpc-lab/parthenon/pull/1114) Enable sanitizers for extended CI host build
- [[PR 1123]](https://github.com/parthenon-hpc-lab/parthenon/pull/1123) Default initialize ProResInfo.dir
- [[PR 1121]](https://github.com/parthenon-hpc-lab/parthenon/pull/1121) Default initialize BndInfo.dir
Expand Down
2 changes: 1 addition & 1 deletion cmake/TestSetup.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ function(setup_test_parallel nproc dir arg extra_labels)
list(APPEND labels "${extra_labels}")

if(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_HIP)
set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-num-devices=${NUM_GPU_DEVICES_PER_NODE}")
set(PARTHENON_KOKKOS_TEST_ARGS "--kokkos-map-device-id-by=mpi_rank")
list(APPEND labels "cuda")
endif()
if (Kokkos_ENABLE_OPENMP)
Expand Down
5 changes: 3 additions & 2 deletions cmake/machinecfg/GitHubActions.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,10 @@ if (${MACHINE_VARIANT} MATCHES "cuda")
set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -Wno-unknown-cuda-version")
endif()
elseif (${MACHINE_VARIANT} MATCHES "hip")
# using an arbitrary arch as GitHub Action runners don't have GPUs
set(Kokkos_ARCH_VEGA908 ON CACHE BOOL "GPU architecture")
# using an arch that matches Hamilton at Hamburg Obs
set(Kokkos_ARCH_NAVI1030 ON CACHE BOOL "GPU architecture")
set(Kokkos_ENABLE_HIP ON CACHE BOOL "Enable HIP")
# Kokkos selects CPU architectures through Kokkos_ARCH_* options; a
# Kokkos_ENABLE_ZEN3 cache entry is not read by Kokkos and has no effect.
# NOTE(review): assumes the CI host CPU is Zen3, as the original line implies - confirm.
set(Kokkos_ARCH_ZEN3 ON CACHE BOOL "CPU architecture")
else()
set(MACHINE_CXX_FLAGS "${MACHINE_CXX_FLAGS} -fopenmp-simd")
endif()
Expand Down
3 changes: 3 additions & 0 deletions scripts/docker/Dockerfile.hip-rocm
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,6 @@ RUN cd /tmp && \
ENV LDFLAGS="-lopen-pal"

RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10

# uid 1000 maps to the one running the container on the CI host
RUN useradd --create-home --shell /bin/bash -u 1000 -G render ci
4 changes: 1 addition & 3 deletions tst/regression/utils/test_case.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,7 @@ def __init__(self, run_test_path, **kwargs):
try:
parthenon_path = os.path.realpath(__file__)
idx = parthenon_path.rindex("/parthenon/")
self.parameters.parthenon_path = os.path.join(
parthenon_path[:idx], "parthenon"
)
self.parameters.parthenon_path = parthenon_path[: idx + 10]
except ValueError:
baseDir = os.path.dirname(__file__)
self.parameters.parthenon_path = os.path.abspath(baseDir + "/../../../")
Expand Down

0 comments on commit 2e0e981

Please sign in to comment.