Skip to content

Commit

Permalink
Add Arm64 builds to CI (#524)
Browse files Browse the repository at this point in the history
* Add Arm64 build and test stages to CI
* Refactor `Dockerfile` as a multi-arch container
* Add Arm64 (`aarch64`) to `dependencies.yaml` matrix
* Apply fix for #525 provided by @willkill07
* Add Arm64 Conda package output

Related to nv-morpheus/utilities#90
Closes #525

Authors:
  - David Gardner (https://github.com/dagardner-nv)

Approvers:
  - Will Killian (https://github.com/willkill07)
  - Michael Demoret (https://github.com/mdemoret-nv)

URL: #524
  • Loading branch information
dagardner-nv authored Jan 15, 2025
1 parent aaf402a commit 5f9cc13
Show file tree
Hide file tree
Showing 15 changed files with 186 additions and 58 deletions.
21 changes: 15 additions & 6 deletions .github/workflows/ci_pipe.yml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ jobs:

build:
name: Build
runs-on: linux-amd64-cpu16
runs-on: linux-${{ matrix.arch }}-cpu16
timeout-minutes: 60
container:
credentials:
Expand All @@ -125,6 +125,7 @@ jobs:
strategy:
fail-fast: true
matrix:
arch: ["amd64", "arm64"]
build_cc: ["gcc", "clang"]

steps:
Expand All @@ -141,7 +142,7 @@ jobs:
aws-region: ${{ vars.AWS_REGION }}
role-duration-seconds: 43200 # 12h

- name: Build:linux:x86_64
- name: Build:linux:${{ matrix.arch }}-${{ matrix.build_cc }}
shell: bash
env:
BUILD_CC: ${{ matrix.build_cc }}
Expand All @@ -150,7 +151,7 @@ jobs:
test:
name: Test
needs: [build]
runs-on: linux-amd64-gpu-v100-latest-1
runs-on: ${{ matrix.runner }}
timeout-minutes: 60
container:
credentials:
Expand All @@ -164,7 +165,13 @@ jobs:
strategy:
fail-fast: true
matrix:
arch: ["amd64", "arm64"]
build_cc: ["gcc", "clang"]
include:
- runner: linux-amd64-gpu-v100-latest-1
arch: "amd64"
- runner: linux-arm64-gpu-a100-latest-1
arch: "arm64"

steps:
- name: Checkout
Expand All @@ -180,7 +187,7 @@ jobs:
aws-region: ${{ vars.AWS_REGION }}
role-duration-seconds: 43200 # 12h

- name: Test:linux:x86_64
- name: Test:linux:${{ matrix.arch }}-${{ matrix.build_cc }}
shell: bash
env:
BUILD_CC: ${{ matrix.build_cc }}
Expand Down Expand Up @@ -302,7 +309,7 @@ jobs:
name: Package
if: ${{ inputs.conda_run_build }}
needs: [benchmark, documentation, test]
runs-on: linux-amd64-cpu16
runs-on: linux-${{ matrix.arch }}-cpu16
timeout-minutes: 60
container:
credentials:
Expand All @@ -311,6 +318,8 @@ jobs:
image: ${{ inputs.container }}
strategy:
fail-fast: true
matrix:
arch: ["amd64", "arm64"]

steps:
- name: Checkout
Expand All @@ -327,7 +336,7 @@ jobs:
aws-region: ${{ vars.AWS_REGION }}
role-duration-seconds: 43200 # 12h

- name: conda
- name: conda:${{ matrix.arch }}
shell: bash
env:
CONDA_TOKEN: "${{ secrets.CONDA_TOKEN }}"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/pr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ jobs:
# Update the conda package only for non-PR branches. Use 'main' for the main branch and 'dev' for all other branches
conda_upload_label: ${{ !fromJSON(needs.prepare.outputs.is_pr) && (fromJSON(needs.prepare.outputs.is_main_branch) && 'main' || 'dev') || '' }}
# Build container
container: nvcr.io/ea-nvidia-morpheus/morpheus:mrc-ci-build-241002
container: nvcr.io/ea-nvidia-morpheus/morpheus:mrc-ci-build-241219
# Test container
test_container: nvcr.io/ea-nvidia-morpheus/morpheus:mrc-ci-test-241002
test_container: nvcr.io/ea-nvidia-morpheus/morpheus:mrc-ci-test-241219
# Info about the PR. Empty for non-PR branches. Useful for extracting PR number, title, etc.
pr_info: ${{ needs.prepare.outputs.pr_info }}
secrets:
Expand Down
25 changes: 15 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,21 @@ ARG CUDA_VER=12.5.1
ARG LINUX_DISTRO=ubuntu
ARG LINUX_VER=22.04
ARG PYTHON_VER=3.10
ARG REAL_ARCH=notset

# ============= base ===================
FROM ${FROM_IMAGE}:cuda${CUDA_VER}-${LINUX_DISTRO}${LINUX_VER}-py${PYTHON_VER} AS base
FROM --platform=$TARGETPLATFORM ${FROM_IMAGE}:cuda${CUDA_VER}-${LINUX_DISTRO}${LINUX_VER}-py${PYTHON_VER} AS base

ARG PROJ_NAME=mrc
ARG USERNAME=morpheus
ARG USER_UID=1000
ARG USER_GID=$USER_UID
ARG REAL_ARCH

SHELL ["/bin/bash", "-c"]
ENV REAL_ARCH=${REAL_ARCH}

RUN --mount=type=cache,target=/var/cache/apt \
RUN --mount=type=cache,target=/var/cache/apt,id=apt_cache-${REAL_ARCH} \
apt update &&\
apt install --no-install-recommends -y \
libnuma1 \
Expand All @@ -45,13 +48,14 @@ RUN useradd --uid $USER_UID --gid $USER_GID -m $USERNAME && \
echo $USERNAME ALL=\(root\) NOPASSWD:ALL > /etc/sudoers.d/$USERNAME && \
chmod 0440 /etc/sudoers.d/$USERNAME

COPY ./conda/environments/all_cuda-125_arch-x86_64.yaml /opt/mrc/conda/environments/all_cuda-125_arch-x86_64.yaml
COPY ./conda/environments/all_cuda-125_arch-${REAL_ARCH}.yaml /opt/mrc/conda/environments/all_cuda-125_arch-${REAL_ARCH}.yaml

RUN --mount=type=cache,target=/opt/conda/pkgs,sharing=locked \
RUN --mount=type=cache,target=/opt/conda/pkgs,sharing=locked,id=conda_cache-${REAL_ARCH} \
echo "create env: ${PROJ_NAME}" && \
sudo -g conda -u $USERNAME \
CONDA_ALWAYS_YES=true \
/opt/conda/bin/mamba env create -q -n ${PROJ_NAME} --file /opt/mrc/conda/environments/all_cuda-125_arch-x86_64.yaml && \
/opt/conda/bin/conda env create --solver=libmamba -q -n ${PROJ_NAME} \
--file /opt/mrc/conda/environments/all_cuda-125_arch-${REAL_ARCH}.yaml && \
chmod -R a+rwX /opt/conda && \
rm -rf /tmp/conda

Expand All @@ -66,21 +70,22 @@ ENV CMAKE_CXX_COMPILER_LAUNCHER=
ENV CMAKE_C_COMPILER_LAUNCHER=

# ============ build ==================
FROM base as build
FROM --platform=$TARGETPLATFORM base as build

# Add any build-only dependencies here. For now there are none, but we need the
# target to get the CI runner build scripts to work

# ============ test ==================
FROM base as test
FROM --platform=$TARGETPLATFORM base as test

# Add any test-only dependencies here. For now there are none, but we need the
# target to get the CI runner build scripts to work

# ========= development ================
FROM base as development
FROM --platform=$TARGETPLATFORM base as development
ARG REAL_ARCH

RUN --mount=type=cache,target=/var/cache/apt \
RUN --mount=type=cache,target=/var/cache/apt,id=apt_cache-${REAL_ARCH} \
apt-get update &&\
apt-get install --no-install-recommends -y \
gdb \
Expand All @@ -94,7 +99,7 @@ RUN --mount=type=cache,target=/var/cache/apt \

# Install the .NET SDK. This is a workaround for https://github.com/dotnet/vscode-dotnet-runtime/issues/159
# Once version 1.6.1 of the extension has been released, this can be removed
RUN --mount=type=cache,target=/var/cache/apt \
RUN --mount=type=cache,target=/var/cache/apt,id=apt_cache-${REAL_ARCH} \
wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb &&\
sudo dpkg -i packages-microsoft-prod.deb &&\
rm packages-microsoft-prod.deb &&\
Expand Down
20 changes: 10 additions & 10 deletions ci/scripts/github/build.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#!/usr/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-FileCopyrightText: Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
Expand Down Expand Up @@ -35,14 +35,14 @@ sccache --version

if [[ "${BUILD_CC}" == "gcc" ]]; then
rapids-logger "Building with GCC"
x86_64-conda-linux-gnu-cc --version
x86_64-conda-linux-gnu-c++ --version
${REAL_ARCH}-conda-linux-gnu-cc --version
${REAL_ARCH}-conda-linux-gnu-c++ --version
CMAKE_FLAGS="${CMAKE_BUILD_ALL_FEATURES} ${CMAKE_CACHE_FLAGS}"
elif [[ "${BUILD_CC}" == "gcc-coverage" ]]; then
rapids-logger "Building with GCC with gcov profile '-g -fprofile-arcs -ftest-coverage"
x86_64-conda-linux-gnu-cc --version
x86_64-conda-linux-gnu-c++ --version
x86_64-conda-linux-gnu-gcov --version
${REAL_ARCH}-conda-linux-gnu-cc --version
${REAL_ARCH}-conda-linux-gnu-c++ --version
${REAL_ARCH}-conda-linux-gnu-gcov --version
CMAKE_FLAGS="${CMAKE_BUILD_ALL_FEATURES} ${CMAKE_BUILD_WITH_CODECOV} ${CMAKE_CACHE_FLAGS}"
else
rapids-logger "Building with Clang"
Expand All @@ -68,13 +68,13 @@ fi

if [[ "${BUILD_CC}" != "gcc-coverage" || ${LOCAL_CI} == "1" ]]; then
rapids-logger "Archiving results"
tar cfj "${WORKSPACE_TMP}/dot_cache.tar.bz" .cache
tar cfj "${WORKSPACE_TMP}/build.tar.bz" build
tar cfj "${WORKSPACE_TMP}/dot_cache-${REAL_ARCH}.tar.bz" .cache
tar cfj "${WORKSPACE_TMP}/build-${REAL_ARCH}.tar.bz" build
ls -lh ${WORKSPACE_TMP}/

rapids-logger "Pushing results to ${DISPLAY_ARTIFACT_URL}/"
upload_artifact "${WORKSPACE_TMP}/build.tar.bz"
upload_artifact "${WORKSPACE_TMP}/dot_cache.tar.bz"
upload_artifact "${WORKSPACE_TMP}/build-${REAL_ARCH}.tar.bz"
upload_artifact "${WORKSPACE_TMP}/dot_cache-${REAL_ARCH}.tar.bz"
fi

rapids-logger "Success"
14 changes: 9 additions & 5 deletions ci/scripts/github/common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,28 @@ rapids-logger "Env Setup"
source /opt/conda/etc/profile.d/conda.sh
export MRC_ROOT=${MRC_ROOT:-$(git rev-parse --show-toplevel)}
cd ${MRC_ROOT}
export REAL_ARCH=${REAL_ARCH:-$(arch)}

# For non-GPU hosts, nproc will correctly report the number of cores we are able to use.
# On a GPU host, however, nproc will report the total number of cores, and PARALLEL_LEVEL
# will be defined, specifying the subset we are allowed to use.
NUM_CORES=$(nproc)
export PARALLEL_LEVEL=${PARALLEL_LEVEL:-${NUM_CORES}}
# NUM_PROC is used by some of the other scripts
export NUM_PROC=${PARALLEL_LEVEL}
rapids-logger "Procs: ${NUM_CORES}"
/usr/bin/lscpu

rapids-logger "Memory"

/usr/bin/free -g

rapids-logger "user info"
id

# NUM_PROC is used by some of the other scripts
export NUM_PROC=${PARALLEL_LEVEL:-$(nproc)}
export BUILD_CC=${BUILD_CC:-"gcc"}

export CONDA_ENV_YML="${MRC_ROOT}/conda/environments/all_cuda-125_arch-x86_64.yaml"
export CONDA_ENV_YML="${MRC_ROOT}/conda/environments/all_cuda-125_arch-${REAL_ARCH}.yaml"

export CMAKE_BUILD_ALL_FEATURES="-DCMAKE_MESSAGE_CONTEXT_SHOW=ON -DMRC_BUILD_BENCHMARKS=ON -DMRC_BUILD_EXAMPLES=ON -DMRC_BUILD_PYTHON=ON -DMRC_BUILD_TESTS=ON -DMRC_USE_CONDA=ON -DMRC_PYTHON_BUILD_STUBS=ON"
export CMAKE_BUILD_WITH_CODECOV="-DCMAKE_BUILD_TYPE=Debug -DMRC_ENABLE_CODECOV=ON -DMRC_PYTHON_PERFORM_INSTALL:BOOL=ON -DMRC_PYTHON_INPLACE_BUILD:BOOL=ON"
Expand All @@ -52,7 +56,7 @@ PR_NUM="${GITHUB_REF_NAME##*/}"
# S3 vars
export S3_URL="s3://rapids-downloads/ci/mrc"
export DISPLAY_URL="https://downloads.rapids.ai/ci/mrc"
export ARTIFACT_ENDPOINT="/pull-request/${PR_NUM}/${GIT_COMMIT}/${NVARCH}/${BUILD_CC}"
export ARTIFACT_ENDPOINT="/pull-request/${PR_NUM}/${GIT_COMMIT}/${REAL_ARCH}/${BUILD_CC}"
export ARTIFACT_URL="${S3_URL}${ARTIFACT_ENDPOINT}"

if [[ "${LOCAL_CI}" == "1" ]]; then
Expand All @@ -62,7 +66,7 @@ else
fi

# Set sccache env vars
export SCCACHE_S3_KEY_PREFIX=mrc-${NVARCH}-${BUILD_CC}
export SCCACHE_S3_KEY_PREFIX=mrc-${REAL_ARCH}-${BUILD_CC}
export SCCACHE_BUCKET=rapids-sccache-east
export SCCACHE_REGION="us-east-2"
export SCCACHE_IDLE_TIMEOUT=32768
Expand Down
4 changes: 2 additions & 2 deletions ci/scripts/github/conda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ if [[ " ${CI_SCRIPT_ARGS} " =~ " upload " ]]; then
rapids-logger "Building Conda Package... Done"
else
# if we didn't receive the upload argument, we can still upload the artifact to S3
tar cfj "${WORKSPACE_TMP}/conda.tar.bz" "${RAPIDS_CONDA_BLD_OUTPUT_DIR}"
tar cfj "${WORKSPACE_TMP}/conda-${REAL_ARCH}.tar.bz" "${RAPIDS_CONDA_BLD_OUTPUT_DIR}"
ls -lh ${WORKSPACE_TMP}/

rapids-logger "Pushing results to ${DISPLAY_ARTIFACT_URL}/"
upload_artifact "${WORKSPACE_TMP}/conda.tar.bz"
upload_artifact "${WORKSPACE_TMP}/conda-${REAL_ARCH}.tar.bz"
fi
4 changes: 2 additions & 2 deletions ci/scripts/github/post_benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ REPORTS_DIR="${WORKSPACE_TMP}/reports"

rapids-logger "Archiving benchmark reports"
cd $(dirname ${REPORTS_DIR})
tar cfj ${WORKSPACE_TMP}/benchmark_reports.tar.bz $(basename ${REPORTS_DIR})
tar cfj ${WORKSPACE_TMP}/benchmark_reports-${REAL_ARCH}.tar.bz $(basename ${REPORTS_DIR})

rapids-logger "Pushing results to ${DISPLAY_ARTIFACT_URL}/"
upload_artifact ${WORKSPACE_TMP}/benchmark_reports.tar.bz
upload_artifact ${WORKSPACE_TMP}/benchmark_reports-${REAL_ARCH}.tar.bz

exit $(cat ${WORKSPACE_TMP}/exit_status)
4 changes: 2 additions & 2 deletions ci/scripts/github/pre_benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ source ${WORKSPACE}/ci/scripts/github/common.sh
update_conda_env

rapids-logger "Fetching Build artifacts from ${DISPLAY_ARTIFACT_URL}/"
download_artifact "build.tar.bz"
download_artifact "build-${REAL_ARCH}.tar.bz"

tar xf "${WORKSPACE_TMP}/build.tar.bz"
tar xf "${WORKSPACE_TMP}/build-${REAL_ARCH}.tar.bz"

mkdir -p ${WORKSPACE_TMP}/reports
12 changes: 6 additions & 6 deletions ci/scripts/github/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ source ${WORKSPACE}/ci/scripts/github/common.sh
update_conda_env

rapids-logger "Fetching Build artifacts from ${DISPLAY_ARTIFACT_URL}/"
download_artifact "dot_cache.tar.bz"
download_artifact "build.tar.bz"
download_artifact "dot_cache-${REAL_ARCH}.tar.bz"
download_artifact "build-${REAL_ARCH}.tar.bz"

tar xf "${WORKSPACE_TMP}/dot_cache.tar.bz"
tar xf "${WORKSPACE_TMP}/build.tar.bz"
tar xf "${WORKSPACE_TMP}/dot_cache-${REAL_ARCH}.tar.bz"
tar xf "${WORKSPACE_TMP}/build-${REAL_ARCH}.tar.bz"

REPORTS_DIR="${WORKSPACE_TMP}/reports"
mkdir -p ${WORKSPACE_TMP}/reports
Expand Down Expand Up @@ -57,10 +57,10 @@ set -e

rapids-logger "Archiving test reports"
cd $(dirname ${REPORTS_DIR})
tar cfj ${WORKSPACE_TMP}/test_reports.tar.bz $(basename ${REPORTS_DIR})
tar cfj ${WORKSPACE_TMP}/test_reports-${REAL_ARCH}.tar.bz $(basename ${REPORTS_DIR})

rapids-logger "Pushing results to ${DISPLAY_ARTIFACT_URL}/"
upload_artifact ${WORKSPACE_TMP}/test_reports.tar.bz
upload_artifact ${WORKSPACE_TMP}/test_reports-${REAL_ARCH}.tar.bz

TEST_RESULTS=$(($CTEST_RESULTS+$PYTEST_RESULTS))
exit ${TEST_RESULTS}
5 changes: 3 additions & 2 deletions ci/scripts/run_ci_local.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ function git_ssh_to_https()
echo $url | sed -e 's|^git@github\.com:|https://github.com/|'
}

CI_ARCH=${CI_ARCH:-$(dpkg --print-architecture)}
MRC_ROOT=${MRC_ROOT:-$(git rev-parse --show-toplevel)}

GIT_URL=$(git remote get-url origin)
Expand All @@ -58,7 +59,7 @@ GIT_BRANCH=$(git branch --show-current)
GIT_COMMIT=$(git log -n 1 --pretty=format:%H)

BASE_LOCAL_CI_TMP=${BASE_LOCAL_CI_TMP:-${MRC_ROOT}/.tmp/local_ci_tmp}
CONTAINER_VER=${CONTAINER_VER:-241002}
CONTAINER_VER=${CONTAINER_VER:-241219}
CUDA_VER=${CUDA_VER:-12.5}
DOCKER_EXTRA_ARGS=${DOCKER_EXTRA_ARGS:-""}

Expand Down Expand Up @@ -99,7 +100,7 @@ for STAGE in "${STAGES[@]}"; do
cp ${MRC_ROOT}/ci/scripts/bootstrap_local_ci.sh ${LOCAL_CI_TMP}


DOCKER_RUN_ARGS="--rm -ti --net=host -v "${LOCAL_CI_TMP}":/ci_tmp ${ENV_LIST} --env STAGE=${STAGE}"
DOCKER_RUN_ARGS="--rm -ti --net=host --platform=linux/${CI_ARCH} -v "${LOCAL_CI_TMP}":/ci_tmp ${ENV_LIST} --env STAGE=${STAGE}"
if [[ "${STAGE}" =~ "test" || "${STAGE}" =~ "codecov" || "${USE_GPU}" == "1" ]]; then
CONTAINER="${TEST_CONTAINER}"
DOCKER_RUN_ARGS="${DOCKER_RUN_ARGS} --runtime=nvidia --gpus all --cap-add=sys_nice --cap-add=sys_ptrace"
Expand Down
Loading

0 comments on commit 5f9cc13

Please sign in to comment.