diff --git a/ci/fedora/.gitlab-ci-fcos.yml b/ci/fedora/.gitlab-ci-fcos.yml index 4611c099..bab7d747 100644 --- a/ci/fedora/.gitlab-ci-fcos.yml +++ b/ci/fedora/.gitlab-ci-fcos.yml @@ -22,47 +22,66 @@ # and stable. In this manner pre-built kernel packages can be created matching # the kernel versions running on the gitlab-runners at that particular time. # These gitlab runners need to be tagged: fcos-next, fcos-testing, fcos-stable. +# Behaviour is branch specific as described below. +# +# Branches != ^fedora.* # # In terms of general operations if a repo is pointed to this CICD script then # a branch without 'fedora' in its name will build one container image on a -# fcos-next runner for the latest targeted DRIVER_VERSION only, suffixing the +# fcos-next runner for the latest targeted DRIVER_VERSION only, prefixing the # image tag with the git commit short SHA. It will be pushed to the internal # gitlab project container registry. The release stage will run but will again # target the internal gitlab registry conditionally overwriting the tag (minus -# the commit sha). This release stage is purely to test out the CICD release -# flow. +# the commit sha). This release stage is purely to test out the CICD code that +# would for the 'fedora' branch publish to a remote repository. +# +# Branches == "fedora" and tags == .*fedora$ # # The protected branch 'fedora' will cause container image builds on all three # fcos runner types and build ALL_DRIVER_VERSIONS. The images will then be scan- # ned and providing there are no detected vulnerabilities will be pushed to the -# remote repository defined by the CICD variables, e.g. docker.io/fifofonix/driver. +# remote repository defined by RELEASE_REGISTRY_PROJECT. +# # By default remote tags are not overwritten during daily scheduled pipelines. # However, a bespoke pipeline run setting OVERWRITE_REMOTE_TAGS can be used to # force overwrite remote tags. Any gitlab tag suffixed 'fedora' will also trigger # this pipeline flow. 
# +# Branches == "fedora.+" +# # Any other protected branch with the word fedora in it will do the same - build # all the NVIDIA driver versions on all the fcos releases - and scan them, but # will not publish them to the remote registry. # +# All Flows +# # It is possible to suppress compilation of driver kernel modules and build # of the kernel packages byy toggling COMPILE_KERNEL_MODULES. - +# +# If pre-compilation of driver kernel modules fails then the CICD pipeline will +# continue but will not push a kernel-labeled tag. This is done in case these +# fedora major version labeled images may run OK on different kernel versions +# than those used by gitlab-runners. +# +# Experimental +# +# It is possible to set the KERNEL_TYPE variable to 'kernel-open' which will +# result in eligible driver versions (515.x, 525.x) being built with the NVIDIA open +# kernel (this is only advertised for newer data center A|T-series processors). +# If this option is elected gitlab container tags will be at driver/open:. +# Kernel-specific tags will be generated for pre-compiled open kernel interface +# modules if possible per COMPILE_KERNEL_MODULES. Note, although this all runs +# and appears to launch stably on an appropriate node the GPU is not detected. +# An issue for this exists: +# https://gitlab.com/nvidia/container-images/driver/-/issues/46 +# # Some notes on driver versions: # -# Driver versions for Tesla-class GPUs, e.g. M60 that we use on AWS nodes, can be -# established here: https://www.nvidia.com/Download/index.aspx +# Driver versions for datacenter class GPUs can be established here: +# https://www.nvidia.com/Download/index.aspx # The driver versions are tied to specific CUDA versions, e.g.: # - M60 only supports # 11.4, 11.2, 11.0 ... (but not 11.3 for example?) 
# -# CUDA 11.6 has a minimum kernel of Fedora 35 / 5.14.10 -# (https://docs.nvidia.com/cuda/archive/11.6.0/cuda-installation-guide-linux/index.html) -# 5/27/22 - Current recommended Tesla / Linux driver version is 510.73.05 -# -# CUDA 11.4 has a minimum kernel of Fedora 35 / 5.10.13 -# (https://docs.nvidia.com/cuda/archive/11.4.0/cuda-installation-guide-linux/index.html) -# 1/17/22 - Current recommended Tesla / Linux driver version is 470.82.01 -# variables: # Set these in your CICD variables if your gitlab-runner is behind a proxy. @@ -73,15 +92,24 @@ variables: http_proxy: ${HTTP_PROXY} https_proxy: ${HTTPS_PROXY} no_proxy: ${NO_PROXY} - # Project specific variables - DRIVER_VERSION: "515.86.01" - ALL_DRIVER_VERSIONS: "515.86.01" - DRIVER_VERSIONS: 515.86.01 - BUILD_MULTI_ARCH_IMAGES: "true" + # To survey latest Data Center driver versions available: + # https://www.nvidia.com/Download/Find.aspx + # https://www.nvidia.com/en-us/drivers/unix/ + DRIVER_VERSION: "535.129.03" + DRIVER_VERSIONS: 535.129.03 525.147.05 + + CUDA_VERSION: 12.2.0 + + CVE_UPDATES: "curl libc6" + OVERWRITE_TAGS: 1 OVERWRITE_REMOTE_TAGS: 0 - COMPILE_KERNEL_MODULES: 1 # Requires 16GB RAM on FCOS gitlab-runner + + # https://developer.nvidia.com/blog/nvidia-releases-open-source-gpu-kernel-modules/ + KERNEL_TYPE: "kernel" # NVIDIA: kernel, OPEN: kernel-open, 515/525-only. 
+ COMPILE_KERNEL_MODULES: 1 # Requires 16GB RAM on FCOS gitlab-runner + RELEASE_REGISTRY_PROJECT: "" RELEASE_REGISTRY_USER: "" RELEASE_REGISTRY_TOKEN: "" @@ -93,7 +121,6 @@ default: stages: - build - - test - scan - deploy - release @@ -102,31 +129,57 @@ stages: before_script: - export FEDORA_VERSION=$(uname -r | sed -r "s/^.*fc([0-9][0-9]).*/\1/") - export FEDORA_UNAME=$(uname -r) + - | + if [[ "${KERNEL_TYPE}" == "kernel-open" ]]; then + export IMAGE_BASE_NAME=${CI_REGISTRY_IMAGE}/open + else + export IMAGE_BASE_NAME=${CI_REGISTRY_IMAGE} + fi .build_push_fn-script: &build_push_fn-script - | function build_push_fn() { local _driver_version=$1 local _overwrite_tag=$2 - local _tag_suffix=$3 - export DOCKER_IMAGE=${CI_REGISTRY_IMAGE}:${_driver_version}-fedora${FEDORA_VERSION}-${FEDORA_UNAME}${_tag_suffix} + local _tag_prefix=$3 + + if [[ "${KERNEL_TYPE}" == "kernel-open" ]]; then + local _driver_major_version=$(echo ${_driver_version} | cut -c1-3) + if [[ "${_driver_major_version}" != "515" && "${_driver_major_version}" != "525" ]]; then + echo "Driver version ${_driver_version} does not support ${KERNEL_TYPE}, skipping build-push." + return + fi + fi + export DOCKER_IMAGE=${IMAGE_BASE_NAME}:${_tag_prefix}${_driver_version}-${FEDORA_UNAME}-fedora${FEDORA_VERSION} + echo -e "\033[33mBuilding driver ${_driver_version} (${KERNEL_TYPE}) for Fedora ${FEDORA_VERSION} (${FEDORA_UNAME}).\033[0m" docker build \ --build-arg FEDORA_VERSION=${FEDORA_VERSION} \ --build-arg DRIVER_VERSION=${_driver_version} \ --build-arg TARGETARCH=$(uname -m) \ + --build-arg CUDA_VERSION=${CUDA_VERSION} \ + --build-arg KERNEL_TYPE=${KERNEL_TYPE} \ --build-arg HTTP_PROXY=${HTTP_PROXY} \ --build-arg HTTPS_PROXY=${HTTPS_PROXY} \ -t ${DOCKER_IMAGE} fedora if ! 
$(docker manifest inspect ${DOCKER_IMAGE} > /dev/null 2>&1) || [[ "${_overwrite_tag}" == "1" ]]; then if [[ "${COMPILE_KERNEL_MODULES}" == "1" ]]; then - docker run --privileged --pid=host --name build-kernel-modules-${_driver_version} \ - --entrypoint nvidia-driver ${DOCKER_IMAGE} update -t builtin - docker commit -m '"Compile Linux kernel modules version ${FEDORA_UNAME} for NVIDIA driver version ${_driver_version}"' \ - --change='ENTRYPOINT ["nvidia-driver", "init"]' build-kernel-modules-${_driver_version} ${DOCKER_IMAGE} - echo "Pushing ${DOCKER_IMAGE} with compiled kernel modules." + if ((docker run --privileged --pid=host --name build-kernel-modules-${_driver_version} \ + --entrypoint nvidia-driver ${DOCKER_IMAGE} update -t builtin 2>&1 ) > /tmp/kernel-module-build.log); then + cat /tmp/kernel-module-build.log | grep -Ev "'naked' return found in RETHUNK build|missing int3 after ret" + docker commit -m '"Compile Linux kernel modules version ${FEDORA_UNAME} for NVIDIA driver version ${_driver_version}"' \ + --change='ENTRYPOINT ["nvidia-driver", "init"]' build-kernel-modules-${_driver_version} ${DOCKER_IMAGE} + echo "Pushing ${DOCKER_IMAGE} with compiled kernel interface modules." + else + cat /tmp/kernel-module-build.log | grep -Ev "'naked' return found in RETHUNK build|missing int3 after ret" + export DOCKER_IMAGE_NO_PRECOMPILED_KERNEL_MODULES=${IMAGE_BASE_NAME}:${_tag_prefix}${_driver_version}-fedora${FEDORA_VERSION} + docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_NO_PRECOMPILED_KERNEL_MODULES} + export DOCKER_IMAGE=${DOCKER_IMAGE_NO_PRECOMPILED_KERNEL_MODULES} + echo "Pushing ${DOCKER_IMAGE} without compiled kernel interface modules." + fi else echo "Pushing ${DOCKER_IMAGE} to internal gitlab repository." fi + docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" docker push -q ${DOCKER_IMAGE} else echo "Skipping push of ${DOCKER_IMAGE} to internal gitlab repository." 
@@ -135,129 +188,149 @@ stages: .build-push-script: &build-push-script - *build_push_fn-script - - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" - - for driver_version in ${ALL_DRIVER_VERSIONS:-${DRIVER_VERSION}}; do build_push_fn ${driver_version} $OVERWRITE_TAGS;done + - for driver_version in ${DRIVER_VERSIONS:-${DRIVER_VERSION}}; do build_push_fn ${driver_version} $OVERWRITE_TAGS;done build-push-next-one-only: stage: build extends: .common script: - *build_push_fn-script - - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" # Here we pass in the short git sha which gets suffixed to the built image. - - for driver_version in ${DRIVER_VERSION}; do build_push_fn ${driver_version} $OVERWRITE_TAGS -${CI_COMMIT_SHORT_SHA}; done + - for driver_version in ${DRIVER_VERSION}; do build_push_fn ${driver_version} $OVERWRITE_TAGS ${CI_COMMIT_SHORT_SHA}-; done tags: - fcos-next except: - /fedora/ -.common-build: +build-push: stage: build extends: .common script: - *build-push-script - only: - - /fedora/ - -build-push-next: - stage: build - extends: .common-build + parallel: + matrix: + - STREAM: [next, testing, stable] tags: - - fcos-next + - fcos-${STREAM} only: - /fedora/ -build-push-testing: - extends: .common-build - tags: - - fcos-testing - only: - - /fedora/ - -build-push-stable: - extends: .common-build - tags: - - fcos-stable - only: - - /fedora/ - -test: - stage: test - extends: .common - script: - - "true" # TODO: write a test! - tags: - - fcos-next - .common-scan: - # Trivy, which is the gitlab default, does not supported fedora. 
- # image: registry.gitlab.com/security-products/container-scanning:4 - image: registry.gitlab.com/security-products/container-scanning/grype:4 + image: registry.gitlab.com/security-products/container-scanning:6 stage: scan - extends: .common variables: - # https://docs.gitlab.com/ee/user/application_security/container_scanning/index.html#overriding-the-container-scanning-template - # https://docs.gitlab.com/ee/user/application_security/container_scanning/#vulnerability-allowlisting GIT_STRATEGY: fetch DOCKERFILE_PATH: '$CI_PROJECT_DIR/fedora/Dockerfile' + # SECURE_LOG_LEVEL: 'debug' + TRIVY_TIMEOUT: 30m + allow_failure: false + artifacts: + paths: [driver-artifacts] + dependencies: [] before_script: - !reference [.common, before_script] - - echo ${DOCKERFILE_PATH} - if ! [[ -z ${HTTP_PROXY} ]]; then sudo -- sh -c 'echo Acquire::http::Proxy \"'${HTTP_PROXY}'\"\; >> /etc/apt/apt.conf.d/proxy.conf'; fi - if ! [[ -z ${HTTPS_PROXY} ]]; then sudo -- sh -c 'echo Acquire::https::Proxy \"'${HTTPS_PROXY}'\"\; >> /etc/apt/apt.conf.d/proxy.conf'; fi - sudo apt-get -qy update > /dev/null - sudo apt-get -qy install jq > /dev/null + - mkdir driver-artifacts - | function scan_fn() { local _driver_version=$1 - export DOCKER_IMAGE=$CI_REGISTRY_IMAGE:${_driver_version}-fedora${FEDORA_VERSION}-${FEDORA_UNAME} - gtcs scan + local _tag_prefix=$2 + if [[ "${KERNEL_TYPE}" == "kernel-open" ]]; then + local _driver_major_version=$(echo ${_driver_version} | cut -c1-3) + if [[ "${_driver_major_version}" != "515" && "${_driver_major_version}" != "525" ]]; then + echo "Driver version ${_driver_version} does not support ${KERNEL_TYPE}, skipping scan." + return + fi + fi + export DOCKER_TAG=${_tag_prefix}${_driver_version}-${FEDORA_UNAME}-fedora${FEDORA_VERSION} + export CS_IMAGE=${IMAGE_BASE_NAME}:${DOCKER_TAG} + # Try to scan the kernel-specific tag...if it fails go to non-kernel-specific tag... + if ! 
(gtcs scan); then + export DOCKER_TAG=${_tag_prefix}${_driver_version}-fedora${FEDORA_VERSION} + export CS_IMAGE=${IMAGE_BASE_NAME}:${DOCKER_TAG} + gtcs scan + fi cat gl-container-scanning-report.json | jq '.vulnerabilities[].severity' | sort | uniq -c if [[ $(cat gl-container-scanning-report.json | jq '.vulnerabilities | any') == 'true' ]]; then exit 1 fi + if [[ $(cat gl-dependency-scanning-report.json | jq '.vulnerabilities | any') == 'true' ]]; then + exit 1 + fi + mv gl-container-scanning-report.json driver-artifacts/gl-container-scanning-report-${_driver_version}.json + mv gl-dependency-scanning-report.json driver-artifacts/gl-dependency-scanning-report-${_driver_version}.json } script: - - for driver_version in ${ALL_DRIVER_VERSIONS:-${DRIVER_VERSION}}; do scan_fn ${driver_version};done - artifacts: - reports: - container_scanning: gl-container-scanning-report.json - paths: [gl-container-scanning-report.json] - only: + - for driver_version in ${DRIVER_VERSIONS:-${DRIVER_VERSION}}; do scan_fn ${driver_version};done + +scan-next-one-only: + extends: .common-scan + needs: ["build-push-next-one-only"] + script: + - scan_fn ${DRIVER_VERSION} ${CI_COMMIT_SHORT_SHA}- + tags: + - fcos-next + except: - /fedora/ +# Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies. 
+# https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2 +scan: + extends: .common-scan + tags: ["not-a-branch"] + only: ["not-a-branch"] + scan-next: extends: .common-scan - needs: ["build-push-next"] + needs: ["build-push: [next]"] tags: - fcos-next + only: + - /fedora/ scan-testing: extends: .common-scan - needs: ["build-push-testing"] + needs: ["build-push: [testing]"] tags: - fcos-testing + only: + - /fedora/ scan-stable: extends: .common-scan - needs: ["build-push-stable"] + needs: ["build-push: [stable]"] tags: - fcos-stable + only: + - /fedora/ .common-release-fn-script: &common-release-fn-script - | function release_fn() { local _driver_version=$1 local _overwrite_remote_tag=${2:-${OVERWRITE_REMOTE_TAGS}} - local _tag_suffix=$3 - export DOCKER_IMAGE=$CI_REGISTRY_IMAGE:${_driver_version}-fedora${FEDORA_VERSION}-${FEDORA_UNAME}${_tag_suffix} - docker pull -q ${DOCKER_IMAGE} - docker tag ${DOCKER_IMAGE} ${RELEASE_REGISTRY_PROJECT}:${_driver_version}-fedora${FEDORA_VERSION}-${FEDORA_UNAME} - if ! $(docker manifest inspect ${RELEASE_REGISTRY_PROJECT}:${_driver_version}-fedora${FEDORA_VERSION}-${FEDORA_UNAME} > /dev/null 2>&1) || [[ ${_overwrite_remote_tag} -eq 1 ]]; then - echo "Pushing ${RELEASE_REGISTRY_PROJECT}:${_driver_version}-fedora${FEDORA_VERSION}-${FEDORA_UNAME} to remote repository." - docker push -q ${RELEASE_REGISTRY_PROJECT}:${_driver_version}-fedora${FEDORA_VERSION}-${FEDORA_UNAME} + local _tag_prefix=$3 + if [[ "${KERNEL_TYPE}" == "kernel-open" ]]; then + local _driver_major_version=$(echo ${_driver_version} | cut -c1-3) + if [[ "${_driver_major_version}" != "515" && "${_driver_major_version}" != "525" ]]; then + echo "Driver version ${_driver_version} does not support ${KERNEL_TYPE}, skipping release." + return + fi + fi + export DOCKER_TAG=${_tag_prefix}${_driver_version}-${FEDORA_UNAME}-fedora${FEDORA_VERSION} + if ! 
(docker manifest inspect ${IMAGE_BASE_NAME}:${DOCKER_TAG} > /dev/null 2>&1); then + export DOCKER_TAG=${_tag_prefix}${_driver_version}-fedora${FEDORA_VERSION} + fi + docker pull -q ${IMAGE_BASE_NAME}:${DOCKER_TAG} + docker tag ${IMAGE_BASE_NAME}:${DOCKER_TAG} ${RELEASE_REGISTRY_PROJECT}:${DOCKER_TAG} + if ! $(docker manifest inspect ${RELEASE_REGISTRY_PROJECT}:${DOCKER_TAG} > /dev/null 2>&1) || [[ ${_overwrite_remote_tag} -eq 1 ]]; then + echo "Pushing ${RELEASE_REGISTRY_PROJECT}:${DOCKER_TAG} to remote repository." + docker push -q ${RELEASE_REGISTRY_PROJECT}:${DOCKER_TAG} else - echo "Skipping push of ${RELEASE_REGISTRY_PROJECT}:${_driver_version}-fedora${FEDORA_VERSION}-${FEDORA_UNAME} to remote repository." + echo "Skipping push of ${RELEASE_REGISTRY_PROJECT}:${DOCKER_TAG} to remote repository." fi } @@ -272,7 +345,7 @@ scan-stable: - *common-release-fn-script - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" - docker login -u "${RELEASE_REGISTRY_USER}" -p "${RELEASE_REGISTRY_TOKEN}" "${RELEASE_REGISTRY}" - - for driver_version in ${ALL_DRIVER_VERSIONS:-${DRIVER_VERSION}}; do release_fn ${driver_version};done + - for driver_version in ${DRIVER_VERSIONS:-${DRIVER_VERSION}}; do release_fn ${driver_version};done rules: - if: $CI_COMMIT_TAG =~ /fedora$/ || $CI_COMMIT_REF_NAME == 'fedora' @@ -283,25 +356,36 @@ release-next-one-only: # Here we create a circular reference defining the 'remote' registry to in # fact be this repo's registry. This is purely for CICD testing purposes # in a non-protected branch...validating the release stage... 
- RELEASE_REGISTRY_PROJECT: ${CI_REGISTRY_IMAGE} + RELEASE_REGISTRY_PROJECT: ${IMAGE_BASE_NAME} RELEASE_REGISTRY: ${CI_REGISTRY} RELEASE_REGISTRY_USER: ${CI_REGISTRY_USER} RELEASE_REGISTRY_TOKEN: ${CI_REGISTRY_PASSWORD} before_script: - !reference [.common, before_script] script: + - export RELEASE_REGISTRY_PROJECT=${IMAGE_BASE_NAME} - *common-release-fn-script - docker login -u "${CI_REGISTRY_USER}" -p "${CI_REGISTRY_PASSWORD}" "${CI_REGISTRY}" - docker login -u "${RELEASE_REGISTRY_USER}" -p "${RELEASE_REGISTRY_TOKEN}" "${RELEASE_REGISTRY}" - - for driver_version in ${DRIVER_VERSION}; do release_fn ${driver_version} ${OVERWRITE_REMOTE_TAGS} -${CI_COMMIT_SHORT_SHA}; done + - for driver_version in ${DRIVER_VERSION}; do release_fn ${driver_version} ${OVERWRITE_REMOTE_TAGS} ${CI_COMMIT_SHORT_SHA}-; done tags: - fcos-next except: - /fedora/ -# The build will build/push N images to the local gitlab registry. The scan -# will scan each of them, and if any of the scans fail, none of the new images -# will get released...releases run for the fedora branch only. +# Gitlab does not yet support matrix jobs with dynamic matrix-based dependencies. 
+# https://forum.gitlab.com/t/ci-specifying-artifact-dependencies-when-using-parallel-matrix/45026/2 +# release: +# extends: .common-release +# before_script: +# - !reference [.common, before_script] +# needs: ["scan"] +# parallel: +# matrix: +# - STREAM: [next, testing, stable] +# tags: +# - fcos-${STREAM} + release-next: extends: .common-release before_script: @@ -324,4 +408,4 @@ release-stable: - !reference [.common, before_script] needs: ["scan-stable"] tags: - - fcos-stable + - fcos-stable \ No newline at end of file diff --git a/fedora/Dockerfile b/fedora/Dockerfile index de60a0bf..1e5407cd 100644 --- a/fedora/Dockerfile +++ b/fedora/Dockerfile @@ -1,50 +1,49 @@ -# ****************************************************************************** -# REMOVED VGPU UTIL RHEL WORK FOR NOW - HAVEN'T ATTEMPTED INCLUSION -# ****************************************************************************** +ARG FEDORA_VERSION=36 +ARG CUDA_VERSION -# FROM nvidia/cuda:11.6.0-base-ubi8 as build +FROM nvidia/cuda:${CUDA_VERSION}-base-ubi8 as build -# ARG TARGETARCH +ARG TARGETARCH -# SHELL ["/bin/bash", "-c"] +SHELL ["/bin/bash", "-c"] -# RUN dnf install -y git wget +RUN dnf install -y git wget -# ENV GOLANG_VERSION=1.20 +ENV GOLANG_VERSION=1.21.5 -# # download appropriate binary based on the target architecture for multi-arch builds -# RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \ -# curl https://storage.googleapis.com/golang/go${GOLANG_VERSION}.linux-${OS_ARCH}.tar.gz \ -# | tar -C /usr/local -xz +# download appropriate binary based on the target architecture for multi-arch builds +RUN OS_ARCH=${TARGETARCH/x86_64/amd64} && OS_ARCH=${OS_ARCH/aarch64/arm64} && \ + curl https://dl.google.com/go/go${GOLANG_VERSION}.linux-${OS_ARCH}.tar.gz \ + | tar -C /usr/local -xz -# ENV PATH /usr/local/go/bin:$PATH +ENV PATH /usr/local/go/bin:$PATH -# WORKDIR /work +WORKDIR /work -# RUN git clone https://gitlab.com/nvidia/container-images/driver && \ -# cd 
driver/vgpu/src && \ -# go build -o vgpu-util && \ -# mv vgpu-util /work +RUN git clone https://gitlab.com/nvidia/container-images/driver && \ + cd driver/vgpu/src && \ + go build -o vgpu-util && \ + mv vgpu-util /work ARG FEDORA_VERSION=36 ARG CUDA_VERSION=11.7.0 -FROM nvidia/cuda:$CUDA_VERSION-base-ubi8 as license - -FROM fedora:$FEDORA_VERSION +FROM fedora:${FEDORA_VERSION} ARG TARGETARCH ENV TARGETARCH=$TARGETARCH + +ARG KERNEL_TYPE +ENV KERNEL_TYPE=$KERNEL_TYPE + ARG HTTP_PROXY ENV HTTP_PROXY=$HTTP_PROXY ARG HTTPS_PROXY ENV HTTPS_PROXY=$HTTPS_PROXY - SHELL ["/bin/bash", "-c"] -# Note that this prior url is non-https....so don't use it... -# ARG BASE_URL=http://us.download.nvidia.com/XFree86/Linux-x86_64 +#ARG BASE_URL=http://us.download.nvidia.com/XFree86/Linux-x86_64 ARG BASE_URL=https://us.download.nvidia.com/tesla ENV BASE_URL=${BASE_URL} ARG DRIVER_VERSION @@ -53,10 +52,10 @@ ENV DRIVER_VERSION=$DRIVER_VERSION # Arg to indicate if driver type is either of passthrough/baremetal or vgpu ARG DRIVER_TYPE=passthrough ENV DRIVER_TYPE=$DRIVER_TYPE -ARG VGPU_LICENSE_SERVER_TYPE=FNE +ARG VGPU_LICENSE_SERVER_TYPE=NLS ENV VGPU_LICENSE_SERVER_TYPE=$VGPU_LICENSE_SERVER_TYPE # Enable vGPU version compability check by default -ARG DISABLE_VGPU_VERSION_CHECK=false +ARG DISABLE_VGPU_VERSION_CHECK=true ENV DISABLE_VGPU_VERSION_CHECK=$DISABLE_VGPU_VERSION_CHECK # Avoid dependency of container-toolkit for driver container ENV NVIDIA_VISIBLE_DEVICES=void @@ -109,7 +108,8 @@ WORKDIR /usr/src/nvidia-$DRIVER_VERSION # COPY ocp_dtk_entrypoint /usr/local/bin COPY common.sh /usr/local/bin -# COPY --from=build /work/vgpu-util /usr/local/bin +COPY --from=build /work/vgpu-util /usr/local/bin + WORKDIR /drivers ARG PUBLIC_KEY=empty @@ -126,15 +126,15 @@ LABEL release="N/A" LABEL summary="Provision the NVIDIA driver through containers" LABEL description="See summary" -# Add NGC DL license -COPY --from=license /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE +# Add NGC DL 
license from the CUDA image +COPY --from=build /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-LICENSE -# # Install / upgrade packages here that are required to resolve CVEs -# ARG CVE_UPDATES -# RUN if [ -n "${CVE_UPDATES}" ]; then \ -# yum update -y ${CVE_UPDATES} && \ -# rm -rf /var/cache/yum/*; \ -# fi +# Install / upgrade packages here that are required to resolve CVEs +ARG CVE_UPDATES +RUN if [ -n "${CVE_UPDATES}" ]; then \ + yum update -y ${CVE_UPDATES} && \ + rm -rf /var/cache/yum/*; \ + fi # Remove cuda repository to avoid GPG errors RUN rm -f /etc/yum.repos.d/cuda.repo diff --git a/fedora/README.md b/fedora/README.md index 760a78c1..54174658 100644 --- a/fedora/README.md +++ b/fedora/README.md @@ -12,59 +12,63 @@ When run as a privileged 'driver container' they install/run NVIDIA kernel modul See [here](https://github.com/NVIDIA/nvidia-docker/wiki/) for an overview of the overall architecture. -Together with the separate [NVIDIA Container Runtime](https://github.com/NVIDIA/nvidia-docker) which can be installed via a Fedora-specific [fork](https://container-toolkit-fcos.gitlab.io/container-runtime) containerized GPU workloads can be run. - ## Supported GPUs/Drivers NVIDIA datacenter GPUs based on Pascal+ architecture (e.g. P100, V100, T4, A100) running x86 FCOS are supported. NVIDIA datacenter drivers support a specific CUDA version and have minimum supported Linux kernel constraints. -Currently built driver version are specified in `ci/fedora/.common-ci-fcos.yml` with 510.47.03 the latest target. +Currently built driver versions are specified in `ci/fedora/.common-ci-fcos.yml`. ## Getting Started ### Running the Driver Container -The driver container is privileged, and here we choose to launch via podman instead of docker. +The driver container is privileged, and here we choose to launch via podman instead of docker although both work. 
```bash -$ DRIVER_VERSION=510.47.03-fedora$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2)-$(uname -r) +$ DRIVER_VERSION=535.104.12 # Check ci/fedora/.common-ci-fcos.yml for latest +$ FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2) $ podman run -d --privileged --pid=host \ -v /run/nvidia:/run/nvidia:shared \ - -v /tmp/nvidia:/var/log \ + -v /var/log:/var/log \ --name nvidia-driver \ - registry.gitlab.com/container-toolkit-fcos/driver:${DRIVER_VERSION} + registry.gitlab.com/container-toolkit-fcos/driver:${DRIVER_VERSION}-fedora${FEDORA_VERSION_ID} ``` -Or, on FCOS registering as a systemd unit via an ignition snippet. +Or, on FCOS registering as a systemd unit via an ignition snippet, and using an image with kernel headers pre-installed for faster start up: -``` -... +```yaml +variant: fcos +version: 1.4.0 +systemd: + units: - name: acme-nvidia-driver.service enabled: true contents: | [Unit] Requires=network-online.target After=network-online.target - + StartLimitInterval=1600 + StartLimitBurst=5 [Service] TimeoutStartSec=250 ExecStartPre=-/bin/podman stop nvidia-driver ExecStartPre=-/bin/podman rm nvidia-driver - - # Switch off SELINUX enforcement...interested in knowing how to avoid requiring this... 
ExecStartPre=-setenforce 0 ExecStartPre=-/bin/mkdir -p /run/nvidia ExecStartPre=-/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ - /bin/podman pull registry.gitlab.com/container-toolkit-fcos/driver:510.47.03-$$FEDORA_VERSION_ID-$$KERNEL_VERSION' - ExecStart=/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2);/bin/podman run --name nvidia-driver \ - -v /run/nvidia:/run/nvidia:shared \ - -v /var/log:/var/log \ - --privileged \ - --pid=host \ - registry.gitlab.com/container-toolkit-fcos/driver:510.47.03-fedora$$FEDORA_VERSION_ID-$$KERNEL_VERSION \ - --accept-license' + /bin/podman pull registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID' + ExecStartPre=-/usr/sbin/modprobe video + ExecStart=/bin/sh -c 'KERNEL_VERSION=$(/bin/uname -r);FEDORA_VERSION_ID=$(cat /etc/os-release | grep VERSION_ID | cut -d = -f2); \ + /bin/podman run --name nvidia-driver \ + -v /run/nvidia:/run/nvidia:shared \ + -v /var/log:/var/log \ + --privileged --pid=host \ + # No need for network IF using container image with pre-built kernel headers \ + --network=none \ + registry.gitlab.com/container-toolkit-fcos/driver:535.104.12-$$KERNEL_VERSION-fedora$$FEDORA_VERSION_ID \ + --accept-license' ExecStop=/bin/podman stop nvidia-driver Restart=on-failure @@ -72,7 +76,6 @@ Or, on FCOS registering as a systemd unit via an ignition snippet. [Install] WantedBy=multi-user.target -... ``` ### Validating the Driver Container @@ -80,19 +83,19 @@ Or, on FCOS registering as a systemd unit via an ignition snippet. You should be able to step into the driver container and run the `nvidia-smi` tool to validate the GPU has been recognized and see what CUDA version you are running. 
```bash -$ # Assumes you've named the container nvidia-driver as above../ -$ podman exec -it nvidia-driver bash +$ # Assumes you're running the driver container via podman and named nvidia-driver as above... +$ podman exec -it nvidia-driver bash [root@8dc88dad905e nvidia-510.47.03]# nvidia-smi Wed May 25 15:24:00 2022 +-----------------------------------------------------------------------------+ -| NVIDIA-SMI 510.47.03 Driver Version: 510.47.03 CUDA Version: 11.6 | +| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| -| 0 Tesla M60 On | 00000000:00:1E.0 Off | 0 | -| N/A 52C P0 119W / 150W | 7313MiB / 7680MiB | 100% Default | +| 0 NVIDIA A10G On | 00000000:00:1E.0 Off | 0 | +| 0% 39C P0 197W / 300W | 22022MiB / 23028MiB | 96% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ @@ -103,16 +106,50 @@ Wed May 25 15:24:00 2022 |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+ -[root@8dc88dad905e nvidia-510.47.03]# +[root@8dc88dad905e]# ``` ### Install Container Runtime / Toolkit -To run a CUDA container that leverages the NVIDIA driver container you now have running, install the separate NVIDIA container runtime and register it with your container runtime system (e.g. docker). +To run a CUDA container that leverages the NVIDIA driver container you now have running, install the separate NVIDIA container runtime and register it with your container runtime system (e.g. 
docker) following NVIDIA's instructions [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html). -NVIDIA do not support Fedora artifacts for the container runtime but this Fedora-specific [fork](https://container-toolkit-fcos.gitlab.io/container-runtime) of the NVIDIA Container Runtime project does. +On Fedora CoreOS you may choose to layer the container toolkit using `rpm-ostree`, and configure your runtime, with an ignition snippet like this (substitute your runtime, docker is shown, but containerd works too for example): + +```yaml +variant: fcos +version: 1.4.0 +systemd: + units: + - name: acme-layer-nvidia-container-runtime.service + enabled: true + # We run before `zincati.service` to avoid conflicting rpm-ostree transactions. + contents: | + [Unit] + After=network-online.target + Before=zincati.service + ConditionPathExists=!/var/lib/%N.stamp + StartLimitInterval=350 + StartLimitBurst=5 + [Service] + Type=oneshot + RemainAfterExit=yes + ExecStartPre=-/bin/rm -rf /var/cache/rpm-ostree/repomd/{libnvidia,nvidia}* + ExecStartPre=-/bin/sh -c 'curl -s -L https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo \ + > /etc/yum.repos.d/nvidia-container-toolkit.repo' + # Perhaps consider pinning the rpm version here depending on change aversion... + ExecStart=/usr/bin/rpm-ostree install --idempotent --allow-inactive --apply-live nvidia-container-toolkit + ExecStart=/bin/sh -c 'echo "/run/nvidia/driver/usr/lib64" > /etc/ld.so.conf.d/nv.conf; ldconfig' + # If we see that the nvidia-ctk is present, then we can configure docker... 
+ ExecStart=/bin/sh -c 'if [[ -f /usr/bin/nvidia-ctk ]]; then \ + /usr/bin/nvidia-ctk runtime configure --runtime=docker --nvidia-set-as-default; \ + systemctl restart docker; \ + /bin/touch /var/lib/%N.stamp; fi' + Restart=on-failure + RestartSec=60 -Installation instructions include a potential FCOS ignition snippet for applying changes to `/etc/docker/daemon.json` to register the runtime with docker. + [Install] + WantedBy=multi-user.target +``` ### Running a CUDA Container @@ -128,10 +165,10 @@ Test PASSED Done ``` -# License Information +## License Information View license information for the software contained in this image in the git repo. As with all Docker images, these likely also contain other software which may be under other licenses (such as Bash, etc from the base distribution, along with any direct or indirect dependencies of the primary software being contained). -As for any pre-built image usage, it is the image user's responsibility to ensure that any use of this image complies with any relevant licenses for all software contained within. \ No newline at end of file +As for any pre-built image usage, it is the image user's responsibility to ensure that any use of this image complies with any relevant licenses for all software contained within. 
diff --git a/fedora/nvidia-driver b/fedora/nvidia-driver index 79d4b2b2..6dbc9c2c 100755 --- a/fedora/nvidia-driver +++ b/fedora/nvidia-driver @@ -15,6 +15,10 @@ NVIDIA_MODESET_MODULE_PARAMS=() NVIDIA_PEERMEM_MODULE_PARAMS=() TARGETARCH=${TARGETARCH:?"Missing TARGETARCH env"} USE_HOST_MOFED="${USE_HOST_MOFED:-false}" +DNF_RELEASEVER=${DNF_RELEASEVER:-""} + +OPEN_KERNEL_MODULES_ENABLED=${OPEN_KERNEL_MODULES_ENABLED:-false} +[[ "${OPEN_KERNEL_MODULES_ENABLED}" == "true" ]] && KERNEL_TYPE=kernel-open || KERNEL_TYPE=kernel DRIVER_ARCH=${TARGETARCH/amd64/x86_64} && DRIVER_ARCH=${DRIVER_ARCH/arm64/aarch64} echo "DRIVER_ARCH is $DRIVER_ARCH" @@ -89,6 +93,8 @@ _install_prerequisites() ( dnf -q -y install kernel-headers fi fi + echo "List kernel-headers rpm version installed:" + rpm -qa | grep ^kernel-headers || true echo "Installing Linux development files..." if ! dnf -q -y install kernel-devel-${KERNEL_VERSION} > /dev/null; then @@ -102,9 +108,11 @@ _install_prerequisites() ( exit 1 fi fi + echo "List kernel-devel rpm version installed:" + rpm -qa | grep ^kernel-devel || true ln -s /usr/src/kernels/${KERNEL_VERSION} /lib/modules/${KERNEL_VERSION}/build - echo "Installing Linux kernel module files..." + echo "Installing Linux kernel-core files..." if ! dnf -q -y download kernel-core-${KERNEL_VERSION} > /dev/null; then echo "Failed to find kernel-core-${KERNEL_VERSION} in repositories." echo "Trying to download kernel-core from koji..." @@ -115,19 +123,44 @@ _install_prerequisites() ( exit 1 fi fi + echo "List kernel-core rpm version installed:" + rpm -qa | grep ^kernel-core || true cat ./kernel-core-*.rpm | rpm2cpio | cpio -idm --quiet rm ./*.rpm + echo "Installing Linux kernel-modules-core files..." + if ! dnf -q -y download kernel-modules-core-${KERNEL_VERSION} > /dev/null; then + echo "Failed to find kernel-modules-core-${KERNEL_VERSION} in repositories." + echo "Trying to download kernel-modules-core from koji..." 
+ KOJI_KERNEL_CORE_RPM=$KOJI_BASE_URL/packages/kernel/$KERNEL_RPM_VERSION/$KERNEL_RPM_RELEASE/$KERNEL_RPM_ARCH/kernel-modules-core-$KERNEL_VERSION.rpm + if ! dnf -q -y download $KOJI_KERNEL_CORE_RPM \ + --setopt=install_weak_deps=False; then + echo "Can't find kernel-modules-core-${KERNEL_VERSION}" + echo "Please try to update your kernel on the host system." + exit 1 + else + echo 'Kernel-modules-core installed from koji' + fi + fi + echo "List kernel-modules rpm version installed:" + rpm -qa | grep ^kernel-modules || true + cat ./kernel-modules-*.rpm | rpm2cpio | cpio -idm --quiet + rm ./*.rpm + mv lib/modules/${KERNEL_VERSION}/modules.* /lib/modules/${KERNEL_VERSION} mv lib/modules/${KERNEL_VERSION}/kernel /lib/modules/${KERNEL_VERSION} # Prevent depmod from giving a WARNING about missing files touch /lib/modules/${KERNEL_VERSION}/modules.order touch /lib/modules/${KERNEL_VERSION}/modules.builtin + depmod ${KERNEL_VERSION} echo "Generating Linux kernel version string..." - - extract-vmlinux ./lib/modules/${KERNEL_VERSION}/vmlinuz | strings | grep -E '^Linux version' | sed 's/^\(.*\)\s\+(.*)$/\1/' > version + if [ "$TARGETARCH" = "arm64" ]; then + gunzip -c /lib/modules/${KERNEL_VERSION}/vmlinuz | strings | grep -E '^Linux version' | sed 's/^\(.*\)\s\+(.*)$/\1/' > version + else + extract-vmlinux /lib/modules/${KERNEL_VERSION}/vmlinuz | strings | grep -E '^Linux version' | sed 's/^\(.*\)\s\+(.*)$/\1/' > version + fi if [ -z "$( version # Example: @@ -144,7 +177,7 @@ _install_prerequisites() ( local gcc_version=$(cat /lib/modules/${KERNEL_VERSION}/proc/version | grep -Eo "Red Hat ([0-9\.-]+)" | grep -Eo "([0-9\.-]+)").${KERNEL_OS_VERSION}.${KERNEL_RPM_ARCH} local current_gcc=$(rpm -qa gcc) - if ! [[ "${current_gcc}" =~ "gcc-${gcc_version}"-.* ]]; then + if ! 
[[ "${current_gcc}" =~ "gcc-${gcc_version}".* ]]; then echo "kernel requires gcc version: 'gcc-${gcc_version}', current gcc version is '${current_gcc}'" # Sometimes there are -v differences in the x.y.z-v gcc versions available. # So we try, but allow failure of installation of precise gcc version. @@ -173,13 +206,10 @@ _kernel_requires_package() { echo "Checking NVIDIA driver packages..." - [[ ! -d /usr/src/nvidia-${DRIVER_VERSION}/kernel ]] && return 0 - cd /usr/src/nvidia-${DRIVER_VERSION}/kernel + [[ ! -d /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} ]] && return 0 + cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} - # When the kernel version is latest on host, this check fails and lead to recompilation, even when precompiled modules exist. - #if [ "${KERNEL_VERSION}" != "$(uname -r)" ]; then proc_mount_arg="--proc-mount-point /lib/modules/${KERNEL_VERSION}/proc" - #fi for pkg_name in $(ls -d -1 precompiled/** 2> /dev/null); do is_match=$(../mkprecompiled --match ${pkg_name} ${proc_mount_arg}) if [ "${is_match}" == "kernel interface matches." ]; then @@ -200,7 +230,7 @@ _create_driver_package() ( trap "make -s -j SYSSRC=/lib/modules/${KERNEL_VERSION}/build clean > /dev/null" EXIT echo "Compiling NVIDIA driver kernel modules..." - cd /usr/src/nvidia-${DRIVER_VERSION}/kernel + cd /usr/src/nvidia-${DRIVER_VERSION}/${KERNEL_TYPE} if _gpu_direct_rdma_enabled; then ln -s /run/mellanox/drivers/usr/src/ofa_kernel /usr/src/ @@ -217,7 +247,7 @@ _create_driver_package() ( fi fi - IGNORE_CC_MISMATCH=1 make -s -e NV_EXCLUDE_KERNEL_MODULES=nvidia-drm -j SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null + make -s -j SYSSRC=/lib/modules/${KERNEL_VERSION}/build nv-linux.o nv-modeset-linux.o > /dev/null echo "Relinking NVIDIA driver kernel modules..." rm -f nvidia.ko nvidia-modeset.ko @@ -308,6 +338,12 @@ _get_module_params() { # Load the kernel modules and start persistenced. 
_load_driver() { + # Apply SELinux labels before loading modules + # if [ -e /sys/fs/selinux ]; then + # echo "Change module files security context to modules_object_t" + # chcon -R -t modules_object_t /lib/modules/$KERNEL_VERSION/kernel/drivers/video + # fi + echo "Parsing kernel module parameters..." _get_module_params @@ -449,8 +485,7 @@ _install_driver() { if [ "${ACCEPT_LICENSE}" = "yes" ]; then install_args+=("--accept-license") fi - - IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check ${install_args[@]+"${install_args[@]}"} + IGNORE_CC_MISMATCH=1 nvidia-installer --kernel-module-only --no-drm --ui=none --no-nouveau-check -m=${KERNEL_TYPE} ${install_args[@]+"${install_args[@]}"} # May need to add no-cc-check for Rhel, otherwise it complains about cc missing in path # /proc/version and lib/modules/KERNEL_VERSION/proc are different, by default installer looks at /proc/ so, added the proc-mount-point # TODO: remove the -a flag. its not needed. in the new driver version, license-acceptance is implicit @@ -465,8 +500,14 @@ _mount_rootfs() { mkdir -p ${RUN_DIR}/driver mount --rbind / ${RUN_DIR}/driver - echo "Change device files security context for selinux compatibility" - chcon -R -t container_file_t ${RUN_DIR}/driver/dev + echo "Check SELinux status" + if [ -e /sys/fs/selinux ]; then + echo "SELinux is enabled" + echo "Change device files security context for selinux compatibility" + chcon -R -t container_file_t ${RUN_DIR}/driver/dev + else + echo "SELinux is disabled, skipping..." + fi } # Unmount the driver rootfs from the run directory. @@ -499,6 +540,11 @@ EOF } _shutdown() { + if [ -f /var/log/nvidia-installer.log ]; then + echo "***" + cat /var/log/nvidia-installer.log + echo "***" + fi if _unload_driver; then _unmount_rootfs rm -f ${PID_FILE} ${KERNEL_UPDATE_HOOK} @@ -555,11 +601,11 @@ _prepare() { fi # Install the userspace components and copy the kernel module sources. 
- sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x -m=${KERNEL_TYPE} && \ cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ sh /tmp/install.sh nvinstall && \ mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ - mv LICENSE mkprecompiled kernel /usr/src/nvidia-$DRIVER_VERSION && \ + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \ sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest echo -e "\n========== NVIDIA Software Installer ==========\n" @@ -587,7 +633,7 @@ _build() { # Install dependencies if _kernel_requires_package; then _update_package_cache - #_resolve_kernel_version || exit 1 + # _resolve_kernel_version || exit 1 _install_prerequisites _create_driver_package #_remove_prerequisites @@ -649,11 +695,11 @@ update() { if [ "${DRIVER_TYPE}" != "vgpu" ]; then # Install the userspace components and copy the kernel module sources. if [ ! -e /usr/src/nvidia-${DRIVER_VERSION}/mkprecompiled ]; then - sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x && \ + sh NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION.run -x -m=${KERNEL_TYPE} && \ cd NVIDIA-Linux-$DRIVER_ARCH-$DRIVER_VERSION && \ sh /tmp/install.sh nvinstall && \ mkdir -p /usr/src/nvidia-$DRIVER_VERSION && \ - mv LICENSE mkprecompiled kernel /usr/src/nvidia-$DRIVER_VERSION && \ + mv LICENSE mkprecompiled ${KERNEL_TYPE} /usr/src/nvidia-$DRIVER_VERSION && \ sed '9,${/^\(kernel\|LICENSE\)/!d}' .manifest > /usr/src/nvidia-$DRIVER_VERSION/.manifest fi fi @@ -679,14 +725,14 @@ update() { # Wait for MOFED drivers to be loaded and load nvidia-peermem whenever it gets unloaded during MOFED driver updates reload_nvidia_peermem() { if [ "$USE_HOST_MOFED" = "true" ]; then - until lsmod | grep mlx5_core > /dev/null 2>&1 && [ -f /sys/module/nvidia/refcnt ]; + until lsmod | grep mlx5_core > /dev/null 2>&1 && [ -f /run/nvidia/validations/.driver-ctr-ready ]; do echo "waiting for mellanox ofed and nvidia drivers 
to be installed" sleep 10 done else # use driver readiness flag created by MOFED container - until [ -f /run/mellanox/drivers/.driver-ready ] && [ -f /sys/module/nvidia/refcnt ]; + until [ -f /run/mellanox/drivers/.driver-ready ] && [ -f /run/nvidia/validations/.driver-ctr-ready ]; do echo "waiting for mellanox ofed and nvidia drivers to be installed" sleep 10 @@ -723,8 +769,7 @@ usage() { Usage: $0 COMMAND [ARG...] Commands: - init [-a | --accept-license] - init-debug [-a | --accept-license] [-x] + init [-a | --accept-license] [-x] build [-a | --accept-license] load update [-k | --kernel VERSION] [-s | --sign KEYID] [-t | --tag TAG]