From fbc570b82013f4da88179139d376b6126bbcf4e0 Mon Sep 17 00:00:00 2001 From: Rob Young Date: Fri, 5 Nov 2021 17:26:22 -0400 Subject: [PATCH 1/5] first pass of gpu smoke test Script collects version of driver, toolkit and tensorflow in a way that doesn't require communication with the card --- scripts/test-config-cuda.sh | 81 ++++++++++++++++++++++++++++++++++++ tests/gpu/misc/examples_tf.R | 36 ++++++++++++++++ tests/gpu/misc/nvblas.R | 1 + tests/ml/nvblas.R | 11 +++++ 4 files changed, 129 insertions(+) create mode 100644 scripts/test-config-cuda.sh create mode 100644 tests/gpu/misc/examples_tf.R create mode 120000 tests/gpu/misc/nvblas.R create mode 100644 tests/ml/nvblas.R diff --git a/scripts/test-config-cuda.sh b/scripts/test-config-cuda.sh new file mode 100644 index 00000000..220e6d4e --- /dev/null +++ b/scripts/test-config-cuda.sh @@ -0,0 +1,81 @@ +#!/bin/bash +set -e + +# set log location from command invokation +LOG_LOC=$1 +TEST_FAIL=false + +#!/bin/bash +set -e + +# set log location from command invokation +LOG_LOC=$1 +TEST_FAIL=false + +# driver +PROC_DRIVER_FILE=/proc/driver/nvidia/version +if [ ! -f "$PROC_DRIVER_FILE" ] +then + echo "$PROC_DRIVER_FILE doesn't exist" | tee -a $LOG_LOC + echo "WARNING: CUDA driver may not be correctly installed." | tee -a $LOG_LOC + TEST_FAIL=true +else + # 2 possible command line options + # 1) we could parse /proc/driver/nvidia/version, but output isn't easy to parse: + # NVRM version: NVIDIA UNIX x86_64 Kernel Module 470.74 Mon Sep 13 23:09:15 UTC 2021 + # GCC version: gcc version 9.3.0 (Ubuntu 9.3.0-17ubuntu1~20.04) + # 2) nvidia-smi --query-gpu=driver_version --format=csv + # output is easy to parse: + # driver_version + # 470.74 + # but nvidia-smi may require communication with the card that we won't have. + # testing will be needed + while read line; do + IFS=' ' read -ra tmp_array <<< $line + if [ ${tmp_array[0]} = "NVRM" ] && [ ${tmp_array[1]} = "version:" ] + then + VERSION_DRIVER=${tmp_array[7]} + fi + done < $PROC_DRIVER_FILE +fi + +echo $VERSION_DRIVER + +# toolkit +if ! TOOLKIT_CHECK_OUTPUT=$(nvcc -V 2>&1); +then + echo "Failed to run 'nvcc -V' with error message: $TOOLKIT_CHECK_OUTPUT" | tee -a $LOG_LOC + echo "WARNING: CUDA toolkit may not be correctly installed." | tee -a $LOG_LOC + TEST_FAIL=true +else + # parse output to get version number + while IFS= read -r line + do + IFS=' ' read -ra tmp_array <<< $line + if [ "${tmp_array[3]}" = "release" ] + then + VERSION_TOOLKIT=${tmp_array[5]} + fi + done <<< $TOOLKIT_CHECK_OUTPUT +fi + +echo $VERSION_TOOLKIT + +# tensorflow +if ! VERSION_TF_OUTPUT=`python -c 'import tensorflow as tf; print(tf.__version__)' 2>&1`; +then + echo "Error: trying to get tensorflow version: $TF_VERSION" +else + while IFS= read -r line + do + VERSION_TF=$line + done <<< $VERSION_TF_OUTPUT +fi + +echo $VERSION_TF + +if [ "$TEST_FAIL" = true ] +then + echo "WARNING: at least one of the GPU functionality tests has failed." | tee -a $LOG_LOC + echo "Please run rocker-versioned2/tests/gpu/test-gpu.sh script for more detailed information." | tee -a $LOG_LOC +fi diff --git a/tests/gpu/misc/examples_tf.R b/tests/gpu/misc/examples_tf.R new file mode 100644 index 00000000..ca3f933c --- /dev/null +++ b/tests/gpu/misc/examples_tf.R @@ -0,0 +1,36 @@ + +## Tensorflow: +install.packages('keras', repos='http://cran.us.r-project.org') +library(keras) +mnist <- dataset_mnist() +x_train <- mnist$train$x +y_train <- mnist$train$y +x_test <- mnist$test$x +y_test <- mnist$test$y +# reshape +x_train <- array_reshape(x_train, c(nrow(x_train), 784)) +x_test <- array_reshape(x_test, c(nrow(x_test), 784)) +# rescale +x_train <- x_train / 255 +x_test <- x_test / 255 +y_train <- to_categorical(y_train, 10) +y_test <- to_categorical(y_test, 10) +model <- keras_model_sequential() +model %>% + layer_dense(units = 256, activation = 'relu', input_shape = c(784)) %>% + layer_dropout(rate = 0.4) %>% + layer_dense(units = 128, activation = 'relu') %>% + layer_dropout(rate = 0.3) %>% + layer_dense(units = 10, activation = 'softmax') + + model %>% compile( + loss = 'categorical_crossentropy', + optimizer = optimizer_rmsprop(), + metrics = c('accuracy') + ) + history <- model %>% fit( + x_train, y_train, + epochs = 30, batch_size = 128, + validation_split = 0.2 + ) +model %>% evaluate(x_test, y_test) diff --git a/tests/gpu/misc/nvblas.R b/tests/gpu/misc/nvblas.R new file mode 120000 index 00000000..499cf1a8 --- /dev/null +++ b/tests/gpu/misc/nvblas.R @@ -0,0 +1 @@ +../../ml/nvblas.R \ No newline at end of file diff --git a/tests/ml/nvblas.R b/tests/ml/nvblas.R new file mode 100644 index 00000000..c0bbbda4 --- /dev/null +++ b/tests/ml/nvblas.R @@ -0,0 +1,11 @@ +install.packages("callr") + + +callr::r(function(){ + system.time({ + N <- 2^14 + M <- matrix(rnorm(N*N), nrow=N, ncol=N) + M %*% M + }) + }, env = c(LD_PRELOAD="libnvblas.so") +) From 5e4a5ba3bee256abb3d0ed4205a5bc9f0e6b67e5 Mon Sep 17 00:00:00 2001 From: Rob Young Date: Fri, 12 Nov 2021 16:55:43 -0500 Subject: [PATCH 2/5] removed duplicated code Thanks eitsupi! --- scripts/test-config-cuda.sh | 7 ------- 1 file changed, 7 deletions(-) diff --git a/scripts/test-config-cuda.sh b/scripts/test-config-cuda.sh index 220e6d4e..9c874026 100644 --- a/scripts/test-config-cuda.sh +++ b/scripts/test-config-cuda.sh @@ -5,13 +5,6 @@ set -e LOG_LOC=$1 TEST_FAIL=false -#!/bin/bash -set -e - -# set log location from command invokation -LOG_LOC=$1 -TEST_FAIL=false - # driver PROC_DRIVER_FILE=/proc/driver/nvidia/version if [ ! -f "$PROC_DRIVER_FILE" ] From a623852ab8324096c514ae2be4762357f31caabf Mon Sep 17 00:00:00 2001 From: Rob Young Date: Thu, 6 Jan 2022 14:11:02 -0500 Subject: [PATCH 3/5] changed from backticks to parentheses and added command line argument checking --- scripts/test-config-cuda.sh | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/scripts/test-config-cuda.sh b/scripts/test-config-cuda.sh index 9c874026..1848174f 100644 --- a/scripts/test-config-cuda.sh +++ b/scripts/test-config-cuda.sh @@ -1,6 +1,14 @@ #!/bin/bash set -e +# require one command line argument +if [ "$#" -ne 1 ] +then + echo "Error: one argument for log location required (e.g. ./gpu-test.log)" + echo "Usage: $0 log-location" + exit 1 +fi + # set log location from command invokation LOG_LOC=$1 TEST_FAIL=false @@ -13,16 +21,6 @@ then echo "WARNING: CUDA driver may not be correctly installed." | tee -a $LOG_LOC TEST_FAIL=true else - # 2 possible command line options - # 1) we could parse /proc/driver/nvidia/version, but output isn't easy to parse: - # NVRM version: NVIDIA UNIX x86_64 Kernel Module 470.74 Mon Sep 13 23:09:15 UTC 2021 - # GCC version: gcc version 9.3.0 (Ubuntu 9.3.0-17ubuntu1~20.04) - # 2) nvidia-smi --query-gpu=driver_version --format=csv - # output is easy to parse: - # driver_version - # 470.74 - # but nvidia-smi may require communication with the card that we won't have. - # testing will be needed while read line; do IFS=' ' read -ra tmp_array <<< $line if [ ${tmp_array[0]} = "NVRM" ] && [ ${tmp_array[1]} = "version:" ] @@ -32,7 +30,7 @@ else done < $PROC_DRIVER_FILE fi -echo $VERSION_DRIVER +echo $VERSION_DRIVER | tee -a $LOG_LOC # toolkit if ! TOOLKIT_CHECK_OUTPUT=$(nvcc -V 2>&1); @@ -52,10 +50,10 @@ else done <<< $TOOLKIT_CHECK_OUTPUT fi -echo $VERSION_TOOLKIT +echo $VERSION_TOOLKIT | tee -a $LOG_LOC # tensorflow -if ! VERSION_TF_OUTPUT=`python -c 'import tensorflow as tf; print(tf.__version__)' 2>&1`; +if ! VERSION_TF_OUTPUT=$(python -c 'import tensorflow as tf; print(tf.__version__)' 2>&1); then echo "Error: trying to get tensorflow version: $TF_VERSION" else @@ -65,7 +63,7 @@ else done <<< $VERSION_TF_OUTPUT fi -echo $VERSION_TF +echo $VERSION_TF | tee -a $LOG_LOC if [ "$TEST_FAIL" = true ] then From 2ef3245bb8e6ebc7a8a2a099baab64cf02f6934d Mon Sep 17 00:00:00 2001 From: Rob Young Date: Fri, 7 Jan 2022 14:12:48 -0500 Subject: [PATCH 4/5] removed repo from keras install --- scripts/tests/examples_tf.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tests/examples_tf.R b/scripts/tests/examples_tf.R index ca3f933c..3a942ddf 100755 --- a/scripts/tests/examples_tf.R +++ b/scripts/tests/examples_tf.R @@ -1,6 +1,6 @@ ## Tensorflow: -install.packages('keras', repos='http://cran.us.r-project.org') +install.packages('keras') library(keras) mnist <- dataset_mnist() x_train <- mnist$train$x From 96d585576090da23306f9355a49c085db598fa9c Mon Sep 17 00:00:00 2001 From: Rob Young Date: Fri, 11 Mar 2022 13:58:46 -0500 Subject: [PATCH 5/5] removed repo from keras install command --- tests/gpu/misc/examples_tf.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/gpu/misc/examples_tf.R b/tests/gpu/misc/examples_tf.R index ca3f933c..3a942ddf 100644 --- a/tests/gpu/misc/examples_tf.R +++ b/tests/gpu/misc/examples_tf.R @@ -1,6 +1,6 @@ ## Tensorflow: -install.packages('keras', repos='http://cran.us.r-project.org') +install.packages('keras') library(keras) mnist <- dataset_mnist() x_train <- mnist$train$x