From 8d14a80e84c99fded1f64b545ca023d8a3c88cea Mon Sep 17 00:00:00 2001
From: Jamie Dougherty
Date: Wed, 5 Jun 2024 17:33:34 +0100
Subject: [PATCH] Disable cudnn option (#123)

* add disable cudnn option
* correct comment
* clang format
* add to readme

---------

Co-authored-by: jamied
---
 README.md       | 20 ++++++++++++++++++++
 src/libtorch.cc | 30 ++++++++++++++++++++++++++++--
 2 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 106eb13..731a7c3 100644
--- a/README.md
+++ b/README.md
@@ -144,6 +144,26 @@ key: "INFERENCE_MODE"
 }
 ```
 
+* `DISABLE_CUDNN`: Boolean flag to disable the cuDNN library. By default, cuDNN is enabled.
+
+[cuDNN](https://developer.nvidia.com/cudnn) is a GPU-accelerated library of primitives for
+deep neural networks. cuDNN provides highly tuned implementations for standard routines.
+
+Typically, models run with cuDNN enabled are faster. However, there are some exceptions
+where using cuDNN can be slower, cause higher memory usage, or result in errors.
+
+
+The section of model config file specifying this parameter will look like:
+
+```
+parameters: {
+key: "DISABLE_CUDNN"
+    value: {
+    string_value: "true"
+    }
+}
+```
+
 * `ENABLE_WEIGHT_SHARING`: Boolean flag to enable model instances on the same device to
 share weights. This optimization should not be used with stateful models. If not specified,
 weight sharing is disabled.
diff --git a/src/libtorch.cc b/src/libtorch.cc
index c6d0b5a..dbea502 100644
--- a/src/libtorch.cc
+++ b/src/libtorch.cc
@@ -104,6 +104,7 @@ class ModelState : public BackendModel {
     return enable_jit_executor_pair_;
   }
   bool EnabledInferenceMode() { return enable_inference_mode_; }
+  bool EnabledCudnn() { return enable_cudnn_; }
   bool EnabledCacheCleaning() { return enable_cache_cleaning_; }
 
   bool EnabledWeightSharing() { return enable_weight_sharing_; }
@@ -125,6 +126,9 @@ class ModelState : public BackendModel {
   // Flag to indicate whether inference mode is enabled. Defaults to false.
   bool enable_inference_mode_;
 
+  // Flag to indicate whether cudnn is enabled. Defaults to true.
+  bool enable_cudnn_;
+
   // Flag to indicate whether cache cleaning after each run is enabled.
   // Defaults to false.
   bool enable_cache_cleaning_;
@@ -227,8 +231,9 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
 
 ModelState::ModelState(TRITONBACKEND_Model* triton_model)
     : BackendModel(triton_model), enable_optimized_execution_(true),
-      enable_inference_mode_(true), enable_cache_cleaning_(false),
-      enable_weight_sharing_(false), enable_tensor_fuser_pair_({false, true}),
+      enable_inference_mode_(true), enable_cudnn_(true),
+      enable_cache_cleaning_(false), enable_weight_sharing_(false),
+      enable_tensor_fuser_pair_({false, true}),
       enable_jit_profiling_pair_({false, true}),
       enable_jit_executor_pair_({false, true})
 {
@@ -393,6 +398,24 @@ ModelState::ParseParameters()
          " for model instance '" + Name() + "'")
             .c_str());
 
+    // If 'DISABLE_CUDNN' is not present in 'parameters' then no update is made
+    // to 'enable_cudnn_'.
+    bool disable_cudnn = false;
+    err = ParseParameter(params, "DISABLE_CUDNN", &disable_cudnn);
+    if (err != nullptr) {
+      if (TRITONSERVER_ErrorCode(err) != TRITONSERVER_ERROR_NOT_FOUND) {
+        return err;
+      } else {
+        TRITONSERVER_ErrorDelete(err);
+      }
+    }
+    enable_cudnn_ = !disable_cudnn;
+    LOG_MESSAGE(
+        TRITONSERVER_LOG_INFO,
+        (std::string("cuDNN is ") + (enable_cudnn_ ? "enabled" : "disabled") +
"enabled" : "disabled") + + " for model instance '" + Name() + "'") + .c_str()); + // If 'ENABLE_TENSOR_FUSER' is not present in 'parameters' then no // update is made to 'enable_tensor_fuser'. bool enable_tensor_fuser = false; @@ -1562,6 +1585,9 @@ ModelInstanceState::Execute( // enable/disable inference mode - supersedes NoGradGuard torch::InferenceMode infer_guard(model_state_->EnabledInferenceMode()); + // enable/disable cudnn + at::globalContext().setUserEnabledCuDNN(model_state_->EnabledCudnn()); + // JIT. No change is made unless parameter is explicitly set. if (std::get<0>(model_state_->EnabledJitProfiling())) { torch::jit::getProfilingMode() =