diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt index c8c68f19bf..9d80cbaac2 100644 --- a/cpp/bench/prims/CMakeLists.txt +++ b/cpp/bench/prims/CMakeLists.txt @@ -80,7 +80,14 @@ if(BUILD_PRIMS_BENCH) PATH core/bitset.cu core/copy.cu - core/popc.cu + main.cpp + ) + + ConfigureBench( + NAME + UTIL_BENCH + PATH + util/popc.cu main.cpp ) diff --git a/cpp/bench/prims/core/popc.cu b/cpp/bench/prims/util/popc.cu similarity index 99% rename from cpp/bench/prims/core/popc.cu rename to cpp/bench/prims/util/popc.cu index dfa4335140..249dc13d1e 100644 --- a/cpp/bench/prims/core/popc.cu +++ b/cpp/bench/prims/util/popc.cu @@ -16,7 +16,7 @@ #include -#include +#include namespace raft::bench::core { diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp index cb1accc95e..c5d64f6a29 100644 --- a/cpp/include/raft/comms/detail/std_comms.hpp +++ b/cpp/include/raft/comms/detail/std_comms.hpp @@ -307,13 +307,16 @@ class std_comms : public comms_iface { bool restart = false; // resets the timeout when any progress was made if (worker->isProgressThreadRunning()) { - // Wait for a UCXX progress thread roundtrip + // Wait for a UCXX progress thread roundtrip, prevent waiting for longer + // than 10ms for each operation, will retry in next iteration. ucxx::utils::CallbackNotifier callbackNotifierPre{}; - worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }); + worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); }, + 10000000 /* 10ms */); callbackNotifierPre.wait(); ucxx::utils::CallbackNotifier callbackNotifierPost{}; - worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }); + worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); }, + 10000000 /* 10ms */); callbackNotifierPost.wait(); } else { // Causes UCXX to progress through the send/recv message queue diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh index 3b67e56eea..0cdb4c1fb6 100644 --- a/cpp/include/raft/core/bitset.cuh +++ b/cpp/include/raft/core/bitset.cuh @@ -19,12 +19,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include diff --git a/cpp/include/raft/linalg/detail/eig.cuh b/cpp/include/raft/linalg/detail/eig.cuh index 2a4cfd52ec..ba7ed3dcdf 100644 --- a/cpp/include/raft/linalg/detail/eig.cuh +++ b/cpp/include/raft/linalg/detail/eig.cuh @@ -19,10 +19,12 @@ #include "cusolver_wrappers.hpp" #include +#include #include #include #include +#include #include #include @@ -90,7 +92,19 @@ void eigDC(raft::resources const& handle, { #if CUDART_VERSION < 11010 eigDC_legacy(handle, in, n_rows, n_cols, eig_vectors, eig_vals, stream); + return; +#endif + +#if CUDART_VERSION <= 12040 + // Use a new stream instead of `cudaStreamPerThread` to avoid cusolver bug # 4580093. + rmm::cuda_stream stream_new_wrapper; + cudaStream_t stream_new = stream_new_wrapper.value(); + cudaEvent_t sync_event = resource::detail::get_cuda_stream_sync_event(handle); + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream_new, sync_event)); #else + cudaStream_t stream_new = stream; +#endif cusolverDnHandle_t cusolverH = resource::get_cusolver_dn_handle(handle); cusolverDnParams_t dn_params = nullptr; @@ -108,15 +122,13 @@ void eigDC(raft::resources const& handle, eig_vals, &workspaceDevice, &workspaceHost, - stream)); + stream_new)); - rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream); - rmm::device_scalar d_dev_info(stream); + rmm::device_uvector d_work(workspaceDevice / sizeof(math_t), stream_new); + rmm::device_scalar d_dev_info(stream_new); std::vector h_work(workspaceHost / sizeof(math_t)); - raft::matrix::copy(handle, - make_device_matrix_view(in, n_rows, n_cols), - make_device_matrix_view(eig_vectors, n_rows, n_cols)); + raft::copy(eig_vectors, in, n_rows * n_cols, stream_new); RAFT_CUSOLVER_TRY(cusolverDnxsyevd(cusolverH, dn_params, @@ -131,14 +143,19 @@ void eigDC(raft::resources const& handle, h_work.data(), workspaceHost, d_dev_info.data(), - stream)); + stream_new)); RAFT_CUDA_TRY(cudaGetLastError()); RAFT_CUSOLVER_TRY(cusolverDnDestroyParams(dn_params)); - int dev_info = d_dev_info.value(stream); + int dev_info = d_dev_info.value(stream_new); ASSERT(dev_info == 0, "eig.cuh: eigensolver couldn't converge to a solution. " "This usually occurs when some of the features do not vary enough."); + +#if CUDART_VERSION <= 12040 + // Synchronize the created stream with the original stream before return + RAFT_CUDA_TRY(cudaEventRecord(sync_event, stream_new)); + RAFT_CUDA_TRY(cudaStreamWaitEvent(stream, sync_event)); #endif } diff --git a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh index 208328f2f3..ef74316d04 100644 --- a/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh +++ b/cpp/include/raft/sparse/linalg/detail/masked_matmul.cuh @@ -16,7 +16,6 @@ #pragma once #include -#include #include #include #include diff --git a/cpp/include/raft/core/detail/popc.cuh b/cpp/include/raft/util/detail/popc.cuh similarity index 100% rename from cpp/include/raft/core/detail/popc.cuh rename to cpp/include/raft/util/detail/popc.cuh diff --git a/cpp/include/raft/core/popc.hpp b/cpp/include/raft/util/popc.cuh similarity index 97% rename from cpp/include/raft/core/popc.hpp rename to cpp/include/raft/util/popc.cuh index fc6b6bd177..153694e45e 100644 --- a/cpp/include/raft/core/popc.hpp +++ b/cpp/include/raft/util/popc.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include namespace raft { /** diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt index cb96ce2264..e3af6ebb78 100644 --- a/cpp/test/CMakeLists.txt +++ b/cpp/test/CMakeLists.txt @@ -122,7 +122,6 @@ if(BUILD_TESTS) core/math_host.cpp core/operators_device.cu core/operators_host.cpp - core/popc.cu core/handle.cpp core/interruptible.cu core/nvtx.cpp @@ -509,6 +508,7 @@ if(BUILD_TESTS) util/integer_utils.cpp util/integer_utils.cu util/memory_type_dispatcher.cu + util/popc.cu util/pow2_utils.cu util/reduction.cu ) diff --git a/cpp/test/linalg/eig.cu b/cpp/test/linalg/eig.cu index 460b99aaa0..3ff117cf08 100644 --- a/cpp/test/linalg/eig.cu +++ b/cpp/test/linalg/eig.cu @@ -156,6 +156,24 @@ class EigTest : public ::testing::TestWithParam> { eig_vals_large, eig_vals_jacobi_large; }; +TEST(Raft, EigStream) +{ + // Separate test to check eig_dc stream workaround for CUDA 12+ + raft::resources handle; + auto n_rows = 5000; + auto cov_matrix_stream = + raft::make_device_matrix(handle, n_rows, n_rows); + auto eig_vectors_stream = + raft::make_device_matrix(handle, n_rows, n_rows); + auto eig_vals_stream = raft::make_device_vector(handle, n_rows); + + raft::linalg::eig_dc(handle, + raft::make_const_mdspan(cov_matrix_stream.view()), + eig_vectors_stream.view(), + eig_vals_stream.view()); + raft::resource::sync_stream(handle, raft::resource::get_cuda_stream(handle)); +} + const std::vector> inputsf2 = {{0.001f, 4 * 4, 4, 4, 1234ULL, 256}}; const std::vector> inputsd2 = {{0.001, 4 * 4, 4, 4, 1234ULL, 256}}; diff --git a/cpp/test/core/popc.cu b/cpp/test/util/popc.cu similarity index 99% rename from cpp/test/core/popc.cu rename to cpp/test/util/popc.cu index 83dda79b6e..c08faacb07 100644 --- a/cpp/test/core/popc.cu +++ b/cpp/test/util/popc.cu @@ -19,10 +19,10 @@ #include #include #include -#include #include #include #include +#include #include