Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use cuda-python bindings for getting device properties. #4830

Merged
merged 9 commits into from
Jan 7, 2025
Merged
46 changes: 22 additions & 24 deletions ci/notebook_list.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021-2023, NVIDIA CORPORATION.
# Copyright (c) 2021-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -17,7 +17,7 @@
import glob
from pathlib import Path

from numba import cuda
from cuda.bindings import runtime

# for adding another run type and skip file name add to this dictionary
runtype_dict = {
Expand All @@ -30,20 +30,27 @@

def skip_book_dir(runtype):
    """
    Return True when the skip file registered for ``runtype`` exists.

    ``runtype`` is looked up in the module-level ``runtype_dict``, which
    maps a run type to a skip file name. False is returned when the run
    type is not registered or when its skip file is not a regular file.
    """
    # Add all run types here, currently only CI supported
    if runtype not in runtype_dict:
        return False
    return Path(runtype_dict[runtype]).is_file()

def _get_cuda_version_string():
    """
    Return the local CUDA runtime version formatted as "<major>.<minor>".

    Raises RuntimeError if the version query does not report cudaSuccess.
    """
    status, encoded_version = runtime.getLocalRuntimeVersion()
    if status != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA runtime version.")
    # The integer is decoded as major = value // 1000 and
    # minor = (value % 1000) // 10.
    major = encoded_version // 1000
    minor = (encoded_version % 1000) // 10
    return f"{major}.{minor}"


def _is_ampere_or_newer():
    """
    Return True if the current CUDA device reports a (major, minor)
    compute capability of (8, 0) or higher.

    Raises RuntimeError when either the current device or its properties
    cannot be queried.

    NOTE(review): this reproduces the check that existed before the move
    to cuda-python. Reviewers agreed to verify in a follow-up whether the
    notebooks marked "# Does not run on Ampere" still need to be skipped,
    and to remove this check if they do not.
    """
    err, current_device = runtime.cudaGetDevice()
    if err != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA device.")

    err, props = runtime.cudaGetDeviceProperties(current_device)
    if err != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA device properties.")

    compute_capability = (props.major, props.minor)
    return compute_capability >= (8, 0)

cuda_version_string = ".".join([str(n) for n in cuda.runtime.get_version()])
#
# Not strictly true... however what we mean is
# Pascal or earlier
#
ampere = False
device = cuda.get_current_device()

parser = argparse.ArgumentParser(description="Condition for running the notebook tests")
parser.add_argument("runtype", type=str)
Expand All @@ -52,19 +59,10 @@ def skip_book_dir(runtype):

runtype = args.runtype

if runtype not in runtype_dict.keys():
if runtype not in runtype_dict:
print(f"Unknown Run Type = {runtype}", file=sys.stderr)
exit()


# check for the attribute using both pre and post numba 0.53 names
cc = getattr(device, "COMPUTE_CAPABILITY", None) or getattr(
device, "compute_capability"
)
if cc[0] >= 8:
ampere = True

skip = False
for filename in glob.iglob("**/*.ipynb", recursive=True):
skip = False
if skip_book_dir(runtype):
Expand All @@ -88,7 +86,7 @@ def skip_book_dir(runtype):
)
skip = True
break
elif ampere and re.search("# Does not run on Ampere", line):
elif _is_ampere_or_newer() and re.search("# Does not run on Ampere", line):
print(f"SKIPPING {filename} (does not run on Ampere)", file=sys.stderr)
skip = True
break
Expand Down
13 changes: 6 additions & 7 deletions python/cugraph/cugraph/dask/common/mg_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -13,7 +13,7 @@

import os
import gc
import numba.cuda
from cuda.bindings import runtime


# FIXME: this raft import breaks the library if ucx-py is
Expand Down Expand Up @@ -53,11 +53,10 @@ def prepare_worker_to_parts(data, client=None):


def is_single_gpu():
    """
    Returns True if no more than one CUDA device is reported, False if
    more than one is present.

    Raises
    ------
    RuntimeError
        If the CUDA device count cannot be retrieved.
    """
    status, count = runtime.cudaGetDeviceCount()
    if status != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA device count.")
    # The numba-based check this replaces returned False only when more
    # than one GPU was present; `count > 1` would invert that answer.
    return count <= 1


def get_visible_devices():
Expand Down
14 changes: 11 additions & 3 deletions python/cugraph/cugraph/tests/docs/test_doctests.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Copyright (c) 2022-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -25,14 +25,21 @@
import cugraph
import pylibcugraph
import cudf
from numba import cuda
from cuda.bindings import runtime
from cugraph.testing import utils


modules_to_skip = ["dask", "proto", "raft"]
datasets = utils.RAPIDS_DATASET_ROOT_DIR_PATH

cuda_version_string = ".".join([str(n) for n in cuda.runtime.get_version()])

def _get_cuda_version_string():
    """
    Return the local CUDA runtime version as a "<major>.<minor>" string.

    Raises RuntimeError if the version query does not report cudaSuccess.
    """
    status, raw_version = runtime.getLocalRuntimeVersion()
    if status != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA runtime version.")
    # Split off the thousands as the major version; the minor version is
    # the remainder with its last digit dropped.
    major, remainder = divmod(raw_version, 1000)
    return f"{major}.{remainder // 10}"


def _is_public_name(name):
Expand Down Expand Up @@ -131,6 +138,7 @@ def skip_docstring(docstring_obj):
NOTE: this function is currently not available on CUDA 11.4 systems.
"""
docstring = docstring_obj.docstring
cuda_version_string = _get_cuda_version_string()
for line in docstring.splitlines():
if f"currently not available on CUDA {cuda_version_string} systems" in line:
return f"docstring example not supported on CUDA {cuda_version_string}"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
# Copyright (c) 2021-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -19,7 +19,6 @@
from cugraph.utilities.path_retrieval cimport get_traversed_cost as c_get_traversed_cost
from cugraph.structure.graph_primtypes cimport *
from libc.stdint cimport uintptr_t
from numba import cuda
import cudf
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was an unused import.

import numpy as np

Expand Down
54 changes: 24 additions & 30 deletions python/cugraph/cugraph/utilities/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
Expand All @@ -15,13 +15,10 @@
import os
import shutil

from numba import cuda

import cudf
from cudf.core.column import as_column

from cuda.cudart import cudaDeviceAttr
from rmm._cuda.gpu import getDeviceAttribute
from cuda.bindings import runtime

from warnings import warn

Expand Down Expand Up @@ -210,45 +207,42 @@ def get_traversed_path_list(df, id):
return answer


def is_cuda_version_less_than(min_version):
    """
    Returns True if the version of the local CUDA runtime is less than
    min_version.

    Parameters
    ----------
    min_version : sequence of int
        (major, minor) version to compare against. Only the first two
        entries are read.

    Raises
    ------
    RuntimeError
        If the CUDA runtime version cannot be retrieved.

    NOTE(review): reviewers observed that this function appears to be
    unused and suggested removing it.
    """
    status, version = runtime.getLocalRuntimeVersion()
    if status != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA runtime version.")
    major, remainder = divmod(version, 1000)
    minor = remainder // 10
    # Build an explicit 2-tuple from min_version so that lists and longer
    # sequences compare the same way the earlier index-based logic did.
    return (major, minor) < (min_version[0], min_version[1])


def is_device_version_less_than(min_version):
    """
    Returns True if the (major, minor) version reported in the current
    CUDA device's properties is less than min_version.

    Parameters
    ----------
    min_version : sequence of int
        (major, minor) device version to compare against. Only the first
        two entries are read.

    Raises
    ------
    RuntimeError
        If the current device or its properties cannot be retrieved.

    NOTE(review): reviewers noted the only known use is a test guard,
    is_device_version_less_than((7, 0)), with reason "Not supported on
    Pascal", and suggested removing both the guard and this function.
    """
    status, device_id = runtime.cudaGetDevice()
    if status != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA device.")
    status, device_prop = runtime.cudaGetDeviceProperties(device_id)
    if status != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA device properties.")
    # Build an explicit 2-tuple from min_version so that lists and longer
    # sequences compare the same way the earlier index-based logic did.
    device_version = (device_prop.major, device_prop.minor)
    return device_version < (min_version[0], min_version[1])


def get_device_memory_info():
    """
    Returns the total amount of global memory on the device in bytes

    The value is read from the properties of the current CUDA device.

    Raises
    ------
    RuntimeError
        If the current device or its properties cannot be retrieved.
    """
    status, device_id = runtime.cudaGetDevice()
    if status != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA device.")
    status, device_prop = runtime.cudaGetDeviceProperties(device_id)
    if status != runtime.cudaError_t.cudaSuccess:
        raise RuntimeError("Could not get CUDA device properties.")
    return device_prop.totalGlobalMem


# FIXME: if G is a Nx type, the weight attribute is assumed to be "weight", if
Expand Down
Loading