From 4defeecf65d1224a519b2065e5414150dc03f24e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 30 Dec 2024 16:01:53 -0800 Subject: [PATCH 1/7] Move dtype_from_pylibcudf_column to column.pyx --- python/cudf/cudf/_lib/column.pyx | 34 ++++++++++++++++++++- python/cudf/cudf/_lib/types.pyx | 52 ++------------------------------ 2 files changed, 35 insertions(+), 51 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f7dcd89ea48..9a7c4ea0e0b 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -34,7 +34,7 @@ from cudf._lib.types cimport ( dtype_to_pylibcudf_type, ) -from cudf._lib.types import dtype_from_pylibcudf_column +from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES from pylibcudf cimport DataType as plc_DataType cimport pylibcudf.libcudf.copying as cpp_copying @@ -64,6 +64,38 @@ cdef get_element(column_view col_view, size_type index): ) +def dtype_from_pylibcudf_column(col): + type_ = col.type() + tid = type_.id() + + if tid == plc.TypeId.LIST: + child = col.list_view().child() + return cudf.ListDtype(dtype_from_pylibcudf_column(child)) + elif tid == plc.TypeId.STRUCT: + fields = { + str(i): dtype_from_pylibcudf_column(col.child(i)) + for i in range(col.num_children()) + } + return cudf.StructDtype(fields) + elif tid == plc.TypeId.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == plc.TypeId.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == plc.TypeId.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] + + cdef class Column: """ A Column stores columnar data in device memory. diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 777bd070b32..432df743161 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view import pylibcudf as plc import cudf +from cudf.utils.dtypes import _get_base_dtype SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { @@ -117,56 +118,7 @@ cpdef dtype_to_pylibcudf_type(dtype): return plc.DataType(tid, -dtype.scale) # libcudf types don't support timezones so convert to the base type elif isinstance(dtype, pd.DatetimeTZDtype): - dtype = np.dtype(f" Date: Mon, 30 Dec 2024 16:34:43 -0800 Subject: [PATCH 2/7] migrate away dtype_to_pylibcudf_type to utils.dtypes --- python/cudf/cudf/_lib/column.pyx | 19 ++++----- python/cudf/cudf/_lib/types.pxd | 2 - python/cudf/cudf/_lib/types.pyx | 43 +++----------------- python/cudf/cudf/core/_internals/binaryop.py | 2 +- python/cudf/cudf/core/_internals/unary.py | 2 +- python/cudf/cudf/core/column/column.py | 3 +- python/cudf/cudf/core/column/string.py | 7 ++-- python/cudf/cudf/io/csv.py | 6 ++- python/cudf/cudf/io/json.py | 6 ++- python/cudf/cudf/io/orc.py | 2 +- python/cudf/cudf/utils/dtypes.py | 26 ++++++++++++ 11 files changed, 56 insertions(+), 62 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 9a7c4ea0e0b..576dd99157e 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -19,7 +19,7 @@ from cudf.core.buffer import ( as_buffer, cuda_array_interface_wrapper, ) -from cudf.utils.dtypes import _get_base_dtype +from cudf.utils.dtypes import _get_base_dtype, dtype_to_pylibcudf_type from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t @@ -29,10 +29,7 @@ from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from cudf._lib.types cimport ( - dtype_from_column_view, - dtype_to_pylibcudf_type, -) +from cudf._lib.types cimport dtype_from_column_view from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES @@ -68,26 +65,26 @@ def dtype_from_pylibcudf_column(col): type_ = col.type() tid = type_.id() - if tid == plc.TypeId.LIST: + if tid == pylibcudf.TypeId.LIST: child = col.list_view().child() return cudf.ListDtype(dtype_from_pylibcudf_column(child)) - elif tid == plc.TypeId.STRUCT: + elif tid == pylibcudf.TypeId.STRUCT: fields = { str(i): dtype_from_pylibcudf_column(col.child(i)) for i in range(col.num_children()) } return cudf.StructDtype(fields) - elif tid == plc.TypeId.DECIMAL64: + elif tid == pylibcudf.TypeId.DECIMAL64: return cudf.Decimal64Dtype( precision=cudf.Decimal64Dtype.MAX_PRECISION, scale=-type_.scale() ) - elif tid == plc.TypeId.DECIMAL32: + elif tid == pylibcudf.TypeId.DECIMAL32: return cudf.Decimal32Dtype( precision=cudf.Decimal32Dtype.MAX_PRECISION, scale=-type_.scale() ) - elif tid == plc.TypeId.DECIMAL128: + elif tid == pylibcudf.TypeId.DECIMAL128: return cudf.Decimal128Dtype( precision=cudf.Decimal128Dtype.MAX_PRECISION, scale=-type_.scale() @@ -456,7 +453,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd index 18b1d26e4db..be81ba54d3b 100644 --- a/python/cudf/cudf/_lib/types.pxd +++ b/python/cudf/cudf/_lib/types.pxd @@ -7,5 +7,3 @@ from pylibcudf.libcudf.column.column_view cimport column_view ctypedef int32_t underlying_type_t_type_id cdef dtype_from_column_view(column_view cv) - -cpdef dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx index 432df743161..fe09b3644c2 100644 --- a/python/cudf/cudf/_lib/types.pyx +++ b/python/cudf/cudf/_lib/types.pyx @@ -1,7 +1,6 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. import numpy as np -import pandas as pd from libcpp.memory cimport make_shared, shared_ptr @@ -12,7 +11,6 @@ from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view import pylibcudf as plc import cudf -from cudf.utils.dtypes import _get_base_dtype SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { @@ -61,26 +59,19 @@ cdef dtype_from_lists_column_view(column_view cv): if child.type().id() == libcudf_types.type_id.LIST: return cudf.ListDtype(dtype_from_lists_column_view(child)) - elif child.type().id() == libcudf_types.type_id.EMPTY: - return cudf.ListDtype("int8") else: - return cudf.ListDtype( - dtype_from_column_view(child) - ) - -cdef dtype_from_structs_column_view(column_view cv): - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) + return cudf.ListDtype(dtype_from_column_view(child)) cdef dtype_from_column_view(column_view cv): cdef libcudf_types.type_id tid = cv.type().id() if tid == libcudf_types.type_id.LIST: return dtype_from_lists_column_view(cv) elif tid == libcudf_types.type_id.STRUCT: - return dtype_from_structs_column_view(cv) + fields = { + str(i): dtype_from_column_view(cv.child(i)) + for i in range(cv.num_children()) + } + return cudf.StructDtype(fields) elif tid == libcudf_types.type_id.DECIMAL64: return cudf.Decimal64Dtype( precision=cudf.Decimal64Dtype.MAX_PRECISION, @@ -100,25 +91,3 @@ cdef dtype_from_column_view(column_view cv): return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ (tid) ] - - -cpdef dtype_to_pylibcudf_type(dtype): - if isinstance(dtype, cudf.ListDtype): - return plc.DataType(plc.TypeId.LIST) - elif isinstance(dtype, cudf.StructDtype): - return plc.DataType(plc.TypeId.STRUCT) - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = plc.TypeId.DECIMAL128 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = plc.TypeId.DECIMAL64 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = plc.TypeId.DECIMAL32 - return plc.DataType(tid, -dtype.scale) - # libcudf types don't support timezones so convert to the base type - elif isinstance(dtype, pd.DatetimeTZDtype): - dtype = _get_base_dtype(dtype) - else: - dtype = np.dtype(dtype) - return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) diff --git a/python/cudf/cudf/core/_internals/binaryop.py b/python/cudf/cudf/core/_internals/binaryop.py index 212150f505e..954fc60c277 100644 --- a/python/cudf/cudf/core/_internals/binaryop.py +++ b/python/cudf/cudf/core/_internals/binaryop.py @@ -6,8 +6,8 @@ import pylibcudf as plc from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock +from cudf.utils.dtypes import dtype_to_pylibcudf_type if TYPE_CHECKING: from cudf._typing import Dtype diff --git a/python/cudf/cudf/core/_internals/unary.py b/python/cudf/cudf/core/_internals/unary.py index 3b8e3db60a7..19deeafad96 100644 --- a/python/cudf/cudf/core/_internals/unary.py +++ b/python/cudf/cudf/core/_internals/unary.py @@ -5,9 +5,9 @@ import pylibcudf as plc -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_decimal_dtype from cudf.core.buffer import acquire_spill_lock +from cudf.utils.dtypes import dtype_to_pylibcudf_type if TYPE_CHECKING: from cudf._typing import Dtype diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 31efe267c96..e743d467bcb 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -26,7 +26,7 @@ from cudf import _lib as libcudf from cudf._lib.column import Column from cudf._lib.scalar import as_device_scalar -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype +from cudf._lib.types import size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -64,6 +64,7 @@ _maybe_convert_to_default_type, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, + dtype_to_pylibcudf_type, find_common_type, get_time_unit, is_column_like, diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index fcdcb789f23..08099d4f69f 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,16 +19,15 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype +from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import can_convert_to_column, dtype_to_pylibcudf_type if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -5870,7 +5869,7 @@ def as_decimal_column( ) -> cudf.core.column.DecimalBaseColumn: plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( self.to_pylibcudf(mode="read"), - libcudf.types.dtype_to_pylibcudf_type(dtype), + dtype_to_pylibcudf_type(dtype), ) result = Column.from_pylibcudf(plc_column) result.dtype.precision = dtype.precision # type: ignore[union-attr] diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 6d617cbf38e..a9a0bfd4ee4 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -16,11 +16,13 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) from cudf.utils.performance_tracking import _performance_tracking _CSV_HEX_TYPE_MAP = { diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ff326e09315..95577aee30c 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -14,10 +14,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from cudf.core.column import ColumnBase diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f3124552fd1..ee12790d954 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -11,11 +11,11 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.index import _index_from_data from cudf.utils import ioutils +from cudf.utils.dtypes import dtype_to_pylibcudf_type try: import ujson as json # type: ignore[import-untyped] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ca8f9cac2d0..e5d528cdbfe 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -11,6 +11,8 @@ import pyarrow as pa from pandas.core.dtypes.common import infer_dtype_from_object +import pylibcudf as plc + import cudf if TYPE_CHECKING: @@ -604,6 +606,30 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def dtype_to_pylibcudf_type(dtype) -> plc.DataType: + if isinstance(dtype, cudf.ListDtype): + return plc.DataType(plc.TypeId.LIST) + elif isinstance(dtype, cudf.StructDtype): + return plc.DataType(plc.TypeId.STRUCT) + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = plc.TypeId.DECIMAL128 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = plc.TypeId.DECIMAL64 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = plc.TypeId.DECIMAL32 + return plc.DataType(tid, -dtype.scale) + # libcudf types don't support timezones so convert to the base type + elif isinstance(dtype, pd.DatetimeTZDtype): + dtype = _get_base_dtype(dtype) + else: + dtype = np.dtype(dtype) + return plc.DataType( + cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype] + ) + + # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two # operands is determined by checking to see if NumPy has a c/c++ From 5b221dcfd5ff31807ec0473312f31b2a0ee59d7e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 31 Dec 2024 11:24:55 -0800 Subject: [PATCH 3/7] Remove types.pyx --- python/cudf/cudf/_lib/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/column.pxd | 2 + python/cudf/cudf/_lib/column.pyx | 57 ++++++++++-- python/cudf/cudf/_lib/scalar.pyx | 47 +++------- python/cudf/cudf/_lib/types.pxd | 9 -- python/cudf/cudf/_lib/types.pyx | 93 ------------------- python/cudf/cudf/core/_base_index.py | 9 +- .../cudf/cudf/core/_internals/aggregation.py | 2 +- python/cudf/cudf/core/column/categorical.py | 8 +- python/cudf/cudf/core/column/column.py | 26 ++---- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/column/string.py | 9 +- python/cudf/cudf/core/copy_types.py | 4 +- python/cudf/cudf/core/dtypes.py | 3 +- python/cudf/cudf/core/groupby/groupby.py | 20 ++-- python/cudf/cudf/core/index.py | 6 +- python/cudf/cudf/core/indexed_frame.py | 3 +- python/cudf/cudf/core/join/join.py | 4 +- python/cudf/cudf/core/multiindex.py | 11 +-- python/cudf/cudf/core/reshape.py | 7 +- python/cudf/cudf/utils/dtypes.py | 44 ++++++++- 21 files changed, 164 insertions(+), 206 deletions(-) delete mode 100644 python/cudf/cudf/_lib/types.pxd delete mode 100644 python/cudf/cudf/_lib/types.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index ff6fba1c3e8..e9bf3882923 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx) +set(cython_sources column.pyx scalar.pyx strings_udf.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8b1d16f0d85..dc99ebb175c 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -13,6 +13,8 @@ from pylibcudf.libcudf.column.column_view cimport ( from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer +cdef dtype_from_lists_column_view(column_view cv) +cdef dtype_from_column_view(column_view cv) cdef class Column: cdef public: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 576dd99157e..710ccd55cf8 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -19,20 +19,20 @@ from cudf.core.buffer import ( as_buffer, cuda_array_interface_wrapper, ) -from cudf.utils.dtypes import _get_base_dtype, dtype_to_pylibcudf_type +from cudf.utils.dtypes import ( + _get_base_dtype, + dtype_to_pylibcudf_type, + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES, +) from cpython.buffer cimport PyObject_CheckBuffer -from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_unique, unique_ptr +from libc.stdint cimport uintptr_t, int32_t +from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from cudf._lib.types cimport dtype_from_column_view - -from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES - from pylibcudf cimport DataType as plc_DataType cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types @@ -42,6 +42,7 @@ from pylibcudf.libcudf.column.column_factories cimport ( make_numeric_column ) from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count from pylibcudf.libcudf.scalar.scalar cimport scalar @@ -93,6 +94,48 @@ def dtype_from_pylibcudf_column(col): return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] +cdef dtype_from_lists_column_view(column_view cv): + # lists_column_view have no default constructor, so we heap + # allocate it to get around Cython's limitation of requiring + # default constructors for stack allocated objects + cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) + cdef column_view child = lv.get()[0].child() + + if child.type().id() == libcudf_types.type_id.LIST: + return cudf.ListDtype(dtype_from_lists_column_view(child)) + else: + return cudf.ListDtype(dtype_from_column_view(child)) + + +cdef dtype_from_column_view(column_view cv): + cdef libcudf_types.type_id tid = cv.type().id() + if tid == libcudf_types.type_id.LIST: + return dtype_from_lists_column_view(cv) + elif tid == libcudf_types.type_id.STRUCT: + fields = { + str(i): dtype_from_column_view(cv.child(i)) + for i in range(cv.num_children()) + } + return cudf.StructDtype(fields) + elif tid == libcudf_types.type_id.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[(tid)] + + cdef class Column: """ A Column stores columnar data in device memory. diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 40bd50acf16..82d873724ce 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -14,17 +14,16 @@ import pylibcudf as plc import cudf from cudf.core.dtypes import ListDtype, StructDtype -from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). -from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID -from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar +from pylibcudf cimport Scalar as plc_Scalar +from pylibcudf.libcudf.scalar.scalar cimport scalar def _replace_nested(obj, check, replacement): @@ -223,43 +222,25 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): - cdef plc_TypeID cdtype_id = self.c_value.type().id() + cdtype_id = self.c_value.type().id() if dtype is not None: self._dtype = dtype elif cdtype_id in { - plc_TypeID.DECIMAL32, - plc_TypeID.DECIMAL64, - plc_TypeID.DECIMAL128, + plc.TypeID.DECIMAL32, + plc.TypeID.DECIMAL64, + plc.TypeID.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype_id == plc_TypeID.STRUCT: - struct_table_view = (self.get_raw_ptr())[0].view() - self._dtype = StructDtype({ - str(i): dtype_from_column_view(struct_table_view.column(i)) - for i in range(struct_table_view.num_columns()) - }) - elif cdtype_id == plc_TypeID.LIST: - if ( - self.get_raw_ptr() - )[0].view().type().id() == plc_TypeID.LIST: - self._dtype = dtype_from_column_view( - (self.get_raw_ptr())[0].view() - ) - else: - self._dtype = ListDtype( - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - ( - (self.get_raw_ptr())[0] - .view().type().id() - ) - ] - ) + elif cdtype_id == plc.TypeID.STRUCT: + self._dtype = StructDtype.from_arrow( + plc.interop.to_arrow(self.c_value).type + ) + elif cdtype_id == plc.TypeID.LIST: + self._dtype = ListDtype.from_arrow(plc.interop.to_arrow(self.c_value).type) else: - self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (cdtype_id) - ] + self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[cdtype_id] def as_device_scalar(val, dtype=None): diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd deleted file mode 100644 index be81ba54d3b..00000000000 --- a/python/cudf/cudf/_lib/types.pxd +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from pylibcudf.libcudf.column.column_view cimport column_view - -ctypedef int32_t underlying_type_t_type_id - -cdef dtype_from_column_view(column_view cv) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx deleted file mode 100644 index fe09b3644c2..00000000000 --- a/python/cudf/cudf/_lib/types.pyx +++ /dev/null @@ -1,93 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import numpy as np - -from libcpp.memory cimport make_shared, shared_ptr - -cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view - -import pylibcudf as plc - -import cudf - - -SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - np.dtype("int8"): plc.types.TypeId.INT8, - np.dtype("int16"): plc.types.TypeId.INT16, - np.dtype("int32"): plc.types.TypeId.INT32, - np.dtype("int64"): plc.types.TypeId.INT64, - np.dtype("uint8"): plc.types.TypeId.UINT8, - np.dtype("uint16"): plc.types.TypeId.UINT16, - np.dtype("uint32"): plc.types.TypeId.UINT32, - np.dtype("uint64"): plc.types.TypeId.UINT64, - np.dtype("float32"): plc.types.TypeId.FLOAT32, - np.dtype("float64"): plc.types.TypeId.FLOAT64, - np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): plc.types.TypeId.STRING, - np.dtype("bool"): plc.types.TypeId.BOOL8, - np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, -} -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - plc_type: np_type - for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() -} -# There's no equivalent to EMPTY in cudf. We translate EMPTY -# columns from libcudf to ``int8`` columns of all nulls in Python. -# ``int8`` is chosen because it uses the least amount of memory. -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") - - -size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] - - -cdef dtype_from_lists_column_view(column_view cv): - # lists_column_view have no default constructor, so we heap - # allocate it to get around Cython's limitation of requiring - # default constructors for stack allocated objects - cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) - cdef column_view child = lv.get()[0].child() - - if child.type().id() == libcudf_types.type_id.LIST: - return cudf.ListDtype(dtype_from_lists_column_view(child)) - else: - return cudf.ListDtype(dtype_from_column_view(child)) - -cdef dtype_from_column_view(column_view cv): - cdef libcudf_types.type_id tid = cv.type().id() - if tid == libcudf_types.type_id.LIST: - return dtype_from_lists_column_view(cv) - elif tid == libcudf_types.type_id.STRUCT: - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) - elif tid == libcudf_types.type_id.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (tid) - ] diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index c2f3c782d10..265264f9d7c 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -10,7 +10,6 @@ from typing_extensions import Self import cudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_scalar from cudf.core._internals import copying @@ -24,7 +23,11 @@ from cudf.core.copy_types import GatherMap from cudf.errors import MixedTypeError from cudf.utils import ioutils -from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + can_convert_to_column, + is_mixed_with_object_dtype, +) from cudf.utils.utils import _is_same_name if TYPE_CHECKING: @@ -2047,7 +2050,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True): # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if gather_map.dtype.kind not in "iu": - gather_map = gather_map.astype(size_type_dtype) + gather_map = gather_map.astype(SIZE_TYPE_DTYPE) GatherMap(gather_map, len(self), nullify=not check_bounds or nullify) return self._from_columns_like_self( diff --git a/python/cudf/cudf/core/_internals/aggregation.py b/python/cudf/cudf/core/_internals/aggregation.py index 1d21d34b1bf..c7eb4807608 100644 --- a/python/cudf/cudf/core/_internals/aggregation.py +++ b/python/cudf/cudf/core/_internals/aggregation.py @@ -8,9 +8,9 @@ import pylibcudf as plc import cudf -from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf.api.types import is_scalar from cudf.utils import cudautils +from cudf.utils.dtypes import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES if TYPE_CHECKING: from collections.abc import Callable diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index b10b8dfe207..c80fa1c29f1 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -12,12 +12,12 @@ from typing_extensions import Self import cudf -from cudf import _lib as libcudf from cudf.core._internals import unary from cudf.core.column import column from cudf.core.column.methods import ColumnMethods from cudf.core.dtypes import CategoricalDtype, IntervalDtype from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, find_common_type, is_mixed_with_object_dtype, min_signed_type, @@ -1140,7 +1140,7 @@ def _get_decategorized_column(self) -> ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.codes - gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) + gather_map = self.codes.astype(SIZE_TYPE_DTYPE).fillna(0) out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1192,10 +1192,10 @@ def _concat( codes = [o.codes for o in objs] newsize = sum(map(len, codes)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e743d467bcb..82bc4d5b328 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -26,7 +26,6 @@ from cudf import _lib as libcudf from cudf._lib.column import Column from cudf._lib.scalar import as_device_scalar -from cudf._lib.types import size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -61,6 +60,7 @@ from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, @@ -860,7 +860,7 @@ def indices_of( value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) return apply_boolean_mask( # type: ignore[return-value] - [as_column(range(0, len(self)), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: @@ -940,7 +940,7 @@ def take( # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if indices.dtype.kind not in {"u", "i"}: - indices = indices.astype(libcudf.types.size_type_dtype) + indices = indices.astype(SIZE_TYPE_DTYPE) GatherMap(indices, len(self), nullify=not check_bounds or nullify) gathered = copying.gather([self], indices, nullify=nullify) # type: ignore[arg-type] return gathered[0]._with_type_metadata(self.dtype) # type: ignore[return-value] @@ -1729,9 +1729,7 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), + as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1740,21 +1738,16 @@ def column_empty( cudf.core.column.NumericalColumn( data=as_buffer( rmm.DeviceBuffer( - size=row_count - * cudf.dtype(libcudf.types.size_type_dtype).itemsize + size=row_count * cudf.dtype(SIZE_TYPE_DTYPE).itemsize ) ), size=None, - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ) elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) - children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), - ) + children = (as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE),) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -2538,10 +2531,9 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) newsize = sum(map(len, objs)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"Result of concat cannot have " f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3d9440cdf21..b4d501deb23 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase, as_column @@ -22,6 +21,7 @@ from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from collections.abc import Sequence @@ -247,7 +247,7 @@ def from_sequences( offset_col = cast( NumericalColumn, - column.as_column(offset_vals, dtype=size_type_dtype), + column.as_column(offset_vals, dtype=SIZE_TYPE_DTYPE), ) # Build ListColumn diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 08099d4f69f..2e2111db3ce 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -20,14 +20,17 @@ import cudf.core.column.column as column import cudf.core.column.datetime as datetime from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column, dtype_to_pylibcudf_type +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + can_convert_to_column, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -5592,7 +5595,7 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: offsets = column.as_column( - 0, length=size + 1, dtype=size_type_dtype + 0, length=size + 1, dtype=SIZE_TYPE_DTYPE ) children = (offsets,) diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 4b6ad59c8e1..540059731b2 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -5,7 +5,7 @@ from typing_extensions import Self import cudf -from cudf._lib.types import size_type_dtype +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from cudf.core.column import NumericalColumn @@ -63,7 +63,7 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): # Alternately we can have an Optional[Column] and handle None # specially in _gather. self.column = cast( - "NumericalColumn", self.column.astype(size_type_dtype) + "NumericalColumn", self.column.astype(SIZE_TYPE_DTYPE) ) else: if self.column.dtype.kind not in {"i", "u"}: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 8ed233ba737..00016e3d9a5 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -57,7 +57,8 @@ def dtype(arbitrary): if np_dtype.kind in set("OU"): return np.dtype("object") elif ( - np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES + np_dtype + not in cudf.utils.dtypes.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES ): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 4137109cc96..7a25680f152 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -20,7 +20,6 @@ import cudf import cudf.core._internals from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( is_list_like, @@ -45,6 +44,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply +from cudf.utils.dtypes import SIZE_TYPE_DTYPE from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -586,7 +586,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: offsets, group_keys, (indices,) = self._groups( [ cudf.core.column.as_column( - range(len(self.obj)), dtype=size_type_dtype + range(len(self.obj)), dtype=SIZE_TYPE_DTYPE ) ] ) @@ -1181,7 +1181,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # aggregation scheme in libcudf. This is probably "fast # enough" for most reasonable input sizes. _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) @@ -1195,7 +1195,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) + to_take = np.arange(size_per_group.sum(), dtype=SIZE_TYPE_DTYPE) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) @@ -1496,11 +1496,11 @@ def sample( # into a numpy array directly, rather than a list. # TODO: this uses the sort-based groupby, could one use hash-based? _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( - size_type_dtype.type(n), size_per_group.shape + SIZE_TYPE_DTYPE.type(n), size_per_group.shape ) if not replace and (minsize := size_per_group.min()) < n: raise ValueError( @@ -1513,7 +1513,7 @@ def sample( # which is round-to-nearest, ties to sgn(x) * inf). samples_per_group = np.round( size_per_group * frac, decimals=0 - ).astype(size_type_dtype) + ).astype(SIZE_TYPE_DTYPE) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -1521,7 +1521,7 @@ def sample( low = 0 high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=size_type_dtype) + indices = rng.integers(low, high, dtype=SIZE_TYPE_DTYPE) indices += np.repeat(group_offsets[:-1], samples_per_group) else: # Approach: do a segmented argsort of the index array and take @@ -1529,7 +1529,7 @@ def sample( # We will shuffle the group indices and then pick them out # from the grouped dataframe index. nrows = len(group_values) - indices = cp.arange(nrows, dtype=size_type_dtype) + indices = cp.arange(nrows, dtype=SIZE_TYPE_DTYPE) if len(size_per_group) < 500: # Empirically shuffling with cupy is faster at this scale rs = cp.random.get_random_state() @@ -1553,7 +1553,7 @@ def sample( indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? - want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) + want = np.arange(samples_per_group.sum(), dtype=SIZE_TYPE_DTYPE) scan = np.empty_like(samples_per_group) scan[0] = 0 np.cumsum(samples_per_group[:-1], out=scan[1:]) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eac5b9d71ae..e7aba7ead09 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -53,6 +52,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, @@ -1002,7 +1002,7 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [self._range.index(value)] except ValueError: i = [] - return as_column(i, dtype=size_type_dtype) + return as_column(i, dtype=SIZE_TYPE_DTYPE) def isin(self, values, level=None): if level is not None and level > 0: @@ -1354,7 +1354,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = as_column( -1, length=len(needle), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6854cb02aa5..8d01c8b1441 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -60,6 +60,7 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring +from cudf.utils.dtypes import SIZE_TYPE_DTYPE from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -3026,7 +3027,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: NumericalColumn, as_column( range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ), len(self), diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6e965ceca66..4390b85225d 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -7,7 +7,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap @@ -17,6 +16,7 @@ _IndexIndexer, _match_join_keys, ) +from cudf.utils.dtypes import SIZE_TYPE_DTYPE class Merge: @@ -243,7 +243,7 @@ def _gather_maps(self, left_cols, right_cols): # tables, we gather from iota on both right and left, and then # sort the gather maps with those two columns as key. key_order = [ - cudf.core.column.as_column(range(n), dtype=size_type_dtype).take( + cudf.core.column.as_column(range(n), dtype=SIZE_TYPE_DTYPE).take( map_, nullify=null, check_bounds=False ) for map_, n, null in zip(maps, lengths, nullify) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 1e613e49ffc..a1fa4efad68 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -17,7 +17,6 @@ import cudf import cudf._lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column @@ -34,7 +33,7 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import is_column_like +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -199,7 +198,7 @@ def __init__( ) if lo == -1: # Now we can gather and insert null automatically - code[code == -1] = np.iinfo(size_type_dtype).min + code[code == -1] = np.iinfo(SIZE_TYPE_DTYPE).min result_col = level._column.take(code, nullify=True) source_data[i] = result_col._with_type_metadata(level.dtype) @@ -1571,11 +1570,11 @@ def droplevel(self, level=-1) -> Self | cudf.Index: def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: - # cudf uses np.iinfo(size_type_dtype).min as missing code + # cudf uses np.iinfo(SIZE_TYPE_DTYPE).min as missing code # pandas uses -1 as missing code pd_codes = ( code.find_and_replace( - column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(np.iinfo(SIZE_TYPE_DTYPE).min, length=1), column.as_column(-1, length=1), ) for code in self._codes @@ -1906,7 +1905,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = column.as_column( -1, length=len(target), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): return _return_get_indexer_result(result.values) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 0abd42d4d4e..b093fbb3e37 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -12,13 +12,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor -from cudf.utils.dtypes import min_unsigned_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, min_unsigned_type if TYPE_CHECKING: from cudf._typing import Dtype @@ -1333,10 +1332,10 @@ def _one_hot_encode_column( else: column = column._get_decategorized_column() # type: ignore[attr-defined] - if column.size * categories.size >= np.iinfo(size_type_dtype).max: + if column.size * categories.size >= np.iinfo(SIZE_TYPE_DTYPE).max: raise ValueError( "Size limitation exceeded: column.size * category.size < " - f"np.iinfo({size_type_dtype}).max. Consider reducing " + f"np.iinfo({SIZE_TYPE_DTYPE}).max. Consider reducing " "size of category" ) result_labels = ( diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index e5d528cdbfe..d329fa95f44 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -153,7 +153,7 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif cudf.api.types.is_decimal128_dtype(dtype): return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: return dtype.type return infer_dtype_from_object(dtype) @@ -625,10 +625,46 @@ def dtype_to_pylibcudf_type(dtype) -> plc.DataType: dtype = _get_base_dtype(dtype) else: dtype = np.dtype(dtype) - return plc.DataType( - cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype] - ) + return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) + + +SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { + np.dtype("int8"): plc.types.TypeId.INT8, + np.dtype("int16"): plc.types.TypeId.INT16, + np.dtype("int32"): plc.types.TypeId.INT32, + np.dtype("int64"): plc.types.TypeId.INT64, + np.dtype("uint8"): plc.types.TypeId.UINT8, + np.dtype("uint16"): plc.types.TypeId.UINT16, + np.dtype("uint32"): plc.types.TypeId.UINT32, + np.dtype("uint64"): plc.types.TypeId.UINT64, + np.dtype("float32"): plc.types.TypeId.FLOAT32, + np.dtype("float64"): plc.types.TypeId.FLOAT64, + np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): plc.types.TypeId.STRING, + np.dtype("bool"): plc.types.TypeId.BOOL8, + np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, + np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, +} +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { + plc_type: np_type + for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() +} +# There's no equivalent to EMPTY in cudf. We translate EMPTY +# columns from libcudf to ``int8`` columns of all nulls in Python. +# ``int8`` is chosen because it uses the least amount of memory. +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype( + "object" +) +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") + +SIZE_TYPE_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two From 5e9459cd2531052e9b6915c291bf1425503bf529 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:01:05 -0800 Subject: [PATCH 4/7] Update copyright --- python/cudf/cudf/_lib/CMakeLists.txt | 2 +- python/cudf/cudf/_lib/column.pxd | 2 +- python/cudf/cudf/_lib/column.pyx | 2 +- python/cudf/cudf/_lib/scalar.pyx | 2 +- python/cudf/cudf/core/_base_index.py | 2 +- python/cudf/cudf/core/_internals/aggregation.py | 2 +- python/cudf/cudf/core/_internals/binaryop.py | 2 +- python/cudf/cudf/core/_internals/unary.py | 2 +- python/cudf/cudf/core/column/categorical.py | 2 +- python/cudf/cudf/core/column/column.py | 2 +- python/cudf/cudf/core/column/lists.py | 2 +- python/cudf/cudf/core/column/string.py | 2 +- python/cudf/cudf/core/copy_types.py | 2 +- python/cudf/cudf/core/dtypes.py | 2 +- python/cudf/cudf/core/groupby/groupby.py | 2 +- python/cudf/cudf/core/index.py | 2 +- python/cudf/cudf/core/indexed_frame.py | 2 +- python/cudf/cudf/core/join/join.py | 2 +- python/cudf/cudf/core/multiindex.py | 2 +- python/cudf/cudf/core/reshape.py | 2 +- python/cudf/cudf/io/csv.py | 2 +- python/cudf/cudf/io/json.py | 2 +- python/cudf/cudf/io/orc.py | 2 +- python/cudf/cudf/utils/dtypes.py | 2 +- 24 files changed, 24 insertions(+), 24 deletions(-) diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index e9bf3882923..ec44a6aa8c5 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index dc99ebb175c..026c12895e8 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 710ccd55cf8..b10fb186cce 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 82d873724ce..227c2786c6f 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import copy diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 265264f9d7c..57b0c9fc7b4 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/_internals/aggregation.py b/python/cudf/cudf/core/_internals/aggregation.py index c7eb4807608..e6e6c3bcedf 100644 --- a/python/cudf/cudf/core/_internals/aggregation.py +++ b/python/cudf/cudf/core/_internals/aggregation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import TYPE_CHECKING, Literal diff --git a/python/cudf/cudf/core/_internals/binaryop.py b/python/cudf/cudf/core/_internals/binaryop.py index 954fc60c277..a9023f8fd59 100644 --- a/python/cudf/cudf/core/_internals/binaryop.py +++ b/python/cudf/cudf/core/_internals/binaryop.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import TYPE_CHECKING diff --git a/python/cudf/cudf/core/_internals/unary.py b/python/cudf/cudf/core/_internals/unary.py index 19deeafad96..c45c4a1b5cf 100644 --- a/python/cudf/cudf/core/_internals/unary.py +++ b/python/cudf/cudf/core/_internals/unary.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import TYPE_CHECKING diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py index c80fa1c29f1..ee9fe756718 100644 --- a/python/cudf/cudf/core/column/categorical.py +++ b/python/cudf/cudf/core/column/categorical.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 82bc4d5b328..c3c3bd3c9ed 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b4d501deb23..33817716d66 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 2e2111db3ce..2955577d818 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 540059731b2..aaaf6c7ee4f 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 00016e3d9a5..ce7fb968069 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import decimal diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 7a25680f152..9de92c0a009 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import copy diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e7aba7ead09..40f839e2f7c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 8d01c8b1441..51a50e4429d 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. """Base class for Frame types that have an index.""" from __future__ import annotations diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 4390b85225d..ce7edc8fdbe 100644 --- a/python/cudf/cudf/core/join/join.py +++ b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import Any diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index a1fa4efad68..ffcc9c85087 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index b093fbb3e37..eedd777aafe 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import itertools diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index a9a0bfd4ee4..7e8468c8e8a 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import errno diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 95577aee30c..16c7d189dfd 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import os diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index ee12790d954..0ac2950a22b 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import itertools diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index d329fa95f44..385e262028d 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime From a4667cc909f802a99adcf79a791c1a2f483e7bae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 Jan 2025 11:01:13 -0800 Subject: [PATCH 5/7] address reviews --- python/cudf/cudf/_lib/column.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index b10fb186cce..18ca72eb540 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -62,7 +62,7 @@ cdef get_element(column_view col_view, size_type index): ) -def dtype_from_pylibcudf_column(col): +def dtype_from_pylibcudf_column(Column col not None): type_ = col.type() tid = type_.id() @@ -433,7 +433,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data From 43ce557ca730b8ecd7c7d1bf079243f6d9743967 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 7 Jan 2025 15:25:50 -0800 Subject: [PATCH 6/7] use plc_Column as type --- python/cudf/cudf/_lib/column.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 18ca72eb540..581b40321d4 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -33,7 +33,7 @@ from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from pylibcudf cimport DataType as plc_DataType +from pylibcudf cimport DataType as plc_DataType, Column as plc_Column cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -62,7 +62,7 @@ cdef get_element(column_view col_view, size_type index): ) -def dtype_from_pylibcudf_column(Column col not None): +def dtype_from_pylibcudf_column(plc_Column col not None): type_ = col.type() tid = type_.id() From 82e259f6335d19bac9d3c42a8df04cb23ef47db8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 9 Jan 2025 11:53:19 -0800 Subject: [PATCH 7/7] Update python/cudf/cudf/_lib/column.pyx Co-authored-by: Lawrence Mitchell --- python/cudf/cudf/_lib/column.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 581b40321d4..c59bbc0f40c 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -496,7 +496,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data