From 7e9bb8f2967155cda882bf154cbaf5e4075addfb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 15 Nov 2024 15:19:12 -0800 Subject: [PATCH] Remove cudf._lib.labeling in favor of inlining pylibcudf --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/column.pyi | 9 ++++++ python/cudf/cudf/_lib/labeling.pyx | 24 --------------- python/cudf/cudf/core/column/datetime.py | 39 ++++++++++++++---------- python/cudf/cudf/core/cut.py | 22 ++++++++++--- python/cudf/cudf/core/resample.py | 32 +++++++++++-------- 7 files changed, 70 insertions(+), 58 deletions(-) delete mode 100644 python/cudf/cudf/_lib/labeling.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 41a7db2285a..a626f8dfaa3 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -26,7 +26,6 @@ set(cython_sources interop.pyx join.pyx json.pyx - labeling.pyx lists.pyx merge.pyx null_mask.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index 57df6899a22..43d71971d89 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -13,7 +13,6 @@ interop, join, json, - labeling, merge, null_mask, nvtext, diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index bb38488eefb..bdd90be45b8 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,8 +2,12 @@ from __future__ import annotations +from typing import Literal + from typing_extensions import Self +import pylibcudf as plc + from cudf._typing import Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer from cudf.core.column import ColumnBase @@ -71,3 +75,8 @@ class Column: # TODO: The val parameter should be Scalar, not ScalarLike @staticmethod def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ... + @staticmethod + def from_pylibcudf( + col: plc.Column, data_ptr_exposed: bool = False + ) -> ColumnBase: ... + def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ... diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx deleted file mode 100644 index 524bfd3b2e8..00000000000 --- a/python/cudf/cudf/_lib/labeling.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. - -from libcpp cimport bool as cbool - -import pylibcudf as plc - -from cudf._lib.column cimport Column -from cudf.core.buffer import acquire_spill_lock - - -# Note that the parameter input shadows a Python built-in in the local scope, -# but I'm not too concerned about that since there's no use-case for actual -# input in this context. -@acquire_spill_lock() -def label_bins(Column input, Column left_edges, cbool left_inclusive, - Column right_edges, cbool right_inclusive): - plc_column = plc.labeling.label_bins( - input.to_pylibcudf(mode="read"), - left_edges.to_pylibcudf(mode="read"), - plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO, - right_edges.to_pylibcudf(mode="read"), - plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO, - ) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index bd0d72b9bc0..32f068144a3 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -14,9 +14,10 @@ import pandas as pd import pyarrow as pa +import pylibcudf as plc + import cudf from cudf import _lib as libcudf -from cudf._lib.labeling import label_bins from cudf._lib.search import search_sorted from cudf.core._compat import PANDAS_GE_220 from cudf.core._internals.timezones import ( @@ -24,7 +25,7 @@ get_compatible_timezone, get_tz_data, ) -from cudf.core.buffer import Buffer +from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype @@ -818,13 +819,16 @@ def _find_ambiguous_and_nonexistent( # The end of an ambiguous time period is what Clock 2 reads at # the moment of transition: ambiguous_end = clock_2.apply_boolean_mask(cond) - ambiguous = label_bins( - self, - left_edges=ambiguous_begin, - left_inclusive=True, - right_edges=ambiguous_end, - right_inclusive=False, - ).notnull() + with acquire_spill_lock(): + plc_column = plc.labeling.label_bins( + self.to_pylibcudf(mode="read"), + ambiguous_begin.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES, + ambiguous_end.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.NO, + ) + ambiguous = libcudf.column.Column.from_pylibcudf(plc_column) + ambiguous = ambiguous.notnull() # At the start of a non-existent time period, Clock 2 reads less # than Clock 1 (which has been turned forward): @@ -834,13 +838,16 @@ def _find_ambiguous_and_nonexistent( # The end of the non-existent time period is what Clock 1 reads # at the moment of transition: nonexistent_end = clock_1.apply_boolean_mask(cond) - nonexistent = label_bins( - self, - left_edges=nonexistent_begin, - left_inclusive=True, - right_edges=nonexistent_end, - right_inclusive=False, - ).notnull() + with acquire_spill_lock(): + plc_column = plc.labeling.label_bins( + self.to_pylibcudf(mode="read"), + nonexistent_begin.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES, + nonexistent_end.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.NO, + ) + nonexistent = libcudf.column.Column.from_pylibcudf(plc_column) + nonexistent = nonexistent.notnull() return ambiguous, nonexistent diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py index c9b1fa2669c..a4d12cfc7f0 100644 --- a/python/cudf/cudf/core/cut.py +++ b/python/cudf/cudf/core/cut.py @@ -6,8 +6,12 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf +from cudf._lib.column import Column from cudf.api.types import is_list_like +from cudf.core.buffer import acquire_spill_lock from cudf.core.column import as_column from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes from cudf.core.index import IntervalIndex, interval_range @@ -256,9 +260,19 @@ def cut( # the input arr must be changed to the same type as the edges input_arr = input_arr.astype(left_edges.dtype) # get the indexes for the appropriate number - index_labels = cudf._lib.labeling.label_bins( - input_arr, left_edges, left_inclusive, right_edges, right_inclusive - ) + with acquire_spill_lock(): + plc_column = plc.labeling.label_bins( + input_arr.to_pylibcudf(mode="read"), + left_edges.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES + if left_inclusive + else plc.labeling.Inclusive.NO, + right_edges.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES + if right_inclusive + else plc.labeling.Inclusive.NO, + ) + index_labels = Column.from_pylibcudf(plc_column) if labels is False: # if labels is false we return the index labels, we return them @@ -283,7 +297,7 @@ def cut( # should allow duplicate categories. return interval_labels[index_labels] - index_labels = as_unsigned_codes(len(interval_labels), index_labels) + index_labels = as_unsigned_codes(len(interval_labels), index_labels) # type: ignore[arg-type] col = CategoricalColumn( data=None, diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index e0aee28bfeb..d95d252559f 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -22,9 +22,11 @@ import numpy as np import pandas as pd +import pylibcudf as plc + import cudf -import cudf._lib.labeling -import cudf.core.index +from cudf._lib.column import Column +from cudf.core.buffer import acquire_spill_lock from cudf.core.groupby.groupby import ( DataFrameGroupBy, GroupBy, @@ -48,7 +50,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs): func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs ) if len(self.grouping.bin_labels) != len(result): - index = cudf.core.index.Index( + index = cudf.Index( self.grouping.bin_labels, name=self.grouping.names[0] ) return result._align_to_index( @@ -125,7 +127,7 @@ class SeriesResampler(_Resampler, SeriesGroupBy): class _ResampleGrouping(_Grouping): - bin_labels: cudf.core.index.Index + bin_labels: cudf.Index def __init__(self, obj, by=None, level=None): self._freq = getattr(by, "freq", None) @@ -170,7 +172,7 @@ def deserialize(cls, header, frames): out.names = names out._named_columns = _named_columns out._key_columns = key_columns - out.bin_labels = cudf.core.index.Index.deserialize( + out.bin_labels = cudf.Index.deserialize( header["__bin_labels"], frames[-header["__bin_labels_count"] :] ) out._freq = header["_freq"] @@ -268,13 +270,19 @@ def _handle_frequency_grouper(self, by): cast_bin_labels = bin_labels.astype(result_type) # bin the key column: - bin_numbers = cudf._lib.labeling.label_bins( - cast_key_column, - left_edges=cast_bin_labels[:-1]._column, - left_inclusive=(closed == "left"), - right_edges=cast_bin_labels[1:]._column, - right_inclusive=(closed == "right"), - ) + with acquire_spill_lock(): + plc_column = plc.labeling.label_bins( + cast_key_column.to_pylibcudf(mode="read"), + cast_bin_labels[:-1]._column.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES + if closed == "left" + else plc.labeling.Inclusive.NO, + cast_bin_labels[1:]._column.to_pylibcudf(mode="read"), + plc.labeling.Inclusive.YES + if closed == "right" + else plc.labeling.Inclusive.NO, + ) + bin_numbers = Column.from_pylibcudf(plc_column) if label == "right": cast_bin_labels = cast_bin_labels[1:]