Remove cudf._lib.labeling in favor of inlining pylibcudf (#17346)

Contributes to #17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #17346
rapidsai · Nov 18, 2024 · 302e625 · 302e625
1 parent 02c35bf
commit 302e625
Show file tree

Hide file tree

Showing 6 changed files with 61 additions and 58 deletions.
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -24,7 +24,6 @@ set(cython_sources
     interop.pyx
     join.pyx
     json.pyx
-    labeling.pyx
     lists.pyx
     merge.pyx
     null_mask.pyx

diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
@@ -11,7 +11,6 @@
     interop,
     join,
     json,
-    labeling,
     merge,
     null_mask,
     nvtext,

diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -14,9 +14,10 @@
 import pandas as pd
 import pyarrow as pa
 
+import pylibcudf as plc
+
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.labeling import label_bins
 from cudf._lib.search import search_sorted
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals import unary
@@ -25,7 +26,7 @@
     get_compatible_timezone,
     get_tz_data,
 )
-from cudf.core.buffer import Buffer
+from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
@@ -819,13 +820,16 @@ def _find_ambiguous_and_nonexistent(
         # The end of an ambiguous time period is what Clock 2 reads at
         # the moment of transition:
         ambiguous_end = clock_2.apply_boolean_mask(cond)
-        ambiguous = label_bins(
-            self,
-            left_edges=ambiguous_begin,
-            left_inclusive=True,
-            right_edges=ambiguous_end,
-            right_inclusive=False,
-        ).notnull()
+        with acquire_spill_lock():
+            plc_column = plc.labeling.label_bins(
+                self.to_pylibcudf(mode="read"),
+                ambiguous_begin.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES,
+                ambiguous_end.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.NO,
+            )
+            ambiguous = libcudf.column.Column.from_pylibcudf(plc_column)
+        ambiguous = ambiguous.notnull()
 
         # At the start of a non-existent time period, Clock 2 reads less
         # than Clock 1 (which has been turned forward):
@@ -835,13 +839,16 @@ def _find_ambiguous_and_nonexistent(
         # The end of the non-existent time period is what Clock 1 reads
         # at the moment of transition:
         nonexistent_end = clock_1.apply_boolean_mask(cond)
-        nonexistent = label_bins(
-            self,
-            left_edges=nonexistent_begin,
-            left_inclusive=True,
-            right_edges=nonexistent_end,
-            right_inclusive=False,
-        ).notnull()
+        with acquire_spill_lock():
+            plc_column = plc.labeling.label_bins(
+                self.to_pylibcudf(mode="read"),
+                nonexistent_begin.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES,
+                nonexistent_end.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.NO,
+            )
+            nonexistent = libcudf.column.Column.from_pylibcudf(plc_column)
+        nonexistent = nonexistent.notnull()
 
         return ambiguous, nonexistent
 

diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
@@ -6,8 +6,12 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
+
 import cudf
+from cudf._lib.column import Column
 from cudf.api.types import is_list_like
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import as_column
 from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes
 from cudf.core.index import IntervalIndex, interval_range
@@ -256,9 +260,19 @@ def cut(
         # the input arr must be changed to the same type as the edges
         input_arr = input_arr.astype(left_edges.dtype)
     # get the indexes for the appropriate number
-    index_labels = cudf._lib.labeling.label_bins(
-        input_arr, left_edges, left_inclusive, right_edges, right_inclusive
-    )
+    with acquire_spill_lock():
+        plc_column = plc.labeling.label_bins(
+            input_arr.to_pylibcudf(mode="read"),
+            left_edges.to_pylibcudf(mode="read"),
+            plc.labeling.Inclusive.YES
+            if left_inclusive
+            else plc.labeling.Inclusive.NO,
+            right_edges.to_pylibcudf(mode="read"),
+            plc.labeling.Inclusive.YES
+            if right_inclusive
+            else plc.labeling.Inclusive.NO,
+        )
+        index_labels = Column.from_pylibcudf(plc_column)
 
     if labels is False:
         # if labels is false we return the index labels, we return them
@@ -283,7 +297,7 @@ def cut(
             # should allow duplicate categories.
             return interval_labels[index_labels]
 
-    index_labels = as_unsigned_codes(len(interval_labels), index_labels)
+    index_labels = as_unsigned_codes(len(interval_labels), index_labels)  # type: ignore[arg-type]
 
     col = CategoricalColumn(
         data=None,

diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py
@@ -22,9 +22,11 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
+
 import cudf
-import cudf._lib.labeling
-import cudf.core.index
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.groupby.groupby import (
     DataFrameGroupBy,
     GroupBy,
@@ -48,7 +50,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
             func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
         )
         if len(self.grouping.bin_labels) != len(result):
-            index = cudf.core.index.Index(
+            index = cudf.Index(
                 self.grouping.bin_labels, name=self.grouping.names[0]
             )
             return result._align_to_index(
@@ -125,7 +127,7 @@ class SeriesResampler(_Resampler, SeriesGroupBy):
 
 
 class _ResampleGrouping(_Grouping):
-    bin_labels: cudf.core.index.Index
+    bin_labels: cudf.Index
 
     def __init__(self, obj, by=None, level=None):
         self._freq = getattr(by, "freq", None)
@@ -170,7 +172,7 @@ def deserialize(cls, header, frames):
         out.names = names
         out._named_columns = _named_columns
         out._key_columns = key_columns
-        out.bin_labels = cudf.core.index.Index.deserialize(
+        out.bin_labels = cudf.Index.deserialize(
             header["__bin_labels"], frames[-header["__bin_labels_count"] :]
         )
         out._freq = header["_freq"]
@@ -268,13 +270,19 @@ def _handle_frequency_grouper(self, by):
             cast_bin_labels = bin_labels.astype(result_type)
 
         # bin the key column:
-        bin_numbers = cudf._lib.labeling.label_bins(
-            cast_key_column,
-            left_edges=cast_bin_labels[:-1]._column,
-            left_inclusive=(closed == "left"),
-            right_edges=cast_bin_labels[1:]._column,
-            right_inclusive=(closed == "right"),
-        )
+        with acquire_spill_lock():
+            plc_column = plc.labeling.label_bins(
+                cast_key_column.to_pylibcudf(mode="read"),
+                cast_bin_labels[:-1]._column.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES
+                if closed == "left"
+                else plc.labeling.Inclusive.NO,
+                cast_bin_labels[1:]._column.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES
+                if closed == "right"
+                else plc.labeling.Inclusive.NO,
+            )
+            bin_numbers = Column.from_pylibcudf(plc_column)
 
         if label == "right":
             cast_bin_labels = cast_bin_labels[1:]
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,7 +11,6 @@ @@
         interop,
         join,
         json,
-        labeling,
         merge,
         null_mask,
         nvtext,
@@ Expand Down @@