From 7e9bb8f2967155cda882bf154cbaf5e4075addfb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 15 Nov 2024 15:19:12 -0800
Subject: [PATCH] Remove cudf._lib.labeling in favor of inlining pylibcudf

---
 python/cudf/cudf/_lib/CMakeLists.txt     |  1 -
 python/cudf/cudf/_lib/__init__.py        |  1 -
 python/cudf/cudf/_lib/column.pyi         |  9 ++++++
 python/cudf/cudf/_lib/labeling.pyx       | 24 ---------------
 python/cudf/cudf/core/column/datetime.py | 39 ++++++++++++++----------
 python/cudf/cudf/core/cut.py             | 22 ++++++++++---
 python/cudf/cudf/core/resample.py        | 32 +++++++++++--------
 7 files changed, 70 insertions(+), 58 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/labeling.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 41a7db2285a..a626f8dfaa3 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -26,7 +26,6 @@ set(cython_sources
     interop.pyx
     join.pyx
     json.pyx
-    labeling.pyx
     lists.pyx
     merge.pyx
     null_mask.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 57df6899a22..43d71971d89 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -13,7 +13,6 @@
     interop,
     join,
     json,
-    labeling,
     merge,
     null_mask,
     nvtext,
diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi
index bb38488eefb..bdd90be45b8 100644
--- a/python/cudf/cudf/_lib/column.pyi
+++ b/python/cudf/cudf/_lib/column.pyi
@@ -2,8 +2,12 @@
 
 from __future__ import annotations
 
+from typing import Literal
+
 from typing_extensions import Self
 
+import pylibcudf as plc
+
 from cudf._typing import Dtype, DtypeObj, ScalarLike
 from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase
@@ -71,3 +75,8 @@ class Column:
     # TODO: The val parameter should be Scalar, not ScalarLike
     @staticmethod
     def from_scalar(val: ScalarLike, size: int) -> ColumnBase: ...
+    @staticmethod
+    def from_pylibcudf(
+        col: plc.Column, data_ptr_exposed: bool = False
+    ) -> ColumnBase: ...
+    def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column: ...
diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx
deleted file mode 100644
index 524bfd3b2e8..00000000000
--- a/python/cudf/cudf/_lib/labeling.pyx
+++ /dev/null
@@ -1,24 +0,0 @@
-# Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool as cbool
-
-import pylibcudf as plc
-
-from cudf._lib.column cimport Column
-from cudf.core.buffer import acquire_spill_lock
-
-
-# Note that the parameter input shadows a Python built-in in the local scope,
-# but I'm not too concerned about that since there's no use-case for actual
-# input in this context.
-@acquire_spill_lock()
-def label_bins(Column input, Column left_edges, cbool left_inclusive,
-               Column right_edges, cbool right_inclusive):
-    plc_column = plc.labeling.label_bins(
-        input.to_pylibcudf(mode="read"),
-        left_edges.to_pylibcudf(mode="read"),
-        plc.labeling.Inclusive.YES if left_inclusive else plc.labeling.Inclusive.NO,
-        right_edges.to_pylibcudf(mode="read"),
-        plc.labeling.Inclusive.YES if right_inclusive else plc.labeling.Inclusive.NO,
-    )
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index bd0d72b9bc0..32f068144a3 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -14,9 +14,10 @@
 import pandas as pd
 import pyarrow as pa
 
+import pylibcudf as plc
+
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.labeling import label_bins
 from cudf._lib.search import search_sorted
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals.timezones import (
@@ -24,7 +25,7 @@
     get_compatible_timezone,
     get_tz_data,
 )
-from cudf.core.buffer import Buffer
+from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
@@ -818,13 +819,16 @@ def _find_ambiguous_and_nonexistent(
         # The end of an ambiguous time period is what Clock 2 reads at
         # the moment of transition:
         ambiguous_end = clock_2.apply_boolean_mask(cond)
-        ambiguous = label_bins(
-            self,
-            left_edges=ambiguous_begin,
-            left_inclusive=True,
-            right_edges=ambiguous_end,
-            right_inclusive=False,
-        ).notnull()
+        with acquire_spill_lock():
+            plc_column = plc.labeling.label_bins(
+                self.to_pylibcudf(mode="read"),
+                ambiguous_begin.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES,
+                ambiguous_end.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.NO,
+            )
+            ambiguous = libcudf.column.Column.from_pylibcudf(plc_column)
+        ambiguous = ambiguous.notnull()
 
         # At the start of a non-existent time period, Clock 2 reads less
         # than Clock 1 (which has been turned forward):
@@ -834,13 +838,16 @@ def _find_ambiguous_and_nonexistent(
         # The end of the non-existent time period is what Clock 1 reads
         # at the moment of transition:
         nonexistent_end = clock_1.apply_boolean_mask(cond)
-        nonexistent = label_bins(
-            self,
-            left_edges=nonexistent_begin,
-            left_inclusive=True,
-            right_edges=nonexistent_end,
-            right_inclusive=False,
-        ).notnull()
+        with acquire_spill_lock():
+            plc_column = plc.labeling.label_bins(
+                self.to_pylibcudf(mode="read"),
+                nonexistent_begin.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES,
+                nonexistent_end.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.NO,
+            )
+            nonexistent = libcudf.column.Column.from_pylibcudf(plc_column)
+        nonexistent = nonexistent.notnull()
 
         return ambiguous, nonexistent
 
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index c9b1fa2669c..a4d12cfc7f0 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -6,8 +6,12 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
+
 import cudf
+from cudf._lib.column import Column
 from cudf.api.types import is_list_like
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import as_column
 from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes
 from cudf.core.index import IntervalIndex, interval_range
@@ -256,9 +260,19 @@ def cut(
         # the input arr must be changed to the same type as the edges
         input_arr = input_arr.astype(left_edges.dtype)
     # get the indexes for the appropriate number
-    index_labels = cudf._lib.labeling.label_bins(
-        input_arr, left_edges, left_inclusive, right_edges, right_inclusive
-    )
+    with acquire_spill_lock():
+        plc_column = plc.labeling.label_bins(
+            input_arr.to_pylibcudf(mode="read"),
+            left_edges.to_pylibcudf(mode="read"),
+            plc.labeling.Inclusive.YES
+            if left_inclusive
+            else plc.labeling.Inclusive.NO,
+            right_edges.to_pylibcudf(mode="read"),
+            plc.labeling.Inclusive.YES
+            if right_inclusive
+            else plc.labeling.Inclusive.NO,
+        )
+        index_labels = Column.from_pylibcudf(plc_column)
 
     if labels is False:
         # if labels is false we return the index labels, we return them
@@ -283,7 +297,7 @@ def cut(
             # should allow duplicate categories.
             return interval_labels[index_labels]
 
-    index_labels = as_unsigned_codes(len(interval_labels), index_labels)
+    index_labels = as_unsigned_codes(len(interval_labels), index_labels)  # type: ignore[arg-type]
 
     col = CategoricalColumn(
         data=None,
diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py
index e0aee28bfeb..d95d252559f 100644
--- a/python/cudf/cudf/core/resample.py
+++ b/python/cudf/cudf/core/resample.py
@@ -22,9 +22,11 @@
 import numpy as np
 import pandas as pd
 
+import pylibcudf as plc
+
 import cudf
-import cudf._lib.labeling
-import cudf.core.index
+from cudf._lib.column import Column
+from cudf.core.buffer import acquire_spill_lock
 from cudf.core.groupby.groupby import (
     DataFrameGroupBy,
     GroupBy,
@@ -48,7 +50,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
             func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
         )
         if len(self.grouping.bin_labels) != len(result):
-            index = cudf.core.index.Index(
+            index = cudf.Index(
                 self.grouping.bin_labels, name=self.grouping.names[0]
             )
             return result._align_to_index(
@@ -125,7 +127,7 @@ class SeriesResampler(_Resampler, SeriesGroupBy):
 
 
 class _ResampleGrouping(_Grouping):
-    bin_labels: cudf.core.index.Index
+    bin_labels: cudf.Index
 
     def __init__(self, obj, by=None, level=None):
         self._freq = getattr(by, "freq", None)
@@ -170,7 +172,7 @@ def deserialize(cls, header, frames):
         out.names = names
         out._named_columns = _named_columns
         out._key_columns = key_columns
-        out.bin_labels = cudf.core.index.Index.deserialize(
+        out.bin_labels = cudf.Index.deserialize(
             header["__bin_labels"], frames[-header["__bin_labels_count"] :]
         )
         out._freq = header["_freq"]
@@ -268,13 +270,19 @@ def _handle_frequency_grouper(self, by):
             cast_bin_labels = bin_labels.astype(result_type)
 
         # bin the key column:
-        bin_numbers = cudf._lib.labeling.label_bins(
-            cast_key_column,
-            left_edges=cast_bin_labels[:-1]._column,
-            left_inclusive=(closed == "left"),
-            right_edges=cast_bin_labels[1:]._column,
-            right_inclusive=(closed == "right"),
-        )
+        with acquire_spill_lock():
+            plc_column = plc.labeling.label_bins(
+                cast_key_column.to_pylibcudf(mode="read"),
+                cast_bin_labels[:-1]._column.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES
+                if closed == "left"
+                else plc.labeling.Inclusive.NO,
+                cast_bin_labels[1:]._column.to_pylibcudf(mode="read"),
+                plc.labeling.Inclusive.YES
+                if closed == "right"
+                else plc.labeling.Inclusive.NO,
+            )
+            bin_numbers = Column.from_pylibcudf(plc_column)
 
         if label == "right":
             cast_bin_labels = cast_bin_labels[1:]