Skip to content

Commit

Permalink
Remove cudf._lib.labeling in favor of inlining pylibcudf (#17346)
Browse files: browse the repository at this point in the history
Contributes to #17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #17346
  • Loading branch information
mroeschke authored Nov 18, 2024
1 parent 02c35bf commit 302e625
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 58 deletions.
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ set(cython_sources
interop.pyx
join.pyx
json.pyx
labeling.pyx
lists.pyx
merge.pyx
null_mask.pyx
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
interop,
join,
json,
labeling,
merge,
null_mask,
nvtext,
Expand Down
24 changes: 0 additions & 24 deletions python/cudf/cudf/_lib/labeling.pyx

This file was deleted.

39 changes: 23 additions & 16 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,10 @@
import pandas as pd
import pyarrow as pa

import pylibcudf as plc

import cudf
from cudf import _lib as libcudf
from cudf._lib.labeling import label_bins
from cudf._lib.search import search_sorted
from cudf.core._compat import PANDAS_GE_220
from cudf.core._internals import unary
Expand All @@ -25,7 +26,7 @@
get_compatible_timezone,
get_tz_data,
)
from cudf.core.buffer import Buffer
from cudf.core.buffer import Buffer, acquire_spill_lock
from cudf.core.column import ColumnBase, as_column, column, string
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.utils.dtypes import _get_base_dtype
Expand Down Expand Up @@ -819,13 +820,16 @@ def _find_ambiguous_and_nonexistent(
# The end of an ambiguous time period is what Clock 2 reads at
# the moment of transition:
ambiguous_end = clock_2.apply_boolean_mask(cond)
ambiguous = label_bins(
self,
left_edges=ambiguous_begin,
left_inclusive=True,
right_edges=ambiguous_end,
right_inclusive=False,
).notnull()
with acquire_spill_lock():
plc_column = plc.labeling.label_bins(
self.to_pylibcudf(mode="read"),
ambiguous_begin.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES,
ambiguous_end.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.NO,
)
ambiguous = libcudf.column.Column.from_pylibcudf(plc_column)
ambiguous = ambiguous.notnull()

# At the start of a non-existent time period, Clock 2 reads less
# than Clock 1 (which has been turned forward):
Expand All @@ -835,13 +839,16 @@ def _find_ambiguous_and_nonexistent(
# The end of the non-existent time period is what Clock 1 reads
# at the moment of transition:
nonexistent_end = clock_1.apply_boolean_mask(cond)
nonexistent = label_bins(
self,
left_edges=nonexistent_begin,
left_inclusive=True,
right_edges=nonexistent_end,
right_inclusive=False,
).notnull()
with acquire_spill_lock():
plc_column = plc.labeling.label_bins(
self.to_pylibcudf(mode="read"),
nonexistent_begin.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES,
nonexistent_end.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.NO,
)
nonexistent = libcudf.column.Column.from_pylibcudf(plc_column)
nonexistent = nonexistent.notnull()

return ambiguous, nonexistent

Expand Down
22 changes: 18 additions & 4 deletions python/cudf/cudf/core/cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,12 @@
import numpy as np
import pandas as pd

import pylibcudf as plc

import cudf
from cudf._lib.column import Column
from cudf.api.types import is_list_like
from cudf.core.buffer import acquire_spill_lock
from cudf.core.column import as_column
from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes
from cudf.core.index import IntervalIndex, interval_range
Expand Down Expand Up @@ -256,9 +260,19 @@ def cut(
# the input arr must be changed to the same type as the edges
input_arr = input_arr.astype(left_edges.dtype)
# get the index of the bin that each input value falls into
index_labels = cudf._lib.labeling.label_bins(
input_arr, left_edges, left_inclusive, right_edges, right_inclusive
)
with acquire_spill_lock():
plc_column = plc.labeling.label_bins(
input_arr.to_pylibcudf(mode="read"),
left_edges.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES
if left_inclusive
else plc.labeling.Inclusive.NO,
right_edges.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES
if right_inclusive
else plc.labeling.Inclusive.NO,
)
index_labels = Column.from_pylibcudf(plc_column)

if labels is False:
# if labels is false we return the index labels, we return them
Expand All @@ -283,7 +297,7 @@ def cut(
# should allow duplicate categories.
return interval_labels[index_labels]

index_labels = as_unsigned_codes(len(interval_labels), index_labels)
index_labels = as_unsigned_codes(len(interval_labels), index_labels) # type: ignore[arg-type]

col = CategoricalColumn(
data=None,
Expand Down
32 changes: 20 additions & 12 deletions python/cudf/cudf/core/resample.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@
import numpy as np
import pandas as pd

import pylibcudf as plc

import cudf
import cudf._lib.labeling
import cudf.core.index
from cudf._lib.column import Column
from cudf.core.buffer import acquire_spill_lock
from cudf.core.groupby.groupby import (
DataFrameGroupBy,
GroupBy,
Expand All @@ -48,7 +50,7 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
)
if len(self.grouping.bin_labels) != len(result):
index = cudf.core.index.Index(
index = cudf.Index(
self.grouping.bin_labels, name=self.grouping.names[0]
)
return result._align_to_index(
Expand Down Expand Up @@ -125,7 +127,7 @@ class SeriesResampler(_Resampler, SeriesGroupBy):


class _ResampleGrouping(_Grouping):
bin_labels: cudf.core.index.Index
bin_labels: cudf.Index

def __init__(self, obj, by=None, level=None):
self._freq = getattr(by, "freq", None)
Expand Down Expand Up @@ -170,7 +172,7 @@ def deserialize(cls, header, frames):
out.names = names
out._named_columns = _named_columns
out._key_columns = key_columns
out.bin_labels = cudf.core.index.Index.deserialize(
out.bin_labels = cudf.Index.deserialize(
header["__bin_labels"], frames[-header["__bin_labels_count"] :]
)
out._freq = header["_freq"]
Expand Down Expand Up @@ -268,13 +270,19 @@ def _handle_frequency_grouper(self, by):
cast_bin_labels = bin_labels.astype(result_type)

# bin the key column:
bin_numbers = cudf._lib.labeling.label_bins(
cast_key_column,
left_edges=cast_bin_labels[:-1]._column,
left_inclusive=(closed == "left"),
right_edges=cast_bin_labels[1:]._column,
right_inclusive=(closed == "right"),
)
with acquire_spill_lock():
plc_column = plc.labeling.label_bins(
cast_key_column.to_pylibcudf(mode="read"),
cast_bin_labels[:-1]._column.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES
if closed == "left"
else plc.labeling.Inclusive.NO,
cast_bin_labels[1:]._column.to_pylibcudf(mode="read"),
plc.labeling.Inclusive.YES
if closed == "right"
else plc.labeling.Inclusive.NO,
)
bin_numbers = Column.from_pylibcudf(plc_column)

if label == "right":
cast_bin_labels = cast_bin_labels[1:]
Expand Down

0 comments on commit 302e625

Please sign in to comment.