From 8c834c72770c4e2cb99a82c490296963bd9decc7 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 10 Oct 2024 15:23:14 -0400 Subject: [PATCH] API --- docs/api.rst | 14 ++++--------- docs/index.rst | 6 +++++- src/akimbo/ak_from_cudf.py | 28 ++++++++++++++++---------- src/akimbo/apply_tree.py | 14 +++++++------ src/akimbo/cudf.py | 40 +++++++++++++++++++++++++++----------- src/akimbo/utils.py | 20 +++++++++++++++++++ 6 files changed, 84 insertions(+), 38 deletions(-) create mode 100644 src/akimbo/utils.py diff --git a/docs/api.rst b/docs/api.rst index e33ff25..a2282f8 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -24,9 +24,9 @@ Accessor .. autosummary:: :toctree: generated/ - akimbo.mixin.Accessor + Accessor -.. autoclass:: akimbo.mixin.Accessor +.. autoclass:: Accessor :members: @@ -45,7 +45,7 @@ Backends .. autoclass:: akimbo.polars.PolarsAwkwardAccessor -.. autoclass:: akimbo.polars.CudfAwkwardAccessor +.. autoclass:: akimbo.cudf.CudfAwkwardAccessor Extensions @@ -57,7 +57,7 @@ being acted on. Check the ``dir()`` of each (or use tab-completion) to see the operations available. .. autoclass:: akimbo.datetimes.DatetimeAccessor - :members: cast + :members: .. autoclass:: akimbo.strings.StringAccessor :members: @@ -69,9 +69,3 @@ to see the operations available. The cuDF backend also has these implemented with GPU-specific variants, ``akimbo.cudf.CudfStringAccessor`` and ``akimbo.cudf.CudfDatetimeAccessor``. - -Adding Extensions (advanced) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -The patterns used in the two builtin extensions above can be used to add -type-specific functionality to ``akimbo``. One diff --git a/docs/index.rst b/docs/index.rst index 68a99b0..c3daebd 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -23,7 +23,7 @@ identical syntax: - pandas - dask.dataframe - polars -- cuDF (in development) +- cuDF numpy-like API @@ -34,6 +34,7 @@ for slicing and accessing data deep in nested structures, Example: choose every second inner element in a list-of-lists .. code-block:: python + series.ak[:, ::2] Any function, ufunc or aggregation at any level @@ -43,6 +44,7 @@ For manipulating numerics at deeper levels of your nested structures or ragged arrays while maintaining the original layout .. code-block:: python + series.ak.abs() # absolute for all numerical values series.ak.sum(axis=3) # sum over deeply nested level series.ak + 1 # numpy-like broadcasting into deeper levels @@ -52,6 +54,7 @@ arrays of values, and they will only affect the appropriate parts of the structu without changing the layout. .. code-block:: python + series.ak.str.upper() CPU/GPU numba support @@ -64,6 +67,7 @@ in groupby/window operations. If your data is on the GPU, you can use numba-cuda with slight modifications to your original function. .. code-block:: python + @numba.njit def sum_list_of_list(x): total = 0 diff --git a/src/akimbo/ak_from_cudf.py b/src/akimbo/ak_from_cudf.py index a139a6c..9996ffc 100644 --- a/src/akimbo/ak_from_cudf.py +++ b/src/akimbo/ak_from_cudf.py @@ -1,12 +1,18 @@ -import cudf -import pyarrow -import cupy -import numpy +from typing import Optional + +from akimbo.utils import NoAttributes +try: + import cudf + import cupy +except ImportError: + cudf = NoAttributes() + cupy = NoAttributes() import awkward as ak -from awkward._backends.numpy import NumpyBackend +import numpy +import pyarrow from awkward._backends.cupy import CupyBackend - +from awkward._backends.numpy import NumpyBackend # COPIED from awkward/studies/cudf-to-awkward.py @@ -351,7 +357,7 @@ def remove_revertable(layout, **kwargs): def recurse_finalize( out: ak.contents.Content, column: cudf.core.column.column.ColumnBase, - validbits: None | cudf.core.buffer.buffer.Buffer, + validbits: Optional[cudf.core.buffer.buffer.Buffer], generate_bitmasks: bool, fix_offsets: bool = True, ): @@ -569,13 +575,15 @@ def recurse( validbits = column.base_mask to64, dt = _pyarrow_to_numpy_dtype.get(str(arrow_type), (False, None)) - if to64: - data = cupy.asarray(data).view(cupy.int32).astype(cupy.int64) if dt is None: dt = arrow_type.to_pandas_dtype() + if to64: + data = cupy.asarray(column.base_data).view(cupy.int32).astype(cupy.int64) + else: + data = cupy.asarray(column.base_data) out = ak.contents.NumpyArray( - cupy.asarray(column.base_data).view(dt), + data.view(dt), parameters=None, backend=CupyBackend.instance(), ) diff --git a/src/akimbo/apply_tree.py b/src/akimbo/apply_tree.py index 6db39cd..9aed44d 100644 --- a/src/akimbo/apply_tree.py +++ b/src/akimbo/apply_tree.py @@ -41,10 +41,12 @@ def func(layout, **kwargs): return ak.transform(func, arr, *others) -def dec(func: callable, match: Callable[[ak.contents.Content], bool] = leaf, - outtype: Callable[[ak.contents.Content], ak.contents.Content] | None = None, - inmode: Literal["arrow", "numpy", "ak"] = "arrow"): - +def dec( + func: callable, + match: Callable[[ak.contents.Content], bool] = leaf, + outtype: Callable[[ak.contents.Content], ak.contents.Content] | None = None, + inmode: Literal["arrow", "numpy", "ak"] = "arrow", +): """Make a nested/ragged version of an operation to apply throughout a tree Parameters @@ -121,8 +123,8 @@ def f(self, *args, where=None, match_kwargs=None, **kwargs): match_kwargs: None | dict any extra field identifiers for matching a record as OK to process -{'-Kernel documentation follows from the original function-' if f.__doc__ else ''} -=== +{'--Kernel documentation follows from the original function--' if f.__doc__ else ''} + {f.__doc__ or str(f)} """ diff --git a/src/akimbo/cudf.py b/src/akimbo/cudf.py index abcbeef..6ef07a0 100644 --- a/src/akimbo/cudf.py +++ b/src/akimbo/cudf.py @@ -2,16 +2,28 @@ from typing import Callable import awkward as ak -import cudf -from cudf import DataFrame, Series, _lib as libcudf -from cudf.core.column.string import StringMethods -from cudf.core.column.datetime import DatetimeColumn + +from akimbo.utils import NoAttributes + +try: + import cudf + from cudf import DataFrame, Series + from cudf import _lib as libcudf + from cudf.core.column.datetime import DatetimeColumn + from cudf.core.column.string import StringMethods +except ImportError: + StringMethods = NoAttributes() + DatetimeColumn = NoAttributes() + libcudf = NoAttributes() + DataFrame = Series = NoAttributes() + from akimbo.ak_from_cudf import cudf_to_awkward as from_cudf +from akimbo.apply_tree import dec, leaf +from akimbo.datetimes import DatetimeAccessor +from akimbo.datetimes import match as match_t from akimbo.mixin import Accessor -from akimbo.datetimes import DatetimeAccessor, match as match_t from akimbo.strings import StringAccessor -from akimbo.apply_tree import dec, leaf def match_string(arr): @@ -22,14 +34,15 @@ class CudfStringAccessor(StringAccessor): """String operations on nested/var-length data""" def decode(self, encoding: str = "utf-8"): - raise NotImplementedError("cudf does not support bytearray type, so we can't automatically identify them") + raise NotImplementedError( + "cudf does not support bytearray type, so we can't automatically identify them" + ) def encode(self, encoding: str = "utf-8"): raise NotImplementedError("cudf does not support bytearray type") def dec_cu(op, match=match_string): - @functools.wraps(op) def f(lay, **kwargs): # op(column, ...)->column @@ -47,14 +60,15 @@ def f(lay, **kwargs): def f(lay, method=meth, **kwargs): # this is different from dec_cu, because we need to instantiate StringMethods # before getting the method from it - col = getattr(StringMethods(cudf.Series(lay._to_cudf(cudf, None, len(lay)))), method)(**kwargs) + col = getattr( + StringMethods(cudf.Series(lay._to_cudf(cudf, None, len(lay)))), method + )(**kwargs) return from_cudf(col).layout setattr(CudfStringAccessor, meth, dec(func=f, match=match_string, inmode="ak")) class CudfDatetimeAccessor(DatetimeAccessor): - ... @@ -76,7 +90,11 @@ def f(lay, method=meth, **kwargs): return from_cudf(cudf.Series(col)).layout if isinstance(getattr(DatetimeColumn, meth), property): - setattr(CudfDatetimeAccessor, meth, property(dec(func=f, match=match_t, inmode="ak"))) + setattr( + CudfDatetimeAccessor, + meth, + property(dec(func=f, match=match_t, inmode="ak")), + ) else: setattr(CudfDatetimeAccessor, meth, dec(func=f, match=match_t, inmode="ak")) diff --git a/src/akimbo/utils.py b/src/akimbo/utils.py new file mode 100644 index 0000000..e837a3a --- /dev/null +++ b/src/akimbo/utils.py @@ -0,0 +1,20 @@ +class NoAttributes: + """Allows importing akimbo.cudf even if cudf isn't installed + + This is done so that sphinx can still build docs on non-GPU systems. + """ + + def __dir__(self): + return [] + + def __getattr__(self, item): + if item == "__qualname__": + return "akimbo.utils.DummyAttributesObject" + return self + + def __call__(self, *args, **kwargs): + return self + + __name__ = "DummyAttributesObject" + __doc__ = None + __annotations__ = None