diff --git a/example/cudf-ak.ipynb b/example/cudf-ak.ipynb index 8d82eba..f786c4e 100644 --- a/example/cudf-ak.ipynb +++ b/example/cudf-ak.ipynb @@ -9,9 +9,9 @@ "```python\n", "import awkward as ak\n", "\n", - "def make_data(fn):\n", + "def make_data(fn, N=1000000):\n", " part = [[[1, 2, 3], [], [4, 5]],\n", - " [[6, 7]]] * 1000000\n", + " [[6, 7]]] * N\n", " arr = ak.Array({\"a\": part})\n", " ak.to_parquet(arr, fn, extensionarray=False)\n", "```" @@ -19,17 +19,17 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "id": "cefd8e53-a56f-4b0c-88d2-d662d59849a7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "('2.6.9', '2024.8.1.dev29+g9b9f27f.d20240927')" + "('2.7.1', '2024.10.1.dev9+g9f64d31')" ] }, - "execution_count": 1, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -43,6 +43,7 @@ "import subprocess\n", "\n", "def gpu_mem():\n", + " return\n", " print(subprocess.check_output(\"nvidia-smi | grep py\", shell=True).split()[-2].decode())\n", "\n", "ak.__version__, akimbo.__version__" @@ -50,28 +51,20 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "0490043a-564a-4c11-bb0d-a54fb4c6fb10", "metadata": { "scrolled": true }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "160MiB\n" - ] - } - ], + "outputs": [], "source": [ - "df = cudf.read_parquet(\"/floppy/code/awkward/s.parquet\")\n", + "df = cudf.read_parquet(\"s.parquet\")\n", "gpu_mem()" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "id": "e29ff9a4-60e4-4260-9a44-c135ad6d7d6b", "metadata": {}, "outputs": [ @@ -82,7 +75,7 @@ "dtype: object" ] }, - "execution_count": 3, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -93,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "58d16a80-041e-4260-8c56-9de932dde557", "metadata": {}, "outputs": [ @@ -104,7 +97,7 @@ "Name: 0, dtype: list" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -115,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "c7b65320-e1fa-44b2-a232-6ffb97ba1d18", "metadata": { "scrolled": true @@ -185,6 +178,7 @@ " 'from_raggedtensor',\n", " 'from_rdataframe',\n", " 'from_regular',\n", + " 'from_tensorflow',\n", " 'from_torch',\n", " 'full_like',\n", " 'highlevel',\n", @@ -209,6 +203,7 @@ " 'mixin_class',\n", " 'mixin_class_method',\n", " 'moment',\n", + " 'named_axis',\n", " 'nan_to_none',\n", " 'nan_to_num',\n", " 'nanargmax',\n", @@ -229,6 +224,7 @@ " 'operations',\n", " 'pad_none',\n", " 'parameters',\n", + " 'positional_axis',\n", " 'prettyprint',\n", " 'prod',\n", " 'ptp',\n", @@ -265,6 +261,7 @@ " 'to_raggedtensor',\n", " 'to_rdataframe',\n", " 'to_regular',\n", + " 'to_tensorflow',\n", " 'to_torch',\n", " 'tolist',\n", " 'transform',\n", @@ -281,14 +278,16 @@ " 'where',\n", " 'with_field',\n", " 'with_name',\n", + " 'with_named_axis',\n", " 'with_parameter',\n", " 'without_field',\n", + " 'without_named_axis',\n", " 'without_parameters',\n", " 'zeros_like',\n", " 'zip']" ] }, - "execution_count": 5, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -300,7 +299,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "8ff11e13-8503-4d79-a64c-993028709ca4", "metadata": {}, "outputs": [ @@ -310,7 +309,7 @@ "array(28000000)" ] }, - "execution_count": 6, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -321,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "2dd99fe5-0523-46c9-87ec-1392070f5139", "metadata": {}, "outputs": [ @@ -331,7 +330,7 @@ "cupy.ndarray" ] }, - "execution_count": 7, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -343,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "id": "9d8e55cf-8cf1-40a0-8733-24b7719f431d", "metadata": {}, "outputs": [ @@ -351,7 +350,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "4.83 ms ± 16 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "12.6 ms ± 779 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -362,7 +361,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "id": "fae94aea-d9cf-4228-bcab-f843c7cc9c98", "metadata": {}, "outputs": [ @@ -383,7 +382,7 @@ "Length: 2000000, dtype: list" ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -413,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "id": "558ca2c3-d6c7-4404-bcab-557b9b03f795", "metadata": {}, "outputs": [ @@ -445,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "d240ea54-87b4-4b99-b67f-b2f885a4bf5e", "metadata": { "scrolled": true @@ -457,7 +456,7 @@ "array([15, 13, 15, ..., 13, 15, 13], dtype=int32)" ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -485,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "73a35144-292f-4b1d-bbc0-4ebba2a84b0d", "metadata": {}, "outputs": [ @@ -493,7 +492,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "6.17 ms ± 118 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" + "16.7 ms ± 233 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -529,7 +528,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "d039a508-e77c-4e23-a583-ec7997a88bb1", "metadata": {}, "outputs": [ @@ -550,7 +549,7 @@ "Length: 2000000, dtype: list" ] }, - "execution_count": 15, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -562,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "f149dfaf-c01e-4d0a-8e01-2d20623d216f", "metadata": {}, "outputs": [ @@ -583,7 +582,7 @@ "Length: 2000000, dtype: list" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -604,9 +603,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:cuda] *", + "display_name": "Python 3 (ipykernel)", "language": "python", - "name": "conda-env-cuda-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -618,7 +617,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/src/akimbo/cudf.py b/src/akimbo/cudf.py index c9df0b0..1fcb073 100644 --- a/src/akimbo/cudf.py +++ b/src/akimbo/cudf.py @@ -47,7 +47,7 @@ def dec_cu(op, match=match_string): def f(lay, **kwargs): # op(column, ...)->column col = op(lay._to_cudf(cudf, None, len(lay)), **kwargs) - return from_cudf(cudf.Series(col)).layout + return from_cudf(cudf.Series._from_column(col)).layout return dec(func=f, match=match, inmode="ak") @@ -61,7 +61,7 @@ def f(lay, method=meth, **kwargs): # this is different from dec_cu, because we need to instantiate StringMethods # before getting the method from it col = getattr( - StringMethods(cudf.Series(lay._to_cudf(cudf, None, len(lay)))), method + StringMethods(cudf.Series._from_column(lay._to_cudf(cudf, None, len(lay)))), method )(**kwargs) return from_cudf(col).layout @@ -87,7 +87,7 @@ def f(lay, method=meth, **kwargs): else: # attributes giving components col = m - return from_cudf(cudf.Series(col)).layout + return from_cudf(cudf.Series._from_column(col)).layout if isinstance(getattr(DatetimeColumn, meth), property): setattr( @@ -118,7 +118,12 @@ def _to_output(cls, arr): @classmethod def to_array(cls, data) -> ak.Array: - return from_cudf(data) + if isinstance(data, cls.series_type): + return from_cudf(data) + out = {} + for col in data.columns: + out[col] = from_cudf(data[col]) + return ak.Array(out) @property def array(self) -> ak.Array: @@ -151,3 +156,4 @@ def ak_property(self): Series.ak = ak_property # no official register function? +DataFrame.ak = ak_property # no official register function? diff --git a/tests/test_cudf.py b/tests/test_cudf.py index c1ef680..5aa9174 100644 --- a/tests/test_cudf.py +++ b/tests/test_cudf.py @@ -93,3 +93,10 @@ def test_times(): s = akimbo.io.ak_to_series(arr, "cudf") s2 = s.ak.dt.second assert s2.ak.to_list() == [[[0, 1, None, 2]], [], [[0, 1, None, 2]]] + + +def test_dataframe(): + df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + arr = df.ak.array + assert ak.backend(arr) == "cuda" + assert arr.fields == ["a", "b"] diff --git a/tests/test_spark.py b/tests/test_spark.py new file mode 100644 index 0000000..2a9a7f0 --- /dev/null +++ b/tests/test_spark.py @@ -0,0 +1,19 @@ +import pytest + +pd = pytest.importorskip("pandas") +pyspark = pytest.importorskip("pyspark") +import akimbo.spark + + +@pytest.fixture(scope="module") +def spark(): + from pyspark.sql import SparkSession + + return SparkSession.builder.appName("test").getOrCreate() + + +def test1(spark): + x = pd.Series([1, 2, 3]) + df = spark.createDataFrame(pd.DataFrame(x, columns=["x"])) + out = df.ak.is_none.collect() + assert out.tolist() == [False, False, False]