Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement first-class List type #60629

Draft
wants to merge 21 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pandas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
PeriodDtype,
IntervalDtype,
DatetimeTZDtype,
ListDtype,
StringDtype,
BooleanDtype,
# missing
Expand Down Expand Up @@ -261,6 +262,7 @@
"Interval",
"IntervalDtype",
"IntervalIndex",
"ListDtype",
"MultiIndex",
"NaT",
"NamedAgg",
Expand Down
6 changes: 6 additions & 0 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
TimedeltaArray,
)
from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin
from pandas.core.arrays.list_ import ListDtype
from pandas.core.arrays.string_ import StringDtype
from pandas.core.indexes.api import safe_sort_index

Expand Down Expand Up @@ -824,6 +825,11 @@ def assert_extension_array_equal(
[np.isnan(val) for val in right._ndarray[right_na]] # type: ignore[attr-defined]
), "wrong missing value sentinels"

# TODO: not every array type may be convertible to NumPy; should catch here
if isinstance(left.dtype, ListDtype) and isinstance(right.dtype, ListDtype):
assert left._pa_array == right._pa_array
return

left_valid = left[~left_na].to_numpy(dtype=object)
right_valid = right[~right_na].to_numpy(dtype=object)
if check_exact:
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
UInt32Dtype,
UInt64Dtype,
)
from pandas.core.arrays.list_ import ListDtype
from pandas.core.arrays.string_ import StringDtype
from pandas.core.construction import array # noqa: ICN001
from pandas.core.flags import Flags
Expand Down Expand Up @@ -103,6 +104,7 @@
"Interval",
"IntervalDtype",
"IntervalIndex",
"ListDtype",
"MultiIndex",
"NaT",
"NamedAgg",
Expand Down
137 changes: 137 additions & 0 deletions pandas/core/arrays/list_.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
from __future__ import annotations

from typing import (
TYPE_CHECKING,
ClassVar,
)

import numpy as np

from pandas._libs import missing as libmissing
from pandas.compat import HAS_PYARROW
from pandas.util._decorators import set_module

from pandas.core.dtypes.base import (
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.dtypes.common import (
is_object_dtype,
is_string_dtype,
)

from pandas.core.arrays import ExtensionArray

if TYPE_CHECKING:
from pandas._typing import type_t

import pyarrow as pa


@register_extension_dtype
@set_module("pandas")
class ListDtype(ExtensionDtype):
"""
An ExtensionDtype suitable for storing homogeneous lists of data.
"""

type = list
name: ClassVar[str] = "list"

@property
def na_value(self) -> libmissing.NAType:
return libmissing.NA

@property
def kind(self) -> str:
# TODO: our extension interface says this field should be the
# NumPy type character, but no such thing exists for list
# this assumes a PyArrow large list
return "+L"

@classmethod
def construct_array_type(cls) -> type_t[ListArray]:
"""
Return the array type associated with this dtype.

Returns
-------
type
"""
return ListArray


class ListArray(ExtensionArray):
    """
    Extension array of list values backed by a ``pyarrow.ChunkedArray``.

    The Arrow data is stored on the ``_pa_array`` attribute.
    """

    dtype = ListDtype()
    __array_priority__ = 1000

    def __init__(self, values: pa.Array | pa.ChunkedArray | list | ListArray) -> None:
        """
        Wrap ``values`` in a ListArray.

        Parameters
        ----------
        values : pyarrow.Array, pyarrow.ChunkedArray, list or ListArray
            A ChunkedArray is stored as-is, another ListArray shares its
            underlying data, and anything else is converted with ``pa.array``.

        Raises
        ------
        NotImplementedError
            If pyarrow is not installed.
        """
        if not HAS_PYARROW:
            raise NotImplementedError("ListArray requires pyarrow to be installed")

        if isinstance(values, type(self)):
            self._pa_array = values._pa_array
        elif not isinstance(values, pa.ChunkedArray):
            # To support NA, we need to create an Array first :-(
            arr = pa.array(values, from_pandas=True)
            # Pass an explicit one-element list of chunks, which is the
            # canonical input form for pa.chunked_array.
            self._pa_array = pa.chunked_array([arr])
        else:
            self._pa_array = values

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
        """
        Build a ListArray from a sequence of scalars.

        ``dtype`` and ``copy`` are accepted for interface compatibility but
        are not used.
        """
        if isinstance(scalars, ListArray):
            return cls(scalars)

        values = pa.array(scalars, from_pandas=True)
        if values.type == "null":
            # TODO(wayd): this is a hack to get the tests to pass. The overall
            # issue is that our extension types don't support parametrization,
            # while the pyarrow list type needs a value type; an all-null
            # input is therefore re-typed as list<null>.
            values = pa.array(values, type=pa.list_(pa.null()))

        return cls(values)

    def __getitem__(self, item):
        """
        Select from the array.

        A boolean NumPy mask or any other non-integer key returns a new
        ListArray; an integer key returns the pyarrow scalar at that position.
        """
        # PyArrow does not support NumPy's selection with an equal length
        # mask, so let's convert those to integral positions if needed
        if isinstance(item, np.ndarray) and item.dtype == bool:
            # flatnonzero always yields an integer array, even for an empty
            # mask (np.array(range(0)) would be float64).
            positions = np.flatnonzero(item)
            return type(self)(self._pa_array.take(positions))
        elif isinstance(item, (int, np.integer)):  # scalar case
            # NumPy integer scalars must take the scalar path as well;
            # otherwise the resulting pyarrow scalar would be wrapped as if
            # it were an array.
            return self._pa_array[int(item)]

        return type(self)(self._pa_array[item])

    def __len__(self) -> int:
        """Return the number of elements."""
        return len(self._pa_array)

    def isna(self):
        """Return a boolean NumPy array that is True where an element is null."""
        return np.array(self._pa_array.is_null())

    def take(self, indexer, allow_fill=False, fill_value=None):
        """
        Take elements by position.

        Parameters
        ----------
        indexer : sequence of int
            Positions to take.
        allow_fill : bool, default False
            If False, negative positions count from the end of the array.
            If True, ``-1`` marks a missing value and any other negative
            position raises.
        fill_value : object, optional
            Currently unused; positions marked with ``-1`` are always filled
            with null.

        Returns
        -------
        ListArray

        Raises
        ------
        ValueError
            If ``allow_fill`` is True and ``indexer`` contains a value below -1.
        """
        positions = np.asarray(indexer, dtype=np.intp)

        if allow_fill:
            if (positions < -1).any():
                raise ValueError(
                    "'indexer' contains values less than -1, which is not "
                    "allowed when allow_fill=True"
                )
            # A null index makes pyarrow emit a null in the result, which is
            # how the -1 positions become missing values.
            # TODO: a non-NA fill_value is not supported yet.
            pa_indices = pa.array(positions, mask=positions == -1)
        else:
            # PyArrow does not wrap negative positions, so translate them to
            # NumPy-style positions counted from the end.
            size = len(self._pa_array)
            pa_indices = np.where(positions < 0, positions + size, positions)

        return type(self)(self._pa_array.take(pa_indices))

    def copy(self):
        """
        Return a new ListArray over the same Arrow data.

        The Arrow data is immutable, so sharing it is sufficient and avoids
        building an index array (which would be null-typed for empty input).
        """
        return type(self)(self._pa_array)

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        Parameters
        ----------
        dtype : dtype
            Target dtype.
        copy : bool, default True
            Whether a copy may be made. Only a cast to this array's own dtype
            can be done without one.

        Raises
        ------
        TypeError
            If ``copy`` is False and the cast is not to a string dtype or to
            this array's own dtype.
        """
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # numpy has problems with astype(str) for nested elements
            # and pyarrow cannot cast from list[string] to string
            return np.array([str(x) for x in self._pa_array], dtype=dtype)

        if not copy:
            raise TypeError(f"astype from ListArray to {dtype} requires a copy")

        return np.array(self._pa_array.to_pylist(), dtype=dtype, copy=copy)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Concatenate several ListArrays into one.

        Parameters
        ----------
        to_concat : sequence of ListArray
            Arrays to join, in order. Must not be empty.
        """
        # Flatten every input's chunks into one ChunkedArray; handing the
        # constructor a Python list of ChunkedArrays would not concatenate.
        chunks = [
            chunk for arr in to_concat for chunk in arr._pa_array.iterchunks()
        ]
        # The explicit type keeps this working when there are no chunks.
        pa_type = to_concat[0]._pa_array.type
        return cls(pa.chunked_array(chunks, type=pa_type))
11 changes: 9 additions & 2 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,7 +576,10 @@ def convert_dtypes(
@final
@cache_readonly
def dtype(self) -> DtypeObj:
return self.values.dtype
try:
return self.values.dtype
except AttributeError: # PyArrow fallback
return self.values.type
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't make sense to me. self.values should be the EA, and the EA.dtype should be the right thing here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah OK thanks. I think this is a holdover from an intermediate state and I didn't recognize the requirement here. Reverting this fixes a lot of the other comments you've made here as well - thanks!


@final
def astype(
Expand Down Expand Up @@ -2234,12 +2237,16 @@ def new_block(
*,
ndim: int,
refs: BlockValuesRefs | None = None,
dtype: DtypeObj | None,
) -> Block:
# caller is responsible for ensuring:
# - values is NOT a NumpyExtensionArray
# - check_ndim/ensure_block_shape already checked
# - maybe_coerce_values already called/unnecessary
klass = get_block_type(values.dtype)
if dtype:
klass = get_block_type(dtype)
else:
klass = get_block_type(values.dtype)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above, values.dtype should already be the ListDtype. I don't see why passing dtype separately is necessary.

return klass(values, ndim=ndim, placement=placement, refs=refs)


Expand Down
8 changes: 6 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1976,14 +1976,18 @@ def from_blocks(

@classmethod
def from_array(
cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None
cls,
array: ArrayLike,
dtype: DtypeObj | None,
index: Index,
refs: BlockValuesRefs | None = None,
) -> SingleBlockManager:
"""
Constructor for if we have an array that is not yet a Block.
"""
array = maybe_coerce_values(array)
bp = BlockPlacement(slice(0, len(index)))
block = new_block(array, placement=bp, ndim=1, refs=refs)
block = new_block(array, placement=bp, ndim=1, refs=refs, dtype=dtype)
return cls(block, index)

def to_2d_mgr(self, columns: Index) -> BlockManager:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,7 +505,7 @@ def __init__(
data = data.copy()
else:
data = sanitize_array(data, index, dtype, copy)
data = SingleBlockManager.from_array(data, index, refs=refs)
data = SingleBlockManager.from_array(data, dtype, index, refs=refs)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If dtype is your ListDtype, then data.dtype should already be ListDtype at this point, so the new argument should be unnecessary.


NDFrame.__init__(self, data)
self.name = name
Expand Down
27 changes: 26 additions & 1 deletion pandas/io/formats/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1103,7 +1103,11 @@ def format_array(
List[str]
"""
fmt_klass: type[_GenericArrayFormatter]
if lib.is_np_dtype(values.dtype, "M"):
if hasattr(values, "type") and values.type == "null":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we do something more explicit than hasattr checks, e.g. isinstance(dtype, ListDtype) or similar?

fmt_klass = _NullFormatter
if hasattr(values, "type") and str(values.type).startswith("list"):
fmt_klass = _ListFormatter
elif lib.is_np_dtype(values.dtype, "M"):
fmt_klass = _Datetime64Formatter
values = cast(DatetimeArray, values)
elif isinstance(values.dtype, DatetimeTZDtype):
Expand Down Expand Up @@ -1467,6 +1471,27 @@ def _format_strings(self) -> list[str]:
return fmt_values


class _NullFormatter(_GenericArrayFormatter):
    """Array formatter that renders every element with plain ``str``."""

    def _format_strings(self) -> list[str]:
        """Return the ``str`` form of each element of ``self.values``."""
        return list(map(str, self.values))


class _ListFormatter(_GenericArrayFormatter):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Doesn't look like this is used?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep dead code - thanks!

def _format_strings(self) -> list[str]:
    """
    Return one display string per element of ``self.values``.

    Each element is converted with its ``as_py()`` method. A null element
    (``as_py()`` returns ``None``) is rendered as the empty string; any
    other value, including an empty list, is rendered with ``str``.
    """
    # TODO(wayd): This doesn't seem right - where should missing values
    # be handled
    fmt_values = []
    for x in self.values:
        pyval = x.as_py()
        # Compare against None explicitly: a truthiness test would render an
        # empty list the same as a missing value. str() keeps the result in
        # line with the declared list[str] return type.
        fmt_values.append("" if pyval is None else str(pyval))

    return fmt_values


class _Datetime64Formatter(_GenericArrayFormatter):
values: DatetimeArray

Expand Down
7 changes: 0 additions & 7 deletions pandas/tests/extension/list/__init__.py

This file was deleted.

Loading
Loading