-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement first-class List type #60629
base: main
Are you sure you want to change the base?
Changes from 4 commits
c55bc0a
66d8a1d
ef378f7
e25c0d4
21a69c9
5859e96
9edda32
b20572d
9d404e5
6e83ae0
fe6e3be
4a8ea29
8f71766
cf2fb6f
4cef00b
47c7af8
4a5da0c
305fee3
25087f7
5a2b113
cc345db
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
from __future__ import annotations | ||
|
||
from typing import ( | ||
TYPE_CHECKING, | ||
ClassVar, | ||
) | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import missing as libmissing | ||
from pandas.compat import HAS_PYARROW | ||
from pandas.util._decorators import set_module | ||
|
||
from pandas.core.dtypes.base import ( | ||
ExtensionDtype, | ||
register_extension_dtype, | ||
) | ||
from pandas.core.dtypes.common import ( | ||
is_object_dtype, | ||
is_string_dtype, | ||
) | ||
|
||
from pandas.core.arrays import ExtensionArray | ||
|
||
if TYPE_CHECKING: | ||
from pandas._typing import ( | ||
type_t, | ||
Shape, | ||
) | ||
|
||
import pyarrow as pa | ||
|
||
|
||
@register_extension_dtype | ||
@set_module("pandas") | ||
class ListDtype(ExtensionDtype): | ||
""" | ||
An ExtensionDtype suitable for storing homogeneous lists of data. | ||
""" | ||
|
||
type = list | ||
name: ClassVar[str] = "list" | ||
|
||
@property | ||
def na_value(self) -> libmissing.NAType: | ||
return libmissing.NA | ||
|
||
@property | ||
def kind(self) -> str: | ||
# TODO: our extension interface says this field should be the | ||
# NumPy type character, but no such thing exists for list | ||
# this assumes a PyArrow large list | ||
return "+L" | ||
|
||
@classmethod | ||
def construct_array_type(cls) -> type_t[ListArray]: | ||
""" | ||
Return the array type associated with this dtype. | ||
|
||
Returns | ||
------- | ||
type | ||
""" | ||
return ListArray | ||
|
||
|
||
class ListArray(ExtensionArray):
    """
    ExtensionArray backed by a PyArrow list array.

    Stores homogeneous lists of data; missing entries are Arrow nulls in
    the underlying ChunkedArray.

    Parameters
    ----------
    values : pa.Array, pa.ChunkedArray, list, or ListArray
        Data to wrap. Non-Arrow inputs are converted via
        ``pa.array(..., from_pandas=True)`` so None/NA become Arrow nulls.

    Raises
    ------
    NotImplementedError
        If pyarrow is not installed.
    """

    dtype = ListDtype()
    # Ensure ListArray wins over ndarray when mixed in binary ops.
    __array_priority__ = 1000

    def __init__(self, values: pa.Array | pa.ChunkedArray | list | ListArray) -> None:
        if not HAS_PYARROW:
            raise NotImplementedError("ListArray requires pyarrow to be installed")

        if isinstance(values, type(self)):
            self._pa_array = values._pa_array
        elif not isinstance(values, pa.ChunkedArray):
            # To support NA, we need to create an Array first :-(
            arr = pa.array(values, from_pandas=True)
            self._pa_array = pa.chunked_array(arr)
        else:
            self._pa_array = values

    @classmethod
    def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False):
        """
        Construct a ListArray from a sequence of scalars.

        Notes
        -----
        ``dtype`` and ``copy`` are accepted for interface compatibility but
        are not otherwise acted upon here.
        """
        if isinstance(scalars, ListArray):
            return cls(scalars)
        elif isinstance(scalars, pa.Scalar):
            scalars = [scalars]
            return cls(scalars)

        try:
            values = pa.array(scalars, from_pandas=True)
        except TypeError:
            # TypeError: object of type 'NoneType' has no len() if you have
            # pa.ListScalar(None). Upstream issue in Arrow - see:
            # https://github.com/apache/arrow/issues/40319
            #
            # BUG FIX: build a new list instead of mutating the caller's
            # ``scalars`` in place (the original assigned scalars[i] = None).
            scalars = [None if not scalar.is_valid else scalar for scalar in scalars]
            values = pa.array(scalars, from_pandas=True)
        if values.type == "null":
            # TODO(wayd): this is a hack to get the tests to pass, but the overall issue
            # is that our extension types don't support parametrization but the pyarrow
            values = pa.array(values, type=pa.list_(pa.null()))

        return cls(values)

    def __getitem__(self, item):
        # PyArrow does not support NumPy's selection with an equal length
        # boolean mask, so convert the mask to integer positions first.
        if isinstance(item, np.ndarray) and item.dtype == bool:
            # np.flatnonzero(mask) is the idiomatic equivalent of
            # np.array(range(len(mask)))[mask]
            return type(self)(self._pa_array.take(np.flatnonzero(item)))
        elif isinstance(item, int):  # scalar case
            return self._pa_array[item]

        return type(self)(self._pa_array[item])

    def __len__(self) -> int:
        return len(self._pa_array)

    def isna(self):
        # Arrow nulls are the missing-value marker for this array.
        return np.array(self._pa_array.is_null())

    def take(self, indexer, allow_fill=False, fill_value=None):
        # TODO: what do we need to do with allow_fill and fill_value here?
        return type(self)(self._pa_array.take(indexer))

    @classmethod
    def _empty(cls, shape: Shape, dtype: ExtensionDtype):
        """
        Create an ExtensionArray with the given shape and dtype.

        See also
        --------
        ExtensionDtype.empty
            ExtensionDtype.empty is the 'official' public version of this API.
        """
        # Implementer note: while ExtensionDtype.empty is the public way to
        # call this method, it is still required to implement this `_empty`
        # method as well (it is called internally in pandas)
        if isinstance(shape, tuple):
            if len(shape) > 1:
                raise ValueError("ListArray may only be 1-D")
            else:
                length = shape[0]
        else:
            length = shape
        return cls._from_sequence([None] * length, dtype=pa.list_(pa.null()))

    def copy(self):
        """Return a copy of the array backed by freshly-allocated buffers."""
        mm = pa.default_cpu_memory_manager()

        # TODO(wayd): ChunkedArray does not implement copy_to so this
        # ends up creating an Array
        copied = self._pa_array.combine_chunks().copy_to(mm.device)
        return type(self)(copied)

    def astype(self, dtype, copy=True):
        """
        Cast to ``dtype``.

        Same-dtype casts return ``self`` (or a copy when ``copy=True``);
        string dtypes are formatted element-wise; anything else goes
        through a Python-list round trip, which always copies.
        """
        if isinstance(dtype, type(self.dtype)) and dtype == self.dtype:
            if copy:
                return self.copy()
            return self
        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # numpy has problems with astype(str) for nested elements
            # and pyarrow cannot cast from list[string] to string
            return np.array([str(x) for x in self._pa_array], dtype=dtype)

        if not copy:
            raise TypeError(f"astype from ListArray to {dtype} requires a copy")

        return np.array(self._pa_array.to_pylist(), dtype=dtype, copy=copy)

    @classmethod
    def _concat_same_type(cls, to_concat):
        """
        Concatenate multiple ListArrays into a single ListArray.

        BUG FIX: the original passed a list of ChunkedArrays straight to the
        constructor, where ``pa.array`` cannot convert ChunkedArray elements;
        flatten to the underlying chunks and rebuild a ChunkedArray instead.
        """
        chunks = [chunk for x in to_concat for chunk in x._pa_array.chunks]
        if not chunks:
            # pa.chunked_array([]) requires an explicit type; fall back to
            # constructing from an empty sequence instead.
            return cls([])
        return cls(pa.chunked_array(chunks))
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -111,6 +111,7 @@ | |
StructAccessor, | ||
) | ||
from pandas.core.arrays.categorical import CategoricalAccessor | ||
from pandas.core.arrays.list_ import ListDtype | ||
from pandas.core.arrays.sparse import SparseAccessor | ||
from pandas.core.arrays.string_ import StringDtype | ||
from pandas.core.construction import ( | ||
|
@@ -494,7 +495,7 @@ def __init__( | |
if not is_list_like(data): | ||
data = [data] | ||
index = default_index(len(data)) | ||
elif is_list_like(data): | ||
elif is_list_like(data) and not isinstance(dtype, ListDtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about nested list? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea this is a tough one to handle. I'm not sure if something like: pd.Series([1, 2, 3], index=range(3), dtype=pd.ListDtype()) should raise or broadcast. I think the tests currently want it to broadcast, but we could override that expectation for this array There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good example. also indexing is going to be a pain point. a lot of checking for nested listlikes happens in indexing.py and im wary of stumbling blocks there. also indexing in general with a ListDtype index. i think the solution is to interpret lists as sequences and only treat them as a scalar if explicitly wrapped in pa.ListScalar (or something equivalent) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I originally started down that path but I think the problem with expecting There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think we have to just disallow that. Otherwise we have a long tail of places where we expect labels to be non-listlike and nested objects to mean 2D. e.g. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Do we normally require index elements to be hashable? I think there's an argument to be made that we shouldn't allow this as an index, given Python lists aren't hashable (even though PyArrow lists are). If we do want to allow this as an index, the example cited is a single column with label [1, 2, 3]
Definitely a valid concern. Do you know what might not be covered in this area by the extension tests? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We don't do any validation at construction-time for an object dtype Index. I expect a bunch of methods will break with non-hashable elements.
Wouldn't that break a lot e.g. The simple solution to all of these problems is to require users to be explicit when they want to treat a list as a scalar.
I can come up with a few examples if we go down this road, but that part of the code has a lot of paths so it won't be comprehensive. We'll be chasing down corner cases for a long time. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea that's a good point about the groupby stuff. I don't have a strong objection to requiring an explicit scalar argument, though I guess the follow up question is do we want users passing a pyarrow scalar or do we want to create our own scalar type. Not sure how other libraries like polars handle this, but @MarcoGorelli might have some insight |
||
com.require_length_match(data, index) | ||
|
||
# create/copy the manager | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1467,6 +1467,27 @@ def _format_strings(self) -> list[str]: | |
return fmt_values | ||
|
||
|
||
class _NullFormatter(_GenericArrayFormatter):
    """Formatter that renders every element through ``str()``."""

    def _format_strings(self) -> list[str]:
        return [str(value) for value in self.values]
|
||
|
||
class _ListFormatter(_GenericArrayFormatter):
    """
    Formatter for list-valued arrays of PyArrow list scalars.

    NOTE(review): flagged as unused (dead code) in review; keep only if it
    gets wired into the formatting dispatch, otherwise remove.
    """

    def _format_strings(self) -> list[str]:
        # TODO(wayd): This doesn't seem right - where should missing values
        # be handled
        fmt_values = []
        for x in self.values:
            pyval = x.as_py()
            # BUG FIX: the original used truthiness (`if pyval:`), which
            # also rendered empty lists ([]) as "". Only a null scalar
            # (as_py() -> None) should be treated as missing.
            if pyval is not None:
                # NOTE(review): appends the Python list itself despite the
                # list[str] annotation - confirm whether str(pyval) is wanted.
                fmt_values.append(pyval)
            else:
                fmt_values.append("")

        return fmt_values
|
||
|
||
class _Datetime64Formatter(_GenericArrayFormatter): | ||
values: DatetimeArray | ||
|
||
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think having is_list_like return False for a pyarrow scalar is less hacky?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's probably true in this particular case, although I'm not sure how it will generalize to all uses of is_list_like. Will do more research.