-
-
Notifications
You must be signed in to change notification settings - Fork 18.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement first-class List type #60629
base: main
Are you sure you want to change the base?
Changes from 1 commit
c55bc0a
66d8a1d
ef378f7
e25c0d4
21a69c9
5859e96
9edda32
b20572d
9d404e5
6e83ae0
fe6e3be
4a8ea29
8f71766
cf2fb6f
4cef00b
47c7af8
4a5da0c
305fee3
25087f7
5a2b113
cc345db
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
from __future__ import annotations | ||
|
||
from typing import ( | ||
TYPE_CHECKING, | ||
ClassVar, | ||
) | ||
|
||
import numpy as np | ||
|
||
from pandas._libs import missing as libmissing | ||
from pandas.compat import HAS_PYARROW | ||
from pandas.util._decorators import set_module | ||
|
||
from pandas.core.dtypes.base import ( | ||
ExtensionDtype, | ||
register_extension_dtype, | ||
) | ||
from pandas.core.dtypes.common import ( | ||
is_object_dtype, | ||
is_string_dtype, | ||
) | ||
|
||
from pandas.core.arrays import ExtensionArray | ||
|
||
if TYPE_CHECKING: | ||
from pandas._typing import type_t | ||
|
||
import pyarrow as pa | ||
|
||
|
||
@register_extension_dtype | ||
@set_module("pandas") | ||
class ListDtype(ExtensionDtype): | ||
""" | ||
An ExtensionDtype suitable for storing homogeneous lists of data. | ||
""" | ||
|
||
type = list | ||
name: ClassVar[str] = "list" | ||
|
||
@property | ||
def na_value(self) -> libmissing.NAType: | ||
return libmissing.NA | ||
|
||
@property | ||
def kind(self) -> str: | ||
# TODO: our extension interface says this field should be the | ||
# NumPy type character, but no such thing exists for list | ||
# this assumes a PyArrow large list | ||
return "+L" | ||
|
||
@classmethod | ||
def construct_array_type(cls) -> type_t[ListArray]: | ||
""" | ||
Return the array type associated with this dtype. | ||
|
||
Returns | ||
------- | ||
type | ||
""" | ||
return ListArray | ||
|
||
|
||
class ListArray(ExtensionArray): | ||
dtype = ListDtype() | ||
__array_priority__ = 1000 | ||
|
||
def __init__(self, values: pa.Array | pa.ChunkedArray | list | ListArray) -> None: | ||
if not HAS_PYARROW: | ||
raise NotImplementedError("ListArray requires pyarrow to be installed") | ||
|
||
if isinstance(values, type(self)): | ||
self._pa_array = values._pa_array | ||
elif not isinstance(values, pa.ChunkedArray): | ||
# To support NA, we need to create an Array first :-( | ||
arr = pa.array(values, from_pandas=True) | ||
self._pa_array = pa.chunked_array(arr) | ||
else: | ||
self._pa_array = values | ||
|
||
@classmethod | ||
def _from_sequence(cls, scalars, *, dtype=None, copy: bool = False): | ||
if isinstance(scalars, ListArray): | ||
return cls(scalars) | ||
|
||
values = pa.array(scalars, from_pandas=True) | ||
if values.type == "null": | ||
# TODO(wayd): this is a hack to get the tests to pass, but the overall issue | ||
# is that our extension types don't support parametrization but the pyarrow | ||
values = pa.array(values, type=pa.list_(pa.null())) | ||
|
||
return cls(values) | ||
|
||
def __getitem__(self, item): | ||
# PyArrow does not support NumPy's selection with an equal length | ||
# mask, so let's convert those to integral positions if needed | ||
if isinstance(item, np.ndarray) and item.dtype == bool: | ||
pos = np.array(range(len(item))) | ||
mask = pos[item] | ||
return type(self)(self._pa_array.take(mask)) | ||
elif isinstance(item, int): # scalar case | ||
return self._pa_array[item] | ||
|
||
return type(self)(self._pa_array[item]) | ||
|
||
def __len__(self) -> int: | ||
return len(self._pa_array) | ||
|
||
def isna(self): | ||
return np.array(self._pa_array.is_null()) | ||
|
||
def take(self, indexer, allow_fill=False, fill_value=None): | ||
# TODO: what do we need to do with allow_fill and fill_value here? | ||
return type(self)(self._pa_array.take(indexer)) | ||
|
||
def copy(self): | ||
return type(self)(self._pa_array.take(pa.array(range(len(self._pa_array))))) | ||
|
||
def astype(self, dtype, copy=True): | ||
if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: | ||
if copy: | ||
return self.copy() | ||
return self | ||
elif is_string_dtype(dtype) and not is_object_dtype(dtype): | ||
# numpy has problems with astype(str) for nested elements | ||
# and pyarrow cannot cast from list[string] to string | ||
return np.array([str(x) for x in self._pa_array], dtype=dtype) | ||
|
||
if not copy: | ||
raise TypeError(f"astype from ListArray to {dtype} requires a copy") | ||
|
||
return np.array(self._pa_array.to_pylist(), dtype=dtype, copy=copy) | ||
|
||
@classmethod | ||
def _concat_same_type(cls, to_concat): | ||
data = [x._pa_array for x in to_concat] | ||
return cls(data) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -576,7 +576,10 @@ def convert_dtypes( | |
@final | ||
@cache_readonly | ||
def dtype(self) -> DtypeObj: | ||
return self.values.dtype | ||
try: | ||
return self.values.dtype | ||
except AttributeError: # PyArrow fallback | ||
return self.values.type | ||
|
||
@final | ||
def astype( | ||
|
@@ -2234,12 +2237,16 @@ def new_block( | |
*, | ||
ndim: int, | ||
refs: BlockValuesRefs | None = None, | ||
dtype: DtypeObj | None, | ||
) -> Block: | ||
# caller is responsible for ensuring: | ||
# - values is NOT a NumpyExtensionArray | ||
# - check_ndim/ensure_block_shape already checked | ||
# - maybe_coerce_values already called/unnecessary | ||
klass = get_block_type(values.dtype) | ||
if dtype: | ||
klass = get_block_type(dtype) | ||
else: | ||
klass = get_block_type(values.dtype) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. as above, values.dtype should be the ListDtype already. I don't see why passing dtype separately is necessary. |
||
return klass(values, ndim=ndim, placement=placement, refs=refs) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -505,7 +505,7 @@ def __init__( | |
data = data.copy() | ||
else: | ||
data = sanitize_array(data, index, dtype, copy) | ||
data = SingleBlockManager.from_array(data, index, refs=refs) | ||
data = SingleBlockManager.from_array(data, dtype, index, refs=refs) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if |
||
|
||
NDFrame.__init__(self, data) | ||
self.name = name | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1103,7 +1103,11 @@ def format_array( | |
List[str] | ||
""" | ||
fmt_klass: type[_GenericArrayFormatter] | ||
if lib.is_np_dtype(values.dtype, "M"): | ||
if hasattr(values, "type") and values.type == "null": | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we do something more explicit than hasattr checks? i.e. isinstance(dtype, ListDtype) or whatever? |
||
fmt_klass = _NullFormatter | ||
if hasattr(values, "type") and str(values.type).startswith("list"): | ||
fmt_klass = _ListFormatter | ||
elif lib.is_np_dtype(values.dtype, "M"): | ||
fmt_klass = _Datetime64Formatter | ||
values = cast(DatetimeArray, values) | ||
elif isinstance(values.dtype, DatetimeTZDtype): | ||
|
@@ -1467,6 +1471,27 @@ def _format_strings(self) -> list[str]: | |
return fmt_values | ||
|
||
|
||
class _NullFormatter(_GenericArrayFormatter): | ||
def _format_strings(self) -> list[str]: | ||
fmt_values = [str(x) for x in self.values] | ||
return fmt_values | ||
|
||
|
||
class _ListFormatter(_GenericArrayFormatter): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesnt look like this is used? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep dead code - thanks! |
||
def _format_strings(self) -> list[str]: | ||
# TODO(wayd): This doesn't seem right - where should missing values | ||
# be handled | ||
fmt_values = [] | ||
for x in self.values: | ||
pyval = x.as_py() | ||
if pyval: | ||
fmt_values.append(pyval) | ||
else: | ||
fmt_values.append("") | ||
|
||
return fmt_values | ||
|
||
|
||
class _Datetime64Formatter(_GenericArrayFormatter): | ||
values: DatetimeArray | ||
|
||
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This doesn't make sense to me. self.values should be the EA, and the EA.dtype should be the right thing here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah OK thanks. I think this is a holdover from an intermediate state and I didn't recognize the requirement here. Reverting this fixes a lot of the other comments you've made here as well - thanks!