diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f0778c1e021..caf03562575 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -28,6 +28,8 @@ Performance - Small optimization to the netCDF4 and h5netcdf backends (:issue:`9058`, :pull:`9067`). By `Deepak Cherian `_. +- Small optimizations to help reduce indexing speed of datasets (:pull:`9002`). + By `Mark Harfouche `_. Breaking changes @@ -2906,7 +2908,7 @@ Bug fixes process (:issue:`4045`, :pull:`4684`). It also enables encoding and decoding standard calendar dates with time units of nanoseconds (:pull:`4400`). By `Spencer Clark `_ and `Mark Harfouche - `_. + `_. - :py:meth:`DataArray.astype`, :py:meth:`Dataset.astype` and :py:meth:`Variable.astype` support the ``order`` and ``subok`` parameters again. This fixes a regression introduced in version 0.16.1 (:issue:`4644`, :pull:`4683`). diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py index a005e1ebfe2..f25c0ecf936 100644 --- a/xarray/core/indexes.py +++ b/xarray/core/indexes.py @@ -575,13 +575,24 @@ class PandasIndex(Index): __slots__ = ("index", "dim", "coord_dtype") - def __init__(self, array: Any, dim: Hashable, coord_dtype: Any = None): - # make a shallow copy: cheap and because the index name may be updated - # here or in other constructors (cannot use pd.Index.rename as this - # constructor is also called from PandasMultiIndex) - index = safe_cast_to_index(array).copy() + def __init__( + self, + array: Any, + dim: Hashable, + coord_dtype: Any = None, + *, + fastpath: bool = False, + ): + if fastpath: + index = array + else: + index = safe_cast_to_index(array) if index.name is None: + # make a shallow copy: cheap and because the index name may be updated + # here or in other constructors (cannot use pd.Index.rename as this + # constructor is also called from PandasMultiIndex) + index = index.copy() index.name = dim self.index = index @@ -596,7 +607,7 @@ def _replace(self, index, dim=None, coord_dtype=None): dim = self.dim if coord_dtype is None: coord_dtype = self.coord_dtype - return type(self)(index, dim, coord_dtype) + return type(self)(index, dim, coord_dtype, fastpath=True) @classmethod def from_variables( @@ -642,6 +653,11 @@ def from_variables( obj = cls(data, dim, coord_dtype=var.dtype) assert not isinstance(obj.index, pd.MultiIndex) + # Rename safely + # make a shallow copy: cheap and because the index name may be updated + # here or in other constructors (cannot use pd.Index.rename as this + # constructor is also called from PandasMultiIndex) + obj.index = obj.index.copy() obj.index.name = name return obj @@ -1773,6 +1789,36 @@ def check_variables(): return not not_equal +def _apply_indexes_fast(indexes: Indexes[Index], args: Mapping[Any, Any], func: str): + # This function avoids the call to indexes.group_by_index + # which is really slow when repeatidly iterating through + # an array. However, it fails to return the correct ID for + # multi-index arrays + indexes_fast, coords = indexes._indexes, indexes._variables + + new_indexes: dict[Hashable, Index] = {k: v for k, v in indexes_fast.items()} + new_index_variables: dict[Hashable, Variable] = {} + for name, index in indexes_fast.items(): + coord = coords[name] + if hasattr(coord, "_indexes"): + index_vars = {n: coords[n] for n in coord._indexes} + else: + index_vars = {name: coord} + index_dims = {d for var in index_vars.values() for d in var.dims} + index_args = {k: v for k, v in args.items() if k in index_dims} + + if index_args: + new_index = getattr(index, func)(index_args) + if new_index is not None: + new_indexes.update({k: new_index for k in index_vars}) + new_index_vars = new_index.create_variables(index_vars) + new_index_variables.update(new_index_vars) + else: + for k in index_vars: + new_indexes.pop(k, None) + return new_indexes, new_index_variables + + def _apply_indexes( indexes: Indexes[Index], args: Mapping[Any, Any], @@ -1801,7 +1847,13 @@ def isel_indexes( indexes: Indexes[Index], indexers: Mapping[Any, Any], ) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]: - return _apply_indexes(indexes, indexers, "isel") + # TODO: remove if clause in the future. It should be unnecessary. + # See failure introduced when removed + # https://github.com/pydata/xarray/pull/9002#discussion_r1590443756 + if any(isinstance(v, PandasMultiIndex) for v in indexes._indexes.values()): + return _apply_indexes(indexes, indexers, "isel") + else: + return _apply_indexes_fast(indexes, indexers, "isel") def roll_indexes( diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 0926da6fd80..06e7efdbb48 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -296,15 +296,16 @@ def slice_slice(old_slice: slice, applied_slice: slice, size: int) -> slice: def _index_indexer_1d(old_indexer, applied_indexer, size: int): - assert isinstance(applied_indexer, integer_types + (slice, np.ndarray)) if isinstance(applied_indexer, slice) and applied_indexer == slice(None): # shortcut for the usual case return old_indexer if isinstance(old_indexer, slice): if isinstance(applied_indexer, slice): indexer = slice_slice(old_indexer, applied_indexer, size) + elif isinstance(applied_indexer, integer_types): + indexer = range(*old_indexer.indices(size))[applied_indexer] # type: ignore[assignment] else: - indexer = _expand_slice(old_indexer, size)[applied_indexer] # type: ignore[assignment] + indexer = _expand_slice(old_indexer, size)[applied_indexer] else: indexer = old_indexer[applied_indexer] return indexer @@ -591,7 +592,7 @@ def __getitem__(self, key: Any): class LazilyIndexedArray(ExplicitlyIndexedNDArrayMixin): """Wrap an array to make basic and outer indexing lazy.""" - __slots__ = ("array", "key") + __slots__ = ("array", "key", "_shape") def __init__(self, array: Any, key: ExplicitIndexer | None = None): """ @@ -614,6 +615,14 @@ def __init__(self, array: Any, key: ExplicitIndexer | None = None): self.array = as_indexable(array) self.key = key + shape: _Shape = () + for size, k in zip(self.array.shape, self.key.tuple): + if isinstance(k, slice): + shape += (len(range(*k.indices(size))),) + elif isinstance(k, np.ndarray): + shape += (k.size,) + self._shape = shape + def _updated_key(self, new_key: ExplicitIndexer) -> BasicIndexer | OuterIndexer: iter_new_key = iter(expanded_indexer(new_key.tuple, self.ndim)) full_key = [] @@ -630,13 +639,7 @@ def _updated_key(self, new_key: ExplicitIndexer) -> BasicIndexer | OuterIndexer: @property def shape(self) -> _Shape: - shape = [] - for size, k in zip(self.array.shape, self.key.tuple): - if isinstance(k, slice): - shape.append(len(range(*k.indices(size)))) - elif isinstance(k, np.ndarray): - shape.append(k.size) - return tuple(shape) + return self._shape def get_duck_array(self): if isinstance(self.array, ExplicitlyIndexedNDArrayMixin):