From 9f2c748b0bcf1a28e41acd0e5486794295924370 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 30 Oct 2024 16:56:44 +0000 Subject: [PATCH 01/27] test_Data passes --- cf/data/array/h5netcdfarray.py | 89 +- cf/data/array/mixin/cfamixin.py | 3 +- cf/data/array/mixin/compressedarraymixin.py | 3 +- cf/data/array/netcdf4array.py | 121 +- cf/data/array/umarray.py | 5 - cf/data/collapse/collapse_active.py | 1 - cf/data/collapse/dask_collapse.py | 46 +- cf/data/creation.py | 126 - cf/data/dask_utils.py | 227 +- cf/data/data.py | 4709 ++----------------- cf/data/utils.py | 509 +- cf/mixin/propertiesdata.py | 3 +- cf/mixin/propertiesdatabounds.py | 3 +- cf/mixin2/container.py | 21 + cf/read_write/netcdf/netcdfread.py | 660 ++- cf/read_write/netcdf/netcdfwrite.py | 11 +- cf/read_write/read.py | 313 +- cf/test/test_Data.py | 192 +- 18 files changed, 1163 insertions(+), 5879 deletions(-) diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index f4355ac4f0..a8f78f94d9 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -2,13 +2,18 @@ import cfdm from ...mixin_container import Container -from .locks import netcdf_lock -from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin + +# from .locks import netcdf_lock +from .mixin import ( # , IndexMixin + ActiveStorageMixin, + ArrayMixin, + FileArrayMixin, +) class H5netcdfArray( ActiveStorageMixin, - IndexMixin, + # IndexMixin, FileArrayMixin, ArrayMixin, Container, @@ -25,60 +30,58 @@ class H5netcdfArray( """ - def __dask_tokenize__(self): - """Return a value fully representative of the object. + # def __dask_tokenize__(self): + # """Return a value fully representative of the object. - .. versionadded:: NEXTVERSION + # .. versionadded:: NEXTVERSION - """ - return super().__dask_tokenize__() + (self.get_mask(),) + # """ + # return super().__dask_tokenize__() + (self.get_mask(),) - @property - def _lock(self): - """Set the lock for use in `dask.array.from_array`. + # @property + # def _lock(self): + # """Set the lock for use in `dask.array.from_array`. - Returns a lock object because concurrent reads are not - currently supported by the HDF5 library. The lock object will - be the same for all `NetCDF4Array` and `H5netcdfArray` - instances, regardless of the dataset they access, which means - that access to all netCDF and HDF files coordinates around the - same lock. + # Returns a lock object because concurrent reads are not + # currently supported by the HDF5 library. The lock object will + # be the same for all `NetCDF4Array` and `H5netcdfArray` + # instances, regardless of the dataset they access, which means + # that access to all netCDF and HDF files coordinates around the + # same lock. - .. versionadded:: NEXTVERSION + # .. versionadded:: NEXTVERSION - """ - return netcdf_lock + # """ + # return netcdf_lock - # REVIEW: h5: `_get_array`: Ignore this for h5 review - # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array. - def _get_array(self, index=None): - """Returns a subspace of the dataset variable. + # def _get_array(self, index=None): + # """Returns a subspace of the dataset variable. - .. versionadded:: NEXTVERSION + # .. versionadded:: NEXTVERSION - .. seealso:: `__array__`, `index` + # .. seealso:: `__array__`, `index` - :Parameters: + # :Parameters: - {{index: `tuple` or `None`, optional}} + # {{index: `tuple` or `None`, optional}} - :Returns: + # :Returns: - `numpy.ndarray` - The subspace. + # `numpy.ndarray` + # The subspace. 
- """ - if index is None: - index = self.index() + # """ + # if index is None: + # index = self.index() - # We need to lock because the netCDF file is about to be accessed. - self._lock.acquire() + # # We need to lock because the netCDF file is about to be accessed. + # self._lock.acquire() - # It's cfdm.H5netcdfArray.__getitem__ that we want to - # call here, but we use 'Container' in super because - # that comes immediately before cfdm.H5netcdfArray in - # the method resolution order. - array = super(Container, self).__getitem__(index) + # # It's cfdm.H5netcdfArray.__getitem__ that we want to + # # call here, but we use 'Container' in super because + # # that comes immediately before cfdm.H5netcdfArray in + # # the method resolution order. + # array = super(Container, self).__getitem__(index) - self._lock.release() - return array + # self._lock.release() + # return array diff --git a/cf/data/array/mixin/cfamixin.py b/cf/data/array/mixin/cfamixin.py index 43fc23cf85..8af0840465 100644 --- a/cf/data/array/mixin/cfamixin.py +++ b/cf/data/array/mixin/cfamixin.py @@ -3,8 +3,7 @@ from itertools import accumulate, product import numpy as np - -from ...utils import chunk_locations, chunk_positions +from cfdm.data.utils import chunk_locations, chunk_positions class CFAMixin: diff --git a/cf/data/array/mixin/compressedarraymixin.py b/cf/data/array/mixin/compressedarraymixin.py index 3e74f2ffaf..8a1d5dfbe1 100644 --- a/cf/data/array/mixin/compressedarraymixin.py +++ b/cf/data/array/mixin/compressedarraymixin.py @@ -76,12 +76,11 @@ def to_dask_array(self, chunks="auto"): from functools import partial import dask.array as da + from cfdm.data.utils import normalize_chunks from dask import config from dask.array.core import getter from dask.base import tokenize - from ...utils import normalize_chunks - name = (f"{self.__class__.__name__}-{tokenize(self)}",) dtype = self.dtype diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index ece5f3d3c4..6a5eb80b71 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -2,13 +2,14 @@ import cfdm from ...mixin_container import Container -from .locks import netcdf_lock + +# from .locks import netcdf_lock from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin class NetCDF4Array( ActiveStorageMixin, - IndexMixin, + # IndexMixin, FileArrayMixin, ArrayMixin, Container, @@ -23,61 +24,61 @@ class NetCDF4Array( """ - def __dask_tokenize__(self): - """Return a value fully representative of the object. - - .. versionadded:: 3.15.0 - - """ - return super().__dask_tokenize__() + (self.get_mask(),) - - @property - def _lock(self): - """Set the lock for use in `dask.array.from_array`. - - Returns a lock object because concurrent reads are not - currently supported by the netCDF and HDF libraries. The lock - object will be the same for all `NetCDF4Array` and - `H5netcdfArray` instances, regardless of the dataset they - access, which means that access to all netCDF and HDF files - coordinates around the same lock. - - .. versionadded:: 3.14.0 - - """ - return netcdf_lock - - # REVIEW: getitem: `_get_array`: Ignore this for h5 review - # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array - def _get_array(self, index=None): - """Returns a subspace of the dataset variable. - - .. versionadded:: NEXTVERSION - - .. seealso:: `__array__`, `index` - - :Parameters: - - {{index: `tuple` or `None`, optional}} - - :Returns: - - `numpy.ndarray` - The subspace. 
- - """ - if index is None: - index = self.index() - - # Note: We need to lock because the netCDF file is about to be - # accessed. - self._lock.acquire() - - # Note: It's cfdm.NetCDFArray.__getitem__ that we want to call - # here, but we use 'Container' in super because that - # comes immediately before cfdm.NetCDFArray in the - # method resolution order. - array = super(Container, self).__getitem__(index) - - self._lock.release() - return array + # def __dask_tokenize__(self): + # """Return a value fully representative of the object. + # + # .. versionadded:: 3.15.0 + # + # """ + # return super().__dask_tokenize__() + (self.get_mask(),) + + +# +# @property +# def _lock(self): +# """Set the lock for use in `dask.array.from_array`. +# +# Returns a lock object because concurrent reads are not +# currently supported by the netCDF and HDF libraries. The lock +# object will be the same for all `NetCDF4Array` and +# `H5netcdfArray` instances, regardless of the dataset they +# access, which means that access to all netCDF and HDF files +# coordinates around the same lock. +# +# .. versionadded:: 3.14.0 +# +# """ +# return netcdf_lock +# +# def _get_array(self, index=None): +# """Returns a subspace of the dataset variable. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `__array__`, `index` +# +# :Parameters: +# +# {{index: `tuple` or `None`, optional}} +# +# :Returns: +# +# `numpy.ndarray` +# The subspace. +# +# """ +# if index is None: +# index = self.index() +# +# # Note: We need to lock because the netCDF file is about to be +# # accessed. +# self._lock.acquire() +# +# # Note: It's cfdm.NetCDFArray.__getitem__ that we want to call +# # here, but we use 'Container' in super because that +# # comes immediately before cfdm.NetCDFArray in the +# # method resolution order. +# array = super(Container, self).__getitem__(index) +# +# self._lock.release() +# return array diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index bc30acb692..510b9c97ee 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -12,7 +12,6 @@ class UMArray( ): """A sub-array stored in a PP or UM fields file.""" - # REVIEW: h5: `__init__`: replace units/calendar API with attributes def __init__( self, filename=None, @@ -171,7 +170,6 @@ def __init__( # By default, close the UM file after data array access self._set_component("close", True, copy=False) - # REVIEW: getitem: `_get_array`: new method to convert subspace to numpy array def _get_array(self, index=None): """Returns a subspace of the dataset variable. @@ -272,7 +270,6 @@ def _get_rec(self, f, header_offset): # if r.hdr_offset == header_offset: # return r - # REVIEW: getitem: `_set_FillValue`: record _FillValue in attributes def _set_FillValue(self, int_hdr, real_hdr, attributes): """Set the ``_FillValue`` attribute. @@ -367,10 +364,8 @@ def _set_units(self, int_hdr, attributes): units = units0 break - # REVIEW: getitem: `_set_units`: record units in attributes attributes["units"] = units - # REVIEW: getitem: `_set_unpack`: record unpack in attributes def _set_unpack(self, int_hdr, real_hdr, attributes): """Set the ``add_offset`` and ``scale_factor`` attributes. 
diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 97dc955d38..db8fc277f6 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -1,4 +1,3 @@ -# REVIEW: active: `collapse_active.py`: new module for active storage functionality import datetime import logging import time diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index b58f9daf00..51b0cd1d0a 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -1,4 +1,3 @@ -# REVIEW: active: `dask_collapse.py`: all unlabelled changes in this module are general tidying, and should be reviewed at the same time as active storage """Reduction functions intended to be passed to be dask. Most of these functions are expected to be passed to @@ -11,13 +10,13 @@ from operator import mul import numpy as np +from cfdm.data.dask_utils import cfdm_asanyarray from dask.array import chunk from dask.array.core import _concatenate2 from dask.array.reductions import divide, numel from dask.core import flatten from dask.utils import deepmap -from ..dask_utils import cf_asanyarray from .collapse_active import actify from .collapse_utils import double_precision_dtype @@ -231,7 +230,6 @@ def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): # -------------------------------------------------------------------- # mean # -------------------------------------------------------------------- -# REVIEW: active: `cf_mean_chunk`: active storage decoration @actify("mean") def cf_mean_chunk( x, @@ -278,9 +276,9 @@ def cf_mean_chunk( if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) if weights is not None: - weights = cf_asanyarray(weights) + weights = cfdm_asanyarray(weights) # N, sum d = cf_sum_chunk(x, weights=weights, dtype=dtype, **kwargs) @@ -378,7 +376,6 @@ def cf_mean_agg( # -------------------------------------------------------------------- # maximum # -------------------------------------------------------------------- -# REVIEW: active: `cf_max_chunk`: active storage decoration @actify("max") def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the maximum. @@ -404,7 +401,7 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) return { "max": chunk.max(x, **kwargs), @@ -533,7 +530,6 @@ def cf_mid_range_agg( # -------------------------------------------------------------------- # minimum # -------------------------------------------------------------------- -# REVIEW: active: `cf_min_chunk`: active storage decoration @actify("min") def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the minimum. @@ -559,7 +555,7 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) return { "min": chunk.min(x, **kwargs), @@ -640,7 +636,6 @@ def cf_min_agg( # -------------------------------------------------------------------- # range # -------------------------------------------------------------------- -# REVIEW: active: `cf_range_chunk`: active storage decoration @actify("range") def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the range. 
@@ -667,7 +662,7 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) # N, max d = cf_max_chunk(x, **kwargs) @@ -754,7 +749,6 @@ def cf_range_agg( # -------------------------------------------------------------------- # root mean square # -------------------------------------------------------------------- -# REVIEW: active: `cf_rms_chunk`: active storage decoration @actify("rms") def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Chunk calculations for the root mean square (RMS). @@ -785,7 +779,7 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) return cf_mean_chunk( np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs @@ -839,7 +833,6 @@ def cf_rms_agg( # -------------------------------------------------------------------- # sample size # -------------------------------------------------------------------- -# REVIEW: active: `cf_sample_size_chunk`: active storage decoration @actify("sample_size") def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): """Chunk calculations for the sample size. @@ -864,7 +857,7 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) if np.ma.isMA(x): N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) @@ -953,7 +946,6 @@ def cf_sample_size_agg( # -------------------------------------------------------------------- # sum # -------------------------------------------------------------------- -# REVIEW: active: `cf_sum_chunk`: active storage decoration @actify("sum") def cf_sum_chunk( x, @@ -993,10 +985,10 @@ def cf_sum_chunk( if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) if weights is not None: - weights = cf_asanyarray(weights) + weights = cfdm_asanyarray(weights) if check_weights: w_min = weights.min() if w_min <= 0: @@ -1089,7 +1081,6 @@ def cf_sum_agg( # -------------------------------------------------------------------- # sum of weights # -------------------------------------------------------------------- -# REVIEW: active: `cf_sum_of_weights_chunk`: active storage decoration @actify("sum_of_weights") def cf_sum_of_weights_chunk( x, weights=None, dtype="f8", computing_meta=False, **kwargs @@ -1116,9 +1107,9 @@ def cf_sum_of_weights_chunk( if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) if weights is not None: - weights = cf_asanyarray(weights) + weights = cfdm_asanyarray(weights) # N d = cf_sample_size_chunk(x, **kwargs) @@ -1133,7 +1124,6 @@ def cf_sum_of_weights_chunk( # -------------------------------------------------------------------- # sum of squares of weights # -------------------------------------------------------------------- -# REVIEW: active: `cf_sum_of_weights2_chunk`: active storage decoration @actify("sum_of_weights2") def cf_sum_of_weights2_chunk( x, weights=None, dtype="f8", computing_meta=False, **kwargs @@ -1162,9 +1152,9 @@ def cf_sum_of_weights2_chunk( if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) if weights is not None: - weights = cf_asanyarray(weights) + weights = cfdm_asanyarray(weights) # N d = cf_sample_size_chunk(x, **kwargs) @@ -1179,7 +1169,6 @@ def cf_sum_of_weights2_chunk( # -------------------------------------------------------------------- # unique # 
-------------------------------------------------------------------- -# REVIEW: active: `cf_unique_chunk`: active storage decoration @actify("unique") def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the unique values. @@ -1204,7 +1193,7 @@ def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) return {"unique": np.unique(x)} @@ -1244,7 +1233,6 @@ def cf_unique_agg(pairs, axis=None, computing_meta=False, **kwargs): # -------------------------------------------------------------------- # variance # -------------------------------------------------------------------- -# REVIEW: active: `cf_var_chunk`: active storage decoration @actify("var") def cf_var_chunk( x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs @@ -1310,11 +1298,11 @@ def cf_var_chunk( if computing_meta: return x - x = cf_asanyarray(x) + x = cfdm_asanyarray(x) weighted = weights is not None if weighted: - weights = cf_asanyarray(weights) + weights = cfdm_asanyarray(weights) # N, V1, sum d = cf_mean_chunk(x, weights=weights, dtype=dtype, **kwargs) diff --git a/cf/data/creation.py b/cf/data/creation.py index 0d4067a373..e69de29bb2 100644 --- a/cf/data/creation.py +++ b/cf/data/creation.py @@ -1,126 +0,0 @@ -"""Functions used during the creation of `Data` objects.""" - -from functools import lru_cache - -import dask.array as da -import numpy as np -from dask.base import is_dask_collection - - -def to_dask(array, chunks, **from_array_options): - """Create a `dask` array. - - .. versionadded:: 3.14.0 - - :Parameters: - - array: array_like - The array to be converted to a `dask` array. Examples of - valid types include anything with a `to_dask_array` - method, `numpy` arrays, `dask` arrays, `xarray` arrays, - `cf.Array` subclasses, `list`, `tuple`, scalars. - - chunks: `int`, `tuple`, `dict` or `str`, optional - Specify the chunking of the returned dask array. Any - value accepted by the *chunks* parameter of the - `dask.array.from_array` function is allowed. - - Might be ignored if *array* is a `dask` array that already - defines its own chunks. - - Might get automatically modified if *array* is a - compressed `Array` subclass. - - from_array_options: `dict`, optional - Keyword arguments to be passed to `dask.array.from_array`. - - If *from_array_options* has no ``'meta'`` key then the - `meta` keyword is set to the `_meta` attribute of *array* - or, if there is no such attribute, `None`. - - :Returns: - - `dask.array.Array` - The `dask` array representation of the array. 
- - **Examples** - - >>> cf.data.creation.to_dask([1, 2, 3], 'auto') - dask.array - >>> cf.data.creation.to_dask([1, 2, 3], chunks=2) - dask.array - >>> cf.data.creation.to_dask([1, 2, 3], chunks=2, {'asarray': True}) - dask.array - >>> cf.data.creation.to_dask(cf.dt(2000, 1, 1), 'auto') - dask.array - >>> cf.data.creation.to_dask([cf.dt(2000, 1, 1)], 'auto') - dask.array - - """ - if is_dask_collection(array): - return array - - # REVIEW: getitem: `to_dask`: set '_asanyarray' - if hasattr(array, "to_dask_array"): - try: - return array.to_dask_array(chunks=chunks) - except TypeError: - try: - return array.to_dask_array(_asanyarray=False) - except TypeError: - return array.to_dask_array() - - if type(array).__module__.split(".")[0] == "xarray": - data = getattr(array, "data", None) - if data is not None: - return da.asanyarray(data) - - if not isinstance( - array, (np.ndarray, list, tuple, memoryview) + np.ScalarType - ) and not hasattr(array, "shape"): - # 'array' is not of a type that `da.from_array` can cope with, - # so convert it to a numpy array. - array = np.asanyarray(array) - - kwargs = from_array_options - # REVIEW: active: `to_dask`: '_dask_meta' renamed to '_meta' for consistency with Dask - # REVIEW: getitem: `to_dask`: The file lock is now on the `Array` object (in its `_get_array` method), rather than being set on the Dask array itself. - kwargs.setdefault("meta", getattr(array, "_meta", None)) - - try: - return da.from_array(array, chunks=chunks, **kwargs) - except NotImplementedError: - # Try again with 'chunks=-1', in case the failure was due to - # not being able to use auto rechunking with object dtype. - return da.from_array(array, chunks=-1, **kwargs) - - -@lru_cache(maxsize=32) -def generate_axis_identifiers(n): - """Return new axis identifiers for a given number of axes. - - The names are arbitrary and have no semantic meaning. - - .. versionadded:: 3.14.0 - - :Parameters: - - n: `int` - Generate this number of axis identifiers. - - :Returns: - - `list` - The new axis identifiers. - - **Examples** - - >>> cf.data.creation.generate_axis_identifiers(0) - [] - >>> cf.data.creation.generate_axis_identifiers(1) - ['dim0'] - >>> cf.data.creation.generate_axis_identifiers(3) - ['dim0', 'dim1', 'dim2'] - - """ - return [f"dim{i}" for i in range(n)] diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index c50e16d85f..a958702883 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -9,6 +9,7 @@ import dask.array as da import numpy as np +from cfdm.data.dask_utils import cfdm_asanyarray from dask.core import flatten from scipy.ndimage import convolve1d @@ -127,9 +128,8 @@ def cf_contains(a, value): value. """ - # REVIEW: getitem: `cf_contains`: convert a to a usable array - a = cf_asanyarray(a) - value = cf_asanyarray(value) + a = cfdm_asanyarray(a) + value = cfdm_asanyarray(value) return np.array(value in a).reshape((1,) * a.ndim) @@ -163,8 +163,7 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): Convolved float array with same shape as input. """ - # REVIEW: getitem: `cf_convolve1d`: convert a to a usable array - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) # Cast to float to ensure that NaNs can be stored if a.dtype != float: @@ -187,39 +186,6 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): return c -def cf_harden_mask(a): - """Harden the mask of a masked `numpy` array. - - Has no effect if the array is not a masked array. - - .. versionadded:: 3.14.0 - - .. 
seealso:: `cf.Data.harden_mask` - - :Parameters: - - a: `numpy.ndarray` - The array to have a hardened mask. - - :Returns: - - `numpy.ndarray` - The array with hardened mask. - - """ - # REVIEW: getitem: `cf_harden_mask`: convert a to a usable array - a = cf_asanyarray(a) - if np.ma.isMA(a): - try: - a.harden_mask() - except AttributeError: - # Trap cases when the input array is not a numpy array - # (e.g. it might be numpy.ma.masked). - pass - - return a - - def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """Compute percentiles of the data along the specified axes. @@ -279,8 +245,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """ from math import prod - # REVIEW: getitem: `cf_percentile`: convert a to a usable array - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) if np.ma.isMA(a) and not np.ma.is_masked(a): # Masked array with no masked elements @@ -355,144 +320,6 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): return p -def cf_soften_mask(a): - """Soften the mask of a masked `numpy` array. - - Has no effect if the array is not a masked array. - - .. versionadded:: 3.14.0 - - .. seealso:: `cf.Data.soften_mask` - - :Parameters: - - a: `numpy.ndarray` - The array to have a softened mask. - - :Returns: - - `numpy.ndarray` - The array with softened mask. - - """ - # REVIEW: getitem: `cf_soften_mask`: convert a to a usable array - a = cf_asanyarray(a) - - if np.ma.isMA(a): - try: - a.soften_mask() - except AttributeError: - # Trap cases when the input array is not a numpy array - # (e.g. it might be numpy.ma.masked). - pass - - return a - - -def cf_where(array, condition, x, y, hardmask): - """Set elements of *array* from *x* or *y* depending on *condition*. - - The input *array* is not changed in-place. - - See `where` for details on the expected functionality. - - .. note:: This function correctly sets the mask hardness of the - output array. - - .. versionadded:: 3.14.0 - - .. seealso:: `cf.Data.where` - - :Parameters: - - array: numpy.ndarray - The array to be assigned to. - - condition: numpy.ndarray - Where False or masked, assign from *y*, otherwise assign - from *x*. - - x: numpy.ndarray or `None` - *x* and *y* must not both be `None`. - - y: numpy.ndarray or `None` - *x* and *y* must not both be `None`. - - hardmask: `bool` - Set the mask hardness for a returned masked array. If True - then a returned masked array will have a hardened mask, and - the mask of the input *array* (if there is one) will be - applied to the returned array, in addition to any masked - elements arising from assignments from *x* or *y*. - - :Returns: - - `numpy.ndarray` - A copy of the input *array* with elements from *y* where - *condition* is False or masked, and elements from *x* - elsewhere. 
- - """ - # REVIEW: getitem: `cf_where`: convert array, condition, x, y to usable arrays - array = cf_asanyarray(array) - condition = cf_asanyarray(condition) - if x is not None: - x = cf_asanyarray(x) - - if y is not None: - y = cf_asanyarray(y) - - mask = None - - if np.ma.isMA(array): - # Do a masked where - where = np.ma.where - if hardmask: - mask = array.mask - elif np.ma.isMA(x) or np.ma.isMA(y): - # Do a masked where - where = np.ma.where - else: - # Do a non-masked where - where = np.where - hardmask = False - - condition_is_masked = np.ma.isMA(condition) - if condition_is_masked: - condition = condition.astype(bool) - - if x is not None: - # Assign values from x - if condition_is_masked: - # Replace masked elements of condition with False, so that - # masked locations are assigned from array - c = condition.filled(False) - else: - c = condition - - array = where(c, x, array) - - if y is not None: - # Assign values from y - if condition_is_masked: - # Replace masked elements of condition with True, so that - # masked locations are assigned from array - c = condition.filled(True) - else: - c = condition - - array = where(c, array, y) - - if hardmask: - if mask is not None and mask.any(): - # Apply the mask from the input array to the result - array.mask |= mask - - array.harden_mask() - - return array - - def _getattr(x, attr): return getattr(x, attr, False) @@ -537,7 +364,7 @@ def cf_YMDhms(a, attr): array([1, 2]) """ - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) return _array_getattr(a, attr=attr) @@ -570,8 +397,7 @@ def cf_rt2dt(a, units): cftime.DatetimeGregorian(2000, 1, 2, 0, 0, 0, 0, has_year_zero=False)] """ - # REVIEW: getitem: `cf_rt2dt`: convert a to a usable array - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) if not units.iscalendartime: return rt2dt(a, units_in=units) @@ -626,8 +452,7 @@ def cf_dt2rt(a, units): [365 366] """ - # REVIEW: getitem: `cf_dt2rt`: convert a to a usable array - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) return dt2rt(a, units_out=units, units_in=None) @@ -668,8 +493,7 @@ def cf_units(a, from_units, to_units): [1000. 2000.] """ - # REVIEW: getitem: `cf_units`: convert a to a usable array - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) return Units.conform( a, from_units=from_units, to_units=to_units, inplace=False ) @@ -693,8 +517,7 @@ def cf_is_masked(a): values. """ - # REVIEW: getitem: `cf_is_masked`: convert a to a usable array - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) out = np.ma.is_masked(a) return np.array(out).reshape((1,) * a.ndim) @@ -727,33 +550,5 @@ def cf_filled(a, fill_value=None): [[-999 2 3]] """ - # REVIEW: getitem: `cf_filled`: convert a to a usable array - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) return np.ma.filled(a, fill_value=fill_value) - - -# REVIEW: getitem: `cf_asanyarray`: convert a to a usable array -def cf_asanyarray(a): - """Convert to a `numpy` array. - - Only do this if the input *a* has an `__asanyarray__` attribute - with value True. - - .. versionadded:: NEXTVERSION - - :Parameters: - - a: array_like - The array. - - :Returns: - - The array converted to a `numpy` array, or the input array - unchanged if ``a.__asanyarray__`` False. 
- - """ - # REVIEW: getitem: `cf_asanyarray`: convert a to a usable array - if getattr(a, "__asanyarray__", False): - return np.asanyarray(a) - - return a diff --git a/cf/data/data.py b/cf/data/data.py index d310e37aec..bb13633d39 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -11,16 +11,15 @@ import cftime import dask.array as da import numpy as np -from cfdm import is_log_level_info +from cfdm.data.dask_utils import cfdm_where +from cfdm.data.utils import new_axis_identifier from dask import compute, delayed # noqa: F401 from dask.array.core import normalize_chunks -from dask.base import collections_to_dsk, is_dask_collection, tokenize +from dask.base import is_dask_collection, tokenize from dask.highlevelgraph import HighLevelGraph -from dask.optimization import cull -from scipy.sparse import issparse from ..cfdatetime import dt as cf_dt -from ..constants import masked as cf_masked +from ..constants import masked from ..decorators import ( _deprecated_kwarg_check, _display_or_return, @@ -33,44 +32,23 @@ _numpy_allclose, _section, abspath, - atol, default_netCDF_fillvals, free_memory, parse_indices, - rtol, ) from ..mixin2 import CFANetCDF, Container from ..units import Units from .collapse import Collapse -from .creation import generate_axis_identifiers, to_dask - -# REVIEW: getitem: `data.py`: import cf_asanyarray, cf_filled, cf_is_masked from .dask_utils import ( - _da_ma_allclose, - cf_asanyarray, cf_contains, cf_dt2rt, - cf_filled, - cf_harden_mask, cf_is_masked, cf_percentile, cf_rt2dt, - cf_soften_mask, cf_units, - cf_where, ) from .mixin import DataClassDeprecationsMixin -from .utils import ( - YMDhms, - collapse, - conform_units, - convert_to_datetime, - convert_to_reftime, - first_non_missing_value, - is_numeric_dtype, - new_axis_identifier, - scalar_masked_array, -) +from .utils import YMDhms, collapse, conform_units, scalar_masked_array logger = logging.getLogger(__name__) @@ -93,8 +71,6 @@ _dtype_float = np.dtype(float) _dtype_bool = np.dtype(bool) -_DEFAULT_CHUNKS = "auto" -_DEFAULT_HARDMASK = True # Contstants used to specify which `Data` components should be cleared # when a new dask array is set. See `Data._clear_after_dask_update` @@ -170,14 +146,20 @@ class Data(DataClassDeprecationsMixin, CFANetCDF, Container, cfdm.Data): """ + def __new__(cls, *args, **kwargs): + """Store component classes.""" + instance = super().__new__(cls) + instance._Units_class = Units + return instance + def __init__( self, array=None, units=None, calendar=None, fill_value=None, - hardmask=_DEFAULT_HARDMASK, - chunks=_DEFAULT_CHUNKS, + hardmask=True, + chunks="auto", dt=False, source=None, copy=True, @@ -194,7 +176,7 @@ def __init__( array: optional The array of values. May be a scalar or array-like - object, including another `Data` instance, anything + object, including another `{{class}}` instance, anything with a `!to_dask_array` method, `numpy` array, `dask` array, `xarray` array, `cf.Array` subclass, `list`, `tuple`, scalar. @@ -265,7 +247,7 @@ def __init__( Apply this mask to the data given by the *array* parameter. By default, or if *mask* is `None`, no mask is applied. May be any scalar or array-like object - (such as a `list`, `numpy` array or `Data` instance) + (such as a `list`, `numpy` array or `{{class}}` instance) that is broadcastable to the shape of *array*. Masking will be carried out where the mask elements evaluate to `True`. @@ -273,30 +255,28 @@ def __init__( This mask will applied in addition to any mask already defined by the *array* parameter. - .. 
versionadded:: 3.0.5 - mask_value: scalar array_like Mask *array* where it is equal to *mask_value*, using numerically tolerant floating point equality. - .. versionadded:: 3.16.0 - - {{init source: optional}} + .. versionadded:: (cfdm) 1.11.0.0 hardmask: `bool`, optional - If False then the mask is soft. By default the mask is - hard. + If True (the default) then the mask is hard. If False + then the mask is soft. dt: `bool`, optional If True then strings (such as ``'1990-12-01 12:00'``) given by the *array* parameter are re-interpreted as date-time objects. By default they are not. + {{init source: optional}} + {{init copy: `bool`, optional}} {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - .. versionadded:: 3.14.0 + .. versionadded:: (cfdm) NEXTVERSION to_memory: `bool`, optional If True then ensure that the original data are in @@ -310,7 +290,7 @@ def __init__( data are computed. In general, setting *to_memory* to True is not the same - as calling the `persist` of the newly created `Data` + as calling the `persist` of the newly created `{{class}}` object, which also decompresses data compressed by convention and computes any data type, mask and date-time modifications. @@ -318,7 +298,7 @@ def __init__( If the input *array* is a `dask.array.Array` object then *to_memory* is ignored. - .. versionadded:: 3.14.0 + .. versionadded:: (cfdm) NEXTVERSION init_options: `dict`, optional Provide optional keyword arguments to methods and @@ -337,7 +317,7 @@ def __init__( * ``'first_non_missing_value'``: Provide keyword arguments to the - `cf.data.utils.first_non_missing_value` + `cfdm.data.utils.first_non_missing_value` function. This is used when the input array contains date-time strings or objects, and may affect performance. @@ -345,218 +325,44 @@ def __init__( *Parameter example:* ``{'from_array': {'inline_array': True}}`` - chunk: deprecated at version 3.14.0 - Use the *chunks* parameter instead. - **Examples** - >>> d = cf.Data(5) - >>> d = cf.Data([1,2,3], units='K') + >>> d = {{package}}.{{class}}(5) + >>> d = {{package}}.{{class}}([1,2,3], units='K') >>> import numpy - >>> d = cf.Data(numpy.arange(10).reshape(2,5), - ... units=Units('m/s'), fill_value=-999) - >>> d = cf.Data('fly') - >>> d = cf.Data(tuple('fly')) + >>> d = {{package}}.{{class}}(numpy.arange(10).reshape(2,5), + ... 
units='m/s', fill_value=-999) + >>> d = {{package}}.{{class}}('fly') + >>> d = {{package}}.{{class}}(tuple('fly')) """ - if source is None and isinstance(array, self.__class__): - source = array - - if init_options is None: - init_options = {} - - if source is not None: - try: - array = source._get_Array(None) - except AttributeError: - array = None - - super().__init__( - source=source, _use_array=_use_array and array is not None - ) - if _use_array: - # REVIEW: getitem: `__init__`: set 'asanyarray' - try: - array = source.to_dask_array(_asanyarray=False) - except (AttributeError, TypeError): - try: - array = source.to_dask_array() - except (AttributeError, TypeError): - pass - else: - self._set_dask(array, copy=copy, clear=_NONE) - else: - self._set_dask( - array, copy=copy, clear=_NONE, asanyarray=None - ) - else: - self._del_dask(None, clear=_NONE) - - # Set the mask hardness - self.hardmask = getattr(source, "hardmask", _DEFAULT_HARDMASK) - - return - super().__init__( array=array, + units=units, + calendar=calendar, fill_value=fill_value, - _use_array=False, + hardmask=hardmask, + chunks=chunks, + dt=dt, + source=source, + copy=copy, + dtype=dtype, + mask=mask, + mask_value=mask_value, + to_memory=to_memory, + init_options=init_options, + _use_array=_use_array, ) - # Set the units - units = Units(units, calendar=calendar) - self._Units = units - - # Set the mask hardness - self.hardmask = hardmask - - if array is None: - # No data has been set - return - - sparse_array = issparse(array) - - try: - ndim = array.ndim - except AttributeError: - ndim = np.ndim(array) - - # Create the _cyclic attribute: identifies which axes are - # cyclic (and therefore allow cyclic slicing). It must be a - # subset of the axes given by the _axes attribute. If an axis - # is removed from _axes then it must also be removed from - # _cyclic. - # - # Never change the value of the _cyclic attribute in-place. - self._cyclic = _empty_set - - # Create the _axes attribute: an ordered sequence of unique - # (within this `Data` instance) names for each array axis. - self._axes = generate_axis_identifiers(ndim) - - if not _use_array: - return - - # Still here? Then create a dask array and store it. - custom = self._custom - - # Find out if the input data is compressed by convention - try: - compressed = array.get_compression_type() - except AttributeError: - compressed = "" - - if compressed and init_options.get("from_array"): - raise ValueError( - "Can't define 'from_array' initialisation options " - "for compressed input arrays" - ) - - # Bring the compressed data into memory without - # decompressing it - if to_memory: + if source is not None: try: - array = array.to_memory() + deterministic = source.has_deterministic_name() except AttributeError: - pass - - if self._is_abstract_Array_subclass(array): - # Save the input array in case it's useful later. For - # compressed input arrays this will contain extra - # information, such as a count or index variable. - self._set_Array(array) - - # Cast the input data as a dask array - kwargs = init_options.get("from_array", {}) - if "chunks" in kwargs: - raise TypeError( - "Can't define 'chunks' in the 'from_array' initialisation " - "options. Use the 'chunks' parameter instead." - ) - - # Set whether or not we're sure that the Data instance has a - # deterministic name - is_dask = is_dask_collection(array) - custom["deterministic"] = not is_dask - - # REVIEW: getitem: `__init__`: Set whether or not to call `np.asanyarray` on chunks to convert them to numpy arrays. 
- # Set whether or not to call `np.asanyarray` on chunks to - # convert them to numpy arrays. - if is_dask: - # We don't know what's in the dask array, so we should - # assume that it might need converting to a numpy array. - custom["__asanyarray__"] = True + deterministic = False else: - # Use the array's __asanyarray__ value, if it has one. - custom["__asanyarray__"] = bool( - getattr(array, "__asanyarray__", False) - ) - - dx = to_dask(array, chunks, **kwargs) - - # Find out if we have an array of date-time objects - if units.isreftime: - dt = True + deterministic = not is_dask_collection(array) - first_value = None - if not dt and dx.dtype.kind == "O": - kwargs = init_options.get("first_non_missing_value", {}) - first_value = first_non_missing_value(dx, **kwargs) - - if first_value is not None: - dt = hasattr(first_value, "timetuple") - - # Convert string or object date-times to floating point - # reference times - if dt and dx.dtype.kind in "USO": - dx, units = convert_to_reftime(dx, units, first_value) - # Reset the units - self._Units = units - - # REVIEW: getitem: `__init__`: set 'asanyarray' - # Store the dask array - self._set_dask(dx, clear=_NONE, asanyarray=None) - - # Override the data type - if dtype is not None: - self.dtype = dtype - - # Apply a mask - if mask is not None: - if sparse_array: - raise ValueError("Can't mask sparse array") - - self.where(mask, cf_masked, inplace=True) - - # Apply masked values - if mask_value is not None: - if sparse_array: - raise ValueError("Can't mask sparse array") - - self.masked_values(mask_value, inplace=True) - - @property - def dask_compressed_array(self): - """Returns a dask array of the compressed data. - - .. versionadded:: 3.14.0 - - :Returns: - - `dask.array.Array` - The compressed data. - - **Examples** - - >>> a = d.dask_compressed_array - - """ - ca = self.source(None) - - if ca is None or not ca.get_compression_type(): - raise ValueError("not compressed: can't get compressed dask array") - - return ca.to_dask_array() + self._custom["has_deterministic_name"] = deterministic def __contains__(self, value): """Membership test operator ``in`` @@ -636,7 +442,6 @@ def __contains__(self, value): f"not {value!r}" ) - # REVIEW: getitem: `cf_contains`: set 'asanyarray' # If value is a scalar Data object then conform its units if isinstance(value, self.__class__): self_units = self.Units @@ -650,11 +455,11 @@ def __contains__(self, value): # are incompatible return False - # 'cf_contains' has its own calls to 'cf_asanyarray', so + # 'cf_contains' has its own calls to 'cfdm_asanyarray', so # we can set '_asanyarray=False'. value = value.to_dask_array(_asanyarray=False) - # 'cf_contains' has its own calls to 'cf_asanyarray', so we + # 'cf_contains' has its own calls to 'cfdm_asanyarray', so we # can set '_asanyarray=False'. dx = self.to_dask_array(_asanyarray=False) @@ -674,157 +479,10 @@ def __contains__(self, value): return bool(dx.any()) - @property - def _atol(self): - """Return the current value of the `cf.atol` function.""" - return atol().value - - @property - def _rtol(self): - """Return the current value of the `cf.rtol` function.""" - return rtol().value - def __data__(self): """Returns a new reference to self.""" return self - def __float__(self): - """Called to implement the built-in function `float` - - x.__float__() <==> float(x) - - **Performance** - - `__float__` causes all delayed operations to be executed, - unless the dask array size is already known to be greater than - 1. 
- - """ - return float(self.to_dask_array()) - - def __int__(self): - """Called to implement the built-in function `int` - - x.__int__() <==> int(x) - - **Performance** - - `__int__` causes all delayed operations to be executed, unless - the dask array size is already known to be greater than 1. - - """ - return int(self.to_dask_array()) - - def __iter__(self): - """Called when an iterator is required. - - x.__iter__() <==> iter(x) - - **Performance** - - If the shape of the data is unknown then it is calculated - immediately by executing all delayed operations. - - **Examples** - - >>> d = cf.Data([1, 2, 3], 'metres') - >>> for e in d: - ... print(repr(e)) - ... - - - - - >>> d = cf.Data([[1, 2], [3, 4]], 'metres') - >>> for e in d: - ... print(repr(e)) - ... - - - - >>> d = cf.Data(99, 'metres') - >>> for e in d: - ... print(repr(e)) - ... - Traceback (most recent call last): - ... - TypeError: iteration over a 0-d Data - - """ - try: - n = len(self) - except TypeError: - raise TypeError(f"iteration over a 0-d {self.__class__.__name__}") - - if self.__keepdims_indexing__: - for i in range(n): - out = self[i] - out.reshape(out.shape[1:], inplace=True) - yield out - else: - for i in range(n): - yield self[i] - - def __len__(self): - """Called to implement the built-in function `len`. - - x.__len__() <==> len(x) - - **Performance** - - If the shape of the data is unknown then it is calculated - immediately by executing all delayed operations. - - **Examples** - - >>> len(cf.Data([1, 2, 3])) - 3 - >>> len(cf.Data([[1, 2, 3]])) - 1 - >>> len(cf.Data([[1, 2, 3], [4, 5, 6]])) - 2 - >>> len(cf.Data(1)) - Traceback (most recent call last): - ... - TypeError: len() of unsized object - - """ - # REVIEW: getitem: `__len__`: set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - if math.isnan(dx.size): - logger.debug("Computing data len: Performance may be degraded") - dx.compute_chunk_sizes() - - return len(dx) - - def __bool__(self): - """Truth value testing and the built-in operation `bool` - - x.__bool__() <==> bool(x) - - **Performance** - - `__bool__` causes all delayed operations to be computed. - - **Examples** - - >>> bool(cf.Data(1.5)) - True - >>> bool(cf.Data([[False]])) - False - - """ - size = self.size - if size != 1: - raise ValueError( - f"The truth value of a {self.__class__.__name__} with {size} " - "elements is ambiguous. Use d.any() or d.all()" - ) - - return bool(self.to_dask_array()) - def __getitem__(self, indices): """Return a subspace of the data defined by indices. @@ -886,20 +544,17 @@ def __getitem__(self, indices): indices = indices[2:] shape = self.shape + axes = self._axes + cyclic_axes = self._cyclic keepdims = self.__keepdims_indexing__ indices, roll = parse_indices( shape, indices, cyclic=True, keepdims=keepdims ) - - axes = self._axes - cyclic_axes = self._cyclic - - # ------------------------------------------------------------ - # Roll axes with cyclic slices - # ------------------------------------------------------------ - # REVIEW: getitem: `__getitem__`: set 'asanyarray' + indices = tuple(indices) if roll: + # Roll axes with cyclic slices. + # # For example, if slice(-2, 3) has been requested on a # cyclic axis, then we roll that axis by two points and # apply the slice(0, 5) instead. 
@@ -908,94 +563,21 @@ def __getitem__(self, indices): "Can't take a cyclic slice of a non-cyclic axis" ) - new = self.roll( - axis=tuple(roll.keys()), shift=tuple(roll.values()) - ) - dx = new.to_dask_array(_asanyarray=False) - else: - new = self.copy() - dx = self.to_dask_array(_asanyarray=False) - - # ------------------------------------------------------------ - # Subspace the dask array - # ------------------------------------------------------------ - if self.__orthogonal_indexing__: - # Apply 'orthogonal indexing': indices that are 1-d arrays - # or lists subspace along each dimension - # independently. This behaviour is similar to Fortran, but - # different to dask. - axes_with_list_indices = [ - i - for i, x in enumerate(indices) - if isinstance(x, list) or getattr(x, "shape", False) - ] - n_axes_with_list_indices = len(axes_with_list_indices) - - if n_axes_with_list_indices < 2: - # At most one axis has a list/1-d array index so do a - # normal dask subspace - dx = dx[tuple(indices)] - else: - # At least two axes have list/1-d array indices so we - # can't do a normal dask subspace - - # Subspace axes which have list/1-d array indices - for axis in axes_with_list_indices: - dx = da.take(dx, indices[axis], axis=axis) - - if n_axes_with_list_indices < len(indices): - # Subspace axes which don't have list/1-d array - # indices. (Do this after subspacing axes which do - # have list/1-d array indices, in case - # __keepdims_indexing__ is False.) - slice_indices = [ - slice(None) if i in axes_with_list_indices else x - for i, x in enumerate(indices) - ] - dx = dx[tuple(slice_indices)] + d = self.roll(axis=tuple(roll.keys()), shift=tuple(roll.values())) else: - raise NotImplementedError( - "Non-orthogonal indexing has not yet been implemented" - ) - - # REVIEW: getitem: `__getitem__`: set 'asanyarray=True' because subspaced chunks might not be in memory - # ------------------------------------------------------------ - # Set the subspaced dask array - # - # * A subspaced chunk might not result in an array in memory, - # so we set asanyarray=True to ensure that, if required, - # they are converted at compute time. 
- # ------------------------------------------------------------ - new._set_dask(dx, asanyarray=True) + d = self - # ------------------------------------------------------------ - # Get the axis identifiers for the subspace - # ------------------------------------------------------------ - shape0 = shape - if keepdims: - new_axes = axes - else: - new_axes = [ - axis - for axis, x in zip(axes, indices) - if not isinstance(x, Integral) and getattr(x, "shape", True) - ] - if new_axes != axes: - new._axes = new_axes - cyclic_axes = new._cyclic - if cyclic_axes: - shape0 = [ - n for n, axis in zip(shape, axes) if axis in new_axes - ] + new = super(Data, d).__getitem__(indices) - # ------------------------------------------------------------ - # Cyclic axes that have been reduced in size are no longer - # considered to be cyclic - # ------------------------------------------------------------ if cyclic_axes: + # Cyclic axes that have been reduced in size are no longer + # considered to be cyclics + shape0 = [ + n for n, axis in zip(shape, self._axes) if axis in new._axes + ] x = [ axis - for axis, n0, n1 in zip(new_axes, shape0, new.shape) + for axis, n0, n1 in zip(new._axes, shape0, new.shape) if axis in cyclic_axes and n0 != n1 ] if x: @@ -1003,15 +585,10 @@ def __getitem__(self, indices): # in-place new._cyclic = cyclic_axes.difference(x) - # ------------------------------------------------------------ - # Apply ancillary masks - # ------------------------------------------------------------ - for mask in ancillary_mask: - new.where(mask, cf_masked, None, inplace=True) - - if new.shape != self.shape: - # Delete hdf5 chunksizes when the shape has changed. - new.nc_clear_hdf5_chunksizes() + if ancillary_mask: + # Apply ancillary masks + for mask in ancillary_mask: + new.where(mask, masked, None, inplace=True) return new @@ -1061,8 +638,6 @@ def __setitem__(self, indices, value): `hardmask`, `where` """ - shape = self.shape - ancillary_mask = () try: arg = indices[0] @@ -1077,64 +652,15 @@ def __setitem__(self, indices, value): indices = indices[2:] indices, roll = parse_indices( - shape, + self.shape, indices, cyclic=True, keepdims=self.__keepdims_indexing__, ) - axes_with_list_indices = [ - i - for i, x in enumerate(indices) - if isinstance(x, list) or getattr(x, "shape", False) - ] - - # When there are two or more 1-d array indices of Booleans or - # integers, convert them to slices, if possible. - # - # Note: If any of these 1-d arrays is a dask collection, then - # this will be computed. - if len(axes_with_list_indices) > 1: - for i, index in enumerate(indices): - if not ( - isinstance(index, list) or getattr(index, "shape", False) - ): - # Not a 1-d array - continue - - index = np.array(index) - - size = shape[i] - if index.dtype == bool: - # Convert True values to integers - index = np.arange(size)[index] - else: - # Make sure all integer values are non-negative - index = np.where(index < 0, index + size, index) - - if size == 1: - start = index[0] - index = slice(start, start + 1) - else: - steps = index[1:] - index[:-1] - step = steps[0] - if step and not (steps - step).any(): - # Array has a regular step, and so can be - # converted to a slice. 
- if step > 0: - start, stop = index[0], index[-1] + 1 - elif step < 0: - start, stop = index[0], index[-1] - 1 - - if stop < 0: - stop = None - - index = slice(start, stop, step) - - indices[i] = index - - # Roll axes with cyclic slices if roll: + # Roll axes with cyclic slices + # # For example, if assigning to slice(-2, 3) has been # requested on a cyclic axis (and we're not using numpy # indexing), then we roll that axis by two points and @@ -1153,25 +679,20 @@ def __setitem__(self, indices, value): # Make sure that the units of value are the same as self value = conform_units(value, self.Units) - # Missing values could be affected, so make sure that the mask - # hardness has been applied. - dx = self.to_dask_array(apply_mask_hardness=True) - # Do the assignment - self._set_subspace(dx, indices, value) - self._set_dask(dx) + indices = tuple(indices) + super().__setitem__(indices, value) - # Unroll any axes that were rolled to enable a cyclic - # assignment if roll: + # Unroll any axes that were rolled to enable a cyclic + # assignment shifts = [-shift for shift in shifts] self.roll(shift=shifts, axis=roll_axes, inplace=True) - # Reset the original array values at locations that are - # excluded from the assignment by True values in any ancillary - # masks if ancillary_mask: - indices = tuple(indices) + # Reset the original array values at locations that are + # excluded from the assignment by True values in any + # ancillary masks original_self = original_self[indices] reset = self[indices] for mask in ancillary_mask: @@ -1181,455 +702,57 @@ def __setitem__(self, indices, value): return - # REVIEW: getitem: `__asanyarray__`: new property `__asanyarray__` - @property - def __asanyarray__(self): - """Whether or not chunks need conversion to `numpy` arrays. + def _cfa_del_write(self): + """Set the CFA write status of the data to `False`. - .. versionadded:: NEXTVERSION + .. versionadded:: 3.15.0 - ..seealso:: `to_dask_array`, `todict`, `_set_dask` + .. seealso:: `cfa_get_write`, `_cfa_set_write` :Returns: `bool` + The CFA status prior to deletion. """ - return self._custom.get("__asanyarray__", True) + return self._custom.pop("cfa_write", False) - @property - def __orthogonal_indexing__(self): - """Flag to indicate that orthogonal indexing is supported. + def _cfa_set_term(self, value): + """Set the CFA aggregation instruction term status. - Always True, indicating that 'orthogonal indexing' is - applied. This means that when indices are 1-d arrays or lists - then they subspace along each dimension independently. This - behaviour is similar to Fortran, but different to `numpy`. + .. versionadded:: 3.15.0 - .. versionadded:: 3.14.0 + .. seealso:: `cfa_get_term`, `cfa_set_term` - .. seealso:: `__keepdims_indexing__`, `__getitem__`, - `__setitem__`, - `netCDF4.Variable.__orthogonal_indexing__` + :Parameters: - **Examples** + status: `bool` + The new CFA aggregation instruction term status. - >>> d = cf.Data([[1, 2, 3], - ... [4, 5, 6]]) - >>> e = d[[0], [0, 2]] - >>> e.shape - (1, 2) - >>> print(e.array) - [[1 3]] - >>> e = d[[0, 1], [0, 2]] - >>> e.shape - (2, 2) - >>> print(e.array) - [[1 3] - [4 6]] + :Returns: + + `None` """ - return True + if not value: + self._custom.pop("cfa_term", None) - @property - def __keepdims_indexing__(self): - """Flag to indicate whether dimensions indexed with integers are - kept. 
+ self._custom["cfa_term"] = bool(value) - If set to True (the default) then providing a single integer - as a single-axis index does *not* reduce the number of array - dimensions by 1. This behaviour is different to `numpy`. + def _is_abstract_Array_subclass(self, array): + """Whether or not an array is a type of Array. - If set to False then providing a single integer as a - single-axis index reduces the number of array dimensions by - 1. This behaviour is the same as `numpy`. + :Parameters: - .. versionadded:: 3.14.0 + array: - .. seealso:: `__orthogonal_indexing__`, `__getitem__`, - `__setitem__` - - **Examples** - - >>> d = cf.Data([[1, 2, 3], - ... [4, 5, 6]]) - >>> d.__keepdims_indexing__ - True - >>> e = d[0] - >>> e.shape - (1, 3) - >>> print(e.array) - [[1 2 3]] - - >>> d.__keepdims_indexing__ - True - >>> e = d[:, 1] - >>> e.shape - (2, 1) - >>> print(e.array) - [[2] - [5]] - - >>> d.__keepdims_indexing__ - True - >>> e = d[0, 1] - >>> e.shape - (1, 1) - >>> print(e.array) - [[2]] - - >>> d.__keepdims_indexing__ = False - >>> e = d[0] - >>> e.shape - (3,) - >>> print(e.array) - [1 2 3] - - >>> d.__keepdims_indexing__ - False - >>> e = d[:, 1] - >>> e.shape - (2,) - >>> print(e.array) - [2 5] - - >>> d.__keepdims_indexing__ - False - >>> e = d[0, 1] - >>> e.shape - () - >>> print(e.array) - 2 - - """ - return self._custom.get("__keepdims_indexing__", True) - - @__keepdims_indexing__.setter - def __keepdims_indexing__(self, value): - self._custom["__keepdims_indexing__"] = bool(value) - - def _cfa_del_write(self): - """Set the CFA write status of the data to `False`. - - .. versionadded:: 3.15.0 - - .. seealso:: `cfa_get_write`, `_cfa_set_write` - - :Returns: - - `bool` - The CFA status prior to deletion. - - """ - return self._custom.pop("cfa_write", False) - - def _cfa_set_term(self, value): - """Set the CFA aggregation instruction term status. - - .. versionadded:: 3.15.0 - - .. seealso:: `cfa_get_term`, `cfa_set_term` - - :Parameters: - - status: `bool` - The new CFA aggregation instruction term status. - - :Returns: - - `None` - - """ - if not value: - self._custom.pop("cfa_term", None) - - self._custom["cfa_term"] = bool(value) - - def _clear_after_dask_update(self, clear=_ALL): - """Remove components invalidated by updating the `dask` array. - - Removes or modifies components that can't be guaranteed to be - consistent with an updated `dask` array. See the *clear* - parameter for details. - - .. versionadded:: 3.14.0 - - .. seealso:: `_del_Array`, `_del_cached_elements`, - `_cfa_del_write`, `_set_dask` - - :Parameters: - - clear: `int`, optional - Specify which components should be removed. Which - components are removed is determined by sequentially - combining *clear* with the ``_ARRAY``, ``_CACHE`` and - ``_CFA`` integer-valued contants, using the bitwise - AND operator: - - * If ``clear & _ARRAY`` is non-zero then a source - array is deleted. - - * If ``clear & _CACHE`` is non-zero then cached - element values are deleted. - - * If ``clear & _CFA`` is non-zero then the CFA write - status is set to `False`. - - By default *clear* is the ``_ALL`` integer-valued - constant, which results in all components being - removed. - - If *clear* is the ``_NONE`` integer-valued constant - then no components are removed. - - To retain a component and remove all others, use - ``_ALL`` with the bitwise OR operator. For instance, - if *clear* is ``_ALL ^ _CACHE`` then the cached - element values will be kept but all other components - will be removed. - - .. 
versionadded:: 3.15.0 - - :Returns: - - `None` - - """ - if not clear: - return - - if clear & _ARRAY: - # Delete a source array - self._del_Array(None) - - if clear & _CACHE: - # Delete cached element values - self._del_cached_elements() - - if clear & _CFA: - # Set the CFA write status to False - self._cfa_del_write() - - # REVIEW: getitem: `_set_dask`: new keyword 'asanyarray' - def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): - """Set the dask array. - - .. versionadded:: 3.14.0 - - .. seealso:: `to_dask_array`, `_clear_after_dask_update`, - `_del_dask` - - :Parameters: - - dx: `dask.array.Array` - The array to be inserted. - - copy: `bool`, optional - If True then copy *array* before setting it. By - default it is not copied. - - clear: `int`, optional - Specify which components should be removed. By default - *clear* is the ``_ALL`` integer-valued constant, which - results in all components being removed. See - `_clear_after_dask_update` for details. - - asanyarray: `None` or `bool`, optional - If `None` then do nothing. Otherwise set the - `__asanyarray__` attribute to *asanyarray*. - - .. versionadded:: NEXTVERSION - - :Returns: - - `None` - - """ - if dx is NotImplemented: - logger.warning( - "WARNING: NotImplemented has been set in the place of a " - "dask array." - "\n\n" - "This could occur if any sort of exception is raised " - "by a function that is run on chunks (via, for " - "instance, da.map_blocks or " - "dask.array.core.elemwise). Such a function could get " - "run at definition time in order to ascertain " - "suitability (such as data type casting, " - "broadcasting, etc.). Note that the exception may be " - "difficult to diagnose, as dask will have silently " - "trapped it and returned NotImplemented (see, for " - "instance, dask.array.core.elemwise). Print " - "statements in a local copy of dask are possibly the " - "way to go if the cause of the error is not obvious." - ) - - if copy: - dx = dx.copy() - - custom = self._custom - custom["dask"] = dx - # REVIEW: getitem: `_set_dask`: set '__asanyarray__' - if asanyarray is not None: - custom["__asanyarray__"] = bool(asanyarray) - - self._clear_after_dask_update(clear) - - def _del_dask(self, default=ValueError(), clear=_ALL): - """Remove the dask array. - - .. versionadded:: 3.14.0 - - .. seealso:: `to_dask_array`, `_clear_after_dask_update`, - `_set_dask` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - dask array axes has not been set. If set to an - `Exception` instance then it will be raised instead. - - clear: `int`, optional - Specify which components should be removed. By default - *clear* is the ``_ALL`` integer-valued constant, which - results in all components being removed. See - `_clear_after_dask_update` for details. If there is - no dask array then no components are removed, - regardless of the value of *clear*. - - :Returns: - - `dask.array.Array` - The removed dask array. - - **Examples** - - >>> d = cf.Data([1, 2, 3]) - >>> dx = d._del_dask() - >>> d._del_dask("No dask array") - 'No dask array' - >>> d._del_dask() - Traceback (most recent call last): - ... - ValueError: 'Data' has no dask array - >>> d._del_dask(RuntimeError('No dask array')) - Traceback (most recent call last): - ... 
- RuntimeError: No dask array - - """ - try: - out = self._custom.pop("dask") - except KeyError: - return self._default( - default, f"{self.__class__.__name__!r} has no dask array" - ) - - self._clear_after_dask_update(clear) - return out - - def _del_cached_elements(self): - """Delete any cached element values. - - Updates *data* in-place to remove the cached element values. - - .. versionadded:: 3.14.0 - - .. seealso:: `_get_cached_elements`, `_set_cached_elements` - - :Returns: - - `None` - - """ - self._custom.pop("cached_elements", None) - - def _get_cached_elements(self): - """Return the cache of selected element values. - - .. versionadded:: 3.14.1 - - .. seealso:: `_del_cached_elements`, `_set_cached_elements` - - :Returns: - - `dict` - The cached element values, where the keys are the element - positions within the dask array and the values are the cached - values for each position. - - **Examples** - - >>> d._get_cached_elements() - {} - - >>> d._get_cached_elements() - {0: 273.15, 1: 274.56, -1: 269.95} - - """ - cache = self._custom.get("cached_elements") - if not cache: - return {} - - return cache.copy() - - def _is_abstract_Array_subclass(self, array): - """Whether or not an array is a type of Array. - - :Parameters: - - array: - - :Returns: + :Returns: `bool` """ return isinstance(array, cfdm.Array) - def _set_cached_elements(self, elements): - """Cache selected element values. - - Updates *data* in-place to store the given element values - within its ``custom`` dictionary. - - .. warning:: Never change ``_custom['cached_elements']`` - in-place. - - .. versionadded:: 3.14.0 - - .. seealso:: `_del_cached_elements`, `_get_cached_elements` - - :Parameters: - - elements: `dict` - Zero or more element values to be cached, each keyed by - a unique identifier to allow unambiguous retrieval. - Existing cached elements not specified by *elements* - will not be removed. - - :Returns: - - `None` - - **Examples** - - >>> d._set_cached_elements({0: 273.15}) - - """ - if not elements: - return - - cache = self._custom.get("cached_elements") - if cache: - cache = cache.copy() - cache.update(elements) - else: - cache = elements.copy() - - self._custom["cached_elements"] = cache - def _cfa_set_write(self, status): """Set the CFA write status of the data. @@ -1677,17 +800,17 @@ def _update_deterministic(self, other): """ if other is False: - self._custom["deterministic"] = False + self._custom["has_deterministic_name"] = False return if other is True: return custom = self._custom - deterministic = custom["deterministic"] + deterministic = custom["has_deterministic_name"] if deterministic: - custom["deterministic"] = ( - deterministic and other._custom["deterministic"] + custom["has_deterministic_name"] = ( + deterministic and other._custom["has_deterministic_name"] ) @_inplace_enabled(default=False) @@ -1782,7 +905,7 @@ def diff(self, axis=-1, n=1, inplace=False): # whenever that issue is resolved. 
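        # Illustration (assumed attribute behaviour) of the units
        # override that follows: for reference time units the "since"
        # part is dropped, e.g.
        #
        #     >>> cf.Units('days since 2000-01-01')._units_since_reftime
        #     'days'
        #
        # so that the differences carry plain time-interval units.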
units = self.Units if units.isreftime: - units = Units(units._units_since_reftime) + units = d._Units_class(units._units_since_reftime) d.override_units(units, inplace=True) return d @@ -2240,7 +1363,7 @@ def mean_of_upper_decile( # masked at those locations less_than_p90.filled(True, inplace=True) - d.where(less_than_p90, cf_masked, inplace=True) + d.where(less_than_p90, masked, inplace=True) # Find the mean of elements greater than (or equal to) the # 90th percentile @@ -2255,109 +1378,6 @@ def mean_of_upper_decile( return d - @_inplace_enabled(default=False) - def pad_missing(self, axis, pad_width=None, to_size=None, inplace=False): - """Pad an axis with missing data. - - :Parameters: - - axis: `int` - Select the axis for which the padding is to be - applied. - - *Parameter example:* - Pad second axis: ``axis=1``. - - *Parameter example:* - Pad the last axis: ``axis=-1``. - - {{pad_width: sequence of `int`, optional}} - - {{to_size: `int`, optional}} - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The padded data, or `None` if the operation was - in-place. - - **Examples** - - >>> d = cf.Data(np.arange(6).reshape(2, 3)) - >>> print(d.array) - [[0 1 2] - [3 4 5]] - >>> e = d.pad_missing(1, (1, 2)) - >>> print(e.array) - [[-- 0 1 2 -- --] - [-- 3 4 5 -- --]] - >>> f = e.pad_missing(0, (0, 1)) - >>> print(f.array) - [[-- 0 1 2 -- --] - [-- 3 4 5 -- --] - [-- -- -- -- -- --]] - - >>> g = d.pad_missing(1, to_size=5) - >>> print(g.array) - [[0 1 2 -- --] - [3 4 5 -- --]] - - """ - if not 0 <= axis < self.ndim: - raise ValueError( - f"'axis' must be a valid dimension position. Got {axis}" - ) - - if to_size is not None: - # Set pad_width from to_size - if pad_width is not None: - raise ValueError("Can't set both 'pad_width' and 'to_size'") - - pad_width = (0, to_size - self.shape[axis]) - elif pad_width is None: - raise ValueError("Must set either 'pad_width' or 'to_size'") - - pad_width = np.asarray(pad_width) - if pad_width.shape != (2,) or not pad_width.dtype.kind == "i": - raise ValueError( - "'pad_width' must be a sequence of two integers. " - f"Got: {pad_width}" - ) - - pad_width = tuple(pad_width) - if any(n < 0 for n in pad_width): - if to_size is not None: - raise ValueError( - f"'to_size' ({to_size}) must not be smaller than the " - f"original axis size ({self.shape[axis]})" - ) - - raise ValueError( - f"Can't set a negative number of pad values. Got: {pad_width}" - ) - - d = _inplace_enabled_define_and_cleanup(self) - - dx = d.to_dask_array() - mask0 = da.ma.getmaskarray(dx) - - pad = [(0, 0)] * dx.ndim - pad[axis] = pad_width - - # Pad the data with zero. This will lose the original mask. - dx = da.pad(dx, pad, mode="constant", constant_values=0) - - # Pad the mask with True - mask = da.pad(mask0, pad, mode="constant", constant_values=True) - - # Set the mask - dx = da.ma.masked_where(mask, dx) - - d._set_dask(dx) - return d - @_inplace_enabled(default=False) def percentile( self, @@ -2375,14 +1395,14 @@ def percentile( The default is to compute the percentiles along a flattened version of the data. - If the input data are integers, or floats smaller than float64, or - the input data contains missing values, then output data-type is - float64. Otherwise, the output data-type is the same as that of - the input. + If the input data are integers, or floats smaller than + float64, or the input data contains missing values, then + output data-type is float64. Otherwise, the output data-type + is the same as that of the input. 
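        For instance (an illustrative sketch of the data-type rule
        described above):

        >>> d = cf.Data([[1, 2, 3, 4]])
        >>> d.dtype
        dtype('int64')
        >>> d.percentile(50).dtype
        dtype('float64')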
- If multiple percentile ranks are given then a new, leading data - dimension is created so that percentiles can be stored for each - percentile rank. + If multiple percentile ranks are given then a new, leading + data dimension is created so that percentiles can be stored + for each percentile rank. **Accuracy** @@ -2419,8 +1439,9 @@ def percentile( axes: (sequence of) `int`, optional Select the axes. The *axes* argument may be one, or a - sequence, of integers that select the axis corresponding to - the given position in the list of axes of the data array. + sequence, of integers that select the axis + corresponding to the given position in the list of + axes of the data array. By default, of *axes* is `None`, all axes are selected. @@ -2430,11 +1451,11 @@ def percentile( squeeze: `bool`, optional If True then all axes over which percentiles are - calculated are removed from the returned data. By default - axes over which percentiles have been calculated are left - in the result as axes with size 1, meaning that the result - is guaranteed to broadcast correctly against the original - data. + calculated are removed from the returned data. By + default axes over which percentiles have been + calculated are left in the result as axes with size 1, + meaning that the result is guaranteed to broadcast + correctly against the original data. {{mtol: number, optional}} @@ -2545,8 +1566,7 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) - # REVIEW: getitem: `percentile`: set 'asanyarray' - # 'cf_percentile' has its own call to 'cf_asanyarray', so we + # 'cf_percentile' has its own call to 'cfdm_asanyarray', so we # can set '_asanyarray=False'. dx = d.to_dask_array(_asanyarray=False) dtype = dx.dtype @@ -2626,60 +1646,17 @@ def percentile( return d + @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) - def persist(self, inplace=False): - """Persist the underlying dask array into memory. + def ceil(self, inplace=False, i=False): + """The ceiling of the data, element-wise. - This turns an underlying lazy dask array into a equivalent - chunked dask array, but now with the results fully computed. + The ceiling of ``x`` is the smallest integer ``n``, such that + ``n>=x``. - `persist` is particularly useful when using distributed - systems, because the results will be kept in distributed - memory, rather than returned to the local process. + .. versionadded:: 1.0 - Compare with `compute` and `array`. - - **Performance** - - `persist` causes all delayed operations to be computed. - - .. versionadded:: 3.14.0 - - .. seealso:: `compute`, `array`, `datetime_array`, - `dask.array.Array.persist` - - :Parameters: - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The persisted data. If the operation was in-place then - `None` is returned. - - **Examples** - - >>> e = d.persist() - - """ - d = _inplace_enabled_define_and_cleanup(self) - dx = self.to_dask_array() - dx = dx.persist() - d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) - return d - - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") - @_inplace_enabled(default=False) - def ceil(self, inplace=False, i=False): - """The ceiling of the data, element-wise. - - The ceiling of ``x`` is the smallest integer ``n``, such that - ``n>=x``. - - .. versionadded:: 1.0 - - .. seealso:: `floor`, `rint`, `trunc` + .. 
seealso:: `floor`, `rint`, `trunc` :Parameters: @@ -2812,63 +1789,6 @@ def cfa_set_write(self, status): self._cfa_del_write() - def compute(self): # noqa: F811 - """A view of the computed data. - - In-place changes to the returned array *might* affect the - underlying dask array, depending on how the dask array has - been defined, including any delayed operations. - - The returned array has the same mask hardness and fill values - as the data. - - Compare with `array`. - - **Performance** - - `compute` causes all delayed operations to be computed. - - .. versionadded:: 3.14.0 - - .. seealso:: `persist`, `array`, `datetime_array`, - `sparse_array` - - :Returns: - - An in-memory view of the data - - **Examples** - - >>> d = cf.Data([1, 2, 3.0], 'km') - >>> d.compute() - array([1., 2., 3.]) - - >>> from scipy.sparse import csr_array - >>> d = cf.Data(csr_array((2, 3))) - >>> d.compute() - <2x3 sparse array of type '' - with 0 stored elements in Compressed Sparse Row format> - >>>: d.array - array([[0., 0., 0.], - [0., 0., 0.]]) - >>> d.compute().toarray() - array([[0., 0., 0.], - [0., 0., 0.]]) - - """ - dx = self.to_dask_array() - a = dx.compute() - - if np.ma.isMA(a): - if self.hardmask: - a.harden_mask() - else: - a.soften_mask() - - a.set_fill_value(self.fill_value) - - return a - @_inplace_enabled(default=False) def convolution_filter( self, @@ -3045,7 +1965,6 @@ def convolution_filter( dx = d.to_dask_array() - # REVIEW: getitem: `percentile`: rectify comment # Cast to float to ensure that NaNs can be stored (so # map_overlap can correctly assign the halos) if dx.dtype != float: @@ -3162,88 +2081,6 @@ def cumsum( return d - @_inplace_enabled(default=False) - def rechunk( - self, - chunks=_DEFAULT_CHUNKS, - threshold=None, - block_size_limit=None, - balance=False, - inplace=False, - ): - """Change the chunk structure of the data. - - **Performance** - - Rechunking can sometimes be expensive and incur a lot of - communication overheads. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunks`, `dask.array.rechunk` - - :Parameters: - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - {{threshold: `int`, optional}} - - {{block_size_limit: `int`, optional}} - - {{balance: `bool`, optional}} - - :Returns: - - `Data` or `None` - The rechunked data, or `None` if the operation was - in-place. - - **Examples** - - >>> x = cf.Data.ones((1000, 1000), chunks=(100, 100)) - - Specify uniform chunk sizes with a tuple - - >>> y = x.rechunk((1000, 10)) - - Or chunk only specific dimensions with a dictionary - - >>> y = x.rechunk({0: 1000}) - - Use the value ``-1`` to specify that you want a single chunk - along a dimension or the value ``"auto"`` to specify that dask - can freely rechunk a dimension to attain blocks of a uniform - block size. - - >>> y = x.rechunk({0: -1, 1: 'auto'}, block_size_limit=1e8) - - If a chunk size does not divide the dimension then rechunk - will leave any unevenness to the last chunk. - - >>> x.rechunk(chunks=(400, -1)).chunks - ((400, 400, 200), (1000,)) - - However if you want more balanced chunks, and don't mind - `dask` choosing a different chunksize for you then you can use - the ``balance=True`` option. 
- - >>> x.rechunk(chunks=(400, -1), balance=True).chunks - ((500, 500), (1000,)) - - """ - d = _inplace_enabled_define_and_cleanup(self) - - # REVIEW: getitem: `rechunk`: set 'asanyarray' - dx = d.to_dask_array(_asanyarray=False) - dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - # Dask rechunking is essentially a wrapper for __getitem__ - # calls on the chunks, which means that we can use the same - # 'asanyarray' and 'clear' keyword values to `_set_dask` as - # are used in `__gettem__`. - d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True) - - return d - @_inplace_enabled(default=False) def _asdatetime(self, inplace=False): """Change the internal representation of data array elements @@ -3291,8 +2128,7 @@ def _asdatetime(self, inplace=False): ) if not d._isdatetime(): - # REVIEW: getitem: `_asdatetime`: set 'asanyarray' - # 'cf_rt2dt' has its own call to 'cf_asanyarray', so we + # 'cf_rt2dt' has its own call to 'cfdm_asanyarray', so we # can set '_asanyarray=False'. dx = d.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) @@ -3349,8 +2185,7 @@ def _asreftime(self, inplace=False): ) if d._isdatetime(): - # REVIEW: getitem: `_asreftime`: set 'asanyarray' - # 'cf_dt2rt' has its own call to 'cf_asanyarray', so we + # 'cf_dt2rt' has its own call to 'cfdm_asanyarray', so we # can set '_asanyarray=False'. dx = d.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) @@ -3472,7 +2307,11 @@ def _combined_units(self, data1, method, inplace): if not units0.equals(units1): data1 = data1.copy() data1.Units = units0 - return data0, data1, Units(_ut_unit=units0._ut_unit) + return ( + data0, + data1, + self._Units_class(_ut_unit=units0._ut_unit), + ) else: # Non-equivalent reference_times: raise an # exception @@ -3480,7 +2319,7 @@ def _combined_units(self, data1, method, inplace): elif units1.istime: # reference_time minus time: the output units are # reference_time - time0 = Units(_ut_unit=units0._ut_unit) + time0 = self._Units_class(_ut_unit=units0._ut_unit) if not units1.equals(time0): data1 = data1.copy() data1.Units = time0 @@ -3524,7 +2363,7 @@ def _combined_units(self, data1, method, inplace): if units0.istime: # Time plus reference_time: the output units are # reference_time - time1 = Units(_ut_unit=units1._ut_unit) + time1 = self._Units_class(_ut_unit=units1._ut_unit) if not units0.equals(time1): if not inplace: data0 = data0.copy() @@ -3745,7 +2584,8 @@ def _combined_units(self, data1, method, inplace): ) ) - def _binary_operation(self, other, method): + @classmethod + def _binary_operation(cls, data, other, method): """Implement binary arithmetic and comparison operations with the numpy broadcasting rules. @@ -3796,101 +2636,42 @@ def _binary_operation(self, other, method): # Ensure other is an independent Data object, for example # so that combination with cf.Query objects works. 
# ------------------------------------------------------------ - if not isinstance(other, self.__class__): + if not isinstance(other, cls): if ( isinstance(other, cftime.datetime) and other.calendar == "" - and self.Units.isreftime + and data.Units.isreftime ): other = cf_dt( - other, calendar=getattr(self.Units, "calendar", "standard") + other, calendar=getattr(data.Units, "calendar", "standard") ) elif other is None: # Can't sensibly initialise a Data object from a bare # `None` (issue #281) other = np.array(None, dtype=object) - other = type(self).asdata(other) + other = cls.asdata(other) # ------------------------------------------------------------ # Prepare data0 (i.e. self copied) and data1 (i.e. other) # ------------------------------------------------------------ - data0 = self.copy() + data0 = data.copy() # Parse units data0, other, new_Units = data0._combined_units(other, method, True) - # Cast as dask arrays - dx0 = data0.to_dask_array() - dx1 = other.to_dask_array() + d = super()._binary_operation(data0, other, method) - # Set if applicable the tolerance levels for the result - if method in ("__eq__", "__ne__"): - rtol = self._rtol - atol = self._atol - - # ------------------------------------------------------------ - # Perform the binary operation with data0 (self) and data1 - # (other) - # ------------------------------------------------------------ - if method == "__eq__": - if dx0.dtype.kind in "US" or dx1.dtype.kind in "US": - result = getattr(dx0, method)(dx1) - else: - result = da.isclose(dx0, dx1, rtol=rtol, atol=atol) - elif method == "__ne__": - if dx0.dtype.kind in "US" or dx1.dtype.kind in "US": - result = getattr(dx0, method)(dx1) - else: - result = ~da.isclose(dx0, dx1, rtol=rtol, atol=atol) - elif inplace: - # Find non-in-place equivalent operator (remove 'i') - equiv_method = method[:2] + method[3:] - # Need to add check in here to ensure that the operation is not - # trying to cast in a way which is invalid. For example, doing - # [an int array] ** float value = [a float array] is fine, but - # doing this in-place would try to chance an int array into a - # float one, which isn't valid casting. Therefore we need to - # catch cases where __i__ isn't possible even if ____ - # is due to datatype consistency rules. - result = getattr(dx0, equiv_method)(dx1) - else: - result = getattr(dx0, method)(dx1) - - if result is NotImplemented: - raise TypeError( - f"Unsupported operands for {method}: {self!r} and {other!r}" - ) + d.override_units(new_Units, inplace=True) + d._update_deterministic(other) - # Set axes when other has more dimensions than self - axes = None - ndim0 = dx0.ndim - if not ndim0: - axes = other._axes + if inplace: + data.__dict__ = d.__dict__ else: - diff = dx1.ndim - ndim0 - if diff > 0: - axes = list(self._axes) - for _ in range(diff): - axes.insert(0, new_axis_identifier(tuple(axes))) - - if inplace: # in-place so concerns original self - self._set_dask(result) - self.override_units(new_Units, inplace=True) - if axes is not None: - self._axes = axes - - self._update_deterministic(other) - return self - - else: # not, so concerns a new Data object copied from self, data0 - data0._set_dask(result) - data0.override_units(new_Units, inplace=True) - if axes is not None: - data0._axes = axes + data = d - data0._update_deterministic(other) - return data0 + # Inplace? + return d def _parse_indices(self, *args, **kwargs): """'cf.Data._parse_indices' is not available. 
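As a minimal illustration of the refactored `_binary_operation` call
pattern above (a sketch; in normal use one would simply write ``d + 2``),
the arithmetic dunder methods now pass the instance explicitly as the
first operand:

import cf

d = cf.Data([1, 2, 3], "m")

# What __add__ now does under the hood: delegate to the classmethod,
# with `d` passed explicitly as the `data` argument.
e = cf.Data._binary_operation(d, 2, "__add__")
print(e.array)  # [3 4 5]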
@@ -3962,8 +2743,7 @@ def _regrid( f"the shape of the regrid operator: {operator.src_shape}" ) - # REVIEW: getitem: `_regrid`: set 'asanyarray' - # 'regrid' has its own calls to 'cf_asanyarray', so we can set + # 'regrid' has its own calls to 'cfdm_asanyarray', so we can set # '_asanyarray=False'. dx = self.to_dask_array(_asanyarray=False) @@ -4207,7 +2987,6 @@ def concatenate( processed_data.append(data1) copied = not copy # to avoid making two copies in a given case - # REVIEW: getitem: `concatenate`: set 'asanyarray' # Get data as dask arrays and apply concatenation # operation. We can set '_asanyarray=False' because at compute # time the concatenation operation does not need to access the @@ -4240,7 +3019,6 @@ def concatenate( cfa = _NONE break - # REVIEW: getitem: `concatenate`: define the asanyarray status # Define the __asanyarray__ status asanyarray = processed_data[0].__asanyarray__ for d in processed_data[1:]: @@ -4251,7 +3029,6 @@ def concatenate( asanyarray = True break - # REVIEW: getitem: `concatenate`: set 'asanyarray' # Set the new dask array data0._set_dask(dx, clear=_ALL ^ cfa, asanyarray=asanyarray) @@ -4310,57 +3087,13 @@ def concatenate( return data0 - def _unary_operation(self, operation): - """Implement unary arithmetic operations. - - It is called by the unary arithmetic methods, such as - __abs__(). - - .. seealso:: `_binary_operation` - - :Parameters: - - operation: `str` - The unary arithmetic method name (such as "__invert__"). - - :Returns: - - `Data` - A new Data array. - - **Examples** - - >>> d = cf.Data([[1, 2, -3, -4, -5]]) - - >>> e = d._unary_operation('__abs__') - >>> print(e.array) - [[1 2 3 4 5]] - - >>> e = d.__abs__() - >>> print(e.array) - [[1 2 3 4 5]] - - >>> e = abs(d) - >>> print(e.array) - [[1 2 3 4 5]] - - """ - out = self.copy(array=False) - - dx = self.to_dask_array() - dx = getattr(operator, operation)(dx) - - out._set_dask(dx) - - return out - def __add__(self, other): """The binary arithmetic operation ``+`` x.__add__(y) <==> x+y """ - return self._binary_operation(other, "__add__") + return self._binary_operation(self, other, "__add__") def __iadd__(self, other): """The augmented arithmetic assignment ``+=`` @@ -4368,7 +3101,7 @@ def __iadd__(self, other): x.__iadd__(y) <==> x+=y """ - return self._binary_operation(other, "__iadd__") + return self._binary_operation(self, other, "__iadd__") def __radd__(self, other): """The binary arithmetic operation ``+`` with reflected @@ -4377,7 +3110,7 @@ def __radd__(self, other): x.__radd__(y) <==> y+x """ - return self._binary_operation(other, "__radd__") + return self._binary_operation(self, other, "__radd__") def __sub__(self, other): """The binary arithmetic operation ``-`` @@ -4385,7 +3118,7 @@ def __sub__(self, other): x.__sub__(y) <==> x-y """ - return self._binary_operation(other, "__sub__") + return self._binary_operation(self, other, "__sub__") def __isub__(self, other): """The augmented arithmetic assignment ``-=`` @@ -4393,7 +3126,7 @@ def __isub__(self, other): x.__isub__(y) <==> x-=y """ - return self._binary_operation(other, "__isub__") + return self._binary_operation(self, other, "__isub__") def __rsub__(self, other): """The binary arithmetic operation ``-`` with reflected @@ -4402,7 +3135,7 @@ def __rsub__(self, other): x.__rsub__(y) <==> y-x """ - return self._binary_operation(other, "__rsub__") + return self._binary_operation(self, other, "__rsub__") def __mul__(self, other): """The binary arithmetic operation ``*`` @@ -4410,7 +3143,7 @@ def __mul__(self, other): x.__mul__(y) 
<==> x*y """ - return self._binary_operation(other, "__mul__") + return self._binary_operation(self, other, "__mul__") def __imul__(self, other): """The augmented arithmetic assignment ``*=`` @@ -4418,7 +3151,7 @@ def __imul__(self, other): x.__imul__(y) <==> x*=y """ - return self._binary_operation(other, "__imul__") + return self._binary_operation(self, other, "__imul__") def __rmul__(self, other): """The binary arithmetic operation ``*`` with reflected @@ -4427,7 +3160,7 @@ def __rmul__(self, other): x.__rmul__(y) <==> y*x """ - return self._binary_operation(other, "__rmul__") + return self._binary_operation(self, other, "__rmul__") def __div__(self, other): """The binary arithmetic operation ``/`` @@ -4435,7 +3168,7 @@ def __div__(self, other): x.__div__(y) <==> x/y """ - return self._binary_operation(other, "__div__") + return self._binary_operation(self, other, "__div__") def __idiv__(self, other): """The augmented arithmetic assignment ``/=`` @@ -4443,7 +3176,7 @@ def __idiv__(self, other): x.__idiv__(y) <==> x/=y """ - return self._binary_operation(other, "__idiv__") + return self._binary_operation(self, other, "__idiv__") def __rdiv__(self, other): """The binary arithmetic operation ``/`` with reflected @@ -4452,7 +3185,7 @@ def __rdiv__(self, other): x.__rdiv__(y) <==> y/x """ - return self._binary_operation(other, "__rdiv__") + return self._binary_operation(self, other, "__rdiv__") def __floordiv__(self, other): """The binary arithmetic operation ``//`` @@ -4460,7 +3193,7 @@ def __floordiv__(self, other): x.__floordiv__(y) <==> x//y """ - return self._binary_operation(other, "__floordiv__") + return self._binary_operation(self, other, "__floordiv__") def __ifloordiv__(self, other): """The augmented arithmetic assignment ``//=`` @@ -4468,7 +3201,7 @@ def __ifloordiv__(self, other): x.__ifloordiv__(y) <==> x//=y """ - return self._binary_operation(other, "__ifloordiv__") + return self._binary_operation(self, other, "__ifloordiv__") def __rfloordiv__(self, other): """The binary arithmetic operation ``//`` with reflected @@ -4477,7 +3210,7 @@ def __rfloordiv__(self, other): x.__rfloordiv__(y) <==> y//x """ - return self._binary_operation(other, "__rfloordiv__") + return self._binary_operation(self, other, "__rfloordiv__") def __truediv__(self, other): """The binary arithmetic operation ``/`` (true division) @@ -4485,7 +3218,7 @@ def __truediv__(self, other): x.__truediv__(y) <==> x/y """ - return self._binary_operation(other, "__truediv__") + return self._binary_operation(self, other, "__truediv__") def __itruediv__(self, other): """The augmented arithmetic assignment ``/=`` (true division) @@ -4493,7 +3226,7 @@ def __itruediv__(self, other): x.__itruediv__(y) <==> x/=y """ - return self._binary_operation(other, "__itruediv__") + return self._binary_operation(self, other, "__itruediv__") def __rtruediv__(self, other): """The binary arithmetic operation ``/`` (true division) with @@ -4502,7 +3235,7 @@ def __rtruediv__(self, other): x.__rtruediv__(y) <==> y/x """ - return self._binary_operation(other, "__rtruediv__") + return self._binary_operation(self, other, "__rtruediv__") def __pow__(self, other, modulo=None): """The binary arithmetic operations ``**`` and ``pow`` @@ -4517,7 +3250,7 @@ def __pow__(self, other, modulo=None): ) ) - return self._binary_operation(other, "__pow__") + return self._binary_operation(self, other, "__pow__") def __ipow__(self, other, modulo=None): """The augmented arithmetic assignment ``**=`` @@ -4532,7 +3265,7 @@ def __ipow__(self, other, modulo=None): ) 
) - return self._binary_operation(other, "__ipow__") + return self._binary_operation(self, other, "__ipow__") def __rpow__(self, other, modulo=None): """The binary arithmetic operations ``**`` and ``pow`` with @@ -4548,7 +3281,7 @@ def __rpow__(self, other, modulo=None): ) ) - return self._binary_operation(other, "__rpow__") + return self._binary_operation(self, other, "__rpow__") def __mod__(self, other): """The binary arithmetic operation ``%`` @@ -4556,7 +3289,7 @@ def __mod__(self, other): x.__mod__(y) <==> x % y """ - return self._binary_operation(other, "__mod__") + return self._binary_operation(self, other, "__mod__") def __imod__(self, other): """The binary arithmetic operation ``%=`` @@ -4564,7 +3297,7 @@ def __imod__(self, other): x.__imod__(y) <==> x %= y """ - return self._binary_operation(other, "__imod__") + return self._binary_operation(self, other, "__imod__") def __rmod__(self, other): """The binary arithmetic operation ``%`` with reflected @@ -4573,207 +3306,7 @@ def __rmod__(self, other): x.__rmod__(y) <==> y % x """ - return self._binary_operation(other, "__rmod__") - - def __eq__(self, other): - """The rich comparison operator ``==`` - - x.__eq__(y) <==> x==y - - """ - return self._binary_operation(other, "__eq__") - - def __ne__(self, other): - """The rich comparison operator ``!=`` - - x.__ne__(y) <==> x!=y - - """ - return self._binary_operation(other, "__ne__") - - def __ge__(self, other): - """The rich comparison operator ``>=`` - - x.__ge__(y) <==> x>=y - - """ - return self._binary_operation(other, "__ge__") - - def __gt__(self, other): - """The rich comparison operator ``>`` - - x.__gt__(y) <==> x>y - - """ - return self._binary_operation(other, "__gt__") - - def __le__(self, other): - """The rich comparison operator ``<=`` - - x.__le__(y) <==> x<=y - - """ - return self._binary_operation(other, "__le__") - - def __lt__(self, other): - """The rich comparison operator ``<`` - - x.__lt__(y) <==> x x&y - - """ - return self._binary_operation(other, "__and__") - - def __iand__(self, other): - """The augmented bitwise assignment ``&=`` - - x.__iand__(y) <==> x&=y - - """ - return self._binary_operation(other, "__iand__") - - def __rand__(self, other): - """The binary bitwise operation ``&`` with reflected operands. - - x.__rand__(y) <==> y&x - - """ - return self._binary_operation(other, "__rand__") - - def __or__(self, other): - """The binary bitwise operation ``|`` - - x.__or__(y) <==> x|y - - """ - return self._binary_operation(other, "__or__") - - def __ior__(self, other): - """The augmented bitwise assignment ``|=`` - - x.__ior__(y) <==> x|=y - - """ - return self._binary_operation(other, "__ior__") - - def __ror__(self, other): - """The binary bitwise operation ``|`` with reflected operands. - - x.__ror__(y) <==> y|x - - """ - return self._binary_operation(other, "__ror__") - - def __xor__(self, other): - """The binary bitwise operation ``^`` - - x.__xor__(y) <==> x^y - - """ - return self._binary_operation(other, "__xor__") - - def __ixor__(self, other): - """The augmented bitwise assignment ``^=`` - - x.__ixor__(y) <==> x^=y - - """ - return self._binary_operation(other, "__ixor__") - - def __rxor__(self, other): - """The binary bitwise operation ``^`` with reflected operands. 
- - x.__rxor__(y) <==> y^x - - """ - return self._binary_operation(other, "__rxor__") - - def __lshift__(self, y): - """The binary bitwise operation ``<<`` - - x.__lshift__(y) <==> x< x<<=y - - """ - return self._binary_operation(y, "__ilshift__") - - def __rlshift__(self, y): - """The binary bitwise operation ``<<`` with reflected operands. - - x.__rlshift__(y) <==> y<>`` - - x.__lshift__(y) <==> x>>y - - """ - return self._binary_operation(y, "__rshift__") - - def __irshift__(self, y): - """The augmented bitwise assignment ``>>=`` - - x.__irshift__(y) <==> x>>=y - - """ - return self._binary_operation(y, "__irshift__") - - def __rrshift__(self, y): - """The binary bitwise operation ``>>`` with reflected operands. - - x.__rrshift__(y) <==> y>>x - - """ - return self._binary_operation(y, "__rrshift__") - - def __abs__(self): - """The unary arithmetic operation ``abs`` - - x.__abs__() <==> abs(x) - - """ - return self._unary_operation("__abs__") - - def __neg__(self): - """The unary arithmetic operation ``-`` - - x.__neg__() <==> -x - - """ - return self._unary_operation("__neg__") - - def __invert__(self): - """The unary bitwise operation ``~`` - - x.__invert__() <==> ~x - - """ - return self._unary_operation("__invert__") - - def __pos__(self): - """The unary arithmetic operation ``+`` - - x.__pos__() <==> +x - - """ - return self._unary_operation("__pos__") + return self._binary_operation(self, other, "__rmod__") def __query_isclose__(self, value, rtol, atol): """Query interface method for an "is close" condition. @@ -4794,37 +3327,6 @@ def __query_isclose__(self, value, rtol, atol): """ return self.isclose(value, rtol=rtol, atol=atol) - @property - def _Units(self): - """Storage for the units. - - The units are stored in a `Units` object, and reflect the - units of the (yet to be computed) elements of the underlying - data. - - .. warning:: Assigning to `_Units` does *not* trigger a units - conversion of the underlying data - values. Therefore assigning to `_Units` should - only be done in cases when it is known that the - intrinsic units represented by the data values - are inconsistent with the existing value of - `_Units`. Before assigning to `_Units`, first - consider if assigning to `Units`, or calling the - `override_units` or `override_calendar` method is - a more appropriate course of action, and use one - of those if possible. - - """ - return self._custom["_Units"] - - @_Units.setter - def _Units(self, value): - self._custom["_Units"] = value - - @_Units.deleter - def _Units(self): - self._custom["_Units"] = _units_None - @property def _cyclic(self): """Storage for axis cyclicity. @@ -4841,7 +3343,7 @@ def _cyclic(self): the `_cyclic` attribute. """ - return self._custom["_cyclic"] + return self._custom.get("_cyclic", _empty_set) @_cyclic.setter def _cyclic(self, value): @@ -4857,17 +3359,12 @@ def _axes(self): Contains a `tuple` of identifiers, one for each array axis. - .. note:: When the axis identifiers are reset, then any axis - identifier named by the `_cyclic` attribute which is - not in the new `_axes` set is automatically removed - from the `_cyclic` attribute. 
- """ - return self._custom["_axes"] + return super()._axes @_axes.setter def _axes(self, value): - self._custom["_axes"] = tuple(value) + self._set_component("axes", tuple(value), copy=False) # Remove cyclic axes that are not in the new axes cyclic = self._cyclic @@ -4875,53 +3372,19 @@ def _axes(self, value): # Never change the value of the _cyclic attribute in-place self._cyclic = cyclic.intersection(value) - # ---------------------------------------------------------------- - # Dask attributes - # ---------------------------------------------------------------- @property - def chunks(self): - """The `dask` chunk sizes for each dimension. - - .. versionadded:: 3.14.0 - - .. seealso:: `npartitions`, `numblocks`, `rechunk` - - **Examples** - - >>> d = cf.Data.ones((6, 5), chunks=(2, 4)) - >>> d.chunks - ((2, 2, 2), (4, 1)) - >>> d.numblocks - (3, 2) - >>> d.npartitions - 6 - - """ - # REVIEW: getitem: `chunks`: set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - return self.to_dask_array(_asanyarray=False).chunks - - # ---------------------------------------------------------------- - # Attributes - # ---------------------------------------------------------------- - @property - def Units(self): - """The `cf.Units` object containing the units of the data array. - - Can be set to any units equivalent to the existing units. - - .. seealso `override_units`, `override_calendar` + def Units(self): + """The `Units` object containing the units of the data array. **Examples** - >>> d = cf.Data([1, 2, 3], units='m') + >>> d = {{package}}.{{class}}([1, 2, 3], units='m') >>> d.Units - >>> d.Units = cf.Units('kilmetres') + >>> d.Units = {{package}}.Units('kilometres') >>> d.Units - - >>> d.Units = cf.Units('km') + + >>> d.Units = {{package}}.Units('km') >>> d.Units @@ -4932,7 +3395,7 @@ def Units(self): def Units(self, value): try: old_units = self._Units - except KeyError: + except ValueError: pass else: if not old_units or self.Units.equals(value): @@ -4946,32 +3409,36 @@ def Units(self, value): "Consider using the override_units method instead." ) - dtype = self.dtype - if dtype.kind in "iu": - if dtype.char in "iI": - dtype = _dtype_float32 - else: - dtype = _dtype_float + try: + dtype = self.dtype + except ValueError: + dtype = None - cf_func = partial(cf_units, from_units=old_units, to_units=value) + if dtype is not None: + if dtype.kind in "iu": + if dtype.char in "iI": + dtype = _dtype_float32 + else: + dtype = _dtype_float - # REVIEW: getitem: `Units`: set 'asanyarray' - # 'cf_units' has its own call to 'cf_asanyarray', so we can - # set '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - dx = dx.map_blocks(cf_func, dtype=dtype) - - # Setting equivalent units doesn't affect the CFA write - # status. Nor does it invalidate any cached values, but only - # because we'll adjust those, too. - self._set_dask(dx, clear=_ALL ^ _CACHE ^ _CFA) - - # Adjust cached values for the new units - cache = self._get_cached_elements() - if cache: - self._set_cached_elements( - {index: cf_func(value) for index, value in cache.items()} - ) + cf_func = partial(cf_units, from_units=old_units, to_units=value) + + # 'cf_units' has its own call to 'cfdm_asanyarray', so we + # can set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) + dx = dx.map_blocks(cf_func, dtype=dtype) + + # Setting equivalent units doesn't affect the CFA write + # status. Nor does it invalidate any cached values, but + # only because we'll adjust those, too. 
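            # Illustration of the `clear` mask used below, with assumed
            # bit values (_ARRAY=1, _CACHE=2, _CFA=4, _ALL=7, as
            # described by _clear_after_dask_update):
            # _ALL ^ _CACHE ^ _CFA == _ARRAY, so only the source-array
            # component is cleared, while the cached elements and the
            # CFA write status are kept.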
+ self._set_dask(dx, clear=_ALL ^ _CACHE ^ _CFA) + + # Adjust cached values for the new units + cache = self._get_cached_elements() + if cache: + self._set_cached_elements( + {index: cf_func(value) for index, value in cache.items()} + ) self._Units = value @@ -4982,147 +3449,6 @@ def Units(self): "Consider using the override_units method instead." ) - @property - def data(self): - """The data as an object identity. - - **Examples** - - >>> d = cf.Data([1, 2], 'm') - >>> d.data is d - True - - """ - return self - - @property - def dtype(self): - """The `numpy` data-type of the data. - - Always returned as a `numpy` data-type instance, but may be set - as any object that converts to a `numpy` data-type. - - **Examples** - - >>> d = cf.Data([1, 2.5, 3.9]) - >>> d.dtype - dtype('float64') - >>> print(d.array) - [1. 2.5 3.9] - >>> d.dtype = int - >>> d.dtype - dtype('int64') - >>> print(d.array) - [1 2 3] - >>> d.dtype = 'float32' - >>> print(d.array) - [1. 2. 3.] - >>> import numpy as np - >>> d.dtype = np.dtype('int32') - >>> d.dtype - dtype('int32') - >>> print(d.array) - [1 2 3] - - """ - # REVIEW: getitem: `dtype`: set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - return dx.dtype - - @dtype.setter - def dtype(self, value): - # Only change the datatype if it's different to that of the - # dask array - if self.dtype != value: - dx = self.to_dask_array() - dx = dx.astype(value) - self._set_dask(dx) - - @property - def fill_value(self): - """The data array missing data value. - - If set to `None` then the default `numpy` fill value appropriate to - the data array's data-type will be used. - - Deleting this attribute is equivalent to setting it to None, so - this attribute is guaranteed to always exist. - - **Examples** - - >>> d.fill_value = 9999.0 - >>> d.fill_value - 9999.0 - >>> del d.fill_value - >>> d.fill_value - None - - """ - return self.get_fill_value(None) - - @fill_value.setter - def fill_value(self, value): - self.set_fill_value(value) - - @fill_value.deleter - def fill_value(self): - self.del_fill_value(None) - - @property - def hardmask(self): - """Hardness of the mask. - - If the `hardmask` attribute is `True`, i.e. there is a hard - mask, then unmasking an entry will silently not occur. This is - the default, and prevents overwriting the mask. - - If the `hardmask` attribute is `False`, i.e. there is a soft - mask, then masked entries may be overwritten with non-missing - values. - - .. note:: Setting the `hardmask` attribute does not - immediately change the mask hardness, rather its - value indicates to other methods (such as `where`, - `transpose`, etc.) whether or not the mask needs - hardening or softening prior to an operation being - defined, and those methods will reset the mask - hardness if required. - - By contrast, the `harden_mask` and `soften_mask` - methods immediately reset the mask hardness of the - underlying `dask` array, and also set the value of - the `hardmask` attribute. - - .. seealso:: `harden_mask`, `soften_mask`, `to_dask_array`, - `where`, `__setitem__` - - **Examples** - - >>> d = cf.Data([1, 2, 3]) - >>> d.hardmask - True - >>> d[0] = cf.masked - >>> print(d.array) - [-- 2 3] - >>> d[...] = 999 - >>> print(d.array) - [-- 999 999] - >>> d.hardmask = False - >>> d.hardmask - False - >>> d[...] 
= -1 - >>> print(d.array) - [-1 -1 -1] - - """ - return self._custom.get("hardmask", _DEFAULT_HARDMASK) - - @hardmask.setter - def hardmask(self, value): - self._custom["hardmask"] = value - @property def is_masked(self): """True if the data array has any masked values. @@ -5141,8 +3467,7 @@ def is_masked(self): True """ - # REVIEW: getitem: `is_masked`: set 'asanyarray' - # 'cf_is_masked' has its own call to 'cf_asanyarray', so we + # 'cf_is_masked' has its own call to 'cfdm_asanyarray', so we # can set '_asanyarray=False'. dx = self.to_dask_array(_asanyarray=False) @@ -5152,381 +3477,13 @@ def is_masked(self): dx = da.blockwise( cf_is_masked, out_ind, - dx, - dx_ind, - adjust_chunks={i: 1 for i in out_ind}, - dtype=bool, - ) - - return bool(dx.any()) - - @property - def nbytes(self): - """Total number of bytes consumed by the elements of the array. - - Does not include bytes consumed by the array mask - - **Performance** - - If the number of bytes is unknown then it is calculated - immediately by executing all delayed operations. - - **Examples** - - >>> d = cf.Data([[1, 1.5, 2]]) - >>> d.dtype - dtype('float64') - >>> d.size, d.dtype.itemsize - (3, 8) - >>> d.nbytes - 24 - >>> d[0] = cf.masked - >>> print(d.array) - [[-- 1.5 2.0]] - >>> d.nbytes - 24 - - """ - # REVIEW: getitem: `nbytes`: set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - if math.isnan(dx.size): - logger.debug("Computing data nbytes: Performance may be degraded") - dx.compute_chunk_sizes() - - return dx.nbytes - - @property - def ndim(self): - """Number of dimensions in the data array. - - **Examples** - - >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> d.ndim - 2 - - >>> d = cf.Data([[1, 2, 3]]) - >>> d.ndim - 2 - - >>> d = cf.Data([[3]]) - >>> d.ndim - 2 - - >>> d = cf.Data([3]) - >>> d.ndim - 1 - - >>> d = cf.Data(3) - >>> d.ndim - 0 - - """ - # REVIEW: getitem: `ndim`: set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - return dx.ndim - - @property - def npartitions(self): - """The total number of chunks. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunks`, `numblocks`, `rechunk` - - **Examples** - - >>> d = cf.Data.ones((6, 5), chunks=(2, 4)) - >>> d.chunks - ((2, 2, 2), (4, 1)) - >>> d.numblocks - (3, 2) - >>> d.npartitions - 6 - - """ - # REVIEW: getitem: `npartitions`: set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - return self.to_dask_array(_asanyarray=False).npartitions - - @property - def numblocks(self): - """The number of chunks along each dimension. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunks`, `npartitions`, `rechunk` - - **Examples** - - >>> d = cf.Data.ones((6, 5), chunks=(2, 4)) - >>> d.chunks - ((2, 2, 2), (4, 1)) - >>> d.numblocks - (3, 2) - >>> d.npartitions - 6 - - """ - # REVIEW: getitem: `numblocks` set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - return self.to_dask_array(_asanyarray=False).numblocks - - @property - def shape(self): - """Tuple of the data array's dimension sizes. - - **Performance** - - If the shape of the data is unknown then it is calculated - immediately by executing all delayed operations. 
- - **Examples** - - >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> d.shape - (2, 3) - - >>> d = cf.Data([[1, 2, 3]]) - >>> d.shape - (1, 3) - - >>> d = cf.Data([[3]]) - >>> d.shape - (1, 1) - - >>> d = cf.Data(3) - >>> d.shape - () - - """ - # REVIEW: getitem: `shape`: set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - if math.isnan(dx.size): - logger.debug("Computing data shape: Performance may be degraded") - dx.compute_chunk_sizes() - - return dx.shape - - @property - def size(self): - """Number of elements in the data array. - - **Performance** - - If the size of the data is unknown then it is calculated - immediately by executing all delayed operations. - - **Examples** - - >>> d = cf.Data([[1, 2, 3], [4, 5, 6]]) - >>> d.size - 6 - - >>> d = cf.Data([[1, 2, 3]]) - >>> d.size - 3 - - >>> d = cf.Data([[3]]) - >>> d.size - 1 - - >>> d = cf.Data([3]) - >>> d.size - 1 - - >>> d = cf.Data(3) - >>> d.size - 1 - - """ - # REVIEW: getitem: `size` set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - size = dx.size - if math.isnan(size): - logger.debug("Computing data size: Performance may be degraded") - dx.compute_chunk_sizes() - size = dx.size - - return size - - @property - def array(self): - """A numpy array copy of the data. - - In-place changes to the returned numpy array do not affect the - underlying dask array. - - The returned numpy array has the same mask hardness and fill - values as the data. - - Compare with `compute`. - - **Performance** - - `array` causes all delayed operations to be computed. The - returned `numpy` array is a deep copy of that returned by - created `compute`. - - .. seealso:: `datetime_array`, `compute`, `persist` - - **Examples** - - >>> d = cf.Data([1, 2, 3.0], 'km') - >>> a = d.array - >>> isinstance(a, numpy.ndarray) - True - >>> print(a) - [ 1. 2. 3.] - >>> d[0] = -99 - >>> print(a[0]) - 1.0 - >>> a[0] = 88 - >>> print(d[0]) - -99.0 km - - >>> d = cf.Data('2000-12-1', units='days since 1999-12-1') - >>> print(d.array) - 366 - >>> print(d.datetime_array) - 2000-12-01 00:00:00 - - """ - a = self.compute().copy() - if issparse(a): - a = a.toarray() - elif not isinstance(a, np.ndarray): - a = np.asanyarray(a) - - if not a.size: - return a - - # Set cached elements - items = [0, -1] - if a.ndim == 2 and a.shape[-1] == 2: - items.extend((1, -2)) - elif a.size == 3: - items.append(1) - - self._set_cached_elements({i: a.item(i) for i in items}) - - return a - - @property - def datetime_array(self): - """An independent numpy array of date-time objects. - - Only applicable to data arrays with reference time units. - - If the calendar has not been set then the CF default calendar will - be used and the units will be updated accordingly. - - The data-type of the data array is unchanged. - - .. seealso:: `array`, `compute`, `persist` - - **Performance** - - `datetime_array` causes all delayed operations to be computed. - - **Examples** - - """ - units = self.Units - - if not units.isreftime: - raise ValueError( - f"Can't create date-time array from units {self.Units!r}" - ) - - if getattr(units, "calendar", None) == "none": - raise ValueError( - f"Can't create date-time array from units {self.Units!r} " - "because calendar is 'none'" - ) - - units1, reftime = units.units.split(" since ") - - # Convert months and years to days, because cftime won't work - # otherwise. 
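        # (Illustrative, assuming the module-level constants
        # _month_length and _year_length hold average month and year
        # lengths in days.) Data in 'months since <reftime>' or
        # 'years since <reftime>' are scaled by those factors and
        # relabelled as 'days since <reftime>' below, so that cftime
        # can perform the conversion to date-time objects.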
- if units1 in ("months", "month"): - d = self * _month_length - d.override_units( - Units( - f"days since {reftime}", - calendar=getattr(units, "calendar", None), - ), - inplace=True, - ) - elif units1 in ("years", "year", "yr"): - d = self * _year_length - d.override_units( - Units( - f"days since {reftime}", - calendar=getattr(units, "calendar", None), - ), - inplace=True, - ) - else: - d = self - - dx = d.to_dask_array() - dx = convert_to_datetime(dx, d.Units) - - a = dx.compute() - - if np.ma.isMA(a): - if self.hardmask: - a.harden_mask() - else: - a.soften_mask() - - a.set_fill_value(self.fill_value) - - return a - - @property - def mask(self): - """The Boolean missing data mask of the data array. - - The Boolean mask has True where the data array has missing data - and False otherwise. - - :Returns: - - `Data` - - **Examples** - - >>> d.shape - (12, 73, 96) - >>> m = d.mask - >>> m.dtype - dtype('bool') - >>> m.shape - (12, 73, 96) - - """ - mask_data_obj = self.copy(array=False) - - dx = self.to_dask_array() - mask = da.ma.getmaskarray(dx) - - mask_data_obj._set_dask(mask) - mask_data_obj.override_units(_units_None, inplace=True) - mask_data_obj.hardmask = _DEFAULT_HARDMASK + dx, + dx_ind, + adjust_chunks={i: 1 for i in out_ind}, + dtype=bool, + ) - return mask_data_obj + return bool(dx.any()) @_inplace_enabled(default=False) def arctan(self, inplace=False): @@ -5842,70 +3799,6 @@ def arccosh(self, inplace=False): return d - def all(self, axis=None, keepdims=True, split_every=None): - """Test whether all data array elements evaluate to True. - - .. seealso:: `allclose`, `any`, `isclose` - - :Parameters: - - axis: (sequence of) `int`, optional - Axis or axes along which a logical AND reduction is - performed. The default (`None`) is to perform a - logical AND over all the dimensions of the input - array. *axis* may be negative, in which case it counts - from the last to the first axis. - - {{collapse keepdims: `bool`, optional}} - - {{split_every: `int` or `dict`, optional}} - - :Returns: - - `Data` - Whether or not all data array elements evaluate to True. - - **Examples** - - >>> d = cf.Data([[1, 2], [3, 4]]) - >>> d.all() - - >>> d.all(keepdims=False) - - >>> d.all(axis=0) - - >>> d.all(axis=1) - - >>> d.all(axis=()) - - - >>> d[0] = cf.masked - >>> d[1, 0] = 0 - >>> print(d.array) - [[-- --] - [0 4]] - >>> d.all(axis=0) - - >>> d.all(axis=1) - - - >>> d[...] = cf.masked - >>> d.all() - - >>> bool(d.all()) - True - >>> bool(d.all(keepdims=False)) - False - - """ - d = self.copy(array=False) - dx = self.to_dask_array() - dx = da.all(dx, axis=axis, keepdims=keepdims, split_every=split_every) - d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK - d.override_units(_units_None, inplace=True) - return d - def allclose(self, y, rtol=None, atol=None): """Whether an array is element-wise equal within a tolerance. @@ -5955,272 +3848,6 @@ def allclose(self, y, rtol=None, atol=None): """ return self.isclose(y, rtol=rtol, atol=atol).all() - def any(self, axis=None, keepdims=True, split_every=None): - """Test whether any data array elements evaluate to True. - - .. seealso:: `all`, `allclose`, `isclose` - - :Parameters: - - axis: (sequence of) `int`, optional - Axis or axes along which a logical OR reduction is - performed. The default (`None`) is to perform a - logical OR over all the dimensions of the input - array. *axis* may be negative, in which case it counts - from the last to the first axis. 
- - {{collapse keepdims: `bool`, optional}} - - {{split_every: `int` or `dict`, optional}} - - :Returns: - - `Data` - Whether or any data array elements evaluate to True. - - **Examples** - - >>> d = cf.Data([[0, 2], [0, 4]]) - >>> d.any() - - >>> d.any(keepdims=False) - - >>> d.any(axis=0) - - >>> d.any(axis=1) - - >>> d.any(axis=()) - - - >>> d[0] = cf.masked - >>> print(d.array) - [[-- --] - [0 4]] - >>> d.any(axis=0) - - >>> d.any(axis=1) - - - >>> d[...] = cf.masked - >>> d.any() - - >>> bool(d.any()) - False - >>> bool(d.any(keepdims=False)) - False - - """ - d = self.copy(array=False) - dx = self.to_dask_array() - dx = da.any(dx, axis=axis, keepdims=keepdims, split_every=split_every) - d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK - d.override_units(_units_None, inplace=True) - return d - - @_inplace_enabled(default=False) - def apply_masking( - self, - fill_values=None, - valid_min=None, - valid_max=None, - valid_range=None, - inplace=False, - ): - """Apply masking. - - Masking is applied according to the values of the keyword - parameters. - - Elements that are already masked remain so. - - .. versionadded:: 3.4.0 - - .. seealso:: `get_fill_value`, `hardmask`, `mask`, `where` - - :Parameters: - - fill_values: `bool` or sequence of scalars, optional - Specify values that will be set to missing data. Data - elements exactly equal to any of the values are set to - missing data. - - If True then the value returned by the - `get_fill_value` method, if such a value exists, is - used. - - Zero or more values may be provided in a sequence of - scalars. - - *Parameter example:* - Specify a fill value of 999: ``fill_values=[999]`` - - *Parameter example:* - Specify fill values of 999 and -1.0e30: - ``fill_values=[999, -1.0e30]`` - - *Parameter example:* - Use the fill value already set for the data: - ``fill_values=True`` - - *Parameter example:* - Use no fill values: ``fill_values=False`` or - ``fill_value=[]`` - - valid_min: number, optional - A scalar specifying the minimum valid value. Data - elements strictly less than this number will be set to - missing data. - - valid_max: number, optional - A scalar specifying the maximum valid value. Data - elements strictly greater than this number will be set - to missing data. - - valid_range: (number, number), optional - A vector of two numbers specifying the minimum and - maximum valid values, equivalent to specifying values - for both *valid_min* and *valid_max* parameters. The - *valid_range* parameter must not be set if either - *valid_min* or *valid_max* is defined. - - *Parameter example:* - ``valid_range=[-999, 10000]`` is equivalent to setting - ``valid_min=-999, valid_max=10000`` - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The data with masked values. If the operation was in-place - then `None` is returned. 
- - **Examples** - - >>> import numpy - >>> d = cf.Data(numpy.arange(12).reshape(3, 4), 'm') - >>> d[1, 1] = cf.masked - >>> print(d.array) - [[0 1 2 3] - [4 -- 6 7] - [8 9 10 11]] - >>> print(d.apply_masking().array) - [[0 1 2 3] - [4 -- 6 7] - [8 9 10 11]] - >>> print(d.apply_masking(fill_values=[0]).array) - [[-- 1 2 3] - [4 -- 6 7] - [8 9 10 11]] - >>> print(d.apply_masking(fill_values=[0, 11]).array) - [[-- 1 2 3] - [4 -- 6 7] - [8 9 10 --]] - >>> print(d.apply_masking(valid_min=3).array) - [[-- -- -- 3] - [4 -- 6 7] - [8 9 10 11]] - >>> print(d.apply_masking(valid_max=6).array) - [[0 1 2 3] - [4 -- 6 --] - [-- -- -- --]] - >>> print(d.apply_masking(valid_range=[2, 8]).array) - [[-- -- 2 3] - [4 -- 6 7] - [8 -- -- --]] - >>> d.set_fill_value(7) - >>> print(d.apply_masking(fill_values=True).array) - [[0 1 2 3] - [4 -- 6 --] - [8 9 10 11]] - >>> print(d.apply_masking(fill_values=True, - ... valid_range=[2, 8]).array) - [[-- -- 2 3] - [4 -- 6 --] - [8 -- -- --]] - - """ - # Parse valid_range - if valid_range is not None: - if valid_min is not None or valid_max is not None: - raise ValueError( - "Can't set 'valid_range' parameter with either the " - "'valid_min' nor 'valid_max' parameters" - ) - - try: - if len(valid_range) != 2: - raise ValueError( - "'valid_range' parameter must be a vector of " - "two elements" - ) - except TypeError: - raise ValueError( - "'valid_range' parameter must be a vector of " - "two elements" - ) - - valid_min, valid_max = valid_range - - # Parse fill_values - if fill_values is None: - fill_values = False - - if isinstance(fill_values, bool): - if fill_values: - fill_value = self.get_fill_value(None) - if fill_value is not None: - fill_values = (fill_value,) - else: - fill_values = () - else: - fill_values = () - else: - try: - iter(fill_values) - except TypeError: - raise TypeError( - "'fill_values' parameter must be a sequence or " - f"of type bool. Got type {type(fill_values)}" - ) - else: - if isinstance(fill_values, str): - raise TypeError( - "'fill_values' parameter must be a sequence or " - f"of type bool. Got type {type(fill_values)}" - ) - - d = _inplace_enabled_define_and_cleanup(self) - - dx = self.to_dask_array() - - mask = None - if fill_values: - mask = dx == fill_values[0] - - for fill_value in fill_values[1:]: - mask |= dx == fill_value - - if valid_min is not None: - if mask is None: - mask = dx < valid_min - else: - mask |= dx < valid_min - - if valid_max is not None: - if mask is None: - mask = dx > valid_max - else: - mask |= dx > valid_max - - if mask is not None: - dx = da.ma.masked_where(mask, dx) - - d._set_dask(dx) - - return d - def argmax(self, axis=None, unravel=False): """Return the indices of the maximum values along an axis. 
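As a standalone sketch of the mask-combination logic in `apply_masking`
above, written with plain `dask` and illustrative threshold values:
equality with each fill value and the `valid_min`/`valid_max` bounds are
OR-ed into a single Boolean mask, which is then applied with
`masked_where`.

import dask.array as da
import numpy as np

dx = da.from_array(np.arange(12).reshape(3, 4), chunks=2)

mask = dx == 0          # first fill value
mask |= dx == 11        # any further fill values are OR-ed in
mask |= dx < 2          # valid_min=2
mask |= dx > 8          # valid_max=8

dx = da.ma.masked_where(mask, dx)
print(dx.compute())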
@@ -6514,7 +4141,7 @@ def convert_reference_time( # By default, set the target units to "days since # , calendar=" - units = Units( + units = self._Units_class( "days since " + units0.units.split(" since ")[1], calendar=units0._calendar, ) @@ -6526,29 +4153,28 @@ def convert_reference_time( units0_since_reftime = units0._units_since_reftime if units0_since_reftime in _month_units: if calendar_months: - units0 = Units( + units0 = self._Units_class( "calendar_" + units0.units, calendar=units0._calendar ) else: - units0 = Units( + units0 = self._Units_class( "days since " + units0.units.split(" since ")[1], calendar=units0._calendar, ) d.Units = units0 elif units0_since_reftime in _year_units: if calendar_years: - units0 = Units( + units0 = self._Units_class( "calendar_" + units0.units, calendar=units0._calendar ) else: - units0 = Units( + units0 = self._Units_class( "days since " + units0.units.split(" since ")[1], calendar=units0._calendar, ) d.Units = units0 - # REVIEW: getitem: `convert_reference_time`: set 'asanyarray' - # 'cf_rt2dt' its own call to 'cf_asanyarray', so we can set + # 'cf_rt2dt' its own call to 'cfdm_asanyarray', so we can set # '_asanyarray=False'. dx = d.to_dask_array(_asanyarray=False) @@ -6563,18 +4189,6 @@ def convert_reference_time( return d - def get_data(self, default=ValueError(), _units=None, _fill_value=None): - """Returns the data. - - .. versionadded:: 3.0.0 - - :Returns: - - `Data` - - """ - return self - def get_deterministic_name(self): """Get the deterministic name for the data. @@ -6610,199 +4224,33 @@ def get_deterministic_name(self): >>> d1.get_deterministic_name() '6380dd3674fbf10d30561484b084e9b3' >>> d1.get_deterministic_name() == d.get_deterministic_name() - True - >>> d1.equals(d) - True - - >>> e = d + 1 - 1 - >>> e.get_deterministic_name() - '0b83ada62d4b014bae83c3de1c1d3a80' - >>> e.get_deterministic_name() == d.get_deterministic_name() - False - >>> e.equals(d) - True - - """ - if not self._custom["deterministic"]: - raise ValueError() - - units = self._Units - - # REVIEW: getitem: `get_deterministic_name`: set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - return tokenize( - self.to_dask_array(_asanyarray=False).name, - units.formatted(definition=True, names=True), - units._canonical_calendar, - ) - - def get_filenames(self): - """The names of files containing parts of the data array. - - Returns the names of any files that may be required to deliver - the computed data array. This set may contain fewer names than - the collection of file names that defined the data when it was - first instantiated, as could be the case after the data has - been subspaced. - - **Implementation** - - A `dask` chunk that contributes to the computed array is - assumed to reference data within a file if that chunk's array - object has a callable `get_filenames` method, the output of - which is added to the returned `set`. - - :Returns: - - `set` - The file names. If no files are required to compute - the data then an empty `set` is returned. 
- - **Examples** - - >>> d = cf.Data.full((5, 8), 1, chunks=4) - >>> d.get_filenames() - set() - - >>> f = cf.example_field(0) - >>> cf.write(f, "file_A.nc") - >>> cf.write(f, "file_B.nc") - - >>> a = cf.read("file_A.nc", chunks=4)[0].data - >>> a += 999 - >>> b = cf.read("file_B.nc", chunks=4)[0].data - >>> c = cf.Data(b.array, units=b.Units, chunks=4) - >>> print(a.shape, b.shape, c.shape) - (5, 8) (5, 8) (5, 8) - >>> d = cf.Data.concatenate([a, a.copy(), b, c], axis=1) - >>> print(d.shape) - (5, 32) - - >>> d.get_filenames() - {'file_A.nc', 'file_B.nc'} - >>> d[:, 2:7].get_filenames() - {'file_A.nc'} - >>> d[:, 2:14].get_filenames() - {'file_A.nc', 'file_B.nc'} - >>> d[:, 2:20].get_filenames() - {'file_A.nc', 'file_B.nc'} - >>> d[:, 2:30].get_filenames() - {'file_A.nc', 'file_B.nc'} - >>> d[:, 29:30].get_filenames() - set() - >>> d[2, 3] = -99 - >>> d[2, 3].get_filenames() - {'file_A.nc'} - - """ - out = set() - - # REVIEW: getitem: `get_filenames`: set 'asanyarray' - # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - for a in self.todict(_asanyarray=False).values(): - try: - out.update(a.get_filenames()) - except AttributeError: - pass - - return out - - def get_units(self, default=ValueError()): - """Return the units. - - .. seealso:: `del_units`, `set_units` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the units - have not been set. If set to an `Exception` instance then - it will be raised instead. - - :Returns: - - The units. - - **Examples** - - >>> d.set_units('metres') - >>> d.get_units() - 'metres' - >>> d.del_units() - >>> d.get_units() - ValueError: Can't get non-existent units - >>> print(d.get_units(None)) - None - - """ - try: - return self.Units.units - except AttributeError: - return super().get_units(default=default) - - def get_calendar(self, default=ValueError()): - """Return the calendar. - - .. seealso:: `del_calendar`, `set_calendar` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - calendar has not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - The calendar. - - **Examples** - - >>> d.set_calendar('julian') - >>> d.get_calendar - 'metres' - >>> d.del_calendar() - >>> d.get_calendar() - ValueError: Can't get non-existent calendar - >>> print(d.get_calendar(None)) - None - - """ - try: - return self.Units.calendar - except (AttributeError, KeyError): - return super().get_calendar(default=default) - - def set_calendar(self, calendar): - """Set the calendar. - - .. seealso:: `override_calendar`, `override_units`, - `del_calendar`, `get_calendar` - - :Parameters: - - value: `str` - The new calendar. - - :Returns: + True + >>> d1.equals(d) + True - `None` + >>> e = d + 1 - 1 + >>> e.get_deterministic_name() + '0b83ada62d4b014bae83c3de1c1d3a80' + >>> e.get_deterministic_name() == d.get_deterministic_name() + False + >>> e.equals(d) + True - **Examples** + """ + if not self.has_deterministic_name(): + raise ValueError() - >>> d.set_calendar('none') - >>> d.get_calendar - 'none' - >>> d.del_calendar() - >>> d.get_calendar() - ValueError: Can't get non-existent calendar - >>> print(d.get_calendar(None)) - None + units = self._Units - """ - self.Units = Units(self.get_units(default=None), calendar) + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. 
+ return tokenize( + self.to_dask_array( + _apply_mask_hardness=False, _asanyarray=False + ).name, + units.formatted(definition=True, names=True), + units._canonical_calendar, + ) def add_file_location(self, location): """Add a new file location in-place. @@ -6835,7 +4283,6 @@ def add_file_location(self, location): updated = False - # REVIEW: getitem: `add_file_location`: set 'asanyarray' # The dask graph is never going to be computed, so we can set # '_asanyarray=False'. dsk = self.todict(_asanyarray=False) @@ -6884,7 +4331,51 @@ def set_units(self, value): None """ - self.Units = Units(value, self.get_calendar(default=None)) + self.Units = self._Units_class(value, self.get_calendar(default=None)) + + @_inplace_enabled(default=False) + def masked_where(self, condition, inplace=False): + """Mask the data where a condition is met. + + ``d.masked_where(condition)`` is equivalent to + ``d.where(condition, cf.masked)``. + + **Performance** + + `masked_where` causes all delayed operations to be executed. + + .. versionadded:: NEXTVERSION + + .. seealso:: `mask`, `masked_values`, `where` + + :Parameters: + + condition: array_like + The masking condition. The data is masked where + *condition* is True. Any masked values already in the + data are also masked in the result. + + {{inplace: `bool`, optional}} + + :Returns: + + {{inplace: `bool`, optional}} + + :Returns: + + `{{class}}` or `None` + The result of masking the data, or `None` if the + operation was in-place. + + **Examples** + + >>> d = {{package}}.{{class}}([1, 2, 3, 4, 5]) + >>> e = d.masked_where([0, 1, 0, 1, 0]) + >>> print(e.array) + [1 -- 3 -- 5] + + """ + return self.where(condition, masked, None, inplace=inplace) @_inplace_enabled(default=False) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -7553,7 +5044,7 @@ def clip(self, a_min, a_max, units=None, inplace=False, i=False): """ if units is not None: # Convert the limits to the same units as the data array - units = Units(units) + units = self._Units_class(units) self_units = self.Units if self_units != units: a_min = Units.conform(np.asanyarray(a_min), units, self_units) @@ -7899,7 +5390,7 @@ def count(self, axis=None, keepdims=True, split_every=None): dx, axis=axis, keepdims=keepdims, split_every=split_every ) d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK + d.hardmask = self._DEFAULT_HARDMASK d.override_units(_units_None, inplace=True) return d @@ -8150,92 +5641,6 @@ def second(self): """ return YMDhms(self, "second") - @property - def sparse_array(self): - """Return an independent `scipy` sparse array of the data. - - In-place changes to the returned sparse array do not affect - the underlying dask array. - - An `AttributeError` is raised if a sparse array representation - is not available. - - **Performance** - - `sparse_array` causes all delayed operations to be - computed. The returned sparse array is a deep copy of that - returned by created `compute`. - - .. versionadded:: 3.16.0 - - .. seealso:: `array` - - :Returns: - - An independent `scipy` sparse array of the data. - - **Examples** - - >>> from scipy.sparse import issparse - >>> issparse(d.sparse_array) - True - - """ - array = self.compute() - if issparse(array): - return array.copy() - - raise AttributeError( - "A sparse array representation of the data is not available" - ) - - @_inplace_enabled(default=False) - def uncompress(self, inplace=False): - """Uncompress the data. - - Only affects data that is compressed by convention, i.e. 
- - * Ragged arrays for discrete sampling geometries (DSG) and - simple geometry cell definitions. - - * Compression by gathering. - - * Compression by coordinate subsampling. - - Data that is already uncompressed is returned - unchanged. Whether the data is compressed or not does not - alter its functionality nor external appearance, but may - affect how the data are written to a dataset on disk. - - .. versionadded:: 3.0.6 - - .. seealso:: `array`, `compressed_array`, `source` - - :Parameters: - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The uncompressed data, or `None` if the operation was - in-place. - - **Examples** - - >>> d.get_compression_type() - 'ragged contiguous' - >>> d.uncompress() - >>> d.get_compression_type() - '' - - """ - d = _inplace_enabled_define_and_cleanup(self) - if d.get_compression_type(): - d._del_Array(None) - - return d - def unique(self, split_every=None): """The unique elements of the data. @@ -8279,15 +5684,14 @@ def unique(self, split_every=None): # in the result. d.soften_mask() - # REVIEW: getitem: `unique`: set 'asanyarray' # The applicable chunk function will have its own call to - # 'cf_asanyarray', so we can set '_asanyarray=False'. + # 'cfdm_asanyarray', so we can set '_asanyarray=False'. dx = d.to_dask_array(_asanyarray=False) dx = Collapse().unique(dx, split_every=split_every) d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK + d.hardmask = self._DEFAULT_HARDMASK return d @@ -8366,194 +5770,6 @@ def ndindex(self): """ return product(*[range(0, r) for r in self.shape]) - @_deprecated_kwarg_check("traceback", version="3.0.0", removed_at="4.0.0") - @_manage_log_level_via_verbosity - def equals( - self, - other, - rtol=None, - atol=None, - ignore_fill_value=False, - ignore_data_type=False, - ignore_type=False, - verbose=None, - traceback=False, - ignore_compression=False, - ): - """True if two data arrays are logically equal, False otherwise. - - {{equals tolerance}} - - :Parameters: - - other: - The object to compare for equality. - - {{rtol: number, optional}} - - {{atol: number, optional}} - - ignore_fill_value: `bool`, optional - If True then data arrays with different fill values are - considered equal. By default they are considered unequal. - - {{ignore_data_type: `bool`, optional}} - - {{ignore_type: `bool`, optional}} - - {{verbose: `int` or `str` or `None`, optional}} - - traceback: deprecated at version 3.0.0 - Use the *verbose* parameter instead. - - {{ignore_compression: `bool`, optional}} - - :Returns: - - `bool` - Whether or not the two instances are equal. - - **Examples** - - >>> d.equals(d) - True - >>> d.equals(d + 1) - False - - """ - # Set default tolerances - if rtol is None: - rtol = self._rtol - - if atol is None: - atol = self._atol - - if not super().equals( - other, - rtol=rtol, - atol=atol, - verbose=verbose, - ignore_data_type=ignore_data_type, - ignore_fill_value=ignore_fill_value, - ignore_type=ignore_type, - _check_values=False, - ): - # TODODASK: consistency with cfdm Data.equals needs to be verified - # possibly via a follow-up PR to cfdm to implement any changes. - return False - - # ------------------------------------------------------------ - # Check that each instance has equal array values - # ------------------------------------------------------------ - self_dx = self.to_dask_array() - other_dx = other.to_dask_array() - - # Check that each instance has the same units. Do this before - # any other possible short circuits. 
- self_Units = self.Units - other_Units = other.Units - if self_Units != other_Units: - if is_log_level_info(logger): - logger.info( - f"{self.__class__.__name__}: Different Units " - f"({self_Units!r}, {other_Units!r})" - ) - - return False - - rtol = float(rtol) - atol = float(atol) - - # Return False if there are different cached elements. This - # provides a possible short circuit for that case that two - # arrays are not equal (but not in the case that they are). - cache0 = self._get_cached_elements() - if cache0: - cache1 = other._get_cached_elements() - if cache1 and sorted(cache0) == sorted(cache1): - a = [] - b = [] - for key, value0 in cache0.items(): - value1 = cache1[key] - if value0 is np.ma.masked or value1 is np.ma.masked: - # Don't test on masked values - this logic is - # determined elsewhere. - continue - - # Make sure strings are unicode - try: - value0 = value0.decode() - value1 = value1.decode() - except AttributeError: - pass - - a.append(value0) - b.append(value1) - - if a and not _numpy_allclose(a, b, rtol=rtol, atol=atol): - if is_log_level_info(logger): - logger.info( - f"{self.__class__.__name__}: Different array " - f"values (atol={atol}, rtol={rtol})" - ) - - return False - - # Now check that corresponding elements are equal within a tolerance. - # We assume that all inputs are masked arrays. Note we compare the - # data first as this may return False due to different dtype without - # having to wait until the compute call. - self_is_numeric = is_numeric_dtype(self_dx) - other_is_numeric = is_numeric_dtype(other_dx) - if self_is_numeric and other_is_numeric: - data_comparison = _da_ma_allclose( - self_dx, - other_dx, - masked_equal=True, - rtol=rtol, - atol=atol, - ) - elif not self_is_numeric and not other_is_numeric: - # If the array (say d) is fully masked, then the output of - # np.all(d == d) and therefore da.all(d == d) will be a - # np.ma.masked object which has dtype('float64'), and not - # a Boolean, causing issues later. To ensure data_comparison - # is Boolean, we must do an early compute to check if it is - # a masked object and if so, force the desired result (True). - # - # This early compute won't degrade performance because it - # would be performed towards result.compute() below anyway. - data_comparison = da.all(self_dx == other_dx).compute() - if data_comparison is np.ma.masked: - data_comparison = True - - else: # one is numeric and other isn't => not equal (incompat. dtype) - if is_log_level_info(logger): - logger.info( - f"{self.__class__.__name__}: Different data types:" - f"{self_dx.dtype} != {other_dx.dtype}" - ) - - return False - - mask_comparison = da.all( - da.equal(da.ma.getmaskarray(self_dx), da.ma.getmaskarray(other_dx)) - ) - - # Apply a (dask) logical 'and' to confirm if both the mask and the - # data are equal for the pair of masked arrays: - result = da.logical_and(data_comparison, mask_comparison) - if not result.compute(): - if is_log_level_info(logger): - logger.info( - f"{self.__class__.__name__}: Different array values (" - f"atol={atol}, rtol={rtol})" - ) - - return False - else: - return True - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def exp(self, inplace=False, i=False): @@ -8590,70 +5806,6 @@ def exp(self, inplace=False, i=False): return d - @_inplace_enabled(default=False) - def insert_dimension(self, position=0, inplace=False): - """Expand the shape of the data array in place. - - .. 
seealso:: `flip`, `squeeze`, `swapaxes`, `transpose` - - :Parameters: - - position: `int`, optional - Specify the position that the new axis will have in the data - array axes. By default the new axis has position 0, the - slowest varying position. - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - - **Examples** - - """ - # TODODASKAPI bring back expand_dims alias (or rather alias this to - # that) - - d = _inplace_enabled_define_and_cleanup(self) - - # Parse position - if not isinstance(position, int): - raise ValueError("Position parameter must be an integer") - - ndim = d.ndim - if -ndim - 1 <= position < 0: - position += ndim + 1 - elif not 0 <= position <= ndim: - raise ValueError( - f"Can't insert dimension: Invalid position {position!r}" - ) - - shape = list(d.shape) - shape.insert(position, 1) - - dx = d.to_dask_array() - dx = dx.reshape(shape) - - # Inserting a dimension doesn't affect the cached elements nor - # the CFA write status - d._set_dask(dx, clear=_ALL ^ _CACHE ^ _CFA) - - # Expand _axes - axis = new_axis_identifier(d._axes) - data_axes = list(d._axes) - data_axes.insert(position, axis) - d._axes = data_axes - - # Update the HDF5 chunking strategy - chunksizes = d.nc_hdf5_chunksizes() - if isinstance(chunksizes, tuple): - chunksizes = list(chunksizes) - chunksizes.insert(position, 1) - d.nc_set_hdf5_chunksizes(chunksizes) - - return d - @_deprecated_kwarg_check("size", version="3.14.0", removed_at="5.0.0") @_inplace_enabled(default=False) @_manage_log_level_via_verbosity @@ -8998,166 +6150,30 @@ def halo( # Set expanded axes to be non-cyclic d.cyclic(axes=tuple(depth), iscyclic=False) - return d - - def harden_mask(self): - """Force the mask to hard. - - Whether the mask of a masked array is hard or soft is - determined by its `hardmask` property. `harden_mask` sets - `hardmask` to `True`. - - .. versionadded:: 3.14.0 - - .. seealso:: `hardmask`, `soften_mask` - - **Examples** - - >>> d = cf.Data([1, 2, 3], hardmask=False) - >>> d.hardmask - False - >>> d.harden_mask() - >>> d.hardmask - True - - >>> d = cf.Data([1, 2, 3], mask=[False, True, False]) - >>> d.hardmask - True - >>> d[1] = 999 - >>> print(d.array) - [1 -- 3] - - """ - # REVIEW: getitem: `hardmask`: set 'asanyarray' - # 'cf_harden_mask' has its own call to 'cf_asanyarray', so we - # can set '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) - self._set_dask(dx, clear=_NONE) - self.hardmask = True - - def has_calendar(self): - """Whether a calendar has been set. - - .. seealso:: `del_calendar`, `get_calendar`, `set_calendar`, - `has_units`, `Units` - - :Returns: - - `bool` - True if the calendar has been set, otherwise False. - - **Examples** - - >>> d = cf.Data(1, "days since 2000-1-1", calendar="noleap") - >>> d.has_calendar() - True - - >>> d = cf.Data(1, calendar="noleap") - >>> d.has_calendar() - True - - >>> d = cf.Data(1, "days since 2000-1-1") - >>> d.has_calendar() - False - - >>> d = cf.Data(1, "m") - >>> d.has_calendar() - False - - """ - return hasattr(self.Units, "calendar") - - def has_deterministic_name(self): - """Whether there is a deterministic name for the data. - - See `get_deterministic_name` for details. - - .. versionadded:: 3.15.1 - - .. seealso:: `get_deterministic_name` - - :Returns: - - `bool` - Whether or not there is a deterministic name. 
- - **Examples** - - >>> d = cf.Data([1, 2, 3], 'm') - >>> d.has_deterministic_name() - True - - """ - return self._custom["deterministic"] - - def has_units(self): - """Whether units have been set. - - .. seealso:: `del_units`, `get_units`, `set_units`, - `has_calendar`, `Units` - - :Returns: - - `bool` - True if units have been set, otherwise False. - - **Examples** - - >>> d = cf.Data(1, "") - >>> d.has_units() - True - - >>> d = cf.Data(1, "m") - >>> d.has_units() - True - - >>> d = cf.Data(1) - >>> d.has_units() - False + return d - >>> d = cf.Data(1, calendar='noleap') - >>> d.has_units() - False + def has_deterministic_name(self): + """Whether there is a deterministic name for the data. - """ - return hasattr(self.Units, "units") + See `get_deterministic_name` for details. - def soften_mask(self): - """Force the mask to soft. + .. versionadded:: 3.15.1 - Whether the mask of a masked array is hard or soft is - determined by its `hardmask` property. `soften_mask` sets - `hardmask` to `False`. + .. seealso:: `get_deterministic_name` - .. versionadded:: 3.14.0 + :Returns: - .. seealso:: `hardmask`, `harden_mask` + `bool` + Whether or not there is a deterministic name. **Examples** - >>> d = cf.Data([1, 2, 3]) - >>> d.hardmask + >>> d = cf.Data([1, 2, 3], 'm') + >>> d.has_deterministic_name() True - >>> d.soften_mask() - >>> d.hardmask - False - - >>> d = cf.Data([1, 2, 3], mask=[False, True, False], hardmask=False) - >>> d.hardmask - False - >>> d[1] = 999 - >>> print(d.array) - [ 1 999 3] """ - # REVIEW: getitem: `soften_mask`: set 'asanyarray' - # 'cf_soften_mask' has its own call to 'cf_asanyarray', so we - # can set '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) - dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) - self._set_dask(dx, clear=_NONE) - self.hardmask = False + return self._custom.get("has_deterministic_name", False) def file_locations(self): """The locations of files containing parts of the data. @@ -9183,7 +6199,6 @@ def file_locations(self): """ out = set() - # REVIEW: getitem: `file_locations`: set 'asanyarray' # The dask graph is never going to be computed, so we can set # '_asanyarray=False'. for key, a in self.todict(_asanyarray=False).items(): @@ -9195,191 +6210,6 @@ def file_locations(self): return out - @_inplace_enabled(default=False) - def filled(self, fill_value=None, inplace=False): - """Replace masked elements with a fill value. - - .. versionadded:: 3.4.0 - - :Parameters: - - fill_value: scalar, optional - The fill value. By default the fill returned by - `get_fill_value` is used, or if this is not set then the - netCDF default fill value for the data type is used (as - defined by `netCDF.fillvals`). - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The filled data, or `None` if the operation was in-place. - - **Examples** - - >>> d = cf.Data([[1, 2, 3]]) - >>> print(d.filled().array) - [[1 2 3]] - >>> d[0, 0] = cf.masked - >>> print(d.filled().array) - [-9223372036854775806 2 3] - >>> d.set_fill_value(-99) - >>> print(d.filled().array) - [[-99 2 3]] - - """ - d = _inplace_enabled_define_and_cleanup(self) - - if fill_value is None: - fill_value = d.get_fill_value(None) - if fill_value is None: # still... 
- fill_value = default_netCDF_fillvals().get(d.dtype.str[1:]) - if fill_value is None and d.dtype.kind in ("SU"): - fill_value = default_netCDF_fillvals().get("S1", None) - - if fill_value is None: - raise ValueError( - "Can't determine fill value for " - f"data type {d.dtype.str!r}" - ) - - # REVIEW: getitem: `filled`: set 'asanyarray' - # 'cf_filled' has its own call to 'cf_asanyarray', so we can - # set '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) - dx = dx.map_blocks(cf_filled, fill_value=fill_value, dtype=d.dtype) - d._set_dask(dx) - - return d - - def first_element(self): - """Return the first element of the data as a scalar. - - .. seealso:: `last_element`, `second_element` - - **Performance** - - If possible, a cached value is returned. Otherwise the delayed - operations needed to compute the element are executed, and - cached for subsequent calls. - - :Returns: - - The first element of the data. - - **Examples** - - >>> d = {{package}}.{{class}}(9.0) - >>> x = d.first_element() - >>> print(x, type(x)) - 9.0 - - >>> d = {{package}}.{{class}}([[1, 2], [3, 4]]) - >>> x = d.first_element() - >>> print(x, type(x)) - 1 - >>> d[0, 0] = {{package}}.masked - >>> y = d.first_element() - >>> print(y, type(y)) - -- - - >>> d = {{package}}.{{class}}(['foo', 'bar']) - >>> x = d.first_element() - >>> print(x, type(x)) - foo - - """ - try: - return self._custom["cached_elements"][0] - except KeyError: - item = super().first_element() - self._set_cached_elements({0: item}) - return item - - def second_element(self): - """Return the second element of the data as a scalar. - - .. seealso:: `first_element`, `last_element` - - **Performance** - - If possible, a cached value is returned. Otherwise the delayed - operations needed to compute the element are executed, and - cached for subsequent calls. - - :Returns: - - The second element of the data. - - **Examples** - - >>> d = {{package}}.{{class}}([[1, 2], [3, 4]]) - >>> x = d.second_element() - >>> print(x, type(x)) - 2 - >>> d[0, 1] = {{package}}.masked - >>> y = d.second_element() - >>> print(y, type(y)) - -- - - >>> d = {{package}}.{{class}}(['foo', 'bar']) - >>> x = d.second_element() - >>> print(x, type(x)) - bar - - """ - try: - return self._custom["cached_elements"][1] - except KeyError: - item = super().second_element() - self._set_cached_elements({1: item}) - return item - - def last_element(self): - """Return the last element of the data as a scalar. - - .. seealso:: `first_element`, `second_element` - - **Performance** - - If possible, a cached value is returned. Otherwise the delayed - operations needed to compute the element are executed, and - cached for subsequent calls. - - :Returns: - - The last element of the data. - - **Examples** - - >>> d = {{package}}.{{class}}(9.0) - >>> x = d.last_element() - >>> print(x, type(x)) - 9.0 - - >>> d = {{package}}.{{class}}([[1, 2], [3, 4]]) - >>> x = d.last_element() - >>> print(x, type(x)) - 4 - >>> d[-1, -1] = {{package}}.masked - >>> y = d.last_element() - >>> print(y, type(y)) - -- - - >>> d = {{package}}.{{class}}(['foo', 'bar']) - >>> x = d.last_element() - >>> print(x, type(x)) - bar - - """ - try: - return self._custom["cached_elements"][-1] - except KeyError: - item = super().last_element() - self._set_cached_elements({-1: item}) - return item - def flat(self, ignore_masked=True): """Return a flat iterator over elements of the data array. 
@@ -9427,138 +6257,7 @@ def flat(self, ignore_masked=True): if not mask[index]: yield self[index].array.item() else: - yield cf_masked - - @_inplace_enabled(default=False) - def flatten(self, axes=None, inplace=False): - """Flatten specified axes of the data. - - Any subset of the axes may be flattened. - - The shape of the data may change, but the size will not. - - The flattening is executed in row-major (C-style) order. For - example, the array ``[[1, 2], [3, 4]]`` would be flattened across - both dimensions to ``[1 2 3 4]``. - - .. versionadded:: 3.0.2 - - .. seealso:: `compressed`, `flat`, `insert_dimension`, `flip`, - `swapaxes`, `transpose` - - :Parameters: - - axes: (sequence of) `int` - Select the axes to be flattened. By default all axes - are flattened. Each axis is identified by its integer - position. No axes are flattened if *axes* is an empty - sequence. - - {{inplace: `bool`, optional}} - - :Returns: - - `Data` or `None` - The flattened data, or `None` if the operation was - in-place. - - **Examples** - - >>> import numpy as np - >>> d = cf.Data(np.arange(24).reshape(1, 2, 3, 4)) - >>> d - - >>> print(d.array) - [[[[ 0 1 2 3] - [ 4 5 6 7] - [ 8 9 10 11]] - [[12 13 14 15] - [16 17 18 19] - [20 21 22 23]]]] - - >>> e = d.flatten() - >>> e - - >>> print(e.array) - [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23] - - >>> e = d.flatten([]) - >>> e - - - >>> e = d.flatten([1, 3]) - >>> e - - >>> print(e.array) - [[[ 0 4 8] - [ 1 5 9] - [ 2 6 10] - [ 3 7 11] - [12 16 20] - [13 17 21] - [14 18 22] - [15 19 23]]] - - >>> d.flatten([0, -1], inplace=True) - >>> d - - >>> print(d.array) - [[[ 0 4 8] - [12 16 20]] - [[ 1 5 9] - [13 17 21]] - [[ 2 6 10] - [14 18 22]] - [[ 3 7 11] - [15 19 23]]] - - """ - d = _inplace_enabled_define_and_cleanup(self) - - ndim = d.ndim - if not ndim: - if axes or axes == 0: - raise ValueError( - "Can't flatten: Can't remove axes from " - f"scalar {self.__class__.__name__}" - ) - - return d - - if axes is None: - axes = list(range(ndim)) - else: - axes = sorted(d._parse_axes(axes)) - - n_axes = len(axes) - if n_axes <= 1: - return d - - dx = d.to_dask_array() - - # It is important that the first axis in the list is the - # left-most flattened axis. - # - # E.g. if the shape is (10, 20, 30, 40, 50, 60) and the axes - # to be flattened are [2, 4], then the data must be - # transposed with order [0, 1, 2, 4, 3, 5] - order = [i for i in range(ndim) if i not in axes] - order[axes[0] : axes[0]] = axes - dx = dx.transpose(order) - - # Find the flattened shape. - # - # E.g. if the *transposed* shape is (10, 20, 30, 50, 40, 60) - # and *transposed* axes [2, 3] are to be flattened then - # the new shape will be (10, 20, 1500, 40, 60) - shape = d.shape - new_shape = [n for i, n in enumerate(shape) if i not in axes] - new_shape.insert(axes[0], reduce(mul, [shape[i] for i in axes], 1)) - - dx = dx.reshape(new_shape) - d._set_dask(dx) - - return d + yield masked @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) @@ -9750,46 +6449,6 @@ def change_calendar(self, calendar, inplace=False, i=False): return d - def chunk_indices(self): - """Return indices that define each dask compute chunk. - - .. versionadded:: 3.15.0 - - .. seealso:: `chunks` - - :Returns: - - `itertools.product` - An iterator over tuples of indices of the data array. - - **Examples** - - >>> d = cf.Data(np.arange(405).reshape(3, 9, 15), - ... chunks=((1, 2), (9,), (4, 5, 6))) - >>> d.npartitions - 6 - >>> for index in d.chunk_indices(): - ... 
print(index) - ... - (slice(0, 1, None), slice(0, 9, None), slice(0, 4, None)) - (slice(0, 1, None), slice(0, 9, None), slice(4, 9, None)) - (slice(0, 1, None), slice(0, 9, None), slice(9, 15, None)) - (slice(1, 3, None), slice(0, 9, None), slice(0, 4, None)) - (slice(1, 3, None), slice(0, 9, None), slice(4, 9, None)) - (slice(1, 3, None), slice(0, 9, None), slice(9, 15, None)) - - """ - from dask.utils import cached_cumsum - - chunks = self.chunks - - cumdims = [cached_cumsum(bds, initial_zero=True) for bds in chunks] - indices = [ - [slice(s, s + dim) for s, dim in zip(starts, shapes)] - for starts, shapes in zip(cumdims, chunks) - ] - return product(*indices) - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def override_units(self, units, inplace=False, i=False): @@ -9831,7 +6490,7 @@ def override_units(self, units, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - d._Units = Units(units) + d._Units = self._Units_class(units) return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -9873,92 +6532,9 @@ def override_calendar(self, calendar, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) - d._Units = Units(d.Units._units, calendar) + d._Units = d._Units_class(d.Units._units, calendar) return d - # REVIEW: getitem: `to_dask_array`: new keyword 'asanyarray' - def to_dask_array(self, apply_mask_hardness=False, _asanyarray=True): - """Convert the data to a `dask` array. - - .. warning:: By default, the mask hardness of the returned - dask array might not be the same as that - specified by the `hardmask` attribute. - - This could cause problems if a subsequent - operation on the returned dask array involves the - un-masking of masked values (such as by indexed - assignment). - - To guarantee that the mask hardness of the - returned dask array is correct, set the - *apply_mask_hardness* parameter to True. - - .. versionadded:: 3.14.0 - - :Parameters: - - apply_mask_hardness: `bool`, optional - If True then force the mask hardness of the returned - array to be that given by the `hardmask` attribute. - - _asanyarray: `bool`, optional - If True (the default) and the `__asanyarray__` - attribute is also `True`, then a `cf_asanyarray` - operation is added to the graph of the returned Dask - array. If False then this operation is not added. - - In general, setting *_asanyarray* to False should only - be done if it is known that a) the returned Dask array - is never going to be computed; or b) it is not - necessary to add a `cf_asanyarray` operation in lieu of - its functionality being implemented by a new Dask graph - layer that is going to be created at a later stage. See - `cf.data.dask_utils.cf_asanyarray` for further details. - - .. versionadded:: NEXTVERSION - - :Returns: - - `dask.array.Array` - The dask array contained within the `Data` instance. 
- - **Examples** - - >>> d = cf.Data([1, 2, 3, 4], 'm') - >>> dx = d.to_dask_array() - >>> dx - >>> dask.array - >>> dask.array.asanyarray(d) is dx - True - - >>> d.to_dask_array(apply_mask_hardness=True) - dask.array - - >>> d = cf.Data([1, 2, 3, 4], 'm', hardmask=False) - >>> d.to_dask_array(apply_mask_hardness=True) - dask.array - - """ - dx = self._custom.get("dask") - if dx is None: - raise ValueError(f"{self.__class__.__name__} object has no data") - - if apply_mask_hardness: - if self.hardmask: - self.harden_mask() - else: - self.soften_mask() - - dx = self._custom["dask"] - # Note: The mask hardness functions have their own calls - # to 'cf_asanyarray', so we don't need to worry about - # setting another one. - elif _asanyarray and self.__asanyarray__: - # Add a new cf_asanyarray layer to the output graph - dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype) - - return dx - def datum(self, *index): """Return an element of the data array as a standard Python scalar. @@ -10101,7 +6677,7 @@ def datum(self, *index): if mask is np.ma.nomask or not mask.item(): return array.item() - return cf_masked + return masked @_inplace_enabled(default=False) def masked_invalid(self, inplace=False): @@ -10136,55 +6712,6 @@ def masked_invalid(self, inplace=False): d._set_dask(dx) return d - def del_calendar(self, default=ValueError()): - """Delete the calendar. - - .. seealso:: `get_calendar`, `has_calendar`, `set_calendar`, - `del_units`, `Units` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - calendar has not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - `str` - The value of the deleted calendar. - - **Examples** - - >>> d = cf.Data(1, "days since 2000-1-1", calendar="noleap") - >>> d.del_calendar() - 'noleap' - >>> print(d.del_calendar()) - None - - >>> d = cf.Data(1, "days since 2000-1-1") - >>> print(d.del_calendar()) - None - - >>> d = cf.Data(1, "m") - Traceback (most recent call last): - ... - ValueError: Units have no calendar - - """ - units = self.Units - if not units.isreftime: - return self._default(default, f"Units {units!r} have no calendar") - - calendar = getattr(units, "calendar", None) - if calendar is None: - return self._default( - default, f"{self.__class__.__name__} has no calendar" - ) - - self.override_calendar(None, inplace=True) - return calendar - def del_file_location(self, location): """Remove a file location in-place. @@ -10216,7 +6743,6 @@ def del_file_location(self, location): updated = False - # REVIEW: getitem: `del_file_location`: set 'asanyarray' # The dask graph is never going to be computed, so we can set # '_asanyarray=False'. dsk = self.todict(_asanyarray=False) @@ -10238,63 +6764,9 @@ def del_file_location(self, location): return location - def del_units(self, default=ValueError()): - """Delete the units. - - .. seealso:: `get_units`, `has_units`, `set_units`, - `del_calendar`, `Units` - - :Parameters: - - default: optional - Return the value of the *default* parameter if the - units has not been set. If set to an `Exception` - instance then it will be raised instead. - - :Returns: - - `str` - The value of the deleted units. - - **Examples** - - >>> d = cf.Data(1, "m") - >>> d.del_units() - 'm' - >>> d.Units - - >>> d.del_units() - Traceback (most recent call last): - ... 
- ValueError: Data has no units - - >>> d = cf.Data(1, "days since 2000-1-1", calendar="noleap") - >>> d.del_units() - 'days since 2000-1-1' - >>> d.Units - - - """ - u = self.Units - units = getattr(u, "units", None) - calendar = getattr(u, "calendar", None) - self.override_units(Units(None, calendar), inplace=True) - - if units is not None: - return units - - return self._default( - default, f"{self.__class__.__name__} has no units" - ) - @classmethod def masked_all( - cls, - shape, - dtype=None, - units=None, - calendar=None, - chunks=_DEFAULT_CHUNKS, + cls, shape, dtype=None, units=None, calendar=None, chunks="auto" ): """Return an empty masked array with all elements masked. @@ -10351,60 +6823,6 @@ def masked_all( d._set_dask(dx) return d - @_inplace_enabled(default=False) - def masked_values(self, value, rtol=None, atol=None, inplace=False): - """Mask using floating point equality. - - Masks the data where elements are approximately equal to the - given value. For integer types, exact equality is used. - - .. versionadded:: 3.16.0 - - .. seealso:: `mask` - - :Parameters: - - value: number - Masking value. - - {{rtol: number, optional}} - - {{atol: number, optional}} - - {{inplace: `bool`, optional}} - - :Returns: - - `{{class}}` or `None` - The result of masking the data where approximately - equal to *value*, or `None` if the operation was - in-place. - - **Examples** - - >>> d = {{package}}.{{class}}([1, 1.1, 2, 1.1, 3]) - >>> e = d.masked_values(1.1) - >>> print(e.array) - [1.0 -- 2.0 -- 3.0] - - """ - d = _inplace_enabled_define_and_cleanup(self) - - if rtol is None: - rtol = self._rtol - else: - rtol = float(rtol) - - if atol is None: - atol = self._atol - else: - atol = float(atol) - - dx = d.to_dask_array() - dx = da.ma.masked_values(dx, value, rtol=rtol, atol=atol) - d._set_dask(dx) - return d - @_inplace_enabled(default=False) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") def mid_range( @@ -10543,19 +6961,6 @@ def inspect(self): `None` - **Examples** - - >>> d = cf.Data([9], 'm') - >>> d.inspect() - - ------------------- - {'_components': {'custom': {'_Units': , - '_axes': ('dim0',), - '_cyclic': set(), - '_hardmask': True, - 'dask': dask.array}, - 'netcdf': {}}} - """ from ..functions import inspect @@ -10633,7 +7038,7 @@ def isclose(self, y, rtol=None, atol=None): d = self.copy() d._set_dask(dx) - d.hardmask = _DEFAULT_HARDMASK + d.hardmask = self._DEFAULT_HARDMASK d.override_units(_units_None, inplace=True) d._update_deterministic(not is_dask_collection(y)) @@ -10701,26 +7106,10 @@ def reshape(self, *shape, merge_chunks=True, limit=None, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() - dx = dx.reshape(*shape, merge_chunks=merge_chunks, limit=limit) - - # Set axes when the new array has more dimensions than self - axes = None - ndim0 = self.ndim - if not ndim0: - axes = generate_axis_identifiers(dx.ndim) - else: - diff = dx.ndim - ndim0 - if diff > 0: - axes = list(self._axes) - for _ in range(diff): - axes.insert(0, new_axis_identifier(tuple(axes))) - - if axes is not None: - d._axes = axes - - d._set_dask(dx) - + super(Data, d).reshape( + *shape, merge_chunks=merge_chunks, limit=limit, inplace=True + ) + # TODODASK: reshape: Need to clear cyclic axes, as we can't help but lose them in this operation return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -10884,7 +7273,7 @@ def round(self, decimals=0, inplace=False, i=False): def stats( self, all=False, - compute=True, + values=True, 
minimum=True, mean=True, median=True, @@ -10923,7 +7312,7 @@ def stats( Calculate all possible statistics, regardless of the value of individual metric parameters. - compute: `bool`, optional + values: `bool`, optional If True (the default), returned values for the statistical calculations in the output dictionary are computed, else each is given in the form of a delayed `Data` operation. @@ -10988,11 +7377,11 @@ def stats( :Returns: `dict` - The statistics, with keys giving the operation names and - values being the result of the corresponding statistical - calculation, which are either the computed numerical - values if `compute` is True, else the delayed `Data` - operations which encapsulate those. + The statistics, with keys giving the operation names + and values being the result of the corresponding + statistical calculation, which are either the computed + numerical values if *values*` is True, else the + delayed `Data` operations which encapsulate those. **Examples** @@ -11040,7 +7429,7 @@ def stats( To ask for delayed operations instead of computed values: - >>> d.stats(compute=False) + >>> d.stats(values=False) {'minimum': , 'mean': , 'median': , @@ -11092,8 +7481,8 @@ def stats( if all or sample_size: out["sample_size"] = delayed(lambda: self.sample_size())() - data_values = globals()["compute"](out)[0] # noqa: F811 - if compute: + data_values = compute(out)[0] + if values: # Convert cf.Data objects holding the scalars (or scalar array # for the case of sample_size only) to scalar values return {op: val.array.item() for op, val in data_values.items()} @@ -11401,10 +7790,9 @@ def where( # Missing values could be affected, so make sure that the mask # hardness has been applied. # - # REVIEW: getitem: `where`: set 'asanyarray' - # 'cf_where' has its own calls to 'cf_asanyarray', so we can + # 'cf_where' has its own calls to 'cfdm_asanyarray', so we can # set '_asanyarray=False'. - dx = d.to_dask_array(apply_mask_hardness=True, _asanyarray=False) + dx = d.to_dask_array(_asanyarray=False) units = d.Units @@ -11419,8 +7807,7 @@ def where( condition = type(self).asdata(condition) condition = where_broadcastable(d, condition, "condition") - # REVIEW: getitem: `where`: set 'asanyarray' - # 'cf_where' has its own calls to 'cf_asanyarray', so we can + # 'cf_where' has its own calls to 'cfdm_asanyarray', so we can # set '_asanyarray=False'. condition = condition.to_dask_array(_asanyarray=False) @@ -11444,7 +7831,7 @@ def where( xy.append(arg) continue - if arg is cf_masked: + if arg is masked: # Replace masked constant with array xy.append(scalar_masked_array(self.dtype)) continue @@ -11465,9 +7852,8 @@ def where( x, y = xy - # REVIEW: getitem: `where`: 'da.asanyarray' is no longer required # Apply the where operation - dx = da.core.elemwise(cf_where, dx, condition, x, y, d.hardmask) + dx = da.core.elemwise(cfdm_where, dx, condition, x, y, d.hardmask) d._set_dask(dx) # Don't know (yet) if 'x' and 'y' have a deterministic names @@ -11655,49 +8041,6 @@ def cosh(self, inplace=False): return d - def cull_graph(self): - """Remove unnecessary tasks from the dask graph in-place. - - **Performance** - - An unnecessary task is one which does not contribute to the - computed result. Such tasks are always automatically removed - (culled) at compute time, but removing them beforehand might - improve performance by reducing the amount of work done in - later steps. - - .. versionadded:: 3.14.0 - - .. 
seealso:: `dask.optimization.cull` - - :Returns: - - `None` - - **Examples** - - >>> d = cf.Data([1, 2, 3, 4, 5], chunks=3) - >>> d = d[:2] - >>> dict(d.to_dask_array().dask) - {('array-21ea057f160746a3d3f0943bba945460', 0): array([1, 2, 3]), - ('array-21ea057f160746a3d3f0943bba945460', 1): array([4, 5]), - ('getitem-3e4edac0a632402f6b45923a6b9d215f', - 0): (, ('array-21ea057f160746a3d3f0943bba945460', - 0), (slice(0, 2, 1),))} - >>> d.cull_graph() - >>> dict(d.to_dask_array().dask) - {('getitem-3e4edac0a632402f6b45923a6b9d215f', - 0): (, ('array-21ea057f160746a3d3f0943bba945460', - 0), (slice(0, 2, 1),)), - ('array-21ea057f160746a3d3f0943bba945460', 0): array([1, 2, 3])} - - """ - # REVIEW: getitem: `cull_graph`: set 'asanyarray' - dx = self.to_dask_array(_asanyarray=False) - dsk, _ = cull(dx.dask, dx.__dask_keys__()) - dx = da.Array(dsk, name=dx.name, chunks=dx.chunks, dtype=dx.dtype) - self._set_dask(dx, clear=_NONE, asanyarray=None) - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def tanh(self, inplace=False): @@ -11799,118 +8142,6 @@ def log(self, base=None, inplace=False, i=False): return d - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") - @_inplace_enabled(default=False) - def squeeze(self, axes=None, inplace=False, i=False): - """Remove size 1 axes from the data array. - - By default all size 1 axes are removed, but particular axes - may be selected with the keyword arguments. - - .. seealso:: `flatten`, `insert_dimension`, `flip`, - `swapaxes`, `transpose` - - :Parameters: - - axes: (sequence of) int, optional - Select the axes. By default all size 1 axes are - removed. The *axes* argument may be one, or a - sequence, of integers that select the axis - corresponding to the given position in the list of - axes of the data array. - - No axes are removed if *axes* is an empty sequence. - - {{inplace: `bool`, optional}} - - {{i: deprecated at version 3.0.0}} - - :Returns: - - `Data` or `None` - The squeezed data array. - - **Examples** - - >>> v.shape - (1,) - >>> v.squeeze() - >>> v.shape - () - - >>> v.shape - (1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1) - >>> v.squeeze((0,)) - >>> v.shape - (2, 1, 3, 1, 4, 1, 5, 1, 6, 1) - >>> v.squeeze(1) - >>> v.shape - (2, 3, 1, 4, 1, 5, 1, 6, 1) - >>> v.squeeze([2, 4]) - >>> v.shape - (2, 3, 4, 5, 1, 6, 1) - >>> v.squeeze([]) - >>> v.shape - (2, 3, 4, 5, 1, 6, 1) - >>> v.squeeze() - >>> v.shape - (2, 3, 4, 5, 6) - - """ - d = _inplace_enabled_define_and_cleanup(self) - - if not d.ndim: - if axes or axes == 0: - raise ValueError( - "Can't squeeze: Can't remove an axis from " - f"scalar {d.__class__.__name__}" - ) - - if inplace: - d = None - - return d - - shape = d.shape - - if axes is None: - iaxes = tuple([i for i, n in enumerate(shape) if n == 1]) - else: - iaxes = d._parse_axes(axes) - - # Check the squeeze axes - for i in iaxes: - if shape[i] > 1: - raise ValueError( - f"Can't squeeze {d.__class__.__name__}: " - f"Can't remove axis of size {shape[i]}" - ) - - if not iaxes: - # Short circuit if the squeeze is a null operation - return d - - # Still here? Then the data array is not scalar and at least - # one size 1 axis needs squeezing. 
- dx = d.to_dask_array() - dx = dx.squeeze(axis=iaxes) - - # Squeezing a dimension doesn't affect the cached elements - d._set_dask(dx, clear=_ALL ^ _CACHE) - - # Remove the squeezed axes names - d._axes = [axis for i, axis in enumerate(d._axes) if i not in iaxes] - - # Update the HDF5 chunking strategy - chunksizes = d.nc_hdf5_chunksizes() - if isinstance(chunksizes, tuple): - chunksizes = [ - size for i, size in enumerate(chunksizes) if i not in iaxes - ] - d.nc_set_hdf5_chunksizes(chunksizes) - - return d - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def tan(self, inplace=False, i=False): @@ -11972,110 +8203,6 @@ def tan(self, inplace=False, i=False): return d - # REVIEW: getitem: `todict`: new keywords 'apply_mask_hardness', 'asanyarray' - def todict( - self, optimize_graph=True, apply_mask_hardness=False, _asanyarray=True - ): - """Return a dictionary of the dask graph key/value pairs. - - .. versionadded:: 3.15.0 - - .. seealso:: `to_dask_array` - - :Parameters: - - optimize_graph: `bool` - If True, the default, then prior to being converted to - a dictionary, the graph is optimised to remove unused - chunks. Note that optimising the graph can add a - considerable performance overhead. - - apply_mask_hardness: `bool`, optional - If True then force the mask hardness of the returned - array to be that given by the `hardmask` attribute. - - .. versionadded:: NEXTVERSION - - _asanyarray: `bool`, optional - If True (the default) and the `__asanyarray__` - attribute is also `True`, then a `cf_asanyarray` - operation is added to the dictionary representation of - the Dask graph. If False then this operation is not - added. See `to_dask_array` for details. - - .. versionadded:: NEXTVERSION - - :Returns: - - `dict` - The dictionary of the dask graph key/value pairs. - - **Examples** - - >>> d = cf.Data([1, 2, 3, 4], chunks=2) - >>> d.todict() - {('array-2f41b21b4cd29f757a7bfa932bf67832', 0): array([1, 2]), - ('array-2f41b21b4cd29f757a7bfa932bf67832', 1): array([3, 4])} - >>> e = d[0] - >>> e.todict() - {('getitem-153fd24082bc067cf438a0e213b41ce6', - 0): (, ('array-2f41b21b4cd29f757a7bfa932bf67832', - 0), (slice(0, 1, 1),)), - ('array-2f41b21b4cd29f757a7bfa932bf67832', 0): array([1, 2])} - >>> e.todict(optimize_graph=False) - {('array-2f41b21b4cd29f757a7bfa932bf67832', 0): array([1, 2]), - ('array-2f41b21b4cd29f757a7bfa932bf67832', 1): array([3, 4]), - ('getitem-153fd24082bc067cf438a0e213b41ce6', - 0): (, ('array-2f41b21b4cd29f757a7bfa932bf67832', - 0), (slice(0, 1, 1),))} - - """ - dx = self.to_dask_array( - apply_mask_hardness=apply_mask_hardness, _asanyarray=_asanyarray - ) - - if optimize_graph: - return collections_to_dsk((dx,), optimize_graph=True) - - return dict(collections_to_dsk((dx,), optimize_graph=False)) - - def tolist(self): - """Return the data as a scalar or (nested) list. - - Returns the data as an ``N``-levels deep nested list of Python - scalars, where ``N`` is the number of data dimensions. - - If ``N`` is 0 then, since the depth of the nested list is 0, - it will not be a list at all, but a simple Python scalar. - - .. sealso:: `todict` - - :Returns: - - `list` or scalar - The (nested) list of array elements, or a scalar if - the data has 0 dimensions. 
- - **Examples** - - >>> d = cf.Data(9) - >>> d.tolist() - 9 - - >>> d = cf.Data([1, 2]) - >>> d.tolist() - [1, 2] - - >>> d = cf.Data(([[1, 2], [3, 4]])) - >>> d.tolist() - [[1, 2], [3, 4]] - - >>> d.equals(cf.Data(d.tolist())) - True - - """ - return self.array.tolist() - def to_memory(self): """Bring data on disk into memory. @@ -12087,81 +8214,6 @@ def to_memory(self): "Consider using 'Data.persist' instead." ) - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") - @_inplace_enabled(default=False) - def transpose(self, axes=None, inplace=False, i=False): - """Permute the axes of the data array. - - .. seealso:: `flatten', `insert_dimension`, `flip`, `squeeze`, - `swapaxes` - - :Parameters: - - axes: (sequence of) `int` - The new axis order of the data array. By default the order - is reversed. Each axis of the new order is identified by - its original integer position. - - {{inplace: `bool`, optional}} - - {{i: deprecated at version 3.0.0}} - - :Returns: - - `Data` or `None` - - **Examples** - - >>> d.shape - (19, 73, 96) - >>> d.transpose() - >>> d.shape - (96, 73, 19) - >>> d.transpose([1, 0, 2]) - >>> d.shape - (73, 96, 19) - >>> d.transpose((-1, 0, 1)) - >>> d.shape - (19, 73, 96) - - """ - d = _inplace_enabled_define_and_cleanup(self) - - ndim = d.ndim - if axes is None: - iaxes = tuple(range(ndim - 1, -1, -1)) - else: - iaxes = d._parse_axes(axes) - - if iaxes == tuple(range(ndim)): - # Short circuit if the transpose is a null operation - return d - - # Note: The _axes attribute is important because e.g. axes - # labelled as cyclic by the _cyclic attribute use it to - # determine their position (see #discussion_r694096462 - # on PR #247). - data_axes = d._axes - d._axes = [data_axes[i] for i in iaxes] - - dx = d.to_dask_array() - try: - dx = da.transpose(dx, axes=axes) - except ValueError: - raise ValueError( - f"Can't transpose: Axes don't match array: {axes}" - ) - - d._set_dask(dx) - - # Update the HDF5 chunking strategy - chunksizes = d.nc_hdf5_chunksizes() - if isinstance(chunksizes, tuple): - chunksizes = [chunksizes[i] for i in axes] - d.nc_set_hdf5_chunksizes(chunksizes) - - return d - @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) def trunc(self, inplace=False, i=False): @@ -12200,64 +8252,6 @@ def trunc(self, inplace=False, i=False): d._set_dask(dx) return d - @classmethod - def empty( - cls, - shape, - dtype=None, - units=None, - calendar=None, - fill_value=None, - chunks=_DEFAULT_CHUNKS, - ): - """Return a new array of given shape and type, without - initialising entries. - - .. seealso:: `full`, `ones`, `zeros` - - :Parameters: - - shape: `int` or `tuple` of `int` - The shape of the new array. e.g. ``(2, 3)`` or ``2``. - - dtype: data-type - The desired output data-type for the array, e.g. - `numpy.int8`. The default is `numpy.float64`. - - units: `str` or `Units` - The units for the new data array. - - calendar: `str`, optional - The calendar for reference time units. - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - .. versionadded:: 3.14.0 - - fill_value: deprecated at version 3.14.0 - Use `set_fill_value` instead. - - :Returns: - - `Data` - Array of uninitialised (arbitrary) data of the given - shape and dtype. 
- - **Examples** - - >>> d = cf.Data.empty((2, 2)) - >>> print(d.array) - [[ -9.74499359e+001 6.69583040e-309], - [ 2.13182611e-314 3.06959433e-309]] #uninitialised - - >>> d = cf.Data.empty((2,), dtype=bool) - >>> print(d.array) - [ False True] #uninitialised - - """ - dx = da.empty(shape, dtype=dtype, chunks=chunks) - return cls(dx, units=units, calendar=calendar) - @classmethod def full( cls, @@ -12266,7 +8260,7 @@ def full( dtype=None, units=None, calendar=None, - chunks=_DEFAULT_CHUNKS, + chunks="auto", ): """Return a new array of given shape and type, filled with a fill value. @@ -12328,14 +8322,7 @@ def full( return cls(dx, units=units, calendar=calendar) @classmethod - def ones( - cls, - shape, - dtype=None, - units=None, - calendar=None, - chunks=_DEFAULT_CHUNKS, - ): + def ones(cls, shape, dtype=None, units=None, calendar=None, chunks="auto"): """Returns a new array filled with ones of set shape and type. .. seealso:: `empty`, `full`, `zeros` @@ -12381,12 +8368,7 @@ def ones( @classmethod def zeros( - cls, - shape, - dtype=None, - units=None, - calendar=None, - chunks=_DEFAULT_CHUNKS, + cls, shape, dtype=None, units=None, calendar=None, chunks="auto" ): """Returns a new array filled with zeros of set shape and type. @@ -13433,11 +9415,6 @@ def sqrt(self, dtype=None, inplace=False): # ---------------------------------------------------------------- # Aliases # ---------------------------------------------------------------- - @property - def dtarray(self): - """Alias for `datetime_array`""" - return self.datetime_array - @_inplace_enabled(default=False) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") def maximum( diff --git a/cf/data/utils.py b/cf/data/utils.py index 2c34757c22..8888178133 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -21,303 +21,6 @@ _units_None = Units(None) -def is_numeric_dtype(array): - """True if the given array is of a numeric or boolean data type. - - .. versionadded:: 3.14.0 - - :Parameters: - - array: numpy-like array - - :Returns: - - `bool` - Whether or not the array holds numeric elements. - - **Examples** - - >>> a = np.array([0, 1, 2]) - >>> cf.data.utils.is_numeric_dtype(a) - True - >>> a = np.array([False, True, True]) - >>> cf.data.utils.is_numeric_dtype(a) - True - >>> a = np.array(["a", "b", "c"], dtype="S1") - >>> cf.data.utils.is_numeric_dtype(a) - False - >>> a = np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]) - >>> cf.data.utils.is_numeric_dtype(a) - True - >>> a = np.array(10) - >>> cf.data.utils.is_numeric_dtype(a) - True - >>> a = np.empty(1, dtype=object) - >>> cf.data.utils.is_numeric_dtype(a) - False - - """ - dtype = array.dtype - - # This checks if the dtype is either a standard "numeric" type (i.e. - # int types, floating point types or complex floating point types) - # or Boolean, which are effectively a restricted int type (0 or 1). - # We determine the former by seeing if it sits under the 'np.number' - # top-level dtype in the NumPy dtype hierarchy; see the - # 'Hierarchy of type objects' figure diagram under: - # https://numpy.org/doc/stable/reference/arrays.scalars.html#scalars - return np.issubdtype(dtype, np.number) or np.issubdtype(dtype, np.bool_) - - -def convert_to_datetime(a, units): - """Convert a dask array of numbers to one of date-time objects. - - .. versionadded:: 3.14.0 - - .. seealso `convert_to_reftime` - - :Parameters: - - a: `dask.array.Array` - The input numeric reference time values. - - units: `Units` - The reference time units that define the output - date-time objects. 
- - :Returns: - - `dask.array.Array` - A new dask array containing date-time objects. - - **Examples** - - >>> import dask.array as da - >>> d = da.from_array(2.5) - >>> e = cf.data.utils.convert_to_datetime(d, cf.Units("days since 2000-12-01")) - >>> print(e.compute()) - 2000-12-03 12:00:00 - - """ - return a.map_blocks( - partial(rt2dt, units_in=units), - dtype=object, - meta=np.array((), dtype=object), - ) - - -def convert_to_reftime(a, units=None, first_value=None): - """Convert a dask array of string or object date-times to floating - point reference times. - - .. versionadded:: 3.14.0 - - .. seealso `convert_to_datetime` - - :Parameters: - - a: `dask.array.Array` - - units: `Units`, optional - Specify the units for the output reference time - values. By default the units are inferred from the first - non-missing value in the array, or set to ```` if all values are missing. - - first_value: optional - If set, then assumed to be equal to the first non-missing - value of the array, thereby removing the need to find it - by inspection of *a*, which may be expensive. By default - the first non-missing value is found from *a*. - - :Returns: - - (`dask.array.Array`, `Units`) - The reference times, and their units. - - >>> import dask.array as da - >>> d = da.from_array(2.5) - >>> e = cf.data.utils.convert_to_datetime(d, cf.Units("days since 2000-12-01")) - - >>> f, u = cf.data.utils.convert_to_reftime(e) - >>> f.compute() - 0.5 - >>> u - - - >>> f, u = cf.data.utils.convert_to_reftime(e, cf.Units("days since 1999-12-01")) - >>> f.compute() - 368.5 - >>> u - - - """ - kind = a.dtype.kind - if kind in "US": - # Convert date-time strings to reference time floats - if not units: - first_value = first_non_missing_value(a, cached=first_value) - if first_value is not None: - YMD = str(first_value).partition("T")[0] - else: - YMD = "1970-01-01" - - units = Units( - "days since " + YMD, - getattr(units, "calendar", default_calendar), - ) - - a = a.map_blocks( - partial(st2rt, units_in=units, units_out=units), dtype=float - ) - - elif kind == "O": - # Convert date-time objects to reference time floats - first_value = first_non_missing_value(a, cached=first_value) - if first_value is not None: - x = first_value - else: - x = dt(1970, 1, 1, calendar=default_calendar) - - x_since = "days since " + "-".join(map(str, (x.year, x.month, x.day))) - x_calendar = getattr(x, "calendar", default_calendar) - - d_calendar = getattr(units, "calendar", None) - d_units = getattr(units, "units", None) - - if x_calendar != "": - if not units: - d_calendar = x_calendar - elif not units.equivalent(Units(x_since, x_calendar)): - raise ValueError( - "Incompatible units: " - f"{units!r}, {Units(x_since, x_calendar)!r}" - ) - - if not units: - # Set the units to something that is (hopefully) close to - # all of the datetimes, in an attempt to reduce errors - # arising from the conversion to reference times - units = Units(x_since, calendar=d_calendar) - else: - units = Units(d_units, calendar=d_calendar) - - # Convert the date-time objects to reference times - a = a.map_blocks(dt2rt, units_in=None, units_out=units, dtype=float) - - if not units.isreftime: - raise ValueError( - f"Can't create a reference time array with units {units!r}" - ) - - return a, units - - -def first_non_missing_value(a, cached=None, method="index"): - """Return the first non-missing value of a dask array. - - .. versionadded:: 3.14.0 - - :Parameters: - - a: `dask.array.Array` - The array to be inspected. 
- - cached: scalar, optional - If set to a value other than `None`, then return without - inspecting the array. This allows a previously found first - value to be used instead of a potentially costly array - access. - - method: `str`, optional - Select the method used to find the first non-missing - value. - - The default ``'index'`` method evaulates sequentially the - elements of the flattened array and returns when the first - non-missing value is found. - - The ``'mask'`` method finds the first non-missing value of - the flattened array as that which has the same location as - the first False element of the flattened array mask. - - It is considered likely that the ``'index'`` method is - fastest for data for which the first element is not - missing, but this may not always be the case. - - :Returns: - - If set, then *cached* is returned. Otherwise returns the - first non-missing value of *a*, or `None` if there isn't - one. - - **Examples** - - >>> import dask.array as da - >>> d = da.arange(8).reshape(2, 4) - >>> print(d.compute()) - [[0 1 2 3] - [4 5 6 7]] - >>> cf.data.utils.first_non_missing_value(d) - 0 - >>> cf.data.utils.first_non_missing_value(d, cached=99) - 99 - >>> d[0, 0] = cf.masked - >>> cf.data.utils.first_non_missing_value(d) - 1 - >>> d[0, :] = cf.masked - >>> cf.data.utils.first_non_missing_value(d) - 4 - >>> cf.data.utils.first_non_missing_value(d, cached=99) - 99 - >>> d[...] = cf.masked - >>> print(cf.data.utils.first_non_missing_value(d)) - None - >>> print(cf.data.utils.first_non_missing_value(d, cached=99)) - 99 - - """ - if cached is not None: - return cached - - if method == "index": - shape = a.shape - for i in range(a.size): - index = np.unravel_index(i, shape) - x = a[index].compute() - if not (x is np.ma.masked or np.ma.getmask(x)): - try: - return x.item() - except AttributeError: - return x - - return - - if method == "mask": - mask = da.ma.getmaskarray(a) - if not a.ndim: - # Scalar data - if mask: - return - - a = a.compute() - try: - return a.item() - except AttributeError: - return a - - x = a[da.unravel_index(mask.argmin(), a.shape)].compute() - if x is np.ma.masked: - return - - try: - return x.item() - except AttributeError: - return x - - raise ValueError(f"Unknown value of 'method': {method!r}") - - def unique_calendars(a): """Find the unique calendars from a dask array of date-time objects. @@ -360,162 +63,6 @@ def _get_calendar(x): return set(out) -@lru_cache(maxsize=32) -def new_axis_identifier(existing_axes=(), basename="dim"): - """Return a new, unique axis identifier. - - The name is arbitrary and has no semantic meaning. - - .. versionadded:: 3.14.0 - - :Parameters: - - existing_axes: sequence of `str`, optional - Any existing axis names that are not to be duplicated. - - basename: `str`, optional - The root of the new axis identifier. The new axis - identifier will be this root followed by an integer. - - :Returns: - - `str` - The new axis idenfifier. 
- - **Examples** - - >>> cf.data.utils.new_axis_identifier() - 'dim0' - >>> cf.data.utils.new_axis_identifier(['dim0']) - 'dim1' - >>> cf.data.utils.new_axis_identifier(['dim3']) - 'dim1' - >>> cf.data.utils.new_axis_identifier(['dim1']) - 'dim2' - >>> cf.data.utils.new_axis_identifier(['dim1', 'dim0']) - 'dim2' - >>> cf.data.utils.new_axis_identifier(['dim3', 'dim4']) - 'dim2' - >>> cf.data.utils.new_axis_identifier(['dim2', 'dim0']) - 'dim3' - >>> cf.data.utils.new_axis_identifier(['dim3', 'dim4', 'dim0']) - 'dim5' - >>> cf.data.utils.new_axis_identifier(basename='axis') - 'axis0' - >>> cf.data.utils.new_axis_identifier(basename='axis') - 'axis0' - >>> cf.data.utils.new_axis_identifier(['dim0'], basename='axis') - 'axis1' - >>> cf.data.utils.new_axis_identifier(['dim0', 'dim1'], basename='axis') - 'axis2' - - """ - n = len(existing_axes) - axis = f"{basename}{n}" - while axis in existing_axes: - n += 1 - axis = f"{basename}{n}" - - return axis - - -def chunk_positions(chunks): - """Find the position of each chunk. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunk_indices`, `chunk_locations`, `chunk_shapes` - - :Parameters: - - chunks: `tuple` - The chunk sizes along each dimension, as output by - `dask.array.Array.chunks`. - - **Examples** - - >>> chunks = ((1, 2), (9,), (44, 55, 66)) - >>> for position in cf.data.utils.chunk_positions(chunks): - ... print(position) - ... - (0, 0, 0) - (0, 0, 1) - (0, 0, 2) - (1, 0, 0) - (1, 0, 1) - (1, 0, 2) - - """ - return product(*(range(len(bds)) for bds in chunks)) - - -def chunk_shapes(chunks): - """Find the shape of each chunk. - - .. versionadded:: 3.14.0 - - .. seealso:: `chunk_indices`, `chunk_locations`, `chunk_positions` - - :Parameters: - - chunks: `tuple` - The chunk sizes along each dimension, as output by - `dask.array.Array.chunks`. - - **Examples** - - >>> chunks = ((1, 2), (9,), (4, 5, 6)) - >>> for shape in cf.data.utils.chunk_shapes(chunks): - ... print(shape) - ... - (1, 9, 4) - (1, 9, 5) - (1, 9, 6) - (2, 9, 4) - (2, 9, 5) - (2, 9, 6) - - """ - return product(*chunks) - - -def chunk_locations(chunks): - """Find the shape of each chunk. - - .. versionadded:: 3.15.0 - - .. seealso:: `chunk_indices`, `chunk_positions`, `chunk_shapes` - - :Parameters: - - chunks: `tuple` - The chunk sizes along each dimension, as output by - `dask.array.Array.chunks`. - - **Examples** - - >>> chunks = ((1, 2), (9,), (4, 5, 6)) - >>> for location in cf.data.utils.chunk_locations(chunks): - ... print(location) - ... - ((0, 1), (0, 9), (0, 4)) - ((0, 1), (0, 9), (4, 9)) - ((0, 1), (0, 9), (9, 15)) - ((1, 3), (0, 9), (0, 4)) - ((1, 3), (0, 9), (4, 9)) - ((1, 3), (0, 9), (9, 15)) - - """ - from dask.utils import cached_cumsum - - cumdims = [cached_cumsum(bds, initial_zero=True) for bds in chunks] - locations = [ - [(s, s + dim) for s, dim in zip(starts, shapes)] - for starts, shapes in zip(cumdims, chunks) - ] - return product(*locations) - - def scalar_masked_array(dtype=float): """Return a scalar masked array. @@ -869,8 +416,6 @@ def collapse( "keepdims": keepdims, "split_every": split_every, "mtol": mtol, - # REVIEW: active: `collapse`: pass the active storage status onto the collapse functions - # "active_storage": d.active_storage, } weights = parse_weights(d, weights, axis) @@ -880,9 +425,8 @@ def collapse( if ddof is not None: kwargs["ddof"] = ddof - # REVIEW: getitem: `collapse`: set 'asanyarray' # The applicable chunk function will have its own call to - # 'cf_asanyarray', so we can set '_asanyarray=False'. 
Also, setting + # 'cfdm_asanyarray', so we can set '_asanyarray=False'. Also, setting # _asanyarray=False will ensure that any active storage operations # are not compromised. dx = d.to_dask_array(_asanyarray=False) @@ -998,7 +542,7 @@ def parse_weights(d, weights, axis=None): w = [] shape = d.shape axes = d._axes - # REVIEW: active: `parse_weights`: minor refactor + Data = type(d) for key, value in weights.items(): value = Data.asdata(value) @@ -1015,52 +559,3 @@ def parse_weights(d, weights, axis=None): # Return the product of the weights components, which will be # broadcastable to d return reduce(mul, w) - - -def normalize_chunks(chunks, shape=None, dtype=None): - """Normalize chunks to tuple of tuples. - - The shape may contain sizes of ``nan``. This could occur when the - underlying data is compressed in a way which makes the shape - impossible to infer without actually uncompressing the data. - - If *shape* contains no ``nan`` sizes then this function is - identical to `dask.array.core.normalize_chunks`. If it does, then - the output chunks for each such axis will be ``(nan,)``. - - .. versionadded 3.16.0 - - :Parameters: - - chunks: tuple, int, dict, or string - The chunks to be normalized. See - `dask.array.core.normalize_chunks` for details. - - shape: `tuple` - The shape of the data. - - dtype: data-type - The data-type for the data. - - :Returns: - - `tuple` - The normalized chunks. - - """ - from math import isnan, nan - - from dask.array.core import normalize_chunks - - if not any(map(isnan, shape)): - return normalize_chunks(chunks, shape=shape, dtype=dtype) - - out = [ - ( - (nan,) - if isnan(size) - else normalize_chunks(chunk, shape=(size,), dtype=dtype)[0] - ) - for chunk, size in zip(chunks, shape) - ] - return tuple(out) diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index 269b9d79ef..fa72e0c0cb 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -7,7 +7,6 @@ from ..cfdatetime import dt from ..data import Data -from ..data.data import _DEFAULT_CHUNKS from ..decorators import ( _deprecated_kwarg_check, _inplace_enabled, @@ -5392,7 +5391,7 @@ def override_units(self, units, inplace=False, i=False): @_inplace_enabled(default=False) def rechunk( self, - chunks=_DEFAULT_CHUNKS, + chunks="auto", threshold=None, block_size_limit=None, balance=False, diff --git a/cf/mixin/propertiesdatabounds.py b/cf/mixin/propertiesdatabounds.py index a5581ca478..b369db4336 100644 --- a/cf/mixin/propertiesdatabounds.py +++ b/cf/mixin/propertiesdatabounds.py @@ -4,7 +4,6 @@ from cfdm import is_log_level_debug, is_log_level_info from ..data import Data -from ..data.data import _DEFAULT_CHUNKS from ..decorators import ( _deprecated_kwarg_check, _inplace_enabled, @@ -4052,7 +4051,7 @@ def persist(self, bounds=True, inplace=False): @_inplace_enabled(default=False) def rechunk( self, - chunks=_DEFAULT_CHUNKS, + chunks="auto", threshold=None, block_size_limit=None, balance=False, diff --git a/cf/mixin2/container.py b/cf/mixin2/container.py index 44397301f4..c5f0081462 100644 --- a/cf/mixin2/container.py +++ b/cf/mixin2/container.py @@ -6,6 +6,7 @@ """ from ..docstring import _docstring_substitution_definitions +from ..functions import atol, rtol class Container: @@ -54,3 +55,23 @@ def __docstring_package_depth__(self): """ return 0 + + @property + def _atol(self): + """Internal alias for `{{package}}.atol`. + + An alias is necessary to avoid a name clash with the keyword + argument of identical name (`atol`) in calling functions. 
+ + """ + return atol().value + + @property + def _rtol(self): + """Internal alias for `{{package}}.rtol`. + + An alias is necessary to avoid a name clash with the keyword + argument of identical name (`rtol`) in calling functions. + + """ + return rtol().value diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index dd0fb89c1e..dedd3e6ead 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -209,22 +209,22 @@ def _create_data( if data.npartitions == 1: data._cfa_set_write(True) - # REVIEW: h5: `_create_data`: control caching - if ( - not compression_index - and self.read_vars.get("cache") - and self.implementation.get_construct_type(construct) - != "field" - ): - # Only cache values from non-field data and - # non-compression-index data, on the assumptions that: - # - # a) Field data is, in general, so large that finding - # the cached values takes too long. - # - # b) Cached values are never really required for - # compression index data. - self._cache_data_elements(data, ncvar) + # # REVIEW: h5: `_create_data`: control caching + # if ( + # not compression_index + # and self.read_vars.get("cache") + # and self.implementation.get_construct_type(construct) + # != "field" + # ): + # # Only cache values from non-field data and + # # non-compression-index data, on the assumptions that: + # # + # # a) Field data is, in general, so large that finding + # # the cached values takes too long. + # # + # # b) Cached values are never really required for + # # compression index data. + # self._cache_data_elements(data, ncvar) return data @@ -254,7 +254,6 @@ def _create_data( coord_ncvar=coord_ncvar, ) - # REVIEW: h5: `_create_data`: replace units/calendar API with attributes attributes = kwargs["attributes"] data = self._create_Data( cfa_array, @@ -263,7 +262,6 @@ def _create_data( calendar=attributes.get("calendar"), ) - # REVIEW: h5: `_create_data`: don't cache data from CFA variables # Note: We don't cache elements from CFA variables, because # the data are in fragment files which have not been # opened and may not not even be openable (such as could @@ -317,98 +315,99 @@ def _is_cfa_variable(self, ncvar): and ncvar not in g["external_variables"] ) - def _create_Data( - self, - array, - ncvar, - units=None, - calendar=None, - ncdimensions=(), - **kwargs, - ): - """Create a Data object from a netCDF variable. - - .. versionadded:: 3.0.0 - - :Parameters: - - array: `Array` - The file array. - - ncvar: `str` - The netCDF variable containing the array. - - units: `str`, optional - The units of *array*. By default, or if `None`, it is - assumed that there are no units. - - calendar: `str`, optional - The calendar of *array*. By default, or if `None`, it is - assumed that there is no calendar. - - ncdimensions: sequence of `str`, optional - The netCDF dimensions spanned by the array. - - .. versionadded:: 3.14.0 - - kwargs: optional - Extra parameters to pass to the initialisation of the - returned `Data` object. - - :Returns: - - `Data` - - """ - if array.dtype is None: - # The array is based on a netCDF VLEN variable, and - # therefore has unknown data type. To find the correct - # data type (e.g. "=1) netCDF string type variable comes out - # as a numpy object array, so convert it to numpy - # string array. 
- array = array.astype("U", copy=False) - # NetCDF4 doesn't auto-mask VLEN variables - array = np.ma.where(array == "", np.ma.masked, array) - - # Parse dask chunks - chunks = self._parse_chunks(ncvar) - - data = super()._create_Data( - array, - ncvar, - units=units, - calendar=calendar, - chunks=chunks, - **kwargs, - ) - - return data + # def _create_Data( + # self, + # array, + # ncvar, + # units=None, + # calendar=None, + # ncdimensions=(), + # **kwargs, + # ): + # """Create a Data object from a netCDF variable. + # + # .. versionadded:: 3.0.0 + # + # :Parameters: + # + # array: `Array` + # The file array. + # + # ncvar: `str` + # The netCDF variable containing the array. + # + # units: `str`, optional + # The units of *array*. By default, or if `None`, it is + # assumed that there are no units. + # + # calendar: `str`, optional + # The calendar of *array*. By default, or if `None`, it is + # assumed that there is no calendar. + # + # ncdimensions: sequence of `str`, optional + # The netCDF dimensions spanned by the array. + # + # .. versionadded:: 3.14.0 + # + # kwargs: optional + # Extra parameters to pass to the initialisation of the + # returned `Data` object. + # + # :Returns: + # + # `Data` + # + # """ + # if array.dtype is None: + # # The array is based on a netCDF VLEN variable, and + # # therefore has unknown data type. To find the correct + # # data type (e.g. "=1) netCDF string type variable comes out + # # as a numpy object array, so convert it to numpy + # # string array. + # array = array.astype("U", copy=False) + # # NetCDF4 doesn't auto-mask VLEN variables + # array = np.ma.where(array == "", np.ma.masked, array) + # + # # Parse dask chunks + ## chunks = self._parse_chunks(ncvar) + # chunks = self._dask_chunks(array, ncvar, compressed) + # + # data = super()._create_Data( + # array, + # ncvar, + # units=units, + # calendar=calendar, + # chunks=chunks, + # **kwargs, + # ) + # + # return data def _customise_read_vars(self): """Customise the read parameters. @@ -470,161 +469,160 @@ def _customise_read_vars(self): for term_ncvar in parsed_aggregated_data.values(): g["do_not_create_field"].add(term_ncvar) - def _cache_data_elements(self, data, ncvar): - """Cache selected element values. - - Updates *data* in-place to store its first, second, - penultimate, and last element values (as appropriate). - - These values are used by, amongst other things, - `cf.Data.equals`, `cf.aggregate` and for inspection. - - Doing this here is quite cheap because only the individual - elements are read from the already-open file, as opposed to - being retrieved from *data* (which would require a whole dask - chunk to be read to get each single value). - - However, empirical evidence shows that using netCDF4 to access - the first and last elements of a large array on disk - (e.g. shape (1, 75, 1207, 1442)) is slow (e.g. ~2 seconds) and - doesn't scale well with array size (i.e. it takes - disproportionally longer for larger arrays). Such arrays are - usually in field constructs, for which `cf.aggregate` does not - need to know any array values, so this method should be used - with caution, if at all, on field construct data. - - .. versionadded:: 3.14.0 - - :Parameters: - - data: `Data` - The data to be updated with its cached values. - - ncvar: `str` - The name of the netCDF variable that contains the - data. - - :Returns: - - `None` - - """ - - if data.data.get_compression_type(): - # Don't get cached elements from arrays compressed by - # convention, as they'll likely be wrong. 
- return - - g = self.read_vars + # def _cache_data_elements(self, data, ncvar): + # """Cache selected element values. + # + # Updates *data* in-place to store its first, second, + # penultimate, and last element values (as appropriate). + # + # These values are used by, amongst other things, + # `cf.Data.equals`, `cf.aggregate` and for inspection. + # + # Doing this here is quite cheap because only the individual + # elements are read from the already-open file, as opposed to + # being retrieved from *data* (which would require a whole dask + # chunk to be read to get each single value). + # + # However, empirical evidence shows that using netCDF4 to access + # the first and last elements of a large array on disk + # (e.g. shape (1, 75, 1207, 1442)) is slow (e.g. ~2 seconds) and + # doesn't scale well with array size (i.e. it takes + # disproportionally longer for larger arrays). Such arrays are + # usually in field constructs, for which `cf.aggregate` does not + # need to know any array values, so this method should be used + # with caution, if at all, on field construct data. + # + # .. versionadded:: 3.14.0 + # + # :Parameters: + # + # data: `Data` + # The data to be updated with its cached values. + # + # ncvar: `str` + # The name of the netCDF variable that contains the + # data. + # + # :Returns: + # + # `None` + # + # """ + # + # if data.data.get_compression_type(): + # # Don't get cached elements from arrays compressed by + # # convention, as they'll likely be wrong. + # return + # + # g = self.read_vars + # + # # Get the netCDF4.Variable for the data + # if g["has_groups"]: + # group, name = self._netCDF4_group( + # g["variable_grouped_dataset"][ncvar], ncvar + # ) + # variable = group.variables.get(name) + # else: + # variable = g["variables"].get(ncvar) + # + # # Get the required element values + # size = data.size + # ndim = data.ndim + # + # char = False + # if variable.ndim == ndim + 1: + # dtype = variable.dtype + # if dtype is not str and dtype.kind in "SU": + # # This variable is a netCDF classic style char array + # # with a trailing dimension that needs to be collapsed + # char = True + # + # if ndim == 1: + # # Also cache the second element for 1-d data, on the + # # assumption that they may well be dimension coordinate + # # data. + # if size == 1: + # indices = (0, -1) + # value = variable[...] + # values = (value, value) + # elif size == 2: + # indices = (0, 1, -1) + # value = variable[-1:] + # values = (variable[:1], value, value) + # else: + # indices = (0, 1, -1) + # values = (variable[:1], variable[1:2], variable[-1:]) + # elif ndim == 2 and data.shape[-1] == 2: + # # Assume that 2-d data with a last dimension of size 2 + # # contains coordinate bounds, for which it is useful to + # # cache the upper and lower bounds of the the first and + # # last cells. + # indices = (0, 1, -2, -1) + # ndim1 = ndim - 1 + # values = ( + # variable[(slice(0, 1),) * ndim1 + (slice(0, 1),)], + # variable[(slice(0, 1),) * ndim1 + (slice(1, 2),)], + # ) + # if data.size == 2: + # values = values + values + # else: + # values += ( + # variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 1),)], + # variable[(slice(-1, None, 1),) * ndim1 + (slice(1, 2),)], + # ) + # elif size == 1: + # indices = (0, -1) + # value = variable[...] 
+ # values = (value, value) + # elif size == 3: + # indices = (0, 1, -1) + # if char: + # values = variable[...].reshape(3, variable.shape[-1]) + # else: + # values = variable[...].flatten() + # else: + # indices = (0, -1) + # values = ( + # variable[(slice(0, 1),) * ndim], + # variable[(slice(-1, None, 1),) * ndim], + # ) + # + # # Create a dictionary of the element values + # elements = {} + # for index, value in zip(indices, values): + # if char: + # # Variable is a netCDF classic style char array, so + # # collapse (by concatenation) the outermost (fastest + # # varying) dimension. E.g. [['a','b','c']] becomes + # # ['abc'] + # if value.dtype.kind == "U": + # value = value.astype("S") + # + # a = netCDF4.chartostring(value) + # shape = a.shape + # a = np.array([x.rstrip() for x in a.flat]) + # a = np.reshape(a, shape) + # value = np.ma.masked_where(a == "", a) + # + # if np.ma.is_masked(value): + # value = np.ma.masked + # else: + # try: + # value = value.item() + # except (AttributeError, ValueError): + # # AttributeError: A netCDF string type scalar + # # variable comes out as Python str object, which + # # has no 'item' method. + # # + # # ValueError: A size-0 array can't be converted to + # # a Python scalar. + # pass + # + # elements[index] = value + # + # # Store the elements in the data object + # data._set_cached_elements(elements) - # Get the netCDF4.Variable for the data - if g["has_groups"]: - group, name = self._netCDF4_group( - g["variable_grouped_dataset"][ncvar], ncvar - ) - variable = group.variables.get(name) - else: - variable = g["variables"].get(ncvar) - - # Get the required element values - size = data.size - ndim = data.ndim - - char = False - if variable.ndim == ndim + 1: - dtype = variable.dtype - if dtype is not str and dtype.kind in "SU": - # This variable is a netCDF classic style char array - # with a trailing dimension that needs to be collapsed - char = True - - if ndim == 1: - # Also cache the second element for 1-d data, on the - # assumption that they may well be dimension coordinate - # data. - if size == 1: - indices = (0, -1) - value = variable[...] - values = (value, value) - elif size == 2: - indices = (0, 1, -1) - value = variable[-1:] - values = (variable[:1], value, value) - else: - indices = (0, 1, -1) - values = (variable[:1], variable[1:2], variable[-1:]) - elif ndim == 2 and data.shape[-1] == 2: - # Assume that 2-d data with a last dimension of size 2 - # contains coordinate bounds, for which it is useful to - # cache the upper and lower bounds of the the first and - # last cells. - indices = (0, 1, -2, -1) - ndim1 = ndim - 1 - values = ( - variable[(slice(0, 1),) * ndim1 + (slice(0, 1),)], - variable[(slice(0, 1),) * ndim1 + (slice(1, 2),)], - ) - if data.size == 2: - values = values + values - else: - values += ( - variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 1),)], - variable[(slice(-1, None, 1),) * ndim1 + (slice(1, 2),)], - ) - elif size == 1: - indices = (0, -1) - value = variable[...] - values = (value, value) - elif size == 3: - indices = (0, 1, -1) - if char: - values = variable[...].reshape(3, variable.shape[-1]) - else: - values = variable[...].flatten() - else: - indices = (0, -1) - values = ( - variable[(slice(0, 1),) * ndim], - variable[(slice(-1, None, 1),) * ndim], - ) - - # Create a dictionary of the element values - elements = {} - for index, value in zip(indices, values): - if char: - # Variable is a netCDF classic style char array, so - # collapse (by concatenation) the outermost (fastest - # varying) dimension. 
E.g. [['a','b','c']] becomes - # ['abc'] - if value.dtype.kind == "U": - value = value.astype("S") - - a = netCDF4.chartostring(value) - shape = a.shape - a = np.array([x.rstrip() for x in a.flat]) - a = np.reshape(a, shape) - value = np.ma.masked_where(a == "", a) - - if np.ma.is_masked(value): - value = np.ma.masked - else: - try: - value = value.item() - except (AttributeError, ValueError): - # AttributeError: A netCDF string type scalar - # variable comes out as Python str object, which - # has no 'item' method. - # - # ValueError: A size-0 array can't be converted to - # a Python scalar. - pass - - elements[index] = value - - # Store the elements in the data object - data._set_cached_elements(elements) - - # REVIEW: h5: `_create_cfanetcdfarray`: docstring/comment improvements def _create_cfanetcdfarray( self, ncvar, @@ -675,7 +673,6 @@ def _create_cfanetcdfarray( # Get rid of the incorrect shape. This will end up getting set # correctly by the CFANetCDFArray instance. kwargs.pop("shape", None) - aggregated_data = g["cfa_aggregated_data"][ncvar] standardised_terms = ("location", "file", "address", "format") @@ -699,7 +696,6 @@ def _create_cfanetcdfarray( kwargs["x"] = aggregation_instructions kwargs["instructions"] = " ".join(sorted(instructions)) - # REVIEW: h5: `_create_cfanetcdfarray`: choose the correct netCDF backend # Use the kwargs to create a CFANetCDFArray instance if g["original_netCDF4"]: array = self.implementation.initialise_CFANetCDF4Array(**kwargs) @@ -753,7 +749,6 @@ def _create_cfanetcdfarray_term( return_kwargs_only=True, ) - # REVIEW: h5: `_create_cfanetcdfarray_term`: fix unknown fragment shape # Get rid of the incorrect shape. This will end up getting set # correctly by the CFANetCDFArray instance. kwargs.pop("shape", None) @@ -772,7 +767,6 @@ def _create_cfanetcdfarray_term( kwargs["x"] = aggregation_instructions kwargs["instructions"] = " ".join(sorted(instructions)) - # REVIEW: h5: `_create_cfanetcdfarray_term`: choose the correct netCDF backend if g["original_netCDF4"]: array = self.implementation.initialise_CFANetCDF4Array(**kwargs) else: @@ -781,70 +775,71 @@ def _create_cfanetcdfarray_term( return array, kwargs - def _parse_chunks(self, ncvar): - """Parse the dask chunks. - - .. versionadded:: 3.14.0 - - :Parameters: - - ncvar: `str` - The name of the netCDF variable containing the array. - - :Returns: - - `str`, `int` or `dict` - The parsed chunks that are suitable for passing to a - `Data` object containing the variable's array. 
- - """ - g = self.read_vars - - default_chunks = "auto" - chunks = g.get("chunks", default_chunks) - - if chunks is None: - return -1 - - if isinstance(chunks, dict): - if not chunks: - return default_chunks - - # For ncdimensions = ('time', 'lat'): - # - # chunks={} -> ["auto", "auto"] - # chunks={'ncdim%time': 12} -> [12, "auto"] - # chunks={'ncdim%time': 12, 'ncdim%lat': 10000} -> [12, 10000] - # chunks={'ncdim%time': 12, 'ncdim%lat': "20MB"} -> [12, "20MB"] - # chunks={'ncdim%time': 12, 'latitude': -1} -> [12, -1] - # chunks={'ncdim%time': 12, 'Y': None} -> [12, None] - # chunks={'ncdim%time': 12, 'ncdim%lat': (30, 90)} -> [12, (30, 90)] - # chunks={'ncdim%time': 12, 'ncdim%lat': None, 'X': 5} -> [12, None] - attributes = g["variable_attributes"] - chunks2 = [] - for ncdim in g["variable_dimensions"][ncvar]: - key = f"ncdim%{ncdim}" - if key in chunks: - chunks2.append(chunks[key]) - continue - - found_coord_attr = False - dim_coord_attrs = attributes.get(ncdim) - if dim_coord_attrs is not None: - for attr in ("standard_name", "axis"): - key = dim_coord_attrs.get(attr) - if key in chunks: - found_coord_attr = True - chunks2.append(chunks[key]) - break - - if not found_coord_attr: - # Use default chunks for this dimension - chunks2.append(default_chunks) - - chunks = chunks2 - - return chunks + # + # def _parse_chunks(self, ncvar): + # """Parse the dask chunks. + # + # .. versionadded:: 3.14.0 + # + # :Parameters: + # + # ncvar: `str` + # The name of the netCDF variable containing the array. + # + # :Returns: + # + # `str`, `int` or `dict` + # The parsed chunks that are suitable for passing to a + # `Data` object containing the variable's array. + # + # """ + # g = self.read_vars + # + # default_chunks = "auto" + # chunks = g.get("chunks", default_chunks) + # + # if chunks is None: + # return -1 + # + # if isinstance(chunks, dict): + # if not chunks: + # return default_chunks + # + # # For ncdimensions = ('time', 'lat'): + # # + # # chunks={} -> ["auto", "auto"] + # # chunks={'ncdim%time': 12} -> [12, "auto"] + # # chunks={'ncdim%time': 12, 'ncdim%lat': 10000} -> [12, 10000] + # # chunks={'ncdim%time': 12, 'ncdim%lat': "20MB"} -> [12, "20MB"] + # # chunks={'ncdim%time': 12, 'latitude': -1} -> [12, -1] + # # chunks={'ncdim%time': 12, 'Y': None} -> [12, None] + # # chunks={'ncdim%time': 12, 'ncdim%lat': (30, 90)} -> [12, (30, 90)] + # # chunks={'ncdim%time': 12, 'ncdim%lat': None, 'X': 5} -> [12, None] + # attributes = g["variable_attributes"] + # chunks2 = [] + # for ncdim in g["variable_dimensions"][ncvar]: + # key = f"ncdim%{ncdim}" + # if key in chunks: + # chunks2.append(chunks[key]) + # continue + # + # found_coord_attr = False + # dim_coord_attrs = attributes.get(ncdim) + # if dim_coord_attrs is not None: + # for attr in ("standard_name", "axis"): + # key = dim_coord_attrs.get(attr) + # if key in chunks: + # found_coord_attr = True + # chunks2.append(chunks[key]) + # break + # + # if not found_coord_attr: + # # Use default chunks for this dimension + # chunks2.append(default_chunks) + # + # chunks = chunks2 + # + # return chunks def _customise_field_ancillaries(self, parent_ncvar, f): """Create customised field ancillary constructs. 
@@ -962,7 +957,6 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): aggregation_instructions = g["cfa_aggregation_instructions"] variable_attributes = g["variable_attributes"] - # REVIEW: h5: `_cfa_parse_aggregated_data`: use `cfdm.netcdf_indexer` to get data # Loop round aggregation instruction terms out = {} for x in self._parse_x( diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index 256c4b1392..068c55b968 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -3,8 +3,8 @@ import cfdm import dask.array as da import numpy as np +from cfdm.data.dask_utils import cfdm_asanyarray -from ...data.dask_utils import cf_asanyarray from .netcdfread import NetCDFRead @@ -579,8 +579,6 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar): }, ) - # REVIEW: h5: Deleted function _convert_to_builtin_type was a CFA-0.4 thing - def _check_valid(self, array, cfvar=None, attributes=None): """Checks for array values outside of the valid range. @@ -749,8 +747,7 @@ def _cfa_write_non_standard_terms( # more than one unique value then the fragment's value is # missing data. # - # REVIEW: getitem: `_cfa_write_non_standard_terms`: set '_asanyarray' - # '_cfa_unique' has its own call to 'cf_asanyarray', so + # '_cfa_unique' has its own call to 'cfdm_asanyarray', so # we can set '_asanyarray=False'. dx = data.to_dask_array(_asanyarray=False) dx_ind = tuple(range(dx.ndim)) @@ -810,8 +807,7 @@ def _cfa_unique(cls, a): data if there is not a unique value. """ - # REVIEW: getitem: `_cfa_unique`: convert a to a usable array - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) out_shape = (1,) * a.ndim a = np.unique(a) @@ -966,7 +962,6 @@ def _cfa_aggregation_instructions(self, data, cfvar): # Create the location array # ------------------------------------------------------------ dtype = np.dtype(np.int32) - # REVIEW: getitem: `_cfa_aggregation_instructions`: set 'asanyarray' if ( max(data.to_dask_array(_asanyarray=False).chunksize) > np.iinfo(dtype).max diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 3e1f2ec6ae..6b23357fe4 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -58,18 +58,16 @@ def read( select_options=None, follow_symlinks=False, mask=True, - # REVIEW: h5: `read`: new 'unpack' parameter to control auto-unpacking (previously always True) unpack=True, warn_valid=False, - chunks="auto", + dask_chunks="storage-aligned", + store_hdf5_chunks=True, domain=False, cfa=None, - # REVIEW: h5: `read`: new 'netcdf_backend' parameter to control how to read files netcdf_backend=None, - # REVIEW: h5: `read`: new 'storage_options' parameter to control access to S3 storage_options=None, - # REVIEW: h5: `read`: 'cache' parameter to control whether or not to get to cache selected data elements cache=True, + chunks="auto", ): """Read field or domain constructs from files. @@ -561,80 +559,201 @@ def read( .. versionadded:: 1.5 - chunks: `str`, `int`, `None`, or `dict`, optional - Specify the `dask` chunking of dimensions for data in - the input files. - - By default, ``'auto'`` is used to specify the array - chunking, which uses a chunk size in bytes defined by - the `cf.chunksize` function, preferring square-like - chunk shapes across all data dimensions. - - If *chunks* is a `str` then each data array uses this - chunk size in bytes, preferring square-like chunk - shapes across all data dimensions. 
Any string value - accepted by the *chunks* parameter of the - `dask.array.from_array` function is permitted. - - *Parameter example:* - A chunksize of 2 MiB may be specified as - ``'2097152'`` or ``'2 MiB'``. - - If *chunks* is `-1` or `None` then for each there is no - chunking, i.e. every data array has one chunk - regardless of its size. - - If *chunks* is a positive `int` then each data array - dimension has chunks with this number of elements. - - If *chunks* is a `dict`, then each of its keys - identifies dimension in the file, with a value that - defines the chunking for that dimension whenever it is - spanned by data. - - Each dictionary key identifies a file dimension in one - of three ways: 1. the netCDF dimension name, preceded - by ``ncdim%`` (e.g. ``'ncdim%lat'``); 2. the "standard - name" attribute of a CF-netCDF coordinate variable - that spans the dimension (e.g. ``'latitude'``); or - 3. the "axis" attribute of a CF-netCDF coordinate - variable that spans the dimension (e.g. ``'Y'``). - - The dictionary values may be `str`, `int` or `None`, - with the same meanings as those types for the *chunks* - parameter but applying only to the specified - dimension. A `tuple` or `list` of integers that sum to - the dimension size may also be given. - - Not specifying a file dimension in the dictionary is - equivalent to it being defined with a value of - ``'auto'``. - - *Parameter example:* - ``{'T': '0.5 MiB', 'Y': [36, 37], 'X': None}`` - - *Parameter example:* - If a netCDF file contains dimensions ``time``, - ``z``, ``lat`` and ``lon``, then ``{'ncdim%time': - 12, 'ncdim%lat', None, 'ncdim%lon': None}`` will - ensure that all ``time`` axes have a chunksize of - 12; and all ``lat`` and ``lon`` axes are not - chunked; and all ``z`` axes are chunked to comply as - closely as possible with the default chunks size. - - If the netCDF also contains a ``time`` coordinate - variable with a ``standard_name`` attribute of - ``'time'`` and an ``axis`` attribute of ``'T'``, - then the same chunking could be specified with - either ``{'time': 12, 'ncdim%lat', None, 'ncdim%lon': - None}`` or ``{'T': 12, 'ncdim%lat', None, - 'ncdim%lon': None}``. - - .. note:: The *chunks* parameter is ignored for PP and - UM fields files, for which the chunking is - pre-determined by the file format. - - .. versionadded:: 3.14.0 + dask_chunks: `str`, `int`, `None`, or `dict`, optional + Specify the Dask chunking for data. May be one of the + following: + + * ``'storage-aligned'`` + + This is the default. The Dask chunk size in bytes will + be as close as possible the size given by + `cf.chunksize`, favouring square-like chunk shapes, + with the added restriction that the entirety of each + storage chunk must also lie within exactly one Dask + chunk. + + When reading the data from disk, an entire storage chunk + will be read once per Dask storage chunk that contains + any part of it, so ensuring that a storage chunk lies + within only one Dask chunk can increase performance by + reducing the amount of disk access (particularly when + the data are stored remotely to the client). + + For instance, consider a file variable that has an array + of 64-bit floats with shape (400, 300, 60) and a storage + chunk shape of (100, 5, 60), giving 240 storage chunks + each of size 100*5*60*8 bytes = 0.23 MiB. Then: + + * If `cf.chunksize` returned 134217728 (i.e. 
128 MiB), + then the storage-aligned Dask chunks will have shape + (400, 300, 60), giving 1 Dask chunk with size of 54.93 + MiB (compare with a Dask chunk shape of (400, 300, 60) + and size 54.93 MiB, if *dask_chunks* were ``'auto'``.) + + * If `cf.chunksize` returned 33554432 (i.e. 32 MiB), + then the storage-aligned Dask chunks will have shape + (200, 260, 60), giving 4 Dask chunks with a maximum + size of 23.80 MiB (compare with a Dask chunk shape of + (264, 264, 60) and maximum size 31.90 MiB, if + *dask_chunks* were ``'auto'``.) + + * If `cf.chunksize` returned 4194304 (i.e. 4 MiB), + then the storage-aligned Dask chunks will have shape + (100, 85, 60), giving 16 Dask chunks with a maximum + size of 3.89 MiB (compare with a Dask chunk shape of + (93, 93, 60) and maximum size 3.96 MiB, if + *dask_chunks* were ``'auto'``.) + + There are, however, some occasions when, for particular + data arrays in the file, the ``'auto'`` option will + automatically be used instead of storage-aligned Dask + chunks. This occurs when: + + * The data array in the file is stored contiguously. + + * The data array in the file is compressed by convention + (e.g. ragged array representations, compression by + gathering, subsampled coordinates, etc.). In this case + the Dask chunks are for the uncompressed data, and so + cannot be aligned with the storage chunks of the + compressed array in the file. + + * ``'storage-exact'`` + + Each Dask chunk will contain exactly one storage chunk + and each storage chunk will lie within exactly one Dask + chunk. + + For instance, consider a file variable that has an array + of 64-bit floats with shape (400, 300, 60) and a storage + chunk shape of (100, 5, 60) (i.e. there are 240 storage + chunks, each of size 0.23 MiB). Then the storage-exact + Dask chunks will also have shape (100, 5, 60) giving 240 + Dask chunks with a maximum size of 0.23 MiB. + + There are, however, some occasions when, for particular + data arrays in the file, the ``'auto'`` option will + automatically be used instead of storage-exact Dask + chunks. This occurs when: + + * The data array in the file is stored contiguously. + + * The data array in the file is compressed by convention + (e.g. ragged array representations, compression by + gathering, subsampled coordinates, etc.). In this case + the Dask chunks are for the uncompressed data, and so + cannot be aligned with the storage chunks of the + compressed array in the file. + + * ``auto`` + + The Dask chunk size in bytes will be as close as + possible to the size given by `cf.chunksize`, + favouring square-like chunk shapes. This may give + similar Dask chunk shapes as the ``'storage-aligned'`` + option, but without the guarantee that each storage + chunk will lie within exactly one Dask chunk. + + * A byte-size given by a `str` + + The Dask chunk size in bytes will be as close as + possible to the given byte-size, favouring square-like + chunk shapes. Any string value, accepted by the *chunks* + parameter of the `dask.array.from_array` function is + permitted. + + *Example:* + A Dask chunksize of 2 MiB may be specified as + ``'2097152'`` or ``'2 MiB'``. + + * `-1` or `None` + + There is no Dask chunking, i.e. every data array has one + Dask chunk regardless of its size. + + * Positive `int` + + Every dimension of all Dask chunks has this number of + elements. + + *Example:* + For 3-dimensional data, *dask_chunks* of `10` will + give Dask chunks with shape (10, 10, 10). 
+ + * `dict` + + Each of dictionary key identifies a file dimension, with + a value that defines the Dask chunking for that + dimension whenever it is spanned by a data array. A file + dimension is identified in one of three ways: + + 1. the netCDF dimension name, preceded by ``ncdim%`` + (e.g. ``'ncdim%lat'``); + + 2. the value of the "standard name" attribute of a + CF-netCDF coordinate variable that spans the + dimension (e.g. ``'latitude'``); + + 3. the value of the "axis" attribute of a CF-netCDF + coordinate variable that spans the dimension + (e.g. ``'Y'``). + + The dictionary values may be a byte-size string, + ``'auto'``, `int` or `None`, with the same meanings as + those types for the *dask_chunks* parameter itself, but + applying only to the specified dimension. In addition, a + dictionary value may be a `tuple` or `list` of integers + that sum to the dimension size. + + Not specifying a file dimension in the dictionary is + equivalent to it being defined with a value of + ``'auto'``. + + *Example:* + ``{'T': '0.5 MiB', 'Z': 'auto', 'Y': [36, 37], 'X': + None}`` + + *Example:* + If a netCDF file contains dimensions ``time``, ``z``, + ``lat`` and ``lon``, then ``{'ncdim%time': 12, + 'ncdim%lat', None, 'ncdim%lon': None}`` will ensure + that, for all applicable data arrays, all ``time`` + axes have a `dask` chunksize of 12; all ``lat`` and + ``lon`` axes are not `dask` chunked; and all ``z`` + axes are `dask` chunked to comply as closely as + possible with the default `dask` chunk size. + + If the netCDF file also contains a ``time`` coordinate + variable with a "standard_name" attribute of + ``'time'`` and an "axis" attribute of ``'T'``, then + the same `dask` chunking could be specified with + either ``{'time': 12, 'ncdim%lat', None, 'ncdim%lon': + None}`` or ``{'T': 12, 'ncdim%lat', None, 'ncdim%lon': + None}``. + + .. versionadded:: NEXTVERSION + + store_hdf5_chunks: `bool`, optional + If True (the default) then store the HDF5 chunking + strategy for each returned data array. The HDF5 chunking + strategy is then accessible via an object's + `nc_hdf5_chunksizes` method. When the HDF5 chunking + strategy is stored, it will be used when the data is + written to a new netCDF4 file with `cf.write` (unless + the strategy was modified prior to writing). + + If False, or if the file being read is not in netCDF4 + format, then no HDF5 chunking strategy is stored. + (i.e. an `nc_hdf5_chunksizes` method will return `None` + for all `Data` objects). In this case, when the data is + written to a new netCDF4 file, the HDF5 chunking strategy + will be determined by `cf.write`. + + See the `cf.write` *hdf5_chunks* parameter for details + on how the HDF5 chunking strategy is determined at the + time of writing. + + .. versionadded:: NEXTVERSION domain: `bool`, optional If True then return only the domain constructs that are @@ -771,7 +890,10 @@ def read( Use methods on the returned `FieldList` instead. chunk: deprecated at version 3.14.0 - Use the *chunks* parameter instead. + Use the *dask_chunks* parameter instead. + + chunks: deprecated at version NEXTVERSION + Use the *dask_chunks* parameter instead. 
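A minimal usage sketch of the two keywords documented above, assuming the new `cf.read` signature introduced by this patch (the file name ``'file.nc'`` is hypothetical and only illustrates the parameter forms described above):

    >>> # Default: storage-aligned Dask chunks, with the HDF5 chunking
    >>> # strategy stored on each returned data array
    >>> fl = cf.read('file.nc')
    >>> # Dask chunks of roughly 2 MiB, favouring square-like chunk shapes
    >>> fl = cf.read('file.nc', dask_chunks='2 MiB')
    >>> # Per-dimension control: 12 elements per chunk on the time axis,
    >>> # and no Dask chunking on the latitude and longitude axes
    >>> fl = cf.read('file.nc', dask_chunks={'T': 12, 'Y': None, 'X': None})
    >>> # Do not store the HDF5 chunking strategy, leaving cf.write to
    >>> # determine it at write time
    >>> fl = cf.read('file.nc', store_hdf5_chunks=False)

As described above, the dictionary keys may mix ``ncdim%`` dimension names, "standard_name" values and "axis" values, and ``dask_chunks`` replaces the deprecated ``chunks`` and ``chunk`` keywords.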
:Returns: @@ -852,7 +974,16 @@ def read( _DEPRECATION_ERROR_FUNCTION_KWARGS( "cf.read", {"chunk": chunk}, - "Use keyword 'chunks' instead.", + "Use keyword 'dask_chunks' instead.", + version="3.14.0", + removed_at="5.0.0", + ) # pragma: no cover + + if chunks is not "auto": + _DEPRECATION_ERROR_FUNCTION_KWARGS( + "cf.read", + {"chunk": chunk}, + "Use keyword 'dask_chunks' instead.", version="3.14.0", removed_at="5.0.0", ) # pragma: no cover @@ -861,13 +992,6 @@ def read( if isinstance(select, (str, Query, Pattern)): select = (select,) - # Check chunks - if chunks is not None and not isinstance(chunks, (str, Integral, dict)): - raise ValueError( - "'chunks' parameter must be of type str, int, None or dict. " - f"Got: {chunks!r}" - ) - # Manage input parameters where contradictions are possible: if cdl_string and fmt: if fmt == "CDL": @@ -914,8 +1038,6 @@ def read( cfa_options["substitutions"] = substitutions - cache = bool(cache) - # Initialise the output list of fields/domains if domain: out = DomainList() @@ -1042,7 +1164,8 @@ def read( um=um, extra=extra, height_at_top_of_model=height_at_top_of_model, - chunks=chunks, + dask_chunks=dask_chunks, + store_hdf5_chunks=store_hdf5_chunks, mask=mask, unpack=unpack, warn_valid=warn_valid, @@ -1163,7 +1286,8 @@ def _read_a_file( mask=True, unpack=True, warn_valid=False, - chunks="auto", + dask_chunks="storage-aligned", + store_hdf5_chunks=True, select=None, domain=False, cfa_options=None, @@ -1253,11 +1377,9 @@ def _read_a_file( umversion = float(str(umversion).replace(".", "0", 1)) extra_read_vars = { - "chunks": chunks, "fmt": selected_fmt, "ignore_read_error": ignore_read_error, "cfa_options": cfa_options, - "cache": cache, } # ---------------------------------------------------------------- @@ -1300,6 +1422,9 @@ def _read_a_file( domain=domain, storage_options=storage_options, netcdf_backend=netcdf_backend, + dask_chunks=dask_chunks, + store_hdf5_chunks=store_hdf5_chunks, + cache=cache, ) except MaskError: # Some data required for field interpretation is missing, diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index a62b49b76e..55d2f3469a 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -39,7 +39,8 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -# To facilitate the testing of logging outputs (see comment tag 'Logging note') +# To facilitate the testing of logging outputs (see comment tag +# 'Logging note') logger = cf.logging.getLogger(__name__) @@ -60,11 +61,6 @@ def _remove_tmpfiles(): mw = np.ma.array(w, mask=ma.mask) -# If True, all tests that will not pass temporarily due to the LAMA-to-Dask -# migration will be skipped. These skips will be incrementally removed as the -# migration progresses. TODODASK: ensure all skips are removed once complete. 
-TEST_DASKIFIED_ONLY = True - def reshape_array(a, axes): """Reshape array reducing given axes' dimensions to a final axis.""" @@ -77,24 +73,19 @@ def reshape_array(a, axes): return b -def axis_combinations(a): - """Return a list of axes combinations to iterate over.""" +def axis_combinations(ndim): + """Create axes permutations for `test_Data_flatten`""" return [ axes - for n in range(1, a.ndim + 1) - for axes in itertools.combinations(range(a.ndim), n) + for n in range(1, ndim + 1) + for axes in itertools.permutations(range(ndim), n) ] class DataTest(unittest.TestCase): """Unit test for the Data class.""" - axes_combinations = axis_combinations(a) - # [ - # axes - # for n in range(1, a.ndim + 1) - # for axes in itertools.combinations(range(a.ndim), n) - # ] + axes_combinations = axis_combinations(a.ndim) filename = os.path.join( os.path.dirname(os.path.abspath(__file__)), "test_file.nc" @@ -751,7 +742,7 @@ def test_Data_stats(self): # Test outputs covering a representative selection of parameters s1 = d.stats() - s1_lazy = d.stats(compute=False) + s1_lazy = d.stats(values=False) exp_result = { "minimum": 1, "mean": 1.0, @@ -771,7 +762,7 @@ def test_Data_stats(self): ) s2 = d.stats(all=True) - s2_lazy = d.stats(compute=False, all=True) + s2_lazy = d.stats(values=False, all=True) exp_result = { "minimum": 1, "mean": 1.0, @@ -798,7 +789,7 @@ def test_Data_stats(self): ) s3 = d.stats(sum=True, weights=1) - s3_lazy = d.stats(compute=False, sum=True, weights=1) + s3_lazy = d.stats(values=False, sum=True, weights=1) exp_result = { "minimum": 1, "mean": 1.0, @@ -820,7 +811,7 @@ def test_Data_stats(self): s4 = d.stats(mean_of_upper_decile=True, range=False, weights=2.0) s4_lazy = d.stats( - compute=False, mean_of_upper_decile=True, range=False, weights=2.0 + values=False, mean_of_upper_decile=True, range=False, weights=2.0 ) exp_result = { "minimum": 1, @@ -1002,21 +993,25 @@ def test_Data_cumsum(self): self.assertTrue(cf.functions._numpy_allclose(e.array, b)) def test_Data_flatten(self): - """Test the `flatten` Data method.""" - d = cf.Data(self.ma.copy()) - self.assertTrue(d.equals(d.flatten([]), verbose=2)) + """Test Data.flatten.""" + ma = np.ma.arange(24).reshape(1, 2, 3, 4) + ma[0, 1, 1, 2] = cf.masked + ma[0, 0, 2, 1] = cf.masked + + d = cf.Data(ma.copy()) + self.assertTrue(d.equals(d.flatten([]), verbose=3)) self.assertIsNone(d.flatten(inplace=True)) - d = cf.Data(self.ma.copy()) + d = cf.Data(ma.copy()) - b = self.ma.flatten() + b = ma.flatten() for axes in (None, list(range(d.ndim))): e = d.flatten(axes) self.assertEqual(e.ndim, 1) self.assertEqual(e.shape, b.shape) - self.assertTrue(cf.functions._numpy_allclose(e.array, b)) + self.assertTrue(e.equals(cf.Data(b), verbose=3)) - for axes in self.axes_combinations: + for axes in axis_combinations(d.ndim): e = d.flatten(axes) if len(axes) <= 1: @@ -1028,10 +1023,25 @@ def test_Data_flatten(self): np.prod([n for i, n in enumerate(d.shape) if i in axes]), ) - self.assertEqual(e.shape, tuple(shape)) + self.assertEqual(e.shape, tuple(shape), axes) self.assertEqual(e.ndim, d.ndim - len(axes) + 1) self.assertEqual(e.size, d.size) + for n in range(4): + e = d.flatten(n) + f = d.flatten([n]) + self.assertTrue(e.equals(f)) + + with self.assertRaises(ValueError): + d.flatten(99) + + d = cf.Data(9) + self.assertTrue(d.equals(d.flatten())) + self.assertTrue(d.equals(d.flatten([]))) + + with self.assertRaises(ValueError): + d.flatten(0) + def test_Data_cached_arithmetic_units(self): """Test arithmetic with, and units of, Data cached to disk.""" d = 
cf.Data(self.a, "m") @@ -1479,7 +1489,6 @@ def test_Data__getitem__(self): f = cf.Data([-999, 35], mask=[True, False]).reshape(2, 1) self.assertTrue(e.equals(f)) - # REVIEW: getitem: `test_Data__getitem__`: Chained subspaces reading from disk # Chained subspaces reading from disk f = cf.read(self.filename)[0] d = f.data @@ -3213,19 +3222,20 @@ def test_Data_compute(self): self.assertEqual(d.compute(), 2.5) def test_Data_persist(self): - """Test the `persist` Data method.""" + """Test Data.persist.""" d = cf.Data(9, "km") self.assertIsNone(d.persist(inplace=True)) - d = cf.Data([1, 2, 3.0, 4], "km", mask=[0, 1, 0, 0], chunks=2) - self.assertGreater(len(d.to_dask_array().dask.layers), 1) + d = cf.Data([[1, 2, 3.0, 4]], "km", chunks=2) + self.assertEqual(len(d.to_dask_array().dask.layers), 2) + d.transpose(inplace=True) + self.assertEqual(len(d.to_dask_array().dask.layers), 3) e = d.persist() self.assertIsInstance(e, cf.Data) - self.assertEqual(len(e.to_dask_array().dask.layers), 1) - self.assertEqual( - e.to_dask_array().npartitions, d.to_dask_array().npartitions - ) + self.assertEqual(len(e.to_dask_array().dask.layers), 2) + self.assertEqual(d.npartitions, 2) + self.assertEqual(e.npartitions, d.npartitions) self.assertTrue(e.equals(d)) def test_Data_cyclic(self): @@ -3290,7 +3300,6 @@ def test_Data_rechunk(self): self.assertEqual(e.chunks, ((4,), (5,))) self.assertTrue(e.equals(d)) - # REVIEW: getitem: `test_Data_rechunk`: rechunking after a __getitem__ # Test rechunking after a __getitem__ e = d[:2].rechunk((2, 5)) self.assertTrue(e.equals(d[:2])) @@ -3414,7 +3423,7 @@ def test_Data_integral(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.sum(b * w, axis=-1) @@ -3432,7 +3441,7 @@ def test_Data_max(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.max(b, axis=-1) b = np.ma.asanyarray(b) @@ -3449,7 +3458,7 @@ def test_Data_maximum_absolute_value(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.max(abs(b), axis=-1) b = np.ma.asanyarray(b) @@ -3467,7 +3476,7 @@ def test_Data_mean(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.ma.average(b, axis=-1, weights=w) @@ -3486,7 +3495,7 @@ def test_Data_mean_absolute_value(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.ma.average(abs(b), axis=-1, weights=w) @@ -3504,7 +3513,7 @@ def test_Data_mid_range(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = (np.max(b, axis=-1) + np.min(b, axis=-1)) / 2.0 b = np.ma.asanyarray(b) @@ -3524,7 +3533,7 @@ def test_Data_min(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.min(b, axis=-1) b = np.ma.asanyarray(b) @@ -3541,7 +3550,7 @@ def 
test_Data_minimum_absolute_value(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.min(abs(b), axis=-1) b = np.ma.asanyarray(b) @@ -3559,7 +3568,7 @@ def test_Data_range(self): d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.max(b, axis=-1) - np.min(b, axis=-1) b = np.ma.asanyarray(b) @@ -3580,7 +3589,7 @@ def test_Data_root_mean_square(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.ma.average(b * b, axis=-1, weights=w) ** 0.5 @@ -3598,7 +3607,7 @@ def test_Data_sample_size(self): a = self.ma d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.sum(np.ones_like(b), axis=-1) b = np.ma.asanyarray(b) @@ -3613,7 +3622,7 @@ def test_Data_sample_size(self): a = self.a d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.sum(np.ones_like(b), axis=-1) b = np.asanyarray(b) @@ -3642,7 +3651,7 @@ def test_Data_sum(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.sum(b * w, axis=-1) @@ -3661,7 +3670,7 @@ def test_Data_sum_of_squares(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.sum(b * b * w, axis=-1) @@ -3681,7 +3690,7 @@ def test_Data_sum_of_weights(self): d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) # Weights=None - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) b = np.sum(np.ones_like(b), axis=-1) b = np.ma.asanyarray(b) @@ -3692,7 +3701,7 @@ def test_Data_sum_of_weights(self): self.assertTrue((e.mask == b.mask).all()) self.assertTrue(np.allclose(e, b)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) w = np.ma.masked_where(b.mask, w) @@ -3713,12 +3722,12 @@ def test_Data_sum_of_weights2(self): d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) # Weights=None - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): e = d.sum_of_weights2(axes=axis) f = d.sum_of_weights(axes=axis) self.assertTrue(e.equals(f)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) w = np.ma.masked_where(b.mask, w) @@ -3739,7 +3748,7 @@ def test_Data_var(self): d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) # Weighted ddof = 0 - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) mu, V1 = np.ma.average(b, axis=-1, weights=w, returned=True) @@ -3757,7 +3766,7 @@ def test_Data_var(self): self.assertTrue(np.allclose(e, b), f"e={e}\nb={b}\ne-b={e - b}") # Weighted ddof = 1 - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) mu, V1 = 
np.ma.average(b, axis=-1, weights=w, returned=True) @@ -3776,7 +3785,7 @@ def test_Data_var(self): self.assertTrue(np.allclose(e, b)) # Unweighted ddof = 1 - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) mu, V1 = np.ma.average(b, axis=-1, returned=True) mu = mu.reshape(mu.shape + (1,)) @@ -3798,7 +3807,7 @@ def test_Data_mean_of_upper_decile(self): weights = self.w d = cf.Data(a, "m", chunks=(2, 3, 2, 5)) - for axis in axis_combinations(a): + for axis in axis_combinations(a.ndim): b = reshape_array(a, axis) w = reshape_array(weights, axis) b = np.ma.filled(b, np.nan) @@ -3946,12 +3955,12 @@ def test_Data_collapse_keepdims(self): d.var, d.mean_of_upper_decile, ): - for axis in axis_combinations(d): + for axis in axis_combinations(d.ndim): e = func(axes=axis, squeeze=False) s = [1 if i in axis else n for i, n in enumerate(d.shape)] self.assertEqual(e.shape, tuple(s)) - for axis in axis_combinations(d): + for axis in axis_combinations(d.ndim): e = func(axes=axis, squeeze=True) s = [n for i, n in enumerate(d.shape) if i not in axis] self.assertEqual(e.shape, tuple(s)) @@ -4132,10 +4141,9 @@ def test_Data_to_dask_array(self): dx = d.to_dask_array() self.assertIsInstance(dx, da.Array) self.assertTrue((d.array == dx.compute()).all()) - self.assertIs(da.asanyarray(d), dx) def test_Data_flat(self): - """Test the `flat` Data method.""" + """Test the Data.flat.""" d = cf.Data([[1, 2], [3, 4]], mask=[[0, 1], [0, 0]]) self.assertEqual(list(d.flat()), [1, 3, 4]) self.assertEqual( @@ -4143,7 +4151,7 @@ def test_Data_flat(self): ) def test_Data_tolist(self): - """Test the `tolist` Data method.""" + """Test the Data.tolist""" for x in (1, [1, 2], [[1, 2], [3, 4]]): d = cf.Data(x) e = d.tolist() @@ -4259,12 +4267,10 @@ def test_Data_rtol(self): self.assertEqual(d._rtol, 0.001) def test_Data_hardmask(self): - """Test the `hardmask` Data property.""" + """Test Data.hardmask.""" d = cf.Data([1, 2, 3]) d.hardmask = True self.assertTrue(d.hardmask) - self.assertEqual(len(d.to_dask_array().dask.layers), 1) - d[0] = cf.masked self.assertTrue((d.array.mask == [True, False, False]).all()) d[...] 
= 999 @@ -4275,18 +4281,24 @@ def test_Data_hardmask(self): self.assertTrue((d.array.mask == [False, False, False]).all()) def test_Data_harden_mask(self): - """Test the `harden_mask` Data method.""" + """Test Data.harden_mask.""" d = cf.Data([1, 2, 3], hardmask=False) d.harden_mask() self.assertTrue(d.hardmask) - self.assertEqual(len(d.to_dask_array().dask.layers), 2) + d[0] = cf.masked + self.assertEqual(d[0].array, np.ma.masked) + d[0] = 99 + self.assertEqual(d[0].array, np.ma.masked) def test_Data_soften_mask(self): - """Test the `soften_mask` Data method.""" + """Test Data.soften_mask.""" d = cf.Data([1, 2, 3], hardmask=True) d.soften_mask() self.assertFalse(d.hardmask) - self.assertEqual(len(d.to_dask_array().dask.layers), 2) + d[0] = cf.masked + self.assertEqual(d[0].array, np.ma.masked) + d[0] = 99 + self.assertEqual(d[0].array, 99) def test_Data_compressed_array(self): """Test the `compressed_array` Data property.""" @@ -4450,8 +4462,8 @@ def test_Data_get_filenames(self): cf.write(f, file_A) cf.write(f, file_B) - a = cf.read(file_A, chunks=4)[0].data - b = cf.read(file_B, chunks=4)[0].data + a = cf.read(file_A, dask_chunks=4)[0].data + b = cf.read(file_B, dask_chunks=4)[0].data b += 999 c = cf.Data(b.array, units=b.Units, chunks=4) @@ -4520,18 +4532,33 @@ def test_Data__str__(self): for element in elements0: self.assertNotIn(element, d._get_cached_elements()) - # REVIEW: getitem: `test_Data_cull_graph`: prevent new asanyarray layer def test_Data_cull_graph(self): - """Test `Data.cull`""" - # Note: The number of layers in the culled graphs include a - # `cf_asanyarray` layer + """Test Data.cull_graph.""" d = cf.Data([1, 2, 3, 4, 5], chunks=3) d = d[:2] - self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 3) + self.assertEqual( + len( + dict( + d.to_dask_array( + _apply_mask_hardness=False, _asanyarray=False + ).dask + ) + ), + 3, + ) # Check that there are fewer keys after culling d.cull_graph() - self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 2) + self.assertEqual( + len( + dict( + d.to_dask_array( + _apply_mask_hardness=False, _asanyarray=False + ).dask + ) + ), + 2, + ) def test_Data_npartitions(self): """Test the `npartitions` Data property.""" @@ -4677,7 +4704,7 @@ def test_Data_file_location(self): ) cf.write(f, file_A) - d = cf.read(file_A, chunks=4)[0].data + d = cf.read(file_A, dask_chunks=4)[0].data self.assertGreater(d.npartitions, 1) e = d.copy() @@ -4696,9 +4723,9 @@ def test_Data_file_location(self): self.assertEqual(d.file_locations(), set((location,))) def test_Data_todict(self): - """Test Data.todict""" + """Test Data.todict.""" d = cf.Data([1, 2, 3, 4], chunks=2) - key = d.to_dask_array().name + key = d.to_dask_array(_apply_mask_hardness=False).name x = d.todict() self.assertIsInstance(x, dict) @@ -4779,7 +4806,6 @@ def test_Data_pad_missing(self): with self.assertRaises(ValueError): d.pad_missing(99, to_size=99) - # REVIEW: getitem: `test_Data_is_masked`: test `Data.is_masked` def test_Data_is_masked(self): """Test Data.is_masked.""" d = cf.Data(np.arange(6).reshape(2, 3)) From a18e59c8f9e6f713e7d095270c2a759ce1a86106 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Nov 2024 14:27:10 +0000 Subject: [PATCH 02/27] dev --- cf/data/data.py | 17 +++++++++++++++++ cf/field.py | 6 ++++-- cf/mixin/propertiesdata.py | 4 ++-- cf/test/test_Field.py | 35 +++++++++++++++++++++++++---------- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index bb13633d39..733769a530 100644 --- 
a/cf/data/data.py +++ b/cf/data/data.py @@ -6349,6 +6349,9 @@ def outerproduct(self, a, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) + shape = d.shape + chunksizes0 = d.nc_hdf5_chunksizes() + # Cast 'a' as a Data object so that it definitely has sensible # Units. We don't mind if the units of 'a' are incompatible # with those of 'self', but if they are then it's nice if the @@ -6378,6 +6381,20 @@ def outerproduct(self, a, inplace=False, i=False): for a_axis in a._cyclic: d.cyclic(ndim + a._axes.index(a_axis)) + # Update the HDF5 chunking strategy + chunksizes1 = a.nc_hdf5_chunksizes() + if chunksizes0 or chunksizes1: + if isinstance(chunksizes0, tuple): + if isinstance(chunksizes1, tuple): + chunksizes = chunksizes0 + chunksizes1 + else: + chunksizes = chunksizes0 + a.shape + + d.nc_set_hdf5_chunksizes(chunksizes) + elif isinstance(chunksizes1, tuple): + chunksizes = shape + chunksizes1 + d.nc_set_hdf5_chunksizes(chunksizes) + d._update_deterministic(a) return d diff --git a/cf/field.py b/cf/field.py index d73e2b7b49..d9059cf235 100644 --- a/cf/field.py +++ b/cf/field.py @@ -1320,7 +1320,9 @@ def _binary_operation(self, other, method): # ------------------------------------------------------------ # Operate on the data # ------------------------------------------------------------ - new_data = field0.data._binary_operation(field1.data, method) + new_data = field0.data._binary_operation( + field0.data, field1.data, method + ) field0.set_data(new_data, set_axes=False, copy=False) @@ -4087,7 +4089,7 @@ def weights( # ------------------------------------------------------------ # Still here? Return a weights field which is the outer - # product of the component weights + # product of the component weights. # ------------------------------------------------------------ pp = sorted(comp.items()) waxes, wdata = pp.pop(0) diff --git a/cf/mixin/propertiesdata.py b/cf/mixin/propertiesdata.py index fa72e0c0cb..6437ed3fc7 100644 --- a/cf/mixin/propertiesdata.py +++ b/cf/mixin/propertiesdata.py @@ -621,11 +621,11 @@ def _binary_operation(self, y, method): if not inplace: new = self.copy() # data=False) TODO - new_data = data._binary_operation(y, method) + new_data = data._binary_operation(data, y, method) new.set_data(new_data, copy=False) else: new = self - new.data._binary_operation(y, method) + new.data._binary_operation(new.data, y, method) if method in _relational_methods: # Booleans have no units diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index 92392b29f5..e8bb58bde6 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -2499,24 +2499,25 @@ def test_Field_grad_xy(self): self.assertEqual(x.Units, y.Units) self.assertEqual(y.Units, cf.Units("m-1")) - x0 = f.derivative( + x0 = 57.2957795130823 * f.derivative( "X", wrap=wrap, one_sided_at_boundary=one_sided, ) / (sin_theta * r) - y0 = ( + y0 = 57.2957795130823 * ( f.derivative( "Y", one_sided_at_boundary=one_sided, ) / r ) + x0.override_units('m-1', inplace=True) + y0.override_units('m-1', inplace=True) # Check the data - with cf.rtol(1e-10): - self.assertTrue((x.data == x0.data).all()) - self.assertTrue((y.data == y0.data).all()) - + self.assertTrue(x.data.allclose(x0.data)) + self.assertTrue(y.data.allclose(y0.data)) + # Check that x and y have the same metadata as f # (except standard_name, long_name, and units). 
f0 = f.copy() @@ -2644,9 +2645,9 @@ def test_Field_laplacian_xy(self): g.dimension_coordinate("X").standard_name, "longitude" ) - def test_Field_to_dask_array(self): + def test_Field__aaa_to_dask_array(self): f = self.f0.copy() - self.assertIs(f.to_dask_array(), f.data.to_dask_array()) + self.assertTrue((f.array == f.to_dask_array().compute()).all()) f.del_data() with self.assertRaises(ValueError): @@ -2755,11 +2756,25 @@ def test_Field_persist(self): f = cf.example_field(0) f *= 2 - self.assertGreater(len(f.to_dask_array().dask.layers), 1) + self.assertGreater( + len( + f.data.to_dask_array( + _apply_mask_hardness=False, _asanyarray=False + ).dask.layers + ), + 2, + ) g = f.persist() self.assertIsInstance(g, cf.Field) - self.assertEqual(len(g.to_dask_array().dask.layers), 1) + self.assertEqual( + len( + g.data.to_dask_array( + _apply_mask_hardness=False, _asanyarray=False + ).dask.layers + ), + 1, + ) self.assertTrue(g.equals(f)) self.assertIsNone(g.persist(inplace=True)) From 5754f8b26d0acbb3cb2dc19d2b09ddcc4da6aa16 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 4 Nov 2024 16:50:18 +0000 Subject: [PATCH 03/27] dev --- cf/data/dask_regrid.py | 9 +- cf/read_write/netcdf/netcdfread.py | 1 - cf/test/test_Data_utils.py | 237 ---------------------------- cf/test/test_DimensionCoordinate.py | 9 +- cf/test/test_Field.py | 26 +-- cf/test/test_Maths.py | 14 +- cf/test/test_active_storage.py | 4 +- cf/test/test_functions.py | 8 +- cf/test/test_read_write.py | 43 ----- cf/test/test_regrid.py | 4 +- 10 files changed, 47 insertions(+), 308 deletions(-) diff --git a/cf/data/dask_regrid.py b/cf/data/dask_regrid.py index f825fe1488..9c64b42cb7 100644 --- a/cf/data/dask_regrid.py +++ b/cf/data/dask_regrid.py @@ -1,8 +1,7 @@ """Regridding functions used within a dask graph.""" import numpy as np - -from .dask_utils import cf_asanyarray +from cfdm.data.dask_utils import cfdm_asanyarray def regrid( @@ -176,12 +175,12 @@ def regrid( """ weights, dst_mask = weights_dst_mask - a = cf_asanyarray(a) + a = cfdm_asanyarray(a) if dst_mask is not None: - dst_mask = cf_asanyarray(dst_mask) + dst_mask = cfdm_asanyarray(dst_mask) if ref_src_mask is not None: - ref_src_mask = cf_asanyarray(ref_src_mask) + ref_src_mask = cfdm_asanyarray(ref_src_mask) # ---------------------------------------------------------------- # Reshape the array into a form suitable for the regridding dot diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 1392e0c7a5..571b83e1ec 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -394,7 +394,6 @@ def _is_cfa_variable(self, ncvar): # array = np.ma.where(array == "", np.ma.masked, array) # # # Parse dask chunks - ## chunks = self._parse_chunks(ncvar) # chunks = self._dask_chunks(array, ncvar, compressed) # # data = super()._create_Data( diff --git a/cf/test/test_Data_utils.py b/cf/test/test_Data_utils.py index 874e2f8c84..f6f75dcc65 100644 --- a/cf/test/test_Data_utils.py +++ b/cf/test/test_Data_utils.py @@ -12,156 +12,6 @@ class DataUtilsTest(unittest.TestCase): - def test_Data_Utils__da_ma_allclose(self): - """TODO.""" - # Create a range of inputs to test against. - # Note that 'a' and 'a2' should be treated as 'allclose' for this - # method, the same result as np.ma.allclose would give because all - # of the *unmasked* elements are 'allclose', whereas in our - # Data.equals method that builds on this method, we go even further - # and insist on the mask being identical as well as the data - # (separately, i.e. 
unmasked) all being 'allclose', so inside our - # cf.Data objects 'a' and 'a2' would instead *not* be considered equal. - a_np = np.ma.array([1.0, 2.0, 3.0], mask=[1, 0, 0]) - a = da.from_array(a_np) - a2 = da.from_array(np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0])) - b_np = np.ma.array([1.0, 2.0, 3.0], mask=[0, 1, 0]) - b = da.from_array(b_np) - c_np = np.ma.array([1.0, 2.0, 100.0], mask=[1, 0, 0]) - c = da.from_array(c_np) - d = da.from_array(np.array([1.0, 2.0, 3.0])) - e = a + 5e-04 # outside of tolerance to set, namely rtol=1e-05 - f = a + 5e-06 # within set tolerance to be specified, as above - - # Test the function with these inputs as both numpy and dask arrays... - allclose = cf.data.dask_utils._da_ma_allclose - - self.assertTrue(allclose(a, a).compute()) - self.assertTrue(allclose(a2, a).compute()) - self.assertTrue(allclose(b, a).compute()) - - # ...including testing the 'masked_equal' parameter - self.assertFalse(allclose(b, a, masked_equal=False).compute()) - - self.assertFalse(allclose(c, a).compute()) - self.assertTrue(allclose(d, a).compute()) - self.assertFalse(allclose(e, a).compute()) - - self.assertTrue(allclose(f, a, rtol=1e-05).compute()) - - # Test when array inputs have different chunk sizes - a_chunked = da.from_array(a_np, chunks=(1, 2)) - self.assertTrue( - allclose(da.from_array(b_np, chunks=(3,)), a_chunked).compute() - ) - self.assertFalse( - allclose( - da.from_array(b_np, chunks=(3,)), a_chunked, masked_equal=False - ).compute() - ) - self.assertFalse( - allclose(da.from_array(c_np, chunks=(3,)), a_chunked).compute() - ) - - # Test the 'rtol' and 'atol' parameters: - self.assertFalse(allclose(e, a, rtol=1e-06).compute()) - b1 = e / 10000 - b2 = a / 10000 - self.assertTrue(allclose(b1, b2, atol=1e-05).compute()) - - def test_Data_Utils_is_numeric_dtype(self): - """TODO.""" - is_numeric_dtype = cf.data.utils.is_numeric_dtype - for a in [ - np.array([0, 1, 2]), - np.array([False, True, True]), - np.ma.array([10.0, 2.0, 3.0], mask=[1, 0, 0]), - np.array(10), - ]: - self.assertTrue(is_numeric_dtype(a)) - - for b in [ - np.array(["a", "b", "c"], dtype="S1"), - np.empty(1, dtype=object), - ]: - self.assertFalse(is_numeric_dtype(b)) - - def test_Data_Utils_convert_to_datetime(self): - """TODO.""" - a = cftime.DatetimeGregorian(2000, 12, 3, 12) - for x in (2.5, [2.5]): - d = da.from_array(x) - e = cf.data.utils.convert_to_datetime( - d, cf.Units("days since 2000-12-01") - ) - self.assertEqual(e.compute(), a) - - a = [ - cftime.DatetimeGregorian(2000, 12, 1), - cftime.DatetimeGregorian(2000, 12, 2), - cftime.DatetimeGregorian(2000, 12, 3), - ] - for x in ([0, 1, 2], [[0, 1, 2]]): - d = da.from_array([0, 1, 2], chunks=2) - e = cf.data.utils.convert_to_datetime( - d, cf.Units("days since 2000-12-01") - ) - self.assertTrue((e.compute() == a).all()) - - def test_Data_Utils_convert_to_reftime(self): - """TODO.""" - a = cftime.DatetimeGregorian(2000, 12, 3, 12) - d = da.from_array(np.array(a, dtype=object)) - - e, u = cf.data.utils.convert_to_reftime(d) - self.assertEqual(e.compute(), 0.5) - self.assertEqual(u, cf.Units("days since 2000-12-03", "standard")) - - units = cf.Units("days since 2000-12-01") - e, u = cf.data.utils.convert_to_reftime(d, units=units) - self.assertEqual(e.compute(), 2.5) - self.assertEqual(u, units) - - a = "2000-12-03T12:00" - d = da.from_array(np.array(a, dtype=str)) - - e, u = cf.data.utils.convert_to_reftime(d) - self.assertEqual(e.compute(), 0.5) - self.assertEqual(u, cf.Units("days since 2000-12-03", "standard")) - - units = cf.Units("days 
since 2000-12-01") - e, u = cf.data.utils.convert_to_reftime(d, units=units) - self.assertEqual(e.compute(), 2.5) - self.assertEqual(u, units) - - a = [ - [ - cftime.DatetimeGregorian(2000, 12, 1), - cftime.DatetimeGregorian(2000, 12, 2), - cftime.DatetimeGregorian(2000, 12, 3), - ] - ] - d = da.from_array(np.ma.array(a, mask=[[1, 0, 0]]), chunks=2) - - e, u = cf.data.utils.convert_to_reftime(d) - self.assertTrue((e.compute() == [-99, 0, 1]).all()) - self.assertEqual(u, cf.Units("days since 2000-12-02", "standard")) - - units = cf.Units("days since 2000-12-03") - e, u = cf.data.utils.convert_to_reftime(d, units=units) - self.assertTrue((e.compute() == [-99, -1, 0]).all()) - self.assertEqual(u, units) - - d = cf.Data( - ["2004-02-29", "2004-02-30", "2004-03-01"], calendar="360_day" - ) - self.assertEqual(d.Units, cf.Units("days since 2004-02-29", "360_day")) - self.assertTrue((d.array == [0, 1, 2]).all()) - - d = cf.Data(["2004-02-29", "2004-03-01"], dt=True) - self.assertEqual(d.Units, cf.Units("days since 2004-02-29")) - self.assertTrue((d.array == [0, 1]).all()) - def test_Data_Utils_unique_calendars(self): """TODO.""" a = [ @@ -198,93 +48,6 @@ def test_Data_Utils_unique_calendars(self): c = cf.data.utils.unique_calendars(d) self.assertEqual(c, set(["all_leap", "standard"])) - def test_Data_Utils_first_non_missing_value(self): - """TODO.""" - for method in ("index", "mask"): - # Scalar data - d = da.from_array(0) - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 0 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - d[()] = np.ma.masked - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), None - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - # 1-d data - d = da.arange(8) - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 0 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - d[0] = np.ma.masked - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 1 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - # 2-d data - d = da.arange(8).reshape(2, 4) - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 0 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - d[0] = np.ma.masked - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), 4 - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - d[...] 
= np.ma.masked - self.assertEqual( - cf.data.utils.first_non_missing_value(d, method=method), None - ) - self.assertEqual( - cf.data.utils.first_non_missing_value( - d, cached=99, method=method - ), - 99, - ) - - # Bad method - with self.assertRaises(ValueError): - cf.data.utils.first_non_missing_value(d, method="bad") - def test_Data_Utils_conform_units(self): for x in (1, [1, 2], "foo", np.array([[1]])): self.assertEqual(cf.data.utils.conform_units(x, cf.Units("m")), x) diff --git a/cf/test/test_DimensionCoordinate.py b/cf/test/test_DimensionCoordinate.py index 9244a202b1..bb4c41ac90 100644 --- a/cf/test/test_DimensionCoordinate.py +++ b/cf/test/test_DimensionCoordinate.py @@ -611,7 +611,14 @@ def test_DimensiconCoordinate_persist(self): e = d.persist() self.assertIsInstance(e, cf.DimensionCoordinate) - self.assertEqual(len(e.to_dask_array().dask.layers), 1) + self.assertEqual( + len( + e.data.to_dask_array( + _apply_mask_hardness=False, _asanyarray=False + ).dask.layers + ), + 1, + ) self.assertTrue(e.equals(d)) self.assertIsNone(d.persist(inplace=True)) diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index e8bb58bde6..b514fdbed3 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -2483,13 +2483,19 @@ def test_Field_percentile(self): def test_Field_grad_xy(self): f = cf.example_field(0) - # Spherical polar coordinates + # theta=0 is at the north pole theta = 90 - f.convert("Y", full_domain=True) sin_theta = theta.sin() radius = 2 r = f.radius(radius) + g = f.copy() + lon = g.dimension_coordinate("latitude") + lat = g.dimension_coordinate("longitude") + lon.Units = cf.Units("radians") + lat.Units = cf.Units("radians") + for wrap in (False, True, None): for one_sided in (True, False): x, y = f.grad_xy( @@ -2499,25 +2505,25 @@ def test_Field_grad_xy(self): self.assertEqual(x.Units, y.Units) self.assertEqual(y.Units, cf.Units("m-1")) - x0 = 57.2957795130823 * f.derivative( + x0 = g.derivative( "X", wrap=wrap, one_sided_at_boundary=one_sided, + ignore_coordinate_units=True, ) / (sin_theta * r) - y0 = 57.2957795130823 * ( - f.derivative( + y0 = ( + g.derivative( "Y", one_sided_at_boundary=one_sided, + ignore_coordinate_units=True, ) / r ) - x0.override_units('m-1', inplace=True) - y0.override_units('m-1', inplace=True) # Check the data self.assertTrue(x.data.allclose(x0.data)) self.assertTrue(y.data.allclose(y0.data)) - + # Check that x and y have the same metadata as f # (except standard_name, long_name, and units). 
f0 = f.copy() @@ -2548,7 +2554,9 @@ def test_Field_grad_xy(self): self.assertEqual(y.Units, cf.Units("m-1")) x0 = f.derivative( - "X", wrap=wrap, one_sided_at_boundary=one_sided + "X", + wrap=wrap, + one_sided_at_boundary=one_sided, ) y0 = f.derivative("Y", one_sided_at_boundary=one_sided) @@ -2645,7 +2653,7 @@ def test_Field_laplacian_xy(self): g.dimension_coordinate("X").standard_name, "longitude" ) - def test_Field__aaa_to_dask_array(self): + def test_Field_to_dask_array(self): f = self.f0.copy() self.assertTrue((f.array == f.to_dask_array().compute()).all()) diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py index 349bd495dd..1c93b7910c 100644 --- a/cf/test/test_Maths.py +++ b/cf/test/test_Maths.py @@ -2,6 +2,8 @@ import faulthandler import unittest +import numpy as np + faulthandler.enable() # to debug seg faults and timeouts import cf @@ -45,7 +47,7 @@ def test_curl_xy(self): # Check the data with cf.rtol(1e-10): - self.assertTrue((c.data == c0.data).all()) + self.assertTrue(c.data.allclose(c0.data)) del c.long_name c0.set_data(c.data) @@ -120,21 +122,23 @@ def test_div_xy(self): x_wrap=wrap, one_sided_at_boundary=one_sided, ) - self.assertEqual(d.Units, cf.Units("m-2")) term1 = x.derivative( - "X", wrap=wrap, one_sided_at_boundary=one_sided + "X", + wrap=wrap, + one_sided_at_boundary=one_sided, ) term2 = (y * sin_theta).derivative( - "Y", one_sided_at_boundary=one_sided + "Y", + one_sided_at_boundary=one_sided, ) d0 = (term1 + term2) / (sin_theta * r) # Check the data with cf.rtol(1e-10): - self.assertTrue((d.data == d0.data).all()) + self.assertTrue(d.data.allclose(d0.data)) del d.long_name d0.set_data(d.data) diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py index f14e063849..4770368808 100644 --- a/cf/test/test_active_storage.py +++ b/cf/test/test_active_storage.py @@ -41,7 +41,9 @@ def test_active_storage(self): f = cf.example_field(0) cf.write(f, tmpfile) - f = cf.read(tmpfile, chunks={"latitude": (4, 1), "longitude": (3, 5)}) + f = cf.read( + tmpfile, dask_chunks={"latitude": (4, 1), "longitude": (3, 5)} + ) f = f[0] self.assertEqual(f.data.chunks, ((4, 1), (3, 5))) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index f6cce13ae0..b59b29635b 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -31,12 +31,10 @@ def test_example_field_example_fields(self): def test_keyword_deprecation(self): # Use as test case 'i' kwarg, the deprecated old name for # 'inplace': - a = cf.Data([list(range(100))]) - a.squeeze(inplace=True) # new way to specify operation tested below - - b = cf.Data([list(range(100))]) + f = cf.example_field(0) + f.squeeze(inplace=True) # new way to specify operation tested below with self.assertRaises(cf.functions.DeprecationError): - b.squeeze(i=True) + f.squeeze(i=True) def test_aliases(self): self.assertEqual(cf.log_level(), cf.LOG_LEVEL()) diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index f0bf697fac..613280eaf3 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -844,49 +844,6 @@ def test_read_write_domain(self): self.assertIsInstance(e[1], cf.Domain) self.assertTrue(e[0].equals(e[1])) - def test_read_chunks(self): - f = cf.example_field(0) - f.construct("latitude").axis = "Y" - cf.write(f, tmpfile) - - f = cf.read(tmpfile, chunks={})[0] - self.assertEqual(f.data.chunks, ((5,), (8,))) - - f = cf.read(tmpfile, chunks=-1)[0] - self.assertEqual(f.data.chunks, ((5,), (8,))) - - f = cf.read(tmpfile, chunks=None)[0] - 
self.assertEqual(f.data.chunks, ((5,), (8,))) - - f = cf.read(tmpfile, chunks={"foo": 2, "bar": 3})[0] - self.assertEqual(f.data.chunks, ((5,), (8,))) - - with cf.chunksize("200GB"): - f = cf.read(tmpfile)[0] - self.assertEqual(f.data.chunks, ((5,), (8,))) - - with cf.chunksize("150B"): - f = cf.read(tmpfile)[0] - self.assertEqual(f.data.chunks, ((4, 1), (4, 4))) - - f = cf.read(tmpfile, chunks="150B")[0] - self.assertEqual(f.data.chunks, ((4, 1), (4, 4))) - - f = cf.read(tmpfile, chunks=3)[0] - self.assertEqual(f.data.chunks, ((3, 2), (3, 3, 2))) - - y = f.construct("Y") - self.assertEqual(y.data.chunks, ((3, 2),)) - - f = cf.read(tmpfile, chunks={"ncdim%lon": 3})[0] - self.assertEqual(f.data.chunks, ((5,), (3, 3, 2))) - - f = cf.read(tmpfile, chunks={"longitude": 5, "Y": "150B"})[0] - self.assertEqual(f.data.chunks, ((3, 2), (5, 3))) - - y = f.construct("Y") - self.assertEqual(y.data.chunks, ((5,),)) - def test_write_omit_data(self): """Test the `omit_data` parameter to `write`.""" f = cf.example_field(1) diff --git a/cf/test/test_regrid.py b/cf/test/test_regrid.py index f171e83994..2001b4cce6 100644 --- a/cf/test/test_regrid.py +++ b/cf/test/test_regrid.py @@ -756,7 +756,9 @@ def test_Field_regrid_chunks(self): filename = os.path.join( os.path.dirname(os.path.abspath(__file__)), "regrid.nc" ) - dst, src = cf.read(filename, chunks={"latitude": 20, "longitude": 30}) + dst, src = cf.read( + filename, dask_chunks={"latitude": 20, "longitude": 30} + ) self.assertEqual(src.data.numblocks, (1, 2, 2)) self.assertEqual(dst.data.numblocks, (1, 4, 4)) From bd5dd1617e7aee4cef9e606d985d15df41ac60a5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 6 Nov 2024 13:35:11 +0000 Subject: [PATCH 04/27] dev --- cf/constants.py | 2 +- cf/data/array/fullarray.py | 2 +- cf/data/array/mixin/__init__.py | 3 +- cf/data/array/mixin/indexmixin.py | 730 ++++++++++++------------ cf/data/array/netcdf4array.py | 5 +- cf/data/array/umarray.py | 7 +- cf/data/data.py | 85 ++- cf/data/fragment/netcdffragmentarray.py | 4 +- cf/data/utils.py | 13 +- cf/functions.py | 76 +-- cf/read_write/netcdf/netcdfread.py | 2 - cf/read_write/read.py | 3 +- cf/test/test_Data.py | 14 +- cf/test/test_Maths.py | 2 - cf/test/test_functions.py | 4 + cf/units.py | 2 + 16 files changed, 462 insertions(+), 492 deletions(-) diff --git a/cf/constants.py b/cf/constants.py index 1472bd83d2..aa2bfd0fcd 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -37,7 +37,7 @@ Find the total amount of physical memory (in bytes). CHUNKSIZE: `int` - The chunk size (in bytes) for data storage and processing. + The Dask chunk size (in bytes). See `cf.chunksize`. TEMPDIR: `str` The location to store temporary files. 
By default it is the diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index 81278c3407..559a9cb410 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -1,8 +1,8 @@ import numpy as np +from cfdm.data.mixin import IndexMixin from ...functions import indices_shape, parse_indices from .abstract import Array -from .mixin import IndexMixin _FULLARRAY_HANDLED_FUNCTIONS = {} diff --git a/cf/data/array/mixin/__init__.py b/cf/data/array/mixin/__init__.py index 8e5dd7690d..5bf63658df 100644 --- a/cf/data/array/mixin/__init__.py +++ b/cf/data/array/mixin/__init__.py @@ -3,4 +3,5 @@ from .cfamixin import CFAMixin from .compressedarraymixin import CompressedArrayMixin from .filearraymixin import FileArrayMixin -from .indexmixin import IndexMixin + +# from .indexmixin import IndexMixin diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py index d105ba943a..3e0ccc7afb 100644 --- a/cf/data/array/mixin/indexmixin.py +++ b/cf/data/array/mixin/indexmixin.py @@ -1,364 +1,366 @@ -from numbers import Integral - -import numpy as np -from dask.array.slicing import normalize_index -from dask.base import is_dask_collection - -from ....functions import indices_shape, parse_indices - - -class IndexMixin: - """Mixin class for lazy indexing of a data array. - - A data for a subspace is retrieved by casting the object as a - `numpy` array. See `__getitem__` for more details. - - **Examples** - - >>> a = cf.{{class}}(...) - >>> a.shape - (6, 5) - >>> print(np.asanyarray(a)) - [[ 0 1 2 3 4]) - [ 5 6 7 8 9] - [10 11 12 13 14] - [15 16 17 18 19] - [20 21 22 23 24] - [25 26 27 28 29]] - >>> a = a[::2, [1, 2, 4]] - >>> a = a[[True, False, True], :] - >>> a.shape - (2, 3) - >>> print(np.asanyarray(a)) - [[ 1, 2, 4], - [21, 22, 24]] - - .. versionadded:: NEXTVERSION - - """ - - def __array__(self, *dtype): - """Convert the `{{class}}` into a `numpy` array. - - .. versionadded:: NEXTVERSION - - :Parameters: - - dtype: optional - Typecode or data-type to which the array is cast. - - :Returns: - - `numpy.ndarray` - An independent `numpy` array of the subspace of the - data defined by the `indices` attribute. - - """ - array = self._get_array() - if dtype: - return array.astype(dtype[0], copy=False) - - return array - - def __getitem__(self, index): - """Returns a subspace of the data as a new `{{class}}`. - - x.__getitem__(indices) <==> x[indices] - - Subspaces created by indexing are lazy and are not applied - until the `{{class}}` object is converted to a `numpy` array, - by which time all lazily-defined subspaces will have been - converted to a single combined index which defines only the - actual elements that need to be retrieved from the original - data. - - The combined index is orthogonal, meaning that the index for - each dimension is to be applied independently, regardless of - how that index was defined. For instance, the indices ``[[0, - 1], [1, 3], 0]`` and ``[:2, 1::2, 0]`` will give identical - results. - - For example, if the original data has shape ``(12, 145, 192)`` - and consecutive subspaces of ``[::2, [1, 3, 4], 96:]`` and - ``[[0, 5], [True, False, True], 0]`` are applied, then only - the elements defined by the combined index``[[0, 10], [1, 4], - 96]`` will be retrieved from the data when `__array__` is - called. - - .. versionadded:: NEXTVERSION - - .. seealso:: `index`, `original_shape`, `__array__`, - `__getitem__` - - :Returns: - - `{{class}}` - The subspaced data. 
- - """ - shape0 = self.shape - index0 = self.index(conform=False) - original_shape = self.original_shape - - index1 = parse_indices(shape0, index, keepdims=False) - - new = self.copy() - new_indices = [] - new_shape = [] - - i = 0 - for ind0, original_size in zip(index0, original_shape): - if isinstance(ind0, Integral): - # The previous call to __getitem__ resulted in a - # dimension being removed (i.e. 'ind0' is - # integer-valued). Therefore 'index1' must have fewer - # elements than 'index0', so we need to "carry - # forward" the integer-valued index so that it is - # available at evaluation time. - new_indices.append(ind0) - continue - - ind1 = index1[i] - size0 = shape0[i] - i += 1 - - # If this dimension is not subspaced by the new index then - # we don't need to update the old index. - if isinstance(ind1, slice) and ind1 == slice(None): - new_indices.append(ind0) - continue - - # Still here? Then we have to work out the index of the - # full array that is equivalent to applying - # 'ind0' followed by 'ind1'. - if is_dask_collection(ind1): - # Note: This will never occur when this __getitem__ is - # being called from within a Dask graph, because - # any lazy indices will have already been - # computed as part of the whole graph execution; - # i.e. we don't have to worry about a - # compute-within-a-compute situation. (If this - # were not the case then we could add - # `scheduler="synchronous"` to the compute - # call.) - ind1 = ind1.compute() - - if isinstance(ind0, slice): - if isinstance(ind1, slice): - # ind0: slice - # ind1: slice - start, stop, step = ind0.indices(original_size) - start1, stop1, step1 = ind1.indices(size0) - size1, mod1 = divmod(stop1 - start1, step1) - - if mod1 != 0: - size1 += 1 - - start += start1 * step - step *= step1 - stop = start + (size1 - 1) * step - - if step > 0: - stop += 1 - else: - stop -= 1 - - if stop < 0: - stop = None - - new_index = slice(start, stop, step) - else: - # ind0: slice - # ind1: int, or array of int/bool - new_index = np.arange(*ind0.indices(original_size))[ind1] - else: - # ind0: array of int. If we made it to here then it - # can't be anything else. This is - # because we've dealt with ind0 - # being a slice or an int, the - # very first ind0 is always - # slice(None), and a previous ind1 - # that was an array of bool will - # have resulted in this ind0 being - # an array of int. - # - # ind1: anything - new_index = np.asanyarray(ind0)[ind1] - - new_indices.append(new_index) - - new._custom["index"] = tuple(new_indices) - - # Find the shape defined by the new index - new_shape = indices_shape(new_indices, original_shape, keepdims=False) - new._set_component("shape", tuple(new_shape), copy=False) - - return new - - def __repr__(self): - """Called by the `repr` built-in function. - - x.__repr__() <==> repr(x) - - """ - return ( - f"" - ) - - @property - def __asanyarray__(self): - """Whether the array is accessed by conversion to a `numpy` array. - - .. versionadded:: NEXTVERSION - - :Returns: - - `True` - - """ - return True - - def _get_array(self, index=None): - """Returns a subspace of the data as a `numpy` array. - - .. versionadded:: NEXTVERSION - - .. seealso:: `__array__`, `index` - - :Parameters: - - index: `tuple` or `None`, optional - Provide the indices that define the subspace. If - `None` then the `index` attribute is used. - - :Returns: - - `numpy.ndarray` - The subspace. 
- - """ - return NotImplementedError( - f"Must implement {self.__class__.__name__}._get_array" - ) - - def index(self, conform=True): - """The index to be applied when converting to a `numpy` array. - - The `shape` is defined by the `index` applied to the - `original_shape`. - - .. versionadded:: NEXTVERSION - - .. seealso:: `shape`, `original_shape` - - :Parameters: - - conform: `bool`, optional - If True, the default, then - - * Convert a decreasing size 1 slice to an increasing - one. - - * Convert, where possible, a sequence of integers to a - slice. - - These transformations are to allow subspacing on data - objects that have restricted indexing functionality, - such as `h5py.Variable` objects. - - If False then these transformations are not done. - - :Returns: - - `tuple` - - **Examples** - - >>> x.shape - (12, 145, 192) - >>> x.index() - (slice(None), slice(None), slice(None)) - >>> x = x[8:7:-1, 10:19:3, [15, 1, 4, 12]] - >>> x = x[[0], [True, False, True], ::-2] - >>> x.shape - (1, 2, 2) - >>> x.index() - (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11)) - >>> x.index(conform=False) - (array([8]), array([10, 16]), array([12, 1])) - - """ - ind = self._custom.get("index") - if ind is None: - # No indices have been applied yet, so define indices that - # are equivalent to Ellipsis, and set the original shape. - ind = (slice(None),) * self.ndim - self._custom["index"] = ind - self._custom["original_shape"] = self.shape - return ind - - if not conform: - return ind - - # Still here? Then conform the indices by: - # - # 1) Converting decreasing size 1 slices to increasing - # ones. This helps when the parent class can't cope with - # decreasing slices. - # - # 2) Converting, where possible, sequences of integers to - # slices. This helps when the parent class can't cope with - # indices that are sequences of integers. - ind = list(ind) - for n, (i, size) in enumerate(zip(ind[:], self.original_shape)): - if isinstance(i, slice): - if size == 1: - start, _, step = i.indices(size) - if step and step < 0: - # Decreasing slices are not universally - # accepted (e.g. `h5py` doesn't like them), - # but we can convert them to increasing ones. - ind[n] = slice(start, start + 1) - elif np.iterable(i): - i = normalize_index((i,), (size,))[0] - if i.size == 1: - # Convert a sequence of one integer into a slice - start = i.item() - ind[n] = slice(start, start + 1) - else: - # Convert a sequence of two or more evenly spaced - # integers into a slice. - step = np.unique(np.diff(i)) - if step.size == 1: - start, stop = i[[0, -1]] - if stop >= start: - stop += 1 - elif stop: - stop = -1 - else: - stop = None - - ind[n] = slice(start, stop, step.item()) - - return tuple(ind) - - @property - def original_shape(self): - """The original shape of the data, before any subspacing. - - The `shape` is defined by the result of subspacing the data in - its original shape with the indices given by `index`. - - .. versionadded:: NEXTVERSION - - .. seealso:: `index`, `shape` - - """ - out = self._custom.get("original_shape") - if out is None: - # No subspace has been defined yet - out = self.shape - self._custom["original_shape"] = out - - return out +# from numbers import Integral +# +# import numpy as np +# from dask.array.slicing import normalize_index +# from dask.base import is_dask_collection +# +# from ....functions import indices_shape, parse_indices +# +# +# class IndexMixin: +# """Mixin class for lazy indexing of a data array. 
+# +# A data for a subspace is retrieved by casting the object as a +# `numpy` array. See `__getitem__` for more details. +# +# **Examples** +# +# >>> a = cf.{{class}}(...) +# >>> a.shape +# (6, 5) +# >>> print(np.asanyarray(a)) +# [[ 0 1 2 3 4]) +# [ 5 6 7 8 9] +# [10 11 12 13 14] +# [15 16 17 18 19] +# [20 21 22 23 24] +# [25 26 27 28 29]] +# >>> a = a[::2, [1, 2, 4]] +# >>> a = a[[True, False, True], :] +# >>> a.shape +# (2, 3) +# >>> print(np.asanyarray(a)) +# [[ 1, 2, 4], +# [21, 22, 24]] +# +# .. versionadded:: NEXTVERSION +# +# """ +# +# +# +# def __array__(self, *dtype): +# """Convert the `{{class}}` into a `numpy` array. +# +# .. versionadded:: NEXTVERSION +# +# :Parameters: +# +# dtype: optional +# Typecode or data-type to which the array is cast. +# +# :Returns: +# +# `numpy.ndarray` +# An independent `numpy` array of the subspace of the +# data defined by the `indices` attribute. +# +# """ +# array = self._get_array() +# if dtype: +# return array.astype(dtype[0], copy=False) +# +# return array +# +# def __getitem__(self, index): +# """Returns a subspace of the data as a new `{{class}}`. +# +# x.__getitem__(indices) <==> x[indices] +# +# Subspaces created by indexing are lazy and are not applied +# until the `{{class}}` object is converted to a `numpy` array, +# by which time all lazily-defined subspaces will have been +# converted to a single combined index which defines only the +# actual elements that need to be retrieved from the original +# data. +# +# The combined index is orthogonal, meaning that the index for +# each dimension is to be applied independently, regardless of +# how that index was defined. For instance, the indices ``[[0, +# 1], [1, 3], 0]`` and ``[:2, 1::2, 0]`` will give identical +# results. +# +# For example, if the original data has shape ``(12, 145, 192)`` +# and consecutive subspaces of ``[::2, [1, 3, 4], 96:]`` and +# ``[[0, 5], [True, False, True], 0]`` are applied, then only +# the elements defined by the combined index``[[0, 10], [1, 4], +# 96]`` will be retrieved from the data when `__array__` is +# called. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `index`, `original_shape`, `__array__`, +# `__getitem__` +# +# :Returns: +# +# `{{class}}` +# The subspaced data. +# +# """ +# shape0 = self.shape +# index0 = self.index(conform=False) +# original_shape = self.original_shape +# +# index1 = parse_indices(shape0, index, keepdims=False) +# +# new = self.copy() +# new_indices = [] +# new_shape = [] +# +# i = 0 +# for ind0, original_size in zip(index0, original_shape): +# if isinstance(ind0, Integral): +# # The previous call to __getitem__ resulted in a +# # dimension being removed (i.e. 'ind0' is +# # integer-valued). Therefore 'index1' must have fewer +# # elements than 'index0', so we need to "carry +# # forward" the integer-valued index so that it is +# # available at evaluation time. +# new_indices.append(ind0) +# continue +# +# ind1 = index1[i] +# size0 = shape0[i] +# i += 1 +# +# # If this dimension is not subspaced by the new index then +# # we don't need to update the old index. +# if isinstance(ind1, slice) and ind1 == slice(None): +# new_indices.append(ind0) +# continue +# +# # Still here? Then we have to work out the index of the +# # full array that is equivalent to applying +# # 'ind0' followed by 'ind1'. 
+# if is_dask_collection(ind1): +# # Note: This will never occur when this __getitem__ is +# # being called from within a Dask graph, because +# # any lazy indices will have already been +# # computed as part of the whole graph execution; +# # i.e. we don't have to worry about a +# # compute-within-a-compute situation. (If this +# # were not the case then we could add +# # `scheduler="synchronous"` to the compute +# # call.) +# ind1 = ind1.compute() +# +# if isinstance(ind0, slice): +# if isinstance(ind1, slice): +# # ind0: slice +# # ind1: slice +# start, stop, step = ind0.indices(original_size) +# start1, stop1, step1 = ind1.indices(size0) +# size1, mod1 = divmod(stop1 - start1, step1) +# +# if mod1 != 0: +# size1 += 1 +# +# start += start1 * step +# step *= step1 +# stop = start + (size1 - 1) * step +# +# if step > 0: +# stop += 1 +# else: +# stop -= 1 +# +# if stop < 0: +# stop = None +# +# new_index = slice(start, stop, step) +# else: +# # ind0: slice +# # ind1: int, or array of int/bool +# new_index = np.arange(*ind0.indices(original_size))[ind1] +# else: +# # ind0: array of int. If we made it to here then it +# # can't be anything else. This is +# # because we've dealt with ind0 +# # being a slice or an int, the +# # very first ind0 is always +# # slice(None), and a previous ind1 +# # that was an array of bool will +# # have resulted in this ind0 being +# # an array of int. +# # +# # ind1: anything +# new_index = np.asanyarray(ind0)[ind1] +# +# new_indices.append(new_index) +# +# new._custom["index"] = tuple(new_indices) +# +# # Find the shape defined by the new index +# new_shape = indices_shape(new_indices, original_shape, keepdims=False) +# new._set_component("shape", tuple(new_shape), copy=False) +# +# return new +# +# def __repr__(self): +# """Called by the `repr` built-in function. +# +# x.__repr__() <==> repr(x) +# +# """ +# return ( +# f"" +# ) +# +# @property +# def __asanyarray__(self): +# """Whether the array is accessed by conversion to a `numpy` array. +# +# .. versionadded:: NEXTVERSION +# +# :Returns: +# +# `True` +# +# """ +# return True +# +# def _get_array(self, index=None): +# """Returns a subspace of the data as a `numpy` array. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `__array__`, `index` +# +# :Parameters: +# +# index: `tuple` or `None`, optional +# Provide the indices that define the subspace. If +# `None` then the `index` attribute is used. +# +# :Returns: +# +# `numpy.ndarray` +# The subspace. +# +# """ +# return NotImplementedError( +# f"Must implement {self.__class__.__name__}._get_array" +# ) +# +# def index(self, conform=True): +# """The index to be applied when converting to a `numpy` array. +# +# The `shape` is defined by the `index` applied to the +# `original_shape`. +# +# .. versionadded:: NEXTVERSION +# +# .. seealso:: `shape`, `original_shape` +# +# :Parameters: +# +# conform: `bool`, optional +# If True, the default, then +# +# * Convert a decreasing size 1 slice to an increasing +# one. +# +# * Convert, where possible, a sequence of integers to a +# slice. +# +# These transformations are to allow subspacing on data +# objects that have restricted indexing functionality, +# such as `h5py.Variable` objects. +# +# If False then these transformations are not done. 
+# +# :Returns: +# +# `tuple` +# +# **Examples** +# +# >>> x.shape +# (12, 145, 192) +# >>> x.index() +# (slice(None), slice(None), slice(None)) +# >>> x = x[8:7:-1, 10:19:3, [15, 1, 4, 12]] +# >>> x = x[[0], [True, False, True], ::-2] +# >>> x.shape +# (1, 2, 2) +# >>> x.index() +# (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11)) +# >>> x.index(conform=False) +# (array([8]), array([10, 16]), array([12, 1])) +# +# """ +# ind = self._custom.get("index") +# if ind is None: +# # No indices have been applied yet, so define indices that +# # are equivalent to Ellipsis, and set the original shape. +# ind = (slice(None),) * self.ndim +# self._custom["index"] = ind +# self._custom["original_shape"] = self.shape +# return ind +# +# if not conform: +# return ind +# +# # Still here? Then conform the indices by: +# # +# # 1) Converting decreasing size 1 slices to increasing +# # ones. This helps when the parent class can't cope with +# # decreasing slices. +# # +# # 2) Converting, where possible, sequences of integers to +# # slices. This helps when the parent class can't cope with +# # indices that are sequences of integers. +# ind = list(ind) +# for n, (i, size) in enumerate(zip(ind[:], self.original_shape)): +# if isinstance(i, slice): +# if size == 1: +# start, _, step = i.indices(size) +# if step and step < 0: +# # Decreasing slices are not universally +# # accepted (e.g. `h5py` doesn't like them), +# # but we can convert them to increasing ones. +# ind[n] = slice(start, start + 1) +# elif np.iterable(i): +# i = normalize_index((i,), (size,))[0] +# if i.size == 1: +# # Convert a sequence of one integer into a slice +# start = i.item() +# ind[n] = slice(start, start + 1) +# else: +# # Convert a sequence of two or more evenly spaced +# # integers into a slice. +# step = np.unique(np.diff(i)) +# if step.size == 1: +# start, stop = i[[0, -1]] +# if stop >= start: +# stop += 1 +# elif stop: +# stop = -1 +# else: +# stop = None +# +# ind[n] = slice(start, stop, step.item()) +# +# return tuple(ind) +# +# @property +# def original_shape(self): +# """The original shape of the data, before any subspacing. +# +# The `shape` is defined by the result of subspacing the data in +# its original shape with the indices given by `index`. +# +# .. versionadded:: NEXTVERSION +# +# .. 
seealso:: `index`, `shape` +# +# """ +# out = self._custom.get("original_shape") +# if out is None: +# # No subspace has been defined yet +# out = self.shape +# self._custom["original_shape"] = out +# +# return out diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index 796925542a..f8eb5e8ad5 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -1,14 +1,11 @@ import cfdm from ...mixin_container import Container - -# from .locks import netcdf_lock -from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin +from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin class NetCDF4Array( ActiveStorageMixin, - # IndexMixin, FileArrayMixin, ArrayMixin, Container, diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index 510b9c97ee..a560365d9b 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -4,11 +4,14 @@ from ...functions import _DEPRECATION_ERROR_ATTRIBUTE, load_stash2standard_name from ...umread_lib.umfile import File, Rec from .abstract import Array -from .mixin import FileArrayMixin, IndexMixin +from .mixin import FileArrayMixin class UMArray( - IndexMixin, FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array + FileArrayMixin, + cfdm.data.mixin.IndexMixin, + cfdm.data.mixin.FileArrayMixin, + Array, ): """A sub-array stored in a PP or UM fields file.""" diff --git a/cf/data/data.py b/cf/data/data.py index 733769a530..9a273f15ab 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1,9 +1,7 @@ import logging import math -import operator from functools import partial, reduce from itertools import product -from numbers import Integral from operator import mul from os import sep @@ -29,10 +27,8 @@ ) from ..functions import ( _DEPRECATION_ERROR_KWARGS, - _numpy_allclose, _section, abspath, - default_netCDF_fillvals, free_memory, parse_indices, ) @@ -72,16 +68,6 @@ _dtype_bool = np.dtype(bool) -# Contstants used to specify which `Data` components should be cleared -# when a new dask array is set. See `Data._clear_after_dask_update` -# for details. -_NONE = 0 # = 0b0000 -_ARRAY = 1 # = 0b0001 -_CACHE = 2 # = 0b0010 -_CFA = 4 # = 0b0100 -_ALL = 15 # = 0b1111 - - class Data(DataClassDeprecationsMixin, CFANetCDF, Container, cfdm.Data): """An N-dimensional data array with units and masked values. @@ -146,6 +132,20 @@ class Data(DataClassDeprecationsMixin, CFANetCDF, Container, cfdm.Data): """ + # Constants used to specify which components should be cleared + # when a new dask array is set. See `_clear_after_dask_update` for + # details. + # + # These must constants must have values 2**N (N>=1), except for + # `_NONE` which must be 0, and `_ALL` which must be the sum of + # other constants. It is therefore convenient to define these + # constants in binary. + _NONE = 0b000 + _ARRAY = 0b001 + _CACHE = 0b010 + _CFA = 0b100 + _ALL = 0b111 + def __new__(cls, *args, **kwargs): """Store component classes.""" instance = super().__new__(cls) @@ -2193,6 +2193,47 @@ def _asreftime(self, inplace=False): return d + def _clear_after_dask_update(self, clear=None): + """Remove components invalidated by updating the `dask` array. + + Removes or modifies components that can't be guaranteed to be + consistent with an updated `dask` array. See the *clear* + parameter for details. + + .. versionadded:: NEXTVERSION + + .. 
seealso:: `_del_Array`, `_del_cached_elements`, + `_set_dask`, `_cfa_del_write` + + :Parameters: + + clear: `int` or `None`, optional + Specify which components to remove, determined by + sequentially combining an integer value of *clear* + with the relevant class-level constants (such as + ``{{class}}._ARRAY``), using the bitwise AND (&) + operator. If ``clear & `` is + True then the corresponding component is cleared. The + default value of `None` is equivalent to *clear* being + set to ``{{class}}._ALL``. + + The bitwise OR (^) operator can be used to retain a + component (or components) but remove all others. For + instance, if *clear* is ``{{class}}._ALL ^ + {{class}}._CACHE`` then all components except the + cached array values will be removed. + + :Returns: + + `int` TODODASK + + """ + clear = super()._clear_after_dask_update(clear) + + if clear & self._CFA: + # Set the CFA write status to False + self._cfa_del_write() + def _combined_units(self, data1, method, inplace): """Combines by given method the data's units with other units. @@ -2998,15 +3039,15 @@ def concatenate( # # Assume at first that all input data instances have True # status, but ... - cfa = _CFA + cfa = cls._CFA for d in processed_data: if not d.cfa_get_write(): # ... the CFA write status is False when any input # data instance has False status ... - cfa = _NONE + cfa = cls._NONE break - if cfa != _NONE: + if cfa != cls._NONE: non_concat_axis_chunks0 = list(processed_data[0].chunks) non_concat_axis_chunks0.pop(axis) for d in processed_data[1:]: @@ -3016,7 +3057,7 @@ def concatenate( # ... the CFA write status is False when any two # input data instances have different chunk # patterns for the non-concatenated axes. - cfa = _NONE + cfa = cls._NONE break # Define the __asanyarray__ status @@ -3030,7 +3071,7 @@ def concatenate( break # Set the new dask array - data0._set_dask(dx, clear=_ALL ^ cfa, asanyarray=asanyarray) + data0._set_dask(dx, clear=cls._ALL ^ cfa, asanyarray=asanyarray) # Set appropriate cached elements cached_elements = {} @@ -3431,7 +3472,7 @@ def Units(self, value): # Setting equivalent units doesn't affect the CFA write # status. Nor does it invalidate any cached values, but # only because we'll adjust those, too. 
- self._set_dask(dx, clear=_ALL ^ _CACHE ^ _CFA) + self._set_dask(dx, clear=self._ALL ^ self._CACHE ^ self._CFA) # Adjust cached values for the new units cache = self._get_cached_elements() @@ -4300,7 +4341,7 @@ def add_file_location(self, location): if updated: dx = self.to_dask_array(_asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE, asanyarray=None) + self._set_dask(dx, clear=self._NONE, asanyarray=None) return location @@ -6777,7 +6818,7 @@ def del_file_location(self, location): if updated: dx = self.to_dask_array(_asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE, asanyarray=None) + self._set_dask(dx, clear=self._NONE, asanyarray=None) return location diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index ee34501e94..4719741904 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,7 +1,7 @@ import cfdm from ..array.abstract import Array -from ..array.mixin import FileArrayMixin, IndexMixin +from ..array.mixin import FileArrayMixin from .h5netcdffragmentarray import H5netcdfFragmentArray from .mixin import FragmentArrayMixin from .netcdf4fragmentarray import NetCDF4FragmentArray @@ -9,9 +9,9 @@ class NetCDFFragmentArray( FragmentArrayMixin, - IndexMixin, cfdm.data.mixin.NetCDFFileMixin, FileArrayMixin, + cfdm.data.mixin.IndexMixin, cfdm.data.mixin.FileArrayMixin, Array, ): diff --git a/cf/data/utils.py b/cf/data/utils.py index 481cf81623..4436556387 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -1,20 +1,11 @@ """General functions useful for `Data` functionality.""" -from functools import lru_cache, partial, reduce -from itertools import product +from functools import partial, reduce from operator import mul -import dask.array as da import numpy as np -from ..cfdatetime import ( - canonical_calendar, - default_calendar, - dt, - dt2rt, - rt2dt, - st2rt, -) +from ..cfdatetime import canonical_calendar, default_calendar from ..units import Units from .dask_utils import cf_YMDhms diff --git a/cf/functions.py b/cf/functions.py index 22820bc3db..04e35e84f1 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -24,9 +24,7 @@ import cfdm import netCDF4 import numpy as np -from dask import config as _config from dask.base import is_dask_collection -from dask.utils import parse_bytes from psutil import virtual_memory from . import __cfa_version__, __file__, __version__ @@ -548,7 +546,7 @@ def FREE_MEMORY(): # We can inherit the generic logic for the cf-python log_level() # function as contained in _log_level, but can't inherit the # user-facing log_level() from cfdm as it operates on cfdm's CONSTANTS -# dict. Define cf-python's own. This also means the log_level +# dict. Define cf-python's own. This also means the log_level # dostrings are independent which is important for providing # module-specific documentation links and directives, etc. _reset_log_emergence_level = cfdm._reset_log_emergence_level @@ -573,6 +571,10 @@ class atol(ConstantAccess, cfdm.atol): pass +class chunksize(ConstantAccess, cfdm.chunksize): + pass + + class rtol(ConstantAccess, cfdm.rtol): pass @@ -778,74 +780,6 @@ def _parse(cls, arg): return bool(arg) -class chunksize(ConstantAccess): - """Set the default chunksize used by `dask` arrays. - - If called without any arguments then the existing chunksize is - returned. - - .. 
note:: Setting the chunk size will also change the `dask` - global configuration value ``'array.chunk-size'``. If - `chunksize` is used in a context manager then the `dask` - configuration value is only altered within that context. - Setting the chunk size directly from the `dask` - configuration API will affect subsequent data creation, - but will *not* change the value of `chunksize`. - - :Parameters: - - arg: number or `str` or `Constant`, optional - The chunksize in bytes. Any size accepted by - `dask.utils.parse_bytes` is accepted, for instance - ``100``, ``'100'``, ``'1e6'``, ``'100 MB'``, ``'100M'``, - ``'5kB'``, ``'5.4 kB'``, ``'1kiB'``, ``'1e6 kB'``, and - ``'MB'`` are all valid sizes. - - Note that if *arg* is a `float`, or a string that implies - a non-integral amount of bytes, then the integer part - (rounded down) will be used. - - *Parameter example:* - A chunksize of 2 MiB may be specified as ``'2097152'`` - or ``'2 MiB'`` - - *Parameter example:* - Chunksizes of ``'2678.9'`` and ``'2.6789 KB'`` are both - equivalent to ``2678``. - - :Returns: - - `Constant` - The value prior to the change, or the current value if no - new value was specified. - - """ - - _name = "CHUNKSIZE" - - def _parse(cls, arg): - """Parse a new constant value. - - .. versionaddedd:: 3.8.0 - - :Parameters: - - cls: - This class. - - arg: - The given new constant value. - - :Returns: - - A version of the new constant value suitable for insertion - into the `CONSTANTS` dictionary. - - """ - _config.set({"array.chunk-size": arg}) - return parse_bytes(arg) - - class tempdir(ConstantAccess): """The directory for internally generated temporary files. diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 571b83e1ec..54d92305fd 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -1,6 +1,4 @@ import cfdm -import netCDF4 -import numpy as np from packaging.version import Version diff --git a/cf/read_write/read.py b/cf/read_write/read.py index 6b23357fe4..9b18944e87 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -2,7 +2,6 @@ import os import tempfile from glob import glob -from numbers import Integral from os.path import isdir from re import Pattern from urllib.parse import urlparse @@ -979,7 +978,7 @@ def read( removed_at="5.0.0", ) # pragma: no cover - if chunks is not "auto": + if chunks != "auto": _DEPRECATION_ERROR_FUNCTION_KWARGS( "cf.read", {"chunk": chunk}, diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 55d2f3469a..3b51206af5 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4256,15 +4256,15 @@ def test_Data_atol(self): """Test the `_atol` Data property.""" d = cf.Data(1) self.assertEqual(d._atol, cf.atol()) - cf.atol(0.001) - self.assertEqual(d._atol, 0.001) + with cf.atol(0.001): + self.assertEqual(d._atol, 0.001) def test_Data_rtol(self): """Test the `_rtol` Data property.""" d = cf.Data(1) self.assertEqual(d._rtol, cf.rtol()) - cf.rtol(0.001) - self.assertEqual(d._rtol, 0.001) + with cf.rtol(0.001): + self.assertEqual(d._rtol, 0.001) def test_Data_hardmask(self): """Test Data.hardmask.""" @@ -4589,7 +4589,7 @@ def test_Data_convert_reference_time(self): self.assertTrue((e.array == [72, 48, 24, 0]).all()) def test_Data_clear_after_dask_update(self): - """Test Data._clear_after_dask_update""" + """Test Data._clear_after_dask_update.""" d = cf.Data([1, 2, 3], "m") dx = d.to_dask_array() @@ -4599,8 +4599,8 @@ def test_Data_clear_after_dask_update(self): 
self.assertTrue(d._get_cached_elements()) - _ALL = cf.data.data._ALL - _CACHE = cf.data.data._CACHE + _ALL = cf.Data._ALL + _CACHE = cf.Data._CACHE d._set_dask(dx, clear=_ALL ^ _CACHE) self.assertTrue(d._get_cached_elements()) diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py index 1c93b7910c..add50ae710 100644 --- a/cf/test/test_Maths.py +++ b/cf/test/test_Maths.py @@ -2,8 +2,6 @@ import faulthandler import unittest -import numpy as np - faulthandler.enable() # to debug seg faults and timeouts import cf diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index b59b29635b..ad7a59c4f3 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -195,6 +195,10 @@ def test_configuration(self): # messages: cf.log_level("DISABLE") + # Reset configuration + cf.configuration(**org) + self.assertEqual(cf.configuration(), org) + def test_context_managers(self): # rtol, atol for func in (cf.atol, cf.rtol): diff --git a/cf/units.py b/cf/units.py index 09c2d79c51..d1e50a6cae 100644 --- a/cf/units.py +++ b/cf/units.py @@ -22,8 +22,10 @@ class Units: """ def __new__(cls, *args, **kwargs): + """Return a new new Units instance.""" return cfUnits(*args, **kwargs) @staticmethod def conform(*args, **kwargs): + """Conform values to equivalent values in a compatible unit.""" return cfUnits.conform(*args, **kwargs) From 3585b163d27576ce0f393694e54d354b43abee97 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 8 Nov 2024 11:06:16 +0000 Subject: [PATCH 05/27] dev --- cf/test/test_Maths.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py index add50ae710..ac7945e4ab 100644 --- a/cf/test/test_Maths.py +++ b/cf/test/test_Maths.py @@ -43,10 +43,14 @@ def test_curl_xy(self): c0 = (term1 - term2) / (sin_theta * r) + print('p', c.data._get_cached_elements()) + print('p', c0.data._get_cached_elements()) # Check the data with cf.rtol(1e-10): self.assertTrue(c.data.allclose(c0.data)) + print('p2', c.data._get_cached_elements()) + print('p2', c0.data._get_cached_elements()) del c.long_name c0.set_data(c.data) self.assertTrue(c.equals(c0)) From 6322e5a426daaeefee31a433cfb9fb4a95b44249 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 11 Nov 2024 09:26:13 +0000 Subject: [PATCH 06/27] dev --- cf/functions.py | 146 ++++++++++++++---------------------------- cf/test/test_Maths.py | 4 -- 2 files changed, 48 insertions(+), 102 deletions(-) diff --git a/cf/functions.py b/cf/functions.py index 04e35e84f1..bdfe306f00 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -11,7 +11,6 @@ from collections.abc import Iterable from itertools import product from math import isnan -from numbers import Integral from os import mkdir from os.path import abspath as _os_path_abspath from os.path import dirname as _os_path_dirname @@ -2125,6 +2124,10 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): indices: `tuple` The indices to be applied. + cyclic: `bool`, optional + If True then allow cyclic slices (such as ``slice(-4, 3, + 1)``). + keepdims: `bool`, optional If True then an integral index is converted to a slice. For instance, ``3`` would become ``slice(3, 4)``. 
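The `cyclic` and `keepdims` behaviour documented in the hunk above can be sketched as follows (a minimal sketch; the printed values are indicative only, inferred from the docstring examples and from the rewritten implementation in the next hunk):

    import cf

    # keepdims (default True): an integer index is widened to a
    # length-one slice, so the indexed dimension is retained
    print(cf.parse_indices((5, 8), (3, slice(None))))
    # e.g. [slice(3, 4, 1), slice(None, None, None)]

    # cyclic=True: a wrap-around slice is returned as an equivalent
    # non-cyclic slice plus the roll required on that axis
    indices, roll = cf.parse_indices((10,), (slice(-4, 3, 1),), cyclic=True)
    print(indices, roll)
    # e.g. [slice(0, 7, 1)] {0: 4}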
@@ -2153,108 +2156,55 @@ def parse_indices(shape, indices, cyclic=False, keepdims=True): >>> cf.parse_indices((5, 8), (cf.Data([1, 3]),)) [dask.array, slice(None, None, None)] - """ - parsed_indices = [] - roll = {} - - if not isinstance(indices, tuple): - indices = (indices,) - - # Initialise the list of parsed indices as the input indices with any - # Ellipsis objects expanded - length = len(indices) - n = len(shape) - ndim = n - for index in indices: - if index is Ellipsis: - m = n - length + 1 - parsed_indices.extend([slice(None)] * m) - n -= m - else: - parsed_indices.append(index) - n -= 1 - - length -= 1 - - len_parsed_indices = len(parsed_indices) + parsed_indices = cfdm.parse_indices(shape, indices, keepdims=keepdims) - if ndim and len_parsed_indices > ndim: - raise IndexError( - f"Invalid indices {parsed_indices} for array with shape {shape}" - ) - - if len_parsed_indices < ndim: - parsed_indices.extend([slice(None)] * (ndim - len_parsed_indices)) - - if not ndim and parsed_indices: - raise IndexError( - "Scalar array can only be indexed with () or Ellipsis" - ) + if not cyclic: + return parsed_indices + roll = {} for i, (index, size) in enumerate(zip(parsed_indices, shape)): - if cyclic and isinstance(index, slice): - # Check for a cyclic slice - try: - index = normalize_slice(index, size, cyclic=True) - except IndexError: - # Non-cyclic slice - pass - else: - # Cyclic slice - start = index.start - stop = index.stop - step = index.step - if ( - step > 0 - and -size <= start < 0 - and 0 <= stop <= size + start - ): - # x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - # x[ -1:0:1] => [9] - # x[ -1:1:1] => [9, 0] - # x[ -1:3:1] => [9, 0, 1, 2] - # x[ -1:9:1] => [9, 0, 1, 2, 3, 4, 5, 6, 7, 8] - # x[ -4:0:1] => [6, 7, 8, 9] - # x[ -4:1:1] => [6, 7, 8, 9, 0] - # x[ -4:3:1] => [6, 7, 8, 9, 0, 1, 2] - # x[ -4:6:1] => [6, 7, 8, 9, 0, 1, 2, 3, 4, 5] - # x[ -9:0:1] => [1, 2, 3, 4, 5, 6, 7, 8, 9] - # x[ -9:1:1] => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0] - # x[-10:0:1] => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - index = slice(0, stop - start, step) - roll[i] = -start - - elif ( - step < 0 and 0 <= start < size and start - size <= stop < 0 - ): - # x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] - # x[0: -4:-1] => [0, 9, 8, 7] - # x[6: -1:-1] => [6, 5, 4, 3, 2, 1, 0] - # x[6: -2:-1] => [6, 5, 4, 3, 2, 1, 0, 9] - # x[6: -4:-1] => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7] - # x[0: -2:-1] => [0, 9] - # x[0:-10:-1] => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1] - index = slice(start - stop - 1, None, step) - roll[i] = -1 - stop - - elif keepdims and isinstance(index, Integral): - # Convert an integral index to a slice - if index == -1: - index = slice(-1, None, None) - else: - index = slice(index, index + 1, 1) - - elif hasattr(index, "to_dask_array"): - to_dask_array = index.to_dask_array - if callable(to_dask_array): - # Replace index with its Dask array - index = to_dask_array() - - parsed_indices[i] = index + if not isinstance(index, slice): + continue - if not cyclic: - return parsed_indices + try: + index = normalize_slice(index, size, cyclic=True) + except IndexError: + # Non-cyclic slice + pass + else: + # Cyclic slice + start = index.start + stop = index.stop + step = index.step + if step > 0 and -size <= start < 0 and 0 <= stop <= size + start: + # x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + # x[ -1:0:1] => [9] + # x[ -1:1:1] => [9, 0] + # x[ -1:3:1] => [9, 0, 1, 2] + # x[ -1:9:1] => [9, 0, 1, 2, 3, 4, 5, 6, 7, 8] + # x[ -4:0:1] => [6, 7, 8, 9] + # x[ -4:1:1] => [6, 7, 8, 9, 0] + # x[ -4:3:1] => [6, 7, 8, 9, 0, 1, 2] + # x[ -4:6:1] => [6, 7, 8, 9, 0, 1, 2, 
3, 4, 5] + # x[ -9:0:1] => [1, 2, 3, 4, 5, 6, 7, 8, 9] + # x[ -9:1:1] => [1, 2, 3, 4, 5, 6, 7, 8, 9, 0] + # x[-10:0:1] => [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + index = slice(0, stop - start, step) + roll[i] = -start + + elif step < 0 and 0 <= start < size and start - size <= stop < 0: + # x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] + # x[0: -4:-1] => [0, 9, 8, 7] + # x[6: -1:-1] => [6, 5, 4, 3, 2, 1, 0] + # x[6: -2:-1] => [6, 5, 4, 3, 2, 1, 0, 9] + # x[6: -4:-1] => [6, 5, 4, 3, 2, 1, 0, 9, 8, 7] + # x[0: -2:-1] => [0, 9] + # x[0:-10:-1] => [0, 9, 8, 7, 6, 5, 4, 3, 2, 1] + index = slice(start - stop - 1, None, step) + roll[i] = -1 - stop + + parsed_indices[i] = index return parsed_indices, roll diff --git a/cf/test/test_Maths.py b/cf/test/test_Maths.py index ac7945e4ab..add50ae710 100644 --- a/cf/test/test_Maths.py +++ b/cf/test/test_Maths.py @@ -43,14 +43,10 @@ def test_curl_xy(self): c0 = (term1 - term2) / (sin_theta * r) - print('p', c.data._get_cached_elements()) - print('p', c0.data._get_cached_elements()) # Check the data with cf.rtol(1e-10): self.assertTrue(c.data.allclose(c0.data)) - print('p2', c.data._get_cached_elements()) - print('p2', c0.data._get_cached_elements()) del c.long_name c0.set_data(c.data) self.assertTrue(c.equals(c0)) From c9640601d886454251ea757b28efaa0ff442f823 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 19 Nov 2024 14:42:27 +0000 Subject: [PATCH 07/27] dev --- cf/data/data.py | 11 ++++-- cf/docstring/docstring.py | 78 --------------------------------------- 2 files changed, 7 insertions(+), 82 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index 9a273f15ab..c06941d1c0 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -2225,7 +2225,7 @@ def _clear_after_dask_update(self, clear=None): :Returns: - `int` TODODASK + `None` """ clear = super()._clear_after_dask_update(clear) @@ -2711,8 +2711,7 @@ def _binary_operation(cls, data, other, method): else: data = d - # Inplace? - return d + return data def _parse_indices(self, *args, **kwargs): """'cf.Data._parse_indices' is not available. @@ -7167,7 +7166,11 @@ def reshape(self, *shape, merge_chunks=True, limit=None, inplace=False): super(Data, d).reshape( *shape, merge_chunks=merge_chunks, limit=limit, inplace=True ) - # TODODASK: reshape: Need to clear cyclic axes, as we can't help but lose them in this operation + + # Clear cyclic axes, as we can't help but lose them in this + # operation + d._cyclic = _empty_set + return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index a11e0129b9..88ef41837c 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -177,42 +177,6 @@ value given by the *radius* parameter is used instead. A value of ``'earth'`` is equivalent to a default value of 6371229 metres.""", - # chunks - "{{chunks: `int`, `tuple`, `dict` or `str`, optional}}": """chunks: `int`, `tuple`, `dict` or `str`, optional - Specify the chunking of the underlying dask array. - - Any value accepted by the *chunks* parameter of the - `dask.array.from_array` function is allowed. - - By default, ``"auto"`` is used to specify the array - chunking, which uses a chunk size in bytes defined by - the `cf.chunksize` function, preferring square-like - chunk shapes. - - *Parameter example:* - A blocksize like ``1000``. - - *Parameter example:* - A blockshape like ``(1000, 1000)``. - - *Parameter example:* - Explicit sizes of all blocks along all dimensions - like ``((1000, 1000, 500), (400, 400))``. 
- - *Parameter example:* - A size in bytes, like ``"100MiB"`` which will choose - a uniform block-like shape, preferring square-like - chunk shapes. - - *Parameter example:* - A blocksize of ``-1`` or `None` in a tuple or - dictionary indicates the size of the corresponding - dimension. - - *Parameter example:* - Blocksizes of some or all dimensions mapped to - dimension positions, like ``{1: 200}``, or ``{0: -1, - 1: (400, 400)}``.""", # Returns formula "{{Returns formula}}": """5-`tuple` * The standard name of the parametric coordinates. @@ -231,28 +195,6 @@ domain axis. If the vertical axis does not appear in the computed non-parametric coordinates then this an empty tuple.""", - # collapse axes - "{{collapse axes: (sequence of) `int`, optional}}": """axes: (sequence of) `int`, optional - The axes to be collapsed. By default all axes are - collapsed, resulting in output with size 1. Each axis - is identified by its integer position. If *axes* is an - empty sequence then the collapse is applied to each - scalar element and the result has the same shape as - the input data.""", - # collapse squeeze - "{{collapse squeeze: `bool`, optional}}": """squeeze: `bool`, optional - By default, the axes which are collapsed are left in - the result as dimensions with size one, so that the - result will broadcast correctly against the input - array. If set to True then collapsed axes are removed - from the data.""", - # collapse keepdims - "{{collapse keepdims: `bool`, optional}}": """keepdims: `bool`, optional - By default, the axes which are collapsed are left in - the result as dimensions with size one, so that the - result will broadcast correctly against the input - array. If set to False then collapsed axes are removed - from the data.""", # weights "{{weights: data_like, `dict`, or `None`, optional}}": """weights: data_like, `dict`, or `None`, optional Weights associated with values of the data. By default @@ -302,26 +244,6 @@ non-missing elements. A value of 1 applies Bessel's correction. If the calculation is weighted then *ddof* can only be 0 or 1.""", - # split_every - "{{split_every: `int` or `dict`, optional}}": """split_every: `int` or `dict`, optional - Determines the depth of the recursive aggregation. If - set to or more than the number of input chunks, the - aggregation will be performed in two steps, one - partial collapse per input chunk and a single - aggregation at the end. If set to less than that, an - intermediate aggregation step will be used, so that - any of the intermediate or final aggregation steps - operates on no more than ``split_every`` inputs. The - depth of the aggregation graph will be - :math:`log_{split\_every}}(\textnormal{input chunks - along reduced axes})`. Setting to a low value can reduce - cache size and network transfers, at the cost of more - CPU and a larger dask graph. - - By default, `dask` heuristically decides on a good - value. A default can also be set globally with the - ``split_every`` key in `dask.config`. 
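The `split_every` text being removed above describes how the depth of the dask reduction tree is controlled. A minimal sketch of passing it to `Data` collapses (the keyword on `unique` appears later in this series; its acceptance by other collapse methods is assumed from the shared docstring being removed here):

    import numpy as np
    import cf

    d = cf.Data(np.arange(1000), chunks=10)

    # Aggregate at most 4 intermediate chunk results per reduction
    # step, giving a deeper graph but smaller intermediates; a dict
    # mapping axis to depth is also accepted
    print(d.sum(split_every=4))
    print(d.unique(split_every=4))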
See - `dask.array.reduction` for details.""", # active_storage "{{active_storage: `bool`, optional}}": """{{active_storage: `bool`, optional}} If True then attempt to perform the collapse using From 4b9e1564b6e35d470dc3371633ee410fb37314d7 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 25 Nov 2024 11:33:10 +0000 Subject: [PATCH 08/27] dev --- cf/data/array/h5netcdfarray.py | 65 +------------------------ cf/data/array/locks.py | 4 -- cf/data/array/netcdf4array.py | 59 ----------------------- cf/regrid/regrid.py | 87 ++++++++++++++++++---------------- cf/regrid/regridoperator.py | 32 ++++++------- 5 files changed, 61 insertions(+), 186 deletions(-) delete mode 100644 cf/data/array/locks.py diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 465a9ebf81..b6c603b8d4 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -1,18 +1,11 @@ import cfdm from ...mixin_container import Container - -# from .locks import netcdf_lock -from .mixin import ( # , IndexMixin - ActiveStorageMixin, - ArrayMixin, - FileArrayMixin, -) +from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin class H5netcdfArray( ActiveStorageMixin, - # IndexMixin, FileArrayMixin, ArrayMixin, Container, @@ -28,59 +21,3 @@ class H5netcdfArray( .. versionadded:: NEXTVERSION """ - - # def __dask_tokenize__(self): - # """Return a value fully representative of the object. - - # .. versionadded:: NEXTVERSION - - # """ - # return super().__dask_tokenize__() + (self.get_mask(),) - - # @property - # def _lock(self): - # """Set the lock for use in `dask.array.from_array`. - - # Returns a lock object because concurrent reads are not - # currently supported by the HDF5 library. The lock object will - # be the same for all `NetCDF4Array` and `H5netcdfArray` - # instances, regardless of the dataset they access, which means - # that access to all netCDF and HDF files coordinates around the - # same lock. - - # .. versionadded:: NEXTVERSION - - # """ - # return netcdf_lock - - # def _get_array(self, index=None): - # """Returns a subspace of the dataset variable. - - # .. versionadded:: NEXTVERSION - - # .. seealso:: `__array__`, `index` - - # :Parameters: - - # {{index: `tuple` or `None`, optional}} - - # :Returns: - - # `numpy.ndarray` - # The subspace. - - # """ - # if index is None: - # index = self.index() - - # # We need to lock because the netCDF file is about to be accessed. - # self._lock.acquire() - - # # It's cfdm.H5netcdfArray.__getitem__ that we want to - # # call here, but we use 'Container' in super because - # # that comes immediately before cfdm.H5netcdfArray in - # # the method resolution order. - # array = super(Container, self).__getitem__(index) - - # self._lock.release() - # return array diff --git a/cf/data/array/locks.py b/cf/data/array/locks.py deleted file mode 100644 index 5a7b2bd333..0000000000 --- a/cf/data/array/locks.py +++ /dev/null @@ -1,4 +0,0 @@ -from dask.utils import SerializableLock - -# Global lock for netCDF file access -netcdf_lock = SerializableLock() diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py index f8eb5e8ad5..5255109006 100644 --- a/cf/data/array/netcdf4array.py +++ b/cf/data/array/netcdf4array.py @@ -19,62 +19,3 @@ class NetCDF4Array( method. See `cf.data.collapse.Collapse` for details. """ - - # def __dask_tokenize__(self): - # """Return a value fully representative of the object. - # - # .. 
versionadded:: 3.15.0 - # - # """ - # return super().__dask_tokenize__() + (self.get_mask(),) - - -# -# @property -# def _lock(self): -# """Set the lock for use in `dask.array.from_array`. -# -# Returns a lock object because concurrent reads are not -# currently supported by the netCDF and HDF libraries. The lock -# object will be the same for all `NetCDF4Array` and -# `H5netcdfArray` instances, regardless of the dataset they -# access, which means that access to all netCDF and HDF files -# coordinates around the same lock. -# -# .. versionadded:: 3.14.0 -# -# """ -# return netcdf_lock -# -# def _get_array(self, index=None): -# """Returns a subspace of the dataset variable. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `__array__`, `index` -# -# :Parameters: -# -# {{index: `tuple` or `None`, optional}} -# -# :Returns: -# -# `numpy.ndarray` -# The subspace. -# -# """ -# if index is None: -# index = self.index() -# -# # Note: We need to lock because the netCDF file is about to be -# # accessed. -# self._lock.acquire() -# -# # Note: It's cfdm.NetCDFArray.__getitem__ that we want to call -# # here, but we use 'Container' in super because that -# # comes immediately before cfdm.NetCDFArray in the -# # method resolution order. -# array = super(Container, self).__getitem__(index) -# -# self._lock.release() -# return array diff --git a/cf/regrid/regrid.py b/cf/regrid/regrid.py index eb7ee6656a..b2ce54ee55 100644 --- a/cf/regrid/regrid.py +++ b/cf/regrid/regrid.py @@ -2462,10 +2462,10 @@ def create_esmpy_weights( # Write the weights to a netCDF file (copying the # dimension and variable names and structure of a weights # file created by ESMF). + from cfdm.data.locks import netcdf_lock from netCDF4 import Dataset from .. import __version__ - from ..data.array.locks import netcdf_lock if ( max(dst_esmpy_field.data.size, src_esmpy_field.data.size) @@ -2491,48 +2491,51 @@ def create_esmpy_weights( if src_grid.ln_z: regrid_method += f", ln {src_grid.method} in vertical" - netcdf_lock.acquire() - nc = Dataset(weights_file, "w", format="NETCDF4") + with netcdf_lock: + nc = Dataset(weights_file, "w", format="NETCDF4") - nc.title = ( - f"Regridding weights from source {src_grid.type} " - f"with shape {src_shape} to destination " - f"{dst_grid.type} with shape {dst_shape}" - ) - nc.source = f"cf v{__version__}, esmpy v{esmpy.__version__}" - nc.history = f"Created at {datetime.now()}" - nc.regrid_method = regrid_method - nc.ESMF_unmapped_action = r.unmapped_action - nc.ESMF_ignore_degenerate = int(r.ignore_degenerate) - - nc.createDimension("n_s", weights.size) - nc.createDimension("src_grid_rank", src_esmpy_grid.rank) - nc.createDimension("dst_grid_rank", dst_esmpy_grid.rank) - - v = nc.createVariable("src_grid_dims", i_dtype, ("src_grid_rank",)) - v.long_name = "Source grid shape" - v[...] = src_shape - - v = nc.createVariable("dst_grid_dims", i_dtype, ("dst_grid_rank",)) - v.long_name = "Destination grid shape" - v[...] = dst_shape - - v = nc.createVariable("S", weights.dtype, ("n_s",)) - v.long_name = "Weights values" - v[...] = weights - - v = nc.createVariable("row", i_dtype, ("n_s",), zlib=True) - v.long_name = "Destination/row indices" - v.start_index = start_index - v[...] = row - - v = nc.createVariable("col", i_dtype, ("n_s",), zlib=True) - v.long_name = "Source/col indices" - v.start_index = start_index - v[...] 
= col - - nc.close() - netcdf_lock.release() + nc.title = ( + f"Regridding weights from source {src_grid.type} " + f"with shape {src_shape} to destination " + f"{dst_grid.type} with shape {dst_shape}" + ) + nc.source = f"cf v{__version__}, esmpy v{esmpy.__version__}" + nc.history = f"Created at {datetime.now()}" + nc.regrid_method = regrid_method + nc.ESMF_unmapped_action = r.unmapped_action + nc.ESMF_ignore_degenerate = int(r.ignore_degenerate) + + nc.createDimension("n_s", weights.size) + nc.createDimension("src_grid_rank", src_esmpy_grid.rank) + nc.createDimension("dst_grid_rank", dst_esmpy_grid.rank) + + v = nc.createVariable( + "src_grid_dims", i_dtype, ("src_grid_rank",) + ) + v.long_name = "Source grid shape" + v[...] = src_shape + + v = nc.createVariable( + "dst_grid_dims", i_dtype, ("dst_grid_rank",) + ) + v.long_name = "Destination grid shape" + v[...] = dst_shape + + v = nc.createVariable("S", weights.dtype, ("n_s",)) + v.long_name = "Weights values" + v[...] = weights + + v = nc.createVariable("row", i_dtype, ("n_s",), zlib=True) + v.long_name = "Destination/row indices" + v.start_index = start_index + v[...] = row + + v = nc.createVariable("col", i_dtype, ("n_s",), zlib=True) + v.long_name = "Source/col indices" + v.start_index = start_index + v[...] = col + + nc.close() if esmpy_regrid_operator is None: # Destroy esmpy objects (the esmpy.Grid objects exist even if diff --git a/cf/regrid/regridoperator.py b/cf/regrid/regridoperator.py index 10a77bc641..7621addc7e 100644 --- a/cf/regrid/regridoperator.py +++ b/cf/regrid/regridoperator.py @@ -725,28 +725,26 @@ def tosparse(self): weights_file = self.weights_file if weights_file is not None: # Read the weights from the weights file + from cfdm.data.locks import netcdf_lock from netCDF4 import Dataset - from ..data.array.locks import netcdf_lock + with netcdf_lock: + nc = Dataset(weights_file, "r") + weights = nc.variables["S"][...] + row = nc.variables["row"][...] + col = nc.variables["col"][...] - netcdf_lock.acquire() - nc = Dataset(weights_file, "r") - weights = nc.variables["S"][...] - row = nc.variables["row"][...] - col = nc.variables["col"][...] 
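A weights file with the structure written above can be read back into a SciPy sparse matrix along these lines (a sketch only, not the library's own code path; the file name is a placeholder and `coo_matrix` is just one possible container):

    from netCDF4 import Dataset
    from scipy.sparse import coo_matrix

    with Dataset("weights.nc", "r") as nc:  # placeholder file name
        S = nc.variables["S"][...]
        # 'row' and 'col' are written with a start_index attribute
        # (1-based by default), so shift them to 0-based positions
        row = nc.variables["row"][...] - getattr(nc.variables["row"], "start_index", 1)
        col = nc.variables["col"][...] - getattr(nc.variables["col"], "start_index", 1)
        dst_size = int(nc.variables["dst_grid_dims"][...].prod())
        src_size = int(nc.variables["src_grid_dims"][...].prod())

    # One row per destination point, one column per source point
    w = coo_matrix((S, (row, col)), shape=(dst_size, src_size))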
+ try: + col_start_index = nc.variables["col"].start_index + except AttributeError: + col_start_index = 1 - try: - col_start_index = nc.variables["col"].start_index - except AttributeError: - col_start_index = 1 + try: + row_start_index = nc.variables["row"].start_index + except AttributeError: + row_start_index = 1 - try: - row_start_index = nc.variables["row"].start_index - except AttributeError: - row_start_index = 1 - - nc.close() - netcdf_lock.release() + nc.close() else: raise ValueError( "Conversion to sparse array format requires at least " From 09849af7b779b4d93b2a96217cce40bbc40a4d1a Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 25 Nov 2024 12:20:06 +0000 Subject: [PATCH 09/27] dev --- cf/data/dask_utils.py | 90 ------------------------------------------- 1 file changed, 90 deletions(-) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index b088d97bb8..04a0c09921 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -7,104 +7,14 @@ from functools import partial -import dask.array as da import numpy as np from cfdm.data.dask_utils import cfdm_asanyarray -from dask.core import flatten from scipy.ndimage import convolve1d from ..cfdatetime import dt, dt2rt, rt2dt -from ..functions import atol as cf_atol -from ..functions import rtol as cf_rtol from ..units import Units -def _da_ma_allclose(x, y, masked_equal=True, rtol=None, atol=None): - """An effective dask.array.ma.allclose method. - - True if two dask arrays are element-wise equal within a tolerance. - - Equivalent to allclose except that masked values are treated as - equal (default) or unequal, depending on the masked_equal - argument. - - Define an effective da.ma.allclose method here because one is - currently missing in the Dask codebase. - - Note that all default arguments are the same as those provided to - the corresponding NumPy method (see the `numpy.ma.allclose` API - reference). - - .. versionadded:: 3.14.0 - - :Parameters: - - x: a dask array to compare with y - - y: a dask array to compare with x - - masked_equal: `bool`, optional - Whether masked values in a and b are considered equal - (True) or not (False). They are considered equal by - default. - - {{rtol: number, optional}} - - {{atol: number, optional}} - - :Returns: - - `bool` - A Boolean value indicating whether or not the two dask - arrays are element-wise equal to the given *rtol* and - *atol* tolerance. - - """ - # TODODASK: put in a PR to Dask to request to add as genuine method. - - if rtol is None: - rtol = cf_rtol() - if atol is None: - atol = cf_atol() - - # Must pass rtol=rtol, atol=atol in as kwargs to allclose, rather than it - # using those in local scope from the outer function arguments, because - # Dask's internal algorithms require these to be set as parameters. - def allclose(a_blocks, b_blocks, rtol=rtol, atol=atol): - """Run `ma.allclose` across multiple blocks over two arrays.""" - result = True - # Handle scalars, including 0-d arrays, for which a_blocks and - # b_blocks will have the corresponding type and hence not be iterable. - # With this approach, we avoid inspecting sizes or lengths, and for - # the 0-d array blocks the following iteration can be used unchanged - # and will only execute once with block sizes as desired of: - # (np.array(),)[0] = array(). Note - # can't check against more general case of collections.abc.Iterable - # because a 0-d array is also iterable, but in practice always a list. 
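The masked, tolerance-aware comparison that this removed helper provided is exercised through `Data.allclose` together with the `cf.rtol` and `cf.atol` constants, as in the `test_Maths` change above. A minimal sketch, assuming the method signatures used elsewhere in this series:

    import cf

    d = cf.Data([1.0, 2.0, cf.masked], "m")
    e = cf.Data([1.0 + 1e-12, 2.0, cf.masked], "m")

    # Masked elements compare as equal; the tolerances default to the
    # active cf.rtol() and cf.atol() settings
    with cf.rtol(1e-10), cf.atol(1e-10):
        print(bool(d.allclose(e)))  # True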
- if not isinstance(a_blocks, list): - a_blocks = (a_blocks,) - if not isinstance(b_blocks, list): - b_blocks = (b_blocks,) - - # Note: If a_blocks or b_blocks has more than one chunk in - # more than one dimension they will comprise a nested - # sequence of sequences, that needs to be flattened so - # that we can safely iterate through the actual numpy - # array elements. - - for a, b in zip(flatten(a_blocks), flatten(b_blocks)): - result &= np.ma.allclose( - a, b, masked_equal=masked_equal, rtol=rtol, atol=atol - ) - - return result - - axes = tuple(range(x.ndim)) - return da.blockwise( - allclose, "", x, axes, y, axes, dtype=bool, rtol=rtol, atol=atol - ) - - def cf_contains(a, value): """Whether or not an array contains a value. From 15f4541242b3dcd90c165a5d8c384923e6fbf892 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 2 Dec 2024 13:09:03 +0000 Subject: [PATCH 10/27] dev --- cf/data/array/mixin/indexmixin.py | 366 ---------------------------- cf/data/collapse/dask_collapse.py | 34 +-- cf/data/dask_utils.py | 22 +- cf/data/data.py | 114 ++++----- cf/data/utils.py | 8 +- cf/read_write/netcdf/netcdfwrite.py | 6 +- 6 files changed, 92 insertions(+), 458 deletions(-) delete mode 100644 cf/data/array/mixin/indexmixin.py diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py deleted file mode 100644 index 3e0ccc7afb..0000000000 --- a/cf/data/array/mixin/indexmixin.py +++ /dev/null @@ -1,366 +0,0 @@ -# from numbers import Integral -# -# import numpy as np -# from dask.array.slicing import normalize_index -# from dask.base import is_dask_collection -# -# from ....functions import indices_shape, parse_indices -# -# -# class IndexMixin: -# """Mixin class for lazy indexing of a data array. -# -# A data for a subspace is retrieved by casting the object as a -# `numpy` array. See `__getitem__` for more details. -# -# **Examples** -# -# >>> a = cf.{{class}}(...) -# >>> a.shape -# (6, 5) -# >>> print(np.asanyarray(a)) -# [[ 0 1 2 3 4]) -# [ 5 6 7 8 9] -# [10 11 12 13 14] -# [15 16 17 18 19] -# [20 21 22 23 24] -# [25 26 27 28 29]] -# >>> a = a[::2, [1, 2, 4]] -# >>> a = a[[True, False, True], :] -# >>> a.shape -# (2, 3) -# >>> print(np.asanyarray(a)) -# [[ 1, 2, 4], -# [21, 22, 24]] -# -# .. versionadded:: NEXTVERSION -# -# """ -# -# -# -# def __array__(self, *dtype): -# """Convert the `{{class}}` into a `numpy` array. -# -# .. versionadded:: NEXTVERSION -# -# :Parameters: -# -# dtype: optional -# Typecode or data-type to which the array is cast. -# -# :Returns: -# -# `numpy.ndarray` -# An independent `numpy` array of the subspace of the -# data defined by the `indices` attribute. -# -# """ -# array = self._get_array() -# if dtype: -# return array.astype(dtype[0], copy=False) -# -# return array -# -# def __getitem__(self, index): -# """Returns a subspace of the data as a new `{{class}}`. -# -# x.__getitem__(indices) <==> x[indices] -# -# Subspaces created by indexing are lazy and are not applied -# until the `{{class}}` object is converted to a `numpy` array, -# by which time all lazily-defined subspaces will have been -# converted to a single combined index which defines only the -# actual elements that need to be retrieved from the original -# data. -# -# The combined index is orthogonal, meaning that the index for -# each dimension is to be applied independently, regardless of -# how that index was defined. For instance, the indices ``[[0, -# 1], [1, 3], 0]`` and ``[:2, 1::2, 0]`` will give identical -# results. 
-# -# For example, if the original data has shape ``(12, 145, 192)`` -# and consecutive subspaces of ``[::2, [1, 3, 4], 96:]`` and -# ``[[0, 5], [True, False, True], 0]`` are applied, then only -# the elements defined by the combined index``[[0, 10], [1, 4], -# 96]`` will be retrieved from the data when `__array__` is -# called. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `index`, `original_shape`, `__array__`, -# `__getitem__` -# -# :Returns: -# -# `{{class}}` -# The subspaced data. -# -# """ -# shape0 = self.shape -# index0 = self.index(conform=False) -# original_shape = self.original_shape -# -# index1 = parse_indices(shape0, index, keepdims=False) -# -# new = self.copy() -# new_indices = [] -# new_shape = [] -# -# i = 0 -# for ind0, original_size in zip(index0, original_shape): -# if isinstance(ind0, Integral): -# # The previous call to __getitem__ resulted in a -# # dimension being removed (i.e. 'ind0' is -# # integer-valued). Therefore 'index1' must have fewer -# # elements than 'index0', so we need to "carry -# # forward" the integer-valued index so that it is -# # available at evaluation time. -# new_indices.append(ind0) -# continue -# -# ind1 = index1[i] -# size0 = shape0[i] -# i += 1 -# -# # If this dimension is not subspaced by the new index then -# # we don't need to update the old index. -# if isinstance(ind1, slice) and ind1 == slice(None): -# new_indices.append(ind0) -# continue -# -# # Still here? Then we have to work out the index of the -# # full array that is equivalent to applying -# # 'ind0' followed by 'ind1'. -# if is_dask_collection(ind1): -# # Note: This will never occur when this __getitem__ is -# # being called from within a Dask graph, because -# # any lazy indices will have already been -# # computed as part of the whole graph execution; -# # i.e. we don't have to worry about a -# # compute-within-a-compute situation. (If this -# # were not the case then we could add -# # `scheduler="synchronous"` to the compute -# # call.) -# ind1 = ind1.compute() -# -# if isinstance(ind0, slice): -# if isinstance(ind1, slice): -# # ind0: slice -# # ind1: slice -# start, stop, step = ind0.indices(original_size) -# start1, stop1, step1 = ind1.indices(size0) -# size1, mod1 = divmod(stop1 - start1, step1) -# -# if mod1 != 0: -# size1 += 1 -# -# start += start1 * step -# step *= step1 -# stop = start + (size1 - 1) * step -# -# if step > 0: -# stop += 1 -# else: -# stop -= 1 -# -# if stop < 0: -# stop = None -# -# new_index = slice(start, stop, step) -# else: -# # ind0: slice -# # ind1: int, or array of int/bool -# new_index = np.arange(*ind0.indices(original_size))[ind1] -# else: -# # ind0: array of int. If we made it to here then it -# # can't be anything else. This is -# # because we've dealt with ind0 -# # being a slice or an int, the -# # very first ind0 is always -# # slice(None), and a previous ind1 -# # that was an array of bool will -# # have resulted in this ind0 being -# # an array of int. -# # -# # ind1: anything -# new_index = np.asanyarray(ind0)[ind1] -# -# new_indices.append(new_index) -# -# new._custom["index"] = tuple(new_indices) -# -# # Find the shape defined by the new index -# new_shape = indices_shape(new_indices, original_shape, keepdims=False) -# new._set_component("shape", tuple(new_shape), copy=False) -# -# return new -# -# def __repr__(self): -# """Called by the `repr` built-in function. 
-# -# x.__repr__() <==> repr(x) -# -# """ -# return ( -# f"" -# ) -# -# @property -# def __asanyarray__(self): -# """Whether the array is accessed by conversion to a `numpy` array. -# -# .. versionadded:: NEXTVERSION -# -# :Returns: -# -# `True` -# -# """ -# return True -# -# def _get_array(self, index=None): -# """Returns a subspace of the data as a `numpy` array. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `__array__`, `index` -# -# :Parameters: -# -# index: `tuple` or `None`, optional -# Provide the indices that define the subspace. If -# `None` then the `index` attribute is used. -# -# :Returns: -# -# `numpy.ndarray` -# The subspace. -# -# """ -# return NotImplementedError( -# f"Must implement {self.__class__.__name__}._get_array" -# ) -# -# def index(self, conform=True): -# """The index to be applied when converting to a `numpy` array. -# -# The `shape` is defined by the `index` applied to the -# `original_shape`. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `shape`, `original_shape` -# -# :Parameters: -# -# conform: `bool`, optional -# If True, the default, then -# -# * Convert a decreasing size 1 slice to an increasing -# one. -# -# * Convert, where possible, a sequence of integers to a -# slice. -# -# These transformations are to allow subspacing on data -# objects that have restricted indexing functionality, -# such as `h5py.Variable` objects. -# -# If False then these transformations are not done. -# -# :Returns: -# -# `tuple` -# -# **Examples** -# -# >>> x.shape -# (12, 145, 192) -# >>> x.index() -# (slice(None), slice(None), slice(None)) -# >>> x = x[8:7:-1, 10:19:3, [15, 1, 4, 12]] -# >>> x = x[[0], [True, False, True], ::-2] -# >>> x.shape -# (1, 2, 2) -# >>> x.index() -# (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11)) -# >>> x.index(conform=False) -# (array([8]), array([10, 16]), array([12, 1])) -# -# """ -# ind = self._custom.get("index") -# if ind is None: -# # No indices have been applied yet, so define indices that -# # are equivalent to Ellipsis, and set the original shape. -# ind = (slice(None),) * self.ndim -# self._custom["index"] = ind -# self._custom["original_shape"] = self.shape -# return ind -# -# if not conform: -# return ind -# -# # Still here? Then conform the indices by: -# # -# # 1) Converting decreasing size 1 slices to increasing -# # ones. This helps when the parent class can't cope with -# # decreasing slices. -# # -# # 2) Converting, where possible, sequences of integers to -# # slices. This helps when the parent class can't cope with -# # indices that are sequences of integers. -# ind = list(ind) -# for n, (i, size) in enumerate(zip(ind[:], self.original_shape)): -# if isinstance(i, slice): -# if size == 1: -# start, _, step = i.indices(size) -# if step and step < 0: -# # Decreasing slices are not universally -# # accepted (e.g. `h5py` doesn't like them), -# # but we can convert them to increasing ones. -# ind[n] = slice(start, start + 1) -# elif np.iterable(i): -# i = normalize_index((i,), (size,))[0] -# if i.size == 1: -# # Convert a sequence of one integer into a slice -# start = i.item() -# ind[n] = slice(start, start + 1) -# else: -# # Convert a sequence of two or more evenly spaced -# # integers into a slice. 
-# step = np.unique(np.diff(i)) -# if step.size == 1: -# start, stop = i[[0, -1]] -# if stop >= start: -# stop += 1 -# elif stop: -# stop = -1 -# else: -# stop = None -# -# ind[n] = slice(start, stop, step.item()) -# -# return tuple(ind) -# -# @property -# def original_shape(self): -# """The original shape of the data, before any subspacing. -# -# The `shape` is defined by the result of subspacing the data in -# its original shape with the indices given by `index`. -# -# .. versionadded:: NEXTVERSION -# -# .. seealso:: `index`, `shape` -# -# """ -# out = self._custom.get("original_shape") -# if out is None: -# # No subspace has been defined yet -# out = self.shape -# self._custom["original_shape"] = out -# -# return out diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index 51b0cd1d0a..9e4c75080c 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -10,7 +10,7 @@ from operator import mul import numpy as np -from cfdm.data.dask_utils import cfdm_asanyarray +from cfdm.data.dask_utils import cfdm_to_memory from dask.array import chunk from dask.array.core import _concatenate2 from dask.array.reductions import divide, numel @@ -276,9 +276,9 @@ def cf_mean_chunk( if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) if weights is not None: - weights = cfdm_asanyarray(weights) + weights = cfdm_to_memory(weights) # N, sum d = cf_sum_chunk(x, weights=weights, dtype=dtype, **kwargs) @@ -401,7 +401,7 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) return { "max": chunk.max(x, **kwargs), @@ -555,7 +555,7 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) return { "min": chunk.min(x, **kwargs), @@ -662,7 +662,7 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) # N, max d = cf_max_chunk(x, **kwargs) @@ -779,7 +779,7 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) return cf_mean_chunk( np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs @@ -857,7 +857,7 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) if np.ma.isMA(x): N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) @@ -985,10 +985,10 @@ def cf_sum_chunk( if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) if weights is not None: - weights = cfdm_asanyarray(weights) + weights = cfdm_to_memory(weights) if check_weights: w_min = weights.min() if w_min <= 0: @@ -1107,9 +1107,9 @@ def cf_sum_of_weights_chunk( if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) if weights is not None: - weights = cfdm_asanyarray(weights) + weights = cfdm_to_memory(weights) # N d = cf_sample_size_chunk(x, **kwargs) @@ -1152,9 +1152,9 @@ def cf_sum_of_weights2_chunk( if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) if weights is not None: - weights = cfdm_asanyarray(weights) + weights = cfdm_to_memory(weights) # N d = cf_sample_size_chunk(x, **kwargs) @@ -1193,7 +1193,7 @@ def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x - x = 
cfdm_asanyarray(x) + x = cfdm_to_memory(x) return {"unique": np.unique(x)} @@ -1298,11 +1298,11 @@ def cf_var_chunk( if computing_meta: return x - x = cfdm_asanyarray(x) + x = cfdm_to_memory(x) weighted = weights is not None if weighted: - weights = cfdm_asanyarray(weights) + weights = cfdm_to_memory(weights) # N, V1, sum d = cf_mean_chunk(x, weights=weights, dtype=dtype, **kwargs) diff --git a/cf/data/dask_utils.py b/cf/data/dask_utils.py index 04a0c09921..1fd3be11af 100644 --- a/cf/data/dask_utils.py +++ b/cf/data/dask_utils.py @@ -8,7 +8,7 @@ from functools import partial import numpy as np -from cfdm.data.dask_utils import cfdm_asanyarray +from cfdm.data.dask_utils import cfdm_to_memory from scipy.ndimage import convolve1d from ..cfdatetime import dt, dt2rt, rt2dt @@ -38,8 +38,8 @@ def cf_contains(a, value): value. """ - a = cfdm_asanyarray(a) - value = cfdm_asanyarray(value) + a = cfdm_to_memory(a) + value = cfdm_to_memory(value) return np.array(value in a).reshape((1,) * a.ndim) @@ -73,7 +73,7 @@ def cf_convolve1d(a, window=None, axis=-1, origin=0): Convolved float array with same shape as input. """ - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) # Cast to float to ensure that NaNs can be stored if a.dtype != float: @@ -155,7 +155,7 @@ def cf_percentile(a, q, axis, method, keepdims=False, mtol=1): """ from math import prod - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) if np.ma.isMA(a) and not np.ma.is_masked(a): # Masked array with no masked elements @@ -274,7 +274,7 @@ def cf_YMDhms(a, attr): array([1, 2]) """ - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) return _array_getattr(a, attr=attr) @@ -307,7 +307,7 @@ def cf_rt2dt(a, units): cftime.DatetimeGregorian(2000, 1, 2, 0, 0, 0, 0, has_year_zero=False)] """ - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) if not units.iscalendartime: return rt2dt(a, units_in=units) @@ -363,7 +363,7 @@ def cf_dt2rt(a, units): [365 366] """ - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) return dt2rt(a, units_out=units, units_in=None) @@ -404,7 +404,7 @@ def cf_units(a, from_units, to_units): [1000. 2000.] """ - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) return Units.conform( a, from_units=from_units, to_units=to_units, inplace=False ) @@ -428,7 +428,7 @@ def cf_is_masked(a): values. """ - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) out = np.ma.is_masked(a) return np.array(out).reshape((1,) * a.ndim) @@ -461,5 +461,5 @@ def cf_filled(a, fill_value=None): [[-999 2 3]] """ - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) return np.ma.filled(a, fill_value=fill_value) diff --git a/cf/data/data.py b/cf/data/data.py index c06941d1c0..61b3f952cf 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1,4 +1,4 @@ -import logging +Importre logging import math from functools import partial, reduce from itertools import product @@ -455,13 +455,13 @@ def __contains__(self, value): # are incompatible return False - # 'cf_contains' has its own calls to 'cfdm_asanyarray', so - # we can set '_asanyarray=False'. - value = value.to_dask_array(_asanyarray=False) + # 'cf_contains' has its own calls to 'cfdm_to_memory', so + # we can set '_force_in_memory=False'. + value = value.to_dask_array(_force_in_memory=False) - # 'cf_contains' has its own calls to 'cfdm_asanyarray', so we - # can set '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) + # 'cf_contains' has its own calls to 'cfdm_to_memory', so we + # can set '_force_in_memory=False'. 
+ dx = self.to_dask_array(_force_in_memory=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -1566,9 +1566,9 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) - # 'cf_percentile' has its own call to 'cfdm_asanyarray', so we - # can set '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) + # 'cf_percentile' has its own call to 'cfdm_to_memory', so we + # can set '_force_in_memory=False'. + dx = d.to_dask_array(_force_in_memory=False) dtype = dx.dtype shape = dx.shape @@ -2128,9 +2128,9 @@ def _asdatetime(self, inplace=False): ) if not d._isdatetime(): - # 'cf_rt2dt' has its own call to 'cfdm_asanyarray', so we - # can set '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) + # 'cf_rt2dt' has its own call to 'cfdm_to_memory', so we + # can set '_force_in_memory=False'. + dx = d.to_dask_array(_force_in_memory=False) dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) d._set_dask(dx) @@ -2185,9 +2185,9 @@ def _asreftime(self, inplace=False): ) if d._isdatetime(): - # 'cf_dt2rt' has its own call to 'cfdm_asanyarray', so we - # can set '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) + # 'cf_dt2rt' has its own call to 'cfdm_to_memory', so we + # can set '_force_in_memory=False'. + dx = d.to_dask_array(_force_in_memory=False) dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) @@ -2783,9 +2783,9 @@ def _regrid( f"the shape of the regrid operator: {operator.src_shape}" ) - # 'regrid' has its own calls to 'cfdm_asanyarray', so we can set - # '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) + # 'regrid' has its own calls to 'cfdm_to_memory', so we can set + # '_force_in_memory=False'. + dx = self.to_dask_array(_force_in_memory=False) # Rechunk so that each chunk contains data in the form # expected by the regrid operator, i.e. the regrid axes all @@ -3028,10 +3028,10 @@ def concatenate( copied = not copy # to avoid making two copies in a given case # Get data as dask arrays and apply concatenation - # operation. We can set '_asanyarray=False' because at compute + # operation. We can set '_force_in_memory=False' because at compute # time the concatenation operation does not need to access the # actual data. - dxs = [d.to_dask_array(_asanyarray=False) for d in processed_data] + dxs = [d.to_dask_array(_force_in_memory=False) for d in processed_data] dx = da.concatenate(dxs, axis=axis) # Set the CFA write status @@ -3059,18 +3059,18 @@ def concatenate( cfa = cls._NONE break - # Define the __asanyarray__ status - asanyarray = processed_data[0].__asanyarray__ + # Define the __in_memory__ status + in_memory = processed_data[0].__in_memory__ for d in processed_data[1:]: - if d.__asanyarray__ != asanyarray: + if d.__in_memory__ != in_memory: # If and only if any two input Data objects have - # different __asanyarray__ values, then set - # asanyarray=True on the concatenation. - asanyarray = True + # different __in_memory__ values, then set + # in_memory=False on the concatenation. + in_memory= False break # Set the new dask array - data0._set_dask(dx, clear=cls._ALL ^ cfa, asanyarray=asanyarray) + data0._set_dask(dx, clear=cls._ALL ^ cfa, in_memory=in_memory) # Set appropriate cached elements cached_elements = {} @@ -3463,9 +3463,9 @@ def Units(self, value): cf_func = partial(cf_units, from_units=old_units, to_units=value) - # 'cf_units' has its own call to 'cfdm_asanyarray', so we - # can set '_asanyarray=False'. 
- dx = self.to_dask_array(_asanyarray=False) + # 'cf_units' has its own call to 'cfdm_to_memory', so we + # can set '_force_in_memory=False'. + dx = self.to_dask_array(_force_in_memory=False) dx = dx.map_blocks(cf_func, dtype=dtype) # Setting equivalent units doesn't affect the CFA write @@ -3507,9 +3507,9 @@ def is_masked(self): True """ - # 'cf_is_masked' has its own call to 'cfdm_asanyarray', so we - # can set '_asanyarray=False'. - dx = self.to_dask_array(_asanyarray=False) + # 'cf_is_masked' has its own call to 'cfdm_to_memory', so we + # can set '_force_in_memory=False'. + dx = self.to_dask_array(_force_in_memory=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -4214,9 +4214,9 @@ def convert_reference_time( ) d.Units = units0 - # 'cf_rt2dt' its own call to 'cfdm_asanyarray', so we can set - # '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) + # 'cf_rt2dt' its own call to 'cfdm_to_memory', so we can set + # '_force_in_memory=False'. + dx = d.to_dask_array(_force_in_memory=False) # Convert to the correct date-time objects dx = dx.map_blocks(cf_rt2dt, units=units0, dtype=object) @@ -4283,10 +4283,10 @@ def get_deterministic_name(self): units = self._Units # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. + # '_force_in_memory=False'. return tokenize( self.to_dask_array( - _apply_mask_hardness=False, _asanyarray=False + _apply_mask_hardness=False, _force_in_memory=False ).name, units.formatted(definition=True, names=True), units._canonical_calendar, @@ -4324,8 +4324,8 @@ def add_file_location(self, location): updated = False # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dsk = self.todict(_asanyarray=False) + # '_force_in_memory=False'. + dsk = self.todict(_force_in_memory=False) for key, a in dsk.items(): try: dsk[key] = a.add_file_location(location) @@ -4338,9 +4338,9 @@ def add_file_location(self, location): updated = True if updated: - dx = self.to_dask_array(_asanyarray=False) + dx = self.to_dask_array(_force_in_memory=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=self._NONE, asanyarray=None) + self._set_dask(dx, clear=self._NONE, in_memory=None) return location @@ -5725,8 +5725,8 @@ def unique(self, split_every=None): d.soften_mask() # The applicable chunk function will have its own call to - # 'cfdm_asanyarray', so we can set '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) + # 'cfdm_to_memory', so we can set '_force_in_memory=False'. + dx = d.to_dask_array(_force_in_memory=False) dx = Collapse().unique(dx, split_every=split_every) d._set_dask(dx) @@ -6240,8 +6240,8 @@ def file_locations(self): out = set() # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - for key, a in self.todict(_asanyarray=False).items(): + # '_force_in_memory=False'. + for key, a in self.todict(_force_in_memory=False).items(): try: out.update(a.file_locations()) except AttributeError: @@ -6801,8 +6801,8 @@ def del_file_location(self, location): updated = False # The dask graph is never going to be computed, so we can set - # '_asanyarray=False'. - dsk = self.todict(_asanyarray=False) + # '_force_in_memory=False'. 
+ dsk = self.todict(_force_in_memory=False) for key, a in dsk.items(): try: dsk[key] = a.del_file_location(location) @@ -6815,9 +6815,9 @@ def del_file_location(self, location): updated = True if updated: - dx = self.to_dask_array(_asanyarray=False) + dx = self.to_dask_array(_force_in_memory=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=self._NONE, asanyarray=None) + self._set_dask(dx, clear=self._NONE, in_memory=None) return location @@ -7851,9 +7851,9 @@ def where( # Missing values could be affected, so make sure that the mask # hardness has been applied. # - # 'cf_where' has its own calls to 'cfdm_asanyarray', so we can - # set '_asanyarray=False'. - dx = d.to_dask_array(_asanyarray=False) + # 'cf_where' has its own calls to 'cfdm_to_memory', so we can + # set '_force_in_memory=False'. + dx = d.to_dask_array(_force_in_memory=False) units = d.Units @@ -7868,9 +7868,9 @@ def where( condition = type(self).asdata(condition) condition = where_broadcastable(d, condition, "condition") - # 'cf_where' has its own calls to 'cfdm_asanyarray', so we can - # set '_asanyarray=False'. - condition = condition.to_dask_array(_asanyarray=False) + # 'cf_where' has its own calls to 'cfdm_to_memory', so we can + # set '_force_in_memory=False'. + condition = condition.to_dask_array(_force_in_memory=False) # If x or y is self then change it to None. This prevents an # unnecessary copy; and, at compute time, an unncessary numpy diff --git a/cf/data/utils.py b/cf/data/utils.py index 4436556387..1cd70fc388 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -417,10 +417,10 @@ def collapse( kwargs["ddof"] = ddof # The applicable chunk function will have its own call to - # 'cfdm_asanyarray', so we can set '_asanyarray=False'. Also, setting - # _asanyarray=False will ensure that any active storage operations - # are not compromised. - dx = d.to_dask_array(_asanyarray=False) + # 'cfdm_to_memory', so we can set '_force_in_memory=False'. Also, + # setting _force_in_memory=False will ensure that any active + # storage operations are not compromised. + dx = d.to_dask_array(_force_in_memory=False) dx = func(dx, **kwargs) d._set_dask(dx) diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index 068c55b968..d4fd04a3ee 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -3,7 +3,7 @@ import cfdm import dask.array as da import numpy as np -from cfdm.data.dask_utils import cfdm_asanyarray +from cfdm.data.dask_utils import cfdm_to_memory from .netcdfread import NetCDFRead @@ -747,7 +747,7 @@ def _cfa_write_non_standard_terms( # more than one unique value then the fragment's value is # missing data. # - # '_cfa_unique' has its own call to 'cfdm_asanyarray', so + # '_cfa_unique' has its own call to 'cfdm_to_memory', so # we can set '_asanyarray=False'. dx = data.to_dask_array(_asanyarray=False) dx_ind = tuple(range(dx.ndim)) @@ -807,7 +807,7 @@ def _cfa_unique(cls, a): data if there is not a unique value. 
""" - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) out_shape = (1,) * a.ndim a = np.unique(a) From a44f7ba6b31ac669fed9befd9e3707e61b0a1d38 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Mon, 2 Dec 2024 14:01:25 +0000 Subject: [PATCH 11/27] dev --- cf/data/dask_regrid.py | 8 +-- cf/data/data.py | 76 ++++++++++++++--------------- cf/data/utils.py | 6 +-- cf/read_write/netcdf/netcdfwrite.py | 6 +-- cf/test/individual_tests.sh | 23 +++++++-- cf/test/test_Data.py | 6 +-- cf/test/test_DimensionCoordinate.py | 2 +- cf/test/test_Field.py | 4 +- 8 files changed, 72 insertions(+), 59 deletions(-) diff --git a/cf/data/dask_regrid.py b/cf/data/dask_regrid.py index 9c64b42cb7..ab282b44f8 100644 --- a/cf/data/dask_regrid.py +++ b/cf/data/dask_regrid.py @@ -1,7 +1,7 @@ """Regridding functions used within a dask graph.""" import numpy as np -from cfdm.data.dask_utils import cfdm_asanyarray +from cfdm.data.dask_utils import cfdm_to_memory def regrid( @@ -175,12 +175,12 @@ def regrid( """ weights, dst_mask = weights_dst_mask - a = cfdm_asanyarray(a) + a = cfdm_to_memory(a) if dst_mask is not None: - dst_mask = cfdm_asanyarray(dst_mask) + dst_mask = cfdm_to_memory(dst_mask) if ref_src_mask is not None: - ref_src_mask = cfdm_asanyarray(ref_src_mask) + ref_src_mask = cfdm_to_memory(ref_src_mask) # ---------------------------------------------------------------- # Reshape the array into a form suitable for the regridding dot diff --git a/cf/data/data.py b/cf/data/data.py index 61b3f952cf..6334edfb95 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -1,4 +1,4 @@ -Importre logging +import logging import math from functools import partial, reduce from itertools import product @@ -456,12 +456,12 @@ def __contains__(self, value): return False # 'cf_contains' has its own calls to 'cfdm_to_memory', so - # we can set '_force_in_memory=False'. - value = value.to_dask_array(_force_in_memory=False) + # we can set '_force_to_memory=False'. + value = value.to_dask_array(_force_to_memory=False) # 'cf_contains' has its own calls to 'cfdm_to_memory', so we - # can set '_force_in_memory=False'. - dx = self.to_dask_array(_force_in_memory=False) + # can set '_force_to_memory=False'. + dx = self.to_dask_array(_force_to_memory=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -1567,8 +1567,8 @@ def percentile( axes = tuple(sorted(d._parse_axes(axes))) # 'cf_percentile' has its own call to 'cfdm_to_memory', so we - # can set '_force_in_memory=False'. - dx = d.to_dask_array(_force_in_memory=False) + # can set '_force_to_memory=False'. + dx = d.to_dask_array(_force_to_memory=False) dtype = dx.dtype shape = dx.shape @@ -2129,8 +2129,8 @@ def _asdatetime(self, inplace=False): if not d._isdatetime(): # 'cf_rt2dt' has its own call to 'cfdm_to_memory', so we - # can set '_force_in_memory=False'. - dx = d.to_dask_array(_force_in_memory=False) + # can set '_force_to_memory=False'. + dx = d.to_dask_array(_force_to_memory=False) dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) d._set_dask(dx) @@ -2186,8 +2186,8 @@ def _asreftime(self, inplace=False): if d._isdatetime(): # 'cf_dt2rt' has its own call to 'cfdm_to_memory', so we - # can set '_force_in_memory=False'. - dx = d.to_dask_array(_force_in_memory=False) + # can set '_force_to_memory=False'. + dx = d.to_dask_array(_force_to_memory=False) dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) @@ -2784,8 +2784,8 @@ def _regrid( ) # 'regrid' has its own calls to 'cfdm_to_memory', so we can set - # '_force_in_memory=False'. 
- dx = self.to_dask_array(_force_in_memory=False) + # '_force_to_memory=False'. + dx = self.to_dask_array(_force_to_memory=False) # Rechunk so that each chunk contains data in the form # expected by the regrid operator, i.e. the regrid axes all @@ -3028,10 +3028,10 @@ def concatenate( copied = not copy # to avoid making two copies in a given case # Get data as dask arrays and apply concatenation - # operation. We can set '_force_in_memory=False' because at compute + # operation. We can set '_force_to_memory=False' because at compute # time the concatenation operation does not need to access the # actual data. - dxs = [d.to_dask_array(_force_in_memory=False) for d in processed_data] + dxs = [d.to_dask_array(_force_to_memory=False) for d in processed_data] dx = da.concatenate(dxs, axis=axis) # Set the CFA write status @@ -3066,7 +3066,7 @@ def concatenate( # If and only if any two input Data objects have # different __in_memory__ values, then set # in_memory=False on the concatenation. - in_memory= False + in_memory = False break # Set the new dask array @@ -3464,8 +3464,8 @@ def Units(self, value): cf_func = partial(cf_units, from_units=old_units, to_units=value) # 'cf_units' has its own call to 'cfdm_to_memory', so we - # can set '_force_in_memory=False'. - dx = self.to_dask_array(_force_in_memory=False) + # can set '_force_to_memory=False'. + dx = self.to_dask_array(_force_to_memory=False) dx = dx.map_blocks(cf_func, dtype=dtype) # Setting equivalent units doesn't affect the CFA write @@ -3508,8 +3508,8 @@ def is_masked(self): """ # 'cf_is_masked' has its own call to 'cfdm_to_memory', so we - # can set '_force_in_memory=False'. - dx = self.to_dask_array(_force_in_memory=False) + # can set '_force_to_memory=False'. + dx = self.to_dask_array(_force_to_memory=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -4215,8 +4215,8 @@ def convert_reference_time( d.Units = units0 # 'cf_rt2dt' its own call to 'cfdm_to_memory', so we can set - # '_force_in_memory=False'. - dx = d.to_dask_array(_force_in_memory=False) + # '_force_to_memory=False'. + dx = d.to_dask_array(_force_to_memory=False) # Convert to the correct date-time objects dx = dx.map_blocks(cf_rt2dt, units=units0, dtype=object) @@ -4283,10 +4283,10 @@ def get_deterministic_name(self): units = self._Units # The dask graph is never going to be computed, so we can set - # '_force_in_memory=False'. + # '_force_to_memory=False'. return tokenize( self.to_dask_array( - _apply_mask_hardness=False, _force_in_memory=False + _force_mask_hardness=False, _force_to_memory=False ).name, units.formatted(definition=True, names=True), units._canonical_calendar, @@ -4324,8 +4324,8 @@ def add_file_location(self, location): updated = False # The dask graph is never going to be computed, so we can set - # '_force_in_memory=False'. - dsk = self.todict(_force_in_memory=False) + # '_force_to_memory=False'. + dsk = self.todict(_force_to_memory=False) for key, a in dsk.items(): try: dsk[key] = a.add_file_location(location) @@ -4338,7 +4338,7 @@ def add_file_location(self, location): updated = True if updated: - dx = self.to_dask_array(_force_in_memory=False) + dx = self.to_dask_array(_force_to_memory=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) self._set_dask(dx, clear=self._NONE, in_memory=None) @@ -5725,8 +5725,8 @@ def unique(self, split_every=None): d.soften_mask() # The applicable chunk function will have its own call to - # 'cfdm_to_memory', so we can set '_force_in_memory=False'. 
- dx = d.to_dask_array(_force_in_memory=False) + # 'cfdm_to_memory', so we can set '_force_to_memory=False'. + dx = d.to_dask_array(_force_to_memory=False) dx = Collapse().unique(dx, split_every=split_every) d._set_dask(dx) @@ -6240,8 +6240,8 @@ def file_locations(self): out = set() # The dask graph is never going to be computed, so we can set - # '_force_in_memory=False'. - for key, a in self.todict(_force_in_memory=False).items(): + # '_force_to_memory=False'. + for key, a in self.todict(_force_to_memory=False).items(): try: out.update(a.file_locations()) except AttributeError: @@ -6801,8 +6801,8 @@ def del_file_location(self, location): updated = False # The dask graph is never going to be computed, so we can set - # '_force_in_memory=False'. - dsk = self.todict(_force_in_memory=False) + # '_force_to_memory=False'. + dsk = self.todict(_force_to_memory=False) for key, a in dsk.items(): try: dsk[key] = a.del_file_location(location) @@ -6815,7 +6815,7 @@ def del_file_location(self, location): updated = True if updated: - dx = self.to_dask_array(_force_in_memory=False) + dx = self.to_dask_array(_force_to_memory=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) self._set_dask(dx, clear=self._NONE, in_memory=None) @@ -7852,8 +7852,8 @@ def where( # hardness has been applied. # # 'cf_where' has its own calls to 'cfdm_to_memory', so we can - # set '_force_in_memory=False'. - dx = d.to_dask_array(_force_in_memory=False) + # set '_force_to_memory=False'. + dx = d.to_dask_array(_force_to_memory=False) units = d.Units @@ -7869,8 +7869,8 @@ def where( condition = type(self).asdata(condition) condition = where_broadcastable(d, condition, "condition") # 'cf_where' has its own calls to 'cfdm_to_memory', so we can - # set '_force_in_memory=False'. - condition = condition.to_dask_array(_force_in_memory=False) + # set '_force_to_memory=False'. + condition = condition.to_dask_array(_force_to_memory=False) # If x or y is self then change it to None. This prevents an # unnecessary copy; and, at compute time, an unncessary numpy diff --git a/cf/data/utils.py b/cf/data/utils.py index 1cd70fc388..c1b1a63920 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -417,10 +417,10 @@ def collapse( kwargs["ddof"] = ddof # The applicable chunk function will have its own call to - # 'cfdm_to_memory', so we can set '_force_in_memory=False'. Also, - # setting _force_in_memory=False will ensure that any active + # 'cfdm_to_memory', so we can set '_force_to_memory=False'. Also, + # setting _force_to_memory=False will ensure that any active # storage operations are not compromised. - dx = d.to_dask_array(_force_in_memory=False) + dx = d.to_dask_array(_force_to_memory=False) dx = func(dx, **kwargs) d._set_dask(dx) diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index d4fd04a3ee..c8bc9e254e 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -748,8 +748,8 @@ def _cfa_write_non_standard_terms( # missing data. # # '_cfa_unique' has its own call to 'cfdm_to_memory', so - # we can set '_asanyarray=False'. - dx = data.to_dask_array(_asanyarray=False) + # we can set '_force_to_memory=False'. 
+ dx = data.to_dask_array(_force_to_memory=False) dx_ind = tuple(range(dx.ndim)) out_ind = dx_ind dx = da.blockwise( @@ -963,7 +963,7 @@ def _cfa_aggregation_instructions(self, data, cfvar): # ------------------------------------------------------------ dtype = np.dtype(np.int32) if ( - max(data.to_dask_array(_asanyarray=False).chunksize) + max(data.to_dask_array(_force_to_memory=False).chunksize) > np.iinfo(dtype).max ): dtype = np.dtype(np.int64) diff --git a/cf/test/individual_tests.sh b/cf/test/individual_tests.sh index 425c7dd435..f67383e173 100755 --- a/cf/test/individual_tests.sh +++ b/cf/test/individual_tests.sh @@ -5,9 +5,6 @@ do echo "Running $file" python $file rc=$? -# if [[ $rc != 0 ]]; then -# exit $rc -# fi done file=setup_create_field.py @@ -18,14 +15,30 @@ if [[ $rc != 0 ]]; then exit $rc fi +style="lots" + for file in test_*.py do echo "Running $file" python $file rc=$? if [[ $rc != 0 ]]; then - exit $rc - # echo -e "\n\n$file FAILED \n\n" + if [[ "$file" == "test_style.py" ]] ; then + style="none" + else + exit $rc + # echo -e "\n\n$file FAILED \n\n" + fi fi done +echo +if [[ "$style" == "none" ]] ; then + echo "------------------------------------------" + echo "All tests passed, APART FROM test_style.py" + echo "------------------------------------------" +else + echo "================" + echo "All tests passed" + echo "================" +fi diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index 3b51206af5..dcb28cc85f 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -4540,7 +4540,7 @@ def test_Data_cull_graph(self): len( dict( d.to_dask_array( - _apply_mask_hardness=False, _asanyarray=False + _force_mask_hardness=False, _force_to_memory=False ).dask ) ), @@ -4553,7 +4553,7 @@ def test_Data_cull_graph(self): len( dict( d.to_dask_array( - _apply_mask_hardness=False, _asanyarray=False + _force_mask_hardness=False, _force_to_memory=False ).dask ) ), @@ -4725,7 +4725,7 @@ def test_Data_file_location(self): def test_Data_todict(self): """Test Data.todict.""" d = cf.Data([1, 2, 3, 4], chunks=2) - key = d.to_dask_array(_apply_mask_hardness=False).name + key = d.to_dask_array(_force_mask_hardness=False).name x = d.todict() self.assertIsInstance(x, dict) diff --git a/cf/test/test_DimensionCoordinate.py b/cf/test/test_DimensionCoordinate.py index bb4c41ac90..b4d35ee203 100644 --- a/cf/test/test_DimensionCoordinate.py +++ b/cf/test/test_DimensionCoordinate.py @@ -614,7 +614,7 @@ def test_DimensiconCoordinate_persist(self): self.assertEqual( len( e.data.to_dask_array( - _apply_mask_hardness=False, _asanyarray=False + _force_mask_hardness=False, _force_to_memory=False ).dask.layers ), 1, diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index b514fdbed3..6edaa81921 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -2767,7 +2767,7 @@ def test_Field_persist(self): self.assertGreater( len( f.data.to_dask_array( - _apply_mask_hardness=False, _asanyarray=False + _force_mask_hardness=False, _force_to_memory=False ).dask.layers ), 2, @@ -2778,7 +2778,7 @@ def test_Field_persist(self): self.assertEqual( len( g.data.to_dask_array( - _apply_mask_hardness=False, _asanyarray=False + _force_mask_hardness=False, _force_to_memory=False ).dask.layers ), 1, From 4ac56c0baee4e9e7c509316b6e5f0c36f2aca628 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Tue, 3 Dec 2024 15:59:45 +0000 Subject: [PATCH 12/27] dev --- cf/data/data.py | 359 ++---------------------------------------------- 1 file changed, 12 insertions(+), 347 deletions(-) diff --git 
a/cf/data/data.py b/cf/data/data.py index 6334edfb95..d0697cf1e6 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -152,218 +152,6 @@ def __new__(cls, *args, **kwargs): instance._Units_class = Units return instance - def __init__( - self, - array=None, - units=None, - calendar=None, - fill_value=None, - hardmask=True, - chunks="auto", - dt=False, - source=None, - copy=True, - dtype=None, - mask=None, - mask_value=None, - to_memory=False, - init_options=None, - _use_array=True, - ): - """**Initialisation** - - :Parameters: - - array: optional - The array of values. May be a scalar or array-like - object, including another `{{class}}` instance, anything - with a `!to_dask_array` method, `numpy` array, `dask` - array, `xarray` array, `cf.Array` subclass, `list`, - `tuple`, scalar. - - *Parameter example:* - ``array=34.6`` - - *Parameter example:* - ``array=[[1, 2], [3, 4]]`` - - *Parameter example:* - ``array=numpy.ma.arange(10).reshape(2, 1, 5)`` - - units: `str` or `Units`, optional - The physical units of the data. if a `Units` object is - provided then this an also set the calendar. - - The units (without the calendar) may also be set after - initialisation with the `set_units` method. - - *Parameter example:* - ``units='km hr-1'`` - - *Parameter example:* - ``units='days since 2018-12-01'`` - - calendar: `str`, optional - The calendar for reference time units. - - The calendar may also be set after initialisation with the - `set_calendar` method. - - *Parameter example:* - ``calendar='360_day'`` - - fill_value: optional - The fill value of the data. By default, or if set to - `None`, the `numpy` fill value appropriate to the array's - data-type will be used (see - `numpy.ma.default_fill_value`). - - The fill value may also be set after initialisation with - the `set_fill_value` method. - - *Parameter example:* - ``fill_value=-999.`` - - dtype: data-type, optional - The desired data-type for the data. By default the - data-type will be inferred form the *array* - parameter. - - The data-type may also be set after initialisation with - the `dtype` attribute. - - *Parameter example:* - ``dtype=float`` - - *Parameter example:* - ``dtype='float32'`` - - *Parameter example:* - ``dtype=numpy.dtype('i2')`` - - .. versionadded:: 3.0.4 - - mask: optional - Apply this mask to the data given by the *array* - parameter. By default, or if *mask* is `None`, no mask - is applied. May be any scalar or array-like object - (such as a `list`, `numpy` array or `{{class}}` instance) - that is broadcastable to the shape of *array*. Masking - will be carried out where the mask elements evaluate - to `True`. - - This mask will applied in addition to any mask already - defined by the *array* parameter. - - mask_value: scalar array_like - Mask *array* where it is equal to *mask_value*, using - numerically tolerant floating point equality. - - .. versionadded:: (cfdm) 1.11.0.0 - - hardmask: `bool`, optional - If True (the default) then the mask is hard. If False - then the mask is soft. - - dt: `bool`, optional - If True then strings (such as ``'1990-12-01 12:00'``) - given by the *array* parameter are re-interpreted as - date-time objects. By default they are not. - - {{init source: optional}} - - {{init copy: `bool`, optional}} - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - .. versionadded:: (cfdm) NEXTVERSION - - to_memory: `bool`, optional - If True then ensure that the original data are in - memory, rather than on disk. 
- - If the original data are on disk, then reading data - into memory during initialisation will slow down the - initialisation process, but can considerably improve - downstream performance by avoiding the need for - independent reads for every dask chunk, each time the - data are computed. - - In general, setting *to_memory* to True is not the same - as calling the `persist` of the newly created `{{class}}` - object, which also decompresses data compressed by - convention and computes any data type, mask and - date-time modifications. - - If the input *array* is a `dask.array.Array` object - then *to_memory* is ignored. - - .. versionadded:: (cfdm) NEXTVERSION - - init_options: `dict`, optional - Provide optional keyword arguments to methods and - functions called during the initialisation process. A - dictionary key identifies a method or function. The - corresponding value is another dictionary whose - key/value pairs are the keyword parameter names and - values to be applied. - - Supported keys are: - - * ``'from_array'``: Provide keyword arguments to - the `dask.array.from_array` function. This is used - when initialising data that is not already a dask - array and is not compressed by convention. - - * ``'first_non_missing_value'``: Provide keyword - arguments to the - `cfdm.data.utils.first_non_missing_value` - function. This is used when the input array contains - date-time strings or objects, and may affect - performance. - - *Parameter example:* - ``{'from_array': {'inline_array': True}}`` - - **Examples** - - >>> d = {{package}}.{{class}}(5) - >>> d = {{package}}.{{class}}([1,2,3], units='K') - >>> import numpy - >>> d = {{package}}.{{class}}(numpy.arange(10).reshape(2,5), - ... units='m/s', fill_value=-999) - >>> d = {{package}}.{{class}}('fly') - >>> d = {{package}}.{{class}}(tuple('fly')) - - """ - super().__init__( - array=array, - units=units, - calendar=calendar, - fill_value=fill_value, - hardmask=hardmask, - chunks=chunks, - dt=dt, - source=source, - copy=copy, - dtype=dtype, - mask=mask, - mask_value=mask_value, - to_memory=to_memory, - init_options=init_options, - _use_array=_use_array, - ) - - if source is not None: - try: - deterministic = source.has_deterministic_name() - except AttributeError: - deterministic = False - else: - deterministic = not is_dask_collection(array) - - self._custom["has_deterministic_name"] = deterministic - def __contains__(self, value): """Membership test operator ``in`` @@ -777,42 +565,6 @@ def _cfa_set_write(self, status): """ self._custom["cfa_write"] = bool(status) - def _update_deterministic(self, other): - """Update the deterministic name status. - - .. versionadded:: 3.15.1 - - .. seealso:: `get_deterministic_name`, - `has_deterministic_name` - - :Parameters: - - other: `bool` or `Data` - If `False` then set the deterministic name status to - `False`. If `True` then do not change the - deterministic name status. If `Data` then set the - deterministic name status to `False` if and only if - *other* has a False deterministic name status. - - :Returns: - - `None` - - """ - if other is False: - self._custom["has_deterministic_name"] = False - return - - if other is True: - return - - custom = self._custom - deterministic = custom["has_deterministic_name"] - if deterministic: - custom["has_deterministic_name"] = ( - deterministic and other._custom["has_deterministic_name"] - ) - @_inplace_enabled(default=False) def diff(self, axis=-1, n=1, inplace=False): """Calculate the n-th discrete difference along the given axis. 
@@ -883,7 +635,7 @@ def diff(self, axis=-1, n=1, inplace=False): [1.0 1.5 0.5]] >>> print(d.diff(n=2).array) [[0.0 0.0] - [ -- --] + [ -- --]s [0.5 -1.0]] >>> print(d.diff(axis=0).array) [[4.0 3.5 -- 4.0] @@ -2211,16 +1963,16 @@ def _clear_after_dask_update(self, clear=None): Specify which components to remove, determined by sequentially combining an integer value of *clear* with the relevant class-level constants (such as - ``{{class}}._ARRAY``), using the bitwise AND (&) + ``Data._ARRAY``), using the bitwise AND (&) operator. If ``clear & `` is True then the corresponding component is cleared. The default value of `None` is equivalent to *clear* being - set to ``{{class}}._ALL``. + set to ``Data._ALL``. The bitwise OR (^) operator can be used to retain a component (or components) but remove all others. For - instance, if *clear* is ``{{class}}._ALL ^ - {{class}}._CACHE`` then all components except the + instance, if *clear* is ``Data._ALL ^ + Data._CACHE`` then all components except the cached array values will be removed. :Returns: @@ -2704,7 +2456,6 @@ def _binary_operation(cls, data, other, method): d = super()._binary_operation(data0, other, method) d.override_units(new_Units, inplace=True) - d._update_deterministic(other) if inplace: data.__dict__ = d.__dict__ @@ -3418,13 +3169,13 @@ def Units(self): **Examples** - >>> d = {{package}}.{{class}}([1, 2, 3], units='m') + >>> d = cf.Data([1, 2, 3], units='m') >>> d.Units - >>> d.Units = {{package}}.Units('kilometres') + >>> d.Units = cf.Units('kilometres') >>> d.Units - >>> d.Units = {{package}}.Units('km') + >>> d.Units = cf.Units('km') >>> d.Units @@ -4139,7 +3890,7 @@ def convert_reference_time( :Returns: - `{{class}}` or `None` + `Data` or `None` The data with converted reference time values, or `None` if the operation was in-place. @@ -4229,69 +3980,6 @@ def convert_reference_time( return d - def get_deterministic_name(self): - """Get the deterministic name for the data. - - If there is a deterministic name then the data array may be - assumed to be 'equal' to that of another `Data` object with - the same deterministic name. This measure of equality is - different to that applied by the `equals` method in that NaN - and inf values are, in effect, always considered equal. - - Note that the opposite is not always true. Two `Data` objects - that are considered equal by their `equals` methods might not - have the same deterministic name. - - An exception is raised if there is no deterministic name. - - .. versionadded:: 3.15.1 - - .. seealso:: `has_deterministic_name` - - :Returns: - - `str` - The deterministic name. - - **Examples** - - >>> d = cf.Data([1, 2, 3], 'm') - >>> d.has_deterministic_name() - True - >>> d.get_deterministic_name() - '6380dd3674fbf10d30561484b084e9b3' - >>> d1 = cf.Data([1, 2, 3], 'metre') - >>> d1.get_deterministic_name() - '6380dd3674fbf10d30561484b084e9b3' - >>> d1.get_deterministic_name() == d.get_deterministic_name() - True - >>> d1.equals(d) - True - - >>> e = d + 1 - 1 - >>> e.get_deterministic_name() - '0b83ada62d4b014bae83c3de1c1d3a80' - >>> e.get_deterministic_name() == d.get_deterministic_name() - False - >>> e.equals(d) - True - - """ - if not self.has_deterministic_name(): - raise ValueError() - - units = self._Units - - # The dask graph is never going to be computed, so we can set - # '_force_to_memory=False'. 
- return tokenize( - self.to_dask_array( - _force_mask_hardness=False, _force_to_memory=False - ).name, - units.formatted(definition=True, names=True), - units._canonical_calendar, - ) - def add_file_location(self, location): """Add a new file location in-place. @@ -4403,13 +4091,13 @@ def masked_where(self, condition, inplace=False): :Returns: - `{{class}}` or `None` + `Data` or `None` The result of masking the data, or `None` if the operation was in-place. **Examples** - >>> d = {{package}}.{{class}}([1, 2, 3, 4, 5]) + >>> d = cf.Data([1, 2, 3, 4, 5]) >>> e = d.masked_where([0, 1, 0, 1, 0]) >>> print(e.array) [1 -- 3 -- 5] @@ -6192,29 +5880,6 @@ def halo( return d - def has_deterministic_name(self): - """Whether there is a deterministic name for the data. - - See `get_deterministic_name` for details. - - .. versionadded:: 3.15.1 - - .. seealso:: `get_deterministic_name` - - :Returns: - - `bool` - Whether or not there is a deterministic name. - - **Examples** - - >>> d = cf.Data([1, 2, 3], 'm') - >>> d.has_deterministic_name() - True - - """ - return self._custom.get("has_deterministic_name", False) - def file_locations(self): """The locations of files containing parts of the data. @@ -7097,7 +6762,7 @@ def isclose(self, y, rtol=None, atol=None): d._set_dask(dx) d.hardmask = self._DEFAULT_HARDMASK d.override_units(_units_None, inplace=True) - d._update_deterministic(not is_dask_collection(y)) + d._update_deterministic(y) return d From 4a2bda3b68699ec39dae215d4d0f2ec817f6aa51 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 4 Dec 2024 09:09:16 +0000 Subject: [PATCH 13/27] dev --- cf/data/data.py | 232 ------------------------------------------------ 1 file changed, 232 deletions(-) diff --git a/cf/data/data.py b/cf/data/data.py index d0697cf1e6..19340dd699 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -267,10 +267,6 @@ def __contains__(self, value): return bool(dx.any()) - def __data__(self): - """Returns a new reference to self.""" - return self - def __getitem__(self, indices): """Return a subspace of the data defined by indices. @@ -4784,73 +4780,6 @@ def clip(self, a_min, a_max, units=None, inplace=False, i=False): d._set_dask(dx) return d - @classmethod - def asdata(cls, d, dtype=None, copy=False): - """Convert the input to a `Data` object. - - If the input *d* has the Data interface (i.e. it has a - `__data__` method), then the output of this method is used as - the returned `Data` object. Otherwise, `Data(d)` is returned. - - :Parameters: - - d: data-like - Input data in any form that can be converted to a - `Data` object. This includes `Data` and `Field` - objects, and objects with the Data interface, numpy - arrays and any object which may be converted to a - numpy array. - - dtype: data-type, optional - By default, the data-type is inferred from the input data. - - copy: `bool`, optional - If True and *d* has the Data interface, then a copy of - `d.__data__()` is returned. - - :Returns: - - `Data` - `Data` interpretation of *d*. No copy is performed on the - input if it is already a `Data` object with matching dtype - and *copy* is False. 
- - **Examples** - - >>> d = cf.Data([1, 2]) - >>> cf.Data.asdata(d) is d - True - >>> d.asdata(d) is d - True - - >>> cf.Data.asdata([1, 2]) - - - >>> cf.Data.asdata(numpy.array([1, 2])) - - - """ - data = getattr(d, "__data__", None) - if data is None: - # d does not have a Data interface - data = cls(d) - if dtype is not None: - data.dtype = dtype - - return data - - # d does have a Data interface - data = data() - if copy: - data = data.copy() - if dtype is not None and np.dtype(dtype) != data.dtype: - data.dtype = dtype - elif dtype is not None and np.dtype(dtype) != data.dtype: - data = data.copy() - data.dtype = dtype - - return data - @classmethod def arctan2(cls, x1, x2): """Element-wise arc tangent of ``x1/x2`` with correct quadrant. @@ -7978,167 +7907,6 @@ def trunc(self, inplace=False, i=False): d._set_dask(dx) return d - @classmethod - def full( - cls, - shape, - fill_value, - dtype=None, - units=None, - calendar=None, - chunks="auto", - ): - """Return a new array of given shape and type, filled with a - fill value. - - .. seealso:: `empty`, `ones`, `zeros` - - :Parameters: - - shape: `int` or `tuple` of `int` - The shape of the new array. e.g. ``(2, 3)`` or ``2``. - - fill_value: scalar - The fill value. - - dtype: data-type - The desired data-type for the array. The default, `None`, - means ``np.array(fill_value).dtype``. - - units: `str` or `Units` - The units for the new data array. - - calendar: `str`, optional - The calendar for reference time units. - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - .. versionadded:: 3.14.0 - - :Returns: - - `Data` - Array of *fill_value* with the given shape and data - type. - - **Examples** - - >>> d = cf.Data.full((2, 3), -99) - >>> print(d.array) - [[-99 -99 -99] - [-99 -99 -99]] - - >>> d = cf.Data.full(2, 0.0) - >>> print(d.array) - [0. 0.] - - >>> d = cf.Data.full((2,), 0, dtype=bool) - >>> print(d.array) - [False False] - - """ - if dtype is None: - # Need to explicitly set the default because dtype is not - # a named keyword of da.full - dtype = getattr(fill_value, "dtype", None) - if dtype is None: - dtype = np.array(fill_value).dtype - - dx = da.full(shape, fill_value, dtype=dtype, chunks=chunks) - return cls(dx, units=units, calendar=calendar) - - @classmethod - def ones(cls, shape, dtype=None, units=None, calendar=None, chunks="auto"): - """Returns a new array filled with ones of set shape and type. - - .. seealso:: `empty`, `full`, `zeros` - - :Parameters: - - shape: `int` or `tuple` of `int` - The shape of the new array. e.g. ``(2, 3)`` or ``2``. - - dtype: data-type - The desired data-type for the array, e.g. - `numpy.int8`. The default is `numpy.float64`. - - units: `str` or `Units` - The units for the new data array. - - calendar: `str`, optional - The calendar for reference time units. - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - .. versionadded:: 3.14.0 - - :Returns: - - `Data` - Array of ones with the given shape and data type. - - **Examples** - - >>> d = cf.Data.ones((2, 3)) - >>> print(d.array) - [[1. 1. 1.] - [1. 1. 1.]] - - >>> d = cf.Data.ones((2,), dtype=bool) - >>> print(d.array) - [ True True] - - """ - dx = da.ones(shape, dtype=dtype, chunks=chunks) - return cls(dx, units=units, calendar=calendar) - - @classmethod - def zeros( - cls, shape, dtype=None, units=None, calendar=None, chunks="auto" - ): - """Returns a new array filled with zeros of set shape and type. - - .. 
seealso:: `empty`, `full`, `ones` - - :Parameters: - - shape: `int` or `tuple` of `int` - The shape of the new array. - - dtype: data-type - The data-type of the new array. By default the - data-type is ``float``. - - units: `str` or `Units` - The units for the new data array. - - calendar: `str`, optional - The calendar for reference time units. - - {{chunks: `int`, `tuple`, `dict` or `str`, optional}} - - .. versionadded:: 3.14.0 - - :Returns: - - `Data` - Array of zeros with the given shape and data type. - - **Examples** - - >>> d = cf.Data.zeros((2, 3)) - >>> print(d.array) - [[0. 0. 0.] - [0. 0. 0.]] - - >>> d = cf.Data.zeros((2,), dtype=bool) - >>> print(d.array) - [False False] - - """ - dx = da.zeros(shape, dtype=dtype, chunks=chunks) - return cls(dx, units=units, calendar=calendar) - @_deprecated_kwarg_check("out", version="3.14.0", removed_at="5.0.0") @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) From cb6fbb37e28ab759cfd430d19c27e7218fd0355c Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 4 Dec 2024 12:03:54 +0000 Subject: [PATCH 14/27] Fix bug in is_discrete_axis call --- cf/field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/field.py b/cf/field.py index d9059cf235..4944202b66 100644 --- a/cf/field.py +++ b/cf/field.py @@ -1002,7 +1002,7 @@ def _binary_operation(self, other, method): for axis in f.domain_axes(todict=True): identity = None - if self.is_discrete_axis(axis): + if f.is_discrete_axis(axis): # This is a discrete axis whose identity is # inferred from all of its auxiliary coordinates x = {} From 940f010cad77dc0df0a7473b104bb295b6c15ad5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 4 Dec 2024 16:29:40 +0000 Subject: [PATCH 15/27] dev --- cf/data/array/mixin/__init__.py | 2 - cf/read_write/netcdf/netcdfread.py | 329 ----------------------------- docs/source/class/cf.Data.rst | 2 + docs/source/tutorial.rst | 2 +- 4 files changed, 3 insertions(+), 332 deletions(-) diff --git a/cf/data/array/mixin/__init__.py b/cf/data/array/mixin/__init__.py index 5bf63658df..af036620cf 100644 --- a/cf/data/array/mixin/__init__.py +++ b/cf/data/array/mixin/__init__.py @@ -3,5 +3,3 @@ from .cfamixin import CFAMixin from .compressedarraymixin import CompressedArrayMixin from .filearraymixin import FileArrayMixin - -# from .indexmixin import IndexMixin diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index 54d92305fd..4ed7e3462d 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -207,22 +207,6 @@ def _create_data( if data.npartitions == 1: data._cfa_set_write(True) - # if ( - # not compression_index - # and self.read_vars.get("cache") - # and self.implementation.get_construct_type(construct) - # != "field" - # ): - # # Only cache values from non-field data and - # # non-compression-index data, on the assumptions that: - # # - # # a) Field data is, in general, so large that finding - # # the cached values takes too long. - # # - # # b) Cached values are never really required for - # # compression index data. - # self._cache_data_elements(data, ncvar) - return data # ------------------------------------------------------------ @@ -312,99 +296,6 @@ def _is_cfa_variable(self, ncvar): and ncvar not in g["external_variables"] ) - # def _create_Data( - # self, - # array, - # ncvar, - # units=None, - # calendar=None, - # ncdimensions=(), - # **kwargs, - # ): - # """Create a Data object from a netCDF variable. - # - # .. 
versionadded:: 3.0.0 - # - # :Parameters: - # - # array: `Array` - # The file array. - # - # ncvar: `str` - # The netCDF variable containing the array. - # - # units: `str`, optional - # The units of *array*. By default, or if `None`, it is - # assumed that there are no units. - # - # calendar: `str`, optional - # The calendar of *array*. By default, or if `None`, it is - # assumed that there is no calendar. - # - # ncdimensions: sequence of `str`, optional - # The netCDF dimensions spanned by the array. - # - # .. versionadded:: 3.14.0 - # - # kwargs: optional - # Extra parameters to pass to the initialisation of the - # returned `Data` object. - # - # :Returns: - # - # `Data` - # - # """ - # if array.dtype is None: - # # The array is based on a netCDF VLEN variable, and - # # therefore has unknown data type. To find the correct - # # data type (e.g. "=1) netCDF string type variable comes out - # # as a numpy object array, so convert it to numpy - # # string array. - # array = array.astype("U", copy=False) - # # NetCDF4 doesn't auto-mask VLEN variables - # array = np.ma.where(array == "", np.ma.masked, array) - # - # # Parse dask chunks - # chunks = self._dask_chunks(array, ncvar, compressed) - # - # data = super()._create_Data( - # array, - # ncvar, - # units=units, - # calendar=calendar, - # chunks=chunks, - # **kwargs, - # ) - # - # return data - def _customise_read_vars(self): """Customise the read parameters. @@ -465,160 +356,6 @@ def _customise_read_vars(self): for term_ncvar in parsed_aggregated_data.values(): g["do_not_create_field"].add(term_ncvar) - # def _cache_data_elements(self, data, ncvar): - # """Cache selected element values. - # - # Updates *data* in-place to store its first, second, - # penultimate, and last element values (as appropriate). - # - # These values are used by, amongst other things, - # `cf.Data.equals`, `cf.aggregate` and for inspection. - # - # Doing this here is quite cheap because only the individual - # elements are read from the already-open file, as opposed to - # being retrieved from *data* (which would require a whole dask - # chunk to be read to get each single value). - # - # However, empirical evidence shows that using netCDF4 to access - # the first and last elements of a large array on disk - # (e.g. shape (1, 75, 1207, 1442)) is slow (e.g. ~2 seconds) and - # doesn't scale well with array size (i.e. it takes - # disproportionally longer for larger arrays). Such arrays are - # usually in field constructs, for which `cf.aggregate` does not - # need to know any array values, so this method should be used - # with caution, if at all, on field construct data. - # - # .. versionadded:: 3.14.0 - # - # :Parameters: - # - # data: `Data` - # The data to be updated with its cached values. - # - # ncvar: `str` - # The name of the netCDF variable that contains the - # data. - # - # :Returns: - # - # `None` - # - # """ - # - # if data.data.get_compression_type(): - # # Don't get cached elements from arrays compressed by - # # convention, as they'll likely be wrong. 
- # return - # - # g = self.read_vars - # - # # Get the netCDF4.Variable for the data - # if g["has_groups"]: - # group, name = self._netCDF4_group( - # g["variable_grouped_dataset"][ncvar], ncvar - # ) - # variable = group.variables.get(name) - # else: - # variable = g["variables"].get(ncvar) - # - # # Get the required element values - # size = data.size - # ndim = data.ndim - # - # char = False - # if variable.ndim == ndim + 1: - # dtype = variable.dtype - # if dtype is not str and dtype.kind in "SU": - # # This variable is a netCDF classic style char array - # # with a trailing dimension that needs to be collapsed - # char = True - # - # if ndim == 1: - # # Also cache the second element for 1-d data, on the - # # assumption that they may well be dimension coordinate - # # data. - # if size == 1: - # indices = (0, -1) - # value = variable[...] - # values = (value, value) - # elif size == 2: - # indices = (0, 1, -1) - # value = variable[-1:] - # values = (variable[:1], value, value) - # else: - # indices = (0, 1, -1) - # values = (variable[:1], variable[1:2], variable[-1:]) - # elif ndim == 2 and data.shape[-1] == 2: - # # Assume that 2-d data with a last dimension of size 2 - # # contains coordinate bounds, for which it is useful to - # # cache the upper and lower bounds of the the first and - # # last cells. - # indices = (0, 1, -2, -1) - # ndim1 = ndim - 1 - # values = ( - # variable[(slice(0, 1),) * ndim1 + (slice(0, 1),)], - # variable[(slice(0, 1),) * ndim1 + (slice(1, 2),)], - # ) - # if data.size == 2: - # values = values + values - # else: - # values += ( - # variable[(slice(-1, None, 1),) * ndim1 + (slice(0, 1),)], - # variable[(slice(-1, None, 1),) * ndim1 + (slice(1, 2),)], - # ) - # elif size == 1: - # indices = (0, -1) - # value = variable[...] - # values = (value, value) - # elif size == 3: - # indices = (0, 1, -1) - # if char: - # values = variable[...].reshape(3, variable.shape[-1]) - # else: - # values = variable[...].flatten() - # else: - # indices = (0, -1) - # values = ( - # variable[(slice(0, 1),) * ndim], - # variable[(slice(-1, None, 1),) * ndim], - # ) - # - # # Create a dictionary of the element values - # elements = {} - # for index, value in zip(indices, values): - # if char: - # # Variable is a netCDF classic style char array, so - # # collapse (by concatenation) the outermost (fastest - # # varying) dimension. E.g. [['a','b','c']] becomes - # # ['abc'] - # if value.dtype.kind == "U": - # value = value.astype("S") - # - # a = netCDF4.chartostring(value) - # shape = a.shape - # a = np.array([x.rstrip() for x in a.flat]) - # a = np.reshape(a, shape) - # value = np.ma.masked_where(a == "", a) - # - # if np.ma.is_masked(value): - # value = np.ma.masked - # else: - # try: - # value = value.item() - # except (AttributeError, ValueError): - # # AttributeError: A netCDF string type scalar - # # variable comes out as Python str object, which - # # has no 'item' method. - # # - # # ValueError: A size-0 array can't be converted to - # # a Python scalar. - # pass - # - # elements[index] = value - # - # # Store the elements in the data object - # data._set_cached_elements(elements) - def _create_cfanetcdfarray( self, ncvar, @@ -771,72 +508,6 @@ def _create_cfanetcdfarray_term( return array, kwargs - # - # def _parse_chunks(self, ncvar): - # """Parse the dask chunks. - # - # .. versionadded:: 3.14.0 - # - # :Parameters: - # - # ncvar: `str` - # The name of the netCDF variable containing the array. 
- # - # :Returns: - # - # `str`, `int` or `dict` - # The parsed chunks that are suitable for passing to a - # `Data` object containing the variable's array. - # - # """ - # g = self.read_vars - # - # default_chunks = "auto" - # chunks = g.get("chunks", default_chunks) - # - # if chunks is None: - # return -1 - # - # if isinstance(chunks, dict): - # if not chunks: - # return default_chunks - # - # # For ncdimensions = ('time', 'lat'): - # # - # # chunks={} -> ["auto", "auto"] - # # chunks={'ncdim%time': 12} -> [12, "auto"] - # # chunks={'ncdim%time': 12, 'ncdim%lat': 10000} -> [12, 10000] - # # chunks={'ncdim%time': 12, 'ncdim%lat': "20MB"} -> [12, "20MB"] - # # chunks={'ncdim%time': 12, 'latitude': -1} -> [12, -1] - # # chunks={'ncdim%time': 12, 'Y': None} -> [12, None] - # # chunks={'ncdim%time': 12, 'ncdim%lat': (30, 90)} -> [12, (30, 90)] - # # chunks={'ncdim%time': 12, 'ncdim%lat': None, 'X': 5} -> [12, None] - # attributes = g["variable_attributes"] - # chunks2 = [] - # for ncdim in g["variable_dimensions"][ncvar]: - # key = f"ncdim%{ncdim}" - # if key in chunks: - # chunks2.append(chunks[key]) - # continue - # - # found_coord_attr = False - # dim_coord_attrs = attributes.get(ncdim) - # if dim_coord_attrs is not None: - # for attr in ("standard_name", "axis"): - # key = dim_coord_attrs.get(attr) - # if key in chunks: - # found_coord_attr = True - # chunks2.append(chunks[key]) - # break - # - # if not found_coord_attr: - # # Use default chunks for this dimension - # chunks2.append(default_chunks) - # - # chunks = chunks2 - # - # return chunks - def _customise_field_ancillaries(self, parent_ncvar, f): """Create customised field ancillary constructs. diff --git a/docs/source/class/cf.Data.rst b/docs/source/class/cf.Data.rst index eb1fc87826..625daccbe2 100644 --- a/docs/source/class/cf.Data.rst +++ b/docs/source/class/cf.Data.rst @@ -86,6 +86,7 @@ Dask :template: attribute.rst ~cf.Data.chunks + ~cf.Data.chunksize ~cf.Data.npartitions ~cf.Data.numblocks @@ -332,6 +333,7 @@ Mask support ~cf.Data.has_fill_value ~cf.Data.set_fill_value ~cf.Data.soften_mask + ~cf.Data.masked_where .. rubric:: Attributes diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index ceafb8815e..57f17bd07d 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -624,7 +624,7 @@ retrieved with the `~Field.properties` method: .. code-block:: python :caption: *Retrieve all of the descriptive properties* - >>> q, t = cf.read('file.nc')[1] + >>> t = cf.read('file.nc')[1] >>> t.properties() {'Conventions': 'CF-1.11', 'project': 'research', From e99862ff651b2194e6e109cc426e3274a1c803ba Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 5 Dec 2024 09:06:42 +0000 Subject: [PATCH 16/27] dask --- Changelog.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Changelog.rst b/Changelog.rst index 14723cf7a4..62267376e4 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -1,7 +1,7 @@ version NEXTVERSION ------------------- -**2024-??-??** +**2024-12-??** * New method: `cf.Field.filled` (https://github.com/NCAS-CMS/cf-python/issues/811) @@ -20,6 +20,8 @@ version NEXTVERSION * New class `cf.NetCDF4Array` * New class `cf.CFAH5netcdfArray` * New class `cf.CFANetCDF4Array` +* Replace core `dask` functionality with that imported from `cfdm`. 
+ https://github.com/NCAS-CMS/cf-python/pull/836) * Fix bug that sometimes puts an incorrect ``radian-1`` or ``radian-2`` in the returned units of the differential operator methods and functions From 2d2810eac498fb56c176828c92a1dcaf664aaee6 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Thu, 5 Dec 2024 11:22:06 +0000 Subject: [PATCH 17/27] typo --- docs/source/check_docs_api_coverage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/check_docs_api_coverage.py b/docs/source/check_docs_api_coverage.py index 5b8ab98f99..8cdd5f8aa2 100644 --- a/docs/source/check_docs_api_coverage.py +++ b/docs/source/check_docs_api_coverage.py @@ -72,7 +72,7 @@ print(f"Method {method} not in {rst_file}") except FileNotFoundError: n_missing_files += 1 - print("File {rst_file} does not exist") + print(f"File {rst_file} does not exist") if n_undocumented_methods or n_missing_files: raise ValueError( From 63cd320c50fd23fcb12f244040067db948f4dd82 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Jan 2025 09:44:37 +0000 Subject: [PATCH 18/27] Typo Co-authored-by: Sadie L. Bartholomew --- Changelog.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index c8dcd4d9d7..3decded36c 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -23,8 +23,8 @@ version NEXTVERSION * New class `cf.NetCDF4Array` * New class `cf.CFAH5netcdfArray` * New class `cf.CFANetCDF4Array` -* Replace core `dask` functionality with that imported from `cfdm`. - https://github.com/NCAS-CMS/cf-python/pull/836) +* Replace core `dask` functionality with that imported from `cfdm` + (https://github.com/NCAS-CMS/cf-python/pull/836) * Fix bug that sometimes puts an incorrect ``radian-1`` or ``radian-2`` in the returned units of the differential operator methods and functions From eb7e5737eb8928036f728795f655784b20128f28 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Jan 2025 10:20:00 +0000 Subject: [PATCH 19/27] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 19340dd699..43584e5113 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -3119,7 +3119,7 @@ def _cyclic(self): """Storage for axis cyclicity. Contains a `set` that identifies which axes are cyclic (and - therefore allow cyclic slicing). The set contains a subset of + therefore allows cyclic slicing). The set contains a subset of the axis identifiers defined by the `_axes` attribute. .. warning:: Never change the value of the `_cyclic` attribute From d39d7045fa611d507526f44fb30d4cccb38388bb Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Jan 2025 10:20:21 +0000 Subject: [PATCH 20/27] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 43584e5113..31ea32e208 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -136,7 +136,7 @@ class Data(DataClassDeprecationsMixin, CFANetCDF, Container, cfdm.Data): # when a new dask array is set. See `_clear_after_dask_update` for # details. # - # These must constants must have values 2**N (N>=1), except for + # These constants must have values 2**N (N>=1), except for # `_NONE` which must be 0, and `_ALL` which must be the sum of # other constants. It is therefore convenient to define these # constants in binary. 
From cdd0eb478d5090f796a277d2f6ac09c1197b669f Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Jan 2025 10:20:57 +0000 Subject: [PATCH 21/27] Typo Co-authored-by: Sadie L. Bartholomew --- cf/units.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/units.py b/cf/units.py index d1e50a6cae..49486800a8 100644 --- a/cf/units.py +++ b/cf/units.py @@ -22,7 +22,7 @@ class Units: """ def __new__(cls, *args, **kwargs): - """Return a new new Units instance.""" + """Return a new Units instance.""" return cfUnits(*args, **kwargs) @staticmethod From e1047ef58e4e30391f554b1a88bf56b78a49b7a1 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 8 Jan 2025 10:21:15 +0000 Subject: [PATCH 22/27] Typo Co-authored-by: Sadie L. Bartholomew --- cf/data/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cf/data/data.py b/cf/data/data.py index 31ea32e208..7e1cac6cf9 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -631,7 +631,7 @@ def diff(self, axis=-1, n=1, inplace=False): [1.0 1.5 0.5]] >>> print(d.diff(n=2).array) [[0.0 0.0] - [ -- --]s + [ -- --] [0.5 -1.0]] >>> print(d.diff(axis=0).array) [[4.0 3.5 -- 4.0] From f0ca543dc1dd1dec68ee382e098d7524b05553dd Mon Sep 17 00:00:00 2001 From: David Hassell Date: Fri, 10 Jan 2025 08:39:07 +0000 Subject: [PATCH 23/27] linting --- docs/source/recipes/plot_17_recipe.py | 4 +--- docs/source/recipes/plot_18_recipe.py | 20 ++++++++++++-------- docs/source/recipes/plot_20_recipe.py | 6 ++++-- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/docs/source/recipes/plot_17_recipe.py b/docs/source/recipes/plot_17_recipe.py index 0738c62a3a..656f7c8717 100644 --- a/docs/source/recipes/plot_17_recipe.py +++ b/docs/source/recipes/plot_17_recipe.py @@ -91,9 +91,7 @@ if i == 0: set_title = "Perceptually uniform\ncolour maps" elif i == 1: - set_title = ( - "NCL colour maps enhanced to \nhelp with colour blindness" - ) + set_title = "NCL colour maps enhanced to \nhelp with colour blindness" elif i == 2: set_title = "Orography/bathymetry\ncolour maps" else: diff --git a/docs/source/recipes/plot_18_recipe.py b/docs/source/recipes/plot_18_recipe.py index d219bdfe19..3e306906ed 100644 --- a/docs/source/recipes/plot_18_recipe.py +++ b/docs/source/recipes/plot_18_recipe.py @@ -13,11 +13,11 @@ # %% # 1. Import cf-python, cf-plot and other required packages: import cfplot as cfp -import cf - import matplotlib.pyplot as plt import scipy.stats.mstats as mstats +import cf + # %% # 2. Read the data in and unpack the Fields from FieldLists using indexing. # In our example We are investigating the influence of the land height on @@ -62,7 +62,7 @@ # unitless fraction, but the values are in the tens, so we need to # normalise these to all lie between 0 and 1 and change the units # appropriately: -sub_snow = ((sub_snow - sub_snow.minimum()) / (sub_snow.range())) +sub_snow = (sub_snow - sub_snow.minimum()) / (sub_snow.range()) sub_snow.override_units("1", inplace=True) # %% @@ -93,7 +93,9 @@ # and its strength visually. We use 'gpos' to position the plots in two # columns and apply some specific axes ticks and labels for clarity. 
cfp.gopen( - rows=1, columns=2, top=0.85, + rows=1, + columns=2, + top=0.85, file="snow_and_orog_on_same_grid.png", user_position=True, ) @@ -131,10 +133,12 @@ # Don't add extentions on the colourbar since it can only be 0 to 1 inclusive cfp.levs(min=0, max=1, step=0.1, extend="neither") cfp.cscale("precip_11lev", ncols=11, reverse=1) -cfp.con(sub_snow, lines=False, - title="Snow cover extent (from satellite imagery)", - colorbar_drawedges=False, - **label_info +cfp.con( + sub_snow, + lines=False, + title="Snow cover extent (from satellite imagery)", + colorbar_drawedges=False, + **label_info, ) cfp.gclose() diff --git a/docs/source/recipes/plot_20_recipe.py b/docs/source/recipes/plot_20_recipe.py index 11c3250842..1745652afc 100644 --- a/docs/source/recipes/plot_20_recipe.py +++ b/docs/source/recipes/plot_20_recipe.py @@ -10,6 +10,7 @@ # %% # 1. Import cf-python and cf-plot: import cfplot as cfp + import cf # %% @@ -81,7 +82,8 @@ cfp.mapset(resolution="10m") cfp.cscale("ncl_default") cfp.gopen( - file=f"irish-sea-currents-divergence-{chosen_time.replace(' ', '-')}.png") + file=f"irish-sea-currents-divergence-{chosen_time.replace(' ', '-')}.png" +) cfp.vect(u=u_2, v=v_2, stride=6, scale=3, key_length=1) cfp.con( div, @@ -89,6 +91,6 @@ title=( f"Depth-averaged Irish Sea currents at {chosen_time} with " "their divergence" - ) + ), ) cfp.gclose() From bd237490d09179cf11d19b6fa4677203d4bb3235 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 15 Jan 2025 16:33:56 +0000 Subject: [PATCH 24/27] update dependcies - particular numpy<2 and dask<=2024.7.2 --- Changelog.rst | 4 +++- cf/__init__.py | 35 +++++++++++++++++++++++++++-------- docs/source/installation.rst | 8 +++++--- requirements.txt | 6 +++--- 4 files changed, 38 insertions(+), 15 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index 3decded36c..8d86dc87f4 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -43,9 +43,11 @@ version NEXTVERSION (https://github.com/NCAS-CMS/cf-python/issues/828) * New dependency: ``h5netcdf>=1.3.0`` * New dependency: ``h5py>=3.10.0`` -* New dependency: ``s3fs>=2024.2.0`` +* New dependency: ``s3fs>=2024.6.0`` +* Changed dependency: ``numpy>=1.15,<2.0`` * Changed dependency: ``1.11.2.0<=cfdm<1.11.3.0`` * Changed dependency: ``cfunits>=3.3.7`` +* Changed dependency: ``dask>=2024.6.0,<=2024.7.1`` ---- diff --git a/cf/__init__.py b/cf/__init__.py index 9e630d86ea..419df85896 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -123,7 +123,7 @@ raise ImportError(_error0 + str(error1)) try: - import numpy + import numpy as np except ImportError as error1: raise ImportError(_error0 + str(error1)) @@ -191,10 +191,11 @@ # Check the version of numpy _minimum_vn = "1.22" -if Version(numpy.__version__) < Version(_minimum_vn): - raise RuntimeError( - f"Bad numpy version: cf requires numpy>={_minimum_vn}. " - f"Got {numpy.__version__} at {numpy.__file__}" +_maximum_vn = "2.0" +if not Version(_minimum_vn) <= Version(np.__version__) < Version(_maximum_vn): + raise ValueError( + "Bad numpy version: cf requires _minimum_vn}<=numpy<{_maximum_vn}. " + f"Got {np.__version__} at {np.__file__}" ) # Check the version of cfunits @@ -208,15 +209,31 @@ # Check the version of cfdm _minimum_vn = "1.11.2.0" _maximum_vn = "1.11.3.0" -_cfdm_version = Version(cfdm.__version__) -if not Version(_minimum_vn) <= _cfdm_version < Version(_maximum_vn): +if ( + not Version(_minimum_vn) + <= Version(cfdm.__version__) + < Version(_maximum_vn) +): raise RuntimeError( f"Bad cfdm version: cf requires {_minimum_vn}<=cfdm<{_maximum_vn}. 
" - f"Got {_cfdm_version} at {cfdm.__file__}" + f"Got {cfdm.__version__} at {cfdm.__file__}" ) # Check the version of dask +# Check the version of numpy +_minimum_vn = "2024.6.1" +_maximum_vn = "2024.7.1" +if ( + not Version(_minimum_vn) + <= Version(dask.__version__) + <= Version(_maximum_vn) +): + raise ValueError( + "Bad dask version: cf requires {_minimum_vn}<=dask<={_maximum_vn}. " + f"Got {dask.__version__} at {dask.__file__}" + ) + # Check the version of Python _minimum_vn = "3.8.0" if Version(platform.python_version()) < Version(_minimum_vn): @@ -233,6 +250,8 @@ f"Got {scipy.__version__} at {scipy.__file__}" ) +del _minimum_vn, _maximum_vn + from .constructs import Constructs from .mixin import Coordinate diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 601fc487fc..a39acd77d6 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -192,9 +192,11 @@ Required * `Python `_, 3.8.0 or newer. -* `numpy `_, 1.22.0 or newer. +* `numpy `_, versions 1.15 up to, but not + including, 2.0. -* `dask `_, 2022.12.1 or newer. +* `dask `_, versions 2024.6.0 to + 2024.7.1 inclusive. * `netCDF4 `_, 1.6.5 or newer. @@ -206,7 +208,7 @@ Required * `h5py `_, version 3.10.0 or newer. -* `s3fs `_, version 2024.2.0 or newer. +* `s3fs `_, version 2024.6.0 or newer. * `scipy `_, version 1.10.0 or newer. diff --git a/requirements.txt b/requirements.txt index 8b01daddca..94886c0d57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,12 +1,12 @@ netCDF4>=1.6.5 cftime>=1.6.2 -numpy>=1.22 +numpy>=1.22,<2.0 cfdm>=1.11.2.0, <1.11.3.0 psutil>=0.6.0 cfunits>=3.3.7 -dask>=2024.4.0 +dask>=2024.6.0,<=2024.7.1 packaging>=20.0 scipy>=1.10.0 h5netcdf>=1.3.0 h5py>=3.10.0 -s3fs>=2024.2.0 +s3fs>=2024.6.0 From 0dd62f6172234f1d272a5c3dea4d401dbb330137 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 15 Jan 2025 16:38:19 +0000 Subject: [PATCH 25/27] Dask from cfdm in changelog --- Changelog.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Changelog.rst b/Changelog.rst index 8d86dc87f4..5d302a8186 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -3,6 +3,8 @@ version NEXTVERSION **2024-12-??** +* Import a lot of `dask` functionality from `cfdm` + (https://github.com/NCAS-CMS/cf-python/issues/839) * Allow ``'nearest_dtos'`` 2-d regridding to work with discrete sampling geometry source grids (https://github.com/NCAS-CMS/cf-python/issues/832) From 1c52a0f97dc31e277dd8ccfd48b55f237863f5da Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 15 Jan 2025 17:28:29 +0000 Subject: [PATCH 26/27] Typo Co-authored-by: Sadie L. 
Bartholomew --- cf/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cf/__init__.py b/cf/__init__.py index 419df85896..0c6f8f0464 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -221,7 +221,6 @@ # Check the version of dask -# Check the version of numpy _minimum_vn = "2024.6.1" _maximum_vn = "2024.7.1" if ( From 582a076de2b8cdb4396b61919fc83dbf95afdee5 Mon Sep 17 00:00:00 2001 From: David Hassell Date: Wed, 15 Jan 2025 17:31:24 +0000 Subject: [PATCH 27/27] fix duplicate entry --- Changelog.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/Changelog.rst b/Changelog.rst index 5d302a8186..b81bef9971 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -3,8 +3,6 @@ version NEXTVERSION **2024-12-??** -* Import a lot of `dask` functionality from `cfdm` - (https://github.com/NCAS-CMS/cf-python/issues/839) * Allow ``'nearest_dtos'`` 2-d regridding to work with discrete sampling geometry source grids (https://github.com/NCAS-CMS/cf-python/issues/832) @@ -26,7 +24,7 @@ version NEXTVERSION * New class `cf.CFAH5netcdfArray` * New class `cf.CFANetCDF4Array` * Replace core `dask` functionality with that imported from `cfdm` - (https://github.com/NCAS-CMS/cf-python/pull/836) + (https://github.com/NCAS-CMS/cf-python/issues/839) * Fix bug that sometimes puts an incorrect ``radian-1`` or ``radian-2`` in the returned units of the differential operator methods and functions