Skip to content

Commit

Permalink
Convert datetimes to numeric. (#108)
Browse files Browse the repository at this point in the history
  • Loading branch information
dcherian authored Jun 2, 2022
1 parent 227ce04 commit ccbda27
Show file tree
Hide file tree
Showing 3 changed files with 199 additions and 3 deletions.
22 changes: 22 additions & 0 deletions flox/xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pandas as pd
import xarray as xr
from packaging.version import Version
from xarray.core.duck_array_ops import _datetime_nanmin

from .aggregations import Aggregation, _atleast_1d
from .core import (
Expand All @@ -15,6 +16,7 @@
rechunk_for_blockwise as rechunk_array_for_blockwise,
rechunk_for_cohorts as rechunk_array_for_cohorts,
)
from .xrutils import _contains_cftime_datetimes, _to_pytimedelta, datetime_to_numeric

if TYPE_CHECKING:
from xarray import DataArray, Dataset, Resample
Expand Down Expand Up @@ -289,7 +291,27 @@ def wrapper(array, *by, func, skipna, **kwargs):
if "nan" not in func and func not in ["all", "any", "count"]:
func = f"nan{func}"

requires_numeric = func not in ["count", "any", "all"]
if requires_numeric:
is_npdatetime = array.dtype.kind in "Mm"
is_cftime = _contains_cftime_datetimes(array)
if is_npdatetime:
offset = _datetime_nanmin(array)
# xarray always uses np.datetime64[ns] for np.datetime64 data
dtype = "timedelta64[ns]"
array = datetime_to_numeric(array, offset)
elif _contains_cftime_datetimes(array):
offset = min(array)
array = datetime_to_numeric(array, offset, datetime_unit="us")

result, *groups = groupby_reduce(array, *by, func=func, **kwargs)

if requires_numeric:
if is_npdatetime:
return result.astype(dtype) + offset
elif is_cftime:
return _to_pytimedelta(result, unit="us") + offset

return result

# These data variables do not have any of the core dimension,
Expand Down
173 changes: 173 additions & 0 deletions flox/xrutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,18 @@
# defined in xarray


import datetime
from typing import Any, Iterable

import numpy as np
import pandas as pd

try:
import cftime
except ImportError:
cftime = None


try:
import dask.array

Expand All @@ -15,6 +22,10 @@
dask_array_type = ()


def asarray(data, xp=np):
    """Return ``data`` unchanged if it is already a duck array, else coerce it.

    Parameters
    ----------
    data : array-like
        Value to pass through or convert.
    xp : module, optional
        Array namespace whose ``asarray`` performs the conversion
        (``numpy`` by default).
    """
    if is_duck_array(data):
        return data
    return xp.asarray(data)


def is_duck_array(value: Any) -> bool:
"""Checks if value is a duck array."""
if isinstance(value, np.ndarray):
Expand Down Expand Up @@ -110,3 +121,165 @@ def isnull(data):
# a null value as well as NaN, but it isn't clear how to do this
# with duck typing.
return data != data


def datetime_to_numeric(array, offset=None, datetime_unit=None, dtype=float):
    """Convert an array containing datetime-like data to numerical values.

    The datetimes are first turned into timedeltas relative to ``offset`` and
    those timedeltas are then converted to numbers.

    Parameters
    ----------
    array : array-like
        Input data.
    offset : None, datetime or cftime.datetime
        Datetime offset. If None, the minimum of ``array`` is used, which
        reduces round-off errors.
    datetime_unit : {None, Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
        If not None, express the output in this unit. Some conversions are not
        allowed because the relationship between units is non-linear.
    dtype : dtype
        Output dtype.

    Returns
    -------
    array
        Numerical representation of the datetimes relative to ``offset``.
        ``None`` is returned when the difference is neither an object array
        nor a datetime64/timedelta64 array.

    Notes
    -----
    Some datetime unit conversions won't work, for example from days to years,
    even though some calendars would allow for them (e.g. no_leap). This is
    because there is no `cftime.timedelta` object.
    """
    # TODO: make this function dask-compatible?
    from xarray.core.duck_array_ops import _datetime_nanmin

    # Default the offset to the minimum of the data.
    if offset is None:
        offset = _datetime_nanmin(array) if array.dtype.kind in "Mm" else min(array)

    # Compute the timedelta.
    # For np.datetime64, this can silently yield garbage due to overflow.
    # One option is to enforce 1970-01-01 as the universal offset.

    # The map_blocks path exists for backwards compatibility:
    # dask == 2021.04.1 does not support subtracting object arrays,
    # which is required for cftime.
    needs_blockwise_subtract = is_duck_dask_array(array) and np.issubdtype(
        array.dtype, object
    )
    if needs_blockwise_subtract:
        delta = array.map_blocks(lambda a, b: a - b, offset, meta=array._meta)
    else:
        delta = array - offset

    # A scalar result is wrapped into a 0d-array.
    if not hasattr(delta, "dtype"):
        delta = np.array(delta)

    kind = delta.dtype.kind

    # Object arrays hold timedelta objects; these go through microseconds.
    if kind in "O":
        return py_timedelta_to_float(delta, datetime_unit or "ns").astype(dtype)

    # datetime64/timedelta64: rescale if requested and map NaT to NaN.
    if kind in "mM":
        if datetime_unit:
            delta = delta / np.timedelta64(1, datetime_unit)
        return np.where(isnull(delta), np.nan, delta.astype(dtype))

    return None


def timedelta_to_numeric(value, datetime_unit="ns", dtype=float):
    """Convert a timedelta-like object to numerical values.

    Parameters
    ----------
    value : datetime.timedelta, numpy.timedelta64, pandas.Timedelta, str
        Time delta representation. Strings are parsed with
        ``pandas.to_timedelta``.
    datetime_unit : {Y, M, W, D, h, m, s, ms, us, ns, ps, fs, as}
        The time units of the output values. Note that some conversions are
        not allowed due to non-linear relationships between units.
    dtype : type
        The output data type. It is applied to every supported input type,
        including strings.

    Returns
    -------
    Numerical value of ``value`` expressed in ``datetime_unit``, cast to
    ``dtype``.

    Raises
    ------
    ValueError
        If ``value`` is a string that ``pandas.to_timedelta`` cannot parse.
        The pandas error is attached as the cause.
    TypeError
        If ``value`` is not one of the supported types.
    """
    import datetime as dt

    if isinstance(value, dt.timedelta):
        out = py_timedelta_to_float(value, datetime_unit)
    elif isinstance(value, np.timedelta64):
        out = np_timedelta64_to_float(value, datetime_unit)
    elif isinstance(value, pd.Timedelta):
        # NOTE(review): pandas.Timedelta appears to subclass
        # datetime.timedelta, in which case the first branch already handles
        # it and this one is never reached -- confirm before reordering.
        out = pd_timedelta_to_float(value, datetime_unit)
    elif isinstance(value, str):
        try:
            parsed = pd.to_timedelta(value)
        except ValueError as err:
            raise ValueError(
                f"Could not convert {value!r} to timedelta64 using pandas.to_timedelta"
            ) from err
        # Fall through to the shared ``astype`` so ``dtype`` is honoured for
        # strings as well.
        out = py_timedelta_to_float(parsed, datetime_unit)
    else:
        raise TypeError(
            f"Expected value of type str, pandas.Timedelta, datetime.timedelta "
            f"or numpy.timedelta64, but received {type(value).__name__}"
        )
    return out.astype(dtype)


def _to_pytimedelta(array, unit="us"):
    """Cast ``array`` to ``timedelta64[unit]`` and then to ``datetime.timedelta``.

    Parameters
    ----------
    array : array-like with an ``astype`` method
        Values interpreted as counts of ``unit``.
    unit : str, optional
        numpy timedelta64 unit code used for the intermediate cast.
    """
    as_timedelta64 = array.astype(f"timedelta64[{unit}]")
    return as_timedelta64.astype(datetime.timedelta)


def np_timedelta64_to_float(array, datetime_unit):
    """Convert numpy.timedelta64 to float.

    Parameters
    ----------
    array : numpy.timedelta64 or array of timedelta64
        Values to convert.
    datetime_unit : str
        numpy unit code the result is expressed in.

    Notes
    -----
    The input is first cast to nanosecond resolution and then to float64;
    the result is rescaled by the ratio of one nanosecond to one
    ``datetime_unit``.
    """
    nanoseconds = array.astype("timedelta64[ns]").astype(np.float64)
    units_per_ns = np.timedelta64(1, "ns") / np.timedelta64(1, datetime_unit)
    return units_per_ns * nanoseconds


def pd_timedelta_to_float(value, datetime_unit):
    """Convert pandas.Timedelta to float.

    Parameters
    ----------
    value : pandas.Timedelta
        Value to convert; its ``to_timedelta64()`` result is what gets
        converted.
    datetime_unit : str
        numpy unit code the result is expressed in.

    Notes
    -----
    Built on the assumption that pandas timedelta values are in nanoseconds,
    which is also the numpy default resolution.
    """
    return np_timedelta64_to_float(value.to_timedelta64(), datetime_unit)


def _timedelta_to_seconds(array):
    """Convert an array of timedelta objects to floats, keeping the shape.

    Each element's ``total_seconds()`` is multiplied by 1e6, so despite the
    function's name the returned values are in microseconds.
    """
    seconds = [delta.total_seconds() for delta in array.ravel()]
    return np.reshape(seconds, array.shape) * 1e6


def py_timedelta_to_float(array, datetime_unit):
    """Convert a timedelta object to a float, possibly at a loss of resolution.

    Parameters
    ----------
    array : timedelta-like or array of timedelta objects
        Values to convert; coerced with ``asarray`` first.
    datetime_unit : str
        numpy unit code the result is expressed in.
    """
    values = asarray(array)
    if is_duck_dask_array(values):
        # Apply the conversion per block; ``meta`` declares the float64 output.
        microseconds = values.map_blocks(
            _timedelta_to_seconds, meta=np.array([], dtype=np.float64)
        )
    else:
        microseconds = _timedelta_to_seconds(values)
    # ``_timedelta_to_seconds`` yields microseconds, so rescale from "us".
    units_per_us = np.timedelta64(1, "us") / np.timedelta64(1, datetime_unit)
    return units_per_us * microseconds


def _contains_cftime_datetimes(array) -> bool:
    """Check if an array contains cftime.datetime objects.

    Only the first element of the flattened array is inspected. Returns
    False when cftime is not installed, when the array is not of object
    dtype, or when it is empty.
    """
    if cftime is None:
        return False

    is_nonempty_object_array = array.dtype == np.dtype("O") and array.size > 0
    if not is_nonempty_object_array:
        return False

    first = array.ravel()[0]
    if is_duck_dask_array(first):
        # Materialize the lazy element before checking its type.
        first = first.compute()
        if isinstance(first, np.ndarray):
            first = first.item()
    return isinstance(first, cftime.datetime)
7 changes: 4 additions & 3 deletions tests/test_xarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,13 +414,14 @@ def test_cache():


@pytest.mark.parametrize("use_cftime", [True, False])
def test_datetime_array_reduce(use_cftime):
@pytest.mark.parametrize("func", ["count", "mean"])
def test_datetime_array_reduce(use_cftime, func):

time = xr.DataArray(
xr.date_range("2009-01-01", "2012-12-31", use_cftime=use_cftime),
dims=("time",),
name="time",
)
expected = time.resample(time="YS").count() # fails
actual = resample_reduce(time.resample(time="YS"), func="count", engine="flox")
expected = getattr(time.resample(time="YS"), func)()
actual = resample_reduce(time.resample(time="YS"), func=func, engine="flox")
assert_equal(expected, actual)

0 comments on commit ccbda27

Please sign in to comment.