diff --git a/ci/docs.yml b/ci/docs.yml index b11768282..9cdfb38e5 100644 --- a/ci/docs.yml +++ b/ci/docs.yml @@ -5,6 +5,7 @@ dependencies: - dask-core - pip - xarray + - numpy>=1.20 - numpydoc - numpy_groupies - toolz diff --git a/ci/environment.yml b/ci/environment.yml index bbaf5ded6..aff6bc911 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,6 +7,7 @@ dependencies: - dask-core - netcdf4 - pandas + - numpy>=1.20 - pip - pytest - pytest-cov diff --git a/ci/minimal-requirements.yml b/ci/minimal-requirements.yml index 81b483e74..882c8d1fb 100644 --- a/ci/minimal-requirements.yml +++ b/ci/minimal-requirements.yml @@ -8,7 +8,8 @@ dependencies: - pytest - pytest-cov - pytest-xdist - - numpy_groupies>=0.9.15 + - numpy==1.20 + - numpy_groupies==0.9.15 - pandas - pooch - toolz diff --git a/ci/no-dask.yml b/ci/no-dask.yml index 698297918..31ce0ade3 100644 --- a/ci/no-dask.yml +++ b/ci/no-dask.yml @@ -5,6 +5,7 @@ dependencies: - codecov - netcdf4 - pandas + - numpy>=1.20 - pip - pytest - pytest-cov diff --git a/ci/no-xarray.yml b/ci/no-xarray.yml index 6e54d8f4b..25c777fa1 100644 --- a/ci/no-xarray.yml +++ b/ci/no-xarray.yml @@ -5,6 +5,7 @@ dependencies: - codecov - netcdf4 - pandas + - numpy>=1.20 - pip - pytest - pytest-cov diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index fc84250e7..62a760653 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -5,6 +5,21 @@ from .xrutils import isnull +def _prepare_for_flox(group_idx, array): + """ + Sort the input array once to save time. 
+ """ + assert array.shape[-1] == group_idx.shape[0] + issorted = (group_idx[:-1] <= group_idx[1:]).all() + if issorted: + ordered_array = array + else: + perm = group_idx.argsort(kind="stable") + group_idx = group_idx[..., perm] + ordered_array = array[..., perm] + return group_idx, ordered_array + + def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dtype=None, out=None): """ most of this code is from shoyer's gist @@ -13,7 +28,7 @@ def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dt # assumes input is sorted, which I do in core._prepare_for_flox aux = group_idx - flag = np.concatenate(([True], aux[1:] != aux[:-1])) + flag = np.concatenate((np.array([True], like=array), aux[1:] != aux[:-1])) uniques = aux[flag] (inv_idx,) = flag.nonzero() @@ -25,11 +40,11 @@ def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dt if out is None: out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype) - if (len(uniques) == size) and (uniques == np.arange(size)).all(): + if (len(uniques) == size) and (uniques == np.arange(size, like=array)).all(): # The previous version of this if condition # ((uniques[1:] - uniques[:-1]) == 1).all(): # does not work when group_idx is [1, 2] for e.g. 
- # This happens during binning + # This happens during binning op.reduceat(array, inv_idx, axis=axis, dtype=dtype, out=out) else: out[..., uniques] = op.reduceat(array, inv_idx, axis=axis, dtype=dtype) @@ -91,8 +106,7 @@ def nanlen(group_idx, array, *args, **kwargs): def mean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): if fill_value is None: fill_value = 0 - out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype) - sum(group_idx, array, axis=axis, size=size, dtype=dtype, out=out) + out = sum(group_idx, array, axis=axis, size=size, dtype=dtype, fill_value=fill_value) out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) return out @@ -100,7 +114,6 @@ def mean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): if fill_value is None: fill_value = 0 - out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype) - nansum(group_idx, array, size=size, axis=axis, dtype=dtype, out=out) + out = nansum(group_idx, array, size=size, axis=axis, dtype=dtype, fill_value=fill_value) out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) return out diff --git a/flox/aggregations.py b/flox/aggregations.py index c97c97477..fad92a975 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -46,6 +46,8 @@ def generic_aggregate( f"Expected engine to be one of ['flox', 'numpy', 'numba']. Received {engine} instead." ) + group_idx = np.asarray(group_idx, like=array) + return method( group_idx, array, axis=axis, size=size, fill_value=fill_value, dtype=dtype, **kwargs ) diff --git a/flox/core.py b/flox/core.py index f39a3fe4e..943fd029e 100644 --- a/flox/core.py +++ b/flox/core.py @@ -13,6 +13,7 @@ import toolz as tlz from . 
import xrdtypes +from .aggregate_flox import _prepare_for_flox from .aggregations import ( Aggregation, _atleast_1d, @@ -44,21 +45,6 @@ def _is_arg_reduction(func: str | Aggregation) -> bool: return False -def _prepare_for_flox(group_idx, array): - """ - Sort the input array once to save time. - """ - assert array.shape[-1] == group_idx.shape[0] - issorted = (group_idx[:-1] <= group_idx[1:]).all() - if issorted: - ordered_array = array - else: - perm = group_idx.argsort(kind="stable") - group_idx = group_idx[..., perm] - ordered_array = array[..., perm] - return group_idx, ordered_array - - def _get_expected_groups(by, sort, *, raise_if_dask=True) -> pd.Index | None: if is_duck_dask_array(by): if raise_if_dask: @@ -1367,7 +1353,7 @@ def groupby_reduce( min_count: int | None = None, split_out: int = 1, method: str = "map-reduce", - engine: str = "flox", + engine: str = "numpy", reindex: bool | None = None, finalize_kwargs: Mapping | None = None, ) -> tuple[DaskArray, np.ndarray | DaskArray]: @@ -1434,13 +1420,14 @@ def groupby_reduce( and is identical to xarray's default strategy. engine : {"flox", "numpy", "numba"}, optional Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk: + * ``"numpy"``: + Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``. + This is the default choice because it works for most array types. * ``"flox"``: Use an internal implementation where the data is sorted so that all members of a group occur sequentially, and then numpy.ufunc.reduceat is to used for the reduction. This will fall back to ``numpy_groupies.aggregate_numpy`` for a reduction that is not yet implemented. - * ``"numpy"``: - Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``. * ``"numba"``: Use the implementations in ``numpy_groupies.aggregate_numba``. 
reindex : bool, optional diff --git a/flox/xarray.py b/flox/xarray.py index 358b57abd..9302dc318 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -61,7 +61,7 @@ def xarray_reduce( split_out: int = 1, fill_value=None, method: str = "map-reduce", - engine: str = "flox", + engine: str = "numpy", keep_attrs: bool | None = True, skipna: bool | None = None, min_count: int | None = None, @@ -125,13 +125,14 @@ def xarray_reduce( and is identical to xarray's default strategy. engine : {"flox", "numpy", "numba"}, optional Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk: + * ``"numpy"``: + Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``. + This is the default choice because it works for most array types. + * ``"flox"``: Use an internal implementation where the data is sorted so that all members of a group occur sequentially, and then numpy.ufunc.reduceat is to used for the reduction. This will fall back to ``numpy_groupies.aggregate_numpy`` for a reduction that is not yet implemented. - * ``"numpy"``: - Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``. * ``"numba"``: Use the implementations in ``numpy_groupies.aggregate_numba``. keep_attrs : bool, optional diff --git a/flox/xrutils.py b/flox/xrutils.py index 047a83408..17ad2d71d 100644 --- a/flox/xrutils.py +++ b/flox/xrutils.py @@ -98,7 +98,8 @@ def is_scalar(value: Any, include_0d: bool = True) -> bool: def isnull(data): - data = np.asarray(data) + if not is_duck_array(data): + data = np.asarray(data) scalar_type = data.dtype.type if issubclass(scalar_type, (np.datetime64, np.timedelta64)): # datetime types use NaT for null diff --git a/setup.cfg b/setup.cfg index e99882db4..f254a2f19 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,6 +27,7 @@ include_package_data = True python_requires = >=3.8 install_requires = pandas + numpy >= '1.20' numpy_groupies >= '0.9.15' toolz