diff --git a/ci/docs.yml b/ci/docs.yml index b11768282..9cdfb38e5 100644 --- a/ci/docs.yml +++ b/ci/docs.yml @@ -5,6 +5,7 @@ dependencies: - dask-core - pip - xarray + - numpy>=1.20 - numpydoc - numpy_groupies - toolz diff --git a/ci/environment.yml b/ci/environment.yml index bbaf5ded6..aff6bc911 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -7,6 +7,7 @@ dependencies: - dask-core - netcdf4 - pandas + - numpy>=1.20 - pip - pytest - pytest-cov diff --git a/ci/minimal-requirements.yml b/ci/minimal-requirements.yml index 81b483e74..882c8d1fb 100644 --- a/ci/minimal-requirements.yml +++ b/ci/minimal-requirements.yml @@ -8,7 +8,8 @@ dependencies: - pytest - pytest-cov - pytest-xdist - - numpy_groupies>=0.9.15 + - numpy==1.20 + - numpy_groupies==0.9.15 - pandas - pooch - toolz diff --git a/ci/no-dask.yml b/ci/no-dask.yml index 698297918..31ce0ade3 100644 --- a/ci/no-dask.yml +++ b/ci/no-dask.yml @@ -5,6 +5,7 @@ dependencies: - codecov - netcdf4 - pandas + - numpy>=1.20 - pip - pytest - pytest-cov diff --git a/ci/no-xarray.yml b/ci/no-xarray.yml index 6e54d8f4b..25c777fa1 100644 --- a/ci/no-xarray.yml +++ b/ci/no-xarray.yml @@ -5,6 +5,7 @@ dependencies: - codecov - netcdf4 - pandas + - numpy>=1.20 - pip - pytest - pytest-cov diff --git a/flox/aggregate_flox.py b/flox/aggregate_flox.py index fc84250e7..62a760653 100644 --- a/flox/aggregate_flox.py +++ b/flox/aggregate_flox.py @@ -5,6 +5,21 @@ from .xrutils import isnull +def _prepare_for_flox(group_idx, array): + """ + Sort the input array once to save time. 
+ """ + assert array.shape[-1] == group_idx.shape[0] + issorted = (group_idx[:-1] <= group_idx[1:]).all() + if issorted: + ordered_array = array + else: + perm = group_idx.argsort(kind="stable") + group_idx = group_idx[..., perm] + ordered_array = array[..., perm] + return group_idx, ordered_array + + def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dtype=None, out=None): """ most of this code is from shoyer's gist @@ -13,7 +28,7 @@ def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dt # assumes input is sorted, which I do in core._prepare_for_flox aux = group_idx - flag = np.concatenate(([True], aux[1:] != aux[:-1])) + flag = np.concatenate((np.array([True], like=array), aux[1:] != aux[:-1])) uniques = aux[flag] (inv_idx,) = flag.nonzero() @@ -25,11 +40,11 @@ def _np_grouped_op(group_idx, array, op, axis=-1, size=None, fill_value=None, dt if out is None: out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype) - if (len(uniques) == size) and (uniques == np.arange(size)).all(): + if (len(uniques) == size) and (uniques == np.arange(size, like=array)).all(): # The previous version of this if condition # ((uniques[1:] - uniques[:-1]) == 1).all(): # does not work when group_idx is [1, 2] for e.g. 
- # This happens during binning + # This happens during binning op.reduceat(array, inv_idx, axis=axis, dtype=dtype, out=out) else: out[..., uniques] = op.reduceat(array, inv_idx, axis=axis, dtype=dtype) @@ -91,8 +106,7 @@ def nanlen(group_idx, array, *args, **kwargs): def mean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): if fill_value is None: fill_value = 0 - out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype) - sum(group_idx, array, axis=axis, size=size, dtype=dtype, out=out) + out = sum(group_idx, array, axis=axis, size=size, dtype=dtype, fill_value=fill_value) out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) return out @@ -100,7 +114,6 @@ def mean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): def nanmean(group_idx, array, *, axis=-1, size=None, fill_value=None, dtype=None): if fill_value is None: fill_value = 0 - out = np.full(array.shape[:-1] + (size,), fill_value=fill_value, dtype=dtype) - nansum(group_idx, array, size=size, axis=axis, dtype=dtype, out=out) + out = nansum(group_idx, array, size=size, axis=axis, dtype=dtype, fill_value=fill_value) out /= nanlen(group_idx, array, size=size, axis=axis, fill_value=0) return out diff --git a/flox/aggregations.py b/flox/aggregations.py index c97c97477..fad92a975 100644 --- a/flox/aggregations.py +++ b/flox/aggregations.py @@ -46,6 +46,8 @@ def generic_aggregate( f"Expected engine to be one of ['flox', 'numpy', 'numba']. Received {engine} instead." ) + group_idx = np.asarray(group_idx, like=array) + return method( group_idx, array, axis=axis, size=size, fill_value=fill_value, dtype=dtype, **kwargs ) diff --git a/flox/core.py b/flox/core.py index f39a3fe4e..943fd029e 100644 --- a/flox/core.py +++ b/flox/core.py @@ -13,6 +13,7 @@ import toolz as tlz from . 
import xrdtypes +from .aggregate_flox import _prepare_for_flox from .aggregations import ( Aggregation, _atleast_1d, @@ -44,21 +45,6 @@ def _is_arg_reduction(func: str | Aggregation) -> bool: return False -def _prepare_for_flox(group_idx, array): - """ - Sort the input array once to save time. - """ - assert array.shape[-1] == group_idx.shape[0] - issorted = (group_idx[:-1] <= group_idx[1:]).all() - if issorted: - ordered_array = array - else: - perm = group_idx.argsort(kind="stable") - group_idx = group_idx[..., perm] - ordered_array = array[..., perm] - return group_idx, ordered_array - - def _get_expected_groups(by, sort, *, raise_if_dask=True) -> pd.Index | None: if is_duck_dask_array(by): if raise_if_dask: @@ -1367,7 +1353,7 @@ def groupby_reduce( min_count: int | None = None, split_out: int = 1, method: str = "map-reduce", - engine: str = "flox", + engine: str = "numpy", reindex: bool | None = None, finalize_kwargs: Mapping | None = None, ) -> tuple[DaskArray, np.ndarray | DaskArray]: @@ -1434,13 +1420,14 @@ def groupby_reduce( and is identical to xarray's default strategy. engine : {"flox", "numpy", "numba"}, optional Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk: + * ``"numpy"``: + Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``. + This is the default choice because it works for most array types. * ``"flox"``: Use an internal implementation where the data is sorted so that all members of a group occur sequentially, and then numpy.ufunc.reduceat is to used for the reduction. This will fall back to ``numpy_groupies.aggregate_numpy`` for a reduction that is not yet implemented. - * ``"numpy"``: - Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``. * ``"numba"``: Use the implementations in ``numpy_groupies.aggregate_numba``. 
reindex : bool, optional diff --git a/flox/xarray.py b/flox/xarray.py index 358b57abd..9302dc318 100644 --- a/flox/xarray.py +++ b/flox/xarray.py @@ -61,7 +61,7 @@ def xarray_reduce( split_out: int = 1, fill_value=None, method: str = "map-reduce", - engine: str = "flox", + engine: str = "numpy", keep_attrs: bool | None = True, skipna: bool | None = None, min_count: int | None = None, @@ -125,13 +125,14 @@ def xarray_reduce( and is identical to xarray's default strategy. engine : {"flox", "numpy", "numba"}, optional Algorithm to compute the groupby reduction on non-dask arrays and on each dask chunk: + * ``"numpy"``: + Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``. + This is the default choice because it works for most array types. + * ``"flox"``: Use an internal implementation where the data is sorted so that all members of a group occur sequentially, and then numpy.ufunc.reduceat is to used for the reduction. This will fall back to ``numpy_groupies.aggregate_numpy`` for a reduction that is not yet implemented. - * ``"numpy"``: - Use the vectorized implementations in ``numpy_groupies.aggregate_numpy``. * ``"numba"``: Use the implementations in ``numpy_groupies.aggregate_numba``. keep_attrs : bool, optional diff --git a/flox/xrutils.py b/flox/xrutils.py index 047a83408..17ad2d71d 100644 --- a/flox/xrutils.py +++ b/flox/xrutils.py @@ -98,7 +98,8 @@ def is_scalar(value: Any, include_0d: bool = True) -> bool: def isnull(data): - data = np.asarray(data) + if not is_duck_array(data): + data = np.asarray(data) scalar_type = data.dtype.type if issubclass(scalar_type, (np.datetime64, np.timedelta64)): # datetime types use NaT for null diff --git a/setup.cfg b/setup.cfg index e99882db4..f254a2f19 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,6 +27,7 @@ include_package_data = True python_requires = >=3.8 install_requires = pandas + numpy >= '1.20' numpy_groupies >= '0.9.15' toolz