Merge pull request #78 from dcherian/optimize

ml31415 · web-flow · commit 5e265ef5e7b8 · 2023-04-26T09:24:57.000+03:00
Some optimizations for numpy aggregations
diff --git a/numpy_groupies/aggregate_numpy.py b/numpy_groupies/aggregate_numpy.py
@@ -24,7 +24,7 @@ def _sum(group_idx, a, size, fill_value, dtype=None):
     dtype = minimum_dtype_scalar(fill_value, dtype, a)
 
     if np.ndim(a) == 0:
-        ret = np.bincount(group_idx, minlength=size).astype(dtype)
+        ret = np.bincount(group_idx, minlength=size).astype(dtype, copy=False)
         if a != 1:
             ret *= a
     else:
@@ -33,7 +33,9 @@ def _sum(group_idx, a, size, fill_value, dtype=None):
             ret.real = np.bincount(group_idx, weights=a.real, minlength=size)
             ret.imag = np.bincount(group_idx, weights=a.imag, minlength=size)
         else:
-            ret = np.bincount(group_idx, weights=a, minlength=size).astype(dtype)
+            ret = np.bincount(group_idx, weights=a, minlength=size).astype(
+                dtype, copy=False
+            )
 
     if fill_value != 0:
         _fill_untouched(group_idx, ret, fill_value)
@@ -146,19 +148,19 @@ def _mean(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)):
         sums.real = np.bincount(group_idx, weights=a.real, minlength=size)
         sums.imag = np.bincount(group_idx, weights=a.imag, minlength=size)
     else:
-        sums = np.bincount(group_idx, weights=a, minlength=size).astype(dtype)
+        sums = np.bincount(group_idx, weights=a, minlength=size).astype(dtype, copy=False)
 
     with np.errstate(divide="ignore", invalid="ignore"):
-        ret = sums.astype(dtype) / counts
+        ret = sums.astype(dtype, copy=False) / counts
     if not np.isnan(fill_value):
         ret[counts == 0] = fill_value
     return ret
 
 
 def _sum_of_squres(group_idx, a, size, fill_value, dtype=np.dtype(np.float64)):
     ret = np.bincount(group_idx, weights=a * a, minlength=size)
-    counts = np.bincount(group_idx, minlength=size)
     if fill_value != 0:
+        counts = np.bincount(group_idx, minlength=size)
         ret[counts == 0] = fill_value
     return ret
 
@@ -171,7 +173,7 @@ def _var(
     counts = np.bincount(group_idx, minlength=size)
     sums = np.bincount(group_idx, weights=a, minlength=size)
     with np.errstate(divide="ignore", invalid="ignore"):
-        means = sums.astype(dtype) / counts
+        means = sums.astype(dtype, copy=False) / counts
         counts = np.where(counts > ddof, counts - ddof, 0)
         ret = (
             np.bincount(group_idx, (a - means[group_idx]) ** 2, minlength=size) / counts
@@ -299,6 +301,7 @@ def _aggregate_base(
     dtype=None,
     axis=None,
     _impl_dict=_impl_dict,
+    is_pandas=False,
     **kwargs
 ):
     iv = input_validation(group_idx, a, size=size, order=order, axis=axis, func=func)
@@ -324,7 +327,9 @@ def _aggregate_base(
                     kwargs["_nansqueeze"] = True
                 else:
                     good = ~np.isnan(a)
-                    a = a[good]
+                    if "len" not in func or is_pandas:
+                        # a is not needed for len, nanlen!
+                        a = a[good]
                     group_idx = group_idx[good]
 
         dtype = check_dtype(dtype, func, a, flat_size)
diff --git a/numpy_groupies/aggregate_pandas.py b/numpy_groupies/aggregate_pandas.py
@@ -69,6 +69,7 @@ def aggregate(
         func=func,
         axis=axis,
         _impl_dict=_impl_dict,
+        is_pandas=True,
         **kwargs
     )
 

Original file line number	Diff line number	Diff line change
`@@ -69,6 +69,7 @@ def aggregate(`
`69`	`69`	`func=func,`
`70`	`70`	`axis=axis,`
`71`	`71`	`_impl_dict=_impl_dict,`
	`72`	`+ is_pandas=True,`
`72`	`73`	`**kwargs`
`73`	`74`	`)`
`74`	`75`