Address comments in the PR

mdboom · mdboom · commit a685a14d1efc · 2024-12-18T14:03:54.000-05:00
diff --git a/doc/api.rst b/doc/api.rst
@@ -345,6 +345,19 @@ Benchmark class
 
       Raise an exception if the benchmark has no values.
 
+   .. method:: required_nprocesses()
+
+      Determines the number of separate process runs that would be required
+      to achieve stable results. Specifically, the target is to have 95%
+      certainty that there is a variance of less than 1%. If the result is
+      greater than the number of processes recorded in the input data, the
+      value is meaningless and only means "more samples are required".
+
+      The method used is described in this Wikipedia article about estimating
+      the sampling of a mean:
+
+      https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
+
    .. method:: update_metadata(metadata: dict)
 
       Update metadata of all runs of the benchmark.
diff --git a/pyperf/__main__.py b/pyperf/__main__.py
@@ -455,7 +455,8 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False,
                                            dump=dump,
                                            checks=checks,
                                            result=result,
-                                           display_runs_args=display_runs_args)
+                                           display_runs_args=display_runs_args,
+                                           only_checks=only_checks)
 
             if bench_lines:
                 empty_line(lines)
diff --git a/pyperf/_bench.py b/pyperf/_bench.py
@@ -424,17 +424,23 @@ def median_abs_dev(self):
             raise ValueError("MAD must be >= 0")
         return value
 
-    def required_nsamples(self):
+    def required_nprocesses(self):
         """
-        Determines the number of samples that would be required to have 95%
-        certainty that the samples have a variance of less than 1%.
+        Determines the number of separate process runs that would be required
+        to achieve stable results. Specifically, the target is to have 95%
+        certainty that there is a variance of less than 1%. If the result is
+        greater than the number of processes recorded in the input data, the
+        value is meaningless and only means "more samples are required".
 
-        This is described in this Wikipedia article about estimating the sampling of
-        a mean:
+        The method used is described in this Wikipedia article about estimating
+        the sampling of a mean:
 
         https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
         """
-        # Get the means of the values per run
+        # Get the means of the values per process. The values within the process
+        # often vary considerably (e.g. due to cache effects), but the variances
+        # between processes should be fairly consistent. Additionally, this
+        # value is intended to be advice for the number of processes to run.
         values = []
         for run in self._runs:
             if len(run.values):
@@ -446,6 +452,7 @@ def required_nsamples(self):
         total = math.fsum(values)
         mean = total / len(values)
         stddev = statistics.stdev(values)
+
         # Normalize the stddev so we can target "percentage changed" rather than
         # absolute time
         sigma = stddev / mean
@@ -455,6 +462,7 @@ def required_nsamples(self):
         # 1% variation
         W = 0.01
 
+        # Required sample size: n = (4Z²σ²)/(W²), rounded up to an integer
         return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2)))
 
     def percentile(self, p):
diff --git a/pyperf/_cli.py b/pyperf/_cli.py
@@ -400,7 +400,7 @@ def value_bucket(value):
     return lines
 
 
-def format_checks(bench, lines=None):
+def format_checks(bench, lines=None, check_too_many_processes=False):
     if lines is None:
         lines = []
 
@@ -413,7 +413,7 @@ def format_checks(bench, lines=None):
     warnings = []
     warn = warnings.append
 
-    required_nsamples = bench.required_nsamples()
+    required_nprocesses = bench.required_nprocesses()
 
     # Display a warning if the standard deviation is greater than 10%
     # of the mean
@@ -426,8 +426,8 @@ def format_checks(bench, lines=None):
         else:
             # display a warning if the number of samples isn't enough to get a stable result
             if (
-                required_nsamples is not None and
-                required_nsamples > len(bench._runs)
+                required_nprocesses is not None and
+                required_nprocesses > len(bench._runs)
             ):
                 warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)")
 
@@ -467,13 +467,14 @@ def format_checks(bench, lines=None):
         lines.append("Use --quiet option to hide these warnings.")
 
     if (
-        required_nsamples is not None and
-        required_nsamples < len(bench._runs) * 0.75
+        check_too_many_processes and
+        required_nprocesses is not None and
+        required_nprocesses < len(bench._runs) * 0.75
     ):
         lines.append("Benchmark was run more times than necessary to get a stable result.")
         lines.append(
             "Consider passing processes=%d to the Runner constructor to save time." %
-            required_nsamples
+            required_nprocesses
         )
 
     # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata
@@ -568,7 +569,7 @@ def format_result(bench):
 
 def format_benchmark(bench, checks=True, metadata=False,
                      dump=False, stats=False, hist=False, show_name=False,
-                     result=True, display_runs_args=None):
+                     result=True, display_runs_args=None, only_checks=False):
     lines = []
 
     if metadata:
@@ -587,7 +588,7 @@ def format_benchmark(bench, checks=True, metadata=False,
         format_stats(bench, lines=lines)
 
     if checks:
-        format_checks(bench, lines=lines)
+        format_checks(bench, lines=lines, check_too_many_processes=only_checks)
 
     if result:
         empty_line(lines)
diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py
@@ -478,16 +478,11 @@ def test_hist(self):
             22.8 ms:  3 ##############
             22.9 ms:  4 ###################
             22.9 ms:  4 ###################
-            Benchmark was run more times than necessary to get a stable result.
-            Consider passing processes=7 to the Runner constructor to save time.
         """)
         self.check_command(expected, 'hist', TELCO, env=env)
 
     def test_show(self):
         expected = ("""
-            Benchmark was run more times than necessary to get a stable result.
-            Consider passing processes=7 to the Runner constructor to save time.
-
             Mean +- std dev: 22.5 ms +- 0.2 ms
         """)
         self.check_command(expected, 'show', TELCO)
@@ -523,8 +518,6 @@ def test_stats(self):
             100th percentile: 22.9 ms (+2% of the mean) -- maximum
 
             Number of outlier (out of 22.0 ms..23.0 ms): 0
-            Benchmark was run more times than necessary to get a stable result.
-            Consider passing processes=7 to the Runner constructor to save time.
         """)
         self.check_command(expected, 'stats', TELCO)
 
@@ -635,6 +628,14 @@ def test_slowest(self):
 
     def test_check_stable(self):
         stdout = self.run_command('check', TELCO)
+        self.assertTrue(
+            textwrap.dedent(
+                """
+                Benchmark was run more times than necessary to get a stable result.
+                Consider passing processes=7 to the Runner constructor to save time.
+                """
+            ).strip() in stdout.rstrip()
+        )
         self.assertTrue(
             'The benchmark seems to be stable' in
             stdout.rstrip()