diff --git a/doc/cli.rst b/doc/cli.rst
index 7c4229e5..3cac1f4a 100644
--- a/doc/cli.rst
+++ b/doc/cli.rst
@@ -491,6 +491,7 @@ Usage
         [--duplicate DUPLICATE]
         [-s SETUP]
         [--teardown TEARDOWN]
+        [--profile PROFILE]
         stmt [stmt ...]
 
 Options:
@@ -512,6 +513,7 @@ Options:
 * ``--duplicate=DUPLICATE``: Duplicate statements (``stmt`` statements, not
   ``SETUP``) to reduce the overhead of the outer loop and multiply
   inner loops by DUPLICATE (see ``--inner-loops`` option).
+* ``--profile=PROFILE``: Run the benchmark inside the cProfile profiler and output to the given file. This is a convenient way to profile a specific benchmark, but it will make the actual benchmark timings much less accurate.
 
 .. note::
    timeit ``-n`` (number) and ``-r`` (repeat) options become ``-l`` (loops) and
diff --git a/pyperf/_manager.py b/pyperf/_manager.py
index f29af1fa..d478708d 100644
--- a/pyperf/_manager.py
+++ b/pyperf/_manager.py
@@ -66,6 +66,9 @@ def worker_cmd(self, calibrate_loops, calibrate_warmups, wpipe):
         if args.track_memory:
             cmd.append('--track-memory')
 
+        if args.profile:
+            cmd.extend(['--profile', args.profile])
+
         if self.runner._add_cmdline_args:
             self.runner._add_cmdline_args(cmd, args)
 
diff --git a/pyperf/_process_time.py b/pyperf/_process_time.py
index d3589b28..7010195d 100644
--- a/pyperf/_process_time.py
+++ b/pyperf/_process_time.py
@@ -17,6 +17,7 @@
 import os
 import subprocess
 import sys
+import tempfile
 import time
 
 try:
@@ -35,11 +36,30 @@ def get_max_rss():
         return 0
 
 
-def bench_process(loops, args, kw):
+def merge_profile_stats_files(src, dst):
+    """
+    Merging one existing pstats file into another.
+    """
+    import pstats
+    if os.path.isfile(dst):
+        src_stats = pstats.Stats(src)
+        dst_stats = pstats.Stats(dst)
+        dst_stats.add(src_stats)
+        dst_stats.dump_stats(dst)
+        os.unlink(src)
+    else:
+        os.rename(src, dst)
+
+
+def bench_process(loops, args, kw, profile_filename=None):
     max_rss = 0
     range_it = range(loops)
     start_time = time.perf_counter()
 
+    if profile_filename:
+        temp_profile_filename = tempfile.mktemp()
+        args = [args[0], "-m", "cProfile", "-o", temp_profile_filename] + args[1:]
+
     for loop in range_it:
         start_rss = get_max_rss()
 
@@ -51,6 +71,8 @@ def bench_process(loops, args, kw):
         if exitcode != 0:
             print("Command failed with exit code %s" % exitcode,
                   file=sys.stderr)
+            if profile_filename:
+                os.unlink(temp_profile_filename)
             sys.exit(exitcode)
 
         proc = None
@@ -58,6 +80,11 @@ def bench_process(loops, args, kw):
         rss = get_max_rss() - start_rss
         max_rss = max(max_rss, rss)
 
+        if profile_filename:
+            merge_profile_stats_files(
+                temp_profile_filename, profile_filename
+            )
+
     dt = time.perf_counter() - start_time
     return (dt, max_rss)
 
@@ -70,10 +97,18 @@ def main():
         sys.exit(1)
 
     if len(sys.argv) < 3:
-        print("Usage: %s %s loops program [arg1 arg2 ...]"
+        print("Usage: %s %s loops program [arg1 arg2 ...] [--profile profile]"
               % (os.path.basename(sys.executable), __file__))
         sys.exit(1)
 
+    if "--profile" in sys.argv:
+        profile_idx = sys.argv.index("--profile")
+        profile_filename = sys.argv[profile_idx + 1]
+        del sys.argv[profile_idx]
+        del sys.argv[profile_idx]
+    else:
+        profile_filename = None
+
     loops = int(sys.argv[1])
     args = sys.argv[2:]
 
@@ -88,7 +123,7 @@ def main():
         kw['stdout'] = devnull
     kw['stderr'] = subprocess.STDOUT
 
-    dt, max_rss = bench_process(loops, args, kw)
+    dt, max_rss = bench_process(loops, args, kw, profile_filename)
 
     if devnull is not None:
         devnull.close()
diff --git a/pyperf/_runner.py b/pyperf/_runner.py
index b70e3e52..6c0d9e2b 100644
--- a/pyperf/_runner.py
+++ b/pyperf/_runner.py
@@ -12,7 +12,8 @@
                                set_highest_priority)
 from pyperf._formatter import format_timedelta
 from pyperf._utils import (MS_WINDOWS, MAC_OS, abs_executable,
-                           WritePipe, get_python_names)
+                           WritePipe, get_python_names,
+                           merge_profile_stats)
 from pyperf._worker import WorkerProcessTask
 
 
@@ -48,6 +49,19 @@ def parse_python_names(names):
     return parts
 
 
+def profiling_wrapper(func):
+    """
+    Wrap a function to collect profiling.
+    """
+    import cProfile
+    profiler = cProfile.Profile()
+
+    def profiling_func(*args):
+        return profiler.runcall(func, *args)
+
+    return profiler, profiling_func
+
+
 class CLIError(Exception):
     pass
 
@@ -222,6 +236,11 @@ def __init__(self, values=None, warmups=None, processes=None,
                                  'PYTHON as CHANGED_NAME '
                                  'and REF_PYTHON as REF_NAME in results')
 
+        parser.add_argument('--profile',
+                            type=str,
+                            help='Collect profile data using cProfile '
+                                 'and output to the given file.')
+
         memory = parser.add_mutually_exclusive_group()
         memory.add_argument('--tracemalloc', action="store_true",
                             help='Trace memory allocations using tracemalloc')
@@ -454,12 +473,21 @@ def bench_time_func(self, name, time_func, *args, **kwargs):
         if not self._check_worker_task():
             return None
 
+        if self.args.profile:
+            profiler, time_func = profiling_wrapper(time_func)
+
         def task_func(task, loops):
             return time_func(loops, *args)
 
         task = WorkerProcessTask(self, name, task_func, metadata)
+
         task.inner_loops = inner_loops
-        return self._main(task)
+        result = self._main(task)
+
+        if self.args.profile:
+            merge_profile_stats(profiler, self.args.profile)
+
+        return result
 
     def bench_func(self, name, func, *args, **kwargs):
         """"Benchmark func(*args)."""
@@ -474,6 +502,9 @@ def bench_func(self, name, func, *args, **kwargs):
         if args:
             func = functools.partial(func, *args)
 
+        if self.args.profile:
+            profiler, func = profiling_wrapper(func)
+
         def task_func(task, loops):
             # use fast local variables
             local_timer = time.perf_counter
@@ -494,7 +525,12 @@ def task_func(task, loops):
 
         task = WorkerProcessTask(self, name, task_func, metadata)
         task.inner_loops = inner_loops
-        return self._main(task)
+        result = self._main(task)
+
+        if self.args.profile:
+            merge_profile_stats(profiler, self.args.profile)
+
+        return result
 
     def bench_async_func(self, name, func, *args, **kwargs):
         """Benchmark await func(*args)"""
@@ -509,6 +545,9 @@ def bench_async_func(self, name, func, *args, **kwargs):
         if args:
             func = functools.partial(func, *args)
 
+        if self.args.profile:
+            profiler, func = profiling_wrapper(func)
+
         def task_func(task, loops):
             if loops != 1:
                 async def main():
@@ -549,7 +588,12 @@ async def main():
 
         task = WorkerProcessTask(self, name, task_func, metadata)
         task.inner_loops = inner_loops
-        return self._main(task)
+        result = self._main(task)
+
+        if self.args.profile:
+            merge_profile_stats(profiler, self.args.profile)
+
+        return result
 
     def timeit(self, name, stmt=None, setup="pass", teardown="pass",
                inner_loops=None, duplicate=None, metadata=None, globals=None):
@@ -671,6 +715,9 @@ def bench_command(self, name, command):
         if not self._check_worker_task():
             return None
 
+        if self.args.profile:
+            command.extend(["--profile", self.args.profile])
+
         # Use lazy import to limit imports on 'import pyperf'
         from pyperf._command import BenchCommandTask
         task = BenchCommandTask(self, name, command)
diff --git a/pyperf/_utils.py b/pyperf/_utils.py
index 05dc54a6..99225466 100644
--- a/pyperf/_utils.py
+++ b/pyperf/_utils.py
@@ -405,3 +405,24 @@ def geometric_mean(data):
     if not data:
         raise ValueError("empty data")
     return _geometric_mean(data)
+
+
+def merge_profile_stats(profiler, dst):
+    """
+    Save pstats by merging into an existing file.
+    """
+    import pstats
+    if os.path.isfile(dst):
+        try:
+            src_stats = pstats.Stats(profiler)
+        except TypeError:
+            # If no actual stats were collected, a TypeError is raised
+            # and we don't need to merge anything into the output.
+            return
+
+        dst_stats = pstats.Stats(dst)
+        dst_stats.add(src_stats)
+        dst_stats.dump_stats(dst)
+    else:
+        profiler.dump_stats(dst)
+
diff --git a/pyperf/tests/test_runner.py b/pyperf/tests/test_runner.py
index a61053ba..c456a9ae 100644
--- a/pyperf/tests/test_runner.py
+++ b/pyperf/tests/test_runner.py
@@ -1,5 +1,6 @@
 import collections
 import os.path
+import pstats
 import sys
 import tempfile
 import textwrap
@@ -73,7 +74,6 @@ class ClockInfo:
         self.assertEqual(bench.get_nrun(), 1)
 
         return Result(runner, bench, stdout)
-
     def test_worker(self):
         result = self.exec_runner('--worker', '-l1', '-w1')
         self.assertRegex(result.stdout,
@@ -83,6 +83,53 @@ def test_debug_single_value(self):
         result = self.exec_runner('--debug-single-value', '--worker')
         self.assertEqual(result.bench.get_nvalue(), 1)
 
+    def test_profile_time_func(self):
+        with tempfile.NamedTemporaryFile('wb+') as tmp:
+            name = tmp.name
+        args = ['--worker', '-l1', '-w1', '--profile', name]
+        runner = self.create_runner(args)
+
+        def time_func(loops):
+            return 1.0
+
+        runner.bench_time_func('bench1', time_func)
+
+        try:
+            s = pstats.Stats(name)
+            if sys.version_info < (3, 9):
+                assert len(s.stats)
+            else:
+                assert len(s.get_stats_profile().func_profiles)
+        finally:
+            if os.path.isfile(name):
+                os.unlink(name)
+
+    def test_profile_func(self):
+        with tempfile.NamedTemporaryFile('wb+') as tmp:
+            name = tmp.name
+        args = ['--worker', '-l1', '-w1', '--profile', name]
+        runner = self.create_runner(args)
+
+        def external():
+            return [1] * 1000
+
+        def func():
+            external()
+            return 1.0
+
+        runner.bench_func('bench1', func)
+
+        try:
+            import pstats
+            s = pstats.Stats(name)
+            if sys.version_info < (3, 9):
+                assert len(s.stats)
+            else:
+                assert len(s.get_stats_profile().func_profiles)
+        finally:
+            if os.path.isfile(name):
+                os.unlink(name)
+
     def test_pipe(self):
         rpipe, wpipe = create_pipe()
         with rpipe: