Milo's parallel tuning #288

Open

wants to merge 116 commits into base: master

116 commits
e827e75
created remoteRunner class
MiloLurati Jan 22, 2024
deab579
added remote actor class
Jan 23, 2024
52287b9
update remote runner
Jan 23, 2024
40cc888
added remote_mode function argument to tune_kernel and related remote…
Jan 23, 2024
b14aaf0
added parallel tuning test
Jan 23, 2024
1a55a5c
added pool of actors
Jan 23, 2024
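The "pool of actors" commit refers to a pool of Ray actors, each bound to one GPU, that pull tuning configurations from shared work. The PR's Ray code is not shown here; as a rough stdlib sketch of the scheduling idea (all names illustrative, not the PR's API):

```python
import queue
import threading

def run_with_actor_pool(configs, device_ids, evaluate):
    """Illustrative stand-in for a pool of Ray actors: each worker is pinned
    to one device and pulls configurations from a shared queue until empty."""
    work = queue.Queue()
    for cfg in configs:
        work.put(cfg)
    results, lock = [], threading.Lock()

    def worker(device_id):
        while True:
            try:
                cfg = work.get_nowait()
            except queue.Empty:
                return  # no work left for this device
            res = evaluate(cfg, device_id)  # would benchmark on the GPU
            with lock:
                results.append(res)

    threads = [threading.Thread(target=worker, args=(d,)) for d in device_ids]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results
```

For example, `run_with_actor_pool(range(8), [0, 1], lambda c, d: (c, d))` evaluates all eight configurations across the two simulated devices.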
4fef594
clean up remote runner and actor
Jan 24, 2024
3f3b9e6
updates on remote code
Jan 30, 2024
fe5da39
changed naming from remote to parallel
Apr 2, 2024
a43dc84
added get_num_devices function
Apr 4, 2024
ab3aa24
added ensemble and parallel runner related stuff
Apr 4, 2024
e8a7228
switched to new naming of parallel remote and some clean up
Apr 4, 2024
3dd748c
added class instances needed down the line in the execution of the en…
Apr 4, 2024
e743bec
changed naming due to ensemble implementation, this was the original …
Apr 4, 2024
df949d0
started ensemble implementation, very basic functionality works
Apr 4, 2024
45a1747
updated tests
Apr 4, 2024
5fb5927
clean up in parallel runner
Apr 5, 2024
a96ef43
moved to sub directory ray
Apr 5, 2024
c831f5f
added subdirectory ray with all 3 actor classes
Apr 5, 2024
0cc2a6e
integrated calls to cache manager functions when running in ensemble
Apr 5, 2024
b816f3d
added cache manager logic
Apr 5, 2024
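The cache manager introduced here is a central process that all parallel workers consult so the same configuration is never benchmarked twice. In the PR this role is played by a Ray actor; a minimal lock-protected sketch of the idea (names illustrative):

```python
import threading

class CacheManager:
    """Toy stand-in for the PR's cache-manager actor: a single shared store
    of benchmarked configurations, safe to query from several workers."""

    def __init__(self):
        self._cache = {}
        self._lock = threading.Lock()

    def check_and_retrieve(self, config):
        # return the stored result if this configuration was already benchmarked
        with self._lock:
            return self._cache.get(tuple(config))

    def store(self, config, result):
        with self._lock:
            self._cache[tuple(config)] = result
```

A worker would call `check_and_retrieve` before compiling and benchmarking, and `store` afterwards.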
781839a
added instances needed for the ensemble down the line of execution
Apr 8, 2024
9f8d212
added strategy option to get_options function
Apr 8, 2024
d08b5d4
added ignore_reinit_error to ray init
Apr 10, 2024
903c981
added ignore_reinit_error to ray init
Apr 10, 2024
1a2219a
added cache manager to parallel tuning
Apr 10, 2024
a476585
re-assign tuning options to final version from the cache manager at t…
Apr 11, 2024
6233e09
small bug fix in execute
MiloLurati Apr 11, 2024
cde62ae
Merge branch 'KernelTuner:master' into parallelTuning
MiloLurati Apr 14, 2024
0722629
Merge pull request #2 from KernelTuner/simulation-searchspace-improve…
MiloLurati Apr 14, 2024
14e5f0b
updates to run ensemble in simulation mode on CPUs
MiloLurati Apr 16, 2024
a963dac
fixed problem with ray resources and stalling actors
MiloLurati Apr 23, 2024
c55b870
added setup_resources and new impl of costfunc (not yet tested and st…
MiloLurati Apr 25, 2024
d8541a0
added ensemble and memetic to strategy map and import
MiloLurati Apr 25, 2024
c755254
rearranged how parallel runner deals with cache manager and actor's l…
MiloLurati Apr 25, 2024
a23ef94
initial adaptions for memetic and cleaned up logic of ensemble
MiloLurati Apr 25, 2024
697ead0
returning tuning_options for memetic logic
MiloLurati Apr 25, 2024
b247ed0
init impl of memetic strategy
MiloLurati Apr 25, 2024
948ab7f
initial adaption for memetic strategy
MiloLurati Apr 25, 2024
9e40d4e
removed brute_force from strategy map and import
MiloLurati Apr 25, 2024
3cb428d
fixes of new costfunc and stop criterion is checked retrospectively
MiloLurati Apr 25, 2024
2d13fc3
fixed bug with tuning options cache manager
MiloLurati Apr 29, 2024
1a2ba53
fixed some bugs for memetic algo functioning
MiloLurati Apr 29, 2024
2aba6f5
removed debug prints
MiloLurati Apr 29, 2024
cd3f212
fixed problem with single config input and final results data structure
MiloLurati Apr 29, 2024
af9bd5e
added progress prints of memetic algo and kill statement for cache ma…
MiloLurati Apr 29, 2024
d382f05
sort results for retrospective stop criterion check
MiloLurati Apr 29, 2024
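Sorting results for a retrospective stop-criterion check addresses a subtlety of parallel tuning: workers finish out of order, so the budget must be applied to results ordered by completion time after the fact. A minimal sketch, assuming a hypothetical `(timestamp, config, score)` result format:

```python
def apply_stop_criterion(results, max_fevals):
    """Retrospective stop criterion: order results by completion time and
    keep only those that fit within the evaluation budget."""
    ordered = sorted(results, key=lambda r: r[0])  # r[0] is the timestamp
    return ordered[:max_fevals]
```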
218b8ac
added comments
MiloLurati Apr 29, 2024
79b7a50
updated returning results logic in _evaluate_configs()
MiloLurati Apr 29, 2024
88f63b4
added comments
MiloLurati Apr 29, 2024
a2afd1d
updates to run more strategies than devices available
MiloLurati Apr 30, 2024
d950b2d
returning last two lists of candidates for memetic algo
MiloLurati May 3, 2024
980777f
returning last two candidates for memetic algo
MiloLurati May 3, 2024
95a2f0f
returning last two populations for memetic algo
MiloLurati May 3, 2024
89c499b
implemented adaptive local search depth logic and fix few issues, wor…
MiloLurati May 3, 2024
babba0b
modifications related to last iteration of memetic algo
MiloLurati May 6, 2024
e0e1e61
updates related to old population logic
MiloLurati May 6, 2024
6305782
unified two actors into one
MiloLurati May 7, 2024
0f2b7e4
updates related to actors unification and memetic algo development
MiloLurati May 7, 2024
63ddedb
added create_actor_on_device and initialize_ray
MiloLurati May 7, 2024
d7fe9b4
updates related to unification of actors, memetic algo, and reutiliza…
MiloLurati May 7, 2024
46fcde1
returning 80% of cpus for simulation mode in get_num_devices
MiloLurati May 7, 2024
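Capping simulation mode at 80% of the CPUs keeps the host responsive while still parallelizing widely. A hedged sketch of what such a helper could look like (the real `get_num_devices` also enumerates GPUs, which is stubbed out here):

```python
import os

def get_num_devices(simulation_mode=False):
    """Sketch of a device-count helper: in simulation mode use roughly 80%
    of the CPU cores; real GPU enumeration would need a backend."""
    if simulation_mode:
        return max(1, int(os.cpu_count() * 0.8))
    raise NotImplementedError("GPU enumeration requires a backend")
```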
d543848
updates related to actor unification and reutilization of actors for …
MiloLurati May 7, 2024
15df6ea
updates on feval counting and distributing
MiloLurati May 7, 2024
96d03b8
Merge branch 'KernelTuner:master' into parallelTuning
MiloLurati May 8, 2024
18ce214
Merge branch 'parallelTuning' of https://github.com/MiloLurati/kernel…
MiloLurati May 8, 2024
ec719a2
added logic for time limit stop
MiloLurati May 10, 2024
6c2a62b
debug prints clean up
MiloLurati May 10, 2024
c7fd2af
unified parallel tuning and parallel ensemble logic in ParallelRunner
MiloLurati May 10, 2024
af532c5
added self.init_arguments for parallel runner execution
MiloLurati May 28, 2024
82d9886
fix about non-pickleable observers and other small adjustments
MiloLurati May 28, 2024
c6a2f36
now the cache manager deals only with the cache and not with the enti…
MiloLurati May 28, 2024
5fe2e56
fix related to non-pickleable observers
MiloLurati May 28, 2024
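Several commits deal with non-pickleable observers: observers holding device handles or lambdas cannot be serialized and shipped to a Ray worker, so they must be detected and re-created inside the actor instead. The detection part can be sketched with the stdlib:

```python
import pickle

def is_pickleable(obj):
    """Return True if obj survives pickling; objects that fail (lambdas,
    open handles, device contexts) must be re-created on the worker side."""
    try:
        pickle.dumps(obj)
        return True
    except (pickle.PicklingError, TypeError, AttributeError):
        return False
```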
3b3317c
update related to new cache manager
MiloLurati May 28, 2024
1593806
added cleanup at the end of the ensemble
MiloLurati May 28, 2024
efd5be2
changes to hyperparameters
MiloLurati May 28, 2024
bc66244
changes related to non-pickleable observers
MiloLurati May 28, 2024
c5cfd05
Merge branch 'master' into parallelTuning
MiloLurati May 28, 2024
9e9f1af
updated init_arguments to a dict
MiloLurati May 31, 2024
3fed66c
updates for searchspace split, ensemble related fix, and observer exe…
MiloLurati May 31, 2024
86a9b67
small corrections related to stop criterion for memetic
MiloLurati Jun 5, 2024
de5fc49
added logic to check if all GPUs are of the same type
MiloLurati Jun 7, 2024
1b0adb0
deleted split searchspace function
MiloLurati Jun 7, 2024
5130286
changed place where ray is initialized
MiloLurati Jun 7, 2024
5b9d817
setting BO to random sampling if needed
MiloLurati Jun 7, 2024
8b1e57f
Merge branch 'KernelTuner:master' into parallelTuning
MiloLurati Jun 7, 2024
040a57e
added num_gpus option
MiloLurati Jun 7, 2024
acaaeb1
removed debug print
MiloLurati Jun 10, 2024
63d9f65
added check_and_retrive strategy option
MiloLurati Jun 10, 2024
e604510
moved reinitialization of actor observers to execute method, before w…
MiloLurati Jun 18, 2024
5933a69
changes related to re-initialization of observers in actor init and d…
MiloLurati Jun 18, 2024
4e4c47b
removed unnecessary blocking ray.get
MiloLurati Jun 21, 2024
104205d
removed debug prints
MiloLurati Jul 1, 2024
123fba5
added greedy ILS ensemble instead of default
MiloLurati Jul 1, 2024
d381011
added check on strategy_options
MiloLurati Jul 1, 2024
7e832e3
removed all memetic algo related stuff
MiloLurati Jul 1, 2024
e976bf8
Merge branch 'KernelTuner:master' into parallelTuning
MiloLurati Jul 1, 2024
65d32c1
added ray to pyproject.toml
MiloLurati Jul 1, 2024
a841f2a
Merge branch 'parallelTuning' of https://github.com/MiloLurati/kernel…
MiloLurati Jul 1, 2024
503df1b
updated toml file with ray dashboard
MiloLurati Jul 1, 2024
c126a01
fix small bug in _evaluate_configs
MiloLurati Jul 1, 2024
4df1b0d
adapted test for ensemble
MiloLurati Jul 1, 2024
29a507c
cleaned up unused imports
MiloLurati Jul 1, 2024
7c49a29
added comments
MiloLurati Jul 1, 2024
eb5db41
added documentation and related fixes
MiloLurati Jul 4, 2024
65c6a23
Merge branch 'MiloParallelTuning' into parallelTuning
isazi Mar 25, 2025
b8acffa
Merge pull request #241 from MiloLurati/parallelTuning
isazi Mar 25, 2025
bab28ef
Typo.
isazi Mar 25, 2025
5b93b9c
Remove Observer that does not exist anymore.
isazi Mar 25, 2025
20b7c96
Remove spurious parameter to function.
isazi Mar 25, 2025
50473a2
Remove nonexistent observer.
isazi Mar 25, 2025
ceb0996
Fix Runner interface.
isazi Mar 25, 2025
9739495
Reformat with black.
isazi Mar 25, 2025
d560a30
Fix a bunch of SonarCloud warnings.
isazi Mar 26, 2025
f1f872e
Fix some tests failing.
isazi Mar 26, 2025
de27d90
Fix SonarQube warning and format.
isazi Mar 26, 2025
1 change: 1 addition & 0 deletions doc/source/optimization.rst
@@ -25,6 +25,7 @@ the ``strategy=`` optional argument of ``tune_kernel()``. Kernel Tuner currently
 * "pso" particle swarm optimization
 * "random_sample" takes a random sample of the search space
 * "simulated_annealing" simulated annealing strategy
+* "ensemble" ensemble strategy
 
 Most strategies have some mechanism built in to detect when to stop tuning, which may be controlled through specific
 parameters that can be passed to the strategies using the ``strategy_options=`` optional argument of ``tune_kernel()``. You
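The documented "ensemble" strategy runs several optimizers concurrently on separate GPUs via Ray, sharing one cache of results. A toy, purely sequential illustration of the idea (not Kernel Tuner's API; all names here are illustrative):

```python
import random

def ensemble_search(space, score, n_members=4, evals_per_member=25, seed=0):
    """Toy illustration of an ensemble strategy: several independent search
    instances draw configurations while sharing one cache of evaluated
    points; the best cached configuration over all members wins."""
    cache = {}
    rngs = [random.Random(seed + i) for i in range(n_members)]
    for _ in range(evals_per_member):
        for rng in rngs:  # round-robin stands in for true parallelism
            cfg = rng.choice(space)
            if cfg not in cache:  # shared cache avoids duplicate benchmarks
                cache[cfg] = score(cfg)
    return min(cache, key=cache.get), cache
```

The shared cache is what makes an ensemble more than independent restarts: every member benefits from every other member's evaluations.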
15 changes: 5 additions & 10 deletions kernel_tuner/accuracy.py
@@ -46,9 +46,7 @@ def select_for_configuration(self, params):
 
         if option not in self.data:
             list = ", ".join(map(str, self.data.keys()))
-            raise KeyError(
-                f"'{option}' is not a valid parameter value, should be one of: {list}"
-            )
+            raise KeyError(f"'{option}' is not a valid parameter value, should be one of: {list}")
 
         return self.data[option]

@@ -60,12 +58,14 @@ def _find_bfloat16_if_available():
     # Try to get bfloat16 if available.
     try:
         from bfloat16 import bfloat16
+
         return bfloat16
     except ImportError:
         pass
 
     try:
         from tensorflow import bfloat16
+
         return bfloat16.as_numpy_dtype
     except ImportError:
         pass
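The hunk above only inserts blank lines (black's formatting), but the helper it touches shows a common optional-dependency pattern worth noting: probe packages in order of preference and fall back gracefully. A self-contained version of the same pattern:

```python
def find_bfloat16_if_available():
    """Same probing pattern as the accuracy.py helper: try optional packages
    in order of preference, returning None when none are installed."""
    try:
        from bfloat16 import bfloat16
        return bfloat16
    except ImportError:
        pass
    try:
        from tensorflow import bfloat16
        return bfloat16.as_numpy_dtype
    except ImportError:
        pass
    return None  # caller must handle the missing dtype
```

Returning a sentinel instead of raising lets the caller decide whether bfloat16 support is mandatory.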
@@ -102,9 +102,7 @@ def _to_float_dtype(x: str) -> np.dtype:
 
 
 class TunablePrecision(Tunable):
-    def __init__(
-        self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None
-    ):
+    def __init__(self, param_key: str, array: np.ndarray, dtypes: Dict[str, np.dtype] = None):
         """The ``Tunable`` object can be used as an input argument when tuning
         kernels. It is a container that internally holds several arrays
         containing the same data, but stored in using different levels of
@@ -135,7 +133,6 @@ def __init__(
         if bfloat16 is not None:
             dtypes["bfloat16"] = bfloat16
 
-
         # If dtype is a list, convert it to a dictionary
         if isinstance(dtypes, (list, tuple)):
             dtypes = dict((name, _to_float_dtype(name)) for name in dtypes)
@@ -257,9 +254,7 @@ def metric(a, b):
         raise ValueError(f"invalid error metric provided: {user_key}")
 
     # cast both arguments to f64 before passing them to the metric
-    return lambda a, b: metric(
-        a.astype(np.float64, copy=False), b.astype(np.float64, copy=False)
-    )
+    return lambda a, b: metric(a.astype(np.float64, copy=False), b.astype(np.float64, copy=False))
 
 
 class AccuracyObserver(OutputObserver):
12 changes: 7 additions & 5 deletions kernel_tuner/backends/compiler.py
@@ -34,7 +34,7 @@
 try:
     from hip._util.types import DeviceArray
 except ImportError:
-    Pointer = Exception # using Exception here as a type that will never be among kernel arguments
+    Pointer = Exception  # using Exception here as a type that will never be among kernel arguments
     DeviceArray = Exception


@@ -157,7 +157,9 @@ def ready_argument_list(self, arguments):
 
         for i, arg in enumerate(arguments):
             if not (isinstance(arg, (np.ndarray, np.number, DeviceArray)) or is_cupy_array(arg)):
-                raise TypeError(f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}")
+                raise TypeError(
+                    f"Argument is not numpy or cupy ndarray or numpy scalar or HIP Python DeviceArray but a {type(arg)}"
+                )
             dtype_str = arg.typestr if isinstance(arg, DeviceArray) else str(arg.dtype)
             if isinstance(arg, np.ndarray):
                 if dtype_str in dtype_map.keys():
@@ -288,7 +290,7 @@ def compile(self, kernel_instance):
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
-            check=True
+            check=True,
         )

subprocess.run(
@@ -299,7 +301,7 @@ def compile(self, kernel_instance):
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             text=True,
-            check=True
+            check=True,
         )

self.lib = np.ctypeslib.load_library(filename, ".")
@@ -439,7 +441,7 @@ def cleanup_lib(self):
         """unload the previously loaded shared library"""
         if self.lib is None:
             return
 
         if not self.using_openmp and not self.using_openacc:
             # this if statement is necessary because shared libraries that use
             # OpenMP will core dump when unloaded, this is a well-known issue with OpenMP
8 changes: 2 additions & 6 deletions kernel_tuner/backends/cupy.py
@@ -70,9 +70,7 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         # collect environment information
         env = dict()
         cupy_info = str(cp._cupyx.get_runtime_info()).split("\n")[:-1]
-        info_dict = {
-            s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info
-        }
+        info_dict = {s.split(":")[0].strip(): s.split(":")[1].strip() for s in cupy_info}
         env["device_name"] = info_dict[f"Device {device} Name"]
 
         env["cuda_version"] = cp.cuda.runtime.driverGetVersion()
@@ -129,9 +127,7 @@ def compile(self, kernel_instance):
 
         options = tuple(compiler_options)
 
-        self.current_module = cp.RawModule(
-            code=kernel_string, options=options, name_expressions=[kernel_name]
-        )
+        self.current_module = cp.RawModule(code=kernel_string, options=options, name_expressions=[kernel_name])
 
         self.func = self.current_module.get_function(kernel_name)
         self.num_regs = self.func.num_regs
24 changes: 6 additions & 18 deletions kernel_tuner/backends/nvcuda.py
@@ -56,13 +56,9 @@ def __init__(self, device=0, iterations=7, compiler_options=None, observers=None
         CudaFunctions.last_selected_device = device
 
         # compute capabilities and device properties
-        err, major = cudart.cudaDeviceGetAttribute(
-            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device
-        )
+        err, major = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, device)
         cuda_error_check(err)
-        err, minor = cudart.cudaDeviceGetAttribute(
-            cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device
-        )
+        err, minor = cudart.cudaDeviceGetAttribute(cudart.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, device)
         cuda_error_check(err)
         err, self.max_threads = cudart.cudaDeviceGetAttribute(
             cudart.cudaDeviceAttr.cudaDevAttrMaxThreadsPerBlock, device
@@ -164,20 +160,14 @@ def compile(self, kernel_instance):
         if not any(["--std=" in opt for opt in self.compiler_options]):
             self.compiler_options.append("--std=c++11")
         if not any([b"--gpu-architecture=" in opt or b"-arch" in opt for opt in compiler_options]):
-            compiler_options.append(
-                f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8")
-            )
+            compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}".encode("UTF-8"))
         if not any(["--gpu-architecture=" in opt or "-arch" in opt for opt in self.compiler_options]):
             self.compiler_options.append(f"--gpu-architecture=compute_{to_valid_nvrtc_gpu_arch_cc(self.cc)}")
 
-        err, program = nvrtc.nvrtcCreateProgram(
-            str.encode(kernel_string), b"CUDAProgram", 0, [], []
-        )
+        err, program = nvrtc.nvrtcCreateProgram(str.encode(kernel_string), b"CUDAProgram", 0, [], [])
         try:
             cuda_error_check(err)
-            err = nvrtc.nvrtcCompileProgram(
-                program, len(compiler_options), compiler_options
-            )
+            err = nvrtc.nvrtcCompileProgram(program, len(compiler_options), compiler_options)
             cuda_error_check(err)
             err, size = nvrtc.nvrtcGetPTXSize(program)
             cuda_error_check(err)
@@ -189,9 +179,7 @@
                 raise SkippableFailure("uses too much shared data")
             else:
                 cuda_error_check(err)
-        err, self.func = cuda.cuModuleGetFunction(
-            self.current_module, str.encode(kernel_name)
-        )
+        err, self.func = cuda.cuModuleGetFunction(self.current_module, str.encode(kernel_name))
         cuda_error_check(err)
 
         # get the number of registers per thread used in this kernel
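The option handling reformatted in this file follows a "don't override the caller" rule: an architecture flag is only appended when none was supplied. The logic, extracted as a self-contained sketch (the `cc` default is an assumed example value, not from the PR):

```python
def ensure_arch_option(options, cc="80"):
    """Append a --gpu-architecture flag only when the caller did not already
    pass one (either spelling); mirrors the check in nvcuda.py's compile."""
    if not any("--gpu-architecture=" in opt or "-arch" in opt for opt in options):
        options.append(f"--gpu-architecture=compute_{cc}")
    return options
```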
16 changes: 4 additions & 12 deletions kernel_tuner/backends/opencl.py
@@ -16,9 +16,7 @@
 class OpenCLFunctions(GPUBackend):
     """Class that groups the OpenCL functions on maintains some state about the device."""
 
-    def __init__(
-        self, device=0, platform=0, iterations=7, compiler_options=None, observers=None
-    ):
+    def __init__(self, device=0, platform=0, iterations=7, compiler_options=None, observers=None):
         """Creates OpenCL device context and reads device properties.
 
         :param device: The ID of the OpenCL device to use for benchmarking
@@ -37,14 +35,10 @@ def __init__(
         platforms = cl.get_platforms()
         self.ctx = cl.Context(devices=[platforms[platform].get_devices()[device]])
 
-        self.queue = cl.CommandQueue(
-            self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE
-        )
+        self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
         self.mf = cl.mem_flags
         # inspect device properties
-        self.max_threads = self.ctx.devices[0].get_info(
-            cl.device_info.MAX_WORK_GROUP_SIZE
-        )
+        self.max_threads = self.ctx.devices[0].get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
         self.compiler_options = compiler_options or []
 
         # observer stuff
@@ -108,9 +102,7 @@ def compile(self, kernel_instance):
         :returns: An OpenCL kernel that can be called directly.
         :rtype: pyopencl.Kernel
         """
-        prg = cl.Program(self.ctx, kernel_instance.kernel_string).build(
-            options=self.compiler_options
-        )
+        prg = cl.Program(self.ctx, kernel_instance.kernel_string).build(options=self.compiler_options)
         func = getattr(prg, kernel_instance.name)
         return func
17 changes: 3 additions & 14 deletions kernel_tuner/backends/pycuda.py
@@ -97,13 +97,9 @@ def _finish_up():
         PyCudaFunctions.last_selected_context = self.context
 
         # inspect device properties
-        devprops = {
-            str(k): v for (k, v) in self.context.get_device().get_attributes().items()
-        }
+        devprops = {str(k): v for (k, v) in self.context.get_device().get_attributes().items()}
         self.max_threads = devprops["MAX_THREADS_PER_BLOCK"]
-        cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(
-            devprops.get("COMPUTE_CAPABILITY_MINOR", "0")
-        )
+        cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(devprops.get("COMPUTE_CAPABILITY_MINOR", "0"))
         if cc == "00":
             cc = self.context.get_device().compute_capability()
         self.cc = str(cc[0]) + str(cc[1])
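The compute-capability lookup collapsed by black above has a two-step shape: concatenate the major/minor attributes, and treat "00" as a signal that the attributes were missing so a direct device query is required. Extracted as a sketch, with the device query stubbed out as None:

```python
def compute_capability_string(devprops):
    """Concatenate major/minor compute-capability attributes; "00" means the
    attributes were absent and a direct device query would be needed."""
    cc = str(devprops.get("COMPUTE_CAPABILITY_MAJOR", "0")) + str(devprops.get("COMPUTE_CAPABILITY_MINOR", "0"))
    return None if cc == "00" else cc
```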
@@ -347,14 +343,7 @@ def run_kernel(self, func, gpu_args, threads, grid, stream=None):
         """
         if stream is None:
             stream = self.stream
-        func(
-            *gpu_args,
-            block=threads,
-            grid=grid,
-            stream=stream,
-            shared=self.smem_size,
-            texrefs=self.texrefs
-        )
+        func(*gpu_args, block=threads, grid=grid, stream=stream, shared=self.smem_size, texrefs=self.texrefs)
 
     def memset(self, allocation, value, size):
         """Set the memory in allocation to the value in value.