diff --git a/doc/tutorials/content/custom_executor.rst b/doc/tutorials/content/custom_executor.rst new file mode 100644 index 00000000000..73d4230dc0b --- /dev/null +++ b/doc/tutorials/content/custom_executor.rst @@ -0,0 +1,157 @@ +.. _custom_executor: + +Creating Custom Executors +-------------------------------------------------- + +Since executors have a standard interface, you can create custom executors +and use them. +PCL offers support for a few executors, namely: + +1. Inline Executor +2. SSE Executor +3. OMP Executor +4. CUDA Executor + +You can create a specialized version of these executors, i.e., an executor derived from +one of those supported by PCL that adds some additional functionality. + +In this tutorial, we will learn how to create an executor derived from the OMP executor +that measures and reports the time taken by a functor filter to execute. + +.. note:: + This tutorial is for advanced users and requires some knowledge of how executors work + and how they are implemented in PCL. + +The code +-------- + +First, create a file, let's say, ``custom_executor.cpp``, and place the following inside it: + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :linenos: + +The explanation +--------------- + +Now, let's break down the code piece by piece. + +Here, we forward declare our custom executor struct called `omp_benchmark_executor`. +This is required for the trait `is_executor_available` declared in the next few lines, +which is used inside our custom executor's definition. + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 10-12 + +We mark the executor as available by creating a specialization of `is_executor_available` for +`omp_benchmark_executor` which inherits from `std::true_type`. This acts as an indicator that the system supports the +executor. You can wrap this inside an `#ifdef` and check for macros that +indicate the presence of certain features, e.g., `_OPENMP` for OpenMP. + +.. 
literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 14-18 + +Here, we define the structure for our custom executor, which we had forward declared earlier. +It is templated with the two properties it supports, which are `blocking_t` and `allocator_t`. +Since this is a specialization of the OMP executor, we inherit from `omp_executor`; this allows +us to use our custom executor wherever the OpenMP executor provided in PCL is supported. + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 20-23 + +We need to introduce the members of the base struct, i.e., `omp_executor`, into +our current struct. You can read more on why this is needed over +`here `_. + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 24-29 + +Our custom executor's special feature is the ability to time functions executed +by `bulk_execute`, so we need to redefine the function. We also perform checks on the executor's +availability. + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 31-37 + +This is where we define what happens before and after we invoke our callable. +We measure the total time taken by all the threads to execute the code. We enclose all our code +in a parallel region with the specified maximum number of threads. + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 39-43 + +We then measure the time taken by each thread. The callable is invoked in a loop which is +automatically parallelized using OpenMP. + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 44-51 + +In the following lines, we create a Point Cloud structure for the input and output point clouds, +then fill the input cloud using `CloudGenerator`. The generator uses 128 as a seed value to uniformly +fill the input cloud with a point cloud having width & height as 200. 
Each point is generated +with x, y & z coordinates in the range [-20, 20]. + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 59-65 + +We then create a FunctorFilter called `positive_filter` that filters out +any points which have negative coordinates. + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 67-76 + +Finally, we create an instance of our custom executor `omp_benchmark_executor` and limit +the max number of threads to four. Then we call the `filter` function of `positive_filter` with +our custom executor. We then repeat the same, limiting the number of threads to one the +second time. + +.. literalinclude:: sources/custom_executor/custom_executor.cpp + :language: cpp + :lines: 78-85 + +.. note:: + Not all code inside the functor filter is executed by the executor. So the executor does not measure the + time taken by the entire filter to execute; it only measures the execution time of the portion of + the program that was executed by the executor. + + Refer to the implementation of FunctorFilter for more insight. + +Compiling and running the program +--------------------------------- + +Add the following lines to your CMakeLists.txt file: + +.. literalinclude:: sources/custom_executor/CMakeLists.txt + :language: cmake + :linenos: + + +After you have made the executable, you can run it. Simply do:: + + $ ./custom_executor + +You should get an output similar to this (the time taken will change based +on your system configuration): + +.. code-block:: bash + + Filtering using 4 Threads + Time taken by thread 0: took 2.97464ms. + Time taken by thread 3: took 4.24397ms. + Time taken by thread 2: took 4.26735ms. + Time taken by thread 1: took 5.19864ms. + Total time taken: took 5.44704ms. + + Filtering using 1 Thread + Time taken by thread 0: took 9.38748ms. + Total time taken: took 9.39384ms. 
+ diff --git a/doc/tutorials/content/sources/custom_executor/CMakeLists.txt b/doc/tutorials/content/sources/custom_executor/CMakeLists.txt new file mode 100644 index 00000000000..c9f58137bd0 --- /dev/null +++ b/doc/tutorials/content/sources/custom_executor/CMakeLists.txt @@ -0,0 +1,8 @@ +cmake_minimum_required(VERSION 3.5 FATAL_ERROR) + +project(custom_executor) + +find_package(PCL 1.11.1.99 COMPONENTS common filters REQUIRED) + +add_executable (custom_executor custom_executor.cpp) +target_link_libraries (custom_executor ${PCL_LIBRARIES}) diff --git a/doc/tutorials/content/sources/custom_executor/custom_executor.cpp b/doc/tutorials/content/sources/custom_executor/custom_executor.cpp new file mode 100644 index 00000000000..9933778cb87 --- /dev/null +++ b/doc/tutorials/content/sources/custom_executor/custom_executor.cpp @@ -0,0 +1,88 @@ +#include +#include +#include + +#include +#include + +using namespace pcl::executor; + +// Forward declaration for custom executor +template +struct omp_benchmark_executor; + +// Mark the executor as available +#ifdef _OPENMP +template <> +struct pcl::executor::is_executor_available : std::true_type {}; +#endif + +// Custom executor derived from OMP executor +template > +struct omp_benchmark_executor : public omp_executor { + // Introduce base struct members + using Base = omp_executor; + using Base::max_threads; + using typename Base::index_type; + using typename Base::omp_executor; + using typename Base::shape_type; + + template + void + bulk_execute(F&& f, const shape_type& n) const + { + // Throw static assert failure if executor is not available + static_assert(is_executor_available_v, + "OpenMP benchmark executor unavailable"); + + // Measure total time taken by all threads + pcl::ScopeTime total_time("Total time taken:"); + + #pragma omp parallel num_threads(max_threads) + { + // Measure time taken by each thread + pcl::ScopeTime thread_time("Time taken by thread " + + std::to_string(omp_get_thread_num()) + ":"); + + // Invoke 
the callable n times + #pragma omp for nowait + for (index_type index = 0; index < n; ++index) + f(index); + } + } +}; + +int +main() +{ + // Create empty output point clouds and fill the input cloud with randomly generated + // points + pcl::PointCloud out_cloud1, out_cloud2; + const auto cloud = pcl::make_shared>(); + pcl::common::CloudGenerator> + generator{{-20., 20., 128}}; + generator.fill(200, 200, *cloud); + + // Create a functor filter that keeps only points whose x, y and z are all positive + const auto positive_cond = [](const pcl::PointCloud& cloud, + pcl::index_t idx) { + return (cloud[idx].getArray3fMap() > 0).all(); + }; + + auto positive_filter = + pcl::experimental::FunctorFilter( + positive_cond); + positive_filter.setInputCloud(cloud); + + // Create instance of custom executor and apply the filter with it + auto exec = omp_benchmark_executor<>(4); + std::cout << "Filtering using 4 Threads" << std::endl; + positive_filter.filter(exec, out_cloud1); + + exec.set_max_threads(1); + std::cout << "\nFiltering using 1 Thread" << std::endl; + positive_filter.filter(exec, out_cloud2); + + return 0; +}