intel · sergey-semenov · Jun 16, 2025 · Jun 25, 2025 · Jun 25, 2025 · Jun 25, 2025
@@ -8,19 +8,27 @@
 #pragma once
 
 #include <sycl/detail/export.hpp>
+#include <sycl/detail/kernel_name_str_t.hpp>
 
 namespace sycl {
 inline namespace _V1 {
 namespace detail {
 
-struct KernelNameBasedCacheT;
+class KernelNameBasedCacheT;
+
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 __SYCL_EXPORT KernelNameBasedCacheT *createKernelNameBasedCache();
+#endif
+__SYCL_EXPORT KernelNameBasedCacheT *
+createKernelNameBasedCache(detail::ABINeutralKernelNameStrRefT MKernelName);
 
 // Retrieves a cache pointer unique to a kernel name type that can be used to
 // avoid kernel name based lookup in the runtime.
-template <typename KernelName>
-KernelNameBasedCacheT *getKernelNameBasedCache() {
-  static KernelNameBasedCacheT *Instance = createKernelNameBasedCache();
+template <typename KernelNameT>
+KernelNameBasedCacheT *
+getKernelNameBasedCache(detail::ABINeutralKernelNameStrRefT KernelName) {
+  static KernelNameBasedCacheT *Instance =
+      createKernelNameBasedCache(KernelName);
   return Instance;
 }
 

@@ -18,10 +18,12 @@ namespace detail {
 using KernelNameStrT = std::string_view;
 using KernelNameStrRefT = std::string_view;
 using ABINeutralKernelNameStrT = detail::string_view;
+using ABINeutralKernelNameStrRefT = detail::string_view;
 #else
 using KernelNameStrT = std::string;
 using KernelNameStrRefT = const std::string &;
 using ABINeutralKernelNameStrT = detail::string;
+using ABINeutralKernelNameStrRefT = const detail::string &;
 #endif
 
 inline KernelNameStrT toKernelNameStrT(const ABINeutralKernelNameStrT &str) {

@@ -888,14 +888,15 @@ class __SYCL_EXPORT handler {
       constexpr std::string_view KernelNameStr =
           detail::getKernelName<KernelName>();
       MKernelName = KernelNameStr;
+      setKernelNameBasedCachePtr(
+          detail::getKernelNameBasedCache<KernelName>(KernelNameStr));
     } else {
       // In case w/o the integration header it is necessary to process
       // accessors from the list(which are associated with this handler) as
       // arguments. We must copy the associated accessors as they are checked
       // later during finalize.
       setArgsToAssociatedAccessors();
     }
-    setKernelNameBasedCachePtr(detail::getKernelNameBasedCache<KernelName>());
 
     // If the kernel lambda is callable with a kernel_handler argument, manifest
     // the associated kernel handler.

@@ -288,6 +288,7 @@ set(SYCL_COMMON_SOURCES
     "detail/kernel_compiler/kernel_compiler_sycl.cpp"
     "detail/kernel_impl.cpp"
     "detail/kernel_name_based_cache.cpp"
+    "detail/kernel_name_based_cache_t.cpp"
     "detail/kernel_program_cache.cpp"
     "detail/memory_manager.cpp"
     "detail/pipes.cpp"

@@ -257,12 +257,14 @@ ThreadPool &GlobalHandler::getHostTaskThreadPool() {
   return TP;
 }
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 KernelNameBasedCacheT *GlobalHandler::createKernelNameBasedCache() {
   static std::deque<KernelNameBasedCacheT> &KernelNameBasedCaches =
       getOrCreate(MKernelNameBasedCaches);
   LockGuard LG{MKernelNameBasedCaches.Lock};
   return &KernelNameBasedCaches.emplace_back();
 }
+#endif
 
 void GlobalHandler::releaseDefaultContexts() {
   // Release shared-pointers to SYCL objects.
@@ -392,9 +394,11 @@ void shutdown_late() {
   Handler->MScheduler.Inst.reset(nullptr);
   Handler->MProgramManager.Inst.reset(nullptr);
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
   // Cache stores handles to the adapter, so clear it before
   // releasing adapters.
   Handler->MKernelNameBasedCaches.Inst.reset(nullptr);
+#endif
 
   // Clear the adapters and reset the instance if it was there.
   Handler->unloadAdapters();

@@ -11,7 +11,9 @@
 #include <sycl/detail/spinlock.hpp>
 #include <sycl/detail/util.hpp>
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 #include <deque>
+#endif
 #include <memory>
 #include <unordered_map>
 
@@ -27,7 +29,7 @@ class adapter_impl;
 class ods_target_list;
 class XPTIRegistry;
 class ThreadPool;
-struct KernelNameBasedCacheT;
+class KernelNameBasedCacheT;
 
 /// Wrapper class for global data structures with non-trivial destructors.
 ///
@@ -75,7 +77,9 @@ class GlobalHandler {
   ods_target_list &getOneapiDeviceSelectorTargets(const std::string &InitValue);
   XPTIRegistry &getXPTIRegistry();
   ThreadPool &getHostTaskThreadPool();
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
   KernelNameBasedCacheT *createKernelNameBasedCache();
+#endif
   static void registerStaticVarShutdownHandler();
 
   bool isOkToDefer() const;
@@ -132,7 +136,9 @@ class GlobalHandler {
   InstWithLock<XPTIRegistry> MXPTIRegistry;
   // Thread pool for host task and event callbacks execution
   InstWithLock<ThreadPool> MHostTaskThreadPool;
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
   InstWithLock<std::deque<KernelNameBasedCacheT>> MKernelNameBasedCaches;
+#endif
 };
 } // namespace detail
 } // namespace _V1

@@ -1587,9 +1587,10 @@ void exec_graph_impl::populateURKernelUpdateStructs(
     UrKernel = SyclKernelImpl->getHandleRef();
     EliminatedArgMask = SyclKernelImpl->getKernelArgMask();
   } else {
+    assert(ExecCG.MKernelNameBasedCachePtr);
     BundleObjs = sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
         ContextImpl, DeviceImpl, ExecCG.MKernelName,
-        ExecCG.MKernelNameBasedCachePtr);
+        *ExecCG.MKernelNameBasedCachePtr);
     UrKernel = BundleObjs->MKernelHandle;
     EliminatedArgMask = BundleObjs->MKernelArgMask;
   }

@@ -7,15 +7,24 @@
 //===----------------------------------------------------------------------===//
 
 #include <detail/global_handler.hpp>
+#include <detail/program_manager/program_manager.hpp>
 #include <sycl/detail/kernel_name_based_cache.hpp>
 
 namespace sycl {
 inline namespace _V1 {
 namespace detail {
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 KernelNameBasedCacheT *createKernelNameBasedCache() {
   return GlobalHandler::instance().createKernelNameBasedCache();
 }
+#endif
+
+KernelNameBasedCacheT *
+createKernelNameBasedCache(detail::ABINeutralKernelNameStrRefT KernelName) {
+  return ProgramManager::getInstance().createKernelNameBasedCache(
+      KernelName.data());
+}
 
 } // namespace detail
 } // namespace _V1

@@ -0,0 +1,56 @@
+//==-------------------- kernel_name_based_cache_t.cpp ---------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <detail/kernel_name_based_cache_t.hpp>
+#include <detail/program_manager/program_manager.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+namespace detail {
+
+KernelNameBasedCacheT::KernelNameBasedCacheT(KernelNameStrRefT KernelName) {
+  init(KernelName);
+}
+
+void KernelNameBasedCacheT::init(KernelNameStrRefT KernelName) {
+  auto &PM = detail::ProgramManager::getInstance();
+  MUsesAssert = PM.kernelUsesAssert(KernelName);
+  MImplicitLocalArgPos = PM.kernelImplicitLocalArgPos(KernelName);
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+  MInitialized.store(true);
+#endif
+}
+
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+void KernelNameBasedCacheT::initIfNeeded(KernelNameStrRefT KernelName) {
+  if (!MInitialized.load())
+    init(KernelName);
+}
+#endif
+
+FastKernelSubcacheT &KernelNameBasedCacheT::getKernelSubcache() {
+  assertInitialized();
+  return MFastKernelSubcache;
+}
+bool KernelNameBasedCacheT::usesAssert() {
+  assertInitialized();
+  return MUsesAssert;
+}
+const std::optional<int> &KernelNameBasedCacheT::getImplicitLocalArgPos() {
+  assertInitialized();
+  return MImplicitLocalArgPos;
+}
+
+void KernelNameBasedCacheT::assertInitialized() {
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+  assert(MInitialized.load() && "Cache needs to be initialized before use");
+#endif
+}
+
+} // namespace detail
+} // namespace _V1
+} // namespace sycl
@@ -10,6 +10,7 @@
 #include <detail/hashers.hpp>
 #include <detail/kernel_arg_mask.hpp>
 #include <emhash/hash_table8.hpp>
+#include <sycl/detail/kernel_name_str_t.hpp>
 #include <sycl/detail/spinlock.hpp>
 #include <sycl/detail/ur.hpp>
 
@@ -83,13 +84,33 @@ struct FastKernelSubcacheT {
   FastKernelSubcacheMutexT Mutex;
 };
 
-struct KernelNameBasedCacheT {
-  FastKernelSubcacheT FastKernelSubcache;
-  std::optional<bool> UsesAssert;
-  // Implicit local argument position is represented by an optional int, this
-  // uses another optional on top of that to represent lazy initialization of
-  // the cached value.
-  std::optional<std::optional<int>> ImplicitLocalArgPos;
+// This class is used for caching kernel name based information.
+// Pointers to instances of this class are stored in header function templates
+// as a static variable to avoid repeated runtime lookup overhead.
+class KernelNameBasedCacheT {
+public:
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+  KernelNameBasedCacheT() = default;
+#endif
+  KernelNameBasedCacheT(KernelNameStrRefT KernelName);
+
+  void init(KernelNameStrRefT KernelName);
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+  void initIfNeeded(KernelNameStrRefT KernelName);
+#endif
+  FastKernelSubcacheT &getKernelSubcache();
+  bool usesAssert();
+  const std::optional<int> &getImplicitLocalArgPos();
+
+private:
+  void assertInitialized();
+
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
+  std::atomic<bool> MInitialized = false;
+#endif
+  FastKernelSubcacheT MFastKernelSubcache;
+  bool MUsesAssert;
+  std::optional<int> MImplicitLocalArgPos;
 };
 
 } // namespace detail

@@ -221,25 +221,18 @@ class KernelProgramCache {
 
   class FastKernelSubcacheWrapper {
   public:
-    FastKernelSubcacheWrapper(FastKernelSubcacheT *CachePtr,
+    FastKernelSubcacheWrapper(FastKernelSubcacheT &Subcache,
                               ur_context_handle_t UrContext)
-        : MSubcachePtr{CachePtr}, MUrContext{UrContext} {
-      if (!MSubcachePtr) {
-        MOwnsSubcache = true;
-        MSubcachePtr = new FastKernelSubcacheT();
-      }
-    }
+        : MSubcachePtr{&Subcache}, MUrContext{UrContext} {}
     FastKernelSubcacheWrapper(const FastKernelSubcacheWrapper &) = delete;
     FastKernelSubcacheWrapper(FastKernelSubcacheWrapper &&Other)
-        : MSubcachePtr{Other.MSubcachePtr}, MOwnsSubcache{Other.MOwnsSubcache},
-          MUrContext{Other.MUrContext} {
+        : MSubcachePtr{Other.MSubcachePtr}, MUrContext{Other.MUrContext} {
       Other.MSubcachePtr = nullptr;
     }
     FastKernelSubcacheWrapper &
     operator=(const FastKernelSubcacheWrapper &) = delete;
     FastKernelSubcacheWrapper &operator=(FastKernelSubcacheWrapper &&Other) {
       MSubcachePtr = Other.MSubcachePtr;
-      MOwnsSubcache = Other.MOwnsSubcache;
       MUrContext = Other.MUrContext;
       Other.MSubcachePtr = nullptr;
       return *this;
@@ -249,11 +242,6 @@ class KernelProgramCache {
       if (!MSubcachePtr)
         return;
 
-      if (MOwnsSubcache) {
-        delete MSubcachePtr;
-        return;
-      }
-
       // Single subcache might be used by different contexts.
       // Remove all entries from the subcache that are associated with the
       // current context.
@@ -269,8 +257,7 @@ class KernelProgramCache {
     FastKernelSubcacheT &get() { return *MSubcachePtr; }
 
   private:
-    FastKernelSubcacheT *MSubcachePtr = nullptr;
-    bool MOwnsSubcache = false;
+    FastKernelSubcacheT *MSubcachePtr;
     ur_context_handle_t MUrContext = nullptr;
   };
 
@@ -459,18 +446,9 @@ class KernelProgramCache {
 
   FastKernelCacheValPtr
   tryToGetKernelFast(KernelNameStrRefT KernelName, ur_device_handle_t Device,
-                     FastKernelSubcacheT *KernelSubcacheHint) {
-    FastKernelCacheWriteLockT Lock(MFastKernelCacheMutex);
-    if (!KernelSubcacheHint) {
-      auto It = MFastKernelCache.try_emplace(
-          KernelName,
-          FastKernelSubcacheWrapper(KernelSubcacheHint, getURContext()));
-      KernelSubcacheHint = &It.first->second.get();
-    }
-
-    const FastKernelSubcacheEntriesT &SubcacheEntries =
-        KernelSubcacheHint->Entries;
-    FastKernelSubcacheReadLockT SubcacheLock{KernelSubcacheHint->Mutex};
+                     FastKernelSubcacheT &KernelSubcache) {
+    const FastKernelSubcacheEntriesT &SubcacheEntries = KernelSubcache.Entries;
+    FastKernelSubcacheReadLockT SubcacheLock{KernelSubcache.Mutex};
     ur_context_handle_t Context = getURContext();
     const FastKernelCacheKeyT RequiredKey(Device, Context);
     // Search for the kernel in the subcache.
@@ -488,7 +466,7 @@ class KernelProgramCache {
 
   void saveKernel(KernelNameStrRefT KernelName, ur_device_handle_t Device,
                   const FastKernelCacheValPtr &CacheVal,
-                  FastKernelSubcacheT *KernelSubcacheHint) {
+                  FastKernelSubcacheT &KernelSubcache) {
     if (SYCLConfig<SYCL_IN_MEM_CACHE_EVICTION_THRESHOLD>::
             isProgramCacheEvictionEnabled()) {
       // Save kernel in fast cache only if the corresponding program is also
@@ -508,15 +486,13 @@ class KernelProgramCache {
     // if no insertion took place, then some other thread has already inserted
     // smth in the cache
     traceKernel("Kernel inserted.", KernelName, true);
-    auto It = MFastKernelCache.try_emplace(
-        KernelName,
-        FastKernelSubcacheWrapper(KernelSubcacheHint, getURContext()));
-    KernelSubcacheHint = &It.first->second.get();
+    MFastKernelCache.try_emplace(
+        KernelName, FastKernelSubcacheWrapper(KernelSubcache, getURContext()));
 
-    FastKernelSubcacheWriteLockT SubcacheLock{KernelSubcacheHint->Mutex};
+    FastKernelSubcacheWriteLockT SubcacheLock{KernelSubcache.Mutex};
     ur_context_handle_t Context = getURContext();
-    KernelSubcacheHint->Entries.emplace_back(
-        FastKernelCacheKeyT(Device, Context), CacheVal);
+    KernelSubcache.Entries.emplace_back(FastKernelCacheKeyT(Device, Context),
+                                        CacheVal);
   }
 
   // Expects locked program cache