diff --git a/sycl/include/sycl/detail/kernel_name_based_cache.hpp b/sycl/include/sycl/detail/kernel_name_based_cache.hpp index 6bd2e38edc8e7..b005dc4d29477 100644 --- a/sycl/include/sycl/detail/kernel_name_based_cache.hpp +++ b/sycl/include/sycl/detail/kernel_name_based_cache.hpp @@ -8,19 +8,27 @@ #pragma once #include +#include namespace sycl { inline namespace _V1 { namespace detail { -struct KernelNameBasedCacheT; +class KernelNameBasedCacheT; + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES __SYCL_EXPORT KernelNameBasedCacheT *createKernelNameBasedCache(); +#endif +__SYCL_EXPORT KernelNameBasedCacheT * +createKernelNameBasedCache(detail::ABINeutralKernelNameStrRefT MKernelName); // Retrieves a cache pointer unique to a kernel name type that can be used to // avoid kernel name based lookup in the runtime. -template -KernelNameBasedCacheT *getKernelNameBasedCache() { - static KernelNameBasedCacheT *Instance = createKernelNameBasedCache(); +template +KernelNameBasedCacheT * +getKernelNameBasedCache(detail::ABINeutralKernelNameStrRefT KernelName) { + static KernelNameBasedCacheT *Instance = + createKernelNameBasedCache(KernelName); return Instance; } diff --git a/sycl/include/sycl/detail/kernel_name_str_t.hpp b/sycl/include/sycl/detail/kernel_name_str_t.hpp index e0079ffb09c7e..b8ceb395431d4 100644 --- a/sycl/include/sycl/detail/kernel_name_str_t.hpp +++ b/sycl/include/sycl/detail/kernel_name_str_t.hpp @@ -18,10 +18,12 @@ namespace detail { using KernelNameStrT = std::string_view; using KernelNameStrRefT = std::string_view; using ABINeutralKernelNameStrT = detail::string_view; +using ABINeutralKernelNameStrRefT = detail::string_view; #else using KernelNameStrT = std::string; using KernelNameStrRefT = const std::string &; using ABINeutralKernelNameStrT = detail::string; +using ABINeutralKernelNameStrRefT = const detail::string &; #endif inline KernelNameStrT toKernelNameStrT(const ABINeutralKernelNameStrT &str) { diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 70429152f0ea1..a48a564b6684b 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -888,6 +888,8 @@ class __SYCL_EXPORT handler { constexpr std::string_view KernelNameStr = detail::getKernelName(); MKernelName = KernelNameStr; + setKernelNameBasedCachePtr( + detail::getKernelNameBasedCache(KernelNameStr)); } else { // In case w/o the integration header it is necessary to process // accessors from the list(which are associated with this handler) as @@ -895,7 +897,6 @@ class __SYCL_EXPORT handler { // later during finalize. setArgsToAssociatedAccessors(); } - setKernelNameBasedCachePtr(detail::getKernelNameBasedCache()); // If the kernel lambda is callable with a kernel_handler argument, manifest // the associated kernel handler. diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index dd1b833383055..c09f47c890173 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -288,6 +288,7 @@ set(SYCL_COMMON_SOURCES "detail/kernel_compiler/kernel_compiler_sycl.cpp" "detail/kernel_impl.cpp" "detail/kernel_name_based_cache.cpp" + "detail/kernel_name_based_cache_t.cpp" "detail/kernel_program_cache.cpp" "detail/memory_manager.cpp" "detail/pipes.cpp" diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp index 46509daf741b9..30b0571c86cba 100644 --- a/sycl/source/detail/global_handler.cpp +++ b/sycl/source/detail/global_handler.cpp @@ -257,12 +257,14 @@ ThreadPool &GlobalHandler::getHostTaskThreadPool() { return TP; } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES KernelNameBasedCacheT *GlobalHandler::createKernelNameBasedCache() { static std::deque &KernelNameBasedCaches = getOrCreate(MKernelNameBasedCaches); LockGuard LG{MKernelNameBasedCaches.Lock}; return &KernelNameBasedCaches.emplace_back(); } +#endif void GlobalHandler::releaseDefaultContexts() { // Release shared-pointers to SYCL objects. @@ -392,9 +394,11 @@ void shutdown_late() { Handler->MScheduler.Inst.reset(nullptr); Handler->MProgramManager.Inst.reset(nullptr); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES // Cache stores handles to the adapter, so clear it before // releasing adapters. Handler->MKernelNameBasedCaches.Inst.reset(nullptr); +#endif // Clear the adapters and reset the instance if it was there. Handler->unloadAdapters(); diff --git a/sycl/source/detail/global_handler.hpp b/sycl/source/detail/global_handler.hpp index 605fd10fb77ab..072388220a744 100644 --- a/sycl/source/detail/global_handler.hpp +++ b/sycl/source/detail/global_handler.hpp @@ -11,7 +11,9 @@ #include #include +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES #include +#endif #include #include @@ -27,7 +29,7 @@ class adapter_impl; class ods_target_list; class XPTIRegistry; class ThreadPool; -struct KernelNameBasedCacheT; +class KernelNameBasedCacheT; /// Wrapper class for global data structures with non-trivial destructors. /// @@ -75,7 +77,9 @@ class GlobalHandler { ods_target_list &getOneapiDeviceSelectorTargets(const std::string &InitValue); XPTIRegistry &getXPTIRegistry(); ThreadPool &getHostTaskThreadPool(); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES KernelNameBasedCacheT *createKernelNameBasedCache(); +#endif static void registerStaticVarShutdownHandler(); bool isOkToDefer() const; @@ -132,7 +136,9 @@ class GlobalHandler { InstWithLock MXPTIRegistry; // Thread pool for host task and event callbacks execution InstWithLock MHostTaskThreadPool; +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES InstWithLock> MKernelNameBasedCaches; +#endif }; } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index 6581bce0ef91f..eaa22e08bf733 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -1587,9 +1587,10 @@ void exec_graph_impl::populateURKernelUpdateStructs( UrKernel = SyclKernelImpl->getHandleRef(); EliminatedArgMask = SyclKernelImpl->getKernelArgMask(); } else { + assert(ExecCG.MKernelNameBasedCachePtr); BundleObjs = sycl::detail::ProgramManager::getInstance().getOrCreateKernel( ContextImpl, DeviceImpl, ExecCG.MKernelName, - ExecCG.MKernelNameBasedCachePtr); + *ExecCG.MKernelNameBasedCachePtr); UrKernel = BundleObjs->MKernelHandle; EliminatedArgMask = BundleObjs->MKernelArgMask; } diff --git a/sycl/source/detail/kernel_name_based_cache.cpp b/sycl/source/detail/kernel_name_based_cache.cpp index 17356e7f38fc8..8c2f8c5c68720 100644 --- a/sycl/source/detail/kernel_name_based_cache.cpp +++ b/sycl/source/detail/kernel_name_based_cache.cpp @@ -7,15 +7,24 @@ //===----------------------------------------------------------------------===// #include +#include #include namespace sycl { inline namespace _V1 { namespace detail { +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES KernelNameBasedCacheT *createKernelNameBasedCache() { return GlobalHandler::instance().createKernelNameBasedCache(); } +#endif + +KernelNameBasedCacheT * +createKernelNameBasedCache(detail::ABINeutralKernelNameStrRefT KernelName) { + return ProgramManager::getInstance().createKernelNameBasedCache( + KernelName.data()); +} } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/kernel_name_based_cache_t.cpp b/sycl/source/detail/kernel_name_based_cache_t.cpp new file mode 100644 index 0000000000000..dd63d8d7d4d4b --- /dev/null +++ b/sycl/source/detail/kernel_name_based_cache_t.cpp @@ -0,0 +1,56 @@ +//==-------------------- kernel_name_based_cache_t.cpp ---------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include + +namespace sycl { +inline namespace _V1 { +namespace detail { + +KernelNameBasedCacheT::KernelNameBasedCacheT(KernelNameStrRefT KernelName) { + init(KernelName); +} + +void KernelNameBasedCacheT::init(KernelNameStrRefT KernelName) { + auto &PM = detail::ProgramManager::getInstance(); + MUsesAssert = PM.kernelUsesAssert(KernelName); + MImplicitLocalArgPos = PM.kernelImplicitLocalArgPos(KernelName); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + MInitialized.store(true); +#endif +} + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +void KernelNameBasedCacheT::initIfNeeded(KernelNameStrRefT KernelName) { + if (!MInitialized.load()) + init(KernelName); +} +#endif + +FastKernelSubcacheT &KernelNameBasedCacheT::getKernelSubcache() { + assertInitialized(); + return MFastKernelSubcache; +} +bool KernelNameBasedCacheT::usesAssert() { + assertInitialized(); + return MUsesAssert; +} +const std::optional &KernelNameBasedCacheT::getImplicitLocalArgPos() { + assertInitialized(); + return MImplicitLocalArgPos; +} + +void KernelNameBasedCacheT::assertInitialized() { +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + assert(MInitialized.load() && "Cache needs to be initialized before use"); +#endif +} + +} // namespace detail +} // namespace _V1 +} // namespace sycl \ No newline at end of file diff --git a/sycl/source/detail/kernel_name_based_cache_t.hpp b/sycl/source/detail/kernel_name_based_cache_t.hpp index 3c2b3fb03cf6f..99e6c2484f425 100644 --- a/sycl/source/detail/kernel_name_based_cache_t.hpp +++ b/sycl/source/detail/kernel_name_based_cache_t.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -83,13 +84,33 @@ struct FastKernelSubcacheT { FastKernelSubcacheMutexT Mutex; }; -struct KernelNameBasedCacheT { - FastKernelSubcacheT FastKernelSubcache; - std::optional UsesAssert; - // Implicit local argument position is represented by an optional int, this - // uses another optional on top of that to represent lazy initialization of - // the cached value. - std::optional> ImplicitLocalArgPos; +// This class is used for caching kernel name based information. +// Pointers to instances of this class are stored in header function templates +// as a static variable to avoid repeated runtime lookup overhead. +class KernelNameBasedCacheT { +public: +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + KernelNameBasedCacheT() = default; +#endif + KernelNameBasedCacheT(KernelNameStrRefT KernelName); + + void init(KernelNameStrRefT KernelName); +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + void initIfNeeded(KernelNameStrRefT KernelName); +#endif + FastKernelSubcacheT &getKernelSubcache(); + bool usesAssert(); + const std::optional &getImplicitLocalArgPos(); + +private: + void assertInitialized(); + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + std::atomic MInitialized = false; +#endif + FastKernelSubcacheT MFastKernelSubcache; + bool MUsesAssert; + std::optional MImplicitLocalArgPos; }; } // namespace detail diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp index 4c139f99c8211..a7b054c0f4b03 100644 --- a/sycl/source/detail/kernel_program_cache.hpp +++ b/sycl/source/detail/kernel_program_cache.hpp @@ -221,25 +221,18 @@ class KernelProgramCache { class FastKernelSubcacheWrapper { public: - FastKernelSubcacheWrapper(FastKernelSubcacheT *CachePtr, + FastKernelSubcacheWrapper(FastKernelSubcacheT &Subcache, ur_context_handle_t UrContext) - : MSubcachePtr{CachePtr}, MUrContext{UrContext} { - if (!MSubcachePtr) { - MOwnsSubcache = true; - MSubcachePtr = new FastKernelSubcacheT(); - } - } + : MSubcachePtr{&Subcache}, MUrContext{UrContext} {} FastKernelSubcacheWrapper(const FastKernelSubcacheWrapper &) = delete; FastKernelSubcacheWrapper(FastKernelSubcacheWrapper &&Other) - : MSubcachePtr{Other.MSubcachePtr}, MOwnsSubcache{Other.MOwnsSubcache}, - MUrContext{Other.MUrContext} { + : MSubcachePtr{Other.MSubcachePtr}, MUrContext{Other.MUrContext} { Other.MSubcachePtr = nullptr; } FastKernelSubcacheWrapper & operator=(const FastKernelSubcacheWrapper &) = delete; FastKernelSubcacheWrapper &operator=(FastKernelSubcacheWrapper &&Other) { MSubcachePtr = Other.MSubcachePtr; - MOwnsSubcache = Other.MOwnsSubcache; MUrContext = Other.MUrContext; Other.MSubcachePtr = nullptr; return *this; @@ -249,11 +242,6 @@ class KernelProgramCache { if (!MSubcachePtr) return; - if (MOwnsSubcache) { - delete MSubcachePtr; - return; - } - // Single subcache might be used by different contexts. // Remove all entries from the subcache that are associated with the // current context. @@ -269,8 +257,7 @@ class KernelProgramCache { FastKernelSubcacheT &get() { return *MSubcachePtr; } private: - FastKernelSubcacheT *MSubcachePtr = nullptr; - bool MOwnsSubcache = false; + FastKernelSubcacheT *MSubcachePtr; ur_context_handle_t MUrContext = nullptr; }; @@ -459,18 +446,9 @@ class KernelProgramCache { FastKernelCacheValPtr tryToGetKernelFast(KernelNameStrRefT KernelName, ur_device_handle_t Device, - FastKernelSubcacheT *KernelSubcacheHint) { - FastKernelCacheWriteLockT Lock(MFastKernelCacheMutex); - if (!KernelSubcacheHint) { - auto It = MFastKernelCache.try_emplace( - KernelName, - FastKernelSubcacheWrapper(KernelSubcacheHint, getURContext())); - KernelSubcacheHint = &It.first->second.get(); - } - - const FastKernelSubcacheEntriesT &SubcacheEntries = - KernelSubcacheHint->Entries; - FastKernelSubcacheReadLockT SubcacheLock{KernelSubcacheHint->Mutex}; + FastKernelSubcacheT &KernelSubcache) { + const FastKernelSubcacheEntriesT &SubcacheEntries = KernelSubcache.Entries; + FastKernelSubcacheReadLockT SubcacheLock{KernelSubcache.Mutex}; ur_context_handle_t Context = getURContext(); const FastKernelCacheKeyT RequiredKey(Device, Context); // Search for the kernel in the subcache. @@ -488,7 +466,7 @@ class KernelProgramCache { void saveKernel(KernelNameStrRefT KernelName, ur_device_handle_t Device, const FastKernelCacheValPtr &CacheVal, - FastKernelSubcacheT *KernelSubcacheHint) { + FastKernelSubcacheT &KernelSubcache) { if (SYCLConfig:: isProgramCacheEvictionEnabled()) { // Save kernel in fast cache only if the corresponding program is also @@ -508,15 +486,13 @@ class KernelProgramCache { // if no insertion took place, then some other thread has already inserted // smth in the cache traceKernel("Kernel inserted.", KernelName, true); - auto It = MFastKernelCache.try_emplace( - KernelName, - FastKernelSubcacheWrapper(KernelSubcacheHint, getURContext())); - KernelSubcacheHint = &It.first->second.get(); + MFastKernelCache.try_emplace( + KernelName, FastKernelSubcacheWrapper(KernelSubcache, getURContext())); - FastKernelSubcacheWriteLockT SubcacheLock{KernelSubcacheHint->Mutex}; + FastKernelSubcacheWriteLockT SubcacheLock{KernelSubcache.Mutex}; ur_context_handle_t Context = getURContext(); - KernelSubcacheHint->Entries.emplace_back( - FastKernelCacheKeyT(Device, Context), CacheVal); + KernelSubcache.Entries.emplace_back(FastKernelCacheKeyT(Device, Context), + CacheVal); } // Expects locked program cache diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp index 9177bf6a5956c..cbac262a1d863 100644 --- a/sycl/source/detail/program_manager/program_manager.cpp +++ b/sycl/source/detail/program_manager/program_manager.cpp @@ -1123,8 +1123,8 @@ ur_program_handle_t ProgramManager::getBuiltURProgram( FastKernelCacheValPtr ProgramManager::getOrCreateKernel( context_impl &ContextImpl, device_impl &DeviceImpl, - KernelNameStrRefT KernelName, - KernelNameBasedCacheT *KernelNameBasedCachePtr, const NDRDescT &NDRDesc) { + KernelNameStrRefT KernelName, KernelNameBasedCacheT &KernelNameBasedCache, + const NDRDescT &NDRDesc) { if constexpr (DbgProgMgr > 0) { std::cerr << ">>> ProgramManager::getOrCreateKernel(" << &ContextImpl << ", " << &DeviceImpl << ", " << KernelName << ")\n"; @@ -1134,12 +1134,9 @@ FastKernelCacheValPtr ProgramManager::getOrCreateKernel( KernelProgramCache &Cache = ContextImpl.getKernelProgramCache(); ur_device_handle_t UrDevice = DeviceImpl.getHandleRef(); - FastKernelSubcacheT *CacheHintPtr = - KernelNameBasedCachePtr ? &KernelNameBasedCachePtr->FastKernelSubcache - : nullptr; if (SYCLConfig::get()) { - if (auto KernelCacheValPtr = - Cache.tryToGetKernelFast(KernelName, UrDevice, CacheHintPtr)) { + if (auto KernelCacheValPtr = Cache.tryToGetKernelFast( + KernelName, UrDevice, KernelNameBasedCache.getKernelSubcache())) { return KernelCacheValPtr; } } @@ -1196,7 +1193,8 @@ FastKernelCacheValPtr ProgramManager::getOrCreateKernel( // MKernelsPerProgramCache, we need to increase the ref count of the kernel. ContextImpl.getAdapter().call( KernelArgMaskPair.first); - Cache.saveKernel(KernelName, UrDevice, ret_val, CacheHintPtr); + Cache.saveKernel(KernelName, UrDevice, ret_val, + KernelNameBasedCache.getKernelSubcache()); return ret_val; } @@ -1860,26 +1858,29 @@ void ProgramManager::cacheKernelImplicitLocalArg( } } -std::optional ProgramManager::kernelImplicitLocalArgPos( - KernelNameStrRefT KernelName, - KernelNameBasedCacheT *KernelNameBasedCachePtr) const { - auto getLocalArgPos = [&]() -> std::optional { - auto it = m_KernelImplicitLocalArgPos.find(KernelName); - if (it != m_KernelImplicitLocalArgPos.end()) - return it->second; - return {}; - }; +std::optional +ProgramManager::kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const { + auto it = m_KernelImplicitLocalArgPos.find(KernelName); + if (it != m_KernelImplicitLocalArgPos.end()) + return it->second; + return {}; +} - if (!KernelNameBasedCachePtr) - return getLocalArgPos(); - std::optional> &ImplicitLocalArgPos = - KernelNameBasedCachePtr->ImplicitLocalArgPos; - if (!ImplicitLocalArgPos.has_value()) { - ImplicitLocalArgPos = getLocalArgPos(); - } - return ImplicitLocalArgPos.value(); +KernelNameBasedCacheT * +ProgramManager::createKernelNameBasedCache(KernelNameStrRefT KernelName) { + auto Result = m_KernelNameBasedCaches.try_emplace(KernelName, KernelName); + assert(Result.second && "Kernel name based cache instance already exists"); + return &Result.first->second; } +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES +KernelNameBasedCacheT * +ProgramManager::getOrCreateKernelNameBasedCache(KernelNameStrRefT KernelName) { + auto Result = m_KernelNameBasedCaches.try_emplace(KernelName, KernelName); + return &Result.first->second; +} +#endif + static bool isBfloat16DeviceLibImage(sycl_device_binary RawImg, uint32_t *LibVersion = nullptr) { sycl_device_binary_property_set ImgPS; @@ -2166,8 +2167,9 @@ void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) { return; // Acquire lock to read and modify maps for kernel bundles std::lock_guard KernelIDsGuard(m_KernelIDsMutex); - + std::cout << "Removing images" << std::endl; for (int I = 0; I < DeviceBinary->NumDeviceBinaries; I++) { + std::cout << " Binary " << I << std::endl; sycl_device_binary RawImg = &(DeviceBinary->DeviceBinaries[I]); auto DevImgIt = m_DeviceImages.find(RawImg); @@ -2180,45 +2182,11 @@ void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) { RTDeviceBinaryImage *Img = DevImgIt->second.get(); + std::cout << " Erase arg mask" << std::endl; // Drop the kernel argument mask map m_EliminatedKernelArgMasks.erase(Img); - // Unmap the unique kernel IDs for the offload entries - for (sycl_offload_entry EntriesIt = EntriesB; EntriesIt != EntriesE; - EntriesIt = EntriesIt->Increment()) { - - // Drop entry for service kernel - if (std::strstr(EntriesIt->GetName(), "__sycl_service_kernel__")) { - m_ServiceKernels.erase(EntriesIt->GetName()); - continue; - } - - // Exported device functions won't have a kernel ID - if (m_ExportedSymbolImages.find(EntriesIt->GetName()) != - m_ExportedSymbolImages.end()) { - continue; - } - - // remove everything associated with this KernelName - m_KernelUsesAssert.erase(EntriesIt->GetName()); - m_KernelImplicitLocalArgPos.erase(EntriesIt->GetName()); - - if (auto It = m_KernelName2KernelIDs.find(EntriesIt->GetName()); - It != m_KernelName2KernelIDs.end()) { - m_KernelIDs2BinImage.erase(It->second); - m_KernelName2KernelIDs.erase(It); - } - } - - // Drop reverse mapping - m_BinImg2KernelIDs.erase(Img); - - // Unregister exported symbols (needs to happen after the ID unmap loop) - for (const sycl_device_binary_property &ESProp : - Img->getExportedSymbols()) { - m_ExportedSymbolImages.erase(ESProp->Name); - } - + std::cout << " Erase VFSet2Bin" << std::endl; for (const sycl_device_binary_property &VFProp : Img->getVirtualFunctions()) { std::string StrValue = DeviceBinaryProperty(VFProp).asCString(); @@ -2226,11 +2194,13 @@ void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) { m_VFSet2BinImage.erase(SetName); } + std::cout << " Erase device globals" << std::endl; m_DeviceGlobals.eraseEntries(Img); { std::lock_guard HostPipesGuard(m_HostPipesMutex); auto HostPipes = Img->getHostPipes(); + std::cout << " Erase host pipes" << std::endl; for (const sycl_device_binary_property &HostPipe : HostPipes) { if (auto HostPipesIt = m_HostPipes.find(HostPipe->Name); HostPipesIt != m_HostPipes.end()) { @@ -2250,21 +2220,73 @@ void ProgramManager::removeImages(sycl_device_binaries DeviceBinary) { // Purge references to the image in native programs map { std::lock_guard NativeProgramsGuard(MNativeProgramsMutex); - // The map does not keep references to program handles; we can erase the // entry without calling UR release for (auto It = NativePrograms.begin(); It != NativePrograms.end();) { auto CurIt = It++; if (CurIt->second.second == Img) { + std::cout << " Remove all related entries" << std::endl; if (auto ContextImpl = CurIt->second.first.lock()) { ContextImpl->getKernelProgramCache().removeAllRelatedEntries( Img->getImageID()); } + std::cout << " Remove from native programs" << std::endl; NativePrograms.erase(CurIt); } } } + // Unmap the unique kernel IDs for the offload entries + std::cout << " Clear entries specific info" << std::endl; + for (sycl_offload_entry EntriesIt = EntriesB; EntriesIt != EntriesE; + EntriesIt = EntriesIt->Increment()) { + + // Drop entry for service kernel + if (std::strstr(EntriesIt->GetName(), "__sycl_service_kernel__")) { + std::cout << " Erase from service kernels" << std::endl; + m_ServiceKernels.erase(EntriesIt->GetName()); + continue; + } + + // Exported device functions won't have a kernel ID + if (m_ExportedSymbolImages.find(EntriesIt->GetName()) != + m_ExportedSymbolImages.end()) { + std::cout << " Exported function, skip" << std::endl; + continue; + } + + // remove everything associated with this KernelName + std::cout << " Erase from uses assert" << std::endl; + m_KernelUsesAssert.erase(EntriesIt->GetName()); + std::cout << " Erase from implicitlocalargpos" << std::endl; + m_KernelImplicitLocalArgPos.erase(EntriesIt->GetName()); + + if (auto It = m_KernelName2KernelIDs.find(EntriesIt->GetName()); + It != m_KernelName2KernelIDs.end()) { + std::cout << " Erase ID -> Img mapping" << std::endl; + m_KernelIDs2BinImage.erase(It->second); + std::cout << " Erase Name -> ID mapping" << std::endl; + m_KernelName2KernelIDs.erase(It); + } + if (auto It = m_KernelNameBasedCaches.find(EntriesIt->GetName()); + It != m_KernelNameBasedCaches.end()) { + std::cout << " Erase cache instance" << std::endl; + m_KernelNameBasedCaches.erase(It); + } + } + + // Drop reverse mapping + std::cout << " Erase reverse mapping" << std::endl; + m_BinImg2KernelIDs.erase(Img); + + std::cout << " Erase exported symbol images" << std::endl; + // Unregister exported symbols (needs to happen after the ID unmap loop) + for (const sycl_device_binary_property &ESProp : + Img->getExportedSymbols()) { + m_ExportedSymbolImages.erase(ESProp->Name); + } + + std::cout << " Erase from device images" << std::endl; m_DeviceImages.erase(DevImgIt); } } diff --git a/sycl/source/detail/program_manager/program_manager.hpp b/sycl/source/detail/program_manager/program_manager.hpp index bd60661949e87..602f478e2d830 100644 --- a/sycl/source/detail/program_manager/program_manager.hpp +++ b/sycl/source/detail/program_manager/program_manager.hpp @@ -200,7 +200,7 @@ class ProgramManager { FastKernelCacheValPtr getOrCreateKernel(context_impl &ContextImpl, device_impl &DeviceImpl, KernelNameStrRefT KernelName, - KernelNameBasedCacheT *KernelNameBasedCachePtr, + KernelNameBasedCacheT &KernelNameBasedCache, const NDRDescT &NDRDesc = {}); ur_kernel_handle_t getCachedMaterializedKernel( @@ -358,23 +358,22 @@ class ProgramManager { ~ProgramManager() = default; template - bool kernelUsesAssert(const NameT &KernelName, - KernelNameBasedCacheT *KernelNameBasedCachePtr) const { - if (!KernelNameBasedCachePtr) - return m_KernelUsesAssert.find(KernelName) != m_KernelUsesAssert.end(); - - std::optional &UsesAssert = KernelNameBasedCachePtr->UsesAssert; - if (!UsesAssert.has_value()) - UsesAssert = - m_KernelUsesAssert.find(KernelName) != m_KernelUsesAssert.end(); - return UsesAssert.value(); + bool kernelUsesAssert(const NameT &KernelName) const { + return m_KernelUsesAssert.find(KernelName) != m_KernelUsesAssert.end(); } SanitizerType kernelUsesSanitizer() const { return m_SanitizerFoundInImage; } - std::optional kernelImplicitLocalArgPos( - KernelNameStrRefT KernelName, - KernelNameBasedCacheT *KernelNameBasedCachePtr) const; + std::optional + kernelImplicitLocalArgPos(KernelNameStrRefT KernelName) const; + + KernelNameBasedCacheT * + createKernelNameBasedCache(KernelNameStrRefT KernelName); + +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + KernelNameBasedCacheT * + getOrCreateKernelNameBasedCache(KernelNameStrRefT KernelName); +#endif std::set getRawDeviceImages(const std::vector &KernelIDs); @@ -528,6 +527,11 @@ class ProgramManager { KernelUsesAssertSet m_KernelUsesAssert; std::unordered_map m_KernelImplicitLocalArgPos; + // Map for storing kernel name based caches. Runtime lookup should only be + // performed for ABI compatibility and user library unloading. + std::unordered_map + m_KernelNameBasedCaches; + // Sanitizer type used in device image SanitizerType m_SanitizerFoundInImage; diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e39cf23d74dbd..530a07d0e6a71 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -876,9 +876,7 @@ class queue_impl : public std::enable_shared_from_this { // Kernel only uses assert if it's non interop one KernelUsesAssert = (!Handler.MKernel || Handler.MKernel->hasSYCLMetadata()) && - ProgramManager::getInstance().kernelUsesAssert( - Handler.MKernelName.data(), - Handler.impl->MKernelNameBasedCachePtr); + Handler.impl->MKernelNameBasedCachePtr->usesAssert(); auto &PostProcess = *PostProcessorFunc; PostProcess(IsKernel, KernelUsesAssert, Event); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 4ac11b25c9ff2..1318550ee11ea 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2018,10 +2018,11 @@ void instrumentationAddExtraKernelMetadata( // NOTE: Queue can be null when kernel is directly enqueued to a command // buffer // by graph API, when a modifiable graph is finalized. + assert(KernelNameBasedCachePtr); FastKernelCacheValPtr FastKernelCacheVal = detail::ProgramManager::getInstance().getOrCreateKernel( Queue->getContextImpl(), Queue->getDeviceImpl(), KernelName, - KernelNameBasedCachePtr); + *KernelNameBasedCachePtr); EliminatedArgMask = FastKernelCacheVal->MKernelArgMask; } @@ -2456,9 +2457,8 @@ static ur_result_t SetKernelParamsAndLaunch( applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); } - std::optional ImplicitLocalArg = - ProgramManager::getInstance().kernelImplicitLocalArgPos( - KernelName, KernelNameBasedCachePtr); + const std::optional &ImplicitLocalArg = + KernelNameBasedCachePtr->getImplicitLocalArgPos(); // Set the implicit local memory buffer to support // get_work_group_scratch_memory. This is for backend not supporting // CUDA-style local memory setting. Note that we may have -1 as a position, @@ -2559,10 +2559,11 @@ getCGKernelInfo(const CGExecKernel &CommandGroup, context_impl &ContextImpl, DeviceImageImpl = SyclKernelImpl->getDeviceImage(); EliminatedArgMask = SyclKernelImpl->getKernelArgMask(); } else { + assert(CommandGroup.MKernelNameBasedCachePtr); FastKernelCacheValPtr FastKernelCacheVal = sycl::detail::ProgramManager::getInstance().getOrCreateKernel( ContextImpl, DeviceImpl, CommandGroup.MKernelName, - CommandGroup.MKernelNameBasedCachePtr); + *CommandGroup.MKernelNameBasedCachePtr); UrKernel = FastKernelCacheVal->MKernelHandle; EliminatedArgMask = FastKernelCacheVal->MKernelArgMask; // To keep UrKernel valid, we return FastKernelCacheValPtr. @@ -2724,8 +2725,9 @@ void enqueueImpKernel( EliminatedArgMask = SyclKernelImpl->getKernelArgMask(); KernelMutex = SyclKernelImpl->getCacheMutex(); } else { + assert(KernelNameBasedCachePtr); KernelCacheVal = detail::ProgramManager::getInstance().getOrCreateKernel( - ContextImpl, DeviceImpl, KernelName, KernelNameBasedCachePtr, NDRDesc); + ContextImpl, DeviceImpl, KernelName, *KernelNameBasedCachePtr, NDRDesc); Kernel = KernelCacheVal->MKernelHandle; KernelMutex = KernelCacheVal->MMutex; Program = KernelCacheVal->MProgramHandle; @@ -3252,8 +3254,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { // Kernel only uses assert if it's non interop one bool KernelUsesAssert = (!SyclKernel || SyclKernel->hasSYCLMetadata()) && - ProgramManager::getInstance().kernelUsesAssert( - KernelName, ExecKernel->MKernelNameBasedCachePtr); + ExecKernel->MKernelNameBasedCachePtr->usesAssert(); if (KernelUsesAssert) { EventImpl = MEvent.get(); } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index dc5d2f9df6758..0f607ed86e068 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -490,6 +490,16 @@ event handler::finalize() { } if (type == detail::CGType::Kernel) { +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + if (impl->MKernelNameBasedCachePtr) { + impl->MKernelNameBasedCachePtr->initIfNeeded( + toKernelNameStrT(MKernelName)); + } else { + impl->MKernelNameBasedCachePtr = + detail::ProgramManager::getInstance().getOrCreateKernelNameBasedCache( + toKernelNameStrT(MKernelName)); + } +#endif // If there were uses of set_specialization_constant build the kernel_bundle detail::kernel_bundle_impl *KernelBundleImpPtr = getOrInsertHandlerKernelBundlePtr(/*Insert=*/false); @@ -563,10 +573,8 @@ event handler::finalize() { !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents(); if (DiscardEvent) { // Kernel only uses assert if it's non interop one - bool KernelUsesAssert = - !(MKernel && MKernel->isInterop()) && - detail::ProgramManager::getInstance().kernelUsesAssert( - toKernelNameStrT(MKernelName), impl->MKernelNameBasedCachePtr); + bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) && + impl->MKernelNameBasedCachePtr->usesAssert(); DiscardEvent = !KernelUsesAssert; } diff --git a/sycl/test-e2e/SharedLib/use_with_dlopen.cpp b/sycl/test-e2e/SharedLib/use_with_dlopen.cpp index 4d4910efa78e3..c5d9d8fd0f0b4 100644 --- a/sycl/test-e2e/SharedLib/use_with_dlopen.cpp +++ b/sycl/test-e2e/SharedLib/use_with_dlopen.cpp @@ -85,6 +85,6 @@ int main() { run(); #endif - return 0; + return -1; } #endif diff --git a/sycl/test-e2e/SharedLib/use_with_dlopen_verify_cache.cpp b/sycl/test-e2e/SharedLib/use_with_dlopen_verify_cache.cpp index bab8130bbab68..0dfdba721e003 100644 --- a/sycl/test-e2e/SharedLib/use_with_dlopen_verify_cache.cpp +++ b/sycl/test-e2e/SharedLib/use_with_dlopen_verify_cache.cpp @@ -106,6 +106,6 @@ int main() { run(); #endif - return 0; + return -1; } #endif