diff --git a/clang/lib/DPCT/RuleInfra/MapNames.cpp b/clang/lib/DPCT/RuleInfra/MapNames.cpp index 3e30a0eff3ff..b1b64fc1a215 100644 --- a/clang/lib/DPCT/RuleInfra/MapNames.cpp +++ b/clang/lib/DPCT/RuleInfra/MapNames.cpp @@ -366,6 +366,9 @@ void MapNames::setExplicitNamespaceMap( {"cudaIpcMemHandle_t", std::make_shared(getDpctNamespace() + "experimental::ipc_mem_handle_ext_t")}, + {"cudaIpcEventHandle_t", + std::make_shared( + getDpctNamespace() + "experimental::ipc_event_pool_handle_ext_t")}, {"char1", std::make_shared("int8_t")}, {"char2", std::make_shared(getClNamespace() + "char2")}, {"char3", std::make_shared(getClNamespace() + "char3")}, diff --git a/clang/lib/DPCT/RulesLang/APINamesMisc.inc b/clang/lib/DPCT/RulesLang/APINamesMisc.inc index 3112a9aa57ef..2f0d74a3dcb0 100644 --- a/clang/lib/DPCT/RulesLang/APINamesMisc.inc +++ b/clang/lib/DPCT/RulesLang/APINamesMisc.inc @@ -180,3 +180,29 @@ ASSIGNABLE_FACTORY(FEATURE_REQUEST_FACTORY( Diagnostics::TRY_EXPERIMENTAL_FEATURE, ARG("cudaIpcOpenMemHandle"), ARG("--use-experimental-features=level_zero"))))) + +ASSIGNABLE_FACTORY(FEATURE_REQUEST_FACTORY( + HelperFeatureEnum::device_ext, + CONDITIONAL_FACTORY_ENTRY( + clang::dpct::UseExtLevelZero, + CALL_FACTORY_ENTRY("cudaIpcOpenEventHandle", + CALL(MapNames::getDpctNamespace() + + "experimental::open_event_pool_ipc_handle", + ARG(0), ARG(1))), + UNSUPPORT_FACTORY_ENTRY( + "cudaIpcOpenEventHandle", Diagnostics::TRY_EXPERIMENTAL_FEATURE, + ARG("cudaIpcOpenEventHandle"), + ARG("--use-experimental-features=level_zero"))))) + +ASSIGNABLE_FACTORY(FEATURE_REQUEST_FACTORY( + HelperFeatureEnum::device_ext, + CONDITIONAL_FACTORY_ENTRY( + clang::dpct::UseExtLevelZero, + CALL_FACTORY_ENTRY("cudaIpcGetEventHandle", + CALL(MapNames::getDpctNamespace() + + "experimental::get_event_pool_ipc_handle", + ARG(1), ARG(0))), + UNSUPPORT_FACTORY_ENTRY( + "cudaIpcGetEventHandle", Diagnostics::TRY_EXPERIMENTAL_FEATURE, + ARG("cudaIpcGetEventHandle"), + ARG("--use-experimental-features=level_zero"))))) \ No newline at end of file diff --git a/clang/lib/DPCT/RulesLang/RulesLang.cpp b/clang/lib/DPCT/RulesLang/RulesLang.cpp index 859bf795c812..e220d82efcde 100644 --- a/clang/lib/DPCT/RulesLang/RulesLang.cpp +++ b/clang/lib/DPCT/RulesLang/RulesLang.cpp @@ -348,7 +348,8 @@ void TypeInDeclRule::registerMatcher(MatchFinder &MF) { "cudaGraphicsRegisterFlags", "cudaExternalMemoryHandleType", "cudaExternalSemaphoreHandleType", "CUstreamCallback", "cudaHostFn_t", "__nv_half2", "__nv_half", "cudaGraphNodeType", - "CUsurfref", "CUdevice_P2PAttribute", "cudaIpcMemHandle_t")))))) + "CUsurfref", "CUdevice_P2PAttribute", "cudaIpcMemHandle_t", + "cudaIpcEventHandle_t")))))) .bind("cudaTypeDef"), this); @@ -935,6 +936,13 @@ void TypeInDeclRule::runRule(const MatchFinder::MatchResult &Result) { return; } } + if (CanonicalTypeStr == "cudaIpcEventHandle_st") { + if (!DpctGlobalInfo::useExtLevelZero()) { + report(TL->getBeginLoc(), Diagnostics::TRY_EXPERIMENTAL_FEATURE, false, + "cudaIpcMemHandle_t", "--use-experimental-features=level_zero"); + return; + } + } if (CanonicalTypeStr == "cudaGraphExecUpdateResult") { report(TL->getBeginLoc(), Diagnostics::API_NOT_MIGRATED, false, diff --git a/clang/lib/DPCT/SrcAPI/APINames.inc b/clang/lib/DPCT/SrcAPI/APINames.inc index 17d1872b487b..6594d16a6142 100644 --- a/clang/lib/DPCT/SrcAPI/APINames.inc +++ b/clang/lib/DPCT/SrcAPI/APINames.inc @@ -1658,9 +1658,9 @@ ENTRY(cuDeviceGetPCIBusId, cuDeviceGetPCIBusId, false, NO_FLAG, P4, "comment") ENTRY(cuDeviceRegisterAsyncNotification, cuDeviceRegisterAsyncNotification, false, NO_FLAG, P4, "comment") ENTRY(cuDeviceUnregisterAsyncNotification, cuDeviceUnregisterAsyncNotification, false, NO_FLAG, P4, "comment") ENTRY(cuIpcCloseMemHandle, cuIpcCloseMemHandle, false, NO_FLAG, P4, "comment") -ENTRY(cuIpcGetEventHandle, cuIpcGetEventHandle, false, NO_FLAG, P4, "comment") +ENTRY(cuIpcGetEventHandle, cuIpcGetEventHandle, true, NO_FLAG, P4, "DPCT1119") ENTRY(cuIpcGetMemHandle, cuIpcGetMemHandle, false, NO_FLAG, P4, "comment") -ENTRY(cuIpcOpenEventHandle, cuIpcOpenEventHandle, false, NO_FLAG, P4, "comment") +ENTRY(cuIpcOpenEventHandle, cuIpcOpenEventHandle, true, NO_FLAG, P4, "DPCT1119") ENTRY(cuIpcOpenMemHandle, cuIpcOpenMemHandle, false, NO_FLAG, P4, "comment") ENTRY(cuMemAlloc, cuMemAlloc_v2, true, NO_FLAG, P4, "Successful") ENTRY(cuMemAllocHost, cuMemAllocHost_v2, true, NO_FLAG, P4, "comment") diff --git a/clang/runtime/dpct-rt/include/dpct/ze_utils.hpp b/clang/runtime/dpct-rt/include/dpct/ze_utils.hpp index 631246e40f6b..711e19f3f01e 100644 --- a/clang/runtime/dpct-rt/include/dpct/ze_utils.hpp +++ b/clang/runtime/dpct-rt/include/dpct/ze_utils.hpp @@ -23,6 +23,11 @@ struct ipc_mem_handle_ext_t { ze_ipc_mem_handle_t handle; }; +struct ipc_event_pool_handle_ext_t { + pid_t pid; + ze_ipc_event_pool_handle_t handle; +}; + namespace detail { #ifndef _SYS_pidfd_open @@ -33,7 +38,7 @@ namespace detail { #define _SYS_pidfd_getfd 438 // syscall number for pidfd_getfd #endif -inline int get_fd_of_peer_process(ipc_mem_handle_ext_t ext_handle) { +template inline int get_fd_of_peer_process(T ext_handle) { int pidfd = syscall(_SYS_pidfd_open, ext_handle.pid, 0); // obtain a file descriptor that refers to a // process(requires kernel 5.6+). @@ -43,6 +48,35 @@ inline int get_fd_of_peer_process(ipc_mem_handle_ext_t ext_handle) { 0); // obtain a duplicate of another process's file // descriptor(requires kernel 5.6+). } +constexpr ze_event_pool_desc_t default_event_pool_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_POOL_DESC, + .pNext = nullptr, + .flags = ZE_EVENT_POOL_FLAG_IPC | ZE_EVENT_POOL_FLAG_HOST_VISIBLE, + .count = 1}; + +constexpr ze_event_desc_t default_event_desc = { + .stype = ZE_STRUCTURE_TYPE_EVENT_DESC, .index = 0, .signal = 0, .wait = 0}; + +ze_event_pool_handle_t create_event_in_pool(sycl::event *event) { + ze_event_pool_handle_t h_event_pool = {}; + auto context = dpct::get_current_device().get_context(); + + if (h_event_pool == nullptr) { + ze_device_handle_t device = + sycl::get_native( + (sycl::device)dpct::get_current_device()); + zeEventPoolCreate( + sycl::get_native(context), + &default_event_pool_desc, 1, &device, &h_event_pool); + } + + ze_event_handle_t ze_event = {}; + zeEventCreate(h_event_pool, &default_event_desc, &ze_event); + zeEventHostReset(ze_event); + *event = sycl::make_event( + {ze_event, sycl::ext::oneapi::level_zero::ownership::keep}, context); + return h_event_pool; +} } // namespace detail @@ -76,6 +110,36 @@ inline ze_result_t open_mem_ipc_handle(ipc_mem_handle_ext_t ext_handle, ext_handle.handle, 0u, pptr); } +/// Gets an IPC event pool handle for the specified event handle that can be shared with another process. +/// \param [in] ext_handle IPC memory handle extension +/// \param [out] phipc Returned IPC event handle +ze_result_t get_event_pool_ipc_handle(sycl::event *event, + ipc_event_pool_handle_ext_t *phipc) { + phipc->pid = getpid(); + ze_event_pool_handle_t h_event_pool = detail::create_event_in_pool(event); + return zeEventPoolGetIpcHandle(h_event_pool, &phipc->handle); +} + +ze_result_t open_event_pool_ipc_handle(sycl::event **event, + ipc_event_pool_handle_ext_t hipc) { + ze_context_handle_t ze_context = + sycl::get_native( + dpct::get_current_device().get_context()); + int fd = detail::get_fd_of_peer_process(hipc); + if (fd < 0) + throw std::runtime_error("Cannot get file descriptor of peer process."); + *((int *)hipc.handle.data) = detail::get_fd_of_peer_process(hipc); + ze_event_handle_t ze_event = {}; + ze_event_pool_handle_t h_event_pool_t; + auto ret = zeEventPoolOpenIpcHandle(ze_context, hipc.handle, &h_event_pool_t); + zeEventCreate(h_event_pool_t, &detail::default_event_desc, &ze_event); + zeEventHostSignal(ze_event); + *event = + new sycl::event(sycl::make_event( + {ze_event, sycl::ext::oneapi::level_zero::ownership::keep}, + dpct::get_current_device().get_context())); + return ret; +} } // namespace experimental } // namespace dpct diff --git a/clang/test/dpct/IPC/share_mem_exp_option.cu b/clang/test/dpct/IPC/share_mem_exp_option.cu index 0cbfa00560b0..8c34560e30f1 100644 --- a/clang/test/dpct/IPC/share_mem_exp_option.cu +++ b/clang/test/dpct/IPC/share_mem_exp_option.cu @@ -3,107 +3,149 @@ // RUN: FileCheck --input-file %T/share_mem_exp_option/share_mem_exp_option.dp.cpp --match-full-lines %s // RUN: %if build_lit %{icpx -c -fsycl -DNO_BUILD_TEST %T/share_mem_exp_option/share_mem_exp_option.dp.cpp -o %T/share_mem_exp_option/share_mem_exp_option.dp.o %} - #include #include #include -#include #include #include #include #include -#define DATA_SIZE 1024 +#include +#define DATA_SIZE 1024 +constexpr int N = 4096; +constexpr int ITERATIONS = 10; +constexpr int BLOCK_SIZE = 16; #define shName "shared_memory" -typedef struct sharedMemoryInfo_st -{ +typedef struct sharedMemoryInfo_st { void *addr; size_t size; int shmFd; } sharedMemoryInfo; -int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info) -{ +int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info) { int status = 0; info->size = sz; info->shmFd = shm_open(name, O_RDWR | O_CREAT, 0777); - if (info->shmFd < 0) - { + if (info->shmFd < 0) { return errno; } status = ftruncate(info->shmFd, sz); - if (status != 0) - { + if (status != 0) { return status; } info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0); - if (info->addr == NULL) - { + if (info->addr == NULL) { return errno; } return 0; } -int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info) -{ +int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info) { info->size = sz; info->shmFd = shm_open(name, O_RDWR, 0777); - if (info->shmFd < 0) - { + if (info->shmFd < 0) { return errno; } info->addr = mmap(0, sz, PROT_READ | PROT_WRITE, MAP_SHARED, info->shmFd, 0); - if (info->addr == NULL) - { + if (info->addr == NULL) { return errno; } return 0; } -typedef struct shmStruct_st -{ +typedef struct shmStruct_st { // CHECK: dpct::experimental::ipc_mem_handle_ext_t memHandle; cudaIpcMemHandle_t memHandle; + // CHECK: dpct::experimental::ipc_event_pool_handle_ext_t eventHandle; + cudaIpcEventHandle_t eventHandle; } shmStruct; -__global__ void simpleKernel(int *ptr) -{ +__global__ void longKernel(float *matrixA, float *matrixB, float *matrixC, int *ptr) { + int i = blockIdx.y * blockDim.y + threadIdx.y; + int j = blockIdx.x * blockDim.x + threadIdx.x; + + if (i < N && j < N) { + float sum = 0.0f; + + for (int k = 0; k < N; ++k) { + volatile float a = matrixA[i * N + k]; + volatile float b = matrixB[k * N + j]; + sum += a * b; + } + + for (int iter = 0; iter < ITERATIONS; ++iter) { + sum = sqrtf(sum) + sinf(sum); + } + + matrixC[i * N + j] = sum; + } + ptr[j] = j - 10; +} + +__global__ void simpleKernel(int *ptr) { int idx = blockIdx.x * blockDim.x + threadIdx.x; ptr[idx] = idx - 10; + + float temp = 0.0f; + for (int j = 0; j < 1000000; ++j) { + temp += sin(static_cast(j)) * cos(static_cast(j)); + } } +__global__ void simpleKernel_2(int *ptr) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + ptr[idx] = ptr[idx] + 10; +} + + typedef pid_t Process; -int spawnProcess(Process *process, const char *app, char *const *args) -{ +int spawnProcess(Process *process, const char *app, char *const *args) { *process = fork(); - if (*process == 0) - { - if (0 > execvp(app, args)) - { + if (*process == 0) { + if (0 > execvp(app, args)) { return errno; } - } - else if (*process < 0) - { + } else if (*process < 0) { return errno; } return 0; } -int childProcess(int id) -{ +int childProcess(int id) { int threads = 256; sharedMemoryInfo info; + cudaEvent_t event; + cudaStream_t stream; shmStruct *shm = NULL; - if (sharedMemoryCreate(shName, sizeof(shmStruct), &info) != 0) - { + + float *d_matrixA, *d_matrixB, *d_matrixC; + size_t size = N * N * sizeof(float); + + cudaMalloc(&d_matrixA, size); + cudaMalloc(&d_matrixB, size); + cudaMalloc(&d_matrixC, size); + + float *h_matrixA = new float[N * N]{1.0f}; + float *h_matrixB = new float[N * N]{2.0f}; + + cudaMemcpy(d_matrixA, h_matrixA, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_matrixB, h_matrixB, size, cudaMemcpyHostToDevice); + + dim3 block(BLOCK_SIZE, BLOCK_SIZE); + dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE, + (N + BLOCK_SIZE - 1) / BLOCK_SIZE); + + cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking); + + if (sharedMemoryCreate(shName, sizeof(shmStruct), &info) != 0) { printf("Failed to create shared memory slab\n"); exit(EXIT_FAILURE); } @@ -112,19 +154,22 @@ int childProcess(int id) // CHECK: dpct::experimental::open_mem_ipc_handle(*(dpct::experimental::ipc_mem_handle_ext_t *)&shm->memHandle, (void **)&ptr); cudaIpcOpenMemHandle((void **)&ptr, *(cudaIpcMemHandle_t *)&shm->memHandle, cudaIpcMemLazyEnablePeerAccess); + // CHECK: dpct::experimental::open_event_pool_ipc_handle(&event, *(dpct::experimental::ipc_event_pool_handle_ext_t *)&shm->eventHandle); + cudaIpcOpenEventHandle( + &event, *(cudaIpcEventHandle_t *)&shm->eventHandle); - simpleKernel<<<1, threads, 0>>>(ptr); + longKernel<<>>(d_matrixA, d_matrixB, d_matrixC, ptr); + + cudaEventRecord(event, stream); return 0; } -int parentProcess(char *app) -{ +int parentProcess(char *app) { shmStruct *shm; sharedMemoryInfo info; void *ptr; - if (sharedMemoryCreate(shName, sizeof(*shm), &info) != 0) - { + if (sharedMemoryCreate(shName, sizeof(*shm), &info) != 0) { printf("Failed to create shared memory slab\n"); exit(EXIT_FAILURE); } @@ -135,16 +180,21 @@ int parentProcess(char *app) // CHECK: dpct::experimental::get_mem_ipc_handle(ptr, (dpct::experimental::ipc_mem_handle_ext_t *)&shm->memHandle); cudaIpcGetMemHandle((cudaIpcMemHandle_t *)&shm->memHandle, ptr); + cudaEvent_t event; + cudaEventCreate( + &event, cudaEventDisableTiming | cudaEventInterprocess); + // CHECK: dpct::experimental::get_event_pool_ipc_handle(event, (dpct::experimental::ipc_event_pool_handle_ext_t *)&shm->eventHandle); + cudaIpcGetEventHandle( + (cudaIpcEventHandle_t *)&shm->eventHandle, event); + char *const args[] = {app, "0", NULL}; Process process; spawnProcess(&process, app, args); wait(NULL); - cudaMemcpy(hostptr, ptr, DATA_SIZE, cudaMemcpyDeviceToHost); - for (int i = 0; i < DATA_SIZE / sizeof(int); i++) - { - if (hostptr[i] != i - 10) - { + cudaMemcpyAsync(hostptr, ptr, DATA_SIZE, cudaMemcpyDeviceToHost); + for (int i = 0; i < DATA_SIZE / sizeof(int); i++) { + if (hostptr[i] != i - 10) { std::cout << "Error: " << hostptr[i] << " != " << i - 10 << "\n"; return -1; } @@ -155,14 +205,10 @@ int parentProcess(char *app) return 0; } -int main(int argc, char **argv) -{ - if (argc == 1) - { +int main(int argc, char **argv) { + if (argc == 1) { return parentProcess(argv[0]); - } - else - { + } else { return childProcess(atoi(argv[1])); } -} \ No newline at end of file +}