Skip to content

[UR][Offload] Queue flag and out-of-order queue support #19531

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 24, 2025
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions unified-runtime/source/adapters/offload/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_USM_SYSTEM_SHARED_SUPPORT:
return ReturnValue(uint32_t{0});
case UR_DEVICE_INFO_QUEUE_PROPERTIES:
case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES:
case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be honest, I'm not entirely sure what the difference between the "on host" and "on device" properties is.

return ReturnValue(
ur_queue_flags_t{UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE});
case UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES:
return ReturnValue(0);
case UR_DEVICE_INFO_SUPPORTED_PARTITIONS: {
Expand Down
37 changes: 26 additions & 11 deletions unified-runtime/source/adapters/offload/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
LaunchArgs.DynSharedMemory = 0;

ol_event_handle_t EventOut;
ol_queue_handle_t Queue;
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
OL_RETURN_ON_ERR(
olLaunchKernel(hQueue->OffloadQueue, hQueue->OffloadDevice,
hKernel->OffloadKernel, hKernel->Args.getStorage(),
hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut));
olLaunchKernel(Queue, hQueue->OffloadDevice, hKernel->OffloadKernel,
hKernel->Args.getStorage(), hKernel->Args.getStorageSize(),
&LaunchArgs, &EventOut));

if (phEvent) {
auto *Event = new ur_event_handle_t_(UR_COMMAND_KERNEL_LAUNCH, hQueue);
Expand Down Expand Up @@ -107,17 +109,30 @@ ur_result_t doMemcpy(ur_command_t Command, ur_queue_handle_t hQueue,

ol_event_handle_t EventOut = nullptr;

OL_RETURN_ON_ERR(olMemcpy(hQueue->OffloadQueue, DestPtr, DestDevice, SrcPtr,
SrcDevice, size, phEvent ? &EventOut : nullptr));

ol_queue_handle_t Queue;
if (blocking) {
OL_RETURN_ON_ERR(olSyncQueue(hQueue->OffloadQueue));
// If we are using a blocking operation, create a temporary queue that lives
// only for this function
OL_RETURN_ON_ERR(olCreateQueue(hQueue->OffloadDevice, &Queue));
} else {
OL_RETURN_ON_ERR(hQueue->nextQueue(Queue));
}
OL_RETURN_ON_ERR(olMemcpy(Queue, DestPtr, DestDevice, SrcPtr, SrcDevice, size,
(phEvent || blocking) ? &EventOut : nullptr));

if (phEvent) {
auto *Event = new ur_event_handle_t_(Command, hQueue);
Event->OffloadEvent = EventOut;
*phEvent = Event;
if (blocking) {
OL_RETURN_ON_ERR(olSyncQueue(Queue));
OL_RETURN_ON_ERR(olDestroyQueue(Queue));

if (phEvent) {
*phEvent = ur_event_handle_t_::createEmptyEvent(Command, hQueue);
}
} else {
if (phEvent) {
auto *Event = new ur_event_handle_t_(Command, hQueue);
Event->OffloadEvent = EventOut;
*phEvent = Event;
}
}

return UR_RESULT_SUCCESS;
Expand Down
47 changes: 34 additions & 13 deletions unified-runtime/source/adapters/offload/queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,30 @@
#include "queue.hpp"
#include "ur2offload.hpp"

UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(ur_context_handle_t hContext,
ur_device_handle_t hDevice,
const ur_queue_properties_t *,
ur_queue_handle_t *phQueue) {
UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
[[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice,
const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) {

assert(hContext->Device == hDevice);

ur_queue_handle_t Queue = new ur_queue_handle_t_();
auto Res = olCreateQueue(hDevice->OffloadDevice, &Queue->OffloadQueue);
if (Res != OL_SUCCESS) {
delete Queue;
return offloadResultToUR(Res);
ur_queue_flags_t URFlags = 0;
if (pProps && pProps->stype == UR_STRUCTURE_TYPE_QUEUE_PROPERTIES) {
URFlags = pProps->flags;
}

Queue->OffloadDevice = hDevice->OffloadDevice;
Queue->UrContext = hContext;
ur_queue_handle_t Queue =
new ur_queue_handle_t_(hDevice->OffloadDevice, hContext, URFlags);

// For in-order queues, create the ol queue on construction so we can report
// any errors earlier
if (!(URFlags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)) {
[[maybe_unused]] ol_queue_handle_t InitQueue;
auto Res = Queue->nextQueue(InitQueue);
if (Res != OL_SUCCESS) {
delete Queue;
return offloadResultToUR(Res);
}
}

*phQueue = Queue;

Expand All @@ -47,6 +55,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue,
UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);

switch (propName) {
case UR_QUEUE_INFO_FLAGS:
return ReturnValue(hQueue->Flags);
case UR_QUEUE_INFO_REFERENCE_COUNT:
return ReturnValue(hQueue->RefCount.load());
default:
Expand All @@ -63,15 +73,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) {

// Drops one reference to hQueue. When the reference count reaches zero, the
// offload queue(s) held by the handle are destroyed with olDestroyQueue and
// the handle itself is deleted. Returns UR_RESULT_SUCCESS when nothing fails.
//
// NOTE(review): both a single-queue destroy (hQueue->OffloadQueue) and a
// per-slot loop over hQueue->OffloadQueues appear below. This looks like the
// before/after lines of a diff shown together -- confirm which of the two the
// real file contains.
UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
if (--hQueue->RefCount == 0) {
OL_RETURN_ON_ERR(olDestroyQueue(hQueue->OffloadQueue));
for (auto *Q : hQueue->OffloadQueues) {
// A null slot ends the walk. nextQueue() hands out slots in index order
// starting at 0, so presumably every slot after the first null one is also
// still uncreated -- confirm if slots can ever be reset individually.
if (!Q) {
break;
}
// NOTE(review): OL_RETURN_ON_ERR presumably returns early on failure; if so,
// the remaining queues and hQueue itself are never freed even though the
// reference count has already hit zero -- confirm whether that is acceptable.
OL_RETURN_ON_ERR(olDestroyQueue(Q));
}
delete hQueue;
}

return UR_RESULT_SUCCESS;
}

// Synchronizes the offload queue(s) behind hQueue by calling olSyncQueue.
//
// NOTE(review): as shown, the first statement returns unconditionally, which
// makes the loop after it unreachable. This looks like the before/after lines
// of a diff shown together -- confirm which of the two the real file contains.
UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
return offloadResultToUR(olSyncQueue(hQueue->OffloadQueue));
for (auto *Q : hQueue->OffloadQueues) {
// A null slot ends the walk; slots are handed out in index order by
// nextQueue(), so presumably nothing after the first null has been created.
if (!Q) {
break;
}
OL_RETURN_ON_ERR(olSyncQueue(Q));
}
return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle(
Expand Down
33 changes: 32 additions & 1 deletion unified-runtime/source/adapters/offload/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,39 @@

#include "common.hpp"

constexpr size_t OOO_QUEUE_POOL_SIZE = 32;

/// Adapter-side state behind a UR queue handle.
///
/// A UR queue is backed by one or more offload (`ol`) queues stored in
/// OffloadQueues. An in-order UR queue owns exactly one slot, so every command
/// is submitted to the same offload queue. An out-of-order UR queue owns
/// OOO_QUEUE_POOL_SIZE slots that nextQueue() rotates through.
struct ur_queue_handle_t_ : RefCounted {
  // NOTE(review): nothing in this struct initializes or reads this member; it
  // looks like the single-queue handle that predates the pool below. It is
  // kept so any remaining references still compile -- confirm whether it can
  // be removed.
  ol_queue_handle_t OffloadQueue;

  /// \param Device    Offload device that lazily created queues are bound to.
  /// \param UrContext UR context this queue was created in.
  /// \param Flags     UR queue flags; only
  ///                  UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE affects the
  ///                  number of slots allocated here.
  ur_queue_handle_t_(ol_device_handle_t Device, ur_context_handle_t UrContext,
                     ur_queue_flags_t Flags)
      // Out-of-order queues get the whole pool and in-order queues get a
      // single slot. The two branches used to be swapped, which made in-order
      // queues rotate across many offload queues and pinned out-of-order
      // queues to one.
      : OffloadQueues((Flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE)
                          ? OOO_QUEUE_POOL_SIZE
                          : 1),
        QueueOffset(0), OffloadDevice(Device), UrContext(UrContext),
        Flags(Flags) {}

  // In-order queues only have one element here, while out-of-order queues have
  // a bank of queues to use. We rotate through them round robin instead of
  // constantly creating new ones in case there is a long-running program that
  // never destroys the ur queue. Offload queues are created when first needed;
  // any slot that has not been created yet is nullptr.
  std::vector<ol_queue_handle_t> OffloadQueues;
  // Index of the slot the next nextQueue() call will hand out.
  size_t QueueOffset;
  ol_device_handle_t OffloadDevice;
  ur_context_handle_t UrContext;
  ur_queue_flags_t Flags;

  /// Selects the next slot in round-robin order, creating its offload queue
  /// with olCreateQueue if the slot is still null, and stores the queue in
  /// \p Handle.
  ///
  /// \returns the non-null result of olCreateQueue if creation fails (in which
  ///          case \p Handle is left untouched); otherwise a null result.
  ol_result_t nextQueue(ol_queue_handle_t &Handle) {
    // Advance first, then wrap, so the offset always stays within the pool.
    auto &Slot = OffloadQueues[QueueOffset++];
    QueueOffset %= OffloadQueues.size();

    if (!Slot) {
      if (auto Res = olCreateQueue(OffloadDevice, &Slot)) {
        return Res;
      }
    }

    Handle = Slot;
    return nullptr;
  }
};