Skip to content
This repository was archived by the owner on Jan 26, 2024. It is now read-only.

merge release code into development #43

Open
wants to merge 50 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
d1039c6
Bulk promote till commit 'e324ae05efe9e5d10970f0aeca70fee15e79e2a6' i…
mangupta Jan 6, 2021
7c29487
SWDEV-2 - Change OpenCL version number from 3240 to 3241
chaunceyhui Jan 7, 2021
eb9e857
Merge branch 'amd-staging' into amd-master
mangupta Feb 22, 2021
f343e8f
Merge branch 'amd-staging' into amd-master
mangupta Mar 17, 2021
e6ec303
Merge branch 'amd-staging' into amd-master
mangupta Apr 6, 2021
a36e611
Merge branch 'amd-staging' into amd-master
mangupta Apr 20, 2021
9e6e161
Merge branch 'amd-staging' into amd-master
mangupta Apr 23, 2021
1a56c48
Merge branch 'amd-staging' into amd-master
mangupta May 7, 2021
69fb0af
Merge branch 'amd-staging' into amd-master
mangupta May 19, 2021
9123026
Merge branch 'amd-staging' into amd-master
mangupta May 25, 2021
cab88ea
Merge branch 'amd-staging' into amd-master
mangupta May 31, 2021
3e9cf94
Merge branch 'amd-staging' into amd-master
mangupta Aug 24, 2021
52a6f3c
Merge branch 'amd-staging' into amd-master
mangupta Aug 24, 2021
39594e6
Merge branch 'amd-staging' into amd-master
mangupta Sep 1, 2021
6d488ea
Merge branch 'amd-staging' into amd-master
mangupta Sep 7, 2021
448fa11
Promote till commit '93cfe9b0c310a39e9c5601c6e561fc4d54084e19'
mangupta Sep 15, 2021
d3affb8
Promote till commit '0f8d9e36cb4c9db9ad12ddecb66bb0fd12cdaba1'
mangupta Nov 20, 2021
15d6eed
Promote till commit '5341edfb8b5288a963b8d6fc8d788d14e0e6c87f'
mangupta Dec 8, 2021
100dd37
Promote till commit '9b0a981b66243d67598b8af97126918b6ef3f53f'
mangupta Dec 16, 2021
182f5b0
Promote till commit '5d7f14578d370d18ea1de57a65c47b13b8509553'
mangupta Jan 27, 2022
567b87a
Promote till commit '6e04eda8ba68a96a553de97f9dd640c79cd0dd86'
mangupta Feb 9, 2022
d0c9afb
Promote till commit '9f7654e98e64ed28abfcb792363d8ecfb3c1bb21'
mangupta Feb 11, 2022
42faa81
Promote till commit 'cfc619dfc8def1aa7de2bf58b01d2250b4c45c01'
mangupta Feb 22, 2022
80ae27f
Promote till commit 'f5a3ae92bfb5cbaea92d5e832da6be984d4a6e70'
mangupta Feb 25, 2022
45d2998
Promote till commit 'cb77a133c6b37514612b68c023988ae6e9c58279'
mangupta Mar 14, 2022
6fb1197
Promote till commit '0f90bef50776f1aec7021a634e36c5372b3884f5'
mangupta Mar 25, 2022
f201c07
SWDEV-307184 - Make sure runtime passes CO version into comgr without…
b-sumner Apr 23, 2022
eac630f
Promote till commit 'b9345ec1caa8d885e870d50bf73bc8cc338d8e84'
mangupta May 19, 2022
7960fd6
Promote till commit 'c9af2c79c37d105e08c479a736eb5ca5a3da354c'
mangupta May 26, 2022
3b1c2f7
Promote till commit '9114f5776f0099af9454623986153c2487dd86bc'
mangupta Jun 21, 2022
7a05e9c
Promote till commit 'ece3a52cfcd9bb145a615f701ed6276ed694b4df'
mangupta Jul 5, 2022
7cbadea
Promote till commit 'a645953cb5ff54c5eb2fff27c03a371936f47cdf'
mangupta Jul 15, 2022
50b9659
Promote till commit 'ca67dc206f2c335b5d4e764fbb5a24c73756b0fc'
mangupta Jul 22, 2022
4afd406
Promote till commit 'cf5f08fb930ea33cd774cfcd26de61358799c776'
mangupta Sep 8, 2022
974ac0d
Promote till commit '244eef8063fc5939b483193c82b13ae62c8e612c'
mangupta Sep 20, 2022
3405948
Promote till commit 'b73e1751b8cdf3c409014733ab2eb67a09ad03a1'
mangupta Oct 3, 2022
f7f62b9
Promote till commit '6a14d0a8ec8afedabe75123a68dc9b4988932050'
mangupta Oct 14, 2022
69772c9
Promote till commit 'f4d89cc93dad3dea35536d1df922b4dd52f974a7'
mangupta Oct 21, 2022
50613c0
Promote till commit 'a429ed232abfc99318bd2a5cae18ecc42a0aa1c8'
mangupta Nov 9, 2022
a339a18
Promote till commit 'b4cb610bf70fd26b77ec66fc3bf0121616bdaf8e'
mangupta Nov 21, 2022
7a9b540
Promote till commit '7697fd90d70128505ab604ec31c1a187316c8372'
mangupta Dec 1, 2022
bd2b754
Promote till commit '2d5a1a00433308271e7edbad0885baf675e2b355'
mangupta Dec 13, 2022
cb18e75
Promote till commit '7e2de2e4d617fafc0dcfe7e88ed070c687858d51'
mangupta Jan 27, 2023
041c004
SWDEV-325538 - Enable code object v5 by default
kzhuravl Nov 29, 2022
a983fc1
SWDEV-372790 - Copy AQL packet from runtime setup
gandryey Jan 23, 2023
6d8fc0b
SWDEV-379991, SWDEV-366886 - Revert "SWDEV-366886 - force svm alloc f…
jiabaxie Jan 25, 2023
1cf8f19
SWDEV-380024 - Fix performance drop in TF-RCCL models
Dec 1, 2022
dd5f3d2
SWDEV-386749 - Update stack size limit
gargrahul Mar 8, 2023
e047204
SWDEV-380035 - Check for agent and ptr match for hsa LOCKED ptr
mangupta Mar 10, 2023
d7491f0
SWDEV-394243 - Invalidate Barrier Value AQL header
saleelk Apr 26, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions compiler/lib/utils/OPTIONS.def
Original file line number Diff line number Diff line change
Expand Up @@ -1272,8 +1272,8 @@ OPTION(OT_UINT32, \
OA_RUNTIME|OVA_OPTIONAL|OA_SEPARATOR_EQUAL, \
"code-object-version", NULL, \
LCCodeObjectVersion, \
4, 4, 5, NULL, \
"Specify code object ABI version. Allowed values are 4, and 5. Defaults to 4. (COMGR only)")
5, 4, 5, NULL, \
"Specify code object ABI version. Allowed values are 4, and 5. Defaults to 5. (COMGR only)")

/*
Do not remove the following line. Any option should be
Expand Down
2 changes: 1 addition & 1 deletion device/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -751,7 +751,7 @@ bool Device::disableP2P(amd::Device* ptrDev) {
}

bool Device::UpdateStackSize(uint64_t stackSize) {
if (stackSize > 16 * Ki) {
if (stackSize > ((128 * Ki) - 16)) {
return false;
}
stack_size_ = stackSize;
Expand Down
11 changes: 8 additions & 3 deletions device/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1796,9 +1796,14 @@ class Device : public RuntimeObject {

// Returns the status of HW event, associated with amd::Event
virtual bool IsHwEventReady(
const amd::Event& event, //!< AMD event for HW status validation
bool wait = false //!< If true then forces the event completion
) const {
const amd::Event& event, //!< AMD event for HW status validation
bool wait = false) const { //!< If true then forces the event completion
return false;
};

// Returns the status of HW event, associated with amd::Event
virtual bool IsHwEventReadyForcedWait(
const amd::Event& event) const { //!< AMD event for HW status validation
return false;
};

Expand Down
22 changes: 2 additions & 20 deletions device/rocm/rocblit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2682,31 +2682,13 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,

Memory* schedulerMem = dev().getRocMemory(schedulerParam);
sp->kernarg_address = reinterpret_cast<uint64_t>(schedulerMem->getDeviceMemory());

sp->hidden_global_offset_x = 0;
sp->hidden_global_offset_y = 0;
sp->hidden_global_offset_z = 0;
sp->thread_counter = 0;
sp->child_queue = reinterpret_cast<uint64_t>(schedulerQueue);
sp->complete_signal = schedulerSignal;

hsa_signal_store_relaxed(schedulerSignal, kInitSignalValueOne);

sp->scheduler_aql.header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
sp->scheduler_aql.setup = 1;
sp->scheduler_aql.workgroup_size_x = 1;
sp->scheduler_aql.workgroup_size_y = 1;
sp->scheduler_aql.workgroup_size_z = 1;
sp->scheduler_aql.grid_size_x = threads;
sp->scheduler_aql.grid_size_y = 1;
sp->scheduler_aql.grid_size_z = 1;
sp->scheduler_aql.kernel_object = gpuKernel.KernelCodeHandle();
sp->scheduler_aql.kernarg_address = (void*)sp->kernarg_address;
sp->scheduler_aql.private_segment_size = 0;
sp->scheduler_aql.group_segment_size = 0;

sp->vqueue_header = vqVM;

sp->parentAQL = sp->kernarg_address + sizeof(SchedulerParam);
Expand All @@ -2721,7 +2703,7 @@ bool KernelBlitManager::runScheduler(uint64_t vqVM, amd::Memory* schedulerParam,
address parameters = captureArguments(kernels_[Scheduler]);

if (!gpu().submitKernelInternal(ndrange, *kernels_[Scheduler],
parameters, nullptr)) {
parameters, nullptr, 0, nullptr, &sp->scheduler_aql)) {
return false;
}
releaseArguments(parameters);
Expand Down
51 changes: 35 additions & 16 deletions device/rocm/rocdevice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2331,7 +2331,6 @@ bool Device::IpcDetach (void* dev_ptr) const {
// ================================================================================================
void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_svm_mem_flags flags,
void* svmPtr) const {
constexpr bool kForceAllocation = true;
amd::Memory* mem = nullptr;

if (nullptr == svmPtr) {
Expand All @@ -2343,7 +2342,7 @@ void* Device::svmAlloc(amd::Context& context, size_t size, size_t alignment, cl_
return nullptr;
}

if (!mem->create(nullptr, false, false, kForceAllocation)) {
if (!mem->create(nullptr)) {
LogError("failed to create a svm hidden buffer!");
mem->release();
return nullptr;
Expand Down Expand Up @@ -2730,10 +2729,22 @@ bool Device::SetClockMode(const cl_set_device_clock_mode_input_amd setClockModeI
return result;
}

// ================================================================================================
bool Device::IsHwEventReadyForcedWait(const amd::Event& event) const {
void* hw_event =
(event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
if (hw_event == nullptr) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
return false;
}
static constexpr bool Timeout = true;
return WaitForSignal<Timeout>(reinterpret_cast<ProfilingSignal*>(hw_event)->signal_, false, true);
}

// ================================================================================================
bool Device::IsHwEventReady(const amd::Event& event, bool wait) const {
void* hw_event = (event.NotifyEvent() != nullptr) ?
event.NotifyEvent()->HwEvent() : event.HwEvent();
void* hw_event =
(event.NotifyEvent() != nullptr) ? event.NotifyEvent()->HwEvent() : event.HwEvent();
if (hw_event == nullptr) {
ClPrint(amd::LOG_INFO, amd::LOG_SIG, "No HW event");
return false;
Expand Down Expand Up @@ -3210,7 +3221,9 @@ device::Signal* Device::createSignal() const {
amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size) {
// Only create arena_mem_object if CPU memory is accessible from HMM
// or if runtime received an interop from another ROCr's client
if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size)) {
hsa_amd_pointer_info_t ptr_info = {};
ptr_info.size = sizeof(hsa_amd_pointer_info_t);
if (!info_.hmmCpuMemoryAccessible_ && !IsValidAllocation(ptr, size, &ptr_info)) {
return nullptr;
}

Expand All @@ -3227,8 +3240,9 @@ amd::Memory* Device::GetArenaMemObj(const void* ptr, size_t& offset, size_t size
}

// Calculate the offset of the pointer.
const void* dev_ptr = reinterpret_cast<void*>(arena_mem_obj_->getDeviceMemory(
*arena_mem_obj_->getContext().devices()[0])->virtualAddress());
const void* dev_ptr = reinterpret_cast<void*>(
arena_mem_obj_->getDeviceMemory(*arena_mem_obj_->getContext().devices()[0])
->virtualAddress());
offset = reinterpret_cast<size_t>(ptr) - reinterpret_cast<size_t>(dev_ptr);

return arena_mem_obj_;
Expand All @@ -3242,20 +3256,25 @@ void Device::ReleaseGlobalSignal(void* signal) const {
}

// ================================================================================================
bool Device::IsValidAllocation(const void* dev_ptr, size_t size) const {
hsa_amd_pointer_info_t ptr_info = {};
ptr_info.size = sizeof(hsa_amd_pointer_info_t);
bool Device::IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info) {
// Query ptr type to see if it's a HMM allocation
hsa_status_t status = hsa_amd_pointer_info(
const_cast<void*>(dev_ptr), &ptr_info, nullptr, nullptr, nullptr);
hsa_status_t status =
hsa_amd_pointer_info(const_cast<void*>(dev_ptr), ptr_info, nullptr, nullptr, nullptr);
// The call should never fail in ROCR, but just check for an error and continue
if (status != HSA_STATUS_SUCCESS) {
LogError("hsa_amd_pointer_info() failed");
}
// Check if it's a legacy non-HMM allocation in ROCr
if (ptr_info.type != HSA_EXT_POINTER_TYPE_UNKNOWN) {
if ((size != 0) && ((reinterpret_cast<const_address>(dev_ptr) -
reinterpret_cast<const_address>(ptr_info.agentBaseAddress)) > size)) {

// Return false for pinned memory. A true return may result in a race because
// ROCclr may attempt to do a pin/copy/unpin underneath in a multithreaded environment
if (ptr_info->type == HSA_EXT_POINTER_TYPE_LOCKED) {
return false;
}

if (ptr_info->type != HSA_EXT_POINTER_TYPE_UNKNOWN) {
if ((size != 0) &&
((reinterpret_cast<const_address>(dev_ptr) -
reinterpret_cast<const_address>(ptr_info->agentBaseAddress)) > size)) {
return false;
}
return true;
Expand Down
4 changes: 3 additions & 1 deletion device/rocm/rocdevice.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ class NullDevice : public amd::Device {
cl_set_device_clock_mode_output_amd* pSetClockModeOutput) { return true; }

virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const { return false; }
virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const { return false; }
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const {};
virtual void ReleaseGlobalSignal(void* signal) const {}

Expand Down Expand Up @@ -443,6 +444,7 @@ class Device : public NullDevice {
cl_set_device_clock_mode_output_amd* pSetClockModeOutput);

virtual bool IsHwEventReady(const amd::Event& event, bool wait = false) const;
virtual bool IsHwEventReadyForcedWait(const amd::Event& event) const;
virtual void getHwEventTime(const amd::Event& event, uint64_t* start, uint64_t* end) const;
virtual void ReleaseGlobalSignal(void* signal) const;

Expand Down Expand Up @@ -549,7 +551,7 @@ class Device : public NullDevice {
const bool isFineGrainSupported() const;

//! Returns True if memory pointer is known to ROCr (excludes HMM allocations)
bool IsValidAllocation(const void* dev_ptr, size_t size) const;
bool IsValidAllocation(const void* dev_ptr, size_t size, hsa_amd_pointer_info_t* ptr_info);

//! Allocates hidden heap for device memory allocations
void HiddenHeapAlloc();
Expand Down
15 changes: 13 additions & 2 deletions device/rocm/rocvirtual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1041,7 +1041,6 @@ void VirtualGPU::dispatchBarrierValuePacket(uint16_t packetHeader, bool resolveD
hsa_signal_t signal, hsa_signal_value_t value,
hsa_signal_value_t mask, hsa_signal_condition32_t cond,
bool skipTs, hsa_signal_t completionSignal) {
hsa_amd_barrier_value_packet_t barrier_value_packet_ = {0};
uint16_t rest = HSA_AMD_PACKET_TYPE_BARRIER_VALUE;
const uint32_t queueSize = gpu_queue_->size;
const uint32_t queueMask = queueSize - 1;
Expand Down Expand Up @@ -1274,6 +1273,7 @@ bool VirtualGPU::create() {
// Initialize barrier and barrier value packets
memset(&barrier_packet_, 0, sizeof(barrier_packet_));
barrier_packet_.header = kInvalidAql;
barrier_value_packet_.header.header = kInvalidAql;

// Create a object of PrintfDbg
printfdbg_ = new PrintfDbg(roc_device_);
Expand Down Expand Up @@ -2781,7 +2781,8 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize)
// ================================================================================================
bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
const amd::Kernel& kernel, const_address parameters, void* eventHandle,
uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd) {
uint32_t sharedMemBytes, amd::NDRangeKernelCommand* vcmd,
hsa_kernel_dispatch_packet_t* aql_packet) {
device::Kernel* devKernel = const_cast<device::Kernel*>(kernel.getDeviceKernel(dev()));
Kernel& gpuKernel = static_cast<Kernel&>(*devKernel);
size_t ldsUsage = gpuKernel.WorkgroupGroupSegmentByteSize();
Expand Down Expand Up @@ -3108,6 +3109,16 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
dispatchPacket.reserved2 = vcmd->profilingInfo().correlation_id_;
}

// Copy scheduler's AQL packet for possible relaunch from the scheduler itself
if (aql_packet != nullptr) {
*aql_packet = dispatchPacket;
aql_packet->header = (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
aql_packet->setup = sizes.dimensions() << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
}

// Dispatch the packet
if (!dispatchAqlPacket(
&dispatchPacket, aqlHeaderWithOrder,
Expand Down
10 changes: 6 additions & 4 deletions device/rocm/rocvirtual.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,18 +46,18 @@ constexpr static uint64_t kUnlimitedWait = std::numeric_limits<uint64_t>::max();

// Active wait time out incase same sdma engine is used again,
// then just wait instead of adding dependency wait signal.
constexpr static uint64_t kSDMAEngineTimeout = 10;
constexpr static uint64_t kForcedTimeout = 10;

template <bool active_wait_timeout = false>
inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool sdma_wait = false) {
inline bool WaitForSignal(hsa_signal_t signal, bool active_wait = false, bool forced_wait = false) {
if (hsa_signal_load_relaxed(signal) > 0) {
uint64_t timeout = kTimeout100us;
if (active_wait) {
timeout = kUnlimitedWait;
}
if (active_wait_timeout) {
// If diff engine, wait to 10 ms. Otherwise no wait
timeout = (sdma_wait ? kSDMAEngineTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K;
timeout = (forced_wait ? kForcedTimeout : ROC_ACTIVE_WAIT_TIMEOUT) * K;
if (timeout == 0) {
return false;
}
Expand Down Expand Up @@ -312,7 +312,8 @@ class VirtualGPU : public device::VirtualDevice {
const_address parameters, //!< Parameters for the kernel
void* event_handle, //!< Handle to OCL event for debugging
uint32_t sharedMemBytes = 0, //!< Shared memory size
amd::NDRangeKernelCommand* vcmd = nullptr //!< Original launch command
amd::NDRangeKernelCommand* vcmd = nullptr, //!< Original launch command
hsa_kernel_dispatch_packet_t* aql_packet = nullptr //!< Scheduler launch
);
void submitNativeFn(amd::NativeFnCommand& cmd);
void submitMarker(amd::Marker& cmd);
Expand Down Expand Up @@ -502,6 +503,7 @@ class VirtualGPU : public device::VirtualDevice {
hsa_agent_t gpu_device_; //!< Physical device
hsa_queue_t* gpu_queue_; //!< Queue associated with a gpu
hsa_barrier_and_packet_t barrier_packet_;
hsa_amd_barrier_value_packet_t barrier_value_packet_;

uint32_t dispatch_id_; //!< This variable must be updated atomically.
Device& roc_device_; //!< roc device object
Expand Down