Skip to content

Commit caa0094

Browse files
authored
backends/cuda: use async malloc/free (#14976)
Profiling with Linux perf revealed a device-wide synchronize (cudaDeviceSynchronize) inside aoti_torch_delete_tensor_object. This change switches the CUDA allocation path from cudaMallocManaged + cudaDeviceSynchronize/cudaFree to the stream-ordered cudaMallocAsync/cudaFreeAsync, and appears to significantly improve the self-reported latency from voxtral_runner, as run in https://github.com/pytorch/executorch/blob/main/.github/workflows/cuda.yml#L111-L172:
Baseline: Run latency (ms): audio_encoder: 575.797, token_embedding: 14.571, text_decoder: 3095.356.
With this PR: Run latency (ms): audio_encoder: 175.807, token_embedding: 8.799, text_decoder: 344.367.
1 parent 7395999 commit caa0094

File tree

1 file changed

+8
-5
lines changed

1 file changed

+8
-5
lines changed

backends/cuda/runtime/shims/memory.cpp

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -225,7 +225,7 @@ AOTITorchError aoti_torch_empty_strided(
225225

226226
if (device_type == static_cast<int32_t>(SupportedDevices::CUDA)) {
227227
ET_CUDA_CHECK_OR_RETURN_ERROR(
228-
cudaMallocManaged(&ptr, static_cast<size_t>(nbytes)));
228+
cudaMallocAsync(&ptr, static_cast<size_t>(nbytes), cudaStreamDefault));
229229
} else if (device_type == static_cast<int32_t>(SupportedDevices::CPU)) {
230230
// Ensure 16-byte alignment for CPU memory to match CUDA requirements
231231
int result = posix_memalign(&ptr, 16, nbytes);
@@ -328,11 +328,14 @@ AOTITorchError aoti_torch_delete_tensor_object(Tensor* tensor) {
328328
ET_CUDA_CHECK_OR_RETURN_ERROR(
329329
cudaPointerGetAttributes(&attributes, data_ptr));
330330

331-
if (attributes.type == cudaMemoryTypeManaged) {
332-
// This is CUDA managed memory - free with proper synchronization
333-
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaDeviceSynchronize());
334-
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaFree(data_ptr));
331+
if (attributes.type == cudaMemoryTypeDevice) {
332+
ET_CUDA_CHECK_OR_RETURN_ERROR(
333+
cudaFreeAsync(data_ptr, cudaStreamDefault));
335334
} else {
335+
ET_CHECK_OR_RETURN_ERROR(
336+
attributes.type != cudaMemoryTypeManaged,
337+
Internal,
338+
"Expected host memory but got managed!")
336339
// This is CPU memory - free immediately
337340
free(data_ptr);
338341
data_ptr = nullptr;

0 commit comments

Comments
 (0)