From 11cda623712b7c12eabe197c5d553b28890b01b6 Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Wed, 6 Aug 2025 18:25:01 +0000
Subject: [PATCH 1/5] Cherry-picked commit with merge conflict

---
 torch/testing/_internal/common_utils.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index f7fd0dab128e..2726344866a4 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1894,9 +1894,26 @@ def dec_fn(fn):
         @wraps(fn)
         def wrap_fn(self, *args, **kwargs):
             if TEST_WITH_ROCM:
+<<<<<<< HEAD
                 prop = torch.cuda.get_device_properties(0)
                 if prop.gcnArchName.split(":")[0] in arch:
                     reason = f"skipIfRocm: test skipped on {arch}"
+=======
+                device = torch.cuda.current_device()
+                props = torch.cuda.get_device_properties(device)
+
+                total = props.total_memory / (1024 ** 3) # in GB
+                # This will probably return 0 because it only counts tensors
+                # and doesn't take into account any small supporting allocations
+                allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)
+                free_global = total - allocated
+
+                result = free_global > required_amount
+
+                if not result:
+                    reason = f"skipIfRocm: Not enough free VRAM on current ROCm device. " \
+                        f"Available: {free_global:.2f} GB | Required: {required_amount:.2f} GB."
+>>>>>>> f78730679a1 (Formatting code style)
                     raise unittest.SkipTest(reason)
             return fn(self, *args, **kwargs)
         return wrap_fn

From 7d7066fdd88367b9bdbf13eb8b49dcd28e672c15 Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Thu, 7 Aug 2025 08:59:59 +0000
Subject: [PATCH 2/5] Fixed merge issues

---
 torch/testing/_internal/common_utils.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 2726344866a4..4ea5c20ae58d 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1894,14 +1894,23 @@ def dec_fn(fn):
         @wraps(fn)
         def wrap_fn(self, *args, **kwargs):
             if TEST_WITH_ROCM:
-<<<<<<< HEAD
                 prop = torch.cuda.get_device_properties(0)
                 if prop.gcnArchName.split(":")[0] in arch:
                     reason = f"skipIfRocm: test skipped on {arch}"
-=======
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
+# Checks if current ROCm device has enough VRAM against the required amount in GB
+def skipIfRocmNotEnoughMemory(required_amount):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
                 device = torch.cuda.current_device()
                 props = torch.cuda.get_device_properties(device)
-
+
                 total = props.total_memory / (1024 ** 3) # in GB
                 # This will probably return 0 because it only counts tensors
                 # and doesn't take into account any small supporting allocations
                 allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)
                 free_global = total - allocated
 
                 result = free_global > required_amount
 
                 if not result:
                     reason = f"skipIfRocm: Not enough free VRAM on current ROCm device. " \
                         f"Available: {free_global:.2f} GB | Required: {required_amount:.2f} GB."
->>>>>>> f78730679a1 (Formatting code style)
                     raise unittest.SkipTest(reason)
             return fn(self, *args, **kwargs)
         return wrap_fn

From 446bd8dd7ba852fe50b50ca0cd545421e3ba69ce Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Thu, 7 Aug 2025 10:05:51 +0000
Subject: [PATCH 3/5] Added skips for devices with not enough memory

---
 test/inductor/test_max_autotune.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py
index 741353fdbf5d..926736adb691 100644
--- a/test/inductor/test_max_autotune.py
+++ b/test/inductor/test_max_autotune.py
@@ -32,6 +32,7 @@
     IS_WINDOWS,
     parametrize,
     TEST_WITH_ROCM,
+    skipIfRocmNotEnoughMemory,
 )
 from torch.utils._triton import has_triton_tma_device
 
@@ -981,6 +982,8 @@ def test_conv_backend(self):
 
         self.assertIn("NoValidChoicesError", str(context.exception))
 
+    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
+    @skipIfRocmNotEnoughMemory(30)
     def test_non_contiguous_input_mm(self):
         """
         Make sure the triton template can work with non-contiguous inputs without crash.
@@ -1033,6 +1036,8 @@ def f(x, y):
     # TODO: fix accuracy failure of the triton template on XPU.
     # and enable this test case.
     @skipIfXpu
+    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
+    @skipIfRocmNotEnoughMemory(30)
     def test_non_contiguous_input_mm_plus_mm(self):
         x1 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE)
         y1 = rand_strided((32768, 768), (768, 1), device=GPU_TYPE)

From 1bd78a8f2194efc60e1fbb272de8e178e2593a17 Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Thu, 7 Aug 2025 10:14:51 +0000
Subject: [PATCH 4/5] Added a skip for one sdpa test on unsupported devices

---
 test/inductor/test_flex_decoding.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/inductor/test_flex_decoding.py b/test/inductor/test_flex_decoding.py
index 098ebf35fbf6..8b1a8fb37c37 100644
--- a/test/inductor/test_flex_decoding.py
+++ b/test/inductor/test_flex_decoding.py
@@ -21,7 +21,10 @@
 )
 from torch.testing import FileCheck
 from torch.testing._internal import common_utils
-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_BF16,
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
+)
 from torch.testing._internal.common_utils import skipIfRocm
 from torch.utils._triton import has_triton
 
@@ -1421,6 +1424,7 @@ def mask_mod(b, h, q, kv):
         self.assertEqual(query.grad[:, :, M:, :].sum(), 0)
 
     @supported_platform
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_no_mask_vs_sdpa(self):
         score_mod = _generate_windowed(1000)
         attention = functools.partial(flex_attention, score_mod=score_mod)

From fc7a5c3a573331617989677196b3ae3a65cf1ea7 Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Thu, 14 Aug 2025 11:40:38 +0000
Subject: [PATCH 5/5] Used general decorator instead of ROCm specific

---
 test/inductor/test_max_autotune.py      | 13 +++++++++----
 torch/testing/_internal/common_utils.py | 25 -------------------------
 2 files changed, 9 insertions(+), 29 deletions(-)

diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py
index 926736adb691..3be2e6983ba3 100644
--- a/test/inductor/test_max_autotune.py
+++ b/test/inductor/test_max_autotune.py
@@ -27,12 +27,12 @@
     TritonTemplateCaller,
 )
 from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
+from torch.testing._internal.common_device_type import largeTensorTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     IS_WINDOWS,
     parametrize,
     TEST_WITH_ROCM,
-    skipIfRocmNotEnoughMemory,
 )
 from torch.utils._triton import has_triton_tma_device
 
@@ -45,7 +45,12 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.testing import FileCheck
 from torch.testing._internal.common_utils import skipIfRocm, skipIfXpu
-from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_CUDA, HAS_GPU
+from torch.testing._internal.inductor_utils import (
+    GPU_TYPE,
+    HAS_CPU,
+    HAS_CUDA,
+    HAS_GPU,
+)
 
 
 torch.set_float32_matmul_precision("high")
@@ -983,7 +988,7 @@ def test_conv_backend(self):
         self.assertIn("NoValidChoicesError", str(context.exception))
 
     # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
-    @skipIfRocmNotEnoughMemory(30)
+    @largeTensorTest("30 GB", device=GPU_TYPE)
     def test_non_contiguous_input_mm(self):
         """
         Make sure the triton template can work with non-contiguous inputs without crash.
@@ -1037,7 +1042,7 @@ def f(x, y):
     # and enable this test case.
     @skipIfXpu
     # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
-    @skipIfRocmNotEnoughMemory(30)
+    @largeTensorTest("30 GB", device=GPU_TYPE)
     def test_non_contiguous_input_mm_plus_mm(self):
         x1 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE)
         y1 = rand_strided((32768, 768), (768, 1), device=GPU_TYPE)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 4ea5c20ae58d..f7fd0dab128e 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1902,31 +1902,6 @@ def wrap_fn(self, *args, **kwargs):
         return wrap_fn
     return dec_fn
 
-# Checks if current ROCm device has enough VRAM against the required amount in GB
-def skipIfRocmNotEnoughMemory(required_amount):
-    def dec_fn(fn):
-        @wraps(fn)
-        def wrap_fn(self, *args, **kwargs):
-            if TEST_WITH_ROCM:
-                device = torch.cuda.current_device()
-                props = torch.cuda.get_device_properties(device)
-
-                total = props.total_memory / (1024 ** 3) # in GB
-                # This will probably return 0 because it only counts tensors
-                # and doesn't take into account any small supporting allocations
-                allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)
-                free_global = total - allocated
-
-                result = free_global > required_amount
-
-                if not result:
-                    reason = f"skipIfRocm: Not enough free VRAM on current ROCm device. " \
-                        f"Available: {free_global:.2f} GB | Required: {required_amount:.2f} GB."
-                    raise unittest.SkipTest(reason)
-            return fn(self, *args, **kwargs)
-        return wrap_fn
-    return dec_fn
-
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
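
Note (not part of the patches above): a minimal usage sketch of largeTensorTest, the stock decorator the final patch adopts in place of the ROCm-specific skipIfRocmNotEnoughMemory. The test class and body below are illustrative assumptions, not code from the series; the patches themselves pass device=GPU_TYPE from torch.testing._internal.inductor_utils, while this sketch hard-codes "cuda" for simplicity and assumes a CUDA- or ROCm-capable build.

    import torch
    from torch.testing._internal.common_device_type import largeTensorTest
    from torch.testing._internal.common_utils import run_tests, TestCase


    class ExampleMemoryGatedTest(TestCase):  # hypothetical test class, for illustration only
        # Skips (rather than fails) when the device cannot provide roughly 30 GB of memory,
        # mirroring how the final patch gates the max-autotune tests.
        @largeTensorTest("30 GB", device="cuda")
        def test_memory_hungry_mm(self):
            x = torch.rand(1024, 1024, device="cuda")
            self.assertEqual((x @ x).shape, (1024, 1024))


    if __name__ == "__main__":
        run_tests()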