From 11cda623712b7c12eabe197c5d553b28890b01b6 Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Wed, 6 Aug 2025 18:25:01 +0000
Subject: [PATCH 1/5] Cherry-picked commit with merge conflict

---
 torch/testing/_internal/common_utils.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index f7fd0dab128e..2726344866a4 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1894,9 +1894,26 @@ def dec_fn(fn):
         @wraps(fn)
         def wrap_fn(self, *args, **kwargs):
             if TEST_WITH_ROCM:
+<<<<<<< HEAD
                 prop = torch.cuda.get_device_properties(0)
                 if prop.gcnArchName.split(":")[0] in arch:
                     reason = f"skipIfRocm: test skipped on {arch}"
+=======
+                device = torch.cuda.current_device()
+                props = torch.cuda.get_device_properties(device)
+
+                total = props.total_memory / (1024 ** 3) # in GB
+                # This will probably return 0 because it only counts tensors
+                # and doesn't take into account any small supporting allocations
+                allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)
+                free_global = total - allocated
+
+                result = free_global > required_amount
+
+                if not result:
+                    reason = f"skipIfRocm: Not enough free VRAM on current ROCm device. " \
+                        f"Available: {free_global:.2f} GB | Required: {required_amount:.2f} GB."
+>>>>>>> f78730679a1 (Formatting code style)
                     raise unittest.SkipTest(reason)
             return fn(self, *args, **kwargs)
         return wrap_fn

From 7d7066fdd88367b9bdbf13eb8b49dcd28e672c15 Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Thu, 7 Aug 2025 08:59:59 +0000
Subject: [PATCH 2/5] Fixed merge issues

---
 torch/testing/_internal/common_utils.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 2726344866a4..4ea5c20ae58d 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1894,14 +1894,23 @@ def dec_fn(fn):
         @wraps(fn)
         def wrap_fn(self, *args, **kwargs):
             if TEST_WITH_ROCM:
-<<<<<<< HEAD
                 prop = torch.cuda.get_device_properties(0)
                 if prop.gcnArchName.split(":")[0] in arch:
                     reason = f"skipIfRocm: test skipped on {arch}"
-=======
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
+# Checks if current ROCm device has enough VRAM against the required amount in GB
+def skipIfRocmNotEnoughMemory(required_amount):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
                 device = torch.cuda.current_device()
                 props = torch.cuda.get_device_properties(device)
-
+
                 total = props.total_memory / (1024 ** 3) # in GB
                 # This will probably return 0 because it only counts tensors
                 # and doesn't take into account any small supporting allocations
                 allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)
                 free_global = total - allocated
 
                 result = free_global > required_amount
 
                 if not result:
                     reason = f"skipIfRocm: Not enough free VRAM on current ROCm device. " \
                         f"Available: {free_global:.2f} GB | Required: {required_amount:.2f} GB."
->>>>>>> f78730679a1 (Formatting code style)
                     raise unittest.SkipTest(reason)
             return fn(self, *args, **kwargs)
         return wrap_fn

From 446bd8dd7ba852fe50b50ca0cd545421e3ba69ce Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Thu, 7 Aug 2025 10:05:51 +0000
Subject: [PATCH 3/5] Added skips for devices with not enough memory

---
 test/inductor/test_max_autotune.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py
index 741353fdbf5d..926736adb691 100644
--- a/test/inductor/test_max_autotune.py
+++ b/test/inductor/test_max_autotune.py
@@ -32,6 +32,7 @@
     IS_WINDOWS,
     parametrize,
     TEST_WITH_ROCM,
+    skipIfRocmNotEnoughMemory,
 )
 from torch.utils._triton import has_triton_tma_device
 
@@ -981,6 +982,8 @@ def test_conv_backend(self):
 
         self.assertIn("NoValidChoicesError", str(context.exception))
 
+    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
+    @skipIfRocmNotEnoughMemory(30)
     def test_non_contiguous_input_mm(self):
         """
         Make sure the triton template can work with non-contiguous inputs without crash.
@@ -1033,6 +1036,8 @@ def f(x, y):
     # TODO: fix accuracy failure of the triton template on XPU.
     # and enable this test case.
     @skipIfXpu
+    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
+    @skipIfRocmNotEnoughMemory(30)
     def test_non_contiguous_input_mm_plus_mm(self):
         x1 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE)
         y1 = rand_strided((32768, 768), (768, 1), device=GPU_TYPE)

From 1bd78a8f2194efc60e1fbb272de8e178e2593a17 Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Thu, 7 Aug 2025 10:14:51 +0000
Subject: [PATCH 4/5] Added a skip for one sdpa test on unsupported devices

---
 test/inductor/test_flex_decoding.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/inductor/test_flex_decoding.py b/test/inductor/test_flex_decoding.py
index 098ebf35fbf6..8b1a8fb37c37 100644
--- a/test/inductor/test_flex_decoding.py
+++ b/test/inductor/test_flex_decoding.py
@@ -21,7 +21,10 @@
 )
 from torch.testing import FileCheck
 from torch.testing._internal import common_utils
-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_BF16,
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
+)
 from torch.testing._internal.common_utils import skipIfRocm
 from torch.utils._triton import has_triton
 
@@ -1421,6 +1424,7 @@ def mask_mod(b, h, q, kv):
         self.assertEqual(query.grad[:, :, M:, :].sum(), 0)
 
     @supported_platform
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_no_mask_vs_sdpa(self):
         score_mod = _generate_windowed(1000)
         attention = functools.partial(flex_attention, score_mod=score_mod)

From fc7a5c3a573331617989677196b3ae3a65cf1ea7 Mon Sep 17 00:00:00 2001
From: iupaikov-amd
Date: Thu, 14 Aug 2025 11:40:38 +0000
Subject: [PATCH 5/5] Used general decorator instead of ROCm specific

---
 test/inductor/test_max_autotune.py      | 13 +++++++++----
 torch/testing/_internal/common_utils.py | 25 -------------------------
 2 files changed, 9 insertions(+), 29 deletions(-)

diff --git a/test/inductor/test_max_autotune.py b/test/inductor/test_max_autotune.py
index 926736adb691..3be2e6983ba3 100644
--- a/test/inductor/test_max_autotune.py
+++ b/test/inductor/test_max_autotune.py
@@ -27,12 +27,12 @@
     TritonTemplateCaller,
 )
 from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8
+from torch.testing._internal.common_device_type import largeTensorTest
 from torch.testing._internal.common_utils import (
     instantiate_parametrized_tests,
     IS_WINDOWS,
     parametrize,
     TEST_WITH_ROCM,
-    skipIfRocmNotEnoughMemory,
 )
 from torch.utils._triton import has_triton_tma_device
 
@@ -45,7 +45,12 @@
 from torch.fx.experimental.proxy_tensor import make_fx
 from torch.testing import FileCheck
 from torch.testing._internal.common_utils import skipIfRocm, skipIfXpu
-from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_CUDA, HAS_GPU
+from torch.testing._internal.inductor_utils import (
+    GPU_TYPE,
+    HAS_CPU,
+    HAS_CUDA,
+    HAS_GPU,
+)
 
 
 torch.set_float32_matmul_precision("high")
@@ -983,7 +988,7 @@ def test_conv_backend(self):
         self.assertIn("NoValidChoicesError", str(context.exception))
 
     # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
-    @skipIfRocmNotEnoughMemory(30)
+    @largeTensorTest("30 GB", device=GPU_TYPE)
     def test_non_contiguous_input_mm(self):
         """
         Make sure the triton template can work with non-contiguous inputs without crash.
@@ -1037,7 +1042,7 @@ def f(x, y):
     # and enable this test case.
     @skipIfXpu
     # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
-    @skipIfRocmNotEnoughMemory(30)
+    @largeTensorTest("30 GB", device=GPU_TYPE)
     def test_non_contiguous_input_mm_plus_mm(self):
         x1 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE)
         y1 = rand_strided((32768, 768), (768, 1), device=GPU_TYPE)

diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 4ea5c20ae58d..f7fd0dab128e 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -1902,31 +1902,6 @@ def wrap_fn(self, *args, **kwargs):
         return wrap_fn
     return dec_fn
 
-# Checks if current ROCm device has enough VRAM against the required amount in GB
-def skipIfRocmNotEnoughMemory(required_amount):
-    def dec_fn(fn):
-        @wraps(fn)
-        def wrap_fn(self, *args, **kwargs):
-            if TEST_WITH_ROCM:
-                device = torch.cuda.current_device()
-                props = torch.cuda.get_device_properties(device)
-
-                total = props.total_memory / (1024 ** 3) # in GB
-                # This will probably return 0 because it only counts tensors
-                # and doesn't take into account any small supporting allocations
-                allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)
-                free_global = total - allocated
-
-                result = free_global > required_amount
-
-                if not result:
-                    reason = f"skipIfRocm: Not enough free VRAM on current ROCm device. " \
-                        f"Available: {free_global:.2f} GB | Required: {required_amount:.2f} GB."
-                    raise unittest.SkipTest(reason)
-            return fn(self, *args, **kwargs)
-        return wrap_fn
-    return dec_fn
-
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
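
Note (not part of the patches above): a minimal usage sketch of largeTensorTest, the stock decorator the final patch adopts in place of the ROCm-specific skipIfRocmNotEnoughMemory. The test class and body below are illustrative assumptions, not code from the series; the patches themselves pass device=GPU_TYPE from torch.testing._internal.inductor_utils, while this sketch hard-codes "cuda" for simplicity and assumes a CUDA- or ROCm-capable build.

    import torch
    from torch.testing._internal.common_device_type import largeTensorTest
    from torch.testing._internal.common_utils import run_tests, TestCase


    class ExampleMemoryGatedTest(TestCase):  # hypothetical test class, for illustration only
        # Skips (rather than fails) when the device cannot provide roughly 30 GB of memory,
        # mirroring how the final patch gates the max-autotune tests.
        @largeTensorTest("30 GB", device="cuda")
        def test_memory_hungry_mm(self):
            x = torch.rand(1024, 1024, device="cuda")
            self.assertEqual((x @ x).shape, (1024, 1024))


    if __name__ == "__main__":
        run_tests()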