[AUTOGENERATED] [release/2.7] NAVI32 specific fixes #2466

Draft: wants to merge 4 commits into base: release/2.7

6 changes: 5 additions & 1 deletion test/inductor/test_flex_decoding.py
@@ -21,7 +21,10 @@
 )
 from torch.testing import FileCheck
 from torch.testing._internal import common_utils
-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_BF16
+from torch.testing._internal.common_cuda import (
+    PLATFORM_SUPPORTS_BF16,
+    PLATFORM_SUPPORTS_FLASH_ATTENTION,
+)
 from torch.testing._internal.common_utils import skipIfRocm
 from torch.utils._triton import has_triton

@@ -1421,6 +1424,7 @@ def mask_mod(b, h, q, kv):
         self.assertEqual(query.grad[:, :, M:, :].sum(), 0)

     @supported_platform
+    @unittest.skipIf(not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support SDPA")
     def test_windowed_no_mask_vs_sdpa(self):
         score_mod = _generate_windowed(1000)
         attention = functools.partial(flex_attention, score_mod=score_mod)
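Note on the gate above: PLATFORM_SUPPORTS_FLASH_ATTENTION comes from torch.testing._internal.common_cuda and reflects whether the flash-attention SDPA backend is usable on the current platform. As a rough illustration only (this is not how the flag is defined), a runtime probe with similar intent could look like:

```python
# Hypothetical probe, not part of this PR: ask whether the flash-attention
# SDPA backend actually runs on the current device.
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

def flash_sdpa_works(device="cuda", dtype=torch.float16):
    q = torch.randn(1, 1, 8, 64, device=device, dtype=dtype)
    try:
        # Restrict SDPA to the flash backend; unsupported archs raise RuntimeError.
        with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
            torch.nn.functional.scaled_dot_product_attention(q, q, q)
        return True
    except RuntimeError:
        return False
```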
5 changes: 5 additions & 0 deletions test/inductor/test_max_autotune.py
@@ -32,6 +32,7 @@
     IS_WINDOWS,
     parametrize,
+    skipIfRocmNotEnoughMemory,
     TEST_WITH_ROCM,
 )
 from torch.utils._triton import has_triton_tma_device

@@ -981,6 +982,8 @@ def test_conv_backend(self):

         self.assertIn("NoValidChoicesError", str(context.exception))

+    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
+    @skipIfRocmNotEnoughMemory(30)
     def test_non_contiguous_input_mm(self):
         """
         Make sure the triton template can work with non-contiguous inputs without crash.
@@ -1033,6 +1036,8 @@ def f(x, y):
     # TODO: fix accuracy failure of the triton template on XPU.
     # and enable this test case.
     @skipIfXpu
+    # Some ROCm GPUs don't have enough VRAM to run all autotune configurations and padding benchmarks
+    @skipIfRocmNotEnoughMemory(30)
     def test_non_contiguous_input_mm_plus_mm(self):
         x1 = rand_strided((50257, 32768), (1, 50304), device=GPU_TYPE)
         y1 = rand_strided((32768, 768), (768, 1), device=GPU_TYPE)
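For a sense of scale, here is a back-of-the-envelope estimate (mine, not from the PR) of why roughly 30 GB of free VRAM is requested for these tests:

```python
# Rough sizing of test_non_contiguous_input_mm_plus_mm's inputs (fp32, 4 bytes each).
# rand_strided((50257, 32768), (1, 50304)) needs storage for about 50304 * 32768 elements.
gb = 1024 ** 3
x = 50304 * 32768 * 4 / gb   # ~6.6 GB per strided input tensor
y = 32768 * 768 * 4 / gb     # ~0.1 GB
out = 50257 * 768 * 4 / gb   # ~0.15 GB
print(f"one mm touches ~{x + y + out:.1f} GB")
# Max-autotune benchmarks many Triton configs and also pads/copies inputs,
# multiplying the working set several times over, hence the ~30 GB bar.
```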
25 changes: 25 additions & 0 deletions torch/testing/_internal/common_utils.py
@@ -1902,6 +1902,31 @@ def wrap_fn(self, *args, **kwargs):
         return wrap_fn
     return dec_fn

+# Skip on ROCm when the current device has less free VRAM than required_amount (in GB).
+def skipIfRocmNotEnoughMemory(required_amount):
+    def dec_fn(fn):
+        @wraps(fn)
+        def wrap_fn(self, *args, **kwargs):
+            if TEST_WITH_ROCM:
+                device = torch.cuda.current_device()
+                props = torch.cuda.get_device_properties(device)
+
+                total = props.total_memory / (1024 ** 3)  # in GB
+                # memory_allocated() counts only live tensors, so it is usually
+                # ~0 here and ignores small supporting allocations.
+                allocated = torch.cuda.memory_allocated(device) / (1024 ** 3)
+                free_global = total - allocated
+
+                if free_global <= required_amount:
+                    reason = (
+                        f"skipIfRocmNotEnoughMemory: Not enough free VRAM on the current ROCm "
+                        f"device. Available: {free_global:.2f} GB | Required: {required_amount:.2f} GB."
+                    )
+                    raise unittest.SkipTest(reason)
+            return fn(self, *args, **kwargs)
+        return wrap_fn
+    return dec_fn
+
 def runOnRocm(fn):
     @wraps(fn)
     def wrapper(*args, **kwargs):
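A minimal usage sketch of the new decorator (test class, test name, and threshold below are hypothetical, not from this PR). Because the check runs inside the wrapped test rather than at collection time via unittest.skipIf, the free-VRAM probe happens lazily on the device the test actually sees:

```python
# Hypothetical example; skipIfRocmNotEnoughMemory is the decorator added above.
from torch.testing._internal.common_utils import (
    TestCase,
    run_tests,
    skipIfRocmNotEnoughMemory,
)

class TestBigWorkloads(TestCase):
    @skipIfRocmNotEnoughMemory(30)  # skipped on ROCm devices with <= 30 GB free
    def test_huge_matmul(self):
        ...  # body elided

if __name__ == "__main__":
    run_tests()
```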