diff --git a/.automation_scripts/parse_xml_results.py b/.automation_scripts/parse_xml_results.py
new file mode 100644
index 000000000000..7db2e1ce9233
--- /dev/null
+++ b/.automation_scripts/parse_xml_results.py
@@ -0,0 +1,178 @@
+""" The Python PyTorch testing script.
+##
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+"""
+
+import xml.etree.ElementTree as ET
+from pathlib import Path
+from typing import Any, Dict, Tuple
+
+# Backends list
+BACKENDS_LIST = [
+ "dist-gloo",
+ "dist-nccl"
+]
+
+TARGET_WORKFLOW = "--rerun-disabled-tests"
+
+def get_job_id(report: Path) -> int:
+ # [Job id in artifacts]
+ # Retrieve the job id from the report path. In our GHA workflows, we append
+ # the job id to the end of the report name, so `report` looks like:
+ # unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml
+ # and we want to get `5596745227` out of it.
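+ # e.g. get_job_id(Path("unzipped-test-reports-foo_5596745227/test/test-reports/foo/TEST-foo.xml")) -> 5596745227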
+ try:
+ return int(report.parts[0].rpartition("_")[2])
+ except ValueError:
+ return -1
+
+def is_rerun_disabled_tests(root: ET.ElementTree) -> bool:
+ """
+ Check if the test report comes from the rerun_disabled_tests workflow
+ """
+ skipped = root.find(".//*skipped")
+ # Need to check against None explicitly; `if not skipped` is also true for an empty element
+ if skipped is None:
+ return False
+
+ message = skipped.attrib.get("message", "")
+ return TARGET_WORKFLOW in message or "num_red" in message
+
+def parse_xml_report(
+ tag: str,
+ report: Path,
+ workflow_id: int,
+ workflow_run_attempt: int,
+ work_flow_name: str
+) -> Dict[Tuple[str], Dict[str, Any]]:
+ """Convert a test report xml file into a JSON-serializable list of test cases."""
+ print(f"Parsing {tag}s for test report: {report}")
+
+ job_id = get_job_id(report)
+ print(f"Found job id: {job_id}")
+
+ test_cases: Dict[Tuple[str], Dict[str, Any]] = {}
+
+ root = ET.parse(report)
+ # TODO: unlike unittest, pytest-flakefinder used by rerun disabled tests for test_ops
+ # includes skipped messages multiple times (50 times by default). This slows down
+ # this script too much (O(n)) because it tries to gather all the stats. This should
+ # be fixed later in the way we use pytest-flakefinder. A zipped test report from rerun
+ # disabled tests is only a few MB, but balloons to anywhere from a dozen MB to a few
+ # hundred MB of XML once extracted
+ if is_rerun_disabled_tests(root):
+ return test_cases
+
+ for test_case in root.iter(tag):
+ case = process_xml_element(test_case)
+ if tag == 'testcase':
+ case["workflow_id"] = workflow_id
+ case["workflow_run_attempt"] = workflow_run_attempt
+ case["job_id"] = job_id
+ case["work_flow_name"] = work_flow_name
+
+ # [invoking file]
+ # The name of the file that the test is located in is not necessarily
+ # the same as the name of the file that invoked the test.
+ # For example, `test_jit.py` calls into multiple other test files (e.g.
+ # jit/test_dce.py). For sharding/test selection purposes, we want to
+ # record the file that invoked the test.
+ #
+ # To do this, we leverage an implementation detail of how we write out
+ # tests (https://bit.ly/3ajEV1M), which is that reports are created
+ # under a folder with the same name as the invoking file.
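+ # e.g. a report located under a "dist-gloo" directory for test_distributed_spawn would get
+ # invoking_file "test_distributed_spawn_dist-gloo" (path layout here is illustrative only)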
+ case_name = report.parent.name
+ for backend in BACKENDS_LIST:
+ if backend in report.parts:
+ case_name = case_name + "_" + backend
+ break
+ case["invoking_file"] = case_name
+ test_cases[(case["invoking_file"], case["classname"], case["name"], case["work_flow_name"])] = case
+ elif tag == 'testsuite':
+ case["work_flow_name"] = work_flow_name
+ case["invoking_xml"] = report.name
+ case["running_time_xml"] = case["time"]
+ case_name = report.parent.name
+ for backend in BACKENDS_LIST:
+ if backend in report.parts:
+ case_name = case_name + "_" + backend
+ break
+ case["invoking_file"] = case_name
+
+ test_cases[(case["invoking_file"], case["invoking_xml"], case["work_flow_name"])] = case
+
+ return test_cases
+
+def process_xml_element(element: ET.Element) -> Dict[str, Any]:
+ """Convert a test suite element into a JSON-serializable dict."""
+ ret: Dict[str, Any] = {}
+
+ # Convert attributes directly into dict elements.
+ # e.g.
+ # <testcase name="test_foo" classname="test_bar"></testcase>
+ # becomes:
+ # {"name": "test_foo", "classname": "test_bar"}
+ ret.update(element.attrib)
+
+ # The XML format encodes all values as strings. Convert to ints/floats if
+ # possible to make aggregation possible in Rockset.
+ for k, v in ret.items():
+ try:
+ ret[k] = int(v)
+ except ValueError:
+ pass
+ try:
+ ret[k] = float(v)
+ except ValueError:
+ pass
+
+ # Convert inner and outer text into special dict elements.
+ # e.g.
+ # <testcase>my_inner_text</testcase> my_tail
+ # becomes:
+ # {"text": "my_inner_text", "tail": " my_tail"}
+ if element.text and element.text.strip():
+ ret["text"] = element.text
+ if element.tail and element.tail.strip():
+ ret["tail"] = element.tail
+
+ # Convert child elements recursively, placing them at a key:
+ # e.g.
+ # <testcase>
+ #   <foo>hello</foo>
+ #   <foo>world</foo>
+ #   <bar>another</bar>
+ # </testcase>
+ # becomes
+ # {
+ # "foo": [{"text": "hello"}, {"text": "world"}],
+ # "bar": {"text": "another"}
+ # }
+ for child in element:
+ if child.tag not in ret:
+ ret[child.tag] = process_xml_element(child)
+ else:
+ # If there are multiple tags with the same name, they should be
+ # coalesced into a list.
+ if not isinstance(ret[child.tag], list):
+ ret[child.tag] = [ret[child.tag]]
+ ret[child.tag].append(process_xml_element(child))
+ return ret
\ No newline at end of file
diff --git a/.automation_scripts/run_pytorch_unit_tests.py b/.automation_scripts/run_pytorch_unit_tests.py
new file mode 100644
index 000000000000..514afd19624c
--- /dev/null
+++ b/.automation_scripts/run_pytorch_unit_tests.py
@@ -0,0 +1,518 @@
+#!/usr/bin/env python3
+
+""" The Python PyTorch testing script.
+##
+# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+"""
+
+import argparse
+import os
+import shutil
+import subprocess
+from subprocess import STDOUT, CalledProcessError
+
+from collections import namedtuple
+from datetime import datetime
+from pathlib import Path
+from parse_xml_results import (
+ parse_xml_report
+)
+from pprint import pprint
+from typing import Any, Dict, List
+
+# unit test status list
+UT_STATUS_LIST = [
+ "PASSED",
+ "MISSED",
+ "SKIPPED",
+ "FAILED",
+ "XFAILED",
+ "ERROR"
+]
+
+DEFAULT_CORE_TESTS = [
+ "test_nn",
+ "test_torch",
+ "test_cuda",
+ "test_ops",
+ "test_unary_ufuncs",
+ "test_autograd",
+ "inductor/test_torchinductor"
+]
+
+DISTRIBUTED_CORE_TESTS = [
+ "distributed/test_c10d_common",
+ "distributed/test_c10d_nccl",
+ "distributed/test_distributed_spawn"
+]
+
+CONSOLIDATED_LOG_FILE_NAME="pytorch_unit_tests.log"
+
+def parse_xml_reports_as_dict(workflow_run_id, workflow_run_attempt, tag, workflow_name, path="."):
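+ """Scan each first-level directory under `path` and merge all parsed XML reports (at `tag` granularity) into a single dict."""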
+ test_cases = {}
+ items_list = os.listdir(path)
+ for item in items_list:
+ new_dir = os.path.join(path, item)
+ if os.path.isdir(new_dir):
+ for xml_report in Path(new_dir).glob("**/*.xml"):
+ test_cases.update(
+ parse_xml_report(
+ tag,
+ xml_report,
+ workflow_run_id,
+ workflow_run_attempt,
+ workflow_name
+ )
+ )
+ return test_cases
+
+def get_test_status(test_case):
+ # In order of priority: S=skipped, F=failure, E=error, P=pass
+ if "skipped" in test_case and test_case["skipped"]:
+ type_message = test_case["skipped"]
+ if 'type' in type_message and type_message['type'] == "pytest.xfail":
+ return "XFAILED"
+ else:
+ return "SKIPPED"
+ elif "failure" in test_case and test_case["failure"]:
+ return "FAILED"
+ elif "error" in test_case and test_case["error"]:
+ return "ERROR"
+ else:
+ return "PASSED"
+
+def get_test_message(test_case, status=None):
+ if status == "SKIPPED":
+ return test_case["skipped"] if "skipped" in test_case else ""
+ elif status == "FAILED":
+ return test_case["failure"] if "failure" in test_case else ""
+ elif status == "ERROR":
+ return test_case["error"] if "error" in test_case else ""
+ else:
+ if "skipped" in test_case:
+ return test_case["skipped"]
+ elif "failure" in test_case:
+ return test_case["failure"]
+ elif "error" in test_case:
+ return test_case["error"]
+ else:
+ return ""
+
+def get_test_file_running_time(test_suite):
+ if 'time' in test_suite:
+ return test_suite["time"]
+ return 0
+
+def get_test_running_time(test_case):
+ if 'time' in test_case:
+ return test_case["time"]
+ return ""
+
+def summarize_xml_files(path, workflow_name):
+ # statistics
+ TOTAL_TEST_NUM = 0
+ TOTAL_PASSED_NUM = 0
+ TOTAL_SKIPPED_NUM = 0
+ TOTAL_XFAIL_NUM = 0
+ TOTAL_FAILED_NUM = 0
+ TOTAL_ERROR_NUM = 0
+ TOTAL_EXECUTION_TIME = 0
+
+ #parse the xml files
+ test_cases = parse_xml_reports_as_dict(-1, -1, 'testcase', workflow_name, path)
+ test_suites = parse_xml_reports_as_dict(-1, -1, 'testsuite', workflow_name, path)
+ test_file_and_status = namedtuple("test_file_and_status", ["file_name", "status"])
+ # results dict
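+ # res maps test_file_and_status(file_name, status) -> per-bucket dict; status is one of
+ # PASSED/SKIPPED/XFAILED/FAILED/ERROR (individual tests) or STATISTICS (per-file counters)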
+ res = {}
+ res_item_list = [ "PASSED", "SKIPPED", "XFAILED", "FAILED", "ERROR" ]
+ test_file_items = set()
+ for (k,v) in list(test_suites.items()):
+ file_name = k[0]
+ if file_name not in test_file_items:
+ test_file_items.add(file_name)
+ # initialization
+ for item in res_item_list:
+ temp_item = test_file_and_status(file_name, item)
+ res[temp_item] = {}
+ temp_item_statistics = test_file_and_status(file_name, "STATISTICS")
+ res[temp_item_statistics] = {'TOTAL': 0, 'PASSED': 0, 'SKIPPED': 0, 'XFAILED': 0, 'FAILED': 0, 'ERROR': 0, 'EXECUTION_TIME': 0}
+ test_running_time = get_test_file_running_time(v)
+ res[temp_item_statistics]["EXECUTION_TIME"] += test_running_time
+ TOTAL_EXECUTION_TIME += test_running_time
+ else:
+ test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS")
+ test_running_time = get_test_file_running_time(v)
+ res[test_tuple_key_statistics]["EXECUTION_TIME"] += test_running_time
+ TOTAL_EXECUTION_TIME += test_running_time
+
+ for (k,v) in list(test_cases.items()):
+ file_name = k[0]
+ class_name = k[1]
+ test_name = k[2]
+ combined_name = file_name + "::" + class_name + "::" + test_name
+ test_status = get_test_status(v)
+ test_running_time = get_test_running_time(v)
+ test_message = get_test_message(v, test_status)
+ test_info_value = ""
+ test_tuple_key_status = test_file_and_status(file_name, test_status)
+ test_tuple_key_statistics = test_file_and_status(file_name, "STATISTICS")
+ TOTAL_TEST_NUM += 1
+ res[test_tuple_key_statistics]["TOTAL"] += 1
+ if test_status == "PASSED":
+ test_info_value = str(test_running_time)
+ res[test_tuple_key_status][combined_name] = test_info_value
+ res[test_tuple_key_statistics]["PASSED"] += 1
+ TOTAL_PASSED_NUM += 1
+ elif test_status == "SKIPPED":
+ test_info_value = str(test_running_time)
+ res[test_tuple_key_status][combined_name] = test_info_value
+ res[test_tuple_key_statistics]["SKIPPED"] += 1
+ TOTAL_SKIPPED_NUM += 1
+ elif test_status == "XFAILED":
+ test_info_value = str(test_running_time)
+ res[test_tuple_key_status][combined_name] = test_info_value
+ res[test_tuple_key_statistics]["XFAILED"] += 1
+ TOTAL_XFAIL_NUM += 1
+ elif test_status == "FAILED":
+ test_info_value = test_message
+ res[test_tuple_key_status][combined_name] = test_info_value
+ res[test_tuple_key_statistics]["FAILED"] += 1
+ TOTAL_FAILED_NUM += 1
+ elif test_status == "ERROR":
+ test_info_value = test_message
+ res[test_tuple_key_status][combined_name] = test_info_value
+ res[test_tuple_key_statistics]["ERROR"] += 1
+ TOTAL_ERROR_NUM += 1
+
+ # generate statistics_dict
+ statistics_dict = {}
+ statistics_dict["TOTAL"] = TOTAL_TEST_NUM
+ statistics_dict["PASSED"] = TOTAL_PASSED_NUM
+ statistics_dict["SKIPPED"] = TOTAL_SKIPPED_NUM
+ statistics_dict["XFAILED"] = TOTAL_XFAIL_NUM
+ statistics_dict["FAILED"] = TOTAL_FAILED_NUM
+ statistics_dict["ERROR"] = TOTAL_ERROR_NUM
+ statistics_dict["EXECUTION_TIME"] = TOTAL_EXECUTION_TIME
+ aggregate_item = workflow_name + "_aggregate"
+ total_item = test_file_and_status(aggregate_item, "STATISTICS")
+ res[total_item] = statistics_dict
+
+ return res
+
+def run_command_and_capture_output(cmd):
+ try:
+ print(f"Running command '{cmd}'")
+ with open(CONSOLIDATED_LOG_FILE_PATH, "a+") as output_file:
+ print(f"========================================", file=output_file, flush=True)
+ print(f"[RUN_PYTORCH_UNIT_TESTS] Running command '{cmd}'", file=output_file, flush=True) # send to consolidated file as well
+ print(f"========================================", file=output_file, flush=True)
+ p = subprocess.run(cmd, shell=True, stdout=output_file, stderr=STDOUT, text=True)
+ except CalledProcessError as e:
+ print(f"ERROR: Cmd {cmd} failed with return code: {e.returncode}!")
+
+def run_entire_tests(workflow_name, test_shell_path, overall_logs_path_current_run, test_reports_src):
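+ """Run the full test suite for the given workflow via the test.sh script and summarize the copied XML reports."""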
+ if os.path.exists(test_reports_src):
+ shutil.rmtree(test_reports_src)
+
+ os.mkdir(test_reports_src)
+ copied_logs_path = ""
+ if workflow_name == "default":
+ os.environ['TEST_CONFIG'] = 'default'
+ copied_logs_path = overall_logs_path_current_run + "default_xml_results_entire_tests/"
+ elif workflow_name == "distributed":
+ os.environ['TEST_CONFIG'] = 'distributed'
+ copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_entire_tests/"
+ elif workflow_name == "inductor":
+ os.environ['TEST_CONFIG'] = 'inductor'
+ copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_entire_tests/"
+ # use test.sh for tests execution
+ run_command_and_capture_output(test_shell_path)
+ copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path)
+ entire_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name)
+ return entire_results_dict
+
+def run_priority_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src):
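+ """Run the priority (core) test suites for the given workflow via test/run_test.py and summarize the copied XML reports."""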
+ if os.path.exists(test_reports_src):
+ shutil.rmtree(test_reports_src)
+
+ os.mkdir(test_reports_src)
+ copied_logs_path = ""
+ if workflow_name == "default":
+ os.environ['TEST_CONFIG'] = 'default'
+ os.environ['HIP_VISIBLE_DEVICES'] = '0'
+ copied_logs_path = overall_logs_path_current_run + "default_xml_results_priority_tests/"
+ # use run_test.py for tests execution
+ default_priority_test_suites = " ".join(DEFAULT_CORE_TESTS)
+ command = "python3 " + test_run_test_path + " --include " + default_priority_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
+ run_command_and_capture_output(command)
+ del os.environ['HIP_VISIBLE_DEVICES']
+ elif workflow_name == "distributed":
+ os.environ['TEST_CONFIG'] = 'distributed'
+ os.environ['HIP_VISIBLE_DEVICES'] = '0,1'
+ copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_priority_tests/"
+ # use run_test.py for tests execution
+ distributed_priority_test_suites = " ".join(DISTRIBUTED_CORE_TESTS)
+ command = "python3 " + test_run_test_path + " --include " + distributed_priority_test_suites + " --distributed-tests --verbose"
+ run_command_and_capture_output(command)
+ del os.environ['HIP_VISIBLE_DEVICES']
+ copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path)
+ priority_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name)
+
+ return priority_results_dict
+
+def run_selected_tests(workflow_name, test_run_test_path, overall_logs_path_current_run, test_reports_src, selected_list):
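+ """Run the user-selected test suites for the given workflow via test/run_test.py and summarize the copied XML reports."""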
+ if os.path.exists(test_reports_src):
+ shutil.rmtree(test_reports_src)
+
+ os.mkdir(test_reports_src)
+ copied_logs_path = ""
+ if workflow_name == "default":
+ os.environ['TEST_CONFIG'] = 'default'
+ os.environ['HIP_VISIBLE_DEVICES'] = '0'
+ copied_logs_path = overall_logs_path_current_run + "default_xml_results_selected_tests/"
+ # use run_test.py for tests execution
+ default_selected_test_suites = " ".join(selected_list)
+ command = "python3 " + test_run_test_path + " --include " + default_selected_test_suites + " --exclude-jit-executor --exclude-distributed-tests --verbose"
+ run_command_and_capture_output(command)
+ del os.environ['HIP_VISIBLE_DEVICES']
+ elif workflow_name == "distributed":
+ os.environ['TEST_CONFIG'] = 'distributed'
+ os.environ['HIP_VISIBLE_DEVICES'] = '0,1'
+ copied_logs_path = overall_logs_path_current_run + "distributed_xml_results_selected_tests/"
+ # use run_test.py for tests execution
+ distributed_selected_test_suites = " ".join(selected_list)
+ command = "python3 " + test_run_test_path + " --include " + distributed_selected_test_suites + " --distributed-tests --verbose"
+ run_command_and_capture_output(command)
+ del os.environ['HIP_VISIBLE_DEVICES']
+ elif workflow_name == "inductor":
+ os.environ['TEST_CONFIG'] = 'inductor'
+ copied_logs_path = overall_logs_path_current_run + "inductor_xml_results_selected_tests/"
+ inductor_selected_test_suites = []
+ non_inductor_selected_test_suites = []
+ for item in selected_list:
+ if "inductor/" in item:
+ inductor_selected_test_suites.append(item)
+ else:
+ non_inductor_selected_test_suites.append(item)
+ if inductor_selected_test_suites:
+ command = "python3 " + test_run_test_path + " --include " + " ".join(inductor_selected_test_suites) + " --verbose"
+ run_command_and_capture_output(command)
+ if non_inductor_selected_test_suites:
+ command = "python3 " + test_run_test_path + " --inductor --include " + " ".join(non_inductor_selected_test_suites) + " --verbose"
+ run_command_and_capture_output(command)
+ copied_logs_path_destination = shutil.copytree(test_reports_src, copied_logs_path)
+ selected_results_dict = summarize_xml_files(copied_logs_path_destination, workflow_name)
+
+ return selected_results_dict
+
+def run_test_and_summarize_results(
+ pytorch_root_dir: str,
+ priority_tests: bool,
+ test_config: List[str],
+ default_list: List[str],
+ distributed_list: List[str],
+ inductor_list: List[str],
+ skip_rerun: bool) -> Dict[str, Any]:
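+ """Entry point: run the requested workflows (entire, priority, or selected tests) and return a dict keyed by workflow name with the summarized results."""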
+
+ # copy current environment variables
+ _environ = dict(os.environ)
+
+ # modify path
+ test_shell_path = pytorch_root_dir + "/.ci/pytorch/test.sh"
+ test_run_test_path = pytorch_root_dir + "/test/run_test.py"
+ repo_test_log_folder_path = pytorch_root_dir + "/.automation_logs/"
+ test_reports_src = pytorch_root_dir + "/test/test-reports/"
+ run_test_python_file = pytorch_root_dir + "/test/run_test.py"
+
+ # change directory to pytorch root
+ os.chdir(pytorch_root_dir)
+
+ # all test results dict
+ res_all_tests_dict = {}
+
+ # pytest rerun flags in run_test.py, toggled in place when --skip_rerun is passed
+ search_text = "--reruns=2"
+ replace_text = "--reruns=0"
+
+ # create logs folder
+ if not os.path.exists(repo_test_log_folder_path):
+ os.mkdir(repo_test_log_folder_path)
+
+ # Set common environment variables for all scenarios
+ os.environ['CI'] = '1'
+ os.environ['PYTORCH_TEST_WITH_ROCM'] = '1'
+ os.environ['HSA_FORCE_FINE_GRAIN_PCIE'] = '1'
+ os.environ['PYTORCH_TESTING_DEVICE_ONLY_FOR'] = 'cuda'
+ os.environ['CONTINUE_THROUGH_ERROR'] = 'True'
+ if skip_rerun:
+ # modify run_test.py in-place
+ with open(run_test_python_file, 'r') as file:
+ data = file.read()
+ data = data.replace(search_text, replace_text)
+ with open(run_test_python_file, 'w') as file:
+ file.write(data)
+
+ # Time stamp
+ current_datetime = datetime.now().strftime("%Y%m%d_%H-%M-%S")
+ print("Current date & time : ", current_datetime)
+ # the timestamp also serves as the job id for this run
+ str_current_datetime = str(current_datetime)
+ overall_logs_path_current_run = repo_test_log_folder_path + str_current_datetime + "/"
+ os.mkdir(overall_logs_path_current_run)
+
+ global CONSOLIDATED_LOG_FILE_PATH
+ CONSOLIDATED_LOG_FILE_PATH = overall_logs_path_current_run + CONSOLIDATED_LOG_FILE_NAME
+
+ # Check multi gpu availability if distributed tests are enabled
+ if ("distributed" in test_config) or len(distributed_list) != 0:
+ check_num_gpus_for_distributed()
+
+ # Install test requirements
+ command = "pip3 install -r requirements.txt && pip3 install -r .ci/docker/requirements-ci.txt"
+ run_command_and_capture_output(command)
+
+ # Run entire tests for each workflow
+ if not priority_tests and not default_list and not distributed_list and not inductor_list:
+ # run entire tests for default, distributed and inductor workflows → use test.sh
+ if not test_config:
+ check_num_gpus_for_distributed()
+ # default test process
+ res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["default"] = res_default_all
+ # distributed test process
+ res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["distributed"] = res_distributed_all
+ # inductor test process
+ res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["inductor"] = res_inductor_all
+ else:
+ workflow_list = list(test_config)
+ if "default" in workflow_list:
+ res_default_all = run_entire_tests("default", test_shell_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["default"] = res_default_all
+ if "distributed" in workflow_list:
+ res_distributed_all = run_entire_tests("distributed", test_shell_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["distributed"] = res_distributed_all
+ if "inductor" in workflow_list:
+ res_inductor_all = run_entire_tests("inductor", test_shell_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["inductor"] = res_inductor_all
+ # Run priority test for each workflow
+ elif priority_tests and not default_list and not distributed_list and not inductor_list:
+ if not test_config:
+ check_num_gpus_for_distributed()
+ # default test process
+ res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["default"] = res_default_priority
+ # distributed test process
+ res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["distributed"] = res_distributed_priority
+ # will not run inductor priority tests
+ print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")
+ else:
+ workflow_list = list(test_config)
+ if "default" in workflow_list:
+ res_default_priority = run_priority_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["default"] = res_default_priority
+ if "distributed" in workflow_list:
+ res_distributed_priority = run_priority_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src)
+ res_all_tests_dict["distributed"] = res_distributed_priority
+ if "inductor" in workflow_list:
+ print("Inductor priority tests cannot run since no core tests defined with inductor workflow.")
+ # Run specified tests for each workflow
+ elif (default_list or distributed_list or inductor_list) and not test_config and not priority_tests:
+ if default_list:
+ default_workflow_list = list(default_list)
+ res_default_selected = run_selected_tests("default", test_run_test_path, overall_logs_path_current_run, test_reports_src, default_workflow_list)
+ res_all_tests_dict["default"] = res_default_selected
+ if distributed_list:
+ distributed_workflow_list = list(distributed_list)
+ res_distributed_selected = run_selected_tests("distributed", test_run_test_path, overall_logs_path_current_run, test_reports_src, distributed_workflow_list)
+ res_all_tests_dict["distributed"] = res_distributed_selected
+ if inductor_list:
+ inductor_workflow_list = list(inductor_list)
+ res_inductor_selected = run_selected_tests("inductor", test_run_test_path, overall_logs_path_current_run, test_reports_src, inductor_workflow_list)
+ res_all_tests_dict["inductor"] = res_inductor_selected
+ else:
+ raise Exception("Invalid test configurations!")
+
+ # restore environment variables
+ os.environ.clear()
+ os.environ.update(_environ)
+
+ # restore files
+ if skip_rerun:
+ # modify run_test.py in-place
+ with open(run_test_python_file, 'r') as file:
+ data = file.read()
+ data = data.replace(replace_text, search_text)
+ with open(run_test_python_file, 'w') as file:
+ file.write(data)
+
+ return res_all_tests_dict
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Run PyTorch unit tests and generate xml results summary', formatter_class=argparse.RawTextHelpFormatter)
+ parser.add_argument('--test_config', nargs='+', default=[], type=str, help="space-separated list of test workflows to be executed eg. 'default distributed'")
+ parser.add_argument('--priority_tests', action='store_true', help="run priority tests only")
+ parser.add_argument('--default_list', nargs='+', default=[], help="space-separated list of 'default' config test suites/files to be executed eg. 'test_weak test_dlpack'")
+ parser.add_argument('--distributed_list', nargs='+', default=[], help="space-separated list of 'distributed' config test suites/files to be executed eg. 'distributed/test_c10d_common distributed/test_c10d_nccl'")
+ parser.add_argument('--inductor_list', nargs='+', default=[], help="space-separated list of 'inductor' config test suites/files to be executed eg. 'inductor/test_torchinductor test_ops'")
+ parser.add_argument('--pytorch_root', default='.', type=str, help="PyTorch root directory")
+ parser.add_argument('--skip_rerun', action='store_true', help="skip rerun process")
+ parser.add_argument('--example_output', type=str, help="{'workflow_name': {\n"
+ " test_file_and_status(file_name='workflow_aggregate', status='STATISTICS'): {}, \n"
+ " test_file_and_status(file_name='test_file_name_1', status='ERROR'): {}, \n"
+ " test_file_and_status(file_name='test_file_name_1', status='FAILED'): {}, \n"
+ " test_file_and_status(file_name='test_file_name_1', status='PASSED'): {}, \n"
+ " test_file_and_status(file_name='test_file_name_1', status='SKIPPED'): {}, \n"
+ " test_file_and_status(file_name='test_file_name_1', status='STATISTICS'): {} \n"
+ "}}\n")
+ parser.add_argument('--example_usages', type=str, help="RUN ALL TESTS: python3 run_pytorch_unit_tests.py \n"
+ "RUN PRIORITY TESTS: python3 run_pytorch_unit_tests.py --test_config distributed --priority_test \n"
+ "RUN SELECTED TESTS: python3 run_pytorch_unit_tests.py --default_list test_weak test_dlpack --inductor_list inductor/test_torchinductor")
+ return parser.parse_args()
+
+def check_num_gpus_for_distributed():
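+ # Count GPU agents reported by rocminfo; distributed tests need at least two visible GPUs.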
+ p = subprocess.run(r"rocminfo | grep -cE 'Name:\s+gfx'", shell=True, capture_output=True, text=True)
+ num_gpus_visible = int(p.stdout)
+ assert num_gpus_visible > 1, "Number of visible GPUs should be >1 to run distributed unit tests"
+
+def main():
+ args = parse_args()
+ all_tests_results = run_test_and_summarize_results(args.pytorch_root, args.priority_tests, args.test_config, args.default_list, args.distributed_list, args.inductor_list, args.skip_rerun)
+ pprint(dict(all_tests_results))
+
+if __name__ == "__main__":
+ main()
diff --git a/.ci/caffe2/test.sh b/.ci/caffe2/test.sh
index eaef1e3ebf88..7d1ce2fb4fa1 100755
--- a/.ci/caffe2/test.sh
+++ b/.ci/caffe2/test.sh
@@ -5,7 +5,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
if [[ ${BUILD_ENVIRONMENT} == *onnx* ]]; then
pip install click mock tabulate networkx==2.0
- pip -q install --user "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx"
+ pip -q install "file:///var/lib/jenkins/workspace/third_party/onnx#egg=onnx"
fi
# Skip tests in environments where they are not built/applicable
@@ -147,8 +147,8 @@ export DNNL_MAX_CPU_ISA=AVX2
if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then
# TODO(sdym@meta.com) remove this when the linked issue resolved.
# py is temporary until https://github.com/Teemu/pytest-sugar/issues/241 is fixed
- pip install --user py==1.11.0
- pip install --user pytest-sugar
+ pip install py==1.11.0
+ pip install pytest-sugar
# NB: Warnings are disabled because they make it harder to see what
# the actual erroring test is
"$PYTHON" \
diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh
index 6b978b8f4b55..6624d9928cbe 100755
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -52,6 +52,8 @@ fi
if [[ "$image" == *-jammy* ]]; then
UBUNTU_VERSION=22.04
+elif [[ "$image" == *-noble* ]]; then
+ UBUNTU_VERSION=24.04
elif [[ "$image" == *ubuntu* ]]; then
extract_version_from_image_name ubuntu UBUNTU_VERSION
fi
diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt
index 568756a804f0..cf43cba72a42 100644
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@@ -1 +1 @@
-ae848267bebc65c6181e8cc5e64a6357d2679260
+5e5685356b9fc7b5ad9cdf4e510a1994a5b8601a
diff --git a/.ci/docker/common/common_utils.sh b/.ci/docker/common/common_utils.sh
index 27c1b815a0ea..110065698b58 100644
--- a/.ci/docker/common/common_utils.sh
+++ b/.ci/docker/common/common_utils.sh
@@ -23,6 +23,10 @@ conda_install() {
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
}
+conda_install_through_forge() {
+ as_jenkins conda install -c conda-forge -q -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" $*
+}
+
conda_run() {
as_jenkins conda run -n py_$ANACONDA_PYTHON_VERSION --no-capture-output $*
}
diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh
index 64304fec6ed9..7d8ae247d7a0 100755
--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@@ -15,6 +15,9 @@ install_ubuntu() {
elif [[ "$UBUNTU_VERSION" == "22.04"* ]]; then
cmake3="cmake=3.22*"
maybe_libiomp_dev=""
+ elif [[ "$UBUNTU_VERSION" == "24.04"* ]]; then
+ cmake3="cmake=3.28*"
+ maybe_libiomp_dev=""
else
cmake3="cmake=3.5*"
maybe_libiomp_dev="libiomp-dev"
diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh
index 11c51cac0bf8..b33f7f0a1e9d 100755
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@@ -87,6 +87,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION})
fi
+ if [[ "$UBUNTU_VERSION" == "24.04"* ]] ; then
+ conda_install_through_forge libstdcxx-ng=14
+ fi
+
# Install some other packages, including those needed for Python test reporting
pip_install -r /opt/conda/requirements-ci.txt
diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh
index 2a8d5b30e74e..fe2f35838fd9 100644
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@@ -8,9 +8,11 @@ ver() {
install_ubuntu() {
apt-get update
- if [[ $UBUNTU_VERSION == 20.04 ]]; then
- # gpg-agent is not available by default on 20.04
- apt-get install -y --no-install-recommends gpg-agent
+ # gpg-agent is not available by default
+ apt-get install -y --no-install-recommends gpg-agent
+ if [[ $(ver $UBUNTU_VERSION) -ge $(ver 22.04) ]]; then
+ echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' \
+ | sudo tee /etc/apt/preferences.d/rocm-pin-600
fi
apt-get install -y kmod
apt-get install -y wget
diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh
index 364ee23b97e5..db826ed6e027 100644
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@@ -1,32 +1,60 @@
-#!/usr/bin/env bash
-# Script used only in CD pipeline
+#!/bin/bash
+# Script used in CI and CD pipeline
-set -eou pipefail
+set -ex
-function do_install() {
- rocm_version=$1
- rocm_version_nodot=${1//./}
+ver() {
+ printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
+}
- # Version 2.7.2 + ROCm related updates
- MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
- magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+# Magma build scripts need `python`
+ln -sf /usr/bin/python3 /usr/bin/python
- rocm_dir="/opt/rocm"
- (
- set -x
- tmp_dir=$(mktemp -d)
- pushd ${tmp_dir}
- curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
- if tar -xvf "${magma_archive}"
- then
- mkdir -p "${rocm_dir}/magma"
- mv include "${rocm_dir}/magma/include"
- mv lib "${rocm_dir}/magma/lib"
- else
- echo "${magma_archive} not found, skipping magma install"
- fi
- popd
- )
-}
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+ almalinux)
+ yum install -y gcc-gfortran
+ ;;
+ *)
+ echo "No preinstalls to build magma..."
+ ;;
+esac
+
+MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
+
+# "install" hipMAGMA into /opt/rocm/magma by copying after build
+if [[ $(ver $ROCM_VERSION) -ge $(ver 7.0) ]]; then
+ git clone https://github.com/ROCm/utk-magma.git -b release/2.9.0_rocm70 magma
+ pushd magma
+ # version 2.9 + ROCm 7.0 related updates
+ git checkout 91c4f720a17e842b364e9de41edeef76995eb9ad
+else
+ git clone https://bitbucket.org/icl/magma.git
+ pushd magma
+ # Version 2.7.2 + ROCm related updates
+ git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
+fi
-do_install $1
+cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
+echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
+if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
+ echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
+fi
+echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
+echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
+export PATH="${PATH}:/opt/rocm/bin"
+if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
+ amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
+else
+ amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
+fi
+for arch in $amdgpu_targets; do
+ echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
+done
+# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
+sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
+make -f make.gen.hipMAGMA -j $(nproc)
+LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
+make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
+popd
+mv magma /opt/rocm
diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh
index f5e39fbaf9cc..f5b552e07971 100755
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@@ -21,7 +21,7 @@ elif [ -n "${TRITON_CPU}" ]; then
TRITON_REPO="https://github.com/triton-lang/triton-cpu"
TRITON_TEXT_FILE="triton-cpu"
else
- TRITON_REPO="https://github.com/triton-lang/triton"
+ TRITON_REPO="https://github.com/ROCm/triton"
TRITON_TEXT_FILE="triton"
fi
diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
index 4ecdde62408d..72811c384900 100644
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@@ -16,6 +16,7 @@ click
#test that import:
coremltools==5.0b5 ; python_version < "3.12"
+coremltools==8.3 ; python_version == "3.12"
#Description: Apple framework for ML integration
#Pinned versions: 5.0b5
#test that import:
@@ -63,6 +64,7 @@ lark==0.12.0
#test that import:
librosa>=0.6.2 ; python_version < "3.11"
+librosa==0.10.2 ; python_version == "3.12"
#Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2
#test that import: test_spectral_ops.py
@@ -108,9 +110,8 @@ ninja==1.11.1.3
#Pinned versions: 1.11.1.3
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
-numba==0.49.0 ; python_version < "3.9"
-numba==0.55.2 ; python_version == "3.9"
-numba==0.55.2 ; python_version == "3.10"
+numba==0.60.0 ; python_version == "3.9"
+numba==0.61.2 ; python_version > "3.9"
#Description: Just-In-Time Compiler for Numerical Functions
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
#test that import: test_numba_integration.py
@@ -128,12 +129,10 @@ numba==0.55.2 ; python_version == "3.10"
#test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
#test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
#test_binary_ufuncs.py
-numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
-numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
-numpy==2.1.2; python_version >= "3.13"
+numpy==2.0.2 ; python_version == "3.9"
+numpy==2.1.2 ; python_version > "3.9"
-pandas==2.0.3; python_version < "3.13"
-pandas==2.2.3; python_version >= "3.13"
+pandas==2.2.3
#onnxruntime
#Description: scoring engine for Open Neural Network Exchange (ONNX) models
@@ -244,8 +243,8 @@ scikit-image==0.22.0 ; python_version >= "3.10"
#Pinned versions: 0.20.3
#test that import:
-scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.14.1 ; python_version >= "3.12"
+scipy==1.13.1 ; python_version == "3.9"
+scipy==1.14.1 ; python_version > "3.9"
# Pin SciPy because of failing distribution tests (see #60347)
#Description: scientific python
#Pinned versions: 1.10.1
@@ -309,8 +308,7 @@ z3-solver==4.12.6.0
#Pinned versions:
#test that import:
-tensorboard==2.13.0 ; python_version < "3.13"
-tensorboard==2.18.0 ; python_version >= "3.13"
+tensorboard==2.18.0
#Description: Also included in .ci/docker/requirements-docs.txt
#Pinned versions:
#test that import: test_tensorboard
diff --git a/.ci/onnx/test.sh b/.ci/onnx/test.sh
index a7d3b72c62a7..d42ca2c218de 100755
--- a/.ci/onnx/test.sh
+++ b/.ci/onnx/test.sh
@@ -19,7 +19,7 @@ git config --global --add safe.directory /var/lib/jenkins/workspace
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# TODO: This can be removed later once vision is also part of the Docker image
- pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
+ pip install -q --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"
# JIT C++ extensions require ninja, so put it into PATH.
export PATH="/var/lib/jenkins/.local/bin:$PATH"
# NB: ONNX test is fast (~15m) so it's ok to retry it few more times to avoid any flaky issue, we
diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh
index 8b05766ef400..092d88d6387f 100644
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@@ -127,9 +127,9 @@ function install_torchaudio() {
if [[ "$1" == "cuda" ]]; then
# TODO: This is better to be passed as a parameter from _linux-test workflow
# so that it can be consistent with what is set in build
- TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${commit}"
+ TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}"
else
- pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git@${commit}"
+ pip_install --no-use-pep517 "git+https://github.com/pytorch/audio.git@${commit}"
fi
}
@@ -139,8 +139,8 @@ function install_torchtext() {
local text_commit
data_commit=$(get_pinned_commit data)
text_commit=$(get_pinned_commit text)
- pip_install --no-use-pep517 --user "git+https://github.com/pytorch/data.git@${data_commit}"
- pip_install --no-use-pep517 --user "git+https://github.com/pytorch/text.git@${text_commit}"
+ pip_install --no-use-pep517 "git+https://github.com/pytorch/data.git@${data_commit}"
+ pip_install --no-use-pep517 "git+https://github.com/pytorch/text.git@${text_commit}"
}
function install_torchvision() {
@@ -153,7 +153,7 @@ function install_torchvision() {
echo 'char* dlerror(void) { return "";}'|gcc -fpic -shared -o "${HOME}/dlerror.so" -x c -
LD_PRELOAD=${orig_preload}:${HOME}/dlerror.so
fi
- pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git@${commit}"
+ pip_install --no-use-pep517 "git+https://github.com/pytorch/vision.git@${commit}"
if [ -n "${LD_PRELOAD}" ]; then
LD_PRELOAD=${orig_preload}
fi
@@ -173,7 +173,7 @@ function install_torchrec_and_fbgemm() {
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then
# install torchrec first because it installs fbgemm nightly on top of rocm fbgemm
- pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
+ pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
pip_uninstall fbgemm-gpu-nightly
pip_install tabulate # needed for newer fbgemm
@@ -190,8 +190,8 @@ function install_torchrec_and_fbgemm() {
rm -rf fbgemm
else
# See https://github.com/pytorch/pytorch/issues/106971
- CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
- pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
+ CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
+ pip_install --no-use-pep517 "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
fi
}
@@ -234,7 +234,7 @@ function checkout_install_torchbench() {
function install_torchao() {
local commit
commit=$(get_pinned_commit torchao)
- pip_install --no-use-pep517 --user "git+https://github.com/pytorch/ao.git@${commit}"
+ pip_install --no-use-pep517 "git+https://github.com/pytorch/ao.git@${commit}"
}
function print_sccache_stats() {
diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
index 425cc2a80dc7..52fb572b81c4 100755
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@@ -201,7 +201,7 @@ fi
if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
# JIT C++ extensions require ninja.
- pip_install --user "ninja==1.10.2"
+ pip_install "ninja==1.10.2"
# ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins
# but this script should be runnable by any user, including root
export PATH="$HOME/.local/bin:$PATH"
@@ -502,7 +502,7 @@ DYNAMO_BENCHMARK_FLAGS=()
pr_time_benchmarks() {
- pip_install --user "fbscribelogger"
+ pip_install "fbscribelogger"
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
@@ -1469,8 +1469,8 @@ test_bazel() {
test_benchmarks() {
if [[ "$BUILD_ENVIRONMENT" == *cuda* && $TEST_CONFIG != *nogpu* ]]; then
- pip_install --user "pytest-benchmark==3.2.3"
- pip_install --user "requests"
+ pip_install "pytest-benchmark==3.2.3"
+ pip_install "requests"
BENCHMARK_DATA="benchmarks/.data"
mkdir -p ${BENCHMARK_DATA}
pytest benchmarks/fastrnns/test_bench.py --benchmark-sort=Name --benchmark-json=${BENCHMARK_DATA}/fastrnns_default.json --fuser=default --executor=default
diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh
index 7f89c5c2dd8e..b2df131ec33c 100755
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@@ -5,7 +5,9 @@ export TZ=UTC
tagged_version() {
GIT_DIR="${workdir}/pytorch/.git"
GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*"
- if [[ ! -d "${GIT_DIR}" ]]; then
+ if [[ -n "${CIRCLE_TAG:-}" ]]; then
+ echo "${CIRCLE_TAG}"
+ elif [[ ! -d "${GIT_DIR}" ]]; then
echo "Abort, abort! Git dir ${GIT_DIR} does not exists!"
kill $$
elif ${GIT_DESCRIBE} --exact >/dev/null; then
@@ -69,6 +71,8 @@ fi
export PYTORCH_BUILD_NUMBER=1
+# This part is done in the builder scripts, so the duplicate code below is commented out
+: <<'BLOCK_COMMENT'
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
@@ -117,6 +121,7 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
fi
fi
+BLOCK_COMMENT
USE_GLOO_WITH_OPENSSL="ON"
if [[ "$GPU_ARCH_TYPE" =~ .*aarch64.* ]]; then
diff --git a/.github/scripts/amd/package_triton_wheel.sh b/.github/scripts/amd/package_triton_wheel.sh
index 6ecf8bab116b..fe8d915422da 100755
--- a/.github/scripts/amd/package_triton_wheel.sh
+++ b/.github/scripts/amd/package_triton_wheel.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
set -ex
# Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm
@@ -50,29 +51,15 @@ do
cp $lib $TRITON_ROCM_DIR/lib/
done
-# Required ROCm libraries
-if [[ "${MAJOR_VERSION}" == "6" ]]; then
- libamdhip="libamdhip64.so.6"
-else
- libamdhip="libamdhip64.so.5"
-fi
-
# Required ROCm libraries - ROCm 6.0
ROCM_SO=(
- "${libamdhip}"
- "libhsa-runtime64.so.1"
- "libdrm.so.2"
- "libdrm_amdgpu.so.1"
+ "libamdhip64.so"
+ "libhsa-runtime64.so"
+ "libdrm.so"
+ "libdrm_amdgpu.so"
+ "libamd_comgr.so"
+ "librocprofiler-register.so"
)
-if [[ $ROCM_INT -ge 60400 ]]; then
- ROCM_SO+=("libamd_comgr.so.3")
-else
- ROCM_SO+=("libamd_comgr.so.2")
-fi
-
-if [[ $ROCM_INT -ge 60100 ]]; then
- ROCM_SO+=("librocprofiler-register.so.0")
-fi
for lib in "${ROCM_SO[@]}"
do
@@ -94,10 +81,6 @@ do
fi
cp $file_path $TRITON_ROCM_DIR/lib
- # When running locally, and not building a wheel, we need to satisfy shared objects requests that don't look for versions
- LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g')
- ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME
-
done
# Copy Include Files
diff --git a/.github/scripts/amd/patch_triton_wheel.sh b/.github/scripts/amd/patch_triton_wheel.sh
index 366913463154..fb3c0f36ddb4 100755
--- a/.github/scripts/amd/patch_triton_wheel.sh
+++ b/.github/scripts/amd/patch_triton_wheel.sh
@@ -19,15 +19,13 @@ replace_needed_sofiles() {
find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do
origname=$2
patchedname=$3
- if [[ "$origname" != "$patchedname" ]]; then
- set +e
- origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
- ERRCODE=$?
- set -e
- if [ "$ERRCODE" -eq "0" ]; then
- echo "patching $sofile entry $origname to $patchedname"
- $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
- fi
+ set +e
+ origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
+ ERRCODE=$?
+ set -e
+ if [ "$ERRCODE" -eq "0" ]; then
+ echo "patching $sofile entry $origname to $patchedname"
+ $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
fi
done
}
diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py
index beec9f96aba2..695b4a9c865a 100644
--- a/.github/scripts/build_triton_wheel.py
+++ b/.github/scripts/build_triton_wheel.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import os
+import re
import shutil
import sys
from pathlib import Path
@@ -50,6 +51,30 @@ def patch_init_py(
with open(path, "w") as f:
f.write(orig)
+def get_rocm_version() -> str:
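+ """Best-effort: read the ROCm version from rocm_version.h under ROCM_HOME/ROCM_PATH (default /opt/rocm); returns "0.0.0" if unavailable."""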
+ rocm_path = os.environ.get('ROCM_HOME') or os.environ.get('ROCM_PATH') or "/opt/rocm"
+ rocm_version = "0.0.0"
+ rocm_version_h = f"{rocm_path}/include/rocm-core/rocm_version.h"
+ if not os.path.isfile(rocm_version_h):
+ rocm_version_h = f"{rocm_path}/include/rocm_version.h"
+ # The file could be missing due to 1) ROCm version < 5.2, or 2) no ROCm install.
+ if os.path.isfile(rocm_version_h):
+ RE_MAJOR = re.compile(r"#define\s+ROCM_VERSION_MAJOR\s+(\d+)")
+ RE_MINOR = re.compile(r"#define\s+ROCM_VERSION_MINOR\s+(\d+)")
+ RE_PATCH = re.compile(r"#define\s+ROCM_VERSION_PATCH\s+(\d+)")
+ major, minor, patch = 0, 0, 0
+ with open(rocm_version_h) as version_file:
+ for line in version_file:
+ match = RE_MAJOR.search(line)
+ if match:
+ major = int(match.group(1))
+ match = RE_MINOR.search(line)
+ if match:
+ minor = int(match.group(1))
+ match = RE_PATCH.search(line)
+ if match:
+ patch = int(match.group(1))
+ rocm_version = f"{major}.{minor}.{patch}"
+ return rocm_version
def build_triton(
*,
@@ -64,7 +89,12 @@ def build_triton(
if "MAX_JOBS" not in env:
max_jobs = os.cpu_count() or 1
env["MAX_JOBS"] = str(max_jobs)
-
+ if not release:
+ # Nightly binaries include the triton commit hash, i.e. 2.1.0+e6216047b8
+ # while release build should only include the version, i.e. 2.1.0
+ rocm_version = get_rocm_version()
+ version_suffix = f"+rocm{rocm_version}.git{commit_hash[:8]}"
+ version += version_suffix
with TemporaryDirectory() as tmpdir:
triton_basedir = Path(tmpdir) / "triton"
triton_pythondir = triton_basedir / "python"
@@ -72,6 +102,7 @@ def build_triton(
triton_repo = "https://github.com/openai/triton"
if device == "rocm":
triton_pkg_name = "pytorch-triton-rocm"
+ triton_repo = "https://github.com/ROCm/triton"
elif device == "xpu":
triton_pkg_name = "pytorch-triton-xpu"
triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"
@@ -88,6 +119,7 @@ def build_triton(
# change built wheel name and version
env["TRITON_WHEEL_NAME"] = triton_pkg_name
+ env["TRITON_WHEEL_VERSION_SUFFIX"] = version_suffix
if with_clang_ldd:
env["TRITON_BUILD_WITH_CLANG_LLD"] = "1"
diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp
index fd346b2d9af0..fefcf731cd82 100644
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@@ -419,7 +419,7 @@ void Context::setROCmFAPreferredBackend(at::ROCmFABackend b) {
if(b == at::ROCmFABackend::Ck) {
static const bool ck_unsupported = []() {
static const std::vector<std::string> archs = {
- "gfx90a", "gfx942"
+ "gfx90a", "gfx942", "gfx950"
};
for (auto index: c10::irange(detail::getCUDAHooks().deviceCount())) {
if (!detail::getCUDAHooks().isGPUArch(archs, index)) {
diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp
index 89350a11bea7..573389bcd7c7 100644
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@@ -188,82 +188,11 @@ uint32_t _getAlignment(uintptr_t address) {
}
#endif
-static size_t _parseChosenWorkspaceSize() {
- auto val = c10::utils::get_env("CUBLASLT_WORKSPACE_SIZE");
-#ifdef USE_ROCM
- if (!val.has_value()) {
- // accept either env var
- val = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE");
- }
- size_t workspace_size = 76*1024; /* Use 76 MB for hipBLASLt */
-#else
- size_t workspace_size = 1024; /* default size in KiB according to #73328 */
-#endif
-
- if (val.has_value()) {
- try {
- workspace_size = std::stoi(val.value());
- } catch (std::invalid_argument const&) {
- TORCH_WARN(
- "invalid CUBLASLT_WORKSPACE_SIZE,",
- " using default workspace size of ",
- workspace_size,
- " KiB.");
- } catch (std::out_of_range const&) {
- TORCH_WARN(
- "CUBLASLT_WORKSPACE_SIZE out of range,",
- " using default workspace size of ",
- workspace_size,
- " KiB.");
- }
- }
- return workspace_size * 1024;
-}
-
-static size_t _getWorkspaceSize() {
- static size_t workspace_size = _parseChosenWorkspaceSize();
- return workspace_size;
-}
-
-void* _getUnifiedWorkspaceWithoutHandle() {
- cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
- auto stream = c10::cuda::getCurrentCUDAStream();
- cudaStream_t _stream = stream;
- auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
- auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
- TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
- return workspace_it->second.mutable_get();
-}
-
struct CublasLtWorkspace {
CublasLtWorkspace() {
- size = _getWorkspaceSize();
-#ifndef USE_ROCM
- static bool unified = c10::utils::check_env("TORCH_CUBLASLT_UNIFIED_WORKSPACE") == true;
- if (unified) {
- auto cublasWorkspaceSize = at::cuda::getChosenWorkspaceSize();
- if (cublasWorkspaceSize < size) {
- TORCH_WARN_ONCE("Requested unified CUBLASLT workspace size of ", size,
- " bytes exceeds CUBLAS workspace size of ", cublasWorkspaceSize,
- " bytes. Please increase CUBLAS workspace size",
- " via CUBLAS_WORKSPACE_CONFIG or decrease requested"
- " CUBLASLT_WORKSPACE_SIZE. Otherwise CUBLASLT workspace"
- " size will be limited to the CUBLAS workspace size.");
- size = cublasWorkspaceSize;
- }
- ptr = _getUnifiedWorkspaceWithoutHandle();
- } else {
- auto allocator = c10::cuda::CUDACachingAllocator::get();
- stashed_ptr_ = allocator->allocate(size);
- ptr = stashed_ptr_.mutable_get();
- }
-#else
- auto allocator = c10::cuda::CUDACachingAllocator::get();
- stashed_ptr_ = allocator->allocate(size);
- ptr = stashed_ptr_.mutable_get();
-#endif
+ size = at::cuda::getCUDABlasLtWorkspaceSize();
+ ptr = at::cuda::getCUDABlasLtWorkspace();
}
- at::DataPtr stashed_ptr_;
void * ptr;
size_t size;
};
@@ -1879,6 +1808,16 @@ void scaled_gemm(
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
}
+ else if(mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
+#if ROCM_VERSION >= 70000
+ if (at::detail::getCUDAHooks().isGPUArch(0, {"gfx950"})) {
+ // Validate matrix dimensions for MX format
+ TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
+ "Matrix dimensions must be multiples of 32 for MX format. ",
+ "Got m=", m, ", n=", n, ", k=", k);
+ }
+#endif
+ }
#else
// rowwise isn't supported using older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with older hipblaslt");
@@ -1917,11 +1856,11 @@ void scaled_gemm(
}
if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
-#if CUDA_VERSION >= 12080
+#if (!defined(USE_ROCM) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0);
#else
- TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 and above");
+ TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales is only supported for CUDA 12.8 or ROCm 7.0 (with gfx950) and above");
#endif // if CUDA_VERSION >= 12080
} else if (mat1_scale_dtype == kFloat8_e4m3fn && mat2_scale_dtype == kFloat8_e4m3fn) {
#if CUDA_VERSION >= 12080
@@ -2101,10 +2040,8 @@ void int8_gemm(
#ifdef USE_ROCM
CuBlasLtMatmulPreference preference;
- size_t workspaceSize = _getWorkspaceSize();
- preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize);
- auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
- auto workspace = allocator.allocate(workspaceSize);
+ auto ltworkspace = CublasLtWorkspace();
+ preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, ltworkspace.size);
cublasLtMatmulHeuristicResult_t heuristicResult = {};
int returnedResult = 0;
TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic(
@@ -2142,12 +2079,12 @@ void int8_gemm(
nullptr, // Heuristics don't seem to work for int8
#endif
#ifdef USE_ROCM
- workspace.mutable_get(),
+ ltworkspace.ptr,
#else
nullptr, // Non-zero workspace doesn't seem to work.
#endif
#ifdef USE_ROCM
- workspaceSize,
+ ltworkspace.size,
#else
0,
#endif
diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h
index 65019bb6097c..86e960cc1ab4 100644
--- a/aten/src/ATen/cuda/CUDAContextLight.h
+++ b/aten/src/ATen/cuda/CUDAContextLight.h
@@ -89,7 +89,10 @@ TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();
TORCH_CUDA_CPP_API void clearCublasWorkspaces();
TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
+TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
+TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();
#if defined(CUDART_VERSION) || defined(USE_ROCM)
TORCH_CUDA_CPP_API cusolverDnHandle_t getCurrentCUDASolverDnHandle();
diff --git a/aten/src/ATen/cuda/CublasHandlePool.cpp b/aten/src/ATen/cuda/CublasHandlePool.cpp
index 720304ad198e..81159d5c0e27 100644
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@@ -23,6 +23,9 @@
* To work around this difference in behavior, a separate handle pool is available for ROCm builds.
* For CUDA builds, getCurrentCUDABlasLtHandle will alias for getCurrentCUDABlasHandle,
* whereas for ROCm builds, it is a distinct function.
+ *
+ * The workspace pools are separate for ROCm. On CUDA, the env var
+ * TORCH_CUBLASLT_UNIFIED_WORKSPACE can be used to opt in to unifying the workspace pools.
*/
namespace at::cuda {
@@ -109,8 +112,14 @@ std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_works
return instance;
}
+std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
+ static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
+ return instance;
+}
+
void clearCublasWorkspaces() {
cublas_handle_stream_to_workspace().clear();
+ cublaslt_handle_stream_to_workspace().clear();
}
size_t parseChosenWorkspaceSize() {
@@ -157,15 +166,97 @@ size_t parseChosenWorkspaceSize() {
}
}
+size_t parseCUDABlasLtWorkspaceSize() {
+ auto val = c10::utils::get_env("CUBLASLT_WORKSPACE_SIZE");
+#ifdef USE_ROCM
+ if (!val.has_value()) {
+ // accept either env var
+ val = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE");
+ }
+ size_t workspace_size = 76*1024; /* Use 76 MB for hipBLASLt */
+#else
+ size_t workspace_size = 1024; /* default size in KiB according to #73328 */
+#endif
+
+ if (val.has_value()) {
+ try {
+ workspace_size = std::stoi(val.value());
+ } catch (std::invalid_argument const&) {
+ TORCH_WARN(
+ "invalid CUBLASLT_WORKSPACE_SIZE,",
+ " using default workspace size of ",
+ workspace_size,
+ " KiB.");
+ } catch (std::out_of_range const&) {
+ TORCH_WARN(
+ "CUBLASLT_WORKSPACE_SIZE out of range,",
+ " using default workspace size of ",
+ workspace_size,
+ " KiB.");
+ }
+ }
+ return workspace_size * 1024;
+}
+
size_t getChosenWorkspaceSize() {
size_t pool_size = parseChosenWorkspaceSize();
return pool_size;
}
+#define TORCH_CUBLASLT_UNIFIED_WORKSPACE "TORCH_CUBLASLT_UNIFIED_WORKSPACE"
+
+size_t getCUDABlasLtWorkspaceSize() {
+ size_t pool_size = parseCUDABlasLtWorkspaceSize();
+#ifndef USE_ROCM
+ static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
+ if (unified) {
+ auto cublasWorkspaceSize = getChosenWorkspaceSize();
+ if (cublasWorkspaceSize < pool_size) {
+ TORCH_WARN_ONCE("Requested unified CUBLASLT workspace size of ", pool_size,
+ " bytes exceeds CUBLAS workspace size of ", cublasWorkspaceSize,
+ " bytes. Please increase CUBLAS workspace size",
+ " via CUBLAS_WORKSPACE_CONFIG or decrease requested"
+ " CUBLASLT_WORKSPACE_SIZE. Otherwise CUBLASLT workspace"
+ " size will be limited to the CUBLAS workspace size.");
+ pool_size = cublasWorkspaceSize;
+ }
+ }
+#endif
+ return pool_size;
+}
+
at::DataPtr getNewWorkspace() {
return c10::cuda::CUDACachingAllocator::get()->allocate(getChosenWorkspaceSize());
}
+at::DataPtr getNewCUDABlasLtWorkspace() {
+ return c10::cuda::CUDACachingAllocator::get()->allocate(getCUDABlasLtWorkspaceSize());
+}
+
+void* getCUDABlasLtWorkspace() {
+#ifndef USE_ROCM
+ static bool unified = c10::utils::check_env(TORCH_CUBLASLT_UNIFIED_WORKSPACE) == true;
+ if (unified) {
+ cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
+ auto stream = c10::cuda::getCurrentCUDAStream();
+ cudaStream_t _stream = stream;
+ auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
+ auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
+ TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
+ return workspace_it->second.mutable_get();
+ }
+#endif
+ cublasLtHandle_t handle = getCurrentCUDABlasLtHandle();
+ auto stream = c10::cuda::getCurrentCUDAStream();
+ cudaStream_t _stream = stream;
+ auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
+ auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
+ if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
+ workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
+ }
+ return workspace_it->second.mutable_get();
+}
+
cublasHandle_t getCurrentCUDABlasHandle() {
c10::DeviceIndex device = 0;
AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
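For context, here is a minimal sketch (not part of the patch) of how a matmul call site consumes the new shared cuBLASLt workspace helpers declared above; the function name configure_lt_workspace and the standalone setup are illustrative assumptions, and the pattern simply mirrors the int8_gemm hunk earlier in this diff.

#include <cublasLt.h>
#include <ATen/cuda/CUDAContextLight.h>

// Illustrative only: a call site picks up the shared cuBLASLt workspace.
void configure_lt_workspace(cublasLtMatmulPreference_t preference) {
  // Size honors CUBLASLT_WORKSPACE_SIZE (and HIPBLASLT_WORKSPACE_SIZE on ROCm),
  // capped by the cuBLAS workspace when TORCH_CUBLASLT_UNIFIED_WORKSPACE=1.
  size_t ws_size = at::cuda::getCUDABlasLtWorkspaceSize();
  // The pointer comes from the per (handle, stream) cache added above.
  void* ws_ptr = at::cuda::getCUDABlasLtWorkspace();
  cublasLtMatmulPreferenceSetAttribute(
      preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &ws_size, sizeof(ws_size));
  // ws_ptr and ws_size are then forwarded to cublasLtMatmul(...) as its workspace arguments.
  (void)ws_ptr;
}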
diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
index a23a2d720c5c..fe6d1161d1ba 100644
--- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
+++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
@@ -381,28 +381,6 @@ static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) {
return HIPBLAS_OP_T;
}
-static size_t GetHipblasltWorkspaceSize() {
- static const auto env = c10::utils::get_env("HIPBLASLT_WORKSPACE_SIZE");
- // 256MB is max workspace size allowed for hipblaslt
- // hipblaslt-bench uses 32MB
- // recommendation from hipblaslt author was 76MB
- // TunableOp hipBLASLt workspace size is aligned with
- // PyTorch's default in CUDABlas.cpp (_parseChosenWorkspaceSize)
- size_t workspace_size = 76*1024;
- if (env) {
- try {
- workspace_size = std::stoi(env.value());
- } catch(std::invalid_argument const& e) {
- TORCH_WARN("invalid HIPBLASLT_WORKSPACE_SIZE,",
- " using default workspace size of ", workspace_size, " KiB.");
- } catch(std::out_of_range const& e) {
- TORCH_WARN("HIPBLASLT_WORKSPACE_SIZE out of range,",
- " using default workspace size of ", workspace_size, " KiB.");
- }
- }
- return workspace_size * 1024;
-}
-
template
struct HipBlasLtDeleter {
void operator()(T* x) {
@@ -550,7 +528,7 @@ class HipblasltGemmOp : public Callable {
}
}
- size_t workspace_size = GetHipblasltWorkspaceSize();
+ size_t workspace_size = at::cuda::getCUDABlasLtWorkspaceSize();
auto op_handle = at::cuda::getCurrentCUDABlasLtHandle();
@@ -575,10 +553,7 @@ class HipblasltGemmOp : public Callable {
return FAIL;
}
- void* workspace_buffer = nullptr;
- if (workspace_size > 0) {
- workspace_buffer = c10::cuda::CUDACachingAllocator::raw_alloc(workspace_size);
- }
+ void* workspace_buffer = at::cuda::getCUDABlasLtWorkspace();
TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle,
matmul.descriptor(),
@@ -601,9 +576,6 @@ class HipblasltGemmOp : public Callable {
TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a));
TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b));
TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c));
- if (workspace_size > 0) {
- c10::cuda::CUDACachingAllocator::raw_delete(workspace_buffer);
- }
return OK;
}
diff --git a/aten/src/ATen/miopen/Descriptors.h b/aten/src/ATen/miopen/Descriptors.h
index a0ad4a4e1098..2eee837cd533 100644
--- a/aten/src/ATen/miopen/Descriptors.h
+++ b/aten/src/ATen/miopen/Descriptors.h
@@ -39,7 +39,7 @@ struct DescriptorDeleter {
// function.
template
// NOLINTNEXTLINE(bugprone-exception-escape)
-class TORCH_CUDA_CPP_API Descriptor {
+class TORCH_HIP_CPP_API Descriptor {
public:
// Use desc() to access the underlying descriptor pointer in
// a read-only fashion. Most client code should use this.
@@ -65,7 +65,7 @@ class TORCH_CUDA_CPP_API Descriptor {
std::unique_ptr> desc_;
};
-class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
+class TORCH_HIP_CPP_API TensorDescriptor : public Descriptor<
miopenTensorDescriptor,
&miopenCreateTensorDescriptor,
&miopenDestroyTensorDescriptor> {
@@ -88,7 +88,7 @@ class TORCH_CUDA_CPP_API TensorDescriptor : public Descriptor<
std::ostream& operator<<(std::ostream & out, const TensorDescriptor& d);
-class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor<
+class TORCH_HIP_CPP_API FilterDescriptor : public Descriptor<
miopenTensorDescriptor,
&miopenCreateTensorDescriptor,
&miopenDestroyTensorDescriptor> {
@@ -105,7 +105,7 @@ class TORCH_CUDA_CPP_API FilterDescriptor : public Descriptor<
}
};
-struct TORCH_CUDA_CPP_API ConvolutionDescriptor
+struct TORCH_HIP_CPP_API ConvolutionDescriptor
: public Descriptor<
miopenConvolutionDescriptor,
&miopenCreateConvolutionDescriptor,
@@ -121,7 +121,7 @@ struct TORCH_CUDA_CPP_API ConvolutionDescriptor
};
// NOLINTNEXTLINE(bugprone-exception-escape)
-struct TORCH_CUDA_CPP_API DropoutDescriptor
+struct TORCH_HIP_CPP_API DropoutDescriptor
: public Descriptor<
miopenDropoutDescriptor,
&miopenCreateDropoutDescriptor,
@@ -137,7 +137,7 @@ struct TORCH_CUDA_CPP_API DropoutDescriptor
}
};
-struct TORCH_CUDA_CPP_API RNNDescriptor
+struct TORCH_HIP_CPP_API RNNDescriptor
: public Descriptor
diff --git a/aten/src/ATen/miopen/Handle.h b/aten/src/ATen/miopen/Handle.h
index 4c80c3aea65b..b1637fca0a58 100644
--- a/aten/src/ATen/miopen/Handle.h
+++ b/aten/src/ATen/miopen/Handle.h
@@ -5,5 +5,5 @@
namespace at::native {
-TORCH_CUDA_CPP_API miopenHandle_t getMiopenHandle();
+TORCH_HIP_CPP_API miopenHandle_t getMiopenHandle();
} // namespace at::native
diff --git a/aten/src/ATen/miopen/Types.h b/aten/src/ATen/miopen/Types.h
index 0a8a1a952e2e..fdc0f6a607b7 100644
--- a/aten/src/ATen/miopen/Types.h
+++ b/aten/src/ATen/miopen/Types.h
@@ -6,7 +6,7 @@
namespace at::native {
-TORCH_CUDA_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor);
+TORCH_HIP_CPP_API miopenDataType_t getMiopenDataType(const at::Tensor& tensor);
int64_t miopen_version();
diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h
index 6e99e9565240..84381efe55b0 100644
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@@ -362,20 +362,24 @@ inline bool miopen_conv_use_channels_last(const at::Tensor& input, const at::Ten
return false;
}
- bool can_use_miopen_channels_last_2d = false;
// TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen
// See #64427
static std::optional<bool> PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC");
+ static bool suggest_nhwc = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC;
auto input_memory_format = input.suggest_memory_format();
auto weight_memory_format = weight.suggest_memory_format();
+ auto weight_ndim = weight.ndimension();
- can_use_miopen_channels_last_2d = PYTORCH_MIOPEN_SUGGEST_NHWC && *PYTORCH_MIOPEN_SUGGEST_NHWC && (
- ( (input_memory_format == at::MemoryFormat::ChannelsLast) ||
- (weight_memory_format == at::MemoryFormat::ChannelsLast) )
- );
+ bool can_use_miopen_channels_last_2d = suggest_nhwc && (weight_ndim == 4) && (
+ (input_memory_format == at::MemoryFormat::ChannelsLast) ||
+ (weight_memory_format == at::MemoryFormat::ChannelsLast)
+ );
- bool can_use_miopen_channels_last_3d = false;
+ bool can_use_miopen_channels_last_3d = suggest_nhwc && (weight_ndim == 5) && (
+ (input_memory_format == at::MemoryFormat::ChannelsLast3d) ||
+ (weight_memory_format == at::MemoryFormat::ChannelsLast3d)
+ );
return can_use_miopen_channels_last_2d || can_use_miopen_channels_last_3d;
}
diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp
index d06fc5168a0f..1122d9c8d38a 100644
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@@ -1421,7 +1421,7 @@ static inline at::MemoryFormat determine_backend_memory_format(
if (detail::getCUDAHooks().compiledWithMIOpen() && miopen_conv_use_channels_last(input, weight)) {
TORCH_INTERNAL_ASSERT((k == 4 || k == 5),
"Expected 4D or 5D input for miopen memory format selection in determine_backend_memory_format()");
- backend_memory_format = (k == 5) ? at::MemoryFormat::Contiguous /*at::MemoryFormat::ChannelsLast3d*/ : at::MemoryFormat::ChannelsLast;
+ backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
}
break;
case ConvBackend::Mkldnn:
diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp
index ecad7d7f3419..b9cf3c907f09 100644
--- a/aten/src/ATen/native/Normalization.cpp
+++ b/aten/src/ATen/native/Normalization.cpp
@@ -520,6 +520,12 @@ BatchNormBackend _select_batch_norm_backend(
return BatchNormBackend::Cudnn;
}
+ // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen
+ // See #64427
+ // A non-static variable is used so the environment variable can be changed at runtime for testing.
+ // Enabled by default for ROCm >= 7.0.0.
+ bool PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM").value_or(ROCM_VERSION >= 70000);
+
if (
detail::getCUDAHooks().compiledWithMIOpen()
&& cudnn_enabled
@@ -527,6 +533,9 @@ BatchNormBackend _select_batch_norm_backend(
&& input.dim() <= MIOPEN_DIM_MAX
&& input.dim() >= 3
&& input.scalar_type() != at::kDouble
+#if (defined(USE_ROCM) && ROCM_VERSION < 60400)
+ && (input.scalar_type() != at::kBFloat16)
+#endif
&& (detail::getCUDAHooks().versionMIOpen() >= 30400 || input.scalar_type() != at::kBFloat16)
&& weight.scalar_type() == at::kFloat // only FP32 weight for FP32 or FP16/BF16(mixed) input
&& weight.defined() && bias.defined()
@@ -534,6 +543,12 @@ BatchNormBackend _select_batch_norm_backend(
|| (!running_mean.defined() && !running_var.defined() && training))
&& input.suggest_memory_format() != MemoryFormat::ChannelsLast
&& input.suggest_memory_format() != MemoryFormat::ChannelsLast3d
+ && (input.suggest_memory_format() == MemoryFormat::Contiguous
+#if (defined(USE_ROCM) && ROCM_VERSION >= 60500)
+ || (input.suggest_memory_format() == MemoryFormat::ChannelsLast && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM)
+ || (input.suggest_memory_format() == MemoryFormat::ChannelsLast3d && PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM)
+#endif
+ )
) {
return BatchNormBackend::Miopen;
}
@@ -613,7 +628,7 @@ std::tuple _batch_norm_impl_index(
if (backend == BatchNormBackend::Miopen) {
return std::tuple_cat(
at::miopen_batch_norm(
- input.contiguous(), weight.contiguous(), bias.contiguous(),
+ input.contiguous(input.suggest_memory_format()), weight.contiguous(), bias.contiguous(),
running_mean.defined() ? running_mean.contiguous() : running_mean,
running_var.defined() ? running_var.contiguous() : running_var,
training, momentum, eps),
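Because the flag above is deliberately re-read on every call rather than cached in a static, a test can flip the environment variable between calls. The helper below is a hypothetical illustration of that, not part of the patch; the function name and the setenv-based harness are assumptions.

#include <cstdlib>  // setenv (POSIX)

// Hypothetical test-only helper, illustrative of the runtime toggle.
void toggle_miopen_nhwc_batchnorm() {
  // Disable: _select_batch_norm_backend falls back to the native kernels.
  setenv("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM", "0", /*overwrite=*/1);
  // ... run the batch_norm op under test ...

  // Re-enable: channels-last inputs become eligible for BatchNormBackend::Miopen
  // (ROCm >= 6.5 per the #if above; the flag defaults to on from ROCm 7.0).
  setenv("PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM", "1", /*overwrite=*/1);
}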
diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp
index 1834839bb6e8..21e6f9f65dd7 100644
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@@ -1133,12 +1133,15 @@ ScalingType get_scaling_type(
auto expected_b_size =
BLOCK_SIZE_MN * ceil_div(dim_n, BLOCK_SIZE_MN) * padded_num_k_blocks;
+ // TODO: enable the checks for ROCm
+#ifndef USE_ROCM
TORCH_CHECK(scale_a.numel() == expected_a_size,
"For BlockWise scaling: Expected scale_a size to be ",
expected_a_size, " but got ", scale_a.numel());
TORCH_CHECK(scale_b.numel() == expected_b_size,
"For BlockWise scaling: Expected scale_b size to be ",
expected_b_size, " but got ", scale_b.numel());
+#endif
TORCH_CHECK(
scale_a.is_contiguous() && scale_b.is_contiguous(),
@@ -1205,6 +1208,7 @@ ScalingType get_scaling_type(
} // namespace
+
// Computes matrix multiply + bias while applying scaling to input and output matrices
// Scales are only applicable when matrices are of Float8 type and assumed to be equal to 1.0 by default.
// If output matrix type is 16 or 32-bit type, scale_result is not applied.
@@ -1268,6 +1272,14 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
// Type restrictions imposed by CuBLASLt as of CUDA-12.1
TORCH_CHECK(mat1.scalar_type() != ScalarType::Float8_e5m2 || mat2.scalar_type() != ScalarType::Float8_e5m2,
"Multiplication of two Float8_e5m2 matrices is not supported");
+#endif
+#ifdef USE_ROCM
+ if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) {
+ TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e5m2 is only supported for ROCm 6.0 and above");
+ }
+ if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) {
+ TORCH_CHECK(ROCM_VERSION >= 60000, "Float8_e4m3fn is only supported for ROCm 6.0 and above");
+ }
#endif
if (use_fast_accum) {
TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype.");
@@ -1327,7 +1339,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
}
#else
if (scaling_choice == ScalingType::RowWise) {
- // For ROCm, match behavior of f8f8bf16_rowwise type checking, for unit test purposes.
+ // For ROCm, match behavior of f8f8bf16_rowwise type checking
Tensor b = mat2;
if (_scaled_mm_is_fnuz()) {
TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fnuz);
@@ -1335,9 +1347,25 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
else {
TORCH_CHECK(b.dtype() == at::kFloat8_e4m3fn);
}
- // Until more than bf16 is supported.
+ // Until more than bf16 is supported
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
- "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
+ "hipblaslt rowwise _scaled_mm only supports BFloat16 output");
+ }
+ else if (scaling_choice == ScalingType::BlockWise) {
+#if ROCM_VERSION >= 70000
+ TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}, 0),
+ "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
+
+ TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
+ mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
+ "Matrix dimensions must be multiples of 32 for block-wise scaling");
+
+ TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
+ out.scalar_type() == ScalarType::Half,
+ "Block-wise scaling only supports BFloat16 or Half output types");
+#else
+ TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
+#endif
}
#endif
@@ -1416,10 +1444,12 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
params.k = args.k;
params.a = args.mata->data_ptr();
params.a_scale_ptr = args.scale_mata_ptr;
+ params.a_scale_dtype = scale_a.scalar_type();
params.lda = args.lda;
params.a_dtype = args.mata->scalar_type();
params.b = args.matb->data_ptr();
params.b_scale_ptr = args.scale_matb_ptr;
+ params.b_scale_dtype = scale_b.scalar_type();
params.ldb = args.ldb;
params.b_dtype = args.matb->scalar_type();
params.bias_ptr = bias ? bias->data_ptr(): nullptr;
diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh
index 9b104a796636..f96b8d687bc8 100644
--- a/aten/src/ATen/native/cuda/CUDALoops.cuh
+++ b/aten/src/ATen/native/cuda/CUDALoops.cuh
@@ -226,8 +226,9 @@ C10_LAUNCH_BOUNDS_1(num_threads())
__global__ void vectorized_elementwise_kernel(int N, func_t f, array_t data) {
using traits = function_traits<func_t>;
constexpr auto io_size = calc_io_size();
-#ifdef __gfx942__
- constexpr int tws = (io_size >= 2) ? 8 : 16;
+#if defined(USE_ROCM) && defined(__gfx942__)
+ // A similar check exists in launch_vectorized_kernel(); keep both in sync.
+ constexpr int tws = 16;
#else
constexpr int tws = elems_per_thread();
#endif
@@ -296,7 +297,7 @@ static inline void launch_vectorized_kernel(
int vec_size = memory::can_vectorize_up_to(data);
c10::DeviceIndex curDevice = -1;
AT_CUDA_CHECK(c10::cuda::GetDevice(&curDevice));
- int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? ((io_size >= 2) ? 8 : 16) : elems_per_thread();
+ int tws = at::detail::getCUDAHooks().isGPUArch({"gfx942"}, curDevice) ? 16 : elems_per_thread();
#else
using cpp_type = typename function_traits::result_type;
const uint16_t max_vec_size = memory::can_vectorize_up_to(data);
diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu
index 5d19b95b32f9..4b1e420d5da9 100644
--- a/aten/src/ATen/native/cuda/Embedding.cu
+++ b/aten/src/ATen/native/cuda/Embedding.cu
@@ -369,7 +369,7 @@ Tensor & embedding_renorm_cuda_(Tensor & self, const Tensor & indices,
int warp_size = at::cuda::warp_size();
TORCH_INTERNAL_ASSERT(num_threads() % warp_size == 0 &&
- num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads,
+ num_threads() <= cuda_utils::kCUDABlockReduceMaxThreads(),
"BlockReduceSum requires all warps be active");
const int64_t *num_unique_indices_ptr = num_unique_indices.const_data_ptr();
dim3 grid = unique_indices.numel();
diff --git a/aten/src/ATen/native/cuda/MultinomialKernel.cu b/aten/src/ATen/native/cuda/MultinomialKernel.cu
index 65770e40a8b2..8132e7df57b5 100644
--- a/aten/src/ATen/native/cuda/MultinomialKernel.cu
+++ b/aten/src/ATen/native/cuda/MultinomialKernel.cu
@@ -86,7 +86,7 @@ void renormRows(Tensor& t) {
TORCH_CHECK(props != nullptr);
int numSM = props->multiProcessorCount;
const int64_t maxThreads = std::min(
- props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads);
+ props->maxThreadsPerBlock, cuda_utils::kCUDABlockReduceMaxThreads());
int warp_size = at::cuda::warp_size();
dim3 grid(rows < numSM * 4 ? rows : numSM * 4);
diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu
index 5157d37f68b3..f27d76256cdb 100644
--- a/aten/src/ATen/native/cuda/SoftMax.cu
+++ b/aten/src/ATen/native/cuda/SoftMax.cu
@@ -183,15 +183,16 @@ inline dim3 SoftMaxForward_getBlockSize(uint64_t dim_size) {
uint64_t block_size = 1;
uint64_t max_block_size = std::min(dim_size, static_cast(max_threads));
- // We need a block size that is a multiple of C10_WARP_SIZE in order
+ // We need a block size that is a multiple of at::cuda::warp_size() in order
// to perform block size reductions using warp shuffle instructions.
- // Since max_threads is also a multiple of C10_WARPS_SIZE we do not
+ // Since max_threads is also a multiple of at::cuda::warp_size() we do not
// risk creating a block size larger than the limit.
- if (max_block_size % C10_WARP_SIZE == 0) {
+ int warp_size = at::cuda::warp_size();
+ if (max_block_size % warp_size == 0) {
block_size = max_block_size;
} else {
- block_size = (max_block_size / C10_WARP_SIZE + 1) * C10_WARP_SIZE;
+ block_size = (max_block_size / warp_size + 1) * warp_size;
}
return dim3(block_size);
@@ -1107,7 +1108,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
constexpr int ILP = sizeof(float4) / sizeof(scalar_t);
if constexpr (use_fast_softmax) {
dim3 block(512);
- size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+ size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t);
if (dim_size % ILP == 0) {
cunn_SoftMaxForwardGmem
<<>>(output_ptr, input_ptr, dim_size);
@@ -1117,7 +1118,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
}
} else {
dim3 block = SoftMaxForward_getBlockSize(dim_size);
- size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+ size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t);
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(scalar_t);
@@ -1198,7 +1199,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
constexpr int ILP = sizeof(float4) / sizeof(scalar_t);
if constexpr (use_fast_softmax) {
dim3 block(512);
- size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+ size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t);
if (dim_size % ILP == 0) {
cunn_SoftMaxForwardGmem
<<>>(output_ptr, input_ptr, dim_size);
@@ -1208,7 +1209,7 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t
}
} else {
dim3 block = SoftMaxForward_getBlockSize(dim_size);
- size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+ size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t);
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(scalar_t);
@@ -1274,7 +1275,7 @@ void dispatch_host_softmax_backward(int64_t dim_size, dim3 grid, Tensor &grad, T
constexpr int ILP = sizeof(float4) / sizeof(output_t);
dim3 block = SoftMax_getBlockSize(ILP, dim_size);
- size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t);
+ size_t smem_reduction_sz = block.x / at::cuda::warp_size() * sizeof(accscalar_t);
auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock -
smem_reduction_sz) / sizeof(output_t);
bool can_use_smem = static_cast(dim_size) < max_elements_per_smem;
diff --git a/aten/src/ATen/native/cuda/TensorModeKernel.cu b/aten/src/ATen/native/cuda/TensorModeKernel.cu
index 4764b078c050..0c97ab742103 100644
--- a/aten/src/ATen/native/cuda/TensorModeKernel.cu
+++ b/aten/src/ATen/native/cuda/TensorModeKernel.cu
@@ -207,7 +207,7 @@ void handle_fused_mode(
constexpr int num_threads = size / 2;
int warp_size = at::cuda::warp_size();
TORCH_INTERNAL_ASSERT(num_threads % warp_size == 0 &&
- num_threads <= cuda_utils::kCUDABlockReduceMaxThreads, "");
+ num_threads <= cuda_utils::kCUDABlockReduceMaxThreads(), "");
const auto memsize =
(sizeof(scalar_t) * size) + (2 * size * sizeof(unsigned int));
compute_mode
diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu
index 103b360bcb86..49086c42cd4a 100644
--- a/aten/src/ATen/native/cuda/TensorTopK.cu
+++ b/aten/src/ATen/native/cuda/TensorTopK.cu
@@ -439,8 +439,12 @@ __global__ void computeBlockwiseWithinKCounts(
warp_counts[warp] = count;
}
__syncthreads();
+#ifdef USE_ROCM
+ CUDA_KERNEL_ASSERT(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE);
+#else
static_assert(RADIX_DIGITS < C10_WARP_SIZE * C10_WARP_SIZE,
"Assuming only 1 warp is needed for final reduction");
+#endif
if (warp != 0) {
return;
}
diff --git a/aten/src/ATen/native/cuda/block_reduce.cuh b/aten/src/ATen/native/cuda/block_reduce.cuh
index 2a272d22c0c6..1818987c6a58 100644
--- a/aten/src/ATen/native/cuda/block_reduce.cuh
+++ b/aten/src/ATen/native/cuda/block_reduce.cuh
@@ -12,7 +12,17 @@ constexpr int kCUDABlockReduceNumThreads = 512;
// of which reduces C10_WARP_SIZE elements. So, at most
// C10_WARP_SIZE**2 elements can be reduced at a time.
// NOTE: This is >= the max block size on current hardware anyway (1024).
-constexpr int kCUDABlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE;
+// ROCm NOTE: C10_WARP_SIZE should only be used inside device functions,
+// and kCUDABlockReduceMaxThreads() is called from host code.
+#ifdef USE_ROCM
+static int kCUDABlockReduceMaxThreads() {
+ return at::cuda::warp_size() * at::cuda::warp_size();
+}
+#else
+constexpr int kCUDABlockReduceMaxThreads() {
+ return C10_WARP_SIZE * C10_WARP_SIZE;
+}
+#endif
// Sums `val` across all threads in a warp.
//
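A brief sketch (not from the patch) of the host-side call pattern this change enables: kCUDABlockReduceMaxThreads is now a function on both CUDA and ROCm, so launch-size math can query the real warp size at runtime. The function name pick_reduce_block_threads is a placeholder; the pattern mirrors the MultinomialKernel.cu hunk above.

#include <algorithm>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/native/cuda/block_reduce.cuh>

// Host-side launch sizing: cap the block size at the block-reduce limit.
int64_t pick_reduce_block_threads() {
  const cudaDeviceProp* props = at::cuda::getCurrentDeviceProperties();
  return std::min<int64_t>(props->maxThreadsPerBlock,
                           at::native::cuda_utils::kCUDABlockReduceMaxThreads());
}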
diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp
index 19d044736991..cd1441a14b1e 100644
--- a/aten/src/ATen/native/cuda/jit_utils.cpp
+++ b/aten/src/ATen/native/cuda/jit_utils.cpp
@@ -45,7 +45,7 @@ namespace at::cuda::jit {
// Copied from aten/src/ATen/cuda/llvm_basic.cpp, then modified as above.
// If not compiling for ROCm, return the original get_traits_string().
std::string get_traits_string_but_hiprtc_safe() {
-#ifdef USE_ROCM
+#if defined(USE_ROCM) && ROCM_VERSION < 70000
return R"ESCAPE(
namespace std {
diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
index bdb169e26b14..aa25ad5f6ee0 100644
--- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu
+++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu
@@ -33,7 +33,12 @@ namespace at::native {
namespace {
constexpr int kCUDANumThreads = 256;
+#ifdef USE_ROCM
+// C10_WARP_SIZE is not constexpr for host code.
+#define kWarpSize C10_WARP_SIZE
+#else
constexpr unsigned int kWarpSize = C10_WARP_SIZE;
+#endif
constexpr int vec_size = 4; //we could make it dependent on dtype, but that would lead to different results between float and low-p types
// aligned vector generates vectorized load/store on CUDA (copy-pasted from MemoryAccess.cuh)
@@ -126,7 +131,11 @@ WelfordDataLN cuWelfordOnlineSum(
{
U delta = val - curr_sum.mean;
U new_count = curr_sum.count + 1.f;
+#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
+ U new_mean = curr_sum.mean + delta * __builtin_amdgcn_rcpf(new_count);
+#else
U new_mean = curr_sum.mean + delta * (1.f/new_count); //proper division is slow, this is less accurate but noticeably faster
+#endif
return {new_mean, curr_sum.sigma2 + delta * (val - new_mean), new_count};
}
@@ -140,7 +149,11 @@ WelfordDataLN cuWelfordCombine(
U count = dataA.count + dataB.count;
U mean, sigma2;
if (count > decltype(dataB.count){0}) {
+#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
+ auto coef = __builtin_amdgcn_rcpf(count);
+#else
auto coef = 1.f/count; //NB we don't use --use_fast_math, but this is emulation, 1./count goes to intrinsic, `* coef` is multiplication, instead of slow fp division
+#endif
auto nA = dataA.count * coef;
auto nB = dataB.count * coef;
mean = nA*dataA.mean + nB*dataB.mean;
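The guarded reciprocal above appears in both cuWelfordOnlineSum and cuWelfordCombine; a device helper like the sketch below (the name and placement are assumptions, not part of the patch) captures the pattern: the AMDGCN builtin trades a small amount of accuracy for avoiding a full floating-point divide.

// Hypothetical refactor of the repeated pattern; illustrative only.
__device__ inline float layer_norm_reciprocal(float x) {
#if defined(USE_ROCM) && defined(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
  return __builtin_amdgcn_rcpf(x);  // fast hardware reciprocal, slightly less accurate
#else
  return 1.f / x;                   // portable fallback
#endif
}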
diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
index af69dfc76e57..f21325cd0848 100644
--- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
+++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
@@ -102,7 +102,7 @@ std::tuple miopen_batch_norm(
mode = miopenBNSpatial;
}
- auto output_t = at::empty(input->sizes(), input->options());
+ auto output_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
TensorArg output{ output_t, "output", 0 };
auto handle = getMiopenHandle();
@@ -179,8 +179,10 @@ std::tuple miopen_batch_norm_backward(
const Tensor& save_var_t =
save_var_t_opt.value_or(Tensor());
+ auto grad_output_contig =
+ grad_output_t.contiguous(input_t.suggest_memory_format());
TensorArg input{ input_t, "input", 1 },
- grad_output{ grad_output_t, "grad_output", 2 },
+ grad_output{ grad_output_contig, "grad_output", 2 },
weight{ weight_t, "weight", 3 },
save_mean{ save_mean_t, "save_mean", 4 },
save_var{ save_var_t, "save_var", 5 };
@@ -195,7 +197,9 @@ std::tuple miopen_batch_norm_backward(
}
checkAllSameType(c, {input, grad_output});
checkAllSameType(c, {weight, save_mean, save_var});
- checkAllContiguous(c, {input, grad_output, save_mean, save_var});
+ checkAllContiguous(c, {save_mean, save_var});
+ TORCH_CHECK(input->is_contiguous(input->suggest_memory_format()));
+ TORCH_CHECK(grad_output->is_contiguous(input->suggest_memory_format()));
checkDimRange(c, input, 2, 6 /* exclusive */);
checkSameSize(c, input, grad_output);
auto num_features = input->size(1);
@@ -210,7 +214,8 @@ std::tuple miopen_batch_norm_backward(
mode = miopenBNSpatial;
}
- auto grad_input_t = at::empty(input->sizes(), input->options());
+ auto grad_input_t = at::empty(
+ input->sizes(), input->options(), input->suggest_memory_format());
auto grad_weight_t = at::empty(weight->sizes(), weight->options());
auto grad_bias_t = at::empty(weight->sizes(), weight->options());
diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp
index d2cef0ca6218..f4e67e4fc307 100644
--- a/aten/src/ATen/native/miopen/Conv_miopen.cpp
+++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp
@@ -763,7 +763,7 @@ Tensor miopen_convolution_forward(
auto memory_format = at::MemoryFormat::Contiguous;
if (miopen_conv_use_channels_last(*input, *weight)) {
- memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
}
Tensor output_t = at::detail::empty_cuda(
@@ -872,7 +872,7 @@ Tensor miopen_depthwise_convolution_forward(
auto memory_format = at::MemoryFormat::Contiguous;
if (miopen_conv_use_channels_last(*input, *weight)) {
- memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
}
Tensor output_t = at::detail::empty_cuda(
@@ -1074,7 +1074,7 @@ Tensor miopen_depthwise_convolution_backward_weight(
auto memory_format = at::MemoryFormat::Contiguous;
if (miopen_conv_use_channels_last(*input, *grad_output)) {
- memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
}
Tensor grad_output_contig_t = grad_output->contiguous(memory_format);
@@ -1127,7 +1127,7 @@ Tensor miopen_convolution_backward_weight(
auto memory_format = at::MemoryFormat::Contiguous;
if (miopen_conv_use_channels_last(*input, *grad_output)) {
- memory_format = (input->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ memory_format = (input->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
}
Tensor grad_output_contig_t = grad_output->contiguous(memory_format);
@@ -1281,7 +1281,7 @@ Tensor miopen_convolution_backward_input(
auto memory_format = at::MemoryFormat::Contiguous;
if (miopen_conv_use_channels_last(*grad_output, *weight)) {
- memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
}
Tensor grad_input_t = at::detail::empty_cuda(
@@ -1389,7 +1389,7 @@ Tensor miopen_depthwise_convolution_backward_input(
auto memory_format = at::MemoryFormat::Contiguous;
if (miopen_conv_use_channels_last(*grad_output, *weight)) {
- memory_format = (weight->ndimension() == 5) ? /*at::MemoryFormat::ChannelsLast3d*/at::MemoryFormat::Contiguous : at::MemoryFormat::ChannelsLast;
+ memory_format = (weight->ndimension() == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
}
Tensor grad_input_t = at::detail::empty_cuda(
diff --git a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh
index c9412d74e9cd..693ca536a319 100644
--- a/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh
+++ b/aten/src/ATen/native/sparse/cuda/SparseCUDAApplyUtils.cuh
@@ -242,7 +242,11 @@ __global__ void coalesceValuesKernel(
// `if constexpr` when CUDA codes will be compiled under C++-17, see
// gh-56055 for blockers.
template
+#ifdef USE_ROCM
+C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE_STATIC*4)
+#else
C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE*4)
+#endif
__global__ void coalesceValuesKernel(
int64_t *segment_offsets, int64_t *value_indices,
bool *values, bool *newValues,
diff --git a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp
index 38ec2ef20c5c..affa40619b59 100644
--- a/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd.hpp
@@ -453,4 +453,5 @@ struct fmha_bwd_traits
bool is_deterministic;
// TODO: padding check is inside this api
};
+template
float fmha_bwd(fmha_bwd_traits, fmha_bwd_args, const ck_tile::stream_config&);
diff --git a/c10/macros/Macros.h b/c10/macros/Macros.h
index 7d8238f91046..77ca999090d9 100644
--- a/c10/macros/Macros.h
+++ b/c10/macros/Macros.h
@@ -312,8 +312,38 @@ constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256;
#endif
#if defined(USE_ROCM)
-#define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h)
-#else
+// C10_WARP_SIZE is only allowed for device code.
+// Host code _must_ use at::cuda::warp_size()
+// HIP header used to define warpSize as a constexpr that was either 32 or 64
+// depending on the target device, and then always set it to 64 for host code.
+// Host pass of HIP compiler needs C10_WARP_SIZE defined to _something_ so we
+// set it to something unreasonable to trigger obvious host code errors.
+
+namespace at::cuda {
+TORCH_CUDA_CPP_API int warp_size();
+}
+#ifdef __HIPCC__
+static inline int __host__ C10_WARP_SIZE_INTERNAL() {
+ return at::cuda::warp_size();
+}
+
+static inline constexpr int __device__ C10_WARP_SIZE_INTERNAL() {
+#if defined(__GFX9__)
+ return 64;
+#else // __GFX9__
+ return 32;
+#endif // __GFX9__
+}
+#else // __HIPCC__
+inline int C10_WARP_SIZE_INTERNAL() {
+ return at::cuda::warp_size();
+}
+#endif // __HIPCC__
+
+#define C10_WARP_SIZE (C10_WARP_SIZE_INTERNAL())
+#define C10_WARP_SIZE_STATIC 64
+
+#else // defined(USE_ROCM)
#define C10_WARP_SIZE 32
#endif
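A short illustration of the contract these macros now enforce on ROCm (the kernel and function names below are placeholders, not from the patch): device code may keep using C10_WARP_SIZE, compile-time contexts such as launch bounds use C10_WARP_SIZE_STATIC, and host code calls at::cuda::warp_size().

#include <ATen/cuda/CUDAContext.h>  // at::cuda::warp_size()
#include <c10/macros/Macros.h>

// Device code: C10_WARP_SIZE resolves to a constexpr 64 (gfx9) or 32 per target arch.
__global__ void warp_size_probe(int* out) {
  *out = C10_WARP_SIZE;
}

// Compile-time contexts need a literal, hence C10_WARP_SIZE_STATIC on ROCm
// (mirrors the SparseCUDAApplyUtils.cuh hunk above).
#ifdef USE_ROCM
C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE_STATIC * 4)
#else
C10_LAUNCH_BOUNDS_1(C10_WARP_SIZE * 4)
#endif
__global__ void bounded_kernel() {}

// Host code must query the runtime value instead of using the macro.
int host_warp_size() {
  return at::cuda::warp_size();
}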
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index a93386c27f8d..1a06f2915787 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1048,6 +1048,22 @@ if(USE_ROCM)
list(APPEND HIP_HIPCC_FLAGS -fdebug-info-for-profiling)
endif(CMAKE_BUILD_TYPE MATCHES Debug)
+ # Get env var 'PYTORCH_LAYERNORM_FAST_RECIPROCAL' (default: ON).
+ if(DEFINED ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL})
+ set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE $ENV{PYTORCH_LAYERNORM_FAST_RECIPROCAL})
+ else()
+ set(PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE ON)
+ endif()
+
+ set(PYTORCH_LAYERNORM_FAST_RECIPROCAL
+ ${PYTORCH_LAYERNORM_FAST_RECIPROCAL_CMAKE}
+ CACHE BOOL "Enable fast reciprocals within layer normalization." FORCE
+ )
+
+ if(PYTORCH_LAYERNORM_FAST_RECIPROCAL)
+ add_definitions(-DPYTORCH_LAYERNORM_FAST_RECIPROCAL)
+ endif()
+
# needed for compat with newer versions of hip-clang that introduced C++20 mangling rules
list(APPEND HIP_HIPCC_FLAGS -fclang-abi-compat=17)
diff --git a/cmake/External/aotriton.cmake b/cmake/External/aotriton.cmake
index 8004b0f400a8..8b380d24f6c8 100644
--- a/cmake/External/aotriton.cmake
+++ b/cmake/External/aotriton.cmake
@@ -24,7 +24,7 @@ if(NOT __AOTRITON_INCLUDED)
set(__AOTRITON_SHA256_LIST
"861cd9f7479eec943933c27cb86920247e5b5dd139bc7c1376c81808abb7d7fe" # rocm6.3
"acea7d811a2d3bbe718b6e07fc2a9f739e49eecd60b4b6a36fcb3fe8edf85d78" # rocm6.4
- "7e29c325d5bd33ba896ddb106f5d4fc7d715274dca7fe937f724fffa82017838" # rocm7.0
+ "1e9b3dddf0c7fc07131c6f0f5266129e83ce2331f459fa2be8c63f4ae91b0f5b" # rocm7.0
)
set(__AOTRITON_Z "gz")
diff --git a/related_commits b/related_commits
new file mode 100644
index 000000000000..fd2787398fc1
--- /dev/null
+++ b/related_commits
@@ -0,0 +1,10 @@
+ubuntu|pytorch|apex|release/1.8.0|eab2474650906473d7d2d7053b870fe512438f90|https://github.com/ROCm/apex
+centos|pytorch|apex|release/1.8.0|eab2474650906473d7d2d7053b870fe512438f90|https://github.com/ROCm/apex
+ubuntu|pytorch|torchvision|release/0.23|824e8c8726b65fd9d5abdc9702f81c2b0c4c0dc8|https://github.com/pytorch/vision
+centos|pytorch|torchvision|release/0.23|824e8c8726b65fd9d5abdc9702f81c2b0c4c0dc8|https://github.com/pytorch/vision
+ubuntu|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data
+centos|pytorch|torchdata|release/0.11|377e64c1be69a9be6649d14c9e3664070323e464|https://github.com/pytorch/data
+ubuntu|pytorch|torchaudio|release/2.8|6e1c7fe9ff6d82b8665d0a46d859d3357d2ebaaa|https://github.com/pytorch/audio
+centos|pytorch|torchaudio|release/2.8|6e1c7fe9ff6d82b8665d0a46d859d3357d2ebaaa|https://github.com/pytorch/audio
+ubuntu|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
+centos|pytorch|ao|main|a96eeb1c7d7ba24cf0ccfc105141729acfed22bf|https://github.com/pytorch/ao
diff --git a/requirements.txt b/requirements.txt
index 18f7810de951..f65837a0097e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,8 @@ jinja2
lintrunner ; platform_machine != "s390x"
networkx
ninja
-numpy
+numpy==2.0.2 ; python_version == "3.9"
+numpy==2.1.2 ; python_version > "3.9"
optree>=0.13.0
packaging
psutil
diff --git a/setup.py b/setup.py
index b4ebc92f5926..4db59ecd0b08 100644
--- a/setup.py
+++ b/setup.py
@@ -153,6 +153,10 @@
# USE_ROCM_KERNEL_ASSERT=1
# Enable kernel assert in ROCm platform
#
+# PYTORCH_LAYERNORM_FAST_RECIPROCAL
+# If set, enables the use of builtin functions for fast reciprocals (1/x) in
+# layer normalization kernels. Default: enabled.
+#
# Environment variables we respect (these environment variables are
# conventional and are often understood/set by other software.)
#
diff --git a/test/functorch/test_ops.py b/test/functorch/test_ops.py
index cef00f83eb72..244d8518c6a1 100644
--- a/test/functorch/test_ops.py
+++ b/test/functorch/test_ops.py
@@ -436,13 +436,6 @@ class TestOperators(TestCase):
), # Works on ROCm
xfail("torch.ops.aten._flash_attention_forward"),
xfail("torch.ops.aten._efficient_attention_forward"),
- # RuntimeError: Expected contiguous tensor, but got
- # non-contiguous tensor for argument #2 'grad_output'
- decorate(
- "_batch_norm_with_update",
- decorator=expectedFailureIf(TEST_WITH_ROCM),
- device_type="cuda",
- ),
}
),
)
@@ -2368,13 +2361,6 @@ def fn(input, weight, bias):
skip("sparse.sampled_addmm", ""),
skip("sparse.mm", "reduce"),
skip("native_layer_norm", "", device_type="cpu"),
- # RuntimeError: Expected contiguous tensor, but got
- # non-contiguous tensor for argument #2 'grad_output'
- decorate(
- "_batch_norm_with_update",
- decorator=expectedFailureIf(TEST_WITH_ROCM),
- device_type="cuda",
- ),
},
)
@opsToleranceOverride(
diff --git a/test/inductor/test_aot_inductor.py b/test/inductor/test_aot_inductor.py
index 9f93a3959d83..32a36653b225 100644
--- a/test/inductor/test_aot_inductor.py
+++ b/test/inductor/test_aot_inductor.py
@@ -31,7 +31,12 @@
from torch.export.pt2_archive._package import load_pt2
from torch.testing import FileCheck
from torch.testing._internal import common_utils
-from torch.testing._internal.common_cuda import PLATFORM_SUPPORTS_FP8, SM80OrLater
+from torch.testing._internal.common_cuda import (
+ PLATFORM_SUPPORTS_FLASH_ATTENTION,
+ PLATFORM_SUPPORTS_FP8,
+ PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
+ SM80OrLater,
+)
from torch.testing._internal.common_device_type import (
_has_sufficient_memory,
skipCUDAIf,
@@ -1363,6 +1368,12 @@ def forward(self, q, k, v):
self.check_model(Model(), example_inputs)
@unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
+ @unittest.skipIf(
+ # for archs where this isn't lowered to flash attention, the math
+ # backend will be used and it doesn't work for bfloat16
+ not PLATFORM_SUPPORTS_FLASH_ATTENTION,
+ "Some archs don't support SDPA with bfloat16",
+ )
def test_sdpa_2(self):
class Model(torch.nn.Module):
def __init__(self) -> None:
@@ -1615,6 +1626,9 @@ def forward(self, values, repeats, mask, embeddings, x, y, z, lst):
self.check_model(Repro(), example_inputs, dynamic_shapes=spec)
@skipIfXpu(msg="_scaled_dot_product_flash_attention is not supported on XPU yet")
+ @unittest.skipIf(
+ not PLATFORM_SUPPORTS_FLASH_ATTENTION, "Some archs don't support flash SDPA"
+ )
def test_fallback_kernel_with_symexpr_output(self):
if self.device != GPU_TYPE:
raise unittest.SkipTest("requires GPU")
@@ -4173,6 +4187,9 @@ def grid(meta):
dynamic_shapes=dynamic_shapes,
)
+ @unittest.skipIf(
+ not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Some archs don't support mem eff SDPA"
+ )
def test_scaled_dot_product_efficient_attention(self):
if self.device != GPU_TYPE:
raise unittest.SkipTest("requires GPU")
diff --git a/test/inductor/test_cooperative_reductions.py b/test/inductor/test_cooperative_reductions.py
index 469ceec2e1b2..a913ffb25bf3 100644
--- a/test/inductor/test_cooperative_reductions.py
+++ b/test/inductor/test_cooperative_reductions.py
@@ -58,7 +58,8 @@ def setUp(self):
torch._dynamo.reset()
def run_and_check(self, fn, args, *, expect_kernel_count=1):
- expected = fn(*args)
+ args_cpu = [tensor.cpu().to(torch.float32) for tensor in args]
+ expected = fn(*args_cpu).to(torch.float16)
fn = torch.compile(fn, fullgraph=True)
result, (source_code,) = run_and_get_code(fn, *args)
self.assertEqual(result, expected)
diff --git a/test/inductor/test_decompose_mem_bound_mm.py b/test/inductor/test_decompose_mem_bound_mm.py
index d21de3178cf1..828d05738739 100644
--- a/test/inductor/test_decompose_mem_bound_mm.py
+++ b/test/inductor/test_decompose_mem_bound_mm.py
@@ -12,6 +12,8 @@
from torch.testing import FileCheck
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
+ patch_test_members,
+ is_navi3_arch,
parametrize,
TEST_XPU,
)
@@ -61,31 +63,46 @@ def forward(self, input1, input2):
)
@instantiate_parametrized_tests
class TestDecomposeMemMM(TestCase):
- def compare_dict_tensors(self, ref_dict, res_dict, rtol=1e-3, atol=1e-3):
+ def __init__(self, method_name='runTest', methodName='runTest'):
+ super().__init__(method_name, methodName)
+ self.atol = 1e-3
+ self.rtol = 1e-3
+
+ def setup_tolerance(self, rtol=None, atol=None):
+ if rtol is None:
+ rtol = self.rtol
+ if atol is None:
+ atol = self.atol
+
+ def compare_dict_tensors(self, ref_dict, res_dict, rtol=None, atol=None):
+ self.setup_tolerance(rtol, atol)
if len(set(ref_dict.keys())) != len(set(res_dict.keys())):
return False
for key1 in ref_dict.keys():
key2 = "_orig_mod." + key1
assert key2 in res_dict, f"{key1} does not exist in traced module"
- if not torch.allclose(ref_dict[key1], res_dict[key2], rtol=rtol, atol=atol):
+ if not torch.allclose(ref_dict[key1], res_dict[key2], rtol=self.rtol, atol=self.atol):
return False
return True
- def compare_pred(self, module, traced, input, rtol=1e-3, atol=1e-3):
+ def compare_pred(self, module, traced, input, rtol=None, atol=None):
+ self.setup_tolerance(rtol, atol)
ref = module(*input)
res = traced(*input)
- self.assertEqual(ref, res, rtol=rtol, atol=atol)
+ self.assertEqual(ref, res, rtol=self.rtol, atol=self.atol)
- def compare_parameters(self, module, traced, rtol=1e-3, atol=1e-3):
+ def compare_parameters(self, module, traced, rtol=None, atol=None):
+ self.setup_tolerance(rtol, atol)
ref_params = dict(module.named_parameters())
res_params = dict(traced.named_parameters())
- self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol, atol))
+ self.assertTrue(self.compare_dict_tensors(ref_params, res_params, rtol=self.rtol, atol=self.atol))
- def compare_gradients(self, module, traced, rtol=1e-3, atol=1e-3):
+ def compare_gradients(self, module, traced, rtol=None, atol=None):
+ self.setup_tolerance(rtol, atol)
ref_grad = {key: param.grad for key, param in module.named_parameters()}
res_grad = {key: param.grad for key, param in traced.named_parameters()}
self.assertTrue(
- self.compare_dict_tensors(ref_grad, res_grad, rtol=rtol, atol=atol)
+ self.compare_dict_tensors(ref_grad, res_grad, rtol=self.rtol, atol=self.atol)
)
@parametrize(
@@ -192,6 +209,12 @@ def test_decompose_linear(self, m, n, k, has_bias, should_decompose):
)
counters.clear()
+ # We have to increase tolerance for navi3 because all fp16/bf16
+ # GEMM operations have an accuracy issue caused by a hardware limitation
+ @patch_test_members({
+ "atol": 2e-3 if is_navi3_arch() else 1e-3,
+ "rtol": 2e-3 if is_navi3_arch() else 1e-3
+ })
@parametrize(
"m,k,n, should_decompose",
[(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)],
@@ -302,6 +325,12 @@ def test_decompose_mm_cpu(self, m, n, k, should_decompose):
)
counters.clear()
+ # We have to increase tolerance for navi3 because all fp16/bf16
+ # GEMM operations have an accuracy issue caused by a hardware limitation
+ @patch_test_members({
+ "atol": 3e-3 if is_navi3_arch() else 1e-3,
+ "rtol": 4e-3 if is_navi3_arch() else 1e-3
+ })
@parametrize(
"m,k,n, should_decompose",
[(20480, 5, 2, True), (20480, 32, 2, False), (2048, 2, 2, False)],
diff --git a/test/inductor/test_torchinductor.py b/test/inductor/test_torchinductor.py
index 20ce486f6faa..ee18c3eb508f 100644
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@@ -11538,6 +11538,9 @@ def fn(q, k, v):
@xfail_if_mps_unimplemented
@expectedFailureXPU
+ @unittest.skipIf(
+ not PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, "Some archs don't support mem eff SDPA"
+ )
def test_scaled_dot_product_efficient_attention(self):
if self.device == "cpu":
raise unittest.SkipTest(f"requires {GPU_TYPE}")
diff --git a/test/test_binary_ufuncs.py b/test/test_binary_ufuncs.py
index bdc0d7329df5..7772134fd153 100644
--- a/test/test_binary_ufuncs.py
+++ b/test/test_binary_ufuncs.py
@@ -1480,8 +1480,8 @@ def to_np(value):
self.assertRaisesRegex(RuntimeError, regex, base.pow_, exponent)
elif torch.can_cast(torch.result_type(base, exponent), base.dtype):
actual2 = actual.pow_(exponent)
- self.assertEqual(actual, expected)
- self.assertEqual(actual2, expected)
+ self.assertEqual(actual, expected.to(actual))
+ self.assertEqual(actual2, expected.to(actual))
else:
self.assertRaisesRegex(
RuntimeError,
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 5e5adcf56a60..35e834d3b62e 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -467,6 +467,9 @@ def test_out_of_memory_retry(self):
IS_JETSON, "oom reporting has issues on jetson igx due to partial nvml support"
)
def test_set_per_process_memory_fraction(self):
+ if torch.version.hip and ('gfx1101' in torch.cuda.get_device_properties(0).gcnArchName):
+ torch.cuda.empty_cache()
+ torch.cuda.reset_peak_memory_stats()
orig = torch.cuda.get_per_process_memory_fraction(0)
torch.cuda.reset_peak_memory_stats(0)
try:
diff --git a/test/test_matmul_cuda.py b/test/test_matmul_cuda.py
index e8b4d9092cdd..8ec832e40a16 100644
--- a/test/test_matmul_cuda.py
+++ b/test/test_matmul_cuda.py
@@ -1453,6 +1453,10 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r
device = "cuda"
M, K, N = mkn
+ if torch.version.hip:
+ if not (M % 32 == 0 and K % 32 == 0 and N % 32 == 0):
+ raise unittest.SkipTest("Matrix dimensions must be multiples of 32 on ROCm, skipping")
+
if recipe == "nvfp4" and K % 32 != 0:
return unittest.skip("K must be divisible by 32 for nvfp4 cublas gemm, skipping")
@@ -1462,7 +1466,7 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r
if test_case_name == "a_eye_b_eye":
if not ((M == K) and (M == N)):
- return unittest.skip("this test is only defined for M == K == N, skipping")
+ raise unittest.SkipTest("this test is only defined for M == K == N, skipping")
A_ref = torch.eye(M, device=device, dtype=torch.bfloat16)
B_ref = torch.eye(M, device=device, dtype=torch.bfloat16)
@@ -1601,7 +1605,7 @@ def test_blockwise_mxfp8_nvfp4_numerics(self, test_case_name, fast_accum, mkn, r
elif test_case_name == "data_random_scales_from_data":
if not K % BLOCK_SIZE == 0:
- return unittest.skip(f"this test is only defined for K a multiple of {BLOCK_SIZE}, skipping")
+ raise unittest.SkipTest(f"this test is only defined for K a multiple of {BLOCK_SIZE}, skipping")
require_exact_match = False
# random data, scales from data
A_ref = torch.randn((M, K), device=device, dtype=torch.bfloat16) * 1000
diff --git a/test/test_nn.py b/test/test_nn.py
index 2ece5fbdbd72..14d4eed971d2 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -8,6 +8,7 @@
import io
import itertools
import warnings
+import os
import pickle
import re
from copy import deepcopy
@@ -30,13 +31,13 @@
from torch.nn import Buffer, Parameter
from torch.nn.parallel._functions import Broadcast
from torch.testing._internal.common_dtype import integral_types, get_all_math_dtypes, floating_types
-from torch.testing._internal.common_utils import freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \
+from torch.testing._internal.common_utils import dtype_name, freeze_rng_state, run_tests, TestCase, skipIfNoLapack, skipIfRocm, \
TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \
download_file, get_function_arglist, load_tests, skipIfMPS, \
IS_PPC, \
parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
- skipIfTorchDynamo, gcIfJetson, set_default_dtype
-from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION
+ skipIfTorchDynamo, skipIfRocmVersionLessThan, gcIfJetson, set_default_dtype
+from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, PLATFORM_SUPPORTS_FLASH_ATTENTION, _get_torch_rocm_version
from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
ctcloss_reference, get_new_module_tests, single_batch_reference_fn, _test_bfloat16_ops, _test_module_empty_input
@@ -5136,7 +5137,209 @@ def test_batchnorm_nhwc_cuda(self):
inp2 = inp1.contiguous(memory_format=torch.channels_last)
out1 = model(inp1)
out2 = model(inp2)
- self.assertTrue(torch.equal(out1, out2))
+ self.assertEqual(out1, out2)
+
+ @unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
+ @parametrize_test("dims", [2, 3], name_fn=lambda x: f"{x}D")
+ @parametrize_test("mode", ["train", "inference"], name_fn=lambda x: x)
+ @parametrize_test(
+ # test verifies cudnn/miopen batchnorm against the reference backend or memory format
+ # memory_format - one of ("NCHW", "NHWC")
+ # ref_backend - one of ("cpu", "native", "NCHW", "NHWC")
+ # "cpu" - cpu backend with the same memory_format will be used as reference
+ # "native" - native backend (`with torch.backends.cudnn.flags(enabled=False)`)
+ # with the same memory_format will be used
+ # "NCHW" or "NHWC" - the same backend will be used but another memory format
+ # mixed - True or False. Mixed batchnorm mode where inputs are 16-bit and batchnorm is fp32
+ #
+ "memory_format,ref_backend,mixed,dtype",
+ [
+ ("NCHW", "cpu", False, torch.float),
+ ("NCHW", "cpu", True, torch.half),
+ ("NCHW", "cpu", True, torch.bfloat16),
+
+ ("NCHW", "native", False, torch.float),
+ ("NCHW", "native", True, torch.half),
+ ("NCHW", "native", True, torch.bfloat16),
+
+ ("NHWC", "cpu", False, torch.float),
+ ("NHWC", "cpu", True, torch.half),
+ ("NHWC", "cpu", True, torch.bfloat16),
+
+ ("NHWC", "native", False, torch.float),
+ ("NHWC", "native", True, torch.half),
+ ("NHWC", "native", True, torch.bfloat16),
+
+ ("NHWC", "NCHW", False, torch.float),
+ ("NHWC", "NCHW", True, torch.half),
+ ("NHWC", "NCHW", True, torch.bfloat16),
+ ],
+ name_fn=lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}"
+ )
+ def test_batchnorm(self, dims, mode, memory_format, ref_backend, mixed, dtype):
+ if torch.version.hip:
+ if self._testMethodName in ("test_batchnorm_2D_train_NHWC_vs_NCHW_mixed_bfloat16",
+ "test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16",
+ "test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16",
+ "test_batchnorm_3D_train_NCHW_vs_cpu_mixed_bfloat16"
+ ) and _get_torch_rocm_version() < (6, 4):
+ # NCHW bfloat16 path uses native kernels for rocm<=6.3
+ # train failed on rocm<=6.3 due to native tolerance issue SWDEV-507600
+ self.skipTest("bfloat16 NHWC train failed on ROCm <= 6.3")
+
+ if self._testMethodName in ("test_batchnorm_2D_train_NCHW_vs_native_mixed_bfloat16",
+ "test_batchnorm_3D_train_NCHW_vs_native_mixed_bfloat16"
+ ) and _get_torch_rocm_version() >= (6, 4):
+ self.skipTest("bfloat16 NCHW train failed due to native tolerance issue SWDEV-507600")
+
+ if self._testMethodName == "test_batchnorm_3D_train_NCHW_vs_native_mixed_float16" \
+ and _get_torch_rocm_version() < (6, 4):
+ self.skipTest("3D float16 NCHW train failed on ROCm<=6.3 ")
+
+ if dims == 3 and memory_format in ("NHWC", "NCHW"):
+ memory_format = memory_format + "3D"
+
+ def _create_tensor(size, memory_format, dtype, device):
+ t = torch.empty(size=size, memory_format=memory_format, dtype=dtype, device=device)
+ t = t.random_(1, 10)
+ return t
+
+ def _get_ref_device(backend: str, device: str):
+ # If 'backend' specifies the memory format, return the 'device' arg; otherwise return the device that matches the backend
+ if backend in ("NHWC", "NHWC3D", "NCHW", "NCHW3D"):
+ return device
+ if backend == "native":
+ return "cuda"
+ if backend == "cpu":
+ return "cpu"
+ else:
+ raise ValueError("Unknown backend")
+
+ def _get_backend_memory_format(backend: str, memory_format: torch.memory_format) -> torch.memory_format:
+ # If 'backend' specifies the memory format, return it, otherwise look at 'memory_format' arg
+ if backend == "NHWC":
+ return torch.channels_last
+ if backend == "NHWC3D":
+ return torch.channels_last_3d
+ if backend in ("NCHW", "NCHW3D"):
+ return torch.contiguous_format
+ if memory_format in (torch.contiguous_format, torch.channels_last, torch.channels_last_3d):
+ return memory_format
+ raise ValueError("Unable to detect memory format for backend={backend} and memory_format={memory_format}")
+
+ def _get_memory_format(t: torch.Tensor) -> torch.memory_format:
+ if t.is_contiguous(memory_format=torch.contiguous_format):
+ return torch.contiguous_format
+ if t.is_contiguous(memory_format=torch.channels_last):
+ return torch.channels_last
+ if t.is_contiguous(memory_format=torch.channels_last_3d):
+ return torch.channels_last_3d
+ return ValueError("Unsupported memory_format")
+
+ def _get_memory_format_from_name(memory_format_name: str) -> torch.memory_format:
+ if memory_format_name == "NHWC":
+ return torch.channels_last
+ elif memory_format_name == "NHWC3D":
+ return torch.channels_last_3d
+ elif memory_format_name in ("NCHW", "NCHW3D"):
+ return torch.contiguous_format
+ return ValueError("Unsupported memory_format")
+
+ def _create_backend(inp: torch.Tensor, mixed: bool = False):
+ mod = nn.BatchNorm2d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype) \
+ if inp.dim() == 4 else \
+ nn.BatchNorm3d(inp.size(1), device=inp.device, dtype=torch.float if mixed else inp.dtype)
+ return mod
+
+ def _test_batchnorm_train(inp, grad, mixed, ref_inp, ref_grad, ref_backend):
+ mod = _create_backend(inp, mixed).train()
+ mod.weight.data.uniform_()
+ mod.bias.data.uniform_()
+
+ ref_mod = _create_backend(ref_inp, mixed).train()
+ ref_mod.load_state_dict(mod.state_dict())
+
+ out = mod(inp)
+ out.backward(grad)
+
+ with torch.backends.cudnn.flags(enabled=False) if ref_backend == "native" else contextlib.nullcontext():
+ ref_out = ref_mod(ref_inp)
+ ref_out.backward(ref_grad)
+
+ self.assertTrue(out.is_contiguous(memory_format=_get_memory_format(inp)))
+ self.assertTrue(ref_out.is_contiguous(memory_format=_get_memory_format(ref_inp)))
+ self.assertEqual(out, ref_out)
+ self.assertEqual(mod.weight.grad, ref_mod.weight.grad)
+ self.assertEqual(mod.bias.grad, ref_mod.bias.grad)
+ self.assertEqual(mod.running_mean, ref_mod.running_mean)
+ self.assertEqual(mod.running_var, ref_mod.running_var)
+ self.assertEqual(inp.grad, ref_inp.grad)
+
+ def _train(memory_format_name, ref_backend, mixed, dtype):
+ memory_format = _get_memory_format_from_name(memory_format_name)
+
+ ref_memory_format = _get_backend_memory_format(ref_backend, memory_format)
+ ref_device = _get_ref_device(ref_backend, device="cuda")
+
+ size = (4, 8, 2, 2, 2) if memory_format_name in ("NCHW3D", "NHWC3D") else (4, 8, 2, 2)
+ inp = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_()
+ grad = _create_tensor(size, memory_format, dtype, device="cuda")
+ ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_()
+ ref_grad = grad.detach().clone(memory_format=ref_memory_format).to(device=ref_device)
+
+ _test_batchnorm_train(inp=inp, grad=grad, mixed=mixed,
+ ref_inp=ref_inp, ref_grad=ref_grad, ref_backend=ref_backend)
+
+ # TODO: enable permute logic later
+ # size = (2, 8, 8, 1)
+ # input = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_()
+ # grad = _create_tensor(size, memory_format=torch.contiguous_format, dtype=dtype, device="cuda")
+ # # grad = _create_tensor(size, memory_format=memory_format, dtype=dtype, device="cuda")
+
+ # ref_input = input.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_(True)
+ # ref_grad = grad.detach().clone(memory_format=torch.contiguous_format).to(device=ref_device)
+ # # ref_grad = grad.detach().clone(memory_format=ref_memory_format).to(device=ref_device)
+
+ # if memory_format == torch.channels_last:
+ # grad = grad.permute(0, 2, 1, 3)
+ # # grad = grad.permute(0, 2, 3, 1)
+ # if ref_memory_format == torch.channels_last:
+ # ref_grad = ref_grad.permute(0, 2, 1, 3)
+ # # ref_grad = ref_grad.permute(0, 2, 3, 1)
+ # _test_batchnorm_train(input=input, grad=grad, mixed=mixed,
+ # ref_input=ref_input, ref_grad=ref_grad, ref_backend=ref_backend)
+
+ def _inference(memory_format_name, ref_backend, mixed, dtype):
+ memory_format = _get_memory_format_from_name(memory_format_name)
+ ref_memory_format = _get_backend_memory_format(ref_backend, memory_format)
+ ref_device = _get_ref_device(ref_backend, device="cuda")
+
+ size = (2, 64, 50, 50, 50) if memory_format_name in ("NCHW3D", "NHWC3D") else (2, 64, 50, 50)
+ inp = _create_tensor(size, memory_format, dtype, device="cuda")
+ ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device)
+ mod = _create_backend(inp, mixed).eval()
+ ref_mod = _create_backend(ref_inp, mixed).eval()
+
+ out = mod(inp)
+ with torch.backends.cudnn.flags(enabled=False) if ref_backend == "native" else contextlib.nullcontext():
+ ref_out = ref_mod(ref_inp)
+ self.assertEqual(out, ref_out)
+
+ # TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM once ROCm officially supports NHWC in MIOpen
+ PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM = "PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM"
+ prev_val = os.getenv(PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM)
+ try:
+ os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM] = "1"
+ if mode == "train":
+ _train(memory_format, ref_backend, mixed, dtype)
+ else:
+ _inference(memory_format, ref_backend, mixed, dtype)
+ finally:
+ if prev_val is None:
+ del os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM]
+ else:
+ os.environ[PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM] = prev_val
def test_batchnorm_load_state_dict(self):
bn = torch.nn.BatchNorm2d(3)
@@ -8377,7 +8580,6 @@ def test_affine_3d_rotateRandom(self, device):
self.assertEqual(scipy_ary, gridsample_ary.reshape_as(scipy_ary))
-
@onlyCUDA
@dtypes(torch.float, torch.half)
def test_batchnorm_large_batch(self, device, dtype):
diff --git a/third_party/composable_kernel b/third_party/composable_kernel
index 8086bbe3a78d..df6023e305f3 160000
--- a/third_party/composable_kernel
+++ b/third_party/composable_kernel
@@ -1 +1 @@
-Subproject commit 8086bbe3a78d931eb96fe12fdc014082e18d18d3
+Subproject commit df6023e305f389bbf7249b0c4414e649f3ad6598
diff --git a/tools/autograd/derivatives.yaml b/tools/autograd/derivatives.yaml
index e2419aab268b..d711480bb85e 100644
--- a/tools/autograd/derivatives.yaml
+++ b/tools/autograd/derivatives.yaml
@@ -2793,7 +2793,7 @@
self, weight, bias: "grad.defined() ? convolution_backward_symint(grad, self, weight, bias->sym_sizes(), stride, padding, dilation, false, std::vector(padding.size(), 0), groups, grad_input_mask) : std::tuple()"
- name: miopen_batch_norm(Tensor input, Tensor weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float exponential_average_factor, float epsilon) -> (Tensor, Tensor, Tensor)
- input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple<Tensor, Tensor, Tensor>()"
+ input, weight, bias: "grad.defined() ? (training ? miopen_batch_norm_backward(input, grad.contiguous(input.suggest_memory_format()), weight, running_mean, running_var, result1, result2, epsilon) : native_batch_norm_backward(grad, input, weight, running_mean, running_var, result1, result2, training, epsilon, grad_input_mask)) : std::tuple<Tensor, Tensor, Tensor>()"
result0: batch_norm_jvp(input_p, input_t, weight_p, weight_t, bias_p, bias_t, running_mean, running_var, result1, result2, training, epsilon)
- name: miopen_batch_norm_backward(Tensor input, Tensor grad_output, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, float epsilon) -> (Tensor, Tensor, Tensor)
diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
index 4cc29b0c347f..20ccf0d74b60 100644
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
@@ -255,7 +255,7 @@ static __global__ void barrier_kernel(
void CUDASymmetricMemory::barrier(int channel, size_t timeout_ms) {
check_channel(channel, world_size_);
c10::cuda::CUDAGuard guard(local_device_idx_);
- barrier_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>(
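+ // at::cuda::warp_size() queries the active device's warp size at runtime; on ROCm the
+ // wavefront size is device-dependent (64 on CDNA, 32 on RDNA), which a compile-time
+ // constant cannot capture -- likely the reason for moving away from C10_WARP_SIZE here.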
+ barrier_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>(
reinterpret_cast<uint32_t**>(signal_pads_dev_),
channel,
rank_,
@@ -293,7 +293,7 @@ void CUDASymmetricMemory::put_signal(
size_t timeout_ms) {
check_channel(channel, world_size_);
c10::cuda::CUDAGuard guard(local_device_idx_);
- put_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>(
+ put_signal_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>(
reinterpret_cast<uint32_t**>(signal_pads_dev_),
dst_rank,
channel,
@@ -337,7 +337,7 @@ void CUDASymmetricMemory::wait_signal(
size_t timeout_ms) {
check_channel(channel, world_size_);
c10::cuda::CUDAGuard guard(local_device_idx_);
- wait_signal_kernel<<<1, C10_WARP_SIZE, 0, at::cuda::getCurrentCUDAStream()>>>(
+ wait_signal_kernel<<<1, at::cuda::warp_size(), 0, at::cuda::getCurrentCUDAStream()>>>(
reinterpret_cast<uint32_t**>(signal_pads_dev_),
src_rank,
channel,
diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
index d7652b77ebab..a2d5f8f9f67b 100644
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemoryOps.cu
@@ -114,7 +114,7 @@ void init_elementwise_launch_config(
num_blocks = 1;
num_threads = at::round_up(
at::ceil_div(numel_per_split, numel_per_thread),
- static_cast<size_t>(C10_WARP_SIZE));
+ static_cast<size_t>(at::cuda::warp_size()));
} else {
num_blocks = std::min(
at::ceil_div(numel_per_split, max_num_threads * numel_per_thread),
diff --git a/torch/testing/_internal/common_cuda.py b/torch/testing/_internal/common_cuda.py
index a211851d671f..2620c64a95ef 100644
--- a/torch/testing/_internal/common_cuda.py
+++ b/torch/testing/_internal/common_cuda.py
@@ -108,7 +108,15 @@ def evaluate_platform_supports_fp8():
PLATFORM_SUPPORTS_FP8: bool = LazyVal(lambda: evaluate_platform_supports_fp8())
-PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: TEST_CUDA and SM100OrLater)
+def _platform_supports_mx_gemm():
+ if torch.cuda.is_available():
+ if torch.version.hip:
+ return 'gfx95' in torch.cuda.get_device_properties(0).gcnArchName
+ else:
+ return SM100OrLater
+ return False
+
+PLATFORM_SUPPORTS_MX_GEMM: bool = LazyVal(lambda: _platform_supports_mx_gemm())
if TEST_NUMBA:
try:
diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py
index 45b7378f88cc..052a968d51e2 100644
--- a/torch/testing/_internal/common_utils.py
+++ b/torch/testing/_internal/common_utils.py
@@ -49,6 +49,7 @@
from typing import (
Any,
Callable,
+ Dict,
Optional,
TypeVar,
Union,
@@ -102,8 +103,18 @@
has_pytest = False
-MI300_ARCH = ("gfx942",)
+MI300_ARCH = ("gfx940", "gfx941", "gfx942")
+NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201")
+NAVI3_ARCH = ("gfx1100", "gfx1101")
+NAVI4_ARCH = ("gfx1200", "gfx1201")
+def is_navi3_arch():
+ if torch.cuda.is_available():
+ prop = torch.cuda.get_device_properties(0)
+ gfx_arch = prop.gcnArchName.split(":")[0]
+ if gfx_arch in NAVI3_ARCH:
+ return True
+ return False
def freeze_rng_state(*args, **kwargs):
return torch.testing._utils.freeze_rng_state(*args, **kwargs)
@@ -5710,3 +5721,26 @@ def load_inline(*args, **kwargs):
return func(*args, load_inline=load_inline, **kwargs)
return wrapper
+
+# Decorator to patch multiple test class members for the duration of the subtest
+def patch_test_members(updates: Dict[str, Any]):
+ def decorator(test_func):
+ @wraps(test_func)
+ def wrapper(self, *args, **kwargs):
+ # Store the original values of the specified members
+ original_values = {member: getattr(self, member) for member in updates}
+
+ # Update the members before running the subtest
+ for member, value in updates.items():
+ setattr(self, member, value)
+
+ # Run the test function, allowing subtests to run
+ try:
+ return test_func(self, *args, **kwargs)
+ finally:
+ # Restore the original values of the specified members after the subtest finishes
+ for member, original_value in original_values.items():
+ setattr(self, member, original_value)
+
+ return wrapper
+ return decorator
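+
+# Hypothetical usage sketch (the member name and value below are illustrative only,
+# not taken from any existing test):
+#
+#   @patch_test_members({"precision": 2e-3})
+#   def test_something(self):
+#       ...  # runs with self.precision patched; the original value is restored afterwards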
\ No newline at end of file
diff --git a/torch/utils/hipify/cuda_to_hip_mappings.py b/torch/utils/hipify/cuda_to_hip_mappings.py
index a5145a2f4870..b251a85e245a 100644
--- a/torch/utils/hipify/cuda_to_hip_mappings.py
+++ b/torch/utils/hipify/cuda_to_hip_mappings.py
@@ -3870,6 +3870,7 @@
("CUDA_C_64U", ("HIP_C_64U", CONV_TYPE, API_RUNTIME)),
("CUDA_R_8F_E4M3", ("HIP_R_8F_E4M3", CONV_TYPE, API_RUNTIME)),
("CUDA_R_8F_E5M2", ("HIP_R_8F_E5M2", CONV_TYPE, API_RUNTIME)),
+ ("CUDA_R_4F_E2M1", ("HIP_R_4F_E2M1", CONV_TYPE, API_RUNTIME)),
(
"MAJOR_VERSION",
("hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED),
@@ -7347,6 +7348,10 @@
("CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F", ("HIPBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F", CONV_MATH_FUNC, API_BLAS)),
("CUBLASLT_MATMUL_DESC_AMAX_D_POINTER", ("HIPBLASLT_MATMUL_DESC_AMAX_D_POINTER", CONV_MATH_FUNC, API_BLAS)),
("CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", ("HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE", CONV_MATH_FUNC, API_BLAS)),
+ ("CUBLASLT_MATMUL_DESC_A_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_A_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)),
+ ("CUBLASLT_MATMUL_DESC_B_SCALE_MODE", ("HIPBLASLT_MATMUL_DESC_B_SCALE_MODE", CONV_MATH_FUNC, API_BLAS)),
+ ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0", CONV_MATH_FUNC, API_BLAS)),
+ ("CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", ("HIPBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3", CONV_MATH_FUNC, API_BLAS)),
("cublasLtMatrixLayout_t", ("hipblasLtMatrixLayout_t", CONV_MATH_FUNC, API_BLAS)),
("cublasLtMatrixLayoutOpaque_t", ("hipblasLtMatrixLayoutOpaque_t", CONV_MATH_FUNC, API_BLAS)),
("cublasLtMatrixLayoutAttribute_t", ("hipblasLtMatrixLayoutAttribute_t", CONV_MATH_FUNC, API_BLAS)),