From 9060485439b036ba4cc897a5c4ce9790dc310c4c Mon Sep 17 00:00:00 2001
From: "Ling, Liyang"
Date: Thu, 27 Jun 2024 08:26:47 +0000
Subject: [PATCH 1/2] Add experimental cl_bf16_conversion extension query

---
 csrc/gpu/aten/core/DeviceInfo.h                 | 1 +
 csrc/gpu/runtime/Device.cpp                     | 3 +++
 csrc/gpu/runtime/DeviceProp.h                   | 1 +
 intel_extension_for_pytorch/csrc/xpu/Module.cpp | 4 +++-
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/csrc/gpu/aten/core/DeviceInfo.h b/csrc/gpu/aten/core/DeviceInfo.h
index a487e215b..aa13b5f50 100644
--- a/csrc/gpu/aten/core/DeviceInfo.h
+++ b/csrc/gpu/aten/core/DeviceInfo.h
@@ -34,6 +34,7 @@ struct DeviceInfo {
   uint32_t max_num_sub_groups;
   std::vector<size_t> sub_group_sizes;
   bool support_fp64;
+  bool support_cl_bf16_conversion;
 };
 
 } // namespace dpcpp
diff --git a/csrc/gpu/runtime/Device.cpp b/csrc/gpu/runtime/Device.cpp
index 5dd4a123e..ce89de451 100644
--- a/csrc/gpu/runtime/Device.cpp
+++ b/csrc/gpu/runtime/Device.cpp
@@ -322,6 +322,8 @@ static void initDeviceProperty(DeviceId device_id) {
       : 8;
   device_prop.support_atomic64 = device.has(dpcpp_dev_aspect_atomic64);
   device_prop.support_fp64 = device.has(dpcpp_dev_aspect_fp64);
+  sycl::ext::oneapi::experimental::cl_version version{20, 20, 20};
+  device_prop.support_cl_bf16_conversion = device.ext_oneapi_supports_cl_extension("cl_intel_bfloat16_conversions", &version);
 
   device_properties[device_id] = device_prop;
 
@@ -358,6 +360,7 @@
   dev_info.support_fp64 = device_prop.support_fp64;
 #if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20240100)
   dev_info.device_arch = static_cast<uint64_t>(device_prop.device_arch);
+  dev_info.support_cl_bf16_conversion = device_prop.support_cl_bf16_conversion;
 #else
   dev_info.device_arch = (uint64_t)0;
 #endif
diff --git a/csrc/gpu/runtime/DeviceProp.h b/csrc/gpu/runtime/DeviceProp.h
index dbd0d07a2..f2af1843d 100644
--- a/csrc/gpu/runtime/DeviceProp.h
+++ b/csrc/gpu/runtime/DeviceProp.h
@@ -143,6 +143,7 @@ struct DeviceProp {
 
   bool support_fp64;
   bool support_atomic64;
+  bool support_cl_bf16_conversion;
 };
 
 } // namespace dpcpp
diff --git a/intel_extension_for_pytorch/csrc/xpu/Module.cpp b/intel_extension_for_pytorch/csrc/xpu/Module.cpp
index 8528051c5..34bc3117e 100644
--- a/intel_extension_for_pytorch/csrc/xpu/Module.cpp
+++ b/intel_extension_for_pytorch/csrc/xpu/Module.cpp
@@ -577,6 +577,7 @@ static void register_xpu_device_info(PyObject* module) {
       .def_readonly("max_num_sub_groups", &DeviceInfo::max_num_sub_groups)
       .def_readonly("sub_group_sizes", &DeviceInfo::sub_group_sizes)
       .def_readonly("has_fp64", &DeviceInfo::support_fp64)
+      .def_readonly("support_cl_bf16_conversion", &DeviceInfo::support_cl_bf16_conversion)
       .def_readonly("device_arch", &DeviceInfo::device_arch)
       .def_property_readonly(
           "dev_type", [](const DeviceInfo& info) { return get_dev_type(info); })
@@ -589,7 +590,8 @@
             << ", total_memory=" << info.global_mem_size / (1024 * 1024)
             << "MB, max_compute_units=" << info.max_compute_units
             << ", gpu_eu_count=" << info.gpu_eu_count
-            << ", device_arch=" << info.device_arch << ")";
+            << ", device_arch=" << info.device_arch
+            << ", support_cl_bf16_conversion=" << info.support_cl_bf16_conversion << ")";
         return stream.str();
       });
 }
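For context, here is a minimal standalone sketch of the query this patch wires into initDeviceProperty. It is illustrative only, not part of the patch: it assumes a DPC++ compiler that implements the experimental sycl_ext_oneapi_kernel_compiler_opencl extension (which provides device::ext_oneapi_supports_cl_extension) and at least one visible GPU device.

```cpp
// Illustrative sketch only (not part of the patch): a standalone program
// performing the same extension query the patch adds to initDeviceProperty.
// Assumes a DPC++ compiler with the experimental
// sycl_ext_oneapi_kernel_compiler_opencl extension and a visible GPU device.
#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::device device{sycl::gpu_selector_v};
  // Out-parameter for the supported extension version; it is presumably only
  // written when the query succeeds, so the initial value does not matter
  // (patch 2 below accordingly drops the {20, 20, 20} placeholder).
  sycl::ext::oneapi::experimental::cl_version version{};
  bool supported = device.ext_oneapi_supports_cl_extension(
      "cl_intel_bfloat16_conversions", &version);
  std::cout << "cl_intel_bfloat16_conversions: "
            << (supported ? "supported" : "not supported") << std::endl;
  return 0;
}
```

Compiled with something like `icpx -fsycl query_bf16.cpp`, this prints whether the device's OpenCL driver reports the extension.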
From dcd81cd06628a52cf06f7ab3f1fd66294d1fa325 Mon Sep 17 00:00:00 2001
From: "Ling, Liyang"
Date: Fri, 28 Jun 2024 04:14:15 +0000
Subject: [PATCH 2/2] Add 3 more extension checks

---
 csrc/gpu/aten/core/DeviceInfo.h                 |  3 +++
 csrc/gpu/runtime/Device.cpp                     | 10 ++++++++--
 csrc/gpu/runtime/DeviceProp.h                   |  3 +++
 intel_extension_for_pytorch/csrc/xpu/Module.cpp |  9 ++++++++-
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/csrc/gpu/aten/core/DeviceInfo.h b/csrc/gpu/aten/core/DeviceInfo.h
index aa13b5f50..2bd0a4b01 100644
--- a/csrc/gpu/aten/core/DeviceInfo.h
+++ b/csrc/gpu/aten/core/DeviceInfo.h
@@ -35,6 +35,9 @@ struct DeviceInfo {
   std::vector<size_t> sub_group_sizes;
   bool support_fp64;
   bool support_cl_bf16_conversion;
+  bool support_cl_sg_matmul_acc;
+  bool support_cl_sg_matmul_acc_tf32;
+  bool support_cl_sg_2d_block_io;
 };
 
 } // namespace dpcpp
diff --git a/csrc/gpu/runtime/Device.cpp b/csrc/gpu/runtime/Device.cpp
index ce89de451..fcf679453 100644
--- a/csrc/gpu/runtime/Device.cpp
+++ b/csrc/gpu/runtime/Device.cpp
@@ -322,8 +322,11 @@ static void initDeviceProperty(DeviceId device_id) {
       : 8;
   device_prop.support_atomic64 = device.has(dpcpp_dev_aspect_atomic64);
   device_prop.support_fp64 = device.has(dpcpp_dev_aspect_fp64);
-  sycl::ext::oneapi::experimental::cl_version version{20, 20, 20};
+  sycl::ext::oneapi::experimental::cl_version version;
   device_prop.support_cl_bf16_conversion = device.ext_oneapi_supports_cl_extension("cl_intel_bfloat16_conversions", &version);
+  device_prop.support_cl_sg_matmul_acc = device.ext_oneapi_supports_cl_extension("cl_intel_subgroup_matrix_multiply_accumulate", &version);
+  device_prop.support_cl_sg_matmul_acc_tf32 = device.ext_oneapi_supports_cl_extension("cl_intel_subgroup_matrix_multiply_accumulate_tensor_float32", &version);
+  device_prop.support_cl_sg_2d_block_io = device.ext_oneapi_supports_cl_extension("cl_intel_subgroup_2d_block_io", &version);
 
   device_properties[device_id] = device_prop;
 
@@ -358,9 +361,12 @@
   dev_info.max_num_sub_groups = device_prop.max_num_subgroup;
   dev_info.sub_group_sizes = device_prop.subgroup_sizes;
   dev_info.support_fp64 = device_prop.support_fp64;
+  dev_info.support_cl_bf16_conversion = device_prop.support_cl_bf16_conversion;
+  dev_info.support_cl_sg_matmul_acc = device_prop.support_cl_sg_matmul_acc;
+  dev_info.support_cl_sg_matmul_acc_tf32 = device_prop.support_cl_sg_matmul_acc_tf32;
+  dev_info.support_cl_sg_2d_block_io = device_prop.support_cl_sg_2d_block_io;
 #if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20240100)
   dev_info.device_arch = static_cast<uint64_t>(device_prop.device_arch);
-  dev_info.support_cl_bf16_conversion = device_prop.support_cl_bf16_conversion;
 #else
   dev_info.device_arch = (uint64_t)0;
 #endif
diff --git a/csrc/gpu/runtime/DeviceProp.h b/csrc/gpu/runtime/DeviceProp.h
index f2af1843d..b4bd5ce70 100644
--- a/csrc/gpu/runtime/DeviceProp.h
+++ b/csrc/gpu/runtime/DeviceProp.h
@@ -144,6 +144,9 @@ struct DeviceProp {
   bool support_fp64;
   bool support_atomic64;
   bool support_cl_bf16_conversion;
+  bool support_cl_sg_matmul_acc;
+  bool support_cl_sg_matmul_acc_tf32;
+  bool support_cl_sg_2d_block_io;
 };
 
 } // namespace dpcpp
diff --git a/intel_extension_for_pytorch/csrc/xpu/Module.cpp b/intel_extension_for_pytorch/csrc/xpu/Module.cpp
index 34bc3117e..fb0a96a3f 100644
--- a/intel_extension_for_pytorch/csrc/xpu/Module.cpp
+++ b/intel_extension_for_pytorch/csrc/xpu/Module.cpp
@@ -578,6 +578,9 @@ static void register_xpu_device_info(PyObject* module) {
       .def_readonly("sub_group_sizes", &DeviceInfo::sub_group_sizes)
       .def_readonly("has_fp64", &DeviceInfo::support_fp64)
       .def_readonly("support_cl_bf16_conversion", &DeviceInfo::support_cl_bf16_conversion)
+      .def_readonly("support_cl_sg_matmul_acc", &DeviceInfo::support_cl_sg_matmul_acc)
+      .def_readonly("support_cl_sg_matmul_acc_tf32", &DeviceInfo::support_cl_sg_matmul_acc_tf32)
+      .def_readonly("support_cl_sg_2d_block_io", &DeviceInfo::support_cl_sg_2d_block_io)
       .def_readonly("device_arch", &DeviceInfo::device_arch)
       .def_property_readonly(
           "dev_type", [](const DeviceInfo& info) { return get_dev_type(info); })
@@ -591,7 +594,11 @@
             << "MB, max_compute_units=" << info.max_compute_units
             << ", gpu_eu_count=" << info.gpu_eu_count
             << ", device_arch=" << info.device_arch
-            << ", support_cl_bf16_conversion=" << info.support_cl_bf16_conversion << ")";
+            << ", support_cl_bf16_conversion=" << info.support_cl_bf16_conversion
+            << ", support_cl_sg_matmul_acc=" << info.support_cl_sg_matmul_acc
+            << ", support_cl_sg_matmul_acc_tf32=" << info.support_cl_sg_matmul_acc_tf32
+            << ", support_cl_sg_2d_block_io=" << info.support_cl_sg_2d_block_io
+            << ")";
         return stream.str();
       });
 }
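Taken together, the series exposes four extension-support booleans. As a quick smoke test outside IPEX, a sketch like the following (illustrative only, same experimental DPC++ API assumption as above; the names in the table are local to this example, chosen to mirror the DeviceProp flags) reproduces all four checks:

```cpp
// Illustrative sketch only (not IPEX code): reproduces the four extension
// checks added by this series, pairing each DeviceProp flag name with the
// OpenCL extension string queried for it. Assumes the experimental
// sycl_ext_oneapi_kernel_compiler_opencl extension and a visible GPU device.
#include <sycl/sycl.hpp>
#include <iostream>
#include <utility>
#include <vector>

int main() {
  sycl::device device{sycl::gpu_selector_v};
  // A single cl_version out-parameter is reused across all queries,
  // matching how initDeviceProperty performs the checks.
  sycl::ext::oneapi::experimental::cl_version version{};
  const std::vector<std::pair<const char*, const char*>> checks = {
      {"support_cl_bf16_conversion", "cl_intel_bfloat16_conversions"},
      {"support_cl_sg_matmul_acc",
       "cl_intel_subgroup_matrix_multiply_accumulate"},
      {"support_cl_sg_matmul_acc_tf32",
       "cl_intel_subgroup_matrix_multiply_accumulate_tensor_float32"},
      {"support_cl_sg_2d_block_io", "cl_intel_subgroup_2d_block_io"},
  };
  for (const auto& [flag, ext] : checks) {
    bool supported = device.ext_oneapi_supports_cl_extension(ext, &version);
    std::cout << flag << "=" << supported << "\n";
  }
  return 0;
}
```

On the Python side, the same four flags surface as read-only attributes of the device-info object registered in Module.cpp and appear in its repr string.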