diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
index 3a88dae041dd1..07c368efb4273 100644
--- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td
@@ -71,4 +71,21 @@ def XeGPUBlocking: Pass<"xegpu-blocking"> {
   ];
 }
 
+def XeGPUAttachTargetDevice : Pass<"xegpu-attach-target-device", "ModuleOp"> {
+  let summary = "Attach a dlti.target_system_spec entry with a named device";
+  let description = [{
+    This pass attaches a `dlti.target_system_spec` attribute to the module,
+    holding a "GPU" device spec with the entry `"name" = "<device-name>"`.
+  }];
+
+  let options = [
+    Option<"deviceName", "device-name", "std::string",
+           /*default=*/"\"pvc\"",
+           "Name of the target device to attach (e.g. pvc)">,
+  ];
+  let dependentDialects = [
+    "xegpu::XeGPUDialect", "mlir::DLTIDialect"
+  ];
+}
+
 #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
new file mode 100644
index 0000000000000..4d33a5c9fcb41
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h
@@ -0,0 +1,227 @@
+//===--- IntelGpuXe2.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Intel GPU uArch definitions (PVC and BMG).
+///
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_XE2_H
+#define MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_XE2_H
+
+#include "mlir/Dialect/XeGPU/uArch/uArch.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/TypeUtilities.h"
+#include <cassert>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace mlir {
+namespace xegpu {
+namespace uArch {
+namespace Xe2Plus {
+// Per-XeCore resources: thread count, shared memory, and the number of vector
+// and matrix units.
+struct XeCoreInfo {
+  uint32_t num_threads;
+  SharedMemory shared_memory;
+  uint32_t num_vector_units;
+  uint32_t num_matrix_units;
+
+  // Constructor
+  XeCoreInfo(uint32_t num_threads, const SharedMemory &shared_memory,
+             uint32_t num_vector_units, uint32_t num_matrix_units)
+      : num_threads(num_threads), shared_memory(shared_memory),
+        num_vector_units(num_vector_units), num_matrix_units(num_matrix_units) {
+  }
+};
+
+// A uArch extended with the description of one XeCore. Everything after
+// `xeCore` is forwarded, in order, to the uArch base constructor.
+struct Xe2Plus : public uArch {
+  XeCoreInfo xe_core;
+  Xe2Plus(
+      const std::string &archName, const std::string &archDescription,
+      const XeCoreInfo &xeCore,
+      const std::vector<uArchHierarchyComponent> &hierarchy = {},
+      const std::map<std::string, RegisterFileInfo> &regInfo = {},
+      const std::vector<CacheInfo> &cacheInfo = {},
+      const std::map<std::string, std::shared_ptr<Instruction>> &instrs = {},
+      const std::vector<Restriction<> *> &restrs = {})
+      : uArch(archName, archDescription, hierarchy, regInfo, cacheInfo, instrs,
+              restrs),
+        xe_core(xeCore) {}
+};
+
+// struct to represent DPAS instruction
+// @TODO: model systolic depth, repeat count, execution size, ops per channel
+// and the supported type tuples once they are needed.
+struct DPASInstruction : public Instruction, public MatrixOpInterface {
+  DPASInstruction()
+      : Instruction("dpas",                     // name
+                    "Dot Product Accumulate",   // description
+                    "0xABCD",                   // opcode
+                    FunctionalUnit::Matrix,     // functional_unit
+                    InstructionType::SIMD,      // type
+                    InstructionScope::Subgroup, // scope
+                    UnitOfComputation::Matrix)  // unit_of_computation
+  {}
+
+  // Override all virtuals from MatrixOpInterface
+  bool checkSupportedMMATypes(mlir::Type AType, mlir::Type BType,
+                              mlir::Type CType, mlir::Type DType) override;
+  std::vector<uint32_t> getSupportedM(mlir::Type type) override;
+  std::vector<uint32_t> getSupportedK(mlir::Type type) override;
+  std::vector<uint32_t> getSupportedN(mlir::Type type) override;
+  std::vector<std::pair<uint32_t, uint32_t>>
+  getSupportedMatrix(mlir::Type type, MatrixType matrixType) override;
+};
+
+// struct to represent Load2D/Store2D/Prefetch instruction
+// NOTE(review): no constructor is declared here although the Instruction base
+// has no default constructor -- confirm how this type is meant to be built.
+struct LoadStorePrefetch2DInstruction : public Instruction {
+  MemoryType memory_type;
+  MemoryAccessType memory_access_type;
+  std::vector<uint32_t> supported_types_bitwidth;
+  // NOTE(review): the key/value types of this map were not recoverable from
+  // the patch text; name -> alignment is assumed -- confirm.
+  std::map<std::string, uint32_t> alignment;
+  std::vector<std::vector<uint32_t>> supported_tile_sizes;
+  // NOTE(review): not consulted by validateSurfacePitchRestriction below,
+  // which compares against a literal 64 -- confirm which one is intended.
+  uint32_t min_surface_pitch;
+
+  // Validate Array length restriction on a given tile: tile[1] elements times
+  // array_len must not exceed 64 bytes. `tile` must have exactly two entries.
+  bool validateArrayLenRestriction(std::vector<uint32_t> tile,
+                                   uint32_t array_len, mlir::Type dataType) {
+
+    Restriction<std::vector<uint32_t>, uint32_t, mlir::Type>
+        width_array_len_restriction(
+            tile, array_len, dataType,
+            [](std::vector<uint32_t> tile, uint32_t array_len,
+               mlir::Type dataType) {
+              assert(tile.size() == 2);
+              // Compare in bits. Dividing the bit width by 8 first rounds
+              // sub-byte element types down to 0 bytes, which made the
+              // restriction hold for every tile of such a type.
+              return tile[1] * array_len * dataType.getIntOrFloatBitWidth() <=
+                     64 * 8;
+            });
+    return width_array_len_restriction.validate();
+  }
+
+  // Validate Surface Pitch restriction on a given tile: the pitch must be at
+  // least 64 bytes. `tile` must have exactly two entries.
+  bool validateSurfacePitchRestriction(std::vector<uint32_t> tile,
+                                       uint32_t surfacePitch /*in bytes*/) {
+    Restriction<std::vector<uint32_t>, uint32_t> surface_pitch_restriction(
+        tile, surfacePitch,
+        [](std::vector<uint32_t> tile, uint32_t surfacePitch) {
+          assert(tile.size() == 2);
+          return surfacePitch >= 64;
+        });
+    return surface_pitch_restriction.validate();
+  }
+};
+
+namespace PVCuArch {
+struct PVCuArch : public Xe2Plus {
+  // Maintains ownership of the instructions owned by PVCuArch
+  std::vector<std::shared_ptr<Instruction>> owned_instructions;
+  PVCuArch()
+      : Xe2Plus("pvc",                        // archName
+                "Ponte Vecchio Architecture", // archDescription
+                XeCoreInfo(8, SharedMemory(512 * 1024, 4), 8, 8), // xeCore
+                {/* hierarchy */},          // filled in below
+                {/* register_file_info */}, // filled in below
+                {/* cache_info */},         // filled in below
+                {/* instructions */}        // filled in below
+        ) {
+    // Initialize uArchHierarchy
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("thread", 0));
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeCore", 8));
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeSlice", 16));
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeStack", 4));
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("gpu", 2));
+    // Initialize register file info
+    // GRF
+    this->register_file_info.emplace(
+        "GRF",
+        RegisterFileInfo(64 * 1024,          // size in bits
+                         {"small", "large"}, // GRF modes
+                         {128, 256},         // registers per thread per mode
+                         0,                  // number of banks
+                         0                   // bank size
+                         ));
+    // Initialize cache info
+    // L1 cache, XeCore level
+    this->cache_info.push_back(
+        CacheInfo(512 * 1024, 64, this->uArch_hierarchy[1]));
+    // L3 cache, XeStack level
+    this->cache_info.push_back(
+        CacheInfo(512 * 1024, 64, this->uArch_hierarchy[3]));
+
+    // Add the instructions
+    auto dpas = std::make_shared<DPASInstruction>();
+    instructions.emplace(dpas->name, dpas);
+    owned_instructions.push_back(dpas);
+  }
+};
+} // namespace PVCuArch
+
+namespace BMGuArch {
+struct BMGuArch : public Xe2Plus {
+  // Maintains ownership of the instructions owned by BMGuArch
+  std::vector<std::shared_ptr<Instruction>> owned_instructions;
+  BMGuArch()
+      : Xe2Plus("bmg",                     // archName
+                "Battlemage Architecture", // archDescription
+                XeCoreInfo(8, SharedMemory(256 * 1024, 4), 8, 8), // xeCore
+                {/* hierarchy */},          // filled in below
+                {/* register_file_info */}, // filled in below
+                {/* cache_info */},         // filled in below
+                {/* instructions */}        // filled in below
+        ) {
+    // Initialize uArchHierarchy
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("thread", 0));
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeCore", 8));
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeSlice", 4));
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("XeStack", 5));
+    this->uArch_hierarchy.push_back(uArchHierarchyComponent("gpu", 1));
+    // Initialize register file info
+    // GRF (same construction as PVCuArch; the map is empty at this point)
+    this->register_file_info.emplace(
+        "GRF",
+        RegisterFileInfo(64 * 1024,          // size in bits
+                         {"small", "large"}, // GRF modes
+                         {128, 256},         // registers per thread per mode
+                         0,                  // number of banks
+                         0                   // bank size
+                         ));
+    // Initialize cache info
+    // L1 cache, XeCore level
+    this->cache_info.push_back(
+        CacheInfo(256 * 1024, 64, this->uArch_hierarchy[1]));
+    // L3 cache, XeStack level
+    this->cache_info.push_back(
+        CacheInfo(18 * 1024 * 1024, 256, this->uArch_hierarchy[3]));
+
+    // Add the instructions
+    auto dpas = std::make_shared<DPASInstruction>();
+    instructions.emplace(dpas->name, dpas);
+    owned_instructions.push_back(dpas);
+  }
+};
+} // namespace BMGuArch
+
+} // namespace Xe2Plus
+} // namespace uArch
+} // namespace xegpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_XEGPU_UTILS_INTEL_GPU_XE2_H
diff --git a/mlir/include/mlir/Dialect/XeGPU/uArch/uArch.h b/mlir/include/mlir/Dialect/XeGPU/uArch/uArch.h
new file mode 100644
index 0000000000000..2540af5c253dd
--- /dev/null
+++ b/mlir/include/mlir/Dialect/XeGPU/uArch/uArch.h
@@ -0,0 +1,389 @@
+//===--- uArch.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Base uArch definition for different architectures.
+///
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_DIALECT_XEGPU_UTILS_UARCH_H
+#define MLIR_DIALECT_XEGPU_UTILS_UARCH_H
+
+#include <any>
+#include <cstdint>
+#include <functional>
+#include <map>
+#include <memory>
+#include <mutex>
+#include <shared_mutex>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "mlir/IR/Types.h"
+
+namespace mlir {
+namespace xegpu {
+namespace uArch {
+
+// Data types we need for YAML to uArch translation
+struct Range {
+  int start;
+  int end;
+};
+
+// Restriction struct
+// Represents a restriction on the uArch as a tuple of parameters (the template
+// arguments) plus a predicate that takes exactly those parameters. validate()
+// applies the predicate to the stored parameters and returns true when they
+// satisfy the restriction and false otherwise; apply() returns the same result
+// wrapped in a std::any.
+//
+// For example, a restriction checking that a tile has two dimensions:
+//   std::vector<uint32_t> tile = {8, 16};
+//   Restriction<std::vector<uint32_t>> r1(
+//       tile, [](std::vector<uint32_t> t) { return t.size() == 2; });
+//   r1.validate(); // true, the tile has two entries
+//
+// The primary purpose of Restriction struct is to provide a generic way to
+// represent restrictions on the uArch and to validate if the uArch satisfies
+// the restrictions
+template <typename... Args>
+struct Restriction {
+  std::tuple<Args...> data;
+  std::function<bool(Args...)> func;
+
+  Restriction(Args... args, std::function<bool(Args...)> f)
+      : data(args...), func(f) {}
+
+  bool validate() { return std::apply(func, data); }
+  std::any apply() { return std::apply(func, data); }
+};
+
+// Architecture HW component hierarchy to present thread, core, socket ...
+struct uArchHierarchyComponent {
+  std::string name = ""; // optional name of the hierarchy component
+  // no. of lower hierarchy component it contains, e.g., for PVC XeCore it
+  // contains 8 threads, so no_of_component=8
+  uint32_t no_of_component;
+  // Constructor
+  uArchHierarchyComponent(const std::string &name, uint32_t no_of_component)
+      : name(name), no_of_component(no_of_component) {}
+};
+
+// An enum class to represent the functional unit of an instruction
+enum class FunctionalUnit {
+  ALU,
+  Tensor,
+  Matrix,
+  Load,
+  Store,
+  Branch,
+  Barrier,
+  Memory,
+  Atomic,
+  Interconnect,
+  Other
+};
+
+// An enum class to represent the type of memory
+enum class MemoryType { Shared, Local, Global, Constant, Texture, Other };
+
+// An enum class to represent the memory access type
+enum class MemoryAccessType { Read, Write, ReadWrite, Other };
+
+// An enum class to represent the type of an instruction
+enum class InstructionType { SIMT, SIMD, SPMD, MIMD, Other };
+
+// An enum class to represent the scope of an instruction
+enum class InstructionScope {
+  WorkItem,
+  Subgroup,
+  Workgroup,
+  Cluster,
+  Thread, // For CPU
+  Core,   // For CPU
+  Other
+};
+
+// An enum class to represent the unit of computation of an instruction
+enum class UnitOfComputation {
+  Scalar,
+  Vector, // 1-D vector
+  Matrix,
+  Tile,
+  Other
+};
+
+// A struct to represent basic information about an instruction
+// This struct is used to represent the information about an instruction in the
+// uArch. The information includes, in constructor order:
+// - the name of the instruction,
+// - the description of the instruction,
+// - the opcode,
+// - the functional unit,
+// - the type of the instruction,
+// - the scope of the instruction,
+// - the unit of computation
+// Name, description and opcode are strings; the remaining fields use the enums
+// declared above. For example:
+//   Instruction info("dpas", "Dot Product Accumulate", "0x83",
+//                    FunctionalUnit::Matrix, InstructionType::SIMD,
+//                    InstructionScope::Subgroup, UnitOfComputation::Tile);
+
+// The primary purpose of Instruction struct is to provide a generic way to
+// represent information about an instruction and to use this information to
+// generate the uArch. Specific instruction in a uArch can inherit from this
+// struct and add more fields as needed
+
+struct Instruction {
+  std::string name;
+  std::string description;
+  std::string opcode;
+  FunctionalUnit functional_unit;
+  InstructionType type;
+  InstructionScope scope;
+  UnitOfComputation unit_of_computation;
+
+  // @TODO: Add more fields as needed
+  // std::string latency;
+  // std::string throughput;
+  // std::string pipeline;
+  // std::string resource;
+  // std::string comment;
+  Instruction(std::string name, std::string desc, std::string opcode,
+              FunctionalUnit fu, InstructionType itype, InstructionScope sc,
+              UnitOfComputation uoc)
+      : name(std::move(name)), description(std::move(desc)),
+        opcode(std::move(opcode)), functional_unit(fu), type(itype), scope(sc),
+        unit_of_computation(uoc) {}
+
+  virtual ~Instruction() = default;
+};
+
+// A struct to represent register file information
+struct RegisterFileInfo {
+  uint32_t size;                 // size per register in bits
+  std::vector<std::string> mode; // e.g., "small", "large" GRF modes
+  std::vector<uint32_t>
+      num_regs_per_thread_per_mode; // number of registers per thread per mode
+  uint32_t num_banks;
+  uint32_t bank_size;
+
+  // Constructor
+  RegisterFileInfo() = default;
+  RegisterFileInfo(uint32_t size, const std::vector<std::string> &mode,
+                   const std::vector<uint32_t> &numRegs, uint32_t num_banks,
+                   uint32_t bank_size)
+      : size(size), mode(mode), num_regs_per_thread_per_mode(numRegs),
+        num_banks(num_banks), bank_size(bank_size) {}
+};
+
+// A struct to represent cache information
+struct CacheInfo {
+  uint32_t size;
+  uint32_t line_size;
+  // At which component level the cache is shared
+  uArchHierarchyComponent component;
+  // uint32_t associativity;
+  // uint32_t num_banks;
+  // uint32_t bank_size;
+  // uint32_t num_ports;
+  // uint32_t port_width;
+  // uint32_t bank_conflicts;
+  // Constructor
+  CacheInfo(uint32_t size, uint32_t line_size,
+            const uArchHierarchyComponent &component)
+      : size(size), line_size(line_size), component(component) {}
+};
+
+// A struct to represent the uArch
+// This struct is used to represent the microarchitecture of a target device
+// The uArch includes:
+// - the name of the uArch,
+// - the description of the uArch,
+// - the hardware component hierarchy,
+// - the register file information, keyed by register file name,
+// - the cache information, one entry per cache level,
+// - the instructions supported by the uArch, keyed by instruction name,
+// - the restrictions on the uArch
+struct uArch {
+  std::string name; // similar to target triple
+  std::string description;
+  // Represent the whole uArch hierarchy
+  // For 2 stack Intel PVC it would look something like this:
+  // uArchHierarchy[0] = {thread, 0}
+  // uArchHierarchy[1] = {XeCore, 8}
+  // uArchHierarchy[2] = {XeSlice, 16}
+  // uArchHierarchy[3] = {XeStack, 4}
+  // uArchHierarchy[4] = {gpu, 2}
+  std::vector<uArchHierarchyComponent> uArch_hierarchy;
+  // Different kind of register file information (e.g., GRF, ARF, etc.)
+  std::map<std::string, RegisterFileInfo> register_file_info;
+  // Each level of cache is indexed lower to higher in the vector
+  // (e.g., L1 indexed at 0, L2 at 1 and so on) L1, L2, L3, etc.
+  std::vector<CacheInfo> cache_info;
+  std::map<std::string, std::shared_ptr<Instruction>> instructions;
+  // Stored as raw pointers.
+  // NOTE(review): nothing here shows who owns the pointees -- confirm.
+  std::vector<Restriction<> *> restrictions;
+
+  // Constructor
+  uArch() = default;
+  uArch(const std::string &name, const std::string &description,
+        const std::vector<uArchHierarchyComponent> &uArch_hierarchy = {},
+        const std::map<std::string, RegisterFileInfo> &register_file_info = {},
+        const std::vector<CacheInfo> &cache_info = {},
+        const std::map<std::string, std::shared_ptr<Instruction>>
+            &instructions = {},
+        const std::vector<Restriction<> *> &restrictions = {})
+      : name(name), description(description), uArch_hierarchy(uArch_hierarchy),
+        register_file_info(register_file_info), cache_info(cache_info),
+        instructions(instructions), restrictions(restrictions) {}
+};
+
+// A struct to represent shared memory information
+struct SharedMemory {
+  uint32_t size;      // in bytes
+  uint32_t alignment; // in bytes
+  // @TODO: Add more fields as needed
+  // uint32_t latency;
+  // uint32_t throughput;
+  // uint32_t bandwidth;
+  // uint32_t num_ports;
+  // uint32_t port_width;
+  // uint32_t bank_size;
+  // uint32_t bank_conflicts;
+  // uint32_t num_banks;
+
+  // Constructor
+  SharedMemory(uint32_t size, uint32_t alignment)
+      : size(size), alignment(alignment) {}
+};
+
+// For future use case in Xe4+
+
+// struct EUInfo {
+//   uint32_t num_eu_threads;
+//   SharedMemory shared_memory;
+// };
+
+//   uint32_t num_simd_units;
+//   uint32_t num_spus;
+//   uint32_t num_smt;
+//   uint32_t num_hardware_threads;
+//   uint32_t num_threads_per_spu;
+//   uint32_t num_threads_per_simd_unit;
+//   uint32_t num_threads_per_hardware_thread;
+//   uint32_t num_threads_per_smt;
+//   SharedMemory shared_memory;
+// };
+
+// A struct to represent a GPU uArch
+// This struct is used to represent the GPU microarchitecture of a target device
+// struct GPUuArch : public uArch {
+//   uint32_t num_compute_units;
+//   uint32_t num_vector_units;
+//   uint32_t num_scalar_units;
+//   uint32_t num_tensor_units;
+//   uint32_t num_matrix_units;
+//   SharedMemory shared_memory;
+// };
+
+// Create a TileLikeOp Interface
+struct TileOpInterface {
+  // Get the supported tiles for the specific data type.
+  // Can provide load/store/prefetch ops supported tile sizes for a specific
+  // uarch
+  virtual std::vector<std::vector<uint32_t>>
+  getSupportedTiles(mlir::Type type) = 0;
+
+  // Validate the tile ops restrictions
+  // @param tile, tile to load/store/prefetch
+  // @param surface, surface to load/store/prefetch data from
+  // @param dataType, data type of the data
+  // @param surface_pitch, surface pitch
+  // @param array_len, array length
+  virtual bool validate(std::vector<uint32_t> tile,
+                        std::vector<uint32_t> surface, mlir::Type dataType,
+                        uint32_t surface_pitch, uint32_t array_len = 1) = 0;
+  virtual ~TileOpInterface() = default;
+};
+
+enum class MatrixType { MatrixA, MatrixB, MatrixC, MatrixD };
+// Queries a matrix multiply-accumulate instruction has to answer: which
+// element type combinations it accepts and which M/K/N values and operand
+// shapes it supports for an element type.
+struct MatrixOpInterface {
+  virtual bool checkSupportedMMATypes(mlir::Type AType, mlir::Type BType,
+                                      mlir::Type CType, mlir::Type DType) = 0;
+  virtual std::vector<uint32_t> getSupportedM(mlir::Type type) = 0;
+  virtual std::vector<uint32_t> getSupportedK(mlir::Type type) = 0;
+  virtual std::vector<uint32_t> getSupportedN(mlir::Type type) = 0;
+  virtual std::vector<std::pair<uint32_t, uint32_t>>
+  getSupportedMatrix(mlir::Type type, MatrixType matrixType) = 0;
+
+  virtual ~MatrixOpInterface() = default;
+};
+
+// Process-wide registry mapping a device name to its uArch description.
+// Writers take the mutex exclusively, readers take it shared.
+struct uArchMap {
+public:
+  // Singleton instance
+  static uArchMap &instance() {
+    static uArchMap instance;
+    return instance;
+  }
+
+  // Insert or update a key-value pair. An existing entry for `key` is
+  // replaced: std::map::emplace leaves an existing entry untouched, which
+  // contradicted the insert-or-update contract stated here. Callers that
+  // already hold the previous std::shared_ptr keep their object alive.
+  void insert(const std::string &key, std::shared_ptr<uArch> value) {
+    std::unique_lock lock(mutex_);
+    map_.insert_or_assign(key, std::move(value));
+  }
+
+  // Get a value by key (concurrent safe read); nullptr when `key` is absent
+  std::shared_ptr<uArch> get(const std::string &key) const {
+    std::shared_lock lock(mutex_);
+    auto it = map_.find(key);
+    if (it != map_.end())
+      return it->second;
+    return nullptr;
+  }
+
+  // Check if a key exists
+  bool contains(const std::string &key) const {
+    std::shared_lock lock(mutex_);
+    return map_.find(key) != map_.end();
+  }
+
+  // Remove a key; returns true when an entry was removed
+  bool erase(const std::string &key) {
+    std::unique_lock lock(mutex_);
+    return map_.erase(key) > 0;
+  }
+
+private:
+  uArchMap() = default;
+  uArchMap(const uArchMap &) = delete;
+  uArchMap &operator=(const uArchMap &) = delete;
+
+  mutable std::shared_mutex mutex_;
+  std::map<std::string, std::shared_ptr<uArch>> map_;
+};
+
+} // namespace uArch
+} // namespace xegpu
+} // namespace mlir
+
+#endif // MLIR_DIALECT_XEGPU_UTILS_UARCH_H
diff --git a/mlir/lib/Dialect/XeGPU/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/CMakeLists.txt
index 31167e6af908b..9079df050ab2b 100644
--- a/mlir/lib/Dialect/XeGPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_subdirectory(IR)
 add_subdirectory(Transforms)
+add_subdirectory(uArch)
 add_subdirectory(Utils)
diff --git a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
index 242a97ccfdf6d..5393b9b7b1c6f 100644
--- a/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/IR/CMakeLists.txt
@@ -15,6 +15,7 @@ add_mlir_dialect_library(MLIRXeGPUDialect
   MLIRArithUtils
   MLIRDialectUtils
   MLIRIR
+  MLIRXeGPUuArch
   MLIRViewLikeInterface
   MLIRVectorDialect
   )
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
index 642c393cbc2c8..4dd3291a3f541 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp
@@ -9,6 +9,7 @@
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPUTargetInfo.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/DialectImplementation.h"
 #include "llvm/ADT/TypeSwitch.h"
@@ -31,6 +32,14 @@ void XeGPUDialect::initialize() {
 #define GET_ATTRDEF_LIST
 #include <mlir/Dialect/XeGPU/IR/XeGPUAttrs.cpp.inc>
       >();
+
+  // Populate the uArchMap with the supported target devices
+  auto pvcuArch =
+      std::make_shared<mlir::xegpu::uArch::Xe2Plus::PVCuArch::PVCuArch>();
+  mlir::xegpu::uArch::uArchMap::instance().insert("pvc", pvcuArch);
+  auto bmguArch =
+      std::make_shared<mlir::xegpu::uArch::Xe2Plus::BMGuArch::BMGuArch>();
+  mlir::xegpu::uArch::uArchMap::instance().insert("bmg", bmguArch);
 }
 
 // Checks if
the given shape can be evenly distributed based on the layout diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index ef7cd1424e7a4..cff23a3a2c7fe 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -7,9 +7,11 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Arith/Utils/Utils.h" +#include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h" #include "mlir/IR/Builders.h" #include "mlir/IR/TypeUtilities.h" @@ -575,6 +577,45 @@ LogicalResult DpasOp::verify() { if (getAcc() && getAcc().getType() != getResultType()) return emitOpError("Expecting the acc type to be the same as result."); + // @uArch: Check if the types are supported for DPAS. + Operation *op = getOperation(); + auto moduleOp = op->getParentOfType(); + if (!moduleOp) + llvm::errs() << "No parent module op.\n"; + + // It target device info is not attched, skip the target-specific checks + auto targetDeviceNameAttr = dlti::query(moduleOp, {"GPU", "name"}); + if (failed(targetDeviceNameAttr)) + llvm::errs() + << "No target device found, skipping target-specific verification\n"; + + // Potential usage of uArch in verification. + if (succeeded(targetDeviceNameAttr)) { + auto targetDeviceNameStr = + llvm::dyn_cast(targetDeviceNameAttr.value()).str(); + auto targetDeviceArch = + mlir::xegpu::uArch::uArchMap::instance().get(targetDeviceNameStr); + if (targetDeviceArch) { + // @TODO: We should keep the name of the Instructions in one place, since + // we use the name of the instruction to find the instruction, it should + // be standardized and kept for users to access. 
+ auto it = targetDeviceArch->instructions.find("dpas"); + if (it != targetDeviceArch->instructions.end()) { + std::shared_ptr instr = it->second; + auto matrixOp = + std::dynamic_pointer_cast( + instr); + if (matrixOp) { + if (!matrixOp->checkSupportedMMATypes( + getLhsType().getElementType(), getRhsType().getElementType(), + getResultType().getElementType(), + getResultType().getElementType())) + return emitOpError("Unsupported DPAS types."); + } + } + } + } + // SIMT code: the size of the B operand has to be a multiple of 32 bits. // It skips the semantic check since lack of architecture information. // Users need to ensure the correctness. diff --git a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt index 9c178d1d85642..c88c43aa43941 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/XeGPU/Transforms/CMakeLists.txt @@ -1,4 +1,5 @@ add_mlir_dialect_library(MLIRXeGPUTransforms + XeGPUAttachTargetDevice.cpp XeGPUBlocking.cpp XeGPUFoldAliasOps.cpp XeGPUSubgroupDistribute.cpp @@ -22,6 +23,7 @@ add_mlir_dialect_library(MLIRXeGPUTransforms MLIRTransforms MLIRGPUDialect MLIRXeGPUUtils + MLIRXeGPUuArch MLIRGPUUtils MLIRVectorTransforms ) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUAttachTargetDevice.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUAttachTargetDevice.cpp new file mode 100644 index 0000000000000..ea1e24b8e28d2 --- /dev/null +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUAttachTargetDevice.cpp @@ -0,0 +1,81 @@ +//===-- XeGPUAttachTargetDevice.cpp ---- XeGPU Attach Target Device Pass --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/DLTI/DLTI.h"
+#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h"
+#include "mlir/Dialect/XeGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Interfaces/DataLayoutInterfaces.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace mlir {
+namespace xegpu {
+#define GEN_PASS_DEF_XEGPUATTACHTARGETDEVICE
+#include "mlir/Dialect/XeGPU/Transforms/Passes.h.inc"
+} // namespace xegpu
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+struct XeGPUAttachTargetDevicePass final
+    : public xegpu::impl::XeGPUAttachTargetDeviceBase<
+          XeGPUAttachTargetDevicePass> {
+  using Base::Base;
+  void runOnOperation() override;
+};
+} // namespace
+
+void XeGPUAttachTargetDevicePass::runOnOperation() {
+  ModuleOp module = getOperation();
+  MLIRContext *ctx = module.getContext();
+  Builder b(ctx);
+
+  // Innermost entry: "name" = <device-name>; `deviceName` is the pass option
+  // `device-name`.
+  auto nameEntry = DataLayoutEntryAttr::get(b.getStringAttr("name"),
+                                            b.getStringAttr(deviceName));
+
+  // Wrap it in a device spec: #dlti.target_device_spec<"name" = ...>.
+  TargetDeviceSpecInterface deviceSpec =
+      TargetDeviceSpecAttr::get(ctx, {nameEntry});
+
+  // Key the device spec by "GPU". The entry attribute was created right here,
+  // so the conversion to its interface cannot fail: use `cast`, which asserts,
+  // rather than `dyn_cast`, which would hand a null interface to the next step.
+  auto sysSpecIface = llvm::cast<DataLayoutEntryInterface>(
+      DataLayoutEntryAttr::get(b.getStringAttr("GPU"), deviceSpec));
+
+  // Build the target system spec and attach it to the module.
+  auto systemSpec = TargetSystemSpecAttr::get(
+      ctx, ArrayRef<DataLayoutEntryInterface>{sysSpecIface});
+  module->setAttr("dlti.target_system_spec", systemSpec);
+
+  // Register the uArch description for the device. The XeGPU dialect
+  // initializer registers the same two names, so this only matters when the
+  // pass is used without relying on that.
+  auto &archs = mlir::xegpu::uArch::uArchMap::instance();
+  if (deviceName == "pvc") {
+    archs.insert(
+        deviceName,
+        std::make_shared<mlir::xegpu::uArch::Xe2Plus::PVCuArch::PVCuArch>());
+  } else if (deviceName == "bmg") {
+    archs.insert(
+        deviceName,
+        std::make_shared<mlir::xegpu::uArch::Xe2Plus::BMGuArch::BMGuArch>());
+  } else if (!archs.contains(deviceName)) {
+    // The attribute is still attached, but nothing can be verified against an
+    // unknown device; report it instead of staying silent.
+    module.emitWarning() << "no uArch registered for target device '"
+                         << deviceName.getValue() << "'";
+  }
+}
diff --git a/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
index 98e84a4420722..8fa908087c0ae 100644
--- a/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
+++ b/mlir/lib/Dialect/XeGPU/Utils/CMakeLists.txt
@@ -8,4 +8,5 @@ add_mlir_dialect_library(MLIRXeGPUUtils
   MLIRIR
   MLIRSCFTransforms
   MLIRXeGPUDialect
-  )
+)
+
diff --git a/mlir/lib/Dialect/XeGPU/uArch/CMakeLists.txt b/mlir/lib/Dialect/XeGPU/uArch/CMakeLists.txt
new file mode 100644
index 0000000000000..c7f691cb6dda7
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/uArch/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_mlir_dialect_library(MLIRXeGPUuArch
+  IntelGpuXe2.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/XeGPU/uArch
+
+  LINK_LIBS PUBLIC
+  MLIRIR
+  MLIRDialectUtils
+)
+
diff --git a/mlir/lib/Dialect/XeGPU/uArch/IntelGpuXe2.cpp b/mlir/lib/Dialect/XeGPU/uArch/IntelGpuXe2.cpp
new file mode 100644
index 0000000000000..13352713803cb
--- /dev/null
+++ b/mlir/lib/Dialect/XeGPU/uArch/IntelGpuXe2.cpp
@@ -0,0 +1,153 @@
+//===--- IntelGpuXe2.cpp - Xe2+ uArch instruction queries -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/XeGPU/uArch/IntelGpuXe2.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace mlir::xegpu::uArch;
+using namespace mlir::xegpu::uArch::Xe2Plus;
+
+namespace mlir {
+namespace xegpu {
+namespace uArch {
+namespace Xe2Plus {
+
+/// Prints the rejected element-type combination to stderr together with
+/// `supportedRow`, the Dst | Acc | A | B row listing what the operand family
+/// being checked accepts.
+static void reportUnsupportedDPASTypes(const char *supportedRow,
+                                       mlir::Type AType, mlir::Type BType,
+                                       mlir::Type CType, mlir::Type DType) {
+  llvm::errs()
+      << "Unsupported dpas combinations of Dst, Acc, A and B matrices, "
+      << "Supported types are:\n"
+      << " Dst | Acc | A | B \n"
+      << supportedRow << "\n"
+      << "AType: " << AType << " BType: " << BType << " CType: " << CType
+      << " DType: " << DType << "\n";
+}
+
+/// Returns true when the A/B/Acc/Dst element types form a combination DPAS
+/// supports. `CType` may be null (no accumulator); on rejection the supported
+/// combinations are printed to stderr.
+bool DPASInstruction::checkSupportedMMATypes(mlir::Type AType, mlir::Type BType,
+                                             mlir::Type CType,
+                                             mlir::Type DType) {
+  if (AType.isF16() || BType.isF16()) {
+    if (AType != BType || (CType && !CType.isF32() && !CType.isF16()) ||
+        (!DType.isF32() && !DType.isF16())) {
+      reportUnsupportedDPASTypes(" f, hf | f, hf | hf | hf ", AType, BType,
+                                 CType, DType);
+      return false;
+    }
+  } else if (AType.isBF16() || BType.isBF16()) {
+    if (AType != BType || (CType && !CType.isF32() && !CType.isBF16()) ||
+        (!DType.isF32() && !DType.isBF16())) {
+      reportUnsupportedDPASTypes(" f, bf | f, bf | bf | bf ", AType, BType,
+                                 CType, DType);
+      return false;
+    }
+  } else if (AType.isTF32() || BType.isTF32()) {
+    // Acc (when present) and Dst must both be f32. The Acc test used to be
+    // and-ed with a Dst test, so a non-f32 accumulator was never rejected.
+    if (AType != BType || (CType && !CType.isF32()) || !DType.isF32()) {
+      reportUnsupportedDPASTypes(" f | f | tf32 | tf32 ", AType, BType, CType,
+                                 DType);
+      return false;
+    }
+  } else {
+    auto isDPASInteger = [](mlir::Type type) {
+      return type.isInteger(2) || type.isInteger(4) || type.isInteger(8);
+    };
+    // Both A and B have to be 2/4/8-bit integers. Rejecting only when neither
+    // is one would accept mixes such as an i8 A with an f32 B.
+    // NOTE(review): Dst/Acc are not checked on this path although the table
+    // lists "ud, d" -- confirm whether they should be.
+    if (!isDPASInteger(AType) || !isDPASInteger(BType)) {
+      reportUnsupportedDPASTypes(
+          " ud, d | ud,d | ub,b,u4,s4,u2,s2 | ub,b,u4,s4,u2,s2 ", AType, BType,
+          CType, DType);
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/// Supported values of the M dimension; the same for every `type`.
+std::vector<uint32_t> DPASInstruction::getSupportedM(mlir::Type type) {
+  return {1, 2, 3, 4, 5, 6, 7, 8};
+}
+
+/// Supported values of the K dimension, selected by the bit width of `type`.
+/// Returns an empty list for types that are not int/float and for widths
+/// other than 2, 4, 8, 16 and 32, instead of relying on assert and
+/// llvm_unreachable, which are not checked when assertions are compiled out.
+/// NOTE(review): tf32 is accepted by checkSupportedMMATypes but presumably
+/// does not report one of these widths -- confirm and add its K.
+std::vector<uint32_t> DPASInstruction::getSupportedK(mlir::Type type) {
+  if (!type.isIntOrFloat())
+    return {};
+  switch (type.getIntOrFloatBitWidth()) {
+  case 2:
+  case 4:
+    return {64};
+  case 8:
+    return {32};
+  case 16:
+    return {16};
+  case 32:
+    return {8};
+  default:
+    return {};
+  }
+}
+
+/// Supported values of the N dimension; the same for every `type`.
+std::vector<uint32_t> DPASInstruction::getSupportedN(mlir::Type type) {
+  return {16};
+}
+
+/// Supported shapes for one DPAS operand: every (M, K) pair for A, (K, N) for
+/// B and (M, N) for C and D, built from the supported dimension values.
+std::vector<std::pair<uint32_t, uint32_t>>
+DPASInstruction::getSupportedMatrix(mlir::Type type, MatrixType matrixType) {
+  // All (x, y) pairs with x from `a` and y from `b`, `a`-major.
+  auto combineVectors = [](const std::vector<uint32_t> &a,
+                           const std::vector<uint32_t> &b) {
+    std::vector<std::pair<uint32_t, uint32_t>> result;
+    result.reserve(a.size() * b.size());
+    for (uint32_t x : a)
+      for (uint32_t y : b)
+        result.emplace_back(x, y);
+    return result;
+  };
+
+  auto M = getSupportedM(type);
+  auto K = getSupportedK(type);
+  auto N = getSupportedN(type);
+
+  switch (matrixType) {
+  case MatrixType::MatrixA:
+    return combineVectors(M, K);
+  case MatrixType::MatrixB:
+    return combineVectors(K, N);
+  case MatrixType::MatrixC:
+  case MatrixType::MatrixD:
+    return combineVectors(M, N);
+  }
+  // Only reached for a value outside the enumerators above; yields the same
+  // empty result the switch previously fell back to.
+  return {};
+}
+
+} // namespace Xe2Plus
+} // namespace uArch
+} // namespace xegpu
+} // namespace mlir
diff --git a/mlir/test/Dialect/XeGPU/attach-target-device.mlir b/mlir/test/Dialect/XeGPU/attach-target-device.mlir
new file mode 100644
index 0000000000000..df4553c2c2726
--- /dev/null
+++ b/mlir/test/Dialect/XeGPU/attach-target-device.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-opt --xegpu-attach-target-device="device-name=pvc" %s -split-input-file -verify-diagnostics + +// module @valid_dpas attributes {gpu.container_module} { +// gpu.module @valid_dpas attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + +// gpu.func @valid_dpas(%a: memref<24x32xf16>, %b: memref<32x24xf16>) { +// %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout> +// %load_a = xegpu.load_nd %tdesc_a : !xegpu.tensor_desc<24x32xf16, #xegpu.layout> +// -> vector<24x32xf16> +// %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<32x24xf16> -> !xegpu.tensor_desc<32x24xf16, #xegpu.layout> +// %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<32x24xf16, #xegpu.layout> -> vector<32x24xf16> + +// %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout} : vector<24x32xf16>, vector<32x24xf16> -> vector<24x24xf16> +// gpu.return +// } +// } +// } + + +// RUN: mlir-opt --xegpu-attach-target-device="device-name=pvc" %s | FileCheck %s + +// CHECK: module @valid_dpas +// CHECK-SAME: attributes {dlti.target_system_spec = #dlti.target_system_spec<"GPU" = #dlti.target_device_spec<"name" = "pvc">>, gpu.container_module} +module @valid_dpas attributes {gpu.container_module} { + // CHECK: gpu.module @valid_dpas + gpu.module @valid_dpas attributes {spirv.target_env = #spirv.target_env<#spirv.vce,api = OpenCL,#spirv.resource_limits<>>} { + // CHECK: gpu.func @valid_dpas + gpu.func @valid_dpas(%a: memref<24x32xf16>, %b: memref<32x24xf16>) { + // CHECK: %[[TDESC_A:.*]] = xegpu.create_nd_tdesc %[[ARG0:.*]]{{\[}}0, 0] + // CHECK-SAME: memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16 + %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<24x32xf16, #xegpu.layout> + + // CHECK: %[[LOAD_A:.*]] = xegpu.load_nd %[[TDESC_A]] + // CHECK-SAME: -> vector<24x32xf16> + %load_a = xegpu.load_nd %tdesc_a : !xegpu.tensor_desc<24x32xf16, #xegpu.layout> -> vector<24x32xf16> + + // 
CHECK: %[[TDESC_B:.*]] = xegpu.create_nd_tdesc %[[ARG1:.*]]{{\[}}0, 0] + // CHECK-SAME: memref<32x24xf16> -> !xegpu.tensor_desc<32x24xf16 + %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<32x24xf16> -> !xegpu.tensor_desc<32x24xf16, #xegpu.layout> + + // CHECK: %[[LOAD_B:.*]] = xegpu.load_nd %[[TDESC_B]] + // CHECK-SAME: -> vector<32x24xf16> + %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<32x24xf16, #xegpu.layout> -> vector<32x24xf16> + + // CHECK: %[[DPAS:.*]] = xegpu.dpas %[[LOAD_A]], %[[LOAD_B]] + // CHECK-SAME: layout_result_0 = #xegpu.layout + // CHECK-SAME: : vector<24x32xf16>, vector<32x24xf16> -> vector<24x24xf16> + %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout} : vector<24x32xf16>, vector<32x24xf16> -> vector<24x24xf16> + + // CHECK: gpu.return + gpu.return + } + } +} diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 83a98ab0622b7..c862667b5875c 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -643,3 +643,22 @@ func.func @tensor_desc_invalid_sg_data(%src: ui64, %offsets: vector<16xindex>) { #xegpu.layout> return } + + +// ----- +module @invalid_dpas attributes {dlti.target_system_spec = #dlti.target_system_spec<"GPU" = #dlti.target_device_spec<"name" = "pvc">>, gpu.container_module} { + gpu.module @invalid_dpas attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + + gpu.func @invalid_dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { + %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<32x24xf32> -> !xegpu.tensor_desc<32x24xf32, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<32x24xf32, #xegpu.layout> -> vector<32x24xf32> + // expected-error@+1 
{{Unsupported DPAS types.}} + %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout} : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32> + gpu.return + } + } +} + diff --git a/mlir/test/Dialect/XeGPU/uarch-info.mlir b/mlir/test/Dialect/XeGPU/uarch-info.mlir new file mode 100644 index 0000000000000..351c96f8169c5 --- /dev/null +++ b/mlir/test/Dialect/XeGPU/uarch-info.mlir @@ -0,0 +1,14 @@ +module @eltwise_add attributes {gpu.container_module} { + gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + + gpu.func @dpas(%a: memref<24x32xf32>, %b: memref<32x24xf32>) { + %tdesc_a = xegpu.create_nd_tdesc %a[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + %load_a = xegpu.load_nd %tdesc_a : !xegpu.tensor_desc<24x32xf32, #xegpu.layout> + -> vector<24x32xf32> + %tdesc_b = xegpu.create_nd_tdesc %b[0, 0] : memref<32x24xf32> -> !xegpu.tensor_desc<32x24xf32, #xegpu.layout> + %load_b = xegpu.load_nd %tdesc_b : !xegpu.tensor_desc<32x24xf32, #xegpu.layout> -> vector<32x24xf32> + %dpas = xegpu.dpas %load_a, %load_b {layout_result_0 = #xegpu.layout} : vector<24x32xf32>, vector<32x24xf32> -> vector<24x24xf32> + gpu.return + } + } +}