Skip to content

Commit 9e8242f

Browse files
committed
[NVPTX] Add NVPTXIncreaseAlignmentPass to improve vectorization
1 parent 35f6d91 commit 9e8242f

File tree

6 files changed

+224
-0
lines changed

6 files changed

+224
-0
lines changed

llvm/lib/Target/NVPTX/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ set(NVPTXCodeGen_sources
2626
NVPTXISelLowering.cpp
2727
NVPTXLowerAggrCopies.cpp
2828
NVPTXLowerAlloca.cpp
29+
NVPTXIncreaseAlignment.cpp
2930
NVPTXLowerArgs.cpp
3031
NVPTXLowerUnreachable.cpp
3132
NVPTXMCExpr.cpp

llvm/lib/Target/NVPTX/NVPTX.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ FunctionPass *createNVPTXTagInvariantLoadsPass();
5555
MachineFunctionPass *createNVPTXPeephole();
5656
MachineFunctionPass *createNVPTXProxyRegErasurePass();
5757
MachineFunctionPass *createNVPTXForwardParamsPass();
58+
FunctionPass *createNVPTXIncreaseLocalAlignmentPass();
5859

5960
void initializeNVVMReflectLegacyPassPass(PassRegistry &);
6061
void initializeGenericToNVVMLegacyPassPass(PassRegistry &);
@@ -76,6 +77,7 @@ void initializeNVPTXAAWrapperPassPass(PassRegistry &);
7677
void initializeNVPTXExternalAAWrapperPass(PassRegistry &);
7778
void initializeNVPTXPeepholePass(PassRegistry &);
7879
void initializeNVPTXTagInvariantLoadLegacyPassPass(PassRegistry &);
80+
void initializeNVPTXIncreaseLocalAlignmentLegacyPassPass(PassRegistry &);
7981

8082
struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> {
8183
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -111,6 +113,11 @@ struct NVPTXTagInvariantLoadsPass : PassInfoMixin<NVPTXTagInvariantLoadsPass> {
111113
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
112114
};
113115

116+
/// New pass manager function pass that raises the alignment of statically
/// sized array allocas. It is registered under the name
/// "nvptx-increase-local-alignment" in NVPTXPassRegistry.def.
struct NVPTXIncreaseLocalAlignmentPass
117+
: PassInfoMixin<NVPTXIncreaseLocalAlignmentPass> {
118+
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
119+
};
120+
114121
namespace NVPTX {
115122
enum DrvInterface {
116123
NVCL,
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
//===-- NVPTXIncreaseAlignment.cpp - Increase alignment for local arrays --===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// A simple pass that looks at local memory arrays that are statically
10+
// sized and sets an appropriate alignment for them. This enables vectorization
11+
// of loads/stores to these arrays if not explicitly specified by the client.
12+
//
13+
// TODO: Ideally we should do a bin-packing of local arrays to maximize
14+
// alignments while minimizing holes.
15+
//
16+
//===----------------------------------------------------------------------===//
17+
18+
#include "NVPTX.h"
19+
#include "llvm/IR/DataLayout.h"
20+
#include "llvm/IR/Instructions.h"
21+
#include "llvm/IR/Module.h"
22+
#include "llvm/Pass.h"
23+
#include "llvm/Support/CommandLine.h"
24+
#include "llvm/Support/MathExtras.h"
25+
26+
using namespace llvm;
27+
28+
static cl::opt<bool>
29+
MaxLocalArrayAlignment("nvptx-use-max-local-array-alignment",
30+
cl::init(false), cl::Hidden,
31+
cl::desc("Use maximum alignment for local memory"));
32+
33+
static constexpr Align MaxPTXArrayAlignment = Align::Constant<16>();
34+
35+
/// Get the maximum useful alignment for an array. This is more likely to
36+
/// produce holes in the local memory.
37+
///
38+
/// Choose an alignment large enough that the entire array could be loaded with
39+
/// a single vector load (if possible). Cap the alignment at MaxPTXArrayAlignment.
40+
static Align getAggressiveArrayAlignment(const unsigned ArraySize) {
41+
return std::min(MaxPTXArrayAlignment, Align(PowerOf2Ceil(ArraySize)));
42+
}
43+
44+
/// Get the alignment of arrays that reduces the chances of leaving holes when
45+
/// arrays are allocated within a contiguous memory buffer (like shared memory
46+
/// and stack). Holes are still possible before and after the array allocation.
47+
///
48+
/// Choose the largest alignment such that the array size is a multiple of the
49+
/// alignment. If all elements of the buffer are allocated in order of
50+
/// alignment (higher to lower) no holes will be left.
51+
static Align getConservativeArrayAlignment(const unsigned ArraySize) {
52+
return commonAlignment(MaxPTXArrayAlignment, ArraySize);
53+
}
54+
55+
/// Find a better alignment for local arrays
56+
static bool updateAllocaAlignment(const DataLayout &DL,
57+
AllocaInst *Alloca) {
58+
// Looking for statically sized local arrays
59+
if (!Alloca->isStaticAlloca())
60+
return false;
61+
62+
// For now, we only support array allocas
63+
if (!(Alloca->isArrayAllocation() || Alloca->getAllocatedType()->isArrayTy()))
64+
return false;
65+
66+
const auto ArraySize = Alloca->getAllocationSize(DL);
67+
if (!(ArraySize && ArraySize->isFixed()))
68+
return false;
69+
70+
const auto ArraySizeValue = ArraySize->getFixedValue();
71+
const Align PreferredAlignment =
72+
MaxLocalArrayAlignment ? getAggressiveArrayAlignment(ArraySizeValue)
73+
: getConservativeArrayAlignment(ArraySizeValue);
74+
75+
if (PreferredAlignment > Alloca->getAlign()) {
76+
Alloca->setAlignment(PreferredAlignment);
77+
return true;
78+
}
79+
80+
return false;
81+
}
82+
83+
/// Visit every alloca in the entry block of \p F and try to increase its
/// alignment. Shared implementation behind both pass manager wrappers.
///
/// \returns true if at least one alloca was modified.
static bool runSetLocalArrayAlignment(Function &F) {
  const DataLayout &Layout = F.getParent()->getDataLayout();

  bool AnyUpdated = false;
  for (Instruction &Inst : F.getEntryBlock()) {
    auto *AI = dyn_cast<AllocaInst>(&Inst);
    if (!AI)
      continue;
    if (updateAllocaAlignment(Layout, AI))
      AnyUpdated = true;
  }

  return AnyUpdated;
}
94+
95+
96+
namespace {
97+
struct NVPTXIncreaseLocalAlignmentLegacyPass : public FunctionPass {
98+
static char ID;
99+
NVPTXIncreaseLocalAlignmentLegacyPass() : FunctionPass(ID) {}
100+
101+
bool runOnFunction(Function &F) override;
102+
};
103+
} // namespace
104+
105+
char NVPTXIncreaseLocalAlignmentLegacyPass::ID = 0;
106+
INITIALIZE_PASS(NVPTXIncreaseLocalAlignmentLegacyPass, "nvptx-increase-local-alignment",
107+
"Increase alignment for statically sized alloca arrays", false,
108+
false)
109+
110+
/// Factory for the legacy pass manager: returns a newly allocated instance of
/// the local alignment pass.
FunctionPass *llvm::createNVPTXIncreaseLocalAlignmentPass() {
  FunctionPass *const Pass = new NVPTXIncreaseLocalAlignmentLegacyPass();
  return Pass;
}
113+
114+
/// Legacy pass manager entry point; forwards to the shared implementation and
/// reports whether \p F was modified.
bool NVPTXIncreaseLocalAlignmentLegacyPass::runOnFunction(Function &F) {
  const bool Modified = runSetLocalArrayAlignment(F);
  return Modified;
}
117+
118+
/// New pass manager entry point. Reports all analyses as preserved when
/// nothing changed; otherwise only the CFG analyses are kept.
PreservedAnalyses
NVPTXIncreaseLocalAlignmentPass::run(Function &F, FunctionAnalysisManager &AM) {
  if (!runSetLocalArrayAlignment(F))
    return PreservedAnalyses::all();

  // Alignments were updated, but the control flow graph is untouched.
  PreservedAnalyses Preserved;
  Preserved.preserveSet<CFGAnalyses>();
  return Preserved;
}

llvm/lib/Target/NVPTX/NVPTXPassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,5 @@ FUNCTION_PASS("nvvm-intr-range", NVVMIntrRangePass())
4040
FUNCTION_PASS("nvptx-copy-byval-args", NVPTXCopyByValArgsPass())
4141
FUNCTION_PASS("nvptx-lower-args", NVPTXLowerArgsPass(*this))
4242
FUNCTION_PASS("nvptx-tag-invariant-loads", NVPTXTagInvariantLoadsPass())
43+
FUNCTION_PASS("nvptx-increase-local-alignment", NVPTXIncreaseLocalAlignmentPass())
4344
#undef FUNCTION_PASS

llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,8 @@ void NVPTXPassConfig::addIRPasses() {
391391
// but EarlyCSE can do neither of them.
392392
if (getOptLevel() != CodeGenOptLevel::None) {
393393
addEarlyCSEOrGVNPass();
394+
// Increase alignment for local arrays to improve vectorization.
395+
addPass(createNVPTXIncreaseLocalAlignmentPass());
394396
if (!DisableLoadStoreVectorizer)
395397
addPass(createLoadStoreVectorizerPass());
396398
addPass(createSROAPass());
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -passes=nvptx-increase-local-alignment < %s | FileCheck %s --check-prefixes=COMMON,DEFAULT
3+
; RUN: opt -S -passes=nvptx-increase-local-alignment -nvptx-use-max-local-array-alignment < %s | FileCheck %s --check-prefixes=COMMON,MAX
4+
target triple = "nvptx64-nvidia-cuda"
5+
6+
define void @test1() {
7+
; COMMON-LABEL: define void @test1() {
8+
; COMMON-NEXT: [[A:%.*]] = alloca i8, align 1
9+
; COMMON-NEXT: ret void
10+
;
11+
%a = alloca i8, align 1
12+
ret void
13+
}
14+
15+
define void @test2() {
16+
; DEFAULT-LABEL: define void @test2() {
17+
; DEFAULT-NEXT: [[A:%.*]] = alloca [63 x i8], align 1
18+
; DEFAULT-NEXT: ret void
19+
;
20+
; MAX-LABEL: define void @test2() {
21+
; MAX-NEXT: [[A:%.*]] = alloca [63 x i8], align 16
22+
; MAX-NEXT: ret void
23+
;
24+
%a = alloca [63 x i8], align 1
25+
ret void
26+
}
27+
28+
define void @test3() {
29+
; COMMON-LABEL: define void @test3() {
30+
; COMMON-NEXT: [[A:%.*]] = alloca [64 x i8], align 16
31+
; COMMON-NEXT: ret void
32+
;
33+
%a = alloca [64 x i8], align 1
34+
ret void
35+
}
36+
37+
define void @test4() {
38+
; DEFAULT-LABEL: define void @test4() {
39+
; DEFAULT-NEXT: [[A:%.*]] = alloca i8, i32 63, align 1
40+
; DEFAULT-NEXT: ret void
41+
;
42+
; MAX-LABEL: define void @test4() {
43+
; MAX-NEXT: [[A:%.*]] = alloca i8, i32 63, align 16
44+
; MAX-NEXT: ret void
45+
;
46+
%a = alloca i8, i32 63, align 1
47+
ret void
48+
}
49+
50+
define void @test5() {
51+
; COMMON-LABEL: define void @test5() {
52+
; COMMON-NEXT: [[A:%.*]] = alloca i8, i32 64, align 16
53+
; COMMON-NEXT: ret void
54+
;
55+
%a = alloca i8, i32 64, align 1
56+
ret void
57+
}
58+
59+
define void @test6() {
60+
; COMMON-LABEL: define void @test6() {
61+
; COMMON-NEXT: [[A:%.*]] = alloca i8, align 32
62+
; COMMON-NEXT: ret void
63+
;
64+
%a = alloca i8, align 32
65+
ret void
66+
}
67+
68+
define void @test7() {
69+
; COMMON-LABEL: define void @test7() {
70+
; COMMON-NEXT: [[A:%.*]] = alloca i32, align 2
71+
; COMMON-NEXT: ret void
72+
;
73+
%a = alloca i32, align 2
74+
ret void
75+
}
76+
77+
define void @test8() {
78+
; COMMON-LABEL: define void @test8() {
79+
; COMMON-NEXT: [[A:%.*]] = alloca [2 x i32], align 8
80+
; COMMON-NEXT: ret void
81+
;
82+
%a = alloca [2 x i32], align 2
83+
ret void
84+
}
85+

0 commit comments

Comments
 (0)