-
Notifications
You must be signed in to change notification settings - Fork 13.5k
Open
Labels
A-LLVMArea: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.A-autovectorizationArea: Autovectorization, which can impact perf or code sizeArea: Autovectorization, which can impact perf or code sizeC-optimizationCategory: An issue highlighting optimization opportunities or PRs implementing suchCategory: An issue highlighting optimization opportunities or PRs implementing suchI-slowIssue: Problems and improvements with respect to performance of generated code.Issue: Problems and improvements with respect to performance of generated code.T-compilerRelevant to the compiler team, which will review and decide on the PR/issue.Relevant to the compiler team, which will review and decide on the PR/issue.
Description
Trying to work around #142519 (code stops auto-vectorizing after 1.86),
the following code ends up being slower than the currently scalar version of the loop when decoding in image-webp
.
Hyperlinked code that no longer vectorizes vs. this code
https://rust.godbolt.org/z/9cTGTrdcn
The previous code is an optimization of the naively written code; we're only concerned with 4-byte chunks.
use std::ops::Range;
#[inline(never)]
pub fn apply_predictor_transform_12(image_data: &mut [u8], range: Range<usize>, width: usize) {
let mut i = range.start;
while i < range.end {
image_data[i] = image_data[i].wrapping_add(clamp_add_subtract_full(
i16::from(image_data[i - 4]),
i16::from(image_data[i - width * 4]),
i16::from(image_data[i - width * 4 - 4]),
));
i += 1;
}
}
fn clamp_add_subtract_full(a: i16, b: i16, c: i16) -> u8 {
(a + b - c).max(0).min(255) as u8
}
There seem to be a lot of instructions generated before entering the loop body for pointer checks.
I'm wondering if what ends up in the LLVM IR blocks labeled as bb2.lr.ph
and vector.memcheck
can be optimized better from the Rust side or in LLVM.
C and Rust functions compared
https://rust.godbolt.org/z/EMP51Knsv
Rewritten as a C function
#include <stddef.h>
#include <stdint.h>
struct Range {
size_t start;
size_t end;
};
void apply_predictor_transform_12(uint8_t *image_data, struct Range range,
size_t width) {
for (size_t i = range.start; i < range.end; i++) {
int16_t result = (int16_t)image_data[i - 4] +
(int16_t)image_data[i - width * 4] -
(int16_t)image_data[i - width * 4 - 4];
if (result > 255) {
result = 255;
}
if (result < 0) {
result = 0;
}
image_data[i] += (uint8_t)result;
}
}
C LLVM IR snippet
for.body.lr.ph:
%mul = shl i64 %width, 2
%0 = sub nuw i64 %range.coerce1, %range.coerce0
%min.iters.check = icmp ult i64 %0, 16
br i1 %min.iters.check, label %for.body.preheader, label %vector.memcheck
vector.memcheck:
%scevgep = getelementptr i8, ptr %image_data, i64 %range.coerce0
%scevgep42 = getelementptr i8, ptr %image_data, i64 %range.coerce1
%1 = getelementptr i8, ptr %image_data, i64 %range.coerce0
%scevgep43 = getelementptr i8, ptr %1, i64 -4
%2 = getelementptr i8, ptr %image_data, i64 %range.coerce1
%scevgep44 = getelementptr i8, ptr %2, i64 -4
%3 = sub i64 %range.coerce0, %mul
%scevgep45 = getelementptr i8, ptr %image_data, i64 %3
%4 = sub i64 %range.coerce1, %mul
%scevgep46 = getelementptr i8, ptr %image_data, i64 %4
%5 = add i64 %range.coerce0, -4
%6 = sub i64 %5, %mul
%scevgep47 = getelementptr i8, ptr %image_data, i64 %6
%7 = add i64 %range.coerce1, -4
%8 = sub i64 %7, %mul
%scevgep48 = getelementptr i8, ptr %image_data, i64 %8
%bound0 = icmp ult ptr %scevgep, %scevgep44
%bound1 = icmp ult ptr %scevgep43, %scevgep42
%found.conflict = and i1 %bound0, %bound1
%bound049 = icmp ult ptr %scevgep, %scevgep46
%bound150 = icmp ult ptr %scevgep45, %scevgep42
%found.conflict51 = and i1 %bound049, %bound150
%conflict.rdx = or i1 %found.conflict, %found.conflict51
%bound052 = icmp ult ptr %scevgep, %scevgep48
%bound153 = icmp ult ptr %scevgep47, %scevgep42
%found.conflict54 = and i1 %bound052, %bound153
%conflict.rdx55 = or i1 %conflict.rdx, %found.conflict54
br i1 %conflict.rdx55, label %for.body.preheader, label %vector.ph
Rust LLVM IR snippet
bb2.lr.ph:
%_23 = shl i64 %width, 2
%0 = add i64 %range.0, -4
%1 = sub i64 %0, %_23
%umax74 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %1)
%2 = add i64 %umax74, %_23
%3 = add i64 %2, 4
%4 = sub i64 %3, %range.0
%5 = sub i64 %range.0, %_23
%umax75 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %5)
%6 = add i64 %umax75, %_23
%7 = sub i64 %6, %range.0
%umin76 = tail call i64 @llvm.umin.i64(i64 %4, i64 %7)
%8 = xor i64 %range.0, -1
%9 = add i64 %range.1, %8
%umin77 = tail call i64 @llvm.umin.i64(i64 %umin76, i64 %9)
%10 = add i64 %range.0, -4
%umax78 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %10)
%11 = add i64 %umax78, 4
%12 = sub i64 %11, %range.0
%umin79 = tail call i64 @llvm.umin.i64(i64 %umin77, i64 %12)
%13 = tail call i64 @llvm.usub.sat.i64(i64 %image_data.1, i64 %range.0)
%umin81 = tail call i64 @llvm.umin.i64(i64 %umin79, i64 %13)
%14 = add i64 %umin81, 1
%min.iters.check = icmp ult i64 %14, 25
br i1 %min.iters.check, label %bb2.preheader, label %vector.memcheck
bb2.preheader:
%i.sroa.0.023.ph = phi i64 [ %range.0, %bb2.lr.ph ], [ %range.0, %vector.memcheck ], [ %39, %vector.body ]
br label %bb2
vector.memcheck:
%scevgep = getelementptr i8, ptr %image_data.0, i64 %range.0
%15 = add i64 %range.0, -4
%16 = sub i64 %15, %_23
%umax = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %16)
%17 = add i64 %umax, %_23
%18 = add i64 %17, 4
%19 = sub i64 %18, %range.0
%20 = sub i64 %range.0, %_23
%umax54 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %20)
%21 = add i64 %umax54, %_23
%22 = sub i64 %21, %range.0
%umin = tail call i64 @llvm.umin.i64(i64 %19, i64 %22)
%23 = xor i64 %range.0, -1
%24 = add i64 %range.1, %23
%umin55 = tail call i64 @llvm.umin.i64(i64 %umin, i64 %24)
%25 = add i64 %range.0, -4
%umax56 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %25)
%26 = add i64 %umax56, 4
%27 = sub i64 %26, %range.0
%umin57 = tail call i64 @llvm.umin.i64(i64 %umin55, i64 %27)
%28 = tail call i64 @llvm.usub.sat.i64(i64 %image_data.1, i64 %range.0)
%umin59 = tail call i64 @llvm.umin.i64(i64 %umin57, i64 %28)
%29 = add i64 %range.0, %umin59
%30 = getelementptr i8, ptr %image_data.0, i64 %29
%scevgep60 = getelementptr i8, ptr %30, i64 1
%scevgep61 = getelementptr i8, ptr %image_data.0, i64 %25
%31 = getelementptr i8, ptr %image_data.0, i64 %29
%scevgep62 = getelementptr i8, ptr %31, i64 -3
%scevgep63 = getelementptr i8, ptr %image_data.0, i64 %20
%32 = add i64 %range.0, %umin59
%33 = add i64 %32, 1
%34 = sub i64 %33, %_23
%scevgep64 = getelementptr i8, ptr %image_data.0, i64 %34
%scevgep65 = getelementptr i8, ptr %image_data.0, i64 %16
%35 = add i64 %32, -3
%36 = sub i64 %35, %_23
%scevgep66 = getelementptr i8, ptr %image_data.0, i64 %36
%bound0 = icmp ult ptr %scevgep, %scevgep62
%bound1 = icmp ult ptr %scevgep61, %scevgep60
%found.conflict = and i1 %bound0, %bound1
%bound067 = icmp ult ptr %scevgep, %scevgep64
%bound168 = icmp ult ptr %scevgep63, %scevgep60
%found.conflict69 = and i1 %bound067, %bound168
%conflict.rdx = or i1 %found.conflict, %found.conflict69
%bound070 = icmp ult ptr %scevgep, %scevgep66
%bound171 = icmp ult ptr %scevgep65, %scevgep60
%found.conflict72 = and i1 %bound070, %bound171
%conflict.rdx73 = or i1 %conflict.rdx, %found.conflict72
br i1 %conflict.rdx73, label %bb2.preheader, label %vector.ph
Metadata
Metadata
Assignees
Labels
A-LLVMArea: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.A-autovectorizationArea: Autovectorization, which can impact perf or code sizeArea: Autovectorization, which can impact perf or code sizeC-optimizationCategory: An issue highlighting optimization opportunities or PRs implementing suchCategory: An issue highlighting optimization opportunities or PRs implementing suchI-slowIssue: Problems and improvements with respect to performance of generated code.Issue: Problems and improvements with respect to performance of generated code.T-compilerRelevant to the compiler team, which will review and decide on the PR/issue.Relevant to the compiler team, which will review and decide on the PR/issue.