Skip to content

Possibly missed optimizations with strided slice access autovectorization #144005

@okaneco

Description

@okaneco

Trying to work around #142519 (code stops auto-vectorizing after 1.86),
the following code ends up being slower than the current scalar version of the loop when decoding in image-webp.

Godbolt comparison of the code that no longer vectorizes vs. this code:
https://rust.godbolt.org/z/9cTGTrdcn

The previous code is an optimization of the naively written code; we're only concerned with 4-byte chunks.

use std::ops::Range;

/// WebP predictor transform #12 ("ClampAddSubtractFull"): for every byte in
/// `range`, add to the stored residual a predictor built from the left, top,
/// and top-left neighbours (presumably 4-byte RGBA pixels, rows `width * 4`
/// bytes wide — TODO confirm against the image-webp caller).
///
/// NOTE(review): the index arithmetic `i - 4`, `i - width * 4`, and
/// `i - width * 4 - 4` underflows (panics) unless
/// `range.start >= width * 4 + 4`; the caller must guarantee this.
#[inline(never)]
pub fn apply_predictor_transform_12(image_data: &mut [u8], range: Range<usize>, width: usize) {
    // Deliberately a manual `while` loop with indexing: this exact shape is
    // the minimal reproduction of the auto-vectorization issue being reported.
    let mut i = range.start;
    while i < range.end {
        image_data[i] = image_data[i].wrapping_add(clamp_add_subtract_full(
            // left neighbour: same channel of the previous pixel
            i16::from(image_data[i - 4]),
            // top neighbour: same channel, previous row
            i16::from(image_data[i - width * 4]),
            // top-left neighbour
            i16::from(image_data[i - width * 4 - 4]),
        ));
        i += 1;
    }
}

/// Core of the clamp-add-subtract-full predictor: computes `a + b - c` in
/// `i16` and saturates the result into the `u8` range `0..=255`.
fn clamp_add_subtract_full(a: i16, b: i16, c: i16) -> u8 {
    // `clamp(0, 255)` is equivalent to the `.max(0).min(255)` chain.
    let predicted = a + b - c;
    predicted.clamp(0, 255) as u8
}

There seem to be a lot of instructions generated before entering the loop body for pointer checks.

I'm wondering if what ends up in the LLVM IR blocks labeled as bb2.lr.ph and vector.memcheck can be optimized better from the Rust side or in LLVM.

C and Rust functions compared
https://rust.godbolt.org/z/EMP51Knsv

Rewritten as a C function
#include <stddef.h>
#include <stdint.h>

/* Half-open index range [start, end) into the image buffer,
 * mirroring Rust's `std::ops::Range<usize>`. */
struct Range {
  size_t start;
  size_t end;
};

/* C rewrite of the Rust loop, kept in the same shape for codegen comparison:
 * for each byte in [range.start, range.end), add the clamped predictor
 * (left + top - top_left) to the stored value.
 *
 * NOTE(review): assumes range.start >= width * 4 + 4; otherwise the unsigned
 * index arithmetic (i - 4, etc.) wraps around. */
void apply_predictor_transform_12(uint8_t *image_data, struct Range range,
                                  size_t width) {
  for (size_t i = range.start; i < range.end; i++) {
    /* left + top - top_left, widened to int16_t so the sum cannot wrap in
     * 8 bits before clamping */
    int16_t result = (int16_t)image_data[i - 4] +
                     (int16_t)image_data[i - width * 4] -
                     (int16_t)image_data[i - width * 4 - 4];

    /* saturate into the u8 range 0..255 */
    if (result > 255) {
      result = 255;
    }
    if (result < 0) {
      result = 0;
    }

    /* wrapping add is C's native unsigned behavior */
    image_data[i] += (uint8_t)result;
  }
}
C LLVM IR snippet
for.body.lr.ph:
  %mul = shl i64 %width, 2
  %0 = sub nuw i64 %range.coerce1, %range.coerce0
  %min.iters.check = icmp ult i64 %0, 16
  br i1 %min.iters.check, label %for.body.preheader, label %vector.memcheck

vector.memcheck:
  %scevgep = getelementptr i8, ptr %image_data, i64 %range.coerce0
  %scevgep42 = getelementptr i8, ptr %image_data, i64 %range.coerce1
  %1 = getelementptr i8, ptr %image_data, i64 %range.coerce0
  %scevgep43 = getelementptr i8, ptr %1, i64 -4
  %2 = getelementptr i8, ptr %image_data, i64 %range.coerce1
  %scevgep44 = getelementptr i8, ptr %2, i64 -4
  %3 = sub i64 %range.coerce0, %mul
  %scevgep45 = getelementptr i8, ptr %image_data, i64 %3
  %4 = sub i64 %range.coerce1, %mul
  %scevgep46 = getelementptr i8, ptr %image_data, i64 %4
  %5 = add i64 %range.coerce0, -4
  %6 = sub i64 %5, %mul
  %scevgep47 = getelementptr i8, ptr %image_data, i64 %6
  %7 = add i64 %range.coerce1, -4
  %8 = sub i64 %7, %mul
  %scevgep48 = getelementptr i8, ptr %image_data, i64 %8
  %bound0 = icmp ult ptr %scevgep, %scevgep44
  %bound1 = icmp ult ptr %scevgep43, %scevgep42
  %found.conflict = and i1 %bound0, %bound1
  %bound049 = icmp ult ptr %scevgep, %scevgep46
  %bound150 = icmp ult ptr %scevgep45, %scevgep42
  %found.conflict51 = and i1 %bound049, %bound150
  %conflict.rdx = or i1 %found.conflict, %found.conflict51
  %bound052 = icmp ult ptr %scevgep, %scevgep48
  %bound153 = icmp ult ptr %scevgep47, %scevgep42
  %found.conflict54 = and i1 %bound052, %bound153
  %conflict.rdx55 = or i1 %conflict.rdx, %found.conflict54
  br i1 %conflict.rdx55, label %for.body.preheader, label %vector.ph
Rust LLVM IR snippet
bb2.lr.ph:
  %_23 = shl i64 %width, 2
  %0 = add i64 %range.0, -4
  %1 = sub i64 %0, %_23
  %umax74 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %1)
  %2 = add i64 %umax74, %_23
  %3 = add i64 %2, 4
  %4 = sub i64 %3, %range.0
  %5 = sub i64 %range.0, %_23
  %umax75 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %5)
  %6 = add i64 %umax75, %_23
  %7 = sub i64 %6, %range.0
  %umin76 = tail call i64 @llvm.umin.i64(i64 %4, i64 %7)
  %8 = xor i64 %range.0, -1
  %9 = add i64 %range.1, %8
  %umin77 = tail call i64 @llvm.umin.i64(i64 %umin76, i64 %9)
  %10 = add i64 %range.0, -4
  %umax78 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %10)
  %11 = add i64 %umax78, 4
  %12 = sub i64 %11, %range.0
  %umin79 = tail call i64 @llvm.umin.i64(i64 %umin77, i64 %12)
  %13 = tail call i64 @llvm.usub.sat.i64(i64 %image_data.1, i64 %range.0)
  %umin81 = tail call i64 @llvm.umin.i64(i64 %umin79, i64 %13)
  %14 = add i64 %umin81, 1
  %min.iters.check = icmp ult i64 %14, 25
  br i1 %min.iters.check, label %bb2.preheader, label %vector.memcheck

bb2.preheader:
  %i.sroa.0.023.ph = phi i64 [ %range.0, %bb2.lr.ph ], [ %range.0, %vector.memcheck ], [ %39, %vector.body ]
  br label %bb2

vector.memcheck:
  %scevgep = getelementptr i8, ptr %image_data.0, i64 %range.0
  %15 = add i64 %range.0, -4
  %16 = sub i64 %15, %_23
  %umax = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %16)
  %17 = add i64 %umax, %_23
  %18 = add i64 %17, 4
  %19 = sub i64 %18, %range.0
  %20 = sub i64 %range.0, %_23
  %umax54 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %20)
  %21 = add i64 %umax54, %_23
  %22 = sub i64 %21, %range.0
  %umin = tail call i64 @llvm.umin.i64(i64 %19, i64 %22)
  %23 = xor i64 %range.0, -1
  %24 = add i64 %range.1, %23
  %umin55 = tail call i64 @llvm.umin.i64(i64 %umin, i64 %24)
  %25 = add i64 %range.0, -4
  %umax56 = tail call i64 @llvm.umax.i64(i64 %image_data.1, i64 %25)
  %26 = add i64 %umax56, 4
  %27 = sub i64 %26, %range.0
  %umin57 = tail call i64 @llvm.umin.i64(i64 %umin55, i64 %27)
  %28 = tail call i64 @llvm.usub.sat.i64(i64 %image_data.1, i64 %range.0)
  %umin59 = tail call i64 @llvm.umin.i64(i64 %umin57, i64 %28)
  %29 = add i64 %range.0, %umin59
  %30 = getelementptr i8, ptr %image_data.0, i64 %29
  %scevgep60 = getelementptr i8, ptr %30, i64 1
  %scevgep61 = getelementptr i8, ptr %image_data.0, i64 %25
  %31 = getelementptr i8, ptr %image_data.0, i64 %29
  %scevgep62 = getelementptr i8, ptr %31, i64 -3
  %scevgep63 = getelementptr i8, ptr %image_data.0, i64 %20
  %32 = add i64 %range.0, %umin59
  %33 = add i64 %32, 1
  %34 = sub i64 %33, %_23
  %scevgep64 = getelementptr i8, ptr %image_data.0, i64 %34
  %scevgep65 = getelementptr i8, ptr %image_data.0, i64 %16
  %35 = add i64 %32, -3
  %36 = sub i64 %35, %_23
  %scevgep66 = getelementptr i8, ptr %image_data.0, i64 %36
  %bound0 = icmp ult ptr %scevgep, %scevgep62
  %bound1 = icmp ult ptr %scevgep61, %scevgep60
  %found.conflict = and i1 %bound0, %bound1
  %bound067 = icmp ult ptr %scevgep, %scevgep64
  %bound168 = icmp ult ptr %scevgep63, %scevgep60
  %found.conflict69 = and i1 %bound067, %bound168
  %conflict.rdx = or i1 %found.conflict, %found.conflict69
  %bound070 = icmp ult ptr %scevgep, %scevgep66
  %bound171 = icmp ult ptr %scevgep65, %scevgep60
  %found.conflict72 = and i1 %bound070, %bound171
  %conflict.rdx73 = or i1 %conflict.rdx, %found.conflict72
  br i1 %conflict.rdx73, label %bb2.preheader, label %vector.ph

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-LLVM — Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
    A-autovectorization — Area: Autovectorization, which can impact perf or code size.
    C-optimization — Category: An issue highlighting optimization opportunities or PRs implementing such.
    I-slow — Issue: Problems and improvements with respect to performance of generated code.
    T-compiler — Relevant to the compiler team, which will review and decide on the PR/issue.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions