Skip to content

Pessimising rewrite in contains_zero_byte #148041

Open
@Kmeakin

Description

@Kmeakin

Consider the following Rust code for determining if any of the 8 bytes in a u64 are zero, taken from the Rust standard library's implementation of memchr:

// 0x01 in every byte: subtracting this from a word subtracts 1 from each byte
// (borrows may propagate across byte boundaries).
const LO: u64 = 0x01_01_01_01_01_01_01_01;
// 0x80 in every byte: selects the most significant bit of each byte.
const HI: u64 = 0x80_80_80_80_80_80_80_80;

/// Returns `true` if any of the 8 bytes of `x` is zero.
///
/// `x.wrapping_sub(LO)` sets the high bit of a byte when that byte was zero
/// (it wraps to 0xFF) or was already >= 0x81; `& !x` then discards bytes whose
/// high bit was set in `x` itself, and `& HI` keeps only the per-byte high bits.
/// The result is non-zero exactly when some byte of `x` is zero.
const fn contains_zero_byte(x: u64) -> bool {
    // In Rust `&` binds tighter than `!=`, so this parses as `(... & HI) != 0`.
    x.wrapping_sub(LO) & !x & HI != 0
}

The equivalent C++ code is:
https://godbolt.org/z/Ej8TT8v84

// NOTE(review): `u64` is not declared in this snippet; presumably an alias for
// a 64-bit unsigned integer (the assembly below shows `unsigned long`).
// 0x01 in every byte: subtracting this subtracts 1 from each byte of the word.
constexpr u64 LO = 0x01'01'01'01'01'01'01'01;
// 0x80 in every byte: selects the most significant bit of each byte.
constexpr u64 HI = 0x80'80'80'80'80'80'80'80;

// Returns true if any of the 8 bytes of `x` is zero.
// A zero byte wraps to 0xFF under `x - LO` (high bit set) while its high bit in
// `~x` is also set; bytes whose high bit was already set in `x` are masked out
// by `~x`. `& HI` keeps only the per-byte high bits.
bool contains_zero_byte(u64 x) { 
    return ((x - LO) & ~x & HI) != 0;
}

For this function, GCC generates:

;; AArch64:
contains_zero(unsigned long):
        mov     x1, -72340172838076674
        movk    x1, 0xfeff, lsl 0
        add     x1, x0, x1
        bic     x1, x1, x0
        tst     x1, -9187201950435737472
        cset    w0, ne
        ret

;; x86_64:
contains_zero_byte(unsigned long):
        movabs  rax, -72340172838076673
        add     rax, rdi
        andn    rdi, rdi, rax
        movabs  rax, -9187201950435737472
        test    rdi, rax
        setne   al
        ret

but LLVM generates:

;; AArch64:
contains_zero_byte(unsigned long):
        mov     x8, #72340172838076673
        mov     x9, #-9187201950435737472
        movk    x8, #256
        sub     x8, x8, x0
        orr     x8, x8, x0
        bics    xzr, x9, x8
        cset    w0, ne
        ret

;; x86_64:
contains_zero_byte(unsigned long):
        movabs  rax, 72340172838076672
        sub     rax, rdi
        or      rax, rdi
        movabs  rcx, -9187201950435737472
        andn    rax, rax, rcx
        setne   al
        ret

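If we rewrite the LLVM IR to match GCC's output, we get the expected (shorter) assembly (llc output, alive proof). Below, `@src` is the form LLVM currently produces and `@tgt` is the rewritten, GCC-style form: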

; Source side of the Alive proof. This appears to be the form LLVM's optimizer
; settles on: it mirrors the sub/or/and-not sequence in the LLVM assembly above.
define dso_local noundef i1 @src(i64 noundef %0) local_unnamed_addr #0 {
  ; 0x0101010101010100 is LO - 1, so %2 = (LO - 1) - x, which equals ~(x - LO).
  %2 = sub i64 u0x0101010101010100, %0
  ; %3 = ~(x - LO) | x, i.e. the complement of (x - LO) & ~x.
  %3 = or i64 %2, %0
  ; Keep only the most significant bit of each byte.
  %4 = and i64 %3, u0x8080808080808080
  ; True when at least one per-byte high bit is clear in %3,
  ; i.e. when ((x - LO) & ~x & HI) != 0.
  %5 = icmp ne i64 %4, u0x8080808080808080
  ret i1 %5
}

; Target side of the Alive proof: the same predicate written in the shape of
; GCC's output (add constant, and-not, test against the high-bit mask).
define dso_local noundef i1 @tgt(i64 noundef %0) local_unnamed_addr #0 {
  ; %not_0 = ~x
  %not_0 = xor i64 %0, -1
  ; 0xFEFEFEFEFEFEFEFF is -LO in two's complement, so %2 = x - LO.
  %2 = add i64 %0, u0xFEFEFEFEFEFEFEFF
  ; %3 = (x - LO) & ~x
  %3 = and i64 %2, %not_0
  ; Keep only the most significant bit of each byte.
  %4 = and i64 %3, u0x8080808080808080
  ; Non-zero exactly when some byte of x is zero.
  %5 = icmp ne i64 %4, 0
  ret i1 %5
}

Rewriting the C++ function does not produce the expected assembly, so I assume the problem is that InstCombine is canonicalising to a less optimal form.

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions