Skip to content

Commit 146ff74

Browse files
committed
Use 128-bit Widening Multiply on More Platforms
The 128-bit widening multiplication was previously gated by simply checking the target pointer width. This works as a simple heuristic, but a better heuristic can be used:

1. Most 64-bit architectures support the 64-bit to 128-bit widening multiplication; SPARC64 and Wasm64 are the exceptions, so the widening multiplication shouldn't be used on those two architectures.
2. The target pointer width doesn't always indicate that we are dealing with a 64-bit architecture, as there are ABIs that reduce the pointer width, especially on AArch64 and x86-64.
3. WebAssembly (regardless of pointer width) supports 64-bit to 128-bit widening multiplication with the `wide-arithmetic` proposal.

The `wide-arithmetic` proposal is available since the LLVM 20 update and works perfectly for this use case, as can be seen here: https://rust.godbolt.org/z/9jY7fxqxK

Using `wasmtime explore`, we can see that it compiles down to the ideal instructions on x86-64:

```nasm
mulx rax, rdx, r10
xor rax, rdx
```

Based on the same change in [`foldhash`](orlp/foldhash#17).
1 parent dc5c33f commit 146ff74

File tree

1 file changed

+29
-5
lines changed

1 file changed

+29
-5
lines changed

src/lib.rs

Lines changed: 29 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -197,11 +197,27 @@ const PREVENT_TRIVIAL_ZERO_COLLAPSE: u64 = 0xa4093822299f31d0;
197197

198198
#[inline]
199199
fn multiply_mix(x: u64, y: u64) -> u64 {
200-
#[cfg(target_pointer_width = "64")]
200+
// The following code path is only fast if 64-bit to 128-bit widening
201+
// multiplication is supported by the architecture. Most 64-bit
202+
// architectures except SPARC64 and Wasm64 support it. However, the target
203+
// pointer width doesn't always indicate that we are dealing with a 64-bit
204+
// architecture, as there are ABIs that reduce the pointer width, especially
205+
// on AArch64 and x86-64. WebAssembly (regardless of pointer width) supports
206+
// 64-bit to 128-bit widening multiplication with the `wide-arithmetic`
207+
// proposal.
208+
#[cfg(any(
209+
all(
210+
target_pointer_width = "64",
211+
not(any(target_arch = "sparc64", target_arch = "wasm64")),
212+
),
213+
target_arch = "aarch64",
214+
target_arch = "x86_64",
215+
all(target_family = "wasm", target_feature = "wide-arithmetic"),
216+
))]
201217
{
202218
// We compute the full u64 x u64 -> u128 product, this is a single mul
203219
// instruction on x86-64, one mul plus one mulhi on ARM64.
204-
let full = (x as u128) * (y as u128);
220+
let full = (x as u128).wrapping_mul(y as u128);
205221
let lo = full as u64;
206222
let hi = (full >> 64) as u64;
207223

@@ -218,7 +234,15 @@ fn multiply_mix(x: u64, y: u64) -> u64 {
218234
// Multiplicative hashing is universal in a field (like mod p).
219235
}
220236

221-
#[cfg(target_pointer_width = "32")]
237+
#[cfg(not(any(
238+
all(
239+
target_pointer_width = "64",
240+
not(any(target_arch = "sparc64", target_arch = "wasm64")),
241+
),
242+
target_arch = "aarch64",
243+
target_arch = "x86_64",
244+
all(target_family = "wasm", target_feature = "wide-arithmetic"),
245+
)))]
222246
{
223247
// u64 x u64 -> u128 product is prohibitively expensive on 32-bit.
224248
// Decompose into 32-bit parts.
@@ -228,8 +252,8 @@ fn multiply_mix(x: u64, y: u64) -> u64 {
228252
let hy = (y >> 32) as u32;
229253

230254
// u32 x u32 -> u64 the low bits of one with the high bits of the other.
231-
let afull = (lx as u64) * (hy as u64);
232-
let bfull = (hx as u64) * (ly as u64);
255+
let afull = (lx as u64).wrapping_mul(hy as u64);
256+
let bfull = (hx as u64).wrapping_mul(ly as u64);
233257

234258
// Combine, swapping low/high of one of them so the upper bits of the
235259
// product of one combine with the lower bits of the other.

0 commit comments

Comments
 (0)