From 6849c16d79e50be6d6713b2a9464bf7314060703 Mon Sep 17 00:00:00 2001
From: Christopher Serr
Date: Mon, 7 Jul 2025 17:23:51 +0200
Subject: [PATCH] Use 128-bit Widening Multiply on More Platforms

The 128-bit widening multiplication was previously gated by simply
checking the target pointer width. This works as a simple heuristic,
but a better one can be used:

1. Most 64-bit architectures except SPARC64 and Wasm64 support 128-bit
   widening multiplication, so it shouldn't be used on those two
   architectures.
2. The target pointer width doesn't always indicate that we are dealing
   with a 64-bit architecture, as there are ABIs that reduce the
   pointer width, especially on AArch64 and x86-64.
3. WebAssembly (regardless of pointer width) supports 64-bit to 128-bit
   widening multiplication with the `wide-arithmetic` proposal.

The `wide-arithmetic` proposal has been available since the LLVM 20
update and works perfectly for this use case, as can be seen here:
https://rust.godbolt.org/z/9jY7fxqxK

Using `wasmtime explore`, we can see that it compiles down to the ideal
instructions on x86-64:

```nasm
mulx rax, rdx, r10
xor rax, rdx
```

Based on the same change in
[`foldhash`](https://github.com/orlp/foldhash/pull/17).
---
 src/lib.rs | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 03117c9..0e83e2d 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -197,11 +197,26 @@ const PREVENT_TRIVIAL_ZERO_COLLAPSE: u64 = 0xa4093822299f31d0;
 
 #[inline]
 fn multiply_mix(x: u64, y: u64) -> u64 {
-    #[cfg(target_pointer_width = "64")]
-    {
+    // The following code path is only fast if 64-bit to 128-bit widening
+    // multiplication is supported by the architecture. Most 64-bit
+    // architectures except SPARC64 and Wasm64 support it. However, the target
+    // pointer width doesn't always indicate that we are dealing with a 64-bit
+    // architecture, as there are ABIs that reduce the pointer width, especially
+    // on AArch64 and x86-64. WebAssembly (regardless of pointer width) supports
+    // 64-bit to 128-bit widening multiplication with the `wide-arithmetic`
+    // proposal.
+    if cfg!(any(
+        all(
+            target_pointer_width = "64",
+            not(any(target_arch = "sparc64", target_arch = "wasm64")),
+        ),
+        target_arch = "aarch64",
+        target_arch = "x86_64",
+        all(target_family = "wasm", target_feature = "wide-arithmetic"),
+    )) {
         // We compute the full u64 x u64 -> u128 product, this is a single mul
         // instruction on x86-64, one mul plus one mulhi on ARM64.
-        let full = (x as u128) * (y as u128);
+        let full = (x as u128).wrapping_mul(y as u128);
         let lo = full as u64;
         let hi = (full >> 64) as u64;
 
@@ -216,10 +231,7 @@ fn multiply_mix(x: u64, y: u64) -> u64 {
         // x * y = 2^64 * hi + lo = (-1) * hi + lo = lo - hi, (mod 2^64 + 1)
         // x * y = 2^64 * hi + lo = 1 * hi + lo = lo + hi, (mod 2^64 - 1)
         // Multiplicative hashing is universal in a field (like mod p).
-    }
-
-    #[cfg(target_pointer_width = "32")]
-    {
+    } else {
         // u64 x u64 -> u128 product is prohibitively expensive on 32-bit.
         // Decompose into 32-bit parts.
         let lx = x as u32;
         let ly = y as u32;
         let hx = (x >> 32) as u32;
         let hy = (y >> 32) as u32;
 
         // u32 x u32 -> u64 the low bits of one with the high bits of the other.
-        let afull = (lx as u64) * (hy as u64);
-        let bfull = (hx as u64) * (ly as u64);
+        let afull = (lx as u64).wrapping_mul(hy as u64);
+        let bfull = (hx as u64).wrapping_mul(ly as u64);
 
         // Combine, swapping low/high of one of them so the upper bits of the
         // product of one combine with the lower bits of the other.
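For anyone who wants to poke at the codegen outside of the patch, the fast
path boils down to the standalone sketch below. The `widening_mix` name and
the example inputs are illustrative only and are not part of this change.

```rust
/// Widen to u128, multiply, then fold the high and low 64-bit halves
/// together with XOR. (Illustrative sketch, not the crate's actual API.)
fn widening_mix(x: u64, y: u64) -> u64 {
    // u64 x u64 widened to u128 cannot overflow, so `wrapping_mul` and `*`
    // are equivalent here; `wrapping_mul` just makes the intent explicit.
    let full = (x as u128).wrapping_mul(y as u128);
    (full as u64) ^ ((full >> 64) as u64)
}

fn main() {
    // Small changes to either input perturb the entire mixed value.
    println!("{:016x}", widening_mix(1, 0xa4093822299f31d0));
    println!("{:016x}", widening_mix(2, 0xa4093822299f31d0));
}
```

XOR-folding the two halves lets every bit of the 128-bit product influence
the result, which is why it pays off to enable the widening multiply on as
many targets as actually support it.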