From f55658b17d0b27c71a84511b578604ccf9d01cd2 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Thu, 3 Oct 2024 18:36:47 +0200 Subject: [PATCH] refactor(rust): Migrate to hashbrown 0.15 (#19091) --- Cargo.lock | 76 ++++++++++++------- Cargo.toml | 4 +- crates/polars-core/Cargo.toml | 3 +- .../chunked_array/builder/list/categorical.rs | 54 +++++-------- .../logical/categorical/builder.rs | 66 +++++++--------- crates/polars-io/src/file_cache/cache.rs | 2 +- 6 files changed, 102 insertions(+), 103 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dbd70821fd27..b889fde7503f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -180,7 +180,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "hashbrown", + "hashbrown 0.14.5", "num", ] @@ -235,9 +235,9 @@ dependencies = [ [[package]] name = "async-stream" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" dependencies = [ "async-stream-impl", "futures-core", @@ -246,9 +246,9 @@ dependencies = [ [[package]] name = "async-stream-impl" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", @@ -970,18 +970,18 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.18" +version = "4.5.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0956a43b323ac1afaffc053ed5c4b7c1f1800bacd1683c353aabbb752515dd3" +checksum = "7be5744db7978a28d9df86a214130d106a89ce49644cbc4e3f0c22c3fba30615" dependencies = [ "clap_builder", ] [[package]] name = "clap_builder" -version = "4.5.18" +version = "4.5.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d72166dd41634086d5803a47eb71ae740e61d84709c36f3c34110173db3961b" +checksum = "a5fbc17d3ef8278f55b282b2a2e75ae6f6c7d4bb70ed3d0382375104bfafdb4b" dependencies = [ "anstyle", "clap_lex", @@ -1469,6 +1469,12 @@ version = "1.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" +[[package]] +name = "foldhash" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f81ec6369c545a7d40e4589b5597581fa1c441fe1cce96dd1de43159910a36a2" + [[package]] name = "foreign_vec" version = "0.1.0" @@ -1694,7 +1700,7 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8588661a8607108a5ca69cab034063441a0413a0b041c13618a7dd348021ef6f" dependencies = [ - "hashbrown", + "hashbrown 0.14.5", "serde", ] @@ -1716,6 +1722,19 @@ dependencies = [ "serde", ] +[[package]] +name = "hashbrown" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", + "rayon", + "serde", +] + [[package]] name = "heck" version = "0.4.1" @@ -1970,12 +1989,12 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68b900aa2f7301e21c36462b170ee99994de34dff39a4a6a528e80e7376d07e5" +checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" dependencies = [ "equivalent", - "hashbrown", + "hashbrown 0.15.0", "serde", ] @@ -2139,7 +2158,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6e0d73b369f386f1c44abd9c570d5318f55ccde816ff4b562fa452e5182863d" dependencies = [ "core2", - "hashbrown", + "hashbrown 0.14.5", "rle-decode-fast", ] @@ -2207,7 +2226,7 @@ version = "0.12.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" dependencies = [ - "hashbrown", + "hashbrown 0.14.5", ] [[package]] @@ -2850,7 +2869,7 @@ dependencies = [ "flate2", "futures", "getrandom", - "hashbrown", + "hashbrown 0.15.0", "hex", "indexmap", "itoa", @@ -2921,7 +2940,8 @@ dependencies = [ "chrono-tz", "comfy-table", "either", - "hashbrown", + "hashbrown 0.14.5", + "hashbrown 0.15.0", "indexmap", "ndarray", "num-traits", @@ -3012,7 +3032,7 @@ dependencies = [ "fs4", "futures", "glob", - "hashbrown", + "hashbrown 0.15.0", "home", "itoa", "memchr", @@ -3052,7 +3072,7 @@ dependencies = [ "chrono", "chrono-tz", "fallible-streaming-iterator", - "hashbrown", + "hashbrown 0.15.0", "indexmap", "itoa", "num-traits", @@ -3125,7 +3145,7 @@ dependencies = [ "chrono", "chrono-tz", "either", - "hashbrown", + "hashbrown 0.15.0", "hex", "indexmap", "jsonpath_lib_polars_vendor", @@ -3161,7 +3181,7 @@ dependencies = [ "fallible-streaming-iterator", "flate2", "futures", - "hashbrown", + "hashbrown 0.15.0", "lz4", "lz4_flex", "num-traits", @@ -3187,7 +3207,7 @@ dependencies = [ "crossbeam-queue", "enum_dispatch", "futures", - "hashbrown", + "hashbrown 0.15.0", "num-traits", "polars-arrow", "polars-compute", @@ -3217,7 +3237,7 @@ dependencies = [ "ciborium", "either", "futures", - "hashbrown", + "hashbrown 0.15.0", "libloading", "memmap2", "once_cell", @@ -3370,7 +3390,7 @@ dependencies = [ "bytemuck", "bytes", "compact_str", - "hashbrown", + "hashbrown 0.15.0", "indexmap", "libc", "memmap2", @@ -3686,9 +3706,9 @@ dependencies = [ [[package]] name = "raw-cpuid" -version = "11.1.0" +version = "11.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb9ee317cfe3fbd54b36a511efc1edd42e216903c9cd575e686dd68a2ba90d8d" +checksum = "1ab240315c661615f2ee9f0f2cd32d5a7343a84d5ebcccb99d46e6637565e7b0" dependencies = [ "bitflags", ] @@ -4743,9 +4763,9 @@ checksum = "eaea85b334db583fe3274d12b4cd1880032beab409c0d774be044d4480ab9a94" [[package]] name = "unicode-bidi" -version = "0.3.15" +version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08f95100a766bf4f8f28f90d77e0a5461bbdb219042e7679bebe79004fed8d75" +checksum = "5ab17db44d7388991a428b2ee655ce0c212e862eff1768a455c58f9aad6e7893" [[package]] name = "unicode-ident" diff --git a/Cargo.toml b/Cargo.toml index d266ead65b9d..39204515e5af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,7 +51,9 @@ fallible-streaming-iterator = "0.1.9" fast-float = { version = "0.2" } flate2 = { version = "1", default-features = false } futures = "0.3.25" -hashbrown = { version = "=0.14.5", features = ["rayon", "ahash", "serde"] } +hashbrown = { version = "0.15.0", features = ["rayon", "serde"] } +# https://github.com/rust-lang/hashbrown/issues/564 +hashbrown_old_nightly_hack = { package = "hashbrown", version = "0.14.5", features = ["rayon", "serde"] } hex = "0.4.3" indexmap = { version = "2", features = ["std", "serde"] } itoa = "1.0.6" diff --git a/crates/polars-core/Cargo.toml b/crates/polars-core/Cargo.toml index a3c5b26e8386..95371084f40b 100644 --- a/crates/polars-core/Cargo.toml +++ b/crates/polars-core/Cargo.toml @@ -25,6 +25,7 @@ chrono-tz = { workspace = true, optional = true } comfy-table = { version = "7.1.1", default-features = false, optional = true } either = { workspace = true } hashbrown = { workspace = true } +hashbrown_old_nightly_hack = { workspace = true } indexmap = { workspace = true } ndarray = { workspace = true, optional = true } num-traits = { workspace = true } @@ -48,7 +49,7 @@ version_check = { workspace = true } [features] simd = ["arrow/simd", "polars-compute/simd"] -nightly = ["simd", "hashbrown/nightly", "polars-utils/nightly", "arrow/nightly"] +nightly = ["simd", "hashbrown/nightly", "hashbrown_old_nightly_hack/nightly", "polars-utils/nightly", "arrow/nightly"] avx512 = [] docs = [] temporal = ["regex", "chrono", "polars-error/regex"] diff --git a/crates/polars-core/src/chunked_array/builder/list/categorical.rs b/crates/polars-core/src/chunked_array/builder/list/categorical.rs index 3670e0ab3df9..efb4406bc4c1 100644 --- a/crates/polars-core/src/chunked_array/builder/list/categorical.rs +++ b/crates/polars-core/src/chunked_array/builder/list/categorical.rs @@ -1,3 +1,6 @@ +use hashbrown::hash_table::Entry; +use hashbrown::HashTable; + use super::*; pub fn create_categorical_chunked_listbuilder( @@ -75,15 +78,12 @@ impl ListBuilderTrait for ListEnumCategoricalChunkedBuilder { struct ListLocalCategoricalChunkedBuilder { inner: ListPrimitiveChunkedBuilder, - idx_lookup: PlHashMap, + idx_lookup: HashTable, ordering: CategoricalOrdering, categories: MutablePlString, categories_hash: u128, } -// Wrap u32 key to avoid incorrect usage of hashmap with custom lookup -struct KeyWrapper(u32); - impl ListLocalCategoricalChunkedBuilder { #[inline] pub fn get_hash_builder() -> PlRandomState { @@ -104,10 +104,7 @@ impl ListLocalCategoricalChunkedBuilder { values_capacity, DataType::UInt32, ), - idx_lookup: PlHashMap::with_capacity_and_hasher( - capacity, - ListLocalCategoricalChunkedBuilder::get_hash_builder(), - ), + idx_lookup: HashTable::with_capacity(capacity), ordering, categories: MutablePlString::with_capacity(capacity), categories_hash: hash, @@ -141,33 +138,24 @@ impl ListBuilderTrait for ListLocalCategoricalChunkedBuilder { // Custom hashing / equality functions for comparing the &str to the idx // SAFETY: index in hashmap are within bounds of categories - let r = unsafe { - self.idx_lookup.raw_table_mut().find_or_find_insert_slot( + unsafe { + let r = self.idx_lookup.entry( hash_cat, - |(k, _)| self.categories.value_unchecked(k.0 as usize) == cat, - |(k, _): &(KeyWrapper, ())| { - hash_builder.hash_one(self.categories.value_unchecked(k.0 as usize)) + |k| self.categories.value_unchecked(*k as usize) == cat, + |k| hash_builder.hash_one(self.categories.value_unchecked(*k as usize)), + ); + + match r { + Entry::Occupied(v) => { + // SAFETY: bucket is initialized. + idx_mapping.insert_unique_unchecked(idx as u32, *v.get()); + }, + Entry::Vacant(slot) => { + idx_mapping.insert_unique_unchecked(idx as u32, len as u32); + self.categories.push(Some(cat)); + slot.insert(len as u32); }, - ) - }; - - match r { - Ok(v) => { - // SAFETY: Bucket is initialized - idx_mapping.insert_unique_unchecked(idx as u32, unsafe { v.as_ref().0 .0 }); - }, - Err(e) => { - idx_mapping.insert_unique_unchecked(idx as u32, len as u32); - self.categories.push(Some(cat)); - // SAFETY: No mutations in hashmap since find_or_find_insert_slot call - unsafe { - self.idx_lookup.raw_table_mut().insert_in_slot( - hash_cat, - e, - (KeyWrapper(len as u32), ()), - ) - }; - }, + } } } diff --git a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs index 23739e71d965..53199329e20f 100644 --- a/crates/polars-core/src/chunked_array/logical/categorical/builder.rs +++ b/crates/polars-core/src/chunked_array/logical/categorical/builder.rs @@ -1,23 +1,20 @@ use arrow::array::*; use arrow::legacy::trusted_len::TrustedLenPush; use hashbrown::hash_map::Entry; +use hashbrown::hash_table::{Entry as HTEntry, HashTable}; use polars_utils::itertools::Itertools; use crate::hashing::_HASHMAP_INIT_SIZE; use crate::prelude::*; use crate::{using_string_cache, StringCache, POOL}; -// Wrap u32 key to avoid incorrect usage of hashmap with custom lookup -#[repr(transparent)] -struct KeyWrapper(u32); - pub struct CategoricalChunkedBuilder { cat_builder: UInt32Vec, name: PlSmallStr, ordering: CategoricalOrdering, categories: MutablePlString, - // hashmap utilized by the local builder - local_mapping: PlHashMap, + local_mapping: HashTable, + local_hasher: PlRandomState, } impl CategoricalChunkedBuilder { @@ -27,44 +24,33 @@ impl CategoricalChunkedBuilder { name, ordering, categories: MutablePlString::with_capacity(_HASHMAP_INIT_SIZE), - local_mapping: PlHashMap::with_capacity_and_hasher( - capacity / 10, - StringCache::get_hash_builder(), - ), + local_mapping: HashTable::with_capacity(capacity / 10), + local_hasher: StringCache::get_hash_builder(), } } fn get_cat_idx(&mut self, s: &str, h: u64) -> (u32, bool) { let len = self.local_mapping.len() as u32; - // Custom hashing / equality functions for comparing the &str to the idx // SAFETY: index in hashmap are within bounds of categories - let r = unsafe { - self.local_mapping.raw_table_mut().find_or_find_insert_slot( + unsafe { + let r = self.local_mapping.entry( h, - |(k, _)| self.categories.value_unchecked(k.0 as usize) == s, - |(k, _): &(KeyWrapper, ())| { - StringCache::get_hash_builder() - .hash_one(self.categories.value_unchecked(k.0 as usize)) + |k| self.categories.value_unchecked(*k as usize) == s, + |k| { + self.local_hasher + .hash_one(self.categories.value_unchecked(*k as usize)) }, - ) - }; + ); - match r { - Ok(v) => { - // SAFETY: Bucket is initialized - (unsafe { v.as_ref().0 .0 }, false) - }, - Err(e) => { - self.categories.push(Some(s)); - // SAFETY: No mutations in hashmap since find_or_find_insert_slot call - unsafe { - self.local_mapping - .raw_table_mut() - .insert_in_slot(h, e, (KeyWrapper(len), ())) - }; - (len, true) - }, + match r { + HTEntry::Occupied(v) => (*v.get(), false), + HTEntry::Vacant(slot) => { + self.categories.push(Some(s)); + slot.insert(len); + (len, true) + }, + } } } @@ -72,13 +58,13 @@ impl CategoricalChunkedBuilder { /// Returns the index and if the value was new. #[inline] pub fn register_value(&mut self, s: &str) -> (u32, bool) { - let h = self.local_mapping.hasher().hash_one(s); + let h = self.local_hasher.hash_one(s); self.get_cat_idx(s, h) } #[inline] pub fn append_value(&mut self, s: &str) { - let h = self.local_mapping.hasher().hash_one(s); + let h = self.local_hasher.hash_one(s); let idx = self.get_cat_idx(s, h).0; self.cat_builder.push(Some(idx)); } @@ -115,14 +101,14 @@ impl CategoricalChunkedBuilder { // Save hashes for later when inserting into the global hashmap. let mut hashes = Vec::with_capacity(_HASHMAP_INIT_SIZE); for s in self.categories.values_iter() { - hashes.push(self.local_mapping.hasher().hash_one(s)); + hashes.push(self.local_hasher.hash_one(s)); } for opt_s in iter { match opt_s { None => self.append_null(), Some(s) => { - let hash = self.local_mapping.hasher().hash_one(s); + let hash = self.local_hasher.hash_one(s); let (cat_idx, new) = self.get_cat_idx(s, hash); self.cat_builder.push(Some(cat_idx)); if new { @@ -211,7 +197,9 @@ fn fill_global_to_local(local_to_global: &[u32], global_to_local: &mut PlHashMap #[allow(clippy::explicit_counter_loop)] for global_idx in local_to_global { // we know the keys are unique so this is much faster - global_to_local.insert_unique_unchecked(*global_idx, local_idx); + unsafe { + global_to_local.insert_unique_unchecked(*global_idx, local_idx); + } local_idx += 1; } } diff --git a/crates/polars-io/src/file_cache/cache.rs b/crates/polars-io/src/file_cache/cache.rs index fb473a93a9e8..e23c65af7e77 100644 --- a/crates/polars-io/src/file_cache/cache.rs +++ b/crates/polars-io/src/file_cache/cache.rs @@ -158,7 +158,7 @@ impl FileCache { get_file_fetcher()?, ttl, )); - entries.insert_unique_unchecked(uri, entry.clone()); + entries.insert(uri, entry.clone()); Ok(entry.clone()) } }