-
Notifications
You must be signed in to change notification settings - Fork 174
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add IANA/BCP47 time zone name mappings #3499
Changes from 6 commits
1ae7862
5dc8c6d
5341e17
85df3f9
2fc7d31
1e49656
062a52b
e0964a7
f3d5859
dceaa5b
744d2d5
a58a33a
ad9e056
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
// This file is part of ICU4X. For terms of use, please see the file | ||
// called LICENSE at the top level of the ICU4X source tree | ||
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
|
||
//! 🚧 \[Unstable\] Time zone name mapping data for this component | ||
//! | ||
//! <div class="stab unstable"> | ||
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
//! to be stable, their Rust representation might not be. Use with caution. | ||
//! </div> | ||
//! | ||
//! Read more about data providers: [`icu_provider`] | ||
|
||
use alloc::boxed::Box; | ||
use core::cmp::Ordering; | ||
use core::str; | ||
|
||
use icu_provider::prelude::*; | ||
|
||
use crate::TimeZoneBcp47Id; | ||
use tinystr::UnvalidatedTinyAsciiStr; | ||
use zerovec::ule::{UnvalidatedStr, VarULE}; | ||
use zerovec::{maps::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroMap}; | ||
|
||
/// This is a time zone identifier that can be "loose matched" according to | ||
/// [ECMAScript Temporal](https://tc39.es/proposal-temporal/#sec-isavailabletimezonename) | ||
/// | ||
/// (matched case-insensitively in ASCII) | ||
/// | ||
/// This is expected to be ASCII, but we do not rely on this invariant anywhere except during | ||
/// datagen. | ||
/// | ||
/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items | ||
/// will sort into the same area, such that a map can be searched for both strict and loose equality. | ||
/// | ||
/// <div class="stab unstable"> | ||
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
/// to be stable, their Rust representation might not be. Use with caution. | ||
/// </div> | ||
/// | ||
/// # Examples | ||
/// | ||
/// Using a [`NormalizedTimeZoneIdStr`] as the key of a [`ZeroMap`]: | ||
/// | ||
/// ``` | ||
/// use icu_timezone::provider::names::NormalizedTimeZoneIdStr; | ||
/// use zerovec::ZeroMap; | ||
/// | ||
/// let map: ZeroMap<NormalizedTimeZoneIdStr, usize> = [ | ||
/// (NormalizedTimeZoneIdStr::from_str("America/Los_Angeles"), 11), | ||
/// (NormalizedTimeZoneIdStr::from_str("Asia/Kolkata"), 22), | ||
/// (NormalizedTimeZoneIdStr::from_str("Europe/Berlin"), 33), | ||
/// ] | ||
/// .into_iter() | ||
/// .collect(); | ||
/// | ||
/// let key_approx = NormalizedTimeZoneIdStr::from_str("europe/berlin"); | ||
/// let key_exact = NormalizedTimeZoneIdStr::from_str("Europe/Berlin"); | ||
/// | ||
/// // Strict lookup: | ||
/// assert_eq!(None, map.get_copied(key_approx)); | ||
/// assert_eq!(Some(33), map.get_copied(key_exact)); | ||
/// | ||
/// // Loose lookup: | ||
/// assert_eq!(Some(33), map.get_copied_by(|u| u.cmp_loose(key_approx))); | ||
/// assert_eq!(Some(33), map.get_copied_by(|u| u.cmp_loose(key_exact))); | ||
/// ``` | ||
#[derive(PartialEq, Eq)] // VarULE wants these to be byte equality | ||
#[derive(Debug, VarULE)] | ||
#[cfg_attr(feature = "serde", derive(serde::Serialize))] | ||
#[repr(transparent)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thought: we really should see if we can get someone to make the |
||
pub struct NormalizedTimeZoneIdStr(UnvalidatedStr); | ||
|
||
/// This impl requires enabling the optional `serde` Cargo feature of the `icu_timezone` crate | ||
#[cfg(feature = "serde")] | ||
impl<'de> serde::Deserialize<'de> for Box<NormalizedTimeZoneIdStr> { | ||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> | ||
where | ||
D: serde::Deserializer<'de>, | ||
{ | ||
<Box<UnvalidatedStr>>::deserialize(deserializer).map(NormalizedTimeZoneIdStr::cast_box) | ||
} | ||
} | ||
|
||
/// This impl requires enabling the optional `serde` Cargo feature of the `icu_timezone` crate | ||
#[cfg(feature = "serde")] | ||
impl<'de, 'a> serde::Deserialize<'de> for &'a NormalizedTimeZoneIdStr | ||
where | ||
'de: 'a, | ||
{ | ||
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error> | ||
where | ||
D: serde::Deserializer<'de>, | ||
{ | ||
<&UnvalidatedStr>::deserialize(deserializer).map(NormalizedTimeZoneIdStr::cast_ref) | ||
} | ||
} | ||
|
||
impl<'a> ZeroMapKV<'a> for NormalizedTimeZoneIdStr { | ||
type Container = VarZeroVec<'a, NormalizedTimeZoneIdStr>; | ||
type Slice = VarZeroSlice<NormalizedTimeZoneIdStr>; | ||
type GetType = NormalizedTimeZoneIdStr; | ||
type OwnedType = Box<NormalizedTimeZoneIdStr>; | ||
} | ||
|
||
/// The Ord/PartialOrd impl will sort things using strict equality, but in such a way that all loose-equal items | ||
/// will sort into the same area, such that a map can be searched for both strict and loose equality. | ||
impl PartialOrd for NormalizedTimeZoneIdStr { | ||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> { | ||
Some(self.cmp(other)) | ||
} | ||
} | ||
|
||
/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items | ||
/// will sort into the same area, such that a map can be searched for both strict and loose equality. | ||
impl Ord for NormalizedTimeZoneIdStr { | ||
fn cmp(&self, other: &Self) -> Ordering { | ||
let cmp = self.cmp_loose(other); | ||
// When loose equality holds, fall back to strict equality | ||
if cmp == Ordering::Equal { | ||
self.0.cmp(&other.0) | ||
} else { | ||
cmp | ||
} | ||
} | ||
} | ||
|
||
impl NormalizedTimeZoneIdStr { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You have I guess because this is only used in a ZV, postcard uses ULE instead of Serialize, but this should still work correctly in case someone uses this in a non-ULE location. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More constructors can be added on an as-needed basis; I wanted to add the ones that are reachable. In fact I should probably double check if there are any I can delete. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. /s/constructors/serde impls |
||
/// Perform the loose comparison as defined in [`NormalizedTimeZoneIdStr`]. | ||
pub fn cmp_loose(&self, other: &Self) -> Ordering { | ||
let self_iter = self.0.iter().map(u8::to_ascii_lowercase); | ||
let other_iter = other.0.iter().map(u8::to_ascii_lowercase); | ||
self_iter.cmp(other_iter) | ||
} | ||
|
||
/// Convert a string reference to a [`NormalizedTimeZoneIdStr`]. | ||
pub const fn from_str(s: &str) -> &Self { | ||
Self::cast_ref(UnvalidatedStr::from_str(s)) | ||
} | ||
|
||
/// Convert a [`UnvalidatedStr`] reference to a [`NormalizedTimeZoneIdStr`] reference. | ||
pub const fn cast_ref(value: &UnvalidatedStr) -> &Self { | ||
// Safety: repr(transparent) | ||
unsafe { core::mem::transmute(value) } | ||
} | ||
|
||
/// Convert a [`UnvalidatedStr`] box to a [`NormalizedTimeZoneIdStr`] box. | ||
pub const fn cast_box(value: Box<UnvalidatedStr>) -> Box<Self> { | ||
// Safety: repr(transparent) | ||
unsafe { core::mem::transmute(value) } | ||
} | ||
|
||
/// Get a [`NormalizedTimeZoneIdStr`] box from a byte slice. | ||
pub fn boxed_from_bytes(b: &[u8]) -> Box<Self> { | ||
Self::cast_box(UnvalidatedStr::from_boxed_bytes(b.into())) | ||
} | ||
} | ||
|
||
/// A mapping from IANA time zone identifiers to BCP-47 time zone identifiers. | ||
/// | ||
/// Multiple IANA time zone IDs can map to the same BCP-47 time zone ID. | ||
/// | ||
/// <div class="stab unstable"> | ||
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
/// to be stable, their Rust representation might not be. Use with caution. | ||
/// </div> | ||
#[derive(Debug, Clone)] | ||
#[icu_provider::data_struct(marker(IanaToBcp47MapV1Marker, "time_zone/iana_to_bcp47@1"))] | ||
#[cfg_attr( | ||
feature = "datagen", | ||
derive(serde::Serialize, databake::Bake), | ||
databake(path = icu_timezone::provider::names), | ||
)] | ||
#[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
#[yoke(prove_covariance_manually)] | ||
pub struct IanaToBcp47MapV1<'data> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Having both IanaToBcp and BcpToIana seems wasteful, it's the same data in different order, and it's not small data. How about two ZeroMaps where the value is the index of the key in the other zeromap? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Would you like to discuss further? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will AsciiTrie be ready by the time is is released, or will that be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll experiment with some different data models, including experimental AsciiTrie, and report back the findings. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Some data size findings:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think a 7% size increase is worth paying for getting the other key for free. AsciiTrie will need a new data marker anyway. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussion: ok to go with AsciiTrie as long as it lands in time for 1.3 |
||
/// A map from IANA time zone identifiers to BCP-47 time zone identifiers | ||
#[cfg_attr(feature = "serde", serde(borrow))] | ||
pub map: ZeroMap<'data, NormalizedTimeZoneIdStr, TimeZoneBcp47Id>, | ||
} | ||
|
||
/// A mapping from BCP-47 time zone identifiers to IANA time zone identifiers. | ||
/// | ||
/// The BCP-47 time zone ID maps to the default IANA time zone ID according to the CLDR data. | ||
/// | ||
/// <div class="stab unstable"> | ||
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways, | ||
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed | ||
/// to be stable, their Rust representation might not be. Use with caution. | ||
/// </div> | ||
#[derive(Debug, Clone)] | ||
#[icu_provider::data_struct(marker(Bcp47ToIanaMapV1Marker, "time_zone/bcp47_to_iana@1"))] | ||
#[cfg_attr( | ||
feature = "datagen", | ||
derive(serde::Serialize, databake::Bake), | ||
databake(path = icu_timezone::provider::names), | ||
)] | ||
#[cfg_attr(feature = "serde", derive(serde::Deserialize))] | ||
#[yoke(prove_covariance_manually)] | ||
pub struct Bcp47ToIanaMapV1<'data> { | ||
/// A map from BCP-47 time zone identifiers to IANA time zone identifiers | ||
#[cfg_attr(feature = "serde", serde(borrow))] | ||
pub map: ZeroMap<'data, UnvalidatedTinyAsciiStr<8>, str>, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. thought: we could store the str unvalidated and lazily validate as well (GIGO returning None when validation is not possible) unclear if this is at all worth it |
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
// This file is part of ICU4X. For terms of use, please see the file | ||
// called LICENSE at the top level of the ICU4X source tree | ||
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
|
||
use super::convert::compute_bcp47_tzids_btreemap; | ||
use crate::transform::cldr::cldr_serde; | ||
use icu_provider::datagen::IterableDataProvider; | ||
use icu_provider::prelude::*; | ||
use icu_timezone::provider::names::*; | ||
|
||
impl DataProvider<IanaToBcp47MapV1Marker> for crate::DatagenProvider { | ||
fn load(&self, _: DataRequest) -> Result<DataResponse<IanaToBcp47MapV1Marker>, DataError> { | ||
let resource: &cldr_serde::time_zones::bcp47_tzid::Resource = | ||
self.source | ||
.cldr()? | ||
.bcp47() | ||
.read_and_parse("timezone.json")?; | ||
let bcp47_tzid_data = &compute_bcp47_tzids_btreemap(&resource.keyword.u.time_zones.values); | ||
let data_struct = IanaToBcp47MapV1 { | ||
map: bcp47_tzid_data | ||
.iter() | ||
.map(|(k, v)| (NormalizedTimeZoneIdStr::boxed_from_bytes(k.as_bytes()), v)) | ||
.collect(), | ||
}; | ||
Ok(DataResponse { | ||
metadata: Default::default(), | ||
payload: Some(DataPayload::from_owned(data_struct)), | ||
}) | ||
} | ||
} | ||
|
||
impl IterableDataProvider<IanaToBcp47MapV1Marker> for crate::DatagenProvider { | ||
fn supported_locales(&self) -> Result<Vec<DataLocale>, DataError> { | ||
Ok(vec![Default::default()]) | ||
} | ||
} | ||
|
||
impl DataProvider<Bcp47ToIanaMapV1Marker> for crate::DatagenProvider { | ||
fn load(&self, _: DataRequest) -> Result<DataResponse<Bcp47ToIanaMapV1Marker>, DataError> { | ||
let resource: &cldr_serde::time_zones::bcp47_tzid::Resource = | ||
self.source | ||
.cldr()? | ||
.bcp47() | ||
.read_and_parse("timezone.json")?; | ||
// Note: The BTreeMap retains the order of the aliases, which is important for establishing | ||
// the canonical order of the IANA names. | ||
let bcp47_tzid_data = &compute_bcp47_tzids_btreemap(&resource.keyword.u.time_zones.values); | ||
let data_struct = Bcp47ToIanaMapV1 { | ||
map: bcp47_tzid_data | ||
.iter() | ||
.map(|(k, v)| (v.0.to_unvalidated(), k.as_str())) | ||
.collect(), | ||
}; | ||
Ok(DataResponse { | ||
metadata: Default::default(), | ||
payload: Some(DataPayload::from_owned(data_struct)), | ||
}) | ||
} | ||
} | ||
|
||
impl IterableDataProvider<Bcp47ToIanaMapV1Marker> for crate::DatagenProvider { | ||
fn supported_locales(&self) -> Result<Vec<DataLocale>, DataError> { | ||
Ok(vec![Default::default()]) | ||
} | ||
} |
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
issue: not sure if we can say this fixes #2909 since this is just the data model, there's no fetcher struct yet
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just pushed the API