Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add IANA/BCP47 time zone name mappings #3499

Closed
wants to merge 13 commits into from
5 changes: 5 additions & 0 deletions components/timezone/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ use tinystr::TinyAsciiStr;
use zerovec::ule::{AsULE, ULE};
use zerovec::{ZeroMap2d, ZeroSlice, ZeroVec};

pub mod names;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

issue: not sure if we can say this fixes #2909 since this is just the data model, there's no fetcher struct yet

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just pushed the API


pub use names::Bcp47ToIanaMapV1Marker;
pub use names::IanaToBcp47MapV1Marker;

/// TimeZone ID in BCP47 format
///
/// <div class="stab unstable">
Expand Down
207 changes: 207 additions & 0 deletions components/timezone/src/provider/names.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,207 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! 🚧 \[Unstable\] Property names-related data for this component
//!
//! <div class="stab unstable">
//! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
//! including in SemVer minor releases. While the serde representation of data structs is guaranteed
//! to be stable, their Rust representation might not be. Use with caution.
//! </div>
//!
//! Read more about data providers: [`icu_provider`]

use alloc::boxed::Box;
use core::cmp::Ordering;
use core::str;

use icu_provider::prelude::*;

use crate::TimeZoneBcp47Id;
use tinystr::UnvalidatedTinyAsciiStr;
use zerovec::ule::{UnvalidatedStr, VarULE};
use zerovec::{maps::ZeroMapKV, VarZeroSlice, VarZeroVec, ZeroMap};

/// This is a time zone identifier that can be "loose matched" as according to
/// [ECMAScript Temporal](https://tc39.es/proposal-temporal/#sec-isavailabletimezonename)
///
/// (matched case-insensitively in ASCII)
///
/// This is expected to be ASCII, but we do not rely on this invariant anywhere except during
/// datagen.
///
/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items
/// will sort into the same area, such that a map can be searched for both strict and loose equality.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
///
/// # Examples
///
/// Using a [`NormalizedTimeZoneIdStr`] as the key of a [`ZeroMap`]:
///
/// ```
/// use icu_timezone::provider::names::NormalizedTimeZoneIdStr;
/// use zerovec::ZeroMap;
///
/// let map: ZeroMap<NormalizedTimeZoneIdStr, usize> = [
/// (NormalizedTimeZoneIdStr::from_str("America/Los_Angeles"), 11),
/// (NormalizedTimeZoneIdStr::from_str("Asia/Kolkata"), 22),
/// (NormalizedTimeZoneIdStr::from_str("Europe/Berlin"), 33),
/// ]
/// .into_iter()
/// .collect();
///
/// let key_approx = NormalizedTimeZoneIdStr::from_str("europe/berlin");
/// let key_exact = NormalizedTimeZoneIdStr::from_str("Europe/Berlin");
///
/// // Strict lookup:
/// assert_eq!(None, map.get_copied(key_approx));
/// assert_eq!(Some(33), map.get_copied(key_exact));
///
/// // Loose lookup:
/// assert_eq!(Some(33), map.get_copied_by(|u| u.cmp_loose(key_approx)));
/// assert_eq!(Some(33), map.get_copied_by(|u| u.cmp_loose(key_exact)));
/// ```
#[derive(PartialEq, Eq)] // VarULE wants these to be byte equality
#[derive(Debug, VarULE)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
#[repr(transparent)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thought: we really should see if we can get someone to make the make_wrapper_varule macro

pub struct NormalizedTimeZoneIdStr(UnvalidatedStr);

/// This impl requires enabling the optional `serde` Cargo feature of the `icu_properties` crate
#[cfg(feature = "serde")]
impl<'de> serde::Deserialize<'de> for Box<NormalizedTimeZoneIdStr> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
<Box<UnvalidatedStr>>::deserialize(deserializer).map(NormalizedTimeZoneIdStr::cast_box)
}
}

/// This impl requires enabling the optional `serde` Cargo feature of the `icu_properties` crate
#[cfg(feature = "serde")]
impl<'de, 'a> serde::Deserialize<'de> for &'a NormalizedTimeZoneIdStr
where
'de: 'a,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
<&UnvalidatedStr>::deserialize(deserializer).map(NormalizedTimeZoneIdStr::cast_ref)
}
}

impl<'a> ZeroMapKV<'a> for NormalizedTimeZoneIdStr {
type Container = VarZeroVec<'a, NormalizedTimeZoneIdStr>;
type Slice = VarZeroSlice<NormalizedTimeZoneIdStr>;
type GetType = NormalizedTimeZoneIdStr;
type OwnedType = Box<NormalizedTimeZoneIdStr>;
}

/// The Ord/PartialOrd impl will sort things using strict equality, but in such a way that all loose-equal items
/// will sort into the same area, such that a map can be searched for both strict and loose equality.
impl PartialOrd for NormalizedTimeZoneIdStr {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}

/// The Ord impl will sort things using strict equality, but in such a way that all loose-equal items
/// will sort into the same area, such that a map can be searched for both strict and loose equality.
impl Ord for NormalizedTimeZoneIdStr {
fn cmp(&self, other: &Self) -> Ordering {
let cmp = self.cmp_loose(other);
// When loose equality holds, fall back to strict equality
if cmp == Ordering::Equal {
self.0.cmp(&other.0)
} else {
cmp
}
}
}

impl NormalizedTimeZoneIdStr {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You have Box and & constructors, but don't we need Cow to keep postcard alloc-free while allowing JSON?

I guess because this is only used in a ZV, postcard uses ULE instead of Serialize, but this should still work correctly in case someone uses this in a non-ULE location.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

More constructors can be added on an as-needed basis; I wanted to add the ones that are reachable. In fact I should probably double check if there are any I can delete.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/s/constructors/serde impls

/// Perform the loose comparison as defined in [`NormalizedTimeZoneIdStr`].
pub fn cmp_loose(&self, other: &Self) -> Ordering {
let self_iter = self.0.iter().map(u8::to_ascii_lowercase);
let other_iter = other.0.iter().map(u8::to_ascii_lowercase);
self_iter.cmp(other_iter)
}

/// Convert a string reference to a [`NormalizedTimeZoneIdStr`].
pub const fn from_str(s: &str) -> &Self {
Self::cast_ref(UnvalidatedStr::from_str(s))
}

/// Convert a [`UnvalidatedStr`] reference to a [`NormalizedTimeZoneIdStr`] reference.
pub const fn cast_ref(value: &UnvalidatedStr) -> &Self {
// Safety: repr(transparent)
unsafe { core::mem::transmute(value) }
}

/// Convert a [`UnvalidatedStr`] box to a [`NormalizedTimeZoneIdStr`] box.
pub const fn cast_box(value: Box<UnvalidatedStr>) -> Box<Self> {
// Safety: repr(transparent)
unsafe { core::mem::transmute(value) }
}

/// Get a [`NormalizedPropertyName`] box from a byte slice.
pub fn boxed_from_bytes(b: &[u8]) -> Box<Self> {
Self::cast_box(UnvalidatedStr::from_boxed_bytes(b.into()))
}
}

/// A mapping from IANA time zone identifiers to BCP-47 time zone identifiers.
///
/// Multiple IANA time zone IDs can map to the same BCP-47 time zone ID.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone)]
#[icu_provider::data_struct(marker(IanaToBcp47MapV1Marker, "time_zone/iana_to_bcp47@1"))]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize, databake::Bake),
databake(path = icu_timezone::provider::names),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct IanaToBcp47MapV1<'data> {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having both IanaToBcp and BcpToIana seems wasteful, it's the same data in different order, and it's not small data. How about two ZeroMaps where the value is the index of the key in the other zeromap?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. It's not exactly the same data; BCP47-to-IANA only contains canonical IDs, whereas IANA-to-BCP47 includes legacy and alias IANA IDs. Note that one is about 23% smaller than the other.
  2. By far the primary use case is IANA-to-BCP47. Most clients don't need BCP47-to-IANA, but it may be needed to support things like the Temporal API. Everyone else can slice out the other 12 kB.
  3. This follows the same design pattern as property names, which have separate keys for the two directions of mappings
  4. Making it a single data struct would increase data size for the common case (but I'm not sure by how much). Your proposed data model works well with ZeroMap; but I was also thinking that I would like to move these to AsciiTrie which doesn't support random indexing.

Would you like to discuss further?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will AsciiTrie be ready by the time is is released, or will that be V2 anyway?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll experiment with some different data models, including experimental AsciiTrie, and report back the findings.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some data size findings:

  • iana2bcp ZeroMap: 14475B (this PR)
  • bcp2iana ZeroMap: 11249B (this PR)
  • Combined iana/bcp with dual ZeroMaps: 15521B (Rob's suggestion)
  • iana2bcp AsciiTrie: 9820B

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think a 7% size increase is worth paying for getting the other key for free. AsciiTrie will need a new data marker anyway.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Discussion: ok to go with AsciiTrie as long as it lands in time for 1.3

/// A map from IANA time zone identifiers to BCP-47 time zone identifiers
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: ZeroMap<'data, NormalizedTimeZoneIdStr, TimeZoneBcp47Id>,
}

/// A mapping from IANA time zone identifiers to BCP-47 time zone identifiers.
///
/// The BCP-47 time zone ID maps to the default IANA time zone ID according to the CLDR data.
///
/// <div class="stab unstable">
/// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
/// including in SemVer minor releases. While the serde representation of data structs is guaranteed
/// to be stable, their Rust representation might not be. Use with caution.
/// </div>
#[derive(Debug, Clone)]
#[icu_provider::data_struct(marker(Bcp47ToIanaMapV1Marker, "time_zone/bcp47_to_iana@1"))]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize, databake::Bake),
databake(path = icu_timezone::provider::names),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[yoke(prove_covariance_manually)]
pub struct Bcp47ToIanaMapV1<'data> {
/// A map from BCP-47 time zone identifiers to IANA time zone identifiers
#[cfg_attr(feature = "serde", serde(borrow))]
pub map: ZeroMap<'data, UnvalidatedTinyAsciiStr<8>, str>,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thought: we could store the str unvalidated and lazily validate as well (GIGO returning None when validation is not possible)

unclear if this is at all worth it

}
2 changes: 2 additions & 0 deletions provider/datagen/src/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ registry!(
AndListV1Marker,
AsciiHexDigitV1Marker,
BasicEmojiV1Marker,
Bcp47ToIanaMapV1Marker,
BidiClassV1Marker,
BidiClassNameToValueV1Marker,
BidiClassValueToLongNameV1Marker,
Expand Down Expand Up @@ -193,6 +194,7 @@ registry!(
GregorianDateSymbolsV1Marker,
HexDigitV1Marker,
HyphenV1Marker,
IanaToBcp47MapV1Marker,
IdContinueV1Marker,
IdeographicV1Marker,
IdsBinaryOperatorV1Marker,
Expand Down
13 changes: 7 additions & 6 deletions provider/datagen/src/transform/cldr/time_zones/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use icu_datetime::provider::time_zones::{
use icu_timezone::provider::MetazonePeriodV1;
use icu_timezone::ZoneVariant;
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::collections::HashMap;
use tinystr::TinyStr8;

Expand All @@ -41,10 +42,10 @@ fn parse_hour_format(hour_format: &str) -> (Cow<'static, str>, Cow<'static, str>
(Cow::Owned(positive), Cow::Owned(negative))
}

fn compute_bcp47_tzids_hashmap(
pub(crate) fn compute_bcp47_tzids_btreemap(
bcp47_tzids_resource: &HashMap<TimeZoneBcp47Id, Bcp47TzidAliasData>,
) -> HashMap<String, TimeZoneBcp47Id> {
let mut bcp47_tzids = HashMap::new();
) -> BTreeMap<String, TimeZoneBcp47Id> {
let mut bcp47_tzids = BTreeMap::new();
for (bcp47_tzid, bcp47_tzid_data) in bcp47_tzids_resource.iter() {
if let Some(alias) = &bcp47_tzid_data.alias {
for data_value in alias.split(' ') {
Expand Down Expand Up @@ -173,7 +174,7 @@ impl From<CldrTimeZonesData<'_>> for ExemplarCitiesV1<'static> {
impl From<CldrTimeZonesData<'_>> for MetazonePeriodV1<'static> {
fn from(other: CldrTimeZonesData<'_>) -> Self {
let data = other.meta_zone_periods_resource;
let bcp47_tzid_data = &compute_bcp47_tzids_hashmap(other.bcp47_tzids_resource);
let bcp47_tzid_data = &compute_bcp47_tzids_btreemap(other.bcp47_tzids_resource);
let meta_zone_id_data = &compute_meta_zone_ids_hashmap(other.meta_zone_ids_resource);
Self(
data.iter()
Expand Down Expand Up @@ -239,7 +240,7 @@ macro_rules! long_short_impls {
impl From<CldrTimeZonesData<'_>> for $generic {
fn from(other: CldrTimeZonesData<'_>) -> Self {
let data = other.time_zone_names_resource;
let bcp47_tzid_data = &compute_bcp47_tzids_hashmap(other.bcp47_tzids_resource);
let bcp47_tzid_data = &compute_bcp47_tzids_btreemap(other.bcp47_tzids_resource);
let meta_zone_id_data =
&compute_meta_zone_ids_hashmap(other.meta_zone_ids_resource);
Self {
Expand Down Expand Up @@ -326,7 +327,7 @@ macro_rules! long_short_impls {
impl From<CldrTimeZonesData<'_>> for $specific {
fn from(other: CldrTimeZonesData<'_>) -> Self {
let data = other.time_zone_names_resource;
let bcp47_tzid_data = &compute_bcp47_tzids_hashmap(other.bcp47_tzids_resource);
let bcp47_tzid_data = &compute_bcp47_tzids_btreemap(other.bcp47_tzids_resource);
let meta_zone_id_data =
&compute_meta_zone_ids_hashmap(other.meta_zone_ids_resource);
Self {
Expand Down
1 change: 1 addition & 0 deletions provider/datagen/src/transform/cldr/time_zones/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use icu_timezone::provider::*;
use std::collections::HashMap;

mod convert;
mod names;

#[derive(Debug, Copy, Clone)]
struct CldrTimeZonesData<'a> {
Expand Down
65 changes: 65 additions & 0 deletions provider/datagen/src/transform/cldr/time_zones/names.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use super::convert::compute_bcp47_tzids_btreemap;
use crate::transform::cldr::cldr_serde;
use icu_provider::datagen::IterableDataProvider;
use icu_provider::prelude::*;
use icu_timezone::provider::names::*;

impl DataProvider<IanaToBcp47MapV1Marker> for crate::DatagenProvider {
fn load(&self, _: DataRequest) -> Result<DataResponse<IanaToBcp47MapV1Marker>, DataError> {
let resource: &cldr_serde::time_zones::bcp47_tzid::Resource =
self.source
.cldr()?
.bcp47()
.read_and_parse("timezone.json")?;
let bcp47_tzid_data = &compute_bcp47_tzids_btreemap(&resource.keyword.u.time_zones.values);
let data_struct = IanaToBcp47MapV1 {
map: bcp47_tzid_data
.iter()
.map(|(k, v)| (NormalizedTimeZoneIdStr::boxed_from_bytes(k.as_bytes()), v))
.collect(),
};
Ok(DataResponse {
metadata: Default::default(),
payload: Some(DataPayload::from_owned(data_struct)),
})
}
}

impl IterableDataProvider<IanaToBcp47MapV1Marker> for crate::DatagenProvider {
fn supported_locales(&self) -> Result<Vec<DataLocale>, DataError> {
Ok(vec![Default::default()])
}
}

impl DataProvider<Bcp47ToIanaMapV1Marker> for crate::DatagenProvider {
fn load(&self, _: DataRequest) -> Result<DataResponse<Bcp47ToIanaMapV1Marker>, DataError> {
let resource: &cldr_serde::time_zones::bcp47_tzid::Resource =
self.source
.cldr()?
.bcp47()
.read_and_parse("timezone.json")?;
// Note: The BTreeMap retains the order of the aliases, which is important for establishing
// the canonical order of the IANA names.
let bcp47_tzid_data = &compute_bcp47_tzids_btreemap(&resource.keyword.u.time_zones.values);
let data_struct = Bcp47ToIanaMapV1 {
map: bcp47_tzid_data
.iter()
.map(|(k, v)| (v.0.to_unvalidated(), k.as_str()))
.collect(),
};
Ok(DataResponse {
metadata: Default::default(),
payload: Some(DataPayload::from_owned(data_struct)),
})
}
}

impl IterableDataProvider<Bcp47ToIanaMapV1Marker> for crate::DatagenProvider {
fn supported_locales(&self) -> Result<Vec<DataLocale>, DataError> {
Ok(vec![Default::default()])
}
}
2 changes: 2 additions & 0 deletions provider/repodata/data/json/fingerprints.csv

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading