Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

i18n: upgrade to latest icu formatter #13834

Merged
merged 15 commits into from
Oct 5, 2023
50 changes: 25 additions & 25 deletions core/scripts/i18n/collect-strings.js
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import {pathToFileURL} from 'url';
import glob from 'glob';
import {expect} from 'expect';
import tsc from 'typescript';
import MessageParser from 'intl-messageformat-parser';
import MessageParser from '@formatjs/icu-messageformat-parser';
import esMain from 'es-main';
import isDeepEqual from 'lodash/isEqual.js';

Expand All @@ -24,6 +24,7 @@ import {pruneObsoleteLhlMessages} from './prune-obsolete-lhl-messages.js';
import {countTranslatedMessages} from './count-translated.js';
import {LH_ROOT} from '../../../shared/root.js';
import {resolveModulePath} from '../../../shared/esm-utils.js';
import {escapeIcuMessage} from '../../../shared/localization/format.js';

// Match declarations of UIStrings, terminating in either a `};\n` (very likely to always be right)
// or `}\n\n` (allowing semicolon to be optional, but insisting on a double newline so that an
Expand Down Expand Up @@ -189,36 +190,35 @@ function convertMessageToCtc(lhlMessage, examples = {}) {
* @param {string} lhlMessage
*/
function _lhlValidityChecks(lhlMessage) {
let parsedMessage;
let parsedMessageElements;
try {
parsedMessage = MessageParser.parse(lhlMessage);
parsedMessageElements = MessageParser.parse(escapeIcuMessage(lhlMessage), {ignoreTag: true});
} catch (err) {
if (err.name !== 'SyntaxError') throw err;
// Improve the intl-messageformat-parser syntax error output.
/** @type {Array<{text: string}>} */
const expected = err.expected;
const expectedStr = expected.map(exp => `'${exp.text}'`).join(', ');
throw new Error(`Did not find the expected syntax (one of ${expectedStr}) in message "${lhlMessage}"`);
throw new Error(`[${err.message}] Did not find the expected syntax in message: ${err.originalMessage}`);
}

for (const element of parsedMessage.elements) {
if (element.type !== 'argumentElement' || !element.format) continue;

if (element.format.type === 'pluralFormat' || element.format.type === 'selectFormat') {
// `plural`/`select` arguments can't have content before or after them.
// See http://userguide.icu-project.org/formatparse/messages#TOC-Complex-Argument-Types
// e.g. https://github.com/GoogleChrome/lighthouse/pull/11068#discussion_r451682796
if (parsedMessage.elements.length > 1) {
throw new Error(`Content cannot appear outside plural or select ICU messages. Instead, repeat that content in each option (message: '${lhlMessage}')`);
}

// Each option value must also be a valid lhlMessage.
for (const option of element.format.options) {
const optionStr = lhlMessage.slice(option.value.location.start.offset, option.value.location.end.offset);
_lhlValidityChecks(optionStr);
/**
* @param {MessageParser.MessageFormatElement[]} elements
*/
function validate(elements) {
for (const element of elements) {
if (element.type === MessageParser.TYPE.plural || element.type === MessageParser.TYPE.select) {
// `plural`/`select` arguments can't have content before or after them.
// See http://userguide.icu-project.org/formatparse/messages#TOC-Complex-Argument-Types
// e.g. https://github.com/GoogleChrome/lighthouse/pull/11068#discussion_r451682796
if (elements.length > 1) {
throw new Error(`Content cannot appear outside plural or select ICU messages. Instead, repeat that content in each option (message: '${lhlMessage}')`);
}

for (const option of Object.values(element.options)) {
validate(option.value);
}
}
}
}

validate(parsedMessageElements);
}

/**
Expand Down Expand Up @@ -389,7 +389,7 @@ function _processPlaceholderDirectIcu(icu, examples) {
for (const [key, value] of Object.entries(examples)) {
// Make sure all examples have ICU vars
if (!icu.message.includes(`{${key}}`)) {
throw Error(`Example '${key}' provided, but has not corresponding ICU replacement in message "${icu.message}"`);
throw Error(`Example '${key}' provided, but has no corresponding ICU replacement in message "${icu.message}"`);
}
const eName = `ICU_${idx++}`;
tempMessage = tempMessage.replace(`{${key}}`, `$${eName}$`);
Expand Down Expand Up @@ -517,7 +517,7 @@ function parseUIStrings(sourceStr, liveUIStrings) {
const key = getIdentifier(property);

// Use live message to avoid having to e.g. concat strings broken into parts.
const message = liveUIStrings[key];
const message = (liveUIStrings[key]);

// @ts-expect-error - Not part of the public tsc interface yet.
const jsDocComments = tsc.getJSDocCommentsAndTags(property);
Expand Down
14 changes: 8 additions & 6 deletions core/scripts/i18n/prune-obsolete-lhl-messages.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ import fs from 'fs';
import path from 'path';

import glob from 'glob';
import MessageParser from 'intl-messageformat-parser';
import MessageParser from '@formatjs/icu-messageformat-parser';

import {collectAllCustomElementsFromICU} from '../../../shared/localization/format.js';
import {collectAllCustomElementsFromICU, escapeIcuMessage} from '../../../shared/localization/format.js';
import {LH_ROOT} from '../../../shared/root.js';
import {readJson} from '../../test/test-utils.js';

Expand All @@ -24,8 +24,10 @@ import {readJson} from '../../test/test-utils.js';
* @return {boolean}
*/
function equalArguments(goldenArgumentIds, lhlMessage) {
const parsedMessage = MessageParser.parse(lhlMessage);
const lhlArgumentElements = collectAllCustomElementsFromICU(parsedMessage.elements);
const parsedMessageElements = MessageParser.parse(escapeIcuMessage(lhlMessage), {
ignoreTag: true,
});
const lhlArgumentElements = collectAllCustomElementsFromICU(parsedMessageElements);
const lhlArgumentIds = [...lhlArgumentElements.keys()];

if (goldenArgumentIds.length !== lhlArgumentIds.length) return false;
Expand Down Expand Up @@ -96,8 +98,8 @@ function getGoldenLocaleArgumentIds(goldenLhl) {
const goldenLocaleArgumentIds = {};

for (const [messageId, {message}] of Object.entries(goldenLhl)) {
const parsedMessage = MessageParser.parse(message);
const goldenArgumentElements = collectAllCustomElementsFromICU(parsedMessage.elements);
const parsedMessageElements = MessageParser.parse(escapeIcuMessage(message), {ignoreTag: true});
const goldenArgumentElements = collectAllCustomElementsFromICU(parsedMessageElements);
const goldenArgumentIds = [...goldenArgumentElements.keys()].sort();

goldenLocaleArgumentIds[messageId] = goldenArgumentIds;
Expand Down
8 changes: 4 additions & 4 deletions core/test/scripts/i18n/collect-strings-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ describe('#_lhlValidityChecks', () => {
it('errors when using non-supported custom-formatted ICU format', () => {
const message = 'Hello World took {var, badFormat, milliseconds}.';
expect(() => collect.convertMessageToCtc(message)).toThrow(
/Did not find the expected syntax \(one of 'number', 'date', 'time', 'plural', 'selectordinal', 'select'\) in message "Hello World took {var, badFormat, milliseconds}."$/);
/\[INVALID_ARGUMENT_TYPE\] Did not find the expected syntax in message: Hello World took {var, badFormat, milliseconds}.$/);
});

it('errors when there is content outside of a plural argument', () => {
Expand Down Expand Up @@ -370,14 +370,14 @@ describe('#_lhlValidityChecks', () => {
/Content cannot appear outside plural or select ICU messages.*=1 {1 request} other {# requests}}'\)$/);
});

it('errors when there is content outside of nested plural aguments', () => {
it('errors when there is content outside of nested plural arguments', () => {
const message = `{user_gender, select,
female {Ms. {name} received {count, plural, =1 {one award.} other {# awards.}}}
male {Mr. {name} received {count, plural, =1 {one award.} other {# awards.}}}
other {{name} received {count, plural, =1 {one award.} other {# awards.}}}
}`;
expect(() => collect.convertMessageToCtc(message, {name: 'Elbert'})).toThrow(
/Content cannot appear outside plural or select ICU messages.*\(message: 'Ms. {name} received {count, plural, =1 {one award.} other {# awards.}}'\)$/);
/Content cannot appear outside plural or select ICU messages.*\(message: '{user_gender, select/);
});
/* eslint-enable max-len */
});
Expand Down Expand Up @@ -562,7 +562,7 @@ describe('Convert Message to Placeholder', () => {
const message = 'Hello name.';
expect(() => collect.convertMessageToCtc(message, {name: 'Mary'}))
// eslint-disable-next-line max-len
.toThrow(/Example 'name' provided, but has not corresponding ICU replacement in message "Hello name."/);
.toThrow(/Example 'name' provided, but has no corresponding ICU replacement in message "Hello name."/);
});

it('errors when direct ICU has no examples', () => {
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@
},
"devDependencies": {
"@build-tracker/cli": "^1.0.0-beta.15",
"@formatjs/icu-messageformat-parser": "^2.6.2",
"@esbuild-kit/esm-loader": "^2.1.1",
"@esbuild-plugins/node-modules-polyfill": "^0.1.4",
"@jest/fake-timers": "^28.1.0",
Expand Down Expand Up @@ -152,7 +153,6 @@
"gh-pages": "^2.0.1",
"glob": "^7.1.3",
"idb-keyval": "2.2.0",
"intl-messageformat-parser": "^1.8.1",
"jest-mock": "^27.3.0",
"jest-snapshot": "^28.1.0",
"jsdom": "^12.2.0",
Expand Down Expand Up @@ -188,7 +188,7 @@
"devtools-protocol": "0.0.1200039",
"enquirer": "^2.3.6",
"http-link-header": "^1.1.1",
"intl-messageformat": "^4.4.0",
"intl-messageformat": "^10.5.3",
"jpeg-js": "^0.4.4",
"js-library-detector": "^6.7.0",
"lighthouse-logger": "^2.0.1",
Expand Down
109 changes: 73 additions & 36 deletions shared/localization/format.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,19 @@ import {getModuleDirectory} from '../esm-utils.js';
import {isObjectOfUnknownValues, isObjectOrArrayOfUnknownValues} from '../type-verifiers.js';
import {locales} from './locales.js';

// From @formatjs/icu-messageformat-parser - copy here so we don't need to bundle all that.
const TYPE = /** @type {const} */ ({
connorjclark marked this conversation as resolved.
Show resolved Hide resolved
literal: 0,
argument: 1,
number: 2,
date: 3,
time: 4,
select: 5,
plural: 6,
pound: 7,
tag: 8,
});

const moduleDir = getModuleDirectory(import.meta);

/** Contains available locales with messages. May be an empty object if bundled. */
Expand All @@ -30,11 +43,11 @@ const CANONICAL_LOCALES = fs.readdirSync(moduleDir + '/locales/')
.map(locale => locale.replace('.json', ''))
.sort();

/** @typedef {import('intl-messageformat-parser').Element} MessageElement */
/** @typedef {import('intl-messageformat-parser').ArgumentElement} ArgumentElement */
/** @typedef {import('@formatjs/icu-messageformat-parser').MessageFormatElement} MessageFormatElement */

const MESSAGE_I18N_ID_REGEX = / | [^\s]+$/;

/** @type {Partial<import('intl-messageformat').Formats>} */
const formats = {
number: {
bytes: {
Expand All @@ -57,40 +70,39 @@ const formats = {
};

/**
* Function to retrieve all 'argumentElement's from an ICU message. An argumentElement
* is an ICU element with an argument in it, like '{varName}' or '{varName, number, bytes}'. This
* differs from 'messageElement's which are just arbitrary text in a message.
* Function to retrieve all elements from an ICU message AST that are associated
* with a named input, like '{varName}' or '{varName, number, bytes}'. This
* differs from literal message types which are just arbitrary text.
*
* Notes:
* This function will recursively inspect plural elements for nested argumentElements.
* This function recursively inspects plural elements for nested elements,
* but since the output is a Map they are deduplicated.
* e.g. "=1{hello {icu}} =other{hello {icu}}" will produce one element in the output,
* with "icu" as its key.
*
* We need to find all the elements from the plural format sections, but
* they need to be deduplicated. I.e. "=1{hello {icu}} =other{hello {icu}}"
* the variable "icu" would appear twice if it wasn't de duplicated. And they cannot
* be stored in a set because they are not equal since their locations are different,
* thus they are stored via a Map keyed on the "id" which is the ICU varName.
* TODO: don't do that deduplication because messages within a plural message could be number
* messages with different styles.
*
* @param {Array<MessageElement>} icuElements
* @param {Map<string, ArgumentElement>} [seenElementsById]
* @return {Map<string, ArgumentElement>}
* @param {Array<MessageFormatElement>} icuElements
* @param {Map<string, MessageFormatElement>} [customElements]
* @return {Map<string, MessageFormatElement>}
*/
function collectAllCustomElementsFromICU(icuElements, seenElementsById = new Map()) {
function collectAllCustomElementsFromICU(icuElements, customElements = new Map()) {
for (const el of icuElements) {
// We are only interested in elements that need ICU formatting (argumentElements)
if (el.type !== 'argumentElement') continue;
if (el.type === TYPE.literal || el.type === TYPE.pound) continue;

seenElementsById.set(el.id, el);
customElements.set(el.value, el);

// Plurals need to be inspected recursively
if (!el.format || el.format.type !== 'pluralFormat') continue;
// Look at all options of the plural (=1{} =other{}...)
for (const option of el.format.options) {
// Run collections on each option's elements
collectAllCustomElementsFromICU(option.value.elements, seenElementsById);
if (el.type === TYPE.plural) {
// Look at all options of the plural (=1{} =other{}...)
for (const option of Object.values(el.options)) {
// Run collections on each option's elements
collectAllCustomElementsFromICU(option.value, customElements);
}
}
}

return seenElementsById;
return customElements;
}

/**
Expand All @@ -103,23 +115,22 @@ function collectAllCustomElementsFromICU(icuElements, seenElementsById = new Map
* @return {Record<string, string | number>}
*/
function _preformatValues(messageFormatter, values = {}, lhlMessage) {
const elementMap = collectAllCustomElementsFromICU(messageFormatter.getAst().elements);
const argumentElements = [...elementMap.values()];
const customElements = collectAllCustomElementsFromICU(messageFormatter.getAst());

/** @type {Record<string, string | number>} */
const formattedValues = {};

for (const {id, format} of argumentElements) {
for (const [id, element] of customElements) {
// Throw an error if a message's value isn't provided
if (id && (id in values) === false) {
if (!(id in values)) {
throw new Error(`ICU Message "${lhlMessage}" contains a value reference ("${id}") ` +
`that wasn't provided`);
}

const value = values[id];

// Direct `{id}` replacement and non-numeric values need no formatting.
if (!format || format.type !== 'numberFormat') {
if (element.type !== TYPE.number) {
formattedValues[id] = value;
continue;
}
Expand All @@ -130,13 +141,13 @@ function _preformatValues(messageFormatter, values = {}, lhlMessage) {
}

// Format values for known styles.
if (format.style === 'milliseconds') {
if (element.style === 'milliseconds') {
// Round all milliseconds to the nearest 10.
formattedValues[id] = Math.round(value / 10) * 10;
} else if (format.style === 'seconds' && id === 'timeInMs') {
} else if (element.style === 'seconds' && id === 'timeInMs') {
// Convert all seconds to the correct unit (currently only for `timeInMs`).
formattedValues[id] = Math.round(value / 100) / 10;
} else if (format.style === 'bytes') {
} else if (element.style === 'bytes') {
// Replace all the bytes with KB.
formattedValues[id] = value / 1024;
} else {
Expand All @@ -162,6 +173,19 @@ function _preformatValues(messageFormatter, values = {}, lhlMessage) {
return formattedValues;
}

/**
connorjclark marked this conversation as resolved.
Show resolved Hide resolved
* Our strings were made when \ was the escape character, but now it is '. To avoid churn,
* let's convert to the new style in memory.
* @param {string} message
* @return {string}
*/
function escapeIcuMessage(message) {
return message
.replace(/'/g, `''`)
.replace(/\\{/g, `'{`)
.replace(/\\}/g, `'}`);
}

/**
* Format string `message` by localizing `values` and inserting them. `message`
* is assumed to already be in the given locale.
Expand All @@ -172,19 +196,31 @@ function _preformatValues(messageFormatter, values = {}, lhlMessage) {
* @return {string}
*/
function formatMessage(message, values, locale) {
message = escapeIcuMessage(message);

// Parsing and formatting can be slow. Don't attempt if the string can't
// contain ICU placeholders, in which case formatting is already complete.
if (!message.includes('{') && values === undefined) return message;

// When using accented english, force the use of a different locale for number formatting.
const localeForMessageFormat = (locale === 'en-XA' || locale === 'en-XL') ? 'de-DE' : locale;

const formatter = new IntlMessageFormat(message, localeForMessageFormat, formats);
// This package is not correctly bundled.
/** @type {typeof IntlMessageFormat} */
// @ts-expect-error bundler woes
const IntlMessageFormatCtor = IntlMessageFormat.IntlMessageFormat || IntlMessageFormat;
const formatter = new IntlMessageFormatCtor(message, localeForMessageFormat, formats, {
ignoreTag: true,
});

// Preformat values for the message format like KB and milliseconds.
const valuesForMessageFormat = _preformatValues(formatter, values, message);

return formatter.format(valuesForMessageFormat);
const formattedResult = formatter.format(valuesForMessageFormat);
// We only format to strings.
if (typeof formattedResult !== 'string') {
throw new Error('unexpected formatted result');
}
return formattedResult;
}

/**
Expand Down Expand Up @@ -459,4 +495,5 @@ export {
getIcuMessageIdParts,
getAvailableLocales,
getCanonicalLocales,
escapeIcuMessage,
};
Loading
Loading