From 036c32af1f046970be50a0f09d490777773d3b6c Mon Sep 17 00:00:00 2001 From: Marc Durdin Date: Mon, 15 Jan 2024 11:47:22 +0700 Subject: [PATCH] feat(core): normalize input context First half of #9999. Adds support for normalization (to NFD) of input app context into the cached context. The keyboard processor will work with the NFD cached context. Adds unit tests for the normalization as part of the LDML keyboard processor test suite. TODO: * Comparing modified cached context to app context to determine the transform required to send to the app * Handling illegal unicode and unpaired surrogates on input context --- core/src/km_core_state_api.cpp | 90 +------ .../km_core_state_context_set_if_needed.cpp | 222 ++++++++++++++++++ core/src/kmx/kmx_processor.hpp | 4 + core/src/ldml/ldml_processor.hpp | 5 + core/src/meson.build | 1 + core/src/mock/mock_processor.hpp | 4 + core/src/processor.hpp | 3 + .../unit/ldml/test_context_normalization.cpp | 54 ++++- 8 files changed, 290 insertions(+), 93 deletions(-) create mode 100644 core/src/km_core_state_context_set_if_needed.cpp diff --git a/core/src/km_core_state_api.cpp b/core/src/km_core_state_api.cpp index bf8b41ae741..d0e0cb44ec7 100644 --- a/core/src/km_core_state_api.cpp +++ b/core/src/km_core_state_api.cpp @@ -21,6 +21,7 @@ #include "processor.hpp" #include "state.hpp" + using namespace km::core; // Forward declarations @@ -271,95 +272,6 @@ void km_core_state_imx_deregister_callback(km_core_state *state) state->imx_deregister_callback(); } -bool is_context_valid(km_core_cp const * context, km_core_cp const * cached_context) { - if (context == nullptr || cached_context == nullptr || *cached_context == '\0') { - // If the cached_context is "empty" then it needs updating - return false; - } - km_core_cp const* context_p = context; - while(*context_p) { - context_p++; - } - - km_core_cp const* cached_context_p = cached_context; - while(*cached_context_p) { - cached_context_p++; - } - - // we need to compare from the end of the cached context - for(; context_p >= context && cached_context_p >= cached_context; context_p--, cached_context_p--) { - if(*context_p != *cached_context_p) { - // The cached context doesn't match the application context, so it is - // invalid - return false; - } - } - - if(cached_context_p > cached_context) { - // if the cached context is longer than the application context, then we also - // assume that it is invalid - return false; - } - - // It's acceptable for the application context to be longer than the cached - // context, so if we match the whole cached context, we can safely return true - return true; -} - -km_core_context_status km_core_state_context_set_if_needed( - km_core_state *state, - km_core_cp const *application_context -) { - assert(state != nullptr); - assert(application_context != nullptr); - if(state == nullptr || application_context == nullptr) { - return KM_CORE_CONTEXT_STATUS_INVALID_ARGUMENT; - } - - size_t buf_size; - km_core_context_item* context_items = nullptr; - - auto context = km_core_state_context(state); - if(km_core_context_get(context, &context_items) != KM_CORE_STATUS_OK) { - return KM_CORE_CONTEXT_STATUS_ERROR; - } - - if(km_core_context_items_to_utf16(context_items, nullptr, &buf_size) != KM_CORE_STATUS_OK) { - km_core_context_items_dispose(context_items); - return KM_CORE_CONTEXT_STATUS_ERROR; - } - - std::unique_ptr cached_context(new km_core_cp[buf_size]); - - km_core_status status = km_core_context_items_to_utf16(context_items, cached_context.get(), &buf_size); - km_core_context_items_dispose(context_items); - - if(status != KM_CORE_STATUS_OK) { - return KM_CORE_CONTEXT_STATUS_ERROR; - } - - bool is_valid = is_context_valid(application_context, cached_context.get()); - - if(is_valid) { - // We keep the context as is - return KM_CORE_CONTEXT_STATUS_UNCHANGED; - } - - km_core_context_item* new_context_items = nullptr; - - // We replace the cached context with the current application context - status = km_core_context_items_from_utf16(application_context, &new_context_items); - if (status != KM_CORE_STATUS_OK) { - km_core_context_clear(context); - return KM_CORE_CONTEXT_STATUS_CLEARED; - } - - km_core_context_set(context, new_context_items); - km_core_context_items_dispose(new_context_items); - return KM_CORE_CONTEXT_STATUS_UPDATED; -} - - km_core_status km_core_state_context_clear( km_core_state *state ) { diff --git a/core/src/km_core_state_context_set_if_needed.cpp b/core/src/km_core_state_context_set_if_needed.cpp new file mode 100644 index 00000000000..84775772d35 --- /dev/null +++ b/core/src/km_core_state_context_set_if_needed.cpp @@ -0,0 +1,222 @@ +/* + Copyright: © 2018-2024 SIL International. + Description: Implementation of the state API functions using internal + data structures and functions. + Create Date: 15 Jan 2024 + Authors: Marc Durdin + History: 15 Jan 2024 - MCD - Refactor our km_core_state_context_set_if_needed + and implement normalization +*/ +#include + +#include + +#include "processor.hpp" +#include "state.hpp" +#include "debuglog.h" + +#if !defined(HAVE_ICU4C) +#error icu4c is required for this code +#endif + +#define U_FALLTHROUGH +#include "unicode/utypes.h" +#include "unicode/unistr.h" +#include "unicode/normalizer2.h" + +using namespace km::core; + +// Forward declarations + +bool should_normalize(km_core_state *state); +bool is_context_valid(km_core_cp const * context, km_core_cp const * cached_context); +km_core_cp* get_context_as_string(km_core_context *context); +bool set_context_from_string(km_core_context *context, km_core_cp const *new_context); +bool do_normalize_nfd(km_core_cp const * src, std::u16string &dst); +km_core_context_status do_fail(km_core_context *app_context, km_core_context *cached_context, const char* error); + +// --------------------------------------------------------------------------- + +km_core_context_status km_core_state_context_set_if_needed( + km_core_state *state, + km_core_cp const *new_app_context +) { + assert(state != nullptr); + assert(new_app_context != nullptr); + if(state == nullptr || new_app_context == nullptr) { + return KM_CORE_CONTEXT_STATUS_INVALID_ARGUMENT; + } + + auto app_context = km_core_state_app_context(state); + auto cached_context = km_core_state_context(state); + + // Retrieve the existing internally cached app context for comparison + + std::unique_ptr app_context_string(get_context_as_string(app_context)); + + // Compare the internal app context with the passed-in application context + + bool is_valid = is_context_valid(new_app_context, app_context_string.get()); + + if(is_valid) { + // We keep the context as is + return KM_CORE_CONTEXT_STATUS_UNCHANGED; + } + + // We replace the internal app context with the passed-in application context + + if(!set_context_from_string(app_context, new_app_context)) { + return do_fail(app_context, cached_context, "could not set new app context"); + } + + // Finally, we normalize and replace the cached context + + std::u16string normalized_buffer; + km_core_cp const *new_cached_context = nullptr; + + if(should_normalize(state)) { + if(!do_normalize_nfd(new_app_context, normalized_buffer)) { + return do_fail(app_context, cached_context, "could not normalize string"); + } + new_cached_context = normalized_buffer.c_str(); + } else { + new_cached_context = new_app_context; + } + + // TODO: #10100 will alter how we replace the cached context here -- maintaining + // markers as far as possible + + if(!set_context_from_string(cached_context, new_cached_context)) { + return do_fail(app_context, cached_context, "could not set new cached context"); + } + + return KM_CORE_CONTEXT_STATUS_UPDATED; +} + +/** + * Returns true if the current keyboard processor wants a normalized cached context + */ +bool should_normalize(km_core_state *state) { + return state->processor().supports_normalization(); +} + +/** + * Returns true if the internal app context does not need to be updated to the new + * app context + * + * TODO: #10100 will alter some of the assumptions here + */ +bool is_context_valid(km_core_cp const * new_app_context, km_core_cp const * app_context) { + if (new_app_context == nullptr || app_context == nullptr || *app_context == '\0') { + // If the app_context is "empty" then it needs updating + return false; + } + km_core_cp const* new_app_context_p = new_app_context; + while(*new_app_context_p) { + new_app_context_p++; + } + + km_core_cp const* app_context_p = app_context; + while(*app_context_p) { + app_context_p++; + } + + // we need to compare from the end of the cached context + for(; new_app_context_p >= new_app_context && app_context_p >= app_context; new_app_context_p--, app_context_p--) { + if(*new_app_context_p != *app_context_p) { + // The cached context doesn't match the application context, so it is + // invalid + return false; + } + } + + if(app_context_p > app_context) { + // if the cached context is longer than the application context, then we also + // assume that it is invalid + return false; + } + + // It's acceptable for the application context to be longer than the cached + // context, so if we match the whole cached context, we can safely return true + return true; +} + +/** + * Retrieves the context as a km_core_cp string, dropping markers + */ +km_core_cp* get_context_as_string(km_core_context *context) { + size_t buf_size = 0; + km_core_context_item* context_items = nullptr; + + if(km_core_context_get(context, &context_items) != KM_CORE_STATUS_OK) { + return nullptr; + } + + if(km_core_context_items_to_utf16(context_items, nullptr, &buf_size) != KM_CORE_STATUS_OK) { + km_core_context_items_dispose(context_items); + return nullptr; + } + + km_core_cp *app_context_string = new km_core_cp[buf_size]; + + km_core_status status = km_core_context_items_to_utf16(context_items, app_context_string, &buf_size); + km_core_context_items_dispose(context_items); + + if(status != KM_CORE_STATUS_OK) { + return nullptr; + } + + return app_context_string; +} + +/** + * Updates the context from the new_context km_core_cp string + */ +bool set_context_from_string(km_core_context *context, km_core_cp const *new_context) { + km_core_context_item* new_context_items = nullptr; + + km_core_status status = km_core_context_items_from_utf16(new_context, &new_context_items); + if (status != KM_CORE_STATUS_OK) { + return false; + } + + km_core_context_set(context, new_context_items); + km_core_context_items_dispose(new_context_items); + + return true; +} + +/** + * Normalize the input string using ICU + */ +bool do_normalize_nfd(km_core_cp const * src, std::u16string &dst) { + UErrorCode icu_status = U_ZERO_ERROR; + const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(icu_status); + assert(U_SUCCESS(icu_status)); + if(!U_SUCCESS(icu_status)) { + // TODO: log the failure code + return false; + } + icu::UnicodeString udst; + icu::UnicodeString usrc = icu::UnicodeString(src); + nfd->normalize(usrc, udst, icu_status); + assert(U_SUCCESS(icu_status)); + if(!U_SUCCESS(icu_status)) { + // TODO: log the failure code + return false; + } + + dst.assign(udst.getBuffer(), udst.length()); + return true; +} + +/** + * Clear the context when we have a failure so we don't end up with inconsistent + * context buffers, and log the error to our diagnostic log. + */ +km_core_context_status do_fail(km_core_context *app_context, km_core_context *cached_context, const char* error) { + DebugLog("%s", error); + km_core_context_clear(app_context); + km_core_context_clear(cached_context); + return KM_CORE_CONTEXT_STATUS_CLEARED; +} \ No newline at end of file diff --git a/core/src/kmx/kmx_processor.hpp b/core/src/kmx/kmx_processor.hpp index b83e8bba9ea..490bda53c6f 100644 --- a/core/src/kmx/kmx_processor.hpp +++ b/core/src/kmx/kmx_processor.hpp @@ -80,6 +80,10 @@ namespace core km_core_keyboard_imx * get_imx_list() const override; + bool + supports_normalization() const override { + return false; + } }; } // namespace core diff --git a/core/src/ldml/ldml_processor.hpp b/core/src/ldml/ldml_processor.hpp index 7ad997a944d..a66d151fde0 100644 --- a/core/src/ldml/ldml_processor.hpp +++ b/core/src/ldml/ldml_processor.hpp @@ -84,6 +84,11 @@ namespace core { km_core_keyboard_imx * get_imx_list() const override; + bool + supports_normalization() const override { + return true; + } + private: /** emit text to context and actions */ static void emit_text(km_core_state *state, const std::u16string &str); diff --git a/core/src/meson.build b/core/src/meson.build index e5314294ad1..a80eaaa94a9 100644 --- a/core/src/meson.build +++ b/core/src/meson.build @@ -52,6 +52,7 @@ kmx_files = files( 'km_core_keyboard_api.cpp', 'km_core_options_api.cpp', 'km_core_state_api.cpp', + 'km_core_state_context_set_if_needed.cpp', 'km_core_debug_api.cpp', 'km_core_processevent_api.cpp', 'jsonpp.cpp', diff --git a/core/src/mock/mock_processor.hpp b/core/src/mock/mock_processor.hpp index b10f4ad8d27..a3ec3c1caa2 100644 --- a/core/src/mock/mock_processor.hpp +++ b/core/src/mock/mock_processor.hpp @@ -65,6 +65,10 @@ namespace core km_core_keyboard_imx * get_imx_list() const override; + bool + supports_normalization() const override { + return false; + } }; class null_processor : public mock_processor { diff --git a/core/src/processor.hpp b/core/src/processor.hpp index 2dd52e82d48..7a6ed5aa334 100644 --- a/core/src/processor.hpp +++ b/core/src/processor.hpp @@ -122,6 +122,9 @@ namespace core virtual km_core_keyboard_imx * get_imx_list() const = 0; + virtual bool + supports_normalization() const = 0; + friend json & operator << (json &j, abstract_processor const &opts); }; diff --git a/core/tests/unit/ldml/test_context_normalization.cpp b/core/tests/unit/ldml/test_context_normalization.cpp index 061c09335d2..c3215ce4a3f 100644 --- a/core/tests/unit/ldml/test_context_normalization.cpp +++ b/core/tests/unit/ldml/test_context_normalization.cpp @@ -46,10 +46,22 @@ void setup(const char *keyboard) { try_status(km_core_state_create(test_kb, test_env_opts, &test_state)); } +void debug_context(km_core_debug_context_type context_type) { + auto context = km_core_state_context_debug(test_state, context_type); + if(context_type == KM_CORE_DEBUG_CONTEXT_APP) { + std::cout << "app context: " << context << std::endl; + } else { + std::cout << "cached context: " << context << std::endl; + } + km_core_cp_dispose(context); +} + bool is_identical_context(km_core_cp const *cached_context, km_core_debug_context_type context_type) { size_t buf_size; km_core_context_item * citems = nullptr; + debug_context(context_type); + if(context_type == KM_CORE_DEBUG_CONTEXT_APP) { try_status(km_core_context_get(km_core_state_app_context(test_state), &citems)); } else { @@ -66,18 +78,52 @@ bool is_identical_context(km_core_cp const *cached_context, km_core_debug_contex return result; } -void test_context_set_if_needed_for_ldml_normalization() { +void test_context_normalization_already_nfd() { + km_core_cp const *app_context_nfd = u"A\u0300"; + setup("k_001_tiny.kmx"); + assert(km_core_state_context_set_if_needed(test_state, app_context_nfd) == KM_CORE_CONTEXT_STATUS_UPDATED); + assert(is_identical_context(app_context_nfd, KM_CORE_DEBUG_CONTEXT_APP)); + assert(is_identical_context(app_context_nfd, KM_CORE_DEBUG_CONTEXT_CACHED)); + teardown(); +} + +void test_context_normalization_basic() { km_core_cp const *application_context = u"This is a test À"; km_core_cp const *cached_context = u"This is a test A\u0300"; setup("k_001_tiny.kmx"); assert(km_core_state_context_set_if_needed(test_state, application_context) == KM_CORE_CONTEXT_STATUS_UPDATED); + assert(is_identical_context(application_context, KM_CORE_DEBUG_CONTEXT_APP)); + assert(is_identical_context(cached_context, KM_CORE_DEBUG_CONTEXT_CACHED)); + teardown(); +} + +void test_context_normalization_hefty() { + // Latin Latin "ṩ" "Å" Tirhuta U+114bc -> U+114B9 U+114B0 + km_core_cp const *application_context = u"À" u"é̖" u"\u1e69" u"\u212b" u"\U000114BC"; + km_core_cp const *cached_context = u"A\u0300" u"e\u0316\u0301" u"\u0073\u0323\u0307" u"\u0041\u030a" u"\U000114B9\U000114B0"; + setup("k_001_tiny.kmx"); + assert(km_core_state_context_set_if_needed(test_state, application_context) == KM_CORE_CONTEXT_STATUS_UPDATED); + assert(is_identical_context(application_context, KM_CORE_DEBUG_CONTEXT_APP)); assert(is_identical_context(cached_context, KM_CORE_DEBUG_CONTEXT_CACHED)); + teardown(); +} + +void test_context_normalization_invalid_unicode() { + // unpaired surrogate illegal + km_core_cp const application_context[] = { 0xDC01, 0x0020, 0x0020, 0xFFFF, 0x0000 }; + km_core_cp const cached_context[] = { 0xDC01, 0x0020, 0x0020, 0xFFFF, 0x0000 }; + setup("k_001_tiny.kmx"); + assert(km_core_state_context_set_if_needed(test_state, application_context) == KM_CORE_CONTEXT_STATUS_UPDATED); assert(is_identical_context(application_context, KM_CORE_DEBUG_CONTEXT_APP)); + assert(is_identical_context(cached_context, KM_CORE_DEBUG_CONTEXT_CACHED)); teardown(); } -void test_context_set_if_needed() { - test_context_set_if_needed_for_ldml_normalization(); +void test_context_normalization() { + test_context_normalization_already_nfd(); + test_context_normalization_basic(); + test_context_normalization_hefty(); + // TODO: we need to strip illegal chars: test_context_normalization_invalid_unicode(); // -- unpaired surrogate, illegals } //------------------------------------------------------------------------------------- @@ -112,5 +158,5 @@ int main(int argc, char *argv []) { arg_path = get_wasm_file_path(arg_path); #endif - test_context_set_if_needed(); + test_context_normalization(); }