feat(core): normalize input context

First half of #9999. Adds support for normalization (to NFD) of input app context into the cached context. The keyboard processor will work with the NFD cached context. Adds unit tests for the normalization as part of the LDML keyboard processor test suite. TODO: * Comparing modified cached context to app context to determine the transform required to send to the app * Handling illegal unicode and unpaired surrogates on input context
keymanapp · Jan 15, 2024 · 036c32a · 036c32a
1 parent cd69405
commit 036c32a
Show file tree

Hide file tree

Showing 8 changed files with 290 additions and 93 deletions.
diff --git a/core/src/km_core_state_api.cpp b/core/src/km_core_state_api.cpp
@@ -21,6 +21,7 @@
 #include "processor.hpp"
 #include "state.hpp"
 
+
 using namespace km::core;
 
 // Forward declarations
@@ -271,95 +272,6 @@ void km_core_state_imx_deregister_callback(km_core_state *state)
   state->imx_deregister_callback();
 }
 
-bool is_context_valid(km_core_cp const * context, km_core_cp const * cached_context) {
-  if (context == nullptr || cached_context == nullptr || *cached_context == '\0') {
-    // If the cached_context is "empty" then it needs updating
-    return false;
-  }
-  km_core_cp const* context_p = context;
-  while(*context_p) {
-    context_p++;
-  }
-
-  km_core_cp const* cached_context_p = cached_context;
-  while(*cached_context_p) {
-    cached_context_p++;
-  }
-
-  // we need to compare from the end of the cached context
-  for(; context_p >= context && cached_context_p >= cached_context; context_p--, cached_context_p--) {
-    if(*context_p != *cached_context_p) {
-      // The cached context doesn't match the application context, so it is
-      // invalid
-      return false;
-    }
-  }
-
-  if(cached_context_p > cached_context) {
-    // if the cached context is longer than the application context, then we also
-    // assume that it is invalid
-    return false;
-  }
-
-  // It's acceptable for the application context to be longer than the cached
-  // context, so if we match the whole cached context, we can safely return true
-  return true;
-}
-
-km_core_context_status km_core_state_context_set_if_needed(
-  km_core_state *state,
-  km_core_cp const *application_context
-) {
-  assert(state != nullptr);
-  assert(application_context != nullptr);
-  if(state == nullptr || application_context == nullptr) {
-    return KM_CORE_CONTEXT_STATUS_INVALID_ARGUMENT;
-  }
-
-  size_t buf_size;
-  km_core_context_item* context_items = nullptr;
-
-  auto context = km_core_state_context(state);
-  if(km_core_context_get(context, &context_items) != KM_CORE_STATUS_OK) {
-    return KM_CORE_CONTEXT_STATUS_ERROR;
-  }
-
-  if(km_core_context_items_to_utf16(context_items, nullptr, &buf_size) != KM_CORE_STATUS_OK) {
-    km_core_context_items_dispose(context_items);
-    return KM_CORE_CONTEXT_STATUS_ERROR;
-  }
-
-  std::unique_ptr<km_core_cp[]> cached_context(new km_core_cp[buf_size]);
-
-  km_core_status status = km_core_context_items_to_utf16(context_items, cached_context.get(), &buf_size);
-  km_core_context_items_dispose(context_items);
-
-  if(status != KM_CORE_STATUS_OK) {
-    return KM_CORE_CONTEXT_STATUS_ERROR;
-  }
-
-  bool is_valid = is_context_valid(application_context, cached_context.get());
-
-  if(is_valid) {
-    // We keep the context as is
-    return KM_CORE_CONTEXT_STATUS_UNCHANGED;
-  }
-
-  km_core_context_item* new_context_items = nullptr;
-
-  // We replace the cached context with the current application context
-  status = km_core_context_items_from_utf16(application_context, &new_context_items);
-  if (status != KM_CORE_STATUS_OK) {
-    km_core_context_clear(context);
-    return KM_CORE_CONTEXT_STATUS_CLEARED;
-  }
-
-  km_core_context_set(context, new_context_items);
-  km_core_context_items_dispose(new_context_items);
-  return KM_CORE_CONTEXT_STATUS_UPDATED;
-}
-
-
 km_core_status km_core_state_context_clear(
   km_core_state *state
 ) {

diff --git a/core/src/km_core_state_context_set_if_needed.cpp b/core/src/km_core_state_context_set_if_needed.cpp
@@ -0,0 +1,222 @@
+/*
+  Copyright:    © 2018-2024 SIL International.
+  Description:  Implementation of the state API functions using internal
+                data structures and functions.
+  Create Date:  15 Jan 2024
+  Authors:      Marc Durdin
+  History:      15 Jan 2024 - MCD - Refactor our km_core_state_context_set_if_needed
+                                    and implement normalization
+*/
+#include <cassert>
+
+#include <keyman/keyman_core_api.h>
+
+#include "processor.hpp"
+#include "state.hpp"
+#include "debuglog.h"
+
+#if !defined(HAVE_ICU4C)
+#error icu4c is required for this code
+#endif
+
+#define U_FALLTHROUGH
+#include "unicode/utypes.h"
+#include "unicode/unistr.h"
+#include "unicode/normalizer2.h"
+
+using namespace km::core;
+
+// Forward declarations
+
+bool should_normalize(km_core_state *state);
+bool is_context_valid(km_core_cp const * context, km_core_cp const * cached_context);
+km_core_cp* get_context_as_string(km_core_context *context);
+bool set_context_from_string(km_core_context *context, km_core_cp const *new_context);
+bool do_normalize_nfd(km_core_cp const * src, std::u16string &dst);
+km_core_context_status do_fail(km_core_context *app_context, km_core_context *cached_context, const char* error);
+
+// ---------------------------------------------------------------------------
+
+km_core_context_status km_core_state_context_set_if_needed(
+  km_core_state *state,
+  km_core_cp const *new_app_context
+) {
+  assert(state != nullptr);
+  assert(new_app_context != nullptr);
+  if(state == nullptr || new_app_context == nullptr) {
+    return KM_CORE_CONTEXT_STATUS_INVALID_ARGUMENT;
+  }
+
+  auto app_context = km_core_state_app_context(state);
+  auto cached_context = km_core_state_context(state);
+
+  // Retrieve the existing internally cached app context for comparison
+
+  std::unique_ptr<km_core_cp[]> app_context_string(get_context_as_string(app_context));
+
+  // Compare the internal app context with the passed-in application context
+
+  bool is_valid = is_context_valid(new_app_context, app_context_string.get());
+
+  if(is_valid) {
+    // We keep the context as is
+    return KM_CORE_CONTEXT_STATUS_UNCHANGED;
+  }
+
+  // We replace the internal app context with the passed-in application context
+
+  if(!set_context_from_string(app_context, new_app_context)) {
+    return do_fail(app_context, cached_context, "could not set new app context");
+  }
+
+  // Finally, we normalize and replace the cached context
+
+  std::u16string normalized_buffer;
+  km_core_cp const *new_cached_context = nullptr;
+
+  if(should_normalize(state)) {
+    if(!do_normalize_nfd(new_app_context, normalized_buffer)) {
+      return do_fail(app_context, cached_context, "could not normalize string");
+    }
+    new_cached_context = normalized_buffer.c_str();
+  } else {
+    new_cached_context = new_app_context;
+  }
+
+  // TODO: #10100 will alter how we replace the cached context here -- maintaining
+  //       markers as far as possible
+
+  if(!set_context_from_string(cached_context, new_cached_context)) {
+    return do_fail(app_context, cached_context, "could not set new cached context");
+  }
+
+  return KM_CORE_CONTEXT_STATUS_UPDATED;
+}
+
+/**
+ * Returns true if the current keyboard processor wants a normalized cached context
+ */
+bool should_normalize(km_core_state *state) {
+  return state->processor().supports_normalization();
+}
+
+/**
+ * Returns true if the internal app context does not need to be updated to the new
+ * app context
+ *
+ * TODO: #10100 will alter some of the assumptions here
+ */
+bool is_context_valid(km_core_cp const * new_app_context, km_core_cp const * app_context) {
+  if (new_app_context == nullptr || app_context == nullptr || *app_context == '\0') {
+    // If the app_context is "empty" then it needs updating
+    return false;
+  }
+  km_core_cp const* new_app_context_p = new_app_context;
+  while(*new_app_context_p) {
+    new_app_context_p++;
+  }
+
+  km_core_cp const* app_context_p = app_context;
+  while(*app_context_p) {
+    app_context_p++;
+  }
+
+  // we need to compare from the end of the cached context
+  for(; new_app_context_p >= new_app_context && app_context_p >= app_context; new_app_context_p--, app_context_p--) {
+    if(*new_app_context_p != *app_context_p) {
+      // The cached context doesn't match the application context, so it is
+      // invalid
+      return false;
+    }
+  }
+
+  if(app_context_p > app_context) {
+    // if the cached context is longer than the application context, then we also
+    // assume that it is invalid
+    return false;
+  }
+
+  // It's acceptable for the application context to be longer than the cached
+  // context, so if we match the whole cached context, we can safely return true
+  return true;
+}
+
+/**
+ * Retrieves the context as a km_core_cp string, dropping markers
+ */
+km_core_cp* get_context_as_string(km_core_context *context) {
+  size_t buf_size = 0;
+  km_core_context_item* context_items = nullptr;
+
+  if(km_core_context_get(context, &context_items) != KM_CORE_STATUS_OK) {
+    return nullptr;
+  }
+
+  if(km_core_context_items_to_utf16(context_items, nullptr, &buf_size) != KM_CORE_STATUS_OK) {
+    km_core_context_items_dispose(context_items);
+    return nullptr;
+  }
+
+  km_core_cp *app_context_string = new km_core_cp[buf_size];
+
+  km_core_status status = km_core_context_items_to_utf16(context_items, app_context_string, &buf_size);
+  km_core_context_items_dispose(context_items);
+
+  if(status != KM_CORE_STATUS_OK) {
+    return nullptr;
+  }
+
+  return app_context_string;
+}
+
+/**
+ * Updates the context from the new_context km_core_cp string
+ */
+bool set_context_from_string(km_core_context *context, km_core_cp const *new_context) {
+  km_core_context_item* new_context_items = nullptr;
+
+  km_core_status status = km_core_context_items_from_utf16(new_context, &new_context_items);
+  if (status != KM_CORE_STATUS_OK) {
+    return false;
+  }
+
+  km_core_context_set(context, new_context_items);
+  km_core_context_items_dispose(new_context_items);
+
+  return true;
+}
+
+/**
+ * Normalize the input string using ICU
+ */
+bool do_normalize_nfd(km_core_cp const * src, std::u16string &dst) {
+  UErrorCode icu_status = U_ZERO_ERROR;
+  const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(icu_status);
+  assert(U_SUCCESS(icu_status));
+  if(!U_SUCCESS(icu_status)) {
+    // TODO: log the failure code
+    return false;
+  }
+  icu::UnicodeString udst;
+  icu::UnicodeString usrc = icu::UnicodeString(src);
+  nfd->normalize(usrc, udst, icu_status);
+  assert(U_SUCCESS(icu_status));
+  if(!U_SUCCESS(icu_status)) {
+    // TODO: log the failure code
+    return false;
+  }
+
+  dst.assign(udst.getBuffer(), udst.length());
+  return true;
+}
+
+/**
+ * Clear the context when we have a failure so we don't end up with inconsistent
+ * context buffers, and log the error to our diagnostic log.
+ */
+km_core_context_status do_fail(km_core_context *app_context, km_core_context *cached_context, const char* error) {
+  DebugLog("%s", error);
+  km_core_context_clear(app_context);
+  km_core_context_clear(cached_context);
+  return KM_CORE_CONTEXT_STATUS_CLEARED;
+}
diff --git a/core/src/kmx/kmx_processor.hpp b/core/src/kmx/kmx_processor.hpp
@@ -80,6 +80,10 @@ namespace core
 
     km_core_keyboard_imx * get_imx_list() const override;
 
+    bool
+    supports_normalization() const override {
+      return false;
+    }
   };
 
 } // namespace core

diff --git a/core/src/ldml/ldml_processor.hpp b/core/src/ldml/ldml_processor.hpp
@@ -84,6 +84,11 @@ namespace core {
 
     km_core_keyboard_imx  * get_imx_list() const override;
 
+    bool
+    supports_normalization() const override {
+      return true;
+    }
+
   private:
      /** emit text to context and actions */
      static void emit_text(km_core_state *state, const std::u16string &str);

diff --git a/core/src/meson.build b/core/src/meson.build
@@ -52,6 +52,7 @@ kmx_files = files(
   'km_core_keyboard_api.cpp',
   'km_core_options_api.cpp',
   'km_core_state_api.cpp',
+  'km_core_state_context_set_if_needed.cpp',
   'km_core_debug_api.cpp',
   'km_core_processevent_api.cpp',
   'jsonpp.cpp',

diff --git a/core/src/mock/mock_processor.hpp b/core/src/mock/mock_processor.hpp
@@ -65,6 +65,10 @@ namespace core
 
     km_core_keyboard_imx  * get_imx_list() const override;
 
+    bool
+    supports_normalization() const override {
+      return false;
+    }
   };
 
   class null_processor : public mock_processor {

diff --git a/core/src/processor.hpp b/core/src/processor.hpp
@@ -122,6 +122,9 @@ namespace core
     virtual km_core_keyboard_imx *
     get_imx_list() const = 0;
 
+    virtual bool
+    supports_normalization() const = 0;
+
     friend json & operator << (json &j, abstract_processor const &opts);
   };