Merge pull request #10403 from keymanapp/feat/core/9999-normalize-output

feat(core): infrastructure for normalization of output 🌱
keymanapp · Jan 19, 2024 · 4f909dc · 4f909dc
2 parents c6acf86 + ba46d21
commit 4f909dc
Show file tree

Hide file tree

Showing 15 changed files with 684 additions and 48 deletions.
diff --git a/common/include/test_color.h b/common/include/test_color.h
@@ -23,7 +23,7 @@ enum ansi_code {
     BRIGHT_RED = 196
 };
 
-bool enabled = false;
+bool enabled = false;   // TODO: move to test_color.c because test_color.h cannot be #included in more than 1 file in a project otherwise.
 
 class fg {
     ansi_code code;

diff --git a/core/include/keyman/keyman_core_api_context.h b/core/include/keyman/keyman_core_api_context.h
@@ -124,7 +124,7 @@ km_core_context_items_dispose(km_core_context_item *context_items);
  */
 KMN_API
 km_core_context *
-km_core_state_context(km_core_state *state);
+km_core_state_context(km_core_state const *state);
 
 /**
  * Get access to the state object's application context.
@@ -135,7 +135,7 @@ km_core_state_context(km_core_state *state);
  */
 KMN_API
 km_core_context *
-km_core_state_app_context(km_core_state *state);
+km_core_state_app_context(km_core_state const *state);
 
 /*
 ```

diff --git a/core/src/action.cpp b/core/src/action.cpp
@@ -16,7 +16,7 @@
 #include "state.hpp"
 #include "option.hpp"
 
-km_core_actions const * km::core::action_item_list_to_actions_object(
+km_core_actions * km::core::action_item_list_to_actions_object(
   km_core_action_item const *action_items
 ) {
   assert(action_items != nullptr);

diff --git a/core/src/action.hpp b/core/src/action.hpp
@@ -14,8 +14,20 @@
 namespace km {
 namespace core
 {
-  km_core_actions const *action_item_list_to_actions_object(
+  km_core_actions *action_item_list_to_actions_object(
     km_core_action_item const *action_items
   );
+
+  bool actions_normalize(
+    /* in */      km_core_context const *cached_context,
+    /* in, out */ km_core_context *app_context,
+    /* in, out */ km_core_actions *actions
+  );
+
+  bool actions_update_app_context_nfu(
+    /* in */      km_core_context const *cached_context,
+    /* in, out */ km_core_context *app_context
+  );
+
 } // namespace core
 } // namespace km
diff --git a/core/src/actions_normalize.cpp b/core/src/actions_normalize.cpp
@@ -0,0 +1,312 @@
+/*
+  Copyright:    © 2024 SIL International.
+  Description:  Implementation of the action output normalization.
+  Create Date:  16 Jan 2024
+  Authors:      Marc Durdin (MCD)
+  History:      16 Jan 2024 - MCD - Initial implementation from #9999
+*/
+#include <algorithm>
+#include <sstream>
+#include <memory>
+
+#include <cassert>
+#include "context.hpp"
+#include "action.hpp"
+#include "state.hpp"
+#include "option.hpp"
+#include "debuglog.h"
+#include "core_icu.h"
+
+// forward declarations
+
+icu::UnicodeString context_items_to_unicode_string(km_core_context const *context);
+km_core_usv *unicode_string_to_usv(icu::UnicodeString& src);
+
+/**
+ * Normalize the output from an action to NFC, across the context | output
+ * boundary, fixing up the app_context and the output actions to take into
+ * account the NFU input app_context
+ *
+ * @param cached_context  the cached context, in NFD, after transform has been
+ *                        applied to it by the keyboard processor
+ * @param app_context     the app context, in NFU; transform has not been
+ *                        applied, and will be applied by this function
+ * @param actions         transform to apply, in NFD, which will be converted
+ *                        to NFC by this function
+ * @return true on success, false on failure
+ */
+bool km::core::actions_normalize(
+  /* in */      km_core_context const *cached_context,
+  /* in, out */ km_core_context *app_context,
+  /* in, out */ km_core_actions *actions
+) {
+  assert(actions != nullptr);
+  assert(cached_context != nullptr);
+  assert(app_context != nullptr);
+  if(actions == nullptr || cached_context == nullptr || app_context == nullptr) {
+    return false;
+  }
+
+  /*
+    The code_points_to_delete value at this point is in NFD. The cached_context
+    is in NFD and has already been updated by the keyboard processor to the
+    expected result of the action, so we need to remove the output from a string
+    copy of the cached_context to start, in order to get it to the same position
+    as the app_context.
+
+    The app_context is in NFU. We need to figure out how many characters to
+    remove from the end of app_context in order to correctly normalize across
+    the boundary, without normalizing more of the string than necessary.
+
+    We do not need to mutate the cached_context itself, because it is already
+    correct. This is good, because it means we will not lose track of markers
+    within it. The app_context will be mutated, as it will need the new output
+    appended, in order to match the expected result. Remember that the
+    app_context does not contain markers; these are maintained only in the
+    cached_context.
+  */
+
+  /*
+    Initialization
+  */
+
+  UErrorCode icu_status = U_ZERO_ERROR;
+  const icu::Normalizer2 *nfc = icu::Normalizer2::getNFCInstance(icu_status);
+  assert(U_SUCCESS(icu_status));
+  if(!U_SUCCESS(icu_status)) {
+    DebugLog("getNFCInstance failed with %x", icu_status);
+    return false;
+  }
+
+  const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(icu_status);
+  assert(U_SUCCESS(icu_status));
+  if(!U_SUCCESS(icu_status)) {
+    DebugLog("getNFDInstance failed with %x", icu_status);
+    return false;
+  }
+
+  icu::UnicodeString output = icu::UnicodeString::fromUTF32(reinterpret_cast<const UChar32*>(actions->output), -1);
+  icu::UnicodeString cached_context_string = context_items_to_unicode_string(cached_context);
+  icu::UnicodeString app_context_string = context_items_to_unicode_string(app_context);
+  assert(!output.isBogus());
+  assert(!cached_context_string.isBogus());
+  assert(!app_context_string.isBogus());
+  if(output.isBogus() || cached_context_string.isBogus() || app_context_string.isBogus()) {
+    return false;
+  }
+  int nfu_to_delete = 0;
+
+  /*
+    Further debug assertion of inputs
+  */
+
+  assert(nfd->isNormalized(output, icu_status) && U_SUCCESS(icu_status));
+  assert(nfd->isNormalized(cached_context_string, icu_status) && U_SUCCESS(icu_status));
+
+  /*
+    The keyboard processor will have updated the cached_context already,
+    applying the transform to it, so we need to rewind this. Remove the output
+    from cached_context_string to start
+  */
+
+  assert(cached_context_string.length() >= output.length());
+  int n = cached_context_string.length() - output.length();
+  assert(cached_context_string.compare(n, output.length(), output) == 0);
+  cached_context_string.remove(n);
+
+  /*
+    While cached_context is guaranteed to be normalized, actions->output may not
+    start at a normalization boundary. In order to achieve the correct NFC
+    normalization in our output, we now need to look for a normalization
+    boundary prior to the intersection of the cached_context and the output.
+  */
+
+  while(n > 0 && output[0] && !nfd->hasBoundaryBefore(output[0])) {
+    // The output may interact with the context further in normalization. We
+    // need to copy characters back further until we reach a normalization
+    // boundary.
+
+    // Remove last code point from the context ...
+
+    n = cached_context_string.moveIndex32(n, -1);
+    UChar32 chr = cached_context_string.char32At(n);
+    cached_context_string.remove(n);
+
+    // And prepend it to the output ...
+
+    output.insert(0, chr);
+
+    // And finally remember that we now need to delete an additional NFD codepoint
+
+    actions->code_points_to_delete++;
+  }
+
+  /*
+    At this point, our output and cached_context are coherent and normalization
+    will be complete at the edit boundary.
+
+    Now, we need to adjust the delete_back to match the number of characters
+    that must actually be deleted from the applications's NFU context
+
+    To adjust, we remove one character at a time from the app_context until
+    its normalized form matches the cached_context normalized form.
+  */
+
+  while(app_context_string.length()) {
+    icu::UnicodeString app_context_nfd;
+    nfd->normalize(app_context_string, app_context_nfd, icu_status);
+    assert(U_SUCCESS(icu_status));
+    if(!U_SUCCESS(icu_status)) {
+      DebugLog("nfd->normalize failed with %x", icu_status);
+      return false;
+    }
+
+    if(app_context_nfd.compare(cached_context_string) == 0) {
+      break;
+    }
+    app_context_string.remove(app_context_string.length()-1);
+    nfu_to_delete++;
+  }
+
+  /*
+    Normalize our output string
+  */
+
+  icu::UnicodeString output_nfc;
+  nfc->normalize(output, output_nfc, icu_status);
+  assert(U_SUCCESS(icu_status));
+  if(!U_SUCCESS(icu_status)) {
+    DebugLog("nfc->normalize failed with %x", icu_status);
+    return false;
+  }
+
+  auto new_output = unicode_string_to_usv(output_nfc);
+  if(!new_output) {
+    // error logging handled in unicode_string_to_usv
+    return false;
+  }
+
+  /*
+    Final steps -- set our outputs
+  */
+
+  // Append the new NFC output to our reduced app_context
+
+  app_context_string.append(output_nfc);
+  km_core_context_item *app_context_items = nullptr;
+  km_core_status status = KM_CORE_STATUS_OK;
+  if((status = km_core_context_items_from_utf16(app_context_string.getTerminatedBuffer(), &app_context_items)) != KM_CORE_STATUS_OK) {
+    DebugLog("km_core_context_items_from_utf16 failed with %x", status);
+    delete [] new_output;
+    return false;
+  }
+
+  if((status = km_core_context_set(app_context, app_context_items)) != KM_CORE_STATUS_OK) {
+    DebugLog("km_core_context_set failed with %x", status);
+    km_core_context_items_dispose(app_context_items);
+    delete [] new_output;
+    return false;
+  }
+
+  km_core_context_items_dispose(app_context_items);
+
+  // Update actions with new NFC output + count of NFU code points to delete
+
+  delete [] actions->output;
+  actions->output = new_output;
+  actions->code_points_to_delete = nfu_to_delete;
+
+  return true;
+}
+
+/**
+ * Helper to convert km_core_context list into a icu::UnicodeString
+ */
+icu::UnicodeString context_items_to_unicode_string(km_core_context const *context) {
+  icu::UnicodeString nullString;
+  nullString.setToBogus();
+
+  km_core_context_item *items = nullptr;
+  km_core_status status;
+  if((status = km_core_context_get(context, &items)) != KM_CORE_STATUS_OK) {
+    DebugLog("Failed to retrieve context with %s", status);
+    return nullString;
+  }
+  size_t buf_size = 0;
+  if((status = km_core_context_items_to_utf32(items, nullptr, &buf_size)) != KM_CORE_STATUS_OK) {
+    DebugLog("Failed to retrieve context size with %s", status);
+    km_core_context_items_dispose(items);
+    return nullString;
+  }
+
+  km_core_usv *buf = new km_core_usv[buf_size];
+  if((status = km_core_context_items_to_utf32(items, buf, &buf_size)) != KM_CORE_STATUS_OK) {
+    DebugLog("Failed to retrieve context with %s", status);
+    km_core_context_items_dispose(items);
+    delete [] buf;
+    return nullString;
+  }
+
+  auto result = icu::UnicodeString::fromUTF32(reinterpret_cast<const UChar32*>(buf), -1);
+  km_core_context_items_dispose(items);
+  delete [] buf;
+  return result;
+}
+
+/**
+ * Helper to convert icu::UnicodeString to a UTF-32 km_core_usv buffer,
+ * nul-terminated
+ */
+km_core_usv *unicode_string_to_usv(icu::UnicodeString& src) {
+  UErrorCode icu_status = U_ZERO_ERROR;
+
+  km_core_usv *dst = new km_core_usv[src.length() + 1];
+
+  src.toUTF32(reinterpret_cast<UChar32*>(dst), src.length(), icu_status);
+
+  assert(U_SUCCESS(icu_status));
+  if(!U_SUCCESS(icu_status)) {
+    DebugLog("toUTF32 failed with %x", icu_status);
+    delete[] dst;
+    return nullptr;
+  }
+
+  dst[src.length()] = 0;
+  return dst;
+}
+
+
+
+/**
+ * Refresh app_context to match the cached_context. Does not do normalization,
+ * unlike `actions_normalize`. Used in conjunction with keyboard processors that
+ * do not support normalization.
+ *
+ * @param cached_context  the cached context, in NFU, after transform has been
+ *                        applied to it by the keyboard processor
+ * @param app_context     the app context, in NFU; transform has not been
+ *                        applied, and will effectively be applied by this
+ *                        function
+ * @return true on success, false on failure
+ */
+bool km::core::actions_update_app_context_nfu(
+  /* in */      km_core_context const *cached_context,
+  /* in, out */ km_core_context *app_context
+) {
+  // We simply copy the cached_context to the app_context
+  km_core_status status = KM_CORE_STATUS_OK;
+  km_core_context_item *items = nullptr;
+
+  if((status = km_core_context_get(cached_context, &items)) != KM_CORE_STATUS_OK) {
+    DebugLog("km_core_context_get failed with %d", status);
+    return false;
+  }
+
+  if((status = km_core_context_set(app_context, items)) != KM_CORE_STATUS_OK) {
+    DebugLog("km_core_context_set failed with %d", status);
+  }
+
+  delete [] items;
+
+  return status == KM_CORE_STATUS_OK;
+}
diff --git a/core/src/core_icu.h b/core/src/core_icu.h
@@ -0,0 +1,13 @@
+/**
+ * ICU modules used by Keyman Core
+ */
+#pragma once
+
+#if !defined(HAVE_ICU4C)
+#error icu4c is required for this code
+#endif
+
+#define U_FALLTHROUGH
+#include "unicode/utypes.h"
+#include "unicode/unistr.h"
+#include "unicode/normalizer2.h"