Skip to content

Commit

Permalink
feat(core): normalize input context
Browse files Browse the repository at this point in the history
First half of #9999.

Adds support for normalization (to NFD) of input app context into the
cached context. The keyboard processor will work with the NFD cached
context.

Adds unit tests for the normalization as part of the LDML keyboard
processor test suite.

TODO:
* Comparing modified cached context to app context to determine the
  transform required to send to the app
* Handling illegal unicode and unpaired surrogates on input context
  • Loading branch information
mcdurdin committed Jan 15, 2024
1 parent cd69405 commit 036c32a
Show file tree
Hide file tree
Showing 8 changed files with 290 additions and 93 deletions.
90 changes: 1 addition & 89 deletions core/src/km_core_state_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "processor.hpp"
#include "state.hpp"


using namespace km::core;

// Forward declarations
Expand Down Expand Up @@ -271,95 +272,6 @@ void km_core_state_imx_deregister_callback(km_core_state *state)
state->imx_deregister_callback();
}

/**
 * Decides whether the cached context can be kept for the given application
 * context.
 *
 * The cached context is considered valid when it is non-empty and is an exact
 * suffix of the application context (the application context may be longer).
 *
 * @param context         null-terminated application context; may be nullptr
 * @param cached_context  null-terminated cached context; may be nullptr
 * @return true if the cached context matches the end of the application
 *         context; false if either pointer is null, the cached context is
 *         empty, the cached context is longer than the application context,
 *         or any code unit differs
 */
bool is_context_valid(km_core_cp const * context, km_core_cp const * cached_context) {
  if (context == nullptr || cached_context == nullptr || *cached_context == '\0') {
    // If the cached_context is "empty" then it needs updating
    return false;
  }

  size_t context_length = 0;
  while(context[context_length]) {
    context_length++;
  }

  size_t cached_context_length = 0;
  while(cached_context[cached_context_length]) {
    cached_context_length++;
  }

  if(cached_context_length > context_length) {
    // if the cached context is longer than the application context, then we
    // assume that it is invalid. Checking lengths up front (rather than
    // walking pointers backwards past the start of the buffers) also catches
    // the case where the cached context is exactly one code unit longer.
    return false;
  }

  // we need to compare from the end of the cached context, so line the cached
  // context up against the tail of the application context
  km_core_cp const* context_tail = context + (context_length - cached_context_length);
  for(size_t i = 0; i < cached_context_length; i++) {
    if(context_tail[i] != cached_context[i]) {
      // The cached context doesn't match the application context, so it is
      // invalid
      return false;
    }
  }

  // It's acceptable for the application context to be longer than the cached
  // context, so if we match the whole cached context, we can safely return true
  return true;
}

/**
 * Replaces the state's cached context with the application context, but only
 * when is_context_valid() reports that the cached context no longer matches.
 *
 * @param state                state whose cached context is examined
 * @param application_context  null-terminated application context
 * @return KM_CORE_CONTEXT_STATUS_INVALID_ARGUMENT if either argument is null,
 *         KM_CORE_CONTEXT_STATUS_ERROR if the cached context cannot be read or
 *         converted to a string,
 *         KM_CORE_CONTEXT_STATUS_UNCHANGED if the cached context is kept,
 *         KM_CORE_CONTEXT_STATUS_CLEARED if the application context cannot be
 *         converted to context items (the cached context is cleared),
 *         KM_CORE_CONTEXT_STATUS_UPDATED if the cached context was replaced
 */
km_core_context_status km_core_state_context_set_if_needed(
  km_core_state *state,
  km_core_cp const *application_context
) {
  assert(state != nullptr);
  assert(application_context != nullptr);
  if(state == nullptr || application_context == nullptr) {
    return KM_CORE_CONTEXT_STATUS_INVALID_ARGUMENT;
  }

  auto context = km_core_state_context(state);

  // Snapshot the cached context items
  km_core_context_item* cached_items = nullptr;
  if(km_core_context_get(context, &cached_items) != KM_CORE_STATUS_OK) {
    return KM_CORE_CONTEXT_STATUS_ERROR;
  }

  // First pass with a null buffer asks for the required buffer size
  size_t required_size;
  km_core_status status = km_core_context_items_to_utf16(cached_items, nullptr, &required_size);
  if(status != KM_CORE_STATUS_OK) {
    km_core_context_items_dispose(cached_items);
    return KM_CORE_CONTEXT_STATUS_ERROR;
  }

  // Second pass fills the buffer; the items are no longer needed afterwards
  std::unique_ptr<km_core_cp[]> cached_text(new km_core_cp[required_size]);
  status = km_core_context_items_to_utf16(cached_items, cached_text.get(), &required_size);
  km_core_context_items_dispose(cached_items);
  if(status != KM_CORE_STATUS_OK) {
    return KM_CORE_CONTEXT_STATUS_ERROR;
  }

  if(is_context_valid(application_context, cached_text.get())) {
    // We keep the context as is
    return KM_CORE_CONTEXT_STATUS_UNCHANGED;
  }

  // We replace the cached context with the current application context
  km_core_context_item* replacement_items = nullptr;
  if(km_core_context_items_from_utf16(application_context, &replacement_items) != KM_CORE_STATUS_OK) {
    km_core_context_clear(context);
    return KM_CORE_CONTEXT_STATUS_CLEARED;
  }

  km_core_context_set(context, replacement_items);
  km_core_context_items_dispose(replacement_items);
  return KM_CORE_CONTEXT_STATUS_UPDATED;
}


km_core_status km_core_state_context_clear(
km_core_state *state
) {
Expand Down
222 changes: 222 additions & 0 deletions core/src/km_core_state_context_set_if_needed.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,222 @@
/*
Copyright: © 2018-2024 SIL International.
Description: Implementation of the state API functions using internal
data structures and functions.
Create Date: 15 Jan 2024
Authors: Marc Durdin
History: 15 Jan 2024 - MCD - Refactor out km_core_state_context_set_if_needed
and implement normalization
*/
#include <cassert>

#include <keyman/keyman_core_api.h>

#include "processor.hpp"
#include "state.hpp"
#include "debuglog.h"

#if !defined(HAVE_ICU4C)
#error icu4c is required for this code
#endif

#define U_FALLTHROUGH
#include "unicode/utypes.h"
#include "unicode/unistr.h"
#include "unicode/normalizer2.h"

using namespace km::core;

// Forward declarations

bool should_normalize(km_core_state *state);
bool is_context_valid(km_core_cp const * context, km_core_cp const * cached_context);
km_core_cp* get_context_as_string(km_core_context *context);
bool set_context_from_string(km_core_context *context, km_core_cp const *new_context);
bool do_normalize_nfd(km_core_cp const * src, std::u16string &dst);
km_core_context_status do_fail(km_core_context *app_context, km_core_context *cached_context, const char* error);

// ---------------------------------------------------------------------------

/**
 * Synchronizes the state's internal app context and cached context with the
 * application context passed in, if is_context_valid() reports that the
 * internal app context no longer matches it.
 *
 * When the keyboard processor supports normalization, the cached context is
 * set from the NFD-normalized form of the new app context; otherwise it is set
 * from the new app context unchanged.
 *
 * @param state            state whose contexts are examined and updated
 * @param new_app_context  null-terminated application context
 * @return KM_CORE_CONTEXT_STATUS_INVALID_ARGUMENT if either argument is null,
 *         KM_CORE_CONTEXT_STATUS_UNCHANGED if the existing contexts are kept,
 *         KM_CORE_CONTEXT_STATUS_UPDATED if both contexts were replaced, or
 *         the result of do_fail() if setting or normalizing a context failed
 */
km_core_context_status km_core_state_context_set_if_needed(
  km_core_state *state,
  km_core_cp const *new_app_context
) {
  assert(state != nullptr);
  assert(new_app_context != nullptr);
  if(state == nullptr || new_app_context == nullptr) {
    return KM_CORE_CONTEXT_STATUS_INVALID_ARGUMENT;
  }

  auto app_context = km_core_state_app_context(state);
  auto cached_context = km_core_state_context(state);

  {
    // Compare the internally stored app context against the one passed in;
    // the temporary string is only needed for this comparison
    std::unique_ptr<km_core_cp[]> current_app_text(get_context_as_string(app_context));
    if(is_context_valid(new_app_context, current_app_text.get())) {
      // We keep the context as is
      return KM_CORE_CONTEXT_STATUS_UNCHANGED;
    }
  }

  // The internal app context is stale: overwrite it with the passed-in one
  if(!set_context_from_string(app_context, new_app_context)) {
    return do_fail(app_context, cached_context, "could not set new app context");
  }

  // The cached context defaults to the raw app context and is swapped for the
  // normalized form only when the processor asks for it
  std::u16string nfd_text;
  km_core_cp const *cached_text = new_app_context;
  if(should_normalize(state)) {
    if(!do_normalize_nfd(new_app_context, nfd_text)) {
      return do_fail(app_context, cached_context, "could not normalize string");
    }
    cached_text = nfd_text.c_str();
  }

  // TODO: #10100 will alter how we replace the cached context here -- maintaining
  //       markers as far as possible

  if(!set_context_from_string(cached_context, cached_text)) {
    return do_fail(app_context, cached_context, "could not set new cached context");
  }

  return KM_CORE_CONTEXT_STATUS_UPDATED;
}

/**
 * Asks the keyboard processor attached to `state` whether it supports
 * normalization, i.e. whether the cached context should be normalized.
 *
 * @param state  state whose processor is queried; must not be null
 * @return the processor's supports_normalization() answer
 */
bool should_normalize(km_core_state *state) {
  auto const &processor = state->processor();
  return processor.supports_normalization();
}

/**
 * Returns true if the internal app context does not need to be updated to the
 * new app context.
 *
 * The internal app context is considered valid when it is non-empty and is an
 * exact suffix of the new app context (the new app context may be longer).
 *
 * TODO: #10100 will alter some of the assumptions here
 *
 * @param new_app_context  null-terminated context supplied by the application;
 *                         may be nullptr
 * @param app_context      null-terminated internal app context; may be nullptr
 * @return true if app_context matches the end of new_app_context; false if
 *         either pointer is null, app_context is empty, app_context is longer
 *         than new_app_context, or any code unit differs
 */
bool is_context_valid(km_core_cp const * new_app_context, km_core_cp const * app_context) {
  if (new_app_context == nullptr || app_context == nullptr || *app_context == '\0') {
    // If the app_context is "empty" then it needs updating
    return false;
  }

  size_t new_app_context_length = 0;
  while(new_app_context[new_app_context_length]) {
    new_app_context_length++;
  }

  size_t app_context_length = 0;
  while(app_context[app_context_length]) {
    app_context_length++;
  }

  if(app_context_length > new_app_context_length) {
    // if the internal app context is longer than the new app context, then we
    // assume that it is invalid. Checking lengths up front (rather than
    // walking pointers backwards past the start of the buffers) also catches
    // the case where the internal context is exactly one code unit longer,
    // e.g. internal "a" against an empty new app context.
    return false;
  }

  // we need to compare from the end of the internal app context, so line it
  // up against the tail of the new app context
  km_core_cp const* new_app_context_tail = new_app_context + (new_app_context_length - app_context_length);
  for(size_t i = 0; i < app_context_length; i++) {
    if(new_app_context_tail[i] != app_context[i]) {
      // The internal app context doesn't match the new app context, so it is
      // invalid
      return false;
    }
  }

  // It's acceptable for the new app context to be longer than the internal
  // app context, so if we match the whole internal context, we can safely
  // return true
  return true;
}

/**
 * Retrieves the context as a km_core_cp string, dropping markers.
 *
 * The string is produced by km_core_context_items_to_utf16(), called once with
 * a null buffer to obtain the required size and once more to fill the buffer.
 *
 * @param context  context to read
 * @return a buffer allocated with new[] that the caller must release with
 *         delete[] (for example by holding it in a
 *         std::unique_ptr<km_core_cp[]>), or nullptr if the context items
 *         could not be retrieved or converted
 */
km_core_cp* get_context_as_string(km_core_context *context) {
  size_t buf_size = 0;
  km_core_context_item* context_items = nullptr;

  if(km_core_context_get(context, &context_items) != KM_CORE_STATUS_OK) {
    return nullptr;
  }

  if(km_core_context_items_to_utf16(context_items, nullptr, &buf_size) != KM_CORE_STATUS_OK) {
    km_core_context_items_dispose(context_items);
    return nullptr;
  }

  // Own the buffer while the conversion runs so that the failure path below
  // frees it instead of leaking it
  std::unique_ptr<km_core_cp[]> app_context_string(new km_core_cp[buf_size]);

  km_core_status status = km_core_context_items_to_utf16(context_items, app_context_string.get(), &buf_size);
  km_core_context_items_dispose(context_items);

  if(status != KM_CORE_STATUS_OK) {
    return nullptr;
  }

  // Success: hand ownership of the buffer to the caller
  return app_context_string.release();
}

/**
 * Replaces the contents of `context` with context items built from the
 * null-terminated km_core_cp string `new_context`.
 *
 * @param context      context to overwrite
 * @param new_context  string to convert into context items
 * @return false if the string could not be converted to context items (the
 *         context is then left untouched); true once the context has been set
 */
bool set_context_from_string(km_core_context *context, km_core_cp const *new_context) {
  km_core_context_item* items = nullptr;

  if(km_core_context_items_from_utf16(new_context, &items) != KM_CORE_STATUS_OK) {
    return false;
  }

  // The items are only needed until they have been applied to the context
  km_core_context_set(context, items);
  km_core_context_items_dispose(items);

  return true;
}

/**
 * Normalizes `src` to NFD using ICU's Normalizer2 and stores the result in
 * `dst`.
 *
 * @param src  null-terminated input string
 * @param dst  receives the normalized text; only assigned on success
 * @return false if ICU could not provide the NFD normalizer or reported an
 *         error while normalizing (both also trip an assert); true otherwise
 */
bool do_normalize_nfd(km_core_cp const * src, std::u16string &dst) {
  UErrorCode icu_status = U_ZERO_ERROR;

  const icu::Normalizer2 *nfd = icu::Normalizer2::getNFDInstance(icu_status);
  assert(U_SUCCESS(icu_status));
  if(!U_SUCCESS(icu_status)) {
    // TODO: log the failure code
    return false;
  }

  const icu::UnicodeString source_text(src);
  icu::UnicodeString normalized_text;
  nfd->normalize(source_text, normalized_text, icu_status);
  assert(U_SUCCESS(icu_status));
  if(!U_SUCCESS(icu_status)) {
    // TODO: log the failure code
    return false;
  }

  // Copy ICU's buffer out into the caller's string
  dst.assign(normalized_text.getBuffer(), normalized_text.length());
  return true;
}

/**
 * Common failure path: writes `error` to the diagnostic log and clears both
 * context buffers so that they cannot be left inconsistent with each other.
 *
 * @param app_context     internal app context to clear
 * @param cached_context  cached context to clear
 * @param error           message written to the diagnostic log
 * @return always KM_CORE_CONTEXT_STATUS_CLEARED
 */
km_core_context_status do_fail(km_core_context *app_context, km_core_context *cached_context, const char* error) {
  // Log first, then reset both buffers
  DebugLog("%s", error);

  km_core_context_clear(app_context);
  km_core_context_clear(cached_context);

  return KM_CORE_CONTEXT_STATUS_CLEARED;
}
4 changes: 4 additions & 0 deletions core/src/kmx/kmx_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,10 @@ namespace core

km_core_keyboard_imx * get_imx_list() const override;

/**
 * Reports that this processor (declared in kmx_processor.hpp) does not
 * support normalization. should_normalize() in
 * km_core_state_context_set_if_needed.cpp consults this, so the cached
 * context is set from the app context without NFD normalization.
 */
bool
supports_normalization() const override {
return false;
}
};

} // namespace core
Expand Down
5 changes: 5 additions & 0 deletions core/src/ldml/ldml_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,11 @@ namespace core {

km_core_keyboard_imx * get_imx_list() const override;

/**
 * Reports that this processor (declared in ldml_processor.hpp) supports
 * normalization. should_normalize() in
 * km_core_state_context_set_if_needed.cpp consults this, so the cached
 * context is set from the NFD-normalized app context.
 */
bool
supports_normalization() const override {
return true;
}

private:
/** emit text to context and actions */
static void emit_text(km_core_state *state, const std::u16string &str);
Expand Down
1 change: 1 addition & 0 deletions core/src/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ kmx_files = files(
'km_core_keyboard_api.cpp',
'km_core_options_api.cpp',
'km_core_state_api.cpp',
'km_core_state_context_set_if_needed.cpp',
'km_core_debug_api.cpp',
'km_core_processevent_api.cpp',
'jsonpp.cpp',
Expand Down
4 changes: 4 additions & 0 deletions core/src/mock/mock_processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ namespace core

km_core_keyboard_imx * get_imx_list() const override;

/**
 * Reports that this processor (declared in mock_processor.hpp; presumably
 * mock_processor -- confirm against the enclosing class) does not support
 * normalization, so the cached context is set from the app context without
 * NFD normalization.
 */
bool
supports_normalization() const override {
return false;
}
};

class null_processor : public mock_processor {
Expand Down
3 changes: 3 additions & 0 deletions core/src/processor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ namespace core
virtual km_core_keyboard_imx *
get_imx_list() const = 0;

/**
 * Returns true if the processor supports normalization. Every concrete
 * processor must implement this; km_core_state_context_set_if_needed() uses
 * the answer (via should_normalize()) to decide whether the cached context
 * is NFD-normalized before being stored.
 */
virtual bool
supports_normalization() const = 0;

friend json & operator << (json &j, abstract_processor const &opts);
};

Expand Down
Loading

0 comments on commit 036c32a

Please sign in to comment.