Skip to content

Commit

Permalink
Break ground on suggest
Browse files Browse the repository at this point in the history
  • Loading branch information
the-mikedavis committed Sep 13, 2024
1 parent 9e6908d commit aedbd0f
Show file tree
Hide file tree
Showing 3 changed files with 169 additions and 11 deletions.
14 changes: 5 additions & 9 deletions src/checker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,9 @@ use crate::{
},
alloc::{string::String, vec::Vec},
classify_casing, erase_chars, AffixingMode, Casing, Dictionary, Flag, FlagSet, WordList,
AT_COMPOUND_BEGIN, AT_COMPOUND_END, AT_COMPOUND_MIDDLE, FULL_WORD,
AT_COMPOUND_BEGIN, AT_COMPOUND_END, AT_COMPOUND_MIDDLE, FULL_WORD, MAX_WORD_LEN,
};

// Nuspell limits the length of the input word:
// <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/src/nuspell/dictionary.cxx#L156>
const MAX_WORD_LEN: usize = 360;

macro_rules! has_flag {
( $flags:expr, $flag:expr ) => {{
match $flag {
Expand All @@ -29,8 +25,8 @@ macro_rules! flag {

// TODO: expose type and add options to it?
pub(crate) struct Checker<'a, S: BuildHasher> {
words: &'a WordList<S>,
aff: &'a AffData,
pub(crate) words: &'a WordList<S>,
pub(crate) aff: &'a AffData,
}

impl<'a, S: BuildHasher> Checker<'a, S> {
Expand Down Expand Up @@ -150,7 +146,7 @@ impl<'a, S: BuildHasher> Checker<'a, S> {
}
}

fn check_word(
pub(crate) fn check_word(
&self,
word: &str,
allow_bad_forceucase: Forceucase,
Expand Down Expand Up @@ -1287,7 +1283,7 @@ impl<'a, S: BuildHasher> Checker<'a, S> {

// Compounding

fn check_compound<const MODE: AffixingMode>(
pub(crate) fn check_compound<const MODE: AffixingMode>(
&self,
word: &str,
allow_bad_forceucase: Forceucase,
Expand Down
12 changes: 10 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,12 @@ extern crate alloc;
pub(crate) mod aff;
pub(crate) mod checker;
mod hash_bag;
mod suggester;

pub use aff::parser::{
ParseDictionaryError, ParseDictionaryErrorKind, ParseDictionaryErrorSource, ParseFlagError,
};
use suggester::Suggester;

use crate::alloc::{borrow::Cow, boxed::Box, slice, string::String, vec::Vec};
use aff::AffData;
Expand Down Expand Up @@ -188,8 +190,10 @@ impl<S: BuildHasher> Dictionary<S> {
Checker::new(self).check(word)
}

// suggest(&self, word: &str) -> impl Iterator<Item = String> ?
// accept a &mut Vec instead?
/// Fills the given vec with possible corrections from the dictionary for the given word.
///
/// Any existing contents of `out` are cleared before suggestions are added, so the same vec
/// can be reused across calls. No suggestions are produced for an empty word or for a word
/// whose byte length is `MAX_WORD_LEN` or more.
///
/// # Panics
///
/// Suggesting is a work in progress: for a non-empty word below the length limit, the
/// suggester currently hits a `todo!()` unless the word (after input conversion) is classified
/// as `Casing::None`.
pub fn suggest(&self, word: &str, out: &mut Vec<String>) {
Suggester::new(Checker::new(self)).suggest(word, out)
}

/// Adds a word to the dictionary.
///
Expand Down Expand Up @@ -442,6 +446,10 @@ const AT_COMPOUND_BEGIN: AffixingMode = 1;
const AT_COMPOUND_MIDDLE: AffixingMode = 2;
const AT_COMPOUND_END: AffixingMode = 3;

// Nuspell limits the length of the input word:
// <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/src/nuspell/dictionary.cxx#L156>
const MAX_WORD_LEN: usize = 360;

/// The casing of a word.
// Hunspell: <https://github.com/hunspell/hunspell/blob/8f9bb2957bfd74ca153fad96083a54488b518ca5/src/hunspell/csutil.hxx#L91-L96>
// Nuspell: <https://github.com/nuspell/nuspell/blob/349e0d6bc68b776af035ca3ff664a7fc55d69387/src/nuspell/utils.hxx#L91-L104>
Expand Down
154 changes: 154 additions & 0 deletions src/suggester.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
use core::hash::BuildHasher;

use crate::{
alloc::{borrow::Cow, string::String, vec::Vec},
checker::{Checker, Forceucase, HiddenHomonym},
classify_casing, Casing, AT_COMPOUND_BEGIN, MAX_WORD_LEN,
};

/// Evaluates to `true` when `$flag` is `Some(flag)` and `$flags.contains(&flag)` holds.
///
/// An unset (`None`) flag never matches, and `$flags` is only consulted when a flag is present.
macro_rules! has_flag {
    ( $flags:expr, $flag:expr ) => {
        matches!($flag, Some(flag) if $flags.contains(&flag))
    };
}

/// Produces spelling suggestions for a word, using the wrapped [`Checker`] to decide which
/// candidate words are acceptable.
pub(crate) struct Suggester<'a, S: BuildHasher> {
// Supplies the affix data (`aff`) as well as the word and compound checks used to validate
// candidates.
checker: Checker<'a, S>,
}

impl<'a, S: BuildHasher> Suggester<'a, S> {
pub fn new(checker: Checker<'a, S>) -> Self {
Self { checker }
}

pub fn suggest(&self, word: &str, out: &mut Vec<String>) {
out.clear();
if word.len() >= MAX_WORD_LEN {
return;
}

self.suggest_impl(word, out);
}

fn suggest_impl(&self, word: &str, out: &mut Vec<String>) {
if word.is_empty() {
return;
}

// ICONV
let word = self.checker.aff.input_conversions.convert(word);
let casing = classify_casing(&word);
let mut hq_suggestions = false;

match casing {
Casing::None => {
// ?
if self
.checker
.aff
.options
.compound_force_uppercase_flag
.is_some()
&& self
.checker
.check_compound::<AT_COMPOUND_BEGIN>(&word, Forceucase::AllowBadForceucase)
.is_some()
{
out.push(self.checker.aff.options.case_handling.titlecase(&word));
return;
}
hq_suggestions |= self.suggest_low(&word, out);
}
_ => todo!(),
}

// TODO: remove. Currently used to suppress an unused_variable lint.
assert!(!hq_suggestions);

// OCONV
for suggestion in out.iter_mut() {
match self.checker.aff.output_conversions.convert(suggestion) {
Cow::Borrowed(_) => (),
Cow::Owned(converted) => *suggestion = converted,
}
}
}

fn suggest_low(&self, word: &str, out: &mut Vec<String>) -> bool {
// let len = out.len();
self.uppercase_suggest(word, out);

false
}

// TODO: what to take here... a &str? a String? a Cow<str>?
fn add_suggestion_if_correct(&self, word: String, out: &mut Vec<String>) -> bool {
let Some(flags) = self.checker.check_word(
&word,
Forceucase::ForbidBadForceucase,
HiddenHomonym::SkipHiddenHomonym,
) else {
return false;
};

if has_flag!(flags, self.checker.aff.options.forbidden_word_flag) {
return false;
}

if self.checker.aff.options.forbid_warn
&& has_flag!(flags, self.checker.aff.options.warn_flag)
{
return false;
}

out.push(word);
true
}

fn uppercase_suggest(&self, word: &str, out: &mut Vec<String>) {
let upper = self.checker.aff.options.case_handling.uppercase(word);
self.add_suggestion_if_correct(upper, out);
}
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::{
        alloc::{string::ToString, vec},
        EN_US,
    };

    /// Collects the suggestions `EN_US` produces for `word` into a fresh vec.
    fn suggest(word: &str) -> Vec<String> {
        let mut out = Vec::new();
        EN_US.suggest(word, &mut out);
        out
    }

    #[test]
    fn empty_suggest() {
        let suggestions = suggest("");
        assert!(suggestions.is_empty());
    }

    #[test]
    fn huge_word_is_skipped() {
        // Far longer than the `MAX_WORD_LEN` cutoff.
        let huge = "hello".repeat(MAX_WORD_LEN);
        assert!(suggest(&huge).is_empty());
    }

    #[test]
    fn existing_suggestions_are_cleared() {
        // Stale entries must not survive a call, even when nothing is suggested.
        let mut suggestions = vec!["example".to_string()];
        EN_US.suggest("", &mut suggestions);
        assert!(suggestions.is_empty());
    }

    #[test]
    fn uppercase_suggest() {
        // "ANSI" is correct in en_US and not "ansi".
        let expected = vec!["ANSI".to_string()];
        assert_eq!(suggest("ansi"), expected);
    }
}

0 comments on commit aedbd0f

Please sign in to comment.