diff --git a/bindings/java/kiwi_java.cpp b/bindings/java/kiwi_java.cpp index 5db1f758..9013d9fe 100644 --- a/bindings/java/kiwi_java.cpp +++ b/bindings/java/kiwi_java.cpp @@ -493,6 +493,16 @@ class JTypoTransformer : public kiwi::TypoTransformer, jni::JObject @@ -547,7 +557,11 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved) jni::define() .template ctor<>() .template method<&JTypoTransformer::addTypo>("_addTypo") - .template method<&JTypoTransformer::setContinualTypoCost>("_setContinualTypoCost"), + .template method<&JTypoTransformer::setContinualTypoCost>("_setContinualTypoCost") + .template method<&JTypoTransformer::setLengtheningTypoCost>("_setLengtheningTypoCost") + .template method<&JTypoTransformer::copy>("copy") + .template method<&JTypoTransformer::update>("_update") + .template method<&JTypoTransformer::scaleCost>("_scaleCost"), jni::define() .template ctor() diff --git a/bindings/java/kr/pe/bab2min/KiwiBuilder.java b/bindings/java/kr/pe/bab2min/KiwiBuilder.java index a440c602..5cfdfd17 100644 --- a/bindings/java/kr/pe/bab2min/KiwiBuilder.java +++ b/bindings/java/kr/pe/bab2min/KiwiBuilder.java @@ -63,8 +63,12 @@ public boolean isAlive() { @Override public native void close() throws Exception; + public native TypoTransformer copy(); public native void _addTypo(String orig, String error, float cost, byte convVowel); + public native void _update(TypoTransformer src); + public native void _scaleCost(float scale); public native void _setContinualTypoCost(float cost); + public native void _setLengtheningTypoCost(float cost); public TypoTransformer addTypo(String orig, String error, float cost, byte convVowel) { _addTypo(orig, error, cost, convVowel); @@ -80,10 +84,29 @@ public TypoTransformer addTypo(String[] orig, String[] error, float cost, byte c return this; } + // Set continual typo cost (inplace) public TypoTransformer setContinualTypoCost(float cost) { _setContinualTypoCost(cost); return this; } + + // Set lengthening typo cost (inplace) + public TypoTransformer setLengtheningTypoCost(float cost) { + _setLengtheningTypoCost(cost); + return this; + } + + // Inplace update + public TypoTransformer update(TypoTransformer src) { + _update(src); + return this; + } + + // Inplace scaling + public TypoTransformer scaleCost(float scale) { + _scaleCost(scale); + return this; + } } public KiwiBuilder(long _inst) { @@ -241,106 +264,7 @@ public Kiwi build(TypoTransformer typos) { .addTypo(new String[]{"ᆶ"}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none) .addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none); - final public static TypoTransformer basicTypoSetWithContinual = new TypoTransformer() - .addTypo(new String[]{"ㅐ", "ㅔ"}, new String[]{"ㅐ", "ㅔ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ㅐ", "ㅔ"}, new String[]{"ㅒ", "ㅖ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ㅒ", "ㅖ"}, new String[]{"ㅐ", "ㅔ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ㅒ", "ㅖ"}, new String[]{"ㅒ", "ㅖ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ㅚ", "ㅙ", "ㅞ"}, new String[]{"ㅚ", "ㅙ", "ㅞ", "ㅐ", "ㅔ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ㅝ"}, new String[]{"ㅗ", "ㅓ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ㅟ", "ㅢ"}, new String[]{"ㅣ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"위", "의"}, new String[]{"이"}, Float.POSITIVE_INFINITY, CondVowel.none) - .addTypo(new String[]{"위", "의"}, new String[]{"이"}, 1.f, CondVowel.any) - .addTypo(new String[]{"자", "쟈"}, new String[]{"자", "쟈"}, 1.f, CondVowel.none) - .addTypo(new String[]{"재", "쟤"}, new String[]{"재", "쟤"}, 1.f, CondVowel.none) - .addTypo(new String[]{"저", "져"}, new String[]{"저", "져"}, 1.f, CondVowel.none) - .addTypo(new String[]{"제", "졔"}, new String[]{"제", "졔"}, 1.f, CondVowel.none) - .addTypo(new String[]{"조", "죠", "줘"}, new String[]{"조", "죠", "줘"}, 1.f, CondVowel.none) - .addTypo(new String[]{"주", "쥬"}, new String[]{"주", "쥬"}, 1.f, CondVowel.none) - .addTypo(new String[]{"차", "챠"}, new String[]{"차", "챠"}, 1.f, CondVowel.none) - .addTypo(new String[]{"채", "챼"}, new String[]{"채", "챼"}, 1.f, CondVowel.none) - .addTypo(new String[]{"처", "쳐"}, new String[]{"처", "쳐"}, 1.f, CondVowel.none) - .addTypo(new String[]{"체", "쳬"}, new String[]{"체", "쳬"}, 1.f, CondVowel.none) - .addTypo(new String[]{"초", "쵸", "춰"}, new String[]{"초", "쵸", "춰"}, 1.f, CondVowel.none) - .addTypo(new String[]{"추", "츄"}, new String[]{"추", "츄"}, 1.f, CondVowel.none) - .addTypo(new String[]{"유", "류"}, new String[]{"유", "류"}, 1.f, CondVowel.none) - .addTypo(new String[]{"므", "무"}, new String[]{"므", "무"}, 1.f, CondVowel.none) - .addTypo(new String[]{"브", "부"}, new String[]{"브", "부"}, 1.f, CondVowel.none) - .addTypo(new String[]{"프", "푸"}, new String[]{"프", "푸"}, 1.f, CondVowel.none) - .addTypo(new String[]{"르", "루"}, new String[]{"르", "루"}, 1.f, CondVowel.none) - .addTypo(new String[]{"러", "뤄"}, new String[]{"러", "뤄"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆩ", "ᆪ"}, new String[]{"ᆨ", "ᆩ", "ᆪ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ᆬ", "ᆭ"}, new String[]{"ᆫ", "ᆬ", "ᆭ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ᆰ", "ᆱ", "ᆲ", "ᆳ", "ᆴ", "ᆵ", "ᆶ"}, new String[]{"ᆯ", "ᆰ", "ᆱ", "ᆲ", "ᆳ", "ᆴ", "ᆵ", "ᆶ"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"ᆺ", "ᆻ"}, new String[]{"ᆺ", "ᆻ"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"안"}, new String[]{"않"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"맞추", "맞히"}, new String[]{"맞추", "맞히"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"맞춰", "맞혀"}, new String[]{"맞춰", "맞혀"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"받치", "바치", "받히"}, new String[]{"받치", "바치", "받히"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"받쳐", "바쳐", "받혀"}, new String[]{"받쳐", "바쳐", "받혀"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"던", "든"}, new String[]{"던", "든"}, 1.f, CondVowel.none) - .addTypo(new String[]{"때", "데"}, new String[]{"때", "데"}, 1.5f, CondVowel.none) - .addTypo(new String[]{"빛", "빚"}, new String[]{"빛", "빚"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"ᆮ이", "지"}, new String[]{"ᆮ이", "지"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆮ여", "져"}, new String[]{"ᆮ여", "져"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᇀ이", "치"}, new String[]{"ᇀ이", "치"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᇀ여", "쳐"}, new String[]{"ᇀ여", "쳐"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"ᄀ", "ᄁ"}, new String[]{"ᄀ", "ᄁ"}, 1.f, CondVowel.applosive) - .addTypo(new String[]{"ᄃ", "ᄄ"}, new String[]{"ᄃ", "ᄄ"}, 1.f, CondVowel.applosive) - .addTypo(new String[]{"ᄇ", "ᄈ"}, new String[]{"ᄇ", "ᄈ"}, 1.f, CondVowel.applosive) - .addTypo(new String[]{"ᄉ", "ᄊ"}, new String[]{"ᄉ", "ᄊ"}, 1.f, CondVowel.applosive) - .addTypo(new String[]{"ᄌ", "ᄍ"}, new String[]{"ᄌ", "ᄍ"}, 1.f, CondVowel.applosive) - - .addTypo(new String[]{"ᇂᄒ", "ᆨᄒ", "ᇂᄀ"}, new String[]{"ᇂᄒ", "ᆨᄒ", "ᇂᄀ"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"ᆨᄂ", "ᆩᄂ", "ᆪᄂ", "ᆿᄂ", "ᆼᄂ"}, new String[]{"ᆨᄂ", "ᆩᄂ", "ᆪᄂ", "ᆿᄂ", "ᆼᄂ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆨᄆ", "ᆩᄆ", "ᆪᄆ", "ᆿᄆ", "ᆼᄆ"}, new String[]{"ᆨᄆ", "ᆩᄆ", "ᆪᄆ", "ᆿᄆ", "ᆼᄆ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆨᄅ", "ᆩᄅ", "ᆪᄅ", "ᆿᄅ", "ᆼᄅ", "ᆼᄂ",}, new String[]{"ᆨᄅ", "ᆩᄅ", "ᆪᄅ", "ᆿᄅ", "ᆼᄅ", "ᆼᄂ",}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆮᄂ", "ᆺᄂ", "ᆻᄂ", "ᆽᄂ", "ᆾᄂ", "ᇀᄂ", "ᆫᄂ"}, new String[]{"ᆮᄂ", "ᆺᄂ", "ᆻᄂ", "ᆽᄂ", "ᆾᄂ", "ᇀᄂ", "ᆫᄂ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆮᄆ", "ᆺᄆ", "ᆻᄆ", "ᆽᄆ", "ᆾᄆ", "ᇀᄆ", "ᆫᄆ"}, new String[]{"ᆮᄆ", "ᆺᄆ", "ᆻᄆ", "ᆽᄆ", "ᆾᄆ", "ᇀᄆ", "ᆫᄆ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆮᄅ", "ᆺᄅ", "ᆻᄅ", "ᆽᄅ", "ᆾᄅ", "ᇀᄅ", "ᆫᄅ", "ᆫᄂ",}, new String[]{"ᆮᄅ", "ᆺᄅ", "ᆻᄅ", "ᆽᄅ", "ᆾᄅ", "ᇀᄅ", "ᆫᄅ", "ᆫᄂ",}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆸᄂ", "ᆹᄂ", "ᇁᄂ", "ᆷᄂ"}, new String[]{"ᆸᄂ", "ᆹᄂ", "ᇁᄂ", "ᆷᄂ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆸᄆ", "ᆹᄆ", "ᇁᄆ", "ᆷᄆ"}, new String[]{"ᆸᄆ", "ᆹᄆ", "ᇁᄆ", "ᆷᄆ"}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆸᄅ", "ᆹᄅ", "ᇁᄅ", "ᆷᄅ", "ᆷᄂ",}, new String[]{"ᆸᄅ", "ᆹᄅ", "ᇁᄅ", "ᆷᄅ", "ᆷᄂ",}, 1.f, CondVowel.none) - .addTypo(new String[]{"ᆫᄅ", "ᆫᄂ", "ᆯᄅ", "ᆯᄂ"}, new String[]{"ᆫᄅ", "ᆫᄂ", "ᆯᄅ", "ᆯᄂ"}, 1.f, CondVowel.none) - - .addTypo(new String[]{"ᆨᄋ", "ᄀ"}, new String[]{"ᆨᄋ", "ᄀ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆩᄋ", "ᄁ"}, new String[]{"ᆩᄋ", "ᄁ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆫᄋ", "ᆫᄒ", "ᄂ"}, new String[]{"ᆫᄋ", "ᆫᄒ", "ᄂ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆬᄋ", "ᆫᄌ"}, new String[]{"ᆬᄋ", "ᆫᄌ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆭᄋ", "ᄂ"}, new String[]{"ᆭᄋ", "ᄂ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆮᄋ", "ᄃ"}, new String[]{"ᆮᄋ", "ᄃ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆯᄋ", "ᆯᄒ", "ᄅ"}, new String[]{"ᆯᄋ", "ᆯᄒ", "ᄅ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆰᄋ", "ᆯᄀ"}, new String[]{"ᆰᄋ", "ᆯᄀ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆰᄒ", "ᆯᄏ"}, new String[]{"ᆰᄒ", "ᆯᄏ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆷᄋ", "ᄆ"}, new String[]{"ᆷᄋ", "ᄆ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆸᄋ", "ᄇ"}, new String[]{"ᆸᄋ", "ᄇ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆺᄋ", "ᄉ"}, new String[]{"ᆺᄋ", "ᄉ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆻᄋ", "ᆺᄉ", "ᄊ"}, new String[]{"ᆻᄋ", "ᆺᄉ", "ᄊ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆽᄋ", "ᄌ"}, new String[]{"ᆽᄋ", "ᄌ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᄎ"}, new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᄎ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᄏ"}, new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᄏ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᄐ"}, new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᄐ"}, 1.f, CondVowel.vowel) - .addTypo(new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᄑ"}, new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᄑ"}, 1.f, CondVowel.vowel) - - .addTypo(new String[]{"은", "는"}, new String[]{"은", "는"}, 2.f, CondVowel.none) - .addTypo(new String[]{"을", "를"}, new String[]{"을", "를"}, 2.f, CondVowel.none) - - .addTypo(new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, 1.5f, CondVowel.none) - .setContinualTypoCost(1.f) - .addTypo(new String[]{"ᆪ"}, new String[]{"ᆨᆺ", "ᆨᆻ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆬ"}, new String[]{"ᆫᆽ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆭ"}, new String[]{"ᆫᇂ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆰ"}, new String[]{"ᆯᆨ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆱ"}, new String[]{"ᆯᆷ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆲ"}, new String[]{"ᆯᆸ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆳ"}, new String[]{"ᆯᆺ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆴ"}, new String[]{"ᆯᇀ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆵ"}, new String[]{"ᆯᇁ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆶ"}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none) - .addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none); + final public static TypoTransformer basicTypoSetWithContinual = basicTypoSet.copy().update(continualTypoSet); + final public static TypoTransformer lengtheningTypoSet = new TypoTransformer().setLengtheningTypoCost(0.5f); } diff --git a/bindings/java/kr/pe/bab2min/KiwiTest.java b/bindings/java/kr/pe/bab2min/KiwiTest.java index 326ff831..6ee7dfa0 100644 --- a/bindings/java/kr/pe/bab2min/KiwiTest.java +++ b/bindings/java/kr/pe/bab2min/KiwiTest.java @@ -4,6 +4,9 @@ import java.util.concurrent.Future; import org.junit.Test; + +import kr.pe.bab2min.KiwiBuilder.TypoTransformer; + import static org.junit.Assert.*; public class KiwiTest { @@ -155,6 +158,30 @@ public void testContinualTypos() throws Exception { assertEquals(tokens[3].form, "어"); } + @Test + public void testCustomTypos() throws Exception { + System.gc(); + KiwiBuilder builder = new KiwiBuilder(modelPath); + TypoTransformer typoSet = KiwiBuilder.basicTypoSet.copy() + .update(KiwiBuilder.continualTypoSet) + .update(KiwiBuilder.lengtheningTypoSet); + Kiwi kiwi = builder.build(typoSet); + + Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", Kiwi.Match.allWithNormalizing); + System.out.println(Arrays.deepToString(tokens)); + assertEquals(tokens[0].form, "프로그램"); + assertEquals(tokens[1].form, "이"); + + tokens = kiwi.tokenize("지인짜?", Kiwi.Match.allWithNormalizing); + System.out.println(Arrays.deepToString(tokens)); + assertEquals(tokens[0].form, "진짜"); + assertEquals(tokens[1].form, "?"); + + tokens = kiwi.tokenize("맗은 물", Kiwi.Match.allWithNormalizing); + System.out.println(Arrays.deepToString(tokens)); + assertEquals(tokens[0].form, "맑"); + } + @Test public void testBlocklist() throws Exception { System.gc(); diff --git a/include/kiwi/ArchUtils.h b/include/kiwi/ArchUtils.h index 06191001..ba2d0e85 100644 --- a/include/kiwi/ArchUtils.h +++ b/include/kiwi/ArchUtils.h @@ -30,13 +30,13 @@ namespace kiwi template<> struct ArchInfo { - static constexpr size_t alignment = 0; + static constexpr size_t alignment = 4; }; template<> struct ArchInfo { - static constexpr size_t alignment = 0; + static constexpr size_t alignment = 4; }; template<>