Skip to content

Commit

Permalink
Implement KiwiJava API for lengthening typos
Browse files Browse the repository at this point in the history
  • Loading branch information
bab2min committed Sep 16, 2024
1 parent e431105 commit f1d2e83
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 104 deletions.
16 changes: 15 additions & 1 deletion bindings/java/kiwi_java.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -493,6 +493,16 @@ class JTypoTransformer : public kiwi::TypoTransformer, jni::JObject<JTypoTransfo
static constexpr std::string_view className = "kr/pe/bab2min/KiwiBuilder$TypoTransformer";

using kiwi::TypoTransformer::TypoTransformer;

// Returns a by-value copy of this transformer (copy-constructed from *this).
// Registered with the Java binding under the method name "copy".
JTypoTransformer copy() const
{
return *this;
}

// Forwards to the base-class kiwi::TypoTransformer::update, passing `o` as the source.
// Registered with the Java binding under the method name "_update".
// NOTE(review): presumably merges the typo definitions of `o` into this object —
// confirm against kiwi::TypoTransformer::update.
void update(const JTypoTransformer& o)
{
TypoTransformer::update(o);
}
};

class JKiwiBuilder : public kiwi::KiwiBuilder, jni::JObject<JKiwiBuilder>
Expand Down Expand Up @@ -547,7 +557,11 @@ JNIEXPORT jint JNICALL JNI_OnLoad(JavaVM* vm, void* reserved)
jni::define<JTypoTransformer>()
.template ctor<>()
.template method<&JTypoTransformer::addTypo>("_addTypo")
.template method<&JTypoTransformer::setContinualTypoCost>("_setContinualTypoCost"),
.template method<&JTypoTransformer::setContinualTypoCost>("_setContinualTypoCost")
.template method<&JTypoTransformer::setLengtheningTypoCost>("_setLengtheningTypoCost")
.template method<&JTypoTransformer::copy>("copy")
.template method<&JTypoTransformer::update>("_update")
.template method<&JTypoTransformer::scaleCost>("_scaleCost"),

jni::define<JKiwiBuilder>()
.template ctor<std::string, size_t, kiwi::BuildOption, bool>()
Expand Down
126 changes: 25 additions & 101 deletions bindings/java/kr/pe/bab2min/KiwiBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,12 @@ public boolean isAlive() {

@Override
public native void close() throws Exception;
// Native entry points of TypoTransformer; the names match those registered for
// JTypoTransformer in the C++ binding ("copy", "_addTypo", "_update", "_scaleCost",
// "_setContinualTypoCost", "_setLengtheningTypoCost").
// The underscore-prefixed methods return void; the public wrappers without the
// underscore call them and return `this` so calls can be chained.

// Returns a new TypoTransformer produced by the native copy().
public native TypoTransformer copy();
public native void _addTypo(String orig, String error, float cost, byte convVowel);
public native void _update(TypoTransformer src);
public native void _scaleCost(float scale);
public native void _setContinualTypoCost(float cost);
public native void _setLengtheningTypoCost(float cost);

public TypoTransformer addTypo(String orig, String error, float cost, byte convVowel) {
_addTypo(orig, error, cost, convVowel);
Expand All @@ -80,10 +84,29 @@ public TypoTransformer addTypo(String[] orig, String[] error, float cost, byte c
return this;
}

/**
 * Sets the continual typo cost of this transformer in place.
 *
 * @param cost value passed unchanged to the native {@code _setContinualTypoCost}
 * @return this transformer, so that calls can be chained
 */
public TypoTransformer setContinualTypoCost(float cost) {
_setContinualTypoCost(cost);
return this;
}

/**
 * Sets the lengthening typo cost of this transformer in place.
 *
 * @param cost value passed unchanged to the native {@code _setLengtheningTypoCost}
 * @return this transformer, so that calls can be chained
 */
public TypoTransformer setLengtheningTypoCost(float cost) {
_setLengtheningTypoCost(cost);
return this;
}

/**
 * Updates this transformer in place from {@code src} by delegating to the native
 * {@code _update}. {@code src} is passed through without any check on the Java side.
 *
 * @param src the transformer handed to the native update
 * @return this transformer (not a copy), so that calls can be chained
 */
public TypoTransformer update(TypoTransformer src) {
_update(src);
return this;
}

/**
 * Scales this transformer in place by delegating to the native {@code _scaleCost}.
 * NOTE(review): presumably multiplies the stored typo costs by {@code scale} —
 * confirm against the native implementation.
 *
 * @param scale value passed unchanged to the native {@code _scaleCost}
 * @return this transformer, so that calls can be chained
 */
public TypoTransformer scaleCost(float scale) {
_scaleCost(scale);
return this;
}
}

public KiwiBuilder(long _inst) {
Expand Down Expand Up @@ -241,106 +264,7 @@ public Kiwi build(TypoTransformer typos) {
.addTypo(new String[]{"ᆶ"}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none);

final public static TypoTransformer basicTypoSetWithContinual = new TypoTransformer()
.addTypo(new String[]{"ㅐ", "ㅔ"}, new String[]{"ㅐ", "ㅔ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ㅐ", "ㅔ"}, new String[]{"ㅒ", "ㅖ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ㅒ", "ㅖ"}, new String[]{"ㅐ", "ㅔ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ㅒ", "ㅖ"}, new String[]{"ㅒ", "ㅖ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ㅚ", "ㅙ", "ㅞ"}, new String[]{"ㅚ", "ㅙ", "ㅞ", "ㅐ", "ㅔ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ㅝ"}, new String[]{"ㅗ", "ㅓ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ㅟ", "ㅢ"}, new String[]{"ㅣ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"위", "의"}, new String[]{"이"}, Float.POSITIVE_INFINITY, CondVowel.none)
.addTypo(new String[]{"위", "의"}, new String[]{"이"}, 1.f, CondVowel.any)
.addTypo(new String[]{"자", "쟈"}, new String[]{"자", "쟈"}, 1.f, CondVowel.none)
.addTypo(new String[]{"재", "쟤"}, new String[]{"재", "쟤"}, 1.f, CondVowel.none)
.addTypo(new String[]{"저", "져"}, new String[]{"저", "져"}, 1.f, CondVowel.none)
.addTypo(new String[]{"제", "졔"}, new String[]{"제", "졔"}, 1.f, CondVowel.none)
.addTypo(new String[]{"조", "죠", "줘"}, new String[]{"조", "죠", "줘"}, 1.f, CondVowel.none)
.addTypo(new String[]{"주", "쥬"}, new String[]{"주", "쥬"}, 1.f, CondVowel.none)
.addTypo(new String[]{"차", "챠"}, new String[]{"차", "챠"}, 1.f, CondVowel.none)
.addTypo(new String[]{"채", "챼"}, new String[]{"채", "챼"}, 1.f, CondVowel.none)
.addTypo(new String[]{"처", "쳐"}, new String[]{"처", "쳐"}, 1.f, CondVowel.none)
.addTypo(new String[]{"체", "쳬"}, new String[]{"체", "쳬"}, 1.f, CondVowel.none)
.addTypo(new String[]{"초", "쵸", "춰"}, new String[]{"초", "쵸", "춰"}, 1.f, CondVowel.none)
.addTypo(new String[]{"추", "츄"}, new String[]{"추", "츄"}, 1.f, CondVowel.none)
.addTypo(new String[]{"유", "류"}, new String[]{"유", "류"}, 1.f, CondVowel.none)
.addTypo(new String[]{"므", "무"}, new String[]{"므", "무"}, 1.f, CondVowel.none)
.addTypo(new String[]{"브", "부"}, new String[]{"브", "부"}, 1.f, CondVowel.none)
.addTypo(new String[]{"프", "푸"}, new String[]{"프", "푸"}, 1.f, CondVowel.none)
.addTypo(new String[]{"르", "루"}, new String[]{"르", "루"}, 1.f, CondVowel.none)
.addTypo(new String[]{"러", "뤄"}, new String[]{"러", "뤄"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆩ", "ᆪ"}, new String[]{"ᆨ", "ᆩ", "ᆪ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ᆬ", "ᆭ"}, new String[]{"ᆫ", "ᆬ", "ᆭ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ᆰ", "ᆱ", "ᆲ", "ᆳ", "ᆴ", "ᆵ", "ᆶ"}, new String[]{"ᆯ", "ᆰ", "ᆱ", "ᆲ", "ᆳ", "ᆴ", "ᆵ", "ᆶ"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"ᆺ", "ᆻ"}, new String[]{"ᆺ", "ᆻ"}, 1.f, CondVowel.none)

.addTypo(new String[]{"안"}, new String[]{"않"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"맞추", "맞히"}, new String[]{"맞추", "맞히"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"맞춰", "맞혀"}, new String[]{"맞춰", "맞혀"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"받치", "바치", "받히"}, new String[]{"받치", "바치", "받히"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"받쳐", "바쳐", "받혀"}, new String[]{"받쳐", "바쳐", "받혀"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"던", "든"}, new String[]{"던", "든"}, 1.f, CondVowel.none)
.addTypo(new String[]{"때", "데"}, new String[]{"때", "데"}, 1.5f, CondVowel.none)
.addTypo(new String[]{"빛", "빚"}, new String[]{"빛", "빚"}, 1.f, CondVowel.none)

.addTypo(new String[]{"ᆮ이", "지"}, new String[]{"ᆮ이", "지"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆮ여", "져"}, new String[]{"ᆮ여", "져"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᇀ이", "치"}, new String[]{"ᇀ이", "치"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᇀ여", "쳐"}, new String[]{"ᇀ여", "쳐"}, 1.f, CondVowel.none)

.addTypo(new String[]{"ᄀ", "ᄁ"}, new String[]{"ᄀ", "ᄁ"}, 1.f, CondVowel.applosive)
.addTypo(new String[]{"ᄃ", "ᄄ"}, new String[]{"ᄃ", "ᄄ"}, 1.f, CondVowel.applosive)
.addTypo(new String[]{"ᄇ", "ᄈ"}, new String[]{"ᄇ", "ᄈ"}, 1.f, CondVowel.applosive)
.addTypo(new String[]{"ᄉ", "ᄊ"}, new String[]{"ᄉ", "ᄊ"}, 1.f, CondVowel.applosive)
.addTypo(new String[]{"ᄌ", "ᄍ"}, new String[]{"ᄌ", "ᄍ"}, 1.f, CondVowel.applosive)

.addTypo(new String[]{"ᇂᄒ", "ᆨᄒ", "ᇂᄀ"}, new String[]{"ᇂᄒ", "ᆨᄒ", "ᇂᄀ"}, 1.f, CondVowel.none)

.addTypo(new String[]{"ᆨᄂ", "ᆩᄂ", "ᆪᄂ", "ᆿᄂ", "ᆼᄂ"}, new String[]{"ᆨᄂ", "ᆩᄂ", "ᆪᄂ", "ᆿᄂ", "ᆼᄂ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆨᄆ", "ᆩᄆ", "ᆪᄆ", "ᆿᄆ", "ᆼᄆ"}, new String[]{"ᆨᄆ", "ᆩᄆ", "ᆪᄆ", "ᆿᄆ", "ᆼᄆ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆨᄅ", "ᆩᄅ", "ᆪᄅ", "ᆿᄅ", "ᆼᄅ", "ᆼᄂ",}, new String[]{"ᆨᄅ", "ᆩᄅ", "ᆪᄅ", "ᆿᄅ", "ᆼᄅ", "ᆼᄂ",}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆮᄂ", "ᆺᄂ", "ᆻᄂ", "ᆽᄂ", "ᆾᄂ", "ᇀᄂ", "ᆫᄂ"}, new String[]{"ᆮᄂ", "ᆺᄂ", "ᆻᄂ", "ᆽᄂ", "ᆾᄂ", "ᇀᄂ", "ᆫᄂ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆮᄆ", "ᆺᄆ", "ᆻᄆ", "ᆽᄆ", "ᆾᄆ", "ᇀᄆ", "ᆫᄆ"}, new String[]{"ᆮᄆ", "ᆺᄆ", "ᆻᄆ", "ᆽᄆ", "ᆾᄆ", "ᇀᄆ", "ᆫᄆ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆮᄅ", "ᆺᄅ", "ᆻᄅ", "ᆽᄅ", "ᆾᄅ", "ᇀᄅ", "ᆫᄅ", "ᆫᄂ",}, new String[]{"ᆮᄅ", "ᆺᄅ", "ᆻᄅ", "ᆽᄅ", "ᆾᄅ", "ᇀᄅ", "ᆫᄅ", "ᆫᄂ",}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆸᄂ", "ᆹᄂ", "ᇁᄂ", "ᆷᄂ"}, new String[]{"ᆸᄂ", "ᆹᄂ", "ᇁᄂ", "ᆷᄂ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆸᄆ", "ᆹᄆ", "ᇁᄆ", "ᆷᄆ"}, new String[]{"ᆸᄆ", "ᆹᄆ", "ᇁᄆ", "ᆷᄆ"}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆸᄅ", "ᆹᄅ", "ᇁᄅ", "ᆷᄅ", "ᆷᄂ",}, new String[]{"ᆸᄅ", "ᆹᄅ", "ᇁᄅ", "ᆷᄅ", "ᆷᄂ",}, 1.f, CondVowel.none)
.addTypo(new String[]{"ᆫᄅ", "ᆫᄂ", "ᆯᄅ", "ᆯᄂ"}, new String[]{"ᆫᄅ", "ᆫᄂ", "ᆯᄅ", "ᆯᄂ"}, 1.f, CondVowel.none)

.addTypo(new String[]{"ᆨᄋ", "ᄀ"}, new String[]{"ᆨᄋ", "ᄀ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆩᄋ", "ᄁ"}, new String[]{"ᆩᄋ", "ᄁ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆫᄋ", "ᆫᄒ", "ᄂ"}, new String[]{"ᆫᄋ", "ᆫᄒ", "ᄂ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆬᄋ", "ᆫᄌ"}, new String[]{"ᆬᄋ", "ᆫᄌ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆭᄋ", "ᄂ"}, new String[]{"ᆭᄋ", "ᄂ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆮᄋ", "ᄃ"}, new String[]{"ᆮᄋ", "ᄃ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆯᄋ", "ᆯᄒ", "ᄅ"}, new String[]{"ᆯᄋ", "ᆯᄒ", "ᄅ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆰᄋ", "ᆯᄀ"}, new String[]{"ᆰᄋ", "ᆯᄀ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆰᄒ", "ᆯᄏ"}, new String[]{"ᆰᄒ", "ᆯᄏ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆷᄋ", "ᄆ"}, new String[]{"ᆷᄋ", "ᄆ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆸᄋ", "ᄇ"}, new String[]{"ᆸᄋ", "ᄇ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆺᄋ", "ᄉ"}, new String[]{"ᆺᄋ", "ᄉ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆻᄋ", "ᆺᄉ", "ᄊ"}, new String[]{"ᆻᄋ", "ᆺᄉ", "ᄊ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆽᄋ", "ᄌ"}, new String[]{"ᆽᄋ", "ᄌ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᄎ"}, new String[]{"ᆾᄋ", "ᆾᄒ", "ᆽᄒ", "ᄎ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᄏ"}, new String[]{"ᆿᄋ", "ᆿᄒ", "ᆨᄒ", "ᄏ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᄐ"}, new String[]{"ᇀᄋ", "ᇀᄒ", "ᆮᄒ", "ᄐ"}, 1.f, CondVowel.vowel)
.addTypo(new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᄑ"}, new String[]{"ᇁᄋ", "ᇁᄒ", "ᆸᄒ", "ᄑ"}, 1.f, CondVowel.vowel)

.addTypo(new String[]{"은", "는"}, new String[]{"은", "는"}, 2.f, CondVowel.none)
.addTypo(new String[]{"을", "를"}, new String[]{"을", "를"}, 2.f, CondVowel.none)

.addTypo(new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, new String[]{"ㅣ워", "ㅣ어", "ㅕ"}, 1.5f, CondVowel.none)
.setContinualTypoCost(1.f)
.addTypo(new String[]{"ᆪ"}, new String[]{"ᆨᆺ", "ᆨᆻ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆬ"}, new String[]{"ᆫᆽ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆭ"}, new String[]{"ᆫᇂ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆰ"}, new String[]{"ᆯᆨ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆱ"}, new String[]{"ᆯᆷ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆲ"}, new String[]{"ᆯᆸ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆳ"}, new String[]{"ᆯᆺ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆴ"}, new String[]{"ᆯᇀ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆵ"}, new String[]{"ᆯᇁ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆶ"}, new String[]{"ᆯᇂ"}, 1e-12f, CondVowel.none)
.addTypo(new String[]{"ᆹ"}, new String[]{"ᆸᆺ", "ᆸᆻ"}, 1e-12f, CondVowel.none);
// Built from a copy of basicTypoSet that is then updated in place with continualTypoSet;
// the in-place update() is applied to the copy, not to basicTypoSet itself.
final public static TypoTransformer basicTypoSetWithContinual = basicTypoSet.copy().update(continualTypoSet);

// A fresh transformer with no addTypo() entries; only the lengthening typo cost is set (0.5).
final public static TypoTransformer lengtheningTypoSet = new TypoTransformer().setLengtheningTypoCost(0.5f);
}
27 changes: 27 additions & 0 deletions bindings/java/kr/pe/bab2min/KiwiTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import java.util.concurrent.Future;

import org.junit.Test;

import kr.pe.bab2min.KiwiBuilder.TypoTransformer;

import static org.junit.Assert.*;

public class KiwiTest {
Expand Down Expand Up @@ -155,6 +158,30 @@ public void testContinualTypos() throws Exception {
assertEquals(tokens[3].form, "어");
}

/**
 * Builds a Kiwi instance with a custom typo set composed from
 * {@code basicTypoSet}, {@code continualTypoSet} and {@code lengtheningTypoSet},
 * then checks the leading token forms produced for three inputs.
 *
 * Assertions use JUnit's documented {@code assertEquals(expected, actual)} order so
 * that a failure message reports the expected and actual values correctly.
 */
@Test
public void testCustomTypos() throws Exception {
    System.gc();
    KiwiBuilder builder = new KiwiBuilder(modelPath);
    // copy() first: update() works in place and returns the same object, so updating
    // KiwiBuilder.basicTypoSet directly would modify the shared static instance.
    TypoTransformer typoSet = KiwiBuilder.basicTypoSet.copy()
            .update(KiwiBuilder.continualTypoSet)
            .update(KiwiBuilder.lengtheningTypoSet);
    Kiwi kiwi = builder.build(typoSet);

    // NOTE(review): presumably exercises the continual typo rules — confirm.
    Kiwi.Token[] tokens = kiwi.tokenize("프로그래미", Kiwi.Match.allWithNormalizing);
    System.out.println(Arrays.deepToString(tokens));
    assertEquals("프로그램", tokens[0].form);
    assertEquals("이", tokens[1].form);

    // NOTE(review): presumably exercises the lengthening typo cost — confirm.
    tokens = kiwi.tokenize("지인짜?", Kiwi.Match.allWithNormalizing);
    System.out.println(Arrays.deepToString(tokens));
    assertEquals("진짜", tokens[0].form);
    assertEquals("?", tokens[1].form);

    // NOTE(review): presumably exercises the basic typo rules — confirm.
    tokens = kiwi.tokenize("맗은 물", Kiwi.Match.allWithNormalizing);
    System.out.println(Arrays.deepToString(tokens));
    assertEquals("맑", tokens[0].form);
}

@Test
public void testBlocklist() throws Exception {
System.gc();
Expand Down
4 changes: 2 additions & 2 deletions include/kiwi/ArchUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,13 @@ namespace kiwi
template<>
struct ArchInfo<ArchType::none>
{
static constexpr size_t alignment = 0;
static constexpr size_t alignment = 4;
};

template<>
struct ArchInfo<ArchType::balanced>
{
static constexpr size_t alignment = 0;
static constexpr size_t alignment = 4;
};

template<>
Expand Down

0 comments on commit f1d2e83

Please sign in to comment.