Skip to content

Commit

Permalink
Update character range (#10)
Browse files Browse the repository at this point in the history
  • Loading branch information
ayaka14732 committed Nov 22, 2020
1 parent a4df4f1 commit f5e7edf
Show file tree
Hide file tree
Showing 9 changed files with 327 additions and 76 deletions.
23 changes: 20 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,28 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install -r requirements.txt
- name: Create directories
run: mkdir -p opencc_data cache output
- name: Download OpenCC data
run: |
cd opencc_data
curl -LsSZ --remote-name-all https://cdn.jsdelivr.net/npm/opencc-data@1.0.5/data/{STCharacters.txt,STPhrases.txt,TWPhrasesIT.txt,TWPhrasesName.txt,TWPhrasesOther.txt,TWVariants.txt,HKVariants.txt}
cat TWPhrasesIT.txt TWPhrasesName.txt TWPhrasesOther.txt > TWPhrases.txt
- name: Download GenYoMin font
run: |
cd cache
curl -LsSO https://github.com/ButTaiwan/genyo-font/releases/download/v1.501/GenYoMin.zip
unzip -q -n GenYoMin.zip "*.ttc"
- name: Download character list
run: |
cd cache
curl -LsSo 通用規範漢字表.txt https://github.com/rime-aca/character_set/e7d009a8a185a83f62ad2c903565b8bb85719221/%E9%80%9A%E7%94%A8%E8%A6%8F%E7%AF%84%E6%BC%A2%E5%AD%97%E8%A1%A8.txt
- name: Build
run: |
build/prepare.sh
python build/main.py
python build/convert_tables.py
python build/code_points_han.py
python build/font.py
- name: Copy license file
run: cp LICENSE output
- name: Upload font files
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.vscode
.mypy_cache
/opencc_data
/cache
/output
32 changes: 32 additions & 0 deletions build/code_points_han.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Code points of Han characters to be included:
# 1. Code points in Tongyong Guifan Hanzi Biao (通用规范汉字表)
# 2. Code points in OpenCC dictionaries

from itertools import chain

code_points = set()

# Characters from Tongyong Guifan Hanzi Biao: the character of interest is
# the first character of every non-comment line.
with open('cache/通用規範漢字表.txt') as f:
    code_points.update(ord(line[0]) for line in f if line and not line.startswith('#'))

# Characters from the OpenCC dictionaries. Every line has the form
# "<key>\t<candidate> <candidate> ...", and every character on both sides
# of the tab is collected.
dictionary_paths = (
    'opencc_data/STCharacters.txt',
    'opencc_data/STPhrases.txt',
    'opencc_data/TWVariants.txt',
    'opencc_data/TWPhrases.txt',
    'opencc_data/HKVariants.txt',
)
for path in dictionary_paths:
    with open(path) as f:
        for line in f:
            key, candidates = line.rstrip('\n').split('\t')
            for text in chain([key], candidates.split(' ')):
                code_points.update(ord(c) for c in text)

# Persist one decimal code point per line, in ascending order.
with open('cache/code_points_han.txt', 'w') as f:
    for cp in sorted(code_points):
        if cp > 128:  # remove letters in the dictionaries
            print(cp, file=f)
68 changes: 68 additions & 0 deletions build/convert_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Build convert tables
#
# Input:
# - build/t2twp.json
# -> opencc_data/TWPhrases.txt
# -> opencc_data/TWVariants.txt
# - opencc_data/STCharacters.txt
# - opencc_data/STPhrases.txt
# - opencc_data/TWVariants.txt
# - opencc_data/TWPhrases.txt
#
# Output:
# - cache/convert_table_words.txt
# - cache/convert_table_chars.txt
# - cache/convert_table_words_twp.txt
# - cache/convert_table_chars_twp.txt

from opencc import OpenCC

def _read_dict(path):
    '''Yield (key, first candidate) pairs from an OpenCC dictionary file.

    Each line has the form "<key>\\t<candidate> <candidate> ..."; only the
    first candidate is kept.
    '''
    with open(path) as f:
        for line in f:
            k, vx = line.rstrip('\n').split('\t')
            yield k, vx.split(' ')[0]  # Only select the first candidate

def build_entries(twp=False):
    '''Yield (Simplified, Traditional) conversion pairs.

    With ``twp=False`` the pairs implement s2t; with ``twp=True`` they
    implement s2twp (Taiwan standard plus Taiwanese phrase usage).

    NOTE(review): relies on the module-level converters ``t2s`` and
    ``t2twp``, which are only initialized in the ``__main__`` block —
    importing this module and calling build_entries() directly would raise
    NameError.
    '''
    for k, v in _read_dict('opencc_data/STCharacters.txt'):  # s2t
        yield k, (t2twp(v) if twp else v)  # s2t -> s2twp

    for k, v in _read_dict('opencc_data/STPhrases.txt'):  # s2t
        yield k, (t2twp(v) if twp else v)  # s2t -> s2twp

    if twp:
        for k, v in _read_dict('opencc_data/TWVariants.txt'):  # t2tw
            yield t2s(k), v  # t2tw -> s2tw

        for k, v in _read_dict('opencc_data/TWPhrases.txt'):  # t2twp
            yield t2s(k), v  # t2twp -> s2twp

def go(twp=False):
    '''Build the conversion tables and write them into the cache directory.

    Duplicate keys are collapsed (later entries win, as with ``dict``), and
    the result is sorted from longest key to shortest so that downstream
    longest-match conversion works. Multi-character keys go to the words
    table, single-character keys to the chars table.
    '''
    deduplicated = dict(build_entries(twp=twp))  # remove duplicates
    # Sort by (length, key) descending to force longest match.
    ordered = sorted(deduplicated.items(),
                     key=lambda item: (len(item[0]), item[0]),
                     reverse=True)

    suffix = '_twp' if twp else ''

    with open(f'cache/convert_table_words{suffix}.txt', 'w') as words_file, \
            open(f'cache/convert_table_chars{suffix}.txt', 'w') as chars_file:
        for key, value in ordered:
            destination = words_file if len(key) > 1 else chars_file
            print(key, value, sep='\t', file=destination)

if __name__ == '__main__':
    # Initialize OpenCC converters
    # t2s:   Traditional -> Simplified (stock OpenCC config)
    # t2twp: Traditional -> Taiwan standard with Taiwanese phrase usage
    #        (custom config at build/t2twp.json)
    # These are module-level names read by build_entries().
    t2s = OpenCC('t2s').convert
    t2twp = OpenCC('./build/t2twp').convert

    # Build the plain (s2t) tables, then the Taiwan-phrase (s2twp) tables.
    go()
    go(twp=True)
91 changes: 29 additions & 62 deletions build/main.py → build/font.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from glob import glob
from itertools import chain, groupby
import json
from opencc import OpenCC
import os
import subprocess

Expand Down Expand Up @@ -83,10 +82,13 @@ def get_glyph_count(obj):
'''Get the total numbers of glyph in a font.'''
return len(obj['glyph_order'])

def build_codepoints_tonggui():
'''Build a set of all the codepoints in Tongyong Guifan Hanzi Biao (通用规范汉字表).'''
with open('cache/通用規範漢字表.txt') as f:
return {ord(line[0]) for line in f if line and not line.startswith('#')}
def build_codepoints_han():
    '''Build the set of code points of Han characters to be included.

    Reads cache/code_points_han.txt (produced by build/code_points_han.py),
    which contains one decimal code point per line.
    '''
    with open('cache/code_points_han.txt') as f:
        # Set comprehension replaces the manual loop-and-add.
        return {int(line) for line in f}

def build_codepoints_font(obj):
'''Build a set of all the codepoints in a font.'''
Expand All @@ -110,62 +112,36 @@ def build_codepoints_non_han():
range(0xFF61, 0xFF64 + 1),
))

# We restrict the Simplified Chinese characters (on the left side of the OpenCC dictionary
# file) to the range of Tongyong Guifan Hanzi Biao, and discard those conversions that are
# out of range. The remained conversions are stored in the entries variable.
#
# Then we calculate the range of “Which Traditional Chinese characters are needed if we
# convert Tongyong Guifan Hanzi Biao to Traditional Chinese”. The range is stored in the
# codepoints variable.
def build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=False):
def build_opencc_char_table(codepoints_font, twp=False):
    '''Build the single-character conversion table.

    Reads cache/convert_table_chars[_twp].txt (produced by
    build/convert_tables.py) and returns a list of
    (source code point, target code point) pairs, keeping only entries
    whose characters exist in the font.
    '''
    entries = []
    twp_suffix = '_twp' if twp else ''

    with open(f'cache/convert_table_chars{twp_suffix}.txt') as f:
        for line in f:
            k, v = line.rstrip('\n').split('\t')
            codepoint_k = ord(k)
            codepoint_v = ord(v)
            if codepoint_k in codepoints_font \
                    and codepoint_v in codepoints_font:  # TODO FIXME: codepoint_k in codepoints_font should be unnecessary
                entries.append((codepoint_k, codepoint_v))

    return entries

def build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=False):
entries = {}
codepoints = set()
def build_opencc_word_table(codepoints_font, twp=False):
    '''Build the multi-character (word/phrase) conversion table.

    Reads cache/convert_table_words[_twp].txt (produced by
    build/convert_tables.py) and returns a list of
    (source code points tuple, target code points tuple) pairs, keeping only
    entries whose characters all exist in the font.
    '''
    entries = []
    twp_suffix = '_twp' if twp else ''

    with open(f'cache/convert_table_words{twp_suffix}.txt') as f:
        for line in f:
            k, v = line.rstrip('\n').split('\t')
            codepoints_k = tuple(ord(c) for c in k)
            codepoints_v = tuple(ord(c) for c in v)
            if all(codepoint in codepoints_font for codepoint in codepoints_k) \
                    and all(codepoint in codepoints_font for codepoint in codepoints_v):  # TODO FIXME: the first line should be unnecessary
                entries.append((codepoints_k, codepoints_v))

    # The entries are already sorted from longest to shortest to force longest match
    return entries

def disassociate_codepoint_and_glyph_name(obj, codepoint, glyph_name):
'''
Expand Down Expand Up @@ -381,20 +357,15 @@ def build_dest_path_from_src_path(path, twp=False):
.replace('ttc', 'ttf')

def go(path, twp=False):
font = load_font(path, ttc_index=0)
font = load_font(path, ttc_index=0) # `ttc_index` 0: GenYoMin-TW; 1: GenYoMin-JP

# Determine the final Unicode range by the original font and OpenCC convert tables

codepoints_font = build_codepoints_font(font)
codepoints_tonggui = build_codepoints_tonggui() & codepoints_font
entries_char = build_opencc_char_table(codepoints_font, twp=twp)
entries_word = build_opencc_word_table(codepoints_font, twp=twp)

codepoints_final = codepoints_tonggui | build_codepoints_non_han() & codepoints_font

entries_char, codepoints_char = build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=twp)
codepoints_final |= codepoints_char

entries_word, codepoints_word = build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=twp)
codepoints_final |= codepoints_word
codepoints_final = (build_codepoints_non_han() | build_codepoints_han()) & codepoints_font

remove_codepoints(font, codepoints_font - codepoints_final)
clean_unused_glyphs(font)
Expand Down Expand Up @@ -431,10 +402,6 @@ def go(path, twp=False):
save_font(font, build_dest_path_from_src_path(path, twp=twp))

if __name__ == '__main__':
    # Process every downloaded GenYoMin collection; each source .ttc
    # produces both a plain build and a Taiwan-phrase (twp) build.
    # (The OpenCC converter initialization that used to live here moved to
    # build/convert_tables.py.)
    for path in glob('cache/GenYoMin-*.ttc'):
        go(path)
        go(path, twp=True)
8 changes: 0 additions & 8 deletions build/prepare.sh

This file was deleted.

6 changes: 3 additions & 3 deletions build/t2twp.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@
"type": "mmseg",
"dict": {
"type": "text",
"file": "../cache/TWPhrases.txt"
"file": "../opencc_data/TWPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "text",
"file": "../cache/TWPhrases.txt"
"file": "../opencc_data/TWPhrases.txt"
}, {
"type": "text",
"file": "../cache/TWVariants.txt"
"file": "../opencc_data/TWVariants.txt"
}]
}
}]
Expand Down
83 changes: 83 additions & 0 deletions test/index.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/* Seven weights of the converted FanWunMing font, registered under one
   family name so that `font-weight` alone selects the matching file. */
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 100;
  src: url("../output/FanWunMing-EL.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 300;
  src: url("../output/FanWunMing-L.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 400;
  src: url("../output/FanWunMing-R.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 500;
  src: url("../output/FanWunMing-M.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 600;
  src: url("../output/FanWunMing-SB.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 700;
  src: url("../output/FanWunMing-B.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 900;
  src: url("../output/FanWunMing-H.ttf") format("truetype");
}

/* The same seven weights for the Taiwan-variant (TW) builds. */
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 100;
  src: url("../output/FanWunMing-TW-EL.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 300;
  src: url("../output/FanWunMing-TW-L.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 400;
  src: url("../output/FanWunMing-TW-R.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 500;
  src: url("../output/FanWunMing-TW-M.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 600;
  src: url("../output/FanWunMing-TW-SB.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 700;
  src: url("../output/FanWunMing-TW-B.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 900;
  src: url("../output/FanWunMing-TW-H.ttf") format("truetype");
}

/* English text uses the default sans-serif; Simplified Chinese samples use
   the converted fonts (the TW variant when the .tw class is present). */
:lang(en) { font-family: sans-serif; } /* fixed: was "sans-serif, sans-serif" (duplicated fallback) */
:lang(zh-CN) { font-family: 'FanWunMing-Test', serif; }
:lang(zh-CN).tw { font-family: 'FanWunMing-Test-TW', serif; }

/* Helper classes to exercise each available weight. */
.w100 { font-weight: 100; }
.w300 { font-weight: 300; }
.w400 { font-weight: 400; }
.w500 { font-weight: 500; }
.w600 { font-weight: 600; }
.w700 { font-weight: 700; }
.w900 { font-weight: 900; }
Loading

0 comments on commit f5e7edf

Please sign in to comment.