Update character range #10

Merged 1 commit on Nov 22, 2020
23 changes: 20 additions & 3 deletions .github/workflows/build.yml
@@ -19,11 +19,28 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install -r requirements.txt
- name: Create directories
run: mkdir -p opencc_data cache output
- name: Download OpenCC data
run: |
cd opencc_data
curl -LsSZ --remote-name-all https://cdn.jsdelivr.net/npm/opencc-data@1.0.5/data/{STCharacters.txt,STPhrases.txt,TWPhrasesIT.txt,TWPhrasesName.txt,TWPhrasesOther.txt,TWVariants.txt,HKVariants.txt}
cat TWPhrasesIT.txt TWPhrasesName.txt TWPhrasesOther.txt > TWPhrases.txt
- name: Download GenYoMin font
run: |
cd cache
curl -LsSO https://github.com/ButTaiwan/genyo-font/releases/download/v1.501/GenYoMin.zip
unzip -q -n GenYoMin.zip "*.ttc"
- name: Download character list
run: |
cd cache
curl -LsSo 通用規範漢字表.txt https://github.com/rime-aca/character_set/e7d009a8a185a83f62ad2c903565b8bb85719221/%E9%80%9A%E7%94%A8%E8%A6%8F%E7%AF%84%E6%BC%A2%E5%AD%97%E8%A1%A8.txt
- name: Build
run: |
build/prepare.sh
python build/main.py
python build/convert_tables.py
python build/code_points_han.py
python build/font.py
- name: Copy license file
run: cp LICENSE output
- name: Upload font files
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
.vscode
.mypy_cache
/opencc_data
/cache
/output
32 changes: 32 additions & 0 deletions build/code_points_han.py
@@ -0,0 +1,32 @@
# Code points of Han characters to be included:
# 1. Code points in Tongyong Guifan Hanzi Biao (通用规范汉字表)
# 2. Code points in OpenCC dictionaries

from itertools import chain

s = set()

with open('cache/通用規範漢字表.txt') as f:
for line in f:
if line and not line.startswith('#'):
c = line[0]
s.add(ord(c))

with open('opencc_data/STCharacters.txt') as f1, \
open('opencc_data/STPhrases.txt') as f2, \
open('opencc_data/TWVariants.txt') as f3, \
open('opencc_data/TWPhrases.txt') as f4, \
open('opencc_data/HKVariants.txt') as f5:
for line in chain(f1, f2, f3, f4, f5):
k, vx = line.rstrip('\n').split('\t')
vs = vx.split(' ')
for c in k:
s.add(ord(c))
for v in vs:
for c in v:
s.add(ord(c))

with open('cache/code_points_han.txt', 'w') as f:
for cp in sorted(s):
if cp > 128: # remove letters in the dictionaries
print(cp, file=f)
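
A minimal sketch, assuming the format produced by the loop above (cache/code_points_han.txt holds one decimal Unicode code point per line), of how a consumer could read the list back into a set of characters; the helper name is illustrative and not part of this change:

# Illustrative only: reads cache/code_points_han.txt back into a set of characters.
# Assumes the format written above: one decimal Unicode code point per line.
def load_han_chars(path='cache/code_points_han.txt'):
    with open(path) as f:
        return {chr(int(line)) for line in f if line.strip()}

# e.g. '汉' in load_han_chars()  ->  True if U+6C49 made it into the list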
68 changes: 68 additions & 0 deletions build/convert_tables.py
@@ -0,0 +1,68 @@
# Build convert tables
#
# Input:
# - build/t2twp.json
# -> opencc_data/TWPhrases.txt
# -> opencc_data/TWVariants.txt
# - opencc_data/STCharacters.txt
# - opencc_data/STPhrases.txt
# - opencc_data/TWVariants.txt
# - opencc_data/TWPhrases.txt
#
# Output:
# - cache/convert_table_words.txt
# - cache/convert_table_chars.txt
# - cache/convert_table_words_twp.txt
# - cache/convert_table_chars_twp.txt

from opencc import OpenCC

def build_entries(twp=False):
with open('opencc_data/STCharacters.txt') as f: # s2t
for line in f:
k, vx = line.rstrip('\n').split('\t')
v = vx.split(' ')[0] # Only select the first candidate
v = t2twp(v) if twp else v # s2t -> s2twp
yield k, v

with open('opencc_data/STPhrases.txt') as f: # s2t
for line in f:
k, vx = line.rstrip('\n').split('\t')
v = vx.split(' ')[0] # Only select the first candidate
v = t2twp(v) if twp else v # s2t -> s2twp
yield k, v

if twp:
with open('opencc_data/TWVariants.txt') as f: # t2tw
for line in f:
k, vx = line.rstrip('\n').split('\t')
v = vx.split(' ')[0] # Only select the first candidate
k = t2s(k) # t2tw -> s2tw
yield k, v

with open('opencc_data/TWPhrases.txt') as f: # t2twp
for line in f:
k, vx = line.rstrip('\n').split('\t')
v = vx.split(' ')[0] # Only select the first candidate
k = t2s(k) # t2twp -> s2twp
yield k, v

def go(twp=False):
entries = build_entries(twp=twp)
entries = dict(entries) # remove duplicates
entries = sorted(entries.items(), key=lambda k_v: (len(k_v[0]), k_v[0]), reverse=True) # sort

twp_suffix = '_twp' if twp else ''

with open(f'cache/convert_table_words{twp_suffix}.txt', 'w') as f1, \
open(f'cache/convert_table_chars{twp_suffix}.txt', 'w') as f2:
for k, v in entries:
print(k, v, sep='\t', file=f1 if len(k) > 1 else f2)

if __name__ == '__main__':
# Initialize OpenCC converters
t2s = OpenCC('t2s').convert
t2twp = OpenCC('./build/t2twp').convert

go()
go(twp=True)
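
Each generated table holds one tab-separated simplified-to-traditional pair per line, and the sort in go() puts longer keys first so downstream longest-match substitution works. A minimal sketch (assumed helper, not part of this change) that loads a table and checks that ordering:

# Illustrative only: load a generated convert table and verify the
# longest-key-first ordering produced by the sort in go().
def load_table(path):
    with open(path) as f:
        return [line.rstrip('\n').split('\t') for line in f]

# pairs = load_table('cache/convert_table_words.txt')
# assert all(len(a[0]) >= len(b[0]) for a, b in zip(pairs, pairs[1:]))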
91 changes: 29 additions & 62 deletions build/main.py → build/font.py
@@ -3,7 +3,6 @@
from glob import glob
from itertools import chain, groupby
import json
from opencc import OpenCC
import os
import subprocess

@@ -83,10 +82,13 @@ def get_glyph_count(obj):
'''Get the total number of glyphs in a font.'''
return len(obj['glyph_order'])

def build_codepoints_tonggui():
'''Build a set of all the codepoints in Tongyong Guifan Hanzi Biao (通用规范汉字表).'''
with open('cache/通用規範漢字表.txt') as f:
return {ord(line[0]) for line in f if line and not line.startswith('#')}
def build_codepoints_han():
'''Build a set of codepoints of Han characters to be included.'''
with open('cache/code_points_han.txt') as f:
s = set()
for line in f:
s.add(int(line))
return s

def build_codepoints_font(obj):
'''Build a set of all the codepoints in a font.'''
@@ -110,62 +112,36 @@ def build_codepoints_non_han():
range(0xFF61, 0xFF64 + 1),
))

# We restrict the Simplified Chinese characters (on the left side of the OpenCC dictionary
# file) to the range of Tongyong Guifan Hanzi Biao, and discard those conversions that are
# out of range. The remaining conversions are stored in the entries variable.
#
# Then we calculate the range of “Which Traditional Chinese characters are needed if we
# convert Tongyong Guifan Hanzi Biao to Traditional Chinese”. The range is stored in the
# codepoints variable.
def build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=False):
def build_opencc_char_table(codepoints_font, twp=False):
entries = []
codepoints = set()
twp_suffix = '_twp' if twp else ''

with open('cache/STCharacters.txt') as f: # s2t
with open(f'cache/convert_table_chars{twp_suffix}.txt') as f:
for line in f:
k, vx = line.rstrip('\n').split('\t')
v = vx.split(' ')[0] # Only select the first candidate
v = t2twp(v) if twp else v # s2t -> s2twp
k, v = line.rstrip('\n').split('\t')
codepoint_k = ord(k)
codepoint_v = ord(v)
if codepoint_k in codepoints_tonggui and codepoint_v in codepoints_font:
if codepoint_k in codepoints_font \
and codepoint_v in codepoints_font: # TODO FIXME: codepoint_k in codepoints_font should be unnecessary
entries.append((codepoint_k, codepoint_v))
codepoints.add(codepoint_v)

return entries, codepoints
return entries

def build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=False):
entries = {}
codepoints = set()
def build_opencc_word_table(codepoints_font, twp=False):
entries = []
twp_suffix = '_twp' if twp else ''

with open('cache/STPhrases.txt') as f: # s2t
with open(f'cache/convert_table_words{twp_suffix}.txt') as f:
for line in f:
k, vx = line.rstrip('\n').split('\t')
v = vx.split(' ')[0] # Only select the first candidate
v = t2twp(v) if twp else v # s2t -> s2twp
k, v = line.rstrip('\n').split('\t')
codepoints_k = tuple(ord(c) for c in k)
codepoints_v = tuple(ord(c) for c in v)
if all(codepoint in codepoints_tonggui for codepoint in codepoints_k) \
and all(codepoint in codepoints_font for codepoint in codepoints_v):
entries[codepoints_k] = codepoints_v
codepoints.update(codepoints_v)

if twp:
with open('cache/TWPhrases.txt') as f: # t2twp
for line in f:
k, vx = line.rstrip('\n').split('\t')
v = vx.split(' ')[0] # Only select the first candidate
k = t2s(k) # t2twp -> s2twp
codepoints_k = tuple(ord(c) for c in k)
codepoints_v = tuple(ord(c) for c in v)
if all(codepoint in codepoints_tonggui for codepoint in codepoints_k) \
and all(codepoint in codepoints_font for codepoint in codepoints_v):
entries[codepoints_k] = codepoints_v
codepoints.update(codepoints_v)

# Sort from longest to shortest to force longest match
conversion_item_len = lambda conversion_item: len(conversion_item[0])
return sorted(entries.items(), key=conversion_item_len, reverse=True), codepoints
if all(codepoint in codepoints_font for codepoint in codepoints_k) \
and all(codepoint in codepoints_font for codepoint in codepoints_v): # TODO FIXME: the first line should be unnecessary
entries.append((codepoints_k, codepoints_v))

# The entries are already sorted from longest to shortest to force longest match
return entries

def disassociate_codepoint_and_glyph_name(obj, codepoint, glyph_name):
'''
Expand Down Expand Up @@ -381,20 +357,15 @@ def build_dest_path_from_src_path(path, twp=False):
.replace('ttc', 'ttf')

def go(path, twp=False):
font = load_font(path, ttc_index=0)
font = load_font(path, ttc_index=0) # `ttc_index` 0: GenYoMin-TW; 1: GenYoMin-JP

# Determine the final Unicode range by the original font and OpenCC convert tables

codepoints_font = build_codepoints_font(font)
codepoints_tonggui = build_codepoints_tonggui() & codepoints_font
entries_char = build_opencc_char_table(codepoints_font, twp=twp)
entries_word = build_opencc_word_table(codepoints_font, twp=twp)

codepoints_final = codepoints_tonggui | build_codepoints_non_han() & codepoints_font

entries_char, codepoints_char = build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=twp)
codepoints_final |= codepoints_char

entries_word, codepoints_word = build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=twp)
codepoints_final |= codepoints_word
codepoints_final = (build_codepoints_non_han() | build_codepoints_han()) & codepoints_font

remove_codepoints(font, codepoints_font - codepoints_final)
clean_unused_glyphs(font)
@@ -431,10 +402,6 @@ def go(path, twp=False):
save_font(font, build_dest_path_from_src_path(path, twp=twp))

if __name__ == '__main__':
# Initialize OpenCC converters
t2s = OpenCC('t2s').convert
t2twp = OpenCC('./build/t2twp').convert

for path in glob('cache/GenYoMin-*.ttc'):
go(path)
go(path, twp=True)
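
The new codepoint selection in go() is plain set algebra: union the non-Han ranges with the Han list from cache/code_points_han.txt, then intersect with what the source font actually covers. A tiny illustration with made-up values (not real code points from GenYoMin):

# Illustrative values only
non_han = {0x3000, 0x3001}
han = {0x6C49, 0x6F22, 0x9FFF}
font = {0x3000, 0x6C49, 0x6F22, 0x0041}
codepoints_final = (non_han | han) & font  # union, then restrict to glyphs in the font
# codepoints_final == {0x3000, 0x6C49, 0x6F22}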
8 changes: 0 additions & 8 deletions build/prepare.sh

This file was deleted.

6 changes: 3 additions & 3 deletions build/t2twp.json
@@ -4,18 +4,18 @@
"type": "mmseg",
"dict": {
"type": "text",
"file": "../cache/TWPhrases.txt"
"file": "../opencc_data/TWPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "text",
"file": "../cache/TWPhrases.txt"
"file": "../opencc_data/TWPhrases.txt"
}, {
"type": "text",
"file": "../cache/TWVariants.txt"
"file": "../opencc_data/TWVariants.txt"
}]
}
}]
83 changes: 83 additions & 0 deletions test/index.css
@@ -0,0 +1,83 @@
@font-face {
font-family: "FanWunMing-Test";
font-weight: 100;
src: url("../output/FanWunMing-EL.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test";
font-weight: 300;
src: url("../output/FanWunMing-L.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test";
font-weight: 400;
src: url("../output/FanWunMing-R.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test";
font-weight: 500;
src: url("../output/FanWunMing-M.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test";
font-weight: 600;
src: url("../output/FanWunMing-SB.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test";
font-weight: 700;
src: url("../output/FanWunMing-B.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test";
font-weight: 900;
src: url("../output/FanWunMing-H.ttf") format("truetype");
}

@font-face {
font-family: "FanWunMing-Test-TW";
font-weight: 100;
src: url("../output/FanWunMing-TW-EL.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test-TW";
font-weight: 300;
src: url("../output/FanWunMing-TW-L.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test-TW";
font-weight: 400;
src: url("../output/FanWunMing-TW-R.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test-TW";
font-weight: 500;
src: url("../output/FanWunMing-TW-M.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test-TW";
font-weight: 600;
src: url("../output/FanWunMing-TW-SB.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test-TW";
font-weight: 700;
src: url("../output/FanWunMing-TW-B.ttf") format("truetype");
}
@font-face {
font-family: "FanWunMing-Test-TW";
font-weight: 900;
src: url("../output/FanWunMing-TW-H.ttf") format("truetype");
}

:lang(en) { font-family: sans-serif, sans-serif; }
:lang(zh-CN) { font-family: 'FanWunMing-Test', serif; }
:lang(zh-CN).tw { font-family: 'FanWunMing-Test-TW', serif; }

.w100 { font-weight: 100; }
.w300 { font-weight: 300; }
.w400 { font-weight: 400; }
.w500 { font-weight: 500; }
.w600 { font-weight: 600; }
.w700 { font-weight: 700; }
.w900 { font-weight: 900; }