From 32fbb581e5a82dede4ed1b97d0378b6f2760f6a0 Mon Sep 17 00:00:00 2001 From: Ayaka Mikazuki Date: Sun, 22 Nov 2020 22:26:34 +0800 Subject: [PATCH] Update character range --- .github/workflows/build.yml | 23 ++++++++-- .gitignore | 1 + build/code_points_han.py | 32 +++++++++++++ build/convert_tables.py | 68 +++++++++++++++++++++++++++ build/{main.py => font.py} | 91 ++++++++++++------------------------- build/prepare.sh | 8 ---- build/t2twp.json | 6 +-- test/index.css | 83 +++++++++++++++++++++++++++++++++ test/index.html | 91 +++++++++++++++++++++++++++++++++++++ 9 files changed, 327 insertions(+), 76 deletions(-) create mode 100644 build/code_points_han.py create mode 100644 build/convert_tables.py rename build/{main.py => font.py} (81%) delete mode 100755 build/prepare.sh create mode 100644 test/index.css create mode 100644 test/index.html diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 89364e7..d35832c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,11 +19,28 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install -r requirements.txt + pip install -r requirements.txt + - name: Create directories + run: mkdir -p opencc_data cache output + - name: Download OpenCC data + run: | + cd opencc_data + curl -LsSZ --remote-name-all https://cdn.jsdelivr.net/npm/opencc-data@1.0.5/data/{STCharacters.txt,STPhrases.txt,TWPhrasesIT.txt,TWPhrasesName.txt,TWPhrasesOther.txt,TWVariants.txt,HKVariants.txt} + cat TWPhrasesIT.txt TWPhrasesName.txt TWPhrasesOther.txt > TWPhrases.txt + - name: Download GenYoMin font + run: | + cd cache + curl -LsSO https://github.com/ButTaiwan/genyo-font/releases/download/v1.501/GenYoMin.zip + unzip -q -n GenYoMin.zip "*.ttc" + - name: Download character list + run: | + cd cache + curl -LsSo 通用規範漢字表.txt 
https://raw.githubusercontent.com/rime-aca/character_set/e7d009a8a185a83f62ad2c903565b8bb85719221/%E9%80%9A%E7%94%A8%E8%A6%8F%E7%AF%84%E6%BC%A2%E5%AD%97%E8%A1%A8.txt - name: Build run: | - build/prepare.sh - python build/main.py + python build/convert_tables.py + python build/code_points_han.py + python build/font.py - name: Copy license file run: cp LICENSE output - name: Upload font files diff --git a/.gitignore b/.gitignore index ba20042..bb3fd3a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .vscode .mypy_cache +/opencc_data /cache /output diff --git a/build/code_points_han.py b/build/code_points_han.py new file mode 100644 index 0000000..4bf1333 --- /dev/null +++ b/build/code_points_han.py @@ -0,0 +1,32 @@ +# Code points of Han characters to be included: +# 1. Code points in Tongyong Guifan Hanzi Biao (通用规范汉字表) +# 2. Code points in OpenCC dictionaries + +from itertools import chain + +s = set() + +with open('cache/通用規範漢字表.txt') as f: + for line in f: + if line and not line.startswith('#'): + c = line[0] + s.add(ord(c)) + +with open('opencc_data/STCharacters.txt') as f1, \ +open('opencc_data/STPhrases.txt') as f2, \ +open('opencc_data/TWVariants.txt') as f3, \ +open('opencc_data/TWPhrases.txt') as f4, \ +open('opencc_data/HKVariants.txt') as f5: + for line in chain(f1, f2, f3, f4, f5): + k, vx = line.rstrip('\n').split('\t') + vs = vx.split(' ') + for c in k: + s.add(ord(c)) + for v in vs: + for c in v: + s.add(ord(c)) + +with open('cache/code_points_han.txt', 'w') as f: + for cp in sorted(s): + if cp > 128: # remove letters in the dictionaries + print(cp, file=f) diff --git a/build/convert_tables.py b/build/convert_tables.py new file mode 100644 index 0000000..901da91 --- /dev/null +++ b/build/convert_tables.py @@ -0,0 +1,68 @@ +# Build convert tables +# +# Input: +# - build/t2twp.json +# -> opencc_data/TWPhrases.txt +# -> opencc_data/TWVariants.txt +# - opencc_data/STCharacters.txt +# - opencc_data/STPhrases.txt +# - opencc_data/TWVariants.txt +# - 
opencc_data/TWPhrases.txt +# +# Output: +# - cache/convert_table_words.txt +# - cache/convert_table_chars.txt +# - cache/convert_table_words_twp.txt +# - cache/convert_table_chars_twp.txt + +from opencc import OpenCC + +def build_entries(twp=False): + with open('opencc_data/STCharacters.txt') as f: # s2t + for line in f: + k, vx = line.rstrip('\n').split('\t') + v = vx.split(' ')[0] # Only select the first candidate + v = t2twp(v) if twp else v # s2t -> s2twp + yield k, v + + with open('opencc_data/STPhrases.txt') as f: # s2t + for line in f: + k, vx = line.rstrip('\n').split('\t') + v = vx.split(' ')[0] # Only select the first candidate + v = t2twp(v) if twp else v # s2t -> s2twp + yield k, v + + if twp: + with open('opencc_data/TWVariants.txt') as f: # t2tw + for line in f: + k, vx = line.rstrip('\n').split('\t') + v = vx.split(' ')[0] # Only select the first candidate + k = t2s(k) # t2tw -> s2tw + yield k, v + + with open('opencc_data/TWPhrases.txt') as f: # t2twp + for line in f: + k, vx = line.rstrip('\n').split('\t') + v = vx.split(' ')[0] # Only select the first candidate + k = t2s(k) # t2twp -> s2twp + yield k, v + +def go(twp=False): + entries = build_entries(twp=twp) + entries = dict(entries) # remove duplicates + entries = sorted(entries.items(), key=lambda k_v: (len(k_v[0]), k_v[0]), reverse=True) # sort + + twp_suffix = '_twp' if twp else '' + + with open(f'cache/convert_table_words{twp_suffix}.txt', 'w') as f1, \ + open(f'cache/convert_table_chars{twp_suffix}.txt', 'w') as f2: + for k, v in entries: + print(k, v, sep='\t', file=f1 if len(k) > 1 else f2) + +if __name__ == '__main__': + # Initialize OpenCC converters + t2s = OpenCC('t2s').convert + t2twp = OpenCC('./build/t2twp').convert + + go() + go(twp=True) diff --git a/build/main.py b/build/font.py similarity index 81% rename from build/main.py rename to build/font.py index e02499c..3302a9f 100644 --- a/build/main.py +++ b/build/font.py @@ -3,7 +3,6 @@ from glob import glob from itertools import 
chain, groupby import json -from opencc import OpenCC import os import subprocess @@ -83,10 +82,13 @@ def get_glyph_count(obj): '''Get the total numbers of glyph in a font.''' return len(obj['glyph_order']) -def build_codepoints_tonggui(): - '''Build a set of all the codepoints in Tongyong Guifan Hanzi Biao (通用规范汉字表).''' - with open('cache/通用規範漢字表.txt') as f: - return {ord(line[0]) for line in f if line and not line.startswith('#')} +def build_codepoints_han(): + '''Build a set of codepoints of Han characters to be included.''' + with open('cache/code_points_han.txt') as f: + s = set() + for line in f: + s.add(int(line)) + return s def build_codepoints_font(obj): '''Build a set of all the codepoints in a font.''' @@ -110,62 +112,36 @@ def build_codepoints_non_han(): range(0xFF61, 0xFF64 + 1), )) -# We restrict the Simplified Chinese characters (on the left side of the OpenCC dictionary -# file) to the range of Tongyong Guifan Hanzi Biao, and discard those conversions that are -# out of range. The remained conversions are stored in the entries variable. -# -# Then we calculate the range of “Which Traditional Chinese characters are needed if we -# convert Tongyong Guifan Hanzi Biao to Traditional Chinese”. The range is stored in the -# codepoints variable. 
-def build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=False): +def build_opencc_char_table(codepoints_font, twp=False): entries = [] - codepoints = set() + twp_suffix = '_twp' if twp else '' - with open('cache/STCharacters.txt') as f: # s2t + with open(f'cache/convert_table_chars{twp_suffix}.txt') as f: for line in f: - k, vx = line.rstrip('\n').split('\t') - v = vx.split(' ')[0] # Only select the first candidate - v = t2twp(v) if twp else v # s2t -> s2twp + k, v = line.rstrip('\n').split('\t') codepoint_k = ord(k) codepoint_v = ord(v) - if codepoint_k in codepoints_tonggui and codepoint_v in codepoints_font: + if codepoint_k in codepoints_font \ + and codepoint_v in codepoints_font: # TODO FIXME: codepoint_k in codepoints_font should be unnecessary entries.append((codepoint_k, codepoint_v)) - codepoints.add(codepoint_v) - return entries, codepoints + return entries -def build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=False): - entries = {} - codepoints = set() +def build_opencc_word_table(codepoints_font, twp=False): + entries = [] + twp_suffix = '_twp' if twp else '' - with open('cache/STPhrases.txt') as f: # s2t + with open(f'cache/convert_table_words{twp_suffix}.txt') as f: for line in f: - k, vx = line.rstrip('\n').split('\t') - v = vx.split(' ')[0] # Only select the first candidate - v = t2twp(v) if twp else v # s2t -> s2twp + k, v = line.rstrip('\n').split('\t') codepoints_k = tuple(ord(c) for c in k) codepoints_v = tuple(ord(c) for c in v) - if all(codepoint in codepoints_tonggui for codepoint in codepoints_k) \ - and all(codepoint in codepoints_font for codepoint in codepoints_v): - entries[codepoints_k] = codepoints_v - codepoints.update(codepoints_v) - - if twp: - with open('cache/TWPhrases.txt') as f: # t2twp - for line in f: - k, vx = line.rstrip('\n').split('\t') - v = vx.split(' ')[0] # Only select the first candidate - k = t2s(k) # t2twp -> s2twp - codepoints_k = tuple(ord(c) for c in k) - codepoints_v = tuple(ord(c) 
for c in v) - if all(codepoint in codepoints_tonggui for codepoint in codepoints_k) \ - and all(codepoint in codepoints_font for codepoint in codepoints_v): - entries[codepoints_k] = codepoints_v - codepoints.update(codepoints_v) - - # Sort from longest to shortest to force longest match - conversion_item_len = lambda conversion_item: len(conversion_item[0]) - return sorted(entries.items(), key=conversion_item_len, reverse=True), codepoints + if all(codepoint in codepoints_font for codepoint in codepoints_k) \ + and all(codepoint in codepoints_font for codepoint in codepoints_v): # TODO FIXME: the first line should be unnecessary + entries.append((codepoints_k, codepoints_v)) + + # The entries are already sorted from longest to shortest to force longest match + return entries def disassociate_codepoint_and_glyph_name(obj, codepoint, glyph_name): ''' @@ -381,20 +357,15 @@ def build_dest_path_from_src_path(path, twp=False): .replace('ttc', 'ttf') def go(path, twp=False): - font = load_font(path, ttc_index=0) + font = load_font(path, ttc_index=0) # `ttc_index` 0: GenYoMin-TW; 1: GenYoMin-JP # Determine the final Unicode range by the original font and OpenCC convert tables codepoints_font = build_codepoints_font(font) - codepoints_tonggui = build_codepoints_tonggui() & codepoints_font + entries_char = build_opencc_char_table(codepoints_font, twp=twp) + entries_word = build_opencc_word_table(codepoints_font, twp=twp) - codepoints_final = codepoints_tonggui | build_codepoints_non_han() & codepoints_font - - entries_char, codepoints_char = build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=twp) - codepoints_final |= codepoints_char - - entries_word, codepoints_word = build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=twp) - codepoints_final |= codepoints_word + codepoints_final = (build_codepoints_non_han() | build_codepoints_han()) & codepoints_font remove_codepoints(font, codepoints_font - codepoints_final) clean_unused_glyphs(font) @@ 
-431,10 +402,6 @@ def go(path, twp=False): save_font(font, build_dest_path_from_src_path(path, twp=twp)) if __name__ == '__main__': - # Initialize OpenCC converters - t2s = OpenCC('t2s').convert - t2twp = OpenCC('./build/t2twp').convert - for path in glob('cache/GenYoMin-*.ttc'): go(path) go(path, twp=True) diff --git a/build/prepare.sh b/build/prepare.sh deleted file mode 100755 index bf22a85..0000000 --- a/build/prepare.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -mkdir -p cache output -cd cache -curl -LsSO https://github.com/ButTaiwan/genyo-font/releases/download/v1.501/GenYoMin.zip -curl -LsSZ --remote-name-all https://cdn.jsdelivr.net/npm/opencc-data@1.0.5/data/{STCharacters.txt,STPhrases.txt,TWPhrasesIT.txt,TWPhrasesName.txt,TWPhrasesOther.txt,TWVariants.txt} -curl -LsSo 通用規範漢字表.txt https://raw.githubusercontent.com/rime-aca/character_set/e7d009a8a185a83f62ad2c903565b8bb85719221/%E9%80%9A%E7%94%A8%E8%A6%8F%E7%AF%84%E6%BC%A2%E5%AD%97%E8%A1%A8.txt -cat TWPhrasesIT.txt TWPhrasesName.txt TWPhrasesOther.txt > TWPhrases.txt -unzip -q -n GenYoMin.zip "*.ttc" diff --git a/build/t2twp.json b/build/t2twp.json index d826efe..af2fc66 100644 --- a/build/t2twp.json +++ b/build/t2twp.json @@ -4,7 +4,7 @@ "type": "mmseg", "dict": { "type": "text", - "file": "../cache/TWPhrases.txt" + "file": "../opencc_data/TWPhrases.txt" } }, "conversion_chain": [{ @@ -12,10 +12,10 @@ "type": "group", "dicts": [{ "type": "text", - "file": "../cache/TWPhrases.txt" + "file": "../opencc_data/TWPhrases.txt" }, { "type": "text", - "file": "../cache/TWVariants.txt" + "file": "../opencc_data/TWVariants.txt" }] } }] diff --git a/test/index.css b/test/index.css new file mode 100644 index 0000000..f64285a --- /dev/null +++ b/test/index.css @@ -0,0 +1,83 @@ +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 100; + src: url("../output/FanWunMing-EL.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 300; + src: url("../output/FanWunMing-L.ttf") 
format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 400; + src: url("../output/FanWunMing-R.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 500; + src: url("../output/FanWunMing-M.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 600; + src: url("../output/FanWunMing-SB.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 700; + src: url("../output/FanWunMing-B.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 900; + src: url("../output/FanWunMing-H.ttf") format("truetype"); +} + +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 100; + src: url("../output/FanWunMing-TW-EL.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 300; + src: url("../output/FanWunMing-TW-L.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 400; + src: url("../output/FanWunMing-TW-R.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 500; + src: url("../output/FanWunMing-TW-M.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 600; + src: url("../output/FanWunMing-TW-SB.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 700; + src: url("../output/FanWunMing-TW-B.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 900; + src: url("../output/FanWunMing-TW-H.ttf") format("truetype"); +} + +:lang(en) { font-family: sans-serif, sans-serif; } +:lang(zh-CN) { font-family: 'FanWunMing-Test', serif; } +:lang(zh-CN).tw { font-family: 'FanWunMing-Test-TW', serif; } + +.w100 { font-weight: 100; } +.w300 { font-weight: 300; } +.w400 { font-weight: 400; } +.w500 { font-weight: 500; } +.w600 { font-weight: 600; } +.w700 { font-weight: 
700; } +.w900 { font-weight: 900; } diff --git a/test/index.html b/test/index.html new file mode 100644 index 0000000..273d918 --- /dev/null +++ b/test/index.html @@ -0,0 +1,91 @@ + + + + + + + 繁媛明朝字体测试页 + + + +

繁媛明朝字体测试页

+ +

测试基本功能

+

错综复杂

+

错综复杂

+ +

测试一简对多繁

+

松赞干布

+

松赞干布

+

夸夸其谈

+

夸夸其谈

+

夸父逐日

+

夸父逐日

+

之子于归,远送于野

+

之子于归,远送于野

+

我干什么不干你事

+

我干什么不干你事

+

我发现太后的头发很干燥

+

我发现太后的头发很干燥

+

赞叹沙河涌汹涌的波浪

+

赞叹沙河涌汹涌的波浪

+

经理发现理发的人不多

+

经理发现理发的人不多

+ +

测试正向最长匹配

+

下面

+

下面

+

下面条

+

下面条

+ +

测试台湾字词转换

+

在搜索字段使用通配符

+

在搜索字段使用通配符

+

开放源代码的简转繁字体

+

开放源代码的简转繁字体

+

鼠标里面的硅二极管坏了,导致光标分辨率降低

+

鼠标里面的硅二极管坏了,导致光标分辨率降低

+

我们在老挝的服务器的硬盘需要使用互联网算法软件解决异步的问题

+

我们在老挝的服务器的硬盘需要使用互联网算法软件解决异步的问题

+

为什么你在床里面睡着?

+

为什么你在床里面睡着?

+

台式机

+

台式机

+

着装污染虚伪发泄棱柱群众里面

+

着装污染虚伪发泄棱柱群众里面

+

内存

+

内存

+

海内存知己

+

海内存知己

+ +

测试收字范围 (#3)

+

+

+

+

+ +

测试非简体字 (#7)

+

+

+

+

+

+

+ +

测试七种字重

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+ +