Skip to content

Commit

Permalink
Update character range (#10)
Browse files Browse the repository at this point in the history
  • Loading branch information
ayaka14732 committed Nov 22, 2020
1 parent a4df4f1 commit f5e7edf
Show file tree
Hide file tree
Showing 9 changed files with 327 additions and 76 deletions.
23 changes: 20 additions & 3 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,28 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install -r requirements.txt
- name: Create directories
run: mkdir -p opencc_data cache output
- name: Download OpenCC data
run: |
cd opencc_data
curl -LsSZ --remote-name-all https://cdn.jsdelivr.net/npm/opencc-data@1.0.5/data/{STCharacters.txt,STPhrases.txt,TWPhrasesIT.txt,TWPhrasesName.txt,TWPhrasesOther.txt,TWVariants.txt,HKVariants.txt}
cat TWPhrasesIT.txt TWPhrasesName.txt TWPhrasesOther.txt > TWPhrases.txt
- name: Download GenYoMin font
run: |
cd cache
curl -LsSO https://github.com/ButTaiwan/genyo-font/releases/download/v1.501/GenYoMin.zip
unzip -q -n GenYoMin.zip "*.ttc"
- name: Download character list
run: |
cd cache
curl -LsSo 通用規範漢字表.txt https://github.com/rime-aca/character_set/e7d009a8a185a83f62ad2c903565b8bb85719221/%E9%80%9A%E7%94%A8%E8%A6%8F%E7%AF%84%E6%BC%A2%E5%AD%97%E8%A1%A8.txt
- name: Build
run: |
build/prepare.sh
python build/main.py
python build/convert_tables.py
python build/code_points_han.py
python build/font.py
- name: Copy license file
run: cp LICENSE output
- name: Upload font files
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.vscode
.mypy_cache
/opencc_data
/cache
/output
32 changes: 32 additions & 0 deletions build/code_points_han.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Code points of Han characters to be included:
# 1. Code points in Tongyong Guifan Hanzi Biao (通用规范汉字表)
# 2. Code points in OpenCC dictionaries

from itertools import chain

code_points = set()

# Characters from Tongyong Guifan Hanzi Biao: the character of interest is
# the first character of every non-comment line.
with open('cache/通用規範漢字表.txt') as f:
    code_points.update(ord(line[0]) for line in f if line and not line.startswith('#'))

# Characters from the OpenCC dictionaries. Every line has the form
# "<key>\t<candidate> <candidate> ...", and every character on both sides
# of the tab is collected.
dictionary_paths = (
    'opencc_data/STCharacters.txt',
    'opencc_data/STPhrases.txt',
    'opencc_data/TWVariants.txt',
    'opencc_data/TWPhrases.txt',
    'opencc_data/HKVariants.txt',
)
for path in dictionary_paths:
    with open(path) as f:
        for line in f:
            key, candidates = line.rstrip('\n').split('\t')
            for text in chain([key], candidates.split(' ')):
                code_points.update(ord(c) for c in text)

# Persist one decimal code point per line, in ascending order.
with open('cache/code_points_han.txt', 'w') as f:
    for cp in sorted(code_points):
        if cp > 128:  # remove letters in the dictionaries
            print(cp, file=f)
68 changes: 68 additions & 0 deletions build/convert_tables.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Build convert tables
#
# Input:
# - build/t2twp.json
# -> opencc_data/TWPhrases.txt
# -> opencc_data/TWVariants.txt
# - opencc_data/STCharacters.txt
# - opencc_data/STPhrases.txt
# - opencc_data/TWVariants.txt
# - opencc_data/TWPhrases.txt
#
# Output:
# - cache/convert_table_words.txt
# - cache/convert_table_chars.txt
# - cache/convert_table_words_twp.txt
# - cache/convert_table_chars_twp.txt

from opencc import OpenCC

def _read_dict(path):
    '''Yield (key, first candidate) pairs from an OpenCC dictionary file.

    Each line has the form "<key>\\t<candidate> <candidate> ..."; only the
    first candidate is kept.
    '''
    with open(path) as f:
        for line in f:
            k, vx = line.rstrip('\n').split('\t')
            yield k, vx.split(' ')[0]  # Only select the first candidate

def build_entries(twp=False):
    '''Yield (Simplified, Traditional) conversion pairs.

    With ``twp=False`` the pairs implement s2t; with ``twp=True`` they
    implement s2twp (Taiwan standard plus Taiwanese phrase usage).

    NOTE(review): relies on the module-level converters ``t2s`` and
    ``t2twp``, which are only initialized in the ``__main__`` block —
    importing this module and calling build_entries() directly would raise
    NameError.
    '''
    for k, v in _read_dict('opencc_data/STCharacters.txt'):  # s2t
        yield k, (t2twp(v) if twp else v)  # s2t -> s2twp

    for k, v in _read_dict('opencc_data/STPhrases.txt'):  # s2t
        yield k, (t2twp(v) if twp else v)  # s2t -> s2twp

    if twp:
        for k, v in _read_dict('opencc_data/TWVariants.txt'):  # t2tw
            yield t2s(k), v  # t2tw -> s2tw

        for k, v in _read_dict('opencc_data/TWPhrases.txt'):  # t2twp
            yield t2s(k), v  # t2twp -> s2twp

def go(twp=False):
    '''Build the conversion tables and write them into the cache directory.

    Duplicate keys are collapsed (later entries win, as with ``dict``), and
    the result is sorted from longest key to shortest so that downstream
    longest-match conversion works. Multi-character keys go to the words
    table, single-character keys to the chars table.
    '''
    deduplicated = dict(build_entries(twp=twp))  # remove duplicates
    # Sort by (length, key) descending to force longest match.
    ordered = sorted(deduplicated.items(),
                     key=lambda item: (len(item[0]), item[0]),
                     reverse=True)

    suffix = '_twp' if twp else ''

    with open(f'cache/convert_table_words{suffix}.txt', 'w') as words_file, \
            open(f'cache/convert_table_chars{suffix}.txt', 'w') as chars_file:
        for key, value in ordered:
            destination = words_file if len(key) > 1 else chars_file
            print(key, value, sep='\t', file=destination)

if __name__ == '__main__':
    # Initialize OpenCC converters
    # t2s:   Traditional -> Simplified (stock OpenCC config)
    # t2twp: Traditional -> Taiwan standard with Taiwanese phrase usage
    #        (custom config at build/t2twp.json)
    # These are module-level names read by build_entries().
    t2s = OpenCC('t2s').convert
    t2twp = OpenCC('./build/t2twp').convert

    # Build the plain (s2t) tables, then the Taiwan-phrase (s2twp) tables.
    go()
    go(twp=True)
91 changes: 29 additions & 62 deletions build/main.py → build/font.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from glob import glob
from itertools import chain, groupby
import json
from opencc import OpenCC
import os
import subprocess

Expand Down Expand Up @@ -83,10 +82,13 @@ def get_glyph_count(obj):
'''Get the total numbers of glyph in a font.'''
return len(obj['glyph_order'])

def build_codepoints_tonggui():
'''Build a set of all the codepoints in Tongyong Guifan Hanzi Biao (通用规范汉字表).'''
with open('cache/通用規範漢字表.txt') as f:
return {ord(line[0]) for line in f if line and not line.startswith('#')}
def build_codepoints_han():
    '''Build the set of code points of Han characters to be included.

    Reads cache/code_points_han.txt (produced by build/code_points_han.py),
    which contains one decimal code point per line.
    '''
    with open('cache/code_points_han.txt') as f:
        # Set comprehension replaces the manual loop-and-add.
        return {int(line) for line in f}

def build_codepoints_font(obj):
'''Build a set of all the codepoints in a font.'''
Expand All @@ -110,62 +112,36 @@ def build_codepoints_non_han():
range(0xFF61, 0xFF64 + 1),
))

# We restrict the Simplified Chinese characters (on the left side of the OpenCC dictionary
# file) to the range of Tongyong Guifan Hanzi Biao, and discard those conversions that are
# out of range. The remained conversions are stored in the entries variable.
#
# Then we calculate the range of “Which Traditional Chinese characters are needed if we
# convert Tongyong Guifan Hanzi Biao to Traditional Chinese”. The range is stored in the
# codepoints variable.
def build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=False):
def build_opencc_char_table(codepoints_font, twp=False):
    '''Build the single-character conversion table.

    Reads cache/convert_table_chars[_twp].txt (produced by
    build/convert_tables.py) and returns a list of
    (source code point, target code point) pairs, keeping only entries
    whose characters exist in the font.
    '''
    entries = []
    twp_suffix = '_twp' if twp else ''

    with open(f'cache/convert_table_chars{twp_suffix}.txt') as f:
        for line in f:
            k, v = line.rstrip('\n').split('\t')
            codepoint_k = ord(k)
            codepoint_v = ord(v)
            if codepoint_k in codepoints_font \
                    and codepoint_v in codepoints_font:  # TODO FIXME: codepoint_k in codepoints_font should be unnecessary
                entries.append((codepoint_k, codepoint_v))

    return entries

def build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=False):
entries = {}
codepoints = set()
def build_opencc_word_table(codepoints_font, twp=False):
    '''Build the multi-character (word/phrase) conversion table.

    Reads cache/convert_table_words[_twp].txt (produced by
    build/convert_tables.py) and returns a list of
    (source code points tuple, target code points tuple) pairs, keeping only
    entries whose characters all exist in the font.
    '''
    entries = []
    twp_suffix = '_twp' if twp else ''

    with open(f'cache/convert_table_words{twp_suffix}.txt') as f:
        for line in f:
            k, v = line.rstrip('\n').split('\t')
            codepoints_k = tuple(ord(c) for c in k)
            codepoints_v = tuple(ord(c) for c in v)
            if all(codepoint in codepoints_font for codepoint in codepoints_k) \
                    and all(codepoint in codepoints_font for codepoint in codepoints_v):  # TODO FIXME: the first line should be unnecessary
                entries.append((codepoints_k, codepoints_v))

    # The entries are already sorted from longest to shortest to force longest match
    return entries

def disassociate_codepoint_and_glyph_name(obj, codepoint, glyph_name):
'''
Expand Down Expand Up @@ -381,20 +357,15 @@ def build_dest_path_from_src_path(path, twp=False):
.replace('ttc', 'ttf')

def go(path, twp=False):
font = load_font(path, ttc_index=0)
font = load_font(path, ttc_index=0) # `ttc_index` 0: GenYoMin-TW; 1: GenYoMin-JP

# Determine the final Unicode range by the original font and OpenCC convert tables

codepoints_font = build_codepoints_font(font)
codepoints_tonggui = build_codepoints_tonggui() & codepoints_font
entries_char = build_opencc_char_table(codepoints_font, twp=twp)
entries_word = build_opencc_word_table(codepoints_font, twp=twp)

codepoints_final = codepoints_tonggui | build_codepoints_non_han() & codepoints_font

entries_char, codepoints_char = build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=twp)
codepoints_final |= codepoints_char

entries_word, codepoints_word = build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=twp)
codepoints_final |= codepoints_word
codepoints_final = (build_codepoints_non_han() | build_codepoints_han()) & codepoints_font

remove_codepoints(font, codepoints_font - codepoints_final)
clean_unused_glyphs(font)
Expand Down Expand Up @@ -431,10 +402,6 @@ def go(path, twp=False):
save_font(font, build_dest_path_from_src_path(path, twp=twp))

if __name__ == '__main__':
    # Process every downloaded GenYoMin collection; each source .ttc
    # produces both a plain build and a Taiwan-phrase (twp) build.
    # (The OpenCC converter initialization that used to live here moved to
    # build/convert_tables.py.)
    for path in glob('cache/GenYoMin-*.ttc'):
        go(path)
        go(path, twp=True)
8 changes: 0 additions & 8 deletions build/prepare.sh

This file was deleted.

6 changes: 3 additions & 3 deletions build/t2twp.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,18 @@
"type": "mmseg",
"dict": {
"type": "text",
"file": "../cache/TWPhrases.txt"
"file": "../opencc_data/TWPhrases.txt"
}
},
"conversion_chain": [{
"dict": {
"type": "group",
"dicts": [{
"type": "text",
"file": "../cache/TWPhrases.txt"
"file": "../opencc_data/TWPhrases.txt"
}, {
"type": "text",
"file": "../cache/TWVariants.txt"
"file": "../opencc_data/TWVariants.txt"
}]
}
}]
Expand Down
83 changes: 83 additions & 0 deletions test/index.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
/* Seven weights of the converted FanWunMing font, registered under one
   family name so that `font-weight` alone selects the matching file. */
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 100;
  src: url("../output/FanWunMing-EL.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 300;
  src: url("../output/FanWunMing-L.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 400;
  src: url("../output/FanWunMing-R.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 500;
  src: url("../output/FanWunMing-M.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 600;
  src: url("../output/FanWunMing-SB.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 700;
  src: url("../output/FanWunMing-B.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test";
  font-weight: 900;
  src: url("../output/FanWunMing-H.ttf") format("truetype");
}

/* The same seven weights for the Taiwan-variant (TW) builds. */
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 100;
  src: url("../output/FanWunMing-TW-EL.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 300;
  src: url("../output/FanWunMing-TW-L.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 400;
  src: url("../output/FanWunMing-TW-R.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 500;
  src: url("../output/FanWunMing-TW-M.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 600;
  src: url("../output/FanWunMing-TW-SB.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 700;
  src: url("../output/FanWunMing-TW-B.ttf") format("truetype");
}
@font-face {
  font-family: "FanWunMing-Test-TW";
  font-weight: 900;
  src: url("../output/FanWunMing-TW-H.ttf") format("truetype");
}

/* English text uses the default sans-serif; Simplified Chinese samples use
   the converted fonts (the TW variant when the .tw class is present). */
:lang(en) { font-family: sans-serif; } /* fixed: was "sans-serif, sans-serif" (duplicated fallback) */
:lang(zh-CN) { font-family: 'FanWunMing-Test', serif; }
:lang(zh-CN).tw { font-family: 'FanWunMing-Test-TW', serif; }

/* Helper classes to exercise each available weight. */
.w100 { font-weight: 100; }
.w300 { font-weight: 300; }
.w400 { font-weight: 400; }
.w500 { font-weight: 500; }
.w600 { font-weight: 600; }
.w700 { font-weight: 700; }
.w900 { font-weight: 900; }
Loading

0 comments on commit f5e7edf

Please sign in to comment.