From 32fbb581e5a82dede4ed1b97d0378b6f2760f6a0 Mon Sep 17 00:00:00 2001 From: Ayaka Mikazuki Date: Sun, 22 Nov 2020 22:26:34 +0800 Subject: [PATCH] Update character range --- .github/workflows/build.yml | 23 ++++++++-- .gitignore | 1 + build/code_points_han.py | 32 +++++++++++++ build/convert_tables.py | 68 +++++++++++++++++++++++++++ build/{main.py => font.py} | 91 ++++++++++++------------------------- build/prepare.sh | 8 ---- build/t2twp.json | 6 +-- test/index.css | 83 +++++++++++++++++++++++++++++++++ test/index.html | 91 +++++++++++++++++++++++++++++++++++++ 9 files changed, 327 insertions(+), 76 deletions(-) create mode 100644 build/code_points_han.py create mode 100644 build/convert_tables.py rename build/{main.py => font.py} (81%) delete mode 100755 build/prepare.sh create mode 100644 test/index.css create mode 100644 test/index.html diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 89364e7..d35832c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,11 +19,28 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install -r requirements.txt + pip install -r requirements.txt + - name: Create directories + run: mkdir -p opencc_data cache output + - name: Download OpenCC data + run: | + cd opencc_data + curl -LsSZ --remote-name-all https://cdn.jsdelivr.net/npm/opencc-data@1.0.5/data/{STCharacters.txt,STPhrases.txt,TWPhrasesIT.txt,TWPhrasesName.txt,TWPhrasesOther.txt,TWVariants.txt,HKVariants.txt} + cat TWPhrasesIT.txt TWPhrasesName.txt TWPhrasesOther.txt > TWPhrases.txt + - name: Download GenYoMin font + run: | + cd cache + curl -LsSO https://github.com/ButTaiwan/genyo-font/releases/download/v1.501/GenYoMin.zip + unzip -q -n GenYoMin.zip "*.ttc" + - name: Download character list + run: | + cd cache + curl -LsSo 通用規範漢字表.txt 
https://raw.githubusercontent.com/rime-aca/character_set/e7d009a8a185a83f62ad2c903565b8bb85719221/%E9%80%9A%E7%94%A8%E8%A6%8F%E7%AF%84%E6%BC%A2%E5%AD%97%E8%A1%A8.txt - name: Build run: | - build/prepare.sh - python build/main.py + python build/convert_tables.py + python build/code_points_han.py + python build/font.py - name: Copy license file run: cp LICENSE output - name: Upload font files diff --git a/.gitignore b/.gitignore index ba20042..bb3fd3a 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .vscode .mypy_cache +/opencc_data /cache /output diff --git a/build/code_points_han.py b/build/code_points_han.py new file mode 100644 index 0000000..4bf1333 --- /dev/null +++ b/build/code_points_han.py @@ -0,0 +1,32 @@ +# Code points of Han characters to be included: +# 1. Code points in Tongyong Guifan Hanzi Biao (通用规范汉字表) +# 2. Code points in OpenCC dictionaries + +from itertools import chain + +s = set() + +with open('cache/通用規範漢字表.txt') as f: + for line in f: + if line and not line.startswith('#'): + c = line[0] + s.add(ord(c)) + +with open('opencc_data/STCharacters.txt') as f1, \ +open('opencc_data/STPhrases.txt') as f2, \ +open('opencc_data/TWVariants.txt') as f3, \ +open('opencc_data/TWPhrases.txt') as f4, \ +open('opencc_data/HKVariants.txt') as f5: + for line in chain(f1, f2, f3, f4, f5): + k, vx = line.rstrip('\n').split('\t') + vs = vx.split(' ') + for c in k: + s.add(ord(c)) + for v in vs: + for c in v: + s.add(ord(c)) + +with open('cache/code_points_han.txt', 'w') as f: + for cp in sorted(s): + if cp > 128: # remove letters in the dictionaries + print(cp, file=f) diff --git a/build/convert_tables.py b/build/convert_tables.py new file mode 100644 index 0000000..901da91 --- /dev/null +++ b/build/convert_tables.py @@ -0,0 +1,68 @@ +# Build convert tables +# +# Input: +# - build/t2twp.json +# -> opencc_data/TWPhrases.txt +# -> opencc_data/TWVariants.txt +# - opencc_data/STCharacters.txt +# - opencc_data/STPhrases.txt +# - opencc_data/TWVariants.txt +# - 
opencc_data/TWPhrases.txt +# +# Output: +# - cache/convert_table_words.txt +# - cache/convert_table_chars.txt +# - cache/convert_table_words_twp.txt +# - cache/convert_table_chars_twp.txt + +from opencc import OpenCC + +def build_entries(twp=False): + with open('opencc_data/STCharacters.txt') as f: # s2t + for line in f: + k, vx = line.rstrip('\n').split('\t') + v = vx.split(' ')[0] # Only select the first candidate + v = t2twp(v) if twp else v # s2t -> s2twp + yield k, v + + with open('opencc_data/STPhrases.txt') as f: # s2t + for line in f: + k, vx = line.rstrip('\n').split('\t') + v = vx.split(' ')[0] # Only select the first candidate + v = t2twp(v) if twp else v # s2t -> s2twp + yield k, v + + if twp: + with open('opencc_data/TWVariants.txt') as f: # t2tw + for line in f: + k, vx = line.rstrip('\n').split('\t') + v = vx.split(' ')[0] # Only select the first candidate + k = t2s(k) # t2tw -> s2tw + yield k, v + + with open('opencc_data/TWPhrases.txt') as f: # t2twp + for line in f: + k, vx = line.rstrip('\n').split('\t') + v = vx.split(' ')[0] # Only select the first candidate + k = t2s(k) # t2twp -> s2twp + yield k, v + +def go(twp=False): + entries = build_entries(twp=twp) + entries = dict(entries) # remove duplicates + entries = sorted(entries.items(), key=lambda k_v: (len(k_v[0]), k_v[0]), reverse=True) # sort + + twp_suffix = '_twp' if twp else '' + + with open(f'cache/convert_table_words{twp_suffix}.txt', 'w') as f1, \ + open(f'cache/convert_table_chars{twp_suffix}.txt', 'w') as f2: + for k, v in entries: + print(k, v, sep='\t', file=f1 if len(k) > 1 else f2) + +if __name__ == '__main__': + # Initialize OpenCC converters + t2s = OpenCC('t2s').convert + t2twp = OpenCC('./build/t2twp').convert + + go() + go(twp=True) diff --git a/build/main.py b/build/font.py similarity index 81% rename from build/main.py rename to build/font.py index e02499c..3302a9f 100644 --- a/build/main.py +++ b/build/font.py @@ -3,7 +3,6 @@ from glob import glob from itertools import 
chain, groupby import json -from opencc import OpenCC import os import subprocess @@ -83,10 +82,13 @@ def get_glyph_count(obj): '''Get the total numbers of glyph in a font.''' return len(obj['glyph_order']) -def build_codepoints_tonggui(): - '''Build a set of all the codepoints in Tongyong Guifan Hanzi Biao (通用规范汉字表).''' - with open('cache/通用規範漢字表.txt') as f: - return {ord(line[0]) for line in f if line and not line.startswith('#')} +def build_codepoints_han(): + '''Build a set of codepoints of Han characters to be included.''' + with open('cache/code_points_han.txt') as f: + s = set() + for line in f: + s.add(int(line)) + return s def build_codepoints_font(obj): '''Build a set of all the codepoints in a font.''' @@ -110,62 +112,36 @@ def build_codepoints_non_han(): range(0xFF61, 0xFF64 + 1), )) -# We restrict the Simplified Chinese characters (on the left side of the OpenCC dictionary -# file) to the range of Tongyong Guifan Hanzi Biao, and discard those conversions that are -# out of range. The remained conversions are stored in the entries variable. -# -# Then we calculate the range of “Which Traditional Chinese characters are needed if we -# convert Tongyong Guifan Hanzi Biao to Traditional Chinese”. The range is stored in the -# codepoints variable. 
-def build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=False): +def build_opencc_char_table(codepoints_font, twp=False): entries = [] - codepoints = set() + twp_suffix = '_twp' if twp else '' - with open('cache/STCharacters.txt') as f: # s2t + with open(f'cache/convert_table_chars{twp_suffix}.txt') as f: for line in f: - k, vx = line.rstrip('\n').split('\t') - v = vx.split(' ')[0] # Only select the first candidate - v = t2twp(v) if twp else v # s2t -> s2twp + k, v = line.rstrip('\n').split('\t') codepoint_k = ord(k) codepoint_v = ord(v) - if codepoint_k in codepoints_tonggui and codepoint_v in codepoints_font: + if codepoint_k in codepoints_font \ + and codepoint_v in codepoints_font: # TODO FIXME: codepoint_k in codepoints_font should be unnecessary entries.append((codepoint_k, codepoint_v)) - codepoints.add(codepoint_v) - return entries, codepoints + return entries -def build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=False): - entries = {} - codepoints = set() +def build_opencc_word_table(codepoints_font, twp=False): + entries = [] + twp_suffix = '_twp' if twp else '' - with open('cache/STPhrases.txt') as f: # s2t + with open(f'cache/convert_table_words{twp_suffix}.txt') as f: for line in f: - k, vx = line.rstrip('\n').split('\t') - v = vx.split(' ')[0] # Only select the first candidate - v = t2twp(v) if twp else v # s2t -> s2twp + k, v = line.rstrip('\n').split('\t') codepoints_k = tuple(ord(c) for c in k) codepoints_v = tuple(ord(c) for c in v) - if all(codepoint in codepoints_tonggui for codepoint in codepoints_k) \ - and all(codepoint in codepoints_font for codepoint in codepoints_v): - entries[codepoints_k] = codepoints_v - codepoints.update(codepoints_v) - - if twp: - with open('cache/TWPhrases.txt') as f: # t2twp - for line in f: - k, vx = line.rstrip('\n').split('\t') - v = vx.split(' ')[0] # Only select the first candidate - k = t2s(k) # t2twp -> s2twp - codepoints_k = tuple(ord(c) for c in k) - codepoints_v = tuple(ord(c) 
for c in v) - if all(codepoint in codepoints_tonggui for codepoint in codepoints_k) \ - and all(codepoint in codepoints_font for codepoint in codepoints_v): - entries[codepoints_k] = codepoints_v - codepoints.update(codepoints_v) - - # Sort from longest to shortest to force longest match - conversion_item_len = lambda conversion_item: len(conversion_item[0]) - return sorted(entries.items(), key=conversion_item_len, reverse=True), codepoints + if all(codepoint in codepoints_font for codepoint in codepoints_k) \ + and all(codepoint in codepoints_font for codepoint in codepoints_v): # TODO FIXME: the first line should be unnecessary + entries.append((codepoints_k, codepoints_v)) + + # The entries are already sorted from longest to shortest to force longest match + return entries def disassociate_codepoint_and_glyph_name(obj, codepoint, glyph_name): ''' @@ -381,20 +357,15 @@ def build_dest_path_from_src_path(path, twp=False): .replace('ttc', 'ttf') def go(path, twp=False): - font = load_font(path, ttc_index=0) + font = load_font(path, ttc_index=0) # `ttc_index` 0: GenYoMin-TW; 1: GenYoMin-JP # Determine the final Unicode range by the original font and OpenCC convert tables codepoints_font = build_codepoints_font(font) - codepoints_tonggui = build_codepoints_tonggui() & codepoints_font + entries_char = build_opencc_char_table(codepoints_font, twp=twp) + entries_word = build_opencc_word_table(codepoints_font, twp=twp) - codepoints_final = codepoints_tonggui | build_codepoints_non_han() & codepoints_font - - entries_char, codepoints_char = build_opencc_char_table(codepoints_tonggui, codepoints_font, twp=twp) - codepoints_final |= codepoints_char - - entries_word, codepoints_word = build_opencc_word_table(codepoints_tonggui, codepoints_font, twp=twp) - codepoints_final |= codepoints_word + codepoints_final = (build_codepoints_non_han() | build_codepoints_han()) & codepoints_font remove_codepoints(font, codepoints_font - codepoints_final) clean_unused_glyphs(font) @@ 
-431,10 +402,6 @@ def go(path, twp=False): save_font(font, build_dest_path_from_src_path(path, twp=twp)) if __name__ == '__main__': - # Initialize OpenCC converters - t2s = OpenCC('t2s').convert - t2twp = OpenCC('./build/t2twp').convert - for path in glob('cache/GenYoMin-*.ttc'): go(path) go(path, twp=True) diff --git a/build/prepare.sh b/build/prepare.sh deleted file mode 100755 index bf22a85..0000000 --- a/build/prepare.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/sh -mkdir -p cache output -cd cache -curl -LsSO https://github.com/ButTaiwan/genyo-font/releases/download/v1.501/GenYoMin.zip -curl -LsSZ --remote-name-all https://cdn.jsdelivr.net/npm/opencc-data@1.0.5/data/{STCharacters.txt,STPhrases.txt,TWPhrasesIT.txt,TWPhrasesName.txt,TWPhrasesOther.txt,TWVariants.txt} -curl -LsSo 通用規範漢字表.txt https://raw.githubusercontent.com/rime-aca/character_set/e7d009a8a185a83f62ad2c903565b8bb85719221/%E9%80%9A%E7%94%A8%E8%A6%8F%E7%AF%84%E6%BC%A2%E5%AD%97%E8%A1%A8.txt -cat TWPhrasesIT.txt TWPhrasesName.txt TWPhrasesOther.txt > TWPhrases.txt -unzip -q -n GenYoMin.zip "*.ttc" diff --git a/build/t2twp.json b/build/t2twp.json index d826efe..af2fc66 100644 --- a/build/t2twp.json +++ b/build/t2twp.json @@ -4,7 +4,7 @@ "type": "mmseg", "dict": { "type": "text", - "file": "../cache/TWPhrases.txt" + "file": "../opencc_data/TWPhrases.txt" } }, "conversion_chain": [{ @@ -12,10 +12,10 @@ "type": "group", "dicts": [{ "type": "text", - "file": "../cache/TWPhrases.txt" + "file": "../opencc_data/TWPhrases.txt" }, { "type": "text", - "file": "../cache/TWVariants.txt" + "file": "../opencc_data/TWVariants.txt" }] } }] diff --git a/test/index.css b/test/index.css new file mode 100644 index 0000000..f64285a --- /dev/null +++ b/test/index.css @@ -0,0 +1,83 @@ +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 100; + src: url("../output/FanWunMing-EL.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 300; + src: url("../output/FanWunMing-L.ttf") 
format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 400; + src: url("../output/FanWunMing-R.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 500; + src: url("../output/FanWunMing-M.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 600; + src: url("../output/FanWunMing-SB.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 700; + src: url("../output/FanWunMing-B.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test"; + font-weight: 900; + src: url("../output/FanWunMing-H.ttf") format("truetype"); +} + +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 100; + src: url("../output/FanWunMing-TW-EL.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 300; + src: url("../output/FanWunMing-TW-L.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 400; + src: url("../output/FanWunMing-TW-R.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 500; + src: url("../output/FanWunMing-TW-M.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 600; + src: url("../output/FanWunMing-TW-SB.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 700; + src: url("../output/FanWunMing-TW-B.ttf") format("truetype"); +} +@font-face { + font-family: "FanWunMing-Test-TW"; + font-weight: 900; + src: url("../output/FanWunMing-TW-H.ttf") format("truetype"); +} + +:lang(en) { font-family: sans-serif, sans-serif; } +:lang(zh-CN) { font-family: 'FanWunMing-Test', serif; } +:lang(zh-CN).tw { font-family: 'FanWunMing-Test-TW', serif; } + +.w100 { font-weight: 100; } +.w300 { font-weight: 300; } +.w400 { font-weight: 400; } +.w500 { font-weight: 500; } +.w600 { font-weight: 600; } +.w700 { font-weight: 
700; } +.w900 { font-weight: 900; } diff --git a/test/index.html b/test/index.html new file mode 100644 index 0000000..273d918 --- /dev/null +++ b/test/index.html @@ -0,0 +1,91 @@ + + + + + + + 繁媛明朝字体测试页 + + + +

繁媛明朝字体测试页

+ +

测试基本功能

+

错综复杂

+

错综复杂

+ +

测试一简对多繁

+

松赞干布

+

松赞干布

+

夸夸其谈

+

夸夸其谈

+

夸父逐日

+

夸父逐日

+

之子于归,远送于野

+

之子于归,远送于野

+

我干什么不干你事

+

我干什么不干你事

+

我发现太后的头发很干燥

+

我发现太后的头发很干燥

+

赞叹沙河涌汹涌的波浪

+

赞叹沙河涌汹涌的波浪

+

经理发现理发的人不多

+

经理发现理发的人不多

+ +

测试正向最长匹配

+

下面

+

下面

+

下面条

+

下面条

+ +

测试台湾字词转换

+

在搜索字段使用通配符

+

在搜索字段使用通配符

+

开放源代码的简转繁字体

+

开放源代码的简转繁字体

+

鼠标里面的硅二极管坏了,导致光标分辨率降低

+

鼠标里面的硅二极管坏了,导致光标分辨率降低

+

我们在老挝的服务器的硬盘需要使用互联网算法软件解决异步的问题

+

我们在老挝的服务器的硬盘需要使用互联网算法软件解决异步的问题

+

为什么你在床里面睡着?

+

为什么你在床里面睡着?

+

台式机

+

台式机

+

着装污染虚伪发泄棱柱群众里面

+

着装污染虚伪发泄棱柱群众里面

+

内存

+

内存

+

海内存知己

+

海内存知己

+ +

测试收字范围 (#3)

+

+

+

+

+ +

测试非简体字 (#7)

+

+

+

+

+

+

+ +

测试七种字重

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+

朱轩荫兰皋,翠幕映洛湄。

+ +