-
Notifications
You must be signed in to change notification settings - Fork 3
/
setup.py
224 lines (185 loc) · 8.22 KB
/
setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
from itertools import chain
from opencc import OpenCC
from os import makedirs, path
from setuptools import setup, find_packages
from urllib.request import urlretrieve
# Absolute path of the directory that contains this setup script.
here = path.dirname(path.abspath(__file__))

# Root directory of the package sources; the data and cache files handled
# by this script are all addressed relative to it.
pkgroot = path.join(here, 'src/OpenCCFontGenerator')
def file_exists(filename):
    '''
    Tell whether `filename` exists beneath the package root.

    `filename` is joined onto `pkgroot` before the existence check.
    '''
    full_path = path.join(pkgroot, filename)
    return path.exists(full_path)
def download_data():
    '''
    Download necessary data for the package.

    If any of the expected data files is missing beneath the package root,
    all of them are fetched again: the character list is stored under
    `cache/`, the OpenCC dictionaries under `opencc_data/`, and the three
    `TWPhrases*.txt` dictionaries are then concatenated into
    `opencc_data/TWPhrases.txt`. Nothing is done when every file is
    already present.
    '''
    opencc_files = (
        'STCharacters.txt',
        'STPhrases.txt',
        'TWPhrasesIT.txt',
        'TWPhrasesName.txt',
        'TWPhrasesOther.txt',
        'TWVariants.txt',
        'HKVariants.txt',
    )
    tw_phrases_parts = (
        'TWPhrasesIT.txt',
        'TWPhrasesName.txt',
        'TWPhrasesOther.txt',
    )
    required_files = (
        'cache/通用規範漢字表.txt',
        *('opencc_data/' + name for name in opencc_files),
        'opencc_data/TWPhrases.txt',
    )
    if all(file_exists(filename) for filename in required_files):
        return

    makedirs(path.join(pkgroot, 'cache'), exist_ok=True)
    makedirs(path.join(pkgroot, 'opencc_data'), exist_ok=True)

    # Raw file contents pinned to a commit are served from
    # raw.githubusercontent.com; github.com/<owner>/<repo>/<sha>/<file>
    # is not a downloadable file URL.
    urlretrieve('https://raw.githubusercontent.com/rime-aca/character_set/e7d009a8a185a83f62ad2c903565b8bb85719221/%E9%80%9A%E7%94%A8%E8%A6%8F%E7%AF%84%E6%BC%A2%E5%AD%97%E8%A1%A8.txt',
                path.join(pkgroot, 'cache/通用規範漢字表.txt'))

    def download_opencc_file(filename):
        opencc_data_url_prefix = 'https://cdn.jsdelivr.net/npm/opencc-data@1.0.6/data/'
        urlretrieve(opencc_data_url_prefix + filename,
                    path.join(pkgroot, 'opencc_data', filename))

    for filename in opencc_files:
        download_opencc_file(filename)

    # Combine three TW phrases files into one.
    # The dictionaries are UTF-8 text, so the encoding is given explicitly
    # instead of relying on the platform's locale default.
    # All parts are read before the output is opened, so a missing part
    # does not leave a truncated TWPhrases.txt behind.
    parts = []
    for filename in tw_phrases_parts:
        with open(path.join(pkgroot, 'opencc_data', filename), encoding='utf-8') as f:
            parts.append(f.read())
    with open(path.join(pkgroot, 'opencc_data/TWPhrases.txt'), 'w', encoding='utf-8') as g:
        for content in parts:
            g.write(content)
            # Keep the last entry of one file from fusing with the first
            # entry of the next when a file lacks a trailing newline.
            if content and not content.endswith('\n'):
                g.write('\n')
def build_convert_tables():
    '''
    Build necessary convert tables from OpenCC data.

    Input (relative to the package root):
    - `t2twp` (OpenCC configuration handed to `OpenCC`)
    - `extra_convert_table.txt`
    - `opencc_data/STCharacters.txt`
    - `opencc_data/STPhrases.txt`
    - `opencc_data/TWVariants.txt`
    - `opencc_data/TWPhrases.txt`

    Output:
    - `cache/convert_table_words.txt`
    - `cache/convert_table_chars.txt`
    - `cache/convert_table_words_twp.txt`
    - `cache/convert_table_chars_twp.txt`

    Nothing is rebuilt when all four output files already exist.
    '''
    output_files = (
        'cache/convert_table_words.txt',
        'cache/convert_table_chars.txt',
        'cache/convert_table_words_twp.txt',
        'cache/convert_table_chars_twp.txt',
    )
    if all(file_exists(filename) for filename in output_files):
        return

    # Initialize OpenCC converters
    t2s = OpenCC('t2s').convert
    t2twp = OpenCC(path.join(pkgroot, 't2twp')).convert

    def read_pairs(relative_path):
        # Yield (key, first candidate) for every entry of a dictionary file.
        # Each entry is `key<TAB>candidate[ candidate...]`. The files are
        # opened as UTF-8 explicitly so the result does not depend on the
        # platform's locale encoding.
        with open(path.join(pkgroot, relative_path), encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    continue  # tolerate blank lines instead of failing the split
                k, vx = line.split('\t')
                yield k, vx.split(' ')[0]  # Only select the first candidate

    def build_entries(twp=False):
        # Yield (source, target) pairs; with `twp` the targets are further
        # converted with t2twp and the TW tables are appended.
        for relative_path in (
            'opencc_data/STCharacters.txt',
            'opencc_data/STPhrases.txt',
            'extra_convert_table.txt',
        ):  # s2t
            for k, v in read_pairs(relative_path):
                yield k, (t2twp(v) if twp else v)  # s2t -> s2twp

        if twp:
            for relative_path in (
                'opencc_data/TWVariants.txt',  # t2tw
                'opencc_data/TWPhrases.txt',  # t2twp
            ):
                for k, v in read_pairs(relative_path):
                    yield t2s(k), v  # t2tw -> s2tw, t2twp -> s2twp

    def go(twp=False):
        # dict() removes duplicates; a later pair for the same key replaces
        # an earlier one.
        entries = dict(build_entries(twp=twp))
        # Longest keys first; keys of equal length in descending order.
        ordered = sorted(entries.items(), key=lambda k_v: (
            len(k_v[0]), k_v[0]), reverse=True)

        twp_suffix = '_twp' if twp else ''

        with open(path.join(pkgroot, f'cache/convert_table_words{twp_suffix}.txt'), 'w', encoding='utf-8') as f1, \
                open(path.join(pkgroot, f'cache/convert_table_chars{twp_suffix}.txt'), 'w', encoding='utf-8') as f2:
            for k, v in ordered:
                # Multi-character keys go to the words table, single
                # characters to the chars table.
                print(k, v, sep='\t', file=f1 if len(k) > 1 else f2)

    go()
    go(twp=True)
def build_codepoints():
    '''
    Determine the necessary codepoints for the font.

    Code points of Han characters to be included:

    1. Code points in Tongyong Guifan Hanzi Biao (通用規範漢字表)
    2. Code points in OpenCC dictionaries
    3. Code points of a fixed list of extra characters hard-coded below

    The sorted code points greater than 128 are written, one per line, to
    `cache/code_points_han.txt`. Nothing is done when that file already
    exists.
    '''
    if file_exists('cache/code_points_han.txt'):
        return

    s = set()

    # All data files are opened as UTF-8 explicitly so the result does not
    # depend on the platform's locale encoding.
    with open(path.join(pkgroot, 'cache/通用規範漢字表.txt'), encoding='utf-8') as f:
        for line in f:
            # Lines starting with '#' are skipped; otherwise the first
            # character of the line is taken.
            if line and not line.startswith('#'):
                s.add(ord(line[0]))

    for filename in (
        'STCharacters.txt',
        'STPhrases.txt',
        'TWVariants.txt',
        'TWPhrases.txt',
        'HKVariants.txt',
    ):
        with open(path.join(pkgroot, 'opencc_data', filename), encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\n')
                if not line:
                    continue  # tolerate blank lines instead of failing the split
                k, vx = line.split('\t')
                # Collect every character of the key and of each candidate.
                s.update(map(ord, k))
                for v in vx.split(' '):
                    s.update(map(ord, v))

    # Extra characters included in addition to those found in the data files.
    for c in '妳攞噉㗎冚喺冇哋啲嘢啱佢嘅咁嚟屌咗撚噏瞓𡃁嘥掹孭氹詏噃𨳍掟埞曱甴𥄫𨳊嚿閪冧嬲卌嗻𧨾':
        s.add(ord(c))

    with open(path.join(pkgroot, 'cache/code_points_han.txt'), 'w', encoding='utf-8') as f:
        for cp in sorted(s):
            if cp > 128:  # remove letters in the dictionaries
                print(cp, file=f)
# Prepare the package data before packaging: fetch the source dictionaries,
# then derive the convert tables and the code point list from them. Each
# step returns early when its outputs already exist.
download_data()
build_convert_tables()
build_codepoints()

# Read the README as UTF-8 explicitly so the build does not depend on the
# platform's locale encoding.
with open(path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = f.read()

setup(
    name='OpenCCFontGenerator',
    version='0.0.1',
    description='The OpenCC Font Generator',
    long_description=long_description,
    long_description_content_type='text/markdown',
    url='https://github.com/ayaka14732/opencc-font-generator',
    author='ayaka14732',
    author_email='ayaka@mail.shn.hk',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Libraries :: Python Modules',
        'Topic :: Text Processing :: Linguistic',
        'Natural Language :: Chinese (Simplified)',
        'Natural Language :: Chinese (Traditional)',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: 3.9',
        'Programming Language :: Python :: 3.10',
    ],
    keywords='chinese nlp natural-language-processing',
    packages=find_packages('src'),
    package_dir={'': 'src'},
    # Ship the generated cache, the downloaded dictionaries and the
    # hand-maintained conversion inputs together with the package.
    package_data={
        'OpenCCFontGenerator': ['cache/*', 'opencc_data/*', 'extra_convert_table.txt', 't2twp.json'],
    },
    include_package_data=True,
    python_requires='>=3.8, <4',
    install_requires=[],
    entry_points={},
    project_urls={
        'Bug Reports': 'https://github.com/ayaka14732/opencc-font-generator/issues',
        'Source': 'https://github.com/ayaka14732/opencc-font-generator',
    },
    zip_safe=False,
)