-
Notifications
You must be signed in to change notification settings - Fork 0
/
builddict.py
149 lines (127 loc) · 4.83 KB
/
builddict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from bs4 import BeautifulSoup
import requests
import xlrd
import pandas as pd
from pathlib import Path
import time
import datetime as dt
import time
from pykakasi import kakasi as Kakasi
import unicodedata
# Locate the old dictionary (input) and the new dictionary (output).
# Both workbooks are expected to sit in the same directory as this script.
script_dir = Path(__file__).parent
newdict_path = script_dir / "newdict.xlsx"  # written at the end of the run
sheet_path = str(script_dir / "vdrj.xls")   # source word list
sheet = sheet_path  # alias of sheet_path, kept so the existing name still exists

# NOTE(review): `wbc` is not referenced anywhere else in this file. It is kept
# only so the module-level name continues to exist; confirm and drop it to
# avoid parsing the workbook an extra time.
wbc = xlrd.open_workbook(sheet_path, encoding_override='utf-8')

# Move contents of old dictionary to lists.
# Columns A and C of the "list" sheet are read in ONE pass; the previous code
# called read_excel once per column, parsing the whole workbook each time.
_source_columns = pd.read_excel(sheet, sheet_name="list", usecols="A,C", dtype=object)
df2 = _source_columns.iloc[:, [0]]  # column A, still a one-column DataFrame
df = _source_columns.iloc[:, [1]]   # column C, still a one-column DataFrame

# Empty frame with the two output columns.
data = {'Japanese': [], 'Kana': []}
newdict_df = pd.DataFrame(data)

# .values.tolist() on a one-column DataFrame yields one single-item list per
# row (e.g. [[a], [b], ...]), so consumers must un-nest each entry.
wordlist = df.values.tolist()
kanlist = df2.values.tolist()
def is_japanese_char(ch):
    """Return True if *ch* falls in the kanji, hiragana or katakana blocks.

    *ch* must be a single character, because it is passed straight to
    ``ord``. Only the three code-point ranges listed below are accepted;
    every other character yields False.
    """
    code_point = ord(ch)
    accepted_ranges = (
        (0x4E00, 0x9FFF),  # CJK Unified Ideographs (Kanji)
        (0x3040, 0x309F),  # Hiragana
        (0x30A0, 0x30FF),  # Katakana
    )
    return any(low <= code_point <= high for low, high in accepted_ranges)
def normalize_and_filter_japanese(text):
    """Return *text* NFKC-normalized and stripped of non-Japanese characters.

    The string is first put into Unicode NFKC form, then every character
    that ``is_japanese_char`` rejects is dropped. The surviving characters
    are returned in their original order; the result may be empty.
    """
    canonical = unicodedata.normalize("NFKC", text)
    kept_chars = [ch for ch in canonical if is_japanese_char(ch)]
    return ''.join(kept_chars)
def grabword(w, max_retries=10, retry_wait_time=10, timeout=30):
    """Grab the related words/phrases Weblio's dictionary lists for *w*.

    Requests ``https://ejje.weblio.jp/content/{w}`` and, for every
    ``<div class="werbjJ">`` on the returned page, collects the stripped text
    of that element's last ``<p>``.

    Args:
        w: Word to look up. It is formatted into the URL as-is, so it should
            be a plain string; passing a list would embed the list's repr.
        max_retries: Maximum number of request attempts.
        retry_wait_time: Seconds to sleep between failed attempts.
        timeout: Seconds to wait on the server before an attempt is treated
            as failed (and retried like any other request error).

    Returns:
        A list of related-word strings. The list is empty when every attempt
        failed or when the page contains no matching elements.
    """
    url = f'https://ejje.weblio.jp/content/{w}'

    #Get HTML content of dictionary
    response = None
    for attempt in range(max_retries):
        try:
            # Without a timeout a stalled connection would block the whole
            # run indefinitely; a timeout surfaces as a RequestException and
            # goes through the normal retry path below.
            response = requests.get(url, timeout=timeout)
            break
        except requests.exceptions.RequestException as e:
            if attempt < max_retries - 1:
                print(f"Request failed with error: {e}. Retrying in {retry_wait_time} seconds...")
                time.sleep(retry_wait_time)
            else:
                print(f"Request failed after {max_retries} attempts. Skipping word...")
                return []
    if response is None:
        # Only reachable when max_retries <= 0, i.e. no attempt was made.
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    #Isolate words and send to related word list
    morewords = []
    for entry in soup.find_all('div', class_='werbjJ'):
        paragraphs = entry.find_all('p')
        if not paragraphs:
            # An entry without any <p> has no text to extract; skip it rather
            # than crash the whole run with an IndexError.
            continue
        morewords.append(paragraphs[-1].text.strip())
    return morewords
_kana_converter = None  # built on the first kanaconvert() call, then reused


def kanaconvert(word):
    """Return *word* converted to katakana via pykakasi.

    The converter is configured so that kanji ("J"), hiragana ("H") and
    katakana ("K") input are all emitted as katakana ("K").

    The converter is created lazily once and cached at module level: its
    configuration never changes, so rebuilding the Kakasi instance for every
    single word (as was done before) only repeats identical setup work.
    NOTE(review): assumes the object returned by ``getConverter()`` may be
    reused across ``do()`` calls - confirm against the installed pykakasi.
    """
    global _kana_converter
    if _kana_converter is None:
        kakasi_instance = Kakasi()
        kakasi_instance.setMode("J", "K")  # kanji -> katakana
        kakasi_instance.setMode("H", "K")  # hiragana -> katakana
        kakasi_instance.setMode("K", "K")  # katakana stays katakana
        _kana_converter = kakasi_instance.getConverter()
    return _kana_converter.do(word)
def systemstart(word):
    """Return the related words for one source row, plus the headword itself.

    Args:
        word: One row of the source word list. Rows produced by
            ``DataFrame.values.tolist()`` on a one-column frame are
            single-item lists (e.g. ``['X']``); the first item is the
            headword. A bare string is also accepted and used whole.

    Returns:
        The list returned by ``grabword`` for the headword, with the headword
        appended as the last element. If the lookup found nothing, the list
        contains only the headword.

    Raises:
        IndexError: If *word* is an empty sequence.
    """
    # Un-nest the row BEFORE the lookup. Passing the one-item list through to
    # grabword would format the list's repr ("['X']") into the request URL
    # instead of the word itself.
    headword = word if isinstance(word, str) else word[0]

    #Grab related words
    thelist = grabword(headword)

    #Add original word to related words list
    thelist.append(headword)
    return thelist
def frmtdelta(delta):
    """Format a timedelta-like object as ``"<days>d, <hr>h, <min>m, <sec>s"``.

    Only the ``days`` and ``seconds`` attributes of *delta* are read; the
    seconds part is broken down into whole hours, minutes and seconds.
    """
    hours, leftover = divmod(delta.seconds, 3600)
    minutes, seconds = divmod(leftover, 60)
    return "{days}d, {hr}h, {min}m, {sec}s".format(
        days=delta.days,
        hr=hours,
        min=minutes,
        sec=seconds,
    )
def checktime():
    """Return the time elapsed since the module-level ``start`` timestamp.

    The difference between now and ``start`` is truncated to whole seconds
    and rendered with ``frmtdelta``. Each call also pauses for 0.1 seconds
    before returning.
    """
    whole_seconds = int(time.time() - start)
    formatted = frmtdelta(dt.timedelta(seconds=whole_seconds))
    # NOTE(review): the purpose of this pause is not evident from the code;
    # it is preserved unchanged. Confirm whether it is still needed.
    time.sleep(.1)
    return formatted
#Set up counters and start the clock
start = time.time()
numb = 0        # rows collected so far (counted before de-duplication)
other = 0       # source words processed so far
duplicates = 0
total_words = len(kanlist)  # derived from the data instead of a hard-coded count

# Defaults so the final summary still works when kanlist is empty.
progress = 0
magnitude = 0.0
elapsedtime = frmtdelta(dt.timedelta(0))

# Rows are accumulated in a plain list and turned into a DataFrame once after
# the loop. Concatenating a one-row DataFrame per word copies the whole frame
# every time, which is quadratic in the number of rows.
new_rows = []

#Grab related words for every word in original dictionary
for word in kanlist:
    other += 1
    success = systemstart(word)
    for y in success:
        normaly = normalize_and_filter_japanese(y)
        z = kanaconvert(normaly)
        numb += 1
        new_rows.append((normaly, z))
    progress = int(100 * (other / total_words))
    magnitude = round(numb / other, 1)
    elapsedtime = checktime()
    print(f'{progress}% done... ({other} out of {total_words} processed)')
    print(f'New dictionary length: {numb} ({magnitude}x bigger so far!)')

# Replaces the empty 'Japanese'/'Kana' frame created during setup.
newdict_df = pd.DataFrame(new_rows, columns=["Japanese", "Kana"])

#Give me the stats after the job is done
length1 = len(newdict_df)
newdict_df = newdict_df.drop_duplicates(subset=["Japanese", "Kana"])
length2 = len(newdict_df)
duplicates = length1 - length2
numb -= duplicates
# Guard against division by zero when no source words were processed.
magnitude = round(numb / other, 1) if other else 0.0

# Save the de-duplicated DataFrame to the newdict.xlsx file
newdict_df.to_excel(newdict_path, index=False)

print("Done!")
print(f"Words processed: {other} out of {total_words}")
print(f"Words added: {numb} (a {magnitude}x increase!)")
print(f"Duplicates found: {duplicates}")
print(f"Success rate: {progress}%")
print(f"Time spent: {elapsedtime}")
print("All in all, a pretty nice Monday! :)")