# wsi.py — word sense induction: clusters per-word context embeddings.
#!/usr/bin/env python3
from os import path
from pandas import read_csv
from evaluate import evaluate
import argparse
import sys
import numpy as np
import gensim
import logging
from sklearn.cluster import AffinityPropagation, SpectralClustering
from helpers import visualize, fingerprint, save
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def _load_model(modelfile):
    """Load a distributional model, dispatching on the file name.

    Supports word2vec binary (.bin.gz) and text (.vec.gz) formats,
    fastText in Gensim native format, and word2vec in Gensim native
    format (the fallback).
    """
    if modelfile.endswith('.bin.gz'):  # Word2vec binary format
        return gensim.models.KeyedVectors.load_word2vec_format(modelfile, binary=True)
    if modelfile.endswith('.vec.gz'):  # Word2vec text format
        return gensim.models.KeyedVectors.load_word2vec_format(modelfile, binary=False)
    if 'fasttext' in modelfile and modelfile.endswith('model'):  # fastText, Gensim native
        return gensim.models.fasttext.FastText.load(modelfile)
    # word2vec in Gensim native format
    return gensim.models.KeyedVectors.load(modelfile)


def _build_matrix(subset, query, model, use_weights):
    """Embed every context of *query* into one row of a dense matrix.

    Returns ``(labels, matrix)`` where *labels* is a list of
    ``"<word><context_id>"`` strings (one per context, in row order) and
    *matrix* has shape ``(n_contexts, model.vector_size)``.
    """
    labels = []
    matrix = np.empty((subset.shape[0], model.vector_size))
    for rownum, (_, row) in enumerate(subset.iterrows()):
        label = query + str(row.context_id)
        labels.append(label)
        # pandas reads an empty context cell as NaN (a float), not a string.
        if isinstance(row.context, float):
            print('Empty context at', label, file=sys.stderr)
            fp = np.zeros(model.vector_size)
        else:
            # Drop the target word itself so it does not dominate the vector.
            bow = [b for b in row.context.split() if b != query]
            fp = fingerprint(bow, model, weights=use_weights)
        matrix[rownum, :] = fp
    return labels, matrix


def main():
    """Cluster word-usage contexts to induce word senses (WSI).

    Command-line driver: reads a TSV file with columns ``word``,
    ``context_id``, ``context`` (and ``gold_sense_id`` unless --test is
    given), embeds every context with a distributional model, clusters
    the embeddings per target word with Affinity Propagation (optionally
    refined by Spectral Clustering via --2stage), saves the predictions,
    and — when gold labels are present — prints the ARI score and
    gold-sense statistics.
    """
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--input', help='Path to input file with contexts', required=True)
    arg('--model', help='Path to word2vec model', required=True)
    arg('--weights', dest='weights', action='store_true', help='Use word weights?')
    arg('--2stage', dest='twostage', action='store_true', help='2-stage clustering?')
    arg('--test', dest='testing', action='store_true',
        help='Make predictions for test file with no gold labels?')
    parser.set_defaults(testing=False, twostage=False, weights=False)
    args = parser.parse_args()

    model = _load_model(args.model)
    # Normalize vectors to unit length. NOTE(review): init_sims is
    # deprecated in gensim >= 4; kept for compatibility with the gensim
    # version this script targets.
    model.init_sims(replace=True)

    # This combination of the Affinity Propagation parameters was best in
    # our experiments. But in your task they can be different!
    damping = 0.7
    preference = -0.7

    df = read_csv(args.input, sep="\t", encoding="utf-8")

    predicted = []
    goldsenses = []
    for query in df.word.unique():
        print('Now analyzing', query, '...', file=sys.stderr)
        subset = df[df.word == query]
        if not args.testing:
            goldsenses.append(len(subset.gold_sense_id.unique()))
        contexts, matrix = _build_matrix(subset, query, model, args.weights)

        clustering = AffinityPropagation(
            preference=preference, damping=damping, random_state=None).fit(matrix)

        # Two-stage clustering: re-cluster with Spectral Clustering, using
        # the number of clusters discovered by Affinity Propagation.
        if args.twostage:
            nclusters = len(clustering.cluster_centers_indices_)
            if nclusters < 1:  # AP did not converge on any exemplar
                print('Fallback to 1 cluster!', file=sys.stderr)
                nclusters = 1
            elif nclusters == len(contexts):  # degenerate: one cluster per context
                print('Fallback to 4 clusters!', file=sys.stderr)
                nclusters = 4
            clustering = SpectralClustering(n_clusters=nclusters, n_init=20,
                                            assign_labels='discretize', n_jobs=2).fit(matrix)
        # End two-stage clustering

        cur_predicted = clustering.labels_.tolist()
        predicted += cur_predicted
        if not args.testing:
            gold = subset.gold_sense_id
            print('Gold clusters:', len(set(gold)), file=sys.stderr)
            print('Predicted clusters:', len(set(cur_predicted)), file=sys.stderr)

        # Visualize only manageable clusterings; skip noisy ones.
        if args.testing:
            if len(set(cur_predicted)) < 12:
                visualize(contexts, matrix, cur_predicted, query)
        else:
            if len(set(gold)) < 6 and len(set(cur_predicted)) < 12:
                visualize(contexts, matrix, cur_predicted, query, gold)
            else:
                print('Too many clusters, not visualizing', file=sys.stderr)

    # Item assignment, not attribute assignment: `df.predict_sense_id = ...`
    # silently creates a plain instance attribute (no column) when the input
    # file lacks a predict_sense_id column, so predictions would not be saved.
    df['predict_sense_id'] = predicted
    fname = path.splitext(path.basename(args.input))[0]
    if args.testing:
        save(df, fname)
    else:
        res = evaluate(save(df, fname))
        print('ARI:', res)
        print('Average number of senses:', np.average(goldsenses))
        print('Variation of the number of senses:', np.std(goldsenses))
        print('Minimum number of senses:', np.min(goldsenses))
        print('Maximum number of senses:', np.max(goldsenses))


if __name__ == '__main__':
    main()