-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
support both old and new fastText model #1319
Changes from 17 commits
9f9dd24
d7725ca
de39ab0
f0c3e25
5f5ace6
1509512
d7e5403
58a66c2
9c9d3ec
8ffb220
06ac316
3deb394
b038fdb
4f6aa4d
5c09bdf
5cdf4e6
55a2d37
aeb05c1
092ef86
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -42,6 +42,8 @@ | |
|
||
logger = logging.getLogger(__name__) | ||
|
||
FASTTEXT_FILEFORMAT_MAGIC = 793712314 | ||
|
||
|
||
class FastTextKeyedVectors(KeyedVectors): | ||
""" | ||
|
@@ -257,7 +259,15 @@ def load_binary_data(self, model_binary_file, encoding='utf8'): | |
self.load_vectors(f) | ||
|
||
def load_model_params(self, file_handle): | ||
(dim, ws, epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t) = self.struct_unpack(file_handle, '@12i1d') | ||
magic, version = self.struct_unpack(file_handle, '@2i') | ||
if magic == FASTTEXT_FILEFORMAT_MAGIC: # newer format | ||
self.new_format = True | ||
dim, ws, epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@12i1d') | ||
else: # older format | ||
self.new_format = False | ||
dim = magic | ||
ws = version | ||
epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d') | ||
# Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc) | ||
self.vector_size = dim | ||
self.window = ws | ||
|
@@ -272,11 +282,13 @@ def load_model_params(self, file_handle): | |
self.sample = t | ||
|
||
def load_dict(self, file_handle, encoding='utf8'): | ||
(vocab_size, nwords, _) = self.struct_unpack(file_handle, '@3i') | ||
vocab_size, nwords, _ = self.struct_unpack(file_handle, '@3i') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd prefer keeping the changes related to the issue with the french wiki in a separate PR. We don't want those changes to block this PR from being merged. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Separated the PRs. Thanks :) |
||
# Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc) | ||
assert len(self.wv.vocab) == nwords, 'mismatch between vocab sizes' | ||
assert len(self.wv.vocab) == vocab_size, 'mismatch between vocab sizes' | ||
ntokens, = self.struct_unpack(file_handle, '@q') | ||
self.struct_unpack(file_handle, '@1q') # number of tokens | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. PEP8: inline comments should be separated by at least two spaces from the statement. |
||
if self.new_format: | ||
pruneidx_size, = self.struct_unpack(file_handle, '@q') | ||
for i in range(nwords): | ||
word_bytes = b'' | ||
char_byte = file_handle.read(1) | ||
|
@@ -285,12 +297,17 @@ def load_dict(self, file_handle, encoding='utf8'): | |
word_bytes += char_byte | ||
char_byte = file_handle.read(1) | ||
word = word_bytes.decode(encoding) | ||
count, _ = self.struct_unpack(file_handle, '@ib') | ||
_ = self.struct_unpack(file_handle, '@i') | ||
count, _ = self.struct_unpack(file_handle, '@qb') | ||
assert self.wv.vocab[word].index == i, 'mismatch between gensim word index and fastText word index' | ||
self.wv.vocab[word].count = count | ||
|
||
if self.new_format: | ||
for j in range(pruneidx_size): | ||
self.struct_unpack(file_handle, '@2i') | ||
|
||
def load_vectors(self, file_handle): | ||
if self.new_format: | ||
self.struct_unpack(file_handle, '@?') # bool quant_input in fasttext.cc | ||
num_vectors, dim = self.struct_unpack(file_handle, '@2q') | ||
# Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc) | ||
assert self.vector_size == dim, 'mismatch between model sizes' | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Better to set
`self.new_format = False`
inside this `else`,
rather than inside `initialize_word_vectors`.