Skip to content

Commit

Permalink
Add build_vocab_from_freq to Word2Vec, speedup scan_vocab (#1599)
Browse files Browse the repository at this point in the history
* fix build vocab speed issue, and new function to build vocab from previously provided word frequencies table

* fix build vocab speed issue, function build vocab from previously provided word frequencies table

* fix build vocab speed issue, function build vocab from previously provided word frequencies table

* fix build vocab speed issue, function build vocab from previously provided word frequencies table

* Removing the extra blank lines, documentation in numpy-style to build_vocab_from_freq, and hanging indents in build_vocab

* Fixing Indentation

* Fixing gensim/models/word2vec.py:697:1: W293 blank line contains whitespace

* Remove trailing white spaces

* Adding test

* fix spaces
  • Loading branch information
jodevak authored and menshikh-iv committed Oct 19, 2017
1 parent 1a1fc44 commit e92b45d
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 4 deletions.
45 changes: 41 additions & 4 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,12 +615,49 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_
"""
Build vocabulary from a sequence of sentences (can be a once-only generator stream).
Each sentence must be a list of unicode strings.
"""
self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey
self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling
self.finalize_vocab(update=update) # build tables & arrays

def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
    """Build model vocabulary from a pre-computed word frequency table.

    Skips the corpus scan that `build_vocab` performs and goes straight to
    scaling/finalizing, using the supplied counts instead. Words must be
    unicode strings.

    Parameters
    ----------
    word_freq : dict
        Mapping of word -> word count.
    keep_raw_vocab : bool
        If False, delete the raw vocabulary after the scaling is done to free up RAM.
    corpus_count : int, optional
        Even if no corpus is provided, this argument can set corpus_count explicitly.
    trim_rule : callable, optional
        Vocabulary trimming rule: specifies whether certain words should remain
        in the vocabulary, be trimmed away, or handled using the default
        (discard if word count < min_count). Can be None (min_count will be used),
        or a callable that accepts parameters (word, count, min_count) and returns
        either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`.
    update : bool
        If True, the new words in `word_freq` are added to the model's existing vocab.

    Returns
    -------
    None

    Examples
    --------
    >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20}, update=True)
    """
    logger.info("Processing provided word frequencies")
    # defaultdict(int) mirrors the structure scan_vocab would have produced,
    # so scale_vocab/finalize_vocab see the same raw_vocab type either way.
    vocab = defaultdict(int, word_freq)

    # `is not None` rather than truthiness makes the intent explicit
    # (an absent corpus count defaults to 0).
    self.corpus_count = corpus_count if corpus_count is not None else 0
    self.raw_vocab = vocab

    self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
    self.finalize_vocab(update=update)  # build tables & arrays

def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
"""Do an initial scan of all words appearing in sentences."""
logger.info("collecting all words and their counts")
Expand All @@ -641,16 +678,16 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
if sentence_no % progress_per == 0:
logger.info(
"PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
sentence_no, sum(itervalues(vocab)) + total_words, len(vocab)
sentence_no, total_words, len(vocab)
)
for word in sentence:
vocab[word] += 1
total_words += 1

if self.max_vocab_size and len(vocab) > self.max_vocab_size:
total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
min_reduce += 1

total_words += sum(itervalues(vocab))
logger.info(
"collected %i word types from a corpus of %i raw words and %i sentences",
len(vocab), total_words, sentence_no + 1
Expand Down
47 changes: 47 additions & 0 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,53 @@ def load_on_instance():


class TestWord2VecModel(unittest.TestCase):
def testBuildVocabFromFreq(self):
    """Test building a vocabulary from an explicit word-frequency table."""
    freq_dict = {
        'minors': 2, 'graph': 3, 'system': 4,
        'trees': 3, 'eps': 2, 'computer': 2,
        'survey': 2, 'user': 3, 'human': 2,
        'time': 2, 'interface': 2, 'response': 2
    }
    model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0)
    model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
    model_hs.build_vocab_from_freq(freq_dict)
    model_neg.build_vocab_from_freq(freq_dict)
    # BUG FIX: the original used assertTrue(len(...), 12) — assertTrue's
    # second positional argument is the failure *message*, so the expected
    # size was never actually checked. assertEqual performs the real check.
    self.assertEqual(len(model_hs.wv.vocab), 12)
    self.assertEqual(len(model_neg.wv.vocab), 12)
    # Every provided word must appear with exactly the provided count,
    # in both the hierarchical-softmax and negative-sampling models.
    for model in (model_hs, model_neg):
        for word, count in freq_dict.items():
            self.assertEqual(model.wv.vocab[word].count, count)
    new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
    model_hs.build_vocab_from_freq(new_freq_dict, update=True)
    model_neg.build_vocab_from_freq(new_freq_dict, update=True)
    # BUG FIX: same assertTrue-as-message mistake here. The intended checks
    # (per the original expected values) are: 'graph' count grows 3 + 1 = 4
    # after the update, and the new word 'artificial' arrives with count 4.
    # NOTE(review): assumes update=True sums counts for existing words —
    # confirm against scale_vocab's update semantics.
    self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
    self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
    self.assertEqual(len(model_hs.wv.vocab), 14)
    self.assertEqual(len(model_neg.wv.vocab), 14)

def testOnlineLearning(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""
Expand Down

0 comments on commit e92b45d

Please sign in to comment.