Improve scan_vocab speed, build_vocab_from_freq method (piskvorky#1695)

* fix build_vocab speed issue, and add a new function to build the vocab from a previously provided word frequencies table

* fix build vocab speed issue, function build vocab from previously provided word frequencies table

* fix build vocab speed issue, function build vocab from previously provided word frequencies table

* fix build vocab speed issue, function build vocab from previously provided word frequencies table

* Removing the extra blank lines, documentation in numpy-style to build_vocab_from_freq, and hanging indents in build_vocab

* Fixing Indentation

* Fixing gensim/models/word2vec.py:697:1: W293 blank line contains whitespace

* Remove trailing white spaces

* Adding test

* fix spaces

* iteration 2 on code

* iteration 2 on code

* Fixing old version of word2vec.py merge problems

* Fixing indent

* Fixing Styling

* Fixing Styling

* test

* test

* adding total words count test

* adding total words count test
jodevak authored and KMarie1 committed Nov 26, 2017
1 parent bb12a2a commit 4de55b9
Showing 2 changed files with 47 additions and 9 deletions.
17 changes: 12 additions & 5 deletions gensim/models/word2vec.py
@@ -647,13 +647,19 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
Examples
--------
>>> build_vocab_from_freq({"Word1":15,"Word2":20}, update=True)
>>> from gensim.models.word2vec import Word2Vec
>>> model= Word2Vec()
>>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20})
"""
logger.info("Processing provided word frequencies")
vocab = defaultdict(int, word_freq)
raw_vocab = word_freq # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) to be directly the raw vocab
logger.info(
"collected %i different raw word, with total frequency of %i",
len(raw_vocab), sum(itervalues(raw_vocab))
)

self.corpus_count = corpus_count if corpus_count else 0
self.raw_vocab = vocab
self.corpus_count = corpus_count if corpus_count else 0 # Since no sentences are provided, this is to control the corpus_count
self.raw_vocab = raw_vocab

self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling
self.finalize_vocab(update=update) # build tables & arrays
@@ -682,7 +688,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
)
for word in sentence:
vocab[word] += 1
total_words += 1
total_words += len(sentence)

if self.max_vocab_size and len(vocab) > self.max_vocab_size:
utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
@@ -694,6 +700,7 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
)
self.corpus_count = sentence_no + 1
self.raw_vocab = vocab
return total_words

def scale_vocab(self, min_count=None, sample=None, dry_run=False,
keep_raw_vocab=False, trim_rule=None, update=False):
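For context, a minimal usage sketch of what the patched build_vocab_from_freq enables (not part of the commit). It assumes the gensim API of this era, where Word2Vec still takes a size keyword and exposes model.wv.vocab (newer releases renamed size to vector_size), and the frequency dictionary is made up for illustration.

# Minimal sketch, not part of the diff: build a vocabulary directly from
# precomputed word counts instead of scanning sentences.
from gensim.models.word2vec import Word2Vec

# Word counts gathered elsewhere (e.g. an earlier corpus scan); values here
# are hypothetical.
word_freq = {"graph": 3, "system": 4, "eps": 1}

model = Word2Vec(size=10, min_count=0, seed=42)
model.build_vocab_from_freq(word_freq)   # the dict becomes raw_vocab directly
print(len(model.wv.vocab))               # 3 entries, counts preserved

# Counts from a new batch can later be merged into the existing vocabulary:
model.build_vocab_from_freq({"graph": 1, "minors": 2}, update=True)

Because the counts are supplied directly, no pass over the sentences is needed; per the diff, corpus_count stays 0 unless it is passed explicitly.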
39 changes: 35 additions & 4 deletions gensim/test/test_word2vec.py
@@ -78,8 +78,8 @@ def testBuildVocabFromFreq(self):
model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
model_hs.build_vocab_from_freq(freq_dict)
model_neg.build_vocab_from_freq(freq_dict)
self.assertTrue(len(model_hs.wv.vocab), 12)
self.assertTrue(len(model_neg.wv.vocab), 12)
self.assertEqual(len(model_hs.wv.vocab), 12)
self.assertEqual(len(model_neg.wv.vocab), 12)
self.assertEqual(model_hs.wv.vocab['minors'].count, 2)
self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
self.assertEqual(model_hs.wv.vocab['system'].count, 4)
@@ -107,11 +107,42 @@ def testBuildVocabFromFreq(self):
new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
model_hs.build_vocab_from_freq(new_freq_dict, update=True)
model_neg.build_vocab_from_freq(new_freq_dict, update=True)
self.assertTrue(model_hs.wv.vocab['graph'].count, 4)
self.assertTrue(model_hs.wv.vocab['artificial'].count, 4)
self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
self.assertEqual(len(model_hs.wv.vocab), 14)
self.assertEqual(len(model_neg.wv.vocab), 14)

def testPruneVocab(self):
"""Test Prune vocab while scanning sentences"""
sentences = [
["graph", "system"],
["graph", "system"],
["system", "eps"],
["graph", "system"]
]
model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
self.assertEqual(len(model.wv.vocab), 2)
self.assertEqual(model.wv.vocab['graph'].count, 3)
self.assertEqual(model.wv.vocab['system'].count, 4)

sentences = [
["graph", "system"],
["graph", "system"],
["system", "eps"],
["graph", "system"],
["minors", "survey", "minors", "survey", "minors"]
]
model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0)
self.assertEqual(len(model.wv.vocab), 3)
self.assertEqual(model.wv.vocab['graph'].count, 3)
self.assertEqual(model.wv.vocab['minors'].count, 3)
self.assertEqual(model.wv.vocab['system'].count, 4)

def testTotalWordCount(self):
model = word2vec.Word2Vec(size=10, min_count=0, seed=42)
total_words = model.scan_vocab(sentences)
self.assertEqual(total_words, 29)

def testOnlineLearning(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""
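Likewise, a rough sketch of the behaviour the new tests exercise (using a hypothetical toy corpus, not the sentences fixture from gensim's test suite): scan_vocab now returns the total raw word count, and max_vocab_size prunes the raw vocabulary while scanning.

# Sketch only, assuming the pre-4.0 gensim API (size keyword, model.wv.vocab).
from gensim.models import word2vec

sentences = [
    ["graph", "system"],
    ["graph", "system"],
    ["system", "eps"],
    ["graph", "system"],
]

model = word2vec.Word2Vec(size=10, min_count=0, seed=42)
total_words = model.scan_vocab(sentences)  # builds raw_vocab, returns word count
assert total_words == 8                    # 4 sentences x 2 words each

# With a cap on the raw vocabulary, rare words are pruned during scanning.
capped = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2,
                           seed=42, hs=1, negative=0)
print(len(capped.wv.vocab))                # 2 for this corpus, as in testPruneVocab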
