From e92b45d3f83ad670da9cf6ae20ae86a2a1c8558c Mon Sep 17 00:00:00 2001
From: jodevak
Date: Thu, 19 Oct 2017 09:37:41 +0300
Subject: [PATCH] Add build_vocab_from_freq to Word2Vec, speedup scan_vocab
 (#1599)

* fix build vocab speed issue, and add new function to build vocab from a previously provided word frequencies table
* Remove the extra blank lines, add numpy-style documentation to build_vocab_from_freq, and fix hanging indents in build_vocab
* Fix indentation
* Fix gensim/models/word2vec.py:697:1: W293 blank line contains whitespace
* Remove trailing whitespace
* Add test
* Fix spaces
---
 gensim/models/word2vec.py    | 45 +++++++++++++++++++++++++++++++---
 gensim/test/test_word2vec.py | 47 ++++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 20d58a7977..754020a380 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -615,12 +615,49 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_
         """
         Build vocabulary from a sequence of sentences (can be a once-only generator stream).
         Each sentence must be a list of unicode strings.
-
         """
         self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule)  # initial survey
         self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
         self.finalize_vocab(update=update)  # build tables & arrays
 
+    def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
+        """
+        Build the model's vocabulary from a dictionary of word frequencies, instead of scanning a corpus.
+        Each key must be a unicode string; each value is that word's count.
+
+        Parameters
+        ----------
+        word_freq : dict
+            A mapping from words (unicode strings) to their counts.
+        keep_raw_vocab : bool
+            If False, delete the raw vocabulary after the scaling is done to free up RAM.
+        corpus_count : int
+            Even if no corpus is provided, this argument can set corpus_count explicitly.
+        trim_rule : function
+            Vocabulary trimming rule: specifies whether certain words should remain in the vocabulary,
+            be trimmed away, or handled using the default (discard if word count < min_count).
+            Can be None (min_count will be used), or a callable that accepts parameters
+            (word, count, min_count) and returns `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`.
+        update : bool
+            If True, the new words in the `word_freq` dict will be added to the model's vocabulary.
+
+        Returns
+        -------
+        None
+
+        Examples
+        --------
+        >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20}, update=True)
+        """
+        logger.info("Processing provided word frequencies")
+        vocab = defaultdict(int, word_freq)
+
+        self.corpus_count = corpus_count if corpus_count else 0
+        self.raw_vocab = vocab
+
+        self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
+        self.finalize_vocab(update=update)  # build tables & arrays
+
     def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         """Do an initial scan of all words appearing in sentences."""
         logger.info("collecting all words and their counts")
@@ -641,16 +678,16 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
             if sentence_no % progress_per == 0:
                 logger.info(
                     "PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
-                    sentence_no, sum(itervalues(vocab)) + total_words, len(vocab)
+                    sentence_no, total_words, len(vocab)
                 )
             for word in sentence:
                 vocab[word] += 1
+                total_words += 1
 
             if self.max_vocab_size and len(vocab) > self.max_vocab_size:
-                total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
+                utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
                 min_reduce += 1
 
-        total_words += sum(itervalues(vocab))
         logger.info(
             "collected %i word types from a corpus of %i raw words and %i sentences",
             len(vocab), total_words, sentence_no + 1
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 81123ccd7a..20fb26ce4b 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -84,6 +84,53 @@ def load_on_instance():
 
 
 class TestWord2VecModel(unittest.TestCase):
+    def testBuildVocabFromFreq(self):
+        """Test that the algorithm is able to build vocabulary from a given
+        frequency table"""
+        freq_dict = {
+            'minors': 2, 'graph': 3, 'system': 4,
+            'trees': 3, 'eps': 2, 'computer': 2,
+            'survey': 2, 'user': 3, 'human': 2,
+            'time': 2, 'interface': 2, 'response': 2
+        }
+        model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0)
+        model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
+        model_hs.build_vocab_from_freq(freq_dict)
+        model_neg.build_vocab_from_freq(freq_dict)
+        self.assertEqual(len(model_hs.wv.vocab), 12)
+        self.assertEqual(len(model_neg.wv.vocab), 12)
+        self.assertEqual(model_hs.wv.vocab['minors'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['graph'].count, 3)
+        self.assertEqual(model_hs.wv.vocab['system'].count, 4)
+        self.assertEqual(model_hs.wv.vocab['trees'].count, 3)
+        self.assertEqual(model_hs.wv.vocab['eps'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['computer'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['survey'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['user'].count, 3)
+        self.assertEqual(model_hs.wv.vocab['human'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['time'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['interface'].count, 2)
+        self.assertEqual(model_hs.wv.vocab['response'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['minors'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['graph'].count, 3)
+        self.assertEqual(model_neg.wv.vocab['system'].count, 4)
+        self.assertEqual(model_neg.wv.vocab['trees'].count, 3)
+        self.assertEqual(model_neg.wv.vocab['eps'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['computer'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['survey'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['user'].count, 3)
+        self.assertEqual(model_neg.wv.vocab['human'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['time'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['interface'].count, 2)
+        self.assertEqual(model_neg.wv.vocab['response'].count, 2)
+        new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
+        model_hs.build_vocab_from_freq(new_freq_dict, update=True)
+        model_neg.build_vocab_from_freq(new_freq_dict, update=True)
+        self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
+        self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
+        self.assertEqual(len(model_hs.wv.vocab), 14)
+        self.assertEqual(len(model_neg.wv.vocab), 14)
+
     def testOnlineLearning(self):
         """Test that the algorithm is able to add new words to the
         vocabulary and to a trained model when using a sorted vocabulary"""
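
For reference, a minimal usage sketch of the new API (not part of the patch; the words and counts are illustrative, and it assumes a gensim checkout with this patch applied, using the same pre-4.0 keyword names as the tests above, e.g. `size`):

    from gensim.models import word2vec

    # Build a model's vocabulary directly from a word -> count mapping,
    # skipping the corpus scan that build_vocab() would otherwise perform.
    model = word2vec.Word2Vec(size=10, min_count=1, seed=42)
    model.build_vocab_from_freq({'human': 15, 'graph': 20, 'trees': 8})

    # With update=True, previously unseen words are added to the existing
    # vocabulary and counts for known words are merged in, mirroring the
    # update path exercised by testBuildVocabFromFreq.
    model.build_vocab_from_freq({'graph': 5, 'minors': 3}, update=True)

Because scale_vocab and finalize_vocab run as part of the call, the vocabulary tables are ready for training afterwards; only the corpus scan is skipped.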