Skip to content

Commit

Permalink
Add build_vocab_from_freq to Word2Vec, speedup scan_vocab (#1599)
Browse files Browse the repository at this point in the history
* fix build vocab speed issue, and new function to build vocab from previously provided word frequencies table

* fix build vocab speed issue, function build vocab from previously provided word frequencies table

* fix build vocab speed issue, function build vocab from previously provided word frequencies table

* fix build vocab speed issue, function build vocab from previously provided word frequencies table

* Removing the extra blank lines, documentation in numpy-style to build_vocab_from_freq, and hanging indents in build_vocab

* Fixing Indentation

* Fixing gensim/models/word2vec.py:697:1: W293 blank line contains whitespace

* Remove trailing white spaces

* Adding test

* fix spaces
  • Loading branch information
jodevak authored and menshikh-iv committed Oct 19, 2017
1 parent 1a1fc44 commit e92b45d
Show file tree
Hide file tree
Showing 2 changed files with 88 additions and 4 deletions.
45 changes: 41 additions & 4 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -615,12 +615,49 @@ def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_
"""
Build vocabulary from a sequence of sentences (can be a once-only generator stream).
Each sentence must be a list of unicode strings.
"""
self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey
self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) # trim by min_count & precalculate downsampling
self.finalize_vocab(update=update) # build tables & arrays

def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
    """Build model vocabulary from a pre-computed word frequency table.

    Skips the corpus scan that `build_vocab` performs and goes straight to
    scaling/finalizing, using the supplied counts instead. Words must be
    unicode strings.

    Parameters
    ----------
    word_freq : dict
        Mapping of word -> word count.
    keep_raw_vocab : bool
        If False, delete the raw vocabulary after the scaling is done to free up RAM.
    corpus_count : int, optional
        Even if no corpus is provided, this argument can set corpus_count explicitly.
    trim_rule : callable, optional
        Vocabulary trimming rule: specifies whether certain words should remain
        in the vocabulary, be trimmed away, or handled using the default
        (discard if word count < min_count). Can be None (min_count will be used),
        or a callable that accepts parameters (word, count, min_count) and returns
        either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`.
    update : bool
        If True, the new words in `word_freq` are added to the model's existing vocab.

    Returns
    -------
    None

    Examples
    --------
    >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20}, update=True)
    """
    logger.info("Processing provided word frequencies")
    # defaultdict(int) mirrors the structure scan_vocab would have produced,
    # so scale_vocab/finalize_vocab see the same raw_vocab type either way.
    vocab = defaultdict(int, word_freq)

    # `is not None` rather than truthiness makes the intent explicit
    # (an absent corpus count defaults to 0).
    self.corpus_count = corpus_count if corpus_count is not None else 0
    self.raw_vocab = vocab

    self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)  # trim by min_count & precalculate downsampling
    self.finalize_vocab(update=update)  # build tables & arrays

def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
"""Do an initial scan of all words appearing in sentences."""
logger.info("collecting all words and their counts")
Expand All @@ -641,16 +678,16 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
if sentence_no % progress_per == 0:
logger.info(
"PROGRESS: at sentence #%i, processed %i words, keeping %i word types",
sentence_no, sum(itervalues(vocab)) + total_words, len(vocab)
sentence_no, total_words, len(vocab)
)
for word in sentence:
vocab[word] += 1
total_words += 1

if self.max_vocab_size and len(vocab) > self.max_vocab_size:
total_words += utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule)
min_reduce += 1

total_words += sum(itervalues(vocab))
logger.info(
"collected %i word types from a corpus of %i raw words and %i sentences",
len(vocab), total_words, sentence_no + 1
Expand Down
47 changes: 47 additions & 0 deletions gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,53 @@ def load_on_instance():


class TestWord2VecModel(unittest.TestCase):
def testBuildVocabFromFreq(self):
    """Test building a vocabulary from an explicit word-frequency table."""
    freq_dict = {
        'minors': 2, 'graph': 3, 'system': 4,
        'trees': 3, 'eps': 2, 'computer': 2,
        'survey': 2, 'user': 3, 'human': 2,
        'time': 2, 'interface': 2, 'response': 2
    }
    model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0)
    model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5)
    model_hs.build_vocab_from_freq(freq_dict)
    model_neg.build_vocab_from_freq(freq_dict)
    # BUG FIX: the original used assertTrue(len(...), 12) — assertTrue's
    # second positional argument is the failure *message*, so the expected
    # size was never actually checked. assertEqual performs the real check.
    self.assertEqual(len(model_hs.wv.vocab), 12)
    self.assertEqual(len(model_neg.wv.vocab), 12)
    # Every provided word must appear with exactly the provided count,
    # in both the hierarchical-softmax and negative-sampling models.
    for model in (model_hs, model_neg):
        for word, count in freq_dict.items():
            self.assertEqual(model.wv.vocab[word].count, count)
    new_freq_dict = {'computer': 1, 'artificial': 4, 'human': 1, 'graph': 1, 'intelligence': 4, 'system': 1, 'trees': 1}
    model_hs.build_vocab_from_freq(new_freq_dict, update=True)
    model_neg.build_vocab_from_freq(new_freq_dict, update=True)
    # BUG FIX: same assertTrue-as-message mistake here. The intended checks
    # (per the original expected values) are: 'graph' count grows 3 + 1 = 4
    # after the update, and the new word 'artificial' arrives with count 4.
    # NOTE(review): assumes update=True sums counts for existing words —
    # confirm against scale_vocab's update semantics.
    self.assertEqual(model_hs.wv.vocab['graph'].count, 4)
    self.assertEqual(model_hs.wv.vocab['artificial'].count, 4)
    self.assertEqual(len(model_hs.wv.vocab), 14)
    self.assertEqual(len(model_neg.wv.vocab), 14)

def testOnlineLearning(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""
Expand Down

0 comments on commit e92b45d

Please sign in to comment.