models.Phrases multiple scoring methods (#1363) #1464

Merged
103 changes: 80 additions & 23 deletions gensim/models/phrases.py
@@ -64,6 +64,8 @@
import warnings
from collections import defaultdict
import itertools as it
from functools import partial
from math import log

from six import iteritems, string_types, next

@@ -106,7 +108,8 @@ class Phrases(interfaces.TransformationABC):

"""
     def __init__(self, sentences=None, min_count=5, threshold=10.0,
-                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000):
+                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
+                 scoring='default'):
"""
Initialize the model from an iterable of `sentences`. Each sentence must be
a list of words (unicode strings) that will be used for training.
@@ -120,10 +123,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
`min_count` ignore all words and bigrams with total collected count lower
than this.

-        `threshold` represents a threshold for forming the phrases (higher means
-        fewer phrases). A phrase of words `a` and `b` is accepted if
-        `(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is the
-        total vocabulary size.
+        `threshold` represents a score threshold for forming the phrases (higher means
+        fewer phrases). A phrase of words `a` followed by `b` is accepted if the score of the
+        phrase is greater than `threshold`. See the `scoring` setting.
Review comment (Owner): Capitalize first word in sentence, end in full stop.


`max_vocab_size` is the maximum size of the vocabulary. Used to control
pruning of less common words, to keep memory under control. The default
@@ -133,12 +135,31 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
`delimiter` is the glue character used to join collocation tokens, and
should be a byte string (e.g. b'_').

`scoring` specifies how potential phrases are scored for comparison to the `threshold`
setting. Two settings are available:
'default': from "Efficient Estimation of Word Representations in Vector Space" by
Mikolov et al.:
(count(worda followed by wordb) - min_count) * N /
(count(worda) * count(wordb)) > threshold, where N is the total vocabulary size.
'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
Information in Collocation Extraction" by Gerlof Bouma:
ln(prop(worda followed by wordb) / (prop(worda) * prop(wordb))) /
-ln(prop(worda followed by wordb)),
where prop(n) is the count of n divided by the count of everything in the entire corpus.
'npmi' is more robust when dealing with common words that form part of common bigrams, and
ranges from -1 to 1, but is slower to calculate than the default.

"""
if min_count <= 0:
raise ValueError("min_count should be at least 1")

-        if threshold <= 0:
-            raise ValueError("threshold should be positive")
+        if threshold <= 0 and scoring == 'default':
+            raise ValueError("threshold should be positive for default scoring")
+        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
+            raise ValueError("threshold should be between -1 and 1 for npmi scoring")
+
+        if not (scoring == 'default' or scoring == 'npmi'):
+            raise ValueError('unknown scoring function "' + scoring + '" specified')

self.min_count = min_count
self.threshold = threshold
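For concreteness, a minimal standalone sketch of the two scoring formulas described in the docstring above, with made-up counts (every number here is hypothetical, not taken from the diff or the gensim test corpus):

from math import log

len_vocab = 1000.0           # total vocabulary size N
corpus_word_count = 10000.0  # total tokens in the corpus
min_count = 5.0
count_a, count_b, count_ab = 50.0, 40.0, 30.0

# 'default' (Mikolov et al.): (count(ab) - min_count) * N / (count(a) * count(b))
default_score = (count_ab - min_count) / count_a / count_b * len_vocab  # 12.5

# 'npmi' (Bouma): ln(p(ab) / (p(a) * p(b))) / -ln(p(ab))
pa, pb, pab = (c / corpus_word_count for c in (count_a, count_b, count_ab))
npmi_score = log(pab / (pa * pb)) / -log(pab)  # ~0.86, on the [-1, 1] scale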
@@ -147,6 +168,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
self.min_reduce = 1 # ignore any tokens with count smaller than this
self.delimiter = delimiter
self.progress_per = progress_per
self.scoring = scoring
self.corpus_word_count = 0

if sentences is not None:
self.add_vocab(sentences)
@@ -178,14 +201,15 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
if sentence: # add last word skipped by previous loop
word = sentence[-1]
vocab[word] += 1
total_words += 1

if len(vocab) > max_vocab_size:
utils.prune_vocab(vocab, min_reduce)
min_reduce += 1

logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
(len(vocab), total_words, sentence_no + 1))
-        return min_reduce, vocab
+        return min_reduce, vocab, total_words

def add_vocab(self, sentences):
"""
@@ -197,8 +221,10 @@ def add_vocab(self, sentences):
# directly, but gives the new sentences a fighting chance to collect
sufficient counts, before being pruned out by the (large) accumulated
# counts collected in previous learn_vocab runs.
-        min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
+        min_reduce, vocab, total_words = \
+            self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
Review comment (Owner): Code style: bad indentation (unneeded line break).

Reply (Contributor Author): What's the number of columns we cap at? I thought it was 100, which I believe this exceeded.

Reply (Owner, @piskvorky, Jul 25, 2017): There's no hard limit; if the line becomes hard to read, we break it. If the break would be even harder to read than the original (for semantic/visual/clarity reasons), we don't break it. Line continuations are indented at one extra level (4 spaces to the right).
self.corpus_word_count += total_words
if len(self.vocab) > 0:
logger.info("merging %i counts into %s", len(vocab), self)
self.min_reduce = max(self.min_reduce, min_reduce)
@@ -226,31 +252,47 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):

then you can debug the threshold with generated tsv
"""

vocab = self.vocab
threshold = self.threshold
delimiter = self.delimiter # delimiter used for lookup
min_count = self.min_count
scoring = self.scoring
corpus_word_count = self.corpus_word_count

if scoring == 'default':
scoring_function = \
partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
Review comment (Owner): Indentation (unneeded line break).
elif scoring == 'npmi':
scoring_function = \
partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
Review comment (Owner): Indentation (unneeded line break).
# no else here to catch unknown scoring function, check is done in Phrases.__init__
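The dispatch above uses functools.partial to pre-bind the corpus-level constants, so the hot loop below can call either scorer with the same three positional arguments. A minimal illustration of the pattern, independent of this diff (numbers hypothetical):

from functools import partial

def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0):
    return (bigram_count - min_count) / worda_count / wordb_count * len_vocab

scorer = partial(original_scorer, len_vocab=1000.0, min_count=5.0)
print(scorer(50.0, 40.0, 30.0))  # 12.5; len_vocab and min_count are already bound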

for sentence in sentences:
s = [utils.any2utf8(w) for w in sentence]
last_bigram = False
-            vocab = self.vocab
-            threshold = self.threshold
-            delimiter = self.delimiter  # delimiter used for lookup
-            min_count = self.min_count

             for word_a, word_b in zip(s, s[1:]):
-                if word_a in vocab and word_b in vocab:
+                # last bigram check was moved here to save a few CPU cycles
+                if word_a in vocab and word_b in vocab and not last_bigram:
                     bigram_word = delimiter.join((word_a, word_b))
-                    if bigram_word in vocab and not last_bigram:
-                        pa = float(vocab[word_a])
-                        pb = float(vocab[word_b])
-                        pab = float(vocab[bigram_word])
-                        score = (pab - min_count) / pa / pb * len(vocab)
+                    if bigram_word in vocab:
+                        count_a = float(vocab[word_a])
+                        count_b = float(vocab[word_b])
+                        count_ab = float(vocab[bigram_word])
+                        score = scoring_function(count_a, count_b, count_ab)
Review comment (Contributor Author): A pluggable scoring function would have to be called with all corpus constants and Phrases settings used in any scoring function. Right now that would look like: score = scoring_function(count_a, count_b, count_ab, min_count, len_vocab, corpus_word_count). And the call would grow as the universe of variables considered by all scoring functions grows.

Reply (Owner, @piskvorky, Jul 21, 2017): I think that's still preferable. This string-passing seems inflexible. We could support some common use-cases by passing a string, but the code underneath should simply translate that string into a scoring_function and work with that underneath. Custom scoring_functions should be supported IMO. In other words, we could support both string and callable as param. If string, gensim converts that to a known callable (for easy-to-use common cases).

Reply (Contributor Author): I will make this change, hopefully before the end of the week, and make it part of a PR.
-                        # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
-                        #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
-                        if score > threshold:
+                        # added mincount check because if the scorer doesn't contain min_count
+                        # it would not be enforced otherwise
+                        if score > threshold and count_ab >= min_count:
                             if as_tuples:
                                 yield ((word_a, word_b), score)
                             else:
                                 yield (out_delimiter.join((word_a, word_b)), score)
                             last_bigram = True
                             continue
-                last_bigram = False
+            last_bigram = False
Review comment (Owner): Is this on purpose? What is this change about?

Reply (Contributor Author): Yes, this is on purpose. Matches up to line 277. If that test fails we have to set last_bigram to false. This positioning sets it to false always--the only time it gets set to true is in line 293, when a passing bigram is found.

Reply (Owner): Aha, so this is a bug fix at the same time. Thanks! CC @menshikh-iv

def __getitem__(self, sentence):
"""
@@ -311,6 +353,20 @@ def __getitem__(self, sentence):

return [utils.to_unicode(w) for w in new_s]

# calculation of score based on original Mikolov word2vec paper
# len_vocab and min_count set so functools.partial works
@staticmethod
def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0):
return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
Review comment (Owner): Beware of integer divisions - this code is brittle.

Reply (Contributor Author): I didn't fix this in PR #1573. Rather, I just cast everything before calling the scoring method in Phrases and Phraser. I think that's the better place to do the casting, since then it fixes the problem for all custom scorers as well. Of course, I can do the casting in the scoring methods as well. Let me know if you still think I need it here and in npmi_scorer and I'll update PR #1573. It's extra steps, but I'd assume the performance hit is infinitesimal.
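A quick illustration of the brittleness the owner points at: under Python 2 semantics, `/` on two ints floors, so a scorer fed raw integer counts can silently return 0 (hypothetical counts):

# Python 2, without `from __future__ import division`:
#   (30 - 5) / 50 / 40 * 1000  ->  0    (25 / 50 floors to 0)
# The same expression with float operands:
#   (30.0 - 5.0) / 50.0 / 40.0 * 1000.0  ->  12.5
# This PR sidesteps the issue by casting the counts to float
# (count_a = float(vocab[word_a]), etc.) before calling the scorer,
# which also protects custom scorers.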


# normalized PMI, requires corpus size
@staticmethod
def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0):
pa = worda_count / corpus_word_count
Review comment (Owner, @piskvorky, Jul 21, 2017): Is this meant to be an integer or float division? (dtto below)
pb = wordb_count / corpus_word_count
pab = bigram_count / corpus_word_count
return log(pab / (pa * pb)) / -log(pab)
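As a sanity check on the formula above: a bigram whose words always co-occur scores exactly 1.0, since pab == pa == pb makes log(pab / (pa * pb)) = -log(pab). A self-contained sketch with hypothetical counts:

from math import log

def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0):
    pa = worda_count / corpus_word_count
    pb = wordb_count / corpus_word_count
    pab = bigram_count / corpus_word_count
    return log(pab / (pa * pb)) / -log(pab)

# 'a' and 'b' each appear 20 times, always together, in a 10000-token corpus:
print(npmi_scorer(20.0, 20.0, 20.0, 10000.0))  # 1.0, perfect association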


def pseudocorpus(source_vocab, sep):
"""Feeds source_vocab's compound keys back to it, to discover phrases"""
@@ -329,15 +385,16 @@ class Phraser(interfaces.TransformationABC):
After the one-time initialization, a Phraser will be much smaller and
somewhat faster than using the full Phrases model.

-    Reflects the results of the source model's `min_count` and `threshold`
-    settings. (You can tamper with those & create a new Phraser to try
+    Reflects the results of the source model's `min_count`, `threshold`, and
+    `scoring` settings. (You can tamper with those & create a new Phraser to try
other values.)

"""
def __init__(self, phrases_model):
self.threshold = phrases_model.threshold
self.min_count = phrases_model.min_count
self.delimiter = phrases_model.delimiter
self.scoring = phrases_model.scoring
self.phrasegrams = {}
corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter)
logger.info('source_vocab length %i', len(phrases_model.vocab))
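Typical end-to-end usage of the Phrases/Phraser pair, as a sketch (my_sentences is a hypothetical iterable of tokenized sentences):

from gensim.models.phrases import Phrases, Phraser

bigram = Phrases(my_sentences, min_count=5, threshold=10.0, scoring='default')
bigram_phraser = Phraser(bigram)  # smaller, faster, frozen copy of the model

# apply to a single tokenized sentence:
print(bigram_phraser[['machine', 'learning', 'is', 'fun']])
# or stream over a whole corpus:
transformed = (bigram_phraser[sentence] for sentence in my_sentences)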
32 changes: 31 additions & 1 deletion gensim/test/test_phrases.py
@@ -138,7 +138,7 @@ def testExportPhrases(self):
b'human interface'
])

-    def test_multiple_bigrams_single_entry(self):
+    def testMultipleBigramsSingleEntry(self):
""" a single entry should produce multiple bigrams. """
bigram = Phrases(sentences, min_count=1, threshold=1)

@@ -153,6 +153,36 @@ def test_multiple_bigrams_single_entry(self):
b'human interface'
])

def testScoringDefault(self):
""" test the default scoring, from the mikolov word2vec paper """
bigram = Phrases(sentences, min_count=1, threshold=1)

seen_scores = set()

test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
for phrase, score in bigram.export_phrases(test_sentences):
seen_scores.add(round(score, 3))

assert seen_scores == set([
5.167, # score for graph minors
3.444 # score for human interface
])

def testScoringNpmi(self):
""" test normalized pointwise mutual information scoring """
bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')

seen_scores = set()

test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
for phrase, score in bigram.export_phrases(test_sentences):
seen_scores.add(round(score, 3))

assert seen_scores == set([
.882, # score for graph minors
.714 # score for human interface
])

def testBadParameters(self):
"""Test the phrases module with bad parameters."""
# should fail with something less than or equal to 0