From 04999a4e071b9ad9d10d7c261f45312c118e33b0 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Fri, 30 Jun 2017 16:19:48 -0400
Subject: [PATCH 01/11] initial commit of alternate scoring

Phrases now accepts a `scoring` parameter at initialization. It defaults
to the scoring from the Mikolov paper, and can be switched to 'npmi'
(normalized pointwise mutual information).

The score calculation is now delegated to a scoring function. The scoring
functions are static methods on models.Phrases and are called when
calculating scores in models.Phrases.export_phrases.
---
 gensim/models/phrases.py | 82 +++++++++++++++++++++++++++++++---------
 1 file changed, 64 insertions(+), 18 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index be735b865a..5ae9100d94 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -106,7 +106,8 @@ class Phrases(interfaces.TransformationABC):
 
     """
     def __init__(self, sentences=None, min_count=5, threshold=10.0,
-                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000):
+                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
+				 scoring = 'default'):
         """
         Initialize the model from an iterable of `sentences`. Each sentence must be
         a list of words (unicode strings) that will be used for training.
@@ -120,10 +121,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         `min_count` ignore all words and bigrams with total collected count lower
         than this.
 
-        `threshold` represents a threshold for forming the phrases (higher means
-        fewer phrases). A phrase of words `a` and `b` is accepted if
-        `(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is the
-        total vocabulary size.
+        `threshold` represents a score threshold for forming the phrases (higher means
+        fewer phrases). A phrase of words `a` followed by `b` is accepted if the score of the
+        phrase is greater than threshold. See the `scoring` setting.
 
         `max_vocab_size` is the maximum size of the vocabulary. Used to control
         pruning of less common words, to keep memory under control. The default
@@ -133,6 +133,20 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         `delimiter` is the glue character used to join collocation tokens, and
         should be a byte string (e.g. b'_').
 
+        `scoring` specifies how potential phrases are scored for comparison to the `threshold`
+        setting. Two settings are available:
+        'default': from "Efficient Estimation of Word Representations in Vector Space" by
+            Mikolov, et al.:
+            `(count(worda followed by wordb) - min_count) * N /
+            (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size.
+        'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
+            Information in Collocation Extraction" by Gerlof Bouma:
+            ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) /
+            - ln(prop(worda followed by wordb)
+            where prop(n) is the count of n / the count of everything in the entire corpus
+        'npmi' is more robust when dealing with common words that form part of common bigrams, and
+            ranges from 0 to 1, but is slower to calculate than the default
+
         """
         if min_count <= 0:
             raise ValueError("min_count should be at least 1")
@@ -147,6 +161,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         self.min_reduce = 1  # ignore any tokens with count smaller than this
         self.delimiter = delimiter
         self.progress_per = progress_per
+		self.scoring = scoring
+		self.corpus_word_count = 0L
 
         if sentences is not None:
             self.add_vocab(sentences)
@@ -178,6 +194,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
             if sentence:  # add last word skipped by previous loop
                 word = sentence[-1]
                 vocab[word] += 1
+				total_words += 1
 
             if len(vocab) > max_vocab_size:
                 utils.prune_vocab(vocab, min_reduce)
@@ -185,7 +202,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
 
         logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
                     (len(vocab), total_words, sentence_no + 1))
-        return min_reduce, vocab
+        return min_reduce, vocab, total_words
 
     def add_vocab(self, sentences):
         """
@@ -199,6 +216,7 @@ def add_vocab(self, sentences):
         # counts collected in previous learn_vocab runs.
         min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
 
+        self.corpus_word_count += total_words
         if len(self.vocab) > 0:
             logger.info("merging %i counts into %s", len(vocab), self)
             self.min_reduce = max(self.min_reduce, min_reduce)
@@ -226,31 +244,45 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
 
             then you can debug the threshold with generated tsv
         """
+
+        vocab = self.vocab
+        threshold = self.threshold
+        delimiter = self.delimiter  # delimiter used for lookup
+        min_count = self.min_count
+        scoring = self.scoring
+        corpus_word_count = self.corpus_word_count
+
+        if scoring == 'mikolov':
+            scoring_function = partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
+        if scoring == 'npmi':
+            scoring_function = partial(self.npmi_scorer, corpus_word_count = corpus_word_count)
+        #TODO else: make sure this asserts if there is no scoring function
+
         for sentence in sentences:
             s = [utils.any2utf8(w) for w in sentence]
             last_bigram = False
-            vocab = self.vocab
-            threshold = self.threshold
-            delimiter = self.delimiter  # delimiter used for lookup
-            min_count = self.min_count
+
             for word_a, word_b in zip(s, s[1:]):
-                if word_a in vocab and word_b in vocab:
+                # last bigram check was moved here to save a few CPU cycles
+                if word_a in vocab and word_b in vocab and not last_bigram:
                     bigram_word = delimiter.join((word_a, word_b))
-                    if bigram_word in vocab and not last_bigram:
-                        pa = float(vocab[word_a])
-                        pb = float(vocab[word_b])
-                        pab = float(vocab[bigram_word])
-                        score = (pab - min_count) / pa / pb * len(vocab)
+                    if bigram_word in vocab:
+                        count_a = float(vocab[word_a])
+                        count_b = float(vocab[word_b])
+                        count_ab = float(vocab[bigram_word])
+                        score = scoring_function(count_a, count_b, count_ab)
                         # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                         #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
-                        if score > threshold:
+                        # added mincount check because if the scorer doesn't contain min_count
+                        # it would not be enforced otherwise
+                        if score > threshold and count_ab >= min_count:
                             if as_tuples:
                                 yield ((word_a, word_b), score)
                             else:
                                 yield (out_delimiter.join((word_a, word_b)), score)
                             last_bigram = True
                             continue
-                    last_bigram = False
+                last_bigram = False
 
     def __getitem__(self, sentence):
         """
@@ -311,6 +343,20 @@ def __getitem__(self, sentence):
 
         return [utils.to_unicode(w) for w in new_s]
 
+    # calculation of score based on original mikolov word2vec paper
+    # len_vocab and min_count set so functools.partial works
+    @staticmethod
+    def original_scorer(worda_count, wordb_count, bigram_count, len_vocab = 0.0, min_count = 0.0):
+        return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
+
+    # normalized PMI, requires corpus size
+    @staticmethod
+    def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count = 0.0):
+        pa = worda_count / corpus_word_count
+        pb = wordb_count / corpus_word_count
+        pab = bigram_count / corpus_word_count
+        return log(pab / (pa * pb)) / -log(pab)
+
 
 def pseudocorpus(source_vocab, sep):
     """Feeds source_vocab's compound keys back to it, to discover phrases"""

From 8984589ade598dabf31c343e470be4da548dc0f7 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 5 Jul 2017 11:41:17 -0400
Subject: [PATCH 02/11] all existing tests now pass

fixed some bugs with the pluggable scoring that were causing tests to
fail.
---
 gensim/models/phrases.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 5ae9100d94..7446c9ccfc 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -64,6 +64,7 @@
 import warnings
 from collections import defaultdict
 import itertools as it
+from functools import partial
 
 from six import iteritems, string_types, next
 
@@ -161,8 +162,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         self.min_reduce = 1  # ignore any tokens with count smaller than this
         self.delimiter = delimiter
         self.progress_per = progress_per
-		self.scoring = scoring
-		self.corpus_word_count = 0L
+        self.scoring = scoring
+        self.corpus_word_count = 0L
 
         if sentences is not None:
             self.add_vocab(sentences)
@@ -194,7 +195,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
             if sentence:  # add last word skipped by previous loop
                 word = sentence[-1]
                 vocab[word] += 1
-				total_words += 1
+                total_words += 1
 
             if len(vocab) > max_vocab_size:
                 utils.prune_vocab(vocab, min_reduce)
@@ -214,7 +215,8 @@ def add_vocab(self, sentences):
         # directly, but gives the new sentences a fighting chance to collect
         # sufficient counts, before being pruned out by the (large) accummulated
         # counts collected in previous learn_vocab runs.
-        min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
+        min_reduce, vocab, total_words = \
+        self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
 
         self.corpus_word_count += total_words
         if len(self.vocab) > 0:
@@ -252,11 +254,12 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
         scoring = self.scoring
         corpus_word_count = self.corpus_word_count
 
-        if scoring == 'mikolov':
+        if scoring == 'default':
             scoring_function = partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
-        if scoring == 'npmi':
+        elif scoring == 'npmi':
             scoring_function = partial(self.npmi_scorer, corpus_word_count = corpus_word_count)
-        #TODO else: make sure this asserts if there is no scoring function
+        else:
+            raise ValueError('unknown scoring function specified')
 
         for sentence in sentences:
             s = [utils.any2utf8(w) for w in sentence]

From a36b2fb1d804c900c5eb250ef37a9b7c02189a36 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 5 Jul 2017 12:24:05 -0400
Subject: [PATCH 03/11] added testScoringOriginal to test default scoring

---
 gensim/test/test_phrases.py | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index ba2cfc7192..3f5a7080f6 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -138,7 +138,7 @@ def testExportPhrases(self):
             b'human interface'
         ])
 
-    def test_multiple_bigrams_single_entry(self):
+    def testMultipleBigramsSingleEntry(self):
         """ a single entry should produce multiple bigrams. """
         bigram = Phrases(sentences, min_count=1, threshold=1)
 
@@ -153,6 +153,40 @@ def test_multiple_bigrams_single_entry(self):
             b'human interface'
         ])
 
+    def testScoringOriginal(self):
+        """ a single entry should produce multiple bigrams. """
+        bigram = Phrases(sentences, min_count=1, threshold=1)
+
+        seen_scores = set()
+
+        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
+        for phrase, score in bigram.export_phrases(test_sentences):
+            seen_scores.add(round(score,3))
+
+        assert seen_scores == set([
+            5.167, # score for graph minors
+            3.444 # score for human interface
+        ])
+
+    def testScoringNpmi(self):
+        """ a single entry should produce multiple bigrams. """
+        bigram = Phrases(sentences, min_count=1, threshold=1)
+
+        seen_bigrams = set()
+        seen_scores = set()
+
+        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
+        logging.debug(str(len(bigram.vocab)))
+        for phrase, score in bigram.export_phrases(test_sentences):
+            logging.debug('test Scoring Phrase ' + phrase)
+            logging.debug('test Scoring score ' + str(score))
+            seen_scores.add(round(score,3))
+
+        assert seen_scores == set([
+            5.167,
+            3.444
+        ])
+
     def testBadParameters(self):
         """Test the phrases module with bad parameters."""
         # should fail with something less or equal than 0

From 5043dbbae4eba7821d4de8237c04b4858358e8cf Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 5 Jul 2017 12:26:48 -0400
Subject: [PATCH 04/11] better name for test for default scorer

---
 gensim/test/test_phrases.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index 3f5a7080f6..bdf6997c3b 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -153,7 +153,7 @@ def testMultipleBigramsSingleEntry(self):
             b'human interface'
         ])
 
-    def testScoringOriginal(self):
+    def testScoringDefault(self):
         """ a single entry should produce multiple bigrams. """
         bigram = Phrases(sentences, min_count=1, threshold=1)
 

From 384172ef07c6a8a799f22319c2a1e17021f7ea2d Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 5 Jul 2017 13:38:52 -0400
Subject: [PATCH 05/11] moved scoring parameter checking logic to
 initialization

---
 gensim/models/phrases.py | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 7446c9ccfc..8506a256ff 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -65,6 +65,7 @@
 from collections import defaultdict
 import itertools as it
 from functools import partial
+from math import log
 
 from six import iteritems, string_types, next
 
@@ -152,8 +153,13 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         if min_count <= 0:
             raise ValueError("min_count should be at least 1")
 
-        if threshold <= 0:
-            raise ValueError("threshold should be positive")
+        if threshold <= 0 and scoring == 'default':
+            raise ValueError("threshold should be positive for default scoring")
+        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
+            raise ValueError("threshold should be between -1 and 1 for npmi scoring")
+
+        if not (scoring == 'default' or scoring == 'npmi'):
+            raise ValueError('unknown scoring function "' + scoring + '" specified')
 
         self.min_count = min_count
         self.threshold = threshold
@@ -247,19 +253,21 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
             then you can debug the threshold with generated tsv
         """
 
+        if scoring == 'default':
+            self.scoring_function = \
+            partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
+        elif scoring == 'npmi':
+            self.scoring_function = \
+            partial(self.npmi_scorer, corpus_word_count = corpus_word_count)
+        # no else here to catch unknown scoring function, check is done in Phrases.__init__
+
         vocab = self.vocab
         threshold = self.threshold
         delimiter = self.delimiter  # delimiter used for lookup
         min_count = self.min_count
         scoring = self.scoring
         corpus_word_count = self.corpus_word_count
-
-        if scoring == 'default':
-            scoring_function = partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
-        elif scoring == 'npmi':
-            scoring_function = partial(self.npmi_scorer, corpus_word_count = corpus_word_count)
-        else:
-            raise ValueError('unknown scoring function specified')
+        scoring_function = self.scoring_function
 
         for sentence in sentences:
             s = [utils.any2utf8(w) for w in sentence]

From e3eeb678bae640c313e57ae753f9f5b2b127e4e0 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 5 Jul 2017 13:43:23 -0400
Subject: [PATCH 06/11] fixed bug in export_phrases scoring function creation

---
 gensim/models/phrases.py | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 8506a256ff..9d42938528 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -253,21 +253,20 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
             then you can debug the threshold with generated tsv
         """
 
-        if scoring == 'default':
-            self.scoring_function = \
-            partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
-        elif scoring == 'npmi':
-            self.scoring_function = \
-            partial(self.npmi_scorer, corpus_word_count = corpus_word_count)
-        # no else here to catch unknown scoring function, check is done in Phrases.__init__
-
         vocab = self.vocab
         threshold = self.threshold
         delimiter = self.delimiter  # delimiter used for lookup
         min_count = self.min_count
         scoring = self.scoring
         corpus_word_count = self.corpus_word_count
-        scoring_function = self.scoring_function
+
+        if self.scoring == 'default':
+            scoring_function = \
+            partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
+        elif self.scoring == 'npmi':
+            scoring_function = \
+            partial(self.npmi_scorer, corpus_word_count = corpus_word_count)
+        # no else here to catch unknown scoring function, check is done in Phrases.__init__
 
         for sentence in sentences:
             s = [utils.any2utf8(w) for w in sentence]

From a6684de326ca34254469b7f2946c4ea5d93e33de Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 5 Jul 2017 13:56:20 -0400
Subject: [PATCH 07/11] test for npmi scoring

---
 gensim/test/test_phrases.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index bdf6997c3b..b3b5b5665f 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -154,7 +154,7 @@ def testMultipleBigramsSingleEntry(self):
         ])
 
     def testScoringDefault(self):
-        """ a single entry should produce multiple bigrams. """
+        """ test the default scoring, from the mikolov word2vec paper """
         bigram = Phrases(sentences, min_count=1, threshold=1)
 
         seen_scores = set()
@@ -169,22 +169,18 @@ def testScoringDefault(self):
         ])
 
     def testScoringNpmi(self):
-        """ a single entry should produce multiple bigrams. """
-        bigram = Phrases(sentences, min_count=1, threshold=1)
+        """ test normalized pointwise mutual information scoring """
+        bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')
 
-        seen_bigrams = set()
         seen_scores = set()
 
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
-        logging.debug(str(len(bigram.vocab)))
         for phrase, score in bigram.export_phrases(test_sentences):
-            logging.debug('test Scoring Phrase ' + phrase)
-            logging.debug('test Scoring score ' + str(score))
             seen_scores.add(round(score,3))
 
         assert seen_scores == set([
-            5.167,
-            3.444
+            .882, #score for graph minors
+            .714 # score for human interface
         ])
 
     def testBadParameters(self):

From b70648c7f9e91dea6fc96bacbc7943d4f1fa9747 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 5 Jul 2017 13:58:57 -0400
Subject: [PATCH 08/11] typo in phrases docstring

---
 gensim/models/phrases.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 9d42938528..1b28c8b922 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -147,7 +147,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
             - ln(prop(worda followed by wordb)
             where prop(n) is the count of n / the count of everything in the entire corpus
         'npmi' is more robust when dealing with common words that form part of common bigrams, and
-            ranges from 0 to 1, but is slower to calculate than the default
+            ranges from -1 to 1, but is slower to calculate than the default
 
         """
         if min_count <= 0:

From 99ec30117170bf3e619f834ff79258744275b39f Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 5 Jul 2017 14:44:40 -0400
Subject: [PATCH 09/11] copy scoring setting to Phraser

---
 gensim/models/phrases.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 1b28c8b922..869414787a 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -385,8 +385,8 @@ class Phraser(interfaces.TransformationABC):
     After the one-time initialization, a Phraser will be much smaller and
     somewhat faster than using the full Phrases model.
 
-    Reflects the results of the source model's `min_count` and `threshold`
-    settings. (You can tamper with those & create a new Phraser to try
+    Reflects the results of the source model's `min_count`, `threshold`, and
+    `scoring` settings. (You can tamper with those & create a new Phraser to try
     other values.)
 
     """
@@ -394,6 +394,7 @@ def __init__(self, phrases_model):
         self.threshold = phrases_model.threshold
         self.min_count = phrases_model.min_count
         self.delimiter = phrases_model.delimiter
+        self.scoring = phrases_model.scoring
         self.phrasegrams = {}
         corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter)
         logger.info('source_vocab length %i', len(phrases_model.vocab))

From e408f9048b5ddd793c248208ced83d8e76cc1038 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Fri, 7 Jul 2017 16:16:08 -0400
Subject: [PATCH 10/11] fixing travis-ci errors

---
 gensim/models/phrases.py    | 12 ++++++------
 gensim/test/test_phrases.py | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 869414787a..9aa6a2701b 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -109,7 +109,7 @@ class Phrases(interfaces.TransformationABC):
     """
     def __init__(self, sentences=None, min_count=5, threshold=10.0,
                  max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
-				 scoring = 'default'):
+                 scoring='default'):
         """
         Initialize the model from an iterable of `sentences`. Each sentence must be
         a list of words (unicode strings) that will be used for training.
@@ -260,12 +260,12 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
         scoring = self.scoring
         corpus_word_count = self.corpus_word_count
 
-        if self.scoring == 'default':
+        if scoring == 'default':
             scoring_function = \
             partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
-        elif self.scoring == 'npmi':
+        elif scoring == 'npmi':
             scoring_function = \
-            partial(self.npmi_scorer, corpus_word_count = corpus_word_count)
+            partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
         # no else here to catch unknown scoring function, check is done in Phrases.__init__
 
         for sentence in sentences:
@@ -356,12 +356,12 @@ def __getitem__(self, sentence):
     # calculation of score based on original mikolov word2vec paper
     # len_vocab and min_count set so functools.partial works
     @staticmethod
-    def original_scorer(worda_count, wordb_count, bigram_count, len_vocab = 0.0, min_count = 0.0):
+    def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0):
         return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
 
     # normalized PMI, requires corpus size
     @staticmethod
-    def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count = 0.0):
+    def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0):
         pa = worda_count / corpus_word_count
         pb = wordb_count / corpus_word_count
         pab = bigram_count / corpus_word_count
diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index b3b5b5665f..688f92dbd0 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -161,11 +161,11 @@ def testScoringDefault(self):
 
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
         for phrase, score in bigram.export_phrases(test_sentences):
-            seen_scores.add(round(score,3))
+            seen_scores.add(round(score, 3))
 
         assert seen_scores == set([
-            5.167, # score for graph minors
-            3.444 # score for human interface
+            5.167,  # score for graph minors
+            3.444  # score for human interface
         ])
 
     def testScoringNpmi(self):
@@ -176,11 +176,11 @@ def testScoringNpmi(self):
 
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
         for phrase, score in bigram.export_phrases(test_sentences):
-            seen_scores.add(round(score,3))
+            seen_scores.add(round(score, 3))
 
         assert seen_scores == set([
-            .882, #score for graph minors
-            .714 # score for human interface
+            .882,  # score for graph minors
+            .714  # score for human interface
         ])
 
     def testBadParameters(self):

From 80b68c2ac455572dd03819ac0f3b0069e0499b73 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Fri, 7 Jul 2017 16:33:45 -0400
Subject: [PATCH 11/11] no need to specify long vs. int

---
 gensim/models/phrases.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 9aa6a2701b..33390fc08e 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -169,7 +169,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         self.delimiter = delimiter
         self.progress_per = progress_per
         self.scoring = scoring
-        self.corpus_word_count = 0L
+        self.corpus_word_count = 0
 
         if sentences is not None:
             self.add_vocab(sentences)