From 21c4401c5108ff879532743347c56fd54aa36e75 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Tue, 5 Sep 2017 13:27:37 -0400
Subject: [PATCH 01/17] initial commit of fixes in comments of #1423

---
 gensim/models/word2vec.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 255b9c553f..ddae9ff08f 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1563,7 +1563,7 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` should be a path to a directory (as a string) where all files can be opened by the
         LineSentence class. Each file will be read up to
-        `limit` lines (or no clipped if limit is None, the default).
+        `limit` lines (or not clipped if limit is None, the default).
 
         Example::
 
@@ -1577,23 +1577,23 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         self.limit = limit
 
         if os.path.isfile(self.source):
-            logging.warning('single file read, better to use models.word2vec.LineSentence')
+            logger.warning('single file read, better to use models.word2vec.LineSentence')
             self.input_files = [self.source]  # force code compatibility with list of files
         elif os.path.isdir(self.source):
             self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logging.debug('reading directory ' + self.source)
+            logger.warning('reading directory %s', self.source)
             self.input_files = os.listdir(self.source)
-            self.input_files = [self.source + file for file in self.input_files]  # make full paths
+            self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
             self.input_files.sort()  # makes sure it happens in filename order
         else:  # not a file or a directory, then we can't do anything with it
             raise ValueError('input is neither a file nor a path')
 
-        logging.info('files read into PathLineSentences:' + '\n'.join(self.input_files))
+        logger.info('files read into PathLineSentences: %s', '\n'.join(self.input_files))
 
     def __iter__(self):
         '''iterate through the files'''
         for file_name in self.input_files:
-            logging.info('reading file ' + file_name)
+            logger.info('reading file %s', file_name)
             with utils.smart_open(file_name) as fin:
                 for line in itertools.islice(fin, self.limit):
                     line = utils.to_unicode(line).split()

From 0590c2f90df93f35deb3bbeb91f5309734e56d5e Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Tue, 5 Sep 2017 13:38:29 -0400
Subject: [PATCH 02/17] removed unnecessary space in logger

---
 gensim/models/word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index ddae9ff08f..96c7195dc9 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1588,7 +1588,7 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         else:  # not a file or a directory, then we can't do anything with it
             raise ValueError('input is neither a file nor a path')
 
-        logger.info('files read into PathLineSentences: %s', '\n'.join(self.input_files))
+        logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files))
 
     def __iter__(self):
         '''iterate through the files'''

From 34dc58f15945b8c104b6b8094fa66f219426c989 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Tue, 5 Sep 2017 16:21:46 -0400
Subject: [PATCH 03/17] added support for custom Phrases scorers

---
 gensim/models/phrases.py    | 82 +++++++++++++++++++++++++------------
 gensim/test/test_phrases.py | 17 +++++++-
 2 files changed, 71 insertions(+), 28 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 1f0826258c..2da62b9672 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -64,8 +64,8 @@
 import warnings
 from collections import defaultdict
 import itertools as it
-from functools import partial
 from math import log
+from inspect import getargspec
 
 from six import iteritems, string_types, next
 
@@ -137,18 +137,31 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         should be a byte string (e.g. b'_').
 
         `scoring` specifies how potential phrases are scored for comparison to the `threshold`
-        setting. two settings are available:
-        'default': from "Efficient Estimaton of Word Representations in Vector Space" by
-            Mikolov, et. al.:
-            (count(worda followed by wordb) - min_count) * N /
-            (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size.
-        'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
-            Information in Colocation Extraction" by Gerlof Bouma:
-            ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) /
-            - ln(prop(worda followed by wordb)
-            where prop(n) is the count of n / the count of everything in the entire corpus
-        'npmi' is more robust when dealing with common words that form part of common bigrams, and
+        setting. `scoring` can be set with either a string that refers to a built-in scoring function,
+        or with a function with the expected parameter names.
+        Two built-in scoring functions are available by setting `scoring` to a string:
+            'default': from "Efficient Estimaton of Word Representations in Vector Space" by
+                Mikolov, et. al.:
+                (count(worda followed by wordb) - min_count) * N /
+                (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size.
+            'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
+                Information in Colocation Extraction" by Gerlof Bouma:
+                ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) /
+                - ln(prop(worda followed by wordb)
+                where prop(n) is the count of n / the count of everything in the entire corpus
+            'npmi' is more robust when dealing with common words that form part of common bigrams, and
             ranges from -1 to 1, but is slower to calculate than the default
+        To use a custom scoring function, create a function with the following parameters and set the `scoring`
+        parameter to the custom function. You must use all the parameters in your function call, even if the
+        function does not require all the parameters.
+            worda_count: number of occurrances in `sentences` of the first token in the phrase being scored
+            wordb_count: number of occurrances in `sentences` of the second token in the phrase being scored
+            bigram_count: number of occurrances in `sentences` of the phrase being scored
+            len_vocab: the number of unique tokens in `sentences`
+            min_count: the `min_count` setting of the Phrases class
+            corpus_word_count: the total number of (non-unique) tokens in `sentences`
+        A scoring function without any of these parameters (even if the parameters are not used) will
+            raise a ValueError on initialization of the Phrases class
 
         """
         if min_count <= 0:
@@ -159,8 +172,23 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         if scoring == 'npmi' and (threshold < -1 or threshold > 1):
             raise ValueError("threshold should be between -1 and 1 for npmi scoring")
 
-        if not (scoring == 'default' or scoring == 'npmi'):
-            raise ValueError('unknown scoring function "' + scoring + '" specified')
+        # set scoring based on string
+        # intentially override the value of the scoring parameter rather than set self.scoring here,
+        # to still run the check of scoring function parameters in the next code block
+        if type(scoring) is str:
+            if scoring == 'default':
+                scoring = self.original_scorer
+            elif scoring == 'npmi':
+                scoring = self.npmi_scorer
+            else:
+                raise ValueError('unknown scoring method string %s specified' % (scoring))
+
+        scoring_parameters = ['worda_count','wordb_count','bigram_count','len_vocab','min_count','corpus_word_count']
+        if callable(scoring):
+            if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
+                self.scoring = scoring
+            else:
+                raise ValueError('scoring function missing expected parameters')
 
         self.min_count = min_count
         self.threshold = threshold
@@ -169,7 +197,6 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         self.min_reduce = 1  # ignore any tokens with count smaller than this
         self.delimiter = delimiter
         self.progress_per = progress_per
-        self.scoring = scoring
         self.corpus_word_count = 0
 
         if sentences is not None:
@@ -258,16 +285,14 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
         threshold = self.threshold
         delimiter = self.delimiter  # delimiter used for lookup
         min_count = self.min_count
-        scoring = self.scoring
+        scorer = self.scoring
         corpus_word_count = self.corpus_word_count
+        # made floats for scoring function
+        len_vocab = float(len(vocab))
+        scorer_min_count = float(min_count)
+        corpus_word_count = float(corpus_word_count)
+
 
-        if scoring == 'default':
-            scoring_function = \
-            partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
-        elif scoring == 'npmi':
-            scoring_function = \
-            partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
-        # no else here to catch unknown scoring function, check is done in Phrases.__init__
 
         for sentence in sentences:
             s = [utils.any2utf8(w) for w in sentence]
@@ -281,7 +306,8 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
                         count_a = float(vocab[word_a])
                         count_b = float(vocab[word_b])
                         count_ab = float(vocab[bigram_word])
-                        score = scoring_function(count_a, count_b, count_ab)
+                        # scoring function should have all these parameters
+                        score = scorer(worda_count=count_a, wordb_count=count_b, bigram_count=count_ab, len_vocab=len_vocab, min_count=scorer_min_count, corpus_word_count=corpus_word_count)
                         # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
                         #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
                         # added mincount check because if the scorer doesn't contain min_count
@@ -354,15 +380,17 @@ def __getitem__(self, sentence):
 
         return [utils.to_unicode(w) for w in new_s]
 
+    # these two built-in scoring methods don't cast everything to float because the casting is done in the call
+    # to the scoring method in __getitem__ and export_phrases.
+
     # calculation of score based on original mikolov word2vec paper
-    # len_vocab and min_count set so functools.partial works
     @staticmethod
-    def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0):
+    def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
         return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
 
     # normalized PMI, requires corpus size
     @staticmethod
-    def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0):
+    def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
         pa = worda_count / corpus_word_count
         pb = wordb_count / corpus_word_count
         pab = bigram_count / corpus_word_count
diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index 5397d6e4c3..bde64e06dc 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -174,7 +174,6 @@ def testScoringNpmi(self):
         bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')
 
         seen_scores = set()
-
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
         for phrase, score in bigram.export_phrases(test_sentences):
             seen_scores.add(round(score, 3))
@@ -184,6 +183,22 @@ def testScoringNpmi(self):
             .714  # score for human interface
         ])
 
+    def testCustomScorer(self):
+        """ test using a custom scoring function """
+        # all scores will be 1
+        def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+            return 1
+
+        bigram = Phrases(sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
+
+        seen_scores = []
+        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
+        for phrase, score in bigram.export_phrases(test_sentences):
+            seen_scores.append(score)
+
+        assert all(seen_scores) # all scores 1
+        assert len(seen_scores) == 3 #'graph minors' and 'survey human' and 'interface system'
+
     def testBadParameters(self):
         """Test the phrases module with bad parameters."""
         # should fail with something less or equal than 0

From 32b66bd677d8e425ef2e48383262ff6baab37dc8 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 6 Sep 2017 12:36:45 -0400
Subject: [PATCH 04/17] fixed Phrases.__getitem__ to support pluggable scoring
 #1533

---
 gensim/models/phrases.py    | 48 +++++++++++++++++++++----------------
 gensim/test/test_phrases.py | 13 ++++++++++
 2 files changed, 40 insertions(+), 21 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 2da62b9672..9b922eaff2 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -138,8 +138,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
 
         `scoring` specifies how potential phrases are scored for comparison to the `threshold`
         setting. `scoring` can be set with either a string that refers to a built-in scoring function,
-        or with a function with the expected parameter names.
-        Two built-in scoring functions are available by setting `scoring` to a string:
+        or with a function with the expected parameter names. Two built-in scoring functions are available
+        by setting `scoring` to a string:
             'default': from "Efficient Estimaton of Word Representations in Vector Space" by
                 Mikolov, et. al.:
                 (count(worda followed by wordb) - min_count) * N /
@@ -249,8 +249,7 @@ def add_vocab(self, sentences):
         # directly, but gives the new sentences a fighting chance to collect
         # sufficient counts, before being pruned out by the (large) accummulated
         # counts collected in previous learn_vocab runs.
-        min_reduce, vocab, total_words = \
-        self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
+        min_reduce, vocab, total_words = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
 
         self.corpus_word_count += total_words
         if len(self.vocab) > 0:
@@ -286,11 +285,10 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
         delimiter = self.delimiter  # delimiter used for lookup
         min_count = self.min_count
         scorer = self.scoring
-        corpus_word_count = self.corpus_word_count
         # made floats for scoring function
         len_vocab = float(len(vocab))
         scorer_min_count = float(min_count)
-        corpus_word_count = float(corpus_word_count)
+        corpus_word_count = float(self.corpus_word_count)
 
 
 
@@ -306,12 +304,10 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
                         count_a = float(vocab[word_a])
                         count_b = float(vocab[word_b])
                         count_ab = float(vocab[bigram_word])
-                        # scoring function should have all these parameters
+                        # scoring MUST have all these parameters, even if they are not used
                         score = scorer(worda_count=count_a, wordb_count=count_b, bigram_count=count_ab, len_vocab=len_vocab, min_count=scorer_min_count, corpus_word_count=corpus_word_count)
                         # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
-                        #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
-                        # added mincount check because if the scorer doesn't contain min_count
-                        # it would not be enforced otherwise
+                        #     bigram_word, count_ab, scorer_min_count, count_a, count_ab, len_vocab, score)
                         if score > threshold and count_ab >= min_count:
                             if as_tuples:
                                 yield ((word_a, word_b), score)
@@ -342,6 +338,16 @@ def __getitem__(self, sentence):
         """
         warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
 
+        vocab = self.vocab
+        threshold = self.threshold
+        delimiter = self.delimiter  # delimiter used for lookup
+        min_count = self.min_count
+        scorer = self.scoring
+        # made floats for scoring function
+        len_vocab = float(len(vocab))
+        scorer_min_count = float(min_count)
+        corpus_word_count = float(self.corpus_word_count)
+
         is_single, sentence = _is_single(sentence)
         if not is_single:
             # if the input is an entire corpus (rather than a single sentence),
@@ -351,20 +357,20 @@ def __getitem__(self, sentence):
         s, new_s = [utils.any2utf8(w) for w in sentence], []
         last_bigram = False
         vocab = self.vocab
-        threshold = self.threshold
-        delimiter = self.delimiter
-        min_count = self.min_count
+
         for word_a, word_b in zip(s, s[1:]):
-            if word_a in vocab and word_b in vocab:
+            # last bigram check was moved here to save a few CPU cycles
+            if word_a in vocab and word_b in vocab and not last_bigram:
                 bigram_word = delimiter.join((word_a, word_b))
-                if bigram_word in vocab and not last_bigram:
-                    pa = float(vocab[word_a])
-                    pb = float(vocab[word_b])
-                    pab = float(vocab[bigram_word])
-                    score = (pab - min_count) / pa / pb * len(vocab)
+                if bigram_word in vocab:
+                    count_a = float(vocab[word_a])
+                    count_b = float(vocab[word_b])
+                    count_ab = float(vocab[bigram_word])
+                    # scoring MUST have all these parameters, even if they are not used
+                    score = scorer(worda_count=count_a, wordb_count=count_b, bigram_count=count_ab, len_vocab=len_vocab, min_count=scorer_min_count, corpus_word_count=corpus_word_count)
                     # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
-                    #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
-                    if score > threshold:
+                    #     bigram_word, count_ab, scorer_min_count, count_a, count_ab, len_vocab, score)
+                    if score > threshold and count_ab >= min_count:
                         new_s.append(bigram_word)
                         last_bigram = True
                         continue
diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index bde64e06dc..591883eec1 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -12,6 +12,7 @@
 import unittest
 import os
 import sys
+import pdb
 
 from gensim import utils
 from gensim.models.phrases import Phrases, Phraser
@@ -169,6 +170,15 @@ def testScoringDefault(self):
             3.444  # score for human interface
         ])
 
+    def test__getitem__(self):
+        """ test Phrases[sentences] with a single sentence"""
+        bigram = Phrases(sentences, min_count=1, threshold=1)
+        # pdb.set_trace()
+        test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
+        phrased_sentences = bigram[test_sentences].__iter__().next()
+
+        assert phrased_sentences == ['graph_minors', 'survey', 'human_interface']
+
     def testScoringNpmi(self):
         """ test normalized pointwise mutual information scoring """
         bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')
@@ -183,6 +193,9 @@ def testScoringNpmi(self):
             .714  # score for human interface
         ])
 
+        phrased_sentences = bigram[test_sentences].__iter__().next()
+        assert phrased_sentences == ['graph_minors', 'survey', 'human_interface']
+
     def testCustomScorer(self):
         """ test using a custom scoring function """
         # all scores will be 1

From 9b3f801a86c11e3f0248df84e7aab69d723163fa Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Wed, 6 Sep 2017 14:14:25 -0400
Subject: [PATCH 05/17] travisCI style fixes

---
 gensim/models/phrases.py    | 2 +-
 gensim/test/test_phrases.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 9b922eaff2..b3adc814ce 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -183,7 +183,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
             else:
                 raise ValueError('unknown scoring method string %s specified' % (scoring))
 
-        scoring_parameters = ['worda_count','wordb_count','bigram_count','len_vocab','min_count','corpus_word_count']
+        scoring_parameters = ['worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count']
         if callable(scoring):
             if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
                 self.scoring = scoring
diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index 591883eec1..71a9e237f4 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -12,7 +12,6 @@
 import unittest
 import os
 import sys
-import pdb
 
 from gensim import utils
 from gensim.models.phrases import Phrases, Phraser
@@ -209,8 +208,8 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
         for phrase, score in bigram.export_phrases(test_sentences):
             seen_scores.append(score)
 
-        assert all(seen_scores) # all scores 1
-        assert len(seen_scores) == 3 #'graph minors' and 'survey human' and 'interface system'
+        assert all(seen_scores)  # all scores 1
+        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
 
     def testBadParameters(self):
         """Test the phrases module with bad parameters."""

From 2698aa7ed545616871755a24105c6d077f8ac063 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 08:28:27 -0400
Subject: [PATCH 06/17] replaced .next() with builtin next() for python 3 compatibility

---
 gensim/test/test_phrases.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index 71a9e237f4..083d07b537 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -174,9 +174,9 @@ def test__getitem__(self):
         bigram = Phrases(sentences, min_count=1, threshold=1)
         # pdb.set_trace()
         test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
-        phrased_sentences = bigram[test_sentences].__iter__().next()
+        phrased_sentence = next(bigram[test_sentences].__iter__())
 
-        assert phrased_sentences == ['graph_minors', 'survey', 'human_interface']
+        assert phrased_sentence == ['graph_minors', 'survey', 'human_interface']
 
     def testScoringNpmi(self):
         """ test normalized pointwise mutual information scoring """
@@ -192,9 +192,6 @@ def testScoringNpmi(self):
             .714  # score for human interface
         ])
 
-        phrased_sentences = bigram[test_sentences].__iter__().next()
-        assert phrased_sentences == ['graph_minors', 'survey', 'human_interface']
-
     def testCustomScorer(self):
         """ test using a custom scoring function """
         # all scores will be 1
@@ -209,7 +206,7 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
             seen_scores.append(score)
 
         assert all(seen_scores)  # all scores 1
-        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
+        assert len(seen_scores) == 3  #'graph minors' and 'survey human' and 'interface system'
 
     def testBadParameters(self):
         """Test the phrases module with bad parameters."""

From accea8c4b9b6f2a931f28603e680f6ce1f38e8b3 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 10:53:49 -0400
Subject: [PATCH 07/17] misc fixes

---
 gensim/models/phrases.py    | 10 ++++------
 gensim/test/test_phrases.py |  2 +-
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index b3adc814ce..3da1f6dfbf 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -177,9 +177,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         # to still run the check of scoring function parameters in the next code block
         if type(scoring) is str:
             if scoring == 'default':
-                scoring = self.original_scorer
+                scoring = original_scorer
             elif scoring == 'npmi':
-                scoring = self.npmi_scorer
+                scoring = npmi_scorer
             else:
                 raise ValueError('unknown scoring method string %s specified' % (scoring))
 
@@ -390,13 +390,11 @@ def __getitem__(self, sentence):
     # to the scoring method in __getitem__ and export_phrases.
 
     # calculation of score based on original mikolov word2vec paper
-    @staticmethod
-    def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
         return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
 
     # normalized PMI, requires corpus size
-    @staticmethod
-    def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
         pa = worda_count / corpus_word_count
         pb = wordb_count / corpus_word_count
         pab = bigram_count / corpus_word_count
diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index 083d07b537..4e75dfb701 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -206,7 +206,7 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co
             seen_scores.append(score)
 
         assert all(seen_scores)  # all scores 1
-        assert len(seen_scores) == 3  #'graph minors' and 'survey human' and 'interface system'
+        assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
 
     def testBadParameters(self):
         """Test the phrases module with bad parameters."""

From 8854097d5e25be1fe5c350a0acebbab4d2664d2a Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 11:03:31 -0400
Subject: [PATCH 08/17] spacing fixes for style

---
 gensim/models/phrases.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 3da1f6dfbf..45b25fc96e 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -389,11 +389,13 @@ def __getitem__(self, sentence):
     # these two built-in scoring methods don't cast everything to float because the casting is done in the call
     # to the scoring method in __getitem__ and export_phrases.
 
+
     # calculation of score based on original mikolov word2vec paper
 def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
         return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
 
-    # normalized PMI, requires corpus size
+
+# normalized PMI, requires corpus size
 def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
         pa = worda_count / corpus_word_count
         pb = wordb_count / corpus_word_count

From bbaf3f727da06f2db076ec4e036a1fb5a04f4404 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 11:21:53 -0400
Subject: [PATCH 09/17] custom scorer support in sklearn api

---
 gensim/sklearn_api/phrases.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py
index 8a944f0235..2eab84b95e 100644
--- a/gensim/sklearn_api/phrases.py
+++ b/gensim/sklearn_api/phrases.py
@@ -22,7 +22,7 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator):
     """
 
     def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
-            delimiter=b'_', progress_per=10000):
+            delimiter=b'_', progress_per=10000, scoring='default'):
         """
         Sklearn wrapper for Phrases model.
         """
@@ -32,13 +32,14 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000,
         self.max_vocab_size = max_vocab_size
         self.delimiter = delimiter
         self.progress_per = progress_per
+        self.scoring = scoring
 
     def fit(self, X, y=None):
         """
         Fit the model according to the given training data.
         """
         self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold,
-            max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per)
+            max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per, scoring=self.scoring)
         return self
 
     def transform(self, docs):
@@ -62,7 +63,7 @@ def transform(self, docs):
     def partial_fit(self, X):
         if self.gensim_model is None:
             self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold,
-                max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per)
+                max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per, scoring=self.scoring)
 
         self.gensim_model.add_vocab(X)
         return self

From 4e555c4c3348697fd7d49f5bfebd3bd45c4f04c2 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 11:37:30 -0400
Subject: [PATCH 10/17] Phrases scikit interface tests for pluggable scoring

---
 gensim/test/test_sklearn_api.py | 59 +++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py
index 0e17905c2b..46e407646b 100644
--- a/gensim/test/test_sklearn_api.py
+++ b/gensim/test/test_sklearn_api.py
@@ -957,6 +957,65 @@ def testModelNotFitted(self):
         phrases_transformer = PhrasesTransformer()
         self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0])
 
+# specifically test pluggable scoring in Phrases, because possible pickling issues with function parameter
+
+# all scores will be 1
+# this is intentionally in main rather than a class method to support pickling
+def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+    return 1
+
+class TestPhrasesTransformerCustomScorer(unittest.TestCase):
+
+    def setUp(self):
+        numpy.random.seed(0)
+
+        self.model = PhrasesTransformer(min_count=1, threshold=.9, scoring=dumb_scorer)
+        self.model.fit(phrases_sentences)
+
+    def testTransform(self):
+        # transform one document
+        doc = phrases_sentences[-1]
+        phrase_tokens = self.model.transform(doc)[0]
+        expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface']
+        self.assertEqual(phrase_tokens, expected_phrase_tokens)
+
+    def testPartialFit(self):
+        new_sentences = [
+            ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'],
+            ['world', 'peace', 'people'],
+            ['world', 'peace', 'humans']
+        ]
+        self.model.partial_fit(X=new_sentences)  # train model with new sentences
+
+        doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace']
+        phrase_tokens = self.model.transform(doc)[0]
+        expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface', u'world_peace']
+        self.assertEqual(phrase_tokens, expected_phrase_tokens)
+
+    def testSetGetParams(self):
+        # updating only one param
+        self.model.set_params(progress_per=5000)
+        model_params = self.model.get_params()
+        self.assertEqual(model_params["progress_per"], 5000)
+
+        # verify that the attributes values are also changed for `gensim_model` after fitting
+        self.model.fit(phrases_sentences)
+        self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000)
+
+    def testPersistence(self):
+        model_dump = pickle.dumps(self.model)
+        model_load = pickle.loads(model_dump)
+
+        doc = phrases_sentences[-1]
+        loaded_phrase_tokens = model_load.transform(doc)
+
+        # comparing the original and loaded models
+        original_phrase_tokens = self.model.transform(doc)
+        self.assertEqual(original_phrase_tokens, loaded_phrase_tokens)
+
+    def testModelNotFitted(self):
+        phrases_transformer = PhrasesTransformer()
+        self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0])
 
 if __name__ == '__main__':
     unittest.main()

From b16554f943d2e1dbd14311dc79ef52a006ad11f6 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 14:23:45 -0400
Subject: [PATCH 11/17] missing line breaks

---
 gensim/test/test_sklearn_api.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py
index 46e407646b..8a5b3b1e5e 100644
--- a/gensim/test/test_sklearn_api.py
+++ b/gensim/test/test_sklearn_api.py
@@ -957,13 +957,15 @@ def testModelNotFitted(self):
         phrases_transformer = PhrasesTransformer()
         self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0])
 
+
 # specifically test pluggable scoring in Phrases, because possible pickling issues with function parameter
 
-# all scores will be 1
 # this is intentionally in main rather than a class method to support pickling
+# all scores will be 1
 def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
     return 1
 
+
 class TestPhrasesTransformerCustomScorer(unittest.TestCase):
 
     def setUp(self):

From a94a3fd9139e620a3352704c7ad00801b1735dd7 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 14:51:38 -0400
Subject: [PATCH 12/17] style, clarity, and robustness fixes requested by
 @piskvorky

---
 gensim/models/phrases.py  | 12 ++++++------
 gensim/models/word2vec.py | 18 ++++++++++++------
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 45b25fc96e..9962144cd5 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -175,7 +175,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         # set scoring based on string
         # intentially override the value of the scoring parameter rather than set self.scoring here,
         # to still run the check of scoring function parameters in the next code block
-        if type(scoring) is str:
+        if isinstance(scoring, basestring):
             if scoring == 'default':
                 scoring = original_scorer
             elif scoring == 'npmi':
@@ -392,15 +392,15 @@ def __getitem__(self, sentence):
 
     # calculation of score based on original mikolov word2vec paper
 def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
-        return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
+    return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
 
 
 # normalized PMI, requires corpus size
 def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
-        pa = worda_count / corpus_word_count
-        pb = wordb_count / corpus_word_count
-        pab = bigram_count / corpus_word_count
-        return log(pab / (pa * pb)) / -log(pab)
+    pa = worda_count / corpus_word_count
+    pb = wordb_count / corpus_word_count
+    pab = bigram_count / corpus_word_count
+    return log(pab / (pa * pb)) / -log(pab)
 
 
 def pseudocorpus(source_vocab, sep):
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 96c7195dc9..0e612ff82f 100644
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -1555,15 +1555,20 @@ def __iter__(self):
 
 class PathLineSentences(object):
     """
-    Simple format: one sentence = one line; words already preprocessed and separated by whitespace.
-    Like LineSentence, but will process all files in a directory in alphabetical order by filename
+
+    Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename.
+    The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending
+    with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories.
+
+    The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already
+    preprocessed and separated by whitespace.
+
     """
 
     def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         """
         `source` should be a path to a directory (as a string) where all files can be opened by the
-        LineSentence class. Each file will be read up to
-        `limit` lines (or not clipped if limit is None, the default).
+        LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default).
 
         Example::
 
@@ -1577,11 +1582,12 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None):
         self.limit = limit
 
         if os.path.isfile(self.source):
-            logger.warning('single file read, better to use models.word2vec.LineSentence')
+            logger.debug('single file given as source, rather than a directory of files')
+            logger.debug('consider using models.word2vec.LineSentence for a single file')
             self.input_files = [self.source]  # force code compatibility with list of files
         elif os.path.isdir(self.source):
             self.source = os.path.join(self.source, '')  # ensures os-specific slash at end of path
-            logger.warning('reading directory %s', self.source)
+            logger.info('reading directory %s', self.source)
             self.input_files = os.listdir(self.source)
             self.input_files = [self.source + filename for filename in self.input_files]  # make full paths
             self.input_files.sort()  # makes sure it happens in filename order

From f9cc04f1b8b6adaaf6f1f48684439c97af4e40a0 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 15:19:38 -0400
Subject: [PATCH 13/17] check in Phrases init to make sure scorer is pickleable

---
 gensim/models/phrases.py    |  9 +++++++++
 gensim/test/test_phrases.py | 11 ++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 9962144cd5..31f8d205a7 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -66,6 +66,7 @@
 import itertools as it
 from math import log
 from inspect import getargspec
+import pickle
 
 from six import iteritems, string_types, next
 
@@ -162,6 +163,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
             corpus_word_count: the total number of (non-unique) tokens in `sentences`
         A scoring function without any of these parameters (even if the parameters are not used) will
             raise a ValueError on initialization of the Phrases class
+        The scoring function must be picklable
 
         """
         if min_count <= 0:
@@ -199,6 +201,13 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         self.progress_per = progress_per
         self.corpus_word_count = 0
 
+        # ensure picklability of custom scorer
+        try:
+            test_pickle = pickle.dumps(self.scoring)
+            load_pickle = pickle.loads(test_pickle)
+        except pickle.PickleError:
+            raise pickle.PickleError('unable to pickle custom Phrases scoring function')
+
         if sentences is not None:
             self.add_vocab(sentences)
 
diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index 4e75dfb701..5c07016d96 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -123,6 +123,14 @@ def testEncoding(self):
         self.assertTrue(isinstance(transformed, unicode))
 
 
+# scorer for testCustomScorer
+# the function is defined at module level, outside the test, so that the custom scorer is picklable;
+# Phrases checks the scoring function for picklability on initialization
+# all scores will be 1
+def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
+    return 1
+
+
 class TestPhrasesModel(unittest.TestCase):
     def testExportPhrases(self):
         """Test Phrases bigram export_phrases functionality."""
@@ -194,9 +202,6 @@ def testScoringNpmi(self):
 
     def testCustomScorer(self):
         """ test using a custom scoring function """
-        # all scores will be 1
-        def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
-            return 1
 
         bigram = Phrases(sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
 

From 5bbe144ea899a7dc9e46e87d1f6999e5e7f59d1b Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 16:55:33 -0400
Subject: [PATCH 14/17] backwards scoring compatibility when loading a Phrases
 class

---
 gensim/models/phrases.py    | 51 +++++++++++++++++++---
 gensim/test/test_phrases.py | 86 +++++++++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+), 5 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 31f8d205a7..85309517a2 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -74,7 +74,6 @@
 
 logger = logging.getLogger(__name__)
 
-
 def _is_single(obj):
     """
     Check whether `obj` is a single document or an entire corpus.
@@ -177,6 +176,13 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         # set scoring based on string
         # intentially override the value of the scoring parameter rather than set self.scoring here,
         # to still run the check of scoring function parameters in the next code block
+
+        # for python 2 and 3 compatibility. basestring is used to check if scoring is a string
+        try:
+            basestring
+        except NameError:
+            basestring = str
+
         if isinstance(scoring, basestring):
             if scoring == 'default':
                 scoring = original_scorer
@@ -395,11 +401,46 @@ def __getitem__(self, sentence):
 
         return [utils.to_unicode(w) for w in new_s]
 
-    # these two built-in scoring methods don't cast everything to float because the casting is done in the call
-    # to the scoring method in __getitem__ and export_phrases.
-
+    @classmethod
+    def load(cls, *args, **kwargs):
+        """
+        Load a previously saved Phrases class. Handles backwards compatibility from older Phrases versions which did not support
+            pluggable scoring functions. Otherwise, relies on utils.load
+        """
 
-    # calculation of score based on original mikolov word2vec paper
+        # for python 2 and 3 compatibility. basestring is used to check if model.scoring is a string
+        try:
+            basestring
+        except NameError:
+            basestring = str
+
+        model = super(Phrases, cls).load(*args, **kwargs)
+        # update older models
+        # if no scoring parameter, use default scoring
+        if not hasattr(model, 'scoring'):
+            logger.info('older version of Phrases loaded without scoring function')
+            logger.info('setting pluggable scoring method to original_scorer for compatibility')
+            model.scoring = original_scorer
+        # if there is a scoring parameter, and it's a text value, load the proper scoring function
+        if hasattr(model, 'scoring'):
+            if isinstance(model.scoring, basestring):
+                if model.scoring == 'default':
+                    logger.info('older version of Phrases loaded with "default" scoring parameter')
+                    logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility')
+                    model.scoring = original_scorer
+                elif model.scoring == 'npmi':
+                    logger.info('older version of Phrases loaded with "npmi" scoring parameter')
+                    logger.info('setting scoring method to npmi_scorer pluggable scoring method for compatibility')
+                    model.scoring = npmi_scorer
+                else:
+                    raise ValueError('failed to load Phrases model with unknown scoring setting %s' % (model.scoring))
+        return model
+
+
+# these two built-in scoring methods don't cast everything to float because the casting is done in the call
+# to the scoring method in __getitem__ and export_phrases.
+
+# calculation of score based on original mikolov word2vec paper
 def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
     return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
 
diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py
index 5c07016d96..ad76d43956 100644
--- a/gensim/test/test_phrases.py
+++ b/gensim/test/test_phrases.py
@@ -225,6 +225,92 @@ def testPruning(self):
         """Test that max_vocab_size parameter is respected."""
         bigram = Phrases(sentences, max_vocab_size=5)
         self.assertTrue(len(bigram.vocab) <= 5)
+
+    def testSaveLoadCustomScorer(self):
+        """ saving and loading a Phrases object with a custom scorer """
+
+        try:
+            bigram = Phrases(sentences, min_count=1, threshold=.001, scoring=dumb_scorer)
+            bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
+            bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
+            seen_scores = []
+            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
+            for phrase, score in bigram_loaded.export_phrases(test_sentences):
+                seen_scores.append(score)
+
+            assert all(seen_scores)  # all scores 1
+            assert len(seen_scores) == 3  # 'graph minors' and 'survey human' and 'interface system'
+
+        finally:
+            if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"):
+                os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl")
+
+    def testSaveLoad(self):
+        """ Saving and loading a Phrases object."""
+
+        try:
+            bigram = Phrases(sentences, min_count=1, threshold=1)
+            bigram.save("test_phrases_testSaveLoad_temp_save.pkl")
+            bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl")
+            seen_scores = set()
+            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
+            for phrase, score in bigram_loaded.export_phrases(test_sentences):
+                seen_scores.add(round(score, 3))
+
+            assert seen_scores == set([
+                5.167,  # score for graph minors
+                3.444  # score for human interface
+            ])
+
+        finally:
+            if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"):
+                os.remove("test_phrases_testSaveLoad_temp_save.pkl")
+
+    def testSaveLoadStringScoring(self):
+        """ Saving and loading a Phrases object with a string scoring parameter.
+        This should ensure backwards compatibility with the previous version of Phrases"""
+
+        try:
+            bigram = Phrases(sentences, min_count=1, threshold=1)
+            bigram.scoring = "default"
+            bigram.save("test_phrases_testSaveLoadStringScoring_temp_save.pkl")
+            bigram_loaded = Phrases.load("test_phrases_testSaveLoadStringScoring_temp_save.pkl")
+            seen_scores = set()
+            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
+            for phrase, score in bigram_loaded.export_phrases(test_sentences):
+                seen_scores.add(round(score, 3))
+
+            assert seen_scores == set([
+                5.167,  # score for graph minors
+                3.444  # score for human interface
+            ])
+
+        finally:
+            if os.path.exists("test_phrases_testSaveLoadStringScoring_temp_save.pkl"):
+                os.remove("test_phrases_testSaveLoadStringScoring_temp_save.pkl")
+
+    def testSaveLoadNoScoring(self):
+        """ Saving and loading a Phrases object with no scoring parameter.
+        This should ensure backwards compatibility with old versions of Phrases"""
+
+        try:
+            bigram = Phrases(sentences, min_count=1, threshold=1)
+            del(bigram.scoring)
+            bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
+            bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
+            seen_scores = set()
+            test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']]
+            for phrase, score in bigram_loaded.export_phrases(test_sentences):
+                seen_scores.add(round(score, 3))
+
+            assert seen_scores == set([
+                5.167,  # score for graph minors
+                3.444  # score for human interface
+            ])
+
+        finally:
+            if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"):
+                os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl")
 # endclass TestPhrasesModel
 
 

From 1481342bc8a2d2ab19b759d5b141e701c2a2b5a5 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Thu, 7 Sep 2017 17:00:08 -0400
Subject: [PATCH 15/17] removal of pickle testing objects in Phrases init

---
 gensim/models/phrases.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index 85309517a2..a97f0f479d 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -213,6 +213,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
             load_pickle = pickle.loads(test_pickle)
         except pickle.PickleError:
             raise pickle.PickleError('unable to pickle custom Phrases scoring function')
+        finally:
+            del(test_pickle)
+            del(load_pickle)
 
         if sentences is not None:
             self.add_vocab(sentences)

From fb7fbb153d063fd3ff396ee77bd21f4bcd3ed624 Mon Sep 17 00:00:00 2001
From: Michael Sherman <msherman49@bloomberg.net>
Date: Mon, 11 Sep 2017 11:32:49 -0400
Subject: [PATCH 16/17] switched to six for python 2/3 compatibility

---
 gensim/models/phrases.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index a97f0f479d..faa57d3a56 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -67,6 +67,7 @@
 from math import log
 from inspect import getargspec
 import pickle
+import six
 
 from six import iteritems, string_types, next
 
@@ -177,13 +178,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
         # intentially override the value of the scoring parameter rather than set self.scoring here,
         # to still run the check of scoring function parameters in the next code block
 
-        # for python 2 and 3 compatibility. basestring is used to check if scoring is a string
-        try:
-            basestring
-        except NameError:
-            basestring = str
-
-        if isinstance(scoring, basestring):
+        if isinstance(scoring, six.string_types):
             if scoring == 'default':
                 scoring = original_scorer
             elif scoring == 'npmi':

From e866d3f0646ac354636ec4b6a947c0b4a34d02a3 Mon Sep 17 00:00:00 2001
From: ivan <menshikh.iv@gmail.com>
Date: Tue, 24 Oct 2017 15:56:43 +0500
Subject: [PATCH 17/17] fix docstring

---
 gensim/models/phrases.py | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index ec5addef0a..2ec8592bcd 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -140,30 +140,33 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=4
         setting. `scoring` can be set with either a string that refers to a built-in scoring function,
         or with a function with the expected parameter names. Two built-in scoring functions are available
         by setting `scoring` to a string:
-            'default': from "Efficient Estimaton of Word Representations in Vector Space" by
-                Mikolov, et. al.:
-                (count(worda followed by wordb) - min_count) * N /
-                (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size.
-            'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
+
+        'default': from "Efficient Estimation of Word Representations in Vector Space" by
+                   Mikolov et al.:
+                   `(count(worda followed by wordb) - min_count) * N /
+                   (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size.
+        'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
                 Information in Colocation Extraction" by Gerlof Bouma:
                 ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) /
                 - ln(prop(worda followed by wordb)
                 where prop(n) is the count of n / the count of everything in the entire corpus
 
-            'npmi' is more robust when dealing with common words that form part of common bigrams, and
-            ranges from -1 to 1, but is slower to calculate than the default
+        'npmi' is more robust when dealing with common words that form part of common bigrams, and
+        ranges from -1 to 1, but is slower to calculate than the default
+
         To use a custom scoring function, create a function with the following parameters and set the `scoring`
         parameter to the custom function. You must use all the parameters in your function call, even if the
         function does not require all the parameters.
+
             worda_count: number of occurrances in `sentences` of the first token in the phrase being scored
             wordb_count: number of occurrances in `sentences` of the second token in the phrase being scored
             bigram_count: number of occurrances in `sentences` of the phrase being scored
             len_vocab: the number of unique tokens in `sentences`
             min_count: the `min_count` setting of the Phrases class
             corpus_word_count: the total number of (non-unique) tokens in `sentences`
+
         A scoring function without any of these parameters (even if the parameters are not used) will
-            raise a ValueError on initialization of the Phrases class
-        The scoring function must be picklable
+        raise a ValueError on initialization of the Phrases class. The scoring function must be picklable.
 
         """
         if min_count <= 0: