From 04999a4e071b9ad9d10d7c261f45312c118e33b0 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Fri, 30 Jun 2017 16:19:48 -0400 Subject: [PATCH 01/11] initial commit of alternate scoring now with a scoring parameter to initialize a Phrases object, defaults to the mikolov paper scoring, but also switchable to 'npmi', normalized pointwise mutual information moved scoring calculation to call a function, scoring functions are now top level functions in models.Phrases that are called when calculating scores in models.Phrases.export_phrases --- gensim/models/phrases.py | 82 +++++++++++++++++++++++++++++++--------- 1 file changed, 64 insertions(+), 18 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index be735b865a..5ae9100d94 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -106,7 +106,8 @@ class Phrases(interfaces.TransformationABC): """ def __init__(self, sentences=None, min_count=5, threshold=10.0, - max_vocab_size=40000000, delimiter=b'_', progress_per=10000): + max_vocab_size=40000000, delimiter=b'_', progress_per=10000, + scoring = 'default'): """ Initialize the model from an iterable of `sentences`. Each sentence must be a list of words (unicode strings) that will be used for training. @@ -120,10 +121,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, `min_count` ignore all words and bigrams with total collected count lower than this. - `threshold` represents a threshold for forming the phrases (higher means - fewer phrases). A phrase of words `a` and `b` is accepted if - `(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is the - total vocabulary size. + `threshold` represents a score threshold for forming the phrases (higher means + fewer phrases). A phrase of words `a` followed by `b` is accepted if the score of the + phrase is greater than threshold. see the `scoring' setting `max_vocab_size` is the maximum size of the vocabulary. 
Used to control pruning of less common words, to keep memory under control. The default @@ -133,6 +133,20 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, `delimiter` is the glue character used to join collocation tokens, and should be a byte string (e.g. b'_'). + `scoring` specifies how potential phrases are scored for comparison to the `threshold` + setting. two settings are available: + 'default': from "Efficient Estimation of Word Representations in Vector Space" by + Mikolov, et al.: + (count(worda followed by wordb) - min_count) * N / + (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size. + 'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual + Information in Collocation Extraction" by Gerlof Bouma: + ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / + - ln(prop(worda followed by wordb) + where prop(n) is the count of n / the count of everything in the entire corpus + 'npmi' is more robust when dealing with common words that form part of common bigrams, and + ranges from 0 to 1, but is slower to calculate than the default + """ if min_count <= 0: raise ValueError("min_count should be at least 1") @@ -147,6 +161,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, self.min_reduce = 1 # ignore any tokens with count smaller than this self.delimiter = delimiter self.progress_per = progress_per + self.scoring = scoring + self.corpus_word_count = 0L if sentences is not None: self.add_vocab(sentences) @@ -178,6 +194,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): if sentence: # add last word skipped by previous loop word = sentence[-1] vocab[word] += 1 + total_words += 1 if len(vocab) > max_vocab_size: utils.prune_vocab(vocab, min_reduce) @@ -185,7 +202,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and 
%i sentences" % (len(vocab), total_words, sentence_no + 1)) - return min_reduce, vocab + return min_reduce, vocab, total_words def add_vocab(self, sentences): """ @@ -199,6 +216,7 @@ def add_vocab(self, sentences): # counts collected in previous learn_vocab runs. min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) + self.corpus_word_count += total_words if len(self.vocab) > 0: logger.info("merging %i counts into %s", len(vocab), self) self.min_reduce = max(self.min_reduce, min_reduce) @@ -226,31 +244,45 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): then you can debug the threshold with generated tsv """ + + vocab = self.vocab + threshold = self.threshold + delimiter = self.delimiter # delimiter used for lookup + min_count = self.min_count + scoring = self.scoring + corpus_word_count = self.corpus_word_count + + if scoring == 'mikolov': + scoring_function = partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) + if scoring == 'npmi': + scoring_function = partial(self.npmi_scorer, corpus_word_count = corpus_word_count) + #TODO else: make sure this asserts if there is no scoring function + for sentence in sentences: s = [utils.any2utf8(w) for w in sentence] last_bigram = False - vocab = self.vocab - threshold = self.threshold - delimiter = self.delimiter # delimiter used for lookup - min_count = self.min_count + for word_a, word_b in zip(s, s[1:]): - if word_a in vocab and word_b in vocab: + # last bigram check was moved here to save a few CPU cycles + if word_a in vocab and word_b in vocab and not last_bigram: bigram_word = delimiter.join((word_a, word_b)) - if bigram_word in vocab and not last_bigram: - pa = float(vocab[word_a]) - pb = float(vocab[word_b]) - pab = float(vocab[bigram_word]) - score = (pab - min_count) / pa / pb * len(vocab) + if bigram_word in vocab: + count_a = float(vocab[word_a]) + count_b = float(vocab[word_b]) + count_ab = 
float(vocab[bigram_word]) + score = scoring_function(count_a, count_b, count_ab) # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) - if score > threshold: + # added mincount check because if the scorer doesn't contain min_count + # it would not be enforced otherwise + if score > threshold and count_ab >= min_count: if as_tuples: yield ((word_a, word_b), score) else: yield (out_delimiter.join((word_a, word_b)), score) last_bigram = True continue - last_bigram = False + last_bigram = False def __getitem__(self, sentence): """ @@ -311,6 +343,20 @@ def __getitem__(self, sentence): return [utils.to_unicode(w) for w in new_s] + # calculation of score based on original mikolov word2vec paper + # len_vocab and min_count set so functools.partial works + @staticmethod + def original_scorer(worda_count, wordb_count, bigram_count, len_vocab = 0.0, min_count = 0.0): + return (bigram_count - min_count) / worda_count / wordb_count * len_vocab + + # normalized PMI, requires corpus size + @staticmethod + def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count = 0.0): + pa = worda_count / corpus_word_count + pb = wordb_count / corpus_word_count + pab = bigram_count / corpus_word_count + return log(pab / (pa * pb)) / -log(pab) + def pseudocorpus(source_vocab, sep): """Feeds source_vocab's compound keys back to it, to discover phrases""" From 8984589ade598dabf31c343e470be4da548dc0f7 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 5 Jul 2017 11:41:17 -0400 Subject: [PATCH 02/11] all existing tests now pass fixed some bugs with the pluggable scoring that were causing tests to fail. 
--- gensim/models/phrases.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 5ae9100d94..7446c9ccfc 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -64,6 +64,7 @@ import warnings from collections import defaultdict import itertools as it +from functools import partial from six import iteritems, string_types, next @@ -161,8 +162,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, self.min_reduce = 1 # ignore any tokens with count smaller than this self.delimiter = delimiter self.progress_per = progress_per - self.scoring = scoring - self.corpus_word_count = 0L + self.scoring = scoring + self.corpus_word_count = 0L if sentences is not None: self.add_vocab(sentences) @@ -194,7 +195,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000): if sentence: # add last word skipped by previous loop word = sentence[-1] vocab[word] += 1 - total_words += 1 + total_words += 1 if len(vocab) > max_vocab_size: utils.prune_vocab(vocab, min_reduce) @@ -214,7 +215,8 @@ def add_vocab(self, sentences): # directly, but gives the new sentences a fighting chance to collect # sufficient counts, before being pruned out by the (large) accummulated # counts collected in previous learn_vocab runs. 
- min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) + min_reduce, vocab, total_words = \ + self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) self.corpus_word_count += total_words if len(self.vocab) > 0: @@ -252,11 +254,12 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): scoring = self.scoring corpus_word_count = self.corpus_word_count - if scoring == 'mikolov': + if scoring == 'default': scoring_function = partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) - if scoring == 'npmi': + elif scoring == 'npmi': scoring_function = partial(self.npmi_scorer, corpus_word_count = corpus_word_count) - #TODO else: make sure this asserts if there is no scoring function + else: + raise ValueError('unknown scoring function specified') for sentence in sentences: s = [utils.any2utf8(w) for w in sentence] From a36b2fb1d804c900c5eb250ef37a9b7c02189a36 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 5 Jul 2017 12:24:05 -0400 Subject: [PATCH 03/11] added testScoringOriginal to test default scoring --- gensim/test/test_phrases.py | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index ba2cfc7192..3f5a7080f6 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -138,7 +138,7 @@ def testExportPhrases(self): b'human interface' ]) - def test_multiple_bigrams_single_entry(self): + def testMultipleBigramsSingleEntry(self): """ a single entry should produce multiple bigrams. """ bigram = Phrases(sentences, min_count=1, threshold=1) @@ -153,6 +153,40 @@ def test_multiple_bigrams_single_entry(self): b'human interface' ]) + def testScoringOriginal(self): + """ a single entry should produce multiple bigrams. 
""" + bigram = Phrases(sentences, min_count=1, threshold=1) + + seen_scores = set() + + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_scores.add(round(score,3)) + + assert seen_scores == set([ + 5.167, # score for graph minors + 3.444 # score for human interface + ]) + + def testScoringNpmi(self): + """ a single entry should produce multiple bigrams. """ + bigram = Phrases(sentences, min_count=1, threshold=1) + + seen_bigrams = set() + seen_scores = set() + + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] + logging.debug(str(len(bigram.vocab))) + for phrase, score in bigram.export_phrases(test_sentences): + logging.debug('test Scoring Phrase ' + phrase) + logging.debug('test Scoring score ' + str(score)) + seen_scores.add(round(score,3)) + + assert seen_scores == set([ + 5.167, + 3.444 + ]) + def testBadParameters(self): """Test the phrases module with bad parameters.""" # should fail with something less or equal than 0 From 5043dbbae4eba7821d4de8237c04b4858358e8cf Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 5 Jul 2017 12:26:48 -0400 Subject: [PATCH 04/11] better name for test for default scorer --- gensim/test/test_phrases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 3f5a7080f6..bdf6997c3b 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -153,7 +153,7 @@ def testMultipleBigramsSingleEntry(self): b'human interface' ]) - def testScoringOriginal(self): + def testScoringDefault(self): """ a single entry should produce multiple bigrams. 
""" bigram = Phrases(sentences, min_count=1, threshold=1) From 384172ef07c6a8a799f22319c2a1e17021f7ea2d Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 5 Jul 2017 13:38:52 -0400 Subject: [PATCH 05/11] moved scoring parameter checking logic to initialization --- gensim/models/phrases.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 7446c9ccfc..8506a256ff 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -65,6 +65,7 @@ from collections import defaultdict import itertools as it from functools import partial +from math import log from six import iteritems, string_types, next @@ -152,8 +153,13 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, if min_count <= 0: raise ValueError("min_count should be at least 1") - if threshold <= 0: - raise ValueError("threshold should be positive") + if threshold <= 0 and scoring == 'default': + raise ValueError("threshold should be positive for default scoring") + if scoring == 'npmi' and (threshold < -1 or threshold > 1): + raise ValueError("threshold should be between -1 and 1 for npmi scoring") + + if not (scoring == 'default' or scoring == 'npmi'): + raise ValueError('unknown scoring function "' + scoring + '" specified') self.min_count = min_count self.threshold = threshold @@ -247,19 +253,21 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): then you can debug the threshold with generated tsv """ + if scoring == 'default': + self.scoring_function = \ + partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) + elif scoring == 'npmi': + self.scoring_function = \ + partial(self.npmi_scorer, corpus_word_count = corpus_word_count) + # no else here to catch unknown scoring function, check is done in Phrases.__init__ + vocab = self.vocab threshold = self.threshold delimiter = self.delimiter # delimiter used for lookup min_count 
= self.min_count scoring = self.scoring corpus_word_count = self.corpus_word_count - - if scoring == 'default': - scoring_function = partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) - elif scoring == 'npmi': - scoring_function = partial(self.npmi_scorer, corpus_word_count = corpus_word_count) - else: - raise ValueError('unknown scoring function specified') + scoring_function = self.scoring_function for sentence in sentences: s = [utils.any2utf8(w) for w in sentence] From e3eeb678bae640c313e57ae753f9f5b2b127e4e0 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 5 Jul 2017 13:43:23 -0400 Subject: [PATCH 06/11] fixed bugin export_phrases scoring function creation --- gensim/models/phrases.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 8506a256ff..9d42938528 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -253,21 +253,20 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): then you can debug the threshold with generated tsv """ - if scoring == 'default': - self.scoring_function = \ - partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) - elif scoring == 'npmi': - self.scoring_function = \ - partial(self.npmi_scorer, corpus_word_count = corpus_word_count) - # no else here to catch unknown scoring function, check is done in Phrases.__init__ - vocab = self.vocab threshold = self.threshold delimiter = self.delimiter # delimiter used for lookup min_count = self.min_count scoring = self.scoring corpus_word_count = self.corpus_word_count - scoring_function = self.scoring_function + + if self.scoring == 'default': + scoring_function = \ + partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) + elif self.scoring == 'npmi': + scoring_function = \ + partial(self.npmi_scorer, corpus_word_count = corpus_word_count) + # no else 
here to catch unknown scoring function, check is done in Phrases.__init__ for sentence in sentences: s = [utils.any2utf8(w) for w in sentence] From a6684de326ca34254469b7f2946c4ea5d93e33de Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 5 Jul 2017 13:56:20 -0400 Subject: [PATCH 07/11] test for npmi scoring --- gensim/test/test_phrases.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index bdf6997c3b..b3b5b5665f 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -154,7 +154,7 @@ def testMultipleBigramsSingleEntry(self): ]) def testScoringDefault(self): - """ a single entry should produce multiple bigrams. """ + """ test the default scoring, from the mikolov word2vec paper """ bigram = Phrases(sentences, min_count=1, threshold=1) seen_scores = set() @@ -169,22 +169,18 @@ def testScoringDefault(self): ]) def testScoringNpmi(self): - """ a single entry should produce multiple bigrams. 
""" - bigram = Phrases(sentences, min_count=1, threshold=1) + """ test normalized pointwise mutual information scoring """ + bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi') - seen_bigrams = set() seen_scores = set() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] - logging.debug(str(len(bigram.vocab))) for phrase, score in bigram.export_phrases(test_sentences): - logging.debug('test Scoring Phrase ' + phrase) - logging.debug('test Scoring score ' + str(score)) seen_scores.add(round(score,3)) assert seen_scores == set([ - 5.167, - 3.444 + .882, #score for graph minors + .714 # score for human interface ]) def testBadParameters(self): From b70648c7f9e91dea6fc96bacbc7943d4f1fa9747 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 5 Jul 2017 13:58:57 -0400 Subject: [PATCH 08/11] typo in phrases docstring --- gensim/models/phrases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9d42938528..1b28c8b922 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -147,7 +147,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, - ln(prop(worda followed by wordb) where prop(n) is the count of n / the count of everything in the entire corpus 'npmi' is more robust when dealing with common words that form part of common bigrams, and - ranges from 0 to 1, but is slower to calculate than the default + ranges from -1 to 1, but is slower to calculate than the default """ if min_count <= 0: From 99ec30117170bf3e619f834ff79258744275b39f Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 5 Jul 2017 14:44:40 -0400 Subject: [PATCH 09/11] copy scoring setting to Phraser --- gensim/models/phrases.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 1b28c8b922..869414787a 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ 
-385,8 +385,8 @@ class Phraser(interfaces.TransformationABC): After the one-time initialization, a Phraser will be much smaller and somewhat faster than using the full Phrases model. - Reflects the results of the source model's `min_count` and `threshold` - settings. (You can tamper with those & create a new Phraser to try + Reflects the results of the source model's `min_count`, `threshold`, and + `scoring` settings. (You can tamper with those & create a new Phraser to try other values.) """ @@ -394,6 +394,7 @@ def __init__(self, phrases_model): self.threshold = phrases_model.threshold self.min_count = phrases_model.min_count self.delimiter = phrases_model.delimiter + self.scoring = phrases_model.scoring self.phrasegrams = {} corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter) logger.info('source_vocab length %i', len(phrases_model.vocab)) From e408f9048b5ddd793c248208ced83d8e76cc1038 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Fri, 7 Jul 2017 16:16:08 -0400 Subject: [PATCH 10/11] fixing travis-ci errors --- gensim/models/phrases.py | 12 ++++++------ gensim/test/test_phrases.py | 12 ++++++------ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 869414787a..9aa6a2701b 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -109,7 +109,7 @@ class Phrases(interfaces.TransformationABC): """ def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=40000000, delimiter=b'_', progress_per=10000, - scoring = 'default'): + scoring='default'): """ Initialize the model from an iterable of `sentences`. Each sentence must be a list of words (unicode strings) that will be used for training. 
@@ -260,12 +260,12 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): scoring = self.scoring corpus_word_count = self.corpus_word_count - if self.scoring == 'default': + if scoring == 'default': scoring_function = \ partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) - elif self.scoring == 'npmi': + elif scoring == 'npmi': scoring_function = \ - partial(self.npmi_scorer, corpus_word_count = corpus_word_count) + partial(self.npmi_scorer, corpus_word_count=corpus_word_count) # no else here to catch unknown scoring function, check is done in Phrases.__init__ for sentence in sentences: @@ -356,12 +356,12 @@ def __getitem__(self, sentence): # calculation of score based on original mikolov word2vec paper # len_vocab and min_count set so functools.partial works @staticmethod - def original_scorer(worda_count, wordb_count, bigram_count, len_vocab = 0.0, min_count = 0.0): + def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0): return (bigram_count - min_count) / worda_count / wordb_count * len_vocab # normalized PMI, requires corpus size @staticmethod - def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count = 0.0): + def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0): pa = worda_count / corpus_word_count pb = wordb_count / corpus_word_count pab = bigram_count / corpus_word_count diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index b3b5b5665f..688f92dbd0 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -161,11 +161,11 @@ def testScoringDefault(self): test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] for phrase, score in bigram.export_phrases(test_sentences): - seen_scores.add(round(score,3)) + seen_scores.add(round(score, 3)) assert seen_scores == set([ - 5.167, # score for graph minors - 3.444 # score for human interface + 5.167, # score for graph minors + 
3.444 # score for human interface ]) def testScoringNpmi(self): @@ -176,11 +176,11 @@ def testScoringNpmi(self): test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] for phrase, score in bigram.export_phrases(test_sentences): - seen_scores.add(round(score,3)) + seen_scores.add(round(score, 3)) assert seen_scores == set([ - .882, #score for graph minors - .714 # score for human interface + .882, # score for graph minors + .714 # score for human interface ]) def testBadParameters(self): From 80b68c2ac455572dd03819ac0f3b0069e0499b73 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Fri, 7 Jul 2017 16:33:45 -0400 Subject: [PATCH 11/11] no need to specify long vs. int --- gensim/models/phrases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9aa6a2701b..33390fc08e 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -169,7 +169,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, self.delimiter = delimiter self.progress_per = progress_per self.scoring = scoring - self.corpus_word_count = 0L + self.corpus_word_count = 0 if sentences is not None: self.add_vocab(sentences)