From 21c4401c5108ff879532743347c56fd54aa36e75 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Tue, 5 Sep 2017 13:27:37 -0400 Subject: [PATCH 01/17] initial commit of fixes in comments of #1423 --- gensim/models/word2vec.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 255b9c553f..ddae9ff08f 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1563,7 +1563,7 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` should be a path to a directory (as a string) where all files can be opened by the LineSentence class. Each file will be read up to - `limit` lines (or no clipped if limit is None, the default). + `limit` lines (or not clipped if limit is None, the default). Example:: @@ -1577,23 +1577,23 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): self.limit = limit if os.path.isfile(self.source): - logging.warning('single file read, better to use models.word2vec.LineSentence') + logger.warning('single file read, better to use models.word2vec.LineSentence') self.input_files = [self.source] # force code compatibility with list of files elif os.path.isdir(self.source): self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logging.debug('reading directory ' + self.source) + logger.warning('reading directory %s', self.source) self.input_files = os.listdir(self.source) - self.input_files = [self.source + file for file in self.input_files] # make full paths + self.input_files = [self.source + filename for filename in self.input_files] # make full paths self.input_files.sort() # makes sure it happens in filename order else: # not a file or a directory, then we can't do anything with it raise ValueError('input is neither a file nor a path') - logging.info('files read into PathLineSentences:' + '\n'.join(self.input_files)) + logger.info('files read into PathLineSentences: %s', '\n'.join(self.input_files)) def __iter__(self): '''iterate through the files''' for file_name in self.input_files: - logging.info('reading file ' + file_name) + logger.info('reading file %s', file_name) with utils.smart_open(file_name) as fin: for line in itertools.islice(fin, self.limit): line = utils.to_unicode(line).split() From 0590c2f90df93f35deb3bbeb91f5309734e56d5e Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Tue, 5 Sep 2017 13:38:29 -0400 Subject: [PATCH 02/17] removed unnecessary space in logger --- gensim/models/word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index ddae9ff08f..96c7195dc9 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1588,7 +1588,7 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): else: # not a file or a directory, then we can't do anything with it raise ValueError('input is neither a file nor a path') - logger.info('files read into PathLineSentences: %s', '\n'.join(self.input_files)) + logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) def __iter__(self): '''iterate through the files''' From 34dc58f15945b8c104b6b8094fa66f219426c989 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Tue, 5 Sep 2017 16:21:46 -0400 Subject: [PATCH 03/17] added support for custom Phrases scorers --- gensim/models/phrases.py | 82 +++++++++++++++++++++++++------------ gensim/test/test_phrases.py | 17 +++++++- 2 files changed, 71 insertions(+), 28 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 1f0826258c..2da62b9672 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -64,8 +64,8 @@ import warnings from collections import defaultdict import itertools as it -from functools import partial from math import log +from inspect import getargspec from six import iteritems, string_types, next @@ -137,18 +137,31 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, should be a byte string (e.g. b'_'). `scoring` specifies how potential phrases are scored for comparison to the `threshold` - setting. two settings are available: - 'default': from "Efficient Estimaton of Word Representations in Vector Space" by - Mikolov, et. al.: - (count(worda followed by wordb) - min_count) * N / - (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size. - 'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual - Information in Colocation Extraction" by Gerlof Bouma: - ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / - - ln(prop(worda followed by wordb) - where prop(n) is the count of n / the count of everything in the entire corpus - 'npmi' is more robust when dealing with common words that form part of common bigrams, and + setting. `scoring` can be set with either a string that refers to a built-in scoring function, + or with a function with the expected parameter names. + Two built-in scoring functions are available by setting `scoring` to a string: + 'default': from "Efficient Estimaton of Word Representations in Vector Space" by + Mikolov, et. al.: + (count(worda followed by wordb) - min_count) * N / + (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size. + 'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual + Information in Colocation Extraction" by Gerlof Bouma: + ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / + - ln(prop(worda followed by wordb) + where prop(n) is the count of n / the count of everything in the entire corpus + 'npmi' is more robust when dealing with common words that form part of common bigrams, and ranges from -1 to 1, but is slower to calculate than the default + To use a custom scoring function, create a function with the following parameters and set the `scoring` + parameter to the custom function. You must use all the parameters in your function call, even if the + function does not require all the parameters. + worda_count: number of occurrances in `sentences` of the first token in the phrase being scored + wordb_count: number of occurrances in `sentences` of the second token in the phrase being scored + bigram_count: number of occurrances in `sentences` of the phrase being scored + len_vocab: the number of unique tokens in `sentences` + min_count: the `min_count` setting of the Phrases class + corpus_word_count: the total number of (non-unique) tokens in `sentences` + A scoring function without any of these parameters (even if the parameters are not used) will + raise a ValueError on initialization of the Phrases class """ if min_count <= 0: @@ -159,8 +172,23 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, if scoring == 'npmi' and (threshold < -1 or threshold > 1): raise ValueError("threshold should be between -1 and 1 for npmi scoring") - if not (scoring == 'default' or scoring == 'npmi'): - raise ValueError('unknown scoring function "' + scoring + '" specified') + # set scoring based on string + # intentially override the value of the scoring parameter rather than set self.scoring here, + # to still run the check of scoring function parameters in the next code block + if type(scoring) is str: + if scoring == 'default': + scoring = self.original_scorer + elif scoring == 'npmi': + scoring = self.npmi_scorer + else: + raise ValueError('unknown scoring method string %s specified' % (scoring)) + + scoring_parameters = ['worda_count','wordb_count','bigram_count','len_vocab','min_count','corpus_word_count'] + if callable(scoring): + if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters): + self.scoring = scoring + else: + raise ValueError('scoring function missing expected parameters') self.min_count = min_count self.threshold = threshold @@ -169,7 +197,6 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, self.min_reduce = 1 # ignore any tokens with count smaller than this self.delimiter = delimiter self.progress_per = progress_per - self.scoring = scoring self.corpus_word_count = 0 if sentences is not None: @@ -258,16 +285,14 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): threshold = self.threshold delimiter = self.delimiter # delimiter used for lookup min_count = self.min_count - scoring = self.scoring + scorer = self.scoring corpus_word_count = self.corpus_word_count + # made floats for scoring function + len_vocab = float(len(vocab)) + scorer_min_count = float(min_count) + corpus_word_count = float(corpus_word_count) + - if scoring == 'default': - scoring_function = \ - partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count)) - elif scoring == 'npmi': - scoring_function = \ - partial(self.npmi_scorer, corpus_word_count=corpus_word_count) - # no else here to catch unknown scoring function, check is done in Phrases.__init__ for sentence in sentences: s = [utils.any2utf8(w) for w in sentence] @@ -281,7 +306,8 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): count_a = float(vocab[word_a]) count_b = float(vocab[word_b]) count_ab = float(vocab[bigram_word]) - score = scoring_function(count_a, count_b, count_ab) + # scoring function should have all these parameters + score = scorer(worda_count=count_a, wordb_count=count_b, bigram_count=count_ab, len_vocab=len_vocab, min_count=scorer_min_count, corpus_word_count=corpus_word_count) # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) # added mincount check because if the scorer doesn't contain min_count @@ -354,15 +380,17 @@ def __getitem__(self, sentence): return [utils.to_unicode(w) for w in new_s] + # these two built-in scoring methods don't cast everything to float because the casting is done in the call + # to the scoring method in __getitem__ and export_phrases. + # calculation of score based on original mikolov word2vec paper - # len_vocab and min_count set so functools.partial works @staticmethod - def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0): + def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): return (bigram_count - min_count) / worda_count / wordb_count * len_vocab # normalized PMI, requires corpus size @staticmethod - def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0): + def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): pa = worda_count / corpus_word_count pb = wordb_count / corpus_word_count pab = bigram_count / corpus_word_count diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 5397d6e4c3..bde64e06dc 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -174,7 +174,6 @@ def testScoringNpmi(self): bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi') seen_scores = set() - test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] for phrase, score in bigram.export_phrases(test_sentences): seen_scores.add(round(score, 3)) @@ -184,6 +183,22 @@ def testScoringNpmi(self): .714 # score for human interface ]) + def testCustomScorer(self): + """ test using a custom scoring function """ + # all scores will be 1 + def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): + return 1 + + bigram = Phrases(sentences, min_count=1, threshold=.001, scoring=dumb_scorer) + + seen_scores = [] + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] + for phrase, score in bigram.export_phrases(test_sentences): + seen_scores.append(score) + + assert all(seen_scores) # all scores 1 + assert len(seen_scores) == 3 #'graph minors' and 'survey human' and 'interface system' + def testBadParameters(self): """Test the phrases module with bad parameters.""" # should fail with something less or equal than 0 From 32b66bd677d8e425ef2e48383262ff6baab37dc8 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 6 Sep 2017 12:36:45 -0400 Subject: [PATCH 04/17] fixed Phrases.__getitem__ to support pluggable scoring #1533 --- gensim/models/phrases.py | 48 +++++++++++++++++++++---------------- gensim/test/test_phrases.py | 13 ++++++++++ 2 files changed, 40 insertions(+), 21 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 2da62b9672..9b922eaff2 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -138,8 +138,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, `scoring` specifies how potential phrases are scored for comparison to the `threshold` setting. `scoring` can be set with either a string that refers to a built-in scoring function, - or with a function with the expected parameter names. - Two built-in scoring functions are available by setting `scoring` to a string: + or with a function with the expected parameter names. Two built-in scoring functions are available + by setting `scoring` to a string: 'default': from "Efficient Estimaton of Word Representations in Vector Space" by Mikolov, et. al.: (count(worda followed by wordb) - min_count) * N / @@ -249,8 +249,7 @@ def add_vocab(self, sentences): # directly, but gives the new sentences a fighting chance to collect # sufficient counts, before being pruned out by the (large) accummulated # counts collected in previous learn_vocab runs. - min_reduce, vocab, total_words = \ - self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) + min_reduce, vocab, total_words = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per) self.corpus_word_count += total_words if len(self.vocab) > 0: @@ -286,11 +285,10 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): delimiter = self.delimiter # delimiter used for lookup min_count = self.min_count scorer = self.scoring - corpus_word_count = self.corpus_word_count # made floats for scoring function len_vocab = float(len(vocab)) scorer_min_count = float(min_count) - corpus_word_count = float(corpus_word_count) + corpus_word_count = float(self.corpus_word_count) @@ -306,12 +304,10 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False): count_a = float(vocab[word_a]) count_b = float(vocab[word_b]) count_ab = float(vocab[bigram_word]) - # scoring function should have all these parameters + # scoring MUST have all these parameters, even if they are not used score = scorer(worda_count=count_a, wordb_count=count_b, bigram_count=count_ab, len_vocab=len_vocab, min_count=scorer_min_count, corpus_word_count=corpus_word_count) # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) - # added mincount check because if the scorer doesn't contain min_count - # it would not be enforced otherwise + # bigram_word, count_ab, scorer_min_count, count_a, count_ab, len_vocab, score) if score > threshold and count_ab >= min_count: if as_tuples: yield ((word_a, word_b), score) @@ -342,6 +338,16 @@ def __getitem__(self, sentence): """ warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class") + vocab = self.vocab + threshold = self.threshold + delimiter = self.delimiter # delimiter used for lookup + min_count = self.min_count + scorer = self.scoring + # made floats for scoring function + len_vocab = float(len(vocab)) + scorer_min_count = float(min_count) + corpus_word_count = float(self.corpus_word_count) + is_single, sentence = _is_single(sentence) if not is_single: # if the input is an entire corpus (rather than a single sentence), @@ -351,20 +357,20 @@ def __getitem__(self, sentence): s, new_s = [utils.any2utf8(w) for w in sentence], [] last_bigram = False vocab = self.vocab - threshold = self.threshold - delimiter = self.delimiter - min_count = self.min_count + for word_a, word_b in zip(s, s[1:]): - if word_a in vocab and word_b in vocab: + # last bigram check was moved here to save a few CPU cycles + if word_a in vocab and word_b in vocab and not last_bigram: bigram_word = delimiter.join((word_a, word_b)) - if bigram_word in vocab and not last_bigram: - pa = float(vocab[word_a]) - pb = float(vocab[word_b]) - pab = float(vocab[bigram_word]) - score = (pab - min_count) / pa / pb * len(vocab) + if bigram_word in vocab: + count_a = float(vocab[word_a]) + count_b = float(vocab[word_b]) + count_ab = float(vocab[bigram_word]) + # scoring MUST have all these parameters, even if they are not used + score = scorer(worda_count=count_a, wordb_count=count_b, bigram_count=count_ab, len_vocab=len_vocab, min_count=scorer_min_count, corpus_word_count=corpus_word_count) # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s", - # bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score) - if score > threshold: + # bigram_word, count_ab, scorer_min_count, count_a, count_ab, len_vocab, score) + if score > threshold and count_ab >= min_count: new_s.append(bigram_word) last_bigram = True continue diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index bde64e06dc..591883eec1 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -12,6 +12,7 @@ import unittest import os import sys +import pdb from gensim import utils from gensim.models.phrases import Phrases, Phraser @@ -169,6 +170,15 @@ def testScoringDefault(self): 3.444 # score for human interface ]) + def test__getitem__(self): + """ test Phrases[sentences] with a single sentence""" + bigram = Phrases(sentences, min_count=1, threshold=1) + # pdb.set_trace() + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] + phrased_sentences = bigram[test_sentences].__iter__().next() + + assert phrased_sentences == ['graph_minors', 'survey', 'human_interface'] + def testScoringNpmi(self): """ test normalized pointwise mutual information scoring """ bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi') @@ -183,6 +193,9 @@ def testScoringNpmi(self): .714 # score for human interface ]) + phrased_sentences = bigram[test_sentences].__iter__().next() + assert phrased_sentences == ['graph_minors', 'survey', 'human_interface'] + def testCustomScorer(self): """ test using a custom scoring function """ # all scores will be 1 From 9b3f801a86c11e3f0248df84e7aab69d723163fa Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Wed, 6 Sep 2017 14:14:25 -0400 Subject: [PATCH 05/17] travisCI style fixes --- gensim/models/phrases.py | 2 +- gensim/test/test_phrases.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9b922eaff2..b3adc814ce 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -183,7 +183,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, else: raise ValueError('unknown scoring method string %s specified' % (scoring)) - scoring_parameters = ['worda_count','wordb_count','bigram_count','len_vocab','min_count','corpus_word_count'] + scoring_parameters = ['worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count'] if callable(scoring): if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters): self.scoring = scoring diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 591883eec1..71a9e237f4 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -12,7 +12,6 @@ import unittest import os import sys -import pdb from gensim import utils from gensim.models.phrases import Phrases, Phraser @@ -209,8 +208,8 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co for phrase, score in bigram.export_phrases(test_sentences): seen_scores.append(score) - assert all(seen_scores) # all scores 1 - assert len(seen_scores) == 3 #'graph minors' and 'survey human' and 'interface system' + assert all(seen_scores) # all scores 1 + assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system' def testBadParameters(self): """Test the phrases module with bad parameters.""" From 2698aa7ed545616871755a24105c6d077f8ac063 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 08:28:27 -0400 Subject: [PATCH 06/17] fixed __next__() to next() for python 3 compatibilyt --- gensim/test/test_phrases.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 71a9e237f4..083d07b537 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -174,9 +174,9 @@ def test__getitem__(self): bigram = Phrases(sentences, min_count=1, threshold=1) # pdb.set_trace() test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']] - phrased_sentences = bigram[test_sentences].__iter__().next() + phrased_sentence = next(bigram[test_sentences].__iter__()) - assert phrased_sentences == ['graph_minors', 'survey', 'human_interface'] + assert phrased_sentence == ['graph_minors', 'survey', 'human_interface'] def testScoringNpmi(self): """ test normalized pointwise mutual information scoring """ @@ -192,9 +192,6 @@ def testScoringNpmi(self): .714 # score for human interface ]) - phrased_sentences = bigram[test_sentences].__iter__().next() - assert phrased_sentences == ['graph_minors', 'survey', 'human_interface'] - def testCustomScorer(self): """ test using a custom scoring function """ # all scores will be 1 @@ -209,7 +206,7 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co seen_scores.append(score) assert all(seen_scores) # all scores 1 - assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system' + assert len(seen_scores) == 3 #'graph minors' and 'survey human' and 'interface system' def testBadParameters(self): """Test the phrases module with bad parameters.""" From accea8c4b9b6f2a931f28603e680f6ce1f38e8b3 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 10:53:49 -0400 Subject: [PATCH 07/17] misc fixes --- gensim/models/phrases.py | 10 ++++------ gensim/test/test_phrases.py | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index b3adc814ce..3da1f6dfbf 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -177,9 +177,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, # to still run the check of scoring function parameters in the next code block if type(scoring) is str: if scoring == 'default': - scoring = self.original_scorer + scoring = original_scorer elif scoring == 'npmi': - scoring = self.npmi_scorer + scoring = npmi_scorer else: raise ValueError('unknown scoring method string %s specified' % (scoring)) @@ -390,13 +390,11 @@ def __getitem__(self, sentence): # to the scoring method in __getitem__ and export_phrases. # calculation of score based on original mikolov word2vec paper - @staticmethod - def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): +def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): return (bigram_count - min_count) / worda_count / wordb_count * len_vocab # normalized PMI, requires corpus size - @staticmethod - def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): +def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): pa = worda_count / corpus_word_count pb = wordb_count / corpus_word_count pab = bigram_count / corpus_word_count diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 083d07b537..4e75dfb701 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -206,7 +206,7 @@ def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, co seen_scores.append(score) assert all(seen_scores) # all scores 1 - assert len(seen_scores) == 3 #'graph minors' and 'survey human' and 'interface system' + assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system' def testBadParameters(self): """Test the phrases module with bad parameters.""" From 8854097d5e25be1fe5c350a0acebbab4d2664d2a Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 11:03:31 -0400 Subject: [PATCH 08/17] spacing fixes for style --- gensim/models/phrases.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 3da1f6dfbf..45b25fc96e 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -389,11 +389,13 @@ def __getitem__(self, sentence): # these two built-in scoring methods don't cast everything to float because the casting is done in the call # to the scoring method in __getitem__ and export_phrases. + # calculation of score based on original mikolov word2vec paper def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): return (bigram_count - min_count) / worda_count / wordb_count * len_vocab - # normalized PMI, requires corpus size + +# normalized PMI, requires corpus size def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): pa = worda_count / corpus_word_count pb = wordb_count / corpus_word_count From bbaf3f727da06f2db076ec4e036a1fb5a04f4404 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 11:21:53 -0400 Subject: [PATCH 09/17] custom scorer support in sklearn api --- gensim/sklearn_api/phrases.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 8a944f0235..2eab84b95e 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -22,7 +22,7 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, - delimiter=b'_', progress_per=10000): + delimiter=b'_', progress_per=10000, scoring='default'): """ Sklearn wrapper for Phrases model. """ @@ -32,13 +32,14 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, self.max_vocab_size = max_vocab_size self.delimiter = delimiter self.progress_per = progress_per + self.scoring = scoring def fit(self, X, y=None): """ Fit the model according to the given training data. """ self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, - max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per) + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per, scoring=self.scoring) return self def transform(self, docs): @@ -62,7 +63,7 @@ def transform(self, docs): def partial_fit(self, X): if self.gensim_model is None: self.gensim_model = models.Phrases(sentences=X, min_count=self.min_count, threshold=self.threshold, - max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per) + max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per, scoring=self.scoring) self.gensim_model.add_vocab(X) return self From 4e555c4c3348697fd7d49f5bfebd3bd45c4f04c2 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 11:37:30 -0400 Subject: [PATCH 10/17] Phrases scikit interface tests for pluggable scoring --- gensim/test/test_sklearn_api.py | 59 +++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 0e17905c2b..46e407646b 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -957,6 +957,65 @@ def testModelNotFitted(self): phrases_transformer = PhrasesTransformer() self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) +# specifically test pluggable scoring in Phrases, because possible pickling issues with function parameter + +# all scores will be 1 +# this is intentionally in main rather than a class method to support pickling +def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): + return 1 + +class TestPhrasesTransformerCustomScorer(unittest.TestCase): + + def setUp(self): + numpy.random.seed(0) + + self.model = PhrasesTransformer(min_count=1, threshold=.9, scoring=dumb_scorer) + self.model.fit(phrases_sentences) + + def testTransform(self): + # tranform one document + doc = phrases_sentences[-1] + phrase_tokens = self.model.transform(doc)[0] + expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface'] + self.assertEqual(phrase_tokens, expected_phrase_tokens) + + def testPartialFit(self): + new_sentences = [ + ['world', 'peace', 'humans', 'world', 'peace', 'world', 'peace', 'people'], + ['world', 'peace', 'people'], + ['world', 'peace', 'humans'] + ] + self.model.partial_fit(X=new_sentences) # train model with new sentences + + doc = ['graph', 'minors', 'survey', 'human', 'interface', 'world', 'peace'] + phrase_tokens = self.model.transform(doc)[0] + expected_phrase_tokens = [u'graph_minors', u'survey_human', u'interface', u'world_peace'] + self.assertEqual(phrase_tokens, expected_phrase_tokens) + + def testSetGetParams(self): + # updating only one param + self.model.set_params(progress_per=5000) + model_params = self.model.get_params() + self.assertEqual(model_params["progress_per"], 5000) + + # verify that the attributes values are also changed for `gensim_model` after fitting + self.model.fit(phrases_sentences) + self.assertEqual(getattr(self.model.gensim_model, 'progress_per'), 5000) + + def testPersistence(self): + model_dump = pickle.dumps(self.model) + model_load = pickle.loads(model_dump) + + doc = phrases_sentences[-1] + loaded_phrase_tokens = model_load.transform(doc) + + # comparing the original and loaded models + original_phrase_tokens = self.model.transform(doc) + self.assertEqual(original_phrase_tokens, loaded_phrase_tokens) + + def testModelNotFitted(self): + phrases_transformer = PhrasesTransformer() + self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) if __name__ == '__main__': unittest.main() From b16554f943d2e1dbd14311dc79ef52a006ad11f6 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 14:23:45 -0400 Subject: [PATCH 11/17] missing line breaks --- gensim/test/test_sklearn_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 46e407646b..8a5b3b1e5e 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -957,13 +957,15 @@ def testModelNotFitted(self): phrases_transformer = PhrasesTransformer() self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) + # specifically test pluggable scoring in Phrases, because possible pickling issues with function parameter -# all scores will be 1 # this is intentionally in main rather than a class method to support pickling +# all scores will be 1 def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): return 1 + class TestPhrasesTransformerCustomScorer(unittest.TestCase): def setUp(self): From a94a3fd9139e620a3352704c7ad00801b1735dd7 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 14:51:38 -0400 Subject: [PATCH 12/17] style, clarity, and robustness fixes requested by @piskvorky --- gensim/models/phrases.py | 12 ++++++------ gensim/models/word2vec.py | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 45b25fc96e..9962144cd5 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -175,7 +175,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, # set scoring based on string # intentially override the value of the scoring parameter rather than set self.scoring here, # to still run the check of scoring function parameters in the next code block - if type(scoring) is str: + if isinstance(scoring, basestring): if scoring == 'default': scoring = original_scorer elif scoring == 'npmi': @@ -392,15 +392,15 @@ def __getitem__(self, sentence): # calculation of score based on original mikolov word2vec paper def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): - return (bigram_count - min_count) / worda_count / wordb_count * len_vocab + return (bigram_count - min_count) / worda_count / wordb_count * len_vocab # normalized PMI, requires corpus size def npmi_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): - pa = worda_count / corpus_word_count - pb = wordb_count / corpus_word_count - pab = bigram_count / corpus_word_count - return log(pab / (pa * pb)) / -log(pab) + pa = worda_count / corpus_word_count + pb = wordb_count / corpus_word_count + pab = bigram_count / corpus_word_count + return log(pab / (pa * pb)) / -log(pab) def pseudocorpus(source_vocab, sep): diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 96c7195dc9..0e612ff82f 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1555,15 +1555,20 @@ def __iter__(self): class PathLineSentences(object): """ - Simple format: one sentence = one line; words already preprocessed and separated by whitespace. - Like LineSentence, but will process all files in a directory in alphabetical order by filename + + Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename. + The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. Any file not ending + with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories. + + The format of files (either text, or compressed text files) in the path is one sentence = one line, with words already + preprocessed and separated by whitespace. + """ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): """ `source` should be a path to a directory (as a string) where all files can be opened by the - LineSentence class. Each file will be read up to - `limit` lines (or not clipped if limit is None, the default). + LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default). Example:: @@ -1577,11 +1582,12 @@ def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): self.limit = limit if os.path.isfile(self.source): - logger.warning('single file read, better to use models.word2vec.LineSentence') + logger.debug('single file given as source, rather than a directory of files') + logger.debug('consider using models.word2vec.LineSentence for a single file') self.input_files = [self.source] # force code compatibility with list of files elif os.path.isdir(self.source): self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logger.warning('reading directory %s', self.source) + logger.info('reading directory %s', self.source) self.input_files = os.listdir(self.source) self.input_files = [self.source + filename for filename in self.input_files] # make full paths self.input_files.sort() # makes sure it happens in filename order From f9cc04f1b8b6adaaf6f1f48684439c97af4e40a0 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 15:19:38 -0400 Subject: [PATCH 13/17] check in Phrases init to make sure scorer is pickleable --- gensim/models/phrases.py | 9 +++++++++ gensim/test/test_phrases.py | 11 ++++++++--- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 9962144cd5..31f8d205a7 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -66,6 +66,7 @@ import itertools as it from math import log from inspect import getargspec +import pickle from six import iteritems, string_types, next @@ -162,6 +163,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, corpus_word_count: the total number of (non-unique) tokens in `sentences` A scoring function without any of these parameters (even if the parameters are not used) will raise a ValueError on initialization of the Phrases class + The scoring function must be picklable """ if min_count <= 0: @@ -199,6 +201,13 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, self.progress_per = progress_per self.corpus_word_count = 0 + # ensure picklability of custom scorer + try: + test_pickle = pickle.dumps(self.scoring) + load_pickle = pickle.loads(test_pickle) + except pickle.PickleError: + raise pickle.PickleError('unable to pickle custom Phrases scoring function') + if sentences is not None: self.add_vocab(sentences) diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 4e75dfb701..5c07016d96 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -123,6 +123,14 @@ def testEncoding(self): self.assertTrue(isinstance(transformed, unicode)) +# scorer for testCustomScorer +# function is outside of the scope of the test because for picklability of custom scorer +# Phrases tests for picklability +# all scores will be 1 +def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): + return 1 + + class TestPhrasesModel(unittest.TestCase): def testExportPhrases(self): """Test Phrases bigram export_phrases functionality.""" @@ -194,9 +202,6 @@ def testScoringNpmi(self): def testCustomScorer(self): """ test using a custom scoring function """ - # all scores will be 1 - def dumb_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): - return 1 bigram = Phrases(sentences, min_count=1, threshold=.001, scoring=dumb_scorer) From 5bbe144ea899a7dc9e46e87d1f6999e5e7f59d1b Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 16:55:33 -0400 Subject: [PATCH 14/17] backwards scoring compatibility when loading a Phrases class --- gensim/models/phrases.py | 51 +++++++++++++++++++--- gensim/test/test_phrases.py | 86 +++++++++++++++++++++++++++++++++++++ 2 files changed, 132 insertions(+), 5 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 31f8d205a7..85309517a2 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -74,7 +74,6 @@ logger = logging.getLogger(__name__) - def _is_single(obj): """ Check whether `obj` is a single document or an entire corpus. @@ -177,6 +176,13 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, # set scoring based on string # intentially override the value of the scoring parameter rather than set self.scoring here, # to still run the check of scoring function parameters in the next code block + + # for python 2 and 3 compatibility. basestring is used to check if scoring is a string + try: + basestring + except NameError: + basestring = str + if isinstance(scoring, basestring): if scoring == 'default': scoring = original_scorer @@ -395,11 +401,46 @@ def __getitem__(self, sentence): return [utils.to_unicode(w) for w in new_s] - # these two built-in scoring methods don't cast everything to float because the casting is done in the call - # to the scoring method in __getitem__ and export_phrases. - + @classmethod + def load(cls, *args, **kwargs): + """ + Load a previously saved Phrases class. Handles backwards compatibility from older Phrases versions which did not support + pluggable scoring functions. Otherwise, relies on utils.load + """ - # calculation of score based on original mikolov word2vec paper + # for python 2 and 3 compatibility. basestring is used to check if model.scoring is a string + try: + basestring + except NameError: + basestring = str + + model = super(Phrases, cls).load(*args, **kwargs) + # update older models + # if no scoring parameter, use default scoring + if not hasattr(model, 'scoring'): + logger.info('older version of Phrases loaded without scoring function') + logger.info('setting pluggable scoring method to original_scorer for compatibility') + model.scoring = original_scorer + # if there is a scoring parameter, and it's a text value, load the proper scoring function + if hasattr(model, 'scoring'): + if isinstance(model.scoring, basestring): + if model.scoring == 'default': + logger.info('older version of Phrases loaded with "default" scoring parameter') + logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility') + model.scoring = original_scorer + elif model.scoring == 'npmi': + logger.info('older version of Phrases loaded with "npmi" scoring parameter') + logger.info('setting scoring method to npmi_scorer pluggable scoring method for compatibility') + model.scoring = npmi_scorer + else: + raise ValueError('failed to load Phrases model with unknown scoring setting %s' % (model.scoring)) + return model + + +# these two built-in scoring methods don't cast everything to float because the casting is done in the call +# to the scoring method in __getitem__ and export_phrases. + +# calculation of score based on original mikolov word2vec paper def original_scorer(worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count): return (bigram_count - min_count) / worda_count / wordb_count * len_vocab diff --git a/gensim/test/test_phrases.py b/gensim/test/test_phrases.py index 5c07016d96..ad76d43956 100644 --- a/gensim/test/test_phrases.py +++ b/gensim/test/test_phrases.py @@ -225,6 +225,92 @@ def testPruning(self): """Test that max_vocab_size parameter is respected.""" bigram = Phrases(sentences, max_vocab_size=5) self.assertTrue(len(bigram.vocab) <= 5) + + def testSaveLoadCustomScorer(self): + """ saving and loading a Phrases object with a custom scorer """ + + try: + bigram = Phrases(sentences, min_count=1, threshold=.001, scoring=dumb_scorer) + bigram.save("test_phrases_testSaveLoadCustomScorer_temp_save.pkl") + bigram_loaded = Phrases.load("test_phrases_testSaveLoadCustomScorer_temp_save.pkl") + seen_scores = [] + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] + for phrase, score in bigram_loaded.export_phrases(test_sentences): + seen_scores.append(score) + + assert all(seen_scores) # all scores 1 + assert len(seen_scores) == 3 # 'graph minors' and 'survey human' and 'interface system' + + finally: + if os.path.exists("test_phrases_testSaveLoadCustomScorer_temp_save.pkl"): + os.remove("test_phrases_testSaveLoadCustomScorer_temp_save.pkl") + + def testSaveLoad(self): + """ Saving and loading a Phrases object.""" + + try: + bigram = Phrases(sentences, min_count=1, threshold=1) + bigram.save("test_phrases_testSaveLoad_temp_save.pkl") + bigram_loaded = Phrases.load("test_phrases_testSaveLoad_temp_save.pkl") + seen_scores = set() + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] + for phrase, score in bigram_loaded.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) + + assert seen_scores == set([ + 5.167, # score for graph minors + 3.444 # score for human interface + ]) + + finally: + if os.path.exists("test_phrases_testSaveLoad_temp_save.pkl"): + os.remove("test_phrases_testSaveLoad_temp_save.pkl") + + def testSaveLoadStringScoring(self): + """ Saving and loading a Phrases object with a string scoring parameter. + This should ensure backwards compatibility with the previous version of Phrases""" + + try: + bigram = Phrases(sentences, min_count=1, threshold=1) + bigram.scoring = "default" + bigram.save("test_phrases_testSaveLoadStringScoring_temp_save.pkl") + bigram_loaded = Phrases.load("test_phrases_testSaveLoadStringScoring_temp_save.pkl") + seen_scores = set() + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] + for phrase, score in bigram_loaded.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) + + assert seen_scores == set([ + 5.167, # score for graph minors + 3.444 # score for human interface + ]) + + finally: + if os.path.exists("test_phrases_testSaveLoadStringScoring_temp_save.pkl"): + os.remove("test_phrases_testSaveLoadStringScoring_temp_save.pkl") + + def testSaveLoadNoScoring(self): + """ Saving and loading a Phrases object with no scoring parameter. + This should ensure backwards compatibility with old versions of Phrases""" + + try: + bigram = Phrases(sentences, min_count=1, threshold=1) + del(bigram.scoring) + bigram.save("test_phrases_testSaveLoadNoScoring_temp_save.pkl") + bigram_loaded = Phrases.load("test_phrases_testSaveLoadNoScoring_temp_save.pkl") + seen_scores = set() + test_sentences = [['graph', 'minors', 'survey', 'human', 'interface', 'system']] + for phrase, score in bigram_loaded.export_phrases(test_sentences): + seen_scores.add(round(score, 3)) + + assert seen_scores == set([ + 5.167, # score for graph minors + 3.444 # score for human interface + ]) + + finally: + if os.path.exists("test_phrases_testSaveLoadNoScoring_temp_save.pkl"): + os.remove("test_phrases_testSaveLoadNoScoring_temp_save.pkl") # endclass TestPhrasesModel From 1481342bc8a2d2ab19b759d5b141e701c2a2b5a5 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Thu, 7 Sep 2017 17:00:08 -0400 Subject: [PATCH 15/17] removal of pickle testing objects in Phrases init --- gensim/models/phrases.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index 85309517a2..a97f0f479d 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -213,6 +213,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, load_pickle = pickle.loads(test_pickle) except pickle.PickleError: raise pickle.PickleError('unable to pickle custom Phrases scoring function') + finally: + del(test_pickle) + del(load_pickle) if sentences is not None: self.add_vocab(sentences) From fb7fbb153d063fd3ff396ee77bd21f4bcd3ed624 Mon Sep 17 00:00:00 2001 From: Michael Sherman Date: Mon, 11 Sep 2017 11:32:49 -0400 Subject: [PATCH 16/17] switched to six for python 2/3 compatibility --- gensim/models/phrases.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index a97f0f479d..faa57d3a56 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -67,6 +67,7 @@ from math import log from inspect import getargspec import pickle +import six from six import iteritems, string_types, next @@ -177,13 +178,7 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, # intentially override the value of the scoring parameter rather than set self.scoring here, # to still run the check of scoring function parameters in the next code block - # for python 2 and 3 compatibility. basestring is used to check if scoring is a string - try: - basestring - except NameError: - basestring = str - - if isinstance(scoring, basestring): + if isinstance(scoring, six.string_types): if scoring == 'default': scoring = original_scorer elif scoring == 'npmi': From e866d3f0646ac354636ec4b6a947c0b4a34d02a3 Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 24 Oct 2017 15:56:43 +0500 Subject: [PATCH 17/17] fix docstring --- gensim/models/phrases.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py index ec5addef0a..2ec8592bcd 100644 --- a/gensim/models/phrases.py +++ b/gensim/models/phrases.py @@ -140,30 +140,33 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0, max_vocab_size=4 setting. `scoring` can be set with either a string that refers to a built-in scoring function, or with a function with the expected parameter names. Two built-in scoring functions are available by setting `scoring` to a string: - 'default': from "Efficient Estimaton of Word Representations in Vector Space" by - Mikolov, et. al.: - (count(worda followed by wordb) - min_count) * N / - (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size. - 'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual + + 'default': from "Efficient Estimaton of Word Representations in Vector Space" by + Mikolov, et. al.: + (count(worda followed by wordb) - min_count) * N / + (count(worda) * count(wordb)) > threshold`, where `N` is the total vocabulary size. + 'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual Information in Colocation Extraction" by Gerlof Bouma: ln(prop(worda followed by wordb) / (prop(worda)*prop(wordb))) / - ln(prop(worda followed by wordb) where prop(n) is the count of n / the count of everything in the entire corpus - 'npmi' is more robust when dealing with common words that form part of common bigrams, and - ranges from -1 to 1, but is slower to calculate than the default + 'npmi' is more robust when dealing with common words that form part of common bigrams, and + ranges from -1 to 1, but is slower to calculate than the default + To use a custom scoring function, create a function with the following parameters and set the `scoring` parameter to the custom function. You must use all the parameters in your function call, even if the function does not require all the parameters. + worda_count: number of occurrances in `sentences` of the first token in the phrase being scored wordb_count: number of occurrances in `sentences` of the second token in the phrase being scored bigram_count: number of occurrances in `sentences` of the phrase being scored len_vocab: the number of unique tokens in `sentences` min_count: the `min_count` setting of the Phrases class corpus_word_count: the total number of (non-unique) tokens in `sentences` + A scoring function without any of these parameters (even if the parameters are not used) will - raise a ValueError on initialization of the Phrases class - The scoring function must be picklable + raise a ValueError on initialization of the Phrases class. The scoring function must be picklable. """ if min_count <= 0: