models.Phrases multiple scoring methods (#1363) #1464

Merged
103 changes: 80 additions & 23 deletions gensim/models/phrases.py
@@ -64,6 +64,8 @@
import warnings
from collections import defaultdict
import itertools as it
from functools import partial
from math import log

from six import iteritems, string_types, next

@@ -106,7 +108,8 @@ class Phrases(interfaces.TransformationABC):

"""
     def __init__(self, sentences=None, min_count=5, threshold=10.0,
-                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000):
+                 max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
+                 scoring='default'):
"""
Initialize the model from an iterable of `sentences`. Each sentence must be
a list of words (unicode strings) that will be used for training.
@@ -120,10 +123,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
`min_count` ignore all words and bigrams with total collected count lower
than this.

-        `threshold` represents a threshold for forming the phrases (higher means
-        fewer phrases). A phrase of words `a` and `b` is accepted if
-        `(cnt(a, b) - min_count) * N / (cnt(a) * cnt(b)) > threshold`, where `N` is the
-        total vocabulary size.
+        `threshold` represents a score threshold for forming the phrases (higher means
+        fewer phrases). A phrase of words `a` followed by `b` is accepted if the score of the
+        phrase is greater than `threshold`. See the `scoring` setting.
Review comment (Owner): Capitalize first word in sentence, end in full stop.


`max_vocab_size` is the maximum size of the vocabulary. Used to control
pruning of less common words, to keep memory under control. The default
@@ -133,12 +135,31 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
`delimiter` is the glue character used to join collocation tokens, and
should be a byte string (e.g. b'_').

`scoring` specifies how potential phrases are scored for comparison to the `threshold`
setting. Two settings are available:
'default': from "Efficient Estimation of Word Representations in Vector Space" by
Mikolov et al.:
(count(worda followed by wordb) - min_count) * N /
(count(worda) * count(wordb)) > threshold, where N is the total vocabulary size.
'npmi': normalized pointwise mutual information, from "Normalized (Pointwise) Mutual
Information in Collocation Extraction" by Gerlof Bouma:
ln(prop(worda followed by wordb) / (prop(worda) * prop(wordb))) /
-ln(prop(worda followed by wordb)),
where prop(n) is the count of n divided by the count of everything in the entire corpus.
'npmi' is more robust when dealing with common words that form part of common bigrams, and
ranges from -1 to 1, but is slower to calculate than the default.

"""
if min_count <= 0:
raise ValueError("min_count should be at least 1")

-        if threshold <= 0:
-            raise ValueError("threshold should be positive")
+        if threshold <= 0 and scoring == 'default':
+            raise ValueError("threshold should be positive for default scoring")
+        if scoring == 'npmi' and (threshold < -1 or threshold > 1):
+            raise ValueError("threshold should be between -1 and 1 for npmi scoring")
+
+        if not (scoring == 'default' or scoring == 'npmi'):
+            raise ValueError('unknown scoring function "' + scoring + '" specified')

self.min_count = min_count
self.threshold = threshold
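For concreteness, a minimal standalone sketch of the two scoring formulas described in the docstring above, with made-up counts (every number here is hypothetical, not taken from the diff or the gensim test corpus):

from math import log

len_vocab = 1000.0           # total vocabulary size N
corpus_word_count = 10000.0  # total tokens in the corpus
min_count = 5.0
count_a, count_b, count_ab = 50.0, 40.0, 30.0

# 'default' (Mikolov et al.): (count(ab) - min_count) * N / (count(a) * count(b))
default_score = (count_ab - min_count) / count_a / count_b * len_vocab  # 12.5

# 'npmi' (Bouma): ln(p(ab) / (p(a) * p(b))) / -ln(p(ab))
pa, pb, pab = (c / corpus_word_count for c in (count_a, count_b, count_ab))
npmi_score = log(pab / (pa * pb)) / -log(pab)  # ~0.86, on the [-1, 1] scale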
@@ -147,6 +168,8 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
self.min_reduce = 1 # ignore any tokens with count smaller than this
self.delimiter = delimiter
self.progress_per = progress_per
self.scoring = scoring
self.corpus_word_count = 0

if sentences is not None:
self.add_vocab(sentences)
@@ -178,14 +201,15 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000):
if sentence: # add last word skipped by previous loop
word = sentence[-1]
vocab[word] += 1
total_words += 1

if len(vocab) > max_vocab_size:
utils.prune_vocab(vocab, min_reduce)
min_reduce += 1

logger.info("collected %i word types from a corpus of %i words (unigram + bigrams) and %i sentences" %
(len(vocab), total_words, sentence_no + 1))
-        return min_reduce, vocab
+        return min_reduce, vocab, total_words

def add_vocab(self, sentences):
"""
@@ -197,8 +221,10 @@ def add_vocab(self, sentences):
# directly, but gives the new sentences a fighting chance to collect
sufficient counts, before being pruned out by the (large) accumulated
# counts collected in previous learn_vocab runs.
-        min_reduce, vocab = self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
+        min_reduce, vocab, total_words = \
+            self.learn_vocab(sentences, self.max_vocab_size, self.delimiter, self.progress_per)
Review comment (Owner): Code style: bad indentation (unneeded line break).

Reply (Contributor Author): What's the number of columns we cap at? I thought it was 100, which I believe this exceeded.

Reply (Owner, @piskvorky, Jul 25, 2017): There's no hard limit; if the line becomes hard to read, we break it. If the break would be even harder to read than the original (for semantic/visual/clarity reasons), we don't break it. Line continuations are indented at one extra level (4 spaces to the right).
self.corpus_word_count += total_words
if len(self.vocab) > 0:
logger.info("merging %i counts into %s", len(vocab), self)
self.min_reduce = max(self.min_reduce, min_reduce)
@@ -226,31 +252,47 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):

then you can debug the threshold with generated tsv
"""

vocab = self.vocab
threshold = self.threshold
delimiter = self.delimiter # delimiter used for lookup
min_count = self.min_count
scoring = self.scoring
corpus_word_count = self.corpus_word_count

if scoring == 'default':
scoring_function = \
partial(self.original_scorer, len_vocab=float(len(vocab)), min_count=float(min_count))
Review comment (Owner): Indentation (unneeded line break).
elif scoring == 'npmi':
scoring_function = \
partial(self.npmi_scorer, corpus_word_count=corpus_word_count)
Review comment (Owner): Indentation (unneeded line break).
# no else here to catch unknown scoring function, check is done in Phrases.__init__
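The dispatch above uses functools.partial to pre-bind the corpus-level constants, so the hot loop below can call either scorer with the same three positional arguments. A minimal illustration of the pattern, independent of this diff (numbers hypothetical):

from functools import partial

def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0):
    return (bigram_count - min_count) / worda_count / wordb_count * len_vocab

scorer = partial(original_scorer, len_vocab=1000.0, min_count=5.0)
print(scorer(50.0, 40.0, 30.0))  # 12.5; len_vocab and min_count are already bound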

for sentence in sentences:
s = [utils.any2utf8(w) for w in sentence]
last_bigram = False
-            vocab = self.vocab
-            threshold = self.threshold
-            delimiter = self.delimiter  # delimiter used for lookup
-            min_count = self.min_count

             for word_a, word_b in zip(s, s[1:]):
-                if word_a in vocab and word_b in vocab:
+                # last bigram check was moved here to save a few CPU cycles
+                if word_a in vocab and word_b in vocab and not last_bigram:
                     bigram_word = delimiter.join((word_a, word_b))
-                    if bigram_word in vocab and not last_bigram:
-                        pa = float(vocab[word_a])
-                        pb = float(vocab[word_b])
-                        pab = float(vocab[bigram_word])
-                        score = (pab - min_count) / pa / pb * len(vocab)
+                    if bigram_word in vocab:
+                        count_a = float(vocab[word_a])
+                        count_b = float(vocab[word_b])
+                        count_ab = float(vocab[bigram_word])
+                        score = scoring_function(count_a, count_b, count_ab)
Review comment (Contributor Author): A pluggable scoring function would have to be called with all corpus constants and Phrases settings used in any scoring function. Right now that would look like: score = scoring_function(count_a, count_b, count_ab, min_count, len_vocab, corpus_word_count). And the call would grow as the universe of variables considered by all scoring functions grows.

Reply (Owner, @piskvorky, Jul 21, 2017): I think that's still preferable. This string-passing seems inflexible. We could support some common use-cases by passing a string, but the code underneath should simply translate that string into a scoring_function and work with that underneath. Custom scoring_functions should be supported IMO. In other words, we could support both string and callable as param. If string, gensim converts that to a known callable (for easy-to-use common cases).

Reply (Contributor Author): I will make this change, hopefully before the end of the week, and make it part of a PR.
-                        # logger.debug("score for %s: (pab=%s - min_count=%s) / pa=%s / pb=%s * vocab_size=%s = %s",
-                        #     bigram_word, pab, self.min_count, pa, pb, len(self.vocab), score)
-                        if score > threshold:
+                        # added mincount check because if the scorer doesn't contain min_count
+                        # it would not be enforced otherwise
+                        if score > threshold and count_ab >= min_count:
                             if as_tuples:
                                 yield ((word_a, word_b), score)
                             else:
                                 yield (out_delimiter.join((word_a, word_b)), score)
                             last_bigram = True
                             continue
-                last_bigram = False
+            last_bigram = False
Review comment (Owner): Is this on purpose? What is this change about?

Reply (Contributor Author): Yes, this is on purpose. Matches up to line 277. If that test fails we have to set last_bigram to false. This positioning sets it to false always--the only time it gets set to true is in line 293, when a passing bigram is found.

Reply (Owner): Aha, so this is a bug fix at the same time. Thanks! CC @menshikh-iv

def __getitem__(self, sentence):
"""
@@ -311,6 +353,20 @@ def __getitem__(self, sentence):

return [utils.to_unicode(w) for w in new_s]

# calculation of score based on original Mikolov word2vec paper
# len_vocab and min_count set so functools.partial works
@staticmethod
def original_scorer(worda_count, wordb_count, bigram_count, len_vocab=0.0, min_count=0.0):
return (bigram_count - min_count) / worda_count / wordb_count * len_vocab
Review comment (Owner): Beware of integer divisions - this code is brittle.

Reply (Contributor Author): I didn't fix this in PR #1573. Rather, I just cast everything before calling the scoring method in Phrases and Phraser. I think that's the better place to do the casting, since then it fixes the problem for all custom scorers as well. Of course, I can do the casting in the scoring methods as well. Let me know if you still think I need it here and in npmi_scorer and I'll update PR #1573. It's extra steps, but I'd assume the performance hit is infinitesimal.
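A quick illustration of the brittleness the owner points at: under Python 2 semantics, `/` on two ints floors, so a scorer fed raw integer counts can silently return 0 (hypothetical counts):

# Python 2, without `from __future__ import division`:
#   (30 - 5) / 50 / 40 * 1000  ->  0    (25 / 50 floors to 0)
# The same expression with float operands:
#   (30.0 - 5.0) / 50.0 / 40.0 * 1000.0  ->  12.5
# This PR sidesteps the issue by casting the counts to float
# (count_a = float(vocab[word_a]), etc.) before calling the scorer,
# which also protects custom scorers.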


# normalized PMI, requires corpus size
@staticmethod
def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0):
pa = worda_count / corpus_word_count
Review comment (Owner, @piskvorky, Jul 21, 2017): Is this meant to be an integer or float division? (dtto below)
pb = wordb_count / corpus_word_count
pab = bigram_count / corpus_word_count
return log(pab / (pa * pb)) / -log(pab)
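As a sanity check on the formula above: a bigram whose words always co-occur scores exactly 1.0, since pab == pa == pb makes log(pab / (pa * pb)) = -log(pab). A self-contained sketch with hypothetical counts:

from math import log

def npmi_scorer(worda_count, wordb_count, bigram_count, corpus_word_count=0.0):
    pa = worda_count / corpus_word_count
    pb = wordb_count / corpus_word_count
    pab = bigram_count / corpus_word_count
    return log(pab / (pa * pb)) / -log(pab)

# 'a' and 'b' each appear 20 times, always together, in a 10000-token corpus:
print(npmi_scorer(20.0, 20.0, 20.0, 10000.0))  # 1.0, perfect association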


def pseudocorpus(source_vocab, sep):
"""Feeds source_vocab's compound keys back to it, to discover phrases"""
@@ -329,15 +385,16 @@ class Phraser(interfaces.TransformationABC):
After the one-time initialization, a Phraser will be much smaller and
somewhat faster than using the full Phrases model.

-    Reflects the results of the source model's `min_count` and `threshold`
-    settings. (You can tamper with those & create a new Phraser to try
+    Reflects the results of the source model's `min_count`, `threshold`, and
+    `scoring` settings. (You can tamper with those & create a new Phraser to try
other values.)

"""
def __init__(self, phrases_model):
self.threshold = phrases_model.threshold
self.min_count = phrases_model.min_count
self.delimiter = phrases_model.delimiter
self.scoring = phrases_model.scoring
self.phrasegrams = {}
corpus = pseudocorpus(phrases_model.vocab, phrases_model.delimiter)
logger.info('source_vocab length %i', len(phrases_model.vocab))
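Typical end-to-end usage of the Phrases/Phraser pair, as a sketch (my_sentences is a hypothetical iterable of tokenized sentences):

from gensim.models.phrases import Phrases, Phraser

bigram = Phrases(my_sentences, min_count=5, threshold=10.0, scoring='default')
bigram_phraser = Phraser(bigram)  # smaller, faster, frozen copy of the model

# apply to a single tokenized sentence:
print(bigram_phraser[['machine', 'learning', 'is', 'fun']])
# or stream over a whole corpus:
transformed = (bigram_phraser[sentence] for sentence in my_sentences)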
32 changes: 31 additions & 1 deletion gensim/test/test_phrases.py
@@ -138,7 +138,7 @@ def testExportPhrases(self):
b'human interface'
])

-    def test_multiple_bigrams_single_entry(self):
+    def testMultipleBigramsSingleEntry(self):
""" a single entry should produce multiple bigrams. """
bigram = Phrases(sentences, min_count=1, threshold=1)

@@ -153,6 +153,36 @@ def test_multiple_bigrams_single_entry(self):
b'human interface'
])

def testScoringDefault(self):
""" test the default scoring, from the mikolov word2vec paper """
bigram = Phrases(sentences, min_count=1, threshold=1)

seen_scores = set()

test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
for phrase, score in bigram.export_phrases(test_sentences):
seen_scores.add(round(score, 3))

assert seen_scores == set([
5.167, # score for graph minors
3.444 # score for human interface
])

def testScoringNpmi(self):
""" test normalized pointwise mutual information scoring """
bigram = Phrases(sentences, min_count=1, threshold=.5, scoring='npmi')

seen_scores = set()

test_sentences = [['graph', 'minors', 'survey', 'human', 'interface']]
for phrase, score in bigram.export_phrases(test_sentences):
seen_scores.add(round(score, 3))

assert seen_scores == set([
.882, # score for graph minors
.714 # score for human interface
])

def testBadParameters(self):
"""Test the phrases module with bad parameters."""
# should fail with something less than or equal to 0