Commit
Refactored code, addressed initial comments.
devashishd12 committed Aug 5, 2016
1 parent 5e44630 commit 20e2d6d
Showing 5 changed files with 114 additions and 123 deletions.
125 changes: 60 additions & 65 deletions gensim/models/coherencemodel.py
@@ -31,10 +31,38 @@

import numpy as np

from collections import namedtuple

logger = logging.getLogger(__name__)

boolean_document_based = ['u_mass']
sliding_window_based = ['c_v', 'c_uci', 'c_npmi']
make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')

coherence_dict = {
'u_mass': make_pipeline(segmentation.s_one_pre,
probability_estimation.p_boolean_document,
direct_confirmation_measure.log_conditional_probability,
aggregation.arithmetic_mean),
'c_v': make_pipeline(segmentation.s_one_set,
probability_estimation.p_boolean_sliding_window,
indirect_confirmation_measure.cosine_similarity,
aggregation.arithmetic_mean),
'c_uci': make_pipeline(segmentation.s_one_one,
probability_estimation.p_boolean_sliding_window,
direct_confirmation_measure.log_ratio_measure,
aggregation.arithmetic_mean),
'c_npmi': make_pipeline(segmentation.s_one_one,
probability_estimation.p_boolean_sliding_window,
direct_confirmation_measure.log_ratio_measure,
aggregation.arithmetic_mean),
}

sliding_windows_dict = {
'c_v': 110,
'c_uci': 10,
'c_npmi': 10
}
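With these tables in place, every coherence measure is a four-stage pipeline: segmentation, probability estimation, confirmation, aggregation. A minimal sketch (not part of this commit) of running one pipeline by hand, assuming `topics` is a list of numpy arrays of token ids and `corpus` a bag-of-words corpus:

measure = coherence_dict['u_mass']
segmented = measure.seg(topics)                       # segmentation (s_one_pre)
postings, num_docs = measure.prob(corpus, segmented)  # probability estimation: word id -> doc-id set
scores = measure.conf(segmented, postings, num_docs)  # confirmation (log conditional probability)
coherence = measure.aggr(scores)                      # aggregation (arithmetic mean)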

class CoherenceModel(interfaces.TransformationABC):
"""
@@ -59,7 +87,7 @@ class CoherenceModel(interfaces.TransformationABC):
Model persistency is achieved via its load/save methods.
"""
def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v'):
def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10):
"""
Args:
----
@@ -84,6 +112,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=
'c_npmi'
For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed.
topn : Integer corresponding to the number of top words to be extracted from each topic.
"""
if model is None and topics is None:
raise ValueError("One of model or topics has to be provided.")
@@ -118,97 +147,63 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=
self.texts = texts
else:
raise ValueError("%s coherence is not currently supported." % coherence)

self.topn = topn
self.model = model
if model is not None:
self.topics = self._get_topics()
elif topics is not None:
self.topics = []
for topic in topics:
t_i = []
for t in range(len(topic)):
t_i.append(dictionary.token2id[topic[t]])
for n, _ in enumerate(topic):
t_i.append(dictionary.token2id[topic[n]])
self.topics.append(np.array(t_i))
self.coherence = coherence
# Set pipeline parameters:
if self.coherence == 'u_mass':
self.seg = segmentation.s_one_pre
self.prob = probability_estimation.p_boolean_document
self.conf = direct_confirmation_measure.log_conditional_probability
self.aggr = aggregation.arithmetic_mean

elif self.coherence == 'c_v':
self.seg = segmentation.s_one_set
self.prob = probability_estimation.p_boolean_sliding_window
self.conf = indirect_confirmation_measure.cosine_similarity
self.aggr = aggregation.arithmetic_mean

elif self.coherence == 'c_uci':
self.seg = segmentation.s_one_one
self.prob = probability_estimation.p_boolean_sliding_window
self.conf = direct_confirmation_measure.log_ratio_measure
self.aggr = aggregation.arithmetic_mean

elif self.coherence == 'c_npmi':
self.seg = segmentation.s_one_one
self.prob = probability_estimation.p_boolean_sliding_window
self.conf = direct_confirmation_measure.normalized_log_ratio_measure
self.aggr = aggregation.arithmetic_mean

def __str__(self):
return "CoherenceModel(segmentation=%s, probability estimation=%s, confirmation measure=%s, aggregation=%s)" % (
self.seg, self.prob, self.conf, self.aggr)
return coherence_dict[self.coherence].__str__()

def _get_topics(self):
"""Internal helper function to return topics from a trained topic model."""
topics = []
if isinstance(self.model, LdaModel):
for topic in self.model.state.get_lambda():
bestn = argsort(topic, topn=10, reverse=True)
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
elif isinstance(self.model, LdaVowpalWabbit):
for topic in self.model._get_topics():
bestn = argsort(topic, topn=10, reverse=True)
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
elif isinstance(self.model, LdaMallet):
for topic in self.model.word_topics:
bestn = argsort(topic, topn=10, reverse=True)
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
else:
raise ValueError("This topic model is not currently supported. Supported topic models are"
"LdaModel, LdaVowpalWabbit and LdaMallet.")
return topics
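For reference, the argsort helper used above just returns the indices of the topn largest weights; an equivalent numpy sketch with made-up weights:

import numpy as np
topic = np.array([0.1, 0.5, 0.2, 0.05, 0.15])  # hypothetical per-word weights for one topic
bestn = np.argsort(topic)[::-1][:3]            # array([1, 2, 4]): ids of the 3 top words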

def get_coherence(self):
if self.coherence == 'u_mass':
segmented_topics = self.seg(self.topics)
per_topic_postings, num_docs = self.prob(self.corpus, segmented_topics)
confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_docs)
return self.aggr(confirmed_measures)

elif self.coherence == 'c_v':
if self.window_size is None:
self.window_size = 110
segmented_topics = self.seg(self.topics)
per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size)
confirmed_measures = self.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows)
return self.aggr(confirmed_measures)

elif self.coherence == 'c_uci':
if self.window_size is None:
self.window_size = 10
segmented_topics = self.seg(self.topics)
per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size)
confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_windows)
return self.aggr(confirmed_measures)

elif self.coherence == 'c_npmi':
if self.window_size is None:
self.window_size = 10
segmented_topics = self.seg(self.topics)
per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size)
confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_windows)
return self.aggr(confirmed_measures)
"""
Return coherence value based on pipeline parameters.
"""
measure = coherence_dict[self.coherence]
segmented_topics = measure.seg(self.topics)
if self.coherence in boolean_document_based:
per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics)
confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_docs)
elif self.coherence in sliding_window_based:
if self.window_size is None:
self.window_size = sliding_windows_dict[self.coherence]
per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size)
if self.coherence == 'c_v':
confirmed_measures = measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows)
else:
if self.coherence == 'c_npmi':
normalize = True
else:
# For c_uci
normalize = False
confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize)
return measure.aggr(confirmed_measures)
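A usage sketch of the refactored class (toy data, not part of this commit), based on the constructor and get_coherence signatures shown above:

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
topics = [['human', 'computer', 'interface'], ['survey', 'user', 'time']]

# 'u_mass' is boolean-document based, so it takes a corpus ...
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print(cm.get_coherence())

# ... while the sliding-window measures take texts and an optional window_size.
cm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())

# The new topn parameter only matters when a trained model is passed instead of
# explicit topics, e.g. (ldamodel here being a hypothetical trained LdaModel):
# cm = CoherenceModel(model=ldamodel, corpus=corpus, coherence='u_mass', topn=10)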
31 changes: 19 additions & 12 deletions gensim/test/test_coherencemodel.py
@@ -35,12 +35,24 @@
['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
boolean_document_based = ['u_mass']
sliding_window_based = ['c_v', 'c_uci', 'c_npmi']


def testfile():
# temporary data will be stored to this file
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')

def checkCoherenceMeasure(topics1, topics2, coherence):
"""Check provided topic coherence algorithm on given topics"""
if coherence in boolean_document_based:
cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence)
cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence)
else:
cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence)
cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence)
return cm1.get_coherence() > cm2.get_coherence()

class TestCoherenceModel(unittest.TestCase):
def setUp(self):
# Suppose given below are the topics which two different LdaModels come up with.
@@ -67,30 +79,25 @@ def setUp(self):

def testUMass(self):
"""Test U_Mass topic coherence algorithm on given topics"""
cm1 = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass')
cm2 = CoherenceModel(topics=self.topics2, corpus=corpus, dictionary=dictionary, coherence='u_mass')
self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'u_mass'))

def testCv(self):
"""Test C_v topic coherence algorithm on given topics"""
cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_v')
cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_v')
self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_v'))

def testCuci(self):
"""Test C_uci topic coherence algorithm on given topics"""
cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_uci')
cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_uci')
self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_uci'))

def testCnpmi(self):
"""Test C_npmi topic coherence algorithm on given topics"""
cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_npmi')
cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_npmi')
self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_npmi'))

def testUMassLdaModel(self):
"""Perform sanity check to see if u_mass coherence works with LDA Model"""
# Note that this is just a sanity check because LDA does not guarantee a better coherence
# value on the topics if iterations are increased. This can be seen here:
# https://gist.github.com/dsquareindia/60fd9ab65b673711c3fa00509287ddde
try:
cm = CoherenceModel(model=self.ldamodel, corpus=corpus, coherence='u_mass')
except:
2 changes: 1 addition & 1 deletion gensim/test/test_direct_confirmation.py
@@ -37,7 +37,7 @@ def testLogRatioMeasure(self):

def testNormalizedLogRatioMeasure(self):
"""Test normalized_log_ratio_measure()"""
obtained = direct_confirmation_measure.normalized_log_ratio_measure(self.segmentation, self.posting_list, self.num_docs)[0]
obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.posting_list, self.num_docs, normalize=True)[0]
# Answer should be ~ -0.182321557 / -ln(1 / 5) = -0.113282753
expected = -0.113282753
self.assertAlmostEqual(obtained, expected)
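The expected value follows directly from the NPMI definition; a quick check of the arithmetic in the comment (EPSILON omitted as negligible):

import numpy as np
m_lr = -0.182321557                 # PMI value from testLogRatioMeasure
co_doc_prob = 1.0 / 5               # P(W', W*): the pair co-occurs in 1 of 5 documents
print(m_lr / -np.log(co_doc_prob))  # -0.11328275...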
58 changes: 21 additions & 37 deletions gensim/topic_coherence/direct_confirmation_measure.py
@@ -42,12 +42,18 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):

return m_lc

def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False):
"""
Popularly known as PMI.
This function calculates the log-ratio-measure which is used by
coherence measures such as c_v.
This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]
If normalize=False:
Popularly known as PMI.
This function calculates the log-ratio-measure which is used by
coherence measures such as c_v.
This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]
If normalize=True:
This function calculates the normalized-log-ratio-measure, popularly known as
NPMI, which is used by coherence measures such as c_v.
This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]
Args:
----
@@ -65,38 +71,16 @@ def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
w_prime_docs = per_topic_postings[w_prime]
w_star_docs = per_topic_postings[w_star]
co_docs = w_prime_docs.intersection(w_star_docs)
numerator = (len(co_docs) / float(num_docs)) + EPSILON
denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs))
m_lr_i = np.log(numerator / denominator)
if normalize:
# For normalized log ratio measure
numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0]
co_doc_prob = len(co_docs) / float(num_docs)
m_lr_i = numerator / (-np.log(co_doc_prob + EPSILON))
else:
# For log ratio measure without normalization
numerator = (len(co_docs) / float(num_docs)) + EPSILON
denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs))
m_lr_i = np.log(numerator / denominator)
m_lr.append(m_lr_i)

return m_lr

def normalized_log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
"""
This function calculates the normalized-log-ratio-measure, popularly knowns as
NPMI which is used by coherence measures such as c_v.
This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]
Args:
----
segmented topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics
num_docs : Total number of documents in corpus. Used for calculating probability.
Returns:
-------
m_nlr : List of log ratio measures on each set in segmented topics.
"""
m_nlr = []
for s_i in segmented_topics:
for w_prime, w_star in s_i:
numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0]
w_prime_docs = per_topic_postings[w_prime]
w_star_docs = per_topic_postings[w_star]
co_docs = w_prime_docs.intersection(w_star_docs)
co_doc_prob = len(co_docs) / float(num_docs)
m_nlr_i = numerator / (-np.log(co_doc_prob + EPSILON))
m_nlr.append(m_nlr_i)

return m_nlr
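A toy run of the merged function (not part of this commit), with made-up posting sets, showing how the normalize flag selects PMI vs. NPMI:

from gensim.topic_coherence import direct_confirmation_measure

segmented = [[(1, 2)]]                      # one segment: word id 1 confirmed by word id 2
postings = {1: {0, 1, 2}, 2: {1, 2, 3, 4}}  # word id -> set of ids of documents containing it
num_docs = 5

pmi = direct_confirmation_measure.log_ratio_measure(segmented, postings, num_docs)
# pmi[0] ~ log((2/5) / ((3/5) * (4/5))) ~ -0.182
npmi = direct_confirmation_measure.log_ratio_measure(segmented, postings, num_docs, normalize=True)
# npmi[0] ~ pmi[0] / -log(2/5) ~ -0.199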
21 changes: 13 additions & 8 deletions gensim/topic_coherence/indirect_confirmation_measure.py
@@ -8,12 +8,12 @@
This module contains functions to compute confirmation on a pair of words or word subsets.
The formula used to compute indirect confirmation measure is:
m_sim(m, gamma)(W', W*) = s_sim(V_m,gamma(W'), V_m,gamma(W*))
m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))
where s_sim can be cosine, dice or jaccard similarity and
V_m,gamma(W') = {sigma(w_i belonging to W') m(w_i, w_j) ^ gamma} where j = 1, ..., |W|
\vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
Here 'm' is the direct confirmation measure used.
"""
@@ -52,15 +52,15 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc
for w_j in w:
for w_i in w_prime:
if (w_i, w_j) not in backtrack:
backtrack[(w_i, w_j)] = measure([[(w_i, w_j)]], per_topic_postings, num_docs)[0]
backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], per_topic_postings, num_docs, measure[1])[0]
if w_j not in context_vectors:
context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma
else:
context_vectors[w_j] += backtrack[(w_i, w_j)] ** gamma
else:
for w_j in w:
if (w_prime, w_j) not in backtrack:
backtrack[(w_prime, w_j)] = measure([[(w_prime, w_j)]], per_topic_postings, num_docs)[0]
backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], per_topic_postings, num_docs, measure[1])[0]
context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma
return (context_vectors, backtrack)

Expand All @@ -70,7 +70,11 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam
u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*). The indirect
cosine measure is computed as the cosine similarity between u and w. The formula used is:
m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))
where each vector \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
Args:
----
@@ -86,7 +90,8 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam
s_cos_sim : array of cosine similarity of the context vectors for each segmentation
"""
if measure == 'nlr':
measure = direct_confirmation_measure.normalized_log_ratio_measure
# make normalized log ratio measure tuple
measure = (direct_confirmation_measure.log_ratio_measure, True)
else:
raise ValueError("The direct confirmation measure you entered is not currently supported.")
backtrack = {} # Backtracking dictionary for storing measure values of topic id tuples eg. (1, 2).
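To make the indirect measure concrete, a toy sketch (not part of this commit) of the context vectors that _make_seg accumulates and the cosine step, for a made-up direct measure m over a two-word vocabulary:

import numpy as np

m = {(0, 0): 1.0, (0, 1): 0.2, (1, 0): 0.2, (1, 1): 1.0}  # made-up m(w_i, w_j) values
gamma = 1

def context_vector(word_set, vocab):
    # v_j = sum over w_i in word_set of m(w_i, w_j) ** gamma, for each w_j in vocab
    return np.array([sum(m[(w_i, w_j)] ** gamma for w_i in word_set) for w_j in vocab])

u = context_vector({0}, [0, 1])  # V(W') for W' = {0}
w = context_vector({1}, [0, 1])  # V(W*) for W* = {1}
print(u.dot(w) / (np.linalg.norm(u) * np.linalg.norm(w)))  # cosine similarity s_cos_sim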
