diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index f819dbe837..69f7ba8238 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -31,10 +31,38 @@
 import numpy as np
 
+from collections import namedtuple
+
 logger = logging.getLogger(__name__)
 
 boolean_document_based = ['u_mass']
 sliding_window_based = ['c_v', 'c_uci', 'c_npmi']
 
+make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')
+
+coherence_dict = {
+    'u_mass': make_pipeline(segmentation.s_one_pre,
+                            probability_estimation.p_boolean_document,
+                            direct_confirmation_measure.log_conditional_probability,
+                            aggregation.arithmetic_mean),
+    'c_v': make_pipeline(segmentation.s_one_set,
+                         probability_estimation.p_boolean_sliding_window,
+                         indirect_confirmation_measure.cosine_similarity,
+                         aggregation.arithmetic_mean),
+    'c_uci': make_pipeline(segmentation.s_one_one,
+                           probability_estimation.p_boolean_sliding_window,
+                           direct_confirmation_measure.log_ratio_measure,
+                           aggregation.arithmetic_mean),
+    'c_npmi': make_pipeline(segmentation.s_one_one,
+                            probability_estimation.p_boolean_sliding_window,
+                            direct_confirmation_measure.log_ratio_measure,
+                            aggregation.arithmetic_mean),
+}
+
+sliding_windows_dict = {
+    'c_v': 110,
+    'c_uci': 10,
+    'c_npmi': 10
+}
 
 class CoherenceModel(interfaces.TransformationABC):
     """
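The coherence_dict above replaces the per-measure if/elif chain in __init__: every supported coherence measure maps to one namedtuple bundling its four pipeline stages (segmentation, probability estimation, confirmation measure, aggregation). A minimal sketch of the lookup pattern, with dummy stage functions standing in for the real gensim modules:

# Sketch only: the lambdas below are dummies standing in for
# segmentation.s_one_pre, probability_estimation.p_boolean_document, etc.
from collections import namedtuple

make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')

pipeline = make_pipeline(
    seg=lambda topics: [[(w1, w2) for w1 in t for w2 in t if w1 != w2] for t in topics],
    prob=lambda corpus, seg: ({0: {0, 1}, 1: {1}}, 2),   # (postings, num_docs)
    conf=lambda seg, postings, n: [0.5 for s in seg],    # one score per topic
    aggr=lambda scores: sum(scores) / len(scores),
)

segmented = pipeline.seg([[0, 1]])
postings, num_docs = pipeline.prob(None, segmented)
print(pipeline.aggr(pipeline.conf(segmented, postings, num_docs)))  # 0.5

The same four attribute names (seg, prob, conf, aggr) are what get_coherence reads off the namedtuple below.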
@@ -59,7 +87,7 @@ class CoherenceModel(interfaces.TransformationABC):
 
     Model persistency is achieved via its load/save methods.
     """
-    def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v'):
+    def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10):
         """
         Args:
         ----
@@ -84,6 +112,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=
                      'c_npmi'
                     For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed.
+        topn : Integer corresponding to the number of top words to be extracted from each topic.
         """
         if model is None and topics is None:
             raise ValueError("One of model or topics has to be provided.")
@@ -118,7 +147,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=
                 self.texts = texts
             else:
                 raise ValueError("%s coherence is not currently supported." % coherence)
-
+        self.topn = topn
         self.model = model
         if model is not None:
             self.topics = self._get_topics()
@@ -126,53 +155,28 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=
             self.topics = []
             for topic in topics:
                 t_i = []
-                for t in range(len(topic)):
-                    t_i.append(dictionary.token2id[topic[t]])
+                for token in topic:
+                    t_i.append(dictionary.token2id[token])
                 self.topics.append(np.array(t_i))
         self.coherence = coherence
-        # Set pipeline parameters:
-        if self.coherence == 'u_mass':
-            self.seg = segmentation.s_one_pre
-            self.prob = probability_estimation.p_boolean_document
-            self.conf = direct_confirmation_measure.log_conditional_probability
-            self.aggr = aggregation.arithmetic_mean
-
-        elif self.coherence == 'c_v':
-            self.seg = segmentation.s_one_set
-            self.prob = probability_estimation.p_boolean_sliding_window
-            self.conf = indirect_confirmation_measure.cosine_similarity
-            self.aggr = aggregation.arithmetic_mean
-
-        elif self.coherence == 'c_uci':
-            self.seg = segmentation.s_one_one
-            self.prob = probability_estimation.p_boolean_sliding_window
-            self.conf = direct_confirmation_measure.log_ratio_measure
-            self.aggr = aggregation.arithmetic_mean
-
-        elif self.coherence == 'c_npmi':
-            self.seg = segmentation.s_one_one
-            self.prob = probability_estimation.p_boolean_sliding_window
-            self.conf = direct_confirmation_measure.normalized_log_ratio_measure
-            self.aggr = aggregation.arithmetic_mean
 
     def __str__(self):
-        return "CoherenceModel(segmentation=%s, probability estimation=%s, confirmation measure=%s, aggregation=%s)" % (
-            self.seg, self.prob, self.conf, self.aggr)
+        return coherence_dict[self.coherence].__str__()
 
     def _get_topics(self):
         """Internal helper function to return topics from a trained topic model."""
         topics = []
         if isinstance(self.model, LdaModel):
             for topic in self.model.state.get_lambda():
-                bestn = argsort(topic, topn=10, reverse=True)
+                bestn = argsort(topic, topn=self.topn, reverse=True)
                 topics.append(bestn)
         elif isinstance(self.model, LdaVowpalWabbit):
             for topic in self.model._get_topics():
-                bestn = argsort(topic, topn=10, reverse=True)
+                bestn = argsort(topic, topn=self.topn, reverse=True)
                 topics.append(bestn)
         elif isinstance(self.model, LdaMallet):
             for topic in self.model.word_topics:
-                bestn = argsort(topic, topn=10, reverse=True)
+                bestn = argsort(topic, topn=self.topn, reverse=True)
                 topics.append(bestn)
         else:
             raise ValueError("This topic model is not currently supported. Supported topic models are"
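With the new topn parameter, _get_topics delegates the top-word selection to gensim's argsort helper (already imported by this module). A toy illustration of what those calls return, with an invented topic-word distribution:

# Illustration of argsort(topic, topn=..., reverse=True) as used above;
# the probability vector is made up.
import numpy as np
from gensim.matutils import argsort

topic = np.array([0.10, 0.40, 0.05, 0.30, 0.15])
print(argsort(topic, topn=3, reverse=True))  # -> [1 3 4]: ids of the 3 top words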
@@ -180,35 +184,26 @@ def _get_topics(self):
         return topics
 
     def get_coherence(self):
-        if self.coherence == 'u_mass':
-            segmented_topics = self.seg(self.topics)
-            per_topic_postings, num_docs = self.prob(self.corpus, segmented_topics)
-            confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_docs)
-            return self.aggr(confirmed_measures)
-
-        elif self.coherence == 'c_v':
-            if self.window_size is None:
-                self.window_size = 110
-            segmented_topics = self.seg(self.topics)
-            per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
-                                                        dictionary=self.dictionary, window_size=self.window_size)
-            confirmed_measures = self.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows)
-            return self.aggr(confirmed_measures)
-
-        elif self.coherence == 'c_uci':
-            if self.window_size is None:
-                self.window_size = 10
-            segmented_topics = self.seg(self.topics)
-            per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
-                                                        dictionary=self.dictionary, window_size=self.window_size)
-            confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_windows)
-            return self.aggr(confirmed_measures)
-
-        elif self.coherence == 'c_npmi':
-            if self.window_size is None:
-                self.window_size = 10
-            segmented_topics = self.seg(self.topics)
-            per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
-                                                        dictionary=self.dictionary, window_size=self.window_size)
-            confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_windows)
-            return self.aggr(confirmed_measures)
+        """
+        Return coherence value based on pipeline parameters.
+        """
+        measure = coherence_dict[self.coherence]
+        segmented_topics = measure.seg(self.topics)
+        if self.coherence in boolean_document_based:
+            per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics)
+            confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_docs)
+        elif self.coherence in sliding_window_based:
+            if self.window_size is None:
+                # Fall back to the measure's default window size only when the
+                # user has not supplied one.
+                self.window_size = sliding_windows_dict[self.coherence]
+            per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics,
+                                                           dictionary=self.dictionary, window_size=self.window_size)
+            if self.coherence == 'c_v':
+                confirmed_measures = measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows)
+            else:
+                # c_npmi normalizes the log ratio measure; c_uci does not.
+                normalize = (self.coherence == 'c_npmi')
+                confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize)
+        return measure.aggr(confirmed_measures)
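The rewritten get_coherence dispatches on the measure family: boolean-document measures consume a corpus, sliding-window measures consume tokenized texts. A hypothetical usage sketch mirroring the test fixtures below (the topics and texts are invented):

# Hypothetical usage of the refactored CoherenceModel; data is invented.
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel

texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time'],
         ['eps', 'user', 'interface', 'system']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

topics = [['human', 'computer', 'system'], ['survey', 'user', 'time']]

# u_mass is boolean-document based, so it takes a corpus...
cm_umass = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
# ...while the sliding-window measures take tokenized texts.
cm_cv = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
print(cm_umass.get_coherence(), cm_cv.get_coherence())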
diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py
index 655987252d..3961f67180 100644
--- a/gensim/test/test_coherencemodel.py
+++ b/gensim/test/test_coherencemodel.py
@@ -35,12 +35,24 @@
          ['graph', 'minors', 'survey']]
 dictionary = Dictionary(texts)
 corpus = [dictionary.doc2bow(text) for text in texts]
+boolean_document_based = ['u_mass']
+sliding_window_based = ['c_v', 'c_uci', 'c_npmi']
 
 def testfile():
     # temporary data will be stored to this file
     return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')
 
+def checkCoherenceMeasure(topics1, topics2, coherence):
+    """Check provided topic coherence algorithm on given topics"""
+    if coherence in boolean_document_based:
+        cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence)
+        cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence)
+    else:
+        cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence)
+        cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence)
+    return cm1.get_coherence() > cm2.get_coherence()
+
 class TestCoherenceModel(unittest.TestCase):
     def setUp(self):
         # Suppose given below are the topics which two different LdaModels come up with.
@@ -67,30 +79,25 @@ def setUp(self):
 
     def testUMass(self):
         """Test U_Mass topic coherence algorithm on given topics"""
-        cm1 = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass')
-        cm2 = CoherenceModel(topics=self.topics2, corpus=corpus, dictionary=dictionary, coherence='u_mass')
-        self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
+        self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'u_mass'))
 
     def testCv(self):
         """Test C_v topic coherence algorithm on given topics"""
-        cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_v')
-        cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_v')
-        self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
+        self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_v'))
 
     def testCuci(self):
         """Test C_uci topic coherence algorithm on given topics"""
-        cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_uci')
-        cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_uci')
-        self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
+        self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_uci'))
 
     def testCnpmi(self):
         """Test C_npmi topic coherence algorithm on given topics"""
-        cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_npmi')
-        cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_npmi')
-        self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
+        self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_npmi'))
 
     def testUMassLdaModel(self):
         """Perform sanity check to see if u_mass coherence works with LDA Model"""
+        # Note that this is just a sanity check because LDA does not guarantee a better coherence
+        # value on the topics if iterations are increased. This can be seen here:
+        # https://gist.github.com/dsquareindia/60fd9ab65b673711c3fa00509287ddde
        try:
            cm = CoherenceModel(model=self.ldamodel, corpus=corpus, coherence='u_mass')
        except:
diff --git a/gensim/test/test_direct_confirmation.py b/gensim/test/test_direct_confirmation.py
index 5a8ad9bb43..cb35f0acc4 100644
--- a/gensim/test/test_direct_confirmation.py
+++ b/gensim/test/test_direct_confirmation.py
@@ -37,7 +37,7 @@ def testLogRatioMeasure(self):
 
     def testNormalizedLogRatioMeasure(self):
         """Test normalized_log_ratio_measure()"""
-        obtained = direct_confirmation_measure.normalized_log_ratio_measure(self.segmentation, self.posting_list, self.num_docs)[0]
+        obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.posting_list, self.num_docs, normalize=True)[0]
         # Answer should be ~ -0.182321557 / -ln(1 / 5) = -0.113282753
         expected = -0.113282753
         self.assertAlmostEqual(obtained, expected)
diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py
index 271a4fa18d..83227822e9 100644
--- a/gensim/topic_coherence/direct_confirmation_measure.py
+++ b/gensim/topic_coherence/direct_confirmation_measure.py
@@ -42,12 +42,18 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):
 
     return m_lc
 
-def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
+def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False):
     """
-    Popularly known as PMI.
-    This function calculates the log-ratio-measure which is used by
-    coherence measures such as c_v.
-    This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]
+    If normalize=False:
+        Popularly known as PMI.
+        This function calculates the log-ratio-measure which is used by
+        coherence measures such as c_uci.
+        This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]
+
+    If normalize=True:
+        This function calculates the normalized-log-ratio-measure, popularly known as
+        NPMI, which is used by coherence measures such as c_v.
+        This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]
 
     Args:
     ----
@@ -65,38 +71,16 @@ def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
             w_prime_docs = per_topic_postings[w_prime]
             w_star_docs = per_topic_postings[w_star]
             co_docs = w_prime_docs.intersection(w_star_docs)
-            numerator = (len(co_docs) / float(num_docs)) + EPSILON
-            denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs))
-            m_lr_i = np.log(numerator / denominator)
+            if normalize:
+                # For normalized log ratio measure
+                numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0]
+                co_doc_prob = len(co_docs) / float(num_docs)
+                m_lr_i = numerator / (-np.log(co_doc_prob + EPSILON))
+            else:
+                # For log ratio measure without normalization
+                numerator = (len(co_docs) / float(num_docs)) + EPSILON
+                denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs))
+                m_lr_i = np.log(numerator / denominator)
             m_lr.append(m_lr_i)
 
     return m_lr
-
-def normalized_log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
-    """
-    This function calculates the normalized-log-ratio-measure, popularly knowns as
-    NPMI which is used by coherence measures such as c_v.
-    This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]
-
-    Args:
-    ----
-    segmented topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
-    per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics
-    num_docs : Total number of documents in corpus. Used for calculating probability.
-
-    Returns:
-    -------
-    m_nlr : List of log ratio measures on each set in segmented topics.
-    """
-    m_nlr = []
-    for s_i in segmented_topics:
-        for w_prime, w_star in s_i:
-            numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0]
-            w_prime_docs = per_topic_postings[w_prime]
-            w_star_docs = per_topic_postings[w_star]
-            co_docs = w_prime_docs.intersection(w_star_docs)
-            co_doc_prob = len(co_docs) / float(num_docs)
-            m_nlr_i = numerator / (-np.log(co_doc_prob + EPSILON))
-            m_nlr.append(m_nlr_i)
-
-    return m_nlr
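The merged log_ratio_measure keeps both behaviors behind the normalize flag, and the test's expected value can be reproduced by hand. A sketch with a posting list consistent with the numbers quoted in testNormalizedLogRatioMeasure (EPSILON assumed to match the module constant, 1e-12):

# Reproducing the test's expected value by hand; the posting list is chosen
# to be consistent with the comment "-0.182321557 / -ln(1 / 5)" in the test.
import numpy as np

EPSILON = 1e-12  # assumed module constant
posting_list = {1: {2, 3, 4}, 2: {3, 5}}
num_docs = 5

co_docs = posting_list[1] & posting_list[2]          # {3}
p_joint = len(co_docs) / float(num_docs)             # 1/5
p_w_prime = len(posting_list[1]) / float(num_docs)   # 3/5
p_w_star = len(posting_list[2]) / float(num_docs)    # 2/5

m_lr = np.log((p_joint + EPSILON) / (p_w_prime * p_w_star))  # ~ -0.182321557
m_nlr = m_lr / -np.log(p_joint + EPSILON)                    # ~ -0.113282753
print(m_lr, m_nlr)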
diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py
index dcb1432eaf..a9daa246c3 100644
--- a/gensim/topic_coherence/indirect_confirmation_measure.py
+++ b/gensim/topic_coherence/indirect_confirmation_measure.py
@@ -8,12 +8,12 @@
 This module contains functions to compute confirmation on a pair of words or word subsets.
 The formula used to compute indirect confirmation measure is:
-                _                          _
-m_sim(m, gamma)(W', W*) = s_sim(V_m,gamma(W'), V_m,gamma(W*))
+
+m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))
 
 where s_sim can be cosine, dice or jaccard similarity and
-_
-V_m,gamma(W') = {sigma(w' belonging to W') m(w_i, w_j) ^ gamma} where j = 1, ...., |W|
+
+\vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
 
 Here 'm' is the direct confirmation measure used.
 """
@@ -52,7 +52,7 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs):
     for w_j in w:
         for w_i in w_prime:
             if (w_i, w_j) not in backtrack:
-                backtrack[(w_i, w_j)] = measure([[(w_i, w_j)]], per_topic_postings, num_docs)[0]
+                backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], per_topic_postings, num_docs, measure[1])[0]
             if w_j not in context_vectors:
                 context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma
             else:
@@ -60,7 +60,7 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_docs):
     else:
         for w_j in w:
             if (w_prime, w_j) not in backtrack:
-                backtrack[(w_prime, w_j)] = measure([[(w_prime, w_j)]], per_topic_postings, num_docs)[0]
+                backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], per_topic_postings, num_docs, measure[1])[0]
             context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma
     return (context_vectors, backtrack)
@@ -70,7 +70,11 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
         _                                 _
     u = V(W')     and     w = V(W*)     for the word sets of a pair S_i = (W', W*) indirect
                                               _     _
-    cosine measure is computed as the cosine similarity between u and w.
+    cosine measure is computed as the cosine similarity between u and w. The formula used is:
+
+    m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))
+
+    where each vector \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
 
     Args:
     ----
@@ -86,7 +90,8 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gamma, num_docs):
     s_cos_sim : array of cosine similarity of the context vectors for each segmentation
     """
     if measure == 'nlr':
-        measure = direct_confirmation_measure.normalized_log_ratio_measure
+        # make normalized log ratio measure tuple
+        measure = (direct_confirmation_measure.log_ratio_measure, True)
     else:
         raise ValueError("The direct confirmation measure you entered is not currently supported.")
     backtrack = {}  # Backtracking dictionary for storing measure values of topic id tuples eg. (1, 2).
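To make the context-vector formula concrete, here is a small self-contained sketch of how the two vectors are built and compared (not the module's code; the direct-measure values m(w_i, w_j) are invented):

# Illustrative sketch of the indirect cosine measure; the constant stand-in
# values for the direct measure m are made up.
import numpy as np

W = [1, 2, 3]                                    # all word ids of the topic
gamma = 1
m = {(w_i, w_j): 0.5 for w_i in W for w_j in W}  # stand-in direct measure values

def context_vector(word_set):
    # V_{m,gamma}(W') = { sum_{w_i in W'} m(w_i, w_j)^gamma }_{j = 1,...,|W|}
    return np.array([sum(m[(w_i, w_j)] ** gamma for w_i in word_set)
                     for w_j in W])

u = context_vector([1])       # W'
w = context_vector([2, 3])    # W*
s_cos_sim = u.dot(w) / (np.linalg.norm(u) * np.linalg.norm(w))
print(s_cos_sim)              # 1.0 with this constant stand-in measure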