Commit
Refactored code, addressed initial comments.
devashishd12 committed Aug 5, 2016
1 parent 5e44630 commit 20e2d6d
Showing 5 changed files with 114 additions and 123 deletions.
125 changes: 60 additions & 65 deletions gensim/models/coherencemodel.py
@@ -31,10 +31,38 @@

import numpy as np

from collections import namedtuple

logger = logging.getLogger(__name__)

boolean_document_based = ['u_mass']
sliding_window_based = ['c_v', 'c_uci', 'c_npmi']
make_pipeline = namedtuple('Coherence_Measure', 'seg, prob, conf, aggr')

coherence_dict = {
'u_mass': make_pipeline(segmentation.s_one_pre,
probability_estimation.p_boolean_document,
direct_confirmation_measure.log_conditional_probability,
aggregation.arithmetic_mean),
'c_v': make_pipeline(segmentation.s_one_set,
probability_estimation.p_boolean_sliding_window,
indirect_confirmation_measure.cosine_similarity,
aggregation.arithmetic_mean),
'c_uci': make_pipeline(segmentation.s_one_one,
probability_estimation.p_boolean_sliding_window,
direct_confirmation_measure.log_ratio_measure,
aggregation.arithmetic_mean),
'c_npmi': make_pipeline(segmentation.s_one_one,
probability_estimation.p_boolean_sliding_window,
direct_confirmation_measure.log_ratio_measure,
aggregation.arithmetic_mean),
}

sliding_windows_dict = {
'c_v': 110,
'c_uci': 10,
'c_npmi': 10
}
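With these tables in place, every coherence measure is a four-stage pipeline: segmentation, probability estimation, confirmation, aggregation. A minimal sketch (not part of this commit) of running one pipeline by hand, assuming `topics` is a list of numpy arrays of token ids and `corpus` a bag-of-words corpus:

measure = coherence_dict['u_mass']
segmented = measure.seg(topics)                       # segmentation (s_one_pre)
postings, num_docs = measure.prob(corpus, segmented)  # probability estimation: word id -> doc-id set
scores = measure.conf(segmented, postings, num_docs)  # confirmation (log conditional probability)
coherence = measure.aggr(scores)                      # aggregation (arithmetic mean)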

class CoherenceModel(interfaces.TransformationABC):
"""
@@ -59,7 +87,7 @@ class CoherenceModel(interfaces.TransformationABC):
Model persistency is achieved via its load/save methods.
"""
def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v'):
def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10):
"""
Args:
----
@@ -84,6 +112,7 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=
'c_npmi'
For 'u_mass' corpus should be provided. If texts is provided, it will be converted to corpus using the dictionary.
For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. Corpus is not needed.
topn : Integer corresponding to the number of top words to be extracted from each topic.
"""
if model is None and topics is None:
raise ValueError("One of model or topics has to be provided.")
@@ -118,97 +147,63 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=
self.texts = texts
else:
raise ValueError("%s coherence is not currently supported." % coherence)

self.topn = topn
self.model = model
if model is not None:
self.topics = self._get_topics()
elif topics is not None:
self.topics = []
for topic in topics:
t_i = []
for t in range(len(topic)):
t_i.append(dictionary.token2id[topic[t]])
for n, _ in enumerate(topic):
t_i.append(dictionary.token2id[topic[n]])
self.topics.append(np.array(t_i))
self.coherence = coherence
# Set pipeline parameters:
if self.coherence == 'u_mass':
self.seg = segmentation.s_one_pre
self.prob = probability_estimation.p_boolean_document
self.conf = direct_confirmation_measure.log_conditional_probability
self.aggr = aggregation.arithmetic_mean

elif self.coherence == 'c_v':
self.seg = segmentation.s_one_set
self.prob = probability_estimation.p_boolean_sliding_window
self.conf = indirect_confirmation_measure.cosine_similarity
self.aggr = aggregation.arithmetic_mean

elif self.coherence == 'c_uci':
self.seg = segmentation.s_one_one
self.prob = probability_estimation.p_boolean_sliding_window
self.conf = direct_confirmation_measure.log_ratio_measure
self.aggr = aggregation.arithmetic_mean

elif self.coherence == 'c_npmi':
self.seg = segmentation.s_one_one
self.prob = probability_estimation.p_boolean_sliding_window
self.conf = direct_confirmation_measure.normalized_log_ratio_measure
self.aggr = aggregation.arithmetic_mean

def __str__(self):
return "CoherenceModel(segmentation=%s, probability estimation=%s, confirmation measure=%s, aggregation=%s)" % (
self.seg, self.prob, self.conf, self.aggr)
return coherence_dict[self.coherence].__str__()

def _get_topics(self):
"""Internal helper function to return topics from a trained topic model."""
topics = []
if isinstance(self.model, LdaModel):
for topic in self.model.state.get_lambda():
bestn = argsort(topic, topn=10, reverse=True)
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
elif isinstance(self.model, LdaVowpalWabbit):
for topic in self.model._get_topics():
bestn = argsort(topic, topn=10, reverse=True)
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
elif isinstance(self.model, LdaMallet):
for topic in self.model.word_topics:
bestn = argsort(topic, topn=10, reverse=True)
bestn = argsort(topic, topn=self.topn, reverse=True)
topics.append(bestn)
else:
raise ValueError("This topic model is not currently supported. Supported topic models are"
"LdaModel, LdaVowpalWabbit and LdaMallet.")
return topics
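For reference, the argsort helper used above just returns the indices of the topn largest weights; an equivalent numpy sketch with made-up weights:

import numpy as np
topic = np.array([0.1, 0.5, 0.2, 0.05, 0.15])  # hypothetical per-word weights for one topic
bestn = np.argsort(topic)[::-1][:3]            # array([1, 2, 4]): ids of the 3 top words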

def get_coherence(self):
if self.coherence == 'u_mass':
segmented_topics = self.seg(self.topics)
per_topic_postings, num_docs = self.prob(self.corpus, segmented_topics)
confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_docs)
return self.aggr(confirmed_measures)

elif self.coherence == 'c_v':
if self.window_size is None:
self.window_size = 110
segmented_topics = self.seg(self.topics)
per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size)
confirmed_measures = self.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows)
return self.aggr(confirmed_measures)

elif self.coherence == 'c_uci':
if self.window_size is None:
self.window_size = 10
segmented_topics = self.seg(self.topics)
per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size)
confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_windows)
return self.aggr(confirmed_measures)

elif self.coherence == 'c_npmi':
if self.window_size is None:
self.window_size = 10
segmented_topics = self.seg(self.topics)
per_topic_postings, num_windows = self.prob(texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size)
confirmed_measures = self.conf(segmented_topics, per_topic_postings, num_windows)
return self.aggr(confirmed_measures)
"""
Return coherence value based on pipeline parameters.
"""
measure = coherence_dict[self.coherence]
segmented_topics = measure.seg(self.topics)
if self.coherence in boolean_document_based:
per_topic_postings, num_docs = measure.prob(self.corpus, segmented_topics)
confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_docs)
elif self.coherence in sliding_window_based:
if self.window_size is None:
self.window_size = sliding_windows_dict[self.coherence]
per_topic_postings, num_windows = measure.prob(texts=self.texts, segmented_topics=segmented_topics,
dictionary=self.dictionary, window_size=self.window_size)
if self.coherence == 'c_v':
confirmed_measures = measure.conf(self.topics, segmented_topics, per_topic_postings, 'nlr', 1, num_windows)
else:
if self.coherence == 'c_npmi':
normalize = True
else:
# For c_uci
normalize = False
confirmed_measures = measure.conf(segmented_topics, per_topic_postings, num_windows, normalize=normalize)
return measure.aggr(confirmed_measures)
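A usage sketch of the refactored class (toy data, not part of this commit), based on the constructor and get_coherence signatures shown above:

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

texts = [['human', 'interface', 'computer'],
         ['survey', 'user', 'computer', 'system', 'response', 'time']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
topics = [['human', 'computer', 'interface'], ['survey', 'user', 'time']]

# 'u_mass' is boolean-document based, so it takes a corpus ...
cm = CoherenceModel(topics=topics, corpus=corpus, dictionary=dictionary, coherence='u_mass')
print(cm.get_coherence())

# ... while the sliding-window measures take texts and an optional window_size.
cm = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())

# The new topn parameter only matters when a trained model is passed instead of
# explicit topics, e.g. (ldamodel here being a hypothetical trained LdaModel):
# cm = CoherenceModel(model=ldamodel, corpus=corpus, coherence='u_mass', topn=10)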
31 changes: 19 additions & 12 deletions gensim/test/test_coherencemodel.py
@@ -35,12 +35,24 @@
['graph', 'minors', 'survey']]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
boolean_document_based = ['u_mass']
sliding_window_based = ['c_v', 'c_uci', 'c_npmi']


def testfile():
# temporary data will be stored to this file
return os.path.join(tempfile.gettempdir(), 'gensim_models.tst')

def checkCoherenceMeasure(topics1, topics2, coherence):
"""Check provided topic coherence algorithm on given topics"""
if coherence in boolean_document_based:
cm1 = CoherenceModel(topics=topics1, corpus=corpus, dictionary=dictionary, coherence=coherence)
cm2 = CoherenceModel(topics=topics2, corpus=corpus, dictionary=dictionary, coherence=coherence)
else:
cm1 = CoherenceModel(topics=topics1, texts=texts, dictionary=dictionary, coherence=coherence)
cm2 = CoherenceModel(topics=topics2, texts=texts, dictionary=dictionary, coherence=coherence)
return cm1.get_coherence() > cm2.get_coherence()

class TestCoherenceModel(unittest.TestCase):
def setUp(self):
# Suppose given below are the topics which two different LdaModels come up with.
@@ -67,30 +79,25 @@ def setUp(self):

def testUMass(self):
"""Test U_Mass topic coherence algorithm on given topics"""
cm1 = CoherenceModel(topics=self.topics1, corpus=corpus, dictionary=dictionary, coherence='u_mass')
cm2 = CoherenceModel(topics=self.topics2, corpus=corpus, dictionary=dictionary, coherence='u_mass')
self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'u_mass'))

def testCv(self):
"""Test C_v topic coherence algorithm on given topics"""
cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_v')
cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_v')
self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_v'))

def testCuci(self):
"""Test C_uci topic coherence algorithm on given topics"""
cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_uci')
cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_uci')
self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_uci'))

def testCnpmi(self):
"""Test C_npmi topic coherence algorithm on given topics"""
cm1 = CoherenceModel(topics=self.topics1, texts=texts, dictionary=dictionary, coherence='c_npmi')
cm2 = CoherenceModel(topics=self.topics2, texts=texts, dictionary=dictionary, coherence='c_npmi')
self.assertTrue(cm1.get_coherence() > cm2.get_coherence())
self.assertTrue(checkCoherenceMeasure(self.topics1, self.topics2, 'c_npmi'))

def testUMassLdaModel(self):
"""Perform sanity check to see if u_mass coherence works with LDA Model"""
# Note that this is just a sanity check because LDA does not guarantee a better coherence
# value on the topics if iterations are increased. This can be seen here:
# https://gist.github.com/dsquareindia/60fd9ab65b673711c3fa00509287ddde
try:
cm = CoherenceModel(model=self.ldamodel, corpus=corpus, coherence='u_mass')
except:
2 changes: 1 addition & 1 deletion gensim/test/test_direct_confirmation.py
@@ -37,7 +37,7 @@ def testLogRatioMeasure(self):

def testNormalizedLogRatioMeasure(self):
"""Test normalized_log_ratio_measure()"""
obtained = direct_confirmation_measure.normalized_log_ratio_measure(self.segmentation, self.posting_list, self.num_docs)[0]
obtained = direct_confirmation_measure.log_ratio_measure(self.segmentation, self.posting_list, self.num_docs, normalize=True)[0]
# Answer should be ~ -0.182321557 / -ln(1 / 5) = -0.113282753
expected = -0.113282753
self.assertAlmostEqual(obtained, expected)
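The expected value follows directly from the NPMI definition; a quick check of the arithmetic in the comment (EPSILON omitted as negligible):

import numpy as np
m_lr = -0.182321557                 # PMI value from testLogRatioMeasure
co_doc_prob = 1.0 / 5               # P(W', W*): the pair co-occurs in 1 of 5 documents
print(m_lr / -np.log(co_doc_prob))  # -0.11328275...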
58 changes: 21 additions & 37 deletions gensim/topic_coherence/direct_confirmation_measure.py
@@ -42,12 +42,18 @@ def log_conditional_probability(segmented_topics, per_topic_postings, num_docs):

return m_lc

def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
def log_ratio_measure(segmented_topics, per_topic_postings, num_docs, normalize=False):
"""
Popularly known as PMI.
This function calculates the log-ratio-measure which is used by
coherence measures such as c_v.
This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]
If normalize=False:
Popularly known as PMI.
This function calculates the log-ratio-measure which is used by
coherence measures such as c_v.
This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]
If normalize=True:
This function calculates the normalized-log-ratio-measure, popularly known as
NPMI, which is used by coherence measures such as c_v.
This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]
Args:
----
@@ -65,38 +71,16 @@ def log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
w_prime_docs = per_topic_postings[w_prime]
w_star_docs = per_topic_postings[w_star]
co_docs = w_prime_docs.intersection(w_star_docs)
numerator = (len(co_docs) / float(num_docs)) + EPSILON
denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs))
m_lr_i = np.log(numerator / denominator)
if normalize:
# For normalized log ratio measure
numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0]
co_doc_prob = len(co_docs) / float(num_docs)
m_lr_i = numerator / (-np.log(co_doc_prob + EPSILON))
else:
# For log ratio measure without normalization
numerator = (len(co_docs) / float(num_docs)) + EPSILON
denominator = (len(w_prime_docs) / float(num_docs)) * (len(w_star_docs) / float(num_docs))
m_lr_i = np.log(numerator / denominator)
m_lr.append(m_lr_i)

return m_lr

def normalized_log_ratio_measure(segmented_topics, per_topic_postings, num_docs):
"""
This function calculates the normalized-log-ratio-measure, popularly knowns as
NPMI which is used by coherence measures such as c_v.
This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]
Args:
----
segmented topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples.
per_topic_postings : Output from the probability_estimation module. Is a dictionary of the posting list of all topics
num_docs : Total number of documents in corpus. Used for calculating probability.
Returns:
-------
m_nlr : List of log ratio measures on each set in segmented topics.
"""
m_nlr = []
for s_i in segmented_topics:
for w_prime, w_star in s_i:
numerator = log_ratio_measure([[(w_prime, w_star)]], per_topic_postings, num_docs)[0]
w_prime_docs = per_topic_postings[w_prime]
w_star_docs = per_topic_postings[w_star]
co_docs = w_prime_docs.intersection(w_star_docs)
co_doc_prob = len(co_docs) / float(num_docs)
m_nlr_i = numerator / (-np.log(co_doc_prob + EPSILON))
m_nlr.append(m_nlr_i)

return m_nlr
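A toy run of the merged function (not part of this commit), with made-up posting sets, showing how the normalize flag selects PMI vs. NPMI:

from gensim.topic_coherence import direct_confirmation_measure

segmented = [[(1, 2)]]                      # one segment: word id 1 confirmed by word id 2
postings = {1: {0, 1, 2}, 2: {1, 2, 3, 4}}  # word id -> set of ids of documents containing it
num_docs = 5

pmi = direct_confirmation_measure.log_ratio_measure(segmented, postings, num_docs)
# pmi[0] ~ log((2/5) / ((3/5) * (4/5))) ~ -0.182
npmi = direct_confirmation_measure.log_ratio_measure(segmented, postings, num_docs, normalize=True)
# npmi[0] ~ pmi[0] / -log(2/5) ~ -0.199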
21 changes: 13 additions & 8 deletions gensim/topic_coherence/indirect_confirmation_measure.py
@@ -8,12 +8,12 @@
This module contains functions to compute confirmation on a pair of words or word subsets.
The formula used to compute indirect confirmation measure is:
m_sim(m, gamma)(W', W*) = s_sim(V_m,gamma(W'), V_m,gamma(W*))
m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))
where s_sim can be cosine, dice or jaccard similarity and
V_m,gamma(W') = {sigma(w_i belonging to W') m(w_i, w_j) ^ gamma} where j = 1, ..., |W|
\vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
Here 'm' is the direct confirmation measure used.
"""
@@ -52,15 +52,15 @@ def _make_seg(w_prime, w, per_topic_postings, measure, gamma, backtrack, num_doc
for w_j in w:
for w_i in w_prime:
if (w_i, w_j) not in backtrack:
backtrack[(w_i, w_j)] = measure([[(w_i, w_j)]], per_topic_postings, num_docs)[0]
backtrack[(w_i, w_j)] = measure[0]([[(w_i, w_j)]], per_topic_postings, num_docs, measure[1])[0]
if w_j not in context_vectors:
context_vectors[w_j] = backtrack[(w_i, w_j)] ** gamma
else:
context_vectors[w_j] += backtrack[(w_i, w_j)] ** gamma
else:
for w_j in w:
if (w_prime, w_j) not in backtrack:
backtrack[(w_prime, w_j)] = measure([[(w_prime, w_j)]], per_topic_postings, num_docs)[0]
backtrack[(w_prime, w_j)] = measure[0]([[(w_prime, w_j)]], per_topic_postings, num_docs, measure[1])[0]
context_vectors[w_j] = backtrack[(w_prime, w_j)] ** gamma
return (context_vectors, backtrack)

Expand All @@ -70,7 +70,11 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam
u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*). The indirect
cosine measure is computed as the cosine similarity between u and w. The formula used is:
m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))
where each vector \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}
Args:
----
@@ -86,7 +90,8 @@ def cosine_similarity(topics, segmented_topics, per_topic_postings, measure, gam
s_cos_sim : array of cosine similarity of the context vectors for each segmentation
"""
if measure == 'nlr':
measure = direct_confirmation_measure.normalized_log_ratio_measure
# make normalized log ratio measure tuple
measure = (direct_confirmation_measure.log_ratio_measure, True)
else:
raise ValueError("The direct confirmation measure you entered is not currently supported.")
backtrack = {} # Backtracking dictionary for storing measure values of topic id tuples eg. (1, 2).
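To make the indirect measure concrete, a toy sketch (not part of this commit) of the context vectors that _make_seg accumulates and the cosine step, for a made-up direct measure m over a two-word vocabulary:

import numpy as np

m = {(0, 0): 1.0, (0, 1): 0.2, (1, 0): 0.2, (1, 1): 1.0}  # made-up m(w_i, w_j) values
gamma = 1

def context_vector(word_set, vocab):
    # v_j = sum over w_i in word_set of m(w_i, w_j) ** gamma, for each w_j in vocab
    return np.array([sum(m[(w_i, w_j)] ** gamma for w_i in word_set) for w_j in vocab])

u = context_vector({0}, [0, 1])  # V(W') for W' = {0}
w = context_vector({1}, [0, 1])  # V(W*) for W* = {1}
print(u.dot(w) / (np.linalg.norm(u) * np.linalg.norm(w)))  # cosine similarity s_cos_sim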
