From 4f2474d52f2b1087e864a4c5bd7bfe559f2199de Mon Sep 17 00:00:00 2001
From: Bhargav Srinivasa <bhargavvader@gmail.com>
Date: Tue, 27 Dec 2016 13:35:49 +0100
Subject: [PATCH] [WIP] Print methods in HDP (#1055)

* Added print methods, lda_model

* Added HDP tests

* Changelog

* Removed duplicate code

* Removed duplicate code

* Added import

* Fixed Changelog
---
 CHANGELOG.md                 |  1 +
 gensim/models/hdpmodel.py    | 61 ++++++++++++++++++++++++++++++++++--
 gensim/models/ldamodel.py    | 15 +--------
 gensim/test/test_hdpmodel.py | 24 ++++++++++++--
 gensim/utils.py              | 16 ++++++++++
 5 files changed, 97 insertions(+), 20 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 06077e77ce..5c6d05f4ae 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@ Unreleased:
 
 0.13.4, 2016-12-22
 
+* Added `suggested_lda_model` method and topic print methods (`show_topic`, `print_topic`) to the HDP model (@bhargavvader, [#1055](https://github.com/RaRe-Technologies/gensim/pull/1055))
 * New class KeyedVectors to store embedding separate from training code (@anmol01gulati and @droudy, [#980](https://github.com/RaRe-Technologies/gensim/pull/980))
 * Evaluation of word2vec models against semantic similarity datasets like SimLex-999 (@akutuzov, [#1047](https://github.com/RaRe-Technologies/gensim/pull/1047))
 * TensorBoard word embedding visualisation of Gensim Word2vec format (@loretoparisi, [#1051](https://github.com/RaRe-Technologies/gensim/pull/1051))
diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
index 940295a6c6..71b37437ca 100755
--- a/gensim/models/hdpmodel.py
+++ b/gensim/models/hdpmodel.py
@@ -38,7 +38,7 @@
 import scipy.special as sp
 
 from gensim import interfaces, utils, matutils
-from gensim.models import basemodel
+from gensim.models import basemodel, ldamodel
 from six.moves import xrange
 
 logger = logging.getLogger(__name__)
@@ -56,6 +56,7 @@ def dirichlet_expectation(alpha):
     return(sp.psi(alpha) - sp.psi(np.sum(alpha, 1))[:, np.newaxis])
 
 
+
 def expect_log_sticks(sticks):
     """
     For stick-breaking hdp, return the E[log(sticks)]
@@ -130,7 +131,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
     def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
                  chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
                  gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
-                 outputdir=None):
+                 outputdir=None, random_state=None):
         """
         `gamma`: first level concentration
         `alpha`: second level concentration
@@ -151,6 +152,8 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
         self.max_time = max_time
         self.outputdir = outputdir
 
+        self.random_state = utils.get_random_state(random_state)
+
         self.lda_alpha = None
         self.lda_beta = None
 
@@ -169,7 +172,7 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
         self.m_var_sticks[1] = range(T - 1, 0, -1)
         self.m_varphi_ss = np.zeros(T)
 
-        self.m_lambda = np.random.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
+        self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
         self.m_eta = eta
         self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)
 
@@ -442,6 +445,21 @@ def update_expectations(self):
         self.m_timestamp[:] = self.m_updatect
         self.m_status_up_to_date = True
 
+    def show_topic(self, topic_id, num_words=20, log=False, formatted=False):
+        """
+        Return the `num_words` most probable words for the topic `topic_id`.
+        Set `log=True` to also log the topic; it is only logged when `formatted=True`.
+
+        Set `formatted=True` to return the topic in its formatted form, or
+        `False` to return it as a list of (word, weight) pairs.
+
+        """
+        if not self.m_status_up_to_date:
+            self.update_expectations()
+        betas = self.m_lambda + self.m_eta
+        hdp_formatter = HdpTopicFormatter(self.id2word, betas)
+        return hdp_formatter.show_topic(topic_id, num_words, log, formatted)
+        
     def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True):
         """
         Print the `num_words` most probable words for `topics` number of topics.
@@ -510,6 +528,17 @@ def hdp_to_lda(self):
 
         return (alpha, beta)
 
+    def suggested_lda_model(self):
+        """
+        Returns the closest corresponding ldamodel object for the current hdp model.
+        The hdp_to_lda method only returns the corresponding alpha and beta values, whereas this method returns an ldamodel object built from them.
+        The num_topics is m_T (default is 150) so as to preserve the matrix shapes when we assign alpha and beta.
+        """
+        alpha, beta = self.hdp_to_lda()
+        ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
+        ldam.expElogbeta[:] = beta
+        return ldam
+
     def evaluate_test_corpus(self, corpus):
         logger.info('TEST: evaluating test corpus')
         if self.lda_alpha is None or self.lda_beta is None:
@@ -589,6 +618,32 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):
 
         return shown
 
+    def print_topic(self, topic_id, num_words):
+        return self.show_topic(topic_id, num_words, formatted=True)
+
+    def show_topic(self, topic_id, num_words, log=False, formatted=False):
+
+        lambdak = list(self.data[topic_id, :])
+        lambdak = lambdak / sum(lambdak)
+
+        temp = zip(lambdak, xrange(len(lambdak)))
+        temp = sorted(temp, key=lambda x: x[0], reverse=True)
+
+        topic_terms = self.show_topic_terms(temp, num_words)
+
+        if formatted:
+            topic = self.format_topic(topic_id, topic_terms)
+
+            # assuming we only output formatted topics
+            if log:
+                logger.info(topic)
+        else:
+            topic = (topic_id, topic_terms)
+        
+        # we only return the topic_terms
+        return topic[1]
+
+
     def show_topic_terms(self, topic_data, num_words):
         return [(self.dictionary[wid], weight) for (weight, wid) in topic_data[:num_words]]
 
diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index c2324904e3..4bce54f4fc 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -92,19 +92,6 @@ def update_dir_prior(prior, N, logphat, rho):
 
     return prior
 
-def get_random_state(seed):
-     """ Turn seed into a np.random.RandomState instance.
-
-         Method originally from maciejkula/glove-python, and written by @joshloyal
-     """
-     if seed is None or seed is np.random:
-         return np.random.mtrand._rand
-     if isinstance(seed, (numbers.Integral, np.integer)):
-         return np.random.RandomState(seed)
-     if isinstance(seed, np.random.RandomState):
-        return seed
-     raise ValueError('%r cannot be used to seed a np.random.RandomState'
-                      ' instance' % seed)
 
 class LdaState(utils.SaveLoad):
     """
@@ -314,7 +301,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
 
         self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')
 
-        self.random_state = get_random_state(random_state)
+        self.random_state = utils.get_random_state(random_state)
 
         assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
                 "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py
index 441a5dbcf4..e2b543687c 100644
--- a/gensim/test/test_hdpmodel.py
+++ b/gensim/test/test_hdpmodel.py
@@ -23,6 +23,7 @@
 from gensim import matutils
 from gensim.test import basetests
 
+import numpy as np
 
 module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
 datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
@@ -51,12 +52,29 @@ class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel):
     def setUp(self):
         self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
         self.class_ = hdpmodel.HdpModel
-        self.model = self.class_(corpus, id2word=dictionary)
+        self.model = self.class_(corpus, id2word=dictionary, random_state=0)  # integer seed: np.random.seed(0) returns None
 
-    def testShowTopic(self):
-        # TODO create show_topic in HdpModel and then test
+    def testTopicValues(self):
+        """
+        Check show topics method
+        """
+        results = self.model.show_topics()[0]
+        expected_prob, expected_word = '0.264', 'trees '
+        prob, word = results[1].split('+')[0].split('*')
+        self.assertEqual(results[0], 0)
+        self.assertEqual(prob, expected_prob)
+        self.assertEqual(word, expected_word)        
+ 
         return
 
+    def testLDAmodel(self):
+        """
+        Create ldamodel object, and check if the corresponding alphas are equal.
+        """
+        ldam = self.model.suggested_lda_model()
+        self.assertEqual(ldam.alpha[0], self.model.lda_alpha[0])
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()
diff --git a/gensim/utils.py b/gensim/utils.py
index 695f72be2f..0bdccc46f7 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -37,6 +37,7 @@
 import subprocess
 
 import numpy
+import numbers
 import scipy.sparse
 
 if sys.version_info[0] >= 3:
@@ -80,6 +81,21 @@ def smart_open(fname, mode='rb'):
 RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)
 
 
+def get_random_state(seed):
+     """ Turn seed into a np.random.RandomState instance.
+
+         Method originally from maciejkula/glove-python, and written by @joshloyal
+     """
+     if seed is None or seed is numpy.random:
+         return numpy.random.mtrand._rand
+     if isinstance(seed, (numbers.Integral, numpy.integer)):
+         return numpy.random.RandomState(seed)
+     if isinstance(seed, numpy.random.RandomState):
+        return seed
+     raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
+                      ' instance' % seed)
+
+
 def synchronous(tlockname):
     """
     A decorator to place an instance-based lock around a method.