From 4f2474d52f2b1087e864a4c5bd7bfe559f2199de Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Tue, 27 Dec 2016 13:35:49 +0100 Subject: [PATCH] [WIP] Print methods in HDP (#1055) * Added print methods, lda_model * Added HDP tests * Changelog * Removed duplicate code * Removed duplicate code * Added import * Fixed Changelog --- CHANGELOG.md | 1 + gensim/models/hdpmodel.py | 61 ++++++++++++++++++++++++++++++++++-- gensim/models/ldamodel.py | 15 +-------- gensim/test/test_hdpmodel.py | 24 ++++++++++++-- gensim/utils.py | 16 ++++++++++ 5 files changed, 97 insertions(+), 20 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 06077e77ce..5c6d05f4ae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Unreleased: 0.13.4, 2016-12-22 +* Added suggested lda model method and print methods to HDP class (@bhargavvader, [#1055](https://github.com/RaRe-Technologies/gensim/pull/1055)) * New class KeyedVectors to store embedding separate from training code (@anmol01gulati and @droudy, [#980](https://github.com/RaRe-Technologies/gensim/pull/980)) * Evaluation of word2vec models against semantic similarity datasets like SimLex-999 (@akutuzov, [#1047](https://github.com/RaRe-Technologies/gensim/pull/1047)) * TensorBoard word embedding visualisation of Gensim Word2vec format (@loretoparisi, [#1051](https://github.com/RaRe-Technologies/gensim/pull/1051)) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 940295a6c6..71b37437ca 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -38,7 +38,7 @@ import scipy.special as sp from gensim import interfaces, utils, matutils -from gensim.models import basemodel +from gensim.models import basemodel, ldamodel from six.moves import xrange logger = logging.getLogger(__name__) @@ -56,6 +56,7 @@ def dirichlet_expectation(alpha): return(sp.psi(alpha) - sp.psi(np.sum(alpha, 1))[:, np.newaxis]) + def expect_log_sticks(sticks): """ For stick-breaking hdp, return the E[log(sticks)] @@ -130,7 
+131,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): def __init__(self, corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, - outputdir=None): + outputdir=None, random_state=None): """ `gamma`: first level concentration `alpha`: second level concentration @@ -151,6 +152,8 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, self.max_time = max_time self.outputdir = outputdir + self.random_state = utils.get_random_state(random_state) + self.lda_alpha = None self.lda_beta = None @@ -169,7 +172,7 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, self.m_var_sticks[1] = range(T - 1, 0, -1) self.m_varphi_ss = np.zeros(T) - self.m_lambda = np.random.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta + self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta self.m_eta = eta self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda) @@ -442,6 +445,21 @@ def update_expectations(self): self.m_timestamp[:] = self.m_updatect self.m_status_up_to_date = True + def show_topic(self, topic_id, num_words=20, log=False, formatted=False): + """ + Return the `num_words` most probable words for the topic `topic_id`. + Unlike `show_topics`, this describes a single topic only. + + Set `formatted=True` to return the topic in its formatted form, or + `False` to return a list of (word, weight) pairs. + + """ + if not self.m_status_up_to_date: + self.update_expectations() + betas = self.m_lambda + self.m_eta + hdp_formatter = HdpTopicFormatter(self.id2word, betas) + return hdp_formatter.show_topic(topic_id, num_words, log, formatted) + def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): """ Print the `num_words` most probable words for `topics` number of topics. 
@@ -510,6 +528,17 @@ def hdp_to_lda(self): return (alpha, beta) + def suggested_lda_model(self): + """ + Returns the closest ldamodel object corresponding to the current hdp model. + The hdp_to_lda method only returns the corresponding alpha, beta values, and this method returns an ldamodel built from them. + The num_topics is m_T (default is 150) so as to preserve the matrix shapes when we assign alpha and beta. + """ + alpha, beta = self.hdp_to_lda() + ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state) + ldam.expElogbeta[:] = beta + return ldam + def evaluate_test_corpus(self, corpus): logger.info('TEST: evaluating test corpus') if self.lda_alpha is None or self.lda_beta is None: @@ -589,6 +618,32 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): return shown + def print_topic(self, topic_id, num_words): + return self.show_topic(topic_id, num_words, formatted=True) + + def show_topic(self, topic_id, num_words, log=False, formatted=False): + + lambdak = list(self.data[topic_id, :]) + lambdak = lambdak / sum(lambdak) + + temp = zip(lambdak, xrange(len(lambdak))) + temp = sorted(temp, key=lambda x: x[0], reverse=True) + + topic_terms = self.show_topic_terms(temp, num_words) + + if formatted: + topic = self.format_topic(topic_id, topic_terms) + + # assuming we only output formatted topics + if log: + logger.info(topic) + else: + topic = (topic_id, topic_terms) + + # we only return the topic_terms + return topic[1] + + def show_topic_terms(self, topic_data, num_words): return [(self.dictionary[wid], weight) for (weight, wid) in topic_data[:num_words]] diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index c2324904e3..4bce54f4fc 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -92,19 +92,6 @@ def update_dir_prior(prior, N, logphat, rho): return prior -def get_random_state(seed): - """ Turn seed into a np.random.RandomState instance. 
- - Method originally from maciejkula/glove-python, and written by @joshloyal - """ - if seed is None or seed is np.random: - return np.random.mtrand._rand - if isinstance(seed, (numbers.Integral, np.integer)): - return np.random.RandomState(seed) - if isinstance(seed, np.random.RandomState): - return seed - raise ValueError('%r cannot be used to seed a np.random.RandomState' - ' instance' % seed) class LdaState(utils.SaveLoad): """ @@ -314,7 +301,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta') - self.random_state = get_random_state(random_state) + self.random_state = utils.get_random_state(random_state) assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), ( "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py index 441a5dbcf4..e2b543687c 100644 --- a/gensim/test/test_hdpmodel.py +++ b/gensim/test/test_hdpmodel.py @@ -23,6 +23,7 @@ from gensim import matutils from gensim.test import basetests +import numpy as np module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder datapath = lambda fname: os.path.join(module_path, 'test_data', fname) @@ -51,12 +52,29 @@ class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = hdpmodel.HdpModel - self.model = self.class_(corpus, id2word=dictionary) + self.model = self.class_(corpus, id2word=dictionary, random_state=np.random.seed(0)) - def testShowTopic(self): - # TODO create show_topic in HdpModel and then test + def testTopicValues(self): + """ + Check show topics method + """ + results = self.model.show_topics()[0] + expected_prob, expected_word = '0.264', 'trees ' + prob, word = results[1].split('+')[0].split('*') + self.assertEqual(results[0], 0) + 
self.assertEqual(prob, expected_prob) + self.assertEqual(word, expected_word) + return + def testLDAmodel(self): + """ + Create ldamodel object, and check if the corresponding alphas are equal. + """ + ldam = self.model.suggested_lda_model() + self.assertEqual(ldam.alpha[0], self.model.lda_alpha[0]) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/utils.py b/gensim/utils.py index 695f72be2f..0bdccc46f7 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -37,6 +37,7 @@ import subprocess import numpy +import numbers import scipy.sparse if sys.version_info[0] >= 3: @@ -80,6 +81,21 @@ def smart_open(fname, mode='rb'): RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) +def get_random_state(seed): + """ Turn seed into a np.random.RandomState instance. + + Method originally from maciejkula/glove-python, and written by @joshloyal + """ + if seed is None or seed is numpy.random: + return numpy.random.mtrand._rand + if isinstance(seed, (numbers.Integral, numpy.integer)): + return numpy.random.RandomState(seed) + if isinstance(seed, numpy.random.RandomState): + return seed + raise ValueError('%r cannot be used to seed a numpy.random.RandomState' + ' instance' % seed) + + def synchronous(tlockname): """ A decorator to place an instance-based lock around a method.