From 023c1415338e25a1ca8a74379b507e5a258e2081 Mon Sep 17 00:00:00 2001 From: prerna135 Date: Fri, 23 Jun 2017 02:10:47 +0530 Subject: [PATCH 01/10] Fixes a part of #1192 Fixes warnings in the .py files --- gensim/models/coherencemodel.py | 14 ++++---- gensim/models/keyedvectors.py | 4 +-- gensim/models/wrappers/ldamallet.py | 8 ++--- gensim/models/wrappers/ldavowpalwabbit.py | 8 ++--- gensim/summarization/summarizer.py | 4 +-- gensim/topic_coherence/aggregation.py | 8 ++--- .../direct_confirmation_measure.py | 16 ++++----- .../indirect_confirmation_measure.py | 19 +++++------ .../topic_coherence/probability_estimation.py | 23 ++++++------- gensim/topic_coherence/segmentation.py | 33 +++++++++---------- gensim/utils.py | 21 +++++++----- 11 files changed, 79 insertions(+), 79 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index d35a266a4a..cfee929f69 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -14,8 +14,8 @@ Implementation of this pipeline allows for the user to in essence "make" a coherence measure of his/her choice by choosing a method in each of the pipelines. -.. [1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic -coherence measures. http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf. +.. [1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic coherence measures. + `http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf.` """ import logging @@ -113,15 +113,17 @@ class CoherenceModel(interfaces.TransformationABC): def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10, processes=-1): """ - Args: - ---- + Args + ==== model : Pre-trained topic model. Should be provided if topics is not provided. Currently supports LdaModel, LdaMallet wrapper and LdaVowpalWabbit wrapper. Use 'topics' parameter to plug in an as yet unsupported model. - topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. eg:: + topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. + eg:: topics = [['human', 'machine', 'computer', 'interface'], ['graph', 'trees', 'binary', 'widths']] - texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, eg:: + texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, + eg:: texts = [['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index d0f768b5f6..126a09431d 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -730,7 +730,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case dummy4unknown=False): """ Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where - lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter'. + lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`. An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html. 
@@ -748,7 +748,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first occurrence (also the most frequent if vocabulary is sorted) is taken. - Use `dummy4unknown=True' to produce zero-valued similarities for pairs with out-of-vocabulary words. + Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words. Otherwise (default False), these pairs are skipped entirely. """ ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 640cf11dd8..5e7cd789ef 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -358,14 +358,14 @@ def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50): training model weights (alpha, beta...) from a trained mallet model into the gensim model. - Args: - ---- + Args + ==== mallet_model : Trained mallet model gamma_threshold : To be used for inference in the new LdaModel. iterations : number of iterations to be used for inference in the new LdaModel. - Returns: - ------- + Returns + ======= model_gensim : LdaModel instance; copied gensim LdaModel """ model_gensim = LdaModel( diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py index 7ae040293c..216942a8d5 100644 --- a/gensim/models/wrappers/ldavowpalwabbit.py +++ b/gensim/models/wrappers/ldavowpalwabbit.py @@ -567,13 +567,13 @@ def vwmodel2ldamodel(vw_model, iterations=50): simply copying the training model weights (alpha, beta...) from a trained vwmodel into the gensim model. - Args: - ---- + Args + ==== vw_model : Trained vowpal wabbit model. iterations : Number of iterations to be used for inference of the new LdaModel. - Returns: - ------- + Returns + ======= model_gensim : LdaModel instance; copied gensim LdaModel. """ model_gensim = LdaModel( diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index e749b4cc66..4c37006dbe 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -184,10 +184,10 @@ def summarize(text, ratio=0.2, word_count=None, split=False): The length of the output can be specified using the ratio and word_count parameters: - ratio should be a number between 0 and 1 that determines the + ratio should be a number between 0 and 1 that determines the percentage of the number of sentences of the original text to be chosen for the summary (defaults at 0.2). - word_count determines how many words will the output contain. + word_count determines how many words will the output contain. If both parameters are provided, the ratio will be ignored. """ # Gets a list of processed sentences. diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index 7c345d8812..109485dbd1 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -19,12 +19,12 @@ def arithmetic_mean(confirmed_measures): This functoin performs the arithmetic mean aggregation on the output obtained from the confirmation measure module. - Args: - ---- + Args + ==== confirmed_measures : list of calculated confirmation measure on each set in the segmented topics. - Returns: - ------- + Returns + ======= mean : Arithmetic mean of all the values contained in confirmation measures. 
""" return np.mean(confirmed_measures) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 467d134f29..fddfa1a843 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -23,14 +23,14 @@ def log_conditional_probability(segmented_topics, accumulator): which is used by coherence measures such as U_mass. This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)] - Args: - ---- + Args + ==== segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. accumulator: word occurrence accumulator from probability_estimation. - Returns: - ------- + Returns + ======= m_lc : List of log conditional probability measure for each topic. """ m_lc = [] @@ -64,14 +64,14 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False): NPMI which is used by coherence measures such as c_v. This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e] - Args: - ---- + Args + ==== segmented topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. accumulator: word occurrence accumulator from probability_estimation. - Returns: - ------- + Returns + ======= m_lr : List of log ratio measures for each topic. """ m_lr = [] diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 07f221e941..0c3c2ae8e7 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -41,25 +41,22 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm _ _ _ _ u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*) indirect _ _ - cosine measure is computed as the cosine similarity between u and w. The formula used is: + cosine measure is computed as the cosine similarity between u and w. The formula used is:: m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) where each vector \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} - Args: - ---- - segmented_topics : Output from the segmentation module of the segmented topics. - Is a list of list of tuples. - accumulator : Output from the probability_estimation module. - Is an accumulator of word occurrences (see text_analysis module). + Args + ==== + segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). topics : Topics obtained from the trained topic model. - measure : String. Direct confirmation measure to be used. - Supported values are "nlr" (normalized log ratio). + measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). gamma : Gamma value for computing W', W* vectors; default is 1. - Returns: - ------- + Returns + ======= s_cos_sim : list of indirect cosine similarity measure for each topic. 
""" context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 85e787de18..4d9ba740b4 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -22,13 +22,13 @@ def p_boolean_document(corpus, segmented_topics): Boolean document estimates the probability of a single word as the number of documents in which the word occurs divided by the total number of documents. - Args: - ---- + Args + ==== corpus : The corpus of documents. segmented_topics : Output from the segmentation of topics. Could be simply topics too. - Returns: - ------- + Returns + ======= accumulator : word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. """ @@ -43,15 +43,15 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p document by copying the window content. Boolean document is applied to these virtual documents to compute word probabilities. - Args: - ---- + Args + ==== texts : List of string sentences. segmented_topics : Output from the segmentation of topics. Could be simply topics too. dictionary : Gensim dictionary mapping of the tokens and ids. window_size : Size of the sliding window. 110 found out to be the ideal size for large corpora. - Returns: - ------- + Returns + ======= accumulator : word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. """ @@ -67,11 +67,12 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p def unique_ids_from_segments(segmented_topics): """Return the set of all unique ids in a list of segmented topics. - Args: - ---- + Args + ==== segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set is either a single integer, or a `numpy.ndarray` of integers. - Returns: + Returns + ======= unique_ids : set of unique ids across all topic segments. """ unique_ids = set() # is a set of all the unique ids contained in topics. diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 9a2a58b060..2b400d1459 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -16,20 +16,19 @@ def s_one_pre(topics): """ This function performs s_one_pre segmentation on a list of topics. - s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; - W* = {w_j}; w_i, w_j belongs to W; i > j} + s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i > j} Example: >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] >>> s_one_pre(topics) [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] - Args: - ---- + Args + ==== topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] - Returns: - ------- + Returns + ======= s_one_pre : list of list of (W', W*) tuples for all unique topic ids """ s_one_pre = [] @@ -46,20 +45,19 @@ def s_one_pre(topics): def s_one_one(topics): """ This function performs s_one_one segmentation on a list of topics. 
- s_one_one segmentation is defined as: s_one_one = {(W', W*) | W' = {w_i}; - W* = {w_j}; w_i, w_j belongs to W; i != j} + s_one_one segmentation is defined as: s_one_one = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i != j} Example: >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] >>> s_one_pre(topics) [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] - Args: - ---- + Args + ==== topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] - Returns: - ------- + Returns + ======= s_one_one : list of list of (W', W*) tuples for all unique topic ids """ s_one_one = [] @@ -79,8 +77,7 @@ def s_one_one(topics): def s_one_set(topics): """ This function performs s_one_set segmentation on a list of topics. - s_one_set segmentation is defined as: s_one_set = {(W', W*) | W' = {w_i}; w_i belongs to W; - W* = W} + s_one_set segmentation is defined as: s_one_set = {(W', W*) | W' = {w_i}; w_i belongs to W; W* = W} Example: >>> topics = [np.array([9, 10, 7]) >>> s_one_set(topics) @@ -88,12 +85,12 @@ def s_one_set(topics): (10, array([ 9, 10, 7])), (7, array([ 9, 10, 7]))]] - Args: - ---- + Args + ==== topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] - Returns: - ------- + Returns + ======= s_one_set : list of list of (W', W*) tuples for all unique topic ids. """ s_one_set = [] diff --git a/gensim/utils.py b/gensim/utils.py index dd391f887b..3506ee8436 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1208,14 +1208,17 @@ def strided_windows(ndarray, window_size): [4, 5, 6, 7, 8], [5, 6, 7, 8, 9]]) - Args: - ---- + Args + ==== ndarray: either a numpy.ndarray or something that can be converted into one. window_size: sliding window size. - :param window_size: - :return: numpy.ndarray of the subsequences produced by sliding a window of the given size over - the `ndarray`. Since this uses striding, the individual arrays are views rather than - copies of `ndarray`. Changes to one view modifies the others and the original. + param window_size: + + Returns + ======= + numpy.ndarray of the subsequences produced by sliding a window of the given size over + the `ndarray`. Since this uses striding, the individual arrays are views rather than + copies of `ndarray`. Changes to one view modifies the others and the original. """ ndarray = np.asarray(ndarray) if window_size == ndarray.shape[0]: @@ -1234,13 +1237,13 @@ def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include The windows produced are views of some subsequence of a text. To use deep copies instead, pass `copy=True`. - Args: - ---- + Args + ==== texts: List of string sentences. window_size: Size of sliding window. copy: False to use views of the texts (default) or True to produce deep copies. ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior). - If False, the documents below `window_size` will be yielded as the full document. + If False, the documents below `window_size` will be yielded as the full document. 
""" for doc_num, document in enumerate(texts): From ad33484ba1015e07cd700a54858e2301dc9d6a79 Mon Sep 17 00:00:00 2001 From: prerna135 Date: Fri, 23 Jun 2017 14:33:46 +0530 Subject: [PATCH 02/10] Removing additional whitespaces --- gensim/models/coherencemodel.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index cfee929f69..081c619ae7 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -15,7 +15,7 @@ coherence measure of his/her choice by choosing a method in each of the pipelines. .. [1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic coherence measures. - `http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf.` +`http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf.` """ import logging @@ -119,12 +119,12 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= Currently supports LdaModel, LdaMallet wrapper and LdaVowpalWabbit wrapper. Use 'topics' parameter to plug in an as yet unsupported model. topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. - eg:: - topics = [['human', 'machine', 'computer', 'interface'], + eg : + topics = [['human', 'machine', 'computer', 'interface'], ['graph', 'trees', 'binary', 'widths']] texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, - eg:: - texts = [['system', 'human', 'system', 'eps'], + eg : + texts = [['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], From 4a5143a0f4f12a2c2f52c12e405e33c93f4cada0 Mon Sep 17 00:00:00 2001 From: prerna135 Date: Fri, 23 Jun 2017 14:36:04 +0530 Subject: [PATCH 03/10] Removing additional whitespaces from utils.py --- gensim/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/utils.py b/gensim/utils.py index 3506ee8436..a1eaf3662f 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1212,7 +1212,6 @@ def strided_windows(ndarray, window_size): ==== ndarray: either a numpy.ndarray or something that can be converted into one. window_size: sliding window size. - param window_size: Returns ======= From 9c31c0161a1788aa28b4cf0e793effa7716be2c6 Mon Sep 17 00:00:00 2001 From: prerna135 Date: Fri, 23 Jun 2017 15:21:45 +0530 Subject: [PATCH 04/10] Removing trailing/leading whitespaces from --- gensim/models/coherencemodel.py | 6 +++--- gensim/utils.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 081c619ae7..267bccae46 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -14,7 +14,7 @@ Implementation of this pipeline allows for the user to in essence "make" a coherence measure of his/her choice by choosing a method in each of the pipelines. -.. [1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic coherence measures. +.. [1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic coherence measures. `http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf.` """ @@ -118,11 +118,11 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= model : Pre-trained topic model. Should be provided if topics is not provided. Currently supports LdaModel, LdaMallet wrapper and LdaVowpalWabbit wrapper. 
Use 'topics' parameter to plug in an as yet unsupported model. - topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. + topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. eg : topics = [['human', 'machine', 'computer', 'interface'], ['graph', 'trees', 'binary', 'widths']] - texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, + texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, eg : texts = [['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], diff --git a/gensim/utils.py b/gensim/utils.py index a1eaf3662f..8aebfc7da1 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1212,7 +1212,7 @@ def strided_windows(ndarray, window_size): ==== ndarray: either a numpy.ndarray or something that can be converted into one. window_size: sliding window size. - + Returns ======= numpy.ndarray of the subsequences produced by sliding a window of the given size over From 91980d6968983ebb11c87a03f7ff2c7992cf4535 Mon Sep 17 00:00:00 2001 From: prerna135 Date: Mon, 26 Jun 2017 17:06:14 +0530 Subject: [PATCH 05/10] Making changes according to Google Code Style @menshikh-iv Fixing warnings in the .py files according to the Google Code Style. Most of the warnings were due to indentation errors. --- gensim/models/atmodel.py | 4 +- gensim/models/coherencemodel.py | 67 ++++++++++--------- gensim/models/ldamodel.py | 4 +- gensim/models/ldaseqmodel.py | 4 +- gensim/models/word2vec.py | 22 +++--- gensim/models/wrappers/dtmmodel.py | 5 +- gensim/models/wrappers/ldamallet.py | 18 +++-- gensim/models/wrappers/ldavowpalwabbit.py | 18 +++-- gensim/scripts/glove2word2vec.py | 5 +- gensim/similarities/docsim.py | 4 +- gensim/summarization/summarizer.py | 11 ++- gensim/topic_coherence/aggregation.py | 10 ++- .../direct_confirmation_measure.py | 30 ++++----- .../indirect_confirmation_measure.py | 22 +++--- .../topic_coherence/probability_estimation.py | 48 ++++++------- gensim/topic_coherence/segmentation.py | 30 ++++----- gensim/topic_coherence/text_analysis.py | 19 +++--- gensim/utils.py | 31 ++++----- 18 files changed, 161 insertions(+), 191 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 8770284178..335f2af7f0 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -533,10 +533,10 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, corpus (gensim corpus): The corpus with which the author-topic model should be updated. author2doc (dictionary): author to document mapping corresponding to indexes in input - corpus. + corpus. doc2author (dictionary): document to author mapping corresponding to indexes in input - corpus. + corpus. chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np array of not. np can in some settings turn the term IDs diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index 267bccae46..a17fd8c8f5 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -9,13 +9,13 @@ the four stage topic coherence pipeline from the paper [1]_. The four stage pipeline is basically: -Segmentation -> Probability Estimation -> Confirmation Measure -> Aggregation. + Segmentation -> Probability Estimation -> Confirmation Measure -> Aggregation. 
Implementation of this pipeline allows for the user to in essence "make" a coherence measure of his/her choice by choosing a method in each of the pipelines. -.. [1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic coherence measures. -`http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf.` +.. [1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic + coherence measures. http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf. """ import logging @@ -113,43 +113,46 @@ class CoherenceModel(interfaces.TransformationABC): def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary=None, window_size=None, coherence='c_v', topn=10, processes=-1): """ - Args - ==== - model : Pre-trained topic model. Should be provided if topics is not provided. + Args: + model : Pre-trained topic model. Should be provided if topics is not provided. Currently supports LdaModel, LdaMallet wrapper and LdaVowpalWabbit wrapper. Use 'topics' parameter to plug in an as yet unsupported model. - topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. - eg : - topics = [['human', 'machine', 'computer', 'interface'], + topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. + eg:: + + topics = [['human', 'machine', 'computer', 'interface'], ['graph', 'trees', 'binary', 'widths']] - texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, - eg : - texts = [['system', 'human', 'system', 'eps'], + + texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, + eg:: + + texts = [['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']] - corpus : Gensim document corpus. - dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, - this is not needed. If both are provided, dictionary will be used. - window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their - probability estimator. For 'u_mass' this doesn't matter. - If left 'None' the default window sizes are used which are: - 'c_v' : 110 - 'c_uci' : 10 - 'c_npmi' : 10 - coherence : Coherence measure to be used. Supported values are: - 'u_mass' - 'c_v' - 'c_uci' also popularly known as c_pmi - 'c_npmi' - For 'u_mass' corpus should be provided. If texts is provided, it will be converted - to corpus using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. - Corpus is not needed. - topn : Integer corresponding to the number of top words to be extracted from each topic. - processes : number of processes to use for probability estimation phase; any value less than 1 will be - interpreted to mean num_cpus - 1; default is -1. + + corpus : Gensim document corpus. + dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, + this is not needed. If both are provided, dictionary will be used. + window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their + probability estimator. For 'u_mass' this doesn't matter. + If left 'None' the default window sizes are used which are: + 'c_v' : 110 + 'c_uci' : 10 + 'c_npmi' : 10 + coherence : Coherence measure to be used. 
Supported values are: + 'u_mass' + 'c_v' + 'c_uci' also popularly known as c_pmi + 'c_npmi' + For 'u_mass' corpus should be provided. If texts is provided, it will be converted + to corpus using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. + Corpus is not needed. + topn : Integer corresponding to the number of top words to be extracted from each topic. + processes : number of processes to use for probability estimation phase; any value less than 1 will be + interpreted to mean num_cpus - 1; default is -1. """ if model is None and topics is None: raise ValueError("One of model or topics has to be provided.") diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 9f41334d47..0e42760b96 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -981,11 +981,11 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, `n_ann_terms` is max quantity of words in intersection/symmetric difference between topics (used for annotation) Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j and matrix annotation with shape (m1.num_topics, m2.num_topics, 2, None), - where + where: annotation[i][j] = [[`int_1`, `int_2`, ...], [`diff_1`, `diff_2`, ...]] and `int_k` is word from intersection of `topic_i` and `topic_j` and `diff_l` is word from symmetric difference of `topic_i` and `topic_j` - `normed` is a flag. If `true`, matrix Z will be normalized + `normed` is a flag. If `true`, matrix Z will be normalized Example: >>> m1, m2 = LdaMulticore.load(path_1), LdaMulticore.load(path_2) >>> mdiff, annotation = m1.diff(m2) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 8e57489a89..98a045623b 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -157,9 +157,9 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, """ fit an lda sequence model: - for each time period + for each time period: set up lda model with E[log p(w|z)] and \alpha - for each document + for each document: perform posterior inference update sufficient statistics/likelihood diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 85aeefe173..6dbc84c751 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -21,27 +21,25 @@ Initialize a model with e.g.:: ->>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) + >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) Persist a model to disk with:: ->>> model.save(fname) ->>> model = Word2Vec.load(fname) # you can continue training with the loaded model! + >>> model.save(fname) + >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! -The word vectors are stored in a KeyedVectors instance in model.wv. This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec. +The word vectors are stored in a KeyedVectors instance in model.wv. 
This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: >>> model.wv['computer'] # numpy vector of a word array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance:: + NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing:: -NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing. - - - >>> from gensim.models.keyedvectors import KeyedVectors - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format + >>> from gensim.models.keyedvectors import KeyedVectors + >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format + >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format You can perform various NLP word tasks with the model. Some of them @@ -87,8 +85,8 @@ detect phrases longer than one word. Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: ->>> bigram_transformer = gensim.models.Phrases(sentences) ->>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) + >>> bigram_transformer = gensim.models.Phrases(sentences) + >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) .. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. .. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 94a2e5eb1a..5eff091417 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -341,8 +341,9 @@ def dtm_coherence(self, time, num_words=20): """ returns all topics of a particular time-slice without probabilitiy values for it to be used for either "u_mass" or "c_v" coherence. - TODO: because of print format right now can only return for 1st time-slice. - should we fix the coherence printing or make changes to the print statements to mirror DTM python? + TODO: + because of print format right now can only return for 1st time-slice. + should we fix the coherence printing or make changes to the print statements to mirror DTM python? """ coherence_topics = [] for topic_no in range(0, self.num_topics): diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 5e7cd789ef..5276b035f1 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -21,8 +21,8 @@ Example: ->>> model = gensim.models.wrappers.LdaMallet('/Users/kofola/mallet-2.0.7/bin/mallet', corpus=my_corpus, num_topics=20, id2word=dictionary) ->>> print model[my_vector] # print LDA topics of a document + >>> model = gensim.models.wrappers.LdaMallet('/Users/kofola/mallet-2.0.7/bin/mallet', corpus=my_corpus, num_topics=20, id2word=dictionary) + >>> print model[my_vector] # print LDA topics of a document .. 
[1] http://mallet.cs.umass.edu/
@@ -358,15 +358,13 @@ def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
 training model weights (alpha, beta...) from a trained
 mallet model into the gensim model.
- Args
- ====
- mallet_model : Trained mallet model
- gamma_threshold : To be used for inference in the new LdaModel.
- iterations : number of iterations to be used for inference in the new LdaModel.
+ Args:
+ mallet_model : Trained mallet model
+ gamma_threshold : To be used for inference in the new LdaModel.
+ iterations : number of iterations to be used for inference in the new LdaModel.
- Returns
- =======
- model_gensim : LdaModel instance; copied gensim LdaModel
+ Returns:
+ model_gensim : LdaModel instance; copied gensim LdaModel
 """
 model_gensim = LdaModel(
 id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py
index 216942a8d5..6d6ae9e275 100644
--- a/gensim/models/wrappers/ldavowpalwabbit.py
+++ b/gensim/models/wrappers/ldavowpalwabbit.py
@@ -498,9 +498,9 @@ def corpus_to_vw(corpus):
 character. E.g.:
- | 4:7 14:1 22:8 6:3
- | 14:22 22:4 0:1 1:3
- | 7:2 8:2
+ | 4:7 14:1 22:8 6:3
+ | 14:22 22:4 0:1 1:3
+ | 7:2 8:2
 """
 for entries in corpus:
 line = ['|']
@@ -567,14 +567,12 @@ def vwmodel2ldamodel(vw_model, iterations=50):
 simply copying the training model weights (alpha, beta...) from
 a trained vwmodel into the gensim model.
- Args
- ====
- vw_model : Trained vowpal wabbit model.
- iterations : Number of iterations to be used for inference of the new LdaModel.
+ Args:
+ vw_model : Trained vowpal wabbit model.
+ iterations : Number of iterations to be used for inference of the new LdaModel.
- Returns
- =======
- model_gensim : LdaModel instance; copied gensim LdaModel.
+ Returns:
+ model_gensim : LdaModel instance; copied gensim LdaModel.
 """
 model_gensim = LdaModel(
 num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py
index 7709c48714..63bec15078 100644
--- a/gensim/scripts/glove2word2vec.py
+++ b/gensim/scripts/glove2word2vec.py
@@ -6,7 +6,8 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 """
-USAGE: $ python -m gensim.scripts.glove2word2vec --input <GloVe vector file> --output <Word2vec vector file>
+USAGE:
+ $ python -m gensim.scripts.glove2word2vec --input <GloVe vector file> --output <Word2vec vector file>
 Where:
 <GloVe vector file>: Input GloVe .txt file
 <Word2vec vector file>: Desired name of output Word2vec .txt file
@@ -38,7 +39,7 @@ def get_glove_info(glove_file_name):
 def glove2word2vec(glove_input_file, word2vec_output_file):
- """Convert `glove_input_file` in GloVe format into `word2vec_output_file in word2vec format."""
+ """Convert `glove_input_file` in GloVe format into `word2vec_output_file` in word2vec format."""
 num_lines, num_dims = get_glove_info(glove_input_file)
 logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file)
 with smart_open(word2vec_output_file, 'wb') as fout:
diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
index d61c64636e..2a1a9512ea 100755
--- a/gensim/similarities/docsim.py
+++ b/gensim/similarities/docsim.py
@@ -572,7 +572,6 @@ class WmdSimilarity(interfaces.SimilarityABC):
 >>> # Given a document collection "corpus", train word2vec model.
 >>> model = word2vec(corpus)
 >>> instance = WmdSimilarity(corpus, model, num_best=10)
- >>> # Make query.
 >>> query = 'Very good, you should seat outdoor.'
>>> sims = instance[query] @@ -582,8 +581,7 @@ def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=T corpus: List of lists of strings, as in gensim.models.word2vec. w2v_model: A trained word2vec model. num_best: Number of results to retrieve. - normalize_w2v_and_replace: Whether or not to normalize the word2vec vectors to - length 1. + normalize_w2v_and_replace: Whether or not to normalize the word2vec vectors to length 1. """ self.corpus = corpus self.w2v_model = w2v_model diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 4c37006dbe..75724061d8 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -141,7 +141,6 @@ def summarize_corpus(corpus, ratio=0.2): The most important documents are returned as a list sorted by the document score, highest first. - """ % INPUT_MIN_LENGTH hashable_corpus = _build_hasheable_corpus(corpus) @@ -184,11 +183,11 @@ def summarize(text, ratio=0.2, word_count=None, split=False): The length of the output can be specified using the ratio and word_count parameters: - ratio should be a number between 0 and 1 that determines the - percentage of the number of sentences of the original text to be - chosen for the summary (defaults at 0.2). - word_count determines how many words will the output contain. - If both parameters are provided, the ratio will be ignored. + ratio should be a number between 0 and 1 that determines the + percentage of the number of sentences of the original text to be + chosen for the summary (defaults at 0.2). + word_count determines how many words will the output contain. + If both parameters are provided, the ratio will be ignored. """ # Gets a list of processed sentences. sentences = _clean_text_by_sentences(text) diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index 109485dbd1..341834c92f 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -19,12 +19,10 @@ def arithmetic_mean(confirmed_measures): This functoin performs the arithmetic mean aggregation on the output obtained from the confirmation measure module. - Args - ==== - confirmed_measures : list of calculated confirmation measure on each set in the segmented topics. + Args: + confirmed_measures : list of calculated confirmation measure on each set in the segmented topics. - Returns - ======= - mean : Arithmetic mean of all the values contained in confirmation measures. + Returns: + mean : Arithmetic mean of all the values contained in confirmation measures. """ return np.mean(confirmed_measures) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index fddfa1a843..26e86065fb 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -23,15 +23,13 @@ def log_conditional_probability(segmented_topics, accumulator): which is used by coherence measures such as U_mass. This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)] - Args - ==== - segmented_topics : Output from the segmentation module of the segmented topics. - Is a list of list of tuples. - accumulator: word occurrence accumulator from probability_estimation. - - Returns - ======= - m_lc : List of log conditional probability measure for each topic. + Args: + segmented_topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. 
+ accumulator: word occurrence accumulator from probability_estimation. + + Returns: + m_lc : List of log conditional probability measure for each topic. """ m_lc = [] num_docs = float(accumulator.num_docs) @@ -64,15 +62,13 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False): NPMI which is used by coherence measures such as c_v. This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e] - Args - ==== - segmented topics : Output from the segmentation module of the segmented topics. - Is a list of list of tuples. - accumulator: word occurrence accumulator from probability_estimation. + Args: + segmented topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. + accumulator: word occurrence accumulator from probability_estimation. - Returns - ======= - m_lr : List of log ratio measures for each topic. + Returns: + m_lr : List of log ratio measures for each topic. """ m_lr = [] num_docs = float(accumulator.num_docs) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 0c3c2ae8e7..8eed6a1807 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -43,21 +43,19 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm _ _ cosine measure is computed as the cosine similarity between u and w. The formula used is:: - m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) + m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) where each vector \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} - Args - ==== - segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. - accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). - topics : Topics obtained from the trained topic model. - measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). - gamma : Gamma value for computing W', W* vectors; default is 1. - - Returns - ======= - s_cos_sim : list of indirect cosine similarity measure for each topic. + Args: + segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). + topics : Topics obtained from the trained topic model. + measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). + gamma : Gamma value for computing W', W* vectors; default is 1. + + Returns: + s_cos_sim : list of indirect cosine similarity measure for each topic. """ context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 4d9ba740b4..7832494a5c 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -22,15 +22,13 @@ def p_boolean_document(corpus, segmented_topics): Boolean document estimates the probability of a single word as the number of documents in which the word occurs divided by the total number of documents. 
- Args - ==== - corpus : The corpus of documents. - segmented_topics : Output from the segmentation of topics. Could be simply topics too. - - Returns - ======= - accumulator : word occurrence accumulator instance that can be used to lookup token - frequencies and co-occurrence frequencies. + Args: + corpus : The corpus of documents. + segmented_topics : Output from the segmentation of topics. Could be simply topics too. + + Returns: + accumulator : word occurrence accumulator instance that can be used to lookup token + frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus) @@ -43,17 +41,15 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p document by copying the window content. Boolean document is applied to these virtual documents to compute word probabilities. - Args - ==== - texts : List of string sentences. - segmented_topics : Output from the segmentation of topics. Could be simply topics too. - dictionary : Gensim dictionary mapping of the tokens and ids. - window_size : Size of the sliding window. 110 found out to be the ideal size for large corpora. - - Returns - ======= - accumulator : word occurrence accumulator instance that can be used to lookup token - frequencies and co-occurrence frequencies. + Args: + texts : List of string sentences. + segmented_topics : Output from the segmentation of topics. Could be simply topics too. + dictionary : Gensim dictionary mapping of the tokens and ids. + window_size : Size of the sliding window. 110 found out to be the ideal size for large corpora. + + Returns: + accumulator : word occurrence accumulator instance that can be used to lookup token + frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) if processes <= 1: @@ -67,13 +63,11 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p def unique_ids_from_segments(segmented_topics): """Return the set of all unique ids in a list of segmented topics. - Args - ==== - segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set - is either a single integer, or a `numpy.ndarray` of integers. - Returns - ======= - unique_ids : set of unique ids across all topic segments. + Args: + segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set + is either a single integer, or a `numpy.ndarray` of integers. + Returns: + unique_ids : set of unique ids across all topic segments. """ unique_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 2b400d1459..4845a26859 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -23,13 +23,11 @@ def s_one_pre(topics): >>> s_one_pre(topics) [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] - Args - ==== - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + Args: + topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] 
- Returns - ======= - s_one_pre : list of list of (W', W*) tuples for all unique topic ids + Returns: + s_one_pre : list of list of (W', W*) tuples for all unique topic ids """ s_one_pre = [] @@ -52,13 +50,11 @@ def s_one_one(topics): >>> s_one_pre(topics) [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] - Args - ==== - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + Args: + topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] - Returns - ======= - s_one_one : list of list of (W', W*) tuples for all unique topic ids + Returns: + s_one_one : list of list of (W', W*) tuples for all unique topic ids """ s_one_one = [] @@ -85,13 +81,11 @@ def s_one_set(topics): (10, array([ 9, 10, 7])), (7, array([ 9, 10, 7]))]] - Args - ==== - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + Args: + topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] - Returns - ======= - s_one_set : list of list of (W', W*) tuples for all unique topic ids. + Returns: + s_one_set : list of list of (W', W*) tuples for all unique topic ids. """ s_one_set = [] diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index a44e57fb3e..d416dec006 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -29,9 +29,8 @@ def _ids_to_words(ids, dictionary): This function abstracts away the differences between the HashDictionary and the standard one. Args: - ---- - ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). - This is the format returned by the topic_coherence.segmentation functions. + ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). + This is the format returned by the topic_coherence.segmentation functions. """ if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary setattr(dictionary, 'id2token', {v: k for k, v in dictionary.token2id.items()}) @@ -169,9 +168,8 @@ class WindowedTextsAnalyzer(UsesDictionary): def __init__(self, relevant_ids, dictionary): """ Args: - ---- - relevant_ids: the set of words that occurrences should be accumulated for. - dictionary: Dictionary instance with mappings for the relevant_ids. + relevant_ids: the set of words that occurrences should be accumulated for. + dictionary: Dictionary instance with mappings for the relevant_ids. """ super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) self._none_token = self._vocab_size # see _iter_texts for use of none token @@ -301,11 +299,10 @@ class ParallelWordOccurrenceAccumulator(WindowedTextsAnalyzer): def __init__(self, processes, *args, **kwargs): """ Args: - ---- - processes : number of processes to use; must be at least two. - args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). - kwargs : can include `batch_size`, which is the number of docs to send to a worker at a - time. If not included, it defaults to 64. + processes : number of processes to use; must be at least two. + args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). 
+ kwargs : can include `batch_size`, which is the number of docs to send to a worker at a
+ time. If not included, it defaults to 64.
 """
 super(ParallelWordOccurrenceAccumulator, self).__init__(*args)
 if processes < 2:
diff --git a/gensim/utils.py b/gensim/utils.py
index 8aebfc7da1..4dfe97e346 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -1208,16 +1208,14 @@ def strided_windows(ndarray, window_size):
 [4, 5, 6, 7, 8],
 [5, 6, 7, 8, 9]])
- Args
- ====
- ndarray: either a numpy.ndarray or something that can be converted into one.
- window_size: sliding window size.
-
- Returns
- =======
- numpy.ndarray of the subsequences produced by sliding a window of the given size over
- the `ndarray`. Since this uses striding, the individual arrays are views rather than
- copies of `ndarray`. Changes to one view modifies the others and the original.
+ Args:
+ ndarray: either a numpy.ndarray or something that can be converted into one.
+ window_size: sliding window size.
+
+ Returns:
+ numpy.ndarray of the subsequences produced by sliding a window of the given size over
+ the `ndarray`. Since this uses striding, the individual arrays are views rather than
+ copies of `ndarray`. Changes to one view modifies the others and the original.
 """
 ndarray = np.asarray(ndarray)
 if window_size == ndarray.shape[0]:
@@ -1236,13 +1234,12 @@ def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include
 The windows produced are views of some subsequence of a text. To use deep
 copies instead, pass `copy=True`.
- Args
- ====
- texts: List of string sentences.
- window_size: Size of sliding window.
- copy: False to use views of the texts (default) or True to produce deep copies.
- ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior).
- If False, the documents below `window_size` will be yielded as the full document.
+ Args:
+ texts: List of string sentences.
+ window_size: Size of sliding window.
+ copy: False to use views of the texts (default) or True to produce deep copies.
+ ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior).
+ If False, the documents below `window_size` will be yielded as the full document.
 """
 for doc_num, document in enumerate(texts):

From 5eb900848de68cd22e5bf00e3f67839dc85423d0 Mon Sep 17 00:00:00 2001
From: prerna135
Date: Mon, 26 Jun 2017 19:07:56 +0530
Subject: [PATCH 06/10] Removing trailing spaces after Travis build

---
 gensim/scripts/glove2word2vec.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py
index 63bec15078..8d3d1cb02f 100644
--- a/gensim/scripts/glove2word2vec.py
+++ b/gensim/scripts/glove2word2vec.py
@@ -6,7 +6,7 @@
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 """
-USAGE: 
+USAGE:
 $ python -m gensim.scripts.glove2word2vec --input <GloVe vector file> --output <Word2vec vector file>
 Where:
 <GloVe vector file>: Input GloVe .txt file
 <Word2vec vector file>: Desired name of output Word2vec .txt file

From ecfd353caf4b5d597e2b39560eac463ed19e5ae2 Mon Sep 17 00:00:00 2001
From: prerna135
Date: Sun, 2 Jul 2017 12:01:16 +0530
Subject: [PATCH 07/10] Removing duplicate citation, toctree and non-local image URI warnings

build succeeded, 21 warnings. Getting there.
:-) --- docs/src/about.rst | 2 ++ docs/src/changes_080.rst | 2 ++ docs/src/conf.py | 2 ++ docs/src/corpora/corpora.rst | 2 ++ docs/src/models/{doc2vec.rst => doc2vec.inc} | 0 docs/src/models/models.rst | 2 ++ docs/src/models/{word2vec.rst => word2vec.inc} | 0 docs/src/models/wrappers/wrappers.rst | 2 ++ docs/src/similarities/simserver.rst | 2 ++ docs/src/simserver.rst | 2 ++ docs/src/summarization/textcleaner.rst | 2 +- 11 files changed, 17 insertions(+), 1 deletion(-) rename docs/src/models/{doc2vec.rst => doc2vec.inc} (100%) rename docs/src/models/{word2vec.rst => word2vec.inc} (100%) diff --git a/docs/src/about.rst b/docs/src/about.rst index 294c60d52c..64a65bd333 100644 --- a/docs/src/about.rst +++ b/docs/src/about.rst @@ -1,3 +1,5 @@ +:orphan: + .. _about: ============ diff --git a/docs/src/changes_080.rst b/docs/src/changes_080.rst index be5df9ad15..b038ccb930 100644 --- a/docs/src/changes_080.rst +++ b/docs/src/changes_080.rst @@ -1,3 +1,5 @@ +:orphan: + .. _changes_080: Change Set for 0.8.0 diff --git a/docs/src/conf.py b/docs/src/conf.py index d2417fe5aa..8136ad6d24 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -215,3 +215,5 @@ # If false, no module index is generated. #latex_use_modindex = True + +suppress_warnings = ['image.nonlocal_uri'] diff --git a/docs/src/corpora/corpora.rst b/docs/src/corpora/corpora.rst index 3ea5151c96..f92a68af71 100644 --- a/docs/src/corpora/corpora.rst +++ b/docs/src/corpora/corpora.rst @@ -1,3 +1,5 @@ +:orphan: + :mod:`corpora` -- Package for corpora I/O ========================================== diff --git a/docs/src/models/doc2vec.rst b/docs/src/models/doc2vec.inc similarity index 100% rename from docs/src/models/doc2vec.rst rename to docs/src/models/doc2vec.inc diff --git a/docs/src/models/models.rst b/docs/src/models/models.rst index f18032b7ee..0ac3b30831 100644 --- a/docs/src/models/models.rst +++ b/docs/src/models/models.rst @@ -1,3 +1,5 @@ +:orphan: + :mod:`models` -- Package for transformation models ====================================================== diff --git a/docs/src/models/word2vec.rst b/docs/src/models/word2vec.inc similarity index 100% rename from docs/src/models/word2vec.rst rename to docs/src/models/word2vec.inc diff --git a/docs/src/models/wrappers/wrappers.rst b/docs/src/models/wrappers/wrappers.rst index e6acac5448..9746202d6d 100644 --- a/docs/src/models/wrappers/wrappers.rst +++ b/docs/src/models/wrappers/wrappers.rst @@ -1,3 +1,5 @@ +:orphan: + :mod:`models.wrappers` -- Package for transformation models via external programs ================================================================================= diff --git a/docs/src/similarities/simserver.rst b/docs/src/similarities/simserver.rst index 86a529b1c6..636ba663f4 100644 --- a/docs/src/similarities/simserver.rst +++ b/docs/src/similarities/simserver.rst @@ -1,3 +1,5 @@ +:orphan: + :mod:`simserver` -- Document similarity server ============================================== diff --git a/docs/src/simserver.rst b/docs/src/simserver.rst index f4abed868e..1b0d2b4396 100644 --- a/docs/src/simserver.rst +++ b/docs/src/simserver.rst @@ -1,3 +1,5 @@ +:orphan: + .. 
_simserver: Document Similarity Server diff --git a/docs/src/summarization/textcleaner.rst b/docs/src/summarization/textcleaner.rst index dddaedcbbe..72eda3d779 100644 --- a/docs/src/summarization/textcleaner.rst +++ b/docs/src/summarization/textcleaner.rst @@ -7,4 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: -b + From f62e113f6e137952a447cf8f8bf7a1ad5b01b599 Mon Sep 17 00:00:00 2001 From: prerna135 Date: Sun, 2 Jul 2017 20:00:57 +0530 Subject: [PATCH 08/10] Adding .inc files to flake8 ignore list --- continuous_integration/travis/flake8_diff.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/continuous_integration/travis/flake8_diff.sh b/continuous_integration/travis/flake8_diff.sh index 14b4432915..35c045cf85 100755 --- a/continuous_integration/travis/flake8_diff.sh +++ b/continuous_integration/travis/flake8_diff.sh @@ -133,6 +133,6 @@ check_files() { if [[ "$MODIFIED_FILES" == "no_match" ]]; then echo "No file has been modified" else - check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,*.txt,*.csv,*.vec,Dockerfile*" + check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,*.txt,*.csv,*.vec,Dockerfile*,*.inc" fi echo -e "No problem detected by flake8\n" From f4b4ce6c9f083a5878352b05653c91fcb47e96fb Mon Sep 17 00:00:00 2001 From: prerna135 Date: Mon, 3 Jul 2017 13:23:44 +0530 Subject: [PATCH 09/10] Fixing more indentation errors Now I'm down to `build succeeded, 5 warnings.` However, I'm in a bit of a fix. Changing `doc2vec.rst` and `word2vec.rst` to `.inc` files removed the duplicate warnings but it also invalidates the references to these documents from my main toctree and the following warnings are produced. `apiref.rst:8: WARNING: toctree contains reference to nonexisting document u'models/doc2vec'` `apiref.rst:8: WARNING: toctree contains reference to nonexisting document u'models/word2vec'` --- docs/src/conf.py | 2 +- gensim/models/ldamodel.py | 4 ++++ gensim/models/ldaseqmodel.py | 20 +++++++++++-------- gensim/models/wrappers/wordrank.py | 4 ++-- .../indirect_confirmation_measure.py | 14 +++++++------ 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index 8136ad6d24..340e714062 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -139,7 +139,7 @@ # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. -html_favicon = 'favicon.ico' +html_favicon = '_static/favicon.ico' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 0e42760b96..406bc2f524 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -982,15 +982,19 @@ def diff(self, other, distance="kulback_leibler", num_words=100, n_ann_terms=10, Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j and matrix annotation with shape (m1.num_topics, m2.num_topics, 2, None), where: + annotation[i][j] = [[`int_1`, `int_2`, ...], [`diff_1`, `diff_2`, ...]] and `int_k` is word from intersection of `topic_i` and `topic_j` and `diff_l` is word from symmetric difference of `topic_i` and `topic_j` `normed` is a flag.
If `true`, matrix Z will be normalized + Example: + >>> m1, m2 = LdaMulticore.load(path_1), LdaMulticore.load(path_2) >>> mdiff, annotation = m1.diff(m2) >>> print(mdiff) # get matrix with difference for each topic pair from `m1` and `m2` >>> print(annotation) # get array with positive/negative words for each topic pair from `m1` and `m2` + """ distances = { diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 98a045623b..d293607557 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -156,14 +156,14 @@ def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_s def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize): """ fit an lda sequence model: + for each time period: + set up lda model with E[log p(w|z)] and \alpha - for each time period: - set up lda model with E[log p(w|z)] and \alpha - for each document: - perform posterior inference - update sufficient statistics/likelihood + for each document: + perform posterior inference + update sufficient statistics/likelihood - maximize topics + maximize topics """ LDASQE_EM_THRESHOLD = 1e-4 @@ -485,10 +485,14 @@ def compute_post_variance(self, word, chain_variance): This function accepts the word to compute variance for, along with the associated sslm class object, and returns variance and fwd_variance Computes Var[\beta_{t,w}] for t = 1:T - Fwd_Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 |beta_{t} for 1:t) + :math:: + + Fwd_Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 |beta_{t} for 1:t) = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * (fwd_variance[t - 1] + obs_variance) - Variance(t) ≡ E((beta_{t,w} − mean_cap{t,w})^2 |beta_cap{t} for 1:t) + :math:: + + Variance(t) ≡ E((beta_{t,w} − mean_cap{t,w})^2 |beta_cap{t} for 1:t) = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) """ diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 356be3051c..62d64f3266 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -58,8 +58,8 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, Expects file to contain space-separated tokens in a single line `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data. It will contain following contents: - Word Embeddings saved after every dump_period and stored in a file model_word_"current iter".txt - Context Embeddings saved after every dump_period and stored in a file model_context_"current iter".txt + Word Embeddings saved after every dump_period and stored in a file model_word_current\ iter.txt + Context Embeddings saved after every dump_period and stored in a file model_context_current\ iter.txt A meta directory which contain: 'vocab.txt' - vocab words, 'wiki.toy' - word-word coccurence values, 'meta' - vocab and coccurence lengths `size` is the dimensionality of the feature vectors. `window` is the number of context words to the left (and to the right, if symmetric = 1). 
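A note on the `compute_post_variance` recurrences touched a few hunks above: the docstring's line breaks make the grouping of terms hard to read. The following is a minimal, self-contained NumPy sketch of one plausible reading of that forward-backward variance smoother. The placement of `chain_variance` vs. `obs_variance` inside the gains, and the `init_const` seeding, are assumptions made to get a runnable illustration, not a verbatim copy of gensim's `sslm` code::

    import numpy as np

    def post_variance_sketch(T, obs_variance, chain_variance, init_const=1000):
        # Forward pass: fwd_variance[t] plays the role of
        # Var(beta_{t,w} | beta_{1:t}) from the docstring above.
        fwd_variance = np.zeros(T + 1)
        fwd_variance[0] = chain_variance * init_const
        for t in range(1, T + 1):
            if obs_variance:
                c = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance)
            else:
                c = 0.0
            fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance)

        # Backward (smoothing) pass: variance[t] folds in evidence
        # from time slices t+1..T.
        variance = np.zeros(T + 1)
        variance[T] = fwd_variance[T]
        for t in range(T - 1, -1, -1):
            if fwd_variance[t] > 0.0:
                c = (fwd_variance[t] / (fwd_variance[t] + chain_variance)) ** 2
            else:
                c = 0.0
            variance[t] = c * (variance[t + 1] - chain_variance) + (1 - c) * fwd_variance[t]
        return variance, fwd_variance

For example, `post_variance_sketch(T=4, obs_variance=0.5, chain_variance=0.005)` returns the smoothed and forward variances for five time points.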
diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 8eed6a1807..e9ea1a4e87 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -38,20 +38,22 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamma=1): """ This function calculates the indirect cosine measure. Given context vectors - _ _ _ _ u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*) indirect - _ _ - cosine measure is computed as the cosine similarity between u and w. The formula used is:: - + cosine measure is computed as the cosine similarity between u and w. + The formula used is: m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) - - where each vector \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} + where each vector: + \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} Args: segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). + topics : Topics obtained from the trained topic model. + measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). + gamma : Gamma value for computing W', W* vectors; default is 1. Returns: From 02fb823c888aa943cf40caad66f0ba772760e365 Mon Sep 17 00:00:00 2001 From: prerna135 Date: Wed, 5 Jul 2017 12:35:22 +0530 Subject: [PATCH 10/10] Removing the last few warnings --- docs/src/conf.py | 4 ++-- docs/src/models/{doc2vec.inc => doc2vec.rst} | 0 docs/src/models/{word2vec.inc => word2vec.rst} | 0 .../sklearn_wrapper_gensim_ldamodel.rst | 4 ++-- gensim/models/coherencemodel.py | 2 ++ gensim/models/ldaseqmodel.py | 6 ++---- gensim/models/wrappers/wordrank.py | 2 ++ .../sklearn_wrapper_gensim_ldamodel.py | 2 ++ gensim/summarization/summarizer.py | 2 ++ .../topic_coherence/indirect_confirmation_measure.py | 11 +++++++---- 10 files changed, 21 insertions(+), 12 deletions(-) rename docs/src/models/{doc2vec.inc => doc2vec.rst} (100%) rename docs/src/models/{word2vec.inc => word2vec.rst} (100%) diff --git a/docs/src/conf.py b/docs/src/conf.py index 340e714062..9d21422c50 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -16,7 +16,7 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.append(os.path.abspath('.')) +sys.path.append(os.path.abspath('.')) # -- General configuration ----------------------------------------------------- @@ -216,4 +216,4 @@ # If false, no module index is generated. 
#latex_use_modindex = True -suppress_warnings = ['image.nonlocal_uri'] +suppress_warnings = ['image.nonlocal_uri', 'ref.citation', 'ref.footnote'] diff --git a/docs/src/models/doc2vec.inc b/docs/src/models/doc2vec.rst similarity index 100% rename from docs/src/models/doc2vec.inc rename to docs/src/models/doc2vec.rst diff --git a/docs/src/models/word2vec.inc b/docs/src/models/word2vec.rst similarity index 100% rename from docs/src/models/word2vec.inc rename to docs/src/models/word2vec.rst diff --git a/docs/src/sklearn_integration/sklearn_wrapper_gensim_ldamodel.rst b/docs/src/sklearn_integration/sklearn_wrapper_gensim_ldamodel.rst index 95c100c4b1..585b8fc3dc 100644 --- a/docs/src/sklearn_integration/sklearn_wrapper_gensim_ldamodel.rst +++ b/docs/src/sklearn_integration/sklearn_wrapper_gensim_ldamodel.rst @@ -1,7 +1,7 @@ -:mod:`sklearn_integration.sklearn_wrapper_gensim_ldamodel.SklearnWrapperLdaModel` -- Scikit learn wrapper for Latent Dirichlet Allocation +:mod:`sklearn_integration.sklearn_wrapper_gensim_ldamodel` -- Scikit learn wrapper for Latent Dirichlet Allocation ========================================================================================================================================= -.. automodule:: gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel.SklearnWrapperLdaModel +.. automodule:: gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel :synopsis: Scikit learn wrapper for LDA model :members: :inherited-members: diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index a17fd8c8f5..8556db1c45 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -139,9 +139,11 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their probability estimator. For 'u_mass' this doesn't matter. If left 'None' the default window sizes are used which are: + 'c_v' : 110 'c_uci' : 10 'c_npmi' : 10 + coherence : Coherence measure to be used. 
Supported values are: 'u_mass' 'c_v' diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index d293607557..1544aed84e 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -487,13 +487,11 @@ def compute_post_variance(self, word, chain_variance): :math:: - Fwd_Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 |beta_{t} for 1:t) - = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * (fwd_variance[t - 1] + obs_variance) + fwd\_variance[t] \equiv E((beta_{t,w}-mean_{t,w})^2 |beta_{t}\ for\ 1:t) = (obs\_variance / fwd\_variance[t - 1] + chain\_variance + obs\_variance ) * (fwd\_variance[t - 1] + obs\_variance) :math:: - Variance(t) ≡ E((beta_{t,w} − mean_cap{t,w})^2 |beta_cap{t} for 1:t) - = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) + variance[t] \equiv E((beta_{t,w}-mean\_cap_{t,w})^2 |beta\_cap_{t}\ for\ 1:t) = fwd\_variance[t - 1] + (fwd\_variance[t - 1] / fwd\_variance[t - 1] + obs\_variance)^2 * (variance[t - 1] - (fwd\_variance[t-1] + obs\_variance)) """ INIT_VARIANCE_CONST = 1000 diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 62d64f3266..dbcca9ebb9 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -58,9 +58,11 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, Expects file to contain space-separated tokens in a single line `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data. It will contain following contents: + Word Embeddings saved after every dump_period and stored in a file model_word_current\ iter.txt Context Embeddings saved after every dump_period and stored in a file model_context_current\ iter.txt A meta directory which contain: 'vocab.txt' - vocab words, 'wiki.toy' - word-word coccurence values, 'meta' - vocab and coccurence lengths + `size` is the dimensionality of the feature vectors. `window` is the number of context words to the left (and to the right, if symmetric = 1). `symmetric` if 0, only use left context words, else use left and right both. diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index 1ad1fabccf..de7b40d825 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -114,9 +114,11 @@ def partial_fit(self, X): Train model over X. By default, 'online (single-pass)' mode is used for training the LDA model. Configure `passes` and `update_every` params at init to choose the mode among : + - online (single-pass): update_every != None and passes == 1 - online (multi-pass): update_every != None and passes > 1 - batch: update_every == None + """ if sparse.issparse(X): X = matutils.Sparse2Corpus(X) diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 75724061d8..c067c23faf 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -183,11 +183,13 @@ def summarize(text, ratio=0.2, word_count=None, split=False): The length of the output can be specified using the ratio and word_count parameters: + ratio should be a number between 0 and 1 that determines the percentage of the number of sentences of the original text to be chosen for the summary (defaults at 0.2). 
word_count determines how many words will the output contain. If both parameters are provided, the ratio will be ignored. + """ # Gets a list of processed sentences. sentences = _clean_text_by_sentences(text) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index e9ea1a4e87..241b96befc 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -40,24 +40,27 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm This function calculates the indirect cosine measure. Given context vectors u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*) indirect cosine measure is computed as the cosine similarity between u and w. + The formula used is: + m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) + where each vector: + \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} Args: - segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). - topics : Topics obtained from the trained topic model. - measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). - gamma : Gamma value for computing W', W* vectors; default is 1. Returns: + s_cos_sim : list of indirect cosine similarity measure for each topic. + """ context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma)
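To make the formula in the `cosine_similarity` docstring concrete, here is a minimal sketch of the indirect measure it describes, assuming only a user-supplied direct confirmation measure m(w_i, w_j) (for instance a normalized log ratio). The names `indirect_cosine` and `direct_measure` and the toy inputs are illustrative; this is not gensim's `ContextVectorComputer`/accumulator API::

    import numpy as np

    def indirect_cosine(w_prime, w_star, topic_words, direct_measure, gamma=1.0):
        # Context vector V(W'): its j-th entry sums m(w_i, w_j)^gamma over
        # all w_i in the word set W', for every word w_j of the topic.
        def context_vector(word_set):
            return np.array([
                sum(direct_measure(w_i, w_j) ** gamma for w_i in word_set)
                for w_j in topic_words
            ])

        u, w = context_vector(w_prime), context_vector(w_star)
        # Indirect cosine measure: cosine similarity between u = V(W') and w = V(W*).
        return u.dot(w) / (np.linalg.norm(u) * np.linalg.norm(w))

    # Toy run with a made-up direct measure (1.0 on the diagonal, 0.5 elsewhere):
    print(indirect_cosine(['human'], ['computer'],
                          ['human', 'computer', 'interface'],
                          direct_measure=lambda wi, wj: 1.0 if wi == wj else 0.5))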