
Commit

Fix flake8 warnings W605, W504 (#2256)
* Fix flake8 warnings W605

* fix W504

* pin flake8-rst (avoid issue from 0.5.0)
horpto authored and menshikh-iv committed Dec 11, 2018
1 parent 2ccc82b commit 30528a5
Showing 28 changed files with 99 additions and 100 deletions.
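For context: W605 is flake8's "invalid escape sequence" warning, raised when a backslash sequence such as \d appears in a non-raw string literal, and W504 is its "line break after binary operator" warning (the mirror of W503, which flags a break *before* the operator; only one of the two styles can be enforced at a time). flake8-rst, pinned in the third commit message item, runs flake8 over code examples embedded in reStructuredText/docstrings. The snippet below is a hypothetical illustration of the two fix patterns repeated throughout this diff, not code taken from it:

import re

# W605 fix: make regex-bearing literals raw strings, so backslash sequences like
# \d, \s and \{ reach the regex engine instead of being treated as string escapes.
flagged = re.compile('\d+\s*')    # flake8 reports W605 on this literal
fixed = re.compile(r'\d+\s*')     # raw string literal, no warning

# W504 fix: when wrapping a long boolean expression, put the operator at the
# start of the continuation line instead of leaving it dangling at the end.
a, b = 1, 2
wrapped = (
    a == 1
    and b == 2
)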
6 changes: 3 additions & 3 deletions gensim/corpora/wikicorpus.py
@@ -81,7 +81,7 @@
"""Capture interlinks text and article linked"""
RE_P17 = re.compile(
r'(\n.{0,4}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=)|(scope=))(.*))|'
- '(^.{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))',
+ r'(^.{0,2}((bgcolor)|(\d{0,1}[ ]?colspan)|(rowspan)|(style=)|(class=)|(align=))(.*))',
re.UNICODE
)
"""Table markup"""
@@ -143,8 +143,8 @@ def filter_example(elem, text, *args, **kwargs):
# regex is in the function call so that we do not pollute the wikicorpus
# namespace do not do this in production as this function is called for
# every element in the wiki dump
- _regex_de_excellent = re.compile('.*\{\{(Exzellent.*?)\}\}[\s]*', flags=re.DOTALL)
- _regex_de_featured = re.compile('.*\{\{(Lesenswert.*?)\}\}[\s]*', flags=re.DOTALL)
+ _regex_de_excellent = re.compile(r'.*\{\{(Exzellent.*?)\}\}[\s]*', flags=re.DOTALL)
+ _regex_de_featured = re.compile(r'.*\{\{(Lesenswert.*?)\}\}[\s]*', flags=re.DOTALL)

if text is None:
return False
6 changes: 3 additions & 3 deletions gensim/models/atmodel.py
@@ -376,14 +376,14 @@ def extend_corpus(self, corpus):
self.corpus.extend(corpus)

def compute_phinorm(self, expElogthetad, expElogbetad):
"""Efficiently computes the normalizing factor in phi.
r"""Efficiently computes the normalizing factor in phi.
Parameters
----------
expElogthetad: numpy.ndarray
Value of variational distribution :math:`q(\theta|\gamma)`.
expElogbetad: numpy.ndarray
- Value of variational distribution :math:`q(\\beta|\lambda)`.
+ Value of variational distribution :math:`q(\beta|\lambda)`.
Returns
-------
@@ -888,7 +888,7 @@ def rho():
del other

def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, doc2author=None):
"""Estimate the variational bound of documents from `corpus`.
r"""Estimate the variational bound of documents from `corpus`.
:math:`\mathbb{E_{q}}[\log p(corpus)] - \mathbb{E_{q}}[\log q(corpus)]`
4 changes: 2 additions & 2 deletions gensim/models/base_any2vec.py
@@ -5,7 +5,7 @@
# Copyright (C) 2018 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains base classes required for implementing \*2vec algorithms.
r"""This module contains base classes required for implementing \*2vec algorithms.
The class hierarchy is designed to facilitate adding more concrete implementations for creating embeddings.
In the most general case, the purpose of this class is to transform an arbitrary representation to a numerical vector
@@ -56,7 +56,7 @@


class BaseAny2VecModel(utils.SaveLoad):
"""Base class for training, using and evaluating \*2vec model.
r"""Base class for training, using and evaluating \*2vec model.
Contains implementation for multi-threaded training. The purpose of this class is to provide a
reference interface for concrete embedding implementations, whether the input space is a corpus
6 changes: 3 additions & 3 deletions gensim/models/coherencemodel.py
@@ -460,9 +460,9 @@ def _relevant_ids_will_differ(self, new_topics):
return not self._accumulator.relevant_ids.issuperset(new_set)

def _topics_differ(self, new_topics):
- return (new_topics is not None and
- self._topics is not None and
- not np.array_equal(new_topics, self._topics))
+ return (new_topics is not None
+ and self._topics is not None
+ and not np.array_equal(new_topics, self._topics))

def _get_topics(self):
"""Internal helper function to return topics from a trained topic model."""
8 changes: 4 additions & 4 deletions gensim/models/deprecated/doc2vec.py
@@ -242,8 +242,8 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf

- word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]

for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code
@@ -298,8 +298,8 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf

- word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
doctag_len = len(doctag_indexes)
if doctag_len != model.dm_tag_count:
return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?)
8 changes: 4 additions & 4 deletions gensim/models/deprecated/fasttext.py
@@ -148,8 +148,8 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window)
start = max(0, pos - model.window + reduced_window)
@@ -211,8 +211,8 @@ def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
# now go over all words from the (reduced) window, predicting each one in turn
4 changes: 2 additions & 2 deletions gensim/models/deprecated/old_saveload.py
@@ -108,8 +108,8 @@ def _load_specials(self, fname, mmap, compress, subname):
"""
def mmap_error(obj, filename):
return IOError(
- 'Cannot mmap compressed object %s in file %s. ' % (obj, filename) +
- 'Use `load(fname, mmap=None)` or uncompress files manually.'
+ 'Cannot mmap compressed object %s in file %s. ' % (obj, filename)
+ + 'Use `load(fname, mmap=None)` or uncompress files manually.'
)

for attrib in getattr(self, '__recursive_saveloads', []):
8 changes: 4 additions & 4 deletions gensim/models/deprecated/word2vec.py
@@ -232,8 +232,8 @@ def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False):
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code

@@ -263,8 +263,8 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
start = max(0, pos - model.window + reduced_window)
8 changes: 4 additions & 4 deletions gensim/models/doc2vec.py
@@ -227,8 +227,8 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf

- word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]

for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code
@@ -314,8 +314,8 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf

- word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
doctag_len = len(doctag_indexes)
if doctag_len != model.dm_tag_count:
return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?)
8 changes: 4 additions & 4 deletions gensim/models/fasttext.py
@@ -140,8 +140,8 @@ def train_batch_cbow(model, sentences, alpha, work=None, neu1=None):
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window)
start = max(0, pos - model.window + reduced_window)
@@ -199,8 +199,8 @@ def train_batch_sg(model, sentences, alpha, work=None, neu1=None):
"""
result = 0
for sentence in sentences:
- word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab and
- model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
+ word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab
+ and model.wv.vocab[w].sample_int > model.random.rand() * 2 ** 32]
for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original word2vec code
# now go over all words from the (reduced) window, predicting each one in turn
40 changes: 20 additions & 20 deletions gensim/models/hdpmodel.py
@@ -72,7 +72,7 @@


def expect_log_sticks(sticks):
"""For stick-breaking hdp, get the :math:`\mathbb{E}[log(sticks)]`.
r"""For stick-breaking hdp, get the :math:`\mathbb{E}[log(sticks)]`.
Parameters
----------
@@ -97,7 +97,7 @@ def expect_log_sticks(sticks):


def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
"""Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`.
r"""Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`.
Parameters
----------
@@ -115,7 +115,7 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100):
Returns
-------
(numpy.ndarray, numpy.ndarray)
- Computed (:math:`likelihood`, :math:`\\gamma`).
+ Computed (:math:`likelihood`, :math:`\gamma`).
"""
gamma = np.ones(len(alpha))
@@ -172,7 +172,7 @@ def set_zero(self):


class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
"""`Hierarchical Dirichlet Process model <http://jmlr.csail.mit.edu/proceedings/papers/v15/wang11a/wang11a.pdf>`_
r"""`Hierarchical Dirichlet Process model <http://jmlr.csail.mit.edu/proceedings/papers/v15/wang11a/wang11a.pdf>`_
Topic models promise to help summarize and organize large archives of texts that cannot be easily analyzed by hand.
Hierarchical Dirichlet process (HDP) is a powerful mixed-membership model for the unsupervised analysis of grouped
@@ -194,7 +194,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
For this assume that there is a restaurant franchise (`corpus`) which has a large number of restaurants
(`documents`, `j`) under it. They have a global menu of dishes (`topics`, :math:`\Phi_{k}`) which they serve.
Also, a single dish (`topic`, :math:`\Phi_{k}`) is only served at a single table `t` for all the customers
- (`words`, :math:`\\theta_{j,i}`) who sit at that table.
+ (`words`, :math:`\theta_{j,i}`) who sit at that table.
So, when a customer enters the restaurant he/she has the choice to make where he/she wants to sit.
He/she can choose to sit at a table where some customers are already sitting , or he/she can choose to sit
at a new table. Here the probability of choosing each option is not same.
@@ -213,31 +213,31 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
share the same set of atoms, :math:`\Phi_{k}`, and only the atom weights :math:`\pi _{jt}` differs.
There will be multiple document-level atoms :math:`\psi_{jt}` which map to the same corpus-level atom
- :math:`\Phi_{k}`. Here, the :math:`\\beta` signify the weights given to each of the topics globally. Also, each
- factor :math:`\\theta_{j,i}` is distributed according to :math:`G_{j}`, i.e., it takes on the value of
+ :math:`\Phi_{k}`. Here, the :math:`\beta` signify the weights given to each of the topics globally. Also, each
+ factor :math:`\theta_{j,i}` is distributed according to :math:`G_{j}`, i.e., it takes on the value of
:math:`\Phi_{k}` with probability :math:`\pi _{jt}`. :math:`C_{j,t}` is an indicator variable whose value `k`
signifies the index of :math:`\Phi`. This helps to map :math:`\psi_{jt}` to :math:`\Phi_{k}`.
- The top level (`corpus` level) stick proportions correspond the values of :math:`\\beta`,
+ The top level (`corpus` level) stick proportions correspond the values of :math:`\beta`,
bottom level (`document` level) stick proportions correspond to the values of :math:`\pi`.
- The truncation level for the corpus (`K`) and document (`T`) corresponds to the number of :math:`\\beta`
+ The truncation level for the corpus (`K`) and document (`T`) corresponds to the number of :math:`\beta`
and :math:`\pi` which are in existence.
Now, whenever coordinate ascent updates are to be performed, they happen at two level. The document level as well
as corpus level.
At document level, we update the following:
- #. The parameters to the document level sticks, i.e, a and b parameters of :math:`\\beta` distribution of the
+ #. The parameters to the document level sticks, i.e, a and b parameters of :math:`\beta` distribution of the
variable :math:`\pi _{jt}`.
#. The parameters to per word topic indicators, :math:`Z_{j,n}`. Here :math:`Z_{j,n}` selects topic parameter
:math:`\psi_{jt}`.
#. The parameters to per document topic indices :math:`\Phi_{jtk}`.
At corpus level, we update the following:
- #. The parameters to the top level sticks, i.e., the parameters of the :math:`\\beta` distribution for the
- corpus level :math:`\\beta`, which signify the topic distribution at corpus level.
+ #. The parameters to the top level sticks, i.e., the parameters of the :math:`\beta` distribution for the
+ corpus level :math:`\beta`, which signify the topic distribution at corpus level.
#. The parameters to the topics :math:`\Phi_{k}`.
Now coming on to the steps involved, procedure for online variational inference for the Hdp model is as follows:
@@ -261,14 +261,14 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
Attributes
----------
lda_alpha : numpy.ndarray
- Same as :math:`\\alpha` from :class:`gensim.models.ldamodel.LdaModel`.
+ Same as :math:`\alpha` from :class:`gensim.models.ldamodel.LdaModel`.
lda_beta : numpy.ndarray
- Same as :math:`\\beta` from from :class:`gensim.models.ldamodel.LdaModel`.
+ Same as :math:`\beta` from from :class:`gensim.models.ldamodel.LdaModel`.
m_D : int
Number of documents in the corpus.
m_Elogbeta : numpy.ndarray:
- Stores value of dirichlet expectation, i.e., compute :math:`E[log \\theta]` for a vector
- :math:`\\theta \sim Dir(\\alpha)`.
+ Stores value of dirichlet expectation, i.e., compute :math:`E[log \theta]` for a vector
+ :math:`\theta \sim Dir(\alpha)`.
m_lambda : {numpy.ndarray, float}
Drawn samples from the parameterized gamma distribution.
m_lambda_sum : {numpy.ndarray, float}
@@ -280,7 +280,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
m_rhot : float
Assigns weight to the information obtained from the mini-chunk and its value it between 0 and 1.
m_status_up_to_date : bool
- Flag to indicate whether `lambda `and :math:`E[log \\theta]` have been updated if True, otherwise - not.
+ Flag to indicate whether `lambda `and :math:`E[log \theta]` have been updated if True, otherwise - not.
m_timestamp : numpy.ndarray
Helps to keep track and perform lazy updates on lambda.
m_updatect : int
@@ -510,13 +510,13 @@ def update_finished(self, start_time, chunks_processed, docs_processed):
"""
return (
# chunk limit reached
- (self.max_chunks and chunks_processed == self.max_chunks) or
+ (self.max_chunks and chunks_processed == self.max_chunks)

# time limit reached
- (self.max_time and time.clock() - start_time > self.max_time) or
+ or (self.max_time and time.clock() - start_time > self.max_time)

# no limits and whole corpus has been processed once
- (not self.max_chunks and not self.max_time and docs_processed >= self.m_D))
+ or (not self.max_chunks and not self.max_time and docs_processed >= self.m_D))

def update_chunk(self, chunk, update=True, opt_o=True):
"""Performs lazy update on necessary columns of lambda and variational inference for documents in the chunk.
6 changes: 3 additions & 3 deletions gensim/models/ldamulticore.py
@@ -276,9 +276,9 @@ def process_result_queue(force=False):
if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)):
self.do_mstep(rho(), other, pass_ > 0)
other.reset()
- if self.eval_every is not None and \
- ((force and queue_size[0] == 0) or
- (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)):
+ if self.eval_every is not None \
+ and ((force and queue_size[0] == 0)
+ or (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)):
self.log_perplexity(chunk, total_docs=lencorpus)

chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=chunks_as_numpy)
(Diffs for the remaining changed files are not shown here.)
