Skip to content

Commit

Permalink
Fix docstrings for gensim.models.rpmodel (#1802)
Browse files Browse the repository at this point in the history
* first attempt to convert few lines into numpy-style doc

* added parameters in documentation

* more documentation

* few corrections

* show inheritance and undoc members

* show special members

* example is executable now

* link to the paper added, named parameters

* fixed doc

* fixed doc

* fixed whitespaces

* fix docstrings & PEP8

* fix docstrings

* fix typo
  • Loading branch information
jazzmuesli authored and menshikh-iv committed Dec 27, 2017
1 parent 255ce25 commit cd776b5
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 24 deletions.
4 changes: 3 additions & 1 deletion docs/src/models/rpmodel.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,6 @@
:synopsis: Random Projections
:members:
:inherited-members:

:undoc-members:
:show-inheritance:
:special-members: __getitem__
103 changes: 80 additions & 23 deletions gensim/models/rpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,35 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""Random Projections (also known as Random Indexing).
For theoretical background on Random Projections, see [1]_.
Examples
--------
>>> from gensim.models import RpModel
>>> from gensim.corpora import Dictionary
>>> from gensim.test.utils import common_texts, temporary_file
>>>
>>> dictionary = Dictionary(common_texts) # fit dictionary
>>> corpus = [dictionary.doc2bow(text) for text in common_texts] # convert texts to BoW format
>>>
>>> model = RpModel(corpus, id2word=dictionary) # fit model
>>> result = model[corpus[3]] # apply model to document, result is vector in BoW format
>>>
>>> with temporary_file("model_file") as fname:
... model.save(fname) # save model to file
... loaded_model = RpModel.load(fname) # load model
References
----------
.. [1] Kanerva et al., 2000, Random indexing of text samples for Latent Semantic Analysis,
https://cloudfront.escholarship.org/dist/prd/content/qt5644k0w6/qt5644k0w6.pdf
"""

import logging

import numpy as np
Expand All @@ -16,30 +45,21 @@


class RpModel(interfaces.TransformationABC):
"""
Objects of this class allow building and maintaining a model for Random Projections
(also known as Random Indexing). For theoretical background on RP, see:
Kanerva et al.: "Random indexing of text samples for Latent Semantic Analysis."

The main methods are:
def __init__(self, corpus, id2word=None, num_topics=300):
"""
1. constructor, which creates the random projection matrix
2. the [] method, which transforms a simple count representation into the TfIdf
space.
Parameters
----------
corpus : iterable of iterable of (int, int)
Input corpus.
>>> rp = RpModel(corpus)
>>> print(rp[some_doc])
>>> rp.save('/tmp/foo.rp_model')
id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
Mapping `token_id` -> `token`, will be determined from corpus if `id2word is None`.
Model persistency is achieved via its load/save methods.
"""
num_topics : int, optional
Number of topics.
def __init__(self, corpus, id2word=None, num_topics=300):
"""
`id2word` is a mapping from word ids (integers) to words (strings). It is
used to determine the vocabulary size, as well as for debugging and topic
printing. If not set, it will be determined from the corpus.
"""
self.id2word = id2word
self.num_topics = num_topics
Expand All @@ -50,8 +70,13 @@ def __str__(self):
return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics)

def initialize(self, corpus):
"""
Initialize the random projection matrix.
"""Initialize the random projection matrix.
Parameters
----------
corpus : iterable of iterable of (int, int)
Input corpus.
"""
if self.id2word is None:
logger.info("no word id mapping provided; initializing from corpus, assuming identity")
Expand All @@ -73,8 +98,32 @@ def initialize(self, corpus):
# are smarter and this is no longer needed?

def __getitem__(self, bow):
"""
Return RP representation of the input vector and/or corpus.
"""Get random-projection representation of the input vector or corpus.
Parameters
----------
bow : {list of (int, int), iterable of list of (int, int)}
Input document or corpus.
Returns
-------
list of (int, float)
if `bow` is document OR
:class:`~gensim.interfaces.TransformedCorpus`
if `bow` is corpus.
Examples
--------
>>> from gensim.models import RpModel
>>> from gensim.corpora import Dictionary
>>> from gensim.test.utils import common_texts
>>>
>>> dictionary = Dictionary(common_texts) # fit dictionary
>>> corpus = [dictionary.doc2bow(text) for text in common_texts] # convert texts to BoW format
>>>
>>> model = RpModel(corpus, id2word=dictionary) # fit model
>>> result = model[corpus[0]] # apply model to document, result is vector in BoW format, i.e. [(1, 0.3), ... ]
"""
# if the input vector is in fact a corpus, return a transformed corpus as result
is_corpus, bow = utils.is_corpus(bow)
Expand All @@ -96,5 +145,13 @@ def __getitem__(self, bow):
]

def __setstate__(self, state):
"""Sets the internal state and updates freshly_loaded to True, called when unpickled.
Parameters
----------
state : dict
State of the class.
"""
self.__dict__ = state
self.freshly_loaded = True

0 comments on commit cd776b5

Please sign in to comment.