
Fix documentation for gensim.corpora. Partial fix #1671 #1729

Merged · 54 commits · Jan 22, 2018

Commits
b260d4b
Fix typo
anotherbugmaster Sep 30, 2017
36d98d1
Make `save_corpus` private
anotherbugmaster Oct 2, 2017
981ebbb
Annotate `bleicorpus.py`
anotherbugmaster Oct 2, 2017
3428113
Make __save_corpus weakly private
anotherbugmaster Oct 2, 2017
69fc7e0
Fix _save_corpus in tests
anotherbugmaster Oct 2, 2017
b65a69a
Fix _save_corpus[2]
anotherbugmaster Oct 3, 2017
6fa92f3
Merge remote-tracking branch 'upstream/develop' into develop
anotherbugmaster Oct 15, 2017
78e207d
Document bleicorpus in Numpy style
anotherbugmaster Oct 24, 2017
7519382
Document indexedcorpus
anotherbugmaster Oct 24, 2017
ae69867
Annotate csvcorpus
anotherbugmaster Nov 3, 2017
c2765ed
Add "Yields" section
anotherbugmaster Nov 3, 2017
40add21
Make `_save_corpus` public
anotherbugmaster Nov 3, 2017
e044c3a
Annotate bleicorpus
anotherbugmaster Nov 3, 2017
123327d
Fix indentation in bleicorpus
anotherbugmaster Nov 3, 2017
2382d01
`_save_corpus` -> `save_corpus`
anotherbugmaster Nov 21, 2017
42409bf
Annotate bleicorpus
anotherbugmaster Nov 21, 2017
7cb5bbf
Convert dictionary docs to numpy style
anotherbugmaster Nov 21, 2017
56f19e6
Convert hashdictionary docs to numpy style
anotherbugmaster Nov 21, 2017
9162a7e
Convert indexedcorpus docs to numpy style
anotherbugmaster Nov 21, 2017
5eaaac4
Convert lowcorpus docs to numpy style
anotherbugmaster Nov 21, 2017
3b6b076
Convert malletcorpus docs to numpy style
anotherbugmaster Nov 21, 2017
d7f3fc8
Convert mmcorpus docs to numpy style
anotherbugmaster Nov 21, 2017
c46bff4
Convert sharded_corpus docs to numpy style
anotherbugmaster Nov 21, 2017
7823546
Convert svmlightcorpus docs to numpy style
anotherbugmaster Nov 21, 2017
9878133
Convert textcorpus docs to numpy style
anotherbugmaster Nov 21, 2017
dba4429
Convert ucicorpus docs to numpy style
anotherbugmaster Nov 21, 2017
6a95c94
Convert wikicorpus docs to numpy style
anotherbugmaster Nov 21, 2017
6dcfb07
Add sphinx tweaks
anotherbugmaster Nov 21, 2017
2f61fc3
Merge remote-tracking branch 'upstream/develop' into develop
anotherbugmaster Nov 21, 2017
ac01abb
Merge branch 'develop' into fix_1605
anotherbugmaster Nov 21, 2017
833ec64
Remove trailing whitespaces
anotherbugmaster Nov 21, 2017
e656609
Merge branch 'develop' into fix_1605
anotherbugmaster Nov 23, 2017
3e597fe
Annotate wikicorpus
anotherbugmaster Nov 28, 2017
da1d5c2
SVMLight Corpus annotated
anotherbugmaster Dec 5, 2017
89f6098
Fix TODO
anotherbugmaster Dec 5, 2017
9eeea21
Fix grammar mistake
anotherbugmaster Dec 6, 2017
2b6aeaf
Undo changes to dictionary
anotherbugmaster Dec 7, 2017
9b17057
Undo changes to hashdictionary
anotherbugmaster Dec 7, 2017
de3ea0f
Document indexedcorpus
anotherbugmaster Dec 9, 2017
dafc373
Document indexedcorpus[2]
anotherbugmaster Dec 10, 2017
ff980bc
Merge upstream
anotherbugmaster Jan 9, 2018
0189d8d
Remove redundant files
anotherbugmaster Jan 11, 2018
943406c
Merge upstream
anotherbugmaster Jan 16, 2018
57cb5a3
Add more dots. :)
anotherbugmaster Jan 16, 2018
08ca492
Fix monospace
anotherbugmaster Jan 16, 2018
381fb97
remove useless method
menshikh-iv Jan 18, 2018
5b5701a
fix bleicorpus
menshikh-iv Jan 18, 2018
0e5c0cf
fix csvcorpus
menshikh-iv Jan 18, 2018
627c0e5
fix indexedcorpus
menshikh-iv Jan 18, 2018
b771bb5
fix svmlightcorpus
menshikh-iv Jan 18, 2018
d76af8d
fix wikicorpus[1]
menshikh-iv Jan 18, 2018
7fe753f
fix wikicorpus[2]
menshikh-iv Jan 18, 2018
a9eb1a3
fix wikicorpus[3]
menshikh-iv Jan 18, 2018
e3a8ebf
fix review comments
menshikh-iv Jan 22, 2018
96 changes: 73 additions & 23 deletions gensim/corpora/bleicorpus.py
@@ -5,9 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Blei's LDA-C format.
"""
"""Corpus in Blei's LDA-C format."""

from __future__ import with_statement

@@ -19,30 +17,39 @@
from six.moves import xrange


logger = logging.getLogger('gensim.corpora.bleicorpus')
logger = logging.getLogger(__name__)


class BleiCorpus(IndexedCorpus):
"""
Corpus in Blei's LDA-C format.
"""Corpus in Blei's LDA-C format.

The corpus is represented as two files: one describing the documents, and another
describing the mapping between words and their ids.

Each document is one line::

N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN
N fieldId1:fieldValue1 fieldId2:fieldValue2 ... fieldIdN:fieldValueN


The vocabulary is a file with words, one word per line; word at line K has an implicit `id=K`.

The vocabulary is a file with words, one word per line; word at line K has an
implicit ``id=K``.
"""

def __init__(self, fname, fname_vocab=None):
"""
Initialize the corpus from a file.

`fname_vocab` is the file with vocabulary; if not specified, it defaults to
`fname.vocab`.
Parameters
----------
fname : str
File path to Serialized corpus.
Review comment (anotherbugmaster): Path to corpus here and in other corpora maybe?
fname_vocab : str, optional
Vocabulary file. If `fname_vocab` is None, searching for the vocab.txt or `fname_vocab`.vocab file.
Review thread (anotherbugmaster): Are you sure it's `fname_vocab`.vocab? `fname_vocab` is None, isn't it?
Reply (contributor): Yep
Reply (contributor): Not quite, I added the correct description.
Reply (anotherbugmaster, Jan 25, 2018): Still don't get it. It should be `fname`.vocab; `fname_vocab`.vocab is undefined!
Reply (contributor): Not quite :) I went through the code with ipdb for this case; it is significantly "wider" than what we discuss here (I already fixed it).
Reply (anotherbugmaster, Jan 21, 2018): Suggested wording: "Vocabulary file. If fname_vocab is None, searching for the vocab.txt or fname.vocab file."

Raises
------
IOError
If vocabulary file doesn't exist.

"""
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s", fname)
@@ -67,8 +74,13 @@ def __init__(self, fname, fname_vocab=None):
self.id2word = dict(enumerate(words))

def __iter__(self):
"""
Iterate over the corpus, returning one sparse vector at a time.
"""Iterate over the corpus, returning one sparse (BoW) vector at a time.

Yields
------
list of (int, float)
Document's BoW representation.

"""
lineno = -1
with utils.smart_open(self.fname) as fin:
@@ -77,6 +89,19 @@ def __iter__(self):
self.length = lineno + 1

def line2doc(self, line):
"""Convert line in Blei LDA-C format to document (BoW representation).

Parameters
----------
line : str
Line in Blei's LDA-C format.

Returns
-------
list of (int, float)
Document's BoW representation.

"""
parts = utils.to_unicode(line).split()
if int(parts[0]) != len(parts) - 1:
raise ValueError("invalid format in %s: %s" % (self.fname, repr(line)))
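As a standalone illustration of the conversion `line2doc` documents above, here is a minimal sketch of parsing one LDA-C line into a BoW list (assumed helper name, not gensim's actual implementation):

```python
def parse_ldac_line(line):
    """Parse one LDA-C line, "N id1:val1 ... idN:valN", into a BoW list."""
    parts = line.split()
    # The leading count N must match the number of id:value fields.
    if int(parts[0]) != len(parts) - 1:
        raise ValueError("invalid LDA-C line: %r" % line)
    return [(int(fid), float(fval))
            for fid, fval in (part.split(":") for part in parts[1:])]

print(parse_ldac_line("2 0:1.0 3:2.5"))  # [(0, 1.0), (3, 2.5)]
```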
@@ -86,14 +111,28 @@ def line2doc(self, line):

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
"""
Save a corpus in the LDA-C format.

There are actually two files saved: `fname` and `fname.vocab`, where
`fname.vocab` is the vocabulary file.
"""Save a corpus in the LDA-C format.

Notes
-----
There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.

Parameters
----------
fname : str
Path to output filename.
Review comment (anotherbugmaster): "To output file"
corpus : iterable of iterable of (int, float)
Input corpus
Review comment (anotherbugmaster): Obvious, no additional information provided. There's no need to have descriptions for all arguments. :)
Review comment (anotherbugmaster): Still think that it's not necessary. Also, there's a dot missing at the end of the line.
id2word : dict of (str, str), optional
Mapping id -> word for `corpus`.
metadata : bool, optional
THIS PARAMETER WILL BE IGNORED.

Returns
-------
list of int
Offsets for each line in file (in bytes).

This function is automatically called by `BleiCorpus.serialize`; don't
call it directly, call `serialize` instead.
"""
if id2word is None:
logger.info("no word id mapping provided; initializing from corpus")
@@ -121,8 +160,19 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
return offsets

def docbyoffset(self, offset):
"""
Return the document stored at file position `offset`.
"""Get document corresponding to `offset`,
Review comment (anotherbugmaster): First line of docstring should always end with a dot.
Review comment (anotherbugmaster): The first line should end with a dot.
offset can be given from :meth:`~gensim.corpora.bleicorpus.BleiCorpus.save_corpus`.

Parameters
----------
offset : int
Position of the document in the file (in bytes).

Returns
-------
list of (int, float)
Review comment (contributor): Missing parameter description (here and everywhere)
Document in BoW format.

"""
with utils.smart_open(self.fname) as f:
f.seek(offset)
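To make the offsets returned by `save_corpus` concrete, here is a minimal standalone sketch (assumed names, not gensim's code) that writes BoW documents as LDA-C lines and records the byte offset at which each document starts:

```python
import io

def save_ldac(stream, corpus):
    """Write each BoW document as one LDA-C line; return byte offsets."""
    offsets = []
    for doc in corpus:
        offsets.append(stream.tell())  # offset of this document, in bytes
        fields = " ".join("%d:%s" % (wid, val) for wid, val in doc)
        stream.write(("%d %s\n" % (len(doc), fields)).encode("utf-8"))
    return offsets

buf = io.BytesIO()
offsets = save_ldac(buf, [[(1, 0.5)], [(0, 1.0), (1, 2.0)]])
print(offsets)  # [0, 8] -- the first line, "1 1:0.5\n", is 8 bytes long
```

These offsets are exactly what an indexed corpus later feeds to `docbyoffset` for random access.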
30 changes: 18 additions & 12 deletions gensim/corpora/csvcorpus.py
@@ -4,10 +4,7 @@
# Copyright (C) 2013 Zygmunt Zając <zygmunt@fastml.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Corpus in CSV format.

"""
"""Corpus in CSV format."""


from __future__ import with_statement
@@ -22,18 +19,24 @@


class CsvCorpus(interfaces.CorpusABC):
"""
Corpus in CSV format. The CSV delimiter, headers etc. are guessed automatically
based on the file content.
"""Corpus in CSV format.

The CSV delimiter, headers etc. are guessed automatically based on the
file content.

All row values are expected to be ints/floats.

"""

def __init__(self, fname, labels):
"""
Initialize the corpus from a file.
`labels` = are class labels present in the input file? => skip the first column
"""Initialize the corpus from a file.

Parameters
----------
fname : str
Filename.
labels : bool
Whether to skip the first column.

"""
logger.info("loading corpus from %s", fname)
@@ -48,8 +51,11 @@ def __init__(self, fname, labels):
logger.info("sniffed CSV delimiter=%r, headers=%s", self.dialect.delimiter, self.headers)

def __iter__(self):
"""
Iterate over the corpus, returning one sparse vector at a time.
"""Iterate over the corpus, returning one sparse vector at a time.

Yields
------
list of (int, float)

"""
reader = csv.reader(utils.smart_open(self.fname), self.dialect)
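The automatic delimiter/header guessing that `CsvCorpus.__init__` relies on comes from the standard library's `csv.Sniffer`; a small sketch of the same idea (illustrative sample data, not gensim code):

```python
import csv
import io

sample = "id;f1;f2\n1;0.5;2.0\n2;1.5;0.0\n"

sniffer = csv.Sniffer()
dialect = sniffer.sniff(sample)          # guesses the ';' delimiter
has_header = sniffer.has_header(sample)  # first row is non-numeric labels

reader = csv.reader(io.StringIO(sample), dialect)
rows = list(reader)
if has_header:
    rows = rows[1:]  # skip the header row, as CsvCorpus does

print(dialect.delimiter, rows)
```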
106 changes: 72 additions & 34 deletions gensim/corpora/indexedcorpus.py
@@ -5,17 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""
Indexed corpus is a mechanism for random-accessing corpora.

While the standard corpus interface in gensim allows iterating over corpus with
`for doc in corpus: pass`, indexed corpus allows accessing the documents with
`corpus[docno]` (in O(1) look-up time).

This functionality is achieved by storing an extra file (by default named the same
as the corpus file plus '.index' suffix) that stores the byte offset of the beginning
of each document.
"""
"""Base Indexed Corpus class."""

import logging
import six
@@ -28,13 +18,32 @@


class IndexedCorpus(interfaces.CorpusABC):
"""Indexed corpus is a mechanism for random-accessing corpora.

While the standard corpus interface in gensim allows iterating over
corpus with `for doc in corpus: pass`, indexed corpus allows accessing
the documents with `corpus[docno]` (in O(1) look-up time).

Notes
-----
This functionality is achieved by storing an extra file (by default
named the same as the '{corpus name}.index') that stores the byte
offset of the beginning of each document.

"""

def __init__(self, fname, index_fname=None):
"""
Initialize this abstract base class, by loading a previously saved index
from `index_fname` (or `fname.index` if `index_fname` is not set).
This index will allow subclasses to support the `corpus[docno]` syntax
(random access to document #`docno` in O(1)).
"""Initialize the corpus.

Parameters
----------
fname : string
Filename.
index_fname : string or None
Index filename, or None for loading `fname`.index.

Examples
--------
>>> # save corpus in SvmLightCorpus format with an index
>>> corpus = [[(1, 0.5)], [(0, 1.0), (1, 2.0)]]
>>> gensim.corpora.SvmLightCorpus.serialize('testfile.svmlight', corpus)
@@ -58,22 +67,31 @@ def __init__(self, fname, index_fname=None):
@classmethod
def serialize(serializer, fname, corpus, id2word=None, index_fname=None,
progress_cnt=None, labels=None, metadata=False):
"""
Iterate through the document stream `corpus`, saving the documents to `fname`
and recording byte offset of each document. Save the resulting index
structure to file `index_fname` (or `fname`.index is not set).

This relies on the underlying corpus class `serializer` providing (in
addition to standard iteration):

* `save_corpus` method that returns a sequence of byte offsets, one for
each saved document,
* the `docbyoffset(offset)` method, which returns a document
positioned at `offset` bytes within the persistent storage (file).
* metadata if set to true will ensure that serialize will write out article titles to a pickle file.

Example:

"""Iterate through the document stream `corpus`.

Saving the documents to
`fname` and recording byte offset of each document.

Parameters
----------
fname : str
Filename.
corpus : iterable
Iterable of documents.
id2word : dict of (str, str), optional
Transforms id to word.
index_fname : str
Where to save resulting index. Saved to `fname`.index if None.
progress_cnt : int
Number of documents after which progress info is printed.
labels : bool
Whether to skip the first column (class labels).
metadata : bool
If True will ensure that serialize will write out
article titles to a pickle file. (Default value = False).

Examples
--------
>>> MmCorpus.serialize('test.mm', corpus)
>>> mm = MmCorpus('test.mm') # `mm` document stream now has random access
>>> print(mm[42]) # retrieve document no. 42, etc.
@@ -108,8 +126,15 @@ def serialize(serializer, fname, corpus, id2word=None, index_fname=None,

def __len__(self):
"""
Return the index length if the corpus is indexed. Otherwise, make a pass
over self to calculate the corpus length and cache this number.
Return the index length.

If the corpus is not indexed, also count corpus length and cache this
value.

Returns
-------
int

"""
if self.index is not None:
return len(self.index)
@@ -119,11 +144,24 @@ def __len__(self):
return self.length

def __getitem__(self, docno):
"""Return certain document.

Parameters
----------
docno : int
Document number.

Returns
-------
`utils.SlicedCorpus`

"""
if self.index is None:
raise RuntimeError("Cannot call corpus[docid] without an index")
if isinstance(docno, (slice, list, numpy.ndarray)):
return utils.SlicedCorpus(self, docno)
elif isinstance(docno, six.integer_types + (numpy.integer,)):
return self.docbyoffset(self.index[docno])
# TODO: no `docbyoffset` method, should be defined in this class
else:
raise ValueError('Unrecognised value for docno, use either a single integer, a slice or a numpy.ndarray')
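The byte-offset indexing idea behind `IndexedCorpus` — the extra `.index` file mapping document number to file position — can be sketched with nothing but file seeks (a toy illustration, not the gensim implementation):

```python
import os
import tempfile

docs = ["first document\n", "second document\n", "third document\n"]

# Write documents sequentially, remembering where each one starts.
fd, path = tempfile.mkstemp()
offsets = []
with os.fdopen(fd, "wb") as f:
    for doc in docs:
        offsets.append(f.tell())
        f.write(doc.encode("utf-8"))

def doc_by_offset(offset):
    # corpus[docno] becomes a seek + readline: O(1) random access
    with open(path, "rb") as f:
        f.seek(offset)
        return f.readline().decode("utf-8")

result = doc_by_offset(offsets[2])
os.remove(path)
print(result)  # third document
```

In gensim the `offsets` list is what `save_corpus` returns and what gets persisted to the `.index` file.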