diff --git a/docs/src/corpora/lowcorpus.rst b/docs/src/corpora/lowcorpus.rst
index 5d084b7c58..05ed448893 100644
--- a/docs/src/corpora/lowcorpus.rst
+++ b/docs/src/corpora/lowcorpus.rst
@@ -1,8 +1,8 @@
-:mod:`corpora.lowcorpus` -- Corpus in List-of-Words format
-===========================================================
+:mod:`corpora.lowcorpus` -- Corpus in GibbsLda++ format
+=======================================================
.. automodule:: gensim.corpora.lowcorpus
- :synopsis: Corpus in List-of-Words format
+ :synopsis: Corpus in GibbsLda++ format
:members:
:inherited-members:
:undoc-members:
diff --git a/docs/src/corpora/malletcorpus.rst b/docs/src/corpora/malletcorpus.rst
index 184b832dc5..72fcb472b7 100644
--- a/docs/src/corpora/malletcorpus.rst
+++ b/docs/src/corpora/malletcorpus.rst
@@ -1,8 +1,8 @@
-:mod:`corpora.malletcorpus` -- Corpus in Mallet format of List-Of-Words.
-========================================================================
+:mod:`corpora.malletcorpus` -- Corpus in Mallet format
+======================================================
.. automodule:: gensim.corpora.malletcorpus
- :synopsis: Corpus in Mallet format of List-Of-Words.
+ :synopsis: Corpus in Mallet format.
:members:
:inherited-members:
:undoc-members:
diff --git a/docs/src/corpora/textcorpus.rst b/docs/src/corpora/textcorpus.rst
index f0f3598a75..7fc346edba 100644
--- a/docs/src/corpora/textcorpus.rst
+++ b/docs/src/corpora/textcorpus.rst
@@ -1,8 +1,8 @@
-:mod:`corpora.textcorpus` -- Building corpora with dictionaries
-=================================================================
+:mod:`corpora.textcorpus` -- Tools for building corpora with dictionaries
+=========================================================================
.. automodule:: gensim.corpora.textcorpus
- :synopsis: Building corpora with dictionaries
+ :synopsis: Tools for building corpora with dictionaries
:members:
:inherited-members:
:undoc-members:
diff --git a/docs/src/corpora/ucicorpus.rst b/docs/src/corpora/ucicorpus.rst
index 75f6908478..37f753e5ef 100644
--- a/docs/src/corpora/ucicorpus.rst
+++ b/docs/src/corpora/ucicorpus.rst
@@ -1,8 +1,8 @@
-:mod:`corpora.ucicorpus` -- Corpus in UCI bag-of-words format
-==============================================================================================================
+:mod:`corpora.ucicorpus` -- Corpus in UCI format
+================================================
.. automodule:: gensim.corpora.ucicorpus
- :synopsis: Corpus in University of California, Irvine (UCI) bag-of-words format
+ :synopsis: Corpus in UCI format
:members:
:inherited-members:
:undoc-members:
diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py
index e293c998a1..277df249e5 100644
--- a/gensim/corpora/lowcorpus.py
+++ b/gensim/corpora/lowcorpus.py
@@ -5,9 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-"""
-Corpus in GibbsLda++ format of List-Of-Words.
-"""
+"""Corpus in `GibbsLda++ format `_."""
from __future__ import with_statement
@@ -19,21 +17,33 @@
from six.moves import xrange, zip as izip
-logger = logging.getLogger('gensim.corpora.lowcorpus')
+logger = logging.getLogger(__name__)
def split_on_space(s):
+ """Split line by spaces, used in :class:`gensim.corpora.lowcorpus.LowCorpus`.
+
+ Parameters
+ ----------
+ s : str
+ Some line.
+
+ Returns
+ -------
+ list of str
+ List of tokens from `s`.
+
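+    Examples
+    --------
+    A small sketch; under Python 2 the returned tokens are `unicode` objects:
+
+    >>> from gensim.corpora.lowcorpus import split_on_space
+    >>> split_on_space('  hello    world ')
+    ['hello', 'world']
+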
+ """
return [word for word in utils.to_unicode(s).strip().split(' ') if word]
class LowCorpus(IndexedCorpus):
- """
- List_Of_Words corpus handles input in GibbsLda++ format.
+ """Corpus handles input in `GibbsLda++ format `_.
- Quoting http://gibbslda.sourceforge.net/#3.2_Input_Data_Format::
+ **Format description**
- Both data for training/estimating the model and new data (i.e., previously
- unseen data) have the same format as follows:
+ Both data for training/estimating the model and new data (i.e., previously unseen data) have the same format
+ as follows ::
[M]
[document1]
@@ -41,26 +51,45 @@ class LowCorpus(IndexedCorpus):
...
[documentM]
- in which the first line is the total number for documents [M]. Each line
- after that is one document. [documenti] is the ith document of the dataset
- that consists of a list of Ni words/terms.
+    in which the first line is the total number of documents [M]. Each line after that is one document.
+ [documenti] is the ith document of the dataset that consists of a list of Ni words/terms ::
[documenti] = [wordi1] [wordi2] ... [wordiNi]
- in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated
- by the blank character.
+ in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated by the blank character.
+
+ Examples
+ --------
+ >>> from gensim.test.utils import datapath, get_tmpfile, common_texts
+ >>> from gensim.corpora import LowCorpus
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> # Prepare needed data
+ >>> dictionary = Dictionary(common_texts)
+ >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts]
+ >>>
+ >>> # Write corpus in GibbsLda++ format to disk
+ >>> output_fname = get_tmpfile("corpus.low")
+ >>> LowCorpus.serialize(output_fname, corpus, dictionary)
+ >>>
+ >>> # Read corpus
+ >>> loaded_corpus = LowCorpus(output_fname)
+
"""
def __init__(self, fname, id2word=None, line2words=split_on_space):
"""
- Initialize the corpus from a file.
- `id2word` and `line2words` are optional parameters.
- If provided, `id2word` is a dictionary mapping between word_ids (integers)
- and words (strings). If not provided, the mapping is constructed from
- the documents.
+ Parameters
+ ----------
+ fname : str
+ Path to file in GibbsLda++ format.
+ id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
+ Mapping between word_ids (integers) and words (strings).
+ If not provided, the mapping is constructed directly from `fname`.
+ line2words : callable, optional
+            Function which converts lines (str) into tokens (list of str).
+            Defaults to :func:`~gensim.corpora.lowcorpus.split_on_space`.
- `line2words` is a function which converts lines into tokens. Defaults to
- simple splitting on spaces.
"""
IndexedCorpus.__init__(self, fname)
logger.info("loading corpus from %s", fname)
@@ -91,6 +120,14 @@ def __init__(self, fname, id2word=None, line2words=split_on_space):
)
def _calculate_num_docs(self):
+ """Get number of documents in file.
+
+ Returns
+ -------
+ int
+ Number of documents.
+
+ """
# the first line in input data is the number of documents (integer). throws exception on bad input.
with utils.smart_open(self.fname) as fin:
try:
@@ -104,6 +141,19 @@ def __len__(self):
return self.num_docs
def line2doc(self, line):
+ """Covert line into document in BoW format.
+
+ Parameters
+ ----------
+ line : str
+ Line from input file.
+
+ Returns
+ -------
+ list of (int, int)
+            Document in BoW format.
+
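+        Examples
+        --------
+        A minimal sketch; the actual ids depend on the word mapping stored in the file:
+
+        >>> from gensim.test.utils import datapath
+        >>> from gensim.corpora import LowCorpus
+        >>>
+        >>> corpus = LowCorpus(datapath("testcorpus.low"))
+        >>> doc = corpus.line2doc("human computer interface")
+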
+ """
words = self.line2words(line)
if self.use_wordids:
@@ -132,8 +182,13 @@ def line2doc(self, line):
return doc
def __iter__(self):
- """
- Iterate over the corpus, returning one bag-of-words vector at a time.
+ """Iterate over the corpus.
+
+ Yields
+ ------
+ list of (int, int)
+ Document in BoW format.
+
"""
with utils.smart_open(self.fname) as fin:
for lineno, line in enumerate(fin):
@@ -142,11 +197,31 @@ def __iter__(self):
@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
- """
- Save a corpus in the List-of-words format.
+ """Save a corpus in the GibbsLda++ format.
+
+ Warnings
+ --------
+        This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`;
+        don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.
+
+ Parameters
+ ----------
+ fname : str
+ Path to output file.
+ corpus : iterable of iterable of (int, int)
+ Corpus in BoW format.
+ id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
+ Mapping between word_ids (integers) and words (strings).
+ If not provided, the mapping is constructed directly from `corpus`.
+ metadata : bool, optional
+            This parameter is ignored.
+
+        Returns
+        -------
+        list of int
+            List of offsets in resulting file for each document (in bytes),
+            can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`.
- This function is automatically called by `LowCorpus.serialize`; don't
- call it directly, call `serialize` instead.
"""
if id2word is None:
logger.info("no word id mapping provided; initializing from corpus")
@@ -174,8 +249,29 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
return offsets
def docbyoffset(self, offset):
- """
- Return the document stored at file position `offset`.
+ """Get the document stored in file by `offset` position.
+
+ Parameters
+ ----------
+ offset : int
+            Offset (in bytes) to the beginning of the document.
+
+ Returns
+ -------
+ list of (int, int)
+ Document in BoW format.
+
+ Examples
+ --------
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.corpora import LowCorpus
+ >>>
+ >>> data = LowCorpus(datapath("testcorpus.low"))
+ >>> data.docbyoffset(1) # end of first line
+ []
+ >>> data.docbyoffset(2) # start of second line
+ [(0, 1), (3, 1), (4, 1)]
+
"""
with utils.smart_open(self.fname) as f:
f.seek(offset)
@@ -183,6 +279,7 @@ def docbyoffset(self, offset):
@property
def id2word(self):
+ """Get mapping between words and their ids."""
return self._id2word
@id2word.setter
diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py
index cacf0074bd..37d7fc0d9d 100644
--- a/gensim/corpora/malletcorpus.py
+++ b/gensim/corpora/malletcorpus.py
@@ -3,9 +3,7 @@
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-"""
-Corpus in Mallet format of List-Of-Words.
-"""
+"""Corpus in `Mallet format `_."""
from __future__ import with_statement
@@ -15,47 +13,114 @@
from gensim.corpora import LowCorpus
-logger = logging.getLogger('gensim.corpora.malletcorpus')
+logger = logging.getLogger(__name__)
class MalletCorpus(LowCorpus):
- """
- Quoting http://mallet.cs.umass.edu/import.php:
+ """Corpus handles input in `Mallet format `_.
+
+ **Format description**
- One file, one instance per line
- Assume the data is in the following format:
+ One file, one instance per line, assume the data is in the following format ::
[URL] [language] [text of the page...]
- Or, more generally,
+ Or, more generally, ::
+
[document #1 id] [label] [text of the document...]
[document #2 id] [label] [text of the document...]
...
[document #N id] [label] [text of the document...]
- Note that language/label is *not* considered in Gensim.
+    Note that language/label is *not* considered in Gensim; `__unknown__` is used as the default value.
+
+ Examples
+ --------
+ >>> from gensim.test.utils import datapath, get_tmpfile, common_texts
+ >>> from gensim.corpora import MalletCorpus
+ >>> from gensim.corpora import Dictionary
+ >>>
+ >>> # Prepare needed data
+ >>> dictionary = Dictionary(common_texts)
+ >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts]
+ >>>
+ >>> # Write corpus in Mallet format to disk
+ >>> output_fname = get_tmpfile("corpus.mallet")
+ >>> MalletCorpus.serialize(output_fname, corpus, dictionary)
+ >>>
+ >>> # Read corpus
+ >>> loaded_corpus = MalletCorpus(output_fname)
"""
def __init__(self, fname, id2word=None, metadata=False):
+ """
+
+ Parameters
+ ----------
+ fname : str
+ Path to file in Mallet format.
+ id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
+ Mapping between word_ids (integers) and words (strings).
+ If not provided, the mapping is constructed directly from `fname`.
+ metadata : bool, optional
+            If True, return additional information ("document id" and "lang") when you call
+            :meth:`~gensim.corpora.malletcorpus.MalletCorpus.line2doc`,
+            :meth:`~gensim.corpora.malletcorpus.MalletCorpus.__iter__` or
+            :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset`.
+
+ """
self.metadata = metadata
LowCorpus.__init__(self, fname, id2word)
def _calculate_num_docs(self):
+ """Get number of documents.
+
+ Returns
+ -------
+ int
+ Number of documents in file.
+
+ """
with utils.smart_open(self.fname) as fin:
result = sum(1 for _ in fin)
return result
def __iter__(self):
- """
- Iterate over the corpus at the given filename.
+ """Iterate over the corpus.
+
+ Yields
+ ------
+ list of (int, int)
+ Document in BoW format (+"document_id" and "lang" if metadata=True).
- Yields a bag-of-words, a.k.a list of tuples of (word id, word count), based on the given id2word dictionary.
"""
with utils.smart_open(self.fname) as f:
for line in f:
yield self.line2doc(line)
def line2doc(self, line):
+ """Covert line into document in BoW format.
+
+ Parameters
+ ----------
+ line : str
+ Line from input file.
+
+ Returns
+ -------
+ list of (int, int)
+ Document in BoW format (+"document_id" and "lang" if metadata=True).
+
+ Examples
+ --------
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.corpora import MalletCorpus
+ >>>
+ >>> corpus = MalletCorpus(datapath("testcorpus.mallet"))
+ >>> corpus.line2doc("en computer human interface")
+ [(3, 1), (4, 1)]
+
+ """
splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word]
docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:]
@@ -68,18 +133,39 @@ def line2doc(self, line):
@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
- """
- Save a corpus in the Mallet format.
-
+ """Save a corpus in the Mallet format.
+
+ Warnings
+ --------
+        This function is automatically called by :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize`;
+        don't call it directly, call :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize` instead.
+
+ Parameters
+ ----------
+ fname : str
+ Path to output file.
+ corpus : iterable of iterable of (int, int)
+ Corpus in BoW format.
+ id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
+ Mapping between word_ids (integers) and words (strings).
+ If not provided, the mapping is constructed directly from `corpus`.
+ metadata : bool, optional
+            If True, assume each document in `corpus` is a `(doc, (doc_id, doc_lang))` 2-tuple, and store
+            its id and language in the file; otherwise ids are generated by enumeration and the language
+            defaults to `__unknown__`.
+
+        Returns
+        -------
+        list of int
+            List of offsets in resulting file for each document (in bytes),
+            can be used for :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset`.
+
+ Notes
+ -----
The document id will be generated by enumerating the corpus.
That is, it will range between 0 and number of documents in the corpus.
Since Mallet has a language field in the format, this defaults to the string '__unknown__'.
If the language needs to be saved, post-processing will be required.
- This function is automatically called by `MalletCorpus.serialize`; don't
- call it directly, call `serialize` instead.
-
"""
if id2word is None:
logger.info("no word id mapping provided; initializing from corpus")
@@ -114,8 +200,29 @@ def save_corpus(fname, corpus, id2word=None, metadata=False):
return offsets
def docbyoffset(self, offset):
- """
- Return the document stored at file position `offset`.
+ """Get the document stored in file by `offset` position.
+
+ Parameters
+ ----------
+ offset : int
+            Offset (in bytes) to the beginning of the document.
+
+ Returns
+ -------
+ list of (int, int)
+ Document in BoW format (+"document_id" and "lang" if metadata=True).
+
+ Examples
+ --------
+ >>> from gensim.test.utils import datapath
+ >>> from gensim.corpora import MalletCorpus
+ >>>
+ >>> data = MalletCorpus(datapath("testcorpus.mallet"))
+ >>> data.docbyoffset(1) # end of first line
+ [(3, 1), (4, 1)]
+ >>> data.docbyoffset(4) # start of second line
+ [(4, 1)]
+
"""
with utils.smart_open(self.fname) as f:
f.seek(offset)
diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py
index 7f78f5ca91..41ad492570 100644
--- a/gensim/corpora/textcorpus.py
+++ b/gensim/corpora/textcorpus.py
@@ -3,27 +3,34 @@
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-"""
-Text corpora usually reside on disk, as text files in one format or another
-In a common scenario, we need to build a dictionary (a `word->integer id`
-mapping), which is then used to construct sparse bag-of-word vectors
-(= sequences of `(word_id, word_weight)` 2-tuples).
-
-This module provides some code scaffolding to simplify this pipeline. For
-example, given a corpus where each document is a separate line in file on disk,
-you would override the `TextCorpus.get_texts` method to read one line=document
-at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence
-of words.
-
-Overriding `get_texts` is enough; you can then initialize the corpus with e.g.
-`MyTextCorpus(bz2.BZ2File('mycorpus.txt.bz2'))` and it will behave correctly like a
-corpus of sparse vectors. The `__iter__` methods is automatically set up, and
-dictionary is automatically populated with all `word->id` mappings.
-
-The resulting object can be used as input to all gensim models (TFIDF, LSI, ...),
-serialized with any format (Matrix Market, SvmLight, Blei's LDA-C format etc).
-
-See the `gensim.test.test_miislita.CorpusMiislita` class for a simple example.
+"""Module provides some code scaffolding to simplify use of built dictionary for constructing BoW vectors.
+
+Notes
+-----
+Text corpora usually reside on disk, as text files in one format or another In a common scenario,
+we need to build a dictionary (a `word->integer id` mapping), which is then used to construct sparse bag-of-word vectors
+(= iterable of `(word_id, word_weight)`).
+
+This module provides some code scaffolding to simplify this pipeline. For example, given a corpus where each document
+is a separate line in file on disk, you would override the :meth:`gensim.corpora.textcorpus.TextCorpus.get_texts`
+to read one line=document at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence of words.
+
+Overriding :meth:`gensim.corpora.textcorpus.TextCorpus.get_texts` is enough, you can then initialize the corpus
+with e.g. `MyTextCorpus("mycorpus.txt.bz2")` and it will behave correctly like a corpus of sparse vectors.
+The :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__` method is automatically set up,
+and dictionary is automatically populated with all `word->id` mappings.
+
+The resulting object can be used as input to some of gensim models (:class:`~gensim.models.tfidfmodel.TfidfModel`,
+:class:`~gensim.models.lsimodel.LsiModel`, :class:`~gensim.models.ldamodel.LdaModel`, ...), serialized with any format
+(`Matrix Market <http://math.nist.gov/MatrixMarket/formats.html>`_,
+`SvmLight <http://svmlight.joachims.org/>`_, `Blei's LDA-C format <https://github.com/blei-lab/lda-c>`_, etc).
+
+
+See Also
+--------
+:class:`gensim.test.test_miislita.CorpusMiislita`
+ Good simple example.
+
"""
@@ -44,98 +51,179 @@
def remove_stopwords(tokens, stopwords=STOPWORDS):
- """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`."""
+ """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`.
+
+ Parameters
+ ----------
+ tokens : iterable of str
+ Sequence of tokens.
+ stopwords : iterable of str, optional
+        Sequence of stopwords.
+
+ Returns
+ -------
+ list of str
+ List of tokens without `stopwords`.
+
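+    Examples
+    --------
+    A quick sketch; the exact output depends on the contents of the stopword list:
+
+    >>> from gensim.corpora.textcorpus import remove_stopwords
+    >>> remove_stopwords(['the', 'cat', 'sat', 'on', 'the', 'mat'])
+    ['cat', 'sat', 'mat']
+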
+ """
return [token for token in tokens if token not in stopwords]
def remove_short(tokens, minsize=3):
- """Remove tokens smaller than `minsize` chars, which is 3 by default."""
+ """Remove tokens shorter than `minsize` chars.
+
+ Parameters
+ ----------
+ tokens : iterable of str
+ Sequence of tokens.
+    minsize : int, optional
+        Minimal length of token (inclusive).
+
+ Returns
+ -------
+ list of str
+ List of tokens without short tokens.
+
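+    Examples
+    --------
+    A short deterministic sketch:
+
+    >>> from gensim.corpora.textcorpus import remove_short
+    >>> remove_short(['a', 'bb', 'ccc', 'dddd'], minsize=3)
+    ['ccc', 'dddd']
+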
+ """
return [token for token in tokens if len(token) >= minsize]
def lower_to_unicode(text, encoding='utf8', errors='strict'):
- """Lowercase `text` and convert to unicode."""
+ """Lowercase `text` and convert to unicode, using :func:`gensim.utils.any2unicode`.
+
+ Parameters
+ ----------
+ text : str
+ Input text.
+ encoding : str, optional
+ Encoding that will be used for conversion.
+ errors : str, optional
+ Error handling behaviour, used as parameter for `unicode` function (python2 only).
+
+ Returns
+ -------
+ str
+ Unicode version of `text`.
+
+ See Also
+ --------
+ :func:`gensim.utils.any2unicode`
+ Convert any string to unicode-string.
+
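+    Examples
+    --------
+    A small sketch; the repr is shown for Python 3 (Python 2 shows a `u'...'` literal):
+
+    >>> from gensim.corpora.textcorpus import lower_to_unicode
+    >>> lower_to_unicode(b'HELLO World')
+    'hello world'
+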
+ """
return utils.to_unicode(text.lower(), encoding, errors)
def strip_multiple_whitespaces(s):
- """Collapse multiple whitespace characters into a single space."""
+ """Collapse multiple whitespace characters into a single space.
+
+ Parameters
+ ----------
+ s : str
+        Input string.
+
+ Returns
+ -------
+ str
+ String with collapsed whitespaces.
+
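+    Examples
+    --------
+    For instance, runs of spaces collapse to a single space:
+
+    >>> from gensim.corpora.textcorpus import strip_multiple_whitespaces
+    >>> strip_multiple_whitespaces('hello   world  !')
+    'hello world !'
+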
+ """
return RE_WHITESPACE.sub(" ", s)
class TextCorpus(interfaces.CorpusABC):
- """Helper class to simplify the pipeline of getting bag-of-words vectors (= a
- gensim corpus) from plain text.
+ """Helper class to simplify the pipeline of getting BoW vectors from plain text.
- This is an abstract base class: override the `get_texts()` and `__len__()`
- methods to match your particular input.
+ Notes
+ -----
+ This is an abstract base class: override the :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` and
+ :meth:`~gensim.corpora.textcorpus.TextCorpus.__len__` methods to match your particular input.
- Given a filename (or a file-like object) in constructor, the corpus object
- will be automatically initialized with a dictionary in `self.dictionary` and
- will support the `iter` corpus method. You have a few different ways of utilizing
- this class via subclassing or by construction with different preprocessing arguments.
+ Given a filename (or a file-like object) in constructor, the corpus object will be automatically initialized
+ with a dictionary in `self.dictionary` and will support the :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__`
+ corpus method. You have a few different ways of utilizing this class via subclassing or by construction with
+ different preprocessing arguments.
- The `iter` method converts the lists of tokens produced by `get_texts` to BoW format
- using `Dictionary.doc2bow`. `get_texts` does the following:
+ The :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__` method converts the lists of tokens produced by
+ :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` to BoW format using
+ :meth:`gensim.corpora.dictionary.Dictionary.doc2bow`.
- 1. Calls `getstream` to get a generator over the texts. It yields each document in
- turn from the underlying text file or files.
- 2. For each document from the stream, calls `preprocess_text` to produce a list of
- tokens; if metadata is enabled, it yields a 2-`tuple` with the document number as
- the second element.
+ :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` does the following:
+ #. Calls :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream` to get a generator over the texts.
+ It yields each document in turn from the underlying text file or files.
+ #. For each document from the stream, calls :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` to produce
+ a list of tokens. If metadata=True, it yields a 2-`tuple` with the document number as the second element.
Preprocessing consists of 0+ `character_filters`, a `tokenizer`, and 0+ `token_filters`.
- The preprocessing consists of calling each filter in `character_filters` with the document
- text; unicode is not guaranteed, and if desired, the first filter should convert to unicode.
- The output of each character filter should be another string. The output from the final
- filter is fed to the `tokenizer`, which should split the string into a list of tokens (strings).
- Afterwards, the list of tokens is fed through each filter in `token_filters`. The final
- output returned from `preprocess_text` is the output from the final token filter.
+ The preprocessing consists of calling each filter in `character_filters` with the document text.
+ Unicode is not guaranteed, and if desired, the first filter should convert to unicode.
+ The output of each character filter should be another string. The output from the final filter is fed
+ to the `tokenizer`, which should split the string into a list of tokens (strings).
+ Afterwards, the list of tokens is fed through each filter in `token_filters`. The final output returned from
+ :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` is the output from the final token filter.
So to use this class, you can either pass in different preprocessing functions using the
`character_filters`, `tokenizer`, and `token_filters` arguments, or you can subclass it.
- If subclassing: override `getstream` to take text from different input sources in different
- formats. Overrride `preprocess_text` if you must provide different initial preprocessing,
- then call the `TextCorpus.preprocess_text` method to apply the normal preprocessing. You
- can also overrride `get_texts` in order to tag the documents (token lists) with different
- metadata.
+
+ If subclassing: override :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream` to take text from different input
+ sources in different formats.
+ Override :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` if you must provide different initial
+ preprocessing, then call the :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` method to apply
+ the normal preprocessing.
+ You can also override :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` in order to tag the documents
+ (token lists) with different metadata.
The default preprocessing consists of:
- 1. lowercase and convert to unicode; assumes utf8 encoding
- 2. deaccent (asciifolding)
- 3. collapse multiple whitespaces into a single one
- 4. tokenize by splitting on whitespace
- 5. remove words less than 3 characters long
- 6. remove stopwords; see `gensim.parsing.preprocessing` for the list of stopwords
+ #. :func:`~gensim.corpora.textcorpus.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding)
+    #. :func:`~gensim.utils.deaccent` - deaccent (asciifolding)
+ #. :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces` - collapse multiple whitespaces into a single one
+ #. :func:`~gensim.utils.simple_tokenize` - tokenize by splitting on whitespace
+ #. :func:`~gensim.corpora.textcorpus.remove_short` - remove words less than 3 characters long
+ #. :func:`~gensim.corpora.textcorpus.remove_stopwords` - remove stopwords
"""
def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None,
tokenizer=None, token_filters=None):
"""
- Args:
- input (str): path to top-level directory to traverse for corpus documents.
- dictionary (Dictionary): if a dictionary is provided, it will not be updated
- with the given corpus on initialization. If none is provided, a new dictionary
- will be built for the given corpus. If no corpus is given, the dictionary will
- remain uninitialized.
- metadata (bool): True to yield metadata with each document, else False (default).
- character_filters (iterable of callable): each will be applied to the text of each
- document in order, and should return a single string with the modified text.
- For Python 2, the original text will not be unicode, so it may be useful to
- convert to unicode as the first character filter. The default character filters
- lowercase, convert to unicode (strict utf8), perform ASCII-folding, then collapse
- multiple whitespaces.
- tokenizer (callable): takes as input the document text, preprocessed by all filters
- in `character_filters`; should return an iterable of tokens (strings).
- token_filters (iterable of callable): each will be applied to the iterable of tokens
- in order, and should return another iterable of tokens. These filters can add,
- remove, or replace tokens, or do nothing at all. The default token filters
- remove tokens less than 3 characters long and remove stopwords using the list
- in `gensim.parsing.preprocessing.STOPWORDS`.
+
+ Parameters
+ ----------
+ input : str, optional
+            Path to top-level directory (or file) to traverse for corpus documents.
+ dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
+ If a dictionary is provided, it will not be updated with the given corpus on initialization.
+ If None - new dictionary will be built for the given corpus.
+ If `input` is None, the dictionary will remain uninitialized.
+ metadata : bool, optional
+ If True - yield metadata with each document.
+ character_filters : iterable of callable, optional
+ Each will be applied to the text of each document in order, and should return a single string with
+ the modified text. For Python 2, the original text will not be unicode, so it may be useful to
+ convert to unicode as the first character filter.
+ If None - using :func:`~gensim.corpora.textcorpus.lower_to_unicode`,
+ :func:`~gensim.utils.deaccent` and :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces`.
+ tokenizer : callable, optional
+ Tokenizer for document, if None - using :func:`~gensim.utils.simple_tokenize`.
+ token_filters : iterable of callable, optional
+ Each will be applied to the iterable of tokens in order, and should return another iterable of tokens.
+ These filters can add, remove, or replace tokens, or do nothing at all.
+ If None - using :func:`~gensim.corpora.textcorpus.remove_short` and
+ :func:`~gensim.corpora.textcorpus.remove_stopwords`.
+
+ Examples
+ --------
+ >>> #TODO Example with inheritance
+ >>> from gensim.corpora.textcorpus import TextCorpus
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> corpus = TextCorpus(datapath('head500.noblanks.cor.bz2'))
+ >>> for bow in corpus:
+ ... pass
+
"""
self.input = input
self.metadata = metadata
@@ -157,9 +245,18 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter
self.init_dictionary(dictionary)
def init_dictionary(self, dictionary):
- """If `dictionary` is None, initialize to an empty Dictionary, and then if there
- is an `input` for the corpus, add all documents from that `input`. If the
- `dictionary` is already initialized, simply set it as the corpus's `dictionary`.
+ """Initialize/update dictionary.
+
+ Parameters
+ ----------
+ dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
+ If a dictionary is provided, it will not be updated with the given corpus on initialization.
+ If None - new dictionary will be built for the given corpus.
+
+ Notes
+ -----
+        If `self.input` is None, this method does nothing.
+
"""
self.dictionary = dictionary if dictionary is not None else Dictionary()
if self.input is not None:
@@ -175,9 +272,13 @@ def init_dictionary(self, dictionary):
logger.warning("No input document stream provided; assuming dictionary will be initialized some other way.")
def __iter__(self):
- """The function that defines a corpus.
+ """Iterate over the corpus.
+
+ Yields
+ ------
+ list of (int, int)
+ Document in BoW format (+ metadata if self.metadata).
- Iterating over the corpus must yield sparse vectors, one for each document.
"""
if self.metadata:
for text, metadata in self.get_texts():
@@ -187,9 +288,17 @@ def __iter__(self):
yield self.dictionary.doc2bow(text, allow_update=False)
def getstream(self):
- """Yield documents from the underlying plain text collection (of one or more files).
- Each item yielded from this method will be considered a document by subsequent
- preprocessing methods.
+ """Generate documents from the underlying plain text collection (of one or more files).
+
+ Yields
+ ------
+ str
+ Document read from plain-text file.
+
+ Notes
+ -----
+        Once the generator is exhausted, the `self.length` attribute is initialized.
+
"""
num_texts = 0
with utils.file_or_filename(self.input) as f:
@@ -200,14 +309,18 @@ def getstream(self):
self.length = num_texts
def preprocess_text(self, text):
- """Apply preprocessing to a single text document. This should perform tokenization
- in addition to any other desired preprocessing steps.
+ """Apply `self.character_filters`, `self.tokenizer`, `self.token_filters` to a single text document.
+
+ Parameters
+        ----------
+ text : str
+ Document read from plain-text file.
- Args:
- text (str): document text read from plain-text file.
+        Returns
+        -------
+ list of str
+ List of tokens extracted from `text`.
- Returns:
- iterable of str: tokens produced from `text` as a result of preprocessing.
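+
+        Examples
+        --------
+        A sketch using the default preprocessing pipeline; the exact output depends on the stopword list:
+
+        >>> from gensim.corpora.textcorpus import TextCorpus
+        >>>
+        >>> corpus = TextCorpus()  # empty corpus with default filters and tokenizer
+        >>> corpus.preprocess_text('The Quick   Brown Fox!')
+        ['quick', 'brown', 'fox']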
"""
for character_filter in self.character_filters:
text = character_filter(text)
@@ -219,8 +332,22 @@ def preprocess_text(self, text):
return tokens
def step_through_preprocess(self, text):
- """Yield tuples of functions and their output for each stage of preprocessing.
+ """Apply preprocessor one by one and generate result.
+
+ Warnings
+ --------
This is useful for debugging issues with the corpus preprocessing pipeline.
+
+ Parameters
+ ----------
+ text : str
+ Document text read from plain-text file.
+
+ Yields
+ ------
+ (callable, object)
+            Pre-processor and its output computed from `text`.
+
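+        Examples
+        --------
+        A debugging sketch that walks through each stage:
+
+        >>> from gensim.corpora.textcorpus import TextCorpus
+        >>>
+        >>> corpus = TextCorpus()  # empty corpus with default preprocessing
+        >>> for preprocessor, result in corpus.step_through_preprocess('The Quick Brown Fox!'):
+        ...     pass  # inspect `result` after each stage
+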
"""
for character_filter in self.character_filters:
text = character_filter(text)
@@ -233,16 +360,13 @@ def step_through_preprocess(self, text):
yield (token_filter, token_filter(tokens))
def get_texts(self):
- """Iterate over the collection, yielding one document at a time. A document
- is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`.
- Each document will be fed through `preprocess_text`. That method should be
- overridden to provide different preprocessing steps. This method will need
- to be overridden if the metadata you'd like to yield differs from the line
- number.
-
- Returns:
- generator of lists of tokens (strings); each list corresponds to a preprocessed
- document from the corpus `input`.
+ """Generate documents from corpus.
+
+ Yields
+ ------
+ list of str
+            Document as a sequence of tokens (plus lineno if `self.metadata` is True).
+
"""
lines = self.getstream()
if self.metadata:
@@ -253,25 +377,34 @@ def get_texts(self):
yield self.preprocess_text(line)
def sample_texts(self, n, seed=None, length=None):
- """Yield n random documents from the corpus without replacement.
-
+ """Generate `n` random documents from the corpus without replacement.
+
+ Parameters
+ ----------
+ n : int
+ Number of documents we want to sample.
+ seed : int, optional
+ If specified, use it as a seed for local random generator.
+ length : int, optional
+            Value to use as the corpus length (calculating the length of the corpus can be a costly operation).
+            If not specified, `__len__` will be called.
+
+ Raises
+ ------
+ ValueError
+            If `n` is less than zero or greater than the corpus size.
+
+ Notes
+ -----
Given the number of remaining documents in a corpus, we need to choose n elements.
- The probability for the current element to be chosen is n/remaining.
- If we choose it, we just decrease the n and move to the next element.
- Computing the corpus length may be a costly operation so you can use the optional
- parameter `length` instead.
-
- Args:
- n (int): number of documents we want to sample.
- seed (int|None): if specified, use it as a seed for local random generator.
- length (int|None): if specified, use it as a guess of corpus length.
- It must be positive and not greater than actual corpus length.
-
- Yields:
- list[str]: document represented as a list of tokens. See get_texts method.
-
- Raises:
- ValueError: when n is invalid or length was set incorrectly.
+ The probability for the current element to be chosen is `n` / remaining. If we choose it, we just decrease
+ the `n` and move to the next element.
+
+ Yields
+ ------
+ list of str
+ Sampled document as sequence of tokens.
+
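+        Examples
+        --------
+        A minimal sketch, reusing the test corpus from the class example:
+
+        >>> from gensim.test.utils import datapath
+        >>> from gensim.corpora.textcorpus import TextCorpus
+        >>>
+        >>> corpus = TextCorpus(datapath('head500.noblanks.cor.bz2'))
+        >>> sample = list(corpus.sample_texts(n=3, seed=42))
+        >>> len(sample)
+        3
+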
"""
random_generator = random if seed is None else random.Random(seed)
if length is None:
@@ -302,6 +435,19 @@ def sample_texts(self, n, seed=None, length=None):
raise ValueError("length {0:d} greater than number of documents in corpus {1:d}".format(length, i + 1))
def __len__(self):
+ """Get length of corpus
+
+ Warnings
+ --------
+ If self.length is None - will read all corpus for calculate this attribute through
+ :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream`.
+
+ Returns
+ -------
+ int
+ Length of corpus.
+
+ """
if self.length is None:
# cache the corpus length
self.length = sum(1 for _ in self.getstream())
@@ -309,28 +455,39 @@ def __len__(self):
class TextDirectoryCorpus(TextCorpus):
- """Read documents recursively from a directory,
- where each file (or line of each file) is interpreted as a plain text document.
+ """Read documents recursively from a directory.
+    Each file/line (depending on `lines_are_documents`) is interpreted as a plain text document.
+
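+    Examples
+    --------
+    A hypothetical sketch; `/path/to/corpus` stands in for any folder of plain-text files:
+
+    >>> from gensim.corpora.textcorpus import TextDirectoryCorpus
+    >>>
+    >>> corpus = TextDirectoryCorpus('/path/to/corpus')  # each file is one document
+    >>> for bow in corpus:
+    ...     pass
+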
"""
def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None,
pattern=None, exclude_pattern=None, lines_are_documents=False, **kwargs):
"""
- Args:
- min_depth (int): minimum depth in directory tree at which to begin searching for
- files. The default is 0, which means files starting in the top-level directory
- `input` will be considered.
- max_depth (int): max depth in directory tree at which files will no longer be
- considered. The default is None, which means recurse through all subdirectories.
- pattern (str or Pattern): regex to use for file name inclusion; all those files *not*
- matching this pattern will be ignored.
- exclude_pattern (str or Pattern): regex to use for file name exclusion; all files
- matching this pattern will be ignored.
- lines_are_documents (bool): if True, each line of each file is considered to be a
- document. If False (default), each file is considered to be a document.
- kwargs: keyword arguments passed through to the `TextCorpus` constructor. This is
- in addition to the non-kwargs `input`, `dictionary`, and `metadata`. See
- `TextCorpus.__init__` docstring for more details on these.
+
+ Parameters
+ ----------
+ input : str
+ Path to input file/folder.
+ dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
+ If a dictionary is provided, it will not be updated with the given corpus on initialization.
+ If None - new dictionary will be built for the given corpus.
+ If `input` is None, the dictionary will remain uninitialized.
+ metadata : bool, optional
+ If True - yield metadata with each document.
+ min_depth : int, optional
+ Minimum depth in directory tree at which to begin searching for files.
+ max_depth : int, optional
+ Max depth in directory tree at which files will no longer be considered.
+ If None - not limited.
+ pattern : str, optional
+ Regex to use for file name inclusion, all those files *not* matching this pattern will be ignored.
+ exclude_pattern : str, optional
+ Regex to use for file name exclusion, all files matching this pattern will be ignored.
+ lines_are_documents : bool, optional
+ If True - each line is considered a document, otherwise - each file is one document.
+        kwargs : keyword arguments passed through to the `TextCorpus` constructor.
+            See :meth:`gensim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these.
+
"""
self._min_depth = min_depth
self._max_depth = sys.maxsize if max_depth is None else max_depth
@@ -385,9 +542,14 @@ def max_depth(self, max_depth):
self.length = None
def iter_filepaths(self):
- """Lazily yield paths to each file in the directory structure within the specified
- range of depths. If a filename pattern to match was given, further filter to only
- those filenames that match.
+ """Generate (lazily) paths to each file in the directory structure within the specified range of depths.
+ If a filename pattern to match was given, further filter to only those filenames that match.
+
+ Yields
+ ------
+ str
+            Path to file.
+
"""
for depth, dirpath, dirnames, filenames in walk(self.input):
if self.min_depth <= depth <= self.max_depth:
@@ -400,12 +562,13 @@ def iter_filepaths(self):
yield os.path.join(dirpath, name)
def getstream(self):
- """Yield documents from the underlying plain text collection (of one or more files).
- Each item yielded from this method will be considered a document by subsequent
- preprocessing methods.
+ """Generate documents from the underlying plain text collection (of one or more files).
+
+ Yields
+ ------
+ str
+            One line of one file if `lines_are_documents` is True, otherwise the entire contents of one file.
- If `lines_are_documents` was set to True, items will be lines from files. Otherwise
- there will be one item per file, containing the entire contents of the file.
"""
num_texts = 0
for path in self.iter_filepaths():
@@ -421,11 +584,20 @@ def getstream(self):
self.length = num_texts
def __len__(self):
+ """Get length of corpus.
+
+ Returns
+ -------
+ int
+ Length of corpus.
+
+ """
if self.length is None:
self._cache_corpus_length()
return self.length
def _cache_corpus_length(self):
+ """Calculate length of corpus and cache it to `self.length`."""
if not self.lines_are_documents:
self.length = sum(1 for _ in self.iter_filepaths())
else:
@@ -433,9 +605,40 @@ def _cache_corpus_length(self):
def walk(top, topdown=True, onerror=None, followlinks=False, depth=0):
- """This is a mostly copied version of `os.walk` from the Python 2 source code.
+ """Generate the file names in a directory tree by walking the tree either top-down or bottom-up.
+ For each directory in the tree rooted at directory top (including top itself), it yields a 4-tuple
+ (depth, dirpath, dirnames, filenames).
+
+ Parameters
+ ----------
+ top : str
+ Root directory.
+ topdown : bool, optional
+        If True, walk the tree top-down; `dirnames` can then be modified in-place to prune the search.
+    onerror : callable, optional
+        Function that will be called with one argument, an OSError instance.
+        It can report the error and continue with the walk, or raise the exception to abort the walk.
+        Note that the filename is available as the `filename` attribute of the exception object.
+ followlinks : bool, optional
+ If True - visit directories pointed to by symlinks, on systems that support them.
+ depth : int, optional
+        Current depth in the directory tree; don't pass it manually (it is used as an accumulator for recursion).
+
+ Notes
+ -----
+ This is a mostly copied version of `os.walk` from the Python 2 source code.
The only difference is that it returns the depth in the directory tree structure
at which each yield is taking place.
+
+ Yields
+ ------
+ (int, str, list of str, list of str)
+ Depth, current path, visited directories, visited non-directories.
+
+ See Also
+ --------
+    `os.walk documentation <https://docs.python.org/2/library/os.html#os.walk>`_
+
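+    Examples
+    --------
+    A short sketch; any existing directory path will do:
+
+    >>> from gensim.corpora.textcorpus import walk
+    >>>
+    >>> for depth, dirpath, dirnames, filenames in walk('.'):
+    ...     pass  # e.g. prune the search by checking `depth`
+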
"""
islink, join, isdir = os.path.islink, os.path.join, os.path.isdir
diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py
index a8911ee07f..fa2de4ce77 100644
--- a/gensim/corpora/ucicorpus.py
+++ b/gensim/corpora/ucicorpus.py
@@ -5,11 +5,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-"""
-University of California, Irvine (UCI) Bag-of-Words format.
-
-http://archive.ics.uci.edu/ml/datasets/Bag+of+Words
-"""
+"""Corpus in `UCI format `_."""
from __future__ import with_statement
@@ -24,16 +20,19 @@
from six.moves import xrange
-logger = logging.getLogger('gensim.corpora.ucicorpus')
+logger = logging.getLogger(__name__)
class UciReader(MmReader):
+ """Reader of UCI format for :class:`gensim.corpora.ucicorpus.UciCorpus`."""
def __init__(self, input):
"""
- Initialize the reader.
- The `input` parameter refers to a file on the local filesystem,
- which is expected to be in the UCI Bag-of-Words format.
+ Parameters
+ ----------
+ input : str
+ Path to file in UCI format.
+
"""
logger.info('Initializing corpus reader from %s', input)
@@ -55,30 +54,34 @@ def __init__(self, input):
)
def skip_headers(self, input_file):
+ """Skip headers in `input_file`.
+
+ Parameters
+ ----------
+ input_file : file
+ File object.
+
+ """
for lineno, _ in enumerate(input_file):
if lineno == 2:
break
class UciWriter(MmWriter):
- """
- Store a corpus in UCI Bag-of-Words format.
-
- This corpus format is identical to MM format, except for
- different file headers. There is no format line, and the first
- three lines of the file contain number_docs, num_terms, and num_nnz,
- one value per line.
+ """Writer of UCI format for :class:`gensim.corpora.ucicorpus.UciCorpus`.
- This implementation is based on matutils.MmWriter, and works the same way.
+ Notes
+    -----
+    This corpus format is identical to the `Matrix Market format <http://math.nist.gov/MatrixMarket/formats.html>`_,
+ except for different file headers. There is no format line, and the first three lines of the file
+ contain `number_docs`, `num_terms`, and `num_nnz`, one value per line.
"""
MAX_HEADER_LENGTH = 20 # reserve 20 bytes per header value
FAKE_HEADER = utils.to_utf8(' ' * MAX_HEADER_LENGTH + '\n')
def write_headers(self):
- """
- Write blank header lines. Will be updated later, once corpus stats are known.
- """
+ """Write blank header lines. Will be updated later, once corpus stats are known."""
for _ in range(3):
self.fout.write(self.FAKE_HEADER)
@@ -86,9 +89,7 @@ def write_headers(self):
self.headers_written = True
def update_headers(self, num_docs, num_terms, num_nnz):
- """
- Update headers with actual values.
- """
+ """Update headers with actual values."""
offset = 0
values = [utils.to_utf8(str(n)) for n in [num_docs, num_terms, num_nnz]]
@@ -101,6 +102,25 @@ def update_headers(self, num_docs, num_terms, num_nnz):
@staticmethod
def write_corpus(fname, corpus, progress_cnt=1000, index=False):
+ """Write corpus in file.
+
+ Parameters
+ ----------
+ fname : str
+ Path to output file.
+ corpus: iterable of list of (int, int)
+ Corpus in BoW format.
+ progress_cnt : int, optional
+            Progress counter, write a log message every `progress_cnt` documents.
+ index : bool, optional
+            If True, return offsets of the documents, otherwise return nothing.
+
+        Returns
+        -------
+ list of int
+ Sequence of offsets to documents (in bytes), only if index=True.
+
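+        Examples
+        --------
+        A minimal sketch (assumes `common_corpus` from the test utilities):
+
+        >>> from gensim.test.utils import get_tmpfile, common_corpus
+        >>> from gensim.corpora.ucicorpus import UciWriter
+        >>>
+        >>> offsets = UciWriter.write_corpus(get_tmpfile('corpus.uci'), common_corpus, index=True)
+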
+ """
writer = UciWriter(fname)
writer.write_headers()
@@ -139,10 +159,26 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False):
class UciCorpus(UciReader, IndexedCorpus):
- """
- Corpus in the UCI bag-of-words format.
- """
+ """Corpus in the UCI bag-of-words format."""
def __init__(self, fname, fname_vocab=None):
+ """
+ Parameters
+ ----------
+ fname : str
+ Path to corpus in UCI format.
+        fname_vocab : str, optional
+            Path to vocabulary file.
+
+ Examples
+ --------
+ >>> from gensim.corpora import UciCorpus
+ >>> from gensim.test.utils import datapath
+ >>>
+ >>> corpus = UciCorpus(datapath('testcorpus.uci'))
+ >>> for document in corpus:
+ ... pass
+
+ """
IndexedCorpus.__init__(self, fname)
UciReader.__init__(self, fname)
@@ -157,17 +193,32 @@ def __init__(self, fname, fname_vocab=None):
self.transposed = True
def __iter__(self):
- """
- Interpret a matrix in UCI bag-of-words format as a streamed gensim corpus
- (yielding one document at a time).
+ """Iterate over the corpus.
+
+ Yields
+ ------
+ list of (int, int)
+ Document in BoW format.
+
"""
for docId, doc in super(UciCorpus, self).__iter__():
yield doc # get rid of docId, return the sparse vector only
def create_dictionary(self):
- """
- Utility method to generate gensim-style Dictionary directly from
- the corpus and vocabulary data.
+ """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data.
+
+        Returns
+        -------
+ :class:`gensim.corpora.dictionary.Dictionary`
+            Dictionary, built from the corpus and vocabulary data.
+
+ Examples
+ --------
+ >>> from gensim.corpora.ucicorpus import UciCorpus
+ >>> from gensim.test.utils import datapath
+ >>> ucc = UciCorpus(datapath('testcorpus.uci'))
+ >>> dictionary = ucc.create_dictionary()
+
"""
dictionary = Dictionary()
@@ -193,14 +244,30 @@ def create_dictionary(self):
@staticmethod
def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False):
- """
- Save a corpus in the UCI Bag-of-Words format.
-
- There are actually two files saved: `fname` and `fname.vocab`, where
- `fname.vocab` is the vocabulary file.
+ """Save a corpus in the UCI Bag-of-Words format.
+
+ Warnings
+ --------
+        This function is automatically called by :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize`;
+        don't call it directly, call :meth:`gensim.corpora.ucicorpus.UciCorpus.serialize` instead.
+
+ Parameters
+ ----------
+ fname : str
+ Path to output file.
+ corpus: iterable of iterable of (int, int)
+ Corpus in BoW format.
+ id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional
+ Mapping between words and their ids. If None - will be inferred from `corpus`.
+ progress_cnt : int, optional
+            Progress counter, write a log message every `progress_cnt` documents.
+ metadata : bool, optional
+            This parameter is ignored.
+
+ Notes
+ -----
+ There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file.
- This function is automatically called by `UciCorpus.serialize`; don't
- call it directly, call `serialize` instead.
"""
if id2word is None:
logger.info("no word id mapping provided; initializing from corpus")