diff --git a/docs/src/corpora/lowcorpus.rst b/docs/src/corpora/lowcorpus.rst index 5d084b7c58..05ed448893 100644 --- a/docs/src/corpora/lowcorpus.rst +++ b/docs/src/corpora/lowcorpus.rst @@ -1,8 +1,8 @@ -:mod:`corpora.lowcorpus` -- Corpus in List-of-Words format -=========================================================== +:mod:`corpora.lowcorpus` -- Corpus in GibbsLda++ format +======================================================= .. automodule:: gensim.corpora.lowcorpus - :synopsis: Corpus in List-of-Words format + :synopsis: Corpus in GibbsLda++ format :members: :inherited-members: :undoc-members: diff --git a/docs/src/corpora/malletcorpus.rst b/docs/src/corpora/malletcorpus.rst index 184b832dc5..72fcb472b7 100644 --- a/docs/src/corpora/malletcorpus.rst +++ b/docs/src/corpora/malletcorpus.rst @@ -1,8 +1,8 @@ -:mod:`corpora.malletcorpus` -- Corpus in Mallet format of List-Of-Words. -======================================================================== +:mod:`corpora.malletcorpus` -- Corpus in Mallet format +====================================================== .. automodule:: gensim.corpora.malletcorpus - :synopsis: Corpus in Mallet format of List-Of-Words. + :synopsis: Corpus in Mallet format. :members: :inherited-members: :undoc-members: diff --git a/docs/src/corpora/textcorpus.rst b/docs/src/corpora/textcorpus.rst index f0f3598a75..7fc346edba 100644 --- a/docs/src/corpora/textcorpus.rst +++ b/docs/src/corpora/textcorpus.rst @@ -1,8 +1,8 @@ -:mod:`corpora.textcorpus` -- Building corpora with dictionaries -================================================================= +:mod:`corpora.textcorpus` -- Tools for building corpora with dictionaries +========================================================================= .. 
automodule:: gensim.corpora.textcorpus - :synopsis: Building corpora with dictionaries + :synopsis: Tools for building corpora with dictionaries :members: :inherited-members: :undoc-members: diff --git a/docs/src/corpora/ucicorpus.rst b/docs/src/corpora/ucicorpus.rst index 75f6908478..37f753e5ef 100644 --- a/docs/src/corpora/ucicorpus.rst +++ b/docs/src/corpora/ucicorpus.rst @@ -1,8 +1,8 @@ -:mod:`corpora.ucicorpus` -- Corpus in UCI bag-of-words format -============================================================================================================== +:mod:`corpora.ucicorpus` -- Corpus in UCI format +================================================ .. automodule:: gensim.corpora.ucicorpus - :synopsis: Corpus in University of California, Irvine (UCI) bag-of-words format + :synopsis: Corpus in UCI format :members: :inherited-members: :undoc-members: diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index e293c998a1..277df249e5 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -5,9 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Corpus in GibbsLda++ format of List-Of-Words. -""" +"""Corpus in `GibbsLda++ format `_.""" from __future__ import with_statement @@ -19,21 +17,33 @@ from six.moves import xrange, zip as izip -logger = logging.getLogger('gensim.corpora.lowcorpus') +logger = logging.getLogger(__name__) def split_on_space(s): + """Split line by spaces, used in :class:`gensim.corpora.lowcorpus.LowCorpus`. + + Parameters + ---------- + s : str + Some line. + + Returns + ------- + list of str + List of tokens from `s`. + + """ return [word for word in utils.to_unicode(s).strip().split(' ') if word] class LowCorpus(IndexedCorpus): - """ - List_Of_Words corpus handles input in GibbsLda++ format. + """Corpus handles input in `GibbsLda++ format `_. 
- Quoting http://gibbslda.sourceforge.net/#3.2_Input_Data_Format:: + **Format description** - Both data for training/estimating the model and new data (i.e., previously - unseen data) have the same format as follows: + Both data for training/estimating the model and new data (i.e., previously unseen data) have the same format + as follows :: [M] [document1] @@ -41,26 +51,45 @@ class LowCorpus(IndexedCorpus): ... [documentM] - in which the first line is the total number for documents [M]. Each line - after that is one document. [documenti] is the ith document of the dataset - that consists of a list of Ni words/terms. + in which the first line is the total number for documents [M]. Each line after that is one document. + [documenti] is the ith document of the dataset that consists of a list of Ni words/terms :: [documenti] = [wordi1] [wordi2] ... [wordiNi] - in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated - by the blank character. + in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated by the blank character. + + Examples + -------- + >>> from gensim.test.utils import datapath, get_tmpfile, common_texts + >>> from gensim.corpora import LowCorpus + >>> from gensim.corpora import Dictionary + >>> + >>> # Prepare needed data + >>> dictionary = Dictionary(common_texts) + >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts] + >>> + >>> # Write corpus in GibbsLda++ format to disk + >>> output_fname = get_tmpfile("corpus.low") + >>> LowCorpus.serialize(output_fname, corpus, dictionary) + >>> + >>> # Read corpus + >>> loaded_corpus = LowCorpus(output_fname) + """ def __init__(self, fname, id2word=None, line2words=split_on_space): """ - Initialize the corpus from a file. - `id2word` and `line2words` are optional parameters. - If provided, `id2word` is a dictionary mapping between word_ids (integers) - and words (strings). If not provided, the mapping is constructed from - the documents. 
+ Parameters + ---------- + fname : str + Path to file in GibbsLda++ format. + id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional + Mapping between word_ids (integers) and words (strings). + If not provided, the mapping is constructed directly from `fname`. + line2words : callable, optional + Function which converts lines(str) into tokens(list of str), + using :func:`~gensim.corpora.lowcorpus.split_on_space` as default. - `line2words` is a function which converts lines into tokens. Defaults to - simple splitting on spaces. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) @@ -91,6 +120,14 @@ def __init__(self, fname, id2word=None, line2words=split_on_space): ) def _calculate_num_docs(self): + """Get number of documents in file. + + Returns + ------- + int + Number of documents. + + """ # the first line in input data is the number of documents (integer). throws exception on bad input. with utils.smart_open(self.fname) as fin: try: @@ -104,6 +141,19 @@ def __len__(self): return self.num_docs def line2doc(self, line): + """Covert line into document in BoW format. + + Parameters + ---------- + line : str + Line from input file. + + Returns + ------- + list of (int, int) + Document in BoW format + + """ words = self.line2words(line) if self.use_wordids: @@ -132,8 +182,13 @@ def line2doc(self, line): return doc def __iter__(self): - """ - Iterate over the corpus, returning one bag-of-words vector at a time. + """Iterate over the corpus. + + Yields + ------ + list of (int, int) + Document in BoW format. + """ with utils.smart_open(self.fname) as fin: for lineno, line in enumerate(fin): @@ -142,11 +197,31 @@ def __iter__(self): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """ - Save a corpus in the List-of-words format. + """Save a corpus in the GibbsLda++ format. 
+ + Warnings + -------- + This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`, + don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead. + + Parameters + ---------- + fname : str + Path to output file. + corpus : iterable of iterable of (int, int) + Corpus in BoW format. + id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional + Mapping between word_ids (integers) and words (strings). + If not provided, the mapping is constructed directly from `corpus`. + metadata : bool, optional + THIS PARAMETER WILL BE IGNORED. + + Return + ------ + list of int + List of offsets in resulting file for each document (in bytes), + can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset` - This function is automatically called by `LowCorpus.serialize`; don't - call it directly, call `serialize` instead. """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -174,8 +249,29 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return the document stored at file position `offset`. + """Get the document stored in file by `offset` position. + + Parameters + ---------- + offset : int + Offset (in bytes) to begin of document. + + Returns + ------- + list of (int, int) + Document in BoW format. 
+ + Examples + -------- + >>> from gensim.test.utils import datapath + >>> from gensim.corpora import LowCorpus + >>> + >>> data = LowCorpus(datapath("testcorpus.low")) + >>> data.docbyoffset(1) # end of first line + [] + >>> data.docbyoffset(2) # start of second line + [(0, 1), (3, 1), (4, 1)] + """ with utils.smart_open(self.fname) as f: f.seek(offset) @@ -183,6 +279,7 @@ def docbyoffset(self, offset): @property def id2word(self): + """Get mapping between words and their ids.""" return self._id2word @id2word.setter diff --git a/gensim/corpora/malletcorpus.py b/gensim/corpora/malletcorpus.py index cacf0074bd..37d7fc0d9d 100644 --- a/gensim/corpora/malletcorpus.py +++ b/gensim/corpora/malletcorpus.py @@ -3,9 +3,7 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Corpus in Mallet format of List-Of-Words. -""" +"""Corpus in `Mallet format `_.""" from __future__ import with_statement @@ -15,47 +13,114 @@ from gensim.corpora import LowCorpus -logger = logging.getLogger('gensim.corpora.malletcorpus') +logger = logging.getLogger(__name__) class MalletCorpus(LowCorpus): - """ - Quoting http://mallet.cs.umass.edu/import.php: + """Corpus handles input in `Mallet format `_. + + **Format description** - One file, one instance per line - Assume the data is in the following format: + One file, one instance per line, assume the data is in the following format :: [URL] [language] [text of the page...] - Or, more generally, + Or, more generally, :: + [document #1 id] [label] [text of the document...] [document #2 id] [label] [text of the document...] ... [document #N id] [label] [text of the document...] - Note that language/label is *not* considered in Gensim. + Note that language/label is *not* considered in Gensim, used `__unknown__` as default value. 
+ + Examples + -------- + >>> from gensim.test.utils import datapath, get_tmpfile, common_texts + >>> from gensim.corpora import MalletCorpus + >>> from gensim.corpora import Dictionary + >>> + >>> # Prepare needed data + >>> dictionary = Dictionary(common_texts) + >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts] + >>> + >>> # Write corpus in Mallet format to disk + >>> output_fname = get_tmpfile("corpus.mallet") + >>> MalletCorpus.serialize(output_fname, corpus, dictionary) + >>> + >>> # Read corpus + >>> loaded_corpus = MalletCorpus(output_fname) """ def __init__(self, fname, id2word=None, metadata=False): + """ + + Parameters + ---------- + fname : str + Path to file in Mallet format. + id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional + Mapping between word_ids (integers) and words (strings). + If not provided, the mapping is constructed directly from `fname`. + metadata : bool, optional + If True, return additional information ("document id" and "lang" when you call + :meth:`~gensim.corpora.malletcorpus.MalletCorpus.line2doc`, + :meth:`~gensim.corpora.malletcorpus.MalletCorpus.__iter__` or + :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset` + + """ self.metadata = metadata LowCorpus.__init__(self, fname, id2word) def _calculate_num_docs(self): + """Get number of documents. + + Returns + ------- + int + Number of documents in file. + + """ with utils.smart_open(self.fname) as fin: result = sum(1 for _ in fin) return result def __iter__(self): - """ - Iterate over the corpus at the given filename. + """Iterate over the corpus. + + Yields + ------ + list of (int, int) + Document in BoW format (+"document_id" and "lang" if metadata=True). - Yields a bag-of-words, a.k.a list of tuples of (word id, word count), based on the given id2word dictionary. 
""" with utils.smart_open(self.fname) as f: for line in f: yield self.line2doc(line) def line2doc(self, line): + """Covert line into document in BoW format. + + Parameters + ---------- + line : str + Line from input file. + + Returns + ------- + list of (int, int) + Document in BoW format (+"document_id" and "lang" if metadata=True). + + Examples + -------- + >>> from gensim.test.utils import datapath + >>> from gensim.corpora import MalletCorpus + >>> + >>> corpus = MalletCorpus(datapath("testcorpus.mallet")) + >>> corpus.line2doc("en computer human interface") + [(3, 1), (4, 1)] + + """ splited_line = [word for word in utils.to_unicode(line).strip().split(' ') if word] docid, doclang, words = splited_line[0], splited_line[1], splited_line[2:] @@ -68,18 +133,39 @@ def line2doc(self, line): @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): - """ - Save a corpus in the Mallet format. - + """Save a corpus in the Mallet format. + + Warnings + -------- + This function is automatically called by :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize`, + don't call it directly, call :meth:`gensim.corpora.lowcorpus.malletcorpus.MalletCorpus.serialize` instead. + + Parameters + ---------- + fname : str + Path to output file. + corpus : iterable of iterable of (int, int) + Corpus in BoW format. + id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional + Mapping between word_ids (integers) and words (strings). + If not provided, the mapping is constructed directly from `corpus`. + metadata : bool, optional + If True - ???? + + Return + ------ + list of int + List of offsets in resulting file for each document (in bytes), + can be used for :meth:`~gensim.corpora.malletcorpus.Malletcorpus.docbyoffset`. + + Notes + ----- The document id will be generated by enumerating the corpus. That is, it will range between 0 and number of documents in the corpus. 
Since Mallet has a language field in the format, this defaults to the string '__unknown__'. If the language needs to be saved, post-processing will be required. - This function is automatically called by `MalletCorpus.serialize`; don't - call it directly, call `serialize` instead. - """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") @@ -114,8 +200,29 @@ def save_corpus(fname, corpus, id2word=None, metadata=False): return offsets def docbyoffset(self, offset): - """ - Return the document stored at file position `offset`. + """Get the document stored in file by `offset` position. + + Parameters + ---------- + offset : int + Offset (in bytes) to begin of document. + + Returns + ------- + list of (int, int) + Document in BoW format (+"document_id" and "lang" if metadata=True). + + Examples + -------- + >>> from gensim.test.utils import datapath + >>> from gensim.corpora import MalletCorpus + >>> + >>> data = MalletCorpus(datapath("testcorpus.mallet")) + >>> data.docbyoffset(1) # end of first line + [(3, 1), (4, 1)] + >>> data.docbyoffset(4) # start of second line + [(4, 1)] + """ with utils.smart_open(self.fname) as f: f.seek(offset) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index 7f78f5ca91..41ad492570 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -3,27 +3,34 @@ # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Text corpora usually reside on disk, as text files in one format or another -In a common scenario, we need to build a dictionary (a `word->integer id` -mapping), which is then used to construct sparse bag-of-word vectors -(= sequences of `(word_id, word_weight)` 2-tuples). - -This module provides some code scaffolding to simplify this pipeline. 
For -example, given a corpus where each document is a separate line in file on disk, -you would override the `TextCorpus.get_texts` method to read one line=document -at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence -of words. - -Overriding `get_texts` is enough; you can then initialize the corpus with e.g. -`MyTextCorpus(bz2.BZ2File('mycorpus.txt.bz2'))` and it will behave correctly like a -corpus of sparse vectors. The `__iter__` methods is automatically set up, and -dictionary is automatically populated with all `word->id` mappings. - -The resulting object can be used as input to all gensim models (TFIDF, LSI, ...), -serialized with any format (Matrix Market, SvmLight, Blei's LDA-C format etc). - -See the `gensim.test.test_miislita.CorpusMiislita` class for a simple example. +"""Module provides some code scaffolding to simplify use of built dictionary for constructing BoW vectors. + +Notes +----- +Text corpora usually reside on disk, as text files in one format or another In a common scenario, +we need to build a dictionary (a `word->integer id` mapping), which is then used to construct sparse bag-of-word vectors +(= iterable of `(word_id, word_weight)`). + +This module provides some code scaffolding to simplify this pipeline. For example, given a corpus where each document +is a separate line in file on disk, you would override the :meth:`gensim.corpora.textcorpus.TextCorpus.get_texts` +to read one line=document at a time, process it (lowercase, tokenize, whatever) and yield it as a sequence of words. + +Overriding :meth:`gensim.corpora.textcorpus.TextCorpus.get_texts` is enough, you can then initialize the corpus +with e.g. `MyTextCorpus("mycorpus.txt.bz2")` and it will behave correctly like a corpus of sparse vectors. +The :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__` method is automatically set up, +and dictionary is automatically populated with all `word->id` mappings. 
+ +The resulting object can be used as input to some of gensim models (:class:`~gensim.models.tfidfmodel.TfidfModel`, +:class:`~gensim.models.lsimodel.LsiModel`, :class:`~gensim.models.ldamodel.LdaModel`, ...), serialized with any format +(`Matrix Market `_, +`SvmLight `_, `Blei's LDA-C format `_, etc). + + +See Also +-------- +:class:`gensim.test.test_miislita.CorpusMiislita` + Good simple example. + """ @@ -44,98 +51,179 @@ def remove_stopwords(tokens, stopwords=STOPWORDS): - """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`.""" + """Remove stopwords using list from `gensim.parsing.preprocessing.STOPWORDS`. + + Parameters + ---------- + tokens : iterable of str + Sequence of tokens. + stopwords : iterable of str, optional + Sequence of stopwords. + + Returns + ------- + list of str + List of tokens without `stopwords`. + + """ return [token for token in tokens if token not in stopwords] def remove_short(tokens, minsize=3): - """Remove tokens smaller than `minsize` chars, which is 3 by default.""" + """Remove tokens shorter than `minsize` chars. + + Parameters + ---------- + tokens : iterable of str + Sequence of tokens. + minsize : int, optional + Minimal length of token (inclusive). + + Returns + ------- + list of str + List of tokens without short tokens. + + """ return [token for token in tokens if len(token) >= minsize] def lower_to_unicode(text, encoding='utf8', errors='strict'): - """Lowercase `text` and convert to unicode.""" + """Lowercase `text` and convert to unicode, using :func:`gensim.utils.any2unicode`. + + Parameters + ---------- + text : str + Input text. + encoding : str, optional + Encoding that will be used for conversion. + errors : str, optional + Error handling behaviour, used as parameter for `unicode` function (python2 only). + + Returns + ------- + str + Unicode version of `text`. + + See Also + -------- + :func:`gensim.utils.any2unicode` + Convert any string to unicode-string.
+ + """ return utils.to_unicode(text.lower(), encoding, errors) def strip_multiple_whitespaces(s): - """Collapse multiple whitespace characters into a single space.""" + """Collapse multiple whitespace characters into a single space. + + Parameters + ---------- + s : str + Input string + + Returns + ------- + str + String with collapsed whitespaces. + + """ return RE_WHITESPACE.sub(" ", s) class TextCorpus(interfaces.CorpusABC): - """Helper class to simplify the pipeline of getting bag-of-words vectors (= a - gensim corpus) from plain text. + """Helper class to simplify the pipeline of getting BoW vectors from plain text. - This is an abstract base class: override the `get_texts()` and `__len__()` - methods to match your particular input. + Notes + ----- + This is an abstract base class: override the :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` and + :meth:`~gensim.corpora.textcorpus.TextCorpus.__len__` methods to match your particular input. - Given a filename (or a file-like object) in constructor, the corpus object - will be automatically initialized with a dictionary in `self.dictionary` and - will support the `iter` corpus method. You have a few different ways of utilizing - this class via subclassing or by construction with different preprocessing arguments. + Given a filename (or a file-like object) in constructor, the corpus object will be automatically initialized + with a dictionary in `self.dictionary` and will support the :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__` + corpus method. You have a few different ways of utilizing this class via subclassing or by construction with + different preprocessing arguments. - The `iter` method converts the lists of tokens produced by `get_texts` to BoW format - using `Dictionary.doc2bow`. 
`get_texts` does the following: + The :meth:`~gensim.corpora.textcorpus.TextCorpus.__iter__` method converts the lists of tokens produced by + :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` to BoW format using + :meth:`gensim.corpora.dictionary.Dictionary.doc2bow`. - 1. Calls `getstream` to get a generator over the texts. It yields each document in - turn from the underlying text file or files. - 2. For each document from the stream, calls `preprocess_text` to produce a list of - tokens; if metadata is enabled, it yields a 2-`tuple` with the document number as - the second element. + :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` does the following: + #. Calls :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream` to get a generator over the texts. + It yields each document in turn from the underlying text file or files. + #. For each document from the stream, calls :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` to produce + a list of tokens. If metadata=True, it yields a 2-`tuple` with the document number as the second element. Preprocessing consists of 0+ `character_filters`, a `tokenizer`, and 0+ `token_filters`. - The preprocessing consists of calling each filter in `character_filters` with the document - text; unicode is not guaranteed, and if desired, the first filter should convert to unicode. - The output of each character filter should be another string. The output from the final - filter is fed to the `tokenizer`, which should split the string into a list of tokens (strings). - Afterwards, the list of tokens is fed through each filter in `token_filters`. The final - output returned from `preprocess_text` is the output from the final token filter. + The preprocessing consists of calling each filter in `character_filters` with the document text. + Unicode is not guaranteed, and if desired, the first filter should convert to unicode. + The output of each character filter should be another string. 
The output from the final filter is fed + to the `tokenizer`, which should split the string into a list of tokens (strings). + Afterwards, the list of tokens is fed through each filter in `token_filters`. The final output returned from + :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` is the output from the final token filter. So to use this class, you can either pass in different preprocessing functions using the `character_filters`, `tokenizer`, and `token_filters` arguments, or you can subclass it. - If subclassing: override `getstream` to take text from different input sources in different - formats. Overrride `preprocess_text` if you must provide different initial preprocessing, - then call the `TextCorpus.preprocess_text` method to apply the normal preprocessing. You - can also overrride `get_texts` in order to tag the documents (token lists) with different - metadata. + + If subclassing: override :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream` to take text from different input + sources in different formats. + Override :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` if you must provide different initial + preprocessing, then call the :meth:`~gensim.corpora.textcorpus.TextCorpus.preprocess_text` method to apply + the normal preprocessing. + You can also override :meth:`~gensim.corpora.textcorpus.TextCorpus.get_texts` in order to tag the documents + (token lists) with different metadata. The default preprocessing consists of: - 1. lowercase and convert to unicode; assumes utf8 encoding - 2. deaccent (asciifolding) - 3. collapse multiple whitespaces into a single one - 4. tokenize by splitting on whitespace - 5. remove words less than 3 characters long - 6. remove stopwords; see `gensim.parsing.preprocessing` for the list of stopwords + #. :func:`~gensim.corpora.textcorpus.lower_to_unicode` - lowercase and convert to unicode (assumes utf8 encoding) + #. :func:`~gensim.utils.deaccent`- deaccent (asciifolding) + #. 
:func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces` - collapse multiple whitespaces into a single one + #. :func:`~gensim.utils.simple_tokenize` - tokenize by splitting on whitespace + #. :func:`~gensim.corpora.textcorpus.remove_short` - remove words less than 3 characters long + #. :func:`~gensim.corpora.textcorpus.remove_stopwords` - remove stopwords """ def __init__(self, input=None, dictionary=None, metadata=False, character_filters=None, tokenizer=None, token_filters=None): """ - Args: - input (str): path to top-level directory to traverse for corpus documents. - dictionary (Dictionary): if a dictionary is provided, it will not be updated - with the given corpus on initialization. If none is provided, a new dictionary - will be built for the given corpus. If no corpus is given, the dictionary will - remain uninitialized. - metadata (bool): True to yield metadata with each document, else False (default). - character_filters (iterable of callable): each will be applied to the text of each - document in order, and should return a single string with the modified text. - For Python 2, the original text will not be unicode, so it may be useful to - convert to unicode as the first character filter. The default character filters - lowercase, convert to unicode (strict utf8), perform ASCII-folding, then collapse - multiple whitespaces. - tokenizer (callable): takes as input the document text, preprocessed by all filters - in `character_filters`; should return an iterable of tokens (strings). - token_filters (iterable of callable): each will be applied to the iterable of tokens - in order, and should return another iterable of tokens. These filters can add, - remove, or replace tokens, or do nothing at all. The default token filters - remove tokens less than 3 characters long and remove stopwords using the list - in `gensim.parsing.preprocessing.STOPWORDS`. 
+ + Parameters + ---------- + input : str, optional + Path to top-level directory (file) to traverse for corpus documents. + dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional + If a dictionary is provided, it will not be updated with the given corpus on initialization. + If None - new dictionary will be built for the given corpus. + If `input` is None, the dictionary will remain uninitialized. + metadata : bool, optional + If True - yield metadata with each document. + character_filters : iterable of callable, optional + Each will be applied to the text of each document in order, and should return a single string with + the modified text. For Python 2, the original text will not be unicode, so it may be useful to + convert to unicode as the first character filter. + If None - using :func:`~gensim.corpora.textcorpus.lower_to_unicode`, + :func:`~gensim.utils.deaccent` and :func:`~gensim.corpora.textcorpus.strip_multiple_whitespaces`. + tokenizer : callable, optional + Tokenizer for document, if None - using :func:`~gensim.utils.simple_tokenize`. + token_filters : iterable of callable, optional + Each will be applied to the iterable of tokens in order, and should return another iterable of tokens. + These filters can add, remove, or replace tokens, or do nothing at all. + If None - using :func:`~gensim.corpora.textcorpus.remove_short` and + :func:`~gensim.corpora.textcorpus.remove_stopwords`. + + Examples + -------- + >>> #TODO Example with inheritance + >>> from gensim.corpora.textcorpus import TextCorpus + >>> from gensim.test.utils import datapath + >>> + >>> corpus = TextCorpus(datapath('head500.noblanks.cor.bz2')) + >>> for bow in corpus: + ... 
pass + """ self.input = input self.metadata = metadata @@ -157,9 +245,18 @@ def __init__(self, input=None, dictionary=None, metadata=False, character_filter self.init_dictionary(dictionary) def init_dictionary(self, dictionary): - """If `dictionary` is None, initialize to an empty Dictionary, and then if there - is an `input` for the corpus, add all documents from that `input`. If the - `dictionary` is already initialized, simply set it as the corpus's `dictionary`. + """Initialize/update dictionary. + + Parameters + ---------- + dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional + If a dictionary is provided, it will not be updated with the given corpus on initialization. + If None - new dictionary will be built for the given corpus. + + Notes + ----- + If self.input is None - make nothing. + """ self.dictionary = dictionary if dictionary is not None else Dictionary() if self.input is not None: @@ -175,9 +272,13 @@ def init_dictionary(self, dictionary): logger.warning("No input document stream provided; assuming dictionary will be initialized some other way.") def __iter__(self): - """The function that defines a corpus. + """Iterate over the corpus. + + Yields + ------ + list of (int, int) + Document in BoW format (+ metadata if self.metadata). - Iterating over the corpus must yield sparse vectors, one for each document. """ if self.metadata: for text, metadata in self.get_texts(): @@ -187,9 +288,17 @@ def __iter__(self): yield self.dictionary.doc2bow(text, allow_update=False) def getstream(self): - """Yield documents from the underlying plain text collection (of one or more files). - Each item yielded from this method will be considered a document by subsequent - preprocessing methods. + """Generate documents from the underlying plain text collection (of one or more files). + + Yields + ------ + str + Document read from plain-text file. + + Notes + ----- + After generator end - initialize self.length attribute. 
+ """ num_texts = 0 with utils.file_or_filename(self.input) as f: @@ -200,14 +309,18 @@ def getstream(self): self.length = num_texts def preprocess_text(self, text): - """Apply preprocessing to a single text document. This should perform tokenization - in addition to any other desired preprocessing steps. + """Apply `self.character_filters`, `self.tokenizer`, `self.token_filters` to a single text document. + + Parameters + --------- + text : str + Document read from plain-text file. - Args: - text (str): document text read from plain-text file. + Return + ------ + list of str + List of tokens extracted from `text`. - Returns: - iterable of str: tokens produced from `text` as a result of preprocessing. """ for character_filter in self.character_filters: text = character_filter(text) @@ -219,8 +332,22 @@ def preprocess_text(self, text): return tokens def step_through_preprocess(self, text): - """Yield tuples of functions and their output for each stage of preprocessing. + """Apply preprocessor one by one and generate result. + + Warnings + -------- This is useful for debugging issues with the corpus preprocessing pipeline. + + Parameters + ---------- + text : str + Document text read from plain-text file. + + Yields + ------ + (callable, object) + Pre-processor, output from pre-processor (based on `text`) + """ for character_filter in self.character_filters: text = character_filter(text) @@ -233,16 +360,13 @@ def step_through_preprocess(self, text): yield (token_filter, token_filter(tokens)) def get_texts(self): - """Iterate over the collection, yielding one document at a time. A document - is a sequence of words (strings) that can be fed into `Dictionary.doc2bow`. - Each document will be fed through `preprocess_text`. That method should be - overridden to provide different preprocessing steps. This method will need - to be overridden if the metadata you'd like to yield differs from the line - number. 
- - Returns: - generator of lists of tokens (strings); each list corresponds to a preprocessed - document from the corpus `input`. + """Generate documents from corpus. + + Yields + ------ + list of str + Document as sequence of tokens (+ lineno if self.metadata) + + """ lines = self.getstream() if self.metadata: @@ -253,25 +377,34 @@ yield self.preprocess_text(line) def sample_texts(self, n, seed=None, length=None): - """Yield n random documents from the corpus without replacement. - + """Generate `n` random documents from the corpus without replacement. + + Parameters + ---------- + n : int + Number of documents we want to sample. + seed : int, optional + If specified, use it as a seed for local random generator. + length : int, optional + Value will be used as the corpus length (because calculating the length of a corpus can be a costly operation). + If not specified - will call `__len__`. + + Raises + ------ + ValueError + If `n` is less than zero or greater than corpus size. + + Notes + ----- Given the number of remaining documents in a corpus, we need to choose n elements. - The probability for the current element to be chosen is n/remaining. - If we choose it, we just decrease the n and move to the next element. - Computing the corpus length may be a costly operation so you can use the optional - parameter `length` instead. - - Args: - n (int): number of documents we want to sample. - seed (int|None): if specified, use it as a seed for local random generator. - length (int|None): if specified, use it as a guess of corpus length. - It must be positive and not greater than actual corpus length. - - Yields: - list[str]: document represented as a list of tokens. See get_texts method. - - Raises: - ValueError: when n is invalid or length was set incorrectly. + The probability for the current element to be chosen is `n` / remaining. If we choose it, we just decrease + the `n` and move to the next element.
+ + Yields + ------ + list of str + Sampled document as sequence of tokens. + """ random_generator = random if seed is None else random.Random(seed) if length is None: @@ -302,6 +435,19 @@ def sample_texts(self, n, seed=None, length=None): raise ValueError("length {0:d} greater than number of documents in corpus {1:d}".format(length, i + 1)) def __len__(self): + """Get length of corpus + + Warnings + -------- + If self.length is None - will read all corpus for calculate this attribute through + :meth:`~gensim.corpora.textcorpus.TextCorpus.getstream`. + + Returns + ------- + int + Length of corpus. + + """ if self.length is None: # cache the corpus length self.length = sum(1 for _ in self.getstream()) @@ -309,28 +455,39 @@ def __len__(self): class TextDirectoryCorpus(TextCorpus): - """Read documents recursively from a directory, - where each file (or line of each file) is interpreted as a plain text document. + """Read documents recursively from a directory. + Each file/line (depends on `lines_are_documents`) is interpreted as a plain text document. + """ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None, pattern=None, exclude_pattern=None, lines_are_documents=False, **kwargs): """ - Args: - min_depth (int): minimum depth in directory tree at which to begin searching for - files. The default is 0, which means files starting in the top-level directory - `input` will be considered. - max_depth (int): max depth in directory tree at which files will no longer be - considered. The default is None, which means recurse through all subdirectories. - pattern (str or Pattern): regex to use for file name inclusion; all those files *not* - matching this pattern will be ignored. - exclude_pattern (str or Pattern): regex to use for file name exclusion; all files - matching this pattern will be ignored. - lines_are_documents (bool): if True, each line of each file is considered to be a - document. 
If False (default), each file is considered to be a document. - kwargs: keyword arguments passed through to the `TextCorpus` constructor. This is - in addition to the non-kwargs `input`, `dictionary`, and `metadata`. See - `TextCorpus.__init__` docstring for more details on these. + + Parameters + ---------- + input : str + Path to input file/folder. + dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional + If a dictionary is provided, it will not be updated with the given corpus on initialization. + If None - new dictionary will be built for the given corpus. + If `input` is None, the dictionary will remain uninitialized. + metadata : bool, optional + If True - yield metadata with each document. + min_depth : int, optional + Minimum depth in directory tree at which to begin searching for files. + max_depth : int, optional + Max depth in directory tree at which files will no longer be considered. + If None - not limited. + pattern : str, optional + Regex to use for file name inclusion, all those files *not* matching this pattern will be ignored. + exclude_pattern : str, optional + Regex to use for file name exclusion, all files matching this pattern will be ignored. + lines_are_documents : bool, optional + If True - each line is considered a document, otherwise - each file is one document. + kwargs: keyword arguments passed through to the `TextCorpus` constructor. + See :meth:`gensim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these. + + """ self._min_depth = min_depth self._max_depth = sys.maxsize if max_depth is None else max_depth @@ -385,9 +542,14 @@ def max_depth(self, max_depth): self.length = None def iter_filepaths(self): - """Lazily yield paths to each file in the directory structure within the specified - range of depths. If a filename pattern to match was given, further filter to only - those filenames that match.
+ """Generate (lazily) paths to each file in the directory structure within the specified range of depths. + If a filename pattern to match was given, further filter to only those filenames that match. + + Yields + ------ + str + Path to file + """ for depth, dirpath, dirnames, filenames in walk(self.input): if self.min_depth <= depth <= self.max_depth: @@ -400,12 +562,13 @@ def iter_filepaths(self): yield os.path.join(dirpath, name) def getstream(self): - """Yield documents from the underlying plain text collection (of one or more files). - Each item yielded from this method will be considered a document by subsequent - preprocessing methods. + """Generate documents from the underlying plain text collection (of one or more files). + + Yields + ------ + str + One document (if lines_are_documents - True), otherwise - each file is one document. - If `lines_are_documents` was set to True, items will be lines from files. Otherwise - there will be one item per file, containing the entire contents of the file. """ num_texts = 0 for path in self.iter_filepaths(): @@ -421,11 +584,20 @@ def getstream(self): self.length = num_texts def __len__(self): + """Get length of corpus. + + Returns + ------- + int + Length of corpus. + + """ if self.length is None: self._cache_corpus_length() return self.length def _cache_corpus_length(self): + """Calculate length of corpus and cache it to `self.length`.""" if not self.lines_are_documents: self.length = sum(1 for _ in self.iter_filepaths()) else: @@ -433,9 +605,40 @@ def _cache_corpus_length(self): def walk(top, topdown=True, onerror=None, followlinks=False, depth=0): - """This is a mostly copied version of `os.walk` from the Python 2 source code. + """Generate the file names in a directory tree by walking the tree either top-down or bottom-up. + For each directory in the tree rooted at directory top (including top itself), it yields a 4-tuple + (depth, dirpath, dirnames, filenames). 
+ + Parameters + ---------- + top : str + Root directory. + topdown : bool, optional + If True - you can modify dirnames in-place. + onerror : function, optional + Some function, will be called with one argument, an OSError instance. + It can report the error to continue with the walk, or raise the exception to abort the walk. + Note that the filename is available as the filename attribute of the exception object. + followlinks : bool, optional + If True - visit directories pointed to by symlinks, on systems that support them. + depth : int, optional + Height of file-tree, don't pass it manually (this used as accumulator for recursion). + + Notes + ----- + This is a mostly copied version of `os.walk` from the Python 2 source code. The only difference is that it returns the depth in the directory tree structure at which each yield is taking place. + + Yields + ------ + (int, str, list of str, list of str) + Depth, current path, visited directories, visited non-directories. + + See Also + -------- + `os.walk documentation `_ + """ islink, join, isdir = os.path.islink, os.path.join, os.path.isdir diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index a8911ee07f..fa2de4ce77 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -5,11 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -University of California, Irvine (UCI) Bag-of-Words format. - -http://archive.ics.uci.edu/ml/datasets/Bag+of+Words -""" +"""Corpus in `UCI format `_.""" from __future__ import with_statement @@ -24,16 +20,19 @@ from six.moves import xrange -logger = logging.getLogger('gensim.corpora.ucicorpus') +logger = logging.getLogger(__name__) class UciReader(MmReader): + """Reader of UCI format for :class:`gensim.corpora.ucicorpus.UciCorpus`.""" def __init__(self, input): """ - Initialize the reader. 
- The `input` parameter refers to a file on the local filesystem, - which is expected to be in the UCI Bag-of-Words format. + Parameters + ---------- + input : str + Path to file in UCI format. + """ logger.info('Initializing corpus reader from %s', input) @@ -55,30 +54,34 @@ def __init__(self, input): ) def skip_headers(self, input_file): + """Skip headers in `input_file`. + + Parameters + ---------- + input_file : file + File object. + + """ for lineno, _ in enumerate(input_file): if lineno == 2: break class UciWriter(MmWriter): - """ - Store a corpus in UCI Bag-of-Words format. - - This corpus format is identical to MM format, except for - different file headers. There is no format line, and the first - three lines of the file contain number_docs, num_terms, and num_nnz, - one value per line. + """Writer of UCI format for :class:`gensim.corpora.ucicorpus.UciCorpus`. - This implementation is based on matutils.MmWriter, and works the same way. + Notes + ----- + This corpus format is identical to `Matrix Market format <http://math.nist.gov/MatrixMarket/formats.html>`_, + except for different file headers. There is no format line, and the first three lines of the file + contain `number_docs`, `num_terms`, and `num_nnz`, one value per line. """ MAX_HEADER_LENGTH = 20 # reserve 20 bytes per header value FAKE_HEADER = utils.to_utf8(' ' * MAX_HEADER_LENGTH + '\n') def write_headers(self): - """ - Write blank header lines. Will be updated later, once corpus stats are known. - """ + """Write blank header lines. Will be updated later, once corpus stats are known.""" for _ in range(3): self.fout.write(self.FAKE_HEADER) @@ -86,9 +89,7 @@ def write_headers(self): self.headers_written = True def update_headers(self, num_docs, num_terms, num_nnz): - """ - Update headers with actual values.
- """ + """Update headers with actual values.""" offset = 0 values = [utils.to_utf8(str(n)) for n in [num_docs, num_terms, num_nnz]] @@ -101,6 +102,25 @@ def update_headers(self, num_docs, num_terms, num_nnz): @staticmethod def write_corpus(fname, corpus, progress_cnt=1000, index=False): + """Write corpus in file. + + Parameters + ---------- + fname : str + Path to output file. + corpus: iterable of list of (int, int) + Corpus in BoW format. + progress_cnt : int, optional + Progress counter, write log message each `progress_cnt` documents. + index : bool, optional + If True - return offsets, otherwise - nothing. + + Return + ------ + list of int + Sequence of offsets to documents (in bytes), only if index=True. + + """ writer = UciWriter(fname) writer.write_headers() @@ -139,10 +159,26 @@ def write_corpus(fname, corpus, progress_cnt=1000, index=False): class UciCorpus(UciReader, IndexedCorpus): - """ - Corpus in the UCI bag-of-words format. - """ + """Corpus in the UCI bag-of-words format.""" def __init__(self, fname, fname_vocab=None): + """ + Parameters + ---------- + fname : str + Path to corpus in UCI format. + fname_vocab : bool, optional + Path to vocab. + + Examples + -------- + >>> from gensim.corpora import UciCorpus + >>> from gensim.test.utils import datapath + >>> + >>> corpus = UciCorpus(datapath('testcorpus.uci')) + >>> for document in corpus: + ... pass + + """ IndexedCorpus.__init__(self, fname) UciReader.__init__(self, fname) @@ -157,17 +193,32 @@ def __init__(self, fname, fname_vocab=None): self.transposed = True def __iter__(self): - """ - Interpret a matrix in UCI bag-of-words format as a streamed gensim corpus - (yielding one document at a time). + """Iterate over the corpus. + + Yields + ------ + list of (int, int) + Document in BoW format. 
+ """ for docId, doc in super(UciCorpus, self).__iter__(): yield doc # get rid of docId, return the sparse vector only def create_dictionary(self): - """ - Utility method to generate gensim-style Dictionary directly from - the corpus and vocabulary data. + """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data. + + Return + ------ + :class:`gensim.corpora.dictionary.Dictionary` + Dictionary, based on corpus. + + Examples + -------- + >>> from gensim.corpora.ucicorpus import UciCorpus + >>> from gensim.test.utils import datapath + >>> ucc = UciCorpus(datapath('testcorpus.uci')) + >>> dictionary = ucc.create_dictionary() + """ dictionary = Dictionary() @@ -193,14 +244,30 @@ def create_dictionary(self): @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=10000, metadata=False): - """ - Save a corpus in the UCI Bag-of-Words format. - - There are actually two files saved: `fname` and `fname.vocab`, where - `fname.vocab` is the vocabulary file. + """Save a corpus in the UCI Bag-of-Words format. + + Warnings + -------- + This function is automatically called by :meth`gensim.corpora.ucicorpus.UciCorpus.serialize`, + don't call it directly, call :meth`gensim.corpora.ucicorpus.UciCorpus.serialize` instead. + + Parameters + ---------- + fname : str + Path to output file. + corpus: iterable of iterable of (int, int) + Corpus in BoW format. + id2word : {dict of (int, str), :class:`gensim.corpora.dictionary.Dictionary`}, optional + Mapping between words and their ids. If None - will be inferred from `corpus`. + progress_cnt : int, optional + Progress counter, write log message each `progress_cnt` documents. + metadata : bool, optional + THIS PARAMETER WILL BE IGNORED. + + Notes + ----- + There are actually two files saved: `fname` and `fname.vocab`, where `fname.vocab` is the vocabulary file. - This function is automatically called by `UciCorpus.serialize`; don't - call it directly, call `serialize` instead. 
""" if id2word is None: logger.info("no word id mapping provided; initializing from corpus")