piskvorky · menshikh-iv · Jan 29, 2019 · Jan 24, 2019 · Jan 25, 2019 · Jan 25, 2019
diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
@@ -1,6 +1,9 @@
 # -*- coding: utf-8 -*-
 """Load models from the native binary format released by Facebook.
 
+The main entry point is the :py:func:`load` function.
+It returns a :py:class:`Model` namedtuple containing everything loaded from the binary.
+
 Examples
 --------
 

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
@@ -14,6 +14,9 @@
 This module contains a fast native C implementation of Fasttext with Python interfaces. It is **not** only a wrapper
 around Facebook's implementation.
 
+This module supports loading models trained with Facebook's fastText implementation.
+It also supports continuing training from such models.
+
 For a tutorial see `this notebook
 <https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb>`_.
 
@@ -31,6 +34,15 @@
     >>> from gensim.models import FastText
     >>>
     >>> model = FastText(common_texts, size=4, window=3, min_count=1, iter=10)
+    >>> sentences = [
+    ...     ['computer', 'artificial', 'intelligence'],
+    ...     ['artificial', 'trees'],
+    ...     ['human', 'intelligence'],
+    ...     ['artificial', 'graph'],
+    ...     ['intelligence'],
+    ...     ['artificial', 'intelligence', 'system']
+    ... ]
+    >>> model.train(sentences, total_examples=len(sentences), epochs=model.epochs)
 
 Persist a model to disk with:
 
@@ -41,7 +53,51 @@
     >>> fname = get_tmpfile("fasttext.model")
     >>>
     >>> model.save(fname)
-    >>> model = FastText.load(fname)  # you can continue training with the loaded model!
+    >>> model = FastText.load(fname)
+
+Once loaded, such models behave identically to those created from scratch.
+For example, you can continue training the loaded model:
+
+.. sourcecode:: pycon
+
+    >>> import numpy as np
+    >>> old_computer = np.copy(model.wv['computer'])  # Grab the existing vector for this word
+    >>> new_sentences = [
+    ...     ['computers', 'expensive'],
+    ...     ['computer', 'chess', 'players', 'stronger', 'than', 'humans'],
+    ...     ['computers', 'are', 'everywhere'],
+    ... ]
+    >>> model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
+    >>> new_computer = model.wv['computer']
+    >>> # FIXME: why is this True??
+    >>> np.allclose(old_computer, new_computer, atol=1e-4)
+    False
+
+You can also load models trained with Facebook's fastText implementation:
+
+.. sourcecode:: pycon
+
+    >>> from gensim.test.utils import datapath
+    >>> cap_path = datapath("crime-and-punishment.bin")
+    >>> # Partial model: loads quickly, uses less RAM, but cannot continue training
+    >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
+    >>> # Full model: loads slowly, consumes RAM, but can continue training (see below)
+    >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
+
+Once loaded, such models behave identically to those trained from scratch.
+You may continue training them on new data:
+
+.. sourcecode:: pycon
+
+    >>> 'computer' in fb_full.wv.vocab  # New word, currently out of vocab
+    False
+    >>> old_computer = np.copy(fb_full.wv['computer'])  # Calculate current vectors
+    >>> fb_full.train(sentences, total_examples=len(sentences), epochs=model.epochs)
+    >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
+    >>> new_computer = fb_full.wv['computer']
+    >>> # FIXME: why is this True??
+    >>> np.allclose(old_computer, new_computer, atol=1e-4)  # Vector has changed, model has learnt something
+    False
 
 Retrieve word-vector for vocab and out-of-vocab word:
 
@@ -85,6 +141,33 @@
 
     >>> analogies_result = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
 
+Implementation Notes
+--------------------
+
+These notes may help developers navigate our fastText implementation.
+The implementation is split across several submodules:
+
+- :py:mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only.
+- :py:mod:`gensim.models.keyedvectors`: Implements both generic and FastText-specific functionality.
+- :py:mod:`gensim.models.word2vec`: Contains implementations for the vocabulary
+  and the trainables for FastText.
+- :py:mod:`gensim.models.base_any2vec`: Contains implementations for the base
+  classes, including functionality such as callbacks, logging.
+- :py:mod:`gensim.models.utils_any2vec`: Wrapper over Cython extensions.
+- :py:mod:`gensim.utils`: Implements model I/O (loading and saving).
+
+Our implementation relies heavily on inheritance.
+It consists of several important classes:
+
+- :py:class:`~gensim.models.word2vec.Word2VecVocab`: the vocabulary.
+  Keeps track of all the unique words, sometimes discarding the extremely rare ones.
+  This is sometimes called the Dictionary within Gensim.
+- :py:class:`~gensim.models.keyedvectors.FastTextKeyedVectors`: the vectors.
+  Once training is complete, this class is sufficient for calculating embeddings.
+- :py:class:`FastTextTrainables`: the underlying neural network. The implementation
+  uses this class to *learn* the word embeddings.
+- :py:class:`FastText`: ties everything together.
+
 """
 
 import logging
@@ -535,7 +618,7 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p
     def _set_train_params(self, **kwargs):
         #
         # We need the wv.buckets_word member to be initialized in order to
-        # continue training.  The _clear_post_train method destroys this
+        # continue training. The _clear_post_train method destroys this
         # variable, so we reinitialize it here, if needed.
         #
         # The .old_vocab_len and .old_hash2index_len members are set only to
@@ -771,7 +854,11 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
 
         Notes
         ------
-        Due to limitations in the FastText API, you cannot continue training with a model loaded this way.
+        Facebook provides both `.vec` and `.bin` files with their models.
+        The former contains human-readable vectors.
+        The latter contains machine-readable vectors along with other model parameters.
+        This function effectively ignores the `.vec` output file, since that file is redundant.
+        It only needs the `.bin` file.
 
         Parameters
         ----------
@@ -783,12 +870,12 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
         encoding : str, optional
             Specifies the file encoding.
         full_model : boolean, optional
-            If False, skips loading the hidden output matrix.  This saves a fair bit
+            If False, skips loading the hidden output matrix. This saves a fair bit
             of CPU time and RAM, but prevents training continuation.
 
         Returns
         -------
-        :class: `~gensim.models.fasttext.FastText`
+        gensim.models.fasttext.FastText
             The loaded model.
 
         """
@@ -856,7 +943,7 @@ def load(cls, *args, **kwargs):
 
             if not hasattr(model.wv, 'compatible_hash'):
                 logger.warning(
-                    "This older model was trained with a buggy hash function.  "
+                    "This older model was trained with a buggy hash function. "
                     "The model will continue to work, but consider training it "
                     "from scratch."
                 )
@@ -877,15 +964,44 @@ def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_inse
         return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive)
 
 
-#
-# Keep for backward compatibility.
-#
 class FastTextVocab(Word2VecVocab):
+    """This is a redundant class. It exists only to maintain backwards compatibility
+    with older gensim versions."""
     pass
 
 
 class FastTextTrainables(Word2VecTrainables):
-    """Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`."""
+    """Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`.
+
+    Mostly inherits from its parent (:py:class:`gensim.models.word2vec.Word2VecTrainables`).
+    Adds logic for calculating and maintaining ngram weights.
+
+    Attributes
+    ----------
+
+    hashfxn : function
+        Used for randomly initializing weights. Defaults to the built-in hash().
+    layer1_size : int
+        The size of the inner layer of the NN. Equal to the vector dimensionality. Set in the :py:class:`gensim.models.word2vec.Word2VecTrainables` constructor.
+    seed : float
+        The random generator seed used in reset_weights and update_weights.
+    syn1 : numpy.array
+        The inner layer of the NN. Each row corresponds to a term in the vocabulary. Columns correspond to weights of the inner layer. There are layer1_size such weights. Set in the reset_weights and update_weights methods, only if hierarchical softmax is used.
+    syn1neg : numpy.array
+        Similar to syn1, but only set if negative sampling is used.
+    vectors_lockf : numpy.array
+        A one-dimensional array with one element for each term in the vocab. Set in reset_weights to an array of ones.
+    vectors_vocab_lockf : numpy.array
+        Similar to vectors_lockf, but holds the lock factors for wv.vectors_vocab. Initialized as ones(len(model.trainables.vectors), dtype=REAL).
+    vectors_ngrams_lockf : numpy.array
+        The lock factors for wv.vectors_ngrams. Initialized as np.ones((self.bucket, wv.vector_size), dtype=REAL).
+
+    Notes
+    -----
+
+    The lockf ("lock factor") arrays appear to be used by the fast C implementation.
+
+    """
     def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000):
         super(FastTextTrainables, self).__init__(
             vector_size=vector_size, seed=seed, hashfxn=hashfxn)
@@ -899,17 +1015,17 @@ def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000):
         #   2. vectors_ngrams_lockf
         #
         # These are both 2D matrices of shapes equal to the shapes of
-        # wv.vectors_vocab and wv.vectors_ngrams.  So, each row corresponds to
+        # wv.vectors_vocab and wv.vectors_ngrams. So, each row corresponds to
         # a vector, and each column corresponds to a dimension within that
         # vector.
         #
         # Lockf stands for "lock factor": zero values suppress learning, one
-        # values enable it.  Interestingly, the vectors_vocab_lockf and
+        # values enable it. Interestingly, the vectors_vocab_lockf and
         # vectors_ngrams_lockf seem to be used only by the C code in
         # fasttext_inner.pyx.
         #
         # The word2vec implementation also uses vectors_lockf: in that case,
-        # it's a 1D array, with a real number for each vector.  The FastText
+        # it's a 1D array, with a real number for each vector. The FastText
         # implementation inherits this vectors_lockf attribute but doesn't
         # appear to use it.
         #
@@ -987,7 +1103,7 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
     encoding : str, optional
         Specifies the file encoding.
     full_model : boolean, optional
-        If False, skips loading the hidden output matrix.  This saves a fair bit
+        If False, skips loading the hidden output matrix. This saves a fair bit
         of CPU time and RAM, but prevents training continuation.
 
     Returns

diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
@@ -1943,14 +1943,19 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors):
         If True, uses the Facebook-compatible hash function instead of the
         Gensim backwards-compatible hash function.
 
+    Some important attributes:
+
     Attributes
     ----------
     vectors_vocab : np.array
-        A vector for each entity in the vocabulary.
+        Each row corresponds to a vector for an entity in the vocabulary.
+        Columns correspond to vector dimensions.
     vectors_vocab_norm : np.array
         Same as vectors_vocab, but the vectors are L2 normalized.
     vectors_ngrams : np.array
         A vector for each ngram across all entities in the vocabulary.
+        Each row is a vector that corresponds to a bucket.
+        Columns correspond to vector dimensions.
     vectors_ngrams_norm : np.array
         Same as vectors_ngrams, but the vectors are L2 normalized.
         Under some conditions, may actually be the same matrix as
@@ -1964,7 +1969,8 @@ class FastTextKeyedVectors(WordEmbeddingsKeyedVectors):
         bucket to an index, and then indexing into vectors_ngrams (in other
         words, vectors_ngrams[hash2index[hash_fn(ngram) % bucket]].
     num_ngram_vectors : int
-        TODO
+        The number of vectors that correspond to ngrams, as opposed to terms
+        (full words).
 
     """
     def __init__(self, vector_size, min_n, max_n, bucket, compatible_hash):