Implement Levenshtein term similarity matrix and fast SCM between corpora #2016

Merged (66 commits), Jan 14, 2019

Changes from 7 commits

Commits
ccadc8d
Wrap docstring for WordEmbeddingsKeyedVectors.similarity_matrix
Witiko Mar 26, 2018
517bcc8
Add the gensim.models.levenshtein module
Witiko Mar 26, 2018
e71b6ff
Add projected density to term similarity matrix logs
Witiko Mar 27, 2018
b8425af
Add tests for the gensim.models.levenshtein.similarity_matrix function
Witiko Apr 3, 2018
80c13ef
Separate similarity_matrix methods into director and builder classes.
Witiko Apr 4, 2018
6f6cdb7
Add symmetric parameter to SparseTermSimilarityMatrix
Witiko Apr 4, 2018
7274fac
Add corpus support to SparseTermSimilarityMatrix.inner_product
Witiko Apr 4, 2018
27e76b8
Replace scipy.sparse.dok_matrix.has_key with the in operator
Witiko Apr 5, 2018
739383a
Fix handling of unicode in Python 3 in levsim
Witiko Apr 5, 2018
9ecae3c
Remove temporary method similarity of LevenshteinSimilarityIndex
Witiko Apr 5, 2018
49a2160
Move models.term_similarity, and levenshtein to similarities
Witiko Apr 11, 2018
c5669fc
Make python-Levenshtein a conditional import
Witiko Apr 11, 2018
7b774dd
Add default values to gensim.similarities.levenshtein.levsim arguments
Witiko Apr 11, 2018
2e8d4fa
Remove extraneous addition operators from @deprecated annotations
Witiko Apr 11, 2018
a6e295f
Remove @deprecated annotation from tests
Witiko Apr 11, 2018
13948dc
Merge test_term_similarity, and test_levenshtein with test_similarities
Witiko Apr 11, 2018
a9706de
Reword TermSimilarityIndex docstring
Witiko Apr 11, 2018
5e3e948
Consume no more than topn similarities produced by a TermSimilarityIndex
Witiko Apr 11, 2018
4b895ff
Use short uints (<64b) for dok_matrix keys and num_nonzero array
Witiko Apr 12, 2018
5c100a9
Write to matrix_nonzero only when building a symmetric matrix
Witiko Apr 16, 2018
0efed5e
Ensure UniformTermSimilarityIndex does not yield only topn - 1 values
Witiko Apr 16, 2018
0c3549b
Document _shortest_uint_dtype
Witiko Apr 16, 2018
ee33db8
Add soft cosine measure benchmark, part 1
Witiko Apr 22, 2018
da6e6dd
Add soft cosine measure benchmark, part 2
Witiko Apr 23, 2018
d4053b2
Make similarity_matrix support non-contiguous dictionaries
Witiko May 13, 2018
093d569
Support fast inner product between a document and a corpus
Witiko May 20, 2018
c2888b4
Support fast inner product between a document and a corpus (python 2.7)
Witiko May 20, 2018
32cb4d7
Add faster sparse matrix slicing
Witiko Jul 1, 2018
099d768
Make Soft Cosine Measure support non-contiguous dictionaries
Witiko Jul 1, 2018
dd4561d
Merge remote-tracking branch 'upstream/develop' into levenshtein-soft…
Witiko Jul 1, 2018
c8f6ef5
Remove gensim::similarities::levenshtein::similarity_matrix facade
Witiko Jul 1, 2018
8f026cc
Implement SoftCosineSimilarity using the inner_product method
Witiko Jul 1, 2018
227d09e
Fix flake8 warnings
Witiko Jul 1, 2018
9f8d0e8
Make Soft Cosine Measure support non-contiguous dictionaries (cont)
Witiko Jul 1, 2018
c316b95
Remove parallelization in gensim::similarities::levenshtein
Witiko Jul 2, 2018
d6b9bd4
Document future work
Witiko Jul 2, 2018
5e52477
Update Soft Cosine Measure benchmark after commits 093d569, and c316b95
Witiko Jul 12, 2018
4b46597
Update SCM tutorial after PR 2016
Witiko Jul 12, 2018
ce95fd9
Add example to gensim::similarities::termsim::SparseTermSimilarityMatrix
Witiko Jul 12, 2018
f8ff4c7
Merge remote-tracking branch 'upstream/develop' into levenshtein-soft…
Witiko Jul 12, 2018
ac60615
Add max_distance kwarg to gensim::similarities::levenshtein::levsim
Witiko Jul 13, 2018
5154569
Replace max_distance kwarg in levsim with min_similarity, add tests
Witiko Jul 22, 2018
729d185
Remove conditional expression from levsim
Witiko Jul 23, 2018
155dc58
Use less confusing wording in docsting for min_similarity / max_distance
Witiko Jul 23, 2018
7e52ef8
Defer thresholding in LevenshteinSimilarityIndex.most_similar to levsim
Witiko Jul 23, 2018
3866bc9
Merge remote-tracking branch 'upstream/develop' into levenshtein-soft…
Witiko Jul 30, 2018
a7ee779
Allow None value of nonzero_limit parameter in SparseTermSimilarityMa…
Witiko Aug 16, 2018
e4395e0
Add positive_definite parameter to SparseTermSimilarityMatrix
Witiko Aug 16, 2018
98f3f3d
Split test_building test into a number of atomic unit tests
Witiko Aug 16, 2018
2a55786
Presort dictionary keys in UniformTermSimilarityIndex constructor
Witiko Aug 17, 2018
4d8dc48
Make documentation of SparseTermSimilarityMatrix more accurate
Witiko Aug 25, 2018
d7fd3f1
Make SparseTermSimilarityMatrix expect negative similarities
Witiko Aug 25, 2018
46a477e
Avoid expensive array copying in dot_product
Witiko Sep 9, 2018
583c9c7
Update SCM tutorial, and benchmark after PR 2016
Witiko Sep 11, 2018
4f26de0
Merge branch 'develop' into levenshtein-softcossim
Witiko Sep 11, 2018
4d8338e
Merge remote-tracking branch 'upstream/develop' into levenshtein-soft…
Witiko Jan 9, 2019
1cc4a49
Remove fluff from stderr in the SCM tutorial notebook
Witiko Jan 11, 2019
9ede310
Add a paper reference to the SCM tutorial notebook
Witiko Jan 11, 2019
c523aa5
Directly import Levenshtein package in levdist
Witiko Jan 11, 2019
e031630
Use embedded URI instead of indirect hyperlink target in documentation
Witiko Jan 11, 2019
19bedf1
Assume that max of lens is always an integer
Witiko Jan 11, 2019
83a07af
Make LevenshteinSimilarityIndex.most_similar easier to read
Witiko Jan 11, 2019
f3258d9
Merge remote-tracking branch 'upstream/develop' into levenshtein-soft…
Witiko Jan 11, 2019
16ff7ef
Make LevenshteinSimilarityIndex.most_similar easier to read
Witiko Jan 12, 2019
12ee910
Add an ordering test for LevenshteinSimilarityIndex.most_similar
Witiko Jan 12, 2019
3f04940
Make WordEmbeddingSimilarityIndex.most_similar easier to read
Witiko Jan 12, 2019
12 changes: 10 additions & 2 deletions gensim/matutils.py
@@ -14,6 +14,7 @@
import math

from gensim import utils
from gensim.utils import deprecated

import numpy as np
import scipy.sparse
@@ -775,6 +776,9 @@ def cossim(vec1, vec2):
return result


Review thread on the deprecation message below:
Contributor: nitpick: no need to use + for concatenation if this happens in ().
Contributor Author: I will fix this once we figure out what to actually deprecate.

@deprecated(
    "Function will be removed in 4.0.0, use " +
    "gensim.models.term_similarity.SparseTermSimilarityMatrix.inner_product instead")
def softcossim(vec1, vec2, similarity_matrix):
"""Get Soft Cosine Measure between two vectors given a term similarity matrix.

@@ -789,8 +793,10 @@ def softcossim(vec1, vec2, similarity_matrix):
vec2 : list of (int, float)
A document vector in the BoW format.
similarity_matrix : {:class:`scipy.sparse.csc_matrix`, :class:`scipy.sparse.csr_matrix`}
- A term similarity matrix, typically produced by
- :meth:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`.
+ A term similarity matrix. If the matrix is :class:`scipy.sparse.csr_matrix`, it is going
+ to be transposed. If you rely on the fact that there is at most a constant number of
+ non-zero elements in a single column, it is your responsibility to ensure that the matrix
+ is symmetric.

Returns
-------
@@ -806,6 +812,8 @@
--------
:meth:`gensim.models.keyedvectors.WordEmbeddingsKeyedVectors.similarity_matrix`
A term similarity matrix produced from term embeddings.
:func:`gensim.models.levenshtein.similarity_matrix`
A term similarity matrix produced from Levenshtein distances.
:class:`gensim.similarities.docsim.SoftCosineSimilarity`
A class for performing corpus-based similarity queries with Soft Cosine Measure.

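For orientation, a minimal usage sketch of the function being deprecated here. It is not taken from the PR itself; it assumes the gensim 3.x API that this revision still exposes (Word2Vec's `size` parameter, `model.wv.similarity_matrix`, and `matutils.softcossim`), and the toy corpus is purely illustrative:

```python
from gensim import matutils
from gensim.corpora import Dictionary
from gensim.models import Word2Vec

# Toy corpus and embeddings, purely illustrative.
sentences = [
    ["government", "speaks", "to", "the", "media", "in", "illinois"],
    ["president", "greets", "the", "press", "in", "chicago"],
]
model = Word2Vec(sentences, min_count=1, size=20, seed=42)
dictionary = Dictionary(sentences)
bow1, bow2 = (dictionary.doc2bow(s) for s in sentences)

# Sparse CSC term similarity matrix M; softcossim then evaluates
# x^T M y / (sqrt(x^T M x) * sqrt(y^T M y)) for the two BoW vectors.
similarity_matrix = model.wv.similarity_matrix(dictionary)
print(matutils.softcossim(bow1, bow2, similarity_matrix))
```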
5 changes: 4 additions & 1 deletion gensim/models/__init__.py
@@ -13,15 +13,18 @@
from .logentropy_model import LogEntropyModel # noqa:F401
from .word2vec import Word2Vec # noqa:F401
from .doc2vec import Doc2Vec # noqa:F401
- from .keyedvectors import KeyedVectors # noqa:F401
+ from .keyedvectors import KeyedVectors, WordEmbeddingSimilarityIndex # noqa:F401
from .ldamulticore import LdaMulticore # noqa:F401
from .phrases import Phrases # noqa:F401
from .normmodel import NormModel # noqa:F401
from .atmodel import AuthorTopicModel # noqa:F401
from .ldaseqmodel import LdaSeqModel # noqa:F401
from .fasttext import FastText # noqa:F401
from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401
from .term_similarity import TermSimilarityIndex, UniformTermSimilarityIndex, SparseTermSimilarityMatrix # noqa:F401
from .levenshtein import LevenshteinSimilarityIndex # noqa:F401

from . import levenshtein # noqa:F401
from . import wrappers # noqa:F401
from . import deprecated # noqa:F401
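
Since this revision also re-exports the new LevenshteinSimilarityIndex at the package level, here is a hedged sketch of how it plugs into the same machinery. The import paths follow this revision of the PR (later commits in the list above move the term_similarity and levenshtein modules under gensim.similarities), python-Levenshtein is assumed to be installed, and the inner_product keyword follows the docstrings elsewhere in this PR:

```python
from gensim.corpora import Dictionary
from gensim.models import LevenshteinSimilarityIndex, SparseTermSimilarityMatrix

# Toy documents; the Levenshtein index needs no trained embeddings.
documents = [["holiday", "in", "spain"], ["holidays", "in", "portugal"]]
dictionary = Dictionary(documents)

termsim_index = LevenshteinSimilarityIndex(dictionary)  # term similarity from edit distance
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

query, document = (dictionary.doc2bow(doc) for doc in documents)
print(similarity_matrix.inner_product(query, document, normalized=True))  # soft cosine similarity
```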

148 changes: 70 additions & 78 deletions gensim/models/keyedvectors.py
@@ -76,13 +76,15 @@
double, array, zeros, vstack, sqrt, newaxis, integer, \
ndarray, sum as np_sum, prod, argmax, divide as np_divide
import numpy as np

from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
from gensim.corpora.dictionary import Dictionary
from six import string_types, integer_types
from six.moves import xrange, zip
- from scipy import sparse, stats
+ from scipy import stats
from gensim.utils import deprecated
from gensim.models.utils_any2vec import _save_word2vec_format, _load_word2vec_format, _compute_ngrams, _ft_hash
from gensim.models.term_similarity import TermSimilarityIndex, SparseTermSimilarityMatrix

logger = logging.getLogger(__name__)

@@ -497,33 +499,33 @@ def similar_by_vector(self, vector, topn=10, restrict_vocab=None):
"""
return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab)

@deprecated(
"Method will be removed in 4.0.0, use " +
"gensim.models.keyedvectors.WordEmbeddingSimilarityIndex instead")
def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0, nonzero_limit=100, dtype=REAL):
"""Constructs a term similarity matrix for computing Soft Cosine Measure.

- Constructs a a sparse term similarity matrix in the :class:`scipy.sparse.csc_matrix` format for computing
- Soft Cosine Measure between documents.
+ Constructs a sparse term similarity matrix in the :class:`scipy.sparse.csc_matrix` format
+ for computing Soft Cosine Measure between documents.

Parameters
----------
dictionary : :class:`~gensim.corpora.dictionary.Dictionary`
- A dictionary that specifies a mapping between words and the indices of rows and columns
- of the resulting term similarity matrix.
- tfidf : :class:`gensim.models.tfidfmodel.TfidfModel`, optional
- A model that specifies the relative importance of the terms in the dictionary. The rows
- of the term similarity matrix will be build in a decreasing order of importance of terms,
- or in the order of term identifiers if None.
+ A dictionary that specifies the considered terms.
+ tfidf : :class:`gensim.models.tfidfmodel.TfidfModel` or None, optional
+ A model that specifies the relative importance of the terms in the dictionary. The
+ columns of the term similarity matrix will be build in a decreasing order of importance
+ of terms, or in the order of term identifiers if None.
threshold : float, optional
- Only pairs of words whose embeddings are more similar than `threshold` are considered
- when building the sparse term similarity matrix.
+ Only embeddings more similar than `threshold` are considered when retrieving word
+ embeddings closest to a given word embedding.
exponent : float, optional
- The exponent applied to the similarity between two word embeddings when building the term similarity matrix.
+ Take the word embedding similarities larger than `threshold` to the power of `exponent`.
nonzero_limit : int, optional
- The maximum number of non-zero elements outside the diagonal in a single row or column
- of the term similarity matrix. Setting `nonzero_limit` to a constant ensures that the
- time complexity of computing the Soft Cosine Measure will be linear in the document
- length rather than quadratic.
+ The maximum number of non-zero elements outside the diagonal in a single column of the
+ sparse term similarity matrix.
dtype : numpy.dtype, optional
- Data-type of the term similarity matrix.
+ Data-type of the sparse term similarity matrix.

Returns
-------
@@ -536,75 +538,22 @@ def similarity_matrix(self, dictionary, tfidf=None, threshold=0.0, exponent=2.0,
The Soft Cosine Measure.
:class:`gensim.similarities.docsim.SoftCosineSimilarity`
A class for performing corpus-based similarity queries with Soft Cosine Measure.
:func:`gensim.models.levenshtein.similarity_matrix`
A term similarity matrix produced from Levenshtein distances.


Notes
-----
The constructed matrix corresponds to the matrix Mrel defined in section 2.1 of
- `Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3: Soft-Cosine Semantic Similarity
- between Questions for Community Question Answering", 2017
+ `Delphine Charlet and Geraldine Damnati, "SimBow at SemEval-2017 Task 3: Soft-Cosine
+ Semantic Similarity between Questions for Community Question Answering", 2017
<http://www.aclweb.org/anthology/S/S17/S17-2051.pdf>`__.

"""
- logger.info("constructing a term similarity matrix")
- matrix_order = len(dictionary)
- matrix_nonzero = [1] * matrix_order
- matrix = sparse.identity(matrix_order, dtype=dtype, format="dok")
- num_skipped = 0
- # Decide the order of rows.
- if tfidf is None:
- word_indices = range(matrix_order)
- else:
- assert max(tfidf.idfs) < matrix_order
- word_indices = [
- index for index, _
- in sorted(tfidf.idfs.items(), key=lambda x: (x[1], -x[0]), reverse=True)
- ]
-
- # Traverse rows.
- for row_number, w1_index in enumerate(word_indices):
- if row_number % 1000 == 0:
- logger.info(
- "PROGRESS: at %.02f%% rows (%d / %d, %d skipped, %.06f%% density)",
- 100.0 * (row_number + 1) / matrix_order, row_number + 1, matrix_order,
- num_skipped, 100.0 * matrix.getnnz() / matrix_order**2)
- w1 = dictionary[w1_index]
- if w1 not in self.vocab:
- num_skipped += 1
- continue # A word from the dictionary is not present in the word2vec model.
-
- # Traverse upper triangle columns.
- if matrix_order <= nonzero_limit + 1: # Traverse all columns.
- columns = (
- (w2_index, self.similarity(w1, dictionary[w2_index]))
- for w2_index in range(w1_index + 1, matrix_order)
- if w1_index != w2_index and dictionary[w2_index] in self.vocab)
- else: # Traverse only columns corresponding to the embeddings closest to w1.
- num_nonzero = matrix_nonzero[w1_index] - 1
- columns = (
- (dictionary.token2id[w2], similarity)
- for _, (w2, similarity)
- in zip(
- range(nonzero_limit - num_nonzero),
- self.most_similar(positive=[w1], topn=nonzero_limit - num_nonzero)
- )
- if w2 in dictionary.token2id
- )
- columns = sorted(columns, key=lambda x: x[0])
-
- for w2_index, similarity in columns:
- # Ensure that we don't exceed `nonzero_limit` by mirroring the upper triangle.
- if similarity > threshold and matrix_nonzero[w2_index] <= nonzero_limit:
- element = similarity**exponent
- matrix[w1_index, w2_index] = element
- matrix_nonzero[w1_index] += 1
- matrix[w2_index, w1_index] = element
- matrix_nonzero[w2_index] += 1
- logger.info(
- "constructed a term similarity matrix with %0.6f %% nonzero elements",
- 100.0 * matrix.getnnz() / matrix_order**2
- )
- return matrix.tocsc()
+ index = WordEmbeddingSimilarityIndex(self, threshold=threshold, exponent=exponent)
+ similarity_matrix = SparseTermSimilarityMatrix(
+ index, dictionary, tfidf=tfidf, nonzero_limit=nonzero_limit, dtype=dtype)
+ return similarity_matrix.matrix
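
The rewritten method above simply delegates to the two new classes and returns their .matrix attribute. Per the docstring, the off-diagonal entries of that matrix are similarity**exponent for word pairs whose similarity exceeds threshold, capped at nonzero_limit non-zero entries per column, with ones on the diagonal; the soft cosine of BoW vectors x and y is then x^T M y / (sqrt(x^T M x) * sqrt(y^T M y)). A hedged, self-contained sketch of the equivalent direct usage (import paths follow this revision of the PR; the toy corpus is illustrative only):

```python
from gensim.corpora import Dictionary
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex, SparseTermSimilarityMatrix

sentences = [["soft", "cosine", "measure"], ["cosine", "similarity", "measure"]]
model = Word2Vec(sentences, min_count=1, size=20, seed=42)   # toy embeddings only
dictionary = Dictionary(sentences)

termsim_index = WordEmbeddingSimilarityIndex(model.wv, threshold=0.0, exponent=2.0)
similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, nonzero_limit=100)

# .matrix is the scipy.sparse.csc_matrix that the deprecated method returns;
# inner_product() computes the normalized soft cosine product between BoW vectors directly.
bow1, bow2 = (dictionary.doc2bow(s) for s in sentences)
print(similarity_matrix.inner_product(bow1, bow2, normalized=True))
```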

def wmdistance(self, document1, document2):
"""
@@ -1110,6 +1059,49 @@ def init_sims(self, replace=False):
self.vectors_norm = (self.vectors / sqrt((self.vectors ** 2).sum(-1))[..., newaxis]).astype(REAL)


class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
"""
Computes cosine similarities between word embeddings and retrieves the closest word embeddings
by cosine similarity for a given word embedding.

Parameters
----------
keyedvectors : :class:`~gensim.models.keyedvectors.WordEmbeddingsKeyedVectors`
The word embeddings.
threshold : float, optional
Only embeddings more similar than `threshold` are considered when retrieving word embeddings
closest to a given word embedding.
exponent : float, optional
Take the word embedding similarities larger than `threshold` to the power of `exponent`.
kwargs : dict or None
A dict with keyword arguments that will be passed to the `keyedvectors.most_similar` method
when retrieving the word embeddings closest to a given word embedding.

See Also
--------
:class:`~gensim.models.term_similarity.SparseTermSimilarityMatrix`
Build a term similarity matrix and compute the Soft Cosine Measure.

"""
def __init__(self, keyedvectors, threshold=0.0, exponent=2.0, kwargs=None):
assert isinstance(keyedvectors, WordEmbeddingsKeyedVectors)
self.keyedvectors = keyedvectors
self.threshold = threshold
self.exponent = exponent
self.kwargs = kwargs or {}
super(WordEmbeddingSimilarityIndex, self).__init__()

def most_similar(self, t1, topn=10):
if t1 not in self.keyedvectors.vocab:
logger.debug('an out-of-dictionary term "%s"', t1)
else:
for _, (t2, similarity) in zip(
range(topn), self.keyedvectors.most_similar(
positive=[t1], topn=topn, **self.kwargs)):
if similarity > self.threshold:
yield (t2, similarity**self.exponent)
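
To make the generator contract above concrete, a hedged usage sketch with a toy model (not part of the diff): most_similar yields at most topn (term, similarity**exponent) pairs whose similarity exceeds threshold, and yields nothing for an out-of-vocabulary term.

```python
from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex

sentences = [["cat", "dog", "mouse"], ["dog", "wolf", "fox"], ["cat", "lion", "tiger"]]
model = Word2Vec(sentences, min_count=1, size=10, seed=1)    # toy embeddings only

index = WordEmbeddingSimilarityIndex(model.wv, threshold=0.0, exponent=2.0)
print(list(index.most_similar("dog", topn=3)))      # up to 3 (term, similarity**2) pairs
print(list(index.most_similar("unicorn", topn=3)))  # [] -- out-of-vocabulary terms yield nothing
```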


class Word2VecKeyedVectors(WordEmbeddingsKeyedVectors):
"""Class to contain vectors and vocab for word2vec model.
Used to perform operations on the vectors such as vector lookup, distance, similarity etc.