New KeyedVectors.vectors_for_all method for vectorizing all words in a dictionary #3157

Merged
merged 39 commits into from Jun 29, 2021
Changes from 4 commits
39 commits
ab6fb90
Add KeyedVectors.vectors_for_all
Witiko May 25, 2021
98ed69d
Add examples for KeyedVectors.vectors_for_all
Witiko May 25, 2021
be1746b
Support Dictionary in KeyedVectors.vectors_for_all
Witiko May 28, 2021
d81df64
Don't sort keys in KeyedVectors.vectors_for_all, just deduplicate
Witiko May 28, 2021
ef8bea6
Use docstrings in imperative mode (PEP8)
Witiko May 28, 2021
d602018
Guard against KeyError in KeyedVectors.vectors_for_all
Witiko May 28, 2021
13a7ecd
Unit-test dictionary parameter of KeyedVectors.vectors_for_all
Witiko May 28, 2021
6a8c688
Order dictionary by decreasing cfs in KeyedVectors.vectors_for_all
Witiko May 28, 2021
9ebe808
Add allow_inference parameter to KeyedVectors.vectors_for_all
Witiko May 28, 2021
716dc32
Add copy_vecattrs parameter to KeyedVectors.vectors_for_all
Witiko May 28, 2021
77e1889
Move copy_vecattrs tests for KeyedVectors.vectors_for_all
Witiko May 28, 2021
330d5f7
Fix translation of term ids to terms in KeyedVectors.vectors_for_all
Witiko May 28, 2021
8fdda93
Fix a typo in KeyedVectors.vectors_for_all unit test
Witiko May 28, 2021
ba636a2
Do not make assumptions about fake counts in _add_word_to_kv
Witiko May 28, 2021
1a9ea9b
Document that KeyedVectors.vectors_for_all allows arbitrary keys
Witiko May 28, 2021
e5a9a31
Add notes about the behavior of KeyedVectors.vectors_for_all
Witiko May 28, 2021
5eebef0
Properly reference Dictionary in KeyedVectors.vectors_for_all docstring
Witiko May 28, 2021
26baf6d
Make deduplication in KeyedVectors.vectors_for_all a oneliner
Witiko May 31, 2021
98c070e
Remove an unnecessary temporary variable in KeyedVectors.vectors_for_all
Witiko May 31, 2021
8e4d0cf
Make deduplication in KeyedVectors.vectors_for_all a oneliner (cont.)
Witiko May 31, 2021
a4590c1
Add Dictionary.most_common
Witiko May 31, 2021
b14298b
Remove test_vectors_for_all_dictionary unit test
Witiko May 31, 2021
1cf9452
Remove a trailing bracket in an example
Witiko May 31, 2021
9c6f296
Fix unit tests for Dictionary.most_common
Witiko May 31, 2021
e78bfa3
Update an example for SparseTermSimilarityMatrix
Witiko May 31, 2021
32c14c5
Remove Gensim downloader from KeyedVectors.vectors_for_all example
Witiko Jun 22, 2021
9acbcba
Remove include_counts parameter from Dictionary.most_common
Witiko Jun 22, 2021
712ee61
Shorten the KeyedVectors.vectors_for_all example
Witiko Jun 22, 2021
b8625a5
Remove include_counts parameter from Dictionary.most_common (cont.)
Witiko Jun 22, 2021
4aacad2
Use pytest assertion syntax in unit tests
Witiko Jun 22, 2021
a86522c
Remove an unnecessary comment in KeyedVectors.vectors_for_all
Witiko Jun 22, 2021
7ea8337
Remove an unnecessary comment in KeyedVectors.vectors_for_all
Witiko Jun 22, 2021
f08c582
Remove an unnecessary variable in KeyedVectors.vectors_for_all
Witiko Jun 22, 2021
ebc276d
Make the creation of new vocab in KeyedVectors.vectors_for_all explicit
Witiko Jun 22, 2021
3bf7f33
Make AnnoyIndexer use the correct word-vectors in example
Witiko Jun 22, 2021
68b5fc1
Apply suggestions from code review
mpenkov Jun 29, 2021
52e5ee8
Apply suggestions from code review
mpenkov Jun 29, 2021
4dc3756
Update CHANGELOG.md
mpenkov Jun 29, 2021
d319144
Merge branch 'develop' into feature/vectors-for-all
mpenkov Jun 29, 2021
40 changes: 40 additions & 0 deletions gensim/models/keyedvectors.py
@@ -171,6 +171,8 @@
import itertools
import warnings
from numbers import Integral
from typing import Iterable, Union
from collections import OrderedDict

from numpy import (
dot, float32 as REAL, double, array, zeros, vstack,
@@ -1695,6 +1697,44 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut
msg=f"merged {overlap_count} vectors into {self.vectors.shape} matrix from {fname}",
)

def vectors_for_all(self, keys: Union[Iterable, Dictionary]) -> 'KeyedVectors':
"""Produces vectors for all given keys.

Notes
-----
A new :class:`KeyedVectors` object will always be produced.

In subclasses such as :class:`~gensim.models.fasttext.FastTextKeyedVectors`,
vectors for out-of-vocabulary keys (words) may be inferred. In other classes
such as :class:`KeyedVectors`, out-of-vocabulary keys will be omitted
from the produced :class:`KeyedVectors` object.

Additional attributes set via the :meth:`KeyedVectors.set_vecattr` method
will not be preserved in the produced :class:`KeyedVectors` object.

Parameters
----------
keys : {iterable of str, Dictionary}
The keys that will be vectorized.

Returns
-------
keyedvectors : :class:`~gensim.models.keyedvectors.KeyedVectors`
Vectors for all the given keys.

"""
if isinstance(keys, Dictionary):
    keys = keys.token2id
# Deduplicate the keys while preserving their order, and drop any key for
# which this model cannot provide a vector. Subclasses such as
# FastTextKeyedVectors report every key as present, because they can infer
# vectors for out-of-vocabulary keys from character ngrams.
vocabulary = [key for key in OrderedDict.fromkeys(keys) if key in self]
vocab_size = len(vocabulary)
datatype = self.vectors.dtype
kv = KeyedVectors(self.vector_size, vocab_size, dtype=datatype)
for key in vocabulary:
    weights = self[key]
    _add_word_to_kv(kv, None, key, weights, vocab_size)
return kv

def _upconvert_old_d2vkv(self):
"""Convert a deserialized older Doc2VecKeyedVectors instance to latest generic KeyedVectors"""
self.vocab = self.doctags
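As an aside for readers of this diff: a minimal usage sketch of the new method (not part of the changed files, and assuming the behaviour described in the docstring above, where a plain KeyedVectors drops out-of-vocabulary keys while FastTextKeyedVectors infers them):

from gensim.models import Word2Vec
from gensim.test.utils import common_texts

# Train a small model on gensim's bundled toy corpus.
model = Word2Vec(common_texts, vector_size=20, min_count=1)

# Request vectors for an explicit list of keys. Duplicates are removed and,
# because a plain KeyedVectors cannot infer new vectors, the made-up
# out-of-vocabulary key is dropped from the result.
keys = ['human', 'computer', 'interface', 'word-not-in-the-toy-corpus']
small_kv = model.wv.vectors_for_all(keys)

print(len(small_kv))  # 3
print((small_kv['computer'] == model.wv['computer']).all())  # True, the vector is copied unchanged

With a FastText model in place of Word2Vec, the same call would keep all four keys, inferring a vector for the out-of-vocabulary one from its character ngrams.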
25 changes: 21 additions & 4 deletions gensim/similarities/termsim.py
@@ -102,6 +102,25 @@ class WordEmbeddingSimilarityIndex(TermSimilarityIndex):
Computes cosine similarities between word embeddings and retrieves most
similar terms for a given term.

Notes
-----
By fitting the word embeddings to the vocabulary that you will be using, you
eliminate all out-of-vocabulary (OOV) words that the `most_similar` method
would otherwise return:

>>> from gensim.test.utils import common_texts, datapath
>>> from gensim.corpora import Dictionary
>>> from gensim.models import FastText
>>> from gensim.models.word2vec import LineSentence
>>> from gensim.similarities import WordEmbeddingSimilarityIndex
>>>
>>> model = FastText(common_texts, vector_size=20, min_count=1) # train word-vectors on a corpus
>>> different_corpus = LineSentence(datapath('lee_background.cor'))
>>> dictionary = Dictionary(different_corpus) # construct a vocabulary on a different corpus
>>> word_vectors = model.wv.vectors_for_all(dictionary) # remove OOV word-vectors and infer new words
>>> assert len(dictionary) == len(word_vectors) # all words from our vocabulary received their word-vectors
>>> termsim_index = WordEmbeddingSimilarityIndex(word_vectors)

Parameters
----------
keyedvectors : :class:`~gensim.models.keyedvectors.KeyedVectors`
@@ -409,20 +428,18 @@ class SparseTermSimilarityMatrix(SaveLoad):
>>> from gensim.models import Word2Vec
>>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex
>>> from gensim.similarities.index import AnnoyIndexer
>>>
>>> model = Word2Vec(common_texts, vector_size=20, min_count=1)  # train word-vectors
>>> dictionary = Dictionary(common_texts)
>>> word_vectors = model.wv.vectors_for_all(dictionary)  # produce vectors for all words in the dictionary
>>> annoy = AnnoyIndexer(word_vectors, num_trees=2)  # use annoy for faster word similarity lookups
>>> termsim_index = WordEmbeddingSimilarityIndex(word_vectors, kwargs={'indexer': annoy})
>>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
>>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary, symmetric=True, dominant=True)
>>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
>>>
>>> query = 'graph trees computer'.split() # make a query
>>> sims = docsim_index[dictionary.doc2bow(query)] # calculate similarity of query to each doc from bow_corpus

Check out `the Gallery <https://radimrehurek.com/gensim/auto_examples/tutorials/run_scm.html>`_
for more examples.
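To make the out-of-vocabulary claim in the new WordEmbeddingSimilarityIndex note concrete, here is a short self-contained sketch (again not part of the diff; it reuses gensim's bundled toy corpus instead of the Lee corpus from the docstring example):

from gensim.corpora import Dictionary
from gensim.models import FastText
from gensim.similarities import WordEmbeddingSimilarityIndex
from gensim.test.utils import common_texts

model = FastText(common_texts, vector_size=20, min_count=1)  # train word-vectors
dictionary = Dictionary(common_texts)  # the vocabulary we will query with
word_vectors = model.wv.vectors_for_all(dictionary)  # restrict the vectors to that vocabulary
termsim_index = WordEmbeddingSimilarityIndex(word_vectors)

# Because the vectors were restricted first, every term the index returns is
# guaranteed to be a key of `dictionary`, so a downstream
# SparseTermSimilarityMatrix never sees out-of-dictionary terms.
for term, similarity in termsim_index.most_similar('computer', topn=5):
    assert term in dictionary.token2id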
29 changes: 29 additions & 0 deletions gensim/test/test_fasttext.py
@@ -850,6 +850,35 @@ def obsolete_testLoadOldModel(self):
self.assertEqual(model.wv.vectors_vocab.shape, (12, 100))
self.assertEqual(model.wv.vectors_ngrams.shape, (2000000, 100))

def test_vectors_for_all(self):
"""Test vectors_for_all returns expected results."""
words = [
'responding',
'approached',
'chairman',
'an out-of-vocabulary word',
'another out-of-vocabulary word',
]
vectors_for_all = self.test_model.wv.vectors_for_all(words)

expected = 5
predicted = len(vectors_for_all)
self.assertEqual(expected, predicted)

expected = self.test_model.wv['responding']
predicted = vectors_for_all['responding']
self.assertTrue(np.allclose(expected, predicted))

smaller_distance = np.linalg.norm(
vectors_for_all['an out-of-vocabulary word']
- vectors_for_all['another out-of-vocabulary word']
)
greater_distance = np.linalg.norm(
vectors_for_all['an out-of-vocabulary word']
- vectors_for_all['responding']
)
self.assertGreater(greater_distance, smaller_distance)
Reviewer suggestion (collaborator): use pytest assertion syntax, i.e. assert greater_distance > smaller_distance instead of self.assertGreater(greater_distance, smaller_distance).


with open(datapath('toy-data.txt')) as fin:
TOY_SENTENCES = [fin.read().strip().split(' ')]
19 changes: 19 additions & 0 deletions gensim/test/test_keyedvectors.py
@@ -39,6 +39,25 @@ def test_most_similar(self):
predicted = [result[0] for result in self.vectors.most_similar('war', topn=5)]
self.assertEqual(expected, predicted)

def test_vectors_for_all(self):
"""Test vectors_for_all returns expected results."""
words = [
'conflict',
'administration',
'terrorism',
'an out-of-vocabulary word',
'another out-of-vocabulary word',
]
vectors_for_all = self.vectors.vectors_for_all(words)

expected = 3
predicted = len(vectors_for_all)
self.assertEqual(expected, predicted)

expected = self.vectors['conflict']
predicted = vectors_for_all['conflict']
self.assertTrue(np.allclose(expected, predicted))
Reviewer suggestion (collaborator): use pytest assertion syntax, i.e. assert np.allclose(expected, predicted) instead of self.assertTrue(np.allclose(expected, predicted)).

def test_most_similar_topn(self):
"""Test most_similar returns correct results when `topn` is specified."""
self.assertEqual(len(self.vectors.most_similar('war', topn=5)), 5)