Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sparse support when num_best not None in interfaces. Fixes #1294 #1321

Merged
merged 16 commits into from
Jun 22, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions gensim/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,11 @@ def __getitem__(self, query):
if self.num_best is None:
return result

# if maintain_sparsity is True, result is a scipy sparse matrix. Sort it, clip it
# to the topn entries and return it as a scipy sparse matrix.
if getattr(self, 'maintain_sparsity', False):
return matutils.scipy2scipy_clipped(result, self.num_best)

# if the input query was a corpus (=more documents), compute the top-n
# most similar for each document in turn
if matutils.ismatrix(result):
Expand Down
37 changes: 37 additions & 0 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,43 @@ def any2sparse(vec, eps=1e-9):
return [(int(fid), float(fw)) for fid, fw in vec if np.abs(fw) > eps]


def scipy2scipy_clipped(matrix, topn, eps=1e-9):
    """
    Return a scipy.sparse vector/matrix consisting of 'topn' elements of the greatest magnitude (absolute value).

    For a single-row input the `topn` entries of the row are kept; for a
    multi-row input the clipping is applied to every row independently.

    Parameters
    ----------
    matrix : scipy.sparse matrix
        Input vector (one row) or matrix. It is converted to CSR before use.
    topn : int
        Maximum number of entries to keep per row.
    eps : float, optional
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with the same shape as `matrix`, holding at most `topn` stored
        entries per row, stored in the order produced by `argsort`
        (presumably descending magnitude -- confirm against `argsort`).
        If `topn <= 0`, the result of `scipy.sparse.csr_matrix([])` is returned.

    Raises
    ------
    ValueError
        If `matrix` is not a scipy sparse matrix.
    """
    if not scipy.sparse.issparse(matrix):
        # wrap in a 1-tuple so that a tuple argument cannot break the %-formatting
        raise ValueError("'%s' is not a scipy sparse vector." % (matrix,))
    if topn <= 0:
        return scipy.sparse.csr_matrix([])

    # `.indices` holds column ids only in CSR layout, so normalise the format first.
    matrix = matrix.tocsr()

    # Return clipped sparse vector if input is a sparse vector.
    if matrix.shape[0] == 1:
        # use np.argpartition/argsort and only form tuples that are actually returned.
        biggest = argsort(abs(matrix.data), topn, reverse=True)
        indices, data = matrix.indices.take(biggest), matrix.data.take(biggest)
        # Pass the shape explicitly: inferring it from `indices` fails for an
        # all-zero vector and would otherwise shrink the number of columns.
        return scipy.sparse.csr_matrix((data, indices, [0, len(indices)]), shape=matrix.shape)

    # Return clipped sparse matrix if input is a matrix, processing row by row.
    matrix_indices = []
    matrix_data = []
    matrix_indptr = [0]
    # calling abs() on entire matrix once is faster than calling abs() iteratively for each row
    matrix_abs = abs(matrix)
    for i in range(matrix.shape[0]):
        v = matrix.getrow(i)
        v_abs = matrix_abs.getrow(i)
        # Sort and clip each row vector first.
        biggest = argsort(v_abs.data, topn, reverse=True)
        indices, data = v.indices.take(biggest), v.data.take(biggest)
        # Store the topn indices and values of each row vector.
        matrix_data.append(data)
        matrix_indices.append(indices)
        # The row pointer must advance by exactly the number of entries appended above.
        matrix_indptr.append(matrix_indptr[-1] + len(indices))

    if matrix_indices:
        matrix_indices = np.concatenate(matrix_indices).ravel()
        matrix_data = np.concatenate(matrix_data).ravel()
    else:
        # A matrix with zero rows: np.concatenate() refuses an empty list.
        matrix_indices = np.empty(0, dtype=matrix.indices.dtype)
        matrix_data = np.empty(0, dtype=matrix.dtype)

    # Instantiate and return a sparse csr_matrix which preserves the order of indices/data.
    # The input shape is reused so that np.max() is never evaluated on an empty array
    # and the clipped matrix keeps the original number of columns.
    return scipy.sparse.csr_matrix((matrix_data, matrix_indices, matrix_indptr), shape=matrix.shape)


def scipy2sparse(vec, eps=1e-9):
"""Convert a scipy.sparse vector into gensim document format (=list of 2-tuples)."""
vec = vec.tocsr()
Expand Down
31 changes: 31 additions & 0 deletions gensim/test/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,22 @@ def test_full2sparse_clipped(self):
expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)]
self.assertTrue(matutils.full2sparse_clipped(vec, topn=3), expected)

def test_scipy2scipy_clipped(self):
    """Check that scipy2scipy_clipped keeps the 3 largest-magnitude entries of a row and of each matrix row."""
    vec = [0.8, 0.2, 0.0, 0.0, -0.1, -0.15]
    expected = [(0, 0.80000000000000004), (1, 0.20000000000000001), (5, -0.14999999999999999)]

    # Test for scipy vector/row
    vec_scipy = scipy.sparse.csr_matrix(vec)
    vec_scipy_clipped = matutils.scipy2scipy_clipped(vec_scipy, topn=3)
    self.assertTrue(scipy.sparse.issparse(vec_scipy_clipped))
    # assertEqual, not assertTrue: the second argument of assertTrue is only the
    # failure message, so assertTrue(value, expected) never compares anything.
    self.assertEqual(matutils.scipy2sparse(vec_scipy_clipped), expected)

    # Test for scipy matrix
    matrix_scipy = scipy.sparse.csr_matrix([vec] * 3)
    matrix_scipy_clipped = matutils.scipy2scipy_clipped(matrix_scipy, topn=3)
    self.assertTrue(scipy.sparse.issparse(matrix_scipy_clipped))
    self.assertEqual([matutils.scipy2sparse(x) for x in matrix_scipy_clipped], [expected] * 3)


def testChunking(self):
Expand Down Expand Up @@ -405,6 +421,21 @@ def testMaintainSparsity(self):
self.assertTrue(scipy.sparse.issparse(sparse_sims))
numpy.testing.assert_array_equal(dense_sims, sparse_sims.todense())

def testMaintainSparsityWithNumBest(self):
    """Top-n queries return a scipy sparse result iff maintain_sparsity=True, with the same content either way."""
    feature_count = len(dictionary)

    # Reference result: dense (gensim list) output, limited to the 3 best hits.
    sim_index = self.cls(corpus, num_features=feature_count, maintain_sparsity=False, num_best=3)
    dense_topn_sims = sim_index[corpus]

    # Same query with sparsity maintained.
    sim_index = self.cls(corpus, num_features=feature_count, maintain_sparsity=True, num_best=3)
    scipy_topn_sims = sim_index[corpus]

    self.assertFalse(scipy.sparse.issparse(dense_topn_sims))
    self.assertTrue(scipy.sparse.issparse(scipy_topn_sims))

    # Convert each sparse row to gensim's list-of-tuples format before comparing.
    sparse_rows_as_lists = [matutils.scipy2sparse(row) for row in scipy_topn_sims]
    self.assertEqual(dense_topn_sims, sparse_rows_as_lists)



class TestSimilarity(unittest.TestCase, _TestSimilarityABC):
def setUp(self):
Expand Down