Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ANN search using HNSWLib #544

Merged
merged 15 commits into from
Nov 15, 2023
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,15 @@ options:
--port PORT service port
```

## Efficient retrieval with ANN search

One important aspect of deploying a recommender model is efficient retrieval via Approximate Nearest Neighbor (ANN) search in vector space. Although this is not the main focus of Cornac, we try our best to integrate several open-source frameworks out-of-the-box. [This example](tutorials/ann_hnswlib.ipynb) demonstrates how ANN search works as seamlessly as any other recommender model.
tqtg marked this conversation as resolved.
Show resolved Hide resolved

| Supported framework | Cornac wrapper | Examples |
| :---: | :---: | :---: |
| [nmslib/hnswlib](https://github.com/nmslib/hnswlib) | [HNSWLibANN](cornac/models/ann/recom_ann_hnswlib.py) | [ann_hnswlib.ipynb](tutorials/ann_hnswlib.ipynb) |


## Models

The recommender models supported by Cornac are listed below. Why don't you join us to lengthen the list?
Expand Down
1 change: 1 addition & 0 deletions cornac/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from .recommender import Recommender

from .amr import AMR
from .ann import HNSWLibANN
from .baseline_only import BaselineOnly
from .bivaecf import BiVAECF
from .bpr import BPR
Expand Down
1 change: 1 addition & 0 deletions cornac/models/ann/__init__.py
tqtg marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .recom_ann_hnswlib import HNSWLibANN
139 changes: 139 additions & 0 deletions cornac/models/ann/recom_ann_base.py
tqtg marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# Copyright 2023 The Cornac Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================


from ..recommender import Recommender
from ..recommender import is_ann_supported


class BaseANN(Recommender):
    """Base class for a recommender model supporting Approximate Nearest Neighbor (ANN) search.

    The wrapper copies the user/item vectors and the vector measure out of an
    already trained recommender and answers top-K queries through an ANN index
    that concrete subclasses build in `build_index()` and search in `knn_query()`.

    Parameters
    ----------------
    recom: object: :obj:`cornac.models.Recommender`, required
        Trained recommender model which to get user/item vectors from.

    name: str, optional, default: 'BaseANN'
        Name of the recommender model.

    verbose: boolean, optional, default: False
        When True, running logs are displayed.

    Raises
    ------
    ValueError
        If `recom` does not support ANN search.
    """

    def __init__(self, recom, name="BaseANN", verbose=False):
        super().__init__(name=name, verbose=verbose, trainable=False)

        if not is_ann_supported(recom):
            raise ValueError(f"{recom.name} doesn't support ANN search")

        # ANN required attributes
        self.measure = recom.get_vector_measure()
        self.user_vectors = recom.get_user_vectors()
        self.item_vectors = recom.get_item_vectors()

        # get basic attributes to be a proper recommender
        super().fit(train_set=recom.train_set, val_set=recom.val_set)

    def build_index(self):
        """Building index from the base recommender model.

        :raise NotImplementedError
        """
        raise NotImplementedError()

    def knn_query(self, query, k):
        """Implementing ANN search for a given query.

        Parameters
        ----------
        query: array-like, required
            Matrix of query (user) vectors, one row per query.

        k: int, required
            Number of nearest neighbors to retrieve for each query.

        Returns
        -------
        :raise NotImplementedError
        """
        raise NotImplementedError()

    def recommend(self, user_id, k=-1, remove_seen=False, train_set=None):
        """Generate top-K item recommendations for a given user. Backward compatibility.

        Parameters
        ----------
        user_id: str, required
            The original ID of user.

        k: int, optional, default=-1
            Cut-off length for recommendations, k=-1 will return ranked list of all items.

        remove_seen: bool, optional, default: False
            Remove seen/known items during training and validation from output recommendations.
            NOTE: not implemented yet, the flag is forwarded but currently has no effect.

        train_set: :obj:`cornac.data.Dataset`, optional, default: None
            Training dataset needs to be provided in order to remove seen items.
            NOTE: currently unused, see `remove_seen`.

        Returns
        -------
        recommendations: list
            Recommended items in the form of their original IDs.

        Raises
        ------
        ValueError
            If the user is unknown to the model or `k` is out of range.
        """
        assert isinstance(user_id, str)
        return self.recommend_batch(
            batch_users=[user_id],
            k=k,
            remove_seen=remove_seen,
            train_set=train_set,
        )[0]

    def recommend_batch(self, batch_users, k=-1, remove_seen=False, train_set=None):
        """Generate top-K item recommendations for a given batch of users. This is to leverage
        parallelization provided by some ANN frameworks.

        Parameters
        ----------
        batch_users: list, required
            The original ID of users.

        k: int, optional, default=-1
            Cut-off length for recommendations, k=-1 will return ranked list of all items.

        remove_seen: bool, optional, default: False
            Remove seen/known items during training and validation from output recommendations.
            NOTE: not implemented yet, the flag currently has no effect.

        train_set: :obj:`cornac.data.Dataset`, optional, default: None
            Training dataset needs to be provided in order to remove seen items.
            NOTE: currently unused, see `remove_seen`.

        Returns
        -------
        recommendations: list
            One list per user in `batch_users` (same order), each holding the
            recommended items in the form of their original IDs.

        Raises
        ------
        ValueError
            If any user is unknown to the model or `k` is out of range.
        """
        user_idx = [self.uid_map.get(uid, -1) for uid in batch_users]

        # Report only the offending IDs instead of dumping the whole batch.
        unknown_users = [uid for uid, idx in zip(batch_users, user_idx) if idx == -1]
        if unknown_users:
            raise ValueError(f"{unknown_users} is unknown to the model.")

        if k < -1 or k > self.total_items:
            raise ValueError(
                f"k={k} is invalid, there are {self.total_items} items in total."
            )

        # k=-1 is the documented shorthand for "rank every item"; the ANN
        # backend needs an actual neighbor count.
        if k == -1:
            k = self.total_items

        # Nothing to query for an empty batch.
        if len(user_idx) == 0:
            return []

        query = self.user_vectors[user_idx]
        knn_items, _ = self.knn_query(query, k=k)

        # TODO: remove seen items

        # Map internal item indices back to the original item IDs.
        return [[self.item_ids[i] for i in neighbors] for neighbors in knn_items]
124 changes: 124 additions & 0 deletions cornac/models/ann/recom_ann_hnswlib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Copyright 2023 The Cornac Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================


import multiprocessing
import numpy as np

from .recom_ann_base import BaseANN


class HNSWLibANN(BaseANN):
    """Approximate Nearest Neighbor Search with HNSWLib (https://github.com/nmslib/hnswlib/).

    Parameters
    ----------------
    recom: object: :obj:`cornac.models.Recommender`, required
        Trained recommender model which to get user/item vectors from.

    M: int, optional, default: 16
        Parameter that defines the maximum number of outgoing connections in the HNSW graph.
        Higher M leads to higher accuracy/run_time at fixed ef/efConstruction

    ef_construction: int, optional, default: 100
        Parameter that controls speed/accuracy trade-off during the index construction.

    ef: int, optional, default: 50
        Parameter controlling query time/accuracy trade-off.

    num_threads: int, optional, default: -1
        Default number of threads to use when querying. If num_threads = -1, all cores will be used.

    seed: int, optional, default: None
        Random seed for reproducibility. When None, a random seed is drawn
        each time the index is built.

    name: str, optional, default: 'HNSWLibANN'
        Name of the recommender model.

    verbose: boolean, optional, default: False
        When True, running logs are displayed.
    """

    def __init__(
        self,
        recom,
        M=16,
        ef_construction=100,
        ef=50,
        num_threads=-1,
        seed=None,
        name="HNSWLibANN",
        verbose=False,
    ):
        super().__init__(recom=recom, name=name, verbose=verbose)
        self.M = M
        self.ef_construction = ef_construction
        self.ef = ef
        self.num_threads = (
            num_threads if num_threads != -1 else multiprocessing.cpu_count()
        )
        self.seed = seed

        self.index = None
        self.ignored_attrs.extend(
            [
                "index",  # will be saved separately
                "item_vectors",  # redundant after index is built
            ]
        )

    def build_index(self):
        """Building index from the base recommender model."""
        import hnswlib

        self.index = hnswlib.Index(space=self.measure, dim=self.item_vectors.shape[1])

        # Compare against None explicitly: seed=0 is a valid seed and must not
        # be replaced by a random one.
        random_seed = (
            self.seed
            if self.seed is not None
            else np.random.randint(np.iinfo(int).max)
        )
        self.index.init_index(
            max_elements=self.item_vectors.shape[0],
            ef_construction=self.ef_construction,
            M=self.M,
            random_seed=random_seed,
        )
        # Label every vector with its row number, i.e. the internal item index.
        self.index.add_items(self.item_vectors, np.arange(self.item_vectors.shape[0]))

        self.index.set_ef(self.ef)
        self.index.set_num_threads(self.num_threads)

    def knn_query(self, query, k):
        """Implementing ANN search for a given query.

        Parameters
        ----------
        query: numpy.array, required
            Matrix of query (user) vectors, one row per query.

        k: int, required
            Number of nearest neighbors to retrieve for each query.

        Returns
        -------
        neighbors, distances: numpy.array and numpy.array
            Array of k-nearest neighbors and corresponding distances for the given query.
        """
        neighbors, distances = self.index.knn_query(query, k=k)
        return neighbors, distances

    def save(self, save_dir=None):
        """Save the model, and the index next to it as `<model_file>.idx`.

        Parameters
        ----------
        save_dir: str, optional, default: None
            Directory passed through to the parent `save()`.

        Returns
        -------
        model_file: str
            Path of the saved model file as returned by the parent `save()`.
        """
        model_file = super().save(save_dir)
        self.index.save_index(model_file + ".idx")
        return model_file

    @staticmethod
    def load(model_path, trainable=False):
        """Load a saved model together with its `<model_file>.idx` index.

        Parameters
        ----------
        model_path: str, required
            Path passed through to `BaseANN.load()`.

        trainable: boolean, optional, default: False
            Passed through to `BaseANN.load()`.

        Returns
        -------
        model: object
            The loaded model with its index restored and ready for querying.
        """
        import hnswlib

        model = BaseANN.load(model_path, trainable)
        # `item_vectors` is not saved with the model, so the dimension is taken
        # from the user vectors instead.
        model.index = hnswlib.Index(
            space=model.measure, dim=model.user_vectors.shape[1]
        )
        model.index.load_index(model.load_from + ".idx")
        # Re-apply the query-time settings so the loaded model searches with
        # the same ef/num_threads as after `build_index()`.
        model.index.set_ef(model.ef)
        model.index.set_num_threads(model.num_threads)
        return model
53 changes: 51 additions & 2 deletions cornac/models/mf/recom_mf.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ import numpy as np
cimport numpy as np
from tqdm.auto import trange

from ..recommender import Recommender
from ..recommender import Recommender, ANNMixin
from ...exception import ScoreException
from ...utils import fast_dot
from ...utils import get_rng
from ...utils.init_utils import normal, zeros



class MF(Recommender):
class MF(Recommender, ANNMixin):
"""Matrix Factorization.

Parameters
Expand Down Expand Up @@ -269,3 +269,52 @@ class MF(Recommender):
raise ScoreException("Can't make score prediction for (user_id=%d, item_id=%d)" % (user_idx, item_idx))
item_score = np.dot(self.u_factors[user_idx], self.i_factors[item_idx])
return item_score

def get_vector_measure(self):
    """Tell ANN wrappers which vector measure to use for this model.

    The value is meant to be one of the valid choices in ANNMixin._measures.

    Returns
    -------
    measure: str
        "ip", i.e. "inner product" aka "dot product".
    """
    measure = "ip"
    return measure

def get_user_vectors(self):
    """Build the matrix of user vectors used as queries for ANN search.

    Without biases this is the user factor matrix itself. With biases each
    row is laid out as [user factors, user bias, 1], so that an inner product
    with an item row laid out as [item factors, 1, item bias] adds both
    bias terms to the factor dot product.

    Returns
    -------
    out: numpy.array
        Matrix of user vectors for all users available in the model.
    """
    factors = self.u_factors
    if not self.use_bias:
        return factors

    bias_column = self.u_biases.reshape((-1, 1))
    # constant column that picks up the item bias on the item side
    ones_column = np.ones([factors.shape[0], 1])
    return np.concatenate((factors, bias_column, ones_column), axis=1)

def get_item_vectors(self):
    """Build the matrix of item vectors that the ANN index is built from.

    Without biases this is the item factor matrix itself. With biases each
    row is laid out as [item factors, 1, item bias], so that an inner product
    with a user row laid out as [user factors, user bias, 1] adds both
    bias terms to the factor dot product.

    Returns
    -------
    out: numpy.array
        Matrix of item vectors for all items available in the model.
    """
    factors = self.i_factors
    if not self.use_bias:
        return factors

    # constant column that picks up the user bias on the user side
    ones_column = np.ones([factors.shape[0], 1])
    bias_column = self.i_biases.reshape((-1, 1))
    return np.concatenate((factors, ones_column, bias_column), axis=1)
Loading
Loading