Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Complete the implementation of SMART #2420

Merged
merged 20 commits into RaRe-Technologies:develop from Witiko:complete-smart
Jul 7, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
f22084f
Fix the example code for SparseTermSimilarityMatrix
Witiko Jan 28, 2019
541cbb3
Extend implementation of SMART in models.tfidf
Witiko Mar 17, 2019
269abf3
Add collection frequency attribute to gensim.corpora.Dictionary
Witiko Mar 24, 2019
5b5c12f
Resolve SMART letter aliases in gensim.models.tfidf.resolve_weights
Witiko Mar 24, 2019
3cd63d1
Implement the `b` pivoted document length normalization method
Witiko Mar 24, 2019
40fd9c4
Fix error message in unitvec
Witiko Apr 23, 2019
533be4a
Remove redundant comment in TfidfModel
Witiko Apr 23, 2019
08d51a1
Fix TfidfModel.__getitem__ for callable self.normalize
Witiko Apr 23, 2019
76cdb86
Replace None checks with ducktyping in TfidfModel
Witiko Apr 23, 2019
18d30cb
Document and test wlocal parameter of TfidfModel
Witiko Apr 23, 2019
4b69090
Merge remote-tracking branch 'remotes/upstream/develop' into complete…
Witiko Apr 23, 2019
35f0f9d
Do not accept smartirs=None in resolve_weights
Witiko Apr 29, 2019
5d1213a
Remove blank line between resolve_weights docstring and body (PEP8)
Witiko Apr 29, 2019
13081f7
Omit word `except` from resolve_weights ValueError messages
Witiko Apr 29, 2019
57e5a04
Add missing blank lines to the smartirs_normalize docstring
Witiko Apr 29, 2019
7032636
Cross-reference docstrings of SMART scheme users (functions, classes)
Witiko May 4, 2019
5bb926b
Merge branch 'develop' into complete-smart
Witiko May 7, 2019
a2f4c7e
Document the default SMART scheme of TfidfModel
Witiko May 8, 2019
fccc5e5
Improve the documentation of slope and pivot
Witiko May 17, 2019
e709b0d
Merge remote-tracking branch 'upstream/develop' into complete-smart
Witiko May 31, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class Dictionary(utils.SaveLoad, Mapping):
token -> tokenId.
id2token : dict of (int, str)
Reverse mapping for token2id, initialized in a lazy manner to save memory (not created until needed).
cfs : dict of (int, int)
piskvorky marked this conversation as resolved.
Show resolved Hide resolved
Collection frequencies: token_id -> how many instances of this token are contained in the documents.
dfs : dict of (int, int)
Document frequencies: token_id -> how many documents contain this token.
num_docs : int
Expand Down Expand Up @@ -74,6 +76,7 @@ def __init__(self, documents=None, prune_at=2000000):
"""
self.token2id = {}
self.id2token = {}
self.cfs = {}
self.dfs = {}

self.num_docs = 0
Expand Down Expand Up @@ -263,10 +266,10 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
self.num_docs += 1
self.num_pos += sum(itervalues(counter))
self.num_nnz += len(result)
# increase document count for each unique token that appeared in the document
dfs = self.dfs
for tokenid in iterkeys(result):
dfs[tokenid] = dfs.get(tokenid, 0) + 1
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

# return tokenids, in ascending id order
result = sorted(iteritems(result))
Expand Down Expand Up @@ -449,10 +452,12 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
if bad_ids is not None:
bad_ids = set(bad_ids)
self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid not in bad_ids}
self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid not in bad_ids}
self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid not in bad_ids}
if good_ids is not None:
good_ids = set(good_ids)
self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid in good_ids}
self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid in good_ids}
self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid in good_ids}
self.compactify()

Expand Down
10 changes: 8 additions & 2 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ def unitvec(vec, norm='l2', return_norm=False):
----------
vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
Input vector in any format
norm : {'l1', 'l2'}, optional
norm : {'l1', 'l2', 'unique'}, optional
Metric to normalize in. For 'unique', the norm is the number of non-zero elements of the vector.
return_norm : bool, optional
Return the length of vector `vec`, in addition to the normalized vector itself?
Expand All @@ -706,7 +706,7 @@ def unitvec(vec, norm='l2', return_norm=False):
Zero-vector will be unchanged.

"""
if norm not in ('l1', 'l2'):
if norm not in ('l1', 'l2', 'unique'):
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1', 'l2' and 'unique'." % norm)

if scipy.sparse.issparse(vec):
Expand All @@ -715,6 +715,8 @@ def unitvec(vec, norm='l2', return_norm=False):
veclen = np.sum(np.abs(vec.data))
if norm == 'l2':
veclen = np.sqrt(np.sum(vec.data ** 2))
if norm == 'unique':
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
veclen = vec.nnz
if veclen > 0.0:
if np.issubdtype(vec.dtype, np.integer):
vec = vec.astype(np.float)
Expand All @@ -734,6 +736,8 @@ def unitvec(vec, norm='l2', return_norm=False):
veclen = np.sum(np.abs(vec))
if norm == 'l2':
veclen = blas_nrm2(vec)
if norm == 'unique':
Witiko marked this conversation as resolved.
Show resolved Hide resolved
veclen = np.count_nonzero(vec)
if veclen > 0.0:
if np.issubdtype(vec.dtype, np.integer):
vec = vec.astype(np.float)
Expand All @@ -757,6 +761,8 @@ def unitvec(vec, norm='l2', return_norm=False):
length = float(sum(abs(val) for _, val in vec))
if norm == 'l2':
length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
if norm == 'unique':
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
length = 1.0 * len(vec)
assert length > 0.0, "sparse documents must not contain any explicit zero entries"
if return_norm:
return ret_normalized_vec(vec, length), length
Expand Down
Loading