Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Complete the implementation of SMART #2420

Merged
merged 20 commits into RaRe-Technologies:develop from Witiko:complete-smart
Jul 7, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
f22084f
Fix the example code for SparseTermSimilarityMatrix
Witiko Jan 28, 2019
541cbb3
Extend implementation of SMART in models.tfidf
Witiko Mar 17, 2019
269abf3
Add collection frequency attribute to gensim.corpora.Dictionary
Witiko Mar 24, 2019
5b5c12f
Resolve SMART letter aliases in gensim.models.tfidf.resolve_weights
Witiko Mar 24, 2019
3cd63d1
Implement the `b` pivoted document length normalization method
Witiko Mar 24, 2019
40fd9c4
Fix error message in unitvec
Witiko Apr 23, 2019
533be4a
Remove redundant comment in TfidfModel
Witiko Apr 23, 2019
08d51a1
Fix TfidfModel.__getitem__ for callable self.normalize
Witiko Apr 23, 2019
76cdb86
Replace None checks with ducktyping in TfidfModel
Witiko Apr 23, 2019
18d30cb
Document and test wlocal parameter of TfidfModel
Witiko Apr 23, 2019
4b69090
Merge remote-tracking branch 'remotes/upstream/develop' into complete…
Witiko Apr 23, 2019
35f0f9d
Do not accept smartirs=None in resolve_weights
Witiko Apr 29, 2019
5d1213a
Remove blank line between resolve_weights docstring and body (PEP8)
Witiko Apr 29, 2019
13081f7
Omit word `except` from resolve_weights ValueError messages
Witiko Apr 29, 2019
57e5a04
Add missing blank lines to the smartirs_normalize docstring
Witiko Apr 29, 2019
7032636
Cross-reference docstrings of SMART scheme users (functions, classes)
Witiko May 4, 2019
5bb926b
Merge branch 'develop' into complete-smart
Witiko May 7, 2019
a2f4c7e
Document the default SMART scheme of TfidfModel
Witiko May 8, 2019
fccc5e5
Improve the documentation of slope and pivot
Witiko May 17, 2019
e709b0d
Merge remote-tracking branch 'upstream/develop' into complete-smart
Witiko May 31, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ class Dictionary(utils.SaveLoad, Mapping):
token -> tokenId.
id2token : dict of (int, str)
Reverse mapping for token2id, initialized in a lazy manner to save memory (not created until needed).
cfs : dict of (int, int)
piskvorky marked this conversation as resolved.
Show resolved Hide resolved
Collection frequencies: token_id -> how many instances of this token are contained in the documents.
dfs : dict of (int, int)
Document frequencies: token_id -> how many documents contain this token.
num_docs : int
Expand Down Expand Up @@ -74,6 +76,7 @@ def __init__(self, documents=None, prune_at=2000000):
"""
self.token2id = {}
self.id2token = {}
self.cfs = {}
self.dfs = {}

self.num_docs = 0
Expand Down Expand Up @@ -263,10 +266,10 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
self.num_docs += 1
self.num_pos += sum(itervalues(counter))
self.num_nnz += len(result)
# increase document count for each unique token that appeared in the document
dfs = self.dfs
for tokenid in iterkeys(result):
dfs[tokenid] = dfs.get(tokenid, 0) + 1
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

# return tokenids, in ascending id order
result = sorted(iteritems(result))
Expand Down Expand Up @@ -449,10 +452,12 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
if bad_ids is not None:
bad_ids = set(bad_ids)
self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid not in bad_ids}
self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid not in bad_ids}
self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid not in bad_ids}
if good_ids is not None:
good_ids = set(good_ids)
self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid in good_ids}
self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid in good_ids}
self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid in good_ids}
self.compactify()

Expand Down
10 changes: 8 additions & 2 deletions gensim/matutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ def unitvec(vec, norm='l2', return_norm=False):
----------
vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
Input vector in any format
norm : {'l1', 'l2'}, optional
norm : {'l1', 'l2', 'unique'}, optional
Metric to normalize in. For 'unique', the norm is the number of non-zero elements of the vector.
return_norm : bool, optional
Return the length of vector `vec`, in addition to the normalized vector itself?
Expand All @@ -706,7 +706,7 @@ def unitvec(vec, norm='l2', return_norm=False):
Zero-vector will be unchanged.

"""
if norm not in ('l1', 'l2'):
if norm not in ('l1', 'l2', 'unique'):
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1', 'l2' and 'unique'." % norm)

if scipy.sparse.issparse(vec):
Expand All @@ -715,6 +715,8 @@ def unitvec(vec, norm='l2', return_norm=False):
veclen = np.sum(np.abs(vec.data))
if norm == 'l2':
veclen = np.sqrt(np.sum(vec.data ** 2))
if norm == 'unique':
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
veclen = vec.nnz
if veclen > 0.0:
if np.issubdtype(vec.dtype, np.integer):
vec = vec.astype(np.float)
Expand All @@ -734,6 +736,8 @@ def unitvec(vec, norm='l2', return_norm=False):
veclen = np.sum(np.abs(vec))
if norm == 'l2':
veclen = blas_nrm2(vec)
if norm == 'unique':
Witiko marked this conversation as resolved.
Show resolved Hide resolved
veclen = np.count_nonzero(vec)
if veclen > 0.0:
if np.issubdtype(vec.dtype, np.integer):
vec = vec.astype(np.float)
Expand All @@ -757,6 +761,8 @@ def unitvec(vec, norm='l2', return_norm=False):
length = float(sum(abs(val) for _, val in vec))
if norm == 'l2':
length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
if norm == 'unique':
mpenkov marked this conversation as resolved.
Show resolved Hide resolved
length = 1.0 * len(vec)
assert length > 0.0, "sparse documents must not contain any explicit zero entries"
if return_norm:
return ret_normalized_vec(vec, length), length
Expand Down
Loading