From f22084f2476a723ed6be39537e206e16f42ea77b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?=
Date: Mon, 28 Jan 2019 07:30:09 +0100
Subject: [PATCH 01/17] Fix the example code for SparseTermSimilarityMatrix

---
 gensim/similarities/termsim.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py
index 6a0b6d12b5..167b73b241 100644
--- a/gensim/similarities/termsim.py
+++ b/gensim/similarities/termsim.py
@@ -128,13 +128,13 @@ class SparseTermSimilarityMatrix(SaveLoad):
     >>> from gensim.test.utils import common_texts
     >>> from gensim.corpora import Dictionary
     >>> from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
-    >>> from gensim.similarities import SoftCosineSimilarity, TermSimilarityMatrix
+    >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
     >>>
     >>> model = Word2Vec(common_texts, size=20, min_count=1)  # train word-vectors
-    >>> termsim_index = WordEmbeddingSimilarityIndex(model)
+    >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
     >>> dictionary = Dictionary(common_texts)
     >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
-    >>> similarity_matrix = TermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
+    >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
     >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
     >>>
     >>> query = 'graph trees computer'.split()  # make a query

From 541cbb35b8c41e8a3f2e1f52ad8fb3767c6da9df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?=
Date: Sun, 17 Mar 2019 01:23:24 +0100
Subject: [PATCH 02/17] Extend implementation of SMART in models.tfidf

These are our additions:

* Make `t` an alias for the `n` term frequency method.
* Rename the `t` document frequency method to `f`.
* Implement a new `t` (zero-corrected idf) document frequency method.
* Make `x` an alias for the `n` document frequency method.
* Make `x` an alias for the `n` document length normalization method.
* Implement the `u` pivoted unique document length normalization method.
* Add the `unique` vector norm to matutils.unitvec.
* Produce a helpful error message when a SMART scheme in the `ddd.qqq` format is requested.

---
 gensim/matutils.py             |  10 ++-
 gensim/models/tfidfmodel.py    | 116 ++++++++++++++++++++------
 gensim/sklearn_api/tfidf.py    |  43 ++++++------
 gensim/test/test_tfidfmodel.py |  81 ++++++++++++++++++++---
 4 files changed, 176 insertions(+), 74 deletions(-)

diff --git a/gensim/matutils.py b/gensim/matutils.py
index 979b99f6d5..ff584dfc4d 100644
--- a/gensim/matutils.py
+++ b/gensim/matutils.py
@@ -689,7 +689,7 @@ def unitvec(vec, norm='l2', return_norm=False):
     ----------
     vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
         Input vector in any format
-    norm : {'l1', 'l2'}, optional
+    norm : {'l1', 'l2', 'unique'}, optional
         Metric to normalize in.
     return_norm : bool, optional
         Return the length of vector `vec`, in addition to the normalized vector itself?
@@ -706,7 +706,7 @@ def unitvec(vec, norm='l2', return_norm=False):
     Zero-vector will be unchanged.

     """
-    if norm not in ('l1', 'l2'):
+    if norm not in ('l1', 'l2', 'unique'):
         raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'."
% norm) if scipy.sparse.issparse(vec): @@ -715,6 +715,8 @@ def unitvec(vec, norm='l2', return_norm=False): veclen = np.sum(np.abs(vec.data)) if norm == 'l2': veclen = np.sqrt(np.sum(vec.data ** 2)) + if norm == 'unique': + veclen = vec.nnz if veclen > 0.0: if np.issubdtype(vec.dtype, np.integer): vec = vec.astype(np.float) @@ -734,6 +736,8 @@ def unitvec(vec, norm='l2', return_norm=False): veclen = np.sum(np.abs(vec)) if norm == 'l2': veclen = blas_nrm2(vec) + if norm == 'unique': + veclen = np.count_nonzero(vec) if veclen > 0.0: if np.issubdtype(vec.dtype, np.integer): vec = vec.astype(np.float) @@ -757,6 +761,8 @@ def unitvec(vec, norm='l2', return_norm=False): length = float(sum(abs(val) for _, val in vec)) if norm == 'l2': length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec)) + if norm == 'unique': + length = 1.0 * len(vec) assert length > 0.0, "sparse documents must not contain any explicit zero entries" if return_norm: return ret_normalized_vec(vec, length), length diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a4cbedcd22..a87f580aa4 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -15,6 +15,7 @@ import logging from functools import partial +import re from gensim import interfaces, matutils, utils from six import iteritems @@ -43,20 +44,23 @@ def resolve_weights(smartirs): local_letter : str Term frequency weighing, one of: - * `n` - natural, - * `l` - logarithm, + * `b` - binary, + * `t` or `n` - raw, * `a` - augmented, - * `b` - boolean, + * `l` - logarithm, + * `d` - double logarithm, * `L` - log average. global_letter : str Document frequency weighting, one of: - * `n` - none, - * `t` - idf, - * `p` - prob idf. + * `x` or `n` - none, + * `f` - idf, + * `t` - zero-corrected idf, + * `p` - probabilistic idf. normalization_letter : str Document normalization, one of: - * `n` - none, - * `c` - cosine. + * `x` or `n` - none, + * `c` - cosine, + * `u` - pivoted unique. Raises ------ @@ -65,19 +69,29 @@ def resolve_weights(smartirs): doesn't fit the list of permissible values. """ + if isinstance(smartirs, str) and re.match(r"...\....", smartirs): + match = re.match(r"(?P...)\.(?P...)", smartirs) + raise ValueError( + "The notation {ddd}.{qqq} specifies two term-weighting schemes, " + "one for collection documents ({ddd}) and one for queries ({qqq}). " + "You must train two separate tf-idf models.".format( + ddd=match.group("ddd"), + qqq=match.group("qqq"), + ) + ) if not isinstance(smartirs, str) or len(smartirs) != 3: raise ValueError("Expected a string of length 3 except got " + smartirs) w_tf, w_df, w_n = smartirs - if w_tf not in 'nlabL': - raise ValueError("Expected term frequency weight to be one of 'nlabL', except got {}".format(w_tf)) + if w_tf not in 'btnaldL': + raise ValueError("Expected term frequency weight to be one of 'btnaldL', except got {}".format(w_tf)) - if w_df not in 'ntp': - raise ValueError("Expected inverse document frequency weight to be one of 'ntp', except got {}".format(w_df)) + if w_df not in 'xnftp': + raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', except got {}".format(w_df)) - if w_n not in 'nc': - raise ValueError("Expected normalization weight to be one of 'ncb', except got {}".format(w_n)) + if w_n not in 'xncu': + raise ValueError("Expected normalization weight to be one of 'xncu', except got {}".format(w_n)) return w_tf, w_df, w_n @@ -137,7 +151,7 @@ def smartirs_wlocal(tf, local_scheme): ---------- tf : int Term frequency. 
- local : {'n', 'l', 'a', 'b', 'L'} + local : {'b', 't', 'n', 'a', 'l', 'd', 'L'} Local transformation scheme. Returns @@ -146,10 +160,12 @@ def smartirs_wlocal(tf, local_scheme): Calculated local weight. """ - if local_scheme == "n": + if local_scheme in ("t", "n"): return tf elif local_scheme == "l": return 1 + np.log2(tf) + elif local_scheme == "d": + return 1 + np.log2(1 + np.log2(tf)) elif local_scheme == "a": return 0.5 + (0.5 * tf / tf.max(axis=0)) elif local_scheme == "b": @@ -167,7 +183,7 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): Document frequency. totaldocs : int Total number of documents. - global_scheme : {'n', 't', 'p'} + global_scheme : {'x', 'n', 'f', 't', 'p'} Global transformation scheme. Returns @@ -176,11 +192,12 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): Calculated global weight. """ - - if global_scheme == "n": - return 1. - elif global_scheme == "t": + if global_scheme in ("x", "n"): + return 1.0 + elif global_scheme == "f": return np.log2(1.0 * totaldocs / docfreq) + elif global_scheme == "t": + return np.log2((totaldocs + 1.0) / docfreq) elif global_scheme == "p": return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq)) @@ -191,11 +208,9 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): Parameters ---------- x : numpy.ndarray - Input array - norm_scheme : {'n', 'c'} - Normalizing function to use: - `n`: no normalization - `c`: unit L2 norm (scale `x` to unit euclidean length) + The tf-idf vector. + norm_scheme : {'x', 'n', 'c', 'u'} + Document length normalization scheme. return_norm : bool, optional Return the length of `x` as well? @@ -204,10 +219,10 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): numpy.ndarray Normalized array. float (only if return_norm is set) - L2 norm of `x`. + Norm of `x`. """ - if norm_scheme == "n": + if norm_scheme in ("x", "n"): if return_norm: _, length = matutils.unitvec(x, return_norm=return_norm) return x, length @@ -215,6 +230,8 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): return x elif norm_scheme == "c": return matutils.unitvec(x, return_norm=return_norm) + elif norm_scheme == "u": + return matutils.unitvec(x, return_norm=return_norm, norm='unique') class TfidfModel(interfaces.TransformationABC): @@ -265,7 +282,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden (other options: :func:`math.sqrt`, :func:`math.log1p`, etc). wglobal : function, optional Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. - normalize : bool, optional + normalize : {bool, callable}, optional Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`. smartirs : str, optional SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, @@ -274,20 +291,23 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector. Term frequency weighing: - * `n` - natural, - * `l` - logarithm, + * `b` - binary, + * `t` or `n` - raw, * `a` - augmented, - * `b` - boolean, + * `l` - logarithm, + * `d` - double logarithm, * `L` - log average. Document frequency weighting: - * `n` - none, - * `t` - idf, - * `p` - prob idf. + * `x` or `n` - none, + * `f` - idf, + * `t` - zero-corrected idf, + * `p` - probabilistic idf. Document normalization: - * `n` - none, - * `c` - cosine. 
+ * `x` or `n` - none, + * `c` - cosine, + * `u` - pivoted unique. For more information visit `SMART Information Retrieval System `_. @@ -300,8 +320,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden retrieval and relevance curves intersect. This parameter along with `slope` is used for pivoted document length normalization. - Only when `pivot` is not None will pivoted document length normalization be applied. - Otherwise, regular TfIdf is used. + When `pivot` is None, `smartirs` specifies the pivoted unique document normalization scheme, and either + `corpus` or `dictionary` are specified, then the pivot will be determined automatically. Otherwise, no + pivoted document length normalization is applied. slope : float, optional Parameter required by pivoted document length normalization which determines the slope to which the `old normalization` can be tilted. This parameter only works when pivot is defined. @@ -315,16 +336,11 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.pivot = pivot self.eps = 1e-12 - # If smartirs is not None, override wlocal, wglobal and normalize + # If smartirs is not None, override wlocal and wglobal if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf) self.wglobal = partial(smartirs_wglobal, global_scheme=n_df) - # also return norm factor if pivot is not none - if self.pivot is None: - self.normalize = partial(smartirs_normalize, norm_scheme=n_n) - else: - self.normalize = partial(smartirs_normalize, norm_scheme=n_n, return_norm=True) if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -346,6 +362,16 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden # be initialized in some other way pass + # If smartirs is not None, override pivot and normalize + if smartirs is not None: + if self.pivot is None and (dictionary is not None or corpus is not None) and n_n == "u": + self.pivot = 1.0 * self.num_nnz / self.num_docs + # also return norm factor if pivot is not none + if self.pivot is None: + self.normalize = partial(smartirs_normalize, norm_scheme=n_n) + else: + self.normalize = partial(smartirs_normalize, norm_scheme=n_n, return_norm=True) + @classmethod def load(cls, *args, **kwargs): """Load a previously saved TfidfModel class. Handles backwards compatibility from diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index 4484037572..f5ae6b8263 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -34,7 +34,7 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, - wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc", + wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="nfc", pivot=None, slope=0.65): """ @@ -60,21 +60,25 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, The mnemonic for representing a combination of weights takes the form XYZ, for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector. - Term frequency weighing: - * `n` - natural, - * `l` - logarithm, - * `a` - augmented, - * `b` - boolean, - * `L` - log average. - - Document frequency weighting: - * `n` - none, - * `t` - idf, - * `p` - prob idf. - - Document normalization: - * `n` - none, - * `c` - cosine. 
+ local_letter : str + Term frequency weighing, one of: + * `b` - binary, + * `t` or `n` - raw, + * `a` - augmented, + * `l` - logarithm, + * `d` - double logarithm, + * `L` - log average. + global_letter : str + Document frequency weighting, one of: + * `x` or `n` - none, + * `f` - idf, + * `t` - zero-corrected idf, + * `p` - probabilistic idf. + normalization_letter : str + Document normalization, one of: + * `x` or `n` - none, + * `c` - cosine, + * `u` - pivoted unique. For more info, visit `"Wikipedia" `_. pivot : float, optional @@ -82,9 +86,10 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, normalization curve. In the paper `Amit Singhal, Chris Buckley, Mandar Mitra: "Pivoted Document Length Normalization" `_ it is the point where the retrieval and relevance curves intersect. - This parameter along with slope is used for pivoted document length normalization. - Only when `pivot` is not None pivoted document length normalization will be applied else regular TfIdf - is used. + This parameter along with `slope` is used for pivoted document length normalization. + When `pivot` is None, `smartirs` specifies the pivoted unique document normalization scheme, and either + `corpus` or `dictionary` are specified, then the pivot will be determined automatically. Otherwise, no + pivoted document length normalization is applied. slope : float, optional It is the parameter required by pivoted document length normalization which determines the slope to which the `old normalization` can be tilted. This parameter only works when pivot is defined by user and is not diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 79e3742d48..cb03b1112d 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -80,7 +80,7 @@ def test_persistence(self): # Test persistence with using `smartirs` fname = get_tmpfile('gensim_models_smartirs.tst') - model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model = tfidfmodel.TfidfModel(self.corpus, smartirs="nfc") model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) @@ -90,7 +90,7 @@ def test_persistence(self): self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence between Gensim v3.2.0 and current model. - model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="nfc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] @@ -134,7 +134,7 @@ def test_persistence_compressed(self): # Test persistence with using `smartirs` fname = get_tmpfile('gensim_models_smartirs.tst.gz') - model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model = tfidfmodel.TfidfModel(self.corpus, smartirs="nfc") model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) @@ -144,7 +144,7 @@ def test_persistence_compressed(self): self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence between Gensim v3.2.0 and current compressed model. 
- model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="nfc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] @@ -178,7 +178,7 @@ def test_consistency(self): docs = [corpus[1], corpus[2]] # Test if `ntc` yields the default docs. - model = tfidfmodel.TfidfModel(corpus, smartirs='ntc') + model = tfidfmodel.TfidfModel(corpus, smartirs='nfc') transformed_docs = [model[docs[0]], model[docs[1]]] model = tfidfmodel.TfidfModel(corpus) @@ -188,6 +188,14 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) # Testing all the variations of `wlocal` + # tnn + model = tfidfmodel.TfidfModel(corpus, smartirs='tnn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = docs[:] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # nnn model = tfidfmodel.TfidfModel(corpus, smartirs='nnn') transformed_docs = [model[docs[0]], model[docs[1]]] @@ -207,6 +215,17 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # dnn + model = tfidfmodel.TfidfModel(corpus, smartirs='dnn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [ + [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)], + [(5, 2.0), (9, 1.0), (10, 1.0)] + ] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # ann model = tfidfmodel.TfidfModel(corpus, smartirs='ann') transformed_docs = [model[docs[0]], model[docs[1]]] @@ -242,12 +261,17 @@ def test_consistency(self): ] ] + # Testing all the variations of `glocal` + # nxn + model = tfidfmodel.TfidfModel(corpus, smartirs='nxn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = docs[:] + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) - # Testing all the variations of `glocal` - # ntn - model = tfidfmodel.TfidfModel(corpus, smartirs='ntn') + # nfn + model = tfidfmodel.TfidfModel(corpus, smartirs='nfn') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ [ @@ -262,6 +286,22 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # ntn + model = tfidfmodel.TfidfModel(corpus, smartirs='ntn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [ + [ + (3, 3.321928094887362), (4, 3.321928094887362), (5, 1.736965594166206), (6, 3.321928094887362), + (7, 3.321928094887362), (8, 2.321928094887362) + ], + [ + (5, 3.473931188332412), (9, 3.321928094887362), (10, 3.321928094887362) + ] + ] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # npn model = tfidfmodel.TfidfModel(corpus, smartirs='npn') transformed_docs = [model[docs[0]], model[docs[1]]] @@ -279,6 +319,14 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) # Testing all the variations of `normalize` + # nnx + model = tfidfmodel.TfidfModel(corpus, smartirs='nnx') + 
transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = docs[:] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # nnc model = tfidfmodel.TfidfModel(corpus, smartirs='nnc') transformed_docs = [model[docs[0]], model[docs[1]]] @@ -305,6 +353,23 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # nnu + slope = 0.2 + model = tfidfmodel.TfidfModel(corpus, smartirs='nnu', slope=slope) + transformed_docs = [model[docs[0]], model[docs[1]]] + average_unique_length = 1.0 * sum(len(set(text)) for text in texts) / len(texts) + vector_norms = [ + (1.0 - slope) * average_unique_length + slope * 6.0, + (1.0 - slope) * average_unique_length + slope * 3.0, + ] + expected_docs = [ + [(termid, weight / vector_norms[0]) for termid, weight in docs[0]], + [(termid, weight / vector_norms[1]) for termid, weight in docs[1]], + ] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + def test_pivoted_normalization(self): docs = [corpus[1], corpus[2]] From 269abf3737ee85278e1daca370e6507945cca1a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sun, 24 Mar 2019 12:14:44 +0100 Subject: [PATCH 03/17] Add collection frequency attribute to gensim.corpora.Dictionary --- gensim/corpora/dictionary.py | 13 +++++++++---- gensim/test/test_corpora_dictionary.py | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 21df726f3d..61634bb77f 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -36,6 +36,8 @@ class Dictionary(utils.SaveLoad, Mapping): token -> tokenId. id2token : dict of (int, str) Reverse mapping for token2id, initialized in a lazy manner to save memory (not created until needed). + cfs : dict of (int, int) + Collection frequencies: token_id -> how many instances of this token are contained in the documents. dfs : dict of (int, int) Document frequencies: token_id -> how many documents contain this token. 
num_docs : int @@ -74,6 +76,7 @@ def __init__(self, documents=None, prune_at=2000000): """ self.token2id = {} self.id2token = {} + self.cfs = {} self.dfs = {} self.num_docs = 0 @@ -263,10 +266,10 @@ def doc2bow(self, document, allow_update=False, return_missing=False): self.num_docs += 1 self.num_pos += sum(itervalues(counter)) self.num_nnz += len(result) - # increase document count for each unique token that appeared in the document - dfs = self.dfs - for tokenid in iterkeys(result): - dfs[tokenid] = dfs.get(tokenid, 0) + 1 + # keep track of document and collection frequencies + for tokenid, freq in iteritems(result): + self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq + self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1 # return tokenids, in ascending id order result = sorted(iteritems(result)) @@ -449,10 +452,12 @@ def filter_tokens(self, bad_ids=None, good_ids=None): if bad_ids is not None: bad_ids = set(bad_ids) self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid not in bad_ids} + self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid not in bad_ids} self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid not in bad_ids} if good_ids is not None: good_ids = set(good_ids) self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid in good_ids} + self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid in good_ids} self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid in good_ids} self.compactify() diff --git a/gensim/test/test_corpora_dictionary.py b/gensim/test/test_corpora_dictionary.py index e5ec3221fd..cca9993952 100644 --- a/gensim/test/test_corpora_dictionary.py +++ b/gensim/test/test_corpora_dictionary.py @@ -76,6 +76,25 @@ def testDocFreqForOneDocWithSeveralWord(self): expected = {0: 1, 1: 1, 2: 1} self.assertEqual(d.dfs, expected) + def testDocFreqAndCollectionFreq(self): + # one doc + texts = [['human', 'human', 'human']] + d = Dictionary(texts) + self.assertEqual(d.cfs, {0: 3}) + self.assertEqual(d.dfs, {0: 1}) + + # two docs + texts = [['human', 'human'], ['human']] + d = Dictionary(texts) + self.assertEqual(d.cfs, {0: 3}) + self.assertEqual(d.dfs, {0: 2}) + + # three docs + texts = [['human'], ['human'], ['human']] + d = Dictionary(texts) + self.assertEqual(d.cfs, {0: 3}) + self.assertEqual(d.dfs, {0: 3}) + def testBuild(self): d = Dictionary(self.texts) From 5b5c12ff7240db1b37ef0b9c6216f59473925e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sun, 24 Mar 2019 19:15:08 +0100 Subject: [PATCH 04/17] Resolve SMART letter aliases in gensim.models.tfidf.resolve_weights --- gensim/models/tfidfmodel.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a87f580aa4..3b3b0131fd 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -30,7 +30,7 @@ def resolve_weights(smartirs): Parameters ---------- - smartirs : str + smartirs : str or None `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting variants in the vector space model. 
The mnemonic for representing a combination @@ -40,7 +40,7 @@ def resolve_weights(smartirs): Returns ------- - 3-tuple (local_letter, global_letter, normalization_letter) + str of (local_letter, global_letter, normalization_letter) or None local_letter : str Term frequency weighing, one of: @@ -69,6 +69,10 @@ def resolve_weights(smartirs): doesn't fit the list of permissible values. """ + + if smartirs is None: + return None + if isinstance(smartirs, str) and re.match(r"...\....", smartirs): match = re.match(r"(?P...)\.(?P...)", smartirs) raise ValueError( @@ -93,7 +97,15 @@ def resolve_weights(smartirs): if w_n not in 'xncu': raise ValueError("Expected normalization weight to be one of 'xncu', except got {}".format(w_n)) - return w_tf, w_df, w_n + # resolve aliases + if w_tf == "t": + w_tf = "n" + if w_df == "x": + w_df = "n" + if w_n == "x": + w_n = "n" + + return w_tf + w_df + w_n def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): @@ -151,7 +163,7 @@ def smartirs_wlocal(tf, local_scheme): ---------- tf : int Term frequency. - local : {'b', 't', 'n', 'a', 'l', 'd', 'L'} + local : {'b', 'n', 'a', 'l', 'd', 'L'} Local transformation scheme. Returns @@ -160,7 +172,7 @@ def smartirs_wlocal(tf, local_scheme): Calculated local weight. """ - if local_scheme in ("t", "n"): + if local_scheme == "n": return tf elif local_scheme == "l": return 1 + np.log2(tf) @@ -183,7 +195,7 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): Document frequency. totaldocs : int Total number of documents. - global_scheme : {'x', 'n', 'f', 't', 'p'} + global_scheme : {'n', 'f', 't', 'p'} Global transformation scheme. Returns @@ -192,7 +204,7 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): Calculated global weight. """ - if global_scheme in ("x", "n"): + if global_scheme == "n": return 1.0 elif global_scheme == "f": return np.log2(1.0 * totaldocs / docfreq) @@ -209,7 +221,7 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): ---------- x : numpy.ndarray The tf-idf vector. - norm_scheme : {'x', 'n', 'c', 'u'} + norm_scheme : {'n', 'c', 'u'} Document length normalization scheme. return_norm : bool, optional Return the length of `x` as well? @@ -222,7 +234,7 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): Norm of `x`. """ - if norm_scheme in ("x", "n"): + if norm_scheme == "n": if return_norm: _, length = matutils.unitvec(x, return_norm=return_norm) return x, length From 3cd63d1a62353baac1fd9abb85509d4f62b45730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sun, 24 Mar 2019 21:14:19 +0100 Subject: [PATCH 05/17] Implement the `b` pivoted document length normalization method --- gensim/models/tfidfmodel.py | 95 +++++++++++++++++++++++----------- gensim/sklearn_api/tfidf.py | 3 +- gensim/test/test_tfidfmodel.py | 17 ++++++ 3 files changed, 85 insertions(+), 30 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 3b3b0131fd..df649da0f6 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -18,7 +18,8 @@ import re from gensim import interfaces, matutils, utils -from six import iteritems +from gensim.utils import deprecated +from six import iteritems, iterkeys import numpy as np @@ -60,7 +61,8 @@ def resolve_weights(smartirs): Document normalization, one of: * `x` or `n` - none, * `c` - cosine, - * `u` - pivoted unique. + * `u` - pivoted unique, + * `b` - pivoted character length. 
Raises ------ @@ -94,8 +96,8 @@ def resolve_weights(smartirs): if w_df not in 'xnftp': raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', except got {}".format(w_df)) - if w_n not in 'xncu': - raise ValueError("Expected normalization weight to be one of 'xncu', except got {}".format(w_n)) + if w_n not in 'xncub': + raise ValueError("Expected normalization weight to be one of 'xncub', except got {}".format(w_n)) # resolve aliases if w_tf == "t": @@ -214,25 +216,23 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq)) +@deprecated("Function will be removed in 4.0.0") def smartirs_normalize(x, norm_scheme, return_norm=False): """Normalize a vector using the normalization scheme specified in `norm_scheme`. - Parameters ---------- x : numpy.ndarray The tf-idf vector. - norm_scheme : {'n', 'c', 'u'} + norm_scheme : {'n', 'c'} Document length normalization scheme. return_norm : bool, optional Return the length of `x` as well? - Returns ------- numpy.ndarray Normalized array. float (only if return_norm is set) Norm of `x`. - """ if norm_scheme == "n": if return_norm: @@ -242,8 +242,6 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): return x elif norm_scheme == "c": return matutils.unitvec(x, return_norm=return_norm) - elif norm_scheme == "u": - return matutils.unitvec(x, return_norm=return_norm, norm='unique') class TfidfModel(interfaces.TransformationABC): @@ -319,7 +317,8 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden Document normalization: * `x` or `n` - none, * `c` - cosine, - * `u` - pivoted unique. + * `u` - pivoted unique, + * `b` - pivoted character length. For more information visit `SMART Information Retrieval System `_. @@ -332,9 +331,12 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden retrieval and relevance curves intersect. This parameter along with `slope` is used for pivoted document length normalization. - When `pivot` is None, `smartirs` specifies the pivoted unique document normalization scheme, and either - `corpus` or `dictionary` are specified, then the pivot will be determined automatically. Otherwise, no - pivoted document length normalization is applied. + + When `pivot` is None, and `smartirs` specifies the pivoted unique document normalization scheme (u), and + either `corpus` or `dictionary` are specified, then the pivot will be determined automatically. + + When `pivot` is None, and `smartirs` specifies the character length unique document normalization + scheme (b), and `dictionary` is specified, then the pivot will be determined automatically. slope : float, optional Parameter required by pivoted document length normalization which determines the slope to which the `old normalization` can be tilted. This parameter only works when pivot is defined. 
@@ -343,14 +345,14 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None - self.smartirs = smartirs + self.smartirs = resolve_weights(smartirs) self.slope = slope self.pivot = pivot self.eps = 1e-12 # If smartirs is not None, override wlocal and wglobal if smartirs is not None: - n_tf, n_df, n_n = resolve_weights(smartirs) + n_tf, n_df, n_n = self.smartirs self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf) self.wglobal = partial(smartirs_wglobal, global_scheme=n_df) @@ -363,7 +365,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus" ) self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz + self.cfs = dictionary.cfs.copy() self.dfs = dictionary.dfs.copy() + self.term_lens = {termid: len(term) for termid, term in iteritems(dictionary)} self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) if id2word is None: self.id2word = dictionary @@ -376,13 +380,25 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden # If smartirs is not None, override pivot and normalize if smartirs is not None: - if self.pivot is None and (dictionary is not None or corpus is not None) and n_n == "u": - self.pivot = 1.0 * self.num_nnz / self.num_docs - # also return norm factor if pivot is not none if self.pivot is None: - self.normalize = partial(smartirs_normalize, norm_scheme=n_n) - else: - self.normalize = partial(smartirs_normalize, norm_scheme=n_n, return_norm=True) + if n_n == "u": + if dictionary is not None or corpus is not None: + if callable(self.normalize): + logger.warning("constructor received smartirs; ignoring normalize") + self.pivot = 1.0 * self.num_nnz / self.num_docs + else: + logger.warning("constructor received no corpus or dictionary; ignoring smartirs[2]") + elif n_n == "b": + if dictionary is not None: + if callable(self.normalize): + logger.warning("constructor received smartirs; ignoring normalize") + self.pivot = 1.0 * sum( + self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in iterkeys(dictionary) + ) / self.num_docs + else: + logger.warning("constructor received no dictionary; ignoring smartirs[2]") + elif n_n in 'ub': + logger.warning("constructor received pivot; ignoring smartirs[2]") @classmethod def load(cls, *args, **kwargs): @@ -426,7 +442,9 @@ def initialize(self, corpus): # keep some stats about the training corpus self.num_docs = docno + 1 self.num_nnz = numnnz + self.cfs = None self.dfs = dfs + self.term_lengths = None # and finally compute the idf weights n_features = max(dfs) if dfs else 0 logger.info( @@ -474,18 +492,37 @@ def __getitem__(self, bow, eps=1e-12): for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps ] - if self.normalize is True: - self.normalize = matutils.unitvec - elif self.normalize is False: - self.normalize = utils.identity - # and finally, normalize the vector either to unit length, or use a # user-defined normalization function + if self.smartirs is not None: + n_n = self.smartirs[2] + if n_n == "n" or (n_n in 'ub' and self.pivot is None): + if self.pivot is not None: + _, old_norm = matutils.unitvec(vector, return_norm=True) + norm_vector = vector + elif n_n == "c": + if self.pivot is not None: + _, old_norm = 
matutils.unitvec(vector, return_norm=True) + else: + norm_vector = matutils.unitvec(vector) + elif n_n == "u": + _, old_norm = matutils.unitvec(vector, return_norm=True, norm='unique') + elif n_n == "b": + old_norm = sum(freq * (self.term_lens[termid] + 1.0) for termid, freq in bow) + else: + if self.normalize: + self.normalize = matutils.unitvec + else: + self.normalize = utils.identity + + if self.pivot is not None: + _, old_norm = self.normalize(vector, return_norm=True) + else: + norm_vector = self.normalize(vector) + if self.pivot is None: - norm_vector = self.normalize(vector) norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps] else: - _, old_norm = self.normalize(vector, return_norm=True) pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm norm_vector = [ (termid, weight / float(pivoted_norm)) diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index f5ae6b8263..d95b8a125a 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -78,7 +78,8 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, Document normalization, one of: * `x` or `n` - none, * `c` - cosine, - * `u` - pivoted unique. + * `u` - pivoted unique, + * `b` - pivoted character length. For more info, visit `"Wikipedia" `_. pivot : float, optional diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index cb03b1112d..0848d5e449 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -370,6 +370,23 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # nnb + slope = 0.2 + model = tfidfmodel.TfidfModel(dictionary=dictionary, smartirs='nnb', slope=slope) + transformed_docs = [model[docs[0]], model[docs[1]]] + average_character_length = sum(len(word) + 1.0 for text in texts for word in text) / len(texts) + vector_norms = [ + (1.0 - slope) * average_character_length + slope * 36.0, + (1.0 - slope) * average_character_length + slope * 25.0, + ] + expected_docs = [ + [(termid, weight / vector_norms[0]) for termid, weight in docs[0]], + [(termid, weight / vector_norms[1]) for termid, weight in docs[1]], + ] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + def test_pivoted_normalization(self): docs = [corpus[1], corpus[2]] From 40fd9c4747319566324a35347ee35595aca42c91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 15:04:12 +0200 Subject: [PATCH 06/17] Fix error message in unitvec --- gensim/matutils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gensim/matutils.py b/gensim/matutils.py index ff584dfc4d..99b376f34c 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -706,8 +706,9 @@ def unitvec(vec, norm='l2', return_norm=False): Zero-vector will be unchanged. """ - if norm not in ('l1', 'l2', 'unique'): - raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm) + supported_norms = ('l1', 'l2', 'unique') + if norm not in supported_norms: + raise ValueError("'%s' is not a supported norm. Currently supported norms are %s." 
% (norm, supported_norms)) if scipy.sparse.issparse(vec): vec = vec.tocsr() From 533be4a5156ce7e6f41d88d36b4aa46040d6d280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 15:11:38 +0200 Subject: [PATCH 07/17] Remove redundant comment in TfidfModel --- gensim/models/tfidfmodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index df649da0f6..1a31e30f01 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -350,7 +350,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.pivot = pivot self.eps = 1e-12 - # If smartirs is not None, override wlocal and wglobal if smartirs is not None: n_tf, n_df, n_n = self.smartirs self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf) From 08d51a19682089aca0806779643a5552f17c151b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 15:30:31 +0200 Subject: [PATCH 08/17] Fix TfidfModel.__getitem__ for callable self.normalize --- gensim/models/tfidfmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 1a31e30f01..8c95c46978 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -509,9 +509,9 @@ def __getitem__(self, bow, eps=1e-12): elif n_n == "b": old_norm = sum(freq * (self.term_lens[termid] + 1.0) for termid, freq in bow) else: - if self.normalize: + if self.normalize is True: self.normalize = matutils.unitvec - else: + elif self.normalize is False: self.normalize = utils.identity if self.pivot is not None: From 76cdb86da494f144ed04ba5619c66072e761fbd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 15:41:25 +0200 Subject: [PATCH 09/17] Replace None checks with ducktyping in TfidfModel --- gensim/models/tfidfmodel.py | 48 +++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 8c95c46978..d792147576 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -72,7 +72,7 @@ def resolve_weights(smartirs): """ - if smartirs is None: + if not smartirs: return None if isinstance(smartirs, str) and re.match(r"...\....", smartirs): @@ -350,16 +350,16 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.pivot = pivot self.eps = 1e-12 - if smartirs is not None: + if smartirs: n_tf, n_df, n_n = self.smartirs self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf) self.wglobal = partial(smartirs_wglobal, global_scheme=n_df) - if dictionary is not None: + if dictionary: # user supplied a Dictionary object, which already contains all the # statistics we need to construct the IDF mapping. we can skip the # step that goes through the corpus (= an optimization). 
- if corpus is not None: + if corpus: logger.warning( "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus" ) @@ -368,9 +368,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.dfs = dictionary.dfs.copy() self.term_lens = {termid: len(term) for termid, term in iteritems(dictionary)} self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) - if id2word is None: + if not id2word: self.id2word = dictionary - elif corpus is not None: + elif corpus: self.initialize(corpus) else: # NOTE: everything is left uninitialized; presumably the model will @@ -378,26 +378,22 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden pass # If smartirs is not None, override pivot and normalize - if smartirs is not None: - if self.pivot is None: - if n_n == "u": - if dictionary is not None or corpus is not None: - if callable(self.normalize): - logger.warning("constructor received smartirs; ignoring normalize") - self.pivot = 1.0 * self.num_nnz / self.num_docs - else: - logger.warning("constructor received no corpus or dictionary; ignoring smartirs[2]") - elif n_n == "b": - if dictionary is not None: - if callable(self.normalize): - logger.warning("constructor received smartirs; ignoring normalize") - self.pivot = 1.0 * sum( - self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in iterkeys(dictionary) - ) / self.num_docs - else: - logger.warning("constructor received no dictionary; ignoring smartirs[2]") - elif n_n in 'ub': + if not smartirs: + return + if self.pivot is not None: + if n_n in 'ub': logger.warning("constructor received pivot; ignoring smartirs[2]") + return + if n_n in 'ub' and callable(self.normalize): + logger.warning("constructor received smartirs; ignoring normalize") + if n_n in 'ub' and not dictionary and not corpus: + logger.warning("constructor received no corpus or dictionary; ignoring smartirs[2]") + elif n_n == "u": + self.pivot = 1.0 * self.num_nnz / self.num_docs + elif n_n == "b": + self.pivot = 1.0 * sum( + self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in iterkeys(dictionary) + ) / self.num_docs @classmethod def load(cls, *args, **kwargs): @@ -493,7 +489,7 @@ def __getitem__(self, bow, eps=1e-12): # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - if self.smartirs is not None: + if self.smartirs: n_n = self.smartirs[2] if n_n == "n" or (n_n in 'ub' and self.pivot is None): if self.pivot is not None: From 18d30cb1d0edde517ddedcba434ac35bd5e05ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 16:39:59 +0200 Subject: [PATCH 10/17] Document and test wlocal parameter of TfidfModel Closes #2444. --- gensim/models/tfidfmodel.py | 6 +++--- gensim/test/test_tfidfmodel.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index d792147576..1fefc97f05 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -287,10 +287,10 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden dictionary : :class:`~gensim.corpora.Dictionary` If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used. to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). 
- wlocals : function, optional + wlocals : callable, optional Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity` - (other options: :func:`math.sqrt`, :func:`math.log1p`, etc). - wglobal : function, optional + (other options: :func:`numpy.sqrt`, `lambda tf: 0.5 + (0.5 * tf / tf.max())`, etc.). + wglobal : callable, optional Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. normalize : {bool, callable}, optional Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`. diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 0848d5e449..4ee387976b 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -416,6 +416,25 @@ def test_pivoted_normalization(self): self.assertTrue(np.allclose(sorted(transformed_docs[0]), sorted(expected_docs[0]))) self.assertTrue(np.allclose(sorted(transformed_docs[1]), sorted(expected_docs[1]))) + def test_wlocal_wglobal(self): + def wlocal(tf): + assert isinstance(tf, np.ndarray) + return iter(tf + 1) + + def wglobal(df, total_docs): + return 1 + + docs = [corpus[1], corpus[2]] + model = tfidfmodel.TfidfModel(corpus, wlocal=wlocal, wglobal=wglobal, normalize=False) + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [ + [(termid, weight + 1) for termid, weight in docs[0]], + [(termid, weight + 1) for termid, weight in docs[1]], + ] + + self.assertTrue(np.allclose(sorted(transformed_docs[0]), sorted(expected_docs[0]))) + self.assertTrue(np.allclose(sorted(transformed_docs[1]), sorted(expected_docs[1]))) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From 35f0f9db70a846afbf0d12a0d3526ef61fa11dba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 29 Apr 2019 05:25:52 +0200 Subject: [PATCH 11/17] Do not accept smartirs=None in resolve_weights --- gensim/models/tfidfmodel.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 1fefc97f05..fba5863215 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -31,7 +31,7 @@ def resolve_weights(smartirs): Parameters ---------- - smartirs : str or None + smartirs : str `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting variants in the vector space model. 
The mnemonic for representing a combination @@ -41,7 +41,7 @@ def resolve_weights(smartirs): Returns ------- - str of (local_letter, global_letter, normalization_letter) or None + str of (local_letter, global_letter, normalization_letter) local_letter : str Term frequency weighing, one of: @@ -72,9 +72,6 @@ def resolve_weights(smartirs): """ - if not smartirs: - return None - if isinstance(smartirs, str) and re.match(r"...\....", smartirs): match = re.match(r"(?P...)\.(?P...)", smartirs) raise ValueError( @@ -345,7 +342,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None - self.smartirs = resolve_weights(smartirs) + self.smartirs = resolve_weights(smartirs) if smartirs is not None else None self.slope = slope self.pivot = pivot self.eps = 1e-12 From 5d1213adc967bd0e8ebc66e6d16c4866a3129bfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 29 Apr 2019 05:28:28 +0200 Subject: [PATCH 12/17] Remove blank line between resolve_weights docstring and body (PEP8) --- gensim/models/tfidfmodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index fba5863215..2219f88979 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -71,7 +71,6 @@ def resolve_weights(smartirs): doesn't fit the list of permissible values. """ - if isinstance(smartirs, str) and re.match(r"...\....", smartirs): match = re.match(r"(?P...)\.(?P...)", smartirs) raise ValueError( From 13081f7603b6286796f7548639582999edb8f2b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 29 Apr 2019 05:31:38 +0200 Subject: [PATCH 13/17] Omit word `except` from resolve_weights ValueError messages --- gensim/models/tfidfmodel.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 2219f88979..5fb0ad2292 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -82,18 +82,18 @@ def resolve_weights(smartirs): ) ) if not isinstance(smartirs, str) or len(smartirs) != 3: - raise ValueError("Expected a string of length 3 except got " + smartirs) + raise ValueError("Expected a string of length 3 got " + smartirs) w_tf, w_df, w_n = smartirs if w_tf not in 'btnaldL': - raise ValueError("Expected term frequency weight to be one of 'btnaldL', except got {}".format(w_tf)) + raise ValueError("Expected term frequency weight to be one of 'btnaldL', got {}".format(w_tf)) if w_df not in 'xnftp': - raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', except got {}".format(w_df)) + raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', got {}".format(w_df)) if w_n not in 'xncub': - raise ValueError("Expected normalization weight to be one of 'xncub', except got {}".format(w_n)) + raise ValueError("Expected normalization weight to be one of 'xncub', got {}".format(w_n)) # resolve aliases if w_tf == "t": From 57e5a04101ec1b980fd41cc16e4ec3c4aa1fec96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 29 Apr 2019 05:33:14 +0200 Subject: [PATCH 14/17] Add missing blank lines to the smartirs_normalize docstring --- gensim/models/tfidfmodel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 
5fb0ad2292..08407ea683 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -215,6 +215,7 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): @deprecated("Function will be removed in 4.0.0") def smartirs_normalize(x, norm_scheme, return_norm=False): """Normalize a vector using the normalization scheme specified in `norm_scheme`. + Parameters ---------- x : numpy.ndarray @@ -223,6 +224,7 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): Document length normalization scheme. return_norm : bool, optional Return the length of `x` as well? + Returns ------- numpy.ndarray From 70326362b10b5f28bb5f168221ac41dda875b3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sat, 4 May 2019 23:16:05 +0200 Subject: [PATCH 15/17] Cross-reference docstrings of SMART scheme users (functions, classes) --- gensim/models/tfidfmodel.py | 9 +++++++++ gensim/sklearn_api/tfidf.py | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 08407ea683..6eb511bcb9 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -70,6 +70,10 @@ def resolve_weights(smartirs): If `smartirs` is not a string of length 3 or one of the decomposed value doesn't fit the list of permissible values. + See Also + -------- + ~gensim.sklearn_api.tfidf.TfIdfTransformer, TfidfModel : Classes that also use the SMART scheme. + """ if isinstance(smartirs, str) and re.match(r"...\....", smartirs): match = re.match(r"(?P...)\.(?P...)", smartirs) @@ -339,6 +343,11 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden Parameter required by pivoted document length normalization which determines the slope to which the `old normalization` can be tilted. This parameter only works when pivot is defined. + See Also + -------- + ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme. + resolve_weights : Function that also uses the SMART scheme. + """ self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index d95b8a125a..a262d9d9e1 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -96,6 +96,11 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, the `old normalization` can be tilted. This parameter only works when pivot is defined by user and is not None. + See Also + -------- + ~gensim.models.tfidfmodel.TfidfModel : Class that also uses the SMART scheme. + ~gensim.models.tfidfmodel.resolve_weights : Function that also uses the SMART scheme. + """ self.gensim_model = None self.id2word = id2word From a2f4c7e422a325a86bb12a586a2e97f3d259f730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Wed, 8 May 2019 17:24:56 +0200 Subject: [PATCH 16/17] Document the default SMART scheme of TfidfModel --- gensim/models/tfidfmodel.py | 1 + gensim/sklearn_api/tfidf.py | 1 + 2 files changed, 2 insertions(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 6eb511bcb9..e5b7d252d2 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -322,6 +322,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden * `u` - pivoted unique, * `b` - pivoted character length. + Default is `nfc`. For more information visit `SMART Information Retrieval System `_. 
pivot : float, optional diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index a262d9d9e1..a918ec5528 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -81,6 +81,7 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, * `u` - pivoted unique, * `b` - pivoted character length. + Default is `nfc`. For more info, visit `"Wikipedia" `_. pivot : float, optional It is the point around which the regular normalization curve is `tilted` to get the new pivoted From fccc5e5926a6b91daac09a27588f620fb670e902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sat, 18 May 2019 01:02:04 +0200 Subject: [PATCH 17/17] Improve the documentation of slope and pivot --- gensim/models/tfidfmodel.py | 45 ++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e5b7d252d2..239ef71428 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -267,7 +267,7 @@ class TfidfModel(interfaces.TransformationABC): """ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, - wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.65): + wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.25): r"""Compute TF-IDF by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing the resulting documents to unit length. Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents @@ -322,33 +322,48 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden * `u` - pivoted unique, * `b` - pivoted character length. - Default is `nfc`. + Default is 'nfc'. For more information visit `SMART Information Retrieval System `_. - pivot : float, optional - See the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. + pivot : float or None, optional + In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length + normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 - + slope) * pivot`. - Pivot is the point around which the regular normalization curve is `tilted` to get the new pivoted - normalization curve. In the paper `Amit Singhal, Chris Buckley, Mandar Mitra: - "Pivoted Document Length Normalization" `_ it is the point where the - retrieval and relevance curves intersect. + You can either set the `pivot` by hand, or you can let Gensim figure it out automatically with the following + two steps: - This parameter along with `slope` is used for pivoted document length normalization. + * Set either the `u` or `b` document normalization in the `smartirs` parameter. + * Set either the `corpus` or `dictionary` parameter. The `pivot` will be automatically determined from + the properties of the `corpus` or `dictionary`. - When `pivot` is None, and `smartirs` specifies the pivoted unique document normalization scheme (u), and - either `corpus` or `dictionary` are specified, then the pivot will be determined automatically. + If `pivot` is None and you don't follow steps 1 and 2, then pivoted document length normalization will be + disabled. Default is None. 
- When `pivot` is None, and `smartirs` specifies the character length unique document normalization - scheme (b), and `dictionary` is specified, then the pivot will be determined automatically. + See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. slope : float, optional - Parameter required by pivoted document length normalization which determines the slope to which - the `old normalization` can be tilted. This parameter only works when pivot is defined. + In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length + normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 - + slope) * pivot`. + + Setting the `slope` to 0.0 uses only the `pivot` as the norm, and setting the `slope` to 1.0 effectively + disables pivoted document length normalization. Singhal [2]_ suggests setting the `slope` between 0.2 and + 0.3 for best results. Default is 0.25. + + See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. See Also -------- ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme. resolve_weights : Function that also uses the SMART scheme. + References + ---------- + .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). `Pivoted Document Length + Normalization `_. *SIGIR Forum*, 51, 176–184. + .. [2] Singhal, A. (2001). `Modern information retrieval: A brief overview `_. + *IEEE Data Eng. Bull.*, 24(4), 35–43. + """ self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize