piskvorky · menshikh-iv · Mar 13, 2018 · Dec 12, 2017 · Dec 12, 2017 · Dec 15, 2017
diff --git a/docs/notebooks/pivoted_document_length_normalisation.ipynb b/docs/notebooks/pivoted_document_length_normalisation.ipynb
diff --git a/gensim/matutils.py b/gensim/matutils.py
@@ -666,7 +666,7 @@ def ret_log_normalize_vec(vec, axis=1):
 blas_scal = blas('scal', np.array([], dtype=float))
 
 
-def unitvec(vec, norm='l2'):
+def unitvec(vec, norm='l2', return_norm=False):
     """Scale a vector to unit length.
 
     Parameters
@@ -675,11 +675,16 @@ def unitvec(vec, norm='l2'):
         Input vector in any format
     norm : {'l1', 'l2'}, optional
         Normalization that will be used.
+    return_norm : bool, optional
+        If True, also return the normalization factor along with the
+        normalized vector.
 
     Returns
     -------
-    {numpy.ndarray, scipy.sparse, list of (int, float)}
+    {numpy.ndarray, scipy.sparse, list of (int, float)}
         Normalized vector in same format as `vec`.
+    float
+        Normalizing factor, only returned if `return_norm` is True.
 
     Notes
     -----
@@ -695,9 +700,15 @@ def unitvec(vec, norm='l2'):
         if norm == 'l2':
             veclen = np.sqrt(np.sum(vec.data ** 2))
         if veclen > 0.0:
-            return vec / veclen
+            if return_norm:
+                return vec / veclen, veclen
+            else:
+                return vec / veclen
         else:
-            return vec
+            if return_norm:
+                return vec, 1
+            else:
+                return vec
 
     if isinstance(vec, np.ndarray):
         vec = np.asarray(vec, dtype=float)
@@ -706,9 +717,15 @@ def unitvec(vec, norm='l2'):
         if norm == 'l2':
             veclen = blas_nrm2(vec)
         if veclen > 0.0:
-            return blas_scal(1.0 / veclen, vec)
+            if return_norm:
+                return blas_scal(1.0 / veclen, vec), veclen
+            else:
+                return blas_scal(1.0 / veclen, vec)
         else:
-            return vec
+            if return_norm:
+                return vec, 1
+            else:
+                return vec
 
     try:
         first = next(iter(vec))  # is there at least one element?
@@ -721,7 +738,10 @@ def unitvec(vec, norm='l2'):
         if norm == 'l2':
             length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
         assert length > 0.0, "sparse documents must not contain any explicit zero entries"
-        return ret_normalized_vec(vec, length)
+        if return_norm:
+            return ret_normalized_vec(vec, length), length
+        else:
+            return ret_normalized_vec(vec, length)
     else:
         raise ValueError("unknown input type")
 

diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py
@@ -57,6 +57,7 @@ def resolve_weights(smartirs):
     References
     ----------
     .. [1] https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System
+    .. [2] http://singhal.info/pivoted-dln.pdf
 
     """
     if not isinstance(smartirs, str) or len(smartirs) != 3:
@@ -70,7 +71,7 @@ def resolve_weights(smartirs):
     if w_df not in 'ntp':
         raise ValueError("Expected inverse document frequency weight to be one of 'ntp', except got {}".format(w_df))
 
-    if w_n not in 'ncb':
+    if w_n not in 'nc':
         raise ValueError("Expected normalization weight to be one of 'ncb', except got {}".format(w_n))
 
     return w_tf, w_df, w_n
@@ -177,7 +178,7 @@ def updated_wglobal(docfreq, totaldocs, n_df):
         return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2)
 
 
-def updated_normalize(x, n_n):
+def updated_normalize(x, n_n, return_norm=False):
     """Normalizes the final tf-idf value according to the value of `n_n`.
 
     Parameters
@@ -186,17 +187,25 @@ def updated_normalize(x, n_n):
         Input array
     n_n : {'n', 'c'}
         Parameter that decides the normalizing function to be used.
+    return_norm : bool, optional
+        If True, also return the normalization factor along with the
+        normalized vector.
 
     Returns
     -------
     numpy.ndarray
         Normalized array.
+    float
+        Normalizing factor, only returned if `return_norm` is True.
 
     """
     if n_n == "n":
-        return x
+        if return_norm:
+            return x, 1
+        else:
+            return x
     elif n_n == "c":
-        return matutils.unitvec(x)
+        return matutils.unitvec(x, return_norm=return_norm)
 
 
 class TfidfModel(interfaces.TransformationABC):
@@ -219,7 +228,8 @@ class TfidfModel(interfaces.TransformationABC):
     """
 
     def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
-                 wglobal=df2idf, normalize=True, smartirs=None):
+                 wglobal=df2idf, normalize=True, smartirs=None,
+                 pivot=None, slope=0.65):
         """Compute tf-idf by multiplying a local component (term frequency) with a global component
         (inverse document frequency), and normalizing the resulting documents to unit length.
         Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents
@@ -273,21 +283,37 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden
                 * `c` - cosine.
 
             For more information visit [1]_.
-
+        pivot : float, optional
+            The point around which the regular normalization curve is "tilted" to obtain the pivoted
+            normalization curve. In [2]_ it is the point where the retrieval and relevance curves intersect.
+            Together with `slope`, this parameter is used for pivoted document length normalization [2]_.
+            Pivoted document length normalization is applied only when `pivot` is not None; otherwise regular
+            TfIdf is used.
+        slope : float, optional
+            Parameter of pivoted document length normalization that determines the slope to which the
+            "old normalization" can be tilted. It only takes effect when `pivot` is set by the user, i.e. is not
+            None.
         """
 
         self.id2word = id2word
         self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize
         self.num_docs, self.num_nnz, self.idfs = None, None, None
         self.smartirs = smartirs
+        self.slope = slope
+        self.pivot = pivot
+        self.eps = 1e-12
 
         # If smartirs is not None, override wlocal, wglobal and normalize
         if smartirs is not None:
             n_tf, n_df, n_n = resolve_weights(smartirs)
 
             self.wlocal = partial(updated_wlocal, n_tf=n_tf)
             self.wglobal = partial(updated_wglobal, n_df=n_df)
-            self.normalize = partial(updated_normalize, n_n=n_n)
+            # also return norm factor if pivot is not none
+            if self.pivot is None:
+                self.normalize = partial(updated_normalize, n_n=n_n)
+            else:
+                self.normalize = partial(updated_normalize, n_n=n_n, return_norm=True)
 
         if dictionary is not None:
             # user supplied a Dictionary object, which already contains all the
@@ -309,6 +335,23 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden
             # be initialized in some other way
             pass
 
+    @classmethod
+    def load(cls, *args, **kwargs):
+        """Load a previously saved TfidfModel, backfilling `pivot` and `slope` on models that lack them.
+
+        All arguments are forwarded unchanged to the parent class `load`.
+        """
+        loaded = super(TfidfModel, cls).load(*args, **kwargs)
+        # (attribute, default, "missing" log message, "setting" log message), handled in this order
+        backfill = (
+            ('pivot', None, 'older version of %s loaded without pivot arg', 'Setting pivot to None.'),
+            ('slope', 0.65, 'older version of %s loaded without slope arg', 'Setting slope to 0.65.'),
+        )
+        for attr, default, missing_msg, setting_msg in backfill:
+            if not hasattr(loaded, attr):
+                logger.info(missing_msg, cls.__name__)
+                logger.info(setting_msg)
+                setattr(loaded, attr, default)
+        return loaded
+
     def __str__(self):
+        """Return a short description of the model showing `num_docs` and `num_nnz`."""
         return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)
 
@@ -360,6 +403,7 @@ def __getitem__(self, bow, eps=1e-12):
             TfIdf corpus, if `bow` is corpus.
 
         """
+        self.eps = eps
         # if the input vector is in fact a corpus, return a transformed corpus as a result
         is_corpus, bow = utils.is_corpus(bow)
         if is_corpus:
@@ -377,7 +421,7 @@ def __getitem__(self, bow, eps=1e-12):
 
         vector = [
             (termid, tf * self.idfs.get(termid))
-            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > eps
+            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps
         ]
 
         if self.normalize is True:
@@ -387,8 +431,14 @@ def __getitem__(self, bow, eps=1e-12):
 
         # and finally, normalize the vector either to unit length, or use a
         # user-defined normalization function
-        vector = self.normalize(vector)
-
-        # make sure there are no explicit zeroes in the vector (must be sparse)
-        vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
-        return vector
+        if self.pivot is None:
+            norm_vector = self.normalize(vector)
+            norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps]
+        else:
+            logger.info("Using pivoted normalization")
+            _, old_norm = self.normalize(vector, return_norm=True)
+            pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm
+            norm_vector = [(termid, weight / float(pivoted_norm))
+            for termid, weight in vector if abs(weight / float(pivoted_norm)) > self.eps
+            ]
+        return norm_vector
diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py
@@ -22,7 +22,8 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator):
     """
 
     def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity,
-                 wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc"):
+                 wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc",
+                 pivot=None, slope=0.65):
         """
         Sklearn wrapper for Tf-Idf model.
         """
@@ -33,6 +34,8 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity,
         self.wglobal = wglobal
         self.normalize = normalize
         self.smartirs = smartirs
+        self.slope = slope
+        self.pivot = pivot
 
     def fit(self, X, y=None):
         """
@@ -41,6 +44,7 @@ def fit(self, X, y=None):
         self.gensim_model = TfidfModel(
             corpus=X, id2word=self.id2word, dictionary=self.dictionary, wlocal=self.wlocal,
             wglobal=self.wglobal, normalize=self.normalize, smartirs=self.smartirs,
+            pivot=self.pivot, slope=self.slope
         )
         return self
 
@@ -56,4 +60,5 @@ def transform(self, docs):
         # input as python lists
         if isinstance(docs[0], tuple):
             docs = [docs]
+
         return [self.gensim_model[doc] for doc in docs]
diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py
@@ -39,7 +39,7 @@ class TestTfidfModel(unittest.TestCase):
     def setUp(self):
         self.corpus = MmCorpus(datapath('testcorpus.mm'))
 
-    def testTransform(self):
+    def test_transform(self):
         # create the transformation model
         model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
 
@@ -50,7 +50,7 @@ def testTransform(self):
         expected = [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)]
         self.assertTrue(np.allclose(transformed, expected))
 
-    def testInit(self):
+    def test_init(self):
         # create the transformation model by analyzing a corpus
         # uses the global `corpus`!
         model1 = tfidfmodel.TfidfModel(common_corpus)
@@ -65,7 +65,7 @@ def testInit(self):
         model2 = tfidfmodel.TfidfModel(dictionary=common_dictionary)
         self.assertEqual(model1.idfs, model2.idfs)
 
-    def testPersistence(self):
+    def test_persistence(self):
         # Test persistence without using `smartirs`
         fname = get_tmpfile('gensim_models.tst')
         model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
@@ -97,7 +97,25 @@ def testPersistence(self):
         self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
         self.assertTrue(np.allclose(model3[[]], model4[[]]))  # try projecting an empty vector
 
-    def testPersistenceCompressed(self):
+        # Test persistence when using pivoted normalization
+        fname = get_tmpfile('gensim_models_smartirs.tst')
+        model = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
+        model.save(fname)
+        model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
+        self.assertTrue(model.idfs == model2.idfs)
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
+        self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
+
+        # Test persistence between Gensim v3.2.0 and pivoted normalization model.
+        model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
+        model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
+        self.assertTrue(model3.idfs == model4.idfs)
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
+        self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
+
+    def test_persistence_compressed(self):
         # Test persistence without using `smartirs`
         fname = get_tmpfile('gensim_models.tst.gz')
         model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
@@ -129,7 +147,25 @@ def testPersistenceCompressed(self):
         self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
         self.assertTrue(np.allclose(model3[[]], model4[[]]))  # try projecting an empty vector
 
-    def TestConsistency(self):
+        # Test persistence when using pivoted normalization
+        fname = get_tmpfile('gensim_models_smartirs.tst.gz')
+        model = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
+        model.save(fname)
+        model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
+        self.assertTrue(model.idfs == model2.idfs)
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
+        self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
+
+        # Test persistence between Gensim v3.2.0 and pivoted normalization compressed model.
+        model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
+        model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2'))
+        self.assertTrue(model3.idfs == model4.idfs)
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
+        self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
+
+    def test_consistency(self):
         docs = [corpus[1], corpus[2]]
 
         # Test if `ntc` yields the default docs.
@@ -283,6 +319,30 @@ def TestConsistency(self):
         self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
         self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
 
+    def test_pivoted_normalization(self):
+        """Compare pivoted normalization with regular tf-idf and with pinned reference weights."""
+        docs = [corpus[1], corpus[2]]
+
+        def check_both_docs(actual, wanted):
+            # Term order is not compared: both sides are sorted before the numeric check.
+            for idx in (0, 1):
+                self.assertTrue(np.allclose(sorted(actual[idx]), sorted(wanted[idx])))
+
+        # pivot=0 with slope=1 must reproduce the output of the default (non-pivoted) model.
+        plain = tfidfmodel.TfidfModel(self.corpus)
+        plain_docs = [plain[doc] for doc in docs]
+        pivoted = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
+        pivoted_docs = [pivoted[doc] for doc in docs]
+        check_both_docs(plain_docs, pivoted_docs)
+
+        # slope=0.5 must keep producing these pinned reference weights.
+        pivoted = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=0.5)
+        pivoted_docs = [pivoted[doc] for doc in docs]
+        reference_docs = [
+            [
+                (8, 0.8884910505493495), (7, 0.648974041227711), (6, 0.8884910505493495),
+                (5, 0.648974041227711), (4, 0.8884910505493495), (3, 0.8884910505493495),
+            ],
+            [(10, 0.8164965809277263), (9, 0.8164965809277263), (5, 1.6329931618554525)],
+        ]
+        check_both_docs(pivoted_docs, reference_docs)
+
 
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)