From f22084f2476a723ed6be39537e206e16f42ea77b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?=
Date: Mon, 28 Jan 2019 07:30:09 +0100
Subject: [PATCH 01/17] Fix the example code for SparseTermSimilarityMatrix

---
 gensim/similarities/termsim.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py
index 6a0b6d12b5..167b73b241 100644
--- a/gensim/similarities/termsim.py
+++ b/gensim/similarities/termsim.py
@@ -128,13 +128,13 @@ class SparseTermSimilarityMatrix(SaveLoad):
     >>> from gensim.test.utils import common_texts
     >>> from gensim.corpora import Dictionary
     >>> from gensim.models import Word2Vec, WordEmbeddingSimilarityIndex
-    >>> from gensim.similarities import SoftCosineSimilarity, TermSimilarityMatrix
+    >>> from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
     >>>
     >>> model = Word2Vec(common_texts, size=20, min_count=1)  # train word-vectors
-    >>> termsim_index = WordEmbeddingSimilarityIndex(model)
+    >>> termsim_index = WordEmbeddingSimilarityIndex(model.wv)
     >>> dictionary = Dictionary(common_texts)
     >>> bow_corpus = [dictionary.doc2bow(document) for document in common_texts]
-    >>> similarity_matrix = TermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
+    >>> similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)  # construct similarity matrix
     >>> docsim_index = SoftCosineSimilarity(bow_corpus, similarity_matrix, num_best=10)
     >>>
     >>> query = 'graph trees computer'.split()  # make a query

From 541cbb35b8c41e8a3f2e1f52ad8fb3767c6da9df Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?=
Date: Sun, 17 Mar 2019 01:23:24 +0100
Subject: [PATCH 02/17] Extend implementation of SMART in models.tfidf

These are our additions:

* Make `t` an alias for the `n` term frequency method.
* Rename the `t` document frequency method to `f`.
* Implement a new `t` (zero-corrected idf) document frequency method.
* Make `x` an alias for the `n` document frequency method.
* Make `x` an alias for the `n` document length normalization method.
* Implement the `u` pivoted unique document length normalization method.
* Add the `unique` vector norm to matutils.unitvec.
* Produce a helpful error message when a SMART scheme in the `ddd.qqq` format is requested.

---
 gensim/matutils.py             |  10 ++-
 gensim/models/tfidfmodel.py    | 116 ++++++++++++++++++++------
 gensim/sklearn_api/tfidf.py    |  43 ++++++------
 gensim/test/test_tfidfmodel.py |  81 ++++++++++++++++++++---
 4 files changed, 176 insertions(+), 74 deletions(-)

diff --git a/gensim/matutils.py b/gensim/matutils.py
index 979b99f6d5..ff584dfc4d 100644
--- a/gensim/matutils.py
+++ b/gensim/matutils.py
@@ -689,7 +689,7 @@ def unitvec(vec, norm='l2', return_norm=False):
     ----------
     vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
         Input vector in any format
-    norm : {'l1', 'l2'}, optional
+    norm : {'l1', 'l2', 'unique'}, optional
         Metric to normalize in.
     return_norm : bool, optional
         Return the length of vector `vec`, in addition to the normalized vector itself?
@@ -706,7 +706,7 @@ def unitvec(vec, norm='l2', return_norm=False):
     Zero-vector will be unchanged.

     """
-    if norm not in ('l1', 'l2'):
+    if norm not in ('l1', 'l2', 'unique'):
         raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'."
% norm) if scipy.sparse.issparse(vec): @@ -715,6 +715,8 @@ def unitvec(vec, norm='l2', return_norm=False): veclen = np.sum(np.abs(vec.data)) if norm == 'l2': veclen = np.sqrt(np.sum(vec.data ** 2)) + if norm == 'unique': + veclen = vec.nnz if veclen > 0.0: if np.issubdtype(vec.dtype, np.integer): vec = vec.astype(np.float) @@ -734,6 +736,8 @@ def unitvec(vec, norm='l2', return_norm=False): veclen = np.sum(np.abs(vec)) if norm == 'l2': veclen = blas_nrm2(vec) + if norm == 'unique': + veclen = np.count_nonzero(vec) if veclen > 0.0: if np.issubdtype(vec.dtype, np.integer): vec = vec.astype(np.float) @@ -757,6 +761,8 @@ def unitvec(vec, norm='l2', return_norm=False): length = float(sum(abs(val) for _, val in vec)) if norm == 'l2': length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec)) + if norm == 'unique': + length = 1.0 * len(vec) assert length > 0.0, "sparse documents must not contain any explicit zero entries" if return_norm: return ret_normalized_vec(vec, length), length diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a4cbedcd22..a87f580aa4 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -15,6 +15,7 @@ import logging from functools import partial +import re from gensim import interfaces, matutils, utils from six import iteritems @@ -43,20 +44,23 @@ def resolve_weights(smartirs): local_letter : str Term frequency weighing, one of: - * `n` - natural, - * `l` - logarithm, + * `b` - binary, + * `t` or `n` - raw, * `a` - augmented, - * `b` - boolean, + * `l` - logarithm, + * `d` - double logarithm, * `L` - log average. global_letter : str Document frequency weighting, one of: - * `n` - none, - * `t` - idf, - * `p` - prob idf. + * `x` or `n` - none, + * `f` - idf, + * `t` - zero-corrected idf, + * `p` - probabilistic idf. normalization_letter : str Document normalization, one of: - * `n` - none, - * `c` - cosine. + * `x` or `n` - none, + * `c` - cosine, + * `u` - pivoted unique. Raises ------ @@ -65,19 +69,29 @@ def resolve_weights(smartirs): doesn't fit the list of permissible values. """ + if isinstance(smartirs, str) and re.match(r"...\....", smartirs): + match = re.match(r"(?P...)\.(?P...)", smartirs) + raise ValueError( + "The notation {ddd}.{qqq} specifies two term-weighting schemes, " + "one for collection documents ({ddd}) and one for queries ({qqq}). " + "You must train two separate tf-idf models.".format( + ddd=match.group("ddd"), + qqq=match.group("qqq"), + ) + ) if not isinstance(smartirs, str) or len(smartirs) != 3: raise ValueError("Expected a string of length 3 except got " + smartirs) w_tf, w_df, w_n = smartirs - if w_tf not in 'nlabL': - raise ValueError("Expected term frequency weight to be one of 'nlabL', except got {}".format(w_tf)) + if w_tf not in 'btnaldL': + raise ValueError("Expected term frequency weight to be one of 'btnaldL', except got {}".format(w_tf)) - if w_df not in 'ntp': - raise ValueError("Expected inverse document frequency weight to be one of 'ntp', except got {}".format(w_df)) + if w_df not in 'xnftp': + raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', except got {}".format(w_df)) - if w_n not in 'nc': - raise ValueError("Expected normalization weight to be one of 'ncb', except got {}".format(w_n)) + if w_n not in 'xncu': + raise ValueError("Expected normalization weight to be one of 'xncu', except got {}".format(w_n)) return w_tf, w_df, w_n @@ -137,7 +151,7 @@ def smartirs_wlocal(tf, local_scheme): ---------- tf : int Term frequency. 
- local : {'n', 'l', 'a', 'b', 'L'} + local : {'b', 't', 'n', 'a', 'l', 'd', 'L'} Local transformation scheme. Returns @@ -146,10 +160,12 @@ def smartirs_wlocal(tf, local_scheme): Calculated local weight. """ - if local_scheme == "n": + if local_scheme in ("t", "n"): return tf elif local_scheme == "l": return 1 + np.log2(tf) + elif local_scheme == "d": + return 1 + np.log2(1 + np.log2(tf)) elif local_scheme == "a": return 0.5 + (0.5 * tf / tf.max(axis=0)) elif local_scheme == "b": @@ -167,7 +183,7 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): Document frequency. totaldocs : int Total number of documents. - global_scheme : {'n', 't', 'p'} + global_scheme : {'x', 'n', 'f', 't', 'p'} Global transformation scheme. Returns @@ -176,11 +192,12 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): Calculated global weight. """ - - if global_scheme == "n": - return 1. - elif global_scheme == "t": + if global_scheme in ("x", "n"): + return 1.0 + elif global_scheme == "f": return np.log2(1.0 * totaldocs / docfreq) + elif global_scheme == "t": + return np.log2((totaldocs + 1.0) / docfreq) elif global_scheme == "p": return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq)) @@ -191,11 +208,9 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): Parameters ---------- x : numpy.ndarray - Input array - norm_scheme : {'n', 'c'} - Normalizing function to use: - `n`: no normalization - `c`: unit L2 norm (scale `x` to unit euclidean length) + The tf-idf vector. + norm_scheme : {'x', 'n', 'c', 'u'} + Document length normalization scheme. return_norm : bool, optional Return the length of `x` as well? @@ -204,10 +219,10 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): numpy.ndarray Normalized array. float (only if return_norm is set) - L2 norm of `x`. + Norm of `x`. """ - if norm_scheme == "n": + if norm_scheme in ("x", "n"): if return_norm: _, length = matutils.unitvec(x, return_norm=return_norm) return x, length @@ -215,6 +230,8 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): return x elif norm_scheme == "c": return matutils.unitvec(x, return_norm=return_norm) + elif norm_scheme == "u": + return matutils.unitvec(x, return_norm=return_norm, norm='unique') class TfidfModel(interfaces.TransformationABC): @@ -265,7 +282,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden (other options: :func:`math.sqrt`, :func:`math.log1p`, etc). wglobal : function, optional Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. - normalize : bool, optional + normalize : {bool, callable}, optional Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`. smartirs : str, optional SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, @@ -274,20 +291,23 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector. Term frequency weighing: - * `n` - natural, - * `l` - logarithm, + * `b` - binary, + * `t` or `n` - raw, * `a` - augmented, - * `b` - boolean, + * `l` - logarithm, + * `d` - double logarithm, * `L` - log average. Document frequency weighting: - * `n` - none, - * `t` - idf, - * `p` - prob idf. + * `x` or `n` - none, + * `f` - idf, + * `t` - zero-corrected idf, + * `p` - probabilistic idf. Document normalization: - * `n` - none, - * `c` - cosine. 
+ * `x` or `n` - none, + * `c` - cosine, + * `u` - pivoted unique. For more information visit `SMART Information Retrieval System `_. @@ -300,8 +320,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden retrieval and relevance curves intersect. This parameter along with `slope` is used for pivoted document length normalization. - Only when `pivot` is not None will pivoted document length normalization be applied. - Otherwise, regular TfIdf is used. + When `pivot` is None, `smartirs` specifies the pivoted unique document normalization scheme, and either + `corpus` or `dictionary` are specified, then the pivot will be determined automatically. Otherwise, no + pivoted document length normalization is applied. slope : float, optional Parameter required by pivoted document length normalization which determines the slope to which the `old normalization` can be tilted. This parameter only works when pivot is defined. @@ -315,16 +336,11 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.pivot = pivot self.eps = 1e-12 - # If smartirs is not None, override wlocal, wglobal and normalize + # If smartirs is not None, override wlocal and wglobal if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf) self.wglobal = partial(smartirs_wglobal, global_scheme=n_df) - # also return norm factor if pivot is not none - if self.pivot is None: - self.normalize = partial(smartirs_normalize, norm_scheme=n_n) - else: - self.normalize = partial(smartirs_normalize, norm_scheme=n_n, return_norm=True) if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -346,6 +362,16 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden # be initialized in some other way pass + # If smartirs is not None, override pivot and normalize + if smartirs is not None: + if self.pivot is None and (dictionary is not None or corpus is not None) and n_n == "u": + self.pivot = 1.0 * self.num_nnz / self.num_docs + # also return norm factor if pivot is not none + if self.pivot is None: + self.normalize = partial(smartirs_normalize, norm_scheme=n_n) + else: + self.normalize = partial(smartirs_normalize, norm_scheme=n_n, return_norm=True) + @classmethod def load(cls, *args, **kwargs): """Load a previously saved TfidfModel class. Handles backwards compatibility from diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index 4484037572..f5ae6b8263 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -34,7 +34,7 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, - wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc", + wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="nfc", pivot=None, slope=0.65): """ @@ -60,21 +60,25 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, The mnemonic for representing a combination of weights takes the form XYZ, for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector. - Term frequency weighing: - * `n` - natural, - * `l` - logarithm, - * `a` - augmented, - * `b` - boolean, - * `L` - log average. - - Document frequency weighting: - * `n` - none, - * `t` - idf, - * `p` - prob idf. - - Document normalization: - * `n` - none, - * `c` - cosine. 
+ local_letter : str + Term frequency weighing, one of: + * `b` - binary, + * `t` or `n` - raw, + * `a` - augmented, + * `l` - logarithm, + * `d` - double logarithm, + * `L` - log average. + global_letter : str + Document frequency weighting, one of: + * `x` or `n` - none, + * `f` - idf, + * `t` - zero-corrected idf, + * `p` - probabilistic idf. + normalization_letter : str + Document normalization, one of: + * `x` or `n` - none, + * `c` - cosine, + * `u` - pivoted unique. For more info, visit `"Wikipedia" `_. pivot : float, optional @@ -82,9 +86,10 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, normalization curve. In the paper `Amit Singhal, Chris Buckley, Mandar Mitra: "Pivoted Document Length Normalization" `_ it is the point where the retrieval and relevance curves intersect. - This parameter along with slope is used for pivoted document length normalization. - Only when `pivot` is not None pivoted document length normalization will be applied else regular TfIdf - is used. + This parameter along with `slope` is used for pivoted document length normalization. + When `pivot` is None, `smartirs` specifies the pivoted unique document normalization scheme, and either + `corpus` or `dictionary` are specified, then the pivot will be determined automatically. Otherwise, no + pivoted document length normalization is applied. slope : float, optional It is the parameter required by pivoted document length normalization which determines the slope to which the `old normalization` can be tilted. This parameter only works when pivot is defined by user and is not diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 79e3742d48..cb03b1112d 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -80,7 +80,7 @@ def test_persistence(self): # Test persistence with using `smartirs` fname = get_tmpfile('gensim_models_smartirs.tst') - model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model = tfidfmodel.TfidfModel(self.corpus, smartirs="nfc") model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) @@ -90,7 +90,7 @@ def test_persistence(self): self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence between Gensim v3.2.0 and current model. - model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="nfc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] @@ -134,7 +134,7 @@ def test_persistence_compressed(self): # Test persistence with using `smartirs` fname = get_tmpfile('gensim_models_smartirs.tst.gz') - model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model = tfidfmodel.TfidfModel(self.corpus, smartirs="nfc") model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) @@ -144,7 +144,7 @@ def test_persistence_compressed(self): self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence between Gensim v3.2.0 and current compressed model. 
- model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="nfc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] @@ -178,7 +178,7 @@ def test_consistency(self): docs = [corpus[1], corpus[2]] # Test if `ntc` yields the default docs. - model = tfidfmodel.TfidfModel(corpus, smartirs='ntc') + model = tfidfmodel.TfidfModel(corpus, smartirs='nfc') transformed_docs = [model[docs[0]], model[docs[1]]] model = tfidfmodel.TfidfModel(corpus) @@ -188,6 +188,14 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) # Testing all the variations of `wlocal` + # tnn + model = tfidfmodel.TfidfModel(corpus, smartirs='tnn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = docs[:] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # nnn model = tfidfmodel.TfidfModel(corpus, smartirs='nnn') transformed_docs = [model[docs[0]], model[docs[1]]] @@ -207,6 +215,17 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # dnn + model = tfidfmodel.TfidfModel(corpus, smartirs='dnn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [ + [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)], + [(5, 2.0), (9, 1.0), (10, 1.0)] + ] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # ann model = tfidfmodel.TfidfModel(corpus, smartirs='ann') transformed_docs = [model[docs[0]], model[docs[1]]] @@ -242,12 +261,17 @@ def test_consistency(self): ] ] + # Testing all the variations of `glocal` + # nxn + model = tfidfmodel.TfidfModel(corpus, smartirs='nxn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = docs[:] + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) - # Testing all the variations of `glocal` - # ntn - model = tfidfmodel.TfidfModel(corpus, smartirs='ntn') + # nfn + model = tfidfmodel.TfidfModel(corpus, smartirs='nfn') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ [ @@ -262,6 +286,22 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # ntn + model = tfidfmodel.TfidfModel(corpus, smartirs='ntn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [ + [ + (3, 3.321928094887362), (4, 3.321928094887362), (5, 1.736965594166206), (6, 3.321928094887362), + (7, 3.321928094887362), (8, 2.321928094887362) + ], + [ + (5, 3.473931188332412), (9, 3.321928094887362), (10, 3.321928094887362) + ] + ] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # npn model = tfidfmodel.TfidfModel(corpus, smartirs='npn') transformed_docs = [model[docs[0]], model[docs[1]]] @@ -279,6 +319,14 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) # Testing all the variations of `normalize` + # nnx + model = tfidfmodel.TfidfModel(corpus, smartirs='nnx') + 
transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = docs[:] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # nnc model = tfidfmodel.TfidfModel(corpus, smartirs='nnc') transformed_docs = [model[docs[0]], model[docs[1]]] @@ -305,6 +353,23 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # nnu + slope = 0.2 + model = tfidfmodel.TfidfModel(corpus, smartirs='nnu', slope=slope) + transformed_docs = [model[docs[0]], model[docs[1]]] + average_unique_length = 1.0 * sum(len(set(text)) for text in texts) / len(texts) + vector_norms = [ + (1.0 - slope) * average_unique_length + slope * 6.0, + (1.0 - slope) * average_unique_length + slope * 3.0, + ] + expected_docs = [ + [(termid, weight / vector_norms[0]) for termid, weight in docs[0]], + [(termid, weight / vector_norms[1]) for termid, weight in docs[1]], + ] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + def test_pivoted_normalization(self): docs = [corpus[1], corpus[2]] From 269abf3737ee85278e1daca370e6507945cca1a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sun, 24 Mar 2019 12:14:44 +0100 Subject: [PATCH 03/17] Add collection frequency attribute to gensim.corpora.Dictionary --- gensim/corpora/dictionary.py | 13 +++++++++---- gensim/test/test_corpora_dictionary.py | 19 +++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 21df726f3d..61634bb77f 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -36,6 +36,8 @@ class Dictionary(utils.SaveLoad, Mapping): token -> tokenId. id2token : dict of (int, str) Reverse mapping for token2id, initialized in a lazy manner to save memory (not created until needed). + cfs : dict of (int, int) + Collection frequencies: token_id -> how many instances of this token are contained in the documents. dfs : dict of (int, int) Document frequencies: token_id -> how many documents contain this token. 
num_docs : int @@ -74,6 +76,7 @@ def __init__(self, documents=None, prune_at=2000000): """ self.token2id = {} self.id2token = {} + self.cfs = {} self.dfs = {} self.num_docs = 0 @@ -263,10 +266,10 @@ def doc2bow(self, document, allow_update=False, return_missing=False): self.num_docs += 1 self.num_pos += sum(itervalues(counter)) self.num_nnz += len(result) - # increase document count for each unique token that appeared in the document - dfs = self.dfs - for tokenid in iterkeys(result): - dfs[tokenid] = dfs.get(tokenid, 0) + 1 + # keep track of document and collection frequencies + for tokenid, freq in iteritems(result): + self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq + self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1 # return tokenids, in ascending id order result = sorted(iteritems(result)) @@ -449,10 +452,12 @@ def filter_tokens(self, bad_ids=None, good_ids=None): if bad_ids is not None: bad_ids = set(bad_ids) self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid not in bad_ids} + self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid not in bad_ids} self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid not in bad_ids} if good_ids is not None: good_ids = set(good_ids) self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid in good_ids} + self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid in good_ids} self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid in good_ids} self.compactify() diff --git a/gensim/test/test_corpora_dictionary.py b/gensim/test/test_corpora_dictionary.py index e5ec3221fd..cca9993952 100644 --- a/gensim/test/test_corpora_dictionary.py +++ b/gensim/test/test_corpora_dictionary.py @@ -76,6 +76,25 @@ def testDocFreqForOneDocWithSeveralWord(self): expected = {0: 1, 1: 1, 2: 1} self.assertEqual(d.dfs, expected) + def testDocFreqAndCollectionFreq(self): + # one doc + texts = [['human', 'human', 'human']] + d = Dictionary(texts) + self.assertEqual(d.cfs, {0: 3}) + self.assertEqual(d.dfs, {0: 1}) + + # two docs + texts = [['human', 'human'], ['human']] + d = Dictionary(texts) + self.assertEqual(d.cfs, {0: 3}) + self.assertEqual(d.dfs, {0: 2}) + + # three docs + texts = [['human'], ['human'], ['human']] + d = Dictionary(texts) + self.assertEqual(d.cfs, {0: 3}) + self.assertEqual(d.dfs, {0: 3}) + def testBuild(self): d = Dictionary(self.texts) From 5b5c12ff7240db1b37ef0b9c6216f59473925e4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sun, 24 Mar 2019 19:15:08 +0100 Subject: [PATCH 04/17] Resolve SMART letter aliases in gensim.models.tfidf.resolve_weights --- gensim/models/tfidfmodel.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a87f580aa4..3b3b0131fd 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -30,7 +30,7 @@ def resolve_weights(smartirs): Parameters ---------- - smartirs : str + smartirs : str or None `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting variants in the vector space model. 
The mnemonic for representing a combination @@ -40,7 +40,7 @@ def resolve_weights(smartirs): Returns ------- - 3-tuple (local_letter, global_letter, normalization_letter) + str of (local_letter, global_letter, normalization_letter) or None local_letter : str Term frequency weighing, one of: @@ -69,6 +69,10 @@ def resolve_weights(smartirs): doesn't fit the list of permissible values. """ + + if smartirs is None: + return None + if isinstance(smartirs, str) and re.match(r"...\....", smartirs): match = re.match(r"(?P...)\.(?P...)", smartirs) raise ValueError( @@ -93,7 +97,15 @@ def resolve_weights(smartirs): if w_n not in 'xncu': raise ValueError("Expected normalization weight to be one of 'xncu', except got {}".format(w_n)) - return w_tf, w_df, w_n + # resolve aliases + if w_tf == "t": + w_tf = "n" + if w_df == "x": + w_df = "n" + if w_n == "x": + w_n = "n" + + return w_tf + w_df + w_n def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): @@ -151,7 +163,7 @@ def smartirs_wlocal(tf, local_scheme): ---------- tf : int Term frequency. - local : {'b', 't', 'n', 'a', 'l', 'd', 'L'} + local : {'b', 'n', 'a', 'l', 'd', 'L'} Local transformation scheme. Returns @@ -160,7 +172,7 @@ def smartirs_wlocal(tf, local_scheme): Calculated local weight. """ - if local_scheme in ("t", "n"): + if local_scheme == "n": return tf elif local_scheme == "l": return 1 + np.log2(tf) @@ -183,7 +195,7 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): Document frequency. totaldocs : int Total number of documents. - global_scheme : {'x', 'n', 'f', 't', 'p'} + global_scheme : {'n', 'f', 't', 'p'} Global transformation scheme. Returns @@ -192,7 +204,7 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): Calculated global weight. """ - if global_scheme in ("x", "n"): + if global_scheme == "n": return 1.0 elif global_scheme == "f": return np.log2(1.0 * totaldocs / docfreq) @@ -209,7 +221,7 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): ---------- x : numpy.ndarray The tf-idf vector. - norm_scheme : {'x', 'n', 'c', 'u'} + norm_scheme : {'n', 'c', 'u'} Document length normalization scheme. return_norm : bool, optional Return the length of `x` as well? @@ -222,7 +234,7 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): Norm of `x`. """ - if norm_scheme in ("x", "n"): + if norm_scheme == "n": if return_norm: _, length = matutils.unitvec(x, return_norm=return_norm) return x, length From 3cd63d1a62353baac1fd9abb85509d4f62b45730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sun, 24 Mar 2019 21:14:19 +0100 Subject: [PATCH 05/17] Implement the `b` pivoted document length normalization method --- gensim/models/tfidfmodel.py | 95 +++++++++++++++++++++++----------- gensim/sklearn_api/tfidf.py | 3 +- gensim/test/test_tfidfmodel.py | 17 ++++++ 3 files changed, 85 insertions(+), 30 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 3b3b0131fd..df649da0f6 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -18,7 +18,8 @@ import re from gensim import interfaces, matutils, utils -from six import iteritems +from gensim.utils import deprecated +from six import iteritems, iterkeys import numpy as np @@ -60,7 +61,8 @@ def resolve_weights(smartirs): Document normalization, one of: * `x` or `n` - none, * `c` - cosine, - * `u` - pivoted unique. + * `u` - pivoted unique, + * `b` - pivoted character length. 
Raises ------ @@ -94,8 +96,8 @@ def resolve_weights(smartirs): if w_df not in 'xnftp': raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', except got {}".format(w_df)) - if w_n not in 'xncu': - raise ValueError("Expected normalization weight to be one of 'xncu', except got {}".format(w_n)) + if w_n not in 'xncub': + raise ValueError("Expected normalization weight to be one of 'xncub', except got {}".format(w_n)) # resolve aliases if w_tf == "t": @@ -214,25 +216,23 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq)) +@deprecated("Function will be removed in 4.0.0") def smartirs_normalize(x, norm_scheme, return_norm=False): """Normalize a vector using the normalization scheme specified in `norm_scheme`. - Parameters ---------- x : numpy.ndarray The tf-idf vector. - norm_scheme : {'n', 'c', 'u'} + norm_scheme : {'n', 'c'} Document length normalization scheme. return_norm : bool, optional Return the length of `x` as well? - Returns ------- numpy.ndarray Normalized array. float (only if return_norm is set) Norm of `x`. - """ if norm_scheme == "n": if return_norm: @@ -242,8 +242,6 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): return x elif norm_scheme == "c": return matutils.unitvec(x, return_norm=return_norm) - elif norm_scheme == "u": - return matutils.unitvec(x, return_norm=return_norm, norm='unique') class TfidfModel(interfaces.TransformationABC): @@ -319,7 +317,8 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden Document normalization: * `x` or `n` - none, * `c` - cosine, - * `u` - pivoted unique. + * `u` - pivoted unique, + * `b` - pivoted character length. For more information visit `SMART Information Retrieval System `_. @@ -332,9 +331,12 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden retrieval and relevance curves intersect. This parameter along with `slope` is used for pivoted document length normalization. - When `pivot` is None, `smartirs` specifies the pivoted unique document normalization scheme, and either - `corpus` or `dictionary` are specified, then the pivot will be determined automatically. Otherwise, no - pivoted document length normalization is applied. + + When `pivot` is None, and `smartirs` specifies the pivoted unique document normalization scheme (u), and + either `corpus` or `dictionary` are specified, then the pivot will be determined automatically. + + When `pivot` is None, and `smartirs` specifies the character length unique document normalization + scheme (b), and `dictionary` is specified, then the pivot will be determined automatically. slope : float, optional Parameter required by pivoted document length normalization which determines the slope to which the `old normalization` can be tilted. This parameter only works when pivot is defined. 
@@ -343,14 +345,14 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None - self.smartirs = smartirs + self.smartirs = resolve_weights(smartirs) self.slope = slope self.pivot = pivot self.eps = 1e-12 # If smartirs is not None, override wlocal and wglobal if smartirs is not None: - n_tf, n_df, n_n = resolve_weights(smartirs) + n_tf, n_df, n_n = self.smartirs self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf) self.wglobal = partial(smartirs_wglobal, global_scheme=n_df) @@ -363,7 +365,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus" ) self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz + self.cfs = dictionary.cfs.copy() self.dfs = dictionary.dfs.copy() + self.term_lens = {termid: len(term) for termid, term in iteritems(dictionary)} self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) if id2word is None: self.id2word = dictionary @@ -376,13 +380,25 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden # If smartirs is not None, override pivot and normalize if smartirs is not None: - if self.pivot is None and (dictionary is not None or corpus is not None) and n_n == "u": - self.pivot = 1.0 * self.num_nnz / self.num_docs - # also return norm factor if pivot is not none if self.pivot is None: - self.normalize = partial(smartirs_normalize, norm_scheme=n_n) - else: - self.normalize = partial(smartirs_normalize, norm_scheme=n_n, return_norm=True) + if n_n == "u": + if dictionary is not None or corpus is not None: + if callable(self.normalize): + logger.warning("constructor received smartirs; ignoring normalize") + self.pivot = 1.0 * self.num_nnz / self.num_docs + else: + logger.warning("constructor received no corpus or dictionary; ignoring smartirs[2]") + elif n_n == "b": + if dictionary is not None: + if callable(self.normalize): + logger.warning("constructor received smartirs; ignoring normalize") + self.pivot = 1.0 * sum( + self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in iterkeys(dictionary) + ) / self.num_docs + else: + logger.warning("constructor received no dictionary; ignoring smartirs[2]") + elif n_n in 'ub': + logger.warning("constructor received pivot; ignoring smartirs[2]") @classmethod def load(cls, *args, **kwargs): @@ -426,7 +442,9 @@ def initialize(self, corpus): # keep some stats about the training corpus self.num_docs = docno + 1 self.num_nnz = numnnz + self.cfs = None self.dfs = dfs + self.term_lengths = None # and finally compute the idf weights n_features = max(dfs) if dfs else 0 logger.info( @@ -474,18 +492,37 @@ def __getitem__(self, bow, eps=1e-12): for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > self.eps ] - if self.normalize is True: - self.normalize = matutils.unitvec - elif self.normalize is False: - self.normalize = utils.identity - # and finally, normalize the vector either to unit length, or use a # user-defined normalization function + if self.smartirs is not None: + n_n = self.smartirs[2] + if n_n == "n" or (n_n in 'ub' and self.pivot is None): + if self.pivot is not None: + _, old_norm = matutils.unitvec(vector, return_norm=True) + norm_vector = vector + elif n_n == "c": + if self.pivot is not None: + _, old_norm = 
matutils.unitvec(vector, return_norm=True) + else: + norm_vector = matutils.unitvec(vector) + elif n_n == "u": + _, old_norm = matutils.unitvec(vector, return_norm=True, norm='unique') + elif n_n == "b": + old_norm = sum(freq * (self.term_lens[termid] + 1.0) for termid, freq in bow) + else: + if self.normalize: + self.normalize = matutils.unitvec + else: + self.normalize = utils.identity + + if self.pivot is not None: + _, old_norm = self.normalize(vector, return_norm=True) + else: + norm_vector = self.normalize(vector) + if self.pivot is None: - norm_vector = self.normalize(vector) norm_vector = [(termid, weight) for termid, weight in norm_vector if abs(weight) > self.eps] else: - _, old_norm = self.normalize(vector, return_norm=True) pivoted_norm = (1 - self.slope) * self.pivot + self.slope * old_norm norm_vector = [ (termid, weight / float(pivoted_norm)) diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index f5ae6b8263..d95b8a125a 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -78,7 +78,8 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, Document normalization, one of: * `x` or `n` - none, * `c` - cosine, - * `u` - pivoted unique. + * `u` - pivoted unique, + * `b` - pivoted character length. For more info, visit `"Wikipedia" `_. pivot : float, optional diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index cb03b1112d..0848d5e449 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -370,6 +370,23 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + # nnb + slope = 0.2 + model = tfidfmodel.TfidfModel(dictionary=dictionary, smartirs='nnb', slope=slope) + transformed_docs = [model[docs[0]], model[docs[1]]] + average_character_length = sum(len(word) + 1.0 for text in texts for word in text) / len(texts) + vector_norms = [ + (1.0 - slope) * average_character_length + slope * 36.0, + (1.0 - slope) * average_character_length + slope * 25.0, + ] + expected_docs = [ + [(termid, weight / vector_norms[0]) for termid, weight in docs[0]], + [(termid, weight / vector_norms[1]) for termid, weight in docs[1]], + ] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + def test_pivoted_normalization(self): docs = [corpus[1], corpus[2]] From 40fd9c4747319566324a35347ee35595aca42c91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 15:04:12 +0200 Subject: [PATCH 06/17] Fix error message in unitvec --- gensim/matutils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/gensim/matutils.py b/gensim/matutils.py index ff584dfc4d..99b376f34c 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -706,8 +706,9 @@ def unitvec(vec, norm='l2', return_norm=False): Zero-vector will be unchanged. """ - if norm not in ('l1', 'l2', 'unique'): - raise ValueError("'%s' is not a supported norm. Currently supported norms are 'l1' and 'l2'." % norm) + supported_norms = ('l1', 'l2', 'unique') + if norm not in supported_norms: + raise ValueError("'%s' is not a supported norm. Currently supported norms are %s." 
% (norm, supported_norms)) if scipy.sparse.issparse(vec): vec = vec.tocsr() From 533be4a5156ce7e6f41d88d36b4aa46040d6d280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 15:11:38 +0200 Subject: [PATCH 07/17] Remove redundant comment in TfidfModel --- gensim/models/tfidfmodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index df649da0f6..1a31e30f01 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -350,7 +350,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.pivot = pivot self.eps = 1e-12 - # If smartirs is not None, override wlocal and wglobal if smartirs is not None: n_tf, n_df, n_n = self.smartirs self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf) From 08d51a19682089aca0806779643a5552f17c151b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 15:30:31 +0200 Subject: [PATCH 08/17] Fix TfidfModel.__getitem__ for callable self.normalize --- gensim/models/tfidfmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 1a31e30f01..8c95c46978 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -509,9 +509,9 @@ def __getitem__(self, bow, eps=1e-12): elif n_n == "b": old_norm = sum(freq * (self.term_lens[termid] + 1.0) for termid, freq in bow) else: - if self.normalize: + if self.normalize is True: self.normalize = matutils.unitvec - else: + elif self.normalize is False: self.normalize = utils.identity if self.pivot is not None: From 76cdb86da494f144ed04ba5619c66072e761fbd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 15:41:25 +0200 Subject: [PATCH 09/17] Replace None checks with ducktyping in TfidfModel --- gensim/models/tfidfmodel.py | 48 +++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 8c95c46978..d792147576 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -72,7 +72,7 @@ def resolve_weights(smartirs): """ - if smartirs is None: + if not smartirs: return None if isinstance(smartirs, str) and re.match(r"...\....", smartirs): @@ -350,16 +350,16 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.pivot = pivot self.eps = 1e-12 - if smartirs is not None: + if smartirs: n_tf, n_df, n_n = self.smartirs self.wlocal = partial(smartirs_wlocal, local_scheme=n_tf) self.wglobal = partial(smartirs_wglobal, global_scheme=n_df) - if dictionary is not None: + if dictionary: # user supplied a Dictionary object, which already contains all the # statistics we need to construct the IDF mapping. we can skip the # step that goes through the corpus (= an optimization). 
- if corpus is not None: + if corpus: logger.warning( "constructor received both corpus and explicit inverse document frequencies; ignoring the corpus" ) @@ -368,9 +368,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.dfs = dictionary.dfs.copy() self.term_lens = {termid: len(term) for termid, term in iteritems(dictionary)} self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) - if id2word is None: + if not id2word: self.id2word = dictionary - elif corpus is not None: + elif corpus: self.initialize(corpus) else: # NOTE: everything is left uninitialized; presumably the model will @@ -378,26 +378,22 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden pass # If smartirs is not None, override pivot and normalize - if smartirs is not None: - if self.pivot is None: - if n_n == "u": - if dictionary is not None or corpus is not None: - if callable(self.normalize): - logger.warning("constructor received smartirs; ignoring normalize") - self.pivot = 1.0 * self.num_nnz / self.num_docs - else: - logger.warning("constructor received no corpus or dictionary; ignoring smartirs[2]") - elif n_n == "b": - if dictionary is not None: - if callable(self.normalize): - logger.warning("constructor received smartirs; ignoring normalize") - self.pivot = 1.0 * sum( - self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in iterkeys(dictionary) - ) / self.num_docs - else: - logger.warning("constructor received no dictionary; ignoring smartirs[2]") - elif n_n in 'ub': + if not smartirs: + return + if self.pivot is not None: + if n_n in 'ub': logger.warning("constructor received pivot; ignoring smartirs[2]") + return + if n_n in 'ub' and callable(self.normalize): + logger.warning("constructor received smartirs; ignoring normalize") + if n_n in 'ub' and not dictionary and not corpus: + logger.warning("constructor received no corpus or dictionary; ignoring smartirs[2]") + elif n_n == "u": + self.pivot = 1.0 * self.num_nnz / self.num_docs + elif n_n == "b": + self.pivot = 1.0 * sum( + self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in iterkeys(dictionary) + ) / self.num_docs @classmethod def load(cls, *args, **kwargs): @@ -493,7 +489,7 @@ def __getitem__(self, bow, eps=1e-12): # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - if self.smartirs is not None: + if self.smartirs: n_n = self.smartirs[2] if n_n == "n" or (n_n in 'ub' and self.pivot is None): if self.pivot is not None: From 18d30cb1d0edde517ddedcba434ac35bd5e05ddf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 23 Apr 2019 16:39:59 +0200 Subject: [PATCH 10/17] Document and test wlocal parameter of TfidfModel Closes #2444. --- gensim/models/tfidfmodel.py | 6 +++--- gensim/test/test_tfidfmodel.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index d792147576..1fefc97f05 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -287,10 +287,10 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden dictionary : :class:`~gensim.corpora.Dictionary` If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used. to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). 
- wlocals : function, optional + wlocals : callable, optional Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity` - (other options: :func:`math.sqrt`, :func:`math.log1p`, etc). - wglobal : function, optional + (other options: :func:`numpy.sqrt`, `lambda tf: 0.5 + (0.5 * tf / tf.max())`, etc.). + wglobal : callable, optional Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. normalize : {bool, callable}, optional Normalize document vectors to unit euclidean length? You can also inject your own function into `normalize`. diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 0848d5e449..4ee387976b 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -416,6 +416,25 @@ def test_pivoted_normalization(self): self.assertTrue(np.allclose(sorted(transformed_docs[0]), sorted(expected_docs[0]))) self.assertTrue(np.allclose(sorted(transformed_docs[1]), sorted(expected_docs[1]))) + def test_wlocal_wglobal(self): + def wlocal(tf): + assert isinstance(tf, np.ndarray) + return iter(tf + 1) + + def wglobal(df, total_docs): + return 1 + + docs = [corpus[1], corpus[2]] + model = tfidfmodel.TfidfModel(corpus, wlocal=wlocal, wglobal=wglobal, normalize=False) + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [ + [(termid, weight + 1) for termid, weight in docs[0]], + [(termid, weight + 1) for termid, weight in docs[1]], + ] + + self.assertTrue(np.allclose(sorted(transformed_docs[0]), sorted(expected_docs[0]))) + self.assertTrue(np.allclose(sorted(transformed_docs[1]), sorted(expected_docs[1]))) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) From 35f0f9db70a846afbf0d12a0d3526ef61fa11dba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 29 Apr 2019 05:25:52 +0200 Subject: [PATCH 11/17] Do not accept smartirs=None in resolve_weights --- gensim/models/tfidfmodel.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 1fefc97f05..fba5863215 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -31,7 +31,7 @@ def resolve_weights(smartirs): Parameters ---------- - smartirs : str or None + smartirs : str `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting variants in the vector space model. 
The mnemonic for representing a combination @@ -41,7 +41,7 @@ def resolve_weights(smartirs): Returns ------- - str of (local_letter, global_letter, normalization_letter) or None + str of (local_letter, global_letter, normalization_letter) local_letter : str Term frequency weighing, one of: @@ -72,9 +72,6 @@ def resolve_weights(smartirs): """ - if not smartirs: - return None - if isinstance(smartirs, str) and re.match(r"...\....", smartirs): match = re.match(r"(?P...)\.(?P...)", smartirs) raise ValueError( @@ -345,7 +342,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None - self.smartirs = resolve_weights(smartirs) + self.smartirs = resolve_weights(smartirs) if smartirs is not None else None self.slope = slope self.pivot = pivot self.eps = 1e-12 From 5d1213adc967bd0e8ebc66e6d16c4866a3129bfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 29 Apr 2019 05:28:28 +0200 Subject: [PATCH 12/17] Remove blank line between resolve_weights docstring and body (PEP8) --- gensim/models/tfidfmodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index fba5863215..2219f88979 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -71,7 +71,6 @@ def resolve_weights(smartirs): doesn't fit the list of permissible values. """ - if isinstance(smartirs, str) and re.match(r"...\....", smartirs): match = re.match(r"(?P...)\.(?P...)", smartirs) raise ValueError( From 13081f7603b6286796f7548639582999edb8f2b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 29 Apr 2019 05:31:38 +0200 Subject: [PATCH 13/17] Omit word `except` from resolve_weights ValueError messages --- gensim/models/tfidfmodel.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 2219f88979..5fb0ad2292 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -82,18 +82,18 @@ def resolve_weights(smartirs): ) ) if not isinstance(smartirs, str) or len(smartirs) != 3: - raise ValueError("Expected a string of length 3 except got " + smartirs) + raise ValueError("Expected a string of length 3 got " + smartirs) w_tf, w_df, w_n = smartirs if w_tf not in 'btnaldL': - raise ValueError("Expected term frequency weight to be one of 'btnaldL', except got {}".format(w_tf)) + raise ValueError("Expected term frequency weight to be one of 'btnaldL', got {}".format(w_tf)) if w_df not in 'xnftp': - raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', except got {}".format(w_df)) + raise ValueError("Expected inverse document frequency weight to be one of 'xnftp', got {}".format(w_df)) if w_n not in 'xncub': - raise ValueError("Expected normalization weight to be one of 'xncub', except got {}".format(w_n)) + raise ValueError("Expected normalization weight to be one of 'xncub', got {}".format(w_n)) # resolve aliases if w_tf == "t": From 57e5a04101ec1b980fd41cc16e4ec3c4aa1fec96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 29 Apr 2019 05:33:14 +0200 Subject: [PATCH 14/17] Add missing blank lines to the smartirs_normalize docstring --- gensim/models/tfidfmodel.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 
5fb0ad2292..08407ea683 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -215,6 +215,7 @@ def smartirs_wglobal(docfreq, totaldocs, global_scheme): @deprecated("Function will be removed in 4.0.0") def smartirs_normalize(x, norm_scheme, return_norm=False): """Normalize a vector using the normalization scheme specified in `norm_scheme`. + Parameters ---------- x : numpy.ndarray @@ -223,6 +224,7 @@ def smartirs_normalize(x, norm_scheme, return_norm=False): Document length normalization scheme. return_norm : bool, optional Return the length of `x` as well? + Returns ------- numpy.ndarray From 70326362b10b5f28bb5f168221ac41dda875b3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sat, 4 May 2019 23:16:05 +0200 Subject: [PATCH 15/17] Cross-reference docstrings of SMART scheme users (functions, classes) --- gensim/models/tfidfmodel.py | 9 +++++++++ gensim/sklearn_api/tfidf.py | 5 +++++ 2 files changed, 14 insertions(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 08407ea683..6eb511bcb9 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -70,6 +70,10 @@ def resolve_weights(smartirs): If `smartirs` is not a string of length 3 or one of the decomposed value doesn't fit the list of permissible values. + See Also + -------- + ~gensim.sklearn_api.tfidf.TfIdfTransformer, TfidfModel : Classes that also use the SMART scheme. + """ if isinstance(smartirs, str) and re.match(r"...\....", smartirs): match = re.match(r"(?P...)\.(?P...)", smartirs) @@ -339,6 +343,11 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden Parameter required by pivoted document length normalization which determines the slope to which the `old normalization` can be tilted. This parameter only works when pivot is defined. + See Also + -------- + ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme. + resolve_weights : Function that also uses the SMART scheme. + """ self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index d95b8a125a..a262d9d9e1 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -96,6 +96,11 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, the `old normalization` can be tilted. This parameter only works when pivot is defined by user and is not None. + See Also + -------- + ~gensim.models.tfidfmodel.TfidfModel : Class that also uses the SMART scheme. + ~gensim.models.tfidfmodel.resolve_weights : Function that also uses the SMART scheme. + """ self.gensim_model = None self.id2word = id2word From a2f4c7e422a325a86bb12a586a2e97f3d259f730 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Wed, 8 May 2019 17:24:56 +0200 Subject: [PATCH 16/17] Document the default SMART scheme of TfidfModel --- gensim/models/tfidfmodel.py | 1 + gensim/sklearn_api/tfidf.py | 1 + 2 files changed, 2 insertions(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 6eb511bcb9..e5b7d252d2 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -322,6 +322,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden * `u` - pivoted unique, * `b` - pivoted character length. + Default is `nfc`. For more information visit `SMART Information Retrieval System `_. 
pivot : float, optional diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index a262d9d9e1..a918ec5528 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -81,6 +81,7 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, * `u` - pivoted unique, * `b` - pivoted character length. + Default is `nfc`. For more info, visit `"Wikipedia" `_. pivot : float, optional It is the point around which the regular normalization curve is `tilted` to get the new pivoted From fccc5e5926a6b91daac09a27588f620fb670e902 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Sat, 18 May 2019 01:02:04 +0200 Subject: [PATCH 17/17] Improve the documentation of slope and pivot --- gensim/models/tfidfmodel.py | 45 ++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e5b7d252d2..239ef71428 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -267,7 +267,7 @@ class TfidfModel(interfaces.TransformationABC): """ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, - wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.65): + wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.25): r"""Compute TF-IDF by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing the resulting documents to unit length. Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents @@ -322,33 +322,48 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden * `u` - pivoted unique, * `b` - pivoted character length. - Default is `nfc`. + Default is 'nfc'. For more information visit `SMART Information Retrieval System `_. - pivot : float, optional - See the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. + pivot : float or None, optional + In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length + normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 - + slope) * pivot`. - Pivot is the point around which the regular normalization curve is `tilted` to get the new pivoted - normalization curve. In the paper `Amit Singhal, Chris Buckley, Mandar Mitra: - "Pivoted Document Length Normalization" `_ it is the point where the - retrieval and relevance curves intersect. + You can either set the `pivot` by hand, or you can let Gensim figure it out automatically with the following + two steps: - This parameter along with `slope` is used for pivoted document length normalization. + * Set either the `u` or `b` document normalization in the `smartirs` parameter. + * Set either the `corpus` or `dictionary` parameter. The `pivot` will be automatically determined from + the properties of the `corpus` or `dictionary`. - When `pivot` is None, and `smartirs` specifies the pivoted unique document normalization scheme (u), and - either `corpus` or `dictionary` are specified, then the pivot will be determined automatically. + If `pivot` is None and you don't follow steps 1 and 2, then pivoted document length normalization will be + disabled. Default is None. 
- When `pivot` is None, and `smartirs` specifies the character length unique document normalization - scheme (b), and `dictionary` is specified, then the pivot will be determined automatically. + See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. slope : float, optional - Parameter required by pivoted document length normalization which determines the slope to which - the `old normalization` can be tilted. This parameter only works when pivot is defined. + In information retrieval, TF-IDF is biased against long documents [1]_. Pivoted document length + normalization solves this problem by changing the norm of a document to `slope * old_norm + (1.0 - + slope) * pivot`. + + Setting the `slope` to 0.0 uses only the `pivot` as the norm, and setting the `slope` to 1.0 effectively + disables pivoted document length normalization. Singhal [2]_ suggests setting the `slope` between 0.2 and + 0.3 for best results. Default is 0.25. + + See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. See Also -------- ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme. resolve_weights : Function that also uses the SMART scheme. + References + ---------- + .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). `Pivoted Document Length + Normalization `_. *SIGIR Forum*, 51, 176–184. + .. [2] Singhal, A. (2001). `Modern information retrieval: A brief overview `_. + *IEEE Data Eng. Bull.*, 24(4), 35–43. + """ self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize