From 151f171760eb2d50ef399e626779335d576878c0 Mon Sep 17 00:00:00 2001 From: Marcelo d'Almeida Date: Thu, 19 Sep 2019 22:13:32 -0300 Subject: [PATCH 1/6] Refactor bm25 to include model parametrization --- gensim/summarization/bm25.py | 74 ++++++++++++++++++++++++++++++------ 1 file changed, 62 insertions(+), 12 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index b9621aeb1e..e31d07c587 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -44,10 +44,6 @@ from multiprocessing import Pool from ..utils import effective_n_jobs -PARAM_K1 = 1.5 -PARAM_B = 0.75 -EPSILON = 0.25 - class BM25(object): """Implementation of Best Matching 25 ranking function. @@ -66,14 +62,56 @@ class BM25(object): List of document lengths. """ - def __init__(self, corpus): + def __init__(self, corpus, k1=1.5, b=0.75, epsilon=0.25): """ + + 'k1 is a variable which helps determine term frequency saturation characteristics. + That is, it limits how much a single query term can affect the score of a given document. + A higher/lower k1 value means that the slope of “tf() of BM25” curve changes. + This has the effect of changing how “terms occurring extra times add extra score.” + An interpretation of k1 is that for documents of the average length, it is the value of the term frequency that + gives a score of half the maximum score for the considered term. The curve of the impact of tf on the score + grows quickly when tf() ≤ k1 and slower and slower when tf() > k1.' + + Shane Connelly (2018). Practical BM25 - Part 2: The BM25 Algorithm and its Variables + https://www.elastic.co/pt/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables + + + 'If b is bigger, the effects of the length of the document compared to the average length are more amplified. 
+ To see this, you can imagine if you set b to 0, the effect of the length ratio would be completely nullified + and the length of the document would have no bearing on the score' + + Shane Connelly (2018). Practical BM25 - Part 2: The BM25 Algorithm and its Variables + https://www.elastic.co/pt/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables + + + 'A significant number of such experiments have been done, and suggest that in general values + such as 0.5 < b < 0.8 and 1.2 < k1 < 2 are reasonably good in many circumstances. + However, there is also evidence that optimal values do depend on other factors + (such as the type of documents or queries).' + + Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond, + http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf + + Parameters ---------- corpus : list of list of str Given corpus. + k1 : float + Constant used for influencing the term frequency saturation + b : float + Constant used for influencing the effects of different document lengths relative to average document length + epsilon : float + Constant used for negative idf of document in corpus. 
+ """ + + self.k1 = k1 + self.b = b + self.epsilon = epsilon + self.corpus_size = 0 self.avgdl = 0 self.doc_freqs = [] @@ -116,7 +154,7 @@ def _initialize(self, corpus): negative_idfs.append(word) self.average_idf = float(idf_sum) / len(self.idf) - eps = EPSILON * self.average_idf + eps = self.epsilon * self.average_idf for word in negative_idfs: self.idf[word] = eps @@ -141,8 +179,8 @@ def get_score(self, document, index): for word in document: if word not in doc_freqs: continue - score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1) - / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl))) + score += (self.idf[word] * doc_freqs[word] * (self.k1 + 1) + / (doc_freqs[word] + self.k * (1 - self.b + self.b * self.doc_len[index] / self.avgdl))) return score def get_scores(self, document): @@ -226,7 +264,7 @@ def _get_scores(bm25, document): return bm25.get_scores(document) -def iter_bm25_bow(corpus, n_jobs=1): +def iter_bm25_bow(corpus, k1=1.5, b=0.75, epsilon=0.25, n_jobs=1): """Yield BM25 scores (weights) of documents in corpus. Each document has to be weighted with every document in given corpus. @@ -234,6 +272,12 @@ def iter_bm25_bow(corpus, n_jobs=1): ---------- corpus : list of list of str Corpus of documents. + k1 : float + Constant used for influencing the term frequency saturation + b : float + Constant used for influencing the effects of different document lengths relative to average document length + epsilon : float + Constant used for negative idf of document in corpus. n_jobs : int The number of processes to use for computing bm25. 
@@ -255,7 +299,7 @@ def iter_bm25_bow(corpus, n_jobs=1): >>> result = iter_bm25_weights(corpus, n_jobs=-1) """ - bm25 = BM25(corpus) + bm25 = BM25(corpus, k1, b, epsilon) n_processes = effective_n_jobs(n_jobs) if n_processes == 1: @@ -272,7 +316,7 @@ def iter_bm25_bow(corpus, n_jobs=1): pool.join() -def get_bm25_weights(corpus, n_jobs=1): +def get_bm25_weights(corpus, k1=1.5, b=0.75, epsilon=0.25, n_jobs=1): """Returns BM25 scores (weights) of documents in corpus. Each document has to be weighted with every document in given corpus. @@ -280,6 +324,12 @@ def get_bm25_weights(corpus, n_jobs=1): ---------- corpus : list of list of str Corpus of documents. + k1 : float + Constant used for influencing the term frequency saturation + b : float + Constant used for influencing the effects of different document lengths relative to average document length + epsilon : float + Constant used for negative idf of document in corpus. n_jobs : int The number of processes to use for computing bm25. @@ -301,7 +351,7 @@ def get_bm25_weights(corpus, n_jobs=1): >>> result = get_bm25_weights(corpus, n_jobs=-1) """ - bm25 = BM25(corpus) + bm25 = BM25(corpus, k1, b, epsilon) n_processes = effective_n_jobs(n_jobs) if n_processes == 1: From 40fb465fb4036017a327b1498fe2f4015c6d7fda Mon Sep 17 00:00:00 2001 From: Marcelo d'Almeida Date: Mon, 23 Sep 2019 00:22:12 -0300 Subject: [PATCH 2/6] Refactor constants back and fix typo --- gensim/summarization/bm25.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index e31d07c587..fdbd7029bf 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -44,6 +44,10 @@ from multiprocessing import Pool from ..utils import effective_n_jobs +PARAM_K1 = 1.5 +PARAM_B = 0.75 +EPSILON = 0.25 + class BM25(object): """Implementation of Best Matching 25 ranking function. @@ -62,7 +66,7 @@ class BM25(object): List of document lengths. 
""" - def __init__(self, corpus, k1=1.5, b=0.75, epsilon=0.25): + def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): """ 'k1 is a variable which helps determine term frequency saturation characteristics. @@ -180,7 +184,7 @@ def get_score(self, document, index): if word not in doc_freqs: continue score += (self.idf[word] * doc_freqs[word] * (self.k1 + 1) - / (doc_freqs[word] + self.k * (1 - self.b + self.b * self.doc_len[index] / self.avgdl))) + / (doc_freqs[word] + self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl))) return score def get_scores(self, document): @@ -264,7 +268,7 @@ def _get_scores(bm25, document): return bm25.get_scores(document) -def iter_bm25_bow(corpus, k1=1.5, b=0.75, epsilon=0.25, n_jobs=1): +def iter_bm25_bow(corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON, n_jobs=1): """Yield BM25 scores (weights) of documents in corpus. Each document has to be weighted with every document in given corpus. @@ -316,7 +320,7 @@ def iter_bm25_bow(corpus, k1=1.5, b=0.75, epsilon=0.25, n_jobs=1): pool.join() -def get_bm25_weights(corpus, k1=1.5, b=0.75, epsilon=0.25, n_jobs=1): +def get_bm25_weights(corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON, n_jobs=1): """Returns BM25 scores (weights) of documents in corpus. Each document has to be weighted with every document in given corpus. 
From 08ed9f142fae6074e3f8422d41b9dc76defb2c3a Mon Sep 17 00:00:00 2001 From: Marcelo d'Almeida Date: Sun, 27 Oct 2019 13:01:34 -0300 Subject: [PATCH 3/6] Refactor parameters order and description --- gensim/summarization/bm25.py | 94 ++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 46 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index fdbd7029bf..fdf8e8067c 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -68,47 +68,27 @@ class BM25(object): def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): """ - - 'k1 is a variable which helps determine term frequency saturation characteristics. - That is, it limits how much a single query term can affect the score of a given document. - A higher/lower k1 value means that the slope of “tf() of BM25” curve changes. - This has the effect of changing how “terms occurring extra times add extra score.” - An interpretation of k1 is that for documents of the average length, it is the value of the term frequency that - gives a score of half the maximum score for the considered term. The curve of the impact of tf on the score - grows quickly when tf() ≤ k1 and slower and slower when tf() > k1.' - - Shane Connelly (2018). Practical BM25 - Part 2: The BM25 Algorithm and its Variables - https://www.elastic.co/pt/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables - - - 'If b is bigger, the effects of the length of the document compared to the average length are more amplified. - To see this, you can imagine if you set b to 0, the effect of the length ratio would be completely nullified - and the length of the document would have no bearing on the score' - - Shane Connelly (2018). 
Practical BM25 - Part 2: The BM25 Algorithm and its Variables - https://www.elastic.co/pt/blog/practical-bm25-part-2-the-bm25-algorithm-and-its-variables - - - 'A significant number of such experiments have been done, and suggest that in general values - such as 0.5 < b < 0.8 and 1.2 < k1 < 2 are reasonably good in many circumstances. - However, there is also evidence that optimal values do depend on other factors - (such as the type of documents or queries).' - - Robertson, Stephen; Zaragoza, Hugo (2009). The Probabilistic Relevance Framework: BM25 and Beyond, - http://www.staff.city.ac.uk/~sb317/papers/foundations_bm25_review.pdf - - Parameters ---------- corpus : list of list of str Given corpus. k1 : float - Constant used for influencing the term frequency saturation + Constant used for influencing the term frequency saturation. After saturation is reached, additional + presence for the term adds significantly less additional score. According to [1]_, experiments suggest + that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as + the type of documents or queries. b : float - Constant used for influencing the effects of different document lengths relative to average document length + Constant used for influencing the effects of different document lengths relative to average document length. + When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to + [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value + depends on factors such as the type of documents or queries. epsilon : float - Constant used for negative idf of document in corpus. - + Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts + negative idf values. 
Negative idf implies that adding a very common term to a document penalizes the overall + score (with 'very common' meaning that it is present in more than half of the documents). That can be + undesirable as it means that an identical document would score less than an almost identical one (by + removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among + different documents) to receive an extra score. """ @@ -268,7 +248,7 @@ def _get_scores(bm25, document): return bm25.get_scores(document) -def iter_bm25_bow(corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON, n_jobs=1): +def iter_bm25_bow(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): """Yield BM25 scores (weights) of documents in corpus. Each document has to be weighted with every document in given corpus. @@ -276,14 +256,25 @@ def iter_bm25_bow(corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON, n_jobs=1): ---------- corpus : list of list of str Corpus of documents. + n_jobs : int + The number of processes to use for computing bm25. k1 : float - Constant used for influencing the term frequency saturation + Constant used for influencing the term frequency saturation. After saturation is reached, additional + presence for the term adds significantly less additional score. According to [1]_, experiments suggest + that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as + the type of documents or queries. b : float - Constant used for influencing the effects of different document lengths relative to average document length + Constant used for influencing the effects of different document lengths relative to average document length. + When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to + [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value + depends on factors such as the type of documents or queries. 
epsilon : float - Constant used for negative idf of document in corpus. - n_jobs : int - The number of processes to use for computing bm25. + Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts + negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall + score (with 'very common' meaning that it is present in more than half of the documents). That can be + undesirable as it means that an identical document would score less than an almost identical one (by + removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among + different documents) to receive an extra score. Yields ------- @@ -320,7 +311,7 @@ def iter_bm25_bow(corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON, n_jobs=1): pool.join() -def get_bm25_weights(corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON, n_jobs=1): +def get_bm25_weights(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): """Returns BM25 scores (weights) of documents in corpus. Each document has to be weighted with every document in given corpus. @@ -328,14 +319,25 @@ def get_bm25_weights(corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON, n_jobs=1): ---------- corpus : list of list of str Corpus of documents. + n_jobs : int + The number of processes to use for computing bm25. k1 : float - Constant used for influencing the term frequency saturation + Constant used for influencing the term frequency saturation. After saturation is reached, additional + presence for the term adds significantly less additional score. According to [1]_, experiments suggest + that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as + the type of documents or queries. 
b : float - Constant used for influencing the effects of different document lengths relative to average document length + Constant used for influencing the effects of different document lengths relative to average document length. + When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to + [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value + depends on factors such as the type of documents or queries. epsilon : float - Constant used for negative idf of document in corpus. - n_jobs : int - The number of processes to use for computing bm25. + Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts + negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall + score (with 'very common' meaning that it is present in more than half of the documents). That can be + undesirable as it means that an identical document would score less than an almost identical one (by + removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among + different documents) to receive an extra score. 
Returns ------- From e880b253e51e7dd1728fb984126b68a129356a78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Mon, 6 Jan 2020 03:01:31 +0100 Subject: [PATCH 4/6] Add BM25 tests This closes #2597 and closes #2606 --- gensim/test/test_BM25.py | 72 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_BM25.py b/gensim/test/test_BM25.py index d575ae9433..4a8d7ef85e 100644 --- a/gensim/test/test_BM25.py +++ b/gensim/test/test_BM25.py @@ -11,7 +11,7 @@ import logging import unittest -from gensim.summarization.bm25 import get_bm25_weights +from gensim.summarization.bm25 import get_bm25_weights, iter_bm25_bow, BM25 from gensim.test.utils import common_texts @@ -62,6 +62,76 @@ def test_multiprocessing(self): self.assertAlmostEqual(weights1, weights3) self.assertAlmostEqual(weights2, weights3) + def test_k1(self): + """ Changing the k1 parameter should give consistent results """ + corpus = common_texts + index = 0 + doc = corpus[index] + first_k1 = 1.0 + second_k1 = 2.0 + + first_bm25 = BM25(corpus, k1=first_k1) + second_bm25 = BM25(corpus, k1=second_k1) + self.assertTrue(first_bm25.get_score(doc, index) < second_bm25.get_score(doc, index)) + + first_iter = iter_bm25_bow(corpus, k1=first_k1) + second_iter = iter_bm25_bow(corpus, k1=second_k1) + self.assertTrue(dict(next(iter(first_iter)))[index] < dict(next(iter(second_iter)))[index]) + + first_weights = get_bm25_weights(corpus, k1=first_k1) + second_weights = get_bm25_weights(corpus, k1=second_k1) + self.assertTrue(first_weights[index] < second_weights[index]) + + def test_b(self): + """ Changing the b parameter should give consistent results """ + corpus = common_texts + index = 0 + doc = corpus[index] + first_b = 1.0 + second_b = 2.0 + + first_bm25 = BM25(corpus, b=first_b) + second_bm25 = BM25(corpus, b=second_b) + self.assertTrue(first_bm25.get_score(doc, index) < second_bm25.get_score(doc, index)) + + first_iter = 
iter_bm25_bow(corpus, b=first_b) + second_iter = iter_bm25_bow(corpus, b=second_b) + self.assertTrue(dict(next(iter(first_iter)))[index] < dict(next(iter(second_iter)))[index]) + + first_weights = get_bm25_weights(corpus, b=first_b) + second_weights = get_bm25_weights(corpus, b=second_b) + self.assertTrue(first_weights[index] < second_weights[index]) + + def test_epsilon(self): + """ Changing the b parameter should give consistent results """ + corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']] + first_epsilon = 1.0 + second_epsilon = 2.0 + bm25 = BM25(corpus) + words_with_negative_idfs = set([ + word + for word, idf in bm25.idf.items() + if idf < 0 + ]) + index, doc = [ + (index, document) + for index, document + in enumerate(corpus) + if words_with_negative_idfs & set(document) + ][0] + + first_bm25 = BM25(corpus, epsilon=first_epsilon) + second_bm25 = BM25(corpus, epsilon=second_epsilon) + self.assertTrue(first_bm25.get_score(doc, index) > second_bm25.get_score(doc, index)) + + first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon) + second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon) + self.assertTrue(dict(next(iter(first_iter)))[index] > dict(next(iter(second_iter)))[index]) + + first_weights = get_bm25_weights(corpus, epsilon=first_epsilon) + second_weights = get_bm25_weights(corpus, epsilon=second_epsilon) + self.assertTrue(first_weights[index] > second_weights[index]) + if __name__ == '__main__': logging.basicConfig(level=logging.DEBUG) From 4a61276eb1200ee3b28ea398283af423a762457d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Wed, 8 Jan 2020 00:10:17 +0100 Subject: [PATCH 5/6] Simplify asserts in BM25 tests --- gensim/test/test_BM25.py | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/gensim/test/test_BM25.py b/gensim/test/test_BM25.py index 4a8d7ef85e..eb63ddc328 100644 --- a/gensim/test/test_BM25.py +++ b/gensim/test/test_BM25.py @@ -72,15 +72,21 
@@ def test_k1(self): first_bm25 = BM25(corpus, k1=first_k1) second_bm25 = BM25(corpus, k1=second_k1) - self.assertTrue(first_bm25.get_score(doc, index) < second_bm25.get_score(doc, index)) + first_score = first_bm25.get_score(doc, index) + second_score = second_bm25.get_score(doc, index) + self.assertLess(first_score, second_score) first_iter = iter_bm25_bow(corpus, k1=first_k1) second_iter = iter_bm25_bow(corpus, k1=second_k1) - self.assertTrue(dict(next(iter(first_iter)))[index] < dict(next(iter(second_iter)))[index]) + first_score = dict(next(iter(first_iter)))[index] + second_score = dict(next(iter(second_iter)))[index] + self.assertLess(first_score, second_score) first_weights = get_bm25_weights(corpus, k1=first_k1) second_weights = get_bm25_weights(corpus, k1=second_k1) - self.assertTrue(first_weights[index] < second_weights[index]) + first_score = first_weights[index] + second_score = second_weights[index] + self.assertLess(first_score, second_score) def test_b(self): """ Changing the b parameter should give consistent results """ @@ -92,15 +98,21 @@ def test_b(self): first_bm25 = BM25(corpus, b=first_b) second_bm25 = BM25(corpus, b=second_b) - self.assertTrue(first_bm25.get_score(doc, index) < second_bm25.get_score(doc, index)) + first_score = first_bm25.get_score(doc, index) + second_score = second_bm25.get_score(doc, index) + self.assertLess(first_score, second_score) first_iter = iter_bm25_bow(corpus, b=first_b) second_iter = iter_bm25_bow(corpus, b=second_b) - self.assertTrue(dict(next(iter(first_iter)))[index] < dict(next(iter(second_iter)))[index]) + first_score = dict(next(iter(first_iter)))[index] + second_score = dict(next(iter(second_iter)))[index] + self.assertLess(first_score, second_score) first_weights = get_bm25_weights(corpus, b=first_b) second_weights = get_bm25_weights(corpus, b=second_b) - self.assertTrue(first_weights[index] < second_weights[index]) + first_score = first_weights[index] + second_score = second_weights[index] + 
self.assertLess(first_score, second_score) def test_epsilon(self): """ Changing the b parameter should give consistent results """ @@ -122,15 +134,21 @@ def test_epsilon(self): first_bm25 = BM25(corpus, epsilon=first_epsilon) second_bm25 = BM25(corpus, epsilon=second_epsilon) - self.assertTrue(first_bm25.get_score(doc, index) > second_bm25.get_score(doc, index)) + first_score = first_bm25.get_score(doc, index) + second_score = second_bm25.get_score(doc, index) + self.assertGreater(first_score, second_score) first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon) second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon) - self.assertTrue(dict(next(iter(first_iter)))[index] > dict(next(iter(second_iter)))[index]) + first_score = dict(next(iter(first_iter)))[index] + second_score = dict(next(iter(second_iter)))[index] + self.assertGreater(first_score, second_score) first_weights = get_bm25_weights(corpus, epsilon=first_epsilon) second_weights = get_bm25_weights(corpus, epsilon=second_epsilon) - self.assertTrue(first_weights[index] > second_weights[index]) + first_score = first_weights[index] + second_score = second_weights[index] + self.assertGreater(first_score, second_score) if __name__ == '__main__': From 13845d12d11146462723c598048df6809824e9d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Wed, 8 Jan 2020 00:31:21 +0100 Subject: [PATCH 6/6] Refactor BM25.get_score --- gensim/summarization/bm25.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 43a9468ba6..beabbf728c 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -168,13 +168,15 @@ def get_score(self, document, index): BM25 score. 
""" - score = 0 + score = 0.0 doc_freqs = self.doc_freqs[index] + numerator_constant = self.k1 + 1 + denominator_constant = self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl) for word in document: - if word not in doc_freqs: - continue - score += (self.idf[word] * doc_freqs[word] * (self.k1 + 1) - / (doc_freqs[word] + self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl))) + if word in doc_freqs: + df = self.doc_freqs[index][word] + idf = self.idf[word] + score += (idf * df * numerator_constant) / (df + denominator_constant) return score def get_scores(self, document):