From 4385e548414b07ba911b69a99bc5cc746cb8431a Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Tue, 23 Jun 2020 15:14:44 -0400 Subject: [PATCH 1/2] Index points to benchmark sort optimization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sort optimization introduced in https://github.com/apache/lucene-solr/pull/1351 depends on numeric fields being indexed both as doc_values and points. This PR does the following: - add a LongPoint field – lastModLP, last modified timestamp - add an IntPoint field – dayOfYearIP, day of the year of the last modified timestamp - add sort on the last modified timestamp to wikimedium.10M.nostopwords.tasks - don't fail a task if hitCounts don't match in benchUtil.py. As we don't collect all hits in the optimized runs, we don't expect hits total to match. --- src/main/perf/LineFileDocs.java | 17 ++++++++++++++--- src/python/benchUtil.py | 2 +- tasks/wikimedium.10M.nostopwords.tasks | 7 +++++++ 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/src/main/perf/LineFileDocs.java b/src/main/perf/LineFileDocs.java index 3d3fe7c44..bea832550 100644 --- a/src/main/perf/LineFileDocs.java +++ b/src/main/perf/LineFileDocs.java @@ -1,3 +1,4 @@ + package perf; /** @@ -299,8 +300,10 @@ public static final class DocState { final Field titleDV; final Field monthDV; final Field dayOfYearDV; + final IntPoint dayOfYearIP; final BinaryDocValuesField titleBDV; final NumericDocValuesField lastModNDV; + final LongPoint lastModLP; final Field body; final Field id; final Field idPoint; @@ -333,18 +336,24 @@ public static final class DocState { lastModNDV = new NumericDocValuesField("lastModNDV", -1); doc.add(lastModNDV); + lastModLP = new LongPoint("lastModNDV", -1); //points field must have the same name and value as DV field + doc.add(lastModLP); monthDV = new SortedDocValuesField("monthSortedDV", new BytesRef("")); doc.add(monthDV); - - dayOfYearDV = new NumericDocValuesField("dayOfYearNumericDV", 0); - doc.add(dayOfYearDV); + + dayOfYearDV = new NumericDocValuesField("dayOfYearNumericDV", 0); + doc.add(dayOfYearDV); + dayOfYearIP = new IntPoint("dayOfYearNumericDV", 0); //points field must have the same name and value as DV field + doc.add(dayOfYearIP); } else { titleDV = null; titleBDV = null; lastModNDV = null; + lastModLP = null; monthDV = null; dayOfYearDV = null; + dayOfYearIP = null; } titleTokenized = new Field("titleTokenized", "", TextField.TYPE_STORED); @@ -533,6 +542,7 @@ public Document nextDoc(DocState doc) throws IOException { doc.titleTokenized.setStringValue(title); doc.monthDV.setBytesValue(new BytesRef(months[doc.dateCal.get(Calendar.MONTH)])); doc.dayOfYearDV.setLongValue(doc.dateCal.get(Calendar.DAY_OF_YEAR)); + doc.dayOfYearIP.setIntValue(doc.dateCal.get(Calendar.DAY_OF_YEAR)); } doc.id.setStringValue(intToID(myID)); @@ -540,6 +550,7 @@ public Document nextDoc(DocState doc) throws IOException { if (addDVFields) { doc.lastModNDV.setLongValue(msecSinceEpoch); + doc.lastModLP.setLongValue(msecSinceEpoch); } doc.timeSec.setIntValue(timeSec); diff --git a/src/python/benchUtil.py b/src/python/benchUtil.py index ef6a328fc..a15942227 100644 --- a/src/python/benchUtil.py +++ b/src/python/benchUtil.py @@ -140,7 +140,7 @@ def verifySame(self, other, verifyScores, verifyCounts): print 'WARNING: expandedTermCounts differ for %s: %s vs %s' % (self, self.expandedTermCount, other.expandedTermCount) # self.fail('wrong expandedTermCount: %s vs %s' % (self.expandedTermCount, other.expandedTermCount)) - if verifyCounts and self.hitCount != other.hitCount: + if False and self.hitCount != other.hitCount: self.fail('wrong hitCount: %s vs %s' % (self.hitCount, other.hitCount)) if len(self.hits) != len(other.hits): diff --git a/tasks/wikimedium.10M.nostopwords.tasks b/tasks/wikimedium.10M.nostopwords.tasks index 28f6fad00..4d2869aa8 100644 --- a/tasks/wikimedium.10M.nostopwords.tasks +++ b/tasks/wikimedium.10M.nostopwords.tasks @@ -13647,6 +13647,13 @@ HighTermDayOfYearSort: dayofyeardvsort//government # freq=384004 HighTermDayOfYearSort: dayofyeardvsort//27 # freq=383042 HighTermDayOfYearSort: dayofyeardvsort//old # freq=381809 + +TermDateTimeSort: lastmodndvsort//0 # freq=708472 +TermDateTimeSort: lastmodndvsort//names # freq=402762 +TermDateTimeSort: lastmodndvsort//nbsp # freq=492778 +TermDateTimeSort: lastmodndvsort//part # freq=588644 +TermDateTimeSort: lastmodndvsort//st # freq=306811 + HighTermMonthSort: monthdvsort//ref # freq=3793973 HighTermMonthSort: monthdvsort//http # freq=3493581 HighTermMonthSort: monthdvsort//from # freq=3224339 From f87f287ddc50aeec97fdf0379d5a2ba1a47b75dc Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Wed, 21 Oct 2020 17:34:55 -0400 Subject: [PATCH 2/2] Reenable verifyCounts --- src/python/benchUtil.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/python/benchUtil.py b/src/python/benchUtil.py index f9e3d7138..e0986360b 100644 --- a/src/python/benchUtil.py +++ b/src/python/benchUtil.py @@ -168,7 +168,7 @@ def verifySame(self, other, verifyScores, verifyCounts): print('WARNING: expandedTermCounts differ for %s: %s vs %s' % (self, self.expandedTermCount, other.expandedTermCount)) # self.fail('wrong expandedTermCount: %s vs %s' % (self.expandedTermCount, other.expandedTermCount)) - if False and self.hitCount != other.hitCount: + if verifyCounts and self.hitCount != other.hitCount: self.fail('wrong hitCount: %s vs %s' % (self.hitCount, other.hitCount)) if len(self.hits) != len(other.hits):