Skip to content

Commit

Permalink
Index points to benchmark sort optimization
Browse files Browse the repository at this point in the history
The sort optimization introduced in apache/lucene-solr#1351
depends on numeric fields being indexed as both doc_values and points.

This PR does the following:
- add a LongPoint field – lastModLP, last modified timestamp
- add an IntPoint field – dayOfYearIP, day of the year of the last modified timestamp
- add sort on the last modified timestamp to wikimedium.10M.nostopwords.tasks

When comparing against a run where sort optimization is not enabled,
hit counts may differ between the two runs. To keep a task from failing
on that mismatch, `competition` in `localrun.py` should be modified to:

```
 comp =  competition.Competition(verifyCounts=False)
```
  • Loading branch information
mayya-sharipova authored Oct 22, 2020
2 parents ba1a47b + f87f287 commit 1962af3
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
17 changes: 14 additions & 3 deletions src/main/perf/LineFileDocs.java
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@

package perf;

/**
Expand Down Expand Up @@ -299,8 +300,10 @@ public static final class DocState {
final Field titleDV;
final Field monthDV;
final Field dayOfYearDV;
final IntPoint dayOfYearIP;
final BinaryDocValuesField titleBDV;
final NumericDocValuesField lastModNDV;
final LongPoint lastModLP;
final Field body;
final Field id;
final Field idPoint;
Expand Down Expand Up @@ -333,18 +336,24 @@ public static final class DocState {

lastModNDV = new NumericDocValuesField("lastModNDV", -1);
doc.add(lastModNDV);
lastModLP = new LongPoint("lastModNDV", -1); //points field must have the same name and value as DV field
doc.add(lastModLP);

monthDV = new SortedDocValuesField("monthSortedDV", new BytesRef(""));
doc.add(monthDV);

dayOfYearDV = new NumericDocValuesField("dayOfYearNumericDV", 0);
doc.add(dayOfYearDV);

dayOfYearDV = new NumericDocValuesField("dayOfYearNumericDV", 0);
doc.add(dayOfYearDV);
dayOfYearIP = new IntPoint("dayOfYearNumericDV", 0); //points field must have the same name and value as DV field
doc.add(dayOfYearIP);
} else {
titleDV = null;
titleBDV = null;
lastModNDV = null;
lastModLP = null;
monthDV = null;
dayOfYearDV = null;
dayOfYearIP = null;
}

titleTokenized = new Field("titleTokenized", "", TextField.TYPE_STORED);
Expand Down Expand Up @@ -533,13 +542,15 @@ public Document nextDoc(DocState doc) throws IOException {
doc.titleTokenized.setStringValue(title);
doc.monthDV.setBytesValue(new BytesRef(months[doc.dateCal.get(Calendar.MONTH)]));
doc.dayOfYearDV.setLongValue(doc.dateCal.get(Calendar.DAY_OF_YEAR));
doc.dayOfYearIP.setIntValue(doc.dateCal.get(Calendar.DAY_OF_YEAR));
}
doc.id.setStringValue(intToID(myID));

doc.idPoint.setIntValue(myID);

if (addDVFields) {
doc.lastModNDV.setLongValue(msecSinceEpoch);
doc.lastModLP.setLongValue(msecSinceEpoch);
}

doc.timeSec.setIntValue(timeSec);
Expand Down
7 changes: 7 additions & 0 deletions tasks/wikimedium.10M.nostopwords.tasks
Original file line number Diff line number Diff line change
Expand Up @@ -13647,6 +13647,13 @@ HighTermDayOfYearSort: dayofyeardvsort//government # freq=384004
HighTermDayOfYearSort: dayofyeardvsort//27 # freq=383042
HighTermDayOfYearSort: dayofyeardvsort//old # freq=381809


TermDateTimeSort: lastmodndvsort//0 # freq=708472
TermDateTimeSort: lastmodndvsort//names # freq=402762
TermDateTimeSort: lastmodndvsort//nbsp # freq=492778
TermDateTimeSort: lastmodndvsort//part # freq=588644
TermDateTimeSort: lastmodndvsort//st # freq=306811

HighTermMonthSort: monthdvsort//ref # freq=3793973
HighTermMonthSort: monthdvsort//http # freq=3493581
HighTermMonthSort: monthdvsort//from # freq=3224339
Expand Down

0 comments on commit 1962af3

Please sign in to comment.