Skip to content

Commit

Permalink
LUCENE-9599 Make comparator aware of index sorting
Browse files Browse the repository at this point in the history
Currently, if the search sort is equal to the index sort, we terminate
early in TopFieldCollector. As we work to enhance comparators
to provide skipping functionality (PR apache#1351), we would like to
move this index-sort early termination from
TopFieldCollector to the comparators.

This patch does the following:
- Add method usesIndexSort to LeafFieldComparator
- Make numeric comparators aware of index sort and terminate early once
  all competitive hits have been collected
- Move TermValComparator and TermOrdValComparator from FieldComparator
  to comparator package, for all comparators to be in the same package
- Enhance TermValComparator to provide skipping functionality when
  index is sorted

One item left as a TODO for a following PR is to remove the logic of
early termination from TopFieldCollector. We can do that once
we ensure that all BulkScorers are using iterators from collectors
that can skip non-competitive docs.

Relates to apache#1351
  • Loading branch information
mayya-sharipova committed Nov 4, 2020
1 parent 656ce93 commit fff115b
Show file tree
Hide file tree
Showing 15 changed files with 686 additions and 415 deletions.
380 changes: 0 additions & 380 deletions lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
Original file line number Diff line number Diff line change
Expand Up @@ -224,384 +224,4 @@ public int compareTop(int doc) throws IOException {
return Float.compare(docValue, topValue);
}
}

/** Sorts by field's natural Term sort order, using
* ordinals. This is functionally equivalent to {@link
* org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the string
* to their relative ordinal positions (using the index
* returned by {@link org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and
* does most comparisons using the ordinals. For medium
* to large results, this comparator will be much faster
* than {@link org.apache.lucene.search.FieldComparator.TermValComparator}. For very small
* result sets it may be slower.
*
* <p>The same instance acts as both the top-level comparator and the per-segment
* {@link LeafFieldComparator}: {@link #getLeafComparator} swaps in the segment's
* doc values and returns {@code this}. Ordinals are only comparable within one
* segment, so each slot remembers which reader generation produced its ordinal
* and the comparator falls back to comparing term bytes across generations. */
public static class TermOrdValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator {
/* Ords for each slot. A slot holding a document without a value
stores missingOrd instead of a real ordinal (see copy).
@lucene.internal */
final int[] ords;

/* Values for each slot; null when the slot's document had no value.
@lucene.internal */
final BytesRef[] values;
// Per-slot reusable builders that own the bytes referenced by values[slot];
// allocated lazily in copy.
private final BytesRefBuilder[] tempBRs;

/* Which reader last copied a value into the slot. When
we compare two slots, we just compare-by-ord if the
readerGen is the same; else we must compare the
values (slower).
@lucene.internal */
final int[] readerGen;

/* Gen of current reader we are on. Starts at -1 and is incremented
on every getLeafComparator call.
@lucene.internal */
int currentReaderGen = -1;

/* Current reader's doc ord/values.
@lucene.internal */
SortedDocValues termsIndex;

// Name of the field whose sorted doc values are read.
private final String field;

/* Bottom slot, or -1 if queue isn't full yet
@lucene.internal */
int bottomSlot = -1;

/* Bottom ord (same as ords[bottomSlot] once bottomSlot
is set). Cached for faster compares. When bottomSameReader is
false this is only a lower bound computed in setBottom.
@lucene.internal */
int bottomOrd;

/* True if current bottom slot matches the current
reader.
@lucene.internal */
boolean bottomSameReader;

/* Bottom value (same as values[bottomSlot] once
bottomSlot is set). Cached for faster compares.
@lucene.internal */
BytesRef bottomValue;

/** Set by setTopValue. */
BytesRef topValue;
// True when topOrd is an exact ordinal in the current segment (or top is
// missing); false when topOrd is only a lower bound. Recomputed in getLeafComparator.
boolean topSameReader;
// Ordinal (or lower bound) of topValue in the current segment.
int topOrd;

/** -1 if missing values are sorted first, 1 if they are
* sorted last */
final int missingSortCmp;

/** Which ordinal to use for a missing value: {@code -1} when missing
* values sort first, {@code Integer.MAX_VALUE} when they sort last. */
final int missingOrd;

/** Creates this, sorting missing values first. */
public TermOrdValComparator(int numHits, String field) {
this(numHits, field, false);
}

/** Creates this, with control over how missing values
* are sorted. Pass sortMissingLast=true to put
* missing values at the end.
*
* @param numHits number of slots to allocate
* @param field name of the field to sort on
* @param sortMissingLast whether documents without a value sort after all others */
public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
ords = new int[numHits];
values = new BytesRef[numHits];
tempBRs = new BytesRefBuilder[numHits];
readerGen = new int[numHits];
this.field = field;
if (sortMissingLast) {
missingSortCmp = 1;
missingOrd = Integer.MAX_VALUE;
} else {
missingSortCmp = -1;
missingOrd = -1;
}
}

/** Returns the ordinal of {@code doc} in the current segment, or -1 if the
* document has no value. Callers translate -1 into {@link #missingOrd}. */
private int getOrdForDoc(int doc) throws IOException {
if (termsIndex.advanceExact(doc)) {
return termsIndex.ordValue();
} else {
return -1;
}
}

/** Compares two slots: by ordinal when both were filled from the same reader
* generation, otherwise by term bytes, with a null (missing) value ordered
* according to {@link #missingSortCmp}. */
@Override
public int compare(int slot1, int slot2) {
if (readerGen[slot1] == readerGen[slot2]) {
// Same generation: ordinals (including missingOrd) are directly comparable.
return ords[slot1] - ords[slot2];
}

final BytesRef val1 = values[slot1];
final BytesRef val2 = values[slot2];
if (val1 == null) {
if (val2 == null) {
return 0;
}
return missingSortCmp;
} else if (val2 == null) {
return -missingSortCmp;
}
return val1.compareTo(val2);
}

/** Compares the cached bottom entry against {@code doc} using ordinals of the
* current segment. Requires that a bottom slot has been set. */
@Override
public int compareBottom(int doc) throws IOException {
assert bottomSlot != -1;
int docOrd = getOrdForDoc(doc);
if (docOrd == -1) {
docOrd = missingOrd;
}
if (bottomSameReader) {
// ord is precisely comparable, even in the equal case
return bottomOrd - docOrd;
} else if (bottomOrd >= docOrd) {
// the equals case always means bottom is > doc
// (because we set bottomOrd to the lower bound in
// setBottom):
return 1;
} else {
return -1;
}
}

/** Stores the ordinal and a private copy of the term bytes of {@code doc} into
* {@code slot}, and tags the slot with the current reader generation. A document
* without a value stores {@link #missingOrd} and a null value. */
@Override
public void copy(int slot, int doc) throws IOException {
int ord = getOrdForDoc(doc);
if (ord == -1) {
ord = missingOrd;
values[slot] = null;
} else {
assert ord >= 0;
if (tempBRs[slot] == null) {
tempBRs[slot] = new BytesRefBuilder();
}
// Copy the bytes into the slot's own builder rather than keeping the
// reference returned by lookupOrd.
tempBRs[slot].copyBytes(termsIndex.lookupOrd(ord));
values[slot] = tempBRs[slot].get();
}
ords[slot] = ord;
readerGen[slot] = currentReaderGen;
}

/** Retrieves the SortedDocValues for the field in this segment */
protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field) throws IOException {
return DocValues.getSorted(context.reader(), field);
}

/** Switches this comparator to the given segment: loads its sorted doc values,
* advances the reader generation, and re-resolves the top and bottom entries
* against the new segment's ordinals. Returns {@code this}. */
@Override
public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
termsIndex = getSortedDocValues(context, field);
currentReaderGen++;

if (topValue != null) {
// Recompute topOrd/SameReader
int ord = termsIndex.lookupTerm(topValue);
if (ord >= 0) {
topSameReader = true;
topOrd = ord;
} else {
// No exact match in this segment: derive topOrd from the negative
// lookup result (treated as a lower bound by compareTop).
topSameReader = false;
topOrd = -ord-2;
}
} else {
// A null top value is represented by missingOrd, which does not depend on the segment.
topOrd = missingOrd;
topSameReader = true;
}
//System.out.println(" getLeafComparator topOrd=" + topOrd + " topSameReader=" + topSameReader);

if (bottomSlot != -1) {
// Recompute bottomOrd/SameReader
setBottom(bottomSlot);
}

return this;
}

/** Records {@code bottom} as the bottom slot and caches its value and ordinal
* relative to the current segment. If the slot was filled from another reader
* generation, the ordinal is re-resolved here; on an exact match (or a missing
* value) the slot itself is updated to the current generation. */
@Override
public void setBottom(final int bottom) throws IOException {
bottomSlot = bottom;

bottomValue = values[bottomSlot];
if (currentReaderGen == readerGen[bottomSlot]) {
bottomOrd = ords[bottomSlot];
bottomSameReader = true;
} else {
if (bottomValue == null) {
// a missing value is represented by missingOrd in every segment, so the
// slot can be treated as belonging to the current reader
assert ords[bottomSlot] == missingOrd;
bottomOrd = missingOrd;
bottomSameReader = true;
readerGen[bottomSlot] = currentReaderGen;
} else {
final int ord = termsIndex.lookupTerm(bottomValue);
if (ord < 0) {
// term is absent from this segment: keep only a lower bound, which
// compareBottom handles via its not-same-reader branch
bottomOrd = -ord - 2;
bottomSameReader = false;
} else {
bottomOrd = ord;
// exact value match
bottomSameReader = true;
readerGen[bottomSlot] = currentReaderGen;
ords[bottomSlot] = bottomOrd;
}
}
}
}

/** Stores the top value; its ordinal is resolved in {@link #getLeafComparator}. */
@Override
public void setTopValue(BytesRef value) {
// null is fine: it means the last doc of the prior
// search was missing this value
topValue = value;
//System.out.println("setTopValue " + topValue);
}

/** Returns the value stored in {@code slot}, or null if that slot holds a missing value. */
@Override
public BytesRef value(int slot) {
return values[slot];
}

/** Compares the top value against {@code doc} using ordinals of the current segment. */
@Override
public int compareTop(int doc) throws IOException {

int ord = getOrdForDoc(doc);
if (ord == -1) {
ord = missingOrd;
}

if (topSameReader) {
// ord is precisely comparable, even in the equal
// case
//System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" + (topOrd-ord));
return topOrd - ord;
} else if (ord <= topOrd) {
// the equals case always means doc is < value
// (because we set topOrd to the lower bound)
return 1;
} else {
return -1;
}
}

/** Compares two values by their bytes; a null (missing) value is ordered
* before or after non-null values according to {@link #missingSortCmp}. */
@Override
public int compareValues(BytesRef val1, BytesRef val2) {
if (val1 == null) {
if (val2 == null) {
return 0;
}
return missingSortCmp;
} else if (val2 == null) {
return -missingSortCmp;
}
return val1.compareTo(val2);
}

/** No-op: this comparator does not use the scorer. */
@Override
public void setScorer(Scorable scorer) {}
}

/**
 * Orders documents by the natural byte order of a field's terms.
 *
 * <p>Every comparison is a direct {@code BytesRef.compareTo} on the term bytes,
 * which is slow for medium to large result sets but possibly very fast for
 * very small result sets. Documents without a value are placed before or after
 * all others depending on the {@code sortMissingLast} constructor argument.
 */
public static class TermValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator {

  /** Name of the field whose binary doc values are compared. */
  private final String field;

  /** Result returned when only the first compared value is missing: 1 = missing last, -1 = missing first. */
  private final int missingSortCmp;

  /** Per-slot values; {@code null} marks a document without a value. */
  private final BytesRef[] values;

  /** Per-slot reusable builders that own the bytes exposed through {@link #values}. */
  private final BytesRefBuilder[] tempBRs;

  /** Doc values of the segment currently being collected. */
  private BinaryDocValues docTerms;

  /** Value of the current bottom slot. */
  private BytesRef bottom;

  /** Value supplied through {@link #setTopValue}. */
  private BytesRef topValue;

  /**
   * Sole constructor.
   *
   * @param numHits number of slots to allocate
   * @param field name of the field to sort on
   * @param sortMissingLast whether documents without a value sort after all others
   */
  public TermValComparator(int numHits, String field, boolean sortMissingLast) {
    this.field = field;
    if (sortMissingLast) {
      this.missingSortCmp = 1;
    } else {
      this.missingSortCmp = -1;
    }
    this.values = new BytesRef[numHits];
    this.tempBRs = new BytesRefBuilder[numHits];
  }

  /** Returns the value of {@code doc} in the current segment, or {@code null} if it has none. */
  private BytesRef getValueForDoc(int doc) throws IOException {
    return docTerms.advanceExact(doc) ? docTerms.binaryValue() : null;
  }

  /** Retrieves the BinaryDocValues for the field in this segment */
  protected BinaryDocValues getBinaryDocValues(LeafReaderContext context, String field) throws IOException {
    return DocValues.getBinary(context.reader(), field);
  }

  /** Loads the segment's doc values and returns this same instance as the leaf comparator. */
  @Override
  public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
    docTerms = getBinaryDocValues(context, field);
    return this;
  }

  @Override
  public int compare(int slot1, int slot2) {
    return compareValues(values[slot1], values[slot2]);
  }

  @Override
  public int compareBottom(int doc) throws IOException {
    return compareValues(bottom, getValueForDoc(doc));
  }

  @Override
  public int compareTop(int doc) throws IOException {
    return compareValues(topValue, getValueForDoc(doc));
  }

  /** Copies the bytes of {@code doc}'s value into the slot's own builder, or stores {@code null} if missing. */
  @Override
  public void copy(int slot, int doc) throws IOException {
    final BytesRef docBytes = getValueForDoc(doc);
    if (docBytes == null) {
      values[slot] = null;
      return;
    }
    BytesRefBuilder scratch = tempBRs[slot];
    if (scratch == null) {
      scratch = new BytesRefBuilder();
      tempBRs[slot] = scratch;
    }
    scratch.copyBytes(docBytes);
    values[slot] = scratch.get();
  }

  @Override
  public void setBottom(final int bottom) {
    this.bottom = values[bottom];
  }

  @Override
  public void setTopValue(BytesRef value) {
    // A null value is accepted: it means the last doc of the prior
    // search was missing this value.
    topValue = value;
  }

  @Override
  public BytesRef value(int slot) {
    return values[slot];
  }

  /**
   * Compares two values by their bytes. A {@code null} (missing) value is ordered
   * relative to a non-null one according to {@code missingSortCmp}; two missing
   * values are equal.
   */
  @Override
  public int compareValues(BytesRef val1, BytesRef val2) {
    if (val1 != null && val2 != null) {
      return val1.compareTo(val2);
    }
    if (val1 == null && val2 == null) {
      return 0;
    }
    // Exactly one side is missing.
    return val1 == null ? missingSortCmp : -missingSortCmp;
  }

  /** No-op: this comparator does not use the scorer. */
  @Override
  public void setScorer(Scorable scorer) {}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -132,4 +132,13 @@ default DocIdSetIterator competitiveIterator() throws IOException {
default void setHitsThresholdReached() throws IOException {
  // Intentionally empty: the default implementation ignores this notification.
}


/**
 * Notifies this leaf comparator that the sort is backed by index sorting.
 * When index sorting is in use, the competitive iterator can be replaced with
 * the empty iterator as soon as the required number of documents has been
 * collected. The default implementation ignores the notification.
 */
default void usesIndexSort() {
  // no-op unless an implementation overrides it
}

}
Loading

0 comments on commit fff115b

Please sign in to comment.