LUCENE-9599 Make comparator aware of index sorting

Currently, if the search sort is equal to the index sort, we have early termination in TopFieldCollector. As we work to enhance comparators to provide skipping functionality (PR apache#1351), we would like to move this index-sort termination functionality from TopFieldCollector to the comparators.

This patch does the following:
- Add method usesIndexSort to LeafFieldComparator
- Make numeric comparators aware of index sort and early terminate on collecting all competitive hits
- Move TermValComparator and TermOrdValComparator from FieldComparator to the comparator package, so that all comparators are in the same package
- Enhance TermValComparator to provide skipping functionality when the index is sorted

One item left as a TODO for a follow-up PR is to remove the early-termination logic from TopFieldCollector. We can do that once we ensure that all BulkScorers are using iterators from collectors that can skip non-competitive docs.

Relates to apache#1351
mayya-sharipova · Nov 4, 2020 · fff115b · fff115b
1 parent 656ce93
commit fff115b
Show file tree

Hide file tree

Showing 15 changed files with 686 additions and 415 deletions.
diff --git a/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java b/lucene/core/src/java/org/apache/lucene/search/FieldComparator.java
@@ -224,384 +224,4 @@ public int compareTop(int doc) throws IOException {
       return Float.compare(docValue, topValue);
     }
   }
-
-  /** Sorts by field's natural Term sort order, using
-   *  ordinals.  This is functionally equivalent to {@link
-   *  org.apache.lucene.search.FieldComparator.TermValComparator}, but it first resolves the string
-   *  to their relative ordinal positions (using the index
-   *  returned by {@link org.apache.lucene.index.LeafReader#getSortedDocValues(String)}), and
-   *  does most comparisons using the ordinals.  For medium
-   *  to large results, this comparator will be much faster
-   *  than {@link org.apache.lucene.search.FieldComparator.TermValComparator}.  For very small
-   *  result sets it may be slower. */
-  public static class TermOrdValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator {
-    /* Ords for each slot.
-       @lucene.internal */
-    final int[] ords;
-
-    /* Values for each slot.
-       @lucene.internal */
-    final BytesRef[] values;
-    private final BytesRefBuilder[] tempBRs;
-
-    /* Which reader last copied a value into the slot. When
-       we compare two slots, we just compare-by-ord if the
-       readerGen is the same; else we must compare the
-       values (slower).
-       @lucene.internal */
-    final int[] readerGen;
-
-    /* Gen of current reader we are on.
-       @lucene.internal */
-    int currentReaderGen = -1;
-
-    /* Current reader's doc ord/values.
-       @lucene.internal */
-    SortedDocValues termsIndex;
-
-    private final String field;
-
-    /* Bottom slot, or -1 if queue isn't full yet
-       @lucene.internal */
-    int bottomSlot = -1;
-
-    /* Bottom ord (same as ords[bottomSlot] once bottomSlot
-       is set).  Cached for faster compares.
-       @lucene.internal */
-    int bottomOrd;
-
-    /* True if current bottom slot matches the current
-       reader.
-       @lucene.internal */
-    boolean bottomSameReader;
-
-    /* Bottom value (same as values[bottomSlot] once
-       bottomSlot is set).  Cached for faster compares.
-      @lucene.internal */
-    BytesRef bottomValue;
-
-    /** Set by setTopValue. */
-    BytesRef topValue;
-    boolean topSameReader;
-    int topOrd;
-
-    /** -1 if missing values are sorted first, 1 if they are
-     *  sorted last */
-    final int missingSortCmp;
-
-    /** Which ordinal to use for a missing value. */
-    final int missingOrd;
-
-    /** Creates this, sorting missing values first. */
-    public TermOrdValComparator(int numHits, String field) {
-      this(numHits, field, false);
-    }
-
-    /** Creates this, with control over how missing values
-     *  are sorted.  Pass sortMissingLast=true to put
-     *  missing values at the end. */
-    public TermOrdValComparator(int numHits, String field, boolean sortMissingLast) {
-      ords = new int[numHits];
-      values = new BytesRef[numHits];
-      tempBRs = new BytesRefBuilder[numHits];
-      readerGen = new int[numHits];
-      this.field = field;
-      if (sortMissingLast) {
-        missingSortCmp = 1;
-        missingOrd = Integer.MAX_VALUE;
-      } else {
-        missingSortCmp = -1;
-        missingOrd = -1;
-      }
-    }
-
-    private int getOrdForDoc(int doc) throws IOException {
-      if (termsIndex.advanceExact(doc)) {
-        return termsIndex.ordValue();
-      } else {
-        return -1;
-      }
-    }
-
-    @Override
-    public int compare(int slot1, int slot2) {
-      if (readerGen[slot1] == readerGen[slot2]) {
-        return ords[slot1] - ords[slot2];
-      }
-
-      final BytesRef val1 = values[slot1];
-      final BytesRef val2 = values[slot2];
-      if (val1 == null) {
-        if (val2 == null) {
-          return 0;
-        }
-        return missingSortCmp;
-      } else if (val2 == null) {
-        return -missingSortCmp;
-      }
-      return val1.compareTo(val2);
-    }
-
-    @Override
-    public int compareBottom(int doc) throws IOException {
-      assert bottomSlot != -1;
-      int docOrd = getOrdForDoc(doc);
-      if (docOrd == -1) {
-        docOrd = missingOrd;
-      }
-      if (bottomSameReader) {
-        // ord is precisely comparable, even in the equal case
-        return bottomOrd - docOrd;
-      } else if (bottomOrd >= docOrd) {
-        // the equals case always means bottom is > doc
-        // (because we set bottomOrd to the lower bound in
-        // setBottom):
-        return 1;
-      } else {
-        return -1;
-      }
-    }
-
-    @Override
-    public void copy(int slot, int doc) throws IOException {
-      int ord = getOrdForDoc(doc);
-      if (ord == -1) {
-        ord = missingOrd;
-        values[slot] = null;
-      } else {
-        assert ord >= 0;
-        if (tempBRs[slot] == null) {
-          tempBRs[slot] = new BytesRefBuilder();
-        }
-        tempBRs[slot].copyBytes(termsIndex.lookupOrd(ord));
-        values[slot] = tempBRs[slot].get();
-      }
-      ords[slot] = ord;
-      readerGen[slot] = currentReaderGen;
-    }
-
-    /** Retrieves the SortedDocValues for the field in this segment */
-    protected SortedDocValues getSortedDocValues(LeafReaderContext context, String field) throws IOException {
-      return DocValues.getSorted(context.reader(), field);
-    }
-
-    @Override
-    public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
-      termsIndex = getSortedDocValues(context, field);
-      currentReaderGen++;
-
-      if (topValue != null) {
-        // Recompute topOrd/SameReader
-        int ord = termsIndex.lookupTerm(topValue);
-        if (ord >= 0) {
-          topSameReader = true;
-          topOrd = ord;
-        } else {
-          topSameReader = false;
-          topOrd = -ord-2;
-        }
-      } else {
-        topOrd = missingOrd;
-        topSameReader = true;
-      }
-      //System.out.println("  getLeafComparator topOrd=" + topOrd + " topSameReader=" + topSameReader);
-
-      if (bottomSlot != -1) {
-        // Recompute bottomOrd/SameReader
-        setBottom(bottomSlot);
-      }
-
-      return this;
-    }
-
-    @Override
-    public void setBottom(final int bottom) throws IOException {
-      bottomSlot = bottom;
-
-      bottomValue = values[bottomSlot];
-      if (currentReaderGen == readerGen[bottomSlot]) {
-        bottomOrd = ords[bottomSlot];
-        bottomSameReader = true;
-      } else {
-        if (bottomValue == null) {
-          // missingOrd is null for all segments
-          assert ords[bottomSlot] == missingOrd;
-          bottomOrd = missingOrd;
-          bottomSameReader = true;
-          readerGen[bottomSlot] = currentReaderGen;
-        } else {
-          final int ord = termsIndex.lookupTerm(bottomValue);
-          if (ord < 0) {
-            bottomOrd = -ord - 2;
-            bottomSameReader = false;
-          } else {
-            bottomOrd = ord;
-            // exact value match
-            bottomSameReader = true;
-            readerGen[bottomSlot] = currentReaderGen;            
-            ords[bottomSlot] = bottomOrd;
-          }
-        }
-      }
-    }
-
-    @Override
-    public void setTopValue(BytesRef value) {
-      // null is fine: it means the last doc of the prior
-      // search was missing this value
-      topValue = value;
-      //System.out.println("setTopValue " + topValue);
-    }
-
-    @Override
-    public BytesRef value(int slot) {
-      return values[slot];
-    }
-
-    @Override
-    public int compareTop(int doc) throws IOException {
-
-      int ord = getOrdForDoc(doc);
-      if (ord == -1) {
-        ord = missingOrd;
-      }
-
-      if (topSameReader) {
-        // ord is precisely comparable, even in the equal
-        // case
-        //System.out.println("compareTop doc=" + doc + " ord=" + ord + " ret=" + (topOrd-ord));
-        return topOrd - ord;
-      } else if (ord <= topOrd) {
-        // the equals case always means doc is < value
-        // (because we set lastOrd to the lower bound)
-        return 1;
-      } else {
-        return -1;
-      }
-    }
-
-    @Override
-    public int compareValues(BytesRef val1, BytesRef val2) {
-      if (val1 == null) {
-        if (val2 == null) {
-          return 0;
-        }
-        return missingSortCmp;
-      } else if (val2 == null) {
-        return -missingSortCmp;
-      }
-      return val1.compareTo(val2);
-    }
-
-    @Override
-    public void setScorer(Scorable scorer) {}
-  }
-
-  /** Sorts by field's natural Term sort order.  All
-   *  comparisons are done using BytesRef.compareTo, which is
-   *  slow for medium to large result sets but possibly
-   *  very fast for very small results sets. */
-  public static class TermValComparator extends FieldComparator<BytesRef> implements LeafFieldComparator {
-
-    private final BytesRef[] values;
-    private final BytesRefBuilder[] tempBRs;
-    private BinaryDocValues docTerms;
-    private final String field;
-    private BytesRef bottom;
-    private BytesRef topValue;
-    private final int missingSortCmp;
-
-    /** Sole constructor. */
-    public TermValComparator(int numHits, String field, boolean sortMissingLast) {
-      values = new BytesRef[numHits];
-      tempBRs = new BytesRefBuilder[numHits];
-      this.field = field;
-      missingSortCmp = sortMissingLast ? 1 : -1;
-    }
-
-    private BytesRef getValueForDoc(int doc) throws IOException {
-      if (docTerms.advanceExact(doc)) {
-        return docTerms.binaryValue();
-      } else {
-        return null;
-      }
-    }
-
-    @Override
-    public int compare(int slot1, int slot2) {
-      final BytesRef val1 = values[slot1];
-      final BytesRef val2 = values[slot2];
-      return compareValues(val1, val2);
-    }
-
-    @Override
-    public int compareBottom(int doc) throws IOException {
-      final BytesRef comparableBytes = getValueForDoc(doc);
-      return compareValues(bottom, comparableBytes);
-    }
-
-    @Override
-    public void copy(int slot, int doc) throws IOException {
-      final BytesRef comparableBytes = getValueForDoc(doc);
-      if (comparableBytes == null) {
-        values[slot] = null;
-      } else {
-        if (tempBRs[slot] == null) {
-          tempBRs[slot] = new BytesRefBuilder();
-        }
-        tempBRs[slot].copyBytes(comparableBytes);
-        values[slot] = tempBRs[slot].get();
-      }
-    }
-
-    /** Retrieves the BinaryDocValues for the field in this segment */
-    protected BinaryDocValues getBinaryDocValues(LeafReaderContext context, String field) throws IOException {
-      return DocValues.getBinary(context.reader(), field);
-    }
-
-    @Override
-    public LeafFieldComparator getLeafComparator(LeafReaderContext context) throws IOException {
-      docTerms = getBinaryDocValues(context, field);
-      return this;
-    }
-
-    @Override
-    public void setBottom(final int bottom) {
-      this.bottom = values[bottom];
-    }
-
-    @Override
-    public void setTopValue(BytesRef value) {
-      // null is fine: it means the last doc of the prior
-      // search was missing this value
-      topValue = value;
-    }
-
-    @Override
-    public BytesRef value(int slot) {
-      return values[slot];
-    }
-
-    @Override
-    public int compareValues(BytesRef val1, BytesRef val2) {
-      // missing always sorts first:
-      if (val1 == null) {
-        if (val2 == null) {
-          return 0;
-        }
-        return missingSortCmp;
-      } else if (val2 == null) {
-        return -missingSortCmp;
-      }
-      return val1.compareTo(val2);
-    }
-
-    @Override
-    public int compareTop(int doc) throws IOException {
-      return compareValues(topValue, getValueForDoc(doc));
-    }
-
-    @Override
-    public void setScorer(Scorable scorer) {}
-  }
 }
diff --git a/lucene/core/src/java/org/apache/lucene/search/LeafFieldComparator.java b/lucene/core/src/java/org/apache/lucene/search/LeafFieldComparator.java
@@ -132,4 +132,13 @@ default DocIdSetIterator competitiveIterator() throws IOException {
   default void setHitsThresholdReached() throws IOException{
   }
 
+
+  /**
+   * Informs this leaf comparator that sort uses index sorting.
+   * If index sorting is used, we can update competitive iterator to the empty iterator
+   * as soon as the needed number of documents are collected.
+   */
+  default void usesIndexSort() {
+  }
+
 }