/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

import org.apache.lucene.geo.Rectangle;
import org.apache.lucene.index.PointValues.IntersectVisitor;
import org.apache.lucene.index.PointValues.Relation;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SloppyMath;
import org.apache.lucene.util.bkd.BKDReader;
import org.apache.lucene.util.bkd.BKDReader.IndexTree;
import org.apache.lucene.util.bkd.BKDReader.IntersectState;

import static org.apache.lucene.geo.GeoEncodingUtils.decodeLatitude;
import static org.apache.lucene.geo.GeoEncodingUtils.decodeLongitude;

KNN search on top of 2D lat/lon indexed points.
@lucene.experimental
/** * KNN search on top of 2D lat/lon indexed points. * * @lucene.experimental */
class NearestNeighbor { static class Cell implements Comparable<Cell> { final int readerIndex; final byte[] minPacked; final byte[] maxPacked; final IndexTree index;
The closest distance from a point in this cell to the query point, computed as a sort key through SloppyMath.haversinSortKey. Note that this is an approximation to the closest distance, and there could be a point in the cell that is closer.
/** * The closest distance from a point in this cell to the query point, computed as a sort key through * {@link SloppyMath#haversinSortKey}. Note that this is an approximation to the closest distance, * and there could be a point in the cell that is closer. */
final double distanceSortKey; public Cell(IndexTree index, int readerIndex, byte[] minPacked, byte[] maxPacked, double distanceSortKey) { this.index = index; this.readerIndex = readerIndex; this.minPacked = minPacked.clone(); this.maxPacked = maxPacked.clone(); this.distanceSortKey = distanceSortKey; } public int compareTo(Cell other) { return Double.compare(distanceSortKey, other.distanceSortKey); } @Override public String toString() { double minLat = decodeLatitude(minPacked, 0); double minLon = decodeLongitude(minPacked, Integer.BYTES); double maxLat = decodeLatitude(maxPacked, 0); double maxLon = decodeLongitude(maxPacked, Integer.BYTES); return "Cell(readerIndex=" + readerIndex + " nodeID=" + index.getNodeID() + " isLeaf=" + index.isLeafNode() + " lat=" + minLat + " TO " + maxLat + ", lon=" + minLon + " TO " + maxLon + "; distanceSortKey=" + distanceSortKey + ")"; } } private static class NearestVisitor implements IntersectVisitor { public int curDocBase; public Bits curLiveDocs; final int topN; final PriorityQueue<NearestHit> hitQueue; final double pointLat; final double pointLon; private int setBottomCounter; private double minLon = Double.NEGATIVE_INFINITY; private double maxLon = Double.POSITIVE_INFINITY; private double minLat = Double.NEGATIVE_INFINITY; private double maxLat = Double.POSITIVE_INFINITY; // second set of longitude ranges to check (for cross-dateline case) private double minLon2 = Double.POSITIVE_INFINITY; public NearestVisitor(PriorityQueue<NearestHit> hitQueue, int topN, double pointLat, double pointLon) { this.hitQueue = hitQueue; this.topN = topN; this.pointLat = pointLat; this.pointLon = pointLon; } @Override public void visit(int docID) { throw new AssertionError(); } private void maybeUpdateBBox() { if (setBottomCounter < 1024 || (setBottomCounter & 0x3F) == 0x3F) { NearestHit hit = hitQueue.peek(); Rectangle box = Rectangle.fromPointDistance(pointLat, pointLon, SloppyMath.haversinMeters(hit.distanceSortKey)); //System.out.println(" update bbox to " + box); minLat = box.minLat; maxLat = box.maxLat; if (box.crossesDateline()) { // box1 minLon = Double.NEGATIVE_INFINITY; maxLon = box.maxLon; // box2 minLon2 = box.minLon; } else { minLon = box.minLon; maxLon = box.maxLon; // disable box2 minLon2 = Double.POSITIVE_INFINITY; } } setBottomCounter++; } @Override public void visit(int docID, byte[] packedValue) { //System.out.println("visit docID=" + docID + " liveDocs=" + curLiveDocs); if (curLiveDocs != null && curLiveDocs.get(docID) == false) { return; } double docLatitude = decodeLatitude(packedValue, 0); double docLongitude = decodeLongitude(packedValue, Integer.BYTES); // test bounding box if (docLatitude < minLat || docLatitude > maxLat) { return; } if ((docLongitude < minLon || docLongitude > maxLon) && (docLongitude < minLon2)) { return; } // Use the haversin sort key when comparing hits, as it is faster to compute than the true distance. double distanceSortKey = SloppyMath.haversinSortKey(pointLat, pointLon, docLatitude, docLongitude); //System.out.println(" visit docID=" + docID + " distanceSortKey=" + distanceSortKey + " docLat=" + docLatitude + " docLon=" + docLongitude); int fullDocID = curDocBase + docID; if (hitQueue.size() == topN) { // queue already full NearestHit hit = hitQueue.peek(); //System.out.println(" bottom distanceSortKey=" + hit.distanceSortKey); // we don't collect docs in order here, so we must also test the tie-break case ourselves: if (distanceSortKey < hit.distanceSortKey || (distanceSortKey == hit.distanceSortKey && fullDocID < hit.docID)) { hitQueue.poll(); hit.docID = fullDocID; hit.distanceSortKey = distanceSortKey; hitQueue.offer(hit); //System.out.println(" ** keep2, now bottom=" + hit); maybeUpdateBBox(); } } else { NearestHit hit = new NearestHit(); hit.docID = fullDocID; hit.distanceSortKey = distanceSortKey; hitQueue.offer(hit); //System.out.println(" ** keep1, now bottom=" + hit); } } @Override public Relation compare(byte[] minPackedValue, byte[] maxPackedValue) { double cellMinLat = decodeLatitude(minPackedValue, 0); double cellMinLon = decodeLongitude(minPackedValue, Integer.BYTES); double cellMaxLat = decodeLatitude(maxPackedValue, 0); double cellMaxLon = decodeLongitude(maxPackedValue, Integer.BYTES); if (cellMaxLat < minLat || maxLat < cellMinLat || ((cellMaxLon < minLon || maxLon < cellMinLon) && cellMaxLon < minLon2)) { // this cell is outside our search bbox; don't bother exploring any more return Relation.CELL_OUTSIDE_QUERY; } return Relation.CELL_CROSSES_QUERY; } }
Holds one hit from NearestNeighbor.nearest
/** Holds one hit from {@link NearestNeighbor#nearest} */
static class NearestHit { public int docID;
The distance from the hit to the query point, computed as a sort key through SloppyMath.haversinSortKey.
/** * The distance from the hit to the query point, computed as a sort key through {@link SloppyMath#haversinSortKey}. */
public double distanceSortKey; @Override public String toString() { return "NearestHit(docID=" + docID + " distanceSortKey=" + distanceSortKey + ")"; } } // TODO: can we somehow share more with, or simply directly use, the LatLonPointDistanceComparator? It's really doing the same thing as // our hitQueue... public static NearestHit[] nearest(double pointLat, double pointLon, List<BKDReader> readers, List<Bits> liveDocs, List<Integer> docBases, final int n) throws IOException { //System.out.println("NEAREST: readers=" + readers + " liveDocs=" + liveDocs + " pointLat=" + pointLat + " pointLon=" + pointLon); // Holds closest collected points seen so far: // TODO: if we used lucene's PQ we could just updateTop instead of poll/offer: final PriorityQueue<NearestHit> hitQueue = new PriorityQueue<>(n, new Comparator<NearestHit>() { @Override public int compare(NearestHit a, NearestHit b) { // sort by opposite distanceSortKey natural order int cmp = Double.compare(a.distanceSortKey, b.distanceSortKey); if (cmp != 0) { return -cmp; } // tie-break by higher docID: return b.docID - a.docID; } }); // Holds all cells, sorted by closest to the point: PriorityQueue<Cell> cellQueue = new PriorityQueue<>(); NearestVisitor visitor = new NearestVisitor(hitQueue, n, pointLat, pointLon); List<BKDReader.IntersectState> states = new ArrayList<>(); // Add root cell for each reader into the queue: int bytesPerDim = -1; for(int i=0;i<readers.size();i++) { BKDReader reader = readers.get(i); if (bytesPerDim == -1) { bytesPerDim = reader.getBytesPerDimension(); } else if (bytesPerDim != reader.getBytesPerDimension()) { throw new IllegalStateException("bytesPerDim changed from " + bytesPerDim + " to " + reader.getBytesPerDimension() + " across readers"); } byte[] minPackedValue = reader.getMinPackedValue(); byte[] maxPackedValue = reader.getMaxPackedValue(); IntersectState state = reader.getIntersectState(visitor); states.add(state); cellQueue.offer(new Cell(state.index, i, reader.getMinPackedValue(), reader.getMaxPackedValue(), approxBestDistance(minPackedValue, maxPackedValue, pointLat, pointLon))); } while (cellQueue.size() > 0) { Cell cell = cellQueue.poll(); //System.out.println(" visit " + cell); // TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to break once this "best" cell is fully outside of the hitQueue bottom's radius: BKDReader reader = readers.get(cell.readerIndex); if (cell.index.isLeafNode()) { //System.out.println(" leaf"); // Leaf block: visit all points and possibly collect them: visitor.curDocBase = docBases.get(cell.readerIndex); visitor.curLiveDocs = liveDocs.get(cell.readerIndex); reader.visitLeafBlockValues(cell.index, states.get(cell.readerIndex)); //System.out.println(" now " + hitQueue.size() + " hits"); } else { //System.out.println(" non-leaf"); // Non-leaf block: split into two cells and put them back into the queue: if (visitor.compare(cell.minPacked, cell.maxPacked) == Relation.CELL_OUTSIDE_QUERY) { continue; } BytesRef splitValue = BytesRef.deepCopyOf(cell.index.getSplitDimValue()); int splitDim = cell.index.getSplitDim(); // we must clone the index so that we we can recurse left and right "concurrently": IndexTree newIndex = cell.index.clone(); byte[] splitPackedValue = cell.maxPacked.clone(); System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim); cell.index.pushLeft(); cellQueue.offer(new Cell(cell.index, cell.readerIndex, cell.minPacked, splitPackedValue, approxBestDistance(cell.minPacked, splitPackedValue, pointLat, pointLon))); splitPackedValue = cell.minPacked.clone(); System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim*bytesPerDim, bytesPerDim); newIndex.pushRight(); cellQueue.offer(new Cell(newIndex, cell.readerIndex, splitPackedValue, cell.maxPacked, approxBestDistance(splitPackedValue, cell.maxPacked, pointLat, pointLon))); } } NearestHit[] hits = new NearestHit[hitQueue.size()]; int downTo = hitQueue.size()-1; while (hitQueue.size() != 0) { hits[downTo] = hitQueue.poll(); downTo--; } return hits; } // NOTE: incoming args never cross the dateline, since they are a BKD cell private static double approxBestDistance(byte[] minPackedValue, byte[] maxPackedValue, double pointLat, double pointLon) { double minLat = decodeLatitude(minPackedValue, 0); double minLon = decodeLongitude(minPackedValue, Integer.BYTES); double maxLat = decodeLatitude(maxPackedValue, 0); double maxLon = decodeLongitude(maxPackedValue, Integer.BYTES); return approxBestDistance(minLat, maxLat, minLon, maxLon, pointLat, pointLon); } // NOTE: incoming args never cross the dateline, since they are a BKD cell private static double approxBestDistance(double minLat, double maxLat, double minLon, double maxLon, double pointLat, double pointLon) { // TODO: can we make this the trueBestDistance? I.e., minimum distance between the point and ANY point on the box? we can speed things // up if so, but not enrolling any BKD cell whose true best distance is > bottom of the current hit queue if (pointLat >= minLat && pointLat <= maxLat && pointLon >= minLon && pointLon <= maxLon) { // point is inside the cell! return 0.0; } double d1 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, minLon); double d2 = SloppyMath.haversinSortKey(pointLat, pointLon, minLat, maxLon); double d3 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, maxLon); double d4 = SloppyMath.haversinSortKey(pointLat, pointLon, maxLat, minLon); return Math.min(Math.min(d1, d2), Math.min(d3, d4)); } }