/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.index;


import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.PriorityQueue;

/**
 * Exposes {@link TermsEnum} API, merged from {@link TermsEnum} API of sub-segments.
 * This does a merge sort, by term text, of the sub-readers.
 *
 * @lucene.experimental
 */
public final class MultiTermsEnum extends BaseTermsEnum {

  private static final Comparator<TermsEnumWithSlice> INDEX_COMPARATOR = new Comparator<TermsEnumWithSlice>() {
    @Override
    public int compare(TermsEnumWithSlice o1, TermsEnumWithSlice o2) {
      return o1.index - o2.index;
    }
  };

  private final TermMergeQueue queue;
  private final TermsEnumWithSlice[] subs;        // all of our subs (one per sub-reader)
  private final TermsEnumWithSlice[] currentSubs; // current subs that have at least one term for this field
  private final TermsEnumWithSlice[] top;
  private final MultiPostingsEnum.EnumWithSlice[] subDocs;

  private BytesRef lastSeek;
  private boolean lastSeekExact;
  private final BytesRefBuilder lastSeekScratch = new BytesRefBuilder();

  private int numTop;
  private int numSubs;
  private BytesRef current;

  static class TermsEnumIndex {
    public final static TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0];
    final int subIndex;
    final TermsEnum termsEnum;

    public TermsEnumIndex(TermsEnum termsEnum, int subIndex) {
      this.termsEnum = termsEnum;
      this.subIndex = subIndex;
    }
  }
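
  // Illustrative example of the merged iteration (values are hypothetical): if sub 0
  // enumerates [apple, cherry] and sub 1 enumerates [banana, cherry], this enum returns
  // apple, banana, cherry in sorted order. While positioned on "cherry", both subs match,
  // so getMatchCount() returns 2 and getMatchArray() exposes both slices.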
  /** Returns how many sub-reader slices contain the current
   *  term.  @see #getMatchArray */
  public int getMatchCount() {
    return numTop;
  }
  /** Returns sub-reader slices positioned to the current term. */
  public TermsEnumWithSlice[] getMatchArray() {
    return top;
  }
  /** Sole constructor.
   *  @param slices Which sub-reader slices we should merge. */
  public MultiTermsEnum(ReaderSlice[] slices) {
    queue = new TermMergeQueue(slices.length);
    top = new TermsEnumWithSlice[slices.length];
    subs = new TermsEnumWithSlice[slices.length];
    subDocs = new MultiPostingsEnum.EnumWithSlice[slices.length];
    for(int i=0;i<slices.length;i++) {
      subs[i] = new TermsEnumWithSlice(i, slices[i]);
      subDocs[i] = new MultiPostingsEnum.EnumWithSlice();
      subDocs[i].slice = slices[i];
    }
    currentSubs = new TermsEnumWithSlice[slices.length];
  }

  @Override
  public BytesRef term() {
    return current;
  }
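
  // Note: a freshly constructed MultiTermsEnum has no live subs (numSubs == 0);
  // reset(...) below installs the per-segment enums and primes the merge queue
  // before any iteration or seeking can take place.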
  /** The terms array must be newly created TermsEnum, ie
   *  {@link TermsEnum#next} has not yet been called. */
  public TermsEnum reset(TermsEnumIndex[] termsEnumsIndex) throws IOException {
    assert termsEnumsIndex.length <= top.length;
    numSubs = 0;
    numTop = 0;
    queue.clear();
    for(int i=0;i<termsEnumsIndex.length;i++) {
      final TermsEnumIndex termsEnumIndex = termsEnumsIndex[i];
      assert termsEnumIndex != null;
      final BytesRef term = termsEnumIndex.termsEnum.next();
      if (term != null) {
        final TermsEnumWithSlice entry = subs[termsEnumIndex.subIndex];
        entry.reset(termsEnumIndex.termsEnum, term);
        queue.add(entry);
        currentSubs[numSubs++] = entry;
      } else {
        // field has no terms
      }
    }

    if (queue.size() == 0) {
      return TermsEnum.EMPTY;
    } else {
      return this;
    }
  }

  @Override
  public boolean seekExact(BytesRef term) throws IOException {
    queue.clear();
    numTop = 0;

    boolean seekOpt = false;
    if (lastSeek != null && lastSeek.compareTo(term) <= 0) {
      seekOpt = true;
    }

    lastSeek = null;
    lastSeekExact = true;

    for(int i=0;i<numSubs;i++) {
      final boolean status;
      // LUCENE-2130: if we had just seek'd already, prior
      // to this seek, and the new seek term is after the
      // previous one, don't try to re-seek this sub if its
      // current term is already beyond this new seek term.
      // Doing so is a waste because this sub will simply
      // seek to the same spot.
      if (seekOpt) {
        final BytesRef curTerm = currentSubs[i].current;
        if (curTerm != null) {
          final int cmp = term.compareTo(curTerm);
          if (cmp == 0) {
            status = true;
          } else if (cmp < 0) {
            status = false;
          } else {
            status = currentSubs[i].terms.seekExact(term);
          }
        } else {
          status = false;
        }
      } else {
        status = currentSubs[i].terms.seekExact(term);
      }

      if (status) {
        top[numTop++] = currentSubs[i];
        current = currentSubs[i].current = currentSubs[i].terms.term();
        assert term.equals(currentSubs[i].current);
      }
    }

    // if at least one sub had exact match to the requested
    // term then we found match
    return numTop > 0;
  }

  @Override
  public SeekStatus seekCeil(BytesRef term) throws IOException {
    queue.clear();
    numTop = 0;
    lastSeekExact = false;

    boolean seekOpt = false;
    if (lastSeek != null && lastSeek.compareTo(term) <= 0) {
      seekOpt = true;
    }

    lastSeekScratch.copyBytes(term);
    lastSeek = lastSeekScratch.get();

    for(int i=0;i<numSubs;i++) {
      final SeekStatus status;
      // LUCENE-2130: if we had just seek'd already, prior
      // to this seek, and the new seek term is after the
      // previous one, don't try to re-seek this sub if its
      // current term is already beyond this new seek term.
      // Doing so is a waste because this sub will simply
      // seek to the same spot.
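      // For example, if the previous seek left this sub positioned on "dog" and the
      // new target is "cat", the compareTo check below reports NOT_FOUND directly
      // ("dog" is already the ceiling of "cat") without re-seeking the sub.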
      if (seekOpt) {
        final BytesRef curTerm = currentSubs[i].current;
        if (curTerm != null) {
          final int cmp = term.compareTo(curTerm);
          if (cmp == 0) {
            status = SeekStatus.FOUND;
          } else if (cmp < 0) {
            status = SeekStatus.NOT_FOUND;
          } else {
            status = currentSubs[i].terms.seekCeil(term);
          }
        } else {
          status = SeekStatus.END;
        }
      } else {
        status = currentSubs[i].terms.seekCeil(term);
      }

      if (status == SeekStatus.FOUND) {
        top[numTop++] = currentSubs[i];
        current = currentSubs[i].current = currentSubs[i].terms.term();
        queue.add(currentSubs[i]);
      } else {
        if (status == SeekStatus.NOT_FOUND) {
          currentSubs[i].current = currentSubs[i].terms.term();
          assert currentSubs[i].current != null;
          queue.add(currentSubs[i]);
        } else {
          assert status == SeekStatus.END;
          // enum exhausted
          currentSubs[i].current = null;
        }
      }
    }

    if (numTop > 0) {
      // at least one sub had exact match to the requested term
      return SeekStatus.FOUND;
    } else if (queue.size() > 0) {
      // no sub had exact match, but at least one sub found
      // a term after the requested term -- advance to that
      // next term:
      pullTop();
      return SeekStatus.NOT_FOUND;
    } else {
      return SeekStatus.END;
    }
  }

  @Override
  public void seekExact(long ord) {
    throw new UnsupportedOperationException();
  }

  @Override
  public long ord() {
    throw new UnsupportedOperationException();
  }

  private void pullTop() {
    // extract all subs from the queue that have the same
    // top term
    assert numTop == 0;
    numTop = queue.fillTop(top);
    current = top[0].current;
  }

  private void pushTop() throws IOException {
    // call next() on each top, and reorder queue
    for (int i = 0; i < numTop; i++) {
      TermsEnumWithSlice top = queue.top();
      top.current = top.terms.next();
      if (top.current == null) {
        queue.pop();
      } else {
        queue.updateTop();
      }
    }
    numTop = 0;
  }

  @Override
  public BytesRef next() throws IOException {
    if (lastSeekExact) {
      // Must seekCeil at this point, so those subs that
      // didn't have the term can find the following term.
      // NOTE: we could save some CPU by only seekCeil the
      // subs that didn't match the last exact seek... but
      // most impls short-circuit if you seekCeil to term
      // they are already on.
      final SeekStatus status = seekCeil(current);
      assert status == SeekStatus.FOUND;
      lastSeekExact = false;
    }
    lastSeek = null;

    // restore queue
    pushTop();

    // gather equal top fields
    if (queue.size() > 0) {
      // TODO: we could maybe defer this somewhat costly operation until one of the APIs that
      // needs to see the top is invoked (docFreq, postings, etc.)
      pullTop();
    } else {
      current = null;
    }

    return current;
  }

  @Override
  public int docFreq() throws IOException {
    int sum = 0;
    for(int i=0;i<numTop;i++) {
      sum += top[i].terms.docFreq();
    }
    return sum;
  }

  @Override
  public long totalTermFreq() throws IOException {
    long sum = 0;
    for(int i=0;i<numTop;i++) {
      final long v = top[i].terms.totalTermFreq();
      assert v != -1;
      sum += v;
    }
    return sum;
  }

  @Override
  public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
    MultiPostingsEnum docsEnum;

    // Can only reuse if incoming enum is also a MultiDocsEnum
    if (reuse != null && reuse instanceof MultiPostingsEnum) {
      docsEnum = (MultiPostingsEnum) reuse;
      // ... and was previously created w/ this MultiTermsEnum:
      if (!docsEnum.canReuse(this)) {
        docsEnum = new MultiPostingsEnum(this, subs.length);
      }
    } else {
      docsEnum = new MultiPostingsEnum(this, subs.length);
    }

    int upto = 0;

    ArrayUtil.timSort(top, 0, numTop, INDEX_COMPARATOR);

    for(int i=0;i<numTop;i++) {

      final TermsEnumWithSlice entry = top[i];

      assert entry.index < docsEnum.subPostingsEnums.length: entry.index + " vs " + docsEnum.subPostingsEnums.length + "; " + subs.length;
      final PostingsEnum subPostingsEnum = entry.terms.postings(docsEnum.subPostingsEnums[entry.index], flags);
      assert subPostingsEnum != null;
      docsEnum.subPostingsEnums[entry.index] = subPostingsEnum;

      subDocs[upto].postingsEnum = subPostingsEnum;
      subDocs[upto].slice = entry.subSlice;
      upto++;
    }

    return docsEnum.reset(subDocs, upto);
  }

  @Override
  public ImpactsEnum impacts(int flags) throws IOException {
    // implemented to not fail CheckIndex, but you shouldn't be using impacts on a slow reader
    return new SlowImpactsEnum(postings(null, flags));
  }

  final static class TermsEnumWithSlice {
    private final ReaderSlice subSlice;
    TermsEnum terms;
    public BytesRef current;
    final int index;

    public TermsEnumWithSlice(int index, ReaderSlice subSlice) {
      this.subSlice = subSlice;
      this.index = index;
      assert subSlice.length >= 0: "length=" + subSlice.length;
    }

    public void reset(TermsEnum terms, BytesRef term) {
      this.terms = terms;
      current = term;
    }

    @Override
    public String toString() {
      return subSlice.toString()+":"+terms;
    }
  }

  private final static class TermMergeQueue extends PriorityQueue<TermsEnumWithSlice> {

    final int[] stack;

    TermMergeQueue(int size) {
      super(size);
      this.stack = new int[size];
    }

    @Override
    protected boolean lessThan(TermsEnumWithSlice termsA, TermsEnumWithSlice termsB) {
      return termsA.current.compareTo(termsB.current) < 0;
    }
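
    // The heap array returned by PriorityQueue#getHeapArray() is 1-based: the top
    // element sits at index 1, and the children of the node at index i sit at
    // indices 2*i and 2*i+1. fillTop below walks that implicit tree, descending
    // only into subtrees whose root equals the current top term.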
    /** Add the {@link #top()} slice as well as all slices that are positioned
     *  on the same term to {@code tops} and return how many of them there are. */
    int fillTop(TermsEnumWithSlice[] tops) {
      final int size = size();
      if (size == 0) {
        return 0;
      }
      tops[0] = top();
      int numTop = 1;
      stack[0] = 1;
      int stackLen = 1;

      while (stackLen != 0) {
        final int index = stack[--stackLen];
        final int leftChild = index << 1;
        for (int child = leftChild, end = Math.min(size, leftChild + 1); child <= end; ++child) {
          TermsEnumWithSlice te = get(child);
          if (te.current.equals(tops[0].current)) {
            tops[numTop++] = te;
            stack[stackLen++] = child;
          }
        }
      }

      return numTop;
    }

    private TermsEnumWithSlice get(int i) {
      return (TermsEnumWithSlice) getHeapArray()[i];
    }
  }

  @Override
  public String toString() {
    return "MultiTermsEnum(" + Arrays.toString(subs) + ")";
  }
}
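
// Usage sketch (not part of this class): callers normally do not construct a
// MultiTermsEnum directly; a merged view is typically obtained through the MultiTerms
// helper in this package, assuming a Lucene version that provides it, e.g.:
//
//   Terms terms = MultiTerms.getTerms(indexReader, "body");
//   if (terms != null) {
//     TermsEnum te = terms.iterator();   // a MultiTermsEnum when the reader has multiple leaves
//     BytesRef term;
//     while ((term = te.next()) != null) {
//       System.out.println(term.utf8ToString() + " docFreq=" + te.docFreq());
//     }
//   }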