/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.ByteRunAutomaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
import org.apache.lucene.util.automaton.Transition;

The "intersect" TermsEnum response to UniformSplitTerms.intersect(CompiledAutomaton, BytesRef), intersecting the terms with an automaton.

By design of the UniformSplit block keys, it is less efficient than org.apache.lucene.codecs.blocktree.IntersectTermsEnum for FuzzyQuery (-37%). It is slightly slower for WildcardQuery (-5%) and slightly faster for PrefixQuery (+5%).

@lucene.experimental
/** * The "intersect" {@link TermsEnum} response to {@link UniformSplitTerms#intersect(CompiledAutomaton, BytesRef)}, * intersecting the terms with an automaton. * <p> * By design of the UniformSplit block keys, it is less efficient than * {@code org.apache.lucene.codecs.blocktree.IntersectTermsEnum} for {@link org.apache.lucene.search.FuzzyQuery} (-37%). * It is slightly slower for {@link org.apache.lucene.search.WildcardQuery} (-5%) and slightly faster for * {@link org.apache.lucene.search.PrefixQuery} (+5%). * * @lucene.experimental */
public class IntersectBlockReader extends BlockReader {
Block iteration order. Whether to move next block, jump to a block away, or end the iteration.
/** * Block iteration order. Whether to move next block, jump to a block away, or end the iteration. */
protected enum BlockIteration {NEXT, SEEK, END}
Threshold that controls when to attempt to jump to a block away.

This counter is 0 when entering a block. It is incremented each time a term is rejected by the automaton. When the counter is greater than or equal to this threshold, then we compute the next term accepted by the automaton, with AutomatonNextTermCalculator, and we jump to a block away if the next term accepted is greater than the immediate next term in the block.

A low value, for example 1, improves the performance of automatons requiring many jumps, for example FuzzyQuery and most WildcardQuery. A higher value improves the performance of automatons with less or no jump, for example PrefixQuery. A threshold of 4 seems to be a good balance.

/** * Threshold that controls when to attempt to jump to a block away. * <p> * This counter is 0 when entering a block. It is incremented each time a term is rejected by the automaton. * When the counter is greater than or equal to this threshold, then we compute the next term accepted by * the automaton, with {@link AutomatonNextTermCalculator}, and we jump to a block away if the next term * accepted is greater than the immediate next term in the block. * <p> * A low value, for example 1, improves the performance of automatons requiring many jumps, for example * {@link org.apache.lucene.search.FuzzyQuery} and most {@link org.apache.lucene.search.WildcardQuery}. * A higher value improves the performance of automatons with less or no jump, for example * {@link org.apache.lucene.search.PrefixQuery}. * A threshold of 4 seems to be a good balance. */
protected final int NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD = 4; protected final Automaton automaton; protected final ByteRunAutomaton runAutomaton; protected final boolean finite; protected final BytesRef commonSuffix; // maybe null protected final int minTermLength; protected final AutomatonNextTermCalculator nextStringCalculator;
Set this when our current mode is seeking to this term. Set to null after.
/** * Set this when our current mode is seeking to this term. Set to null after. */
protected BytesRef seekTerm;
Number of bytes accepted by the automaton when validating the current term.
/** * Number of bytes accepted by the automaton when validating the current term. */
protected int numMatchedBytes;
Automaton states reached when validating the current term, from 0 to numMatchedBytes - 1.
/** * Automaton states reached when validating the current term, from 0 to {@link #numMatchedBytes} - 1. */
protected int[] states;
Block iteration order determined when scanning the terms in the current block.
/** * Block iteration order determined when scanning the terms in the current block. */
protected BlockIteration blockIteration;
Counter of the number of consecutively rejected terms. Depending on NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD, this may trigger a jump to a block away.
/** * Counter of the number of consecutively rejected terms. * Depending on {@link #NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD}, this may trigger a jump to a block away. */
protected int numConsecutivelyRejectedTerms; protected IntersectBlockReader(CompiledAutomaton compiled, BytesRef startTerm, IndexDictionary.BrowserSupplier dictionaryBrowserSupplier, IndexInput blockInput, PostingsReaderBase postingsReader, FieldMetadata fieldMetadata, BlockDecoder blockDecoder) throws IOException { super(dictionaryBrowserSupplier, blockInput, postingsReader, fieldMetadata, blockDecoder); automaton = compiled.automaton; runAutomaton = compiled.runAutomaton; finite = compiled.finite; commonSuffix = compiled.commonSuffixRef; minTermLength = getMinTermLength(); nextStringCalculator = new AutomatonNextTermCalculator(compiled); seekTerm = startTerm; }
Computes the minimal length of the terms accepted by the automaton. This speeds up the term scanning for automatons accepting a finite language.
/** * Computes the minimal length of the terms accepted by the automaton. * This speeds up the term scanning for automatons accepting a finite language. */
protected int getMinTermLength() { // Automatons accepting infinite language (e.g. PrefixQuery and WildcardQuery) do not benefit much from // min term length while it takes time to compute it. More precisely, by skipping this computation PrefixQuery // is significantly boosted while WildcardQuery might be slightly degraded on average. This min term length // mainly boosts FuzzyQuery. int commonSuffixLength = commonSuffix == null ? 0 : commonSuffix.length; if (!finite) { return commonSuffixLength; } // Since we are only dealing with finite language, there is no loop to detect. int commonPrefixLength = 0; int state = 0; Transition t = null; while (true) { if (runAutomaton.isAccept(state)) { // The common prefix reaches a final state. So common prefix and common suffix overlap. // Min term length is the max between common prefix and common suffix lengths. return Math.max(commonPrefixLength, commonSuffixLength); } if (automaton.getNumTransitions(state) == 1) { if (t == null) { t = new Transition(); } automaton.getTransition(state, 0, t); if (t.min == t.max) { state = t.dest; commonPrefixLength++; continue; } } break; } // Min term length is the sum of common prefix and common suffix lengths. return commonPrefixLength + commonSuffixLength; } @Override public BytesRef next() throws IOException { if (blockHeader == null) { if (!seekFirstBlock()) { return null; } states = new int[32]; blockIteration = BlockIteration.NEXT; } termState = null; do { BytesRef term = nextTermInBlockMatching(); if (term != null) { return term; } } while (nextBlock()); return null; } protected boolean seekFirstBlock() throws IOException { seekTerm = nextStringCalculator.nextSeekTerm(seekTerm); if (seekTerm == null) { return false; } long blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm); if (blockStartFP == -1) { blockStartFP = fieldMetadata.getFirstBlockStartFP(); } else if (isBeyondLastTerm(seekTerm, blockStartFP)) { return false; } initializeHeader(seekTerm, blockStartFP); return blockHeader != null; }
Finds the next block line that matches (accepted by the automaton), or null when at end of block.
Returns:The next term in the current block that is accepted by the automaton; or null if none.
/** * Finds the next block line that matches (accepted by the automaton), or null when at end of block. * * @return The next term in the current block that is accepted by the automaton; or null if none. */
protected BytesRef nextTermInBlockMatching() throws IOException { if (seekTerm == null) { if (readLineInBlock() == null) { return null; } } else { SeekStatus seekStatus = seekInBlock(seekTerm); seekTerm = null; if (seekStatus == SeekStatus.END) { return null; } assert numMatchedBytes == 0; assert numConsecutivelyRejectedTerms == 0; } while (true) { TermBytes lineTermBytes = blockLine.getTermBytes(); BytesRef lineTerm = lineTermBytes.getTerm(); assert lineTerm.offset == 0; if (states.length <= lineTerm.length) { states = ArrayUtil.growExact(states, ArrayUtil.oversize(lineTerm.length + 1, Byte.BYTES)); } // Since terms are delta encoded, we may start the automaton steps from the last state reached by the previous term. int index = Math.min(lineTermBytes.getSuffixOffset(), numMatchedBytes); // Skip this term early if it is shorter than the min term length, or if it does not end with the common suffix // accepted by the automaton. if (lineTerm.length >= minTermLength && (commonSuffix == null || endsWithCommonSuffix(lineTerm.bytes, lineTerm.length))) { int state = states[index]; while (true) { if (index == lineTerm.length) { if (runAutomaton.isAccept(state)) { // The automaton accepts the current term. Record the number of matched bytes and return the term. assert runAutomaton.run(lineTerm.bytes, 0, lineTerm.length); numMatchedBytes = index; if (numConsecutivelyRejectedTerms > 0) { numConsecutivelyRejectedTerms = 0; } assert blockIteration == BlockIteration.NEXT; return lineTerm; } break; } state = runAutomaton.step(state, lineTerm.bytes[index] & 0xff); if (state == -1) { // The automaton rejects the current term. break; } // Record the reached automaton state. states[++index] = state; } } // The current term is not accepted by the automaton. // Still record the reached automaton state to start the next term steps from there. assert !runAutomaton.run(lineTerm.bytes, 0, lineTerm.length); numMatchedBytes = index; // If the number of consecutively rejected terms reaches the threshold, // then determine whether it is worthwhile to jump to a block away. if (++numConsecutivelyRejectedTerms >= NUM_CONSECUTIVELY_REJECTED_TERMS_THRESHOLD && lineIndexInBlock < blockHeader.getLinesCount() - 1 && !nextStringCalculator.isLinearState(lineTerm)) { // Compute the next term accepted by the automaton after the current term. if ((seekTerm = nextStringCalculator.nextSeekTerm(lineTerm)) == null) { blockIteration = BlockIteration.END; return null; } // It is worthwhile to jump to a block away if the next term accepted is after the next term in the block. // Actually the block away may be the current block, but this is a good heuristic. readLineInBlock(); if (seekTerm.compareTo(blockLine.getTermBytes().getTerm()) > 0) { // Stop scanning this block terms and set the iteration order to jump to a block away by seeking seekTerm. blockIteration = BlockIteration.SEEK; return null; } seekTerm = null; // If it is not worthwhile to jump to a block away, do not attempt anymore for the current block. numConsecutivelyRejectedTerms = Integer.MIN_VALUE; } else if (readLineInBlock() == null) { // No more terms in the block. The iteration order is to open the very next block. assert blockIteration == BlockIteration.NEXT; return null; } } }
Indicates whether the given term ends with the automaton common suffix. This allows to quickly skip terms that the automaton would reject eventually.
/** * Indicates whether the given term ends with the automaton common suffix. * This allows to quickly skip terms that the automaton would reject eventually. */
protected boolean endsWithCommonSuffix(byte[] termBytes, int termLength) { byte[] suffixBytes = commonSuffix.bytes; int suffixLength = commonSuffix.length; int offset = termLength - suffixLength; assert offset >= 0; // We already checked minTermLength. for (int i = 0; i < suffixLength; i++) { if (termBytes[offset + i] != suffixBytes[i]) { return false; } } return true; }
Opens the next block. Depending on the blockIteration order, it may be the very next block, or a block away that may contain seekTerm.
Returns:true if the next block is opened; false if there is no blocks anymore and the iteration is over.
/** * Opens the next block. * Depending on the {@link #blockIteration} order, it may be the very next block, or a block away that may contain * {@link #seekTerm}. * * @return true if the next block is opened; false if there is no blocks anymore and the iteration is over. */
protected boolean nextBlock() throws IOException { long blockStartFP; switch (blockIteration) { case NEXT: assert seekTerm == null; blockStartFP = blockInput.getFilePointer(); break; case SEEK: assert seekTerm != null; blockStartFP = getOrCreateDictionaryBrowser().seekBlock(seekTerm); if (isBeyondLastTerm(seekTerm, blockStartFP)) { return false; } blockIteration = BlockIteration.NEXT; break; case END: return false; default: throw new UnsupportedOperationException("Unsupported " + BlockIteration.class.getSimpleName()); } numMatchedBytes = 0; numConsecutivelyRejectedTerms = 0; initializeHeader(seekTerm, blockStartFP); return blockHeader != null; } @Override public boolean seekExact(BytesRef text) { throw new UnsupportedOperationException(); } @Override public void seekExact(long ord) { throw new UnsupportedOperationException(); } @Override public void seekExact(BytesRef term, TermState state) { throw new UnsupportedOperationException(); } @Override public SeekStatus seekCeil(BytesRef text) { throw new UnsupportedOperationException(); }
This is mostly a copy of AutomatonTermsEnum. Since it's an inner class, the outer class can call methods that ATE does not expose. It'd be nice if ATE's logic could be more extensible.
/** * This is mostly a copy of AutomatonTermsEnum. Since it's an inner class, the outer class can * call methods that ATE does not expose. It'd be nice if ATE's logic could be more extensible. */
protected class AutomatonNextTermCalculator { // for path tracking: each short records gen when we last // visited the state; we use gens to avoid having to clear protected final short[] visited; protected short curGen; // the reference used for seeking forwards through the term dictionary protected final BytesRefBuilder seekBytesRef = new BytesRefBuilder(); // true if we are enumerating an infinite portion of the DFA. // in this case it is faster to drive the query based on the terms dictionary. // when this is true, linearUpperBound indicate the end of range // of terms where we should simply do sequential reads instead. protected boolean linear; protected final BytesRef linearUpperBound = new BytesRef(); protected final Transition transition = new Transition(); protected final IntsRefBuilder savedStates = new IntsRefBuilder(); protected AutomatonNextTermCalculator(CompiledAutomaton compiled) { visited = compiled.finite ? null : new short[runAutomaton.getSize()]; }
Records the given state has been visited.
/** * Records the given state has been visited. */
protected void setVisited(int state) { if (!finite) { visited[state] = curGen; } }
Indicates whether the given state has been visited.
/** * Indicates whether the given state has been visited. */
protected boolean isVisited(int state) { return !finite && visited[state] == curGen; }
True if the current state of the automata is best iterated linearly (without seeking).
/** * True if the current state of the automata is best iterated linearly (without seeking). */
protected boolean isLinearState(BytesRef term) { return linear && term.compareTo(linearUpperBound) < 0; }
See Also:
  • nextSeekTerm.nextSeekTerm(BytesRef)
/** * @see org.apache.lucene.index.FilteredTermsEnum#nextSeekTerm(BytesRef) */
protected BytesRef nextSeekTerm(final BytesRef term) { //System.out.println("ATE.nextSeekTerm term=" + term); if (term == null) { assert seekBytesRef.length() == 0; // return the empty term, as it's valid if (runAutomaton.isAccept(0)) { return seekBytesRef.get(); } } else { seekBytesRef.copyBytes(term); } // seek to the next possible string; if (nextString()) { return seekBytesRef.get(); // reposition } else { return null; // no more possible strings can match } }
Sets the enum to operate in linear fashion, as we have found a looping transition at position: we set an upper bound and act like a TermRangeQuery for this portion of the term space.
/** * Sets the enum to operate in linear fashion, as we have found * a looping transition at position: we set an upper bound and * act like a TermRangeQuery for this portion of the term space. */
protected void setLinear(int position) { assert linear == false; int state = 0; int maxInterval = 0xff; //System.out.println("setLinear pos=" + position + " seekbytesRef=" + seekBytesRef); for (int i = 0; i < position; i++) { state = runAutomaton.step(state, seekBytesRef.byteAt(i) & 0xff); assert state >= 0 : "state=" + state; } final int numTransitions = automaton.getNumTransitions(state); automaton.initTransition(state, transition); for (int i = 0; i < numTransitions; i++) { automaton.getNextTransition(transition); if (transition.min <= (seekBytesRef.byteAt(position) & 0xff) && (seekBytesRef.byteAt(position) & 0xff) <= transition.max) { maxInterval = transition.max; break; } } // 0xff terms don't get the optimization... not worth the trouble. if (maxInterval != 0xff) maxInterval++; int length = position + 1; /* position + maxTransition */ if (linearUpperBound.bytes.length < length) { linearUpperBound.bytes = new byte[ArrayUtil.oversize(length, Byte.BYTES)]; } System.arraycopy(seekBytesRef.bytes(), 0, linearUpperBound.bytes, 0, position); linearUpperBound.bytes[position] = (byte) maxInterval; linearUpperBound.length = length; linear = true; }
Increments the byte buffer to the next String in binary order after s that will not put the machine into a reject state. If such a string does not exist, returns false.

The correctness of this method depends upon the automaton being deterministic, and having no transitions to dead states.

Returns:true if more possible solutions exist for the DFA
/** * Increments the byte buffer to the next String in binary order after s that will not put * the machine into a reject state. If such a string does not exist, returns * false. * <p> * The correctness of this method depends upon the automaton being deterministic, * and having no transitions to dead states. * * @return true if more possible solutions exist for the DFA */
protected boolean nextString() { int state; int pos = 0; savedStates.grow(seekBytesRef.length() + 1); savedStates.setIntAt(0, 0); while (true) { if (!finite && ++curGen == 0) { // Clear the visited states every time curGen wraps (so very infrequently to not impact average perf). Arrays.fill(visited, (short) -1); } linear = false; // walk the automaton until a character is rejected. for (state = savedStates.intAt(pos); pos < seekBytesRef.length(); pos++) { setVisited(state); int nextState = runAutomaton.step(state, seekBytesRef.byteAt(pos) & 0xff); if (nextState == -1) break; savedStates.setIntAt(pos + 1, nextState); // we found a loop, record it for faster enumeration if (!linear && isVisited(nextState)) { setLinear(pos); } state = nextState; } // take the useful portion, and the last non-reject state, and attempt to // append characters that will match. if (nextString(state, pos)) { return true; } else { /* no more solutions exist from this useful portion, backtrack */ if ((pos = backtrack(pos)) < 0) /* no more solutions at all */ return false; final int newState = runAutomaton.step(savedStates.intAt(pos), seekBytesRef.byteAt(pos) & 0xff); if (newState >= 0 && runAutomaton.isAccept(newState)) /* String is good to go as-is */ return true; /* else advance further */ // paranoia? if we backtrack thru an infinite DFA, the loop detection is important! // for now, restart from scratch for all infinite DFAs if (!finite) pos = 0; } } }
Returns the next String in lexicographic order that will not put the machine into a reject state.

This method traverses the DFA from the given position in the String, starting at the given state.

If this cannot satisfy the machine, returns false. This method will walk the minimal path, in lexicographic order, as long as possible.

If this method returns false, then there might still be more solutions, it is necessary to backtrack to find out.

Params:
  • state – current non-reject state
  • position – useful portion of the string
Returns:true if more possible solutions exist for the DFA from this position
/** * Returns the next String in lexicographic order that will not put * the machine into a reject state. * <p> * This method traverses the DFA from the given position in the String, * starting at the given state. * <p> * If this cannot satisfy the machine, returns false. This method will * walk the minimal path, in lexicographic order, as long as possible. * <p> * If this method returns false, then there might still be more solutions, * it is necessary to backtrack to find out. * * @param state current non-reject state * @param position useful portion of the string * @return true if more possible solutions exist for the DFA from this * position */
protected boolean nextString(int state, int position) { /* * the next lexicographic character must be greater than the existing * character, if it exists. */ int c = 0; if (position < seekBytesRef.length()) { c = seekBytesRef.byteAt(position) & 0xff; // if the next byte is 0xff and is not part of the useful portion, // then by definition it puts us in a reject state, and therefore this // path is dead. there cannot be any higher transitions. backtrack. if (c++ == 0xff) return false; } seekBytesRef.setLength(position); setVisited(state); final int numTransitions = automaton.getNumTransitions(state); automaton.initTransition(state, transition); // find the minimal path (lexicographic order) that is >= c for (int i = 0; i < numTransitions; i++) { automaton.getNextTransition(transition); if (transition.max >= c) { int nextChar = Math.max(c, transition.min); // append either the next sequential char, or the minimum transition seekBytesRef.grow(seekBytesRef.length() + 1); seekBytesRef.append((byte) nextChar); state = transition.dest; /* * as long as is possible, continue down the minimal path in * lexicographic order. if a loop or accept state is encountered, stop. */ while (!isVisited(state) && !runAutomaton.isAccept(state)) { setVisited(state); /* * Note: we work with a DFA with no transitions to dead states. * so the below is ok, if it is not an accept state, * then there MUST be at least one transition. */ automaton.initTransition(state, transition); automaton.getNextTransition(transition); state = transition.dest; // append the minimum transition seekBytesRef.grow(seekBytesRef.length() + 1); seekBytesRef.append((byte) transition.min); // we found a loop, record it for faster enumeration if (!linear && isVisited(state)) { setLinear(seekBytesRef.length() - 1); } } return true; } } return false; }
Attempts to backtrack thru the string after encountering a dead end at some given position. Returns false if no more possible strings can match.
Params:
  • position – current position in the input String
Returns:position >= 0 if more possible solutions exist for the DFA
/** * Attempts to backtrack thru the string after encountering a dead end * at some given position. Returns false if no more possible strings * can match. * * @param position current position in the input String * @return {@code position >= 0} if more possible solutions exist for the DFA */
protected int backtrack(int position) { while (position-- > 0) { int nextChar = seekBytesRef.byteAt(position) & 0xff; // if a character is 0xff it's a dead-end too, // because there is no higher character in binary sort order. if (nextChar++ != 0xff) { seekBytesRef.setByteAt(position, (byte) nextChar); seekBytesRef.setLength(position + 1); return position; } } return -1; /* all solutions exhausted */ } } }