/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util.automaton;

  
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.SingleTermsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;

Immutable class holding compiled details for a given Automaton. The Automaton is deterministic, must not have dead states but is not necessarily minimal.
@lucene.experimental
/** * Immutable class holding compiled details for a given * Automaton. The Automaton is deterministic, must not have * dead states but is not necessarily minimal. * * @lucene.experimental */
public class CompiledAutomaton implements Accountable { private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(CompiledAutomaton.class);
Automata are compiled into different internal forms for the most efficient execution depending upon the language they accept.
/** * Automata are compiled into different internal forms for the * most efficient execution depending upon the language they accept. */
public enum AUTOMATON_TYPE {
Automaton that accepts no strings.
/** Automaton that accepts no strings. */
NONE,
Automaton that accepts all possible strings.
/** Automaton that accepts all possible strings. */
ALL,
Automaton that accepts only a single fixed string.
/** Automaton that accepts only a single fixed string. */
SINGLE,
Catch-all for any other automata.
/** Catch-all for any other automata. */
NORMAL };
If simplify is true this will be the "simplified" type; else, this is NORMAL
/** If simplify is true this will be the "simplified" type; else, this is NORMAL */
public final AUTOMATON_TYPE type;
For AUTOMATON_TYPE.SINGLE this is the singleton term.
/** * For {@link AUTOMATON_TYPE#SINGLE} this is the singleton term. */
public final BytesRef term;
Matcher for quickly determining if a byte[] is accepted. only valid for AUTOMATON_TYPE.NORMAL.
/** * Matcher for quickly determining if a byte[] is accepted. * only valid for {@link AUTOMATON_TYPE#NORMAL}. */
public final ByteRunAutomaton runAutomaton;
Two dimensional array of transitions, indexed by state number for traversal. The state numbering is consistent with runAutomaton. Only valid for AUTOMATON_TYPE.NORMAL.
/** * Two dimensional array of transitions, indexed by state * number for traversal. The state numbering is consistent with * {@link #runAutomaton}. * Only valid for {@link AUTOMATON_TYPE#NORMAL}. */
public final Automaton automaton;
Shared common suffix accepted by the automaton. Only valid for AUTOMATON_TYPE.NORMAL, and only when the automaton accepts an infinite language. This will be null if the common prefix is length 0.
/** * Shared common suffix accepted by the automaton. Only valid * for {@link AUTOMATON_TYPE#NORMAL}, and only when the * automaton accepts an infinite language. This will be null * if the common prefix is length 0. */
public final BytesRef commonSuffixRef;
Indicates if the automaton accepts a finite set of strings. Null if this was not computed. Only valid for AUTOMATON_TYPE.NORMAL.
/** * Indicates if the automaton accepts a finite set of strings. * Null if this was not computed. * Only valid for {@link AUTOMATON_TYPE#NORMAL}. */
public final Boolean finite;
Which state, if any, accepts all suffixes, else -1.
/** Which state, if any, accepts all suffixes, else -1. */
public final int sinkState;
Create this, passing simplify=true and finite=null, so that we try to simplify the automaton and determine if it is finite.
/** Create this, passing simplify=true and finite=null, so that we try * to simplify the automaton and determine if it is finite. */
public CompiledAutomaton(Automaton automaton) { this(automaton, null, true); }
Returns sink state, if present, else -1.
/** Returns sink state, if present, else -1. */
private static int findSinkState(Automaton automaton) { int numStates = automaton.getNumStates(); Transition t = new Transition(); int foundState = -1; for (int s=0;s<numStates;s++) { if (automaton.isAccept(s)) { int count = automaton.initTransition(s, t); boolean isSinkState = false; for(int i=0;i<count;i++) { automaton.getNextTransition(t); if (t.dest == s && t.min == 0 && t.max == 0xff) { isSinkState = true; break; } } if (isSinkState) { foundState = s; break; } } } return foundState; }
Create this. If finite is null, we use Operations.isFinite to determine whether it is finite. If simplify is true, we run possibly expensive operations to determine if the automaton is one the cases in AUTOMATON_TYPE.
/** Create this. If finite is null, we use {@link Operations#isFinite} * to determine whether it is finite. If simplify is true, we run * possibly expensive operations to determine if the automaton is one * the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. */
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify) { this(automaton, finite, simplify, Operations.DEFAULT_MAX_DETERMINIZED_STATES, false); }
Create this. If finite is null, we use Operations.isFinite to determine whether it is finite. If simplify is true, we run possibly expensive operations to determine if the automaton is one the cases in AUTOMATON_TYPE. If simplify requires determinizing the automaton then only maxDeterminizedStates will be created. Any more than that will cause a TooComplexToDeterminizeException.
/** Create this. If finite is null, we use {@link Operations#isFinite} * to determine whether it is finite. If simplify is true, we run * possibly expensive operations to determine if the automaton is one * the cases in {@link CompiledAutomaton.AUTOMATON_TYPE}. If simplify * requires determinizing the automaton then only maxDeterminizedStates * will be created. Any more than that will cause a * TooComplexToDeterminizeException. */
public CompiledAutomaton(Automaton automaton, Boolean finite, boolean simplify, int maxDeterminizedStates, boolean isBinary) { if (automaton.getNumStates() == 0) { automaton = new Automaton(); automaton.createState(); } if (simplify) { // Test whether the automaton is a "simple" form and // if so, don't create a runAutomaton. Note that on a // large automaton these tests could be costly: if (Operations.isEmpty(automaton)) { // matches nothing type = AUTOMATON_TYPE.NONE; term = null; commonSuffixRef = null; runAutomaton = null; this.automaton = null; this.finite = null; sinkState = -1; return; } boolean isTotal; // NOTE: only approximate, because automaton may not be minimal: if (isBinary) { isTotal = Operations.isTotal(automaton, 0, 0xff); } else { isTotal = Operations.isTotal(automaton); } if (isTotal) { // matches all possible strings type = AUTOMATON_TYPE.ALL; term = null; commonSuffixRef = null; runAutomaton = null; this.automaton = null; this.finite = null; sinkState = -1; return; } automaton = Operations.determinize(automaton, maxDeterminizedStates); IntsRef singleton = Operations.getSingleton(automaton); if (singleton != null) { // matches a fixed string type = AUTOMATON_TYPE.SINGLE; commonSuffixRef = null; runAutomaton = null; this.automaton = null; this.finite = null; if (isBinary) { term = StringHelper.intsRefToBytesRef(singleton); } else { term = new BytesRef(UnicodeUtil.newString(singleton.ints, singleton.offset, singleton.length)); } sinkState = -1; return; } } type = AUTOMATON_TYPE.NORMAL; term = null; if (finite == null) { this.finite = Operations.isFinite(automaton); } else { this.finite = finite; } Automaton binary; if (isBinary) { // Caller already built binary automaton themselves, e.g. PrefixQuery // does this since it can be provided with a binary (not necessarily // UTF8!) term: binary = automaton; } else { // Incoming automaton is unicode, and we must convert to UTF8 to match what's in the index: binary = new UTF32ToUTF8().convert(automaton); } if (this.finite) { commonSuffixRef = null; } else { // NOTE: this is a very costly operation! We should test if it's really warranted in practice... we could do a fast match // by looking for a sink state (which means it has no common suffix). Or maybe we shouldn't do it when simplify is false?: BytesRef suffix = Operations.getCommonSuffixBytesRef(binary, maxDeterminizedStates); if (suffix.length == 0) { commonSuffixRef = null; } else { commonSuffixRef = suffix; } } // This will determinize the binary automaton for us: runAutomaton = new ByteRunAutomaton(binary, true, maxDeterminizedStates); this.automaton = runAutomaton.automaton; // TODO: this is a bit fragile because if the automaton is not minimized there could be more than 1 sink state but auto-prefix will fail // to run for those: sinkState = findSinkState(this.automaton); } private Transition transition = new Transition(); //private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; private BytesRef addTail(int state, BytesRefBuilder term, int idx, int leadLabel) { //System.out.println("addTail state=" + state + " term=" + term.utf8ToString() + " idx=" + idx + " leadLabel=" + (char) leadLabel); //System.out.println(automaton.toDot()); // Find biggest transition that's < label // TODO: use binary search here int maxIndex = -1; int numTransitions = automaton.initTransition(state, transition); for(int i=0;i<numTransitions;i++) { automaton.getNextTransition(transition); if (transition.min < leadLabel) { maxIndex = i; } else { // Transitions are alway sorted break; } } //System.out.println(" maxIndex=" + maxIndex); assert maxIndex != -1; automaton.getTransition(state, maxIndex, transition); // Append floorLabel final int floorLabel; if (transition.max > leadLabel-1) { floorLabel = leadLabel-1; } else { floorLabel = transition.max; } //System.out.println(" floorLabel=" + (char) floorLabel); term.grow(1+idx); //if (DEBUG) System.out.println(" add floorLabel=" + (char) floorLabel + " idx=" + idx); term.setByteAt(idx, (byte) floorLabel); state = transition.dest; //System.out.println(" dest: " + state); idx++; // Push down to last accept state while (true) { numTransitions = automaton.getNumTransitions(state); if (numTransitions == 0) { //System.out.println("state=" + state + " 0 trans"); assert runAutomaton.isAccept(state); term.setLength(idx); //if (DEBUG) System.out.println(" return " + term.utf8ToString()); return term.get(); } else { // We are pushing "top" -- so get last label of // last transition: //System.out.println("get state=" + state + " numTrans=" + numTransitions); automaton.getTransition(state, numTransitions-1, transition); term.grow(1+idx); //if (DEBUG) System.out.println(" push maxLabel=" + (char) lastTransition.max + " idx=" + idx); //System.out.println(" add trans dest=" + scratch.dest + " label=" + (char) scratch.max); term.setByteAt(idx, (byte) transition.max); state = transition.dest; idx++; } } } // TODO: should this take startTerm too? This way // Terms.intersect could forward to this method if type != // NORMAL:
Return a TermsEnum intersecting the provided Terms with the terms accepted by this automaton.
/** Return a {@link TermsEnum} intersecting the provided {@link Terms} * with the terms accepted by this automaton. */
public TermsEnum getTermsEnum(Terms terms) throws IOException { switch(type) { case NONE: return TermsEnum.EMPTY; case ALL: return terms.iterator(); case SINGLE: return new SingleTermsEnum(terms.iterator(), term); case NORMAL: return terms.intersect(this, null); default: // unreachable throw new RuntimeException("unhandled case"); } }
Finds largest term accepted by this Automaton, that's <= the provided input term. The result is placed in output; it's fine for output and input to point to the same bytes. The returned result is either the provided output, or null if there is no floor term (ie, the provided input term is before the first term accepted by this Automaton).
/** Finds largest term accepted by this Automaton, that's * &lt;= the provided input term. The result is placed in * output; it's fine for output and input to point to * the same bytes. The returned result is either the * provided output, or null if there is no floor term * (ie, the provided input term is before the first term * accepted by this Automaton). */
public BytesRef floor(BytesRef input, BytesRefBuilder output) { //if (DEBUG) System.out.println("CA.floor input=" + input.utf8ToString()); int state = 0; // Special case empty string: if (input.length == 0) { if (runAutomaton.isAccept(state)) { output.clear(); return output.get(); } else { return null; } } final List<Integer> stack = new ArrayList<>(); int idx = 0; while (true) { int label = input.bytes[input.offset + idx] & 0xff; int nextState = runAutomaton.step(state, label); //if (DEBUG) System.out.println(" cycle label=" + (char) label + " nextState=" + nextState); if (idx == input.length-1) { if (nextState != -1 && runAutomaton.isAccept(nextState)) { // Input string is accepted output.grow(1+idx); output.setByteAt(idx, (byte) label); output.setLength(input.length); //if (DEBUG) System.out.println(" input is accepted; return term=" + output.utf8ToString()); return output.get(); } else { nextState = -1; } } if (nextState == -1) { // Pop back to a state that has a transition // <= our label: while (true) { int numTransitions = automaton.getNumTransitions(state); if (numTransitions == 0) { assert runAutomaton.isAccept(state); output.setLength(idx); //if (DEBUG) System.out.println(" return " + output.utf8ToString()); return output.get(); } else { automaton.getTransition(state, 0, transition); if (label-1 < transition.min) { if (runAutomaton.isAccept(state)) { output.setLength(idx); //if (DEBUG) System.out.println(" return " + output.utf8ToString()); return output.get(); } // pop if (stack.size() == 0) { //if (DEBUG) System.out.println(" pop ord=" + idx + " return null"); return null; } else { state = stack.remove(stack.size()-1); idx--; //if (DEBUG) System.out.println(" pop ord=" + (idx+1) + " label=" + (char) label + " first trans.min=" + (char) transitions[0].min); label = input.bytes[input.offset + idx] & 0xff; } } else { //if (DEBUG) System.out.println(" stop pop ord=" + idx + " first trans.min=" + (char) transitions[0].min); break; } } } //if (DEBUG) System.out.println(" label=" + (char) label + " idx=" + idx); return addTail(state, output, idx, label); } else { output.grow(1+idx); output.setByteAt(idx, (byte) label); stack.add(state); state = nextState; idx++; } } } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + ((runAutomaton == null) ? 0 : runAutomaton.hashCode()); result = prime * result + ((term == null) ? 0 : term.hashCode()); result = prime * result + ((type == null) ? 0 : type.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) return true; if (obj == null) return false; if (getClass() != obj.getClass()) return false; CompiledAutomaton other = (CompiledAutomaton) obj; if (type != other.type) return false; if (type == AUTOMATON_TYPE.SINGLE) { if (!term.equals(other.term)) return false; } else if (type == AUTOMATON_TYPE.NORMAL) { if (!runAutomaton.equals(other.runAutomaton)) return false; } return true; } @Override public long ramBytesUsed() { return BASE_RAM_BYTES + RamUsageEstimator.sizeOfObject(automaton) + RamUsageEstimator.sizeOfObject(commonSuffixRef) + RamUsageEstimator.sizeOfObject(runAutomaton) + RamUsageEstimator.sizeOfObject(term) + RamUsageEstimator.sizeOfObject(transition); } }