/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util.automaton;


import java.util.*;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;

Builds a minimal, deterministic Automaton that accepts a set of strings. The algorithm requires sorted input data, but is very fast (nearly linear with the input size).
See Also:
/** * Builds a minimal, deterministic {@link Automaton} that accepts a set of * strings. The algorithm requires sorted input data, but is very fast * (nearly linear with the input size). * * @see #build(Collection) * @see Automata#makeStringUnion(Collection) */
public final class DaciukMihovAutomatonBuilder {
This builder rejects terms that are more than 1k chars long since it then uses recursion based on the length of the string, which might cause stack overflows.
/** * This builder rejects terms that are more than 1k chars long since it then * uses recursion based on the length of the string, which might cause stack * overflows. */
static final int MAX_TERM_LENGTH = 1_000;
The default constructor is private. Use static methods directly.
/** * The default constructor is private. Use static methods directly. */
private DaciukMihovAutomatonBuilder() { super(); }
DFSA state with char labels on transitions.
/** * DFSA state with <code>char</code> labels on transitions. */
private final static class State {
An empty set of labels.
/** An empty set of labels. */
private final static int[] NO_LABELS = new int[0];
An empty set of states.
/** An empty set of states. */
private final static State[] NO_STATES = new State[0];
Labels of outgoing transitions. Indexed identically to states. Labels must be sorted lexicographically.
/** * Labels of outgoing transitions. Indexed identically to {@link #states}. * Labels must be sorted lexicographically. */
int[] labels = NO_LABELS;
States reachable from outgoing transitions. Indexed identically to labels.
/** * States reachable from outgoing transitions. Indexed identically to * {@link #labels}. */
State[] states = NO_STATES;
true if this state corresponds to the end of at least one input sequence.
/** * <code>true</code> if this state corresponds to the end of at least one * input sequence. */
boolean is_final;
Returns the target state of a transition leaving this state and labeled with label. If no such transition exists, returns null.
/** * Returns the target state of a transition leaving this state and labeled * with <code>label</code>. If no such transition exists, returns * <code>null</code>. */
State getState(int label) { final int index = Arrays.binarySearch(labels, label); return index >= 0 ? states[index] : null; }
Two states are equal if:
  • they have an identical number of outgoing transitions, labeled with the same labels
  • corresponding outgoing transitions lead to the same states (to states with an identical right-language).
/** * Two states are equal if: * <ul> * <li>they have an identical number of outgoing transitions, labeled with * the same labels</li> * <li>corresponding outgoing transitions lead to the same states (to states * with an identical right-language). * </ul> */
@Override public boolean equals(Object obj) { final State other = (State) obj; return is_final == other.is_final && Arrays.equals(this.labels, other.labels) && referenceEquals(this.states, other.states); }
Compute the hash code of the current status of this state.
/** * Compute the hash code of the <i>current</i> status of this state. */
@Override public int hashCode() { int hash = is_final ? 1 : 0; hash ^= hash * 31 + this.labels.length; for (int c : this.labels) hash ^= hash * 31 + c; /* * Compare the right-language of this state using reference-identity of * outgoing states. This is possible because states are interned (stored * in registry) and traversed in post-order, so any outgoing transitions * are already interned. */ for (State s : this.states) { hash ^= System.identityHashCode(s); } return hash; }
Return true if this state has any children (outgoing transitions).
/** * Return <code>true</code> if this state has any children (outgoing * transitions). */
boolean hasChildren() { return labels.length > 0; }
Create a new outgoing transition labeled label and return the newly created target state for this transition.
/** * Create a new outgoing transition labeled <code>label</code> and return * the newly created target state for this transition. */
State newState(int label) { assert Arrays.binarySearch(labels, label) < 0 : "State already has transition labeled: " + label; labels = ArrayUtil.growExact(labels, labels.length + 1); states = ArrayUtil.growExact(states, states.length + 1); labels[labels.length - 1] = label; return states[states.length - 1] = new State(); }
Return the most recent transitions's target state.
/** * Return the most recent transitions's target state. */
State lastChild() { assert hasChildren() : "No outgoing transitions."; return states[states.length - 1]; }
Return the associated state if the most recent transition is labeled with label.
/** * Return the associated state if the most recent transition is labeled with * <code>label</code>. */
State lastChild(int label) { final int index = labels.length - 1; State s = null; if (index >= 0 && labels[index] == label) { s = states[index]; } assert s == getState(label); return s; }
Replace the last added outgoing transition's target state with the given state.
/** * Replace the last added outgoing transition's target state with the given * state. */
void replaceLastChild(State state) { assert hasChildren() : "No outgoing transitions."; states[states.length - 1] = state; }
Compare two lists of objects for reference-equality.
/** * Compare two lists of objects for reference-equality. */
private static boolean referenceEquals(Object[] a1, Object[] a2) { if (a1.length != a2.length) { return false; } for (int i = 0; i < a1.length; i++) { if (a1[i] != a2[i]) { return false; } } return true; } }
A "registry" for state interning.
/** * A "registry" for state interning. */
private HashMap<State,State> stateRegistry = new HashMap<>();
Root automaton state.
/** * Root automaton state. */
private State root = new State();
Previous sequence added to the automaton in add(CharsRef).
/** * Previous sequence added to the automaton in {@link #add(CharsRef)}. */
private CharsRef previous;
A comparator used for enforcing sorted UTF8 order, used in assertions only.
/** * A comparator used for enforcing sorted UTF8 order, used in assertions only. */
@SuppressWarnings("deprecation") private static final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
Add another character sequence to this automaton. The sequence must be lexicographically larger or equal compared to any previous sequences added to this automaton (the input must be sorted).
/** * Add another character sequence to this automaton. The sequence must be * lexicographically larger or equal compared to any previous sequences added * to this automaton (the input must be sorted). */
public void add(CharsRef current) { if (current.length > MAX_TERM_LENGTH) { throw new IllegalArgumentException("This builder doesn't allow terms that are larger than 1,000 characters, got " + current); } assert stateRegistry != null : "Automaton already built."; assert previous == null || comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: " + previous + " >= " + current; assert setPrevious(current); // Descend in the automaton (find matching prefix). int pos = 0, max = current.length(); State next, state = root; while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) { state = next; // todo, optimize me pos += Character.charCount(Character.codePointAt(current, pos)); } if (state.hasChildren()) replaceOrRegister(state); addSuffix(state, current, pos); }
Finalize the automaton and return the root state. No more strings can be added to the builder after this call.
Returns:Root automaton state.
/** * Finalize the automaton and return the root state. No more strings can be * added to the builder after this call. * * @return Root automaton state. */
public State complete() { if (this.stateRegistry == null) throw new IllegalStateException(); if (root.hasChildren()) replaceOrRegister(root); stateRegistry = null; return root; }
Internal recursive traversal for conversion.
/** * Internal recursive traversal for conversion. */
private static int convert(Automaton.Builder a, State s, IdentityHashMap<State,Integer> visited) { Integer converted = visited.get(s); if (converted != null) { return converted; } converted = a.createState(); a.setAccept(converted, s.is_final); visited.put(s, converted); int i = 0; int[] labels = s.labels; for (DaciukMihovAutomatonBuilder.State target : s.states) { a.addTransition(converted, convert(a, target, visited), labels[i++]); } return converted; }
Build a minimal, deterministic automaton from a sorted list of BytesRef representing strings in UTF-8. These strings must be binary-sorted.
/** * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing * strings in UTF-8. These strings must be binary-sorted. */
public static Automaton build(Collection<BytesRef> input) { final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder(); char[] chars = new char[0]; CharsRef ref = new CharsRef(); for (BytesRef b : input) { chars = ArrayUtil.grow(chars, b.length); final int len = UnicodeUtil.UTF8toUTF16(b, chars); ref.chars = chars; ref.length = len; builder.add(ref); } Automaton.Builder a = new Automaton.Builder(); convert(a, builder.complete(), new IdentityHashMap<State,Integer>()); return a.finish(); }
Copy current into an internal buffer.
/** * Copy <code>current</code> into an internal buffer. */
private boolean setPrevious(CharsRef current) { // don't need to copy, once we fix https://issues.apache.org/jira/browse/LUCENE-3277 // still, called only from assert previous = CharsRef.deepCopyOf(current); return true; }
Replace last child of state with an already registered state or stateRegistry the last child state.
/** * Replace last child of <code>state</code> with an already registered state * or stateRegistry the last child state. */
private void replaceOrRegister(State state) { final State child = state.lastChild(); if (child.hasChildren()) replaceOrRegister(child); final State registered = stateRegistry.get(child); if (registered != null) { state.replaceLastChild(registered); } else { stateRegistry.put(child, child); } }
Add a suffix of current starting at fromIndex (inclusive) to state state.
/** * Add a suffix of <code>current</code> starting at <code>fromIndex</code> * (inclusive) to state <code>state</code>. */
private void addSuffix(State state, CharSequence current, int fromIndex) { final int len = current.length(); while (fromIndex < len) { int cp = Character.codePointAt(current, fromIndex); state = state.newState(cp); fromIndex += Character.charCount(cp); } state.is_final = true; } }