/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.fst;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;
import java.io.IOException;
import java.io.Writer;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.TreeSet;
Static helper methods.
@lucene.experimental
/** Static helper methods.
*
* @lucene.experimental */
public final class Util {
private Util() {
}
Looks up the output for this input, or null if the
input is not accepted. /** Looks up the output for this input, or null if the
* input is not accepted. */
public static<T> T get(FST<T> fst, IntsRef input) throws IOException {
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
final BytesReader fstReader = fst.getBytesReader();
// Accumulate output as we go
T output = fst.outputs.getNoOutput();
for(int i=0;i<input.length;i++) {
if (fst.findTargetArc(input.ints[input.offset + i], arc, arc, fstReader) == null) {
return null;
}
output = fst.outputs.add(output, arc.output);
}
if (arc.isFinal()) {
return fst.outputs.add(output, arc.nextFinalOutput);
} else {
return null;
}
}
// TODO: maybe a CharsRef version for BYTE2
Looks up the output for this input, or null if the
input is not accepted /** Looks up the output for this input, or null if the
* input is not accepted */
public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
assert fst.inputType == FST.INPUT_TYPE.BYTE1;
final BytesReader fstReader = fst.getBytesReader();
// TODO: would be nice not to alloc this on every lookup
final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
// Accumulate output as we go
T output = fst.outputs.getNoOutput();
for(int i=0;i<input.length;i++) {
if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc, fstReader) == null) {
return null;
}
output = fst.outputs.add(output, arc.output);
}
if (arc.isFinal()) {
return fst.outputs.add(output, arc.nextFinalOutput);
} else {
return null;
}
}
Reverse lookup (lookup by output instead of by input),
in the special case when your FSTs outputs are
strictly ascending. This locates the input/output
pair where the output is equal to the target, and will
return null if that output does not exist.
NOTE: this only works with FST<Long>
, only works when the outputs are ascending in order with the inputs. For example, simple ordinals (0, 1, 2, ...), or file offsets (when appending to a file) fit this.
/** Reverse lookup (lookup by output instead of by input),
* in the special case when your FSTs outputs are
* strictly ascending. This locates the input/output
* pair where the output is equal to the target, and will
* return null if that output does not exist.
*
* <p>NOTE: this only works with {@code FST<Long>}, only
* works when the outputs are ascending in order with
* the inputs.
* For example, simple ordinals (0, 1,
* 2, ...), or file offsets (when appending to a file)
* fit this. */
@Deprecated
public static IntsRef getByOutput(FST<Long> fst, long targetOutput) throws IOException {
final BytesReader in = fst.getBytesReader();
// TODO: would be nice not to alloc this on every lookup
FST.Arc<Long> arc = fst.getFirstArc(new FST.Arc<Long>());
FST.Arc<Long> scratchArc = new FST.Arc<>();
final IntsRefBuilder result = new IntsRefBuilder();
return getByOutput(fst, targetOutput, in, arc, scratchArc, result);
}
Expert: like getByOutput(FST<Long>, long)
except reusing BytesReader, initial and scratch Arc, and result. /**
* Expert: like {@link Util#getByOutput(FST, long)} except reusing
* BytesReader, initial and scratch Arc, and result.
*/
@Deprecated
public static IntsRef getByOutput(FST<Long> fst, long targetOutput, BytesReader in, Arc<Long> arc, Arc<Long> scratchArc, IntsRefBuilder result) throws IOException {
long output = arc.output;
int upto = 0;
//System.out.println("reverseLookup output=" + targetOutput);
while(true) {
//System.out.println("loop: output=" + output + " upto=" + upto + " arc=" + arc);
if (arc.isFinal()) {
final long finalOutput = output + arc.nextFinalOutput;
//System.out.println(" isFinal finalOutput=" + finalOutput);
if (finalOutput == targetOutput) {
result.setLength(upto);
//System.out.println(" found!");
return result.get();
} else if (finalOutput > targetOutput) {
//System.out.println(" not found!");
return null;
}
}
if (FST.targetHasArcs(arc)) {
//System.out.println(" targetHasArcs");
result.grow(1+upto);
fst.readFirstRealTargetArc(arc.target, arc, in);
if (arc.bytesPerArc != 0 && arc.arcIdx > Integer.MIN_VALUE) {
int low = 0;
int high = arc.numArcs-1;
int mid = 0;
//System.out.println("bsearch: numArcs=" + arc.numArcs + " target=" + targetOutput + " output=" + output);
boolean exact = false;
while (low <= high) {
mid = (low + high) >>> 1;
in.setPosition(arc.posArcsStart);
in.skipBytes(arc.bytesPerArc*mid);
final byte flags = in.readByte();
fst.readLabel(in);
final long minArcOutput;
if ((flags & FST.BIT_ARC_HAS_OUTPUT) != 0) {
final long arcOutput = fst.outputs.read(in);
minArcOutput = output + arcOutput;
} else {
minArcOutput = output;
}
//System.out.println(" cycle mid=" + mid + " output=" + minArcOutput);
if (minArcOutput == targetOutput) {
exact = true;
break;
} else if (minArcOutput < targetOutput) {
low = mid + 1;
} else {
high = mid - 1;
}
}
if (high == -1) {
return null;
} else if (exact) {
arc.arcIdx = mid-1;
} else {
arc.arcIdx = low-2;
}
fst.readNextRealArc(arc, in);
result.setIntAt(upto++, arc.label);
output += arc.output;
} else {
FST.Arc<Long> prevArc = null;
while(true) {
//System.out.println(" cycle label=" + arc.label + " output=" + arc.output);
// This is the min output we'd hit if we follow
// this arc:
final long minArcOutput = output + arc.output;
if (minArcOutput == targetOutput) {
// Recurse on this arc:
//System.out.println(" match! break");
output = minArcOutput;
result.setIntAt(upto++, arc.label);
break;
} else if (minArcOutput > targetOutput) {
if (prevArc == null) {
// Output doesn't exist
return null;
} else {
// Recurse on previous arc:
arc.copyFrom(prevArc);
result.setIntAt(upto++, arc.label);
output += arc.output;
//System.out.println(" recurse prev label=" + (char) arc.label + " output=" + output);
break;
}
} else if (arc.isLast()) {
// Recurse on this arc:
output = minArcOutput;
//System.out.println(" recurse last label=" + (char) arc.label + " output=" + output);
result.setIntAt(upto++, arc.label);
break;
} else {
// Read next arc in this node:
prevArc = scratchArc;
prevArc.copyFrom(arc);
//System.out.println(" after copy label=" + (char) prevArc.label + " vs " + (char) arc.label);
fst.readNextRealArc(arc, in);
}
}
}
} else {
//System.out.println(" no target arcs; not found!");
return null;
}
}
}
Represents a path in TopNSearcher.
@lucene.experimental
/** Represents a path in TopNSearcher.
*
* @lucene.experimental
*/
public static class FSTPath<T> {
Holds the last arc appended to this path /** Holds the last arc appended to this path */
public FST.Arc<T> arc;
Holds cost plus any usage-specific output: /** Holds cost plus any usage-specific output: */
public T output;
public final IntsRefBuilder input;
public final float boost;
public final CharSequence context;
// Custom int payload for consumers; the NRT suggester uses this to record if this path has already enumerated a surface form
public int payload;
Sole constructor /** Sole constructor */
public FSTPath(T output, FST.Arc<T> arc, IntsRefBuilder input) {
this(output, arc, input, 0, null, -1);
}
public FSTPath(T output, FST.Arc<T> arc, IntsRefBuilder input, float boost, CharSequence context, int payload) {
this.arc = new FST.Arc<T>().copyFrom(arc);
this.output = output;
this.input = input;
this.boost = boost;
this.context = context;
this.payload = payload;
}
public FSTPath<T> newPath(T output, IntsRefBuilder input) {
return new FSTPath<>(output, this.arc, input, this.boost, this.context, this.payload);
}
@Override
public String toString() {
return "input=" + input.get() + " output=" + output + " context=" + context + " boost=" + boost + " payload=" + payload;
}
}
Compares first by the provided comparator, and then
tie breaks by path.input. /** Compares first by the provided comparator, and then
* tie breaks by path.input. */
private static class TieBreakByInputComparator<T> implements Comparator<FSTPath<T>> {
private final Comparator<T> comparator;
public TieBreakByInputComparator(Comparator<T> comparator) {
this.comparator = comparator;
}
@Override
public int compare(FSTPath<T> a, FSTPath<T> b) {
int cmp = comparator.compare(a.output, b.output);
if (cmp == 0) {
return a.input.get().compareTo(b.input.get());
} else {
return cmp;
}
}
}
Utility class to find top N shortest paths from start
point(s). /** Utility class to find top N shortest paths from start
* point(s). */
public static class TopNSearcher<T> {
private final FST<T> fst;
private final BytesReader bytesReader;
private final int topN;
private final int maxQueueDepth;
private final FST.Arc<T> scratchArc = new FST.Arc<>();
private final Comparator<T> comparator;
private final Comparator<FSTPath<T>> pathComparator;
TreeSet<FSTPath<T>> queue = null;
Creates an unbounded TopNSearcher
Params: - fst – the
FST
to search on - topN – the number of top scoring entries to retrieve
- maxQueueDepth – the maximum size of the queue of possible top entries
- comparator – the comparator to select the top N
/**
* Creates an unbounded TopNSearcher
* @param fst the {@link org.apache.lucene.util.fst.FST} to search on
* @param topN the number of top scoring entries to retrieve
* @param maxQueueDepth the maximum size of the queue of possible top entries
* @param comparator the comparator to select the top N
*/
public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, Comparator<T> comparator) {
this(fst, topN, maxQueueDepth, comparator, new TieBreakByInputComparator<>(comparator));
}
public TopNSearcher(FST<T> fst, int topN, int maxQueueDepth, Comparator<T> comparator,
Comparator<FSTPath<T>> pathComparator) {
this.fst = fst;
this.bytesReader = fst.getBytesReader();
this.topN = topN;
this.maxQueueDepth = maxQueueDepth;
this.comparator = comparator;
this.pathComparator = pathComparator;
queue = new TreeSet<>(pathComparator);
}
// If back plus this arc is competitive then add to queue:
protected void addIfCompetitive(FSTPath<T> path) {
assert queue != null;
T output = fst.outputs.add(path.output, path.arc.output);
if (queue.size() == maxQueueDepth) {
FSTPath<T> bottom = queue.last();
int comp = pathComparator.compare(path, bottom);
if (comp > 0) {
// Doesn't compete
return;
} else if (comp == 0) {
// Tie break by alpha sort on the input:
path.input.append(path.arc.label);
final int cmp = bottom.input.get().compareTo(path.input.get());
path.input.setLength(path.input.length() - 1);
// We should never see dups:
assert cmp != 0;
if (cmp < 0) {
// Doesn't compete
return;
}
}
// Competes
} else {
// Queue isn't full yet, so any path we hit competes:
}
// copy over the current input to the new input
// and add the arc.label to the end
IntsRefBuilder newInput = new IntsRefBuilder();
newInput.copyInts(path.input.get());
newInput.append(path.arc.label);
FSTPath<T> newPath = path.newPath(output, newInput);
if (acceptPartialPath(newPath)) {
queue.add(newPath);
if (queue.size() == maxQueueDepth+1) {
queue.pollLast();
}
}
}
public void addStartPaths(FST.Arc<T> node, T startOutput, boolean allowEmptyString, IntsRefBuilder input) throws IOException {
addStartPaths(node, startOutput, allowEmptyString, input, 0, null, -1);
}
Adds all leaving arcs, including 'finished' arc, if
the node is final, from this node into the queue. /** Adds all leaving arcs, including 'finished' arc, if
* the node is final, from this node into the queue. */
public void addStartPaths(FST.Arc<T> node, T startOutput, boolean allowEmptyString, IntsRefBuilder input,
float boost, CharSequence context, int payload) throws IOException {
// De-dup NO_OUTPUT since it must be a singleton:
if (startOutput.equals(fst.outputs.getNoOutput())) {
startOutput = fst.outputs.getNoOutput();
}
FSTPath<T> path = new FSTPath<>(startOutput, node, input, boost, context, payload);
fst.readFirstTargetArc(node, path.arc, bytesReader);
// Bootstrap: find the min starting arc
while (true) {
if (allowEmptyString || path.arc.label != FST.END_LABEL) {
addIfCompetitive(path);
}
if (path.arc.isLast()) {
break;
}
fst.readNextArc(path.arc, bytesReader);
}
}
public TopResults<T> search() throws IOException {
final List<Result<T>> results = new ArrayList<>();
final BytesReader fstReader = fst.getBytesReader();
final T NO_OUTPUT = fst.outputs.getNoOutput();
// TODO: we could enable FST to sorting arcs by weight
// as it freezes... can easily do this on first pass
// (w/o requiring rewrite)
// TODO: maybe we should make an FST.INPUT_TYPE.BYTE0.5!?
// (nibbles)
int rejectCount = 0;
// For each top N path:
while (results.size() < topN) {
FSTPath<T> path;
if (queue == null) {
// Ran out of paths
break;
}
// Remove top path since we are now going to
// pursue it:
path = queue.pollFirst();
if (path == null) {
// There were less than topN paths available:
break;
}
//System.out.println("pop path=" + path + " arc=" + path.arc.output);
if (acceptPartialPath(path) == false) {
continue;
}
if (path.arc.label == FST.END_LABEL) {
// Empty string!
path.input.setLength(path.input.length() - 1);
results.add(new Result<>(path.input.get(), path.output));
continue;
}
if (results.size() == topN-1 && maxQueueDepth == topN) {
// Last path -- don't bother w/ queue anymore:
queue = null;
}
// We take path and find its "0 output completion",
// ie, just keep traversing the first arc with
// NO_OUTPUT that we can find, since this must lead
// to the minimum path that completes from
// path.arc.
// For each input letter:
while (true) {
fst.readFirstTargetArc(path.arc, path.arc, fstReader);
// For each arc leaving this node:
boolean foundZero = false;
while(true) {
// tricky: instead of comparing output == 0, we must
// express it via the comparator compare(output, 0) == 0
if (comparator.compare(NO_OUTPUT, path.arc.output) == 0) {
if (queue == null) {
foundZero = true;
break;
} else if (!foundZero) {
scratchArc.copyFrom(path.arc);
foundZero = true;
} else {
addIfCompetitive(path);
}
} else if (queue != null) {
addIfCompetitive(path);
}
if (path.arc.isLast()) {
break;
}
fst.readNextArc(path.arc, fstReader);
}
assert foundZero;
if (queue != null) {
// TODO: maybe we can save this copyFrom if we
// are more clever above... eg on finding the
// first NO_OUTPUT arc we'd switch to using
// scratchArc
path.arc.copyFrom(scratchArc);
}
if (path.arc.label == FST.END_LABEL) {
// Add final output:
path.output = fst.outputs.add(path.output, path.arc.output);
if (acceptResult(path)) {
results.add(new Result<>(path.input.get(), path.output));
} else {
rejectCount++;
}
break;
} else {
path.input.append(path.arc.label);
path.output = fst.outputs.add(path.output, path.arc.output);
if (acceptPartialPath(path) == false) {
break;
}
}
}
}
return new TopResults<>(rejectCount + topN <= maxQueueDepth, results);
}
protected boolean acceptResult(FSTPath<T> path) {
return acceptResult(path.input.get(), path.output);
}
Override this to prevent considering a path before it's complete /** Override this to prevent considering a path before it's complete */
protected boolean acceptPartialPath(FSTPath<T> path) {
return true;
}
protected boolean acceptResult(IntsRef input, T output) {
return true;
}
}
Holds a single input (IntsRef) + output, returned by shortestPaths()
. /** Holds a single input (IntsRef) + output, returned by
* {@link #shortestPaths shortestPaths()}. */
public final static class Result<T> {
public final IntsRef input;
public final T output;
public Result(IntsRef input, T output) {
this.input = input;
this.output = output;
}
}
Holds the results for a top N search using TopNSearcher
/**
* Holds the results for a top N search using {@link TopNSearcher}
*/
public static final class TopResults<T> implements Iterable<Result<T>> {
true
iff this is a complete result ie. if
the specified queue size was large enough to find the complete list of results. This might
be false
if the TopNSearcher
rejected too many results. /**
* <code>true</code> iff this is a complete result ie. if
* the specified queue size was large enough to find the complete list of results. This might
* be <code>false</code> if the {@link TopNSearcher} rejected too many results.
*/
public final boolean isComplete;
The top results
/**
* The top results
*/
public final List<Result<T>> topN;
TopResults(boolean isComplete, List<Result<T>> topN) {
this.topN = topN;
this.isComplete = isComplete;
}
@Override
public Iterator<Result<T>> iterator() {
return topN.iterator();
}
}
Starting from node, find the top N min cost
completions to a final node. /** Starting from node, find the top N min cost
* completions to a final node. */
public static <T> TopResults<T> shortestPaths(FST<T> fst, FST.Arc<T> fromNode, T startOutput, Comparator<T> comparator, int topN,
boolean allowEmptyString) throws IOException {
// All paths are kept, so we can pass topN for
// maxQueueDepth and the pruning is admissible:
TopNSearcher<T> searcher = new TopNSearcher<>(fst, topN, topN, comparator);
// since this search is initialized with a single start node
// it is okay to start with an empty input path here
searcher.addStartPaths(fromNode, startOutput, allowEmptyString, new IntsRefBuilder());
return searcher.search();
}
Dumps an FST
to a GraphViz's dot
language description
for visualization. Example of use:
PrintWriter pw = new PrintWriter("out.dot");
Util.toDot(fst, pw, true, true);
pw.close();
and then, from command line:
dot -Tpng -o out.png out.dot
Note: larger FSTs (a few thousand nodes) won't even
render, don't bother.
Params: - sameRank –
If
true
, the resulting dot
file will try
to order states in layers of breadth-first traversal. This may
mess up arcs, but makes the output FST's structure a bit clearer. - labelStates –
If
true
states will have labels equal to their offsets in their
binary format. Expands the graph considerably.
See Also:
/**
* Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
* for visualization. Example of use:
*
* <pre class="prettyprint">
* PrintWriter pw = new PrintWriter("out.dot");
* Util.toDot(fst, pw, true, true);
* pw.close();
* </pre>
*
* and then, from command line:
*
* <pre>
* dot -Tpng -o out.png out.dot
* </pre>
*
* <p>
* Note: larger FSTs (a few thousand nodes) won't even
* render, don't bother.
*
* @param sameRank
* If <code>true</code>, the resulting <code>dot</code> file will try
* to order states in layers of breadth-first traversal. This may
* mess up arcs, but makes the output FST's structure a bit clearer.
*
* @param labelStates
* If <code>true</code> states will have labels equal to their offsets in their
* binary format. Expands the graph considerably.
*
* @see <a href="http://www.graphviz.org/">graphviz project</a>
*/
public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
throws IOException {
final String expandedNodeColor = "blue";
// This is the start arc in the automaton (from the epsilon state to the first state
// with outgoing transitions.
final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
// A queue of transitions to consider for the next level.
final List<FST.Arc<T>> thisLevelQueue = new ArrayList<>();
// A queue of transitions to consider when processing the next level.
final List<FST.Arc<T>> nextLevelQueue = new ArrayList<>();
nextLevelQueue.add(startArc);
//System.out.println("toDot: startArc: " + startArc);
// A list of states on the same level (for ranking).
final List<Integer> sameLevelStates = new ArrayList<>();
// A bitset of already seen states (target offset).
final BitSet seen = new BitSet();
seen.set((int) startArc.target);
// Shape for states.
final String stateShape = "circle";
final String finalStateShape = "doublecircle";
// Emit DOT prologue.
out.write("digraph FST {\n");
out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
if (!labelStates) {
out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
}
emitDotState(out, "initial", "point", "white", "");
final T NO_OUTPUT = fst.outputs.getNoOutput();
final BytesReader r = fst.getBytesReader();
// final FST.Arc<T> scratchArc = new FST.Arc<>();
{
final String stateColor;
if (fst.isExpandedTarget(startArc, r)) {
stateColor = expandedNodeColor;
} else {
stateColor = null;
}
final boolean isFinal;
final T finalOutput;
if (startArc.isFinal()) {
isFinal = true;
finalOutput = startArc.nextFinalOutput == NO_OUTPUT ? null : startArc.nextFinalOutput;
} else {
isFinal = false;
finalOutput = null;
}
emitDotState(out, Long.toString(startArc.target), isFinal ? finalStateShape : stateShape, stateColor, finalOutput == null ? "" : fst.outputs.outputToString(finalOutput));
}
out.write(" initial -> " + startArc.target + "\n");
int level = 0;
while (!nextLevelQueue.isEmpty()) {
// we could double buffer here, but it doesn't matter probably.
//System.out.println("next level=" + level);
thisLevelQueue.addAll(nextLevelQueue);
nextLevelQueue.clear();
level++;
out.write("\n // Transitions and states at level: " + level + "\n");
while (!thisLevelQueue.isEmpty()) {
final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
//System.out.println(" pop: " + arc);
if (FST.targetHasArcs(arc)) {
// scan all target arcs
//System.out.println(" readFirstTarget...");
final long node = arc.target;
fst.readFirstRealTargetArc(arc.target, arc, r);
//System.out.println(" firstTarget: " + arc);
while (true) {
//System.out.println(" cycle arc=" + arc);
// Emit the unseen state and add it to the queue for the next level.
if (arc.target >= 0 && !seen.get((int) arc.target)) {
/*
boolean isFinal = false;
T finalOutput = null;
fst.readFirstTargetArc(arc, scratchArc);
if (scratchArc.isFinal() && fst.targetHasArcs(scratchArc)) {
// target is final
isFinal = true;
finalOutput = scratchArc.output == NO_OUTPUT ? null : scratchArc.output;
System.out.println("dot hit final label=" + (char) scratchArc.label);
}
*/
final String stateColor;
if (fst.isExpandedTarget(arc, r)) {
stateColor = expandedNodeColor;
} else {
stateColor = null;
}
final String finalOutput;
if (arc.nextFinalOutput != null && arc.nextFinalOutput != NO_OUTPUT) {
finalOutput = fst.outputs.outputToString(arc.nextFinalOutput);
} else {
finalOutput = "";
}
emitDotState(out, Long.toString(arc.target), stateShape, stateColor, finalOutput);
// To see the node address, use this instead:
//emitDotState(out, Integer.toString(arc.target), stateShape, stateColor, String.valueOf(arc.target));
seen.set((int) arc.target);
nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
sameLevelStates.add((int) arc.target);
}
String outs;
if (arc.output != NO_OUTPUT) {
outs = "/" + fst.outputs.outputToString(arc.output);
} else {
outs = "";
}
if (!FST.targetHasArcs(arc) && arc.isFinal() && arc.nextFinalOutput != NO_OUTPUT) {
// Tricky special case: sometimes, due to
// pruning, the builder can [sillily] produce
// an FST with an arc into the final end state
// (-1) but also with a next final output; in
// this case we pull that output up onto this
// arc
outs = outs + "/[" + fst.outputs.outputToString(arc.nextFinalOutput) + "]";
}
final String arcColor;
if (arc.flag(FST.BIT_TARGET_NEXT)) {
arcColor = "red";
} else {
arcColor = "black";
}
assert arc.label != FST.END_LABEL;
out.write(" " + node + " -> " + arc.target + " [label=\"" + printableLabel(arc.label) + outs + "\"" + (arc.isFinal() ? " style=\"bold\"" : "" ) + " color=\"" + arcColor + "\"]\n");
// Break the loop if we're on the last arc of this state.
if (arc.isLast()) {
//System.out.println(" break");
break;
}
fst.readNextRealArc(arc, r);
}
}
}
// Emit state ranking information.
if (sameRank && sameLevelStates.size() > 1) {
out.write(" {rank=same; ");
for (int state : sameLevelStates) {
out.write(state + "; ");
}
out.write(" }\n");
}
sameLevelStates.clear();
}
// Emit terminating state (always there anyway).
out.write(" -1 [style=filled, color=black, shape=doublecircle, label=\"\"]\n\n");
out.write(" {rank=sink; -1 }\n");
out.write("}\n");
out.flush();
}
Emit a single state in the dot
language.
/**
* Emit a single state in the <code>dot</code> language.
*/
private static void emitDotState(Writer out, String name, String shape,
String color, String label) throws IOException {
out.write(" " + name
+ " ["
+ (shape != null ? "shape=" + shape : "") + " "
+ (color != null ? "color=" + color : "") + " "
+ (label != null ? "label=\"" + label + "\"" : "label=\"\"") + " "
+ "]\n");
}
Ensures an arc's label is indeed printable (dot uses US-ASCII).
/**
* Ensures an arc's label is indeed printable (dot uses US-ASCII).
*/
private static String printableLabel(int label) {
// Any ordinary ascii character, except for " or \, are
// printed as the character; else, as a hex string:
if (label >= 0x20 && label <= 0x7d && label != 0x22 && label != 0x5c) { // " OR \
return Character.toString((char) label);
}
return "0x" + Integer.toHexString(label);
}
Just maps each UTF16 unit (char) to the ints in an
IntsRef. /** Just maps each UTF16 unit (char) to the ints in an
* IntsRef. */
public static IntsRef toUTF16(CharSequence s, IntsRefBuilder scratch) {
final int charLimit = s.length();
scratch.setLength(charLimit);
scratch.grow(charLimit);
for (int idx = 0; idx < charLimit; idx++) {
scratch.setIntAt(idx, (int) s.charAt(idx));
}
return scratch.get();
}
Decodes the Unicode codepoints from the provided
CharSequence and places them in the provided scratch
IntsRef, which must not be null, returning it. /** Decodes the Unicode codepoints from the provided
* CharSequence and places them in the provided scratch
* IntsRef, which must not be null, returning it. */
public static IntsRef toUTF32(CharSequence s, IntsRefBuilder scratch) {
int charIdx = 0;
int intIdx = 0;
final int charLimit = s.length();
while(charIdx < charLimit) {
scratch.grow(intIdx+1);
final int utf32 = Character.codePointAt(s, charIdx);
scratch.setIntAt(intIdx, utf32);
charIdx += Character.charCount(utf32);
intIdx++;
}
scratch.setLength(intIdx);
return scratch.get();
}
Decodes the Unicode codepoints from the provided
char[] and places them in the provided scratch
IntsRef, which must not be null, returning it. /** Decodes the Unicode codepoints from the provided
* char[] and places them in the provided scratch
* IntsRef, which must not be null, returning it. */
public static IntsRef toUTF32(char[] s, int offset, int length, IntsRefBuilder scratch) {
int charIdx = offset;
int intIdx = 0;
final int charLimit = offset + length;
while(charIdx < charLimit) {
scratch.grow(intIdx+1);
final int utf32 = Character.codePointAt(s, charIdx, charLimit);
scratch.setIntAt(intIdx, utf32);
charIdx += Character.charCount(utf32);
intIdx++;
}
scratch.setLength(intIdx);
return scratch.get();
}
Just takes unsigned byte values from the BytesRef and
converts into an IntsRef. /** Just takes unsigned byte values from the BytesRef and
* converts into an IntsRef. */
public static IntsRef toIntsRef(BytesRef input, IntsRefBuilder scratch) {
scratch.clear();
for(int i=0;i<input.length;i++) {
scratch.append(input.bytes[i+input.offset] & 0xFF);
}
return scratch.get();
}
Just converts IntsRef to BytesRef; you must ensure the
int values fit into a byte. /** Just converts IntsRef to BytesRef; you must ensure the
* int values fit into a byte. */
public static BytesRef toBytesRef(IntsRef input, BytesRefBuilder scratch) {
scratch.grow(input.length);
for(int i=0;i<input.length;i++) {
int value = input.ints[i+input.offset];
// NOTE: we allow -128 to 255
assert value >= Byte.MIN_VALUE && value <= 255: "value " + value + " doesn't fit into byte";
scratch.setByteAt(i, (byte) value);
}
scratch.setLength(input.length);
return scratch.get();
}
// Uncomment for debugging:
/*
public static <T> void dotToFile(FST<T> fst, String filePath) throws IOException {
Writer w = new OutputStreamWriter(new FileOutputStream(filePath));
toDot(fst, w, true, true);
w.close();
}
*/
Reads the first arc greater or equal than the given label into the provided
arc in place and returns it iff found, otherwise return null
.
Params: - label – the label to ceil on
- fst – the fst to operate on
- follow – the arc to follow reading the label from
- arc – the arc to read into in place
- in – the fst's
BytesReader
/**
* Reads the first arc greater or equal than the given label into the provided
* arc in place and returns it iff found, otherwise return <code>null</code>.
*
* @param label the label to ceil on
* @param fst the fst to operate on
* @param follow the arc to follow reading the label from
* @param arc the arc to read into in place
* @param in the fst's {@link BytesReader}
*/
public static <T> Arc<T> readCeilArc(int label, FST<T> fst, Arc<T> follow,
Arc<T> arc, BytesReader in) throws IOException {
// TODO maybe this is a useful in the FST class - we could simplify some other code like FSTEnum?
if (label == FST.END_LABEL) {
if (follow.isFinal()) {
if (follow.target <= 0) {
arc.flags = FST.BIT_LAST_ARC;
} else {
arc.flags = 0;
// NOTE: nextArc is a node (not an address!) in this case:
arc.nextArc = follow.target;
}
arc.output = follow.nextFinalOutput;
arc.label = FST.END_LABEL;
return arc;
} else {
return null;
}
}
if (!FST.targetHasArcs(follow)) {
return null;
}
fst.readFirstTargetArc(follow, arc, in);
if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
if (arc.arcIdx == Integer.MIN_VALUE) {
// Arcs are in an array-with-gaps
int offset = label - arc.label;
if (offset >= arc.numArcs) {
return null;
} else if (offset < 0) {
return arc;
} else {
arc.nextArc = arc.posArcsStart - offset * arc.bytesPerArc;
return fst.readNextRealArc(arc, in);
}
}
// Arcs are packed array -- use binary search to find
// the target.
int low = arc.arcIdx;
int high = arc.numArcs - 1;
int mid = 0;
// System.out.println("do arc array low=" + low + " high=" + high +
// " targetLabel=" + targetLabel);
while (low <= high) {
mid = (low + high) >>> 1;
in.setPosition(arc.posArcsStart);
in.skipBytes(arc.bytesPerArc * mid + 1);
final int midLabel = fst.readLabel(in);
final int cmp = midLabel - label;
// System.out.println(" cycle low=" + low + " high=" + high + " mid=" +
// mid + " midLabel=" + midLabel + " cmp=" + cmp);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
arc.arcIdx = mid-1;
return fst.readNextRealArc(arc, in);
}
}
if (low == arc.numArcs) {
// DEAD END!
return null;
}
arc.arcIdx = (low > high ? high : low);
return fst.readNextRealArc(arc, in);
}
// Linear scan
fst.readFirstRealTargetArc(follow.target, arc, in);
while (true) {
// System.out.println(" non-bs cycle");
// TODO: we should fix this code to not have to create
// object for the output of every arc we scan... only
// for the matching arc, if found
if (arc.label >= label) {
// System.out.println(" found!");
return arc;
} else if (arc.isLast()) {
return null;
} else {
fst.readNextRealArc(arc, in);
}
}
}
}