/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.util.fst;


import java.io.IOException;

import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST.INPUT_TYPE; // javadoc

// TODO: could we somehow stream an FST to disk while we
// build it?

Builds a minimal FST (maps an IntsRef term to an arbitrary output) from pre-sorted terms with outputs. The FST becomes an FSA if you use NoOutputs. The FST is written on-the-fly into a compact serialized format byte array, which can be saved to / loaded from a Directory or used directly for traversal. The FST is always finite (no cycles).

NOTE: The algorithm is described at http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698

The parameterized type T is the output type. See the subclasses of Outputs.

FSTs larger than 2.1GB are now possible (as of Lucene 4.2). FSTs containing more than 2.1B nodes are also now possible, however they cannot be packed.

@lucene.experimental
/** * Builds a minimal FST (maps an IntsRef term to an arbitrary * output) from pre-sorted terms with outputs. The FST * becomes an FSA if you use NoOutputs. The FST is written * on-the-fly into a compact serialized format byte array, which can * be saved to / loaded from a Directory or used directly * for traversal. The FST is always finite (no cycles). * * <p>NOTE: The algorithm is described at * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.24.3698</p> * * <p>The parameterized type T is the output type. See the * subclasses of {@link Outputs}. * * <p>FSTs larger than 2.1GB are now possible (as of Lucene * 4.2). FSTs containing more than 2.1B nodes are also now * possible, however they cannot be packed. * * @lucene.experimental */
public class Builder<T> {
Default oversizing factor used to decide whether to encode a node with direct addressing or binary search. Default is 1: ensure no oversizing on average.

This factor does not determine whether to encode a node with a list of variable length arcs or with fixed length arcs. It only determines the effective encoding of a node that is already known to be encoded with fixed length arcs. See FST.shouldExpandNodeWithFixedLengthArcs() and FST.shouldExpandNodeWithDirectAddressing().

For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs, and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%.

For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs, and 14% of them with direct encoding. Overall FST memory reduced by 0.8%.

Use TestFstDirectAddressing.main() and TestFstDirectAddressing.testWorstCaseForDirectAddressing() to evaluate a change.

See Also:
/** * Default oversizing factor used to decide whether to encode a node with direct addressing or binary search. * Default is 1: ensure no oversizing on average. * <p> * This factor does not determine whether to encode a node with a list of variable length arcs or with * fixed length arcs. It only determines the effective encoding of a node that is already known to be * encoded with fixed length arcs. * See {@code FST.shouldExpandNodeWithFixedLengthArcs()} * and {@code FST.shouldExpandNodeWithDirectAddressing()}. * <p> * For English words we measured 217K nodes, only 3.27% nodes are encoded with fixed length arcs, * and 99.99% of them with direct addressing. Overall FST memory reduced by 1.67%. * <p> * For worst case we measured 168K nodes, 50% of them are encoded with fixed length arcs, * and 14% of them with direct encoding. Overall FST memory reduced by 0.8%. * <p> * Use {@code TestFstDirectAddressing.main()} * and {@code TestFstDirectAddressing.testWorstCaseForDirectAddressing()} * to evaluate a change. * * @see #setDirectAddressingMaxOversizingFactor */
static final float DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR = 1.0f; private final NodeHash<T> dedupHash; final FST<T> fst; private final T NO_OUTPUT; // private static final boolean DEBUG = true; // simplistic pruning: we prune node (and all following // nodes) if less than this number of terms go through it: private final int minSuffixCount1; // better pruning: we prune node (and all following // nodes) if the prior node has less than this number of // terms go through it: private final int minSuffixCount2; private final boolean doShareNonSingletonNodes; private final int shareMaxTailLength; private final IntsRefBuilder lastInput = new IntsRefBuilder(); // NOTE: cutting this over to ArrayList instead loses ~6% // in build performance on 9.8M Wikipedia terms; so we // left this as an array: // current "frontier" private UnCompiledNode<T>[] frontier; // Used for the BIT_TARGET_NEXT optimization (whereby // instead of storing the address of the target node for // a given arc, we mark a single bit noting that the next // node in the byte[] is the target node): long lastFrozenNode; // Reused temporarily while building the FST: int[] numBytesPerArc = new int[4]; int[] numLabelBytesPerArc = new int[numBytesPerArc.length]; final FixedLengthArcsBuffer fixedLengthArcsBuffer = new FixedLengthArcsBuffer(); long arcCount; long nodeCount; long binarySearchNodeCount; long directAddressingNodeCount; boolean allowFixedLengthArcs; float directAddressingMaxOversizingFactor = DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR; long directAddressingExpansionCredit; BytesStore bytes;
Instantiates an FST/FSA builder without any pruning. A shortcut to Builder(INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int) with pruning options turned off.
/** * Instantiates an FST/FSA builder without any pruning. A shortcut to {@link * #Builder(FST.INPUT_TYPE, int, int, boolean, boolean, int, Outputs, boolean, int)} with * pruning options turned off. */
public Builder(FST.INPUT_TYPE inputType, Outputs<T> outputs) { this(inputType, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15); }
Instantiates an FST/FSA builder with all the possible tuning and construction tweaks. Read parameter documentation carefully.
Params:
  • inputType – The input type (transition labels). Can be anything from INPUT_TYPE enumeration. Shorter types will consume less memory. Strings (character sequences) are represented as INPUT_TYPE.BYTE4 (full unicode codepoints).
  • minSuffixCount1 – If pruning the input graph during construction, this threshold is used for telling if a node is kept or pruned. If transition_count(node) >= minSuffixCount1, the node is kept.
  • minSuffixCount2 – (Note: only Mike McCandless knows what this one is really doing...)
  • doShareSuffix – If true, the shared suffixes will be compacted into unique paths. This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to false creates a single suffix path for all input sequences. This will result in a larger FST, but requires substantially less memory and CPU during building.
  • doShareNonSingletonNodes – Only used if doShareSuffix is true. Set this to true to ensure FST is fully minimal, at cost of more CPU and more RAM during building.
  • shareMaxTailLength – Only used if doShareSuffix is true. Set this to Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more CPU and more RAM during building.
  • outputs – The output type for each input sequence. Applies only if building an FST. For FSA, use NoOutputs.getSingleton() and NoOutputs.getNoOutput() as the singleton output object.
  • allowFixedLengthArcs – Pass false to disable the fixed length arc optimization (binary search or direct addressing) while building the FST; this will make the resulting FST smaller but slower to traverse.
  • bytesPageBits – How many bits wide to make each byte[] block in the BytesStore; if you know the FST will be large then make this larger. For example 15 bits = 32768 byte pages.
/** * Instantiates an FST/FSA builder with all the possible tuning and construction * tweaks. Read parameter documentation carefully. * * @param inputType * The input type (transition labels). Can be anything from {@link INPUT_TYPE} * enumeration. Shorter types will consume less memory. Strings (character sequences) are * represented as {@link INPUT_TYPE#BYTE4} (full unicode codepoints). * * @param minSuffixCount1 * If pruning the input graph during construction, this threshold is used for telling * if a node is kept or pruned. If transition_count(node) &gt;= minSuffixCount1, the node * is kept. * * @param minSuffixCount2 * (Note: only Mike McCandless knows what this one is really doing...) * * @param doShareSuffix * If <code>true</code>, the shared suffixes will be compacted into unique paths. * This requires an additional RAM-intensive hash map for lookups in memory. Setting this parameter to * <code>false</code> creates a single suffix path for all input sequences. This will result in a larger * FST, but requires substantially less memory and CPU during building. * * @param doShareNonSingletonNodes * Only used if doShareSuffix is true. Set this to * true to ensure FST is fully minimal, at cost of more * CPU and more RAM during building. * * @param shareMaxTailLength * Only used if doShareSuffix is true. Set this to * Integer.MAX_VALUE to ensure FST is fully minimal, at cost of more * CPU and more RAM during building. * * @param outputs The output type for each input sequence. Applies only if building an FST. For * FSA, use {@link NoOutputs#getSingleton()} and {@link NoOutputs#getNoOutput()} as the * singleton output object. * * @param allowFixedLengthArcs Pass false to disable the fixed length arc optimization (binary search or * direct addressing) while building the FST; this will make the resulting FST smaller but slower to * traverse. * * @param bytesPageBits How many bits wide to make each * byte[] block in the BytesStore; if you know the FST * will be large then make this larger. For example 15 * bits = 32768 byte pages. */
public Builder(FST.INPUT_TYPE inputType, int minSuffixCount1, int minSuffixCount2, boolean doShareSuffix, boolean doShareNonSingletonNodes, int shareMaxTailLength, Outputs<T> outputs, boolean allowFixedLengthArcs, int bytesPageBits) { this.minSuffixCount1 = minSuffixCount1; this.minSuffixCount2 = minSuffixCount2; this.doShareNonSingletonNodes = doShareNonSingletonNodes; this.shareMaxTailLength = shareMaxTailLength; this.allowFixedLengthArcs = allowFixedLengthArcs; fst = new FST<>(inputType, outputs, bytesPageBits); bytes = fst.bytes; assert bytes != null; if (doShareSuffix) { dedupHash = new NodeHash<>(fst, bytes.getReverseReader(false)); } else { dedupHash = null; } NO_OUTPUT = outputs.getNoOutput(); @SuppressWarnings({"rawtypes","unchecked"}) final UnCompiledNode<T>[] f = (UnCompiledNode<T>[]) new UnCompiledNode[10]; frontier = f; for(int idx=0;idx<frontier.length;idx++) { frontier[idx] = new UnCompiledNode<>(this, idx); } }
Overrides the default the maximum oversizing of fixed array allowed to enable direct addressing of arcs instead of binary search.

Setting this factor to a negative value (e.g. -1) effectively disables direct addressing, only binary search nodes will be created.

See Also:
  • DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR
/** * Overrides the default the maximum oversizing of fixed array allowed to enable direct addressing * of arcs instead of binary search. * <p> * Setting this factor to a negative value (e.g. -1) effectively disables direct addressing, * only binary search nodes will be created. * * @see #DIRECT_ADDRESSING_MAX_OVERSIZING_FACTOR */
public Builder<T> setDirectAddressingMaxOversizingFactor(float factor) { directAddressingMaxOversizingFactor = factor; return this; }
See Also:
  • setDirectAddressingMaxOversizingFactor(float)
/** * @see #setDirectAddressingMaxOversizingFactor(float) */
public float getDirectAddressingMaxOversizingFactor() { return directAddressingMaxOversizingFactor; } public long getTermCount() { return frontier[0].inputCount; } public long getNodeCount() { // 1+ in order to count the -1 implicit final node return 1+nodeCount; } public long getArcCount() { return arcCount; } public long getMappedStateCount() { return dedupHash == null ? 0 : nodeCount; } private CompiledNode compileNode(UnCompiledNode<T> nodeIn, int tailLength) throws IOException { final long node; long bytesPosStart = bytes.getPosition(); if (dedupHash != null && (doShareNonSingletonNodes || nodeIn.numArcs <= 1) && tailLength <= shareMaxTailLength) { if (nodeIn.numArcs == 0) { node = fst.addNode(this, nodeIn); lastFrozenNode = node; } else { node = dedupHash.add(this, nodeIn); } } else { node = fst.addNode(this, nodeIn); } assert node != -2; long bytesPosEnd = bytes.getPosition(); if (bytesPosEnd != bytesPosStart) { // The FST added a new node: assert bytesPosEnd > bytesPosStart; lastFrozenNode = node; } nodeIn.clear(); final CompiledNode fn = new CompiledNode(); fn.node = node; return fn; } private void freezeTail(int prefixLenPlus1) throws IOException { //System.out.println(" compileTail " + prefixLenPlus1); final int downTo = Math.max(1, prefixLenPlus1); for(int idx=lastInput.length(); idx >= downTo; idx--) { boolean doPrune = false; boolean doCompile = false; final UnCompiledNode<T> node = frontier[idx]; final UnCompiledNode<T> parent = frontier[idx-1]; if (node.inputCount < minSuffixCount1) { doPrune = true; doCompile = true; } else if (idx > prefixLenPlus1) { // prune if parent's inputCount is less than suffixMinCount2 if (parent.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && parent.inputCount == 1 && idx > 1)) { // my parent, about to be compiled, doesn't make the cut, so // I'm definitely pruned // if minSuffixCount2 is 1, we keep only up // until the 'distinguished edge', ie we keep only the // 'divergent' part of the FST. if my parent, about to be // compiled, has inputCount 1 then we are already past the // distinguished edge. NOTE: this only works if // the FST outputs are not "compressible" (simple // ords ARE compressible). doPrune = true; } else { // my parent, about to be compiled, does make the cut, so // I'm definitely not pruned doPrune = false; } doCompile = true; } else { // if pruning is disabled (count is 0) we can always // compile current node doCompile = minSuffixCount2 == 0; } //System.out.println(" label=" + ((char) lastInput.ints[lastInput.offset+idx-1]) + " idx=" + idx + " inputCount=" + frontier[idx].inputCount + " doCompile=" + doCompile + " doPrune=" + doPrune); if (node.inputCount < minSuffixCount2 || (minSuffixCount2 == 1 && node.inputCount == 1 && idx > 1)) { // drop all arcs for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) { @SuppressWarnings({"rawtypes","unchecked"}) final UnCompiledNode<T> target = (UnCompiledNode<T>) node.arcs[arcIdx].target; target.clear(); } node.numArcs = 0; } if (doPrune) { // this node doesn't make it -- deref it node.clear(); parent.deleteLast(lastInput.intAt(idx-1), node); } else { if (minSuffixCount2 != 0) { compileAllTargets(node, lastInput.length()-idx); } final T nextFinalOutput = node.output; // We "fake" the node as being final if it has no // outgoing arcs; in theory we could leave it // as non-final (the FST can represent this), but // FSTEnum, Util, etc., have trouble w/ non-final // dead-end states: final boolean isFinal = node.isFinal || node.numArcs == 0; if (doCompile) { // this node makes it and we now compile it. first, // compile any targets that were previously // undecided: parent.replaceLast(lastInput.intAt(idx-1), compileNode(node, 1+lastInput.length()-idx), nextFinalOutput, isFinal); } else { // replaceLast just to install // nextFinalOutput/isFinal onto the arc parent.replaceLast(lastInput.intAt(idx-1), node, nextFinalOutput, isFinal); // this node will stay in play for now, since we are // undecided on whether to prune it. later, it // will be either compiled or pruned, so we must // allocate a new node: frontier[idx] = new UnCompiledNode<>(this, idx); } } } } // for debugging /* private String toString(BytesRef b) { try { return b.utf8ToString() + " " + b; } catch (Throwable t) { return b.toString(); } } */
Add the next input/output pair. The provided input must be sorted after the previous one according to IntsRef.compareTo. It's also OK to add the same input twice in a row with different outputs, as long as Outputs implements the Outputs.merge method. Note that input is fully consumed after this method is returned (so caller is free to reuse), but output is not. So if your outputs are changeable (eg ByteSequenceOutputs or IntSequenceOutputs) then you cannot reuse across calls.
/** Add the next input/output pair. The provided input * must be sorted after the previous one according to * {@link IntsRef#compareTo}. It's also OK to add the same * input twice in a row with different outputs, as long * as {@link Outputs} implements the {@link Outputs#merge} * method. Note that input is fully consumed after this * method is returned (so caller is free to reuse), but * output is not. So if your outputs are changeable (eg * {@link ByteSequenceOutputs} or {@link * IntSequenceOutputs}) then you cannot reuse across * calls. */
public void add(IntsRef input, T output) throws IOException { /* if (DEBUG) { BytesRef b = new BytesRef(input.length); for(int x=0;x<input.length;x++) { b.bytes[x] = (byte) input.ints[x]; } b.length = input.length; if (output == NO_OUTPUT) { System.out.println("\nFST ADD: input=" + toString(b) + " " + b); } else { System.out.println("\nFST ADD: input=" + toString(b) + " " + b + " output=" + fst.outputs.outputToString(output)); } } */ // De-dup NO_OUTPUT since it must be a singleton: if (output.equals(NO_OUTPUT)) { output = NO_OUTPUT; } assert lastInput.length() == 0 || input.compareTo(lastInput.get()) >= 0: "inputs are added out of order lastInput=" + lastInput.get() + " vs input=" + input; assert validOutput(output); //System.out.println("\nadd: " + input); if (input.length == 0) { // empty input: only allowed as first input. we have // to special case this because the packed FST // format cannot represent the empty input since // 'finalness' is stored on the incoming arc, not on // the node frontier[0].inputCount++; frontier[0].isFinal = true; fst.setEmptyOutput(output); return; } // compare shared prefix length int pos1 = 0; int pos2 = input.offset; final int pos1Stop = Math.min(lastInput.length(), input.length); while(true) { frontier[pos1].inputCount++; //System.out.println(" incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + frontier[pos1]); if (pos1 >= pos1Stop || lastInput.intAt(pos1) != input.ints[pos2]) { break; } pos1++; pos2++; } final int prefixLenPlus1 = pos1+1; if (frontier.length < input.length+1) { final UnCompiledNode<T>[] next = ArrayUtil.grow(frontier, input.length+1); for(int idx=frontier.length;idx<next.length;idx++) { next[idx] = new UnCompiledNode<>(this, idx); } frontier = next; } // minimize/compile states from previous input's // orphan'd suffix freezeTail(prefixLenPlus1); // init tail states for current input for(int idx=prefixLenPlus1;idx<=input.length;idx++) { frontier[idx-1].addArc(input.ints[input.offset + idx - 1], frontier[idx]); frontier[idx].inputCount++; } final UnCompiledNode<T> lastNode = frontier[input.length]; if (lastInput.length() != input.length || prefixLenPlus1 != input.length + 1) { lastNode.isFinal = true; lastNode.output = NO_OUTPUT; } // push conflicting outputs forward, only as far as // needed for(int idx=1;idx<prefixLenPlus1;idx++) { final UnCompiledNode<T> node = frontier[idx]; final UnCompiledNode<T> parentNode = frontier[idx-1]; final T lastOutput = parentNode.getLastOutput(input.ints[input.offset + idx - 1]); assert validOutput(lastOutput); final T commonOutputPrefix; final T wordSuffix; if (lastOutput != NO_OUTPUT) { commonOutputPrefix = fst.outputs.common(output, lastOutput); assert validOutput(commonOutputPrefix); wordSuffix = fst.outputs.subtract(lastOutput, commonOutputPrefix); assert validOutput(wordSuffix); parentNode.setLastOutput(input.ints[input.offset + idx - 1], commonOutputPrefix); node.prependOutput(wordSuffix); } else { commonOutputPrefix = wordSuffix = NO_OUTPUT; } output = fst.outputs.subtract(output, commonOutputPrefix); assert validOutput(output); } if (lastInput.length() == input.length && prefixLenPlus1 == 1+input.length) { // same input more than 1 time in a row, mapping to // multiple outputs lastNode.output = fst.outputs.merge(lastNode.output, output); } else { // this new arc is private to this new input; set its // arc output to the leftover output: frontier[prefixLenPlus1-1].setLastOutput(input.ints[input.offset + prefixLenPlus1-1], output); } // save last input lastInput.copyInts(input); //System.out.println(" count[0]=" + frontier[0].inputCount); } private boolean validOutput(T output) { return output == NO_OUTPUT || !output.equals(NO_OUTPUT); }
Returns final FST. NOTE: this will return null if nothing is accepted by the FST.
/** Returns final FST. NOTE: this will return null if * nothing is accepted by the FST. */
public FST<T> finish() throws IOException { final UnCompiledNode<T> root = frontier[0]; // minimize nodes in the last word's suffix freezeTail(0); if (root.inputCount < minSuffixCount1 || root.inputCount < minSuffixCount2 || root.numArcs == 0) { if (fst.emptyOutput == null) { return null; } else if (minSuffixCount1 > 0 || minSuffixCount2 > 0) { // empty string got pruned return null; } } else { if (minSuffixCount2 != 0) { compileAllTargets(root, lastInput.length()); } } //if (DEBUG) System.out.println(" builder.finish root.isFinal=" + root.isFinal + " root.output=" + root.output); fst.finish(compileNode(root, lastInput.length()).node); return fst; } private void compileAllTargets(UnCompiledNode<T> node, int tailLength) throws IOException { for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) { final Arc<T> arc = node.arcs[arcIdx]; if (!arc.target.isCompiled()) { // not yet compiled @SuppressWarnings({"rawtypes","unchecked"}) final UnCompiledNode<T> n = (UnCompiledNode<T>) arc.target; if (n.numArcs == 0) { //System.out.println("seg=" + segment + " FORCE final arc=" + (char) arc.label); arc.isFinal = n.isFinal = true; } arc.target = compileNode(n, tailLength-1); } } }
Expert: holds a pending (seen but not yet serialized) arc.
/** Expert: holds a pending (seen but not yet serialized) arc. */
public static class Arc<T> { public int label; // really an "unsigned" byte public Node target; public boolean isFinal; public T output; public T nextFinalOutput; } // NOTE: not many instances of Node or CompiledNode are in // memory while the FST is being built; it's only the // current "frontier": static interface Node { boolean isCompiled(); } public long fstRamBytesUsed() { return fst.ramBytesUsed(); } static final class CompiledNode implements Node { long node; @Override public boolean isCompiled() { return true; } }
Expert: holds a pending (seen but not yet serialized) Node.
/** Expert: holds a pending (seen but not yet serialized) Node. */
public static final class UnCompiledNode<T> implements Node { final Builder<T> owner; public int numArcs; public Arc<T>[] arcs; // TODO: instead of recording isFinal/output on the // node, maybe we should use -1 arc to mean "end" (like // we do when reading the FST). Would simplify much // code here... public T output; public boolean isFinal; public long inputCount;
This node's depth, starting from the automaton root.
/** This node's depth, starting from the automaton root. */
public final int depth;
Params:
  • depth – The node's depth starting from the automaton root. Needed for LUCENE-2934 (node expansion based on conditions other than the fanout size).
/** * @param depth * The node's depth starting from the automaton root. Needed for * LUCENE-2934 (node expansion based on conditions other than the * fanout size). */
@SuppressWarnings({"rawtypes","unchecked"}) public UnCompiledNode(Builder<T> owner, int depth) { this.owner = owner; arcs = (Arc<T>[]) new Arc[1]; arcs[0] = new Arc<>(); output = owner.NO_OUTPUT; this.depth = depth; } @Override public boolean isCompiled() { return false; } public void clear() { numArcs = 0; isFinal = false; output = owner.NO_OUTPUT; inputCount = 0; // We don't clear the depth here because it never changes // for nodes on the frontier (even when reused). } public T getLastOutput(int labelToMatch) { assert numArcs > 0; assert arcs[numArcs-1].label == labelToMatch; return arcs[numArcs-1].output; } public void addArc(int label, Node target) { assert label >= 0; assert numArcs == 0 || label > arcs[numArcs-1].label: "arc[numArcs-1].label=" + arcs[numArcs-1].label + " new label=" + label + " numArcs=" + numArcs; if (numArcs == arcs.length) { final Arc<T>[] newArcs = ArrayUtil.grow(arcs, numArcs+1); for(int arcIdx=numArcs;arcIdx<newArcs.length;arcIdx++) { newArcs[arcIdx] = new Arc<>(); } arcs = newArcs; } final Arc<T> arc = arcs[numArcs++]; arc.label = label; arc.target = target; arc.output = arc.nextFinalOutput = owner.NO_OUTPUT; arc.isFinal = false; } public void replaceLast(int labelToMatch, Node target, T nextFinalOutput, boolean isFinal) { assert numArcs > 0; final Arc<T> arc = arcs[numArcs-1]; assert arc.label == labelToMatch: "arc.label=" + arc.label + " vs " + labelToMatch; arc.target = target; //assert target.node != -2; arc.nextFinalOutput = nextFinalOutput; arc.isFinal = isFinal; } public void deleteLast(int label, Node target) { assert numArcs > 0; assert label == arcs[numArcs-1].label; assert target == arcs[numArcs-1].target; numArcs--; } public void setLastOutput(int labelToMatch, T newOutput) { assert owner.validOutput(newOutput); assert numArcs > 0; final Arc<T> arc = arcs[numArcs-1]; assert arc.label == labelToMatch; arc.output = newOutput; } // pushes an output prefix forward onto all arcs public void prependOutput(T outputPrefix) { assert owner.validOutput(outputPrefix); for(int arcIdx=0;arcIdx<numArcs;arcIdx++) { arcs[arcIdx].output = owner.fst.outputs.add(outputPrefix, arcs[arcIdx].output); assert owner.validOutput(arcs[arcIdx].output); } if (isFinal) { output = owner.fst.outputs.add(outputPrefix, output); assert owner.validOutput(output); } } }
Reusable buffer for building nodes with fixed length arcs (binary search or direct addressing).
/** * Reusable buffer for building nodes with fixed length arcs (binary search or direct addressing). */
static class FixedLengthArcsBuffer { // Initial capacity is the max length required for the header of a node with fixed length arcs: // header(byte) + numArcs(vint) + numBytes(vint) private byte[] bytes = new byte[11]; private final ByteArrayDataOutput bado = new ByteArrayDataOutput(bytes);
Ensures the capacity of the internal byte array. Enlarges it if needed.
/** Ensures the capacity of the internal byte array. Enlarges it if needed. */
FixedLengthArcsBuffer ensureCapacity(int capacity) { if (bytes.length < capacity) { bytes = new byte[ArrayUtil.oversize(capacity, Byte.BYTES)]; bado.reset(bytes); } return this; } FixedLengthArcsBuffer resetPosition() { bado.reset(bytes); return this; } FixedLengthArcsBuffer writeByte(byte b) { bado.writeByte(b); return this; } FixedLengthArcsBuffer writeVInt(int i) { try { bado.writeVInt(i); } catch (IOException e) { // Never thrown. throw new RuntimeException(e); } return this; } int getPosition() { return bado.getPosition(); }
Gets the internal byte array.
/** Gets the internal byte array. */
byte[] getBytes() { return bytes; } } }