package org.apache.lucene.analysis;
import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RollingBuffer;
import org.apache.lucene.util.automaton.Automaton;
/**
 * Consumes a {@link TokenStream} and turns the token graph it describes (position increments plus
 * position lengths) into an {@link Automaton}. Each term contributes a chain of arcs, one per byte
 * or, when {@link #setUnicodeArcs} is enabled, one per Unicode code point.
 */
public class TokenStreamToAutomaton {

  /** Label of the arc that links a position's arriving state to its leaving state. */
  public static final int POS_SEP = 0x001f;

  /** Label of the arc inserted in place of a term to bridge a hole between positions. */
  public static final int HOLE = 0x001e;

  // Enabled unless switched off through the setter.
  private boolean preservePositionIncrements = true;
  private boolean finalOffsetGapAsHole;
  private boolean unicodeArcs;

  /** Creates a converter with position increments preserved. */
  public TokenStreamToAutomaton() {
  }

  /**
   * Controls how position increments are honoured. When {@code false}, any increment greater than
   * one is treated as one and a positive increment reported after {@code end()} is ignored.
   */
  public void setPreservePositionIncrements(boolean enablePositionIncrements) {
    preservePositionIncrements = enablePositionIncrements;
  }

  /**
   * When {@code true}, a stream whose final position increment is zero but whose final end offset
   * lies beyond the largest token end offset gets a single trailing hole.
   */
  public void setFinalOffsetGapAsHole(boolean finalOffsetGapAsHole) {
    this.finalOffsetGapAsHole = finalOffsetGapAsHole;
  }

  /**
   * Selects the arc labels: Unicode code points when {@code true}, the term's bytes (as unsigned
   * values) when {@code false}.
   */
  public void setUnicodeArcs(boolean unicodeArcs) {
    this.unicodeArcs = unicodeArcs;
  }

  /** Per-position bookkeeping; {@code -1} means "no state assigned yet". */
  private static class Position implements RollingBuffer.Resettable {
    /** State reached by tokens that end at this position. */
    int arriving = -1;

    /** State from which tokens that start at this position depart. */
    int leaving = -1;

    @Override
    public void reset() {
      arriving = leaving = -1;
    }
  }

  /** Rolling window of {@link Position} entries. */
  private static class Positions extends RollingBuffer<Position> {
    @Override
    protected Position newInstance() {
      return new Position();
    }
  }

  /**
   * Hook for subclasses that need to rewrite a term before its arcs are added. This implementation
   * hands back the input untouched.
   */
  protected BytesRef changeToken(BytesRef in) {
    return in;
  }

  /**
   * Resets the given stream, consumes it through {@code end()} and returns the automaton for its
   * token graph. The stream is not closed here.
   *
   * @param in the token stream to consume
   * @return the finished automaton
   * @throws IOException if the stream fails while being read
   */
  public Automaton toAutomaton(TokenStream in) throws IOException {
    final Automaton.Builder builder = new Automaton.Builder();
    // Create the first state up front; the code below refers to it as state 0.
    builder.createState();

    final TermToBytesRefAttribute termBytesAtt = in.addAttribute(TermToBytesRefAttribute.class);
    final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = in.addAttribute(OffsetAttribute.class);

    in.reset();

    final RollingBuffer<Position> positions = new Positions();

    int pos = -1;
    int freedPos = 0;
    Position posData = null;
    int maxOffset = 0;

    while (in.incrementToken()) {
      int posInc = posIncAtt.getPositionIncrement();
      if (!preservePositionIncrements && posInc > 1) {
        posInc = 1;
      }
      // The very first token must move us onto a real position.
      assert posInc > 0 || pos > -1;

      if (posInc > 0) {
        pos += posInc;
        posData = openPosition(builder, positions, pos, posInc);
        freedPos = releaseCompleted(positions, freedPos, pos);
      }

      final int endPos = pos + posLengthAtt.getPositionLength();
      final BytesRef termUTF8 = changeToken(termBytesAtt.getBytesRef());

      final Position endPosData = positions.get(endPos);
      if (endPosData.arriving == -1) {
        endPosData.arriving = builder.createState();
      }

      final int[] termUnicode;
      final int labelCount;
      if (unicodeArcs) {
        termUnicode = decodeCodePoints(termUTF8);
        labelCount = termUnicode.length;
      } else {
        termUnicode = null;
        labelCount = termUTF8.length;
      }

      // Chain one arc per label from this position's leaving state to the
      // end position's arriving state, creating intermediate states on the way.
      final int lastIdx = labelCount - 1;
      int from = posData.leaving;
      for (int idx = 0; idx <= lastIdx; idx++) {
        final int to;
        if (idx == lastIdx) {
          to = endPosData.arriving;
        } else {
          to = builder.createState();
        }
        final int label;
        if (unicodeArcs) {
          label = termUnicode[idx];
        } else {
          label = termUTF8.bytes[termUTF8.offset + idx] & 0xff;
        }
        builder.addTransition(from, to, label);
        from = to;
      }

      final int tokenEndOffset = offsetAtt.endOffset();
      if (tokenEndOffset > maxOffset) {
        maxOffset = tokenEndOffset;
      }
    }

    in.end();

    // Work out how many trailing holes follow the last token.
    int endPosInc = posIncAtt.getPositionIncrement();
    if (endPosInc > 0) {
      if (!preservePositionIncrements) {
        endPosInc = 0;
      }
    } else if (endPosInc == 0 && finalOffsetGapAsHole && offsetAtt.endOffset() > maxOffset) {
      endPosInc = 1;
    }

    int endState = -1;
    if (endPosInc > 0) {
      endState = builder.createState();
      int tail = endState;
      for (int remaining = endPosInc; remaining > 0; remaining--) {
        final int afterHole = builder.createState();
        builder.addTransition(tail, afterHole, HOLE);
        if (remaining == 1) {
          // The last hole ends the chain and is where the automaton accepts.
          builder.setAccept(afterHole, true);
        } else {
          tail = builder.createState();
          builder.addTransition(afterHole, tail, POS_SEP);
        }
      }
    }

    // Every position past the last token start that some token arrived at
    // either accepts directly or continues into the trailing-hole chain.
    for (int p = pos + 1; p <= positions.getMaxPos(); p++) {
      final Position pending = positions.get(p);
      if (pending.arriving == -1) {
        continue;
      }
      if (endState == -1) {
        builder.setAccept(pending.arriving, true);
      } else {
        builder.addTransition(pending.arriving, endState, POS_SEP);
      }
    }

    return builder.finish();
  }

  /** Assigns the leaving state for a newly reached position and wires it to what came before. */
  private static Position openPosition(
      Automaton.Builder builder, RollingBuffer<Position> positions, int pos, int posInc) {
    final Position current = positions.get(pos);
    assert current.leaving == -1;

    final boolean nothingArrived = current.arriving == -1;
    if (nothingArrived && pos == 0) {
      // First position of the stream: tokens leave from state 0.
      current.leaving = 0;
      return current;
    }

    current.leaving = builder.createState();
    if (nothingArrived) {
      // No token ended here, so the gap behind us has to be bridged.
      addHoles(builder, positions, pos);
    } else {
      builder.addTransition(current.arriving, current.leaving, POS_SEP);
      if (posInc > 1) {
        // Positions were skipped even though a token arrived here; fill them in.
        addHoles(builder, positions, pos);
      }
    }
    return current;
  }

  /**
   * Frees buffered positions starting at {@code freedPos}, stopping at the first one that still
   * lacks an arriving or leaving state. Returns the updated {@code freedPos}.
   */
  private static int releaseCompleted(RollingBuffer<Position> positions, int freedPos, int pos) {
    for (; freedPos <= pos; freedPos++) {
      final Position candidate = positions.get(freedPos);
      final boolean complete = candidate.arriving != -1 && candidate.leaving != -1;
      if (!complete) {
        break;
      }
      positions.freeBefore(freedPos);
    }
    return freedPos;
  }

  /** Decodes the UTF-8 term into an array holding one entry per Unicode code point. */
  private static int[] decodeCodePoints(BytesRef utf8) {
    final String utf16 = utf8.utf8ToString();
    final int[] codePoints = new int[utf16.codePointCount(0, utf16.length())];
    int written = 0;
    int charIdx = 0;
    while (charIdx < utf16.length()) {
      final int cp = utf16.codePointAt(charIdx);
      codePoints[written++] = cp;
      charIdx += Character.charCount(cp);
    }
    return codePoints;
  }

  /**
   * Walks backwards from {@code pos}, adding a {@link #HOLE} arc between each pair of adjacent
   * positions, until it meets a pair whose states were both already assigned or runs out of
   * positions.
   */
  private static void addHoles(Automaton.Builder builder, RollingBuffer<Position> positions, int pos) {
    Position right = positions.get(pos);
    Position left = positions.get(pos - 1);
    int cursor = pos;

    while (right.arriving == -1 || left.leaving == -1) {
      if (right.arriving == -1) {
        right.arriving = builder.createState();
        builder.addTransition(right.arriving, right.leaving, POS_SEP);
      }
      if (left.leaving == -1) {
        // Position 0 always leaves from state 0.
        left.leaving = (cursor == 1) ? 0 : builder.createState();
        if (left.arriving != -1) {
          builder.addTransition(left.arriving, left.leaving, POS_SEP);
        }
      }
      builder.addTransition(left.leaving, right.arriving, HOLE);

      cursor--;
      if (cursor <= 0) {
        return;
      }
      right = left;
      left = positions.get(cursor - 1);
    }
  }
}