package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
/**
 * Token filter that consumes every token of its input, removes duplicates,
 * sorts the remaining terms and joins them with a separator character into
 * at most one output token of type {@code "fingerprint"}.
 *
 * <p>No token is emitted when the input yields no terms, or when the joined
 * terms (including separators) would be longer than the configured maximum
 * output size.
 */
public class FingerprintFilter extends TokenFilter {

  /** Maximum output size used by {@link #FingerprintFilter(TokenStream)}. */
  public static final int DEFAULT_MAX_OUTPUT_TOKEN_SIZE = 1024;

  /** Separator used by {@link #FingerprintFilter(TokenStream)}. */
  public static final char DEFAULT_SEPARATOR = ' ';

  /**
   * Orders {@code char[]} terms by comparing their chars position by
   * position; when one term is a prefix of the other the shorter one sorts
   * first. Shared by all instances so that no comparator is allocated per
   * stream and no reference to a filter instance is retained.
   */
  private static final Comparator<Object> TERM_ORDER = new Comparator<Object>() {
    @Override
    public int compare(Object o1, Object o2) {
      char[] v1 = (char[]) o1;
      char[] v2 = (char[]) o2;
      int len1 = v1.length;
      int len2 = v2.length;
      int lim = Math.min(len1, len2);
      for (int k = 0; k < lim; k++) {
        char c1 = v1[k];
        char c2 = v2[k];
        if (c1 != c2) {
          // chars widen to non-negative ints, so the subtraction cannot overflow
          return c1 - c2;
        }
      }
      return len1 - len2;
    }
  };

  private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  /** De-duplicated terms gathered during the current pass over the input. */
  private CharArraySet uniqueTerms = null;

  /** Upper bound on the length of the emitted token, separators included. */
  private final int maxOutputTokenSize;

  /** Attribute snapshot taken after the input was consumed; restored by {@link #end()}. */
  private AttributeSource.State finalState;

  /** Character placed between the sorted terms in the output token. */
  private final char separator;

  /** True once {@code input.end()} has been called for the current stream. */
  private boolean inputEnded = false;

  /**
   * Creates a filter using {@link #DEFAULT_MAX_OUTPUT_TOKEN_SIZE} and
   * {@link #DEFAULT_SEPARATOR}.
   *
   * @param input the stream whose tokens are combined
   */
  public FingerprintFilter(TokenStream input) {
    this(input, DEFAULT_MAX_OUTPUT_TOKEN_SIZE, DEFAULT_SEPARATOR);
  }

  /**
   * Creates a filter with an explicit size limit and separator.
   *
   * @param input the stream whose tokens are combined
   * @param maxOutputTokenSize maximum length of the emitted token, counting
   *     one char for each separator; larger results are discarded
   * @param separator character inserted between the sorted terms
   */
  public FingerprintFilter(TokenStream input, int maxOutputTokenSize,
      char separator) {
    super(input);
    this.maxOutputTokenSize = maxOutputTokenSize;
    this.separator = separator;
  }

  /**
   * Emits the single combined token on the first call after a reset and
   * returns {@code false} on every later call.
   *
   * @return {@code true} if a combined token was produced
   * @throws IOException if reading from the input fails
   */
  @Override
  public final boolean incrementToken() throws IOException {
    if (inputEnded) {
      return false;
    }
    boolean result = buildSingleOutputToken();
    // Snapshot the attributes so end() can re-establish them later.
    finalState = captureState();
    return result;
  }

  /**
   * Gathers all tokens from the input, de-duplicates, sorts and then
   * concatenates them into the term attribute.
   *
   * @return {@code true} if a token was written to the attributes;
   *     {@code false} if nothing was gathered or the result was too large
   * @throws IOException if reading from the input fails
   */
  private boolean buildSingleOutputToken() throws IOException {
    inputEnded = false;

    char[] clonedLastTerm = null;
    uniqueTerms = new CharArraySet(8, false);
    int outputTokenSize = 0;
    while (input.incrementToken()) {
      if (outputTokenSize > maxOutputTokenSize) {
        // Already over the limit: keep draining the input but stop collecting.
        continue;
      }

      final char[] term = termAttribute.buffer();
      final int length = termAttribute.length();

      if (!uniqueTerms.contains(term, 0, length)) {
        // The attribute's buffer is reused, so store a private copy of the term.
        clonedLastTerm = new char[length];
        System.arraycopy(term, 0, clonedLastTerm, 0, length);
        if (uniqueTerms.size() > 0) {
          outputTokenSize++; // one char for the separator preceding this term
        }
        uniqueTerms.add(clonedLastTerm);
        outputTokenSize += length;
      }
    }

    // End the input now so its end-of-stream attribute values are available below.
    input.end();
    inputEnded = true;

    // Attributes of the single output token.
    offsetAtt.setOffset(0, offsetAtt.endOffset());
    posLenAtt.setPositionLength(1);
    posIncrAtt.setPositionIncrement(1);
    typeAtt.setType("fingerprint");

    // Nothing gathered: no output.
    if (uniqueTerms.size() < 1) {
      termAttribute.setEmpty();
      return false;
    }

    // Gathered terms are too large: no output.
    if (outputTokenSize > maxOutputTokenSize) {
      termAttribute.setEmpty();
      uniqueTerms.clear();
      return false;
    }

    // Single term: copy it straight into the attribute, no sorting or
    // intermediate String needed.
    if (uniqueTerms.size() == 1) {
      termAttribute.copyBuffer(clonedLastTerm, 0, clonedLastTerm.length);
      uniqueTerms.clear();
      return true;
    }

    // Several terms: sort them and join with the separator.
    Object[] items = uniqueTerms.toArray();
    Arrays.sort(items, TERM_ORDER);

    // outputTokenSize already accounts for every term plus separators.
    StringBuilder sb = new StringBuilder(outputTokenSize);
    for (Object item : items) {
      // The check is on the builder's length, so an empty term that sorts
      // first is not followed by a separator.
      if (sb.length() >= 1) {
        sb.append(separator);
      }
      sb.append((char[]) item);
    }
    termAttribute.setEmpty().append(sb);
    uniqueTerms.clear();
    return true;
  }

  /**
   * Ends the input if that has not happened yet and restores the attribute
   * snapshot taken by {@link #incrementToken()}, if there is one.
   *
   * @throws IOException if ending the input fails
   */
  @Override
  public final void end() throws IOException {
    if (!inputEnded) {
      // Token gathering did not reach its own input.end() call (for example
      // it was never started, or it was interrupted by an exception).
      input.end();
      inputEnded = true;
    }

    if (finalState != null) {
      restoreState(finalState);
    }
  }

  /**
   * Prepares the filter for a new stream, discarding all per-stream state.
   *
   * @throws IOException if resetting the input fails
   */
  @Override
  public void reset() throws IOException {
    super.reset();
    inputEnded = false;
    uniqueTerms = null;
    // Drop the snapshot of the previous pass so end() can never restore
    // attributes that belong to an earlier stream.
    finalState = null;
  }
}