package org.apache.lucene.analysis.hunspell;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
final class Stemmer {
private final Dictionary dictionary;
private final BytesRef scratch = new BytesRef();
private final StringBuilder segment = new StringBuilder();
private final ByteArrayDataInput affixReader;
private final StringBuilder scratchSegment = new StringBuilder();
private char scratchBuffer[] = new char[32];
private final int formStep;
public Stemmer(Dictionary dictionary) {
this.dictionary = dictionary;
this.affixReader = new ByteArrayDataInput(dictionary.affixData);
for (int level = 0; level < 3; level++) {
if (dictionary.prefixes != null) {
prefixArcs[level] = new FST.Arc<>();
prefixReaders[level] = dictionary.prefixes.getBytesReader();
}
if (dictionary.suffixes != null) {
suffixArcs[level] = new FST.Arc<>();
suffixReaders[level] = dictionary.suffixes.getBytesReader();
}
}
formStep = dictionary.hasStemExceptions ? 2 : 1;
}
public List<CharsRef> stem(String word) {
return stem(word.toCharArray(), word.length());
}
public List<CharsRef> stem(char word[], int length) {
if (dictionary.needsInputCleaning) {
scratchSegment.setLength(0);
scratchSegment.append(word, 0, length);
CharSequence cleaned = dictionary.cleanInput(scratchSegment, segment);
scratchBuffer = ArrayUtil.grow(scratchBuffer, cleaned.length());
length = segment.length();
segment.getChars(0, length, scratchBuffer, 0);
word = scratchBuffer;
}
int caseType = caseOf(word, length);
if (caseType == UPPER_CASE) {
caseFoldTitle(word, length);
caseFoldLower(titleBuffer, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(titleBuffer, length, true));
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else if (caseType == TITLE_CASE) {
caseFoldLower(word, length);
List<CharsRef> list = doStem(word, length, false);
list.addAll(doStem(lowerBuffer, length, true));
return list;
} else {
return doStem(word, length, false);
}
}
private char[] lowerBuffer = new char[8];
private char[] titleBuffer = new char[8];
private static final int EXACT_CASE = 0;
private static final int TITLE_CASE = 1;
private static final int UPPER_CASE = 2;
private int caseOf(char word[], int length) {
if (dictionary.ignoreCase || length == 0 || !Character.isUpperCase(word[0])) {
return EXACT_CASE;
}
boolean seenUpper = false;
boolean seenLower = false;
for (int i = 1; i < length; i++) {
boolean v = Character.isUpperCase(word[i]);
seenUpper |= v;
seenLower |= !v;
}
if (!seenLower) {
return UPPER_CASE;
} else if (!seenUpper) {
return TITLE_CASE;
} else {
return EXACT_CASE;
}
}
private void caseFoldTitle(char word[], int length) {
titleBuffer = ArrayUtil.grow(titleBuffer, length);
System.arraycopy(word, 0, titleBuffer, 0, length);
for (int i = 1; i < length; i++) {
titleBuffer[i] = dictionary.caseFold(titleBuffer[i]);
}
}
private void caseFoldLower(char word[], int length) {
lowerBuffer = ArrayUtil.grow(lowerBuffer, length);
System.arraycopy(word, 0, lowerBuffer, 0, length);
lowerBuffer[0] = dictionary.caseFold(lowerBuffer[0]);
}
private List<CharsRef> doStem(char word[], int length, boolean caseVariant) {
List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(word, 0, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
boolean checkKeepCase = caseVariant && dictionary.keepcase != -1;
boolean checkNeedAffix = dictionary.needaffix != -1;
boolean checkOnlyInCompound = dictionary.onlyincompound != -1;
if (checkKeepCase || checkNeedAffix || checkOnlyInCompound) {
dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch);
char wordFlags[] = Dictionary.decodeFlags(scratch);
if (checkKeepCase && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
continue;
}
if (checkNeedAffix && Dictionary.hasFlag(wordFlags, (char)dictionary.needaffix)) {
continue;
}
if (checkOnlyInCompound && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
continue;
}
}
stems.add(newStem(word, length, forms, i));
}
}
try {
boolean v = stems.addAll(stem(word, length, -1, -1, -1, 0, true, true, false, false, caseVariant));
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
return stems;
}
public List<CharsRef> uniqueStems(char word[], int length) {
List<CharsRef> stems = stem(word, length);
if (stems.size() < 2) {
return stems;
}
CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
List<CharsRef> deduped = new ArrayList<>();
for (CharsRef s : stems) {
if (!terms.contains(s)) {
deduped.add(s);
terms.add(s);
}
}
return deduped;
}
private CharsRef newStem(char buffer[], int length, IntsRef forms, int formID) {
final String exception;
if (dictionary.hasStemExceptions) {
int exceptionID = forms.ints[forms.offset + formID + 1];
if (exceptionID > 0) {
exception = dictionary.getStemException(exceptionID);
} else {
exception = null;
}
} else {
exception = null;
}
if (dictionary.needsOutputCleaning) {
scratchSegment.setLength(0);
if (exception != null) {
scratchSegment.append(exception);
} else {
scratchSegment.append(buffer, 0, length);
}
try {
Dictionary.applyMappings(dictionary.oconv, scratchSegment);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
char cleaned[] = new char[scratchSegment.length()];
scratchSegment.getChars(0, cleaned.length, cleaned, 0);
return new CharsRef(cleaned, 0, cleaned.length);
} else {
if (exception != null) {
return new CharsRef(exception);
} else {
return new CharsRef(buffer, 0, length);
}
}
}
final FST.BytesReader prefixReaders[] = new FST.BytesReader[3];
@SuppressWarnings({"unchecked","rawtypes"})
final FST.Arc<IntsRef> prefixArcs[] = new FST.Arc[3];
final FST.BytesReader suffixReaders[] = new FST.BytesReader[3];
@SuppressWarnings({"unchecked","rawtypes"})
final FST.Arc<IntsRef> suffixArcs[] = new FST.Arc[3];
private List<CharsRef> stem(char word[], int length, int previous, int prevFlag, int prefixFlag, int recursionDepth, boolean doPrefix, boolean doSuffix, boolean previousWasPrefix, boolean circumfix, boolean caseVariant) throws IOException {
List<CharsRef> stems = new ArrayList<>();
if (doPrefix && dictionary.prefixes != null) {
FST<IntsRef> fst = dictionary.prefixes;
Outputs<IntsRef> outputs = fst.outputs;
FST.BytesReader bytesReader = prefixReaders[recursionDepth];
FST.Arc<IntsRef> arc = prefixArcs[recursionDepth];
fst.getFirstArc(arc);
IntsRef NO_OUTPUT = outputs.getNoOutput();
IntsRef output = NO_OUTPUT;
int limit = dictionary.fullStrip ? length : length-1;
for (int i = 0; i < limit; i++) {
if (i > 0) {
int ch = word[i-1];
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
break;
} else if (arc.output() != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output());
}
}
IntsRef prefixes = null;
if (!arc.isFinal()) {
continue;
} else {
prefixes = fst.outputs.add(output, arc.nextFinalOutput());
}
for (int j = 0; j < prefixes.length; j++) {
int prefix = prefixes.ints[prefixes.offset + j];
if (prefix == previous) {
continue;
}
affixReader.setPosition(8 * prefix);
char flag = (char) (affixReader.readShort() & 0xffff);
char stripOrd = (char) (affixReader.readShort() & 0xffff);
int condition = (char) (affixReader.readShort() & 0xffff);
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append = (char) (affixReader.readShort() & 0xffff);
final boolean compatible;
if (recursionDepth == 0) {
if (dictionary.onlyincompound == -1) {
compatible = true;
} else {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
}
} else if (crossProduct) {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
assert prevFlag >= 0;
boolean allowed = dictionary.onlyincompound == -1 ||
!Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, false);
} else {
compatible = false;
}
if (compatible) {
int deAffixedStart = i;
int deAffixedLength = length - deAffixedStart;
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd+1];
int stripLength = stripEnd - stripStart;
if (!checkCondition(condition, dictionary.stripData, stripStart, stripLength, word, deAffixedStart, deAffixedLength)) {
continue;
}
char strippedWord[] = new char[stripLength + deAffixedLength];
System.arraycopy(dictionary.stripData, stripStart, strippedWord, 0, stripLength);
System.arraycopy(word, deAffixedStart, strippedWord, stripLength, deAffixedLength);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, prefix, -1, recursionDepth, true, circumfix, caseVariant);
stems.addAll(stemList);
}
}
}
}
if (doSuffix && dictionary.suffixes != null) {
FST<IntsRef> fst = dictionary.suffixes;
Outputs<IntsRef> outputs = fst.outputs;
FST.BytesReader bytesReader = suffixReaders[recursionDepth];
FST.Arc<IntsRef> arc = suffixArcs[recursionDepth];
fst.getFirstArc(arc);
IntsRef NO_OUTPUT = outputs.getNoOutput();
IntsRef output = NO_OUTPUT;
int limit = dictionary.fullStrip ? 0 : 1;
for (int i = length; i >= limit; i--) {
if (i < length) {
int ch = word[i];
if (fst.findTargetArc(ch, arc, arc, bytesReader) == null) {
break;
} else if (arc.output() != NO_OUTPUT) {
output = fst.outputs.add(output, arc.output());
}
}
IntsRef suffixes = null;
if (!arc.isFinal()) {
continue;
} else {
suffixes = fst.outputs.add(output, arc.nextFinalOutput());
}
for (int j = 0; j < suffixes.length; j++) {
int suffix = suffixes.ints[suffixes.offset + j];
if (suffix == previous) {
continue;
}
affixReader.setPosition(8 * suffix);
char flag = (char) (affixReader.readShort() & 0xffff);
char stripOrd = (char) (affixReader.readShort() & 0xffff);
int condition = (char) (affixReader.readShort() & 0xffff);
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append = (char) (affixReader.readShort() & 0xffff);
final boolean compatible;
if (recursionDepth == 0) {
if (dictionary.onlyincompound == -1) {
compatible = true;
} else {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
compatible = !Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
}
} else if (crossProduct) {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
assert prevFlag >= 0;
boolean allowed = dictionary.onlyincompound == -1 ||
!Dictionary.hasFlag(appendFlags, (char) dictionary.onlyincompound);
compatible = allowed && hasCrossCheckedFlag((char)prevFlag, appendFlags, previousWasPrefix);
} else {
compatible = false;
}
if (compatible) {
int appendLength = length - i;
int deAffixedLength = length - appendLength;
int stripStart = dictionary.stripOffsets[stripOrd];
int stripEnd = dictionary.stripOffsets[stripOrd+1];
int stripLength = stripEnd - stripStart;
if (!checkCondition(condition, word, 0, deAffixedLength, dictionary.stripData, stripStart, stripLength)) {
continue;
}
char strippedWord[] = new char[stripLength + deAffixedLength];
System.arraycopy(word, 0, strippedWord, 0, deAffixedLength);
System.arraycopy(dictionary.stripData, stripStart, strippedWord, deAffixedLength, stripLength);
List<CharsRef> stemList = applyAffix(strippedWord, strippedWord.length, suffix, prefixFlag, recursionDepth, false, circumfix, caseVariant);
stems.addAll(stemList);
}
}
}
}
return stems;
}
private boolean checkCondition(int condition, char c1[], int c1off, int c1len, char c2[], int c2off, int c2len) {
if (condition != 0) {
CharacterRunAutomaton pattern = dictionary.patterns.get(condition);
int state = 0;
for (int i = c1off; i < c1off + c1len; i++) {
state = pattern.step(state, c1[i]);
if (state == -1) {
return false;
}
}
for (int i = c2off; i < c2off + c2len; i++) {
state = pattern.step(state, c2[i]);
if (state == -1) {
return false;
}
}
return pattern.isAccept(state);
}
return true;
}
List<CharsRef> applyAffix(char strippedWord[], int length, int affix, int prefixFlag, int recursionDepth, boolean prefix, boolean circumfix, boolean caseVariant) throws IOException {
affixReader.setPosition(8 * affix);
char flag = (char) (affixReader.readShort() & 0xffff);
affixReader.skipBytes(2);
int condition = (char) (affixReader.readShort() & 0xffff);
boolean crossProduct = (condition & 1) == 1;
condition >>>= 1;
char append = (char) (affixReader.readShort() & 0xffff);
List<CharsRef> stems = new ArrayList<>();
IntsRef forms = dictionary.lookupWord(strippedWord, 0, length);
if (forms != null) {
for (int i = 0; i < forms.length; i += formStep) {
dictionary.flagLookup.get(forms.ints[forms.offset+i], scratch);
char wordFlags[] = Dictionary.decodeFlags(scratch);
if (Dictionary.hasFlag(wordFlags, flag)) {
boolean chainedPrefix = dictionary.complexPrefixes && recursionDepth == 1 && prefix;
if (chainedPrefix == false && prefixFlag >= 0 && !Dictionary.hasFlag(wordFlags, (char)prefixFlag)) {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
if (!hasCrossCheckedFlag((char)prefixFlag, appendFlags, false)) {
continue;
}
}
if (dictionary.circumfix != -1) {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
boolean suffixCircumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix);
if (circumfix != suffixCircumfix) {
continue;
}
}
if (caseVariant && dictionary.keepcase != -1 && Dictionary.hasFlag(wordFlags, (char)dictionary.keepcase)) {
continue;
}
if (dictionary.onlyincompound != -1 && Dictionary.hasFlag(wordFlags, (char)dictionary.onlyincompound)) {
continue;
}
stems.add(newStem(strippedWord, length, forms, i));
}
}
}
if (dictionary.circumfix != -1 && !circumfix && prefix) {
dictionary.flagLookup.get(append, scratch);
char appendFlags[] = Dictionary.decodeFlags(scratch);
circumfix = Dictionary.hasFlag(appendFlags, (char)dictionary.circumfix);
}
if (crossProduct) {
if (recursionDepth == 0) {
if (prefix) {
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, dictionary.complexPrefixes && dictionary.twoStageAffix, true, true, circumfix, caseVariant));
} else if (dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
}
} else if (recursionDepth == 1) {
if (prefix && dictionary.complexPrefixes) {
stems.addAll(stem(strippedWord, length, affix, flag, flag, ++recursionDepth, false, true, true, circumfix, caseVariant));
} else if (prefix == false && dictionary.complexPrefixes == false && dictionary.twoStageAffix) {
stems.addAll(stem(strippedWord, length, affix, flag, prefixFlag, ++recursionDepth, false, true, false, circumfix, caseVariant));
}
}
}
return stems;
}
private boolean hasCrossCheckedFlag(char flag, char[] flags, boolean matchEmpty) {
return (flags.length == 0 && matchEmpty) || Arrays.binarySearch(flags, flag) >= 0;
}
}