/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.miscellaneous;


import java.io.IOException;
import java.util.ArrayList;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.FST.BytesReader;

Provides the ability to override any KeywordAttribute aware stemmer with custom dictionary-based stemming.
/** * Provides the ability to override any {@link KeywordAttribute} aware stemmer * with custom dictionary-based stemming. */
public final class StemmerOverrideFilter extends TokenFilter { private final StemmerOverrideMap stemmerOverrideMap; private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class); private final BytesReader fstReader; private final Arc<BytesRef> scratchArc = new FST.Arc<>(); private char[] spare = new char[0];
Create a new StemmerOverrideFilter, performing dictionary-based stemming with the provided dictionary.

Any dictionary-stemmed terms will be marked with KeywordAttribute so that they will not be stemmed with stemmers down the chain.

/** * Create a new StemmerOverrideFilter, performing dictionary-based stemming * with the provided <code>dictionary</code>. * <p> * Any dictionary-stemmed terms will be marked with {@link KeywordAttribute} * so that they will not be stemmed with stemmers down the chain. * </p> */
public StemmerOverrideFilter(final TokenStream input, final StemmerOverrideMap stemmerOverrideMap) { super(input); this.stemmerOverrideMap = stemmerOverrideMap; fstReader = stemmerOverrideMap.getBytesReader(); } @Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { if (fstReader == null) { // No overrides return true; } if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms final BytesRef stem = stemmerOverrideMap.get(termAtt.buffer(), termAtt.length(), scratchArc, fstReader); if (stem != null) { spare = ArrayUtil.grow(termAtt.buffer(), stem.length); final int length = UnicodeUtil.UTF8toUTF16(stem, spare); if (spare != termAtt.buffer()) { termAtt.copyBuffer(spare, 0, length); } else { termAtt.setLength(length); } keywordAtt.setKeyword(true); } } return true; } else { return false; } }
A read-only 4-byte FST backed map that allows fast case-insensitive key value lookups for StemmerOverrideFilter
/** * A read-only 4-byte FST backed map that allows fast case-insensitive key * value lookups for {@link StemmerOverrideFilter} */
// TODO maybe we can generalize this and reuse this map somehow? public final static class StemmerOverrideMap { private final FST<BytesRef> fst; private final boolean ignoreCase;
Creates a new StemmerOverrideMap
Params:
  • fst – the fst to lookup the overrides
  • ignoreCase – if the keys case should be ingored
/** * Creates a new {@link StemmerOverrideMap} * @param fst the fst to lookup the overrides * @param ignoreCase if the keys case should be ingored */
public StemmerOverrideMap(FST<BytesRef> fst, boolean ignoreCase) { this.fst = fst; this.ignoreCase = ignoreCase; } /** * Returns a {@link BytesReader} to pass to the {@link #get(char[], int, FST.Arc, FST.BytesReader)} method. */ public BytesReader getBytesReader() { if (fst == null) { return null; } else { return fst.getBytesReader(); } }
Returns the value mapped to the given key or null if the key is not in the FST dictionary.
/** * Returns the value mapped to the given key or <code>null</code> if the key is not in the FST dictionary. */
public BytesRef get(char[] buffer, int bufferLen, Arc<BytesRef> scratchArc, BytesReader fstReader) throws IOException { BytesRef pendingOutput = fst.outputs.getNoOutput(); BytesRef matchOutput = null; int bufUpto = 0; fst.getFirstArc(scratchArc); while (bufUpto < bufferLen) { final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen); if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc, fstReader) == null) { return null; } pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output); bufUpto += Character.charCount(codePoint); } if (scratchArc.isFinal()) { matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput); } return matchOutput; } }
This builder builds an FST for the StemmerOverrideFilter
/** * This builder builds an {@link FST} for the {@link StemmerOverrideFilter} */
public static class Builder { private final BytesRefHash hash = new BytesRefHash(); private final BytesRefBuilder spare = new BytesRefBuilder(); private final ArrayList<CharSequence> outputValues = new ArrayList<>(); private final boolean ignoreCase; private final CharsRefBuilder charsSpare = new CharsRefBuilder();
Creates a new Builder with ignoreCase set to false
/** * Creates a new {@link Builder} with ignoreCase set to <code>false</code> */
public Builder() { this(false); }
Creates a new Builder
Params:
  • ignoreCase – if the input case should be ignored.
/** * Creates a new {@link Builder} * @param ignoreCase if the input case should be ignored. */
public Builder(boolean ignoreCase) { this.ignoreCase = ignoreCase; }
Adds an input string and its stemmer override output to this builder.
Params:
  • input – the input char sequence
  • output – the stemmer override output char sequence
Returns:false iff the input has already been added to this builder otherwise true.
/** * Adds an input string and its stemmer override output to this builder. * * @param input the input char sequence * @param output the stemmer override output char sequence * @return <code>false</code> iff the input has already been added to this builder otherwise <code>true</code>. */
public boolean add(CharSequence input, CharSequence output) { final int length = input.length(); if (ignoreCase) { // convert on the fly to lowercase charsSpare.grow(length); final char[] buffer = charsSpare.chars(); for (int i = 0; i < length; ) { i += Character.toChars( Character.toLowerCase( Character.codePointAt(input, i)), buffer, i); } spare.copyChars(buffer, 0, length); } else { spare.copyChars(input, 0, length); } if (hash.add(spare.get()) >= 0) { outputValues.add(output); return true; } return false; }
Returns an StemmerOverrideMap to be used with the StemmerOverrideFilter
Throws:
Returns:an StemmerOverrideMap to be used with the StemmerOverrideFilter
/** * Returns an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @return an {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter} * @throws IOException if an {@link IOException} occurs; */
public StemmerOverrideMap build() throws IOException { ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>( FST.INPUT_TYPE.BYTE4, outputs); final int[] sort = hash.sort(); IntsRefBuilder intsSpare = new IntsRefBuilder(); final int size = hash.size(); BytesRef spare = new BytesRef(); for (int i = 0; i < size; i++) { int id = sort[i]; BytesRef bytesRef = hash.get(id, spare); intsSpare.copyUTF8Bytes(bytesRef); builder.add(intsSpare.get(), new BytesRef(outputValues.get(id))); } return new StemmerOverrideMap(builder.finish(), ignoreCase); } } }