/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ngram;


import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Tokenizes the given token into n-grams of given size(s).
 * <p>
 * This {@link TokenFilter} creates n-grams from the beginning edge of an input token.
 * <p>
 * As of Lucene 4.4, this filter correctly handles supplementary characters.
 */
public final class EdgeNGramTokenFilter extends TokenFilter {
  public static final boolean DEFAULT_PRESERVE_ORIGINAL = false;

  private final int minGram;
  private final int maxGram;
  private final boolean preserveOriginal;

  private char[] curTermBuffer;
  private int curTermLength;
  private int curTermCodePointCount;
  private int curGramSize;
  private int curPosIncr;
  private State state;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  /**
   * Creates an EdgeNGramTokenFilter that, for a given input term, produces all
   * edge n-grams with lengths &gt;= minGram and &lt;= maxGram. Will
   * optionally preserve the original term when its length is outside of the
   * defined range.
   *
   * @param input {@link TokenStream} holding the input to be tokenized
   * @param minGram the minimum length of the generated n-grams
   * @param maxGram the maximum length of the generated n-grams
   * @param preserveOriginal Whether or not to keep the original term when it
   *     is outside the min/max size range.
   */
  public EdgeNGramTokenFilter(
      TokenStream input, int minGram, int maxGram, boolean preserveOriginal) {
    super(input);
    if (minGram < 1) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram) {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.preserveOriginal = preserveOriginal;
  }
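
  // For example: with minGram=1, maxGram=3 and preserveOriginal=true, the input
  // term "apache" produces "a", "ap", "apa" and then the original "apache" at the
  // same position; with preserveOriginal=false the original term is dropped,
  // since its length exceeds maxGram.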

  /**
   * Creates an EdgeNGramTokenFilter that produces edge n-grams of the given
   * size.
   *
   * @param input {@link TokenStream} holding the input to be tokenized
   * @param gramSize the n-gram size to generate.
   */
  public EdgeNGramTokenFilter(TokenStream input, int gramSize) {
    this(input, gramSize, gramSize, DEFAULT_PRESERVE_ORIGINAL);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    while (true) {
      if (curTermBuffer == null) {
        if (!input.incrementToken()) {
          return false;
        }
        state = captureState();

        curTermLength = termAtt.length();
        curTermCodePointCount = Character.codePointCount(termAtt, 0, curTermLength);
        curPosIncr += posIncrAtt.getPositionIncrement();

        if (preserveOriginal && curTermCodePointCount < minGram) {
          // Token is shorter than minGram, but we'd still like to keep it.
          posIncrAtt.setPositionIncrement(curPosIncr);
          curPosIncr = 0;
          return true;
        }

        curTermBuffer = termAtt.buffer().clone();
        curGramSize = minGram;
      }

      if (curGramSize <= curTermCodePointCount) {
        if (curGramSize <= maxGram) { // curGramSize is between minGram and maxGram
          restoreState(state);
          // first ngram gets increment, others don't
          posIncrAtt.setPositionIncrement(curPosIncr);
          curPosIncr = 0;

          final int charLength = Character.offsetByCodePoints(curTermBuffer, 0, curTermLength, 0, curGramSize);
          termAtt.copyBuffer(curTermBuffer, 0, charLength);
          curGramSize++;
          return true;
        } else if (preserveOriginal) {
          // Token is longer than maxGram, but we'd still like to keep it.
          restoreState(state);
          posIncrAtt.setPositionIncrement(0);
          termAtt.copyBuffer(curTermBuffer, 0, curTermLength);
          curTermBuffer = null;
          return true;
        }
      }
      // Done with this input token, get next token on the next iteration.
      curTermBuffer = null;
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    curTermBuffer = null;
    curPosIncr = 0;
  }

  @Override
  public void end() throws IOException {
    super.end();
    posIncrAtt.setPositionIncrement(curPosIncr);
  }
}
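
// A minimal usage sketch, assuming WhitespaceTokenizer from the analyzers-common
// module is on the classpath: feeding the single term "lucene" through
// new EdgeNGramTokenFilter(source, 2, 4, false) emits "lu", "luc" and "luce",
// all at the position of the original term.
//
//   Tokenizer source = new WhitespaceTokenizer();
//   source.setReader(new java.io.StringReader("lucene"));
//   try (TokenStream stream = new EdgeNGramTokenFilter(source, 2, 4, false)) {
//     CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
//     stream.reset();
//     while (stream.incrementToken()) {
//       System.out.println(term.toString());
//     }
//     stream.end();
//   }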