org.apache.lucene/lucene-analyzers-common/8.7.0 : org/apache/lucene/analysis/ngram/NGramTokenizer.java

NGramTokenizer
https://lucene.apache.org/lucene-parent/lucene-analyzers-common: Additional Analyzers (The Apache Software Foundation)
Apache License, Version 2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.ngram;


import java.io.IOException;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;

Tokenizes the input into n-grams of the given size(s).
On the contrary to NGramTokenFilter, this class sets offsets so that characters between startOffset and endOffset in the original stream are the same as the term chars. 
For example, "abcde" would be tokenized as (minGram=2, maxGram=3):

Term ab abc bc bcd cd cde de
Position increment 1 1 1 1 1 1 1
Position length 1 1 1 1 1 1 1
Offsets [0,2[ [0,3[ [1,3[ [1,4[ [2,4[ [2,5[ [3,5[


This tokenizer changed a lot in Lucene 4.4 in order to:

tokenize in a streaming fashion to support streams which are larger
than 1024 chars (limit of the previous version),
count grams based on unicode code points instead of java chars (and
never split in the middle of surrogate pairs),
give the ability to pre-tokenize the stream before computing n-grams.
Additionally, this class doesn't trim trailing whitespaces and emits
tokens in a different order, tokens are now emitted by increasing start
offsets while they used to be emitted by increasing lengths (which prevented
from supporting large input streams).
/**
 * Tokenizes the input into n-grams of the given size(s).
 * <p>On the contrary to {@link NGramTokenFilter}, this class sets offsets so
 * that characters between startOffset and endOffset in the original stream are
 * the same as the term chars.
 * <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
 * <table summary="ngram tokens example">
 * <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
 * <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
 * <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
 * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
 * </table>
 * <a name="version"></a>
 * <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
 * <li>tokenize in a streaming fashion to support streams which are larger
 * than 1024 chars (limit of the previous version),
 * <li>count grams based on unicode code points instead of java chars (and
 * never split in the middle of surrogate pairs),
 * <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
 * before computing n-grams.</ul>
 * <p>Additionally, this class doesn't trim trailing whitespaces and emits
 * tokens in a different order, tokens are now emitted by increasing start
 * offsets while they used to be emitted by increasing lengths (which prevented
 * from supporting large input streams).
 */
// non-final to allow for overriding isTokenChar, but all other methods should be final
public class NGramTokenizer extends Tokenizer {
  public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
  public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

  private CharacterUtils.CharacterBuffer charBuffer;
  private int[] buffer; // like charBuffer, but converted to code points
  private int bufferStart, bufferEnd; // remaining slice in buffer
  private int offset;
  private int gramSize;
  private int minGram, maxGram;
  private boolean exhausted;
  private int lastCheckedChar; // last offset in the buffer that we checked
  private int lastNonTokenChar; // last offset that we found to not be a token char
  private boolean edgesOnly; // leading edges n-grams only

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  NGramTokenizer(int minGram, int maxGram, boolean edgesOnly) {
    init(minGram, maxGram, edgesOnly);
  }

  Creates NGramTokenizer with given min and max n-grams.
Params: minGram – the smallest n-gram to generate
maxGram – the largest n-gram to generate/**
   * Creates NGramTokenizer with given min and max n-grams.
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public NGramTokenizer(int minGram, int maxGram) {
    this(minGram, maxGram, false);
  }

  NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
    super(factory);
    init(minGram, maxGram, edgesOnly);
  }

  Creates NGramTokenizer with given min and max n-grams.
Params: factory – AttributeFactory to use
minGram – the smallest n-gram to generate
maxGram – the largest n-gram to generate/**
   * Creates NGramTokenizer with given min and max n-grams.
   * @param factory {@link org.apache.lucene.util.AttributeFactory} to use
   * @param minGram the smallest n-gram to generate
   * @param maxGram the largest n-gram to generate
   */
  public NGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
    this(factory, minGram, maxGram, false);
  }

  Creates NGramTokenizer with default min and max n-grams.
/**
   * Creates NGramTokenizer with default min and max n-grams.
   */
  public NGramTokenizer() {
    this(DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
  }

  private void init(int minGram, int maxGram, boolean edgesOnly) {
    if (minGram < 1) {
      throw new IllegalArgumentException("minGram must be greater than zero");
    }
    if (minGram > maxGram) {
      throw new IllegalArgumentException("minGram must not be greater than maxGram");
    }
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.edgesOnly = edgesOnly;
    charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
    buffer = new int[charBuffer.getBuffer().length];
    // Make the term att large enough
    termAtt.resizeBuffer(2 * maxGram);
  }

  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    // termination of this loop is guaranteed by the fact that every iteration
    // either advances the buffer (calls consumes()) or increases gramSize
    while (true) {
      // compact
      if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
        System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
        bufferEnd -= bufferStart;
        lastCheckedChar -= bufferStart;
        lastNonTokenChar -= bufferStart;
        bufferStart = 0;

        // fill in remaining space
        exhausted = !CharacterUtils.fill(charBuffer, input, buffer.length - bufferEnd);
        // convert to code points
        bufferEnd += CharacterUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
      }

      // should we go to the next offset?
      if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
        if (bufferStart + 1 + minGram > bufferEnd) {
          assert exhausted;
          return false;
        }
        consume();
        gramSize = minGram;
      }

      updateLastNonTokenChar();

      // retry if the token to be emitted was going to not only contain token chars
      final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
      final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
      if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
        consume();
        gramSize = minGram;
        continue;
      }

      final int length = CharacterUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
      termAtt.setLength(length);
      posIncAtt.setPositionIncrement(1);
      posLenAtt.setPositionLength(1);
      offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
      ++gramSize;
      return true;
    }
  }

  private void updateLastNonTokenChar() {
    final int termEnd = bufferStart + gramSize - 1;
    if (termEnd > lastCheckedChar) {
      for (int i = termEnd; i > lastCheckedChar; --i) {
        if (!isTokenChar(buffer[i])) {
          lastNonTokenChar = i;
          break;
        }
      }
      lastCheckedChar = termEnd;
    }
  }

  Consume one code point. /** Consume one code point. */
  private void consume() {
    offset += Character.charCount(buffer[bufferStart++]);
  }

  Only collect characters which satisfy this condition. /** Only collect characters which satisfy this condition. */
  protected boolean isTokenChar(int chr) {
    return true;
  }

  @Override
  public final void end() throws IOException {
    super.end();
    assert bufferStart <= bufferEnd;
    int endOffset = offset;
    for (int i = bufferStart; i < bufferEnd; ++i) {
      endOffset += Character.charCount(buffer[i]);
    }
    endOffset = correctOffset(endOffset);
    // set final offset
    offsetAtt.setOffset(endOffset, endOffset);
  }

  @Override
  public final void reset() throws IOException {
    super.reset();
    bufferStart = bufferEnd = buffer.length;
    lastNonTokenChar = lastCheckedChar = bufferStart - 1;
    offset = 0;
    gramSize = minGram;
    exhausted = false;
    charBuffer.reset();
  }
}
Term	ab	abc	bc	bcd	cd	cde	de
Position increment	1	1	1	1	1	1	1
Position length	1	1	1	1	1	1	1
Offsets	[0,2[	[0,3[	[1,3[	[1,4[	[2,4[	[2,5[	[3,5[
/

org.apache.lucene/ lucene-analyzers-common/ 8.7.0/ org/apache/lucene/analysis/ngram/NGramTokenizer.java