/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.ngram;
import java.io.IOException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.CharacterUtils;
import org.apache.lucene.util.AttributeFactory;
/*
* Tokenizes the input into n-grams of the given size(s).
* Unlike NGramTokenFilter, this class sets offsets so that the characters between
* startOffset and endOffset in the original stream are the same as the term chars.
* For example, "abcde" would be tokenized as follows (minGram=2, maxGram=3):
*
*   Term               | ab    | abc   | bc    | bcd   | cd    | cde   | de
*   Position increment | 1     | 1     | 1     | 1     | 1     | 1     | 1
*   Position length    | 1     | 1     | 1     | 1     | 1     | 1     | 1
*   Offsets            | [0,2[ | [0,3[ | [1,3[ | [1,4[ | [2,4[ | [2,5[ | [3,5[
*
* This tokenizer changed a lot in Lucene 4.4 in order to:
* - tokenize in a streaming fashion to support streams which are larger
*   than 1024 chars (limit of the previous version),
* - count grams based on unicode code points instead of java chars (and
*   never split in the middle of surrogate pairs),
* - give the ability to pre-tokenize the stream before computing n-grams.
* Additionally, this class doesn't trim trailing whitespace and emits tokens
* in a different order: tokens are now emitted by increasing start offsets,
* whereas they used to be emitted by increasing lengths (which prevented
* support for large input streams).
*/
/**
* Tokenizes the input into n-grams of the given size(s).
* <p>On the contrary to {@link NGramTokenFilter}, this class sets offsets so
* that characters between startOffset and endOffset in the original stream are
* the same as the term chars.
* <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
* <table summary="ngram tokens example">
* <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
* <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
* <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
* <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
* </table>
* <a name="version"></a>
* <p>This tokenizer changed a lot in Lucene 4.4 in order to:<ul>
* <li>tokenize in a streaming fashion to support streams which are larger
* than 1024 chars (limit of the previous version),
* <li>count grams based on unicode code points instead of java chars (and
* never split in the middle of surrogate pairs),
* <li>give the ability to {@link #isTokenChar(int) pre-tokenize} the stream
* before computing n-grams.</ul>
* <p>Additionally, this class doesn't trim trailing whitespaces and emits
* tokens in a different order, tokens are now emitted by increasing start
* offsets while they used to be emitted by increasing lengths (which prevented
* from supporting large input streams).
*/
// non-final to allow for overriding isTokenChar, but all other methods should be final
public class NGramTokenizer extends Tokenizer {
public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
private CharacterUtils.CharacterBuffer charBuffer;
private int[] buffer; // like charBuffer, but converted to code points
private int bufferStart, bufferEnd; // remaining slice in buffer
private int offset;
private int gramSize;
private int minGram, maxGram;
private boolean exhausted;
private int lastCheckedChar; // last offset in the buffer that we checked
private int lastNonTokenChar; // last offset that we found to not be a token char
private boolean edgesOnly; // leading edges n-grams only
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
NGramTokenizer(int minGram, int maxGram, boolean edgesOnly) {
init(minGram, maxGram, edgesOnly);
}
Creates NGramTokenizer with given min and max n-grams.
Params: - minGram – the smallest n-gram to generate
- maxGram – the largest n-gram to generate
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(int minGram, int maxGram) {
this(minGram, maxGram, false);
}
NGramTokenizer(AttributeFactory factory, int minGram, int maxGram, boolean edgesOnly) {
super(factory);
init(minGram, maxGram, edgesOnly);
}
Creates NGramTokenizer with given min and max n-grams.
Params: - factory –
AttributeFactory
to use - minGram – the smallest n-gram to generate
- maxGram – the largest n-gram to generate
/**
* Creates NGramTokenizer with given min and max n-grams.
* @param factory {@link org.apache.lucene.util.AttributeFactory} to use
* @param minGram the smallest n-gram to generate
* @param maxGram the largest n-gram to generate
*/
public NGramTokenizer(AttributeFactory factory, int minGram, int maxGram) {
this(factory, minGram, maxGram, false);
}
Creates NGramTokenizer with default min and max n-grams.
/**
* Creates NGramTokenizer with default min and max n-grams.
*/
public NGramTokenizer() {
this(DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
}
private void init(int minGram, int maxGram, boolean edgesOnly) {
if (minGram < 1) {
throw new IllegalArgumentException("minGram must be greater than zero");
}
if (minGram > maxGram) {
throw new IllegalArgumentException("minGram must not be greater than maxGram");
}
this.minGram = minGram;
this.maxGram = maxGram;
this.edgesOnly = edgesOnly;
charBuffer = CharacterUtils.newCharacterBuffer(2 * maxGram + 1024); // 2 * maxGram in case all code points require 2 chars and + 1024 for buffering to not keep polling the Reader
buffer = new int[charBuffer.getBuffer().length];
// Make the term att large enough
termAtt.resizeBuffer(2 * maxGram);
}
@Override
public final boolean incrementToken() throws IOException {
clearAttributes();
// termination of this loop is guaranteed by the fact that every iteration
// either advances the buffer (calls consumes()) or increases gramSize
while (true) {
// compact
if (bufferStart >= bufferEnd - maxGram - 1 && !exhausted) {
System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
bufferEnd -= bufferStart;
lastCheckedChar -= bufferStart;
lastNonTokenChar -= bufferStart;
bufferStart = 0;
// fill in remaining space
exhausted = !CharacterUtils.fill(charBuffer, input, buffer.length - bufferEnd);
// convert to code points
bufferEnd += CharacterUtils.toCodePoints(charBuffer.getBuffer(), 0, charBuffer.getLength(), buffer, bufferEnd);
}
// should we go to the next offset?
if (gramSize > maxGram || (bufferStart + gramSize) > bufferEnd) {
if (bufferStart + 1 + minGram > bufferEnd) {
assert exhausted;
return false;
}
consume();
gramSize = minGram;
}
updateLastNonTokenChar();
// retry if the token to be emitted was going to not only contain token chars
final boolean termContainsNonTokenChar = lastNonTokenChar >= bufferStart && lastNonTokenChar < (bufferStart + gramSize);
final boolean isEdgeAndPreviousCharIsTokenChar = edgesOnly && lastNonTokenChar != bufferStart - 1;
if (termContainsNonTokenChar || isEdgeAndPreviousCharIsTokenChar) {
consume();
gramSize = minGram;
continue;
}
final int length = CharacterUtils.toChars(buffer, bufferStart, gramSize, termAtt.buffer(), 0);
termAtt.setLength(length);
posIncAtt.setPositionIncrement(1);
posLenAtt.setPositionLength(1);
offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + length));
++gramSize;
return true;
}
}
private void updateLastNonTokenChar() {
final int termEnd = bufferStart + gramSize - 1;
if (termEnd > lastCheckedChar) {
for (int i = termEnd; i > lastCheckedChar; --i) {
if (!isTokenChar(buffer[i])) {
lastNonTokenChar = i;
break;
}
}
lastCheckedChar = termEnd;
}
}
Consume one code point. /** Consume one code point. */
private void consume() {
offset += Character.charCount(buffer[bufferStart++]);
}
Only collect characters which satisfy this condition. /** Only collect characters which satisfy this condition. */
protected boolean isTokenChar(int chr) {
return true;
}
@Override
public final void end() throws IOException {
super.end();
assert bufferStart <= bufferEnd;
int endOffset = offset;
for (int i = bufferStart; i < bufferEnd; ++i) {
endOffset += Character.charCount(buffer[i]);
}
endOffset = correctOffset(endOffset);
// set final offset
offsetAtt.setOffset(endOffset, endOffset);
}
@Override
public final void reset() throws IOException {
super.reset();
bufferStart = bufferEnd = buffer.length;
lastNonTokenChar = lastCheckedChar = bufferStart - 1;
offset = 0;
gramSize = minGram;
exhausted = false;
charBuffer.reset();
}
}