 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *     http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * See the License for the specific language governing permissions and
 * limitations under the License.
package org.apache.lucene.analysis.cjk;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;

Forms bigrams of CJK terms that are generated from StandardTokenizer or ICUTokenizer.

CJK types are set by these tokenizers, but you can also use CJKBigramFilter(TokenStream, int) to explicitly control which of the CJK scripts are turned into bigrams.

By default, when a CJK character has no adjacent characters to form a bigram, it is output in unigram form. If you want to always output both unigrams and bigrams, set the outputUnigrams flag in CJKBigramFilter(TokenStream, int, boolean). This can be used for a combined unigram+bigram approach.

Unlike ICUTokenizer, StandardTokenizer does not split at script boundaries. Korean Hangul characters are treated the same as many other scripts' letters, and as a result, StandardTokenizer can produce tokens that mix Hangul and non-Hangul characters, e.g. "한국abc". Such mixed-script tokens are typed as <ALPHANUM> rather than <HANGUL>, and as a result, will not be converted to bigrams by CJKBigramFilter. In all cases, all non-CJK input is passed through unmodified.

/**
 * Forms bigrams of CJK terms that are generated from StandardTokenizer
 * or ICUTokenizer.
 * <p>
 * CJK types are set by these tokenizers, but you can also use
 * {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
 * of the CJK scripts are turned into bigrams.
 * <p>
 * By default, when a CJK character has no adjacent characters to form
 * a bigram, it is output in unigram form. If you want to always output
 * both unigrams and bigrams, set the <code>outputUnigrams</code>
 * flag in {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)}.
 * This can be used for a combined unigram+bigram approach.
 * <p>
 * Unlike ICUTokenizer, StandardTokenizer does not split at script boundaries.
 * Korean Hangul characters are treated the same as many other scripts'
 * letters, and as a result, StandardTokenizer can produce tokens that mix
 * Hangul and non-Hangul characters, e.g. "한국abc". Such mixed-script tokens
 * are typed as <code>&lt;ALPHANUM&gt;</code> rather than
 * <code>&lt;HANGUL&gt;</code>, and as a result, will not be converted to
 * bigrams by CJKBigramFilter.
 * <p>
 * In all cases, all non-CJK input is passed through unmodified.
 */
public final class CJKBigramFilter extends TokenFilter { // configuration
bigram flag for Han Ideographs
/** bigram flag for Han Ideographs */
public static final int HAN = 1 << 0; // bit 0 of the flags mask
bigram flag for Hiragana
/** bigram flag for Hiragana */
public static final int HIRAGANA = 1 << 1; // bit 1 of the flags mask
bigram flag for Katakana
/** bigram flag for Katakana */
public static final int KATAKANA = 1 << 2; // bit 2 of the flags mask
bigram flag for Hangul
/** bigram flag for Hangul */
public static final int HANGUL = 1 << 3; // bit 3 of the flags mask
when we emit a bigram, it's then marked as this type
/** when we emit a bigram, it's then marked as this type */
public static final String DOUBLE_TYPE = "<DOUBLE>"; // set on the type attribute by flushBigram()
when we emit a unigram, it's then marked as this type
/** when we emit a unigram, it's then marked as this type */
public static final String SINGLE_TYPE = "<SINGLE>";

  // Token type names looked up from StandardTokenizer, one per CJK script.
  private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
  private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
  private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
  private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];

  // Sentinel stored in a do* field when the corresponding script is to be ignored.
  private static final Object NO = new Object();

  // Each of these holds either the script's type name (bigram it) or NO (pass it through).
  private final Object doHan;
  private final Object doHiragana;
  private final Object doKatakana;
  private final Object doHangul;

  // When true, unigram tokens are always output.
  private final boolean outputUnigrams;
  // Which n-gram size to emit next: false = unigram, true = bigram.
  private boolean ngramState;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final PositionLengthAttribute posLengthAtt = addAttribute(PositionLengthAttribute.class);

  // Parallel arrays: buffered code points and the start/end offset recorded for each one.
  int[] buffer = new int[8];
  int[] startOffset = new int[8];
  int[] endOffset = new int[8];
  // Number of valid entries in the parallel arrays.
  int bufferLen;
  // Current read position within the parallel arrays.
  int index;

  // End offset of the last buffered token; compared with the next token's start
  // offset to decide whether bigrams may be formed across the two tokens.
  int lastEndOffset;

  // Becomes true once the wrapped stream has reported that it has no more tokens.
  private boolean exhausted;

  /**
   * Creates a filter that bigrams every supported script. Equivalent to
   * {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
   *       CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}.
   */
  public CJKBigramFilter(TokenStream in) {
    this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
  }

  /**
   * Creates a filter for the given scripts without forcing unigram output. Equivalent to
   * {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int, boolean)
   *       CJKBigramFilter(in, flags, false)}.
   */
  public CJKBigramFilter(TokenStream in, int flags) {
    this(in, flags, false);
  }
Create a new CJKBigramFilter, specifying which writing systems should be bigrammed, and whether or not unigrams should also be output.
  • flags – OR'ed set from HAN, HIRAGANA, KATAKANA, HANGUL
  • outputUnigrams – true if unigrams for the selected writing systems should also be output. when this is false, this is only done when there are no adjacent characters to form a bigram.
/** * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed, * and whether or not unigrams should also be output. * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA}, * {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL} * @param outputUnigrams true if unigrams for the selected writing systems should also be output. * when this is false, this is only done when there are no adjacent characters to form * a bigram. */
public CJKBigramFilter(TokenStream in, int flags, boolean outputUnigrams) { super(in); doHan = (flags & HAN) == 0 ? NO : HAN_TYPE; doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE; doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE; doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE; this.outputUnigrams = outputUnigrams; } /* * much of this complexity revolves around handling the special case of a * "lone cjk character" where cjktokenizer would output a unigram. this * is also the only time we ever have to captureState. */ @Override public boolean incrementToken() throws IOException { while (true) { if (hasBufferedBigram()) { // case 1: we have multiple remaining codepoints buffered, // so we can emit a bigram here. if (outputUnigrams) { // when also outputting unigrams, we output the unigram first, // then rewind back to revisit the bigram. // so an input of ABC is A + (rewind)AB + B + (rewind)BC + C // the logic in hasBufferedUnigram ensures we output the C, // even though it did actually have adjacent CJK characters. if (ngramState) { flushBigram(); } else { flushUnigram(); index--; } ngramState = !ngramState; } else { flushBigram(); } return true; } else if (doNext()) { // case 2: look at the token type. should we form any n-grams? String type = typeAtt.type(); if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) { // acceptable CJK type: we form n-grams from these. // as long as the offsets are aligned, we just add these to our current buffer. // otherwise, we clear the buffer and start over. if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue if (hasBufferedUnigram()) { // we have a buffered unigram, and we peeked ahead to see if we could form // a bigram, but we can't, because the offsets are unaligned. capture the state // of this peeked data to be revisited next time thru the loop, and dump our unigram. 
loneState = captureState(); flushUnigram(); return true; } index = 0; bufferLen = 0; } refill(); } else { // not a CJK type: we just return these as-is. if (hasBufferedUnigram()) { // we have a buffered unigram, and we peeked ahead to see if we could form // a bigram, but we can't, because it's not a CJK type. capture the state // of this peeked data to be revisited next time thru the loop, and dump our unigram. loneState = captureState(); flushUnigram(); return true; } return true; } } else { // case 3: we have only zero or 1 codepoints buffered, // so not enough to form a bigram. But, we also have no // more input. So if we have a buffered codepoint, emit // a unigram, otherwise, it's end of stream. if (hasBufferedUnigram()) { flushUnigram(); // flush our remaining unigram return true; } return false; } } } private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams
Looks at the next input token, returning false if none is available.
/**
 * Looks at the next input token, returning false if none is available.
 */
private boolean doNext() throws IOException {
    // A previously captured token is replayed before the wrapped stream is consulted.
    if (loneState != null) {
      restoreState(loneState);
      loneState = null;
      return true;
    }
    // Once the wrapped stream has returned false it is not asked again.
    if (exhausted) {
      return false;
    }
    if (input.incrementToken()) {
      return true;
    }
    exhausted = true;
    return false;
  }
refills buffers with new data from the current token.
/** * refills buffers with new data from the current token. */
private void refill() { // compact buffers to keep them smallish if they become large // just a safety check, but technically we only need the last codepoint if (bufferLen > 64) { int last = bufferLen - 1; buffer[0] = buffer[last]; startOffset[0] = startOffset[last]; endOffset[0] = endOffset[last]; bufferLen = 1; index -= last; } char termBuffer[] = termAtt.buffer(); int len = termAtt.length(); int start = offsetAtt.startOffset(); int end = offsetAtt.endOffset(); int newSize = bufferLen + len; buffer = ArrayUtil.grow(buffer, newSize); startOffset = ArrayUtil.grow(startOffset, newSize); endOffset = ArrayUtil.grow(endOffset, newSize); lastEndOffset = end; if (end - start != len) { // crazy offsets (modified by synonym or charfilter): just preserve for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) { cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len); startOffset[bufferLen] = start; endOffset[bufferLen] = end; bufferLen++; } } else { // normal offsets for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) { cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len); cpLen = Character.charCount(cp); startOffset[bufferLen] = start; start = endOffset[bufferLen] = start + cpLen; bufferLen++; } } }
Flushes a bigram token to output from our buffer This is the normal case, e.g. ABC -> AB BC
/** * Flushes a bigram token to output from our buffer * This is the normal case, e.g. ABC -&gt; AB BC */
private void flushBigram() { clearAttributes(); char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries) int len1 = Character.toChars(buffer[index], termBuffer, 0); int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1); termAtt.setLength(len2); offsetAtt.setOffset(startOffset[index], endOffset[index+1]); typeAtt.setType(DOUBLE_TYPE); // when outputting unigrams, all bigrams are synonyms that span two unigrams if (outputUnigrams) { posIncAtt.setPositionIncrement(0); posLengthAtt.setPositionLength(2); } index++; }
Flushes a unigram token to output from our buffer. This happens when we encounter isolated CJK characters, either the whole CJK string is a single character, or we encounter a CJK character surrounded by space, punctuation, english, etc, but not beside any other CJK.
/** * Flushes a unigram token to output from our buffer. * This happens when we encounter isolated CJK characters, either the whole * CJK string is a single character, or we encounter a CJK character surrounded * by space, punctuation, english, etc, but not beside any other CJK. */
private void flushUnigram() { clearAttributes(); char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates) int len = Character.toChars(buffer[index], termBuffer, 0); termAtt.setLength(len); offsetAtt.setOffset(startOffset[index], endOffset[index]); typeAtt.setType(SINGLE_TYPE); index++; }
True if we have multiple codepoints sitting in our buffer
/** * True if we have multiple codepoints sitting in our buffer */
private boolean hasBufferedBigram() {
    // Code points from the read position to the end of the valid buffer.
    final int remaining = bufferLen - index;
    return remaining > 1;
  }
True if we have a single codepoint sitting in our buffer, where its future (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen inputs.
/** * True if we have a single codepoint sitting in our buffer, where its future * (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen * inputs. */
private boolean hasBufferedUnigram() { if (outputUnigrams) { // when outputting unigrams always return bufferLen - index == 1; } else { // otherwise it's only when we have a lone CJK character return bufferLen == 1 && index == 0; } } @Override public void reset() throws IOException { super.reset(); bufferLen = 0; index = 0; lastEndOffset = 0; loneState = null; exhausted = false; ngramState = false; } }