/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.miscellaneous;

import java.util.Locale;

/**
 * A BreakIterator-like API for iterating over subwords in text, according to
 * WordDelimiterGraphFilter rules.
 *
 * <p>Typical use: call {@link #setText(char[], int)}, then call {@link #next()} repeatedly until it
 * returns {@link #DONE}. After each successful call the current subword is the half-open range
 * {@code [current, end)} of {@code text}.
 *
 * @lucene.internal
 */
public final class WordDelimiterIterator {

  // Character type bits stored in the character type table.
  static final int LOWER = 0x01;
  static final int UPPER = 0x02;
  static final int DIGIT = 0x04;
  static final int SUBWORD_DELIM = 0x08;

  // combinations: for testing, not for setting bits
  public static final int ALPHA = 0x03;
  public static final int ALPHANUM = 0x07;

  /** Indicates the end of iteration */
  public static final int DONE = -1;

  /** Default character type table, covering the code points 0..255. */
  public static final byte[] DEFAULT_WORD_DELIM_TABLE;

  /** Text being iterated over; {@code null} until {@link #setText(char[], int)} is called. */
  char[] text;

  /** Number of valid characters in {@link #text}. */
  int length;

  /** start position of text, excluding leading delimiters */
  int startBounds;

  /** end position of text, excluding trailing delimiters */
  int endBounds;

  /** Beginning of subword */
  int current;

  /** End of subword */
  int end;

  /** does this string end with a possessive such as 's */
  private boolean hasFinalPossessive = false;

  /**
   * If false, causes case changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens). (Defaults to true)
   */
  final boolean splitOnCaseChange;

  /**
   * If false, causes numeric changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens). (Defaults to true)
   */
  final boolean splitOnNumerics;

  /**
   * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
   *
   * <p>"O'Neil's" =&gt; "O", "Neil"
   */
  final boolean stemEnglishPossessive;

  /** Table mapping a character (used as index) to its type bits. */
  private final byte[] charTypeTable;

  /** if true, need to skip over a possessive found in the last call to next() */
  private boolean skipPossessive = false;

  // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation
  // of subwords will be done if separated by these chars?) "," would be an obvious candidate...
  static {
    byte[] tab = new byte[256];
    for (int i = 0; i < 256; i++) {
      byte code = 0;
      if (Character.isLowerCase(i)) {
        code |= LOWER;
      } else if (Character.isUpperCase(i)) {
        code |= UPPER;
      } else if (Character.isDigit(i)) {
        code |= DIGIT;
      }
      // Anything that is not a letter or a digit acts as a subword delimiter.
      if (code == 0) {
        code = SUBWORD_DELIM;
      }
      tab[i] = code;
    }
    DEFAULT_WORD_DELIM_TABLE = tab;
  }

  /**
   * Create a new WordDelimiterIterator operating with the supplied rules.
   *
   * @param charTypeTable table containing character types
   * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains
   *     two parts regardless)
   * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
   * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword:
   *     "O'Neil's" =&gt; "O", "Neil"
   */
  WordDelimiterIterator(
      byte[] charTypeTable,
      boolean splitOnCaseChange,
      boolean splitOnNumerics,
      boolean stemEnglishPossessive) {
    this.charTypeTable = charTypeTable;
    this.splitOnCaseChange = splitOnCaseChange;
    this.splitOnNumerics = splitOnNumerics;
    this.stemEnglishPossessive = stemEnglishPossessive;
  }

  /**
   * Describes the current subword for debugging.
   *
   * @return {@code "UNSET"} if no text has been set yet, {@code "DONE"} if iteration has finished,
   *     otherwise the subword text followed by its bounds and its type in hexadecimal
   */
  @Override
  public String toString() {
    // Before setText() there is no backing array; building a String from it would throw.
    if (text == null) {
      return "UNSET";
    }
    if (end == DONE) {
      return "DONE";
    }
    return new String(text, current, end - current)
        + " [" + current + "-" + end + "]"
        + " type=" + String.format(Locale.ROOT, "%#02x", type());
  }

  /**
   * Advance to the next subword in the string.
   *
   * @return index of the next subword, or {@link #DONE} if all subwords have been returned
   */
  int next() {
    current = end;
    if (current == DONE) {
      return DONE;
    }

    // Step over the "'s" detected at the end of the previous subword.
    if (skipPossessive) {
      current += 2;
      skipPossessive = false;
    }

    int lastType = 0;

    // Skip leading delimiters; lastType ends up holding the type of the first subword character.
    while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
      current++;
    }

    if (current >= endBounds) {
      return end = DONE;
    }

    // Extend the subword until a type transition that counts as a break.
    for (end = current + 1; end < endBounds; end++) {
      int type = charType(text[end]);
      if (isBreak(lastType, type)) {
        break;
      }
      lastType = type;
    }

    // The guard ensures end + 2 <= endBounds, i.e. there is room for a trailing "'s".
    if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
      skipPossessive = true;
    }

    return end;
  }

  /**
   * Return the type of the current subword.
   * This currently uses the type of the first character in the subword.
   *
   * @return type of the current word, or 0 if iteration is {@link #DONE}
   */
  int type() {
    if (end == DONE) {
      return 0;
    }

    int type = charType(text[current]);
    switch (type) {
      // return ALPHA word type for both lower and upper
      case LOWER:
      case UPPER:
        return ALPHA;
      default:
        return type;
    }
  }

  /**
   * Reset the text to a new value, and reset all state
   *
   * @param text New text
   * @param length length of the text
   */
  void setText(char[] text, int length) {
    this.text = text;
    this.length = this.endBounds = length;
    current = startBounds = end = 0;
    skipPossessive = hasFinalPossessive = false;
    setBounds();
  }

  // ================================================= Helper Methods ================================================

  /**
   * Determines whether the transition from lastType to type indicates a break
   *
   * @param lastType Last subword type
   * @param type Current subword type
   * @return {@code true} if the transition indicates a break, {@code false} otherwise
   */
  private boolean isBreak(int lastType, int type) {
    // Types sharing at least one bit never break.
    if ((type & lastType) != 0) {
      return false;
    }

    if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
      // ALPHA->ALPHA: always ignore if case isn't considered.
      return false;
    } else if (isUpper(lastType) && isAlpha(type)) {
      // UPPER->letter: Don't split
      return false;
    } else if (!splitOnNumerics
        && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
      // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
      return false;
    }

    return true;
  }

  /**
   * Determines if the current word contains only one subword. Note, it could be potentially
   * surrounded by delimiters
   *
   * @return {@code true} if the current word contains only one subword, {@code false} otherwise
   */
  boolean isSingleWord() {
    if (hasFinalPossessive) {
      // The trailing "'s" (two characters) is not part of the subword.
      return current == startBounds && end == endBounds - 2;
    } else {
      return current == startBounds && end == endBounds;
    }
  }

  /**
   * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive
   * is found, don't remove it yet, simply note it.
   */
  private void setBounds() {
    while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
      startBounds++;
    }

    while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
      endBounds--;
    }
    if (endsWithPossessive(endBounds)) {
      hasFinalPossessive = true;
    }
    current = startBounds;
  }

  /**
   * Determines if the text at the given position indicates an English possessive which should be
   * removed
   *
   * @param pos Position in the text to check if it indicates an English possessive
   * @return {@code true} if the text at the position indicates an English possessive,
   *     {@code false} otherwise
   */
  private boolean endsWithPossessive(int pos) {
    // Requires: a letter, then an apostrophe, then 's' or 'S', ending at pos, and pos is either
    // the end of the bounded text or sits on a delimiter.
    return (stemEnglishPossessive
        && pos > 2
        && text[pos - 2] == '\''
        && (text[pos - 1] == 's' || text[pos - 1] == 'S')
        && isAlpha(charType(text[pos - 3]))
        && (pos == endBounds || isSubwordDelim(charType(text[pos]))));
  }

  /**
   * Determines the type of the given character
   *
   * @param ch Character whose type is to be determined
   * @return Type of the character
   */
  private int charType(int ch) {
    // Characters covered by the table use it; everything else is computed on the fly.
    if (ch < charTypeTable.length) {
      return charTypeTable[ch];
    }
    return getType(ch);
  }

  /**
   * Computes the type of the given character
   *
   * @param ch Character whose type is to be determined
   * @return Type of the character
   */
  public static byte getType(int ch) {
    switch (Character.getType(ch)) {
      case Character.UPPERCASE_LETTER:
        return UPPER;
      case Character.LOWERCASE_LETTER:
        return LOWER;

      case Character.TITLECASE_LETTER:
      case Character.MODIFIER_LETTER:
      case Character.OTHER_LETTER:
      case Character.NON_SPACING_MARK:
      case Character.ENCLOSING_MARK: // depends what it encloses?
      case Character.COMBINING_SPACING_MARK:
        return ALPHA;

      case Character.DECIMAL_DIGIT_NUMBER:
      case Character.LETTER_NUMBER:
      case Character.OTHER_NUMBER:
        return DIGIT;

      case Character.SURROGATE: // prevent splitting
        return ALPHA | DIGIT;

      // Every remaining category (separators, control, format, private use, punctuation,
      // symbols, ...) is treated as a subword delimiter.
      default:
        return SUBWORD_DELIM;
    }
  }

  /**
   * Checks if the given word type includes {@link #ALPHA}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains ALPHA, {@code false} otherwise
   */
  static boolean isAlpha(int type) {
    return (type & ALPHA) != 0;
  }

  /**
   * Checks if the given word type includes {@link #DIGIT}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains DIGIT, {@code false} otherwise
   */
  static boolean isDigit(int type) {
    return (type & DIGIT) != 0;
  }

  /**
   * Checks if the given word type includes {@link #SUBWORD_DELIM}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
   */
  static boolean isSubwordDelim(int type) {
    return (type & SUBWORD_DELIM) != 0;
  }

  /**
   * Checks if the given word type includes {@link #UPPER}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains UPPER, {@code false} otherwise
   */
  static boolean isUpper(int type) {
    return (type & UPPER) != 0;
  }
}