// org.apache.lucene/lucene-analyzers-common/8.2.0 : org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java
//
// WordDelimiterIterator
// http://lucene.apache.org/lucene-parent/lucene-analyzers-common: Additional Analyzers (The Apache Software Foundation)
// Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.miscellaneous;

/**
 * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterGraphFilter rules.
 * @lucene.internal
 */
public final class WordDelimiterIterator {

  /** Type bit assigned to lowercase letters. */
  static final int LOWER = 0x01;
  /** Type bit assigned to uppercase letters. */
  static final int UPPER = 0x02;
  /** Type bit assigned to digits and other numeric characters. */
  static final int DIGIT = 0x04;
  /** Type bit assigned to every character that is not a letter or a number. */
  static final int SUBWORD_DELIM = 0x08;

  // combinations: for testing, not for setting bits
  /** {@link #LOWER} | {@link #UPPER}. */
  public static final int ALPHA = 0x03;
  /** {@link #LOWER} | {@link #UPPER} | {@link #DIGIT}. */
  public static final int ALPHANUM = 0x07;

  /** Indicates the end of iteration */
  public static final int DONE = -1;

  /**
   * Default character type table covering the first 256 code points; built once in the
   * static initializer below.
   */
  public static final byte[] DEFAULT_WORD_DELIM_TABLE;

  /** Buffer holding the text being iterated; set by {@link #setText(char[], int)}. */
  char[] text;
  /** Number of valid characters in {@link #text}. */
  int length;

  /** start position of text, excluding leading delimiters */
  int startBounds;
  /** end position of text, excluding trailing delimiters */
  int endBounds;

  /** Beginning of subword */
  int current;
  /** End of subword */
  int end;

  /** does this string end with a possessive such as 's */
  private boolean hasFinalPossessive = false;

  /**
   * If false, causes case changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens). (Defaults to true)
   */
  final boolean splitOnCaseChange;

  /**
   * If false, causes numeric changes to be ignored (subwords will only be generated
   * given SUBWORD_DELIM tokens). (Defaults to true)
   */
  final boolean splitOnNumerics;

  /**
   * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
   * <p>
   * "O'Neil's" =&gt; "O", "Neil"
   */
  final boolean stemEnglishPossessive;

  /** Lookup table consulted by {@link #charType(int)} before falling back to {@link #getType(int)}. */
  private final byte[] charTypeTable;

  /** if true, need to skip over a possessive found in the last call to next() */
  private boolean skipPossessive = false;

  // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
  // done if separated by these chars?) "," would be an obvious candidate...
  static {
    byte[] tab = new byte[256];
    for (int i = 0; i < 256; i++) {
      byte code = 0;
      if (Character.isLowerCase(i)) {
        code |= LOWER;
      } else if (Character.isUpperCase(i)) {
        code |= UPPER;
      } else if (Character.isDigit(i)) {
        code |= DIGIT;
      }
      // Anything that is neither a letter nor a digit separates subwords.
      if (code == 0) {
        code = SUBWORD_DELIM;
      }
      tab[i] = code;
    }
    DEFAULT_WORD_DELIM_TABLE = tab;
  }

  /**
   * Create a new WordDelimiterIterator operating with the supplied rules.
   *
   * @param charTypeTable table containing character types
   * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
   * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
   * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" =&gt; "O", "Neil"
   */
  WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
    this.charTypeTable = charTypeTable;
    this.splitOnCaseChange = splitOnCaseChange;
    this.splitOnNumerics = splitOnNumerics;
    this.stemEnglishPossessive = stemEnglishPossessive;
  }

  /**
   * Advance to the next subword in the string.
   * After a successful call, {@link #current} is the start (inclusive) and {@link #end}
   * the end (exclusive) of the subword.
   *
   * @return index of the next subword, or {@link #DONE} if all subwords have been returned
   */
  int next() {
    current = end;
    if (current == DONE) {
      return DONE;
    }

    // The previous subword was followed by "'s": step over those two characters.
    if (skipPossessive) {
      current += 2;
      skipPossessive = false;
    }

    int lastType = 0;

    // Skip delimiters; on exit lastType holds the type of the subword's first character.
    while (current < endBounds && (isSubwordDelim(lastType = charType(text[current])))) {
      current++;
    }

    if (current >= endBounds) {
      return end = DONE;
    }

    // Extend the subword until the type transition counts as a break.
    for (end = current + 1; end < endBounds; end++) {
      int type = charType(text[end]);
      if (isBreak(lastType, type)) {
        break;
      }
      lastType = type;
    }

    // At least two characters remain: check whether they form a possessive suffix.
    if (end < endBounds - 1 && endsWithPossessive(end + 2)) {
      skipPossessive = true;
    }

    return end;
  }


  /**
   * Return the type of the current subword.
   * This currently uses the type of the first character in the subword.
   *
   * @return type of the current word, or 0 once iteration has reached {@link #DONE}
   */
  int type() {
    if (end == DONE) {
      return 0;
    }

    int type = charType(text[current]);
    switch (type) {
      // return ALPHA word type for both lower and upper
      case LOWER:
      case UPPER:
        return ALPHA;
      default:
        return type;
    }
  }

  /**
   * Reset the text to a new value, and reset all state
   *
   * @param text New text
   * @param length length of the text
   */
  void setText(char[] text, int length) {
    this.text = text;
    this.length = this.endBounds = length;
    current = startBounds = end = 0;
    skipPossessive = hasFinalPossessive = false;
    setBounds();
  }

  // ================================================= Helper Methods ================================================

  /**
   * Determines whether the transition from lastType to type indicates a break
   *
   * @param lastType Last subword type
   * @param type Current subword type
   * @return {@code true} if the transition indicates a break, {@code false} otherwise
   */
  private boolean isBreak(int lastType, int type) {
    // Types sharing at least one bit never break.
    if ((type & lastType) != 0) {
      return false;
    }

    if (!splitOnCaseChange && isAlpha(lastType) && isAlpha(type)) {
      // ALPHA->ALPHA: always ignore if case isn't considered.
      return false;
    } else if (isUpper(lastType) && isAlpha(type)) {
      // UPPER->letter: Don't split
      return false;
    } else if (!splitOnNumerics && ((isAlpha(lastType) && isDigit(type)) || (isDigit(lastType) && isAlpha(type)))) {
      // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
      return false;
    }

    return true;
  }

  /**
   * Determines if the current word contains only one subword.  Note, it could be potentially surrounded by delimiters
   *
   * @return {@code true} if the current word contains only one subword, {@code false} otherwise
   */
  boolean isSingleWord() {
    if (hasFinalPossessive) {
      // The trailing "'s" (two characters) is not part of the subword.
      return current == startBounds && end == endBounds - 2;
    } else {
      return current == startBounds && end == endBounds;
    }
  }

  /**
   * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
   * it yet, simply note it.
   */
  private void setBounds() {
    while (startBounds < length && (isSubwordDelim(charType(text[startBounds])))) {
      startBounds++;
    }

    while (endBounds > startBounds && (isSubwordDelim(charType(text[endBounds - 1])))) {
      endBounds--;
    }
    if (endsWithPossessive(endBounds)) {
      hasFinalPossessive = true;
    }
    current = startBounds;
  }

  /**
   * Determines if the text at the given position indicates an English possessive which should be removed.
   * Always {@code false} when {@link #stemEnglishPossessive} is disabled.
   *
   * @param pos Position in the text to check if it indicates an English possessive
   * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
   */
  private boolean endsWithPossessive(int pos) {
    // Requires: a letter, then an apostrophe, then 's'/'S', then end of bounds or a delimiter.
    return (stemEnglishPossessive &&
            pos > 2 &&
            text[pos - 2] == '\'' &&
            (text[pos - 1] == 's' || text[pos - 1] == 'S') &&
            isAlpha(charType(text[pos - 3])) &&
            (pos == endBounds || isSubwordDelim(charType(text[pos]))));
  }

  /**
   * Determines the type of the given character: looked up in the configured table when the
   * character falls inside it, otherwise computed by {@link #getType(int)}.
   *
   * @param ch Character whose type is to be determined
   * @return Type of the character
   */
  private int charType(int ch) {
    if (ch < charTypeTable.length) {
      return charTypeTable[ch];
    }
    return getType(ch);
  }

  /**
   * Computes the type of the given character from its {@link Character#getType(int)} category.
   *
   * @param ch Character whose type is to be determined
   * @return Type of the character
   */
  public static byte getType(int ch) {
    switch (Character.getType(ch)) {
      case Character.UPPERCASE_LETTER: return UPPER;
      case Character.LOWERCASE_LETTER: return LOWER;

      case Character.TITLECASE_LETTER:
      case Character.MODIFIER_LETTER:
      case Character.OTHER_LETTER:
      case Character.NON_SPACING_MARK:
      case Character.ENCLOSING_MARK:  // depends what it encloses?
      case Character.COMBINING_SPACING_MARK:
        return ALPHA;

      case Character.DECIMAL_DIGIT_NUMBER:
      case Character.LETTER_NUMBER:
      case Character.OTHER_NUMBER:
        return DIGIT;

      // case Character.SPACE_SEPARATOR:
      // case Character.LINE_SEPARATOR:
      // case Character.PARAGRAPH_SEPARATOR:
      // case Character.CONTROL:
      // case Character.FORMAT:
      // case Character.PRIVATE_USE:

      case Character.SURROGATE:  // prevent splitting
        return ALPHA|DIGIT;

      // case Character.DASH_PUNCTUATION:
      // case Character.START_PUNCTUATION:
      // case Character.END_PUNCTUATION:
      // case Character.CONNECTOR_PUNCTUATION:
      // case Character.OTHER_PUNCTUATION:
      // case Character.MATH_SYMBOL:
      // case Character.CURRENCY_SYMBOL:
      // case Character.MODIFIER_SYMBOL:
      // case Character.OTHER_SYMBOL:
      // case Character.INITIAL_QUOTE_PUNCTUATION:
      // case Character.FINAL_QUOTE_PUNCTUATION:

      default: return SUBWORD_DELIM;
    }
  }

  /**
   * Checks if the given word type includes {@link #ALPHA}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains ALPHA, {@code false} otherwise
   */
  static boolean isAlpha(int type) {
    return (type & ALPHA) != 0;
  }

  /**
   * Checks if the given word type includes {@link #DIGIT}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains DIGIT, {@code false} otherwise
   */
  static boolean isDigit(int type) {
    return (type & DIGIT) != 0;
  }

  /**
   * Checks if the given word type includes {@link #SUBWORD_DELIM}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
   */
  static boolean isSubwordDelim(int type) {
    return (type & SUBWORD_DELIM) != 0;
  }

  /**
   * Checks if the given word type includes {@link #UPPER}
   *
   * @param type Word type to check
   * @return {@code true} if the type contains UPPER, {@code false} otherwise
   */
  static boolean isUpper(int type) {
    return (type & UPPER) != 0;
  }

}
// Params:	lastType – Last subword type type – Current subword type
// Returns:	`true` if the transition indicates a break, `false` otherwise
// Params:	pos – Position in the text to check if it indicates an English possessive
// Returns:	`true` if the text at the position indicates an English possessive, `false` otherwise
// Params:	ch – Character whose type is to be determined
// Returns:	Type of the character
// Params:	type – Word type to check
// Returns:	`true` if the type contains ALPHA, `false` otherwise
// /
//
// org.apache.lucene/ lucene-analyzers-common/ 8.2.0/ org/apache/lucene/analysis/miscellaneous/WordDelimiterIterator.java