/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.util;


import java.text.BreakIterator; // javadoc
import java.text.CharacterIterator;
import java.util.Locale;

/**
 * A CharacterIterator used internally for use with {@link BreakIterator}.
 *
 * <p>The iterator exposes a region of a {@code char[]} buffer. Positions reported through the
 * {@link CharacterIterator} API ({@link #getBeginIndex()}, {@link #getEndIndex()},
 * {@link #getIndex()}, {@link #setIndex(int)}) are relative to the start of that region, i.e.
 * they run from {@code 0} to {@link #getLength()}.
 *
 * <p>Every character handed out by {@link #current()} is first passed through
 * {@link #jreBugWorkaround(char)}, which lets subclasses substitute characters that a particular
 * BreakIterator implementation cannot handle.
 *
 * @lucene.internal
 */
public abstract class CharArrayIterator implements CharacterIterator {
  /** Backing text buffer; not copied, shared with the caller of {@link #setText}. */
  private char[] array;
  /** Offset into {@link #array} of the first character of the region. */
  private int start;
  /** Absolute position in {@link #array} of the current character. */
  private int index;
  /** Number of characters in the region. */
  private int length;
  /** Absolute exclusive end of the region: {@code start + length}. */
  private int limit;

  /** Returns the text buffer last passed to {@link #setText} (the same array, not a copy). */
  public char[] getText() {
    return array;
  }

  /** Returns the offset into the buffer at which the examined region begins. */
  public int getStart() {
    return start;
  }

  /** Returns the length of the examined region. */
  public int getLength() {
    return length;
  }

  /**
   * Set a new region of text to be examined by this iterator
   *
   * <p>The current position is reset to the beginning of the region. No bounds checking is
   * performed here.
   *
   * @param array text buffer to examine
   * @param start offset into buffer
   * @param length maximum length to examine
   */
  public void setText(final char[] array, int start, int length) {
    this.array = array;
    this.start = start;
    this.index = start;
    this.length = length;
    this.limit = start + length;
  }

  /**
   * Returns the character at the current position after applying
   * {@link #jreBugWorkaround(char)}, or {@link #DONE} when positioned at the end of the region.
   */
  @Override
  public char current() {
    return (index == limit) ? DONE : jreBugWorkaround(array[index]);
  }

  /**
   * Hook applied to every character returned by {@link #current()}.
   *
   * @param ch the raw character read from the buffer
   * @return the character to report to the consumer of this iterator
   */
  protected abstract char jreBugWorkaround(char ch);

  /** Moves to the beginning of the region and returns {@link #current()}. */
  @Override
  public char first() {
    index = start;
    return current();
  }

  /** Always {@code 0}: positions are relative to the start of the region. */
  @Override
  public int getBeginIndex() {
    return 0;
  }

  /** Returns the region length, which is the relative end position. */
  @Override
  public int getEndIndex() {
    return length;
  }

  /** Returns the current position relative to the start of the region. */
  @Override
  public int getIndex() {
    return index - start;
  }

  /**
   * Moves to the last character of the region and returns {@link #current()}. For an empty
   * region the position stays at the end and {@link #DONE} is returned.
   */
  @Override
  public char last() {
    index = (limit == start) ? limit : limit - 1;
    return current();
  }

  /**
   * Advances by one character. If that reaches or passes the end of the region, the position is
   * pinned to the end and {@link #DONE} is returned.
   */
  @Override
  public char next() {
    if (++index >= limit) {
      index = limit;
      return DONE;
    } else {
      return current();
    }
  }

  /**
   * Steps back by one character. If that would move before the start of the region, the position
   * is pinned to the start and {@link #DONE} is returned.
   */
  @Override
  public char previous() {
    if (--index < start) {
      index = start;
      return DONE;
    } else {
      return current();
    }
  }

  /**
   * Moves to the given region-relative position and returns {@link #current()}.
   *
   * @param position new position, between {@link #getBeginIndex()} and {@link #getEndIndex()}
   *     inclusive
   * @throws IllegalArgumentException if {@code position} is outside that range
   */
  @Override
  public char setIndex(int position) {
    if (position < getBeginIndex() || position > getEndIndex()) {
      throw new IllegalArgumentException("Illegal Position: " + position);
    }
    index = start + position;
    return current();
  }

  /**
   * Returns a copy of this iterator produced by {@link Object#clone()}; the copy refers to the
   * same text buffer.
   */
  @Override
  public CharArrayIterator clone() {
    try {
      return (CharArrayIterator) super.clone();
    } catch (CloneNotSupportedException e) {
      // CharacterIterator does not allow you to throw CloneNotSupported
      throw new RuntimeException(e);
    }
  }

  /**
   * Create a new CharArrayIterator that works around JRE bugs
   * in a manner suitable for {@link BreakIterator#getSentenceInstance()}
   */
  public static CharArrayIterator newSentenceInstance() {
    if (HAS_BUGGY_BREAKITERATORS) {
      return new CharArrayIterator() {
        // work around this for now by lying about all surrogates to
        // the sentence tokenizer, instead we treat them all as
        // SContinue so we won't break around them.
        @Override
        protected char jreBugWorkaround(char ch) {
          // every surrogate code unit is reported as 0x002C (a comma)
          return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
        }
      };
    } else {
      return new CharArrayIterator() {
        // no bugs
        @Override
        protected char jreBugWorkaround(char ch) {
          return ch;
        }
      };
    }
  }

  /**
   * Create a new CharArrayIterator that works around JRE bugs
   * in a manner suitable for {@link BreakIterator#getWordInstance()}
   */
  public static CharArrayIterator newWordInstance() {
    if (HAS_BUGGY_BREAKITERATORS) {
      return new CharArrayIterator() {
        // work around this for now by lying about all surrogates to the word,
        // instead we treat them all as ALetter so we won't break around them.
        @Override
        protected char jreBugWorkaround(char ch) {
          // every surrogate code unit is reported as 0x0041 (the letter 'A')
          return ch >= 0xD800 && ch <= 0xDFFF ? 0x0041 : ch;
        }
      };
    } else {
      return new CharArrayIterator() {
        // no bugs
        @Override
        protected char jreBugWorkaround(char ch) {
          return ch;
        }
      };
    }
  }

  /**
   * True if this JRE has a buggy BreakIterator implementation
   */
  public static final boolean HAS_BUGGY_BREAKITERATORS;

  static {
    boolean v;
    try {
      // Probe: feed a single surrogate pair to a US sentence BreakIterator and advance once.
      BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
      bi.setText("\udb40\udc53");
      bi.next();
      v = false;
    } catch (Exception e) {
      // Any exception thrown by the probe is taken as evidence of the bug.
      v = true;
    }
    HAS_BUGGY_BREAKITERATORS = v;
  }
}