/*
 * Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved         *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.                                                                 *
 *******************************************************************************
 */

package sun.text.normalizer;

Class enabling iteration of the values in a Trie.

Result of each iteration contains the interval of codepoints that have the same value type and the value type itself.

The comparison of each codepoint value is done via extract(), which the default implementation is to return the value as it is.

Method extract() can be overwritten to perform manipulations on codepoint values in order to perform specialized comparison.

TrieIterator is designed to be a generic iterator for the CharTrie and the IntTrie, hence to accommodate both types of data, the return result will be in terms of int (32 bit) values.

See com.ibm.icu.text.UCharacterTypeIterator for examples of use.

Notes for porting utrie_enum from icu4c to icu4j:
Internally, icu4c's utrie_enum performs all iterations in its body. In Java sense, the caller will have to pass a object with a callback function UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, uint32_t value) into utrie_enum. utrie_enum will then find ranges of codepoints with the same value as determined by UTrieEnumValue(const void *context, uint32_t value). for each range, utrie_enum calls the callback function to perform a task. In this way, icu4c performs the iteration within utrie_enum. To follow the JDK model, icu4j is slightly different from icu4c. Instead of requesting the caller to implement an object for a callback. The caller will have to implement a subclass of TrieIterator, fleshing out the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j, the caller will have to code his own iteration and flesh out the task (equivalent to UTrieEnumRange) to be performed in the iteration loop.

There are basically 3 usage scenarios for porting:

1) UTrieEnumValue is the only implemented callback then just implement a subclass of TrieIterator and override the extract(int) method. The extract(int) method is analogus to UTrieEnumValue callback.

2) UTrieEnumValue and UTrieEnumRange both are implemented then implement a subclass of TrieIterator, override the extract method and iterate, e.g

utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange, set);
In Java :

class TrieIteratorImpl extends TrieIterator{
    public TrieIteratorImpl(Trie data){
        super(data);
    }
    public int extract(int value){
        // port the implementation of _enumPropertyStartsValue here
    }
}
....
TrieIterator fcdIter  = new TrieIteratorImpl(fcdTrieImpl.fcdTrie);
while(fcdIter.next(result)) {
    // port the implementation of _enumPropertyStartsRange
}

3) UTrieEnumRange is the only implemented callback then just implement the while loop, when utrie_enum is called

// utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set);
TrieIterator fcdIter  = new TrieIterator(fcdTrieImpl.fcdTrie);
while(fcdIter.next(result)){
    set.add(result.start);
}

Author:synwee
See Also:
  • Trie
  • UCharacterTypeIterator
Since:release 2.1, Jan 17 2002
/** * <p>Class enabling iteration of the values in a Trie.</p> * <p>Result of each iteration contains the interval of codepoints that have * the same value type and the value type itself.</p> * <p>The comparison of each codepoint value is done via extract(), which the * default implementation is to return the value as it is.</p> * <p>Method extract() can be overwritten to perform manipulations on * codepoint values in order to perform specialized comparison.</p> * <p>TrieIterator is designed to be a generic iterator for the CharTrie * and the IntTrie, hence to accommodate both types of data, the return * result will be in terms of int (32 bit) values.</p> * <p>See com.ibm.icu.text.UCharacterTypeIterator for examples of use.</p> * <p>Notes for porting utrie_enum from icu4c to icu4j:<br> * Internally, icu4c's utrie_enum performs all iterations in its body. In Java * sense, the caller will have to pass a object with a callback function * UTrieEnumRange(const void *context, UChar32 start, UChar32 limit, * uint32_t value) into utrie_enum. utrie_enum will then find ranges of * codepoints with the same value as determined by * UTrieEnumValue(const void *context, uint32_t value). for each range, * utrie_enum calls the callback function to perform a task. In this way, * icu4c performs the iteration within utrie_enum. * To follow the JDK model, icu4j is slightly different from icu4c. * Instead of requesting the caller to implement an object for a callback. * The caller will have to implement a subclass of TrieIterator, fleshing out * the method extract(int) (equivalent to UTrieEnumValue). Independent of icu4j, * the caller will have to code his own iteration and flesh out the task * (equivalent to UTrieEnumRange) to be performed in the iteration loop. * </p> * <p>There are basically 3 usage scenarios for porting:</p> * <p>1) UTrieEnumValue is the only implemented callback then just implement a * subclass of TrieIterator and override the extract(int) method. The * extract(int) method is analogus to UTrieEnumValue callback. * </p> * <p>2) UTrieEnumValue and UTrieEnumRange both are implemented then implement * a subclass of TrieIterator, override the extract method and iterate, e.g * </p> * <p>utrie_enum(&normTrie, _enumPropertyStartsValue, _enumPropertyStartsRange, * set);<br> * In Java :<br> * <pre> * class TrieIteratorImpl extends TrieIterator{ * public TrieIteratorImpl(Trie data){ * super(data); * } * public int extract(int value){ * // port the implementation of _enumPropertyStartsValue here * } * } * .... * TrieIterator fcdIter = new TrieIteratorImpl(fcdTrieImpl.fcdTrie); * while(fcdIter.next(result)) { * // port the implementation of _enumPropertyStartsRange * } * </pre> * </p> * <p>3) UTrieEnumRange is the only implemented callback then just implement * the while loop, when utrie_enum is called * <pre> * // utrie_enum(&fcdTrie, NULL, _enumPropertyStartsRange, set); * TrieIterator fcdIter = new TrieIterator(fcdTrieImpl.fcdTrie); * while(fcdIter.next(result)){ * set.add(result.start); * } * </pre> * </p> * @author synwee * @see com.ibm.icu.impl.Trie * @see com.ibm.icu.lang.UCharacterTypeIterator * @since release 2.1, Jan 17 2002 */
public class TrieIterator implements RangeValueIterator { // public constructor ---------------------------------------------
TrieEnumeration constructor
Params:
  • trie – to be used
Throws:
/** * TrieEnumeration constructor * @param trie to be used * @exception IllegalArgumentException throw when argument is null. */
public TrieIterator(Trie trie) { if (trie == null) { throw new IllegalArgumentException( "Argument trie cannot be null"); } m_trie_ = trie; // synwee: check that extract belongs to the child class m_initialValue_ = extract(m_trie_.getInitialValue()); reset(); } // public methods -------------------------------------------------

Returns true if we are not at the end of the iteration, false otherwise.

The next set of codepoints with the same value type will be calculated during this call and returned in the arguement element.

Params:
  • element – return result
Throws:
  • NoSuchElementException – - if no more elements exist.
See Also:
  • Element
Returns:true if we are not at the end of the iteration, false otherwise.
/** * <p>Returns true if we are not at the end of the iteration, false * otherwise.</p> * <p>The next set of codepoints with the same value type will be * calculated during this call and returned in the arguement element.</p> * @param element return result * @return true if we are not at the end of the iteration, false otherwise. * @exception NoSuchElementException - if no more elements exist. * @see com.ibm.icu.util.RangeValueIterator.Element */
public final boolean next(Element element) { if (m_nextCodepoint_ > UCharacter.MAX_VALUE) { return false; } if (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE && calculateNextBMPElement(element)) { return true; } calculateNextSupplementaryElement(element); return true; }
Resets the iterator to the beginning of the iteration
/** * Resets the iterator to the beginning of the iteration */
public final void reset() { m_currentCodepoint_ = 0; m_nextCodepoint_ = 0; m_nextIndex_ = 0; m_nextBlock_ = m_trie_.m_index_[0] << Trie.INDEX_STAGE_2_SHIFT_; if (m_nextBlock_ == 0) { m_nextValue_ = m_initialValue_; } else { m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_)); } m_nextBlockIndex_ = 0; m_nextTrailIndexOffset_ = TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_; } // protected methods ----------------------------------------------
Called by next() to extracts a 32 bit value from a trie value used for comparison. This method is to be overwritten if special manipulation is to be done to retrieve a relevant comparison. The default function is to return the value as it is.
Params:
  • value – a value from the trie
Returns:extracted value
/** * Called by next() to extracts a 32 bit value from a trie value * used for comparison. * This method is to be overwritten if special manipulation is to be done * to retrieve a relevant comparison. * The default function is to return the value as it is. * @param value a value from the trie * @return extracted value */
protected int extract(int value) { return value; } // private methods ------------------------------------------------
Set the result values
Params:
  • element – return result object
  • start – codepoint of range
  • limit – (end + 1) codepoint of range
  • value – common value of range
/** * Set the result values * @param element return result object * @param start codepoint of range * @param limit (end + 1) codepoint of range * @param value common value of range */
private final void setResult(Element element, int start, int limit, int value) { element.start = start; element.limit = limit; element.value = value; }
Finding the next element. This method is called just before returning the result of next(). We always store the next element before it is requested. In the case that we have to continue calculations into the supplementary planes, a false will be returned.
Params:
  • element – return result object
Returns:true if the next range is found, false if we have to proceed to the supplementary range.
/** * Finding the next element. * This method is called just before returning the result of * next(). * We always store the next element before it is requested. * In the case that we have to continue calculations into the * supplementary planes, a false will be returned. * @param element return result object * @return true if the next range is found, false if we have to proceed to * the supplementary range. */
private final boolean calculateNextBMPElement(Element element) { int currentBlock = m_nextBlock_; int currentValue = m_nextValue_; m_currentCodepoint_ = m_nextCodepoint_; m_nextCodepoint_ ++; m_nextBlockIndex_ ++; if (!checkBlockDetail(currentValue)) { setResult(element, m_currentCodepoint_, m_nextCodepoint_, currentValue); return true; } // synwee check that next block index == 0 here // enumerate BMP - the main loop enumerates data blocks while (m_nextCodepoint_ < UCharacter.SUPPLEMENTARY_MIN_VALUE) { m_nextIndex_ ++; // because of the way the character is split to form the index // the lead surrogate and trail surrogate can not be in the // mid of a block if (m_nextCodepoint_ == LEAD_SURROGATE_MIN_VALUE_) { // skip lead surrogate code units, // go to lead surrogate codepoints m_nextIndex_ = BMP_INDEX_LENGTH_; } else if (m_nextCodepoint_ == TRAIL_SURROGATE_MIN_VALUE_) { // go back to regular BMP code points m_nextIndex_ = m_nextCodepoint_ >> Trie.INDEX_STAGE_1_SHIFT_; } m_nextBlockIndex_ = 0; if (!checkBlock(currentBlock, currentValue)) { setResult(element, m_currentCodepoint_, m_nextCodepoint_, currentValue); return true; } } m_nextCodepoint_ --; // step one back since this value has not been m_nextBlockIndex_ --; // retrieved yet. return false; }
Finds the next supplementary element. For each entry in the trie, the value to be delivered is passed through extract(). We always store the next element before it is requested. Called after calculateNextBMP() completes its round of BMP characters. There is a slight difference in the usage of m_currentCodepoint_ here as compared to calculateNextBMP(). Though both represents the lower bound of the next element, in calculateNextBMP() it gets set at the start of any loop, where-else, in calculateNextSupplementary() since m_currentCodepoint_ already contains the lower bound of the next element (passed down from calculateNextBMP()), we keep it till the end before resetting it to the new value. Note, if there are no more iterations, it will never get to here. Blocked out by next().
Params:
  • element – return result object
/** * Finds the next supplementary element. * For each entry in the trie, the value to be delivered is passed through * extract(). * We always store the next element before it is requested. * Called after calculateNextBMP() completes its round of BMP characters. * There is a slight difference in the usage of m_currentCodepoint_ * here as compared to calculateNextBMP(). Though both represents the * lower bound of the next element, in calculateNextBMP() it gets set * at the start of any loop, where-else, in calculateNextSupplementary() * since m_currentCodepoint_ already contains the lower bound of the * next element (passed down from calculateNextBMP()), we keep it till * the end before resetting it to the new value. * Note, if there are no more iterations, it will never get to here. * Blocked out by next(). * @param element return result object */
private final void calculateNextSupplementaryElement(Element element) { int currentValue = m_nextValue_; int currentBlock = m_nextBlock_; m_nextCodepoint_ ++; m_nextBlockIndex_ ++; if (UTF16.getTrailSurrogate(m_nextCodepoint_) != UTF16.TRAIL_SURROGATE_MIN_VALUE) { // this piece is only called when we are in the middle of a lead // surrogate block if (!checkNullNextTrailIndex() && !checkBlockDetail(currentValue)) { setResult(element, m_currentCodepoint_, m_nextCodepoint_, currentValue); m_currentCodepoint_ = m_nextCodepoint_; return; } // we have cleared one block m_nextIndex_ ++; m_nextTrailIndexOffset_ ++; if (!checkTrailBlock(currentBlock, currentValue)) { setResult(element, m_currentCodepoint_, m_nextCodepoint_, currentValue); m_currentCodepoint_ = m_nextCodepoint_; return; } } int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); // enumerate supplementary code points while (nextLead < TRAIL_SURROGATE_MIN_VALUE_) { // lead surrogate access int leadBlock = m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_; if (leadBlock == m_trie_.m_dataOffset_) { // no entries for a whole block of lead surrogates if (currentValue != m_initialValue_) { m_nextValue_ = m_initialValue_; m_nextBlock_ = 0; m_nextBlockIndex_ = 0; setResult(element, m_currentCodepoint_, m_nextCodepoint_, currentValue); m_currentCodepoint_ = m_nextCodepoint_; return; } nextLead += DATA_BLOCK_LENGTH_; // number of total affected supplementary codepoints in one // block // this is not a simple addition of // DATA_BLOCK_SUPPLEMENTARY_LENGTH since we need to consider // that we might have moved some of the codepoints m_nextCodepoint_ = UCharacterProperty.getRawSupplementary( (char)nextLead, (char)UTF16.TRAIL_SURROGATE_MIN_VALUE); continue; } if (m_trie_.m_dataManipulate_ == null) { throw new NullPointerException( "The field DataManipulate in this Trie is null"); } // enumerate trail surrogates for this lead surrogate m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( m_trie_.getValue(leadBlock + (nextLead & Trie.INDEX_STAGE_3_MASK_))); if (m_nextIndex_ <= 0) { // no data for this lead surrogate if (currentValue != m_initialValue_) { m_nextValue_ = m_initialValue_; m_nextBlock_ = 0; m_nextBlockIndex_ = 0; setResult(element, m_currentCodepoint_, m_nextCodepoint_, currentValue); m_currentCodepoint_ = m_nextCodepoint_; return; } m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_; } else { m_nextTrailIndexOffset_ = 0; if (!checkTrailBlock(currentBlock, currentValue)) { setResult(element, m_currentCodepoint_, m_nextCodepoint_, currentValue); m_currentCodepoint_ = m_nextCodepoint_; return; } } nextLead ++; } // deliver last range setResult(element, m_currentCodepoint_, UCharacter.MAX_VALUE + 1, currentValue); }
Internal block value calculations Performs calculations on a data block to find codepoints in m_nextBlock_ after the index m_nextBlockIndex_ that has the same value. Note m_*_ variables at this point is the next codepoint whose value has not been calculated. But when returned with false, it will be the last codepoint whose value has been calculated.
Params:
  • currentValue – the value which other codepoints are tested against
Returns:true if the whole block has the same value as currentValue or if the whole block has been calculated, false otherwise.
/** * Internal block value calculations * Performs calculations on a data block to find codepoints in m_nextBlock_ * after the index m_nextBlockIndex_ that has the same value. * Note m_*_ variables at this point is the next codepoint whose value * has not been calculated. * But when returned with false, it will be the last codepoint whose * value has been calculated. * @param currentValue the value which other codepoints are tested against * @return true if the whole block has the same value as currentValue or if * the whole block has been calculated, false otherwise. */
private final boolean checkBlockDetail(int currentValue) { while (m_nextBlockIndex_ < DATA_BLOCK_LENGTH_) { m_nextValue_ = extract(m_trie_.getValue(m_nextBlock_ + m_nextBlockIndex_)); if (m_nextValue_ != currentValue) { return false; } ++ m_nextBlockIndex_; ++ m_nextCodepoint_; } return true; }
Internal block value calculations Performs calculations on a data block to find codepoints in m_nextBlock_ that has the same value. Will call checkBlockDetail() if highlevel check fails. Note m_*_ variables at this point is the next codepoint whose value has not been calculated.
Params:
  • currentBlock – the initial block containing all currentValue
  • currentValue – the value which other codepoints are tested against
Returns:true if the whole block has the same value as currentValue or if the whole block has been calculated, false otherwise.
/** * Internal block value calculations * Performs calculations on a data block to find codepoints in m_nextBlock_ * that has the same value. * Will call checkBlockDetail() if highlevel check fails. * Note m_*_ variables at this point is the next codepoint whose value * has not been calculated. * @param currentBlock the initial block containing all currentValue * @param currentValue the value which other codepoints are tested against * @return true if the whole block has the same value as currentValue or if * the whole block has been calculated, false otherwise. */
private final boolean checkBlock(int currentBlock, int currentValue) { m_nextBlock_ = m_trie_.m_index_[m_nextIndex_] << Trie.INDEX_STAGE_2_SHIFT_; if (m_nextBlock_ == currentBlock && (m_nextCodepoint_ - m_currentCodepoint_) >= DATA_BLOCK_LENGTH_) { // the block is the same as the previous one, filled with // currentValue m_nextCodepoint_ += DATA_BLOCK_LENGTH_; } else if (m_nextBlock_ == 0) { // this is the all-initial-value block if (currentValue != m_initialValue_) { m_nextValue_ = m_initialValue_; m_nextBlockIndex_ = 0; return false; } m_nextCodepoint_ += DATA_BLOCK_LENGTH_; } else { if (!checkBlockDetail(currentValue)) { return false; } } return true; }
Internal block value calculations Performs calculations on multiple data blocks for a set of trail surrogates to find codepoints in m_nextBlock_ that has the same value. Will call checkBlock() for internal block checks. Note m_*_ variables at this point is the next codepoint whose value has not been calculated.
Params:
  • currentBlock – the initial block containing all currentValue
  • currentValue – the value which other codepoints are tested against
Returns:true if the whole block has the same value as currentValue or if the whole block has been calculated, false otherwise.
/** * Internal block value calculations * Performs calculations on multiple data blocks for a set of trail * surrogates to find codepoints in m_nextBlock_ that has the same value. * Will call checkBlock() for internal block checks. * Note m_*_ variables at this point is the next codepoint whose value * has not been calculated. * @param currentBlock the initial block containing all currentValue * @param currentValue the value which other codepoints are tested against * @return true if the whole block has the same value as currentValue or if * the whole block has been calculated, false otherwise. */
private final boolean checkTrailBlock(int currentBlock, int currentValue) { // enumerate code points for this lead surrogate while (m_nextTrailIndexOffset_ < TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_) { // if we ever reach here, we are at the start of a new block m_nextBlockIndex_ = 0; // copy of most of the body of the BMP loop if (!checkBlock(currentBlock, currentValue)) { return false; } m_nextTrailIndexOffset_ ++; m_nextIndex_ ++; } return true; }
Checks if we are beginning at the start of a initial block. If we are then the rest of the codepoints in this initial block has the same values. We increment m_nextCodepoint_ and relevant data members if so. This is used only in for the supplementary codepoints because the offset to the trail indexes could be 0.
Returns:true if we are at the start of a initial block.
/** * Checks if we are beginning at the start of a initial block. * If we are then the rest of the codepoints in this initial block * has the same values. * We increment m_nextCodepoint_ and relevant data members if so. * This is used only in for the supplementary codepoints because * the offset to the trail indexes could be 0. * @return true if we are at the start of a initial block. */
private final boolean checkNullNextTrailIndex() { if (m_nextIndex_ <= 0) { m_nextCodepoint_ += TRAIL_SURROGATE_COUNT_ - 1; int nextLead = UTF16.getLeadSurrogate(m_nextCodepoint_); int leadBlock = m_trie_.m_index_[nextLead >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_; if (m_trie_.m_dataManipulate_ == null) { throw new NullPointerException( "The field DataManipulate in this Trie is null"); } m_nextIndex_ = m_trie_.m_dataManipulate_.getFoldingOffset( m_trie_.getValue(leadBlock + (nextLead & Trie.INDEX_STAGE_3_MASK_))); m_nextIndex_ --; m_nextBlockIndex_ = DATA_BLOCK_LENGTH_; return true; } return false; } // private data members --------------------------------------------
Size of the stage 1 BMP indexes
/** * Size of the stage 1 BMP indexes */
private static final int BMP_INDEX_LENGTH_ = 0x10000 >> Trie.INDEX_STAGE_1_SHIFT_;
Lead surrogate minimum value
/** * Lead surrogate minimum value */
private static final int LEAD_SURROGATE_MIN_VALUE_ = 0xD800;
Trail surrogate minimum value
/** * Trail surrogate minimum value */
private static final int TRAIL_SURROGATE_MIN_VALUE_ = 0xDC00;
Number of trail surrogate
/** * Number of trail surrogate */
private static final int TRAIL_SURROGATE_COUNT_ = 0x400;
Number of stage 1 indexes for supplementary calculations that maps to each lead surrogate character. See second pass into getRawOffset for the trail surrogate character. 10 for significant number of bits for trail surrogates, 5 for what we discard during shifting.
/** * Number of stage 1 indexes for supplementary calculations that maps to * each lead surrogate character. * See second pass into getRawOffset for the trail surrogate character. * 10 for significant number of bits for trail surrogates, 5 for what we * discard during shifting. */
private static final int TRAIL_SURROGATE_INDEX_BLOCK_LENGTH_ = 1 << (10 - Trie.INDEX_STAGE_1_SHIFT_);
Number of data values in a stage 2 (data array) block.
/** * Number of data values in a stage 2 (data array) block. */
private static final int DATA_BLOCK_LENGTH_ = 1 << Trie.INDEX_STAGE_1_SHIFT_;
Trie instance
/** * Trie instance */
private Trie m_trie_;
Initial value for trie values
/** * Initial value for trie values */
private int m_initialValue_;
Next element results and data.
/** * Next element results and data. */
private int m_currentCodepoint_; private int m_nextCodepoint_; private int m_nextValue_; private int m_nextIndex_; private int m_nextBlock_; private int m_nextBlockIndex_; private int m_nextTrailIndexOffset_; }