 * Copyright (c) 2005, 2006, Oracle and/or its affiliates. All rights reserved.
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.

 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.                                                                 *

package sun.text.normalizer;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.text.BreakIterator;
import java.util.Locale;

Internal class used for Unicode character property database.

This class stores binary data read from uprops.icu. It does not have the capability to parse the data into higher-level information. It only returns bytes of information when required.

Due to the form most commonly used for retrieval, array of char is used to store the binary data.

UCharacterPropertyDB also contains information on accessing indexes to significant points in the binary data.

Responsibility for molding the binary data into a more meaningful form lies with UCharacter.

Author:Syn Wee Quek
Since:release 2.1, february 1st 2002
/**
 * <p>Internal class used for Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into higher-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, array of char is used
 * to store the binary data.</p>
 * <p>UCharacterPropertyDB also contains information on accessing indexes to
 * significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into a more meaningful form
 * lies with <a href=UCharacter.html>UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, february 1st 2002
 * @draft 2.1
 */
public final class UCharacterProperty implements Trie.DataManipulate { // public data members -----------------------------------------------
Trie data
/** * Trie data */
public CharTrie m_trie_;
Optimization CharTrie index array
/** * Optimization * CharTrie index array */
public char[] m_trieIndex_;
Optimization CharTrie data array
/** * Optimization * CharTrie data array */
public char[] m_trieData_;
Optimization CharTrie data offset
/** * Optimization * CharTrie data offset */
public int m_trieInitialValue_;
Character property table
/** * Character property table */
public int m_property_[];
Unicode version
/** * Unicode version */
public VersionInfo m_unicodeVersion_;
Exception indicator for uppercase type
/** * Exception indicator for uppercase type */
public static final int EXC_UPPERCASE_ = 0;
Exception indicator for lowercase type
/** * Exception indicator for lowercase type */
public static final int EXC_LOWERCASE_ = 1;
Exception indicator for titlecase type
/** * Exception indicator for titlecase type */
public static final int EXC_TITLECASE_ = 2;
Exception indicator for digit type
/** * Exception indicator for digit type */
public static final int EXC_UNUSED_ = 3;
Exception indicator for numeric type
/** * Exception indicator for numeric type */
public static final int EXC_NUMERIC_VALUE_ = 4;
Exception indicator for denominator type
/** * Exception indicator for denominator type */
public static final int EXC_DENOMINATOR_VALUE_ = 5;
Exception indicator for mirror type
/** * Exception indicator for mirror type */
public static final int EXC_MIRROR_MAPPING_ = 6;
Exception indicator for special casing type
/** * Exception indicator for special casing type */
public static final int EXC_SPECIAL_CASING_ = 7;
Exception indicator for case folding type
/** * Exception indicator for case folding type */
public static final int EXC_CASE_FOLDING_ = 8;
EXC_COMBINING_CLASS_ is not found in ICU. Used to retrieve the combining class of the character in the exception value
/** * EXC_COMBINING_CLASS_ is not found in ICU. * Used to retrieve the combining class of the character in the exception * value */
public static final int EXC_COMBINING_CLASS_ = 9;
Latin lowercase i
/** * Latin lowercase i */
public static final char LATIN_SMALL_LETTER_I_ = 0x69;
Character type mask
/** * Character type mask */
public static final int TYPE_MASK = 0x1F;
Exception test mask
/** * Exception test mask */
public static final int EXCEPTION_MASK = 0x20; // public methods ----------------------------------------------------
Java friends implementation
/**
 * Java friends implementation: copies the trie's private index array,
 * data array and initial value into this object's optimization fields.
 * @param friendagent agent giving access to the CharTrie's private data
 */
public void setIndexData(CharTrie.FriendAgent friendagent) {
    this.m_trieIndex_ = friendagent.getPrivateIndex();
    this.m_trieData_ = friendagent.getPrivateData();
    this.m_trieInitialValue_ = friendagent.getPrivateInitialValue();
}
Called by com.ibm.icu.util.Trie to extract from a lead surrogate's data the index array offset of the indexes for that lead surrogate.
  • value – data value for a surrogate from the trie, including the folding offset
Returns:data offset or 0 if there is no data for the lead surrogate
/**
 * Called by com.ibm.icu.util.Trie to extract from a lead surrogate's
 * data the index array offset of the indexes for that lead surrogate.
 * @param value data value for a surrogate from the trie, including the
 *        folding offset
 * @return data offset or 0 if there is no data for the lead surrogate
 */
public int getFoldingOffset(int value) {
    // The indicator bit tells whether the offset bits carry any data.
    boolean hasFoldingData = (value & SUPPLEMENTARY_FOLD_INDICATOR_MASK_) != 0;
    return hasFoldingData ? (value & SUPPLEMENTARY_FOLD_OFFSET_MASK_) : 0;
}
Gets the property value at the index. This is optimized. Note this is a little different from CharTrie: the index m_trieData_ is never negative.
  • ch – code point whose property value is to be retrieved
Returns:property value of code point
/** * Gets the property value at the index. * This is optimized. * Note this is alittle different from CharTrie the index m_trieData_ * is never negative. * @param ch code point whose property value is to be retrieved * @return property value of code point */
public int getProperty(int ch) { if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { // BMP codepoint // optimized try { return m_property_[ m_trieData_[ (m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_] << Trie.INDEX_STAGE_2_SHIFT_) + (ch & Trie.INDEX_STAGE_3_MASK_)]]; } catch (ArrayIndexOutOfBoundsException e) { return m_property_[m_trieInitialValue_]; } } if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { return m_property_[ m_trieData_[ (m_trieIndex_[Trie.LEAD_INDEX_OFFSET_ + (ch >> Trie.INDEX_STAGE_1_SHIFT_)] << Trie.INDEX_STAGE_2_SHIFT_) + (ch & Trie.INDEX_STAGE_3_MASK_)]]; } // for optimization if (ch <= UTF16.CODEPOINT_MAX_VALUE) { // look at the construction of supplementary characters // trail forms the ends of it. return m_property_[m_trie_.getSurrogateValue( UTF16.getLeadSurrogate(ch), (char)(ch & Trie.SURROGATE_MASK_))]; } // return m_dataOffset_ if there is an error, in this case we return // the default value: m_initialValue_ // we cannot assume that m_initialValue_ is at offset 0 // this is for optimization. return m_property_[m_trieInitialValue_]; // return m_property_[m_trie_.getCodePointValue(ch)]; }
Getting the signed numeric value of a character embedded in the property argument
  • prop – the character
Returns:signed numeric value
/**
 * Gets the signed numeric value of a character embedded in the property
 * argument.
 * @param prop the character property word
 * @return signed numeric value
 */
public static int getSignedValue(int prop) {
    // Arithmetic shift, so the sign of the property word is kept.
    final int signedValue = prop >> VALUE_SHIFT_;
    return signedValue;
}
Getting the exception index for argument property
  • prop – character property
Returns:exception index
/**
 * Gets the exception index for the argument property.
 * @param prop character property
 * @return exception index
 */
public static int getExceptionIndex(int prop) {
    final int shifted = prop >> VALUE_SHIFT_;
    // Masking drops the sign bits left behind by the arithmetic shift.
    return shifted & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
}
Determines if the exception value passed in has the kind of information which the indicator wants, e.g if the exception value contains the digit value of the character
  • index – exception index
  • indicator – type indicator
Returns:true if type value exist
/**
 * Determines if the exception value passed in has the kind of information
 * which the indicator wants, e.g. if the exception value contains the digit
 * value of the character.
 * @param index exception index
 * @param indicator type indicator, used as a bit position
 * @return true if type value exist
 */
public boolean hasExceptionValue(int index, int indicator) {
    final int flags = m_exception_[index];
    final int indicatorBit = 1 << indicator;
    return (flags & indicatorBit) != 0;
}
Gets the exception value at the index, assuming that data type is available. Result is undefined if data is not available. Use hasExceptionValue() to determine data's availability.
  • index –
  • etype – exception data type
Returns:exception data type value at index
/** * Gets the exception value at the index, assuming that data type is * available. Result is undefined if data is not available. Use * hasExceptionValue() to determine data's availability. * @param index * @param etype exception data type * @return exception data type value at index */
public int getException(int index, int etype) { // contained in exception data if (etype == EXC_COMBINING_CLASS_) { return m_exception_[index]; } // contained in the exception digit address index = addExceptionOffset(m_exception_[index], etype, ++ index); return m_exception_[index]; }
Gets the folded case value at the index
  • index – of the case value to be retrieved
  • count – number of characters to retrieve
  • str – string buffer to which to append the result
/** * Gets the folded case value at the index * @param index of the case value to be retrieved * @param count number of characters to retrieve * @param str string buffer to which to append the result */
public void getFoldCase(int index, int count, StringBuffer str) { // first 2 chars are for the simple mappings index += 2; while (count > 0) { str.append(m_case_[index]); index ++; count --; } }
Gets the unicode additional properties. C version getUnicodeProperties.
  • codepoint – codepoint whose additional properties is to be retrieved
Returns:unicode properties
/**
 * Gets the unicode additional properties.
 * C version getUnicodeProperties.
 * @param codepoint codepoint whose additional properties is to be
 *        retrieved
 * @return unicode properties
 */
public int getAdditional(int codepoint) {
    // The additional trie maps the code point to an index into the
    // additional property vectors.
    final int vectorIndex = m_additionalTrie_.getCodePointValue(codepoint);
    return m_additionalVectors_[vectorIndex];
}

Get the "age" of the code point.

The "age" is the Unicode version when the code point was first designated (as a non-character or for Private Use) or assigned a character.

This can be useful to avoid emitting code points to receiving processes that do not accept newer characters.

The data is from the UCD file DerivedAge.txt.

This API does not check the validity of the codepoint.

  • codepoint – The code point.
Returns:the Unicode version number
@draftICU 2.1
/**
 * <p>Get the "age" of the code point.</p>
 * <p>The "age" is the Unicode version when the code point was first
 * designated (as a non-character or for Private Use) or assigned a
 * character.</p>
 * <p>This can be useful to avoid emitting code points to receiving
 * processes that do not accept newer characters.</p>
 * <p>The data is from the UCD file DerivedAge.txt.</p>
 * <p>This API does not check the validity of the codepoint.</p>
 * @param codepoint The code point.
 * @return the Unicode version number
 * @draft ICU 2.1
 */
public VersionInfo getAge(int codepoint) {
    final int version = getAdditional(codepoint) >> AGE_SHIFT_;
    // The two low nibbles of the shifted value hold the first and second
    // version fields; the remaining two fields are always 0.
    final int firstNibble = (version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_;
    final int lastNibble = version & LAST_NIBBLE_MASK_;
    return VersionInfo.getInstance(firstNibble, lastNibble, 0, 0);
}
Forms a supplementary code point from the argument character
Note this is for internal use hence no checks for the validity of the surrogate characters are done
  • lead – lead surrogate character
  • trail – trailing surrogate character
Returns:code point of the supplementary character
/**
 * Forms a supplementary code point from the argument characters.<br>
 * Note this is for internal use, hence no checks for the validity of the
 * surrogate characters are done.
 * @param lead lead surrogate character
 * @param trail trailing surrogate character
 * @return code point of the supplementary character
 */
public static int getRawSupplementary(char lead, char trail) {
    final int shiftedLead = lead << LEAD_SURROGATE_SHIFT_;
    return shiftedLead + trail + SURROGATE_OFFSET_;
}
Loads the property data and initialize the UCharacterProperty instance.
  • RuntimeException – when data is missing or data has been corrupted
/**
 * Loads the property data and initializes the UCharacterProperty instance
 * on first use; later calls return the cached instance.
 * No synchronization is performed here.
 * @return the shared UCharacterProperty instance
 * @throws RuntimeException when data is missing or data has been
 *         corrupted; the underlying failure is attached as the cause
 */
public static UCharacterProperty getInstance() throws RuntimeException {
    if (INSTANCE_ == null) {
        try {
            INSTANCE_ = new UCharacterProperty();
        } catch (Exception e) {
            // Keep the same message as before, but chain the original
            // exception so its type and stack trace are not lost.
            throw new RuntimeException(e.getMessage(), e);
        }
    }
    return INSTANCE_;
}
Checks if the argument c is to be treated as a white space in ICU rules. Usually ICU rule white spaces are ignored unless quoted.
  • c – codepoint to check
Returns:true if c is a ICU white space
/**
 * Checks if the argument c is to be treated as a white space in ICU
 * rules. Usually ICU rule white spaces are ignored unless quoted.
 * @param c codepoint to check
 * @return true if c is a ICU white space
 */
public static boolean isRuleWhiteSpace(int c) {
    /* "white space" in the sense of ICU rule parsers
       This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
       See UTR #31: http://www.unicode.org/reports/tr31/.
       U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
    */
    if (c < 0x0009 || c > 0x2029) {
        return false;
    }
    if (c <= 0x000D) {
        return true;
    }
    switch (c) {
        case 0x0020:
        case 0x0085:
        case 0x200E:
        case 0x200F:
        case 0x2028:
        case 0x2029:
            return true;
        default:
            return false;
    }
}

// protected variables -----------------------------------------------
Case table
/** * Case table */
char m_case_[];
Exception property table
/** * Exception property table */
int m_exception_[];
Extra property trie
/** * Extra property trie */
CharTrie m_additionalTrie_;
Extra property vectors, 1st column for age and second for binary properties.
/** * Extra property vectors, 1st column for age and second for binary * properties. */
int m_additionalVectors_[];
Number of additional columns
/** * Number of additional columns */
int m_additionalColumnsCount_;
Maximum values for block, bits used as in vector word 0
/** * Maximum values for block, bits used as in vector word * 0 */
int m_maxBlockScriptValue_;
Maximum values for script, bits used as in vector word 0
/** * Maximum values for script, bits used as in vector word * 0 */
int m_maxJTGValue_; // private variables -------------------------------------------------
UnicodeData.txt property object
/** * UnicodeData.txt property object */
private static UCharacterProperty INSTANCE_ = null;
Default name of the datafile
/** * Default name of the datafile */
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
Default buffer size of datafile
/** * Default buffer size of datafile */
private static final int DATA_BUFFER_SIZE_ = 25000;
This, from what i infer is the max size of the indicators used for the exception values. Number of bits in an 8-bit integer value
/** * This, from what i infer is the max size of the indicators used for the * exception values. * Number of bits in an 8-bit integer value */
private static final int EXC_GROUP_ = 8;
Mask to get the group
/** * Mask to get the group */
private static final int EXC_GROUP_MASK_ = 255;
Mask to get the digit value in the exception result
/** * Mask to get the digit value in the exception result */
private static final int EXC_DIGIT_MASK_ = 0xFFFF;
Offset table for data in exception block.
Table formed by the number of bits used for the index, e.g. 0 = 0 bits, 1 = 1 bits.
/**
 * Offset table for data in exception block.<br>
 * Table formed by the number of bits used for the index, e.g. 0 = 0 bits,
 * 1 = 1 bits: entry i holds the number of set bits in the 8-bit value i.
 * addExceptionOffset() indexes it with masked exception flags to count how
 * many slots to skip.
 */
private static final byte FLAGS_OFFSET_[] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 };
Numeric value shift
/** * Numeric value shift */
private static final int VALUE_SHIFT_ = 20;
Mask to be applied after shifting to obtain an unsigned numeric value
/** * Mask to be applied after shifting to obtain an unsigned numeric value */
private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0x7FF; /** * */ private static final int NUMERIC_TYPE_SHIFT = 12;
Folding indicator mask
/** * Folding indicator mask */
private static final int SUPPLEMENTARY_FOLD_INDICATOR_MASK_ = 0x8000;
Folding offset mask
/** * Folding offset mask */
private static final int SUPPLEMENTARY_FOLD_OFFSET_MASK_ = 0x7FFF;
Shift value for lead surrogate to form a supplementary character.
/** * Shift value for lead surrogate to form a supplementary character. */
private static final int LEAD_SURROGATE_SHIFT_ = 10;
Offset to add to combined surrogate pair to avoid masking.
/** * Offset to add to combined surrogate pair to avoid msking. */
To get the last character out from a data type
/** * To get the last character out from a data type */
private static final int LAST_CHAR_MASK_ = 0xFFFF;
First nibble shift
/** * First nibble shift */
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
Second nibble mask
/** * Second nibble mask */
private static final int LAST_NIBBLE_MASK_ = 0xF;
Age value shift
/** * Age value shift */
private static final int AGE_SHIFT_ = 24; // private constructors --------------------------------------------------
  • thrown – when data reading fails or data corrupted
/** * Constructor * @exception thrown when data reading fails or data corrupted */
private UCharacterProperty() throws IOException { // jar access InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_); BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_); UCharacterPropertyReader reader = new UCharacterPropertyReader(b); reader.read(this); b.close(); m_trie_.putIndexData(this); } /* Is followed by {case-ignorable}* cased ? */
Getting the correct address for data in the exception value
  • evalue – exception value
  • indicator – type of data to retrieve
  • address – current address to move from
Returns:the correct address
/** * Getting the correct address for data in the exception value * @param evalue exception value * @param indicator type of data to retrieve * @param address current address to move from * @return the correct address */
private int addExceptionOffset(int evalue, int indicator, int address) { int result = address; if (indicator >= EXC_GROUP_) { result += FLAGS_OFFSET_[evalue & EXC_GROUP_MASK_]; evalue >>= EXC_GROUP_; indicator -= EXC_GROUP_; } int mask = (1 << indicator) - 1; result += FLAGS_OFFSET_[evalue & mask]; return result; } private static final int TAB = 0x0009; private static final int LF = 0x000a; private static final int FF = 0x000c; private static final int CR = 0x000d; private static final int U_A = 0x0041; private static final int U_Z = 0x005a; private static final int U_a = 0x0061; private static final int U_z = 0x007a; private static final int DEL = 0x007f; private static final int NL = 0x0085; private static final int NBSP = 0x00a0; private static final int CGJ = 0x034f; private static final int FIGURESP= 0x2007; private static final int HAIRSP = 0x200a; private static final int ZWNJ = 0x200c; private static final int ZWJ = 0x200d; private static final int RLM = 0x200f; private static final int NNBSP = 0x202f; private static final int WJ = 0x2060; private static final int INHSWAP = 0x206a; private static final int NOMDIG = 0x206f; private static final int ZWNBSP = 0xfeff; public UnicodeSet addPropertyStarts(UnicodeSet set) { int c; /* add the start code point of each same-value range of each trie */ //utrie_enum(&normTrie, NULL, _enumPropertyStartsRange, set); TrieIterator propsIter = new TrieIterator(m_trie_); RangeValueIterator.Element propsResult = new RangeValueIterator.Element(); while(propsIter.next(propsResult)){ set.add(propsResult.start); } //utrie_enum(&propsVectorsTrie, NULL, _enumPropertyStartsRange, set); TrieIterator propsVectorsIter = new TrieIterator(m_additionalTrie_); RangeValueIterator.Element propsVectorsResult = new RangeValueIterator.Element(); while(propsVectorsIter.next(propsVectorsResult)){ set.add(propsVectorsResult.start); } /* add code points with hardcoded properties, plus the ones following them */ /* add for IS_THAT_CONTROL_SPACE() */ 
set.add(TAB); /* range TAB..CR */ set.add(CR+1); set.add(0x1c); set.add(0x1f+1); set.add(NL); set.add(NL+1); /* add for u_isIDIgnorable() what was not added above */ set.add(DEL); /* range DEL..NBSP-1, NBSP added below */ set.add(HAIRSP); set.add(RLM+1); set.add(INHSWAP); set.add(NOMDIG+1); set.add(ZWNBSP); set.add(ZWNBSP+1); /* add no-break spaces for u_isWhitespace() what was not added above */ set.add(NBSP); set.add(NBSP+1); set.add(FIGURESP); set.add(FIGURESP+1); set.add(NNBSP); set.add(NNBSP+1); /* add for u_charDigitValue() */ set.add(0x3007); set.add(0x3008); set.add(0x4e00); set.add(0x4e01); set.add(0x4e8c); set.add(0x4e8d); set.add(0x4e09); set.add(0x4e0a); set.add(0x56db); set.add(0x56dc); set.add(0x4e94); set.add(0x4e95); set.add(0x516d); set.add(0x516e); set.add(0x4e03); set.add(0x4e04); set.add(0x516b); set.add(0x516c); set.add(0x4e5d); set.add(0x4e5e); /* add for u_digit() */ set.add(U_a); set.add(U_z+1); set.add(U_A); set.add(U_Z+1); /* add for UCHAR_DEFAULT_IGNORABLE_CODE_POINT what was not added above */ set.add(WJ); /* range WJ..NOMDIG */ set.add(0xfff0); set.add(0xfffb+1); set.add(0xe0000); set.add(0xe0fff+1); /* add for UCHAR_GRAPHEME_BASE and others */ set.add(CGJ); set.add(CGJ+1); /* add for UCHAR_JOINING_TYPE */ set.add(ZWNJ); /* range ZWNJ..ZWJ */ set.add(ZWJ+1); /* add Jamo type boundaries for UCHAR_HANGUL_SYLLABLE_TYPE */ set.add(0x1100); int value= UCharacter.HangulSyllableType.LEADING_JAMO; int value2; for(c=0x115a; c<=0x115f; ++c) { value2= UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE); if(value!=value2) { value=value2; set.add(c); } } set.add(0x1160); value=UCharacter.HangulSyllableType.VOWEL_JAMO; for(c=0x11a3; c<=0x11a7; ++c) { value2=UCharacter.getIntPropertyValue(c, UProperty.HANGUL_SYLLABLE_TYPE); if(value!=value2) { value=value2; set.add(c); } } set.add(0x11a8); value=UCharacter.HangulSyllableType.TRAILING_JAMO; for(c=0x11fa; c<=0x11ff; ++c) { value2=UCharacter.getIntPropertyValue(c, 
UProperty.HANGUL_SYLLABLE_TYPE); if(value!=value2) { value=value2; set.add(c); } } /* * Omit code points for u_charCellWidth() because * - it is deprecated and not a real Unicode property * - they are probably already set from the trie enumeration */ /* * Omit code points with hardcoded specialcasing properties * because we do not build property UnicodeSets for them right now. */ return set; // for chaining } /*---------------------------------------------------------------- * Inclusions list *----------------------------------------------------------------*/ /* * Return a set of characters for property enumeration. * The set implicitly contains 0x110000 as well, which is one more than the highest * Unicode code point. * * This set is used as an ordered list - its code points are ordered, and * consecutive code points (in Unicode code point order) in the set define a range. * For each two consecutive characters (start, limit) in the set, * all of the UCD/normalization and related properties for * all code points start..limit-1 are all the same, * except for character names and ISO comments. * * All Unicode code points U+0000..U+10ffff are covered by these ranges. * The ranges define a partition of the Unicode code space. * ICU uses the inclusions set to enumerate properties for generating * UnicodeSets containing all code points that have a certain property value. * * The Inclusion List is generated from the UCD. It is generated * by enumerating the data tries, and code points for hardcoded properties * are added as well. * * -------------------------------------------------------------------------- * * The following are ideas for getting properties-unique code point ranges, * with possible optimizations beyond the current implementation. * These optimizations would require more code and be more fragile. * The current implementation generates one single list (set) for all properties. 
* * To enumerate properties efficiently, one needs to know ranges of * repetitive values, so that the value of only each start code point * can be applied to the whole range. * This information is in principle available in the uprops.icu/unorm.icu data. * * There are two obstacles: * * 1. Some properties are computed from multiple data structures, * making it necessary to get repetitive ranges by intersecting * ranges from multiple tries. * * 2. It is not economical to write code for getting repetitive ranges * that are precise for each of some 50 properties. * * Compromise ideas: * * - Get ranges per trie, not per individual property. * Each range contains the same values for a whole group of properties. * This would generate currently five range sets, two for uprops.icu tries * and three for unorm.icu tries. * * - Combine sets of ranges for multiple tries to get sufficient sets * for properties, e.g., the uprops.icu main and auxiliary tries * for all non-normalization properties. * * Ideas for representing ranges and combining them: * * - A UnicodeSet could hold just the start code points of ranges. * Multiple sets are easily combined by or-ing them together. * * - Alternatively, a UnicodeSet could hold each even-numbered range. * All ranges could be enumerated by using each start code point * (for the even-numbered ranges) as well as each limit (end+1) code point * (for the odd-numbered ranges). * It should be possible to combine two such sets by xor-ing them, * but no more than two. * * The second way to represent ranges may(?!) yield smaller UnicodeSet arrays, * but the first one is certainly simpler and applicable for combining more than * two range sets. * * It is possible to combine all range sets for all uprops/unorm tries into one * set that can be used for all properties. * As an optimization, there could be less-combined range sets for certain * groups of properties. 
* The relationship of which less-combined range set to use for which property * depends on the implementation of the properties and must be hardcoded * - somewhat error-prone and higher maintenance but can be tested easily * by building property sets "the simple way" in test code. * * --- * * Do not use a UnicodeSet pattern because that causes infinite recursion; * UnicodeSet depends on the inclusions set. */ public UnicodeSet getInclusions() { UnicodeSet set = new UnicodeSet(); NormalizerImpl.addPropertyStarts(set); addPropertyStarts(set); return set; } }