/*
 * Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
/*
 *******************************************************************************
 * (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved                     *
 *                                                                             *
 * The original version of this source code and documentation is copyrighted   *
 * and owned by IBM, These materials are provided under terms of a License     *
 * Agreement between IBM and Sun. This technology is protected by multiple     *
 * US and International patents. This notice and attribution to IBM may not    *
 * to removed.                                                                 *
 *******************************************************************************
 */

package sun.text.normalizer;

import java.lang.ref.SoftReference;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

The UCharacter class provides extensions to the java.lang.Character class. These extensions provide support for Unicode 3.2 properties and together with the UTF16 class, provide support for supplementary characters (those with code points above U+FFFF).

Code points are represented in these API using ints. While it would be more convenient in Java to have a separate primitive datatype for them, ints suffice in the meantime.

To use this class please add the jar file name icu4j.jar to the class path, since it contains data files which supply the information used by this file.
E.g. In Windows
set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar.
Otherwise, another method would be to copy the files uprops.dat and unames.icu from the icu4j source subdirectory $ICU4J_SRC/src/com.ibm.icu.impl.data to your class directory $ICU4J_CLASS/com.ibm.icu.impl.data.

Aside from the additions for UTF-16 support, and the updated Unicode 3.1 properties, the main differences between UCharacter and Character are:

  • UCharacter is not designed to be a char wrapper and does not have APIs to which involves management of that single char.
    These include:
    • char charValue(),
    • int compareTo(java.lang.Character, java.lang.Character), etc.
  • UCharacter does not include Character APIs that are deprecated, not does it include the Java-specific character information, such as boolean isJavaIdentifierPart(char ch).
  • Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric values '10' - '35'. UCharacter also does this in digit and getNumericValue, to adhere to the java semantics of these methods. New methods unicodeDigit, and getUnicodeNumericValue do not treat the above code points as having numeric values. This is a semantic change from ICU4J 1.3.1.

Further detail differences can be determined from the program com.ibm.icu.dev.test.lang.UCharacterCompare

This class is not subclassable

Author:Syn Wee Quek
See Also:
  • UCharacterEnums
@stableICU 2.1
/** * <p> * The UCharacter class provides extensions to the * <a href=http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.html> * java.lang.Character</a> class. These extensions provide support for * Unicode 3.2 properties and together with the <a href=../text/UTF16.html>UTF16</a> * class, provide support for supplementary characters (those with code * points above U+FFFF). * </p> * <p> * Code points are represented in these API using ints. While it would be * more convenient in Java to have a separate primitive datatype for them, * ints suffice in the meantime. * </p> * <p> * To use this class please add the jar file name icu4j.jar to the * class path, since it contains data files which supply the information used * by this file.<br> * E.g. In Windows <br> * <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br> * Otherwise, another method would be to copy the files uprops.dat and * unames.icu from the icu4j source subdirectory * <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory * <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>. * </p> * <p> * Aside from the additions for UTF-16 support, and the updated Unicode 3.1 * properties, the main differences between UCharacter and Character are: * <ul> * <li> UCharacter is not designed to be a char wrapper and does not have * APIs to which involves management of that single char.<br> * These include: * <ul> * <li> char charValue(), * <li> int compareTo(java.lang.Character, java.lang.Character), etc. * </ul> * <li> UCharacter does not include Character APIs that are deprecated, not * does it include the Java-specific character information, such as * boolean isJavaIdentifierPart(char ch). * <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric * values '10' - '35'. UCharacter also does this in digit and * getNumericValue, to adhere to the java semantics of these * methods. New methods unicodeDigit, and * getUnicodeNumericValue do not treat the above code points * as having numeric values. This is a semantic change from ICU4J 1.3.1. * </ul> * <p> * Further detail differences can be determined from the program * <a href = http://oss.software.ibm.com/developerworks/opensource/cvs/icu4j/~checkout~/icu4j/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java> * com.ibm.icu.dev.test.lang.UCharacterCompare</a> * </p> * <p> * This class is not subclassable * </p> * @author Syn Wee Quek * @stable ICU 2.1 * @see com.ibm.icu.lang.UCharacterEnums */
public final class UCharacter {
Numeric Type constants.
See Also:
  • NUMERIC_TYPE.NUMERIC_TYPE
@stableICU 2.4
/** * Numeric Type constants. * @see UProperty#NUMERIC_TYPE * @stable ICU 2.4 */
public static interface NumericType {
@stableICU 2.4
/** * @stable ICU 2.4 */
public static final int NONE = 0;
@stableICU 2.4
/** * @stable ICU 2.4 */
public static final int DECIMAL = 1;
@stableICU 2.4
/** * @stable ICU 2.4 */
public static final int DIGIT = 2;
@stableICU 2.4
/** * @stable ICU 2.4 */
public static final int NUMERIC = 3;
@stableICU 2.4
/** * @stable ICU 2.4 */
public static final int COUNT = 4; }
Hangul Syllable Type constants.
See Also:
  • HANGUL_SYLLABLE_TYPE.HANGUL_SYLLABLE_TYPE
@stableICU 2.6
/** * Hangul Syllable Type constants. * * @see UProperty#HANGUL_SYLLABLE_TYPE * @stable ICU 2.6 */
public static interface HangulSyllableType {
@stableICU 2.6
/** * @stable ICU 2.6 */
public static final int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/
@stableICU 2.6
/** * @stable ICU 2.6 */
public static final int LEADING_JAMO = 1; /*[L]*/
@stableICU 2.6
/** * @stable ICU 2.6 */
public static final int VOWEL_JAMO = 2; /*[V]*/
@stableICU 2.6
/** * @stable ICU 2.6 */
public static final int TRAILING_JAMO = 3; /*[T]*/
@stableICU 2.6
/** * @stable ICU 2.6 */
public static final int LV_SYLLABLE = 4; /*[LV]*/
@stableICU 2.6
/** * @stable ICU 2.6 */
public static final int LVT_SYLLABLE = 5; /*[LVT]*/
@stableICU 2.6
/** * @stable ICU 2.6 */
public static final int COUNT = 6; }
[Sun] This interface moved from UCharacterEnums.java. 'Enum' for the CharacterCategory constants. These constants are compatible in name but not in value with those defined in java.lang.Character.
See Also:
  • UCharacterCategory
@draftICU 3.0
Deprecated:This is a draft API and might change in a future release of ICU.
/** * [Sun] This interface moved from UCharacterEnums.java. * * 'Enum' for the CharacterCategory constants. These constants are * compatible in name <b>but not in value</b> with those defined in * <code>java.lang.Character</code>. * @see UCharacterCategory * @draft ICU 3.0 * @deprecated This is a draft API and might change in a future release of ICU. */
public static interface ECharacterCategory {
Character type Lu
@stableICU 2.1
/** * Character type Lu * @stable ICU 2.1 */
public static final int UPPERCASE_LETTER = 1;
Character type Lt
@stableICU 2.1
/** * Character type Lt * @stable ICU 2.1 */
public static final int TITLECASE_LETTER = 3;
Character type Lo
@stableICU 2.1
/** * Character type Lo * @stable ICU 2.1 */
public static final int OTHER_LETTER = 5; } // public data members -----------------------------------------------
The lowest Unicode code point value.
@stableICU 2.1
/** * The lowest Unicode code point value. * @stable ICU 2.1 */
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
The highest Unicode code point value (scalar value) according to the Unicode Standard. This is a 21-bit value (21 bits, rounded up).
Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE
@stableICU 2.1
/** * The highest Unicode code point value (scalar value) according to the * Unicode Standard. * This is a 21-bit value (21 bits, rounded up).<br> * Up-to-date Unicode implementation of java.lang.Character.MIN_VALUE * @stable ICU 2.1 */
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
The minimum value for Supplementary code points
@stableICU 2.1
/** * The minimum value for Supplementary code points * @stable ICU 2.1 */
public static final int SUPPLEMENTARY_MIN_VALUE = UTF16.SUPPLEMENTARY_MIN_VALUE;
Special value that is returned by getUnicodeNumericValue(int) when no numeric value is defined for a code point.
See Also:
@stableICU 2.4
/** * Special value that is returned by getUnicodeNumericValue(int) when no * numeric value is defined for a code point. * @stable ICU 2.4 * @see #getUnicodeNumericValue */
public static final double NO_NUMERIC_VALUE = -123456789; // public methods ----------------------------------------------------
Retrieves the numeric value of a decimal digit code point.
This method observes the semantics of java.lang.Character.digit(). Note that this will return positive values for code points for which isDigit returns false, just like java.lang.Character.
Semantic Change: In release 1.3.1 and prior, this did not treat the European letters as having a digit value, and also treated numeric letters and other numbers as digits. This has been changed to conform to the java semantics.
A code point is a valid digit if and only if:
  • ch is a decimal digit or one of the european letters, and
  • the value of ch is less than the specified radix.
Params:
  • ch – the code point to query
  • radix – the radix
Returns:the numeric value represented by the code point in the specified radix, or -1 if the code point is not a decimal digit or if its value is too large for the radix
@stableICU 2.1
/** * Retrieves the numeric value of a decimal digit code point. * <br>This method observes the semantics of * <code>java.lang.Character.digit()</code>. Note that this * will return positive values for code points for which isDigit * returns false, just like java.lang.Character. * <br><em>Semantic Change:</em> In release 1.3.1 and * prior, this did not treat the European letters as having a * digit value, and also treated numeric letters and other numbers as * digits. * This has been changed to conform to the java semantics. * <br>A code point is a valid digit if and only if: * <ul> * <li>ch is a decimal digit or one of the european letters, and * <li>the value of ch is less than the specified radix. * </ul> * @param ch the code point to query * @param radix the radix * @return the numeric value represented by the code point in the * specified radix, or -1 if the code point is not a decimal digit * or if its value is too large for the radix * @stable ICU 2.1 */
public static int digit(int ch, int radix) { // when ch is out of bounds getProperty == 0 int props = getProperty(ch); if (getNumericType(props) != NumericType.DECIMAL) { return (radix <= 10) ? -1 : getEuropeanDigit(ch); } // if props == 0, it will just fall through and return -1 if (isNotExceptionIndicator(props)) { // not contained in exception data // getSignedValue is just shifting so we can check for the sign // first // Optimization // int result = UCharacterProperty.getSignedValue(props); // if (result >= 0) { // return result; // } if (props >= 0) { return UCharacterProperty.getSignedValue(props); } } else { int index = UCharacterProperty.getExceptionIndex(props); if (PROPERTY_.hasExceptionValue(index, UCharacterProperty.EXC_NUMERIC_VALUE_)) { int result = PROPERTY_.getException(index, UCharacterProperty.EXC_NUMERIC_VALUE_); if (result >= 0) { return result; } } } if (radix > 10) { int result = getEuropeanDigit(ch); if (result >= 0 && result < radix) { return result; } } return -1; }

Get the numeric value for a Unicode code point as defined in the Unicode Character Database.

A "double" return type is necessary because some numeric values are fractions, negative, or too large for int.

For characters without any numeric values in the Unicode Character Database, this function will return NO_NUMERIC_VALUE.

API Change: In release 2.2 and prior, this API has a return type int and returns -1 when the argument ch does not have a corresponding numeric value. This has been changed to synch with ICU4C

This corresponds to the ICU4C function u_getNumericValue.
Params:
  • ch – Code point to get the numeric value for.
Returns:numeric value of ch, or NO_NUMERIC_VALUE if none is defined.
@stableICU 2.4
/** * <p>Get the numeric value for a Unicode code point as defined in the * Unicode Character Database.</p> * <p>A "double" return type is necessary because some numeric values are * fractions, negative, or too large for int.</p> * <p>For characters without any numeric values in the Unicode Character * Database, this function will return NO_NUMERIC_VALUE.</p> * <p><em>API Change:</em> In release 2.2 and prior, this API has a * return type int and returns -1 when the argument ch does not have a * corresponding numeric value. This has been changed to synch with ICU4C * </p> * This corresponds to the ICU4C function u_getNumericValue. * @param ch Code point to get the numeric value for. * @return numeric value of ch, or NO_NUMERIC_VALUE if none is defined. * @stable ICU 2.4 */
public static double getUnicodeNumericValue(int ch) { // equivalent to c version double u_getNumericValue(UChar32 c) int props = PROPERTY_.getProperty(ch); int numericType = getNumericType(props); if (numericType > NumericType.NONE && numericType < NumericType.COUNT) { if (isNotExceptionIndicator(props)) { return UCharacterProperty.getSignedValue(props); } else { int index = UCharacterProperty.getExceptionIndex(props); boolean nex = false; boolean dex = false; double numerator = 0; if (PROPERTY_.hasExceptionValue(index, UCharacterProperty.EXC_NUMERIC_VALUE_)) { int num = PROPERTY_.getException(index, UCharacterProperty.EXC_NUMERIC_VALUE_); // There are special values for huge numbers that are // powers of ten. genprops/store.c documents: // if numericValue = 0x7fffff00 + x then // numericValue = 10 ^ x if (num >= NUMERATOR_POWER_LIMIT_) { num &= 0xff; // 10^x without math.h numerator = Math.pow(10, num); } else { numerator = num; } nex = true; } double denominator = 0; if (PROPERTY_.hasExceptionValue(index, UCharacterProperty.EXC_DENOMINATOR_VALUE_)) { denominator = PROPERTY_.getException(index, UCharacterProperty.EXC_DENOMINATOR_VALUE_); // faster path not in c if (numerator != 0) { return numerator / denominator; } dex = true; } if (nex) { if (dex) { return numerator / denominator; } return numerator; } if (dex) { return 1 / denominator; } } } return NO_NUMERIC_VALUE; }
Returns a value indicating a code point's Unicode category. Up-to-date Unicode implementation of java.lang.Character.getType() except for the above mentioned code points that had their category changed.
Return results are constants from the interface UCharacterCategory
NOTE: the UCharacterCategory values are not compatible with those returned by java.lang.Character.getType. UCharacterCategory values match the ones used in ICU4C, while java.lang.Character type values, though similar, skip the value 17.

Params:
  • ch – code point whose type is to be determined
Returns:category which is a value of UCharacterCategory
@stableICU 2.1
/** * Returns a value indicating a code point's Unicode category. * Up-to-date Unicode implementation of java.lang.Character.getType() * except for the above mentioned code points that had their category * changed.<br> * Return results are constants from the interface * <a href=UCharacterCategory.html>UCharacterCategory</a><br> * <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with * those returned by java.lang.Character.getType. UCharacterCategory values * match the ones used in ICU4C, while java.lang.Character type * values, though similar, skip the value 17.</p> * @param ch code point whose type is to be determined * @return category which is a value of UCharacterCategory * @stable ICU 2.1 */
public static int getType(int ch) { return getProperty(ch) & UCharacterProperty.TYPE_MASK; } //// for StringPrep
Returns a code point corresponding to the two UTF16 characters.
Params:
  • lead – the lead char
  • trail – the trail char
Throws:
Returns:code point if surrogate characters are valid.
@stableICU 2.1
/** * Returns a code point corresponding to the two UTF16 characters. * @param lead the lead char * @param trail the trail char * @return code point if surrogate characters are valid. * @exception IllegalArgumentException thrown when argument characters do * not form a valid codepoint * @stable ICU 2.1 */
public static int getCodePoint(char lead, char trail) { if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE && trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) { return UCharacterProperty.getRawSupplementary(lead, trail); } throw new IllegalArgumentException("Illegal surrogate characters"); } //// for StringPrep
Returns the Bidirection property of a code point. For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional property.
Result returned belongs to the interface UCharacterDirection
Params:
  • ch – the code point to be determined its direction
Returns:direction constant from UCharacterDirection.
@stableICU 2.1
/** * Returns the Bidirection property of a code point. * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional * property.<br> * Result returned belongs to the interface * <a href=UCharacterDirection.html>UCharacterDirection</a> * @param ch the code point to be determined its direction * @return direction constant from UCharacterDirection. * @stable ICU 2.1 */
public static int getDirection(int ch) { // when ch is out of bounds getProperty == 0 return (getProperty(ch) >> BIDI_SHIFT_) & BIDI_MASK_AFTER_SHIFT_; }
The given string is mapped to its case folding equivalent according to UnicodeData.txt and CaseFolding.txt; if any character has no case folding equivalent, the character itself is returned. "Full", multiple-code point case folding mappings are returned here. For "simple" single-code point mappings use the API foldCase(int ch, boolean defaultmapping).
Params:
  • str – the String to be converted
  • defaultmapping – Indicates if all mappings defined in CaseFolding.txt is to be used, otherwise the mappings for dotted I and dotless i marked with 'I' in CaseFolding.txt will be skipped.
See Also:
Returns: the case folding equivalent of the character, if any; otherwise the character itself.
@stableICU 2.1
/** * The given string is mapped to its case folding equivalent according to * UnicodeData.txt and CaseFolding.txt; if any character has no case * folding equivalent, the character itself is returned. * "Full", multiple-code point case folding mappings are returned here. * For "simple" single-code point mappings use the API * foldCase(int ch, boolean defaultmapping). * @param str the String to be converted * @param defaultmapping Indicates if all mappings defined in * CaseFolding.txt is to be used, otherwise the * mappings for dotted I and dotless i marked with * 'I' in CaseFolding.txt will be skipped. * @return the case folding equivalent of the character, if * any; otherwise the character itself. * @see #foldCase(int, boolean) * @stable ICU 2.1 */
public static String foldCase(String str, boolean defaultmapping) { int size = str.length(); StringBuffer result = new StringBuffer(size); int offset = 0; int ch; // case mapping loop while (offset < size) { ch = UTF16.charAt(str, offset); offset += UTF16.getCharCount(ch); int props = PROPERTY_.getProperty(ch); if (isNotExceptionIndicator(props)) { int type = UCharacterProperty.TYPE_MASK & props; if (type == ECharacterCategory.UPPERCASE_LETTER || type == ECharacterCategory.TITLECASE_LETTER) { ch += UCharacterProperty.getSignedValue(props); } } else { int index = UCharacterProperty.getExceptionIndex(props); if (PROPERTY_.hasExceptionValue(index, UCharacterProperty.EXC_CASE_FOLDING_)) { int exception = PROPERTY_.getException(index, UCharacterProperty.EXC_CASE_FOLDING_); if (exception != 0) { PROPERTY_.getFoldCase(exception & LAST_CHAR_MASK_, exception >> SHIFT_24_, result); } else { // special case folding mappings, hardcoded if (ch != 0x49 && ch != 0x130) { // return ch itself because there is no special // mapping for it UTF16.append(result, ch); continue; } if (defaultmapping) { // default mappings if (ch == 0x49) { // 0049; C; 0069; # LATIN CAPITAL LETTER I result.append( UCharacterProperty.LATIN_SMALL_LETTER_I_); } else if (ch == 0x130) { // 0130; F; 0069 0307; // # LATIN CAPITAL LETTER I WITH DOT ABOVE result.append( UCharacterProperty.LATIN_SMALL_LETTER_I_); result.append((char)0x307); } } else { // Turkic mappings if (ch == 0x49) { // 0049; T; 0131; # LATIN CAPITAL LETTER I result.append((char)0x131); } else if (ch == 0x130) { // 0130; T; 0069; // # LATIN CAPITAL LETTER I WITH DOT ABOVE result.append( UCharacterProperty.LATIN_SMALL_LETTER_I_); } } } // do not fall through to the output of c continue; } else { if (PROPERTY_.hasExceptionValue(index, UCharacterProperty.EXC_LOWERCASE_)) { ch = PROPERTY_.getException(index, UCharacterProperty.EXC_LOWERCASE_); } } } // handle 1:1 code point mappings from UnicodeData.txt UTF16.append(result, ch); } return result.toString(); }

Get the "age" of the code point.

The "age" is the Unicode version when the code point was first designated (as a non-character or for Private Use) or assigned a character.

This can be useful to avoid emitting code points to receiving processes that do not accept newer characters.

The data is from the UCD file DerivedAge.txt.

Params:
  • ch – The code point.
Returns:the Unicode version number
@stableICU 2.6
/** * <p>Get the "age" of the code point.</p> * <p>The "age" is the Unicode version when the code point was first * designated (as a non-character or for Private Use) or assigned a * character. * <p>This can be useful to avoid emitting code points to receiving * processes that do not accept newer characters.</p> * <p>The data is from the UCD file DerivedAge.txt.</p> * @param ch The code point. * @return the Unicode version number * @stable ICU 2.6 */
public static VersionInfo getAge(int ch) { if (ch < MIN_VALUE || ch > MAX_VALUE) { throw new IllegalArgumentException("Codepoint out of bounds"); } return PROPERTY_.getAge(ch); }

Gets the property value for an Unicode property type of a code point. Also returns binary and mask property values.

Unicode, especially in version 3.2, defines many more properties than the original set in UnicodeData.txt.

The properties APIs are intended to reflect Unicode properties as defined in the Unicode Character Database (UCD) and Unicode Technical Reports (UTR). For details about the properties see http://www.unicode.org/.

For names of Unicode properties see the UCD file PropertyAliases.txt.

Sample usage:
int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
boolean b = (ideo == 1) ? true : false;
Params:
  • ch – code point to test.
  • type – UProperty selector constant, identifies which binary property to check. Must be UProperty.BINARY_START <= type < UProperty.BINARY_LIMIT or UProperty.INT_START <= type < UProperty.INT_LIMIT or UProperty.MASK_START <= type < UProperty.MASK_LIMIT.
See Also:
  • UProperty
  • hasBinaryProperty
  • getIntPropertyMinValue
  • getIntPropertyMaxValue
  • getUnicodeVersion
Returns:numeric value that is directly the property value or, for enumerated properties, corresponds to the numeric value of the enumerated constant of the respective property value enumeration type (cast to enum type if necessary). Returns 0 or 1 (for false / true) for binary Unicode properties. Returns a bit-mask for mask properties. Returns 0 if 'type' is out of bounds or if the Unicode version does not have data for the property at all, or not for this code point.
@stableICU 2.4
/** * <p>Gets the property value for an Unicode property type of a code point. * Also returns binary and mask property values.</p> * <p>Unicode, especially in version 3.2, defines many more properties than * the original set in UnicodeData.txt.</p> * <p>The properties APIs are intended to reflect Unicode properties as * defined in the Unicode Character Database (UCD) and Unicode Technical * Reports (UTR). For details about the properties see * http://www.unicode.org/.</p> * <p>For names of Unicode properties see the UCD file PropertyAliases.txt. * </p> * <pre> * Sample usage: * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH); * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC); * boolean b = (ideo == 1) ? true : false; * </pre> * @param ch code point to test. * @param type UProperty selector constant, identifies which binary * property to check. Must be * UProperty.BINARY_START &lt;= type &lt; UProperty.BINARY_LIMIT or * UProperty.INT_START &lt;= type &lt; UProperty.INT_LIMIT or * UProperty.MASK_START &lt;= type &lt; UProperty.MASK_LIMIT. * @return numeric value that is directly the property value or, * for enumerated properties, corresponds to the numeric value of * the enumerated constant of the respective property value * enumeration type (cast to enum type if necessary). * Returns 0 or 1 (for false / true) for binary Unicode properties. * Returns a bit-mask for mask properties. * Returns 0 if 'type' is out of bounds or if the Unicode version * does not have data for the property at all, or not for this code * point. * @see UProperty * @see #hasBinaryProperty * @see #getIntPropertyMinValue * @see #getIntPropertyMaxValue * @see #getUnicodeVersion * @stable ICU 2.4 */
public static int getIntPropertyValue(int ch, int type) { /* * For Normalizer with Unicode 3.2, this method is called only for * HANGUL_SYLLABLE_TYPE in UnicodeSet.addPropertyStarts(). */ if (type == UProperty.HANGUL_SYLLABLE_TYPE) { /* purely algorithmic; hardcode known characters, check for assigned new ones */ if(ch<NormalizerImpl.JAMO_L_BASE) { /* NA */ } else if(ch<=0x11ff) { /* Jamo range */ if(ch<=0x115f) { /* Jamo L range, HANGUL CHOSEONG ... */ if(ch==0x115f || ch<=0x1159 || getType(ch)==ECharacterCategory.OTHER_LETTER) { return HangulSyllableType.LEADING_JAMO; } } else if(ch<=0x11a7) { /* Jamo V range, HANGUL JUNGSEONG ... */ if(ch<=0x11a2 || getType(ch)==ECharacterCategory.OTHER_LETTER) { return HangulSyllableType.VOWEL_JAMO; } } else { /* Jamo T range */ if(ch<=0x11f9 || getType(ch)==ECharacterCategory.OTHER_LETTER) { return HangulSyllableType.TRAILING_JAMO; } } } else if((ch-=NormalizerImpl.HANGUL_BASE)<0) { /* NA */ } else if(ch<NormalizerImpl.HANGUL_COUNT) { /* Hangul syllable */ return ch%NormalizerImpl.JAMO_T_COUNT==0 ? HangulSyllableType.LV_SYLLABLE : HangulSyllableType.LVT_SYLLABLE; } } return 0; /* NA */ } // private variables -------------------------------------------------
Database storing the sets of character property
/** * Database storing the sets of character property */
private static final UCharacterProperty PROPERTY_;
For optimization
/** * For optimization */
private static final char[] PROPERTY_TRIE_INDEX_; private static final char[] PROPERTY_TRIE_DATA_; private static final int[] PROPERTY_DATA_; private static final int PROPERTY_INITIAL_VALUE_; // block to initialise character property database static { try { PROPERTY_ = UCharacterProperty.getInstance(); PROPERTY_TRIE_INDEX_ = PROPERTY_.m_trieIndex_; PROPERTY_TRIE_DATA_ = PROPERTY_.m_trieData_; PROPERTY_DATA_ = PROPERTY_.m_property_; PROPERTY_INITIAL_VALUE_ = PROPERTY_DATA_[PROPERTY_.m_trieInitialValue_]; } catch (Exception e) { throw new RuntimeException(e.getMessage()); } }
To get the last character out from a data type
/** * To get the last character out from a data type */
private static final int LAST_CHAR_MASK_ = 0xFFFF; /** * To get the last byte out from a data type */ // private static final int LAST_BYTE_MASK_ = 0xFF; /** * Shift 16 bits */ // private static final int SHIFT_16_ = 16;
Shift 24 bits
/** * Shift 24 bits */
private static final int SHIFT_24_ = 24;
Shift to get numeric type
/** * Shift to get numeric type */
private static final int NUMERIC_TYPE_SHIFT_ = 12;
Mask to get numeric type
/** * Mask to get numeric type */
private static final int NUMERIC_TYPE_MASK_ = 0x7 << NUMERIC_TYPE_SHIFT_;
Shift to get bidi bits
/** * Shift to get bidi bits */
private static final int BIDI_SHIFT_ = 6;
Mask to be applied after shifting to get bidi bits
/** * Mask to be applied after shifting to get bidi bits */
private static final int BIDI_MASK_AFTER_SHIFT_ = 0x1F;

Numerator power limit. There are special values for huge numbers that are powers of ten.

c version genprops/store.c documents: if numericValue = 0x7fffff00 + x then numericValue = 10 ^ x

/** * <p>Numerator power limit. * There are special values for huge numbers that are powers of ten.</p> * <p>c version genprops/store.c documents: * if numericValue = 0x7fffff00 + x then numericValue = 10 ^ x</p> */
private static final int NUMERATOR_POWER_LIMIT_ = 0x7fffff00;
Integer properties mask and shift values for joining type. Equivalent to icu4c UPROPS_JT_MASK.
/** * Integer properties mask and shift values for joining type. * Equivalent to icu4c UPROPS_JT_MASK. */
private static final int JOINING_TYPE_MASK_ = 0x00003800;
Integer properties mask and shift values for joining type. Equivalent to icu4c UPROPS_JT_SHIFT.
/** * Integer properties mask and shift values for joining type. * Equivalent to icu4c UPROPS_JT_SHIFT. */
private static final int JOINING_TYPE_SHIFT_ = 11;
Integer properties mask and shift values for joining group. Equivalent to icu4c UPROPS_JG_MASK.
/** * Integer properties mask and shift values for joining group. * Equivalent to icu4c UPROPS_JG_MASK. */
private static final int JOINING_GROUP_MASK_ = 0x000007e0;
Integer properties mask and shift values for joining group. Equivalent to icu4c UPROPS_JG_SHIFT.
/** * Integer properties mask and shift values for joining group. * Equivalent to icu4c UPROPS_JG_SHIFT. */
private static final int JOINING_GROUP_SHIFT_ = 5;
Integer properties mask for decomposition type. Equivalent to icu4c UPROPS_DT_MASK.
/** * Integer properties mask for decomposition type. * Equivalent to icu4c UPROPS_DT_MASK. */
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
Integer properties mask and shift values for East Asian cell width. Equivalent to icu4c UPROPS_EA_MASK
/** * Integer properties mask and shift values for East Asian cell width. * Equivalent to icu4c UPROPS_EA_MASK */
private static final int EAST_ASIAN_MASK_ = 0x00038000;
Integer properties mask and shift values for East Asian cell width. Equivalent to icu4c UPROPS_EA_SHIFT
/** * Integer properties mask and shift values for East Asian cell width. * Equivalent to icu4c UPROPS_EA_SHIFT */
private static final int EAST_ASIAN_SHIFT_ = 15;
Integer properties mask and shift values for line breaks. Equivalent to icu4c UPROPS_LB_MASK
/** * Integer properties mask and shift values for line breaks. * Equivalent to icu4c UPROPS_LB_MASK */
private static final int LINE_BREAK_MASK_ = 0x007C0000;
Integer properties mask and shift values for line breaks. Equivalent to icu4c UPROPS_LB_SHIFT
/** * Integer properties mask and shift values for line breaks. * Equivalent to icu4c UPROPS_LB_SHIFT */
private static final int LINE_BREAK_SHIFT_ = 18;
Integer properties mask and shift values for blocks. Equivalent to icu4c UPROPS_BLOCK_MASK
/** * Integer properties mask and shift values for blocks. * Equivalent to icu4c UPROPS_BLOCK_MASK */
private static final int BLOCK_MASK_ = 0x00007f80;
Integer properties mask and shift values for blocks. Equivalent to icu4c UPROPS_BLOCK_SHIFT
/** * Integer properties mask and shift values for blocks. * Equivalent to icu4c UPROPS_BLOCK_SHIFT */
private static final int BLOCK_SHIFT_ = 7;
Integer properties mask and shift values for scripts. Equivalent to icu4c UPROPS_SHIFT_MASK
/** * Integer properties mask and shift values for scripts. * Equivalent to icu4c UPROPS_SHIFT_MASK */
private static final int SCRIPT_MASK_ = 0x0000007f; // private constructor ----------------------------------------------- ///CLOVER:OFF
Private constructor to prevent instantiation
/** * Private constructor to prevent instantiation */
private UCharacter() { } ///CLOVER:ON // private methods ---------------------------------------------------
Getting the digit values of characters like 'A' - 'Z', normal, half-width and full-width. This method assumes that the other digit characters are checked by the calling method.
Params:
  • ch – character to test
Returns:-1 if ch is not a character of the form 'A' - 'Z', otherwise its corresponding digit will be returned.
/** * Getting the digit values of characters like 'A' - 'Z', normal, * half-width and full-width. This method assumes that the other digit * characters are checked by the calling method. * @param ch character to test * @return -1 if ch is not a character of the form 'A' - 'Z', otherwise * its corresponding digit will be returned. */
private static int getEuropeanDigit(int ch) { if ((ch > 0x7a && ch < 0xff21) || ch < 0x41 || (ch > 0x5a && ch < 0x61) || ch > 0xff5a || (ch > 0xff31 && ch < 0xff41)) { return -1; } if (ch <= 0x7a) { // ch >= 0x41 or ch < 0x61 return ch + 10 - ((ch <= 0x5a) ? 0x41 : 0x61); } // ch >= 0xff21 if (ch <= 0xff3a) { return ch + 10 - 0xff21; } // ch >= 0xff41 && ch <= 0xff5a return ch + 10 - 0xff41; }
Gets the numeric type of the property argument
Params:
  • props – 32 bit property
Returns:the numeric type
/** * Gets the numeric type of the property argument * @param props 32 bit property * @return the numeric type */
private static int getNumericType(int props) { return (props & NUMERIC_TYPE_MASK_) >> NUMERIC_TYPE_SHIFT_; }
Checks if the property value has a exception indicator
Params:
  • props – 32 bit property value
Returns:true if property does not have a exception indicator, false otherwise
/** * Checks if the property value has a exception indicator * @param props 32 bit property value * @return true if property does not have a exception indicator, false * otherwise */
private static boolean isNotExceptionIndicator(int props) { return (props & UCharacterProperty.EXCEPTION_MASK) == 0; }
Gets the property value at the index. This is optimized. Note this is alittle different from CharTrie the index m_trieData_ is never negative. This is a duplicate of UCharacterProperty.getProperty. For optimization purposes, this method calls the trie data directly instead of through UCharacterProperty.getProperty.
Params:
  • ch – code point whose property value is to be retrieved
Returns:property value of code point
@stableICU 2.6
/** * Gets the property value at the index. * This is optimized. * Note this is alittle different from CharTrie the index m_trieData_ * is never negative. * This is a duplicate of UCharacterProperty.getProperty. For optimization * purposes, this method calls the trie data directly instead of through * UCharacterProperty.getProperty. * @param ch code point whose property value is to be retrieved * @return property value of code point * @stable ICU 2.6 */
private static int getProperty(int ch) { if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE || (ch > UTF16.LEAD_SURROGATE_MAX_VALUE && ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) { // BMP codepoint try { // using try for < 0 ch is faster than using an if statement return PROPERTY_DATA_[ PROPERTY_TRIE_DATA_[ (PROPERTY_TRIE_INDEX_[ch >> 5] << 2) + (ch & 0x1f)]]; } catch (ArrayIndexOutOfBoundsException e) { return PROPERTY_INITIAL_VALUE_; } } if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { // surrogate return PROPERTY_DATA_[ PROPERTY_TRIE_DATA_[ (PROPERTY_TRIE_INDEX_[(0x2800 >> 5) + (ch >> 5)] << 2) + (ch & 0x1f)]]; } // for optimization if (ch <= UTF16.CODEPOINT_MAX_VALUE) { // look at the construction of supplementary characters // trail forms the ends of it. return PROPERTY_DATA_[PROPERTY_.m_trie_.getSurrogateValue( UTF16.getLeadSurrogate(ch), (char)(ch & 0x3ff))]; } // return m_dataOffset_ if there is an error, in this case we return // the default value: m_initialValue_ // we cannot assume that m_initialValue_ is at offset 0 // this is for optimization. return PROPERTY_INITIAL_VALUE_; } }