/*
* Copyright (c) 2005, 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.util.MissingResourceException;
/**
 * <p>Internal class used for the Unicode character property database.</p>
 * <p>This class stores binary data read from uprops.icu.
 * It does not have the capability to parse the data into more high-level
 * information. It only returns bytes of information when required.</p>
 * <p>Due to the form most commonly used for retrieval, an array of char is
 * used to store the binary data.</p>
 * <p>{@code UCharacterProperty} also contains information on accessing
 * indexes to significant points in the binary data.</p>
 * <p>Responsibility for molding the binary data into a more meaningful form
 * lies with <a href="UCharacter.html">UCharacter</a>.</p>
 * @author Syn Wee Quek
 * @since release 2.1, February 1st 2002
 */
public final class UCharacterProperty
{
// public data members -----------------------------------------------
Trie data
/**
* Trie data
*/
public CharTrie m_trie_;
Optimization
CharTrie index array
/**
* Optimization
* CharTrie index array
*/
public char[] m_trieIndex_;
Optimization
CharTrie data array
/**
* Optimization
* CharTrie data array
*/
public char[] m_trieData_;
Optimization
CharTrie data offset
/**
* Optimization
* CharTrie data offset
*/
public int m_trieInitialValue_;
Unicode version
/**
* Unicode version
*/
public VersionInfo m_unicodeVersion_;
// uprops.h enum UPropertySource --------------------------------------- ***
From uchar.c/uprops.icu properties vectors trie /** From uchar.c/uprops.icu properties vectors trie */
public static final int SRC_PROPSVEC=2;
One more than the highest UPropertySource (SRC_) constant. /** One more than the highest UPropertySource (SRC_) constant. */
public static final int SRC_COUNT=9;
// public methods ----------------------------------------------------
Java friends implementation
/**
* Java friends implementation
*/
public void setIndexData(CharTrie.FriendAgent friendagent)
{
m_trieIndex_ = friendagent.getPrivateIndex();
m_trieData_ = friendagent.getPrivateData();
m_trieInitialValue_ = friendagent.getPrivateInitialValue();
}
Gets the property value at the index.
This is optimized.
Note this is alittle different from CharTrie the index m_trieData_
is never negative.
Params: - ch – code point whose property value is to be retrieved
Returns: property value of code point
/**
* Gets the property value at the index.
* This is optimized.
* Note this is alittle different from CharTrie the index m_trieData_
* is never negative.
* @param ch code point whose property value is to be retrieved
* @return property value of code point
*/
public final int getProperty(int ch)
{
if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE
|| (ch > UTF16.LEAD_SURROGATE_MAX_VALUE
&& ch < UTF16.SUPPLEMENTARY_MIN_VALUE)) {
// BMP codepoint 0000..D7FF or DC00..FFFF
// optimized
try { // using try for ch < 0 is faster than using an if statement
return m_trieData_[
(m_trieIndex_[ch >> Trie.INDEX_STAGE_1_SHIFT_]
<< Trie.INDEX_STAGE_2_SHIFT_)
+ (ch & Trie.INDEX_STAGE_3_MASK_)];
} catch (ArrayIndexOutOfBoundsException e) {
return m_trieInitialValue_;
}
}
if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
// lead surrogate D800..DBFF
return m_trieData_[
(m_trieIndex_[Trie.LEAD_INDEX_OFFSET_
+ (ch >> Trie.INDEX_STAGE_1_SHIFT_)]
<< Trie.INDEX_STAGE_2_SHIFT_)
+ (ch & Trie.INDEX_STAGE_3_MASK_)];
}
if (ch <= UTF16.CODEPOINT_MAX_VALUE) {
// supplementary code point 10000..10FFFF
// look at the construction of supplementary characters
// trail forms the ends of it.
return m_trie_.getSurrogateValue(
UTF16.getLeadSurrogate(ch),
(char)(ch & Trie.SURROGATE_MASK_));
}
// ch is out of bounds
// return m_dataOffset_ if there is an error, in this case we return
// the default value: m_initialValue_
// we cannot assume that m_initialValue_ is at offset 0
// this is for optimization.
return m_trieInitialValue_;
// this all is an inlined form of return m_trie_.getCodePointValue(ch);
}
Getting the unsigned numeric value of a character embedded in the property
argument
Params: - prop – the character
Returns: unsigned numberic value
/**
* Getting the unsigned numeric value of a character embedded in the property
* argument
* @param prop the character
* @return unsigned numberic value
*/
public static int getUnsignedValue(int prop)
{
return (prop >> VALUE_SHIFT_) & UNSIGNED_VALUE_MASK_AFTER_SHIFT_;
}
Gets the unicode additional properties.
C version getUnicodeProperties.
Params: - codepoint – codepoint whose additional properties is to be
retrieved
- column –
Returns: unicode properties
/**
* Gets the unicode additional properties.
* C version getUnicodeProperties.
* @param codepoint codepoint whose additional properties is to be
* retrieved
* @param column
* @return unicode properties
*/
public int getAdditional(int codepoint, int column) {
if (column == -1) {
return getProperty(codepoint);
}
if (column < 0 || column >= m_additionalColumnsCount_) {
return 0;
}
return m_additionalVectors_[
m_additionalTrie_.getCodePointValue(codepoint) + column];
}
Get the "age" of the code point.
The "age" is the Unicode version when the code point was first
designated (as a non-character or for Private Use) or assigned a
character.
This can be useful to avoid emitting code points to receiving
processes that do not accept newer characters.
The data is from the UCD file DerivedAge.txt.
This API does not check the validity of the codepoint.
Params: - codepoint – The code point.
Returns: the Unicode version number
/**
* <p>Get the "age" of the code point.</p>
* <p>The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
* character.</p>
* <p>This can be useful to avoid emitting code points to receiving
* processes that do not accept newer characters.</p>
* <p>The data is from the UCD file DerivedAge.txt.</p>
* <p>This API does not check the validity of the codepoint.</p>
* @param codepoint The code point.
* @return the Unicode version number
*/
public VersionInfo getAge(int codepoint)
{
int version = getAdditional(codepoint, 0) >> AGE_SHIFT_;
return VersionInfo.getInstance(
(version >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_,
version & LAST_NIBBLE_MASK_, 0, 0);
}
Forms a supplementary code point from the argument character
Note this is for internal use hence no checks for the validity of the
surrogate characters are done
Params: - lead – lead surrogate character
- trail – trailing surrogate character
Returns: code point of the supplementary character
/**
* Forms a supplementary code point from the argument character<br>
* Note this is for internal use hence no checks for the validity of the
* surrogate characters are done
* @param lead lead surrogate character
* @param trail trailing surrogate character
* @return code point of the supplementary character
*/
public static int getRawSupplementary(char lead, char trail)
{
return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_;
}
Loads the property data and initialize the UCharacterProperty instance.
Throws: - MissingResourceException – when data is missing or data has been corrupted
/**
* Loads the property data and initialize the UCharacterProperty instance.
* @throws MissingResourceException when data is missing or data has been corrupted
*/
public static UCharacterProperty getInstance()
{
if(INSTANCE_ == null) {
try {
INSTANCE_ = new UCharacterProperty();
}
catch (Exception e) {
throw new MissingResourceException(e.getMessage(),"","");
}
}
return INSTANCE_;
}
Checks if the argument c is to be treated as a white space in ICU
rules. Usually ICU rule white spaces are ignored unless quoted.
Equivalent to test for Pattern_White_Space Unicode property.
Stable set of characters, won't change.
See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
Params: - c – codepoint to check
Returns: true if c is a ICU white space
/**
* Checks if the argument c is to be treated as a white space in ICU
* rules. Usually ICU rule white spaces are ignored unless quoted.
* Equivalent to test for Pattern_White_Space Unicode property.
* Stable set of characters, won't change.
* See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
* @param c codepoint to check
* @return true if c is a ICU white space
*/
public static boolean isRuleWhiteSpace(int c)
{
/* "white space" in the sense of ICU rule parsers
This is a FIXED LIST that is NOT DEPENDENT ON UNICODE PROPERTIES.
See UAX #31 Identifier and Pattern Syntax: http://www.unicode.org/reports/tr31/
U+0009..U+000D, U+0020, U+0085, U+200E..U+200F, and U+2028..U+2029
Equivalent to test for Pattern_White_Space Unicode property.
*/
return (c >= 0x0009 && c <= 0x2029 &&
(c <= 0x000D || c == 0x0020 || c == 0x0085 ||
c == 0x200E || c == 0x200F || c >= 0x2028));
}
// protected variables -----------------------------------------------
Extra property trie
/**
* Extra property trie
*/
CharTrie m_additionalTrie_;
Extra property vectors, 1st column for age and second for binary
properties.
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
*/
int m_additionalVectors_[];
Number of additional columns
/**
* Number of additional columns
*/
int m_additionalColumnsCount_;
Maximum values for block, bits used as in vector word
0
/**
* Maximum values for block, bits used as in vector word
* 0
*/
int m_maxBlockScriptValue_;
Maximum values for script, bits used as in vector word
0
/**
* Maximum values for script, bits used as in vector word
* 0
*/
int m_maxJTGValue_;
// private variables -------------------------------------------------
UnicodeData.txt property object
/**
* UnicodeData.txt property object
*/
private static UCharacterProperty INSTANCE_ = null;
Default name of the datafile
/**
* Default name of the datafile
*/
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
Default buffer size of datafile
/**
* Default buffer size of datafile
*/
private static final int DATA_BUFFER_SIZE_ = 25000;
Numeric value shift
/**
* Numeric value shift
*/
private static final int VALUE_SHIFT_ = 8;
Mask to be applied after shifting to obtain an unsigned numeric value
/**
* Mask to be applied after shifting to obtain an unsigned numeric value
*/
private static final int UNSIGNED_VALUE_MASK_AFTER_SHIFT_ = 0xFF;
Shift value for lead surrogate to form a supplementary character.
/**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
Offset to add to combined surrogate pair to avoid msking.
/**
* Offset to add to combined surrogate pair to avoid msking.
*/
private static final int SURROGATE_OFFSET_ =
UTF16.SUPPLEMENTARY_MIN_VALUE -
(UTF16.SURROGATE_MIN_VALUE <<
LEAD_SURROGATE_SHIFT_) -
UTF16.TRAIL_SURROGATE_MIN_VALUE;
// additional properties ----------------------------------------------
First nibble shift
/**
* First nibble shift
*/
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
Second nibble mask
/**
* Second nibble mask
*/
private static final int LAST_NIBBLE_MASK_ = 0xF;
Age value shift
/**
* Age value shift
*/
private static final int AGE_SHIFT_ = 24;
// private constructors --------------------------------------------------
/**
 * Constructor. Opens the data file, lets a UCharacterPropertyReader fill
 * in this object, then hands this object to the trie so the cached index
 * data can be set up.
 * @exception IOException thrown when data reading fails or data corrupted
 */
private UCharacterProperty() throws IOException
{
    // jar access
    InputStream is = ICUData.getRequiredStream(DATA_FILE_NAME_);
    BufferedInputStream b = new BufferedInputStream(is, DATA_BUFFER_SIZE_);
    try {
        UCharacterPropertyReader reader = new UCharacterPropertyReader(b);
        reader.read(this);
    } finally {
        // close the stream even when reading fails, so it is not leaked
        b.close();
    }
    m_trie_.putIndexData(this);
}
/**
 * Adds to the given set the start code point of each same-value range of
 * the properties vectors trie.
 * @param set set that receives the range start code points
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    // with no additional columns the properties vectors trie may not be
    // there at all, so there is nothing to iterate
    if (m_additionalColumnsCount_ <= 0) {
        return;
    }
    TrieIterator rangeIter = new TrieIterator(m_additionalTrie_);
    RangeValueIterator.Element range = new RangeValueIterator.Element();
    boolean hasRange = rangeIter.next(range);
    while (hasRange) {
        set.add(range.start);
        hasRange = rangeIter.next(range);
    }
}
}