/*
* reserved comment block
* DO NOT REMOVE OR ALTER!
*/
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.sun.org.apache.xml.internal.serializer;
import java.io.UnsupportedEncodingException;
Holds information about a given encoding, which is the Java name for the
encoding, the equivalent ISO name.
An object of this type has two useful methods
isInEncoding(char ch);
which can be called if the character is not the high one in
a surrogate pair and:
isInEncoding(char high, char low);
which can be called if the two characters form a high/low surrogate pair.
An EncodingInfo object is a node in a binary search tree. Such a node
will answer if a character is in the encoding, and do so for a given
range of unicode values (m_first to m_last). It will handle a certain
range of values explicitly (m_explFirst to m_explLast).
If the unicode point is before that explicit range, that is, it
is in the range m_first <= value < m_explFirst, then it will delegate
to another EncodingInfo object, m_before, which is the root of such a
tree. Likewise for values in the range m_explLast < value <= m_last,
but delegating to m_after.
Actually figuring out if a code point is in the encoding is expensive. So the
purpose of this tree is to cache such determinations, and not to build the
entire tree of information at the start, but only build up as much of the
tree as is used during the transformation.
This Class is not a public API, and should only be used internally within
the serializer.
@xsl.usage internal
/**
* Holds information about a given encoding, which is the Java name for the
* encoding, the equivalent ISO name.
* <p>
* An object of this type has two useful methods
* <pre>
* isInEncoding(char ch);
* </pre>
* which can be called if the character is not the high one in
* a surrogate pair and:
* <pre>
* isInEncoding(char high, char low);
* </pre>
* which can be called if the two characters form a high/low surrogate pair.
* <p>
* An EncodingInfo object is a node in a binary search tree. Such a node
* will answer if a character is in the encoding, and do so for a given
* range of unicode values (<code>m_first</code> to
* <code>m_last</code>). It will handle a certain range of values
* explicitly (<code>m_explFirst</code> to <code>m_explLast</code>).
* If the unicode point is before that explicit range, that is, it
* is in the range <code>m_first &lt;= value &lt; m_explFirst</code>, then it will
* delegate to another EncodingInfo object, <code>m_before</code>, which is the
* root of such a tree. Likewise for values in the range
* <code>m_explLast &lt; value &lt;= m_last</code>, but delegating to <code>m_after</code>.
* <p>
* Actually figuring out if a code point is in the encoding is expensive. So the
* purpose of this tree is to cache such determinations, and not to build the
* entire tree of information at the start, but only build up as much of the
* tree as is used during the transformation.
* <p>
* This Class is not a public API, and should only be used internally within
* the serializer.
*
* @xsl.usage internal
*/
public final class EncodingInfo extends Object
{
The ISO encoding name.
/**
* The ISO encoding name.
*/
final String name;
The name used by the Java convertor.
/**
* The name used by the Java convertor.
*/
final String javaName;
A helper object that we can ask if a
single char, or a surrogate UTF-16 pair
of chars that form a single character,
is in this encoding.
/**
* A helper object that we can ask if a
* single char, or a surrogate UTF-16 pair
* of chars that form a single character,
* is in this encoding.
*/
private InEncoding m_encoding;
This is not a public API. It returns true if the
char in question is in the encoding.
Params: - ch – the char in question.
@xsl.usage internal
/**
* This is not a public API. It returns true if the
* char in question is in the encoding.
* @param ch the char in question.
* @xsl.usage internal
*/
public boolean isInEncoding(char ch) {
if (m_encoding == null) {
m_encoding = new EncodingImpl();
// One could put alternate logic in here to
// instantiate another object that implements the
// InEncoding interface. For example if the JRE is 1.4 or up
// we could have an object that uses JRE 1.4 methods
}
return m_encoding.isInEncoding(ch);
}
This is not a public API. It returns true if the
character formed by the high/low pair is in the encoding.
Params: - high – a char that the a high char of a high/low surrogate pair.
- low – a char that is the low char of a high/low surrogate pair.
@xsl.usage internal
/**
* This is not a public API. It returns true if the
* character formed by the high/low pair is in the encoding.
* @param high a char that the a high char of a high/low surrogate pair.
* @param low a char that is the low char of a high/low surrogate pair.
* @xsl.usage internal
*/
public boolean isInEncoding(char high, char low) {
if (m_encoding == null) {
m_encoding = new EncodingImpl();
// One could put alternate logic in here to
// instantiate another object that implements the
// InEncoding interface. For example if the JRE is 1.4 or up
// we could have an object that uses JRE 1.4 methods
}
return m_encoding.isInEncoding(high, low);
}
Create an EncodingInfo object based on the ISO name and Java name.
If both parameters are null any character will be considered to
be in the encoding. This is useful for when the serializer is in
temporary output state, and has no assciated encoding.
Params: - name – reference to the ISO name.
- javaName – reference to the Java encoding name.
/**
* Create an EncodingInfo object based on the ISO name and Java name.
* If both parameters are null any character will be considered to
* be in the encoding. This is useful for when the serializer is in
* temporary output state, and has no assciated encoding.
*
* @param name reference to the ISO name.
* @param javaName reference to the Java encoding name.
*/
public EncodingInfo(String name, String javaName)
{
this.name = name;
this.javaName = javaName;
}
A simple interface to isolate the implementation.
We could also use some new JRE 1.4 methods in another implementation
provided we use reflection with them.
This interface is not a public API,
and should only be used internally within the serializer.
@xsl.usage internal
/**
* A simple interface to isolate the implementation.
* We could also use some new JRE 1.4 methods in another implementation
* provided we use reflection with them.
* <p>
* This interface is not a public API,
* and should only be used internally within the serializer.
* @xsl.usage internal
*/
private interface InEncoding {
Returns true if the char is in the encoding
/**
* Returns true if the char is in the encoding
*/
public boolean isInEncoding(char ch);
Returns true if the high/low surrogate pair forms
a character that is in the encoding.
/**
* Returns true if the high/low surrogate pair forms
* a character that is in the encoding.
*/
public boolean isInEncoding(char high, char low);
}
This class implements the
/**
* This class implements the
*/
private class EncodingImpl implements InEncoding {
public boolean isInEncoding(char ch1) {
final boolean ret;
int codePoint = Encodings.toCodePoint(ch1);
if (codePoint < m_explFirst) {
// The unicode value is before the range
// that we explictly manage, so we delegate the answer.
// If we don't have an m_before object to delegate to, make one.
if (m_before == null)
m_before =
new EncodingImpl(
m_encoding,
m_first,
m_explFirst - 1,
codePoint);
ret = m_before.isInEncoding(ch1);
} else if (m_explLast < codePoint) {
// The unicode value is after the range
// that we explictly manage, so we delegate the answer.
// If we don't have an m_after object to delegate to, make one.
if (m_after == null)
m_after =
new EncodingImpl(
m_encoding,
m_explLast + 1,
m_last,
codePoint);
ret = m_after.isInEncoding(ch1);
} else {
// The unicode value is in the range we explitly handle
final int idx = codePoint - m_explFirst;
// If we already know the answer, just return it.
if (m_alreadyKnown[idx])
ret = m_isInEncoding[idx];
else {
// We don't know the answer, so find out,
// which may be expensive, then cache the answer
ret = inEncoding(ch1, m_encoding);
m_alreadyKnown[idx] = true;
m_isInEncoding[idx] = ret;
}
}
return ret;
}
public boolean isInEncoding(char high, char low) {
final boolean ret;
int codePoint = Encodings.toCodePoint(high,low);
if (codePoint < m_explFirst) {
// The unicode value is before the range
// that we explictly manage, so we delegate the answer.
// If we don't have an m_before object to delegate to, make one.
if (m_before == null)
m_before =
new EncodingImpl(
m_encoding,
m_first,
m_explFirst - 1,
codePoint);
ret = m_before.isInEncoding(high,low);
} else if (m_explLast < codePoint) {
// The unicode value is after the range
// that we explictly manage, so we delegate the answer.
// If we don't have an m_after object to delegate to, make one.
if (m_after == null)
m_after =
new EncodingImpl(
m_encoding,
m_explLast + 1,
m_last,
codePoint);
ret = m_after.isInEncoding(high,low);
} else {
// The unicode value is in the range we explitly handle
final int idx = codePoint - m_explFirst;
// If we already know the answer, just return it.
if (m_alreadyKnown[idx])
ret = m_isInEncoding[idx];
else {
// We don't know the answer, so find out,
// which may be expensive, then cache the answer
ret = inEncoding(high, low, m_encoding);
m_alreadyKnown[idx] = true;
m_isInEncoding[idx] = ret;
}
}
return ret;
}
The encoding.
/**
* The encoding.
*/
final private String m_encoding;
m_first through m_last is the range of unicode
values that this object will return an answer on.
It may delegate to a similar object with a different
range
/**
* m_first through m_last is the range of unicode
* values that this object will return an answer on.
* It may delegate to a similar object with a different
* range
*/
final private int m_first;
m_explFirst through m_explLast is the range of unicode
value that this object handles explicitly and does not
delegate to a similar object.
/**
* m_explFirst through m_explLast is the range of unicode
* value that this object handles explicitly and does not
* delegate to a similar object.
*/
final private int m_explFirst;
final private int m_explLast;
final private int m_last;
The object, of the same type as this one,
that handles unicode values in a range before
the range explictly handled by this object, and
to which this object may delegate.
/**
* The object, of the same type as this one,
* that handles unicode values in a range before
* the range explictly handled by this object, and
* to which this object may delegate.
*/
private InEncoding m_before;
The object, of the same type as this one,
that handles unicode values in a range after
the range explictly handled by this object, and
to which this object may delegate.
/**
* The object, of the same type as this one,
* that handles unicode values in a range after
* the range explictly handled by this object, and
* to which this object may delegate.
*/
private InEncoding m_after;
The number of unicode values explicitly handled
by a single EncodingInfo object. This value is
tuneable, but is set to 128 because that covers the
entire low range of ASCII type chars within a single
object.
/**
* The number of unicode values explicitly handled
* by a single EncodingInfo object. This value is
* tuneable, but is set to 128 because that covers the
* entire low range of ASCII type chars within a single
* object.
*/
private static final int RANGE = 128;
A flag to record if we already know the answer
for the given unicode value.
/**
* A flag to record if we already know the answer
* for the given unicode value.
*/
final private boolean m_alreadyKnown[] = new boolean[RANGE];
A table holding the answer on whether the given unicode
value is in the encoding.
/**
* A table holding the answer on whether the given unicode
* value is in the encoding.
*/
final private boolean m_isInEncoding[] = new boolean[RANGE];
private EncodingImpl() {
// This object will answer whether any unicode value
// is in the encoding, it handles values 0 through Integer.MAX_VALUE
this(javaName, 0, Integer.MAX_VALUE, (char) 0);
}
private EncodingImpl(String encoding, int first, int last, int codePoint) {
// Set the range of unicode values that this object manages
// either explicitly or implicitly.
m_first = first;
m_last = last;
// Set the range of unicode values that this object
// explicitly manages. Align the explicitly managed values
// to RANGE so multiple EncodingImpl objects dont manage the same
// values.
m_explFirst = codePoint / RANGE * RANGE;
m_explLast = m_explFirst + (RANGE-1);
m_encoding = encoding;
if (javaName != null)
{
// Some optimization.
if (0 <= m_explFirst && m_explFirst <= 127) {
// This particular EncodingImpl explicitly handles
// characters in the low range.
if ("UTF8".equals(javaName)
|| "UTF-16".equals(javaName)
|| "ASCII".equals(javaName)
|| "US-ASCII".equals(javaName)
|| "Unicode".equals(javaName)
|| "UNICODE".equals(javaName)
|| javaName.startsWith("ISO8859")) {
// Not only does this EncodingImpl object explicitly
// handle chracters in the low range, it is
// also one that we know something about, without
// needing to call inEncoding(char ch, String encoding)
// for this low range
//
// By initializing the table ahead of time
// for these low values, we prevent the expensive
// inEncoding(char ch, String encoding)
// from being called, at least for these common
// encodings.
for (int unicode = 1; unicode < 127; unicode++) {
final int idx = unicode - m_explFirst;
if (0 <= idx && idx < RANGE) {
m_alreadyKnown[idx] = true;
m_isInEncoding[idx] = true;
}
}
}
}
/* A little bit more than optimization.
*
* We will say that any character is in the encoding if
* we don't have an encoding.
* This is meaningful when the serializer is being used
* in temporary output state, where we are not writing to
* the final output tree. It is when writing to the
* final output tree that we need to worry about the output
* encoding
*/
if (javaName == null) {
for (int idx = 0; idx < m_alreadyKnown.length; idx++) {
m_alreadyKnown[idx] = true;
m_isInEncoding[idx] = true;
}
}
}
}
}
This is heart of the code that determines if a given character
is in the given encoding. This method is probably expensive,
and the answer should be cached.
This method is not a public API,
and should only be used internally within the serializer.
Params: - ch – the char in question, that is not a high char of
a high/low surrogate pair.
- encoding – the Java name of the enocding.
@xsl.usage internal
/**
* This is heart of the code that determines if a given character
* is in the given encoding. This method is probably expensive,
* and the answer should be cached.
* <p>
* This method is not a public API,
* and should only be used internally within the serializer.
* @param ch the char in question, that is not a high char of
* a high/low surrogate pair.
* @param encoding the Java name of the enocding.
*
* @xsl.usage internal
*
*/
private static boolean inEncoding(char ch, String encoding) {
boolean isInEncoding;
try {
char cArray[] = new char[1];
cArray[0] = ch;
// Construct a String from the char
String s = new String(cArray);
// Encode the String into a sequence of bytes
// using the given, named charset.
byte[] bArray = s.getBytes(encoding);
isInEncoding = inEncoding(ch, bArray);
} catch (Exception e) {
isInEncoding = false;
// If for some reason the encoding is null, e.g.
// for a temporary result tree, we should just
// say that every character is in the encoding.
if (encoding == null)
isInEncoding = true;
}
return isInEncoding;
}
This is heart of the code that determines if a given high/low
surrogate pair forms a character that is in the given encoding.
This method is probably expensive, and the answer should be cached.
This method is not a public API,
and should only be used internally within the serializer.
Params: - high – the high char of
a high/low surrogate pair.
- low – the low char of a high/low surrogate pair.
- encoding – the Java name of the encoding.
@xsl.usage internal
/**
* This is heart of the code that determines if a given high/low
* surrogate pair forms a character that is in the given encoding.
* This method is probably expensive, and the answer should be cached.
* <p>
* This method is not a public API,
* and should only be used internally within the serializer.
* @param high the high char of
* a high/low surrogate pair.
* @param low the low char of a high/low surrogate pair.
* @param encoding the Java name of the encoding.
*
* @xsl.usage internal
*
*/
private static boolean inEncoding(char high, char low, String encoding) {
boolean isInEncoding;
try {
char cArray[] = new char[2];
cArray[0] = high;
cArray[1] = low;
// Construct a String from the char
String s = new String(cArray);
// Encode the String into a sequence of bytes
// using the given, named charset.
byte[] bArray = s.getBytes(encoding);
isInEncoding = inEncoding(high,bArray);
} catch (Exception e) {
isInEncoding = false;
}
return isInEncoding;
}
This method is the core of determining if character
is in the encoding. The method is not foolproof, because
s.getBytes(encoding) has specified behavior only if the
characters are in the specified encoding. However this
method tries it's best.
Params: - ch – the char that was converted using getBytes, or
the first char of a high/low pair that was converted.
- data – the bytes written out by the call to s.getBytes(encoding);
Returns: true if the character is in the encoding.
/**
* This method is the core of determining if character
* is in the encoding. The method is not foolproof, because
* s.getBytes(encoding) has specified behavior only if the
* characters are in the specified encoding. However this
* method tries it's best.
* @param ch the char that was converted using getBytes, or
* the first char of a high/low pair that was converted.
* @param data the bytes written out by the call to s.getBytes(encoding);
* @return true if the character is in the encoding.
*/
private static boolean inEncoding(char ch, byte[] data) {
final boolean isInEncoding;
// If the string written out as data is not in the encoding,
// the output is not specified according to the documentation
// on the String.getBytes(encoding) method,
// but we do our best here.
if (data==null || data.length == 0) {
isInEncoding = false;
}
else {
if (data[0] == 0)
isInEncoding = false;
else if (data[0] == '?' && ch != '?')
isInEncoding = false;
/*
* else if (isJapanese) {
* // isJapanese is really
* // ( "EUC-JP".equals(javaName)
* // || "EUC_JP".equals(javaName)
* // || "SJIS".equals(javaName) )
*
* // Work around some bugs in JRE for Japanese
* if(data[0] == 0x21)
* isInEncoding = false;
* else if (ch == 0xA5)
* isInEncoding = false;
* else
* isInEncoding = true;
* }
*/
else {
// We don't know for sure, but it looks like it is in the encoding
isInEncoding = true;
}
}
return isInEncoding;
}
}