/* Woodstox Lite ("wool") XML processor
*
* Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.out;
import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;
public final class OutputCharTypes
extends XmlCharTypes
{
Although many encodings (Latin1, Ascii) could use shorter tables,
for UTF-8 2k makes sense, as it will then cover both one and
type byte sequences. And this being the case, let's use the same
size for all encodings.
/**
* Although many encodings (Latin1, Ascii) could use shorter tables,
* for UTF-8 2k makes sense, as it will then cover both one and
* type byte sequences. And this being the case, let's use the same
* size for all encodings.
*/
final static int MAIN_TABLE_SIZE = 2048;
// Note: this is only used on writer-side, thus overlaps with previous one:
public final static int CT_OUTPUT_MUST_QUOTE = CT_MULTIBYTE_N;
/* Also, dealing with names is bit more complicated, as there's
* both validity, and possible encoding, to tackle...
* So let's try if we can figure out combinations: only 1 and 2-byte
* encodings are covered by the table, fortunately
*<p>
* Note: values must not overlap with base class' constants
*/
Unencodable means that while the name char may be acceptable
per se, it can not be encode using current encoding
/**
* Unencodable means that while the name char may be acceptable
* per se, it can not be encode using current encoding
*/
public final static int CT_OUTPUT_NAME_UNENCODABLE = 4;
public final static int CT_OUTPUT_NAME_NONFIRST_MB2 = 5;
public final static int CT_OUTPUT_NAME_ANY_MB2 = 6;
// Singleton instances:
private static XmlCharTypes sAsciiCharTypes = null;
private static XmlCharTypes sLatin1CharTypes = null;
/* Note: unlike others, let's create eagerly, not lazily,
* as this is expected to be the common case:
*/
private final static XmlCharTypes sUtf8CharTypes = new XmlCharTypes(MAIN_TABLE_SIZE);
static {
/* On output side, utf-8 handling is bit different though;
* 8-bit range is affected but in different way. So let's actually
* start from vanilla Latin1 settings:
*/
fillInLatin1Chars(sUtf8CharTypes.TEXT_CHARS,
sUtf8CharTypes.ATTR_CHARS,
sUtf8CharTypes.NAME_CHARS,
sUtf8CharTypes.DTD_CHARS,
sUtf8CharTypes.OTHER_CHARS);
/* And then just note that all 8-bit textual things need
* two-byte encoding (not applicable for name tables, though,
* uses separate vales)
*/
modifyForUtf8(sUtf8CharTypes.TEXT_CHARS);
modifyForUtf8(sUtf8CharTypes.ATTR_CHARS);
modifyForUtf8(sUtf8CharTypes.DTD_CHARS);
modifyForUtf8(sUtf8CharTypes.OTHER_CHARS);
/* But beyond that, not all name characters between 256 and 2047
* are legal...
*/
modifyUtf8Names(sUtf8CharTypes.NAME_CHARS);
/* One final point: within attribute values, we need tad more
* quoting for some things.
*/
modifyForAttrWrite(sUtf8CharTypes.ATTR_CHARS);
}
public final static XmlCharTypes getUtf8CharTypes()
{
return sUtf8CharTypes;
}
public final static XmlCharTypes getLatin1CharTypes()
{
if (sLatin1CharTypes == null) {
sLatin1CharTypes = new XmlCharTypes(MAIN_TABLE_SIZE);
fillInLatin1Chars(sLatin1CharTypes.TEXT_CHARS,
sLatin1CharTypes.ATTR_CHARS,
sLatin1CharTypes.NAME_CHARS,
sLatin1CharTypes.DTD_CHARS,
sLatin1CharTypes.OTHER_CHARS);
modifyForLatin1(sLatin1CharTypes.TEXT_CHARS);
modifyForLatin1(sLatin1CharTypes.ATTR_CHARS);
modifyForLatin1(sLatin1CharTypes.DTD_CHARS);
modifyForLatin1(sLatin1CharTypes.OTHER_CHARS);
// not applicable for names
// Also, extra quoting for some chars in attr values
modifyForAttrWrite(sLatin1CharTypes.ATTR_CHARS);
}
return sLatin1CharTypes;
}
public final static XmlCharTypes getAsciiCharTypes()
{
if (sAsciiCharTypes == null) {
sAsciiCharTypes = new XmlCharTypes(MAIN_TABLE_SIZE);
// We'll start with 8-bit char set
fillInLatin1Chars(sAsciiCharTypes.TEXT_CHARS,
sAsciiCharTypes.ATTR_CHARS,
sAsciiCharTypes.NAME_CHARS,
sAsciiCharTypes.DTD_CHARS,
sAsciiCharTypes.OTHER_CHARS);
// And then just require quoting for non-7-bit chars
modifyForAscii(sAsciiCharTypes.TEXT_CHARS);
modifyForAscii(sAsciiCharTypes.ATTR_CHARS);
modifyForAscii(sAsciiCharTypes.DTD_CHARS);
modifyForAscii(sAsciiCharTypes.OTHER_CHARS);
modifyAsciiNames(sAsciiCharTypes.NAME_CHARS);
// Also, extra quoting for some chars in attr values
modifyForAttrWrite(sAsciiCharTypes.ATTR_CHARS);
}
return sAsciiCharTypes;
}
private static void modifyForLatin1(int[] charTable)
{
/* And also mark 0x7F - 0x9F (although for xml 1.1, could
* consider not quoting 0x85?)
*/
for (int i = 0x7F; i <= 0x9F; ++i) {
charTable[i] = CT_OUTPUT_MUST_QUOTE;
}
requireQuotingAfter(charTable, 0xFF);
}
private static void modifyLatin1Names(int[] charTable)
{
for (int i = 0x100, len = charTable.length; i < len; ++i) {
// Just need to indicate none should be 'ok'...
int val = charTable[i];
if (val == CT_NAME_NONFIRST || val == CT_NAME_ANY) {
charTable[i] = CT_OUTPUT_NAME_UNENCODABLE;
}
}
}
private static void modifyForUtf8(int[] charTable)
{
for (int i = 0x80, len = charTable.length; i < len; ++i) {
// Let's not modify entries that indicate 'must quote' or invalid:
if (charTable[i] == CT_OK) {
charTable[i] = CT_MULTIBYTE_2;
}
}
}
private static void modifyUtf8Names(int[] charTable)
{
/* !!! For now, we'll just use xml 1.0 rules, for 1.1 need
* to use separate set of tables.
*/
for (int i = 0x80, len = charTable.length; i < len; ++i) {
if (XmlChars.is10NameStartChar(i)) {
charTable[i] = CT_OUTPUT_NAME_ANY_MB2;
} else if (XmlChars.is10NameChar(i)) {
charTable[i] = CT_OUTPUT_NAME_NONFIRST_MB2;
} else {
charTable[i] = CT_NAME_NONE;
}
}
}
private static void modifyForAscii(int[] charTable)
{
requireQuotingAfter(charTable, 0x7F);
}
private static void modifyAsciiNames(int[] charTable)
{
modifyLatin1Names(charTable);
for (int i = 0x80, len = charTable.length; i < len; ++i) {
// Just need to indicate none should be 'ok'...
int val = charTable[i];
if (val == CT_NAME_NONFIRST || val == CT_NAME_ANY) {
charTable[i] = CT_OUTPUT_NAME_UNENCODABLE;
}
}
}
private static void modifyForAttrWrite(int[] charTable)
{
charTable['\t'] = CT_OUTPUT_MUST_QUOTE;
}
private static void requireQuotingAfter(int[] charTable, int lastValid)
{
// For the most part, like Latin1
for (int i = lastValid+1, len = charTable.length; i < len; ++i) {
// Just need to indicate none should be 'ok'...
if (charTable[i] == CT_OK) {
charTable[i] = CT_OUTPUT_MUST_QUOTE;
}
}
}
}