/* Woodstox Lite ("wool") XML processor
*
* Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.util;
/**
 * This is a simple container class, mostly used to encapsulate details
 * of character typing out of parser/scanner/writer classes, while still
 * ensuring that the int arrays are generated only if needed (esp. for
 * encodings that are never needed, which may be the case for ASCII etc).
 */
public class XmlCharTypes
{
    // // // Constants common to all (non-name) type tables:

    public static final int CT_OK = 0;
    public static final int CT_INVALID = 1; // either invalid xml in general, or in this context
    public static final int CT_WS_CR = 2;
    public static final int CT_WS_LF = 3;
    public static final int CT_MULTIBYTE_N = 4; // (too) long encoding
    public static final int CT_MULTIBYTE_2 = 5; // 2-byte encoding
    public static final int CT_MULTIBYTE_3 = 6; // 3-byte encoding
    public static final int CT_MULTIBYTE_4 = 7; // 4-byte encoding

    // // // Constants for regular char types:

    public static final int CT_WS_TAB = 8;
    public static final int CT_LT = 9; // for start/end tags
    public static final int CT_AMP = 10; // for entities
    public static final int CT_RBRACKET = 11; // for ]]> detection
    public static final int CT_QMARK = 12; // for PI
    public static final int CT_HYPHEN = 13; // for Comments
    public static final int CT_ATTR_QUOTE = 14; // ' and ", for attr values

    // NOTE: value 15 is not assigned to any constant in this class
    public static final int CT_LBRACKET = 16; // for dtd subset sections
    public static final int CT_GT = 17; // for dtd subset sections

    // // // Constants for DTDs:
    // (values 0 - 7 are the common types above; the values below reuse
    // the numeric range of the "regular" types, so they are only
    // meaningful for entries of a DTD table)

    public static final int CT_DTD_QUOTE = 8; // ' and "
    public static final int CT_DTD_LT = 9; // directive start/end
    public static final int CT_DTD_GT = 10;
    public static final int CT_DTD_RBRACKET = 11; // for ending dtd subset
    public static final int CT_DTD_PERCENT = 12; // '%'; presumably start of a parameter entity reference -- TODO confirm

    // // // Constants for names:

    /* These are common constants for name char types, shared between
     * both input and output sides:
     */

    public static final int CT_NAME_NONE = 0; // not a valid name char
    public static final int CT_NAME_COLON = 1; // ':', typed separately from other name chars
    public static final int CT_NAME_NONFIRST = 2; // good name char except as first ('-', '.', digits)
    public static final int CT_NAME_ANY = 3; // good name char, first or any

    // // // Constants for public ids:

    public static final int PUBID_INVALID = 0;
    public static final int PUBID_OK = 1;

    // // // Instance data

    /**
     * Character type table used for regular textual content (for
     * CHARACTERS event)
     */
    public final int[] TEXT_CHARS;

    /**
     * Character type table used for attribute values
     */
    public final int[] ATTR_CHARS;

    /**
     * Character type table used for name characters (note: the type
     * ints used are the <code>CT_NAME_xxx</code> ones, different from
     * the other tables)
     */
    public final int[] NAME_CHARS;

    /**
     * Character type table used for DTD subsets; contains a few
     * additional types beyond most tables
     */
    public final int[] DTD_CHARS;

    /**
     * Character type table used for events other than CHARACTERS or
     * elements; ie. for comments, PIs, CData, DTD internal subset
     */
    public final int[] OTHER_CHARS;

    /**
     * And finally, we also have a shared table for valid public id
     * characters: entries are either {@link #PUBID_OK} or
     * {@link #PUBID_INVALID}, indexed by character code (0 - 255).
     */
    public static final int[] PUBID_CHARS = new int[256];
    static {
        // Ascii letters, both cases
        for (char c = 'a'; c <= 'z'; ++c) {
            PUBID_CHARS[c] = PUBID_OK;
        }
        for (char c = 'A'; c <= 'Z'; ++c) {
            PUBID_CHARS[c] = PUBID_OK;
        }
        for (char c = '0'; c <= '9'; ++c) {
            PUBID_CHARS[c] = PUBID_OK;
        }

        // 3 main white space types are valid (but note: tab is not)
        PUBID_CHARS[0x0A] = PUBID_OK;
        PUBID_CHARS[0x0D] = PUBID_OK;
        PUBID_CHARS[0x20] = PUBID_OK;

        // And many of punctuation/separator ascii chars too:
        for (char c : "-'()+,./:=?;!*#@$_%".toCharArray()) {
            PUBID_CHARS[c] = PUBID_OK;
        }
    }

    /**
     * Creates an instance with all five type tables sized to 256 entries.
     */
    public XmlCharTypes() { this(256); }

    /**
     * Creates an instance with freshly allocated type tables; all entries
     * start out as zero (that is, {@link #CT_OK} / {@link #CT_NAME_NONE}).
     *
     * @param size Number of entries to allocate for each of the tables
     */
    public XmlCharTypes(int size) {
        TEXT_CHARS = new int[size];
        ATTR_CHARS = new int[size];
        NAME_CHARS = new int[size];
        DTD_CHARS = new int[size];
        OTHER_CHARS = new int[size];
    }

    /**
     * Fills in character types of the given tables for the Latin-1
     * (0 - 255) range. Only entries with a non-default type are written;
     * all other entries are left untouched, so tables are expected to be
     * freshly allocated (zero-filled).
     *
     * @param textChars Table for textual content
     * @param attrChars Table for attribute values
     * @param nameChars Table for name characters; written up to index
     *   0xFF, so it needs at least 256 entries
     * @param dtdChars Table for DTD subsets
     * @param otherChars Table for comments, PIs and CDATA sections
     */
    public static void fillInLatin1Chars(int[] textChars,
            int[] attrChars,
            int[] nameChars,
            int[] dtdChars,
            int[] otherChars)
    {
        // text:
        fillIn8BitTextRange(textChars);
        // high-order entries are 'ok' by default, no need to fill

        // attr:
        fillIn8BitAttrRange(attrChars);
        // high-order entries are 'ok' by default, no need to fill

        // name chars:
        fillIn8BitNameRange(nameChars);
        // High-order name tokens: everything from 0xC0 up, except
        // for entries 0xD7 and 0xF7
        for (int i = 0xC0; i <= 0xFF; ++i) {
            if (i != 0xD7 && i != 0xF7) {
                nameChars[i] = CT_NAME_ANY;
            }
        }
        nameChars[0xB7] = CT_NAME_NONFIRST;

        // // DTD chars:
        fillIn8BitDtdRange(dtdChars);
        // ... lotsa matching to do here

        // others:
        // let's start with basic text chars:
        fillIn8BitTextRange(otherChars);

        /* And then just remove amp and lt (not special in any of these
         * events), and add ']', '?' and '-', which mark start of end
         * markers in the events.
         */
        otherChars['&'] = CT_OK;
        otherChars['<'] = CT_OK;
        otherChars[']'] = CT_RBRACKET; // for CDATA
        otherChars['?'] = CT_QMARK; // for PI
        otherChars['-'] = CT_HYPHEN; // for Comment
    }

    /*
    /**********************************************************************
    /* Internal methods
    /**********************************************************************
     */

    /**
     * Marks the entries shared by the text, attribute and DTD tables:
     * entries 0 - 31 become {@link #CT_INVALID}, except for the three
     * white space characters that get assigned afterwards.
     */
    private static void fillInCommonTextRange(int[] arr)
    {
        for (int i = 0; i < 32; ++i) {
            arr[i] = CT_INVALID;
        }
        // And linefeeds are always converted
        arr['\r'] = CT_WS_CR;
        arr['\n'] = CT_WS_LF;
        arr['\t'] = CT_OK; // it's just fine, usually not converted
    }

    /**
     * Called to set types of ascii name characters: letters and
     * underscore are valid anywhere; hyphen, period and digits are valid
     * except as the first character; colon gets its own type.
     * Other entries are not modified.
     *
     * @param arr Name character type table to modify
     */
    public static void fillIn8BitNameRange(int[] arr)
    {
        for (int i = 'a'; i <= 'z'; ++i) {
            arr[i] = CT_NAME_ANY;
        }
        for (int i = 'A'; i <= 'Z'; ++i) {
            arr[i] = CT_NAME_ANY;
        }
        // Non-letter first chars:
        arr['_'] = CT_NAME_ANY;

        // And then non-first ones:
        arr[':'] = CT_NAME_COLON;
        arr['-'] = CT_NAME_NONFIRST;
        arr['.'] = CT_NAME_NONFIRST;
        for (int i = '0'; i <= '9'; ++i) {
            arr[i] = CT_NAME_NONFIRST;
        }
    }

    /**
     * Called to set state of 7-bit chars in text content (despite the
     * method name, only entries in the ascii range get assigned)
     *
     * @param arr Character type table to modify
     */
    protected static void fillIn8BitTextRange(int[] arr)
    {
        fillInCommonTextRange(arr);
        arr['<'] = CT_LT;
        arr['&'] = CT_AMP;
        arr[']'] = CT_RBRACKET;
    }

    /**
     * Called to set state of 7-bit chars in attribute values (despite
     * the method name, only entries in the ascii range get assigned).
     * Differs from text content in that tab and quote characters are
     * typed, whereas ']' is not.
     *
     * @param arr Character type table to modify
     */
    protected static void fillIn8BitAttrRange(int[] arr)
    {
        fillInCommonTextRange(arr);
        // unlike with other tables, tab is typed separately here
        arr['\t'] = CT_WS_TAB;
        arr['<'] = CT_LT;
        arr['&'] = CT_AMP;
        arr['\''] = CT_ATTR_QUOTE;
        arr['"'] = CT_ATTR_QUOTE;
    }

    /**
     * Called to set state of 7-bit chars in DTD subsets; note that the
     * types assigned are the <code>CT_DTD_xxx</code> ones.
     *
     * @param arr Character type table to modify
     */
    protected static void fillIn8BitDtdRange(int[] arr)
    {
        fillInCommonTextRange(arr);
        arr['\''] = CT_DTD_QUOTE;
        arr['"'] = CT_DTD_QUOTE;
        arr['<'] = CT_DTD_LT;
        arr['>'] = CT_DTD_GT;
        // No need to check for lbracket (for now?)
        arr[']'] = CT_DTD_RBRACKET;
        arr['%'] = CT_DTD_PERCENT;
    }
}