/* Woodstox Lite ("wool") XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import com.fasterxml.aalto.util.XmlCharTypes;

public final class InputCharTypes
    extends XmlCharTypes
{
    /* Most of the type values are shared, but name handling differs
     * enough, to warrant partially separate value spaces
     */

    

Important: must not overlap with the base constants. Last constant (CT_NAME_ANY) currently has value 3.

/** *<p> * Important: must not overlap with the base constants. * Last constant (CT_NAME_ANY) currently has value 3. */
public final static int CT_INPUT_NAME_MB_N = 4; public final static int CT_INPUT_NAME_MB_2 = 5; public final static int CT_INPUT_NAME_MB_3 = 6; public final static int CT_INPUT_NAME_MB_4 = 7; // Singleton instances: /* Let's create non-UTF types lazily, as there's a good chance * they might not be used, thereby possibly reducing memory footprint * and startup time */ private static XmlCharTypes sAsciiCharTypes = null; private static XmlCharTypes sLatin1CharTypes = null; /* Note: unlike others, let's create eagerly, not lazily, * as this is expected to be the common case: */ private final static XmlCharTypes sUtf8CharTypes = new XmlCharTypes(); static { fillInUtf8Chars(sUtf8CharTypes.TEXT_CHARS, sUtf8CharTypes.ATTR_CHARS, sUtf8CharTypes.NAME_CHARS, sUtf8CharTypes.DTD_CHARS, sUtf8CharTypes.OTHER_CHARS); } public final static XmlCharTypes getUtf8CharTypes() { return sUtf8CharTypes; } public final static synchronized XmlCharTypes getAsciiCharTypes() { if (sAsciiCharTypes == null) { sAsciiCharTypes = new XmlCharTypes(); fillInLatin1Chars(sAsciiCharTypes.TEXT_CHARS, sAsciiCharTypes.ATTR_CHARS, sAsciiCharTypes.NAME_CHARS, sAsciiCharTypes.DTD_CHARS, sAsciiCharTypes.OTHER_CHARS); // but need to wipe out everything for high-bit range: fillInIllegalAsciiRange(sAsciiCharTypes.TEXT_CHARS); fillInIllegalAsciiRange(sAsciiCharTypes.ATTR_CHARS); fillInIllegalAsciiRange(sAsciiCharTypes.NAME_CHARS); fillInIllegalAsciiRange(sAsciiCharTypes.DTD_CHARS); fillInIllegalAsciiRange(sAsciiCharTypes.OTHER_CHARS); } return sAsciiCharTypes; } public final static synchronized XmlCharTypes getLatin1CharTypes() { if (sLatin1CharTypes == null) { sLatin1CharTypes = new XmlCharTypes(); fillInLatin1Chars(sLatin1CharTypes.TEXT_CHARS, sLatin1CharTypes.ATTR_CHARS, sLatin1CharTypes.NAME_CHARS, sLatin1CharTypes.DTD_CHARS, sLatin1CharTypes.OTHER_CHARS); } return sLatin1CharTypes; } public static void fillInUtf8Chars(int[] textChars, int[] attrChars, int[] nameChars, int[] dtdChars, int[] otherChars) { // text chars fillIn8BitTextRange(textChars); fillInMultiByteTextRange(textChars); // attr chars fillIn8BitAttrRange(attrChars); fillInMultiByteTextRange(attrChars); // name chars fillIn8BitNameRange(nameChars); /* Although 7-bit range uses different values, let's use * same byte length markers for 8-bit range (as with text content) */ fillInMultiByteNameRange(nameChars); // // DTD chars: fillIn8BitDtdRange(dtdChars); fillInMultiByteTextRange(dtdChars); // ... lotsa matching to do here // 25-Jan-2011, tatu: Can't remember why LBRACKET would be needed: // otherChars['['] = CT_LBRACKET; otherChars[']'] = CT_RBRACKET; otherChars['>'] = CT_GT; // and finally, others (comment, CDATA, PI) // let's start with basic text chars: fillIn8BitTextRange(otherChars); fillInMultiByteTextRange(otherChars); /* And then just remove ampersand and lt (not special in any of * these events), and add ']', '?' and '-', which mark start of end * markers in the events. */ otherChars['&'] = CT_OK; otherChars['<'] = CT_OK; otherChars[']'] = CT_RBRACKET; // for CDATA otherChars['?'] = CT_QMARK; // for PI otherChars['-'] = CT_HYPHEN; // for Comment } private static void fillInMultiByteTextRange(int[] arr) { for (int c = 128; c < 256; ++c) { int code; // Let's use code from UTF-8 decoder, to ensure correctness if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) code = CT_MULTIBYTE_2; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) code = CT_MULTIBYTE_3; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... code = CT_MULTIBYTE_4; } else { code = CT_INVALID; } arr[c] = code; } } private static void fillInMultiByteNameRange(int[] arr) { for (int c = 128; c < 256; ++c) { int code; // Let's use code from UTF-8 decoder, to ensure correctness if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) code = CT_INPUT_NAME_MB_2; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) code = CT_INPUT_NAME_MB_3; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... code = CT_INPUT_NAME_MB_4; } else { code = CT_INVALID; } arr[c] = code; } } protected static void fillInIllegalAsciiRange(int[] arr) { for (int i = 128; i <= 255; ++i) { arr[i] = CT_INVALID; } } }