com.fasterxml/aalto-xml/1.2.2 : com/fasterxml/aalto/out/OutputCharTypes.java

OutputCharTypes
http://github.com/FasterXML/aalto-xml/: Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2) (FasterXML)
The Apache Software License, Version 2.0
Tatu Saloranta
/* Woodstox Lite ("wool") XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.out;

import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;

public final class OutputCharTypes
    extends XmlCharTypes
{
    Although many encodings (Latin1, Ascii) could use shorter tables,
for UTF-8 2k makes sense, as it will then cover both one and
type byte sequences. And this being the case, let's use the same
size for all encodings.
/**
     * Although many encodings (Latin1, Ascii) could use shorter tables,
     * for UTF-8 2k makes sense, as it will then cover both one and
     * type byte sequences. And this being the case, let's use the same
     * size for all encodings.
     */
    final static int MAIN_TABLE_SIZE = 2048;

    // Note: this is only used on writer-side, thus overlaps with previous one:
    public final static int CT_OUTPUT_MUST_QUOTE = CT_MULTIBYTE_N;

    /* Also, dealing with names is bit more complicated, as there's
     * both validity, and possible encoding, to tackle...
     * So let's try if we can figure out combinations: only 1 and 2-byte
     * encodings are covered by the table, fortunately
     *<p>
     * Note: values must not overlap with base class' constants
     */

    Unencodable means that while the name char may be acceptable
per se, it can not be encode using current encoding
/**
     * Unencodable means that while the name char may be acceptable
     * per se, it can not be encode using current encoding
     */
    public final static int CT_OUTPUT_NAME_UNENCODABLE = 4;
    public final static int CT_OUTPUT_NAME_NONFIRST_MB2 = 5;
    public final static int CT_OUTPUT_NAME_ANY_MB2 = 6;

    // Singleton instances:

    private static XmlCharTypes sAsciiCharTypes = null;

    private static XmlCharTypes sLatin1CharTypes = null;

    /* Note: unlike others, let's create eagerly, not lazily,
     * as this is expected to be the common case:
     */
    private final static XmlCharTypes sUtf8CharTypes = new XmlCharTypes(MAIN_TABLE_SIZE);
    static {
        /* On output side, utf-8 handling is bit different though;
         * 8-bit range is affected but in different way. So let's actually
         * start from vanilla Latin1 settings:
         */
        fillInLatin1Chars(sUtf8CharTypes.TEXT_CHARS,
                          sUtf8CharTypes.ATTR_CHARS,
                          sUtf8CharTypes.NAME_CHARS,
                          sUtf8CharTypes.DTD_CHARS,
                          sUtf8CharTypes.OTHER_CHARS);
        /* And then just note that all 8-bit textual things need
         * two-byte encoding (not applicable for name tables, though,
         * uses separate vales)
         */
        modifyForUtf8(sUtf8CharTypes.TEXT_CHARS);
        modifyForUtf8(sUtf8CharTypes.ATTR_CHARS);
        modifyForUtf8(sUtf8CharTypes.DTD_CHARS);
        modifyForUtf8(sUtf8CharTypes.OTHER_CHARS);

        /* But beyond that, not all name characters between 256 and 2047
         * are legal...
         */
        modifyUtf8Names(sUtf8CharTypes.NAME_CHARS);

        /* One final point: within attribute values, we need tad more
         * quoting for some things.
         */
        modifyForAttrWrite(sUtf8CharTypes.ATTR_CHARS);
    }

    public final static XmlCharTypes getUtf8CharTypes()
    {
        return sUtf8CharTypes;
    }

    public final static XmlCharTypes getLatin1CharTypes()
    {
        if (sLatin1CharTypes == null) {
            sLatin1CharTypes = new XmlCharTypes(MAIN_TABLE_SIZE);
            fillInLatin1Chars(sLatin1CharTypes.TEXT_CHARS,
                              sLatin1CharTypes.ATTR_CHARS,
                              sLatin1CharTypes.NAME_CHARS,
                              sLatin1CharTypes.DTD_CHARS,
                              sLatin1CharTypes.OTHER_CHARS);
            modifyForLatin1(sLatin1CharTypes.TEXT_CHARS);
            modifyForLatin1(sLatin1CharTypes.ATTR_CHARS);
            modifyForLatin1(sLatin1CharTypes.DTD_CHARS);
            modifyForLatin1(sLatin1CharTypes.OTHER_CHARS);
            // not applicable for names

            // Also, extra quoting for some chars in attr values
            modifyForAttrWrite(sLatin1CharTypes.ATTR_CHARS);
        }
        return sLatin1CharTypes;
    }

    public final static XmlCharTypes getAsciiCharTypes()
    {
        if (sAsciiCharTypes == null) {
            sAsciiCharTypes = new XmlCharTypes(MAIN_TABLE_SIZE);
            // We'll start with 8-bit char set
            fillInLatin1Chars(sAsciiCharTypes.TEXT_CHARS,
                              sAsciiCharTypes.ATTR_CHARS,
                              sAsciiCharTypes.NAME_CHARS,
                              sAsciiCharTypes.DTD_CHARS,
                              sAsciiCharTypes.OTHER_CHARS);
            // And then just require quoting for non-7-bit chars
            modifyForAscii(sAsciiCharTypes.TEXT_CHARS);
            modifyForAscii(sAsciiCharTypes.ATTR_CHARS);
            modifyForAscii(sAsciiCharTypes.DTD_CHARS);
            modifyForAscii(sAsciiCharTypes.OTHER_CHARS);
            modifyAsciiNames(sAsciiCharTypes.NAME_CHARS);

            // Also, extra quoting for some chars in attr values
            modifyForAttrWrite(sAsciiCharTypes.ATTR_CHARS);
        }
        return sAsciiCharTypes;
    }

    private static void modifyForLatin1(int[] charTable)
    {
        /* And also mark 0x7F - 0x9F (although for xml 1.1, could
         * consider not quoting 0x85?)
         */
        for (int i = 0x7F; i <= 0x9F; ++i) {
            charTable[i] = CT_OUTPUT_MUST_QUOTE;
        }
        requireQuotingAfter(charTable, 0xFF);
    }

    private static void modifyLatin1Names(int[] charTable)
    {
        for (int i = 0x100, len = charTable.length; i < len; ++i) {
            // Just need to indicate none should be 'ok'...
            int val = charTable[i];
            if (val == CT_NAME_NONFIRST || val == CT_NAME_ANY) {
                charTable[i] = CT_OUTPUT_NAME_UNENCODABLE;
            }
        }
    }

    private static void modifyForUtf8(int[] charTable)
    {
        for (int i = 0x80, len = charTable.length; i < len; ++i) {
            // Let's not modify entries that indicate 'must quote' or invalid:
            if (charTable[i] == CT_OK) {
                charTable[i] = CT_MULTIBYTE_2;
            }
        }
    }

    private static void modifyUtf8Names(int[] charTable)
    {
        /* !!! For now, we'll just use xml 1.0 rules, for 1.1 need
         *   to use separate set of tables.
         */
        for (int i = 0x80, len = charTable.length; i < len; ++i) {
            if (XmlChars.is10NameStartChar(i)) {
                charTable[i] = CT_OUTPUT_NAME_ANY_MB2;
            } else if (XmlChars.is10NameChar(i)) {
                charTable[i] = CT_OUTPUT_NAME_NONFIRST_MB2;
            } else {
                charTable[i] = CT_NAME_NONE;
            }
        }
    }

    private static void modifyForAscii(int[] charTable)
    {
        requireQuotingAfter(charTable, 0x7F);
    }

    private static void modifyAsciiNames(int[] charTable)
    {
        modifyLatin1Names(charTable);
        for (int i = 0x80, len = charTable.length; i < len; ++i) {
            // Just need to indicate none should be 'ok'...
            int val = charTable[i];
            if (val == CT_NAME_NONFIRST || val == CT_NAME_ANY) {
                charTable[i] = CT_OUTPUT_NAME_UNENCODABLE;
            }
        }
    }

    private static void modifyForAttrWrite(int[] charTable)
    {
        charTable['\t'] = CT_OUTPUT_MUST_QUOTE;
    }

    private static void requireQuotingAfter(int[] charTable, int lastValid)
    {
        // For the most part, like Latin1
        for (int i = lastValid+1, len = charTable.length; i < len; ++i) {
            // Just need to indicate none should be 'ok'...
            if (charTable[i] == CT_OK) {
                charTable[i] = CT_OUTPUT_MUST_QUOTE;
            }
        }
    }
}
/

com.fasterxml/ aalto-xml/ 1.2.2/ com/fasterxml/aalto/out/OutputCharTypes.java