/* Aalto XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import java.io.*;

import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;

// Scanner for tokenizing XML content from a byte stream encoded using UTF-8, or something suitably close to it for decoding purposes (including ISO-Latin1 and US-ASCII).
/** * Scanner for tokenizing XML content from a byte stream encoding using * UTF-8 encoding, or something suitably close it for decoding purposes * (including ISO-Latin1 and US-ASCII). */
public final class Utf8Scanner extends StreamScanner { /* /********************************************************************** /* Life-cycle /********************************************************************** */ public Utf8Scanner(ReaderConfig cfg, InputStream in, byte[] buffer, int ptr, int last) { super(cfg, in, buffer, ptr, last); } /* /********************************************************************** /* Internal methods, secondary parsing /********************************************************************** */ @Override protected final void finishToken() throws XMLStreamException { _tokenIncomplete = false; switch (_currToken) { case PROCESSING_INSTRUCTION: finishPI(); break; case CHARACTERS: finishCharacters(); break; case COMMENT: finishComment(); break; case SPACE: finishSpace(); break; case DTD: finishDTD(true); // true -> get text break; case CDATA: finishCData(); break; default: ErrorConsts.throwInternalError(); } } @Override protected int handleStartElement(byte b) throws XMLStreamException { _currToken = START_ELEMENT; _currNsCount = 0; PName elemName = parsePName(b); /* Ok. Need to create a qualified name. Simplest for element * in default ns (no extra work -- expressed as null binding); * otherwise need to find binding */ String prefix = elemName.getPrefix(); boolean allBound; // flag to check 'late' bindings if (prefix == null) { // element in default ns allBound = true; // which need not be bound } else { elemName = bindName(elemName, prefix); allBound = elemName.isBound(); } _tokenName = elemName; _currElem = new ElementScope(elemName, _currElem); // And then attribute parsing loop: int attrPtr = 0; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; int c = (int) b & 0xFF; // Intervening space to skip? 
if (c <= INT_SPACE) { do { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; c = (int) b & 0xFF; } while (c <= INT_SPACE); } else if (c != INT_SLASH && c != INT_GT) { c = decodeCharForError(b); throwUnexpectedChar(c, " expected space, or '>' or \"/>\""); } // Ok; either need to get an attribute name, or end marker: if (c == INT_SLASH) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; if (b != BYTE_GT) { c = decodeCharForError(b); throwUnexpectedChar(c, " expected '>'"); } _isEmptyTag = true; break; } else if (c == INT_GT) { _isEmptyTag = false; break; } else if (c == INT_LT) { reportInputProblem("Unexpected '<' character in element (missing closing '>'?)"); } // Ok, an attr name: PName attrName = parsePName(b); prefix = attrName.getPrefix(); boolean isNsDecl; if (prefix == null) { // can be default ns decl: isNsDecl = (attrName.getLocalName() == "xmlns"); } else { // May be a namespace decl though? 
if (prefix == "xmlns") { isNsDecl = true; } else { attrName = bindName(attrName, prefix); if (allBound) { allBound = attrName.isBound(); } isNsDecl = false; } } // Optional space to skip again while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; c = (int) b & 0xFF; if (c > INT_SPACE) { break; } if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } } if (c != INT_EQ) { c = decodeCharForError(b); throwUnexpectedChar(c, " expected '='"); } // Optional space to skip again while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; c = (int) b & 0xFF; if (c > INT_SPACE) { break; } if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } } if (c != INT_QUOTE && c != INT_APOS) { c = decodeCharForError(b); throwUnexpectedChar(c, " Expected a quote"); } /* Ok, finally: value parsing. However, ns URIs are to be handled * different from attribute values... let's offline URIs, since * they should be less common than attribute values. */ if (isNsDecl) { // default ns, or explicit? handleNsDeclaration(attrName, b); ++_currNsCount; } else { // nope, a 'real' attribute: attrPtr = collectValue(attrPtr, b, attrName); } } { // Note: this call also checks attribute uniqueness int act = _attrCollector.finishLastValue(attrPtr); if (act < 0) { // error, dup attr indicated by -1 act = _attrCollector.getCount(); // let's get correct count reportInputProblem(_attrCollector.getErrorMsg()); } _attrCount = act; } ++_depth; /* Was there any prefix that wasn't bound prior to use? * That's legal, assuming declaration was found later on... 
* let's check */ if (!allBound) { if (!elemName.isBound()) { // element itself unbound reportUnboundPrefix(_tokenName, false); } for (int i = 0, len = _attrCount; i < len; ++i) { PName attrName = _attrCollector.getName(i); if (!attrName.isBound()) { reportUnboundPrefix(attrName, true); } } } return START_ELEMENT; }
// This method implements the tight loop for parsing attribute values. It is off-lined from the main start element method to simplify that method, which makes the code more maintainable and possibly easier for JIT/HotSpot to optimize.
/** * This method implements the tight loop for parsing attribute * values. It's off-lined from the main start element method to * simplify main method, which makes code more maintainable * and possibly easier for JIT/HotSpot to optimize. */
private final int collectValue(int attrPtr, byte quoteByte, PName attrName) throws XMLStreamException { char[] attrBuffer = _attrCollector.startNewValue(attrName, attrPtr); final int[] TYPES = _charTypes.ATTR_CHARS; final int quoteChar = (int) quoteByte; value_loop: while (true) { int c; ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } int max = _inputEnd; { int max2 = ptr + (attrBuffer.length - attrPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) _inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } attrBuffer[attrPtr++] = (char) c; } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } // fall through case XmlCharTypes.CT_WS_LF: markLF(); // fall through case XmlCharTypes.CT_WS_TAB: // Plus, need to convert these all to simple space c = INT_SPACE; break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); // Let's add first part right away: attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: throwUnexpectedChar(c, "'<' not allowed in attribute value"); case XmlCharTypes.CT_AMP: c = handleEntityInText(false); if (c == 0) { // unexpanded general entity... not good reportUnexpandedEntityInAttr(attrName, false); } // Ok; does it need a surrogate though? 
(over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } } break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break value_loop; } // default: // Other chars are not important here... } // We know there's room for at least one char without checking attrBuffer[attrPtr++] = (char) c; } return attrPtr; }
// Method called from the main START_ELEMENT handling loop to parse namespace URI values.
/** * Method called from the main START_ELEMENT handling loop, to * parse namespace URI values. */
private void handleNsDeclaration(PName name, byte quoteByte) throws XMLStreamException { int attrPtr = 0; char[] attrBuffer = _nameBuffer; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; if (b == quoteByte) { break; } int c; if (b == BYTE_AMP) { // entity c = handleEntityInText(false); if (c == 0) { // general entity; should never happen reportUnexpandedEntityInAttr(name, true); } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { if (attrPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } c -= 0x10000; attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } } else if (b == BYTE_LT) { // error c = (int) b; throwUnexpectedChar(c, "'<' not allowed in attribute value"); } else { c = (int) b & 0xFF; if (c < INT_SPACE) { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else { if (c < 0) { c = decodeMultiByteChar(c, _inputPtr); if (c < 0) { // surrogate pair c = -c; // Let's add first part right away: if (attrPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } c -= 0x10000; attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } } else if (c != INT_TAB) { throwInvalidSpace(c); } } } } if (attrPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } attrBuffer[attrPtr++] = (char) c; } /* Simple optimization: for default ns removal (or, with * ns 1.1, any other as well), will use empty value... no * need to try to intern: */ if (attrPtr == 0) { bindNs(name, ""); } else { String uri = _config.canonicalizeURI(attrBuffer, attrPtr); bindNs(name, uri); } }
// Method called when an ampersand is encountered in a text segment. The method needs to determine whether it is a pre-defined or character entity (in which case it will be expanded into a single char or surrogate pair), or a general entity (in which case it will most likely be returned as an ENTITY_REFERENCE event).
// Params:
//   • inAttr – true if the reference is from an attribute value; false if from normal text content
// Returns: 0 if a general parsed entity was encountered; integer value of a (valid) XML content character otherwise
/** * Method called when an ampersand is encounter in text segment. * Method needs to determine whether it is a pre-defined or character * entity (in which case it will be expanded into a single char or * surrogate pair), or a general * entity (in which case it will most likely be returned as * ENTITY_REFERENCE event) * * @param inAttr True, if reference is from attribute value; false * if from normal text content * * @return 0 if a general parsed entity encountered; integer * value of a (valid) XML content character otherwise */
@Override protected final int handleEntityInText(boolean inAttr) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_HASH) { return handleCharEntity(); } String start; if (b == BYTE_a) { // amp or apos? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_m) { // amp? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_p) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_AMP; } start = "amp"; } else { start = "am"; } } else if (b == BYTE_p) { // apos? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_o) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_s) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_APOS; } start = "apos"; } else { start = "apo"; } } else { start = "ap"; } } else { start = "a"; } } else if (b == BYTE_l) { // lt? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_t) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_LT; } start = "lt"; } else { start = "l"; } } else if (b == BYTE_g) { // gt? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_t) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_GT; } start = "gt"; } else { start = "g"; } } else if (b == BYTE_q) { // quot? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_u) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_o) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_t) { b = (_inputPtr < _inputEnd) ? 
_inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_QUOTE; } start = "quot"; } else { start = "quo"; } } else { start = "qu"; } } else { start = "q"; } } else { start = ""; } final int[] TYPES = _charTypes.NAME_CHARS; /* All righty: we have the beginning of the name, plus the first * byte too. So let's see what we can do with it. */ char[] cbuf = _nameBuffer; int cix = 0; for (int len = start.length(); cix < len; ++cix) { cbuf[cix] = start.charAt(cix); } //int colon = -1; while (b != BYTE_SEMICOLON) { boolean ok; int c = (int) b & 0xFF; // Has to be a valid name start char though: switch (TYPES[c]) { case XmlCharTypes.CT_NAME_NONE: case XmlCharTypes.CT_NAME_COLON: // not ok for entities? case XmlCharTypes.CT_NAME_NONFIRST: ok = (cix > 0); break; case XmlCharTypes.CT_NAME_ANY: ok = true; break; case InputCharTypes.CT_INPUT_NAME_MB_2: c = decodeUtf8_2(c); ok = XmlChars.is10NameStartChar(c); break; case InputCharTypes.CT_INPUT_NAME_MB_3: c = decodeUtf8_3(c); ok = XmlChars.is10NameStartChar(c); break; case InputCharTypes.CT_INPUT_NAME_MB_4: c = decodeUtf8_4(c); ok = XmlChars.is10NameStartChar(c); if (ok) { if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } // Let's add first part right away: c -= 0x10000; cbuf[cix++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } break; case InputCharTypes.CT_INPUT_NAME_MB_N: default: ok = false; break; } if (!ok) { reportInvalidNameChar(c, cix); } if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } cbuf[cix++] = (char) c; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; } // Ok, let's construct a (temporary) entity name, then: String pname = new String(cbuf, 0, cix); // (note: hash is dummy... not to be compared to anything etc) _tokenName = new PNameC(pname, null, pname, 0); /* One more thing: do we actually allow entities in this mode * and with this event? 
*/ if (_config.willExpandEntities()) { reportInputProblem("General entity reference (&"+pname+";) encountered in entity expanding mode: operation not (yet) implemented"); } if (inAttr) { reportInputProblem("General entity reference (&"+pname+";) encountered in attribute value, in non-entity-expanding mode: no way to handle it"); } return 0; } /* /********************************************************************** /* Internal methods, name parsing: /********************************************************************** */
// Parsing of public ids is a bit more complicated than that of system ids, since white space is to be coalesced.
/** * Parsing of public ids is bit more complicated than that of system * ids, since white space is to be coalesced. */
@Override protected String parsePublicId(byte quoteChar) throws XMLStreamException { char[] outputBuffer = _nameBuffer; int outPtr = 0; final int[] TYPES = XmlCharTypes.PUBID_CHARS; boolean addSpace = false; main_loop: while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } // Easier to check without char type table, first: byte b = _inputBuffer[_inputPtr++]; if (b == quoteChar) { break main_loop; } int c = (int) b & 0xFF; if (TYPES[c] != XmlCharTypes.PUBID_OK) { throwUnexpectedChar(c, " in public identifier"); } // White space? Needs to be coalesced if (c <= INT_SPACE) { addSpace = true; continue; } if (addSpace) { if (outPtr >= outputBuffer.length) { _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length); outPtr = 0; } outputBuffer[outPtr++] = ' '; addSpace = false; } if (outPtr >= outputBuffer.length) { _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length); outPtr = 0; } outputBuffer[outPtr++] = (char) c; } return new String(outputBuffer, 0, outPtr); } @Override protected String parseSystemId(byte quoteChar) throws XMLStreamException { // caller has init'ed the buffer... 
char[] outputBuffer = _nameBuffer; int outPtr = 0; // attribute types are closest matches, so let's use them final int[] TYPES = _charTypes.ATTR_CHARS; //boolean spaceToAdd = false; main_loop: while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } int c = (int) _inputBuffer[_inputPtr++] & 0xFF; if (TYPES[c] != 0) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_ATTR_QUOTE: if (c == (int) quoteChar) { break main_loop; } } } if (outPtr >= outputBuffer.length) { _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length); outPtr = 0; } outputBuffer[outPtr++] = (char) c; } return new String(outputBuffer, 0, outPtr); } /* /********************************************************************** /* Content skipping /********************************************************************** */ @Override protected final boolean skipCharacters() throws XMLStreamException { final int[] TYPES = _charTypes.TEXT_CHARS; final byte[] inputBuffer = _inputBuffer; while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { 
c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: skipUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: skipUtf8_4(c); break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: --_inputPtr; return false; case XmlCharTypes.CT_AMP: c = handleEntityInText(false); if (c == 0) { // unexpandable general parsed entity return true; } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; byte b; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = inputBuffer[_inputPtr]; if (b != BYTE_RBRACKET) { break; } ++_inputPtr; // to skip past bracket ++count; } if (b == BYTE_GT && count > 1) { reportIllegalCDataEnd(); } } break; // default: // Other types are not important here... 
} } } @Override protected final void skipComment() throws XMLStreamException { final int[] TYPES = _charTypes.OTHER_CHARS; final byte[] inputBuffer = _inputBuffer; while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: skipUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: skipUtf8_4(c); break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_HYPHEN: // '-->'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then ++_inputPtr; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr++] != BYTE_GT) { reportDoubleHyphenInComments(); } return; } break; // default: // Other types are not important here... 
} } } @Override protected final void skipCData() throws XMLStreamException { final int[] TYPES = _charTypes.OTHER_CHARS; final byte[] inputBuffer = _inputBuffer; while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: skipUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: skipUtf8_4(c); break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_RBRACKET: // ']]>'? { // end is nigh? int count = 0; byte b; do { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } ++count; b = _inputBuffer[_inputPtr++]; } while (b == BYTE_RBRACKET); if (b == BYTE_GT) { if (count > 1) { // gotcha return; } // can still skip plain ']>'... } else { --_inputPtr; // need to push back last char } } break; // default: // Other types are not important here... 
} } } @Override protected final void skipPI() throws XMLStreamException { final int[] TYPES = _charTypes.OTHER_CHARS; final byte[] inputBuffer = _inputBuffer; while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: skipUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: skipUtf8_4(c); break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_QMARK: // '?>'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_GT) { ++_inputPtr; return; } break; // default: // Other types are not important here... } } } @Override protected final void skipSpace() throws XMLStreamException { // mTmpChar has a space, but it's been checked, can ignore int ptr = _inputPtr; while (true) { if (ptr >= _inputEnd) { if (!loadMore()) { break; } ptr = _inputPtr; } int c = (int) _inputBuffer[ptr] & 0xFF; if (c > INT_SPACE) { // !!! 
TODO: xml 1.1 ws break; } ++ptr; if (c == INT_LF) { markLF(ptr); } else if (c == INT_CR) { if (ptr >= _inputEnd) { if (!loadMore()) { break; } ptr = _inputPtr; } if (_inputBuffer[ptr] == BYTE_LF) { ++ptr; } markLF(ptr); } else if (c != INT_SPACE && c != INT_TAB) { _inputPtr = ptr; throwInvalidSpace(c); } } _inputPtr = ptr; } /* private final int skipMultiByteChar(int c, int ptr) throws XMLStreamException { int needed; // Ok; if we end here, we got multi-byte combination if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) needed = 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) needed = 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... needed = 3; } else { reportInvalidInitial(c & 0xFF); needed = 1; // never gets here } if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } if (needed > 1) { // needed == 1 means 2 bytes total if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } if (needed > 2) { // 4 bytes? (need surrogates) if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } } } return ptr; } private final int skipMultiByteChar(int c, int type, int ptr) throws XMLStreamException { type -= XmlCharTypes.CT_MULTIBYTE_N; // number of more bytes needed if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } if (type > 1) { // needed == 1 means 2 bytes total if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } if (type > 2) { // 4 bytes? 
(need surrogates) if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } } } return ptr; } */ private final void skipUtf8_2(int c) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = (int) _inputBuffer[_inputPtr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, _inputPtr); } } /* Alas, can't heavily optimize skipping, since we still have to * do validity checks... */ private final void skipUtf8_3(int c) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c &= 0x0F; if (c >= 0xD) { // have to check c <<= 6; int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c |= (d & 0x3F); if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) { c = handleInvalidXmlChar(c); } } } else { // no checks, can discard c = (int) _inputBuffer[_inputPtr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, _inputPtr); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = (int) _inputBuffer[_inputPtr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, _inputPtr); } } } private final void skipUtf8_4(int c) throws XMLStreamException { if ((_inputPtr + 4) > _inputEnd) { skipUtf8_4Slow(c); return; } int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } } private final void skipUtf8_4Slow(int 
c) throws XMLStreamException
{
    // NOTE(review): this is the tail of a method whose declaration lies
    // before this section; statements are kept exactly as they were.
    // The visible part consumes three more input bytes and reports any
    // of them that does not carry the 10xxxxxx continuation marker.
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    int d = (int) _inputBuffer[_inputPtr++];
    if ((d & 0xC0) != 0x080) {
        reportInvalidOther(d & 0xFF, _inputPtr);
    }
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    d = (int) _inputBuffer[_inputPtr++];
    if ((d & 0xC0) != 0x080) {
        reportInvalidOther(d & 0xFF, _inputPtr);
    }
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    d = (int) _inputBuffer[_inputPtr++];
    if ((d & 0xC0) != 0x080) {
        reportInvalidOther(d & 0xFF, _inputPtr);
    }
}

/*
/**********************************************************************
/* Content parsing
/**********************************************************************
 */

/**
 * Reads the remaining content of a CDATA section into the text
 * builder, up to and including the closing ']]>' marker (which is
 * consumed but not stored).
 *<p>
 * Bytes whose entry in the OTHER_CHARS type table is zero are copied
 * as-is by the inner loop; all others are dispatched through the
 * switch: CR and CR+LF become a single LF, multi-byte UTF-8 sequences
 * are decoded (4-byte sequences are stored as a surrogate pair), and
 * runs of ']' are checked for the end marker.
 *<p>
 * If coalescing is enabled and no entity is pending, any directly
 * following text is appended via {@link #finishCoalescedText}.
 *
 * @throws XMLStreamException propagated from the input loading,
 *   decoding and error reporting helpers called here
 */
@Override
protected final void finishCData()
    throws XMLStreamException
{
    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;
    char[] outputBuffer = _textBuilder.resetWithEmpty();
    int outPtr = 0;

    /* At this point, space (if any) has been skipped, and we are
     * to parse and store the contents
     */
    main_loop:
    while (true) {
        int c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Limit the run to whichever ends first: available input,
            // or free space in the current output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then exceptions:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no 'break' here; if handleInvalidXmlChar()
            // returns normally, control falls through to CR handling.
            // Presumably it throws -- confirm.
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF is collapsed: skip the LF and emit a single LF
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            // decodeUtf8_4 returns the code point minus 0x10000
            c = decodeUtf8_4(c);
            // Let's add first part (high surrogate) right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char (low surrogate) output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no 'break'; presumably always throws -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
            /* Ok: let's just parse all consecutive right brackets,
             * and see if followed by greater-than char. This because
             * we can only push back at most one char at a time, and
             * thus can't easily just check a subset
             */
            int count = 0; // ignoring first one
            byte b;

            do {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                b = _inputBuffer[_inputPtr];
                if (b != BYTE_RBRACKET) {
                    break;
                }
                ++_inputPtr;
                ++count;
            } while (true);

            // Was the marker found?
            boolean ok = (b == BYTE_GT && count >= 1);
            if (ok) {
                // last two brackets belong to the marker: the one held
                // in 'c', and one of the counted ones
                --count;
            }
            // Brackets to copy to output?
            for (; count > 0; --count) {
                outputBuffer[outPtr++] = ']';
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
            }
            if (ok) {
                ++_inputPtr; // to consume '>'
                break main_loop;
            }
            break;
        }
        // Ok, can output the char; there's room for one char at least
        outputBuffer[outPtr++] = (char) c;
    }

    _textBuilder.setCurrentLength(outPtr);
    /* 03-Feb-2009, tatu: To support coalescing mode, may need to
     *   do some extra work
     */
    if (_cfgCoalescing && !_entityPending) {
        finishCoalescedText();
    }
}

/**
 * Reads the rest of a character data segment into the text builder.
 *<p>
 * The starting point is taken from {@code _tmpChar}: a negative value
 * is the negated code of a character that came from an entity and is
 * stored directly (as a surrogate pair if it does not fit in 16 bits);
 * a CR or LF is handed to {@code checkInTreeIndentation()}, which may
 * complete the whole segment (signalled by a negative return value);
 * anything else just starts with an empty buffer.
 *<p>
 * Reading stops at '&lt;' (which is pushed back) or at an entity that
 * {@code handleEntityInText()} reports as 0, in which case
 * {@code _entityPending} is set. A run of two or more ']' followed by
 * '&gt;' is reported via {@code reportIllegalCDataEnd()}.
 *<p>
 * If coalescing is enabled and no entity is pending, any directly
 * following text is appended via {@link #finishCoalescedText}.
 *
 * @throws XMLStreamException propagated from the input loading,
 *   decoding and error reporting helpers called here
 */
@Override
protected final void finishCharacters()
    throws XMLStreamException
{
    int outPtr;
    int c;
    char[] outputBuffer;

    // Ok, so what was the first char / entity?
    c = _tmpChar;
    if (c < 0) { // from entity; can just copy as is
        c = -c;
        outputBuffer = _textBuilder.resetWithEmpty();
        outPtr = 0;
        if ((c >> 16) != 0) { // surrogate pair?
            c -= 0x10000;
            /* Note: after resetting the buffer, it's known to have
             * space for more than 2 chars we need to add
             */
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            c = 0xDC00 | (c & 0x3FF);
        }
        outputBuffer[outPtr++] = (char) c;
    } else { // white space that we are interested in?
        if (c == INT_CR || c == INT_LF) {
            ++_inputPtr; // wasn't advanced yet, in this case
            outPtr = checkInTreeIndentation(c);
            if (outPtr < 0) {
                // NOTE(review): negative presumably means the segment
                // was fully handled as indentation -- confirm
                return;
            }
            // Above call also initializes the text builder appropriately
            outputBuffer = _textBuilder.getBufferWithoutReset();
        } else {
            outputBuffer = _textBuilder.resetWithEmpty();
            outPtr = 0;
        }
    }

    final int[] TYPES = _charTypes.TEXT_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    main_loop:
    while (true) {
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Limit the run to whichever ends first: available input,
            // or free space in the current output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then fallback for funny chars / UTF-8 multibytes:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no 'break' here; if handleInvalidXmlChar()
            // returns normally, control falls through to CR handling.
            // Presumably it throws -- confirm.
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF is collapsed: skip the LF and emit a single LF
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            // Fast variant does no bounds checks, so it is only used
            // when both continuation bytes are already buffered
            if ((_inputEnd - _inputPtr) >= 2) {
                c = decodeUtf8_3fast(c);
            } else {
                c = decodeUtf8_3(c);
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            // decodeUtf8_4 returns the code point minus 0x10000
            c = decodeUtf8_4(c);
            // Let's add first part (high surrogate) right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char (low surrogate) output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no 'break'; presumably always throws -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_LT:
            // Start of markup ends the segment; leave '<' unread
            --_inputPtr;
            break main_loop;
        case XmlCharTypes.CT_AMP:
            c = handleEntityInText(false);
            if (c == 0) { // unexpandable general parsed entity
                // _inputPtr set by entity expansion method
                _entityPending = true;
                break main_loop;
            }
            // Ok; does it need a surrogate though? (over 16 bits)
            if ((c >> 16) != 0) {
                c -= 0x10000;
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                // Need to ensure room for one more char
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
            }
            break;
        case XmlCharTypes.CT_RBRACKET: // ']]>'?
            {
                // Let's then just count number of brackets --
                // in case they are not followed by '>'
                int count = 1;
                byte b;
                while (true) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    b = inputBuffer[_inputPtr];
                    if (b != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr; // to skip past bracket
                    ++count;
                }
                // "]]>" is reported when found in character data
                if (b == BYTE_GT && count > 1) {
                    reportIllegalCDataEnd();
                }
                // Nope. Need to output all brackets, then; except
                // for one that can be left for normal output
                while (count > 1) {
                    outputBuffer[outPtr++] = ']';
                    // Need to ensure room for one more char
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                    --count;
                }
            }
            // Can just output the first ']' along normal output
            break;

        // default:
            // Other types are not important here...
        }
        // We know there's room for one more:
        outputBuffer[outPtr++] = (char) c;
    }

    _textBuilder.setCurrentLength(outPtr);

    /* 03-Feb-2009, tatu: To support coalescing mode, may need to
     *   do some extra work
     */
    if (_cfgCoalescing && !_entityPending) {
        finishCoalescedText();
    }
}

/**
 * Reads the remaining content of a comment into the text builder, up
 * to and including the closing '--&gt;' marker (which is consumed but
 * not stored).
 *<p>
 * CR and CR+LF become a single LF and multi-byte UTF-8 sequences are
 * decoded. A '--' that is not immediately followed by '&gt;' is
 * reported via {@code reportDoubleHyphenInComments()}.
 *
 * @throws XMLStreamException propagated from the input loading,
 *   decoding and error reporting helpers called here
 */
@Override
protected final void finishComment()
    throws XMLStreamException
{
    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;
    char[] outputBuffer = _textBuilder.resetWithEmpty();
    int outPtr = 0;

    main_loop:
    while (true) {
        int c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Limit the run to whichever ends first: available input,
            // or free space in the current output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }

        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no 'break' here; if handleInvalidXmlChar()
            // returns normally, control falls through to CR handling.
            // Presumably it throws -- confirm.
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF is collapsed: skip the LF and emit a single LF
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            // decodeUtf8_4 returns the code point minus 0x10000
            c = decodeUtf8_4(c);
            // Let's add first part (high surrogate) right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char (low surrogate) output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no 'break'; presumably always throws -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_HYPHEN: // '-->'?
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then
                ++_inputPtr;
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                // the byte after "--" is consumed either way
                if (_inputBuffer[_inputPtr++] != BYTE_GT) {
                    reportDoubleHyphenInComments();
                }
                break main_loop;
            }
            // single hyphen: plain content, output below
            break;
        // default:
            // Other types are not important here...
        }

        // Ok, can output the char (we know there's room for one more)
        outputBuffer[outPtr++] = (char) c;
    }
    _textBuilder.setCurrentLength(outPtr);
}
When this method gets called we know that we have an internal subset, and that the opening '[' has already been read.
/**
 * When this method gets called we know that we have an internal subset,
 * and that the opening '[' has already been read.
 *<p>
 * Scans to the ']' that closes the internal subset, keeping track of
 * whether the scan is inside a declaration ('&lt;' ... '&gt;') and/or
 * inside a quoted string, so that a ']' within either is not taken as
 * the end. After that a '&gt;' is required; anything else is reported
 * as an unexpected character.
 *
 * @param copyContents If true, the subset text is stored in the text
 *   builder; if false, content is only scanned past
 *
 * @throws XMLStreamException propagated from the input loading,
 *   decoding and error reporting helpers called here
 */
@Override
protected final void finishDTD(boolean copyContents)
    throws XMLStreamException
{
    // null output buffer is the "skip only" marker used throughout
    char[] outputBuffer = copyContents ? _textBuilder.resetWithEmpty() : null;
    int outPtr = 0;

    final int[] TYPES = _charTypes.DTD_CHARS;
    boolean inDecl = false; // in declaration/directive?
    int quoteChar = 0; // inside quoted string? (0 = no, else the quote char)

    main_loop:
    while (true) {
        int c;

        /* First we'll have a quickie loop for speeding through
         * uneventful chars...
         */
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            int max = _inputEnd;
            // Output space only limits the run when content is copied
            if (outputBuffer != null) {
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
            }
            while (ptr < max) {
                c = (int) _inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                if (outputBuffer != null) {
                    outputBuffer[outPtr++] = (char) c;
                }
            }
            _inputPtr = ptr;
        }

        switch (TYPES[c]) {

            // First, common types

        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no 'break' here; if handleInvalidXmlChar()
            // returns normally, control falls through to CR handling.
            // Presumably it throws -- confirm.
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF is collapsed: skip the LF and emit a single LF
            if (_inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            // decodeUtf8_4 returns the code point minus 0x10000
            c = decodeUtf8_4(c);
            if (outputBuffer != null) {
                // Let's add first part (high surrogate) right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // And let the other char (low surrogate) output down below
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no 'break'; presumably always throws -- confirm
            reportInvalidInitial(c);

            // Then DTD-specific types:

        case XmlCharTypes.CT_DTD_QUOTE: // apos or quot
            if (quoteChar == 0) {
                quoteChar = c;
            } else {
                // only the same kind of quote closes the string
                if (quoteChar == c) {
                    quoteChar = 0;
                }
            }
            break;

        case XmlCharTypes.CT_DTD_LT:
            if (!inDecl) {
                inDecl = true;
            }
            break;
        case XmlCharTypes.CT_DTD_GT:
            // '>' inside a quoted string does not end the declaration
            if (quoteChar == 0) {
                inDecl = false;
            }
            break;
        case XmlCharTypes.CT_DTD_RBRACKET:
            // End of subset only when outside declarations and quotes
            if (!inDecl && quoteChar == 0) {
                break main_loop;
            }
            break;
        // default:
            // Other types are not important here...
        }

        if (outputBuffer != null) { // will have room for one more
            outputBuffer[outPtr++] = (char) c;
        }
    }
    if (outputBuffer != null) {
        _textBuilder.setCurrentLength(outPtr);
    }

    // but still need to match the '>'...
    // NOTE(review): skipInternalWs() presumably skips white space and
    // returns the first byte after it -- confirm
    byte b = skipInternalWs(false, null);
    if (b != BYTE_GT) {
        throwUnexpectedChar(decodeCharForError(b), " expected '>' after the internal subset");
    }
}

/**
 * Reads the remaining content of a processing instruction into the
 * text builder, up to and including the closing '?&gt;' marker (which
 * is consumed but not stored).
 *<p>
 * CR and CR+LF become a single LF and multi-byte UTF-8 sequences are
 * decoded; a '?' that is not followed by '&gt;' is stored as content.
 *
 * @throws XMLStreamException propagated from the input loading,
 *   decoding and error reporting helpers called here
 */
@Override
protected final void finishPI()
    throws XMLStreamException
{
    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;
    char[] outputBuffer = _textBuilder.resetWithEmpty();
    int outPtr = 0;

    /* At this point, space (if any) has been skipped, and we are
     * to parse and store the contents
     */
    main_loop:
    while (true) {
        int c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Limit the run to whichever ends first: available input,
            // or free space in the current output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then exceptions:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no 'break' here; if handleInvalidXmlChar()
            // returns normally, control falls through to CR handling.
            // Presumably it throws -- confirm.
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF is collapsed: skip the LF and emit a single LF
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            // decodeUtf8_4 returns the code point minus 0x10000
            c = decodeUtf8_4(c);
            // Let's add first part (high surrogate) right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char (low surrogate) output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no 'break'; presumably always throws -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_QMARK:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == BYTE_GT) { // ok, the end!
                ++_inputPtr;
                break main_loop;
            }
            // Not end mark: the '?' is output below, and the byte that
            // follows it is left unread for the next round
        // default:
            // Other types are not important here...
        }

        // Ok, can output the char (we know there's room for one more)
        outputBuffer[outPtr++] = (char) c;
    }
    _textBuilder.setCurrentLength(outPtr);
}
Note: this method is only called in cases where it is known that only space chars are legal. Thus, encountering a non-space is an error (WFC or VC). However, an end-of-input is ok.
/** * Note: this method is only called in cases where it is known * that only space chars are legal. Thus, encountering a non-space * is an error (WFC or VC). However, an end-of-input is ok. */
@Override protected final void finishSpace() throws XMLStreamException { /* Ok: so, mTmpChar contains first space char. If it looks * like indentation, we can probably optimize a bit... */ int tmp = _tmpChar; char[] outputBuffer; int outPtr; if (tmp == BYTE_CR || tmp == BYTE_LF) { outPtr = checkPrologIndentation(tmp); if (outPtr < 0) { return; } // Above call also initializes the text builder appropriately outputBuffer = _textBuilder.getBufferWithoutReset(); } else { outputBuffer = _textBuilder.resetWithEmpty(); outputBuffer[0] = (char) tmp; outPtr = 1; } int ptr = _inputPtr; while (true) { if (ptr >= _inputEnd) { if (!loadMore()) { break; } ptr = _inputPtr; } int c = (int) _inputBuffer[ptr] & 0xFF; // !!! TODO: check for xml 1.1 whitespace? if (c > INT_SPACE) { break; } ++ptr; if (c == INT_LF) { markLF(ptr); } else if (c == INT_CR) { if (ptr >= _inputEnd) { if (!loadMore()) { // still need to output the lf if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = '\n'; break; } ptr = _inputPtr; } if (_inputBuffer[ptr] == BYTE_LF) { ++ptr; } markLF(ptr); c = INT_LF; // need to convert to canonical lf } else if (c != INT_SPACE && c != INT_TAB) { _inputPtr = ptr; throwInvalidSpace(c); } // Ok, can output the char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = (char) c; } _inputPtr = ptr; _textBuilder.setCurrentLength(outPtr); } /* /********************************************************************** /* 2nd level parsing/skipping for coalesced text /********************************************************************** */
Method that gets called after a primary text segment (of type CHARACTERS or CDATA, not applicable to SPACE) has been read in text buffer. Method has to see if the following event would be textual as well, and if so, read it (and any other following textual segments).
/**
 * Method that gets called after a primary text segment (of type
 * CHARACTERS or CDATA, not applicable to SPACE) has been read in
 * text buffer. Method has to see if the following event would
 * be textual as well, and if so, read it (and any other following
 * textual segments).
 *<p>
 * Loops until something other than text follows: markup that is not
 * a CDATA section, end of available input, or an entity that leaves
 * {@code _entityPending} set. Content that is found is appended to
 * what the text builder already holds.
 *
 * @throws XMLStreamException propagated from the input loading,
 *   decoding and error reporting helpers called here
 */
protected final void finishCoalescedText()
    throws XMLStreamException
{
    while (true) {
        // no matter what, will need (and can get) one char
        if (_inputPtr >= _inputEnd) {
            if (!loadMore()) { // most likely an error, will be handled later on
                return;
            }
        }

        if (_inputBuffer[_inputPtr] == BYTE_LT) { // markup of some kind
            /* In worst case, need 3 chars ("<![") all in all to know
             * if we are getting a CDATA section
             */
            if ((_inputPtr + 3) >= _inputEnd) {
                if (!loadAndRetain(3)) {
                    // probably an error, but will be handled later
                    return;
                }
            }
            if (_inputBuffer[_inputPtr+1] != BYTE_EXCL
                || _inputBuffer[_inputPtr+2] != BYTE_LBRACKET) {
                // can't be CDATA, we are done here
                return;
            }
            // but let's verify it still: the 6 characters of CDATA_STR
            // must follow "<!["
            _inputPtr += 3;
            for (int i = 0; i < 6; ++i) {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                byte b = _inputBuffer[_inputPtr++];
                if (b != (byte) CDATA_STR.charAt(i)) {
                    int ch = decodeCharForError(b);
                    reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)");
                }
            }
            finishCoalescedCData();
        } else { // textual (or entity, error etc)
            finishCoalescedCharacters();
            if (_entityPending) {
                break;
            }
        }
    }
}

// note: code mostly copied from 'finishCharacters', just simplified
// in some places
/**
 * Appends a further run of character data to the content already in
 * the text builder.
 *<p>
 * Reading stops at '&lt;' (which is pushed back) or at an entity that
 * {@code handleEntityInText()} reports as 0, in which case
 * {@code _entityPending} is set. A run of two or more ']' followed by
 * '&gt;' is reported via {@code reportIllegalCDataEnd()}.
 *
 * @throws XMLStreamException propagated from the input loading,
 *   decoding and error reporting helpers called here
 */
protected final void finishCoalescedCharacters()
    throws XMLStreamException
{
    // first char can't be from (char) entity (wrt finishCharacters)

    final int[] TYPES = _charTypes.TEXT_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    // Continue from where the previous segment left off
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    int outPtr = _textBuilder.getCurrentLength();

    int c;

    main_loop:
    while (true) {
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Limit the run to whichever ends first: available input,
            // or free space in the current output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then fallback for funny chars / UTF-8 multibytes:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no 'break' here; if handleInvalidXmlChar()
            // returns normally, control falls through to CR handling.
            // Presumably it throws -- confirm.
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF is collapsed: skip the LF and emit a single LF
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            // Fast variant does no bounds checks, so it is only used
            // when both continuation bytes are already buffered
            if ((_inputEnd - _inputPtr) >= 2) {
                c = decodeUtf8_3fast(c);
            } else {
                c = decodeUtf8_3(c);
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            // decodeUtf8_4 returns the code point minus 0x10000
            c = decodeUtf8_4(c);
            // Let's add first part (high surrogate) right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char (low surrogate) output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no 'break'; presumably always throws -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_LT:
            // Start of markup ends the segment; leave '<' unread
            --_inputPtr;
            break main_loop;
        case XmlCharTypes.CT_AMP:
            c = handleEntityInText(false);
            if (c == 0) { // unexpandable general parsed entity
                // _inputPtr set by entity expansion method
                _entityPending = true;
                break main_loop;
            }
            // Ok; does it need a surrogate though? (over 16 bits)
            if ((c >> 16) != 0) {
                c -= 0x10000;
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                // Need to ensure room for one more char
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
            }
            break;
        case XmlCharTypes.CT_RBRACKET: // ']]>'?
            {
                // Let's then just count number of brackets --
                // in case they are not followed by '>'
                int count = 1;
                byte b;
                while (true) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    b = inputBuffer[_inputPtr];
                    if (b != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr; // to skip past bracket
                    ++count;
                }
                // "]]>" is reported when found in character data
                if (b == BYTE_GT && count > 1) {
                    reportIllegalCDataEnd();
                }
                // Nope. Need to output all brackets, then; except
                // for one that can be left for normal output
                while (count > 1) {
                    outputBuffer[outPtr++] = ']';
                    // Need to ensure room for one more char
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                    --count;
                }
            }
            // Can just output the first ']' along normal output
            break;

        // default:
            // Other types are not important here...
        }
        // We know there's room for one more:
        outputBuffer[outPtr++] = (char) c;
    }

    _textBuilder.setCurrentLength(outPtr);
}

// note: code mostly copied from 'finishCData', just simplified
// in some places
/**
 * Appends the content of a further CDATA section to the content
 * already in the text builder, up to and including the closing
 * ']]>' marker (which is consumed but not stored). Expects the
 * opening marker to have been consumed already.
 *
 * @throws XMLStreamException propagated from the input loading,
 *   decoding and error reporting helpers called here
 */
protected final void finishCoalescedCData()
    throws XMLStreamException
{
    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    // Continue from where the previous segment left off
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    int outPtr = _textBuilder.getCurrentLength();

    /* At this point, space (if any) has been skipped, and we are
     * to parse and store the contents
     */
    main_loop:
    while (true) {
        int c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Limit the run to whichever ends first: available input,
            // or free space in the current output segment
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then exceptions:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no 'break' here; if handleInvalidXmlChar()
            // returns normally, control falls through to CR handling.
            // Presumably it throws -- confirm.
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF is collapsed: skip the LF and emit a single LF
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            // decodeUtf8_4 returns the code point minus 0x10000
            c = decodeUtf8_4(c);
            // Let's add first part (high surrogate) right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char (low surrogate) output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no 'break'; presumably always throws -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
            /* Ok: let's just parse all consecutive right brackets,
             * and see if followed by greater-than char. This because
             * we can only push back at most one char at a time, and
             * thus can't easily just check a subset
             */
            int count = 0; // ignoring first one
            byte b;

            do {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                b = _inputBuffer[_inputPtr];
                if (b != BYTE_RBRACKET) {
                    break;
                }
                ++_inputPtr;
                ++count;
            } while (true);

            // Was the marker found?
            boolean ok = (b == BYTE_GT && count >= 1);
            if (ok) {
                // last two brackets belong to the marker: the one held
                // in 'c', and one of the counted ones
                --count;
            }
            // Brackets to copy to output?
            for (; count > 0; --count) {
                outputBuffer[outPtr++] = ']';
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
            }
            if (ok) {
                ++_inputPtr; // to consume '>'
                break main_loop;
            }
            break;
        }
        // Ok, can output the char; there's room for one char at least
        outputBuffer[outPtr++] = (char) c;
    }

    _textBuilder.setCurrentLength(outPtr);
}
Method that gets called after a primary text segment (of type CHARACTERS or CDATA, not applicable to SPACE) has been skipped. Method has to see if the following event would be textual as well, and if so, skip it (and any other following textual segments).
Returns: True if we encountered an unexpandable entity
/** * Method that gets called after a primary text segment (of type * CHARACTERS or CDATA, not applicable to SPACE) has been skipped. * Method has to see if the following event would * be textual as well, and if so, skip it (and any other following * textual segments). * * @return True if we encountered an unexpandable entity */
@Override protected final boolean skipCoalescedText() throws XMLStreamException { while (true) { // no matter what, will need (and can get) one char if (_inputPtr >= _inputEnd) { if (!loadMore()) { // most likely an error, will be handled later on return false; } } if (_inputBuffer[_inputPtr] == BYTE_LT) { // markup of some kind /* In worst case, need 3 chars ("<![") all in all to know * if we are getting a CDATA section */ if ((_inputPtr + 3) >= _inputEnd) { if (!loadAndRetain(3)) { // probably an error, but will be handled later return false; } } if (_inputBuffer[_inputPtr+1] != BYTE_EXCL || _inputBuffer[_inputPtr+2] != BYTE_LBRACKET) { // can't be CDATA, we are done here return false; } // but let's verify it still: _inputPtr += 3; for (int i = 0; i < 6; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; if (b != (byte) CDATA_STR.charAt(i)) { int ch = decodeCharForError(b); reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)"); } } skipCData(); } else { // textual (or entity, error etc) if (skipCharacters()) { return true; } } } } /* /********************************************************************** /* Other methods, utf-decoding /********************************************************************** */
Returns: Either decoded character (if positive int); or negated value of a high-order char (one that needs surrogate pair)
/** * @return Either decoded character (if positive int); or negated * value of a high-order char (one that needs surrogate pair) */
private final int decodeMultiByteChar(int c, int ptr) throws XMLStreamException { int needed; if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) c &= 0x1F; needed = 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) c &= 0x0F; needed = 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... c &= 0x07; needed = 3; } else { reportInvalidInitial(c & 0xFF); needed = 1; // never gets here } if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } int d = (int) _inputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, ptr); } c = (c << 6) | (d & 0x3F); if (needed > 1) { // needed == 1 means 2 bytes total if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } d = (int) _inputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, ptr); } c = (c << 6) | (d & 0x3F); if (needed > 2) { // 4 bytes? (need surrogates) if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } d = (int) _inputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, ptr); } c = (c << 6) | (d & 0x3F); /* Need to signal such pair differently (to make comparison * easier) */ c = -c; } } _inputPtr = ptr; return c; } private final int decodeUtf8_2(int c) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } return ((c & 0x1F) << 6) | (d & 0x3F); } private final int decodeUtf8_3(int c1) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c1 &= 0x0F; int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } int c = (c1 << 6) | (d & 0x3F); if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); if (c1 >= 0xD) { // 0xD800-0xDFFF, 
0xFFFE-0xFFFF illegal if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) { c = handleInvalidXmlChar(c); } } } return c; } private final int decodeUtf8_3fast(int c1) throws XMLStreamException { c1 &= 0x0F; int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } int c = (c1 << 6) | (d & 0x3F); d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) { c = handleInvalidXmlChar(c); } } } return c; }
Returns: Character value minus 0x10000; this so that caller can readily expand it to actual surrogates
/**
 * Decodes a 4-byte UTF-8 sequence, reading the three continuation
 * bytes from the current input position (loading more input as
 * needed).
 *
 * @param c First byte of the sequence
 *
 * @return Character value <b>minus 0x10000</b>; this so that caller
 *   can readily expand it to actual surrogates
 */
private final int decodeUtf8_4(int c)
    throws XMLStreamException
{
    // 3 payload bits come from the first byte, 6 from each of the rest
    int bits = c & 0x07;

    for (int i = 0; i < 3; ++i) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        final int next = (int) _inputBuffer[_inputPtr++];
        // continuation bytes must look like 10xxxxxx
        if ((next & 0xC0) != 0x080) {
            reportInvalidOther(next & 0xFF, _inputPtr);
        }
        bits = (bits << 6) | (next & 0x3F);
    }
    /* note: value is not made negative here, since caller
     *   already knows it'll need a surrogate
     */
    return bits - 0x10000;
}

/*
/**********************************************************************
/* Internal methods, error reporting
/**********************************************************************
 */
Method called to decode a full UTF-8 character, given its first byte. Note: does not do any validity checks, since this is only to be used for informational purposes (often when an error has already been encountered)
/** * Method called called to decode a full UTF-8 characters, given * its first byte. Note: does not do any validity checks, since this * is only to be used for informational purposes (often when an error * has already been encountered) */
@Override public int decodeCharForError(byte b) throws XMLStreamException { int c = (int) b; if (c >= 0) { // ascii? fine as is... return c; } int needed; // Ok; if we end here, we got multi-byte combination if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) c &= 0x1F; needed = 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) c &= 0x0F; needed = 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... c &= 0x07; needed = 3; } else { reportInvalidInitial(c & 0xFF); needed = 1; // never gets here } int d = nextByte(); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF); } c = (c << 6) | (d & 0x3F); if (needed > 1) { // needed == 1 means 2 bytes total d = nextByte(); // 3rd byte if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF); } c = (c << 6) | (d & 0x3F); if (needed > 2) { // 4 bytes? (need surrogates) d = nextByte(); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF); } c = (c << 6) | (d & 0x3F); } } return c; } protected void reportInvalidOther(int mask, int ptr) throws XMLStreamException { _inputPtr = ptr; reportInvalidOther(mask); } }