/* Aalto XML processor
*
* Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.fasterxml.aalto.in;
import java.io.*;
import javax.xml.stream.XMLStreamException;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;
/**
* Scanner for tokenizing XML content from a byte stream that is encoded
* using UTF-8, or something suitably close to it for decoding purposes
* (including ISO-Latin1 and US-ASCII).
*/
public final class Utf8Scanner
extends StreamScanner
{
/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
*/
/**
* Creates a scanner; all arguments are passed, unchanged and in the same
* order, to the {@link StreamScanner} constructor.
*
* @param cfg Reader configuration to use
* @param in Stream to read input from
* @param buffer Input buffer; presumably may already hold bytes that were
*   read ahead by the caller -- TODO confirm against StreamScanner
* @param ptr Offset within {@code buffer}; presumably the first unread
*   byte -- TODO confirm against StreamScanner
* @param last Offset within {@code buffer}; presumably the end of valid
*   content -- TODO confirm against StreamScanner
*/
public Utf8Scanner(ReaderConfig cfg, InputStream in,
byte[] buffer, int ptr, int last)
{
super(cfg, in, buffer, ptr, last);
}
/*
/**********************************************************************
/* Internal methods, secondary parsing
/**********************************************************************
*/
/**
 * Completes parsing of the current token by dispatching, based on
 * {@code _currToken}, to the matching {@code finishXxx()} method.
 * The "token incomplete" flag is cleared before dispatching.
 * Any token type without a finisher is reported as an internal error.
 */
@Override
protected final void finishToken() throws XMLStreamException
{
    // Cleared first, regardless of which finisher ends up being called
    _tokenIncomplete = false;

    final int tokenType = _currToken;
    switch (tokenType) {
    case CHARACTERS:
        finishCharacters();
        return;
    case SPACE:
        finishSpace();
        return;
    case CDATA:
        finishCData();
        return;
    case COMMENT:
        finishComment();
        return;
    case PROCESSING_INSTRUCTION:
        finishPI();
        return;
    case DTD:
        // 'true' means that the text is to be collected as well
        finishDTD(true);
        return;
    }
    // No other token type should ever be left incomplete
    ErrorConsts.throwInternalError();
}
/**
* Handles a start tag: parses the element name, then loops over attributes
* and namespace declarations until the closing {@code '>'} or {@code "/>"}
* has been consumed.
*<p>
* State written by this method: {@code _currToken}, {@code _currNsCount},
* {@code _tokenName}, {@code _currElem}, {@code _isEmptyTag},
* {@code _attrCount}; {@code _depth} is incremented by one.
*
* @param b First byte of the element name; handed as is to {@code parsePName}
*
* @return Always {@code START_ELEMENT}
*
* @throws XMLStreamException Declared for the parsing and reporting helpers
*   called from here (unexpected characters, duplicate attributes, unbound
*   prefixes are all reported through such helpers)
*/
@Override
protected int handleStartElement(byte b)
throws XMLStreamException
{
_currToken = START_ELEMENT;
_currNsCount = 0;
PName elemName = parsePName(b);
/* Ok. Need to create a qualified name. Simplest for element
* in default ns (no extra work -- expressed as null binding);
* otherwise need to find binding
*/
String prefix = elemName.getPrefix();
boolean allBound; // flag to check 'late' bindings
if (prefix == null) { // element in default ns
allBound = true; // which need not be bound
} else {
elemName = bindName(elemName, prefix);
allBound = elemName.isBound();
}
_tokenName = elemName;
// New scope is linked to the scope that was current so far
_currElem = new ElementScope(elemName, _currElem);
// And then attribute parsing loop:
// (attrPtr is the running offset into the attribute value buffer)
int attrPtr = 0;
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
int c = (int) b & 0xFF;
// Intervening space to skip?
if (c <= INT_SPACE) {
do {
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
// CR+LF pair counts as a single line feed
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else if (c != INT_SPACE && c != INT_TAB) {
// other bytes at or below space are not valid white space
throwInvalidSpace(c);
}
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
c = (int) b & 0xFF;
} while (c <= INT_SPACE);
} else if (c != INT_SLASH && c != INT_GT) {
// No white space: only the end of the tag may follow directly
c = decodeCharForError(b);
throwUnexpectedChar(c, " expected space, or '>' or \"/>\"");
}
// Ok; either need to get an attribute name, or end marker:
if (c == INT_SLASH) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
if (b != BYTE_GT) {
c = decodeCharForError(b);
throwUnexpectedChar(c, " expected '>'");
}
_isEmptyTag = true;
break;
} else if (c == INT_GT) {
_isEmptyTag = false;
break;
} else if (c == INT_LT) {
reportInputProblem("Unexpected '<' character in element (missing closing '>'?)");
}
// Ok, an attr name:
PName attrName = parsePName(b);
prefix = attrName.getPrefix();
boolean isNsDecl;
// NOTE(review): the two checks below compare Strings by identity;
// presumably names are interned when PNames are built -- confirm
if (prefix == null) { // can be default ns decl:
isNsDecl = (attrName.getLocalName() == "xmlns");
} else {
// May be a namespace decl though?
if (prefix == "xmlns") {
isNsDecl = true;
} else {
attrName = bindName(attrName, prefix);
// once an unbound name has been seen, flag stays false
if (allBound) {
allBound = attrName.isBound();
}
isNsDecl = false;
}
}
// Optional space to skip again
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
c = (int) b & 0xFF;
if (c > INT_SPACE) {
break;
}
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else if (c != INT_SPACE && c != INT_TAB) {
throwInvalidSpace(c);
}
}
if (c != INT_EQ) {
c = decodeCharForError(b);
throwUnexpectedChar(c, " expected '='");
}
// Optional space to skip again
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
c = (int) b & 0xFF;
if (c > INT_SPACE) {
break;
}
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else if (c != INT_SPACE && c != INT_TAB) {
throwInvalidSpace(c);
}
}
if (c != INT_QUOTE && c != INT_APOS) {
c = decodeCharForError(b);
throwUnexpectedChar(c, " Expected a quote");
}
/* Ok, finally: value parsing. However, ns URIs are to be handled
* different from attribute values... let's offline URIs, since
* they should be less common than attribute values.
*/
// (in both cases 'b' is the opening quote, which also ends the value)
if (isNsDecl) { // default ns, or explicit?
handleNsDeclaration(attrName, b);
++_currNsCount;
} else { // nope, a 'real' attribute:
attrPtr = collectValue(attrPtr, b, attrName);
}
}
{
// Note: this call also checks attribute uniqueness
int act = _attrCollector.finishLastValue(attrPtr);
if (act < 0) { // error, dup attr indicated by -1
act = _attrCollector.getCount(); // let's get correct count
reportInputProblem(_attrCollector.getErrorMsg());
}
_attrCount = act;
}
++_depth;
/* Was there any prefix that wasn't bound prior to use?
* That's legal, assuming declaration was found later on...
* let's check
*/
if (!allBound) {
if (!elemName.isBound()) { // element itself unbound
reportUnboundPrefix(_tokenName, false);
}
for (int i = 0, len = _attrCount; i < len; ++i) {
PName attrName = _attrCollector.getName(i);
if (!attrName.isBound()) {
reportUnboundPrefix(attrName, true);
}
}
}
return START_ELEMENT;
}
/**
* This method implements the tight loop for parsing attribute
* values. It is off-lined from the main start element method to
* simplify the main method, which makes the code more maintainable
* and possibly easier for JIT/HotSpot to optimize.
*/
private final int collectValue(int attrPtr, byte quoteByte, PName attrName)
throws XMLStreamException
{
/* Arguments, as used below:
*   attrPtr -- offset in the attribute value buffer at which chars of
*     this value are appended
*   quoteByte -- quote byte that started the value; the same byte ends it
*   attrName -- name of the attribute; given to the collector and used
*     when reporting an unexpanded entity
* Return value is the offset following the last char appended.
*/
char[] attrBuffer = _attrCollector.startNewValue(attrName, attrPtr);
final int[] TYPES = _charTypes.ATTR_CHARS;
final int quoteChar = (int) quoteByte;
value_loop:
while (true) {
int c;
// Tight loop: bytes with type code 0 are copied to output as is
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (attrPtr >= attrBuffer.length) {
attrBuffer = _attrCollector.valueBufferFull();
}
// Limit the run by both available input and free output space,
// so that the inner loop needs no per-char bounds checks
int max = _inputEnd;
{
int max2 = ptr + (attrBuffer.length - attrPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = (int) _inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
// needs special handling, done by the switch below
_inputPtr = ptr;
break ascii_loop;
}
attrBuffer[attrPtr++] = (char) c;
}
_inputPtr = ptr;
}
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
// NOTE(review): no break here; presumably the call above never
// returns normally -- confirm
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
// fall through
case XmlCharTypes.CT_WS_LF:
markLF();
// fall through
case XmlCharTypes.CT_WS_TAB:
// Plus, need to convert these all to simple space
c = INT_SPACE;
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
c = decodeUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
// NOTE(review): no 0x10000 subtraction here, unlike the entity case
// below; presumably decodeUtf8_4 returns the value already offset -- confirm
// Let's add first part right away:
attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
// make sure there is room for the second part, appended after the switch
if (attrPtr >= attrBuffer.length) {
attrBuffer = _attrCollector.valueBufferFull();
}
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
// NOTE(review): no break here either; presumably always throws -- confirm
case XmlCharTypes.CT_LT:
throwUnexpectedChar(c, "'<' not allowed in attribute value");
case XmlCharTypes.CT_AMP:
c = handleEntityInText(false);
if (c == 0) { // unexpanded general entity... not good
reportUnexpandedEntityInAttr(attrName, false);
}
// Ok; does it need a surrogate though? (over 16 bits)
if ((c >> 16) != 0) {
c -= 0x10000;
attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
if (attrPtr >= attrBuffer.length) {
attrBuffer = _attrCollector.valueBufferFull();
}
}
break;
case XmlCharTypes.CT_ATTR_QUOTE:
// Only the quote type that started the value ends it; the other
// one is regular content and gets appended below
if (c == quoteChar) {
break value_loop;
}
// default:
// Other chars are not important here...
}
// We know there's room for at least one char without checking
attrBuffer[attrPtr++] = (char) c;
}
return attrPtr;
}
Method called from the main START_ELEMENT handling loop, to
parse namespace URI values.
/**
* Method called from the main START_ELEMENT handling loop, to
* parse namespace URI values.
*/
private void handleNsDeclaration(PName name, byte quoteByte)
throws XMLStreamException
{
int attrPtr = 0;
char[] attrBuffer = _nameBuffer;
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
byte b = _inputBuffer[_inputPtr++];
if (b == quoteByte) {
break;
}
int c;
if (b == BYTE_AMP) { // entity
c = handleEntityInText(false);
if (c == 0) { // general entity; should never happen
reportUnexpandedEntityInAttr(name, true);
}
// Ok; does it need a surrogate though? (over 16 bits)
if ((c >> 16) != 0) {
if (attrPtr >= attrBuffer.length) {
_nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
}
c -= 0x10000;
attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
}
} else if (b == BYTE_LT) { // error
c = (int) b;
throwUnexpectedChar(c, "'<' not allowed in attribute value");
} else {
c = (int) b & 0xFF;
if (c < INT_SPACE) {
if (c == INT_LF) {
markLF();
} else if (c == INT_CR) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (_inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
} else {
if (c < 0) {
c = decodeMultiByteChar(c, _inputPtr);
if (c < 0) { // surrogate pair
c = -c;
// Let's add first part right away:
if (attrPtr >= attrBuffer.length) {
_nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
}
c -= 0x10000;
attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
}
} else if (c != INT_TAB) {
throwInvalidSpace(c);
}
}
}
}
if (attrPtr >= attrBuffer.length) {
_nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
}
attrBuffer[attrPtr++] = (char) c;
}
/* Simple optimization: for default ns removal (or, with
* ns 1.1, any other as well), will use empty value... no
* need to try to intern:
*/
if (attrPtr == 0) {
bindNs(name, "");
} else {
String uri = _config.canonicalizeURI(attrBuffer, attrPtr);
bindNs(name, uri);
}
}
/**
* Method called when an ampersand is encountered in a text segment.
* Method needs to determine whether it is a pre-defined or character
* entity (in which case it will be expanded into a single char or
* surrogate pair), or a general
* entity (in which case it will most likely be returned as
* ENTITY_REFERENCE event)
*
* @param inAttr True, if reference is from attribute value; false
* if from normal text content
*
* @return 0 if a general parsed entity encountered; integer
* value of a (valid) XML content character otherwise
*/
@Override
protected final int handleEntityInText(boolean inAttr)
throws XMLStreamException
{
/* Outline: first byte after the ampersand decides the path. '#' starts a
* character entity (delegated); otherwise the five pre-defined names
* (amp, apos, lt, gt, quot) are matched byte by byte. When matching
* fails, 'start' holds the bytes matched so far and 'b' the first byte
* that did not match; the rest is then parsed as a general entity name.
*/
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
byte b = _inputBuffer[_inputPtr++];
if (b == BYTE_HASH) {
return handleCharEntity();
}
String start;
if (b == BYTE_a) { // amp or apos?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_m) { // amp?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_p) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_AMP;
}
start = "amp";
} else {
start = "am";
}
} else if (b == BYTE_p) { // apos?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_o) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_s) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_APOS;
}
start = "apos";
} else {
start = "apo";
}
} else {
start = "ap";
}
} else {
start = "a";
}
} else if (b == BYTE_l) { // lt?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_t) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_LT;
}
start = "lt";
} else {
start = "l";
}
} else if (b == BYTE_g) { // gt?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_t) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_GT;
}
start = "gt";
} else {
start = "g";
}
} else if (b == BYTE_q) { // quot?
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_u) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_o) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_t) {
b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne();
if (b == BYTE_SEMICOLON) {
return INT_QUOTE;
}
start = "quot";
} else {
start = "quo";
}
} else {
start = "qu";
}
} else {
start = "q";
}
} else {
// nothing matched: 'b' itself is the first byte of the name
start = "";
}
final int[] TYPES = _charTypes.NAME_CHARS;
/* All righty: we have the beginning of the name, plus the first
* byte too. So let's see what we can do with it.
*/
char[] cbuf = _nameBuffer;
int cix = 0;
// Copy the already matched part (at most 4 chars) without a size check;
// assumes _nameBuffer holds at least that many chars -- TODO confirm
for (int len = start.length(); cix < len; ++cix) {
cbuf[cix] = start.charAt(cix);
}
//int colon = -1;
// Collect the rest of the name, up to (but not including) the semicolon
while (b != BYTE_SEMICOLON) {
boolean ok;
int c = (int) b & 0xFF;
// Has to be a valid name start char though:
switch (TYPES[c]) {
// NOTE(review): CT_NAME_NONE shares the "ok unless first char" rule with
// the two types below; if it marks bytes that are never legal in a name,
// this accepts them after the first char -- confirm intent
case XmlCharTypes.CT_NAME_NONE:
case XmlCharTypes.CT_NAME_COLON: // not ok for entities?
case XmlCharTypes.CT_NAME_NONFIRST:
ok = (cix > 0);
break;
case XmlCharTypes.CT_NAME_ANY:
ok = true;
break;
// NOTE(review): multi-byte chars are checked with is10NameStartChar
// regardless of position (cix) -- confirm this is intended
case InputCharTypes.CT_INPUT_NAME_MB_2:
c = decodeUtf8_2(c);
ok = XmlChars.is10NameStartChar(c);
break;
case InputCharTypes.CT_INPUT_NAME_MB_3:
c = decodeUtf8_3(c);
ok = XmlChars.is10NameStartChar(c);
break;
case InputCharTypes.CT_INPUT_NAME_MB_4:
c = decodeUtf8_4(c);
ok = XmlChars.is10NameStartChar(c);
if (ok) {
if (cix >= cbuf.length) {
_nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
}
// Let's add first part right away:
c -= 0x10000;
cbuf[cix++] = (char) (0xD800 | (c >> 10));
c = 0xDC00 | (c & 0x3FF);
}
break;
case InputCharTypes.CT_INPUT_NAME_MB_N:
default:
ok = false;
break;
}
if (!ok) {
reportInvalidNameChar(c, cix);
}
if (cix >= cbuf.length) {
_nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length);
}
cbuf[cix++] = (char) c;
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr++];
}
// Ok, let's construct a (temporary) entity name, then:
String pname = new String(cbuf, 0, cix);
// (note: hash is dummy... not to be compared to anything etc)
_tokenName = new PNameC(pname, null, pname, 0);
/* One more thing: do we actually allow entities in this mode
* and with this event?
*/
if (_config.willExpandEntities()) {
reportInputProblem("General entity reference (&"+pname+";) encountered in entity expanding mode: operation not (yet) implemented");
}
if (inAttr) {
reportInputProblem("General entity reference (&"+pname+";) encountered in attribute value, in non-entity-expanding mode: no way to handle it");
}
// 0 is the marker for "general entity, not expanded"; name is in _tokenName
return 0;
}
/*
/**********************************************************************
/* Internal methods, name parsing:
/**********************************************************************
*/
Parsing of public ids is bit more complicated than that of system
ids, since white space is to be coalesced.
/**
* Parsing of public ids is bit more complicated than that of system
* ids, since white space is to be coalesced.
*/
@Override
protected String parsePublicId(byte quoteChar) throws XMLStreamException
{
char[] outputBuffer = _nameBuffer;
int outPtr = 0;
final int[] TYPES = XmlCharTypes.PUBID_CHARS;
boolean addSpace = false;
main_loop:
while (true) {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
// Easier to check without char type table, first:
byte b = _inputBuffer[_inputPtr++];
if (b == quoteChar) {
break main_loop;
}
int c = (int) b & 0xFF;
if (TYPES[c] != XmlCharTypes.PUBID_OK) {
throwUnexpectedChar(c, " in public identifier");
}
// White space? Needs to be coalesced
if (c <= INT_SPACE) {
addSpace = true;
continue;
}
if (addSpace) {
if (outPtr >= outputBuffer.length) {
_nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
outPtr = 0;
}
outputBuffer[outPtr++] = ' ';
addSpace = false;
}
if (outPtr >= outputBuffer.length) {
_nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
outPtr = 0;
}
outputBuffer[outPtr++] = (char) c;
}
return new String(outputBuffer, 0, outPtr);
}
/**
 * Parses a system identifier, up to and including the closing quote.
 * Line feeds are counted, a CR (optionally followed by LF) is converted
 * to a single LF, and multi-byte UTF-8 sequences are decoded; characters
 * outside of the 16-bit range are stored as surrogate pairs.
 *<p>
 * Characters are collected into {@code _nameBuffer}, which is grown
 * (with its contents retained) when it fills up.
 *
 * @param quoteChar Quote byte that started the identifier; the same
 *   byte ends it
 *
 * @return System identifier
 *
 * @throws XMLStreamException Declared for input loading, decoding and
 *   problem reporting helpers called from here
 */
@Override
protected String parseSystemId(byte quoteChar) throws XMLStreamException
{
    // caller has init'ed the buffer...
    char[] outputBuffer = _nameBuffer;
    int outPtr = 0;
    // attribute types are closest matches, so let's use them
    final int[] TYPES = _charTypes.ATTR_CHARS;

    main_loop:
    while (true) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        int c = (int) _inputBuffer[_inputPtr++] & 0xFF;
        if (TYPES[c] != 0) {
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
                // (no break, as in the other scanning methods of this class)
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                /* Output goes to the name buffer, so that is what has
                 * to be grown here too (and not the text builder's
                 * segment, which would lose what has been collected)
                 */
                if (outPtr >= outputBuffer.length) {
                    _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
                }
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
                // (no break, as in the other scanning methods of this class)
            case XmlCharTypes.CT_ATTR_QUOTE:
                // Only the quote type that started the id ends it
                if (c == (int) quoteChar) {
                    break main_loop;
                }
            }
        }
        /* Note: growing retains the existing contents, so the output
         * pointer must NOT be reset (doing so would drop everything
         * collected so far)
         */
        if (outPtr >= outputBuffer.length) {
            _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length);
        }
        outputBuffer[outPtr++] = (char) c;
    }
    return new String(outputBuffer, 0, outPtr);
}
/*
/**********************************************************************
/* Content skipping
/**********************************************************************
*/
/**
 * Skips character content without collecting it, while still counting
 * line feeds, validating multi-byte sequences, resolving entities and
 * checking for an illegal "]]>" sequence.
 *
 * @return {@code false} when a '<' was found (it is pushed back, so that
 *   it is the next byte to read); {@code true} when an entity reference
 *   for which {@link #handleEntityInText} returned 0 was found
 */
@Override
protected final boolean skipCharacters() throws XMLStreamException
{
    final int[] charTypes = _charTypes.TEXT_CHARS;
    final byte[] buf = _inputBuffer;

    while (true) {
        int ch;

        // Fast path: run over bytes whose type code is zero
        plain_loop:
        while (true) {
            int pos = _inputPtr;
            int limit = _inputEnd;
            if (pos >= limit) {
                loadMoreGuaranteed();
                pos = _inputPtr;
                limit = _inputEnd;
            }
            for (; pos < limit; ) {
                ch = buf[pos++] & 0xFF;
                if (charTypes[ch] != 0) {
                    _inputPtr = pos;
                    break plain_loop;
                }
            }
            _inputPtr = pos;
        }

        // Slow path: the byte needs some kind of special handling
        switch (charTypes[ch]) {
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_INVALID:
            ch = handleInvalidXmlChar(ch);
            // no break (same as for other scanning methods)
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF pair counts as a single line feed
            if (buf[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            skipUtf8_2(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            skipUtf8_3(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            skipUtf8_4(ch);
            break;
        case XmlCharTypes.CT_AMP:
            ch = handleEntityInText(false);
            if (ch == 0) { // general entity that could not be expanded
                return true;
            }
            break;
        case XmlCharTypes.CT_RBRACKET: // could be part of ']]>'
            {
                /* Count all consecutive brackets (the one already read
                 * included), to see if '>' follows two or more of them
                 */
                int brackets = 1;
                byte next;
                for (;;) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    next = buf[_inputPtr];
                    if (next != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr; // bracket consumed
                    ++brackets;
                }
                if (brackets > 1 && next == BYTE_GT) {
                    reportIllegalCDataEnd();
                }
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            reportInvalidInitial(ch);
            // no break (same as for other scanning methods)
        case XmlCharTypes.CT_LT:
            // Start of some markup: leave the '<' to be read by the caller
            --_inputPtr;
            return false;
        // Other types need no handling here
        }
    }
}
/**
 * Skips the contents of a comment, up to and including the closing
 * "-->", while still counting line feeds and validating multi-byte
 * sequences. A "--" that is not followed by '>' is reported as a problem.
 */
@Override
protected final void skipComment() throws XMLStreamException
{
    final int[] charTypes = _charTypes.OTHER_CHARS;
    final byte[] buf = _inputBuffer;

    while (true) {
        int ch;

        // Fast path: run over bytes whose type code is zero
        plain_loop:
        while (true) {
            int pos = _inputPtr;
            int limit = _inputEnd;
            if (pos >= limit) {
                loadMoreGuaranteed();
                pos = _inputPtr;
                limit = _inputEnd;
            }
            for (; pos < limit; ) {
                ch = buf[pos++] & 0xFF;
                if (charTypes[ch] != 0) {
                    _inputPtr = pos;
                    break plain_loop;
                }
            }
            _inputPtr = pos;
        }

        // Slow path: the byte needs some kind of special handling
        switch (charTypes[ch]) {
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_INVALID:
            ch = handleInvalidXmlChar(ch);
            // no break (same as for other scanning methods)
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF pair counts as a single line feed
            if (buf[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            skipUtf8_2(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            skipUtf8_3(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            skipUtf8_4(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            reportInvalidInitial(ch);
            // no break (same as for other scanning methods)
        case XmlCharTypes.CT_HYPHEN: // could be start of '-->'
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) {
                // Two hyphens: this has to be the end of the comment
                ++_inputPtr;
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr++] != BYTE_GT) {
                    reportDoubleHyphenInComments();
                }
                return;
            }
            break;
        // Other types need no handling here
        }
    }
}
/**
 * Skips the contents of a CDATA section, up to and including the
 * closing "]]>", while still counting line feeds and validating
 * multi-byte sequences.
 */
@Override
protected final void skipCData() throws XMLStreamException
{
    final int[] charTypes = _charTypes.OTHER_CHARS;
    final byte[] buf = _inputBuffer;

    while (true) {
        int ch;

        // Fast path: run over bytes whose type code is zero
        plain_loop:
        while (true) {
            int pos = _inputPtr;
            int limit = _inputEnd;
            if (pos >= limit) {
                loadMoreGuaranteed();
                pos = _inputPtr;
                limit = _inputEnd;
            }
            for (; pos < limit; ) {
                ch = buf[pos++] & 0xFF;
                if (charTypes[ch] != 0) {
                    _inputPtr = pos;
                    break plain_loop;
                }
            }
            _inputPtr = pos;
        }

        // Slow path: the byte needs some kind of special handling
        switch (charTypes[ch]) {
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_INVALID:
            ch = handleInvalidXmlChar(ch);
            // no break (same as for other scanning methods)
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF pair counts as a single line feed
            if (buf[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            skipUtf8_2(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            skipUtf8_3(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            skipUtf8_4(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            reportInvalidInitial(ch);
            // no break (same as for other scanning methods)
        case XmlCharTypes.CT_RBRACKET: // could be start of ']]>'
            {
                /* Read on until something other than a bracket is seen;
                 * counter includes the bracket that got us here
                 */
                int brackets = 0;
                byte next;
                do {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    ++brackets;
                    next = _inputBuffer[_inputPtr++];
                } while (next == BYTE_RBRACKET);

                if (next != BYTE_GT) {
                    --_inputPtr; // not ours: push back
                } else if (brackets > 1) {
                    return; // end marker found
                }
                // otherwise it was a plain ']>', which is just skipped
            }
            break;
        // Other types need no handling here
        }
    }
}
/**
 * Skips the contents of a processing instruction, up to and including
 * the closing "?>", while still counting line feeds and validating
 * multi-byte sequences.
 */
@Override
protected final void skipPI() throws XMLStreamException
{
    final int[] charTypes = _charTypes.OTHER_CHARS;
    final byte[] buf = _inputBuffer;

    while (true) {
        int ch;

        // Fast path: run over bytes whose type code is zero
        plain_loop:
        while (true) {
            int pos = _inputPtr;
            int limit = _inputEnd;
            if (pos >= limit) {
                loadMoreGuaranteed();
                pos = _inputPtr;
                limit = _inputEnd;
            }
            for (; pos < limit; ) {
                ch = buf[pos++] & 0xFF;
                if (charTypes[ch] != 0) {
                    _inputPtr = pos;
                    break plain_loop;
                }
            }
            _inputPtr = pos;
        }

        // Slow path: the byte needs some kind of special handling
        switch (charTypes[ch]) {
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_INVALID:
            ch = handleInvalidXmlChar(ch);
            // no break (same as for other scanning methods)
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // CR+LF pair counts as a single line feed
            if (buf[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            skipUtf8_2(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            skipUtf8_3(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            skipUtf8_4(ch);
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            reportInvalidInitial(ch);
            // no break (same as for other scanning methods)
        case XmlCharTypes.CT_QMARK: // could be start of '?>'
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == BYTE_GT) {
                ++_inputPtr;
                return;
            }
            break;
        // Other types need no handling here
        }
    }
}
/**
 * Skips white space, stopping at the first byte above the space
 * character (which is left unread), or when no more input can be loaded.
 * Line feeds are counted; a byte at or below space that is not a space,
 * tab, CR or LF is reported as invalid.
 */
@Override
protected final void skipSpace() throws XMLStreamException
{
    /* Note: the white space that led us here has been checked already,
     * and need not be considered.
     * A local offset is used while scanning, and only written back at
     * the end (or just before reporting a problem).
     */
    int pos = _inputPtr;

    for (;;) {
        if (pos >= _inputEnd) {
            if (!loadMore()) {
                break;
            }
            pos = _inputPtr;
        }
        final int ch = _inputBuffer[pos] & 0xFF;
        if (ch > INT_SPACE) { // !!! TODO: xml 1.1 ws
            break;
        }
        ++pos;

        if (ch == INT_SPACE || ch == INT_TAB) {
            continue;
        }
        if (ch == INT_LF) {
            markLF(pos);
        } else if (ch == INT_CR) {
            if (pos >= _inputEnd) {
                if (!loadMore()) {
                    break;
                }
                pos = _inputPtr;
            }
            // CR+LF pair counts as a single line feed
            if (_inputBuffer[pos] == BYTE_LF) {
                ++pos;
            }
            markLF(pos);
        } else {
            _inputPtr = pos;
            throwInvalidSpace(ch);
        }
    }
    _inputPtr = pos;
}
/*
private final int skipMultiByteChar(int c, int ptr)
throws XMLStreamException
{
int needed;
// Ok; if we end here, we got multi-byte combination
if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
needed = 1;
} else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
needed = 2;
} else if ((c & 0xF8) == 0xF0) {
// 4 bytes; double-char with surrogates and all...
needed = 3;
} else {
reportInvalidInitial(c & 0xFF);
needed = 1; // never gets here
}
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
if (needed > 1) { // needed == 1 means 2 bytes total
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
if (needed > 2) { // 4 bytes? (need surrogates)
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
}
}
return ptr;
}
private final int skipMultiByteChar(int c, int type, int ptr)
throws XMLStreamException
{
type -= XmlCharTypes.CT_MULTIBYTE_N; // number of more bytes needed
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
if (type > 1) { // needed == 1 means 2 bytes total
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
if (type > 2) { // 4 bytes? (need surrogates)
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
c = (int) _inputBuffer[ptr++];
if ((c & 0xC0) != 0x080) {
reportInvalidOther(c & 0xFF, ptr);
}
}
}
return ptr;
}
*/
/**
 * Skips the second byte of a 2-byte UTF-8 sequence, after verifying
 * that it is a continuation byte (top two bits are '10').
 *
 * @param c First byte of the sequence (not needed for the check)
 */
private final void skipUtf8_2(int c)
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    final int second = _inputBuffer[_inputPtr++];
    if ((second & 0xC0) != 0x080) {
        reportInvalidOther(second & 0xFF, _inputPtr);
    }
}
/**
 * Skips the second and third byte of a 3-byte UTF-8 sequence.
 *<p>
 * Alas, skipping can not be heavily optimized, since validity checks
 * still have to be done: both bytes must be continuation bytes, and if
 * the low 4 bits of the first byte are 0xD or above, the decoded
 * character is also checked against the surrogate range and
 * 0xFFFE/0xFFFF.
 *
 * @param c First byte of the sequence
 */
private final void skipUtf8_3(int c)
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    final int highBits = c & 0x0F;

    final int second = _inputBuffer[_inputPtr++];
    if ((second & 0xC0) != 0x080) {
        reportInvalidOther(second & 0xFF, _inputPtr);
    }
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    final int third = _inputBuffer[_inputPtr++];
    if ((third & 0xC0) != 0x080) {
        reportInvalidOther(third & 0xFF, _inputPtr);
    }

    // Below 0xD000 there is nothing further to verify
    if (highBits < 0xD) {
        return;
    }
    final int code = (((highBits << 6) | (second & 0x3F)) << 6) | (third & 0x3F);
    // 0xD800-0xDFFF (surrogates) and 0xFFFE-0xFFFF are illegal
    if (code >= 0xD800) {
        if (code < 0xE000 || (code >= 0xFFFE && code <= 0xFFFF)) {
            handleInvalidXmlChar(code);
        }
    }
}
/**
 * Skips the three trailing bytes of a 4-byte UTF-8 sequence, verifying
 * that each one is a continuation byte. Bytes are read without bounds
 * checks when at least 4 bytes are known to be buffered; otherwise the
 * work is handed to {@link #skipUtf8_4Slow}.
 *
 * @param c First byte of the sequence
 */
private final void skipUtf8_4(int c) throws XMLStreamException
{
    if ((_inputPtr + 4) > _inputEnd) {
        // may cross the end of buffered input: use the careful version
        skipUtf8_4Slow(c);
        return;
    }
    for (int i = 0; i < 3; ++i) {
        final int next = _inputBuffer[_inputPtr++];
        if ((next & 0xC0) != 0x080) {
            reportInvalidOther(next & 0xFF, _inputPtr);
        }
    }
}
/**
 * Variant of {@link #skipUtf8_4} that makes sure input is available
 * before reading each of the three trailing bytes; each byte is verified
 * to be a continuation byte.
 *
 * @param c First byte of the sequence (not needed for the checks)
 */
private final void skipUtf8_4Slow(int c) throws XMLStreamException
{
    for (int i = 0; i < 3; ++i) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        final int next = _inputBuffer[_inputPtr++];
        if ((next & 0xC0) != 0x080) {
            reportInvalidOther(next & 0xFF, _inputPtr);
        }
    }
}
/*
/**********************************************************************
/* Content parsing
/**********************************************************************
*/
/**
* Reads the contents of a CDATA section into {@code _textBuilder}, up to
* and including the closing "]]>" (which is consumed, but not stored).
* A CR, optionally followed by LF, is stored as a single LF; multi-byte
* UTF-8 sequences are decoded, with 4-byte sequences stored as two chars.
* If coalescing is enabled and no entity is pending,
* {@code finishCoalescedText()} is called at the end.
*
* @throws XMLStreamException Declared for input loading, decoding and
*   problem reporting helpers called from here
*/
@Override
protected final void finishCData()
throws XMLStreamException
{
final int[] TYPES = _charTypes.OTHER_CHARS;
final byte[] inputBuffer = _inputBuffer;
char[] outputBuffer = _textBuilder.resetWithEmpty();
int outPtr = 0;
/* At this point, space (if any) has been skipped, and we are
* to parse and store the contents
*/
main_loop:
while (true) {
int c;
// Then the tight ascii non-funny-char loop:
ascii_loop:
while (true) {
int ptr = _inputPtr;
if (ptr >= _inputEnd) {
loadMoreGuaranteed();
ptr = _inputPtr;
}
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
// Limit the run by both available input and free output space,
// so that the inner loop needs no per-char bounds checks
int max = _inputEnd;
{
int max2 = ptr + (outputBuffer.length - outPtr);
if (max2 < max) {
max = max2;
}
}
while (ptr < max) {
c = (int) inputBuffer[ptr++] & 0xFF;
if (TYPES[c] != 0) {
// needs special handling, done by the switch below
_inputPtr = ptr;
break ascii_loop;
}
outputBuffer[outPtr++] = (char) c;
}
_inputPtr = ptr;
}
// And then exceptions:
switch (TYPES[c]) {
case XmlCharTypes.CT_INVALID:
c = handleInvalidXmlChar(c);
// NOTE(review): no break here; presumably the call above never
// returns normally -- confirm
case XmlCharTypes.CT_WS_CR:
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
if (inputBuffer[_inputPtr] == BYTE_LF) {
++_inputPtr;
}
markLF();
// CR and CR+LF are both output as a single LF
c = INT_LF;
break;
case XmlCharTypes.CT_WS_LF:
markLF();
break;
case XmlCharTypes.CT_MULTIBYTE_2:
c = decodeUtf8_2(c);
break;
case XmlCharTypes.CT_MULTIBYTE_3:
c = decodeUtf8_3(c);
break;
case XmlCharTypes.CT_MULTIBYTE_4:
c = decodeUtf8_4(c);
// NOTE(review): no 0x10000 subtraction before splitting into surrogates;
// presumably decodeUtf8_4 returns the value already offset -- confirm
// Let's add first part right away:
outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
// make sure there is room for the second part, appended after the switch
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
c = 0xDC00 | (c & 0x3FF);
// And let the other char output down below
break;
case XmlCharTypes.CT_MULTIBYTE_N:
reportInvalidInitial(c);
// NOTE(review): no break here either; presumably always throws -- confirm
case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
/* Ok: let's just parse all consequtive right brackets,
* and see if followed by greater-than char. This because
* we can only push back at most one char at a time, and
* thus can't easily just check a subset
*/
int count = 0; // ignoring first one
byte b;
do {
if (_inputPtr >= _inputEnd) {
loadMoreGuaranteed();
}
b = _inputBuffer[_inputPtr];
if (b != BYTE_RBRACKET) {
break;
}
++_inputPtr;
++count;
} while (true);
// Was the marker found?
boolean ok = (b == BYTE_GT && count >= 1);
if (ok) {
// last of the additional brackets belongs to the end marker, as
// does the first bracket (which is still in 'c', and not output)
--count;
}
// Brackets to copy to output?
for (; count > 0; --count) {
outputBuffer[outPtr++] = ']';
if (outPtr >= outputBuffer.length) {
outputBuffer = _textBuilder.finishCurrentSegment();
outPtr = 0;
}
}
if (ok) {
++_inputPtr; // to consume '>'
break main_loop;
}
// no marker: first bracket (in 'c') is output below, as regular content
break;
}
// Ok, can output the char; there's room for one char at least
outputBuffer[outPtr++] = (char) c;
}
_textBuilder.setCurrentLength(outPtr);
/* 03-Feb-2009, tatu: To support coalescing mode, may need to
* do some extra work
*/
if (_cfgCoalescing && !_entityPending) {
finishCoalescedText();
}
}
/**
 * Reads the rest of a CHARACTERS segment into the text builder.
 * <p>
 * The first character comes from {@code _tmpChar}: a negative value is a
 * negated character from an entity and is stored directly (as a surrogate
 * pair if it does not fit in 16 bits). A CR or LF is handed to
 * {@code checkInTreeIndentation}; a negative result from that call ends
 * this method immediately. Reading then continues until a {@code '<'}
 * (which is left unconsumed) or an unexpandable entity (which sets
 * {@code _entityPending}) is met. When coalescing is enabled and no entity
 * is pending, {@link #finishCoalescedText()} is invoked afterwards.
 *
 * @throws XMLStreamException propagated from the loading, decoding, entity
 *    handling and error reporting helpers this method calls
 */
@Override
protected final void finishCharacters() throws XMLStreamException
{
    int outPtr;
    int c;
    char[] outputBuffer;

    // Ok, so what was the first char / entity?
    c = _tmpChar;
    if (c < 0) { // from entity; can just copy as is
        c = -c;
        outputBuffer = _textBuilder.resetWithEmpty();
        outPtr = 0;
        if ((c >> 16) != 0) { // surrogate pair?
            c -= 0x10000;
            /* Note: after resetting the buffer, it's known to have
             * space for more than 2 chars we need to add
             */
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            c = 0xDC00 | (c & 0x3FF);
        }
        outputBuffer[outPtr++] = (char) c;
    } else { // white space that we are interested in?
        if (c == INT_CR || c == INT_LF) {
            ++_inputPtr; // wasn't advanced yet, in this case
            outPtr = checkInTreeIndentation(c);
            if (outPtr < 0) {
                return;
            }
            // Above call also initializes the text builder appropriately
            outputBuffer = _textBuilder.getBufferWithoutReset();
        } else {
            outputBuffer = _textBuilder.resetWithEmpty();
            outPtr = 0;
        }
    }

    final int[] TYPES = _charTypes.TEXT_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    main_loop:
    while (true) {
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Copy is bounded by both remaining input and free output space
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then fallback for funny chars / UTF-8 multibytes:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            c = handleInvalidXmlChar(c);
            /* If the handler returns (a replacement char) rather than
             * throwing, that char must be output as is; it must not fall
             * into the CR handling below.
             */
            break;
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            // fast variant skips the buffer-boundary checks, so it is only
            // usable when both continuation bytes are already loaded
            if ((_inputEnd - _inputPtr) >= 2) {
                c = decodeUtf8_3fast(c);
            } else {
                c = decodeUtf8_3(c);
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            c = decodeUtf8_4(c);
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): presumably always throws, making the
            // fall-through below unreachable -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_LT:
            --_inputPtr;
            break main_loop;
        case XmlCharTypes.CT_AMP:
            c = handleEntityInText(false);
            if (c == 0) { // unexpandable general parsed entity
                // _inputPtr set by entity expansion method
                _entityPending = true;
                break main_loop;
            }
            // Ok; does it need a surrogate though? (over 16 bits)
            if ((c >> 16) != 0) {
                c -= 0x10000;
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                // Need to ensure room for one more char
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
            }
            break;
        case XmlCharTypes.CT_RBRACKET: // ']]>'?
            {
                // Let's then just count number of brackets --
                // in case they are not followed by '>'
                int count = 1;
                byte b;
                while (true) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    b = inputBuffer[_inputPtr];
                    if (b != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr; // to skip past bracket
                    ++count;
                }
                if (b == BYTE_GT && count > 1) {
                    reportIllegalCDataEnd();
                }
                // Nope. Need to output all brackets, then; except
                // for one that can be left for normal output
                while (count > 1) {
                    outputBuffer[outPtr++] = ']';
                    // Need to ensure room for one more char
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                    --count;
                }
            }
            // Can just output the first ']' along normal output
            break;

        // default:
            // Other types are not important here...
        }
        // We know there's room for one more:
        outputBuffer[outPtr++] = (char) c;
    }
    _textBuilder.setCurrentLength(outPtr);
    /* 03-Feb-2009, tatu: To support coalescing mode, may need to
     *   do some extra work
     */
    if (_cfgCoalescing && !_entityPending) {
        finishCoalescedText();
    }
}
/**
 * Reads the contents of a comment into the text builder, up to and
 * including the closing {@code -->} (which is consumed but not stored).
 * <p>
 * CR and CR+LF are normalized to LF and multi-byte UTF-8 sequences are
 * decoded (4-byte sequences are stored as a surrogate pair). A double
 * hyphen that is not followed by {@code '>'} is reported through
 * {@code reportDoubleHyphenInComments}.
 *
 * @throws XMLStreamException propagated from the loading, decoding and
 *    error reporting helpers this method calls
 */
@Override
protected final void finishComment() throws XMLStreamException
{
    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;
    char[] outputBuffer = _textBuilder.resetWithEmpty();
    int outPtr = 0;

    main_loop:
    while (true) {
        int c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Copy is bounded by both remaining input and free output space
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }

        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            c = handleInvalidXmlChar(c);
            /* If the handler returns (a replacement char) rather than
             * throwing, that char must be output as is; it must not fall
             * into the CR handling below.
             */
            break;
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            c = decodeUtf8_4(c);
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): presumably always throws, making the
            // fall-through below unreachable -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_HYPHEN: // '-->'?
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then
                ++_inputPtr;
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr++] != BYTE_GT) {
                    reportDoubleHyphenInComments();
                }
                break main_loop;
            }
            // single hyphen: plain content, output below
            break;
        // default:
            // Other types are not important here...
        }

        // Ok, can output the char (we know there's room for one more)
        outputBuffer[outPtr++] = (char) c;
    }
    _textBuilder.setCurrentLength(outPtr);
}
When this method gets called we know that we have an internal subset,
and that the opening '[' has already been read.
/**
 * When this method gets called we know that we have an internal subset,
 * and that the opening '[' has already been read.
 * <p>
 * Scans to the {@code ']'} that closes the subset, i.e. the first one seen
 * outside of any declaration and outside of any quoted string, and then
 * requires (after optional white space) a closing {@code '>'}.
 * CR and CR+LF are normalized to LF and multi-byte UTF-8 sequences are
 * decoded along the way.
 *
 * @param copyContents if true, the subset contents are stored in the text
 *    builder; if false they are only scanned
 *
 * @throws XMLStreamException if the closing {@code '>'} is missing, or as
 *    propagated from the loading, decoding and error reporting helpers
 */
@Override
protected final void finishDTD(boolean copyContents) throws XMLStreamException
{
    char[] outputBuffer = copyContents ? _textBuilder.resetWithEmpty() : null;
    int outPtr = 0;

    final int[] TYPES = _charTypes.DTD_CHARS;
    boolean inDecl = false; // in declaration/directive?
    int quoteChar = 0; // inside quoted string?

    main_loop:
    while (true) {
        int c;

        /* First we'll have a quickie loop for speeding through
         * uneventful chars...
         */
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            int max = _inputEnd;
            // Output space only limits the run when contents are copied
            if (outputBuffer != null) {
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
            }
            while (ptr < max) {
                c = (int) _inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                if (outputBuffer != null) {
                    outputBuffer[outPtr++] = (char) c;
                }
            }
            _inputPtr = ptr;
        }

        switch (TYPES[c]) {

            // First, common types

        case XmlCharTypes.CT_INVALID:
            c = handleInvalidXmlChar(c);
            /* If the handler returns (a replacement char) rather than
             * throwing, that char must be output as is; it must not fall
             * into the CR handling below.
             */
            break;
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            c = decodeUtf8_4(c);
            if (outputBuffer != null) {
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // And let the other char output down below
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): presumably always throws, making the
            // fall-through below unreachable -- confirm
            reportInvalidInitial(c);

            // Then DTD-specific types:

        case XmlCharTypes.CT_DTD_QUOTE: // apos or quot
            // Opens a quoted string, or closes it if it matches the opener
            if (quoteChar == 0) {
                quoteChar = c;
            } else {
                if (quoteChar == c) {
                    quoteChar = 0;
                }
            }
            break;

        case XmlCharTypes.CT_DTD_LT:
            if (!inDecl) {
                inDecl = true;
            }
            break;
        case XmlCharTypes.CT_DTD_GT:
            // '>' inside a quoted string does not end the declaration
            if (quoteChar == 0) {
                inDecl = false;
            }
            break;
        case XmlCharTypes.CT_DTD_RBRACKET:
            // Only a top-level, unquoted ']' ends the internal subset
            if (!inDecl && quoteChar == 0) {
                break main_loop;
            }
            break;
        // default:
            // Other types are not important here...
        }

        if (outputBuffer != null) { // will have room for one more
            outputBuffer[outPtr++] = (char) c;
        }
    }
    if (outputBuffer != null) {
        _textBuilder.setCurrentLength(outPtr);
    }

    // but still need to match the '>'...
    byte b = skipInternalWs(false, null);
    if (b != BYTE_GT) {
        throwUnexpectedChar(decodeCharForError(b), " expected '>' after the internal subset");
    }
}
/**
 * Reads the data part of a processing instruction into the text builder,
 * up to and including the closing {@code ?>} (which is consumed but not
 * stored).
 * <p>
 * CR and CR+LF are normalized to LF and multi-byte UTF-8 sequences are
 * decoded (4-byte sequences are stored as a surrogate pair). A
 * {@code '?'} that is not followed by {@code '>'} is ordinary content.
 *
 * @throws XMLStreamException propagated from the loading, decoding and
 *    error reporting helpers this method calls
 */
@Override
protected final void finishPI() throws XMLStreamException
{
    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;
    char[] outputBuffer = _textBuilder.resetWithEmpty();
    int outPtr = 0;

    /* At this point, space (if any) has been skipped, and we are
     * to parse and store the contents
     */
    main_loop:
    while (true) {
        int c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Copy is bounded by both remaining input and free output space
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then exceptions:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            c = handleInvalidXmlChar(c);
            /* If the handler returns (a replacement char) rather than
             * throwing, that char must be output as is; it must not fall
             * into the CR handling below.
             */
            break;
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            c = decodeUtf8_4(c);
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): presumably always throws, making the
            // fall-through below unreachable -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_QMARK:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (_inputBuffer[_inputPtr] == BYTE_GT) { // ok, the end!
                ++_inputPtr;
                break main_loop;
            }
            // Not end mark: the '?' is output below, and the following
            // byte (not consumed here) is handled on the next iteration
        // default:
            // Other types are not important here...
        }

        // Ok, can output the char (we know there's room for one more)
        outputBuffer[outPtr++] = (char) c;
    }
    _textBuilder.setCurrentLength(outPtr);
}
Note: this method is only called in cases where it is known
that only space chars are legal. Thus, encountering a non-space
is an error (WFC or VC). However, an end-of-input is ok.
/**
 * Collects a run of white space into the text builder.
 * <p>
 * Note: this method is only called in cases where it is known
 * that only space chars are legal. Thus, encountering a non-space
 * is an error (WFC or VC). However, an end-of-input is ok.
 * <p>
 * CR and CR+LF are stored as a single LF. Scanning stops, without
 * consuming it, at the first byte above the space character.
 *
 * @throws XMLStreamException if a control character other than
 *    space, tab, CR or LF is met, or as propagated from input loading
 */
@Override
protected final void finishSpace() throws XMLStreamException
{
    /* _tmpChar contains the first space char. If it looks
     * like indentation, we can probably optimize a bit...
     */
    final int first = _tmpChar;
    char[] out;
    int outLen;

    if (first != BYTE_CR && first != BYTE_LF) {
        out = _textBuilder.resetWithEmpty();
        out[0] = (char) first;
        outLen = 1;
    } else {
        outLen = checkPrologIndentation(first);
        if (outLen < 0) {
            return;
        }
        // The indentation check has also set up the text builder
        out = _textBuilder.getBufferWithoutReset();
    }

    int inPtr = _inputPtr;

    for (;;) {
        if (inPtr >= _inputEnd) {
            if (!loadMore()) {
                break;
            }
            inPtr = _inputPtr;
        }
        int ch = _inputBuffer[inPtr] & 0xFF;
        // !!! TODO: check for xml 1.1 whitespace?
        if (ch > INT_SPACE) {
            break;
        }
        ++inPtr;

        if (ch == INT_CR) {
            if (inPtr >= _inputEnd) {
                if (!loadMore()) {
                    // Input ended right after the CR: still owe the linefeed
                    if (outLen >= out.length) {
                        out = _textBuilder.finishCurrentSegment();
                        outLen = 0;
                    }
                    out[outLen++] = '\n';
                    break;
                }
                inPtr = _inputPtr;
            }
            if (_inputBuffer[inPtr] == BYTE_LF) {
                ++inPtr;
            }
            markLF(inPtr);
            ch = INT_LF; // canonical linefeed
        } else if (ch == INT_LF) {
            markLF(inPtr);
        } else if (ch != INT_SPACE && ch != INT_TAB) {
            _inputPtr = inPtr;
            throwInvalidSpace(ch);
        }

        // Store the (possibly normalized) char
        if (outLen >= out.length) {
            out = _textBuilder.finishCurrentSegment();
            outLen = 0;
        }
        out[outLen++] = (char) ch;
    }

    _inputPtr = inPtr;
    _textBuilder.setCurrentLength(outLen);
}
/*
/**********************************************************************
/* 2nd level parsing/skipping for coalesced text
/**********************************************************************
*/
Method that gets called after a primary text segment (of type
CHARACTERS or CDATA, not applicable to SPACE) has been read in
text buffer. Method has to see if the following event would
be textual as well, and if so, read it (and any other following
textual segments).
/**
 * Called once a primary text segment (CHARACTERS or CDATA; never SPACE)
 * is in the text buffer. Keeps appending whatever textual content follows
 * directly -- further character data and CDATA sections -- and returns
 * when other markup, an unexpandable entity or the end of input is reached.
 *
 * @throws XMLStreamException if something starting with {@code <![} is
 *    not a CDATA section, or as propagated from the called helpers
 */
protected final void finishCoalescedText()
    throws XMLStreamException
{
    for (;;) {
        // At least one byte is needed to decide anything
        if (_inputPtr >= _inputEnd && !loadMore()) {
            // most likely an error, will be handled later on
            return;
        }

        if (_inputBuffer[_inputPtr] != BYTE_LT) {
            // textual (or entity, error etc)
            finishCoalescedCharacters();
            if (_entityPending) {
                return;
            }
            continue;
        }

        /* Markup of some kind. In the worst case 3 chars ("<![") are
         * needed to know whether a CDATA section follows.
         */
        if ((_inputPtr + 3) >= _inputEnd && !loadAndRetain(3)) {
            // probably an error, but will be handled later
            return;
        }
        final boolean maybeCData = (_inputBuffer[_inputPtr+1] == BYTE_EXCL)
                && (_inputBuffer[_inputPtr+2] == BYTE_LBRACKET);
        if (!maybeCData) {
            return;
        }

        // Looks like CDATA; verify the rest of the start marker
        _inputPtr += 3;
        for (int i = 0; i < 6; ++i) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            final byte actual = _inputBuffer[_inputPtr++];
            final char expected = CDATA_STR.charAt(i);
            if (actual != (byte) expected) {
                int ch = decodeCharForError(actual);
                reportTreeUnexpChar(ch, " (expected '"+expected+"' for CDATA section)");
            }
        }
        finishCoalescedCData();
    }
}
/**
 * Appends a run of character data to the text already held by the text
 * builder (coalescing mode). Stops at a {@code '<'}, which is left
 * unconsumed, or at an unexpandable entity, which sets
 * {@code _entityPending}.
 * <p>
 * Note: code mostly copied from 'finishCharacters', just simplified
 * in some places.
 *
 * @throws XMLStreamException propagated from the loading, decoding, entity
 *    handling and error reporting helpers this method calls
 */
protected final void finishCoalescedCharacters()
    throws XMLStreamException
{
    // first char can't be from (char) entity (wrt finishCharacters)

    final int[] TYPES = _charTypes.TEXT_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    // Continue writing where the previous segment left off
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    int outPtr = _textBuilder.getCurrentLength();

    int c;

    main_loop:
    while (true) {
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Copy is bounded by both remaining input and free output space
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then fallback for funny chars / UTF-8 multibytes:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            c = handleInvalidXmlChar(c);
            /* If the handler returns (a replacement char) rather than
             * throwing, that char must be output as is; it must not fall
             * into the CR handling below.
             */
            break;
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            // fast variant skips the buffer-boundary checks, so it is only
            // usable when both continuation bytes are already loaded
            if ((_inputEnd - _inputPtr) >= 2) {
                c = decodeUtf8_3fast(c);
            } else {
                c = decodeUtf8_3(c);
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            c = decodeUtf8_4(c);
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): presumably always throws, making the
            // fall-through below unreachable -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_LT:
            --_inputPtr;
            break main_loop;
        case XmlCharTypes.CT_AMP:
            c = handleEntityInText(false);
            if (c == 0) { // unexpandable general parsed entity
                // _inputPtr set by entity expansion method
                _entityPending = true;
                break main_loop;
            }
            // Ok; does it need a surrogate though? (over 16 bits)
            if ((c >> 16) != 0) {
                c -= 0x10000;
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                // Need to ensure room for one more char
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
            }
            break;
        case XmlCharTypes.CT_RBRACKET: // ']]>'?
            {
                // Let's then just count number of brackets --
                // in case they are not followed by '>'
                int count = 1;
                byte b;
                while (true) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    b = inputBuffer[_inputPtr];
                    if (b != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr; // to skip past bracket
                    ++count;
                }
                if (b == BYTE_GT && count > 1) {
                    reportIllegalCDataEnd();
                }
                // Nope. Need to output all brackets, then; except
                // for one that can be left for normal output
                while (count > 1) {
                    outputBuffer[outPtr++] = ']';
                    // Need to ensure room for one more char
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                    --count;
                }
            }
            // Can just output the first ']' along normal output
            break;

        // default:
            // Other types are not important here...
        }
        // We know there's room for one more:
        outputBuffer[outPtr++] = (char) c;
    }
    _textBuilder.setCurrentLength(outPtr);
}
/**
 * Appends the contents of a CDATA section to the text already held by the
 * text builder (coalescing mode), up to and including the terminating
 * {@code ]]>} marker (which is consumed but not stored).
 * <p>
 * Note: code mostly copied from 'finishCData'; the difference is that the
 * text builder is continued rather than reset.
 *
 * @throws XMLStreamException propagated from the loading, decoding and
 *    error reporting helpers this method calls
 */
protected final void finishCoalescedCData()
    throws XMLStreamException
{
    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    // Continue writing where the previous segment left off
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    int outPtr = _textBuilder.getCurrentLength();

    /* At this point, space (if any) has been skipped, and we are
     * to parse and store the contents
     */
    main_loop:
    while (true) {
        int c;
        // Then the tight ascii non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                loadMoreGuaranteed();
                ptr = _inputPtr;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Copy is bounded by both remaining input and free output space
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then exceptions:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            c = handleInvalidXmlChar(c);
            /* If the handler returns (a replacement char) rather than
             * throwing, that char must be output as is; it must not fall
             * into the CR handling below.
             */
            break;
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            c = decodeUtf8_4(c);
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): presumably always throws, making the
            // fall-through below unreachable -- confirm
            reportInvalidInitial(c);
        case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
            /* Ok: let's just parse all consecutive right brackets,
             * and see if followed by greater-than char. This because
             * we can only push back at most one char at a time, and
             * thus can't easily just check a subset
             */
            int count = 0; // ignoring first one
            byte b;
            do {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                b = _inputBuffer[_inputPtr];
                if (b != BYTE_RBRACKET) {
                    break;
                }
                ++_inputPtr;
                ++count;
            } while (true);
            // Was the marker found?
            boolean ok = (b == BYTE_GT && count >= 1);
            if (ok) {
                // last two brackets belong to the marker; the first one
                // (held in 'c') is never output in that case
                --count;
            }
            // Brackets to copy to output?
            for (; count > 0; --count) {
                outputBuffer[outPtr++] = ']';
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
            }
            if (ok) {
                ++_inputPtr; // to consume '>'
                break main_loop;
            }
            break;
        }
        // Ok, can output the char; there's room for one char at least
        outputBuffer[outPtr++] = (char) c;
    }
    _textBuilder.setCurrentLength(outPtr);
}
Method that gets called after a primary text segment (of type
CHARACTERS or CDATA, not applicable to SPACE) has been skipped.
Method has to see if the following event would
be textual as well, and if so, skip it (and any other following
textual segments).
Returns: True if we encountered an unexpandable entity
/**
 * Called once a primary text segment (CHARACTERS or CDATA; never SPACE)
 * has been skipped. Keeps skipping whatever textual content follows
 * directly -- further character data and CDATA sections -- until other
 * markup, an unexpandable entity or the end of input is reached.
 *
 * @return True if we encountered an unexpandable entity
 *
 * @throws XMLStreamException if something starting with {@code <![} is
 *    not a CDATA section, or as propagated from the called helpers
 */
@Override
protected final boolean skipCoalescedText()
    throws XMLStreamException
{
    for (;;) {
        // At least one byte is needed to decide anything
        if (_inputPtr >= _inputEnd && !loadMore()) {
            // most likely an error, will be handled later on
            return false;
        }

        if (_inputBuffer[_inputPtr] != BYTE_LT) {
            // textual (or entity, error etc)
            if (skipCharacters()) {
                return true;
            }
            continue;
        }

        /* Markup of some kind. In the worst case 3 chars ("<![") are
         * needed to know whether a CDATA section follows.
         */
        if ((_inputPtr + 3) >= _inputEnd && !loadAndRetain(3)) {
            // probably an error, but will be handled later
            return false;
        }
        final boolean maybeCData = (_inputBuffer[_inputPtr+1] == BYTE_EXCL)
                && (_inputBuffer[_inputPtr+2] == BYTE_LBRACKET);
        if (!maybeCData) {
            return false;
        }

        // Looks like CDATA; verify the rest of the start marker
        _inputPtr += 3;
        for (int i = 0; i < 6; ++i) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            final byte actual = _inputBuffer[_inputPtr++];
            final char expected = CDATA_STR.charAt(i);
            if (actual != (byte) expected) {
                int ch = decodeCharForError(actual);
                reportTreeUnexpChar(ch, " (expected '"+expected+"' for CDATA section)");
            }
        }
        skipCData();
    }
}
/*
/**********************************************************************
/* Other methods, utf-decoding
/**********************************************************************
*/
Returns: Either decoded character (if positive int); or negated
value of a high-order char (one that needs surrogate pair)
/**
 * Decodes a multi-byte UTF-8 character whose lead byte has already been
 * read, taking continuation bytes from the input buffer starting at
 * {@code ptr}. On return {@code _inputPtr} points past the last byte used.
 *
 * @param c lead byte of the sequence
 * @param ptr input buffer offset of the first continuation byte
 *
 * @return Either decoded character (if positive int); or negated
 *   value of a high-order char (one that needs surrogate pair)
 *
 * @throws XMLStreamException for an invalid lead or continuation byte, or
 *   as propagated from input loading
 */
private final int decodeMultiByteChar(int c, int ptr)
    throws XMLStreamException
{
    // Number of continuation bytes that follow the lead byte
    final int needed;

    if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
        needed = 1;
        c &= 0x1F;
    } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
        needed = 2;
        c &= 0x0F;
    } else if ((c & 0xF8) == 0xF0) {
        // 4 bytes; double-char with surrogates and all...
        needed = 3;
        c &= 0x07;
    } else {
        reportInvalidInitial(c & 0xFF);
        needed = 1; // never gets here
    }

    for (int i = 0; i < needed; ++i) {
        if (ptr >= _inputEnd) {
            loadMoreGuaranteed();
            ptr = _inputPtr;
        }
        final int next = _inputBuffer[ptr++];
        if ((next & 0xC0) != 0x80) {
            reportInvalidOther(next & 0xFF, ptr);
        }
        c = (c << 6) | (next & 0x3F);
    }

    /* A 4-byte sequence needs a surrogate pair: signal that by
     * negating the value (to make comparison easier)
     */
    if (needed > 2) {
        c = -c;
    }
    _inputPtr = ptr;
    return c;
}
/**
 * Decodes a 2-byte UTF-8 sequence, reading the one continuation byte from
 * the current input position (loading more input first if needed).
 *
 * @param c lead byte of the sequence
 *
 * @return the decoded character
 *
 * @throws XMLStreamException if the next byte is not a continuation byte,
 *   or as propagated from input loading
 */
private final int decodeUtf8_2(int c)
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    final int next = _inputBuffer[_inputPtr++];
    // continuation bytes look like 10xxxxxx
    if ((next & 0xC0) != 0x80) {
        reportInvalidOther(next & 0xFF, _inputPtr);
    }
    final int high = (c & 0x1F) << 6;
    return high | (next & 0x3F);
}
/**
 * Decodes a 3-byte UTF-8 sequence, reading both continuation bytes from
 * the current input position and loading more input before each one if
 * needed. Surrogate values (0xD800-0xDFFF) and 0xFFFE/0xFFFF are passed
 * to {@code handleInvalidXmlChar}.
 *
 * @param c1 lead byte of the sequence
 *
 * @return the decoded character, or whatever {@code handleInvalidXmlChar}
 *   returned for an illegal one
 *
 * @throws XMLStreamException if a following byte is not a continuation
 *   byte, or as propagated from input loading / invalid char handling
 */
private final int decodeUtf8_3(int c1)
    throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    final int lead = c1 & 0x0F;

    int next = _inputBuffer[_inputPtr++];
    if ((next & 0xC0) != 0x80) {
        reportInvalidOther(next & 0xFF, _inputPtr);
    }
    int result = (lead << 6) | (next & 0x3F);

    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    next = _inputBuffer[_inputPtr++];
    if ((next & 0xC0) != 0x80) {
        reportInvalidOther(next & 0xFF, _inputPtr);
    }
    result = (result << 6) | (next & 0x3F);

    // Only lead nibbles 0xD..0xF can produce the illegal ranges
    // 0xD800-0xDFFF and 0xFFFE-0xFFFF
    final boolean illegal = (lead >= 0xD)
            && (result >= 0xD800)
            && (result < 0xE000 || (result >= 0xFFFE && result <= 0xFFFF));
    if (illegal) {
        result = handleInvalidXmlChar(result);
    }
    return result;
}
/**
 * Variant of {@code decodeUtf8_3} that performs no buffer-boundary checks:
 * it reads both continuation bytes directly, so the caller must make sure
 * at least two more bytes are available in the input buffer.
 *
 * @param c1 lead byte of the sequence
 *
 * @return the decoded character, or whatever {@code handleInvalidXmlChar}
 *   returned for an illegal one
 *
 * @throws XMLStreamException if a following byte is not a continuation
 *   byte, or as propagated from invalid char handling
 */
private final int decodeUtf8_3fast(int c1)
    throws XMLStreamException
{
    final int lead = c1 & 0x0F;

    int next = _inputBuffer[_inputPtr++];
    if ((next & 0xC0) != 0x80) {
        reportInvalidOther(next & 0xFF, _inputPtr);
    }
    int result = (lead << 6) | (next & 0x3F);

    next = _inputBuffer[_inputPtr++];
    if ((next & 0xC0) != 0x80) {
        reportInvalidOther(next & 0xFF, _inputPtr);
    }
    result = (result << 6) | (next & 0x3F);

    // Only lead nibbles 0xD..0xF can produce the illegal ranges
    // 0xD800-0xDFFF and 0xFFFE-0xFFFF
    final boolean illegal = (lead >= 0xD)
            && (result >= 0xD800)
            && (result < 0xE000 || (result >= 0xFFFE && result <= 0xFFFF));
    if (illegal) {
        result = handleInvalidXmlChar(result);
    }
    return result;
}
Returns: Character value minus 0x10000; this so that caller
can readily expand it to actual surrogates
/**
 * Decodes a 4-byte UTF-8 sequence, reading the three continuation bytes
 * from the current input position and loading more input before each one
 * if needed.
 *
 * @param c lead byte of the sequence
 *
 * @return Character value <b>minus 0x10000</b>; this so that caller
 *    can readily expand it to actual surrogates
 *
 * @throws XMLStreamException if a following byte is not a continuation
 *    byte, or as propagated from input loading
 */
private final int decodeUtf8_4(int c)
    throws XMLStreamException
{
    int code = c & 0x07;

    for (int i = 0; i < 3; ++i) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        final int next = _inputBuffer[_inputPtr++];
        if ((next & 0xC0) != 0x80) {
            reportInvalidOther(next & 0xFF, _inputPtr);
        }
        code = (code << 6) | (next & 0x3F);
    }
    /* note: value is not negated here, since the caller
     *   already knows it'll need a surrogate
     */
    return code - 0x10000;
}
/*
/**********************************************************************
/* Internal methods, error reporting
/**********************************************************************
*/
Method called called to decode a full UTF-8 characters, given
its first byte. Note: does not do any validity checks, since this
is only to be used for informational purposes (often when an error
has already been encountered)
/**
 * Method called to decode a full UTF-8 character, given
 * its first byte. Note: does not do any validity checks beyond the
 * byte structure, since this is only to be used for informational
 * purposes (often when an error has already been encountered).
 *
 * @param b first byte of the character
 *
 * @return the decoded character; for a 4-byte sequence this is the full
 *    code point (no surrogate splitting or negation is done)
 *
 * @throws XMLStreamException for an invalid lead or continuation byte, or
 *    as propagated from reading further bytes
 */
@Override
public int decodeCharForError(byte b) throws XMLStreamException
{
    int c = (int) b;
    if (c >= 0) { // ascii? fine as is...
        return c;
    }

    // Otherwise a multi-byte combination: how many more bytes follow?
    final int needed;
    if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
        needed = 1;
        c &= 0x1F;
    } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
        needed = 2;
        c &= 0x0F;
    } else if ((c & 0xF8) == 0xF0) {
        // 4 bytes; double-char with surrogates and all...
        needed = 3;
        c &= 0x07;
    } else {
        reportInvalidInitial(c & 0xFF);
        needed = 1; // never gets here
    }

    for (int i = 0; i < needed; ++i) {
        int d = nextByte();
        if ((d & 0xC0) != 0x80) {
            reportInvalidOther(d & 0xFF);
        }
        c = (c << 6) | (d & 0x3F);
    }
    return c;
}
/**
 * Stores {@code ptr} as the current input position and then delegates to
 * the single-argument {@code reportInvalidOther}.
 *
 * @param mask value handed on to the single-argument overload (callers in
 *    this class pass the offending byte as an unsigned value)
 * @param ptr input buffer offset to store in {@code _inputPtr}
 *
 * @throws XMLStreamException as thrown by the delegate
 */
protected void reportInvalidOther(int mask, int ptr) throws XMLStreamException
{
    // Sync the scanner position first, then report
    this._inputPtr = ptr;
    this.reportInvalidOther(mask);
}
}