/* Aalto XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.async;

import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.AsyncByteArrayFeeder;
import com.fasterxml.aalto.in.*;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.XmlCharTypes;

// This is the base class for asynchronous (non-blocking) XML scanners. Due to the basic complexity of the async approach, character-based input doesn't make much sense, so only byte-based input is supported.
/**
 * This is the base class for asynchronous (non-blocking) XML
 * scanners. Due to the basic complexity of the async approach,
 * character-based input doesn't make much sense, so only byte-based
 * input is supported.
 */
public class AsyncByteArrayScanner extends AsyncByteScanner implements AsyncByteArrayFeeder { /* /********************************************************************** /* Input buffer handling /********************************************************************** */
// This buffer is actually provided by the caller.
/**
 * Buffer currently being scanned. It is supplied by the caller: the
 * array reference passed to <code>feedInput</code> is stored here as-is.
 */
protected byte[] _inputBuffer;
In addition to current buffer pointer, and end pointer, we will also need to know number of bytes originally contained. This is needed to correctly update location information when the block has been completed.
/** * In addition to current buffer pointer, and end pointer, * we will also need to know number of bytes originally * contained. This is needed to correctly update location * information when the block has been completed. */
protected int _origBufferLen;

/*
/**********************************************************************
/* Instance construction
/**********************************************************************
 */

/**
 * Creates a scanner using given configuration. Scanner starts in
 * <code>STATE_PROLOG_INITIAL</code>, with no event available yet
 * (current token is <code>EVENT_INCOMPLETE</code>).
 *
 * @param cfg Reader configuration, passed on to the super class
 */
public AsyncByteArrayScanner(ReaderConfig cfg)
{
    super(cfg);
    // must start by checking if there's XML declaration...
    _state = STATE_PROLOG_INITIAL;
    _currToken = EVENT_INCOMPLETE;
}

/**
 * @return Diagnostic description that includes current token, next
 *   event and scanner state
 */
@Override
public String toString() {
    return "asyncScanner; curr="+_currToken+" next="+_nextEvent+", state = "+_state;
}

/*
/**********************************************************************
/* Implementation for low-level accessors
/**********************************************************************
 */

/**
 * @return Byte at the current input position; position is not advanced.
 *   No bounds check is made here.
 */
@Override
protected final byte _currentByte() throws XMLStreamException {
    return _inputBuffer[_inputPtr];
}

/**
 * @return Byte at the current input position; position is advanced
 *   by one afterwards. No bounds check is made here.
 */
@Override
protected final byte _nextByte() throws XMLStreamException {
    return _inputBuffer[_inputPtr++];
}

/**
 * @return Byte just before the current input position; position is
 *   not changed. No bounds check is made here.
 */
@Override
protected final byte _prevByte() throws XMLStreamException {
    return _inputBuffer[_inputPtr-1];
}

/*
/**********************************************************************
/* Parsing, comments
/**********************************************************************
 */

/**
 * Scans comment content from the current input block, appending
 * characters to the text builder, until either the end marker is
 * seen or the block runs out.
 *<p>
 * If input ends in the middle of a CR/LF pair, a multi-byte sequence
 * or a possible end marker, what was seen so far is recorded in
 * <code>_pendingInput</code> so that the next call can resume
 * (via {@link #handleCommentPending}).
 *
 * @return COMMENT if the closing "--&gt;" was consumed;
 *   EVENT_INCOMPLETE if more input is needed; or whatever non-zero
 *   value {@link #handleCommentPending} returned
 */
protected int parseCommentContents() throws XMLStreamException
{
    // Left-overs from last input block?
    if (_pendingInput != 0) { // CR, multi-byte, or '-'?
        int result = handleCommentPending();
        // If there's not enough input, or if we completed, can leave
        if (result != 0) {
            return result;
        }
        // otherwise we should be good to continue
    }

    // Continue appending where the text builder left off
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    int outPtr = _textBuilder.getCurrentLength();

    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    main_loop:
    while (true) {
        int c;
        // Then the tight ASCII non-funny-char loop:
        ascii_loop:
        while (true) {
            if (_inputPtr >= _inputEnd) {
                break main_loop;
            }
            // Output segment full? Move on to a new one, writing from its start
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Bound the inner loop by whichever runs out first: input or output space
            int max = _inputEnd;
            {
                int max2 = _inputPtr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (_inputPtr < max) {
                c = (int) inputBuffer[_inputPtr++] & 0xFF;
                // Non-zero type code: needs handling by the switch below
                if (TYPES[c] != 0) {
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
        }

        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no break here; falls through to CR handling if the
            // call returns -- presumably it always throws, confirm
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            {
                // Need the next byte to know whether this is CR+LF
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_CR;
                    break main_loop;
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
            }
            // Either way, what gets appended is INT_LF
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            // Second byte not available yet: store the first one as pending
            if (_inputPtr >= _inputEnd) {
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            if ((_inputEnd - _inputPtr) < 2) {
                // Pack the bytes we do have: first in bits 0-7, second in bits 8-15
                if (_inputEnd > _inputPtr) { // 2 bytes available
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            if ((_inputEnd - _inputPtr) < 3) {
                // Pack available bytes: first in bits 0-7, second in 8-15, third in 16-23
                if (_inputEnd > _inputPtr) { // at least 2 bytes?
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                    if (_inputEnd > _inputPtr) { // 3 bytes?
                        d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                        c |= (d << 16);
                    }
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_4(c);
            // Result is written as a surrogate pair (presumably the decoded
            // value already has 0x10000 subtracted -- confirm in decodeUtf8_4)
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no break here either; presumably the call always throws
            reportInvalidInitial(c);
        case XmlCharTypes.CT_HYPHEN: // '-->'?
            if (_inputPtr >= _inputEnd) {
                _pendingInput = PENDING_STATE_COMMENT_HYPHEN1;
                break main_loop;
            }
            if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then
                ++_inputPtr;
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_COMMENT_HYPHEN2;
                    break main_loop;
                }
                // "--" must be followed by '>'; anything else gets reported
                if (_inputBuffer[_inputPtr++] != BYTE_GT) {
                    reportDoubleHyphenInComments();
                }
                _textBuilder.setCurrentLength(outPtr);
                _state = STATE_DEFAULT;
                _nextEvent = EVENT_INCOMPLETE;
                return COMMENT;
            }
            // Single hyphen: appended as regular content below
            break;
        // default:
            // Other types are not important here...
        }
        // Ok, can output the char (we know there's room for one more)
        outputBuffer[outPtr++] = (char) c;
    }
    // Ran out of input: record how much was appended, ask for more
    _textBuilder.setCurrentLength(outPtr);
    return EVENT_INCOMPLETE;
}
// Returns: EVENT_INCOMPLETE if there's not enough input to handle the pending char; COMMENT if we handled a complete "-->" end marker; or 0 to indicate something else was successfully handled.
/**
 * @return EVENT_INCOMPLETE, if there's not enough input to
 *   handle the pending char; COMMENT, if we handled a complete
 *   "--&gt;" end marker; or 0 to indicate something else
 *   was successfully handled.
 */
protected int handleCommentPending() throws XMLStreamException
{
    // Every pending state needs at least one more byte to be resolved
    if (_inputPtr >= _inputEnd) {
        return EVENT_INCOMPLETE;
    }
    // One hyphen seen at the end of the previous block?
    if (_pendingInput == PENDING_STATE_COMMENT_HYPHEN1) {
        if (_inputBuffer[_inputPtr] != BYTE_HYPHEN) {
            // can't be the end marker, just append '-' and go
            _pendingInput = 0;
            _textBuilder.append("-");
            return 0;
        }
        ++_inputPtr;
        _pendingInput = PENDING_STATE_COMMENT_HYPHEN2;
        if (_inputPtr >= _inputEnd) { // no more input?
            return EVENT_INCOMPLETE;
        }
        // continue
    }
    // Two hyphens seen: only '>' may follow, anything else gets reported
    if (_pendingInput == PENDING_STATE_COMMENT_HYPHEN2) {
        _pendingInput = 0;
        byte b = _inputBuffer[_inputPtr++];
        if (b != BYTE_GT) {
            reportDoubleHyphenInComments();
        }
        _state = STATE_DEFAULT;
        _nextEvent = EVENT_INCOMPLETE;
        return COMMENT;
    }
    // Otherwise can use default code
    return handleAndAppendPending() ? 0 : EVENT_INCOMPLETE;
}

/*
/**********************************************************************
/* Parsing, PI
/**********************************************************************
 */

/**
 * Scans processing instruction data from the current input block,
 * appending characters to the text builder, until either the end
 * marker is seen or the block runs out.
 *<p>
 * If input ends in the middle of a CR/LF pair, a multi-byte sequence
 * or a possible end marker, what was seen so far is recorded in
 * <code>_pendingInput</code> so that the next call can resume
 * (via {@link #handlePIPending}).
 *
 * @return PROCESSING_INSTRUCTION if the closing "?&gt;" was consumed;
 *   EVENT_INCOMPLETE if more input is needed; or whatever non-zero
 *   value {@link #handlePIPending} returned
 */
protected int parsePIData() throws XMLStreamException
{
    // Left-overs from last input block?
    if (_pendingInput != 0) { // CR, multi-byte, '?'
        int result = handlePIPending();
        // If there's not enough input, or if we completed, can leave
        if (result != 0) {
            return result;
        }
        // otherwise we should be good to continue
    }

    // Continue appending where the text builder left off
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    int outPtr = _textBuilder.getCurrentLength();

    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    main_loop:
    while (true) {
        int c;
        // Then the tight ASCII non-funny-char loop:
        ascii_loop:
        while (true) {
            if (_inputPtr >= _inputEnd) {
                break main_loop;
            }
            // Output segment full? Move on to a new one, writing from its start
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Bound the inner loop by whichever runs out first: input or output space
            int max = _inputEnd;
            {
                int max2 = _inputPtr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (_inputPtr < max) {
                c = (int) inputBuffer[_inputPtr++] & 0xFF;
                // Non-zero type code: needs handling by the switch below
                if (TYPES[c] != 0) {
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
        }

        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no break here; falls through to CR handling if the
            // call returns -- presumably it always throws, confirm
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            {
                // Need the next byte to know whether this is CR+LF
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_CR;
                    break main_loop;
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
            }
            // Either way, what gets appended is INT_LF
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            // Second byte not available yet: store the first one as pending
            if (_inputPtr >= _inputEnd) {
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            if ((_inputEnd - _inputPtr) < 2) {
                // Pack the bytes we do have: first in bits 0-7, second in bits 8-15
                if (_inputEnd > _inputPtr) { // 2 bytes available
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            if ((_inputEnd - _inputPtr) < 3) {
                // Pack available bytes: first in bits 0-7, second in 8-15, third in 16-23
                if (_inputEnd > _inputPtr) { // at least 2 bytes?
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                    if (_inputEnd > _inputPtr) { // 3 bytes?
                        d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                        c |= (d << 16);
                    }
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_4(c);
            // Result is written as a surrogate pair (presumably the decoded
            // value already has 0x10000 subtracted -- confirm in decodeUtf8_4)
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no break here either; presumably the call always throws
            reportInvalidInitial(c);
        case XmlCharTypes.CT_QMARK:
            if (_inputPtr >= _inputEnd) {
                _pendingInput = PENDING_STATE_PI_QMARK;
                break main_loop;
            }
            if (_inputBuffer[_inputPtr] == BYTE_GT) { // end
                ++_inputPtr;
                _textBuilder.setCurrentLength(outPtr);
                _state = STATE_DEFAULT;
                _nextEvent = EVENT_INCOMPLETE;
                return PROCESSING_INSTRUCTION;
            }
            // Not end mark, just need to reprocess the second char
            // (it was only peeked at; the '?' itself is appended below)
            break;
        // default:
            // Other types are not important here...
        }
        // Ok, can output the char (we know there's room for one more)
        outputBuffer[outPtr++] = (char) c;
    }
    // Ran out of input: record how much was appended, ask for more
    _textBuilder.setCurrentLength(outPtr);
    return EVENT_INCOMPLETE;
}
// Returns: EVENT_INCOMPLETE if there's not enough input to handle the pending char; PROCESSING_INSTRUCTION if we handled a complete "?>" end marker; or 0 to indicate something else was successfully handled.
/**
 * @return EVENT_INCOMPLETE, if there's not enough input to
 *   handle the pending char; PROCESSING_INSTRUCTION, if we handled
 *   a complete "?&gt;" end marker; or 0 to indicate something else
 *   was successfully handled.
 */
protected int handlePIPending() throws XMLStreamException
{
    // First, the special case, end marker:
    if (_pendingInput == PENDING_STATE_PI_QMARK) {
        if (_inputPtr >= _inputEnd) {
            return EVENT_INCOMPLETE;
        }
        // Only peek: if this is not '>', the byte is left for the main loop
        byte b = _inputBuffer[_inputPtr];
        _pendingInput = 0;
        if (b != BYTE_GT) {
            // can't be the end marker, just append '?' and go
            _textBuilder.append('?');
            return 0;
        }
        ++_inputPtr;
        _state = STATE_DEFAULT;
        _nextEvent = EVENT_INCOMPLETE;
        return PROCESSING_INSTRUCTION;
    }
    // Otherwise can use default code
    return handleAndAppendPending() ? 0 : EVENT_INCOMPLETE;
}

/*
/**********************************************************************
/* Parsing, internal DTD subset
/**********************************************************************
 */

/**
 * Scans contents of the internal DTD subset from the current input
 * block, appending characters to the text builder, until the closing
 * right bracket is seen outside of any declaration and quoted value,
 * or until the block runs out.
 *<p>
 * Quote and declaration tracking is kept in <code>_elemAttrQuote</code>
 * and <code>_inDtdDeclaration</code>, so that scanning can resume
 * with the next block.
 *
 * @param init True on the first call for a subset (text builder and
 *   tracking state are reset); false when resuming with more input
 *
 * @return True if the end of the subset was reached (the closing
 *   bracket is consumed but not appended); false if more input
 *   is needed
 */
@Override
protected final boolean handleDTDInternalSubset(boolean init) throws XMLStreamException
{
    char[] outputBuffer;
    int outPtr;

    if (init) { // first time around
        outputBuffer = _textBuilder.resetWithEmpty();
        outPtr = 0;
        _elemAttrQuote = 0;
        _inDtdDeclaration = false;
    } else {
        // Resuming: first complete whatever was split across blocks
        if (_pendingInput != 0) {
            if (!handleAndAppendPending()) {
                return false;
            }
        }
        outputBuffer = _textBuilder.getBufferWithoutReset();
        outPtr = _textBuilder.getCurrentLength();
    }

    final int[] TYPES = _charTypes.DTD_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    main_loop:
    while (true) {
        int c;
        // Then the tight ASCII non-funny-char loop:
        ascii_loop:
        while (true) {
            if (_inputPtr >= _inputEnd) {
                break main_loop;
            }
            // Output segment full? Move on to a new one, writing from its start
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Bound the inner loop by whichever runs out first: input or output space
            int max = _inputEnd;
            {
                int max2 = _inputPtr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (_inputPtr < max) {
                c = (int) inputBuffer[_inputPtr++] & 0xFF;
                // Non-zero type code: needs handling by the switch below
                if (TYPES[c] != 0) {
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
        }

        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no break here; falls through to CR handling if the
            // call returns -- presumably it always throws, confirm
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            // Need the next byte to know whether this is CR+LF
            if (_inputPtr >= _inputEnd) {
                _pendingInput = PENDING_STATE_CR;
                break main_loop;
            }
            if (inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            // Either way, what gets appended is INT_LF
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            // Second byte not available yet: store the first one as pending
            if (_inputPtr >= _inputEnd) {
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            if ((_inputEnd - _inputPtr) < 2) {
                // Pack the bytes we do have: first in bits 0-7, second in bits 8-15
                if (_inputEnd > _inputPtr) { // 2 bytes available
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            if ((_inputEnd - _inputPtr) < 3) {
                // Pack available bytes: first in bits 0-7, second in 8-15, third in 16-23
                if (_inputEnd > _inputPtr) { // at least 2 bytes?
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                    if (_inputEnd > _inputPtr) { // 3 bytes?
                        d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                        c |= (d << 16);
                    }
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_4(c);
            // Result is written as a surrogate pair (presumably the decoded
            // value already has 0x10000 subtracted -- confirm in decodeUtf8_4)
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no break here either; presumably the call always throws
            reportInvalidInitial(c);
        case XmlCharTypes.CT_DTD_QUOTE: // apos or quot
            // Opening quote is remembered; only the same quote char closes it
            if (_elemAttrQuote == 0) {
                _elemAttrQuote = (byte) c;
            } else {
                if (_elemAttrQuote == c) {
                    _elemAttrQuote = 0;
                }
            }
            break;
        case XmlCharTypes.CT_DTD_LT:
            if (!_inDtdDeclaration) {
                _inDtdDeclaration = true;
            }
            break;
        case XmlCharTypes.CT_DTD_GT:
            // Only ends the declaration when not inside a quoted value
            if (_elemAttrQuote == 0) {
                _inDtdDeclaration = false;
            }
            break;
        case XmlCharTypes.CT_DTD_RBRACKET:
            // Outside declarations and quotes this ends the subset
            if (!_inDtdDeclaration && _elemAttrQuote == 0) {
                _textBuilder.setCurrentLength(outPtr);
                return true;
            }
            break;
        // default:
            // Other types are not important here...
        }
        // Ok, can output the char (we know there's room for one more)
        outputBuffer[outPtr++] = (char) c;
    }
    // Ran out of input: record how much was appended, ask for more
    _textBuilder.setCurrentLength(outPtr);
    return false;
}

/*
/**********************************************************************
/* Parsing, CDATA
/**********************************************************************
 */

/**
 * Scans CDATA section content from the current input block,
 * appending characters to the text builder, until either the end
 * marker is seen or the block runs out.
 *<p>
 * If input ends in the middle of a CR/LF pair, a multi-byte sequence
 * or a possible end marker, what was seen so far is recorded in
 * <code>_pendingInput</code> so that the next call can resume
 * (via {@link #handleCDataPending}).
 *
 * @return CDATA if the closing "]]&gt;" was consumed;
 *   EVENT_INCOMPLETE if more input is needed; or whatever non-zero
 *   value {@link #handleCDataPending} returned
 */
protected final int parseCDataContents() throws XMLStreamException
{
    // Left-overs from last input block?
    if (_pendingInput != 0) { // CR, multi-byte, or ']'?
        int result = handleCDataPending();
        // If there's not enough input, or if we completed, can leave
        if (result != 0) {
            return result;
        }
        // otherwise we should be good to continue
    }

    // Continue appending where the text builder left off
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    int outPtr = _textBuilder.getCurrentLength();

    final int[] TYPES = _charTypes.OTHER_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    main_loop:
    while (true) {
        int c;
        // Then the tight ASCII non-funny-char loop:
        ascii_loop:
        while (true) {
            if (_inputPtr >= _inputEnd) {
                break main_loop;
            }
            // Output segment full? Move on to a new one, writing from its start
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Bound the inner loop by whichever runs out first: input or output space
            int max = _inputEnd;
            {
                int max2 = _inputPtr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (_inputPtr < max) {
                c = (int) inputBuffer[_inputPtr++] & 0xFF;
                // Non-zero type code: needs handling by the switch below
                if (TYPES[c] != 0) {
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
        }

        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            // NOTE(review): no break here; falls through to CR handling if the
            // call returns -- presumably it always throws, confirm
            c = handleInvalidXmlChar(c);
        case XmlCharTypes.CT_WS_CR:
            {
                // Need the next byte to know whether this is CR+LF
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_CR;
                    break main_loop;
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
            }
            // Either way, what gets appended is INT_LF
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            // Second byte not available yet: store the first one as pending
            if (_inputPtr >= _inputEnd) {
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            if ((_inputEnd - _inputPtr) < 2) {
                // Pack the bytes we do have: first in bits 0-7, second in bits 8-15
                if (_inputEnd > _inputPtr) { // 2 bytes available
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            if ((_inputEnd - _inputPtr) < 3) {
                // Pack available bytes: first in bits 0-7, second in 8-15, third in 16-23
                if (_inputEnd > _inputPtr) { // at least 2 bytes?
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                    if (_inputEnd > _inputPtr) { // 3 bytes?
                        d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                        c |= (d << 16);
                    }
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_4(c);
            // Result is written as a surrogate pair (presumably the decoded
            // value already has 0x10000 subtracted -- confirm in decodeUtf8_4)
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            // NOTE(review): no break here either; presumably the call always throws
            reportInvalidInitial(c);
        case XmlCharTypes.CT_RBRACKET: // ']]>'?
            if (_inputPtr >= _inputEnd) {
                _pendingInput = PENDING_STATE_CDATA_BRACKET1;
                break main_loop;
            }
            // Hmmh. This is more complex... so be it.
            if (_inputBuffer[_inputPtr] == BYTE_RBRACKET) { // end might be nigh...
                ++_inputPtr;
                // At this point two brackets have been consumed but not appended
                while (true) {
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_CDATA_BRACKET2;
                        break main_loop;
                    }
                    if (_inputBuffer[_inputPtr] == BYTE_GT) {
                        ++_inputPtr;
                        _textBuilder.setCurrentLength(outPtr);
                        _state = STATE_DEFAULT;
                        _nextEvent = EVENT_INCOMPLETE;
                        return CDATA;
                    }
                    if (_inputBuffer[_inputPtr] != BYTE_RBRACKET) { // neither '>' nor ']'; push "]]" back
                        outputBuffer[outPtr++] = ']';
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                        outputBuffer[outPtr++] = ']';
                        // The following byte was not consumed; main loop handles it
                        continue main_loop;
                    }
                    // Got third bracket; push one back, keep on checking
                    ++_inputPtr;
                    outputBuffer[outPtr++] = ']';
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                }
            }
            // Single bracket: appended as regular content below
            break;
        // default:
            // Other types are not important here...
        }
        // Ok, can output the char (we know there's room for one more)
        outputBuffer[outPtr++] = (char) c;
    }
    // Ran out of input: record how much was appended, ask for more
    _textBuilder.setCurrentLength(outPtr);
    return EVENT_INCOMPLETE;
}
// Returns: EVENT_INCOMPLETE if there's not enough input to handle the pending char; CDATA if we handled a complete "]]>" end marker; or 0 to indicate something else was successfully handled.
/**
 * @return EVENT_INCOMPLETE, if there's not enough input to
 *   handle the pending char; CDATA, if we handled a complete
 *   "]]&gt;" end marker; or 0 to indicate something else
 *   was successfully handled.
 */
protected final int handleCDataPending() throws XMLStreamException { if (_pendingInput == PENDING_STATE_CDATA_BRACKET1) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } if (_inputBuffer[_inputPtr] != BYTE_RBRACKET) { // can't be the end marker, just append ']' and go _textBuilder.append(']'); return (_pendingInput = 0); } ++_inputPtr; _pendingInput = PENDING_STATE_CDATA_BRACKET2; if (_inputPtr >= _inputEnd) { // no more input? return EVENT_INCOMPLETE; } // continue } while (_pendingInput == PENDING_STATE_CDATA_BRACKET2) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_GT) { _pendingInput = 0; _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return CDATA; } if (b != BYTE_RBRACKET) { --_inputPtr; _textBuilder.append("]]"); return (_pendingInput = 0); } _textBuilder.append(']'); } // Otherwise can use default code return handleAndAppendPending() ? 0 : EVENT_INCOMPLETE; }
// This method gets called if the first character of a CHARACTERS event could not be fully read (multi-byte, split over a buffer boundary). If so, there is some pending data to be handled.
/**
 * This method gets called if the first character of a
 * CHARACTERS event could not be fully read (multi-byte,
 * split over a buffer boundary). If so, there is some
 * pending data to be handled.
 */
protected int startCharactersPending() throws XMLStreamException { // First, need to have at least one more byte: if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } // K. So what was the type again? int c = _pendingInput; _pendingInput = 0; // Possible \r\n linefeed? if (c == PENDING_STATE_CR) { if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); _textBuilder.resetWithChar(CHAR_LF); } else { // Nah, a multi-byte UTF-8 char: // Let's just retest the first pending byte (in LSB): switch (_charTypes.TEXT_CHARS[c & 0xFF]) { case XmlCharTypes.CT_MULTIBYTE_2: // Easy: must have just one byte, did get another one: _textBuilder.resetWithChar((char) decodeUtf8_2(c)); break; case XmlCharTypes.CT_MULTIBYTE_3: { // Ok... so do we have one or two pending bytes? int next = _inputBuffer[_inputPtr++] & 0xFF; int c2 = (c >> 8); if (c2 == 0) { // just one; need two more if (_inputPtr >= _inputEnd) { // but got only one _pendingInput = c | (next << 8); return EVENT_INCOMPLETE; } int c3 = _inputBuffer[_inputPtr++] & 0xFF; c = decodeUtf8_3(c, next, c3); } else { // had two, got one, bueno: c = decodeUtf8_3((c & 0xFF), c2, next); } _textBuilder.resetWithChar((char) c); } break; case XmlCharTypes.CT_MULTIBYTE_4: { int next = (int) _inputBuffer[_inputPtr++] & 0xFF; // Only had one? 
if ((c >> 8) == 0) { // ok, so need 3 more if (_inputPtr >= _inputEnd) { // just have 1 _pendingInput = c | (next << 8); return EVENT_INCOMPLETE; } int c2 = _inputBuffer[_inputPtr++] & 0xFF; if (_inputPtr >= _inputEnd) { // almost, got 2 _pendingInput = c | (next << 8) | (c2 << 16); return EVENT_INCOMPLETE; } int c3 = _inputBuffer[_inputPtr++] & 0xFF; c = decodeUtf8_4(c, next, c2, c3); } else { // had two or three int c2 = (c >> 8) & 0xFF; int c3 = (c >> 16); if (c3 == 0) { // just two if (_inputPtr >= _inputEnd) { // one short _pendingInput = c | (next << 16); return EVENT_INCOMPLETE; } c3 = _inputBuffer[_inputPtr++] & 0xFF; c = decodeUtf8_4((c & 0xFF), c2, next, c3); } else { // had three, got last c = decodeUtf8_4((c & 0xFF), c2, c3, next); } } } // Need a surrogate pair, have to call from here: _textBuilder.resetWithSurrogate(c); return (_currToken = CHARACTERS); default: // should never occur: throwInternal(); } } // Great, we got it. Is that enough? if (_cfgCoalescing && !_cfgLazyParsing) { // In eager coalescing mode, must read it all return finishCharactersCoalescing(); } _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return _currToken; }
TODO: Method not yet implemented
/** * TODO: Method not yet implemented */
protected final int finishCharactersCoalescing() throws XMLStreamException { // First things first: any pending partial multi-bytes? if (_pendingInput != 0) { if (!handleAndAppendPending()) { return EVENT_INCOMPLETE; } } throw new UnsupportedOperationException(); // !!! TBI // return 0; } /* /********************************************************************** /* Async input, methods to feed (push) content to parse /********************************************************************** */ @Override public final boolean needMoreInput() { return (_inputPtr >=_inputEnd) && !_endOfInput; } @Override public void feedInput(byte[] buf, int start, int len) throws XMLStreamException { // Must not have remaining input if (_inputPtr < _inputEnd) { throw new XMLStreamException("Still have "+(_inputEnd - _inputPtr)+" unread bytes"); } // and shouldn't have been marked as end-of-input if (_endOfInput) { throw new XMLStreamException("Already closed, can not feed more input"); } // Time to update pointers first _pastBytesOrChars += _origBufferLen; _rowStartOffset -= _origBufferLen; // And then update buffer settings _inputBuffer = buf; _inputPtr = start; _inputEnd = start+len; _origBufferLen = len; } /* /********************************************************************** /* Implementation of parsing API /********************************************************************** */ @Override public int nextFromTree() throws XMLStreamException { // Had a fully complete event? Need to reset state: if (_currToken != EVENT_INCOMPLETE) { /* First, need to handle some complications arising from * empty elements, and namespace binding/unbinding: */ if (_currToken == START_ELEMENT) { if (_isEmptyTag) { --_depth; // Important: do NOT overwrite start location, same as with START_ELEMENT return (_currToken = END_ELEMENT); } } else if (_currToken == END_ELEMENT) { _currElem = _currElem.getParent(); // Any namespace declarations that need to be unbound? 
while (_lastNsDecl != null && _lastNsDecl.getLevel() >= _depth) { _lastNsDecl = _lastNsDecl.unbind(); } } // keep track of where event started setStartLocation(); /* Only CHARACTERS can remain incomplete: this happens if * first character is decoded, but coalescing mode is NOT * set. Skip can not therefore block, nor will add pending * input. Can also occur when we have run out of input */ if (_tokenIncomplete) { if (!skipCharacters()) { // couldn't complete skipping return EVENT_INCOMPLETE; } _tokenIncomplete = false; } _currToken = _nextEvent = EVENT_INCOMPLETE; _state = STATE_DEFAULT; } // Don't yet know the type? if (_nextEvent == EVENT_INCOMPLETE) { if (_state == STATE_DEFAULT) { /* We can only have pending input for (incomplete) * CHARACTERS event. */ if (_pendingInput != 0) { // CR, or multi-byte? _nextEvent = CHARACTERS; return startCharactersPending(); } if (_inputPtr >= _inputEnd) { // nothing we can do? return _currToken; // i.e. EVENT_INCOMPLETE } byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_LT) { // root element, comment, proc instr? _state = STATE_TREE_SEEN_LT; } else if (b == BYTE_AMP) { _state = STATE_TREE_SEEN_AMP; } else { _nextEvent = CHARACTERS; return startCharacters(b); } } if (_inputPtr >= _inputEnd) { return _currToken; // i.e. EVENT_INCOMPLETE } if (_state == STATE_TREE_SEEN_LT) { // Ok, so we've just seen the less-than char... 
byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_EXCL) { // comment or CDATA _state = STATE_TREE_SEEN_EXCL; } else if (b == BYTE_QMARK) { _nextEvent = PROCESSING_INSTRUCTION; _state = STATE_DEFAULT; return handlePI(); } else if (b == BYTE_SLASH) { return handleEndElementStart(); } else { // Probably start element -- need to retain first char tho return handleStartElementStart(b); } } else if (_state == STATE_TREE_SEEN_AMP) { return handleEntityStartingToken(); } else if (_state == STATE_TREE_NAMED_ENTITY_START) { return handleNamedEntityStartingToken(); } else if (_state == STATE_TREE_NUMERIC_ENTITY_START) { return handleNumericEntityStartingToken(); } if (_state == STATE_TREE_SEEN_EXCL) { if (_inputPtr >= _inputEnd) { return _currToken; // i.e. EVENT_INCOMPLETE } byte b = _inputBuffer[_inputPtr++]; // Comment or CDATA? if (b == BYTE_HYPHEN) { // Comment _nextEvent = COMMENT; _state = STATE_DEFAULT; } else if (b == BYTE_LBRACKET) { // CDATA _nextEvent = CDATA; _state = STATE_DEFAULT; } else { reportTreeUnexpChar(decodeCharForError(b), " (expected either '-' for COMMENT or '[CDATA[' for CDATA section)"); } } else { throwInternal(); } } /* We know the type; event is usually partially processed * and needs to be completely read. */ switch (_nextEvent) { case START_ELEMENT: return handleStartElement(); case END_ELEMENT: return handleEndElement(); case PROCESSING_INSTRUCTION: return handlePI(); case COMMENT: return handleComment(); case CDATA: return handleCData(); case CHARACTERS: if (!_cfgLazyParsing) { // !!! TBI: how would non-lazy mode work? 
if (_cfgCoalescing) { return finishCharactersCoalescing(); } } if (_pendingInput != 0) { // multi-byte, or CR without LF return startCharactersPending(); } // Otherwise, should not get here throwInternal(); // case ENTITY_REFERENCE: } return throwInternal(); // never gets here } /* /********************************************************************** /* Second-level parsing; character content (in tree) /********************************************************************** */ private int handleCData() throws XMLStreamException { if (_state == STATE_CDATA_CONTENT) { return parseCDataContents(); } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } return handleCDataStartMarker(_inputBuffer[_inputPtr++]); } private int handleCDataStartMarker(byte b) throws XMLStreamException { switch (_state) { case STATE_DEFAULT: if (b != BYTE_C) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'C' for CDATA)"); } _state = STATE_CDATA_C; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer[_inputPtr++]; // fall through case STATE_CDATA_C: if (b != BYTE_D) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'D' for CDATA)"); } _state = STATE_CDATA_CD; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer[_inputPtr++]; // fall through case STATE_CDATA_CD: if (b != BYTE_A) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'A' for CDATA)"); } _state = STATE_CDATA_CDA; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer[_inputPtr++]; // fall through case STATE_CDATA_CDA: if (b != BYTE_T) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'T' for CDATA)"); } _state = STATE_CDATA_CDAT; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer[_inputPtr++]; // fall through case STATE_CDATA_CDAT: if (b != BYTE_A) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'A' for CDATA)"); } _state = STATE_CDATA_CDATA; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = 
_inputBuffer[_inputPtr++]; // fall through case STATE_CDATA_CDATA: if (b != BYTE_LBRACKET) { reportTreeUnexpChar(decodeCharForError(b), " (expected '[' for CDATA)"); } _textBuilder.resetWithEmpty(); _state = STATE_CDATA_CONTENT; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } return parseCDataContents(); } return throwInternal(); } /* /********************************************************************** /* Second-level parsing; other (PI, Comment) /********************************************************************** */ @Override protected int handlePI() throws XMLStreamException { // Most common case first: if (_state == STATE_PI_IN_DATA) { return parsePIData(); } main_loop: while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } switch (_state) { case STATE_DEFAULT: _tokenName = parseNewName(_inputBuffer[_inputPtr++]); if (_tokenName == null) { _state = STATE_PI_IN_TARGET; return EVENT_INCOMPLETE; } _state = STATE_PI_AFTER_TARGET; checkPITargetName(_tokenName); if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } // fall through case STATE_PI_AFTER_TARGET: // Need ws or "?>" { byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_QMARK) { // Quick check, can we see '>' as well? All done, if so if (_inputPtr < _inputEnd && _inputBuffer[_inputPtr] == BYTE_GT) { ++_inputPtr; break main_loop; // means we are done } // If not (whatever reason), let's move to check state _state = STATE_PI_AFTER_TARGET_QMARK; break; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { if (!asyncSkipSpace()) { // ran out of input? _state = STATE_PI_AFTER_TARGET_WS; return EVENT_INCOMPLETE; } _textBuilder.resetWithEmpty(); // Quick check, perhaps we'll see end marker? 
if ((_inputPtr+1) < _inputEnd && _inputBuffer[_inputPtr] == BYTE_QMARK && _inputBuffer[_inputPtr+1] == BYTE_GT) { _inputPtr += 2; break main_loop; // means we are done } // If not, we'll move to 'data' portion of PI _state = STATE_PI_IN_DATA; return parsePIData(); } // Otherwise, it's an error reportMissingPISpace(decodeCharForError(b)); } // fall through case STATE_PI_AFTER_TARGET_WS: if (!asyncSkipSpace()) { // ran out of input? return EVENT_INCOMPLETE; } // Can just move to "data" portion right away _state = STATE_PI_IN_DATA; _textBuilder.resetWithEmpty(); return parsePIData(); case STATE_PI_AFTER_TARGET_QMARK: { // Must get '>' following '?' we saw right after name byte b = _inputBuffer[_inputPtr++]; // Otherwise, it's an error if (b != BYTE_GT) { reportMissingPISpace(decodeCharForError(b)); } } // but if it's ok, we are done break main_loop; case STATE_PI_IN_TARGET: _tokenName = parsePName(); if (_tokenName == null) { return EVENT_INCOMPLETE; } checkPITargetName(_tokenName); _state = STATE_PI_AFTER_TARGET; break; default: return throwInternal(); } } _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return PROCESSING_INSTRUCTION; } @Override protected final int handleComment() throws XMLStreamException { if (_state == STATE_COMMENT_CONTENT) { return parseCommentContents(); } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } byte b = _inputBuffer[_inputPtr++]; if (_state == STATE_DEFAULT) { if (b != BYTE_HYPHEN) { reportTreeUnexpChar(decodeCharForError(b), " (expected '-' for COMMENT)"); } _state = STATE_COMMENT_CONTENT; _textBuilder.resetWithEmpty(); return parseCommentContents(); } if (_state == STATE_COMMENT_HYPHEN2) { // We are almost done, just need to get '>' at the end if (b != BYTE_GT) { reportDoubleHyphenInComments(); } _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return COMMENT; } return throwInternal(); } /* /********************************************************************** /* Second-level parsing; helper methods 
/********************************************************************** */
Method to skip whatever space can be skipped.

NOTE: if available content ends with a CR, method will set _pendingInput to PENDING_STATE_CR.

Returns:True, if was able to skip through the space and find a non-space byte; false if reached end-of-buffer
/** * Method to skip whatever space can be skipped. *<p> * NOTE: if available content ends with a CR, method will set * <code>_pendingInput</code> to <code>PENDING_STATE_CR</code>. * * @return True, if was able to skip through the space and find * a non-space byte; false if reached end-of-buffer */
@Override protected boolean asyncSkipSpace() throws XMLStreamException { while (_inputPtr < _inputEnd) { byte b = _inputBuffer[_inputPtr]; if ((b & 0xFF) > INT_SPACE) { // hmmmh. Shouldn't this be handled someplace else? if (_pendingInput == PENDING_STATE_CR) { markLF(); _pendingInput = 0; } return true; } ++_inputPtr; if (b == BYTE_LF) { markLF(); } else if (b == BYTE_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; break; } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (b != BYTE_SPACE && b != BYTE_TAB) { throwInvalidSpace(b); } } return false; }
Method called when a new token (within tree) starts with an entity.
Returns:Type of event to return
/** * Method called when a new token (within tree) starts with an * entity. * * @return Type of event to return */
protected int handleEntityStartingToken() throws XMLStreamException { _textBuilder.resetWithEmpty(); byte b = _inputBuffer[_inputPtr++]; // we know one is available if (b == BYTE_HASH) { // numeric character entity _textBuilder.resetWithEmpty(); _state = STATE_TREE_NUMERIC_ENTITY_START; _pendingInput = PENDING_STATE_ENT_SEEN_HASH; if (_inputPtr >= _inputEnd) { // but no more content to parse yet return EVENT_INCOMPLETE; } return handleNumericEntityStartingToken(); } PName n = parseNewEntityName(b); // null if incomplete; non-null otherwise if (n == null) { // Not sure if it's a char entity or general one; so we don't yet know type _state = STATE_TREE_NAMED_ENTITY_START; return EVENT_INCOMPLETE; } int ch = decodeGeneralEntity(n); if (ch == 0) { // not a character entity _tokenName = n; return (_nextEvent = _currToken = ENTITY_REFERENCE); } // character entity; initialize buffer, _textBuilder.resetWithChar((char)ch); _nextEvent = 0; _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return _currToken; }
Method called when we see an entity that is starting a new token, and part of its name has been decoded (but not all)
/** * Method called when we see an entity that is starting a new token, * and part of its name has been decoded (but not all) */
protected int handleNamedEntityStartingToken() throws XMLStreamException { PName n = parseEntityName(); // null if incomplete; non-null otherwise if (n == null) { return _nextEvent; // i.e. EVENT_INCOMPLETE } int ch = decodeGeneralEntity(n); if (ch == 0) { // not a character entity _tokenName = n; return (_currToken = ENTITY_REFERENCE); } // character entity; initialize buffer, _textBuilder.resetWithChar((char)ch); _nextEvent = 0; _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return _currToken; }
Method called when a new token starts with a numeric character entity (that is, "&#" has been seen) and its hexadecimal or decimal digits have not yet been fully decoded.
/** * Method called when a new token starts with a numeric character * entity (that is, "&amp;#" has been seen) and its hexadecimal or * decimal digits have not yet been fully decoded. */
protected int handleNumericEntityStartingToken() throws XMLStreamException { if (_pendingInput == PENDING_STATE_ENT_SEEN_HASH) { byte b = _inputBuffer[_inputPtr]; // we know one is available _entityValue = 0; if (b == BYTE_x) { // 'x' marks hex _pendingInput = PENDING_STATE_ENT_IN_HEX_DIGIT; if (++_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } } else { // if not 'x', must be a digit _pendingInput = PENDING_STATE_ENT_IN_DEC_DIGIT; // let's just keep byte for calculation } } if (_pendingInput == PENDING_STATE_ENT_IN_HEX_DIGIT) { if (!decodeHexEntity()) { return EVENT_INCOMPLETE; } } else { if (!decodeDecEntity()) { return EVENT_INCOMPLETE; } } // and now we have the full value verifyAndAppendEntityCharacter(_entityValue); _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } _pendingInput = 0; return _currToken; }
Returns:True if entity was decoded (and value assigned to _entityValue); false otherwise
/** * @return True if entity was decoded (and value assigned to <code>_entityValue</code>); * false otherwise */
protected final boolean decodeHexEntity() throws XMLStreamException { int value = _entityValue; while (_inputPtr < _inputEnd) { byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_SEMICOLON) { _entityValue = value; return true; } int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } value = (value << 4) + ch; if (value > MAX_UNICODE_CHAR) { // Overflow? _entityValue = value; reportEntityOverflow(); } } _entityValue = value; return false; }
Returns:True if entity was decoded (and value assigned to _entityValue); false otherwise
/** * @return True if entity was decoded (and value assigned to <code>_entityValue</code>); * false otherwise */
protected final boolean decodeDecEntity() throws XMLStreamException { int value = _entityValue; while (_inputPtr < _inputEnd) { byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_SEMICOLON) { _entityValue = value; return true; } int ch = ((int) b) - INT_0; if (ch < 0 || ch > 9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } value = (value * 10) + ch; if (value > MAX_UNICODE_CHAR) { // Overflow? _entityValue = value; reportEntityOverflow(); } } _entityValue = value; return false; }
Method that verifies that given named entity is followed by a semi-colon (meaning next byte must be available for reading); and if so, whether it is one of pre-defined general entities.
Returns:Character of the expanded pre-defined general entity (if name matches one); zero if not.
/** * Method that verifies that given named entity is followed by * a semi-colon (meaning next byte must be available for reading); * and if so, whether it is one of pre-defined general entities. * * @return Character of the expanded pre-defined general entity * (if name matches one); zero if not. */
protected final int decodeGeneralEntity(PName entityName) throws XMLStreamException { // First things first: verify that we got semicolon afterwards byte b = _inputBuffer[_inputPtr++]; if (b != BYTE_SEMICOLON) { throwUnexpectedChar(decodeCharForError(b), " expected ';' following entity name (\""+entityName.getPrefixedName()+"\")"); } String name = entityName.getPrefixedName(); if (name == "amp") { return INT_AMP; } if (name == "lt") { return INT_LT; } if (name == "apos") { return INT_APOS; } if (name == "quot") { return INT_QUOTE; } if (name == "gt") { return INT_GT; } return 0; }
Method called when '<' and (what appears to be) a name start character have been seen.
/** * Method called when '&lt;' and (what appears to be) a name * start character have been seen. */
@Override protected int handleStartElementStart(byte b) throws XMLStreamException { PName elemName = parseNewName(b); _nextEvent = START_ELEMENT; if (elemName == null) { _state = STATE_SE_ELEM_NAME; return EVENT_INCOMPLETE; } initStartElement(elemName); return handleStartElement(); } @Override protected int handleStartElement() throws XMLStreamException { main_loop: while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } byte b; int c; switch (_state) { case STATE_SE_ELEM_NAME: { PName elemName = parsePName(); if (elemName == null) { return EVENT_INCOMPLETE; } initStartElement(elemName); } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } // Fall through to next state case STATE_SE_SPACE_OR_END: // obligatory space, or end if (_pendingInput != 0) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } // Ok, got a space, can move on } else { b = _inputBuffer[_inputPtr++]; c = (int) b & 0xFF; if (c <= INT_SPACE) { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } } else if (c == INT_GT) { // must be '/' or '>' return finishStartElement(false); } else if (c == INT_SLASH) { _state = STATE_SE_SEEN_SLASH; continue main_loop; } else { throwUnexpectedChar(decodeCharForError(b), " expected space, or '>' or \"/>\""); } } _state = STATE_SE_SPACE_OR_ATTRNAME; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } // can fall through, again: case STATE_SE_SPACE_OR_ATTRNAME: case STATE_SE_SPACE_OR_EQ: case STATE_SE_SPACE_OR_ATTRVALUE: /* Common to these states is that there may be leading space(s), * so let's see if any has to be skipped */ if (_pendingInput != 0) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } } b = _inputBuffer[_inputPtr++]; c = (int) b & 0xFF; 
while (c <= INT_SPACE) { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer[_inputPtr++]; c = (int) b & 0xFF; } switch (_state) { case STATE_SE_SPACE_OR_ATTRNAME: if (b == BYTE_SLASH) { _state = STATE_SE_SEEN_SLASH; continue main_loop; } if (b == BYTE_GT) { return finishStartElement(false); } { PName n = parseNewName(b); if (n == null) { _state = STATE_SE_ATTR_NAME; return EVENT_INCOMPLETE; } _state = STATE_SE_SPACE_OR_EQ; _elemAttrName = n; } continue main_loop; case STATE_SE_SPACE_OR_EQ: if (b != BYTE_EQ) { throwUnexpectedChar(decodeCharForError(b), " expected '='"); } _state = STATE_SE_SPACE_OR_ATTRVALUE; continue main_loop; case STATE_SE_SPACE_OR_ATTRVALUE: if (b != BYTE_QUOT && b != BYTE_APOS) { throwUnexpectedChar(decodeCharForError(b), " Expected a quote"); } initAttribute(b); continue main_loop; default: throwInternal(); } case STATE_SE_ATTR_NAME: { PName n = parsePName(); if (n == null) { return EVENT_INCOMPLETE; } _elemAttrName = n; _state = STATE_SE_SPACE_OR_EQ; } break; case STATE_SE_ATTR_VALUE_NORMAL: if (!handleAttrValue()) { return EVENT_INCOMPLETE; } _state = STATE_SE_SPACE_OR_END; break; case STATE_SE_ATTR_VALUE_NSDECL: if (!handleNsDecl()) { return EVENT_INCOMPLETE; } _state = STATE_SE_SPACE_OR_END; break; case STATE_SE_SEEN_SLASH: { b = _inputBuffer[_inputPtr++]; if (b != BYTE_GT) { throwUnexpectedChar(decodeCharForError(b), " expected '>'"); } return finishStartElement(true); } default: throwInternal(); } } } private void initStartElement(PName elemName) { String prefix = elemName.getPrefix(); if (prefix == null) { // element in default ns _elemAllNsBound = true; // which need not be bound } else { elemName = bindName(elemName, prefix); 
_elemAllNsBound = elemName.isBound(); } _tokenName = elemName; _currElem = new ElementScope(elemName, _currElem); _attrCount = 0; _currNsCount = 0; _elemAttrPtr = 0; _state = STATE_SE_SPACE_OR_END; } private void initAttribute(byte quoteChar) { _elemAttrQuote = quoteChar; PName attrName = _elemAttrName; String prefix = attrName.getPrefix(); boolean nsDecl; if (prefix == null) { // can be default ns decl: nsDecl = (attrName.getLocalName() == "xmlns"); } else { // May be a namespace decl though? if (prefix == "xmlns") { nsDecl = true; } else { attrName = bindName(attrName, prefix); if (_elemAllNsBound) { _elemAllNsBound = attrName.isBound(); } nsDecl = false; } } if (nsDecl) { _state = STATE_SE_ATTR_VALUE_NSDECL; // Ns decls use name buffer transiently _elemNsPtr = 0; ++_currNsCount; } else { _state = STATE_SE_ATTR_VALUE_NORMAL; // Regular attributes are appended, shouldn't reset ptr _attrCollector.startNewValue(attrName, _elemAttrPtr); } }
Method called to wrap up settings when the whole start (or empty) element has been parsed.
/** * Method called to wrap up settings when the whole start * (or empty) element has been parsed. */
private int finishStartElement(boolean emptyTag) throws XMLStreamException { _isEmptyTag = emptyTag; // Note: this call also checks attribute uniqueness int act = _attrCollector.finishLastValue(_elemAttrPtr); if (act < 0) { // error, dup attr indicated by -1 act = _attrCollector.getCount(); // let's get correct count reportInputProblem(_attrCollector.getErrorMsg()); } _attrCount = act; ++_depth; /* Was there any prefix that wasn't bound prior to use? * That's legal, assuming declaration was found later on... * let's check */ if (!_elemAllNsBound) { if (!_tokenName.isBound()) { // element itself unbound reportUnboundPrefix(_tokenName, false); } for (int i = 0, len = _attrCount; i < len; ++i) { PName attrName = _attrCollector.getName(i); if (!attrName.isBound()) { reportUnboundPrefix(attrName, true); } } } return (_currToken = START_ELEMENT); } private int handleEndElementStart() throws XMLStreamException { --_depth; _tokenName = _currElem.getName(); /* Ok, perhaps we can do this quickly? This works, if we * are expected to have the full name (plus one more byte * to indicate name end) in the current buffer: */ int size = _tokenName.sizeInQuads(); if ((_inputEnd - _inputPtr) < ((size << 2) + 1)) { // may need to load more _nextEvent = END_ELEMENT; _state = STATE_DEFAULT; _quadCount = _currQuad = _currQuadBytes = 0; /* No, need to take it slow. Can not yet give up, though, * without reading remainder of the buffer */ return handleEndElement(); } byte[] buf = _inputBuffer; // First all full chunks of 4 bytes (if any) --size; for (int qix = 0; qix < size; ++qix) { int ptr = _inputPtr; int q = (buf[ptr] << 24) | ((buf[ptr+1] & 0xFF) << 16) | ((buf[ptr+2] & 0xFF) << 8) | ((buf[ptr+3] & 0xFF)) ; _inputPtr += 4; // match? if (q != _tokenName.getQuad(qix)) { reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } /* After which we can deal with the last entry: it's bit * tricky as we don't actually fully know byte length... 
*/ int lastQ = _tokenName.getQuad(size); int q = buf[_inputPtr++] & 0xFF; if (q != lastQ) { // need second byte? q = (q << 8) | (buf[_inputPtr++] & 0xFF); if (q != lastQ) { // need third byte? q = (q << 8) | (buf[_inputPtr++] & 0xFF); if (q != lastQ) { // need full 4 bytes? q = (q << 8) | (buf[_inputPtr++] & 0xFF); if (q != lastQ) { // still no match? failure! reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } } } // Trailing space? int i2 = _inputBuffer[_inputPtr++] & 0xFF; while (i2 <= INT_SPACE) { if (i2 == INT_LF) { markLF(); } else if (i2 == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; _nextEvent = END_ELEMENT; _state = STATE_EE_NEED_GT; return EVENT_INCOMPLETE; } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (i2 != INT_SPACE && i2 != INT_TAB) { throwInvalidSpace(i2); } if (_inputPtr >= _inputEnd) { _nextEvent = END_ELEMENT; _state = STATE_EE_NEED_GT; return EVENT_INCOMPLETE; } i2 = _inputBuffer[_inputPtr++] & 0xFF; } if (i2 != INT_GT) { throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'"); } return (_currToken = END_ELEMENT); }
This method is the "slow" version of the above, used when the name of the end element may be split across an input buffer boundary
/** * This method is the "slow" version of the above, used when the name * of the end element may be split across an input buffer boundary */
private int handleEndElement() throws XMLStreamException { if (_state == STATE_DEFAULT) { // parsing name final PName elemName = _tokenName; final int quadSize = elemName.sizeInQuads() - 1; // need to ignore last for now for (; _quadCount < quadSize; ++_quadCount) { // first, full quads for (; _currQuadBytes < 4; ++_currQuadBytes) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } _currQuad = (_currQuad << 8) | (_inputBuffer[_inputPtr++] & 0xFF); } // match? if (_currQuad != elemName.getQuad(_quadCount)) { reportUnexpectedEndTag(elemName.getPrefixedName()); } _currQuad = _currQuadBytes = 0; } // So far so good! Now need to check the last quad: int lastQ = elemName.getLastQuad(); while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } int q = (_currQuad << 8); q |= (_inputBuffer[_inputPtr++] & 0xFF); _currQuad = q; if (q == lastQ) { // match break; } if (++_currQuadBytes > 3) { // no match, error reportUnexpectedEndTag(elemName.getPrefixedName()); break; // never gets here } } // Bueno. How about optional space, '>'? _state = STATE_EE_NEED_GT; } else if (_state != STATE_EE_NEED_GT) { throwInternal(); } if (_pendingInput != 0) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } // it's ignorable ws } // Trailing space? while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } int i2 = _inputBuffer[_inputPtr++] & 0xFF; if (i2 <= INT_SPACE) { if (i2 == INT_LF) { markLF(); } else if (i2 == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (i2 != INT_SPACE && i2 != INT_TAB) { throwInvalidSpace(i2); } continue; } if (i2 != INT_GT) { throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'"); } // Hah, done! 
return (_currToken = END_ELEMENT); } } /* /********************************************************************** /* Implementation of parsing API, character events /********************************************************************** */ @Override protected final int startCharacters(byte b) throws XMLStreamException { dummy_loop: do { // dummy loop, to allow break int c = (int) b & 0xFF; switch (_charTypes.TEXT_CHARS[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: /* Note: can not have pending input when this method * is called. No need to check that (could assert) */ if (_inputPtr >= _inputEnd) { // no more input available _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; return EVENT_INCOMPLETE; } c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer[_inputPtr++] & 0xFF; c |= (d << 8); } _pendingInput = c; return EVENT_INCOMPLETE; } c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer[_inputPtr++] & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? d = (int) _inputBuffer[_inputPtr++] & 0xFF; c |= (d << 16); } } _pendingInput = c; return EVENT_INCOMPLETE; } c = decodeUtf8_4(c); // Need a surrogate pair, have to call from here: _textBuilder.resetWithSurrogate(c); break dummy_loop; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); break; case XmlCharTypes.CT_LT: // should never get here case XmlCharTypes.CT_AMP: // - "" - throwInternal(); break; case XmlCharTypes.CT_RBRACKET: // ']]>'? // !!! 
TBI: check for "]]>" default: break; } _textBuilder.resetWithChar((char) c); } while (false); // dummy loop, for break if (_cfgCoalescing && !_cfgLazyParsing) { // In eager coalescing mode, must read it all return finishCharactersCoalescing(); } _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return _currToken; }
This method only gets called in non-coalescing mode; and if so, needs to parse as many characters of the current text segment from the current input block as possible.
/** * This method only gets called in non-coalescing mode; and if so, * needs to parse as many characters of the current text segment * from the current input block as possible. */
@Override
protected final void finishCharacters() throws XMLStreamException
{
    // Copies text content from the current input block into the text
    // builder's buffer until input runs out, a '<' is seen, or an entity
    // that cannot be expanded inline is seen. On exit the text builder's
    // length is set to the number of chars written to its current segment.

    /* Now: there should not usually be any pending input (as it's
     * handled when CHARACTERS segment started, and this method
     * only gets called exactly once)... but we may want to
     * revisit this subject when (if) coalescing mode is to be
     * tackled.
     */
    if (_pendingInput != 0) {
        // !!! TBI: needs to be changed for coalescing mode
        throwInternal();
    }

    final int[] TYPES = _charTypes.TEXT_CHARS;
    final byte[] inputBuffer = _inputBuffer;
    char[] outputBuffer = _textBuilder.getBufferWithoutReset();
    // Should have just one code point (one or two chars). Assert?
    int outPtr = _textBuilder.getCurrentLength();

    main_loop:
    while (true) {
        int c;
        // Then the tight ASCII non-funny-char loop:
        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            if (ptr >= _inputEnd) {
                // out of input: stop, keeping what has been decoded so far
                break main_loop;
            }
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            // Cap the inner loop so it can neither read past the input end
            // nor write past the end of the output buffer
            int max = _inputEnd;
            {
                int max2 = ptr + (outputBuffer.length - outPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    // non-zero type: needs special handling in the switch below
                    _inputPtr = ptr;
                    break ascii_loop;
                }
                outputBuffer[outPtr++] = (char) c;
            }
            _inputPtr = ptr;
        }
        // And then fallback for funny chars / UTF-8 multibytes:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            c = handleInvalidXmlChar(c);
            // no break here: control continues into the CR case
        case XmlCharTypes.CT_WS_CR:
            {
                if (_inputPtr >= _inputEnd) {
                    // can't tell yet whether an LF follows; remember the CR
                    _pendingInput = PENDING_STATE_CR;
                    break main_loop;
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
            }
            // CR and CR+LF are both output as a single LF
            c = INT_LF;
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            if (_inputPtr >= _inputEnd) {
                // second byte not available yet: stash the first byte
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            if ((_inputEnd - _inputPtr) < 2) {
                // Incomplete: pack the bytes seen so far into _pendingInput,
                // first byte in bits 0-7, second (if any) in bits 8-15
                if (_inputEnd > _inputPtr) { // 2 bytes available
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            if ((_inputEnd - _inputPtr) < 3) {
                // Incomplete: pack up to three bytes into _pendingInput
                // (bits 0-7, 8-15 and 16-23 in arrival order)
                if (_inputEnd > _inputPtr) { // at least 2 bytes?
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                    if (_inputEnd > _inputPtr) { // 3 bytes?
                        d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                        c |= (d << 16);
                    }
                }
                _pendingInput = c;
                break main_loop;
            }
            c = decodeUtf8_4(c);
            // NOTE(review): the value is split into surrogates without
            // subtracting 0x10000 here, so decodeUtf8_4 presumably returns
            // the code point already offset by 0x10000 -- confirm.
            // Let's add first part right away:
            outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            c = 0xDC00 | (c & 0x3FF);
            // And let the other char output down below
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            reportInvalidInitial(c);
            // no break here: control continues into the '<' case
        case XmlCharTypes.CT_LT:
            // push the '<' back so it is seen again by whoever runs next
            --_inputPtr;
            break main_loop;
        case XmlCharTypes.CT_AMP:
            c = handleEntityInCharacters();
            if (c == 0) { // not a successfully expanded char entity
                // _inputPtr set by entity expansion method
                // (on failure it is left just past '&'; step back onto it)
                --_inputPtr;
                break main_loop;
            }
            // Ok; does it need a surrogate though? (over 16 bits)
            if ((c >> 16) != 0) {
                c -= 0x10000;
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                // Need to ensure room for one more char
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
            }
            break;
        case XmlCharTypes.CT_RBRACKET: // ']]>'?
            /* 09-Mar-2007, tatus: This will not give 100% coverage,
             *  for it may be split across input buffer boundary.
             *  For now this will have to suffice though.
             */
            {
                // Let's then just count number of brackets --
                // in case they are not followed by '>'
                int count = 1;
                byte b = BYTE_NULL;
                while (_inputPtr < _inputEnd) {
                    b = inputBuffer[_inputPtr];
                    if (b != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr; // to skip past bracket
                    ++count;
                }
                // two or more ']' directly followed by '>' is reported
                if (b == BYTE_GT && count > 1) {
                    reportIllegalCDataEnd();
                }
                // Nope. Need to output all brackets, then; except
                // for one that can be left for normal output
                while (--count > 0) {
                    outputBuffer[outPtr++] = ']';
                    // Need to ensure room for one more char
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                }
            }
            // Can just output the first ']' along normal output
            break;

        // default:
            // Other types are not important here...
        }
        // We know there's room for one more:
        outputBuffer[outPtr++] = (char) c;
    }
    // Record how much of the current segment has been filled
    _textBuilder.setCurrentLength(outPtr);
}
Method called to handle entity encountered inside CHARACTERS segment, when trying to complete a non-coalescing text segment.

NOTE: unlike with generic parsing of named entities, where trailing semicolon needs to be left in place, here we should just process it right away.

Returns: Expanded (character) entity, if positive number; 0 if incomplete.
/** * Method called to handle entity encountered inside * CHARACTERS segment, when trying to complete a non-coalescing text segment. *<p> * NOTE: unlike with generic parsing of named entities, where trailing semicolon * needs to be left in place, here we should just process it right away. * * @return Expanded (character) entity, if positive number; 0 if incomplete. */
protected int handleEntityInCharacters() throws XMLStreamException { /* Thing that simplifies processing here is that handling * is pretty much optional: if there isn't enough data, we * just return 0 and are done with it. * * Also: we need at least 3 more characters for any character entity */ int ptr = _inputPtr; if ((ptr + 3) <= _inputEnd) { byte b = _inputBuffer[ptr++]; if (b == BYTE_HASH) { // numeric character entity if (_inputBuffer[ptr] == BYTE_x) { return handleHexEntityInCharacters(ptr+1); } return handleDecEntityInCharacters(ptr); } // general entity; maybe one of pre-defined ones if (b == BYTE_a) { // amp or apos? b = _inputBuffer[ptr++]; if (b == BYTE_m) { if ((ptr + 1) < _inputPtr && _inputBuffer[ptr] == BYTE_p && _inputBuffer[ptr+1] == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_AMP; } } else if (b == BYTE_p) { if ((ptr + 2) < _inputPtr && _inputBuffer[ptr] == BYTE_o && _inputBuffer[ptr+1] == BYTE_s && _inputBuffer[ptr+2] == BYTE_SEMICOLON) { _inputPtr = ptr + 3; return INT_APOS; } } } else if (b == BYTE_g) { // gt? if (_inputBuffer[ptr] == BYTE_t && _inputBuffer[ptr+1] == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_GT; } } else if (b == BYTE_l) { // lt? if (_inputBuffer[ptr] == BYTE_t && _inputBuffer[ptr+1] == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_LT; } } else if (b == BYTE_q) { // quot? if ((ptr + 3) < _inputPtr && _inputBuffer[ptr] == BYTE_u && _inputBuffer[ptr+1] == BYTE_o && _inputBuffer[ptr+2] == BYTE_t && _inputBuffer[ptr+3] == BYTE_SEMICOLON) { _inputPtr = ptr + 4; return INT_APOS; } } } // couldn't handle: return 0; } protected int handleDecEntityInCharacters(int ptr) throws XMLStreamException { byte b = _inputBuffer[ptr++]; final int end = _inputEnd; int value = 0; do { int ch = (int) b; if (ch > INT_9 || ch < INT_0) { throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } value = (value * 10) + (ch - INT_0); if (value > MAX_UNICODE_CHAR) { // Overflow? 
reportEntityOverflow(); } if (ptr >= end) { return 0; } b = _inputBuffer[ptr++]; } while (b != BYTE_SEMICOLON); _inputPtr = ptr; verifyXmlChar(value); return value; } protected int handleHexEntityInCharacters(int ptr) throws XMLStreamException { byte b = _inputBuffer[ptr++]; final int end = _inputEnd; int value = 0; do { int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } value = (value << 4) + ch; if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } if (ptr >= end) { return 0; } b = _inputBuffer[ptr++]; } while (b != BYTE_SEMICOLON); _inputPtr = ptr; verifyXmlChar(value); return value; }
Method called to handle split multi-byte character, by decoding it and appending to the text buffer, if possible.
Returns:True, if split character was completely handled; false if not
/** * Method called to handle split multi-byte character, by decoding * it and appending to the text buffer, if possible. * * @return True, if split character was completely handled; false * if not */
private final boolean handleAndAppendPending() throws XMLStreamException
{
    // Resolves whatever is stored in _pendingInput (a pending CR, or the
    // leading bytes of a split UTF-8 character) using newly available
    // input, and appends the resulting character(s) to the text builder.

    // First, need to have at least one more byte:
    if (_inputPtr >= _inputEnd) {
        return false;
    }
    int c = _pendingInput;
    // cleared up front; re-set below if the character is still incomplete
    _pendingInput = 0;

    // Possible \r\n linefeed?
    if (c < 0) { // markers are all negative
        if (c == PENDING_STATE_CR) {
            if (_inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            _textBuilder.append(CHAR_LF);
            return true;
        }
        // no other marker is handled by this method
        throwInternal();
    }

    // Nah, a multi-byte UTF-8 char:
    // Let's just re-test the first pending byte (in LSB):
    switch (_charTypes.TEXT_CHARS[c & 0xFF]) {
    case XmlCharTypes.CT_MULTIBYTE_2:
        // Easy: must have just one byte, did get another one:
        _textBuilder.append((char) decodeUtf8_2(c));
        break;
    case XmlCharTypes.CT_MULTIBYTE_3:
        {
            // Ok... so do we have one or two pending bytes?
            int next = _inputBuffer[_inputPtr++] & 0xFF;
            // second pending byte, if any, is stored in bits 8-15
            int c2 = (c >> 8);
            if (c2 == 0) { // just one; need two more
                if (_inputPtr >= _inputEnd) { // but got only one
                    _pendingInput = c | (next << 8);
                    return false;
                }
                int c3 = _inputBuffer[_inputPtr++] & 0xFF;
                c = decodeUtf8_3(c, next, c3);
            } else { // had two, got one, bueno:
                c = decodeUtf8_3((c & 0xFF), c2, next);
            }
            _textBuilder.append((char) c);
        }
        break;
    case XmlCharTypes.CT_MULTIBYTE_4:
        {
            int next = (int) _inputBuffer[_inputPtr++] & 0xFF;
            // Only had one?
            if ((c >> 8) == 0) { // ok, so need 3 more
                if (_inputPtr >= _inputEnd) { // just have 1
                    _pendingInput = c | (next << 8);
                    return false;
                }
                int c2 = _inputBuffer[_inputPtr++] & 0xFF;
                if (_inputPtr >= _inputEnd) { // almost, got 2
                    _pendingInput = c | (next << 8) | (c2 << 16);
                    return false;
                }
                int c3 = _inputBuffer[_inputPtr++] & 0xFF;
                c = decodeUtf8_4(c, next, c2, c3);
            } else { // had two or three
                // pending layout: byte 1 in bits 0-7, byte 2 in bits 8-15,
                // byte 3 (if present) in bits 16-23
                int c2 = (c >> 8) & 0xFF;
                int c3 = (c >> 16);
                if (c3 == 0) { // just two
                    if (_inputPtr >= _inputEnd) { // one short
                        _pendingInput = c | (next << 16);
                        return false;
                    }
                    c3 = _inputBuffer[_inputPtr++] & 0xFF;
                    c = decodeUtf8_4((c & 0xFF), c2, next, c3);
                } else { // had three, got last
                    c = decodeUtf8_4((c & 0xFF), c2, c3, next);
                }
            }
        }
        // Need a surrogate pair, have to call from here:
        _textBuilder.appendSurrogate(c);
        break;
    default: // should never occur:
        throwInternal();
    }
    return true;
}

/*
/**********************************************************************
/* Implementation of parsing API, skipping remainder CHARACTERS section
/**********************************************************************
 */
Method that will be called to skip all possible characters from the input buffer, but without blocking. Partial characters are not to be handled (no pending input is to be added).
Returns:True, if skipping ending with an unexpanded entity; false if not
/** * Method that will be called to skip all possible characters * from the input buffer, but without blocking. Partial * characters are not to be handled (no pending input * is to be added). * * @return True, if skipping ended with an unexpanded * entity; false if not */
@Override
protected boolean skipCharacters() throws XMLStreamException
{
    // Skips text content in the current input block without storing it.
    // Returns true when skipping stopped at a '<' (which is pushed back)
    // or after skipPending() resolved a pending entity; returns false
    // when input ran out first (possibly leaving state in _pendingInput).

    if (_pendingInput != 0) {
        if (!skipPending()) {
            return false;
        }
    }

    final int[] TYPES = _charTypes.TEXT_CHARS;
    final byte[] inputBuffer = _inputBuffer;

    main_loop:
    while (true) {
        int c;

        ascii_loop:
        while (true) {
            int ptr = _inputPtr;
            int max = _inputEnd;
            if (ptr >= max) {
                break main_loop;
            }
            while (ptr < max) {
                c = (int) inputBuffer[ptr++] & 0xFF;
                if (TYPES[c] != 0) {
                    // non-zero type: needs special handling in the switch below
                    _inputPtr = ptr;
                    break ascii_loop;
                }
            }
            _inputPtr = ptr;
        }
        // And then fallback for funny chars / UTF-8 multibytes:
        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            c = handleInvalidXmlChar(c);
            // no break here: control continues into the CR case
        case XmlCharTypes.CT_WS_CR:
            {
                if (_inputPtr >= _inputEnd) {
                    // can't tell yet whether an LF follows; remember the CR
                    _pendingInput = PENDING_STATE_CR;
                    break main_loop;
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
            }
            break;
        case XmlCharTypes.CT_WS_LF:
            markLF();
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            if (_inputPtr >= _inputEnd) {
                _pendingInput = c;
                break main_loop;
            }
            skipUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            if ((_inputEnd - _inputPtr) < 2) {
                // Incomplete: pack the bytes seen so far into _pendingInput,
                // first byte in bits 0-7, second (if any) in bits 8-15
                if (_inputEnd > _inputPtr) { // 2 bytes available
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                }
                _pendingInput = c;
                break main_loop;
            }
            // result is discarded; the call is made for its effect on input
            decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            if ((_inputEnd - _inputPtr) < 3) {
                // Incomplete: pack up to three bytes into _pendingInput
                // (bits 0-7, 8-15 and 16-23 in arrival order)
                if (_inputEnd > _inputPtr) { // at least 2 bytes?
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                    if (_inputEnd > _inputPtr) { // 3 bytes?
                        d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                        c |= (d << 16);
                    }
                }
                _pendingInput = c;
                break main_loop;
            }
            // result is discarded; the call is made for its effect on input
            decodeUtf8_4(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            reportInvalidInitial(c);
            // no break here: control continues into the '<' case
        case XmlCharTypes.CT_LT:
            // push the '<' back so it is seen again by whoever runs next
            --_inputPtr;
            return true;
        case XmlCharTypes.CT_AMP:
            c = skipEntityInCharacters();
            if (c == 0) { // not a successfully expanded char entity
                _pendingInput = PENDING_STATE_TEXT_AMP;
                // but we may have input to skip nonetheless..
                if (_inputPtr < _inputEnd) {
                    if (skipPending()) {
                        return true;
                    }
                }
                return false;
            }
            break;
        case XmlCharTypes.CT_RBRACKET: // ']]>'?
            /* !!! 09-Mar-2007, tatu: This will not give 100% coverage,
             *  for it may be split across input buffer boundary.
             *  For now this will have to suffice though.
             */
            {
                // Let's then just count number of brackets --
                // in case they are not followed by '>'
                int count = 1;
                byte b = BYTE_NULL;
                while (_inputPtr < _inputEnd) {
                    b = inputBuffer[_inputPtr];
                    if (b != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr; // to skip past bracket
                    ++count;
                }
                // two or more ']' directly followed by '>' is reported
                if (b == BYTE_GT && count > 1) {
                    reportIllegalCDataEnd();
                }
            }
            break;

        // default:
            // Other types are not important here...
        }
    }

    // Ran out of input, no entity encountered
    return false;
}

/**
 * Tries to resolve the state stored in <code>_pendingInput</code> while
 * skipping text: a pending CR, a partially seen entity reference, a
 * partially seen run of ']' characters, or the leading bytes of a split
 * UTF-8 character.
 *
 * @return True if the pending state was fully resolved (and cleared);
 *   false if more input is needed
 */
private final boolean skipPending() throws XMLStreamException
{
    // First, need to have at least one more byte:
    if (_inputPtr >= _inputEnd) {
        return false;
    }

    // Possible \r\n linefeed?
    if (_pendingInput < 0) { // markers are all negative
        // Loop: a 'break' out of the switch means the state was advanced
        // and should be re-dispatched if more input is available
        while (true) {
            switch (_pendingInput) {
            case PENDING_STATE_CR:
                _pendingInput = 0;
                if (_inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                return true;
            case PENDING_STATE_TEXT_AMP:
                {
                    byte b = _inputBuffer[_inputPtr++];
                    if (b == BYTE_HASH) {
                        _pendingInput = PENDING_STATE_TEXT_AMP_HASH;
                        break;
                    }
                    PName n = parseNewEntityName(b);
                    if (n == null) {
                        // name not complete yet
                        _pendingInput = PENDING_STATE_TEXT_IN_ENTITY;
                        return false;
                    }
                    int ch = decodeGeneralEntity(n);
                    if (ch == 0) {
                        // not expanded to a character: queue an
                        // ENTITY_REFERENCE event for this name
                        _tokenName = n;
                        _nextEvent = ENTITY_REFERENCE;
                    }
                }
                _pendingInput = 0;
                return true; // no matter what, we are done
            case PENDING_STATE_TEXT_AMP_HASH:
                _entityValue = 0;
                if (_inputBuffer[_inputPtr] == BYTE_x) {
                    ++_inputPtr;
                    if (decodeHexEntity()) {
                        _pendingInput = 0;
                        return true;
                    }
                    _pendingInput = PENDING_STATE_TEXT_HEX_ENTITY;
                    return false;
                }
                if (decodeDecEntity()) {
                    _pendingInput = 0;
                    return true;
                }
                _pendingInput = PENDING_STATE_TEXT_DEC_ENTITY;
                return false;
            case PENDING_STATE_TEXT_DEC_ENTITY:
                if (decodeDecEntity()) {
                    _pendingInput = 0;
                    return true;
                }
                return false;
            case PENDING_STATE_TEXT_HEX_ENTITY:
                if (decodeHexEntity()) {
                    _pendingInput = 0;
                    return true;
                }
                return false;
            case PENDING_STATE_TEXT_IN_ENTITY:
                {
                    PName n = parseEntityName();
                    if (n == null) {
                        return false;
                    }
                    int ch = decodeGeneralEntity(n);
                    if (ch == 0) {
                        _tokenName = n;
                        _nextEvent = ENTITY_REFERENCE;
                    }
                }
                _pendingInput = 0;
                return true;
            case PENDING_STATE_TEXT_BRACKET1:
                if (_inputBuffer[_inputPtr] != BYTE_RBRACKET) {
                    _pendingInput = 0;
                    return true;
                }
                ++_inputPtr;
                _pendingInput = PENDING_STATE_TEXT_BRACKET2;
                break;
            case PENDING_STATE_TEXT_BRACKET2:
                // may get sequence...
                {
                    byte b = _inputBuffer[_inputPtr];
                    if (b == BYTE_RBRACKET) {
                        // yet another ']': stay in this state
                        ++_inputPtr;
                        break;
                    }
                    if (b == BYTE_GT) { // problem!
                        ++_inputPtr;
                        reportInputProblem("Encountered ']]>' in text segment");
                    }
                }
                // nope, something else, reprocess
                _pendingInput = 0;
                return true;
            default:
                throwInternal();
            }
            if (_inputPtr >= _inputEnd) {
                return false;
            }
        }
    }

    // Nah, a multi-byte UTF-8 char:
    // Let's just re-test the first pending byte (in LSB):
    int c = _pendingInput;
    switch (_charTypes.TEXT_CHARS[c & 0xFF]) {
    case XmlCharTypes.CT_MULTIBYTE_2:
        // Easy: must have just one byte, did get another one:
        skipUtf8_2(c);
        break;
    case XmlCharTypes.CT_MULTIBYTE_3:
        {
            // Ok... so do we have one or two pending bytes?
            int next = _inputBuffer[_inputPtr++] & 0xFF;
            // second pending byte, if any, is stored in bits 8-15
            int c2 = (c >> 8);
            if (c2 == 0) { // just one; need two more
                if (_inputPtr >= _inputEnd) { // but got only one
                    _pendingInput = c | (next << 8);
                    return false;
                }
                int c3 = _inputBuffer[_inputPtr++] & 0xFF;
                decodeUtf8_3(c, next, c3);
            } else { // had two, got one, bueno:
                decodeUtf8_3((c & 0xFF), c2, next);
            }
        }
        break;
    case XmlCharTypes.CT_MULTIBYTE_4:
        {
            int next = (int) _inputBuffer[_inputPtr++] & 0xFF;
            // Only had one?
            if ((c >> 8) == 0) { // ok, so need 3 more
                if (_inputPtr >= _inputEnd) { // just have 1
                    _pendingInput = c | (next << 8);
                    return false;
                }
                int c2 = _inputBuffer[_inputPtr++] & 0xFF;
                if (_inputPtr >= _inputEnd) { // almost, got 2
                    _pendingInput = c | (next << 8) | (c2 << 16);
                    return false;
                }
                int c3 = _inputBuffer[_inputPtr++] & 0xFF;
                decodeUtf8_4(c, next, c2, c3);
            } else { // had two or three
                // pending layout: byte 1 in bits 0-7, byte 2 in bits 8-15,
                // byte 3 (if present) in bits 16-23
                int c2 = (c >> 8) & 0xFF;
                int c3 = (c >> 16);
                if (c3 == 0) { // just two
                    if (_inputPtr >= _inputEnd) { // one short
                        _pendingInput = c | (next << 16);
                        return false;
                    }
                    c3 = _inputBuffer[_inputPtr++] & 0xFF;
                    decodeUtf8_4((c & 0xFF), c2, next, c3);
                } else { // had three, got last
                    decodeUtf8_4((c & 0xFF), c2, c3, next);
                }
            }
        }
        break;
    default: // should never occur:
        throwInternal();
    }
    _pendingInput = 0;
    return true;
}
Method called to handle entity encountered inside CHARACTERS segment, when skipping the remainder of a text segment.
Returns:Expanded (character) entity, if positive number; 0 if incomplete.
/** * Method called to handle entity encountered inside * CHARACTERS segment, when skipping the remainder of a text segment. * * @return Expanded (character) entity, if positive number; 0 if incomplete * or otherwise not handled here. */
private int skipEntityInCharacters() throws XMLStreamException { /* Thing that simplifies processing here is that handling * is pretty much optional: if there isn't enough data, we * just return 0 and are done with it. * * Also: we need at least 3 more characters for any character entity */ int ptr = _inputPtr; if ((ptr + 3) <= _inputEnd) { byte b = _inputBuffer[ptr++]; if (b == BYTE_HASH) { // numeric character entity if (_inputBuffer[ptr] == BYTE_x) { return handleHexEntityInCharacters(ptr+1); } return handleDecEntityInCharacters(ptr); } // general entity; maybe one of pre-defined ones if (b == BYTE_a) { // amp or apos? b = _inputBuffer[ptr++]; if (b == BYTE_m) { if ((ptr + 1) < _inputPtr && _inputBuffer[ptr] == BYTE_p && _inputBuffer[ptr+1] == BYTE_SEMICOLON) { _inputPtr = ptr + 2; // NOTE: do skip semicolon as well return INT_AMP; } } else if (b == BYTE_p) { if ((ptr + 2) < _inputPtr && _inputBuffer[ptr] == BYTE_o && _inputBuffer[ptr+1] == BYTE_s && _inputBuffer[ptr+2] == BYTE_SEMICOLON) { _inputPtr = ptr + 3; return INT_APOS; } } } else if (b == BYTE_g) { // gt? if (_inputBuffer[ptr] == BYTE_t && _inputBuffer[ptr+1] == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_GT; } } else if (b == BYTE_l) { // lt? if (_inputBuffer[ptr] == BYTE_t && _inputBuffer[ptr+1] == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_LT; } } else if (b == BYTE_q) { // quot? if ((ptr + 3) < _inputPtr && _inputBuffer[ptr] == BYTE_u && _inputBuffer[ptr+1] == BYTE_o && _inputBuffer[ptr+2] == BYTE_t && _inputBuffer[ptr+3] == BYTE_SEMICOLON) { _inputPtr = ptr + 4; return INT_APOS; } } } // couldn't handle: return 0; }
Coalescing mode is (and will) not be implemented for non-blocking parsers, so this method should never get called.
/** * Coalescing mode is (and will) not be implemented for non-blocking * parsers, so this method should never get called. */
@Override
protected boolean skipCoalescedText() throws XMLStreamException
{
    // Not expected to be reached; reported via throwInternal()
    throwInternal();
    // return statement is required by the signature
    return false;
}

/*
/**********************************************************************
/* Implementation of parsing API, element/attr events
/**********************************************************************
 */
Returns:True, if the whole value was read; false if only part (due to buffer ending)
/** * @return True, if the whole value was read; false if * only part (due to buffer ending) */
@Override
protected boolean handleAttrValue() throws XMLStreamException
{
    // Reads attribute value content from the current input block into the
    // attribute collector's value buffer, until the closing quote is seen
    // (returns true) or input runs out (returns false; any partial state is
    // left in _pendingInput).

    // First; any pending input?
    if (_pendingInput != 0) {
        if (!handleAttrValuePending()) {
            return false;
        }
        _pendingInput = 0;
    }

    char[] attrBuffer = _attrCollector.continueValue();
    final int[] TYPES = _charTypes.ATTR_CHARS;
    final int quoteChar = (int) _elemAttrQuote;

    value_loop:
    while (true) {
        int c;

        ascii_loop:
        while (true) {
            if (_inputPtr >= _inputEnd) {
                return false;
            }
            if (_elemAttrPtr >= attrBuffer.length) {
                attrBuffer = _attrCollector.valueBufferFull();
            }
            // Cap the inner loop so it can neither read past the input end
            // nor write past the end of the value buffer
            int max = _inputEnd;
            {
                int max2 = _inputPtr + (attrBuffer.length - _elemAttrPtr);
                if (max2 < max) {
                    max = max2;
                }
            }
            while (_inputPtr < max) {
                c = (int) _inputBuffer[_inputPtr++] & 0xFF;
                if (TYPES[c] != 0) {
                    // non-zero type: needs special handling in the switch below
                    break ascii_loop;
                }
                attrBuffer[_elemAttrPtr++] = (char) c;
            }
        }

        switch (TYPES[c]) {
        case XmlCharTypes.CT_INVALID:
            c = handleInvalidXmlChar(c);
            // no break here: control continues into the CR case
        case XmlCharTypes.CT_WS_CR:
            if (_inputPtr >= _inputEnd) {
                // can't tell yet whether an LF follows; remember the CR
                _pendingInput = PENDING_STATE_CR;
                return false;
            }
            if (_inputBuffer[_inputPtr] == BYTE_LF) {
                ++_inputPtr;
            }
            // fall through
        case XmlCharTypes.CT_WS_LF:
            markLF();
            // fall through
        case XmlCharTypes.CT_WS_TAB:
            // Plus, need to convert these all to simple space
            c = INT_SPACE;
            break;
        case XmlCharTypes.CT_MULTIBYTE_2:
            if (_inputPtr >= _inputEnd) {
                _pendingInput = c;
                return false;
            }
            c = decodeUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            if ((_inputEnd - _inputPtr) < 2) {
                // Incomplete: pack the bytes seen so far into _pendingInput,
                // first byte in bits 0-7, second (if any) in bits 8-15
                if (_inputEnd > _inputPtr) { // 2 bytes available
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                }
                _pendingInput = c;
                return false;
            }
            c = decodeUtf8_3(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            if ((_inputEnd - _inputPtr) < 3) {
                // Incomplete: pack up to three bytes into _pendingInput
                // (bits 0-7, 8-15 and 16-23 in arrival order)
                if (_inputEnd > _inputPtr) { // at least 2 bytes?
                    int d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                    c |= (d << 8);
                    if (_inputEnd > _inputPtr) { // 3 bytes?
                        d = (int) _inputBuffer[_inputPtr++] & 0xFF;
                        c |= (d << 16);
                    }
                }
                _pendingInput = c;
                return false;
            }
            c = decodeUtf8_4(c);
            // Let's add first part right away:
            attrBuffer[_elemAttrPtr++] = (char) (0xD800 | (c >> 10));
            c = 0xDC00 | (c & 0x3FF);
            // make sure there is room for the second half, appended below
            if (_elemAttrPtr >= attrBuffer.length) {
                attrBuffer = _attrCollector.valueBufferFull();
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_N:
            reportInvalidInitial(c);
            // no break here: control continues into the '<' case
        case XmlCharTypes.CT_LT:
            throwUnexpectedChar(c, "'<' not allowed in attribute value");
            // no break here either
        case XmlCharTypes.CT_AMP:
            c = handleEntityInAttributeValue();
            if (c <= 0) { // general entity; should never happen
                if (c < 0) { // end-of-input
                    return false;
                }
                reportUnexpandedEntityInAttr(_elemAttrName, false);
            }
            // Ok; does it need a surrogate though? (over 16 bits)
            if ((c >> 16) != 0) {
                c -= 0x10000;
                attrBuffer[_elemAttrPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                if (_elemAttrPtr >= attrBuffer.length) {
                    attrBuffer = _attrCollector.valueBufferFull();
                }
            }
            break;
        case XmlCharTypes.CT_ATTR_QUOTE:
            if (c == quoteChar) {
                // matching closing quote: value is complete
                break value_loop;
            }
            // the other kind of quote is plain content; appended below

        // default:
            // Other chars are not important here...
        }
        // We know there's room for at least one char without checking
        attrBuffer[_elemAttrPtr++] = (char) c;
    }

    return true; // yeah, we're done!
}
Returns: True if the partial information was successfully handled; false if not
/** * @return True if the partial information was successfully handled; * false if not */
private final boolean handleAttrValuePending() throws XMLStreamException { if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return false; } char[] attrBuffer = _attrCollector.continueValue(); if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } // All LFs get converted to spaces, in attribute values attrBuffer[_elemAttrPtr++] = ' '; return true; } // otherwise must be related to entity handling within attribute value if (_inputPtr >= _inputEnd) { return false; } int ch; if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP) { byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_HASH) { // numeric character entity _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH; if (_inputPtr >= _inputEnd) { return false; } if (_inputBuffer[_inputPtr] == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else { PName entityName = parseNewEntityName(b); if (entityName == null) { _pendingInput = PENDING_STATE_ATTR_VALUE_ENTITY_NAME; return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH) { if (_inputBuffer[_inputPtr] == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH_X) { ch = handleHexEntityInAttribute(true); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_ENTITY_NAME) { PName entityName = parseEntityName(); if (entityName == null) { return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within 
attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_DEC_DIGIT) { ch = handleDecEntityInAttribute(false); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_HEX_DIGIT) { ch = handleHexEntityInAttribute(false); } else { // nope, split UTF-8 char // Nah, a multi-byte UTF-8 char. Alas, can't use shared method, as results // don't go in shared text buffer... ch = handleAttrValuePendingUTF8(); } if (ch == 0) { // wasn't resolved return false; } char[] attrBuffer = _attrCollector.continueValue(); // Ok; does it need a surrogate though? (over 16 bits) if ((ch >> 16) != 0) { ch -= 0x10000; if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } attrBuffer[_elemAttrPtr++] = (char) (0xD800 | (ch >> 10)); ch = 0xDC00 | (ch & 0x3FF); } if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } attrBuffer[_elemAttrPtr++] = (char) ch; return true; // done it! } private final int handleAttrValuePendingUTF8() throws XMLStreamException { // note: we know there must be at least one byte available at this point int c = _pendingInput; _pendingInput = 0; // Let's just re-test the first pending byte (in LSB): switch (_charTypes.TEXT_CHARS[c & 0xFF]) { case XmlCharTypes.CT_MULTIBYTE_2: // Easy: must have just one byte, did get another one: return decodeUtf8_2(c); case XmlCharTypes.CT_MULTIBYTE_3: { // Ok... so do we have one or two pending bytes? int next = _inputBuffer[_inputPtr++] & 0xFF; int c2 = (c >> 8); if (c2 == 0) { // just one; need two more if (_inputPtr >= _inputEnd) { // but got only one _pendingInput = c | (next << 8); return 0; } int c3 = _inputBuffer[_inputPtr++] & 0xFF; c = decodeUtf8_3(c, next, c3); } else { // had two, got one, bueno: c = decodeUtf8_3((c & 0xFF), c2, next); } return c; } case XmlCharTypes.CT_MULTIBYTE_4: { int next = (int) _inputBuffer[_inputPtr++] & 0xFF; // Only had one? 
if ((c >> 8) == 0) { // ok, so need 3 more if (_inputPtr >= _inputEnd) { // just have 1 _pendingInput = c | (next << 8); return 0; } int c2 = _inputBuffer[_inputPtr++] & 0xFF; if (_inputPtr >= _inputEnd) { // almost, got 2 _pendingInput = c | (next << 8) | (c2 << 16); return 0; } int c3 = _inputBuffer[_inputPtr++] & 0xFF; c = decodeUtf8_4(c, next, c2, c3); } else { // had two or three int c2 = (c >> 8) & 0xFF; int c3 = (c >> 16); if (c3 == 0) { // just two if (_inputPtr >= _inputEnd) { // one short _pendingInput = c | (next << 16); return 0; } c3 = _inputBuffer[_inputPtr++] & 0xFF; c = decodeUtf8_4((c & 0xFF), c2, next, c3); } else { // had three, got last c = decodeUtf8_4((c & 0xFF), c2, c3, next); } } return c; } default: // should never occur: throwInternal(); return 0; // never gets here } } private final int handleDecEntityInAttribute(boolean starting) throws XMLStreamException { byte b = _inputBuffer[_inputPtr++]; // we know one is available if (starting) { int ch = (int) b; if (ch < INT_0 || ch > INT_9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } _pendingInput = PENDING_STATE_ATTR_VALUE_DEC_DIGIT; _entityValue = ch - INT_0; if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer[_inputPtr++]; } while (b != BYTE_SEMICOLON) { int ch = ((int) b) - INT_0; if (ch < 0 || ch > 9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } int value = (_entityValue * 10) + ch; _entityValue = value; if (value > MAX_UNICODE_CHAR) { // Overflow? 
reportEntityOverflow(); } if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer[_inputPtr++]; } verifyXmlChar(_entityValue); _pendingInput = 0; return _entityValue; } private final int handleHexEntityInAttribute(boolean starting) throws XMLStreamException { byte b = _inputBuffer[_inputPtr++]; // we know one is available boolean firstLoop = starting; while (b != BYTE_SEMICOLON) { int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } int value = ch; if (firstLoop) { _pendingInput = PENDING_STATE_ATTR_VALUE_HEX_DIGIT; firstLoop = false; } else { value += (_entityValue << 4); if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } } _entityValue = value; if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer[_inputPtr++]; } verifyXmlChar(_entityValue); _pendingInput = 0; return _entityValue; }
Method called to handle entity encountered inside attribute value.
Returns:Value of expanded character entity, if processed (which must be 1 or above); 0 for general entity, or -1 for "not enough input"
/** * Method called to handle entity encountered inside attribute value. * * @return Value of expanded character entity, if processed (which must be * 1 or above); 0 for general entity, or -1 for "not enough input" */
protected int handleEntityInAttributeValue() throws XMLStreamException { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP; return -1; } byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_HASH) { // numeric character entity _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH; if (_inputPtr >= _inputEnd) { return -1; } int ch; if (_inputBuffer[_inputPtr] == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return -1; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } if (ch == 0) { return -1; } return ch; } PName entityName = parseNewEntityName(b); if (entityName == null) { _pendingInput = PENDING_STATE_ATTR_VALUE_ENTITY_NAME; return -1; } int ch = decodeGeneralEntity(entityName); if (ch != 0) { return ch; } _tokenName = entityName; return 0; } @Override protected boolean handleNsDecl() throws XMLStreamException { final int[] TYPES = _charTypes.ATTR_CHARS; char[] attrBuffer = _nameBuffer; final int quoteChar = (int) _elemAttrQuote; // First; any pending input? 
if (_pendingInput != 0) { if (!handleNsValuePending()) { return false; } _pendingInput = 0; } value_loop: while (true) { int c; ascii_loop: while (true) { if (_inputPtr >= _inputEnd) { return false; } if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } int max = _inputEnd; { int max2 = _inputPtr + (attrBuffer.length - _elemNsPtr); if (max2 < max) { max = max2; } } while (_inputPtr < max) { c = (int) _inputBuffer[_inputPtr++] & 0xFF; if (TYPES[c] != 0) { break ascii_loop; } attrBuffer[_elemNsPtr++] = (char) c; } } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return false; } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } // fall through case XmlCharTypes.CT_WS_LF: markLF(); // fall through case XmlCharTypes.CT_WS_TAB: // Plus, need to convert these all to simple space c = INT_SPACE; break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; return false; } c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer[_inputPtr++] & 0xFF; c |= (d << 8); } _pendingInput = c; return false; } c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer[_inputPtr++] & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? 
d = (int) _inputBuffer[_inputPtr++] & 0xFF; c |= (d << 16); } } _pendingInput = c; return false; } c = decodeUtf8_4(c); // Let's add first part right away: attrBuffer[_elemNsPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: throwUnexpectedChar(c, "'<' not allowed in attribute value"); case XmlCharTypes.CT_AMP: c = handleEntityInAttributeValue(); if (c <= 0) { // general entity; should never happen if (c < 0) { // end-of-input return false; } reportUnexpandedEntityInAttr(_elemAttrName, true); } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; attrBuffer[_elemNsPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } } break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break value_loop; } // default: // Other chars are not important here... } // We know there's room for at least one char without checking attrBuffer[_elemNsPtr++] = (char) c; } /* Simple optimization: for default ns removal (or, with * ns 1.1, any other as well), will use empty value... no * need to try to intern: */ int attrPtr = _elemNsPtr; if (attrPtr == 0) { bindNs(_elemAttrName, ""); } else { String uri = _config.canonicalizeURI(attrBuffer, attrPtr); bindNs(_elemAttrName, uri); } return true; }
Returns: True if the partial information was successfully handled; false if not
/** * @return True if the partial information was succesfully handled; * false if not */
private final boolean handleNsValuePending() throws XMLStreamException { if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return false; } char[] attrBuffer = _nameBuffer; if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } // All lfs get converted to spaces, in attribute values attrBuffer[_elemNsPtr++] = ' '; return true; } // otherwise must be related to entity handling within attribute value if (_inputPtr >= _inputEnd) { return false; } int ch; if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP) { byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_HASH) { // numeric character entity _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH; if (_inputPtr >= _inputEnd) { return false; } if (_inputBuffer[_inputPtr] == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else { PName entityName = parseNewEntityName(b); if (entityName == null) { _pendingInput = PENDING_STATE_ATTR_VALUE_ENTITY_NAME; return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH) { if (_inputBuffer[_inputPtr] == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH_X) { ch = handleHexEntityInAttribute(true); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_ENTITY_NAME) { PName entityName = parseEntityName(); if (entityName == null) { return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities 
within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_DEC_DIGIT) { ch = handleDecEntityInAttribute(false); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_HEX_DIGIT) { ch = handleHexEntityInAttribute(false); } else { // 05-Aug-2012, tatu: Apparently we can end up here too... ch = handleAttrValuePendingUTF8(); } if (ch == 0) { // wasn't resolved return false; } char[] attrBuffer = _nameBuffer; // Ok; does it need a surrogate though? (over 16 bits) if ((ch >> 16) != 0) { ch -= 0x10000; if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } attrBuffer[_elemNsPtr++] = (char) (0xD800 | (ch >> 10)); ch = 0xDC00 | (ch & 0x3FF); } if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } attrBuffer[_elemNsPtr++] = (char) ch; return true; // done it! } /* /********************************************************************** /* Common name/entity parsing /********************************************************************** */ @Override protected final PName parseNewName(byte b) throws XMLStreamException { int q = b & 0xFF; // Let's do just quick sanity check first; a thorough check will be // done later on if necessary, now we'll just do the very cheap // check to catch extra spaces etc. if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode throwUnexpectedChar(q, "; expected a name start character"); } _quadCount = 0; _currQuad = q; _currQuadBytes = 1; return parsePName(); }
This method can (for now?) be shared between all Ascii-based encodings, since it only does coarse validity checking -- real checks are done in different method.

Some notes about the assumptions this implementation makes:

  • Well-formed xml content can not end with a name: as such, end-of-input is an error and we can throw an exception
/** * This method can (for now?) be shared between all Ascii-based * encodings, since it only does coarse validity checking -- real * checks are done in different method. *<p> * Some notes about assumption implementation makes: *<ul> * <li>Well-formed xml content can not end with a name: as such, * end-of-input is an error and we can throw an exception * </li> * </ul> */
@Override protected final PName parsePName() throws XMLStreamException { int q = _currQuad; while (true) { int i; switch (_currQuadBytes) { case 0: if (_inputPtr >= _inputEnd) { return null; // all pointers have been set } q = _inputBuffer[_inputPtr++] & 0xFF; // Since name char validity is checked later on, we only need // to be able to reliably see the end of the name... and those // are simple enough so that we can just compare; lookup table // won't speed things up (according to profiler) if (q < 65) { // 'A' // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (q < 45 || q > 58 || q == 47) { // End of name return findPName(q, 0); } } // fall through case 1: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 1; return null; } i = _inputBuffer[_inputPtr++] & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 1); } } q = (q << 8) | i; // fall through case 2: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 2; return null; } i = _inputBuffer[_inputPtr++] & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 2); } } q = (q << 8) | i; // fall through case 3: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 3; return null; } i = _inputBuffer[_inputPtr++] & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 3); } } q = (q << 8) | i; } // If we get this far, need to add full quad into result array and update state if (_quadCount == 0) { // first quad _quadBuffer[0] = q; _quadCount = 1; } else { if (_quadCount >= _quadBuffer.length) { // let's just double? 
_quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[_quadCount++] = q; } _currQuadBytes = 0; } } protected final PName parseNewEntityName(byte b) throws XMLStreamException { int q = b & 0xFF; if (q < INT_A) { throwUnexpectedChar(q, "; expected a name start character"); } _quadCount = 0; _currQuad = q; _currQuadBytes = 1; return parseEntityName(); } protected final PName parseEntityName() throws XMLStreamException { int q = _currQuad; while (true) { int i; switch (_currQuadBytes) { case 0: if (_inputPtr >= _inputEnd) { return null; // all pointers have been set } q = _inputBuffer[_inputPtr++] & 0xFF; /* Since name char validity is checked later on, we only * need to be able to reliably see the end of the name... * and those are simple enough so that we can just * compare; lookup table won't speed things up (according * to profiler) */ if (q < 65) { // 'A' // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (q < 45 || q > 58 || q == 47) { // apos, quot? if (_quadCount == 1) { q = _quadBuffer[0]; if (q == EntityNames.ENTITY_APOS_QUAD) { --_inputPtr; return EntityNames.ENTITY_APOS; } if (q == EntityNames.ENTITY_QUOT_QUAD) { --_inputPtr; return EntityNames.ENTITY_QUOT; } } // Nope, generic: return findPName(q, 0); } } // fall through case 1: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 1; return null; } i = _inputBuffer[_inputPtr++] & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 1); } } q = (q << 8) | i; // fall through case 2: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 2; return null; } i = _inputBuffer[_inputPtr++] & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { // lt or gt? 
if (_quadCount == 0) { if (q == EntityNames.ENTITY_GT_QUAD) { --_inputPtr; return EntityNames.ENTITY_GT; } if (q == EntityNames.ENTITY_LT_QUAD) { --_inputPtr; return EntityNames.ENTITY_LT; } } return findPName(q, 2); } } q = (q << 8) | i; // fall through case 3: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 3; return null; } i = _inputBuffer[_inputPtr++] & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { // amp? if (_quadCount == 0) { if (q == EntityNames.ENTITY_AMP_QUAD) { --_inputPtr; return EntityNames.ENTITY_AMP; } } return findPName(q, 3); } } q = (q << 8) | i; } /* If we get this far, need to add full quad into * result array and update state */ if (_quadCount == 0) { // first quad _quadBuffer[0] = q; _quadCount = 1; } else { if (_quadCount >= _quadBuffer.length) { // let's just double? _quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[_quadCount++] = q; } _currQuadBytes = 0; } } /* /********************************************************************** /* Internal methods, LF handling /********************************************************************** */
Method called when there is a pending \r (from past buffer), and we need to see whether it is followed by \n.
Returns: True if the linefeed was successfully processed (had enough input data to do that); or false if there is no data available to check this
/** * Method called when there is a pending \r (from past buffer), * and we need to see * * @return True if the linefeed was succesfully processed (had * enough input data to do that); or false if there is no * data available to check this */
@Override protected final boolean handlePartialCR() { // sanity check if (_pendingInput != PENDING_STATE_CR) { throwInternal(); } if (_inputPtr >= _inputEnd) { return false; } _pendingInput = 0; if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } ++_currRow; _rowStartOffset = _inputPtr; return true; } /* /********************************************************************** /* Multi-byte char decoding /********************************************************************** */

Note: caller must guarantee enough data is available before calling the method

/**
     * Decodes a 2-byte UTF-8 character: reads the second byte from the
     * input buffer, checks that it is a continuation byte, and combines it
     * with the low 5 bits of the first byte.
     *<p>
     * Note: caller must guarantee enough data is available before
     * calling the method
     *
     * @param c First byte of the character
     *
     * @return Decoded character
     */
    protected final int decodeUtf8_2(int c) throws XMLStreamException
    {
        final int second = _inputBuffer[_inputPtr++];
        if ((second & 0xC0) != 0x080) {
            reportInvalidOther(second & 0xFF, _inputPtr);
        }
        final int high = (c & 0x1F) << 6;
        return high | (second & 0x3F);
    }

    /**
     * Consumes the second byte of a 2-byte UTF-8 character, only checking
     * that it is a continuation byte; nothing is decoded.
     *
     * @param c First byte of the character (not used)
     */
    protected final void skipUtf8_2(int c) throws XMLStreamException
    {
        final int second = _inputBuffer[_inputPtr++];
        if ((second & 0xC0) != 0x080) {
            reportInvalidOther(second & 0xFF, _inputPtr);
        }
    }

Note: caller must guarantee enough data is available before calling the method

/** *<p> * Note: caller must guarantee enough data is available before * calling the method */
protected final int decodeUtf8_3(int c1) throws XMLStreamException { c1 &= 0x0F; int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } int c = (c1 << 6) | (d & 0x3F); d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) { c = handleInvalidXmlChar(c); } } } return c; } protected final int decodeUtf8_3(int c1, int c2, int c3) throws XMLStreamException { // Note: first char is assumed to have been checked if ((c2 & 0xC0) != 0x080) { reportInvalidOther(c2 & 0xFF, _inputPtr-1); } if ((c3 & 0xC0) != 0x080) { reportInvalidOther(c3 & 0xFF, _inputPtr); } int c = ((c1 & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) { c = handleInvalidXmlChar(c); } } } return c; } protected final int decodeUtf8_4(int c) throws XMLStreamException { int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = ((c & 0x07) << 6) | (d & 0x3F); d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } /* note: won't change it to negative here, since caller * already knows it'll need a surrogate */ return ((c << 6) | (d & 0x3F)) - 0x10000; }
Returns:Character value minus 0x10000; this so that caller can readily expand it to actual surrogates
/**
     * Variant of 4-byte decoding for the case where all four bytes have
     * already been read from input.
     *
     * @param c1 First byte of the character (assumed to have been checked,
     *   but not yet masked)
     * @param c2 Second byte; checked to be a continuation byte
     * @param c3 Third byte; checked to be a continuation byte
     * @param c4 Fourth byte; checked to be a continuation byte
     *
     * @return Character value <b>minus 0x10000</b>; this so that caller
     *   can readily expand it to actual surrogates
     */
    protected final int decodeUtf8_4(int c1, int c2, int c3, int c4) throws XMLStreamException
    {
        if ((c2 & 0xC0) != 0x080) {
            reportInvalidOther(c2 & 0xFF, _inputPtr-2);
        }
        int result = ((c1 & 0x07) << 6) | (c2 & 0x3F);

        if ((c3 & 0xC0) != 0x080) {
            reportInvalidOther(c3 & 0xFF, _inputPtr-1);
        }
        result = (result << 6) | (c3 & 0x3F);

        if ((c4 & 0xC0) != 0x080) {
            reportInvalidOther(c4 & 0xFF, _inputPtr);
        }
        result = (result << 6) | (c4 & 0x3F);

        return result - 0x10000;
    }
}