/* Aalto XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import java.io.*;

import javax.xml.stream.XMLStreamException;


import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.impl.IoStreamException;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.TextBuilder;
import com.fasterxml.aalto.util.XmlCharTypes;

Base class for various byte stream based scanners (generally one for each type of encoding supported).
/** * Base class for various byte stream based scanners (generally one * for each type of encoding supported). */
public abstract class StreamScanner extends ByteBasedScanner { /* /********************************************************************** /* Configuration, input, buffering /********************************************************************** */
Underlying InputStream to use for reading content.
/** * Underlying InputStream to use for reading content. */
protected InputStream _in; protected byte[] _inputBuffer; /* /********************************************************************** /* Character, name decoding /********************************************************************** */
This is a simple container object that is used to access the decoding tables for characters. Indirection is needed since we actually support multiple utf-8 compatible encodings, not just utf-8 itself.
/** * This is a simple container object that is used to access the * decoding tables for characters. Indirection is needed since * we actually support multiple utf-8 compatible encodings, not * just utf-8 itself. */
protected final XmlCharTypes _charTypes;
For now, symbol table contains prefixed names. In future it is possible that they may be split into prefixes and local names?
/** * For now, symbol table contains prefixed names. In future it is * possible that they may be split into prefixes and local names? */
protected final ByteBasedPNameTable _symbols;
This buffer is used for name parsing. Will be expanded if/as needed; 32 ints can hold names 128 ascii chars long.
/** * This buffer is used for name parsing. Will be expanded if/as * needed; 32 ints can hold names 128 ascii chars long. */
protected int[] _quadBuffer = new int[32]; /* /********************************************************************** /* Life-cycle /********************************************************************** */ public StreamScanner(ReaderConfig cfg, InputStream in, byte[] buffer, int ptr, int last) { super(cfg); _charTypes = cfg.getCharTypes(); _symbols = cfg.getBBSymbols(); _in = in; _inputBuffer = buffer; _inputPtr = ptr; _inputEnd = last; } @Override protected void _releaseBuffers() { super._releaseBuffers(); if (_symbols.maybeDirty()) { _config.updateBBSymbols(_symbols); } // Note: if we have block input (_in == null), the buffer we use is // not owned by scanner, can't recycle. // Also note that this method will always get called before _closeSource() // so that _in won't be cleared before we have a chance to see it. if (_in != null) { if (_inputBuffer != null) { _config.freeFullBBuffer(_inputBuffer); _inputBuffer = null; } } } @Override protected void _closeSource() throws IOException { if (_in != null) { _in.close(); _in = null; } } /* /********************************************************************** /* Abstract methods for sub-classes to implement /********************************************************************** */ protected abstract int handleEntityInText(boolean inAttr) throws XMLStreamException; protected abstract String parsePublicId(byte quoteChar) throws XMLStreamException; protected abstract String parseSystemId(byte quoteChar) throws XMLStreamException; /* /********************************************************************** /* Implementation of parsing API /********************************************************************** */ @Override public final int nextFromProlog(boolean isProlog) throws XMLStreamException { if (_tokenIncomplete) { // left-overs from last thingy? skipToken(); } // First: keep track of where event started setStartLocation(); // Ok: we should get a WS or '<'. So, let's skip through WS while (true) { if (_inputPtr >= _inputEnd) { if (!loadMore()) { setStartLocation(); return TOKEN_EOI; } } int c = _inputBuffer[_inputPtr++] & 0xFF; // Really should get white space or '<'... if (c == INT_LT) { break; } /* 26-Mar-2008, tatus: White space in prolog/epilog is * not to be reported at all (by default at least), as * it is not part of XML Infoset content. So let's * just actively skip it here */ if (c != INT_SPACE) { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { if (!loadMore()) { markLF(); setStartLocation(); return TOKEN_EOI; } } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_TAB) { reportPrologUnexpChar(isProlog, decodeCharForError((byte)c), null); } } } // Ok, got LT: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(COMMENT); // not necessarily a comment of course } byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_EXCL) { // comment/DOCTYPE? (CDATA not legal) return handlePrologDeclStart(isProlog); } if (b == BYTE_QMARK) { return handlePIStart(); } /* End tag not allowed if no open tree; and only one root * element (one root-level start tag) */ if (b == BYTE_SLASH || !isProlog) { reportPrologUnexpElement(isProlog, b); } return handleStartElement(b); } @Override public final int nextFromTree() throws XMLStreamException { if (_tokenIncomplete) { // left-overs? if (skipToken()) { // Figured out next event (ENTITY_REFERENCE)? // !!! We don't yet parse DTD, don't know real contents return _nextEntity(); } } else { // note: START_ELEMENT/END_ELEMENT never incomplete if (_currToken == START_ELEMENT) { if (_isEmptyTag) { --_depth; return (_currToken = END_ELEMENT); } } else if (_currToken == END_ELEMENT) { _currElem = _currElem.getParent(); // Any namespace declarations that need to be unbound? while (_lastNsDecl != null && _lastNsDecl.getLevel() >= _depth) { _lastNsDecl =_lastNsDecl.unbind(); } } else { // It's possible CHARACTERS entity with an entity ref: if (_entityPending) { _entityPending = false; return _nextEntity(); } } } // and except for special cases, mark down actual start location of the event setStartLocation(); /* Any more data? Although it'd be an error not to get any, * let's leave error reporting up to caller */ if (_inputPtr >= _inputEnd) { if (!loadMore()) { setStartLocation(); return TOKEN_EOI; } } byte b = _inputBuffer[_inputPtr]; /* Can get pretty much any type; start/end element, comment/PI, * CDATA, text, entity reference... */ if (b == BYTE_LT) { // root element, comment, proc instr? ++_inputPtr; b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(COMMENT); if (b == BYTE_EXCL) { // comment or CDATA return handleCommentOrCdataStart(); } if (b == BYTE_QMARK) { return handlePIStart(); } if (b == BYTE_SLASH) { return handleEndElement(); } return handleStartElement(b); } if (b == BYTE_AMP) { // entity reference ++_inputPtr; /* Need to expand; should indicate either text, or an unexpanded * entity reference */ int i = handleEntityInText(false); if (i == 0) { // general entity return (_currToken = ENTITY_REFERENCE); } /* Nope, a char entity; need to indicate it came from an entity. * Since we may want to store the char as is, too, let's negate * entity-based char */ _tmpChar = -i; } else { /* Let's store it for future reference. May or may not be used -- * so let's not advance input ptr quite yet. */ _tmpChar = (int) b & 0xFF; // need to ensure it won't be negative } if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return (_currToken = CHARACTERS); }
Helper method used to isolate things that need to be (re)set in cases where
/** * Helper method used to isolate things that need to be (re)set in * cases where */
protected int _nextEntity() { // !!! Also, have to assume start location has been set or such _textBuilder.resetWithEmpty(); // !!! TODO: handle start location? return (_currToken = ENTITY_REFERENCE); } /* /********************************************************************** /* Internal methods, secondary parsing /********************************************************************** */ private final int handlePrologDeclStart(boolean isProlog) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_HYPHEN) { // Comment? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; if (b == BYTE_HYPHEN) { if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishComment(); } return (_currToken = COMMENT); } } else if (b == BYTE_D) { // DOCTYPE? if (isProlog) { // no DOCTYPE in epilog handleDtdStart(); // incomplete flag is set by handleDtdStart if (!_cfgLazyParsing) { if (_tokenIncomplete) { finishDTD(true); // must copy contents, may be needed _tokenIncomplete = false; } } return DTD; } } /* error... for error recovery purposes, let's just pretend * like it was unfinished CHARACTERS, though. */ _tokenIncomplete = true; _currToken = CHARACTERS; reportPrologUnexpChar(isProlog, decodeCharForError(b), " (expected '-' for COMMENT)"); return _currToken; // never gets here } private final int handleDtdStart() throws XMLStreamException { matchAsciiKeyword("DOCTYPE"); // And then some white space and root name byte b = skipInternalWs(true, "after DOCTYPE keyword, before root name"); _tokenName = parsePName(b); b = skipInternalWs(false, null); if (b == BYTE_P) { // PUBLIC matchAsciiKeyword("PUBLIC"); b = skipInternalWs(true, null); _publicId = parsePublicId(b); b = skipInternalWs(true, null); _systemId = parseSystemId(b); b = skipInternalWs(false, null); } else if (b == BYTE_S) { // SYSTEM matchAsciiKeyword("SYSTEM"); b = skipInternalWs(true, null); _publicId = null; _systemId = parseSystemId(b); b = skipInternalWs(false, null); } else { _publicId = _systemId = null; } /* Ok; so, need to get either an internal subset, or the * end: */ if (b == BYTE_GT) { // fine, we are done _tokenIncomplete = false; return (_currToken = DTD); } if (b != BYTE_LBRACKET) { // If not end, must have int. subset String msg = (_systemId != null) ? " (expected '[' for the internal subset, or '>' to end DOCTYPE declaration)" : " (expected a 'PUBLIC' or 'SYSTEM' keyword, '[' for the internal subset, or '>' to end DOCTYPE declaration)"; reportTreeUnexpChar(decodeCharForError(b), msg); } /* Need not parse the int. subset yet, can leave as is, and then * either skip or parse later on */ _tokenIncomplete = true; return (_currToken = DTD); } private final int handleCommentOrCdataStart() throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; // Let's first see if it's a comment (simpler) if (b == BYTE_HYPHEN) { // Comment if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; if (b != BYTE_HYPHEN) { reportTreeUnexpChar(decodeCharForError(b), " (expected '-' for COMMENT)"); } if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishComment(); } return (_currToken = COMMENT); } // If not, should be CDATA: if (b == BYTE_LBRACKET) { // CDATA _currToken = CDATA; for (int i = 0; i < 6; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; if (b != (byte) CDATA_STR.charAt(i)) { int ch = decodeCharForError(b); reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)"); } } if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCData(); } return CDATA; } reportTreeUnexpChar(decodeCharForError(b), " (expected either '-' for COMMENT or '[CDATA[' for CDATA section)"); return TOKEN_EOI; // never gets here }
Method called after leading '
/** * Method called after leading '<?' has been parsed; needs to parse * target. */
private final int handlePIStart() throws XMLStreamException { _currToken = PROCESSING_INSTRUCTION; // Ok, first, need a name if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; _tokenName = parsePName(b); { // but is it "xml" (case insensitive)? String ln = _tokenName.getLocalName(); if (ln.length() == 3 && ln.equalsIgnoreCase("xml") && _tokenName.getPrefix() == null) { reportInputProblem(ErrorConsts.ERR_WF_PI_XML_TARGET); } } /* Let's then verify that we either get a space, or closing * '?>': this way we'll catch some problems right away, and also * simplify actual processing of contents. */ if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } int c = (int) _inputBuffer[_inputPtr++] & 0xFF; if (c <= INT_SPACE) { // Ok, let's skip the white space... while (true) { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = (int) _inputBuffer[_inputPtr] & 0xFF; if (c > INT_SPACE) { break; } ++_inputPtr; } // Ok, got non-space, need to push back: if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishPI(); } } else { if (c != INT_QMARK) { reportMissingPISpace(decodeCharForError((byte)c)); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; if (b != BYTE_GT) { reportMissingPISpace(decodeCharForError(b)); } _textBuilder.resetWithEmpty(); _tokenIncomplete = false; } return PROCESSING_INSTRUCTION; }
Returns:Code point for the entity that expands to a valid XML content character.
/** * @return Code point for the entity that expands to a valid XML * content character. */
protected final int handleCharEntity() throws XMLStreamException { // Hex or decimal? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; int value = 0; if (b == BYTE_x) { // hex while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; if (b == BYTE_SEMICOLON) { break; } value = value << 4; int c = (int) b; if (c <= '9' && c >= '0') { value += (c - '0'); } else if (c >= 'a' && c <= 'f') { value += 10 + (c - 'a'); } else if (c >= 'A' && c <= 'F') { value += 10 + (c - 'A'); } else { throwUnexpectedChar(decodeCharForError(b), "; expected a hex digit (0-9a-fA-F)"); } if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } } } else { // numeric (decimal) while (b != BYTE_SEMICOLON) { int c = (int) b; if (c <= '9' && c >= '0') { value = (value * 10) + (c - '0'); if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } } else { throwUnexpectedChar(decodeCharForError(b), "; expected a decimal number"); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; } } verifyXmlChar(value); return value; }
Parsing of start element requires parsing of the element name (and attribute names), and is thus encoding-specific.
/** * Parsing of start element requires parsing of the element name * (and attribute names), and is thus encoding-specific. */
protected abstract int handleStartElement(byte b) throws XMLStreamException;
Note that this method is currently also shareable for all Ascii-based encodings, and at least between UTF-8 and ISO-Latin1. The reason is that since we already know exact bytes that need to be matched, there's no danger of getting invalid encodings or such. So, for now, let's leave this method here in the base class.
/** * Note that this method is currently also shareable for all Ascii-based * encodings, and at least between UTF-8 and ISO-Latin1. The reason is * that since we already know exact bytes that need to be matched, * there's no danger of getting invalid encodings or such. * So, for now, let's leave this method here in the base class. */
protected final int handleEndElement() throws XMLStreamException { --_depth; _currToken = END_ELEMENT; // Ok, at this point we have seen '/', need the name _tokenName = _currElem.getName(); int size = _tokenName.sizeInQuads(); /* Do we need to take the slow route? Let's separate that out * to another method. * Note: we'll require max bytes for name PLUS one (for trailing * '>', most likely). */ if ((_inputEnd - _inputPtr) < ((size << 2) + 1)) { // may need to load more return handleEndElementSlow(size); } int ptr = _inputPtr; byte[] buf = _inputBuffer; // First all full chunks of 4 bytes (if any) --size; for (int qix = 0; qix < size; ++qix) { int q = (buf[ptr] << 24) | ((buf[ptr+1] & 0xFF) << 16) | ((buf[ptr+2] & 0xFF) << 8) | ((buf[ptr+3] & 0xFF)) ; ptr += 4; // match? if (q != _tokenName.getQuad(qix)) { _inputPtr = ptr; reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } /* After which we can deal with the last entry: it's bit * tricky as we don't actually fully know byte length... */ int lastQ = _tokenName.getQuad(size); int q = buf[ptr++] & 0xFF; if (q != lastQ) { // need second byte? q = (q << 8) | (buf[ptr++] & 0xFF); if (q != lastQ) { // need third byte? q = (q << 8) | (buf[ptr++] & 0xFF); if (q != lastQ) { // need full 4 bytes? q = (q << 8) | (buf[ptr++] & 0xFF); if (q != lastQ) { // still no match? failure! _inputPtr = ptr; reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } } } // Trailing space? int i2 = _inputBuffer[ptr] & 0xFF; _inputPtr = ptr + 1; while (i2 <= INT_SPACE) { if (i2 == INT_LF) { markLF(); } else if (i2 == INT_CR) { byte b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b != BYTE_LF) { markLF(_inputPtr-1); i2 = (int) b & 0xFF; continue; } markLF(); } else if (i2 != INT_SPACE && i2 != INT_TAB) { throwInvalidSpace(i2); } i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; } if (i2 != INT_GT) { throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'"); } return END_ELEMENT; } private final int handleEndElementSlow(int size) throws XMLStreamException { /* Nope, will likely cross the input boundary; need * to do proper checks */ --size; for (int qix = 0; qix < size; ++qix) { // first, full chunks int q = 0; for (int i = 0; i < 4; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } q = (q << 8) | (_inputBuffer[_inputPtr++] & 0xFF); } // match? if (q != _tokenName.getQuad(qix)) { reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } // And then the last 1-4 bytes: int lastQ = _tokenName.getQuad(size); int q = 0; int i = 0; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } q = (q << 8) | (_inputBuffer[_inputPtr++] & 0xFF); if (q == lastQ) { // match break; } if (++i > 3) { // no match, error reportUnexpectedEndTag(_tokenName.getPrefixedName()); break; // never gets here } } // Trailing space? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } int i2 = _inputBuffer[_inputPtr++]; while (i2 <= INT_SPACE) { if (i2 == INT_LF) { markLF(); } else if (i2 == INT_CR) { byte b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b != BYTE_LF) { markLF(_inputPtr-1); i2 = (int) b & 0xFF; continue; } markLF(); } else if (i2 != INT_SPACE && i2 != INT_TAB) { throwInvalidSpace(i2); } i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; } if (i2 != INT_GT) { throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'"); } return END_ELEMENT; } /* 28-Oct-2006, tatus: This is the old (slow) implementation. I'll * leave it here, since it's known to work, so in case new impl * has problems, one can refer to the old impl */ /* protected final int handleEndElement2() throws XMLStreamException { --_depth; _currToken = END_ELEMENT; // Ok, at this point we have seen '/', need the name _tokenName = _currElem.getName(); int i2; int qix = 0; while (true) { int q; int expQuad = _tokenName.getQuad(qix); // First byte of a quad: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } i2 = _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (i2 < 45 || i2 > 58 || i2 == 47) { if (0 != expQuad || _tokenName.sizeInQuads() != qix) { reportUnexpectedEndTag(_tokenName.getPrefixedName()); } break; } } q = i2; ++qix; // since this started a new quad // second byte //i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } i2 = _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { if (q != expQuad || _tokenName.sizeInQuads() != qix) { reportUnexpectedEndTag(_tokenName.getPrefixedName()); } break; } } q = (q << 8) | i2; // third byte //i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } i2 = _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name? if (q != expQuad || _tokenName.sizeInQuads() != qix) { reportUnexpectedEndTag(_tokenName.getPrefixedName()); } break; } } q = (q << 8) | i2; // fourth byte //i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } i2 = _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name? if (q != expQuad || _tokenName.sizeInQuads() != qix) { reportUnexpectedEndTag(_tokenName.getPrefixedName()); } break; } } q = (q << 8) | i2; // Full quad, ok; need to compare now: if (q != expQuad) { // Let's just fall through, then; will throw exception reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } // Note: i2 still holds the last byte read (except if we detected // a mismatch; but that caused an exception above) // Trailing space? while (i2 <= INT_SPACE) { if (i2 == INT_LF) { markLF(); } else if (i2 == INT_CR) { byte b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b != BYTE_LF) { markLF(_inputPtr-1); i2 = (int) b & 0xFF; continue; } markLF(); } else if (i2 != INT_SPACE && i2 != INT_TAB) { throwInvalidSpace(i2); } i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; } if (i2 != INT_GT) { throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'"); } return END_ELEMENT; } */ /* /********************************************************************** /* Common name/entity parsing /********************************************************************** */
This method can (for now?) be shared between all Ascii-based encodings, since it only does coarse validity checking -- real checks are done in different method.

Some notes about assumption implementation makes:

  • Well-formed xml content can not end with a name: as such, end-of-input is an error and we can throw an exception
/** * This method can (for now?) be shared between all Ascii-based * encodings, since it only does coarse validity checking -- real * checks are done in different method. *<p> * Some notes about assumption implementation makes: *<ul> * <li>Well-formed xml content can not end with a name: as such, * end-of-input is an error and we can throw an exception * </li> * </ul> */
protected final PName parsePName(byte b) throws XMLStreamException { // First: can we optimize out bounds checks? if ((_inputEnd - _inputPtr) < 8) { // got 1 byte, but need 7, plus one trailing return parsePNameSlow(b); } // If so, can also unroll loops nicely int q = b & 0xFF; // Let's do just quick sanity check first; a thorough check will be // done later on if necessary, now we'll just do the very cheap // check to catch extra spaces etc. if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode throwUnexpectedChar(q, "; expected a name start character"); } int i2 = _inputBuffer[_inputPtr++] & 0xFF; // For other bytes beyond first we have to do bit more complicated // check, to reliably find out where name ends. Still can do quite // simple checks though if (i2 < 65) { // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (i2 < 45 || i2 > 58 || i2 == 47) { return findPName(q, 1); } } q = (q << 8) | i2; i2 = (int) _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name? return findPName(q, 2); } } q = (q << 8) | i2; i2 = (int) _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) char name? return findPName(q, 3); } } q = (q << 8) | i2; i2 = (int) _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) char name? return findPName(q, 4); } } // Longer, let's offline: return parsePNameMedium(i2, q); } protected PName parsePNameMedium(int i2, int q1) throws XMLStreamException { // Ok, so far so good; one quad, one byte. Then the second int q2 = i2; i2 = _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (i2 < 45 || i2 > 58 || i2 == 47) { return findPName(q1, q2, 1); } } q2 = (q2 << 8) | i2; i2 = (int) _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name? return findPName(q1, q2, 2); } } q2 = (q2 << 8) | i2; i2 = (int) _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) char name? return findPName(q1, q2, 3); } } q2 = (q2 << 8) | i2; i2 = (int) _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) char name? return findPName(q1, q2, 4); } } // Ok, no, longer loop. Let's offline int[] quads = _quadBuffer; quads[0] = q1; quads[1] = q2; return parsePNameLong(i2, quads); } protected final PName parsePNameLong(int q, int[] quads) throws XMLStreamException { int qix = 2; while (true) { // Second byte of a new quad if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } int i2 = _inputBuffer[_inputPtr++] & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // End of name, a single ascii char? return findPName(q, quads, qix, 1); } } // 3rd byte: q = (q << 8) | i2; i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name? return findPName(q, quads, qix, 2); } } // 4th byte: q = (q << 8) | i2; i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name? return findPName(q, quads, qix, 3); } } q = (q << 8) | i2; i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name? return findPName(q, quads, qix, 4); } } if (qix >= quads.length) { // let's just double? _quadBuffer = quads = DataUtil.growArrayBy(quads, quads.length); } quads[qix] = q; ++qix; q = i2; } } protected final PName parsePNameSlow(byte b) throws XMLStreamException { int q = b & 0xFF; // Let's do just quick sanity check first; a thorough check will be // done later on if necessary, now we'll just do the very cheap // check to catch extra spaces etc. if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode throwUnexpectedChar(q, "; expected a name start character"); } int[] quads = _quadBuffer; int qix = 0; // Let's optimize a bit for shorter PNames... int firstQuad = 0; while (true) { // Second byte if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } int i2 = _inputBuffer[_inputPtr++] & 0xFF; // For other bytes beyond first we have to do bit more complicated // check, to reliably find out where name ends. Still can do quite // simple checks though if (i2 < 65) { // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (i2 < 45 || i2 > 58 || i2 == 47) { // End of name, a single ascii char? return findPName(q, 1, firstQuad, qix, quads); } } // 3rd byte: q = (q << 8) | i2; i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 2 (ascii) char name? return findPName(q, 2, firstQuad, qix, quads); } } // 4th byte: q = (q << 8) | i2; i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 3 (ascii) char name? return findPName(q, 3, firstQuad, qix, quads); } } q = (q << 8) | i2; // Ok; one more full quad gotten... but just to squeeze bit // more mileage out of it, was this the end? i2 = (int) ((_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne()) & 0xFF; if (i2 < 65) { if (i2 < 45 || i2 > 58 || i2 == 47) { // 4 (ascii) char name? return findPName(q, 4, firstQuad, qix, quads); } } // Nope; didn't end. May need to store the quad in temporary // buffer and continue if (qix == 0) { // not yet, was the first quad firstQuad = q; } else if (qix == 1) { // second quad, need to init buffer quads[0] = firstQuad; quads[1] = q; } else { // 3rd or after... need to make sure there's room if (qix >= quads.length) { // let's just double? _quadBuffer = quads = DataUtil.growArrayBy(quads, quads.length); } quads[qix] = q; } ++qix; q = i2; } }
Method called to process a sequence of bytes that is likely to be a PName. At this point we encountered an end marker, and may either hit a formerly seen well-formed PName; an as-of-yet unseen well-formed PName; or a non-well-formed sequence (containing one or more non-name chars without any valid end markers).
Params:
  • onlyQuad – Word with 1 to 4 bytes that make up PName
  • lastByteCount – Number of actual bytes contained in onlyQuad; 0 to 3.
/** * Method called to process a sequence of bytes that is likely to * be a PName. At this point we encountered an end marker, and * may either hit a formerly seen well-formed PName; an as-of-yet * unseen well-formed PName; or a non-well-formed sequence (containing * one or more non-name chars without any valid end markers). * * @param onlyQuad Word with 1 to 4 bytes that make up PName * @param lastByteCount Number of actual bytes contained in onlyQuad; 0 to 3. */
private final PName findPName(int onlyQuad, int lastByteCount) throws XMLStreamException { // First, need to push back the byte read but not used: --_inputPtr; int hash = ByteBasedPNameTable.calcHash(onlyQuad); PName name = _symbols.findSymbol(hash, onlyQuad, 0); if (name == null) { // Let's simplify things a bit, and just use array based one then: _quadBuffer[0] = onlyQuad; name = addPName(hash, _quadBuffer, 1, lastByteCount); } return name; }
Method called to process a sequence of bytes that is likely to be a PName. At this point we encountered an end marker, and may either hit a formerly seen well-formed PName; an as-of-yet unseen well-formed PName; or a non-well-formed sequence (containing one or more non-name chars without any valid end markers).
Params:
  • firstQuad – First 1 to 4 bytes of the PName
  • secondQuad – Word with last 1 to 4 bytes of the PName
  • lastByteCount – Number of bytes contained in secondQuad; 0 to 3.
/** * Method called to process a sequence of bytes that is likely to * be a PName. At this point we encountered an end marker, and * may either hit a formerly seen well-formed PName; an as-of-yet * unseen well-formed PName; or a non-well-formed sequence (containing * one or more non-name chars without any valid end markers). * * @param firstQuad First 1 to 4 bytes of the PName * @param secondQuad Word with last 1 to 4 bytes of the PName * @param lastByteCount Number of bytes contained in secondQuad; 0 to 3. */
private final PName findPName(int firstQuad, int secondQuad, int lastByteCount) throws XMLStreamException { // First, need to push back the byte read but not used: --_inputPtr; int hash = ByteBasedPNameTable.calcHash(firstQuad, secondQuad); PName name = _symbols.findSymbol(hash, firstQuad, secondQuad); if (name == null) { // Let's just use array, then _quadBuffer[0] = firstQuad; _quadBuffer[1] = secondQuad; name = addPName(hash, _quadBuffer, 2, lastByteCount); } return name; }
Method called to process a sequence of bytes that is likely to be a PName. At this point we encountered an end marker, and may either hit a formerly seen well-formed PName; an as-of-yet unseen well-formed PName; or a non-well-formed sequence (containing one or more non-name chars without any valid end markers).
Params:
  • lastQuad – Word with last 0 to 3 bytes of the PName; not included in the quad array
  • quads – Array that contains all the quads, except for the last one, for names with more than 8 bytes (i.e. more than 2 quads)
  • qlen – Number of quads in the array, except if less than 2 (in which case only firstQuad and lastQuad are used)
  • lastByteCount – Number of bytes contained in lastQuad; 0 to 3.
/** * Method called to process a sequence of bytes that is likely to * be a PName. At this point we encountered an end marker, and * may either hit a formerly seen well-formed PName; an as-of-yet * unseen well-formed PName; or a non-well-formed sequence (containing * one or more non-name chars without any valid end markers). * * @param lastQuad Word with last 0 to 3 bytes of the PName; not included * in the quad array * @param quads Array that contains all the quads, except for the * last one, for names with more than 8 bytes (i.e. more than * 2 quads) * @param qlen Number of quads in the array, except if less than 2 * (in which case only firstQuad and lastQuad are used) * @param lastByteCount Number of bytes contained in lastQuad; 0 to 3. */
private final PName findPName(int lastQuad, int[] quads, int qlen, int lastByteCount) throws XMLStreamException { // First, need to push back the byte read but not used: --_inputPtr; /* Nope, long (3 quads or more). At this point, the last quad is * not yet in the array, let's add: */ if (qlen >= quads.length) { // let's just double? _quadBuffer = quads = DataUtil.growArrayBy(quads, quads.length); } quads[qlen++] = lastQuad; int hash = ByteBasedPNameTable.calcHash(quads, qlen); PName name = _symbols.findSymbol(hash, quads, qlen); if (name == null) { name = addPName(hash, quads, qlen, lastByteCount); } return name; }
Method called to process a sequence of bytes that is likely to be a PName. At this point we encountered an end marker, and may either hit a formerly seen well-formed PName; an as-of-yet unseen well-formed PName; or a non-well-formed sequence (containing one or more non-name chars without any valid end markers).
Params:
  • lastQuad – Word with last 0 to 3 bytes of the PName; not included in the quad array
  • lastByteCount – Number of bytes contained in lastQuad; 0 to 3.
  • firstQuad – First 1 to 4 bytes of the PName (4 if length at least 4 bytes; less only if not).
  • qlen – Number of quads in the array, except if less than 2 (in which case only firstQuad and lastQuad are used)
  • quads – Array that contains all the quads, except for the last one, for names with more than 8 bytes (i.e. more than 2 quads)
/** * Method called to process a sequence of bytes that is likely to * be a PName. At this point we encountered an end marker, and * may either hit a formerly seen well-formed PName; an as-of-yet * unseen well-formed PName; or a non-well-formed sequence (containing * one or more non-name chars without any valid end markers). * * @param lastQuad Word with last 0 to 3 bytes of the PName; not included * in the quad array * @param lastByteCount Number of bytes contained in lastQuad; 0 to 3. * @param firstQuad First 1 to 4 bytes of the PName (4 if length * at least 4 bytes; less only if not). * @param qlen Number of quads in the array, except if less than 2 * (in which case only firstQuad and lastQuad are used) * @param quads Array that contains all the quads, except for the * last one, for names with more than 8 bytes (i.e. more than * 2 quads) */
private final PName findPName(int lastQuad, int lastByteCount, int firstQuad, int qlen, int[] quads) throws XMLStreamException { // Separate handling for short names: if (qlen <= 1) { if (qlen == 0) { // 4-bytes or less; only has 'lastQuad' defined return findPName(lastQuad, lastByteCount); } return findPName(firstQuad, lastQuad, lastByteCount); } return findPName(lastQuad, quads, qlen, lastByteCount); } protected final PName addPName(int hash, int[] quads, int qlen, int lastQuadBytes) throws XMLStreamException { return addUTFPName(_symbols, _charTypes, hash, quads, qlen, lastQuadBytes); } /* /********************************************************************** /* Other parsing helper methods /********************************************************************** */
Returns:First byte following skipped white space
/** * @return First byte following skipped white space */
protected byte skipInternalWs(boolean reqd, String msg) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; int c = b & 0xFF; if (c > INT_SPACE) { if (!reqd) { return b; } reportTreeUnexpChar(decodeCharForError(b), " (expected white space "+msg+")"); } do { // But let's first handle the space we already got: if (b == BYTE_LF) { markLF(); } else if (b == BYTE_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (b != BYTE_SPACE && b != BYTE_TAB) { throwInvalidSpace(b); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; } while ((b & 0xFF) <= INT_SPACE); return b; } private final void matchAsciiKeyword(String keyw) throws XMLStreamException { for (int i = 1, len = keyw.length(); i < len; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; if (b != (byte) keyw.charAt(i)) { reportTreeUnexpChar(decodeCharForError(b), " (expected '"+keyw.charAt(i)+"' for "+keyw+" keyword)"); } } }

Note: consequtive white space is only considered indentation, if the following token seems like a tag (start/end). This so that if a CDATA section follows, it can be coalesced in coalescing mode. Although we could check if coalescing mode is enabled, this should seldom have significant effect either way, so it removes one possible source of problems in coalescing mode.

Returns:-1, if indentation was handled; offset in the output buffer, if not
/** *<p> * Note: consequtive white space is only considered indentation, * if the following token seems like a tag (start/end). This so * that if a CDATA section follows, it can be coalesced in * coalescing mode. Although we could check if coalescing mode is * enabled, this should seldom have significant effect either way, * so it removes one possible source of problems in coalescing mode. * * @return -1, if indentation was handled; offset in the output * buffer, if not */
protected final int checkInTreeIndentation(int c) throws XMLStreamException { if (c == INT_CR) { // First a degenerate case, a lone \r: if (_inputPtr >= _inputEnd && !loadMore()) { _textBuilder.resetWithIndentation(0, CHAR_SPACE); return -1; } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } } markLF(); // Then need an indentation char (or start/end tag): if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr]; if (b != BYTE_SPACE && b != BYTE_TAB) { // May still be indentation, if it's lt + non-exclamation mark if (b == BYTE_LT) { if ((_inputPtr+1) < _inputEnd && _inputBuffer[_inputPtr+1] != BYTE_EXCL) { _textBuilder.resetWithIndentation(0, CHAR_SPACE); return -1; } } char[] outBuf = _textBuilder.resetWithEmpty(); outBuf[0] = CHAR_LF; _textBuilder.setCurrentLength(1); return 1; } // So how many do we get? ++_inputPtr; int count = 1; int max = (b == BYTE_SPACE) ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS; while (count <= max) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b2 = _inputBuffer[_inputPtr]; if (b2 != b) { // Has to be followed by a start/end tag... if (b2 == BYTE_LT && (_inputPtr+1) < _inputEnd && _inputBuffer[_inputPtr+1] != BYTE_EXCL) { _textBuilder.resetWithIndentation(count, (char) b); return -1; } break; } ++_inputPtr; ++count; } // Nope, hit something else, or too long: need to just copy the stuff // we know buffer has enough room either way char[] outBuf = _textBuilder.resetWithEmpty(); outBuf[0] = CHAR_LF; char ind = (char) b; for (int i = 1; i <= count; ++i) { outBuf[i] = ind; } count += 1; // to account for leading lf _textBuilder.setCurrentLength(count); return count; }
Returns:-1, if indentation was handled; offset in the output buffer, if not
/** * @return -1, if indentation was handled; offset in the output * buffer, if not */
protected final int checkPrologIndentation(int c) throws XMLStreamException { if (c == INT_CR) { // First a degenerate case, a lone \r: if (_inputPtr >= _inputEnd && !loadMore()) { _textBuilder.resetWithIndentation(0, CHAR_SPACE); return -1; } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } } markLF(); // Ok, indentation char? if (_inputPtr >= _inputEnd && !loadMore()) { _textBuilder.resetWithIndentation(0, CHAR_SPACE); return -1; } byte b = _inputBuffer[_inputPtr]; // won't advance past the char yet if (b != BYTE_SPACE && b != BYTE_TAB) { // If lt, it's still indentation ok: if (b == BYTE_LT) { // need _textBuilder.resetWithIndentation(0, CHAR_SPACE); return -1; } // Nope... something else char[] outBuf = _textBuilder.resetWithEmpty(); outBuf[0] = CHAR_LF; _textBuilder.setCurrentLength(1); return 1; } // So how many do we get? ++_inputPtr; int count = 1; int max = (b == BYTE_SPACE) ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS; while (true) { if (_inputPtr >= _inputEnd && !loadMore()) { break; } if (_inputBuffer[_inputPtr] != b) { break; } ++_inputPtr; ++count; if (count >= max) { // ok, can't share... but can build it still // we know buffer has enough room char[] outBuf = _textBuilder.resetWithEmpty(); outBuf[0] = CHAR_LF; char ind = (char) b; for (int i = 1; i <= count; ++i) { outBuf[i] = ind; } count += 1; // to account for leading lf _textBuilder.setCurrentLength(count); return count; } } // Ok, gotcha? _textBuilder.resetWithIndentation(count, (char) b); return -1; } /* /********************************************************************** /* Methods for sub-classes, reading data /********************************************************************** */ @Override protected final boolean loadMore() throws XMLStreamException { // First, let's update offsets: _pastBytesOrChars += _inputEnd; _rowStartOffset -= _inputEnd; _inputPtr = 0; // If it's a block source, there's no input stream, or any more data: if (_in == null) { _inputEnd = 0; return false; } try { int count = _in.read(_inputBuffer, 0, _inputBuffer.length); if (count < 1) { _inputEnd = 0; if (count == 0) { /* Sanity check; should never happen with correctly written * InputStreams... */ reportInputProblem("InputStream returned 0 bytes, even when asked to read up to "+_inputBuffer.length); } return false; } _inputEnd = count; return true; } catch (IOException ioe) { throw new IoStreamException(ioe); } } protected final byte nextByte(int tt) throws XMLStreamException { if (_inputPtr >= _inputEnd) { if (!loadMore()) { reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(tt)); } } return _inputBuffer[_inputPtr++]; } protected final byte nextByte() throws XMLStreamException { if (_inputPtr >= _inputEnd) { if (!loadMore()) { reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(_currToken)); } } return _inputBuffer[_inputPtr++]; } protected final byte loadOne() throws XMLStreamException { if (!loadMore()) { reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(_currToken)); } return _inputBuffer[_inputPtr++]; } protected final byte loadOne(int type) throws XMLStreamException { if (!loadMore()) { reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(type)); } return _inputBuffer[_inputPtr++]; } protected final boolean loadAndRetain(int nrOfChars) throws XMLStreamException { /* first: can't move, if we were handed an immutable block * (alternative to handing InputStream as _in) */ if (_in == null) { return false; } // otherwise, need to use cut'n pasted code from loadMore()... _pastBytesOrChars += _inputPtr; _rowStartOffset -= _inputPtr; int remaining = (_inputEnd - _inputPtr); // must be > 0 System.arraycopy(_inputBuffer, _inputPtr, _inputBuffer, 0, remaining); _inputPtr = 0; _inputEnd = remaining; // temporarily set to cover copied stuff try { do { int max = _inputBuffer.length - _inputEnd; int count = _in.read(_inputBuffer, _inputEnd, max); if (count < 1) { if (count == 0) { // Sanity check, should never happen with non-buggy readers/stream reportInputProblem("InputStream returned 0 bytes, even when asked to read up to "+max); } return false; } _inputEnd += count; } while (_inputEnd < nrOfChars); return true; } catch (IOException ioe) { throw new IoStreamException(ioe); } } }