package com.fasterxml.aalto.in;

import java.io.*;

import javax.xml.stream.XMLStreamException;

import org.codehaus.stax2.XMLStreamLocation2;

import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.impl.IoStreamException;
import com.fasterxml.aalto.impl.LocationImpl;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.TextBuilder;
import com.fasterxml.aalto.util.XmlCharTypes;
import com.fasterxml.aalto.util.XmlChars;
import com.fasterxml.aalto.util.XmlConsts;

This is the concrete scanner implementation used when input comes as a Reader. In general using this scanner is quite a bit less optimal than that of InputStream based scanner. Nonetheless, it is included for completeness, since Stax interface allows passing Readers as input sources.
/** * This is the concrete scanner implementation used when input comes * as a {@link java.io.Reader}. In general using this scanner is quite * a bit less optimal than that of {@link java.io.InputStream} based * scanner. Nonetheless, it is included for completeness, since Stax * interface allows passing Readers as input sources. */
public final class ReaderScanner extends XmlScanner {
Although java chars are basically UTF-16 in memory, the closest match for char types is Latin1.
/**
 * Character type lookup tables used by this scanner.
 * Although java chars are basically UTF-16 in memory, the closest
 * match for char types is Latin1: the tables are only consulted for
 * chars up to 0xFF, and chars above that are checked explicitly.
 */
private final static XmlCharTypes sCharTypes = InputCharTypes.getLatin1CharTypes();

/*
/**********************************************************************
/* Configuration
/**********************************************************************
 */
Underlying Reader to use for reading content.
/**
 * Underlying {@link Reader} to use for reading content. Set to null
 * by {@code _closeSource()} once the source has been closed.
 */
protected Reader _in;

/*
/**********************************************************************
/* Input buffering
/**********************************************************************
 */

// Buffer that holds the chars available for scanning
protected char[] _inputBuffer;

// Index of the next char to read from the input buffer
protected int _inputPtr;

// Index one past the last valid char of the input buffer; more input
// is loaded whenever _inputPtr reaches this value
protected int _inputEnd;
Storage location for a single character that can not be pushed back (for example, multi-byte char)
/**
 * Storage location for a single character that can not be pushed
 * back (for example, multi-byte char). {@code nextFromTree()} stores
 * here either the first char of a text segment (without advancing the
 * input pointer), or the negated code point of a character entity
 * that starts the segment.
 */
protected int mTmpChar = INT_NULL;

/*
/**********************************************************************
/* Symbol handling
/**********************************************************************
 */
For now, symbol table contains prefixed names. In future it is possible that they may be split into prefixes and local names?
/**
 * For now, symbol table contains prefixed names. In future it is
 * possible that they may be split into prefixes and local names?
 */
protected final CharBasedPNameTable _symbols;

/*
/**********************************************************************
/* Life-cycle
/**********************************************************************
 */

/**
 * Creates a scanner that uses a caller-supplied input buffer.
 *
 * @param cfg Configuration to use; also provides the symbol table
 * @param r Reader to read content from
 * @param buffer Input buffer to use
 * @param ptr Index of the first unread char in <code>buffer</code>
 * @param last Index one past the last valid char in <code>buffer</code>
 */
public ReaderScanner(ReaderConfig cfg, Reader r,
        char[] buffer, int ptr, int last)
{
    super(cfg);
    _in = r;
    _inputBuffer = buffer;
    _inputPtr = ptr;
    _inputEnd = last;
    _pastBytesOrChars = 0; // should it be passed by caller?
    _rowStartOffset = 0; // should probably be passed by caller...
    _symbols = cfg.getCBSymbols();
}

/**
 * Creates a scanner that allocates its (initially empty) input buffer
 * through the configuration object.
 *
 * @param cfg Configuration to use; provides the input buffer and
 *   the symbol table
 * @param r Reader to read content from
 */
public ReaderScanner(ReaderConfig cfg, Reader r)
{
    super(cfg);
    _in = r;
    _inputBuffer = cfg.allocFullCBuffer(ReaderConfig.DEFAULT_CHAR_BUFFER_LEN);
    _inputPtr = _inputEnd = 0;
    _pastBytesOrChars = 0; // should it be passed by caller?
    _rowStartOffset = 0; // should probably be passed by caller...
    _symbols = cfg.getCBSymbols();
}

/**
 * Hands the symbol table (if it may have been modified) and the input
 * buffer back to the configuration object for reuse.
 */
@Override
protected void _releaseBuffers()
{
    super._releaseBuffers();
    if (_symbols.maybeDirty()) {
        _config.updateCBSymbols(_symbols);
    }
    /* Note: if we have block input (_in == null), the buffer we
     * use is not owned by scanner, can't recycle
     * Also note that this method will always get called before
     * _closeSource(); so that _in won't be cleared before we
     * have a chance to see it.
     */
    if (_in != null) {
        if (_inputBuffer != null) {
            _config.freeFullCBuffer(_inputBuffer);
            _inputBuffer = null;
        }
    }
}

/**
 * Closes the underlying Reader, if there is one, and clears the
 * reference to it.
 *
 * @throws IOException if closing the Reader fails
 */
@Override
protected void _closeSource() throws IOException
{
    if (_in != null) {
        _in.close();
        _in = null;
    }
}

/*
/**********************************************************************
/* Public scanner interface (1st level parsing)
/**********************************************************************
 */

/**
 * Completes processing of the current, lazily parsed token by
 * dispatching to the type-specific finish method.
 *
 * @throws XMLStreamException if the rest of the token is not well-formed
 */
@Override
protected final void finishToken() throws XMLStreamException
{
    _tokenIncomplete = false;
    switch (_currToken) {
    case PROCESSING_INSTRUCTION:
        finishPI();
        break;
    case CHARACTERS:
        finishCharacters();
        break;
    case COMMENT:
        finishComment();
        break;
    case SPACE:
        finishSpace();
        break;
    case DTD:
        finishDTD(true); // true -> get text
        break;
    case CDATA:
        finishCData();
        break;
    default:
        ErrorConsts.throwInternalError();
    }
}

// // // First, main iteration methods

/**
 * Reads the next token when outside of the root element: skips white
 * space and then expects a comment, processing instruction, DOCTYPE
 * declaration or the root element.
 *
 * @param isProlog True if scanning the prolog (before the root
 *   element); false if scanning the epilog (after it)
 *
 * @return Type of the token found, or <code>TOKEN_EOI</code> if
 *   input ended before any token
 *
 * @throws XMLStreamException for unexpected content
 */
@Override
public final int nextFromProlog(boolean isProlog) throws XMLStreamException
{
    if (_tokenIncomplete) { // left-overs from last thingy?
        skipToken();
    }

    // First: keep track of where event started
    setStartLocation();
    // Ok: we should get a WS or '<'. So, let's skip through WS
    while (true) {
        // Any more data? Just need a single char
        if (_inputPtr >= _inputEnd) {
            if (!loadMore()) {
                setStartLocation();
                return TOKEN_EOI;
            }
        }
        /* Important: input consists of 16-bit chars, not bytes, so the
         * value must NOT be masked with 0xFF: masking would make, for
         * example, U+013C look like '<' and U+0120 look like a space.
         */
        int c = _inputBuffer[_inputPtr++];

        // Really should get white space or '<'...
        if (c == '<') {
            break;
        }
        if (c != ' ') {
            if (c == '\n') {
                markLF();
            } else if (c == '\r') {
                if (_inputPtr >= _inputEnd) {
                    if (!loadMore()) {
                        markLF();
                        setStartLocation();
                        return TOKEN_EOI;
                    }
                }
                // \r\n counts as a single linefeed
                if (_inputBuffer[_inputPtr] == '\n') {
                    ++_inputPtr;
                }
                markLF();
            } else if (c != '\t') {
                reportPrologUnexpChar(isProlog, c, null);
            }
        }
    }

    // Ok, got LT:
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed(COMMENT);
    }
    char c = _inputBuffer[_inputPtr++];
    if (c == '!') { // comment/DOCTYPE? (CDATA not legal)
        return handlePrologDeclStart(isProlog);
    }
    if (c == '?') {
        return handlePIStart();
    }
    /* End tag not allowed if no open tree; and only one root
     * element (one root-level start tag)
     */
    if (c == '/' || !isProlog) {
        reportPrologUnexpElement(isProlog, c);
    }
    return handleStartElement(c);
}

/**
 * Reads the next token when inside the root element: first completes
 * book-keeping for the previous token, then determines the type of
 * the next one from its first character(s).
 *
 * @return Type of the token found, or <code>TOKEN_EOI</code> if
 *   input ended (error reporting for that is left to the caller)
 *
 * @throws XMLStreamException for content that is not well-formed
 */
@Override
public final int nextFromTree() throws XMLStreamException
{
    if (_tokenIncomplete) { // left-overs?
        if (skipToken()) { // Figured out next event (ENTITY_REFERENCE)?
            // !!! We don't yet parse DTD, don't know real contents
            return _nextEntity();
        }
    } else { // note: START_ELEMENT/END_ELEMENT never incomplete
        if (_currToken == START_ELEMENT) {
            if (_isEmptyTag) {
                // Important: retain same start location as with START_ELEMENT, don't overwrite
                --_depth;
                return (_currToken = END_ELEMENT);
            }
        } else if (_currToken == END_ELEMENT) {
            _currElem = _currElem.getParent();
            // Any namespace declarations that need to be unbound?
            while (_lastNsDecl != null && _lastNsDecl.getLevel() >= _depth) {
                _lastNsDecl = _lastNsDecl.unbind();
            }
        } else {
            // It's possible CHARACTERS entity with an entity ref:
            if (_entityPending) {
                _entityPending = false;
                return _nextEntity();
            }
        }
    }
    // and except for special cases, mark down actual start location of the event
    setStartLocation();

    /* Any more data? Although it'd be an error not to get any,
     * let's leave error reporting up to caller
     */
    if (_inputPtr >= _inputEnd) {
        if (!loadMore()) {
            setStartLocation();
            return TOKEN_EOI;
        }
    }
    char c = _inputBuffer[_inputPtr];

    /* Can get pretty much any type; start/end element, comment/PI,
     * CDATA, text, entity reference...
     */
    if (c == '<') { // root element, comment, proc instr?
        ++_inputPtr;
        c = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(COMMENT);
        if (c == '!') { // comment or CDATA
            return handleCommentOrCdataStart();
        }
        if (c == '?') {
            return handlePIStart();
        }
        if (c == '/') {
            return handleEndElement();
        }
        return handleStartElement(c);
    }
    if (c == '&') { // entity reference
        ++_inputPtr;
        /* Need to expand; should indicate either text, or an unexpanded
         * entity reference
         */
        int i = handleEntityInText(false);
        if (i == 0) { // general entity
            return (_currToken = ENTITY_REFERENCE);
        }
        /* Nope, a char entity; need to indicate it came from an entity.
         * Since we may want to store the char as is, too, let's negate
         * entity-based char
         */
        mTmpChar = -i;
    } else {
        /* Let's store it for future reference. May or may not be used --
         * so let's not advance input ptr quite yet.
         */
        mTmpChar = c;
    }
    // text, possibly/probably ok
    if (_cfgLazyParsing) {
        _tokenIncomplete = true;
    } else {
        finishCharacters();
    }
    return (_currToken = CHARACTERS);
}
Helper method used to isolate things that need to be (re)set in cases where the next event to report is an entity reference.
/** * Helper method used to isolate things that need to be (re)set in * cases where */
protected int _nextEntity() { // !!! Also, have to assume start location has been set or such _textBuilder.resetWithEmpty(); // !!! TODO: handle start location? return (_currToken = ENTITY_REFERENCE); } /* /********************************************************************** /* 2nd level parsing /********************************************************************** */ protected final int handlePrologDeclStart(boolean isProlog) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (c == '-') { // Comment? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == '-') { if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishComment(); } return (_currToken = COMMENT); } } else if (c == 'D') { // DOCTYPE? if (isProlog) { // no DOCTYPE in epilog handleDtdStart(); // incomplete flag is set by handleDtdStart if (!_cfgLazyParsing) { if (_tokenIncomplete) { finishDTD(true); // must copy contents, may be needed _tokenIncomplete = false; } } return DTD; } } /* error... for error recovery purposes, let's just pretend * like it was unfinished CHARACTERS, though. 
*/ _tokenIncomplete = true; _currToken = CHARACTERS; reportPrologUnexpChar(isProlog, c, " (expected '-' for COMMENT)"); return _currToken; // never gets here } private final int handleDtdStart() throws XMLStreamException { matchAsciiKeyword("DOCTYPE"); // And then some white space and root name char c = skipInternalWs(true, "after DOCTYPE keyword, before root name"); _tokenName = parsePName(c); c = skipInternalWs(false, null); //boolean gotId; if (c == 'P') { // PUBLIC matchAsciiKeyword("PUBLIC"); c = skipInternalWs(true, null); _publicId = parsePublicId(c); c = skipInternalWs(true, null); _systemId = parseSystemId(c); c = skipInternalWs(false, null); } else if (c == 'S') { // SYSTEM matchAsciiKeyword("SYSTEM"); c = skipInternalWs(true, null); _publicId = null; _systemId = parseSystemId(c); c = skipInternalWs(false, null); } else { _publicId = _systemId = null; } /* Ok; so, need to get either an internal subset, or the * end: */ if (c == '>') { // fine, we are done _tokenIncomplete = false; return (_currToken = DTD); } if (c != '[') { // If not end, must have int. subset String msg = (_systemId != null) ? " (expected '[' for the internal subset, or '>' to end DOCTYPE declaration)" : " (expected a 'PUBLIC' or 'SYSTEM' keyword, '[' for the internal subset, or '>' to end DOCTYPE declaration)"; reportTreeUnexpChar(c, msg); } /* Need not parse the int. 
subset yet, can leave as is, and then * either skip or parse later on */ _tokenIncomplete = true; return (_currToken = DTD); } protected final int handleCommentOrCdataStart() throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; // Let's first see if it's a comment (simpler) if (c == '-') { // Comment if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c != '-') { reportTreeUnexpChar(c, " (expected '-' for COMMENT)"); } if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishComment(); } return (_currToken = COMMENT); } // If not, should be CDATA: if (c == '[') { // CDATA _currToken = CDATA; for (int i = 0; i < 6; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c != CDATA_STR.charAt(i)) { reportTreeUnexpChar(c, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)"); } } if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCData(); } return CDATA; } reportTreeUnexpChar(c, " (expected either '-' for COMMENT or '[CDATA[' for CDATA section)"); return TOKEN_EOI; // never gets here } protected final int handlePIStart() throws XMLStreamException { _currToken = PROCESSING_INSTRUCTION; // Ok, first, need a name if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } // Ok, first, need a name char c = _inputBuffer[_inputPtr++]; _tokenName = parsePName(c); { // but is it "xml" (case insensitive)? String ln = _tokenName.getLocalName(); if (ln.length() == 3 && ln.equalsIgnoreCase("xml") && _tokenName.getPrefix() == null) { reportInputProblem(ErrorConsts.ERR_WF_PI_XML_TARGET); } } /* Let's then verify that we either get a space, or closing * '?>': this way we'll catch some problems right away, and also * simplify actual processing of contents. */ if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c <= INT_SPACE) { // Ok, let's skip the white space... 
while (true) { if (c == '\n') { markLF(); } else if (c == '\r') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != ' ' && c != '\t') { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr]; if (c > 0x0020) { break; } ++_inputPtr; } // Ok, got non-space, need to push back: if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishPI(); } } else { if (c != INT_QMARK) { reportMissingPISpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c != '>') { reportMissingPISpace(c); } _textBuilder.resetWithEmpty(); _tokenIncomplete = false; } return PROCESSING_INSTRUCTION; }
Returns: Code point for the entity that expands to a valid XML content character.
/** * @return Code point for the entity that expands to a valid XML * content character. */
protected final int handleCharEntity() throws XMLStreamException { // Hex or decimal? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; int value = 0; if (c == 'x') { // hex while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { break; } value = value << 4; if (c <= '9' && c >= '0') { value += (c - '0'); } else if (c >= 'a' && c <= 'f') { value += 10 + (c - 'a'); } else if (c >= 'A' && c <= 'F') { value += 10 + (c - 'A'); } else { throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)"); } if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } } } else { // numeric (decimal) while (c != ';') { if (c <= '9' && c >= '0') { value = (value * 10) + (c - '0'); if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } } else { throwUnexpectedChar(c, "; expected a decimal number"); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; } } // Ok, and then need to check result is a valid XML content char: if (value >= 0xD800) { // note: checked for overflow earlier if (value < 0xE000) { // no surrogates via entity expansion reportInvalidXmlChar(value); } if (value == 0xFFFE || value == 0xFFFF) { reportInvalidXmlChar(value); } } else if (value < 32) { // XML 1.1 allows most other chars; 1.0 does not: if (value != INT_LF && value != INT_CR && value != INT_TAB) { if (!_xml11 || value == 0) { reportInvalidXmlChar(value); } } } return value; } protected final int handleStartElement(char c) throws XMLStreamException { _currToken = START_ELEMENT; _currNsCount = 0; PName elemName = parsePName(c); /* Ok. Need to create a qualified name. 
Simplest for element * in default ns (no extra work -- expressed as null binding); * otherwise need to find binding */ String prefix = elemName.getPrefix(); boolean allBound; // flag to check 'late' bindings if (prefix == null) { // element in default ns allBound = true; // which need not be bound } else { elemName = bindName(elemName, prefix); allBound = elemName.isBound(); } _tokenName = elemName; _currElem = new ElementScope(elemName, _currElem); // And then attribute parsing loop: int attrPtr = 0; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; // Intervening space to skip? if (c <= INT_SPACE) { do { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != ' ' && c != '\t') { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; } while (c <= INT_SPACE); } else if (c != INT_SLASH && c != INT_GT) { throwUnexpectedChar(c, " expected space, or '>' or \"/>\""); } // Ok; either need to get an attribute name, or end marker: if (c == INT_SLASH) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c != '>') { throwUnexpectedChar(c, " expected '>'"); } _isEmptyTag = true; break; } else if (c == '>') { _isEmptyTag = false; break; } else if (c == '<') { reportInputProblem("Unexpected '<' character in element (missing closing '>'?)"); } // Ok, an attr name: PName attrName = parsePName(c); prefix = attrName.getPrefix(); boolean isNsDecl; if (prefix == null) { // can be default ns decl: isNsDecl = (attrName.getLocalName() == "xmlns"); } else { // May be a namespace decl though? 
if (prefix == "xmlns") { isNsDecl = true; } else { attrName = bindName(attrName, prefix); if (allBound) { allBound = attrName.isBound(); } isNsDecl = false; } } // Optional space to skip again while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c > INT_SPACE) { break; } if (c == '\n') { markLF(); } else if (c == '\r') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != ' ' && c != '\t') { throwInvalidSpace(c); } } if (c != '=') { throwUnexpectedChar(c, " expected '='"); } // Optional space to skip again while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c > INT_SPACE) { break; } if (c == '\n') { markLF(); } else if (c == '\r') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != ' ' && c != '\t') { throwInvalidSpace(c); } } if (c != '"' && c != '\'') { throwUnexpectedChar(c, " Expected a quote"); } /* Ok, finally: value parsing. However, ns URIs are to be handled * different from attribute values... let's offline URIs, since * they should be less common than attribute values. */ if (isNsDecl) { // default ns, or explicit? handleNsDeclaration(attrName, c); ++_currNsCount; } else { // nope, a 'real' attribute: attrPtr = collectValue(attrPtr, c, attrName); } } { // Note: this call also checks attribute uniqueness int act = _attrCollector.finishLastValue(attrPtr); if (act < 0) { // error, dup attr indicated by -1 act = _attrCollector.getCount(); // let's get correct count reportInputProblem(_attrCollector.getErrorMsg()); } _attrCount = act; } ++_depth; /* Was there any prefix that wasn't bound prior to use? * That's legal, assuming declaration was found later on... 
* let's check */ if (!allBound) { if (!elemName.isBound()) { // element itself unbound reportUnboundPrefix(_tokenName, false); } for (int i = 0, len = _attrCount; i < len; ++i) { PName attrName = _attrCollector.getName(i); if (!attrName.isBound()) { reportUnboundPrefix(attrName, true); } } } return START_ELEMENT; }
This method implements the tight loop for parsing attribute values. It's off-lined from the main start element method to simplify main method, which makes code more maintainable and possibly easier for JIT/HotSpot to optimize.
/** * This method implements the tight loop for parsing attribute * values. It's off-lined from the main start element method to * simplify main method, which makes code more maintainable * and possibly easier for JIT/HotSpot to optimize. */
private final int collectValue(int attrPtr, char quoteChar, PName attrName) throws XMLStreamException { char[] attrBuffer = _attrCollector.startNewValue(attrName, attrPtr); final int[] TYPES = sCharTypes.ATTR_CHARS; value_loop: while (true) { char c; ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } int max = _inputEnd; { int max2 = ptr + (attrBuffer.length - attrPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = _inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } attrBuffer[attrPtr++] = c; } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } // fall through case XmlCharTypes.CT_WS_LF: markLF(); // fall through case XmlCharTypes.CT_WS_TAB: // Plus, need to convert these all to simple space c = ' '; break; case XmlCharTypes.CT_LT: throwUnexpectedChar(c, "'<' not allowed in attribute value"); case XmlCharTypes.CT_AMP: { int d = handleEntityInText(false); if (d == 0) { // unexpanded general entity... not good reportUnexpandedEntityInAttr(attrName, false); } // Ok; does it need a surrogate though? (over 16 bits) if ((d >> 16) != 0) { d -= 0x10000; attrBuffer[attrPtr++] = (char) (0xD800 | (d >> 10)); d = 0xDC00 | (d & 0x3FF); if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } } c = (char) d; } break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break value_loop; } // default: // Other chars are not important here... 
} } else if (c >= 0xD800) { if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); attrBuffer[attrPtr++] = c; // Need to ensure room for one more if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // We know there's room for at least one more char attrBuffer[attrPtr++] = c; } return attrPtr; }
Method called from the main START_ELEMENT handling loop, to parse namespace URI values.
/**
 * Method called from the main START_ELEMENT handling loop, to
 * parse namespace URI values. The value is collected in the name
 * buffer (which is grown as necessary), and the resulting URI is
 * then bound to the declared prefix (or default namespace).
 *
 * @param name Name of the namespace declaration attribute
 * @param quoteChar Quote character that started the value, and
 *   which thereby also ends it
 *
 * @throws XMLStreamException if the value contains '&lt;', an
 *   unexpanded general entity or an invalid white space character
 */
private void handleNsDeclaration(PName name, char quoteChar)
    throws XMLStreamException
{
    int attrPtr = 0;
    char[] attrBuffer = _nameBuffer;

    while (true) {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        char c = _inputBuffer[_inputPtr++];
        if (c == quoteChar) {
            break;
        }
        if (c == '&') { // entity
            int d = handleEntityInText(false);
            if (d == 0) { // general entity; should never happen
                reportUnexpandedEntityInAttr(name, true);
            }
            // Ok; does it need a surrogate though? (over 16 bits)
            if ((d >> 16) != 0) {
                // room is needed for the first surrogate here; room for
                // the second one is ensured at the end of the loop
                if (attrPtr >= attrBuffer.length) {
                    _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
                }
                d -= 0x10000;
                attrBuffer[attrPtr++] = (char) (0xD800 | (d >> 10));
                d = 0xDC00 | (d & 0x3FF);
            }
            c = (char) d;
        } else if (c == '<') { // error
            throwUnexpectedChar(c, "'<' not allowed in attribute value");
        } else {
            if (c < INT_SPACE) {
                if (c == '\n') {
                    markLF();
                } else if (c == '\r') {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    // \r\n is handled as a single linefeed
                    if (_inputBuffer[_inputPtr] == '\n') {
                        ++_inputPtr;
                    }
                    markLF();
                    c = '\n';
                } else if (c != '\t') {
                    throwInvalidSpace(c);
                }
            }
        }
        if (attrPtr >= attrBuffer.length) {
            _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length);
        }
        attrBuffer[attrPtr++] = c;
    }

    /* Simple optimization: for default ns removal (or, with
     * ns 1.1, any other as well), will use empty value... no
     * need to try to intern:
     */
    if (attrPtr == 0) {
        bindNs(name, "");
    } else {
        String uri = _config.canonicalizeURI(attrBuffer, attrPtr);
        bindNs(name, uri);
    }
}

/**
 * Parses an end tag, after "&lt;/" has been read: verifies that the
 * name matches, char by char, the prefixed name of the currently
 * open element, and that only optional white space precedes the
 * closing '&gt;'.
 *
 * @return <code>END_ELEMENT</code>
 *
 * @throws XMLStreamException if the name does not match that of the
 *   open element, or the tag is not properly closed
 */
protected final int handleEndElement() throws XMLStreamException
{
    --_depth;
    _currToken = END_ELEMENT;
    // Ok, at this point we have seen '/', need the name
    _tokenName = _currElem.getName();

    String pname = _tokenName.getPrefixedName();
    char c;
    int i = 0;
    int len = pname.length();
    do {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr++];
        if (c != pname.charAt(i)) {
            reportUnexpectedEndTag(pname);
        }
    } while (++i < len);

    // Can still have a problem, if name didn't end there...
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    c = _inputBuffer[_inputPtr++];
    if (c <= ' ') {
        c = skipInternalWs(false, null);
    } else if (c != '>') {
        // a name char here means the actual name is longer than expected
        if (c == ':' || XmlChars.is10NameChar(c)) {
            reportUnexpectedEndTag(pname);
        }
    }
    if (c != '>') {
        throwUnexpectedChar(c, " expected space or closing '>'");
    }
    return END_ELEMENT;
}

protected final int handleEntityInText(boolean inAttr) throws XMLStreamException
{
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char c = _inputBuffer[_inputPtr++];
    if (c == '#') {
        return handleCharEntity();
    }
    String start;
    if (c == 'a') { // amp or apos?
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr++];
        if (c == 'm') { // amp?
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
            if (c == 'p') {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                c = _inputBuffer[_inputPtr++];
                if (c == ';') {
                    return INT_AMP;
                }
                start = "amp";
            } else {
                start = "am";
            }
        } else if (c == 'p') { // apos?
if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 'o') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 's') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { return INT_APOS; } start = "apos"; } else { start = "apo"; } } else { start = "ap"; } } else { start = "a"; } } else if (c == 'l') { // lt? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 't') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { return INT_LT; } start = "lt"; } else { start = "l"; } } else if (c == 'g') { // gt? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 't') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { return INT_GT; } start = "gt"; } else { start = "g"; } } else if (c == 'q') { // quot? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 'u') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 'o') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 't') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { return INT_QUOTE; } start = "quot"; } else { start = "quo"; } } else { start = "qu"; } } else { start = "q"; } } else { start = ""; } final int[] TYPES = sCharTypes.NAME_CHARS; /* All righty: we have the beginning of the name, plus the first * char too. So let's see what we can do with it. 
*/ char[] cbuf = _nameBuffer; int cix = 0; for (int len = start.length(); cix < len; ++cix) { cbuf[cix] = start.charAt(cix); } //int colon = -1; while (c != ';') { boolean ok; // Has to be a valid name start char though: if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_NAME_NONE: case XmlCharTypes.CT_NAME_COLON: // not ok for entities? case XmlCharTypes.CT_NAME_NONFIRST: ok = (cix > 0); break; case XmlCharTypes.CT_NAME_ANY: ok = true; break; default: ok = false; break; } } else { if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception int value = decodeSurrogate(c); if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } cbuf[cix++] = c; c = _inputBuffer[_inputPtr-1]; // was read by decode func ok = (cix == 0) ? XmlChars.is10NameStartChar(value) : XmlChars.is10NameChar(value); } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); ok = false; // never gets here } else { ok = true; } } if (!ok) { reportInvalidNameChar(c, cix); } if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } cbuf[cix++] = c; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; } // Ok, let's construct a (temporary) entity name, then: String pname = new String(cbuf, 0, cix); // (note: hash is dummy... not to be compared to anything etc) _tokenName = new PNameC(pname, null, pname, 0); /* One more thing: do we actually allow entities in this mode * and with this event? 
*/ if (_config.willExpandEntities()) { reportInputProblem("General entity reference (&"+pname+";) encountered in entity expanding mode: operation not (yet) implemented"); } if (inAttr) { reportInputProblem("General entity reference (&"+pname+";) encountered in attribute value, in non-entity-expanding mode: no way to handle it"); } return 0; } @Override protected final void finishComment() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.resetWithEmpty(); int outPtr = 0; main_loop: while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_HYPHEN: // '-->'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '-') { // ok, must be end then ++_inputPtr; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr++] != '>') { reportDoubleHyphenInComments(); } break main_loop; } break; // default: // Other types are not important here.. 
} } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // We know there's room for one more: outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); } @Override protected final void finishPI() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.resetWithEmpty(); int outPtr = 0; main_loop: while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == CHAR_LF) { ++_inputPtr; } markLF(); c = '\n'; } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_QMARK: // '?>'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '>') { ++_inputPtr; break main_loop; } break; // default: // Other types are not important here... 
} } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // We know there's room for one more: outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); } @Override protected final void finishDTD(boolean copyContents) throws XMLStreamException { char[] outputBuffer = copyContents ? _textBuilder.resetWithEmpty() : null; int outPtr = 0; final int[] TYPES = sCharTypes.DTD_CHARS; boolean inDecl = false; // in declaration/directive? int quoteChar = 0; // inside quoted string? main_loop: while (true) { char c; /* First we'll have a quickie loop for speeding through * uneventful chars... */ ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } int max = _inputEnd; if (outputBuffer != null) { if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } } while (ptr < max) { c = _inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } if (outputBuffer != null) { outputBuffer[outPtr++] = c; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_DTD_QUOTE: // apos or quot if (quoteChar == 0) { quoteChar = c; } else { if (quoteChar == c) { quoteChar = 0; } } break; 
case XmlCharTypes.CT_DTD_LT: if (!inDecl) { inDecl = true; } break; case XmlCharTypes.CT_DTD_GT: if (quoteChar == 0) { inDecl = false; } break; case XmlCharTypes.CT_DTD_RBRACKET: if (!inDecl && quoteChar == 0) { break main_loop; } break; // default: // Other types are not important here... } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); if (outputBuffer != null) { outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } if (outputBuffer != null) { // has room for one more outputBuffer[outPtr++] = c; } } if (outputBuffer != null) { _textBuilder.setCurrentLength(outPtr); } // but still need to match the '>'... char c = skipInternalWs(false, null); if (c != '>') { throwUnexpectedChar(c, " expected '>' after the internal subset"); } } @Override protected final void finishCData() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.resetWithEmpty(); int outPtr = 0; /* At this point, space (if any) has been skipped, and we are * to parse and store the contents */ main_loop: while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } // And then 
exceptions: if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_RBRACKET: // close ']]>' marker? /* Ok: let's just parse all consequtive right brackets, * and see if followed by greater-than char. This because * we can only push back at most one char at a time, and * thus can't easily just check a subset */ int count = 0; // ignore first bracket char d; do { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } d = _inputBuffer[_inputPtr]; if (d != ']') { break; } ++_inputPtr; ++count; } while (true); // Was the marker found? boolean ok = (d == '>' && count >= 1); if (ok) { --count; } // Brackets to copy to output? for (; count > 0; --count) { outputBuffer[outPtr++] = ']'; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } } if (ok) { ++_inputPtr; // to consume '>' break main_loop; } break; // default: // Other types are not important here... } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // Ok, can output the char; there's room for one char at least outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); /* 03-Feb-2009, tatu: To support coalescing mode, may need to * do some extra work */ if (_cfgCoalescing && !_entityPending) { finishCoalescedText(); } } @Override protected final void finishCharacters() throws XMLStreamException { int outPtr; char[] outputBuffer; // Ok, so what was the first char / entity? 
{ int c = mTmpChar; if (c < 0) { // from entity; can just copy as is c = -c; outputBuffer = _textBuilder.resetWithEmpty(); outPtr = 0; if ((c >> 16) != 0) { // surrogate pair? c -= 0x10000; /* Note: after resetting the buffer, it's known to have * space for more than 2 chars we need to add */ outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } outputBuffer[outPtr++] = (char) c; } else { // white space that we are interested in? if (c == INT_CR || c == INT_LF) { ++_inputPtr; // wasn't advanced yet, in this case outPtr = checkInTreeIndentation((char) c); if (outPtr < 0) { return; } // Above call also initializes the text builder appropriately outputBuffer = _textBuilder.getBufferWithoutReset(); } else { outputBuffer = _textBuilder.resetWithEmpty(); outPtr = 0; } } } final int[] TYPES = sCharTypes.TEXT_CHARS; final char[] inputBuffer = _inputBuffer; main_loop: while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } // And then exceptions: if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (inputBuffer[ptr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_LT: --_inputPtr; break main_loop; case XmlCharTypes.CT_AMP: { int d 
= handleEntityInText(false); if (d == 0) { // unexpandable general parsed entity // _inputPtr set by entity expansion method _entityPending = true; break main_loop; } // Ok; does it need a surrogate though? (over 16 bits) if ((d >> 16) != 0) { d -= 0x10000; outputBuffer[outPtr++] = (char) (0xD800 | (d >> 10)); // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } d = (0xDC00 | (d & 0x3FF)); } c = (char) d; } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = inputBuffer[_inputPtr]; if (c != ']') { break; } ++_inputPtr; // to skip past bracket ++count; } if (c == '>' && count > 1) { reportIllegalCDataEnd(); } // Nope. Need to output all brackets, then; except // for one that can be left for normal output while (count > 1) { outputBuffer[outPtr++] = ']'; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } // Need to ensure room for one more char --count; } } // Can just output the first ']' along normal output c = ']'; break; // default: // Other types are not important here... } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); // 03-Feb-2009, tatu: Need to support coalescing mode too: if (_cfgCoalescing && !_entityPending) { finishCoalescedText(); } } @Override protected final void finishSpace() throws XMLStreamException { /* Ok: so, mTmpChar contains first space char. 
If it looks * like indentation, we can probably optimize a bit... */ char tmp = (char)mTmpChar; char[] outputBuffer; int outPtr; if (tmp == '\r' || tmp == '\n') { outPtr = checkPrologIndentation(tmp); if (outPtr < 0) { return; } // Above call also initializes the text builder appropriately outputBuffer = _textBuilder.getBufferWithoutReset(); } else { outputBuffer = _textBuilder.resetWithEmpty(); outputBuffer[0] = tmp; outPtr = 1; } int ptr = _inputPtr; while (true) { if (ptr >= _inputEnd) { if (!loadMore()) { break; } ptr = _inputPtr; } char c = _inputBuffer[ptr]; if (c > INT_SPACE) { break; } ++ptr; if (c == INT_LF) { markLF(ptr); } else if (c == INT_CR) { if (ptr >= _inputEnd) { if (!loadMore()) { // still need to output the lf if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = '\n'; break; } ptr = _inputPtr; } if (_inputBuffer[ptr] == '\n') { ++ptr; } markLF(ptr); c = '\n'; // need to convert to canonical lf } else if (c != ' ' && c != '\t') { _inputPtr = ptr; throwInvalidSpace(c); } // Ok, can output the char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; _textBuilder.setCurrentLength(outPtr); } /* /********************************************************************** /* 2nd level parsing for coalesced text /********************************************************************** */
Method that gets called after a primary text segment (of type CHARACTERS or CDATA, not applicable to SPACE) has been read in text buffer. Method has to see if the following event would be textual as well, and if so, read it (and any other following textual segments).
/** * Method that gets called after a primary text segment (of type * CHARACTERS or CDATA, not applicable to SPACE) has been read in * text buffer. Method has to see if the following event would * be textual as well, and if so, read it (and any other following * textual segments). */
protected final void finishCoalescedText() throws XMLStreamException { while (true) { // no matter what, will need (and can get) one char if (_inputPtr >= _inputEnd) { if (!loadMore()) { // most likely an error, will be handled later on return; } } if (_inputBuffer[_inputPtr] == '<') { // markup of some kind /* In worst case, need 3 chars ("<![") all in all to know * if we are getting a CDATA section */ if ((_inputPtr + 3) >= _inputEnd) { if (!loadAndRetain(3)) { // probably an error, but will be handled later return; } } if (_inputBuffer[_inputPtr+1] != '!' || _inputBuffer[_inputPtr+2] != '[') { // can't be CDATA, we are done here return; } // but let's verify it still: _inputPtr += 3; for (int i = 0; i < 6; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (c != CDATA_STR.charAt(i)) { reportTreeUnexpChar(c, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)"); } } finishCoalescedCData(); } else { // textual (or entity, error etc) finishCoalescedCharacters(); if (_entityPending) { break; } } } } // note: code mostly copied from 'finishCharacters', just simplified // in some places protected final void finishCoalescedCData() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.getBufferWithoutReset(); int outPtr = _textBuilder.getCurrentLength(); /* At this point, space (if any) has been skipped, and we are * to parse and store the contents */ main_loop: while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { 
_inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } // And then exceptions: if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_RBRACKET: // close ']]>' marker? /* Ok: let's just parse all consequtive right brackets, * and see if followed by greater-than char. This because * we can only push back at most one char at a time, and * thus can't easily just check a subset */ int count = 0; // ignore first bracket char d; do { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } d = _inputBuffer[_inputPtr]; if (d != ']') { break; } ++_inputPtr; ++count; } while (true); // Was the marker found? boolean ok = (d == '>' && count >= 1); if (ok) { --count; } // Brackets to copy to output? for (; count > 0; --count) { outputBuffer[outPtr++] = ']'; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } } if (ok) { ++_inputPtr; // to consume '>' break main_loop; } break; // default: // Other types are not important here... 
} } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // Ok, can output the char; there's room for one char at least outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); } // note: code mostly copied from 'finishCharacters', just simplified // in some places protected final void finishCoalescedCharacters() throws XMLStreamException { // first char can't be from (char) entity (wrt finishCharacters) final int[] TYPES = sCharTypes.TEXT_CHARS; final char[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.getBufferWithoutReset(); int outPtr = _textBuilder.getCurrentLength(); main_loop: while (true) { char c; ascii_loop: while (true) { // tight loop for ascii chars int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } // And then exceptions: if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (inputBuffer[ptr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_LT: --_inputPtr; break main_loop; case XmlCharTypes.CT_AMP: { 
int d = handleEntityInText(false); if (d == 0) { // unexpandable general parsed entity // _inputPtr set by entity expansion method _entityPending = true; break main_loop; } // Ok; does it need a surrogate though? (over 16 bits) if ((d >> 16) != 0) { d -= 0x10000; outputBuffer[outPtr++] = (char) (0xD800 | (d >> 10)); // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } d = (0xDC00 | (d & 0x3FF)); } c = (char) d; } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = inputBuffer[_inputPtr]; if (c != ']') { break; } ++_inputPtr; // to skip past bracket ++count; } if (c == '>' && count > 1) { reportIllegalCDataEnd(); } // Nope. Need to output all brackets, then; except // for one that can be left for normal output while (count > 1) { outputBuffer[outPtr++] = ']'; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } // Need to ensure room for one more char --count; } } // Can just output the first ']' along normal output c = ']'; break; // default: // Other types are not important here... } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); }
Method that gets called after a primary text segment (of type CHARACTERS or CDATA, not applicable to SPACE) has been skipped. Method has to see if the following event would be textual as well, and if so, skip it (and any other following textual segments).
Returns:True if we encountered an unexpandable entity
/** * Method that gets called after a primary text segment (of type * CHARACTERS or CDATA, not applicable to SPACE) has been skipped. * Method has to see if the following event would * be textual as well, and if so, skip it (and any other following * textual segments). * * @return True if we encountered an unexpandable entity */
@Override protected final boolean skipCoalescedText() throws XMLStreamException { while (true) { // no matter what, will need (and can get) one char if (_inputPtr >= _inputEnd) { if (!loadMore()) { // most likely an error, will be handled later on return false; } } if (_inputBuffer[_inputPtr] == '<') { // markup of some kind /* In worst case, need 3 chars ("<![") all in all to know * if we are getting a CDATA section */ if ((_inputPtr + 3) >= _inputEnd) { if (!loadAndRetain(3)) { // probably an error, but will be handled later return false; } } if (_inputBuffer[_inputPtr+1] != '!' || _inputBuffer[_inputPtr+2] != '[') { // can't be CDATA, we are done here return false; } // but let's verify it still: _inputPtr += 3; for (int i = 0; i < 6; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (c != CDATA_STR.charAt(i)) { reportTreeUnexpChar(c, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)"); } } skipCData(); } else { // textual (or entity, error etc) if (skipCharacters()) { return true; } } } } /* /********************************************************************** /* 2nd level parsing for skipping content /********************************************************************** */ @Override protected final void skipComment() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if 
(_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_HYPHEN: // '-->'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '-') { // ok, must be end then ++_inputPtr; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr++] != '>') { reportDoubleHyphenInComments(); } return; } break; } // default: // Other types are not important here... } } } @Override protected final void skipPI() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == CHAR_LF) { ++_inputPtr; } markLF(); } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_QMARK: // '?>'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '>') { ++_inputPtr; return; } break; // default: // Other types are not important here... 
} } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception /*char d =*/ checkSurrogate(c); } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // skipping, no need to output } } @Override protected final boolean skipCharacters() throws XMLStreamException { final int[] TYPES = sCharTypes.TEXT_CHARS; final char[] inputBuffer = _inputBuffer; while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == CHAR_LF) { ++_inputPtr; } markLF(); } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_LT: --_inputPtr; return false; case XmlCharTypes.CT_AMP: { int d = handleEntityInText(false); if (d == 0) { // unexpandable general parsed entity return true; } } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = inputBuffer[_inputPtr]; if (c != ']') { break; } ++_inputPtr; // to skip past bracket ++count; } if (c == '>' && count > 1) { reportIllegalCDataEnd(); } } // Can just output the first ']' along normal output break; // default: // Other types are not important here... 
} } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception /*char d =*/ checkSurrogate(c); } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } } } @Override protected final void skipCData() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (inputBuffer[ptr] == CHAR_LF) { ++ptr; ++_inputPtr; } markLF(ptr); } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // end is nigh? int count = 0; do { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } ++count; c = _inputBuffer[_inputPtr++]; } while (c == ']'); if (c == '>') { if (count > 1) { // gotcha return; } // can still skip plain ']>'... } else { --_inputPtr; // need to push back last char } } break; // default: // Other types are not important here... 
} } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception /*char d =*/ checkSurrogate(c); } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } } } @Override protected final void skipSpace() throws XMLStreamException { // mTmpChar has a space, but it's been checked, can ignore int ptr = _inputPtr; while (true) { if (ptr >= _inputEnd) { if (!loadMore()) { break; } ptr = _inputPtr; } char c = _inputBuffer[ptr]; if (c > ' ') { // !!! TODO: xml 1.1 ws break; } ++ptr; if (c == '\n') { markLF(ptr); } else if (c == '\r') { if (ptr >= _inputEnd) { if (!loadMore()) { break; } ptr = _inputPtr; } if (_inputBuffer[ptr] == '\n') { ++ptr; } markLF(ptr); } else if (c != ' ' && c != '\t') { _inputPtr = ptr; throwInvalidSpace(c); } } _inputPtr = ptr; } /* /********************************************************************** /* Entity/name handling /********************************************************************** */
Returns: First character following skipped white space
/** * @return First byte following skipped white space */
protected char skipInternalWs(boolean reqd, String msg) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (c > INT_SPACE) { if (!reqd) { return c; } reportTreeUnexpChar(c, " (expected white space "+msg+")"); } do { // But let's first handle the space we already got: if (c == '\n') { markLF(); } else if (c == '\r') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != ' ' && c != '\t') { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; } while (c <= INT_SPACE); return c; } private final void matchAsciiKeyword(String keyw) throws XMLStreamException { for (int i = 1, len = keyw.length(); i < len; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (c != keyw.charAt(i)) { reportTreeUnexpChar(c, " (expected '"+keyw.charAt(i)+"' for "+keyw+" keyword)"); } } }

Note: consecutive white space is only considered indentation if the following token seems like a tag (start/end). This is so that if a CDATA section follows, it can be coalesced in coalescing mode. Although we could check if coalescing mode is enabled, this should seldom have significant effect either way, so it removes one possible source of problems in coalescing mode.

Returns:-1, if indentation was handled; offset in the output buffer, if not
/** *<p> * Note: consequtive white space is only considered indentation, * if the following token seems like a tag (start/end). This so * that if a CDATA section follows, it can be coalesced in * coalescing mode. Although we could check if coalescing mode is * enabled, this should seldom have significant effect either way, * so it removes one possible source of problems in coalescing mode. * * @return -1, if indentation was handled; offset in the output * buffer, if not */
protected final int checkInTreeIndentation(char c) throws XMLStreamException { if (c == '\r') { // First a degenerate case, a lone \r: if (_inputPtr >= _inputEnd && !loadMore()) { _textBuilder.resetWithIndentation(0, CHAR_SPACE); return -1; } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } } markLF(); // Then need an indentation char (or start/end tag): if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr]; if (c != ' ' && c != '\t') { // May still be indentation, if it's lt + non-exclamation mark if (c == '<') { if ((_inputPtr+1) < _inputEnd && _inputBuffer[_inputPtr+1] != '!') { _textBuilder.resetWithIndentation(0, ' '); return -1; } } char[] outputBuffer = _textBuilder.resetWithEmpty(); outputBuffer[0] = '\n'; _textBuilder.setCurrentLength(1); return 1; } // So how many do we get? ++_inputPtr; int count = 1; int max = (c == ' ') ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS; while (count <= max) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c2 = _inputBuffer[_inputPtr]; if (c2 != c) { // Has to be followed by a start/end tag... if (c2 == '<' && (_inputPtr+1) < _inputEnd && _inputBuffer[_inputPtr+1] != '!') { _textBuilder.resetWithIndentation(count, c); return -1; } break; } ++_inputPtr; ++count; } // Nope, hit something else, or too long: need to just copy the stuff // we know buffer has enough room either way char[] outputBuffer = _textBuilder.resetWithEmpty(); outputBuffer[0] = '\n'; for (int i = 1; i <= count; ++i) { outputBuffer[i] = c; } count += 1; // to account for leading lf _textBuilder.setCurrentLength(count); return count; }
Returns:-1, if indentation was handled; offset in the output buffer, if not
/** * @return -1, if indentation was handled; offset in the output * buffer, if not */
protected final int checkPrologIndentation(char c) throws XMLStreamException { if (c == '\r') { // First a degenerate case, a lone \r: if (_inputPtr >= _inputEnd && !loadMore()) { _textBuilder.resetWithIndentation(0, CHAR_SPACE); return -1; } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } } markLF(); // Ok, indentation char? if (_inputPtr >= _inputEnd && !loadMore()) { _textBuilder.resetWithIndentation(0, CHAR_SPACE); return -1; } c = _inputBuffer[_inputPtr]; // won't advance past the char yet if (c != ' ' && c != '\t') { // If lt, it's still indentation ok: if (c == '<') { // need _textBuilder.resetWithIndentation(0, CHAR_SPACE); return -1; } // Nope... something else char[] outputBuffer = _textBuilder.resetWithEmpty(); outputBuffer[0] = '\n'; _textBuilder.setCurrentLength(1); return 1; } // So how many do we get? ++_inputPtr; int count = 1; int max = (c == ' ') ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS; while (true) { if (_inputPtr >= _inputEnd && !loadMore()) { break; } if (_inputBuffer[_inputPtr] != c) { break; } ++_inputPtr; ++count; if (count >= max) { // ok, can't share... but can build it still // we know buffer has enough room char[] outputBuffer = _textBuilder.resetWithEmpty(); outputBuffer[0] = '\n'; for (int i = 1; i <= count; ++i) { outputBuffer[i] = c; } count += 1; // to account for leading lf _textBuilder.setCurrentLength(count); return count; } } // Ok, gotcha? _textBuilder.resetWithIndentation(count, c); return -1; } protected PName parsePName(char c) throws XMLStreamException { char[] nameBuffer = _nameBuffer; /* Let's do just quick sanity check first; a thorough check will be * done later on if necessary, now we'll just do the very cheap * check to catch extra spaces etc. 
*/ if (c < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode throwUnexpectedChar(c, "; expected a name start character"); } nameBuffer[0] = c; int hash = (int) c; int ptr = 1; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr]; int d = (int) c; if (d < 65) { // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (d < 45 || d > 58 || d == 47) { // End of name, a single ascii char? PName n = _symbols.findSymbol(nameBuffer, 0, ptr, hash); if (n == null) { n = addPName(nameBuffer, ptr, hash); } return n; } } ++_inputPtr; if (ptr >= nameBuffer.length) { _nameBuffer = nameBuffer = DataUtil.growArrayBy(nameBuffer, nameBuffer.length); } nameBuffer[ptr++] = c; hash = (hash * 31) + d; } } protected final PName addPName(char[] nameBuffer, int nameLen, int hash) throws XMLStreamException { // Let's validate completely, now: char c = nameBuffer[0]; int namePtr = 1; int last_colon = -1; // where the colon is if (c < 0xD800 || c >= 0xE000) { if (!XmlChars.is10NameStartChar(c)) { reportInvalidNameChar(c, 0); } } else { if (nameLen == 1) { reportInvalidFirstSurrogate(c); } // Only returns if ok; throws exception otherwise checkSurrogateNameChar(c, nameBuffer[1], 0); ++namePtr; } for (; namePtr < nameLen; ++namePtr) { c = nameBuffer[namePtr]; if (c < 0xD800 || c >= 0xE000) { if (c == ':') { if (last_colon >= 0) { reportMultipleColonsInName(); } last_colon = namePtr; } else { if (!XmlChars.is10NameChar(c)) { reportInvalidNameChar(c, namePtr); } } } else { if ((namePtr+1) >= nameLen) { // unpaired surrogate reportInvalidFirstSurrogate(c); } checkSurrogateNameChar(c, nameBuffer[namePtr+1], namePtr); } } return _symbols.addSymbol(nameBuffer, 0, nameLen, hash); } protected String parsePublicId(char quoteChar) throws XMLStreamException { char[] outputBuffer = _nameBuffer; int outPtr = 0; final int[] TYPES = XmlCharTypes.PUBID_CHARS; boolean addSpace = false; main_loop: while 
(true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } // Easier to check without char type table, first: char c = _inputBuffer[_inputPtr++]; if (c == quoteChar) { break main_loop; } if ((c > 0xFF) || TYPES[c] != XmlCharTypes.PUBID_OK) { throwUnexpectedChar(c, " in public identifier"); } // White space? Needs to be coalecsed if (c <= INT_SPACE) { addSpace = true; continue; } if (addSpace) { if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = ' '; addSpace = false; } if (outPtr >= outputBuffer.length) { _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length); outPtr = 0; } outputBuffer[outPtr++] = c; } return new String(outputBuffer, 0, outPtr); } protected String parseSystemId(char quoteChar) throws XMLStreamException { char[] outputBuffer = _nameBuffer; int outPtr = 0; // attribute types are closest matches, so let's use them final int[] TYPES = sCharTypes.ATTR_CHARS; //boolean spaceToAdd = false; main_loop: while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (TYPES[c] != 0) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break main_loop; } } } if (outPtr >= outputBuffer.length) { _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length); outPtr = 0; } outputBuffer[outPtr++] = c; } return new String(outputBuffer, 0, outPtr); } /* /********************************************************************** /* Other parsing helper methods /********************************************************************** */
This method is called to verify that a surrogate pair found describes a legal surrogate pair (ie. expands to a legal XML char)
/** * This method is called to verify that a surrogate * pair found describes a legal surrogate pair (ie. expands * to a legal XML char) */
private char checkSurrogate(char firstChar) throws XMLStreamException { if (firstChar >= 0xDC00) { reportInvalidFirstSurrogate(firstChar); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char sec = _inputBuffer[_inputPtr++]; if (sec < 0xDC00 || sec >= 0xE000) { reportInvalidSecondSurrogate(sec); } // And the composite, is it ok? int val = ((firstChar - 0xD800) << 10) + 0x10000; if (val > XmlConsts.MAX_UNICODE_CHAR) { reportInvalidXmlChar(val); } return sec; } private int checkSurrogateNameChar(char firstChar, char sec, int index) throws XMLStreamException { if (firstChar >= 0xDC00) { reportInvalidFirstSurrogate(firstChar); } if (sec < 0xDC00 || sec >= 0xE000) { reportInvalidSecondSurrogate(sec); } // And the composite, is it ok? int val = ((firstChar - 0xD800) << 10) + 0x10000; if (val > XmlConsts.MAX_UNICODE_CHAR) { reportInvalidXmlChar(val); } // !!! TODO: xml 1.1 vs 1.0 rules: none valid for 1.0, many for 1.1 if (true) { reportInvalidNameChar(val, index); } return val; }
This method is similar to checkSurrogate, but returns the actual character code encoded by the surrogate pair. This is needed if further validation rules (such as name character checks) are to be done.
/**
 * This method is similar to <code>checkSurrogate</code>, but
 * returns the actual character code encoded by the surrogate
 * pair. This is needed if further validation rules (such as name
 * character checks) are to be done. The second surrogate is read
 * (and consumed) from the input buffer.
 *
 * @param firstChar First (high) surrogate, already consumed
 *
 * @return Character code the surrogate pair encodes
 */
private int decodeSurrogate(char firstChar)
    throws XMLStreamException
{
    if (firstChar >= 0xDC00) {
        reportInvalidFirstSurrogate(firstChar);
    }
    if (_inputPtr >= _inputEnd) {
        loadMoreGuaranteed();
    }
    char sec = _inputBuffer[_inputPtr++];
    if (sec < 0xDC00 || sec >= 0xE000) {
        reportInvalidSecondSurrogate(sec);
    }
    // And the composite, is it ok? Low 10 bits come from the second
    // surrogate; leaving them out would return the wrong character
    int val = 0x10000 + ((firstChar - 0xD800) << 10) + (sec - 0xDC00);
    if (val > XmlConsts.MAX_UNICODE_CHAR) {
        reportInvalidXmlChar(val);
    }
    return val;
}

/**
 * Reports a character that can not start a surrogate pair.
 */
private void reportInvalidFirstSurrogate(char ch)
    throws XMLStreamException
{
    reportInputProblem("Invalid surrogate character (code 0x"+Integer.toHexString((int) ch)+"): can not start a surrogate pair");
}

/**
 * Reports a character that is not legal as the second part of a
 * surrogate pair.
 */
private void reportInvalidSecondSurrogate(char ch)
    throws XMLStreamException
{
    reportInputProblem("Invalid surrogate character (code "+Integer.toHexString((int) ch)+"): is not legal as the second part of a surrogate pair");
}

/*
/**********************************************************************
/* Location handling
/**********************************************************************
 */

/**
 * @return Location that corresponds to the current input position
 */
@Override
public XMLStreamLocation2 getCurrentLocation()
{
    return LocationImpl.fromZeroBased
        (_config.getPublicId(), _config.getSystemId(),
         _pastBytesOrChars + _inputPtr,
         _currRow, _inputPtr - _rowStartOffset);
}

/**
 * @return Offset of the current input position from the start of
 *   the current row
 */
@Override
public int getCurrentColumnNr() {
    return _inputPtr - _rowStartOffset;
}

@Override
public long getStartingByteOffset() {
    // N/A for this type
    return -1L;
}

@Override
public long getStartingCharOffset() {
    return _startRawOffset;
}

@Override
public long getEndingByteOffset() throws XMLStreamException
{
    // N/A for this type
    return -1L;
}

@Override
public long getEndingCharOffset() throws XMLStreamException
{
    // Have to complete the token to know the ending location...
    if (_tokenIncomplete) {
        finishToken();
    }
    return _pastBytesOrChars + _inputPtr;
}

/**
 * Marks start of a new row at the given offset within input buffer.
 */
protected final void markLF(int offset)
{
    _rowStartOffset = offset;
    ++_currRow;
}

/**
 * Marks start of a new row at the current input position.
 */
protected final void markLF()
{
    _rowStartOffset = _inputPtr;
    ++_currRow;
}

/**
 * Records the current input position as the start location.
 */
protected final void setStartLocation()
{
    _startRawOffset = _pastBytesOrChars + _inputPtr;
    _startRow = _currRow;
    _startColumn = _inputPtr - _rowStartOffset;
}

/*
/**********************************************************************
/* Input loading
/**********************************************************************
 */

/**
 * Replaces contents of the input buffer with the next chunk read
 * from the underlying Reader, adjusting location offsets to match.
 *
 * @return True if at least one character was read; false if there
 *   is no Reader, or it has no more content
 */
@Override
protected final boolean loadMore()
    throws XMLStreamException
{
    // If it's a block source, there's no Reader, or any more data:
    if (_in == null) {
        _inputEnd = 0;
        return false;
    }

    // Otherwise let's update offsets:
    _pastBytesOrChars += _inputEnd;
    _rowStartOffset -= _inputEnd;
    _inputPtr = 0;

    try {
        int count = _in.read(_inputBuffer, 0, _inputBuffer.length);
        if (count < 1) {
            _inputEnd = 0;
            if (count == 0) {
                /* Sanity check; should never happen with correctly written
                 * InputStreams...
                 */
                reportInputProblem("Reader returned 0 bytes, even when asked to read up to "+_inputBuffer.length);
            }
            return false;
        }
        _inputEnd = count;
        return true;
    } catch (IOException ioe) {
        throw new IoStreamException(ioe);
    }
}

/**
 * Loads more input and returns (consuming it) the first character
 * loaded; end of input is reported as a problem, described using
 * the current token type.
 */
protected final char loadOne()
    throws XMLStreamException
{
    if (!loadMore()) {
        reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(_currToken));
    }
    return _inputBuffer[_inputPtr++];
}

/**
 * Loads more input and returns (consuming it) the first character
 * loaded; end of input is reported as a problem, described using
 * the given token type.
 *
 * @param type Token type to use for the error message
 */
protected final char loadOne(int type)
    throws XMLStreamException
{
    if (!loadMore()) {
        reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(type));
    }
    return _inputBuffer[_inputPtr++];
}

/**
 * Loads more input while retaining the unread content: remaining
 * characters are first moved to the beginning of the input buffer,
 * and the Reader is then read until the buffer holds at least the
 * requested number of characters.
 *
 * @param nrOfChars Minimum number of characters needed in the buffer
 *
 * @return True if the requested number of characters is available;
 *   false if there is no Reader, or it ran out of content first
 */
protected final boolean loadAndRetain(int nrOfChars)
    throws XMLStreamException
{
    /* first: can't move, if we were handed an immutable block
     * (alternative to handing Reader as _in)
     */
    if (_in == null) {
        return false;
    }

    // otherwise, need to use cut'n pasted code from loadMore()...
    _pastBytesOrChars += _inputPtr;
    _rowStartOffset -= _inputPtr;

    int remaining = (_inputEnd - _inputPtr); // must be > 0
    System.arraycopy(_inputBuffer, _inputPtr, _inputBuffer, 0, remaining);
    _inputPtr = 0;
    _inputEnd = remaining; // temporarily set to cover copied stuff

    try {
        do {
            int max = _inputBuffer.length - _inputEnd;
            int count = _in.read(_inputBuffer, _inputEnd, max);
            if (count < 1) {
                if (count == 0) {
                    // Sanity check, should never happen with non-buggy readers/stream
                    reportInputProblem("Reader returned 0 bytes, even when asked to read up to "+max);
                }
                return false;
            }
            _inputEnd += count;
        } while (_inputEnd < nrOfChars);
        return true;
    } catch (IOException ioe) {
        throw new IoStreamException(ioe);
    }
}
}