package com.fasterxml.aalto.async;

import java.io.IOException;

import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.AsyncInputFeeder;
import com.fasterxml.aalto.AsyncXMLStreamReader;
import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.in.ByteBasedPNameTable;
import com.fasterxml.aalto.in.ByteBasedScanner;
import com.fasterxml.aalto.in.PName;
import com.fasterxml.aalto.in.ReaderConfig;
import com.fasterxml.aalto.util.CharsetNames;
import com.fasterxml.aalto.util.DataUtil;
import com.fasterxml.aalto.util.XmlCharTypes;

public abstract class AsyncByteScanner
    extends ByteBasedScanner
    implements AsyncInputFeeder
{
    // Local alias for AsyncXMLStreamReader.EVENT_INCOMPLETE; also used in this class
    // as the initial value of _nextEvent and _surroundingEvent.
    protected final static int EVENT_INCOMPLETE = AsyncXMLStreamReader.EVENT_INCOMPLETE;

    /*
    /**********************************************************************
    /* State consts
    /**********************************************************************
     */

    
Default starting state for many events/contexts -- nothing has been seen so far, no event incomplete. Not used for all event types.
/** * Default starting state for many events/contexts -- nothing has been * seen so far, no event incomplete. Not used for all event types. */
protected final static int STATE_DEFAULT = 0; // // // States for prolog/epilog major state:
State in which a less-than sign has been seen
/** * State in which a less-than sign has been seen */
protected final static int STATE_PROLOG_INITIAL = 1; // State before document when we may get xml declaration protected final static int STATE_PROLOG_SEEN_LT = 2; // "<" seen after xml declaration protected final static int STATE_PROLOG_DECL = 3; // "<!" seen after xml declaration // // // States for in-tree major state: protected final static int STATE_TREE_SEEN_LT = 1; // "<" seen protected final static int STATE_TREE_SEEN_AMP = 2; // "&" seen protected final static int STATE_TREE_SEEN_EXCL = 3; // "<!" seen protected final static int STATE_TREE_SEEN_SLASH = 4; // "</" seen protected final static int STATE_TREE_NUMERIC_ENTITY_START = 5; // "&#" and part of value protected final static int STATE_TREE_NAMED_ENTITY_START = 6; // "&" and part of name // // // States within event types (STATE_DEFAULT is shared): // XML declaration parsing protected final static int STATE_XMLDECL_AFTER_XML = 1; // "<?xml", need space protected final static int STATE_XMLDECL_BEFORE_VERSION = 2; // "<?xml ", can have more spaces protected final static int STATE_XMLDECL_VERSION = 3; // "<?xml ", part of "version" protected final static int STATE_XMLDECL_AFTER_VERSION = 4; // "<?xml version", need space or '=' protected final static int STATE_XMLDECL_VERSION_EQ = 5; // "<?xml version=", need space or quote protected final static int STATE_XMLDECL_VERSION_VALUE = 6; // parsing version value protected final static int STATE_XMLDECL_AFTER_VERSION_VALUE = 7; // version got; need space or '?' protected final static int STATE_XMLDECL_BEFORE_ENCODING = 8; // version, value, space got, need '?' 
or 'e' protected final static int STATE_XMLDECL_ENCODING = 9; // parsing "encoding" protected final static int STATE_XMLDECL_AFTER_ENCODING = 10; // 'encoding' got, need space or '=' protected final static int STATE_XMLDECL_ENCODING_EQ = 11; // "encoding=" protected final static int STATE_XMLDECL_ENCODING_VALUE = 12; // parsing encoding value protected final static int STATE_XMLDECL_AFTER_ENCODING_VALUE = 13; // encoding+value gotten; need space or '?' protected final static int STATE_XMLDECL_BEFORE_STANDALONE = 14; // after encoding+value+space; get '?' or 's' protected final static int STATE_XMLDECL_STANDALONE = 15; // parsing "standalone" protected final static int STATE_XMLDECL_AFTER_STANDALONE = 16; // 'standalone' got, need space or '=' protected final static int STATE_XMLDECL_STANDALONE_EQ = 17; // "standalone=" protected final static int STATE_XMLDECL_STANDALONE_VALUE = 18; // encoding+value gotten; need space or '?' protected final static int STATE_XMLDECL_AFTER_STANDALONE_VALUE = 19; // encoding+value gotten; need space or '?' protected final static int STATE_XMLDECL_ENDQ = 20; // "?" 
at the end of declaration // DOCTYPE declaration parsing protected final static int STATE_DTD_DOCTYPE = 1; // part of "DOCTYPE" protected final static int STATE_DTD_AFTER_DOCTYPE = 2; // "DOCTYPE", need space protected final static int STATE_DTD_BEFORE_ROOT_NAME = 3; // optional space before root name protected final static int STATE_DTD_ROOT_NAME = 4; // part of root name protected final static int STATE_DTD_AFTER_ROOT_NAME = 5; // root name gotten; need a space or '>' protected final static int STATE_DTD_BEFORE_IDS = 6; // before "PUBLIC" or "SYSTEM" token protected final static int STATE_DTD_PUBLIC_OR_SYSTEM = 7; // parsing "PUBLIC" or "SYSTEM" protected final static int STATE_DTD_AFTER_PUBLIC = 8; // "PUBLIC" found, need space protected final static int STATE_DTD_AFTER_SYSTEM = 9; // "SYSTEM" found, need space protected final static int STATE_DTD_BEFORE_PUBLIC_ID = 10; // after "PUBLIC", space, need quoted public id protected final static int STATE_DTD_PUBLIC_ID = 11; // parsing public ID protected final static int STATE_DTD_AFTER_PUBLIC_ID = 12; // public ID parsed, need space protected final static int STATE_DTD_BEFORE_SYSTEM_ID = 13; // about to parse quoted system id protected final static int STATE_DTD_SYSTEM_ID = 14; // parsing system ID protected final static int STATE_DTD_AFTER_SYSTEM_ID = 15; // after system ID, optional space, '>' or int subset protected final static int STATE_DTD_INT_SUBSET = 16; // parsing internal subset protected final static int STATE_DTD_EXPECT_CLOSING_GT = 50; // ']' gotten that should be followed by '>' // For CHARACTERS, default is the basic (and only) // just seen "&" protected final static int STATE_TEXT_AMP = 4; // just seen "&#" // protected final static int STATE_TEXT_AMP_AND_HASH = 5; // seen '&' and partial name: protected final static int STATE_TEXT_AMP_NAME = 6; // For comments, STATE_DEFAULT means "<!-" has been seen protected final static int STATE_COMMENT_CONTENT = 1; // "<!--" protected final static int 
STATE_COMMENT_HYPHEN = 2; // content, and one '-' protected final static int STATE_COMMENT_HYPHEN2 = 3; // content, "--" // For cdata, STATE_DEFAULT means that just "<![" has been seen protected final static int STATE_CDATA_CONTENT = 1; // start marker seen, maybe some content protected final static int STATE_CDATA_C = 2; // "<![C" protected final static int STATE_CDATA_CD = 3; // "<![CD" protected final static int STATE_CDATA_CDA = 4; // "<![CDA" protected final static int STATE_CDATA_CDAT = 5; // "<![CDAT" protected final static int STATE_CDATA_CDATA = 6; // "<![CDATA" // For PIs, default means that '<?' has been seen, nothing else // (note: funny ordering, starting with "quick path" entries) protected final static int STATE_PI_AFTER_TARGET = 1; // "<?", target ?> protected final static int STATE_PI_AFTER_TARGET_WS = 2; // "<?", target, ws protected final static int STATE_PI_AFTER_TARGET_QMARK = 3; // "<?", target, "?" protected final static int STATE_PI_IN_TARGET = 4; // "<?", part of target protected final static int STATE_PI_IN_DATA = 5; // "<?", target, ws, part of data // For start element, DEFAULT means that only '<' has been seen protected final static int STATE_SE_ELEM_NAME = 1; // "<" and part of name protected final static int STATE_SE_SPACE_OR_END = 2; // after elem name or attr, but need space protected final static int STATE_SE_SPACE_OR_ATTRNAME = 3; // after elem/attr and space protected final static int STATE_SE_ATTR_NAME = 4; // in attribute name protected final static int STATE_SE_SPACE_OR_EQ = 5; protected final static int STATE_SE_SPACE_OR_ATTRVALUE = 6; protected final static int STATE_SE_ATTR_VALUE_NORMAL = 7; protected final static int STATE_SE_ATTR_VALUE_NSDECL = 8; protected final static int STATE_SE_SEEN_SLASH = 9; // For END_ELEMENT, default means we are parsing name protected final static int STATE_EE_NEED_GT = 1; /* /********************************************************************** /* Markers to use for 'pending' character, if /* 
not multi-byte UTF character /********************************************************************** */ // Marker when dealing with general CR+LF pair protected final static int PENDING_STATE_CR = -1; // Parsing of possible XML declaration protected final static int PENDING_STATE_XMLDECL_LT = -5; // "<" at start of doc protected final static int PENDING_STATE_XMLDECL_LTQ = -6; // "<?" at start of doc protected final static int PENDING_STATE_XMLDECL_TARGET = -7; // "<?" at start of doc, part of name // Processing Instruction parsing: protected final static int PENDING_STATE_PI_QMARK = -15; // Comment parsing protected final static int PENDING_STATE_COMMENT_HYPHEN1 = -20; protected final static int PENDING_STATE_COMMENT_HYPHEN2 = -21; // CData parsing protected final static int PENDING_STATE_CDATA_BRACKET1 = -30; protected final static int PENDING_STATE_CDATA_BRACKET2 = -31; protected final static int PENDING_STATE_ENT_SEEN_HASH = -70; // seen &# protected final static int PENDING_STATE_ENT_SEEN_HASH_X = -71; // seen &#x protected final static int PENDING_STATE_ENT_IN_DEC_DIGIT = -72; // seen &# and 1 or more decimals protected final static int PENDING_STATE_ENT_IN_HEX_DIGIT = -73; // seen &#x and 1 or more hex digits // final static int PENDING_STATE_ENT_IN_NAME = -; // seen & and part of the name // partially handled entities within attribute/ns values use pending state as well protected final static int PENDING_STATE_ATTR_VALUE_AMP = -60; protected final static int PENDING_STATE_ATTR_VALUE_AMP_HASH = -61; protected final static int PENDING_STATE_ATTR_VALUE_AMP_HASH_X = -62; protected final static int PENDING_STATE_ATTR_VALUE_ENTITY_NAME = -63; protected final static int PENDING_STATE_ATTR_VALUE_DEC_DIGIT = -64; protected final static int PENDING_STATE_ATTR_VALUE_HEX_DIGIT = -65; protected final static int PENDING_STATE_TEXT_AMP = -80; // seen & protected final static int PENDING_STATE_TEXT_AMP_HASH = -81; // seen &# protected final static int 
PENDING_STATE_TEXT_DEC_ENTITY = -82; // seen &# and 1 or more decimals protected final static int PENDING_STATE_TEXT_HEX_ENTITY = -83; // seen &#x and 1 or more hex digits protected final static int PENDING_STATE_TEXT_IN_ENTITY = -84; // seen & and part of entity name protected final static int PENDING_STATE_TEXT_BRACKET1 = -85; // seen ] protected final static int PENDING_STATE_TEXT_BRACKET2 = -86; // seen ]] /* /********************************************************************** /* Decoding, symbol handling /********************************************************************** */
This is a simple container object that is used to access the decoding tables for characters. Indirection is needed since we actually support multiple utf-8 compatible encodings, not just utf-8 itself.

NOTE: non-final due to xml declaration handling occurring later.

/** * This is a simple container object that is used to access the * decoding tables for characters. Indirection is needed since * we actually support multiple utf-8 compatible encodings, not * just utf-8 itself. *<p> * NOTE: non-final due to xml declaration handling occurring later. */
protected XmlCharTypes _charTypes;
For now, symbol table contains prefixed names. In future it is possible that they may be split into prefixes and local names?

NOTE: non-final for async scanners

/** * For now, symbol table contains prefixed names. In future it is * possible that they may be split into prefixes and local names? *<p> * NOTE: non-final for async scanners */
protected ByteBasedPNameTable _symbols;
This buffer is used for name parsing. Will be expanded if/as needed; 32 ints can hold names 128 ascii chars long.
/** * This buffer is used for name parsing. Will be expanded if/as * needed; 32 ints can hold names 128 ascii chars long. */
protected int[] _quadBuffer = new int[32]; /* /********************************************************************** /* General state tracking /********************************************************************** */
Due to asynchronous nature of parsing, we may know what event we are trying to parse, even if it's not yet complete. Type of that event is stored here.
/** * Due to asynchronous nature of parsing, we may know what * event we are trying to parse, even if it's not yet * complete. Type of that event is stored here. */
protected int _nextEvent = EVENT_INCOMPLETE;
In addition to the event type, there is need for additional state information
/** * In addition to the event type, there is need for additional * state information */
protected int _state;
For token/state combinations that are 'shared' between events (or embedded in them), this is where the surrounding event state is retained.
/** * For token/state combinations that are 'shared' between * events (or embedded in them), this is where the surrounding * event state is retained. */
protected int _surroundingEvent = EVENT_INCOMPLETE;
There are some multi-byte combinations that must be handled as a unit: CR+LF linefeeds, multi-byte UTF-8 characters, and multi-character end markers for comments and PIs. Since they can be split across input buffer boundaries, first byte(s) may need to be temporarily stored.

If so, this int will store byte(s), in little-endian format (that is, first pending byte is at 0x000000FF, second [if any] at 0x0000FF00, and third at 0x00FF0000). This can be (and is) used to figure out actual number of bytes pending, for multi-byte (UTF-8) character decoding.

Note: it is assumed that if value is 0, there is no data. Thus, if 0 needed to be added pending, it has to be masked.

/** * There are some multi-byte combinations that must be handled * as a unit: CR+LF linefeeds, multi-byte UTF-8 characters, and * multi-character end markers for comments and PIs. * Since they can be split across input buffer * boundaries, first byte(s) may need to be temporarily stored. *<p> * If so, this int will store byte(s), in little-endian format * (that is, first pending byte is at 0x000000FF, second [if any] * at 0x0000FF00, and third at 0x00FF0000). This can be * (and is) used to figure out actual number of bytes pending, * for multi-byte (UTF-8) character decoding. *<p> * Note: it is assumed that if value is 0, there is no data. * Thus, if 0 needed to be added pending, it has to be masked. */
protected int _pendingInput = 0;
Flag that is sent when calling application indicates that there will be no more input to parse.
/** * Flag that is sent when calling application indicates that there will * be no more input to parse. */
protected boolean _endOfInput = false; /* /********************************************************************** /* Name/entity parsing state /********************************************************************** */
Number of complete quads parsed for current name (quads themselves are stored in _quadBuffer).
/** * Number of complete quads parsed for current name (quads * themselves are stored in {@link #_quadBuffer}). */
protected int _quadCount;
Bytes parsed for the current, incomplete, quad
/** * Bytes parsed for the current, incomplete, quad */
protected int _currQuad;
Number of bytes pending/buffered, stored in _currQuad
/** * Number of bytes pending/buffered, stored in {@link #_currQuad} */
protected int _currQuadBytes = 0;
Entity value accumulated so far
/** * Entity value accumulated so far */
protected int _entityValue = 0; /* /********************************************************************** /* (Start) element parsing state /********************************************************************** */ protected boolean _elemAllNsBound; protected boolean _elemAttrCount; protected byte _elemAttrQuote; protected PName _elemAttrName;
Pointer for the next character of currently being parsed value within attribute value buffer
/** * Pointer for the next character of currently being parsed value * within attribute value buffer */
protected int _elemAttrPtr;
Pointer for the next character of currently being parsed namespace URI for the current namespace declaration
/** * Pointer for the next character of currently being parsed namespace * URI for the current namespace declaration */
protected int _elemNsPtr; /* /********************************************************************** /* Other state /********************************************************************** */
Flag that indicates whether we are inside a declaration during parsing of internal DTD subset.
/** * Flag that indicates whether we are inside a declaration during parsing * of internal DTD subset. */
protected boolean _inDtdDeclaration; /* /********************************************************************** /* Life-cycle /********************************************************************** */ protected AsyncByteScanner(ReaderConfig cfg) { super(cfg); // 03-Apr-2018, tatu: Can not yet fetch `_charTypes` or `_symbols` since we // do not necessarily know actual encoding from XML declaration // _charTypes = cfg.getCharTypes(); // _symbols = cfg.getBBSymbols(); }
Initialization method to call when encoding has been definitely figured out, from XML declarations, or, from lack of one (using defaults).
Since:1.1.1
/** * Initialization method to call when encoding has been definitely figured out, * from XML declarations, or, from lack of one (using defaults). * * @since 1.1.1 */
protected void _activateEncoding() { // 04-Apr-2018, tatu: Not sure if we should try to enforce; gets tricky so for now // simply make first call stick if (_symbols == null) { _charTypes = _config.getCharTypes(); _symbols = _config.getBBSymbols(); } } @Override public void endOfInput() { _endOfInput = true; } @Override protected void _releaseBuffers() { super._releaseBuffers(); if (_symbols.maybeDirty()) { _config.updateBBSymbols(_symbols); } }
Since the async scanner has no access to whatever passes content, there is no input source in same sense as with blocking scanner; and there is nothing to close. But we can at least mark input as having ended.
/** * Since the async scanner has no access to whatever passes content, * there is no input source in same sense as with blocking scanner; * and there is nothing to close. But we can at least mark input * as having ended. */
@Override protected void _closeSource() throws IOException { // nothing to do, we are done. _endOfInput = true; } /* /********************************************************************** /* Shared helper methods /********************************************************************** */ protected void verifyAndSetXmlVersion() throws XMLStreamException { if (_textBuilder.equalsString("1.0")) { _config.setXmlVersion("1.0"); } else if (_textBuilder.equalsString("1.1")) { _config.setXmlVersion("1.1"); } else { reportInputProblem("Unrecognized XML version '"+_textBuilder.contentsAsString()+"' (expected '1.0' or '1.1')"); } } protected void verifyAndSetXmlEncoding() throws XMLStreamException { String enc = CharsetNames.normalize(_textBuilder.contentsAsString()); if ((CharsetNames.CS_UTF8 != enc) && (CharsetNames.CS_US_ASCII != enc) && (CharsetNames.CS_ISO_LATIN1 != enc)) { reportInputProblem("Unsupported encoding '"+enc+"': only UTF-8 and US-ASCII support by async parser"); } // 03-Apr-2018, tatu: Need to overwrite default (UTF-8) if declared otherwise. // And besides changing configs need to force use of new symbol tables, too... 
_config.setXmlEncoding(enc); if (enc != null) { _config.setActualEncoding(enc); } _charTypes = _config.getCharTypes(); } protected void verifyAndSetXmlStandalone() throws XMLStreamException { if (_textBuilder.equalsString("yes")) { _config.setXmlStandalone(Boolean.TRUE); } else if (_textBuilder.equalsString("no")) { _config.setXmlStandalone(Boolean.FALSE); } else { reportInputProblem("Invalid standalone value '"+_textBuilder.contentsAsString()+"': can only use 'yes' and 'no'"); } } protected void verifyAndSetPublicId() throws XMLStreamException { _publicId = _textBuilder.contentsAsString(); } protected void verifyAndSetSystemId() throws XMLStreamException { _systemId = _textBuilder.contentsAsString(); } /* /********************************************************************** /* Content accessors for less performance-critical sections /********************************************************************** */ protected abstract byte _currentByte() throws XMLStreamException; protected abstract byte _nextByte() throws XMLStreamException; protected abstract byte _prevByte() throws XMLStreamException; /* /********************************************************************** /* Abstract methods for subclasses to implement wrt prolog/epilog /********************************************************************** */ protected abstract int handlePI() throws XMLStreamException; protected abstract boolean handleDTDInternalSubset(boolean init) throws XMLStreamException; protected abstract int handleComment() throws XMLStreamException; protected abstract int handleStartElementStart(byte b) throws XMLStreamException; protected abstract int handleStartElement() throws XMLStreamException; protected abstract PName parsePName() throws XMLStreamException; protected abstract PName parseNewName(byte b) throws XMLStreamException; protected abstract boolean asyncSkipSpace() throws XMLStreamException; protected abstract boolean handlePartialCR() throws XMLStreamException; /* 
/********************************************************************** /* Second-level parsing; character content (in tree) /********************************************************************** */ @Override protected final void finishToken() throws XMLStreamException { _tokenIncomplete = false; switch (_currToken) { case PROCESSING_INSTRUCTION: finishPI(); break; case CHARACTERS: finishCharacters(); break; case COMMENT: finishComment(); break; case SPACE: finishSpace(); break; case DTD: finishDTD(true); // true -> get text break; case CDATA: finishCData(); break; default: ErrorConsts.throwInternalError(); } }
Method called to initialize state for CHARACTERS event, after just a single byte has been seen. What needs to be done next depends on whether coalescing mode is set or not: if it is not set, just a single character needs to be decoded, after which current event will be incomplete, but defined as CHARACTERS. In coalescing mode, the whole content must be read before current event can be defined. The reason for difference is that when XMLStreamReader.next() returns, no blocking can occur when calling other methods.
Returns:Event type detected; either CHARACTERS, if at least one full character was decoded (and can be returned), EVENT_INCOMPLETE if not (part of a multi-byte character split across input buffer boundary)
/** * Method called to initialize state for CHARACTERS event, after * just a single byte has been seen. What needs to be done next * depends on whether coalescing mode is set or not: if it is not * set, just a single character needs to be decoded, after which * current event will be incomplete, but defined as CHARACTERS. * In coalescing mode, the whole content must be read before * current event can be defined. The reason for difference is * that when <code>XMLStreamReader.next()</code> returns, no * blocking can occur when calling other methods. * * @return Event type detected; either CHARACTERS, if at least * one full character was decoded (and can be returned), * EVENT_INCOMPLETE if not (part of a multi-byte character * split across input buffer boundary) */
protected abstract int startCharacters(byte b) throws XMLStreamException; protected abstract boolean handleAttrValue() throws XMLStreamException; protected abstract boolean handleNsDecl() throws XMLStreamException; /* /********************************************************************** /* Abstract methods from base class, parsing /********************************************************************** */ @Override protected void finishCData() throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishComment() throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishDTD(boolean copyContents) throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishPI() throws XMLStreamException { // N/A throwInternal(); } @Override protected void finishSpace() throws XMLStreamException { // N/A throwInternal(); } // // token-skip methods
Returns:True if the whole characters segment was succesfully skipped; false if not
/** * @return True if the whole characters segment was succesfully * skipped; false if not */
@Override protected abstract boolean skipCharacters() throws XMLStreamException; @Override protected void skipCData() throws XMLStreamException { // should never be called throwInternal(); } @Override protected void skipComment() throws XMLStreamException { // should never be called throwInternal(); } @Override protected void skipPI() throws XMLStreamException { // should never be called throwInternal(); } @Override protected void skipSpace() throws XMLStreamException { // should never be called throwInternal(); } @Override protected boolean loadMore() throws XMLStreamException { // should never get called throwInternal(); return false; // never gets here } @Override protected abstract void finishCharacters() throws XMLStreamException; /* /********************************************************************** /* Internal methods, name decoding /********************************************************************** */
Method called to process a sequence of bytes that is likely to be a PName. At this point we encountered an end marker, and may either hit a formerly seen well-formed PName; an as-of-yet unseen well-formed PName; or a non-well-formed sequence (containing one or more non-name chars without any valid end markers).
Params:
  • lastQuad – Word with last 0 to 3 bytes of the PName; not included in the quad array
  • lastByteCount – Number of bytes contained in lastQuad; 0 to 3.
/** * Method called to process a sequence of bytes that is likely to * be a PName. At this point we encountered an end marker, and * may either hit a formerly seen well-formed PName; an as-of-yet * unseen well-formed PName; or a non-well-formed sequence (containing * one or more non-name chars without any valid end markers). * * @param lastQuad Word with last 0 to 3 bytes of the PName; not included * in the quad array * @param lastByteCount Number of bytes contained in lastQuad; 0 to 3. */
protected final PName findPName(int lastQuad, int lastByteCount) throws XMLStreamException { // First, need to push back the byte read but not used: --_inputPtr; int qlen = _quadCount; // Also: if last quad is empty, will need take last from qbuf. if (lastByteCount == 0) { lastQuad = _quadBuffer[--qlen]; lastByteCount = 4; } // Separate handling for short names: if (qlen <= 1) { // short name? if (qlen == 0) { // 4-bytes or less; only has 'lastQuad' defined int hash = ByteBasedPNameTable.calcHash(lastQuad); PName name = _symbols.findSymbol(hash, lastQuad, 0); if (name == null) { // Let's simplify things a bit, and just use array based one then: _quadBuffer[0] = lastQuad; name = addPName(_symbols, hash, _quadBuffer, 1, lastByteCount); } return name; } int firstQuad = _quadBuffer[0]; int hash = ByteBasedPNameTable.calcHash(firstQuad, lastQuad); PName name = _symbols.findSymbol(hash, firstQuad, lastQuad); if (name == null) { // As above, let's just use array, then _quadBuffer[1] = lastQuad; name = addPName(_symbols, hash, _quadBuffer, 2, lastByteCount); } return name; } // Nope, long (3 quads or more). At this point, the last quad is // not yet in the array, let's add: if (qlen >= _quadBuffer.length) { // let's just double? _quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[qlen++] = lastQuad; int hash = ByteBasedPNameTable.calcHash(_quadBuffer, qlen); PName name = _symbols.findSymbol(hash, _quadBuffer, qlen); if (name == null) { name = addPName(_symbols, hash, _quadBuffer, qlen, lastByteCount); } return name; } protected final PName addPName(ByteBasedPNameTable symbols, int hash, int[] quads, int qlen, int lastQuadBytes) throws XMLStreamException { return addUTFPName(symbols, _charTypes, hash, quads, qlen, lastQuadBytes); } /* /********************************************************************** /* Internal methods, input validation /********************************************************************** */
Method called to verify validity of given character (from entity) and append it to the text buffer
/** * Method called to verify validity of given character (from entity) and * append it to the text buffer */
protected void verifyAndAppendEntityCharacter(int charFromEntity) throws XMLStreamException { verifyXmlChar(charFromEntity); // Ok; does it need a surrogate though? (over 16 bits) if ((charFromEntity >> 16) != 0) { charFromEntity -= 0x10000; _textBuilder.append((char) (0xD800 | (charFromEntity >> 10))); charFromEntity = 0xDC00 | (charFromEntity & 0x3FF); } _textBuilder.append((char) charFromEntity); }
/**
 * Checks whether a character is allowed in a PublicId literal.
 *
 * @param c A character
 * @return true if the character is valid for use in the Public ID
 * of an XML doctype declaration
 *
 * @see "http://www.w3.org/TR/xml/#NT-PubidLiteral"
 */
protected boolean validPublicIdChar(int c) { return c == 0xA || //<LF> c == 0xD || //<CR> c == 0x20 || //<SPACE> (c >= '0' && c <= '9') || //[0-9] (c >= '@' && c <= 'Z') || //@[A-Z] (c >= 'a' && c <= 'z') || c == '!' || (c >= 0x23 && c <= 0x25) || //#$% (c >= 0x27 && c <= 0x2F) || //'()*+,-./ (c >= ':' && c <= ';') || c == '=' || c == '?' || c == '_'; } /* /********************************************************************** /* Internal methods, error handling /********************************************************************** */ @Override protected int decodeCharForError(byte b) throws XMLStreamException { // !!! TBI return (int) b; } protected void checkPITargetName(PName targetName) throws XMLStreamException { String ln = targetName.getLocalName(); if (ln.length() == 3 && ln.equalsIgnoreCase("xml") && !targetName.hasPrefix()) { reportInputProblem(ErrorConsts.ERR_WF_PI_XML_TARGET); } } protected int throwInternal() { throw new IllegalStateException("Internal error: should never execute this code path"); } protected void reportInvalidOther(int mask, int ptr) throws XMLStreamException { _inputPtr = ptr; reportInvalidOther(mask); } /* /********************************************************************** /* Shared implementation for handling XML prolog; less performance /* sensitive so need not inline access /********************************************************************** */ @Override public final int nextFromProlog(boolean isProlog) throws XMLStreamException { // Had fully complete event? Need to reset state etc: if (_currToken != EVENT_INCOMPLETE) { // First: keep track of where event started setStartLocation(); // yet one more special case: after START_DOCUMENT need to check things... 
if (_currToken == START_DOCUMENT) { _currToken = EVENT_INCOMPLETE; if (_tokenName != null) { _nextEvent = PROCESSING_INSTRUCTION; _state = STATE_PI_AFTER_TARGET; checkPITargetName(_tokenName); return handlePI(); } } else { _currToken = _nextEvent = EVENT_INCOMPLETE; _state = STATE_DEFAULT; } } // Ok, do we know which event it will be? if (_nextEvent == EVENT_INCOMPLETE) { // nope // The very first thing: XML declaration handling if (_state == STATE_PROLOG_INITIAL) { if (_inputPtr >= _inputEnd) { return _currToken; } // Ok: see if we have what looks like XML declaration; process: if (_pendingInput != 0) { // already parsing (potential) XML declaration Boolean b = startXmlDeclaration(); // is or may be XML declaration, so: if (b == null) { // not yet known; bail out return EVENT_INCOMPLETE; } if (b == Boolean.FALSE) { // no real XML declaration; synthesize one return _startDocumentNoXmlDecl(); } return handleXmlDeclaration(); } if (_currentByte() == BYTE_LT) { // first byte, see if it could be XML declaration ++_inputPtr; _pendingInput = PENDING_STATE_XMLDECL_LT; Boolean b = startXmlDeclaration(); // is or may be XML declaration, so: if (b == null) { return EVENT_INCOMPLETE; } if (b == Boolean.FALSE) { // no real XML declaration; synthesize one return _startDocumentNoXmlDecl(); } return handleXmlDeclaration(); } // can't be XML declaration _state = STATE_DEFAULT; return _startDocumentNoXmlDecl(); } // First: did we have a lone CR at the end of the buffer? if (_pendingInput != 0) { // yup if (!handlePartialCR()) { return _currToken; } } while (_state == STATE_DEFAULT) { if (_inputPtr >= _inputEnd) { // no more input available if (_endOfInput) { // for good? That may be fine setStartLocation(); return TOKEN_EOI; } return _currToken; } byte b = _nextByte(); // Really should get white space or '<'... anything else is // pretty much an error. if (b == BYTE_LT) { // root element, comment, proc instr? 
_state = STATE_PROLOG_SEEN_LT; break; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { // Prolog/epilog ws is to be skipped, not part of Infoset if (!asyncSkipSpace()) { // ran out of input? if (_endOfInput) { // for good? That may be fine setStartLocation(); return TOKEN_EOI; } return _currToken; } } else { reportPrologUnexpChar(isProlog, decodeCharForError(b), null); } } if (_state == STATE_PROLOG_SEEN_LT) { if (_inputPtr >= _inputEnd) { return _currToken; } byte b = _nextByte(); if (b == BYTE_EXCL) { // comment or DOCTYPE declaration? _state = STATE_PROLOG_DECL; return handlePrologDeclStart(isProlog); } if (b == BYTE_QMARK) { // PI _nextEvent = PROCESSING_INSTRUCTION; _state = STATE_DEFAULT; return handlePI(); } if (b == BYTE_SLASH || !isProlog) { reportPrologUnexpElement(isProlog, b); } return handleStartElementStart(b); } if (_state == STATE_PROLOG_DECL) { return handlePrologDeclStart(isProlog); } // should never have anything else... return throwInternal(); } // At this point, we do know the event type switch (_nextEvent) { case START_ELEMENT: return handleStartElement(); case START_DOCUMENT: return handleXmlDeclaration(); case PROCESSING_INSTRUCTION: return handlePI(); case COMMENT: return handleComment(); case DTD: return handleDTD(); } return throwInternal(); // should never get here }
Helper method called when it is determined that the document does NOT start with an xml declaration. Needs to return START_DOCUMENT, and initialize other state appropriately.
/** * Helper method called when it is determined that the document does NOT start with * an xml declaration. Needs to return START_DOCUMENT, and initialize other state * appropriately. */
protected int _startDocumentNoXmlDecl() throws XMLStreamException { // 03-Apr-2018, tatu: We can finalize encoding at this point _activateEncoding(); _currToken = START_DOCUMENT; return START_DOCUMENT; } private final int handlePrologDeclStart(boolean isProlog) throws XMLStreamException { if (_inputPtr >= _inputEnd) { // nothing we can do? return EVENT_INCOMPLETE; } byte b = _nextByte(); // So far, we have seen "<!", need to know if it's DTD or COMMENT if (b == BYTE_HYPHEN) { _nextEvent = COMMENT; _state = STATE_DEFAULT; return handleComment(); } if (b == BYTE_D) { _nextEvent = DTD; _state = STATE_DEFAULT; return handleDTD(); } reportPrologUnexpChar(isProlog, decodeCharForError(b), " (expected '-' for COMMENT)"); return EVENT_INCOMPLETE; // never gets here }
Method that deals with recognizing XML declaration, but not with parsing its contents.
Returns:null if parsing is inconclusive (may or may not be XML declaration); Boolean.TRUE if complete XML declaration, and Boolean.FALSE if something else
/** * Method that deals with recognizing XML declaration, but not with parsing * its contents. * * @return null if parsing is inconclusive (may or may not be XML declaration); * Boolean.TRUE if complete XML declaration, and Boolean.FALSE if something * else */
private final Boolean startXmlDeclaration() throws XMLStreamException { if (_inputPtr >= _inputEnd) { return null; } if (_pendingInput == PENDING_STATE_XMLDECL_LT) { // "<" at start of doc if (_currentByte() != BYTE_QMARK) { // some other _pendingInput = 0; _state = STATE_PROLOG_SEEN_LT; return Boolean.FALSE; } ++_inputPtr; _pendingInput = PENDING_STATE_XMLDECL_LTQ; if (_inputPtr >= _inputEnd) { return null; } } if (_pendingInput == PENDING_STATE_XMLDECL_LTQ) { // "<?" at start of doc byte b = _nextByte(); _tokenName = _parseNewXmlDeclName(b); if (_tokenName == null) { // incomplete _pendingInput = PENDING_STATE_XMLDECL_TARGET; return null; } // xml or not? if (!"xml".equals(_tokenName.getPrefixedName())) { // nope: some other PI _pendingInput = 0; _state = STATE_PI_AFTER_TARGET; _nextEvent = PROCESSING_INSTRUCTION; checkPITargetName(_tokenName); return Boolean.FALSE; } } else if (_pendingInput == PENDING_STATE_XMLDECL_TARGET) { // "<?" at start of doc, part of name if ((_tokenName = _parseXmlDeclName()) == null) { // incomplete return null; } if (!"xml".equals(_tokenName.getPrefixedName())) { _pendingInput = 0; _state = STATE_PI_AFTER_TARGET; _nextEvent = PROCESSING_INSTRUCTION; checkPITargetName(_tokenName); return Boolean.FALSE; } } else { throwInternal(); } _pendingInput = 0; _nextEvent = START_DOCUMENT; _state = STATE_XMLDECL_AFTER_XML; return Boolean.TRUE; }
Method called to complete parsing of XML declaration, once it has been reliably detected.
Returns:Completed token (START_DOCUMENT), if fully parsed; incomplete (EVENT_INCOMPLETE) otherwise
/** * Method called to complete parsing of XML declaration, once it has * been reliably detected. * * @return Completed token (START_DOCUMENT), if fully parsed; incomplete (EVENT_INCOMPLETE) * otherwise */
private int handleXmlDeclaration() throws XMLStreamException { // First: left-over CRs? if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } } main_loop: while (_inputPtr < _inputEnd) { switch (_state) { case STATE_XMLDECL_AFTER_XML: // "<?xml", need space { byte b = _nextByte(); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_XMLDECL_BEFORE_VERSION; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after 'xml' in xml declaration)"); } } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_BEFORE_VERSION: if (!asyncSkipSpace()) { // not enough input break; } if ((_tokenName = _parseNewXmlDeclName(_nextByte())) == null) { // incomplete _state = STATE_XMLDECL_VERSION; break; } if (!_tokenName.hasPrefixedName("version")) { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'version'"); } _state = STATE_XMLDECL_AFTER_VERSION; continue main_loop; case STATE_XMLDECL_VERSION: // "<?xml ", part of "version" if ((_tokenName = _parseXmlDeclName()) == null) { // incomplete break; } if (!_tokenName.hasPrefixedName("version")) { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'version'"); } _state = STATE_XMLDECL_AFTER_VERSION; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_VERSION: // "<?xml version", need space or '=' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b != BYTE_EQ) { reportPrologUnexpChar(true, decodeCharForError(b), " (expected '=' after 'version' in xml declaration)"); } } _state = STATE_XMLDECL_VERSION_EQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_VERSION_EQ: // "<?xml version=", need space or quote if (!asyncSkipSpace()) { // skip space, if any break; } _elemAttrQuote = _nextByte(); if (_elemAttrQuote != BYTE_QUOT && 
_elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' in xml declaration for version value)"); } { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseXmlDeclAttr(buf, 0)) { _state = STATE_XMLDECL_VERSION_VALUE; break; } } verifyAndSetXmlVersion(); _state = STATE_XMLDECL_AFTER_VERSION_VALUE; continue main_loop; case STATE_XMLDECL_VERSION_VALUE: // parsing version value if (!parseXmlDeclAttr(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength())) { _state = STATE_XMLDECL_VERSION_VALUE; break; } verifyAndSetXmlVersion(); _state = STATE_XMLDECL_AFTER_VERSION_VALUE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_VERSION_VALUE: // version got; need space or '?' { byte b = _nextByte(); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_XMLDECL_BEFORE_ENCODING; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after version value in xml declaration)"); } } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_BEFORE_ENCODING: // version, value, space got, need '?' or 'e' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if ((_tokenName = _parseNewXmlDeclName(b)) == null) { // incomplete _state = STATE_XMLDECL_ENCODING; break; } // Can actually also get "standalone" instead... 
if (_tokenName.hasPrefixedName("encoding")) { _state = STATE_XMLDECL_AFTER_ENCODING; } else if (_tokenName.hasPrefixedName("standalone")) { _state = STATE_XMLDECL_AFTER_STANDALONE; continue main_loop; } else { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'encoding'"); } } continue main_loop; case STATE_XMLDECL_ENCODING: // parsing "encoding" if ((_tokenName = _parseXmlDeclName()) == null) { // incomplete break; } // Can actually also get "standalone" instead... if (_tokenName.hasPrefixedName("encoding")) { _state = STATE_XMLDECL_AFTER_ENCODING; } else if (_tokenName.hasPrefixedName("standalone")) { _state = STATE_XMLDECL_AFTER_STANDALONE; continue main_loop; } else { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'encoding'"); } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_ENCODING: // got "encoding"; must get ' ' or '=' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b != BYTE_EQ) { reportPrologUnexpChar(true, decodeCharForError(b), " (expected '=' after 'encoding' in xml declaration)"); } } _state = STATE_XMLDECL_ENCODING_EQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_ENCODING_EQ: // "encoding=" if (!asyncSkipSpace()) { // skip space, if any break; } _elemAttrQuote = _nextByte(); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' in xml declaration for encoding value)"); } _state = STATE_XMLDECL_ENCODING_VALUE; { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseXmlDeclAttr(buf, 0)) { _state = STATE_XMLDECL_ENCODING_VALUE; break; } } verifyAndSetXmlEncoding(); _state = STATE_XMLDECL_AFTER_ENCODING_VALUE; break; case STATE_XMLDECL_ENCODING_VALUE: // parsing encoding value if 
(!parseXmlDeclAttr(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength())) { _state = STATE_XMLDECL_ENCODING_VALUE; break; } verifyAndSetXmlEncoding(); _state = STATE_XMLDECL_AFTER_ENCODING_VALUE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_ENCODING_VALUE: // encoding+value gotten; need space or '?' { byte b = _nextByte(); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_XMLDECL_BEFORE_STANDALONE; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after encoding value in xml declaration)"); } } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_BEFORE_STANDALONE: // after encoding+value+space; get '?' or 's' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if ((_tokenName = _parseNewXmlDeclName(b)) == null) { // incomplete _state = STATE_XMLDECL_STANDALONE; break; } if (!_tokenName.hasPrefixedName("standalone")) { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'standalone'"); } } _state = STATE_XMLDECL_AFTER_STANDALONE; continue main_loop; case STATE_XMLDECL_STANDALONE: // parsing "standalone" if ((_tokenName = _parseXmlDeclName()) == null) { // incomplete break; } if (!_tokenName.hasPrefixedName("standalone")) { reportInputProblem("Unexpected keyword 'encoding' in XML declaration: expected 'standalone'"); } _state = STATE_XMLDECL_AFTER_STANDALONE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_STANDALONE: // got "standalone"; must get ' ' or '=' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b != BYTE_EQ) { reportPrologUnexpChar(true, decodeCharForError(b), " (expected '=' after 'standalone' in xml declaration)"); } } _state = 
STATE_XMLDECL_STANDALONE_EQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_STANDALONE_EQ: // "standalone=" if (!asyncSkipSpace()) { // skip space, if any break; } _elemAttrQuote = _nextByte(); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' in xml declaration for standalone value)"); } { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseXmlDeclAttr(buf, 0)) { _state = STATE_XMLDECL_STANDALONE_VALUE; break; } } verifyAndSetXmlStandalone(); _state = STATE_XMLDECL_AFTER_STANDALONE_VALUE; continue main_loop; case STATE_XMLDECL_STANDALONE_VALUE: // encoding+value gotten; need space or '?' if (!parseXmlDeclAttr(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength())) { _state = STATE_XMLDECL_STANDALONE_VALUE; break; } verifyAndSetXmlStandalone(); _state = STATE_XMLDECL_AFTER_STANDALONE_VALUE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_STANDALONE_VALUE: // encoding+value gotten; need space or '?' if (!asyncSkipSpace()) { // skip space, if any break; } if (_nextByte() != BYTE_QMARK) { reportPrologUnexpChar(true, decodeCharForError(_prevByte()), " (expected '?>' to end xml declaration)"); } _state = STATE_XMLDECL_ENDQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_ENDQ: // Better clear up decoded name, to avoid later problems (would be taken as PI) _tokenName = null; _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; if (_nextByte() != BYTE_GT) { reportPrologUnexpChar(true, decodeCharForError(_prevByte()), " (expected '>' to end xml declaration)"); } // 03-Apr-2018, tatu: Finally! Done with XML declaration, we know the encoding for sure. _activateEncoding(); return START_DOCUMENT; default: throwInternal(); } } return EVENT_INCOMPLETE; } private int handleDTD() throws XMLStreamException { // First: left-over CRs? 
if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } } if (_state == STATE_DTD_INT_SUBSET) { if (handleDTDInternalSubset(false)) { // got it! _state = STATE_DTD_EXPECT_CLOSING_GT; } else { return EVENT_INCOMPLETE; } } main_loop: while (_inputPtr < _inputEnd) { switch (_state) { case STATE_DEFAULT: // seen 'D' _tokenName = parseNewName(BYTE_D); if (_tokenName == null) { _state = STATE_DTD_DOCTYPE; return EVENT_INCOMPLETE; } if (!"DOCTYPE".equals(_tokenName.getPrefixedName())) { reportPrologProblem(true, "expected 'DOCTYPE'"); } _state = STATE_DTD_AFTER_DOCTYPE; continue main_loop; case STATE_DTD_DOCTYPE: _tokenName = parsePName(); if (_tokenName == null) { _state = STATE_DTD_DOCTYPE; return EVENT_INCOMPLETE; } if (!"DOCTYPE".equals(_tokenName.getPrefixedName())) { reportPrologProblem(true, "expected 'DOCTYPE'"); } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_DTD_AFTER_DOCTYPE: { byte b = _nextByte(); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_ROOT_NAME; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after 'DOCTYPE')"); } } // fall through (ok to skip bounds checks, async-skip does it) case STATE_DTD_BEFORE_ROOT_NAME: if (!asyncSkipSpace()) { // not enough input break; } if ((_tokenName = parseNewName(_nextByte())) == null) { // incomplete _state = STATE_DTD_ROOT_NAME; break; } _state = STATE_DTD_ROOT_NAME; continue main_loop; case STATE_DTD_ROOT_NAME: if ((_tokenName = parsePName()) == null) { // incomplete break; } _state = STATE_DTD_AFTER_ROOT_NAME; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_DTD_AFTER_ROOT_NAME: { byte b = _nextByte(); if (b == BYTE_GT) { _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return DTD; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_IDS; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected 
space after root name in DOCTYPE declaration)"); } } // fall through (ok to skip bounds checks, async-skip does it) case STATE_DTD_BEFORE_IDS: if (!asyncSkipSpace()) { // not enough input break; } { byte b = _nextByte(); if (b == BYTE_GT) { _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return DTD; } PName name; if ((name = parseNewName(b)) == null) { _state = STATE_DTD_PUBLIC_OR_SYSTEM; break; } String str = name.getPrefixedName(); if ("PUBLIC".equals(str)) { _state = STATE_DTD_AFTER_PUBLIC; } else if ("SYSTEM".equals(str)) { _state = STATE_DTD_AFTER_SYSTEM; } else { reportPrologProblem(true, "unexpected token '"+str+"': expected either PUBLIC or SYSTEM"); } } continue main_loop; case STATE_DTD_PUBLIC_OR_SYSTEM: { PName name; if ((name = parsePName()) == null) { _state = STATE_DTD_PUBLIC_OR_SYSTEM; break; } String str = name.getPrefixedName(); if ("PUBLIC".equals(str)) { _state = STATE_DTD_AFTER_PUBLIC; } else if ("SYSTEM".equals(str)) { _state = STATE_DTD_AFTER_SYSTEM; } else { reportPrologProblem(true, "unexpected token '"+str+"': expected either PUBLIC or SYSTEM"); } } continue main_loop; case STATE_DTD_AFTER_PUBLIC: { byte b = _nextByte(); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_PUBLIC_ID; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after PUBLIC keyword)"); } } continue main_loop; case STATE_DTD_AFTER_SYSTEM: { byte b = _nextByte(); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_SYSTEM_ID; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after SYSTEM keyword)"); } } continue main_loop; case STATE_DTD_BEFORE_PUBLIC_ID: if (!asyncSkipSpace()) { break; } _elemAttrQuote = _nextByte(); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' for PUBLIC ID)"); } { char[] buf = 
_textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseDtdId(buf, 0, false)) { _state = STATE_DTD_PUBLIC_ID; break; } } verifyAndSetPublicId(); _state = STATE_DTD_AFTER_PUBLIC_ID; continue main_loop; case STATE_DTD_PUBLIC_ID: if (!parseDtdId(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength(), false)) { break; } verifyAndSetPublicId(); _state = STATE_DTD_AFTER_PUBLIC_ID; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_DTD_AFTER_PUBLIC_ID: { byte b = _nextByte(); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_SYSTEM_ID; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after PUBLIC ID)"); } } // fall through (ok to skip bounds checks, async-skip does it) case STATE_DTD_BEFORE_SYSTEM_ID: if (!asyncSkipSpace()) { break; } _elemAttrQuote = _nextByte(); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' for SYSTEM ID)"); } { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseDtdId(buf, 0, true)) { _state = STATE_DTD_SYSTEM_ID; break; } } verifyAndSetSystemId(); _state = STATE_DTD_AFTER_SYSTEM_ID; continue main_loop; case STATE_DTD_SYSTEM_ID: if (!parseDtdId(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength(), true)) { break; } verifyAndSetSystemId(); _state = STATE_DTD_AFTER_SYSTEM_ID; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_DTD_AFTER_SYSTEM_ID: if (!asyncSkipSpace()) { break; } { byte b = _nextByte(); if (b == BYTE_GT) { _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return DTD; } if (b != BYTE_LBRACKET) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected either '[' for internal subset, or '>' to end DOCTYPE)"); } } _state = STATE_DTD_INT_SUBSET; if (handleDTDInternalSubset(true)) { _state = STATE_DTD_EXPECT_CLOSING_GT; } else { return 
EVENT_INCOMPLETE; } // fall through case STATE_DTD_EXPECT_CLOSING_GT: if (!asyncSkipSpace()) { break; } { byte b = _nextByte(); if (b != BYTE_GT) { reportPrologUnexpChar(true, b, "expected '>' to end DTD"); } } _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return DTD; default: throwInternal(); } } return _currToken; } private final boolean parseDtdId(char[] outputBuffer, int outputPtr, boolean system) throws XMLStreamException { final int quote = (int) _elemAttrQuote; while (_inputPtr < _inputEnd) { int ch = _nextByte() & 0xFF; if (ch == quote) { _textBuilder.setCurrentLength(outputPtr); return true; } if (!system && !validPublicIdChar(ch)) { reportPrologUnexpChar(true, decodeCharForError((byte) ch), " (not valid in " + (system ? "SYSTEM" : "PUBLIC") + " ID)"); } if (outputPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outputPtr = 0; } outputBuffer[outputPtr++] = (char) ch; } _textBuilder.setCurrentLength(outputPtr); return false; } // // // NOTE: specialized versions of `parsePName`, `parseNewName`, to be // // // used in decoding `xml` and pseudo-attributes of XML declaration // // // Tricky part here is that it predates possible encoding declaration // // // so it is essentially part of bootstrapping private final PName _parseNewXmlDeclName(byte b) throws XMLStreamException { int q = b & 0xFF; if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode throwUnexpectedChar(q, "; expected a name start character"); } _quadCount = 0; _currQuad = q; _currQuadBytes = 1; return _parseXmlDeclName(); } private final PName _parseXmlDeclName() throws XMLStreamException { int q = _currQuad; while (true) { int i; switch (_currQuadBytes) { case 0: if (_inputPtr >= _inputEnd) { return null; // all pointers have been set } q = _nextByte() & 0xFF; // Since name char validity is checked later on, only do quickie lookup if (q < 65) { // 'A' if (q < 45 || q > 58 || q == 47) { return 
_findXmlDeclName(q, 0); } } // fall through case 1: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 1; return null; } i = _nextByte() & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return _findXmlDeclName(q, 1); } } q = (q << 8) | i; // fall through case 2: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 2; return null; } i = _nextByte() & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return _findXmlDeclName(q, 2); } } q = (q << 8) | i; // fall through case 3: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 3; return null; } i = _nextByte() & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return _findXmlDeclName(q, 3); } } q = (q << 8) | i; } // If we get this far, need to add full quad into result array and update state if (_quadCount == 0) { // first quad _quadBuffer[0] = q; _quadCount = 1; } else { if (_quadCount >= _quadBuffer.length) { // let's just double? _quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[_quadCount++] = q; } _currQuadBytes = 0; } } protected final PName _findXmlDeclName(int lastQuad, int lastByteCount) throws XMLStreamException { int qlen = _quadCount; // Also: if last quad is empty, will need take last from qbuf. 
if (lastByteCount == 0) { lastQuad = _quadBuffer[--qlen]; // NOTE: do not change since we may need to delegate with original value, // and byte count not checked here // lastByteCount = 4; } // First things first: we are very likely to find one of short pseudo-attributes, so: PName pname; switch (qlen) { case 0: // 4-bytes or less; only has 'lastQuad' defined pname = AsyncXmlDeclHelper.find(lastQuad); break; case 1: pname = AsyncXmlDeclHelper.find(_quadBuffer[0], lastQuad); break; case 2: pname = AsyncXmlDeclHelper.find(_quadBuffer[0], _quadBuffer[1], lastQuad); break; default: pname = null; } if (pname != null) { // Need to push back the byte read but not used: --_inputPtr; return pname; } // Otherwise most likely a processing instruction instead of XML declaration. A few // ways we could deal with it, but for now let's finalize symbol table etc, delegate _activateEncoding(); return findPName(lastQuad, lastByteCount); }
Method called to try to parse an XML pseudo-attribute value. This is relatively simple, since we can't have linefeeds or entities; and although there are exact rules for what is allowed, we can do coarse parsing and only later on verify validity (for encoding could do stricter parsing in future?)

NOTE: pseudo-attribute values required to be 7-bit ASCII so can do crude cast.

Returns:True if we managed to parse the whole pseudo-attribute
/** * Method called to try to parse an XML pseudo-attribute value. This is relatively * simple, since we can't have linefeeds or entities; and although there are exact * rules for what is allowed, we can do coarse parsing and only later on verify * validity (for encoding could do stricter parsing in future?) *<p> * NOTE: pseudo-attribute values required to be 7-bit ASCII so can do crude cast. * * @return True if we managed to parse the whole pseudo-attribute */
protected boolean parseXmlDeclAttr(char[] outputBuffer, int outputPtr) throws XMLStreamException { final int quote = (int) _elemAttrQuote; while (_inputPtr < _inputEnd) { int ch = _nextByte() & 0xFF; if (ch == quote) { _textBuilder.setCurrentLength(outputPtr); return true; } // this is not exact check; but does work for all legal (valid) characters: if (ch <= INT_SPACE || ch > INT_z) { reportPrologUnexpChar(true, decodeCharForError((byte) ch), " (not valid in XML pseudo-attribute values)"); } if (outputPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outputPtr = 0; } outputBuffer[outputPtr++] = (char) ch; } _textBuilder.setCurrentLength(outputPtr); return false; } }