com.fasterxml.woodstox/woodstox-core/6.2.3 : com/ctc/wstx/io/StreamBootstrapper.java

StreamBootstrapper
https://github.com/FasterXML/woodstox: Woodstox is a high-performance XML processor that implements Stax (JSR-173), SAX2 and Stax2 APIs (FasterXML)
The Apache License, Version 2.0
Tatu Saloranta
package com.ctc.wstx.io;

import java.io.*;

import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;

import com.ctc.wstx.api.ReaderConfig;
import com.ctc.wstx.cfg.ParsingErrorMsgs;
import com.ctc.wstx.cfg.XmlConsts;
import com.ctc.wstx.exc.*;

Input bootstrap class used with streams, when encoding is not known
(when encoding is specified by application, a reader is constructed,
and then reader-based bootstrapper is used).

Encoding used for an entity (including
main document entity) is determined using algorithms suggested in
XML 1.0#3 spec, appendix F
/**
 * Input bootstrap class used with streams, when encoding is not known
 * (when encoding is specified by application, a reader is constructed,
 * and then reader-based bootstrapper is used).
 *<p>
 * Encoding used for an entity (including
 * main document entity) is determined using algorithms suggested in
 * XML 1.0#3 spec, appendix F
 */
public final class StreamBootstrapper
    extends InputBootstrapper
{
    Let's size buffer at least big enough to contain the longest possible
prefix of a document needed to positively identify it starts with
the XML declaration. That means having (optional) BOM, and then first
6 characters ("/**
     * Let's size buffer at least big enough to contain the longest possible
     * prefix of a document needed to positively identify it starts with
     * the XML declaration. That means having (optional) BOM, and then first
     * 6 characters ("<?xml "), in whatever encoding. With 4-byte encodings
     * (UCS-4), that comes to 28 bytes. And for good measure, let's pad
     * that a bit as well....
     */
    final static int MIN_BUF_SIZE = 128;

    /*
    ////////////////////////////////////////
    // Configuration
    ////////////////////////////////////////
    */

    Underlying InputStream to use for reading content. May be null
if the actual data source is not stream-based but a block source.
/**
     * Underlying InputStream to use for reading content. May be null
     * if the actual data source is not stream-based but a block source.
     */
    final InputStream mIn;

    /*
    ///////////////////////////////////////////////////////////////
    // Input buffering
    ///////////////////////////////////////////////////////////////
    */

    private byte[] mByteBuffer;

    Whether byte buffer is recyclable or not
/**
     * Whether byte buffer is recyclable or not
     */
    private final boolean mRecycleBuffer;

    private int mInputPtr;
    private int mInputEnd;
    /*
    ///////////////////////////////////////////////////////////////
    // Physical encoding properties found so far
    ///////////////////////////////////////////////////////////////
    */

    boolean mBigEndian = true;

    boolean mHadBOM = false;

    boolean mByteSizeFound = false;

    For most encodings, number of physical characters needed for
decoding xml declaration characters (which for variable length
encodings like UTF-8 will be 1). Exception is EBCDIC, which
while a single-byte encoding, is denoted by -1 since it
needs an additional translation lookup.
/**
     * For most encodings, number of physical characters needed for
     * decoding xml declaration characters (which for variable length
     * encodings like UTF-8 will be 1). Exception is EBCDIC, which
     * while a single-byte encoding, is denoted by -1 since it
     * needs an additional translation lookup.
     */
    int mBytesPerChar; // minimum, ie. 1 for UTF-8

    Special case for 1-byte encodings: EBCDIC is problematic
as it's not 7-bit ascii compatible. We can deal with it,
still, but only with bit of extra state.
/**
     * Special case for 1-byte encodings: EBCDIC is problematic
     * as it's not 7-bit ascii compatible. We can deal with it,
     * still, but only with bit of extra state.
     */
    boolean mEBCDIC = false;

    String mInputEncoding = null;

    For single-byte non-ascii-compatible encodings (ok ok, really
just EBCDIC), we'll have to use a lookup table.
/**
     * For single-byte non-ascii-compatible encodings (ok ok, really
     * just EBCDIC), we'll have to use a lookup table.
     */
    int[] mSingleByteTranslation = null;

    /*
    ////////////////////////////////////////
    // Life-cycle
    ////////////////////////////////////////
    */

    private StreamBootstrapper(String pubId, SystemId sysId, InputStream in)
    {
        super(pubId, sysId);
        mIn = in;
        mInputPtr = mInputEnd = 0;
        mRecycleBuffer = true;
    }

    Params: start – Pointer to the first valid byte in the buffer
end – Pointer to the offset after last valid byte in the buffer/**
     * @param start Pointer to the first valid byte in the buffer
     * @param end Pointer to the offset <b>after</b> last valid byte in the buffer
     */
    private StreamBootstrapper(String pubId, SystemId sysId, byte[] data, int start, int end)
    {
        super(pubId, sysId);
        mIn = null;
        mRecycleBuffer = false;
        mByteBuffer = data;
        mInputPtr = start;
        mInputEnd = end;
    }

    /*
    ////////////////////////////////////////
    // Public API
    ////////////////////////////////////////
    */

    Factory method used when the underlying data provider is an 
actual stream.
/**
     * Factory method used when the underlying data provider is an 
     * actual stream.
     */
    public static StreamBootstrapper getInstance(String pubId, SystemId sysId, InputStream in)
    {
        return new StreamBootstrapper(pubId, sysId, in);
    }

    Factory method used when the underlying data provider is a pre-allocated
block source, and no stream is used.
Additionally the buffer passed is not owned by the bootstrapper
or Reader that is created, so it is not to be recycled.
/**
     * Factory method used when the underlying data provider is a pre-allocated
     * block source, and no stream is used.
     * Additionally the buffer passed is not owned by the bootstrapper
     * or Reader that is created, so it is not to be recycled.
     */
    public static StreamBootstrapper getInstance(String pubId, SystemId sysId, byte[] data, int start, int end)
    {
        return new StreamBootstrapper(pubId, sysId, data, start, end);
    }

    @SuppressWarnings("resource")
    @Override
    public Reader bootstrapInput(ReaderConfig cfg, boolean mainDoc, int xmlVersion)
        throws IOException, XMLStreamException
    {
        String normEnc = null;

        // First, let's get the buffers...
        int bufSize = cfg.getInputBufferLength();
        if (bufSize < MIN_BUF_SIZE) {
            bufSize = MIN_BUF_SIZE;
        }
        if (mByteBuffer == null) { // non-null if we were passed a buffer
            mByteBuffer = cfg.allocFullBBuffer(bufSize);
        }

        resolveStreamEncoding();

        if (hasXmlDecl()) {
            // note: readXmlDecl will set mXml11Handling too
            readXmlDecl(mainDoc, xmlVersion);
            if (mFoundEncoding != null) {
                normEnc = verifyXmlEncoding(mFoundEncoding);
            }
        } else {
            /* We'll actually then just inherit whatever main doc had...
             * (or in case there was no parent, just copy the 'unknown')
             */
            mXml11Handling = (XmlConsts.XML_V_11 == xmlVersion);
        }

        // Now, have we figured out the encoding?

        if (normEnc == null) { // not via xml declaration
            /* 21-Sep-2007, TSa: As with any non-UTF-8 encoding, declaration
             * isn't optional any more. Besides, we need that information
             * anyway to know which variant it is.
             */
            if (mEBCDIC) {
                if (mFoundEncoding == null || mFoundEncoding.length() == 0) {
                    reportXmlProblem("Missing encoding declaration: underlying encoding looks like an EBCDIC variant, but no xml encoding declaration found");
                }
                // Hmmh. What should be the canonical name? Let's just use found encoding?
                normEnc = mFoundEncoding;
            } else if (mBytesPerChar == 2) { // UTF-16, BE/LE
                normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
            } else if (mBytesPerChar == 4) { // UCS-4... ?
                /* 22-Mar-2005, TSa: JDK apparently has no way of dealing
                 *   with these encodings... not sure if and how it should
                 *   be dealt with, really. Name could be UCS-4xx... or
                 *   perhaps UTF-32xx
                 */
                normEnc = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
            } else {
                // Ok, default has to be UTF-8, as per XML specs
                normEnc = CharsetNames.CS_UTF8;
            }
        }

        mInputEncoding = normEnc;

        /* And then the reader. Let's figure out if we can use our own fast
         * implementations first:
         */
        BaseReader r;

        // Normalized, can thus use straight equality checks now
        if (normEnc == CharsetNames.CS_UTF8) {
            r = new UTF8Reader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer);
        } else if (normEnc == CharsetNames.CS_ISO_LATIN1) {
            r = new ISOLatinReader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer);
        } else if (normEnc == CharsetNames.CS_US_ASCII) {
            r = new AsciiReader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd, mRecycleBuffer);
        } else if (normEnc.startsWith(CharsetNames.CS_UTF32)) {
            // let's augment with actual endianness info
            if (normEnc == CharsetNames.CS_UTF32) {
                mInputEncoding = mBigEndian ? CharsetNames.CS_UTF32BE : CharsetNames.CS_UTF32LE;
            }
            r = new UTF32Reader(cfg, mIn, mByteBuffer, mInputPtr, mInputEnd,
				mRecycleBuffer, mBigEndian);
        } else {
            // Nah, JDK needs to try it
            // Ok; first, do we need to merge stuff back?
            InputStream in = mIn;
            if (mInputPtr < mInputEnd) {
                // 16-Oct-2020, tatu: But we may or may not have InputStream to merge
                //    as per [woodstox-core#117]
                if (in == null) {
                    in = new ByteArrayInputStream(mByteBuffer, mInputPtr, mInputEnd - mInputPtr);
                } else {
                    in = new MergedStream(cfg, in, mByteBuffer, mInputPtr, mInputEnd);
                }
            }
            // 20-Jan-2006, TSa: Ok; although it is possible to declare
            //   stream as 'UTF-16', JDK may need help in figuring out
            //   the right order, so let's be explicit:
            if (normEnc == CharsetNames.CS_UTF16) {
                mInputEncoding = normEnc = mBigEndian ? CharsetNames.CS_UTF16BE : CharsetNames.CS_UTF16LE;
            }
            try {
                return new InputStreamReader(in, normEnc);
            } catch (UnsupportedEncodingException usex) {
                throw new WstxIOException("Unsupported encoding: "+usex.getMessage());
            }
        }

        if (mXml11Handling) {
            r.setXmlCompliancy(XmlConsts.XML_V_11);
        }

        return r;
    }
    
    Since this class only gets used when encoding is not explicitly
passed, need use the encoding that was auto-detected...
/**
     * Since this class only gets used when encoding is not explicitly
     * passed, need use the encoding that was auto-detected...
     */
    @Override
    public String getInputEncoding() {
        return mInputEncoding;
    }

    @Override
    public int getInputTotal() {
        int total = mInputProcessed + mInputPtr;
        if (mBytesPerChar > 1) {
            total /= mBytesPerChar;
        }
        return total;
    }

    @Override
    public int getInputColumn() {
        int col = mInputPtr - mInputRowStart;
        if (mBytesPerChar > 1) {
            col /= mBytesPerChar;
        }
        return col;
    }

    /*
    ////////////////////////////////////////
    // Internal methods, parsing
    ////////////////////////////////////////
    */

    Method called to try to figure out physical encoding the underlying
input stream uses.
/**
     * Method called to try to figure out physical encoding the underlying
     * input stream uses.
     */
    protected void resolveStreamEncoding()
        throws IOException, WstxException
    {
        // Let's first set defaults:
        mBytesPerChar = 0;
        mBigEndian = true;

        /* Ok; first just need 4 bytes for determining bytes-per-char from
         * BOM or first char(s) of likely xml declaration:
         */
        if (ensureLoaded(4)) {
            bomblock:
            do { // BOM/auto-detection block
                int quartet = (mByteBuffer[0] << 24)
                    | ((mByteBuffer[1] & 0xFF) << 16)
                    | ((mByteBuffer[2] & 0xFF) << 8)
                    | (mByteBuffer[3] & 0xFF);

                /* Handling of (usually) optional BOM (required for
                 * multi-byte formats); first 32-bit charsets:
                 */
                switch (quartet) {
                case 0x0000FEFF:
                    mBigEndian = true;
                    mInputPtr = mBytesPerChar = 4;
                    break bomblock;
                case 0xFFFE0000: // UCS-4, LE?
                    mInputPtr = mBytesPerChar = 4;
                    mBigEndian = false;
                    break bomblock;
                case 0x0000FFFE: // UCS-4, in-order...
                    reportWeirdUCS4("2143");
                    break bomblock;
                case 0x0FEFF0000: // UCS-4, in-order...
                    reportWeirdUCS4("3412");
                    break bomblock;
                }

                // Ok, if not, how about 16-bit encoding BOMs?
                int msw = quartet >>> 16;
                if (msw == 0xFEFF) { // UTF-16, BE
                    mInputPtr = mBytesPerChar = 2;
                    mBigEndian = true;
                    break;
                }
                if (msw == 0xFFFE) { // UTF-16, LE
                    mInputPtr = mBytesPerChar = 2;
                    mBigEndian = false;
                    break;
                }

                // And if not, then UTF-8 BOM?
                if ((quartet >>> 8) == 0xEFBBBF) { // UTF-8
                    mInputPtr = 3;
                    mBytesPerChar = 1;
                    mBigEndian = true; // doesn't really matter
                    break;
                }

                /* And if that wasn't succesful, how about auto-detection
                 * for '<?xm' (or subset for multi-byte encodings) marker?
                 */
                // Note: none of these consume bytes... so ptr remains at 0

                switch (quartet) {
                case 0x0000003c: // UCS-4, BE?
                    mBigEndian = true;
                    mBytesPerChar = 4;
                    break bomblock;
                case 0x3c000000: // UCS-4, LE?
                    mBytesPerChar = 4;
                    mBigEndian = false;
                    break bomblock;
                case 0x00003c00: // UCS-4, in-order...
                    reportWeirdUCS4("2143");
                    break bomblock;
                case 0x003c0000: // UCS-4, in-order...
                    reportWeirdUCS4("3412");
                    break bomblock;
                case 0x003c003f: // UTF-16, BE
                    mBytesPerChar = 2;
                    mBigEndian = true;
                    break bomblock;
                case 0x3c003f00: // UTF-16, LE
                    mBytesPerChar = 2;
                    mBigEndian = false;
                    break bomblock;
                case 0x3c3f786d: // UTF-8, Ascii, ISO-Latin
                    mBytesPerChar = 1;
                    mBigEndian = true; // doesn't really matter
                    break bomblock;

                case 0x4c6fa794:
                    mBytesPerChar = -1;
                    mEBCDIC = true;

                    /* For xml declaration handling we can basically
                     * use any of EBCDIC variants, since declaration
                     * must not contain control or punctuation characters
                     * that would differ
                     */
                    mSingleByteTranslation = EBCDICCodec.getCp037Mapping();
                    break bomblock;
                }
                
                /* Otherwise it's either single-byte doc without xml
                 * declaration, or corrupt input...
                 */
            } while (false); // BOM/auto-detection block
            
            mHadBOM = (mInputPtr > 0);

            // Let's update location markers to ignore BOM.
            mInputProcessed = -mInputPtr;
            mInputRowStart = mInputPtr;
        }

        /* Hmmh. If we haven't figured it out, let's just assume
         * UTF-8 as per XML specs:
         */
        mByteSizeFound = (mBytesPerChar != 0);
        if (!mByteSizeFound) {
            mBytesPerChar = 1;
            mBigEndian = true; // doesn't matter
        }
    }

    Returns: Normalized encoding name/**
     * @return Normalized encoding name
     */
    protected String verifyXmlEncoding(String enc)
        throws WstxException
    {
        enc = CharsetNames.normalize(enc);

        // Let's actually verify we got matching information:
        if (enc == CharsetNames.CS_UTF8) {
            verifyEncoding(enc, 1);
        } else if (enc == CharsetNames.CS_ISO_LATIN1) {
            verifyEncoding(enc, 1);
        } else if (enc == CharsetNames.CS_US_ASCII) {
            verifyEncoding(enc, 1);
        } else if (enc == CharsetNames.CS_UTF16) {
            // BOM is obligatory, to know the ordering
            /* 22-Mar-2005, TSa: Actually, since we don't have a
             *   custom decoder, so the underlying JDK Reader may
             *   have dealt with it transparently... so we can not
             *   really throw an exception here.
             */
            //if (!mHadBOM) {
            //reportMissingBOM(enc);
            //}
            verifyEncoding(enc, 2);
        } else if (enc == CharsetNames.CS_UTF16LE) {
            verifyEncoding(enc, 2, false);
        } else if (enc == CharsetNames.CS_UTF16BE) {
            verifyEncoding(enc, 2, true);

        } else if (enc == CharsetNames.CS_UTF32) {
            // Do we require a BOM here? we can live without it...
            //if (!mHadBOM) {
            //    reportMissingBOM(enc);
            //}
            verifyEncoding(enc, 4);
        } else if (enc == CharsetNames.CS_UTF32LE) {
            verifyEncoding(enc, 4, false);
        } else if (enc == CharsetNames.CS_UTF32BE) {
            verifyEncoding(enc, 4, true);
        }
        return enc;
    }

    /*
    /////////////////////////////////////////////////////
    // Internal methods, loading input data
    /////////////////////////////////////////////////////
    */

    protected boolean ensureLoaded(int minimum)
        throws IOException
    {
        // Let's assume here buffer has enough room -- this will always
        // be true for the limited used this method gets
        int gotten = (mInputEnd - mInputPtr);
        while (gotten < minimum) {
            int count = (mIn == null) ? -1 : mIn.read(mByteBuffer, mInputEnd, mByteBuffer.length - mInputEnd);
            if (count < 1) {
                return false;
            }
            mInputEnd += count;
            gotten += count;
        }
        return true;
    }

    protected void loadMore()
        throws IOException, WstxException
    {
        /* Need to make sure offsets are properly updated for error
         * reporting purposes, and do this now while previous amounts
         * are still known.
         */
        /* Note: at this point these are all in bytes, not chars (for multibyte
         * encodings)
         */
        mInputProcessed += mInputEnd;
        mInputRowStart -= mInputEnd;

        mInputPtr = 0;
        mInputEnd = (mIn == null) ? -1 : mIn.read(mByteBuffer, 0, mByteBuffer.length);
        if (mInputEnd < 1) {
            throw new WstxEOFException(ParsingErrorMsgs.SUFFIX_IN_XML_DECL,
                                       getLocation());
        }
    }

    /*
    /////////////////////////////////////////////////////
    // Implementations of abstract parsing methods
    /////////////////////////////////////////////////////
    */

    @Override
    protected void pushback() {
        if (mBytesPerChar < 0) {
            mInputPtr += mBytesPerChar;
        } else {
            mInputPtr -= mBytesPerChar;
        }
    }

    @Override
    protected int getNext()
        throws IOException, WstxException
    {
        if (mBytesPerChar != 1) {
            if (mBytesPerChar == -1) { // need to translate
                return nextTranslated();
            }
            return nextMultiByte();
        }
        byte b = (mInputPtr < mInputEnd) ?
            mByteBuffer[mInputPtr++] : nextByte();
        return (b & 0xFF);
    }

    @Override
    protected int getNextAfterWs(boolean reqWs)
        throws IOException, WstxException
    {
        int count;

        if (mBytesPerChar == 1) { // single byte
            count = skipSbWs();
        } else {
            if (mBytesPerChar == -1) { // translated
                count = skipTranslatedWs();
            } else { // multi byte
                count = skipMbWs();
            }
        }

        if (reqWs && count == 0) {
            reportUnexpectedChar(getNext(), ERR_XMLDECL_EXP_SPACE);
        }

        // inlined getNext()
        if (mBytesPerChar != 1) {
            if (mBytesPerChar == -1) { // translated
                return nextTranslated();
            }
            return nextMultiByte();
        }
        byte b = (mInputPtr < mInputEnd) ?
            mByteBuffer[mInputPtr++] : nextByte();
        return (b & 0xFF);
    }

    Returns: First character that does not match expected, if any;
   CHAR_NULL if match succeeded/**
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     */
    @Override
    protected int checkKeyword(String exp)
        throws IOException, WstxException
    {
        if (mBytesPerChar != 1) {
            if (mBytesPerChar == -1) {
                return checkTranslatedKeyword(exp);
            }
            return checkMbKeyword(exp);
        }
        return checkSbKeyword(exp);
    }

    @Override
    protected int readQuotedValue(char[] kw, int quoteChar)
        throws IOException, WstxException
    {
        int i = 0;
        int len = kw.length;
        boolean simple = (mBytesPerChar == 1);
        boolean mb = !simple && (mBytesPerChar > 1);

        while (i < len) {
            int c;

            if (simple) {
                byte b = (mInputPtr < mInputEnd) ?
                    mByteBuffer[mInputPtr++] : nextByte();
                if (b == BYTE_NULL) {
                    reportNull();
                }
                if (b == BYTE_CR || b == BYTE_LF) {
                    skipSbLF(b);
                    b = BYTE_LF;
                }
                c = (b & 0xFF);
            } else {
                if (mb) {
                    c = nextMultiByte();
                    if (c ==  CHAR_CR || c == CHAR_LF) {
                        skipMbLF(c);
                        c = CHAR_LF;
                    }
                } else {
                    c = nextTranslated();
                    if (c ==  CHAR_CR || c == CHAR_LF) {
                        skipTranslatedLF(c);
                        c = CHAR_LF;
                    }
                }
            }

            if (c == quoteChar) {
                return (i < len) ? i : -1;
            }

            if (i < len) {
                kw[i++] = (char) c;
            }
        }
        
        /* If we end up this far, we ran out of buffer space... let's let
         * caller figure that out, though
         */
        return -1;
    }

    protected boolean hasXmlDecl()
        throws IOException, WstxException
    {
        /* Separate handling for common and fast case; 1/variable byte
         * encodings that have ASCII subset:
         */
        if (mBytesPerChar == 1) {
            /* However... there has to be at least 6 bytes available; and if
             * so, can check the 'signature' easily:
             */
            if (ensureLoaded(6)) {
                if (mByteBuffer[mInputPtr] == '<'
                    && mByteBuffer[mInputPtr+1] == '?'
                    && mByteBuffer[mInputPtr+2] == 'x'
                    && mByteBuffer[mInputPtr+3] == 'm'
                    && mByteBuffer[mInputPtr+4] == 'l'
                    && ((mByteBuffer[mInputPtr+5] & 0xFF) <= CHAR_SPACE)) {

                    // Let's skip stuff so far:
                    mInputPtr += 6;
                    return true;
                }
            }
        } else if (mBytesPerChar == -1) { // translated (EBCDIC)
            if (ensureLoaded(6)) {
                int start = mInputPtr; // if we have to 'unread' chars
                if (nextTranslated() == '<'
                    && nextTranslated() == '?'
                    && nextTranslated() == 'x'
                    && nextTranslated() == 'm'
                    && nextTranslated() == 'l'
                    && nextTranslated() <= CHAR_SPACE) {
                    return true;
                }
                mInputPtr = start; // push data back
            }
        } else {
            // ... and then for slower fixed-multibyte encodings:

            // Is there enough data for checks?
            if (ensureLoaded (6 * mBytesPerChar)) {
                int start = mInputPtr; // if we have to 'unread' chars
                if (nextMultiByte() == '<'
                    && nextMultiByte() == '?'
                    && nextMultiByte() == 'x'
                    && nextMultiByte() == 'm'
                    && nextMultiByte() == 'l'
                    && nextMultiByte() <= CHAR_SPACE) {
                    return true;
                }
                mInputPtr = start; // push data back
            }
        }

        return false;
    }

    @Override
    protected Location getLocation()
    {
        /* Ok; for fixed-size multi-byte encodings, need to divide numbers
         * to get character locations. For variable-length encodings the
         * good thing is that xml declaration only uses shortest codepoints,
         * ie. char count == byte count.
         */
        int total = mInputProcessed + mInputPtr;
        int col = mInputPtr - mInputRowStart;

        if (mBytesPerChar > 1) {
            total /= mBytesPerChar;
            col /= mBytesPerChar;
        }

        return new WstxInputLocation(null, mPublicId, mSystemId,
                                     total - 1, // 0-based
                                     mInputRow, col);
    }

    /*
    /////////////////////////////////////////////////////
    // Internal methods, single-byte access methods
    /////////////////////////////////////////////////////
    */

    protected byte nextByte()
        throws IOException, WstxException
    {
        if (mInputPtr >= mInputEnd) {
            loadMore();
        }
        return mByteBuffer[mInputPtr++];
    }

    protected int skipSbWs()
        throws IOException, WstxException
    {
        int count = 0;

        while (true) {
            byte b = (mInputPtr < mInputEnd) ?
                mByteBuffer[mInputPtr++] : nextByte();

            if ((b & 0xFF) > CHAR_SPACE) {
                --mInputPtr;
                break;
            }
            if (b == BYTE_CR || b == BYTE_LF) {
                skipSbLF(b);
            } else if (b == BYTE_NULL) {
                reportNull();
            }
            ++count;
        }
        return count;
    }

    protected void skipSbLF(byte lfByte)
        throws IOException, WstxException
    {
        if (lfByte == BYTE_CR) {
            byte b = (mInputPtr < mInputEnd) ?
                mByteBuffer[mInputPtr++] : nextByte();
            if (b != BYTE_LF) {
                --mInputPtr; // pushback if not 2-char/byte lf
            }
        }
        ++mInputRow;
        mInputRowStart = mInputPtr;
    }

    Returns: First character that does not match expected, if any;
   CHAR_NULL if match succeeded/**
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     */
    protected int checkSbKeyword(String expected)
        throws IOException, WstxException
    {
        int len = expected.length();
        
        for (int ptr = 1; ptr < len; ++ptr) {
            byte b = (mInputPtr < mInputEnd) ?
                mByteBuffer[mInputPtr++] : nextByte();
            
            if (b == BYTE_NULL) {
                reportNull();
            }
            if ((b & 0xFF) != expected.charAt(ptr)) {
                return (b & 0xFF);
            }
        }

        return CHAR_NULL;
    }

    /*
    /////////////////////////////////////////////////////
    // Internal methods, multi-byte/translated access/checks
    /////////////////////////////////////////////////////
    */

    protected int nextMultiByte()
        throws IOException, WstxException
    {
        byte b = (mInputPtr < mInputEnd) ?
            mByteBuffer[mInputPtr++] : nextByte();
        byte b2 = (mInputPtr < mInputEnd) ?
            mByteBuffer[mInputPtr++] : nextByte();
        int c;

        if (mBytesPerChar == 2) {
            if (mBigEndian) {
                c = ((b & 0xFF) << 8) | (b2 & 0xFF);
            } else {
                c = (b & 0xFF) | ((b2 & 0xFF) << 8);
            }
        } else {
            // Has to be 4 bytes
            byte b3 = (mInputPtr < mInputEnd) ?
                mByteBuffer[mInputPtr++] : nextByte();
            byte b4 = (mInputPtr < mInputEnd) ?
                mByteBuffer[mInputPtr++] : nextByte();
            
            if (mBigEndian) {
                c = (b  << 24) | ((b2 & 0xFF) << 16)
                    | ((b3 & 0xFF) << 8) | (b4 & 0xFF);
            } else {
                c = (b4  << 24) | ((b3 & 0xFF) << 16)
                    | ((b2 & 0xFF) << 8) | (b & 0xFF);
            }
        }

        // Let's catch null chars early
        if (c == 0) {
            reportNull();
        }
        return c;
    }

    protected int nextTranslated()
        throws IOException, WstxException
    {
        byte b = (mInputPtr < mInputEnd) ?
            mByteBuffer[mInputPtr++] : nextByte();
        int ch = mSingleByteTranslation[b & 0xFF];
        if (ch < 0) { // special char... won't care for now
            ch = -ch;
        }
        return ch;
    }

    protected int skipMbWs()
        throws IOException, WstxException
    {
        int count = 0;

        while (true) {
            int c = nextMultiByte();

            if (c > CHAR_SPACE) {
                mInputPtr -= mBytesPerChar;
                break;
            }
            if (c == CHAR_CR || c == CHAR_LF) {
                skipMbLF(c);
            } else if (c == CHAR_NULL) {
                reportNull();
            }
            ++count;
        }
        return count;
    }

    protected int skipTranslatedWs()
        throws IOException, WstxException
    {
        int count = 0;

        while (true) {
            int c = nextTranslated();

            // Hmmh. Are we to accept NEL (0x85)?
            if (c > CHAR_SPACE && c != CHAR_NEL) {
                --mInputPtr;
                break;
            }
            if (c == CHAR_CR || c == CHAR_LF) {
                skipTranslatedLF(c);
            } else if (c == CHAR_NULL) {
                reportNull();
            }
            ++count;
        }
        return count;
    }

    protected void skipMbLF(int lf)
        throws IOException, WstxException
    {
        if (lf == CHAR_CR) {
            int c = nextMultiByte();
            if (c != CHAR_LF) {
                mInputPtr -= mBytesPerChar;
            }
        }
        ++mInputRow;
        mInputRowStart = mInputPtr;
    }

    protected void skipTranslatedLF(int lf)
        throws IOException, WstxException
    {
        if (lf == CHAR_CR) {
            int c = nextTranslated();
            if (c != CHAR_LF) {
                mInputPtr -= 1;
            }
        }
        ++mInputRow;
        mInputRowStart = mInputPtr;
    }

    Returns: First character that does not match expected, if any;
   CHAR_NULL if match succeeded/**
     * @return First character that does not match expected, if any;
     *    CHAR_NULL if match succeeded
     */
    protected int checkMbKeyword(String expected)
        throws IOException, WstxException
    {
        int len = expected.length();
        
        for (int ptr = 1; ptr < len; ++ptr) {
            int c = nextMultiByte();
            if (c == BYTE_NULL) {
                reportNull();
            }
            if (c != expected.charAt(ptr)) {
              return c;
            }
        }

        return CHAR_NULL;
    }

    protected int checkTranslatedKeyword(String expected)
        throws IOException, WstxException
    {
        int len = expected.length();
        
        for (int ptr = 1; ptr < len; ++ptr) {
            int c = nextTranslated();
            if (c == BYTE_NULL) {
                reportNull();
            }
            if (c != expected.charAt(ptr)) {
              return c;
            }
        }

        return CHAR_NULL;
    }

    /*
    ////////////////////////////////////////
    // Other private methods:
    ////////////////////////////////////////
    */

    private void verifyEncoding(String id, int bpc)
        throws WstxException
    {
        if (mByteSizeFound) {
            /* Let's verify that if we matched an encoding, it's the same
             * as what was declared...
             */
            if (bpc != mBytesPerChar) {
                // [WSTX-138]: Needs to detect EBCDIC discrepancy
                if (mEBCDIC) {
                    reportXmlProblem("Declared encoding '"+id+"' incompatible with auto-detected physical encoding (EBCDIC variant), can not decode input since actual code page not known");
                }
                reportXmlProblem("Declared encoding '"+id+"' uses "+bpc
                                 +" bytes per character; but physical encoding appeared to use "+mBytesPerChar+"; cannot decode");
            }
        }
    }

    private void verifyEncoding(String id, int bpc, boolean bigEndian)
        throws WstxException
    {
        if (mByteSizeFound) {
            verifyEncoding(id, bpc);

            if (bigEndian != mBigEndian) {
                String bigStr = bigEndian ? "big" : "little";
                reportXmlProblem
                    ("Declared encoding '"+id+"' has different endianness ("
                     +bigStr+" endian) than what physical ordering appeared to be; cannot decode");
            }
        }
    }

    private void reportWeirdUCS4(String type)
        throws IOException
    {
        throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected");
    }

    /*
    private void reportMissingBOM(String enc)
        throws WstxException
    {
        throw new WstxException("Missing BOM for encoding '"+enc+"'; can not be omitted",
                                getLocation());
    }
    */
}
/

com.fasterxml.woodstox/ woodstox-core/ 6.2.3/ com/ctc/wstx/io/StreamBootstrapper.java