com.fasterxml.jackson.dataformat/jackson-dataformat-toml/2.12.3 : com/fasterxml/jackson/dataformat/toml/UTF8Reader.java

UTF8Reader
https://github.com/FasterXML/jackson-dataformats-text: Support for reading and writing TOML-encoded data via Jackson abstractions. (FasterXML)
The Apache Software License, Version 2.0
Tatu Saloranta
package com.fasterxml.jackson.dataformat.toml;

import java.io.*;

import com.fasterxml.jackson.core.io.IOContext;

Optimized Reader that reads UTF-8 encoded content from an input stream. Content may come either from a static byte[] buffer or InputStream. /**
 * Optimized Reader that reads UTF-8 encoded content from an input stream.
 * Content may come either from a static {@code byte[]} buffer or
 * {@link java.io.InputStream}.
 */
public final class UTF8Reader
    extends Reader
{
    IO context to use for returning input buffer, iff
buffer is to be recycled when input ends.
/**
     * IO context to use for returning input buffer, iff
     * buffer is to be recycled when input ends.
     */
    private final IOContext _ioContext;

    private InputStream _inputSource;

    private final boolean _autoClose;

    private byte[] _inputBuffer;

    Pointer to the next available byte (if any), iff less than
mByteBufferEnd
/**
     * Pointer to the next available byte (if any), iff less than
     * <code>mByteBufferEnd</code>
     */
    private int _inputPtr;

    Pointed to the end marker, that is, position one after the last
valid available byte.
/**
     * Pointed to the end marker, that is, position one after the last
     * valid available byte.
     */
    private int _inputEnd;

    Decoded first character of a surrogate pair, if one needs to be buffered
/**
     * Decoded first character of a surrogate pair, if one needs to be buffered
     */
    private int _surrogate = -1;

    Total read character count; used for error reporting purposes
/**
     * Total read character count; used for error reporting purposes
     */
    private int _charCount = 0;

    Total read byte count; used for error reporting purposes
/**
     * Total read byte count; used for error reporting purposes
     */
    private int _byteCount = 0;

    /*
    /**********************************************************************
    /* Life-cycle
    /**********************************************************************
     */

    // Constructor used when caller gives us
    private UTF8Reader(IOContext ctxt, InputStream in, boolean autoClose,
            byte[] buf, int ptr, int end)
    {
        super((in == null) ? buf : in);
        _ioContext = ctxt;
        _inputSource = in;
        _inputBuffer = buf;
        _inputPtr = ptr;
        _inputEnd = end;
        _autoClose = autoClose;
    }

    // Factory method used when dealing with InputStream; will need
    // to allocate and manage processing buffers
    public static UTF8Reader construct(IOContext ctxt, InputStream in, boolean autoClose)
    {
        final byte[] buf = ctxt.allocReadIOBuffer();
        return new UTF8Reader(ctxt, in, autoClose, buf, 0, 0);
    }

    // Factory method used when user passes us input in static pre-filled
    // input buffer: no InputStream nor buffer recycling used
    public static UTF8Reader construct(byte[] buf, int ptr, int len)
    {
        return new UTF8Reader(null, null, true, buf, ptr, ptr+len);
    }

    This method should be called along with (or instead of) normal
close. After calling this method, no further reads should be tried.
Method will try to recycle read buffers (if any).
/**
     * This method should be called along with (or instead of) normal
     * close. After calling this method, no further reads should be tried.
     * Method will try to recycle read buffers (if any).
     */
    private void freeBuffers()
    {
        if (_ioContext != null) {
            byte[] buf = _inputBuffer;
            if (buf != null) {
                _inputBuffer = null;
                _ioContext.releaseReadIOBuffer(buf);
            }
        }
    }

    /*
    /**********************************************************************
    /* Reader API
    /**********************************************************************
     */

    @Override
    public void close() throws IOException
    {
        InputStream in = _inputSource;
        if (in != null) {
            _inputSource = null;
            if (_autoClose) {
                in.close();
            }
        }
        freeBuffers();
    }

    private char[] _tmpBuffer = null;

    // This method is implemented by the base class, AND it should
    //never be called by our code. But let's still implement it bit more
    // efficiently just in case
    @Override
    public int read() throws IOException
    {
        if (_tmpBuffer == null) {
            _tmpBuffer = new char[1];
        }
        if (read(_tmpBuffer, 0, 1) < 1) {
            return -1;
        }
        return _tmpBuffer[0];
    }

    @Override
    public int read(char[] cbuf) throws IOException {
        return read(cbuf, 0, cbuf.length);
    }

    @Override
    public int read(char[] cbuf, int start, int len) throws IOException
    {
        // Already EOF?
        if (_inputBuffer == null) {
            return -1;
        }
        len += start;
        int outPtr = start;

        // Ok, first; do we have a surrogate from last round?
        if (_surrogate >= 0) {
            cbuf[outPtr++] = (char) _surrogate;
            _surrogate = -1;
            // No need to load more, already got one char
            // 05-Apr-2021, tatu: but if at the end must return:
            if (_inputPtr >= _inputEnd) {
                _charCount += 1;
                return 1;
            }
        } else {
            // To prevent unnecessary blocking (esp. with network streams),
            // we'll only require decoding of a single char
            int left = (_inputEnd - _inputPtr);

            // 09-Apr-2021, tatu: Not 100% sure this is valid logic for cases
            //   where content ends with UTF-8 multibyte characters... but
            //   this has existed in multiple incarnations for over a decade
            //   so... maybe I thought it all through?
            if (left < 4) {
                // Need to load more?
                if (left < 1 || _inputBuffer[_inputPtr] < 0) {
                    if (!loadMore(left)) { // (legal) EOF?
                        return -1;
                    }
                }
            }
        }
        final byte[] buf = _inputBuffer;
        int inPtr = _inputPtr;
        final int inBufLen = _inputEnd;

        main_loop:
        while (outPtr < len) {
            // At this point we have at least one byte available
            int c = (int) buf[inPtr++];

            // Let's first do the quickie loop for common case; 7-bit ASCII
            if (c >= 0) { // ASCII? can probably loop, then
                cbuf[outPtr++] = (char) c; // ok since MSB is never on

                // Ok, how many such chars could we safely process without overruns?
                // (will combine 2 in-loop comparisons into just one)
                int outMax = (len - outPtr); // max output
                int inMax = (inBufLen - inPtr); // max input
                int inEnd = inPtr + ((inMax < outMax) ? inMax : outMax);

                ascii_loop:
                while (true) {
                    if (inPtr >= inEnd) {
                        break main_loop;
                    }
                    c = buf[inPtr++];
                    if (c < 0) { // or multi-byte
                        break ascii_loop;
                    }
                    cbuf[outPtr++] = (char) c;
                }
            }

            int needed;

            // Ok; if we end here, we got multi-byte combination
            if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                c = (c & 0x1F);
                needed = 1;
            } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                c = (c & 0x0F);
                needed = 2;
            } else if ((c & 0xF8) == 0xF0) {
                // 4 bytes; double-char BS, with surrogates and all...
                c = (c & 0x0F);
                needed = 3;
            } else {
                reportInvalidInitial(c & 0xFF, outPtr-start);
                // never gets here...
                needed = 1;
            }
            // Do we have enough bytes? If not, let's just push back the
            // byte and leave, since we have already gotten at least one
            // char decoded. This way we will only block (with read from
            // input stream) when absolutely necessary.
            if ((inBufLen - inPtr) < needed) {
                --inPtr;
                break main_loop;
            }

            int d = (int) buf[inPtr++];
            if ((d & 0xC0) != 0x080) {
                reportInvalidOther(d & 0xFF, outPtr-start);
            }
            c = (c << 6) | (d & 0x3F);

            if (needed > 1) { // needed == 1 means 2 bytes total
                d = buf[inPtr++]; // 3rd byte
                if ((d & 0xC0) != 0x080) {
                    reportInvalidOther(d & 0xFF, outPtr-start);
                }
                c = (c << 6) | (d & 0x3F);
                if (needed > 2) { // 4 bytes? (need surrogates)
                    d = buf[inPtr++];
                    if ((d & 0xC0) != 0x080) {
                        reportInvalidOther(d & 0xFF, outPtr-start);
                    }
                    c = (c << 6) | (d & 0x3F);
                    /* Ugh. Need to mess with surrogates. Ok; let's inline them
                     * there, then, if there's room: if only room for one,
                     * need to save the surrogate for the rainy day...
                     */
                    c -= 0x10000; // to normalize it starting with 0x0
                    cbuf[outPtr++] = (char) (0xD800 + (c >> 10));
                    // hmmh. can this ever be 0? (not legal, at least?)
                    c = (0xDC00 | (c & 0x03FF));

                    // Room for second part?
                    if (outPtr >= len) { // nope
                        _surrogate = c;
                        break main_loop;
                    }
                    // sure, let's fall back to normal processing:
                }
                // Otherwise, should we check that 3-byte chars are
                // legal ones (should not expand to surrogates?
                // For now, let's not...
                /*
                else {
                    if (c >= 0xD800 && c < 0xE000) {
                        reportInvalid(c, outPtr-start, "(a surrogate character) ");
                    }
                }
                */
            }
            cbuf[outPtr++] = (char) c;
            if (inPtr >= inBufLen) {
                break main_loop;
            }
        }

        _inputPtr = inPtr;
        len = outPtr - start;
        _charCount += len;
        return len;
    }

    /*
    /**********************************************************************
    /* Internal methods: loading more input
    /**********************************************************************
     */

    Params: available – Number of "unused" bytes in the input buffer
Returns: True, if enough bytes were read to allow decoding of at least
  one full character; false if EOF was encountered instead./**
     * @param available Number of "unused" bytes in the input buffer
     *
     * @return True, if enough bytes were read to allow decoding of at least
     *   one full character; false if EOF was encountered instead.
     */
    private boolean loadMore(int available) throws IOException
    {
        if (_inputSource == null) {
            return false;
        }

        _byteCount += (_inputEnd - available);

        // Bytes that need to be moved to the beginning of buffer?
        if (available > 0) {
            if (_inputPtr > 0) {
                for (int i = 0; i < available; ++i) {
                    _inputBuffer[i] = _inputBuffer[_inputPtr+i];
                }
                _inputPtr = 0;
                _inputEnd = available;
            }
        } else {
            // Ok; here we can actually reasonably expect an EOF,
            // so let's do a separate read right away:
            int count = readBytes();
            if (count < 1) {
                freeBuffers(); // recycle eagerly
                if (count < 0) { // -1
                    return false;
                }
                // 0 count is no good; let's err out
                reportStrangeStream();
            }
        }

        // We now have at least one byte... and that allows us to
        // calculate exactly how many bytes we need!
        @SuppressWarnings("cast")
        int c = (int) _inputBuffer[_inputPtr];
        if (c >= 0) { // single byte (ascii) char... cool, can return
            return true;
        }

        // Ok, a multi-byte char, let's check how many bytes we'll need:
        int needed;
        if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
            needed = 2;
        } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
            needed = 3;
        } else if ((c & 0xF8) == 0xF0) {
            // 4 bytes; double-char BS, with surrogates and all...
            needed = 4;
        } else {
            reportInvalidInitial(c & 0xFF, 0);
            // never gets here... but compiler whines without this:
            needed = 1;
        }

        // And then we'll just need to load up to that many bytes;
        // if an EOF is hit, that'll be an error. But we need not do
        // actual decoding here, just load enough bytes.
        while ((_inputPtr + needed) > _inputEnd) {
            int count = readBytesAt(_inputEnd);
            if (count < 1) {
                if (count < 0) { // -1, EOF... no good!
                    freeBuffers();
                    reportUnexpectedEOF(_inputEnd, needed);
                }
                // 0 count is no good; let's err out
                reportStrangeStream();
            }
        }
        return true;
    }

    protected final int readBytes() throws IOException
    {
        _inputPtr = 0;
        _inputEnd = 0;
        if (_inputSource != null) {
            int count = _inputSource.read(_inputBuffer, 0, _inputBuffer.length);
            if (count > 0) {
                _inputEnd = count;
            }
            return count;
        }
        return -1;
    }

    protected final int readBytesAt(int offset) throws IOException
    {
        // shouldn't modify mBytePtr, assumed to be 'offset'
        if (_inputSource != null) {
            int count = _inputSource.read(_inputBuffer, offset, _inputBuffer.length - offset);
            if (count > 0) {
                _inputEnd += count;
            }
            return count;
        }
        return -1;
    }

    /*
    /**********************************************************************
    /* Internal methods: error reporting
    /**********************************************************************
     */

    private void reportInvalidInitial(int mask, int offset) throws IOException
    {
        // input (byte) ptr has been advanced by one, by now:
        int bytePos = _byteCount + _inputPtr - 1;
        int charPos = _charCount + offset + 1;

        throw new CharConversionException("Invalid UTF-8 start byte 0x"+Integer.toHexString(mask)
                +" (at char #"+charPos+", byte #"+bytePos+")");
    }

    private void reportInvalidOther(int mask, int offset)
        throws IOException
    {
        int bytePos = _byteCount + _inputPtr - 1;
        int charPos = _charCount + offset;

        throw new CharConversionException("Invalid UTF-8 middle byte 0x"+Integer.toHexString(mask)
                +" (at char #"+charPos+", byte #"+bytePos+")");
    }

    private void reportUnexpectedEOF(int gotBytes, int needed)
        throws IOException
    {
        int bytePos = _byteCount + gotBytes;
        int charPos = _charCount;

        throw new CharConversionException("Unexpected EOF in the middle of a multi-byte char: got "
                +gotBytes+", needed "+needed +", at char #"+charPos+", byte #"+bytePos+")");
    }

    protected void reportBounds(char[] cbuf, int start, int len) throws IOException {
        throw new ArrayIndexOutOfBoundsException("read(buf,"+start+","+len+"), cbuf["+cbuf.length+"]");
    }

    protected void reportStrangeStream() throws IOException {
        throw new IOException("Strange I/O stream, returned 0 bytes on read");
    }
}
Params:	available – Number of "unused" bytes in the input buffer
Returns:	True, if enough bytes were read to allow decoding of at least one full character; false if EOF was encountered instead.
/

com.fasterxml.jackson.dataformat/ jackson-dataformat-toml/ 2.12.3/ com/fasterxml/jackson/dataformat/toml/UTF8Reader.java