// java/11 : jdk.compiler/com/sun/tools/javac/parser/UnicodeReader.java
//
// UnicodeReader
// https://openjdk.java.net/
// GPLv2 + Classpath Exception
/*
 * Copyright (c) 2011, 2013, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.tools.javac.parser;

import java.nio.CharBuffer;
import java.util.Arrays;

import com.sun.tools.javac.file.JavacFileManager;
import com.sun.tools.javac.resources.CompilerProperties.Errors;
import com.sun.tools.javac.util.ArrayUtils;
import com.sun.tools.javac.util.Log;
import com.sun.tools.javac.util.Name;
import com.sun.tools.javac.util.Names;

import static com.sun.tools.javac.util.LayoutCharacters.*;

/**
 * The char reader used by the javac lexer/tokenizer. Returns the sequence of
 * characters contained in the input stream, handling Unicode escapes accordingly.
 * Additionally, it provides features for saving chars into a buffer and
 * retrieving them at a later stage.
 *
 *  <p><b>This is NOT part of any supported API.
 *  If you write code that depends on this, you do so at your own risk.
 *  This code and its internal interfaces are subject to change or
 *  deletion without notice.</b>
 */
public class UnicodeReader {

    /** The input buffer, the buffer index of the most recently read character,
     *  and the index one past the last input character.
     */
    protected char[] buf;
    protected int bp;
    protected final int buflen;

    /** The current character.
     */
    protected char ch;

    /** The buffer index at which the last Unicode escape conversion ended,
     *  or -1 if no escape has been converted yet.
     */
    protected int unicodeConversionBp = -1;

    protected Log log;
    protected Names names;

    /** A character buffer for saved chars.
     */
    protected char[] sbuf = new char[128];
    protected int sp;

    /**
     * Create a scanner from the input buffer.  This constructor might
     * modify the underlying character array.  It delegates to
     * {@link #UnicodeReader(ScannerFactory, char[], int)}, passing the array
     * obtained from {@code JavacFileManager.toArray(buffer)} as {@code input}
     * and {@code buffer.limit()} as {@code inputLength}.  To avoid copying
     * the input array, ensure that {@code inputLength < input.length} or
     * {@code input[input.length -1]} is a white space character.
     *
     * @param sf the factory which created this Scanner
     * @param buffer the input, might be modified; its limit is used as the
     *               input length
     */
    protected UnicodeReader(ScannerFactory sf, CharBuffer buffer) {
        this(sf, JavacFileManager.toArray(buffer), buffer.limit());
    }

    protected UnicodeReader(ScannerFactory sf, char[] input, int inputLength) {
        log = sf.log;
        names = sf.names;
        if (inputLength == input.length) {
            if (input.length > 0 && Character.isWhitespace(input[input.length - 1])) {
                inputLength--;
            } else {
                input = Arrays.copyOf(input, inputLength + 1);
            }
        }
        buf = input;
        buflen = inputLength;
        buf[buflen] = EOI;
        bp = -1;
        scanChar();
    }

    Read next character.
/** Read next character.
     */
    protected void scanChar() {
        if (bp < buflen) {
            ch = buf[++bp];
            if (ch == '\\') {
                convertUnicode();
            }
        }
    }

    Read next character in comment, skipping over double '\' characters.
/** Read next character in comment, skipping over double '\' characters.
     */
    protected void scanCommentChar() {
        scanChar();
        if (ch == '\\') {
            if (peekChar() == '\\' && !isUnicode()) {
                skipChar();
            } else {
                convertUnicode();
            }
        }
    }

    Append a character to sbuf.
/** Append a character to sbuf.
     */
    protected void putChar(char ch, boolean scan) {
        sbuf = ArrayUtils.ensureCapacity(sbuf, sp);
        sbuf[sp++] = ch;
        if (scan)
            scanChar();
    }

    /** Append a character to sbuf without reading further input.
     */
    protected void putChar(char ch) {
        final boolean scan = false;
        putChar(ch, scan);
    }

    /** Append the current character {@code ch} to sbuf.
     *
     *  @param scan if true, read the next input character afterwards
     */
    protected void putChar(boolean scan) {
        final char current = ch;
        putChar(current, scan);
    }

    /** Build a Name from the saved characters by calling
     *  {@code names.fromChars(sbuf, 0, sp)}.
     */
    Name name() {
        final Name saved = names.fromChars(sbuf, 0, sp);
        return saved;
    }

    /** Return the first {@code sp} saved characters of sbuf as a new String.
     */
    String chars() {
        return String.valueOf(sbuf, 0, sp);
    }

    Convert unicode escape; bp points to initial '\' character
 (Spec 3.3).
/** Convert unicode escape; bp points to initial '\' character
     *  (Spec 3.3).
     */
    protected void convertUnicode() {
        if (ch == '\\' && unicodeConversionBp != bp) {
            bp++; ch = buf[bp];
            if (ch == 'u') {
                do {
                    bp++; ch = buf[bp];
                } while (ch == 'u');
                int limit = bp + 3;
                if (limit < buflen) {
                    int d = digit(bp, 16);
                    int code = d;
                    while (bp < limit && d >= 0) {
                        bp++; ch = buf[bp];
                        d = digit(bp, 16);
                        code = (code << 4) + d;
                    }
                    if (d >= 0) {
                        ch = (char)code;
                        unicodeConversionBp = bp;
                        return;
                    }
                }
                log.error(bp, Errors.IllegalUnicodeEsc);
            } else {
                bp--;
                ch = '\\';
            }
        }
    }

    Are surrogates supported?
/** Are surrogates supported?
     */
    final static boolean surrogatesSupported = surrogatesSupported();
    private static boolean surrogatesSupported() {
        try {
            Character.isHighSurrogate('a');
            return true;
        } catch (NoSuchMethodError ex) {
            return false;
        }
    }

    Scan surrogate pairs.  If 'ch' is a high surrogate and
 the next character is a low surrogate, returns the code point
 constructed from these surrogates. Otherwise, returns -1.
 This method will not consume any of the characters.
/** Scan surrogate pairs.  If 'ch' is a high surrogate and
     *  the next character is a low surrogate, returns the code point
     *  constructed from these surrogates. Otherwise, returns -1.
     *  This method will not consume any of the characters.
     */
    protected int peekSurrogates() {
        if (surrogatesSupported && Character.isHighSurrogate(ch)) {
            char high = ch;
            int prevBP = bp;

            scanChar();

            char low = ch;

            ch = high;
            bp = prevBP;

            if (Character.isLowSurrogate(low)) {
                return Character.toCodePoint(high, low);
            }
        }

        return -1;
    }

    Convert an ASCII digit from its base (8, 10, or 16)
 to its value.
/** Convert an ASCII digit from its base (8, 10, or 16)
     *  to its value.
     */
    protected int digit(int pos, int base) {
        char c = ch;
        if ('0' <= c && c <= '9')
            return Character.digit(c, base); //a fast common case
        int codePoint = peekSurrogates();
        int result = codePoint >= 0 ? Character.digit(codePoint, base) : Character.digit(c, base);
        if (result >= 0 && c > 0x7f) {
            log.error(pos + 1, Errors.IllegalNonasciiDigit);
            if (codePoint >= 0)
                scanChar();
            ch = "0123456789abcdef".charAt(result);
        }
        return result;
    }

    /** Was the character at the current {@code bp} produced by the last
     *  unicode escape conversion?
     */
    protected boolean isUnicode() {
        return bp == unicodeConversionBp;
    }

    /** Advance {@code bp} by one position without updating {@code ch}.
     */
    protected void skipChar() {
        bp += 1;
    }

    /** Return the raw buffer character following {@code bp}, without
     *  moving {@code bp} and without unicode escape conversion.
     */
    protected char peekChar() {
        final int next = bp + 1;
        return buf[next];
    }

    Returns a copy of the input buffer, up to its inputLength.
Unicode escape sequences are not translated.
/**
     * Returns a copy of the input buffer, up to its inputLength.
     * Unicode escape sequences are not translated.
     */
    public char[] getRawCharacters() {
        char[] chars = new char[buflen];
        System.arraycopy(buf, 0, chars, 0, buflen);
        return chars;
    }

    Returns a copy of a character array subset of the input buffer. The returned array begins at the beginIndex and extends to the character at index endIndex - 1. Thus the length of the substring is endIndex-beginIndex. This behavior is like String.substring(beginIndex, endIndex). Unicode escape sequences are not translated. 
Params: beginIndex – the beginning index, inclusive.
endIndex – the ending index, exclusive.
Throws: ArrayIndexOutOfBoundsException – if either offset is outside of the
        array bounds/**
     * Returns a copy of a character array subset of the input buffer.
     * The returned array begins at the {@code beginIndex} and
     * extends to the character at index {@code endIndex - 1}.
     * Thus the length of the substring is {@code endIndex-beginIndex}.
     * This behavior is like
     * {@code String.substring(beginIndex, endIndex)}.
     * Unicode escape sequences are not translated.
     *
     * @param beginIndex the beginning index, inclusive.
     * @param endIndex the ending index, exclusive.
     * @throws ArrayIndexOutOfBoundsException if either offset is outside of the
     *         array bounds
     */
    public char[] getRawCharacters(int beginIndex, int endIndex) {
        int length = endIndex - beginIndex;
        char[] chars = new char[length];
        System.arraycopy(buf, beginIndex, chars, 0, length);
        return chars;
    }
}
// java/11/jdk.compiler/com/sun/tools/javac/parser/UnicodeReader.java