// java/6 : sun/io/CharToByteUTF8.java
//
// CharToByteUTF8
// https://openjdk.java.net/
// GPLv2 + Classpath Exception
/*
 * Copyright (c) 1996, 1997, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package sun.io;


/**
 * UCS2 (UTF16) -> UCS Transformation Format 8 (UTF-8) converter.
 *
 * UTF-8 represents a value with one of the byte sequences below, where
 * "#" is the number of bytes in the sequence and "Bits" is the number of
 * payload bits (the x positions) it carries.
 *
 * # Bits   Bit pattern
 * 1    7   0xxxxxxx
 * 2   11   110xxxxx 10xxxxxx
 * 3   16   1110xxxx 10xxxxxx 10xxxxxx
 * 4   21   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 5   26   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 6   31   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 *     UCS2 uses 1-3 / UTF16 uses 1-4 / UCS4 uses 1-6
 */

public class CharToByteUTF8 extends CharToByteConverter {

    private char highHalfZoneCode;

    public int flush(byte[] output, int outStart, int outEnd)
        throws MalformedInputException
    {
        if (highHalfZoneCode != 0) {
            highHalfZoneCode = 0;
            badInputLength = 0;
            throw new MalformedInputException();
        }
        byteOff = charOff = 0;
        return 0;
    }

    Character conversion
/**
     * Character conversion
     */
    public int convert(char[] input, int inOff, int inEnd,
                       byte[] output, int outOff, int outEnd)
        throws ConversionBufferFullException, MalformedInputException
    {
        char inputChar;
        byte[] outputByte = new byte[6];
        int inputSize;
        int outputSize;

        charOff = inOff;
        byteOff = outOff;

        if (highHalfZoneCode != 0) {
            inputChar = highHalfZoneCode;
            highHalfZoneCode = 0;
            if (input[inOff] >= 0xdc00 && input[inOff] <= 0xdfff) {
                // This is legal UTF16 sequence.
                int ucs4 = (highHalfZoneCode - 0xd800) * 0x400
                    + (input[inOff] - 0xdc00) + 0x10000;
                output[0] = (byte)(0xf0 | ((ucs4 >> 18)) & 0x07);
                output[1] = (byte)(0x80 | ((ucs4 >> 12) & 0x3f));
                output[2] = (byte)(0x80 | ((ucs4 >> 6) & 0x3f));
                output[3] = (byte)(0x80 | (ucs4 & 0x3f));
                charOff++;
                highHalfZoneCode = 0;
            } else {
                // This is illegal UTF16 sequence.
                badInputLength = 0;
                throw new MalformedInputException();
            }
        }

        while(charOff < inEnd) {
            inputChar = input[charOff];
            if (inputChar < 0x80) {
                outputByte[0] = (byte)inputChar;
                inputSize = 1;
                outputSize = 1;
            } else if (inputChar < 0x800) {
                outputByte[0] = (byte)(0xc0 | ((inputChar >> 6) & 0x1f));
                outputByte[1] = (byte)(0x80 | (inputChar & 0x3f));
                inputSize = 1;
                outputSize = 2;
            } else if (inputChar >= 0xd800 && inputChar <= 0xdbff) {
                // this is <high-half zone code> in UTF-16
                if (charOff + 1 >= inEnd) {
                    highHalfZoneCode = inputChar;
                    break;
                }
                // check next char is valid <low-half zone code>
                char lowChar = input[charOff + 1];
                if (lowChar < 0xdc00 || lowChar > 0xdfff) {
                    badInputLength = 1;
                    throw new MalformedInputException();
                }
                int ucs4 = (inputChar - 0xd800) * 0x400 + (lowChar - 0xdc00)
                    + 0x10000;
                outputByte[0] = (byte)(0xf0 | ((ucs4 >> 18)) & 0x07);
                outputByte[1] = (byte)(0x80 | ((ucs4 >> 12) & 0x3f));
                outputByte[2] = (byte)(0x80 | ((ucs4 >> 6) & 0x3f));
                outputByte[3] = (byte)(0x80 | (ucs4 & 0x3f));
                outputSize = 4;
                inputSize = 2;
            } else {
                outputByte[0] = (byte)(0xe0 | ((inputChar >> 12)) & 0x0f);
                outputByte[1] = (byte)(0x80 | ((inputChar >> 6) & 0x3f));
                outputByte[2] = (byte)(0x80 | (inputChar & 0x3f));
                inputSize = 1;
                outputSize = 3;
            }
            if (byteOff + outputSize > outEnd) {
                throw new ConversionBufferFullException();
            }
            for (int i = 0; i < outputSize; i++) {
                output[byteOff++] = outputByte[i];
            }
            charOff += inputSize;
        }
        return byteOff - outOff;
    }

    public boolean canConvert(char ch) {
        return true;
    }

    public int getMaxBytesPerChar() {
        return 3;
    }

    public void reset() {
        byteOff = charOff = 0;
        highHalfZoneCode = 0;
    }

    public String getCharacterEncoding() {
        return "UTF8";
    }
}
// java/6/sun/io/CharToByteUTF8.java