/*
 * Copyright (c) 1996, 1997, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package sun.io;


/**
 * UCS2 (UTF16) -> UCS Transformation Format 8 (UTF-8) converter.
 * Code points are represented as shown below.
 *
 * #    Bits   Bit pattern
 * 1    7      0xxxxxxx
 * 2    11     110xxxxx 10xxxxxx
 * 3    16     1110xxxx 10xxxxxx 10xxxxxx
 * 4    21     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 5    26     111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 6    31     1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * UCS2 uses 1-3 / UTF16 uses 1-4 / UCS4 uses 1-6
 */
public class CharToByteUTF8 extends CharToByteConverter { private char highHalfZoneCode; public int flush(byte[] output, int outStart, int outEnd) throws MalformedInputException { if (highHalfZoneCode != 0) { highHalfZoneCode = 0; badInputLength = 0; throw new MalformedInputException(); } byteOff = charOff = 0; return 0; }
Character conversion
/** * Character conversion */
public int convert(char[] input, int inOff, int inEnd, byte[] output, int outOff, int outEnd) throws ConversionBufferFullException, MalformedInputException { char inputChar; byte[] outputByte = new byte[6]; int inputSize; int outputSize; charOff = inOff; byteOff = outOff; if (highHalfZoneCode != 0) { inputChar = highHalfZoneCode; highHalfZoneCode = 0; if (input[inOff] >= 0xdc00 && input[inOff] <= 0xdfff) { // This is legal UTF16 sequence. int ucs4 = (highHalfZoneCode - 0xd800) * 0x400 + (input[inOff] - 0xdc00) + 0x10000; output[0] = (byte)(0xf0 | ((ucs4 >> 18)) & 0x07); output[1] = (byte)(0x80 | ((ucs4 >> 12) & 0x3f)); output[2] = (byte)(0x80 | ((ucs4 >> 6) & 0x3f)); output[3] = (byte)(0x80 | (ucs4 & 0x3f)); charOff++; highHalfZoneCode = 0; } else { // This is illegal UTF16 sequence. badInputLength = 0; throw new MalformedInputException(); } } while(charOff < inEnd) { inputChar = input[charOff]; if (inputChar < 0x80) { outputByte[0] = (byte)inputChar; inputSize = 1; outputSize = 1; } else if (inputChar < 0x800) { outputByte[0] = (byte)(0xc0 | ((inputChar >> 6) & 0x1f)); outputByte[1] = (byte)(0x80 | (inputChar & 0x3f)); inputSize = 1; outputSize = 2; } else if (inputChar >= 0xd800 && inputChar <= 0xdbff) { // this is <high-half zone code> in UTF-16 if (charOff + 1 >= inEnd) { highHalfZoneCode = inputChar; break; } // check next char is valid <low-half zone code> char lowChar = input[charOff + 1]; if (lowChar < 0xdc00 || lowChar > 0xdfff) { badInputLength = 1; throw new MalformedInputException(); } int ucs4 = (inputChar - 0xd800) * 0x400 + (lowChar - 0xdc00) + 0x10000; outputByte[0] = (byte)(0xf0 | ((ucs4 >> 18)) & 0x07); outputByte[1] = (byte)(0x80 | ((ucs4 >> 12) & 0x3f)); outputByte[2] = (byte)(0x80 | ((ucs4 >> 6) & 0x3f)); outputByte[3] = (byte)(0x80 | (ucs4 & 0x3f)); outputSize = 4; inputSize = 2; } else { outputByte[0] = (byte)(0xe0 | ((inputChar >> 12)) & 0x0f); outputByte[1] = (byte)(0x80 | ((inputChar >> 6) & 0x3f)); outputByte[2] = (byte)(0x80 | (inputChar 
& 0x3f)); inputSize = 1; outputSize = 3; } if (byteOff + outputSize > outEnd) { throw new ConversionBufferFullException(); } for (int i = 0; i < outputSize; i++) { output[byteOff++] = outputByte[i]; } charOff += inputSize; } return byteOff - outOff; } public boolean canConvert(char ch) { return true; } public int getMaxBytesPerChar() { return 3; } public void reset() { byteOff = charOff = 0; highHalfZoneCode = 0; } public String getCharacterEncoding() { return "UTF8"; } }