/*
* Copyright (c) 1996, 1997, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.io;
// UCS2 (UTF16) -> UCS Transformation Format 8 (UTF-8) converter.
// The UTF-8 encoding is laid out as shown below.
//
//   #  Bits  Bit pattern
//   1    7   0xxxxxxx
//   2   11   110xxxxx 10xxxxxx
//   3   16   1110xxxx 10xxxxxx 10xxxxxx
//   4   21   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//   5   26   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
//   6   31   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
//
// UCS2 uses forms 1-3 / UTF16 uses forms 1-4 / UCS4 uses forms 1-6.
/**
* UCS2 (UTF16) -> UCS Transformation Format 8 (UTF-8) converter
* It's represented like below.
*
* # Bits Bit pattern
* 1 7 0xxxxxxx
* 2 11 110xxxxx 10xxxxxx
* 3 16 1110xxxx 10xxxxxx 10xxxxxx
* 4 21 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
* 5 26 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
* 6 31 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
*
* UCS2 uses 1-3 / UTF16 uses 1-4 / UCS4 uses 1-6
*/
public class CharToByteUTF8 extends CharToByteConverter {
private char highHalfZoneCode;
public int flush(byte[] output, int outStart, int outEnd)
throws MalformedInputException
{
if (highHalfZoneCode != 0) {
highHalfZoneCode = 0;
badInputLength = 0;
throw new MalformedInputException();
}
byteOff = charOff = 0;
return 0;
}
Character conversion
/**
* Character conversion
*/
public int convert(char[] input, int inOff, int inEnd,
byte[] output, int outOff, int outEnd)
throws ConversionBufferFullException, MalformedInputException
{
char inputChar;
byte[] outputByte = new byte[6];
int inputSize;
int outputSize;
charOff = inOff;
byteOff = outOff;
if (highHalfZoneCode != 0) {
inputChar = highHalfZoneCode;
highHalfZoneCode = 0;
if (input[inOff] >= 0xdc00 && input[inOff] <= 0xdfff) {
// This is legal UTF16 sequence.
int ucs4 = (highHalfZoneCode - 0xd800) * 0x400
+ (input[inOff] - 0xdc00) + 0x10000;
output[0] = (byte)(0xf0 | ((ucs4 >> 18)) & 0x07);
output[1] = (byte)(0x80 | ((ucs4 >> 12) & 0x3f));
output[2] = (byte)(0x80 | ((ucs4 >> 6) & 0x3f));
output[3] = (byte)(0x80 | (ucs4 & 0x3f));
charOff++;
highHalfZoneCode = 0;
} else {
// This is illegal UTF16 sequence.
badInputLength = 0;
throw new MalformedInputException();
}
}
while(charOff < inEnd) {
inputChar = input[charOff];
if (inputChar < 0x80) {
outputByte[0] = (byte)inputChar;
inputSize = 1;
outputSize = 1;
} else if (inputChar < 0x800) {
outputByte[0] = (byte)(0xc0 | ((inputChar >> 6) & 0x1f));
outputByte[1] = (byte)(0x80 | (inputChar & 0x3f));
inputSize = 1;
outputSize = 2;
} else if (inputChar >= 0xd800 && inputChar <= 0xdbff) {
// this is <high-half zone code> in UTF-16
if (charOff + 1 >= inEnd) {
highHalfZoneCode = inputChar;
break;
}
// check next char is valid <low-half zone code>
char lowChar = input[charOff + 1];
if (lowChar < 0xdc00 || lowChar > 0xdfff) {
badInputLength = 1;
throw new MalformedInputException();
}
int ucs4 = (inputChar - 0xd800) * 0x400 + (lowChar - 0xdc00)
+ 0x10000;
outputByte[0] = (byte)(0xf0 | ((ucs4 >> 18)) & 0x07);
outputByte[1] = (byte)(0x80 | ((ucs4 >> 12) & 0x3f));
outputByte[2] = (byte)(0x80 | ((ucs4 >> 6) & 0x3f));
outputByte[3] = (byte)(0x80 | (ucs4 & 0x3f));
outputSize = 4;
inputSize = 2;
} else {
outputByte[0] = (byte)(0xe0 | ((inputChar >> 12)) & 0x0f);
outputByte[1] = (byte)(0x80 | ((inputChar >> 6) & 0x3f));
outputByte[2] = (byte)(0x80 | (inputChar & 0x3f));
inputSize = 1;
outputSize = 3;
}
if (byteOff + outputSize > outEnd) {
throw new ConversionBufferFullException();
}
for (int i = 0; i < outputSize; i++) {
output[byteOff++] = outputByte[i];
}
charOff += inputSize;
}
return byteOff - outOff;
}
public boolean canConvert(char ch) {
return true;
}
public int getMaxBytesPerChar() {
return 3;
}
public void reset() {
byteOff = charOff = 0;
highHalfZoneCode = 0;
}
public String getCharacterEncoding() {
return "UTF8";
}
}