/*
 * Copyright (c) 2007, 2017, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
package com.oracle.objectfile.io;

import java.io.CharConversionException;
import java.nio.ByteBuffer;

/**
 * Implements UTF-8 encoding and decoding of strings with support for zero-bytes as string
 * terminators.
 *
 * <p>
 * Every {@code char} (UTF-16 code unit) is encoded on its own using one, two or three bytes. The
 * character {@code '\0'} is written as the two bytes {@code 0xC0 0x80} so that an encoded string
 * never contains a zero byte, and surrogates are written as individual three-byte sequences.
 * Four-byte sequences are neither produced nor accepted. This is the variant commonly called
 * "modified UTF-8".
 */
public final class Utf8 {

    /** Static utility class; not instantiable. */
    private Utf8() {
    }

    /**
     * @param string the string to measure
     * @return the length in bytes of the UTF8 representation of the string
     */
    public static int utf8Length(String string) {
        return utf8Length(string, 0, string.length());
    }

    /**
     * @param s the string containing the region to measure
     * @param beginIndex first index that is part of the region, inclusive
     * @param endIndex index at the end of the region, exclusive
     * @return the length in bytes of the UTF8 representation of the string region
     * @throws StringIndexOutOfBoundsException if the region is not within {@code s}
     */
    public static int utf8Length(String s, int beginIndex, int endIndex) {
        checkRegion(s, beginIndex, endIndex);
        int length = 0;
        for (int i = beginIndex; i < endIndex; i++) {
            final int c = s.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                length++;
            } else if (c > 0x07FF) {
                length += 3;
            } else {
                // 0x0080..0x07FF, and also '\0', which is deliberately encoded as two bytes.
                length += 2;
            }
        }
        return length;
    }

    /**
     * Writes an UTF8-encoded string region to a given byte buffer.
     *
     * @param dest the byte buffer to write to; it needs at least
     *            {@link #utf8Length(String, int, int)} remaining bytes, plus one if
     *            {@code zeroTerminate} is set
     * @param source the String to be written
     * @param beginIndex first index in {@code source} that is part of the region, inclusive
     * @param endIndex index in {@code source} at the end of the region, exclusive
     * @param zeroTerminate whether to write a final zero byte
     * @throws StringIndexOutOfBoundsException if the region is not within {@code source}
     */
    public static void substringToUtf8(ByteBuffer dest, String source, int beginIndex, int endIndex, boolean zeroTerminate) {
        checkRegion(source, beginIndex, endIndex);
        for (int i = beginIndex; i < endIndex; i++) {
            final char c = source.charAt(i);
            if ((c >= 0x0001) && (c <= 0x007F)) {
                /* 0xxxxxxx */
                dest.put((byte) c);
            } else if (c > 0x07FF) {
                /* 1110 xxxx 10xx xxxx 10xx xxxx */
                dest.put((byte) (0xe0 | (c >> 12)));
                dest.put((byte) (0x80 | ((c >> 6) & 0x3f)));
                dest.put((byte) (0x80 | (c & 0x3f)));
            } else {
                /* 110x xxxx 10xx xxxx; '\0' ends up here and becomes 0xC0 0x80 */
                dest.put((byte) (0xc0 | (c >> 6)));
                dest.put((byte) (0x80 | (c & 0x3f)));
            }
        }
        if (zeroTerminate) {
            dest.put((byte) 0);
        }
    }

    /**
     * Converts a byte buffer of UTF-8 data to a String. The entire buffer until the
     * {@link ByteBuffer#limit() buffer's limit} is converted unless {@code zeroTerminated} is
     * {@code true}, in which case conversion stops at the first zero byte. The terminating zero
     * byte is consumed from the buffer but is not part of the result.
     *
     * @param zeroTerminated if true, then a 0 byte marks the end of the string, and character '\0'
     *            in the input must be encoded as two bytes as opposed to one
     * @param source the byte buffer to read from
     * @return the decoded string
     * @throws CharConversionException if the input contains an invalid lead byte, an invalid
     *             continuation byte, or a multi-byte sequence that is cut off by the buffer's
     *             limit
     */
    public static String utf8ToString(boolean zeroTerminated, ByteBuffer source) throws CharConversionException {
        final StringBuilder sb = new StringBuilder();
        while (source.hasRemaining()) {
            final int c0 = source.get() & 0xff;
            if (zeroTerminated && c0 == 0) {
                break;
            }
            if (c0 < 0x80) {
                /* 0xxxxxxx */
                sb.append((char) c0);
            } else if ((c0 & 0xE0) == 0xC0) {
                /* 110x xxxx 10xx xxxx */
                final int b1 = continuationBits(source);
                sb.append((char) (((c0 & 0x1F) << 6) | b1));
            } else if ((c0 & 0xF0) == 0xE0) {
                /* 1110 xxxx 10xx xxxx 10xx xxxx */
                final int b1 = continuationBits(source);
                final int b2 = continuationBits(source);
                sb.append((char) (((c0 & 0x0F) << 12) | (b1 << 6) | b2));
            } else {
                /* 10xx xxxx, 1111 xxxx */
                throw new CharConversionException("invalid UTF-8 lead byte 0x" + Integer.toHexString(c0));
            }
        }
        return sb.toString();
    }

    /**
     * Reads one continuation byte ({@code 10xx xxxx}) and returns its six payload bits. Running
     * out of input is reported as a {@link CharConversionException} so that callers never see an
     * unchecked buffer underflow for truncated data.
     */
    private static int continuationBits(ByteBuffer source) throws CharConversionException {
        if (!source.hasRemaining()) {
            throw new CharConversionException("truncated UTF-8 sequence at end of input");
        }
        final int b = source.get();
        if ((b & 0xC0) != 0x80) {
            throw new CharConversionException("invalid UTF-8 continuation byte 0x" + Integer.toHexString(b & 0xff));
        }
        return b & 0x3F;
    }

    /** Verifies that {@code [beginIndex, endIndex)} is a valid region of {@code s}. */
    private static void checkRegion(String s, int beginIndex, int endIndex) {
        if (beginIndex < 0 || endIndex > s.length() || beginIndex > endIndex) {
            throw new StringIndexOutOfBoundsException("begin " + beginIndex + ", end " + endIndex + ", length " + s.length());
        }
    }
}