package com.fasterxml.aalto.util;
import java.io.OutputStreamWriter;
import java.io.Writer;
Simple utility class that normalizes given character input character
set names into canonical (within context of this processor) names
/**
* Simple utility class that normalizes given character input character
* set names into canonical (within context of this processor) names
*/
public final class CharsetNames
implements XmlConsts
{
/*
/**********************************************************
/* Canonical names used internally
/**********************************************************
*/
// // // Unicode variants:
public final static String CS_US_ASCII = "US-ASCII";
public final static String CS_UTF8 = "UTF-8";
This constants is intentionally vague, so that some other information
will be needed to determine the endianness.
/**
* This constants is intentionally vague, so that some other information
* will be needed to determine the endianness.
*/
public final static String CS_UTF16 = "UTF-16";
public final static String CS_UTF16BE = "UTF-16BE";
public final static String CS_UTF16LE = "UTF-16LE";
public final static String CS_UTF32 = "UTF-32";
public final static String CS_UTF32BE = "UTF-32BE";
public final static String CS_UTF32LE = "UTF-32LE";
// // // 8-bit ISO encodings:
public final static String CS_ISO_LATIN1 = "ISO-8859-1";
// // // Japanese non-unicode encodings:
public final static String CS_SHIFT_JIS = "Shift_JIS";
// // // Other oddities:
public final static String CS_EBCDIC = "EBCDIC";
/*
/**********************************************************
/* Utility methods
/**********************************************************
*/
public static String normalize(String csName)
{
if (csName == null || csName.length() < 3) {
return csName;
}
/* Canonical charset names here are from IANA recommendation:
* http://www.iana.org/assignments/character-sets
* but comparison is done loosely (case-insensitive, ignoring
* spacing, underscore vs. hyphen etc) to try to make detection
* as extensive as possible.
*/
/* But first bit of pre-filtering: it seems like 'cs' prefix
* is applicable to pretty much all actual encodings (as per
* IANA recommendations; csASCII, csUcs4 etc). So, let's just
* strip out the prefix if so
*/
boolean gotCsPrefix = false;
char c = csName.charAt(0);
if (c == 'c' || c == 'C'){
char d = csName.charAt(1);
if (d == 's' || d == 'S') {
csName = csName.substring(2);
c = csName.charAt(0);
gotCsPrefix = true;
}
}
switch (c) {
case 'a':
case 'A':
if (csName == "ASCII" || equalEncodings(csName, "ASCII")) {
return CS_US_ASCII;
}
break;
case 'c':
case 'C':
// Hmmh. There are boatloads of these... but what to do with them?
if (encodingStartsWith(csName, "cs")) {
// !!! TBI
}
break;
case 'e':
case 'E':
if (csName.startsWith("EBCDIC") ||
csName.startsWith("ebcdic")) {
return CS_EBCDIC;
}
break;
case 'i':
case 'I':
if (csName == CS_ISO_LATIN1
|| equalEncodings(csName, CS_ISO_LATIN1)
|| equalEncodings(csName, "ISO-Latin1")) {
return CS_ISO_LATIN1;
}
if (encodingStartsWith(csName, "ISO-10646")) {
/* Hmmh. There are boatloads of alternatives here, it
* seems (see http://www.iana.org/assignments/character-sets
* for details)
*/
int ix = csName.indexOf("10646");
String suffix = csName.substring(ix+5);
if (equalEncodings(suffix, "UCS-Basic")) {
return CS_US_ASCII;
}
if (equalEncodings(suffix, "Unicode-Latin1")) {
return CS_ISO_LATIN1;
}
if (equalEncodings(suffix, "UCS-2")) {
return CS_UTF16; // endianness?
}
if (equalEncodings(suffix, "UCS-4")) {
return CS_UTF32; // endianness?
}
if (equalEncodings(suffix, "UTF-1")) {
// "Universal Transfer Format (1), this is the multibyte encoding, that subsets ASCII-7"???
return CS_US_ASCII;
}
if (equalEncodings(suffix, "J-1")) {
// Name: ISO-10646-J-1, Source: ISO 10646 Japanese, see RFC 1815.
// ... so what does that really mean? let's consider it ascii
return CS_US_ASCII;
}
if (equalEncodings(suffix, "US-ASCII")) {
return CS_US_ASCII;
}
}
break;
case 'j':
case 'J':
if (equalEncodings(csName, "JIS_Encoding")) {
return CS_SHIFT_JIS;
}
break;
case 's':
case 'S':
if (equalEncodings(csName, "Shift_JIS")) {
return CS_SHIFT_JIS;
}
break;
case 'u':
case 'U':
if (csName.length() < 2) { // sanity check
break;
}
switch (csName.charAt(1)) {
case 'c':
case 'C':
if (equalEncodings(csName, "UCS-2")) {
return CS_UTF16;
}
if (equalEncodings(csName, "UCS-4")) {
return CS_UTF32;
}
break;
case 'n': // csUnicodeXxx,
case 'N':
if (gotCsPrefix) {
if (equalEncodings(csName, "Unicode")) {
return CS_UTF16; // need BOM
}
if (equalEncodings(csName, "UnicodeAscii")) {
return CS_ISO_LATIN1;
}
if (equalEncodings(csName, "UnicodeAscii")) {
return CS_US_ASCII;
}
}
break;
case 's':
case 'S':
if (equalEncodings(csName, "US-ASCII")) {
return CS_US_ASCII;
}
break;
case 't':
case 'T':
if (csName == CS_UTF8 || equalEncodings(csName, CS_UTF8)) {
return CS_UTF8;
}
if (equalEncodings(csName, "UTF-16BE")) {
return CS_UTF16BE;
}
if (equalEncodings(csName, "UTF-16LE")) {
return CS_UTF16LE;
}
if (equalEncodings(csName, "UTF-16")) {
return CS_UTF16;
}
if (equalEncodings(csName, "UTF-32BE")) {
return CS_UTF32BE;
}
if (equalEncodings(csName, "UTF-32LE")) {
return CS_UTF32LE;
}
if (equalEncodings(csName, "UTF-32")) {
return CS_UTF32;
}
if (equalEncodings(csName, "UTF")) {
// 21-Jan-2006, TSa: ??? What is this to do... ?
return CS_UTF16;
}
}
break;
}
return csName;
}
Because of legacy encodings used by earlier JDK versions, we
need to be careful when accessing encoding names via JDK
classes.
/**
* Because of legacy encodings used by earlier JDK versions, we
* need to be careful when accessing encoding names via JDK
* classes.
*/
public static String findEncodingFor(Writer w)
{
if (w instanceof OutputStreamWriter) {
String enc = ((OutputStreamWriter) w).getEncoding();
return normalize(enc);
}
return null;
}
/*
/**********************************************************
/* Internal helper methods
/**********************************************************
*/
Internal constant used to denote END-OF-STRING
/**
* Internal constant used to denote END-OF-STRING
*/
private final static int EOS = 0x10000;
Method that implements a loose String comparison for encoding Strings. It will work like String.equalsIgnoreCase
, except that it will also ignore all hyphen, underscore and space characters. /**
* Method that implements a loose String comparison for encoding
* Strings. It will work like {@link String#equalsIgnoreCase},
* except that it will also ignore all hyphen, underscore and
* space characters.
*/
public static boolean equalEncodings(String str1, String str2)
{
int len1 = str1.length();
int len2 = str2.length();
int i1 = 0, i2 = 0;
// Need to loop completely over both Strings
while (i1 < len1 || i2 < len2) {
int c1 = (i1 >= len1) ? EOS : str1.charAt(i1++);
int c2 = (i2 >= len2) ? EOS : str2.charAt(i2++);
// Can first do a quick comparison (usually they are equal)
if (c1 == c2) {
continue;
}
// if not equal, maybe there are WS/hyphen/underscores to skip
while (c1 <= CHAR_SPACE || c1 == '_' || c1 == '-') {
c1 = (i1 >= len1) ? EOS : str1.charAt(i1++);
}
while (c2 <= CHAR_SPACE || c2 == '_' || c2 == '-') {
c2 = (i2 >= len2) ? EOS : str2.charAt(i2++);
}
// Ok, how about case differences, then?
if (c1 != c2) {
// If one is EOF, can't match (one is substring of the other)
if (c1 == EOS || c2 == EOS) {
return false;
}
if (Character.toLowerCase((char)c1) != Character.toLowerCase((char)c2)) {
return false;
}
}
}
// If we got this far, we are ok as long as we got through it all
return true;
}
public static boolean encodingStartsWith(String enc, String prefix)
{
int len1 = enc.length();
int len2 = prefix.length();
int i1 = 0, i2 = 0;
// Need to loop completely over both Strings
while (i1 < len1 || i2 < len2) {
int c1 = (i1 >= len1) ? EOS : enc.charAt(i1++);
int c2 = (i2 >= len2) ? EOS : prefix.charAt(i2++);
// Can first do a quick comparison (usually they are equal)
if (c1 == c2) {
continue;
}
// if not equal, maybe there are WS/hyphen/underscores to skip
while (c1 <= CHAR_SPACE || c1 == '_' || c1 == '-') {
c1 = (i1 >= len1) ? EOS : enc.charAt(i1++);
}
while (c2 <= CHAR_SPACE || c2 == '_' || c2 == '-') {
c2 = (i2 >= len2) ? EOS : prefix.charAt(i2++);
}
// Ok, how about case differences, then?
if (c1 != c2) {
if (c2 == EOS) { // Prefix done, good!
return true;
}
if (c1 == EOS) { // Encoding done, not good
return false;
}
if (Character.toLowerCase((char)c1) != Character.toLowerCase((char)c2)) {
return false;
}
}
}
// Ok, prefix was exactly the same as encoding... that's fine
return true;
}
}