package com.fasterxml.aalto.util;

import java.util.*;

This class is used for canonicalization of namespace URIs. It will act as a layer above String.intern(), trying to reduce calls to somewhat slow intern() method, and to do that as efficiently as possible considering that Strings in question are often longer than names in xml documents.
/** * This class is used for canonicalization of namespace URIs. * It will act as a layer above String.intern(), trying to reduce * calls to somewhat slow intern() method, and to do that as efficiently * as possible considering that Strings in question are often * longer than names in xml documents. */
public final class UriCanonicalizer { private BoundedHashMap mURIs = null; public UriCanonicalizer() { } private void init() { mURIs = new BoundedHashMap(); } public synchronized String canonicalizeURI(char[] ch, int len) { CanonicalKey key = new CanonicalKey(ch, len); if (mURIs == null) { init(); } else { String result = (String) mURIs.get(key); if (result != null) { return result; } } /* Key we have is not yet stable, as the underlying array * is shared and mutable. So: */ key = key.safeClone(); // Also, now we should intern() the URI String uri = new String(ch, 0, len).intern(); mURIs.put(key, uri); return uri; } /* /////////////////////////////////////////////////// // Helper classes /////////////////////////////////////////////////// */
We'll use a bounded map, which should work well for most normal cases, but avoid excesses for degenerate cases (unique URIs used as idenfitiers etc).
/** * We'll use a bounded map, which should work well for most normal * cases, but avoid excesses for degenerate cases (unique URIs * used as idenfitiers etc). */
@SuppressWarnings("serial") final static class BoundedHashMap extends LinkedHashMap<CanonicalKey, String> {
Let's create cache big enough to usually have enough space for all/most entries for normal cases, but that won't grow indefinitely for degenerate cases
/** * Let's create cache big enough to usually have enough space for * all/most entries for normal cases, but that won't grow * indefinitely for degenerate cases */
private final static int DEFAULT_SIZE = 64; private final static int MAX_SIZE = (int) (1023 * 0.7f); // 4k primary hash public BoundedHashMap() { super(DEFAULT_SIZE, 0.7f, true); } @Override public boolean removeEldestEntry(Map.Entry<CanonicalKey, String> entry) { return (size() >= MAX_SIZE); } } final static class CanonicalKey {
Array containing characters of the canonicalized String.
/** * Array containing characters of the canonicalized String. */
final char[] mChars;
Length of canonicalized String
/** * Length of canonicalized String */
final int mLength;
Hash of the URI string, calculated using fast(er) hash function (compared to regular String).
/** * Hash of the URI string, calculated using fast(er) hash * function (compared to regular String). */
final int mHash; public CanonicalKey(char[] buffer, int len) { mChars = buffer; mLength = len; mHash = calcKeyHash(buffer, len); } public CanonicalKey(char[] buffer, int len, int hashCode) { mChars = buffer; mLength = len; mHash = hashCode; } public CanonicalKey safeClone() { char[] newBuf = new char[mLength]; System.arraycopy(mChars, 0, newBuf, 0, mLength); return new CanonicalKey(newBuf, mLength, mHash); } public static int calcKeyHash(char[] buffer, int len) { /* Short URIs are not common, but if they were to * happen, let's just use regular String.hashCode(); * it's good one, and for short strings, fast enough */ if (len <= 8) { // we know it's at least one char, though int hash = buffer[0]; // For these, let's use regular hashing method for (int i = 1; i < len; ++i) { hash = (hash * 31) + buffer[i]; } return hash; } /* Ok, longer. So first let's use length xored with first char; * usually first 4 will just be "http" anyways (and could * just be skipped for good?) */ int hash = len ^ buffer[0]; /* Otherwise, let's start with length, xor with first char, * then latter chars separated by larger and larger * spaces. The idea is to severely limit time needed * to calc hash code as URIs can get quite long. * But let's ignore last 4 chars, for now (we'll use them * all after the loop) */ int ix = 2; // start from 3rd char (buffer[2]) int dist = 2; // and skip 1 char first int end = (len - 4); while (ix < end) { hash = (hash * 31) + buffer[ix]; ix += dist; ++dist; // will skip progressively longer spans } // And then last 4 chars... hash = (hash * 31) ^ (buffer[end] << 2) + buffer[end+1]; hash = (hash * 31) + (buffer[end+2] << 2) ^ buffer[end+3]; return hash; } @Override public String toString() { return "{URI, hash: 0x"+Integer.toHexString(mHash)+"}"; } @Override public int hashCode() { return mHash; } @Override public boolean equals(Object o) { if (o == this) return true; if (o == null) return false; if (o.getClass() != getClass()) return false; CanonicalKey other = (CanonicalKey) o; if (other.mLength != mLength) return false; char[] c1 = mChars; char[] c2 = other.mChars; for (int i = 0, len = mLength; i < len; ++i) { if (c1[i] != c2[i]) { return false; } } return true; } } }