com.fasterxml/aalto-xml/1.2.2 : com/fasterxml/aalto/util/UriCanonicalizer.java

UriCanonicalizer
http://github.com/FasterXML/aalto-xml/: Ultra-high performance non-blocking XML processor (Stax/Stax2, SAX/SAX2) (FasterXML)
The Apache Software License, Version 2.0
Tatu Saloranta
package com.fasterxml.aalto.util;

import java.util.*;

This class is used for canonicalization of namespace URIs.
It will act as a layer above String.intern(), trying to reduce
calls to somewhat slow intern() method, and to do that as efficiently
as possible considering that Strings in question are often
longer than names in xml documents.
/**
 * This class is used for canonicalization of namespace URIs.
 * It will act as a layer above String.intern(), trying to reduce
 * calls to somewhat slow intern() method, and to do that as efficiently
 * as possible considering that Strings in question are often
 * longer than names in xml documents.
 */
public final class UriCanonicalizer
{
    private BoundedHashMap mURIs = null;

    public UriCanonicalizer() { }

    private void init()
    {
        mURIs = new BoundedHashMap();
    }

    public synchronized String canonicalizeURI(char[] ch, int len)
    {
        CanonicalKey key = new CanonicalKey(ch, len);
        if (mURIs == null) {
            init();
        } else {
            String result = (String) mURIs.get(key);
            if (result != null) {
                return result;
            }
        }
        /* Key we have is not yet stable, as the underlying array 
         * is shared and mutable. So:
         */
        key = key.safeClone();
        // Also, now we should intern() the URI
        String uri = new String(ch, 0, len).intern();
        mURIs.put(key, uri);
        return uri;
    }

    /*
    ///////////////////////////////////////////////////
    // Helper classes
    ///////////////////////////////////////////////////
     */

    We'll use a bounded map, which should work well for most normal
cases, but avoid excesses for degenerate cases (unique URIs
used as idenfitiers etc).
/**
     * We'll use a bounded map, which should work well for most normal
     * cases, but avoid excesses for degenerate cases (unique URIs
     * used as idenfitiers etc).
     */
    @SuppressWarnings("serial")
	final static class BoundedHashMap
        extends LinkedHashMap<CanonicalKey, String>
    {
        Let's create cache big enough to usually have enough space for
all/most entries for normal cases, but that won't grow
indefinitely for degenerate cases
/**
         * Let's create cache big enough to usually have enough space for
         * all/most entries for normal cases, but that won't grow
         * indefinitely for degenerate cases
         */
        private final static int DEFAULT_SIZE = 64;
        
        private final static int MAX_SIZE = (int) (1023 * 0.7f); // 4k primary hash

        public BoundedHashMap()
        {
            super(DEFAULT_SIZE, 0.7f, true);
        }

        @Override
            public boolean removeEldestEntry(Map.Entry<CanonicalKey, String> entry)
        {
            return (size() >= MAX_SIZE);
        }
    }

    final static class CanonicalKey
    {
        Array containing characters of the canonicalized String.
/**
         * Array containing characters of the canonicalized String.
         */
        final char[] mChars;

        Length of canonicalized String
/**
         * Length of canonicalized String
         */
        final int mLength;

        Hash of the URI string, calculated using fast(er) hash
function (compared to regular String).
/**
         * Hash of the URI string, calculated using fast(er) hash
         * function (compared to regular String).
         */
        final int mHash;

        public CanonicalKey(char[] buffer, int len)
        {
            mChars = buffer;
            mLength = len;
            mHash = calcKeyHash(buffer, len);
        }

        public CanonicalKey(char[] buffer, int len, int hashCode)
        {
            mChars = buffer;
            mLength = len;
            mHash = hashCode;
        }

        public CanonicalKey safeClone()
        {
            char[] newBuf = new char[mLength];
            System.arraycopy(mChars, 0, newBuf, 0, mLength);
            return new CanonicalKey(newBuf, mLength, mHash);
        }

        public static int calcKeyHash(char[] buffer, int len)
        {
            /* Short URIs are not common, but if they were to
             * happen, let's just use regular String.hashCode();
             * it's good one, and for short strings, fast enough
             */
            if (len <= 8) { // we know it's at least one char, though
                int hash = buffer[0];
                // For these, let's use regular hashing method
                for (int i = 1; i < len; ++i) {
                    hash = (hash * 31) + buffer[i];
                }
                return hash;
            }

            /* Ok, longer. So first let's use length xored with first char;
             * usually first 4 will just be "http" anyways (and could
             * just be skipped for good?)
             */
            int hash = len ^ buffer[0];
            
            /* Otherwise, let's start with length, xor with first char,
             * then latter chars separated by larger and larger
             * spaces. The idea is to severely limit time needed
             * to calc hash code as URIs can get quite long.
             * But let's ignore last 4 chars, for now (we'll use them
             * all after the loop)
             */
            int ix = 2; // start from 3rd char (buffer[2])
            int dist = 2; // and skip 1 char first
            int end = (len - 4);
            
            while (ix < end) {
                hash = (hash * 31) + buffer[ix];
                ix += dist;
                ++dist; // will skip progressively longer spans
            }
            
            // And then last 4 chars...
            hash = (hash * 31) ^ (buffer[end] << 2) + buffer[end+1];
            hash = (hash * 31) + (buffer[end+2] << 2) ^ buffer[end+3];
            return hash;
        }

        @Override
            public String toString() { return "{URI, hash: 0x"+Integer.toHexString(mHash)+"}"; }

        @Override
            public int hashCode() { return mHash; }

        @Override
            public boolean equals(Object o)
        {
            if (o == this) return true;
            if (o == null) return false;
            if (o.getClass() != getClass()) return false;

            CanonicalKey other = (CanonicalKey) o;
            if (other.mLength != mLength) return false;
            char[] c1 = mChars;
            char[] c2 = other.mChars;
            for (int i = 0, len = mLength; i < len; ++i) {
                if (c1[i] != c2[i]) {
                    return false;
                }
            }
            return true;
        }

    }
}
/

com.fasterxml/ aalto-xml/ 1.2.2/ com/fasterxml/aalto/util/UriCanonicalizer.java