/* Woodstox XML processor
*
* Copyright (c) 2004- Tatu Saloranta, tatu.saloranta@iki.fi
*
* Licensed under the License specified in the file LICENSE which is
* included with the source code.
* You may not use this file except in compliance with the License.
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.ctc.wstx.util;
This class is a kind of specialized type-safe Map, from char array to
String value. Specialization means that in addition to type-safety
and specific access patterns (key char array, Value optionally interned
String; values added on access if necessary), and that instances are
meant to be used concurrently, but by using well-defined mechanisms
to obtain such concurrently usable instances. Main use for the class
is to store symbol table information for things like compilers and
parsers; especially when number of symbols (keywords) is limited.
For optimal performance, usage pattern should be one where matches
should be very common (esp. after "warm-up"), and as with most hash-based
maps/sets, that hash codes are uniformly distributed. Also, collisions
are slightly more expensive than with HashMap or HashSet, since hash codes
are not used in resolving collisions; that is, equals() comparison is
done with all symbols in same bucket index.
Finally, rehashing is also more expensive, as hash codes are not
stored; rehashing requires all entries' hash codes to be recalculated.
Reason for not storing hash codes is reduced memory usage, hoping
for better memory locality.
Usual usage pattern is to create a single "master" instance, and either
use that instance in sequential fashion, or to create derived "child"
instances, which after use, are asked to return possible symbol additions
to master instance. In either case benefit is that symbol table gets
initialized so that further uses are more efficient, as eventually all
symbols needed will already be in symbol table. At that point no more
Symbol String allocations are needed, nor changes to symbol table itself.
Note that while individual SymbolTable instances are NOT thread-safe
(much like generic collection classes), concurrently used "child"
instances can be freely used without synchronization. However, using
master table concurrently with child instances can only be done if
access to master instance is read-only (ie. no modifications done).
/**
* This class is a kind of specialized type-safe Map, from char array to
* String value. Specialization means that in addition to type-safety
* and specific access patterns (key char array, Value optionally interned
* String; values added on access if necessary), and that instances are
* meant to be used concurrently, but by using well-defined mechanisms
* to obtain such concurrently usable instances. Main use for the class
* is to store symbol table information for things like compilers and
* parsers; especially when number of symbols (keywords) is limited.
*<p>
* For optimal performance, usage pattern should be one where matches
* should be very common (esp. after "warm-up"), and as with most hash-based
* maps/sets, that hash codes are uniformly distributed. Also, collisions
* are slightly more expensive than with HashMap or HashSet, since hash codes
* are not used in resolving collisions; that is, equals() comparison is
* done with all symbols in same bucket index.<br>
* Finally, rehashing is also more expensive, as hash codes are not
* stored; rehashing requires all entries' hash codes to be recalculated.
* Reason for not storing hash codes is reduced memory usage, hoping
* for better memory locality.
*<p>
* Usual usage pattern is to create a single "master" instance, and either
* use that instance in sequential fashion, or to create derived "child"
* instances, which after use, are asked to return possible symbol additions
* to master instance. In either case benefit is that symbol table gets
* initialized so that further uses are more efficient, as eventually all
* symbols needed will already be in symbol table. At that point no more
* Symbol String allocations are needed, nor changes to symbol table itself.
*<p>
* Note that while individual SymbolTable instances are NOT thread-safe
* (much like generic collection classes), concurrently used "child"
* instances can be freely used without synchronization. However, using
* master table concurrently with child instances can only be done if
* access to master instance is read-only (ie. no modifications done).
*/
public class SymbolTable {
Default initial table size; no need to make it miniscule, due
to couple of things: first, overhead of array reallocation
is significant,
and second, overhead of rehashing is also non-negligible.
Let's use 128 as the default; it allows for up to 96 symbols,
and uses about 512 bytes on 32-bit machines.
/**
* Default initial table size; no need to make it miniscule, due
* to couple of things: first, overhead of array reallocation
* is significant,
* and second, overhead of rehashing is also non-negligible.
*<p>
* Let's use 128 as the default; it allows for up to 96 symbols,
* and uses about 512 bytes on 32-bit machines.
*/
protected static final int DEFAULT_TABLE_SIZE = 128;
protected static final float DEFAULT_FILL_FACTOR = 0.75f;
protected static final String EMPTY_STRING = "";
/*
////////////////////////////////////////
// Configuration:
////////////////////////////////////////
*/
Flag that determines whether Strings to be added need to be
interned before being added or not. Forcing intern()ing will add
some overhead when adding new Strings, but may be beneficial if such
Strings are generally used by other parts of system. Note that even
without interning, all returned String instances are guaranteed
to be comparable with equality (==) operator; it's just that such
guarantees are not made for Strings other classes return.
/**
* Flag that determines whether Strings to be added need to be
* interned before being added or not. Forcing intern()ing will add
* some overhead when adding new Strings, but may be beneficial if such
* Strings are generally used by other parts of system. Note that even
* without interning, all returned String instances are guaranteed
* to be comparable with equality (==) operator; it's just that such
* guarantees are not made for Strings other classes return.
*/
protected boolean mInternStrings;
/*
////////////////////////////////////////
// Actual symbol table data:
////////////////////////////////////////
*/
Primary matching symbols; it's expected most match occur from
here.
/**
* Primary matching symbols; it's expected most match occur from
* here.
*/
protected String[] mSymbols;
Overflow buckets; if primary doesn't match, lookup is done
from here.
Note: Number of buckets is half of number of symbol entries, on
assumption there's less need for buckets.
/**
* Overflow buckets; if primary doesn't match, lookup is done
* from here.
*<p>
* Note: Number of buckets is half of number of symbol entries, on
* assumption there's less need for buckets.
*/
protected Bucket[] mBuckets;
Current size (number of entries); needed to know if and when
rehash.
/**
* Current size (number of entries); needed to know if and when
* rehash.
*/
protected int mSize;
Limit that indicates maximum size this instance can hold before
it needs to be expanded and rehashed. Calculated using fill
factor passed in to constructor.
/**
* Limit that indicates maximum size this instance can hold before
* it needs to be expanded and rehashed. Calculated using fill
* factor passed in to constructor.
*/
protected int mSizeThreshold;
Mask used to get index from hash values; equal to
mBuckets.length - 1
, when mBuckets.length is
a power of two.
/**
* Mask used to get index from hash values; equal to
* <code>mBuckets.length - 1</code>, when mBuckets.length is
* a power of two.
*/
protected int mIndexMask;
/*
////////////////////////////////////////
// Information about concurrency
////////////////////////////////////////
*/
Version of this table instance; used when deriving new concurrently
used versions from existing 'master' instance.
/**
* Version of this table instance; used when deriving new concurrently
* used versions from existing 'master' instance.
*/
protected int mThisVersion;
Flag that indicates if any changes have been made to the data;
used to both determine if bucket array needs to be copied when
(first) change is made, and potentially if updated bucket list
is to be resync'ed back to master instance.
/**
* Flag that indicates if any changes have been made to the data;
* used to both determine if bucket array needs to be copied when
* (first) change is made, and potentially if updated bucket list
* is to be resync'ed back to master instance.
*/
protected boolean mDirty;
/*
////////////////////////////////////////
// Life-cycle:
////////////////////////////////////////
*/
Method for constructing a master symbol table instance; this one
will create master instance with default size, and with interning
enabled.
/**
* Method for constructing a master symbol table instance; this one
* will create master instance with default size, and with interning
* enabled.
*/
public SymbolTable() {
this(true);
}
Method for constructing a master symbol table instance.
/**
* Method for constructing a master symbol table instance.
*/
public SymbolTable(boolean internStrings) {
this(internStrings, DEFAULT_TABLE_SIZE);
}
Method for constructing a master symbol table instance.
/**
* Method for constructing a master symbol table instance.
*/
public SymbolTable(boolean internStrings, int initialSize) {
this(internStrings, initialSize, DEFAULT_FILL_FACTOR);
}
Main method for constructing a master symbol table instance; will
be called by other public constructors.
Params: - internStrings – Whether Strings to add are intern()ed or not
- initialSize – Minimum initial size for bucket array; internally
will always use a power of two equal to or bigger than this value.
- fillFactor – Maximum fill factor allowed for bucket table;
when more entries are added, table will be expanded.
/**
* Main method for constructing a master symbol table instance; will
* be called by other public constructors.
*
* @param internStrings Whether Strings to add are intern()ed or not
* @param initialSize Minimum initial size for bucket array; internally
* will always use a power of two equal to or bigger than this value.
* @param fillFactor Maximum fill factor allowed for bucket table;
* when more entries are added, table will be expanded.
*/
public SymbolTable(boolean internStrings, int initialSize,
float fillFactor)
{
mInternStrings = internStrings;
// Let's start versions from 1
mThisVersion = 1;
// And we'll also set flags so no copying of buckets is needed:
mDirty = true;
// No point in requesting funny initial sizes...
if (initialSize < 1) {
throw new IllegalArgumentException("Can not use negative/zero initial size: "+initialSize);
}
/* Initial size has to be a power of two. Also, let's not honour
* sizes that are ridiculously small...
*/
{
int currSize = 4;
while (currSize < initialSize) {
currSize += currSize;
}
initialSize = currSize;
}
mSymbols = new String[initialSize];
mBuckets = new Bucket[initialSize >> 1];
// Mask is easy to calc for powers of two.
mIndexMask = initialSize - 1;
mSize = 0;
// Sanity check for fill factor:
if (fillFactor < 0.01f) {
throw new IllegalArgumentException("Fill factor can not be lower than 0.01.");
}
if (fillFactor > 10.0f) { // just to catch stupid values, ie. useless from performance perspective
throw new IllegalArgumentException("Fill factor can not be higher than 10.0.");
}
mSizeThreshold = (int) (initialSize * fillFactor + 0.5);
}
Internal constructor used when creating child instances.
/**
* Internal constructor used when creating child instances.
*/
private SymbolTable(boolean internStrings, String[] symbols,
Bucket[] buckets, int size, int sizeThreshold,
int indexMask, int version)
{
mInternStrings = internStrings;
mSymbols = symbols;
mBuckets = buckets;
mSize = size;
mSizeThreshold = sizeThreshold;
mIndexMask = indexMask;
mThisVersion = version;
// Need to make copies of arrays, if/when adding new entries
mDirty = false;
}
"Factory" method; will create a new child instance of this symbol
table. It will be a copy-on-write instance, ie. it will only use
read-only copy of parent's data, but when changes are needed, a
copy will be created.
Note: while data access part of this method is synchronized, it is
generally not safe to both use makeChild/mergeChild, AND to use instance
actively. Instead, a separate 'root' instance should be used
on which only makeChild/mergeChild are called, but instance itself
is not used as a symbol table.
/**
* "Factory" method; will create a new child instance of this symbol
* table. It will be a copy-on-write instance, ie. it will only use
* read-only copy of parent's data, but when changes are needed, a
* copy will be created.
*<p>
* Note: while data access part of this method is synchronized, it is
* generally not safe to both use makeChild/mergeChild, AND to use instance
* actively. Instead, a separate 'root' instance should be used
* on which only makeChild/mergeChild are called, but instance itself
* is not used as a symbol table.
*/
public SymbolTable makeChild()
{
final boolean internStrings;
final String[] symbols;
final Bucket[] buckets;
final int size;
final int sizeThreshold;
final int indexMask;
final int version;
synchronized (this) {
internStrings = mInternStrings;
symbols = mSymbols;
buckets = mBuckets;
size = mSize;
sizeThreshold = mSizeThreshold;
indexMask = mIndexMask;
version = mThisVersion+1;
}
return new SymbolTable(internStrings, symbols, buckets,
size, sizeThreshold, indexMask, version);
}
Method that allows contents of child table to potentially be
"merged in" with contents of this symbol table.
Note that caller has to make sure symbol table passed in is
really a child or sibling of this symbol table.
/**
* Method that allows contents of child table to potentially be
* "merged in" with contents of this symbol table.
*<p>
* Note that caller has to make sure symbol table passed in is
* really a child or sibling of this symbol table.
*/
public synchronized void mergeChild(SymbolTable child)
{
// Let's do a basic sanity check first:
if (child.size() <= size()) { // nothing to add
return;
}
// Okie dokie, let's get the data in!
mSymbols = child.mSymbols;
mBuckets = child.mBuckets;
mSize = child.mSize;
mSizeThreshold = child.mSizeThreshold;
mIndexMask = child.mIndexMask;
mThisVersion++; // to prevent other children from overriding
// Dirty flag... well, let's just clear it, to force copying just
// in case. Shouldn't really matter, for master tables.
mDirty = false;
/* However, we have to mark child as dirty, so that it will not
* be modifying arrays we "took over" (since child may have
* returned an updated table before it stopped fully using
* the SymbolTable: for example, it may still use it for
* parsing PI targets in epilog)
*/
child.mDirty = false;
}
/*
////////////////////////////////////////////////////
// Public API, configuration
////////////////////////////////////////////////////
*/
public void setInternStrings(boolean state) {
mInternStrings = state;
}
/*
////////////////////////////////////////////////////
// Public API, generic accessors:
////////////////////////////////////////////////////
*/
public int size() { return mSize; }
public int version() { return mThisVersion; }
public boolean isDirty() { return mDirty; }
public boolean isDirectChildOf(SymbolTable t)
{
/* Actually, this doesn't really prove it is a child (would have to
* use sequence number, or identityHash to really prove it), but
* it's good enough if relationship is known to exist.
*/
/* (for real check, one would need to child/descendant stuff; or
* at least an identity hash... or maybe even just a _static_ global
* counter for instances... maybe that would actually be worth
* doing?)
*/
if (mThisVersion == (t.mThisVersion + 1)) {
return true;
}
return false;
}
/*
////////////////////////////////////////////////////
// Public API, accessing symbols:
////////////////////////////////////////////////////
*/
Main access method; will check if actual symbol String exists;
if so, returns it; if not, will create, add and return it.
Returns: The symbol matching String in input array
/**
* Main access method; will check if actual symbol String exists;
* if so, returns it; if not, will create, add and return it.
*
* @return The symbol matching String in input array
*/
/*
public String findSymbol(char[] buffer, int start, int len)
{
return findSymbol(buffer, start, len, calcHash(buffer, start, len));
}
*/
public String findSymbol(char[] buffer, int start, int len, int hash)
{
// Sanity check:
if (len < 1) {
return EMPTY_STRING;
}
hash &= mIndexMask;
String sym = mSymbols[hash];
// Optimal case; checking existing primary symbol for hash index:
if (sym != null) {
// Let's inline primary String equality checking:
if (sym.length() == len) {
int i = 0;
do {
if (sym.charAt(i) != buffer[start+i]) {
break;
}
} while (++i < len);
// Optimal case; primary match found
if (i == len) {
return sym;
}
}
// How about collision bucket?
Bucket b = mBuckets[hash >> 1];
if (b != null) {
sym = b.find(buffer, start, len);
if (sym != null) {
return sym;
}
}
}
// Need to expand?
if (mSize >= mSizeThreshold) {
rehash();
/* Need to recalc hash; rare occurence (index mask has been
* recalculated as part of rehash)
*/
hash = calcHash(buffer, start, len) & mIndexMask;
} else if (!mDirty) {
// Or perhaps we need to do copy-on-write?
copyArrays();
mDirty = true;
}
++mSize;
String newSymbol = new String(buffer, start, len);
if (mInternStrings) {
newSymbol = newSymbol.intern();
}
// Ok; do we need to add primary entry, or a bucket?
if (mSymbols[hash] == null) {
mSymbols[hash] = newSymbol;
} else {
int bix = hash >> 1;
mBuckets[bix] = new Bucket(newSymbol, mBuckets[bix]);
}
return newSymbol;
}
Similar to {link #findSymbol}, but will not add passed in symbol
if it is not in symbol table yet.
/**
* Similar to {link #findSymbol}, but will not add passed in symbol
* if it is not in symbol table yet.
*/
public String findSymbolIfExists(char[] buffer, int start, int len, int hash)
{
// Sanity check:
if (len < 1) {
return EMPTY_STRING;
}
hash &= mIndexMask;
String sym = mSymbols[hash];
// Optimal case; checking existing primary symbol for hash index:
if (sym != null) {
// Let's inline primary String equality checking:
if (sym.length() == len) {
int i = 0;
do {
if (sym.charAt(i) != buffer[start+i]) {
break;
}
} while (++i < len);
// Optimal case; primary match found
if (i == len) {
return sym;
}
}
// How about collision bucket?
Bucket b = mBuckets[hash >> 1];
if (b != null) {
sym = b.find(buffer, start, len);
if (sym != null) {
return sym;
}
}
}
return null;
}
Similar to to findSymbol(char[], int, int, int)
; used to either do potentially cheap intern() (if table already has intern()ed version), or to pre-populate symbol table with known values. /**
* Similar to to {@link #findSymbol(char[],int,int,int)}; used to either
* do potentially cheap intern() (if table already has intern()ed version),
* or to pre-populate symbol table with known values.
*/
public String findSymbol(String str)
{
int len = str.length();
// Sanity check:
if (len < 1) {
return EMPTY_STRING;
}
int index = calcHash(str) & mIndexMask;
String sym = mSymbols[index];
// Optimal case; checking existing primary symbol for hash index:
if (sym != null) {
// Let's inline primary String equality checking:
if (sym.length() == len) {
int i = 0;
for (; i < len; ++i) {
if (sym.charAt(i) != str.charAt(i)) {
break;
}
}
// Optimal case; primary match found
if (i == len) {
return sym;
}
}
// How about collision bucket?
Bucket b = mBuckets[index >> 1];
if (b != null) {
sym = b.find(str);
if (sym != null) {
return sym;
}
}
}
// Need to expand?
if (mSize >= mSizeThreshold) {
rehash();
/* Need to recalc hash; rare occurence (index mask has been
* recalculated as part of rehash)
*/
index = calcHash(str) & mIndexMask;
} else if (!mDirty) {
// Or perhaps we need to do copy-on-write?
copyArrays();
mDirty = true;
}
++mSize;
if (mInternStrings) {
str = str.intern();
}
// Ok; do we need to add primary entry, or a bucket?
if (mSymbols[index] == null) {
mSymbols[index] = str;
} else {
int bix = index >> 1;
mBuckets[bix] = new Bucket(str, mBuckets[bix]);
}
return str;
}
Implementation of a hashing method for variable length
Strings. Most of the time intention is that this calculation
is done by caller during parsing, not here; however, sometimes
it needs to be done for parsed "String" too.
Params: - len – Length of String; has to be at least 1 (caller guarantees
this pre-condition)
/**
* Implementation of a hashing method for variable length
* Strings. Most of the time intention is that this calculation
* is done by caller during parsing, not here; however, sometimes
* it needs to be done for parsed "String" too.
*
* @param len Length of String; has to be at least 1 (caller guarantees
* this pre-condition)
*/
@SuppressWarnings("cast")
public static int calcHash(char[] buffer, int start, int len) {
int hash = (int) buffer[start];
for (int i = 1; i < len; ++i) {
hash = (hash * 31) + (int) buffer[start+i];
}
return hash;
}
@SuppressWarnings("cast")
public static int calcHash(String key) {
int hash = (int) key.charAt(0);
for (int i = 1, len = key.length(); i < len; ++i) {
hash = (hash * 31) + (int) key.charAt(i);
}
return hash;
}
/*
//////////////////////////////////////////////////////////
// Internal methods
//////////////////////////////////////////////////////////
*/
Method called when copy-on-write is needed; generally when first
change is made to a derived symbol table.
/**
* Method called when copy-on-write is needed; generally when first
* change is made to a derived symbol table.
*/
private void copyArrays() {
String[] oldSyms = mSymbols;
int size = oldSyms.length;
mSymbols = new String[size];
System.arraycopy(oldSyms, 0, mSymbols, 0, size);
Bucket[] oldBuckets = mBuckets;
size = oldBuckets.length;
mBuckets = new Bucket[size];
System.arraycopy(oldBuckets, 0, mBuckets, 0, size);
}
Method called when size (number of entries) of symbol table grows
so big that load factor is exceeded. Since size has to remain
power of two, arrays will then always be doubled. Main work
is really redistributing old entries into new String/Bucket
entries.
/**
* Method called when size (number of entries) of symbol table grows
* so big that load factor is exceeded. Since size has to remain
* power of two, arrays will then always be doubled. Main work
* is really redistributing old entries into new String/Bucket
* entries.
*/
private void rehash()
{
int size = mSymbols.length;
int newSize = size + size;
String[] oldSyms = mSymbols;
Bucket[] oldBuckets = mBuckets;
mSymbols = new String[newSize];
mBuckets = new Bucket[newSize >> 1];
// Let's update index mask, threshold, now (needed for rehashing)
mIndexMask = newSize - 1;
mSizeThreshold += mSizeThreshold;
int count = 0; // let's do sanity check
/* Need to do two loops, unfortunately, since spillover area is
* only half the size:
*/
for (int i = 0; i < size; ++i) {
String symbol = oldSyms[i];
if (symbol != null) {
++count;
int index = calcHash(symbol) & mIndexMask;
if (mSymbols[index] == null) {
mSymbols[index] = symbol;
} else {
int bix = index >> 1;
mBuckets[bix] = new Bucket(symbol, mBuckets[bix]);
}
}
}
size >>= 1;
for (int i = 0; i < size; ++i) {
Bucket b = oldBuckets[i];
while (b != null) {
++count;
String symbol = b.getSymbol();
int index = calcHash(symbol) & mIndexMask;
if (mSymbols[index] == null) {
mSymbols[index] = symbol;
} else {
int bix = index >> 1;
mBuckets[bix] = new Bucket(symbol, mBuckets[bix]);
}
b = b.getNext();
}
}
if (count != mSize) {
throw new IllegalStateException("Internal error on SymbolTable.rehash(): had "+mSize+" entries; now have "+count+".");
}
}
/*
//////////////////////////////////////////////////////////
// Test/debug support:
//////////////////////////////////////////////////////////
*/
public double calcAvgSeek() {
int count = 0;
for (int i = 0, len = mSymbols.length; i < len; ++i) {
if (mSymbols[i] != null) {
++count;
}
}
for (int i = 0, len = mBuckets.length; i < len; ++i) {
Bucket b = mBuckets[i];
int cost = 2;
while (b != null) {
count += cost;
++cost;
b = b.getNext();
}
}
return ((double) count) / ((double) mSize);
}
/*
//////////////////////////////////////////////////////////
// Bucket class
//////////////////////////////////////////////////////////
*/
This class is a symbol table entry. Each entry acts as a node
in a linked list.
/**
* This class is a symbol table entry. Each entry acts as a node
* in a linked list.
*/
static final class Bucket {
private final String mSymbol;
private final Bucket mNext;
public Bucket(String symbol, Bucket next) {
mSymbol = symbol;
mNext = next;
}
public String getSymbol() { return mSymbol; }
public Bucket getNext() { return mNext; }
public String find(char[] buf, int start, int len) {
String sym = mSymbol;
Bucket b = mNext;
while (true) { // Inlined equality comparison:
if (sym.length() == len) {
int i = 0;
do {
if (sym.charAt(i) != buf[start+i]) {
break;
}
} while (++i < len);
if (i == len) {
return sym;
}
}
if (b == null) {
break;
}
sym = b.getSymbol();
b = b.getNext();
}
return null;
}
public String find(String str) {
String sym = mSymbol;
Bucket b = mNext;
while (true) {
if (sym.equals(str)) {
return sym;
}
if (b == null) {
break;
}
sym = b.getSymbol();
b = b.getNext();
}
return null;
}
}
}