package com.fasterxml.jackson.core.sym;
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicReference;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.util.InternCache;
Replacement for BytesToNameCanonicalizer
which aims at more localized
memory access due to flattening of name quad data.
Performance improvement modest for simple JSON document data binding (maybe 3%),
but should help more for larger symbol tables, or for binary formats like Smile.
Hash area is divided into 4 sections:
- Primary area (1/2 of total size), direct match from hash (LSB)
- Secondary area (1/4 of total size), match from
hash (LSB) >> 1
- Tertiary area (1/8 of total size), match from
hash (LSB) >> 2
- Spill-over area (remaining 1/8) with linear scan, insertion order
and within every area, entries are 4 int
s, where 1 - 3 ints contain 1 - 12 UTF-8 encoded bytes of name (null-padded), and last int is offset in _names
that contains actual name Strings. Since: 2.6
/**
* Replacement for <code>BytesToNameCanonicalizer</code> which aims at more localized
* memory access due to flattening of name quad data.
* Performance improvement modest for simple JSON document data binding (maybe 3%),
* but should help more for larger symbol tables, or for binary formats like Smile.
*<p>
* Hash area is divided into 4 sections:
*<ol>
* <li>Primary area (1/2 of total size), direct match from hash (LSB)</li>
* <li>Secondary area (1/4 of total size), match from {@code hash (LSB) >> 1}</li>
* <li>Tertiary area (1/8 of total size), match from {@code hash (LSB) >> 2}</li>
* <li>Spill-over area (remaining 1/8) with linear scan, insertion order</li>
* <li></li>
* </ol>
* and within every area, entries are 4 {@code int}s, where 1 - 3 ints contain 1 - 12
* UTF-8 encoded bytes of name (null-padded), and last int is offset in
* {@code _names} that contains actual name Strings.
*
* @since 2.6
*/
public final class ByteQuadsCanonicalizer
{
Initial size of the primary hash area. Each entry consumes 4 ints (16 bytes),
and secondary area is same as primary; so default size will use 2kB of memory
(plus 64x4 or 64x8 (256/512 bytes) for references to Strings, and Strings
themselves).
/**
* Initial size of the primary hash area. Each entry consumes 4 ints (16 bytes),
* and secondary area is same as primary; so default size will use 2kB of memory
* (plus 64x4 or 64x8 (256/512 bytes) for references to Strings, and Strings
* themselves).
*/
private static final int DEFAULT_T_SIZE = 64;
// private static final int DEFAULT_T_SIZE = 256;
Let's not expand symbol tables past some maximum size;
with unique (~= random) names.
Size is in
/**
* Let's not expand symbol tables past some maximum size;
* with unique (~= random) names.
* Size is in
*/
private static final int MAX_T_SIZE = 0x10000; // 64k entries == 2M mem hash area
No point in trying to construct tiny tables, just need to resize soon.
/**
* No point in trying to construct tiny tables, just need to resize soon.
*/
private final static int MIN_HASH_SIZE = 16;
Let's only share reasonably sized symbol tables. Max size set to 3/4 of 8k;
this corresponds to 256k main hash index. This should allow for enough distinct
names for almost any case, while preventing ballooning for cases where names
are unique (or close thereof).
/**
* Let's only share reasonably sized symbol tables. Max size set to 3/4 of 8k;
* this corresponds to 256k main hash index. This should allow for enough distinct
* names for almost any case, while preventing ballooning for cases where names
* are unique (or close thereof).
*/
final static int MAX_ENTRIES_FOR_REUSE = 6000;
/*
/**********************************************************
/* Linkage, needed for merging symbol tables
/**********************************************************
*/
Reference to the root symbol table, for child tables, so
that they can merge table information back as necessary.
/**
* Reference to the root symbol table, for child tables, so
* that they can merge table information back as necessary.
*/
final private ByteQuadsCanonicalizer _parent;
Member that is only used by the root table instance: root
passes immutable state info child instances, and children
may return new state if they add entries to the table.
Child tables do NOT use the reference.
/**
* Member that is only used by the root table instance: root
* passes immutable state info child instances, and children
* may return new state if they add entries to the table.
* Child tables do NOT use the reference.
*/
final private AtomicReference<TableInfo> _tableInfo;
Seed value we use as the base to make hash codes non-static between
different runs, but still stable for lifetime of a single symbol table
instance.
This is done for security reasons, to avoid potential DoS attack via
hash collisions.
/**
* Seed value we use as the base to make hash codes non-static between
* different runs, but still stable for lifetime of a single symbol table
* instance.
* This is done for security reasons, to avoid potential DoS attack via
* hash collisions.
*/
final private int _seed;
/*
/**********************************************************
/* Configuration
/**********************************************************
*/
Whether canonical symbol Strings are to be intern()ed before added
to the table or not.
NOTE: non-final to allow disabling intern()ing in case of excessive
collisions.
/**
* Whether canonical symbol Strings are to be intern()ed before added
* to the table or not.
*<p>
* NOTE: non-final to allow disabling intern()ing in case of excessive
* collisions.
*/
private boolean _intern;
Flag that indicates whether we should throw an exception if enough
hash collisions are detected (true); or just worked around (false).
Since: 2.4
/**
* Flag that indicates whether we should throw an exception if enough
* hash collisions are detected (true); or just worked around (false).
*
* @since 2.4
*/
private final boolean _failOnDoS;
/*
/**********************************************************
/* First, main hash area info
/**********************************************************
*/
Primary hash information area: consists of 2 * _hashSize
entries of 16 bytes (4 ints), arranged in a cascading lookup
structure (details of which may be tweaked depending on expected rates
of collisions).
/**
* Primary hash information area: consists of <code>2 * _hashSize</code>
* entries of 16 bytes (4 ints), arranged in a cascading lookup
* structure (details of which may be tweaked depending on expected rates
* of collisions).
*/
private int[] _hashArea;
Number of slots for primary entries within _hashArea
; which is at most 1/8
of actual size of the underlying array (4-int slots,
primary covers only half of the area; plus, additional area for longer
symbols after hash area).
/**
* Number of slots for primary entries within {@link #_hashArea}; which is
* at most <code>1/8</code> of actual size of the underlying array (4-int slots,
* primary covers only half of the area; plus, additional area for longer
* symbols after hash area).
*/
private int _hashSize;
Offset within _hashArea
where secondary entries start /**
* Offset within {@link #_hashArea} where secondary entries start
*/
private int _secondaryStart;
Offset within _hashArea
where tertiary entries start /**
* Offset within {@link #_hashArea} where tertiary entries start
*/
private int _tertiaryStart;
Constant that determines size of buckets for tertiary entries:
1 << _tertiaryShift
is the size, and shift value
is also used for translating from primary offset into
tertiary bucket (shift right by 4 + _tertiaryShift
).
Default value is 2, for buckets of 4 slots; grows bigger with
bigger table sizes.
/**
* Constant that determines size of buckets for tertiary entries:
* <code>1 << _tertiaryShift</code> is the size, and shift value
* is also used for translating from primary offset into
* tertiary bucket (shift right by <code>4 + _tertiaryShift</code>).
*<p>
* Default value is 2, for buckets of 4 slots; grows bigger with
* bigger table sizes.
*/
private int _tertiaryShift;
Total number of Strings in the symbol table; only used for child tables.
/**
* Total number of Strings in the symbol table; only used for child tables.
*/
private int _count;
/**
* Array that contains <code>String</code> instances matching
* entries in {@link #_hashArea}.
* Contains nulls for unused entries. Note that this size is twice
* that of {@link #_hashArea}
*/
private String[] _names;
/*
/**********************************************************
/* Then information on collisions etc
/**********************************************************
*/
Pointer to the offset within spill-over area where there is room for more spilled over entries (if any). Spill over area is within fixed-size portion of _hashArea
. /**
* Pointer to the offset within spill-over area where there is room
* for more spilled over entries (if any).
* Spill over area is within fixed-size portion of {@link #_hashArea}.
*/
private int _spilloverEnd;
/**
* Offset within {@link #_hashArea} that follows main slots and contains
* quads for longer names (13 bytes or longer), and points to the
* first available int that may be used for appending quads of the next
* long name.
* Note that long name area follows immediately after the fixed-size
* main hash area ({@link #_hashArea}).
*/
private int _longNameOffset;
/*
/**********************************************************
/* Sharing, versioning
/**********************************************************
*/
// // // Which of the buffers may be shared (and are copy-on-write)?
Flag that indicates whether underlying data structures for
the main hash area are shared or not. If they are, then they
need to be handled in copy-on-write way, i.e. if they need
to be modified, a copy needs to be made first; at this point
it will not be shared any more, and can be modified.
This flag needs to be checked both when adding new main entries,
and when adding new collision list queues (i.e. creating a new
collision list head entry)
/**
* Flag that indicates whether underlying data structures for
* the main hash area are shared or not. If they are, then they
* need to be handled in copy-on-write way, i.e. if they need
* to be modified, a copy needs to be made first; at this point
* it will not be shared any more, and can be modified.
*<p>
* This flag needs to be checked both when adding new main entries,
* and when adding new collision list queues (i.e. creating a new
* collision list head entry)
*/
private boolean _hashShared;
/*
/**********************************************************
/* Life-cycle: constructors
/**********************************************************
*/
Constructor used for creating per-JsonFactory
"root"
symbol tables: ones used for merging and sharing common symbols
Params: - sz – Initial primary hash area size
- intern – Whether Strings contained should be
String.intern
ed - seed – Random seed valued used to make it more difficult to cause
collisions (used for collision-based DoS attacks).
/**
* Constructor used for creating per-<code>JsonFactory</code> "root"
* symbol tables: ones used for merging and sharing common symbols
*
* @param sz Initial primary hash area size
* @param intern Whether Strings contained should be {@link String#intern}ed
* @param seed Random seed valued used to make it more difficult to cause
* collisions (used for collision-based DoS attacks).
*/
private ByteQuadsCanonicalizer(int sz, boolean intern, int seed, boolean failOnDoS) {
_parent = null;
_seed = seed;
_intern = intern;
_failOnDoS = failOnDoS;
// Sanity check: let's now allow hash sizes below certain minimum value
if (sz < MIN_HASH_SIZE) {
sz = MIN_HASH_SIZE;
} else {
// Also; size must be 2^N; otherwise hash algorithm won't
// work... so let's just pad it up, if so
if ((sz & (sz - 1)) != 0) { // only true if it's 2^N
int curr = MIN_HASH_SIZE;
while (curr < sz) {
curr += curr;
}
sz = curr;
}
}
_tableInfo = new AtomicReference<TableInfo>(TableInfo.createInitial(sz));
}
Constructor used when creating a child instance
/**
* Constructor used when creating a child instance
*/
private ByteQuadsCanonicalizer(ByteQuadsCanonicalizer parent, boolean intern,
int seed, boolean failOnDoS, TableInfo state)
{
_parent = parent;
_seed = seed;
_intern = intern;
_failOnDoS = failOnDoS;
_tableInfo = null; // not used by child tables
// Then copy shared state
_count = state.count;
_hashSize = state.size;
_secondaryStart = _hashSize << 2; // right after primary area
_tertiaryStart = _secondaryStart + (_secondaryStart >> 1); // right after secondary
_tertiaryShift = state.tertiaryShift;
_hashArea = state.mainHash;
_names = state.names;
_spilloverEnd = state.spilloverEnd;
_longNameOffset = state.longNameOffset;
// and then set other state to reflect sharing status
_hashShared = true;
}
/*
/**********************************************************
/* Life-cycle: factory methods, merging
/**********************************************************
*/
Factory method to call to create a symbol table instance with a
randomized seed value.
/**
* Factory method to call to create a symbol table instance with a
* randomized seed value.
*/
public static ByteQuadsCanonicalizer createRoot() {
// Need to use a variable seed, to thwart hash-collision based attacks.
// 14-Feb-2017, tatu: Does this actually help?
long now = System.currentTimeMillis();
// ensure it's not 0; and might as well require to be odd so:
int seed = (((int) now) + ((int) (now >>> 32))) | 1;
return createRoot(seed);
}
// Factory method that should only be called from unit tests, where seed
// value should remain the same.
protected static ByteQuadsCanonicalizer createRoot(int seed) {
return new ByteQuadsCanonicalizer(DEFAULT_T_SIZE, true, seed, true);
}
Factory method used to create actual symbol table instance to
use for parsing.
/**
* Factory method used to create actual symbol table instance to
* use for parsing.
*/
public ByteQuadsCanonicalizer makeChild(int flags) {
return new ByteQuadsCanonicalizer(this,
JsonFactory.Feature.INTERN_FIELD_NAMES.enabledIn(flags),
_seed,
JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW.enabledIn(flags),
_tableInfo.get());
}
Method called by the using code to indicate it is done with this instance.
This lets instance merge accumulated changes into parent (if need be),
safely and efficiently, and without calling code having to know about parent
information.
/**
* Method called by the using code to indicate it is done with this instance.
* This lets instance merge accumulated changes into parent (if need be),
* safely and efficiently, and without calling code having to know about parent
* information.
*/
public void release()
{
// we will try to merge if child table has new entries
// 28-Jul-2019, tatu: From [core#548]: do not share if immediate rehash needed
if ((_parent != null) && maybeDirty()) {
_parent.mergeChild(new TableInfo(this));
// Let's also mark this instance as dirty, so that just in
// case release was too early, there's no corruption of possibly shared data.
_hashShared = true;
}
}
private void mergeChild(TableInfo childState)
{
final int childCount = childState.count;
TableInfo currState = _tableInfo.get();
// Should usually grow; but occasionally could also shrink if (but only if)
// collision list overflow ends up clearing some collision lists.
if (childCount == currState.count) {
return;
}
// One caveat: let's try to avoid problems with degenerate cases of documents with
// generated "random" names: for these, symbol tables would bloat indefinitely.
// One way to do this is to just purge tables if they grow
// too large, and that's what we'll do here.
if (childCount > MAX_ENTRIES_FOR_REUSE) {
// At any rate, need to clean up the tables
childState = TableInfo.createInitial(DEFAULT_T_SIZE);
}
_tableInfo.compareAndSet(currState, childState);
}
/*
/**********************************************************
/* API, accessors
/**********************************************************
*/
public int size()
{
if (_tableInfo != null) { // root table
return _tableInfo.get().count;
}
// nope, child table
return _count;
}
Returns number of primary slots table has currently
/**
* Returns number of primary slots table has currently
*/
public int bucketCount() { return _hashSize; }
Method called to check to quickly see if a child symbol table
may have gotten additional entries. Used for checking to see
if a child table should be merged into shared table.
/**
* Method called to check to quickly see if a child symbol table
* may have gotten additional entries. Used for checking to see
* if a child table should be merged into shared table.
*/
public boolean maybeDirty() { return !_hashShared; }
public int hashSeed() { return _seed; }
Method mostly needed by unit tests; calculates number of
entries that are in the primary slot set. These are
"perfect" entries, accessible with a single lookup
/**
* Method mostly needed by unit tests; calculates number of
* entries that are in the primary slot set. These are
* "perfect" entries, accessible with a single lookup
*/
public int primaryCount()
{
int count = 0;
for (int offset = 3, end = _secondaryStart; offset < end; offset += 4) {
if (_hashArea[offset] != 0) {
++count;
}
}
return count;
}
Method mostly needed by unit tests; calculates number of entries
in secondary buckets
/**
* Method mostly needed by unit tests; calculates number of entries
* in secondary buckets
*/
public int secondaryCount() {
int count = 0;
int offset = _secondaryStart + 3;
for (int end = _tertiaryStart; offset < end; offset += 4) {
if (_hashArea[offset] != 0) {
++count;
}
}
return count;
}
Method mostly needed by unit tests; calculates number of entries
in tertiary buckets
/**
* Method mostly needed by unit tests; calculates number of entries
* in tertiary buckets
*/
public int tertiaryCount() {
int count = 0;
int offset = _tertiaryStart + 3; // to 1.5x, starting point of tertiary
for (int end = offset + _hashSize; offset < end; offset += 4) {
if (_hashArea[offset] != 0) {
++count;
}
}
return count;
}
Method mostly needed by unit tests; calculates number of entries
in shared spillover area
/**
* Method mostly needed by unit tests; calculates number of entries
* in shared spillover area
*/
public int spilloverCount() {
// difference between spillover end, start, divided by 4 (four ints per slot)
return (_spilloverEnd - _spilloverStart()) >> 2;
}
public int totalCount()
{
int count = 0;
for (int offset = 3, end = (_hashSize << 3); offset < end; offset += 4) {
if (_hashArea[offset] != 0) {
++count;
}
}
return count;
}
@Override
public String toString() {
int pri = primaryCount();
int sec = secondaryCount();
int tert = tertiaryCount();
int spill = spilloverCount();
int total = totalCount();
return String.format("[%s: size=%d, hashSize=%d, %d/%d/%d/%d pri/sec/ter/spill (=%s), total:%d]",
getClass().getName(), _count, _hashSize,
pri, sec, tert, spill, (pri+sec+tert+spill), total);
}
/*
/**********************************************************
/* Public API, accessing symbols
/**********************************************************
*/
public String findName(int q1)
{
int offset = _calcOffset(calcHash(q1));
// first: primary match?
final int[] hashArea = _hashArea;
int len = hashArea[offset+3];
if (len == 1) {
if (hashArea[offset] == q1) {
return _names[offset >> 2];
}
} else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
return null;
}
// secondary? single slot shared by N/2 primaries
int offset2 = _secondaryStart + ((offset >> 3) << 2);
len = hashArea[offset2+3];
if (len == 1) {
if (hashArea[offset2] == q1) {
return _names[offset2 >> 2];
}
} else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
return null;
}
// tertiary lookup & spillovers best to offline
return _findSecondary(offset, q1);
}
public String findName(int q1, int q2)
{
int offset = _calcOffset(calcHash(q1, q2));
final int[] hashArea = _hashArea;
int len = hashArea[offset+3];
if (len == 2) {
if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1])) {
return _names[offset >> 2];
}
} else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
return null;
}
// secondary?
int offset2 = _secondaryStart + ((offset >> 3) << 2);
len = hashArea[offset2+3];
if (len == 2) {
if ((q1 == hashArea[offset2]) && (q2 == hashArea[offset2+1])) {
return _names[offset2 >> 2];
}
} else if (len == 0) { // empty slot? Short-circuit if no more spillovers
return null;
}
return _findSecondary(offset, q1, q2);
}
public String findName(int q1, int q2, int q3)
{
int offset = _calcOffset(calcHash(q1, q2, q3));
final int[] hashArea = _hashArea;
int len = hashArea[offset+3];
if (len == 3) {
if ((q1 == hashArea[offset]) && (hashArea[offset+1] == q2) && (hashArea[offset+2] == q3)) {
return _names[offset >> 2];
}
} else if (len == 0) { // empty slot; unlikely but avoid further lookups if so
return null;
}
// secondary?
int offset2 = _secondaryStart + ((offset >> 3) << 2);
len = hashArea[offset2+3];
if (len == 3) {
if ((q1 == hashArea[offset2]) && (hashArea[offset2+1] == q2) && (hashArea[offset2+2] == q3)) {
return _names[offset2 >> 2];
}
} else if (len == 0) { // empty slot? Short-circuit if no more spillovers
return null;
}
return _findSecondary(offset, q1, q2, q3);
}
public String findName(int[] q, int qlen)
{
/* This version differs significantly, because longer names do not fit within cell.
* Rather, they contain hash in main slot, and offset+length to extension area
* that contains actual quads.
*/
if (qlen < 4) { // another sanity check
switch (qlen) {
case 3:
return findName(q[0], q[1], q[2]);
case 2:
return findName(q[0], q[1]);
case 1:
return findName(q[0]);
default: // if 0 ever passed
return "";
}
}
final int hash = calcHash(q, qlen);
int offset = _calcOffset(hash);
final int[] hashArea = _hashArea;
final int len = hashArea[offset+3];
if ((hash == hashArea[offset]) && (len == qlen)) {
// probable but not guaranteed: verify
if (_verifyLongName(q, qlen, hashArea[offset+1])) {
return _names[offset >> 2];
}
}
if (len == 0) { // empty slot; unlikely but avoid further lookups if so
return null;
}
// secondary?
int offset2 = _secondaryStart + ((offset >> 3) << 2);
final int len2 = hashArea[offset2+3];
if ((hash == hashArea[offset2]) && (len2 == qlen)) {
if (_verifyLongName(q, qlen, hashArea[offset2+1])) {
return _names[offset2 >> 2];
}
}
return _findSecondary(offset, hash, q, qlen);
}
private final int _calcOffset(int hash)
{
// NOTE: simple for initial impl, but we may want to interleave it a bit
// in near future
// So: first, hash into primary hash index
int ix = hash & (_hashSize-1);
// keeping in mind we have 4 ints per entry
return (ix << 2);
}
/*
/**********************************************************
/* Access from spill-over areas
/**********************************************************
*/
private String _findSecondary(int origOffset, int q1)
{
// tertiary area division is dynamic. First; its size is N/4 compared to
// primary hash size; and offsets are for 4 int slots. So to get to logical
// index would shift by 4. But! Tertiary area is further split into buckets,
// determined by shift value. And finally, from bucket back into physical offsets
int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
final int[] hashArea = _hashArea;
final int bucketSize = (1 << _tertiaryShift);
for (int end = offset + bucketSize; offset < end; offset += 4) {
int len = hashArea[offset+3];
if ((q1 == hashArea[offset]) && (1 == len)) {
return _names[offset >> 2];
}
if (len == 0) {
return null;
}
}
// but if tertiary full, check out spill-over area as last resort
// shared spillover starts at 7/8 of the main hash area
// (which is sized at 2 * _hashSize), so:
for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
if ((q1 == hashArea[offset]) && (1 == hashArea[offset+3])) {
return _names[offset >> 2];
}
}
return null;
}
private String _findSecondary(int origOffset, int q1, int q2)
{
int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
final int[] hashArea = _hashArea;
final int bucketSize = (1 << _tertiaryShift);
for (int end = offset + bucketSize; offset < end; offset += 4) {
int len = hashArea[offset+3];
if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (2 == len)) {
return _names[offset >> 2];
}
if (len == 0) {
return null;
}
}
for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (2 == hashArea[offset+3])) {
return _names[offset >> 2];
}
}
return null;
}
private String _findSecondary(int origOffset, int q1, int q2, int q3)
{
int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
final int[] hashArea = _hashArea;
final int bucketSize = (1 << _tertiaryShift);
for (int end = offset + bucketSize; offset < end; offset += 4) {
int len = hashArea[offset+3];
if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (q3 == hashArea[offset+2]) && (3 == len)) {
return _names[offset >> 2];
}
if (len == 0) {
return null;
}
}
for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
if ((q1 == hashArea[offset]) && (q2 == hashArea[offset+1]) && (q3 == hashArea[offset+2])
&& (3 == hashArea[offset+3])) {
return _names[offset >> 2];
}
}
return null;
}
private String _findSecondary(int origOffset, int hash, int[] q, int qlen)
{
int offset = _tertiaryStart + ((origOffset >> (_tertiaryShift + 2)) << _tertiaryShift);
final int[] hashArea = _hashArea;
final int bucketSize = (1 << _tertiaryShift);
for (int end = offset + bucketSize; offset < end; offset += 4) {
int len = hashArea[offset+3];
if ((hash == hashArea[offset]) && (qlen == len)) {
if (_verifyLongName(q, qlen, hashArea[offset+1])) {
return _names[offset >> 2];
}
}
if (len == 0) {
return null;
}
}
for (offset = _spilloverStart(); offset < _spilloverEnd; offset += 4) {
if ((hash == hashArea[offset]) && (qlen == hashArea[offset+3])) {
if (_verifyLongName(q, qlen, hashArea[offset+1])) {
return _names[offset >> 2];
}
}
}
return null;
}
private boolean _verifyLongName(int[] q, int qlen, int spillOffset)
{
final int[] hashArea = _hashArea;
// spillOffset assumed to be physical index right into quad string
int ix = 0;
switch (qlen) {
default:
return _verifyLongName2(q, qlen, spillOffset);
case 8:
if (q[ix++] != hashArea[spillOffset++]) return false;
case 7:
if (q[ix++] != hashArea[spillOffset++]) return false;
case 6:
if (q[ix++] != hashArea[spillOffset++]) return false;
case 5:
if (q[ix++] != hashArea[spillOffset++]) return false;
case 4: // always at least 4
if (q[ix++] != hashArea[spillOffset++]) return false;
if (q[ix++] != hashArea[spillOffset++]) return false;
if (q[ix++] != hashArea[spillOffset++]) return false;
if (q[ix++] != hashArea[spillOffset++]) return false;
}
return true;
}
private boolean _verifyLongName2(int[] q, int qlen, int spillOffset)
{
int ix = 0;
do {
if (q[ix++] != _hashArea[spillOffset++]) {
return false;
}
} while (ix < qlen);
return true;
}
/*
/**********************************************************
/* API, mutators
/**********************************************************
*/
public String addName(String name, int q1) {
_verifySharing();
if (_intern) {
name = InternCache.instance.intern(name);
}
int offset = _findOffsetForAdd(calcHash(q1));
_hashArea[offset] = q1;
_hashArea[offset+3] = 1;
_names[offset >> 2] = name;
++_count;
return name;
}
public String addName(String name, int q1, int q2) {
_verifySharing();
if (_intern) {
name = InternCache.instance.intern(name);
}
int hash = (q2 == 0) ? calcHash(q1) : calcHash(q1, q2);
int offset = _findOffsetForAdd(hash);
_hashArea[offset] = q1;
_hashArea[offset+1] = q2;
_hashArea[offset+3] = 2;
_names[offset >> 2] = name;
++_count;
return name;
}
public String addName(String name, int q1, int q2, int q3) {
_verifySharing();
if (_intern) {
name = InternCache.instance.intern(name);
}
int offset = _findOffsetForAdd(calcHash(q1, q2, q3));
_hashArea[offset] = q1;
_hashArea[offset+1] = q2;
_hashArea[offset+2] = q3;
_hashArea[offset+3] = 3;
_names[offset >> 2] = name;
++_count;
return name;
}
public String addName(String name, int[] q, int qlen)
{
_verifySharing();
if (_intern) {
name = InternCache.instance.intern(name);
}
int offset;
switch (qlen) {
case 1:
{
offset = _findOffsetForAdd(calcHash(q[0]));
_hashArea[offset] = q[0];
_hashArea[offset+3] = 1;
}
break;
case 2:
{
offset = _findOffsetForAdd(calcHash(q[0], q[1]));
_hashArea[offset] = q[0];
_hashArea[offset+1] = q[1];
_hashArea[offset+3] = 2;
}
break;
case 3:
{
offset = _findOffsetForAdd(calcHash(q[0], q[1], q[2]));
_hashArea[offset] = q[0];
_hashArea[offset+1] = q[1];
_hashArea[offset+2] = q[2];
_hashArea[offset+3] = 3;
}
break;
default:
final int hash = calcHash(q, qlen);
offset = _findOffsetForAdd(hash);
_hashArea[offset] = hash;
int longStart = _appendLongName(q, qlen);
_hashArea[offset+1] = longStart;
_hashArea[offset+3] = qlen;
}
// plus add the actual String
_names[offset >> 2] = name;
// and finally; see if we really should rehash.
++_count;
return name;
}
private void _verifySharing()
{
if (_hashShared) {
_hashArea = Arrays.copyOf(_hashArea, _hashArea.length);
_names = Arrays.copyOf(_names, _names.length);
_hashShared = false;
}
}
Method called to find the location within hash table to add a new symbol in.
/**
* Method called to find the location within hash table to add a new symbol in.
*/
private int _findOffsetForAdd(int hash)
{
// first, check the primary: if slot found, no need for resize
int offset = _calcOffset(hash);
final int[] hashArea = _hashArea;
if (hashArea[offset+3] == 0) {
//System.err.printf(" PRImary slot #%d, hash %X\n", (offset>>2), hash & 0x7F);
return offset;
}
// Otherwise let's see if we are due resize():
if (_checkNeedForRehash()) {
return _resizeAndFindOffsetForAdd(hash);
}
// If not, proceed with secondary slot
int offset2 = _secondaryStart + ((offset >> 3) << 2);
if (hashArea[offset2+3] == 0) {
//System.err.printf(" SECondary slot #%d (start x%X), hash %X\n",(offset >> 3), _secondaryStart, (hash & 0x7F));
return offset2;
}
// if not, tertiary?
offset2 = _tertiaryStart + ((offset >> (_tertiaryShift + 2)) << _tertiaryShift);
final int bucketSize = (1 << _tertiaryShift);
for (int end = offset2 + bucketSize; offset2 < end; offset2 += 4) {
if (hashArea[offset2+3] == 0) {
//System.err.printf(" TERtiary slot x%X (from x%X, start x%X), hash %X.\n", offset2, ((offset >> (_tertiaryShift + 2)) << _tertiaryShift), _tertiaryStart, (hash & 0x7F));
return offset2;
}
}
// and if even tertiary full, append at the end of spill area
offset = _spilloverEnd;
_spilloverEnd += 4;
//System.err.printf(" SPIll-over at x%X; start x%X; end x%X, hash %X\n", offset, _spilloverStart(), _hashArea.length, (hash & 0x7F));
// one caveat: in the unlikely event if spill-over filling up,
// check if that could be considered a DoS attack; handle appropriately
// (NOTE: approximate for now; we could verify details if that becomes necessary)
/* 31-Jul-2015, tatu: Note that spillover area does NOT end at end of array,
* since "long names" area follows. Instead, need to calculate from hash size.
*/
final int end = (_hashSize << 3);
if (_spilloverEnd >= end) {
if (_failOnDoS) {
_reportTooManyCollisions();
}
return _resizeAndFindOffsetForAdd(hash);
}
return offset;
}
// @since 2.10
private int _resizeAndFindOffsetForAdd(int hash)
{
// First things first: we need to resize+rehash (or, if too big, nuke contents)
rehash();
// Copy of main _findOffsetForAdd except for checks to resize: can not be needed
int offset = _calcOffset(hash);
final int[] hashArea = _hashArea;
if (hashArea[offset+3] == 0) {
return offset;
}
int offset2 = _secondaryStart + ((offset >> 3) << 2);
if (hashArea[offset2+3] == 0) {
return offset2;
}
offset2 = _tertiaryStart + ((offset >> (_tertiaryShift + 2)) << _tertiaryShift);
final int bucketSize = (1 << _tertiaryShift);
for (int end = offset2 + bucketSize; offset2 < end; offset2 += 4) {
if (hashArea[offset2+3] == 0) {
return offset2;
}
}
offset = _spilloverEnd;
_spilloverEnd += 4;
return offset;
}
// Helper method for checking if we should simply rehash() before add
private boolean _checkNeedForRehash() {
// Yes if above 80%, or above 50% AND have ~1% spill-overs
if (_count > (_hashSize >> 1)) { // over 50%
int spillCount = (_spilloverEnd - _spilloverStart()) >> 2;
if ((spillCount > (1 + _count >> 7))
|| (_count > (_hashSize * 0.80))) {
return true;
}
}
return false;
}
private int _appendLongName(int[] quads, int qlen)
{
int start = _longNameOffset;
// note: at this point we must already be shared. But may not have enough space
if ((start + qlen) > _hashArea.length) {
// try to increment in reasonable chunks; at least space that we need
int toAdd = (start + qlen) - _hashArea.length;
// but at least 1/8 of regular hash area size or 16kB (whichever smaller)
int minAdd = Math.min(4096, _hashSize);
int newSize = _hashArea.length + Math.max(toAdd, minAdd);
_hashArea = Arrays.copyOf(_hashArea, newSize);
}
System.arraycopy(quads, 0, _hashArea, start, qlen);
_longNameOffset += qlen;
return start;
}
/*
/**********************************************************
/* Hash calculation
/**********************************************************
*/
/* Note on hash calculation: we try to make it more difficult to
* generate collisions automatically; part of this is to avoid
* simple "multiply-add" algorithm (like JDK String.hashCode()),
* and add bit of shifting. And other part is to make this
* non-linear, at least for shorter symbols.
*/
// JDK uses 31; other fine choices are 33 and 65599, let's use 33
// as it seems to give fewest collisions for us
// (see [http://www.cse.yorku.ca/~oz/hash.html] for details)
private final static int MULT = 33;
private final static int MULT2 = 65599;
private final static int MULT3 = 31;
public int calcHash(int q1)
{
int hash = q1 ^ _seed;
/* 29-Mar-2015, tatu: Earlier used 15 + 9 right shifts, which worked ok
* except for one specific problem case: numbers. So needed to make sure
* that all 4 least-significant bits participate in hash. Couple of ways
* to work it out, but this is the simplest, fast and seems to do ok.
*/
hash += (hash >>> 16); // to xor hi- and low- 16-bits
hash ^= (hash << 3); // shuffle back a bit
hash += (hash >>> 12); // and bit more
return hash;
}
public int calcHash(int q1, int q2)
{
// For two quads, let's change algorithm a bit, to spice
// things up (can do bit more processing anyway)
int hash = q1;
hash += (hash >>> 15); // try mixing first and second byte pairs first
hash ^= (hash >>> 9); // as well as lowest 2 bytes
hash += (q2 * MULT); // then add second quad
hash ^= _seed;
hash += (hash >>> 16); // and shuffle some more
hash ^= (hash >>> 4);
hash += (hash << 3);
return hash;
}
public int calcHash(int q1, int q2, int q3)
{ // use same algorithm as multi-byte, tested to work well
int hash = q1 ^ _seed;
hash += (hash >>> 9);
hash *= MULT3;
hash += q2;
hash *= MULT;
hash += (hash >>> 15);
hash ^= q3;
// 26-Mar-2015, tatu: As per two-quad case, a short shift seems to help more here
hash += (hash >>> 4);
hash += (hash >>> 15);
hash ^= (hash << 9);
return hash;
}
public int calcHash(int[] q, int qlen)
{
if (qlen < 4) {
throw new IllegalArgumentException();
}
/* And then change handling again for "multi-quad" case; mostly
* to make calculation of collisions less fun. For example,
* add seed bit later in the game, and switch plus/xor around,
* use different shift lengths.
*/
int hash = q[0] ^ _seed;
hash += (hash >>> 9);
hash += q[1];
hash += (hash >>> 15);
hash *= MULT;
hash ^= q[2];
hash += (hash >>> 4);
for (int i = 3; i < qlen; ++i) {
int next = q[i];
next = next ^ (next >> 21);
hash += next;
}
hash *= MULT2;
// and finally shuffle some more once done
hash += (hash >>> 19);
hash ^= (hash << 5);
return hash;
}
/*
/**********************************************************
/* Rehashing
/**********************************************************
*/
private void rehash()
{
// Note: since we'll make copies, no need to unshare, can just mark as such:
_hashShared = false;
// And then we can first deal with the main hash area. Since we are expanding
// linearly (double up), we know there'll be no collisions during this phase.
final int[] oldHashArea = _hashArea;
final String[] oldNames = _names;
final int oldSize = _hashSize;
final int oldCount = _count;
final int newSize = oldSize + oldSize;
final int oldEnd = _spilloverEnd;
/* 13-Mar-2010, tatu: Let's guard against OOME that could be caused by
* large documents with unique (or mostly so) names
*/
if (newSize > MAX_T_SIZE) {
nukeSymbols(true);
return;
}
// double up main hash area, but do not expand long-name area:
_hashArea = new int[oldHashArea.length + (oldSize<<3)];
_hashSize = newSize;
_secondaryStart = (newSize << 2); // 4 ints per entry
_tertiaryStart = _secondaryStart + (_secondaryStart >> 1); // right after secondary
_tertiaryShift = _calcTertiaryShift(newSize);
// and simply double up name array
_names = new String[oldNames.length << 1];
nukeSymbols(false);
// Plus we can scan only through the primary hash area, looking for non-empty
// slots, without worrying about ordering. This should never reduce priority
// of existing entries: primaries remain primaries; however, due to increased
// space, secondaries may become primaries etc
int copyCount = 0;
int[] q = new int[16];
for (int offset = 0, end = oldEnd; offset < end; offset += 4) {
int len = oldHashArea[offset+3];
if (len == 0) { // empty slot, skip
continue;
}
++copyCount;
String name = oldNames[offset>>2];
switch (len) {
case 1:
q[0] = oldHashArea[offset];
addName(name, q, 1);
break;
case 2:
q[0] = oldHashArea[offset];
q[1] = oldHashArea[offset+1];
addName(name, q, 2);
break;
case 3:
q[0] = oldHashArea[offset];
q[1] = oldHashArea[offset+1];
q[2] = oldHashArea[offset+2];
addName(name, q, 3);
break;
default:
if (len > q.length) {
q = new int[len];
}
// #0 is hash, #1 offset
int qoff = oldHashArea[offset+1];
System.arraycopy(oldHashArea, qoff, q, 0, len);
addName(name, q, len);
break;
}
}
// Sanity checks: since corruption difficult to detect, assert explicitly
// with production code
if (copyCount != oldCount) {
throw new IllegalStateException("Failed rehash(): old count="+oldCount+", copyCount="+copyCount);
}
}
Helper method called to empty all shared symbols, but to leave
arrays allocated
/**
* Helper method called to empty all shared symbols, but to leave
* arrays allocated
*/
private void nukeSymbols(boolean fill) {
_count = 0;
// reset spill-over to empty (starting at 7/8 of hash area)
_spilloverEnd = _spilloverStart();
// and long name area to empty, starting immediately after hash area
_longNameOffset = _hashSize << 3;
if (fill) {
Arrays.fill(_hashArea, 0);
Arrays.fill(_names, null);
}
}
/*
/**********************************************************
/* Helper methods
/**********************************************************
*/
Helper method that calculates start of the spillover area
/**
* Helper method that calculates start of the spillover area
*/
private final int _spilloverStart() {
// we'll need slot at 1.75x of hashSize, but with 4-ints per slot.
// So basically multiply by 7
int offset = _hashSize;
return (offset << 3) - offset;
}
protected void _reportTooManyCollisions()
{
// First: do not fuzz about small symbol tables; may get balanced by doubling up
if (_hashSize <= 1024) { // would have spill-over area of 128 entries
return;
}
throw new IllegalStateException("Spill-over slots in symbol table with "+_count
+" entries, hash area of "+_hashSize+" slots is now full (all "
+(_hashSize >> 3)+" slots -- suspect a DoS attack based on hash collisions."
+" You can disable the check via `JsonFactory.Feature.FAIL_ON_SYMBOL_HASH_OVERFLOW`");
}
static int _calcTertiaryShift(int primarySlots)
{
// first: we only get 1/4 of slots of primary, to divide
int tertSlots = (primarySlots) >> 2;
// default is for buckets of 4 slots (each 4 ints, i.e. 1 << 4)
if (tertSlots < 64) {
return 4;
}
if (tertSlots <= 256) { // buckets of 8 slots (up to 256 == 32 x 8)
return 5;
}
if (tertSlots <= 1024) { // buckets of 16 slots (up to 1024 == 64 x 16)
return 6;
}
// and biggest buckets have 32 slots
return 7;
}
/*
/**********************************************************
/* Helper classes
/**********************************************************
*/
Immutable value class used for sharing information as efficiently
as possible, by only require synchronization of reference manipulation
but not access to contents.
Since: 2.1
/**
* Immutable value class used for sharing information as efficiently
* as possible, by only require synchronization of reference manipulation
* but not access to contents.
*
* @since 2.1
*/
private final static class TableInfo
{
public final int size;
public final int count;
public final int tertiaryShift;
public final int[] mainHash;
public final String[] names;
public final int spilloverEnd;
public final int longNameOffset;
public TableInfo(int size, int count, int tertiaryShift,
int[] mainHash, String[] names, int spilloverEnd, int longNameOffset)
{
this.size = size;
this.count = count;
this.tertiaryShift = tertiaryShift;
this.mainHash = mainHash;
this.names = names;
this.spilloverEnd = spilloverEnd;
this.longNameOffset = longNameOffset;
}
public TableInfo(ByteQuadsCanonicalizer src)
{
size = src._hashSize;
count = src._count;
tertiaryShift = src._tertiaryShift;
mainHash = src._hashArea;
names = src._names;
spilloverEnd = src._spilloverEnd;
longNameOffset = src._longNameOffset;
}
public static TableInfo createInitial(int sz) {
int hashAreaSize = sz << 3;
int tertShift = _calcTertiaryShift(sz);
return new TableInfo(sz, // hashSize
0, // count
tertShift,
new int[hashAreaSize], // mainHash, 2x slots, 4 ints per slot
new String[sz << 1], // names == 2x slots
hashAreaSize - sz, // at 7/8 of the total area
hashAreaSize // longNameOffset, immediately after main hashes
);
}
}
}