package com.ctc.wstx.util;

import java.util.*;

A specialized Map/Symbol table - like data structure that can be used for both checking whether a word (passed in as a char array) exists in certain set of words AND getting that word as a String. It is reasonably efficient both time and speed-wise, at least for certain use cases; specifically, if there is no existing key to use, it is more efficient way to get to a shared copy of that String The general usage pattern is expected to be such that most checks are positive, ie. that the word indeed is contained in the structure.

Although this is an efficient data struct for specific set of usage patterns, one restriction is that the full set of words to include has to be known before constructing the instnace. Also, the size of the set is limited to total word content of about 20k characters.

TODO: Should document the internal data structure...

/** * A specialized Map/Symbol table - like data structure that can be used * for both checking whether a word (passed in as a char array) exists * in certain set of words AND getting that word as a String. * It is reasonably efficient both time and speed-wise, at least for * certain use cases; specifically, if there is no existing key to use, * it is more efficient way to get to a shared copy of that String * The general usage pattern is expected * to be such that most checks are positive, ie. that the word indeed * is contained in the structure. *<p> * Although this is an efficient data struct for specific set of usage * patterns, one restriction is that the full set of words to include has to * be known before constructing the instnace. Also, the size of the set is * limited to total word content of about 20k characters. *<p> * TODO: Should document the internal data structure... */
public final class WordResolver {
Maximum number of words (Strings) an instance can contain
/** * Maximum number of words (Strings) an instance can contain */
public final static int MAX_WORDS = 0x2000; final static char CHAR_NULL = (char) 0;
Offset added to numbers to mark 'negative' numbers. Asymmetric, since range of negative markers needed is smaller than positive numbers...
/** * Offset added to numbers to mark 'negative' numbers. Asymmetric, * since range of negative markers needed is smaller than positive * numbers... */
final static int NEGATIVE_OFFSET = 0x10000 - MAX_WORDS;
This is actually just a guess; but in general linear search should be faster for short sequences (definitely for 4 or less; maybe up to 8 or less?)
/** * This is actually just a guess; but in general linear search should * be faster for short sequences (definitely for 4 or less; maybe up * to 8 or less?) */
final static int MIN_BINARY_SEARCH = 7;
Compressed presentation of the word set.
/** * Compressed presentation of the word set. */
final char[] mData;
Array of actual words returned resolved for matches.
/** * Array of actual words returned resolved for matches. */
final String[] mWords; /* //////////////////////////////////////////////// // Life-cycle //////////////////////////////////////////////// */ private WordResolver(String[] words, char[] index) { mWords = words; mData = index; }
Tries to construct an instance given ordered set of words.

Note: currently maximum number of words that can be contained is limited to MAX_WORDS; additionally, maximum length of all such words can not exceed roughly 28000 characters.

Returns:WordResolver constructed for given set of words, if the word set size is not too big; null to indicate "too big" instance.
/** * Tries to construct an instance given ordered set of words. *<p> * Note: currently maximum number of words that can be contained * is limited to {@link #MAX_WORDS}; additionally, maximum length * of all such words can not exceed roughly 28000 characters. * * @return WordResolver constructed for given set of words, if * the word set size is not too big; null to indicate "too big" * instance. */
public static WordResolver constructInstance(TreeSet<String> wordSet) { if (wordSet.size() > MAX_WORDS) { return null; } return new Builder(wordSet).construct(); } /* //////////////////////////////////////////////// // Public API //////////////////////////////////////////////// */
Returns:Number of words contained
/** * @return Number of words contained */
public int size() { return mWords.length; } /* public int indexSize() { return mData.length; } */
Params:
  • str – Character array that contains the word to find
  • start – Index of the first character of the word
  • end – Index following the last character of the word, so that end - start equals word length (similar to the way String.substring() has).
Returns:(Shared) string instance of the word, if it exists in the word set; null if not.
/** * @param str Character array that contains the word to find * @param start Index of the first character of the word * @param end Index following the last character of the word, * so that <code>end - start</code> equals word length (similar * to the way <code>String.substring()</code> has). * * @return (Shared) string instance of the word, if it exists in * the word set; null if not. */
@SuppressWarnings("cast") public String find(char[] str, final int start, final int end) { char[] data = mData; // 03-Jan-2006, TSa: Special case; one entry if (data == null) { return findFromOne(str, start, end); } int ptr = 0; // pointer to compressed set data int offset = start; while (true) { // End of input String? Need to match the runt entry! if (offset == end) { if (data[ptr+1] == CHAR_NULL) { return mWords[data[ptr+2] - NEGATIVE_OFFSET]; } return null; } int count = data[ptr++]; // Need to find the branch to follow, if any char c = str[offset++]; inner_block: do { // dummy loop, need to have break // Linear or binary search? if (count < MIN_BINARY_SEARCH) { // always at least two branches; never less if (data[ptr] == c) { ptr = (int) data[ptr+1]; break inner_block; } if (data[ptr+2] == c) { ptr = (int) data[ptr+3]; break inner_block; } int branchEnd = ptr + (count << 1); // Starts from entry #3, if such exists for (ptr += 4; ptr < branchEnd; ptr += 2) { if (data[ptr] == c) { ptr = (int) data[ptr+1]; break inner_block; } } return null; // No match! } else { // Ok, binary search: int low = 0; int high = count-1; int mid; while (low <= high) { mid = (low + high) >> 1; int ix = ptr + (mid << 1); int diff = data[ix] - c; if (diff > 0) { // char was 'higher', need to go down high = mid-1; } else if (diff < 0) { // lower, need to go up low = mid+1; } else { // match (so far) ptr = (int) data[ix+1]; break inner_block; } } return null; // No match! } } while (false); // Ok; now, is it the end? if (ptr >= NEGATIVE_OFFSET) { String word = mWords[ptr - NEGATIVE_OFFSET]; int expLen = (end - start); if (word.length() != expLen) { return null; } for (int i = offset - start; offset < end; ++i, ++offset) { if (word.charAt(i) != str[offset]) { return null; } } return word; } } // never gets here } private String findFromOne(char[] str, final int start, final int end) { String word = mWords[0]; int len = end-start; if (word.length() != len) { return null; } for (int i = 0; i < len; ++i) { if (word.charAt(i) != str[start+i]) { return null; } } return word; }
Returns:(Shared) string instance of the word, if it exists in the word set; null if not.
/** * @return (Shared) string instance of the word, if it exists in * the word set; null if not. */
@SuppressWarnings("cast") public String find(String str) { char[] data = mData; // 03-Jan-2006, TSa: Special case; one entry if (data == null) { String word = mWords[0]; return word.equals(str) ? word : null; } int ptr = 0; // pointer to compressed set data int offset = 0; int end = str.length(); while (true) { // End of input String? Need to match the runt entry! if (offset == end) { if (data[ptr+1] == CHAR_NULL) { return mWords[data[ptr+2] - NEGATIVE_OFFSET]; } return null; } int count = data[ptr++]; // Need to find the branch to follow, if any char c = str.charAt(offset++); inner_block: do { // dummy loop, need to have break // Linear or binary search? if (count < MIN_BINARY_SEARCH) { // always at least two branches; never less if (data[ptr] == c) { ptr = (int) data[ptr+1]; break inner_block; } if (data[ptr+2] == c) { ptr = (int) data[ptr+3]; break inner_block; } int branchEnd = ptr + (count << 1); // Starts from entry #3, if such exists for (ptr += 4; ptr < branchEnd; ptr += 2) { if (data[ptr] == c) { ptr = (int) data[ptr+1]; break inner_block; } } return null; // No match! } else { // Ok, binary search: int low = 0; int high = count-1; int mid; while (low <= high) { mid = (low + high) >> 1; int ix = ptr + (mid << 1); int diff = data[ix] - c; if (diff > 0) { // char was 'higher', need to go down high = mid-1; } else if (diff < 0) { // lower, need to go up low = mid+1; } else { // match (so far) ptr = (int) data[ix+1]; break inner_block; } } return null; // No match! } } while (false); // Ok; now, is it the end? if (ptr >= NEGATIVE_OFFSET) { String word = mWords[ptr - NEGATIVE_OFFSET]; if (word.length() != str.length()) { return null; } for (; offset < end; ++offset) { if (word.charAt(offset) != str.charAt(offset)) { return null; } } return word; } } // never gets here } /* //////////////////////////////////////////////// // Re-defined public methods //////////////////////////////////////////////// */ @Override public String toString() { StringBuilder sb = new StringBuilder(16 + (mWords.length << 3)); for (int i = 0, len = mWords.length; i < len; ++i) { if (i > 0) { sb.append(", "); } sb.append(mWords[i]); } return sb.toString(); } /* //////////////////////////////////////////////// // Helper classes //////////////////////////////////////////////// */ private final static class Builder { final String[] mWords; char[] mData;
Number of characters currently used from mData
/** * Number of characters currently used from mData */
int mSize; public Builder(TreeSet<String> wordSet) { int wordCount = wordSet.size(); mWords = new String[wordCount]; wordSet.toArray(mWords); /* 03-Jan-2006, TSa: Special case: just one entry; if so, * let's leave char array null, and just have the String * array with one entry. */ if (wordCount < 2) { if (wordCount == 0) { throw new IllegalArgumentException(); // not legal } mData = null; } else { /* Let's guess approximate size we should need, assuming * average word length of 6 characters, overhead matching * compression (ie. about 1-to-1 ratio overall) */ int size = wordCount * 6; if (size < 256) { size = 256; } mData = new char[size]; } }
Returns:Raw character data that contains compressed structure of the word set
/** * @return Raw character data that contains compressed structure * of the word set */
public WordResolver construct() { char[] result; /* 03-Jan-2006, TSa: Special case: just one entry; if so, * let's leave char array null, and just have the String * array with one entry. */ if (mData == null) { result = null; } else { constructBranch(0, 0, mWords.length); // Too big? if (mSize > NEGATIVE_OFFSET) { return null; } result = new char[mSize]; System.arraycopy(mData, 0, result, 0, mSize); } return new WordResolver(mWords, result); }
Method that is called recursively to build the data representation for a branch, ie. part of word set tree that still has more than one ending
Params:
  • charIndex – Index of the character in words to consider for this round
  • start – Index of the first word to be processed
  • end – Index of the word after last word to be processed (so that number of words is end - start - 1
/** * Method that is called recursively to build the data * representation for a branch, ie. part of word set tree * that still has more than one ending * * @param charIndex Index of the character in words to consider * for this round * @param start Index of the first word to be processed * @param end Index of the word after last word to be processed * (so that number of words is <code>end - start - 1</code> */
@SuppressWarnings("cast") private void constructBranch(int charIndex, int start, int end) { // If more than one entry, need to divide into groups // First, need to add placeholder for branch count: if (mSize >= mData.length) { expand(1); } mData[mSize++] = 0; // placeholder! /* structStart will point to second char of first entry * (which will temporarily have entry count, eventually 'link' * to continuation) */ int structStart = mSize + 1; int groupCount = 0; int groupStart = start; String[] words = mWords; boolean gotRunt; /* First thing we need to do is a special check for the * first entry -- it may be "runt" word, one that has no * more chars but also has a longer version ("id" vs. * "identifier"). If so, it needs to be marked; this is done * by adding a special entry before other entries (since such * entry would always be ordered first alphabetically) */ if (words[groupStart].length() == charIndex) { // yup, got one: if ((mSize + 2) > mData.length) { expand(2); } /* First null marks the "missing" char (or, end-of-word); * and then we need the index */ mData[mSize++] = CHAR_NULL; mData[mSize++] = (char) (NEGATIVE_OFFSET + groupStart); // Ok, let's then ignore that entry ++groupStart; ++groupCount; gotRunt = true; } else { gotRunt = false; } // Ok, then, let's find the ('real') groupings: while (groupStart < end) { // Inner loop, let's find the group: char c = words[groupStart].charAt(charIndex); int j = groupStart+1; while (j < end && words[j].charAt(charIndex) == c) { ++j; } /* Ok, let's store the char in there, along with count; * count will be needed in second, and will then get * overwritten with actual data later on */ if ((mSize + 2) > mData.length) { expand(2); } mData[mSize++] = c; mData[mSize++] = (char) (j - groupStart); // entries in group groupStart = j; ++groupCount; } /* Ok, groups found; need to loop through them, recursively * calling branch and/or leaf methods */ // first let's output the header, ie. group count: mData[structStart-2] = (char) groupCount; groupStart = start; // Do we have the "runt" to skip? if (gotRunt) { structStart += 2; ++groupStart; } int structEnd = mSize; ++charIndex; for (; structStart < structEnd; structStart += 2) { groupCount = (int) mData[structStart]; // no sign expansion, is ok /* Ok, count gotten, can either create a branch (if more than * one entry) or leaf (just one entry) */ if (groupCount == 1) { mData[structStart] = (char) (NEGATIVE_OFFSET + groupStart); } else { mData[structStart] = (char) mSize; constructBranch(charIndex, groupStart, groupStart + groupCount); } groupStart += groupCount; } // done! } private char[] expand(int needSpace) { char[] old = mData; int len = old.length; int newSize = len + ((len < 4096) ? len : (len >> 1)); /* Let's verify we get enough; should always be true but * better safe than sorry */ if (newSize < (mSize + needSpace)) { newSize = mSize + needSpace + 64; } mData = new char[newSize]; System.arraycopy(old, 0, mData, 0, len); return mData; } } /* //////////////////////////////////////////////////// // Simple test driver, useful for debugging // (uncomment if needed -- commented out so it won't // affect coverage testing) //////////////////////////////////////////////////// */ /* public static void main(String[] args) { if (args.length < 2) { System.err.println("Usage: "+WordResolver.class+" word1 [word2] ... [wordN] keyword"); System.exit(1); } String key = args[args.length-1]; TreeSet words = new TreeSet(); for (int i = 0; i < args.length-1; ++i) { words.add(args[i]); } WordResolver set = WordResolver.constructInstance(words); //outputData(set.mData); // Ok, and then the test! char[] keyA = new char[key.length() + 4]; key.getChars(0, key.length(), keyA, 2); //System.out.println("Word '"+key+"' found via array search: "+WordResolver.find(data, keyA, 2, key.length() + 2)); System.out.println("Word '"+key+"' found via array search: "+set.find(keyA, 2, key.length() + 2)); } static void outputData(char[] data) { for (int i = 0; i < data.length; ++i) { char c = data[i]; System.out.print(Integer.toHexString(i)+" ["+Integer.toHexString(c)+"]"); if (c > 32 && c <= 127) { // printable char (letter) System.out.println(" -> '"+c+"'"); } else { System.out.println(); } } } */ }