package test;
import java.io.*;
/**
 * Micro-benchmark that repeatedly scans an in-memory byte array as UTF-8
 * text, using three differently structured scanning loops, and prints the
 * elapsed wall-clock time of each variant.
 *<p>
 * Each scanner decodes input bytes into {@link #mOutputBuffer} (which is
 * simply overwritten from the start whenever it fills up) and counts line
 * feeds, '&lt;', '&amp;' and ']' characters along the way.
 *<p>
 * Instances keep their scanning state in plain fields and are not safe for
 * concurrent use.
 */
public final class TestScannerPerf
{
    final static int INT_AMP = '&';
    final static int INT_LT = '<';
    final static int INT_RBRACKET = ']';
    final static int INT_SPACE = ' ';
    final static int INT_TAB = '\t';
    final static int INT_CR = '\r';
    final static int INT_LF = '\n';

    // Not referenced by this class; retained because they are package-visible
    final static byte BYTE_LF = (byte) '\n';
    final static byte BYTE_NULL = (byte)0;

    /** Number of times the data is scanned within one timed round */
    final int mRepCount;

    // Not referenced by this class; retained because it is package-visible
    int mTmpChar = 0;

    /** Complete input document; re-read from the start on every repetition */
    final byte[] mData;

    final byte[] mInputBuffer = new byte[4000];

    /** Scratch output; scanners wrap around to index 0 when it is full */
    final char[] mOutputBuffer = new char[2000];

    /**
     * Type codes above this value mark multi-byte lead bytes: base + 1,
     * + 2 and + 3 stand for 2-, 3- and 4-byte sequences respectively.
     */
    final static int MB_CODE_BASE = 5;

    /**
     * Per-byte type codes used by the table-driven scanners:
     * 0 = plain character (copied as is), 1 = invalid here,
     * 2 = CR or LF, 3 = '&lt;', 4 = '&amp;', 5 = ']',
     * 6/7/8 = lead byte of a 2/3/4-byte sequence.
     */
    final static int[] CHAR_TYPES = new int[256];
    static {
        for (int c = 128; c < 256; ++c) {
            int code;
            if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                code = MB_CODE_BASE + 1;
            } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                code = MB_CODE_BASE + 2;
            } else if ((c & 0xF8) == 0xF0) {
                // 4 bytes; double-char with surrogates and all...
                code = MB_CODE_BASE + 3;
            } else {
                // continuation bytes and 0xF8-0xFF can not start a character
                code = 1;
            }
            CHAR_TYPES[c] = code;
        }
        for (int i = 0; i < 32; ++i) {
            CHAR_TYPES[i] = 1; // invalid white space
        }
        CHAR_TYPES['\r'] = 2;
        CHAR_TYPES['\n'] = 2;
        CHAR_TYPES['\t'] = 0; // no processing needed
        CHAR_TYPES['<'] = 3;
        CHAR_TYPES['&'] = 4;
        CHAR_TYPES[']'] = 5;
    }

    InputStream mIn;

    int mLineNr;
    int mByteCount;
    int mTagCount;
    int mEntityCount;
    int mBracketCount;

    /** Index of the next unread byte in {@link #mInputBuffer} */
    int mInputPtr;

    /** Number of valid bytes currently held in {@link #mInputBuffer} */
    int mInputLen;

    // Not referenced by this class; retained because it is package-visible
    int mTmpType = 0;

    /**
     * @param data Bytes to scan; expected to be UTF-8 encoded text
     * @param repCount Number of scans to perform per timed round
     */
    public TestScannerPerf(byte[] data, int repCount)
    {
        mData = data;
        mRepCount = repCount;
    }

    /**
     * Runs the benchmark: cycles through the three scanner variants, timing
     * {@link #mRepCount} scans of the data per round and printing the result
     * to standard output. Loops until the calling thread is interrupted.
     *
     * @throws IOException if reading from the in-memory stream fails
     */
    public void test()
        throws IOException
    {
        final int TYPES = 3;
        mIn = new ByteArrayInputStream(mData);

        for (int round = 0; true; ++round) {
            final int type = round % TYPES;
            if (type == 0) {
                System.out.println();
            }
            String msg = "[null]";
            int total = 0;
            final long start = System.currentTimeMillis();

            for (int i = 0; i < mRepCount; ++i) {
                mIn.reset();
                mLineNr = 0;
                mTagCount = 0;
                mEntityCount = 0;
                mBracketCount = 0;
                mByteCount = 0;

                switch (type) {
                case 0:
                    msg = "[Scanner-code]";
                    total += testScannerCode();
                    break;
                case 1:
                    msg = "[Scanner-int-arr]";
                    total += testScannerInts();
                    break;
                case 2:
                    msg = "[Scanner-int-arr2]";
                    total += testScannerInts2();
                    break;
                default:
                    throw new Error("Unexpected round, #"+round);
                }
            }
            final long elapsed = System.currentTimeMillis() - start;
            System.out.println(msg+" -> "+elapsed+" msecs (total "+total
                               +", byte count 0x"+Integer.toHexString(mByteCount)+")");

            // Short pauses around an explicit GC request, between rounds
            if (!pause()) {
                return;
            }
            System.gc();
            if (!pause()) {
                return;
            }
        }
    }

    /**
     * Sleeps for 200 milliseconds.
     *
     * @return false if the thread was interrupted (the interrupt flag is
     *   restored in that case), true otherwise
     */
    private static boolean pause()
    {
        try {
            Thread.sleep(200L);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Scanner variant that classifies single-byte characters with explicit
     * comparisons against character constants.
     *
     * @return Value of {@link #mByteCount} after all input has been consumed
     */
    private int testScannerCode()
        throws IOException
    {
        final char[] outBuf = mOutputBuffer;
        int outPtr = 0;
        int c = 0;

        mInputLen = 0;
        mInputPtr = 0;

        main_loop:
        while (true) {
            int ptr = mInputPtr;

            ascii_loop:
            while (true) {
                if (ptr >= mInputLen) {
                    if (!loadMoreBytes()) {
                        break main_loop;
                    }
                    ptr = mInputPtr;
                }
                // Deliberately sign-extended: bytes >= 0x80 show up as negative
                c = (int) mInputBuffer[ptr++];
                if (c <= INT_RBRACKET) {
                    if (c < 0) { // part of a multi-byte character
                        break ascii_loop;
                    }
                    if (c < INT_SPACE) {
                        if (c == INT_CR || c == INT_LF) {
                            ++mLineNr;
                        } else if (c != INT_TAB) {
                            throw new Error("Invalid white space");
                        }
                    } else if (c == INT_LT) {
                        ++mTagCount;
                    } else if (c == INT_AMP) {
                        ++mEntityCount;
                    } else if (c == INT_RBRACKET) {
                        ++mBracketCount;
                    }
                }
                // !!! TODO: xml1.1, 0x7F?
                if (outPtr >= outBuf.length) {
                    outPtr = 0;
                }
                outBuf[outPtr++] = (char) c;
            }

            c = decodeMultiByteChar(c, ptr);
            if (c < 0) { // negative value signals need for a surrogate pair
                if (outPtr >= outBuf.length) {
                    outPtr = 0;
                }
                // UTF-16 encodes the offset from U+10000, 10 bits per surrogate
                c = -c - 0x10000;
                outBuf[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                // low surrogate is output by the shared code below
            }
            if (outPtr >= outBuf.length) {
                outPtr = 0;
            }
            outBuf[outPtr++] = (char) c;
        }
        return mByteCount;
    }

    /**
     * Scanner variant that classifies every byte through the
     * {@link #CHAR_TYPES} lookup table and dispatches with a switch inside
     * the per-byte loop.
     *
     * @return Value of {@link #mByteCount} after all input has been consumed
     */
    private int testScannerInts()
        throws IOException
    {
        final int[] TYPES = CHAR_TYPES;
        final byte[] inputBuffer = mInputBuffer;
        final char[] outputBuffer = mOutputBuffer;
        int outPtr = 0;
        int c = 0;

        mInputLen = 0;
        mInputPtr = 0;

        main_loop:
        while (true) {
            int ptr = mInputPtr;

            ascii_loop:
            while (true) {
                if (ptr >= mInputLen) {
                    if (!loadMoreBytes()) {
                        break main_loop;
                    }
                    ptr = mInputPtr;
                }
                c = (int) inputBuffer[ptr++] & 0xFF;
                switch (TYPES[c]) {
                case 0: // plain character
                    break;
                case 1:
                    throw new Error("Invalid white space");
                case 2: // only CR and LF map to this type
                    ++mLineNr;
                    break;
                case 3:
                    ++mTagCount;
                    break;
                case 4:
                    ++mEntityCount;
                    break;
                case 5:
                    ++mBracketCount;
                    break;
                case 6: // 2 bytes
                case 7: // 3 bytes
                case 8: // 4 bytes
                    break ascii_loop;
                default:
                    throw new Error("Unexpected character type "+TYPES[c]);
                }
                if (outPtr >= outputBuffer.length) {
                    outPtr = 0;
                }
                outputBuffer[outPtr++] = (char) c;
            }

            c = decodeMultiByteChar(c, ptr);
            if (c < 0) { // negative value signals need for a surrogate pair
                if (outPtr >= outputBuffer.length) {
                    outPtr = 0;
                }
                // UTF-16 encodes the offset from U+10000, 10 bits per surrogate
                c = -c - 0x10000;
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                // low surrogate is output by the shared code below
            }
            if (outPtr >= outputBuffer.length) {
                outPtr = 0;
            }
            outputBuffer[outPtr++] = (char) c;
        }
        return mByteCount;
    }

    /**
     * Scanner variant that copies runs of plain characters in a tight inner
     * loop (bounded up front by both remaining input and remaining output
     * space) and only leaves that loop to handle a byte whose
     * {@link #CHAR_TYPES} entry is non-zero.
     *
     * @return Value of {@link #mByteCount} after all input has been consumed
     */
    private int testScannerInts2()
        throws IOException
    {
        final int[] TYPES = CHAR_TYPES;
        final byte[] inputBuffer = mInputBuffer;
        final char[] outputBuffer = mOutputBuffer;
        int outPtr = 0;
        int c = 0;

        mInputLen = 0;
        mInputPtr = 0;

        main_loop:
        while (true) {

            ascii_loop:
            while (true) {
                int ptr = mInputPtr;
                if (ptr >= mInputLen) {
                    if (!loadMoreBytes()) {
                        break main_loop;
                    }
                    ptr = mInputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outPtr = 0;
                }
                // Limit the run so that neither buffer needs checking inside it
                final int max = Math.min(mInputLen,
                                         ptr + (outputBuffer.length - outPtr));
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        mInputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
                mInputPtr = ptr;
            }

            switch (TYPES[c]) {
            case 1:
                throw new Error("Invalid white space");
            case 2: // only CR and LF map to this type
                ++mLineNr;
                break;
            case 3:
                ++mTagCount;
                break;
            case 4:
                // should expand entity
                ++mEntityCount;
                break;
            case 5:
                ++mBracketCount;
                break;
            case 6: // 2 bytes
            case 7: // 3 bytes
                c = decodeMultiByteChar(c, mInputPtr);
                break;
            case 8: // 4 bytes
                // Decoder returns the negated code point for 4-byte sequences;
                // UTF-16 encodes the offset from U+10000
                c = -decodeMultiByteChar(c, mInputPtr) - 0x10000;
                if (outPtr >= outputBuffer.length) {
                    outPtr = 0;
                }
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                break;
            default:
                throw new Error("Unexpected character type "+TYPES[c]);
            }
            if (outPtr >= outputBuffer.length) {
                outPtr = 0;
            }
            outputBuffer[outPtr++] = (char) c;
        }
        return mByteCount;
    }

    /**
     * Adds the size of the buffer contents just consumed to
     * {@link #mByteCount} and refills {@link #mInputBuffer} from
     * {@link #mIn}, resetting {@link #mInputPtr} to 0.
     *
     * @return true if more input was read; false at end of input
     */
    private final boolean loadMoreBytes()
        throws IOException
    {
        mByteCount += mInputLen;
        mInputPtr = 0;
        int count = mIn.read(mInputBuffer);
        if (count < 0) {
            mInputLen = 0;
            return false;
        }
        mInputLen = count;
        return true;
    }

    /**
     * Like {@link #loadMoreBytes()}, but for use where more input is
     * required.
     *
     * @throws Error if the end of input has been reached
     */
    private final void loadMoreBytesGuaranteed()
        throws IOException
    {
        if (!loadMoreBytes()) {
            throw new Error("Unexpected end of input");
        }
    }

    /**
     * Decodes the remainder of a multi-byte UTF-8 character whose lead byte
     * has already been read, loading more input if the character straddles
     * a buffer boundary. On return {@link #mInputPtr} points past the last
     * byte of the character, for every sequence length.
     *
     * @param c Lead byte; only its low 8 bits are inspected, so both the
     *   sign-extended and the masked form are accepted
     * @param ptr Index in {@link #mInputBuffer} of the byte following the
     *   lead byte
     *
     * @return Decoded code point for 2- and 3-byte sequences; the
     *   <b>negated</b> code point for 4-byte sequences, to signal that the
     *   caller needs to output a surrogate pair
     *
     * @throws Error if the lead byte or a continuation byte is malformed,
     *   if a 4-byte sequence decodes outside U+10000..U+10FFFF, or if input
     *   ends in the middle of the character
     */
    private final int decodeMultiByteChar(int c, int ptr)
        throws IOException
    {
        final int needed; // number of continuation bytes to read
        if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
            c &= 0x1F;
            needed = 1;
        } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
            c &= 0x0F;
            needed = 2;
        } else if ((c & 0xF8) == 0xF0) {
            // 4 bytes; double-char with surrogates and all...
            c &= 0x07;
            needed = 3;
        } else {
            throw new Error("Unexpected multi-byte first byte 0x"+Integer.toHexString(c & 0xFF));
        }

        for (int i = 0; i < needed; ++i) {
            if (ptr >= mInputLen) {
                loadMoreBytesGuaranteed();
                ptr = mInputPtr;
            }
            int d = (int) mInputBuffer[ptr++];
            if ((d & 0xC0) != 0x080) {
                throw new Error("Invalid UTF-8 continuation byte 0x"+Integer.toHexString(d & 0xFF));
            }
            c = (c << 6) | (d & 0x3F);
        }
        // Must be stored for all lengths; callers resume from mInputPtr
        mInputPtr = ptr;

        if (needed == 3) {
            if (c < 0x10000 || c > 0x10FFFF) {
                throw new Error("Invalid 4-byte UTF-8 sequence, decoded value 0x"+Integer.toHexString(c));
            }
            /* Need to signal such pair differently (to make comparison
             * easier)
             */
            return -c;
        }
        return c;
    }

    /**
     * Reads the whole contents of given file into a byte array.
     *
     * @throws IOException if the file is too large for a single array, is
     *   shorter than its reported length, or can not be read
     */
    private static byte[] readData(File f)
        throws IOException
    {
        final long size = f.length();
        if (size > Integer.MAX_VALUE) {
            throw new IOException("File too large to read into memory: "+size+" bytes");
        }
        final byte[] data = new byte[(int) size];
        final FileInputStream fis = new FileInputStream(f);
        try {
            int offset = 0;
            // A single read() may return fewer bytes than requested
            while (offset < data.length) {
                int count = fis.read(data, offset, data.length - offset);
                if (count < 0) {
                    throw new EOFException("Unexpected end of file after "+offset
                                           +" bytes (expected "+data.length+")");
                }
                offset += count;
            }
        } finally {
            fis.close();
        }
        return data;
    }

    /**
     * Command-line entry point: reads the file named by the only argument
     * and runs the benchmark over it, choosing the repetition count so that
     * roughly 10 million bytes get scanned per round.
     */
    public static void main(String[] args)
        throws IOException
    {
        if (args.length != 1) {
            System.err.println("Usage: java ... [input file]");
            System.exit(1);
        }
        byte[] data = readData(new File(args[0]));
        int len = data.length;
        if (len == 0) { // would otherwise divide by zero below
            System.err.println("Input file is empty, nothing to scan");
            System.exit(1);
        }
        int repCount = 1;
        int THRESHOLD = 10 * 1000 * 1000;
        if (len < THRESHOLD) {
            repCount = (THRESHOLD / len);
        }
        System.out.println("Ok, read in test data, "+len+" bytes; using "+repCount+" repetitions");
        new TestScannerPerf(data, repCount).test();
    }
}