/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/*
* This package is based on the work done by Keiron Liddle, Aftex Software
* <keiron@aftexsw.com> to whom the Ant project is very grateful for his
* great code.
*/
package org.apache.tools.bzip2;
import java.io.IOException;
import java.io.OutputStream;
/**
* An output stream that compresses into the BZip2 format (without the file
* header chars) into another stream.
*
* <p>
* The compression requires large amounts of memory. Thus you should call the
* {@link #close() close()} method as soon as possible, to force
* <code>CBZip2OutputStream</code> to release the allocated memory.
* </p>
*
* <p>You can shrink the amount of allocated memory and maybe raise
* the compression speed by choosing a lower blocksize, which in turn
* may cause a lower compression ratio. You can avoid unnecessary
 * memory allocation by not using a blocksize which is bigger
 * than the size of the input.</p>
*
* <p>You can compute the memory usage for compressing by the
* following formula:</p>
*
* <pre>
* <code>400k + (9 * blocksize)</code>.
* </pre>
*
* <p>To get the memory required for decompression by {@link
* CBZip2InputStream CBZip2InputStream} use</p>
*
* <pre>
* <code>65k + (5 * blocksize)</code>.
* </pre>
*
* <table style="border:1px solid black">
* <caption>Memory usage by blocksize</caption>
* <tr>
* <th style="text-align:right">Blocksize</th>
* <th style="text-align:right">Compression<br>memory usage</th>
* <th style="text-align:right">Decompression<br>memory usage</th>
* </tr>
* <tr>
* <td style="text-align:right">100k</td>
* <td style="text-align:right">1300k</td>
* <td style="text-align:right">565k</td>
* </tr>
* <tr>
* <td style="text-align:right">200k</td>
* <td style="text-align:right">2200k</td>
* <td style="text-align:right">1065k</td>
* </tr>
* <tr>
* <td style="text-align:right">300k</td>
* <td style="text-align:right">3100k</td>
* <td style="text-align:right">1565k</td>
* </tr>
* <tr>
* <td style="text-align:right">400k</td>
* <td style="text-align:right">4000k</td>
* <td style="text-align:right">2065k</td>
* </tr>
* <tr>
* <td style="text-align:right">500k</td>
* <td style="text-align:right">4900k</td>
* <td style="text-align:right">2565k</td>
* </tr>
* <tr>
* <td style="text-align:right">600k</td>
* <td style="text-align:right">5800k</td>
* <td style="text-align:right">3065k</td>
* </tr>
* <tr>
* <td style="text-align:right">700k</td>
* <td style="text-align:right">6700k</td>
* <td style="text-align:right">3565k</td>
* </tr>
* <tr>
* <td style="text-align:right">800k</td>
* <td style="text-align:right">7600k</td>
* <td style="text-align:right">4065k</td>
* </tr>
* <tr>
* <td style="text-align:right">900k</td>
* <td style="text-align:right">8500k</td>
* <td style="text-align:right">4565k</td>
* </tr>
* </table>
*
* <p>
* For decompression <code>CBZip2InputStream</code> allocates less memory if the
* bzipped input is smaller than one block.
* </p>
*
* <p>
* Instances of this class are not threadsafe.
* </p>
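 *
 * <p>A minimal usage sketch (stream and file names are illustrative);
 * note that the caller must write the two BZip2 magic bytes before
 * constructing the stream:</p>
 *
 * <pre>
 * OutputStream raw = new java.io.FileOutputStream("archive.bz2");
 * raw.write('B');
 * raw.write('Z');
 * CBZip2OutputStream bzOut = new CBZip2OutputStream(raw, 9);
 * bzOut.write(data);   // data is the byte[] to compress
 * bzOut.close();       // finishes compression and closes the underlying stream
 * </pre>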
*
* <p>
* TODO: Update to BZip2 1.0.1
* </p>
*
*/
public class CBZip2OutputStream extends OutputStream
implements BZip2Constants {
/**
* The minimum supported blocksize <code> == 1</code>.
*/
public static final int MIN_BLOCKSIZE = 1;
/**
* The maximum supported blocksize <code> == 9</code>.
*/
public static final int MAX_BLOCKSIZE = 9;
/**
* This constant is accessible by subclasses for historical
* purposes. If you don't know what it means then you don't need
* it.
*/
protected static final int SETMASK = (1 << 21);
/**
* This constant is accessible by subclasses for historical
* purposes. If you don't know what it means then you don't need
* it.
*/
protected static final int CLEARMASK = (~SETMASK);
/**
* This constant is accessible by subclasses for historical
* purposes. If you don't know what it means then you don't need
* it.
*/
protected static final int GREATER_ICOST = 15;
/**
* This constant is accessible by subclasses for historical
* purposes. If you don't know what it means then you don't need
* it.
*/
protected static final int LESSER_ICOST = 0;
/**
* This constant is accessible by subclasses for historical
* purposes. If you don't know what it means then you don't need
* it.
*/
protected static final int SMALL_THRESH = 20;
/**
* This constant is accessible by subclasses for historical
* purposes. If you don't know what it means then you don't need
* it.
*/
protected static final int DEPTH_THRESH = 10;
/**
* This constant is accessible by subclasses for historical
* purposes. If you don't know what it means then you don't need
* it.
*/
protected static final int WORK_FACTOR = 30;
/**
* This constant is accessible by subclasses for historical
* purposes. If you don't know what it means then you don't need
* it.
* <p>If you are ever unlucky/improbable enough to get a stack
* overflow whilst sorting, increase the following constant and
* try again. In practice I have never seen the stack go above 27
* elems, so the following limit seems very generous.</p>
*/
protected static final int QSORT_STACK_SIZE = 1000;
/**
* Knuth's increments seem to work better than Incerpi-Sedgewick here.
* Possibly because the number of elems to sort is usually small, typically
     * &lt;= 20.
*/
@SuppressWarnings("unused")
private static final int[] INCS = {1, 4, 13, 40, 121, 364, 1093, 3280,
9841, 29524, 88573, 265720, 797161,
2391484};
/**
* This method is accessible by subclasses for historical
* purposes. If you don't know what it does then you don't need
* it.
*
* @param len char[]
     * @param freq int[]
* @param alphaSize int
* @param maxLen int
*/
protected static void hbMakeCodeLengths(char[] len, int[] freq,
int alphaSize, int maxLen) {
/*
* Nodes and heap entries run from 1. Entry 0 for both the heap and
* nodes is a sentinel.
*/
final int[] heap = new int[MAX_ALPHA_SIZE * 2];
final int[] weight = new int[MAX_ALPHA_SIZE * 2];
final int[] parent = new int[MAX_ALPHA_SIZE * 2];
for (int i = alphaSize; --i >= 0;) {
weight[i + 1] = (freq[i] == 0 ? 1 : freq[i]) << 8;
}
for (boolean tooLong = true; tooLong;) {
tooLong = false;
int nNodes = alphaSize;
int nHeap = 0;
heap[0] = 0;
weight[0] = 0;
parent[0] = -2;
for (int i = 1; i <= alphaSize; i++) {
parent[i] = -1;
nHeap++;
heap[nHeap] = i;
int zz = nHeap;
int tmp = heap[zz];
while (weight[tmp] < weight[heap[zz >> 1]]) {
heap[zz] = heap[zz >> 1];
zz >>= 1;
}
heap[zz] = tmp;
}
// assert (nHeap < (MAX_ALPHA_SIZE + 2)) : nHeap;
while (nHeap > 1) {
int n1 = heap[1];
heap[1] = heap[nHeap];
nHeap--;
int yy = 0;
int zz = 1;
int tmp = heap[1];
while (true) {
yy = zz << 1;
if (yy > nHeap) {
break;
}
if ((yy < nHeap)
&& (weight[heap[yy + 1]] < weight[heap[yy]])) {
yy++;
}
if (weight[tmp] < weight[heap[yy]]) {
break;
}
heap[zz] = heap[yy];
zz = yy;
}
heap[zz] = tmp;
int n2 = heap[1];
heap[1] = heap[nHeap];
nHeap--;
yy = 0;
zz = 1;
tmp = heap[1];
while (true) {
yy = zz << 1;
if (yy > nHeap) {
break;
}
if ((yy < nHeap)
&& (weight[heap[yy + 1]] < weight[heap[yy]])) {
yy++;
}
if (weight[tmp] < weight[heap[yy]]) {
break;
}
heap[zz] = heap[yy];
zz = yy;
}
heap[zz] = tmp;
nNodes++;
parent[n1] = parent[n2] = nNodes;
final int weight_n1 = weight[n1];
final int weight_n2 = weight[n2];
weight[nNodes] = (((weight_n1 & 0xffffff00)
+ (weight_n2 & 0xffffff00))
|
(1 + (((weight_n1 & 0x000000ff)
> (weight_n2 & 0x000000ff))
? (weight_n1 & 0x000000ff)
: (weight_n2 & 0x000000ff))
));
parent[nNodes] = -1;
nHeap++;
heap[nHeap] = nNodes;
tmp = 0;
zz = nHeap;
tmp = heap[zz];
final int weight_tmp = weight[tmp];
while (weight_tmp < weight[heap[zz >> 1]]) {
heap[zz] = heap[zz >> 1];
zz >>= 1;
}
heap[zz] = tmp;
}
// assert (nNodes < (MAX_ALPHA_SIZE * 2)) : nNodes;
for (int i = 1; i <= alphaSize; i++) {
int j = 0;
int k = i;
for (int parent_k; (parent_k = parent[k]) >= 0;) {
k = parent_k;
j++;
}
len[i - 1] = (char) j;
if (j > maxLen) {
tooLong = true;
}
}
if (tooLong) {
for (int i = 1; i < alphaSize; i++) {
int j = weight[i] >> 8;
j = 1 + (j >> 1);
weight[i] = j << 8;
}
}
}
}
private static void hbMakeCodeLengths(final byte[] len, final int[] freq,
final Data dat, final int alphaSize,
final int maxLen) {
/*
* Nodes and heap entries run from 1. Entry 0 for both the heap and
* nodes is a sentinel.
*/
final int[] heap = dat.heap;
final int[] weight = dat.weight;
final int[] parent = dat.parent;
for (int i = alphaSize; --i >= 0;) {
weight[i + 1] = (freq[i] == 0 ? 1 : freq[i]) << 8;
}
for (boolean tooLong = true; tooLong;) {
tooLong = false;
int nNodes = alphaSize;
int nHeap = 0;
heap[0] = 0;
weight[0] = 0;
parent[0] = -2;
for (int i = 1; i <= alphaSize; i++) {
parent[i] = -1;
nHeap++;
heap[nHeap] = i;
int zz = nHeap;
int tmp = heap[zz];
while (weight[tmp] < weight[heap[zz >> 1]]) {
heap[zz] = heap[zz >> 1];
zz >>= 1;
}
heap[zz] = tmp;
}
while (nHeap > 1) {
int n1 = heap[1];
heap[1] = heap[nHeap];
nHeap--;
int yy = 0;
int zz = 1;
int tmp = heap[1];
while (true) {
yy = zz << 1;
if (yy > nHeap) {
break;
}
if ((yy < nHeap)
&& (weight[heap[yy + 1]] < weight[heap[yy]])) {
yy++;
}
if (weight[tmp] < weight[heap[yy]]) {
break;
}
heap[zz] = heap[yy];
zz = yy;
}
heap[zz] = tmp;
int n2 = heap[1];
heap[1] = heap[nHeap];
nHeap--;
yy = 0;
zz = 1;
tmp = heap[1];
while (true) {
yy = zz << 1;
if (yy > nHeap) {
break;
}
if ((yy < nHeap)
&& (weight[heap[yy + 1]] < weight[heap[yy]])) {
yy++;
}
if (weight[tmp] < weight[heap[yy]]) {
break;
}
heap[zz] = heap[yy];
zz = yy;
}
heap[zz] = tmp;
nNodes++;
parent[n1] = parent[n2] = nNodes;
final int weight_n1 = weight[n1];
final int weight_n2 = weight[n2];
weight[nNodes] = ((weight_n1 & 0xffffff00)
+ (weight_n2 & 0xffffff00))
| (1 + (((weight_n1 & 0x000000ff)
> (weight_n2 & 0x000000ff))
? (weight_n1 & 0x000000ff)
: (weight_n2 & 0x000000ff)));
parent[nNodes] = -1;
nHeap++;
heap[nHeap] = nNodes;
tmp = 0;
zz = nHeap;
tmp = heap[zz];
final int weight_tmp = weight[tmp];
while (weight_tmp < weight[heap[zz >> 1]]) {
heap[zz] = heap[zz >> 1];
zz >>= 1;
}
heap[zz] = tmp;
}
for (int i = 1; i <= alphaSize; i++) {
int j = 0;
int k = i;
for (int parent_k; (parent_k = parent[k]) >= 0;) {
k = parent_k;
j++;
}
len[i - 1] = (byte) j;
if (j > maxLen) {
tooLong = true;
}
}
if (tooLong) {
for (int i = 1; i < alphaSize; i++) {
int j = weight[i] >> 8;
j = 1 + (j >> 1);
weight[i] = j << 8;
}
}
}
}
/**
* Index of the last char in the block, so the block size == last + 1.
*/
private int last;
/**
* Always: in the range 0 .. 9. The current block size is 100000 * this
* number.
*/
private final int blockSize100k;
private int bsBuff;
private int bsLive;
private final CRC crc = new CRC();
private int nInUse;
private int nMTF;
private int currentChar = -1;
private int runLength = 0;
private int blockCRC;
private int combinedCRC;
private final int allowableBlockSize;
/**
* All memory intensive stuff.
*/
private Data data;
private BlockSort blockSorter;
private OutputStream out;
/**
* Chooses a blocksize based on the given length of the data to compress.
*
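     * <p>For example (these values follow directly from the formula
     * used in the implementation below):</p>
     *
     * <pre>
     * chooseBlockSize(100000);   // returns 1
     * chooseBlockSize(1000000);  // returns 8
     * chooseBlockSize(-1);       // returns MAX_BLOCKSIZE (9)
     * </pre>
     *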
* @param inputLength
* The length of the data which will be compressed by
* <code>CBZip2OutputStream</code>.
* @return The blocksize, between {@link #MIN_BLOCKSIZE} and
* {@link #MAX_BLOCKSIZE} both inclusive. For a negative
* <code>inputLength</code> this method returns <code>MAX_BLOCKSIZE</code>
* always.
*/
public static int chooseBlockSize(long inputLength) {
return (inputLength > 0) ? (int) Math
.min((inputLength / 132000) + 1, 9) : MAX_BLOCKSIZE;
}
/**
* Constructs a new <code>CBZip2OutputStream</code> with a blocksize of 900k.
*
* <p>
     * <b>Attention: </b>The caller is responsible for writing the two BZip2 magic
* bytes <code>"BZ"</code> to the specified stream prior to calling this
* constructor.
* </p>
*
     * @param out
* the destination stream.
*
* @throws IOException
* if an I/O error occurs in the specified stream.
* @throws NullPointerException
* if <code>out == null</code>.
*/
public CBZip2OutputStream(final OutputStream out) throws IOException {
this(out, MAX_BLOCKSIZE);
}
/**
* Constructs a new <code>CBZip2OutputStream</code> with specified blocksize.
*
* <p>
     * <b>Attention: </b>The caller is responsible for writing the two BZip2 magic
* bytes <code>"BZ"</code> to the specified stream prior to calling this
* constructor.
* </p>
*
*
* @param out
* the destination stream.
* @param blockSize
* the blockSize as 100k units.
*
* @throws IOException
* if an I/O error occurs in the specified stream.
* @throws IllegalArgumentException
* if <code>(blockSize < 1) || (blockSize > 9)</code>.
* @throws NullPointerException
* if <code>out == null</code>.
*
* @see #MIN_BLOCKSIZE
* @see #MAX_BLOCKSIZE
*/
public CBZip2OutputStream(final OutputStream out, final int blockSize)
throws IOException {
super();
if (blockSize < 1) {
throw new IllegalArgumentException("blockSize(" + blockSize
+ ") < 1");
}
if (blockSize > 9) {
throw new IllegalArgumentException("blockSize(" + blockSize
+ ") > 9");
}
this.blockSize100k = blockSize;
this.out = out;
/* 20 is just a paranoia constant */
this.allowableBlockSize = (this.blockSize100k * BZip2Constants.baseBlockSize) - 20;
init();
}
    /** {@inheritDoc} */
@Override
public void write(final int b) throws IOException {
if (this.out != null) {
write0(b);
} else {
throw new IOException("closed");
}
}
/**
* Writes the current byte to the buffer, run-length encoding it
* if it has been repeated at least four times (the first step
* RLEs sequences of four identical bytes).
*
* <p>Flushes the current block before writing data if it is
* full.</p>
*
* <p>"write to the buffer" means adding to data.buffer starting
* two steps "after" this.last - initially starting at index 1
* (not 0) - and updating this.last to point to the last index
* written minus 1.</p>
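     *
     * <p>For example, a run of seven identical bytes <code>X</code>
     * (runLength == 7) is stored as the four literal bytes
     * <code>X X X X</code> followed by a count byte of 3 (the run
     * length minus 4), as handled by the <code>default</code> branch
     * below.</p>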
*/
private void writeRun() throws IOException {
final int lastShadow = this.last;
if (lastShadow < this.allowableBlockSize) {
final int currentCharShadow = this.currentChar;
final Data dataShadow = this.data;
dataShadow.inUse[currentCharShadow] = true;
final byte ch = (byte) currentCharShadow;
int runLengthShadow = this.runLength;
this.crc.updateCRC(currentCharShadow, runLengthShadow);
final byte[] block = dataShadow.block;
switch (runLengthShadow) {
case 1:
block[lastShadow + 2] = ch;
this.last = lastShadow + 1;
break;
case 2:
block[lastShadow + 2] = ch;
block[lastShadow + 3] = ch;
this.last = lastShadow + 2;
break;
case 3:
block[lastShadow + 2] = ch;
block[lastShadow + 3] = ch;
block[lastShadow + 4] = ch;
this.last = lastShadow + 3;
break;
default:
runLengthShadow -= 4;
dataShadow.inUse[runLengthShadow] = true;
block[lastShadow + 2] = ch;
block[lastShadow + 3] = ch;
block[lastShadow + 4] = ch;
block[lastShadow + 5] = ch;
block[lastShadow + 6] = (byte) runLengthShadow;
this.last = lastShadow + 5;
break;
}
} else {
endBlock();
initBlock();
writeRun();
}
}
/**
     * Overridden to finish the compressed output if the stream gets
     * finalized without having been closed.
*/
@Override
protected void finalize() throws Throwable {
finish();
super.finalize();
}
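    /**
     * Finishes the compressed output: flushes any pending run, writes the
     * final block and the end-of-stream trailer, and releases the internal
     * buffers. Does not close the underlying stream.
     */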
public void finish() throws IOException {
if (out != null) {
try {
if (this.runLength > 0) {
writeRun();
}
this.currentChar = -1;
endBlock();
endCompression();
} finally {
this.out = null;
this.data = null;
this.blockSorter = null;
}
}
}
@Override
public void close() throws IOException {
if (out != null) {
OutputStream outShadow = this.out;
finish();
outShadow.close();
}
}
@Override
public void flush() throws IOException {
OutputStream outShadow = this.out;
if (outShadow != null) {
outShadow.flush();
}
}
private void init() throws IOException {
// write magic: done by caller who created this stream
// this.out.write('B');
// this.out.write('Z');
this.data = new Data(this.blockSize100k);
this.blockSorter = new BlockSort(this.data);
/*
* Write `magic' bytes h indicating file-format == huffmanised, followed
* by a digit indicating blockSize100k.
*/
bsPutUByte('h');
bsPutUByte('0' + this.blockSize100k);
this.combinedCRC = 0;
initBlock();
}
private void initBlock() {
// blockNo++;
this.crc.initialiseCRC();
this.last = -1;
// ch = 0;
boolean[] inUse = this.data.inUse;
for (int i = 256; --i >= 0;) {
inUse[i] = false;
}
}
private void endBlock() throws IOException {
this.blockCRC = this.crc.getFinalCRC();
this.combinedCRC = (this.combinedCRC << 1) | (this.combinedCRC >>> 31);
this.combinedCRC ^= this.blockCRC;
// empty block at end of file
if (this.last == -1) {
return;
}
/* sort the block and establish posn of original string */
blockSort();
/*
* A 6-byte block header, the value chosen arbitrarily as 0x314159265359
* :-). A 32 bit value does not really give a strong enough guarantee
* that the value will not appear by chance in the compressed
* datastream. Worst-case probability of this event, for a 900k block,
* is about 2.0e-3 for 32 bits, 1.0e-5 for 40 bits and 4.0e-8 for 48
* bits. For a compressed file of size 100Gb -- about 100000 blocks --
* only a 48-bit marker will do. NB: normal compression/ decompression
* do not rely on these statistical properties. They are only important
* when trying to recover blocks from damaged files.
*/
bsPutUByte(0x31);
bsPutUByte(0x41);
bsPutUByte(0x59);
bsPutUByte(0x26);
bsPutUByte(0x53);
bsPutUByte(0x59);
/* Now the block's CRC, so it is in a known place. */
bsPutInt(this.blockCRC);
/* Now a single bit indicating no randomisation. */
bsW(1, 0);
/* Finally, block's contents proper. */
moveToFrontCodeAndSend();
}
private void endCompression() throws IOException {
/*
* Now another magic 48-bit number, 0x177245385090, to indicate the end
* of the last block. (sqrt(pi), if you want to know. I did want to use
* e, but it contains too much repetition -- 27 18 28 18 28 46 -- for me
* to feel statistically comfortable. Call me paranoid.)
*/
bsPutUByte(0x17);
bsPutUByte(0x72);
bsPutUByte(0x45);
bsPutUByte(0x38);
bsPutUByte(0x50);
bsPutUByte(0x90);
bsPutInt(this.combinedCRC);
bsFinishedWithStream();
}
/**
* Returns the blocksize parameter specified at construction time.
*
     * @return the blocksize in 100k units
*/
public final int getBlockSize() {
return this.blockSize100k;
}
@Override
public void write(final byte[] buf, int offs, final int len)
throws IOException {
if (offs < 0) {
throw new IndexOutOfBoundsException("offs(" + offs + ") < 0.");
}
if (len < 0) {
throw new IndexOutOfBoundsException("len(" + len + ") < 0.");
}
if (offs + len > buf.length) {
throw new IndexOutOfBoundsException("offs(" + offs + ") + len("
+ len + ") > buf.length("
+ buf.length + ").");
}
if (this.out == null) {
throw new IOException("stream closed");
}
for (int hi = offs + len; offs < hi;) {
write0(buf[offs++]);
}
}
/**
* Keeps track of the last bytes written and implicitly performs
* run-length encoding as the first step of the bzip2 algorithm.
*/
private void write0(int b) throws IOException {
if (this.currentChar != -1) {
b &= 0xff;
if (this.currentChar == b) {
if (++this.runLength > 254) {
writeRun();
this.currentChar = -1;
this.runLength = 0;
}
// else nothing to do
} else {
writeRun();
this.runLength = 1;
this.currentChar = b;
}
} else {
this.currentChar = b & 0xff;
this.runLength++;
}
}
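    /*
     * Assigns canonical Huffman codes from the given code lengths: symbols
     * are visited in order of increasing length, each receives the next
     * code value in sequence, and the running value is doubled (vec <<= 1)
     * when moving on to the next length.
     */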
private static void hbAssignCodes(final int[] code, final byte[] length,
final int minLen, final int maxLen,
final int alphaSize) {
int vec = 0;
for (int n = minLen; n <= maxLen; n++) {
for (int i = 0; i < alphaSize; i++) {
if ((length[i] & 0xff) == n) {
code[i] = vec;
vec++;
}
}
vec <<= 1;
}
}
private void bsFinishedWithStream() throws IOException {
while (this.bsLive > 0) {
int ch = this.bsBuff >> 24;
this.out.write(ch); // write 8-bit
this.bsBuff <<= 8;
this.bsLive -= 8;
}
}
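    /*
     * Writes the n low-order bits of v to the bit stream, most significant
     * bit first, flushing complete bytes from the top of bsBuff as they
     * become available.
     */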
private void bsW(final int n, final int v) throws IOException {
final OutputStream outShadow = this.out;
int bsLiveShadow = this.bsLive;
int bsBuffShadow = this.bsBuff;
while (bsLiveShadow >= 8) {
outShadow.write(bsBuffShadow >> 24); // write 8-bit
bsBuffShadow <<= 8;
bsLiveShadow -= 8;
}
this.bsBuff = bsBuffShadow | (v << (32 - bsLiveShadow - n));
this.bsLive = bsLiveShadow + n;
}
private void bsPutUByte(final int c) throws IOException {
bsW(8, c);
}
private void bsPutInt(final int u) throws IOException {
bsW(8, (u >> 24) & 0xff);
bsW(8, (u >> 16) & 0xff);
bsW(8, (u >> 8) & 0xff);
bsW(8, u & 0xff);
}
private void sendMTFValues() throws IOException {
final byte[][] len = this.data.sendMTFValues_len;
final int alphaSize = this.nInUse + 2;
for (int t = N_GROUPS; --t >= 0;) {
byte[] len_t = len[t];
for (int v = alphaSize; --v >= 0;) {
len_t[v] = GREATER_ICOST;
}
}
/* Decide how many coding tables to use */
// assert (this.nMTF > 0) : this.nMTF;
final int nGroups = (this.nMTF < 200) ? 2 : (this.nMTF < 600) ? 3
: (this.nMTF < 1200) ? 4 : (this.nMTF < 2400) ? 5 : 6;
/* Generate an initial set of coding tables */
sendMTFValues0(nGroups, alphaSize);
/*
* Iterate up to N_ITERS times to improve the tables.
*/
final int nSelectors = sendMTFValues1(nGroups, alphaSize);
/* Compute MTF values for the selectors. */
sendMTFValues2(nGroups, nSelectors);
/* Assign actual codes for the tables. */
sendMTFValues3(nGroups, alphaSize);
/* Transmit the mapping table. */
sendMTFValues4();
/* Now the selectors. */
sendMTFValues5(nGroups, nSelectors);
/* Now the coding tables. */
sendMTFValues6(nGroups, alphaSize);
/* And finally, the block data proper */
sendMTFValues7();
}
private void sendMTFValues0(final int nGroups, final int alphaSize) {
final byte[][] len = this.data.sendMTFValues_len;
final int[] mtfFreq = this.data.mtfFreq;
int remF = this.nMTF;
int gs = 0;
for (int nPart = nGroups; nPart > 0; nPart--) {
final int tFreq = remF / nPart;
int ge = gs - 1;
int aFreq = 0;
while (aFreq < tFreq && ge < alphaSize - 1) {
aFreq += mtfFreq[++ge];
}
if (ge > gs && nPart != nGroups && nPart != 1 && (nGroups - nPart & 1) != 0) {
aFreq -= mtfFreq[ge--];
}
final byte[] len_np = len[nPart - 1];
for (int v = alphaSize; --v >= 0;) {
if (v >= gs && v <= ge) {
len_np[v] = LESSER_ICOST;
} else {
len_np[v] = GREATER_ICOST;
}
}
gs = ge + 1;
remF -= aFreq;
}
}
private int sendMTFValues1(final int nGroups, final int alphaSize) {
final Data dataShadow = this.data;
final int[][] rfreq = dataShadow.sendMTFValues_rfreq;
final int[] fave = dataShadow.sendMTFValues_fave;
final short[] cost = dataShadow.sendMTFValues_cost;
final char[] sfmap = dataShadow.sfmap;
final byte[] selector = dataShadow.selector;
final byte[][] len = dataShadow.sendMTFValues_len;
final byte[] len_0 = len[0];
final byte[] len_1 = len[1];
final byte[] len_2 = len[2];
final byte[] len_3 = len[3];
final byte[] len_4 = len[4];
final byte[] len_5 = len[5];
final int nMTFShadow = this.nMTF;
int nSelectors = 0;
for (int iter = 0; iter < N_ITERS; iter++) {
for (int t = nGroups; --t >= 0;) {
fave[t] = 0;
int[] rfreqt = rfreq[t];
for (int i = alphaSize; --i >= 0;) {
rfreqt[i] = 0;
}
}
nSelectors = 0;
for (int gs = 0; gs < this.nMTF;) {
/* Set group start & end marks. */
/*
* Calculate the cost of this group as coded by each of the
* coding tables.
*/
final int ge = Math.min(gs + G_SIZE - 1, nMTFShadow - 1);
if (nGroups == N_GROUPS) {
// unrolled version of the else-block
short cost0 = 0;
short cost1 = 0;
short cost2 = 0;
short cost3 = 0;
short cost4 = 0;
short cost5 = 0;
for (int i = gs; i <= ge; i++) {
final int icv = sfmap[i];
cost0 += len_0[icv] & 0xff;
cost1 += len_1[icv] & 0xff;
cost2 += len_2[icv] & 0xff;
cost3 += len_3[icv] & 0xff;
cost4 += len_4[icv] & 0xff;
cost5 += len_5[icv] & 0xff;
}
cost[0] = cost0;
cost[1] = cost1;
cost[2] = cost2;
cost[3] = cost3;
cost[4] = cost4;
cost[5] = cost5;
} else {
for (int t = nGroups; --t >= 0;) {
cost[t] = 0;
}
for (int i = gs; i <= ge; i++) {
final int icv = sfmap[i];
for (int t = nGroups; --t >= 0;) {
cost[t] += len[t][icv] & 0xff;
}
}
}
/*
* Find the coding table which is best for this group, and
* record its identity in the selector table.
*/
int bt = -1;
for (int t = nGroups, bc = 999999999; --t >= 0;) {
final int cost_t = cost[t];
if (cost_t < bc) {
bc = cost_t;
bt = t;
}
}
fave[bt]++;
selector[nSelectors] = (byte) bt;
nSelectors++;
/*
* Increment the symbol frequencies for the selected table.
*/
final int[] rfreq_bt = rfreq[bt];
for (int i = gs; i <= ge; i++) {
rfreq_bt[sfmap[i]]++;
}
gs = ge + 1;
}
/*
* Recompute the tables based on the accumulated frequencies.
*/
for (int t = 0; t < nGroups; t++) {
hbMakeCodeLengths(len[t], rfreq[t], this.data, alphaSize, 20);
}
}
return nSelectors;
}
private void sendMTFValues2(final int nGroups, final int nSelectors) {
// assert (nGroups < 8) : nGroups;
final Data dataShadow = this.data;
byte[] pos = dataShadow.sendMTFValues2_pos;
for (int i = nGroups; --i >= 0;) {
pos[i] = (byte) i;
}
for (int i = 0; i < nSelectors; i++) {
final byte ll_i = dataShadow.selector[i];
byte tmp = pos[0];
int j = 0;
while (ll_i != tmp) {
j++;
byte tmp2 = tmp;
tmp = pos[j];
pos[j] = tmp2;
}
pos[0] = tmp;
dataShadow.selectorMtf[i] = (byte) j;
}
}
private void sendMTFValues3(final int nGroups, final int alphaSize) {
int[][] code = this.data.sendMTFValues_code;
byte[][] len = this.data.sendMTFValues_len;
for (int t = 0; t < nGroups; t++) {
int minLen = 32;
int maxLen = 0;
final byte[] len_t = len[t];
for (int i = alphaSize; --i >= 0;) {
final int l = len_t[i] & 0xff;
if (l > maxLen) {
maxLen = l;
}
if (l < minLen) {
minLen = l;
}
}
// assert (maxLen <= 20) : maxLen;
// assert (minLen >= 1) : minLen;
hbAssignCodes(code[t], len[t], minLen, maxLen, alphaSize);
}
}
private void sendMTFValues4() throws IOException {
final boolean[] inUse = this.data.inUse;
final boolean[] inUse16 = this.data.sentMTFValues4_inUse16;
for (int i = 16; --i >= 0;) {
inUse16[i] = false;
final int i16 = i * 16;
for (int j = 16; --j >= 0;) {
if (inUse[i16 + j]) {
inUse16[i] = true;
}
}
}
for (int i = 0; i < 16; i++) {
bsW(1, inUse16[i] ? 1 : 0);
}
final OutputStream outShadow = this.out;
int bsLiveShadow = this.bsLive;
int bsBuffShadow = this.bsBuff;
for (int i = 0; i < 16; i++) {
if (inUse16[i]) {
final int i16 = i * 16;
for (int j = 0; j < 16; j++) {
// inlined: bsW(1, inUse[i16 + j] ? 1 : 0);
while (bsLiveShadow >= 8) {
outShadow.write(bsBuffShadow >> 24); // write 8-bit
bsBuffShadow <<= 8;
bsLiveShadow -= 8;
}
if (inUse[i16 + j]) {
bsBuffShadow |= 1 << (32 - bsLiveShadow - 1);
}
bsLiveShadow++;
}
}
}
this.bsBuff = bsBuffShadow;
this.bsLive = bsLiveShadow;
}
private void sendMTFValues5(final int nGroups, final int nSelectors)
throws IOException {
bsW(3, nGroups);
bsW(15, nSelectors);
final OutputStream outShadow = this.out;
final byte[] selectorMtf = this.data.selectorMtf;
int bsLiveShadow = this.bsLive;
int bsBuffShadow = this.bsBuff;
for (int i = 0; i < nSelectors; i++) {
for (int j = 0, hj = selectorMtf[i] & 0xff; j < hj; j++) {
// inlined: bsW(1, 1);
while (bsLiveShadow >= 8) {
outShadow.write(bsBuffShadow >> 24);
bsBuffShadow <<= 8;
bsLiveShadow -= 8;
}
bsBuffShadow |= 1 << (32 - bsLiveShadow - 1);
bsLiveShadow++;
}
// inlined: bsW(1, 0);
while (bsLiveShadow >= 8) {
outShadow.write(bsBuffShadow >> 24);
bsBuffShadow <<= 8;
bsLiveShadow -= 8;
}
// bsBuffShadow |= 0 << (32 - bsLiveShadow - 1);
bsLiveShadow++;
}
this.bsBuff = bsBuffShadow;
this.bsLive = bsLiveShadow;
}
private void sendMTFValues6(final int nGroups, final int alphaSize)
throws IOException {
final byte[][] len = this.data.sendMTFValues_len;
final OutputStream outShadow = this.out;
int bsLiveShadow = this.bsLive;
int bsBuffShadow = this.bsBuff;
for (int t = 0; t < nGroups; t++) {
byte[] len_t = len[t];
int curr = len_t[0] & 0xff;
// inlined: bsW(5, curr);
while (bsLiveShadow >= 8) {
outShadow.write(bsBuffShadow >> 24); // write 8-bit
bsBuffShadow <<= 8;
bsLiveShadow -= 8;
}
bsBuffShadow |= curr << (32 - bsLiveShadow - 5);
bsLiveShadow += 5;
for (int i = 0; i < alphaSize; i++) {
int lti = len_t[i] & 0xff;
while (curr < lti) {
// inlined: bsW(2, 2);
while (bsLiveShadow >= 8) {
outShadow.write(bsBuffShadow >> 24); // write 8-bit
bsBuffShadow <<= 8;
bsLiveShadow -= 8;
}
bsBuffShadow |= 2 << (32 - bsLiveShadow - 2);
bsLiveShadow += 2;
curr++; /* 10 */
}
while (curr > lti) {
// inlined: bsW(2, 3);
while (bsLiveShadow >= 8) {
outShadow.write(bsBuffShadow >> 24); // write 8-bit
bsBuffShadow <<= 8;
bsLiveShadow -= 8;
}
bsBuffShadow |= 3 << (32 - bsLiveShadow - 2);
bsLiveShadow += 2;
curr--; /* 11 */
}
// inlined: bsW(1, 0);
while (bsLiveShadow >= 8) {
outShadow.write(bsBuffShadow >> 24); // write 8-bit
bsBuffShadow <<= 8;
bsLiveShadow -= 8;
}
// bsBuffShadow |= 0 << (32 - bsLiveShadow - 1);
bsLiveShadow++;
}
}
this.bsBuff = bsBuffShadow;
this.bsLive = bsLiveShadow;
}
private void sendMTFValues7() throws IOException {
final Data dataShadow = this.data;
final byte[][] len = dataShadow.sendMTFValues_len;
final int[][] code = dataShadow.sendMTFValues_code;
final OutputStream outShadow = this.out;
final byte[] selector = dataShadow.selector;
final char[] sfmap = dataShadow.sfmap;
final int nMTFShadow = this.nMTF;
int selCtr = 0;
int bsLiveShadow = this.bsLive;
int bsBuffShadow = this.bsBuff;
for (int gs = 0; gs < nMTFShadow;) {
final int ge = Math.min(gs + G_SIZE - 1, nMTFShadow - 1);
final int selector_selCtr = selector[selCtr] & 0xff;
final int[] code_selCtr = code[selector_selCtr];
final byte[] len_selCtr = len[selector_selCtr];
while (gs <= ge) {
final int sfmap_i = sfmap[gs];
//
// inlined: bsW(len_selCtr[sfmap_i] & 0xff,
// code_selCtr[sfmap_i]);
//
while (bsLiveShadow >= 8) {
outShadow.write(bsBuffShadow >> 24);
bsBuffShadow <<= 8;
bsLiveShadow -= 8;
}
final int n = len_selCtr[sfmap_i] & 0xFF;
bsBuffShadow |= code_selCtr[sfmap_i] << (32 - bsLiveShadow - n);
bsLiveShadow += n;
gs++;
}
gs = ge + 1;
selCtr++;
}
this.bsBuff = bsBuffShadow;
this.bsLive = bsLiveShadow;
}
private void moveToFrontCodeAndSend() throws IOException {
bsW(24, this.data.origPtr);
generateMTFValues();
sendMTFValues();
}
private void blockSort() {
blockSorter.blockSort(data, last);
}
/*
* Performs Move-To-Front on the Burrows-Wheeler transformed
* buffer, storing the MTFed data in data.sfmap in RUNA/RUNB
* run-length-encoded form.
*
* <p>Keeps track of byte frequencies in data.mtfFreq at the same time.</p>
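     *
     * <p>Runs of zeroes (MTF value 0) are encoded with the RUNA/RUNB
     * symbols in what is effectively a bijective base-2 form; derived
     * from the loop below, a run of 1 becomes RUNA, 2 becomes RUNB,
     * 3 becomes RUNA RUNA and 4 becomes RUNB RUNA.</p>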
*/
private void generateMTFValues() {
final int lastShadow = this.last;
final Data dataShadow = this.data;
final boolean[] inUse = dataShadow.inUse;
final byte[] block = dataShadow.block;
final int[] fmap = dataShadow.fmap;
final char[] sfmap = dataShadow.sfmap;
final int[] mtfFreq = dataShadow.mtfFreq;
final byte[] unseqToSeq = dataShadow.unseqToSeq;
final byte[] yy = dataShadow.generateMTFValues_yy;
// make maps
int nInUseShadow = 0;
for (int i = 0; i < 256; i++) {
if (inUse[i]) {
unseqToSeq[i] = (byte) nInUseShadow;
nInUseShadow++;
}
}
this.nInUse = nInUseShadow;
final int eob = nInUseShadow + 1;
for (int i = eob; i >= 0; i--) {
mtfFreq[i] = 0;
}
for (int i = nInUseShadow; --i >= 0;) {
yy[i] = (byte) i;
}
int wr = 0;
int zPend = 0;
for (int i = 0; i <= lastShadow; i++) {
final byte ll_i = unseqToSeq[block[fmap[i]] & 0xff];
byte tmp = yy[0];
int j = 0;
while (ll_i != tmp) {
j++;
byte tmp2 = tmp;
tmp = yy[j];
yy[j] = tmp2;
}
yy[0] = tmp;
if (j == 0) {
zPend++;
} else {
if (zPend > 0) {
zPend--;
while (true) {
if ((zPend & 1) == 0) {
sfmap[wr] = RUNA;
wr++;
mtfFreq[RUNA]++;
} else {
sfmap[wr] = RUNB;
wr++;
mtfFreq[RUNB]++;
}
if (zPend >= 2) {
zPend = (zPend - 2) >> 1;
} else {
break;
}
}
zPend = 0;
}
sfmap[wr] = (char) (j + 1);
wr++;
mtfFreq[j + 1]++;
}
}
if (zPend > 0) {
zPend--;
while (true) {
if ((zPend & 1) == 0) {
sfmap[wr] = RUNA;
wr++;
mtfFreq[RUNA]++;
} else {
sfmap[wr] = RUNB;
wr++;
mtfFreq[RUNB]++;
}
if (zPend >= 2) {
zPend = (zPend - 2) >> 1;
} else {
break;
}
}
}
sfmap[wr] = (char) eob;
mtfFreq[eob]++;
this.nMTF = wr + 1;
}
static final class Data {
// with blockSize 900k
/* maps unsigned byte => "does it occur in block" */
final boolean[] inUse = new boolean[256]; // 256 byte
final byte[] unseqToSeq = new byte[256]; // 256 byte
final int[] mtfFreq = new int[MAX_ALPHA_SIZE]; // 1032 byte
final byte[] selector = new byte[MAX_SELECTORS]; // 18002 byte
final byte[] selectorMtf = new byte[MAX_SELECTORS]; // 18002 byte
final byte[] generateMTFValues_yy = new byte[256]; // 256 byte
final byte[][] sendMTFValues_len = new byte[N_GROUPS][MAX_ALPHA_SIZE]; // 1548
// byte
final int[][] sendMTFValues_rfreq = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192
// byte
final int[] sendMTFValues_fave = new int[N_GROUPS]; // 24 byte
final short[] sendMTFValues_cost = new short[N_GROUPS]; // 12 byte
final int[][] sendMTFValues_code = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192
// byte
final byte[] sendMTFValues2_pos = new byte[N_GROUPS]; // 6 byte
final boolean[] sentMTFValues4_inUse16 = new boolean[16]; // 16 byte
final int[] heap = new int[MAX_ALPHA_SIZE + 2]; // 1040 byte
final int[] weight = new int[MAX_ALPHA_SIZE * 2]; // 2064 byte
final int[] parent = new int[MAX_ALPHA_SIZE * 2]; // 2064 byte
// ------------
// 333408 byte
/* holds the RLEd block of original data starting at index 1.
* After sorting the last byte added to the buffer is at index
* 0. */
final byte[] block; // 900021 byte
/* maps index in Burrows-Wheeler transformed block => index of
* byte in original block */
final int[] fmap; // 3600000 byte
final char[] sfmap; // 3600000 byte
// ------------
// 8433529 byte
// ============
/**
* Index of original line in Burrows-Wheeler table.
*
* <p>This is the index in fmap that points to the last byte
* of the original data.</p>
*/
int origPtr;
Data(int blockSize100k) {
super();
final int n = blockSize100k * BZip2Constants.baseBlockSize;
this.block = new byte[(n + 1 + NUM_OVERSHOOT_BYTES)];
this.fmap = new int[n];
this.sfmap = new char[2 * n];
}
}
}