/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.commons.compress.compressors.snappy;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.compress.compressors.lz77support.AbstractLZ77CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
CompressorInputStream for the raw Snappy format.
This implementation uses an internal buffer in order to handle
the back-references that are at the heart of the LZ77 algorithm.
The size of the buffer must be at least as big as the biggest
offset used in the compressed stream. The current version of the
Snappy algorithm as defined by Google works on 32k blocks and
doesn't contain offsets bigger than 32k which is the default block
size used by this class.
See Also: Since: 1.7
/**
* CompressorInputStream for the raw Snappy format.
*
* <p>This implementation uses an internal buffer in order to handle
* the back-references that are at the heart of the LZ77 algorithm.
* The size of the buffer must be at least as big as the biggest
* offset used in the compressed stream. The current version of the
* Snappy algorithm as defined by Google works on 32k blocks and
* doesn't contain offsets bigger than 32k which is the default block
* size used by this class.</p>
*
* @see <a href="https://github.com/google/snappy/blob/master/format_description.txt">Snappy compressed format description</a>
* @since 1.7
*/
public class SnappyCompressorInputStream extends AbstractLZ77CompressorInputStream {
Mask used to determine the type of "tag" is being processed /** Mask used to determine the type of "tag" is being processed */
private static final int TAG_MASK = 0x03;
Default block size /** Default block size */
public static final int DEFAULT_BLOCK_SIZE = 32768;
The size of the uncompressed data /** The size of the uncompressed data */
private final int size;
Number of uncompressed bytes still to be read. /** Number of uncompressed bytes still to be read. */
private int uncompressedBytesRemaining;
Current state of the stream /** Current state of the stream */
private State state = State.NO_BLOCK;
private boolean endReached = false;
Constructor using the default buffer size of 32k.
Params: - is –
An InputStream to read compressed data from
Throws: - IOException – if reading fails
/**
* Constructor using the default buffer size of 32k.
*
* @param is
* An InputStream to read compressed data from
*
* @throws IOException if reading fails
*/
public SnappyCompressorInputStream(final InputStream is) throws IOException {
this(is, DEFAULT_BLOCK_SIZE);
}
Constructor using a configurable buffer size.
Params: - is –
An InputStream to read compressed data from
- blockSize –
The block size used in compression
Throws: - IOException – if reading fails
- IllegalArgumentException – if blockSize is not bigger than 0
/**
* Constructor using a configurable buffer size.
*
* @param is
* An InputStream to read compressed data from
* @param blockSize
* The block size used in compression
*
* @throws IOException if reading fails
* @throws IllegalArgumentException if blockSize is not bigger than 0
*/
public SnappyCompressorInputStream(final InputStream is, final int blockSize)
throws IOException {
super(is, blockSize);
uncompressedBytesRemaining = size = (int) readSize();
}
{@inheritDoc}
/**
* {@inheritDoc}
*/
@Override
public int read(final byte[] b, final int off, final int len) throws IOException {
if (len == 0) {
return 0;
}
if (endReached) {
return -1;
}
switch (state) {
case NO_BLOCK:
fill();
return read(b, off, len);
case IN_LITERAL:
int litLen = readLiteral(b, off, len);
if (!hasMoreDataInBlock()) {
state = State.NO_BLOCK;
}
return litLen > 0 ? litLen : read(b, off, len);
case IN_BACK_REFERENCE:
int backReferenceLen = readBackReference(b, off, len);
if (!hasMoreDataInBlock()) {
state = State.NO_BLOCK;
}
return backReferenceLen > 0 ? backReferenceLen : read(b, off, len);
default:
throw new IOException("Unknown stream state " + state);
}
}
Try to fill the buffer with the next block of data.
/**
* Try to fill the buffer with the next block of data.
*/
private void fill() throws IOException {
if (uncompressedBytesRemaining == 0) {
endReached = true;
return;
}
int b = readOneByte();
if (b == -1) {
throw new IOException("Premature end of stream reading block start");
}
int length = 0;
int offset = 0;
switch (b & TAG_MASK) {
case 0x00:
length = readLiteralLength(b);
if (length < 0) {
throw new IOException("Illegal block with a negative literal size found");
}
uncompressedBytesRemaining -= length;
startLiteral(length);
state = State.IN_LITERAL;
break;
case 0x01:
/*
* These elements can encode lengths between [4..11] bytes and
* offsets between [0..2047] bytes. (len-4) occupies three bits
* and is stored in bits [2..4] of the tag byte. The offset
* occupies 11 bits, of which the upper three are stored in the
* upper three bits ([5..7]) of the tag byte, and the lower
* eight are stored in a byte following the tag byte.
*/
length = 4 + ((b >> 2) & 0x07);
if (length < 0) {
throw new IOException("Illegal block with a negative match length found");
}
uncompressedBytesRemaining -= length;
offset = (b & 0xE0) << 3;
b = readOneByte();
if (b == -1) {
throw new IOException("Premature end of stream reading back-reference length");
}
offset |= b;
try {
startBackReference(offset, length);
} catch (IllegalArgumentException ex) {
throw new IOException("Illegal block with bad offset found", ex);
}
state = State.IN_BACK_REFERENCE;
break;
case 0x02:
/*
* These elements can encode lengths between [1..64] and offsets
* from [0..65535]. (len-1) occupies six bits and is stored in
* the upper six bits ([2..7]) of the tag byte. The offset is
* stored as a little-endian 16-bit integer in the two bytes
* following the tag byte.
*/
length = (b >> 2) + 1;
if (length < 0) {
throw new IOException("Illegal block with a negative match length found");
}
uncompressedBytesRemaining -= length;
offset = (int) ByteUtils.fromLittleEndian(supplier, 2);
try {
startBackReference(offset, length);
} catch (IllegalArgumentException ex) {
throw new IOException("Illegal block with bad offset found", ex);
}
state = State.IN_BACK_REFERENCE;
break;
case 0x03:
/*
* These are like the copies with 2-byte offsets (see previous
* subsection), except that the offset is stored as a 32-bit
* integer instead of a 16-bit integer (and thus will occupy
* four bytes).
*/
length = (b >> 2) + 1;
if (length < 0) {
throw new IOException("Illegal block with a negative match length found");
}
uncompressedBytesRemaining -= length;
offset = (int) ByteUtils.fromLittleEndian(supplier, 4) & 0x7fffffff;
try {
startBackReference(offset, length);
} catch (IllegalArgumentException ex) {
throw new IOException("Illegal block with bad offset found", ex);
}
state = State.IN_BACK_REFERENCE;
break;
default:
// impossible as TAG_MASK is two bits and all four possible cases have been covered
break;
}
}
/*
* For literals up to and including 60 bytes in length, the
* upper six bits of the tag byte contain (len-1). The literal
* follows immediately thereafter in the bytestream. - For
* longer literals, the (len-1) value is stored after the tag
* byte, little-endian. The upper six bits of the tag byte
* describe how many bytes are used for the length; 60, 61, 62
* or 63 for 1-4 bytes, respectively. The literal itself follows
* after the length.
*/
private int readLiteralLength(final int b) throws IOException {
int length;
switch (b >> 2) {
case 60:
length = readOneByte();
if (length == -1) {
throw new IOException("Premature end of stream reading literal length");
}
break;
case 61:
length = (int) ByteUtils.fromLittleEndian(supplier, 2);
break;
case 62:
length = (int) ByteUtils.fromLittleEndian(supplier, 3);
break;
case 63:
length = (int) ByteUtils.fromLittleEndian(supplier, 4);
break;
default:
length = b >> 2;
break;
}
return length + 1;
}
The stream starts with the uncompressed length (up to a maximum of 2^32 -
1), stored as a little-endian varint. Varints consist of a series of
bytes, where the lower 7 bits are data and the upper bit is set iff there
are more bytes to be read. In other words, an uncompressed length of 64
would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
would be stored as 0xFE 0xFF 0x7F.
Throws: - IOException –
Could not read a byte
Returns: The size of the uncompressed data
/**
* The stream starts with the uncompressed length (up to a maximum of 2^32 -
* 1), stored as a little-endian varint. Varints consist of a series of
* bytes, where the lower 7 bits are data and the upper bit is set iff there
* are more bytes to be read. In other words, an uncompressed length of 64
* would be stored as 0x40, and an uncompressed length of 2097150 (0x1FFFFE)
* would be stored as 0xFE 0xFF 0x7F.
*
* @return The size of the uncompressed data
*
* @throws IOException
* Could not read a byte
*/
private long readSize() throws IOException {
int index = 0;
long sz = 0;
int b = 0;
do {
b = readOneByte();
if (b == -1) {
throw new IOException("Premature end of stream reading size");
}
sz |= (b & 0x7f) << (index++ * 7);
} while (0 != (b & 0x80));
return sz;
}
Get the uncompressed size of the stream
Returns: the uncompressed size
/**
* Get the uncompressed size of the stream
*
* @return the uncompressed size
*/
@Override
public int getSize() {
return size;
}
private enum State {
NO_BLOCK, IN_LITERAL, IN_BACK_REFERENCE
}
}