/*
 * LZ4 Library
 * Copyright (c) 2011-2016, Yann Collet
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.apache.lucene.util.compress;

import java.io.IOException;
import java.util.Arrays;

import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.FutureArrays;
import org.apache.lucene.util.FutureObjects;
import org.apache.lucene.util.packed.PackedInts;

LZ4 compression and decompression routines. https://github.com/lz4/lz4/tree/dev/lib http://fastcompression.blogspot.fr/p/lz4.html The high-compression option is a simpler version of the one of the original algorithm, and only retains a better hash table that remembers about more occurrences of a previous 4-bytes sequence, and removes all the logic about handling of the case when overlapping matches are found.
/** * LZ4 compression and decompression routines. * * https://github.com/lz4/lz4/tree/dev/lib * http://fastcompression.blogspot.fr/p/lz4.html * * The high-compression option is a simpler version of the one of the original * algorithm, and only retains a better hash table that remembers about more * occurrences of a previous 4-bytes sequence, and removes all the logic about * handling of the case when overlapping matches are found. */
public final class LZ4 { private LZ4() {} static final int MEMORY_USAGE = 14; static final int MIN_MATCH = 4; // minimum length of a match static final int MAX_DISTANCE = 1 << 16; // maximum distance of a reference static final int LAST_LITERALS = 5; // the last 5 bytes must be encoded as literals static final int HASH_LOG_HC = 15; // log size of the dictionary for compressHC static final int HASH_TABLE_SIZE_HC = 1 << HASH_LOG_HC; private static int hash(int i, int hashBits) { return (i * -1640531535) >>> (32 - hashBits); } private static int hashHC(int i) { return hash(i, HASH_LOG_HC); } private static int readInt(byte[] buf, int i) { return ((buf[i] & 0xFF) << 24) | ((buf[i+1] & 0xFF) << 16) | ((buf[i+2] & 0xFF) << 8) | (buf[i+3] & 0xFF); } private static int commonBytes(byte[] b, int o1, int o2, int limit) { assert o1 < o2; // never -1 because lengths always differ return FutureArrays.mismatch(b, o1, limit, b, o2, limit); }
Decompress at least decompressedLen bytes into dest[dOff:]. Please note that dest must be large enough to be able to hold all decompressed data (meaning that you need to know the total decompressed length). If the given bytes were compressed using a preset dictionary then the same dictionary must be provided in dest[dOff-dictLen:dOff].
/** * Decompress at least {@code decompressedLen} bytes into * {@code dest[dOff:]}. Please note that {@code dest} must be large * enough to be able to hold <b>all</b> decompressed data (meaning that you * need to know the total decompressed length). * If the given bytes were compressed using a preset dictionary then the same * dictionary must be provided in {@code dest[dOff-dictLen:dOff]}. */
public static int decompress(DataInput compressed, int decompressedLen, byte[] dest, int dOff) throws IOException { final int destEnd = dOff + decompressedLen; do { // literals final int token = compressed.readByte() & 0xFF; int literalLen = token >>> 4; if (literalLen != 0) { if (literalLen == 0x0F) { byte len; while ((len = compressed.readByte()) == (byte) 0xFF) { literalLen += 0xFF; } literalLen += len & 0xFF; } compressed.readBytes(dest, dOff, literalLen); dOff += literalLen; } if (dOff >= destEnd) { break; } // matchs final int matchDec = (compressed.readByte() & 0xFF) | ((compressed.readByte() & 0xFF) << 8); assert matchDec > 0; int matchLen = token & 0x0F; if (matchLen == 0x0F) { int len; while ((len = compressed.readByte()) == (byte) 0xFF) { matchLen += 0xFF; } matchLen += len & 0xFF; } matchLen += MIN_MATCH; // copying a multiple of 8 bytes can make decompression from 5% to 10% faster final int fastLen = (matchLen + 7) & 0xFFFFFFF8; if (matchDec < matchLen || dOff + fastLen > destEnd) { // overlap -> naive incremental copy for (int ref = dOff - matchDec, end = dOff + matchLen; dOff < end; ++ref, ++dOff) { dest[dOff] = dest[ref]; } } else { // no overlap -> arraycopy System.arraycopy(dest, dOff - matchDec, dest, dOff, fastLen); dOff += matchLen; } } while (dOff < destEnd); return dOff; } private static void encodeLen(int l, DataOutput out) throws IOException { while (l >= 0xFF) { out.writeByte((byte) 0xFF); l -= 0xFF; } out.writeByte((byte) l); } private static void encodeLiterals(byte[] bytes, int token, int anchor, int literalLen, DataOutput out) throws IOException { out.writeByte((byte) token); // encode literal length if (literalLen >= 0x0F) { encodeLen(literalLen - 0x0F, out); } // encode literals out.writeBytes(bytes, anchor, literalLen); } private static void encodeLastLiterals(byte[] bytes, int anchor, int literalLen, DataOutput out) throws IOException { final int token = Math.min(literalLen, 0x0F) << 4; encodeLiterals(bytes, token, anchor, literalLen, out); } private static void encodeSequence(byte[] bytes, int anchor, int matchRef, int matchOff, int matchLen, DataOutput out) throws IOException { final int literalLen = matchOff - anchor; assert matchLen >= 4; // encode token final int token = (Math.min(literalLen, 0x0F) << 4) | Math.min(matchLen - 4, 0x0F); encodeLiterals(bytes, token, anchor, literalLen, out); // encode match dec final int matchDec = matchOff - matchRef; assert matchDec > 0 && matchDec < 1 << 16; out.writeByte((byte) matchDec); out.writeByte((byte) (matchDec >>> 8)); // encode match len if (matchLen >= MIN_MATCH + 0x0F) { encodeLen(matchLen - 0x0F - MIN_MATCH, out); } }
A record of previous occurrences of sequences of 4 bytes.
/** * A record of previous occurrences of sequences of 4 bytes. */
static abstract class HashTable {
Reset this hash table in order to compress the given content.
/** Reset this hash table in order to compress the given content. */
abstract void reset(byte[] b, int off, int len);
Init dictLen bytes to be used as a dictionary.
/** Init {@code dictLen} bytes to be used as a dictionary. */
abstract void initDictionary(int dictLen);
Advance the cursor to {@off} and return an index that stored the same 4 bytes as b[o:o+4). This may only be called on strictly increasing sequences of offsets. A return value of -1 indicates that no other index could be found.
/** * Advance the cursor to {@off} and return an index that stored the same * 4 bytes as {@code b[o:o+4)}. This may only be called on strictly * increasing sequences of offsets. A return value of {@code -1} indicates * that no other index could be found. */
abstract int get(int off);
Return an index that less than off and stores the same 4 bytes. Unlike get, it doesn't need to be called on increasing offsets. A return value of -1 indicates that no other index could be found.
/** * Return an index that less than {@code off} and stores the same 4 * bytes. Unlike {@link #get}, it doesn't need to be called on increasing * offsets. A return value of {@code -1} indicates that no other index could * be found. */
abstract int previous(int off); // For testing abstract boolean assertReset(); }
Simple lossy HashTable that only stores the last ocurrence for each hash on 2^14 bytes of memory.
/** * Simple lossy {@link HashTable} that only stores the last ocurrence for * each hash on {@code 2^14} bytes of memory. */
public static final class FastCompressionHashTable extends HashTable { private byte[] bytes; private int base; private int lastOff; private int end; private int hashLog; private PackedInts.Mutable hashTable;
Sole constructor
/** Sole constructor */
public FastCompressionHashTable() {} @Override void reset(byte[] bytes, int off, int len) { FutureObjects.checkFromIndexSize(off, len, bytes.length); this.bytes = bytes; this.base = off; this.end = off + len; final int bitsPerOffset = PackedInts.bitsRequired(len - LAST_LITERALS); final int bitsPerOffsetLog = 32 - Integer.numberOfLeadingZeros(bitsPerOffset - 1); hashLog = MEMORY_USAGE + 3 - bitsPerOffsetLog; if (hashTable == null || hashTable.size() < 1 << hashLog || hashTable.getBitsPerValue() < bitsPerOffset) { hashTable = PackedInts.getMutable(1 << hashLog, bitsPerOffset, PackedInts.DEFAULT); } else { // Avoid calling hashTable.clear(), this makes it costly to compress many short sequences otherwise. // Instead, get() checks that references are less than the current offset. } this.lastOff = off - 1; } @Override void initDictionary(int dictLen) { for (int i = 0; i < dictLen; ++i) { final int v = readInt(bytes, base + i); final int h = hash(v, hashLog); hashTable.set(h, i); } lastOff += dictLen; } @Override int get(int off) { assert off > lastOff; assert off < end; final int v = readInt(bytes, off); final int h = hash(v, hashLog); final int ref = base + (int) hashTable.get(h); hashTable.set(h, off - base); lastOff = off; if (ref < off && off - ref < MAX_DISTANCE && readInt(bytes, ref) == v) { return ref; } else { return -1; } } @Override public int previous(int off) { return -1; } @Override boolean assertReset() { return true; } }
A higher-precision HashTable. It stores up to 256 occurrences of 4-bytes sequences in the last 2^16 bytes, which makes it much more likely to find matches than FastCompressionHashTable.
/** * A higher-precision {@link HashTable}. It stores up to 256 occurrences of * 4-bytes sequences in the last {@code 2^16} bytes, which makes it much more * likely to find matches than {@link FastCompressionHashTable}. */
public static final class HighCompressionHashTable extends HashTable { private static final int MAX_ATTEMPTS = 256; private static final int MASK = MAX_DISTANCE - 1; private byte[] bytes; private int base; private int next; private int end; private final int[] hashTable; private final short[] chainTable; private int attempts = 0;
Sole constructor
/** Sole constructor */
public HighCompressionHashTable() { hashTable = new int[HASH_TABLE_SIZE_HC]; Arrays.fill(hashTable, -1); chainTable = new short[MAX_DISTANCE]; Arrays.fill(chainTable, (short) 0xFFFF); } @Override void reset(byte[] bytes, int off, int len) { FutureObjects.checkFromIndexSize(off, len, bytes.length); if (end - base < chainTable.length) { // The last call to compress was done on less than 64kB, let's not reset // the hashTable and only reset the relevant parts of the chainTable. // This helps avoid slowing down calling compress() many times on short // inputs. int startOffset = base & MASK; int endOffset = end == 0 ? 0 : ((end - 1) & MASK) + 1; if (startOffset < endOffset) { Arrays.fill(chainTable, startOffset, endOffset, (short) 0xFFFF); } else { Arrays.fill(chainTable, 0, endOffset, (short) 0xFFFF); Arrays.fill(chainTable, startOffset, chainTable.length, (short) 0xFFFF); } } else { // The last call to compress was done on a large enough amount of data // that it's fine to reset both tables Arrays.fill(hashTable, -1); Arrays.fill(chainTable, (short) 0xFFFF); } this.bytes = bytes; this.base = off; this.next = off; this.end = off + len; } @Override void initDictionary(int dictLen) { assert next == base; for (int i = 0; i < dictLen; ++i) { addHash(base + i); } next += dictLen; } @Override int get(int off) { assert off >= next; assert off < end; for (; next < off; next++) { addHash(next); } final int v = readInt(bytes, off); final int h = hashHC(v); attempts = 0; int ref = hashTable[h]; if (ref >= off) { // remainder from a previous call to compress() return -1; } for (int min = Math.max(base, off - MAX_DISTANCE + 1); ref >= min && attempts < MAX_ATTEMPTS; ref -= chainTable[ref & MASK] & 0xFFFF, attempts++) { if (readInt(bytes, ref) == v) { return ref; } } return -1; } private void addHash(int off) { final int v = readInt(bytes, off); final int h = hashHC(v); int delta = off - hashTable[h]; if (delta <= 0 || delta >= MAX_DISTANCE) { delta = MAX_DISTANCE - 1; } chainTable[off & MASK] = (short) delta; hashTable[h] = off; } @Override int previous(int off) { final int v = readInt(bytes, off); for (int ref = off - (chainTable[off & MASK] & 0xFFFF); ref >= base && attempts < MAX_ATTEMPTS; ref -= chainTable[ref & MASK] & 0xFFFF, attempts++ ) { if (readInt(bytes, ref) == v) { return ref; } } return -1; } @Override boolean assertReset() { for (int i = 0; i < chainTable.length; ++i) { assert chainTable[i] == (short) 0xFFFF : i; } return true; } }
Compress bytes[off:off+len] into out using at most 16kB of memory. ht shouldn't be shared across threads but can safely be reused.
/** * Compress {@code bytes[off:off+len]} into {@code out} using at most 16kB of * memory. {@code ht} shouldn't be shared across threads but can safely be * reused. */
public static void compress(byte[] bytes, int off, int len, DataOutput out, HashTable ht) throws IOException { compressWithDictionary(bytes, off, 0, len, out, ht); }
Compress bytes[dictOff+dictLen:dictOff+dictLen+len] into out using at most 16kB of memory. bytes[dictOff:dictOff+dictLen] will be used as a dictionary. dictLen must not be greater than 64kB, the maximum window size. ht shouldn't be shared across threads but can safely be reused.
/** * Compress {@code bytes[dictOff+dictLen:dictOff+dictLen+len]} into * {@code out} using at most 16kB of memory. * {@code bytes[dictOff:dictOff+dictLen]} will be used as a dictionary. * {@code dictLen} must not be greater than 64kB, the maximum window size. * * {@code ht} shouldn't be shared across threads but can safely be reused. */
public static void compressWithDictionary(byte[] bytes, int dictOff, int dictLen, int len, DataOutput out, HashTable ht) throws IOException { FutureObjects.checkFromIndexSize(dictOff, dictLen, bytes.length); FutureObjects.checkFromIndexSize(dictOff + dictLen, len, bytes.length); if (dictLen > MAX_DISTANCE) { throw new IllegalArgumentException("dictLen must not be greater than 64kB, but got " + dictLen); } final int end = dictOff + dictLen + len; int off = dictOff + dictLen; int anchor = off; if (len > LAST_LITERALS + MIN_MATCH) { final int limit = end - LAST_LITERALS; final int matchLimit = limit - MIN_MATCH; ht.reset(bytes, dictOff, dictLen + len); ht.initDictionary(dictLen); main: while (off <= limit) { // find a match int ref; while (true) { if (off >= matchLimit) { break main; } ref = ht.get(off); if (ref != -1) { assert ref >= dictOff && ref < off; assert readInt(bytes, ref) == readInt(bytes, off); break; } ++off; } // compute match length int matchLen = MIN_MATCH + commonBytes(bytes, ref + MIN_MATCH, off + MIN_MATCH, limit); // try to find a better match for (int r = ht.previous(ref), min = Math.max(off - MAX_DISTANCE + 1, dictOff); r >= min; r = ht.previous(r)) { assert readInt(bytes, r) == readInt(bytes, off); int rMatchLen = MIN_MATCH + commonBytes(bytes, r + MIN_MATCH, off + MIN_MATCH, limit); if (rMatchLen > matchLen) { ref = r; matchLen = rMatchLen; } } encodeSequence(bytes, anchor, ref, off, matchLen, out); off += matchLen; anchor = off; } } // last literals final int literalLen = end - anchor; assert literalLen >= LAST_LITERALS || literalLen == len; encodeLastLiterals(bytes, anchor, end - anchor, out); } }