/*
 * Copyright 2004-2019 H2 Group. Multiple-Licensed under the MPL 2.0,
 * and the EPL 1.0 (http://h2database.com/html/license.html).
 *
 * This code is based on the LZF algorithm from Marc Lehmann. It is a
 * re-implementation of the C code:
 * http://cvs.schmorp.de/liblzf/lzf_c.c?view=markup
 * http://cvs.schmorp.de/liblzf/lzf_d.c?view=markup
 *
 * According to a mail from Marc Lehmann, it's OK to use his algorithm:
 * Date: 2010-07-15 15:57
 * Subject: Re: Question about LZF licensing
 * ...
 * The algorithm is not copyrighted (and cannot be copyrighted afaik) - as long
 * as you wrote everything yourself, without copying my code, that's just fine
 * (looking is of course fine too).
 * ...
 *
 * Still I would like to keep his copyright info:
 *
 * Copyright (c) 2000-2005 Marc Alexander Lehmann <schmorp@schmorp.de>
 * Copyright (c) 2005 Oren J. Maurice <oymaurice@hazorea.org.il>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *   1.  Redistributions of source code must retain the above copyright notice,
 *       this list of conditions and the following disclaimer.
 *
 *   2.  Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *
 *   3.  The name of the author may not be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO
 * EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.h2.compress;

import java.nio.ByteBuffer;

/**
 * <p>
 * This class implements the LZF lossless data compression algorithm. LZF is a
 * Lempel-Ziv variant with byte-aligned output, and optimized for speed.
 * </p>
 * <p>
 * Safety/Use Notes:
 * </p>
 * <ul>
 * <li>Each instance should be used by a single thread only.</li>
 * <li>The data buffers should be smaller than 1 GB.</li>
 * <li>For performance reasons, safety checks on expansion are omitted.</li>
 * <li>Invalid compressed data can cause an ArrayIndexOutOfBoundsException.</li>
 * </ul>
 * <p>
 * The LZF compressed format knows literal runs and back-references:
 * </p>
 * <ul>
 * <li>Literal run: directly copy bytes from input to output.</li>
 * <li>Back-reference: copy previous data to output stream, with specified
 * offset from location and length. The length is at least 3 bytes.</li>
 * </ul>
 * <p>
 * The first byte of the compressed stream is the control byte. For literal
 * runs, the highest three bits of the control byte are not set, the lower
 * bits are the literal run length, and the next bytes are data to copy
 * directly into the output. For back-references, the highest three bits of
 * the control byte are the back-reference length. If all three bits are set,
 * then the back-reference length is stored in the next byte. The lower bits
 * of the control byte combined with the next byte form the offset for the
 * back-reference.
 * </p>
 */
public final class CompressLZF implements Compressor {
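
    // Example of the format: the compressed stream
    // {0x02, 'a', 'b', 'c', 0x20, 0x02, 0x00, 'd'} expands to "abcabcd".
    // The control byte 0x02 has its three highest bits clear, so it starts a
    // literal run of 0x02 + 1 = 3 bytes ('a', 'b', 'c'). The control byte
    // 0x20 (binary 001 00000) is a back-reference: its three highest bits
    // give a stored length of 1, so the match length is 1 + 2 = 3, and its
    // five lowest bits combined with the next byte (0x02) give a stored
    // offset of 2, so the copy starts 2 + 1 = 3 bytes back in the output
    // ("abc" again). The final control byte 0x00 starts a literal run of one
    // byte ('d'). (This stream is only an illustration; the compressor may
    // encode the same data differently.)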

    /**
     * The number of entries in the hash table. The size is a trade-off
     * between hash collisions (reduced compression) and speed (amount that
     * fits in CPU cache).
     */
    private static final int HASH_SIZE = 1 << 14;

    /**
     * The maximum number of literals in a chunk (32).
     */
    private static final int MAX_LITERAL = 1 << 5;

    /**
     * The maximum offset allowed for a back-reference (8192).
     */
    private static final int MAX_OFF = 1 << 13;
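    // The offset is stored as (distance - 1) in the 5 low bits of the control
    // byte plus one extra byte, i.e. 13 bits, which gives the 1 << 13 limit.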

    /**
     * The maximum back-reference length (264).
     */
    private static final int MAX_REF = (1 << 8) + (1 << 3);
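    // The stored length is (match length - 2); values below 7 fit into the 3
    // high bits of the control byte, longer matches store 7 there and put the
    // remainder into an extra byte (at most 255), so the maximum match length
    // is 2 + 7 + 255 = 264.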

    /**
     * Hash table for matching byte sequences (reused for performance).
     */
    private int[] cachedHashTable;

    @Override
    public void setOptions(String options) {
        // nothing to do
    }

    /**
     * Return the integer with the first two bytes 0, then the bytes at the
     * index, then at index+1.
     */
    private static int first(byte[] in, int inPos) {
        return (in[inPos] << 8) | (in[inPos + 1] & 255);
    }

    /**
     * Return the integer with the first two bytes 0, then the bytes at the
     * index, then at index+1.
     */
    private static int first(ByteBuffer in, int inPos) {
        return (in.get(inPos) << 8) | (in.get(inPos + 1) & 255);
    }

    /**
     * Shift the value 1 byte left, and add the byte at index inPos+2.
     */
    private static int next(int v, byte[] in, int inPos) {
        return (v << 8) | (in[inPos + 2] & 255);
    }

    /**
     * Shift the value 1 byte left, and add the byte at index inPos+2.
     */
    private static int next(int v, ByteBuffer in, int inPos) {
        return (v << 8) | (in.get(inPos + 2) & 255);
    }

    /**
     * Compute the address in the hash table.
     */
    private static int hash(int h) {
        return ((h * 2777) >> 9) & (HASH_SIZE - 1);
    }

    @Override
    public int compress(byte[] in, int inLen, byte[] out, int outPos) {
        int inPos = 0;
        if (cachedHashTable == null) {
            cachedHashTable = new int[HASH_SIZE];
        }
        int[] hashTab = cachedHashTable;
        int literals = 0;
        outPos++;
        int future = first(in, 0);
        while (inPos < inLen - 4) {
            byte p2 = in[inPos + 2];
            // next
            future = (future << 8) + (p2 & 255);
            int off = hash(future);
            int ref = hashTab[off];
            hashTab[off] = inPos;
            // if (ref < inPos
            //         && ref > 0
            //         && (off = inPos - ref - 1) < MAX_OFF
            //         && in[ref + 2] == p2
            //         && (((in[ref] & 255) << 8) | (in[ref + 1] & 255)) ==
            //             ((future >> 8) & 0xffff)) {
            if (ref < inPos
                    && ref > 0
                    && (off = inPos - ref - 1) < MAX_OFF
                    && in[ref + 2] == p2
                    && in[ref + 1] == (byte) (future >> 8)
                    && in[ref] == (byte) (future >> 16)) {
                // match
                int maxLen = inLen - inPos - 2;
                if (maxLen > MAX_REF) {
                    maxLen = MAX_REF;
                }
                if (literals == 0) {
                    // multiple back-references,
                    // so there is no literal run control byte
                    outPos--;
                } else {
                    // set the control byte at the start of the literal run
                    // to store the number of literals
                    out[outPos - literals - 1] = (byte) (literals - 1);
                    literals = 0;
                }
                int len = 3;
                while (len < maxLen && in[ref + len] == in[inPos + len]) {
                    len++;
                }
                len -= 2;
                if (len < 7) {
                    out[outPos++] = (byte) ((off >> 8) + (len << 5));
                } else {
                    out[outPos++] = (byte) ((off >> 8) + (7 << 5));
                    out[outPos++] = (byte) (len - 7);
                }
                out[outPos++] = (byte) off;
                // move one byte forward to allow for a literal run control byte
                outPos++;
                inPos += len;
                // rebuild the future, and store the last bytes to the
                // hashtable. Storing hashes of the last bytes in back-reference
                // improves the compression ratio and only reduces speed
                // slightly.
                future = first(in, inPos);
                future = next(future, in, inPos);
                hashTab[hash(future)] = inPos++;
                future = next(future, in, inPos);
                hashTab[hash(future)] = inPos++;
            } else {
                // copy one byte from input to output as part of literal
                out[outPos++] = in[inPos++];
                literals++;
                // at the end of this literal chunk, write the length
                // to the control byte and start a new chunk
                if (literals == MAX_LITERAL) {
                    out[outPos - literals - 1] = (byte) (literals - 1);
                    literals = 0;
                    // move ahead one byte to allow for the
                    // literal run control byte
                    outPos++;
                }
            }
        }
        // write the remaining few bytes as literals
        while (inPos < inLen) {
            out[outPos++] = in[inPos++];
            literals++;
            if (literals == MAX_LITERAL) {
                out[outPos - literals - 1] = (byte) (literals - 1);
                literals = 0;
                outPos++;
            }
        }
        // writes the final literal run length to the control byte
        out[outPos - literals - 1] = (byte) (literals - 1);
        if (literals == 0) {
            outPos--;
        }
        return outPos;
    }

    /**
     * Compress a number of bytes.
     *
     * @param in the input data
     * @param inPos the offset at the input buffer
     * @param out the output area
     * @param outPos the offset at the output array
     * @return the end position
     */
    public int compress(ByteBuffer in, int inPos, byte[] out, int outPos) {
        int inLen = in.capacity() - inPos;
        if (cachedHashTable == null) {
            cachedHashTable = new int[HASH_SIZE];
        }
        int[] hashTab = cachedHashTable;
        int literals = 0;
        outPos++;
        int future = first(in, 0);
        while (inPos < inLen - 4) {
            byte p2 = in.get(inPos + 2);
            // next
            future = (future << 8) + (p2 & 255);
            int off = hash(future);
            int ref = hashTab[off];
            hashTab[off] = inPos;
            // if (ref < inPos
            //         && ref > 0
            //         && (off = inPos - ref - 1) < MAX_OFF
            //         && in[ref + 2] == p2
            //         && (((in[ref] & 255) << 8) | (in[ref + 1] & 255)) ==
            //             ((future >> 8) & 0xffff)) {
            if (ref < inPos
                    && ref > 0
                    && (off = inPos - ref - 1) < MAX_OFF
                    && in.get(ref + 2) == p2
                    && in.get(ref + 1) == (byte) (future >> 8)
                    && in.get(ref) == (byte) (future >> 16)) {
                // match
                int maxLen = inLen - inPos - 2;
                if (maxLen > MAX_REF) {
                    maxLen = MAX_REF;
                }
                if (literals == 0) {
                    // multiple back-references,
                    // so there is no literal run control byte
                    outPos--;
                } else {
                    // set the control byte at the start of the literal run
                    // to store the number of literals
                    out[outPos - literals - 1] = (byte) (literals - 1);
                    literals = 0;
                }
                int len = 3;
                while (len < maxLen && in.get(ref + len) == in.get(inPos + len)) {
                    len++;
                }
                len -= 2;
                if (len < 7) {
                    out[outPos++] = (byte) ((off >> 8) + (len << 5));
                } else {
                    out[outPos++] = (byte) ((off >> 8) + (7 << 5));
                    out[outPos++] = (byte) (len - 7);
                }
                out[outPos++] = (byte) off;
                // move one byte forward to allow for a literal run control byte
                outPos++;
                inPos += len;
                // rebuild the future, and store the last bytes to the
                // hashtable. Storing hashes of the last bytes in back-reference
                // improves the compression ratio and only reduces speed
                // slightly.
                future = first(in, inPos);
                future = next(future, in, inPos);
                hashTab[hash(future)] = inPos++;
                future = next(future, in, inPos);
                hashTab[hash(future)] = inPos++;
            } else {
                // copy one byte from input to output as part of literal
                out[outPos++] = in.get(inPos++);
                literals++;
                // at the end of this literal chunk, write the length
                // to the control byte and start a new chunk
                if (literals == MAX_LITERAL) {
                    out[outPos - literals - 1] = (byte) (literals - 1);
                    literals = 0;
                    // move ahead one byte to allow for the
                    // literal run control byte
                    outPos++;
                }
            }
        }
        // write the remaining few bytes as literals
        while (inPos < inLen) {
            out[outPos++] = in.get(inPos++);
            literals++;
            if (literals == MAX_LITERAL) {
                out[outPos - literals - 1] = (byte) (literals - 1);
                literals = 0;
                outPos++;
            }
        }
        // writes the final literal run length to the control byte
        out[outPos - literals - 1] = (byte) (literals - 1);
        if (literals == 0) {
            outPos--;
        }
        return outPos;
    }

    @Override
    public void expand(byte[] in, int inPos, int inLen, byte[] out, int outPos,
            int outLen) {
        // if ((inPos | outPos | outLen) < 0) {
        if (inPos < 0 || outPos < 0 || outLen < 0) {
            throw new IllegalArgumentException();
        }
        do {
            int ctrl = in[inPos++] & 255;
            if (ctrl < MAX_LITERAL) {
                // literal run of length = ctrl + 1,
                ctrl++;
                // copy to output and move forward this many bytes
                // while (ctrl-- > 0) {
                //     out[outPos++] = in[inPos++];
                // }
                System.arraycopy(in, inPos, out, outPos, ctrl);
                outPos += ctrl;
                inPos += ctrl;
            } else {
                // back reference
                // the highest 3 bits are the match length
                int len = ctrl >> 5;
                // if the length is maxed, add the next byte to the length
                if (len == 7) {
                    len += in[inPos++] & 255;
                }
                // minimum back-reference is 3 bytes,
                // so 2 was subtracted before storing size
                len += 2;
                // ctrl is now the offset for a back-reference...
                // the logical AND operation removes the length bits
                ctrl = -((ctrl & 0x1f) << 8) - 1;
                // the next byte augments/increases the offset
                ctrl -= in[inPos++] & 255;
                // copy the back-reference bytes from the given
                // location in output to current position
                ctrl += outPos;
                if (outPos + len >= out.length) {
                    // reduce array bounds checking
                    throw new ArrayIndexOutOfBoundsException();
                }
                for (int i = 0; i < len; i++) {
                    out[outPos++] = out[ctrl++];
                }
            }
        } while (outPos < outLen);
    }
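
    // Usage sketch (illustrative, not part of the original class; the
    // compressed buffer size below is a generous assumption, not a computed
    // worst case):
    //
    //   CompressLZF lzf = new CompressLZF();
    //   byte[] data = "hello hello hello"
    //           .getBytes(java.nio.charset.StandardCharsets.UTF_8);
    //   byte[] compressed = new byte[data.length * 2 + 16];
    //   int compressedLen = lzf.compress(data, data.length, compressed, 0);
    //   byte[] restored = new byte[data.length];
    //   lzf.expand(compressed, 0, compressedLen, restored, 0, data.length);
    //   // restored now holds the same bytes as data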

    /**
     * Expand a number of compressed bytes.
     *
     * @param in the compressed data
     * @param out the output area
     */
    public static void expand(ByteBuffer in, ByteBuffer out) {
        do {
            int ctrl = in.get() & 255;
            if (ctrl < MAX_LITERAL) {
                // literal run of length = ctrl + 1,
                ctrl++;
                // copy to output and move forward this many bytes
                // (maybe slice would be faster)
                for (int i = 0; i < ctrl; i++) {
                    out.put(in.get());
                }
            } else {
                // back reference
                // the highest 3 bits are the match length
                int len = ctrl >> 5;
                // if the length is maxed, add the next byte to the length
                if (len == 7) {
                    len += in.get() & 255;
                }
                // minimum back-reference is 3 bytes,
                // so 2 was subtracted before storing size
                len += 2;
                // ctrl is now the offset for a back-reference...
                // the logical AND operation removes the length bits
                ctrl = -((ctrl & 0x1f) << 8) - 1;
                // the next byte augments/increases the offset
                ctrl -= in.get() & 255;
                // copy the back-reference bytes from the given
                // location in output to current position
                // (maybe slice would be faster)
                ctrl += out.position();
                for (int i = 0; i < len; i++) {
                    out.put(out.get(ctrl++));
                }
            }
        } while (out.position() < out.capacity());
    }

    @Override
    public int getAlgorithm() {
        return Compressor.LZF;
    }

}