/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.compress;
import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.BytesRef;
Utility class that can efficiently compress arrays that mostly contain
characters in the [0x1F,0x3F) or [0x5F,0x7F) ranges, which notably
include all digits, lowercase characters, '.', '-' and '_'.
/**
* Utility class that can efficiently compress arrays that mostly contain
* characters in the [0x1F,0x3F) or [0x5F,0x7F) ranges, which notably
* include all digits, lowercase characters, '.', '-' and '_'.
*/
public final class LowercaseAsciiCompression {
private static final boolean isCompressible(int b) {
final int high3Bits = (b + 1) & ~0x1F;
return high3Bits == 0x20 || high3Bits == 0x60;
}
private LowercaseAsciiCompression() {}
Compress in[0:len]
into out
. This returns false
if the content cannot be compressed. The number of bytes written is guaranteed to be less than len
otherwise. /**
* Compress {@code in[0:len]} into {@code out}.
* This returns {@code false} if the content cannot be compressed. The number
* of bytes written is guaranteed to be less than {@code len} otherwise.
*/
public static boolean compress(byte[] in, int len, byte[] tmp, DataOutput out) throws IOException {
if (len < 8) {
return false;
}
// 1. Count exceptions and fail compression if there are too many of them.
final int maxExceptions = len >>> 5;
int previousExceptionIndex = 0;
int numExceptions = 0;
for (int i = 0; i < len; ++i) {
final int b = in[i] & 0xFF;
if (isCompressible(b) == false) {
while (i - previousExceptionIndex > 0xFF) {
++numExceptions;
previousExceptionIndex += 0xFF;
}
if (++numExceptions > maxExceptions) {
return false;
}
previousExceptionIndex = i;
}
}
assert numExceptions <= maxExceptions;
// 2. Now move all bytes to the [0,0x40) range (6 bits). This loop gets auto-vectorized on JDK13+.
final int compressedLen = len - (len >>> 2); // ignores exceptions
assert compressedLen < len;
for (int i = 0; i < len; ++i) {
int b = (in[i] & 0xFF) + 1;
tmp[i] = (byte) ((b & 0x1F) | ((b & 0x40) >>> 1));
}
// 3. Now pack the bytes so that we record 4 ASCII chars in 3 bytes
int o = 0;
for (int i = compressedLen; i < len; ++i) {
tmp[o++] |= (tmp[i] & 0x30) << 2; // bits 4-5
}
for (int i = compressedLen; i < len; ++i) {
tmp[o++] |= (tmp[i] & 0x0C) << 4; // bits 2-3
}
for (int i = compressedLen; i < len; ++i) {
tmp[o++] |= (tmp[i] & 0x03) << 6; // bits 0-1
}
assert o <= compressedLen;
out.writeBytes(tmp, 0, compressedLen);
// 4. Finally record exceptions
out.writeVInt(numExceptions);
if (numExceptions > 0) {
previousExceptionIndex = 0;
int numExceptions2 = 0;
for (int i = 0; i < len; ++i) {
int b = in[i] & 0xFF;
if (isCompressible(b) == false) {
while (i - previousExceptionIndex > 0xFF) {
// We record deltas between exceptions as bytes, so we need to create
// "artificial" exceptions if the delta between two of them is greater
// than the maximum unsigned byte value.
out.writeByte((byte) 0xFF);
previousExceptionIndex += 0xFF;
out.writeByte(in[previousExceptionIndex]);
numExceptions2++;
}
out.writeByte((byte) (i - previousExceptionIndex));
previousExceptionIndex = i;
out.writeByte((byte) b);
numExceptions2++;
}
}
if (numExceptions != numExceptions2) {
throw new IllegalStateException("" + numExceptions + " <> " + numExceptions2 + " " + new BytesRef(in, 0, len).utf8ToString());
}
}
return true;
}
Decompress data that has been compressed with compress(byte[], int, byte[], DataOutput)
. len
must be the original length, not the compressed length. /**
* Decompress data that has been compressed with {@link #compress(byte[], int, byte[], DataOutput)}.
* {@code len} must be the original length, not the compressed length.
*/
public static void decompress(DataInput in, byte[] out, int len) throws IOException {
final int saved = len >>> 2;
int compressedLen = len - saved;
// 1. Copy the packed bytes
in.readBytes(out, 0, compressedLen);
// 2. Restore the leading 2 bits of each packed byte into whole bytes
for (int i = 0; i < saved; ++i) {
out[compressedLen + i] = (byte) (((out[i] & 0xC0) >>> 2) | ((out[saved + i] & 0xC0) >>> 4) | ((out[(saved<<1) + i] & 0xC0) >>> 6));
}
// 3. Move back to the original range. This loop gets auto-vectorized on JDK13+.
for (int i = 0; i < len; ++i) {
final byte b = out[i];
out[i] = (byte) (((b & 0x1F) | 0x20 | ((b & 0x20) << 1)) - 1);
}
// 4. Restore exceptions
final int numExceptions = in.readVInt();
int i = 0;
for (int exception = 0; exception < numExceptions; ++exception) {
i += in.readByte() & 0xFF;
out[i] = in.readByte();
}
}
}