/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.commons.compress.compressors.gzip;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.EOFException;
import java.io.InputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.BufferedInputStream;
import java.util.zip.DataFormatException;
import java.util.zip.Deflater;
import java.util.zip.Inflater;
import java.util.zip.CRC32;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.utils.ByteUtils;
import org.apache.commons.compress.utils.CharsetNames;
import org.apache.commons.compress.utils.CountingInputStream;
import org.apache.commons.compress.utils.IOUtils;
import org.apache.commons.compress.utils.InputStreamStatistics;

Input stream that decompresses .gz files.

This supports decompressing concatenated .gz files which is important when decompressing standalone .gz files.

GZIPInputStream doesn't decompress concatenated .gz files: it stops after the first member and silently ignores the rest. It doesn't leave the read position pointing to the beginning of the next member, which makes it difficult to work around the lack of concatenation support.

Instead of using GZIPInputStream, this class has its own .gz container format decoder. The actual decompression is done with Inflater.

If you use the constructor GzipCompressorInputStream(in) or GzipCompressorInputStream(in, false) with some InputStream in then read will return -1 as soon as the first internal member has been read completely. The stream in will be positioned at the start of the second gzip member if there is one.

If you use the constructor GzipCompressorInputStream(in, true) with some InputStream in then read will return -1 once the stream in has been exhausted. The data read from a stream constructed this way will consist of the concatenated data of all gzip members contained inside in.

See Also:
  • https://tools.ietf.org/html/rfc1952
/**
 * Input stream that decompresses .gz files.
 *
 * <p>This supports decompressing concatenated .gz files which is important
 * when decompressing standalone .gz files.</p>
 *
 * <p>
 * {@link java.util.zip.GZIPInputStream} doesn't decompress concatenated .gz
 * files: it stops after the first member and silently ignores the rest.
 * It doesn't leave the read position pointing to the beginning of the next
 * member, which makes it difficult to work around the lack of concatenation
 * support.
 * </p>
 *
 * <p>
 * Instead of using <code>GZIPInputStream</code>, this class has its own .gz
 * container format decoder. The actual decompression is done with
 * {@link java.util.zip.Inflater}.
 * </p>
 *
 * <p>If you use the constructor {@code GzipCompressorInputStream(in)}
 * or {@code GzipCompressorInputStream(in, false)} with some {@code
 * InputStream} {@code in} then {@link #read} will return -1 as soon
 * as the first internal member has been read completely. The stream
 * {@code in} will be positioned at the start of the second gzip
 * member if there is one.</p>
 *
 * <p>If you use the constructor {@code GzipCompressorInputStream(in,
 * true)} with some {@code InputStream} {@code in} then {@link #read}
 * will return -1 once the stream {@code in} has been exhausted. The
 * data read from a stream constructed this way will consist of the
 * concatenated data of all gzip members contained inside {@code
 * in}.</p>
 *
 * @see "https://tools.ietf.org/html/rfc1952"
 */
public class GzipCompressorInputStream extends CompressorInputStream
        implements InputStreamStatistics {

    // Bit masks for the FLG byte of the .gz member header.
    // private static final int FTEXT = 0x01; // Uninteresting for us
    private static final int FHCRC = 0x02;
    private static final int FEXTRA = 0x04;
    private static final int FNAME = 0x08;
    private static final int FCOMMENT = 0x10;
    private static final int FRESERVED = 0xE0;

    /** Wrapper around the caller's stream; its byte count backs getCompressedCount(). */
    private final CountingInputStream countingStream;

    /**
     * The compressed input: either countingStream itself or a
     * BufferedInputStream around it, so it always goes through countingStream.
     */
    private final InputStream in;

    /** Whether members following the first one are decompressed as well. */
    private final boolean decompressConcatenated;

    /** Holds compressed bytes read from {@link #in} before they are inflated. */
    private final byte[] buf = new byte[8192];

    /** Number of valid bytes currently in {@link #buf}. */
    private int bufUsed;

    /** Raw-deflate decompressor (created with nowrap = true). */
    private Inflater inf = new Inflater(true);

    /** Running CRC32 over the uncompressed data. */
    private final CRC32 crc = new CRC32();

    /** Set once everything has been decompressed. */
    private boolean endReached = false;

    /** Scratch buffer used by the no-arg read method. */
    private final byte[] oneByte = new byte[1];

    /** Meta data parsed from the member header. */
    private final GzipParameters parameters = new GzipParameters();
Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.

This is equivalent to GzipCompressorInputStream(inputStream, false) and thus will not decompress concatenated .gz files.

Params:
  • inputStream – the InputStream from which this object should be created
Throws:
  • IOException – if the stream could not be created
/**
 * Creates an input stream that decompresses gzip-compressed data read
 * from the given input stream.
 *
 * <p>This simply delegates to
 * {@code GzipCompressorInputStream(inputStream, false)}, so concatenated
 * .gz files are not decompressed past their first member.</p>
 *
 * @param inputStream the InputStream to read the compressed data from
 *
 * @throws IOException if the stream could not be created
 */
public GzipCompressorInputStream(final InputStream inputStream)
        throws IOException {
    this(inputStream, false);
}
Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.

If decompressConcatenated is false: This decompressor might read more input than it will actually use. If inputStream supports mark and reset, then the input position will be adjusted so that it is right after the last byte of the compressed stream. If mark isn't supported, the input position will be undefined.

Params:
  • inputStream – the InputStream from which this object should be created
  • decompressConcatenated – if true, decompress until the end of the input; if false, stop after the first .gz member
Throws:
  • IOException – if the stream could not be created
/** * Constructs a new input stream that decompresses gzip-compressed data * from the specified input stream. * <p> * If <code>decompressConcatenated</code> is {@code false}: * This decompressor might read more input than it will actually use. * If <code>inputStream</code> supports <code>mark</code> and * <code>reset</code>, then the input position will be adjusted * so that it is right after the last byte of the compressed stream. * If <code>mark</code> isn't supported, the input position will be * undefined. * * @param inputStream the InputStream from which this object should * be created of * @param decompressConcatenated * if true, decompress until the end of the input; * if false, stop after the first .gz member * * @throws IOException if the stream could not be created */
public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException { countingStream = new CountingInputStream(inputStream); // Mark support is strictly needed for concatenated files only, // but it's simpler if it is always available. if (countingStream.markSupported()) { in = countingStream; } else { in = new BufferedInputStream(countingStream); } this.decompressConcatenated = decompressConcatenated; init(true); }
Provides the stream's meta data - may change with each stream when decompressing concatenated streams.
Returns: the stream's meta data
Since: 1.8
/**
 * Returns the meta data of the stream. The values may change with each
 * member when decompressing concatenated streams.
 *
 * @return the stream's meta data
 * @since 1.8
 */
public GzipParameters getMetaData() {
    return parameters;
}

/**
 * Parses the header of a .gz member and prepares the inflater and the
 * CRC for its payload.
 *
 * @param isFirstMember whether this is the first member of the input
 * @return {@code false} if the input ended cleanly before a further
 *         member started, {@code true} once a header has been parsed
 * @throws IOException if the magic bytes are wrong, the compression
 *         method is not deflate, reserved flag bits are set, or the
 *         input ends inside the header
 */
private boolean init(final boolean isFirstMember) throws IOException {
    assert isFirstMember || decompressConcatenated;

    // Plain read() calls so that hitting the end of input here yields -1
    // instead of an EOFException.
    final int magic0 = in.read();
    final int magic1 = in.read();

    // End of input right after a complete member is a regular end of file.
    if (magic0 == -1 && !isFirstMember) {
        return false;
    }

    final boolean magicMatches = magic0 == 31 && magic1 == 139;
    if (!magicMatches) {
        if (isFirstMember) {
            throw new IOException("Input is not in the .gz format");
        }
        throw new IOException("Garbage after a valid .gz stream");
    }

    // From here on a truncated header surfaces as EOFException.
    final DataInput header = new DataInputStream(in);

    final int method = header.readUnsignedByte();
    if (method != Deflater.DEFLATED) {
        throw new IOException("Unsupported compression method "
                              + method + " in the .gz header");
    }

    final int flg = header.readUnsignedByte();
    if ((flg & FRESERVED) != 0) {
        throw new IOException("Reserved flags are set in the .gz header");
    }

    // The header stores seconds; the parameters take milliseconds.
    parameters.setModificationTime(ByteUtils.fromLittleEndian(header, 4) * 1000);

    // Extra flags; any value other than 2 and 4 is ignored for now.
    final int extraFlags = header.readUnsignedByte();
    if (extraFlags == 2) {
        parameters.setCompressionLevel(Deflater.BEST_COMPRESSION);
    } else if (extraFlags == 4) {
        parameters.setCompressionLevel(Deflater.BEST_SPEED);
    }

    parameters.setOperatingSystem(header.readUnsignedByte());

    // Extra field: a little-endian 16 bit length followed by that many
    // bytes, which are skipped.
    if ((flg & FEXTRA) != 0) {
        final int lengthLow = header.readUnsignedByte();
        final int lengthHigh = header.readUnsignedByte();
        final int extraLength = lengthLow | (lengthHigh << 8);

        // Reading byte by byte is slower than in.skip would be, but an
        // unexpected end of input is reported without extra handling.
        // Most files don't have an extra field anyway.
        for (int skipped = 0; skipped < extraLength; skipped++) {
            header.readUnsignedByte();
        }
    }

    // Original file name
    if ((flg & FNAME) != 0) {
        final byte[] rawName = readToNull(header);
        parameters.setFilename(new String(rawName, CharsetNames.ISO_8859_1));
    }

    // Comment
    if ((flg & FCOMMENT) != 0) {
        final byte[] rawComment = readToNull(header);
        parameters.setComment(new String(rawComment, CharsetNames.ISO_8859_1));
    }

    // Header "CRC16" which is actually a truncated CRC32 (which isn't
    // as good as real CRC16). It is read and discarded, not verified.
    if ((flg & FHCRC) != 0) {
        header.readShort();
    }

    // Fresh decompressor and checksum state for this member's payload.
    inf.reset();
    crc.reset();

    return true;
}

/**
 * Reads bytes up to, but not including, the next zero byte. The zero
 * byte itself is consumed.
 *
 * @param inData the input to read from
 * @return the bytes that preceded the terminating zero byte
 * @throws IOException if reading fails or the input ends before a zero
 *         byte is seen
 */
private static byte[] readToNull(final DataInput inData) throws IOException {
    final ByteArrayOutputStream collected = new ByteArrayOutputStream();
    for (int next = inData.readUnsignedByte(); next != 0x00;
         next = inData.readUnsignedByte()) {
        collected.write(next);
    }
    return collected.toByteArray();
}

@Override
public int read() throws IOException {
    final int bytesRead = read(oneByte, 0, 1);
    if (bytesRead == -1) {
        return -1;
    }
    return oneByte[0] & 0xFF;
}
{@inheritDoc}
Since:1.1
/** * {@inheritDoc} * * @since 1.1 */
@Override public int read(final byte[] b, int off, int len) throws IOException { if (endReached) { return -1; } int size = 0; while (len > 0) { if (inf.needsInput()) { // Remember the current position because we may need to // rewind after reading too much input. in.mark(buf.length); bufUsed = in.read(buf); if (bufUsed == -1) { throw new EOFException(); } inf.setInput(buf, 0, bufUsed); } int ret; try { ret = inf.inflate(b, off, len); } catch (final DataFormatException e) { throw new IOException("Gzip-compressed data is corrupt"); } crc.update(b, off, ret); off += ret; len -= ret; size += ret; count(ret); if (inf.finished()) { // We may have read too many bytes. Rewind the read // position to match the actual amount used. // // NOTE: The "if" is there just in case. Since we used // in.mark earlier, it should always skip enough. in.reset(); final int skipAmount = bufUsed - inf.getRemaining(); if (IOUtils.skip(in, skipAmount) != skipAmount) { throw new IOException(); } bufUsed = 0; final DataInput inData = new DataInputStream(in); // CRC32 final long crcStored = ByteUtils.fromLittleEndian(inData, 4); if (crcStored != crc.getValue()) { throw new IOException("Gzip-compressed data is corrupt " + "(CRC32 error)"); } // Uncompressed size modulo 2^32 (ISIZE in the spec) final long isize = ByteUtils.fromLittleEndian(inData, 4); if (isize != (inf.getBytesWritten() & 0xffffffffL)) { throw new IOException("Gzip-compressed data is corrupt" + "(uncompressed size mismatch)"); } // See if this is the end of the file. if (!decompressConcatenated || !init(false)) { inf.end(); inf = null; endReached = true; return size == 0 ? -1 : size; } } } return size; }
Checks if the signature matches what is expected for a .gz file.
Params:
  • signature – the bytes to check
  • length – the number of bytes to check
Returns: true if this is a .gz stream, false otherwise
Since:1.1
/**
 * Tells whether the given bytes start with the two-byte .gz magic number
 * (31, 139).
 *
 * @param signature the bytes to check
 * @param length the number of bytes to check
 * @return true if this is a .gz stream, false otherwise
 *
 * @since 1.1
 */
public static boolean matches(final byte[] signature, final int length) {
    // Fewer than two bytes can never hold the magic number.
    if (length < 2) {
        return false;
    }
    if (signature[0] != 0x1f) {
        return false;
    }
    // 0x8b is 139 unsigned, i.e. -117 as a signed byte.
    return signature[1] == (byte) 0x8b;
}
Closes the input stream (unless it is System.in).
Since:1.2
/**
 * Releases the inflater and closes the input stream (unless it is
 * System.in).
 *
 * @throws IOException if closing the input stream fails
 *
 * @since 1.2
 */
@Override
public void close() throws IOException {
    final Inflater inflater = inf;
    if (inflater != null) {
        inflater.end();
        inf = null;
    }

    // NOTE(review): the two-argument constructor always assigns "in" a
    // CountingInputStream or a BufferedInputStream around one, so this
    // identity comparison looks like it can never match System.in itself.
    // Confirm whether a wrapped System.in is meant to stay open.
    if (this.in == System.in) {
        return;
    }
    this.in.close();
}
Since:1.17
/**
 * Returns the number of bytes read so far through the counting wrapper
 * around the underlying input stream.
 *
 * @return the byte count reported by the counting stream
 * @since 1.17
 */
@Override
public long getCompressedCount() {
    final long bytesConsumed = countingStream.getBytesRead();
    return bytesConsumed;
}
}