/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.io.input;

import static org.apache.commons.io.IOUtils.EOF;

import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.commons.io.ByteOrderMark;

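This class is used to wrap a stream that includes an encoded ByteOrderMark as its first bytes. This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the first byte in the stream. The ByteOrderMark implementation has the following pre-defined BOMs:
  • UTF-8 – ByteOrderMark.UTF_8
  • UTF-16BE – ByteOrderMark.UTF_16BE
  • UTF-16LE – ByteOrderMark.UTF_16LE
  • UTF-32BE – ByteOrderMark.UTF_32BE
  • UTF-32LE – ByteOrderMark.UTF_32LE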

Example 1 - Detect and exclude a UTF-8 BOM

BOMInputStream bomIn = new BOMInputStream(in);
if (bomIn.hasBOM()) {
    // has a UTF-8 BOM
}

Example 2 - Detect a UTF-8 BOM (but don't exclude it)

boolean include = true;
BOMInputStream bomIn = new BOMInputStream(in, include);
if (bomIn.hasBOM()) {
    // has a UTF-8 BOM
}

Example 3 - Detect Multiple BOMs

BOMInputStream bomIn = new BOMInputStream(in,
  ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
  ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
  );
if (bomIn.hasBOM() == false) {
    // No BOM found
} else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
    // has a UTF-16LE BOM
} else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
    // has a UTF-16BE BOM
} else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
    // has a UTF-32LE BOM
} else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
    // has a UTF-32BE BOM
}
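See Also:
  • org.apache.commons.io.ByteOrderMark
  • Wikipedia - Byte Order Mark (http://en.wikipedia.org/wiki/Byte_order_mark)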
Version:$Id$
Since:2.0
/**
 * This class is used to wrap a stream that includes an encoded {@link ByteOrderMark} as its first bytes.
 *
 * This class detects these bytes and, if required, can automatically skip them and return the subsequent byte as the
 * first byte in the stream.
 *
 * The {@link ByteOrderMark} implementation has the following pre-defined BOMs:
 * <ul>
 * <li>UTF-8 - {@link ByteOrderMark#UTF_8}</li>
 * <li>UTF-16BE - {@link ByteOrderMark#UTF_16BE}</li>
 * <li>UTF-16LE - {@link ByteOrderMark#UTF_16LE}</li>
 * <li>UTF-32BE - {@link ByteOrderMark#UTF_32BE}</li>
 * <li>UTF-32LE - {@link ByteOrderMark#UTF_32LE}</li>
 * </ul>
 *
 *
 * <h3>Example 1 - Detect and exclude a UTF-8 BOM</h3>
 *
 * <pre>
 * BOMInputStream bomIn = new BOMInputStream(in);
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h3>Example 2 - Detect a UTF-8 BOM (but don't exclude it)</h3>
 *
 * <pre>
 * boolean include = true;
 * BOMInputStream bomIn = new BOMInputStream(in, include);
 * if (bomIn.hasBOM()) {
 *     // has a UTF-8 BOM
 * }
 * </pre>
 *
 * <h3>Example 3 - Detect Multiple BOMs</h3>
 *
 * <pre>
 * BOMInputStream bomIn = new BOMInputStream(in,
 *   ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_16BE,
 *   ByteOrderMark.UTF_32LE, ByteOrderMark.UTF_32BE
 *   );
 * if (bomIn.hasBOM() == false) {
 *     // No BOM found
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16LE)) {
 *     // has a UTF-16LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_16BE)) {
 *     // has a UTF-16BE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32LE)) {
 *     // has a UTF-32LE BOM
 * } else if (bomIn.hasBOM(ByteOrderMark.UTF_32BE)) {
 *     // has a UTF-32BE BOM
 * }
 * </pre>
 *
 * @see org.apache.commons.io.ByteOrderMark
 * @see <a href="http://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia - Byte Order Mark</a>
 * @version $Id$
 * @since 2.0
 */
public class BOMInputStream extends ProxyInputStream {

    /**
     * When {@code false}, the bytes of a detected BOM are skipped so that reads start after it; when {@code true},
     * the BOM bytes are handed back to the reader like any other data.
     */
    private final boolean include;
BOMs are sorted from longest to shortest.
/**
 * BOMs are sorted from longest to shortest.
 */
private final List<ByteOrderMark> boms;

/** The BOM matched by the last detection run, or {@code null} if none matched (or detection has not run yet). */
private ByteOrderMark byteOrderMark;

/**
 * Values read from the delegate while looking for a BOM; {@code null} until detection has run. An entry may be a
 * negative end-of-stream value when the stream is shorter than the longest configured BOM.
 */
private int[] firstBytes;

/** Number of entries of {@link #firstBytes} that were filled in (set to 0 when an excluded BOM used up the buffer). */
private int fbLength;

/** Index of the next entry of {@link #firstBytes} to hand out to a reader. */
private int fbIndex;

/** Value of {@link #fbIndex} captured by the last call to {@link #mark(int)}. */
private int markFbIndex;

/** Whether the last call to {@link #mark(int)} happened before BOM detection had run. */
private boolean markedAtStart;
Constructs a new BOM InputStream that excludes a ByteOrderMark.UTF_8 BOM.
Params:
  • delegate – the InputStream to delegate to
/**
 * Creates a BOM-aware stream that looks for a {@link ByteOrderMark#UTF_8} BOM and leaves it out of the data
 * returned to the reader.
 *
 * @param delegate
 *            the InputStream to delegate to
 */
public BOMInputStream(final InputStream delegate) {
    // Same as the general constructor with include == false and UTF-8 as the only candidate.
    this(delegate, false, ByteOrderMark.UTF_8);
}
Constructs a new BOM InputStream that detects a ByteOrderMark.UTF_8 and optionally includes it.
Params:
  • delegate – the InputStream to delegate to
  • include – true to include the UTF-8 BOM or false to exclude it
/**
 * Creates a BOM-aware stream that looks for a {@link ByteOrderMark#UTF_8} BOM and either keeps it in or leaves it
 * out of the data returned to the reader.
 *
 * @param delegate
 *            the InputStream to delegate to
 * @param include
 *            true to include the UTF-8 BOM or false to exclude it
 */
public BOMInputStream(final InputStream delegate, final boolean include) {
    // UTF-8 is the only candidate; the caller decides whether its bytes are kept.
    this(delegate, include, ByteOrderMark.UTF_8);
}
Constructs a new BOM InputStream that excludes the specified BOMs.
Params:
  • delegate – the InputStream to delegate to
  • boms – The BOMs to detect and exclude
/**
 * Creates a BOM-aware stream that looks for any of the given BOMs and leaves the detected one out of the data
 * returned to the reader.
 *
 * @param delegate
 *            the InputStream to delegate to
 * @param boms
 *            The BOMs to detect and exclude
 */
public BOMInputStream(final InputStream delegate, final ByteOrderMark... boms) {
    // Same as the general constructor with include == false.
    this(delegate, false, boms);
}
Compares ByteOrderMark objects in descending length order.
/**
 * Compares ByteOrderMark objects in descending length order: a longer BOM sorts before a shorter one, and BOMs of
 * equal length compare as equal.
 */
private static final Comparator<ByteOrderMark> ByteOrderMarkLengthComparator = new Comparator<ByteOrderMark>() {

    @Override
    public int compare(final ByteOrderMark bom1, final ByteOrderMark bom2) {
        final int len1 = bom1.length();
        final int len2 = bom2.length();
        if (len1 > len2) {
            // Longer first. Use an explicit literal rather than the unrelated end-of-stream constant.
            return -1;
        }
        if (len2 > len1) {
            return 1;
        }
        return 0;
    }
};
Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
Params:
  • delegate – the InputStream to delegate to
  • include – true to include the specified BOMs or false to exclude them
  • boms – The BOMs to detect and optionally exclude
/**
 * Constructs a new BOM InputStream that detects the specified BOMs and optionally includes them.
 * <p>
 * The candidate BOMs are kept internally in longest-first order. The array passed by the caller is copied before
 * sorting, so it is never reordered.
 *
 * @param delegate
 *            the InputStream to delegate to
 * @param include
 *            true to include the specified BOMs or false to exclude them
 * @param boms
 *            The BOMs to detect and optionally exclude
 * @throws IllegalArgumentException
 *             if {@code boms} is null or empty
 */
public BOMInputStream(final InputStream delegate, final boolean include, final ByteOrderMark... boms) {
    super(delegate);
    if (boms == null || boms.length == 0) {
        throw new IllegalArgumentException("No BOMs specified");
    }
    this.include = include;
    // Work on a private copy: Arrays.asList is a write-through view, so sorting it directly would
    // silently reorder the caller's array.
    final ByteOrderMark[] sorted = boms.clone();
    // Sort the BOMs to match the longest BOM first because some BOMs have the same starting two bytes.
    Arrays.sort(sorted, ByteOrderMarkLengthComparator);
    this.boms = Arrays.asList(sorted);
}
Indicates whether the stream contains one of the specified BOMs.
Throws:
  • IOException – if an error reading the first bytes of the stream occurs
Returns:true if the stream has one of the specified BOMs, otherwise false if it does not
/**
 * Tells whether the stream starts with any of the BOMs this stream was configured to detect.
 *
 * @return true if the stream has one of the specified BOMs, otherwise false if it does not
 * @throws IOException
 *             if an error reading the first bytes of the stream occurs
 */
public boolean hasBOM() throws IOException {
    final ByteOrderMark detected = getBOM();
    return detected != null;
}
Indicates whether the stream contains the specified BOM.
Params:
  • bom – The BOM to check for
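Throws:
  • IllegalArgumentException – if the BOM is not one the stream is configured to detect
  • IOException – if an error reading the first bytes of the stream occurs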
Returns:true if the stream has the specified BOM, otherwise false if it does not
/**
 * Indicates whether the stream contains the specified BOM.
 *
 * @param bom
 *            The BOM to check for
 * @return true if the stream has the specified BOM, otherwise false if it does not
 * @throws IllegalArgumentException
 *             if the BOM is not one the stream is configured to detect
 * @throws IOException
 *             if an error reading the first bytes of the stream occurs
 */
public boolean hasBOM(final ByteOrderMark bom) throws IOException {
    if (!boms.contains(bom)) {
        throw new IllegalArgumentException("Stream not configured to detect " + bom);
    }
    // Make sure detection has run; it stores its result in byteOrderMark.
    getBOM();
    return byteOrderMark != null && byteOrderMark.equals(bom);
}
Return the BOM (Byte Order Mark).
Throws:
  • IOException – if an error reading the first bytes of the stream occurs
Returns:The BOM or null if none
/** * Return the BOM (Byte Order Mark). * * @return The BOM or null if none * @throws IOException * if an error reading the first bytes of the stream occurs */
public ByteOrderMark getBOM() throws IOException { if (firstBytes == null) { fbLength = 0; // BOMs are sorted from longest to shortest final int maxBomSize = boms.get(0).length(); firstBytes = new int[maxBomSize]; // Read first maxBomSize bytes for (int i = 0; i < firstBytes.length; i++) { firstBytes[i] = in.read(); fbLength++; if (firstBytes[i] < 0) { break; } } // match BOM in firstBytes byteOrderMark = find(); if (byteOrderMark != null) { if (!include) { if (byteOrderMark.length() < firstBytes.length) { fbIndex = byteOrderMark.length(); } else { fbLength = 0; } } } } return byteOrderMark; }
Return the BOM charset Name - ByteOrderMark.getCharsetName().
Throws:
  • IOException – if an error reading the first bytes of the stream occurs
Returns:The BOM charset Name or null if no BOM found
/**
 * Returns the charset name of the detected BOM, as given by {@link ByteOrderMark#getCharsetName()}.
 *
 * @return The BOM charset Name or null if no BOM found
 * @throws IOException
 *             if an error reading the first bytes of the stream occurs
 */
public String getBOMCharsetName() throws IOException {
    // Run detection if needed; the result lands in byteOrderMark.
    getBOM();
    if (byteOrderMark == null) {
        return null;
    }
    return byteOrderMark.getCharsetName();
}
This method reads and either preserves or skips the first bytes in the stream. It behaves like the single-byte read() method, either returning a valid byte or -1 to indicate that the initial bytes have been processed already.
Throws:
  • IOException – if an I/O error occurs
Returns:the byte read (excluding BOM) or -1 if the end of stream
/**
 * Hands out the next value buffered during BOM detection, running detection first if it has not happened yet.
 * It behaves like the single-byte <code>read()</code> method: it returns a buffered value while any remain and
 * -1 once the buffered first bytes have all been handed out.
 *
 * @return the byte read (excluding BOM) or -1 if the end of stream
 * @throws IOException
 *             if an I/O error occurs
 */
private int readFirstBytes() throws IOException {
    getBOM();
    if (fbIndex < fbLength) {
        final int value = firstBytes[fbIndex];
        fbIndex++;
        return value;
    }
    return EOF;
}
Find a BOM with the specified bytes.
Returns:The matched BOM or null if none matched
/**
 * Looks through the configured BOMs, in their stored order, for the first one matching the buffered first bytes.
 *
 * @return The matched BOM or null if none matched
 */
private ByteOrderMark find() {
    for (int i = 0; i < boms.size(); i++) {
        final ByteOrderMark candidate = boms.get(i);
        if (matches(candidate)) {
            return candidate;
        }
    }
    return null;
}
Check if the bytes match a BOM.
Params:
  • bom – The BOM
Returns:true if the bytes match the bom, otherwise false
/** * Check if the bytes match a BOM. * * @param bom * The BOM * @return true if the bytes match the bom, otherwise false */
private boolean matches(final ByteOrderMark bom) { // if (bom.length() != fbLength) { // return false; // } // firstBytes may be bigger than the BOM bytes for (int i = 0; i < bom.length(); i++) { if (bom.get(i) != firstBytes[i]) { return false; } } return true; } // ---------------------------------------------------------------------------- // Implementation of InputStream // ----------------------------------------------------------------------------
Invokes the delegate's read() method, detecting and optionally skipping BOM.
Throws:
  • IOException – if an I/O error occurs
Returns:the byte read (excluding BOM) or -1 if the end of stream
/**
 * Reads one byte, detecting and optionally skipping the BOM. Values buffered during detection are served first;
 * after that the delegate's <code>read()</code> method is invoked.
 *
 * @return the byte read (excluding BOM) or -1 if the end of stream
 * @throws IOException
 *             if an I/O error occurs
 */
@Override
public int read() throws IOException {
    final int buffered = readFirstBytes();
    if (buffered >= 0) {
        return buffered;
    }
    return in.read();
}
Invokes the delegate's read(byte[], int, int) method, detecting and optionally skipping BOM.
Params:
  • buf – the buffer to read the bytes into
  • off – The start offset
  • len – The number of bytes to read (excluding BOM)
Throws:
  • IOException – if an I/O error occurs
Returns:the number of bytes read or -1 if the end of stream
/**
 * Reads bytes into part of an array, detecting and optionally skipping the BOM. Values buffered during detection
 * are copied first; the delegate's <code>read(byte[], int, int)</code> method is then invoked for the remainder.
 *
 * @param buf
 *            the buffer to read the bytes into
 * @param off
 *            The start offset
 * @param len
 *            The number of bytes to read (excluding BOM)
 * @return the number of bytes read or -1 if the end of stream
 * @throws IOException
 *             if an I/O error occurs
 */
@Override
public int read(final byte[] buf, int off, int len) throws IOException {
    // Phase 1: drain whatever detection buffered, as long as the caller still wants bytes.
    int fromBuffer = 0;
    while (len > 0) {
        final int value = readFirstBytes();
        if (value < 0) {
            break;
        }
        buf[off++] = (byte) (value & 0xFF);
        len--;
        fromBuffer++;
    }

    // Phase 2: let the delegate fill the rest.
    final int fromDelegate = in.read(buf, off, len);
    if (fromDelegate >= 0) {
        return fromBuffer + fromDelegate;
    }
    // The delegate is at end of stream: report the buffered bytes if there were any.
    return fromBuffer > 0 ? fromBuffer : EOF;
}
Invokes the delegate's read(byte[]) method, detecting and optionally skipping BOM.
Params:
  • buf – the buffer to read the bytes into
Throws:
  • IOException – if an I/O error occurs
Returns:the number of bytes read (excluding BOM) or -1 if the end of stream
/**
 * Reads bytes into the whole of the given array, detecting and optionally skipping the BOM. This is the same as
 * calling <code>read(buf, 0, buf.length)</code> on this stream.
 *
 * @param buf
 *            the buffer to read the bytes into
 * @return the number of bytes read (excluding BOM) or -1 if the end of stream
 * @throws IOException
 *             if an I/O error occurs
 */
@Override
public int read(final byte[] buf) throws IOException {
    final int capacity = buf.length;
    return read(buf, 0, capacity);
}
Invokes the delegate's mark(int) method.
Params:
  • readlimit – read ahead limit
/**
 * Remembers the current position in the buffered first bytes, and whether BOM detection has run yet, then invokes
 * the delegate's <code>mark(int)</code> method.
 *
 * @param readlimit
 *            read ahead limit
 */
@Override
public synchronized void mark(final int readlimit) {
    // No buffer yet means the mark is placed before detection has read anything.
    final boolean detectionPending = (firstBytes == null);
    markedAtStart = detectionPending;
    markFbIndex = fbIndex;
    in.mark(readlimit);
}
Invokes the delegate's reset() method.
Throws:
  • IOException – if an I/O error occurs
/**
 * Restores the state remembered by {@link #mark(int)} and then invokes the delegate's <code>reset()</code> method.
 *
 * @throws IOException
 *             if an I/O error occurs
 */
@Override
public synchronized void reset() throws IOException {
    if (markedAtStart) {
        // The mark was set before detection ran: drop the buffer so detection runs again.
        firstBytes = null;
    }
    fbIndex = markFbIndex;
    in.reset();
}
Invokes the delegate's skip(long) method, detecting and optionally skipping BOM.
Params:
  • n – the number of bytes to skip
Throws:
  • IOException – if an I/O error occurs
Returns:the number of bytes actually skipped
/**
 * Skips bytes, detecting and optionally skipping the BOM. Values buffered during detection are consumed first;
 * the delegate's <code>skip(long)</code> method is then invoked for the remainder.
 *
 * @param n
 *            the number of bytes to skip
 * @return the number of buffered bytes consumed plus the value returned by the delegate's <code>skip(long)</code>
 * @throws IOException
 *             if an I/O error occurs
 */
@Override
public long skip(final long n) throws IOException {
    int fromBuffer = 0;
    // Use up buffered first bytes until enough were skipped or the buffer runs dry.
    while (n > fromBuffer) {
        if (readFirstBytes() < 0) {
            break;
        }
        fromBuffer++;
    }
    final long fromDelegate = in.skip(n - fromBuffer);
    return fromDelegate + fromBuffer;
}
}