/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.commons.compress.archivers.zip;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
// A ZipEncoding, which uses a java.nio Charset
// to encode names. The methods of this class are reentrant.
// @Immutable
/**
* A ZipEncoding, which uses a java.nio {@link
* java.nio.charset.Charset Charset} to encode names.
* <p>The methods of this class are reentrant.</p>
* @Immutable
*/
class NioZipEncoding implements ZipEncoding, CharsetAccessor {
private final Charset charset;
private final boolean useReplacement;
private static final char REPLACEMENT = '?';
private static final byte[] REPLACEMENT_BYTES = { (byte) REPLACEMENT };
private static final String REPLACEMENT_STRING = String.valueOf(REPLACEMENT);
private static final char[] HEX_CHARS = new char[] {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
Construct an NioZipEncoding using the given charset.
Params: - charset – The character set to use.
- useReplacement – should invalid characters be replaced, or reported.
/**
* Construct an NioZipEncoding using the given charset.
* @param charset The character set to use.
* @param useReplacement should invalid characters be replaced, or reported.
*/
NioZipEncoding(final Charset charset, boolean useReplacement) {
this.charset = charset;
this.useReplacement = useReplacement;
}
@Override
public Charset getCharset() {
return charset;
}
See Also: - canEncode.canEncode(String)
/**
* @see ZipEncoding#canEncode(java.lang.String)
*/
@Override
public boolean canEncode(final String name) {
final CharsetEncoder enc = newEncoder();
return enc.canEncode(name);
}
See Also: - encode.encode(String)
/**
* @see ZipEncoding#encode(java.lang.String)
*/
@Override
public ByteBuffer encode(final String name) {
final CharsetEncoder enc = newEncoder();
final CharBuffer cb = CharBuffer.wrap(name);
CharBuffer tmp = null;
ByteBuffer out = ByteBuffer.allocate(estimateInitialBufferSize(enc, cb.remaining()));
while (cb.hasRemaining()) {
final CoderResult res = enc.encode(cb, out, false);
if (res.isUnmappable() || res.isMalformed()) {
// write the unmappable characters in utf-16
// pseudo-URL encoding style to ByteBuffer.
int spaceForSurrogate = estimateIncrementalEncodingSize(enc, 6 * res.length());
if (spaceForSurrogate > out.remaining()) {
// if the destination buffer isn't over sized, assume that the presence of one
// unmappable character makes it likely that there will be more. Find all the
// un-encoded characters and allocate space based on those estimates.
int charCount = 0;
for (int i = cb.position() ; i < cb.limit(); i++) {
charCount += !enc.canEncode(cb.get(i)) ? 6 : 1;
}
int totalExtraSpace = estimateIncrementalEncodingSize(enc, charCount);
out = ZipEncodingHelper.growBufferBy(out, totalExtraSpace - out.remaining());
}
if (tmp == null) {
tmp = CharBuffer.allocate(6);
}
for (int i = 0; i < res.length(); ++i) {
out = encodeFully(enc, encodeSurrogate(tmp, cb.get()), out);
}
} else if (res.isOverflow()) {
int increment = estimateIncrementalEncodingSize(enc, cb.remaining());
out = ZipEncodingHelper.growBufferBy(out, increment);
} else if (res.isUnderflow() || res.isError()) {
break;
}
}
// tell the encoder we are done
enc.encode(cb, out, true);
// may have caused underflow, but that's been ignored traditionally
out.limit(out.position());
out.rewind();
return out;
}
See Also: - decode.decode(byte[])
/**
* @see
* ZipEncoding#decode(byte[])
*/
@Override
public String decode(final byte[] data) throws IOException {
return newDecoder()
.decode(ByteBuffer.wrap(data)).toString();
}
private static ByteBuffer encodeFully(CharsetEncoder enc, CharBuffer cb, ByteBuffer out) {
ByteBuffer o = out;
while (cb.hasRemaining()) {
CoderResult result = enc.encode(cb, o, false);
if (result.isOverflow()) {
int increment = estimateIncrementalEncodingSize(enc, cb.remaining());
o = ZipEncodingHelper.growBufferBy(o, increment);
}
}
return o;
}
private static CharBuffer encodeSurrogate(CharBuffer cb, char c) {
cb.position(0).limit(6);
cb.put('%');
cb.put('U');
cb.put(HEX_CHARS[(c >> 12) & 0x0f]);
cb.put(HEX_CHARS[(c >> 8) & 0x0f]);
cb.put(HEX_CHARS[(c >> 4) & 0x0f]);
cb.put(HEX_CHARS[c & 0x0f]);
cb.flip();
return cb;
}
private CharsetEncoder newEncoder() {
if (useReplacement) {
return charset.newEncoder()
.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.replaceWith(REPLACEMENT_BYTES);
} else {
return charset.newEncoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
}
}
private CharsetDecoder newDecoder() {
if (!useReplacement) {
return this.charset.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
.onUnmappableCharacter(CodingErrorAction.REPORT);
} else {
return charset.newDecoder()
.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.replaceWith(REPLACEMENT_STRING);
}
}
Estimate the initial encoded size (in bytes) for a character buffer.
The estimate assumes that one character consumes uses the maximum length encoding,
whilst the rest use an average size encoding. This accounts for any BOM for UTF-16, at
the expense of a couple of extra bytes for UTF-8 encoded ASCII.
Params: - enc – encoder to use for estimates
- charChount – number of characters in string
Returns: estimated size in bytes.
/**
* Estimate the initial encoded size (in bytes) for a character buffer.
* <p>
* The estimate assumes that one character consumes uses the maximum length encoding,
* whilst the rest use an average size encoding. This accounts for any BOM for UTF-16, at
* the expense of a couple of extra bytes for UTF-8 encoded ASCII.
* </p>
*
* @param enc encoder to use for estimates
* @param charChount number of characters in string
* @return estimated size in bytes.
*/
private static int estimateInitialBufferSize(CharsetEncoder enc, int charChount) {
float first = enc.maxBytesPerChar();
float rest = (charChount - 1) * enc.averageBytesPerChar();
return (int) Math.ceil(first + rest);
}
Estimate the size needed for remaining characters
Params: - enc – encoder to use for estimates
- charCount – number of characters remaining
Returns: estimated size in bytes.
/**
* Estimate the size needed for remaining characters
*
* @param enc encoder to use for estimates
* @param charCount number of characters remaining
* @return estimated size in bytes.
*/
private static int estimateIncrementalEncodingSize(CharsetEncoder enc, int charCount) {
return (int) Math.ceil(charCount * enc.averageBytesPerChar());
}
}