/* Copyright (c) 2008 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.yaml.snakeyaml.external.com.google.gdata.util.common.base;

A UnicodeEscaper that escapes some set of Java characters using the URI percent encoding scheme. The set of safe characters (those which remain unescaped) can be specified on construction.

For details on escaping URIs for use in web pages, see section 2.4 of RFC 3986.

In most cases this class should not need to be used directly. If you have no special requirements for escaping your URIs, you should use either CharEscapers.uriEscaper() or CharEscapers.uriEscaper(boolean).

When encoding a String, the following rules apply:

  • The alphanumeric characters "a" through "z", "A" through "Z" and "0" through "9" remain the same.
  • Any additionally specified safe characters remain the same.
  • If plusForSpace was specified, the space character " " is converted into a plus sign "+".
  • All other characters are converted into one or more bytes using UTF-8 encoding and each byte is then represented by the 3-character string "%XY", where "XY" is the two-digit, uppercase, hexadecimal representation of the byte value.

RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!", "~", "*", "'", "(" and ")". It goes on to state:

Unreserved characters can be escaped without changing the semantics of the URI, but this should not be done unless the URI is being used in a context that does not allow the unescaped character to appear.

For performance reasons the only currently supported character encoding of this class is UTF-8.

Note: This escaper produces uppercase hexadecimal sequences. From RFC 3986:
"URI producers and normalizers should use uppercase hexadecimal digits for all percent-encodings."

/**
 * A {@code UnicodeEscaper} that escapes some set of Java characters using the
 * URI percent encoding scheme. The set of safe characters (those which remain
 * unescaped) can be specified on construction.
 *
 * <p>
 * For details on escaping URIs for use in web pages, see section 2.4 of <a
 * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>.
 *
 * <p>
 * In most cases this class should not need to be used directly. If you have no
 * special requirements for escaping your URIs, you should use either
 * {@link CharEscapers#uriEscaper()} or {@link CharEscapers#uriEscaper(boolean)}.
 *
 * <p>
 * When encoding a String, the following rules apply:
 * <ul>
 * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0"
 * through "9" remain the same.
 * <li>Any additionally specified safe characters remain the same.
 * <li>If {@code plusForSpace} was specified, the space character " " is
 * converted into a plus sign "+".
 * <li>All other characters are converted into one or more bytes using UTF-8
 * encoding and each byte is then represented by the 3-character string "%XY",
 * where "XY" is the two-digit, uppercase, hexadecimal representation of the
 * byte value.
 * </ul>
 *
 * <p>
 * RFC 2396 specifies the set of unreserved characters as "-", "_", ".", "!",
 * "~", "*", "'", "(" and ")". It goes on to state:
 *
 * <p>
 * <i>Unreserved characters can be escaped without changing the semantics of the
 * URI, but this should not be done unless the URI is being used in a context
 * that does not allow the unescaped character to appear.</i>
 *
 * <p>
 * For performance reasons the only currently supported character encoding of
 * this class is UTF-8.
 *
 * <p>
 * <b>Note</b>: This escaper produces uppercase hexadecimal sequences. From <a
 * href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
 * <i>"URI producers and normalizers should use uppercase hexadecimal digits for
 * all percent-encodings."</i>
 */
public class PercentEscaper extends UnicodeEscaper {
A string of safe characters that mimics the behavior of URLEncoder.
/**
 * A string of safe characters that mimics the behavior of
 * {@link java.net.URLEncoder}. Intended to be passed as the
 * {@code safeChars} argument of the constructor.
 */
public static final String SAFECHARS_URLENCODER = "-_.*";
A string of characters that do not need to be encoded when used in URI path segments, as specified in RFC 3986. Note that some of these characters do need to be escaped when used in other parts of the URI.
/**
 * A string of characters that do not need to be encoded when used in URI
 * path segments, as specified in RFC 3986. Note that some of these
 * characters do need to be escaped when used in other parts of the URI.
 */
public static final String SAFEPATHCHARS_URLENCODER = "-_.!~*'()@:$&,;=";
A string of characters that do not need to be encoded when used in URI query strings, as specified in RFC 3986. Note that some of these characters do need to be escaped when used in other parts of the URI.
/**
 * Characters that may be left unencoded inside a URI query string, as
 * specified in RFC 3986. Several of them still require escaping when they
 * appear in other parts of a URI.
 */
public static final String SAFEQUERYSTRINGCHARS_URLENCODER = "-_.!~*'()@:$,;/?:";

/** Replacement returned for a space by escapers that map spaces to '+'. */
private static final char[] URI_ESCAPED_SPACE = "+".toCharArray();

/** Uppercase hexadecimal digits, indexed by the value of a 4-bit nibble. */
private static final char[] UPPER_HEX_DIGITS = {
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
If true we should convert space to the + character.
/**
 * If true we should convert space to the {@code +} character. Assigned once
 * in the constructor from the argument of the same name.
 */
private final boolean plusForSpace;
An array of flags where for any char c if safeOctets[c] is true then c should remain unmodified in the output. If c >= safeOctets.length then it should be escaped.
/**
 * An array of flags where for any {@code char c} if {@code safeOctets[c]}
 * is true then {@code c} should remain unmodified in the output. If
 * {@code c >= safeOctets.length} then it should be escaped.
 */
private final boolean[] safeOctets;
Constructs a URI escaper with the specified safe characters and optional handling of the space character.
Params:
  • safeChars – a non null string specifying additional safe characters for this escaper (the ranges 0..9, a..z and A..Z are always safe and should not be specified here)
  • plusForSpace – true if ASCII space should be escaped to + rather than %20
Throws:
  • IllegalArgumentException – if any of the parameters were invalid
/** * Constructs a URI escaper with the specified safe characters and optional * handling of the space character. * * @param safeChars * a non null string specifying additional safe characters for * this escaper (the ranges 0..9, a..z and A..Z are always safe * and should not be specified here) * @param plusForSpace * true if ASCII space should be escaped to {@code +} rather than * {@code %20} * @throws IllegalArgumentException * if any of the parameters were invalid */
public PercentEscaper(String safeChars, boolean plusForSpace) { // Avoid any misunderstandings about the behavior of this escaper if (safeChars.matches(".*[0-9A-Za-z].*")) { throw new IllegalArgumentException( "Alphanumeric characters are always 'safe' and should not be " + "explicitly specified"); } // Avoid ambiguous parameters. Safe characters are never modified so if // space is a safe character then setting plusForSpace is meaningless. if (plusForSpace && safeChars.contains(" ")) { throw new IllegalArgumentException( "plusForSpace cannot be specified when space is a 'safe' character"); } if (safeChars.contains("%")) { throw new IllegalArgumentException("The '%' character cannot be specified as 'safe'"); } this.plusForSpace = plusForSpace; this.safeOctets = createSafeOctets(safeChars); }
Creates a boolean[] with entries corresponding to the character values for 0-9, A-Z, a-z and those specified in safeChars set to true. The array is as small as is required to hold the given character information.
/**
 * Builds the lookup table of characters that are emitted unescaped: the
 * entries for 0-9, A-Z, a-z and every character of {@code safeChars} are
 * true, all others false. The table is only as long as needed to hold the
 * highest of those characters.
 */
private static boolean[] createSafeOctets(String safeChars) {
    final int count = safeChars.length();

    // The table must reach at least 'z', the highest always-safe character.
    int highest = 'z';
    for (int i = 0; i < count; i++) {
        char candidate = safeChars.charAt(i);
        if (candidate > highest) {
            highest = candidate;
        }
    }

    boolean[] table = new boolean[highest + 1];
    // One pass over '0'..'z' marks exactly the three alphanumeric ranges.
    for (int c = '0'; c <= 'z'; c++) {
        boolean digit = c <= '9';
        boolean upper = c >= 'A' && c <= 'Z';
        boolean lower = c >= 'a';
        table[c] = digit || upper || lower;
    }
    for (int i = 0; i < count; i++) {
        table[safeChars.charAt(i)] = true;
    }
    return table;
}

/*
 * Overridden for performance. For unescaped strings this improved the
 * performance of the uri escaper from ~760ns to ~400ns as measured by
 * {@link CharEscapersBenchmark}.
 *
 * Returns the position of the first character in [index, end) that is not
 * flagged safe, or the loop's final position when every character is safe.
 */
@Override
protected int nextEscapeIndex(CharSequence csq, int index, int end) {
    int pos = index;
    while (pos < end) {
        char c = csq.charAt(pos);
        boolean safe = c < safeOctets.length && safeOctets[c];
        if (!safe) {
            return pos;
        }
        pos++;
    }
    return pos;
}

/*
 * Overridden for performance. For unescaped strings this improved the
 * performance of the uri escaper from ~400ns to ~170ns as measured by
 * {@link CharEscapersBenchmark}.
 *
 * Returns the input itself when every character is safe; otherwise hands
 * over to escapeSlow starting at the first unsafe character.
 */
@Override
public String escape(String s) {
    final int length = s.length();
    int i = 0;
    while (i < length) {
        char c = s.charAt(i);
        boolean safe = c < safeOctets.length && safeOctets[c];
        if (!safe) {
            return escapeSlow(s, i);
        }
        i++;
    }
    return s;
}
Escapes the given Unicode code point in UTF-8.
/**
 * Escapes the given Unicode code point in UTF-8.
 *
 * <p>
 * Returns {@code null} when the code point is flagged safe, the shared
 * "+" replacement for a space when {@code plusForSpace} is set, and
 * otherwise a new array holding one "%XY" triple per UTF-8 octet.
 *
 * @throws IllegalArgumentException
 *             if {@code cp} is greater than 0x10ffff
 */
@Override
protected char[] escape(int cp) {
    // Negative values are never expected here; should one arrive, the array
    // access below throws an IndexOutOfBoundsException, so it gets spotted.
    if (cp < safeOctets.length && safeOctets[cp]) {
        return null;
    }
    if (cp == ' ' && plusForSpace) {
        return URI_ESCAPED_SPACE;
    }
    if (cp <= 0x7F) {
        // Single octet: 0xxxxxxx
        char[] out = new char[3];
        writeOctet(out, 0, cp);
        return out;
    }
    if (cp <= 0x7ff) {
        // Two octets [0x80..0x7ff]: 110xxxxx 10xxxxxx
        char[] out = new char[6];
        writeOctet(out, 0, 0xC0 | (cp >>> 6));
        writeOctet(out, 3, 0x80 | (cp & 0x3F));
        return out;
    }
    if (cp <= 0xffff) {
        // Three octets [0x800..0xffff]: 1110xxxx 10xxxxxx 10xxxxxx
        char[] out = new char[9];
        writeOctet(out, 0, 0xE0 | (cp >>> 12));
        writeOctet(out, 3, 0x80 | ((cp >>> 6) & 0x3F));
        writeOctet(out, 6, 0x80 | (cp & 0x3F));
        return out;
    }
    if (cp <= 0x10ffff) {
        // Four octets [0x10000..0x10ffff]: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        char[] out = new char[12];
        writeOctet(out, 0, 0xF0 | (cp >>> 18));
        writeOctet(out, 3, 0x80 | ((cp >>> 12) & 0x3F));
        writeOctet(out, 6, 0x80 | ((cp >>> 6) & 0x3F));
        writeOctet(out, 9, 0x80 | (cp & 0x3F));
        return out;
    }
    // If this ever happens it is due to bug in UnicodeEscaper, not bad
    // input.
    throw new IllegalArgumentException("Invalid unicode character value " + cp);
}

/** Writes '%' followed by the two uppercase hex digits of {@code octet} at {@code offset}. */
private static void writeOctet(char[] dest, int offset, int octet) {
    dest[offset] = '%';
    dest[offset + 1] = UPPER_HEX_DIGITS[octet >>> 4];
    dest[offset + 2] = UPPER_HEX_DIGITS[octet & 0xF];
}
}