/*
* Copyright (C) 2009 The Guava Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.google.common.xml;
import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;
import com.google.common.escape.Escaper;
import com.google.common.escape.Escapers;
Escaper
instances suitable for strings to be included in XML attribute values and elements' text contents. When possible, avoid manual escaping by using templating systems and high-level APIs that provide autoescaping. For example, consider XOM or JDOM.
Note: Currently the escapers provided by this class do not escape any characters
outside the ASCII character range. Unlike HTML escaping the XML escapers will not escape
non-ASCII characters to their numeric entity replacements. These XML escapers provide the minimal
level of escaping to ensure that the output can be safely included in a Unicode XML document.
For details on the behavior of the escapers in this class, see sections 2.2 and 2.4 of the XML specification.
Author: Alex Matevossian, David Beaumont Since: 15.0
/**
* {@code Escaper} instances suitable for strings to be included in XML attribute values and
* elements' text contents. When possible, avoid manual escaping by using templating systems and
* high-level APIs that provide autoescaping. For example, consider <a
* href="http://www.xom.nu/">XOM</a> or <a href="http://www.jdom.org/">JDOM</a>.
*
* <p><b>Note:</b> Currently the escapers provided by this class do not escape any characters
* outside the ASCII character range. Unlike HTML escaping the XML escapers will not escape
* non-ASCII characters to their numeric entity replacements. These XML escapers provide the minimal
* level of escaping to ensure that the output can be safely included in a Unicode XML document.
*
*
* <p>For details on the behavior of the escapers in this class, see sections <a
* href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> and <a
* href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
*
* @author Alex Matevossian
* @author David Beaumont
* @since 15.0
*/
@Beta
@GwtCompatible
public class XmlEscapers {
private XmlEscapers() {}
private static final char MIN_ASCII_CONTROL_CHAR = 0x00;
private static final char MAX_ASCII_CONTROL_CHAR = 0x1F;
// For each xxxEscaper() method, please add links to external reference pages
// that are considered authoritative for the behavior of that escaper.
Returns an Escaper
instance that escapes special characters in a string so it can safely be included in an XML document as element content. See section 2.4 of the XML specification.
Note: Double and single quotes are not escaped, so it is not safe to use this escaper to escape attribute values. Use xmlContentEscaper
if the output can appear in element content or xmlAttributeEscaper
in attribute values.
This escaper substitutes 0xFFFD
for non-whitespace control characters and the character values 0xFFFE
and 0xFFFF
which are not permitted in XML. For more detail see section 2.2 of
the XML specification.
This escaper does not escape non-ASCII characters to their numeric character references
(NCR). Any non-ASCII characters appearing in the input will be preserved in the output.
Specifically "\r" (carriage return) is preserved in the output, which may result in it being
silently converted to "\n" when the XML is parsed.
This escaper does not treat surrogate pairs specially and does not perform Unicode
validation on its input.
/**
* Returns an {@link Escaper} instance that escapes special characters in a string so it can
* safely be included in an XML document as element content. See section <a
* href="http://www.w3.org/TR/2008/REC-xml-20081126/#syntax">2.4</a> of the XML specification.
*
* <p><b>Note:</b> Double and single quotes are not escaped, so it is <b>not safe</b> to use this
* escaper to escape attribute values. Use {@link #xmlContentEscaper} if the output can appear in
* element content or {@link #xmlAttributeEscaper} in attribute values.
*
* <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
* character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
* detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
* the XML specification.
*
* <p>This escaper does not escape non-ASCII characters to their numeric character references
* (NCR). Any non-ASCII characters appearing in the input will be preserved in the output.
* Specifically "\r" (carriage return) is preserved in the output, which may result in it being
* silently converted to "\n" when the XML is parsed.
*
* <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
* validation on its input.
*/
public static Escaper xmlContentEscaper() {
return XML_CONTENT_ESCAPER;
}
Returns an Escaper
instance that escapes special characters in a string so it can safely be included in XML document as an attribute value. See section 3.3.3 of the XML
specification.
This escaper substitutes 0xFFFD
for non-whitespace control characters and the character values 0xFFFE
and 0xFFFF
which are not permitted in XML. For more detail see section 2.2 of
the XML specification.
This escaper does not escape non-ASCII characters to their numeric character references (NCR). However, horizontal tab '\t'
, line feed '\n'
and carriage return
'\r'
are escaped to a corresponding NCR "	"
, "
"
, and "
"
respectively. Any other non-ASCII characters appearing in the input will be preserved in the output.
This escaper does not treat surrogate pairs specially and does not perform Unicode
validation on its input.
/**
* Returns an {@link Escaper} instance that escapes special characters in a string so it can
* safely be included in XML document as an attribute value. See section <a
* href="http://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize">3.3.3</a> of the XML
* specification.
*
* <p>This escaper substitutes {@code 0xFFFD} for non-whitespace control characters and the
* character values {@code 0xFFFE} and {@code 0xFFFF} which are not permitted in XML. For more
* detail see section <a href="http://www.w3.org/TR/2008/REC-xml-20081126/#charsets">2.2</a> of
* the XML specification.
*
* <p>This escaper does not escape non-ASCII characters to their numeric character references
* (NCR). However, horizontal tab {@code '\t'}, line feed {@code '\n'} and carriage return {@code
* '\r'} are escaped to a corresponding NCR {@code "	"}, {@code "
"}, and {@code "
"}
* respectively. Any other non-ASCII characters appearing in the input will be preserved in the
* output.
*
* <p>This escaper does not treat surrogate pairs specially and does not perform Unicode
* validation on its input.
*/
public static Escaper xmlAttributeEscaper() {
return XML_ATTRIBUTE_ESCAPER;
}
private static final Escaper XML_ESCAPER;
private static final Escaper XML_CONTENT_ESCAPER;
private static final Escaper XML_ATTRIBUTE_ESCAPER;
static {
Escapers.Builder builder = Escapers.builder();
// The char values \uFFFE and \uFFFF are explicitly not allowed in XML
// (Unicode code points above \uFFFF are represented via surrogate pairs
// which means they are treated as pairs of safe characters).
builder.setSafeRange(Character.MIN_VALUE, '\uFFFD');
// Unsafe characters are replaced with the Unicode replacement character.
builder.setUnsafeReplacement("\uFFFD");
/*
* Except for \n, \t, and \r, all ASCII control characters are replaced with the Unicode
* replacement character.
*
* Implementation note: An alternative to the following would be to make a map that simply
* replaces the allowed ASCII whitespace characters with themselves and to set the minimum safe
* character to 0x20. However this would slow down the escaping of simple strings that contain
* \t, \n, or \r.
*/
for (char c = MIN_ASCII_CONTROL_CHAR; c <= MAX_ASCII_CONTROL_CHAR; c++) {
if (c != '\t' && c != '\n' && c != '\r') {
builder.addEscape(c, "\uFFFD");
}
}
// Build the content escaper first and then add quote escaping for the
// general escaper.
builder.addEscape('&', "&");
builder.addEscape('<', "<");
builder.addEscape('>', ">");
XML_CONTENT_ESCAPER = builder.build();
builder.addEscape('\'', "'");
builder.addEscape('"', """);
XML_ESCAPER = builder.build();
builder.addEscape('\t', "	");
builder.addEscape('\n', "
");
builder.addEscape('\r', "
");
XML_ATTRIBUTE_ESCAPER = builder.build();
}
}