/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.joox;

import static java.util.Arrays.asList;
import static org.joox.JOOX.$;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.datatype.DatatypeFactory;
import javax.xml.datatype.XMLGregorianCalendar;
import javax.xml.namespace.NamespaceContext;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathFunctionResolver;

import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;


DOM utilities
Author:Lukas Eder
/** * DOM utilities * * @author Lukas Eder */
class Util {
A flag indicating whether xalan extensions have been loaded
/** * A flag indicating whether xalan extensions have been loaded */
private static volatile boolean xalanExtensionLoaded = false;
The xalan extensions NamespaceContext if available
/** * The xalan extensions {@link NamespaceContext} if available */
private static NamespaceContext xalanNamespaceContext;
The xalan extensions XPathFunctionResolver if available
/** * The xalan extensions {@link XPathFunctionResolver} if available */
private static XPathFunctionResolver xalanFunctionResolver;
A pattern for the dd.mm.yyyy format
/** * A pattern for the dd.mm.yyyy format */
private static final Pattern PATTERN_DD_MM_YYYY = Pattern.compile( "^(\\d{2})[-\\./](\\d{2})[-\\./](\\d{4})(?:\\s(\\d{2})(?:[-\\./:](\\d{2})(?:[-\\./:](\\d{2})(?:\\.(\\d+))?)?)?)?$");
A pattern for various yyyy-mm-dd formats
/** * A pattern for various yyyy-mm-dd formats */
private static final Pattern PATTERN_YYYY_MM_DD = Pattern.compile( "^(\\d{4})(?:[-\\./](\\d{2})(?:[-\\./](\\d{2})(?:(?:[\\sT]|'T')(\\d{2})(?:[-\\./:](\\d{2})(?:[-\\./:](\\d{2})(?:\\.(\\d+))?)?)?)?)?)?$");
Create some content in the context of a given document
Returns:
  • A DocumentFragment if text is well-formed.
  • null, if text is plain text or not well formed
/** * Create some content in the context of a given document * * @return <ul> * <li>A {@link DocumentFragment} if <code>text</code> is * well-formed.</li> * <li><code>null</code>, if <code>text</code> is plain text or not * well formed</li> * </ul> */
static final DocumentFragment createContent(Document doc, String text) { // [#150] Text might hold XML content, which can be leniently identified by the presence // of either < or & characters (other entities, like >, ", ' are not stricly XML content) if (text != null && (text.contains("<") || text.contains("&"))) { DocumentBuilder builder = JOOX.builder(); // [#162] Prevent log output builder.setErrorHandler(new DefaultHandler()); try { // [#128] Trimming will get rid of leading and trailing whitespace, which would // otherwise cause a HIERARCHY_REQUEST_ERR raised by the parser text = text.trim(); // There is a processing instruction. We can safely assume // valid XML and parse it as such if (text.startsWith("<?xml")) { Document parsed = builder.parse(new InputSource(new StringReader(text))); DocumentFragment fragment = parsed.createDocumentFragment(); fragment.appendChild(parsed.getDocumentElement()); return (DocumentFragment) doc.importNode(fragment, true); } // Any XML document fragment. To be on the safe side, fragments // are wrapped in a dummy root node else { String wrapped = "<dummy>" + text + "</dummy>"; Document parsed = builder.parse(new InputSource(new StringReader(wrapped))); DocumentFragment fragment = parsed.createDocumentFragment(); NodeList children = parsed.getDocumentElement().getChildNodes(); // appendChild removes children also from NodeList! while (children.getLength() > 0) { fragment.appendChild(children.item(0)); } return (DocumentFragment) doc.importNode(fragment, true); } } // This does not occur catch (IOException ignore) {} // The XML content is invalid catch (SAXException ignore) {} } // Plain text or invalid XML return null; }
Get an attribute value if it exists, or null
/** * Get an attribute value if it exists, or <code>null</code> */
static final String attr(Element element, String name) { return attr(element, name, true); }
Get an attribute value if it exists, or null
/** * Get an attribute value if it exists, or <code>null</code> */
static final String attr(Element element, String name, boolean ignoreNamespace) { NamedNodeMap attributes = element.getAttributes(); for (int i = 0; i < attributes.getLength(); i++) { String localName = attributes.item(i).getNodeName(); // [#103] If namespaces are ignored, consider only local // part of possibly namespace-unaware Element if (ignoreNamespace) localName = stripNamespace(localName); if (name.equals(localName)) return attributes.item(i).getNodeValue(); } return null; }
Make a list of elements available in a document.
  • Any element that is already in the document will be detached from its parent
  • Any element that is not already in the document will be deep-imported
Params:
  • document – The document to import elements into
  • elements – The elements that are made available to a document.
Returns:Elements that are all in the supplied document, but detached.
/** * Make a list of elements available in a document. * <ul> * <li>Any element that is already in the document will be detached from its * parent</li> * <li>Any element that is not already in the document will be deep-imported * </li> * </ul> * * @param document The document to import elements into * @param elements The elements that are made available to a document. * @return Elements that are all in the supplied document, but detached. */
static final List<Element> importOrDetach(Document document, Element... elements) { List<Element> detached = new ArrayList<Element>(); for (Element e : elements) { if (document != e.getOwnerDocument()) { detached.add((Element) document.importNode(e, true)); } else { Node parent = e.getParentNode(); if (parent != null) parent.removeChild(e); detached.add(e); } } return detached; }
Transform an Match[] into an Element[], removing duplicates.
/** * Transform an {@link Match}[] into an {@link Element}[], removing duplicates. */
static final Element[] elements(Match... content) { Set<Element> result = new LinkedHashSet<Element>(); for (Match x : content) result.addAll(x.get()); return result.toArray(new Element[result.size()]); }
Transform an Element into a String.
/** * Transform an {@link Element} into a <code>String</code>. */
static final String toString(Element element) { try { ByteArrayOutputStream out = new ByteArrayOutputStream(); Transformer transformer = TransformerFactory.newInstance().newTransformer(); transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); Source source = new DOMSource(element); Result target = new StreamResult(out); transformer.transform(source, target); return out.toString("UTF-8"); } catch (Exception e) { return "[ ERROR IN toString() : " + e.getMessage() + " ]"; } }
Check whether there are any element nodes in a NodeList
/** * Check whether there are any element nodes in a {@link NodeList} */
static final boolean textNodesOnly(NodeList list) { final int length = list.getLength(); for (int i = 0; i < length; i++) if (list.item(i).getNodeType() != Node.TEXT_NODE) return false; return true; }
Return an XPath expression describing an element
/** * Return an XPath expression describing an element */
static final String xpath(Element element) { StringBuilder sb = new StringBuilder(); Node iterator = element; while (iterator.getNodeType() == Node.ELEMENT_NODE) { sb.insert(0, "]"); sb.insert(0, siblingIndex((Element) iterator) + 1); sb.insert(0, "["); sb.insert(0, ((Element) iterator).getTagName()); sb.insert(0, "/"); iterator = iterator.getParentNode(); } return sb.toString(); }
Return an path expression describing an element
/** * Return an path expression describing an element */
static final String path(Element element) { StringBuilder sb = new StringBuilder(); Node iterator = element; while (iterator.getNodeType() == Node.ELEMENT_NODE) { sb.insert(0, $(iterator).tag()); sb.insert(0, "/"); iterator = iterator.getParentNode(); } return sb.toString(); }
Find the index among siblings of the same tag name
/** * Find the index among siblings of the same tag name */
private static final int siblingIndex(Element element) { // The document element has index 0 if (element.getParentNode() == element.getOwnerDocument()) return 0; // All other elements are compared with siblings with the same name // TODO: How to deal with namespaces here? Omit or keep? else return $(element).parent().children(JOOX.tag(element.getTagName(), false)).get().indexOf(element); }
Create a context object
/** * Create a context object */
static final Context context(Element match, int matchIndex, int matchSize) { return new DefaultContext(match, matchIndex, matchSize); }
Create a context object
/** * Create a context object */
static final Context context(Element match, int matchIndex, int matchSize, Element element, int elementIndex, int elementSize) { return new DefaultContext(match, matchIndex, matchSize, element, elementIndex, elementSize); }
Return string or "" if string is null
/** * Return <code>string</code> or <code>""</code> if <code>string</code> is * <code>null</code> */
static final String nonNull(String string) { return string == null ? "" : string; }
Split a string into values
/** * Split a string into values */
static final List<String> split(String value) { List<String> result = new ArrayList<String>(); SplitState state = SplitState.NEW; StringBuilder sb = new StringBuilder(); for (int i = 0; i < value.length(); i++) { char c = value.charAt(i); stateSwitch: switch (state) { // Seeking the first character of a new word case NEW: case NEW_WITH_AT_LEAST_ONE_WORD: { newSwitch: switch (c) { // Empty word case ',': case ';': { state = SplitState.NEW_WITH_AT_LEAST_ONE_WORD; result.add(""); break newSwitch; } // Ignorable whitespace case ' ': case '\t': case '\n': case '\r': { break newSwitch; } // Start of a delimited word case '"': { state = SplitState.DELIMITED; break newSwitch; } // Start of a non-delimited word default: { state = SplitState.NON_DELIMITED; sb.append(c); break newSwitch; } } break stateSwitch; } // Within a delimited word case DELIMITED: { delimitedSwitch: switch (c) { // Potential ending delimiter case '"': { // Escaped delimiter, consume subsequent quote char if (i + 1 < value.length() && value.charAt(i + 1) == '"') { sb.append(c); i++; } // Delimiter not followed by whitespace or word stop else if (i + 1 < value.length() && !asList(',', ';', ' ', '\t', '\n', '\r').contains(value.charAt(i + 1))) { sb.append(c); } // Consume word stop following delimiter else if (i + 1 < value.length() && asList(',', ';').contains(value.charAt(i + 1))) { result.add(sb.toString()); sb = new StringBuilder(); state = SplitState.NEW_WITH_AT_LEAST_ONE_WORD; i++; } // Ending delimiter. Either it's the last character // or it is followed by whitespace else { result.add(sb.toString()); sb = new StringBuilder(); state = SplitState.NEW; } break delimitedSwitch; } // Any word content default: { sb.append(c); break delimitedSwitch; } } break stateSwitch; } case NON_DELIMITED: { nonDelimitedSwitch: switch (c) { // Hard word stop case ',': case ';': { result.add(sb.toString()); sb = new StringBuilder(); state = SplitState.NEW_WITH_AT_LEAST_ONE_WORD; break nonDelimitedSwitch; } // Soft word stop case ' ': case '\t': case '\n': case '\r': { result.add(sb.toString()); sb = new StringBuilder(); state = SplitState.NEW; break nonDelimitedSwitch; } // Any word content default: { sb.append(c); break nonDelimitedSwitch; } } break stateSwitch; } } } // Cleaning up the last word switch (state) { // We were beginning a new word, so ignore sb content case NEW: break; // The content of sb is relevant, so add it case NEW_WITH_AT_LEAST_ONE_WORD: case DELIMITED: case NON_DELIMITED: result.add(sb.toString()); break; } return result; }
The states in the state machine for splitting strings into lists
/** * The states in the state machine for splitting strings into lists */
static enum SplitState {
This is the initial state before a new word
/** * This is the initial state before a new word */
NEW,
Like NEW, but there will be at least one word. This is useful for trailing empty strings when content ends with ',' or ';'
/** * Like {@link #NEW}, but there will be at least one word. This is * useful for trailing empty strings when content ends with * <code>','</code> or <code>';'</code> */
NEW_WITH_AT_LEAST_ONE_WORD,
The state within a word delimited by '"'
/** * The state within a word delimited by <code>'"'</code> */
DELIMITED,
The state not within a word delimited by '"'
/** * The state not within a word delimited by <code>'"'</code> */
NON_DELIMITED, }
Make a given XPath object "xalan-extension aware", if Xalan is on the classpath.
/** * Make a given {@link XPath} object "xalan-extension aware", if Xalan is on * the classpath. */
@SuppressWarnings("deprecation") static final void xalanExtensionAware(XPath xpath) { // Load xalan extensions thread-safely for all of jOOX if (!xalanExtensionLoaded) { synchronized (Util.class) { if (!xalanExtensionLoaded) { xalanExtensionLoaded = true; try { xalanNamespaceContext = (NamespaceContext) Class.forName("org.apache.xalan.extensions.ExtensionNamespaceContext").newInstance(); xalanFunctionResolver = (XPathFunctionResolver) Class.forName("org.apache.xalan.extensions.XPathFunctionResolverImpl").newInstance(); } catch (Exception ignore) { } } } } if (xalanNamespaceContext != null && xalanFunctionResolver != null) { xpath.setNamespaceContext(xalanNamespaceContext); xpath.setXPathFunctionResolver(xalanFunctionResolver); } }
Parse any date format
/** * Parse any date format */
static final java.util.Date parseDate(String formatted) { if (formatted == null || formatted.trim().equals("")) return null; try { DatatypeFactory factory = DatatypeFactory.newInstance(); XMLGregorianCalendar calendar = factory.newXMLGregorianCalendar(formatted); return calendar.toGregorianCalendar().getTime(); } catch (Exception e) { Matcher matcher = PATTERN_DD_MM_YYYY.matcher(formatted); // Try matching dd.MM.yyyy date formats first if (matcher.find()) { String yyyy = matcher.group(3); String mm = matcher.group(2); String dd = matcher.group(1); String hh = defaultIfEmpty(matcher.group(4), "0"); String min = defaultIfEmpty(matcher.group(5), "0"); String ss = defaultIfEmpty(matcher.group(6), "0"); String ms = defaultIfEmpty(matcher.group(7), "0"); return getDate(Integer.parseInt(yyyy), Integer.parseInt(mm), Integer.parseInt(dd), Integer.parseInt(hh), Integer.parseInt(min), Integer.parseInt(ss), Integer.parseInt(ms)); } // Then try matching yyyy-MM-dd date formats else { Matcher matcher2 = PATTERN_YYYY_MM_DD.matcher(formatted); if (matcher2.find()) { String yyyy = matcher2.group(1); String mm = defaultIfEmpty(matcher2.group(2), "1"); String dd = defaultIfEmpty(matcher2.group(3), "1"); String hh = defaultIfEmpty(matcher2.group(4), "0"); String min = defaultIfEmpty(matcher2.group(5), "0"); String ss = defaultIfEmpty(matcher2.group(6), "0"); String ms = defaultIfEmpty(matcher2.group(7), "0"); return getDate(Integer.parseInt(yyyy), Integer.parseInt(mm), Integer.parseInt(dd), Integer.parseInt(hh), Integer.parseInt(min), Integer.parseInt(ss), Integer.parseInt(ms)); } // Finally, try matching plain timestamps else { try { return new Date(Long.parseLong(formatted)); } catch (NumberFormatException ignore) { return null; } } } } } private static final Date getDate(int year, int month, int day, int hour, int minute, int second, int millisecond) { Calendar calendar = Calendar.getInstance(); calendar.setTimeInMillis(0); calendar.set(year, month - 1, day, hour, minute, second); calendar.set(Calendar.MILLISECOND, millisecond); return calendar.getTime(); } static final String defaultIfEmpty(String string, String defaultString) { if (string == null || string.equals("")) return defaultString; return string; } static final String getNamespace(String tagName) { int index = tagName.indexOf(':'); if (index > -1) return tagName.substring(0, index); return null; } static final String stripNamespace(String tagName) { int index = tagName.indexOf(':'); if (index > -1) return tagName.substring(index + 1); return tagName; } }