/*
 * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;
import java.util.logging.Level;
import java.util.logging.Logger;

This implements parsing of XML 1.0 DTDs.

This conforms to the portion of the XML 1.0 specification related to the external DTD subset.

For multi-language applications (such as web servers using XML processing to create dynamic content), a method supports choosing a locale for parser diagnostics which is both understood by the message recipient and supported by the parser.

This parser produces a stream of parse events. It supports some features (exposing comments, CDATA sections, and entity references) which are not required to be reported by conformant XML processors.

Author:David Brownell, Janet Koenig, Kohsuke KAWAGUCHI
Version:$Id: DTDParser.java,v 1.2 2009-04-16 15:25:49 snajper Exp $
/** * This implements parsing of XML 1.0 DTDs. * <p> * This conforms to the portion of the XML 1.0 specification related to the * external DTD subset. * <p> * For multi-language applications (such as web servers using XML processing to * create dynamic content), a method supports choosing a locale for parser * diagnostics which is both understood by the message recipient and supported * by the parser. * <p> * This parser produces a stream of parse events. It supports some features * (exposing comments, CDATA sections, and entity references) which are not * required to be reported by conformant XML processors. * * @author David Brownell * @author Janet Koenig * @author Kohsuke KAWAGUCHI * @version $Id: DTDParser.java,v 1.2 2009-04-16 15:25:49 snajper Exp $ */
public class DTDParser { public final static String TYPE_CDATA = "CDATA"; public final static String TYPE_ID = "ID"; public final static String TYPE_IDREF = "IDREF"; public final static String TYPE_IDREFS = "IDREFS"; public final static String TYPE_ENTITY = "ENTITY"; public final static String TYPE_ENTITIES = "ENTITIES"; public final static String TYPE_NMTOKEN = "NMTOKEN"; public final static String TYPE_NMTOKENS = "NMTOKENS"; public final static String TYPE_NOTATION = "NOTATION"; public final static String TYPE_ENUMERATION = "ENUMERATION"; // stack of input entities being merged private InputEntity in; // temporaries reused during parsing private StringBuffer strTmp; private char nameTmp[]; private NameCache nameCache; private char charTmp[] = new char[2]; // temporary DTD parsing state private boolean doLexicalPE; // DTD state, used during parsing // private SimpleHashtable elements = new SimpleHashtable (47); protected final Set declaredElements = new java.util.HashSet(); private SimpleHashtable params = new SimpleHashtable(7); // exposed to package-private subclass Hashtable notations = new Hashtable(7); SimpleHashtable entities = new SimpleHashtable(17); private SimpleHashtable ids = new SimpleHashtable(); // listeners for DTD parsing events private DTDEventListener dtdHandler; private EntityResolver resolver; private Locale locale; // string constants -- use these copies so "==" works // package private static final String strANY = "ANY"; static final String strEMPTY = "EMPTY"; private static final Logger LOGGER = Logger.getLogger(DTDParser.class.getName());
Used by applications to request locale for diagnostics.
Params:
  • l – The locale to use, or null to use system defaults (which may include only message IDs).
/**
 * Selects the locale in which parser diagnostics are produced.
 *
 * @param l the locale to use, or {@code null} to fall back to system
 *          defaults (which may include only message IDs)
 * @throws SAXException if {@code l} is non-null and the message catalog
 *                      reports it as unsupported; the previously selected
 *                      locale is left untouched in that case
 */
public void setLocale(Locale l) throws SAXException {
    if (l != null) {
        final boolean supported = messages.isLocaleSupported(l.toString());
        if (!supported) {
            // The complaint itself is rendered in the locale currently in effect.
            final String complaint = messages.getMessage(locale, "P-078", new Object[]{l});
            throw new SAXException(complaint);
        }
    }
    this.locale = l;
}
Returns the diagnostic locale.
/**
 * Reports the locale currently selected for diagnostics.
 *
 * @return the diagnostic locale as last stored by this parser
 */
public Locale getLocale() {
    return this.locale;
}
Chooses a client locale to use for diagnostics, using the first language specified in the list that is supported by this parser. That locale is then set using setLocale(). Such a list could be provided by a variety of user preference mechanisms, including the HTTP Accept-Language header field.
Params:
  • languages – Array of language specifiers, ordered with the most preferable one at the front. For example, "en-ca" then "fr-ca", followed by "zh_CN". Both RFC 1766 and Java styles are supported.
See Also:
Returns:The chosen locale, or null.
/**
 * Picks a diagnostic locale from a caller-supplied preference list and, when
 * one is found, installs it through {@link #setLocale(java.util.Locale)}.
 * Such a list could come from a variety of user preference mechanisms,
 * including the HTTP <em>Accept-Language</em> header field.
 *
 * @param languages array of language specifiers, ordered with the most
 *                  preferable one at the front, for example "en-ca" then
 *                  "fr-ca", followed by "zh_CN"; both RFC 1766 and Java
 *                  styles are supported
 * @return the chosen locale, or {@code null} if the message catalog chose none
 * @throws SAXException if installing the chosen locale fails
 * @see MessageCatalog
 */
public Locale chooseLocale(String[] languages) throws SAXException {
    final Locale chosen = messages.chooseLocale(languages);
    if (chosen == null) {
        return null;
    }
    setLocale(chosen);
    return chosen;
}
Lets applications control entity resolution.
/**
 * Installs the resolver that applications use to control entity resolution.
 *
 * @param r the resolver to use; if it is still {@code null} when a parse
 *          starts, the parser creates its own {@code Resolver}
 */
public void setEntityResolver(EntityResolver r) {
    this.resolver = r;
}
Returns the object used to resolve entities
/**
 * Reports the object used to resolve entities.
 *
 * @return the resolver currently held by this parser
 */
public EntityResolver getEntityResolver() {
    return this.resolver;
}
Used by applications to set handling of DTD parsing events.
/**
 * Installs the listener that receives DTD parsing events.
 * <p>
 * A non-null listener is immediately handed a {@link Locator} whose four
 * accessors delegate to this parser's own position accessors.
 *
 * @param handler the listener to notify; may be {@code null}, in which case
 *                no locator is published
 */
public void setDtdHandler(DTDEventListener handler) {
    this.dtdHandler = handler;
    if (handler == null) {
        return;
    }

    // Live view onto the parser's position; every call is forwarded.
    final Locator position = new Locator() {
        @Override
        public String getPublicId() {
            return DTDParser.this.getPublicId();
        }

        @Override
        public String getSystemId() {
            return DTDParser.this.getSystemId();
        }

        @Override
        public int getLineNumber() {
            return DTDParser.this.getLineNumber();
        }

        @Override
        public int getColumnNumber() {
            return DTDParser.this.getColumnNumber();
        }
    };
    handler.setDocumentLocator(position);
}
Returns the handler used for DTD parsing events.
/**
 * Reports the handler used for DTD parsing events.
 *
 * @return the listener currently held by this parser
 */
public DTDEventListener getDtdHandler() {
    return this.dtdHandler;
}
Parse a DTD.
/**
 * Parses a DTD read from an already constructed input source.
 * <p>
 * The parser is first reset to its "before a document" state, then the
 * source is handed to the internal parse routine.
 *
 * @param in the source to read the DTD from
 * @throws IOException  if reading the input fails
 * @throws SAXException if the DTD is rejected or a handler raises an error
 */
public void parse(final InputSource in) throws IOException, SAXException {
    init();
    parseInternal(in);
}
Parse a DTD.
/** * Parse a DTD. */
public void parse(String uri) throws IOException, SAXException { InputSource inSource; init(); // System.out.println ("parse (\"" + uri + "\")"); inSource = resolver.resolveEntity(null, uri); // If custom resolver punts resolution to parser, handle it ... if (inSource == null) { inSource = Resolver.createInputSource(new java.net.URL(uri), false); // ... or if custom resolver doesn't correctly construct the // input entity, patch it up enough so relative URIs work, and // issue a warning to minimize later confusion. } else if (inSource.getSystemId() == null) { warning("P-065", null); inSource.setSystemId(uri); } parseInternal(inSource); } // makes sure the parser is reset to "before a document" private void init() { in = null; // alloc temporary data used in parsing strTmp = new StringBuffer(); nameTmp = new char[20]; nameCache = new NameCache(); // reset doc info // isInAttribute = false; doLexicalPE = false; entities.clear(); notations.clear(); params.clear(); // elements.clear (); declaredElements.clear(); // initialize predefined references ... re-interpreted later builtin("amp", "&#38;"); builtin("lt", "&#60;"); builtin("gt", ">"); builtin("quot", "\""); builtin("apos", "'"); if (locale == null) { locale = Locale.getDefault(); } if (resolver == null) { resolver = new Resolver(); } if (dtdHandler == null) { dtdHandler = new DTDHandlerBase(); } } private void builtin(String entityName, String entityValue) { InternalEntity entity; entity = new InternalEntity(entityName, entityValue.toCharArray()); entities.put(entityName, entity); } //////////////////////////////////////////////////////////////// // // parsing is by recursive descent, code roughly // following the BNF rules except tweaked for simple // lookahead. rules are more or less in numeric order, // except where code sharing suggests other structures. // // a classic benefit of recursive descent parsers: it's // relatively easy to get diagnostics that make sense. 
// //////////////////////////////////////////////////////////////// @SuppressWarnings("CallToThreadDumpStack") private void parseInternal(InputSource input) throws IOException, SAXException { if (input == null) { fatal("P-000"); } try { in = InputEntity.getInputEntity(dtdHandler, locale); in.init(input, null, null, false); dtdHandler.startDTD(in); // [30] extSubset ::= TextDecl? extSubsetDecl // [31] extSubsetDecl ::= ( markupdecl | conditionalSect // | PEReference | S )* // ... same as [79] extPE, which is where the code is ExternalEntity externalSubset = new ExternalEntity(in); externalParameterEntity(externalSubset); if (!in.isEOF()) { fatal("P-001", new Object[]{Integer.toHexString(((int) getc()))}); } afterRoot(); dtdHandler.endDTD(); } catch (EndOfInputException e) { if (!in.isDocument()) { String name = in.getName(); do { // force a relevant URI and line number in = in.pop(); } while (in.isInternal()); fatal("P-002", new Object[]{name}); } else { fatal("P-003", null); } } catch (RuntimeException e) { LOGGER.log(Level.SEVERE, "Internal DTD parser error.", e); throw new SAXParseException(e.getMessage() != null ? e.getMessage() : e.getClass().getName(), getPublicId(), getSystemId(), getLineNumber(), getColumnNumber()); } finally { // recycle temporary data used during parsing strTmp = null; nameTmp = null; nameCache = null; // ditto input sources etc if (in != null) { in.close(); in = null; } // get rid of all DTD info ... some of it would be // useful for editors etc, investigate later. params.clear(); entities.clear(); notations.clear(); declaredElements.clear(); // elements.clear(); ids.clear(); } } void afterRoot() throws SAXException { // Make sure all IDREFs match declared ID attributes. We scan // after the document element is parsed, since XML allows forward // references, and only now can we know if they're all resolved. 
for (Enumeration e = ids.keys(); e.hasMoreElements();) { String id = (String) e.nextElement(); Boolean value = (Boolean) ids.get(id); if (Boolean.FALSE.equals(value)) { error("V-024", new Object[]{id}); } } } // role is for diagnostics private void whitespace(String roleId) throws IOException, SAXException { // [3] S ::= (#x20 | #x9 | #xd | #xa)+ if (!maybeWhitespace()) { fatal("P-004", new Object[]{messages.getMessage(locale, roleId)}); } } // S? private boolean maybeWhitespace() throws IOException, SAXException { if (!doLexicalPE) { return in.maybeWhitespace(); } // see getc() for the PE logic -- this lets us splice // expansions of PEs in "anywhere". getc() has smarts, // so for external PEs we don't bypass it. // XXX we can marginally speed PE handling, and certainly // be cleaner (hence potentially more correct), by using // the observations that expanded PEs only start and stop // where whitespace is allowed. getc wouldn't need any // "lexical" PE expansion logic, and no other method needs // to handle termination of PEs. (parsing of literals would // still need to pop entities, but not parsing of references // in content.) char c = getc(); boolean saw = false; while (c == ' ' || c == '\t' || c == '\n' || c == '\r') { saw = true; // this gracefully ends things when we stop playing // with internal parameters. caller should have a // grammar rule allowing whitespace at end of entity. if (in.isEOF() && !in.isInternal()) { return saw; } c = getc(); } ungetc(); return saw; } private String maybeGetName() throws IOException, SAXException { NameCacheEntry entry = maybeGetNameCacheEntry(); return (entry == null) ? 
null : entry.name; } private NameCacheEntry maybeGetNameCacheEntry() throws IOException, SAXException { // [5] Name ::= (Letter|'_'|':') (Namechar)* char c = getc(); if (!XmlChars.isLetter(c) && c != ':' && c != '_') { ungetc(); return null; } return nameCharString(c); } // Used when parsing enumerations private String getNmtoken() throws IOException, SAXException { // [7] Nmtoken ::= (Namechar)+ char c = getc(); if (!XmlChars.isNameChar(c)) { fatal("P-006", new Object[]{Character.valueOf(c)}); } return nameCharString(c).name; } // n.b. this gets used when parsing attribute values (for // internal references) so we can't use strTmp; it's also // a hotspot for CPU and memory in the parser (called at least // once for each element) so this has been optimized a bit. private NameCacheEntry nameCharString(char c) throws IOException, SAXException { int i = 1; nameTmp[0] = c; for (;;) { if ((c = in.getNameChar()) == 0) { break; } if (i >= nameTmp.length) { char tmp[] = new char[nameTmp.length + 10]; System.arraycopy(nameTmp, 0, tmp, 0, nameTmp.length); nameTmp = tmp; } nameTmp[i++] = c; } return nameCache.lookupEntry(nameTmp, i); } // // much similarity between parsing entity values in DTD // and attribute values (in DTD or content) ... both follow // literal parsing rules, newline canonicalization, etc // // leaves value in 'strTmp' ... either a "replacement text" (4.5), // or else partially normalized attribute value (the first bit // of 3.3.3's spec, without the "if not CDATA" bits). 
// @SuppressWarnings("UnusedAssignment") private void parseLiteral(boolean isEntityValue) throws IOException, SAXException { // [9] EntityValue ::= // '"' ([^"&%] | Reference | PEReference)* '"' // | "'" ([^'&%] | Reference | PEReference)* "'" // [10] AttValue ::= // '"' ([^"&] | Reference )* '"' // | "'" ([^'&] | Reference )* "'" char quote = getc(); char c; InputEntity source = in; if (quote != '\'' && quote != '"') { fatal("P-007"); } // don't report entity expansions within attributes, // they're reported "fully expanded" via SAX // isInAttribute = !isEntityValue; // get value into strTmp strTmp = new StringBuffer(); // scan, allowing entity push/pop wherever ... // expanded entities can't terminate the literal! for (;;) { if (in != source && in.isEOF()) { // we don't report end of parsed entities // within attributes (no SAX hooks) in = in.pop(); continue; } if ((c = getc()) == quote && in == source) { break; } // // Basically the "reference in attribute value" // row of the chart in section 4.4 of the spec // if (c == '&') { String entityName = maybeGetName(); if (entityName != null) { nextChar(';', "F-020", entityName); // 4.4 says: bypass these here ... 
we'll catch // forbidden refs to unparsed entities on use if (isEntityValue) { strTmp.append('&'); strTmp.append(entityName); strTmp.append(';'); continue; } expandEntityInLiteral(entityName, entities, isEntityValue); // character references are always included immediately } else if ((getc()) == '#') { int tmp = parseCharNumber(); if (tmp > 0xffff) { tmp = surrogatesToCharTmp(tmp); strTmp.append(charTmp[0]); if (tmp == 2) { strTmp.append(charTmp[1]); } } else { strTmp.append((char) tmp); } } else { fatal("P-009"); } continue; } // expand parameter entities only within entity value literals if (c == '%' && isEntityValue) { String entityName = maybeGetName(); if (entityName != null) { nextChar(';', "F-021", entityName); expandEntityInLiteral(entityName, params, isEntityValue); continue; } else { fatal("P-011"); } } // For attribute values ... if (!isEntityValue) { // 3.3.3 says whitespace normalizes to space... if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { strTmp.append(' '); continue; } // "<" not legal in parsed literals ... if (c == '<') { fatal("P-012"); } } strTmp.append(c); } // isInAttribute = false; } // does a SINGLE expansion of the entity (often reparsed later) private void expandEntityInLiteral(String name, SimpleHashtable table, boolean isEntityValue) throws IOException, SAXException { Object entity = table.get(name); if (entity instanceof InternalEntity) { InternalEntity value = (InternalEntity) entity; pushReader(value.buf, name, !value.isPE); } else if (entity instanceof ExternalEntity) { if (!isEntityValue) // must be a PE ... { fatal("P-013", new Object[]{name}); } // XXX if this returns false ... pushReader((ExternalEntity) entity); } else if (entity == null) { // // Note: much confusion about whether spec requires such // errors to be fatal in many cases, but none about whether // it allows "normal" errors to be unrecoverable! // fatal((table == params) ? 
"V-022" : "P-014", new Object[]{name}); } } // [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") // for PUBLIC and SYSTEM literals, also "<?xml ...type='literal'?>' // NOTE: XML spec should explicitly say that PE ref syntax is // ignored in PIs, comments, SystemLiterals, and Pubid Literal // values ... can't process the XML spec's own DTD without doing // that for comments. private String getQuotedString(String type, String extra) throws IOException, SAXException { // use in.getc to bypass PE processing char quote = in.getc(); if (quote != '\'' && quote != '"') { fatal("P-015", new Object[]{ messages.getMessage(locale, type, new Object[]{extra}) }); } char c; strTmp = new StringBuffer(); while ((c = in.getc()) != quote) { strTmp.append((char) c); } return strTmp.toString(); } private String parsePublicId() throws IOException, SAXException { // [12] PubidLiteral ::= ('"' PubidChar* '"') | ("'" PubidChar* "'") // [13] PubidChar ::= #x20|#xd|#xa|[a-zA-Z0-9]|[-'()+,./:=?;!*#@$_%] String retval = getQuotedString("F-033", null); for (int i = 0; i < retval.length(); i++) { char c = retval.charAt(i); if (" \r\n-'()+,./:=?;!*#@$_%0123456789".indexOf(c) == -1 && !(c >= 'A' && c <= 'Z') && !(c >= 'a' && c <= 'z')) { fatal("P-016", new Object[]{Character.valueOf(c)}); } } strTmp = new StringBuffer(); strTmp.append(retval); return normalize(false); } // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) // handled by: InputEntity.parsedContent() private boolean maybeComment(boolean skipStart) throws IOException, SAXException { // [15] Comment ::= '<!--' // ( (Char - '-') | ('-' (Char - '-'))* // '-->' if (!in.peek(skipStart ? "!--" : "<!--", null)) { return false; } boolean savedLexicalPE = doLexicalPE; boolean saveCommentText; doLexicalPE = false; saveCommentText = false; if (saveCommentText) { strTmp = new StringBuffer(); } oneComment: for (;;) { try { // bypass PE expansion, but permit PEs // to complete ... valid docs won't care. 
for (;;) { int c = getc(); if (c == '-') { c = getc(); if (c != '-') { if (saveCommentText) { strTmp.append('-'); } ungetc(); continue; } nextChar('>', "F-022", null); break oneComment; } if (saveCommentText) { strTmp.append((char) c); } } } catch (EndOfInputException e) { // // This is fatal EXCEPT when we're processing a PE... // in which case a validating processor reports an error. // External PEs are easy to detect; internal ones we // infer by being an internal entity outside an element. // if (in.isInternal()) { error("V-021", null); } fatal("P-017"); } } doLexicalPE = savedLexicalPE; if (saveCommentText) { dtdHandler.comment(strTmp.toString()); } return true; } private boolean maybePI(boolean skipStart) throws IOException, SAXException { // [16] PI ::= '<?' PITarget // (S (Char* - (Char* '?>' Char*)))? // '?>' // [17] PITarget ::= Name - (('X'|'x')('M'|'m')('L'|'l') boolean savedLexicalPE = doLexicalPE; if (!in.peek(skipStart ? "?" : "<?", null)) { return false; } doLexicalPE = false; String target = maybeGetName(); if (target == null) { fatal("P-018"); } if ("xml".equals(target)) { fatal("P-019"); } if ("xml".equalsIgnoreCase(target)) { fatal("P-020", new Object[]{target}); } if (maybeWhitespace()) { strTmp = new StringBuffer(); try { for (;;) { // use in.getc to bypass PE processing char c = in.getc(); //Reached the end of PI. if (c == '?' && in.peekc('>')) { break; } strTmp.append(c); } } catch (EndOfInputException e) { fatal("P-021"); } dtdHandler.processingInstruction(target, strTmp.toString()); } else { if (!in.peek("?>", null)) { fatal("P-022"); } dtdHandler.processingInstruction(target, ""); } doLexicalPE = savedLexicalPE; return true; } // [18] CDSect ::= CDStart CData CDEnd // [19] CDStart ::= '<![CDATA[' // [20] CData ::= (Char* - (Char* ']]>' Char*)) // [21] CDEnd ::= ']]>' // // ... handled by InputEntity.unparsedContent() // collapsing several rules together ... // simpler than attribute literals -- no reference parsing! 
private String maybeReadAttribute(String name, boolean must) throws IOException, SAXException { // [24] VersionInfo ::= S 'version' Eq \'|\" versionNum \'|\" // [80] EncodingDecl ::= S 'encoding' Eq \'|\" EncName \'|\" // [32] SDDecl ::= S 'standalone' Eq \'|\" ... \'|\" if (!maybeWhitespace()) { if (!must) { return null; } fatal("P-024", new Object[]{name}); // NOTREACHED } if (!peek(name)) { if (must) { fatal("P-024", new Object[]{name}); } else { // To ensure that the whitespace is there so that when we // check for the next attribute we assure that the // whitespace still exists. ungetc(); return null; } } // [25] Eq ::= S? '=' S? maybeWhitespace(); nextChar('=', "F-023", null); maybeWhitespace(); return getQuotedString("F-035", name); } private void readVersion(boolean must, String versionNum) throws IOException, SAXException { String value = maybeReadAttribute("version", must); // [26] versionNum ::= ([a-zA-Z0-9_.:]| '-')+ if (must && value == null) { fatal("P-025", new Object[]{versionNum}); } if (value != null) { int length = value.length(); for (int i = 0; i < length; i++) { char c = value.charAt(i); if (!((c >= '0' && c <= '9') || c == '_' || c == '.' || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == ':' || c == '-')) { fatal("P-026", new Object[]{value}); } } } if (value != null && !value.equals(versionNum)) { error("P-027", new Object[]{versionNum, value}); } } // common code used by most markup declarations // ... S (Q)Name ... 
private String getMarkupDeclname(String roleId, boolean qname) throws IOException, SAXException { String name; whitespace(roleId); name = maybeGetName(); if (name == null) { fatal("P-005", new Object[]{messages.getMessage(locale, roleId)}); } return name; } private boolean maybeMarkupDecl() throws IOException, SAXException { // [29] markupdecl ::= elementdecl | Attlistdecl // | EntityDecl | NotationDecl | PI | Comment return maybeElementDecl() || maybeAttlistDecl() || maybeEntityDecl() || maybeNotationDecl() || maybePI(false) || maybeComment(false); } private static final String XmlLang = "xml:lang"; private boolean isXmlLang(String value) { // [33] LanguageId ::= Langcode ('-' Subcode)* // [34] Langcode ::= ISO639Code | IanaCode | UserCode // [35] ISO639Code ::= [a-zA-Z] [a-zA-Z] // [36] IanaCode ::= [iI] '-' SubCode // [37] UserCode ::= [xX] '-' SubCode // [38] SubCode ::= [a-zA-Z]+ // the ISO and IANA codes (and subcodes) are registered, // but that's neither a WF nor a validity constraint. int nextSuffix; char c; if (value.length() < 2) { return false; } c = value.charAt(1); if (c == '-') { // IANA, or user, code c = value.charAt(0); if (!(c == 'i' || c == 'I' || c == 'x' || c == 'X')) { return false; } nextSuffix = 1; } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { // 2 letter ISO code, or error c = value.charAt(0); if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) { return false; } nextSuffix = 2; } else { return false; } // here "suffix" ::= '-' [a-zA-Z]+ suffix* while (nextSuffix < value.length()) { c = value.charAt(nextSuffix); if (c != '-') { break; } while (++nextSuffix < value.length()) { c = value.charAt(nextSuffix); if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) { break; } } } return value.length() == nextSuffix && c != '-'; } // // CHAPTER 3: Logical Structures //
To validate, subclassers should at this time make sure that values are of the declared types:
  • ID and IDREF(S) values are Names
  • NMTOKEN(S) are Nmtokens
  • ENUMERATION values match one of the tokens
  • NOTATION values match a notation name
  • ENTITY(IES) values match an unparsed external entity

Separately, make sure IDREF values match some ID provided in the document (in the afterRoot method).

/** * To validate, subclassers should at this time make sure that values are of * the declared types:<UL> <LI> ID and IDREF(S) values are Names <LI> * NMTOKEN(S) are Nmtokens <LI> ENUMERATION values match one of the tokens * <LI> NOTATION values match a notation name <LI> ENTITIY(IES) values match * an unparsed external entity </UL> * <p> * <P> Separately, make sure IDREF values match some ID provided in the * document (in the afterRoot method). */
/* void validateAttributeSyntax (Attribute attr, String value) throws DTDParseException { // ID, IDREF(S) ... values are Names if (Attribute.ID == attr.type()) { if (!XmlNames.isName (value)) error ("V-025", new Object [] { value }); Boolean b = (Boolean) ids.getNonInterned (value); if (b == null || b.equals (Boolean.FALSE)) ids.put (value.intern (), Boolean.TRUE); else error ("V-026", new Object [] { value }); } else if (Attribute.IDREF == attr.type()) { if (!XmlNames.isName (value)) error ("V-027", new Object [] { value }); Boolean b = (Boolean) ids.getNonInterned (value); if (b == null) ids.put (value.intern (), Boolean.FALSE); } else if (Attribute.IDREFS == attr.type()) { StringTokenizer tokenizer = new StringTokenizer (value); Boolean b; boolean sawValue = false; while (tokenizer.hasMoreTokens ()) { value = tokenizer.nextToken (); if (!XmlNames.isName (value)) error ("V-027", new Object [] { value }); b = (Boolean) ids.getNonInterned (value); if (b == null) ids.put (value.intern (), Boolean.FALSE); sawValue = true; } if (!sawValue) error ("V-039", null); // NMTOKEN(S) ... values are Nmtoken(s) } else if (Attribute.NMTOKEN == attr.type()) { if (!XmlNames.isNmtoken (value)) error ("V-028", new Object [] { value }); } else if (Attribute.NMTOKENS == attr.type()) { StringTokenizer tokenizer = new StringTokenizer (value); boolean sawValue = false; while (tokenizer.hasMoreTokens ()) { value = tokenizer.nextToken (); if (!XmlNames.isNmtoken (value)) error ("V-028", new Object [] { value }); sawValue = true; } if (!sawValue) error ("V-032", null); // ENUMERATION ... 
values match one of the tokens } else if (Attribute.ENUMERATION == attr.type()) { for (int i = 0; i < attr.values().length; i++) if (value.equals (attr.values()[i])) return; error ("V-029", new Object [] { value }); // NOTATION values match a notation name } else if (Attribute.NOTATION == attr.type()) { // // XXX XML 1.0 spec should probably list references to // externally defined notations in standalone docs as // validity errors. Ditto externally defined unparsed // entities; neither should show up in attributes, else // one needs to read the external declarations in order // to make sense of the document (exactly what tagging // a doc as "standalone" intends you won't need to do). // for (int i = 0; i < attr.values().length; i++) if (value.equals (attr.values()[i])) return; error ("V-030", new Object [] { value }); // ENTITY(IES) values match an unparsed entity(ies) } else if (Attribute.ENTITY == attr.type()) { // see note above re standalone if (!isUnparsedEntity (value)) error ("V-031", new Object [] { value }); } else if (Attribute.ENTITIES == attr.type()) { StringTokenizer tokenizer = new StringTokenizer (value); boolean sawValue = false; while (tokenizer.hasMoreTokens ()) { value = tokenizer.nextToken (); // see note above re standalone if (!isUnparsedEntity (value)) error ("V-031", new Object [] { value }); sawValue = true; } if (!sawValue) error ("V-040", null); } else if (Attribute.CDATA != attr.type()) throw new InternalError (attr.type()); } */ /* private boolean isUnparsedEntity (String name) { Object e = entities.getNonInterned (name); if (e == null || !(e instanceof ExternalEntity)) return false; return ((ExternalEntity)e).notation != null; } */ private boolean maybeElementDecl() throws IOException, SAXException { // [45] elementDecl ::= '<!ELEMENT' S Name S contentspec S? '>' // [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | children InputEntity start = peekDeclaration("!ELEMENT"); if (start == null) { return false; } // n.b. 
for content models where inter-element whitespace is // ignorable, we mark that fact here. String name = getMarkupDeclname("F-015", true); // Element element = (Element) elements.get (name); // boolean declEffective = false; /* if (element != null) { if (element.contentModel() != null) { error ("V-012", new Object [] { name }); } // else <!ATTLIST name ...> came first } else { element = new Element(name); elements.put (element.name(), element); declEffective = true; } */ if (declaredElements.contains(name)) { error("V-012", new Object[]{name}); } else { declaredElements.add(name); // declEffective = true; } short modelType; whitespace("F-000"); if (peek(strEMPTY)) { /// // leave element.contentModel as null for this case. dtdHandler.startContentModel(name, modelType = DTDEventListener.CONTENT_MODEL_EMPTY); } else if (peek(strANY)) { /// element.setContentModel(new StringModel(StringModelType.ANY)); dtdHandler.startContentModel(name, modelType = DTDEventListener.CONTENT_MODEL_ANY); } else { modelType = getMixedOrChildren(name); } dtdHandler.endContentModel(name, modelType); maybeWhitespace(); char c = getc(); if (c != '>') { fatal("P-036", new Object[]{name, Character.valueOf(c)}); } if (start != in) { error("V-013", null); } /// dtdHandler.elementDecl(element); return true; } // We're leaving the content model as a regular expression; // it's an efficient natural way to express such things, and // libraries often interpret them. No whitespace in the // model we store, though!
returns content model type.
/** * returns content model type. */
private short getMixedOrChildren(String elementName/*Element element*/) throws IOException, SAXException { InputEntity start; // [47] children ::= (choice|seq) ('?'|'*'|'+')? strTmp = new StringBuffer(); nextChar('(', "F-028", elementName); start = in; maybeWhitespace(); strTmp.append('('); short modelType; if (peek("#PCDATA")) { strTmp.append("#PCDATA"); dtdHandler.startContentModel(elementName, modelType = DTDEventListener.CONTENT_MODEL_MIXED); getMixed(elementName, start); } else { dtdHandler.startContentModel(elementName, modelType = DTDEventListener.CONTENT_MODEL_CHILDREN); getcps(elementName, start); } return modelType; } // '(' S? already consumed // matching ')' must be in "start" entity if validating private void getcps(/*Element element,*/String elementName, InputEntity start) throws IOException, SAXException { // [48] cp ::= (Name|choice|seq) ('?'|'*'|'+')? // [49] choice ::= '(' S? cp (S? '|' S? cp)* S? ')' // [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')' boolean decided = false; char type = 0; // ContentModel retval, temp, current; // retval = temp = current = null; dtdHandler.startModelGroup(); do { String tag; tag = maybeGetName(); if (tag != null) { strTmp.append(tag); // temp = new ElementModel(tag); // getFrequency((RepeatableContent)temp); ///-> dtdHandler.childElement(tag, getFrequency()); ///<- } else if (peek("(")) { InputEntity next = in; strTmp.append('('); maybeWhitespace(); // temp = getcps(element, next); // getFrequency(temp); ///-> getcps(elementName, next); /// getFrequency(); <- this looks like a bug ///<- } else { fatal((type == 0) ? "P-039" : ((type == ',') ? "P-037" : "P-038"), new Object[]{Character.valueOf(getc())}); } maybeWhitespace(); if (decided) { char c = getc(); // if (current != null) { // current.addChild(temp); // } if (c == type) { strTmp.append(type); maybeWhitespace(); reportConnector(type); continue; } else if (c == '\u0029') { // rparen ungetc(); continue; } else { fatal((type == 0) ? 
"P-041" : "P-040", new Object[]{ Character.valueOf(c), Character.valueOf(type) }); } } else { type = getc(); switch (type) { case '|': case ',': reportConnector(type); break; default: // retval = temp; ungetc(); continue; } // retval = (ContentModel)current; decided = true; // current.addChild(temp); strTmp.append(type); } maybeWhitespace(); } while (!peek(")")); if (in != start) { error("V-014", new Object[]{elementName}); } strTmp.append(')'); dtdHandler.endModelGroup(getFrequency()); // return retval; } private void reportConnector(char type) throws SAXException { switch (type) { case '|': dtdHandler.connector(DTDEventListener.CHOICE); ///<- return; case ',': dtdHandler.connector(DTDEventListener.SEQUENCE); ///<- return; default: throw new Error(); //assertion failed. } } private short getFrequency() throws IOException, SAXException { final char c = getc(); if (c == '?') { strTmp.append(c); return DTDEventListener.OCCURENCE_ZERO_OR_ONE; // original.setRepeat(Repeat.ZERO_OR_ONE); } else if (c == '+') { strTmp.append(c); return DTDEventListener.OCCURENCE_ONE_OR_MORE; // original.setRepeat(Repeat.ONE_OR_MORE); } else if (c == '*') { strTmp.append(c); return DTDEventListener.OCCURENCE_ZERO_OR_MORE; // original.setRepeat(Repeat.ZERO_OR_MORE); } else { ungetc(); return DTDEventListener.OCCURENCE_ONCE; } } // '(' S? '#PCDATA' already consumed // matching ')' must be in "start" entity if validating private void getMixed(String elementName, /*Element element,*/ InputEntity start) throws IOException, SAXException { // [51] Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' // | '(' S? '#PCDATA' S? 
// ')'
    maybeWhitespace();
    if (peek("\u0029*") || peek("\u0029")) {
        if (in != start) {
            error("V-014", new Object[]{elementName});
        }
        strTmp.append(')');
        // element.setContentModel(new StringModel(StringModelType.PCDATA));
        return;
    }

    ArrayList l = new ArrayList();
    // l.add(new StringModel(StringModelType.PCDATA));

    while (peek("|")) {
        String name;

        strTmp.append('|');
        maybeWhitespace();

        doLexicalPE = true;
        name = maybeGetName();
        if (name == null) {
            fatal("P-042", new Object[]{elementName, Integer.toHexString(getc())});
        }
        if (l.contains(name)) {
            error("V-015", new Object[]{name});
        } else {
            l.add(name);
            dtdHandler.mixedElement(name);
        }
        strTmp.append(name);
        maybeWhitespace();
    }

    if (!peek("\u0029*")) // right paren
    {
        fatal("P-043", new Object[]{elementName, Character.valueOf(getc())});
    }
    if (in != start) {
        error("V-014", new Object[]{elementName});
    }

    strTmp.append(')');
    // ChoiceModel cm = new ChoiceModel((Collection)l);
    // cm.setRepeat(Repeat.ZERO_OR_MORE);
    // element.setContentModel(cm);
}

/**
 * Parses an attribute-list declaration if one begins at the current
 * position, firing one {@code dtdHandler.attributeDecl} event per
 * attribute definition.
 *
 * @return {@code false} if the input does not start with {@code <!ATTLIST};
 *         {@code true} once the declaration has been processed
 */
private boolean maybeAttlistDecl() throws IOException, SAXException {
    // [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
    InputEntity start = peekDeclaration("!ATTLIST");
    if (start == null) {
        return false;
    }

    // The element need not have been declared yet.
    String elementName = getMarkupDeclname("F-016", true);

    while (!peek(">")) {

        // [53] AttDef ::= S Name S AttType S DefaultDecl
        // [54] AttType ::= StringType | TokenizedType | EnumeratedType

        // look for global attribute definitions, don't expand for now...
        maybeWhitespace();
        char c = getc();
        if (c == '%') {
            String entityName = maybeGetName();
            if (entityName != null) {
                nextChar(';', "F-021", entityName);
                whitespace("F-021");
                continue;
            } else {
                fatal("P-011");
            }
        }

        ungetc();
        // look for attribute name otherwise
        String attName = maybeGetName();
        if (attName == null) {
            fatal("P-044", new Object[]{Character.valueOf(getc())});
        }
        whitespace("F-001");

        String typeName;
        Vector<String> values = null; // notation/enumeration values

        // Note: use the type constants
        // so that "==" may be used (faster)

        // [55] StringType ::= 'CDATA'
        if (peek(TYPE_CDATA)) {
            typeName = TYPE_CDATA;
        }
        // [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS'
        //                      | 'ENTITY' | 'ENTITIES'
        //                      | 'NMTOKEN' | 'NMTOKENS'
        // n.b. if "IDREFS" is there, both "ID" and "IDREF"
        // match peekahead ... so this order matters!
        else if (peek(TYPE_IDREFS)) {
            typeName = TYPE_IDREFS;
        } else if (peek(TYPE_IDREF)) {
            typeName = TYPE_IDREF;
        } else if (peek(TYPE_ID)) {
            typeName = TYPE_ID;
            // TODO: should implement this error check?
            // (V-016: at most one ID attribute per element type)
        } else if (peek(TYPE_ENTITY)) {
            typeName = TYPE_ENTITY;
        } else if (peek(TYPE_ENTITIES)) {
            typeName = TYPE_ENTITIES;
        } else if (peek(TYPE_NMTOKENS)) {
            typeName = TYPE_NMTOKENS;
        } else if (peek(TYPE_NMTOKEN)) {
            typeName = TYPE_NMTOKEN;
        }
        // [57] EnumeratedType ::= NotationType | Enumeration
        // [58] NotationType ::= 'NOTATION' S '(' S? Name
        //                       (S? '|' S? Name)* S? ')'
        else if (peek(TYPE_NOTATION)) {
            typeName = TYPE_NOTATION;
            whitespace("F-002");
            nextChar('(', "F-029", null);
            maybeWhitespace();

            values = new Vector<String>();
            do {
                String name;
                if ((name = maybeGetName()) == null) {
                    fatal("P-068");
                }
                // permit deferred declarations
                if (notations.get(name) == null) {
                    notations.put(name, name);
                }
                values.addElement(name);
                maybeWhitespace();
                if (peek("|")) {
                    maybeWhitespace();
                }
            } while (!peek(")"));

            // [59] Enumeration ::= '(' S? Nmtoken (S? '|' Nmtoken)* S? ')'
        } else if (peek("(")) {
            typeName = TYPE_ENUMERATION;

            maybeWhitespace();

            values = new Vector<String>();
            do {
                String name = getNmtoken();
                values.addElement(name);
                maybeWhitespace();
                if (peek("|")) {
                    maybeWhitespace();
                }
            } while (!peek(")"));
        } else {
            fatal("P-045", new Object[]{attName, Character.valueOf(getc())});
            typeName = null;
        }

        short attributeUse;
        String defaultValue = null;

        // [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
        //                    | (('#FIXED' S)? AttValue)
        whitespace("F-003");
        if (peek("#REQUIRED")) {
            attributeUse = DTDEventListener.USE_REQUIRED;
        } else if (peek("#FIXED")) {
            if (typeName == TYPE_ID) {
                error("V-017", new Object[]{attName});
            }
            attributeUse = DTDEventListener.USE_FIXED;
            whitespace("F-004");
            defaultValue = parseAttributeDefault(typeName);
            // TODO: implement this check
            // (validate the default's syntax for non-CDATA types)
        } else if (!peek("#IMPLIED")) {
            // A literal default value follows.
            // NOTE(review): USE_IMPLIED is reported here and USE_NORMAL in
            // the "#IMPLIED" branch below, which looks swapped. Left
            // unchanged because existing listeners may rely on it; confirm
            // against the DTDEventListener contract.
            attributeUse = DTDEventListener.USE_IMPLIED;

            if (typeName == TYPE_ID) {
                error("V-018", new Object[]{attName});
            }
            defaultValue = parseAttributeDefault(typeName);
            // TODO: implement this check
            // (validate the default's syntax for non-CDATA types)
        } else {
            // "#IMPLIED" was consumed by the peek above.
            attributeUse = DTDEventListener.USE_NORMAL;
        }

        if (XmlLang.equals(attName)
                && defaultValue != null
                && !isXmlLang(defaultValue)) {
            error("P-033", new Object[]{defaultValue});
        }

        // TODO: isn't it an error to specify the same attribute twice?

        String[] v = (values != null) ? values.toArray(new String[values.size()]) : null;
        dtdHandler.attributeDecl(elementName, attName, typeName, v, attributeUse, defaultValue);
        maybeWhitespace();
    }
    // The declaration must end in the same input entity it started in.
    if (start != in) {
        error("V-013", null);
    }
    return true;
}

/**
 * Reads the quoted default value of an attribute definition.
 *
 * <p>For every type other than CDATA the literal is whitespace-normalized
 * (trimmed, space runs collapsed) via {@link #normalize(boolean)}; a CDATA
 * default is returned as parsed. The previous inline code had this test
 * inverted relative to the reference implementation it replaced.
 *
 * @param typeName one of the TYPE_* constants chosen by the caller
 * @return the default value to report to the handler
 */
private String parseAttributeDefault(String typeName) throws IOException, SAXException {
    parseLiteral(false);
    // Identity comparison is intentional: typeName is always assigned from
    // the TYPE_* constants themselves.
    if (typeName != TYPE_CDATA) {
        return normalize(false);
    }
    return strTmp.toString();
}

// used when parsing literal attribute values,
// or public identifiers.
//
// input in strTmp
//
// Trims the text and collapses each run of XML space characters to a single
// ' '. On return strTmp holds the rebuilt text. The parameter is not used.
private String normalize(boolean invalidIfNeeded) {

    // this can allocate an extra string...

    String raw = strTmp.toString();
    String text = raw.trim();
    // trim() hands back the same instance when there was nothing to strip,
    // so the reference comparison detects whether trimming changed anything.
    boolean changed = (text != raw);

    strTmp = new StringBuffer();
    final int length = text.length();
    int i = 0;
    while (i < length) {
        char ch = text.charAt(i++);
        if (!XmlChars.isSpace(ch)) {
            strTmp.append(ch);
            continue;
        }
        strTmp.append(' ');
        // swallow the rest of this whitespace run
        while (i < length && XmlChars.isSpace(text.charAt(i))) {
            i++;
            changed = true;
        }
    }
    return changed ? strTmp.toString() : text;
}

private boolean maybeConditionalSect() throws IOException, SAXException {
    // [61] conditionalSect ::= includeSect | ignoreSect

    if (!peek("<![")) {
        return false;
    }

    String keyword;
    InputEntity start = in;

    maybeWhitespace();

    if ((keyword = maybeGetName()) == null) {
        fatal("P-046");
    }
    maybeWhitespace();
    nextChar('[', "F-030", null);

    // [62] includeSect ::= '<![' S? 'INCLUDE' S? '['
    //                      extSubsetDecl ']]>'
    if ("INCLUDE".equals(keyword)) {
        for (;;) {
            while (in.isEOF() && in != start) {
                in = in.pop();
            }
            if (in.isEOF()) {
                error("V-020", null);
            }
            if (peek("]]>")) {
                break;
            }

            doLexicalPE = false;
            if (maybeWhitespace()) {
                continue;
            }
            if (maybePEReference()) {
                continue;
            }
            doLexicalPE = true;
            if (maybeMarkupDecl() || maybeConditionalSect()) {
                continue;
            }

            fatal("P-047");
        }

        // [63] ignoreSect ::= '<![' S? 'IGNORE' S?
'[' // ignoreSectcontents ']]>' // [64] ignoreSectcontents ::= Ignore ('<![' // ignoreSectcontents ']]>' Ignore)* // [65] Ignore ::= Char* - (Char* ('<![' | ']]>') Char*) } else if ("IGNORE".equals(keyword)) { int nestlevel = 1; // ignoreSectcontents doLexicalPE = false; while (nestlevel > 0) { char c = getc(); // will pop input entities if (c == '<') { if (peek("![")) { nestlevel++; } } else if (c == ']') { if (peek("]>")) { nestlevel--; } } else { continue; } } } else { fatal("P-048", new Object[]{keyword}); } return true; } // // CHAPTER 4: Physical Structures // // parse decimal or hex numeric character reference private int parseCharNumber() throws IOException, SAXException { char c; int retval = 0; // n.b. we ignore overflow ... if (getc() != 'x') { ungetc(); for (;;) { c = getc(); if (c >= '0' && c <= '9') { retval *= 10; retval += (c - '0'); continue; } if (c == ';') { return retval; } fatal("P-049"); } } else { for (;;) { c = getc(); if (c >= '0' && c <= '9') { retval <<= 4; retval += (c - '0'); continue; } if (c >= 'a' && c <= 'f') { retval <<= 4; retval += 10 + (c - 'a'); continue; } if (c >= 'A' && c <= 'F') { retval <<= 4; retval += 10 + (c - 'A'); continue; } if (c == ';') { return retval; } fatal("P-050"); } } } // parameter is a UCS-4 character ... i.e. not just 16 bit UNICODE, // though still subject to the 'Char' construct in XML private int surrogatesToCharTmp(int ucs4) throws SAXException { if (ucs4 <= 0xffff) { if (XmlChars.isChar(ucs4)) { charTmp[0] = (char) ucs4; return 1; } } else if (ucs4 <= 0x0010ffff) { // we represent these as UNICODE surrogate pairs ucs4 -= 0x10000; charTmp[0] = (char) (0xd800 | ((ucs4 >> 10) & 0x03ff)); charTmp[1] = (char) (0xdc00 | (ucs4 & 0x03ff)); return 2; } fatal("P-051", new Object[]{Integer.toHexString(ucs4)}); // NOTREACHED return -1; } private boolean maybePEReference() throws IOException, SAXException { // This is the SYNTACTIC version of this construct. 
// When processing external entities, there is also // a LEXICAL version; see getc() and doLexicalPE. // [69] PEReference ::= '%' Name ';' if (!in.peekc('%')) { return false; } String name = maybeGetName(); Object entity; if (name == null) { fatal("P-011"); } nextChar(';', "F-021", name); entity = params.get(name); if (entity instanceof InternalEntity) { InternalEntity value = (InternalEntity) entity; pushReader(value.buf, name, false); } else if (entity instanceof ExternalEntity) { pushReader((ExternalEntity) entity); externalParameterEntity((ExternalEntity) entity); } else if (entity == null) { error("V-022", new Object[]{name}); } return true; } private boolean maybeEntityDecl() throws IOException, SAXException { // [70] EntityDecl ::= GEDecl | PEDecl // [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' // [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDEF S? '>' // [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) // [74] PEDef ::= EntityValue | ExternalID // InputEntity start = peekDeclaration("!ENTITY"); if (start == null) { return false; } String entityName; SimpleHashtable defns; ExternalEntity externalId; boolean doStore; // PE expansion gets selectively turned off several places: // in ENTITY declarations (here), in comments, in PIs. // Here, we allow PE entities to be declared, and allows // literals to include PE refs without the added spaces // required with their expansion in markup decls. doLexicalPE = false; whitespace("F-005"); if (in.peekc('%')) { whitespace("F-006"); defns = params; } else { defns = entities; } ungetc(); // leave some whitespace doLexicalPE = true; entityName = getMarkupDeclname("F-017", false); whitespace("F-007"); externalId = maybeExternalID(); // // first definition sticks ... e.g. internal subset PEs are used // to override DTD defaults. 
It's also an "error" to incorrectly // redefine builtin internal entities, but since reporting such // errors is optional we only give warnings ("just in case") for // non-parameter entities. // doStore = (defns.get(entityName) == null); if (!doStore && defns == entities) { warning("P-054", new Object[]{entityName}); } // internal entities if (externalId == null) { char value[]; InternalEntity entity; doLexicalPE = false; // "ab%bar;cd" -maybe-> "abcd" parseLiteral(true); doLexicalPE = true; if (doStore) { value = new char[strTmp.length()]; if (value.length != 0) { strTmp.getChars(0, value.length, value, 0); } entity = new InternalEntity(entityName, value); entity.isPE = (defns == params); defns.put(entityName, entity); if (defns == entities) { dtdHandler.internalGeneralEntityDecl(entityName, new String(value)); } } // external entities (including unparsed) } else { // [76] NDataDecl ::= S 'NDATA' S Name if (defns == entities && maybeWhitespace() && peek("NDATA")) { externalId.notation = getMarkupDeclname("F-018", false); // flag undeclared notation for checking after // the DTD is fully processed if (notations.get(externalId.notation) == null) { notations.put(externalId.notation, Boolean.TRUE); } } externalId.name = entityName; externalId.isPE = (defns == params); if (doStore) { defns.put(entityName, externalId); if (externalId.notation != null) { dtdHandler.unparsedEntityDecl(entityName, externalId.publicId, externalId.systemId, externalId.notation); } else if (defns == entities) { dtdHandler.externalGeneralEntityDecl(entityName, externalId.publicId, externalId.systemId); } } } maybeWhitespace(); nextChar('>', "F-031", entityName); if (start != in) { error("V-013", null); } return true; } private ExternalEntity maybeExternalID() throws IOException, SAXException { // [75] ExternalID ::= 'SYSTEM' S SystemLiteral // | 'PUBLIC' S' PubidLiteral S Systemliteral String temp = null; ExternalEntity retval; if (peek("PUBLIC")) { whitespace("F-009"); temp = parsePublicId(); 
} else if (!peek("SYSTEM")) { return null; } retval = new ExternalEntity(in); retval.publicId = temp; whitespace("F-008"); retval.systemId = parseSystemId(); return retval; } private String parseSystemId() throws IOException, SAXException { String uri = getQuotedString("F-034", null); int temp = uri.indexOf(':'); // resolve relative URIs ... must do it here since // it's relative to the source file holding the URI! // "new java.net.URL (URL, string)" conforms to RFC 1630, // but we can't use that except when the URI is a URL. // The entity resolver is allowed to handle URIs that are // not URLs, so we pass URIs through with scheme intact if (temp == -1 || uri.indexOf('/') < temp) { String baseURI; baseURI = in.getSystemId(); if (baseURI == null) { fatal("P-055", new Object[]{uri}); } if (uri.length() == 0) { uri = "."; } baseURI = baseURI.substring(0, baseURI.lastIndexOf('/') + 1); if (uri.charAt(0) != '/') { uri = baseURI + uri; } else { // XXX slashes at the beginning of a relative URI are // a special case we don't handle. throw new InternalError(); } // letting other code map any "/xxx/../" or "/./" to "/", // since all URIs must handle it the same. } // check for fragment ID in URI if (uri.indexOf('#') != -1) { error("P-056", new Object[]{uri}); } return uri; } private void maybeTextDecl() throws IOException, SAXException { // [77] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' if (peek("<?xml")) { readVersion(false, "1.0"); readEncoding(true); maybeWhitespace(); if (!peek("?>")) { fatal("P-057"); } } } private void externalParameterEntity(ExternalEntity next) throws IOException, SAXException { // // Reap the intended benefits of standalone declarations: // don't deal with external parameter entities, except to // validate the standalone declaration. // // n.b. "in external parameter entities" (and external // DTD subset, same grammar) parameter references can // occur "within" markup declarations ... expansions can // cross syntax rules. 
Flagged here; affects getc(). // [79] ExtPE ::= TextDecl? extSubsetDecl // [31] extSubsetDecl ::= ( markupdecl | conditionalSect // | PEReference | S )* InputEntity pe; // XXX if this returns false ... pe = in; maybeTextDecl(); while (!pe.isEOF()) { // pop internal PEs (and whitespace before/after) if (in.isEOF()) { in = in.pop(); continue; } doLexicalPE = false; if (maybeWhitespace()) { continue; } if (maybePEReference()) { continue; } doLexicalPE = true; if (maybeMarkupDecl() || maybeConditionalSect()) { continue; } break; } // if (in != pe) throw new InternalError("who popped my PE?"); if (!pe.isEOF()) { fatal("P-059", new Object[]{in.getName()}); } } private void readEncoding(boolean must) throws IOException, SAXException { // [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* String name = maybeReadAttribute("encoding", must); if (name == null) { return; } for (int i = 0; i < name.length(); i++) { char c = name.charAt(i); if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { continue; } if (i != 0 && ((c >= '0' && c <= '9') || c == '-' || c == '_' || c == '.')) { continue; } fatal("P-060", new Object[]{Character.valueOf(c)}); } // // This should be the encoding in use, and it's even an error for // it to be anything else (in certain cases that are impractical to // to test, and may even be insufficient). So, we do the best we // can, and warn if things look suspicious. Note that Java doesn't // uniformly expose the encodings, and that the names it uses // internally are nonstandard. Also, that the XML spec allows // such "errors" not to be reported at all. // String currentEncoding = in.getEncoding(); if (currentEncoding != null && !name.equalsIgnoreCase(currentEncoding)) { warning("P-061", new Object[]{name, currentEncoding}); } } private boolean maybeNotationDecl() throws IOException, SAXException { // [82] NotationDecl ::= '<!NOTATION' S Name S // (ExternalID | PublicID) S? 
'>' // [83] PublicID ::= 'PUBLIC' S PubidLiteral InputEntity start = peekDeclaration("!NOTATION"); if (start == null) { return false; } String name = getMarkupDeclname("F-019", false); ExternalEntity entity = new ExternalEntity(in); whitespace("F-011"); if (peek("PUBLIC")) { whitespace("F-009"); entity.publicId = parsePublicId(); if (maybeWhitespace()) { if (!peek(">")) { entity.systemId = parseSystemId(); } else { ungetc(); } } } else if (peek("SYSTEM")) { whitespace("F-008"); entity.systemId = parseSystemId(); } else { fatal("P-062"); } maybeWhitespace(); nextChar('>', "F-032", name); if (start != in) { error("V-013", null); } if (entity.systemId != null && entity.systemId.indexOf('#') != -1) { error("P-056", new Object[]{entity.systemId}); } Object value = notations.get(name); if (value != null && value instanceof ExternalEntity) { warning("P-063", new Object[]{name}); } else { notations.put(name, entity); dtdHandler.notationDecl(name, entity.publicId, entity.systemId); } return true; } //////////////////////////////////////////////////////////////// // // UTILITIES // //////////////////////////////////////////////////////////////// private char getc() throws IOException, SAXException { if (!doLexicalPE) { char c = in.getc(); return c; } // // External parameter entities get funky processing of '%param;' // references. It's not clearly defined in the XML spec; but it // boils down to having those refs be _lexical_ in most cases to // include partial syntax productions. It also needs selective // enabling; "<!ENTITY % foo ...>" must work, for example, and // if "bar" is an empty string PE, "ab%bar;cd" becomes "abcd" // if it's expanded in a literal, else "ab cd". PEs also do // not expand within comments or PIs, and external PEs are only // allowed to have markup decls (and so aren't handled lexically). // // This PE handling should be merged into maybeWhitespace, where // it can be dealt with more consistently. 
// // Also, there are some validity constraints in this area. // char c; while (in.isEOF()) { if (in.isInternal() || (doLexicalPE && !in.isDocument())) { in = in.pop(); } else { fatal("P-064", new Object[]{in.getName()}); } } if ((c = in.getc()) == '%' && doLexicalPE) { // PE ref ::= '%' name ';' String name = maybeGetName(); Object entity; if (name == null) { fatal("P-011"); } nextChar(';', "F-021", name); entity = params.get(name); // push a magic "entity" before and after the // real one, so ungetc() behaves uniformly pushReader(" ".toCharArray(), null, false); if (entity instanceof InternalEntity) { pushReader(((InternalEntity) entity).buf, name, false); } else if (entity instanceof ExternalEntity) // PEs can't be unparsed! // XXX if this returns false ... { pushReader((ExternalEntity) entity); } else if (entity == null) // see note in maybePEReference re making this be nonfatal. { fatal("V-022"); } else { throw new InternalError(); } pushReader(" ".toCharArray(), null, false); return in.getc(); } return c; } private void ungetc() { in.ungetc(); } private boolean peek(String s) throws IOException, SAXException { return in.peek(s, null); } // Return the entity starting the specified declaration // (for validating declaration nesting) else null. private InputEntity peekDeclaration(String s) throws IOException, SAXException { InputEntity start; if (!in.peekc('<')) { return null; } start = in; if (in.peek(s, null)) { return start; } in.ungetc(); return null; } private void nextChar(char c, String location, String near) throws IOException, SAXException { while (in.isEOF() && !in.isDocument()) { in = in.pop(); } if (!in.peekc(c)) { fatal("P-008", new Object[]{Character.valueOf(c), messages.getMessage(locale, location), (near == null ? 
"" : ('"' + near + '"'))}); } } private void pushReader(char buf[], String name, boolean isGeneral) throws SAXException { InputEntity r = InputEntity.getInputEntity(dtdHandler, locale); r.init(buf, name, in, !isGeneral); in = r; } private boolean pushReader(ExternalEntity next) throws IOException, SAXException { InputEntity r = InputEntity.getInputEntity(dtdHandler, locale); InputSource s; try { s = next.getInputSource(resolver); } catch (IOException e) { String msg = "unable to open the external entity from :" + next.systemId; if (next.publicId != null) { msg += " (public id:" + next.publicId + ")"; } SAXParseException spe = new SAXParseException(msg, getPublicId(), getSystemId(), getLineNumber(), getColumnNumber(), e); dtdHandler.fatalError(spe); throw e; } r.init(s, next.name, in, next.isPE); in = r; return true; } public String getPublicId() { return (in == null) ? null : in.getPublicId(); } public String getSystemId() { return (in == null) ? null : in.getSystemId(); } public int getLineNumber() { return (in == null) ? -1 : in.getLineNumber(); } public int getColumnNumber() { return (in == null) ? 
-1 : in.getColumnNumber(); } // error handling convenience routines private void warning(String messageId, Object parameters[]) throws SAXException { SAXParseException e = new SAXParseException(messages.getMessage(locale, messageId, parameters), getPublicId(), getSystemId(), getLineNumber(), getColumnNumber()); dtdHandler.warning(e); } void error(String messageId, Object parameters[]) throws SAXException { SAXParseException e = new SAXParseException(messages.getMessage(locale, messageId, parameters), getPublicId(), getSystemId(), getLineNumber(), getColumnNumber()); dtdHandler.error(e); } private void fatal(String messageId) throws SAXException { fatal(messageId, null); } private void fatal(String messageId, Object parameters[]) throws SAXException { SAXParseException e = new SAXParseException(messages.getMessage(locale, messageId, parameters), getPublicId(), getSystemId(), getLineNumber(), getColumnNumber()); dtdHandler.fatalError(e); throw e; } // // Map char arrays to strings ... cuts down both on memory and // CPU usage for element/attribute/other names that are reused. // // Documents typically repeat names a lot, so we more or less // intern all the strings within the document; since some strings // are repeated in multiple documents (e.g. stylesheets) we go // a bit further, and intern globally. // static class NameCache { // // Unless we auto-grow this, the default size should be a // reasonable bit larger than needed for most XML files // we've yet seen (and be prime). If it's too small, the // penalty is just excess cache collisions. // NameCacheEntry hashtable[] = new NameCacheEntry[541]; // // Usually we just want to get the 'symbol' for these chars // String lookup(char value[], int len) { return lookupEntry(value, len).name; } // // Sometimes we need to scan the chars in the resulting // string, so there's an accessor which exposes them. // (Mostly for element end tags.) 
// NameCacheEntry lookupEntry(char value[], int len) { int index = 0; NameCacheEntry entry; // hashing to get index for (int i = 0; i < len; i++) { index = index * 31 + value[i]; } index &= 0x7fffffff; index %= hashtable.length; // return entry if one's there ... for (entry = hashtable[index]; entry != null; entry = entry.next) { if (entry.matches(value, len)) { return entry; } } // else create new one entry = new NameCacheEntry(); entry.chars = new char[len]; System.arraycopy(value, 0, entry.chars, 0, len); entry.name = new String(entry.chars); // // NOTE: JDK 1.1 has a fixed size string intern table, // with non-GC'd entries. It can panic here; that's a // JDK problem, use 1.2 or later with many identifiers. // entry.name = entry.name.intern(); // "global" intern entry.next = hashtable[index]; hashtable[index] = entry; return entry; } } static class NameCacheEntry { String name; char chars[]; NameCacheEntry next; boolean matches(char value[], int len) { if (chars == null || chars.length != len) { return false; } for (int i = 0; i < len; i++) { if (value[i] != chars[i]) { return false; } } return true; } } // // Message catalog for diagnostics. // static final Catalog messages = new Catalog(); static final class Catalog extends MessageCatalog { Catalog() { super(DTDParser.class); } } }