/*
 * Copyright (c) 1998, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.dtdparser;

import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import java.io.CharConversionException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Arrays;
import java.util.Locale;

This is how the parser talks to its input entities, of all kinds. The entities are in a stack.

For internal entities, the character arrays are referenced here, and read from as needed (they're read-only). External entities have mutable buffers, that are read into as needed.

Note: This maps CRLF (and CR) to LF without regard for whether it's in an external (parsed) entity or not. The XML 1.0 spec is inconsistent in explaining EOL handling; this is the sensible way.

Author:David Brownell, Janet Koenig
Version:1.4 00/08/05
/** * This is how the parser talks to its input entities, of all kinds. * The entities are in a stack. * <p> * <P> For internal entities, the character arrays are referenced here, * and read from as needed (they're read-only). External entities have * mutable buffers, that are read into as needed. * <p> * <P> <em>Note:</em> This maps CRLF (and CR) to LF without regard for * whether it's in an external (parsed) entity or not. The XML 1.0 spec * is inconsistent in explaining EOL handling; this is the sensible way. * * @author David Brownell * @author Janet Koenig * @version 1.4 00/08/05 */
public class InputEntity { private int start, finish; private char buf []; private int lineNumber = 1; private boolean returnedFirstHalf = false; private boolean maybeInCRLF = false; // name of entity (never main document or unnamed DTD PE) private String name; private InputEntity next; // for system and public IDs in diagnostics private InputSource input; // this is a buffer; some buffers can be replenished. private Reader reader; private boolean isClosed; private DTDEventListener errHandler; private Locale locale; private StringBuffer rememberedText; private int startRemember; // record if this is a PE, so endParsedEntity won't be called private boolean isPE; // InputStreamReader throws an internal per-read exception, so // we minimize reads. We also add a byte to compensate for the // "ungetc" byte we keep, so that our downstream reads are as // nicely sized as we can make them. final private static int BUFSIZ = 8 * 1024 + 1; final private static char newline [] = {'\n'}; public static InputEntity getInputEntity(DTDEventListener h, Locale l) { InputEntity retval = new InputEntity(); retval.errHandler = h; retval.locale = l; return retval; } private InputEntity() { } // // predicate: return true iff this is an internal entity reader, // and so may safely be "popped" as needed. external entities have // syntax to uphold; internal parameter entities have at most validity // constraints to monitor. also, only external entities get decent // location diagnostics. // public boolean isInternal() { return reader == null; } // // predicate: return true iff this is the toplevel document // public boolean isDocument() { return next == null; } // // predicate: return true iff this is a PE expansion (so that // LexicalEventListner.endParsedEntity won't be called) // public boolean isParameterEntity() { return isPE; } // // return name of current entity // public String getName() { return name; } // // use this for an external parsed entity // public void init(InputSource in, String name, InputEntity stack, boolean isPE) throws IOException, SAXException { input = in; this.isPE = isPE; reader = in.getCharacterStream(); if (reader == null) { InputStream bytes = in.getByteStream(); if (bytes == null) if (Boolean.valueOf(System.getProperty("enableExternalEntityProcessing"))) reader = XmlReader.createReader(new URL(in.getSystemId()).openStream()); else fatal("P-082", new Object[] {in.getSystemId()}); else if (in.getEncoding() != null) reader = XmlReader.createReader(in.getByteStream(), in.getEncoding()); else reader = XmlReader.createReader(in.getByteStream()); } next = stack; buf = new char[BUFSIZ]; this.name = name; checkRecursion(stack); } // // use this for an internal parsed entity; buffer is readonly // public void init(char b [], String name, InputEntity stack, boolean isPE) throws SAXException { next = stack; buf = Arrays.copyOf(b, b.length); finish = b.length; this.name = name; this.isPE = isPE; checkRecursion(stack); } private void checkRecursion(InputEntity stack) throws SAXException { if (stack == null) return; for (stack = stack.next; stack != null; stack = stack.next) { if (stack.name != null && stack.name.equals(name)) fatal("P-069", new Object[]{name}); } } public InputEntity pop() throws IOException { // caller has ensured there's nothing left to read close(); return next; }
returns true iff there's no more data to consume ...
/** * returns true iff there's no more data to consume ... */
public boolean isEOF() throws IOException, SAXException { // called to ensure WF-ness of included entities and to pop // input entities appropriately ... EOF is not always legal. if (start >= finish) { fillbuf(); return start >= finish; } else return false; }
Returns the name of the encoding in use, else null; the name returned is in as standard a form as we can get.
/** * Returns the name of the encoding in use, else null; the name * returned is in as standard a form as we can get. */
public String getEncoding() { if (reader == null) return null; if (reader instanceof XmlReader) return ((XmlReader) reader).getEncoding(); // XXX prefer a java2std() call to normalize names... if (reader instanceof InputStreamReader) return ((InputStreamReader) reader).getEncoding(); return null; }
returns the next name char, or NUL ... faster than getc(), and the common "name or nmtoken must be next" case won't need ungetc().
/** * returns the next name char, or NUL ... faster than getc(), * and the common "name or nmtoken must be next" case won't * need ungetc(). */
public char getNameChar() throws IOException, SAXException { if (finish <= start) fillbuf(); if (finish > start) { char c = buf[start++]; if (XmlChars.isNameChar(c)) return c; start--; } return 0; }
gets the next Java character -- might be part of an XML text character represented by a surrogate pair, or be the end of the entity.
/** * gets the next Java character -- might be part of an XML * text character represented by a surrogate pair, or be * the end of the entity. */
public char getc() throws IOException, SAXException { if (finish <= start) fillbuf(); if (finish > start) { char c = buf[start++]; // [2] Char ::= #x0009 | #x000A | #x000D // | [#x0020-#xD7FF] // | [#xE000-#xFFFD] // plus surrogate _pairs_ representing [#x10000-#x10ffff] if (returnedFirstHalf) { if (c >= 0xdc00 && c <= 0xdfff) { returnedFirstHalf = false; return c; } else fatal("P-070", new Object[]{Integer.toHexString(c)}); } if ((c >= 0x0020 && c <= 0xD7FF) || c == 0x0009 // no surrogates! || (c >= 0xE000 && c <= 0xFFFD)) return c; // // CRLF and CR are both line ends; map both to LF, and // keep line count correct. // else if (c == '\r' && !isInternal()) { maybeInCRLF = true; c = getc(); if (c != '\n') ungetc(); maybeInCRLF = false; lineNumber++; return '\n'; } else if (c == '\n' || c == '\r') { // LF, or 2nd char in CRLF if (!isInternal() && !maybeInCRLF) lineNumber++; return c; } // surrogates... if (c >= 0xd800 && c < 0xdc00) { returnedFirstHalf = true; return c; } fatal("P-071", new Object[]{Integer.toHexString(c)}); } throw new EndOfInputException(); }
lookahead one character
/** * lookahead one character */
public boolean peekc(char c) throws IOException, SAXException { if (finish <= start) fillbuf(); if (finish > start) { if (buf[start] == c) { start++; return true; } else return false; } return false; }
two character pushback is guaranteed
/** * two character pushback is guaranteed */
public void ungetc() { if (start == 0) throw new InternalError("ungetc"); start--; if (buf[start] == '\n' || buf[start] == '\r') { if (!isInternal()) lineNumber--; } else if (returnedFirstHalf) returnedFirstHalf = false; }
optional grammatical whitespace (discarded)
/** * optional grammatical whitespace (discarded) */
public boolean maybeWhitespace() throws IOException, SAXException { char c; boolean isSpace = false; boolean sawCR = false; // [3] S ::= #20 | #09 | #0D | #0A for (; ;) { if (finish <= start) fillbuf(); if (finish <= start) return isSpace; c = buf[start++]; if (c == 0x20 || c == 0x09 || c == '\n' || c == '\r') { isSpace = true; // // CR, LF are line endings ... CLRF is one, not two! // if ((c == '\n' || c == '\r') && !isInternal()) { if (!(c == '\n' && sawCR)) { lineNumber++; sawCR = false; } if (c == '\r') sawCR = true; } } else { start--; return isSpace; } } }
normal content; whitespace in markup may be handled specially if the parser uses the content model.

content terminates with markup delimiter characters, namely ampersand (&amp;) and left angle bracket (&lt;).

the document handler's characters() method is called on all the content found

/** * normal content; whitespace in markup may be handled * specially if the parser uses the content model. * <p> * <P> content terminates with markup delimiter characters, * namely ampersand (&amp;amp;) and left angle bracket (&amp;lt;). * <p> * <P> the document handler's characters() method is called * on all the content found */
public boolean parsedContent(DTDEventListener docHandler /*ElementValidator validator*/) throws IOException, SAXException { // [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) int first; // first char to return int last; // last char to return boolean sawContent; // sent any chars? char c; // deliver right out of the buffer, until delimiter, EOF, // or error, refilling as we go for (first = last = start, sawContent = false; ; last++) { // buffer empty? if (last >= finish) { if (last > first) { // validator.text (); docHandler.characters(buf, first, last - first); sawContent = true; start = last; } if (isEOF()) // calls fillbuf return sawContent; first = start; last = first - 1; // incremented in loop continue; } c = buf[last]; // // pass most chars through ASAP; this inlines the code of // [2] !XmlChars.isChar(c) leaving only characters needing // special treatment ... line ends, surrogates, and: // 0x0026 == '&' // 0x003C == '<' // 0x005D == ']' // Comparisons ordered for speed on 'typical' text // if ((c > 0x005D && c <= 0xD7FF) // a-z and more || (c < 0x0026 && c >= 0x0020) // space & punct || (c > 0x003C && c < 0x005D) // A-Z & punct || (c > 0x0026 && c < 0x003C) // 0-9 & punct || c == 0x0009 || (c >= 0xE000 && c <= 0xFFFD) ) continue; // terminate on markup delimiters if (c == '<' || c == '&') break; // count lines if (c == '\n') { if (!isInternal()) lineNumber++; continue; } // External entities get CR, CRLF --> LF mapping // Internal ones got it already, and we can't repeat // else we break char ref handling!! if (c == '\r') { if (isInternal()) continue; docHandler.characters(buf, first, last - first); docHandler.characters(newline, 0, 1); sawContent = true; lineNumber++; if (finish > (last + 1)) { if (buf[last + 1] == '\n') last++; } else { // CR at end of buffer // XXX case not yet handled: CRLF here will look like two lines } first = start = last + 1; continue; } // ']]>' is a WF error -- must fail if we see it if (c == ']') { switch (finish - last) { // for suspicious end-of-buffer cases, get more data // into the buffer to rule out this sequence. case 2: if (buf[last + 1] != ']') continue; // FALLTHROUGH case 1: if (reader == null || isClosed) continue; if (last == first) throw new InternalError("fillbuf"); last--; if (last > first) { // validator.text (); docHandler.characters(buf, first, last - first); sawContent = true; start = last; } fillbuf(); first = last = start; continue; // otherwise any "]]>" would be buffered, and we can // see right away if that's what we have default: if (buf[last + 1] == ']' && buf[last + 2] == '>') fatal("P-072", null); continue; } } // correctly paired surrogates are OK if (c >= 0xd800 && c <= 0xdfff) { if ((last + 1) >= finish) { if (last > first) { // validator.text (); docHandler.characters(buf, first, last - first); sawContent = true; start = last + 1; } if (isEOF()) { // calls fillbuf fatal("P-081", new Object[]{Integer.toHexString(c)}); } first = start; last = first; continue; } if (checkSurrogatePair(last)) last++; else { last--; // also terminate on surrogate pair oddities break; } continue; } fatal("P-071", new Object[]{Integer.toHexString(c)}); } if (last == first) return sawContent; // validator.text (); docHandler.characters(buf, first, last - first); start = last; return true; }
CDATA -- character data, terminated by "]]>" and optionally including unescaped markup delimiters (ampersand and left angle bracket). This should otherwise be exactly like character data, modulo differences in error report details.

The document handler's characters() or ignorableWhitespace() methods are invoked on all the character data found

Params:
  • docHandler – gets callbacks for character data
  • ignorableWhitespace – if true, whitespace characters will be reported using docHandler.ignorableWhitespace(); implicitly, non-whitespace characters will cause validation errors
  • whitespaceInvalidMessage – if true, ignorable whitespace causes a validity error report as well as a callback
/** * CDATA -- character data, terminated by {@code "]]>"} and optionally * including unescaped markup delimiters (ampersand and left angle * bracket). This should otherwise be exactly like character data, * modulo differences in error report details. * <p> * <P> The document handler's characters() or ignorableWhitespace() * methods are invoked on all the character data found * * @param docHandler gets callbacks for character data * @param ignorableWhitespace if true, whitespace characters will * be reported using docHandler.ignorableWhitespace(); implicitly, * non-whitespace characters will cause validation errors * @param whitespaceInvalidMessage if true, ignorable whitespace * causes a validity error report as well as a callback */
public boolean unparsedContent(DTDEventListener docHandler, /*ElementValidator validator,*/ boolean ignorableWhitespace, String whitespaceInvalidMessage) throws IOException, SAXException { // [18] CDSect ::= CDStart CData CDEnd // [19] CDStart ::= '<![CDATA[' // [20] CData ::= (Char* - (Char* ']]>' Char*)) // [21] CDEnd ::= ']]>' // caller peeked the leading '<' ... if (!peek("![CDATA[", null)) return false; docHandler.startCDATA(); // only a literal ']]>' stops this ... int last; for (; ;) { // until ']]>' seen boolean done = false; char c; // don't report ignorable whitespace as "text" for // validation purposes. boolean white = ignorableWhitespace; for (last = start; last < finish; last++) { c = buf[last]; // // Reject illegal characters. // if (!XmlChars.isChar(c)) { white = false; if (c >= 0xd800 && c <= 0xdfff) { if (checkSurrogatePair(last)) { last++; continue; } else { last--; break; } } fatal("P-071", new Object[] {Integer.toHexString(buf[last])}); } if (c == '\n') { if (!isInternal()) lineNumber++; continue; } if (c == '\r') { // As above, we can't repeat CR/CRLF --> LF mapping if (isInternal()) continue; if (white) { if (whitespaceInvalidMessage != null && errHandler != null) errHandler.error(new SAXParseException(DTDParser.messages.getMessage(locale, whitespaceInvalidMessage), null)); docHandler.ignorableWhitespace(buf, start, last - start); docHandler.ignorableWhitespace(newline, 0, 1); } else { // validator.text (); docHandler.characters(buf, start, last - start); docHandler.characters(newline, 0, 1); } lineNumber++; if (finish > (last + 1)) { if (buf[last + 1] == '\n') last++; } else { // CR at end of buffer // XXX case not yet handled ... as above } start = last + 1; continue; } if (c != ']') { if (c != ' ' && c != '\t') white = false; continue; } if ((last + 2) < finish) { if (buf[last + 1] == ']' && buf[last + 2] == '>') { done = true; break; } white = false; continue; } else { //last--; break; } } if (white) { if (whitespaceInvalidMessage != null && errHandler != null) errHandler.error(new SAXParseException(DTDParser.messages.getMessage(locale, whitespaceInvalidMessage), null)); docHandler.ignorableWhitespace(buf, start, last - start); } else { // validator.text (); docHandler.characters(buf, start, last - start); } if (done) { start = last + 3; break; } start = last; if (isEOF()) fatal("P-073", null); } docHandler.endCDATA(); return true; } // return false to backstep at end of buffer) private boolean checkSurrogatePair(int offset) throws SAXException { if ((offset + 1) >= finish) return false; char c1 = buf[offset++]; char c2 = buf[offset]; if ((c1 >= 0xd800 && c1 < 0xdc00) && (c2 >= 0xdc00 && c2 <= 0xdfff)) return true; fatal("P-074", new Object[]{ Integer.toHexString(c1 & 0x0ffff), Integer.toHexString(c2 & 0x0ffff) }); return false; }
whitespace in markup (flagged to app, discardable)

the document handler's ignorableWhitespace() method is called on all the whitespace found

/** * whitespace in markup (flagged to app, discardable) * <p> * <P> the document handler's ignorableWhitespace() method * is called on all the whitespace found */
public boolean ignorableWhitespace(DTDEventListener handler) throws IOException, SAXException { char c; boolean isSpace = false; int first; // [3] S ::= #20 | #09 | #0D | #0A for (first = start; ;) { if (finish <= start) { if (isSpace) handler.ignorableWhitespace(buf, first, start - first); fillbuf(); first = start; } if (finish <= start) return isSpace; c = buf[start++]; switch (c) { case '\n': if (!isInternal()) lineNumber++; // XXX handles Macintosh line endings wrong // fallthrough case 0x09: case 0x20: isSpace = true; continue; case '\r': isSpace = true; if (!isInternal()) lineNumber++; handler.ignorableWhitespace(buf, first, (start - 1) - first); handler.ignorableWhitespace(newline, 0, 1); if (start < finish && buf[start] == '\n') ++start; first = start; continue; default: ungetc(); if (isSpace) handler.ignorableWhitespace(buf, first, start - first); return isSpace; } } }
returns false iff 'next' string isn't as provided, else skips that text and returns true.

NOTE: two alternative string representations are both passed in, since one is faster.

/** * returns false iff 'next' string isn't as provided, * else skips that text and returns true. * <p> * <P> NOTE: two alternative string representations are * both passed in, since one is faster. */
public boolean peek(String next, char chars []) throws IOException, SAXException { int len; int i; if (chars != null) len = chars.length; else len = next.length(); // buffer should hold the whole thing ... give it a // chance for the end-of-buffer case and cope with EOF // by letting fillbuf compact and fill if (finish <= start || (finish - start) < len) fillbuf(); // can't peek past EOF if (finish <= start) return false; // compare the string; consume iff it matches if (chars != null) { for (i = 0; i < len && (start + i) < finish; i++) { if (buf[start + i] != chars[i]) return false; } } else { for (i = 0; i < len && (start + i) < finish; i++) { if (buf[start + i] != next.charAt(i)) return false; } } // if the first fillbuf didn't get enough data, give // fillbuf another chance to read if (i < len) { if (reader == null || isClosed) return false; // // This diagnostic "knows" that the only way big strings would // fail to be peeked is where it's a symbol ... e.g. for an // </EndTag> construct. That knowledge could also be applied // to get rid of the symbol length constraint, since having // the wrong symbol is a fatal error anyway ... // if (len > buf.length) { fatal("P-077", new Object[]{Integer.valueOf(buf.length)}); } fillbuf(); return peek(next, chars); } start += len; return true; } // // Support for reporting the internal DTD subset, so <!DOCTYPE...> // declarations can be recreated. This is collected as a single // string; such subsets are normally small, and many applications // don't even care about this. // public void startRemembering() { if (startRemember != 0) throw new InternalError(); startRemember = start; } public String rememberText() { String retval; // If the internal subset crossed a buffer boundary, we // created a temporary buffer. if (rememberedText != null) { rememberedText.append(buf, startRemember, start - startRemember); retval = rememberedText.toString(); } else retval = new String(buf, startRemember, start - startRemember); startRemember = 0; rememberedText = null; return retval; } private InputEntity getTopEntity() { InputEntity current = this; // don't report locations within internal entities! while (current != null && current.input == null) current = current.next; return current == null ? this : current; }
Returns the public ID of this input source, if known
/** * Returns the public ID of this input source, if known */
public String getPublicId() { InputEntity where = getTopEntity(); if (where == this) return input.getPublicId(); return where.getPublicId(); }
Returns the system ID of this input source, if known
/** * Returns the system ID of this input source, if known */
public String getSystemId() { InputEntity where = getTopEntity(); if (where == this) return input.getSystemId(); return where.getSystemId(); }
Returns the current line number in this input source
/** * Returns the current line number in this input source */
public int getLineNumber() { InputEntity where = getTopEntity(); if (where == this) return lineNumber; return where.getLineNumber(); }
returns -1; maintaining column numbers hurts performance
/** * returns -1; maintaining column numbers hurts performance */
public int getColumnNumber() { return -1; // not maintained (speed) } // // n.b. for non-EOF end-of-buffer cases, reader should return // at least a handful of bytes so various lookaheads behave. // // two character pushback exists except at first; characters // represented by surrogate pairs can't be pushed back (they'd // only be in character data anyway). // // DTD exception thrown on char conversion problems; line number // will be low, as a rule. // private void fillbuf() throws IOException, SAXException { // don't touched fixed buffers, that'll usually // change entity values (and isn't needed anyway) // likewise, ignore closed streams if (reader == null || isClosed) return; // if remembering DTD text, copy! if (startRemember != 0) { if (rememberedText == null) rememberedText = new StringBuffer(buf.length); rememberedText.append(buf, startRemember, start - startRemember); } boolean extra = (finish > 0) && (start > 0); int len; if (extra) // extra pushback start--; len = finish - start; System.arraycopy(buf, start, buf, 0, len); start = 0; finish = len; try { len = buf.length - len; len = reader.read(buf, finish, len); } catch (UnsupportedEncodingException e) { fatal("P-075", new Object[]{e.getMessage()}); } catch (CharConversionException e) { fatal("P-076", new Object[]{e.getMessage()}); } if (len >= 0) finish += len; else close(); if (extra) // extra pushback start++; if (startRemember != 0) // assert extra == true startRemember = 1; } public void close() { try { if (reader != null && !isClosed) reader.close(); isClosed = true; } catch (IOException e) { /* NOTHING */ } } private void fatal(String messageId, Object params []) throws SAXException { SAXParseException x = new SAXParseException(DTDParser.messages.getMessage(locale, messageId, params), null); // not continuable ... e.g. WF errors close(); if (errHandler != null) { errHandler.fatalError(x); } throw x; } }