javax.mail/javax.mail-api/1.6.2 : javax/mail/internet/HeaderTokenizer.java

HeaderTokenizer
http://javaee.github.io/javamail/javax.mail-api: JavaMail API jar (Oracle)
CDDL/GPLv2+CE
Bill Shannon (Oracle)
/*
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
 *
 * Copyright (c) 1997-2017 Oracle and/or its affiliates. All rights reserved.
 *
 * The contents of this file are subject to the terms of either the GNU
 * General Public License Version 2 only ("GPL") or the Common Development
 * and Distribution License("CDDL") (collectively, the "License").  You
 * may not use this file except in compliance with the License.  You can
 * obtain a copy of the License at
 * https://oss.oracle.com/licenses/CDDL+GPL-1.1
 * or LICENSE.txt.  See the License for the specific
 * language governing permissions and limitations under the License.
 *
 * When distributing the software, include this License Header Notice in each
 * file and include the License file at LICENSE.txt.
 *
 * GPL Classpath Exception:
 * Oracle designates this particular file as subject to the "Classpath"
 * exception as provided by Oracle in the GPL Version 2 section of the License
 * file that accompanied this code.
 *
 * Modifications:
 * If applicable, add the following below the License Header, with the fields
 * enclosed by brackets [] replaced by your own identifying information:
 * "Portions Copyright [year] [name of copyright owner]"
 *
 * Contributor(s):
 * If you wish your version of this file to be governed by only the CDDL or
 * only the GPL Version 2, indicate your decision by adding "[Contributor]
 * elects to include this software in this distribution under the [CDDL or GPL
 * Version 2] license."  If you don't indicate a single choice of license, a
 * recipient has the option to distribute your version of this file under
 * either the CDDL, the GPL Version 2 or to extend the choice of license to
 * its licensees as provided above.  However, if you add GPL Version 2 code
 * and therefore, elected the GPL Version 2 license, then the option applies
 * only if the new code is made subject to such option by the copyright
 * holder.
 */

package javax.mail.internet;

import java.util.*;

/**
 * This class tokenizes RFC822 and MIME headers into the basic
 * symbols specified by RFC822 and MIME. <p>
 *
 * This class handles folded headers (i.e., headers with embedded
 * CRLF SPACE sequences). The folds are removed in the returned
 * tokens.
 *
 * @author  John Mani
 * @author  Bill Shannon
 */

public class HeaderTokenizer {

    /**
     * The Token class represents tokens returned by the
     * HeaderTokenizer.
     */
    public static class Token {

	private int type;
	private String value;

	Token type indicating an ATOM.
/**
	 * Token type indicating an ATOM.
	 */
	public static final int ATOM 		= -1;

	Token type indicating a quoted string. The value 
field contains the string without the quotes.
/**
	 * Token type indicating a quoted string. The value 
	 * field contains the string without the quotes.
 	 */
	public static final int QUOTEDSTRING 	= -2;

	Token type indicating a comment. The value field 
contains the comment string without the comment 
start and end symbols.
/**
	 * Token type indicating a comment. The value field 
	 * contains the comment string without the comment 
	 * start and end symbols.
	 */
	public static final int COMMENT		= -3;

	Token type indicating end of input.
/**
	 * Token type indicating end of input.
	 */
	public static final int  EOF 		= -4;

	Constructor.
Params: type – 	Token type
value – 	Token value/**
	 * Constructor.
	 * @param	type	Token type
	 * @param	value	Token value
	 */
	public Token(int type, String value) {
	     this.type = type;
	     this.value = value;
	}

	Return the type of the token. If the token represents a
delimiter or a control character, the type is that character
itself, converted to an integer. Otherwise, it's value is 
one of the following:

ATOM A sequence of ASCII characters 
delimited by either SPACE, CTL, "(", <"> or the 
specified SPECIALS
QUOTEDSTRING A sequence of ASCII characters
within quotes
COMMENT A sequence of ASCII characters 
within "(" and ")".
EOF End of header

Returns: 	the token type/**
	 * Return the type of the token. If the token represents a
	 * delimiter or a control character, the type is that character
	 * itself, converted to an integer. Otherwise, it's value is 
	 * one of the following:
	 * <ul>
	 * <li><code>ATOM</code> A sequence of ASCII characters 
	 *	delimited by either SPACE, CTL, "(", &lt;"&gt; or the 
	 *	specified SPECIALS
	 * <li><code>QUOTEDSTRING</code> A sequence of ASCII characters
	 *	within quotes
	 * <li><code>COMMENT</code> A sequence of ASCII characters 
	 *	within "(" and ")".
	 * <li><code>EOF</code> End of header
	 * </ul>
	 *
	 * @return	the token type
	 */
	public int getType() {
	    return type;
	}

	Returns the value of the token just read. When the current
token is a quoted string, this field contains the body of the
string, without the quotes. When the current token is a comment,
this field contains the body of the comment.
Returns: 	token value/**
	 * Returns the value of the token just read. When the current
	 * token is a quoted string, this field contains the body of the
	 * string, without the quotes. When the current token is a comment,
	 * this field contains the body of the comment.
	 *
	 * @return	token value
	 */
	public String getValue() {
	    return value;
	}
    }

    private String string; // the string to be tokenized
    private boolean skipComments; // should comments be skipped ?
    private String delimiters; // delimiter string
    private int currentPos; // current parse position
    private int maxPos; // string length
    private int nextPos; // track start of next Token for next()
    private int peekPos; // track start of next Token for peek()

    RFC822 specials
/**
     * RFC822 specials
     */
    public final static String RFC822 = "()<>@,;:\\\"\t .[]";

    MIME specials
/**
     * MIME specials
     */
    public final static String MIME = "()<>@,;:\\\"\t []/?=";

    // The EOF Token
    private final static Token EOFToken = new Token(Token.EOF, null);

    /**
     * Constructor that takes an RFC822-style header.
     *
     * @param	header	The RFC822 header to be tokenized
     * @param	delimiters      Set of delimiter characters
     *				to be used to delimit ATOMS. These
     *				are usually <code>RFC822</code> or
     *				<code>MIME</code>
     * @param   skipComments  If true, comments are skipped and
     *				not returned as tokens
     */
    public HeaderTokenizer(String header, String delimiters,
			   boolean skipComments) {
	// A null header is tokenized exactly like an empty one.
	if (header != null)
	    string = header;
	else
	    string = "";
	maxPos = string.length();

	this.delimiters = delimiters;
	this.skipComments = skipComments;

	// All three cursors start at the beginning of the header.
	peekPos = 0;
	nextPos = 0;
	currentPos = 0;
    }

    /**
     * Constructor. Comments are ignored and not returned as tokens.
     *
     * @param	header  The header that is tokenized
     * @param	delimiters  The delimiters to be used
     */
    public HeaderTokenizer(String header, String delimiters) {
	// Delegate with skipComments = true, so comments are never
	// returned as tokens.
	this(header, delimiters, true);
    }

    /**
     * Constructor. The RFC822-defined delimiters
     * (<code>RFC822</code>) are used to delimit ATOMS. Comments
     * are skipped and not returned as tokens.
     *
     * @param	header	the header string
     */
    public HeaderTokenizer(String header)  {
	// Delegate using the RFC822 specials as the delimiter set; the
	// two-argument constructor in turn enables comment skipping.
	this(header, RFC822);
    }

    /**
     * Parses the next token from this String. <p>
     *
     * Clients sit in a loop calling next() to parse successive
     * tokens until an EOF Token is returned.
     *
     * @return		the next Token
     * @exception	ParseException if the parse fails
     */
    public Token next() throws ParseException {
	// NUL means "no explicit end-of-atom character"; backslash
	// escapes are not preserved in the returned value.
	final char noEndOfAtom = '\0';
	final boolean keepEscapes = false;
	return next(noEndOfAtom, keepEscapes);
    }

    /**
     * Parses the next token from this String.
     * If endOfAtom is not NUL, the token extends until the
     * endOfAtom character is seen, or to the end of the header.
     * This method is useful when parsing headers that don't
     * obey the MIME specification, e.g., by failing to quote
     * parameter values that contain spaces.
     *
     * @param	endOfAtom	if not NUL, character marking end of token
     * @return		the next Token
     * @exception	ParseException if the parse fails
     * @since		JavaMail 1.5
     */
    public Token next(char endOfAtom) throws ParseException {
	// Backslash escapes are not preserved by this variant.
	final boolean keepEscapes = false;
	return next(endOfAtom, keepEscapes);
    }

    /**
     * Parses the next token from this String.
     * endOfAtom is handled as in <code>next(char endOfAtom)</code>.
     * If keepEscapes is true, any backslash escapes are preserved
     * in the returned string.
     * This method is useful when parsing headers that don't
     * obey the MIME specification, e.g., by failing to escape
     * backslashes in the filename parameter.
     *
     * @param	endOfAtom	if not NUL, character marking end of token
     * @param	keepEscapes	keep all backslashes in returned string?
     * @return		the next Token
     * @exception	ParseException if the parse fails
     * @since		JavaMail 1.5
     */
    public Token next(char endOfAtom, boolean keepEscapes)
				throws ParseException {
	// Resume from the last consumed position; any look-ahead done
	// by peek() is discarded.
	currentPos = nextPos;
	final Token token = getNext(endOfAtom, keepEscapes);

	// Commit the new position and restart peeking from it.  If
	// getNext threw, neither cursor is touched.
	peekPos = currentPos;
	nextPos = currentPos;
	return token;
    }

    /**
     * Peek at the next token, without actually removing the token
     * from the parse stream. Invoking this method multiple times
     * will return successive tokens, until <code>next()</code> is
     * called.
     *
     * @return		the next Token
     * @exception	ParseException if the parse fails
     */
    public Token peek() throws ParseException {
	// Continue from where the previous peek() stopped, or from the
	// last next() position if there has been no peek since.
	currentPos = peekPos;
	final Token lookahead = getNext('\0', false);

	// Only the peek cursor advances; nextPos is left alone so
	// next() still returns the first unconsumed token.
	peekPos = currentPos;
	return lookahead;
    }

    /**
     * Return the rest of the Header.
     *
     * @return	the rest of the header, or null if we are
     *		already at the end of the header
     */
    public String getRemainder() {
	// nextPos is the position after the last token returned by
	// next(); tokens that were only peeked are still included.
	return (nextPos < string.length())
		? string.substring(nextPos)
		: null;
    }

    /*
     * Return the next token starting from 'currentPos'. After the
     * parse, 'currentPos' is updated to point to the start of the 
     * next token.
     *
     * Tokens are recognized in this order: end of input, comment
     * ("(...)"), quoted string, single special/control character,
     * and finally atom.
     *
     * endOfAtom   - if not NUL, a special character other than
     *		     endOfAtom does not end the token; instead the
     *		     text up to endOfAtom (or the end of the header)
     *		     is gathered by collectString and returned as a
     *		     QUOTEDSTRING token
     * keepEscapes - handed on to filterToken/collectString; if true,
     *		     backslashes are kept in the token value
     *
     * Throws ParseException for an unterminated comment and, via
     * collectString, for an unterminated quoted string.
     */
    private Token getNext(char endOfAtom, boolean keepEscapes)
				throws ParseException {
	// If we're already at end of string, return EOF
	if (currentPos >= maxPos)
	    return EOFToken;

	// Skip white-space, position currentPos beyond the space
	if (skipWhiteSpace() == Token.EOF)
	    return EOFToken;

	char c; 
	int start; 
	// Set when a comment contains a backslash or CR and therefore
	// has to be post-processed by filterToken.  It is not reset
	// between consecutive comments.
	boolean filter = false;
	
	c = string.charAt(currentPos);

	// Check or Skip comments and position currentPos
	// beyond the comment
	while (c == '(') {
	    // Parsing comment ..
	    // 'nesting' counts unmatched "(" so nested comments are
	    // consumed as part of the outer one.
	    int nesting;
	    for (start = ++currentPos, nesting = 1; 
		 nesting > 0 && currentPos < maxPos;
		 currentPos++) {
		c = string.charAt(currentPos);
		if (c == '\\') {  // Escape sequence
		    currentPos++; // skip the escaped character
		    filter = true;
		} else if (c == '\r')
		    filter = true;
		else if (c == '(')
		    nesting++;
		else if (c == ')')
		    nesting--;
	    }
	    // Ran out of input before the outermost "(" was closed.
	    if (nesting != 0)
		throw new ParseException("Unbalanced comments");

	    if (!skipComments) {
		// Return the comment, if we are asked to.
		// Note that the comment start & end markers are ignored.
		// currentPos is now one past the closing ")", hence
		// the "- 1" end index below.
		String s;
		if (filter) // need to go thru the token again.
		    s = filterToken(string, start, currentPos-1, keepEscapes);
		else
		    s = string.substring(start,currentPos-1);

		return new Token(Token.COMMENT, s);
	    }

	    // Skip any whitespace after the comment.
	    if (skipWhiteSpace() == Token.EOF)
		return EOFToken;
	    // Re-read the current character; another "(" loops again.
	    c = string.charAt(currentPos);
	}

	// Check for quoted-string and position currentPos 
	//  beyond the terminating quote
	if (c == '"') {
	    currentPos++;	// skip initial quote
	    return collectString('"', keepEscapes);
	}
	
	// Check for SPECIAL or CTL
	// (040 and 0177 are octal: characters below SPACE, and DEL or
	// anything above it, are treated like delimiters.)
	if (c < 040 || c >= 0177 || delimiters.indexOf(c) >= 0) {
	    if (endOfAtom > 0 && c != endOfAtom) {
		// not expecting a special character here,
		// pretend it's a quoted string
		return collectString(endOfAtom, keepEscapes);
	    }
	    currentPos++; // re-position currentPos
	    // The token type is the character itself and the value is
	    // the one-character string.
	    char ch[] = new char[1];
	    ch[0] = c;
	    return new Token((int)c, new String(ch));
	}

	// Check for ATOM
	for (start = currentPos; currentPos < maxPos; currentPos++) {
	    c = string.charAt(currentPos);
	    // ATOM is delimited by either SPACE, CTL, "(", <"> 
	    // or the specified SPECIALS
	    if (c < 040 || c >= 0177 || c == '(' || c == ' ' ||
			c == '"' || delimiters.indexOf(c) >= 0) {
		if (endOfAtom > 0 && c != endOfAtom) {
		    // not the expected atom after all;
		    // back up and pretend it's a quoted string
		    currentPos = start;
		    return collectString(endOfAtom, keepEscapes);
		}
		break;
	    }
	}
	// currentPos now rests on the delimiter (or at maxPos); the
	// delimiter itself is left for the next call.
	return new Token(Token.ATOM, string.substring(start, currentPos));
    }

    /*
     * Collect the text from 'currentPos' up to the next unescaped
     * 'eos' character and return it as a QUOTEDSTRING token.
     *
     * If eos is '"' this is a real quoted string: the closing quote
     * is consumed, and running off the end of the header without
     * finding it throws ParseException.
     *
     * For any other eos the text is treated as an unquoted value:
     * trailing white space is trimmed, 'currentPos' is left ON the
     * eos character so it is returned as the next token, and running
     * off the end of the header simply returns what was collected.
     *
     * If keepEscapes is true, backslashes are kept in the value.
     */
    private Token collectString(char eos, boolean keepEscapes)
				throws ParseException {
	int start;
	// Set when the text contains a backslash or CR and therefore
	// has to be post-processed by filterToken.
	boolean filter = false;
	for (start = currentPos; currentPos < maxPos; currentPos++) {
	    char c = string.charAt(currentPos);
	    if (c == '\\') { // Escape sequence
		currentPos++;	// skip the escaped character
		filter = true;
	    } else if (c == '\r')
		filter = true;
	    else if (c == eos) {
		currentPos++;
		String s;

		// currentPos is one past eos, hence the "- 1" end index.
		if (filter)
		    s = filterToken(string, start, currentPos-1, keepEscapes);
		else
		    s = string.substring(start, currentPos-1);

		if (c != '"') {		// not a real quoted string
		    s = trimWhiteSpace(s);
		    currentPos--;	// back up before the eos char
		}

		return new Token(Token.QUOTEDSTRING, s);
	    }
	}

	// ran off the end of the string

	// A backslash as the very last character makes the escape
	// handling above step one position past the end of the header.
	// Clamp it, otherwise the filterToken call below reads beyond
	// the string and throws StringIndexOutOfBoundsException.
	if (currentPos > maxPos)
	    currentPos = maxPos;

	// if we're looking for a matching quote, that's an error
	if (eos == '"')
	    throw new ParseException("Unbalanced quoted string");

	// otherwise, just return whatever's left
	// (filterToken drops a lone trailing backslash)
	String s;
	if (filter)
	    s = filterToken(string, start, currentPos, keepEscapes);
	else
	    s = string.substring(start, currentPos);
	s = trimWhiteSpace(s);
	return new Token(Token.QUOTEDSTRING, s);
    }

    // Advance currentPos past any run of SPACE, HT, CR and NL.
    // Returns the index of the first non-whitespace character, or
    // Token.EOF if the end of the header is reached first.
    private int skipWhiteSpace() {
	while (currentPos < maxPos) {
	    switch (string.charAt(currentPos)) {
	    case ' ':
	    case '\t':
	    case '\r':
	    case '\n':
		currentPos++;
		break;
	    default:
		return currentPos;
	    }
	}
	return Token.EOF;
    }

    // Trim SPACE, HT, CR and NL from end of string.
    // Leading white space is kept.  Returns "" if the string is
    // empty or consists only of white space.
    private static String trimWhiteSpace(String s) {
	// 'end' is the exclusive end index of the text to keep.
	int end = s.length();
	while (end > 0) {
	    char c = s.charAt(end - 1);
	    if (c != ' ' && c != '\t' && c != '\r' && c != '\n')
		break;
	    end--;
	}
	// end == 0 only when nothing but white space was found.  A
	// string whose last non-white-space character is at index 0
	// (e.g. "a" or "a  ") must keep that character.
	return s.substring(0, end);
    }

    /*
     * Build the value of a comment or quoted string from the raw
     * header characters in s[start, end): backslash escapes are
     * resolved and the CRLF of a folded line is removed.
     *
     * - A backslash makes the following character literal; the
     *   backslash itself is kept only if keepEscapes is true.  A
     *   backslash that is the last character of the range is dropped.
     * - An unescaped CR is dropped, together with an LF that directly
     *   follows it.
     * - Every other character is copied unchanged.
     */
    private static String filterToken(String s, int start, int end,
				boolean keepEscapes) {
	StringBuilder out = new StringBuilder();
	int pos = start;

	while (pos < end) {
	    char ch = s.charAt(pos++);

	    if (ch == '\\') {
		if (pos >= end)
		    break;	// dangling backslash: nothing to escape

		// Copy the escaped character verbatim, whatever it is.
		// If keepEscapes is set, keep the backslash.  IE6 fails
		// to escape backslashes in quoted strings in HTTP
		// headers, e.g., in the filename parameter.
		if (keepEscapes)
		    out.append('\\');
		out.append(s.charAt(pos++));
	    } else if (ch == '\r') {
		// Unescaped CR: drop it, and also the LF that completes
		// a CRLF sequence (i.e., LWSP).
		if (pos < end && s.charAt(pos) == '\n')
		    pos++;
	    } else {
		out.append(ch);
	    }
	}
	return out.toString();
    }
}
Throws:	ParseException – if the parse fails
Returns:	the next Token
Params:	endOfAtom – if not NUL, character marking end of token
Throws:	ParseException – if the parse fails
Returns:	the next Token
Since:	JavaMail 1.5
/

javax.mail/ javax.mail-api/ 1.6.2/ javax/mail/internet/HeaderTokenizer.java