org.antlr/ST4/4.1 : org/stringtemplate/v4/compiler/STLexer.java

STLexer
http://nexus.sonatype.org/oss-repository-hosting.html/ST4: StringTemplate is a java template engine for generating source code, web pages, emails, or any other formatted text output. StringTemplate is particularly good at multi-targeted code generators, multiple site skins, and internationalization/localization. It evolved over years of effort developing jGuru.com. StringTemplate also powers the ANTLR 3 and 4 code generator. Its distinguishing characteristic is that unlike other engines, it strictly enforces model-view separation. Strict separation makes websites and code generators more flexible and maintainable; it also provides an excellent defense against malicious template authors.
/*
 * [The "BSD license"]
 *  Copyright (c) 2011 Terence Parr
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *  1. Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.stringtemplate.v4.compiler;

import org.antlr.runtime.CharStream;
import org.antlr.runtime.CommonToken;
import org.antlr.runtime.MismatchedTokenException;
import org.antlr.runtime.NoViableAltException;
import org.antlr.runtime.RecognitionException;
import org.antlr.runtime.Token;
import org.antlr.runtime.TokenSource;
import org.stringtemplate.v4.STGroup;
import org.stringtemplate.v4.misc.ErrorManager;
import org.stringtemplate.v4.misc.Misc;

import java.util.ArrayList;
import java.util.List;

This class represents the tokenizer for templates. It operates in two modes: inside and outside of expressions. It implements the TokenSource interface so it can be used with ANTLR parsers. Outside of expressions, we can return these token types: TEXT, INDENT, LDELIM (start of expression), RCURLY (end of subtemplate), and NEWLINE. Inside of an expression, this lexer returns all of the tokens needed by STParser. From the parser's point of view, it can treat a template as a simple stream of elements.  This class defines the token types and communicates these values to STParser.g via STLexer.tokens file (which must remain consistent).
/**
 * This class represents the tokenizer for templates. It operates in two modes:
 * inside and outside of expressions. It implements the {@link TokenSource}
 * interface so it can be used with ANTLR parsers. Outside of expressions, we
 * can return these token types: {@link #TEXT}, {@link #INDENT}, {@link #LDELIM}
 * (start of expression), {@link #RCURLY} (end of subtemplate), and
 * {@link #NEWLINE}. Inside of an expression, this lexer returns all of the
 * tokens needed by {@link STParser}. From the parser's point of view, it can
 * treat a template as a simple stream of elements.
 * <p>
 * This class defines the token types and communicates these values to
 * {@code STParser.g} via {@code STLexer.tokens} file (which must remain
 * consistent).</p>
 */
public class STLexer implements TokenSource {
    public static final char EOF = (char)-1;            // EOF char
    public static final int EOF_TYPE = CharStream.EOF;  // EOF token type

    We build STToken tokens instead of relying on CommonToken so we can override toString(). It just converts token types to token names like 23 to "LDELIM". /** We build {@code STToken} tokens instead of relying on {@link CommonToken}
	 *  so we can override {@link #toString()}. It just converts token types to
     *  token names like 23 to {@code "LDELIM"}.
     */
    public static class STToken extends CommonToken {
        public STToken(CharStream input, int type, int start, int stop) {
            super(input, type, DEFAULT_CHANNEL, start, stop);
        }
        public STToken(int type, String text) { super(type, text); }

		@Override
        public String toString() {
            String channelStr = "";
            if ( channel>0 ) {
                channelStr=",channel="+channel;
            }
            String txt = getText();
            if ( txt!=null ) txt = Misc.replaceEscapes(txt);
            else txt = "<no text>";
			String tokenName;
			if ( type==EOF_TYPE ) tokenName = "EOF";
			else tokenName = STParser.tokenNames[type];
			return "[@"+getTokenIndex()+","+start+":"+stop+"='"+txt+"',<"+ tokenName +">"+channelStr+","+line+":"+getCharPositionInLine()+"]";
        }
    }

    public static final Token SKIP = new STToken(-1, "<skip>");

    // must follow STLexer.tokens file that STParser.g loads
    public static final int RBRACK=17;
    public static final int LBRACK=16;
    public static final int ELSE=5;
    public static final int ELLIPSIS=11;
    public static final int LCURLY=20;
    public static final int BANG=10;
    public static final int EQUALS=12;
    public static final int TEXT=22;
    public static final int ID=25;
    public static final int SEMI=9;
    public static final int LPAREN=14;
    public static final int IF=4;
    public static final int ELSEIF=6;
    public static final int COLON=13;
    public static final int RPAREN=15;
    public static final int COMMA=18;
    public static final int RCURLY=21;
    public static final int ENDIF=7;
    public static final int RDELIM=24;
    public static final int SUPER=8;
    public static final int DOT=19;
    public static final int LDELIM=23;
    public static final int STRING=26;
	public static final int PIPE=28;
	public static final int OR=29;
	public static final int AND=30;
	public static final int INDENT=31;
    public static final int NEWLINE=32;
    public static final int AT=33;
    public static final int REGION_END=34;
	public static final int TRUE=35;
	public static final int FALSE=36;
	public static final int COMMENT=37;
	public static final int SLASH=38;


    The char which delimits the start of an expression. /** The char which delimits the start of an expression. */
    char delimiterStartChar = '<';
    The char which delimits the end of an expression. /** The char which delimits the end of an expression. */
    char delimiterStopChar = '>';

	This keeps track of the current mode of the lexer. Are we inside or
outside an ST expression?
/**
	 * This keeps track of the current mode of the lexer. Are we inside or
	 * outside an ST expression?
	 */
    boolean scanningInsideExpr = false;

    To be able to properly track the inside/outside mode, we need to
 track how deeply nested we are in some templates. Otherwise, we
 know whether a '}' and the outermost subtemplate to send this
 back to outside mode.
/** To be able to properly track the inside/outside mode, we need to
     *  track how deeply nested we are in some templates. Otherwise, we
     *  know whether a <code>'}'</code> and the outermost subtemplate to send this
	 *  back to outside mode.
     */
	public int subtemplateDepth = 0; // start out *not* in a {...} subtemplate

	ErrorManager errMgr;

	template embedded in a group file? this is the template /** template embedded in a group file? this is the template */
	Token templateToken;

    CharStream input;
	current character /** current character */
    char c;

    When we started token, track initial coordinates so we can properly
 build token objects.
/** When we started token, track initial coordinates so we can properly
     *  build token objects.
     */
    int startCharIndex;
    int startLine;
    int startCharPositionInLine;

    Our lexer routines might have to emit more than a single token. We
 buffer everything through this list.
/** Our lexer routines might have to emit more than a single token. We
     *  buffer everything through this list.
     */
    List<Token> tokens = new ArrayList<Token>();

	public STLexer(CharStream input) { this(STGroup.DEFAULT_ERR_MGR, input, null, '<', '>'); }

    public STLexer(ErrorManager errMgr, CharStream input, Token templateToken) {
		this(errMgr, input, templateToken, '<', '>');
	}

	public STLexer(ErrorManager errMgr,
				   CharStream input,
				   Token templateToken,
				   char delimiterStartChar,
				   char delimiterStopChar)
	{
		this.errMgr = errMgr;
		this.input = input;
		c = (char)input.LA(1); // prime lookahead
		this.templateToken = templateToken;
		this.delimiterStartChar = delimiterStartChar;
		this.delimiterStopChar = delimiterStopChar;
	}

	@Override
	public Token nextToken() {
		Token t;
		if ( tokens.size()>0 ) { t = tokens.remove(0); }
		else t = _nextToken();
//		System.out.println(t);
		return t;
	}

    Consume if x is next character on the input stream. /** Consume if {@code x} is next character on the input stream.
	 */
    public void match(char x) {
        if ( c != x ) {
			NoViableAltException e = new NoViableAltException("",0,0,input);
			errMgr.lexerError(input.getSourceName(), "expecting '"+x+"', found '"+str(c)+"'", templateToken, e);
		}
		consume();
    }

    protected void consume() {
        input.consume();
        c = (char)input.LA(1);
    }

    public void emit(Token token) { tokens.add(token); }

    public Token _nextToken() {
		//System.out.println("nextToken: c="+(char)c+"@"+input.index());
        while ( true ) { // lets us avoid recursion when skipping stuff
            startCharIndex = input.index();
            startLine = input.getLine();
            startCharPositionInLine = input.getCharPositionInLine();

            if ( c==EOF ) return newToken(EOF_TYPE);
            Token t;
            if ( scanningInsideExpr ) t = inside();
            else t = outside();
            if ( t!=SKIP ) return t;
        }
    }

    protected Token outside() {
        if ( input.getCharPositionInLine()==0 && (c==' '||c=='\t') ) {
            while ( c==' ' || c=='\t' ) consume(); // scarf indent
            if ( c!=EOF ) return newToken(INDENT);
            return newToken(TEXT);
        }
        if ( c==delimiterStartChar ) {
            consume();
            if ( c=='!' ) return COMMENT();
            if ( c=='\\' ) return ESCAPE(); // <\\> <\uFFFF> <\n> etc...
            scanningInsideExpr = true;
            return newToken(LDELIM);
        }
        if ( c=='\r' ) { consume(); consume(); return newToken(NEWLINE); } // \r\n -> \n
        if ( c=='\n') {	consume(); return newToken(NEWLINE); }
        if ( c=='}' && subtemplateDepth>0 ) {
            scanningInsideExpr = true;
            subtemplateDepth--;
            consume();
            return newTokenFromPreviousChar(RCURLY);
        }
        return mTEXT();
    }

    protected Token inside() {
        while ( true ) {
            switch ( c ) {
                case ' ': case '\t': case '\n': case '\r':
					consume();
					return SKIP;
                case '.' :
					consume();
					if ( input.LA(1)=='.' && input.LA(2)=='.' ) {
						consume();
						match('.');
						return newToken(ELLIPSIS);
					}
					return newToken(DOT);
                case ',' : consume(); return newToken(COMMA);
				case ':' : consume(); return newToken(COLON);
				case ';' : consume(); return newToken(SEMI);
                case '(' : consume(); return newToken(LPAREN);
                case ')' : consume(); return newToken(RPAREN);
                case '[' : consume(); return newToken(LBRACK);
                case ']' : consume(); return newToken(RBRACK);
				case '=' : consume(); return newToken(EQUALS);
                case '!' : consume(); return newToken(BANG);
				case '/' : consume(); return newToken(SLASH);
                case '@' :
                    consume();
                    if ( c=='e' && input.LA(2)=='n' && input.LA(3)=='d' ) {
                        consume(); consume(); consume();
                        return newToken(REGION_END);
                    }
                    return newToken(AT);
                case '"' : return mSTRING();
                case '&' : consume(); match('&'); return newToken(AND); // &&
                case '|' : consume(); match('|'); return newToken(OR); // ||
				case '{' : return subTemplate();
				default:
					if ( c==delimiterStopChar ) {
						consume();
						scanningInsideExpr =false;
						return newToken(RDELIM);
					}
                    if ( isIDStartLetter(c) ) {
						Token id = mID();
						String name = id.getText();
						if ( name.equals("if") ) return newToken(IF);
						else if ( name.equals("endif") ) return newToken(ENDIF);
						else if ( name.equals("else") ) return newToken(ELSE);
						else if ( name.equals("elseif") ) return newToken(ELSEIF);
						else if ( name.equals("super") ) return newToken(SUPER);
						else if ( name.equals("true") ) return newToken(TRUE);
						else if ( name.equals("false") ) return newToken(FALSE);
						return id;
					}
					RecognitionException re =
						new NoViableAltException("",0,0,input);
                    re.line = startLine;
                    re.charPositionInLine = startCharPositionInLine;
					errMgr.lexerError(input.getSourceName(), "invalid character '"+str(c)+"'", templateToken, re);
					if (c==EOF) {
						return newToken(EOF_TYPE);
					}
					consume();
            }
        }
    }

    Token subTemplate() {
        // look for "{ args ID (',' ID)* '|' ..."
		subtemplateDepth++;
        int m = input.mark();
        int curlyStartChar = startCharIndex;
        int curlyLine = startLine;
        int curlyPos = startCharPositionInLine;
        List<Token> argTokens = new ArrayList<Token>();
        consume();
		Token curly = newTokenFromPreviousChar(LCURLY);
        WS();
        argTokens.add( mID() );
        WS();
        while ( c==',' ) {
			consume();
            argTokens.add( newTokenFromPreviousChar(COMMA) );
            WS();
            argTokens.add( mID() );
            WS();
        }
        WS();
        if ( c=='|' ) {
			consume();
            argTokens.add( newTokenFromPreviousChar(PIPE) );
            if ( isWS(c) ) consume(); // ignore a single whitespace after |
            //System.out.println("matched args: "+argTokens);
            for (Token t : argTokens) emit(t);
			input.release(m);
			scanningInsideExpr = false;
			startCharIndex = curlyStartChar; // reset state
			startLine = curlyLine;
			startCharPositionInLine = curlyPos;
			return curly;
		}
		input.rewind(m);
		startCharIndex = curlyStartChar; // reset state
		startLine = curlyLine;
        startCharPositionInLine = curlyPos;
		consume();
		scanningInsideExpr = false;
        return curly;
    }

    Token ESCAPE() {
		startCharIndex = input.index();
		startCharPositionInLine = input.getCharPositionInLine();
		consume(); // kill \\
		if ( c=='u') return UNICODE();
		String text;
        switch ( c ) {
            case '\\' : LINEBREAK(); return SKIP;
			case 'n'  : text = "\n"; break;
			case 't'  : text = "\t"; break;
			case ' '  : text = " "; break;
            default :
                NoViableAltException e = new NoViableAltException("",0,0,input);
                errMgr.lexerError(input.getSourceName(), "invalid escaped char: '"+str(c)+"'", templateToken, e);
				consume();
				match(delimiterStopChar);
				return SKIP;
        }
        consume();
		Token t = newToken(TEXT, text, input.getCharPositionInLine()-2);
        match(delimiterStopChar);
        return t;
    }

    Token UNICODE() {
        consume();
        char[] chars = new char[4];
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
            errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[0] = c;
        consume();
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
			errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[1] = c;
        consume();
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
			errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[2] = c;
        consume();
        if ( !isUnicodeLetter(c) ) {
            NoViableAltException e = new NoViableAltException("",0,0,input);
			errMgr.lexerError(input.getSourceName(), "invalid unicode char: '"+str(c)+"'", templateToken, e);
        }
        chars[3] = c;
        // ESCAPE kills >
        char uc = (char)Integer.parseInt(new String(chars), 16);
        Token t = newToken(TEXT, String.valueOf(uc), input.getCharPositionInLine()-6);
		consume();
		match(delimiterStopChar);
		return t;
    }

    Token mTEXT() {
		boolean modifiedText = false;
        StringBuilder buf = new StringBuilder();
        while ( c != EOF && c != delimiterStartChar ) {
			if ( c=='\r' || c=='\n') break;
			if ( c=='}' && subtemplateDepth>0 ) break;
            if ( c=='\\' ) {
                if ( input.LA(2)=='\\' ) { // convert \\ to \
                    consume(); consume(); buf.append('\\');
                    modifiedText = true;
                    continue;
                }
                if ( input.LA(2)==delimiterStartChar ||
					 input.LA(2)=='}' )
				{
                    modifiedText = true;
                    consume(); // toss out \ char
                    buf.append(c); consume();
                }
                else {
                    buf.append(c);
                    consume();
                }
                continue;
            }
            buf.append(c);
            consume();
        }
        if ( modifiedText )	return newToken(TEXT, buf.toString());
        else return newToken(TEXT);
    }

     ID  : ('a'..'z'|'A'..'Z'|'_'|'/')
       ('a'..'z'|'A'..'Z'|'0'..'9'|'_'|'/')*
     ;
 
/** <pre>
	 *  ID  : ('a'..'z'|'A'..'Z'|'_'|'/')
	 *        ('a'..'z'|'A'..'Z'|'0'..'9'|'_'|'/')*
	 *      ;
	 *  </pre>
	 */
    Token mID() {
        // called from subTemplate; so keep resetting position during speculation
        startCharIndex = input.index();
        startLine = input.getLine();
        startCharPositionInLine = input.getCharPositionInLine();
        consume();
        while ( isIDLetter(c) ) {
            consume();
        }
        return newToken(ID);
    }

     STRING : '"'
          (   '\\' '"'
          |   '\\' ~'"'
          |   ~('\\'|'"')
          )*
          '"'
        ;

/** <pre>
	 *  STRING : '"'
	 *           (   '\\' '"'
	 *           |   '\\' ~'"'
	 *           |   ~('\\'|'"')
	 *           )*
	 *           '"'
	 *         ;
	 * </pre>
	 */
    Token mSTRING() {
    	//{setText(getText().substring(1, getText().length()-1));}
        boolean sawEscape = false;
        StringBuilder buf = new StringBuilder();
        buf.append(c); consume();
        while ( c != '"' ) {
            if ( c=='\\' ) {
                sawEscape = true;
                consume();
				switch ( c ) {
					case 'n' : buf.append('\n'); break;
					case 'r' : buf.append('\r'); break;
					case 't' : buf.append('\t'); break;
                	default : buf.append(c); break;
				}
				consume();
                continue;
            }
            buf.append(c);
            consume();
			if ( c==EOF ) {
				RecognitionException re =
					new MismatchedTokenException((int)'"', input);
				re.line = input.getLine();
				re.charPositionInLine = input.getCharPositionInLine();
				errMgr.lexerError(input.getSourceName(), "EOF in string", templateToken, re);
				break;
			}
        }
        buf.append(c);
        consume();
        if ( sawEscape ) return newToken(STRING, buf.toString());
        else return newToken(STRING);
    }

    void WS() {
        while ( c==' ' || c=='\t' || c=='\n' || c=='\r' ) consume();
    }

    Token COMMENT() {
        match('!');
        while ( !(c=='!' && input.LA(2)==delimiterStopChar) ) {
			if (c==EOF) {
				RecognitionException re =
					new MismatchedTokenException((int)'!', input);
				re.line = input.getLine();
				re.charPositionInLine = input.getCharPositionInLine();
				errMgr.lexerError(input.getSourceName(), "Nonterminated comment starting at " +
					startLine+":"+startCharPositionInLine+": '!"+
					delimiterStopChar+"' missing", templateToken, re);
				break;
			}
			consume();
		}
        consume(); consume(); // grab !>
		return newToken(COMMENT);
    }

    void LINEBREAK() {
        match('\\'); // only kill 2nd \ as ESCAPE() kills first one
        match(delimiterStopChar);
        while ( c==' ' || c=='\t' ) consume(); // scarf WS after <\\>
		if ( c==EOF ) {
			RecognitionException re = new RecognitionException(input);
			re.line = input.getLine();
			re.charPositionInLine = input.getCharPositionInLine();
			errMgr.lexerError(input.getSourceName(), "Missing newline after newline escape <\\\\>",
				              templateToken, re);
			return;
		}
		if ( c=='\r' ) consume();
        match('\n');
        while ( c==' ' || c=='\t' ) consume(); // scarf any indent
    }

    public static boolean isIDStartLetter(char c) { return isIDLetter(c); }
	public static boolean isIDLetter(char c) { return c>='a'&&c<='z' || c>='A'&&c<='Z' || c>='0'&&c<='9' || c=='-' || c=='_'; }
    public static boolean isWS(char c) { return c==' ' || c=='\t' || c=='\n' || c=='\r'; }
    public static boolean isUnicodeLetter(char c) { return c>='a'&&c<='f' || c>='A'&&c<='F' || c>='0'&&c<='9'; }

    public Token newToken(int ttype) {
        STToken t = new STToken(input, ttype, startCharIndex, input.index()-1);
        t.setLine(startLine);
        t.setCharPositionInLine(startCharPositionInLine);
		return t;
	}

    public Token newTokenFromPreviousChar(int ttype) {
        STToken t = new STToken(input, ttype, input.index()-1, input.index()-1);
        t.setLine(input.getLine());
        t.setCharPositionInLine(input.getCharPositionInLine()-1);
        return t;
    }

    public Token newToken(int ttype, String text, int pos) {
        STToken t = new STToken(ttype, text);
		t.setStartIndex(startCharIndex);
		t.setStopIndex(input.index()-1);
        t.setLine(input.getLine());
        t.setCharPositionInLine(pos);
        return t;
    }

	public Token newToken(int ttype, String text) {
		STToken t = new STToken(ttype, text);
        t.setStartIndex(startCharIndex);
        t.setStopIndex(input.index()-1);
		t.setLine(startLine);
		t.setCharPositionInLine(startCharPositionInLine);
		return t;
	}

//    public String getErrorHeader() {
//        return startLine+":"+startCharPositionInLine;
//    }
//
	@Override
    public String getSourceName() {
        return "no idea";
    }

	public static String str(int c) {
		if ( c==EOF ) return "<EOF>";
		return String.valueOf((char)c);
	}
}
/

org.antlr/ ST4/ 4.1/ org/stringtemplate/v4/compiler/STLexer.java