java/10 : java.xml.bind/com/sun/xml/internal/org/jvnet/mimepull/MIMEParser.java

MIMEParser
https://openjdk.java.net/
GPLv2 + Classpath Exception
/*
 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package com.sun.xml.internal.org.jvnet.mimepull;

import java.io.InputStream;
import java.io.IOException;
import java.util.*;
import java.util.logging.Logger;
import java.nio.ByteBuffer;
import java.util.logging.Level;

Pull parser for the MIME messages. Applications can use pull API to continue
the parsing MIME messages lazily.
for e.g.:

MIMEParser parser = ...
Iterator it = parser.iterator();
while(it.hasNext()) {
  MIMEEvent event = it.next();
  ...
}

Author: Jitendra Kotamraju/**
 * Pull parser for the MIME messages. Applications can use pull API to continue
 * the parsing MIME messages lazily.
 *
 * <pre>
 * for e.g.:
 * <p>
 *
 * MIMEParser parser = ...
 * Iterator<MIMEEvent> it = parser.iterator();
 * while(it.hasNext()) {
 *   MIMEEvent event = it.next();
 *   ...
 * }
 * </pre>
 *
 * @author Jitendra Kotamraju
 */
class MIMEParser implements Iterable<MIMEEvent> {

    private static final Logger LOGGER = Logger.getLogger(MIMEParser.class.getName());

    private static final String HEADER_ENCODING = "ISO8859-1";

    // Actually, the grammar doesn't support whitespace characters
    // after boundary. But the mail implementation checks for it.
    // We will only check for these many whitespace characters after boundary
    private static final int NO_LWSP = 1000;
    private enum STATE {START_MESSAGE, SKIP_PREAMBLE, START_PART, HEADERS, BODY, END_PART, END_MESSAGE}
    private STATE state = STATE.START_MESSAGE;

    private final InputStream in;
    private final byte[] bndbytes;
    private final int bl;
    private final MIMEConfig config;
    private final int[] bcs = new int[128]; // BnM algo: Bad Character Shift table
    private final int[] gss;                // BnM algo : Good Suffix Shift table

    Have we parsed the data from our InputStream yet?
/**
     * Have we parsed the data from our InputStream yet?
     */
    private boolean parsed;

    /*
     * Read and process body partsList until we see the
     * terminating boundary line (or EOF).
     */
    private boolean done = false;

    private boolean eof;
    private final int capacity;
    private byte[] buf;
    private int len;
    private boolean bol;        // beginning of the line

    /*
     * Parses the MIME content. At the EOF, it also closes input stream
     */
    MIMEParser(InputStream in, String boundary, MIMEConfig config) {
        this.in = in;
        this.bndbytes = getBytes("--"+boundary);
        bl = bndbytes.length;
        this.config = config;
        gss = new int[bl];
        compileBoundaryPattern();

        // \r\n + boundary + "--\r\n" + lots of LWSP
        capacity = config.chunkSize+2+bl+4+NO_LWSP;
        createBuf(capacity);
    }

    Returns iterator for the parsing events. Use the iterator to advance
the parsing.
Returns: iterator for parsing events/**
     * Returns iterator for the parsing events. Use the iterator to advance
     * the parsing.
     *
     * @return iterator for parsing events
     */
    @Override
    public Iterator<MIMEEvent> iterator() {
        return new MIMEEventIterator();
    }

    class MIMEEventIterator implements Iterator<MIMEEvent> {

        @Override
        public boolean hasNext() {
            return !parsed;
        }

        @Override
        public MIMEEvent next() {

            if (parsed) {
                throw new NoSuchElementException();
            }

            switch(state) {
                case START_MESSAGE :
                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_MESSAGE);}
                    state = STATE.SKIP_PREAMBLE;
                    return MIMEEvent.START_MESSAGE;

                case SKIP_PREAMBLE :
                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.SKIP_PREAMBLE);}
                    skipPreamble();
                    // fall through
                case START_PART :
                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.START_PART);}
                    state = STATE.HEADERS;
                    return MIMEEvent.START_PART;

                case HEADERS :
                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.HEADERS);}
                    InternetHeaders ih = readHeaders();
                    state = STATE.BODY;
                    bol = true;
                    return new MIMEEvent.Headers(ih);

                case BODY :
                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.BODY);}
                    ByteBuffer buf = readBody();
                    bol = false;
                    return new MIMEEvent.Content(buf);

                case END_PART :
                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_PART);}
                    if (done) {
                        state = STATE.END_MESSAGE;
                    } else {
                        state = STATE.START_PART;
                    }
                    return MIMEEvent.END_PART;

                case END_MESSAGE :
                    if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "MIMEParser state={0}", STATE.END_MESSAGE);}
                    parsed = true;
                    return MIMEEvent.END_MESSAGE;

                default :
                    throw new MIMEParsingException("Unknown Parser state = "+state);
            }
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    Collects the headers for the current part by parsing mesage stream.
Returns: headers for the current part/**
     * Collects the headers for the current part by parsing mesage stream.
     *
     * @return headers for the current part
     */
    private InternetHeaders readHeaders() {
        if (!eof) {
            fillBuf();
        }
        return new InternetHeaders(new LineInputStream());
    }

    Reads and saves the part of the current attachment part's content.
At the end of this method, buf should have the remaining data
at index 0.
Returns: a chunk of the part's content/**
     * Reads and saves the part of the current attachment part's content.
     * At the end of this method, buf should have the remaining data
     * at index 0.
     *
     * @return a chunk of the part's content
     *
     */
    private ByteBuffer readBody() {
        if (!eof) {
            fillBuf();
        }
        int start = match(buf, 0, len);     // matches boundary
        if (start == -1) {
            // No boundary is found
            assert eof || len >= config.chunkSize;
            int chunkSize = eof ? len : config.chunkSize;
            if (eof) {
                done = true;
                throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
            }
            return adjustBuf(chunkSize, len-chunkSize);
        }
        // Found boundary.
        // Is it at the start of a line ?
        int chunkLen = start;
        if (bol && start == 0) {
            // nothing to do
        } else if (start > 0 && (buf[start-1] == '\n' || buf[start-1] =='\r')) {
            --chunkLen;
            if (buf[start-1] == '\n' && start >1 && buf[start-2] == '\r') {
                --chunkLen;
            }
        } else {
           return adjustBuf(start+1, len-start-1);  // boundary is not at beginning of a line
        }

        if (start+bl+1 < len && buf[start+bl] == '-' && buf[start+bl+1] == '-') {
            state = STATE.END_PART;
            done = true;
            return adjustBuf(chunkLen, 0);
        }

        // Consider all the whitespace in boundary+whitespace+"\r\n"
        int lwsp = 0;
        for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
            ++lwsp;
        }

        // Check for \n or \r\n in boundary+whitespace+"\n" or boundary+whitespace+"\r\n"
        if (start+bl+lwsp < len && buf[start+bl+lwsp] == '\n') {
            state = STATE.END_PART;
            return adjustBuf(chunkLen, len-start-bl-lwsp-1);
        } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp] == '\r' && buf[start+bl+lwsp+1] == '\n') {
            state = STATE.END_PART;
            return adjustBuf(chunkLen, len-start-bl-lwsp-2);
        } else if (start+bl+lwsp+1 < len) {
            return adjustBuf(chunkLen+1, len-chunkLen-1);       // boundary string in a part data
        } else if (eof) {
            done = true;
            throw new MIMEParsingException("Reached EOF, but there is no closing MIME boundary.");
        }

        // Some more data needed to determine if it is indeed a proper boundary
        return adjustBuf(chunkLen, len-chunkLen);
    }

    Returns a chunk from the original buffer. A new buffer is
created with the remaining bytes.
Params: chunkSize – create a chunk with these many bytes
remaining – bytes from the end of the buffer that need to be copied to
       the beginning of the new buffer
Returns: chunk/**
     * Returns a chunk from the original buffer. A new buffer is
     * created with the remaining bytes.
     *
     * @param chunkSize create a chunk with these many bytes
     * @param remaining bytes from the end of the buffer that need to be copied to
     *        the beginning of the new buffer
     * @return chunk
     */
    private ByteBuffer adjustBuf(int chunkSize, int remaining) {
        assert buf != null;
        assert chunkSize >= 0;
        assert remaining >= 0;

        byte[] temp = buf;
        // create a new buf and adjust it without this chunk
        createBuf(remaining);
        System.arraycopy(temp, len-remaining, buf, 0, remaining);
        len = remaining;

        return ByteBuffer.wrap(temp, 0, chunkSize);
    }

    private void createBuf(int min) {
        buf = new byte[min < capacity ? capacity : min];
    }

    Skips the preamble to find the first attachment part
/**
     * Skips the preamble to find the first attachment part
     */
    private void skipPreamble() {

        while(true) {
            if (!eof) {
                fillBuf();
            }
            int start = match(buf, 0, len);     // matches boundary
            if (start == -1) {
                // No boundary is found
                if (eof) {
                    throw new MIMEParsingException("Missing start boundary");
                } else {
                    adjustBuf(len-bl+1, bl-1);
                    continue;
                }
            }

            if (start > config.chunkSize) {
                adjustBuf(start, len-start);
                continue;
            }
            // Consider all the whitespace boundary+whitespace+"\r\n"
            int lwsp = 0;
            for(int i=start+bl; i < len && (buf[i] == ' ' || buf[i] == '\t'); i++) {
                ++lwsp;
            }
            // Check for \n or \r\n
            if (start+bl+lwsp < len && (buf[start+bl+lwsp] == '\n' || buf[start+bl+lwsp] == '\r') ) {
                if (buf[start+bl+lwsp] == '\n') {
                    adjustBuf(start+bl+lwsp+1, len-start-bl-lwsp-1);
                    break;
                } else if (start+bl+lwsp+1 < len && buf[start+bl+lwsp+1] == '\n') {
                    adjustBuf(start+bl+lwsp+2, len-start-bl-lwsp-2);
                    break;
                }
            }
            adjustBuf(start+1, len-start-1);
        }
        if (LOGGER.isLoggable(Level.FINE)) {LOGGER.log(Level.FINE, "Skipped the preamble. buffer len={0}", len);}
    }

    private static byte[] getBytes(String s) {
        char [] chars= s.toCharArray();
        int size = chars.length;
        byte[] bytes = new byte[size];

        for (int i = 0; i < size;) {
            bytes[i] = (byte) chars[i++];
        }
        return bytes;
    }

        Boyer-Moore search method. Copied from java.util.regex.Pattern.java
Pre calculates arrays needed to generate the bad character
shift and the good suffix shift. Only the last seven bits
are used to see if chars match; This keeps the tables small
and covers the heavily used ASCII range, but occasionally
results in an aliased match for the bad character shift.
/**
     * Boyer-Moore search method. Copied from java.util.regex.Pattern.java
     *
     * Pre calculates arrays needed to generate the bad character
     * shift and the good suffix shift. Only the last seven bits
     * are used to see if chars match; This keeps the tables small
     * and covers the heavily used ASCII range, but occasionally
     * results in an aliased match for the bad character shift.
     */
    private void compileBoundaryPattern() {
        int i, j;

        // Precalculate part of the bad character shift
        // It is a table for where in the pattern each
        // lower 7-bit value occurs
        for (i = 0; i < bndbytes.length; i++) {
            bcs[bndbytes[i]&0x7F] = i + 1;
        }

        // Precalculate the good suffix shift
        // i is the shift amount being considered
NEXT:   for (i = bndbytes.length; i > 0; i--) {
            // j is the beginning index of suffix being considered
            for (j = bndbytes.length - 1; j >= i; j--) {
                // Testing for good suffix
                if (bndbytes[j] == bndbytes[j-i]) {
                    // src[j..len] is a good suffix
                    gss[j-1] = i;
                } else {
                    // No match. The array has already been
                    // filled up with correct values before.
                    continue NEXT;
                }
            }
            // This fills up the remaining of optoSft
            // any suffix can not have larger shift amount
            // then its sub-suffix. Why???
            while (j > 0) {
                gss[--j] = i;
            }
        }
        // Set the guard value because of unicode compression
        gss[bndbytes.length -1] = 1;
    }

    Finds the boundary in the given buffer using Boyer-Moore algo.
Copied from java.util.regex.Pattern.java
Params: mybuf – boundary to be searched in this mybuf
off – start index in mybuf
len – number of bytes in mybuf
Returns: -1 if there is no match or index where the match starts/**
     * Finds the boundary in the given buffer using Boyer-Moore algo.
     * Copied from java.util.regex.Pattern.java
     *
     * @param mybuf boundary to be searched in this mybuf
     * @param off start index in mybuf
     * @param len number of bytes in mybuf
     *
     * @return -1 if there is no match or index where the match starts
     */
    private int match(byte[] mybuf, int off, int len) {
        int last = len - bndbytes.length;

        // Loop over all possible match positions in text
NEXT:   while (off <= last) {
            // Loop over pattern from right to left
            for (int j = bndbytes.length - 1; j >= 0; j--) {
                byte ch = mybuf[off+j];
                if (ch != bndbytes[j]) {
                    // Shift search to the right by the maximum of the
                    // bad character shift and the good suffix shift
                    off += Math.max(j + 1 - bcs[ch&0x7F], gss[j]);
                    continue NEXT;
                }
            }
            // Entire pattern matched starting at off
            return off;
        }
        return -1;
    }

    Fills the remaining buf to the full capacity
/**
     * Fills the remaining buf to the full capacity
     */
    private void fillBuf() {
        if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "Before fillBuf() buffer len={0}", len);}
        assert !eof;
        while(len < buf.length) {
            int read;
            try {
                read = in.read(buf, len, buf.length-len);
            } catch(IOException ioe) {
                throw new MIMEParsingException(ioe);
            }
            if (read == -1) {
                eof = true;
                try {
                    if (LOGGER.isLoggable(Level.FINE)) {LOGGER.fine("Closing the input stream.");}
                    in.close();
                } catch(IOException ioe) {
                    throw new MIMEParsingException(ioe);
                }
                break;
            } else {
                len += read;
            }
        }
        if (LOGGER.isLoggable(Level.FINER)) {LOGGER.log(Level.FINER, "After fillBuf() buffer len={0}", len);}
    }

    private void doubleBuf() {
        byte[] temp = new byte[2*len];
        System.arraycopy(buf, 0, temp, 0, len);
        buf = temp;
        if (!eof) {
            fillBuf();
        }
    }

    class LineInputStream {
        private int offset;

        /*
         * Read a line containing only ASCII characters from the input
         * stream. A line is terminated by a CR or NL or CR-NL sequence.
         * A common error is a CR-CR-NL sequence, which will also terminate
         * a line.
         * The line terminator is not returned as part of the returned
         * String. Returns null if no data is available. <p>
         *
         * This class is similar to the deprecated
         * <code>DataInputStream.readLine()</code>
         */
        public String readLine() throws IOException {

            int hdrLen = 0;
            int lwsp = 0;
            while(offset+hdrLen < len) {
                if (buf[offset+hdrLen] == '\n') {
                    lwsp = 1;
                    break;
                }
                if (offset+hdrLen+1 == len) {
                    doubleBuf();
                }
                if (offset+hdrLen+1 >= len) {   // No more data in the stream
                    assert eof;
                    return null;
                }
                if (buf[offset+hdrLen] == '\r' && buf[offset+hdrLen+1] == '\n') {
                    lwsp = 2;
                    break;
                }
                ++hdrLen;
            }
            if (hdrLen == 0) {
                adjustBuf(offset+lwsp, len-offset-lwsp);
                return null;
            }

            String hdr = new String(buf, offset, hdrLen, HEADER_ENCODING);
            offset += hdrLen+lwsp;
            return hdr;
        }

    }

}
Params:	chunkSize – create a chunk with these many bytes remaining – bytes from the end of the buffer that need to be copied to the beginning of the new buffer
Returns:	chunk
Params:	mybuf – boundary to be searched in this mybuf off – start index in mybuf len – number of bytes in mybuf
Returns:	-1 if there is no match or index where the match starts
/

java/ 10/ java.xml.bind/com/sun/xml/internal/org/jvnet/mimepull/MIMEParser.java