/* Aalto XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.in;

import java.text.MessageFormat;

import javax.xml.namespace.QName;
import javax.xml.stream.Location;
import javax.xml.stream.XMLStreamException;

import org.codehaus.stax2.typed.Base64Variant;
import org.codehaus.stax2.typed.TypedArrayDecoder;
import org.codehaus.stax2.typed.TypedValueDecoder;
import org.codehaus.stax2.typed.TypedXMLStreamException;

import org.codehaus.stax2.ri.typed.CharArrayBase64Decoder;
import org.codehaus.stax2.ri.typed.ValueDecoderFactory;


import com.fasterxml.aalto.impl.ErrorConsts;
import com.fasterxml.aalto.util.DataUtil;


/**
 * Object used by the tokenizer to collect and store information
 * about attributes, specifically, names and values.
 */
public final class AttributeCollector
{
    private final static int INT_SPACE = 0x0020;
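
    /* Illustrative usage sketch, assuming a scanner drives the collector
     * roughly as the method contracts below describe ("reportError" and the
     * surrounding error handling are hypothetical, not part of this class):
     *
     *   char[] buf = attrCollector.startNewValue(attrName, 0);
     *   // ... copy value characters into buf, switching to the array
     *   // returned by valueBufferFull() whenever buf runs out of room ...
     *   int count = attrCollector.finishLastValue(endOffset);
     *   if (count < 0) { // duplicate attribute found
     *       reportError(attrCollector.getErrorMsg());
     *   }
     *   String value = attrCollector.getValue(nsUri, localName);
     */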

    /**
     * Let's guess that most of the time there won't be more than
     * 12 attributes. Since the underlying buffer will be expanded
     * as necessary, exact value is chosen to minimize overhead
     * rather than eliminate any resizing.
     */
private final static int DEFAULT_ENTRY_COUNT = 12;

    /**
     * The default length of the value buffer is also chosen more
     * to minimize overhead than to eliminate all need for resizing.
     */
    private final static int DEFAULT_BUFFER_LENGTH = 120;

    // // // Configuration

    final ReaderConfig _config;

    // // // State: actual collected attributes

    /**
     * Number of attributes currently held by this collector.
     */
    private int _attrCount;

    private PName[] _names = null;

    /**
     * Consecutive character array in which attribute values are
     * concatenated.
     */
    private char[] _valueBuffer = null;

    // // // State: hash table (-like structure) for attributes

    /**
     * Int-based compact data structure that contains mapping from
     * attribute names to attribute indexes in the main attribute name array.
     *<p>
     * Data structure contains two separate areas; main hash area (with
     * size <code>_hashAreaSize</code>), and remaining spillover area
     * that follows hash area up until (but not including)
     * <code>_spillAreaEnd</code> index.
     * Main hash area only contains indexes (index+1; 0 signifying empty slot)
     * to actual attributes; spillover area has both hash and index for
     * any spilled entry. Spilled entries are simply stored in order
     * added, and need to be searched using linear search. In case of both
     * primary hash hits and spills, eventual comparison with the local
     * name needs to be done with actual name array.
     */
protected int[] _attrMap = null;
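
    /* Layout sketch for _attrMap (illustrative only; the numbers are
     * made up, not taken from the code):
     *
     *   slots 0 .. _hashAreaSize-1:              attrIndex+1, or 0 for an empty slot
     *   slots _hashAreaSize .. _spillAreaEnd-1:  (hash, attrIndex) pairs
     *
     * For example, with a hash area of 8 slots and one spilled attribute
     * at index 5 whose bound hash is H, slots 8 and 9 hold H and 5, and
     * _spillAreaEnd is 10.
     */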

    /**
     * Size of hash area in <code>_attrMap</code>; generally at least 20%
     * more than number of attributes (<code>_attrCount</code>).
     */
protected int _hashAreaSize;

    /**
     * Pointer to int slot right after last spill entry, in
     * <code>_attrMap</code> array.
     */
    protected int _spillAreaEnd;

    // // // State: work-in-progress:

    /**
     * Array that contains ending offsets of the values in the shared
     * buffer. Each entry contains the character offset right after the
     * end of the matching value; entry 0, for example, thereby also
     * gives the starting offset of value 1.
     */
private int[] _valueOffsets = null;
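
    /* Worked example (attribute values chosen purely for illustration):
     * if three attributes with values "a", "bc" and "def" have been
     * collected, _valueBuffer starts with the characters "abcdef" and
     * the first three entries of _valueOffsets are { 1, 3, 6 }.
     */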

    /**
     * Flag used to indicate that all attribute values for an element
     * have been parsed, and that next call to <code>startNewValue</code>
     * should reset the value structures.
     */
private boolean _needToResetValues = true;

    /**
     * For some errors, we will have to temporarily store the error
     * message, to be thrown at a later point.
     */
    private String _errorMsg = null;

    // // // Temporary storage for optimizations

    /**
     * Concatenated String that contains all the attribute values
     * for the element. Allows some buffer reuse, and should result
     * in slight speed optimization, for elements with lots of
     * attributes that are usually all (or none) accessed.
     */
    private String _allAttrValues = null;

    /*
    /**********************************************************************
    /* Life-cycle methods (creation, further construction)
    /**********************************************************************
     */

    protected AttributeCollector(ReaderConfig cfg) {
        _config = cfg;
        _attrCount = 0;
    }

    /**
     * Method called by the parser right after attribute name has been
     * parsed, but before value has been parsed.
     *
     * @return Underlying character buffer to use for storing attribute
     *   value characters
     */
    public char[] startNewValue(PName attrName, int currOffset)
    {
        int count;

        if (_needToResetValues) {
            _needToResetValues = false;
            _attrCount = count = 0;
            _allAttrValues = null;
            if (_valueBuffer == null) { // first time for this instance
                _names = new PName[DEFAULT_ENTRY_COUNT];
                _valueBuffer = new char[DEFAULT_BUFFER_LENGTH];
                _valueOffsets = new int[DEFAULT_ENTRY_COUNT];
            }
        } else {
            // Not enough room for a new entry?
            count = _attrCount;
            if (count >= _valueOffsets.length) {
                int[] oldVal = _valueOffsets;
                PName[] oldNames = _names;
                int oldLen = oldVal.length;
                int newLen = oldLen + oldLen;
                _valueOffsets = new int[newLen];
                _names = new PName[newLen];
                for (int i = 0; i < oldLen; ++i) {
                    _valueOffsets[i] = oldVal[i];
                    _names[i] = oldNames[i];
                }
            }
            if (count > 0) { // no predecessor for the first entry
                _valueOffsets[count-1] = currOffset;
            }
        }
        _names[count] = attrName;
        ++_attrCount;
        return _valueBuffer;
    }

    public char[] continueValue() {
        return _valueBuffer;
    }
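
    /* Note on offset bookkeeping (an observation about the code above,
     * not additional API): the currOffset passed when attribute N+1 is
     * started is recorded as the end offset of attribute N's value in the
     * shared buffer; the end offset of the last value is only recorded
     * by finishLastValue().
     */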

    /**
     * Method called after all attribute entries have been parsed,
     * and thus the end of the last value in the buffer is known.
     *
     * @return Number of attributes collected
     */
    public final int finishLastValue(int endingOffset)
    {
        // Did we get any values?
        if (_needToResetValues) { // nope
            return 0;
        }
        _needToResetValues = true; // so it'll get reset next time a value is started
        // Since a previous startNewValue checked buffers, no check needed
        int count = _attrCount;
        _valueOffsets[count-1] = endingOffset;

        /* So far so good. But now, also need to ensure there are no
         * duplicates. This also allows us to create a hash for efficient
         * access by name as a side effect. Since hash table building
         * overhead is somewhat significant, let's only use it for 3 or
         * more attributes.
         */
        if (count < 3) {
            _hashAreaSize = 0;
            if (count == 2) {
                PName[] names = _names;
                if (names[0].boundEquals(names[1])) {
                    noteDupAttr(0, 1);
                    return -1;
                }
            }
            return count;
        }
        return finishLastValue2();
    }

    public final int finishLastValue2()
    {
        int count = _attrCount;
        PName[] names = _names;

        // Ok, nope, better use a hash:
        /* Ok, finally, let's create attribute map, to allow efficient
         * access by prefix+localname combination. Could do it on-demand,
         * but this way we can check for duplicates right away.
         */
        int[] map = _attrMap;

        /* What's minimum size to contain at most 80% full hash area,
         * plus 1/8 spill area (12.5% spilled entries, two ints each)?
         * Since we'll need 8 for 4 entries and up, and minimum to get
         * here is 3 entries, let's just skip 4 entry map...
         */
        int hashCount = 8;
        {
            int min = count + (count >> 2); // == 80% fill rate
            /* Need to get 2^N size that can contain all elements, with
             * 80% fill rate
             */
            while (hashCount < min) {
                hashCount += hashCount; // 2x
            }
            // And then add the spill area
            _hashAreaSize = hashCount;
            min = hashCount + (hashCount >> 4); // 12.5 x 2 ints
            if (map == null || map.length < min) {
                map = new int[min];
            } else {
                /* Need to clear old hash entries (if any). But note that
                 * spilled entries we can leave alone -- they are just ints,
                 * and get overwritten if and as needed
                 */
                map[0] = map[1] = map[2] = map[3] = map[4] = map[5] = map[6] = map[7] = 0;
                for (int i = 8; i < hashCount; ++i) {
                    map[i] = 0;
                }
            }
        }
        {
            int mask = hashCount-1;
            int spillIndex = hashCount;

            // Ok, array's fine, let's hash 'em in!
            for (int i = 0; i < count; ++i) {
                PName newName = names[i];
                int hash = newName.boundHashCode();
                int index = hash & mask;
                // Hash slot available?
                int oldNameIndex = map[index];
                if (oldNameIndex == 0) { // yup
                    map[index] = i+1; // since 0 is marker
                } else { // nope, collision, need to spill
                    --oldNameIndex; // to unmask 0 etc
                    // But first, is it a dup?
                    if (names[oldNameIndex].boundEquals(newName)) {
                        // Only first collision needs to be reported
                        if (_errorMsg == null) {
                            noteDupAttr(oldNameIndex, i);
                        }
                        /* let's still continue to build hash, even if there's
                         * collision; to keep data as consistent (and accessible)
                         * as possible
                         */
                    }
                    /* Is there room to spill into? (need 2 int slots;
                     * one for hash, the other for index)
                     */
                    if ((spillIndex + 1) >= map.length) {
                        // Let's just add room for 4 spills...
                        map = DataUtil.growArrayBy(map, 8);
                    }
                    // Let's first ensure we aren't adding a dup:
                    for (int j = hashCount; j < spillIndex; j += 2) {
                        if (map[j] == hash) {
                            oldNameIndex = map[j+1];
                            if (names[oldNameIndex].boundEquals(newName)) {
                                if (_errorMsg == null) {
                                    noteDupAttr(oldNameIndex, i);
                                }
                                break;
                            }
                        }
                    }
                    map[spillIndex++] = hash;
                    map[spillIndex++] = i; // no need to mask 0
                }
            }
            _spillAreaEnd = spillIndex;
        }
        _attrMap = map;
        return (_errorMsg == null) ? count : -1;
    }
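
    /* Sizing example for the hash area built above (numbers are purely
     * illustrative): with count = 10 attributes, min = 10 + (10 >> 2) = 12,
     * so hashCount doubles from 8 to 16; the map is then sized to at least
     * 16 + (16 >> 4) = 17 ints, the tail forming the initial spill area.
     */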

    /**
     * Method called by the owner when the value buffer is full; expands
     * the buffer and returns the new buffer to continue writing into.
     */
    public char[] valueBufferFull()
    {
        /* Let's just double the size as necessary? Could also grow
         * by less (50%?)... but shouldn't greatly matter
         */
        _valueBuffer = DataUtil.growArrayBy(_valueBuffer, _valueBuffer.length);
        return _valueBuffer;
    }

    /*
    /**********************************************************************
    /* Accessors
    /**********************************************************************
     */

    public final int getCount() { return _attrCount; }

    public final PName getName(int index) { return _names[index]; }

    public final QName getQName(int index) {
        return _names[index].constructQName();
    }

    public String getValue(int index)
    {
        int count = _attrCount;
        /* Note: no checks, caller is to ensure index is ok. Acceptable
         * since it's not externally exposed
         */
        if (_allAttrValues == null) {
            int len = _valueOffsets[count-1];
            _allAttrValues = (len == 0) ? "" : new String(_valueBuffer, 0, len);
        }
        if (index == 0) {
            if (count == 1) { // Degenerate case; only one substring?
                return _allAttrValues;
            }
            int len = _valueOffsets[0];
            return (len == 0) ? "" : _allAttrValues.substring(0, len);
        }
        /* !!! 11-Nov-2006, tatus: Should we cache constructed value?
         *   Might be worth the trouble
         */
        int start = _valueOffsets[index-1];
        int end = _valueOffsets[index];
        return (start == end) ? "" : _allAttrValues.substring(start, end);
    }

    public String getValue(String nsUri, String localName)
    {
        int ix = findIndex(nsUri, localName);
        return (ix >= 0) ? getValue(ix) : null;
    }

    public int findIndex(String nsUri, String localName)
    {
        int hashSize = _hashAreaSize;

        // No hash? Linear search, then:
        if (hashSize < 1) {
            for (int i = 0, len = _attrCount; i < len; ++i) {
                PName curr = _names[i];
                if (curr.boundEquals(nsUri, localName)) {
                    return i;
                }
            }
            return -1;
        }

        // Need to/can use hash... primary hit?
        int hash = PName.boundHashCode(nsUri, localName);
        int ix = _attrMap[hash & (hashSize-1)];

        if (ix > 0) { // has primary entry, does it match?
            --ix;
            // Is primary candidate match?
            if (_names[ix].boundEquals(nsUri, localName)) {
                return ix;
            }
            /* Nope, need to traverse spill list, which has 2 entries for
             * each spilled attribute id; first for hash value, second index.
             */
            for (int i = hashSize, len = _spillAreaEnd; i < len; i += 2) {
                if (_attrMap[i] != hash) {
                    continue;
                }
                /* Note: spill indexes are not off-by-one, since there's
                 * no need to mask 0
                 */
                ix = _attrMap[i+1];
                if (_names[ix].boundEquals(nsUri, localName)) {
                    return ix;
                }
            }
        }
        return -1;
    }

    public String getErrorMsg() { return _errorMsg; }

    /*
    /**********************************************************************
    /* Type-safe accessors to support TypedXMLStreamReader
    /**********************************************************************
     */

    public final void decodeValue(int index, TypedValueDecoder dec)
        throws IllegalArgumentException
    {
        if (index < 0 || index >= _attrCount) {
            throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes");
        }
        // No cached String values, better just pass char array ref
        int start, end;

        if (index == 0) {
            start = 0;
            end = _valueOffsets[0];
        } else {
            start = _valueOffsets[index-1];
            end = _valueOffsets[index];
        }
        // Nonetheless, must trim before passing the value
        final char[] buf = _valueBuffer;
        while (true) {
            if (start >= end) {
                dec.handleEmptyValue();
                return;
            }
            if (!isSpace(buf[start])) {
                break;
            }
            ++start;
        }
        // Trailing space?
        while (--end > start && isSpace(buf[end])) { }
        dec.decode(buf, start, end+1);
    }

    public final int decodeValues(int index, TypedArrayDecoder dec,
            XmlScanner scanner)
        throws XMLStreamException
    {
        if (index < 0 || index >= _attrCount) {
            throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes");
        }
        int start, end;

        if (index == 0) {
            start = 0;
            end = _valueOffsets[0];
        } else {
            start = _valueOffsets[index-1];
            end = _valueOffsets[index];
        }
        return decodeValues(dec, _valueBuffer, start, end, scanner);
    }

    private final int decodeValues(TypedArrayDecoder dec,
            final char[] buf, int ptr, final int end,
            final XmlScanner scanner)
        throws XMLStreamException
    {
        int start = ptr;
        int count = 0;

        try {
            decode_loop:
            while (ptr < end) {
                // First, any space to skip?
                while (buf[ptr] <= INT_SPACE) {
                    if (++ptr >= end) {
                        break decode_loop;
                    }
                }
                // Then let's figure out non-space char (token)
                start = ptr;
                ++ptr;
                while (ptr < end && buf[ptr] > INT_SPACE) {
                    ++ptr;
                }
                int tokenEnd = ptr;
                ++ptr; // to skip trailing space (or, beyond end)
                // Ok, decode... any more room?
                ++count;
                if (dec.decodeValue(buf, start, tokenEnd)) {
                    if (!checkExpand(dec)) {
                        break;
                    }
                }
            }
        } catch (IllegalArgumentException iae) {
            // Need to convert to a checked stream exception
            Location loc = scanner.getCurrentLocation();
            String lexical = new String(buf, start, (ptr-start));
            throw new TypedXMLStreamException(lexical, iae.getMessage(), loc, iae);
        }
        return count;
    }

    public byte[] decodeBinaryValue(int index, Base64Variant v,
            CharArrayBase64Decoder dec, XmlScanner scanner)
        throws XMLStreamException
    {
        if (index < 0 || index >= _attrCount) {
            throw new IllegalArgumentException("Invalid index "+index+"; current element has only "+_attrCount+" attributes");
        }
        int start, end;

        if (index == 0) {
            start = 0;
            end = _valueOffsets[0];
        } else {
            start = _valueOffsets[index-1];
            end = _valueOffsets[index];
        }
        int len = end-start;
        dec.init(v, true, _valueBuffer, start, end, /* addl segments */ null);
        try {
            return dec.decodeCompletely();
        } catch (IllegalArgumentException iae) {
            // Need to convert to a checked stream exception
            String lexical = new String(_valueBuffer, start, len);
            throw new TypedXMLStreamException(lexical, iae.getMessage(),
                    scanner.getCurrentLocation(), iae);
        }
    }

    private final static boolean isSpace(char c) {
        return ((int) c) <= INT_SPACE;
    }
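
    /* Decoding example (the attribute value is made up for illustration):
     * for a value " 1 22  333 ", decodeValues() above passes the decoder
     * three tokens -- "1", "22" and "333" -- skipping surrounding and
     * intervening white space, and returns 3.
     */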

    /**
     * Internal method used to see if we can expand the buffer that
     * the array decoder has. Bit messy, but simpler than having
     * separately typed instances; and called rarely so that performance
     * downside of instanceof is irrelevant.
     */
    private final boolean checkExpand(TypedArrayDecoder tad)
    {
        if (tad instanceof ValueDecoderFactory.BaseArrayDecoder) {
            ((ValueDecoderFactory.BaseArrayDecoder) tad).expand();
            return true;
        }
        return false;
    }

    /*
    /**********************************************************************
    /* Internal methods
    /**********************************************************************
     */

    private void noteDupAttr(int ix1, int ix2) {
        _errorMsg = MessageFormat.format(ErrorConsts.ERR_WF_DUP_ATTRS,
                new Object[] { _names[ix1].toString(), ix1,
                               _names[ix2].toString(), ix2 });
    }
}