package com.fasterxml.jackson.dataformat.smile;

/**
 * Constants used by {@link SmileGenerator} and {@link SmileParser}.
 * <p>
 * This class only holds static constants (thresholds, marker bytes, header
 * bytes/bits, token prefixes and a UTF-8 lead-byte lookup table); it carries
 * no instance state.
 *
 * @author tatu
 */
public final class SmileConstants
{
    /*
    /**********************************************************
    /* Thresholds
    /**********************************************************
     */

    /**
     * Encoding has special "short" forms for value Strings that can
     * be represented by 64 bytes of UTF-8 or less.
     */
    public static final int MAX_SHORT_VALUE_STRING_BYTES = 64;

    /**
     * Maximum byte length for short ASCII names is 64.
     */
    public static final int MAX_SHORT_NAME_ASCII_BYTES = 64;

    /**
     * Maximum byte length for short non-ASCII names is slightly
     * less due to having to reserve bytes 0xF8 and above (but
     * we get one more as values 0 and 1 are not valid).
     */
    public static final int MAX_SHORT_NAME_UNICODE_BYTES = 56;

    /**
     * Regardless of ASCII/non-ASCII aspect, maximum byte length for any
     * short name is then 64 bytes.
     */
    public static final int MAX_SHORT_NAME_ANY_BYTES = 64;

    /**
     * Longest back reference we use for field names is 10 bits; no point
     * in keeping much more around.
     */
    public static final int MAX_SHARED_NAMES = 1024;

    /**
     * Longest back reference we use for short shared String values is 10 bits,
     * so up to {@code (1 << 10)} values to keep track of.
     */
    public static final int MAX_SHARED_STRING_VALUES = 1024;

    /**
     * Also: whereas we can refer to names of any length, we will only consider
     * text values that are considered "tiny" or "short" (ones encoded with
     * length prefix); this value thereby has to be maximum length of Strings
     * that can be encoded as such.
     */
    public static final int MAX_SHARED_STRING_LENGTH_BYTES = 65;

    /**
     * And to make encoding logic tight and simple, we can always
     * require that output buffer has this amount of space
     * available before encoding possibly short String (3 bytes since
     * longest UTF-8 encoded Java char is 3 bytes).
     * Two extra bytes need to be reserved as well; first for token indicator,
     * and second for terminating null byte (in case it's not a short String after all).
     * <p>
     * NOTE(review): the expression below adds only ONE extra byte on top of
     * {@code 3 * 65}, not the two described above; value is intentionally left
     * unchanged (it is a compile-time constant inlined by callers) -- confirm
     * which of the two is intended.
     */
    public static final int MIN_BUFFER_FOR_POSSIBLE_SHORT_STRING = 1 + (3 * 65);

    /*
    /**********************************************************
    /* Byte markers
    /**********************************************************
     */

    /**
     * We need a byte marker to denote end of variable-length Strings. Although
     * null byte is commonly used, let's try to avoid using it since it can't
     * be embedded in Web Sockets content (similarly, 0xFF can't). There are
     * multiple candidates for bytes UTF-8 can not have; 0xFC is chosen to
     * allow reasonable ordering (highest values meaning most significant
     * framing function; 0xFF being end-of-content and so on).
     */
    public static final int INT_MARKER_END_OF_STRING = 0xFC;

    /**
     * Same marker as {@link #INT_MARKER_END_OF_STRING}, narrowed to a
     * (signed) {@code byte}.
     */
    public static final byte BYTE_MARKER_END_OF_STRING = (byte) INT_MARKER_END_OF_STRING;

    /**
     * In addition we can use a marker to allow simple framing; splitting
     * of physical data (like file) into distinct logical sections like
     * JSON documents. 0xFF makes sense here since it is also used
     * as end marker for Web Sockets.
     */
    public static final byte BYTE_MARKER_END_OF_CONTENT = (byte) 0xFF;

    /*
    /**********************************************************
    /* Format header: put smile on your data...
    /**********************************************************
     */

    /**
     * First byte of data header (0x3A).
     */
    public static final byte HEADER_BYTE_1 = (byte) ':';

    /**
     * Second byte of data header (0x29).
     */
    public static final byte HEADER_BYTE_2 = (byte) ')';

    /**
     * Third byte of data header (0x0A, linefeed).
     */
    public static final byte HEADER_BYTE_3 = (byte) '\n';

    /**
     * Current version consists of four zero bits (nibble).
     */
    public static final int HEADER_VERSION_0 = 0x0;

    /**
     * Fourth byte of data header; contains version nibble (in the 4 most
     * significant bits), may have flags.
     */
    public static final byte HEADER_BYTE_4 = (HEADER_VERSION_0 << 4);

    /**
     * Indicator bit that indicates whether encoded content may
     * have Shared names (back references to recently encoded field
     * names). If no header available, must be
     * processed as if this was set to true.
     * If (and only if) header exists, and value is 0, can parser
     * omit storing of seen names, as it is guaranteed that no back
     * references exist.
     */
    public static final int HEADER_BIT_HAS_SHARED_NAMES = 0x01;

    /**
     * Indicator bit that indicates whether encoded content may
     * have shared String values (back references to recently encoded
     * 'short' String values, where short is defined as 64 bytes or less).
     * If no header available, can be assumed to be 0 (false).
     * If header exists, and bit value is 1, parsers have to store up
     * to 1024 most recently seen distinct short String values.
     */
    public static final int HEADER_BIT_HAS_SHARED_STRING_VALUES = 0x02;

    /**
     * Indicator bit that indicates whether encoded content may
     * contain raw (unquoted) binary values.
     * If no header available, can be assumed to be 0 (false).
     * If header exists, and bit value is 1, parser can not assume that
     * specific byte values always have default meaning (specifically,
     * content end marker 0xFF and header signature can be contained
     * in binary values)
     *<p>
     * Note that this bit being true does not automatically mean that
     * such raw binary content indeed exists; just that it may exist.
     * This because header is written before any binary data may be
     * written.
     */
    public static final int HEADER_BIT_HAS_RAW_BINARY = 0x04;

    /*
    /**********************************************************
    /* Type prefixes: 3 MSB of token byte
    /**********************************************************
     */

    public static final int TOKEN_PREFIX_INTEGER = 0x24;

    public static final int TOKEN_PREFIX_FP = 0x28;

    // Shared strings are back references for last 63 short (< 64 byte) string values
    // NOTE: 0x00 is reserved, not used with current version (may be used in future)
    public static final int TOKEN_PREFIX_SHARED_STRING_SHORT = 0x00;
    // literals are put between 0x20 and 0x3F to reserve markers (smiley), along with ints/doubles
    //public final static int TOKEN_PREFIX_MISC_NUMBERS = 0x20;

    public static final int TOKEN_PREFIX_SHARED_STRING_LONG = 0xEC;

    public static final int TOKEN_PREFIX_TINY_ASCII = 0x40;
    public static final int TOKEN_PREFIX_SMALL_ASCII = 0x60;
    public static final int TOKEN_PREFIX_TINY_UNICODE = 0x80;
    public static final int TOKEN_PREFIX_SHORT_UNICODE = 0xA0;

    // Small ints are 4-bit (-16 to +15) integer constants
    public static final int TOKEN_PREFIX_SMALL_INT = 0xC0;

    // And misc types have empty at the end too, to reserve 0xF8 - 0xFF
    public static final int TOKEN_PREFIX_MISC_OTHER = 0xE0;

    /*
    /**********************************************************
    /* Token literals, normal mode
    /**********************************************************
     */

    // First, non-structured literals

    public static final byte TOKEN_LITERAL_EMPTY_STRING = 0x20;
    public static final byte TOKEN_LITERAL_NULL = 0x21;
    public static final byte TOKEN_LITERAL_FALSE = 0x22;
    public static final byte TOKEN_LITERAL_TRUE = 0x23;

    // And then structured literals

    public static final byte TOKEN_LITERAL_START_ARRAY = (byte) 0xF8;
    public static final byte TOKEN_LITERAL_END_ARRAY = (byte) 0xF9;
    public static final byte TOKEN_LITERAL_START_OBJECT = (byte) 0xFA;
    public static final byte TOKEN_LITERAL_END_OBJECT = (byte) 0xFB;

    /*
    /**********************************************************
    /* Subtype constants for misc text/binary types
    /**********************************************************
     */

    public static final int INT_MISC_BINARY_7BIT = 0xE8;

    public static final int INT_MISC_BINARY_RAW = 0xFD;

    /**
     * Type (for misc, other) used for
     * variable length UTF-8 encoded text, when it is known to only contain ASCII chars.
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final byte TOKEN_MISC_LONG_TEXT_ASCII = (byte) 0xE0;

    /**
     * Type (for misc, other) used
     * for variable length UTF-8 encoded text, when it is NOT known to only contain ASCII chars
     * (which means it MAY have multi-byte characters)
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final byte TOKEN_MISC_LONG_TEXT_UNICODE = (byte) 0xE4;

    /**
     * Type (for misc, other) used
     * for "safe" (encoded by only using 7 LSB, giving 8/7 expansion ratio).
     * This is usually done to ensure that certain bytes are never included
     * in encoded data (like 0xFF)
     * Note: 2 LSB are reserved for future use; must be zeroes for now
     */
    public static final byte TOKEN_MISC_BINARY_7BIT = (byte) INT_MISC_BINARY_7BIT;

    /**
     * Raw binary data marker is specifically chosen as separate from
     * other types, since it can have significant impact on framing
     * (or rather fast scanning based on structure and framing markers).
     */
    public static final byte TOKEN_MISC_BINARY_RAW = (byte) INT_MISC_BINARY_RAW;

    /*
    /**********************************************************
    /* Modifiers for numeric entries
    /**********************************************************
     */

    /**
     * Numeric subtype (2 LSB)
     * indicating 32-bit integer (int)
     */
    public static final int TOKEN_MISC_INTEGER_32 = 0x00;

    /**
     * Numeric subtype (2 LSB)
     * indicating 64-bit integer (long)
     */
    public static final int TOKEN_MISC_INTEGER_64 = 0x01;

    /**
     * Numeric subtype (2 LSB) for
     * indicating {@link java.math.BigInteger} type.
     */
    public static final int TOKEN_MISC_INTEGER_BIG = 0x02;

    // Note: integer subtype 3 is reserved for future use

    /**
     * Numeric subtype (2 LSB) for
     * indicating 32-bit IEEE single precision floating point number.
     */
    public static final int TOKEN_MISC_FLOAT_32 = 0x00;

    /**
     * Numeric subtype (2 LSB)
     * indicating 64-bit IEEE double precision floating point number.
     */
    public static final int TOKEN_MISC_FLOAT_64 = 0x01;

    /**
     * Numeric subtype (2 LSB) for
     * indicating {@link java.math.BigDecimal} type.
     */
    public static final int TOKEN_MISC_FLOAT_BIG = 0x02;

    // Note: floating-point subtype 3 is reserved for future use

    /*
    /**********************************************************
    /* Token types for keys
    /**********************************************************
     */

    /**
     * Let's use same code for empty key as for empty String value
     */
    public static final byte TOKEN_KEY_EMPTY_STRING = 0x20;

    public static final int TOKEN_PREFIX_KEY_SHARED_LONG = 0x30;

    public static final byte TOKEN_KEY_LONG_STRING = 0x34;

    public static final int TOKEN_PREFIX_KEY_SHARED_SHORT = 0x40;

    public static final int TOKEN_PREFIX_KEY_ASCII = 0x80;

    public static final int TOKEN_PREFIX_KEY_UNICODE = 0xC0;

    /*
    /**********************************************************
    /* Basic UTF-8 decode/encode table
    /**********************************************************
     */

    /**
     * Additionally we can combine UTF-8 decoding info into similar
     * data table, indexed by unsigned byte value (0 - 255).
     * Values indicate "byte length - 1"; meaning -1 is used for
     * invalid lead bytes, 0 for single-byte codes, 1 for 2-byte codes,
     * 2 for 3-byte codes and 3 for 4-byte codes (which decode into
     * a surrogate pair).
     * <p>
     * Entries 0 - 127 are left at the array default of 0.
     * Note that the array itself is mutable; callers must treat it
     * as read-only.
     */
    public static final int[] sUtf8UnitLengths;
    static {
        int[] table = new int[256];
        // Entries below 128 keep the default 0; only the upper half needs classification
        for (int c = 128; c < 256; ++c) {
            int code;

            // We'll add number of bytes needed for decoding
            if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
                code = 1;
            } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
                code = 2;
            } else if ((c & 0xF8) == 0xF0) {
                // 4 bytes; double-char with surrogates and all...
                code = 3;
            } else {
                // And -1 seems like a good "universal" error marker...
                code = -1;
            }
            table[c] = code;
        }
        sUtf8UnitLengths = table;
    }
}