/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.util.*;

import org.apache.cassandra.cache.IMeasurableMemory;
import org.apache.cassandra.config.*;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.utils.ByteBufferUtil;

A clustering prefix is the unit of what a ClusteringComparator can compare.

It holds values for the clustering columns of a table (potentially only a prefix of all of them) and has a "kind" that allows us to implement slices with inclusive and exclusive bounds.

In practice, ClusteringPrefix is just the common parts to its 3 main subtype: Clustering and ClusteringBound/ClusteringBoundary, where: 1) Clustering represents the clustering values for a row, i.e. the values for it's clustering columns. 2) ClusteringBound represents a bound (start or end) of a slice (of rows) or a range tombstone. 3) ClusteringBoundary represents the threshold between two adjacent range tombstones. See those classes for more details.

/** * A clustering prefix is the unit of what a {@link ClusteringComparator} can compare. * <p> * It holds values for the clustering columns of a table (potentially only a prefix of all of them) and has * a "kind" that allows us to implement slices with inclusive and exclusive bounds. * <p> * In practice, {@code ClusteringPrefix} is just the common parts to its 3 main subtype: {@link Clustering} and * {@link ClusteringBound}/{@link ClusteringBoundary}, where: * 1) {@code Clustering} represents the clustering values for a row, i.e. the values for it's clustering columns. * 2) {@code ClusteringBound} represents a bound (start or end) of a slice (of rows) or a range tombstone. * 3) {@code ClusteringBoundary} represents the threshold between two adjacent range tombstones. * See those classes for more details. */
public interface ClusteringPrefix extends IMeasurableMemory, Clusterable { public static final Serializer serializer = new Serializer();
The kind of clustering prefix this actually is. The kind STATIC_CLUSTERING is only implemented by Clustering.STATIC_CLUSTERING and CLUSTERING is implemented by the Clustering class. The rest is used by ClusteringBound and ClusteringBoundary.
/** * The kind of clustering prefix this actually is. * * The kind {@code STATIC_CLUSTERING} is only implemented by {@link Clustering#STATIC_CLUSTERING} and {@code CLUSTERING} is * implemented by the {@link Clustering} class. The rest is used by {@link ClusteringBound} and {@link ClusteringBoundary}. */
public enum Kind { // WARNING: the ordering of that enum matters because we use ordinal() in the serialization EXCL_END_BOUND (0, -1), INCL_START_BOUND (0, -1), EXCL_END_INCL_START_BOUNDARY(0, -1), STATIC_CLUSTERING (1, -1), CLUSTERING (2, 0), INCL_END_EXCL_START_BOUNDARY(3, 1), INCL_END_BOUND (3, 1), EXCL_START_BOUND (3, 1); private final int comparison;
Return the comparison of this kind to CLUSTERING. For bounds/boundaries, this basically tells us if we sort before or after our clustering values.
/** * Return the comparison of this kind to CLUSTERING. * For bounds/boundaries, this basically tells us if we sort before or after our clustering values. */
public final int comparedToClustering; Kind(int comparison, int comparedToClustering) { this.comparison = comparison; this.comparedToClustering = comparedToClustering; }
Compares the 2 provided kind.

Note: this should be used instead of Enum<Kind>.compareTo when comparing clustering prefixes. We do not override that latter method because it is final for an enum.

/** * Compares the 2 provided kind. * <p> * Note: this should be used instead of {@link #compareTo} when comparing clustering prefixes. We do * not override that latter method because it is final for an enum. */
public static int compare(Kind k1, Kind k2) { return Integer.compare(k1.comparison, k2.comparison); }
Returns the inverse of the current kind.

This invert both start into end (and vice-versa) and inclusive into exclusive (and vice-versa).

Returns:the invert of this kind. For instance, if this kind is an exlusive start, this return an inclusive end.
/** * Returns the inverse of the current kind. * <p> * This invert both start into end (and vice-versa) and inclusive into exclusive (and vice-versa). * * @return the invert of this kind. For instance, if this kind is an exlusive start, this return * an inclusive end. */
public Kind invert() { switch (this) { case EXCL_START_BOUND: return INCL_END_BOUND; case INCL_START_BOUND: return EXCL_END_BOUND; case EXCL_END_BOUND: return INCL_START_BOUND; case INCL_END_BOUND: return EXCL_START_BOUND; case EXCL_END_INCL_START_BOUNDARY: return INCL_END_EXCL_START_BOUNDARY; case INCL_END_EXCL_START_BOUNDARY: return EXCL_END_INCL_START_BOUNDARY; default: return this; } } public boolean isBound() { switch (this) { case INCL_START_BOUND: case INCL_END_BOUND: case EXCL_START_BOUND: case EXCL_END_BOUND: return true; default: return false; } } public boolean isBoundary() { switch (this) { case INCL_END_EXCL_START_BOUNDARY: case EXCL_END_INCL_START_BOUNDARY: return true; default: return false; } } public boolean isStart() { switch (this) { case INCL_START_BOUND: case EXCL_END_INCL_START_BOUNDARY: case INCL_END_EXCL_START_BOUNDARY: case EXCL_START_BOUND: return true; default: return false; } } public boolean isEnd() { switch (this) { case INCL_END_BOUND: case EXCL_END_INCL_START_BOUNDARY: case INCL_END_EXCL_START_BOUNDARY: case EXCL_END_BOUND: return true; default: return false; } } public boolean isOpen(boolean reversed) { return isBoundary() || (reversed ? isEnd() : isStart()); } public boolean isClose(boolean reversed) { return isBoundary() || (reversed ? isStart() : isEnd()); } public Kind closeBoundOfBoundary(boolean reversed) { assert isBoundary(); return reversed ? (this == INCL_END_EXCL_START_BOUNDARY ? EXCL_START_BOUND : INCL_START_BOUND) : (this == INCL_END_EXCL_START_BOUNDARY ? INCL_END_BOUND : EXCL_END_BOUND); } public Kind openBoundOfBoundary(boolean reversed) { assert isBoundary(); return reversed ? (this == INCL_END_EXCL_START_BOUNDARY ? INCL_END_BOUND : EXCL_END_BOUND) : (this == INCL_END_EXCL_START_BOUNDARY ? EXCL_START_BOUND : INCL_START_BOUND); } } public Kind kind();
The number of values in this prefix. There can't be more values that the this is a prefix of has of clustering columns.
Returns:the number of values in this prefix.
/** * The number of values in this prefix. * * There can't be more values that the this is a prefix of has of clustering columns. * * @return the number of values in this prefix. */
public int size();
Retrieves the ith value of this prefix.
Params:
  • i – the index of the value to retrieve. Must be such that 0 <= i < size().
Returns:the ith value of this prefix. Note that a value can be null.
/** * Retrieves the ith value of this prefix. * * @param i the index of the value to retrieve. Must be such that {@code 0 <= i < size()}. * * @return the ith value of this prefix. Note that a value can be {@code null}. */
public ByteBuffer get(int i);
Adds the data of this clustering prefix to the provided digest.
Params:
  • digest – the digest to which to add this prefix.
/** * Adds the data of this clustering prefix to the provided digest. * * @param digest the digest to which to add this prefix. */
public void digest(MessageDigest digest);
The size of the data hold by this prefix.
Returns:the size of the data hold by this prefix (this is not the size of the object in memory, just the size of the data it stores).
/** * The size of the data hold by this prefix. * * @return the size of the data hold by this prefix (this is not the size of the object in memory, just * the size of the data it stores). */
public int dataSize();
Generates a proper string representation of the prefix.
Params:
  • metadata – the metadata for the table the clustering prefix is of.
Returns:a human-readable string representation fo this prefix.
/** * Generates a proper string representation of the prefix. * * @param metadata the metadata for the table the clustering prefix is of. * @return a human-readable string representation fo this prefix. */
public String toString(CFMetaData metadata);
The values of this prefix as an array.

Please note that this may or may not require an array creation. So 1) you should *not* modify the returned array and 2) it's more efficient to use size() and get unless you actually need an array.

Returns:the values for this prefix as an array.
/** * The values of this prefix as an array. * <p> * Please note that this may or may not require an array creation. So 1) you should *not* * modify the returned array and 2) it's more efficient to use {@link #size()} and * {@link #get} unless you actually need an array. * * @return the values for this prefix as an array. */
public ByteBuffer[] getRawValues(); public static class Serializer { public void serialize(ClusteringPrefix clustering, DataOutputPlus out, int version, List<AbstractType<?>> types) throws IOException { // We shouldn't serialize static clusterings assert clustering.kind() != Kind.STATIC_CLUSTERING; if (clustering.kind() == Kind.CLUSTERING) { out.writeByte(clustering.kind().ordinal()); Clustering.serializer.serialize((Clustering)clustering, out, version, types); } else { ClusteringBoundOrBoundary.serializer.serialize((ClusteringBoundOrBoundary)clustering, out, version, types); } } public void skip(DataInputPlus in, int version, List<AbstractType<?>> types) throws IOException { Kind kind = Kind.values()[in.readByte()]; // We shouldn't serialize static clusterings assert kind != Kind.STATIC_CLUSTERING; if (kind == Kind.CLUSTERING) Clustering.serializer.skip(in, version, types); else ClusteringBoundOrBoundary.serializer.skipValues(in, kind, version, types); } public ClusteringPrefix deserialize(DataInputPlus in, int version, List<AbstractType<?>> types) throws IOException { Kind kind = Kind.values()[in.readByte()]; // We shouldn't serialize static clusterings assert kind != Kind.STATIC_CLUSTERING; if (kind == Kind.CLUSTERING) return Clustering.serializer.deserialize(in, version, types); else return ClusteringBoundOrBoundary.serializer.deserializeValues(in, kind, version, types); } public long serializedSize(ClusteringPrefix clustering, int version, List<AbstractType<?>> types) { // We shouldn't serialize static clusterings assert clustering.kind() != Kind.STATIC_CLUSTERING; if (clustering.kind() == Kind.CLUSTERING) return 1 + Clustering.serializer.serializedSize((Clustering)clustering, version, types); else return ClusteringBoundOrBoundary.serializer.serializedSize((ClusteringBoundOrBoundary)clustering, version, types); } void serializeValuesWithoutSize(ClusteringPrefix clustering, DataOutputPlus out, int version, List<AbstractType<?>> types) throws IOException { int offset = 0; int clusteringSize = clustering.size(); // serialize in batches of 32, to avoid garbage when deserializing headers while (offset < clusteringSize) { // we micro-batch the headers, so that we can incur fewer method calls, // and generate no garbage on deserialization; // we piggyback on vint encoding so that, typically, only 1 byte is used per 32 clustering values, // i.e. more than we ever expect to see int limit = Math.min(clusteringSize, offset + 32); out.writeUnsignedVInt(makeHeader(clustering, offset, limit)); while (offset < limit) { ByteBuffer v = clustering.get(offset); if (v != null && v.hasRemaining()) types.get(offset).writeValue(v, out); offset++; } } } long valuesWithoutSizeSerializedSize(ClusteringPrefix clustering, int version, List<AbstractType<?>> types) { long result = 0; int offset = 0; int clusteringSize = clustering.size(); while (offset < clusteringSize) { int limit = Math.min(clusteringSize, offset + 32); result += TypeSizes.sizeofUnsignedVInt(makeHeader(clustering, offset, limit)); offset = limit; } for (int i = 0; i < clusteringSize; i++) { ByteBuffer v = clustering.get(i); if (v == null || !v.hasRemaining()) continue; // handled in the header result += types.get(i).writtenLength(v); } return result; } ByteBuffer[] deserializeValuesWithoutSize(DataInputPlus in, int size, int version, List<AbstractType<?>> types) throws IOException { // Callers of this method should handle the case where size = 0 (in all case we want to return a special value anyway). assert size > 0; ByteBuffer[] values = new ByteBuffer[size]; int offset = 0; while (offset < size) { long header = in.readUnsignedVInt(); int limit = Math.min(size, offset + 32); while (offset < limit) { values[offset] = isNull(header, offset) ? null : (isEmpty(header, offset) ? ByteBufferUtil.EMPTY_BYTE_BUFFER : types.get(offset).readValue(in, DatabaseDescriptor.getMaxValueSize())); offset++; } } return values; } void skipValuesWithoutSize(DataInputPlus in, int size, int version, List<AbstractType<?>> types) throws IOException { // Callers of this method should handle the case where size = 0 (in all case we want to return a special value anyway). assert size > 0; int offset = 0; while (offset < size) { long header = in.readUnsignedVInt(); int limit = Math.min(size, offset + 32); while (offset < limit) { if (!isNull(header, offset) && !isEmpty(header, offset)) types.get(offset).skipValue(in); offset++; } } }
Whatever the type of a given clustering column is, its value can always be either empty or null. So we at least need to distinguish those 2 values, and because we want to be able to store fixed width values without appending their (fixed) size first, we need a way to encode empty values too. So for that, every clustering prefix includes a "header" that contains 2 bits per element in the prefix. For each element, those 2 bits encode whether the element is null, empty, or none of those.
/** * Whatever the type of a given clustering column is, its value can always be either empty or null. So we at least need to distinguish those * 2 values, and because we want to be able to store fixed width values without appending their (fixed) size first, we need a way to encode * empty values too. So for that, every clustering prefix includes a "header" that contains 2 bits per element in the prefix. For each element, * those 2 bits encode whether the element is null, empty, or none of those. */
private static long makeHeader(ClusteringPrefix clustering, int offset, int limit) { long header = 0; for (int i = offset ; i < limit ; i++) { ByteBuffer v = clustering.get(i); // no need to do modulo arithmetic for i, since the left-shift execute on the modulus of RH operand by definition if (v == null) header |= (1L << (i * 2) + 1); else if (!v.hasRemaining()) header |= (1L << (i * 2)); } return header; } // no need to do modulo arithmetic for i, since the left-shift execute on the modulus of RH operand by definition private static boolean isNull(long header, int i) { long mask = 1L << (i * 2) + 1; return (header & mask) != 0; } // no need to do modulo arithmetic for i, since the left-shift execute on the modulus of RH operand by definition private static boolean isEmpty(long header, int i) { long mask = 1L << (i * 2); return (header & mask) != 0; } }
Helper class that makes the deserialization of clustering prefixes faster.

The main reason for this is that when we deserialize rows from sstables, there is many cases where we have a bunch of rows to skip at the beginning of an index block because those rows are before the requested slice. This class make sure we can answer the question "is the next row on disk before the requested slice" with as little work as possible. It does that by providing a comparison method that deserialize only what is needed to decide of the comparison.

/** * Helper class that makes the deserialization of clustering prefixes faster. * <p> * The main reason for this is that when we deserialize rows from sstables, there is many cases where we have * a bunch of rows to skip at the beginning of an index block because those rows are before the requested slice. * This class make sure we can answer the question "is the next row on disk before the requested slice" with as * little work as possible. It does that by providing a comparison method that deserialize only what is needed * to decide of the comparison. */
public static class Deserializer { private final ClusteringComparator comparator; private final DataInputPlus in; private final SerializationHeader serializationHeader; private boolean nextIsRow; private long nextHeader; private int nextSize; private ClusteringPrefix.Kind nextKind; private int deserializedSize; private ByteBuffer[] nextValues; public Deserializer(ClusteringComparator comparator, DataInputPlus in, SerializationHeader header) { this.comparator = comparator; this.in = in; this.serializationHeader = header; } public void prepare(int flags, int extendedFlags) throws IOException { if (UnfilteredSerializer.isStatic(extendedFlags)) throw new IOException("Corrupt flags value for clustering prefix (isStatic flag set): " + flags); this.nextIsRow = UnfilteredSerializer.kind(flags) == Unfiltered.Kind.ROW; this.nextKind = nextIsRow ? Kind.CLUSTERING : ClusteringPrefix.Kind.values()[in.readByte()]; this.nextSize = nextIsRow ? comparator.size() : in.readUnsignedShort(); this.deserializedSize = 0; // The point of the deserializer is that some of the clustering prefix won't actually be used (because they are not // within the bounds of the query), and we want to reduce allocation for them. So we only reuse the values array // between elements if 1) we haven't returned the previous element (if we have, nextValues will be null) and 2) // nextValues is of the proper size. Note that the 2nd condition may not hold for range tombstone bounds, but all // rows have a fixed size clustering, so we'll still save in the common case. if (nextValues == null || nextValues.length != nextSize) this.nextValues = new ByteBuffer[nextSize]; } public int compareNextTo(ClusteringBoundOrBoundary bound) throws IOException { if (bound == ClusteringBound.TOP) return -1; for (int i = 0; i < bound.size(); i++) { if (!hasComponent(i)) return nextKind.comparedToClustering; int cmp = comparator.compareComponent(i, nextValues[i], bound.get(i)); if (cmp != 0) return cmp; } if (bound.size() == nextSize) return Kind.compare(nextKind, bound.kind()); // We know that we'll have exited already if nextSize < bound.size return -bound.kind().comparedToClustering; } private boolean hasComponent(int i) throws IOException { if (i >= nextSize) return false; while (deserializedSize <= i) deserializeOne(); return true; } private boolean deserializeOne() throws IOException { if (deserializedSize == nextSize) return false; if ((deserializedSize % 32) == 0) nextHeader = in.readUnsignedVInt(); int i = deserializedSize++; nextValues[i] = Serializer.isNull(nextHeader, i) ? null : (Serializer.isEmpty(nextHeader, i) ? ByteBufferUtil.EMPTY_BYTE_BUFFER : serializationHeader.clusteringTypes().get(i).readValue(in, DatabaseDescriptor.getMaxValueSize())); return true; } private void deserializeAll() throws IOException { while (deserializeOne()) continue; } public ClusteringBoundOrBoundary deserializeNextBound() throws IOException { assert !nextIsRow; deserializeAll(); ClusteringBoundOrBoundary bound = ClusteringBoundOrBoundary.create(nextKind, nextValues); nextValues = null; return bound; } public Clustering deserializeNextClustering() throws IOException { assert nextIsRow; deserializeAll(); Clustering clustering = Clustering.make(nextValues); nextValues = null; return clustering; } public ClusteringPrefix.Kind skipNext() throws IOException { for (int i = deserializedSize; i < nextSize; i++) { if ((i % 32) == 0) nextHeader = in.readUnsignedVInt(); if (!Serializer.isNull(nextHeader, i) && !Serializer.isEmpty(nextHeader, i)) serializationHeader.clusteringTypes().get(i).skipValue(in); } deserializedSize = nextSize; return nextKind; } } }