/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.db;

import java.io.DataInput;
import java.io.IOException;
import java.io.IOError;
import java.nio.ByteBuffer;
import java.security.MessageDigest;
import java.util.*;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;

import org.apache.cassandra.cql3.ColumnIdentifier;
import org.apache.cassandra.cql3.SuperColumnCompatibility;
import org.apache.cassandra.config.SchemaConstants;
import org.apache.cassandra.utils.AbstractIterator;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.PeekingIterator;

import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.ColumnDefinition;
import org.apache.cassandra.db.filter.ColumnFilter;
import org.apache.cassandra.db.filter.DataLimits;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.db.partitions.*;
import org.apache.cassandra.db.context.CounterContext;
import org.apache.cassandra.db.marshal.*;
import org.apache.cassandra.io.util.DataInputPlus;
import org.apache.cassandra.io.util.DataOutputPlus;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.utils.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static com.google.common.collect.Iterables.all;
import static org.apache.cassandra.utils.ByteBufferUtil.bytes;

/**
 * Functions to deal with the old format.
 */
public abstract class LegacyLayout
{
    private static final Logger logger = LoggerFactory.getLogger(LegacyLayout.class);
    private static final NoSpamLogger noSpamLogger = NoSpamLogger.getLogger(logger, 1L, TimeUnit.MINUTES);

    public final static int MAX_CELL_NAME_LENGTH = FBUtilities.MAX_UNSIGNED_SHORT;

    public final static int STATIC_PREFIX = 0xFFFF;

    public final static int DELETION_MASK        = 0x01;
    public final static int EXPIRATION_MASK      = 0x02;
    public final static int COUNTER_MASK         = 0x04;
    public final static int COUNTER_UPDATE_MASK  = 0x08;
    private final static int RANGE_TOMBSTONE_MASK = 0x10;

    // Used in decodeBound if the number of components in the legacy bound is greater than the clustering size,
    // indicating a complex column deletion (i.e. a collection tombstone), but the referenced column is either
    // not present in the current table metadata, or is not currently a complex column. In that case, we'll
    // check the dropped columns for the table, which should contain the previous column definition. If that
    // previous definition is also not complex (indicating that the column may have been dropped and re-added
    // with different types multiple times), we use this fake definition to ensure that the complex deletion
    // can be safely processed. The resulting deletion should be filtered out of any row created by a
    // CellGrouper by the dropped column check, but this gives us an extra level of confidence as that check
    // is timestamp based and so is fallible in the face of clock drift.
    private static final ColumnDefinition INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN =
        new ColumnDefinition("", "",
                             ColumnIdentifier.getInterned(ByteBufferUtil.EMPTY_BYTE_BUFFER, UTF8Type.instance),
                             SetType.getInstance(UTF8Type.instance, true),
                             ColumnDefinition.NO_POSITION,
                             ColumnDefinition.Kind.REGULAR);

    private LegacyLayout() {}

    public static AbstractType<?> makeLegacyComparator(CFMetaData metadata)
    {
        ClusteringComparator comparator = metadata.comparator;
        if (!metadata.isCompound())
        {
            assert comparator.size() == 1;
            return comparator.subtype(0);
        }

        boolean hasCollections = metadata.hasCollectionColumns() || metadata.hasDroppedCollectionColumns();
        List<AbstractType<?>> types = new ArrayList<>(comparator.size() + (metadata.isDense() ? 0 : 1) + (hasCollections ? 1 : 0));

        types.addAll(comparator.subtypes());

        if (!metadata.isDense())
        {
            types.add(UTF8Type.instance);

            if (hasCollections)
            {
                Map<ByteBuffer, CollectionType> defined = new HashMap<>();

                for (CFMetaData.DroppedColumn def : metadata.getDroppedColumns().values())
                    if (def.type instanceof CollectionType && def.type.isMultiCell())
                        defined.put(bytes(def.name), (CollectionType) def.type);

                for (ColumnDefinition def : metadata.partitionColumns())
                    if (def.type instanceof CollectionType && def.type.isMultiCell())
                        defined.put(def.name.bytes, (CollectionType) def.type);

                types.add(ColumnToCollectionType.getInstance(defined));
            }
        }
        return CompositeType.getInstance(types);
    }

    public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer cellname)
    throws UnknownColumnException
    {
        assert cellname != null;
        if (metadata.isSuper())
        {
            assert superColumnName != null;
            return decodeForSuperColumn(metadata, Clustering.make(superColumnName), cellname);
        }

        assert superColumnName == null;
        return decodeCellName(metadata, cellname);
    }

    private static LegacyCellName decodeForSuperColumn(CFMetaData metadata, Clustering clustering, ByteBuffer subcol)
    {
        ColumnDefinition def = metadata.getColumnDefinition(subcol);
        if (def != null)
        {
            // it's a statically defined subcolumn
            return new LegacyCellName(clustering, def, null);
        }

        def = metadata.compactValueColumn();
        assert def != null && def.type instanceof MapType;
        return new LegacyCellName(clustering, def, subcol);
    }

    public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname) throws UnknownColumnException
    {
        return decodeCellName(metadata, cellname, false);
    }

    public static LegacyCellName decodeCellName(CFMetaData metadata, ByteBuffer cellname, boolean readAllAsDynamic)
    throws UnknownColumnException
    {
        Clustering clustering = decodeClustering(metadata, cellname);

        if (metadata.isSuper())
            return decodeForSuperColumn(metadata, clustering, CompositeType.extractComponent(cellname, 1));

        if (metadata.isDense() || (metadata.isCompactTable() && readAllAsDynamic))
            return new LegacyCellName(clustering, metadata.compactValueColumn(), null);

        ByteBuffer column = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size()) : cellname;
        if (column == null)
        {
            // Tables for composite secondary indexes used to be compound but dense, but we've transformed them into
            // regular tables (non-compact ones) with no regular column (i.e. we only care about the clustering).
            // So we'll get here in that case, and what we want to return is basically a row marker.
            if (metadata.partitionColumns().isEmpty())
                return new LegacyCellName(clustering, null, null);

            // Otherwise, we shouldn't get here
            throw new IllegalArgumentException("No column name component found in cell name");
        }

        // Row marker, this is ok
        if (!column.hasRemaining())
            return new LegacyCellName(clustering, null, null);

        ColumnDefinition def = metadata.getColumnDefinition(column);
        if (metadata.isCompactTable())
        {
            if (def == null || def.isPrimaryKeyColumn())
                // If it's a compact table, it means the column is in fact a "dynamic" one
                return new LegacyCellName(Clustering.make(column), metadata.compactValueColumn(), null);
        }
        else if (def == null)
        {
            throw new UnknownColumnException(metadata, column);
        }

        ByteBuffer collectionElement = metadata.isCompound() ? CompositeType.extractComponent(cellname, metadata.comparator.size() + 1) : null;

        // Note that because static compact columns are translated to static defs in the new world order, we need to force a static
        // clustering if the definition is static (as it might not be in this case).
        return new LegacyCellName(def.isStatic() ? Clustering.STATIC_CLUSTERING : clustering, def, collectionElement);
    }

    public static LegacyBound decodeSliceBound(CFMetaData metadata, ByteBuffer bound, boolean isStart)
    {
        return decodeBound(metadata, bound, isStart, false);
    }

    public static LegacyBound decodeTombstoneBound(CFMetaData metadata, ByteBuffer bound, boolean isStart)
    {
        return decodeBound(metadata, bound, isStart, true);
    }

    private static LegacyBound decodeBound(CFMetaData metadata, ByteBuffer bound, boolean isStart, boolean isDeletion)
    {
        if (!bound.hasRemaining())
            return isStart ? LegacyBound.BOTTOM : LegacyBound.TOP;

        if (!metadata.isCompound())
        {
            // The non-compound case is a lot easier, in that there is no EOC nor collection to worry about, so
            // we deal with that first.
            return new LegacyBound(isStart ? ClusteringBound.inclusiveStartOf(bound) : ClusteringBound.inclusiveEndOf(bound), false, null);
        }

        int clusteringSize = metadata.comparator.size();

        boolean isStatic = metadata.isCompound() && CompositeType.isStaticName(bound);
        List<ByteBuffer> components = CompositeType.splitName(bound);
        byte eoc = CompositeType.lastEOC(bound);

        // if the bound we have decoded is static, the 2.2 format requires there to be N empty clusterings
        assert !isStatic || (components.size() >= clusteringSize
                             && all(components.subList(0, clusteringSize), ByteBufferUtil.EMPTY_BYTE_BUFFER::equals));

        ColumnDefinition collectionName = null;
        if (components.size() > clusteringSize)
        {
            // For a deletion, there can be more components than the clustering size only in the case this is the
            // bound of a collection range tombstone. In such a case, there is exactly one more component, and that
            // component is the name of the collection being deleted, since we do not support collection range deletions.

            // If the bound is not part of a deletion, it is from a slice query filter. The column name may be:
            //  - a valid, non-collection column; in this case we expect a single extra component
            //  - an empty buffer, representing a row marker; in this case we also expect a single extra empty component
            //  - a valid collection column and the first part of a cell path; in this case we expect exactly two extra components
            // In any of these slice cases, these items are unnecessary for the bound we construct,
            // so we can simply remove them, after corroborating we have encountered one of these scenarios.
            assert !metadata.isCompactTable() : toDebugHex(components);

            // In all cases, the element straight after the clusterings should contain the name of a column.
            if (components.size() > clusteringSize + 1)
            {
                // we accept bounds from paging state that occur inside a complex column - in this case, we expect
                // two excess components, the first of which is a column name, the second a key into the collection
                if (isDeletion)
                    throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": deletion can have at most one extra component");
                if (clusteringSize + 2 != components.size())
                    throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": complex slices require exactly two extra components");

                // decode simply to verify that we have (or may have had) a complex column; we assume the collection key is valid
                decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic);
                components.remove(clusteringSize + 1);
            }
            else if (isDeletion)
            {
                collectionName = decodeBoundLookupComplexColumn(metadata, components, clusteringSize, isStatic);
            }
            else if (components.get(clusteringSize).hasRemaining())
            {
                decodeBoundVerifySimpleColumn(metadata, components, clusteringSize, isStatic);
            }
            components.remove(clusteringSize);
        }

        boolean isInclusive;
        if (isStart)
        {
            isInclusive = eoc <= 0;
        }
        else
        {
            isInclusive = eoc >= 0;

            // for an end bound, if we only have a prefix of all the components and the final EOC is zero,
            // then it should only match up to the prefix but no further; that is, it is an inclusive bound
            // of the exact prefix but an exclusive bound of anything beyond it, so adding an empty
            // composite value ensures this behavior, see CASSANDRA-12423 for more details
            if (eoc == 0 && components.size() < clusteringSize)
            {
                components.add(ByteBufferUtil.EMPTY_BYTE_BUFFER);
                isInclusive = false;
            }
        }

        ClusteringPrefix.Kind boundKind = ClusteringBound.boundKind(isStart, isInclusive);
        ClusteringBound cb = ClusteringBound.create(boundKind, components.toArray(new ByteBuffer[components.size()]));
        return new LegacyBound(cb, isStatic, collectionName);
    }

    // finds the complex column definition associated with components.get(clusteringSize)
    // if no such column exists, or ever existed, we throw an exception; if we do not know, we return a dummy column definition
    private static ColumnDefinition decodeBoundLookupComplexColumn(CFMetaData metadata, List<ByteBuffer> components, int clusteringSize, boolean isStatic)
    {
        ByteBuffer columnNameBytes = components.get(clusteringSize);
        ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes);
        if (columnName == null || !columnName.isComplex())
        {
            columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic);

            // if no record of the column having ever existed is found, something is badly wrong
            if (columnName == null)
                throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected complex column at position " + clusteringSize);

            // if we do have a record of dropping this column but it wasn't previously complex, use a fake
            // column definition for safety (see the comment on the constant declaration for details)
            if (!columnName.isComplex())
                columnName = INVALID_DROPPED_COMPLEX_SUBSTITUTE_COLUMN;
        }

        return columnName;
    }

    // finds the simple column definition associated with components.get(clusteringSize)
    // if no such column exists, and definitely never existed, we throw an exception
    private static void decodeBoundVerifySimpleColumn(CFMetaData metadata, List<ByteBuffer> components, int clusteringSize, boolean isStatic)
    {
        ByteBuffer columnNameBytes = components.get(clusteringSize);
        ColumnDefinition columnName = metadata.getColumnDefinition(columnNameBytes);
        if (columnName == null || !columnName.isSimple())
        {
            columnName = metadata.getDroppedColumnDefinition(columnNameBytes, isStatic);

            // if no record of the column having ever existed is found, something is badly wrong
            if (columnName == null)
                throw new IllegalArgumentException("Invalid bound " + toDebugHex(components) + ": expected simple column at position " + clusteringSize);
        }
    }

    private static String toDebugHex(Collection<ByteBuffer> buffers)
    {
        return buffers.stream().map(ByteBufferUtil::bytesToHex).collect(Collectors.joining());
    }

    public static ByteBuffer encodeBound(CFMetaData metadata, ClusteringBound bound, boolean isStart)
    {
        if (bound == ClusteringBound.BOTTOM || bound == ClusteringBound.TOP || metadata.comparator.size() == 0)
            return ByteBufferUtil.EMPTY_BYTE_BUFFER;

        ClusteringPrefix clustering = bound.clustering();

        if (!metadata.isCompound())
        {
            assert clustering.size() == 1;
            return clustering.get(0);
        }

        CompositeType ctype = CompositeType.getInstance(metadata.comparator.subtypes());
        CompositeType.Builder builder = ctype.builder();
        for (int i = 0; i < clustering.size(); i++)
            builder.add(clustering.get(i));

        if (isStart)
            return bound.isInclusive() ? builder.build() : builder.buildAsEndOfRange();
        else
            return bound.isInclusive() ? builder.buildAsEndOfRange() : builder.build();
    }

    public static ByteBuffer encodeCellName(CFMetaData metadata, ClusteringPrefix clustering, ByteBuffer columnName, ByteBuffer collectionElement)
    {
        boolean isStatic = clustering == Clustering.STATIC_CLUSTERING;

        if (!metadata.isCompound())
        {
            if (isStatic)
                return columnName;

            assert clustering.size() == 1 : "Expected clustering size to be 1, but was " + clustering.size();
            return clustering.get(0);
        }

        // We use comparator.size() rather than clustering.size() because of static clusterings
        int clusteringSize = metadata.comparator.size();
        int size = clusteringSize + (metadata.isDense() ? 0 : 1) + (collectionElement == null ? 0 : 1);
        if (metadata.isSuper())
            size = clusteringSize + 1;
        ByteBuffer[] values = new ByteBuffer[size];
        for (int i = 0; i < clusteringSize; i++)
        {
            if (isStatic)
            {
                values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER;
                continue;
            }

            ByteBuffer v = clustering.get(i);
            // we can have null (only for dense compound tables for backward compatibility reasons) but that
            // means we're done and should stop there as far as building the composite is concerned.
            if (v == null)
                return CompositeType.build(Arrays.copyOfRange(values, 0, i));

            values[i] = v;
        }

        if (metadata.isSuper())
        {
            // We need to set the "column" (in thrift terms) name, i.e. the value corresponding to the subcomparator.
            // What it is depends on whether this is a cell for a declared "static" column or a "dynamic" column
            // that is part of the super-column internal map.
            assert columnName != null; // This should never be null for supercolumns, see decodeForSuperColumn() above
            values[clusteringSize] = columnName.equals(SuperColumnCompatibility.SUPER_COLUMN_MAP_COLUMN)
                                   ? collectionElement
                                   : columnName;
        }
        else
        {
            if (!metadata.isDense())
                values[clusteringSize] = columnName;
            if (collectionElement != null)
                values[clusteringSize + 1] = collectionElement;
        }

        return CompositeType.build(isStatic, values);
    }

    public static Clustering decodeClustering(CFMetaData metadata, ByteBuffer value)
    {
        int csize = metadata.comparator.size();
        if (csize == 0)
            return Clustering.EMPTY;

        if (metadata.isCompound() && CompositeType.isStaticName(value))
            return Clustering.STATIC_CLUSTERING;

        List<ByteBuffer> components = metadata.isCompound()
                                    ? CompositeType.splitName(value)
                                    : Collections.singletonList(value);

        return Clustering.make(components.subList(0, Math.min(csize, components.size())).toArray(new ByteBuffer[csize]));
    }

    public static ByteBuffer encodeClustering(CFMetaData metadata, ClusteringPrefix clustering)
    {
        if (clustering.size() == 0)
            return ByteBufferUtil.EMPTY_BYTE_BUFFER;

        if (!metadata.isCompound())
        {
            assert clustering.size() == 1;
            return clustering.get(0);
        }

        ByteBuffer[] values = new ByteBuffer[clustering.size()];
        for (int i = 0; i < clustering.size(); i++)
            values[i] = clustering.get(i);
        return CompositeType.build(values);
    }
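    /*
     * An illustrative sketch (not exercised anywhere in this class) of how the encode/decode helpers above
     * round-trip a legacy compound cell name. It assumes "metadata" describes a compound, non-dense,
     * non-super table with one clustering column and a multi-cell map column named "m":
     *
     *   ByteBuffer name = encodeCellName(metadata,
     *                                    Clustering.make(bytes("ck1")), // clustering prefix
     *                                    bytes("m"),                    // column name component
     *                                    bytes("k"));                   // collection element (map key)
     *   // "name" is a CompositeType value with 3 components, [ck1][m][k], each serialized by
     *   // CompositeType as <2-byte length><value><1-byte EOC>.
     *
     *   LegacyCellName decoded = decodeCellName(metadata, name); // may throw UnknownColumnException
     *   // decoded.clustering        -> Clustering.make(bytes("ck1"))
     *   // decoded.column            -> the ColumnDefinition for "m"
     *   // decoded.collectionElement -> bytes("k")
     */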
    /**
     * The maximum number of cells to include per partition when converting to the old format.
     * <p>
     * We already apply the limit during the actual query, but for queries that count cells and not rows (thrift
     * queries and distinct queries as far as old nodes are concerned), we may still include a little more than
     * requested because {@link DataLimits} always includes full rows. So if the limit ends in the middle of a
     * queried row, the full row will be part of our result. This would confuse old nodes however, so we make sure
     * to truncate it to what's expected before writing it on the wire.
     *
     * @param command the read command for which to determine the maximum cells per partition. This can be
     * {@code null}, in which case {@code Integer.MAX_VALUE} is returned.
     * @return the maximum number of cells per partition that should be enforced according to the read command if
     * post-query limitations are in order (see above). This will be {@code Integer.MAX_VALUE} if no such limits
     * are necessary.
     */
    private static int maxCellsPerPartition(ReadCommand command)
    {
        if (command == null)
            return Integer.MAX_VALUE;

        DataLimits limits = command.limits();

        // There are 2 types of DISTINCT queries: those that include only the partition key, and those that include
        // static columns. On old nodes, the latter expects the first row in terms of CQL count, which is what we
        // already have, and there is no additional limit to apply. The former however expects only one cell per
        // partition and relies on it (see CASSANDRA-10762).
        if (limits.isDistinct())
            return command.columnFilter().fetchedColumns().statics.isEmpty()
                 ? 1
                 : Integer.MAX_VALUE;

        switch (limits.kind())
        {
            case THRIFT_LIMIT:
            case SUPER_COLUMN_COUNTING_LIMIT:
                return limits.perPartitionCount();
            default:
                return Integer.MAX_VALUE;
        }
    }

    // For serializing to old wire format
    public static LegacyUnfilteredPartition fromUnfilteredRowIterator(ReadCommand command, UnfilteredRowIterator iterator)
    {
        // we need to extract the range tombstones, so we materialize the partition. Since this is
        // used for the on-wire format, this is no worse than it used to be.
        final ImmutableBTreePartition partition = ImmutableBTreePartition.create(iterator);

        DeletionInfo info = partition.deletionInfo();
        Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> pair = fromRowIterator(partition.metadata(), partition.iterator(), partition.staticRow());

        LegacyLayout.LegacyRangeTombstoneList rtl = pair.left;

        // Processing the cell iterator results in the LegacyRangeTombstoneList being populated, so we do this
        // before we use the LegacyRangeTombstoneList at all
        List<LegacyLayout.LegacyCell> cells = Lists.newArrayList(pair.right);

        int maxCellsPerPartition = maxCellsPerPartition(command);
        if (cells.size() > maxCellsPerPartition)
            cells = cells.subList(0, maxCellsPerPartition);

        // The LegacyRangeTombstoneList already has range tombstones for the single-row deletions and complex
        // deletions. Go through our normal range tombstones and add them to the LegacyRTL so that the range
        // tombstones all get merged and sorted properly.
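        // For illustration: a 3.0 row deletion of clustering "a" reaches this point as a RangeTombstone
        // whose deleted slice is [INCL_START(a), INCL_END(a)]; the loop below simply rewraps each such
        // slice bound in a LegacyBound (non-static, no collection name) before handing it to the legacy list.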
        if (info.hasRanges())
        {
            Iterator<RangeTombstone> rangeTombstoneIterator = info.rangeIterator(false);
            while (rangeTombstoneIterator.hasNext())
            {
                RangeTombstone rt = rangeTombstoneIterator.next();
                Slice slice = rt.deletedSlice();
                LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(slice.start(), false, null);
                LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(slice.end(), false, null);
                rtl.add(start, end, rt.deletionTime().markedForDeleteAt(), rt.deletionTime().localDeletionTime());
            }
        }

        return new LegacyUnfilteredPartition(info.getPartitionDeletion(), rtl, cells);
    }

    public static void serializeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, DataOutputPlus out, int version) throws IOException
    {
        assert version < MessagingService.VERSION_30;

        out.writeBoolean(true);

        LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition);

        UUIDSerializer.serializer.serialize(partition.metadata().cfId, out, version);
        DeletionTime.serializer.serialize(legacyPartition.partitionDeletion, out);

        legacyPartition.rangeTombstones.serialize(out, partition.metadata());

        // begin cell serialization
        out.writeInt(legacyPartition.cells.size());
        for (LegacyLayout.LegacyCell cell : legacyPartition.cells)
        {
            ByteBufferUtil.writeWithShortLength(cell.name.encode(partition.metadata()), out);
            out.writeByte(cell.serializationFlags());
            if (cell.isExpiring())
            {
                out.writeInt(cell.ttl);
                out.writeInt(cell.localDeletionTime);
            }
            else if (cell.isTombstone())
            {
                out.writeLong(cell.timestamp);
                out.writeInt(TypeSizes.sizeof(cell.localDeletionTime));
                out.writeInt(cell.localDeletionTime);
                continue;
            }
            else if (cell.isCounterUpdate())
            {
                out.writeLong(cell.timestamp);
                long count = CounterContext.instance().getUpdateCount(cell.value);
                ByteBufferUtil.writeWithLength(ByteBufferUtil.bytes(count), out);
                continue;
            }
            else if (cell.isCounter())
            {
                out.writeLong(Long.MIN_VALUE); // timestampOfLastDelete (not used, and MIN_VALUE is the default)
            }

            out.writeLong(cell.timestamp);
            ByteBufferUtil.writeWithLength(cell.value, out);
        }
    }

    // For the old wire format
    // Note: this can return null if an empty partition is serialized!
    public static UnfilteredRowIterator deserializeLegacyPartition(DataInputPlus in, int version, SerializationHelper.Flag flag, ByteBuffer key) throws IOException
    {
        assert version < MessagingService.VERSION_30;

        // This is only used in mutations, and mutations have never allowed "null" column families
        boolean present = in.readBoolean();
        if (!present)
            return null;

        CFMetaData metadata = CFMetaData.serializer.deserialize(in, version);
        LegacyDeletionInfo info = LegacyDeletionInfo.deserialize(metadata, in);
        int size = in.readInt();
        Iterator<LegacyCell> cells = deserializeCells(metadata, in, flag, size);
        SerializationHelper helper = new SerializationHelper(metadata, version, flag);
        return onWireCellstoUnfilteredRowIterator(metadata, metadata.partitioner.decorateKey(key), info, cells, false, helper);
    }

    // For the old wire format
    public static long serializedSizeAsLegacyPartition(ReadCommand command, UnfilteredRowIterator partition, int version)
    {
        assert version < MessagingService.VERSION_30;

        if (partition.isEmpty())
            return TypeSizes.sizeof(false);

        long size = TypeSizes.sizeof(true);

        LegacyLayout.LegacyUnfilteredPartition legacyPartition = LegacyLayout.fromUnfilteredRowIterator(command, partition);

        size += UUIDSerializer.serializer.serializedSize(partition.metadata().cfId, version);
        size += DeletionTime.serializer.serializedSize(legacyPartition.partitionDeletion);
        size += legacyPartition.rangeTombstones.serializedSize(partition.metadata());

        // begin cell serialization
        size += TypeSizes.sizeof(legacyPartition.cells.size());
        for (LegacyLayout.LegacyCell cell : legacyPartition.cells)
        {
            size += ByteBufferUtil.serializedSizeWithShortLength(cell.name.encode(partition.metadata()));
            size += 1; // serialization flags
            if (cell.isExpiring())
            {
                size += TypeSizes.sizeof(cell.ttl);
                size += TypeSizes.sizeof(cell.localDeletionTime);
            }
            else if (cell.isTombstone())
            {
                size += TypeSizes.sizeof(cell.timestamp);
                // localDeletionTime replaces cell.value as the body
                size += TypeSizes.sizeof(TypeSizes.sizeof(cell.localDeletionTime));
                size += TypeSizes.sizeof(cell.localDeletionTime);
                continue;
            }
            else if (cell.isCounterUpdate())
            {
                size += TypeSizes.sizeof(cell.timestamp);
                long count = CounterContext.instance().getUpdateCount(cell.value);
                size += ByteBufferUtil.serializedSizeWithLength(ByteBufferUtil.bytes(count));
                continue;
            }
            else if (cell.isCounter())
            {
                size += TypeSizes.sizeof(Long.MIN_VALUE); // timestampOfLastDelete
            }

            size += TypeSizes.sizeof(cell.timestamp);
            size += ByteBufferUtil.serializedSizeWithLength(cell.value);
        }

        return size;
    }

    // For thrift's sake
    public static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata,
                                                                DecoratedKey key,
                                                                LegacyDeletionInfo delInfo,
                                                                Iterator<LegacyCell> cells)
    {
        SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL);
        return toUnfilteredRowIterator(metadata, key, delInfo, cells, false, helper);
    }

    // For deserializing old wire format
    public static UnfilteredRowIterator onWireCellstoUnfilteredRowIterator(CFMetaData metadata,
                                                                           DecoratedKey key,
                                                                           LegacyDeletionInfo delInfo,
                                                                           Iterator<LegacyCell> cells,
                                                                           boolean reversed,
                                                                           SerializationHelper helper)
    {
        // If the table is a static compact table, the "column_metadata" are now internally encoded as
        // static. This has already been recognized by decodeCellName, but it means the cells
        // provided are not in the expected order (the "static" cells are not necessarily at the front).
        // So sort them to make sure toUnfilteredRowIterator works as expected.
        // Further, if the query is reversed, then the on-wire format still has cells in non-reversed
        // order, but we need to have them reversed in the final UnfilteredRowIterator. So reverse them.
        if (metadata.isStaticCompactTable() || reversed)
        {
            List<LegacyCell> l = new ArrayList<>();
            Iterators.addAll(l, cells);
            Collections.sort(l, legacyCellComparator(metadata, reversed));
            cells = l.iterator();
        }

        return toUnfilteredRowIterator(metadata, key, delInfo, cells, reversed, helper);
    }

    private static UnfilteredRowIterator toUnfilteredRowIterator(CFMetaData metadata,
                                                                 DecoratedKey key,
                                                                 LegacyDeletionInfo delInfo,
                                                                 Iterator<LegacyCell> cells,
                                                                 boolean reversed,
                                                                 SerializationHelper helper)
    {
        // A reducer that basically does nothing; we know the 2 merged iterators can't have conflicting atoms
        // (since we merge cells with range tombstones).
        MergeIterator.Reducer<LegacyAtom, LegacyAtom> reducer = new MergeIterator.Reducer<LegacyAtom, LegacyAtom>()
        {
            private LegacyAtom atom;

            public void reduce(int idx, LegacyAtom current)
            {
                // We're merging cells with range tombstones, so we should always only have a single atom to reduce.
                assert atom == null;
                atom = current;
            }

            protected LegacyAtom getReduced()
            {
                return atom;
            }

            protected void onKeyChange()
            {
                atom = null;
            }
        };
        List<Iterator<LegacyAtom>> iterators = Arrays.asList(asLegacyAtomIterator(cells), asLegacyAtomIterator(delInfo.inRowRangeTombstones()));
        PeekingIterator<LegacyAtom> atoms = Iterators.peekingIterator(MergeIterator.get(iterators, legacyAtomComparator(metadata), reducer));

        // Check if we have a static row
        Row staticRow = atoms.hasNext() && atoms.peek().isStatic()
                      ? getNextRow(CellGrouper.staticGrouper(metadata, helper), atoms)
                      : Rows.EMPTY_STATIC_ROW;

        Iterator<Row> rows = convertToRows(new CellGrouper(metadata, helper), atoms);
        Iterator<RangeTombstone> ranges = delInfo.deletionInfo.rangeIterator(reversed);
        return new RowAndDeletionMergeIterator(metadata, key, delInfo.deletionInfo.getPartitionDeletion(), ColumnFilter.all(metadata), staticRow, reversed, EncodingStats.NO_STATS, rows, ranges, true);
    }

    public static Row extractStaticColumns(CFMetaData metadata, DataInputPlus in, Columns statics) throws IOException
    {
        assert !statics.isEmpty();
        assert metadata.isCompactTable();

        if (metadata.isSuper())
            // TODO: there is in practice nothing to do here, but we need to handle the column_metadata for super columns somewhere else
            throw new UnsupportedOperationException();

        Set<ByteBuffer> columnsToFetch = new HashSet<>(statics.size());
        for (ColumnDefinition column : statics)
            columnsToFetch.add(column.name.bytes);

        Row.Builder builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds());
        builder.newRow(Clustering.STATIC_CLUSTERING);

        boolean foundOne = false;
        LegacyAtom atom;
        while ((atom = readLegacyAtomSkippingUnknownColumn(metadata, in)) != null)
        {
            if (atom.isCell())
            {
                LegacyCell cell = atom.asCell();
                if (!columnsToFetch.contains(cell.name.encode(metadata)))
                    continue;

                foundOne = true;
                builder.addCell(new BufferCell(cell.name.column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, null));
            }
            else
            {
                LegacyRangeTombstone tombstone = atom.asRangeTombstone();
                // TODO: we need to track tombstones and potentially ignore cells that are
                // shadowed (or even better, replace them by tombstones).
                throw new UnsupportedOperationException();
            }
        }

        return foundOne ? builder.build() : Rows.EMPTY_STATIC_ROW;
    }

    private static LegacyAtom readLegacyAtomSkippingUnknownColumn(CFMetaData metadata, DataInputPlus in)
    throws IOException
    {
        while (true)
        {
            try
            {
                return readLegacyAtom(metadata, in, false);
            }
            catch (UnknownColumnException e)
            {
                // Simply skip, as the method name implies.
            }
        }
    }

    private static Row getNextRow(CellGrouper grouper, PeekingIterator<? extends LegacyAtom> cells)
    {
        if (!cells.hasNext())
            return null;

        grouper.reset();
        while (cells.hasNext() && grouper.addAtom(cells.peek()))
        {
            // We've added the cell already in the grouper, so just skip it
            cells.next();
        }
        return grouper.getRow();
    }

    @SuppressWarnings("unchecked")
    private static Iterator<LegacyAtom> asLegacyAtomIterator(Iterator<? extends LegacyAtom> iter)
    {
        return (Iterator<LegacyAtom>) iter;
    }

    private static Iterator<Row> convertToRows(final CellGrouper grouper, final PeekingIterator<LegacyAtom> atoms)
    {
        return new AbstractIterator<Row>()
        {
            protected Row computeNext()
            {
                if (!atoms.hasNext())
                    return endOfData();

                return getNextRow(grouper, atoms);
            }
        };
    }

    public static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRowIterator(final RowIterator iterator)
    {
        return fromRowIterator(iterator.metadata(), iterator, iterator.staticRow());
    }

    private static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRowIterator(final CFMetaData metadata, final Iterator<Row> iterator, final Row staticRow)
    {
        LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10);
        Iterator<LegacyCell> cells = new AbstractIterator<LegacyCell>()
        {
            private Iterator<LegacyCell> currentRow = initializeRow();

            private Iterator<LegacyCell> initializeRow()
            {
                if (staticRow == null || staticRow.isEmpty())
                    return Collections.<LegacyLayout.LegacyCell>emptyIterator();

                Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> row = fromRow(metadata, staticRow);
                deletions.addAll(row.left);
                return row.right;
            }

            protected LegacyCell computeNext()
            {
                while (true)
                {
                    if (currentRow.hasNext())
                        return currentRow.next();

                    if (!iterator.hasNext())
                        return endOfData();

                    Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> row = fromRow(metadata, iterator.next());
                    deletions.addAll(row.left);
                    currentRow = row.right;
                }
            }
        };
        return Pair.create(deletions, cells);
    }

    private static Pair<LegacyRangeTombstoneList, Iterator<LegacyCell>> fromRow(final CFMetaData metadata, final Row row)
    {
        // convert any complex deletions or row deletion into normal range tombstones so that we can build and send
        // a proper RangeTombstoneList to legacy nodes
        LegacyRangeTombstoneList deletions = new LegacyRangeTombstoneList(new LegacyBoundComparator(metadata.comparator), 10);

        if (!row.deletion().isLive())
        {
            Clustering clustering = row.clustering();
            ClusteringBound startBound = ClusteringBound.inclusiveStartOf(clustering);
            ClusteringBound endBound = ClusteringBound.inclusiveEndOf(clustering);

            LegacyBound start = new LegacyLayout.LegacyBound(startBound, false, null);
            LegacyBound end = new LegacyLayout.LegacyBound(endBound, false, null);

            deletions.add(start, end, row.deletion().time().markedForDeleteAt(), row.deletion().time().localDeletionTime());
        }

        for (ColumnData cd : row)
        {
            ColumnDefinition col = cd.column();
            if (col.isSimple())
                continue;

            DeletionTime delTime = ((ComplexColumnData) cd).complexDeletion();
            if (!delTime.isLive())
            {
                Clustering clustering = row.clustering();

                boolean isStatic = clustering == Clustering.STATIC_CLUSTERING;
                assert isStatic == col.isStatic();

                ClusteringBound startBound = isStatic ? LegacyDeletionInfo.staticBound(metadata, true) : ClusteringBound.inclusiveStartOf(clustering);
                ClusteringBound endBound = isStatic ? LegacyDeletionInfo.staticBound(metadata, false) : ClusteringBound.inclusiveEndOf(clustering);

                LegacyLayout.LegacyBound start = new LegacyLayout.LegacyBound(startBound, isStatic, col);
                LegacyLayout.LegacyBound end = new LegacyLayout.LegacyBound(endBound, isStatic, col);

                deletions.add(start, end, delTime.markedForDeleteAt(), delTime.localDeletionTime());
            }
        }

        Iterator<LegacyCell> cells = new AbstractIterator<LegacyCell>()
        {
            private final Iterator<Cell> cells = row.cellsInLegacyOrder(metadata, false).iterator();
            // we don't have (and shouldn't have) row markers for compact tables.
            private boolean hasReturnedRowMarker = metadata.isCompactTable();

            protected LegacyCell computeNext()
            {
                if (!hasReturnedRowMarker)
                {
                    hasReturnedRowMarker = true;

                    // don't include a row marker if there's no timestamp on the primary key; this is the 3.0+
                    // equivalent of a row marker
                    if (!row.primaryKeyLivenessInfo().isEmpty())
                    {
                        LegacyCellName cellName = new LegacyCellName(row.clustering(), null, null);
                        LivenessInfo info = row.primaryKeyLivenessInfo();
                        return new LegacyCell(info.isExpiring() ? LegacyCell.Kind.EXPIRING : LegacyCell.Kind.REGULAR, cellName, ByteBufferUtil.EMPTY_BYTE_BUFFER, info.timestamp(), info.localExpirationTime(), info.ttl());
                    }
                }

                if (!cells.hasNext())
                    return endOfData();

                return makeLegacyCell(row.clustering(), cells.next());
            }
        };
        return Pair.create(deletions, cells);
    }

    private static LegacyCell makeLegacyCell(Clustering clustering, Cell cell)
    {
        LegacyCell.Kind kind;
        if (cell.isCounterCell())
            kind = LegacyCell.Kind.COUNTER;
        else if (cell.isTombstone())
            kind = LegacyCell.Kind.DELETED;
        else if (cell.isExpiring())
            kind = LegacyCell.Kind.EXPIRING;
        else
            kind = LegacyCell.Kind.REGULAR;

        CellPath path = cell.path();
        assert path == null || path.size() == 1;
        LegacyCellName name = new LegacyCellName(clustering, cell.column(), path == null ? null : path.get(0));
        return new LegacyCell(kind, name, cell.value(), cell.timestamp(), cell.localDeletionTime(), cell.ttl());
    }

    public static RowIterator toRowIterator(final CFMetaData metadata, final DecoratedKey key, final Iterator<LegacyCell> cells, final int nowInSec)
    {
        SerializationHelper helper = new SerializationHelper(metadata, 0, SerializationHelper.Flag.LOCAL);
        return UnfilteredRowIterators.filter(toUnfilteredRowIterator(metadata, key, LegacyDeletionInfo.live(), cells, false, helper), nowInSec);
    }

    public static Comparator<LegacyCell> legacyCellComparator(CFMetaData metadata)
    {
        return legacyCellComparator(metadata, false);
    }

    public static Comparator<LegacyCell> legacyCellComparator(final CFMetaData metadata, final boolean reversed)
    {
        final Comparator<LegacyCellName> cellNameComparator = legacyCellNameComparator(metadata, reversed);
        return new Comparator<LegacyCell>()
        {
            public int compare(LegacyCell cell1, LegacyCell cell2)
            {
                LegacyCellName c1 = cell1.name;
                LegacyCellName c2 = cell2.name;

                int c = cellNameComparator.compare(c1, c2);
                if (c != 0)
                    return c;

                // The actual sorting when the cellname is equal doesn't matter, we just want to make
                // sure the cells are not considered equal.
                if (cell1.timestamp != cell2.timestamp)
                    return cell1.timestamp < cell2.timestamp ? -1 : 1;

                if (cell1.localDeletionTime != cell2.localDeletionTime)
                    return cell1.localDeletionTime < cell2.localDeletionTime ? -1 : 1;

                return cell1.value.compareTo(cell2.value);
            }
        };
    }

    // Note that this doesn't exactly compare cells as they were pre-3.0 because within a row they sort columns like
    // in 3.0, that is, with simple columns before complex columns. In other words, this comparator makes sure cells
    // are in the proper order to convert them to actual 3.0 rows.
    public static Comparator<LegacyCellName> legacyCellNameComparator(final CFMetaData metadata, final boolean reversed)
    {
        return new Comparator<LegacyCellName>()
        {
            public int compare(LegacyCellName c1, LegacyCellName c2)
            {
                // Compare clustering first
                if (c1.clustering == Clustering.STATIC_CLUSTERING)
                {
                    if (c2.clustering != Clustering.STATIC_CLUSTERING)
                        return -1;
                }
                else if (c2.clustering == Clustering.STATIC_CLUSTERING)
                {
                    return 1;
                }
                else
                {
                    int c = metadata.comparator.compare(c1.clustering, c2.clustering);
                    if (c != 0)
                        return reversed ? -c : c;
                }

                // Note that when reversed, we only care about the clustering being reversed, so it's ok
                // not to take reversed into account below.

                // Then check the column name
                if (c1.column != c2.column)
                {
                    // A null for the column means it's a row marker
                    if (c1.column == null)
                        return -1;
                    if (c2.column == null)
                        return 1;

                    assert c1.column.isRegular() || c1.column.isStatic();
                    assert c2.column.isRegular() || c2.column.isStatic();
                    int cmp = c1.column.compareTo(c2.column);
                    if (cmp != 0)
                        return cmp;
                }

                assert (c1.collectionElement == null) == (c2.collectionElement == null);

                if (c1.collectionElement != null)
                {
                    AbstractType<?> colCmp = ((CollectionType) c1.column.type).nameComparator();
                    return colCmp.compare(c1.collectionElement, c2.collectionElement);
                }
                return 0;
            }
        };
    }

    private static boolean equalValues(ClusteringPrefix c1, ClusteringPrefix c2, ClusteringComparator comparator)
    {
        assert c1.size() == c2.size();
        for (int i = 0; i < c1.size(); i++)
        {
            if (comparator.compareComponent(i, c1.get(i), c2.get(i)) != 0)
                return false;
        }
        return true;
    }

    private static Comparator<LegacyAtom> legacyAtomComparator(CFMetaData metadata)
    {
        return (o1, o2) ->
        {
            // First we want to compare by clustering, but we have to be careful with range tombstones, because
            // we can have collection deletions and we want those to sort properly just before the column they
            // delete, not before the whole row.
            // We also want to special-case statics so they sort before any non-static. Note in particular that
            // this special casing is important in the case of one of the atoms being Bound.BOTTOM: we want
            // it to sort after the static as we deal with statics first in toUnfilteredAtomIterator and having
            // Bound.BOTTOM first would mess that up (note that static deletion is handled through a specific
            // static tombstone, see LegacyDeletionInfo.add()).
            if (o1.isStatic() != o2.isStatic())
                return o1.isStatic() ? -1 : 1;

            ClusteringPrefix c1 = o1.clustering();
            ClusteringPrefix c2 = o2.clustering();

            int clusteringComparison;
            if (c1.size() != c2.size() || (o1.isCell() == o2.isCell()) || !equalValues(c1, c2, metadata.comparator))
            {
                clusteringComparison = metadata.comparator.compare(c1, c2);
            }
            else
            {
                // one is a cell and one is a range tombstone, and both have the same prefix size (that is, the
                // range tombstone is either a row deletion or a collection deletion).
                LegacyRangeTombstone rt = o1.isCell() ? o2.asRangeTombstone() : o1.asRangeTombstone();
                clusteringComparison = rt.isCollectionTombstone()
                                     ? 0
                                     : metadata.comparator.compare(c1, c2);
            }

            // Note that if both are range tombstones and have the same clustering, then they are equal.
            if (clusteringComparison != 0)
                return clusteringComparison;

            if (o1.isCell())
            {
                LegacyCell cell1 = o1.asCell();
                if (o2.isCell())
                {
                    LegacyCell cell2 = o2.asCell();
                    // Check for row marker cells
                    if (cell1.name.column == null)
                        return cell2.name.column == null ? 0 : -1;
                    return cell2.name.column == null ? 1 : cell1.name.column.compareTo(cell2.name.column);
                }

                LegacyRangeTombstone rt2 = o2.asRangeTombstone();
                assert rt2.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality
                if (cell1.name.column == null)
                    return -1;

                int cmp = cell1.name.column.compareTo(rt2.start.collectionName);
                // If both are for the same column, then the RT should come first
                return cmp == 0 ? 1 : cmp;
            }
            else
            {
                assert o2.isCell();
                LegacyCell cell2 = o2.asCell();

                LegacyRangeTombstone rt1 = o1.asRangeTombstone();
                assert rt1.isCollectionTombstone(); // otherwise, we shouldn't have got a clustering equality
                if (cell2.name.column == null)
                    return 1;

                int cmp = rt1.start.collectionName.compareTo(cell2.name.column);
                // If both are for the same column, then the RT should come first
                return cmp == 0 ? -1 : cmp;
            }
        };
    }

    public static LegacyAtom readLegacyAtom(CFMetaData metadata, DataInputPlus in, boolean readAllAsDynamic)
    throws IOException, UnknownColumnException
    {
        ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in);
        if (!cellname.hasRemaining())
            return null; // END_OF_ROW

        try
        {
            int b = in.readUnsignedByte();
            return (b & RANGE_TOMBSTONE_MASK) != 0
                 ? readLegacyRangeTombstoneBody(metadata, in, cellname)
                 : readLegacyCellBody(metadata, in, cellname, b, SerializationHelper.Flag.LOCAL, readAllAsDynamic);
        }
        catch (UnknownColumnException e)
        {
            // We legitimately can get here in 2 cases:
            // 1) for system tables, because we've unceremoniously removed columns (without registering them as dropped)
            // 2) for dropped columns.
            // In any other case, there is a mismatch between the schema and the data, and we complain loudly in
            // that case. Note that if we are in a legit case of an unknown column, we want to simply skip that cell,
            // but we don't do this here and re-throw the exception because the calling code sometimes has to know
            // about this happening. This does mean code calling this method should handle this case properly.
            if (!metadata.ksName.equals(SchemaConstants.SYSTEM_KEYSPACE_NAME) && metadata.getDroppedColumnDefinition(e.columnName) == null)
                throw new IllegalStateException(String.format("Got cell for unknown column %s in sstable of %s.%s: " +
                                                              "This suggests a problem with the schema, which doesn't list " +
                                                              "this column. Even if that column was dropped, it should have " +
                                                              "been listed as such",
                                                              UTF8Type.instance.compose(e.columnName), metadata.ksName, metadata.cfName), e);

            throw e;
        }
    }

    public static LegacyCell readLegacyCell(CFMetaData metadata, DataInput in, SerializationHelper.Flag flag)
    throws IOException, UnknownColumnException
    {
        ByteBuffer cellname = ByteBufferUtil.readWithShortLength(in);
        int b = in.readUnsignedByte();
        return readLegacyCellBody(metadata, in, cellname, b, flag, false);
    }

    public static LegacyCell readLegacyCellBody(CFMetaData metadata, DataInput in, ByteBuffer cellname, int mask, SerializationHelper.Flag flag, boolean readAllAsDynamic)
    throws IOException, UnknownColumnException
    {
        // Note that we want to call decodeCellName only after we've deserialized other parts, since it can throw
        // and we want to throw only after having deserialized the full cell.
        if ((mask & COUNTER_MASK) != 0)
        {
            in.readLong(); // timestampOfLastDelete: this has been unused for a long time so we ignore it
            long ts = in.readLong();
            ByteBuffer value = ByteBufferUtil.readWithLength(in);
            if (flag == SerializationHelper.Flag.FROM_REMOTE || (flag == SerializationHelper.Flag.LOCAL && CounterContext.instance().shouldClearLocal(value)))
                value = CounterContext.instance().clearAllLocal(value);
            return new LegacyCell(LegacyCell.Kind.COUNTER, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL);
        }
        else if ((mask & EXPIRATION_MASK) != 0)
        {
            int ttl = in.readInt();
            int expiration = in.readInt();
            long ts = in.readLong();
            ByteBuffer value = ByteBufferUtil.readWithLength(in);
            return new LegacyCell(LegacyCell.Kind.EXPIRING, decodeCellName(metadata, cellname, readAllAsDynamic), value, ts, expiration, ttl);
        }
        else
        {
            long ts = in.readLong();
            ByteBuffer value = ByteBufferUtil.readWithLength(in);
            LegacyCellName name = decodeCellName(metadata, cellname, readAllAsDynamic);
            return (mask & COUNTER_UPDATE_MASK) != 0
                 ? new LegacyCell(LegacyCell.Kind.COUNTER, name, CounterContext.instance().createUpdate(ByteBufferUtil.toLong(value)), ts, Cell.NO_DELETION_TIME, Cell.NO_TTL)
                 : ((mask & DELETION_MASK) == 0
                    ? new LegacyCell(LegacyCell.Kind.REGULAR, name, value, ts, Cell.NO_DELETION_TIME, Cell.NO_TTL)
                    : new LegacyCell(LegacyCell.Kind.DELETED, name, ByteBufferUtil.EMPTY_BYTE_BUFFER, ts, ByteBufferUtil.toInt(value), Cell.NO_TTL));
        }
    }

    public static LegacyRangeTombstone readLegacyRangeTombstoneBody(CFMetaData metadata, DataInputPlus in, ByteBuffer boundname) throws IOException
    {
        LegacyBound min = decodeTombstoneBound(metadata, boundname, true);
        LegacyBound max = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false);
        DeletionTime dt = DeletionTime.serializer.deserialize(in);
        return new LegacyRangeTombstone(min, max, dt);
    }

    public static Iterator<LegacyCell> deserializeCells(final CFMetaData metadata,
                                                        final DataInput in,
                                                        final SerializationHelper.Flag flag,
                                                        final int size)
    {
        return new AbstractIterator<LegacyCell>()
        {
            private int i = 0;

            protected LegacyCell computeNext()
            {
                if (i >= size)
                    return endOfData();

                ++i;
                try
                {
                    return readLegacyCell(metadata, in, flag);
                }
                catch (UnknownColumnException e)
                {
                    // We can get here if we read a cell for a dropped column, and if that is the case,
                    // then simply ignoring the cell is fine. But also note that we ignore it if it's in the
                    // system keyspace, because for those tables we actually remove columns without registering
                    // them in the dropped columns.
                    if (metadata.ksName.equals(SchemaConstants.SYSTEM_KEYSPACE_NAME) || metadata.getDroppedColumnDefinition(e.columnName) != null)
                        return computeNext();
                    else
                        throw new IOError(e);
                }
                catch (IOException e)
                {
                    throw new IOError(e);
                }
            }
        };
    }

    public static class CellGrouper
    {
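        // Typical driving loop, as in getNextRow() above: reset(), feed atoms in order until addAtom()
        // returns false (the rejected atom belongs to the next row), then collect the row:
        //
        //   grouper.reset();
        //   while (atoms.hasNext() && grouper.addAtom(atoms.peek()))
        //       atoms.next();
        //   Row row = grouper.getRow();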
        /**
         * The fake TTL used for expired rows that have been compacted.
         */
        private static final int FAKE_TTL = 1;

        public final CFMetaData metadata;
        private final boolean isStatic;
        private final SerializationHelper helper;
        private final Row.Builder builder;
        private Clustering clustering;
        private LegacyRangeTombstone rowDeletion;
        private LegacyRangeTombstone collectionDeletion;

        public CellGrouper(CFMetaData metadata, SerializationHelper helper)
        {
            this(metadata, helper, false);
        }

        private CellGrouper(CFMetaData metadata, SerializationHelper helper, boolean isStatic)
        {
            this.metadata = metadata;
            this.isStatic = isStatic;
            this.helper = helper;
            // We cannot use a sorted builder because we don't have exactly the same ordering in 3.0 and pre-3.0.
            // More precisely, within a row, we store all simple columns before the complex ones in 3.0, whereas
            // pre-3.0 everything was sorted by cell name. Note however that the unsorted builder won't have to
            // reconcile cells, so the exact value we pass for nowInSec doesn't matter.
            this.builder = BTreeRow.unsortedBuilder(FBUtilities.nowInSeconds());
        }

        public static CellGrouper staticGrouper(CFMetaData metadata, SerializationHelper helper)
        {
            return new CellGrouper(metadata, helper, true);
        }

        public void reset()
        {
            this.clustering = null;
            this.rowDeletion = null;
            this.collectionDeletion = null;
        }

        public boolean addAtom(LegacyAtom atom)
        {
            return atom.isCell()
                 ? addCell(atom.asCell())
                 : addRangeTombstone(atom.asRangeTombstone());
        }

        public boolean addCell(LegacyCell cell)
        {
            if (clustering == null)
            {
                clustering = cell.name.clustering;
                assert !isStatic || clustering == Clustering.STATIC_CLUSTERING;
                builder.newRow(clustering);
            }
            else if (!clustering.equals(cell.name.clustering))
            {
                return false;
            }

            // Ignore shadowed cells
            if (rowDeletion != null && rowDeletion.deletionTime.deletes(cell.timestamp))
                return true;

            ColumnDefinition column = cell.name.column;
            if (column == null)
            {
                // It's the row marker
                assert !cell.value.hasRemaining();

                // In 2.1, the expired row marker cell might have been converted into a deleted one by compaction.
                // If we do not set the primary key liveness info for this row and it does not contain any regular
                // columns, the row will be empty. To avoid that, we reuse the localDeletionTime but use a fake TTL.
                // The only time in 2.x that we actually delete a row marker is in 2i tables, so in that case we do
                // want to actually propagate the row deletion. (CASSANDRA-13320)
                if (!cell.isTombstone())
                    builder.addPrimaryKeyLivenessInfo(LivenessInfo.withExpirationTime(cell.timestamp, cell.ttl, cell.localDeletionTime));
                else if (metadata.isIndex())
                    builder.addRowDeletion(Row.Deletion.regular(new DeletionTime(cell.timestamp, cell.localDeletionTime)));
                else
                    builder.addPrimaryKeyLivenessInfo(LivenessInfo.create(cell.timestamp, FAKE_TTL, cell.localDeletionTime));
            }
            else
            {
                if (collectionDeletion != null && collectionDeletion.start.collectionName.name.equals(column.name) && collectionDeletion.deletionTime.deletes(cell.timestamp))
                    return true;

                if (column.isPrimaryKeyColumn() && metadata.isCQLTable())
                {
                    noSpamLogger.warn("Illegal cell name for CQL3 table {}.{}. {} is defined as a primary key column",
                                      metadata.ksName, metadata.cfName, column.name);
                    return true;
                }

                if (helper.includes(column))
                {
                    CellPath path = null;
                    if (column.isComplex())
                    {
                        // Recalling startOfComplexColumn for every cell is a bit inefficient, but it's ok in practice
                        // and it's simpler. And since 1) this only matters for super column selection in thrift in
                        // practice and 2) it's only used during upgrade, it's probably worth keeping things simple.
                        helper.startOfComplexColumn(column);
                        path = cell.name.collectionElement == null ? null : CellPath.create(cell.name.collectionElement);
                        if (!helper.includes(path))
                            return true;
                    }
                    Cell c = new BufferCell(column, cell.timestamp, cell.ttl, cell.localDeletionTime, cell.value, path);
                    if (!helper.isDropped(c, column.isComplex()))
                        builder.addCell(c);
                    if (column.isComplex())
                    {
                        helper.endOfComplexColumn();
                    }
                }
            }
            return true;
        }

        private boolean addRangeTombstone(LegacyRangeTombstone tombstone)
        {
            if (tombstone.isRowDeletion(metadata))
                return addRowTombstone(tombstone);
            else if (tombstone.isCollectionTombstone())
                return addCollectionTombstone(tombstone);
            else
                return addGenericRangeTombstone(tombstone);
        }

        private boolean addRowTombstone(LegacyRangeTombstone tombstone)
        {
            if (clustering != null)
            {
                // If we're already in the row, there is a chance that there were two range tombstones
                // written, as the 2.x storage format does not guarantee just one range tombstone, unlike 3.x.
                // We have to make sure that the clustering matches, which would mean that the tombstone is for
                // the same row.
                if (rowDeletion != null && clustering.equals(tombstone.start.getAsClustering(metadata)))
                {
                    // If the tombstone supersedes the previous delete, we discard the previous one
                    if (tombstone.deletionTime.supersedes(rowDeletion.deletionTime))
                    {
                        builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime));
                        rowDeletion = tombstone;
                    }
                    return true;
                }

                // If we're already within a row and there was no delete written before that one, it can't be the same one
                return false;
            }

            clustering = tombstone.start.getAsClustering(metadata);
            builder.newRow(clustering);
            builder.addRowDeletion(Row.Deletion.regular(tombstone.deletionTime));
            rowDeletion = tombstone;
            return true;
        }

        private boolean addCollectionTombstone(LegacyRangeTombstone tombstone)
        {
            if (!helper.includes(tombstone.start.collectionName))
                return false; // see CASSANDRA-13109

            // The helper needs to be informed about the current complex column identifier before
            // it can perform the comparison between the recorded drop time and the RT deletion time.
            // If the RT has been superseded by a drop, we still return true as we don't want the
            // grouper to terminate yet.
            helper.startOfComplexColumn(tombstone.start.collectionName);
            if (helper.isDroppedComplexDeletion(tombstone.deletionTime))
                return true;

            if (clustering == null)
            {
                clustering = tombstone.start.getAsClustering(metadata);
                builder.newRow(clustering);
            }
            else if (!clustering.equals(tombstone.start.getAsClustering(metadata)))
            {
                return false;
            }

            builder.addComplexDeletion(tombstone.start.collectionName, tombstone.deletionTime);
            if (rowDeletion == null || tombstone.deletionTime.supersedes(rowDeletion.deletionTime))
                collectionDeletion = tombstone;

            return true;
        }

        private boolean addGenericRangeTombstone(LegacyRangeTombstone tombstone)
        {
            /*
             * We can see a non-collection, non-row deletion in two scenarios:
             *
             * 1. Most commonly, the tombstone's start bound is bigger than the current row's clustering, which means
             *    that the current row is over, and we should move on to the next row or RT;
             *
             * 2. Less commonly, the tombstone's start bound is smaller than the current row's clustering, which means
             *    that we've crossed an index boundary and are seeing a non-closed RT from the previous block, repeated;
             *    we should ignore it and stay in the current row.
             *
             * In either case, clustering should be non-null, or we shouldn't have gotten to this method at all.
             * However, to be absolutely SURE we're in case two above, we check here.
             */
            return clustering != null && metadata.comparator.compare(clustering, tombstone.start.bound.clustering()) > 0;
        }

        public Row getRow()
        {
            return builder.build();
        }
    }

    public static class LegacyUnfilteredPartition
    {
        public final DeletionTime partitionDeletion;
        public final LegacyRangeTombstoneList rangeTombstones;
        public final List<LegacyCell> cells;

        private LegacyUnfilteredPartition(DeletionTime partitionDeletion, LegacyRangeTombstoneList rangeTombstones, List<LegacyCell> cells)
        {
            this.partitionDeletion = partitionDeletion;
            this.rangeTombstones = rangeTombstones;
            this.cells = cells;
        }

        public void digest(CFMetaData metadata, MessageDigest digest)
        {
            for (LegacyCell cell : cells)
            {
                digest.update(cell.name.encode(metadata).duplicate());

                if (cell.isCounter())
                    CounterContext.instance().updateDigest(digest, cell.value);
                else
                    digest.update(cell.value.duplicate());

                FBUtilities.updateWithLong(digest, cell.timestamp);
                FBUtilities.updateWithByte(digest, cell.serializationFlags());

                if (cell.isExpiring())
                    FBUtilities.updateWithInt(digest, cell.ttl);

                if (cell.isCounter())
                {
                    // Counters used to have the timestampOfLastDelete field, which we stopped using long ago and has
                    // been hard-coded to Long.MIN_VALUE, but was still taken into account in 2.2 counter digests
                    // (to maintain backward compatibility in the first place).
                    FBUtilities.updateWithLong(digest, Long.MIN_VALUE);
                }
            }

            if (partitionDeletion.markedForDeleteAt() != Long.MIN_VALUE)
                digest.update(ByteBufferUtil.bytes(partitionDeletion.markedForDeleteAt()));

            if (!rangeTombstones.isEmpty())
                rangeTombstones.updateDigest(digest);
        }
    }

    public static class LegacyCellName
    {
        public final Clustering clustering;
        public final ColumnDefinition column;
        public final ByteBuffer collectionElement;

        private LegacyCellName(Clustering clustering, ColumnDefinition column, ByteBuffer collectionElement)
        {
            this.clustering = clustering;
            this.column = column;
            this.collectionElement = collectionElement;
        }

        public ByteBuffer encode(CFMetaData metadata)
        {
            return encodeCellName(metadata, clustering, column == null ? ByteBufferUtil.EMPTY_BYTE_BUFFER : column.name.bytes, collectionElement);
        }

        public ByteBuffer superColumnSubName()
        {
            assert collectionElement != null;
            return collectionElement;
        }

        public ByteBuffer superColumnName()
        {
            return clustering.get(0);
        }

        @Override
        public String toString()
        {
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < clustering.size(); i++)
                sb.append(i > 0 ? ":" : "").append(clustering.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(clustering.get(i)));

            return String.format("Cellname(clustering=%s, column=%s, collElt=%s)",
                                 sb.toString(),
                                 column == null ? "null" : column.name,
                                 collectionElement == null ? "null" : ByteBufferUtil.bytesToHex(collectionElement));
        }
    }

    public static class LegacyBound
    {
        public static final LegacyBound BOTTOM = new LegacyBound(ClusteringBound.BOTTOM, false, null);
        public static final LegacyBound TOP = new LegacyBound(ClusteringBound.TOP, false, null);

        public final ClusteringBound bound;
        public final boolean isStatic;
        public final ColumnDefinition collectionName;

        public LegacyBound(ClusteringBound bound, boolean isStatic, ColumnDefinition collectionName)
        {
            this.bound = bound;
            this.isStatic = isStatic;
            this.collectionName = collectionName;
        }

        public Clustering getAsClustering(CFMetaData metadata)
        {
            if (isStatic)
                return Clustering.STATIC_CLUSTERING;

            assert bound.size() == metadata.comparator.size();
            ByteBuffer[] values = new ByteBuffer[bound.size()];
            for (int i = 0; i < bound.size(); i++)
                values[i] = bound.get(i);
            return Clustering.make(values);
        }

        @Override
        public String toString()
        {
            StringBuilder sb = new StringBuilder();
            sb.append(bound.kind()).append('(');
            for (int i = 0; i < bound.size(); i++)
                sb.append(i > 0 ? ":" : "").append(bound.get(i) == null ? "null" : ByteBufferUtil.bytesToHex(bound.get(i)));
            sb.append(')');
            return String.format("Bound(%s, collection=%s)", sb.toString(), collectionName == null ? "null" : collectionName.name);
        }
    }

    public interface LegacyAtom
    {
        public boolean isCell();

        // note that for static atoms, LegacyCell and LegacyRangeTombstone behave differently here:
        // - LegacyCell returns the modern Clustering.STATIC_CLUSTERING
        // - LegacyRangeTombstone returns the 2.2 bound (i.e. N empty ByteBuffers, where N is the number of clusterings)
        // in LegacyDeletionInfo.add(), we split any LRT with a static bound out into the inRowRangeTombstones collection;
        // these are merged with regular row cells in the CellGrouper, and their clustering is obtained via
        // start.bound.getAsClustering (also, it should be impossible to issue raw static row deletions anyway)
        public ClusteringPrefix clustering();
        public boolean isStatic();

        public LegacyCell asCell();
        public LegacyRangeTombstone asRangeTombstone();
    }
/**
 * A legacy cell.
 * <p>
 * This is used as a temporary object to facilitate dealing with the legacy format; it is not
 * meant to be optimal.
 */
public static class LegacyCell implements LegacyAtom { private final static int DELETION_MASK = 0x01; private final static int EXPIRATION_MASK = 0x02; private final static int COUNTER_MASK = 0x04; private final static int COUNTER_UPDATE_MASK = 0x08; private final static int RANGE_TOMBSTONE_MASK = 0x10; public enum Kind { REGULAR, EXPIRING, DELETED, COUNTER } public final Kind kind; public final LegacyCellName name; public final ByteBuffer value; public final long timestamp; public final int localDeletionTime; public final int ttl; private LegacyCell(Kind kind, LegacyCellName name, ByteBuffer value, long timestamp, int localDeletionTime, int ttl) { this.kind = kind; this.name = name; this.value = value; this.timestamp = timestamp; this.localDeletionTime = localDeletionTime; this.ttl = ttl; } public static LegacyCell regular(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp) throws UnknownColumnException { return new LegacyCell(Kind.REGULAR, decodeCellName(metadata, superColumnName, name), value, timestamp, Cell.NO_DELETION_TIME, Cell.NO_TTL); } public static LegacyCell expiring(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, ByteBuffer value, long timestamp, int ttl, int nowInSec) throws UnknownColumnException { /* * CASSANDRA-14092: Max expiration date capping may be performed here; expiration overflow policy application * is done at {@link org.apache.cassandra.thrift.ThriftValidation#validateTtl(CFMetaData, Column)} */ return new LegacyCell(Kind.EXPIRING, decodeCellName(metadata, superColumnName, name), value, timestamp, ExpirationDateOverflowHandling.computeLocalExpirationTime(nowInSec, ttl), ttl); } public static LegacyCell tombstone(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long timestamp, int nowInSec) throws UnknownColumnException { return new LegacyCell(Kind.DELETED, decodeCellName(metadata, superColumnName, name), ByteBufferUtil.EMPTY_BYTE_BUFFER, timestamp, nowInSec, LivenessInfo.NO_TTL); } public static LegacyCell counterUpdate(CFMetaData metadata, ByteBuffer superColumnName, ByteBuffer name, long value) throws UnknownColumnException { // See UpdateParameters.addCounter() for more details on this ByteBuffer counterValue = CounterContext.instance().createUpdate(value); return counter(decodeCellName(metadata, superColumnName, name), counterValue); } public static LegacyCell counter(LegacyCellName name, ByteBuffer value) { return new LegacyCell(Kind.COUNTER, name, value, FBUtilities.timestampMicros(), Cell.NO_DELETION_TIME, Cell.NO_TTL); } public byte serializationFlags() { if (isExpiring()) return EXPIRATION_MASK; if (isTombstone()) return DELETION_MASK; if (isCounterUpdate()) return COUNTER_UPDATE_MASK; if (isCounter()) return COUNTER_MASK; return 0; } public boolean isCounterUpdate() { // See UpdateParameters.addCounter() for more details on this return isCounter() && CounterContext.instance().isUpdate(value); } public ClusteringPrefix clustering() { return name.clustering; } public boolean isStatic() { return name.clustering == Clustering.STATIC_CLUSTERING; } public boolean isCell() { return true; } public LegacyCell asCell() { return this; } public LegacyRangeTombstone asRangeTombstone() { throw new UnsupportedOperationException(); } public boolean isCounter() { return kind == Kind.COUNTER; } public boolean isExpiring() { return kind == Kind.EXPIRING; } public boolean isTombstone() { return kind == Kind.DELETED; } public boolean isLive(int nowInSec) { if (isTombstone()) return false; 
return !isExpiring() || nowInSec < localDeletionTime; } @Override public String toString() { return String.format("LegacyCell(%s, name=%s, v=%s, ts=%s, ldt=%s, ttl=%s)", kind, name, ByteBufferUtil.bytesToHex(value), timestamp, localDeletionTime, ttl); } }
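// Minimal sketch (hypothetical helper, not part of the original class): inverting the
// serializationFlags() mapping above, using the mask constants declared at the top of
// LegacyLayout. Counter updates are a sub-kind of counters, so both masks map back to
// Kind.COUNTER.
private static LegacyCell.Kind kindFromSerializationFlags(byte flags)
{
    if ((flags & (COUNTER_MASK | COUNTER_UPDATE_MASK)) != 0)
        return LegacyCell.Kind.COUNTER;
    if ((flags & EXPIRATION_MASK) != 0)
        return LegacyCell.Kind.EXPIRING;
    if ((flags & DELETION_MASK) != 0)
        return LegacyCell.Kind.DELETED;
    return LegacyCell.Kind.REGULAR;
}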
/**
 * A legacy range tombstone.
 * <p>
 * This is used as a temporary object to facilitate dealing with the legacy format; it is not
 * meant to be optimal.
 */
public static class LegacyRangeTombstone implements LegacyAtom { public final LegacyBound start; public final LegacyBound stop; public final DeletionTime deletionTime; public LegacyRangeTombstone(LegacyBound start, LegacyBound stop, DeletionTime deletionTime) { // Because of the way RangeTombstoneList works, we can have a tombstone where only one of // the bounds has a collectionName. That happens if we have a big tombstone A (spanning one // or multiple rows) and a collection tombstone B. In that case, RangeTombstoneList will // split this into 3 RTs: the first one from the beginning of A to the beginning of B, // then B, then a third one from the end of B to the end of A. To make this simpler, if // we detect that case we transform the 1st and 3rd tombstones so they don't end in the middle // of a row (which is still correct). if ((start.collectionName == null) != (stop.collectionName == null)) { if (start.collectionName == null) stop = new LegacyBound(stop.bound, stop.isStatic, null); else start = new LegacyBound(start.bound, start.isStatic, null); } else if (!Objects.equals(start.collectionName, stop.collectionName)) { // We're in a similar but slightly more complex case where on top of the big tombstone // A, we have 2 (or more) collection tombstones B and C within A. So we also end up with // a tombstone that goes between the end of B and the start of C. start = new LegacyBound(start.bound, start.isStatic, null); stop = new LegacyBound(stop.bound, stop.isStatic, null); } this.start = start; this.stop = stop; this.deletionTime = deletionTime; }
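// Worked example of the normalization above (bounds are schematic, values hypothetical):
// given a tombstone A covering a whole row with clustering c1, and a collection tombstone B
// on column 'm' of that row, RangeTombstoneList stores three slices:
//   [c1 .. (c1, m)) then [(c1, m) .. (c1, m)] then ((c1, m) .. c1]
// The 1st and 3rd slices carry a collectionName on only one bound; dropping that
// collectionName widens them to row granularity, which deletes nothing extra since A
// already covered the full row.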
/** @see LegacyAtom#clustering() for an explanation of the static-clustering inconsistencies */
public ClusteringPrefix clustering() { return start.bound; } public LegacyRangeTombstone withNewStart(LegacyBound newStart) { return new LegacyRangeTombstone(newStart, stop, deletionTime); } public LegacyRangeTombstone withNewEnd(LegacyBound newStop) { return new LegacyRangeTombstone(start, newStop, deletionTime); } public boolean isCell() { return false; } public boolean isStatic() { return start.isStatic || stop.isStatic; } public LegacyCell asCell() { throw new UnsupportedOperationException(); } public LegacyRangeTombstone asRangeTombstone() { return this; } public boolean isCollectionTombstone() { return start.collectionName != null; } public boolean isRowDeletion(CFMetaData metadata) { if (start.collectionName != null || stop.collectionName != null || start.bound.size() != metadata.comparator.size() || stop.bound.size() != metadata.comparator.size()) return false; for (int i = 0; i < start.bound.size(); i++) if (!Objects.equals(start.bound.get(i), stop.bound.get(i))) return false; return true; } @Override public String toString() { return String.format("RT(%s-%s, %s)", start, stop, deletionTime); } } public static class LegacyDeletionInfo { public final MutableDeletionInfo deletionInfo; public final List<LegacyRangeTombstone> inRowTombstones = new ArrayList<>(); private LegacyDeletionInfo(MutableDeletionInfo deletionInfo) { this.deletionInfo = deletionInfo; } public static LegacyDeletionInfo live() { return new LegacyDeletionInfo(MutableDeletionInfo.live()); } public void add(DeletionTime topLevel) { deletionInfo.add(topLevel); } private static ClusteringBound staticBound(CFMetaData metadata, boolean isStart) { // In pre-3.0 nodes, a static row started with a clustering made of all empty values, so we // preserve that here. Note that in practice, it doesn't really matter since the rest // of the code will ignore the bound for RTs that have their static flag set. ByteBuffer[] values = new ByteBuffer[metadata.comparator.size()]; for (int i = 0; i < values.length; i++) values[i] = ByteBufferUtil.EMPTY_BYTE_BUFFER; return isStart ? ClusteringBound.inclusiveStartOf(values) : ClusteringBound.inclusiveEndOf(values); } public void add(CFMetaData metadata, LegacyRangeTombstone tombstone) { if (metadata.hasStaticColumns()) { /* * For tables having static columns we have to deal with the following cases: * 1. the end of the tombstone is static (in which case either the start is static or is BOTTOM, which is the same * for our consideration). This means that either the range only deletes the static row, or that it's a collection * tombstone of a static collection. In both cases, we just add the tombstone to the inRowTombstones. * 2. only the start is static. There are then 2 subcases: either the start is inclusive, which means we include the * static row and more (so we add an inRowTombstone for the static row and deal with the rest normally). Or the start * is exclusive, which means we explicitly exclude the static row (in which case we can just add the tombstone * as if it started at BOTTOM). * 3. neither of the bounds is static but the start is BOTTOM. This means we intended to delete the static row, so we * need to add it to the inRowTombstones (and otherwise handle the range normally). 
*/ if (tombstone.stop.isStatic) { // If the start is BOTTOM, we replace it by the beginning of the starting row so as to not confuse the // RangeTombstone.isRowDeletion() method if (tombstone.start == LegacyBound.BOTTOM) tombstone = tombstone.withNewStart(new LegacyBound(staticBound(metadata, true), true, null)); inRowTombstones.add(tombstone); return; } if (tombstone.start.isStatic) { if (tombstone.start.bound.isInclusive()) inRowTombstones.add(tombstone.withNewEnd(new LegacyBound(staticBound(metadata, false), true, null))); tombstone = tombstone.withNewStart(LegacyBound.BOTTOM); } else if (tombstone.start == LegacyBound.BOTTOM) { inRowTombstones.add(new LegacyRangeTombstone(new LegacyBound(staticBound(metadata, true), true, null), new LegacyBound(staticBound(metadata, false), true, null), tombstone.deletionTime)); } } if (tombstone.isCollectionTombstone() || tombstone.isRowDeletion(metadata)) inRowTombstones.add(tombstone); else add(metadata, new RangeTombstone(Slice.make(tombstone.start.bound, tombstone.stop.bound), tombstone.deletionTime)); } public void add(CFMetaData metadata, RangeTombstone tombstone) { deletionInfo.add(tombstone, metadata.comparator); } public Iterator<LegacyRangeTombstone> inRowRangeTombstones() { return inRowTombstones.iterator(); } public static LegacyDeletionInfo deserialize(CFMetaData metadata, DataInputPlus in) throws IOException { DeletionTime topLevel = DeletionTime.serializer.deserialize(in); int rangeCount = in.readInt(); if (rangeCount == 0) return new LegacyDeletionInfo(new MutableDeletionInfo(topLevel)); LegacyDeletionInfo delInfo = new LegacyDeletionInfo(new MutableDeletionInfo(topLevel)); for (int i = 0; i < rangeCount; i++) { LegacyBound start = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), true); LegacyBound end = decodeTombstoneBound(metadata, ByteBufferUtil.readWithShortLength(in), false); int delTime = in.readInt(); long markedAt = in.readLong(); delInfo.add(metadata, new LegacyRangeTombstone(start, end, new DeletionTime(markedAt, delTime))); } return delInfo; } }
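// Usage sketch (hypothetical call site; assumes 'in' is a DataInputPlus positioned on a 2.x
// deletion info): partition-level and generic slice tombstones end up in deletionInfo, while
// row-level and collection tombstones are kept aside in inRowTombstones so they can later be
// merged with the cells of the rows they cover.
//
//   LegacyDeletionInfo info = LegacyDeletionInfo.deserialize(metadata, in);
//   MutableDeletionInfo modern = info.deletionInfo;
//   Iterator<LegacyRangeTombstone> inRow = info.inRowRangeTombstones();
//   // feed each in-row tombstone to the CellGrouper alongside the partition's cells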
/**
 * A helper class for LegacyRangeTombstoneList. This replaces the {@code Comparator<Composite>} that RTL used before 3.0.
 */
private static class LegacyBoundComparator implements Comparator<LegacyBound> { ClusteringComparator clusteringComparator; public LegacyBoundComparator(ClusteringComparator clusteringComparator) { this.clusteringComparator = clusteringComparator; } public int compare(LegacyBound a, LegacyBound b) { // In the legacy sorting, BOTTOM comes before anything else if (a == LegacyBound.BOTTOM) return b == LegacyBound.BOTTOM ? 0 : -1; if (b == LegacyBound.BOTTOM) return 1; // Excluding BOTTOM, statics are always before anything else. if (a.isStatic != b.isStatic) return a.isStatic ? -1 : 1; // We have to be careful with bound comparison because of collections. Namely, if the 2 bounds represent the // same prefix, then we should take the collectionName into account before taking the bounds kind // (ClusteringPrefix.Kind). This means we can't really call ClusteringComparator.compare() directly. // For instance, if // a is (bound=INCL_START_BOUND('x'), collectionName='d') // b is (bound=INCL_END_BOUND('x'), collectionName='c') // Then b < a since the element 'c' of collection 'x' comes before element 'd', but calling // clusteringComparator.compare(a.bound, b.bound) returns -1. // See CASSANDRA-13125 for details. int sa = a.bound.size(); int sb = b.bound.size(); for (int i = 0; i < Math.min(sa, sb); i++) { int cmp = clusteringComparator.compareComponent(i, a.bound.get(i), b.bound.get(i)); if (cmp != 0) return cmp; } if (sa != sb) return sa < sb ? a.bound.kind().comparedToClustering : -b.bound.kind().comparedToClustering; // Both bounds represent the same prefix, so compare the collection names. // If one has a collection name and the other doesn't, the one without comes first, as it points to the beginning of the row. if ((a.collectionName == null) != (b.collectionName == null)) return a.collectionName == null ? -1 : 1; // If they both have a collection, compare that first if (a.collectionName != null) { int cmp = UTF8Type.instance.compare(a.collectionName.name.bytes, b.collectionName.name.bytes); if (cmp != 0) return cmp; } // Lastly, if everything so far is equal, compare their clustering kind return ClusteringPrefix.Kind.compare(a.bound.kind(), b.bound.kind()); } }
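// Minimal usage sketch (assumes 'metadata' is the table's CFMetaData): the comparator orders
// BOTTOM first, then static bounds, then everything else by clustering prefix, collection
// name, and finally bound kind, as implemented above.
//   LegacyBoundComparator cmp = new LegacyBoundComparator(metadata.comparator);
//   LegacyRangeTombstoneList rtl = new LegacyRangeTombstoneList(cmp, 4);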
/**
 * Almost an entire copy of RangeTombstoneList from C* 2.1. The main difference is that LegacyBoundComparator
 * is used in place of {@code Comparator<Composite>} (because Composite doesn't exist any more).
 *
 * This class is needed to allow us to convert single-row deletions and complex deletions into range tombstones
 * and properly merge them into the normal set of range tombstones.
 */
public static class LegacyRangeTombstoneList { private final LegacyBoundComparator comparator; // Note: we don't want to use a List for the markedAts and delTimes to avoid boxing. We could // use a List for starts and ends, but having arrays everywhere is almost simpler. LegacyBound[] starts; LegacyBound[] ends; private long[] markedAts; private int[] delTimes; private int size; private LegacyRangeTombstoneList(LegacyBoundComparator comparator, LegacyBound[] starts, LegacyBound[] ends, long[] markedAts, int[] delTimes, int size) { assert starts.length == ends.length && starts.length == markedAts.length && starts.length == delTimes.length; this.comparator = comparator; this.starts = starts; this.ends = ends; this.markedAts = markedAts; this.delTimes = delTimes; this.size = size; } public LegacyRangeTombstoneList(LegacyBoundComparator comparator, int capacity) { this(comparator, new LegacyBound[capacity], new LegacyBound[capacity], new long[capacity], new int[capacity], 0); } @Override public String toString() { StringBuilder sb = new StringBuilder(); sb.append('['); for (int i = 0; i < size; i++) { if (i > 0) sb.append(','); sb.append('(').append(starts[i]).append(", ").append(ends[i]).append(')'); } return sb.append(']').toString(); } public boolean isEmpty() { return size == 0; } public int size() { return size; }
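// Illustrative invariant check (hypothetical helper, not part of the original class): the
// four parallel arrays always describe 'size' ordered, pairwise non-overlapping ranges,
// which is what add() and insertFrom() below maintain.
private boolean rangesAreWellFormed()
{
    for (int i = 0; i < size; i++)
    {
        if (comparator.compare(starts[i], ends[i]) > 0)
            return false; // each range must satisfy start <= end
        if (i > 0 && comparator.compare(ends[i - 1], starts[i]) > 0)
            return false; // a range may only start at or after the previous one ends
    }
    return true;
}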
/**
 * Adds a new range tombstone.
 *
 * This method will be faster if the new tombstone sorts after all the currently existing ones (this is a common
 * use case), but it doesn't assume it.
 */
public void add(LegacyBound start, LegacyBound end, long markedAt, int delTime) { if (isEmpty()) { addInternal(0, start, end, markedAt, delTime); return; } int c = comparator.compare(ends[size-1], start); // Fast path if we add in sorted order if (c <= 0) { addInternal(size, start, end, markedAt, delTime); } else { // Note: insertFrom expects i to be the insertion point in terms of interval ends int pos = Arrays.binarySearch(ends, 0, size, start, comparator); insertFrom((pos >= 0 ? pos : -pos-1), start, end, markedAt, delTime); } } /* * Inserts a new element starting at index i. This method assumes that: * ends[i-1] <= start <= ends[i] * * A RangeTombstoneList is a list of ranges [s_0, e_0]...[s_n, e_n] such that: * - s_i <= e_i * - e_i <= s_i+1 * - if s_i == e_i and e_i == s_i+1 then s_i+1 < e_i+1 * Basically, ranges are non-overlapping except for their bounds, and in order. And while * we allow ranges with the same value for the start and end, we don't allow repeating * such a range (so we can't have [0, 0][0, 0] even though it would respect the first 2 * conditions). * */
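// Example of the invariant above (values are schematic): [0, 2][2, 2][2, 5] is a legal list,
// since ranges may share a bound and a point range may abut its neighbours, but [0, 0][0, 0]
// is not: a point range may not repeat, which is exactly what the third condition rules out.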
/**
 * Adds all the range tombstones of {@code tombstones} to this RangeTombstoneList.
 */
public void addAll(LegacyRangeTombstoneList tombstones) { if (tombstones.isEmpty()) return; if (isEmpty()) { copyArrays(tombstones, this); return; } /* * We basically have 2 techniques we can use here: either we repeatedly call add() on tombstones values, * or we do a merge of both (sorted) lists. If this list is sufficiently bigger than the one we add, then * calling add() will be faster, otherwise it's merging that will be faster. * * Let's note that during memtable updates, it might not be uncommon that a new update has only a few range * tombstones, while the CF we're adding it to (the one in the memtable) has many. In that case, using add() is * likely going to be faster. * * In other cases however, like when diffing responses from multiple nodes, the tombstone lists we "merge" will * likely be of similar size, so using add() might be a bit inefficient. * * Roughly speaking (this ignores the fact that updating an element is not exactly constant but that's not a big * deal), if n is the size of this list and m is tombstones size, merging is O(n+m) while using add() is O(m*log(n)). * * But let's not crank up a logarithm computation for that. Long story short, merging will be a bad choice only * if this list is a lot bigger than the other one, so let's keep it simple. */ if (size > 10 * tombstones.size) { for (int i = 0; i < tombstones.size; i++) add(tombstones.starts[i], tombstones.ends[i], tombstones.markedAts[i], tombstones.delTimes[i]); } else { int i = 0; int j = 0; while (i < size && j < tombstones.size) { if (comparator.compare(tombstones.starts[j], ends[i]) <= 0) { insertFrom(i, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]); j++; } else { i++; } } // Adds the remaining ones from tombstones if any (note that addInternal will increment size if relevant). for (; j < tombstones.size; j++) addInternal(size, tombstones.starts[j], tombstones.ends[j], tombstones.markedAts[j], tombstones.delTimes[j]); } } private static void copyArrays(LegacyRangeTombstoneList src, LegacyRangeTombstoneList dst) { dst.grow(src.size); System.arraycopy(src.starts, 0, dst.starts, 0, src.size); System.arraycopy(src.ends, 0, dst.ends, 0, src.size); System.arraycopy(src.markedAts, 0, dst.markedAts, 0, src.size); System.arraycopy(src.delTimes, 0, dst.delTimes, 0, src.size); dst.size = src.size; } private void insertFrom(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) { while (i < size) { assert i == 0 || comparator.compare(ends[i-1], start) <= 0; int c = comparator.compare(start, ends[i]); assert c <= 0; if (c == 0) { // If start == ends[i], then we can insert from the next one (basically the new element // really starts at the next element), except for the case where starts[i] == ends[i]. // In this latter case, if we were to move to the next element, we could end up with ...[x, x][x, x]... if (comparator.compare(starts[i], ends[i]) == 0) { // The current element covers a single value which is equal to the start of the inserted // element. If the inserted element overwrites the current one, just remove the current // (it's included in what we insert) and proceed with the insert. 
if (markedAt > markedAts[i]) { removeInternal(i); continue; } // Otherwise (the current singleton interval overrides the new one), we want to leave the // current element and move to the next, unless start == end since that means the new element // is in fact fully covered by the current one (so we're done) if (comparator.compare(start, end) == 0) return; } i++; continue; } // Do we overwrite the current element? if (markedAt > markedAts[i]) { // We do overwrite. // First deal with what might come before the newly added one. if (comparator.compare(starts[i], start) < 0) { addInternal(i, starts[i], start, markedAts[i], delTimes[i]); i++; // We don't need to do the following line, but in spirit that's what we want to do // setInternal(i, start, ends[i], markedAt, delTime) } // now, start <= starts[i] // Does the new element stop before/at the current one? int endCmp = comparator.compare(end, starts[i]); if (endCmp <= 0) { // Here start <= starts[i] and end <= starts[i] // This means the new element is entirely before the current one. However, one special // case is if end == starts[i] and starts[i] == ends[i]. In that case, // the new element entirely overwrites the current one and we can just overwrite if (endCmp == 0 && comparator.compare(starts[i], ends[i]) == 0) setInternal(i, start, end, markedAt, delTime); else addInternal(i, start, end, markedAt, delTime); return; } // Do we overwrite the current element fully? int cmp = comparator.compare(ends[i], end); if (cmp <= 0) { // We do overwrite fully: // update the current element until its end and continue // on with the next element (with the new inserted start == current end). // If we're on the last element, we can optimize if (i == size-1) { setInternal(i, start, end, markedAt, delTime); return; } setInternal(i, start, ends[i], markedAt, delTime); if (cmp == 0) return; start = ends[i]; i++; } else { // We don't overwrite fully. Insert the new interval, and then update the now next // one to reflect the not overwritten parts. We're then done. addInternal(i, start, end, markedAt, delTime); i++; setInternal(i, end, ends[i], markedAts[i], delTimes[i]); return; } } else { // we don't overwrite the current element // If the new interval starts before the current one, insert that new interval if (comparator.compare(start, starts[i]) < 0) { // If we stop before the start of the current element, just insert the new // interval and we're done; otherwise insert until the beginning of the // current element if (comparator.compare(end, starts[i]) <= 0) { addInternal(i, start, end, markedAt, delTime); return; } addInternal(i, start, starts[i], markedAt, delTime); i++; } // After that, we're overwritten on the current element but might have // some residual parts after ... // ... unless we don't extend beyond it. 
if (comparator.compare(end, ends[i]) <= 0) return; start = ends[i]; i++; } } // If we got there, then just insert the remainder at the end addInternal(i, start, end, markedAt, delTime); } private int capacity() { return starts.length; } private void addInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) { assert i >= 0; if (size == capacity()) growToFree(i); else if (i < size) moveElements(i); setInternal(i, start, end, markedAt, delTime); size++; } private void removeInternal(int i) { assert i >= 0; System.arraycopy(starts, i+1, starts, i, size - i - 1); System.arraycopy(ends, i+1, ends, i, size - i - 1); System.arraycopy(markedAts, i+1, markedAts, i, size - i - 1); System.arraycopy(delTimes, i+1, delTimes, i, size - i - 1); --size; starts[size] = null; ends[size] = null; } /* * Grow the arrays, leaving index i "free" in the process. */ private void growToFree(int i) { int newLength = (capacity() * 3) / 2 + 1; grow(i, newLength); } /* * Grow the arrays to match newLength capacity. */ private void grow(int newLength) { if (capacity() < newLength) grow(-1, newLength); } private void grow(int i, int newLength) { starts = grow(starts, size, newLength, i); ends = grow(ends, size, newLength, i); markedAts = grow(markedAts, size, newLength, i); delTimes = grow(delTimes, size, newLength, i); } private static LegacyBound[] grow(LegacyBound[] a, int size, int newLength, int i) { if (i < 0 || i >= size) return Arrays.copyOf(a, newLength); LegacyBound[] newA = new LegacyBound[newLength]; System.arraycopy(a, 0, newA, 0, i); System.arraycopy(a, i, newA, i+1, size - i); return newA; } private static long[] grow(long[] a, int size, int newLength, int i) { if (i < 0 || i >= size) return Arrays.copyOf(a, newLength); long[] newA = new long[newLength]; System.arraycopy(a, 0, newA, 0, i); System.arraycopy(a, i, newA, i+1, size - i); return newA; } private static int[] grow(int[] a, int size, int newLength, int i) { if (i < 0 || i >= size) return Arrays.copyOf(a, newLength); int[] newA = new int[newLength]; System.arraycopy(a, 0, newA, 0, i); System.arraycopy(a, i, newA, i+1, size - i); return newA; } /* * Move elements so that index i is "free", assuming the arrays have at least one free slot at the end. 
*/ private void moveElements(int i) { if (i >= size) return; System.arraycopy(starts, i, starts, i+1, size - i); System.arraycopy(ends, i, ends, i+1, size - i); System.arraycopy(markedAts, i, markedAts, i+1, size - i); System.arraycopy(delTimes, i, delTimes, i+1, size - i); // we set starts[i] to null to indicate the position is now empty starts[i] = null; } private void setInternal(int i, LegacyBound start, LegacyBound end, long markedAt, int delTime) { starts[i] = start; ends[i] = end; markedAts[i] = markedAt; delTimes[i] = delTime; } public void updateDigest(MessageDigest digest) { ByteBuffer longBuffer = ByteBuffer.allocate(8); for (int i = 0; i < size; i++) { for (int j = 0; j < starts[i].bound.size(); j++) digest.update(starts[i].bound.get(j).duplicate()); if (starts[i].collectionName != null) digest.update(starts[i].collectionName.name.bytes.duplicate()); for (int j = 0; j < ends[i].bound.size(); j++) digest.update(ends[i].bound.get(j).duplicate()); if (ends[i].collectionName != null) digest.update(ends[i].collectionName.name.bytes.duplicate()); longBuffer.putLong(0, markedAts[i]); digest.update(longBuffer.array(), 0, 8); } } public void serialize(DataOutputPlus out, CFMetaData metadata) throws IOException { out.writeInt(size); if (size == 0) return; if (metadata.isCompound()) serializeCompound(out, metadata.isDense()); else serializeSimple(out); } private void serializeCompound(DataOutputPlus out, boolean isDense) throws IOException { List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); if (!isDense) types.add(UTF8Type.instance); CompositeType type = CompositeType.getInstance(types); for (int i = 0; i < size; i++) { LegacyBound start = starts[i]; LegacyBound end = ends[i]; CompositeType.Builder startBuilder = type.builder(start.isStatic); CompositeType.Builder endBuilder = type.builder(end.isStatic); for (int j = 0; j < start.bound.clustering().size(); j++) { startBuilder.add(start.bound.get(j)); endBuilder.add(end.bound.get(j)); } if (start.collectionName != null) startBuilder.add(start.collectionName.name.bytes); if (end.collectionName != null) endBuilder.add(end.collectionName.name.bytes); ByteBufferUtil.writeWithShortLength(startBuilder.build(), out); ByteBufferUtil.writeWithShortLength(endBuilder.buildAsEndOfRange(), out); out.writeInt(delTimes[i]); out.writeLong(markedAts[i]); } } private void serializeSimple(DataOutputPlus out) throws IOException { List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); assert types.size() == 1 : types; for (int i = 0; i < size; i++) { LegacyBound start = starts[i]; LegacyBound end = ends[i]; ClusteringPrefix startClustering = start.bound.clustering(); ClusteringPrefix endClustering = end.bound.clustering(); assert startClustering.size() == 1; assert endClustering.size() == 1; ByteBufferUtil.writeWithShortLength(startClustering.get(0), out); ByteBufferUtil.writeWithShortLength(endClustering.get(0), out); out.writeInt(delTimes[i]); out.writeLong(markedAts[i]); } } public long serializedSize(CFMetaData metadata) { long size = 0; size += TypeSizes.sizeof(this.size); if (this.size == 0) return size; if (metadata.isCompound()) return size + serializedSizeCompound(metadata.isDense()); else return size + serializedSizeSimple(); } private long serializedSizeCompound(boolean isDense) { long size = 0; List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); if (!isDense) 
types.add(UTF8Type.instance); CompositeType type = CompositeType.getInstance(types); for (int i = 0; i < this.size; i++) { LegacyBound start = starts[i]; LegacyBound end = ends[i]; CompositeType.Builder startBuilder = type.builder(); CompositeType.Builder endBuilder = type.builder(); for (int j = 0; j < start.bound.size(); j++) startBuilder.add(start.bound.get(j)); for (int j = 0; j < end.bound.size(); j++) endBuilder.add(end.bound.get(j)); if (start.collectionName != null) startBuilder.add(start.collectionName.name.bytes); if (end.collectionName != null) endBuilder.add(end.collectionName.name.bytes); size += ByteBufferUtil.serializedSizeWithShortLength(startBuilder.build()); size += ByteBufferUtil.serializedSizeWithShortLength(endBuilder.buildAsEndOfRange()); size += TypeSizes.sizeof(delTimes[i]); size += TypeSizes.sizeof(markedAts[i]); } return size; } private long serializedSizeSimple() { long size = 0; List<AbstractType<?>> types = new ArrayList<>(comparator.clusteringComparator.subtypes()); assert types.size() == 1 : types; for (int i = 0; i < this.size; i++) { LegacyBound start = starts[i]; LegacyBound end = ends[i]; ClusteringPrefix startClustering = start.bound.clustering(); ClusteringPrefix endClustering = end.bound.clustering(); assert startClustering.size() == 1; assert endClustering.size() == 1; size += ByteBufferUtil.serializedSizeWithShortLength(startClustering.get(0)); size += ByteBufferUtil.serializedSizeWithShortLength(endClustering.get(0)); size += TypeSizes.sizeof(delTimes[i]); size += TypeSizes.sizeof(markedAts[i]); } return size; } } }
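// Round-trip note (a sketch of the expected contract, not asserted anywhere in this file):
// for a LegacyRangeTombstoneList rtl and a matching CFMetaData metadata,
//   long expected = rtl.serializedSize(metadata);
//   rtl.serialize(out, metadata);
// is expected to write exactly 'expected' bytes, since both methods walk the same
// compound/simple encoding of each bound plus the int/long deletion fields per tombstone.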