/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.cassandra.io.sstable;

import java.util.*;

public class Downsampling
{
    
The base (down)sampling level determines the granularity at which we can down/upsample. A higher number allows us to approximate more closely the ideal sampling. (It could also mean we do a lot of expensive almost-no-op resamplings from N to N-1, but the thresholds in IndexSummaryManager prevent that.) BSL must be a power of two in order to have good sampling patterns. This cannot be changed without rebuilding all index summaries at full sampling; for now we treat it as a constant.
/** * The base (down)sampling level determines the granularity at which we can down/upsample. * * A higher number allows us to approximate more closely the ideal sampling. (It could also mean we do a lot of * expensive almost-no-op resamplings from N to N-1, but the thresholds in IndexSummaryManager prevent that.) * * BSL must be a power of two in order to have good sampling patterns. This cannot be changed without rebuilding * all index summaries at full sampling; for now we treat it as a constant. */
public static final int BASE_SAMPLING_LEVEL = 128; private static final Map<Integer, List<Integer>> samplePatternCache = new HashMap<>(); private static final Map<Integer, List<Integer>> originalIndexCache = new HashMap<>();
Gets a list L of starting indices for downsampling rounds: the first round should start with the offset given by L[0], the second by the offset in L[1], etc.
Params:
  • samplingLevel – the base sampling level
Returns:A list of `samplingLevel` unique indices between 0 and `samplingLevel`
/** * Gets a list L of starting indices for downsampling rounds: the first round should start with the offset * given by L[0], the second by the offset in L[1], etc. * * @param samplingLevel the base sampling level * * @return A list of `samplingLevel` unique indices between 0 and `samplingLevel` */
public static List<Integer> getSamplingPattern(int samplingLevel) { List<Integer> pattern = samplePatternCache.get(samplingLevel); if (pattern != null) return pattern; if (samplingLevel <= 1) return Arrays.asList(0); int[] odds = new int[samplingLevel / 2]; int[] evens = new int[samplingLevel / 2]; for (int i = 1; i < samplingLevel; i += 2) odds[i/2] = i; for (int i = 0; i < samplingLevel; i += 2) evens[i/2] = i; // especially for latter rounds, it's important that we spread out the start points, so we'll // make a recursive call to get an ordering for this list of start points List<Integer> ordering = getSamplingPattern(samplingLevel/2); List<Integer> startIndices = new ArrayList<>(samplingLevel); for (Integer index : ordering) startIndices.add(odds[index]); for (Integer index : ordering) startIndices.add(evens[index]); samplePatternCache.put(samplingLevel, startIndices); return startIndices; }
Returns a list that can be used to translate current index summary indexes to their original index before downsampling. (This repeats every `samplingLevel`, so that's how many entries we return.) For example, if [0, 64] is returned, the current index summary entry at index 0 was originally at index 0, and the current index 1 was originally at index 64.
Params:
  • samplingLevel – the current sampling level for the index summary
Returns:a list of original indexes for current summary entries
/** * Returns a list that can be used to translate current index summary indexes to their original index before * downsampling. (This repeats every `samplingLevel`, so that's how many entries we return.) * * For example, if [0, 64] is returned, the current index summary entry at index 0 was originally * at index 0, and the current index 1 was originally at index 64. * * @param samplingLevel the current sampling level for the index summary * * @return a list of original indexes for current summary entries */
public static List<Integer> getOriginalIndexes(int samplingLevel) { List<Integer> originalIndexes = originalIndexCache.get(samplingLevel); if (originalIndexes != null) return originalIndexes; List<Integer> pattern = getSamplingPattern(BASE_SAMPLING_LEVEL).subList(0, BASE_SAMPLING_LEVEL - samplingLevel); originalIndexes = new ArrayList<>(samplingLevel); for (int j = 0; j < BASE_SAMPLING_LEVEL; j++) { if (!pattern.contains(j)) originalIndexes.add(j); } originalIndexCache.put(samplingLevel, originalIndexes); return originalIndexes; }
Calculates the effective index interval after the entry at `index` in an IndexSummary. In other words, this returns the number of partitions in the primary on-disk index before the next partition that has an entry in the index summary. If samplingLevel == BASE_SAMPLING_LEVEL, this will be equal to the index interval.
Params:
  • index – an index into an IndexSummary
  • samplingLevel – the current sampling level for that IndexSummary
  • minIndexInterval – the min index interval (effective index interval at full sampling)
Returns:the number of partitions before the next index summary entry, inclusive on one end
/** * Calculates the effective index interval after the entry at `index` in an IndexSummary. In other words, this * returns the number of partitions in the primary on-disk index before the next partition that has an entry in * the index summary. If samplingLevel == BASE_SAMPLING_LEVEL, this will be equal to the index interval. * @param index an index into an IndexSummary * @param samplingLevel the current sampling level for that IndexSummary * @param minIndexInterval the min index interval (effective index interval at full sampling) * @return the number of partitions before the next index summary entry, inclusive on one end */
public static int getEffectiveIndexIntervalAfterIndex(int index, int samplingLevel, int minIndexInterval) { assert index >= 0; index %= samplingLevel; List<Integer> originalIndexes = getOriginalIndexes(samplingLevel); int nextEntryOriginalIndex = (index == originalIndexes.size() - 1) ? BASE_SAMPLING_LEVEL : originalIndexes.get(index + 1); return (nextEntryOriginalIndex - originalIndexes.get(index)) * minIndexInterval; } public static int[] getStartPoints(int currentSamplingLevel, int newSamplingLevel) { List<Integer> allStartPoints = getSamplingPattern(BASE_SAMPLING_LEVEL); // calculate starting indexes for sampling rounds int initialRound = BASE_SAMPLING_LEVEL - currentSamplingLevel; int numRounds = Math.abs(currentSamplingLevel - newSamplingLevel); int[] startPoints = new int[numRounds]; for (int i = 0; i < numRounds; ++i) { int start = allStartPoints.get(initialRound + i); // our "ideal" start points will be affected by the removal of items in earlier rounds, so go through all // earlier rounds, and if we see an index that comes before our ideal start point, decrement the start point int adjustment = 0; for (int j = 0; j < initialRound; ++j) { if (allStartPoints.get(j) < start) adjustment++; } startPoints[i] = start - adjustment; } return startPoints; } }