/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.PackedInts;
import org.apache.lucene.util.packed.PackedLongValues;

/** Maps per-segment ordinals to/from global ordinal space, using a compact packed-ints representation.
 *
 * <p><b>NOTE</b>: this is a costly operation, as it must merge sort all terms, and may require
 * non-trivial RAM once done.  It's better to operate in segment-private ordinal space instead when
 * possible.
 *
 * @lucene.internal
 */
public class OrdinalMap implements Accountable {
  // TODO: we could also have a utility method to merge Terms[] and use size() as a weight when we need it
  // TODO: use more efficient packed ints structures?

  private static class TermsEnumIndex {
    public final static TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0];
    final int subIndex;
    final TermsEnum termsEnum;
    BytesRef currentTerm;

    public TermsEnumIndex(TermsEnum termsEnum, int subIndex) {
      this.termsEnum = termsEnum;
      this.subIndex = subIndex;
    }

    public BytesRef next() throws IOException {
      currentTerm = termsEnum.next();
      return currentTerm;
    }
  }

  private static class SegmentMap implements Accountable {
    private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(SegmentMap.class);

    /** Build a map from an index into a sorted view of <code>weights</code> to an index into <code>weights</code>. */
    private static int[] map(final long[] weights) {
      final int[] newToOld = new int[weights.length];
      for (int i = 0; i < weights.length; ++i) {
        newToOld[i] = i;
      }
      new InPlaceMergeSorter() {
        @Override
        protected void swap(int i, int j) {
          final int tmp = newToOld[i];
          newToOld[i] = newToOld[j];
          newToOld[j] = tmp;
        }
        @Override
        protected int compare(int i, int j) {
          // j first since we actually want higher weights first
          return Long.compare(weights[newToOld[j]], weights[newToOld[i]]);
        }
      }.sort(0, weights.length);
      return newToOld;
    }

    /** Inverse the map. */
    private static int[] inverse(int[] map) {
      final int[] inverse = new int[map.length];
      for (int i = 0; i < map.length; ++i) {
        inverse[map[i]] = i;
      }
      return inverse;
    }

    private final int[] newToOld, oldToNew;

    SegmentMap(long[] weights) {
      newToOld = map(weights);
      oldToNew = inverse(newToOld);
      assert Arrays.equals(newToOld, inverse(oldToNew));
    }

    int newToOld(int segment) {
      return newToOld[segment];
    }

    int oldToNew(int segment) {
      return oldToNew[segment];
    }

    @Override
    public long ramBytesUsed() {
      return BASE_RAM_BYTES_USED + RamUsageEstimator.sizeOf(newToOld) + RamUsageEstimator.sizeOf(oldToNew);
    }
  }

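  // Illustrative sketch (not part of the original source): how SegmentMap reorders segments by weight.
  // For weights {7, 42, 19}, sorting by descending weight gives newToOld = {1, 2, 0} (new slot 0 holds
  // the heaviest segment, which is old index 1) and oldToNew = {2, 0, 1}, so newToOld(0) == 1 and
  // oldToNew(1) == 0.
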
  /**
   * Create an ordinal map that uses the number of unique values of each
   * {@link SortedDocValues} instance as a weight.
   * @see #build(IndexReader.CacheKey, TermsEnum[], long[], float)
   */
  public static OrdinalMap build(IndexReader.CacheKey owner, SortedDocValues[] values, float acceptableOverheadRatio) throws IOException {
    final TermsEnum[] subs = new TermsEnum[values.length];
    final long[] weights = new long[values.length];
    for (int i = 0; i < values.length; ++i) {
      subs[i] = values[i].termsEnum();
      weights[i] = values[i].getValueCount();
    }
    return build(owner, subs, weights, acceptableOverheadRatio);
  }

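  // Illustrative sketch (not part of the original source): a typical way a caller might build an
  // OrdinalMap for a single-valued field across all leaves of an IndexReader.  The reader and the
  // field name "category" are assumptions made for the example.
  //
  //   SortedDocValues[] values = new SortedDocValues[reader.leaves().size()];
  //   for (LeafReaderContext ctx : reader.leaves()) {
  //     values[ctx.ord] = DocValues.getSorted(ctx.reader(), "category");
  //   }
  //   OrdinalMap map = OrdinalMap.build(reader.getReaderCacheHelper().getKey(), values, PackedInts.DEFAULT);
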
  /**
   * Create an ordinal map that uses the number of unique values of each
   * {@link SortedSetDocValues} instance as a weight.
   * @see #build(IndexReader.CacheKey, TermsEnum[], long[], float)
   */
  public static OrdinalMap build(IndexReader.CacheKey owner, SortedSetDocValues[] values, float acceptableOverheadRatio) throws IOException {
    final TermsEnum[] subs = new TermsEnum[values.length];
    final long[] weights = new long[values.length];
    for (int i = 0; i < values.length; ++i) {
      subs[i] = values[i].termsEnum();
      weights[i] = values[i].getValueCount();
    }
    return build(owner, subs, weights, acceptableOverheadRatio);
  }

  /**
   * Creates an ordinal map that allows mapping ords to/from a merged
   * space from <code>subs</code>.
   * @param owner a cache key
   * @param subs TermsEnums that support {@link TermsEnum#ord()}. They need
   *             not be dense (e.g. can be FilteredTermsEnums).
   * @param weights a weight for each sub. This is ideally correlated with
   *             the number of unique terms that each sub introduces compared
   *             to the other subs
   * @throws IOException if an I/O error occurred.
   */
  public static OrdinalMap build(IndexReader.CacheKey owner, TermsEnum subs[], long[] weights, float acceptableOverheadRatio) throws IOException {
    if (subs.length != weights.length) {
      throw new IllegalArgumentException("subs and weights must have the same length");
    }
    // enums are not sorted, so let's sort to save memory
    final SegmentMap segmentMap = new SegmentMap(weights);
    return new OrdinalMap(owner, subs, segmentMap, acceptableOverheadRatio);
  }

  private static final long BASE_RAM_BYTES_USED = RamUsageEstimator.shallowSizeOfInstance(OrdinalMap.class);

  /** Cache key of whoever asked for this awful thing */
  public final IndexReader.CacheKey owner;
  // globalOrd -> (globalOrd - segmentOrd) where segmentOrd is the ordinal in the first segment that contains this term
  final PackedLongValues globalOrdDeltas;
  // globalOrd -> first segment container
  final PackedLongValues firstSegments;
  // for every segment, segmentOrd -> globalOrd
  final LongValues segmentToGlobalOrds[];
  // the map from/to segment ids
  final SegmentMap segmentMap;
  // ram usage
  final long ramBytesUsed;

  OrdinalMap(IndexReader.CacheKey owner, TermsEnum subs[], SegmentMap segmentMap, float acceptableOverheadRatio) throws IOException {
    // create the ordinal mappings by pulling a termsenum over each sub's
    // unique terms, and walking a multitermsenum over those
    this.owner = owner;
    this.segmentMap = segmentMap;
    // even though we accept an overhead ratio, we keep these ones with COMPACT
    // since they are only used to resolve values given a global ord, which is
    // slow anyway
    PackedLongValues.Builder globalOrdDeltas = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
    PackedLongValues.Builder firstSegments = PackedLongValues.packedBuilder(PackedInts.COMPACT);
    final PackedLongValues.Builder[] ordDeltas = new PackedLongValues.Builder[subs.length];
    for (int i = 0; i < ordDeltas.length; i++) {
      ordDeltas[i] = PackedLongValues.monotonicBuilder(acceptableOverheadRatio);
    }
    long[] ordDeltaBits = new long[subs.length];
    long[] segmentOrds = new long[subs.length];

    // Just merge-sorts by term:
    PriorityQueue<TermsEnumIndex> queue = new PriorityQueue<TermsEnumIndex>(subs.length) {
        @Override
        protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) {
          return a.currentTerm.compareTo(b.currentTerm) < 0;
        }
      };

    for (int i = 0; i < subs.length; i++) {
      TermsEnumIndex sub = new TermsEnumIndex(subs[segmentMap.newToOld(i)], i);
      if (sub.next() != null) {
        queue.add(sub);
      }
    }

    BytesRefBuilder scratch = new BytesRefBuilder();

    long globalOrd = 0;
    while (queue.size() != 0) {
      TermsEnumIndex top = queue.top();
      scratch.copyBytes(top.currentTerm);

      int firstSegmentIndex = Integer.MAX_VALUE;
      long globalOrdDelta = Long.MAX_VALUE;

      // Advance past this term, recording the per-segment ord deltas:
      while (true) {
        top = queue.top();
        long segmentOrd = top.termsEnum.ord();
        long delta = globalOrd - segmentOrd;
        int segmentIndex = top.subIndex;
        // We compute the least segment where the term occurs. In case the
        // first segment contains most (or better all) values, this will
        // help save significant memory
        if (segmentIndex < firstSegmentIndex) {
          firstSegmentIndex = segmentIndex;
          globalOrdDelta = delta;
        }
        ordDeltaBits[segmentIndex] |= delta;

        // for each per-segment ord, map it back to the global term; the while loop is needed
        // in case the incoming TermsEnums don't have compact ordinals (some ordinal values
        // are skipped), which can happen e.g. with a FilteredTermsEnum:
        assert segmentOrds[segmentIndex] <= segmentOrd;

        // TODO: we could specialize this case (the while loop is not needed when the ords
        // are compact)
        do {
          ordDeltas[segmentIndex].add(delta);
          segmentOrds[segmentIndex]++;
        } while (segmentOrds[segmentIndex] <= segmentOrd);

        if (top.next() == null) {
          queue.pop();
          if (queue.size() == 0) {
            break;
          }
        } else {
          queue.updateTop();
        }
        if (queue.top().currentTerm.equals(scratch.get()) == false) {
          break;
        }
      }

      // for each unique term, just mark the first segment index/delta where it occurs
      firstSegments.add(firstSegmentIndex);
      globalOrdDeltas.add(globalOrdDelta);
      globalOrd++;
    }

    this.firstSegments = firstSegments.build();
    this.globalOrdDeltas = globalOrdDeltas.build();
    // ordDeltas is typically the bottleneck, so let's see what we can do to make it faster
    segmentToGlobalOrds = new LongValues[subs.length];
    long ramBytesUsed = BASE_RAM_BYTES_USED + this.globalOrdDeltas.ramBytesUsed()
        + this.firstSegments.ramBytesUsed() + RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds)
        + segmentMap.ramBytesUsed();
    for (int i = 0; i < ordDeltas.length; ++i) {
      final PackedLongValues deltas = ordDeltas[i].build();
      if (ordDeltaBits[i] == 0L) {
        // segment ords perfectly match global ordinals
        // likely in case of low cardinalities and large segments
        segmentToGlobalOrds[i] = LongValues.IDENTITY;
      } else {
        final int bitsRequired = ordDeltaBits[i] < 0 ? 64 : PackedInts.bitsRequired(ordDeltaBits[i]);
        final long monotonicBits = deltas.ramBytesUsed() * 8;
        final long packedBits = bitsRequired * deltas.size();
        if (deltas.size() <= Integer.MAX_VALUE
            && packedBits <= monotonicBits * (1 + acceptableOverheadRatio)) {
          // monotonic compression mostly adds overhead, let's keep the mapping in plain packed ints
          final int size = (int) deltas.size();
          final PackedInts.Mutable newDeltas = PackedInts.getMutable(size, bitsRequired, acceptableOverheadRatio);
          final PackedLongValues.Iterator it = deltas.iterator();
          for (int ord = 0; ord < size; ++ord) {
            newDeltas.set(ord, it.next());
          }
          assert it.hasNext() == false;
          segmentToGlobalOrds[i] = new LongValues() {
            @Override
            public long get(long ord) {
              return ord + newDeltas.get((int) ord);
            }
          };
          ramBytesUsed += newDeltas.ramBytesUsed();
        } else {
          segmentToGlobalOrds[i] = new LongValues() {
            @Override
            public long get(long ord) {
              return ord + deltas.get(ord);
            }
          };
          ramBytesUsed += deltas.ramBytesUsed();
        }
        ramBytesUsed += RamUsageEstimator.shallowSizeOf(segmentToGlobalOrds[i]);
      }
    }
    this.ramBytesUsed = ramBytesUsed;
  }

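  // Illustrative sketch (not part of the original source): a small trace of what the constructor builds.
  // Assume two equally weighted segments (so SegmentMap keeps their order) with terms {a, c} in
  // segment 0 and {b, c} in segment 1.  The merged (global) ords are a=0, b=1, c=2, so:
  //   firstSegments   = [0, 1, 0]   (first segment containing each global ord)
  //   globalOrdDeltas = [0, 1, 1]   (globalOrd minus the ord in that first segment)
  //   segmentToGlobalOrds[0] maps {0, 1} -> {0, 2} and segmentToGlobalOrds[1] maps {0, 1} -> {1, 2}.
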
  /**
   * Given a segment number, return a {@link LongValues} instance that maps
   * segment ordinals to global ordinals.
   */
  public LongValues getGlobalOrds(int segmentIndex) {
    return segmentToGlobalOrds[segmentMap.oldToNew(segmentIndex)];
  }

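  // Illustrative sketch (not part of the original source): mapping the ords of a document's
  // SortedSetDocValues into global ord space while walking one leaf of a reader.  The map, the
  // per-leaf doc values "dv", the leaf index "leafOrd" and the doc id are assumptions made for
  // the example.
  //
  //   LongValues globalOrds = map.getGlobalOrds(leafOrd);
  //   if (dv.advanceExact(doc)) {
  //     for (long ord = dv.nextOrd(); ord != SortedSetDocValues.NO_MORE_ORDS; ord = dv.nextOrd()) {
  //       long globalOrd = globalOrds.get(ord);
  //       // count/collect globalOrd here
  //     }
  //   }
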
  /**
   * Given a global ordinal, returns the ordinal of the first segment which contains
   * this ordinal (the corresponding segment number is returned by {@link #getFirstSegmentNumber}).
   */
  public long getFirstSegmentOrd(long globalOrd) {
    return globalOrd - globalOrdDeltas.get(globalOrd);
  }

  /**
   * Given a global ordinal, returns the index of the first
   * segment that contains this term.
   */
  public int getFirstSegmentNumber(long globalOrd) {
    return segmentMap.newToOld((int) firstSegments.get(globalOrd));
  }

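  // Illustrative sketch (not part of the original source): resolving the term for a global ord by
  // looking it up in the first segment that contains it.  The map and the per-segment
  // SortedDocValues array "values" (indexed by segment number) are assumptions made for the example.
  //
  //   int segment = map.getFirstSegmentNumber(globalOrd);
  //   long segmentOrd = map.getFirstSegmentOrd(globalOrd);
  //   BytesRef term = values[segment].lookupOrd((int) segmentOrd);
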
  /**
   * Returns the total number of unique terms in global ord space.
   */
  public long getValueCount() {
    return globalOrdDeltas.size();
  }

  @Override
  public long ramBytesUsed() {
    return ramBytesUsed;
  }

  @Override
  public Collection<Accountable> getChildResources() {
    List<Accountable> resources = new ArrayList<>();
    resources.add(Accountables.namedAccountable("global ord deltas", globalOrdDeltas));
    resources.add(Accountables.namedAccountable("first segments", firstSegments));
    resources.add(Accountables.namedAccountable("segment map", segmentMap));
    // TODO: would be nice to return actual child segment deltas too, but the optimizations are confusing
    return resources;
  }
}