/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
/**
* Merges segments of approximately equal size, subject to
* an allowed number of segments per tier. This is similar
* to {@link LogByteSizeMergePolicy}, except this merge
* policy is able to merge non-adjacent segments, and
* separates how many segments are merged at once ({@link
* #setMaxMergeAtOnce}) from how many segments are allowed
* per tier ({@link #setSegmentsPerTier}). This merge
* policy also does not over-merge (i.e. cascade merges).
*
* <p>For normal merging, this policy first computes a
* "budget" of how many segments are allowed to be in the
* index. If the index is over-budget, then the policy
* sorts segments by decreasing size (pro-rating by percent
* deletes), and then finds the least-cost merge. Merge
* cost is measured by a combination of the "skew" of the
* merge (size of the largest segment divided by the smallest segment),
* total merge size, and percent deletes reclaimed,
* so that merges with lower skew, smaller size,
* and more deletes reclaimed are favored.
*
* <p>If a merge will produce a segment that's larger than
* {@link #setMaxMergedSegmentMB}, then the policy will
* merge fewer segments (down to 1 at once, if that one has
* deletions) to keep the segment size under budget.
*
* <p><b>NOTE</b>: this policy freely merges non-adjacent
* segments; if this is a problem, use {@link
* LogMergePolicy}.
*
* <p><b>NOTE</b>: This policy always merges by byte size
* of the segments, and always pro-rates by percent deletes.
*
* <p><b>NOTE</b>: Starting with Lucene 7.5, there are several changes:
*
* <ul>
* <li>findForcedMerges and findForcedDeletesMerges respect the max segment
* size by default.</li>
*
* <li>When findForcedMerges is called with a maxSegmentCount other than 1,
* the resulting index is not guaranteed to have {@code <=} maxSegmentCount
* segments. Rather, it is on a "best effort" basis. Specifically, the
* theoretical ideal segment size is calculated and a "fudge factor" of 25%
* is added as the new maxSegmentSize, which is respected.</li>
*
* <li>findForcedDeletesMerges will not produce segments greater than
* maxSegmentSize.</li>
* </ul>
*
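* <p>A minimal configuration sketch (the {@code analyzer} variable is assumed
* to exist, and the values shown are illustrative, not requirements of this
* class):
*
* <pre class="prettyprint">
* TieredMergePolicy tmp = new TieredMergePolicy();
* tmp.setMaxMergedSegmentMB(5 * 1024);   // cap merged segments at roughly 5 GB
* tmp.setSegmentsPerTier(10.0);          // segments allowed per tier
* tmp.setMaxMergeAtOnce(10);             // segments merged together in a normal merge
*
* IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
* iwc.setMergePolicy(tmp);
* </pre>
*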
* @lucene.experimental
*/
// TODO
// - we could try to take into account whether a large
// merge is already running (under CMS) and then bias
// ourselves towards picking smaller merges if so (or,
// maybe CMS should do so)
public class TieredMergePolicy extends MergePolicy {
/** Default noCFSRatio. If a merge's size is {@code >= 10%} of
* the index, then we disable compound file for it.
* @see MergePolicy#setNoCFSRatio */
public static final double DEFAULT_NO_CFS_RATIO = 0.1;
// User-specified maxMergeAtOnce. In practice we always take the min of its
// value and segsPerTier to avoid suboptimal merging.
private int maxMergeAtOnce = 10;
private long maxMergedSegmentBytes = 5*1024*1024*1024L;
private int maxMergeAtOnceExplicit = 30;
private long floorSegmentBytes = 2*1024*1024L;
private double segsPerTier = 10.0;
private double forceMergeDeletesPctAllowed = 10.0;
private double deletesPctAllowed = 33.0;
/** Sole constructor, setting all settings to their
* defaults. */
public TieredMergePolicy() {
super(DEFAULT_NO_CFS_RATIO, MergePolicy.DEFAULT_MAX_CFS_SEGMENT_SIZE);
}
/** Maximum number of segments to be merged at a time
* during "normal" merging. For explicit merging (eg,
* forceMerge or forceMergeDeletes was called), see {@link
* #setMaxMergeAtOnceExplicit}. Default is 10. */
public TieredMergePolicy setMaxMergeAtOnce(int v) {
if (v < 2) {
throw new IllegalArgumentException("maxMergeAtOnce must be > 1 (got " + v + ")");
}
maxMergeAtOnce = v;
return this;
}
private enum MERGE_TYPE {
NATURAL, FORCE_MERGE, FORCE_MERGE_DELETES
}
/** Returns the current maxMergeAtOnce setting.
*
* @see #setMaxMergeAtOnce */
public int getMaxMergeAtOnce() {
return maxMergeAtOnce;
}
// TODO: should addIndexes do explicit merging, too? And,
// if user calls IW.maybeMerge "explicitly"
/** Maximum number of segments to be merged at a time,
* during forceMerge or forceMergeDeletes. Default is 30. */
public TieredMergePolicy setMaxMergeAtOnceExplicit(int v) {
if (v < 2) {
throw new IllegalArgumentException("maxMergeAtOnceExplicit must be > 1 (got " + v + ")");
}
maxMergeAtOnceExplicit = v;
return this;
}
/** Returns the current maxMergeAtOnceExplicit setting.
*
* @see #setMaxMergeAtOnceExplicit */
public int getMaxMergeAtOnceExplicit() {
return maxMergeAtOnceExplicit;
}
/** Maximum sized segment to produce during
* normal merging. This setting is approximate: the
* estimate of the merged segment size is made by summing
* sizes of to-be-merged segments (compensating for
* percent deleted docs). Default is 5 GB. */
public TieredMergePolicy setMaxMergedSegmentMB(double v) {
if (v < 0.0) {
throw new IllegalArgumentException("maxMergedSegmentMB must be >=0 (got " + v + ")");
}
v *= 1024 * 1024;
maxMergedSegmentBytes = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v;
return this;
}
/** Returns the current maxMergedSegmentMB setting.
*
* @see #setMaxMergedSegmentMB */
public double getMaxMergedSegmentMB() {
return maxMergedSegmentBytes/1024.0/1024.0;
}
/** Controls the maximum percentage of deleted documents that is tolerated in
* the index. Lower values make the index more space efficient at the
* expense of increased CPU and I/O activity. Values must be between 20 and
* 50. Default value is 33. */
public TieredMergePolicy setDeletesPctAllowed(double v) {
if (v < 20 || v > 50) {
throw new IllegalArgumentException("indexPctDeletedTarget must be >= 20.0 and <= 50 (got " + v + ")");
}
deletesPctAllowed = v;
return this;
}
/** Returns the current deletesPctAllowed setting.
*
* @see #setDeletesPctAllowed */
public double getDeletesPctAllowed() {
return deletesPctAllowed;
}
/** Segments smaller than this are "rounded up" to this
* size, ie treated as equal (floor) size for merge
* selection. This is to prevent frequent flushing of
* tiny segments from allowing a long tail in the index.
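* For example, with the default 2 MB floor, a freshly flushed 300 KB segment
* and a 1.9 MB segment (illustrative sizes) are both scored as if they were
* 2 MB when selecting merges.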
* Default is 2 MB. */
public TieredMergePolicy setFloorSegmentMB(double v) {
if (v <= 0.0) {
throw new IllegalArgumentException("floorSegmentMB must be > 0.0 (got " + v + ")");
}
v *= 1024 * 1024;
floorSegmentBytes = v > Long.MAX_VALUE ? Long.MAX_VALUE : (long) v;
return this;
}
/** Returns the current floorSegmentMB.
*
* @see #setFloorSegmentMB */
public double getFloorSegmentMB() {
return floorSegmentBytes/(1024*1024.);
}
/** When forceMergeDeletes is called, we only merge away a
* segment if its delete percentage is over this
* threshold. Default is 10%. */
public TieredMergePolicy setForceMergeDeletesPctAllowed(double v) {
if (v < 0.0 || v > 100.0) {
throw new IllegalArgumentException("forceMergeDeletesPctAllowed must be between 0.0 and 100.0 inclusive (got " + v + ")");
}
forceMergeDeletesPctAllowed = v;
return this;
}
/** Returns the current forceMergeDeletesPctAllowed setting.
*
* @see #setForceMergeDeletesPctAllowed */
public double getForceMergeDeletesPctAllowed() {
return forceMergeDeletesPctAllowed;
}
/** Sets the allowed number of segments per tier. Smaller
* values mean more merging but fewer segments.
*
* <p>Default is 10.0.</p> */
public TieredMergePolicy setSegmentsPerTier(double v) {
if (v < 2.0) {
throw new IllegalArgumentException("segmentsPerTier must be >= 2.0 (got " + v + ")");
}
segsPerTier = v;
return this;
}
/** Returns the current segmentsPerTier setting.
*
* @see #setSegmentsPerTier */
public double getSegmentsPerTier() {
return segsPerTier;
}
private static class SegmentSizeAndDocs {
private final SegmentCommitInfo segInfo;
private final long sizeInBytes;
private final int delCount;
private final int maxDoc;
private final String name;
SegmentSizeAndDocs(SegmentCommitInfo info, final long sizeInBytes, final int segDelCount) throws IOException {
segInfo = info;
this.name = info.info.name;
this.sizeInBytes = sizeInBytes;
this.delCount = segDelCount;
this.maxDoc = info.info.maxDoc();
}
}
/** Holds score and explanation for a single candidate
* merge. */
protected static abstract class MergeScore {
/** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected MergeScore() {
}
/** Returns the score for this merge candidate; lower
* scores are better. */
abstract double getScore();
/** Human readable explanation of how the merge got this
* score. */
abstract String getExplanation();
}
// The size can change concurrently while we are running here, because deletes
// are now applied concurrently, and this can piss off TimSort! So we
// call size() once per segment and sort by that:
private List<SegmentSizeAndDocs> getSortedBySegmentSize(final SegmentInfos infos, final MergeContext mergeContext) throws IOException {
List<SegmentSizeAndDocs> sortedBySize = new ArrayList<>();
for (SegmentCommitInfo info : infos) {
sortedBySize.add(new SegmentSizeAndDocs(info, size(info, mergeContext), mergeContext.numDeletesToMerge(info)));
}
sortedBySize.sort((o1, o2) -> {
// Sort by largest size:
int cmp = Long.compare(o2.sizeInBytes, o1.sizeInBytes);
if (cmp == 0) {
cmp = o1.name.compareTo(o2.name);
}
return cmp;
});
return sortedBySize;
}
@Override
public MergeSpecification findMerges(MergeTrigger mergeTrigger, SegmentInfos infos, MergeContext mergeContext) throws IOException {
final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
// Compute total index bytes & print details about the index
long totIndexBytes = 0;
long minSegmentBytes = Long.MAX_VALUE;
int totalDelDocs = 0;
int totalMaxDoc = 0;
long mergingBytes = 0;
List<SegmentSizeAndDocs> sortedInfos = getSortedBySegmentSize(infos, mergeContext);
Iterator<SegmentSizeAndDocs> iter = sortedInfos.iterator();
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
final long segBytes = segSizeDocs.sizeInBytes;
if (verbose(mergeContext)) {
String extra = merging.contains(segSizeDocs.segInfo) ? " [merging]" : "";
if (segBytes >= maxMergedSegmentBytes) {
extra += " [skip: too large]";
} else if (segBytes < floorSegmentBytes) {
extra += " [floored]";
}
message(" seg=" + segString(mergeContext, Collections.singleton(segSizeDocs.segInfo)) + " size=" + String.format(Locale.ROOT, "%.3f", segBytes / 1024 / 1024.) + " MB" + extra, mergeContext);
}
if (merging.contains(segSizeDocs.segInfo)) {
mergingBytes += segSizeDocs.sizeInBytes;
iter.remove();
// if this segment is merging, then its deletes are being reclaimed already.
// only count live docs in the total max doc
totalMaxDoc += segSizeDocs.maxDoc - segSizeDocs.delCount;
} else {
totalDelDocs += segSizeDocs.delCount;
totalMaxDoc += segSizeDocs.maxDoc;
}
minSegmentBytes = Math.min(segBytes, minSegmentBytes);
totIndexBytes += segBytes;
}
assert totalMaxDoc >= 0;
assert totalDelDocs >= 0;
final double totalDelPct = 100 * (double) totalDelDocs / totalMaxDoc;
int allowedDelCount = (int) (deletesPctAllowed * totalMaxDoc / 100);
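// For example (illustrative numbers, not from any particular index): with
// deletesPctAllowed=33 and totalMaxDoc=1,000,000, up to 330,000 deleted
// documents are tolerated before deletes alone put the index over budget.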
// If we have too-large segments, grace them out of the maximum segment count
// If we're above certain thresholds of deleted docs, we can merge very large segments.
int tooBigCount = 0;
iter = sortedInfos.iterator();
// remove large segments from consideration under two conditions.
// 1> Overall percent deleted docs relatively small and this segment is larger than 50% maxSegSize
// 2> overall percent deleted docs large and this segment is large and has few deleted docs
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
double segDelPct = 100 * (double) segSizeDocs.delCount / (double) segSizeDocs.maxDoc;
if (segSizeDocs.sizeInBytes > maxMergedSegmentBytes / 2 && (totalDelPct <= deletesPctAllowed || segDelPct <= deletesPctAllowed)) {
iter.remove();
tooBigCount++; // Just for reporting purposes.
totIndexBytes -= segSizeDocs.sizeInBytes;
allowedDelCount -= segSizeDocs.delCount;
}
}
allowedDelCount = Math.max(0, allowedDelCount);
final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier);
// Compute max allowed segments in the index
long levelSize = Math.max(minSegmentBytes, floorSegmentBytes);
long bytesLeft = totIndexBytes;
double allowedSegCount = 0;
while (true) {
final double segCountLevel = bytesLeft / (double) levelSize;
if (segCountLevel < segsPerTier || levelSize == maxMergedSegmentBytes) {
allowedSegCount += Math.ceil(segCountLevel);
break;
}
allowedSegCount += segsPerTier;
bytesLeft -= segsPerTier * levelSize;
levelSize = Math.min(maxMergedSegmentBytes, levelSize * mergeFactor);
}
// allowedSegCount may occasionally be less than segsPerTier
// if segment sizes are below the floor size
allowedSegCount = Math.max(allowedSegCount, segsPerTier);
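// Worked example (illustrative numbers, not from any particular index): with
// floorSegmentBytes=2MB, segsPerTier=10 and mergeFactor=10, a ~2 GB index allows
// roughly 10 segments at the 2 MB level, 10 at 20 MB and ~10 more at 200 MB,
// so allowedSegCount comes out near 30 before natural merging is triggered.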
if (verbose(mergeContext) && tooBigCount > 0) {
message(" allowedSegmentCount=" + allowedSegCount + " vs count=" + infos.size() +
" (eligible count=" + sortedInfos.size() + ") tooBigCount= " + tooBigCount, mergeContext);
}
return doFindMerges(sortedInfos, maxMergedSegmentBytes, mergeFactor, (int) allowedSegCount, allowedDelCount, MERGE_TYPE.NATURAL,
mergeContext, mergingBytes >= maxMergedSegmentBytes);
}
private MergeSpecification doFindMerges(List<SegmentSizeAndDocs> sortedEligibleInfos,
final long maxMergedSegmentBytes,
final int mergeFactor, final int allowedSegCount,
final int allowedDelCount, final MERGE_TYPE mergeType,
MergeContext mergeContext,
boolean maxMergeIsRunning) throws IOException {
List<SegmentSizeAndDocs> sortedEligible = new ArrayList<>(sortedEligibleInfos);
Map<SegmentCommitInfo, SegmentSizeAndDocs> segInfosSizes = new HashMap<>();
for (SegmentSizeAndDocs segSizeDocs : sortedEligible) {
segInfosSizes.put(segSizeDocs.segInfo, segSizeDocs);
}
int originalSortedSize = sortedEligible.size();
if (verbose(mergeContext)) {
message("findMerges: " + originalSortedSize + " segments", mergeContext);
}
if (originalSortedSize == 0) {
return null;
}
final Set<SegmentCommitInfo> toBeMerged = new HashSet<>();
MergeSpecification spec = null;
// Cycle to possibly select more than one merge:
// The trigger point for total deleted documents in the index leads to a bunch of large segment
// merges at the same time. So only put one large merge in the list of merges per cycle. We'll pick up another
// merge next time around.
boolean haveOneLargeMerge = false;
while (true) {
// Gather eligible segments for merging, ie segments
// not already being merged and not already picked (by
// prior iteration of this loop) for merging:
// Remove ineligible segments. These are either already being merged or already picked by prior iterations
Iterator<SegmentSizeAndDocs> iter = sortedEligible.iterator();
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
if (toBeMerged.contains(segSizeDocs.segInfo)) {
iter.remove();
}
}
if (verbose(mergeContext)) {
message(" allowedSegmentCount=" + allowedSegCount + " vs count=" + originalSortedSize + " (eligible count=" + sortedEligible.size() + ")", mergeContext);
}
if (sortedEligible.size() == 0) {
return spec;
}
final int remainingDelCount = sortedEligible.stream().mapToInt(c -> c.delCount).sum();
if (mergeType == MERGE_TYPE.NATURAL &&
sortedEligible.size() <= allowedSegCount &&
remainingDelCount <= allowedDelCount) {
return spec;
}
// OK we are over budget -- find best merge!
MergeScore bestScore = null;
List<SegmentCommitInfo> best = null;
boolean bestTooLarge = false;
long bestMergeBytes = 0;
for (int startIdx = 0; startIdx < sortedEligible.size(); startIdx++) {
long totAfterMergeBytes = 0;
final List<SegmentCommitInfo> candidate = new ArrayList<>();
boolean hitTooLarge = false;
long bytesThisMerge = 0;
for (int idx = startIdx; idx < sortedEligible.size() && candidate.size() < mergeFactor && bytesThisMerge < maxMergedSegmentBytes; idx++) {
final SegmentSizeAndDocs segSizeDocs = sortedEligible.get(idx);
final long segBytes = segSizeDocs.sizeInBytes;
if (totAfterMergeBytes + segBytes > maxMergedSegmentBytes) {
hitTooLarge = true;
if (candidate.size() == 0) {
// We should never have something coming in that _cannot_ be merged, so handle singleton merges
candidate.add(segSizeDocs.segInfo);
bytesThisMerge += segBytes;
}
// NOTE: we continue, so that we can try
// "packing" smaller segments into this merge
// to see if we can get closer to the max
// size; this in general is not perfect since
// this is really "bin packing" and we'd have
// to try different permutations.
continue;
}
candidate.add(segSizeDocs.segInfo);
bytesThisMerge += segBytes;
totAfterMergeBytes += segBytes;
}
// We should never see an empty candidate: we iterated over maxMergeAtOnce
// segments, and already pre-excluded the too-large segments:
assert candidate.size() > 0;
// A singleton merge with no deletes makes no sense. We can get here when forceMerge is looping around...
if (candidate.size() == 1) {
SegmentSizeAndDocs segSizeDocs = segInfosSizes.get(candidate.get(0));
if (segSizeDocs.delCount == 0) {
continue;
}
}
// If we didn't find a too-large merge and have a list of candidates
// whose length is less than the merge factor, it means we are reaching
// the tail of the list of segments and will only find smaller merges.
// Stop here.
if (bestScore != null &&
hitTooLarge == false &&
candidate.size() < mergeFactor) {
break;
}
final MergeScore score = score(candidate, hitTooLarge, segInfosSizes);
if (verbose(mergeContext)) {
message(" maybe=" + segString(mergeContext, candidate) + " score=" + score.getScore() + " " + score.getExplanation() + " tooLarge=" + hitTooLarge + " size=" + String.format(Locale.ROOT, "%.3f MB", totAfterMergeBytes/1024./1024.), mergeContext);
}
if ((bestScore == null || score.getScore() < bestScore.getScore()) && (!hitTooLarge || !maxMergeIsRunning)) {
best = candidate;
bestScore = score;
bestTooLarge = hitTooLarge;
bestMergeBytes = totAfterMergeBytes;
}
}
if (best == null) {
return spec;
}
// The mergeType == FORCE_MERGE_DELETES behaves as the code does currently and can create a large number of
// concurrent big merges. If we make findForcedDeletesMerges behave as findForcedMerges and cycle through
// we should remove this.
if (haveOneLargeMerge == false || bestTooLarge == false || mergeType == MERGE_TYPE.FORCE_MERGE_DELETES) {
haveOneLargeMerge |= bestTooLarge;
if (spec == null) {
spec = new MergeSpecification();
}
final OneMerge merge = new OneMerge(best);
spec.add(merge);
if (verbose(mergeContext)) {
message(" add merge=" + segString(mergeContext, merge.segments) + " size=" + String.format(Locale.ROOT, "%.3f MB", bestMergeBytes / 1024. / 1024.) + " score=" + String.format(Locale.ROOT, "%.3f", bestScore.getScore()) + " " + bestScore.getExplanation() + (bestTooLarge ? " [max merge]" : ""), mergeContext);
}
}
// Whether we're going to return this list in the spec or not, we need to remove it from
// consideration on the next loop.
toBeMerged.addAll(best);
}
}
/** Expert: scores one merge; subclasses can override. */
protected MergeScore score(List<SegmentCommitInfo> candidate, boolean hitTooLarge, Map<SegmentCommitInfo, SegmentSizeAndDocs> segmentsSizes) throws IOException {
long totBeforeMergeBytes = 0;
long totAfterMergeBytes = 0;
long totAfterMergeBytesFloored = 0;
for(SegmentCommitInfo info : candidate) {
final long segBytes = segmentsSizes.get(info).sizeInBytes;
totAfterMergeBytes += segBytes;
totAfterMergeBytesFloored += floorSize(segBytes);
totBeforeMergeBytes += info.sizeInBytes();
}
// Roughly measure "skew" of the merge, i.e. how
// "balanced" the merge is (whether the segments are
// about the same size), which can range from
// 1.0/numSegsBeingMerged (good) to 1.0 (poor). Heavily
// lopsided merges (skew near 1.0) is no good; it means
// O(N^2) merge cost over time:
final double skew;
if (hitTooLarge) {
// Pretend the merge has perfect skew; skew doesn't
// matter in this case because this merge will not
// "cascade" and so it cannot lead to N^2 merge cost
// over time:
final int mergeFactor = (int) Math.min(maxMergeAtOnce, segsPerTier);
skew = 1.0/mergeFactor;
} else {
skew = ((double) floorSize(segmentsSizes.get(candidate.get(0)).sizeInBytes)) / totAfterMergeBytesFloored;
}
// Strongly favor merges with less skew (smaller
// mergeScore is better):
double mergeScore = skew;
// Gently favor smaller merges over bigger ones. We
// don't want to make this exponent too large else we
// can end up doing poor merges of small segments in
// order to avoid the large merges:
mergeScore *= Math.pow(totAfterMergeBytes, 0.05);
// Strongly favor merges that reclaim deletes:
final double nonDelRatio = ((double) totAfterMergeBytes)/totBeforeMergeBytes;
mergeScore *= Math.pow(nonDelRatio, 2);
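// Illustrative example (hypothetical sizes): merging ten 10 MB segments with no
// deletes gives skew=0.1 and nonDelRatio=1.0, whereas one 90 MB segment plus nine
// 1 MB segments (each floored to 2 MB) gives skew of about 90/108 ~= 0.83, so the
// balanced merge scores much better (lower) than the lopsided one.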
final double finalMergeScore = mergeScore;
return new MergeScore() {
@Override
public double getScore() {
return finalMergeScore;
}
@Override
public String getExplanation() {
return "skew=" + String.format(Locale.ROOT, "%.3f", skew) + " nonDelRatio=" + String.format(Locale.ROOT, "%.3f", nonDelRatio);
}
};
}
@Override
public MergeSpecification findForcedMerges(SegmentInfos infos, int maxSegmentCount, Map<SegmentCommitInfo, Boolean> segmentsToMerge, MergeContext mergeContext) throws IOException {
if (verbose(mergeContext)) {
message("findForcedMerges maxSegmentCount=" + maxSegmentCount + " infos=" + segString(mergeContext, infos) +
" segmentsToMerge=" + segmentsToMerge, mergeContext);
}
List<SegmentSizeAndDocs> sortedSizeAndDocs = getSortedBySegmentSize(infos, mergeContext);
long totalMergeBytes = 0;
final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
// Trim the list down, remove if we're respecting max segment size and it's not original. Presumably it's been merged before and
// is close enough to the max segment size we shouldn't add it in again.
Iterator<SegmentSizeAndDocs> iter = sortedSizeAndDocs.iterator();
boolean forceMergeRunning = false;
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
final Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo);
if (isOriginal == null) {
iter.remove();
} else {
if (merging.contains(segSizeDocs.segInfo)) {
forceMergeRunning = true;
iter.remove();
} else {
totalMergeBytes += segSizeDocs.sizeInBytes;
}
}
}
long maxMergeBytes = maxMergedSegmentBytes;
// Set the maximum segment size based on how many segments have been specified.
if (maxSegmentCount == 1) maxMergeBytes = Long.MAX_VALUE;
else if (maxSegmentCount != Integer.MAX_VALUE) {
// Fudge this up a bit so we have a better chance of not having to rewrite segments. If we use the exact size,
// it's almost guaranteed that the segments won't fit perfectly and we'll be left with more segments than
// we want and have to re-merge in the code at the bottom of this method.
maxMergeBytes = Math.max((long) (((double) totalMergeBytes / (double) maxSegmentCount)), maxMergedSegmentBytes);
maxMergeBytes = (long) ((double) maxMergeBytes * 1.25);
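// Example (hypothetical sizes): forceMerge(5) over ~50 GB of eligible segments
// yields an ideal size of 10 GB per segment, bumped to 12.5 GB by the 25% fudge.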
}
iter = sortedSizeAndDocs.iterator();
boolean foundDeletes = false;
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
Boolean isOriginal = segmentsToMerge.get(segSizeDocs.segInfo);
if (segSizeDocs.delCount != 0) { // This is forceMerge, all segments with deleted docs should be merged.
if (isOriginal != null && isOriginal) {
foundDeletes = true;
}
continue;
}
// Let the scoring handle whether to merge large segments.
if (maxSegmentCount == Integer.MAX_VALUE && isOriginal != null && isOriginal == false) {
iter.remove();
}
// Don't try to merge a segment with no deleted docs that's over the max size.
if (maxSegmentCount != Integer.MAX_VALUE && segSizeDocs.sizeInBytes >= maxMergeBytes) {
iter.remove();
}
}
// Nothing to merge this round.
if (sortedSizeAndDocs.size() == 0) {
return null;
}
// We should never bail if there are segments that have deleted documents; all deleted docs should be purged.
if (foundDeletes == false) {
SegmentCommitInfo infoZero = sortedSizeAndDocs.get(0).segInfo;
if ((maxSegmentCount != Integer.MAX_VALUE && maxSegmentCount > 1 && sortedSizeAndDocs.size() <= maxSegmentCount) ||
(maxSegmentCount == 1 && sortedSizeAndDocs.size() == 1 && (segmentsToMerge.get(infoZero) != null || isMerged(infos, infoZero, mergeContext)))) {
if (verbose(mergeContext)) {
message("already merged", mergeContext);
}
return null;
}
}
if (verbose(mergeContext)) {
message("eligible=" + sortedSizeAndDocs, mergeContext);
}
final int startingSegmentCount = sortedSizeAndDocs.size();
final boolean finalMerge = startingSegmentCount < maxSegmentCount + maxMergeAtOnceExplicit - 1;
if (finalMerge && forceMergeRunning) {
return null;
}
// This is the special case of merging down to one segment
if (sortedSizeAndDocs.size() < maxMergeAtOnceExplicit && maxSegmentCount == 1 && totalMergeBytes < maxMergeBytes) {
MergeSpecification spec = new MergeSpecification();
List<SegmentCommitInfo> allOfThem = new ArrayList<>();
for (SegmentSizeAndDocs segSizeDocs : sortedSizeAndDocs) {
allOfThem.add(segSizeDocs.segInfo);
}
spec.add(new OneMerge(allOfThem));
return spec;
}
MergeSpecification spec = null;
int index = startingSegmentCount - 1;
int resultingSegments = startingSegmentCount;
while (true) {
List<SegmentCommitInfo> candidate = new ArrayList<>();
long currentCandidateBytes = 0L;
int mergesAllowed = maxMergeAtOnceExplicit;
while (index >= 0 && resultingSegments > maxSegmentCount && mergesAllowed > 0) {
final SegmentCommitInfo current = sortedSizeAndDocs.get(index).segInfo;
final int initialCandidateSize = candidate.size();
final long currentSegmentSize = current.sizeInBytes();
// We either add to the bin because there's space or because it is the smallest possible bin, since
// decrementing the index will move us to even larger segments.
if (currentCandidateBytes + currentSegmentSize <= maxMergeBytes || initialCandidateSize < 2) {
candidate.add(current);
--index;
currentCandidateBytes += currentSegmentSize;
--mergesAllowed;
if (initialCandidateSize > 0) {
// Any merge that handles two or more segments reduces the resulting number of segments
// by the number of segments handled - 1
--resultingSegments;
}
} else {
break;
}
}
final int candidateSize = candidate.size();
// While a force merge is running, only merges that cover the maximum allowed number of segments or that create a segment close to the
// maximum allowed segment size are permitted
if (candidateSize > 1 && (forceMergeRunning == false || candidateSize == maxMergeAtOnceExplicit || currentCandidateBytes > 0.7 * maxMergeBytes)) {
final OneMerge merge = new OneMerge(candidate);
if (verbose(mergeContext)) {
message("add merge=" + segString(mergeContext, merge.segments), mergeContext);
}
if (spec == null) {
spec = new MergeSpecification();
}
spec.add(merge);
} else {
return spec;
}
}
}
@Override
public MergeSpecification findForcedDeletesMerges(SegmentInfos infos, MergeContext mergeContext) throws IOException {
if (verbose(mergeContext)) {
message("findForcedDeletesMerges infos=" + segString(mergeContext, infos) + " forceMergeDeletesPctAllowed=" + forceMergeDeletesPctAllowed, mergeContext);
}
// First do a quick check that there's any work to do.
// NOTE: this makes BaseMergePolicyTestCase.testFindForcedDeletesMerges work
final Set<SegmentCommitInfo> merging = mergeContext.getMergingSegments();
boolean haveWork = false;
for(SegmentCommitInfo info : infos) {
int delCount = mergeContext.numDeletesToMerge(info);
assert assertDelCount(delCount, info);
double pctDeletes = 100.*((double) delCount)/info.info.maxDoc();
if (pctDeletes > forceMergeDeletesPctAllowed && !merging.contains(info)) {
haveWork = true;
break;
}
}
if (haveWork == false) {
return null;
}
List<SegmentSizeAndDocs> sortedInfos = getSortedBySegmentSize(infos, mergeContext);
Iterator<SegmentSizeAndDocs> iter = sortedInfos.iterator();
while (iter.hasNext()) {
SegmentSizeAndDocs segSizeDocs = iter.next();
double pctDeletes = 100. * ((double) segSizeDocs.delCount / (double) segSizeDocs.maxDoc);
if (merging.contains(segSizeDocs.segInfo) || pctDeletes <= forceMergeDeletesPctAllowed) {
iter.remove();
}
}
if (verbose(mergeContext)) {
message("eligible=" + sortedInfos, mergeContext);
}
return doFindMerges(sortedInfos, maxMergedSegmentBytes,
maxMergeAtOnceExplicit, Integer.MAX_VALUE, 0, MERGE_TYPE.FORCE_MERGE_DELETES, mergeContext, false);
}
private long floorSize(long bytes) {
return Math.max(floorSegmentBytes, bytes);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("[" + getClass().getSimpleName() + ": ");
sb.append("maxMergeAtOnce=").append(maxMergeAtOnce).append(", ");
sb.append("maxMergeAtOnceExplicit=").append(maxMergeAtOnceExplicit).append(", ");
sb.append("maxMergedSegmentMB=").append(maxMergedSegmentBytes/1024/1024.).append(", ");
sb.append("floorSegmentMB=").append(floorSegmentBytes/1024/1024.).append(", ");
sb.append("forceMergeDeletesPctAllowed=").append(forceMergeDeletesPctAllowed).append(", ");
sb.append("segmentsPerTier=").append(segsPerTier).append(", ");
sb.append("maxCFSSegmentSizeMB=").append(getMaxCFSSegmentSizeMB()).append(", ");
sb.append("noCFSRatio=").append(noCFSRatio).append(", ");
sb.append("deletesPctAllowed=").append(deletesPctAllowed);
return sb.toString();
}
}