/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.search.uhighlight;

import java.text.BreakIterator;
import java.text.CharacterIterator;

Wraps another BreakIterator to skip past breaks that would result in passages that are too short. It's still possible to get a short passage but only at the very end of the input text.

Important: This is not a general purpose BreakIterator; it's only designed to work in a way compatible with the UnifiedHighlighter. Some assumptions are checked with Java assertions.

@lucene.experimental
/** * Wraps another {@link BreakIterator} to skip past breaks that would result in passages that are too * short. It's still possible to get a short passage but only at the very end of the input text. * <p> * Important: This is not a general purpose {@link BreakIterator}; it's only designed to work in a way * compatible with the {@link UnifiedHighlighter}. Some assumptions are checked with Java assertions. * * @lucene.experimental */
public class LengthGoalBreakIterator extends BreakIterator { private final BreakIterator baseIter; private final int lengthGoal; private final float fragmentAlignment; // how much text to align before match-fragment, valid in range [0, 1] private final boolean isMinimumLength; // if false then is "closest to" length private int currentCache;
Breaks will be at least minLength apart (to the extent possible), while trying to position the match inside the fragment according to fragmentAlignment.
/** * Breaks will be at least {@code minLength} apart (to the extent possible), * while trying to position the match inside the fragment according to {@code fragmentAlignment}. */
public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength, float fragmentAlignment) { return new LengthGoalBreakIterator(baseIter, minLength, fragmentAlignment, true, baseIter.current()); }
For backwards compatibility you can initialise the break iterator without fragmentAlignment.
/** For backwards compatibility you can initialise the break iterator without fragmentAlignment. */
@Deprecated public static LengthGoalBreakIterator createMinLength(BreakIterator baseIter, int minLength) { return createMinLength(baseIter, minLength, 0.f); }
Breaks will be on average targetLength apart; the closest break to this target (before or after) is chosen. The match will be positioned according to fragmentAlignment as much as possible.
/** * Breaks will be on average {@code targetLength} apart; the closest break to this target (before or after) * is chosen. The match will be positioned according to {@code fragmentAlignment} as much as possible. */
public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength, float fragmentAlignment) { return new LengthGoalBreakIterator(baseIter, targetLength, fragmentAlignment, false, baseIter.current()); }
For backwards compatibility you can initialise the break iterator without fragmentAlignment.
/** For backwards compatibility you can initialise the break iterator without fragmentAlignment. */
@Deprecated public static LengthGoalBreakIterator createClosestToLength(BreakIterator baseIter, int targetLength) { return createClosestToLength(baseIter, targetLength, 0.f); } private LengthGoalBreakIterator(BreakIterator baseIter, int lengthGoal, float fragmentAlignment, boolean isMinimumLength, int currentCache) { this.baseIter = baseIter; this.currentCache = currentCache; this.lengthGoal = lengthGoal; if (fragmentAlignment < 0.f || fragmentAlignment > 1.f || !Float.isFinite(fragmentAlignment)) { throw new IllegalArgumentException("fragmentAlignment must be >= zero and <= one"); } this.fragmentAlignment = fragmentAlignment; this.isMinimumLength = isMinimumLength; } // note: the only methods that will get called are setText(txt), getText(), // getSummaryPassagesNoHighlight: current(), first(), next() // highlightOffsetsEnums: preceding(int), and following(int) // Nonetheless we make some attempt to implement the rest; mostly delegating. @Override public String toString() { String goalDesc = isMinimumLength ? "minLen" : "targetLen"; return getClass().getSimpleName() + "{" + goalDesc + "=" + lengthGoal + ", fragAlign=" + fragmentAlignment + ", baseIter=" + baseIter + "}"; } @Override public Object clone() { return new LengthGoalBreakIterator( (BreakIterator) baseIter.clone(), lengthGoal, fragmentAlignment, isMinimumLength, currentCache ); } @Override public CharacterIterator getText() { return baseIter.getText(); } @Override public void setText(String newText) { baseIter.setText(newText); currentCache = baseIter.current(); } @Override public void setText(CharacterIterator newText) { baseIter.setText(newText); currentCache = baseIter.current(); } @Override public int current() { return currentCache; } @Override public int first() { return currentCache = baseIter.first(); } @Override public int last() { return currentCache = baseIter.last(); } @Override public int next(int n) { assert false : "Not supported"; return baseIter.next(n); // probably wrong } // Called by getSummaryPassagesNoHighlight to generate default summary. @Override public int next() { return following(currentCache, currentCache + lengthGoal); } @Override public int previous() { assert false : "Not supported"; return baseIter.previous(); } @Override public int following(int matchEndIndex) { return following(matchEndIndex, (matchEndIndex + 1) + (int)(lengthGoal * (1.f - fragmentAlignment))); } private int following(int matchEndIndex, int targetIdx) { if (targetIdx >= getText().getEndIndex()) { if (currentCache == baseIter.last()) { return DONE; } return currentCache = baseIter.last(); } final int afterIdx = baseIter.following(targetIdx - 1); if (afterIdx == DONE) { currentCache = baseIter.last(); return DONE; } if (afterIdx == targetIdx) { // right on the money return currentCache = afterIdx; } if (isMinimumLength) { // thus never undershoot return currentCache = afterIdx; } // note: it is a shame that we invoke preceding() *one more time*; BI's are sometimes expensive. // Find closest break to target final int beforeIdx = baseIter.preceding(targetIdx); if (targetIdx - beforeIdx < afterIdx - targetIdx && beforeIdx > matchEndIndex) { return currentCache = beforeIdx; } return currentCache = afterIdx; } // called at start of new Passage given first word start offset @Override public int preceding(int matchStartIndex) { final int targetIdx = (matchStartIndex - 1) - (int)(lengthGoal * fragmentAlignment); if (targetIdx <= 0) { if (currentCache == baseIter.first()) { return DONE; } return currentCache = baseIter.first(); } final int beforeIdx = baseIter.preceding(targetIdx + 1); if (beforeIdx == DONE) { currentCache = baseIter.first(); return DONE; } if (beforeIdx == targetIdx) { // right on the money return currentCache = beforeIdx; } if (isMinimumLength) { // thus never undershoot return currentCache = beforeIdx; } // note: it is a shame that we invoke following() *one more time*; BI's are sometimes expensive. // Find closest break to target final int afterIdx = baseIter.following(targetIdx - 1); if (afterIdx - targetIdx < targetIdx - beforeIdx && afterIdx < matchStartIndex) { return currentCache = afterIdx; } return currentCache = beforeIdx; } @Override public boolean isBoundary(int offset) { assert false : "Not supported"; return baseIter.isBoundary(offset); } }