/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.SmallFloat;
/**
* A subclass of {@code Similarity} that provides a simplified API for its
* descendants. Subclasses are only required to implement the {@link #score}
* and {@link #toString()} methods. Implementing
* {@link #explain(List, BasicStats, double, double)} is optional,
* inasmuch as SimilarityBase already provides a basic explanation of the score
* and the term frequency. However, implementers of a subclass are encouraged to
* include as much detail about the scoring method as possible.
* <p>
* Note: multi-word queries such as phrase queries are scored in a different way
* than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for
* the phrase as a whole (since it does not know it), this class instead scores
* phrases as a summation of the individual term scores.
* @lucene.experimental
*/
public abstract class SimilarityBase extends Similarity {
/** For {@link #log2(double)}. Precomputed for efficiency reasons. */
private static final double LOG_2 = Math.log(2); // ln(2): the divisor log2(double) uses to change base
/**
* True if overlap tokens (tokens with a position increment of zero) are
* discounted from the document's length.
*/
protected boolean discountOverlaps = true; // read by computeNorm; changed through setDiscountOverlaps
/**
* Sole constructor. (For invocation by subclass
* constructors, typically implicit.)
*/
public SimilarityBase() {
  // Nothing to set up here: discountOverlaps receives its default at its declaration.
}
/** Determines whether overlap tokens (Tokens with
* 0 position increment) are ignored when computing
* norm. By default this is true, meaning overlap
* tokens do not count when computing norms.
*
* @lucene.experimental
*
* @see #computeNorm
*/
public void setDiscountOverlaps(boolean v) {
  // Plain field write; computeNorm consults the flag the next time it runs.
  this.discountOverlaps = v;
}
/**
* Returns true if overlap tokens are discounted from the document's length.
* @see #setDiscountOverlaps
*/
public boolean getDiscountOverlaps() {
  // Current value of the flag, as last set (or the default).
  return this.discountOverlaps;
}
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final int termCount = termStats.length;
  final SimScorer[] perTerm = new SimScorer[termCount];

  // Build one scorer per term, each backed by its own freshly filled stats object.
  int slot = 0;
  for (TermStatistics termStat : termStats) {
    BasicStats stats = newStats(collectionStats.field(), boost);
    fillBasicStats(stats, collectionStats, termStat);
    perTerm[slot++] = new BasicSimScorer(stats);
  }

  // A single term needs no wrapper; any other count is combined by MultiSimScorer.
  return termCount == 1 ? perTerm[0] : new MultiSimilarity.MultiSimScorer(perTerm);
}
/** Factory method to return a custom stats object. */
protected BasicStats newStats(String field, double boost) {
  // Default factory: a plain BasicStats carrying the field name and the boost.
  final BasicStats created = new BasicStats(field, boost);
  return created;
}
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
* Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  // TODO: validate this for real, somewhere else
  assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
  assert termStats.docFreq() <= collectionStats.sumDocFreq();

  // Collection-level figures.
  // TODO: add sumDocFreq for field (numberOfFieldPostings)
  final long docCount = collectionStats.docCount();
  final long fieldTokens = collectionStats.sumTotalTermFreq();
  stats.setNumberOfDocuments(docCount);
  stats.setNumberOfFieldTokens(fieldTokens);
  // Cast before dividing so the average keeps its fractional part.
  stats.setAvgFieldLength(fieldTokens / (double) docCount);

  // Term-level figures.
  stats.setDocFreq(termStats.docFreq());
  stats.setTotalTermFreq(termStats.totalTermFreq());
}
/**
* Scores the document {@code doc}.
* <p>Subclasses must apply their scoring formula in this method.</p>
* @param stats the corpus level statistics.
* @param freq the term frequency.
* @param docLen the document length.
* @return the score.
*/
// Abstract: no formula is supplied by this class. Invoked from
// explain(BasicStats, Explanation, double) and from BasicSimScorer.score.
protected abstract double score(BasicStats stats, double freq, double docLen);
/**
* Subclasses should implement this method to explain the score. {@code expl}
* already contains the score, the name of the class and the doc id, as well
* as the term frequency and its explanation; subclasses can add additional
* clauses to explain details of their scoring formulae.
* <p>The default implementation does nothing.</p>
*
* @param subExpls the list of details of the explanation to extend
* @param stats the corpus level statistics.
* @param freq the term frequency.
* @param docLen the document length.
*/
protected void explain(
    List<Explanation> subExpls, BasicStats stats, double freq, double docLen) {
  // Intentionally empty: the default adds nothing to subExpls.
}
/**
* Explains the score. The implementation here provides a basic explanation
* in the format <em>score(name-of-similarity,
* freq=term-frequency), computed from:</em>, and
* attaches the score (computed via the {@link #score(BasicStats, double, double)}
* method) and the explanation for the term frequency. Subclasses content with
* this format may add additional details in
* {@link #explain(List, BasicStats, double, double)}.
*
* @param stats the corpus level statistics.
* @param freq the term frequency and its explanation.
* @param docLen the document length.
* @return the explanation.
*/
protected Explanation explain(
    BasicStats stats, Explanation freq, double docLen) {
  // The frequency is narrowed to float once and reused for both the hook and the score.
  final float termFreq = freq.getValue().floatValue();

  // Give the overridable hook the first chance to contribute detail clauses.
  final List<Explanation> details = new ArrayList<>();
  explain(details, stats, termFreq, docLen);

  final float scoreValue = (float) score(stats, termFreq, docLen);
  final String description =
      "score(" + getClass().getSimpleName() + ", freq=" + freq.getValue() + "), computed from:";
  return Explanation.match(scoreValue, description, details);
}
/**
* Subclasses must override this method to return the name of the Similarity
* and preferably the values of parameters (if any) as well.
*/
@Override
// Re-declared abstract so that concrete subclasses have to provide their own implementation.
public abstract String toString();
// ------------------------------ Norm handling ------------------------------
/** Cache of decoded bytes. */
private static final float[] LENGTH_TABLE = new float[256];

static {
  // Decode every possible byte value once, up front; the byte cast wraps indices above 127.
  int code = 0;
  while (code < 256) {
    LENGTH_TABLE[code] = SmallFloat.byte4ToInt((byte) code);
    code++;
  }
}
/** Encodes the document length in the same way as {@link BM25Similarity}. */
@Override
public final long computeNorm(FieldInvertState state) {
  // The created-version check is only evaluated when the field is indexed with DOCS options.
  final boolean docsOnlySinceV8 =
      state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8;

  final int lengthForNorm;
  if (docsOnlySinceV8) {
    lengthForNorm = state.getUniqueTermCount();
  } else {
    // Overlap tokens are subtracted only while discounting is switched on.
    lengthForNorm = discountOverlaps
        ? state.getLength() - state.getNumOverlap()
        : state.getLength();
  }
  return SmallFloat.intToByte4(lengthForNorm);
}
// ----------------------------- Static methods ------------------------------
/** Returns the base two logarithm of {@code x}. */
public static double log2(double x) {
  // Put this to a 'util' class if we need more of these.
  // Change of base: log2(x) = ln(x) / ln(2), with ln(2) taken from the precomputed constant.
  final double naturalLog = Math.log(x);
  return naturalLog / LOG_2;
}
// --------------------------------- Classes ---------------------------------
/** Delegates the {@link #score(float, long)} and
* {@link #explain(Explanation, long)} methods to
* {@link SimilarityBase#score(BasicStats, double, double)} and
* {@link SimilarityBase#explain(BasicStats, Explanation, double)},
* respectively.
*/
final class BasicSimScorer extends SimScorer {
  // Statistics handed to the enclosing similarity on every score/explain call.
  final BasicStats stats;

  BasicSimScorer(BasicStats stats) {
    this.stats = stats;
  }

  // Looks up the decoded length for a norm; only the low eight bits of the norm are used.
  double getLengthValue(long norm) {
    final int slot = ((byte) norm) & 0xFF;
    return LENGTH_TABLE[slot];
  }

  @Override
  public float score(float freq, long norm) {
    final double docLen = getLengthValue(norm);
    final double rawScore = SimilarityBase.this.score(stats, freq, docLen);
    return (float) rawScore;
  }

  @Override
  public Explanation explain(Explanation freq, long norm) {
    final double docLen = getLengthValue(norm);
    return SimilarityBase.this.explain(stats, freq, docLen);
  }
}
}