/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search.similarities;


import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;

BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker, Susan Jones, Micheline Hancock-Beaulieu, and Mike Gatford. Okapi at TREC-3. In Proceedings of the Third Text REtrieval Conference (TREC 1994). Gaithersburg, USA, November 1994.
/** * BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker, * Susan Jones, Micheline Hancock-Beaulieu, and Mike Gatford. Okapi at TREC-3. * In Proceedings of the Third <b>T</b>ext <b>RE</b>trieval <b>C</b>onference (TREC 1994). * Gaithersburg, USA, November 1994. */
public class BM25Similarity extends Similarity { private final float k1; private final float b;
BM25 with the supplied parameter values.
Params:
  • k1 – Controls non-linear term frequency normalization (saturation).
  • b – Controls to what degree document length normalizes tf values.
Throws:
/** * BM25 with the supplied parameter values. * @param k1 Controls non-linear term frequency normalization (saturation). * @param b Controls to what degree document length normalizes tf values. * @throws IllegalArgumentException if {@code k1} is infinite or negative, or if {@code b} is * not within the range {@code [0..1]} */
public BM25Similarity(float k1, float b) { if (Float.isFinite(k1) == false || k1 < 0) { throw new IllegalArgumentException("illegal k1 value: " + k1 + ", must be a non-negative finite value"); } if (Float.isNaN(b) || b < 0 || b > 1) { throw new IllegalArgumentException("illegal b value: " + b + ", must be between 0 and 1"); } this.k1 = k1; this.b = b; }
BM25 with these default values:
  • k1 = 1.2
  • b = 0.75
/** BM25 with these default values: * <ul> * <li>{@code k1 = 1.2}</li> * <li>{@code b = 0.75}</li> * </ul> */
public BM25Similarity() { this(1.2f, 0.75f); }
Implemented as log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5)).
/** Implemented as <code>log(1 + (docCount - docFreq + 0.5)/(docFreq + 0.5))</code>. */
protected float idf(long docFreq, long docCount) { return (float) Math.log(1 + (docCount - docFreq + 0.5D)/(docFreq + 0.5D)); }
The default implementation returns 1
/** The default implementation returns <code>1</code> */
protected float scorePayload(int doc, int start, int end, BytesRef payload) { return 1; }
The default implementation computes the average as sumTotalTermFreq / docCount
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
protected float avgFieldLength(CollectionStatistics collectionStats) { return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount()); }
True if overlap tokens (tokens with a position of increment of zero) are discounted from the document's length.
/** * True if overlap tokens (tokens with a position of increment of zero) are * discounted from the document's length. */
protected boolean discountOverlaps = true;
Sets whether overlap tokens (Tokens with 0 position increment) are ignored when computing norm. By default this is true, meaning overlap tokens do not count when computing norms.
/** Sets whether overlap tokens (Tokens with 0 position increment) are * ignored when computing norm. By default this is true, meaning overlap * tokens do not count when computing norms. */
public void setDiscountOverlaps(boolean v) { discountOverlaps = v; }
Returns true if overlap tokens are discounted from the document's length.
See Also:
  • setDiscountOverlaps
/** * Returns true if overlap tokens are discounted from the document's length. * @see #setDiscountOverlaps */
public boolean getDiscountOverlaps() { return discountOverlaps; }
Cache of decoded bytes.
/** Cache of decoded bytes. */
private static final float[] LENGTH_TABLE = new float[256]; static { for (int i = 0; i < 256; i++) { LENGTH_TABLE[i] = SmallFloat.byte4ToInt((byte) i); } } @Override public final long computeNorm(FieldInvertState state) { final int numTerms; if (state.getIndexOptions() == IndexOptions.DOCS && state.getIndexCreatedVersionMajor() >= 8) { numTerms = state.getUniqueTermCount(); } else if (discountOverlaps) { numTerms = state.getLength() - state.getNumOverlap(); } else { numTerms = state.getLength(); } return SmallFloat.intToByte4(numTerms); }
Computes a score factor for a simple term and returns an explanation for that score factor.

The default implementation uses:

idf(docFreq, docCount);
Note that CollectionStatistics.docCount() is used instead of IndexReader#numDocs() because also TermStatistics.docFreq() is used, and when the latter is inaccurate, so is CollectionStatistics.docCount(), and in the same direction. In addition, CollectionStatistics.docCount() does not skew when fields are sparse.
Params:
  • collectionStats – collection-level statistics
  • termStats – term-level statistics for the term
Returns:an Explain object that includes both an idf score factor and an explanation for the term.
/** * Computes a score factor for a simple term and returns an explanation * for that score factor. * * <p> * The default implementation uses: * * <pre class="prettyprint"> * idf(docFreq, docCount); * </pre> * * Note that {@link CollectionStatistics#docCount()} is used instead of * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also * {@link TermStatistics#docFreq()} is used, and when the latter * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction. * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the term * @return an Explain object that includes both an idf score factor and an explanation for the term. */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) { final long df = termStats.docFreq(); final long docCount = collectionStats.docCount(); final float idf = idf(df, docCount); return Explanation.match(idf, "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:", Explanation.match(df, "n, number of documents containing term"), Explanation.match(docCount, "N, total number of documents with field")); }
Computes a score factor for a phrase.

The default implementation sums the idf factor for each term in the phrase.

Params:
  • collectionStats – collection-level statistics
  • termStats – term-level statistics for the terms in the phrase
Returns:an Explain object that includes both an idf score factor for the phrase and an explanation for each term.
/** * Computes a score factor for a phrase. * * <p> * The default implementation sums the idf factor for * each term in the phrase. * * @param collectionStats collection-level statistics * @param termStats term-level statistics for the terms in the phrase * @return an Explain object that includes both an idf * score factor for the phrase and an explanation * for each term. */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) { double idf = 0d; // sum into a double before casting into a float List<Explanation> details = new ArrayList<>(); for (final TermStatistics stat : termStats ) { Explanation idfExplain = idfExplain(collectionStats, stat); details.add(idfExplain); idf += idfExplain.getValue().floatValue(); } return Explanation.match((float) idf, "idf, sum of:", details); } @Override public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats); float avgdl = avgFieldLength(collectionStats); float[] cache = new float[256]; for (int i = 0; i < cache.length; i++) { cache[i] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl)); } return new BM25Scorer(boost, k1, b, idf, avgdl, cache); }
Collection statistics for the BM25 model.
/** Collection statistics for the BM25 model. */
private static class BM25Scorer extends SimScorer {
query boost
/** query boost */
private final float boost;
k1 value for scale factor
/** k1 value for scale factor */
private final float k1;
b value for length normalization impact
/** b value for length normalization impact */
private final float b;
BM25's idf
/** BM25's idf */
private final Explanation idf;
The average document length.
/** The average document length. */
private final float avgdl;
precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl)
/** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
private final float[] cache;
weight (idf * boost)
/** weight (idf * boost) */
private final float weight; BM25Scorer(float boost, float k1, float b, Explanation idf, float avgdl, float[] cache) { this.boost = boost; this.idf = idf; this.avgdl = avgdl; this.k1 = k1; this.b = b; this.cache = cache; this.weight = boost * idf.getValue().floatValue(); } @Override public float score(float freq, long encodedNorm) { // In order to guarantee monotonicity with both freq and norm without // promoting to doubles, we rewrite freq / (freq + norm) to // 1 - 1 / (1 + freq * 1/norm). // freq * 1/norm is guaranteed to be monotonic for both freq and norm due // to the fact that multiplication and division round to the nearest // float. And then monotonicity is preserved through composition via // x -> 1 + x and x -> 1 - 1/x. // Finally we expand weight * (1 - 1 / (1 + freq * 1/norm)) to // weight - weight / (1 + freq * 1/norm), which runs slightly faster. float normInverse = cache[((byte) encodedNorm) & 0xFF]; return weight - weight / (1f + freq * normInverse); } @Override public Explanation explain(Explanation freq, long encodedNorm) { List<Explanation> subs = new ArrayList<>(explainConstantFactors()); Explanation tfExpl = explainTF(freq, encodedNorm); subs.add(tfExpl); float normInverse = cache[((byte) encodedNorm) & 0xFF]; // not using "product of" since the rewrite that we do in score() // introduces a small rounding error that CheckHits complains about return Explanation.match(weight - weight / (1f + freq.getValue().floatValue() * normInverse), "score(freq="+freq.getValue()+"), computed as boost * idf * tf from:", subs); } private Explanation explainTF(Explanation freq, long norm) { List<Explanation> subs = new ArrayList<>(); subs.add(freq); subs.add(Explanation.match(k1, "k1, term saturation parameter")); float doclen = LENGTH_TABLE[((byte) norm) & 0xff]; subs.add(Explanation.match(b, "b, length normalization parameter")); if ((norm & 0xFF) > 39) { subs.add(Explanation.match(doclen, "dl, length of field (approximate)")); } else { subs.add(Explanation.match(doclen, "dl, length of field")); } subs.add(Explanation.match(avgdl, "avgdl, average length of field")); float normInverse = 1f / (k1 * ((1 - b) + b * doclen / avgdl)); return Explanation.match( 1f - 1f / (1 + freq.getValue().floatValue() * normInverse), "tf, computed as freq / (freq + k1 * (1 - b + b * dl / avgdl)) from:", subs); } private List<Explanation> explainConstantFactors() { List<Explanation> subs = new ArrayList<>(); // query boost if (boost != 1.0f) { subs.add(Explanation.match(boost, "boost")); } // idf subs.add(idf); return subs; } } @Override public String toString() { return "BM25(k1=" + k1 + ",b=" + b + ")"; }
Returns the k1 parameter
See Also:
  • BM25Similarity(float, float)
/** * Returns the <code>k1</code> parameter * @see #BM25Similarity(float, float) */
public final float getK1() { return k1; }
Returns the b parameter
See Also:
  • BM25Similarity(float, float)
/** * Returns the <code>b</code> parameter * @see #BM25Similarity(float, float) */
public final float getB() { return b; } }