org.apache.lucene/lucene-sandbox/8.7.0 : org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java

FuzzyLikeThisQuery
https://lucene.apache.org/lucene-parent/lucene-sandbox: Lucene Sandbox (The Apache Software Foundation)
Apache License, Version 2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.sandbox.queries;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Objects;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.FuzzyTermsEnum;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.automaton.LevenshteinAutomata;

Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
of fuzzy scoring factors.
This generally produces good results for queries where users may provide details in a number of 
fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
a fast query.
For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer 
terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query 
term) and this is factored into the variant's boost. If the source query term does not exist in the
index the average IDF of the variants is used.
/**
 * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
 * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
 * of fuzzy scoring factors.
 * This generally produces good results for queries where users may provide details in a number of 
 * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
 * a fast query.
 * 
 * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
 * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
 * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer 
 * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query 
 * term) and this is factored into the variant's boost. If the source query term does not exist in the
 * index the average IDF of the variants is used.
 */
public class FuzzyLikeThisQuery extends Query
{
  // TODO: generalize this query (at least it should not reuse this static sim!
  // a better way might be to convert this into multitermquery rewrite methods.
  // the rewrite method can 'average' the TermStates's term statistics (docfreq,totalTermFreq)
  // provided to TermQuery, so that the general idea is agnostic to any scoring system...
  static TFIDFSimilarity sim=new ClassicSimilarity();
  ArrayList<FieldVals> fieldVals=new ArrayList<>();
  Analyzer analyzer;

  int MAX_VARIANTS_PER_TERM=50;
  boolean ignoreTF=false;
  private int maxNumTerms;

  @Override
  public int hashCode() {
    int prime = 31;
    int result = classHash();
    result = prime * result + Objects.hashCode(analyzer);
    result = prime * result + Objects.hashCode(fieldVals);
    result = prime * result + (ignoreTF ? 1231 : 1237);
    result = prime * result + maxNumTerms;
    return result;
  }

  @Override
  public boolean equals(Object other) {
    return sameClassAs(other) &&
      equalsTo(getClass().cast(other));
  }

  private boolean equalsTo(FuzzyLikeThisQuery other) {
    return Objects.equals(analyzer, other.analyzer) &&
      Objects.equals(fieldVals, other.fieldVals) &&
      ignoreTF == other.ignoreTF &&
      maxNumTerms == other.maxNumTerms;
  }

  Params: maxNumTerms – The total number of terms clauses that will appear once rewritten as a BooleanQuery/**
   * 
   * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
   */
  public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
  {
    this.analyzer=analyzer;
    this.maxNumTerms = maxNumTerms;
  }

  static class FieldVals
  {
    String queryString;
    String fieldName;
    int maxEdits;
    int prefixLength;
    public FieldVals(String name, int maxEdits, int length, String queryString)
    {
      fieldName = name;
      this.maxEdits = maxEdits;
      prefixLength = length;
      this.queryString = queryString;
    }

    @Override
    public int hashCode() {
      final int prime = 31;
      int result = 1;
      result = prime * result
        + ((fieldName == null) ? 0 : fieldName.hashCode());
      result = prime * result + maxEdits;
      result = prime * result + prefixLength;
      result = prime * result
        + ((queryString == null) ? 0 : queryString.hashCode());
      return result;
    }

    @Override
    public boolean equals(Object obj) {
      if (this == obj)
        return true;
      if (obj == null)
        return false;
      if (getClass() != obj.getClass())
        return false;
      FieldVals other = (FieldVals) obj;
      if (fieldName == null) {
        if (other.fieldName != null)
          return false;
      } else if (!fieldName.equals(other.fieldName))
        return false;
      if (maxEdits != other.maxEdits) {
        return false;
      }
      if (prefixLength != other.prefixLength)
        return false;
      if (queryString == null) {
        if (other.queryString != null)
          return false;
      } else if (!queryString.equals(other.queryString))
        return false;
      return true;
    }
    


  }
    
  Adds user input for "fuzzification" 
Params: queryString – The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
minSimilarity – The minimum similarity of the term variants; must be 0, 1 or 2 (see FuzzyTermsEnum)
prefixLength – Length of required common prefix on variant terms (see FuzzyTermsEnum)/**
   * Adds user input for "fuzzification" 
   * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
   * @param minSimilarity The minimum similarity of the term variants; must be 0, 1 or 2 (see FuzzyTermsEnum)
   * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermsEnum)
   */
  public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength) 
  {
    int maxEdits = (int) minSimilarity;
    if (maxEdits != minSimilarity || maxEdits < 0 || maxEdits > LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE) {
      throw new IllegalArgumentException("minSimilarity must integer value between 0 and " + LevenshteinAutomata.MAXIMUM_SUPPORTED_DISTANCE + ", inclusive; got " + minSimilarity);
    }
    fieldVals.add(new FieldVals(fieldName,maxEdits,prefixLength,queryString));
  }


  private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
    if (f.queryString == null) return;
    final Terms terms = MultiTerms.getTerms(reader, f.fieldName);
    if (terms == null) {
      return;
    }
    try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

      int corpusNumDocs = reader.numDocs();
      HashSet<String> processedTerms = new HashSet<>();
      ts.reset();
      while (ts.incrementToken()) {
        String term = termAtt.toString();
        if (!processedTerms.contains(term)) {
          processedTerms.add(term);
          ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
          float minScore = 0;
          Term startTerm = new Term(f.fieldName, term);
          FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, startTerm, f.maxEdits, f.prefixLength, true);
          //store the df so all variants use same idf
          int df = reader.docFreq(startTerm);
          int numVariants = 0;
          int totalVariantDocFreqs = 0;
          BytesRef possibleMatch;
          BoostAttribute boostAtt =
            fe.attributes().addAttribute(BoostAttribute.class);
          while ((possibleMatch = fe.next()) != null) {
            numVariants++;
            totalVariantDocFreqs += fe.docFreq();
            float score = boostAtt.getBoost();
            if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
              ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
              variantsQ.insertWithOverflow(st);
              minScore = variantsQ.top().score; // maintain minScore
            }
            fe.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
          }

          if (numVariants > 0) {
            int avgDf = totalVariantDocFreqs / numVariants;
            if (df == 0)//no direct match we can use as df for all variants
              {
                df = avgDf; //use avg df of all variants
              }

            // take the top variants (scored by edit distance) and reset the score
            // to include an IDF factor then add to the global queue for ranking
            // overall top query terms
            int size = variantsQ.size();
            for (int i = 0; i < size; i++) {
              ScoreTerm st = variantsQ.pop();
              st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
              q.insertWithOverflow(st);
            }
          }
        }
      }
      ts.end();
    }
  }

  private Query newTermQuery(IndexReader reader, Term term) throws IOException {
    if (ignoreTF) {
      return new ConstantScoreQuery(new TermQuery(term));
    } else {
      // we build an artificial TermStates that will give an overall df and ttf
      // equal to 1
      TermStates context = new TermStates(reader.getContext());
      for (LeafReaderContext leafContext : reader.leaves()) {
        Terms terms = leafContext.reader().terms(term.field());
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator();
          if (termsEnum.seekExact(term.bytes())) {
            int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1
            context.register(termsEnum.termState(), leafContext.ord, freq, freq);
          }
        }
      }
      return new TermQuery(term, context);
    }
  }

  @Override
  public void visit(QueryVisitor visitor) {
    visitor.visitLeaf(this);
  }

  @Override
  public Query rewrite(IndexReader reader) throws IOException
  {
    ScoreTermQueue q = new ScoreTermQueue(maxNumTerms);
    //load up the list of possible terms
    for (FieldVals f : fieldVals) {
      addTerms(reader, f, q);
    }
        
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
        
    //create BooleanQueries to hold the variants for each token/field pair and ensure it
    // has no coord factor
    //Step 1: sort the termqueries by term/field
    HashMap<Term,ArrayList<ScoreTerm>> variantQueries=new HashMap<>();
    int size = q.size();
    for(int i = 0; i < size; i++)
      {
        ScoreTerm st = q.pop();
        ArrayList<ScoreTerm> l= variantQueries.get(st.fuzziedSourceTerm);
        if(l==null)
          {
            l=new ArrayList<>();
            variantQueries.put(st.fuzziedSourceTerm,l);
          }
        l.add(st);
      }
    //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
    for (Iterator<ArrayList<ScoreTerm>> iter = variantQueries.values().iterator(); iter.hasNext();)
      {
        ArrayList<ScoreTerm> variants = iter.next();
        if(variants.size()==1)
          {
            //optimize where only one selected variant
            ScoreTerm st= variants.get(0);
            Query tq = newTermQuery(reader, st.term);
            // set the boost to a mix of IDF and score
            bq.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD); 
          }
        else
          {
            BooleanQuery.Builder termVariants=new BooleanQuery.Builder();
            for (Iterator<ScoreTerm> iterator2 = variants.iterator(); iterator2
                   .hasNext();)
              {
                ScoreTerm st = iterator2.next();
                // found a match
                Query tq = newTermQuery(reader, st.term);
                // set the boost using the ScoreTerm's score
                termVariants.add(new BoostQuery(tq, st.score), BooleanClause.Occur.SHOULD);          // add to query                    
              }
            bq.add(termVariants.build(), BooleanClause.Occur.SHOULD);          // add to query
          }
      }
    //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
    // booleans with a minimum-should-match of NumFields-1?
    return bq.build();
  }

  //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
  // term variants) then is reset with IDF for use in ranking against all other
  // terms/fields
  private static class ScoreTerm{
    public Term term;
    public float score;
    Term fuzziedSourceTerm;
        
    public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
      this.term = term;
      this.score = score;
      this.fuzziedSourceTerm=fuzziedSourceTerm;
    }
  }
      
  private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {        
    public ScoreTermQueue(int size){
      super(size);
    }
        
    /* (non-Javadoc)
     * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
     */
    @Override
    protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
      if (termA.score== termB.score)
        return termA.term.compareTo(termB.term) > 0;
      else
        return termA.score < termB.score;
    }
        
  }    
      
  /* (non-Javadoc)
   * @see org.apache.lucene.search.Query#toString(java.lang.String)
   */
  @Override
  public String toString(String field)
  {
    return null;
  }


  public boolean isIgnoreTF()
  {
    return ignoreTF;
  }


  public void setIgnoreTF(boolean ignoreTF)
  {
    this.ignoreTF = ignoreTF;
  }
    
}
/

org.apache.lucene/ lucene-sandbox/ 8.7.0/ org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java