/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsReader;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowImpactsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
/** A Query that matches documents containing a particular sequence of terms.
 * A PhraseQuery is built by QueryParser for input like <code>"new york"</code>.
 *
 * <p>This query may be combined with other terms or queries with a {@link BooleanQuery}.
 *
 * <p><b>NOTE</b>:
 * All terms in the phrase must match, even those at the same position. If you
 * have terms at the same position, perhaps synonyms, you probably want {@link MultiPhraseQuery}
 * instead which only requires one term at a position to match.
 * <br>Also, leading holes don't have any particular meaning for this query
 * and will be ignored. For instance this query:
 * <pre class="prettyprint">
 * PhraseQuery.Builder builder = new PhraseQuery.Builder();
 * builder.add(new Term("body", "one"), 4);
 * builder.add(new Term("body", "two"), 5);
 * PhraseQuery pq = builder.build();
 * </pre>
 * is equivalent to the below query:
 * <pre class="prettyprint">
 * PhraseQuery.Builder builder = new PhraseQuery.Builder();
 * builder.add(new Term("body", "one"), 0);
 * builder.add(new Term("body", "two"), 1);
 * PhraseQuery pq = builder.build();
 * </pre>
 */
public class PhraseQuery extends Query {
A builder for phrase queries. /** A builder for phrase queries. */
public static class Builder {
private int slop;
private final List<Term> terms;
private final List<Integer> positions;
Sole constructor. /** Sole constructor. */
public Builder() {
slop = 0;
terms = new ArrayList<>();
positions = new ArrayList<>();
}
Set the slop.
See Also: - getSlop.getSlop()
/**
* Set the slop.
* @see PhraseQuery#getSlop()
*/
public Builder setSlop(int slop) {
this.slop = slop;
return this;
}
Adds a term to the end of the query phrase.
The relative position of the term is the one immediately after the last term added.
/**
* Adds a term to the end of the query phrase.
* The relative position of the term is the one immediately after the last term added.
*/
public Builder add(Term term) {
return add(term, positions.isEmpty() ? 0 : 1 + positions.get(positions.size() - 1));
}
Adds a term to the end of the query phrase. The relative position of the term within the phrase is specified explicitly, but must be greater than or equal to that of the previously added term. A greater position allows phrases with gaps (e.g. in connection with stopwords). If the position is equal, you most likely should be using MultiPhraseQuery
instead which only requires one term at each position to match; this class requires all of them. /**
* Adds a term to the end of the query phrase.
* The relative position of the term within the phrase is specified explicitly, but must be greater than
* or equal to that of the previously added term.
* A greater position allows phrases with gaps (e.g. in connection with stopwords).
* If the position is equal, you most likely should be using
* {@link MultiPhraseQuery} instead which only requires one term at each position to match; this class requires
* all of them.
*/
public Builder add(Term term, int position) {
Objects.requireNonNull(term, "Cannot add a null term to PhraseQuery");
if (position < 0) {
throw new IllegalArgumentException("Positions must be >= 0, got " + position);
}
if (positions.isEmpty() == false) {
final int lastPosition = positions.get(positions.size() - 1);
if (position < lastPosition) {
throw new IllegalArgumentException("Positions must be added in order, got " + position + " after " + lastPosition);
}
}
if (terms.isEmpty() == false && term.field().equals(terms.get(0).field()) == false) {
throw new IllegalArgumentException("All terms must be on the same field, got " + term.field() + " and " + terms.get(0).field());
}
terms.add(term);
positions.add(position);
return this;
}
Build a phrase query based on the terms that have been added.
/**
* Build a phrase query based on the terms that have been added.
*/
public PhraseQuery build() {
Term[] terms = this.terms.toArray(new Term[this.terms.size()]);
int[] positions = new int[this.positions.size()];
for (int i = 0; i < positions.length; ++i) {
positions[i] = this.positions.get(i);
}
return new PhraseQuery(slop, terms, positions);
}
}
private final int slop;
private final String field;
private final Term[] terms;
private final int[] positions;
private PhraseQuery(int slop, Term[] terms, int[] positions) {
if (terms.length != positions.length) {
throw new IllegalArgumentException("Must have as many terms as positions");
}
if (slop < 0) {
throw new IllegalArgumentException("Slop must be >= 0, got " + slop);
}
for (Term term : terms) {
Objects.requireNonNull(term, "Cannot add a null term to PhraseQuery");
}
for (int i = 1; i < terms.length; ++i) {
if (terms[i-1].field().equals(terms[i].field()) == false) {
throw new IllegalArgumentException("All terms should have the same field");
}
}
for (int position : positions) {
if (position < 0) {
throw new IllegalArgumentException("Positions must be >= 0, got " + position);
}
}
for (int i = 1; i < positions.length; ++i) {
if (positions[i] < positions[i - 1]) {
throw new IllegalArgumentException("Positions should not go backwards, got "
+ positions[i-1] + " before " + positions[i]);
}
}
this.slop = slop;
this.terms = terms;
this.positions = positions;
this.field = terms.length == 0 ? null : terms[0].field();
}
private static int[] incrementalPositions(int length) {
int[] positions = new int[length];
for (int i = 0; i < length; ++i) {
positions[i] = i;
}
return positions;
}
private static Term[] toTerms(String field, String... termStrings) {
Term[] terms = new Term[termStrings.length];
for (int i = 0; i < terms.length; ++i) {
Objects.requireNonNull(termStrings[i], "Cannot add a null term to PhraseQuery");
terms[i] = new Term(field, termStrings[i]);
}
return terms;
}
private static Term[] toTerms(String field, BytesRef... termBytes) {
Term[] terms = new Term[termBytes.length];
for (int i = 0; i < terms.length; ++i) {
Objects.requireNonNull(termBytes[i], "Cannot add a null term to PhraseQuery");
terms[i] = new Term(field, termBytes[i]);
}
return terms;
}
Create a phrase query which will match documents that contain the given list of terms at consecutive positions in field
, and at a maximum edit distance of slop
. For more complicated use-cases, use Builder
. See Also:
/**
* Create a phrase query which will match documents that contain the given
* list of terms at consecutive positions in {@code field}, and at a
* maximum edit distance of {@code slop}. For more complicated use-cases,
* use {@link PhraseQuery.Builder}.
* @see #getSlop()
*/
public PhraseQuery(int slop, String field, String... terms) {
this(slop, toTerms(field, terms), incrementalPositions(terms.length));
}
Create a phrase query which will match documents that contain the given list of terms at consecutive positions in field
. /**
* Create a phrase query which will match documents that contain the given
* list of terms at consecutive positions in {@code field}.
*/
public PhraseQuery(String field, String... terms) {
this(0, field, terms);
}
Create a phrase query which will match documents that contain the given list of terms at consecutive positions in field
, and at a maximum edit distance of slop
. For more complicated use-cases, use Builder
. See Also:
/**
* Create a phrase query which will match documents that contain the given
* list of terms at consecutive positions in {@code field}, and at a
* maximum edit distance of {@code slop}. For more complicated use-cases,
* use {@link PhraseQuery.Builder}.
* @see #getSlop()
*/
public PhraseQuery(int slop, String field, BytesRef... terms) {
this(slop, toTerms(field, terms), incrementalPositions(terms.length));
}
Create a phrase query which will match documents that contain the given list of terms at consecutive positions in field
. /**
* Create a phrase query which will match documents that contain the given
* list of terms at consecutive positions in {@code field}.
*/
public PhraseQuery(String field, BytesRef... terms) {
this(0, field, terms);
}
Return the slop for this PhraseQuery
. The slop is an edit distance between respective positions of terms as defined in this PhraseQuery
and the positions of terms in a document.
For instance, when searching for "quick fox"
, it is expected that the difference between the positions of fox
and quick
is 1. So "a quick brown fox"
would be at an edit distance of 1 since the difference of the positions of fox
and quick
is 2. Similarly, "the fox is quick"
would be at an edit distance of 3 since the difference of the positions of fox
and quick
is -2. The slop defines the maximum edit distance for a document to match.
More exact matches are scored higher than sloppier matches, thus search
results are sorted by exactness.
/**
* Return the slop for this {@link PhraseQuery}.
*
* <p>The slop is an edit distance between respective positions of terms as
* defined in this {@link PhraseQuery} and the positions of terms in a
* document.
*
* <p>For instance, when searching for {@code "quick fox"}, it is expected that
* the difference between the positions of {@code fox} and {@code quick} is 1.
* So {@code "a quick brown fox"} would be at an edit distance of 1 since the
* difference of the positions of {@code fox} and {@code quick} is 2.
* Similarly, {@code "the fox is quick"} would be at an edit distance of 3
* since the difference of the positions of {@code fox} and {@code quick} is -2.
* The slop defines the maximum edit distance for a document to match.
*
* <p>More exact matches are scored higher than sloppier matches, thus search
* results are sorted by exactness.
*/
public int getSlop() { return slop; }
Returns the field this query applies to /** Returns the field this query applies to */
public String getField() { return field; }
Returns the list of terms in this phrase. /** Returns the list of terms in this phrase. */
public Term[] getTerms() {
return terms;
}
Returns the relative positions of terms in this phrase.
/**
* Returns the relative positions of terms in this phrase.
*/
public int[] getPositions() {
return positions;
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
if (terms.length == 0) {
return new MatchNoDocsQuery("empty PhraseQuery");
} else if (terms.length == 1) {
return new TermQuery(terms[0]);
} else if (positions[0] != 0) {
int[] newPositions = new int[positions.length];
for (int i = 0; i < positions.length; ++i) {
newPositions[i] = positions[i] - positions[0];
}
return new PhraseQuery(slop, terms, newPositions);
} else {
return super.rewrite(reader);
}
}
@Override
public void visit(QueryVisitor visitor) {
if (visitor.acceptField(field) == false) {
return;
}
QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.MUST, this);
v.consumeTerms(this, terms);
}
static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
final PostingsEnum postings;
final ImpactsEnum impacts;
final int position;
final Term[] terms;
final int nTerms; // for faster comparisons
public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, Term... terms) {
this.postings = postings;
this.impacts = impacts;
this.position = position;
nTerms = terms==null ? 0 : terms.length;
if (nTerms>0) {
if (terms.length==1) {
this.terms = terms;
} else {
Term[] terms2 = new Term[terms.length];
System.arraycopy(terms, 0, terms2, 0, terms.length);
Arrays.sort(terms2);
this.terms = terms2;
}
} else {
this.terms = null;
}
}
public PostingsAndFreq(PostingsEnum postings, ImpactsEnum impacts, int position, List<Term> terms) {
this.postings = postings;
this.impacts = impacts;
this.position = position;
nTerms = terms == null ? 0 : terms.size();
if (nTerms > 0) {
Term[] terms2 = terms.toArray(new Term[0]);
if (nTerms > 1) {
Arrays.sort(terms2);
}
this.terms = terms2;
} else {
this.terms = null;
}
}
@Override
public int compareTo(PostingsAndFreq other) {
if (position != other.position) {
return position - other.position;
}
if (nTerms != other.nTerms) {
return nTerms - other.nTerms;
}
if (nTerms == 0) {
return 0;
}
for (int i=0; i<terms.length; i++) {
int res = terms[i].compareTo(other.terms[i]);
if (res!=0) return res;
}
return 0;
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + position;
for (int i=0; i<nTerms; i++) {
result = prime * result + terms[i].hashCode();
}
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) return true;
if (obj == null) return false;
if (getClass() != obj.getClass()) return false;
PostingsAndFreq other = (PostingsAndFreq) obj;
if (position != other.position) return false;
if (terms == null) return other.terms == null;
return Arrays.equals(terms, other.terms);
}
}
A guess of the average number of simple operations for the initial seek and buffer refill per document for the positions of a term. See also BlockImpactsPostingsEnum.nextPosition()
. Aside: Instead of being constant this could depend among others on Lucene84PostingsFormat.BLOCK_SIZE
, TermsEnum.docFreq()
, TermsEnum.totalTermFreq()
, DocIdSetIterator.cost()
(expected number of matching docs), IndexReader.maxDoc()
(total number of docs in the segment), and the seek time and block size of the device storing the index.
/** A guess of
* the average number of simple operations for the initial seek and buffer refill
* per document for the positions of a term.
* See also {@link Lucene84PostingsReader.BlockImpactsPostingsEnum#nextPosition()}.
* <p>
* Aside: Instead of being constant this could depend among others on
* {@link Lucene84PostingsFormat#BLOCK_SIZE},
* {@link TermsEnum#docFreq()},
* {@link TermsEnum#totalTermFreq()},
* {@link DocIdSetIterator#cost()} (expected number of matching docs),
* {@link LeafReader#maxDoc()} (total number of docs in the segment),
* and the seek time and block size of the device storing the index.
*/
private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128;
Number of simple operations in BlockImpactsPostingsEnum.nextPosition()
when no seek or buffer refill is done. /** Number of simple operations in {@link Lucene84PostingsReader.BlockImpactsPostingsEnum#nextPosition()}
* when no seek or buffer refill is done.
*/
private static final int TERM_OPS_PER_POS = 7;
Returns an expected cost in simple operations of processing the occurrences of a term in a document that contains the term. This is for use by TwoPhaseIterator.matchCost
implementations. @param termsEnum The term is the term at which this TermsEnum is positioned. /** Returns an expected cost in simple operations
* of processing the occurrences of a term
* in a document that contains the term.
* This is for use by {@link TwoPhaseIterator#matchCost} implementations.
* @param termsEnum The term is the term at which this TermsEnum is positioned.
*/
static float termPositionsCost(TermsEnum termsEnum) throws IOException {
int docFreq = termsEnum.docFreq();
assert docFreq > 0;
long totalTermFreq = termsEnum.totalTermFreq();
float expOccurrencesInMatchingDoc = totalTermFreq / (float) docFreq;
return TERM_POSNS_SEEK_OPS_PER_DOC + expOccurrencesInMatchingDoc * TERM_OPS_PER_POS;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
return new PhraseWeight(this, field, searcher, scoreMode) {
private transient TermStates states[];
@Override
protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException {
final int[] positions = PhraseQuery.this.getPositions();
if (positions.length < 2) {
throw new IllegalStateException("PhraseWeight does not support less than 2 terms, call rewrite first");
} else if (positions[0] != 0) {
throw new IllegalStateException("PhraseWeight requires that the first position is 0, call rewrite first");
}
final IndexReaderContext context = searcher.getTopReaderContext();
states = new TermStates[terms.length];
TermStatistics termStats[] = new TermStatistics[terms.length];
int termUpTo = 0;
for (int i = 0; i < terms.length; i++) {
final Term term = terms[i];
states[i] = TermStates.build(context, term, scoreMode.needsScores());
if (scoreMode.needsScores()) {
TermStates ts = states[i];
if (ts.docFreq() > 0) {
termStats[termUpTo++] = searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq());
}
}
}
if (termUpTo > 0) {
return similarity.scorer(boost, searcher.collectionStatistics(field), ArrayUtil.copyOfSubArray(termStats, 0, termUpTo));
} else {
return null; // no terms at all, we won't use similarity
}
}
@Override
protected PhraseMatcher getPhraseMatcher(LeafReaderContext context, SimScorer scorer, boolean exposeOffsets) throws IOException {
assert terms.length > 0;
final LeafReader reader = context.reader();
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.length];
final Terms fieldTerms = reader.terms(field);
if (fieldTerms == null) {
return null;
}
if (fieldTerms.hasPositions() == false) {
throw new IllegalStateException("field \"" + field + "\" was indexed without position data; cannot run PhraseQuery (phrase=" + getQuery() + ")");
}
// Reuse single TermsEnum below:
final TermsEnum te = fieldTerms.iterator();
float totalMatchCost = 0;
for (int i = 0; i < terms.length; i++) {
final Term t = terms[i];
final TermState state = states[i].get(context);
if (state == null) { /* term doesnt exist in this segment */
assert termNotInReader(reader, t): "no termstate found but term exists in reader";
return null;
}
te.seekExact(t.bytes(), state);
PostingsEnum postingsEnum;
ImpactsEnum impactsEnum;
if (scoreMode == ScoreMode.TOP_SCORES) {
postingsEnum = impactsEnum = te.impacts(exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
} else {
postingsEnum = te.postings(null, exposeOffsets ? PostingsEnum.OFFSETS : PostingsEnum.POSITIONS);
impactsEnum = new SlowImpactsEnum(postingsEnum);
}
postingsFreqs[i] = new PostingsAndFreq(postingsEnum, impactsEnum, positions[i], t);
totalMatchCost += termPositionsCost(te);
}
// sort by increasing docFreq order
if (slop == 0) {
ArrayUtil.timSort(postingsFreqs);
return new ExactPhraseMatcher(postingsFreqs, scoreMode, scorer, totalMatchCost);
}
else {
return new SloppyPhraseMatcher(postingsFreqs, slop, scoreMode, scorer, totalMatchCost, exposeOffsets);
}
}
@Override
public void extractTerms(Set<Term> queryTerms) {
Collections.addAll(queryTerms, terms);
}
};
}
// only called from assert
private static boolean termNotInReader(LeafReader reader, Term term) throws IOException {
return reader.docFreq(term) == 0;
}
Prints a user-readable version of this query. /** Prints a user-readable version of this query. */
@Override
public String toString(String f) {
StringBuilder buffer = new StringBuilder();
if (field != null && !field.equals(f)) {
buffer.append(field);
buffer.append(":");
}
buffer.append("\"");
final int maxPosition;
if (positions.length == 0) {
maxPosition = -1;
} else {
maxPosition = positions[positions.length - 1];
}
String[] pieces = new String[maxPosition + 1];
for (int i = 0; i < terms.length; i++) {
int pos = positions[i];
String s = pieces[pos];
if (s == null) {
s = (terms[i]).text();
} else {
s = s + "|" + (terms[i]).text();
}
pieces[pos] = s;
}
for (int i = 0; i < pieces.length; i++) {
if (i > 0) {
buffer.append(' ');
}
String s = pieces[i];
if (s == null) {
buffer.append('?');
} else {
buffer.append(s);
}
}
buffer.append("\"");
if (slop != 0) {
buffer.append("~");
buffer.append(slop);
}
return buffer.toString();
}
Returns true iff o
is equal to this. /** Returns true iff <code>o</code> is equal to this. */
@Override
public boolean equals(Object other) {
return sameClassAs(other) &&
equalsTo(getClass().cast(other));
}
private boolean equalsTo(PhraseQuery other) {
return slop == other.slop &&
Arrays.equals(terms, other.terms) &&
Arrays.equals(positions, other.positions);
}
Returns a hash code value for this object./** Returns a hash code value for this object.*/
@Override
public int hashCode() {
int h = classHash();
h = 31 * h + slop;
h = 31 * h + Arrays.hashCode(terms);
h = 31 * h + Arrays.hashCode(positions);
return h;
}
}