/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.search;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.lucene.index.Impact;
import org.apache.lucene.index.Impacts;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.ImpactsSource;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.similarities.Similarity.SimScorer;
import org.apache.lucene.util.PriorityQueue;

final class ExactPhraseMatcher extends PhraseMatcher {

  private static class PostingsAndPosition {
    private final PostingsEnum postings;
    private final int offset;
    private int freq, upTo, pos;

    public PostingsAndPosition(PostingsEnum postings, int offset) {
      this.postings = postings;
      this.offset = offset;
    }
  }

  private final PostingsAndPosition[] postings;
  private final DocIdSetIterator approximation;
  private final ImpactsDISI impactsApproximation;

  ExactPhraseMatcher(PhraseQuery.PostingsAndFreq[] postings, ScoreMode scoreMode, SimScorer scorer, float matchCost) {
    super(matchCost);

    final DocIdSetIterator approximation = ConjunctionDISI.intersectIterators(Arrays.stream(postings).map(p -> p.postings).collect(Collectors.toList()));
    final ImpactsSource impactsSource = mergeImpacts(Arrays.stream(postings).map(p -> p.impacts).toArray(ImpactsEnum[]::new));

    if (scoreMode == ScoreMode.TOP_SCORES) {
      this.approximation = this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
    } else {
      this.approximation = approximation;
      this.impactsApproximation = new ImpactsDISI(approximation, impactsSource, scorer);
    }

    List<PostingsAndPosition> postingsAndPositions = new ArrayList<>();
    for(PhraseQuery.PostingsAndFreq posting : postings) {
      postingsAndPositions.add(new PostingsAndPosition(posting.postings, posting.position));
    }
    this.postings = postingsAndPositions.toArray(new PostingsAndPosition[postingsAndPositions.size()]);
  }

  @Override
  DocIdSetIterator approximation() {
    return approximation;
  }

  @Override
  ImpactsDISI impactsApproximation() {
    return impactsApproximation;
  }

  @Override
  float maxFreq() {
    int minFreq = postings[0].freq;
    for (int i = 1; i < postings.length; i++) {
      minFreq = Math.min(minFreq, postings[i].freq);
    }
    return minFreq;
  }

  
Advance the given pos enum to the first doc on or after target. Return false if the enum was exhausted before reaching target and true otherwise.
/** Advance the given pos enum to the first doc on or after {@code target}. * Return {@code false} if the enum was exhausted before reaching * {@code target} and {@code true} otherwise. */
private static boolean advancePosition(PostingsAndPosition posting, int target) throws IOException { while (posting.pos < target) { if (posting.upTo == posting.freq) { return false; } else { posting.pos = posting.postings.nextPosition(); posting.upTo += 1; } } return true; } @Override public void reset() throws IOException { for (PostingsAndPosition posting : postings) { posting.freq = posting.postings.freq(); posting.pos = -1; posting.upTo = 0; } } @Override public boolean nextMatch() throws IOException { final PostingsAndPosition lead = postings[0]; if (lead.upTo < lead.freq) { lead.pos = lead.postings.nextPosition(); lead.upTo += 1; } else { return false; } advanceHead: while (true) { final int phrasePos = lead.pos - lead.offset; for (int j = 1; j < postings.length; ++j) { final PostingsAndPosition posting = postings[j]; final int expectedPos = phrasePos + posting.offset; // advance up to the same position as the lead if (advancePosition(posting, expectedPos) == false) { break advanceHead; } if (posting.pos != expectedPos) { // we advanced too far if (advancePosition(lead, posting.pos - posting.offset + lead.offset)) { continue advanceHead; } else { break advanceHead; } } } return true; } return false; } @Override float sloppyWeight() { return 1; } @Override public int startPosition() { return postings[0].pos; } @Override public int endPosition() { return postings[postings.length - 1].pos; } @Override public int startOffset() throws IOException { return postings[0].postings.startOffset(); } @Override public int endOffset() throws IOException { return postings[postings.length - 1].postings.endOffset(); }
Merge impacts for multiple terms of an exact phrase.
/** * Merge impacts for multiple terms of an exact phrase. */
static ImpactsSource mergeImpacts(ImpactsEnum[] impactsEnums) { // Iteration of block boundaries uses the impacts enum with the lower cost. // This is consistent with BlockMaxConjunctionScorer. int tmpLeadIndex = -1; for (int i = 0; i < impactsEnums.length; ++i) { if (tmpLeadIndex == -1 || impactsEnums[i].cost() < impactsEnums[tmpLeadIndex].cost()) { tmpLeadIndex = i; } } final int leadIndex = tmpLeadIndex; return new ImpactsSource() { class SubIterator { final Iterator<Impact> iterator; Impact current; SubIterator(List<Impact> impacts) { this.iterator = impacts.iterator(); this.current = iterator.next(); } boolean next() { if (iterator.hasNext() == false) { current = null; return false; } else { current = iterator.next(); return true; } } } @Override public Impacts getImpacts() throws IOException { final Impacts[] impacts = new Impacts[impactsEnums.length]; for (int i = 0; i < impactsEnums.length; ++i) { impacts[i] = impactsEnums[i].getImpacts(); } final Impacts lead = impacts[leadIndex]; return new Impacts() { @Override public int numLevels() { // Delegate to the lead return lead.numLevels(); } @Override public int getDocIdUpTo(int level) { // Delegate to the lead return lead.getDocIdUpTo(level); }
Return the minimum level whose impacts are valid up to docIdUpTo, or -1 if there is no such level.
/** * Return the minimum level whose impacts are valid up to {@code docIdUpTo}, * or {@code -1} if there is no such level. */
private int getLevel(Impacts impacts, int docIdUpTo) { for (int level = 0, numLevels = impacts.numLevels(); level < numLevels; ++level) { if (impacts.getDocIdUpTo(level) >= docIdUpTo) { return level; } } return -1; } @Override public List<Impact> getImpacts(int level) { final int docIdUpTo = getDocIdUpTo(level); PriorityQueue<SubIterator> pq = new PriorityQueue<SubIterator>(impacts.length) { @Override protected boolean lessThan(SubIterator a, SubIterator b) { return a.current.freq < b.current.freq; } }; boolean hasImpacts = false; List<Impact> onlyImpactList = null; for (int i = 0; i < impacts.length; ++i) { int impactsLevel = getLevel(impacts[i], docIdUpTo); if (impactsLevel == -1) { // This instance doesn't have useful impacts, ignore it: this is safe. continue; } List<Impact> impactList = impacts[i].getImpacts(impactsLevel); Impact firstImpact = impactList.get(0); if (firstImpact.freq == Integer.MAX_VALUE && firstImpact.norm == 1L) { // Dummy impacts, ignore it too. continue; } SubIterator subIterator = new SubIterator(impactList); pq.add(subIterator); if (hasImpacts == false) { hasImpacts = true; onlyImpactList = impactList; } else { onlyImpactList = null; // there are multiple impacts } } if (hasImpacts == false) { return Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); } else if (onlyImpactList != null) { return onlyImpactList; } // Idea: merge impacts by freq. The tricky thing is that we need to // consider freq values that are not in the impacts too. For // instance if the list of impacts is [{freq=2,norm=10}, {freq=4,norm=12}], // there might well be a document that has a freq of 2 and a length of 11, // which was just not added to the list of impacts because {freq=2,norm=10} // is more competitive. // We walk impacts in parallel through a PQ ordered by freq. At any time, // the competitive impact consists of the lowest freq among all entries of // the PQ (the top) and the highest norm (tracked separately). List<Impact> mergedImpacts = new ArrayList<>(); SubIterator top = pq.top(); int currentFreq = top.current.freq; long currentNorm = 0; for (SubIterator it : pq) { if (Long.compareUnsigned(it.current.norm, currentNorm) > 0) { currentNorm = it.current.norm; } } outer: while (true) { if (mergedImpacts.size() > 0 && mergedImpacts.get(mergedImpacts.size() - 1).norm == currentNorm) { mergedImpacts.get(mergedImpacts.size() - 1).freq = currentFreq; } else { mergedImpacts.add(new Impact(currentFreq, currentNorm)); } do { if (top.next() == false) { // At least one clause doesn't have any more documents below the current norm, // so we can safely ignore further clauses. The only reason why they have more // impacts is because they cover more documents that we are not interested in. break outer; } if (Long.compareUnsigned(top.current.norm, currentNorm) > 0) { currentNorm = top.current.norm; } top = pq.updateTop(); } while (top.current.freq == currentFreq); currentFreq = top.current.freq; } return mergedImpacts; } }; } @Override public void advanceShallow(int target) throws IOException { for (ImpactsEnum impactsEnum : impactsEnums) { impactsEnum.advanceShallow(target); } } }; } }