org.apache.lucene/lucene-core/8.2.0 : org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java

IntersectTermsEnum
http://lucene.apache.org/lucene-parent/lucene-core: Apache Lucene Java Core (The Apache Software Foundation)
Apache 2
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.blocktree;


import java.io.IOException;

import org.apache.lucene.index.BaseTermsEnum;
import org.apache.lucene.index.ImpactsEnum;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.RunAutomaton;
import org.apache.lucene.util.automaton.Transition;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;

This is used to implement efficient Terms.intersect for block-tree. Note that it cannot seek, except for the initial term on init. It just "nexts" through the intersection of the automaton and the terms. It does not use the terms index at all: on init, it loads the root block, and scans its way to the initial term. Likewise, in next it scans until it finds a term that matches the current automaton transition. /** This is used to implement efficient {@link Terms#intersect} for
 *  block-tree.  Note that it cannot seek, except for the initial term on
 *  init.  It just "nexts" through the intersection of the automaton and
 *  the terms.  It does not use the terms index at all: on init, it
 *  loads the root block, and scans its way to the initial term.
 *  Likewise, in next it scans until it finds a term that matches the
 *  current automaton transition. */

final class IntersectTermsEnum extends BaseTermsEnum {

  //static boolean DEBUG = BlockTreeTermsWriter.DEBUG;

  final IndexInput in;
  final static Outputs<BytesRef> fstOutputs = ByteSequenceOutputs.getSingleton();

  IntersectTermsEnumFrame[] stack;
      
  @SuppressWarnings({"rawtypes","unchecked"}) private FST.Arc<BytesRef>[] arcs = new FST.Arc[5];

  final RunAutomaton runAutomaton;
  final Automaton automaton;
  final BytesRef commonSuffix;

  private IntersectTermsEnumFrame currentFrame;
  private Transition currentTransition;

  private final BytesRef term = new BytesRef();

  private final FST.BytesReader fstReader;

  final FieldReader fr;

  private BytesRef savedStartTerm;

  // TODO: in some cases we can filter by length?  eg
  // regexp foo*bar must be at least length 6 bytes
  public IntersectTermsEnum(FieldReader fr, Automaton automaton, RunAutomaton runAutomaton, BytesRef commonSuffix, BytesRef startTerm) throws IOException {
    this.fr = fr;

    assert automaton != null;
    assert runAutomaton != null;

    this.runAutomaton = runAutomaton;
    this.automaton = automaton;
    this.commonSuffix = commonSuffix;

    in = fr.parent.termsIn.clone();
    stack = new IntersectTermsEnumFrame[5];
    for(int idx=0;idx<stack.length;idx++) {
      stack[idx] = new IntersectTermsEnumFrame(this, idx);
    }
    for(int arcIdx=0;arcIdx<arcs.length;arcIdx++) {
      arcs[arcIdx] = new FST.Arc<>();
    }

    
    fstReader = fr.index.getBytesReader();

    // TODO: if the automaton is "smallish" we really
    // should use the terms index to seek at least to
    // the initial term and likely to subsequent terms
    // (or, maybe just fallback to ATE for such cases).
    // Else the seek cost of loading the frames will be
    // too costly.

    final FST.Arc<BytesRef> arc = fr.index.getFirstArc(arcs[0]);
    // Empty string prefix must have an output in the index!
    assert arc.isFinal();

    // Special pushFrame since it's the first one:
    final IntersectTermsEnumFrame f = stack[0];
    f.fp = f.fpOrig = fr.rootBlockFP;
    f.prefix = 0;
    f.setState(0);
    f.arc = arc;
    f.outputPrefix = arc.output;
    f.load(fr.rootCode);

    // for assert:
    assert setSavedStartTerm(startTerm);

    currentFrame = f;
    if (startTerm != null) {
      seekToStartTerm(startTerm);
    }
    currentTransition = currentFrame.transition;
  }

  // only for assert:
  private boolean setSavedStartTerm(BytesRef startTerm) {
    savedStartTerm = startTerm == null ? null : BytesRef.deepCopyOf(startTerm);
    return true;
  }

  @Override
  public TermState termState() throws IOException {
    currentFrame.decodeMetaData();
    return currentFrame.termState.clone();
  }

  private IntersectTermsEnumFrame getFrame(int ord) throws IOException {
    if (ord >= stack.length) {
      final IntersectTermsEnumFrame[] next = new IntersectTermsEnumFrame[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(stack, 0, next, 0, stack.length);
      for(int stackOrd=stack.length;stackOrd<next.length;stackOrd++) {
        next[stackOrd] = new IntersectTermsEnumFrame(this, stackOrd);
      }
      stack = next;
    }
    assert stack[ord].ord == ord;
    return stack[ord];
  }

  private FST.Arc<BytesRef> getArc(int ord) {
    if (ord >= arcs.length) {
      @SuppressWarnings({"rawtypes","unchecked"}) final FST.Arc<BytesRef>[] next =
      new FST.Arc[ArrayUtil.oversize(1+ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
      System.arraycopy(arcs, 0, next, 0, arcs.length);
      for(int arcOrd=arcs.length;arcOrd<next.length;arcOrd++) {
        next[arcOrd] = new FST.Arc<>();
      }
      arcs = next;
    }
    return arcs[ord];
  }

  private IntersectTermsEnumFrame pushFrame(int state) throws IOException {
    assert currentFrame != null;

    final IntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1+currentFrame.ord);
        
    f.fp = f.fpOrig = currentFrame.lastSubFP;
    f.prefix = currentFrame.prefix + currentFrame.suffix;
    f.setState(state);

    // Walk the arc through the index -- we only
    // "bother" with this so we can get the floor data
    // from the index and skip floor blocks when
    // possible:
    FST.Arc<BytesRef> arc = currentFrame.arc;
    int idx = currentFrame.prefix;
    assert currentFrame.suffix > 0;
    BytesRef output = currentFrame.outputPrefix;
    while (idx < f.prefix) {
      final int target = term.bytes[idx] & 0xff;
      // TODO: we could be more efficient for the next()
      // case by using current arc as starting point,
      // passed to findTargetArc
      arc = fr.index.findTargetArc(target, arc, getArc(1+idx), fstReader);
      assert arc != null;
      output = fstOutputs.add(output, arc.output);
      idx++;
    }

    f.arc = arc;
    f.outputPrefix = output;
    assert arc.isFinal();
    f.load(fstOutputs.add(output, arc.nextFinalOutput));
    return f;
  }

  @Override
  public BytesRef term() {
    return term;
  }

  @Override
  public int docFreq() throws IOException {
    currentFrame.decodeMetaData();
    return currentFrame.termState.docFreq;
  }

  @Override
  public long totalTermFreq() throws IOException {
    currentFrame.decodeMetaData();
    return currentFrame.termState.totalTermFreq;
  }

  @Override
  public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
    currentFrame.decodeMetaData();
    return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.termState, reuse, flags);
  }

  @Override
  public ImpactsEnum impacts(int flags) throws IOException {
    currentFrame.decodeMetaData();
    return fr.parent.postingsReader.impacts(fr.fieldInfo, currentFrame.termState, flags);
  }

  private int getState() {
    int state = currentFrame.state;
    for(int idx=0;idx<currentFrame.suffix;idx++) {
      state = runAutomaton.step(state,  currentFrame.suffixBytes[currentFrame.startBytePos+idx] & 0xff);
      assert state != -1;
    }
    return state;
  }

  // NOTE: specialized to only doing the first-time
  // seek, but we could generalize it to allow
  // arbitrary seekExact/Ceil.  Note that this is a
  // seekFloor!
  private void seekToStartTerm(BytesRef target) throws IOException {
    assert currentFrame.ord == 0;
    if (term.length < target.length) {
      term.bytes = ArrayUtil.grow(term.bytes, target.length);
    }
    FST.Arc<BytesRef> arc = arcs[0];
    assert arc == currentFrame.arc;

    for(int idx=0;idx<=target.length;idx++) {

      while (true) {
        final int savNextEnt = currentFrame.nextEnt;
        final int savePos = currentFrame.suffixesReader.getPosition();
        final int saveStartBytePos = currentFrame.startBytePos;
        final int saveSuffix = currentFrame.suffix;
        final long saveLastSubFP = currentFrame.lastSubFP;
        final int saveTermBlockOrd = currentFrame.termState.termBlockOrd;

        final boolean isSubBlock = currentFrame.next();

        term.length = currentFrame.prefix + currentFrame.suffix;
        if (term.bytes.length < term.length) {
          term.bytes = ArrayUtil.grow(term.bytes, term.length);
        }
        System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);

        if (isSubBlock && StringHelper.startsWith(target, term)) {
          // Recurse
          currentFrame = pushFrame(getState());
          break;
        } else {
          final int cmp = term.compareTo(target);
          if (cmp < 0) {
            if (currentFrame.nextEnt == currentFrame.entCount) {
              if (!currentFrame.isLastInFloor) {
                // Advance to next floor block
                currentFrame.loadNextFloorBlock();
                continue;
              } else {
                return;
              }
            }
            continue;
          } else if (cmp == 0) {
            return;
          } else {
            // Fallback to prior entry: the semantics of
            // this method is that the first call to
            // next() will return the term after the
            // requested term
            currentFrame.nextEnt = savNextEnt;
            currentFrame.lastSubFP = saveLastSubFP;
            currentFrame.startBytePos = saveStartBytePos;
            currentFrame.suffix = saveSuffix;
            currentFrame.suffixesReader.setPosition(savePos);
            currentFrame.termState.termBlockOrd = saveTermBlockOrd;
            System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
            term.length = currentFrame.prefix + currentFrame.suffix;
            // If the last entry was a block we don't
            // need to bother recursing and pushing to
            // the last term under it because the first
            // next() will simply skip the frame anyway
            return;
          }
        }
      }
    }

    assert false;
  }

  private boolean popPushNext() throws IOException {
    // Pop finished frames
    while (currentFrame.nextEnt == currentFrame.entCount) {
      if (!currentFrame.isLastInFloor) {
        // Advance to next floor block
        currentFrame.loadNextFloorBlock();
        break;
      } else {
        if (currentFrame.ord == 0) {
          throw NoMoreTermsException.INSTANCE;
        }
        final long lastFP = currentFrame.fpOrig;
        currentFrame = stack[currentFrame.ord-1];
        currentTransition = currentFrame.transition;
        assert currentFrame.lastSubFP == lastFP;
      }
    }

    return currentFrame.next();
  }

  // Only used internally when there are no more terms in next():
  private static final class NoMoreTermsException extends RuntimeException {

    // Only used internally when there are no more terms in next():
    public static final NoMoreTermsException INSTANCE = new NoMoreTermsException();

    private NoMoreTermsException() {
    }

    @Override
    public Throwable fillInStackTrace() {
      // Do nothing:
      return this;
    }    
  }

  @Override
  public BytesRef next() throws IOException {
    try {
      return _next();
    } catch (NoMoreTermsException eoi) {
      // Provoke NPE if we are (illegally!) called again:
      currentFrame = null;
      return null;
    }
  }

  private BytesRef _next() throws IOException {

    boolean isSubBlock = popPushNext();

    nextTerm:

    while (true) {
      assert currentFrame.transition == currentTransition;

      int state;
      int lastState;

      // NOTE: suffix == 0 can only happen on the first term in a block, when
      // there is a term exactly matching a prefix in the index.  If we
      // could somehow re-org the code so we only checked this case immediately
      // after pushing a frame...
      if (currentFrame.suffix != 0) {

        final byte[] suffixBytes = currentFrame.suffixBytes;

        // This is the first byte of the suffix of the term we are now on:
        final int label = suffixBytes[currentFrame.startBytePos] & 0xff;

        if (label < currentTransition.min) {
          // Common case: we are scanning terms in this block to "catch up" to
          // current transition in the automaton:
          int minTrans = currentTransition.min;
          while (currentFrame.nextEnt < currentFrame.entCount) {
            isSubBlock = currentFrame.next();
            if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) {
              continue nextTerm;
            }
          }

          // End of frame:
          isSubBlock = popPushNext();
          continue nextTerm;
        }

        // Advance where we are in the automaton to match this label:

        while (label > currentTransition.max) {
          if (currentFrame.transitionIndex >= currentFrame.transitionCount-1) {
            // Pop this frame: no further matches are possible because
            // we've moved beyond what the max transition will allow
            if (currentFrame.ord == 0) {
              // Provoke NPE if we are (illegally!) called again:
              currentFrame = null;
              return null;
            }
            currentFrame = stack[currentFrame.ord-1];
            currentTransition = currentFrame.transition;
            isSubBlock = popPushNext();
            continue nextTerm;
          }
          currentFrame.transitionIndex++;
          automaton.getNextTransition(currentTransition);

          if (label < currentTransition.min) {
            int minTrans = currentTransition.min;
            while (currentFrame.nextEnt < currentFrame.entCount) {
              isSubBlock = currentFrame.next();
              if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) {
                continue nextTerm;
              }
            }

            // End of frame:
            isSubBlock = popPushNext();
            continue nextTerm;
          }
        }

        if (commonSuffix != null && !isSubBlock) {
          final int termLen = currentFrame.prefix + currentFrame.suffix;
          if (termLen < commonSuffix.length) {
            // No match
            isSubBlock = popPushNext();
            continue nextTerm;
          }

          final byte[] commonSuffixBytes = commonSuffix.bytes;

          final int lenInPrefix = commonSuffix.length - currentFrame.suffix;
          assert commonSuffix.offset == 0;
          int suffixBytesPos;
          int commonSuffixBytesPos = 0;

          if (lenInPrefix > 0) {
            // A prefix of the common suffix overlaps with
            // the suffix of the block prefix so we first
            // test whether the prefix part matches:
            final byte[] termBytes = term.bytes;
            int termBytesPos = currentFrame.prefix - lenInPrefix;
            assert termBytesPos >= 0;
            final int termBytesPosEnd = currentFrame.prefix;
            while (termBytesPos < termBytesPosEnd) {
              if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
                isSubBlock = popPushNext();
                continue nextTerm;
              }
            }
            suffixBytesPos = currentFrame.startBytePos;
          } else {
            suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffix.length;
          }

          // Test overlapping suffix part:
          final int commonSuffixBytesPosEnd = commonSuffix.length;
          while (commonSuffixBytesPos < commonSuffixBytesPosEnd) {
            if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) {
              isSubBlock = popPushNext();
              continue nextTerm;
            }
          }
        }

        // TODO: maybe we should do the same linear test
        // that AutomatonTermsEnum does, so that if we
        // reach a part of the automaton where .* is
        // "temporarily" accepted, we just blindly .next()
        // until the limit

        // See if the term suffix matches the automaton:

        // We know from above that the first byte in our suffix (label) matches
        // the current transition, so we step from the 2nd byte
        // in the suffix:
        lastState = currentFrame.state;
        state = currentTransition.dest;

        int end = currentFrame.startBytePos + currentFrame.suffix;
        for (int idx=currentFrame.startBytePos+1;idx<end;idx++) {
          lastState = state;
          state = runAutomaton.step(state, suffixBytes[idx] & 0xff);
          if (state == -1) {
            // No match
            isSubBlock = popPushNext();
            continue nextTerm;
          }
        }
      } else {
        state = currentFrame.state;
        lastState = currentFrame.lastState;
      }

      if (isSubBlock) {
        // Match!  Recurse:
        copyTerm();
        currentFrame = pushFrame(state);
        currentTransition = currentFrame.transition;
        currentFrame.lastState = lastState;
      } else if (runAutomaton.isAccept(state)) {
        copyTerm();
        assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0: "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString();
        return term;
      } else {
        // This term is a prefix of a term accepted by the automaton, but is not itself acceptd
      }

      isSubBlock = popPushNext();
    }
  }

  // for debugging
  @SuppressWarnings("unused")
  static String brToString(BytesRef b) {
    try {
      return b.utf8ToString() + " " + b;
    } catch (Throwable t) {
      // If BytesRef isn't actually UTF8, or it's eg a
      // prefix of UTF8 that ends mid-unicode-char, we
      // fallback to hex:
      return b.toString();
    }
  }

  private void copyTerm() {
    final int len = currentFrame.prefix + currentFrame.suffix;
    if (term.bytes.length < len) {
      term.bytes = ArrayUtil.grow(term.bytes, len);
    }
    System.arraycopy(currentFrame.suffixBytes, currentFrame.startBytePos, term.bytes, currentFrame.prefix, currentFrame.suffix);
    term.length = len;
  }

  @Override
  public boolean seekExact(BytesRef text) {
    throw new UnsupportedOperationException();
  }

  @Override
  public void seekExact(long ord) {
    throw new UnsupportedOperationException();
  }

  @Override
  public long ord() {
    throw new UnsupportedOperationException();
  }

  @Override
  public SeekStatus seekCeil(BytesRef text) {
    throw new UnsupportedOperationException();
  }
}
/

org.apache.lucene/ lucene-core/ 8.2.0/ org/apache/lucene/codecs/blocktree/IntersectTermsEnum.java