org.apache.lucene/lucene-codecs/8.7.0 : org/apache/lucene/codecs/uniformsplit/BlockLine.java

BlockLine
https://lucene.apache.org/lucene-parent/lucene-codecs: Codecs and postings formats for Apache Lucene. (The Apache Software Foundation)
Apache License, Version 2.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.RamUsageEstimator;

One term block line.
 Contains a term and its details as a BlockTermState. 
 The line is written to the block file in two parts. The first part is the term followed by an offset to the details region. The second part is the term BlockTermState, written in the details region, after all the terms of the block. 
 The separate details region allows fast scan of the terms without having to decode the details for each term. At read time, the Serializer.readLine only reads the term and its offset to the details. The corresponding BlockTermState is decoded on demand in the BlockReader (see BlockReader.readTermStateIfNotRead). 
@lucene.experimental /**
 * One term block line.
 * <p>
 * Contains a term and its details as a {@link BlockTermState}.
 * <p>
 * The line is written to the {@link UniformSplitPostingsFormat#TERMS_BLOCKS_EXTENSION block file}
 * in two parts. The first part is the term followed by an offset to the details
 * region. The second part is the term {@link BlockTermState}, written in
 * the details region, after all the terms of the block.
 * <p>
 * The separate details region allows fast scan of the terms without having
 * to decode the details for each term. At read time, the {@link BlockLine.Serializer#readLine}
 * only reads the term and its offset to the details. The corresponding {@link BlockTermState}
 * is decoded on demand in the {@link BlockReader} (see {@link BlockReader#readTermStateIfNotRead}).
 *
 * @lucene.experimental
 */
public class BlockLine implements Accountable {

  private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(BlockLine.class);

  protected TermBytes termBytes;
  protected int termStateRelativeOffset;

  Only used for writing.
/**
   * Only used for writing.
   */
  protected final BlockTermState termState;

  Constructor used for writing a BlockLine. /**
   * Constructor used for writing a {@link BlockLine}.
   */
  protected BlockLine(TermBytes termBytes, BlockTermState termState) {
    this(termBytes, -1, termState);
  }

  Constructor used for reading a BlockLine. /**
   * Constructor used for reading a {@link BlockLine}.
   */
  protected BlockLine(TermBytes termBytes, int termStateRelativeOffset) {
    this(termBytes, termStateRelativeOffset, null);
  }

  private BlockLine(TermBytes termBytes, int termStateRelativeOffset, BlockTermState termState) {
    reset(termBytes, termStateRelativeOffset);
    this.termState = termState;
  }

  Resets this BlockLine to reuse it when reading. /**
   * Resets this {@link BlockLine} to reuse it when reading.
   */
  protected BlockLine reset(TermBytes termBytes, int termStateRelativeOffset) {
    assert termState == null;
    this.termBytes = termBytes;
    this.termStateRelativeOffset = termStateRelativeOffset;
    return this;
  }

  public TermBytes getTermBytes() {
    return termBytes;
  }

  Returns: The offset of the TermState bytes in the block, relatively to the term states base offset./**
   * @return The offset of the {@link org.apache.lucene.index.TermState}
   * bytes in the block, relatively to the term states base offset.
   */
  public int getTermStateRelativeOffset() {
    return termStateRelativeOffset;
  }

  @Override
  public long ramBytesUsed() {
    return BASE_RAM_USAGE
        + termBytes.ramBytesUsed()
        + RamUsageUtil.ramBytesUsed(termState);
  }

  Reads/writes block lines with terms encoded incrementally inside a block.
This class keeps a state of the previous term read to decode the next term.
/**
   * Reads/writes block lines with terms encoded incrementally inside a block.
   * This class keeps a state of the previous term read to decode the next term.
   */
  public static class Serializer implements Accountable {

    private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(Serializer.class);

    protected final BytesRef currentTerm;

    public Serializer() {
      currentTerm = new BytesRef(64);
    }

    Reads the current line.
Params: isIncrementalEncodingSeed – Whether the term is a seed of the incremental encoding. true for the first and middle term, false for other terms.
reuse –  A BlockLine instance to reuse; or null if none./**
     * Reads the current line.
     *
     * @param isIncrementalEncodingSeed Whether the term is a seed of the
     *                                  incremental encoding. {@code true} for the first and
     *                                  middle term, {@code false} for other terms.
     * @param reuse                     A {@link BlockLine} instance to reuse; or null if none.
     */
    public BlockLine readLine(DataInput blockInput, boolean isIncrementalEncodingSeed, BlockLine reuse) throws IOException {
      int termStateRelativeOffset = blockInput.readVInt();
      if (termStateRelativeOffset < 0) {
        throw new CorruptIndexException("Illegal termStateRelativeOffset= " + termStateRelativeOffset, blockInput);
      }
      return reuse == null ?
          new BlockLine(readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, null), termStateRelativeOffset)
          : reuse.reset(readIncrementallyEncodedTerm(blockInput, isIncrementalEncodingSeed, reuse.termBytes), termStateRelativeOffset);
    }

    Writes a line and its offset to the corresponding term state details in
the details region.
Params: blockOutput –               The output pointing to the block terms region.
termStateRelativeOffset –   The offset to the corresponding term
                                 state details in the details region.
isIncrementalEncodingSeed – Whether the term is a seed of the incremental encoding. true for the first and middle term, false for other terms./**
     * Writes a line and its offset to the corresponding term state details in
     * the details region.
     *
     * @param blockOutput               The output pointing to the block terms region.
     * @param termStateRelativeOffset   The offset to the corresponding term
     *                                  state details in the details region.
     * @param isIncrementalEncodingSeed Whether the term is a seed of
     *                                  the incremental encoding. {@code true} for the first
     *                                  and middle term, {@code false} for other terms.
     */
    public void writeLine(DataOutput blockOutput, BlockLine line, BlockLine previousLine,
                                 int termStateRelativeOffset, boolean isIncrementalEncodingSeed) throws IOException {
      blockOutput.writeVInt(termStateRelativeOffset);
      writeIncrementallyEncodedTerm(line.getTermBytes(), previousLine == null ? null : previousLine.getTermBytes(),
          isIncrementalEncodingSeed, blockOutput);
    }

    Writes the term state details of a line in the details region.
Params: termStatesOutput – The output pointing to the details region./**
     * Writes the term state details of a line in the details region.
     *
     * @param termStatesOutput The output pointing to the details region.
     */
    protected void writeLineTermState(DataOutput termStatesOutput, BlockLine line,
                                   FieldInfo fieldInfo, DeltaBaseTermStateSerializer encoder) throws IOException {
      assert line.termState != null;
      encoder.writeTermState(termStatesOutput, fieldInfo, line.termState);
    }

    protected void writeIncrementallyEncodedTerm(TermBytes termBytes, TermBytes previousTermBytes,
                                                      boolean isIncrementalEncodingSeed, DataOutput blockOutput) throws IOException {
      BytesRef term = termBytes.getTerm();
      assert term.offset == 0;
      if (isIncrementalEncodingSeed) {
        // Mdp length is always 1 for an incremental encoding seed.
        blockOutput.writeVLong(term.length);
        blockOutput.writeBytes(term.bytes, 0, term.length);
        return;
      }
      if (term.length == 0) {
        // Empty term.
        blockOutput.writeVLong(0);
        return;
      }

      // For other lines we store:
      // - Mdp length.
      // - Suffix length.
      // - Suffix bytes.
      // Instead of writing mdp length and suffix length with 2 VInt, we can compress the storage
      // by merging them in a single VLong. The idea is to leverage the information we have about
      // the previous line. We know the previous line term length. And we know that
      // new line mdp length <= (previous line term length + 1)
      // So if numMdpBits = numBitsToEncode(previous line term length),
      // then we know we can encode (new line mdp length - 1) in numMdpBits.
      // Hence we encode (new line mdp length - 1) in the rightmost numMdpBits of the VLong.
      // And we encode new line suffix length in the remaining left bits of the VLong.
      // Most of the time both values will be encoded in a single byte.

      assert previousTermBytes != null;
      assert termBytes.getMdpLength() >= 1;

      int numMdpBits = numBitsToEncode(previousTermBytes.getTerm().length);
      assert numBitsToEncode(termBytes.getMdpLength() - 1) <= numMdpBits;

      long mdpAndSuffixLengths = (((long) termBytes.getSuffixLength()) << numMdpBits) | (termBytes.getMdpLength() - 1);
      assert mdpAndSuffixLengths != 0;
      blockOutput.writeVLong(mdpAndSuffixLengths);
      blockOutput.writeBytes(term.bytes, termBytes.getSuffixOffset(), termBytes.getSuffixLength());
    }

    protected TermBytes readIncrementallyEncodedTerm(DataInput blockInput, boolean isIncrementalEncodingSeed, TermBytes reuse) throws IOException {
      assert currentTerm.offset == 0;
      int mdpLength;
      if (isIncrementalEncodingSeed) {
        int length = (int) blockInput.readVLong();
        mdpLength = length == 0 ? 0 : 1;
        readBytes(blockInput, currentTerm, 0, length);
      } else {
        long mdpAndSuffixLengths = blockInput.readVLong();
        if (mdpAndSuffixLengths == 0) {
          // Empty term.
          mdpLength = 0;
          currentTerm.length = 0;
        } else {
          int numMdpBits = numBitsToEncode(currentTerm.length);
          mdpLength = (int) (mdpAndSuffixLengths & ((1 << numMdpBits) - 1)) + 1; // Get rightmost numMdpBits.
          int suffixLength = (int) (mdpAndSuffixLengths >>> numMdpBits); // Get remaining left bits.
          assert mdpLength >= 1;
          assert suffixLength >= 1;
          readBytes(blockInput, currentTerm, mdpLength - 1, suffixLength);
        }
      }
      return reuse == null ?
          new TermBytes(mdpLength, currentTerm)
          : reuse.reset(mdpLength, currentTerm);
    }

    Reads length bytes from the given DataInput and stores them at offset in bytes.bytes. /**
     * Reads {@code length} bytes from the given {@link DataInput} and stores
     * them at {@code offset} in {@code bytes.bytes}.
     */
    protected void readBytes(DataInput input, BytesRef bytes, int offset, int length) throws IOException {
      assert bytes.offset == 0;
      bytes.length = offset + length;
      bytes.bytes = ArrayUtil.grow(bytes.bytes, bytes.length);
      input.readBytes(bytes.bytes, offset, length);
    }

    @Override
    public long ramBytesUsed() {
      return BASE_RAM_USAGE
          + RamUsageUtil.ramBytesUsed(currentTerm);
    }

    Gets the number of bits required to encode the value of the provided int.
Returns 0 for int value 0. Equivalent to (log2(i) + 1).
/**
     * Gets the number of bits required to encode the value of the provided int.
     * Returns 0 for int value 0. Equivalent to (log2(i) + 1).
     */
    protected static int numBitsToEncode(int i) {
      return 32 - Integer.numberOfLeadingZeros(i);
    }
  }
}
/

org.apache.lucene/ lucene-codecs/ 8.7.0/ org/apache/lucene/codecs/uniformsplit/BlockLine.java