/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.OffHeapFSTStore;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

Immutable stateless FST-based index dictionary kept in memory.

Use Builder to build the IndexDictionary.

Create a stateful Browser to seek a term in this IndexDictionary and get its corresponding block file pointer to the terms block file.

Its greatest advantage is to be very compact in memory thanks to both the compaction of the FST as a byte array, and the incremental encoding of the leaves block pointer values, which are long integers in increasing order, with PositiveIntOutputs.
With a compact dictionary in memory we can increase the number of blocks. This allows us to reduce the average block size, which means faster scan inside a block.

@lucene.experimental
/** * Immutable stateless {@link FST}-based index dictionary kept in memory. * <p> * Use {@link IndexDictionary.Builder} to build the {@link IndexDictionary}. * <p> * Create a stateful {@link IndexDictionary.Browser} to seek a term in this * {@link IndexDictionary} and get its corresponding block file pointer to * the terms block file. * <p> * Its greatest advantage is to be very compact in memory thanks to both * the compaction of the {@link FST} as a byte array, and the incremental * encoding of the leaves block pointer values, which are long integers in * increasing order, with {@link PositiveIntOutputs}.<br> * With a compact dictionary in memory we can increase the number of blocks. * This allows us to reduce the average block size, which means faster scan * inside a block. * * @lucene.experimental */
public class FSTDictionary implements IndexDictionary { private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(FSTDictionary.class); protected final FST<Long> fst; protected FSTDictionary(FST<Long> fst) { this.fst = fst; } @Override public long ramBytesUsed() { return BASE_RAM_USAGE + fst.ramBytesUsed(); } @Override public void write(DataOutput output, BlockEncoder blockEncoder) throws IOException { if (blockEncoder == null) { fst.save(output, output); } else { ByteBuffersDataOutput bytesDataOutput = ByteBuffersDataOutput.newResettableInstance(); fst.save(bytesDataOutput, bytesDataOutput); BlockEncoder.WritableBytes encodedBytes = blockEncoder.encode(bytesDataOutput.toDataInput(), bytesDataOutput.size()); output.writeVLong(encodedBytes.size()); encodedBytes.writeTo(output); } }
Reads a FSTDictionary from the provided input.
Params:
  • blockDecoder – The BlockDecoder to use for specific decoding; or null if none.
/** * Reads a {@link FSTDictionary} from the provided input. * @param blockDecoder The {@link BlockDecoder} to use for specific decoding; or null if none. */
protected static FSTDictionary read(DataInput input, BlockDecoder blockDecoder, boolean isFSTOnHeap) throws IOException { DataInput fstDataInput; if (blockDecoder == null) { fstDataInput = input; } else { long numBytes = input.readVLong(); BytesRef decodedBytes = blockDecoder.decode(input, numBytes); fstDataInput = new ByteArrayDataInput(decodedBytes.bytes, 0, decodedBytes.length); // OffHeapFSTStore.init() requires a DataInput which is an instance of IndexInput. // When the block is decoded we must load the FST on heap. isFSTOnHeap = true; } PositiveIntOutputs fstOutputs = PositiveIntOutputs.getSingleton(); FST<Long> fst = isFSTOnHeap ? new FST<>(fstDataInput, fstDataInput, fstOutputs) : new FST<>(fstDataInput, fstDataInput, fstOutputs, new OffHeapFSTStore()); return new FSTDictionary(fst); } @Override public Browser browser() { return new Browser(); }
Stateful Browser to seek a term in this FSTDictionary and get its corresponding block file pointer in the block file.
/** * Stateful {@link Browser} to seek a term in this {@link FSTDictionary} * and get its corresponding block file pointer in the block file. */
protected class Browser implements IndexDictionary.Browser { protected final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); @Override public long seekBlock(BytesRef term) throws IOException { BytesRefFSTEnum.InputOutput<Long> seekFloor = fstEnum.seekFloor(term); return seekFloor == null ? -1 : seekFloor.output; } }
Provides stateful Browser to seek in the FSTDictionary.
@lucene.experimental
/** * Provides stateful {@link Browser} to seek in the {@link FSTDictionary}. * * @lucene.experimental */
public static class BrowserSupplier implements IndexDictionary.BrowserSupplier { protected final IndexInput dictionaryInput; protected final BlockDecoder blockDecoder; protected final boolean isFSTOnHeap;
Lazy loaded immutable index dictionary FST. The FST is either kept off-heap, or hold in RAM on-heap.
/** * Lazy loaded immutable index dictionary FST. * The FST is either kept off-heap, or hold in RAM on-heap. */
protected IndexDictionary dictionary; public BrowserSupplier(IndexInput dictionaryInput, long dictionaryStartFP, BlockDecoder blockDecoder, boolean isFSTOnHeap) throws IOException { this.dictionaryInput = dictionaryInput.clone(); this.dictionaryInput.seek(dictionaryStartFP); this.blockDecoder = blockDecoder; this.isFSTOnHeap = isFSTOnHeap; } @Override public IndexDictionary.Browser get() throws IOException { // This double-check idiom does not require the dictionary to be volatile // because it is immutable. See section "Double-Checked Locking Immutable Objects" // of https://www.cs.umd.edu/~pugh/java/memoryModel/DoubleCheckedLocking.html. if (dictionary == null) { synchronized (this) { if (dictionary == null) { dictionary = read(dictionaryInput, blockDecoder, isFSTOnHeap); } } } return dictionary.browser(); } @Override public long ramBytesUsed() { return dictionary == null ? 0 : dictionary.ramBytesUsed(); } }
Builds an immutable FSTDictionary.
@lucene.experimental
/** * Builds an immutable {@link FSTDictionary}. * * @lucene.experimental */
public static class Builder implements IndexDictionary.Builder { protected final org.apache.lucene.util.fst.Builder<Long> fstBuilder; protected final IntsRefBuilder scratchInts; public Builder() { PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); fstBuilder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE1, outputs); scratchInts = new IntsRefBuilder(); } @Override public void add(BytesRef blockKey, long blockFilePointer) throws IOException { fstBuilder.add(Util.toIntsRef(blockKey, scratchInts), blockFilePointer); } @Override public FSTDictionary build() throws IOException { return new FSTDictionary(fstBuilder.finish()); } } }