/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.codecs.uniformsplit;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;

import static org.apache.lucene.codecs.uniformsplit.UniformSplitPostingsFormat.*;

A block-based terms index and dictionary based on the Uniform Split technique.
See Also:
  • UniformSplitTermsWriter
@lucene.experimental
/** * A block-based terms index and dictionary based on the Uniform Split technique. * * @see UniformSplitTermsWriter * @lucene.experimental */
public class UniformSplitTermsReader extends FieldsProducer { private static final long BASE_RAM_USAGE = RamUsageEstimator.shallowSizeOfInstance(UniformSplitTermsReader.class) + RamUsageEstimator.shallowSizeOfInstance(IndexInput.class) * 2; protected final PostingsReaderBase postingsReader; protected final int version; protected final IndexInput blockInput; protected final IndexInput dictionaryInput; protected final Map<String, UniformSplitTerms> fieldToTermsMap; // Keeps the order of the field names; much more efficient than having a TreeMap for the fieldToTermsMap. protected final Collection<String> sortedFieldNames;
Params:
  • blockDecoder – Optional block decoder, may be null if none. It can be used for decompression or decryption.
  • dictionaryOnHeap – Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without impact on performance. If block encoding/decoding is used, then the dictionary is always loaded on-heap whatever this parameter value is.
/** * @param blockDecoder Optional block decoder, may be null if none. * It can be used for decompression or decryption. * @param dictionaryOnHeap Whether to force loading the terms dictionary on-heap. By default it is kept off-heap without * impact on performance. If block encoding/decoding is used, then the dictionary is always * loaded on-heap whatever this parameter value is. */
public UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder, boolean dictionaryOnHeap) throws IOException { this(postingsReader, state, blockDecoder, dictionaryOnHeap, FieldMetadata.Serializer.INSTANCE, NAME, VERSION_START, VERSION_CURRENT, TERMS_BLOCKS_EXTENSION, TERMS_DICTIONARY_EXTENSION); }
See Also:
  • UniformSplitTermsReader(PostingsReaderBase, SegmentReadState, BlockDecoder, boolean)
/** * @see #UniformSplitTermsReader(PostingsReaderBase, SegmentReadState, BlockDecoder, boolean) */
protected UniformSplitTermsReader(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder, boolean dictionaryOnHeap, FieldMetadata.Serializer fieldMetadataReader, String codecName, int versionStart, int versionCurrent, String termsBlocksExtension, String dictionaryExtension) throws IOException { IndexInput dictionaryInput = null; IndexInput blockInput = null; boolean success = false; try { this.postingsReader = postingsReader; String segmentName = state.segmentInfo.name; String termsName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, termsBlocksExtension); blockInput = state.directory.openInput(termsName, state.context); version = CodecUtil.checkIndexHeader(blockInput, codecName, versionStart, versionCurrent, state.segmentInfo.getId(), state.segmentSuffix); String indexName = IndexFileNames.segmentFileName(segmentName, state.segmentSuffix, dictionaryExtension); dictionaryInput = state.directory.openInput(indexName, state.context); CodecUtil.checkIndexHeader(dictionaryInput, codecName, version, version, state.segmentInfo.getId(), state.segmentSuffix); CodecUtil.checksumEntireFile(dictionaryInput); postingsReader.init(blockInput, state); CodecUtil.retrieveChecksum(blockInput); seekFieldsMetadata(blockInput); Collection<FieldMetadata> fieldMetadataCollection = readFieldsMetadata(blockInput, blockDecoder, state.fieldInfos, fieldMetadataReader, state.segmentInfo.maxDoc()); fieldToTermsMap = new HashMap<>(); this.blockInput = blockInput; this.dictionaryInput = dictionaryInput; fillFieldMap(postingsReader, state, blockDecoder, dictionaryOnHeap, dictionaryInput, blockInput, fieldMetadataCollection, state.fieldInfos); List<String> fieldNames = new ArrayList<>(fieldToTermsMap.keySet()); Collections.sort(fieldNames); sortedFieldNames = Collections.unmodifiableList(fieldNames); success = true; } finally { if (!success) { IOUtils.closeWhileHandlingException(blockInput, dictionaryInput); } } } protected void fillFieldMap(PostingsReaderBase postingsReader, SegmentReadState state, BlockDecoder blockDecoder, boolean dictionaryOnHeap, IndexInput dictionaryInput, IndexInput blockInput, Collection<FieldMetadata> fieldMetadataCollection, FieldInfos fieldInfos) throws IOException { for (FieldMetadata fieldMetadata : fieldMetadataCollection) { IndexDictionary.BrowserSupplier dictionaryBrowserSupplier = createDictionaryBrowserSupplier(state, dictionaryInput, fieldMetadata, blockDecoder, dictionaryOnHeap); fieldToTermsMap.put(fieldMetadata.getFieldInfo().name, new UniformSplitTerms(blockInput, fieldMetadata, postingsReader, blockDecoder, dictionaryBrowserSupplier)); } } protected IndexDictionary.BrowserSupplier createDictionaryBrowserSupplier(SegmentReadState state, IndexInput dictionaryInput, FieldMetadata fieldMetadata, BlockDecoder blockDecoder, boolean dictionaryOnHeap) throws IOException { return new FSTDictionary.BrowserSupplier(dictionaryInput, fieldMetadata.getDictionaryStartFP(), blockDecoder, dictionaryOnHeap); }
Params:
/** * @param indexInput {@link IndexInput} must be positioned to the fields metadata * details by calling {@link #seekFieldsMetadata(IndexInput)} before this call. * @param blockDecoder Optional block decoder, may be null if none. */
protected Collection<FieldMetadata> readFieldsMetadata(IndexInput indexInput, BlockDecoder blockDecoder, FieldInfos fieldInfos, FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException { int numFields = indexInput.readVInt(); if (numFields < 0) { throw new CorruptIndexException("Illegal number of fields= " + numFields, indexInput); } return (blockDecoder != null && version >= VERSION_ENCODABLE_FIELDS_METADATA) ? readEncodedFieldsMetadata(numFields, indexInput, blockDecoder, fieldInfos, fieldMetadataReader, maxNumDocs) : readUnencodedFieldsMetadata(numFields, indexInput, fieldInfos, fieldMetadataReader, maxNumDocs); } protected Collection<FieldMetadata> readEncodedFieldsMetadata(int numFields, DataInput metadataInput, BlockDecoder blockDecoder, FieldInfos fieldInfos, FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException { long encodedLength = metadataInput.readVLong(); if (encodedLength < 0) { throw new CorruptIndexException("Illegal encoded length: " + encodedLength, metadataInput); } BytesRef decodedBytes = blockDecoder.decode(metadataInput, encodedLength); DataInput decodedMetadataInput = new ByteArrayDataInput(decodedBytes.bytes, 0, decodedBytes.length); return readUnencodedFieldsMetadata(numFields, decodedMetadataInput, fieldInfos, fieldMetadataReader, maxNumDocs); } protected Collection<FieldMetadata> readUnencodedFieldsMetadata(int numFields, DataInput metadataInput, FieldInfos fieldInfos, FieldMetadata.Serializer fieldMetadataReader, int maxNumDocs) throws IOException { Collection<FieldMetadata> fieldMetadataCollection = new ArrayList<>(numFields); for (int i = 0; i < numFields; i++) { fieldMetadataCollection.add(fieldMetadataReader.read(metadataInput, fieldInfos, maxNumDocs)); } return fieldMetadataCollection; } @Override public void close() throws IOException { try { IOUtils.close(blockInput, dictionaryInput, postingsReader); } finally { // Clear so refs to terms index is GCable even if app hangs onto us. fieldToTermsMap.clear(); } } @Override public void checkIntegrity() throws IOException { // term dictionary CodecUtil.checksumEntireFile(blockInput); // postings postingsReader.checkIntegrity(); } @Override public Iterator<String> iterator() { return sortedFieldNames.iterator(); } @Override public Terms terms(String field) { return fieldToTermsMap.get(field); } @Override public int size() { return fieldToTermsMap.size(); } @Override public long ramBytesUsed() { long ramUsage = BASE_RAM_USAGE; ramUsage += postingsReader.ramBytesUsed(); ramUsage += RamUsageUtil.ramBytesUsedByHashMapOfSize(fieldToTermsMap.size()); ramUsage += getTermsRamBytesUsed(); ramUsage += RamUsageUtil.ramBytesUsedByUnmodifiableArrayListOfSize(sortedFieldNames.size()); return ramUsage; } protected long getTermsRamBytesUsed() { long ramUsage = 0L; for (UniformSplitTerms terms : fieldToTermsMap.values()) { ramUsage += terms.ramBytesUsed(); } return ramUsage; }
Positions the given IndexInput at the beginning of the fields metadata.
/** * Positions the given {@link IndexInput} at the beginning of the fields metadata. */
protected void seekFieldsMetadata(IndexInput indexInput) throws IOException { indexInput.seek(indexInput.length() - CodecUtil.footerLength() - 8); indexInput.seek(indexInput.readLong()); } }