package org.apache.lucene.codecs.idversion;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.blocktree.BlockTreeTermsWriter;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.ByteSequenceOutputs;
import org.apache.lucene.util.fst.BytesRefFSTEnum;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PairOutputs.Pair;
import org.apache.lucene.util.fst.PairOutputs;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
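/**
 * Writes the terms dictionary and terms index for the ID/version postings format. This is a
 * fork of the block tree terms writer whose index FST output pairs each block's metadata with
 * the maximum id/version of any term below that block, so readers can skip blocks whose
 * versions are all older than a requested version.
 */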
public final class VersionBlockTreeTermsWriter extends FieldsConsumer {
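  // Index FST outputs pair a block's encoded metadata bytes with
  // Long.MAX_VALUE minus the maximum id/version under that block (see compileIndex).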
static final PairOutputs<BytesRef,Long> FST_OUTPUTS = new PairOutputs<>(ByteSequenceOutputs.getSingleton(),
PositiveIntOutputs.getSingleton());
static final Pair<BytesRef,Long> NO_OUTPUT = FST_OUTPUTS.getNoOutput();
public final static int DEFAULT_MIN_BLOCK_SIZE = 25;
public final static int DEFAULT_MAX_BLOCK_SIZE = 48;
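  // The two low bits of an encoded block file pointer store the hasTerms and isFloor flags.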
static final int OUTPUT_FLAGS_NUM_BITS = 2;
static final int OUTPUT_FLAGS_MASK = 0x3;
static final int OUTPUT_FLAG_IS_FLOOR = 0x1;
static final int OUTPUT_FLAG_HAS_TERMS = 0x2;
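  /** Extension and codec header name of the terms dictionary file. */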
static final String TERMS_EXTENSION = "tiv";
final static String TERMS_CODEC_NAME = "VersionBlockTreeTermsDict";
public static final int VERSION_START = 1;
public static final int VERSION_CURRENT = VERSION_START;
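  /** Extension and codec header name of the terms index file. */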
static final String TERMS_INDEX_EXTENSION = "tipv";
final static String TERMS_INDEX_CODEC_NAME = "VersionBlockTreeTermsIndex";
private final IndexOutput out;
private final IndexOutput indexOut;
final int maxDoc;
final int minItemsInBlock;
final int maxItemsInBlock;
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
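  /** Per-field stats (root block code, term count, index start file pointer, min/max term)
   *  collected while writing and flushed to the terms file on close. */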
private static class FieldMetaData {
public final FieldInfo fieldInfo;
public final Pair<BytesRef,Long> rootCode;
public final long numTerms;
public final long indexStartFP;
public final BytesRef minTerm;
public final BytesRef maxTerm;
public FieldMetaData(FieldInfo fieldInfo, Pair<BytesRef,Long> rootCode, long numTerms, long indexStartFP,
BytesRef minTerm, BytesRef maxTerm) {
assert numTerms > 0;
this.fieldInfo = fieldInfo;
assert rootCode != null: "field=" + fieldInfo.name + " numTerms=" + numTerms;
this.rootCode = rootCode;
this.indexStartFP = indexStartFP;
this.numTerms = numTerms;
this.minTerm = minTerm;
this.maxTerm = maxTerm;
}
}
private final List<FieldMetaData> fields = new ArrayList<>();
private final String segment;
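  /** Creates the writer and opens the terms and index outputs; both are closed if construction fails. */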
public VersionBlockTreeTermsWriter(
SegmentWriteState state,
PostingsWriterBase postingsWriter,
int minItemsInBlock,
int maxItemsInBlock)
throws IOException
{
BlockTreeTermsWriter.validateSettings(minItemsInBlock, maxItemsInBlock);
segment = state.segmentInfo.name;
maxDoc = state.segmentInfo.maxDoc();
final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION);
out = state.directory.createOutput(termsFileName, state.context);
boolean success = false;
IndexOutput indexOut = null;
try {
fieldInfos = state.fieldInfos;
this.minItemsInBlock = minItemsInBlock;
this.maxItemsInBlock = maxItemsInBlock;
CodecUtil.writeIndexHeader(out, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
indexOut = state.directory.createOutput(termsIndexFileName, state.context);
CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
this.postingsWriter = postingsWriter;
postingsWriter.init(out, state);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(out, indexOut);
}
}
this.indexOut = indexOut;
}
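  /** Writes the terms file trailer: the file pointer of the per-field directory. */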
private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
out.writeLong(dirStart);
}
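  /** Writes the terms index file trailer: the file pointer of the per-field index directory. */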
  private void writeIndexTrailer(IndexOutput indexOut, long dirStart) throws IOException {
indexOut.writeLong(dirStart);
}
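  // FieldsConsumer entry point: fields arrive in sorted order; each field's terms are pulled
  // from its TermsEnum and handed to a per-field TermsWriter.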
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
String lastField = null;
for(String field : fields) {
assert lastField == null || lastField.compareTo(field) < 0;
lastField = field;
Terms terms = fields.terms(field);
if (terms == null) {
continue;
}
TermsEnum termsEnum = terms.iterator();
TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field));
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
termsWriter.write(term, termsEnum, norms);
}
termsWriter.finish();
}
}
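  // Packs a block's file pointer and its hasTerms/isFloor flags into a single long.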
static long encodeOutput(long fp, boolean hasTerms, boolean isFloor) {
assert fp < (1L << 62);
return (fp << 2) | (hasTerms ? OUTPUT_FLAG_HAS_TERMS : 0) | (isFloor ? OUTPUT_FLAG_IS_FLOOR : 0);
}
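  /** An entry on the pending stack: either a term or a block that has not yet been
   *  folded into an enclosing block. */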
private static class PendingEntry {
public final boolean isTerm;
protected PendingEntry(boolean isTerm) {
this.isTerm = isTerm;
}
}
private static final class PendingTerm extends PendingEntry {
public final byte[] termBytes;
public final BlockTermState state;
public PendingTerm(BytesRef term, BlockTermState state) {
super(true);
this.termBytes = new byte[term.length];
System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length);
this.state = state;
}
@Override
public String toString() {
return brToString(termBytes);
}
}
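  // Debugging helpers: render term bytes as UTF-8 when possible, falling back to the raw byte dump.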
@SuppressWarnings("unused")
static String brToString(BytesRef b) {
try {
return b.utf8ToString() + " " + b;
} catch (Throwable t) {
return b.toString();
}
}
@SuppressWarnings("unused")
static String brToString(byte[] b) {
return brToString(new BytesRef(b));
}
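  /** A block that has been written to the terms file but whose index has not yet been merged
   *  into its parent; tracks the maximum id/version of any term it contains. */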
private static final class PendingBlock extends PendingEntry {
public final BytesRef prefix;
public final long fp;
public FST<Pair<BytesRef,Long>> index;
public List<FST<Pair<BytesRef,Long>>> subIndices;
public final boolean hasTerms;
public final boolean isFloor;
public final int floorLeadByte;
private final long maxVersion;
public PendingBlock(BytesRef prefix, long maxVersion, long fp, boolean hasTerms, boolean isFloor, int floorLeadByte, List<FST<Pair<BytesRef,Long>>> subIndices) {
super(false);
this.prefix = prefix;
this.maxVersion = maxVersion;
this.fp = fp;
this.hasTerms = hasTerms;
this.isFloor = isFloor;
this.floorLeadByte = floorLeadByte;
this.subIndices = subIndices;
}
@Override
public String toString() {
return "BLOCK: " + brToString(prefix);
}
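    // Builds the index FST for this block and its floor blocks: a single root entry for the
    // shared prefix carries the encoded block metadata and Long.MAX_VALUE - max version, and
    // the indices of any sub-blocks are then appended.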
public void compileIndex(List<PendingBlock> blocks, RAMOutputStream scratchBytes, IntsRefBuilder scratchIntsRef) throws IOException {
assert (isFloor && blocks.size() > 1) || (isFloor == false && blocks.size() == 1): "isFloor=" + isFloor + " blocks=" + blocks;
assert this == blocks.get(0);
assert scratchBytes.getFilePointer() == 0;
long maxVersionIndex = maxVersion;
scratchBytes.writeVLong(encodeOutput(fp, hasTerms, isFloor));
if (isFloor) {
scratchBytes.writeVInt(blocks.size()-1);
for (int i=1;i<blocks.size();i++) {
PendingBlock sub = blocks.get(i);
maxVersionIndex = Math.max(maxVersionIndex, sub.maxVersion);
scratchBytes.writeByte((byte) sub.floorLeadByte);
assert sub.fp > fp;
scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0));
}
}
final Builder<Pair<BytesRef,Long>> indexBuilder = new Builder<>(FST.INPUT_TYPE.BYTE1,
0, 0, true, false, Integer.MAX_VALUE,
FST_OUTPUTS, true, 15);
final byte[] bytes = new byte[(int) scratchBytes.getFilePointer()];
assert bytes.length > 0;
scratchBytes.writeTo(bytes, 0);
indexBuilder.add(Util.toIntsRef(prefix, scratchIntsRef), FST_OUTPUTS.newPair(new BytesRef(bytes, 0, bytes.length), Long.MAX_VALUE - maxVersionIndex));
scratchBytes.reset();
for(PendingBlock block : blocks) {
if (block.subIndices != null) {
for(FST<Pair<BytesRef,Long>> subIndex : block.subIndices) {
append(indexBuilder, subIndex, scratchIntsRef);
}
block.subIndices = null;
}
}
index = indexBuilder.finish();
assert subIndices == null;
}
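    // Copies every entry of a sub-block's index FST into the index being built.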
private void append(Builder<Pair<BytesRef,Long>> builder, FST<Pair<BytesRef,Long>> subIndex, IntsRefBuilder scratchIntsRef) throws IOException {
final BytesRefFSTEnum<Pair<BytesRef,Long>> subIndexEnum = new BytesRefFSTEnum<>(subIndex);
BytesRefFSTEnum.InputOutput<Pair<BytesRef,Long>> indexEnt;
while((indexEnt = subIndexEnum.next()) != null) {
builder.add(Util.toIntsRef(indexEnt.input, scratchIntsRef), indexEnt.output);
}
}
}
private final RAMOutputStream scratchBytes = new RAMOutputStream();
private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
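  /** Writes all terms of a single field, maintaining the stack of pending terms and blocks. */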
class TermsWriter {
private final FieldInfo fieldInfo;
private long numTerms;
final FixedBitSet docsSeen;
long indexStartFP;
private final BytesRefBuilder lastTerm = new BytesRefBuilder();
private int[] prefixStarts = new int[8];
private final List<PendingEntry> pending = new ArrayList<>();
private final List<PendingBlock> newBlocks = new ArrayList<>();
private PendingTerm firstPendingTerm;
private PendingTerm lastPendingTerm;
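    /** Writes the top {@code count} pending entries, which share a prefix of length
     *  {@code prefixLength}, as a single block, or as multiple floor blocks (split on the
     *  byte following the prefix) when there are too many entries for one block. */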
void writeBlocks(int prefixLength, int count) throws IOException {
assert count > 0;
assert prefixLength > 0 || count == pending.size();
int lastSuffixLeadLabel = -1;
boolean hasTerms = false;
boolean hasSubBlocks = false;
int start = pending.size()-count;
int end = pending.size();
int nextBlockStart = start;
int nextFloorLeadLabel = -1;
for (int i=start; i<end; i++) {
PendingEntry ent = pending.get(i);
int suffixLeadLabel;
if (ent.isTerm) {
PendingTerm term = (PendingTerm) ent;
if (term.termBytes.length == prefixLength) {
assert lastSuffixLeadLabel == -1;
suffixLeadLabel = -1;
} else {
suffixLeadLabel = term.termBytes[prefixLength] & 0xff;
}
} else {
PendingBlock block = (PendingBlock) ent;
assert block.prefix.length > prefixLength;
suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff;
}
if (suffixLeadLabel != lastSuffixLeadLabel) {
int itemsInBlock = i - nextBlockStart;
if (itemsInBlock >= minItemsInBlock && end-nextBlockStart > maxItemsInBlock) {
boolean isFloor = itemsInBlock < count;
newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, i, hasTerms, hasSubBlocks));
hasTerms = false;
hasSubBlocks = false;
nextFloorLeadLabel = suffixLeadLabel;
nextBlockStart = i;
}
lastSuffixLeadLabel = suffixLeadLabel;
}
if (ent.isTerm) {
hasTerms = true;
} else {
hasSubBlocks = true;
}
}
if (nextBlockStart < end) {
int itemsInBlock = end - nextBlockStart;
boolean isFloor = itemsInBlock < count;
newBlocks.add(writeBlock(prefixLength, isFloor, nextFloorLeadLabel, nextBlockStart, end, hasTerms, hasSubBlocks));
}
assert newBlocks.isEmpty() == false;
PendingBlock firstBlock = newBlocks.get(0);
assert firstBlock.isFloor || newBlocks.size() == 1;
firstBlock.compileIndex(newBlocks, scratchBytes, scratchIntsRef);
pending.subList(pending.size()-count, pending.size()).clear();
pending.add(firstBlock);
newBlocks.clear();
}
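    /** Writes one block's suffixes and term metadata to the terms file, returning a
     *  PendingBlock that records the block's start file pointer and maximum id/version. */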
private PendingBlock writeBlock(int prefixLength, boolean isFloor, int floorLeadLabel, int start, int end, boolean hasTerms, boolean hasSubBlocks) throws IOException {
assert end > start;
long startFP = out.getFilePointer();
boolean hasFloorLeadLabel = isFloor && floorLeadLabel != -1;
final BytesRef prefix = new BytesRef(prefixLength + (hasFloorLeadLabel ? 1 : 0));
System.arraycopy(lastTerm.bytes(), 0, prefix.bytes, 0, prefixLength);
prefix.length = prefixLength;
int numEntries = end - start;
int code = numEntries << 1;
if (end == pending.size()) {
code |= 1;
}
out.writeVInt(code);
boolean isLeafBlock = hasSubBlocks == false;
final List<FST<Pair<BytesRef,Long>>> subIndices;
boolean absolute = true;
long maxVersionInBlock = -1;
if (isLeafBlock) {
subIndices = null;
for (int i=start;i<end;i++) {
PendingEntry ent = pending.get(i);
assert ent.isTerm: "i=" + i;
PendingTerm term = (PendingTerm) ent;
assert StringHelper.startsWith(term.termBytes, prefix): "term.term=" + term.termBytes + " prefix=" + prefix;
BlockTermState state = term.state;
maxVersionInBlock = Math.max(maxVersionInBlock, ((IDVersionTermState) state).idVersion);
final int suffix = term.termBytes.length - prefixLength;
suffixWriter.writeVInt(suffix);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
absolute = false;
}
} else {
subIndices = new ArrayList<>();
for (int i=start;i<end;i++) {
PendingEntry ent = pending.get(i);
if (ent.isTerm) {
PendingTerm term = (PendingTerm) ent;
assert StringHelper.startsWith(term.termBytes, prefix): "term.term=" + term.termBytes + " prefix=" + prefix;
BlockTermState state = term.state;
maxVersionInBlock = Math.max(maxVersionInBlock, ((IDVersionTermState) state).idVersion);
final int suffix = term.termBytes.length - prefixLength;
suffixWriter.writeVInt(suffix<<1);
suffixWriter.writeBytes(term.termBytes, prefixLength, suffix);
assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel;
postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute);
absolute = false;
} else {
PendingBlock block = (PendingBlock) ent;
maxVersionInBlock = Math.max(maxVersionInBlock, block.maxVersion);
assert StringHelper.startsWith(block.prefix, prefix);
final int suffix = block.prefix.length - prefixLength;
assert suffix > 0;
suffixWriter.writeVInt((suffix<<1)|1);
suffixWriter.writeBytes(block.prefix.bytes, prefixLength, suffix);
assert floorLeadLabel == -1 || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel;
assert block.fp < startFP;
suffixWriter.writeVLong(startFP - block.fp);
subIndices.add(block.index);
}
}
assert subIndices.size() != 0;
}
out.writeVInt((int) (suffixWriter.getFilePointer() << 1) | (isLeafBlock ? 1:0));
suffixWriter.writeTo(out);
suffixWriter.reset();
out.writeVInt((int) metaWriter.getFilePointer());
metaWriter.writeTo(out);
metaWriter.reset();
if (hasFloorLeadLabel) {
prefix.bytes[prefix.length++] = (byte) floorLeadLabel;
}
return new PendingBlock(prefix, maxVersionInBlock, startFP, hasTerms, isFloor, floorLeadLabel, subIndices);
}
TermsWriter(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
docsSeen = new FixedBitSet(maxDoc);
postingsWriter.setField(fieldInfo);
}
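    /** Invokes the postings writer for one term and, if the postings writer accepted at
     *  least one document for it, buffers the term on the pending stack. */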
public void write(BytesRef text, TermsEnum termsEnum, NormsProducer norms) throws IOException {
BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen, norms);
if (state != null && ((IDVersionPostingsWriter) postingsWriter).lastDocID != -1) {
assert state.docFreq != 0;
assert fieldInfo.getIndexOptions() == IndexOptions.DOCS || state.totalTermFreq >= state.docFreq: "postingsWriter=" + postingsWriter;
pushTerm(text);
PendingTerm term = new PendingTerm(BytesRef.deepCopyOf(text), state);
pending.add(term);
numTerms++;
if (firstPendingTerm == null) {
firstPendingTerm = term;
}
lastPendingTerm = term;
}
}
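    /** Tracks the shared prefix with the previous term and flushes blocks for any prefixes
     *  that are now complete and have accumulated at least minItemsInBlock entries. */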
private void pushTerm(BytesRef text) throws IOException {
int limit = Math.min(lastTerm.length(), text.length);
int pos = 0;
while (pos < limit && lastTerm.byteAt(pos) == text.bytes[text.offset+pos]) {
pos++;
}
for(int i=lastTerm.length()-1;i>=pos;i--) {
int prefixTopSize = pending.size() - prefixStarts[i];
if (prefixTopSize >= minItemsInBlock) {
writeBlocks(i+1, prefixTopSize);
prefixStarts[i] -= prefixTopSize-1;
}
}
if (prefixStarts.length < text.length) {
prefixStarts = ArrayUtil.grow(prefixStarts, text.length);
}
for(int i=pos;i<text.length;i++) {
prefixStarts[i] = pending.size();
}
lastTerm.copyBytes(text);
}
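    /** Finishes the field: writes the remaining pending entries as the root block, saves its
     *  index FST, and records the field's metadata. */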
public void finish() throws IOException {
if (numTerms > 0) {
writeBlocks(0, pending.size());
assert pending.size() == 1 && !pending.get(0).isTerm: "pending.size()=" + pending.size() + " pending=" + pending;
final PendingBlock root = (PendingBlock) pending.get(0);
assert root.prefix.length == 0;
assert root.index.getEmptyOutput() != null;
indexStartFP = indexOut.getFilePointer();
root.index.save(indexOut, indexOut);
assert firstPendingTerm != null;
BytesRef minTerm = new BytesRef(firstPendingTerm.termBytes);
assert lastPendingTerm != null;
BytesRef maxTerm = new BytesRef(lastPendingTerm.termBytes);
fields.add(new FieldMetaData(fieldInfo,
((PendingBlock) pending.get(0)).index.getEmptyOutput(),
numTerms,
indexStartFP,
minTerm, maxTerm));
      } else {
        // No terms were written for this field; there is nothing to record.
      }
}
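    // Scratch buffers holding the current block's suffix bytes and term metadata.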
private final RAMOutputStream suffixWriter = new RAMOutputStream();
private final RAMOutputStream metaWriter = new RAMOutputStream();
}
private boolean closed;
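  /** Writes the per-field directory, trailers, and footers, then closes all outputs. */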
@Override
public void close() throws IOException {
if (closed) {
return;
}
closed = true;
boolean success = false;
try {
final long dirStart = out.getFilePointer();
final long indexDirStart = indexOut.getFilePointer();
out.writeVInt(fields.size());
for(FieldMetaData field : fields) {
out.writeVInt(field.fieldInfo.number);
assert field.numTerms > 0;
out.writeVLong(field.numTerms);
out.writeVInt(field.rootCode.output1.length);
out.writeBytes(field.rootCode.output1.bytes, field.rootCode.output1.offset, field.rootCode.output1.length);
out.writeVLong(field.rootCode.output2);
indexOut.writeVLong(field.indexStartFP);
writeBytesRef(out, field.minTerm);
writeBytesRef(out, field.maxTerm);
}
writeTrailer(out, dirStart);
CodecUtil.writeFooter(out);
writeIndexTrailer(indexOut, indexDirStart);
CodecUtil.writeFooter(indexOut);
success = true;
} finally {
if (success) {
IOUtils.close(out, indexOut, postingsWriter);
} else {
IOUtils.closeWhileHandlingException(out, indexOut, postingsWriter);
}
}
}
private static void writeBytesRef(IndexOutput out, BytesRef bytes) throws IOException {
out.writeVInt(bytes.length);
out.writeBytes(bytes.bytes, bytes.offset, bytes.length);
}
}