/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.index;
import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.automaton.CompiledAutomaton;
Access to the terms in a specific field. See Fields
. @lucene.experimental
/**
* Access to the terms in a specific field. See {@link Fields}.
* @lucene.experimental
*/
public abstract class Terms {
Sole constructor. (For invocation by subclass
constructors, typically implicit.) /** Sole constructor. (For invocation by subclass
* constructors, typically implicit.) */
protected Terms() {
}
Returns an iterator that will step through all
terms. This method will not return null. /** Returns an iterator that will step through all
* terms. This method will not return null. */
public abstract TermsEnum iterator() throws IOException;
Returns a TermsEnum that iterates over all terms and documents that are accepted by the provided CompiledAutomaton
. If the startTerm
is provided then the returned enum will only return terms > startTerm
, but you still must call next() first to get to the first term. Note that the provided startTerm
must be accepted by
the automaton.
This is an expert low-level API and will only work for NORMAL
compiled automata. To handle any compiled automata you should instead use CompiledAutomaton.getTermsEnum
instead.
NOTE: the returned TermsEnum cannot seek
.
/** Returns a TermsEnum that iterates over all terms and
* documents that are accepted by the provided {@link
* CompiledAutomaton}. If the <code>startTerm</code> is
* provided then the returned enum will only return terms
* {@code > startTerm}, but you still must call
* next() first to get to the first term. Note that the
* provided <code>startTerm</code> must be accepted by
* the automaton.
*
* <p>This is an expert low-level API and will only work
* for {@code NORMAL} compiled automata. To handle any
* compiled automata you should instead use
* {@link CompiledAutomaton#getTermsEnum} instead.
*
* <p><b>NOTE</b>: the returned TermsEnum cannot seek</p>.
*/
public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) throws IOException {
// TODO: could we factor out a common interface b/w
// CompiledAutomaton and FST? Then we could pass FST there too,
// and likely speed up resolving terms to deleted docs ... but
// AutomatonTermsEnum makes this tricky because of its on-the-fly cycle
// detection
// TODO: eventually we could support seekCeil/Exact on
// the returned enum, instead of only being able to seek
// at the start
TermsEnum termsEnum = iterator();
if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
}
if (startTerm == null) {
return new AutomatonTermsEnum(termsEnum, compiled);
} else {
return new AutomatonTermsEnum(termsEnum, compiled) {
@Override
protected BytesRef nextSeekTerm(BytesRef term) throws IOException {
if (term == null) {
term = startTerm;
}
return super.nextSeekTerm(term);
}
};
}
}
Returns the number of terms for this field, or -1 if this
measure isn't stored by the codec. Note that, just like
other term measures, this measure does not take deleted
documents into account. /** Returns the number of terms for this field, or -1 if this
* measure isn't stored by the codec. Note that, just like
* other term measures, this measure does not take deleted
* documents into account. */
public abstract long size() throws IOException;
Returns the sum of TermsEnum.totalTermFreq
for all terms in this field. Note that, just like other term measures, this measure does not take deleted documents into account. /** Returns the sum of {@link TermsEnum#totalTermFreq} for
* all terms in this field. Note that, just like other term
* measures, this measure does not take deleted documents
* into account. */
public abstract long getSumTotalTermFreq() throws IOException;
Returns the sum of TermsEnum.docFreq()
for all terms in this field. Note that, just like other term measures, this measure does not take deleted documents into account. /** Returns the sum of {@link TermsEnum#docFreq()} for
* all terms in this field. Note that, just like other term
* measures, this measure does not take deleted documents
* into account. */
public abstract long getSumDocFreq() throws IOException;
Returns the number of documents that have at least one
term for this field. Note that, just like other term
measures, this measure does not take deleted documents
into account. /** Returns the number of documents that have at least one
* term for this field. Note that, just like other term
* measures, this measure does not take deleted documents
* into account. */
public abstract int getDocCount() throws IOException;
Returns true if documents in this field store per-document term frequency (PostingsEnum.freq
). /** Returns true if documents in this field store
* per-document term frequency ({@link PostingsEnum#freq}). */
public abstract boolean hasFreqs();
Returns true if documents in this field store offsets. /** Returns true if documents in this field store offsets. */
public abstract boolean hasOffsets();
Returns true if documents in this field store positions. /** Returns true if documents in this field store positions. */
public abstract boolean hasPositions();
Returns true if documents in this field store payloads. /** Returns true if documents in this field store payloads. */
public abstract boolean hasPayloads();
Zero-length array of Terms
. /** Zero-length array of {@link Terms}. */
public final static Terms[] EMPTY_ARRAY = new Terms[0];
Returns the smallest term (in lexicographic order) in the field.
Note that, just like other term measures, this measure does not
take deleted documents into account. This returns
null when there are no terms. /** Returns the smallest term (in lexicographic order) in the field.
* Note that, just like other term measures, this measure does not
* take deleted documents into account. This returns
* null when there are no terms. */
public BytesRef getMin() throws IOException {
return iterator().next();
}
Returns the largest term (in lexicographic order) in the field.
Note that, just like other term measures, this measure does not
take deleted documents into account. This returns
null when there are no terms. /** Returns the largest term (in lexicographic order) in the field.
* Note that, just like other term measures, this measure does not
* take deleted documents into account. This returns
* null when there are no terms. */
@SuppressWarnings("fallthrough")
public BytesRef getMax() throws IOException {
long size = size();
if (size == 0) {
// empty: only possible from a FilteredTermsEnum...
return null;
} else if (size >= 0) {
// try to seek-by-ord
try {
TermsEnum iterator = iterator();
iterator.seekExact(size - 1);
return iterator.term();
} catch (UnsupportedOperationException e) {
// ok
}
}
// otherwise: binary search
TermsEnum iterator = iterator();
BytesRef v = iterator.next();
if (v == null) {
// empty: only possible from a FilteredTermsEnum...
return v;
}
BytesRefBuilder scratch = new BytesRefBuilder();
scratch.append((byte) 0);
// Iterates over digits:
while (true) {
int low = 0;
int high = 256;
// Binary search current digit to find the highest
// digit before END:
while (low != high) {
int mid = (low+high) >>> 1;
scratch.setByteAt(scratch.length()-1, (byte) mid);
if (iterator.seekCeil(scratch.get()) == TermsEnum.SeekStatus.END) {
// Scratch was too high
if (mid == 0) {
scratch.setLength(scratch.length() - 1);
return scratch.get();
}
high = mid;
} else {
// Scratch was too low; there is at least one term
// still after it:
if (low == mid) {
break;
}
low = mid;
}
}
// Recurse to next digit:
scratch.setLength(scratch.length() + 1);
scratch.grow(scratch.length());
}
}
Expert: returns additional information about this Terms instance
for debugging purposes.
/**
* Expert: returns additional information about this Terms instance
* for debugging purposes.
*/
public Object getStats() throws IOException {
StringBuilder sb = new StringBuilder();
sb.append("impl=").append(getClass().getSimpleName());
sb.append(",size=").append(size());
sb.append(",docCount=").append(getDocCount());
sb.append(",sumTotalTermFreq=").append(getSumTotalTermFreq());
sb.append(",sumDocFreq=").append(getSumDocFreq());
return sb.toString();
}
}