/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.queries.intervals;
import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.CompiledAutomaton;
Constructor functions for IntervalsSource
types These sources implement minimum-interval algorithms taken from the paper
Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics By default, sources that are sensitive to internal gaps (e.g. PHRASE and MAXGAPS) will rewrite their sub-sources so that disjunctions of different lengths are pulled up to the top of the interval tree. For example, PHRASE(or(PHRASE("a", "b", "c"), "b"), "c") will automatically rewrite itself to OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c")) to ensure that documents containing "b c" are matched. This can lead to less efficient queries, as more terms need to be loaded (for example, the "c" iterator above is loaded twice), so if you care more about speed than about accuracy you can use the or(boolean, IntervalsSource...)
factory method to prevent rewriting. /**
* Constructor functions for {@link IntervalsSource} types
*
* These sources implement minimum-interval algorithms taken from the paper
* <a href="http://vigna.di.unimi.it/ftp/papers/EfficientAlgorithmsMinimalIntervalSemantics.pdf">
* Efficient Optimally Lazy Algorithms for Minimal-Interval Semantics</a>
*
* By default, sources that are sensitive to internal gaps (e.g. PHRASE and MAXGAPS) will
* rewrite their sub-sources so that disjunctions of different lengths are pulled up
* to the top of the interval tree. For example, PHRASE(or(PHRASE("a", "b", "c"), "b"), "c")
* will automatically rewrite itself to OR(PHRASE("a", "b", "c", "c"), PHRASE("b", "c"))
* to ensure that documents containing "b c" are matched. This can lead to less efficient
* queries, as more terms need to be loaded (for example, the "c" iterator above is loaded
* twice), so if you care more about speed than about accuracy you can use the
* {@link #or(boolean, IntervalsSource...)} factory method to prevent rewriting.
*/
public final class Intervals {
private Intervals() {}
Return an IntervalsSource
exposing intervals for a term /**
* Return an {@link IntervalsSource} exposing intervals for a term
*/
public static IntervalsSource term(BytesRef term) {
return new TermIntervalsSource(term);
}
Return an IntervalsSource
exposing intervals for a term /**
* Return an {@link IntervalsSource} exposing intervals for a term
*/
public static IntervalsSource term(String term) {
return new TermIntervalsSource(new BytesRef(term));
}
Return an IntervalsSource
exposing intervals for a term, filtered by the value of the term's payload at each position /**
* Return an {@link IntervalsSource} exposing intervals for a term, filtered by the value
* of the term's payload at each position
*/
public static IntervalsSource term(String term, Predicate<BytesRef> payloadFilter) {
return term(new BytesRef(term), payloadFilter);
}
Return an IntervalsSource
exposing intervals for a term, filtered by the value of the term's payload at each position /**
* Return an {@link IntervalsSource} exposing intervals for a term, filtered by the value
* of the term's payload at each position
*/
public static IntervalsSource term(BytesRef term, Predicate<BytesRef> payloadFilter) {
return new PayloadFilteredTermIntervalsSource(term, payloadFilter);
}
Return an IntervalsSource
exposing intervals for a phrase consisting of a list of terms /**
* Return an {@link IntervalsSource} exposing intervals for a phrase consisting of a list of terms
*/
public static IntervalsSource phrase(String... terms) {
if (terms.length == 1) {
return Intervals.term(terms[0]);
}
IntervalsSource[] sources = new IntervalsSource[terms.length];
int i = 0;
for (String term : terms) {
sources[i] = term(term);
i++;
}
return phrase(sources);
}
Return an IntervalsSource
exposing intervals for a phrase consisting of a list of IntervalsSources /**
* Return an {@link IntervalsSource} exposing intervals for a phrase consisting of a list of IntervalsSources
*/
public static IntervalsSource phrase(IntervalsSource... subSources) {
return BlockIntervalsSource.build(Arrays.asList(subSources));
}
Return an IntervalsSource
over the disjunction of a set of sub-sources Automatically rewrites if wrapped by an interval source that is sensitive to internal gaps /**
* Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
*
* Automatically rewrites if wrapped by an interval source that is sensitive to
* internal gaps
*/
public static IntervalsSource or(IntervalsSource... subSources) {
return or(true, Arrays.asList(subSources));
}
Return an IntervalsSource
over the disjunction of a set of sub-sources Params: - rewrite – if
false
, do not rewrite intervals that are sensitive to internal gaps; this may run more efficiently, but can miss valid hits due to minimization - subSources – the sources to combine
/**
* Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
*
* @param rewrite if {@code false}, do not rewrite intervals that are sensitive to
* internal gaps; this may run more efficiently, but can miss valid
* hits due to minimization
* @param subSources the sources to combine
*/
public static IntervalsSource or(boolean rewrite, IntervalsSource... subSources) {
return or(rewrite, Arrays.asList(subSources));
}
Return an IntervalsSource
over the disjunction of a set of sub-sources /**
* Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
*/
public static IntervalsSource or(List<IntervalsSource> subSources) {
return or(true, subSources);
}
Return an IntervalsSource
over the disjunction of a set of sub-sources Params: - rewrite – if
false
, do not rewrite intervals that are sensitive to internal gaps; this may run more efficiently, but can miss valid hits due to minimization - subSources – the sources to combine
/**
* Return an {@link IntervalsSource} over the disjunction of a set of sub-sources
*
* @param rewrite if {@code false}, do not rewrite intervals that are sensitive to
* internal gaps; this may run more efficiently, but can miss valid
* hits due to minimization
* @param subSources the sources to combine
*/
public static IntervalsSource or(boolean rewrite, List<IntervalsSource> subSources) {
return DisjunctionIntervalsSource.create(subSources, rewrite);
}
Return an IntervalsSource
over the disjunction of all terms that begin with a prefix Throws: - IllegalStateException – if the prefix expands to more than 128 terms
/**
* Return an {@link IntervalsSource} over the disjunction of all terms that begin with a prefix
*
* @throws IllegalStateException if the prefix expands to more than 128 terms
*/
public static IntervalsSource prefix(BytesRef prefix) {
return prefix(prefix, 128);
}
Expert: Return an IntervalsSource
over the disjunction of all terms that begin with a prefix WARNING: Setting maxExpansions
to higher than the default value of 128 can be both slow and memory-intensive Params: - prefix – the prefix to expand
- maxExpansions – the maximum number of terms to expand to
Throws: - IllegalStateException – if the prefix expands to more than
maxExpansions
terms
/**
* Expert: Return an {@link IntervalsSource} over the disjunction of all terms that begin with a prefix
*
* WARNING: Setting {@code maxExpansions} to higher than the default value of 128
* can be both slow and memory-intensive
*
* @param prefix the prefix to expand
* @param maxExpansions the maximum number of terms to expand to
*
* @throws IllegalStateException if the prefix expands to more than {@code maxExpansions} terms
*/
public static IntervalsSource prefix(BytesRef prefix, int maxExpansions) {
CompiledAutomaton ca = new CompiledAutomaton(PrefixQuery.toAutomaton(prefix));
return new MultiTermIntervalsSource(ca, maxExpansions, prefix.utf8ToString() + "*");
}
Return an IntervalsSource
over the disjunction of all terms that match a wildcard glob Throws: - IllegalStateException – if the wildcard glob expands to more than 128 terms
See Also:
/**
* Return an {@link IntervalsSource} over the disjunction of all terms that match a wildcard glob
*
* @throws IllegalStateException if the wildcard glob expands to more than 128 terms
*
* @see WildcardQuery for glob format
*/
public static IntervalsSource wildcard(BytesRef wildcard) {
return wildcard(wildcard, 128);
}
Expert: Return an IntervalsSource
over the disjunction of all terms that match a wildcard glob WARNING: Setting maxExpansions
to higher than the default value of 128 can be both slow and memory-intensive Params: - wildcard – the glob to expand
- maxExpansions – the maximum number of terms to expand to
Throws: - IllegalStateException – if the wildcard glob expands to more than
maxExpansions
terms
See Also:
/**
* Expert: Return an {@link IntervalsSource} over the disjunction of all terms that match a wildcard glob
*
* WARNING: Setting {@code maxExpansions} to higher than the default value of 128
* can be both slow and memory-intensive
*
* @param wildcard the glob to expand
* @param maxExpansions the maximum number of terms to expand to
*
* @throws IllegalStateException if the wildcard glob expands to more than {@code maxExpansions} terms
*
* @see WildcardQuery for glob format
*/
public static IntervalsSource wildcard(BytesRef wildcard, int maxExpansions) {
CompiledAutomaton ca = new CompiledAutomaton(WildcardQuery.toAutomaton(new Term("", wildcard)));
return new MultiTermIntervalsSource(ca, maxExpansions, wildcard.utf8ToString());
}
Expert: Return an IntervalsSource
over the disjunction of all terms that's accepted by the given automaton Params: - automaton – accepts terms for to expand to
- pattern – string representation of the given automaton, mostly used in exception messages
Throws: - IllegalStateException – if the automaton accepts more than 128 terms
Deprecated: use multiterm(CompiledAutomaton, String)
/**
* Expert: Return an {@link IntervalsSource} over the disjunction of all terms that's accepted by the given automaton
*
* @param automaton accepts terms for to expand to
* @param pattern string representation of the given automaton, mostly used in exception messages
*
* @throws IllegalStateException if the automaton accepts more than 128 terms
*
* @deprecated use {@link #multiterm(CompiledAutomaton, String)}
*/
@Deprecated
public static IntervalsSource multiterm(Automaton automaton, String pattern) {
return multiterm(automaton, 128, pattern);
}
Expert: Return an IntervalsSource
over the disjunction of all terms that's accepted by the given automaton WARNING: Setting maxExpansions
to higher than the default value of 128 can be both slow and memory-intensive Params: - automaton – accepts terms for to expand to
- maxExpansions – the maximum number of terms to expand to
- pattern – string representation of the given automaton, mostly used in exception messages
Throws: - IllegalStateException – if the automaton accepts more than
maxExpansions
terms
Deprecated: use multiterm(CompiledAutomaton, int, String)
/**
* Expert: Return an {@link IntervalsSource} over the disjunction of all terms that's accepted by the given automaton
*
* WARNING: Setting {@code maxExpansions} to higher than the default value of 128
* can be both slow and memory-intensive
*
* @param automaton accepts terms for to expand to
* @param maxExpansions the maximum number of terms to expand to
* @param pattern string representation of the given automaton, mostly used in exception messages
*
* @throws IllegalStateException if the automaton accepts more than {@code maxExpansions} terms
*
* @deprecated use {@link #multiterm(CompiledAutomaton, int, String)}
*/
@Deprecated
public static IntervalsSource multiterm(Automaton automaton, int maxExpansions, String pattern) {
CompiledAutomaton ca = new CompiledAutomaton(automaton);
return new MultiTermIntervalsSource(ca, maxExpansions, pattern);
}
Expert: Return an IntervalsSource
over the disjunction of all terms that's accepted by the given automaton Params: - ca – an automaton accepting matching terms
- pattern – string representation of the given automaton, mostly used in exception messages
Throws: - IllegalStateException – if the automaton accepts more than 128 terms
/**
* Expert: Return an {@link IntervalsSource} over the disjunction of all terms that's accepted by the given automaton
*
* @param ca an automaton accepting matching terms
* @param pattern string representation of the given automaton, mostly used in exception messages
*
* @throws IllegalStateException if the automaton accepts more than 128 terms
*/
public static IntervalsSource multiterm(CompiledAutomaton ca, String pattern) {
return multiterm(ca, 128, pattern);
}
Expert: Return an IntervalsSource
over the disjunction of all terms that's accepted by the given automaton WARNING: Setting maxExpansions
to higher than the default value of 128 can be both slow and memory-intensive Params: - ca – an automaton accepting matching terms
- maxExpansions – the maximum number of terms to expand to
- pattern – string representation of the given automaton, mostly used in exception messages
Throws: - IllegalStateException – if the automaton accepts more than
maxExpansions
terms
/**
* Expert: Return an {@link IntervalsSource} over the disjunction of all terms that's accepted by the given automaton
*
* WARNING: Setting {@code maxExpansions} to higher than the default value of 128
* can be both slow and memory-intensive
*
* @param ca an automaton accepting matching terms
* @param maxExpansions the maximum number of terms to expand to
* @param pattern string representation of the given automaton, mostly used in exception messages
*
* @throws IllegalStateException if the automaton accepts more than {@code maxExpansions} terms
*/
public static IntervalsSource multiterm(CompiledAutomaton ca, int maxExpansions, String pattern) {
return new MultiTermIntervalsSource(ca, maxExpansions, pattern);
}
Create an IntervalsSource
that filters a sub-source by the width of its intervals Params: - width – the maximum width of intervals in the sub-source to filter
- subSource – the sub-source to filter
/**
* Create an {@link IntervalsSource} that filters a sub-source by the width of its intervals
* @param width the maximum width of intervals in the sub-source to filter
* @param subSource the sub-source to filter
*/
public static IntervalsSource maxwidth(int width, IntervalsSource subSource) {
return FilteredIntervalsSource.maxWidth(subSource, width);
}
Create an IntervalsSource
that filters a sub-source by its gaps Params: - gaps – the maximum number of gaps in the sub-source to filter
- subSource – the sub-source to filter
/**
* Create an {@link IntervalsSource} that filters a sub-source by its gaps
* @param gaps the maximum number of gaps in the sub-source to filter
* @param subSource the sub-source to filter
*/
public static IntervalsSource maxgaps(int gaps, IntervalsSource subSource) {
return FilteredIntervalsSource.maxGaps(subSource, gaps);
}
Create an IntervalsSource
that wraps another source, extending its intervals by a number of positions before and after. This can be useful for adding defined gaps in a block query; for example, to find 'a b [2 arbitrary terms] c', you can call: Intervals.phrase(Intervals.term("a"), Intervals.extend(Intervals.term("b"), 0, 2), Intervals.term("c"));
Note that calling IntervalIterator.gaps()
on iterators returned by this source delegates directly to the wrapped iterator, and does not include the extensions. Params: - source – the source to extend
- before – how many positions to extend before the delegated interval
- after – how many positions to extend after the delegated interval
/**
* Create an {@link IntervalsSource} that wraps another source, extending its
* intervals by a number of positions before and after.
*
* This can be useful for adding defined gaps in a block query; for example,
* to find 'a b [2 arbitrary terms] c', you can call:
* <pre>
* Intervals.phrase(Intervals.term("a"), Intervals.extend(Intervals.term("b"), 0, 2), Intervals.term("c"));
* </pre>
*
* Note that calling {@link IntervalIterator#gaps()} on iterators returned by this source
* delegates directly to the wrapped iterator, and does not include the extensions.
*
* @param source the source to extend
* @param before how many positions to extend before the delegated interval
* @param after how many positions to extend after the delegated interval
*/
public static IntervalsSource extend(IntervalsSource source, int before, int after) {
return new ExtendedIntervalsSource(source, before, after);
}
Create an ordered IntervalsSource
Returns intervals in which the subsources all appear in the given order Params: - subSources – an ordered set of
IntervalsSource
objects
/**
* Create an ordered {@link IntervalsSource}
*
* Returns intervals in which the subsources all appear in the given order
*
* @param subSources an ordered set of {@link IntervalsSource} objects
*/
public static IntervalsSource ordered(IntervalsSource... subSources) {
return OrderedIntervalsSource.build(Arrays.asList(subSources));
}
Create an unordered IntervalsSource
Returns intervals in which all the subsources appear. The subsources may overlap Params: - subSources – an unordered set of
IntervalsSource
s
/**
* Create an unordered {@link IntervalsSource}
*
* Returns intervals in which all the subsources appear. The subsources may overlap
*
* @param subSources an unordered set of {@link IntervalsSource}s
*/
public static IntervalsSource unordered(IntervalsSource... subSources) {
return UnorderedIntervalsSource.build(Arrays.asList(subSources));
}
Create an unordered IntervalsSource
allowing no overlaps between subsources Returns intervals in which both the subsources appear and do not overlap. /**
* Create an unordered {@link IntervalsSource} allowing no overlaps between subsources
*
* Returns intervals in which both the subsources appear and do not overlap.
*/
public static IntervalsSource unorderedNoOverlaps(IntervalsSource a, IntervalsSource b) {
return Intervals.or(Intervals.ordered(a, b), Intervals.ordered(b, a));
}
Create an IntervalsSource
that always returns intervals from a specific field This is useful for comparing intervals across multiple fields, for example fields that have been analyzed differently, allowing you to search for stemmed terms near unstemmed terms, etc. /**
* Create an {@link IntervalsSource} that always returns intervals from a specific field
*
* This is useful for comparing intervals across multiple fields, for example fields that
* have been analyzed differently, allowing you to search for stemmed terms near unstemmed
* terms, etc.
*/
public static IntervalsSource fixField(String field, IntervalsSource source) {
return new FixedFieldIntervalsSource(field, source);
}
Create a non-overlapping IntervalsSource
Returns intervals of the minuend that do not overlap with intervals from the subtrahend
Params: - minuend – the
IntervalsSource
to filter - subtrahend – the
IntervalsSource
to filter by
/**
* Create a non-overlapping IntervalsSource
*
* Returns intervals of the minuend that do not overlap with intervals from the subtrahend
* @param minuend the {@link IntervalsSource} to filter
* @param subtrahend the {@link IntervalsSource} to filter by
*/
public static IntervalsSource nonOverlapping(IntervalsSource minuend, IntervalsSource subtrahend) {
return new NonOverlappingIntervalsSource(minuend, subtrahend);
}
Returns intervals from a source that overlap with intervals from another source
Params: - source – the source to filter
- reference – the source to filter by
/**
* Returns intervals from a source that overlap with intervals from another source
* @param source the source to filter
* @param reference the source to filter by
*/
public static IntervalsSource overlapping(IntervalsSource source, IntervalsSource reference) {
return new OverlappingIntervalsSource(source, reference);
}
Create a not-within IntervalsSource
Returns intervals of the minuend that do not appear within a set number of positions of intervals from the subtrahend query Params: - minuend – the
IntervalsSource
to filter - positions – the minimum distance that intervals from the minuend may occur from intervals
of the subtrahend
- subtrahend – the
IntervalsSource
to filter by
/**
* Create a not-within {@link IntervalsSource}
*
* Returns intervals of the minuend that do not appear within a set number of positions of
* intervals from the subtrahend query
*
* @param minuend the {@link IntervalsSource} to filter
* @param positions the minimum distance that intervals from the minuend may occur from intervals
* of the subtrahend
* @param subtrahend the {@link IntervalsSource} to filter by
*/
public static IntervalsSource notWithin(IntervalsSource minuend, int positions, IntervalsSource subtrahend) {
return new NonOverlappingIntervalsSource(minuend, Intervals.extend(subtrahend, positions, positions));
}
Returns intervals of the source that appear within a set number of positions of intervals from
the reference
Params: - source – the
IntervalsSource
to filter - positions – the maximum distance that intervals of the source may occur from intervals of the reference
- reference – the
IntervalsSource
to filter by
/**
* Returns intervals of the source that appear within a set number of positions of intervals from
* the reference
*
* @param source the {@link IntervalsSource} to filter
* @param positions the maximum distance that intervals of the source may occur from intervals of the reference
* @param reference the {@link IntervalsSource} to filter by
*/
public static IntervalsSource within(IntervalsSource source, int positions, IntervalsSource reference) {
return containedBy(source, Intervals.extend(reference, positions, positions));
}
Create a not-containing IntervalsSource
Returns intervals from the minuend that do not contain intervals of the subtrahend Params: - minuend – the
IntervalsSource
to filter - subtrahend – the
IntervalsSource
to filter by
/**
* Create a not-containing {@link IntervalsSource}
*
* Returns intervals from the minuend that do not contain intervals of the subtrahend
*
* @param minuend the {@link IntervalsSource} to filter
* @param subtrahend the {@link IntervalsSource} to filter by
*/
public static IntervalsSource notContaining(IntervalsSource minuend, IntervalsSource subtrahend) {
return NotContainingIntervalsSource.build(minuend, subtrahend);
}
Create a containing IntervalsSource
Returns intervals from the big source that contain one or more intervals from the small source Params: - big – the
IntervalsSource
to filter - small – the
IntervalsSource
to filter by
/**
* Create a containing {@link IntervalsSource}
*
* Returns intervals from the big source that contain one or more intervals from
* the small source
*
* @param big the {@link IntervalsSource} to filter
* @param small the {@link IntervalsSource} to filter by
*/
public static IntervalsSource containing(IntervalsSource big, IntervalsSource small) {
return ContainingIntervalsSource.build(big, small);
}
Create a not-contained-by IntervalsSource
Returns intervals from the small IntervalsSource
that do not appear within intervals from the big IntervalsSource
. Params: - small – the
IntervalsSource
to filter - big – the
IntervalsSource
to filter by
/**
* Create a not-contained-by {@link IntervalsSource}
*
* Returns intervals from the small {@link IntervalsSource} that do not appear within
* intervals from the big {@link IntervalsSource}.
*
* @param small the {@link IntervalsSource} to filter
* @param big the {@link IntervalsSource} to filter by
*/
public static IntervalsSource notContainedBy(IntervalsSource small, IntervalsSource big) {
return NotContainedByIntervalsSource.build(small, big);
}
Create a contained-by IntervalsSource
Returns intervals from the small query that appear within intervals of the big query Params: - small – the
IntervalsSource
to filter - big – the
IntervalsSource
to filter by
/**
* Create a contained-by {@link IntervalsSource}
*
* Returns intervals from the small query that appear within intervals of the big query
*
* @param small the {@link IntervalsSource} to filter
* @param big the {@link IntervalsSource} to filter by
*/
public static IntervalsSource containedBy(IntervalsSource small, IntervalsSource big) {
return ContainedByIntervalsSource.build(small, big);
}
Return intervals that span combinations of intervals from minShouldMatch
of the sources /**
* Return intervals that span combinations of intervals from {@code minShouldMatch} of the sources
*/
public static IntervalsSource atLeast(int minShouldMatch, IntervalsSource... sources) {
return new MinimumShouldMatchIntervalsSource(sources, minShouldMatch);
}
Returns intervals from the source that appear before intervals from the reference
/**
* Returns intervals from the source that appear before intervals from the reference
*/
public static IntervalsSource before(IntervalsSource source, IntervalsSource reference) {
return ContainedByIntervalsSource.build(source,
Intervals.extend(new OffsetIntervalsSource(reference, true), Integer.MAX_VALUE, 0));
}
Returns intervals from the source that appear after intervals from the reference
/**
* Returns intervals from the source that appear after intervals from the reference
*/
public static IntervalsSource after(IntervalsSource source, IntervalsSource reference) {
return ContainedByIntervalsSource.build(source,
Intervals.extend(new OffsetIntervalsSource(reference, false), 0, Integer.MAX_VALUE));
}
}