/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.query;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

/**
 * An {@link Analyzer} used primarily at query time to wrap another analyzer and provide a layer
 * of protection which prevents very common words from being passed into queries.
 *
 * <p>For very large indexes the cost of reading TermDocs for a very common word can be high. This
 * analyzer was created after experience with a 38 million doc index which had a term in around 50%
 * of docs and was causing TermQueries for this term to take 2 seconds.
 *
 * @since 3.1
 */
public final class QueryAutoStopWordAnalyzer extends AnalyzerWrapper {

  private final Analyzer delegate;
  private final Map<String, Set<String>> stopWordsPerField = new HashMap<>();

  /**
   * The default maximum percentage (40%) of index documents which can contain a term, after which
   * the term is considered to be a stop word.
   */
  public static final float defaultMaxDocFreqPercent = 0.4f;
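  /*
   * Usage sketch (illustrative only): wrap a base analyzer and use the wrapper when parsing
   * queries, so over-common terms are silently dropped at query time. The "directory" variable,
   * the "body" field, and the choice of StandardAnalyzer/QueryParser (from
   * org.apache.lucene.analysis.standard and org.apache.lucene.queryparser.classic) are
   * assumptions for the example, not requirements of this class.
   *
   *   IndexReader reader = DirectoryReader.open(directory);
   *   Analyzer base = new StandardAnalyzer();
   *   // Scans every indexed field once; terms in more than 40% of docs become stop words.
   *   Analyzer wrapped = new QueryAutoStopWordAnalyzer(base, reader);
   *   Query query = new QueryParser("body", wrapped).parse("the quick brown fox");
   */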
  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all indexed fields from
   * terms with a document frequency percentage greater than {@link #defaultMaxDocFreqPercent}
   *
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(Analyzer delegate, IndexReader indexReader) throws IOException {
    this(delegate, indexReader, defaultMaxDocFreqPercent);
  }
  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all indexed fields from
   * terms with a document frequency greater than the given maxDocFreq
   *
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(Analyzer delegate, IndexReader indexReader, int maxDocFreq)
      throws IOException {
    this(delegate, indexReader, FieldInfos.getIndexedFields(indexReader), maxDocFreq);
  }
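  /*
   * Example (illustrative): an absolute cutoff is useful when the index size is known and
   * stable. Here any term appearing in more than 10,000 documents is treated as a stop word;
   * the threshold value is an assumption for the example.
   *
   *   Analyzer wrapped = new QueryAutoStopWordAnalyzer(base, reader, 10_000);
   */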
  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for all indexed fields from
   * terms with a document frequency percentage greater than the given maxPercentDocs
   *
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *     contain a term, after which the word is considered to be a stop word
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(Analyzer delegate, IndexReader indexReader, float maxPercentDocs)
      throws IOException {
    this(delegate, indexReader, FieldInfos.getIndexedFields(indexReader), maxPercentDocs);
  }
  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the given selection of
   * fields from terms with a document frequency percentage greater than the given maxPercentDocs
   *
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param fields Selection of fields to calculate stopwords for
   * @param maxPercentDocs The maximum percentage (between 0.0 and 1.0) of index documents which
   *     contain a term, after which the word is considered to be a stop word
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Analyzer delegate, IndexReader indexReader, Collection<String> fields, float maxPercentDocs)
      throws IOException {
    this(delegate, indexReader, fields, (int) (indexReader.numDocs() * maxPercentDocs));
  }
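  /*
   * The percentage is converted to an absolute document-frequency cutoff against the current
   * numDocs(). Worked example (numbers assumed): with 38,000,000 documents and
   * maxPercentDocs = 0.4f, the cutoff is (int) (38_000_000 * 0.4f) = 15,200,000, so a term must
   * appear in more than 15,200,000 documents to become a stop word. Note the cutoff is fixed at
   * construction time and does not track later index changes.
   */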
  /**
   * Creates a new QueryAutoStopWordAnalyzer with stopwords calculated for the given selection of
   * fields from terms with a document frequency greater than the given maxDocFreq
   *
   * @param delegate Analyzer whose TokenStream will be filtered
   * @param indexReader IndexReader to identify the stopwords from
   * @param fields Selection of fields to calculate stopwords for
   * @param maxDocFreq Document frequency terms should be above in order to be stopwords
   * @throws IOException Can be thrown while reading from the IndexReader
   */
  public QueryAutoStopWordAnalyzer(
      Analyzer delegate, IndexReader indexReader, Collection<String> fields, int maxDocFreq)
      throws IOException {
    super(delegate.getReuseStrategy());
    this.delegate = delegate;

    for (String field : fields) {
      Set<String> stopWords = new HashSet<>();
      Terms terms = MultiTerms.getTerms(indexReader, field);
      CharsRefBuilder spare = new CharsRefBuilder();
      if (terms != null) {
        // Walk every term in this field and record those whose document frequency
        // exceeds the cutoff.
        TermsEnum te = terms.iterator();
        BytesRef text;
        while ((text = te.next()) != null) {
          if (te.docFreq() > maxDocFreq) {
            spare.copyUTF8Bytes(text);
            stopWords.add(spare.toString());
          }
        }
      }
      stopWordsPerField.put(field, stopWords);
    }
  }

  @Override
  protected Analyzer getWrappedAnalyzer(String fieldName) {
    return delegate;
  }

  @Override
  protected TokenStreamComponents wrapComponents(
      String fieldName, TokenStreamComponents components) {
    Set<String> stopWords = stopWordsPerField.get(fieldName);
    if (stopWords == null) {
      // No stop words were computed for this field, so the delegate's components
      // are used unchanged.
      return components;
    }
    StopFilter stopFilter =
        new StopFilter(components.getTokenStream(), new CharArraySet(stopWords, false));
    return new TokenStreamComponents(components.getSource(), stopFilter);
  }
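  /*
   * Because wrapComponents only filters fields present in stopWordsPerField, stop word
   * protection can be limited to chosen fields via the Collection-based constructors.
   * Illustrative sketch (field names and threshold are assumptions; Arrays is
   * java.util.Arrays):
   *
   *   Analyzer wrapped = new QueryAutoStopWordAnalyzer(
   *       base, reader, Arrays.asList("title", "body"), 0.3f);
   *   // Queries against "title" and "body" drop over-common terms; all other fields
   *   // are analyzed by the delegate unchanged.
   */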
  /**
   * Provides information on which stop words have been identified for a field
   *
   * @param fieldName The field for which the stop words identified at construction time will be
   *     returned
   * @return the stop words identified for a field
   */
  public String[] getStopWords(String fieldName) {
    Set<String> stopWords = stopWordsPerField.get(fieldName);
    return stopWords != null ? stopWords.toArray(new String[stopWords.size()]) : new String[0];
  }
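  /*
   * Example (illustrative; the "body" field is an assumption): the accessors here are handy
   * for logging which terms were suppressed.
   *
   *   QueryAutoStopWordAnalyzer wrapped = new QueryAutoStopWordAnalyzer(base, reader);
   *   for (String word : wrapped.getStopWords("body")) {
   *     System.out.println("auto stop word: " + word);
   *   }
   *   Term[] all = wrapped.getStopWords(); // every (field, term) pair across all fields
   */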
  /**
   * Provides information on which stop words have been identified for all fields
   *
   * @return the stop words (as terms)
   */
  public Term[] getStopWords() {
    List<Term> allStopWords = new ArrayList<>();
    for (String fieldName : stopWordsPerField.keySet()) {
      Set<String> stopWords = stopWordsPerField.get(fieldName);
      for (String text : stopWords) {
        allStopWords.add(new Term(fieldName, text));
      }
    }
    return allStopWords.toArray(new Term[allStopWords.size()]);
  }
}