/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.lucene.analysis.miscellaneous;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.function.Function;
import java.util.function.Predicate; // javadocs

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.ResourceLoader;
import org.apache.lucene.analysis.util.ResourceLoaderAware;
import org.apache.lucene.analysis.util.TokenFilterFactory;

/**
 * Factory for a {@link ProtectedTermFilter}
 *
 * <p>CustomAnalyzer example:
 * <pre class="prettyprint">
 * Analyzer ana = CustomAnalyzer.builder()
 *   .withTokenizer("standard")
 *   .when("protectedterm", "ignoreCase", "true", "protected", "protectedTerms.txt")
 *     .addTokenFilter("truncate", "prefixLength", "4")
 *     .addTokenFilter("lowercase")
 *   .endwhen()
 *   .build();
 * </pre>
 *
 * <p>Solr example, in which conditional filters are specified via the <code>wrappedFilters</code>
 * parameter - a comma-separated list of case-insensitive TokenFilter SPI names - and conditional
 * filter args are specified via <code>filterName.argName</code> parameters:
 *
 * <pre class="prettyprint">
 * &lt;fieldType name="reverse_lower_with_exceptions" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *     &lt;filter class="solr.ProtectedTermFilterFactory" ignoreCase="true" protected="protectedTerms.txt"
 *             wrappedFilters="truncate,lowercase" truncate.prefixLength="4" /&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 *
 * <p>When using the <code>wrappedFilters</code> parameter, each filter name must be unique, so if you
 * need to specify the same filter more than once, you must add case-insensitive unique '-id' suffixes
 * (note that the '-id' suffix is stripped prior to SPI lookup), e.g.:
 *
 * <pre class="prettyprint">
 * &lt;fieldType name="double_synonym_with_exceptions" class="solr.TextField" positionIncrementGap="100"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 *     &lt;filter class="solr.ProtectedTermFilterFactory" ignoreCase="true" protected="protectedTerms.txt"
 *             wrappedFilters="synonymgraph-A,synonymgraph-B"
 *             synonymgraph-A.synonyms="synonyms-1.txt"
 *             synonymgraph-B.synonyms="synonyms-2.txt"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 *
 * <p>See related {@link org.apache.lucene.analysis.custom.CustomAnalyzer.Builder#whenTerm(Predicate)}
 *
 * @since 7.4.0
 * @lucene.spi {@value #NAME}
 */
public class ProtectedTermFilterFactory extends ConditionalTokenFilterFactory implements ResourceLoaderAware {

  public static final String NAME = "protectedTerm";

  public static final String PROTECTED_TERMS = "protected";
  public static final char FILTER_ARG_SEPARATOR = '.';
  public static final char FILTER_NAME_ID_SEPARATOR = '-';

  private final String termFiles;
  private final boolean ignoreCase;
  private final String wrappedFilters;

  private CharArraySet protectedTerms;

  public ProtectedTermFilterFactory(Map<String, String> args) {
    super(args);
    termFiles = require(args, PROTECTED_TERMS);
    ignoreCase = getBoolean(args, "ignoreCase", false);
    wrappedFilters = get(args, "wrappedFilters");
    if (wrappedFilters != null) {
      handleWrappedFilterArgs(args);
    }
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  private void handleWrappedFilterArgs(Map<String, String> args) {
    LinkedHashMap<String, Map<String, String>> wrappedFilterArgs = new LinkedHashMap<>();
    splitAt(',', wrappedFilters).forEach(filterName -> {         // Format: SPIname[-id]
      filterName = filterName.trim().toLowerCase(Locale.ROOT);   // Treat case-insensitively
      if (wrappedFilterArgs.containsKey(filterName)) {
        throw new IllegalArgumentException("wrappedFilters contains duplicate '" + filterName
            + "'. Add unique '-id' suffixes (stripped prior to SPI lookup).");
      }
      wrappedFilterArgs.put(filterName, new HashMap<>());
    });
    for (Iterator<Map.Entry<String, String>> iterator = args.entrySet().iterator(); iterator.hasNext(); ) {
      Map.Entry<String, String> entry = iterator.next();
      String filterArgKey = entry.getKey();
      String argValue = entry.getValue();
      List<String> splitKey = splitAt(FILTER_ARG_SEPARATOR, filterArgKey); // Format: filterName.argKey
      if (splitKey.size() == 2) {                                // Skip if no '.' separator
        String filterName = splitKey.get(0).toLowerCase(Locale.ROOT);
        if (wrappedFilterArgs.containsKey(filterName)) {         // Skip if not in the "wrappedFilters" arg
          Map<String, String> filterArgs = wrappedFilterArgs.computeIfAbsent(filterName, k -> new HashMap<>());
          String argKey = splitKey.get(1);
          filterArgs.put(argKey, argValue);                      // argKey is guaranteed unique, don't need to check for duplicates
          iterator.remove();
        }
      }
    }
    if (args.isEmpty()) {
      populateInnerFilters(wrappedFilterArgs);
    }
  }

  private void populateInnerFilters(LinkedHashMap<String, Map<String, String>> wrappedFilterArgs) {
    List<TokenFilterFactory> innerFilters = new ArrayList<>();
    wrappedFilterArgs.forEach((filterName, filterArgs) -> {
      int idSuffixPos = filterName.indexOf(FILTER_NAME_ID_SEPARATOR); // Format: SPIname[-id]
      if (idSuffixPos != -1) {       // Strip '-id' suffix, if any, prior to SPI lookup
        filterName = filterName.substring(0, idSuffixPos);
      }
      innerFilters.add(TokenFilterFactory.forName(filterName, filterArgs));
    });
    setInnerFilters(innerFilters);
  }

  public boolean isIgnoreCase() {
    return ignoreCase;
  }

  public CharArraySet getProtectedTerms() {
    return protectedTerms;
  }

  @Override
  protected ConditionalTokenFilter create(TokenStream input, Function<TokenStream, TokenStream> inner) {
    return new ProtectedTermFilter(protectedTerms, input, inner);
  }

  @Override
  public void doInform(ResourceLoader loader) throws IOException {
    protectedTerms = getWordSet(loader, termFiles, ignoreCase);
  }
}
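
/*
 * Usage sketch (illustrative only, not part of the original source): constructing the
 * factory programmatically with the same arguments the Solr example in the class javadoc
 * passes via XML. It assumes "protectedTerms.txt" is resolvable by the ResourceLoader
 * passed to inform(); ClasspathResourceLoader is one such loader in this module.
 *
 *   Map<String, String> args = new HashMap<>();       // must be mutable: the factory consumes entries
 *   args.put("protected", "protectedTerms.txt");      // required terms file(s)
 *   args.put("ignoreCase", "true");
 *   args.put("wrappedFilters", "truncate,lowercase"); // SPI names of the conditional filters
 *   args.put("truncate.prefixLength", "4");           // arg routed to the wrapped "truncate" filter
 *
 *   ProtectedTermFilterFactory factory = new ProtectedTermFilterFactory(args);
 *   factory.inform(new ClasspathResourceLoader(ProtectedTermFilterFactory.class));
 *   TokenStream filtered = factory.create(someTokenStream);
 */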