/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.charfilter;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;

import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;

// TODO: save/load?

Holds a map of String input to String output, to be used with MappingCharFilter. Use the Builder to create this.
/** * Holds a map of String input to String output, to be used * with {@link MappingCharFilter}. Use the {@link Builder} * to create this. */
public class NormalizeCharMap { final FST<CharsRef> map; final Map<Character,FST.Arc<CharsRef>> cachedRootArcs = new HashMap<>(); // Use the builder to create: private NormalizeCharMap(FST<CharsRef> map) { this.map = map; if (map != null) { try { // Pre-cache root arcs: final FST.Arc<CharsRef> scratchArc = new FST.Arc<>(); final FST.BytesReader fstReader = map.getBytesReader(); map.getFirstArc(scratchArc); if (FST.targetHasArcs(scratchArc)) { map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader); while(true) { assert scratchArc.label != FST.END_LABEL; cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc)); if (scratchArc.isLast()) { break; } map.readNextRealArc(scratchArc, fstReader); } } //System.out.println("cached " + cachedRootArcs.size() + " root arcs"); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } } }
Builds an NormalizeCharMap.

Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap

@lucene.experimental
/** * Builds an NormalizeCharMap. * <p> * Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap * @lucene.experimental */
public static class Builder { private final Map<String,String> pendingPairs = new TreeMap<>();
Records a replacement to be applied to the input stream. Whenever singleMatch occurs in the input, it will be replaced with replacement.
Params:
  • match – input String to be replaced
  • replacement – output String
Throws:
/** Records a replacement to be applied to the input * stream. Whenever <code>singleMatch</code> occurs in * the input, it will be replaced with * <code>replacement</code>. * * @param match input String to be replaced * @param replacement output String * @throws IllegalArgumentException if * <code>match</code> is the empty string, or was * already previously added */
public void add(String match, String replacement) { if (match.length() == 0 ){ throw new IllegalArgumentException("cannot match the empty string"); } if (pendingPairs.containsKey(match)) { throw new IllegalArgumentException("match \"" + match + "\" was already added"); } pendingPairs.put(match, replacement); }
Builds the NormalizeCharMap; call this once you are done calling add.
/** Builds the NormalizeCharMap; call this once you * are done calling {@link #add}. */
public NormalizeCharMap build() { final FST<CharsRef> map; try { final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs); final IntsRefBuilder scratch = new IntsRefBuilder(); for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { builder.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); } map = builder.finish(); pendingPairs.clear(); } catch (IOException ioe) { // Bogus FST IOExceptions!! (will never happen) throw new RuntimeException(ioe); } return new NormalizeCharMap(map); } } }