/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.codec.language;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;

import org.apache.commons.codec.CharEncoding;
import org.apache.commons.codec.EncoderException;
import org.apache.commons.codec.StringEncoder;

Encodes a string into a Daitch-Mokotoff Soundex value.

The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling.

The main differences compared to the other soundex variants are:

  • coded names are 6 digits long
  • the initial character of the name is coded
  • rules to encoded multi-character n-grams
  • multiple possible encodings for the same name (branching)

This implementation supports branching, depending on the used method:

  • encode(String) - branching disabled, only the first code will be returned
  • soundex(String) - branching enabled, all codes will be returned, separated by '|'

Note: this implementation has additional branching rules compared to the original description of the algorithm. The rules can be customized by overriding the default rules contained in the resource file org/apache/commons/codec/language/dmrules.txt.

This class is thread-safe.

See Also:
Version:$Id$
Since:1.10
/** * Encodes a string into a Daitch-Mokotoff Soundex value. * <p> * The Daitch-Mokotoff Soundex algorithm is a refinement of the Russel and American Soundex algorithms, yielding greater * accuracy in matching especially Slavish and Yiddish surnames with similar pronunciation but differences in spelling. * </p> * <p> * The main differences compared to the other soundex variants are: * </p> * <ul> * <li>coded names are 6 digits long * <li>the initial character of the name is coded * <li>rules to encoded multi-character n-grams * <li>multiple possible encodings for the same name (branching) * </ul> * <p> * This implementation supports branching, depending on the used method: * <ul> * <li>{@link #encode(String)} - branching disabled, only the first code will be returned * <li>{@link #soundex(String)} - branching enabled, all codes will be returned, separated by '|' * </ul> * <p> * Note: this implementation has additional branching rules compared to the original description of the algorithm. The * rules can be customized by overriding the default rules contained in the resource file * {@code org/apache/commons/codec/language/dmrules.txt}. * </p> * <p> * This class is thread-safe. * </p> * * @see Soundex * @see <a href="http://en.wikipedia.org/wiki/Daitch%E2%80%93Mokotoff_Soundex"> Wikipedia - Daitch-Mokotoff Soundex</a> * @see <a href="http://www.avotaynu.com/soundex.htm">Avotaynu - Soundexing and Genealogy</a> * * @version $Id$ * @since 1.10 */
public class DaitchMokotoffSoundex implements StringEncoder {
Inner class representing a branch during DM soundex encoding.
/** * Inner class representing a branch during DM soundex encoding. */
private static final class Branch { private final StringBuilder builder; private String cachedString; private String lastReplacement; private Branch() { builder = new StringBuilder(); lastReplacement = null; cachedString = null; }
Creates a new branch, identical to this branch.
Returns:a new, identical branch
/** * Creates a new branch, identical to this branch. * * @return a new, identical branch */
public Branch createBranch() { final Branch branch = new Branch(); branch.builder.append(toString()); branch.lastReplacement = this.lastReplacement; return branch; } @Override public boolean equals(final Object other) { if (this == other) { return true; } if (!(other instanceof Branch)) { return false; } return toString().equals(((Branch) other).toString()); }
Finish this branch by appending '0's until the maximum code length has been reached.
/** * Finish this branch by appending '0's until the maximum code length has been reached. */
public void finish() { while (builder.length() < MAX_LENGTH) { builder.append('0'); cachedString = null; } } @Override public int hashCode() { return toString().hashCode(); }
Process the next replacement to be added to this branch.
Params:
  • replacement – the next replacement to append
  • forceAppend – indicates if the default processing shall be overridden
/** * Process the next replacement to be added to this branch. * * @param replacement * the next replacement to append * @param forceAppend * indicates if the default processing shall be overridden */
public void processNextReplacement(final String replacement, final boolean forceAppend) { final boolean append = lastReplacement == null || !lastReplacement.endsWith(replacement) || forceAppend; if (append && builder.length() < MAX_LENGTH) { builder.append(replacement); // remove all characters after the maximum length if (builder.length() > MAX_LENGTH) { builder.delete(MAX_LENGTH, builder.length()); } cachedString = null; } lastReplacement = replacement; } @Override public String toString() { if (cachedString == null) { cachedString = builder.toString(); } return cachedString; } }
Inner class for storing rules.
/** * Inner class for storing rules. */
private static final class Rule { private final String pattern; private final String[] replacementAtStart; private final String[] replacementBeforeVowel; private final String[] replacementDefault; protected Rule(final String pattern, final String replacementAtStart, final String replacementBeforeVowel, final String replacementDefault) { this.pattern = pattern; this.replacementAtStart = replacementAtStart.split("\\|"); this.replacementBeforeVowel = replacementBeforeVowel.split("\\|"); this.replacementDefault = replacementDefault.split("\\|"); } public int getPatternLength() { return pattern.length(); } public String[] getReplacements(final String context, final boolean atStart) { if (atStart) { return replacementAtStart; } final int nextIndex = getPatternLength(); final boolean nextCharIsVowel = nextIndex < context.length() ? isVowel(context.charAt(nextIndex)) : false; if (nextCharIsVowel) { return replacementBeforeVowel; } return replacementDefault; } private boolean isVowel(final char ch) { return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; } public boolean matches(final String context) { return context.startsWith(pattern); } @Override public String toString() { return String.format("%s=(%s,%s,%s)", pattern, Arrays.asList(replacementAtStart), Arrays.asList(replacementBeforeVowel), Arrays.asList(replacementDefault)); } } private static final String COMMENT = "//"; private static final String DOUBLE_QUOTE = "\""; private static final String MULTILINE_COMMENT_END = "*/"; private static final String MULTILINE_COMMENT_START = "/*";
The resource file containing the replacement and folding rules
/** The resource file containing the replacement and folding rules */
private static final String RESOURCE_FILE = "org/apache/commons/codec/language/dmrules.txt";
The code length of a DM soundex value.
/** The code length of a DM soundex value. */
private static final int MAX_LENGTH = 6;
Transformation rules indexed by the first character of their pattern.
/** Transformation rules indexed by the first character of their pattern. */
private static final Map<Character, List<Rule>> RULES = new HashMap<>();
Folding rules.
/** Folding rules. */
private static final Map<Character, Character> FOLDINGS = new HashMap<>(); static { final InputStream rulesIS = DaitchMokotoffSoundex.class.getClassLoader().getResourceAsStream(RESOURCE_FILE); if (rulesIS == null) { throw new IllegalArgumentException("Unable to load resource: " + RESOURCE_FILE); } try (final Scanner scanner = new Scanner(rulesIS, CharEncoding.UTF_8)) { parseRules(scanner, RESOURCE_FILE, RULES, FOLDINGS); } // sort RULES by pattern length in descending order for (final Map.Entry<Character, List<Rule>> rule : RULES.entrySet()) { final List<Rule> ruleList = rule.getValue(); Collections.sort(ruleList, new Comparator<Rule>() { @Override public int compare(final Rule rule1, final Rule rule2) { return rule2.getPatternLength() - rule1.getPatternLength(); } }); } } private static void parseRules(final Scanner scanner, final String location, final Map<Character, List<Rule>> ruleMapping, final Map<Character, Character> asciiFoldings) { int currentLine = 0; boolean inMultilineComment = false; while (scanner.hasNextLine()) { currentLine++; final String rawLine = scanner.nextLine(); String line = rawLine; if (inMultilineComment) { if (line.endsWith(MULTILINE_COMMENT_END)) { inMultilineComment = false; } continue; } if (line.startsWith(MULTILINE_COMMENT_START)) { inMultilineComment = true; } else { // discard comments final int cmtI = line.indexOf(COMMENT); if (cmtI >= 0) { line = line.substring(0, cmtI); } // trim leading-trailing whitespace line = line.trim(); if (line.length() == 0) { continue; // empty lines can be safely skipped } if (line.contains("=")) { // folding final String[] parts = line.split("="); if (parts.length != 2) { throw new IllegalArgumentException("Malformed folding statement split into " + parts.length + " parts: " + rawLine + " in " + location); } final String leftCharacter = parts[0]; final String rightCharacter = parts[1]; if (leftCharacter.length() != 1 || rightCharacter.length() != 1) { throw new IllegalArgumentException("Malformed folding statement - " + "patterns are not single characters: " + rawLine + " in " + location); } asciiFoldings.put(leftCharacter.charAt(0), rightCharacter.charAt(0)); } else { // rule final String[] parts = line.split("\\s+"); if (parts.length != 4) { throw new IllegalArgumentException("Malformed rule statement split into " + parts.length + " parts: " + rawLine + " in " + location); } try { final String pattern = stripQuotes(parts[0]); final String replacement1 = stripQuotes(parts[1]); final String replacement2 = stripQuotes(parts[2]); final String replacement3 = stripQuotes(parts[3]); final Rule r = new Rule(pattern, replacement1, replacement2, replacement3); final char patternKey = r.pattern.charAt(0); List<Rule> rules = ruleMapping.get(patternKey); if (rules == null) { rules = new ArrayList<>(); ruleMapping.put(patternKey, rules); } rules.add(r); } catch (final IllegalArgumentException e) { throw new IllegalStateException( "Problem parsing line '" + currentLine + "' in " + location, e); } } } } } private static String stripQuotes(String str) { if (str.startsWith(DOUBLE_QUOTE)) { str = str.substring(1); } if (str.endsWith(DOUBLE_QUOTE)) { str = str.substring(0, str.length() - 1); } return str; }
Whether to use ASCII folding prior to encoding.
/** Whether to use ASCII folding prior to encoding. */
private final boolean folding;
Creates a new instance with ASCII-folding enabled.
/** * Creates a new instance with ASCII-folding enabled. */
public DaitchMokotoffSoundex() { this(true); }
Creates a new instance.

With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. è -> e.

Params:
  • folding – if ASCII-folding shall be performed before encoding
/** * Creates a new instance. * <p> * With ASCII-folding enabled, certain accented characters will be transformed to equivalent ASCII characters, e.g. * è -&gt; e. * </p> * * @param folding * if ASCII-folding shall be performed before encoding */
public DaitchMokotoffSoundex(final boolean folding) { this.folding = folding; }
Performs a cleanup of the input string before the actual soundex transformation.

Removes all whitespace characters and performs ASCII folding if enabled.

Params:
  • input – the input string to cleanup
Returns:a cleaned up string
/** * Performs a cleanup of the input string before the actual soundex transformation. * <p> * Removes all whitespace characters and performs ASCII folding if enabled. * </p> * * @param input * the input string to cleanup * @return a cleaned up string */
private String cleanup(final String input) { final StringBuilder sb = new StringBuilder(); for (char ch : input.toCharArray()) { if (Character.isWhitespace(ch)) { continue; } ch = Character.toLowerCase(ch); if (folding && FOLDINGS.containsKey(ch)) { ch = FOLDINGS.get(ch); } sb.append(ch); } return sb.toString(); }
Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching.

This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an EncoderException if the supplied object is not of type java.lang.String.

Params:
  • obj – Object to encode
Throws:
See Also:
  • soundex(String)
Returns:An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String supplied.
/** * Encodes an Object using the Daitch-Mokotoff soundex algorithm without branching. * <p> * This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an * EncoderException if the supplied object is not of type java.lang.String. * </p> * * @see #soundex(String) * * @param obj * Object to encode * @return An object (of type java.lang.String) containing the DM soundex code, which corresponds to the String * supplied. * @throws EncoderException * if the parameter supplied is not of type java.lang.String * @throws IllegalArgumentException * if a character is not mapped */
@Override public Object encode(final Object obj) throws EncoderException { if (!(obj instanceof String)) { throw new EncoderException( "Parameter supplied to DaitchMokotoffSoundex encode is not of type java.lang.String"); } return encode((String) obj); }
Encodes a String using the Daitch-Mokotoff soundex algorithm without branching.
Params:
  • source – A String object to encode
Throws:
See Also:
  • soundex(String)
Returns:A DM Soundex code corresponding to the String supplied
/** * Encodes a String using the Daitch-Mokotoff soundex algorithm without branching. * * @see #soundex(String) * * @param source * A String object to encode * @return A DM Soundex code corresponding to the String supplied * @throws IllegalArgumentException * if a character is not mapped */
@Override public String encode(final String source) { if (source == null) { return null; } return soundex(source, false)[0]; }
Encodes a String using the Daitch-Mokotoff soundex algorithm with branching.

In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, separated by '|'.

Example: the name "AUERBACH" is encoded as both

  • 097400
  • 097500

Thus the result will be "097400|097500".

Params:
  • source – A String object to encode
Throws:
Returns:A string containing a set of DM Soundex codes corresponding to the String supplied
/** * Encodes a String using the Daitch-Mokotoff soundex algorithm with branching. * <p> * In case a string is encoded into multiple codes (see branching rules), the result will contain all codes, * separated by '|'. * </p> * <p> * Example: the name "AUERBACH" is encoded as both * </p> * <ul> * <li>097400</li> * <li>097500</li> * </ul> * <p> * Thus the result will be "097400|097500". * </p> * * @param source * A String object to encode * @return A string containing a set of DM Soundex codes corresponding to the String supplied * @throws IllegalArgumentException * if a character is not mapped */
public String soundex(final String source) { final String[] branches = soundex(source, true); final StringBuilder sb = new StringBuilder(); int index = 0; for (final String branch : branches) { sb.append(branch); if (++index < branches.length) { sb.append('|'); } } return sb.toString(); }
Perform the actual DM Soundex algorithm on the input string.
Params:
  • source – A String object to encode
  • branching – If branching shall be performed
Returns:A string array containing all DM Soundex codes corresponding to the String supplied depending on the selected branching mode
/** * Perform the actual DM Soundex algorithm on the input string. * * @param source * A String object to encode * @param branching * If branching shall be performed * @return A string array containing all DM Soundex codes corresponding to the String supplied depending on the * selected branching mode */
private String[] soundex(final String source, final boolean branching) { if (source == null) { return null; } final String input = cleanup(source); final Set<Branch> currentBranches = new LinkedHashSet<>(); currentBranches.add(new Branch()); char lastChar = '\0'; for (int index = 0; index < input.length(); index++) { final char ch = input.charAt(index); // ignore whitespace inside a name if (Character.isWhitespace(ch)) { continue; } final String inputContext = input.substring(index); final List<Rule> rules = RULES.get(ch); if (rules == null) { continue; } // use an EMPTY_LIST to avoid false positive warnings wrt potential null pointer access @SuppressWarnings("unchecked") final List<Branch> nextBranches = branching ? new ArrayList<Branch>() : Collections.EMPTY_LIST; for (final Rule rule : rules) { if (rule.matches(inputContext)) { if (branching) { nextBranches.clear(); } final String[] replacements = rule.getReplacements(inputContext, lastChar == '\0'); final boolean branchingRequired = replacements.length > 1 && branching; for (final Branch branch : currentBranches) { for (final String nextReplacement : replacements) { // if we have multiple replacements, always create a new branch final Branch nextBranch = branchingRequired ? branch.createBranch() : branch; // special rule: occurrences of mn or nm are treated differently final boolean force = (lastChar == 'm' && ch == 'n') || (lastChar == 'n' && ch == 'm'); nextBranch.processNextReplacement(nextReplacement, force); if (branching) { nextBranches.add(nextBranch); } else { break; } } } if (branching) { currentBranches.clear(); currentBranches.addAll(nextBranches); } index += rule.getPatternLength() - 1; break; } } lastChar = ch; } final String[] result = new String[currentBranches.size()]; int index = 0; for (final Branch branch : currentBranches) { branch.finish(); result[index++] = branch.toString(); } return result; } }