/*
 * Copyright (c) 2002-2016, the original author or authors.
 *
 * This software is distributable under the BSD license. See the terms of the
 * BSD license in the documentation provided with this software.
 *
 * http://www.opensource.org/licenses/bsd-license.php
 */
package jdk.internal.org.jline.utils;

import java.util.HashMap;
import java.util.Map;

The Damerau-Levenshtein Algorithm is an extension to the Levenshtein Algorithm which solves the edit distance problem between a source string and a target string with the following operations:
  • Character Insertion
  • Character Deletion
  • Character Replacement
  • Adjacent Character Swap
Note that the adjacent character swap operation is an edit that may be applied when two adjacent characters in the source string match two adjacent characters in the target string, but in reverse order, rather than a general allowance for adjacent character swaps.

This implementation allows the client to specify the costs of the various edit operations with the restriction that the cost of two swap operations must not be less than the cost of a delete operation followed by an insert operation. This restriction is required to preclude two swaps involving the same character being required for optimality which, in turn, enables a fast dynamic programming solution.

The running time of the Damerau-Levenshtein algorithm is O(n*m) where n is the length of the source string and m is the length of the target string. This implementation consumes O(n*m) space.

Author:Kevin L. Stern
/** * The Damerau-Levenshtein Algorithm is an extension to the Levenshtein * Algorithm which solves the edit distance problem between a source string and * a target string with the following operations: * * <ul> * <li>Character Insertion</li> * <li>Character Deletion</li> * <li>Character Replacement</li> * <li>Adjacent Character Swap</li> * </ul> * * Note that the adjacent character swap operation is an edit that may be * applied when two adjacent characters in the source string match two adjacent * characters in the target string, but in reverse order, rather than a general * allowance for adjacent character swaps. * <p> * * This implementation allows the client to specify the costs of the various * edit operations with the restriction that the cost of two swap operations * must not be less than the cost of a delete operation followed by an insert * operation. This restriction is required to preclude two swaps involving the * same character being required for optimality which, in turn, enables a fast * dynamic programming solution. * <p> * * The running time of the Damerau-Levenshtein algorithm is O(n*m) where n is * the length of the source string and m is the length of the target string. * This implementation consumes O(n*m) space. * * @author Kevin L. Stern */
public class Levenshtein { public static int distance(CharSequence lhs, CharSequence rhs) { return distance(lhs, rhs, 1, 1, 1, 1); } public static int distance(CharSequence source, CharSequence target, int deleteCost, int insertCost, int replaceCost, int swapCost) { /* * Required to facilitate the premise to the algorithm that two swaps of the * same character are never required for optimality. */ if (2 * swapCost < insertCost + deleteCost) { throw new IllegalArgumentException("Unsupported cost assignment"); } if (source.length() == 0) { return target.length() * insertCost; } if (target.length() == 0) { return source.length() * deleteCost; } int[][] table = new int[source.length()][target.length()]; Map<Character, Integer> sourceIndexByCharacter = new HashMap<>(); if (source.charAt(0) != target.charAt(0)) { table[0][0] = Math.min(replaceCost, deleteCost + insertCost); } sourceIndexByCharacter.put(source.charAt(0), 0); for (int i = 1; i < source.length(); i++) { int deleteDistance = table[i - 1][0] + deleteCost; int insertDistance = (i + 1) * deleteCost + insertCost; int matchDistance = i * deleteCost + (source.charAt(i) == target.charAt(0) ? 0 : replaceCost); table[i][0] = Math.min(Math.min(deleteDistance, insertDistance), matchDistance); } for (int j = 1; j < target.length(); j++) { int deleteDistance = (j + 1) * insertCost + deleteCost; int insertDistance = table[0][j - 1] + insertCost; int matchDistance = j * insertCost + (source.charAt(0) == target.charAt(j) ? 0 : replaceCost); table[0][j] = Math.min(Math.min(deleteDistance, insertDistance), matchDistance); } for (int i = 1; i < source.length(); i++) { int maxSourceLetterMatchIndex = source.charAt(i) == target.charAt(0) ? 0 : -1; for (int j = 1; j < target.length(); j++) { Integer candidateSwapIndex = sourceIndexByCharacter.get(target.charAt(j)); int jSwap = maxSourceLetterMatchIndex; int deleteDistance = table[i - 1][j] + deleteCost; int insertDistance = table[i][j - 1] + insertCost; int matchDistance = table[i - 1][j - 1]; if (source.charAt(i) != target.charAt(j)) { matchDistance += replaceCost; } else { maxSourceLetterMatchIndex = j; } int swapDistance; if (candidateSwapIndex != null && jSwap != -1) { int iSwap = candidateSwapIndex; int preSwapCost; if (iSwap == 0 && jSwap == 0) { preSwapCost = 0; } else { preSwapCost = table[Math.max(0, iSwap - 1)][Math.max(0, jSwap - 1)]; } swapDistance = preSwapCost + (i - iSwap - 1) * deleteCost + (j - jSwap - 1) * insertCost + swapCost; } else { swapDistance = Integer.MAX_VALUE; } table[i][j] = Math.min(Math.min(Math.min(deleteDistance, insertDistance), matchDistance), swapDistance); } sourceIndexByCharacter.put(source.charAt(i), i); } return table[source.length() - 1][target.length() - 1]; } }