Skip to content

Commit a338a9f

Browse files
committed
Added Damerau-Levenshtein distance
1 parent 2144570 commit a338a9f

File tree

2 files changed

+172
-1
lines changed

2 files changed

+172
-1
lines changed

README.md

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ A library implementing different string similarity algorithms.
44

55
Currently implemented:
66
- Levenshtein edit distance;
7+
- Damerau-Levenshtein distance;
78
- Jaro-Winkler similarity;
89
- Longest Common Subsequence edit distance;
910
- Q-Gram (Ukkonen);
@@ -42,6 +43,43 @@ public class MyApp {
4243
}
4344
```
4445

46+
## Damerau-Levenshtein
47+
Similar to Levenshtein, Damerau-Levenshtein distance is the minimum number of operations needed to transform one string into the other, where an operation is defined as an insertion, deletion, or substitution of a single character, or a **transposition of two adjacent characters**.
48+
49+
This is not to be confused with the optimal string alignment distance, which is a restricted variant in which no substring can be edited more than once.
50+
51+
```java
52+
import info.debatty.java.stringsimilarity.*;
53+
54+
public class MyApp {
55+
56+
57+
public static void main(String[] args) {
58+
Damerau d = new Damerau();
59+
60+
// One transposition of two adjacent characters (C and D)
61+
System.out.println(d.absoluteDistance("ABCDEF", "ABDCEF"));
62+
63+
// One character (A) moved far from its original position: not an adjacent transposition
64+
// => 1 deletion + 1 insertion
65+
System.out.println(d.absoluteDistance("ABCDEF", "BCDAEF"));
66+
67+
// distance and similarity always produce a result between 0 and 1
68+
System.out.println(d.distance("ABCDEF", "GHABCDE"));
69+
}
70+
}
71+
```
72+
73+
Will produce:
74+
75+
```
76+
1
77+
2
78+
0.23076923076923078
79+
```
80+
81+
82+
4583
## Jaro-Winkler
4684
Jaro-Winkler is a string edit distance that was developed in the area of record linkage (duplicate detection) (Winkler, 1990). The Jaro–Winkler distance metric is designed and best suited for short strings such as person names, and to detect typos.
4785

@@ -62,7 +100,6 @@ public class MyApp {
62100
}
63101
```
64102

65-
66103
## Longest Common Subsequence
67104

68105
The longest common subsequence (LCS) problem consists in finding the longest subsequence common to two (or more) sequences. It differs from problems of finding common substrings: unlike substrings, subsequences are not required to occupy consecutive positions within the original sequences.
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/*
2+
* The MIT License
3+
*
4+
* Copyright 2015 Thibault Debatty.
5+
*
6+
* Permission is hereby granted, free of charge, to any person obtaining a copy
7+
* of this software and associated documentation files (the "Software"), to deal
8+
* in the Software without restriction, including without limitation the rights
9+
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10+
* copies of the Software, and to permit persons to whom the Software is
11+
* furnished to do so, subject to the following conditions:
12+
*
13+
* The above copyright notice and this permission notice shall be included in
14+
* all copies or substantial portions of the Software.
15+
*
16+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22+
* THE SOFTWARE.
23+
*/
24+
package info.debatty.java.stringsimilarity;
25+
26+
import java.util.HashMap;
27+
28+
/**
29+
* Implementation of Damerau-Levenshtein distance, computed as the
30+
* minimum number of operations needed to transform one string into the other,
31+
* where an operation is defined as an insertion, deletion, or substitution of a
32+
* single character, or a transposition of two adjacent characters.
33+
*
34+
* This is not to be confused with the optimal string alignment distance, which
35+
* is an extension where no substring can be edited more than once.
36+
*
37+
* @author Thibault Debatty
38+
*/
39+
public class Damerau implements StringSimilarityInterface {
40+
41+
/**
42+
* @param args the command line arguments
43+
*/
44+
public static void main(String[] args) {
45+
46+
Damerau d = new Damerau();
47+
System.out.println(d.absoluteDistance("ABCDEF", "ABDCEF"));
48+
System.out.println(d.absoluteDistance("ABCDEF", "BACDFE"));
49+
System.out.println(d.absoluteDistance("ABCDEF", "ABCDE"));
50+
System.out.println(d.absoluteDistance("ABCDEF", "BCDEF"));
51+
System.out.println(d.absoluteDistance("ABCDEF", "ABCGDEF"));
52+
System.out.println(d.absoluteDistance("ABCDEF", "BCDAEF"));
53+
54+
System.out.println(d.distance("ABCDEF", "GHABCDE"));
55+
}
56+
57+
public int absoluteDistance(String s1, String s2) {
58+
59+
// INFinite distance is the max possible distance
60+
int INF = s1.length() + s2.length();
61+
62+
// Create and initialize the character array indices
63+
HashMap<Character, Integer> DA = new HashMap<Character, Integer>();
64+
65+
for (int d = 0; d < s1.length(); d++) {
66+
if (!DA.containsKey(s1.charAt(d))) {
67+
DA.put(s1.charAt(d), 0);
68+
}
69+
}
70+
71+
for (int d = 0; d < s2.length(); d++) {
72+
if (!DA.containsKey(s2.charAt(d))) {
73+
DA.put(s2.charAt(d), 0);
74+
}
75+
}
76+
77+
// Create the distance matrix H[0 .. s1.length+1][0 .. s2.length+1]
78+
int[][] H = new int[s1.length() + 2][s2.length() + 2];
79+
80+
// initialize the left and top edges of H
81+
for (int i = 0; i <= s1.length(); i++) {
82+
H[i + 1][0] = INF;
83+
H[i + 1][1] = i;
84+
}
85+
86+
for (int j = 0; j <= s2.length(); j++) {
87+
H[0][j + 1] = INF;
88+
H[1][j + 1] = j;
89+
90+
}
91+
92+
93+
// fill in the distance matrix H
94+
// look at each character in s1
95+
for (int i = 1; i <= s1.length(); i++) {
96+
int DB = 0;
97+
98+
// look at each character in b
99+
for (int j = 1; j <= s2.length(); j++) {
100+
int i1 = DA.get(s2.charAt(j - 1));
101+
int j1 = DB;
102+
103+
int cost = 1;
104+
if (s1.charAt(i - 1) == s2.charAt(j - 1)) {
105+
cost = 0;
106+
DB = j;
107+
}
108+
109+
H[i + 1][j + 1] = min(
110+
H[i][j] + cost, // substitution
111+
H[i + 1][j] + 1, // insertion
112+
H[i][j + 1] + 1, // deletion
113+
H[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1));
114+
}
115+
116+
DA.put(s1.charAt(i - 1), i);
117+
}
118+
119+
return H[s1.length() + 1][s2.length() + 1];
120+
}
121+
122+
public double similarity(String s1, String s2) {
123+
return 1.0 - distance(s1, s2);
124+
}
125+
126+
public double distance(String s1, String s2) {
127+
return (double) absoluteDistance(s1, s2) / (s1.length() + s2.length());
128+
}
129+
130+
protected static int min(int a, int b, int c, int d) {
131+
return Math.min(a, Math.min(b, Math.min(c, d)));
132+
}
133+
134+
}

0 commit comments

Comments
 (0)