Skip to content

Commit 886a4b8

Browse files
committed
QGram and NGram
+ Shingling
1 parent ff304a2 commit 886a4b8

File tree

4 files changed

+253
-8
lines changed

4 files changed

+253
-8
lines changed

README.md

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,5 +76,48 @@ public class MyApp {
7676
}
7777
```
7878

79+
## Q-Gram
80+
81+
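Q-Gram distance, not to be confused with the N-Gram distance defined by Kondrak (below), is the relative number of n-grams that the two strings do not share: the number of n-grams that appear in exactly one of the two strings, divided by the total number of distinct n-grams found in both strings.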
82+
83+
```java
84+
import info.debatty.java.stringsimilarity.*;
85+
86+
public class MyApp {
87+
88+
public static void main(String[] args) {
89+
QGram dig = new QGram(2);
90+
91+
// Should be 2: CD and CE
92+
System.out.println(dig.absoluteDistance("ABCD", "ABCE"));
93+
94+
// Should be 0.5 (2 / 4)
95+
System.out.println(dig.distance("ABCD", "ABCE"));
96+
}
97+
}
98+
```
99+
100+
## N-Gram similarity (Kondrak)
101+
102+
N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance", String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.
103+
104+
http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
105+
106+
The algorithm uses affixing with the special character '\n' to increase the weight of the first characters. The normalization is achieved by dividing the total similarity score by the original length of the longer word.
107+
108+
```java
109+
import info.debatty.java.stringsimilarity.*;
110+
111+
public class MyApp {
112+
113+
public static void main(String[] args) {
114+
NGram twogram = new NGram(2);
115+
116+
// Should be 0.41666
117+
System.out.println(twogram.distance("ABCD", "ABTUIO"));
118+
}
119+
}
120+
```
121+
79122

80123

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
package info.debatty.java.stringsimilarity;
2+
3+
import java.io.Serializable;
4+
import java.security.InvalidParameterException;
5+
import java.util.HashSet;
6+
7+
/**
 * A k-shingling is a set of unique k-grams, used to measure the similarity of
 * two documents.
 *
 * Generally speaking, a k-gram is any sequence of k tokens. We use here the
 * definition from Leskovec, Rajaraman &amp; Ullman (2014),
 * "Mining of Massive Datasets", Cambridge University Press:
 * Multiple subsequent spaces are replaced by a single space, and a k-gram is a
 * sequence of k characters.
 *
 * Every string handed to {@link #parse(String)} or
 * {@link #booleanVectorOf(String)} goes through the same whitespace
 * normalization, so a shingle extracted from a string is always found again
 * in that string.
 *
 * @author Thibault Debatty http://www.debatty.info
 */
public class KShingling extends HashSet<String> implements Serializable {

    /**
     * Small demonstration: builds a 4-shingling from two strings and prints
     * the set followed by the boolean vector of each string.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String s1 = "my string,  \n  my song";
        String s2 = "another string, from a song";
        KShingling ks = new KShingling(4);
        ks.parse(s1);
        ks.parse(s2);
        System.out.println(ks.toString());

        for (boolean b : ks.booleanVectorOf(s1)) {
            System.out.print(b ? "1" : "0");
        }
        System.out.print("\n");

        for (boolean b : ks.booleanVectorOf(s2)) {
            System.out.print(b ? "1" : "0");
        }
        System.out.print("\n");

        // This string is 16 characters long while k is 4, so add() rejects it
        // with an InvalidParameterException.
        ks.add("Another shingle?");
    }

    /** Length, in characters, of every shingle stored in this set. */
    protected int k = 5;

    /**
     * Creates an empty shingling with the default shingle size (5).
     */
    public KShingling() {
        super();
    }

    /**
     * Creates an empty shingling with the given shingle size.
     *
     * @param k size of the k-grams, must be strictly positive
     * @throws InvalidParameterException if k is not strictly positive
     */
    public KShingling(int k) {
        super();
        this.setK(k);
    }

    /**
     * @return the size of the k-grams stored in this set
     */
    public int getK() {
        return k;
    }

    /**
     * Set the size of k-grams.
     * Default value is 5 (recommended for emails).
     * A good rule of thumb is to imagine that there are only 20 characters
     * and estimate the number of k-shingles as 20^k. For large documents,
     * such as research articles, choice k = 9 is considered safe.
     *
     * Shingles already present in the set are not touched by this call.
     *
     * @param k size of the k-grams, must be strictly positive
     * @throws InvalidParameterException if k is not strictly positive
     */
    public final void setK(int k) {
        if (k <= 0) {
            throw new InvalidParameterException("k should be positive!");
        }

        this.k = k;
    }

    /**
     * Extracts every k-gram of the given string and adds it to this set.
     * Runs of whitespace are first collapsed to a single space. A string that
     * is shorter than k after normalization contributes no shingle.
     *
     * @param s the string to split into shingles, must not be null
     * @return always true
     * @throws NullPointerException if s is null
     */
    public boolean parse(String s) {
        String normalized = normalize(s);
        for (int i = 0; i < (normalized.length() - k + 1); i++) {
            this.add(normalized.substring(i, i + k));
        }
        return true;
    }

    /**
     * Adds a single shingle to this set.
     *
     * @param s the shingle, whose length must be exactly k
     * @return true if the set did not already contain the shingle
     * @throws InvalidParameterException if the length of s differs from k
     * @throws NullPointerException if s is null
     */
    @Override
    public boolean add(String s) {
        if (s.length() != k) {
            throw new InvalidParameterException("This size of this String (" +
                    s.length() + ") is different from k (" + k + ")");
        }

        return super.add(s);
    }

    /**
     * Computes the boolean vector of a string against this shingling: one
     * entry per shingle of the set, true when the string contains that
     * shingle.
     *
     * The string is normalized exactly like in {@link #parse(String)} before
     * the lookup; otherwise shingles spanning a collapsed whitespace run
     * would never be found in the raw string they were extracted from.
     *
     * Entries follow the iteration order of this set, so two vectors are only
     * comparable position by position if the set was not modified between
     * the two calls.
     *
     * @param s the string to test, must not be null
     * @return an array of size() booleans
     * @throws NullPointerException if s is null
     */
    public boolean[] booleanVectorOf(String s) {
        String normalized = normalize(s);
        boolean[] r = new boolean[this.size()];

        int i = 0;
        for (String shingle : this) {
            r[i] = normalized.contains(shingle);
            i++;
        }

        return r;
    }

    /**
     * Collapses every run of whitespace characters into a single space.
     */
    private static String normalize(String s) {
        return s.replaceAll("\\s+", " ");
    }
}

src/info/debatty/java/stringsimilarity/NGram.java

Lines changed: 33 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,42 @@
11
package info.debatty.java.stringsimilarity;
22

33
/**
4+
* N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance",
5+
* String Processing and Information Retrieval, Lecture Notes in Computer
6+
* Science Volume 3772, 2005, pp 115-126.
7+
*
8+
* The algorithm uses affixing with special character '\n' to increase the
9+
* weight of first characters. The normalization is achieved by dividing the
10+
* total similarity score by the original length of the longer word.
411
*
512
* http://webdocs.cs.ualberta.ca/~kondrak/papers/spire05.pdf
6-
* @author tibo
713
*/
8-
public class NGram {
14+
public class NGram implements StringSimilarityInterface {
15+
16+
public static void main(String[] args) {
17+
NGram twogram = new NGram(2);
18+
19+
System.out.println(twogram.distance("ABCD", "ABTUIO"));
20+
}
21+
22+
private final int n;
23+
24+
public NGram(int n) {
25+
this.n = n;
26+
}
27+
28+
public NGram() {
29+
this.n = 2;
30+
}
931

10-
public static double Distance(String s0, String s1) {
11-
return Distance(s0, s1, 2);
32+
@Override
33+
public double similarity(String s1, String s2) {
34+
return distance(s1, s2);
1235
}
1336

14-
private static double Distance(String s0, String s1, int n) {
37+
@Override
38+
public double distance(String s0, String s1) {
39+
final char special = '\n';
1540
final int sl = s0.length();
1641
final int tl = s1.length();
1742

@@ -41,7 +66,7 @@ private static double Distance(String s0, String s1, int n) {
4166
//construct sa with prefix
4267
for (int i = 0; i < sa.length; i++) {
4368
if (i < n - 1) {
44-
sa[i] = 0; //add prefix
69+
sa[i] = special; //add prefix
4570
} else {
4671
sa[i] = s0.charAt(i - n + 1);
4772
}
@@ -63,7 +88,7 @@ private static double Distance(String s0, String s1, int n) {
6388
//construct t_j n-gram
6489
if (j < n) {
6590
for (int ti = 0; ti < n - j; ti++) {
66-
t_j[ti] = 0; //add prefix
91+
t_j[ti] = special; //add prefix
6792
}
6893
for (int ti = n - j; ti < n; ti++) {
6994
t_j[ti] = s1.charAt(ti - (n - j));
@@ -79,7 +104,7 @@ private static double Distance(String s0, String s1, int n) {
79104
for (int ni = 0; ni < n; ni++) {
80105
if (sa[i - 1 + ni] != t_j[ni]) {
81106
cost++;
82-
} else if (sa[i - 1 + ni] == 0) { //discount matches on prefix
107+
} else if (sa[i - 1 + ni] == special) { //discount matches on prefix
83108
tn--;
84109
}
85110
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
package info.debatty.java.stringsimilarity;
2+
3+
/**
4+
* QGram similarity is the relative number of n-grams both strings have in
5+
* common.
6+
*
7+
* @author Thibault Debatty
8+
*/
9+
public class QGram implements StringSimilarityInterface {
10+
11+
public static void main(String[] args) {
12+
QGram dig = new QGram(2);
13+
14+
// Should be 2: CD and CE
15+
System.out.println(dig.absoluteDistance("ABCD", "ABCE"));
16+
17+
// Should be 0.5 (2 / 4)
18+
System.out.println(dig.distance("ABCD", "ABCE"));
19+
}
20+
21+
private int n;
22+
23+
public QGram(int n) {
24+
this.n = n;
25+
}
26+
27+
public QGram() {
28+
this.n = 3;
29+
}
30+
31+
@Override
32+
public double similarity(String s1, String s2) {
33+
return 1.0 - distance(s1, s2);
34+
}
35+
36+
@Override
37+
public double distance(String s1, String s2) {
38+
KShingling sh = new KShingling(n);
39+
sh.parse(s1);
40+
sh.parse(s2);
41+
42+
boolean[] b1 = sh.booleanVectorOf(s1);
43+
boolean[] b2 = sh.booleanVectorOf(s2);
44+
45+
int d = 0;
46+
for (int i = 0; i < b1.length; i++) {
47+
if (b1[i] != b2[i]) {
48+
d++;
49+
}
50+
}
51+
52+
return ((double) d) / sh.size();
53+
}
54+
55+
public int absoluteDistance(String s1, String s2) {
56+
KShingling sh = new KShingling(n);
57+
sh.parse(s1);
58+
sh.parse(s2);
59+
60+
boolean[] b1 = sh.booleanVectorOf(s1);
61+
boolean[] b2 = sh.booleanVectorOf(s2);
62+
63+
int d = 0;
64+
for (int i = 0; i < b1.length; i++) {
65+
if (b1[i] != b2[i]) {
66+
d++;
67+
}
68+
}
69+
70+
return d;
71+
}
72+
73+
}

0 commit comments

Comments
 (0)