Skip to content

Commit 34e7a50

Browse files
committed
Speed optimization
1 parent 6d1daf3 commit 34e7a50

File tree

6 files changed

+192
-170
lines changed

6 files changed

+192
-170
lines changed

src/main/java/info/debatty/java/stringsimilarity/Cosine.java

Lines changed: 23 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -24,18 +24,10 @@
2424

2525
package info.debatty.java.stringsimilarity;
2626

27-
import java.util.HashMap;
28-
import java.util.HashSet;
29-
import java.util.Set;
30-
3127
/**
32-
* Implements Cosine Similarity.
33-
* The strings are first transformed in vectors of occurences of k-shingles
34-
* (sequences of k characters). In this n-dimensional space, the similarity
35-
* between the two strings is the cosine of their respective vectors.
3628
* @author Thibault Debatty
3729
*/
38-
public class Cosine implements StringSimilarityInterface {
30+
public class Cosine extends SetBasedStringSimilarity {
3931

4032
/**
4133
* @param args the command line arguments
@@ -59,75 +51,54 @@ public static void main(String[] args) {
5951
System.out.println(cos.similarity("ABAB", "BAB"));
6052
}
6153

62-
private int k;
63-
54+
/**
55+
* Implements Cosine Similarity.
56+
* The strings are first transformed into vectors of occurrences of k-shingles
57+
* (sequences of k characters). In this n-dimensional space, the similarity
58+
* between the two strings is the cosine of their respective vectors.
59+
*
60+
* @param k length of the shingles (sequences of k characters)
61+
*/
6462
public Cosine(int k) {
65-
this.k = k;
63+
super(k);
6664
}
6765

6866
public Cosine() {
69-
this.k = 3;
67+
super(3);
7068
}
7169

72-
/**
73-
* Computes the cosine similarity of s1 and s2.
74-
* The strings are first converted to vectors in the space of k-shingles.
75-
* The cosine similarity is computed as V1 . V2 / (|V1| * |V2|)
76-
* @param s1
77-
* @param s2
78-
* @return Cosine similarity
79-
*/
80-
public double similarity(String s1, String s2) {
81-
if (s1.equals(s2)) {
82-
return 1.0;
83-
}
84-
85-
86-
if (s1.equals("") || s2.equals("")) {
87-
return 0.0;
88-
}
89-
90-
KShingling ks = new KShingling(this.k);
91-
HashMap<String, Integer> profile1 = ks.getProfile(s1);
92-
HashMap<String, Integer> profile2 = ks.getProfile(s2);
70+
71+
public double similarity(int[] profile1, int[] profile2) {
9372

94-
return dotProduct(profile1, profile2) / (norm(profile1) * norm(profile2));
73+
return dotProduct(profile1, profile2) / (norm(profile1) * norm(profile2));
9574
}
9675

97-
public double distance(String s1, String s2) {
98-
return 1.0 - similarity(s1, s2);
99-
}
76+
10077

10178
/**
10279
* Compute the norm L2 : sqrt(Sum_i( v_i^2))
10380
* @param profile
10481
* @return L2 norm
10582
*/
106-
protected static double norm(HashMap<String, Integer> profile) {
83+
protected static double norm(int[] profile) {
10784
double agg = 0;
10885

109-
for (int v : profile.values()) {
86+
for (int v : profile) {
11087
agg += v * v;
11188
}
11289

11390
return Math.sqrt(agg);
11491
}
11592

116-
protected static double dotProduct(HashMap<String, Integer> profile1,
117-
HashMap<String, Integer> profile2) {
93+
protected static double dotProduct(int[] profile1, int[] profile2) {
94+
int length = Math.max(profile1.length, profile2.length);
95+
profile1 = java.util.Arrays.copyOf(profile1, length);
96+
profile2 = java.util.Arrays.copyOf(profile2, length);
11897

11998
double agg = 0;
120-
Set<String> union = new HashSet<String>();
121-
union.addAll(profile1.keySet());
122-
union.addAll(profile2.keySet());
123-
124-
for (String key : union) {
125-
int v1 = profile1.containsKey(key) ? profile1.get(key) : 0;
126-
int v2 = profile2.containsKey(key) ? profile2.get(key) : 0;
127-
agg += v1 * v2;
99+
for (int i = 0; i < length; i++) {
100+
agg += profile1[i] * profile2[i];
128101
}
129-
130102
return agg;
131103
}
132-
133104
}

src/main/java/info/debatty/java/stringsimilarity/Jaccard.java

Lines changed: 21 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,11 @@
2424

2525
package info.debatty.java.stringsimilarity;
2626

27-
import java.util.HashMap;
28-
import java.util.HashSet;
29-
import java.util.Set;
30-
3127
/**
3228
*
3329
* @author Thibault Debatty
3430
*/
35-
public class Jaccard implements StringSimilarityInterface {
31+
public class Jaccard extends SetBasedStringSimilarity {
3632

3733
/**
3834
* @param args the command line arguments
@@ -47,7 +43,7 @@ public static void main(String[] args) {
4743
System.out.println(j2.similarity("ABCDE", "ABCDF"));
4844
}
4945

50-
private final int k;
46+
5147

5248
/**
5349
* The strings are first transformed into sets of k-shingles (sequences of k
@@ -57,17 +53,13 @@ public static void main(String[] args) {
5753
* @param k
5854
*/
5955
public Jaccard(int k) {
60-
this.k = k;
56+
super(k);
6157
}
6258

6359
public Jaccard() {
64-
this.k = 3;
60+
super(3);
6561
}
6662

67-
public double similarity(String s1, String s2) {
68-
KShingling ks = new KShingling(this.k);
69-
return similarity(ks.getProfile(s1), ks.getProfile(s2));
70-
}
7163

7264
/**
7365
* Compute and return the Jaccard index similarity between two string profiles.
@@ -82,23 +74,24 @@ public double similarity(String s1, String s2) {
8274
* @param profile2
8375
* @return the Jaccard index similarity between the two profiles
8476
*/
85-
public double similarity(HashMap<String,Integer> profile1,
86-
HashMap<String,Integer> profile2) {
87-
Set<String> set1 = profile1.keySet();
88-
Set<String> set2 = profile2.keySet();
89-
90-
Set union = new HashSet();
91-
union.addAll(set1);
92-
union.addAll(set2);
77+
public double similarity(int[] profile1, int[] profile2) {
78+
int length = Math.max(profile1.length, profile2.length);
79+
profile1 = java.util.Arrays.copyOf(profile1, length);
80+
profile2 = java.util.Arrays.copyOf(profile2, length);
9381

94-
Set inter = new HashSet(set1);
95-
inter.retainAll(set2);
82+
int inter = 0;
83+
int union = 0;
9684

97-
return (double) inter.size() / union.size();
98-
}
99-
100-
public double distance(String s1, String s2) {
101-
return 1.0 - similarity(s1, s2);
102-
}
85+
for (int i = 0; i < length; i++) {
86+
if (profile1[i] > 0 || profile2[i] > 0) {
87+
union++;
88+
89+
if (profile1[i] > 0 && profile2[i] > 0) {
90+
inter++;
91+
}
92+
}
93+
}
10394

95+
return (double) inter / union;
96+
}
10497
}

src/main/java/info/debatty/java/stringsimilarity/KShingling.java

Lines changed: 51 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
package info.debatty.java.stringsimilarity;
22

33
import java.security.InvalidParameterException;
4+
import java.util.ArrayList;
45
import java.util.HashMap;
6+
import java.util.Iterator;
7+
import java.util.List;
58
import java.util.regex.Pattern;
69

710
/**
@@ -30,7 +33,8 @@ public static void main(String[] args) {
3033
System.out.println(ks.getProfile("ABCAB"));
3134
}
3235

33-
protected int k = 5;
36+
protected int k;
37+
private HashMap<String, Integer> shingles = new HashMap<String, Integer>();
3438

3539
/**
3640
* k-shingling is the operation of transforming a string (or text document) into
@@ -42,35 +46,28 @@ public static void main(String[] args) {
4246
* "Mining of Massive Datasets", Cambridge University Press:
4347
* Multiple subsequent spaces are replaced by a single space, and a k-gram is a
4448
* sequence of k characters.
49+
*
50+
* Default value of k is 5 (recommended for emails).
51+
* A good rule of thumb is to imagine that there are only 20 characters
52+
* and estimate the number of k-shingles as 20^k. For large documents,
53+
* such as research articles, k = 9 is considered a safe choice.
4554
*/
4655
public KShingling() {
47-
56+
k = 5;
4857
}
4958

5059
public KShingling(int k) {
51-
this.setK(k);
52-
}
53-
54-
public int getK() {
55-
return k;
56-
}
57-
58-
/**
59-
* Set the size of k-grams.
60-
* Default value is 5 (recommended for emails).
61-
* A good rule of thumb is to imagine that there are only 20 characters
62-
* and estimate the number of k-shingles as 20^k. For large documents,
63-
* such as research articles, choice k = 9 is considered safe.
64-
* @param k
65-
*/
66-
public final void setK(int k) {
6760
if (k <= 0) {
6861
throw new InvalidParameterException("k should be positive!");
6962
}
7063

7164
this.k = k;
7265
}
7366

67+
public int getK() {
68+
return k;
69+
}
70+
7471
/**
7572
* Pattern for finding multiple consecutive spaces
7673
*/
@@ -91,7 +88,32 @@ public final void setK(int k) {
9188
* @param s
9289
* @return
9390
*/
94-
public HashMap<String, Integer> getProfile(String s) {
91+
public int[] getProfile(String s) {
92+
ArrayList<Integer> r = new ArrayList<Integer>(shingles.size());
93+
for (int i = 0; i < shingles.size(); i++) {
94+
r.add(0);
95+
}
96+
97+
s = spaceReg.matcher(s).replaceAll(" ");
98+
String shingle;
99+
for (int i = 0; i < (s.length() - k + 1); i++) {
100+
shingle = s.substring(i, i+k);
101+
int position;
102+
103+
if (shingles.containsKey(shingle)) {
104+
position = shingles.get(shingle);
105+
r.set(position, r.get(position) + 1);
106+
107+
} else {
108+
shingles.put(shingle, shingles.size());
109+
r.add(1);
110+
}
111+
112+
}
113+
114+
return convertIntegers(r);
115+
116+
/*
95117
HashMap<String, Integer> r = new HashMap<String, Integer>(s.length() / 2);
96118
s = spaceReg.matcher(s).replaceAll(" ");
97119
String kgram;
@@ -107,6 +129,15 @@ public HashMap<String, Integer> getProfile(String s) {
107129
r.put(kgram, 1);
108130
}
109131
}
110-
return r;
132+
return r;*/
133+
}
134+
135+
public static int[] convertIntegers(List<Integer> integers) {
136+
int[] ret = new int[integers.size()];
137+
Iterator<Integer> iterator = integers.iterator();
138+
for (int i = 0; i < ret.length; i++) {
139+
ret[i] = iterator.next().intValue();
140+
}
141+
return ret;
111142
}
112143
}

0 commit comments

Comments
 (0)