-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathSentenceComparison.java
More file actions
158 lines (129 loc) · 3.68 KB
/
SentenceComparison.java
File metadata and controls
158 lines (129 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import java.io.*;
import javax.swing.*;
import java.util.*;
/**
SentenceComparison compares every sentence of one document with every
sentence of the other document. For each sentence, a running maximum of
how much it cheats is held; in the end, the average of the cheat values
for all the sentences is used as the cheat value of the document.
Algorithm based on one created by Tom Slattery and Joshua Louie for
a previous version of SPLAT.
@author Gergely Kota
*/
public class SentenceComparison extends DocumentComparison
{
    // Characters that terminate a sentence when a document is split up.
    private static final String DELIMS = ".?!;\n";

    /**
     * Creates a sentence-based comparison for the given location and
     * sets the initial tolerance to 50.
     *
     * @param loc location handed unchanged to the DocumentComparison constructor
     */
    public SentenceComparison(File loc)
    {
        super(loc);
        setTolerance(50);
    }

    /**
     * Compares two files sentence by sentence.
     * Every sentence of each file is matched against every sentence of the
     * other file and keeps the highest overlap ratio it reaches. Sentences
     * whose ratio exceeds tolerance/100 are logged on the result, and the
     * result's score is computed from the sentences of f2.
     *
     * @param f1 first file
     * @param f2 second file
     * @return the PairComparison holding the logged ranges and the score
     */
    public PairComparison compare(File f1, File f2)
    {
        PairComparison pc = new PairComparison(f1, f2);
        Sentence[] s1 = build(f1);
        Sentence[] s2 = build(f2);

        // setScore only ever raises a score, so after this double loop each
        // sentence holds its best match against the other document.
        for(int i = 0; i < s1.length; i++)
        {
            for(int j = 0; j < s2.length; j++)
            {
                s2[j].setScore(s2[j].cheatFrom(s1[i]));
                s1[i].setScore(s1[i].cheatFrom(s2[j]));
            }
        }

        // log results
        // NOTE(review): pc.log presumably takes (start1, end1, start2, end2);
        // confirm against PairComparison.
        double threshold = getTolerance()/100.0;
        for(int i = 0; i < s1.length; i++)
        {
            if(s1[i].score > threshold)
                pc.log(s1[i].index, s1[i].index + s1[i].sentence.length(), 0, 0);
        }
        for(int i = 0; i < s2.length; i++)
        {
            if(s2[i].score > threshold)
                pc.log(0, 0, s2[i].index, s2[i].index + s2[i].sentence.length());
        }

        pc.setScore(getScore(s2));
        return pc;
    }

    /**
     * @return a one-line, human-readable description of this comparison
     */
    public static String description()
    {
        return "Compares documents based on the similarity of sentences";
    }

    /**
     * Computes a document score from its sentences: the scores that exceed
     * tolerance/100 are summed and the sum is divided by the total number
     * of sentences.
     *
     * @param s the scored sentences of one document
     * @return the score, or 0 when there are no sentences
     */
    private double getScore(Sentence[] s)
    {
        // An empty document would otherwise produce 0.0/0 = NaN.
        if(s.length == 0)
            return 0;

        double threshold = getTolerance()/100.0;
        double sum = 0;
        for(int i = 0; i < s.length; i++)
        {
            if(s[i].score > threshold)
                sum += s[i].score;
        }
        return sum/s.length;
    }

    /**
     * @param f the file to canonicalize
     * @return the original content of the file converted to lower case
     */
    public String canonicalForm(File f)
    {
        return getOriginalContent(f).toLowerCase();
    }

    /**
     * Maps an index to itself; toLowerCase in canonicalForm is the only
     * transformation applied here.
     *
     * @param f the file the index belongs to (unused)
     * @param index the index to map
     * @return index, unchanged
     */
    public int indexMap(File f, int index)
    {
        return index;
    }

    /**
     * Splits the content of a file into sentences and records where each
     * sentence starts within that content.
     *
     * @param f the file to split
     * @return the sentences in document order
     */
    private Sentence[] build(File f)
    {
        ArrayList found = new ArrayList();
        String all = getContent(f);
        StringTokenizer sentences = new StringTokenizer(all, DELIMS);

        // Position just past the previous sentence. Only delimiter characters
        // lie between this position and the next token, and a token contains
        // no delimiters, so indexOf from here lands exactly on the token.
        // Searching from the previous sentence's start instead would return
        // the wrong offset for repeated sentences.
        int searchFrom = 0;
        while(sentences.hasMoreTokens())
        {
            String s = sentences.nextToken();
            int index = all.indexOf(s, searchFrom);
            found.add(new Sentence(s, index));
            searchFrom = index + s.length();
        }
        return (Sentence[]) found.toArray(new Sentence[found.size()]);
    }

    /**
     * One sentence of a document: its text, its start offset in the
     * document content, its distinct words and its best score so far.
     */
    private static class Sentence
    {
        private final int index;
        private final String sentence;
        // Distinct whitespace-separated words; only membership and size are
        // needed, so a hash set replaces the former list.
        private final Set words;
        private double score;

        /**
         * @param s the sentence text
         * @param i the offset of the sentence within the document content
         */
        public Sentence(String s, int i)
        {
            index = i;
            sentence = s;
            words = new HashSet();
            score = 0;
            StringTokenizer st = new StringTokenizer(s);
            while(st.hasMoreTokens())
            {
                // The set drops duplicates; interning lets equal words share
                // a single String instance.
                words.add(st.nextToken().intern());
            }
        }

        /**
         * Raises the score to x if x is larger than the current score;
         * the score is never lowered.
         *
         * @param x candidate score
         */
        public void setScore(double x)
        {
            if(x > score)
                score = x;
        }

        /**
         * Measures how much of this sentence also appears in another one.
         *
         * @param s the sentence to compare against
         * @return the fraction of this sentence's distinct words that also
         *         occur in s, or 0 when this sentence has no words
         */
        public double cheatFrom(Sentence s)
        {
            // A whitespace-only sentence would otherwise yield 0/0.0 = NaN.
            if(words.isEmpty())
                return 0;

            int total = 0;
            for(Iterator it = words.iterator(); it.hasNext(); )
            {
                if(s.words.contains(it.next()))
                    total++;
            }
            return ((double) total)/words.size();
        }
    }

    /* --------------------------------------------- */
    /* --------------------------------------------- */

    /**
     * Manual test driver: runs a comparison over the "Downloads" directory
     * at tolerance 60 and shows the result for Downloads/TextPad.txt.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        // Start time; only referenced by the commented-out timing lines below.
        long x = System.currentTimeMillis();
        SentenceComparison sc = new SentenceComparison(new File("Downloads"));
        sc.setTolerance(60);
        // sc.read();
        // System.out.println(System.currentTimeMillis() - x);
        // sc.compare();
        // System.out.println(System.currentTimeMillis() - x);
        new HTMLFrame(sc.read().compare(new File("Downloads/TextPad.txt"))).show();
    }
}