-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtest.rb
More file actions
263 lines (225 loc) · 7.88 KB
/
test.rb
File metadata and controls
263 lines (225 loc) · 7.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
require "interscript"
require "interscript/compiler/ruby"
require "optparse"
class Interscript::GeoTest
def initialize(file, verbose: false)
@file = file
@verbose = verbose
end
def start
run read
run group_by_ufi
run group_by_uni
run group_by_transl
run cluster_by_related
done
# --- analysis ---
analyze_uni_uniqueness
analyze_related_clusters
analyze_translit_systems
analyze_usability_of_related_clusters
analyze_good_clusters
end
def self.start(...) = new(...).start
# Parse TSV into an array of structure instances
def read
records = File.read(@file).split(/\r?\n/).map { |i| i.split("\t") }
headers = records.shift.map(&:downcase).map(&:to_sym)
records = records.map { |i| i.map.with_index { |v,idx| [headers[idx], v] }.to_h }
@records = records.map { |i| Name.new(self, **i) }
end
def run(*) = print "."
def done = puts
def group_by_ufi
@records_by_ufi = @records.group_by(&:ufi)
end
def group_by_uni
@records_by_uni = @records.group_by(&:uni)
end
def group_by_transl
@records_by_transl = @records.group_by(&:transl_cd).sort_by { |_,v| -v.length }.to_h
end
def cluster_by_related
@related_clusters = {}
@records.each do |record|
next unless record.related
my_cluster = (
(@related_clusters[record.uni] || []) +
(@related_clusters[record.related.uni] || []) +
[record, record.related]
).uniq(&:object_id)
my_cluster.each { |r| @related_clusters[r.uni] = my_cluster }
end
@unique_related_clusters = @related_clusters.values.uniq(&:object_id)
end
attr_reader :records, :records_by_ufi, :records_by_uni, :records_by_transl, :related_clusters, :unique_related_clusters
def analyze_uni_uniqueness
count = @records_by_uni.values.select { |i| i.length > 1 }.count
puts "#{count} records have a non-unique UNI (should be 0)"
puts
end
def analyze_related_clusters
puts "Out of #{@related_clusters.length} related clusters we get #{@unique_related_clusters.length} unique related clusters"
puts "Unique clusters have #{@unique_related_clusters.map(&:length).sum} members in total (this should match a number of related clusters)"
print "Hash of cluster length to a number of clusters of that kind: "
p @unique_related_clusters.group_by(&:length).transform_values(&:length)
puts
end
def analyze_usability_of_related_clusters
errors = {
length: [],
no_transl: [],
no_script: [],
too_much_script: [],
no_map: [],
}
good = []
@unique_related_clusters.each do |cluster|
if cluster.length < 2
# A bug - likely due to wrong data
errors[:length] << cluster
elsif cluster.none? { |i| %w[NS DS VS].include? i.nt }
# We can do nothing about it
errors[:no_script] << cluster
elsif cluster.none? { |i| i.transl_cd != '' }
# TODO: Add some heuristics per run?
errors[:no_transl] << cluster
elsif cluster.count { |i| %w[NS DS VS].include? i.nt } > 1
# TODO: split those by some heuristic like by LC
errors[:too_much_script] << cluster
elsif cluster.none? { |i| geo_to_is i.transl_cd }
# We don't have a usable map for those entries
errors[:no_map] << cluster
else
good << cluster
end
end
puts "Among the unique clusters:"
puts "- #{errors[:length].length} clusters are too short"
puts "- #{errors[:no_script].length} clusters contain no non-ASCII entries"
puts "- #{errors[:no_transl].length} clusters contain no transliteration info"
puts "- #{errors[:too_much_script].length} clusters contain more than 1 non-ASCII entries"
puts "- #{errors[:no_map].length} clusters are transliterated with a map not present in Interscript"
puts "Remaining #{good.length} clusters seem to be usable"
puts
@good_clusters = good
end
def compare_and_return_error(first, second)
if first == second
nil
elsif first.downcase == second.downcase
"Incorrect casing"
elsif first.gsub(/[^[:alpha:][:space:]]/,'') == second.gsub(/[^[:alpha:][:space:]]/,'')
"Incorrect punctuation"
elsif first.downcase.gsub(/[^[:alpha:][:space:]]/,'') == second.downcase.gsub(/[^[:alpha:][:space:]]/,'')
"Incorrect casing and punctuation"
elsif first.gsub(/[^[:alpha:]]/,'') == second.gsub(/[^[:alpha:]]/,'')
"Incorrect spacing or punctuation"
elsif first.downcase.gsub(/[^[:alpha:]]/,'') == second.downcase.gsub(/[^[:alpha:]]/,'')
"Incorrect casing and (spacing or punctuation)"
else
"Incorrect transliteration"
end
end
def analyze_good_clusters
results = {}
maps = {}
@good_clusters.each do |cluster|
cluster = cluster.dup
original = cluster.find { |i| %w[NS DS VS].include? i.nt }
cluster.delete(original)
# The rest of entries in the cluster are transliterated entries
cluster.each do |i|
group = [original, i]
transl = i.transl_cd
results[transl] ||= []
map_id = geo_to_is transl
unless map_id
results[transl] << {error: "No support in Interscript", group: group}
next
end
compiler = Interscript.load(map_id, maps, compiler: Interscript::Compiler::Ruby)
result_fnro = compiler.(original.full_name_ro)
result_fnrg = compiler.(original.full_name_rg)
if error = compare_and_return_error(result_fnro, i.full_name_ro)
results[transl] << {error: error, group: group, result: [result_fnro, result_fnrg]}
elsif error = compare_and_return_error(result_fnrg, i.full_name_rg)
results[transl] << {error: error, group: group, result: [result_fnro, result_fnrg]}
else
results[transl] << {ok: true, group: group}
end
end
end
# Compare transliteration result
results.each do |transl, results|
print "#{transl}: "
all = results.length
good = results.select { |i| i[:ok] }.length
errors = results.select { |i| i[:error] }
print "#{good}/#{all} (#{(good*100.0/all).round(2)}%)"
unless errors.empty?
print " (Errors: "
print errors.group_by { |i| i[:error] }.transform_values(&:length).map { |error, count|
"#{error} * #{count}"
}.join(", ")
print ")"
end
puts
if @verbose && !errors.empty?
pp errors
end
end
end
def geo_to_is(name)
(@geo_to_is_cache ||= {})[name] ||= begin
File.basename(Interscript.locate(name), ".imp") rescue nil
end
end
def analyze_translit_systems
puts "Transliteration systems used:"
@records_by_transl.each do |transl,names|
print "- #{transl.inspect} * #{names.length} "
print "(#{names.select { |i| i.related }.length} with a pair)"
print " implemented in Interscript as #{geo_to_is transl}" if geo_to_is transl
puts
end
puts
end
class Name
FIELDS=%i[ufi uni mgrs nt lc full_name_ro full_name_rg name_link transl_cd]
INT_FIELDS=%i[ufi uni name_link]
attr_accessor *FIELDS
def initialize(geotest, **kwargs)
@geotest = geotest
kwargs.each do |k,v|
if INT_FIELDS.include?(k)
v = v == '' ? nil : v.to_i
end
instance_variable_set(:"@#{k}", v)
end
end
def inspect
"#<Name #{FIELDS.map { |i| "#{i}=#{send(i)}" }.join(" ")}>"
end
def related
return nil unless name_link
@geotest.records_by_uni[name_link]&.first
end
def related_cluster
@geotest.related_clusters[uni] || []
end
end
end
# Command-line entry point: parse the options, then analyze the given file.
options = {}
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$0} [options] file"

  opts.on("-v", "--verbose", "Describe all failures") do
    options[:verbose] = true
  end

  opts.on("-h", "--help", "Prints this help") do
    puts opts
    exit
  end
end

# Called without any arguments: behave as if --help had been given.
parser.parse!(ARGV.empty? ? ["--help"] : ARGV)

# parse! strips the options from ARGV; what remains first is the input file.
file = ARGV.first
# Options but no file (e.g. only "-v"): show the usage on stderr and exit with
# a failure status instead of crashing later while reading a nil path.
abort parser.help unless file

Interscript::GeoTest.start(file, **options)