geotest/test.rb at master · interscript/geotest · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
require "interscript"
require "interscript/compiler/ruby"
require "optparse"

# Reads a TSV file of name records, links records that reference each other
# into clusters, and reports how well Interscript maps reproduce the
# transliterated names stored in the file.
#
# Usage: Interscript::GeoTest.start(path, verbose: true)
class Interscript::GeoTest
  # NT values identifying the entry of a cluster that transliteration starts
  # from (the report calls these the "non-ASCII entries").
  SCRIPT_NT = %w[NS DS VS].freeze

  # file    - path of the TSV file to analyze.
  # verbose - when true, every failing comparison is pretty-printed.
  def initialize(file, verbose: false)
    @file = file
    @verbose = verbose
  end

  # Runs the whole pipeline: load and index the records (one progress dot per
  # step), then print each analysis report in turn.
  def start
    run read
    run group_by_ufi
    run group_by_uni
    run group_by_transl
    run cluster_by_related
    done
    # --- analysis ---
    analyze_uni_uniqueness
    analyze_related_clusters
    analyze_translit_systems
    analyze_usability_of_related_clusters
    analyze_good_clusters
  end

  # Convenience entry point: builds an instance from the given arguments and
  # starts it.
  def self.start(...) = new(...).start

  # Parse TSV into an array of structure instances.
  #
  # The first non-blank line supplies the column names (downcased and
  # symbolized); every following non-blank line becomes a Name.
  def read
    rows = File.read(@file)
               .split(/\r?\n/)
               .reject(&:empty?)
               # limit -1 keeps trailing empty fields; a plain split("\t")
               # drops them, which would leave the last columns unset (nil)
               # instead of '' for rows that end in empty cells.
               .map { |line| line.split("\t", -1) }
    # Array() lets an empty file yield zero records instead of raising.
    headers = Array(rows.shift).map { |header| header.downcase.to_sym }
    @records = rows.map do |row|
      fields = row.each_with_index.to_h { |value, idx| [headers[idx], value] }
      Name.new(self, **fields)
    end
  end

  # Progress indicator: the argument is evaluated by the caller (that is the
  # actual work); this only prints a dot afterwards.
  def run(*) = print "."
  # Ends the line of progress dots.
  def done = puts

  # Indexes the records by their UFI value.
  def group_by_ufi
    @records_by_ufi = @records.group_by(&:ufi)
  end

  # Indexes the records by their UNI value. Name#related relies on this index.
  def group_by_uni
    @records_by_uni = @records.group_by(&:uni)
  end

  # Indexes the records by transliteration code, largest group first.
  def group_by_transl
    @records_by_transl = @records.group_by(&:transl_cd).sort_by { |_, group| -group.length }.to_h
  end

  # Builds clusters of records connected through their name_link references.
  #
  # @related_clusters maps each member's UNI to the (shared) array holding its
  # cluster; @unique_related_clusters lists every distinct cluster once.
  def cluster_by_related
    @related_clusters = {}
    @records.each do |record|
      related = record.related
      next unless related

      # Merge whatever clusters either side already belongs to with the new
      # pair, keeping each record object once.
      my_cluster = (
        (@related_clusters[record.uni] || []) +
        (@related_clusters[related.uni] || []) +
        [record, related]
      ).uniq(&:object_id)

      # Repoint every member (including those of the absorbed clusters) at the
      # merged array.
      my_cluster.each { |member| @related_clusters[member.uni] = my_cluster }
    end

    # Members share one array object per cluster, so identity de-duplicates.
    @unique_related_clusters = @related_clusters.values.uniq(&:object_id)
  end

  attr_reader :records, :records_by_ufi, :records_by_uni, :records_by_transl, :related_clusters, :unique_related_clusters

  # Reports how many UNI values are shared by more than one record.
  def analyze_uni_uniqueness
    count = @records_by_uni.count { |_, group| group.length > 1 }
    puts "#{count} records have a non-unique UNI (should be 0)"
    puts
  end

  # Prints size statistics about the related clusters.
  def analyze_related_clusters
    puts "Out of #{@related_clusters.length} related clusters we get #{@unique_related_clusters.length} unique related clusters"
    puts "Unique clusters have #{@unique_related_clusters.sum(&:length)} members in total (this should match a number of related clusters)"
    print "Hash of cluster length to a number of clusters of that kind: "
    p @unique_related_clusters.group_by(&:length).transform_values(&:length)
    puts
  end

  # Sorts every unique cluster into the first matching rejection bucket or
  # into the list of usable clusters, prints the tallies, and stores the
  # usable ones in @good_clusters for analyze_good_clusters.
  def analyze_usability_of_related_clusters
    errors = {
      length: [],
      no_transl: [],
      no_script: [],
      too_much_script: [],
      no_map: [],
    }
    good = []

    @unique_related_clusters.each do |cluster|
      if cluster.length < 2
        # A bug - likely due to wrong data
        errors[:length] << cluster
      elsif cluster.none? { |i| SCRIPT_NT.include? i.nt }
        # We can do nothing about it
        errors[:no_script] << cluster
      elsif cluster.all? { |i| i.transl_cd.to_s.empty? }
        # TODO: Add some heuristics per run?
        # (to_s: a record whose row lacks the column counts as empty too)
        errors[:no_transl] << cluster
      elsif cluster.count { |i| SCRIPT_NT.include? i.nt } > 1
        # TODO: split those by some heuristic like by LC
        errors[:too_much_script] << cluster
      elsif cluster.none? { |i| geo_to_is i.transl_cd }
        # We don't have a usable map for those entries
        errors[:no_map] << cluster
      else
        good << cluster
      end
    end

    puts "Among the unique clusters:"
    puts "- #{errors[:length].length} clusters are too short"
    puts "- #{errors[:no_script].length} clusters contain no non-ASCII entries"
    puts "- #{errors[:no_transl].length} clusters contain no transliteration info"
    puts "- #{errors[:too_much_script].length} clusters contain more than 1 non-ASCII entries"
    puts "- #{errors[:no_map].length} clusters are transliterated with a map not present in Interscript"
    puts "Remaining #{good.length} clusters seem to be usable"
    puts

    @good_clusters = good
  end

  # Compares two strings and classifies their difference.
  #
  # Returns nil when they are identical; otherwise the label of the first
  # (mildest) normalization under which they become equal - case folding,
  # dropping punctuation, dropping spacing as well - or
  # "Incorrect transliteration" when none of them helps.
  def compare_and_return_error(first, second)
    if first == second
      nil
    elsif first.downcase == second.downcase
      "Incorrect casing"
    elsif first.gsub(/[^[:alpha:][:space:]]/,'') == second.gsub(/[^[:alpha:][:space:]]/,'')
      "Incorrect punctuation"
    elsif first.downcase.gsub(/[^[:alpha:][:space:]]/,'') == second.downcase.gsub(/[^[:alpha:][:space:]]/,'')
      "Incorrect casing and punctuation"
    elsif first.gsub(/[^[:alpha:]]/,'') == second.gsub(/[^[:alpha:]]/,'')
      "Incorrect spacing or punctuation"
    elsif first.downcase.gsub(/[^[:alpha:]]/,'') == second.downcase.gsub(/[^[:alpha:]]/,'')
      "Incorrect casing and (spacing or punctuation)"
    else
      "Incorrect transliteration"
    end
  end

  # Transliterates the script entry of every usable cluster with the map of
  # each other member and compares the output with that member's stored names.
  # Prints a success rate and an error breakdown per transliteration code;
  # in verbose mode the failing entries are pretty-printed as well.
  #
  # Requires analyze_usability_of_related_clusters to have run first.
  def analyze_good_clusters
    results = {}
    maps = {}

    @good_clusters.each do |cluster|
      original = cluster.find { |i| SCRIPT_NT.include? i.nt }

      # The rest of entries in the cluster are transliterated entries
      transliterated = cluster.reject { |i| i.equal?(original) }

      transliterated.each do |i|
        group = [original, i]
        transl = i.transl_cd
        results[transl] ||= []
        map_id = geo_to_is transl
        unless map_id
          results[transl] << {error: "No support in Interscript", group: group}
          next
        end
        # `maps` is handed to every load call, presumably as a shared cache of
        # already loaded maps - confirm against Interscript.load.
        compiler = Interscript.load(map_id, maps, compiler: Interscript::Compiler::Ruby)
        result_fnro = compiler.(original.full_name_ro)
        result_fnrg = compiler.(original.full_name_rg)

        # The second comparison only runs when the first found no difference.
        error = compare_and_return_error(result_fnro, i.full_name_ro) ||
                compare_and_return_error(result_fnrg, i.full_name_rg)

        if error
          results[transl] << {error: error, group: group, result: [result_fnro, result_fnrg]}
        else
          results[transl] << {ok: true, group: group}
        end
      end
    end

    # Compare transliteration result
    results.each do |transl, entries|
      print "#{transl}: "
      all = entries.length
      good = entries.count { |i| i[:ok] }
      errors = entries.select { |i| i[:error] }
      print "#{good}/#{all} (#{(good*100.0/all).round(2)}%)"
      unless errors.empty?
        print " (Errors: "
        print errors.group_by { |i| i[:error] }.transform_values(&:length).map { |error, count|
          "#{error} * #{count}"
        }.join(", ")
        print ")"
      end
      puts

      if @verbose && !errors.empty?
        pp errors
      end
    end
  end

  # Resolves a transliteration code from the data file to the name of an
  # Interscript map (the located file's basename without ".imp").
  #
  # Returns nil when the lookup raises. Both outcomes are memoized per code,
  # so a failing lookup is attempted only once.
  def geo_to_is(name)
    @geo_to_is_cache ||= {}
    # fetch, unlike `||=`, also treats a cached nil as a hit.
    @geo_to_is_cache.fetch(name) do
      @geo_to_is_cache[name] =
        begin
          File.basename(Interscript.locate(name), ".imp")
        rescue StandardError
          nil
        end
    end
  end

  # Lists every transliteration code found in the file with its record count,
  # how many of those records have a related record, and the Interscript map
  # implementing it, if any.
  def analyze_translit_systems
    puts "Transliteration systems used:"
    @records_by_transl.each do |transl, names|
      print "- #{transl.inspect} * #{names.length} "
      print "(#{names.count(&:related)} with a pair)"
      map_id = geo_to_is transl
      print " implemented in Interscript as #{map_id}" if map_id
      puts
    end
    puts
  end

  # One row of the input file.
  class Name
    # Columns exposed through accessors.
    FIELDS = %i[ufi uni mgrs nt lc full_name_ro full_name_rg name_link transl_cd].freeze
    # Columns converted to Integer (or nil when empty) on construction.
    INT_FIELDS = %i[ufi uni name_link].freeze
    attr_accessor(*FIELDS)

    # geotest - the owning GeoTest, used to resolve links to other records.
    # kwargs  - column name => raw value; each pair is stored in the instance
    #           variable of the same name.
    def initialize(geotest, **kwargs)
      @geotest = geotest
      kwargs.each do |key, value|
        if INT_FIELDS.include?(key)
          # A missing or empty value stays nil rather than becoming 0.
          value = value.nil? || value == '' ? nil : value.to_i
        end
        instance_variable_set(:"@#{key}", value)
      end
    end

    def inspect
      "#<Name #{FIELDS.map { |field| "#{field}=#{public_send(field)}" }.join(" ")}>"
    end

    # The first record whose UNI equals this record's name_link, or nil when
    # there is no link or no such record.
    def related
      return nil unless name_link
      @geotest.records_by_uni[name_link]&.first
    end

    # The cluster this record belongs to, or an empty array when it has none.
    def related_cluster
      @geotest.related_clusters[uni] || []
    end
  end
end

# Command-line entry point: parse the options, then analyze the given file.
options = {}
parser = OptionParser.new do |opts|
  opts.banner = "Usage: #{$0} [options] file"

  opts.on("-v", "--verbose", "Describe all failures") do
    options[:verbose] = true
  end

  opts.on("-h", "--help", "Prints this help") do
    puts opts
    exit
  end
end

begin
  # With no arguments at all, behave as if --help had been given.
  parser.parse!(ARGV.empty? ? ["--help"] : ARGV)
rescue OptionParser::ParseError => e
  # Unknown or malformed option: report it with the usage text instead of a
  # backtrace.
  abort "#{e.message}\n#{parser}"
end

# parse! removed the recognized options from ARGV; what is left is the file.
file = ARGV[0]
# Options without a file would otherwise fail later inside File.read(nil).
abort parser.to_s unless file

Interscript::GeoTest.start(file, **options)