Skip to content

Commit da73dde

Browse files
committed
Add script to extract glossary
1 parent c239214 commit da73dde

File tree

1 file changed

+118
-2
lines changed

1 file changed

+118
-2
lines changed

script/update-docs.rb

Lines changed: 118 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
require "set"
99
require 'fileutils'
1010
require 'yaml'
11+
require 'json'
1112
require 'diffy'
1213
require_relative "version"
1314
require_relative 'asciidoctor-extensions'
@@ -95,6 +96,86 @@ def extract_headings(html)
9596
headings
9697
end
9798

99+
# Build a term => definition-HTML map from the rendered gitglossary page.
#
# content - HTML of the glossary page. If it carries a "---"-delimited
#           front matter block, that block is skipped.
# lang    - language code of the page; it selects the glossary URL that
#           intra-glossary links are rewritten to ('en' gets no suffix).
#
# Returns a Hash mapping each term name (the stripped text of a <dt> that
# contains an <a id="def_..."> anchor) to the HTML of the following <dd>.
# Raises RuntimeError ('Expected dd') when such a <dt> is not immediately
# followed by a <dd> element.
def extract_glossary_from_html(content, lang = 'en')
  # Skip front matter. The limit of 3 keeps any later "---" line inside the
  # body instead of cutting the document short at that line.
  content = content.split(/^---$/, 3)[2] || content

  doc = Nokogiri::HTML::DocumentFragment.parse(content)

  # Fix up the links because they're going to be on a different page.
  # The target only depends on the language, so compute it once.
  glossary_url = lang == 'en' ? '/docs/gitglossary' : "/docs/gitglossary/#{lang}"

  glossary = {}

  doc.css('dt').each do |dt|
    # Only <dt> elements carrying a "def_..." anchor are glossary terms;
    # the selector itself guarantees the id prefix.
    next unless dt.at_css('a[id^="def_"]')

    term_name = dt.text.strip
    # hack to handle this one weird (also) thing: a single entry (and its
    # French translation) that lists two spellings of the same term
    term_names =
      case term_name
      when 'tree-ish (also treeish)' then ['tree-ish', 'treeish']
      when 'arbre-esque (aussi arbresque)' then ['arbre-esque', 'arbresque']
      else [term_name]
      end

    dd = dt.next_element
    raise 'Expected dd' unless dd&.name == 'dd'

    # Re-parse the definition on its own so that rewriting its links does
    # not touch the source document.
    definition_fragment = Nokogiri::HTML::DocumentFragment.parse(dd.inner_html.strip)
    definition_fragment.css('a[href^="#def_"]').each do |link|
      link['href'] = "#{glossary_url}#{link['href']}"
      link['target'] = '_blank'
    end
    definition = definition_fragment.to_html

    term_names.each { |term| glossary[term] = definition }
  end

  glossary
end
151+
152+
# Write one pretty-printed JSON file per language to
# "#{SITE_ROOT}static/js/glossary/<lang>.json", creating the directory
# when needed. Nothing is written when the given hash is empty.
#
# glossary_data_by_lang - Hash of language code => glossary data.
def save_glossary_files(glossary_data_by_lang)
  unless glossary_data_by_lang.empty?
    target_dir = "#{SITE_ROOT}static/js/glossary"
    FileUtils.mkdir_p(target_dir)

    glossary_data_by_lang.each do |lang, terms|
      json_path = "#{target_dir}/#{lang}.json"
      puts " saving glossary data to #{json_path} (#{terms.size} terms)"
      File.write(json_path, JSON.pretty_generate(terms))
    end
  end
end
164+
165+
# Wrap every "&lt;term&gt;" placeholder whose term is a key of the
# glossary for the given language in a <span class="hover-term"> element.
#
# html                  - HTML string to scan; it is not modified.
# glossary_data_by_lang - Hash of language code => (term => definition).
# lang                  - language code whose glossary is consulted; a
#                         language without a glossary marks nothing.
#
# Returns a new String with the known terms wrapped.
def mark_glossary_tooltips(html, glossary_data_by_lang, lang)
  current_glossary = glossary_data_by_lang[lang] || {}

  # Match the entity-escaped form only: a raw /<...>/ pattern would run
  # greedily across real HTML tags and never isolate a single term.
  # [^&] keeps a match from running past the closing "&gt;".
  html.gsub(/&lt;([^&]+)&gt;/) do |match|
    term = Regexp.last_match(1)
    # Only mark terms that exist in the glossary
    if current_glossary.key?(term)
      "<span class=\"hover-term\" data-term=\"#{term}\">&lt;#{term}&gt;</span>"
    else
      match
    end
  end
end
178+
98179
def index_l10n_doc(filter_tags, doc_list, get_content)
99180
rebuild = ENV.fetch("REBUILD_DOC", nil)
100181
rerun = ENV["RERUN"] || rebuild || false
@@ -139,8 +220,15 @@ def index_l10n_doc(filter_tags, doc_list, get_content)
139220
end
140221

141222
check_paths = Set.new([])
223+
glossary_data_by_lang = {}
224+
225+
# Process glossary docs first so that we can use the parsed glossary to mark
226+
# tooltip items in the other documents
227+
glossary_docs = doc_files.select { |entry| File.basename(entry[0], ".#{ext}") == 'gitglossary' }
228+
other_docs = doc_files.reject { |entry| File.basename(entry[0], ".#{ext}") == 'gitglossary' }
229+
ordered_docs = glossary_docs + other_docs
142230

143-
doc_files.each do |entry|
231+
ordered_docs.each do |entry|
144232
full_path, sha = entry
145233
ids = Set.new([])
146234
lang = File.dirname(full_path)
@@ -177,6 +265,12 @@ def index_l10n_doc(filter_tags, doc_list, get_content)
177265
next if !rerun && lang_data[lang] == asciidoc_sha
178266

179267
html = asciidoc.render
268+
269+
if path == 'gitglossary'
270+
glossary_data_by_lang[lang] = extract_glossary_from_html(html, lang)
271+
puts " extracted #{glossary_data_by_lang[lang].size} glossary terms for #{lang}"
272+
end
273+
180274
html.gsub!(/linkgit:(\S+?)\[(\d+)\]/) do |line|
181275
x = /^linkgit:(\S+?)\[(\d+)\]/.match(line)
182276
relurl = "docs/#{x[1].gsub(/&#x2d;/, '-')}/#{lang}"
@@ -223,6 +317,8 @@ def index_l10n_doc(filter_tags, doc_list, get_content)
223317
"#{before}{{< relurl \"#{after}\" >}}"
224318
end
225319

320+
html = mark_glossary_tooltips(html, glossary_data_by_lang, lang)
321+
226322
# Write <docname>/<lang>.html
227323
front_matter = {
228324
"category" => "manual",
@@ -248,6 +344,8 @@ def index_l10n_doc(filter_tags, doc_list, get_content)
248344
lang_data[lang] = asciidoc_sha
249345
end
250346

347+
save_glossary_files(glossary_data_by_lang)
348+
251349
# In some cases, translations are not complete. As a consequence, some
252350
# translated manual pages may point to other translated manual pages that do
253351
# not exist. In these cases, redirect to the English version.
@@ -432,8 +530,15 @@ def index_doc(filter_tags, doc_list, get_content)
432530
end
433531

434532
check_paths = Set.new([])
533+
glossary_data_by_lang = {}
435534

436-
doc_files.each do |entry|
535+
# Process glossary docs first so that we can use the parsed glossary to mark
536+
# tooltip items in the other documents
537+
glossary_docs = doc_files.select { |entry| File.basename(entry[0].sub(/\.adoc$/, '.txt'), '.txt') == 'gitglossary' }
538+
other_docs = doc_files.reject { |entry| File.basename(entry[0].sub(/\.adoc$/, '.txt'), '.txt') == 'gitglossary' }
539+
ordered_docs = glossary_docs + other_docs
540+
541+
ordered_docs.each do |entry|
437542
path, sha = entry
438543
txt_path = path.sub(/\.adoc$/, '.txt')
439544
ids = Set.new([])
@@ -482,6 +587,12 @@ def index_doc(filter_tags, doc_list, get_content)
482587

483588
# Generate HTML
484589
html = asciidoc.render
590+
591+
if docname == 'gitglossary'
592+
glossary_data_by_lang['en'] = extract_glossary_from_html(html, 'en')
593+
puts " extracted #{glossary_data_by_lang['en'].size} glossary terms for 'en'"
594+
end
595+
485596
html.gsub!(/linkgit:+(\S+?)\[(\d+)\]/) do |line|
486597
x = /^linkgit:+(\S+?)\[(\d+)\]/.match(line)
487598
if x[1] == "curl"
@@ -522,6 +633,8 @@ def index_doc(filter_tags, doc_list, get_content)
522633
"#{before}{{< relurl \"#{after}\" >}}"
523634
end
524635

636+
html = mark_glossary_tooltips(html, glossary_data_by_lang, 'en')
637+
525638
doc_versions = version_map.keys.sort{|a, b| Version.version_to_num(a) <=> Version.version_to_num(b)}
526639
doc_version_index = doc_versions.index(version)
527640

@@ -640,6 +753,9 @@ def index_doc(filter_tags, doc_list, get_content)
640753
end
641754
end
642755
end
756+
757+
save_glossary_files(glossary_data_by_lang)
758+
643759
data["latest-version"] = version if !data["latest-version"] || Version.version_to_num(data["latest-version"]) < Version.version_to_num(version)
644760
end
645761

0 commit comments

Comments
 (0)