From 15273ba1649c0ccbc51e30a0db526374049a088f Mon Sep 17 00:00:00 2001 From: Emma Date: Wed, 22 Apr 2026 14:44:07 +0100 Subject: [PATCH 1/7] Update duplicates rake tasks to use govuk index The govuk index will be the only relevant index to run these rake tasks on, once the government index has been retired (this is in progress). The duplicates are located by content_id, which only appears as a field in the content indexes and not the auxiliary indexes. --- lib/tasks/duplicates.rake | 8 ++++---- spec/unit/tasks/duplicates_spec.rb | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/tasks/duplicates.rake b/lib/tasks/duplicates.rake index a85f6d6aa..ab74a9f91 100644 --- a/lib/tasks/duplicates.rake +++ b/lib/tasks/duplicates.rake @@ -2,8 +2,8 @@ require "rummager" namespace :duplicates do desc "Find all documents with the same content_id" - task :find, [:index] do |_t, args| - index = args[:index] || "government" + task :find do + index = SearchConfig.govuk_index_name duplicates = Search::DuplicateFinder.new(index:).find_duplicates @@ -16,8 +16,8 @@ namespace :duplicates do end desc "Find all documents with the same content_id and remove them" - task :remove, [:index] do |_t, args| - index = args[:index] || "government" + task :remove do + index = SearchConfig.govuk_index_name duplicates = Search::DuplicateFinder.new(index:).find_duplicates puts "No duplicates found" if duplicates.empty? 
diff --git a/spec/unit/tasks/duplicates_spec.rb b/spec/unit/tasks/duplicates_spec.rb index 708715c41..4945e0338 100644 --- a/spec/unit/tasks/duplicates_spec.rb +++ b/spec/unit/tasks/duplicates_spec.rb @@ -20,7 +20,7 @@ }, ] end - let(:index) { "govuk" } + let(:index) { "govuk_test" } before do Rake::Task[task_name].reenable From 193520cd1a72baa85cad5175bfeae692e9db49ef Mon Sep 17 00:00:00 2001 From: Emma Date: Tue, 28 Apr 2026 17:04:55 +0100 Subject: [PATCH 2/7] Refactor duplicates This code can be simplified now that we're only using the duplicates rake tasks on the govuk index. --- lib/search/duplicate_finder.rb | 8 +------- lib/search/duplicate_remover.rb | 7 +++---- lib/tasks/duplicates.rake | 10 +++------- spec/integration/search/duplicate_finder_spec.rb | 6 +++--- spec/integration/search/duplicate_remover_spec.rb | 6 +++--- spec/unit/tasks/duplicates_spec.rb | 13 +++++-------- 6 files changed, 18 insertions(+), 32 deletions(-) diff --git a/lib/search/duplicate_finder.rb b/lib/search/duplicate_finder.rb index 79619f9d8..504d19be5 100644 --- a/lib/search/duplicate_finder.rb +++ b/lib/search/duplicate_finder.rb @@ -2,15 +2,9 @@ module Search class DuplicateFinder TIMEOUT = 60 - attr_reader :index - - def initialize(index:) - @index = index - end - def find_duplicates response = Services.elasticsearch(timeout: TIMEOUT).search( - index: index, + index: SearchConfig.govuk_index_name, size: 0, body: { aggs: { diff --git a/lib/search/duplicate_remover.rb b/lib/search/duplicate_remover.rb index c9396ab7e..54d11c55c 100644 --- a/lib/search/duplicate_remover.rb +++ b/lib/search/duplicate_remover.rb @@ -1,9 +1,8 @@ module Search class DuplicateRemover - attr_reader :index, :logger + attr_reader :logger - def initialize(index:, logger: Logger.new($stdout)) - @index = index + def initialize(logger: Logger.new($stdout)) @logger = logger end @@ -39,7 +38,7 @@ def sort_by_updated_at(documents) def delete_document(link) Services.elasticsearch.delete_by_query( - index: index, + 
index: SearchConfig.govuk_index_name, body: { query: { term: { link: link } } }, ) logger.info "Deleted duplicate document: #{link}" diff --git a/lib/tasks/duplicates.rake b/lib/tasks/duplicates.rake index ab74a9f91..c05f73e05 100644 --- a/lib/tasks/duplicates.rake +++ b/lib/tasks/duplicates.rake @@ -3,9 +3,7 @@ require "rummager" namespace :duplicates do desc "Find all documents with the same content_id" task :find do - index = SearchConfig.govuk_index_name - - duplicates = Search::DuplicateFinder.new(index:).find_duplicates + duplicates = Search::DuplicateFinder.new.find_duplicates duplicates.each do |duplicate| puts "Content_id: #{duplicate[:content_id]}" @@ -17,11 +15,9 @@ namespace :duplicates do desc "Find all documents with the same content_id and remove them" task :remove do - index = SearchConfig.govuk_index_name - - duplicates = Search::DuplicateFinder.new(index:).find_duplicates + duplicates = Search::DuplicateFinder.new.find_duplicates puts "No duplicates found" if duplicates.empty? 
- Search::DuplicateRemover.new(index:).remove_duplicates(duplicates: duplicates) + Search::DuplicateRemover.new.remove_duplicates(duplicates: duplicates) end end diff --git a/spec/integration/search/duplicate_finder_spec.rb b/spec/integration/search/duplicate_finder_spec.rb index 692bc2c37..fa5a4bcaa 100644 --- a/spec/integration/search/duplicate_finder_spec.rb +++ b/spec/integration/search/duplicate_finder_spec.rb @@ -4,7 +4,7 @@ let(:index) { "govuk_test" } describe "there are no documents in Elasticsearch" do it "returns an empty array" do - expect(Search::DuplicateFinder.new(index:).find_duplicates).to eq([]) + expect(Search::DuplicateFinder.new.find_duplicates).to eq([]) end end describe "there are documents in Elasticsearch, none have a duplicate content_id" do @@ -12,7 +12,7 @@ (1..10).each do |n| commit_document(index, { link: "link/path#{n}", content_id: SecureRandom.uuid }) end - expect(Search::DuplicateFinder.new(index:).find_duplicates).to be_empty + expect(Search::DuplicateFinder.new.find_duplicates).to be_empty end end describe "there are documents in Elasticsearch, some have a duplicate content_id" do @@ -27,7 +27,7 @@ commit_document(index, { link: "link/path_c", content_id: "other", title: "title_c", updated_at: date_2 }) commit_document(index, { link: "link/path_d", content_id: "other", title: "title_d" }) - result = Search::DuplicateFinder.new(index:).find_duplicates + result = Search::DuplicateFinder.new.find_duplicates expect(result).to match_array([ a_hash_including( diff --git a/spec/integration/search/duplicate_remover_spec.rb b/spec/integration/search/duplicate_remover_spec.rb index e26e2d6cb..44671d20f 100644 --- a/spec/integration/search/duplicate_remover_spec.rb +++ b/spec/integration/search/duplicate_remover_spec.rb @@ -1,11 +1,11 @@ require "spec_helper" RSpec.describe Search::DuplicateRemover do - let(:index) { "government_test" } + let(:index) { "govuk_test" } let(:io) { StringIO.new } let(:logger) { Logger.new(io) } - 
let(:duplicates) { Search::DuplicateFinder.new(index:).find_duplicates } - subject(:remover) { described_class.new(index:, logger:) } + let(:duplicates) { Search::DuplicateFinder.new.find_duplicates } + subject(:remover) { described_class.new(logger:) } context "A set of duplicate documents has no updated_at field" do before :each do diff --git a/spec/unit/tasks/duplicates_spec.rb b/spec/unit/tasks/duplicates_spec.rb index 4945e0338..4ef4165b1 100644 --- a/spec/unit/tasks/duplicates_spec.rb +++ b/spec/unit/tasks/duplicates_spec.rb @@ -20,21 +20,19 @@ }, ] end - let(:index) { "govuk_test" } before do Rake::Task[task_name].reenable allow(Search::DuplicateFinder) .to receive(:new) - .with(index:) - .and_return(double(find_duplicates: fake_duplicates)) + .and_return(double(find_duplicates: fake_duplicates)) end describe "duplicates:find" do let(:task_name) { "duplicates:find" } it "prints duplicate sets in the expected format" do - output = capture_stdout { Rake::Task[task_name].invoke(index) } + output = capture_stdout { Rake::Task[task_name].invoke } expect(output).to include("Content_id: aaa-111") expect(output).to include(" T1 /a1 2020-01-01") @@ -51,19 +49,18 @@ before do allow(Search::DuplicateRemover) .to receive(:new) - .with(index:) - .and_return(duplicate_remover) + .and_return(duplicate_remover) end describe "there are duplicates" do it "removes duplicates" do - Rake::Task[task_name].invoke(index) + Rake::Task[task_name].invoke expect(duplicate_remover).to have_received(:remove_duplicates).with(duplicates: fake_duplicates).once end end describe "there are no duplicates" do let(:fake_duplicates) { [] } it "does not remove duplicates" do - output = capture_stdout { Rake::Task[task_name].invoke(index) } + output = capture_stdout { Rake::Task[task_name].invoke } expect(output).to eq("No duplicates found\n") end end From 807786bb5b143b199683a0212a80056309bae06c Mon Sep 17 00:00:00 2001 From: Emma Date: Wed, 22 Apr 2026 17:08:33 +0100 Subject: [PATCH 3/7] Remove 
export:search rake task This has been broken since 2020, so it seems unlikely that it will be missed. We can get the results in a json format from search API anyway. See PR that broke the task: https://github.com/alphagov/search-api/pull/2062 --- lib/tasks/export.rake | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 lib/tasks/export.rake diff --git a/lib/tasks/export.rake b/lib/tasks/export.rake deleted file mode 100644 index e60b6f041..000000000 --- a/lib/tasks/export.rake +++ /dev/null @@ -1,38 +0,0 @@ -require "csv" -require "rack" -require "rummager" - -namespace :export do - desc "Get all results which match the given search. Set FIELDS to control the exported fields." - task :search, [:query_string] do |_, args| - params = Rack::Utils.parse_nested_query(args.query_string) - .merge("fields" => "content_id,#{ENV.fetch('FIELDS', '')}") - .transform_values { |v| [v] } - search_params = SearchConfig.parse_parameters(params) - query = search_params.search_config.generate_query_for_params(search_params) - query[:sort] = %i[document_type _uid] - fields = search_params.return_fields.uniq - base_uri = search_params.search_config.base_uri - - CSV.open("export-search.csv", "wb", headers: fields, write_headers: true, force_quotes: true) do |csv| - ScrollEnumerator.new( - client: Services.elasticsearch(hosts: base_uri), - index_names: SearchConfig.content_index_names + [SearchConfig.govuk_index_name], - search_body: query, - ) do |hit| - csv << fields.map do |f| - value = hit["_source"][f] - - case value - when Hash - value.fetch("slug", value) - when Array - value.join(",") - else - value - end - end - end - end - end -end From 7efc2ab40781489f34f171a280cb01ced330f0cc Mon Sep 17 00:00:00 2001 From: Emma Date: Fri, 24 Apr 2026 11:00:40 +0100 Subject: [PATCH 4/7] Update delete:by_format rake task to use govuk index The govuk index will be the only relevant index to run this rake task on, once the government index has been 
retired (this is in progress). That's because the format field only appears in the content indexes, not the auxiliary indexes. --- lib/tasks/delete.rake | 9 ++++----- spec/integration/tasks/delete_spec.rb | 16 ++++------------ 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/lib/tasks/delete.rake b/lib/tasks/delete.rake index b3d3d4f07..1ab984c2d 100644 --- a/lib/tasks/delete.rake +++ b/lib/tasks/delete.rake @@ -14,16 +14,15 @@ namespace :delete do end desc " - Delete all documents by format from an index. + Delete all documents by format from the govuk index. Usage - rake 'delete:by_format[format_name, elasticsearch_index]' + rake 'delete:by_format[format_name]' " - task :by_format, [:format, :index_name] do |_, args| + task :by_format, [:format] do |_, args| format = args[:format] - index = args[:index_name] + index = SearchConfig.govuk_index_name abort "Specify format for deletion" if format.nil? - abort "Specify an index" if index.nil? warn_for_single_cluster_run client = Services.elasticsearch(cluster: Clusters.default_cluster, timeout: 5.0) diff --git a/spec/integration/tasks/delete_spec.rb b/spec/integration/tasks/delete_spec.rb index 48aa9fcce..ed6d6ba3f 100644 --- a/spec/integration/tasks/delete_spec.rb +++ b/spec/integration/tasks/delete_spec.rb @@ -54,28 +54,20 @@ describe "delete:by_format" do let(:task_name) { "delete:by_format" } let(:task) { Rake::Task[task_name] } - let(:index) { SearchConfig.all_index_names.first } + let(:index) { SearchConfig.govuk_index_name } let(:format) { "answer" } context "when format is missing" do it "prints a warning" do expect { - task.invoke(nil, index) + task.invoke(nil) }.to output("Specify format for deletion\n").to_stderr.and raise_error(SystemExit) end end - context "when index_name is missing" do - it "prints a warning" do - expect { - task.invoke(format, nil) - }.to output("Specify an index\n").to_stderr.and raise_error(SystemExit) - end - end - context "when there are no documents for the format" 
do it "prints no documents to delete" do - output = capture_stdout { task.invoke(format, index) } + output = capture_stdout { task.invoke(format) } expect(output).to match(/No #{format} documents to delete/) end end @@ -87,7 +79,7 @@ it "deletes all documents in batches" do output = capture_stdout do - expect { task.invoke(format, index) }.to change { + expect { task.invoke(format) }.to change { client.count(index:, body: { query: { term: { format: format } } })["count"] }.from(3).to(0) end From 915bd4b5fc54e0a1f2c873b4dbeac491aa82c1ad Mon Sep 17 00:00:00 2001 From: Emma Date: Fri, 24 Apr 2026 11:11:15 +0100 Subject: [PATCH 5/7] Update debug:show_new_synonyms to use govuk index The govuk index will be the only relevant index to run this rake task on, once the government index has been retired (this is in progress). That's because the title and description fields only appear in the content indexes, not the auxiliary indexes. --- lib/tasks/debug.rake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/tasks/debug.rake b/lib/tasks/debug.rake index ea0398ea9..df33154a7 100644 --- a/lib/tasks/debug.rake +++ b/lib/tasks/debug.rake @@ -28,8 +28,8 @@ namespace :debug do end desc "New synonyms test" - task :show_new_synonyms, [:query, :index_name] do |_, args| - index = args.index_name || SearchConfig.govuk_index_name + task :show_new_synonyms, [:query] do |_, args| + index = SearchConfig.govuk_index_name model = Debug::Synonyms::Analyzer.new(index: index) search_tokens = model.analyze_query(args.query) From 721cda04d10f38f63a7b70dfc85b932ec937595a Mon Sep 17 00:00:00 2001 From: Emma Date: Fri, 24 Apr 2026 11:44:59 +0100 Subject: [PATCH 6/7] Update search:update_supertypes to use govuk index The govuk index will be the only relevant index to run this rake task on, once the government index has been retired (this is in progress). That's because the content_store_document_type field only appears in the content indexes, not the auxiliary indexes. 
--- lib/tasks/indices.rake | 4 +--- spec/unit/tasks/indices_spec.rb | 10 ++++------ 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/lib/tasks/indices.rake b/lib/tasks/indices.rake index 033ad4125..c61dba134 100644 --- a/lib/tasks/indices.rake +++ b/lib/tasks/indices.rake @@ -88,9 +88,7 @@ govuk_document_types gem using sidekiq jobs. This does not update the schema. " task :update_supertypes do - index_names.each do |index_name| - GovukIndex::Updater.update(index_name, GovukIndex::SupertypeJob) - end + GovukIndex::Updater.update(SearchConfig.govuk_index_name, GovukIndex::SupertypeJob) end desc "Migrate the data to a new schema definition diff --git a/spec/unit/tasks/indices_spec.rb b/spec/unit/tasks/indices_spec.rb index 51ef03ac4..a8f4467b0 100644 --- a/spec/unit/tasks/indices_spec.rb +++ b/spec/unit/tasks/indices_spec.rb @@ -162,16 +162,14 @@ describe "search:update_supertypes" do let(:task_name) { "search:update_supertypes" } - it "updates supertypes for all indices" do + it "updates supertypes for govuk index" do allow(GovukIndex::Updater).to receive(:update) Rake::Task[task_name].invoke - index_names.each do |index_name| - expect(GovukIndex::Updater) - .to have_received(:update) - .with(index_name, GovukIndex::SupertypeJob) - end + expect(GovukIndex::Updater) + .to have_received(:update) + .with(govuk_index_name, GovukIndex::SupertypeJob) end end From 545abd623fbda90a9de37fc349db19be9a3a404c Mon Sep 17 00:00:00 2001 From: Emma Date: Fri, 24 Apr 2026 12:35:07 +0100 Subject: [PATCH 7/7] Update rank_eval to use govuk index All documents have now been migrated to the govuk index, so we can remove references to other indices. 
--- lib/debug/rank_eval.rb | 16 +++------------- spec/support/rank_eval_test_helpers.rb | 3 +-- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/lib/debug/rank_eval.rb b/lib/debug/rank_eval.rb index 5a23035de..4cadd7287 100644 --- a/lib/debug/rank_eval.rb +++ b/lib/debug/rank_eval.rb @@ -41,7 +41,7 @@ def evaluate }, ratings: data[:judgements].map do |judgement| { - _index: index_for_link(judgement[:link]), + _index: govuk_index_name, _id: judgement[:link], rating: judgement[:score], } @@ -92,8 +92,8 @@ def rank_eval(requests) headers: { "Content-Type" => "application/json" }, timeout: 120, } - indices = "*" - url = "#{uri}/#{indices}/_rank_eval" + index = SearchConfig.govuk_index_name + url = "#{uri}/#{index}/_rank_eval" response = HTTParty.post(url, options) puts "Elasticsearch: #{response.code}: #{response.message}" JSON.parse(response.body).with_indifferent_access @@ -111,16 +111,6 @@ def ignore_extra_judgements(data) end end - def index_for_link(link) - return government_index_name if link.start_with? 
"/government/" - - govuk_index_name - end - - def government_index_name - @government_index_name ||= @search_config.get_index_for_alias(SearchConfig.content_index_names) - end - def govuk_index_name @govuk_index_name ||= @search_config.get_index_for_alias(SearchConfig.govuk_index_name) end diff --git a/spec/support/rank_eval_test_helpers.rb b/spec/support/rank_eval_test_helpers.rb index 783682ab5..f83d46c33 100644 --- a/spec/support/rank_eval_test_helpers.rb +++ b/spec/support/rank_eval_test_helpers.rb @@ -6,7 +6,6 @@ def mock_judgement_csv CSV.generate do |csv| csv << %w[query rating link score] csv << ["harry potter", "relevant", "/harry-potter", 3] - # use /government to test fetching alias for government index csv << ["passport", "relevant", "/government/renew-a-passport", 3] # add repeated row to test ignore_extra_judgements csv << ["passport", "near", "/government/renew-a-passport", 2] @@ -24,7 +23,7 @@ def rank_eval_expected_output def stub_rank_eval_request es_source = ENV["ELASTICSEARCH_URI"] || "http://localhost:9200" - stub_request(:post, "#{es_source}/*/_rank_eval") + stub_request(:post, "#{es_source}/govuk_test/_rank_eval") .to_return( status: 200, body: {