Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 3 additions & 13 deletions lib/debug/rank_eval.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def evaluate
},
ratings: data[:judgements].map do |judgement|
{
_index: index_for_link(judgement[:link]),
_index: govuk_index_name,
_id: judgement[:link],
rating: judgement[:score],
}
Expand Down Expand Up @@ -92,8 +92,8 @@ def rank_eval(requests)
headers: { "Content-Type" => "application/json" },
timeout: 120,
}
indices = "*"
url = "#{uri}/#{indices}/_rank_eval"
index = SearchConfig.govuk_index_name
url = "#{uri}/#{index}/_rank_eval"
response = HTTParty.post(url, options)
puts "Elasticsearch: #{response.code}: #{response.message}"
JSON.parse(response.body).with_indifferent_access
Expand All @@ -111,16 +111,6 @@ def ignore_extra_judgements(data)
end
end

# Chooses which resolved index name a judgement link is rated against:
# links beginning with "/government/" use government_index_name, every
# other link uses govuk_index_name.
def index_for_link(link)
  if link.start_with?("/government/")
    government_index_name
  else
    govuk_index_name
  end
end

# Asks @search_config for the index behind SearchConfig.content_index_names
# and caches a truthy answer in @government_index_name for later calls.
def government_index_name
  return @government_index_name if @government_index_name

  @government_index_name = @search_config.get_index_for_alias(SearchConfig.content_index_names)
end

# Asks @search_config for the index behind SearchConfig.govuk_index_name
# and caches a truthy answer in @govuk_index_name for later calls.
def govuk_index_name
  return @govuk_index_name if @govuk_index_name

  @govuk_index_name = @search_config.get_index_for_alias(SearchConfig.govuk_index_name)
end
Expand Down
8 changes: 1 addition & 7 deletions lib/search/duplicate_finder.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,9 @@ module Search
class DuplicateFinder
TIMEOUT = 60

attr_reader :index

# Keeps the given index name in @index; the `index` attr_reader exposes it.
def initialize(index:)
@index = index
end

def find_duplicates
response = Services.elasticsearch(timeout: TIMEOUT).search(
index: index,
index: SearchConfig.govuk_index_name,
size: 0,
body: {
aggs: {
Expand Down
7 changes: 3 additions & 4 deletions lib/search/duplicate_remover.rb
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
module Search
class DuplicateRemover
attr_reader :index, :logger
attr_reader :logger

def initialize(index:, logger: Logger.new($stdout))
@index = index
def initialize(logger: Logger.new($stdout))
@logger = logger
end

Expand Down Expand Up @@ -39,7 +38,7 @@ def sort_by_updated_at(documents)

def delete_document(link)
Services.elasticsearch.delete_by_query(
index: index,
index: SearchConfig.govuk_index_name,
body: { query: { term: { link: link } } },
)
logger.info "Deleted duplicate document: #{link}"
Expand Down
4 changes: 2 additions & 2 deletions lib/tasks/debug.rake
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ namespace :debug do
end

desc "New synonyms test"
task :show_new_synonyms, [:query, :index_name] do |_, args|
index = args.index_name || SearchConfig.govuk_index_name
task :show_new_synonyms, [:query] do |_, args|
index = SearchConfig.govuk_index_name
model = Debug::Synonyms::Analyzer.new(index: index)

search_tokens = model.analyze_query(args.query)
Expand Down
9 changes: 4 additions & 5 deletions lib/tasks/delete.rake
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,15 @@ namespace :delete do
end

desc "
Delete all documents by format from an index.
Delete all documents by format from the govuk index.
Usage
rake 'delete:by_format[format_name, elasticsearch_index]'
rake 'delete:by_format[format_name]'
"
task :by_format, [:format, :index_name] do |_, args|
task :by_format, [:format] do |_, args|
format = args[:format]
index = args[:index_name]
index = SearchConfig.govuk_index_name

abort "Specify format for deletion" if format.nil?
abort "Specify an index" if index.nil?

warn_for_single_cluster_run
client = Services.elasticsearch(cluster: Clusters.default_cluster, timeout: 5.0)
Expand Down
14 changes: 5 additions & 9 deletions lib/tasks/duplicates.rake
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@ require "rummager"

namespace :duplicates do
desc "Find all documents with the same content_id"
task :find, [:index] do |_t, args|
index = args[:index] || "government"

duplicates = Search::DuplicateFinder.new(index:).find_duplicates
task :find do
duplicates = Search::DuplicateFinder.new.find_duplicates

duplicates.each do |duplicate|
puts "Content_id: #{duplicate[:content_id]}"
Expand All @@ -16,12 +14,10 @@ namespace :duplicates do
end

desc "Find all documents with the same content_id and remove them"
task :remove, [:index] do |_t, args|
index = args[:index] || "government"

duplicates = Search::DuplicateFinder.new(index:).find_duplicates
task :remove do
duplicates = Search::DuplicateFinder.new.find_duplicates
puts "No duplicates found" if duplicates.empty?

Search::DuplicateRemover.new(index:).remove_duplicates(duplicates: duplicates)
Search::DuplicateRemover.new.remove_duplicates(duplicates: duplicates)
end
end
38 changes: 0 additions & 38 deletions lib/tasks/export.rake

This file was deleted.

4 changes: 1 addition & 3 deletions lib/tasks/indices.rake
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,7 @@ govuk_document_types gem using sidekiq jobs.
This does not update the schema.
"
task :update_supertypes do
index_names.each do |index_name|
GovukIndex::Updater.update(index_name, GovukIndex::SupertypeJob)
end
GovukIndex::Updater.update(SearchConfig.govuk_index_name, GovukIndex::SupertypeJob)
end

desc "Migrate the data to a new schema definition
Expand Down
6 changes: 3 additions & 3 deletions spec/integration/search/duplicate_finder_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@
let(:index) { "govuk_test" }
describe "there are no documents in Elasticsearch" do
it "returns an empty array" do
expect(Search::DuplicateFinder.new(index:).find_duplicates).to eq([])
expect(Search::DuplicateFinder.new.find_duplicates).to eq([])
end
end
describe "there are documents in Elasticsearch, none have a duplicate content_id" do
it "returns an empty array" do
(1..10).each do |n|
commit_document(index, { link: "link/path#{n}", content_id: SecureRandom.uuid })
end
expect(Search::DuplicateFinder.new(index:).find_duplicates).to be_empty
expect(Search::DuplicateFinder.new.find_duplicates).to be_empty
end
end
describe "there are documents in Elasticsearch, some have a duplicate content_id" do
Expand All @@ -27,7 +27,7 @@
commit_document(index, { link: "link/path_c", content_id: "other", title: "title_c", updated_at: date_2 })
commit_document(index, { link: "link/path_d", content_id: "other", title: "title_d" })

result = Search::DuplicateFinder.new(index:).find_duplicates
result = Search::DuplicateFinder.new.find_duplicates

expect(result).to match_array([
a_hash_including(
Expand Down
6 changes: 3 additions & 3 deletions spec/integration/search/duplicate_remover_spec.rb
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
require "spec_helper"

RSpec.describe Search::DuplicateRemover do
let(:index) { "government_test" }
let(:index) { "govuk_test" }
let(:io) { StringIO.new }
let(:logger) { Logger.new(io) }
let(:duplicates) { Search::DuplicateFinder.new(index:).find_duplicates }
subject(:remover) { described_class.new(index:, logger:) }
let(:duplicates) { Search::DuplicateFinder.new.find_duplicates }
subject(:remover) { described_class.new(logger:) }

context "A set of duplicate documents has no updated_at field" do
before :each do
Expand Down
16 changes: 4 additions & 12 deletions spec/integration/tasks/delete_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -54,28 +54,20 @@
describe "delete:by_format" do
let(:task_name) { "delete:by_format" }
let(:task) { Rake::Task[task_name] }
let(:index) { SearchConfig.all_index_names.first }
let(:index) { SearchConfig.govuk_index_name }
let(:format) { "answer" }

context "when format is missing" do
it "prints a warning" do
expect {
task.invoke(nil, index)
task.invoke(nil)
}.to output("Specify format for deletion\n").to_stderr.and raise_error(SystemExit)
end
end

context "when index_name is missing" do
it "prints a warning" do
expect {
task.invoke(format, nil)
}.to output("Specify an index\n").to_stderr.and raise_error(SystemExit)
end
end

context "when there are no documents for the format" do
it "prints no documents to delete" do
output = capture_stdout { task.invoke(format, index) }
output = capture_stdout { task.invoke(format) }
expect(output).to match(/No #{format} documents to delete/)
end
end
Expand All @@ -87,7 +79,7 @@

it "deletes all documents in batches" do
output = capture_stdout do
expect { task.invoke(format, index) }.to change {
expect { task.invoke(format) }.to change {
client.count(index:, body: { query: { term: { format: format } } })["count"]
}.from(3).to(0)
end
Expand Down
3 changes: 1 addition & 2 deletions spec/support/rank_eval_test_helpers.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ def mock_judgement_csv
CSV.generate do |csv|
csv << %w[query rating link score]
csv << ["harry potter", "relevant", "/harry-potter", 3]
# use /government to test fetching alias for government index
csv << ["passport", "relevant", "/government/renew-a-passport", 3]
# add repeated row to test ignore_extra_judgements
csv << ["passport", "near", "/government/renew-a-passport", 2]
Expand All @@ -24,7 +23,7 @@ def rank_eval_expected_output

def stub_rank_eval_request
es_source = ENV["ELASTICSEARCH_URI"] || "http://localhost:9200"
stub_request(:post, "#{es_source}/*/_rank_eval")
stub_request(:post, "#{es_source}/govuk_test/_rank_eval")
.to_return(
status: 200,
body: {
Expand Down
13 changes: 5 additions & 8 deletions spec/unit/tasks/duplicates_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,19 @@
},
]
end
let(:index) { "govuk" }

before do
Rake::Task[task_name].reenable
allow(Search::DuplicateFinder)
.to receive(:new)
.with(index:)
.and_return(double(find_duplicates: fake_duplicates))
.and_return(double(find_duplicates: fake_duplicates))
end

describe "duplicates:find" do
let(:task_name) { "duplicates:find" }

it "prints duplicate sets in the expected format" do
output = capture_stdout { Rake::Task[task_name].invoke(index) }
output = capture_stdout { Rake::Task[task_name].invoke }

expect(output).to include("Content_id: aaa-111")
expect(output).to include(" T1 /a1 2020-01-01")
Expand All @@ -51,19 +49,18 @@
before do
allow(Search::DuplicateRemover)
.to receive(:new)
.with(index:)
.and_return(duplicate_remover)
.and_return(duplicate_remover)
end
describe "there are duplicates" do
it "removes duplicates" do
Rake::Task[task_name].invoke(index)
Rake::Task[task_name].invoke
expect(duplicate_remover).to have_received(:remove_duplicates).with(duplicates: fake_duplicates).once
end
end
describe "there are no duplicates" do
let(:fake_duplicates) { [] }
it "does not remove duplicates" do
output = capture_stdout { Rake::Task[task_name].invoke(index) }
output = capture_stdout { Rake::Task[task_name].invoke }
expect(output).to eq("No duplicates found\n")
end
end
Expand Down
10 changes: 4 additions & 6 deletions spec/unit/tasks/indices_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -162,16 +162,14 @@
describe "search:update_supertypes" do
let(:task_name) { "search:update_supertypes" }

it "updates supertypes for all indices" do
it "updates supertypes for govuk index" do
allow(GovukIndex::Updater).to receive(:update)

Rake::Task[task_name].invoke

index_names.each do |index_name|
expect(GovukIndex::Updater)
.to have_received(:update)
.with(index_name, GovukIndex::SupertypeJob)
end
expect(GovukIndex::Updater)
.to have_received(:update)
.with(govuk_index_name, GovukIndex::SupertypeJob)
end
end

Expand Down
Loading