Skip to content

Commit 3652732

Browse files
committed
🩹(backend) fix empty indexation batch
Because empty documents are filtered out of each batch during indexing, some batches can end up empty and cause an error. Such batches are now ignored. Also add a --batch-size argument to the index command. Signed-off-by: Fabre Florian <ffabre@hybird.org>
1 parent 5427f18 commit 3652732

File tree

3 files changed

+84
-11
lines changed

3 files changed

+84
-11
lines changed

src/backend/core/management/commands/index.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@ class Command(BaseCommand):
1717

1818
help = __doc__
1919

20+
def add_arguments(self, parser):
21+
"""Add the --batch-size argument that sets the indexation query batch size."""
22+
parser.add_argument(
23+
"--batch-size",
24+
action="store",
25+
dest="batch_size",
26+
type=int,
27+
default=50,
28+
help="Indexation query batch size",
29+
)
30+
2031
def handle(self, *args, **options):
2132
"""Launch and log search index generation."""
2233
indexer = get_document_indexer()
@@ -26,9 +37,10 @@ def handle(self, *args, **options):
2637

2738
logger.info("Starting to regenerate Find index...")
2839
start = time.perf_counter()
40+
batch_size = options["batch_size"]
2941

3042
try:
31-
count = indexer.index()
43+
count = indexer.index(batch_size=batch_size)
3244
except Exception as err:
3345
raise CommandError("Unable to regenerate index") from err
3446

src/backend/core/services/search_indexers.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -102,15 +102,11 @@ class BaseDocumentIndexer(ABC):
102102
`serialize_document()` and `push()` to define backend-specific behavior.
103103
"""
104104

105-
def __init__(self, batch_size=None):
105+
def __init__(self):
106106
"""
107107
Initialize the indexer.
108-
109-
Args:
110-
batch_size (int, optional): Number of documents per batch.
111-
Defaults to settings.SEARCH_INDEXER_BATCH_SIZE.
112108
"""
113-
self.batch_size = batch_size or settings.SEARCH_INDEXER_BATCH_SIZE
109+
self.batch_size = settings.SEARCH_INDEXER_BATCH_SIZE
114110
self.indexer_url = settings.SEARCH_INDEXER_URL
115111
self.indexer_secret = settings.SEARCH_INDEXER_SECRET
116112
self.search_url = settings.SEARCH_INDEXER_QUERY_URL
@@ -130,19 +126,26 @@ def __init__(self, batch_size=None):
130126
"SEARCH_INDEXER_QUERY_URL must be set in Django settings."
131127
)
132128

133-
def index(self, queryset=None):
129+
def index(self, queryset=None, batch_size=None):
134130
"""
135131
Fetch documents in batches, serialize them, and push to the search backend.
132+
133+
Args:
134+
queryset (optional): Document queryset
135+
Defaults to all documents without filter.
136+
batch_size (int, optional): Number of documents per batch.
137+
Defaults to settings.SEARCH_INDEXER_BATCH_SIZE.
136138
"""
137139
last_id = 0
138140
count = 0
139141
queryset = queryset or models.Document.objects.all()
142+
batch_size = batch_size or self.batch_size
140143

141144
while True:
142145
documents_batch = list(
143146
queryset.filter(
144147
id__gt=last_id,
145-
).order_by("id")[: self.batch_size]
148+
).order_by("id")[:batch_size]
146149
)
147150

148151
if not documents_batch:
@@ -158,8 +161,9 @@ def index(self, queryset=None):
158161
if document.content or document.title
159162
]
160163

161-
self.push(serialized_batch)
162-
count += len(serialized_batch)
164+
if serialized_batch:
165+
self.push(serialized_batch)
166+
count += len(serialized_batch)
163167

164168
return count
165169

src/backend/core/tests/test_services_search_indexers.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,44 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(
299299
assert seen_doc_ids == {str(d.id) for d in documents}
300300

301301

302+
@patch.object(SearchIndexer, "push")
303+
@pytest.mark.usefixtures("indexer_settings")
304+
def test_services_search_indexers_batch_size_argument(mock_push):
305+
"""
306+
Documents indexing should be processed in batches,
307+
batch_size overrides SEARCH_INDEXER_BATCH_SIZE
308+
"""
309+
documents = factories.DocumentFactory.create_batch(5)
310+
311+
# Attach a single user access to each document
312+
expected_user_subs = {}
313+
for document in documents:
314+
access = factories.UserDocumentAccessFactory(document=document)
315+
expected_user_subs[str(document.id)] = str(access.user.sub)
316+
317+
assert SearchIndexer().index(batch_size=2) == 5
318+
319+
# Should be 3 batches: 2 + 2 + 1
320+
assert mock_push.call_count == 3
321+
322+
seen_doc_ids = set()
323+
324+
for call in mock_push.call_args_list:
325+
batch = call.args[0]
326+
assert isinstance(batch, list)
327+
328+
for doc_json in batch:
329+
doc_id = doc_json["id"]
330+
seen_doc_ids.add(doc_id)
331+
332+
# Only one user expected per document
333+
assert doc_json["users"] == [expected_user_subs[doc_id]]
334+
assert doc_json["groups"] == []
335+
336+
# Make sure all 5 documents were indexed
337+
assert seen_doc_ids == {str(d.id) for d in documents}
338+
339+
302340
@patch.object(SearchIndexer, "push")
303341
@pytest.mark.usefixtures("indexer_settings")
304342
def test_services_search_indexers_ignore_empty_documents(mock_push):
@@ -327,6 +365,25 @@ def test_services_search_indexers_ignore_empty_documents(mock_push):
327365
}
328366

329367

368+
@patch.object(SearchIndexer, "push")
369+
def test_services_search_indexers_skip_empty_batches(mock_push, indexer_settings):
370+
"""
371+
An indexing batch can end up empty when all of its documents are empty; such batches must be skipped rather than pushed.
372+
"""
373+
indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2
374+
375+
document = factories.DocumentFactory()
376+
377+
# Only empty docs
378+
factories.DocumentFactory.create_batch(5, content="", title="")
379+
380+
assert SearchIndexer().index() == 1
381+
assert mock_push.call_count == 1
382+
383+
results = [doc["id"] for doc in mock_push.call_args[0][0]]
384+
assert results == [str(document.id)]
385+
386+
330387
@patch.object(SearchIndexer, "push")
331388
@pytest.mark.usefixtures("indexer_settings")
332389
def test_services_search_indexers_ancestors_link_reach(mock_push):

0 commit comments

Comments
 (0)