From fd49f28449aa281a5e0a24d45b38b6846cdcb06b Mon Sep 17 00:00:00 2001
From: Samuel Paccoud - DINUM
Date: Wed, 23 Jul 2025 05:49:57 +0200
Subject: [PATCH 01/22] =?UTF-8?q?=F0=9F=94=A7(compose)=20configure=20exter?=
 =?UTF-8?q?nal=20network=20for=20communication=20with=20search?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Search in Docs relies on an external project like "La Suite Find". We
need to declare a common external network in order to connect to the
search app and index our documents.
---
 compose.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/compose.yml b/compose.yml
index a774f11e07..a24a3974cf 100644
--- a/compose.yml
+++ b/compose.yml
@@ -72,6 +72,9 @@ services:
       - env.d/development/postgresql.local
     ports:
       - "8071:8000"
+    networks:
+      - default
+      - lasuite-net
     volumes:
       - ./src/backend:/app
       - ./data/static:/data/static
@@ -217,3 +220,8 @@ services:
         kc_postgresql:
           condition: service_healthy
           restart: true
+
+networks:
+  lasuite-net:
+    name: lasuite-net
+    driver: bridge

From 254a5418da2f984960f8367fc9e09e8c6402d1e6 Mon Sep 17 00:00:00 2001
From: Samuel Paccoud - DINUM
Date: Wed, 6 Aug 2025 07:32:27 +0200
Subject: [PATCH 02/22] =?UTF-8?q?=E2=9C=A8(backend)=20add=20dummy=20conten?=
 =?UTF-8?q?t=20to=20demo=20documents?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We need content in our demo documents so that we can test indexing.
---
 src/backend/core/models.py                   | 47 ++++++++++---------
 .../demo/management/commands/create_demo.py  | 37 ++++++++++-----
 2 files changed, 51 insertions(+), 33 deletions(-)

diff --git a/src/backend/core/models.py b/src/backend/core/models.py
index 6e0ad69e4f..dcb13f9b16 100644
--- a/src/backend/core/models.py
+++ b/src/backend/core/models.py
@@ -432,32 +432,35 @@ def __init__(self, *args, **kwargs):
     def save(self, *args, **kwargs):
         """Write content to object storage only if _content has changed."""
         super().save(*args, **kwargs)
-        if self._content:
-            file_key = self.file_key
-            bytes_content = self._content.encode("utf-8")
+        self.save_content(self._content)
 
-            # Attempt to directly check if the object exists using the storage client.
-            try:
-                response = default_storage.connection.meta.client.head_object(
-                    Bucket=default_storage.bucket_name, Key=file_key
-                )
-            except ClientError as excpt:
-                # If the error is a 404, the object doesn't exist, so we should create it.
-                if excpt.response["Error"]["Code"] == "404":
-                    has_changed = True
-                else:
-                    raise
+    def save_content(self, content):
+        """Save content to object storage."""
+
+        file_key = self.file_key
+        bytes_content = content.encode("utf-8")
+
+        # Attempt to directly check if the object exists using the storage client.
+        try:
+            response = default_storage.connection.meta.client.head_object(
+                Bucket=default_storage.bucket_name, Key=file_key
+            )
+        except ClientError as excpt:
+            # If the error is a 404, the object doesn't exist, so we should create it.
+            if excpt.response["Error"]["Code"] == "404":
+                has_changed = True
             else:
-                # Compare the existing ETag with the MD5 hash of the new content.
-                has_changed = (
-                    response["ETag"].strip('"')
-                    != hashlib.md5(bytes_content).hexdigest()  # noqa: S324
-                )
+                raise
+        else:
+            # Compare the existing ETag with the MD5 hash of the new content.
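+            # (This assumes single-part, unencrypted uploads, for which an
+            # S3-style ETag is exactly the MD5 digest of the object.)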
+            has_changed = (
+                response["ETag"].strip('"') != hashlib.md5(bytes_content).hexdigest()  # noqa: S324
+            )
 
-            if has_changed:
-                content_file = ContentFile(bytes_content)
-                default_storage.save(file_key, content_file)
+        if has_changed:
+            content_file = ContentFile(bytes_content)
+            default_storage.save(file_key, content_file)
 
     def is_leaf(self):
         """
diff --git a/src/backend/demo/management/commands/create_demo.py b/src/backend/demo/management/commands/create_demo.py
index 3681c378a0..ebce4eae25 100644
--- a/src/backend/demo/management/commands/create_demo.py
+++ b/src/backend/demo/management/commands/create_demo.py
@@ -1,16 +1,19 @@
 # ruff: noqa: S311, S106
 """create_demo management command"""
 
+import base64
 import logging
 import math
 import random
 import time
 from collections import defaultdict
+from uuid import uuid4
 
 from django import db
 from django.conf import settings
 from django.core.management.base import BaseCommand, CommandError
 
+import pycrdt
 from faker import Faker
 
 from core import models
@@ -27,6 +30,16 @@ def random_true_with_probability(probability):
     return random.random() < probability
 
 
+def get_ydoc_for_text(text):
+    """Return a ydoc from plain text for demo purposes."""
+    ydoc = pycrdt.Doc()
+    paragraph = pycrdt.XmlElement("p", {}, [pycrdt.XmlText(text)])
+    fragment = pycrdt.XmlFragment([paragraph])
+    ydoc["document-store"] = fragment
+    update = ydoc.get_update()
+    return base64.b64encode(update).decode("utf-8")
+
+
 class BulkQueue:
     """A utility class to create Django model instances in bulk by just pushing to a queue."""
 
@@ -48,7 +61,7 @@ def _bulk_create(self, objects):
         self.queue[objects[0]._meta.model.__name__] = []  # noqa: SLF001
 
     def push(self, obj):
-        """Add a model instance to queue to that it gets created in bulk."""
+        """Add a model instance to queue so that it gets created in bulk."""
         objects = self.queue[obj._meta.model.__name__]  # noqa: SLF001
         objects.append(obj)
         if len(objects) > self.BATCH_SIZE:
@@ -139,17 +152,19 @@ def create_demo(stdout):
             # pylint: disable=protected-access
             key = models.Document._int2str(i)  # noqa: SLF001
             padding = models.Document.alphabet[0] * (models.Document.steplen - len(key))
-            queue.push(
-                models.Document(
-                    depth=1,
-                    path=f"{padding}{key}",
-                    creator_id=random.choice(users_ids),
-                    title=fake.sentence(nb_words=4),
-                    link_reach=models.LinkReachChoices.AUTHENTICATED
-                    if random_true_with_probability(0.5)
-                    else random.choice(models.LinkReachChoices.values),
-                )
+            title = fake.sentence(nb_words=4)
+            document = models.Document(
+                id=uuid4(),
+                depth=1,
+                path=f"{padding}{key}",
+                creator_id=random.choice(users_ids),
+                title=title,
+                link_reach=models.LinkReachChoices.AUTHENTICATED
+                if random_true_with_probability(0.5)
+                else random.choice(models.LinkReachChoices.values),
             )
+            document.save_content(get_ydoc_for_text(f"Content for {title:s}"))
+            queue.push(document)
 
     queue.flush()

From cb77f6803b27ab6a33364186475cbf93b7f490b3 Mon Sep 17 00:00:00 2001
From: Samuel Paccoud - DINUM
Date: Thu, 24 Jul 2025 12:31:20 +0200
Subject: [PATCH 03/22] =?UTF-8?q?=E2=9C=A8(backend)=20add=20document=20sea?=
 =?UTF-8?q?rch=20indexer?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add indexer that loops over documents in the database, formats them as
JSON objects and indexes them in the remote "Find" micro-service.
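
For reference, the indexer pushes batches of flat dicts to Find. An
illustrative payload for a single document (the values are made up; the
field list is the one produced by `serialize_document()` below):

    {
        "id": "3fa85f64-5717-4562-b3fc-2c963f66afa6",
        "title": "Roadmap",
        "content": "Content for Roadmap",
        "depth": 1,
        "path": "0001",
        "numchild": 0,
        "created_at": "2025-07-24T12:31:20+00:00",
        "updated_at": "2025-07-24T12:31:20+00:00",
        "users": ["user-sub-a"],
        "groups": ["team-1"],
        "reach": "restricted",
        "size": 19,
        "is_active": True,
    }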
--- Makefile | 4 + src/backend/core/management/commands/index.py | 28 ++ src/backend/core/services/search_indexers.py | 170 ++++++++++++ .../tests/test_services_search_indexers.py | 243 ++++++++++++++++++ src/backend/core/tests/test_utils.py | 25 ++ src/backend/core/utils.py | 22 ++ src/backend/impress/settings.py | 11 + 7 files changed, 503 insertions(+) create mode 100644 src/backend/core/management/commands/index.py create mode 100644 src/backend/core/services/search_indexers.py create mode 100644 src/backend/core/tests/test_services_search_indexers.py diff --git a/Makefile b/Makefile index 2655167912..c0ea3b59fd 100644 --- a/Makefile +++ b/Makefile @@ -247,6 +247,10 @@ demo: ## flush db then create a demo for load testing purpose @$(MANAGE) create_demo .PHONY: demo +index: ## index all documents to remote search + @$(MANAGE) index +.PHONY: index + # Nota bene: Black should come after isort just in case they don't agree... lint: ## lint back-end python sources lint: \ diff --git a/src/backend/core/management/commands/index.py b/src/backend/core/management/commands/index.py new file mode 100644 index 0000000000..85aeec905a --- /dev/null +++ b/src/backend/core/management/commands/index.py @@ -0,0 +1,28 @@ +""" +Handle search setup that needs to be done at bootstrap time. +""" + +import logging +import time + +from django.core.management.base import BaseCommand + +from ...services.search_indexers import FindDocumentIndexer + +logger = logging.getLogger("docs.search.bootstrap_search") + + +class Command(BaseCommand): + """Index all documents to remote search service""" + + help = __doc__ + + def handle(self, *args, **options): + """Launch and log search index generation.""" + logger.info("Starting to regenerate Find index...") + start = time.perf_counter() + + FindDocumentIndexer().index() + + duration = time.perf_counter() - start + logger.info(f"Search index regenerated in {duration:.2f} seconds.") diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py new file mode 100644 index 0000000000..742be87eb2 --- /dev/null +++ b/src/backend/core/services/search_indexers.py @@ -0,0 +1,170 @@ +"""Document search index management utilities and indexers""" + +import logging +from abc import ABC, abstractmethod +from collections import defaultdict + +from django.conf import settings + +import requests + +from core import models, utils + +logger = logging.getLogger(__name__) + + +def get_batch_accesses_by_users_and_teams(paths): + """ + Get accesses related to a list of document paths, + grouped by users and teams, including all ancestor paths. 
+ """ + print("paths: ", paths) + ancestor_map = utils.get_ancestor_to_descendants_map(paths, steplen=models.Document.steplen) + ancestor_paths = list(ancestor_map.keys()) + print("ancestor map: ", ancestor_map) + print("ancestor paths: ", ancestor_paths) + + access_qs = models.DocumentAccess.objects.filter( + document__path__in=ancestor_paths + ).values("document__path", "user__sub", "team") + + access_by_document_path = defaultdict(lambda: {"users": set(), "teams": set()}) + + for access in access_qs: + ancestor_path = access["document__path"] + user_sub = access["user__sub"] + team = access["team"] + + for descendant_path in ancestor_map.get(ancestor_path, []): + if user_sub: + access_by_document_path[descendant_path]["users"].add(str(user_sub)) + if team: + access_by_document_path[descendant_path]["teams"].add(team) + + return dict(access_by_document_path) + + +class BaseDocumentIndexer(ABC): + """ + Base class for document indexers. + + Handles batching and access resolution. Subclasses must implement both + `serialize_document()` and `push()` to define backend-specific behavior. + """ + + def __init__(self, batch_size=None): + """ + Initialize the indexer. + + Args: + batch_size (int, optional): Number of documents per batch. + Defaults to settings.SEARCH_INDEXER_BATCH_SIZE. + """ + self.batch_size = batch_size or settings.SEARCH_INDEXER_BATCH_SIZE + + def index(self): + """ + Fetch documents in batches, serialize them, and push to the search backend. + """ + last_id = 0 + while True: + documents_batch = list( + models.Document.objects.filter( + id__gt=last_id, + ).order_by("id")[: self.batch_size] + ) + + if not documents_batch: + break + + doc_paths = [doc.path for doc in documents_batch] + last_id = documents_batch[-1].id + accesses_by_document_path = get_batch_accesses_by_users_and_teams(doc_paths) + + serialized_batch = [ + self.serialize_document(document, accesses_by_document_path) + for document in documents_batch + ] + self.push(serialized_batch) + + @abstractmethod + def serialize_document(self, document, accesses): + """ + Convert a Document instance to a JSON-serializable format for indexing. + + Must be implemented by subclasses. + """ + + @abstractmethod + def push(self, data): + """ + Push a batch of serialized documents to the backend. + + Must be implemented by subclasses. + """ + + +class FindDocumentIndexer(BaseDocumentIndexer): + """ + Document indexer that pushes documents to La Suite Find app. + """ + + def serialize_document(self, document, accesses): + """ + Convert a Document to the JSON format expected by La Suite Find. + + Args: + document (Document): The document instance. + accesses (dict): Mapping of document ID to user/team access. + + Returns: + dict: A JSON-serializable dictionary. + """ + doc_path = document.path + text_content = utils.base64_yjs_to_text(document.content) + return { + "id": str(document.id), + "title": document.title, + "content": text_content, + "depth": document.depth, + "path": document.path, + "numchild": document.numchild, + "created_at": document.created_at.isoformat(), + "updated_at": document.updated_at.isoformat(), + "users": list(accesses.get(doc_path, {}).get("users", set())), + "groups": list(accesses.get(doc_path, {}).get("teams", set())), + "reach": document.computed_link_reach, + "size": len(text_content.encode("utf-8")), + "is_active": not bool(document.ancestors_deleted_at), + } + + def push(self, data): + """ + Push a batch of documents to the Find backend. + + Args: + data (list): List of document dictionaries. 
+ """ + url = getattr(settings, "SEARCH_INDEXER_URL", None) + if not url: + raise RuntimeError( + "SEARCH_INDEXER_URL must be set in Django settings before indexing." + ) + + secret = getattr(settings, "SEARCH_INDEXER_SECRET", None) + if not secret: + raise RuntimeError( + "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." + ) + try: + response = requests.post( + url, + json=data, + headers={"Authorization": f"Bearer {secret}"}, + timeout=10, + ) + response.raise_for_status() + except requests.exceptions.HTTPError as e: + logger.error("HTTPError: %s", e) + logger.error("Response content: %s", response.text) + raise diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py new file mode 100644 index 0000000000..2ad37c387f --- /dev/null +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -0,0 +1,243 @@ +"""Tests for Documents search indexers""" + +from unittest.mock import patch + +import pytest + +from core import factories, utils +from core.services.search_indexers import FindDocumentIndexer + +pytestmark = pytest.mark.django_db + + +def test_push_raises_error_if_search_indexer_url_is_none(settings): + """ + Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty. + """ + settings.SEARCH_INDEXER_URL = None + indexer = FindDocumentIndexer() + + with pytest.raises(RuntimeError) as exc_info: + indexer.push([]) + + assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str( + exc_info.value + ) + + +def test_push_raises_error_if_search_indexer_url_is_empty(settings): + """ + Indexer should raise RuntimeError if SEARCH_INDEXER_URL is empty string. + """ + settings.SEARCH_INDEXER_URL = "" + indexer = FindDocumentIndexer() + + with pytest.raises(RuntimeError) as exc_info: + indexer.push([]) + + assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str( + exc_info.value + ) + + +def test_push_raises_error_if_search_indexer_secret_is_none(settings): + """ + Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None or empty. + """ + settings.SEARCH_INDEXER_SECRET = None + indexer = FindDocumentIndexer() + + with pytest.raises(RuntimeError) as exc_info: + indexer.push([]) + + assert ( + "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." + in str(exc_info.value) + ) + + +def test_push_raises_error_if_search_indexer_secret_is_empty(settings): + """ + Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is empty string. + """ + settings.SEARCH_INDEXER_SECRET = "" + indexer = FindDocumentIndexer() + + with pytest.raises(RuntimeError) as exc_info: + indexer.push([]) + + assert ( + "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." + in str(exc_info.value) + ) + + +def test_services_search_indexers_serialize_document_returns_expected_json(): + """ + It should serialize documents with correct metadata and access control. 
+ """ + user_a, user_b = factories.UserFactory.create_batch(2) + document = factories.DocumentFactory() + factories.DocumentFactory(parent=document) + + factories.UserDocumentAccessFactory(document=document, user=user_a) + factories.UserDocumentAccessFactory(document=document, user=user_b) + factories.TeamDocumentAccessFactory(document=document, team="team1") + factories.TeamDocumentAccessFactory(document=document, team="team2") + + accesses = { + document.path: { + "users": {str(user_a.sub), str(user_b.sub)}, + "teams": {"team1", "team2"}, + } + } + + indexer = FindDocumentIndexer() + result = indexer.serialize_document(document, accesses) + + assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)} + assert set(result.pop("groups")) == {"team1", "team2"} + assert result == { + "id": str(document.id), + "title": document.title, + "depth": 1, + "path": document.path, + "numchild": 1, + "content": utils.base64_yjs_to_text(document.content), + "created_at": document.created_at.isoformat(), + "updated_at": document.updated_at.isoformat(), + "reach": document.link_reach, + 'size': 13, + "is_active": True, + } + + +def test_services_search_indexers_serialize_document_deleted(): + """Deleted documents are marked as just in the serialized json.""" + parent = factories.DocumentFactory() + document = factories.DocumentFactory(parent=parent) + + parent.soft_delete() + document.refresh_from_db() + + indexer = FindDocumentIndexer() + result = indexer.serialize_document(document, {}) + + assert result["is_active"] is False + + +@patch.object(FindDocumentIndexer, "push") +def test_services_search_indexers_batches_pass_only_batch_accesses(mock_push, settings): + """ + Documents indexing should be processed in batches, + and only the access data relevant to each batch should be used. 
+ """ + settings.SEARCH_INDEXER_BATCH_SIZE = 2 + documents = factories.DocumentFactory.create_batch(5) + + # Attach a single user access to each document + expected_user_subs = {} + for document in documents: + access = factories.UserDocumentAccessFactory(document=document) + expected_user_subs[str(document.id)] = str(access.user.sub) + + FindDocumentIndexer().index() + + # Should be 3 batches: 2 + 2 + 1 + assert mock_push.call_count == 3 + + seen_doc_ids = set() + + for call in mock_push.call_args_list: + batch = call.args[0] + assert isinstance(batch, list) + + for doc_json in batch: + doc_id = doc_json["id"] + seen_doc_ids.add(doc_id) + + # Only one user expected per document + assert doc_json["users"] == [expected_user_subs[doc_id]] + assert doc_json["groups"] == [] + + # Make sure all 5 documents were indexed + assert seen_doc_ids == {str(d.id) for d in documents} + + +@patch.object(FindDocumentIndexer, "push") +def test_services_search_indexers_ancestors_link_reach(mock_push): + """Document accesses and reach should take into account ancestors link reaches.""" + great_grand_parent = factories.DocumentFactory(link_reach="restricted") + grand_parent = factories.DocumentFactory(parent=great_grand_parent, link_reach="authenticated") + parent = factories.DocumentFactory(parent=grand_parent, link_reach="public") + document = factories.DocumentFactory(parent=parent, link_reach="restricted") + + FindDocumentIndexer().index() + + seen_doc_ids = set() + results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} + assert len(results) == 4 + assert results[str(great_grand_parent.id)]["reach"] == "restricted" + assert results[str(grand_parent.id)]["reach"] == "authenticated" + assert results[str(parent.id)]["reach"] == "public" + assert results[str(document.id)]["reach"] == "public" + + +@patch.object(FindDocumentIndexer, "push") +def test_services_search_indexers_ancestors_users(mock_push): + """Document accesses and reach should include users from ancestors.""" + user_gp, user_p, user_d = factories.UserFactory.create_batch(3) + + grand_parent = factories.DocumentFactory(users=[user_gp]) + parent = factories.DocumentFactory(parent=grand_parent, users=[user_p]) + document = factories.DocumentFactory(parent=parent, users=[user_d]) + + FindDocumentIndexer().index() + + seen_doc_ids = set() + results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} + assert len(results) == 3 + assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)] + assert set(results[str(parent.id)]["users"]) == {str(user_gp.sub), str(user_p.sub)} + assert set(results[str(document.id)]["users"]) == {str(user_gp.sub), str(user_p.sub), str(user_d.sub)} + + +@patch.object(FindDocumentIndexer, "push") +def test_services_search_indexers_ancestors_teams(mock_push): + """Document accesses and reach should include teams from ancestors.""" + grand_parent = factories.DocumentFactory(teams=["team_gp"]) + parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"]) + document = factories.DocumentFactory(parent=parent, teams=["team_d"]) + + FindDocumentIndexer().index() + + seen_doc_ids = set() + results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} + assert len(results) == 3 + assert results[str(grand_parent.id)]["groups"] == ["team_gp"] + assert set(results[str(parent.id)]["groups"]) == {"team_gp", "team_p"} + assert set(results[str(document.id)]["groups"]) == {"team_gp", "team_p", "team_d"} + + +@patch("requests.post") +def test_push_uses_correct_url_and_data(mock_post, settings): + """ + 
push() should call requests.post with the correct URL from settings + the timeout set to 10 seconds and the data as JSON. + """ + settings.SEARCH_INDEXER_URL = "http://example.com/index" + + indexer = FindDocumentIndexer() + sample_data = [{"id": "123", "title": "Test"}] + + mock_response = mock_post.return_value + mock_response.raise_for_status.return_value = None # No error + + indexer.push(sample_data) + + mock_post.assert_called_once() + args, kwargs = mock_post.call_args + + assert args[0] == settings.SEARCH_INDEXER_URL + assert kwargs.get("json") == sample_data + assert kwargs.get("timeout") == 10 diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py index 37b2e32d5e..66285afe07 100644 --- a/src/backend/core/tests/test_utils.py +++ b/src/backend/core/tests/test_utils.py @@ -75,3 +75,28 @@ def test_utils_extract_attachments(): base64_string = base64.b64encode(update).decode("utf-8") # image_key2 is missing the "/media/" part and shouldn't get extracted assert utils.extract_attachments(base64_string) == [image_key1, image_key3] + + +def test_utils_get_ancestor_to_descendants_map_single_path(): + """Test ancestor mapping of a single path.""" + paths = ['000100020005'] + result = utils.get_ancestor_to_descendants_map(paths, steplen=4) + + assert result == { + '0001': {'000100020005'}, + '00010002': {'000100020005'}, + '000100020005': {'000100020005'}, + } + + +def test_utils_get_ancestor_to_descendants_map_multiple_paths(): + """Test ancestor mapping of multiple paths with shared prefixes.""" + paths = ['000100020005', '00010003'] + result = utils.get_ancestor_to_descendants_map(paths, steplen=4) + + assert result == { + '0001': {'000100020005', '00010003'}, + '00010002': {'000100020005'}, + '000100020005': {'000100020005'}, + '00010003': {'00010003'}, + } diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py index 780431f495..746a0dfec2 100644 --- a/src/backend/core/utils.py +++ b/src/backend/core/utils.py @@ -1,6 +1,7 @@ """Utils for the core app.""" import base64 +from collections import defaultdict import re import pycrdt @@ -9,6 +10,27 @@ from core import enums +def get_ancestor_to_descendants_map(paths, steplen): + """ + Given a list of document paths, return a mapping of ancestor_path -> set of descendant_paths. + + Each path is assumed to use materialized path format with fixed-length segments. + + Args: + paths (list of str): List of full document paths. + steplen (int): Length of each path segment. + + Returns: + dict[str, set[str]]: Mapping from ancestor path to its descendant paths (including itself). + """ + ancestor_map = defaultdict(set) + for path in paths: + for i in range(steplen, len(path) + 1, steplen): + ancestor = path[:i] + ancestor_map[ancestor].add(path) + return ancestor_map + + def filter_descendants(paths, root_paths, skip_sorting=False): """ Filters paths to keep only those that are descendants of any path in root_paths. 
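
A usage sketch of the new helper, to make the access inheritance concrete
(Python, with the values borrowed from the unit tests above):

    from core import utils

    mapping = utils.get_ancestor_to_descendants_map(
        ["000100020005", "00010003"], steplen=4
    )
    # mapping == {
    #     "0001": {"000100020005", "00010003"},
    #     "00010002": {"000100020005"},
    #     "000100020005": {"000100020005"},
    #     "00010003": {"00010003"},
    # }
    # An access defined on "0001" therefore applies to both descendants.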
diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index 2229036c8a..af849bd2a4 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -99,6 +99,17 @@ class Base(Configuration): } DEFAULT_AUTO_FIELD = "django.db.models.AutoField" + # Search + SEARCH_INDEXER_BATCH_SIZE = values.IntegerValue( + default=100_000, environ_name="SEARCH_INDEXER_BATCH_SIZE", environ_prefix=None + ) + SEARCH_INDEXER_URL = values.Value( + default=None, environ_name="SEARCH_INDEXER_URL", environ_prefix=None + ) + SEARCH_INDEXER_SECRET = values.Value( + default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None + ) + # Static files (CSS, JavaScript, Images) STATIC_URL = "/static/" STATIC_ROOT = os.path.join(DATA_DIR, "static") From 1a9011f5faa57eebd2aec3f23416db0480e53075 Mon Sep 17 00:00:00 2001 From: Samuel Paccoud - DINUM Date: Wed, 6 Aug 2025 17:35:38 +0200 Subject: [PATCH 04/22] =?UTF-8?q?=E2=9C=A8(backend)=20add=20async=20trigge?= =?UTF-8?q?rs=20to=20enable=20document=20indexation=20with=20find?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On document content or permission changes, start a celery job that will call the indexation API of the app "Find". Signed-off-by: Fabre Florian --- src/backend/core/api/viewsets.py | 5 + src/backend/core/management/commands/index.py | 2 +- src/backend/core/models.py | 22 +++ src/backend/core/services/search_indexers.py | 107 ++++++++++++++- src/backend/core/tasks/find.py | 96 +++++++++++++ .../core/tests/test_models_documents.py | 126 ++++++++++++++++++ .../tests/test_services_search_indexers.py | 23 +++- src/backend/core/tests/test_utils.py | 18 +-- src/backend/core/utils.py | 2 +- 9 files changed, 381 insertions(+), 20 deletions(-) create mode 100644 src/backend/core/tasks/find.py diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 84402ceaae..45d62a22f6 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -1064,6 +1064,11 @@ def duplicate(self, request, *args, **kwargs): {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED ) + # TODO + # @drf.decorators.action(detail=False, methods=["get"]) + # def search(self, request, *args, **kwargs): + # index.search() + @drf.decorators.action(detail=True, methods=["get"], url_path="versions") def versions_list(self, request, *args, **kwargs): """ diff --git a/src/backend/core/management/commands/index.py b/src/backend/core/management/commands/index.py index 85aeec905a..b7eab9505e 100644 --- a/src/backend/core/management/commands/index.py +++ b/src/backend/core/management/commands/index.py @@ -25,4 +25,4 @@ def handle(self, *args, **options): FindDocumentIndexer().index() duration = time.perf_counter() - start - logger.info(f"Search index regenerated in {duration:.2f} seconds.") + logger.info("Search index regenerated in %.2f seconds.", duration) diff --git a/src/backend/core/models.py b/src/backend/core/models.py index dcb13f9b16..d226fad7c4 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -20,7 +20,9 @@ from django.core.files.storage import default_storage from django.core.mail import send_mail from django.db import models, transaction +from django.db.models import signals from django.db.models.functions import Left, Length +from django.dispatch import receiver from django.template.loader import render_to_string from django.utils import timezone from django.utils.functional import cached_property @@ -40,6 
+42,7 @@
 get_equivalent_link_definition,
 )
 from .validators import sub_validator
+from .tasks.find import trigger_document_indexer
 
 logger = getLogger(__name__)
 
@@ -950,6 +953,16 @@ def restore(self):
         )
 
 
+@receiver(signals.post_save, sender=Document)
+def document_post_save(sender, instance, **kwargs):
+    """
+    Asynchronous call to the document indexer at the end of the transaction.
+    Note: within the transaction we can have an empty content and a
+    serialization error.
+    """
+    trigger_document_indexer(instance, on_commit=True)
+
+
 class LinkTrace(BaseModel):
     """
     Relation model to trace accesses to a document via a link by a logged-in user.
@@ -1175,6 +1188,15 @@ def get_abilities(self, user):
         }
 
 
+@receiver(signals.post_save, sender=DocumentAccess)
+def document_access_post_save(sender, instance, created, **kwargs):
+    """
+    Asynchronous call to the document indexer at the end of the transaction.
+    """
+    if not created:
+        trigger_document_indexer(instance.document, on_commit=True)
+
+
 class DocumentAskForAccess(BaseModel):
     """Relation model to ask for access to a document."""
 
diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py
index 742be87eb2..dda2b58555 100644
--- a/src/backend/core/services/search_indexers.py
+++ b/src/backend/core/services/search_indexers.py
@@ -5,6 +5,7 @@
 from collections import defaultdict
 
 from django.conf import settings
+from django.contrib.auth.models import AnonymousUser
 
 import requests
 
@@ -18,11 +19,13 @@ def get_batch_accesses_by_users_and_teams(paths):
     Get accesses related to a list of document paths,
     grouped by users and teams, including all ancestor paths.
     """
-    print("paths: ", paths)
-    ancestor_map = utils.get_ancestor_to_descendants_map(paths, steplen=models.Document.steplen)
+    # print("paths: ", paths)
+    ancestor_map = utils.get_ancestor_to_descendants_map(
+        paths, steplen=models.Document.steplen
+    )
     ancestor_paths = list(ancestor_map.keys())
-    print("ancestor map: ", ancestor_map)
-    print("ancestor paths: ", ancestor_paths)
+    # print("ancestor map: ", ancestor_map)
+    # print("ancestor paths: ", ancestor_paths)
 
     access_qs = models.DocumentAccess.objects.filter(
         document__path__in=ancestor_paths
     ).values("document__path", "user__sub", "team")
@@ -44,6 +47,22 @@ def get_batch_accesses_by_users_and_teams(paths):
     return dict(access_by_document_path)
 
 
+def get_visited_document_ids_of(user):
+    if isinstance(user, AnonymousUser):
+        return []
+
+    # TODO : exclude links when user already have a specific access to the doc
+    qs = models.LinkTrace.objects.filter(
+        user=user
+    ).exclude(
+        document__accesses__user=user,
+    )
+
+    return list({
+        str(id) for id in qs.values_list("document_id", flat=True)
+    })
+
+
 class BaseDocumentIndexer(ABC):
     """
     Base class for document indexers.
@@ -84,6 +103,7 @@ def index(self):
             serialized_batch = [
                 self.serialize_document(document, accesses_by_document_path)
                 for document in documents_batch
+                if document.content
             ]
             self.push(serialized_batch)
 
@@ -103,6 +123,38 @@ def push(self, data):
         Must be implemented by subclasses.
         """
 
+    def search(self, text, user, token):
+        """
+        Search for documents in Find app.
+        """
+        visited_ids = get_visited_document_ids_of(user)
+
+        response = self.search_query(data={
+            "q": text,
+            "visited": visited_ids,
+            "services": ["docs"],
+        }, token=token)
+
+        print(response)
+
+        return self.format_response(response)
+
+    @abstractmethod
+    def search_query(self, data, token) -> dict:
+        """
+        Retrieve documents from the Find app API.
+
+        Must be implemented by subclasses.
+ """ + + @abstractmethod + def format_response(self, data: dict): + """ + Convert the JSON response from Find app as document queryset. + + Must be implemented by subclasses. + """ + class FindDocumentIndexer(BaseDocumentIndexer): """ @@ -121,10 +173,12 @@ def serialize_document(self, document, accesses): dict: A JSON-serializable dictionary. """ doc_path = document.path - text_content = utils.base64_yjs_to_text(document.content) + doc_content = document.content + text_content = utils.base64_yjs_to_text(doc_content) if doc_content else "" + return { "id": str(document.id), - "title": document.title, + "title": document.title or "", "content": text_content, "depth": document.depth, "path": document.path, @@ -138,6 +192,46 @@ def serialize_document(self, document, accesses): "is_active": not bool(document.ancestors_deleted_at), } + def search_query(self, data, token) -> requests.Response: + """ + Retrieve documents from the Find app API. + + Args: + data (dict): search data + token (str): OICD token + + Returns: + dict: A JSON-serializable dictionary. + """ + url = getattr(settings, "SEARCH_INDEXER_QUERY_URL", None) + + if not url: + raise RuntimeError( + "SEARCH_INDEXER_QUERY_URL must be set in Django settings before indexing." + ) + + try: + response = requests.post( + url, + json=data, + headers={"Authorization": f"Bearer {token}"}, + timeout=10, + ) + response.raise_for_status() + return response.json() + except requests.exceptions.HTTPError as e: + logger.error("HTTPError: %s", e) + logger.error("Response content: %s", response.text) # type: ignore + raise + + def format_response(self, data: dict): + """ + Retrieve documents ids from Find app response and return a queryset. + """ + return models.Document.objects.filter(pk__in=[ + d['_id'] for d in data + ]) + def push(self, data): """ Push a batch of documents to the Find backend. @@ -156,6 +250,7 @@ def push(self, data): raise RuntimeError( "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." ) + try: response = requests.post( url, diff --git a/src/backend/core/tasks/find.py b/src/backend/core/tasks/find.py new file mode 100644 index 0000000000..858c83ee01 --- /dev/null +++ b/src/backend/core/tasks/find.py @@ -0,0 +1,96 @@ +"""Trigger document indexation using celery task.""" + +from logging import getLogger + +from django.conf import settings +from django.core.cache import cache +from django.db import transaction + +from core import models +from core.services.search_indexers import ( + FindDocumentIndexer, + get_batch_accesses_by_users_and_teams, +) + +from impress.celery_app import app + +logger = getLogger(__file__) + + +def document_indexer_debounce_key(document_id): + """Returns debounce cache key""" + return f"doc-indexer-debounce-{document_id}" + + +def incr_counter(key): + """Increase or reset counter""" + try: + return cache.incr(key) + except ValueError: + cache.set(key, 1) + return 1 + + +def decr_counter(key): + """Decrease or reset counter""" + try: + return cache.decr(key) + except ValueError: + cache.set(key, 0) + return 0 + + +@app.task +def document_indexer_task(document_id): + """Send indexation query for a document using celery task.""" + key = document_indexer_debounce_key(document_id) + + # check if the counter : if still up, skip the task. only the last one + # within the countdown delay will do the query. 
if decr_counter(key) > 0:
+        logger.info("Skip document %s indexation", document_id)
+        return
+
+    doc = models.Document.objects.get(pk=document_id)
+    indexer = FindDocumentIndexer()
+    accesses = get_batch_accesses_by_users_and_teams((doc.path,))
+
+    data = indexer.serialize_document(document=doc, accesses=accesses)
+
+    logger.info("Start document %s indexation", document_id)
+    indexer.push(data)
+
+
+def trigger_document_indexer(document, on_commit=False):
+    """
+    Trigger the indexation task with a debounce delay set by the SEARCH_INDEXER_COUNTDOWN setting.
+
+    Args:
+        document (Document): The document instance.
+        on_commit (bool): Wait for the end of the transaction before starting the task
+        (some fields may be in a wrong state within the transaction)
+    """
+
+    if document.deleted_at or document.ancestors_deleted_at:
+        pass
+
+    if on_commit:
+
+        def _aux():
+            trigger_document_indexer(document, on_commit=False)
+
+        transaction.on_commit(_aux)
+    else:
+        key = document_indexer_debounce_key(document.pk)
+        countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)
+
+        logger.info(
+            "Add task for document %s indexation in %.2f seconds",
+            document.pk, countdown
+        )
+
+        # Each time this method is called during the countdown, we increment the
+        # counter and each task decreases it, so the indexing runs only once.
+        incr_counter(key)
+
+        document_indexer_task.apply_async(args=[document.pk], countdown=countdown)
diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py
index 69236b6e96..1c4df5deb8 100644
--- a/src/backend/core/tests/test_models_documents.py
+++ b/src/backend/core/tests/test_models_documents.py
@@ -5,6 +5,7 @@
 
 import random
 import smtplib
+import time
 from logging import Logger
 from unittest import mock
 
@@ -13,12 +14,15 @@
 from django.core.cache import cache
 from django.core.exceptions import ValidationError
 from django.core.files.storage import default_storage
+from django.db import transaction
 from django.test.utils import override_settings
 from django.utils import timezone
 
 import pytest
 
 from core import factories, models
+from core.services.search_indexers import FindDocumentIndexer
+from core.tasks.find import document_indexer_debounce_key
 
 pytestmark = pytest.mark.django_db
 
@@ -1445,3 +1449,125 @@ def test_models_documents_compute_ancestors_links_paths_mapping_structure(
             {"link_reach": sibling.link_reach, "link_role": sibling.link_role},
         ],
     }
+
+
+@mock.patch.object(FindDocumentIndexer, "push")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer(mock_push, settings):
+    """Test indexation task on document creation"""
+    settings.SEARCH_INDEXER_COUNTDOWN = 0
+
+    user = factories.UserFactory()
+
+    with transaction.atomic():
+        doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
+
+        factories.UserDocumentAccessFactory(document=doc1, user=user)
+        factories.UserDocumentAccessFactory(document=doc2, user=user)
+        factories.UserDocumentAccessFactory(document=doc3, user=user)
+
+    time.sleep(0.1)  # waits for the end of the tasks
+
+    accesses = {
+        str(doc1.path): {"users": [user.sub]},
+        str(doc2.path): {"users": [user.sub]},
+        str(doc3.path): {"users": [user.sub]},
+    }
+
+    data = [call.args[0] for call in mock_push.call_args_list]
+
+    indexer = FindDocumentIndexer()
+
+    def sortkey(d):
+        return d["id"]
+
+    assert sorted(data, key=sortkey) == sorted(
+        [
+            indexer.serialize_document(doc1, accesses),
+            indexer.serialize_document(doc2, accesses),
+            indexer.serialize_document(doc3, accesses),
+        ],
key=sortkey, + ) + + # The debounce counters should be reset + assert cache.get(document_indexer_debounce_key(doc1.pk)) == 0 + assert cache.get(document_indexer_debounce_key(doc2.pk)) == 0 + assert cache.get(document_indexer_debounce_key(doc3.pk)) == 0 + + +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_debounce(settings): + """Test indexation task skipping on document update""" + settings.SEARCH_INDEXER_COUNTDOWN = 0 + + indexer = FindDocumentIndexer() + user = factories.UserFactory() + + with mock.patch.object(FindDocumentIndexer, "push"): + with transaction.atomic(): + doc = factories.DocumentFactory() + factories.UserDocumentAccessFactory(document=doc, user=user) + + accesses = { + str(doc.path): {"users": [user.sub]}, + } + + time.sleep(0.1) # waits for the end of the tasks + + with mock.patch.object(FindDocumentIndexer, "push") as mock_push: + # Simulate 1 waiting task + cache.set(document_indexer_debounce_key(doc.pk), 1) + + # save doc to trigger the indexer, but nothing should be done since + # the counter is over 0 + with transaction.atomic(): + doc.save() + + time.sleep(0.1) + + assert [call.args[0] for call in mock_push.call_args_list] == [] + + with mock.patch.object(FindDocumentIndexer, "push") as mock_push: + # No waiting task + cache.set(document_indexer_debounce_key(doc.pk), 0) + + with transaction.atomic(): + doc = models.Document.objects.get(pk=doc.pk) + doc.save() + + time.sleep(0.1) + + assert [call.args[0] for call in mock_push.call_args_list] == [ + indexer.serialize_document(doc, accesses), + ] + + +@pytest.mark.django_db(transaction=True) +def test_models_documents_access_post_save_indexer(settings): + """Test indexation task on DocumentAccess update""" + settings.SEARCH_INDEXER_COUNTDOWN = 0 + + indexer = FindDocumentIndexer() + user = factories.UserFactory() + + with mock.patch.object(FindDocumentIndexer, "push"): + with transaction.atomic(): + doc = factories.DocumentFactory() + doc_access = factories.UserDocumentAccessFactory(document=doc, user=user) + + accesses = { + str(doc.path): {"users": [user.sub]}, + } + + indexer = FindDocumentIndexer() + + with mock.patch.object(FindDocumentIndexer, "push") as mock_push: + with transaction.atomic(): + doc_access.save() + + time.sleep(0.1) + + assert [call.args[0] for call in mock_push.call_args_list] == [ + indexer.serialize_document(doc, accesses), + ] diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 2ad37c387f..0d74d8457b 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -107,7 +107,7 @@ def test_services_search_indexers_serialize_document_returns_expected_json(): "created_at": document.created_at.isoformat(), "updated_at": document.updated_at.isoformat(), "reach": document.link_reach, - 'size': 13, + "size": 13, "is_active": True, } @@ -126,6 +126,17 @@ def test_services_search_indexers_serialize_document_deleted(): assert result["is_active"] is False +def test_services_search_indexers_serialize_document_empty(): + """Empty documents returns empty content in the serialized json.""" + document = factories.DocumentFactory(content="", title=None) + + indexer = FindDocumentIndexer() + result = indexer.serialize_document(document, {}) + + assert result["content"] == "" + assert result["title"] == "" + + @patch.object(FindDocumentIndexer, "push") def 
test_services_search_indexers_batches_pass_only_batch_accesses(mock_push, settings): """ @@ -168,7 +179,9 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(mock_push, se def test_services_search_indexers_ancestors_link_reach(mock_push): """Document accesses and reach should take into account ancestors link reaches.""" great_grand_parent = factories.DocumentFactory(link_reach="restricted") - grand_parent = factories.DocumentFactory(parent=great_grand_parent, link_reach="authenticated") + grand_parent = factories.DocumentFactory( + parent=great_grand_parent, link_reach="authenticated" + ) parent = factories.DocumentFactory(parent=grand_parent, link_reach="public") document = factories.DocumentFactory(parent=parent, link_reach="restricted") @@ -199,7 +212,11 @@ def test_services_search_indexers_ancestors_users(mock_push): assert len(results) == 3 assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)] assert set(results[str(parent.id)]["users"]) == {str(user_gp.sub), str(user_p.sub)} - assert set(results[str(document.id)]["users"]) == {str(user_gp.sub), str(user_p.sub), str(user_d.sub)} + assert set(results[str(document.id)]["users"]) == { + str(user_gp.sub), + str(user_p.sub), + str(user_d.sub), + } @patch.object(FindDocumentIndexer, "push") diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py index 66285afe07..42d588c536 100644 --- a/src/backend/core/tests/test_utils.py +++ b/src/backend/core/tests/test_utils.py @@ -79,24 +79,24 @@ def test_utils_extract_attachments(): def test_utils_get_ancestor_to_descendants_map_single_path(): """Test ancestor mapping of a single path.""" - paths = ['000100020005'] + paths = ["000100020005"] result = utils.get_ancestor_to_descendants_map(paths, steplen=4) assert result == { - '0001': {'000100020005'}, - '00010002': {'000100020005'}, - '000100020005': {'000100020005'}, + "0001": {"000100020005"}, + "00010002": {"000100020005"}, + "000100020005": {"000100020005"}, } def test_utils_get_ancestor_to_descendants_map_multiple_paths(): """Test ancestor mapping of multiple paths with shared prefixes.""" - paths = ['000100020005', '00010003'] + paths = ["000100020005", "00010003"] result = utils.get_ancestor_to_descendants_map(paths, steplen=4) assert result == { - '0001': {'000100020005', '00010003'}, - '00010002': {'000100020005'}, - '000100020005': {'000100020005'}, - '00010003': {'00010003'}, + "0001": {"000100020005", "00010003"}, + "00010002": {"000100020005"}, + "000100020005": {"000100020005"}, + "00010003": {"00010003"}, } diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py index 746a0dfec2..357ede03c3 100644 --- a/src/backend/core/utils.py +++ b/src/backend/core/utils.py @@ -1,8 +1,8 @@ """Utils for the core app.""" import base64 -from collections import defaultdict import re +from collections import defaultdict import pycrdt from bs4 import BeautifulSoup From 0145a0575f6f67589565fc2e5c0bf0e6f6f1b38c Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Wed, 13 Aug 2025 06:49:36 +0200 Subject: [PATCH 05/22] =?UTF-8?q?=F0=9F=94=A7(compose)=20Add=20some=20igno?= =?UTF-8?q?re=20for=20docker-compose=20local=20overrides?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Fabre Florian --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 6eea8d7e7c..16d803b568 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,10 @@ venv.bak/ env.d/development/*.local env.d/terraform +# Docker 
+compose.override.yml +docker/auth/*.local + # npm node_modules From 47fd1dfd112f71e4e3d3cec88c69da6b26db2dc4 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Wed, 13 Aug 2025 06:50:23 +0200 Subject: [PATCH 06/22] =?UTF-8?q?=E2=9C=A8(backend)=20add=20unit=20test=20?= =?UTF-8?q?for=20the=20'index'=20command?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Fabre Florian --- src/backend/core/tests/commands/test_index.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 src/backend/core/tests/commands/test_index.py diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py new file mode 100644 index 0000000000..1da70344e1 --- /dev/null +++ b/src/backend/core/tests/commands/test_index.py @@ -0,0 +1,49 @@ +""" +Unit test for `index` command. +""" + +from unittest import mock + +from django.core.management import call_command +from django.db import transaction + +import pytest + +from core import factories +from core.services.search_indexers import FindDocumentIndexer + + +@pytest.mark.django_db +def test_index(): + """Test the command `index` that run the Find app indexer for all the available documents.""" + user = factories.UserFactory() + indexer = FindDocumentIndexer() + + with transaction.atomic(): + doc = factories.DocumentFactory() + empty_doc = factories.DocumentFactory(title=None, content='') + no_title_doc = factories.DocumentFactory(title=None) + + factories.UserDocumentAccessFactory(document=doc, user=user) + factories.UserDocumentAccessFactory(document=empty_doc, user=user) + factories.UserDocumentAccessFactory(document=no_title_doc, user=user) + + accesses = { + str(doc.path): {"users": [user.sub]}, + str(empty_doc.path): {"users": [user.sub]}, + str(no_title_doc.path): {"users": [user.sub]}, + } + + def sortkey(d): + return d["id"] + + with mock.patch.object(FindDocumentIndexer, "push") as mock_push: + call_command("index") + + push_call_args = [call.args[0] for call in mock_push.call_args_list] + + assert len(push_call_args) == 1 # called once but with a batch of docs + assert sorted(push_call_args[0], key=sortkey) == sorted([ + indexer.serialize_document(doc, accesses), + indexer.serialize_document(no_title_doc, accesses), + ], key=sortkey) From 1b36f6b729e15dd93a7e50e9771030a58136f4cf Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Wed, 13 Aug 2025 06:50:58 +0200 Subject: [PATCH 07/22] =?UTF-8?q?=E2=9C=A8(backend)=20add=20document=20sea?= =?UTF-8?q?rch=20view?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New API view that calls the indexed documents search view (resource server) of app "Find". 
Signed-off-by: Fabre Florian --- env.d/development/common | 5 + src/backend/core/api/serializers.py | 14 ++ src/backend/core/api/viewsets.py | 39 ++++- src/backend/core/models.py | 2 +- src/backend/core/services/search_indexers.py | 35 +++-- src/backend/core/tasks/find.py | 3 +- src/backend/core/tests/commands/test_index.py | 13 +- .../documents/test_api_documents_search.py | 137 ++++++++++++++++++ .../tests/test_services_search_indexers.py | 95 +++++++++++- src/backend/impress/settings.py | 3 + 10 files changed, 312 insertions(+), 34 deletions(-) create mode 100644 src/backend/core/tests/documents/test_api_documents_search.py diff --git a/env.d/development/common b/env.d/development/common index de857d5b2a..6df4a72248 100644 --- a/env.d/development/common +++ b/env.d/development/common @@ -49,6 +49,11 @@ LOGOUT_REDIRECT_URL=http://localhost:3000 OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"] OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"} +# Store OIDC tokens in the session +OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session +OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session +OIDC_STORE_REFRESH_TOKEN_KEY = AnExampleKeyForDevPurposeOnly + # AI AI_FEATURE_ENABLED=true AI_BASE_URL=https://openaiendpoint.com diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index 81b26d5e80..be5db70841 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -889,3 +889,17 @@ class MoveDocumentSerializer(serializers.Serializer): choices=enums.MoveNodePositionChoices.choices, default=enums.MoveNodePositionChoices.LAST_CHILD, ) + + +class FindDocumentSerializer(serializers.Serializer): + """Serializer for Find search requests""" + + q = serializers.CharField(required=True) + + def validate_q(self, value): + """Ensure the text field is not empty.""" + + if len(value.strip()) == 0: + raise serializers.ValidationError("Text field cannot be empty.") + + return value diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 45d62a22f6..78dd9dd987 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -21,6 +21,7 @@ from django.db.models.functions import Left, Length from django.http import Http404, StreamingHttpResponse from django.urls import reverse +from django.utils.decorators import method_decorator from django.utils.functional import cached_property from django.utils.text import capfirst, slugify from django.utils.translation import gettext_lazy as _ @@ -31,6 +32,7 @@ from csp.constants import NONE from csp.decorators import csp_update from lasuite.malware_detection import malware_detection +from lasuite.oidc_login.decorators import refresh_oidc_access_token from rest_framework import filters, status, viewsets from rest_framework import response as drf_response from rest_framework.permissions import AllowAny @@ -47,6 +49,7 @@ from core.services.converter_services import ( YdocConverter, ) +from core.services.search_indexers import FindDocumentIndexer from core.tasks.mail import send_ask_for_access_mail from core.utils import extract_attachments, filter_descendants @@ -373,6 +376,7 @@ class DocumentViewSet( list_serializer_class = serializers.ListDocumentSerializer trashbin_serializer_class = serializers.ListDocumentSerializer tree_serializer_class = serializers.ListDocumentSerializer + search_serializer_class = serializers.ListDocumentSerializer def get_queryset(self): """Get queryset 
performing all annotation and filtering on the document tree structure.""" @@ -1064,10 +1068,37 @@ def duplicate(self, request, *args, **kwargs): {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED ) - # TODO - # @drf.decorators.action(detail=False, methods=["get"]) - # def search(self, request, *args, **kwargs): - # index.search() + @drf.decorators.action(detail=False, methods=["get"], url_path="search") + @method_decorator(refresh_oidc_access_token) + def search(self, request, *args, **kwargs): + """ + Returns a DRF response containing the filtered, annotated and ordered document list. + The filtering allows full text search through the opensearch indexation app "find". + """ + access_token = request.session.get("oidc_access_token") + + serializer = serializers.FindDocumentSerializer(data=request.query_params) + serializer.is_valid(raise_exception=True) + + try: + indexer = FindDocumentIndexer() + queryset = indexer.search( + text=serializer.validated_data.get("q", ""), + user=request.user, + token=access_token, + ) + except RuntimeError: + return drf.response.Response( + {"detail": "The service is not configured properly."}, + status=status.HTTP_401_UNAUTHORIZED, + ) + + return self.get_response_for_queryset( + queryset, + context={ + "request": request, + }, + ) @drf.decorators.action(detail=True, methods=["get"], url_path="versions") def versions_list(self, request, *args, **kwargs): diff --git a/src/backend/core/models.py b/src/backend/core/models.py index d226fad7c4..cf4035f7d9 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -41,8 +41,8 @@ RoleChoices, get_equivalent_link_definition, ) -from .validators import sub_validator from .tasks.find import trigger_document_indexer +from .validators import sub_validator logger = getLogger(__name__) diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index dda2b58555..e24ac3900b 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -48,19 +48,19 @@ def get_batch_accesses_by_users_and_teams(paths): def get_visited_document_ids_of(user): + """ + Returns the ids of the documents that have a linktrace to the user and NOT owned. + It will be use to limit the opensearch responses to the public documents already + "visited" by the user. + """ if isinstance(user, AnonymousUser): return [] - # TODO : exclude links when user already have a specific access to the doc - qs = models.LinkTrace.objects.filter( - user=user - ).exclude( + qs = models.LinkTrace.objects.filter(user=user).exclude( document__accesses__user=user, ) - return list({ - str(id) for id in qs.values_list("document_id", flat=True) - }) + return list({str(id) for id in qs.values_list("document_id", flat=True)}) class BaseDocumentIndexer(ABC): @@ -129,13 +129,14 @@ def search(self, text, user, token): """ visited_ids = get_visited_document_ids_of(user) - response = self.search_query(data={ - "q": text, - "visited": visited_ids, - "services": ["docs"], - }, token=token) - - print(response) + response = self.search_query( + data={ + "q": text, + "visited": visited_ids, + "services": ["docs"], + }, + token=token, + ) return self.format_response(response) @@ -207,7 +208,7 @@ def search_query(self, data, token) -> requests.Response: if not url: raise RuntimeError( - "SEARCH_INDEXER_QUERY_URL must be set in Django settings before indexing." + "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search." 
) try: @@ -228,9 +229,7 @@ def format_response(self, data: dict): """ Retrieve documents ids from Find app response and return a queryset. """ - return models.Document.objects.filter(pk__in=[ - d['_id'] for d in data - ]) + return models.Document.objects.filter(pk__in=[d["_id"] for d in data]) def push(self, data): """ diff --git a/src/backend/core/tasks/find.py b/src/backend/core/tasks/find.py index 858c83ee01..f65d3d0578 100644 --- a/src/backend/core/tasks/find.py +++ b/src/backend/core/tasks/find.py @@ -86,7 +86,8 @@ def _aux(): logger.info( "Add task for document %s indexation in %.2f seconds", - document.pk, countdown + document.pk, + countdown, ) # Each time this method is called during the countdown, we increment the diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py index 1da70344e1..a726e712f0 100644 --- a/src/backend/core/tests/commands/test_index.py +++ b/src/backend/core/tests/commands/test_index.py @@ -21,7 +21,7 @@ def test_index(): with transaction.atomic(): doc = factories.DocumentFactory() - empty_doc = factories.DocumentFactory(title=None, content='') + empty_doc = factories.DocumentFactory(title=None, content="") no_title_doc = factories.DocumentFactory(title=None) factories.UserDocumentAccessFactory(document=doc, user=user) @@ -43,7 +43,10 @@ def sortkey(d): push_call_args = [call.args[0] for call in mock_push.call_args_list] assert len(push_call_args) == 1 # called once but with a batch of docs - assert sorted(push_call_args[0], key=sortkey) == sorted([ - indexer.serialize_document(doc, accesses), - indexer.serialize_document(no_title_doc, accesses), - ], key=sortkey) + assert sorted(push_call_args[0], key=sortkey) == sorted( + [ + indexer.serialize_document(doc, accesses), + indexer.serialize_document(no_title_doc, accesses), + ], + key=sortkey, + ) diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py new file mode 100644 index 0000000000..79f1e6af62 --- /dev/null +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -0,0 +1,137 @@ +""" +Tests for Documents API endpoint in impress's core app: list +""" + +import pytest +import responses +from faker import Faker +from rest_framework.test import APIClient + +from core import factories, models + +fake = Faker() +pytestmark = pytest.mark.django_db + + +@pytest.mark.parametrize("role", models.LinkRoleChoices.values) +@pytest.mark.parametrize("reach", models.LinkReachChoices.values) +@responses.activate +def test_api_documents_search_anonymous(reach, role, settings): + """ + Anonymous users should not be allowed to search documents whatever the + link reach and link role + """ + factories.DocumentFactory(link_reach=reach, link_role=role) + settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + factories.DocumentFactory(link_reach=reach, link_role=role) + + # Find response + responses.add( + responses.POST, + "http://find/api/v1.0/search", + json=[], + status=200, + ) + + response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) + + assert response.status_code == 200 + assert response.json() == { + "count": 0, + "next": None, + "previous": None, + "results": [], + } + + +def test_api_documents_search_endpoint_is_none(settings): + """Missing SEARCH_INDEXER_QUERY_URL should throw an error""" + settings.SEARCH_INDEXER_QUERY_URL = None + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + response = 
APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) + + assert response.status_code == 401 + assert response.json() == {"detail": "The service is not configured properly."} + + +@responses.activate +def test_api_documents_search_invalid_params(settings): + """Validate the format of documents as returned by the search view.""" + settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + response = APIClient().get("/api/v1.0/documents/search/") + + assert response.status_code == 400 + assert response.json() == {"q": ["This field is required."]} + + +@responses.activate +def test_api_documents_search_format(settings): + """Validate the format of documents as returned by the search view.""" + settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + user_a, user_b, user_c = factories.UserFactory.create_batch(3) + document = factories.DocumentFactory( + title="alpha", + users=(user_a, user_c), + link_traces=(user, user_b), + ) + access = factories.UserDocumentAccessFactory(document=document, user=user) + + # Find response + responses.add( + responses.POST, + "http://find/api/v1.0/search", + json=[ + {"_id": str(document.pk)}, + ], + status=200, + ) + response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"}) + + assert response.status_code == 200 + content = response.json() + results = content.pop("results") + assert content == { + "count": 1, + "next": None, + "previous": None, + } + assert len(results) == 1 + assert results[0] == { + "id": str(document.id), + "abilities": document.get_abilities(user), + "ancestors_link_reach": None, + "ancestors_link_role": None, + "computed_link_reach": document.computed_link_reach, + "computed_link_role": document.computed_link_role, + "created_at": document.created_at.isoformat().replace("+00:00", "Z"), + "creator": str(document.creator.id), + "depth": 1, + "excerpt": document.excerpt, + "link_reach": document.link_reach, + "link_role": document.link_role, + "nb_accesses_ancestors": 3, + "nb_accesses_direct": 3, + "numchild": 0, + "path": document.path, + "title": document.title, + "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"), + "user_role": access.role, + } diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 0d74d8457b..5f04f3ae6d 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -1,11 +1,17 @@ """Tests for Documents search indexers""" +from functools import partial from unittest.mock import patch +from django.contrib.auth.models import AnonymousUser + import pytest -from core import factories, utils -from core.services.search_indexers import FindDocumentIndexer +from core import factories, models, utils +from core.services.search_indexers import ( + FindDocumentIndexer, + get_visited_document_ids_of, +) pytestmark = pytest.mark.django_db @@ -187,7 +193,6 @@ def test_services_search_indexers_ancestors_link_reach(mock_push): FindDocumentIndexer().index() - seen_doc_ids = set() results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 4 assert results[str(great_grand_parent.id)]["reach"] == "restricted" @@ -207,7 +212,6 @@ def test_services_search_indexers_ancestors_users(mock_push): FindDocumentIndexer().index() - 
seen_doc_ids = set() results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3 assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)] @@ -228,7 +232,6 @@ def test_services_search_indexers_ancestors_teams(mock_push): FindDocumentIndexer().index() - seen_doc_ids = set() results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3 assert results[str(grand_parent.id)]["groups"] == ["team_gp"] @@ -258,3 +261,85 @@ def test_push_uses_correct_url_and_data(mock_post, settings): assert args[0] == settings.SEARCH_INDEXER_URL assert kwargs.get("json") == sample_data assert kwargs.get("timeout") == 10 + + +def test_get_visited_document_ids_of(): + """ + get_visited_document_ids_of() returns the ids of the documents viewed + by the user BUT without specific access configuration (like public ones) + """ + user = factories.UserFactory() + other = factories.UserFactory() + anonymous = AnonymousUser() + + assert not get_visited_document_ids_of(anonymous) + assert not get_visited_document_ids_of(user) + + doc1, doc2, _ = factories.DocumentFactory.create_batch(3) + + create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False) + + create_link(document=doc1) + create_link(document=doc2) + + # The third document is not visited + assert sorted(get_visited_document_ids_of(user)) == sorted( + [str(doc1.pk), str(doc2.pk)] + ) + + factories.UserDocumentAccessFactory(user=other, document=doc1) + factories.UserDocumentAccessFactory(user=user, document=doc2) + + # The second document have an access for the user + assert get_visited_document_ids_of(user) == [str(doc1.pk)] + + +@patch("requests.post") +def test_services_search_indexers_search(mock_post, settings): + """ + search() should call requests.post to SEARCH_INDEXER_QUERY_URL with the + document ids from linktraces. + """ + user = factories.UserFactory() + indexer = FindDocumentIndexer() + + mock_response = mock_post.return_value + mock_response.raise_for_status.return_value = None # No error + + doc1, doc2, _ = factories.DocumentFactory.create_batch(3) + + create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False) + + create_link(document=doc1) + create_link(document=doc2) + + indexer.search("alpha", user=user, token="mytoken") + + args, kwargs = mock_post.call_args + + assert args[0] == settings.SEARCH_INDEXER_QUERY_URL + + query_data = kwargs.get("json") + assert query_data["q"] == "alpha" + assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)]) + assert query_data["services"] == ["docs"] + + assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"} + assert kwargs.get("timeout") == 10 + + +def test_search_query_raises_error_if_search_endpoint_is_none(settings): + """ + Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty. + """ + settings.SEARCH_INDEXER_QUERY_URL = None + indexer = FindDocumentIndexer() + user = factories.UserFactory() + + with pytest.raises(RuntimeError) as exc_info: + indexer.search("alpha", user=user, token="mytoken") + + assert ( + "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search." 
+        in str(exc_info.value)
+    )
diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py
index af849bd2a4..d5950d1f83 100755
--- a/src/backend/impress/settings.py
+++ b/src/backend/impress/settings.py
@@ -109,6 +109,9 @@ class Base(Configuration):
     SEARCH_INDEXER_SECRET = values.Value(
         default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None
     )
+    SEARCH_INDEXER_QUERY_URL = values.Value(
+        default=None, environ_name="SEARCH_INDEXER_QUERY_URL", environ_prefix=None
+    )
 
     # Static files (CSS, JavaScript, Images)
     STATIC_URL = "/static/"
 
From ff08b3e9774406ce31f897cf6ed2a5ee76ad34bc Mon Sep 17 00:00:00 2001
From: Fabre Florian
Date: Thu, 11 Sep 2025 14:05:56 +0200
Subject: [PATCH 08/22] =?UTF-8?q?=E2=9C=A8(backend)=20improve=20search=20i?=
 =?UTF-8?q?ndexer=20service=20configuration?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New SEARCH_INDEXER_CLASS setting to define the indexer service class.
Raise ImproperlyConfigured errors instead of RuntimeError in the indexer
service.

Signed-off-by: Fabre Florian
---
 env.d/development/common                      |   2 +-
 src/backend/core/api/serializers.py           |  10 +-
 src/backend/core/api/viewsets.py              |  10 +-
 src/backend/core/models.py                    |   4 +-
 src/backend/core/services/search_indexers.py  |  70 +++++----
 src/backend/core/tasks/find.py                |  13 +-
 src/backend/core/tests/commands/test_index.py |  12 +-
 .../documents/test_api_documents_search.py    |   7 +-
 .../core/tests/test_models_documents.py       |  10 +-
 .../tests/test_services_search_indexers.py    | 144 +++++++++++++-----
 src/backend/impress/settings.py               |  10 ++
 11 files changed, 193 insertions(+), 99 deletions(-)

diff --git a/env.d/development/common b/env.d/development/common
index 6df4a72248..eb5b0c5499 100644
--- a/env.d/development/common
+++ b/env.d/development/common
@@ -52,7 +52,7 @@ OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"}
 # Store OIDC tokens in the session
 OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session
 OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session
-OIDC_STORE_REFRESH_TOKEN_KEY = AnExampleKeyForDevPurposeOnly
+OIDC_STORE_REFRESH_TOKEN_KEY = ThisIsAnExampleKeyForDevPurposeOnly
 
 # AI
 AI_FEATURE_ENABLED=true
diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py
index be5db70841..d99e1f5b19 100644
--- a/src/backend/core/api/serializers.py
+++ b/src/backend/core/api/serializers.py
@@ -894,12 +894,4 @@ class MoveDocumentSerializer(serializers.Serializer):
 class FindDocumentSerializer(serializers.Serializer):
     """Serializer for Find search requests"""
 
-    q = serializers.CharField(required=True)
-
-    def validate_q(self, value):
-        """Ensure the text field is not empty."""
-
-        if len(value.strip()) == 0:
-            raise serializers.ValidationError("Text field cannot be empty.")
-
-        return value
+    q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py
index 78dd9dd987..5b9ef3e8f2 100644
--- a/src/backend/core/api/viewsets.py
+++ b/src/backend/core/api/viewsets.py
@@ -12,7 +12,7 @@
 from django.contrib.postgres.aggregates import ArrayAgg
 from django.contrib.postgres.search import TrigramSimilarity
 from django.core.cache import cache
-from django.core.exceptions import ValidationError
+from django.core.exceptions import ImproperlyConfigured, ValidationError
 from django.core.files.storage import default_storage
 from django.core.validators import URLValidator
 from django.db import 
connection, transaction @@ -49,7 +49,7 @@ from core.services.converter_services import ( YdocConverter, ) -from core.services.search_indexers import FindDocumentIndexer +from core.services.search_indexers import get_document_indexer_class from core.tasks.mail import send_ask_for_access_mail from core.utils import extract_attachments, filter_descendants @@ -1081,15 +1081,15 @@ def search(self, request, *args, **kwargs): serializer.is_valid(raise_exception=True) try: - indexer = FindDocumentIndexer() + indexer = get_document_indexer_class()() queryset = indexer.search( text=serializer.validated_data.get("q", ""), user=request.user, token=access_token, ) - except RuntimeError: + except ImproperlyConfigured: return drf.response.Response( - {"detail": "The service is not configured properly."}, + {"detail": "The service is not properly configured."}, status=status.HTTP_401_UNAUTHORIZED, ) diff --git a/src/backend/core/models.py b/src/backend/core/models.py index cf4035f7d9..0960a9bbdb 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -954,7 +954,7 @@ def restore(self): @receiver(signals.post_save, sender=Document) -def document_post_save(sender, instance, **kwargs): +def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument """ Asynchronous call to the document indexer at the end of the transaction. Note : Within the transaction we can have an empty content and a serialization @@ -1189,7 +1189,7 @@ def get_abilities(self, user): @receiver(signals.post_save, sender=DocumentAccess) -def document_access_post_save(sender, instance, created, **kwargs): +def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument """ Asynchronous call to the document indexer at the end of the transaction. """ diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index e24ac3900b..48a437becb 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -3,9 +3,12 @@ import logging from abc import ABC, abstractmethod from collections import defaultdict +from functools import cache from django.conf import settings from django.contrib.auth.models import AnonymousUser +from django.core.exceptions import ImproperlyConfigured +from django.utils.module_loading import import_string import requests @@ -14,18 +17,33 @@ logger = logging.getLogger(__name__) +@cache +def get_document_indexer_class() -> "BaseDocumentIndexer": + """Return the indexer backend class based on the settings.""" + classpath = settings.SEARCH_INDEXER_CLASS + + if not classpath: + raise ImproperlyConfigured( + "SEARCH_INDEXER_CLASS must be set in Django settings." + ) + + try: + return import_string(settings.SEARCH_INDEXER_CLASS) + except ImportError as err: + raise ImproperlyConfigured( + f"SEARCH_INDEXER_CLASS setting is not valid : {err}" + ) from err + + def get_batch_accesses_by_users_and_teams(paths): """ Get accesses related to a list of document paths, grouped by users and teams, including all ancestor paths. """ - # print("paths: ", paths) ancestor_map = utils.get_ancestor_to_descendants_map( paths, steplen=models.Document.steplen ) ancestor_paths = list(ancestor_map.keys()) - # print("ancestor map: ", ancestor_map) - # print("ancestor paths: ", ancestor_paths) access_qs = models.DocumentAccess.objects.filter( document__path__in=ancestor_paths @@ -80,6 +98,24 @@ def __init__(self, batch_size=None): Defaults to settings.SEARCH_INDEXER_BATCH_SIZE. 
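+
+        Raises:
+            ImproperlyConfigured: if one of the required indexer settings
+                (URL, secret, query URL) read below is missing.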
""" self.batch_size = batch_size or settings.SEARCH_INDEXER_BATCH_SIZE + self.indexer_url = settings.SEARCH_INDEXER_URL + self.indexer_secret = settings.SEARCH_INDEXER_SECRET + self.search_url = settings.SEARCH_INDEXER_QUERY_URL + + if not self.indexer_url: + raise ImproperlyConfigured( + "SEARCH_INDEXER_URL must be set in Django settings." + ) + + if not self.indexer_secret: + raise ImproperlyConfigured( + "SEARCH_INDEXER_SECRET must be set in Django settings." + ) + + if not self.search_url: + raise ImproperlyConfigured( + "SEARCH_INDEXER_QUERY_URL must be set in Django settings." + ) def index(self): """ @@ -143,7 +179,7 @@ def search(self, text, user, token): @abstractmethod def search_query(self, data, token) -> dict: """ - Retreive documents from the Find app API. + Retrieve documents from the Find app API. Must be implemented by subclasses. """ @@ -204,16 +240,9 @@ def search_query(self, data, token) -> requests.Response: Returns: dict: A JSON-serializable dictionary. """ - url = getattr(settings, "SEARCH_INDEXER_QUERY_URL", None) - - if not url: - raise RuntimeError( - "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search." - ) - try: response = requests.post( - url, + self.search_url, json=data, headers={"Authorization": f"Bearer {token}"}, timeout=10, @@ -222,7 +251,6 @@ def search_query(self, data, token) -> requests.Response: return response.json() except requests.exceptions.HTTPError as e: logger.error("HTTPError: %s", e) - logger.error("Response content: %s", response.text) # type: ignore raise def format_response(self, data: dict): @@ -238,27 +266,15 @@ def push(self, data): Args: data (list): List of document dictionaries. """ - url = getattr(settings, "SEARCH_INDEXER_URL", None) - if not url: - raise RuntimeError( - "SEARCH_INDEXER_URL must be set in Django settings before indexing." - ) - - secret = getattr(settings, "SEARCH_INDEXER_SECRET", None) - if not secret: - raise RuntimeError( - "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." 
- ) try: response = requests.post( - url, + self.indexer_url, json=data, - headers={"Authorization": f"Bearer {secret}"}, + headers={"Authorization": f"Bearer {self.indexer_secret}"}, timeout=10, ) response.raise_for_status() except requests.exceptions.HTTPError as e: logger.error("HTTPError: %s", e) - logger.error("Response content: %s", response.text) raise diff --git a/src/backend/core/tasks/find.py b/src/backend/core/tasks/find.py index f65d3d0578..4a58b8304f 100644 --- a/src/backend/core/tasks/find.py +++ b/src/backend/core/tasks/find.py @@ -1,5 +1,6 @@ """Trigger document indexation using celery task.""" +from functools import partial from logging import getLogger from django.conf import settings @@ -8,8 +9,8 @@ from core import models from core.services.search_indexers import ( - FindDocumentIndexer, get_batch_accesses_by_users_and_teams, + get_document_indexer_class, ) from impress.celery_app import app @@ -52,7 +53,7 @@ def document_indexer_task(document_id): return doc = models.Document.objects.get(pk=document_id) - indexer = FindDocumentIndexer() + indexer = get_document_indexer_class()() accesses = get_batch_accesses_by_users_and_teams((doc.path,)) data = indexer.serialize_document(document=doc, accesses=accesses) @@ -75,11 +76,9 @@ def trigger_document_indexer(document, on_commit=False): pass if on_commit: - - def _aux(): - trigger_document_indexer(document, on_commit=False) - - transaction.on_commit(_aux) + transaction.on_commit( + partial(trigger_document_indexer, document, on_commit=False) + ) else: key = document_indexer_debounce_key(document.pk) countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1) diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py index a726e712f0..d30e01181c 100644 --- a/src/backend/core/tests/commands/test_index.py +++ b/src/backend/core/tests/commands/test_index.py @@ -2,6 +2,7 @@ Unit test for `index` command. 
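+
+Tests mock the indexer's ``push`` method, so no running Find service is needed.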
""" +from operator import itemgetter from unittest import mock from django.core.management import call_command @@ -34,19 +35,18 @@ def test_index(): str(no_title_doc.path): {"users": [user.sub]}, } - def sortkey(d): - return d["id"] - with mock.patch.object(FindDocumentIndexer, "push") as mock_push: call_command("index") push_call_args = [call.args[0] for call in mock_push.call_args_list] - assert len(push_call_args) == 1 # called once but with a batch of docs - assert sorted(push_call_args[0], key=sortkey) == sorted( + # called once but with a batch of docs + mock_push.assert_called_once() + + assert sorted(push_call_args[0], key=itemgetter("id")) == sorted( [ indexer.serialize_document(doc, accesses), indexer.serialize_document(no_title_doc, accesses), ], - key=sortkey, + key=itemgetter("id"), ) diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py index 79f1e6af62..f5dcfa2425 100644 --- a/src/backend/core/tests/documents/test_api_documents_search.py +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -57,7 +57,7 @@ def test_api_documents_search_endpoint_is_none(settings): response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) assert response.status_code == 401 - assert response.json() == {"detail": "The service is not configured properly."} + assert response.json() == {"detail": "The service is not properly configured."} @responses.activate @@ -75,6 +75,11 @@ def test_api_documents_search_invalid_params(settings): assert response.status_code == 400 assert response.json() == {"q": ["This field is required."]} + response = APIClient().get("/api/v1.0/documents/search/", data={"q": " "}) + + assert response.status_code == 400 + assert response.json() == {"q": ["This field may not be blank."]} + @responses.activate def test_api_documents_search_format(settings): diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index 1c4df5deb8..fb09abaced 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -3,6 +3,7 @@ """ # pylint: disable=too-many-lines +from operator import itemgetter import random import smtplib import time @@ -1466,7 +1467,7 @@ def test_models_documents_post_save_indexer(mock_push, settings): factories.UserDocumentAccessFactory(document=doc2, user=user) factories.UserDocumentAccessFactory(document=doc3, user=user) - time.sleep(0.1) # waits for the end of the tasks + time.sleep(0.2) # waits for the end of the tasks accesses = { str(doc1.path): {"users": [user.sub]}, @@ -1478,16 +1479,13 @@ def test_models_documents_post_save_indexer(mock_push, settings): indexer = FindDocumentIndexer() - def sortkey(d): - return d["id"] - - assert sorted(data, key=sortkey) == sorted( + assert sorted(data, key=itemgetter('id')) == sorted( [ indexer.serialize_document(doc1, accesses), indexer.serialize_document(doc2, accesses), indexer.serialize_document(doc3, accesses), ], - key=sortkey, + key=itemgetter('id'), ) # The debounce counters should be reset diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 5f04f3ae6d..9b4309acfa 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -4,77 +4,154 @@ from unittest.mock import patch from django.contrib.auth.models import AnonymousUser +from 
django.core.exceptions import ImproperlyConfigured +from django.utils.module_loading import import_string import pytest from core import factories, models, utils from core.services.search_indexers import ( + BaseDocumentIndexer, FindDocumentIndexer, + get_document_indexer_class, get_visited_document_ids_of, ) pytestmark = pytest.mark.django_db -def test_push_raises_error_if_search_indexer_url_is_none(settings): +class FakeDocumentIndexer(BaseDocumentIndexer): + """Fake indexer for test purpose""" + + def serialize_document(self, document, accesses): + return {} + + def push(self, data): + pass + + def search_query(self, data, token): + return {} + + def format_response(self, data: dict): + return {} + + + +@pytest.fixture(name="fake_indexer_settings") +def fake_indexer_settings_fixture(settings): + """Fixture to switch the indexer to the FakeDocumentIndexer.""" + _original_backend = str(settings.SEARCH_INDEXER_CLASS) + + settings.SEARCH_INDEXER_CLASS = ( + "core.tests.test_services_search_indexers.FakeDocumentIndexer" + ) + get_document_indexer_class.cache_clear() + + yield settings + + settings.SEARCH_INDEXER_CLASS = _original_backend + # clear cache to prevent issues with other tests + get_document_indexer_class.cache_clear() + + +def test_services_search_indexer_class_is_empty(fake_indexer_settings): + """ + Should raise ImproperlyConfigured if SEARCH_INDEXER_CLASS is None or empty. + """ + fake_indexer_settings.SEARCH_INDEXER_CLASS = None + + with pytest.raises(ImproperlyConfigured) as exc_info: + get_document_indexer_class() + + assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) + + fake_indexer_settings.SEARCH_INDEXER_CLASS = "" + + # clear cache again + get_document_indexer_class.cache_clear() + + with pytest.raises(ImproperlyConfigured) as exc_info: + get_document_indexer_class() + + assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) + + +def test_services_search_indexer_class_invalid(fake_indexer_settings): + """ + Should raise RuntimeError if SEARCH_INDEXER_CLASS cannot be imported. + """ + fake_indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown" + + with pytest.raises(ImproperlyConfigured) as exc_info: + get_document_indexer_class() + + assert ( + "SEARCH_INDEXER_CLASS setting is not valid : No module named 'unknown'" + in str(exc_info.value) + ) + + +def test_services_search_indexer_class(fake_indexer_settings): + """ + Import indexer class defined in setting SEARCH_INDEXER_CLASS. + """ + fake_indexer_settings.SEARCH_INDEXER_CLASS = ( + "core.tests.test_services_search_indexers.FakeDocumentIndexer" + ) + + assert get_document_indexer_class() == import_string( + "core.tests.test_services_search_indexers.FakeDocumentIndexer" + ) + +def test_services_search_indexer_url_is_none(settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty. """ settings.SEARCH_INDEXER_URL = None - indexer = FindDocumentIndexer() - with pytest.raises(RuntimeError) as exc_info: - indexer.push([]) + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str( - exc_info.value - ) + assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) -def test_push_raises_error_if_search_indexer_url_is_empty(settings): +def test_services_search_indexer_url_is_empty(settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_URL is empty string. 
""" settings.SEARCH_INDEXER_URL = "" - indexer = FindDocumentIndexer() - with pytest.raises(RuntimeError) as exc_info: - indexer.push([]) + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert "SEARCH_INDEXER_URL must be set in Django settings before indexing." in str( - exc_info.value - ) + assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) -def test_push_raises_error_if_search_indexer_secret_is_none(settings): +def test_services_search_indexer_secret_is_none(settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None or empty. """ settings.SEARCH_INDEXER_SECRET = None - indexer = FindDocumentIndexer() - with pytest.raises(RuntimeError) as exc_info: - indexer.push([]) + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert ( - "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." - in str(exc_info.value) + assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str( + exc_info.value ) -def test_push_raises_error_if_search_indexer_secret_is_empty(settings): +def test_services_search_indexer_secret_is_empty(settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is empty string. """ settings.SEARCH_INDEXER_SECRET = "" - indexer = FindDocumentIndexer() - with pytest.raises(RuntimeError) as exc_info: - indexer.push([]) + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert ( - "SEARCH_INDEXER_SECRET must be set in Django settings before indexing." - in str(exc_info.value) + assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str( + exc_info.value ) @@ -333,13 +410,10 @@ def test_search_query_raises_error_if_search_endpoint_is_none(settings): Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty. """ settings.SEARCH_INDEXER_QUERY_URL = None - indexer = FindDocumentIndexer() - user = factories.UserFactory() - with pytest.raises(RuntimeError) as exc_info: - indexer.search("alpha", user=user, token="mytoken") + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() - assert ( - "SEARCH_INDEXER_QUERY_URL must be set in Django settings before search." - in str(exc_info.value) + assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str( + exc_info.value ) diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index d5950d1f83..be0b99540e 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -100,6 +100,11 @@ class Base(Configuration): DEFAULT_AUTO_FIELD = "django.db.models.AutoField" # Search + SEARCH_INDEXER_CLASS = values.Value( + default="core.services.search_indexers.FindDocumentIndexer", + environ_name="SEARCH_INDEXER_CLASS", + environ_prefix=None, + ) SEARCH_INDEXER_BATCH_SIZE = values.IntegerValue( default=100_000, environ_name="SEARCH_INDEXER_BATCH_SIZE", environ_prefix=None ) @@ -952,6 +957,11 @@ class Test(Base): # Tests are raising warnings because the /data/static directory does not exist STATIC_ROOT = None + # Setup indexer configuration to make test working on the CI. 
+ SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" # noqa + SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/" + SEARCH_INDEXER_QUERY_URL = "http://localhost:8081/api/v1.0/documents/search/" + CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True) def __init__(self): From f49902a2b2e3e3fec3fd11f14fe5be5fb8bdbc00 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Fri, 12 Sep 2025 14:11:23 +0200 Subject: [PATCH 09/22] =?UTF-8?q?=E2=9C=A8(backend)=20refactor=20indexatio?= =?UTF-8?q?n=20signals=20and=20fix=20circular=20import=20issues?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Fabre Florian --- CHANGELOG.md | 3 ++ src/backend/core/apps.py | 22 ++++++++++----- src/backend/core/models.py | 22 --------------- src/backend/core/signals.py | 28 +++++++++++++++++++ src/backend/core/tasks/find.py | 13 +++++---- .../core/tests/test_models_documents.py | 6 ++-- .../tests/test_services_search_indexers.py | 2 +- 7 files changed, 57 insertions(+), 39 deletions(-) create mode 100644 src/backend/core/signals.py diff --git a/CHANGELOG.md b/CHANGELOG.md index a43001980c..55587ab44d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -97,6 +97,9 @@ and this project adheres to - ♿ update labels and shared document icon accessibility #1442 - 🍱(frontend) Fonts GDPR compliants #1453 - ♻️(service-worker) improve SW registration and update handling #1473 +- ✨(backend) add async indexation of documents on save (or access save) #1276 +- ✨(backend) add debounce mechanism to limit indexation jobs #1276 +- ✨(api) add API route to search for indexed documents in Find #1276 ### Fixed diff --git a/src/backend/core/apps.py b/src/backend/core/apps.py index bba7de0f42..b8ce648834 100644 --- a/src/backend/core/apps.py +++ b/src/backend/core/apps.py @@ -1,11 +1,19 @@ """Impress Core application""" -# from django.apps import AppConfig -# from django.utils.translation import gettext_lazy as _ +from django.apps import AppConfig +from django.utils.translation import gettext_lazy as _ -# class CoreConfig(AppConfig): -# """Configuration class for the impress core app.""" -# name = "core" -# app_label = "core" -# verbose_name = _("impress core application") +class CoreConfig(AppConfig): + """Configuration class for the impress core app.""" + + name = "core" + app_label = "core" + verbose_name = _("Impress core application") + + def ready(self): + """ + Import signals when the app is ready. + """ + # pylint: disable=import-outside-toplevel, unused-import + from . 
import signals # noqa: PLC0415 diff --git a/src/backend/core/models.py b/src/backend/core/models.py index 0960a9bbdb..dcb13f9b16 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -20,9 +20,7 @@ from django.core.files.storage import default_storage from django.core.mail import send_mail from django.db import models, transaction -from django.db.models import signals from django.db.models.functions import Left, Length -from django.dispatch import receiver from django.template.loader import render_to_string from django.utils import timezone from django.utils.functional import cached_property @@ -41,7 +39,6 @@ RoleChoices, get_equivalent_link_definition, ) -from .tasks.find import trigger_document_indexer from .validators import sub_validator logger = getLogger(__name__) @@ -953,16 +950,6 @@ def restore(self): ) -@receiver(signals.post_save, sender=Document) -def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument - """ - Asynchronous call to the document indexer at the end of the transaction. - Note : Within the transaction we can have an empty content and a serialization - error. - """ - trigger_document_indexer(instance, on_commit=True) - - class LinkTrace(BaseModel): """ Relation model to trace accesses to a document via a link by a logged-in user. @@ -1188,15 +1175,6 @@ def get_abilities(self, user): } -@receiver(signals.post_save, sender=DocumentAccess) -def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument - """ - Asynchronous call to the document indexer at the end of the transaction. - """ - if not created: - trigger_document_indexer(instance.document, on_commit=True) - - class DocumentAskForAccess(BaseModel): """Relation model to ask for access to a document.""" diff --git a/src/backend/core/signals.py b/src/backend/core/signals.py new file mode 100644 index 0000000000..b6a1292973 --- /dev/null +++ b/src/backend/core/signals.py @@ -0,0 +1,28 @@ +""" +Declare and configure the signals for the impress core application +""" + +from django.db.models import signals +from django.dispatch import receiver + +from . import models +from .tasks.find import trigger_document_indexer + + +@receiver(signals.post_save, sender=models.Document) +def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument + """ + Asynchronous call to the document indexer at the end of the transaction. + Note : Within the transaction we can have an empty content and a serialization + error. + """ + trigger_document_indexer(instance, on_commit=True) + + +@receiver(signals.post_save, sender=models.DocumentAccess) +def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument + """ + Asynchronous call to the document indexer at the end of the transaction. 
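+    Only saves of existing accesses (updates) schedule a re-indexation of the
+    related document; access creations are ignored here.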
+    """
+    if not created:
+        trigger_document_indexer(instance.document, on_commit=True)
diff --git a/src/backend/core/tasks/find.py b/src/backend/core/tasks/find.py
index 4a58b8304f..10b15c353e 100644
--- a/src/backend/core/tasks/find.py
+++ b/src/backend/core/tasks/find.py
@@ -7,12 +7,6 @@
 from django.core.cache import cache
 from django.db import transaction
 
-from core import models
-from core.services.search_indexers import (
-    get_batch_accesses_by_users_and_teams,
-    get_document_indexer_class,
-)
-
 from impress.celery_app import app
 
 logger = getLogger(__file__)
@@ -52,6 +46,13 @@ def document_indexer_task(document_id):
         logger.info("Skip document %s indexation", document_id)
         return
 
+    # pylint: disable=import-outside-toplevel
+    from core import models  # noqa: PLC0415
+    from core.services.search_indexers import (  # noqa: PLC0415
+        get_batch_accesses_by_users_and_teams,
+        get_document_indexer_class,
+    )
+
     doc = models.Document.objects.get(pk=document_id)
     indexer = get_document_indexer_class()()
     accesses = get_batch_accesses_by_users_and_teams((doc.path,))
diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py
index fb09abaced..99694e4401 100644
--- a/src/backend/core/tests/test_models_documents.py
+++ b/src/backend/core/tests/test_models_documents.py
@@ -3,11 +3,11 @@
 """
 
 # pylint: disable=too-many-lines
-from operator import itemgetter
 import random
 import smtplib
 import time
 from logging import Logger
+from operator import itemgetter
 from unittest import mock
 
 from django.contrib.auth.models import AnonymousUser
@@ -1479,13 +1479,13 @@ def test_models_documents_post_save_indexer(mock_push, settings):
 
     indexer = FindDocumentIndexer()
 
-    assert sorted(data, key=itemgetter('id')) == sorted(
+    assert sorted(data, key=itemgetter("id")) == sorted(
         [
             indexer.serialize_document(doc1, accesses),
             indexer.serialize_document(doc2, accesses),
             indexer.serialize_document(doc3, accesses),
         ],
-        key=itemgetter('id'),
+        key=itemgetter("id"),
     )
 
     # The debounce counters should be reset
diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py
index 9b4309acfa..e0f43fcaba 100644
--- a/src/backend/core/tests/test_services_search_indexers.py
+++ b/src/backend/core/tests/test_services_search_indexers.py
@@ -36,7 +36,6 @@ def format_response(self, data: dict):
         return {}
 
 
-
 @pytest.fixture(name="fake_indexer_settings")
 def fake_indexer_settings_fixture(settings):
     """Fixture to switch the indexer to the FakeDocumentIndexer."""
@@ -103,6 +102,7 @@ def test_services_search_indexer_class(fake_indexer_settings):
         "core.tests.test_services_search_indexers.FakeDocumentIndexer"
     )
 
+
 def test_services_search_indexer_url_is_none(settings):
     """
     Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty.

From 9525b9728696ad34ee0f31b5726d10346101514b Mon Sep 17 00:00:00 2001
From: Fabre Florian
Date: Wed, 17 Sep 2025 07:47:15 +0200
Subject: [PATCH 10/22] =?UTF-8?q?=E2=9C=A8(backend)=20add=20fallback=20sea?=
 =?UTF-8?q?rch=20&=20default=20ordering?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Filter deleted documents out of the visited ones.
Set the default ordering (-updated_at) on the Find API search call.
BaseDocumentIndexer.search now returns a list of document ids
instead of model instances.
Do not call the indexer in signals when SEARCH_INDEXER_CLASS is
not defined or not properly configured.
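
For reference, a sketch of the query payload now sent to the Find
search endpoint (values are illustrative):

    {
        "q": "alpha",
        "visited": [],  # ids of active public documents visited by the user
        "services": ["docs"],
        "page_number": 1,
        "page_size": 20,
        "order_by": "updated_at",
        "order_direction": "desc",
    }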
Signed-off-by: Fabre Florian
---
 src/backend/core/api/serializers.py           |   4 +
 src/backend/core/api/viewsets.py              |  46 +++-
 src/backend/core/services/search_indexers.py  |  78 ++++--
 src/backend/core/signals.py                   |  11 +-
 src/backend/core/tasks/find.py                |  47 ++--
 src/backend/core/tests/commands/test_index.py |   1 +
 src/backend/core/tests/conftest.py            |  29 +++
 .../documents/test_api_documents_search.py    | 122 +++++++--
 .../core/tests/test_models_documents.py       | 129 ++++++++-
 .../tests/test_services_search_indexers.py    | 245 +++++++++++++-----
 src/backend/impress/settings.py               |   7 +-
 11 files changed, 562 insertions(+), 157 deletions(-)

diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py
index d99e1f5b19..ca6b444dee 100644
--- a/src/backend/core/api/serializers.py
+++ b/src/backend/core/api/serializers.py
@@ -895,3 +895,7 @@ class FindDocumentSerializer(serializers.Serializer):
     """Serializer for Find search requests"""
 
     q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
+    page_size = serializers.IntegerField(
+        required=False, min_value=1, max_value=50, default=20
+    )
+    page = serializers.IntegerField(required=False, min_value=1, default=1)
diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py
index 5b9ef3e8f2..0b383ffa4c 100644
--- a/src/backend/core/api/viewsets.py
+++ b/src/backend/core/api/viewsets.py
@@ -12,7 +12,7 @@
 from django.contrib.postgres.aggregates import ArrayAgg
 from django.contrib.postgres.search import TrigramSimilarity
 from django.core.cache import cache
-from django.core.exceptions import ImproperlyConfigured, ValidationError
+from django.core.exceptions import ValidationError
 from django.core.files.storage import default_storage
 from django.core.validators import URLValidator
 from django.db import connection, transaction
@@ -49,7 +49,10 @@
 from core.services.converter_services import (
     YdocConverter,
 )
-from core.services.search_indexers import get_document_indexer_class
+from core.services.search_indexers import (
+    default_document_indexer,
+    get_visited_document_ids_of,
+)
 from core.tasks.mail import send_ask_for_access_mail
 from core.utils import extract_attachments, filter_descendants
 
@@ -1076,23 +1079,42 @@ def search(self, request, *args, **kwargs):
         """
         Returns a DRF response containing the filtered, annotated and ordered document list.
         The filtering allows full-text search through the OpenSearch indexation app "Find".
""" access_token = request.session.get("oidc_access_token") + user = request.user serializer = serializers.FindDocumentSerializer(data=request.query_params) serializer.is_valid(raise_exception=True) - try: - indexer = get_document_indexer_class()() - queryset = indexer.search( - text=serializer.validated_data.get("q", ""), - user=request.user, - token=access_token, + indexer = default_document_indexer() + + if not indexer: + queryset = self.get_queryset() + filterset = DocumentFilter( + {"title": serializer.validated_data.get("q", "")}, queryset=queryset ) - except ImproperlyConfigured: - return drf.response.Response( - {"detail": "The service is not properly configured."}, - status=status.HTTP_401_UNAUTHORIZED, + + if not filterset.is_valid(): + raise drf.exceptions.ValidationError(filterset.errors) + + queryset = filterset.filter_queryset(queryset).order_by("-updated_at") + + return self.get_response_for_queryset( + queryset, + context={ + "request": request, + }, ) + queryset = models.Document.objects.all() + results = indexer.search( + text=serializer.validated_data.get("q", ""), + token=access_token, + visited=get_visited_document_ids_of(queryset, user), + page=serializer.validated_data.get("page", 1), + page_size=serializer.validated_data.get("page_size", 20), + ) + + queryset = queryset.filter(pk__in=results) + return self.get_response_for_queryset( queryset, context={ diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index 48a437becb..255aa3bbba 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -8,6 +8,7 @@ from django.conf import settings from django.contrib.auth.models import AnonymousUser from django.core.exceptions import ImproperlyConfigured +from django.db.models import Subquery from django.utils.module_loading import import_string import requests @@ -18,7 +19,23 @@ @cache -def get_document_indexer_class() -> "BaseDocumentIndexer": +def default_document_indexer(): + """Returns default indexer service is enabled and properly configured.""" + + # For this usecase an empty indexer class is not an issue but a feature. + if not getattr(settings, "SEARCH_INDEXER_CLASS", None): + logger.info("Document indexer is not configured (see SEARCH_INDEXER_CLASS)") + return None + + try: + return get_document_indexer_class()() + except ImproperlyConfigured as err: + logger.error("Document indexer is not properly configured : %s", err) + return None + + +@cache +def get_document_indexer_class(): """Return the indexer backend class based on the settings.""" classpath = settings.SEARCH_INDEXER_CLASS @@ -65,7 +82,7 @@ def get_batch_accesses_by_users_and_teams(paths): return dict(access_by_document_path) -def get_visited_document_ids_of(user): +def get_visited_document_ids_of(queryset, user): """ Returns the ids of the documents that have a linktrace to the user and NOT owned. 
It will be use to limit the opensearch responses to the public documents already @@ -74,11 +91,18 @@ def get_visited_document_ids_of(user): if isinstance(user, AnonymousUser): return [] - qs = models.LinkTrace.objects.filter(user=user).exclude( - document__accesses__user=user, + qs = models.LinkTrace.objects.filter(user=user) + + docs = ( + queryset.exclude(accesses__user=user) + .filter( + deleted_at__isnull=True, + ancestors_deleted_at__isnull=True, + ) + .filter(pk__in=Subquery(qs.values("document_id"))) ) - return list({str(id) for id in qs.values_list("document_id", flat=True)}) + return list({str(id) for id in docs.values_list("pk", flat=True)}) class BaseDocumentIndexer(ABC): @@ -159,22 +183,41 @@ def push(self, data): Must be implemented by subclasses. """ - def search(self, text, user, token): + # pylint: disable-next=too-many-arguments,too-many-positional-arguments + def search(self, text, token, visited=(), page=1, page_size=50): """ Search for documents in Find app. - """ - visited_ids = get_visited_document_ids_of(user) + Ensure the same default ordering as "Docs" list : -updated_at + Returns ids of the documents + + Args: + text (str): Text search content. + token (str): OIDC Authentication token. + visited (list, optional): + List of ids of active public documents with LinkTrace + Defaults to settings.SEARCH_INDEXER_BATCH_SIZE. + page (int, optional): + The page number to retrieve. + Defaults to 1 if not specified. + page_size (int, optional): + The number of results to return per page. + Defaults to 50 if not specified. + """ response = self.search_query( data={ "q": text, - "visited": visited_ids, + "visited": visited, "services": ["docs"], + "page_number": page, + "page_size": page_size, + "order_by": "updated_at", + "order_direction": "desc", }, token=token, ) - return self.format_response(response) + return [d["_id"] for d in response] @abstractmethod def search_query(self, data, token) -> dict: @@ -184,14 +227,6 @@ def search_query(self, data, token) -> dict: Must be implemented by subclasses. """ - @abstractmethod - def format_response(self, data: dict): - """ - Convert the JSON response from Find app as document queryset. - - Must be implemented by subclasses. - """ - class FindDocumentIndexer(BaseDocumentIndexer): """ @@ -253,12 +288,6 @@ def search_query(self, data, token) -> requests.Response: logger.error("HTTPError: %s", e) raise - def format_response(self, data: dict): - """ - Retrieve documents ids from Find app response and return a queryset. - """ - return models.Document.objects.filter(pk__in=[d["_id"] for d in data]) - def push(self, data): """ Push a batch of documents to the Find backend. @@ -266,7 +295,6 @@ def push(self, data): Args: data (list): List of document dictionaries. """ - try: response = requests.post( self.indexer_url, diff --git a/src/backend/core/signals.py b/src/backend/core/signals.py index b6a1292973..fbb9368a90 100644 --- a/src/backend/core/signals.py +++ b/src/backend/core/signals.py @@ -2,10 +2,14 @@ Declare and configure the signals for the impress core application """ +from functools import partial + +from django.db import transaction from django.db.models import signals from django.dispatch import receiver from . import models +from .services.search_indexers import default_document_indexer from .tasks.find import trigger_document_indexer @@ -16,7 +20,8 @@ def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-ar Note : Within the transaction we can have an empty content and a serialization error. 
""" - trigger_document_indexer(instance, on_commit=True) + if default_document_indexer() is not None: + transaction.on_commit(partial(trigger_document_indexer, instance)) @receiver(signals.post_save, sender=models.DocumentAccess) @@ -24,5 +29,5 @@ def document_access_post_save(sender, instance, created, **kwargs): # pylint: d """ Asynchronous call to the document indexer at the end of the transaction. """ - if not created: - trigger_document_indexer(instance.document, on_commit=True) + if not created and default_document_indexer() is not None: + transaction.on_commit(partial(trigger_document_indexer, instance.document)) diff --git a/src/backend/core/tasks/find.py b/src/backend/core/tasks/find.py index 10b15c353e..f4d4149545 100644 --- a/src/backend/core/tasks/find.py +++ b/src/backend/core/tasks/find.py @@ -1,11 +1,9 @@ """Trigger document indexation using celery task.""" -from functools import partial from logging import getLogger from django.conf import settings from django.core.cache import cache -from django.db import transaction from impress.celery_app import app @@ -37,7 +35,7 @@ def decr_counter(key): @app.task def document_indexer_task(document_id): - """Send indexation query for a document using celery task.""" + """Celery Task : Sends indexation query for a document.""" key = document_indexer_debounce_key(document_id) # check if the counter : if still up, skip the task. only the last one @@ -46,6 +44,7 @@ def document_indexer_task(document_id): logger.info("Skip document %s indexation", document_id) return + # Prevents some circular imports # pylint: disable=import-outside-toplevel from core import models # noqa: PLC0415 from core.services.search_indexers import ( # noqa: PLC0415 @@ -63,35 +62,27 @@ def document_indexer_task(document_id): indexer.push(data) -def trigger_document_indexer(document, on_commit=False): +def trigger_document_indexer(document): """ Trigger indexation task with debounce a delay set by the SEARCH_INDEXER_COUNTDOWN setting. Args: document (Document): The document instance. - on_commit (bool): Wait for the end of the transaction before starting the task - (some fields may be in wrong state within the transaction) """ - if document.deleted_at or document.ancestors_deleted_at: - pass - - if on_commit: - transaction.on_commit( - partial(trigger_document_indexer, document, on_commit=False) - ) - else: - key = document_indexer_debounce_key(document.pk) - countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1) - - logger.info( - "Add task for document %s indexation in %.2f seconds", - document.pk, - countdown, - ) - - # Each time this method is called during the countdown, we increment the - # counter and each task decrease it, so the index be run only once. - incr_counter(key) - - document_indexer_task.apply_async(args=[document.pk], countdown=countdown) + return + + key = document_indexer_debounce_key(document.pk) + countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1) + + logger.info( + "Add task for document %s indexation in %.2f seconds", + document.pk, + countdown, + ) + + # Each time this method is called during the countdown, we increment the + # counter and each task decrease it, so the index be run only once. 
+ incr_counter(key) + + document_indexer_task.apply_async(args=[document.pk], countdown=countdown) diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py index d30e01181c..169e0b830b 100644 --- a/src/backend/core/tests/commands/test_index.py +++ b/src/backend/core/tests/commands/test_index.py @@ -15,6 +15,7 @@ @pytest.mark.django_db +@pytest.mark.usefixtures("indexer_settings") def test_index(): """Test the command `index` that run the Find app indexer for all the available documents.""" user = factories.UserFactory() diff --git a/src/backend/core/tests/conftest.py b/src/backend/core/tests/conftest.py index 00e830e18f..2102879b52 100644 --- a/src/backend/core/tests/conftest.py +++ b/src/backend/core/tests/conftest.py @@ -24,3 +24,32 @@ def mock_user_teams(): "core.models.User.teams", new_callable=mock.PropertyMock ) as mock_teams: yield mock_teams + + +@pytest.fixture(name="indexer_settings") +def indexer_settings_fixture(settings): + """ + Setup valid settings for the document indexer. Clear the indexer cache. + """ + + # pylint: disable-next=import-outside-toplevel + from core.services.search_indexers import ( # noqa: PLC0415 + default_document_indexer, + get_document_indexer_class, + ) + + default_document_indexer.cache_clear() + get_document_indexer_class.cache_clear() + + settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.FindDocumentIndexer" + settings.SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" + settings.SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/" + settings.SEARCH_INDEXER_QUERY_URL = ( + "http://localhost:8081/api/v1.0/documents/search/" + ) + + yield settings + + # clear cache to prevent issues with other tests + default_document_indexer.cache_clear() + get_document_indexer_class.cache_clear() diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py index f5dcfa2425..2c3b2d3523 100644 --- a/src/backend/core/tests/documents/test_api_documents_search.py +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -2,12 +2,15 @@ Tests for Documents API endpoint in impress's core app: list """ +from json import loads as json_loads + import pytest import responses from faker import Faker from rest_framework.test import APIClient from core import factories, models +from core.services.search_indexers import default_document_indexer fake = Faker() pytestmark = pytest.mark.django_db @@ -16,13 +19,12 @@ @pytest.mark.parametrize("role", models.LinkRoleChoices.values) @pytest.mark.parametrize("reach", models.LinkReachChoices.values) @responses.activate -def test_api_documents_search_anonymous(reach, role, settings): +def test_api_documents_search_anonymous(reach, role, indexer_settings): """ Anonymous users should not be allowed to search documents whatever the link reach and link role """ - factories.DocumentFactory(link_reach=reach, link_role=role) - settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" factories.DocumentFactory(link_reach=reach, link_role=role) @@ -45,46 +47,90 @@ def test_api_documents_search_anonymous(reach, role, settings): } -def test_api_documents_search_endpoint_is_none(settings): - """Missing SEARCH_INDEXER_QUERY_URL should throw an error""" - settings.SEARCH_INDEXER_QUERY_URL = None +def test_api_documents_search_endpoint_is_none(indexer_settings): + """ + Missing SEARCH_INDEXER_QUERY_URL, so the 
indexer is not properly configured. + Should fallback on title filter + """ + indexer_settings.SEARCH_INDEXER_QUERY_URL = None + + assert default_document_indexer() is None user = factories.UserFactory() + document = factories.DocumentFactory(title="alpha") + access = factories.UserDocumentAccessFactory(document=document, user=user) client = APIClient() client.force_login(user) - response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) + response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"}) - assert response.status_code == 401 - assert response.json() == {"detail": "The service is not properly configured."} + assert response.status_code == 200 + content = response.json() + results = content.pop("results") + assert content == { + "count": 1, + "next": None, + "previous": None, + } + assert len(results) == 1 + assert results[0] == { + "id": str(document.id), + "abilities": document.get_abilities(user), + "ancestors_link_reach": None, + "ancestors_link_role": None, + "computed_link_reach": document.computed_link_reach, + "computed_link_role": document.computed_link_role, + "created_at": document.created_at.isoformat().replace("+00:00", "Z"), + "creator": str(document.creator.id), + "depth": 1, + "excerpt": document.excerpt, + "link_reach": document.link_reach, + "link_role": document.link_role, + "nb_accesses_ancestors": 1, + "nb_accesses_direct": 1, + "numchild": 0, + "path": document.path, + "title": document.title, + "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"), + "user_role": access.role, + } @responses.activate -def test_api_documents_search_invalid_params(settings): +def test_api_documents_search_invalid_params(indexer_settings): """Validate the format of documents as returned by the search view.""" - settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" user = factories.UserFactory() client = APIClient() client.force_login(user) - response = APIClient().get("/api/v1.0/documents/search/") + response = client.get("/api/v1.0/documents/search/") assert response.status_code == 400 assert response.json() == {"q": ["This field is required."]} - response = APIClient().get("/api/v1.0/documents/search/", data={"q": " "}) + response = client.get("/api/v1.0/documents/search/", data={"q": " "}) assert response.status_code == 400 assert response.json() == {"q": ["This field may not be blank."]} + response = client.get( + "/api/v1.0/documents/search/", data={"q": "any", "page": "NaN"} + ) + + assert response.status_code == 400 + assert response.json() == {"page": ["A valid integer is required."]} + @responses.activate -def test_api_documents_search_format(settings): +def test_api_documents_search_format(indexer_settings): """Validate the format of documents as returned by the search view.""" - settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + assert default_document_indexer() is not None user = factories.UserFactory() @@ -140,3 +186,49 @@ def test_api_documents_search_format(settings): "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"), "user_role": access.role, } + + +@responses.activate +def test_api_documents_search_pagination(indexer_settings): + """Documents should be ordered by descending "updated_at" by default""" + indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + assert default_document_indexer() is 
not None + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + docs = factories.DocumentFactory.create_batch(10) + + # Find response + # pylint: disable-next=assignment-from-none + api_search = responses.add( + responses.POST, + "http://find/api/v1.0/search", + json=[{"_id": str(doc.pk)} for doc in docs], + status=200, + ) + + response = client.get( + "/api/v1.0/documents/search/", data={"q": "alpha", "page": 2, "page_size": 5} + ) + + assert response.status_code == 200 + content = response.json() + results = content.pop("results") + assert len(results) == 5 + + # Check the query parameters. + assert api_search.call_count == 1 + assert api_search.calls[0].response.status_code == 200 + assert json_loads(api_search.calls[0].request.body) == { + "q": "alpha", + "visited": [], + "services": ["docs"], + "page_number": 2, + "page_size": 5, + "order_by": "updated_at", + "order_direction": "desc", + } diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index 99694e4401..e65c00877c 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -1454,9 +1454,9 @@ def test_models_documents_compute_ancestors_links_paths_mapping_structure( @mock.patch.object(FindDocumentIndexer, "push") @pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer(mock_push, settings): +def test_models_documents_post_save_indexer(mock_push, indexer_settings): """Test indexation task on document creation""" - settings.SEARCH_INDEXER_COUNTDOWN = 0 + indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 user = factories.UserFactory() @@ -1494,10 +1494,127 @@ def test_models_documents_post_save_indexer(mock_push, settings): assert cache.get(document_indexer_debounce_key(doc3.pk)) == 0 +@mock.patch.object(FindDocumentIndexer, "push") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings): + """Skip indexation task on deleted or ancestor_deleted documents""" + indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 + + user = factories.UserFactory() + + with transaction.atomic(): + doc = factories.DocumentFactory() + doc_deleted = factories.DocumentFactory() + doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted) + doc_deleted.soft_delete() + doc_ancestor_deleted.ancestors_deleted_at = doc_deleted.deleted_at + + factories.UserDocumentAccessFactory(document=doc, user=user) + factories.UserDocumentAccessFactory(document=doc_deleted, user=user) + factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user) + + doc_deleted.refresh_from_db() + doc_ancestor_deleted.refresh_from_db() + + assert doc_deleted.deleted_at is not None + assert doc_deleted.ancestors_deleted_at is not None + + assert doc_ancestor_deleted.deleted_at is None + assert doc_ancestor_deleted.ancestors_deleted_at is not None + + time.sleep(0.2) # waits for the end of the tasks + + accesses = { + str(doc.path): {"users": [user.sub]}, + str(doc_deleted.path): {"users": [user.sub]}, + str(doc_ancestor_deleted.path): {"users": [user.sub]}, + } + + data = [call.args[0] for call in mock_push.call_args_list] + + indexer = FindDocumentIndexer() + + # Only the not deleted document is indexed + assert data == [ + indexer.serialize_document(doc, accesses), + ] + + # The debounce counters should be reset + assert cache.get(document_indexer_debounce_key(doc.pk)) == 0 + + # These caches are not filled + assert 
cache.get(document_indexer_debounce_key(doc_deleted.pk)) is None + assert cache.get(document_indexer_debounce_key(doc_ancestor_deleted.pk)) is None + + +@mock.patch.object(FindDocumentIndexer, "push") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings): + """Restart indexation task on restored documents""" + indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 + + user = factories.UserFactory() + + with transaction.atomic(): + doc = factories.DocumentFactory() + doc_deleted = factories.DocumentFactory() + doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted) + doc_deleted.soft_delete() + doc_ancestor_deleted.ancestors_deleted_at = doc_deleted.deleted_at + + factories.UserDocumentAccessFactory(document=doc, user=user) + factories.UserDocumentAccessFactory(document=doc_deleted, user=user) + factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user) + + doc_deleted.refresh_from_db() + doc_ancestor_deleted.refresh_from_db() + + assert doc_deleted.deleted_at is not None + assert doc_deleted.ancestors_deleted_at is not None + + assert doc_ancestor_deleted.deleted_at is None + assert doc_ancestor_deleted.ancestors_deleted_at is not None + + time.sleep(0.2) # waits for the end of the tasks + + doc_deleted.restore() + + doc_deleted.refresh_from_db() + doc_ancestor_deleted.refresh_from_db() + + assert doc_deleted.deleted_at is None + assert doc_deleted.ancestors_deleted_at is None + + assert doc_ancestor_deleted.deleted_at is None + assert doc_ancestor_deleted.ancestors_deleted_at is None + + time.sleep(0.2) + + accesses = { + str(doc.path): {"users": [user.sub]}, + str(doc_deleted.path): {"users": [user.sub]}, + str(doc_ancestor_deleted.path): {"users": [user.sub]}, + } + + data = [call.args[0] for call in mock_push.call_args_list] + + indexer = FindDocumentIndexer() + + # All docs are re-indexed + assert sorted(data, key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(doc, accesses), + indexer.serialize_document(doc_deleted, accesses), + # The restored document child is not saved so no indexation. 
+ ], + key=itemgetter("id"), + ) + + @pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer_debounce(settings): +def test_models_documents_post_save_indexer_debounce(indexer_settings): """Test indexation task skipping on document update""" - settings.SEARCH_INDEXER_COUNTDOWN = 0 + indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 indexer = FindDocumentIndexer() user = factories.UserFactory() @@ -1542,9 +1659,9 @@ def test_models_documents_post_save_indexer_debounce(settings): @pytest.mark.django_db(transaction=True) -def test_models_documents_access_post_save_indexer(settings): +def test_models_documents_access_post_save_indexer(indexer_settings): """Test indexation task on DocumentAccess update""" - settings.SEARCH_INDEXER_COUNTDOWN = 0 + indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 indexer = FindDocumentIndexer() user = factories.UserFactory() diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index e0f43fcaba..63c2d305ea 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -1,6 +1,7 @@ """Tests for Documents search indexers""" from functools import partial +from json import dumps as json_dumps from unittest.mock import patch from django.contrib.auth.models import AnonymousUser @@ -8,11 +9,14 @@ from django.utils.module_loading import import_string import pytest +import responses +from requests import HTTPError from core import factories, models, utils from core.services.search_indexers import ( BaseDocumentIndexer, FindDocumentIndexer, + default_document_indexer, get_document_indexer_class, get_visited_document_ids_of, ) @@ -32,39 +36,19 @@ def push(self, data): def search_query(self, data, token): return {} - def format_response(self, data: dict): - return {} - - -@pytest.fixture(name="fake_indexer_settings") -def fake_indexer_settings_fixture(settings): - """Fixture to switch the indexer to the FakeDocumentIndexer.""" - _original_backend = str(settings.SEARCH_INDEXER_CLASS) - - settings.SEARCH_INDEXER_CLASS = ( - "core.tests.test_services_search_indexers.FakeDocumentIndexer" - ) - get_document_indexer_class.cache_clear() - - yield settings - - settings.SEARCH_INDEXER_CLASS = _original_backend - # clear cache to prevent issues with other tests - get_document_indexer_class.cache_clear() - -def test_services_search_indexer_class_is_empty(fake_indexer_settings): +def test_services_search_indexer_class_is_empty(indexer_settings): """ Should raise ImproperlyConfigured if SEARCH_INDEXER_CLASS is None or empty. """ - fake_indexer_settings.SEARCH_INDEXER_CLASS = None + indexer_settings.SEARCH_INDEXER_CLASS = None with pytest.raises(ImproperlyConfigured) as exc_info: get_document_indexer_class() assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) - fake_indexer_settings.SEARCH_INDEXER_CLASS = "" + indexer_settings.SEARCH_INDEXER_CLASS = "" # clear cache again get_document_indexer_class.cache_clear() @@ -75,11 +59,11 @@ def test_services_search_indexer_class_is_empty(fake_indexer_settings): assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) -def test_services_search_indexer_class_invalid(fake_indexer_settings): +def test_services_search_indexer_class_invalid(indexer_settings): """ Should raise RuntimeError if SEARCH_INDEXER_CLASS cannot be imported. 
""" - fake_indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown" + indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown" with pytest.raises(ImproperlyConfigured) as exc_info: get_document_indexer_class() @@ -90,11 +74,11 @@ def test_services_search_indexer_class_invalid(fake_indexer_settings): ) -def test_services_search_indexer_class(fake_indexer_settings): +def test_services_search_indexer_class(indexer_settings): """ Import indexer class defined in setting SEARCH_INDEXER_CLASS. """ - fake_indexer_settings.SEARCH_INDEXER_CLASS = ( + indexer_settings.SEARCH_INDEXER_CLASS = ( "core.tests.test_services_search_indexers.FakeDocumentIndexer" ) @@ -103,11 +87,43 @@ def test_services_search_indexer_class(fake_indexer_settings): ) -def test_services_search_indexer_url_is_none(settings): +def test_services_search_indexer_is_configured(indexer_settings): + """ + Should return true only when the indexer class and other configuration settings + are valid. + """ + indexer_settings.SEARCH_INDEXER_CLASS = None + + # None + default_document_indexer.cache_clear() + assert not default_document_indexer() + + # Empty + indexer_settings.SEARCH_INDEXER_CLASS = "" + + default_document_indexer.cache_clear() + assert not default_document_indexer() + + # Valid class + indexer_settings.SEARCH_INDEXER_CLASS = ( + "core.services.search_indexers.FindDocumentIndexer" + ) + + default_document_indexer.cache_clear() + assert default_document_indexer() is not None + + indexer_settings.SEARCH_INDEXER_URL = "" + + # Invalid url + default_document_indexer.cache_clear() + assert not default_document_indexer() + + +def test_services_search_indexer_url_is_none(indexer_settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_URL is None or empty. """ - settings.SEARCH_INDEXER_URL = None + indexer_settings.SEARCH_INDEXER_URL = None with pytest.raises(ImproperlyConfigured) as exc_info: FindDocumentIndexer() @@ -115,11 +131,11 @@ def test_services_search_indexer_url_is_none(settings): assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) -def test_services_search_indexer_url_is_empty(settings): +def test_services_search_indexer_url_is_empty(indexer_settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_URL is empty string. """ - settings.SEARCH_INDEXER_URL = "" + indexer_settings.SEARCH_INDEXER_URL = "" with pytest.raises(ImproperlyConfigured) as exc_info: FindDocumentIndexer() @@ -127,11 +143,11 @@ def test_services_search_indexer_url_is_empty(settings): assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) -def test_services_search_indexer_secret_is_none(settings): +def test_services_search_indexer_secret_is_none(indexer_settings): """ - Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None or empty. + Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is None. """ - settings.SEARCH_INDEXER_SECRET = None + indexer_settings.SEARCH_INDEXER_SECRET = None with pytest.raises(ImproperlyConfigured) as exc_info: FindDocumentIndexer() @@ -141,11 +157,11 @@ def test_services_search_indexer_secret_is_none(settings): ) -def test_services_search_indexer_secret_is_empty(settings): +def test_services_search_indexer_secret_is_empty(indexer_settings): """ Indexer should raise RuntimeError if SEARCH_INDEXER_SECRET is empty string. 
""" - settings.SEARCH_INDEXER_SECRET = "" + indexer_settings.SEARCH_INDEXER_SECRET = "" with pytest.raises(ImproperlyConfigured) as exc_info: FindDocumentIndexer() @@ -155,6 +171,35 @@ def test_services_search_indexer_secret_is_empty(settings): ) +def test_services_search_endpoint_is_none(indexer_settings): + """ + Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None. + """ + indexer_settings.SEARCH_INDEXER_QUERY_URL = None + + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() + + assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str( + exc_info.value + ) + + +def test_services_search_endpoint_is_empty(indexer_settings): + """ + Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is empty. + """ + indexer_settings.SEARCH_INDEXER_QUERY_URL = "" + + with pytest.raises(ImproperlyConfigured) as exc_info: + FindDocumentIndexer() + + assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str( + exc_info.value + ) + + +@pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_serialize_document_returns_expected_json(): """ It should serialize documents with correct metadata and access control. @@ -195,6 +240,7 @@ def test_services_search_indexers_serialize_document_returns_expected_json(): } +@pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_serialize_document_deleted(): """Deleted documents are marked as just in the serialized json.""" parent = factories.DocumentFactory() @@ -209,6 +255,7 @@ def test_services_search_indexers_serialize_document_deleted(): assert result["is_active"] is False +@pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_serialize_document_empty(): """Empty documents returns empty content in the serialized json.""" document = factories.DocumentFactory(content="", title=None) @@ -220,13 +267,35 @@ def test_services_search_indexers_serialize_document_empty(): assert result["title"] == "" +@responses.activate +def test_services_search_indexers_index_errors(indexer_settings): + """ + Documents indexing response handling on Find API HTTP errors. + """ + factories.DocumentFactory() + + indexer_settings.SEARCH_INDEXER_URL = "http://app-find/api/v1.0/documents/index/" + + responses.add( + responses.POST, + "http://app-find/api/v1.0/documents/index/", + status=401, + body=json_dumps({"message": "Authentication failed."}), + ) + + with pytest.raises(HTTPError): + FindDocumentIndexer().index() + + @patch.object(FindDocumentIndexer, "push") -def test_services_search_indexers_batches_pass_only_batch_accesses(mock_push, settings): +def test_services_search_indexers_batches_pass_only_batch_accesses( + mock_push, indexer_settings +): """ Documents indexing should be processed in batches, and only the access data relevant to each batch should be used. 
""" - settings.SEARCH_INDEXER_BATCH_SIZE = 2 + indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2 documents = factories.DocumentFactory.create_batch(5) # Attach a single user access to each document @@ -259,6 +328,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses(mock_push, se @patch.object(FindDocumentIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ancestors_link_reach(mock_push): """Document accesses and reach should take into account ancestors link reaches.""" great_grand_parent = factories.DocumentFactory(link_reach="restricted") @@ -279,6 +349,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push): @patch.object(FindDocumentIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ancestors_users(mock_push): """Document accesses and reach should include users from ancestors.""" user_gp, user_p, user_d = factories.UserFactory.create_batch(3) @@ -301,6 +372,7 @@ def test_services_search_indexers_ancestors_users(mock_push): @patch.object(FindDocumentIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ancestors_teams(mock_push): """Document accesses and reach should include teams from ancestors.""" grand_parent = factories.DocumentFactory(teams=["team_gp"]) @@ -317,12 +389,12 @@ def test_services_search_indexers_ancestors_teams(mock_push): @patch("requests.post") -def test_push_uses_correct_url_and_data(mock_post, settings): +def test_push_uses_correct_url_and_data(mock_post, indexer_settings): """ push() should call requests.post with the correct URL from settings the timeout set to 10 seconds and the data as JSON. """ - settings.SEARCH_INDEXER_URL = "http://example.com/index" + indexer_settings.SEARCH_INDEXER_URL = "http://example.com/index" indexer = FindDocumentIndexer() sample_data = [{"id": "123", "title": "Test"}] @@ -335,7 +407,7 @@ def test_push_uses_correct_url_and_data(mock_post, settings): mock_post.assert_called_once() args, kwargs = mock_post.call_args - assert args[0] == settings.SEARCH_INDEXER_URL + assert args[0] == indexer_settings.SEARCH_INDEXER_URL assert kwargs.get("json") == sample_data assert kwargs.get("timeout") == 10 @@ -348,9 +420,10 @@ def test_get_visited_document_ids_of(): user = factories.UserFactory() other = factories.UserFactory() anonymous = AnonymousUser() + queryset = models.Document.objects.all() - assert not get_visited_document_ids_of(anonymous) - assert not get_visited_document_ids_of(user) + assert not get_visited_document_ids_of(queryset, anonymous) + assert not get_visited_document_ids_of(queryset, user) doc1, doc2, _ = factories.DocumentFactory.create_batch(3) @@ -360,7 +433,7 @@ def test_get_visited_document_ids_of(): create_link(document=doc2) # The third document is not visited - assert sorted(get_visited_document_ids_of(user)) == sorted( + assert sorted(get_visited_document_ids_of(queryset, user)) == sorted( [str(doc1.pk), str(doc2.pk)] ) @@ -368,11 +441,67 @@ def test_get_visited_document_ids_of(): factories.UserDocumentAccessFactory(user=user, document=doc2) # The second document have an access for the user - assert get_visited_document_ids_of(user) == [str(doc1.pk)] + assert get_visited_document_ids_of(queryset, user) == [str(doc1.pk)] + + +@pytest.mark.usefixtures("indexer_settings") +def test_get_visited_document_ids_of_deleted(): + """ + get_visited_document_ids_of() returns the ids of the documents viewed + by the user if they are not deleted. 
+    """
+    user = factories.UserFactory()
+    anonymous = AnonymousUser()
+    queryset = models.Document.objects.all()
+
+    assert not get_visited_document_ids_of(queryset, anonymous)
+    assert not get_visited_document_ids_of(queryset, user)
+
+    doc = factories.DocumentFactory()
+    doc_deleted = factories.DocumentFactory()
+    doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted)
+
+    create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)
+
+    create_link(document=doc)
+    create_link(document=doc_deleted)
+    create_link(document=doc_ancestor_deleted)
+
+    # All documents are visited
+    assert sorted(get_visited_document_ids_of(queryset, user)) == sorted(
+        [str(doc.pk), str(doc_deleted.pk), str(doc_ancestor_deleted.pk)]
+    )
+
+    doc_deleted.soft_delete()
+
+    # Only the first document is not deleted
+    assert get_visited_document_ids_of(queryset, user) == [str(doc.pk)]
+
+
+@responses.activate
+def test_services_search_indexers_search_errors(indexer_settings):
+    """
+    Documents search response handling on Find API HTTP errors.
+    """
+    factories.DocumentFactory()
+
+    indexer_settings.SEARCH_INDEXER_QUERY_URL = (
+        "http://app-find/api/v1.0/documents/search/"
+    )
+
+    responses.add(
+        responses.POST,
+        "http://app-find/api/v1.0/documents/search/",
+        status=401,
+        body=json_dumps({"message": "Authentication failed."}),
+    )
+
+    with pytest.raises(HTTPError):
+        FindDocumentIndexer().search("alpha", token="mytoken")
 
 
 @patch("requests.post")
-def test_services_search_indexers_search(mock_post, settings):
+def test_services_search_indexers_search(mock_post, indexer_settings):
     """
     search() should call requests.post to SEARCH_INDEXER_QUERY_URL with
     the document ids from linktraces.
@@ -390,30 +519,22 @@ def test_services_search_indexers_search(mock_post, settings):
     create_link(document=doc1)
     create_link(document=doc2)
 
-    indexer.search("alpha", user=user, token="mytoken")
+    visited = get_visited_document_ids_of(models.Document.objects.all(), user)
+
+    indexer.search("alpha", visited=visited, token="mytoken")
 
     args, kwargs = mock_post.call_args
-    assert args[0] == settings.SEARCH_INDEXER_QUERY_URL
+    assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL
 
     query_data = kwargs.get("json")
     assert query_data["q"] == "alpha"
     assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)])
     assert query_data["services"] == ["docs"]
+    assert query_data["page_number"] == 1
+    assert query_data["page_size"] == 50
+    assert query_data["order_by"] == "updated_at"
+    assert query_data["order_direction"] == "desc"
    assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"}
     assert kwargs.get("timeout") == 10
-
-
-def test_search_query_raises_error_if_search_endpoint_is_none(settings):
-    """
-    Indexer should raise RuntimeError if SEARCH_INDEXER_QUERY_URL is None or empty.
-    """
-    settings.SEARCH_INDEXER_QUERY_URL = None
-
-    with pytest.raises(ImproperlyConfigured) as exc_info:
-        FindDocumentIndexer()
-
-    assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." 
in str( - exc_info.value - ) diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index be0b99540e..83a0e20a4f 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -101,7 +101,7 @@ class Base(Configuration): # Search SEARCH_INDEXER_CLASS = values.Value( - default="core.services.search_indexers.FindDocumentIndexer", + default=None, environ_name="SEARCH_INDEXER_CLASS", environ_prefix=None, ) @@ -957,11 +957,6 @@ class Test(Base): # Tests are raising warnings because the /data/static directory does not exist STATIC_ROOT = None - # Setup indexer configuration to make test working on the CI. - SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" # noqa - SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/" - SEARCH_INDEXER_QUERY_URL = "http://localhost:8081/api/v1.0/documents/search/" - CELERY_TASK_ALWAYS_EAGER = values.BooleanValue(True) def __init__(self): From 57a6f73d9b319ce17443b63dc06b3343a971a8c5 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Mon, 22 Sep 2025 16:05:39 +0200 Subject: [PATCH 11/22] =?UTF-8?q?=E2=9C=A8(backend)=20Index=20partially=20?= =?UTF-8?q?empty=20documents?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only documents without title and content are ignored by indexer. --- src/backend/core/management/commands/index.py | 9 +++-- src/backend/core/services/search_indexers.py | 8 ++++- .../tests/test_services_search_indexers.py | 36 ++++++++++++++++--- 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/src/backend/core/management/commands/index.py b/src/backend/core/management/commands/index.py index b7eab9505e..6a15faac6e 100644 --- a/src/backend/core/management/commands/index.py +++ b/src/backend/core/management/commands/index.py @@ -21,8 +21,11 @@ def handle(self, *args, **options): """Launch and log search index generation.""" logger.info("Starting to regenerate Find index...") start = time.perf_counter() - - FindDocumentIndexer().index() + count = FindDocumentIndexer().index() duration = time.perf_counter() - start - logger.info("Search index regenerated in %.2f seconds.", duration) + logger.info( + "Search index regenerated from %d document(s) in %.2f seconds.", + count, + duration, + ) diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index 255aa3bbba..69eb1f67ed 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -146,6 +146,8 @@ def index(self): Fetch documents in batches, serialize them, and push to the search backend. 
""" last_id = 0 + count = 0 + while True: documents_batch = list( models.Document.objects.filter( @@ -163,9 +165,13 @@ def index(self): serialized_batch = [ self.serialize_document(document, accesses_by_document_path) for document in documents_batch - if document.content + if document.content or document.title ] + self.push(serialized_batch) + count += len(serialized_batch) + + return count @abstractmethod def serialize_document(self, document, accesses): diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 63c2d305ea..14d47d9dc4 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -304,7 +304,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses( access = factories.UserDocumentAccessFactory(document=document) expected_user_subs[str(document.id)] = str(access.user.sub) - FindDocumentIndexer().index() + assert FindDocumentIndexer().index() == 5 # Should be 3 batches: 2 + 2 + 1 assert mock_push.call_count == 3 @@ -327,6 +327,34 @@ def test_services_search_indexers_batches_pass_only_batch_accesses( assert seen_doc_ids == {str(d.id) for d in documents} +@patch.object(FindDocumentIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +def test_services_search_indexers_ignore_empty_documents(mock_push): + """ + Documents indexing should be processed in batches, + and only the access data relevant to each batch should be used. + """ + document = factories.DocumentFactory() + factories.DocumentFactory(content="", title="") + empty_title = factories.DocumentFactory(title="") + empty_content = factories.DocumentFactory(content="") + + assert FindDocumentIndexer().index() == 3 + + assert mock_push.call_count == 1 + + # Make sure only not eempty documents are indexed + results = {doc["id"] for doc in mock_push.call_args[0][0]} + assert results == { + str(d.id) + for d in ( + document, + empty_content, + empty_title, + ) + } + + @patch.object(FindDocumentIndexer, "push") @pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ancestors_link_reach(mock_push): @@ -338,7 +366,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push): parent = factories.DocumentFactory(parent=grand_parent, link_reach="public") document = factories.DocumentFactory(parent=parent, link_reach="restricted") - FindDocumentIndexer().index() + assert FindDocumentIndexer().index() == 4 results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 4 @@ -358,7 +386,7 @@ def test_services_search_indexers_ancestors_users(mock_push): parent = factories.DocumentFactory(parent=grand_parent, users=[user_p]) document = factories.DocumentFactory(parent=parent, users=[user_d]) - FindDocumentIndexer().index() + assert FindDocumentIndexer().index() == 3 results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3 @@ -379,7 +407,7 @@ def test_services_search_indexers_ancestors_teams(mock_push): parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"]) document = factories.DocumentFactory(parent=parent, teams=["team_d"]) - FindDocumentIndexer().index() + assert FindDocumentIndexer().index() == 3 results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3 From ad92bf614db913df01492a8a3a042b878afc9b8f Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Wed, 24 Sep 2025 13:44:37 +0200 Subject: [PATCH 12/22] 
=?UTF-8?q?=E2=9C=A8(backend)=20Index=20deleted=20do?=
 =?UTF-8?q?cuments?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add SEARCH_INDEXER_COUNTDOWN as configurable setting.
Make the search backend creation simpler (only 'get_document_indexer' now).
Allow indexation of deleted documents.

Signed-off-by: Fabre Florian 
---
 src/backend/core/api/viewsets.py              | 27 +++--
 src/backend/core/management/commands/index.py | 15 ++-
 src/backend/core/services/search_indexers.py  | 35 ++----
 src/backend/core/signals.py                   |  6 +-
 src/backend/core/tasks/find.py                | 45 +++----
 src/backend/core/tests/commands/test_index.py | 14 ++-
 src/backend/core/tests/conftest.py            |  9 +-
 .../documents/test_api_documents_search.py    |  8 +-
 .../core/tests/test_models_documents.py       | 112 +++++++++++------
 .../tests/test_services_search_indexers.py    | 54 ++-------
 src/backend/impress/settings.py               |  3 +
 11 files changed, 175 insertions(+), 153 deletions(-)

diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py
index 0b383ffa4c..7132687c77 100644
--- a/src/backend/core/api/viewsets.py
+++ b/src/backend/core/api/viewsets.py
@@ -50,7 +50,7 @@
     YdocConverter,
 )
 from core.services.search_indexers import (
-    default_document_indexer,
+    get_document_indexer,
     get_visited_document_ids_of,
 )
 from core.tasks.mail import send_ask_for_access_mail
@@ -1076,7 +1076,14 @@ def duplicate(self, request, *args, **kwargs):
     def search(self, request, *args, **kwargs):
         """
         Returns a DRF response containing the filtered, annotated and ordered document list.
-        The filtering allows full text search through the opensearch indexation app "find".
+
+        Applies filtering based on request parameter 'q' from `FindDocumentSerializer`.
+        Depending on the configuration it can be:
+        - A fulltext search through the opensearch indexation app "find" if the backend is
+          enabled (see SEARCH_BACKEND_CLASS)
+        - A filtering by the model field 'title'.
+
+        The ordering is always by the most recent first.
         """
         access_token = request.session.get("oidc_access_token")
         user = request.user
@@ -1084,13 +1091,15 @@ def search(self, request, *args, **kwargs):
         serializer = serializers.FindDocumentSerializer(data=request.query_params)
         serializer.is_valid(raise_exception=True)
 
-        indexer = default_document_indexer()
+        indexer = get_document_indexer()
+        text = serializer.validated_data["q"]
 
+        # The indexer is not configured, so we fall back on a simple filter on the
+        # model field 'title'.
         if not indexer:
+            # As in the 'list' view, we get a prefiltered queryset (deleted docs are excluded)
             queryset = self.get_queryset()
-            filterset = DocumentFilter(
-                {"title": serializer.validated_data.get("q", "")}, queryset=queryset
-            )
+            filterset = DocumentFilter({"title": text}, queryset=queryset)
 
             if not filterset.is_valid():
                 raise drf.exceptions.ValidationError(filterset.errors)
@@ -1105,15 +1114,17 @@ def search(self, request, *args, **kwargs):
             )
 
         queryset = models.Document.objects.all()
+
+        # Retrieve the document ids from Find. 
results = indexer.search( - text=serializer.validated_data.get("q", ""), + text=text, token=access_token, visited=get_visited_document_ids_of(queryset, user), page=serializer.validated_data.get("page", 1), page_size=serializer.validated_data.get("page_size", 20), ) - queryset = queryset.filter(pk__in=results) + queryset = queryset.filter(pk__in=results).order_by("-updated_at") return self.get_response_for_queryset( queryset, diff --git a/src/backend/core/management/commands/index.py b/src/backend/core/management/commands/index.py index 6a15faac6e..7db787cf98 100644 --- a/src/backend/core/management/commands/index.py +++ b/src/backend/core/management/commands/index.py @@ -5,9 +5,9 @@ import logging import time -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandError -from ...services.search_indexers import FindDocumentIndexer +from core.services.search_indexers import get_document_indexer logger = logging.getLogger("docs.search.bootstrap_search") @@ -19,9 +19,18 @@ class Command(BaseCommand): def handle(self, *args, **options): """Launch and log search index generation.""" + indexer = get_document_indexer() + + if not indexer: + raise CommandError("The indexer is not enabled or properly configured.") + logger.info("Starting to regenerate Find index...") start = time.perf_counter() - count = FindDocumentIndexer().index() + + try: + count = indexer.index() + except Exception as err: + raise CommandError("Unable to regenerate index") from err duration = time.perf_counter() - start logger.info( diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index 69eb1f67ed..c5c26e4b71 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -19,37 +19,24 @@ @cache -def default_document_indexer(): - """Returns default indexer service is enabled and properly configured.""" +def get_document_indexer(): + """Returns an instance of indexer service if enabled and properly configured.""" + classpath = settings.SEARCH_INDEXER_CLASS # For this usecase an empty indexer class is not an issue but a feature. - if not getattr(settings, "SEARCH_INDEXER_CLASS", None): + if not classpath: logger.info("Document indexer is not configured (see SEARCH_INDEXER_CLASS)") return None try: - return get_document_indexer_class()() + indexer_class = import_string(settings.SEARCH_INDEXER_CLASS) + return indexer_class() + except ImportError as err: + logger.error("SEARCH_INDEXER_CLASS setting is not valid : %s", err) except ImproperlyConfigured as err: logger.error("Document indexer is not properly configured : %s", err) - return None - -@cache -def get_document_indexer_class(): - """Return the indexer backend class based on the settings.""" - classpath = settings.SEARCH_INDEXER_CLASS - - if not classpath: - raise ImproperlyConfigured( - "SEARCH_INDEXER_CLASS must be set in Django settings." 
- ) - - try: - return import_string(settings.SEARCH_INDEXER_CLASS) - except ImportError as err: - raise ImproperlyConfigured( - f"SEARCH_INDEXER_CLASS setting is not valid : {err}" - ) from err + return None def get_batch_accesses_by_users_and_teams(paths): @@ -100,9 +87,11 @@ def get_visited_document_ids_of(queryset, user): ancestors_deleted_at__isnull=True, ) .filter(pk__in=Subquery(qs.values("document_id"))) + .order_by("pk") + .distinct("pk") ) - return list({str(id) for id in docs.values_list("pk", flat=True)}) + return [str(id) for id in docs.values_list("pk", flat=True)] class BaseDocumentIndexer(ABC): diff --git a/src/backend/core/signals.py b/src/backend/core/signals.py index fbb9368a90..112a42f5bc 100644 --- a/src/backend/core/signals.py +++ b/src/backend/core/signals.py @@ -9,7 +9,6 @@ from django.dispatch import receiver from . import models -from .services.search_indexers import default_document_indexer from .tasks.find import trigger_document_indexer @@ -20,8 +19,7 @@ def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-ar Note : Within the transaction we can have an empty content and a serialization error. """ - if default_document_indexer() is not None: - transaction.on_commit(partial(trigger_document_indexer, instance)) + transaction.on_commit(partial(trigger_document_indexer, instance)) @receiver(signals.post_save, sender=models.DocumentAccess) @@ -29,5 +27,5 @@ def document_access_post_save(sender, instance, created, **kwargs): # pylint: d """ Asynchronous call to the document indexer at the end of the transaction. """ - if not created and default_document_indexer() is not None: + if not created: transaction.on_commit(partial(trigger_document_indexer, instance.document)) diff --git a/src/backend/core/tasks/find.py b/src/backend/core/tasks/find.py index f4d4149545..c9d36b157e 100644 --- a/src/backend/core/tasks/find.py +++ b/src/backend/core/tasks/find.py @@ -10,13 +10,10 @@ logger = getLogger(__file__) -def document_indexer_debounce_key(document_id): - """Returns debounce cache key""" - return f"doc-indexer-debounce-{document_id}" - - -def incr_counter(key): +def indexer_debounce_lock(document_id): """Increase or reset counter""" + key = f"doc-indexer-debounce-{document_id}" + try: return cache.incr(key) except ValueError: @@ -24,8 +21,10 @@ def incr_counter(key): return 1 -def decr_counter(key): +def indexer_debounce_release(document_id): """Decrease or reset counter""" + key = f"doc-indexer-debounce-{document_id}" + try: return cache.decr(key) except ValueError: @@ -36,24 +35,26 @@ def decr_counter(key): @app.task def document_indexer_task(document_id): """Celery Task : Sends indexation query for a document.""" - key = document_indexer_debounce_key(document_id) + # Prevents some circular imports + # pylint: disable=import-outside-toplevel + from core import models # noqa : PLC0415 + from core.services.search_indexers import ( # noqa : PLC0415 + get_batch_accesses_by_users_and_teams, + get_document_indexer, + ) # check if the counter : if still up, skip the task. only the last one # within the countdown delay will do the query. 
-    if decr_counter(key) > 0:
+    if indexer_debounce_release(document_id) > 0:
         logger.info("Skip document %s indexation", document_id)
         return
 
-    # Prevents some circular imports
-    # pylint: disable=import-outside-toplevel
-    from core import models  # noqa: PLC0415
-    from core.services.search_indexers import (  # noqa: PLC0415
-        get_batch_accesses_by_users_and_teams,
-        get_document_indexer_class,
-    )
+    indexer = get_document_indexer()
+
+    if indexer is None:
+        return
 
     doc = models.Document.objects.get(pk=document_id)
-    indexer = get_document_indexer_class()()
     accesses = get_batch_accesses_by_users_and_teams((doc.path,))
 
     data = indexer.serialize_document(document=doc, accesses=accesses)
@@ -69,11 +70,11 @@ def trigger_document_indexer(document):
     Args:
         document (Document): The document instance.
     """
-    if document.deleted_at or document.ancestors_deleted_at:
-        return
+    countdown = settings.SEARCH_INDEXER_COUNTDOWN
 
-    key = document_indexer_debounce_key(document.pk)
-    countdown = getattr(settings, "SEARCH_INDEXER_COUNTDOWN", 1)
+    # DO NOT create a task if indexation is disabled
+    if not settings.SEARCH_INDEXER_CLASS:
+        return
 
     logger.info(
         "Add task for document %s indexation in %.2f seconds",
@@ -83,6 +84,6 @@ def trigger_document_indexer(document):
 
     # Each time this method is called during the countdown, we increment the
     # counter and each task decreases it, so the index is run only once.
-    incr_counter(key)
+    indexer_debounce_lock(document.pk)
 
     document_indexer_task.apply_async(args=[document.pk], countdown=countdown)
diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py
index 169e0b830b..78d3024958 100644
--- a/src/backend/core/tests/commands/test_index.py
+++ b/src/backend/core/tests/commands/test_index.py
@@ -5,7 +5,7 @@
 from operator import itemgetter
 from unittest import mock
 
-from django.core.management import call_command
+from django.core.management import CommandError, call_command
 from django.db import transaction
 
 import pytest
@@ -51,3 +51,15 @@ def test_index():
         ],
         key=itemgetter("id"),
     )
+
+
+@pytest.mark.django_db
+@pytest.mark.usefixtures("indexer_settings")
+def test_index_improperly_configured(indexer_settings):
+    """The command should raise an exception if the indexer is not configured"""
+    indexer_settings.SEARCH_INDEXER_CLASS = None
+
+    with pytest.raises(CommandError) as err:
+        call_command("index")
+
+    assert str(err.value) == "The indexer is not enabled or properly configured." 
diff --git a/src/backend/core/tests/conftest.py b/src/backend/core/tests/conftest.py index 2102879b52..7c5a59b863 100644 --- a/src/backend/core/tests/conftest.py +++ b/src/backend/core/tests/conftest.py @@ -34,12 +34,10 @@ def indexer_settings_fixture(settings): # pylint: disable-next=import-outside-toplevel from core.services.search_indexers import ( # noqa: PLC0415 - default_document_indexer, - get_document_indexer_class, + get_document_indexer, ) - default_document_indexer.cache_clear() - get_document_indexer_class.cache_clear() + get_document_indexer.cache_clear() settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.FindDocumentIndexer" settings.SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" @@ -51,5 +49,4 @@ def indexer_settings_fixture(settings): yield settings # clear cache to prevent issues with other tests - default_document_indexer.cache_clear() - get_document_indexer_class.cache_clear() + get_document_indexer.cache_clear() diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py index 2c3b2d3523..0c46eb2b0f 100644 --- a/src/backend/core/tests/documents/test_api_documents_search.py +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -10,7 +10,7 @@ from rest_framework.test import APIClient from core import factories, models -from core.services.search_indexers import default_document_indexer +from core.services.search_indexers import get_document_indexer fake = Faker() pytestmark = pytest.mark.django_db @@ -54,7 +54,7 @@ def test_api_documents_search_endpoint_is_none(indexer_settings): """ indexer_settings.SEARCH_INDEXER_QUERY_URL = None - assert default_document_indexer() is None + assert get_document_indexer() is None user = factories.UserFactory() document = factories.DocumentFactory(title="alpha") @@ -130,7 +130,7 @@ def test_api_documents_search_format(indexer_settings): """Validate the format of documents as returned by the search view.""" indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" - assert default_document_indexer() is not None + assert get_document_indexer() is not None user = factories.UserFactory() @@ -193,7 +193,7 @@ def test_api_documents_search_pagination(indexer_settings): """Documents should be ordered by descending "updated_at" by default""" indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" - assert default_document_indexer() is not None + assert get_document_indexer() is not None user = factories.UserFactory() diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index e65c00877c..22c153245b 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -5,7 +5,6 @@ import random import smtplib -import time from logging import Logger from operator import itemgetter from unittest import mock @@ -23,7 +22,6 @@ from core import factories, models from core.services.search_indexers import FindDocumentIndexer -from core.tasks.find import document_indexer_debounce_key pytestmark = pytest.mark.django_db @@ -1458,6 +1456,48 @@ def test_models_documents_post_save_indexer(mock_push, indexer_settings): """Test indexation task on document creation""" indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 + with transaction.atomic(): + doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3) + + accesses = {} + data = [call.args[0] for call in mock_push.call_args_list] + + indexer = FindDocumentIndexer() + + 
assert sorted(data, key=itemgetter("id")) == sorted(
+        [
+            indexer.serialize_document(doc1, accesses),
+            indexer.serialize_document(doc2, accesses),
+            indexer.serialize_document(doc3, accesses),
+        ],
+        key=itemgetter("id"),
+    )
+
+    # The debounce counters should be reset
+    assert cache.get(f"doc-indexer-debounce-{doc1.pk}") == 0
+    assert cache.get(f"doc-indexer-debounce-{doc2.pk}") == 0
+    assert cache.get(f"doc-indexer-debounce-{doc3.pk}") == 0
+
+
+@mock.patch.object(FindDocumentIndexer, "push")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_settings):
+    """Task should not start an indexation when disabled"""
+    indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
+    indexer_settings.SEARCH_INDEXER_CLASS = None
+
+    with transaction.atomic():
+        factories.DocumentFactory()
+
+    assert mock_push.call_args_list == []
+
+
+@mock.patch.object(FindDocumentIndexer, "push")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_with_accesses(mock_push, indexer_settings):
+    """Test indexation task on document creation with user accesses"""
+    indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
+
     user = factories.UserFactory()
 
     with transaction.atomic():
@@ -1467,8 +1507,6 @@
         factories.UserDocumentAccessFactory(document=doc2, user=user)
         factories.UserDocumentAccessFactory(document=doc3, user=user)
 
-    time.sleep(0.2)  # waits for the end of the tasks
-
     accesses = {
         str(doc1.path): {"users": [user.sub]},
         str(doc2.path): {"users": [user.sub]},
@@ -1489,15 +1527,15 @@
     )
 
     # The debounce counters should be reset
-    assert cache.get(document_indexer_debounce_key(doc1.pk)) == 0
-    assert cache.get(document_indexer_debounce_key(doc2.pk)) == 0
-    assert cache.get(document_indexer_debounce_key(doc3.pk)) == 0
+    assert cache.get(f"doc-indexer-debounce-{doc1.pk}") == 0
+    assert cache.get(f"doc-indexer-debounce-{doc2.pk}") == 0
+    assert cache.get(f"doc-indexer-debounce-{doc3.pk}") == 0
 
 
 @mock.patch.object(FindDocumentIndexer, "push")
 @pytest.mark.django_db(transaction=True)
 def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings):
-    """Skip indexation task on deleted or ancestor_deleted documents"""
+    """Indexation task on deleted or ancestor_deleted documents"""
     indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
 
     user = factories.UserFactory()
@@ -1522,8 +1560,6 @@
     assert doc_ancestor_deleted.deleted_at is None
     assert doc_ancestor_deleted.ancestors_deleted_at is not None
 
-    time.sleep(0.2)  # waits for the end of the tasks
-
     accesses = {
         str(doc.path): {"users": [user.sub]},
         str(doc_deleted.path): {"users": [user.sub]},
@@ -1534,17 +1570,21 @@
 
     indexer = FindDocumentIndexer()
 
-    # Only the not deleted document is indexed
-    assert data == [
-        indexer.serialize_document(doc, accesses),
-    ]
+    # Even deleted documents are re-indexed; should we only update their status in the future? 
+ assert sorted(data, key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(doc, accesses), + indexer.serialize_document(doc_deleted, accesses), + indexer.serialize_document(doc_ancestor_deleted, accesses), + indexer.serialize_document(doc_deleted, accesses), # soft_delete() + ], + key=itemgetter("id"), + ) # The debounce counters should be reset - assert cache.get(document_indexer_debounce_key(doc.pk)) == 0 - - # These caches are not filled - assert cache.get(document_indexer_debounce_key(doc_deleted.pk)) is None - assert cache.get(document_indexer_debounce_key(doc_ancestor_deleted.pk)) is None + assert cache.get(f"doc-indexer-debounce-{doc.pk}") == 0 + assert cache.get(f"doc-indexer-debounce-{doc_deleted.pk}") == 0 + assert cache.get(f"doc-indexer-debounce-{doc_ancestor_deleted.pk}") == 0 @mock.patch.object(FindDocumentIndexer, "push") @@ -1575,20 +1615,16 @@ def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings assert doc_ancestor_deleted.deleted_at is None assert doc_ancestor_deleted.ancestors_deleted_at is not None - time.sleep(0.2) # waits for the end of the tasks - - doc_deleted.restore() - - doc_deleted.refresh_from_db() - doc_ancestor_deleted.refresh_from_db() + doc_restored = models.Document.objects.get(pk=doc_deleted.pk) + doc_restored.restore() - assert doc_deleted.deleted_at is None - assert doc_deleted.ancestors_deleted_at is None + doc_ancestor_restored = models.Document.objects.get(pk=doc_ancestor_deleted.pk) - assert doc_ancestor_deleted.deleted_at is None - assert doc_ancestor_deleted.ancestors_deleted_at is None + assert doc_restored.deleted_at is None + assert doc_restored.ancestors_deleted_at is None - time.sleep(0.2) + assert doc_ancestor_restored.deleted_at is None + assert doc_ancestor_restored.ancestors_deleted_at is None accesses = { str(doc.path): {"users": [user.sub]}, @@ -1605,7 +1641,9 @@ def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings [ indexer.serialize_document(doc, accesses), indexer.serialize_document(doc_deleted, accesses), - # The restored document child is not saved so no indexation. 
+ indexer.serialize_document(doc_deleted, accesses), # soft_delete() + indexer.serialize_document(doc_restored, accesses), # restore() + indexer.serialize_document(doc_ancestor_deleted, accesses), ], key=itemgetter("id"), ) @@ -1628,31 +1666,25 @@ def test_models_documents_post_save_indexer_debounce(indexer_settings): str(doc.path): {"users": [user.sub]}, } - time.sleep(0.1) # waits for the end of the tasks - with mock.patch.object(FindDocumentIndexer, "push") as mock_push: # Simulate 1 waiting task - cache.set(document_indexer_debounce_key(doc.pk), 1) + cache.set(f"doc-indexer-debounce-{doc.pk}", 1) # save doc to trigger the indexer, but nothing should be done since # the counter is over 0 with transaction.atomic(): doc.save() - time.sleep(0.1) - assert [call.args[0] for call in mock_push.call_args_list] == [] with mock.patch.object(FindDocumentIndexer, "push") as mock_push: # No waiting task - cache.set(document_indexer_debounce_key(doc.pk), 0) + cache.set(f"doc-indexer-debounce-{doc.pk}", 0) with transaction.atomic(): doc = models.Document.objects.get(pk=doc.pk) doc.save() - time.sleep(0.1) - assert [call.args[0] for call in mock_push.call_args_list] == [ indexer.serialize_document(doc, accesses), ] @@ -1681,8 +1713,6 @@ def test_models_documents_access_post_save_indexer(indexer_settings): with transaction.atomic(): doc_access.save() - time.sleep(0.1) - assert [call.args[0] for call in mock_push.call_args_list] == [ indexer.serialize_document(doc, accesses), ] diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 14d47d9dc4..15686b7744 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -16,8 +16,7 @@ from core.services.search_indexers import ( BaseDocumentIndexer, FindDocumentIndexer, - default_document_indexer, - get_document_indexer_class, + get_document_indexer, get_visited_document_ids_of, ) @@ -37,41 +36,13 @@ def search_query(self, data, token): return {} -def test_services_search_indexer_class_is_empty(indexer_settings): - """ - Should raise ImproperlyConfigured if SEARCH_INDEXER_CLASS is None or empty. - """ - indexer_settings.SEARCH_INDEXER_CLASS = None - - with pytest.raises(ImproperlyConfigured) as exc_info: - get_document_indexer_class() - - assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) - - indexer_settings.SEARCH_INDEXER_CLASS = "" - - # clear cache again - get_document_indexer_class.cache_clear() - - with pytest.raises(ImproperlyConfigured) as exc_info: - get_document_indexer_class() - - assert "SEARCH_INDEXER_CLASS must be set in Django settings." in str(exc_info.value) - - def test_services_search_indexer_class_invalid(indexer_settings): """ Should raise RuntimeError if SEARCH_INDEXER_CLASS cannot be imported. 
""" indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown" - with pytest.raises(ImproperlyConfigured) as exc_info: - get_document_indexer_class() - - assert ( - "SEARCH_INDEXER_CLASS setting is not valid : No module named 'unknown'" - in str(exc_info.value) - ) + assert get_document_indexer() is None def test_services_search_indexer_class(indexer_settings): @@ -82,8 +53,9 @@ def test_services_search_indexer_class(indexer_settings): "core.tests.test_services_search_indexers.FakeDocumentIndexer" ) - assert get_document_indexer_class() == import_string( - "core.tests.test_services_search_indexers.FakeDocumentIndexer" + assert isinstance( + get_document_indexer(), + import_string("core.tests.test_services_search_indexers.FakeDocumentIndexer"), ) @@ -95,28 +67,28 @@ def test_services_search_indexer_is_configured(indexer_settings): indexer_settings.SEARCH_INDEXER_CLASS = None # None - default_document_indexer.cache_clear() - assert not default_document_indexer() + get_document_indexer.cache_clear() + assert not get_document_indexer() # Empty indexer_settings.SEARCH_INDEXER_CLASS = "" - default_document_indexer.cache_clear() - assert not default_document_indexer() + get_document_indexer.cache_clear() + assert not get_document_indexer() # Valid class indexer_settings.SEARCH_INDEXER_CLASS = ( "core.services.search_indexers.FindDocumentIndexer" ) - default_document_indexer.cache_clear() - assert default_document_indexer() is not None + get_document_indexer.cache_clear() + assert get_document_indexer() is not None indexer_settings.SEARCH_INDEXER_URL = "" # Invalid url - default_document_indexer.cache_clear() - assert not default_document_indexer() + get_document_indexer.cache_clear() + assert not get_document_indexer() def test_services_search_indexer_url_is_none(indexer_settings): diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index 83a0e20a4f..fdc81c6ceb 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -111,6 +111,9 @@ class Base(Configuration): SEARCH_INDEXER_URL = values.Value( default=None, environ_name="SEARCH_INDEXER_URL", environ_prefix=None ) + SEARCH_INDEXER_COUNTDOWN = values.IntegerValue( + default=1, environ_name="SEARCH_INDEXER_COUNTDOWN", environ_prefix=None + ) SEARCH_INDEXER_SECRET = values.Value( default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None ) From f9c0a2b3e5350fed9519c5cc510d99bfe13e4cc0 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Wed, 1 Oct 2025 07:18:26 +0200 Subject: [PATCH 13/22] =?UTF-8?q?=F0=9F=94=A7(backend)=20tool=20for=20vali?= =?UTF-8?q?d=20fernet=20key=20used=20in=20OIDC=20token=20storage?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add bin/fernetkey that generates a key for the OIDC_STORE_REFRESH_TOKEN_KEY setting. 
Signed-off-by: Fabre Florian --- bin/fernetkey | 6 +++++ env.d/development/common | 9 ++++--- .../core/tests/test_models_documents.py | 26 ++++++++++++++----- 3 files changed, 32 insertions(+), 9 deletions(-) create mode 100755 bin/fernetkey diff --git a/bin/fernetkey b/bin/fernetkey new file mode 100755 index 0000000000..8bbac1096e --- /dev/null +++ b/bin/fernetkey @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +# shellcheck source=bin/_config.sh +source "$(dirname "${BASH_SOURCE[0]}")/_config.sh" + +_dc_run app-dev python -c 'from cryptography.fernet import Fernet;import sys; sys.stdout.write("\n" + Fernet.generate_key().decode() + "\n");' diff --git a/env.d/development/common b/env.d/development/common index eb5b0c5499..839bc246ee 100644 --- a/env.d/development/common +++ b/env.d/development/common @@ -50,9 +50,12 @@ OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"] OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"} # Store OIDC tokens in the session -OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session -OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session -OIDC_STORE_REFRESH_TOKEN_KEY = ThisIsAnExampleKeyForDevPurposeOnly +OIDC_STORE_ACCESS_TOKEN = True +OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session. + +# Must be a valid Fernet key (32 url-safe base64-encoded bytes) +# To create one, use the bin/fernetkey command. +# OIDC_STORE_REFRESH_TOKEN_KEY="your-32-byte-encryption-key==" # AI AI_FEATURE_ENABLED=true diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index 22c153245b..b808c6c16d 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -1541,9 +1541,16 @@ def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings) user = factories.UserFactory() with transaction.atomic(): - doc = factories.DocumentFactory() - doc_deleted = factories.DocumentFactory() - doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted) + doc = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + doc_deleted = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + doc_ancestor_deleted = factories.DocumentFactory( + parent=doc_deleted, + link_reach=models.LinkReachChoices.AUTHENTICATED, + ) doc_deleted.soft_delete() doc_ancestor_deleted.ancestors_deleted_at = doc_deleted.deleted_at @@ -1596,9 +1603,16 @@ def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings user = factories.UserFactory() with transaction.atomic(): - doc = factories.DocumentFactory() - doc_deleted = factories.DocumentFactory() - doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted) + doc = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + doc_deleted = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + doc_ancestor_deleted = factories.DocumentFactory( + parent=doc_deleted, + link_reach=models.LinkReachChoices.AUTHENTICATED, + ) doc_deleted.soft_delete() doc_ancestor_deleted.ancestors_deleted_at = doc_deleted.deleted_at From bc895512b4fcc6acd4127ddf2c8b16d2d6841f39 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Mon, 6 Oct 2025 16:03:14 +0200 Subject: [PATCH 14/22] =?UTF-8?q?=F0=9F=94=A7(backend)=20setup=20Docs=20ap?= =?UTF-8?q?p=20dockers=20to=20work=20with=20Find?= MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit

Add nginx with 'nginx' alias to the 'lasuite-net' network (keycloak calls)
Add celery-dev to the 'lasuite-net' network (Find API calls in jobs)
Set app-dev alias as 'impress' in the 'lasuite-net' network
Add indexer configuration in common settings

Signed-off-by: Fabre Florian 
---
 compose.yml              | 14 ++++++++++++--
 env.d/development/common |  8 +++++++-
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/compose.yml b/compose.yml
index a24a3974cf..1873d604dc 100644
--- a/compose.yml
+++ b/compose.yml
@@ -73,8 +73,10 @@ services:
     ports:
       - "8071:8000"
     networks:
-      - default
-      - lasuite-net
+      default: {}
+      lasuite-net:
+        aliases:
+          - impress
     volumes:
       - ./src/backend:/app
       - ./data/static:/data/static
@@ -95,6 +97,9 @@ services:
     command: ["celery", "-A", "impress.celery_app", "worker", "-l", "DEBUG"]
     environment:
       - DJANGO_CONFIGURATION=Development
+    networks:
+      - default
+      - lasuite-net
     env_file:
       - env.d/development/common
       - env.d/development/common.local
@@ -110,6 +115,11 @@ services:
     image: nginx:1.25
     ports:
       - "8083:8083"
+    networks:
+      default: {}
+      lasuite-net:
+        aliases:
+          - nginx
     volumes:
       - ./docker/files/etc/nginx/conf.d:/etc/nginx/conf.d:ro
     depends_on:
diff --git a/env.d/development/common b/env.d/development/common
index 839bc246ee..7ad7589fb0 100644
--- a/env.d/development/common
+++ b/env.d/development/common
@@ -76,4 +76,10 @@
 Y_PROVIDER_API_BASE_URL=http://y-provider-development:4444/api/
 Y_PROVIDER_API_KEY=yprovider-api-key
 
 # Theme customization
-THEME_CUSTOMIZATION_CACHE_TIMEOUT=15
\ No newline at end of file
+THEME_CUSTOMIZATION_CACHE_TIMEOUT=15
+
+# Indexer
+SEARCH_INDEXER_CLASS="core.services.search_indexers.FindDocumentIndexer"
+SEARCH_INDEXER_SECRET=find-api-key-for-docs-with-exactly-50-chars-length # Key generated by create_demo in Find app.
+SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"
+SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"

From 33f1555377b348b19fb65a331e1c82dde7274144 Mon Sep 17 00:00:00 2001
From: Fabre Florian 
Date: Tue, 7 Oct 2025 20:54:21 +0200
Subject: [PATCH 15/22] =?UTF-8?q?=E2=9C=A8(backend)=20some=20refactor=20of?=
 =?UTF-8?q?=20indexer=20classes=20&=20modules?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rename FindDocumentIndexer as SearchIndexer
Rename FindDocumentSerializer as SearchDocumentSerializer
Rename package core.tasks.find as core.tasks.search
Remove logs on http errors in SearchIndexer
Factorise some code in search API view. 
Signed-off-by: Fabre Florian 
---
 env.d/development/common                      |  2 +-
 src/backend/core/api/serializers.py           |  4 +-
 src/backend/core/api/viewsets.py              | 81 ++++++++++---------
 src/backend/core/services/search_indexers.py  | 40 ++++-----
 src/backend/core/signals.py                   |  2 +-
 src/backend/core/tasks/{find.py => search.py} |  0
 src/backend/core/tests/commands/test_index.py |  6 +-
 src/backend/core/tests/conftest.py            |  2 +-
 .../core/tests/test_models_documents.py       | 36 ++++-----
 .../tests/test_services_search_indexers.py    | 50 ++++++------
 10 files changed, 112 insertions(+), 111 deletions(-)
 rename src/backend/core/tasks/{find.py => search.py} (100%)

diff --git a/env.d/development/common b/env.d/development/common
index 7ad7589fb0..a6bfb65b51 100644
--- a/env.d/development/common
+++ b/env.d/development/common
@@ -79,7 +79,7 @@
 THEME_CUSTOMIZATION_CACHE_TIMEOUT=15
 
 # Indexer
-SEARCH_INDEXER_CLASS="core.services.search_indexers.FindDocumentIndexer"
+SEARCH_INDEXER_CLASS="core.services.search_indexers.SearchIndexer"
 SEARCH_INDEXER_SECRET=find-api-key-for-docs-with-exactly-50-chars-length # Key generated by create_demo in Find app.
 SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"
 SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"
diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py
index ca6b444dee..45ec34c68e 100644
--- a/src/backend/core/api/serializers.py
+++ b/src/backend/core/api/serializers.py
@@ -891,8 +891,8 @@ class MoveDocumentSerializer(serializers.Serializer):
     )
 
 
-class FindDocumentSerializer(serializers.Serializer):
-    """Serializer for Find search requests"""
+class SearchDocumentSerializer(serializers.Serializer):
+    """Serializer for fulltext search requests through Find application"""
 
     q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
     page_size = serializers.IntegerField(
diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py
index 7132687c77..c232fd14c2 100644
--- a/src/backend/core/api/viewsets.py
+++ b/src/backend/core/api/viewsets.py
@@ -1071,16 +1071,49 @@ def duplicate(self, request, *args, **kwargs):
             {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED
         )
 
+    def _simple_search_queryset(self, params):
+        """
+        Returns a queryset filtered on the document title
+        """
+        text = params.validated_data["q"]
+
+        # As in the 'list' view, we get a prefiltered queryset (deleted docs are excluded)
+        queryset = self.get_queryset()
+        filterset = DocumentFilter({"title": text}, queryset=queryset)
+
+        if not filterset.is_valid():
+            raise drf.exceptions.ValidationError(filterset.errors)
+
+        return filterset.filter_queryset(queryset)
+
+    def _fulltext_search_queryset(self, indexer, token, user, params):
+        """
+        Returns a queryset built from the results of the fulltext search in Find
+        """
+        text = params.validated_data["q"]
+        queryset = models.Document.objects.all()
+
+        # Retrieve the document ids from Find.
+        results = indexer.search(
+            text=text,
+            token=token,
+            visited=get_visited_document_ids_of(queryset, user),
+            page=params.validated_data.get("page", 1),
+            page_size=params.validated_data.get("page_size", 20),
+        )
+
+        return queryset.filter(pk__in=results)
+
     @drf.decorators.action(detail=False, methods=["get"], url_path="search")
     @method_decorator(refresh_oidc_access_token)
     def search(self, request, *args, **kwargs):
         """
         Returns a DRF response containing the filtered, annotated and ordered document list. 
- Applies filtering based on request parameter 'q' from `FindDocumentSerializer`. + Applies filtering based on request parameter 'q' from `SearchDocumentSerializer`. Depending on the configuration it can be: - A fulltext search through the opensearch indexation app "find" if the backend is - enabled (see SEARCH_BACKEND_CLASS) + enabled (see SEARCH_INDEXER_CLASS) - A filter on the model field 'title'. The ordering is always by the most recent first. @@ -1088,46 +1121,22 @@ def search(self, request, *args, **kwargs): access_token = request.session.get("oidc_access_token") user = request.user - serializer = serializers.FindDocumentSerializer(data=request.query_params) - serializer.is_valid(raise_exception=True) + params = serializers.SearchDocumentSerializer(data=request.query_params) + params.is_valid(raise_exception=True) indexer = get_document_indexer() - text = serializer.validated_data["q"] - - # The indexer is not configured, so we fallback on a simple filter on the - # model field 'title'. - if not indexer: - # As the 'list' view we get a prefiltered queryset (deleted docs are excluded) - queryset = self.get_queryset() - filterset = DocumentFilter({"title": text}, queryset=queryset) - - if not filterset.is_valid(): - raise drf.exceptions.ValidationError(filterset.errors) - queryset = filterset.filter_queryset(queryset).order_by("-updated_at") - - return self.get_response_for_queryset( - queryset, - context={ - "request": request, - }, + if indexer: + queryset = self._fulltext_search_queryset( + indexer, token=access_token, user=user, params=params ) - - queryset = models.Document.objects.all() - - # Retrieve the documents ids from Find. - results = indexer.search( - text=text, - token=access_token, - visited=get_visited_document_ids_of(queryset, user), - page=serializer.validated_data.get("page", 1), - page_size=serializer.validated_data.get("page_size", 20), - ) - - queryset = queryset.filter(pk__in=results).order_by("-updated_at") + else: + # The indexer is not configured, we fall back on a simple icontains filter on the + # model field 'title'. + queryset = self._simple_search_queryset(params) return self.get_response_for_queryset( - queryset, + queryset.order_by("-updated_at"), context={ "request": request, }, diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index c5c26e4b71..abaa4d454c 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -223,7 +223,7 @@ def search_query(self, data, token) -> dict: """ -class FindDocumentIndexer(BaseDocumentIndexer): +class SearchIndexer(BaseDocumentIndexer): """ Document indexer that pushes documents to La Suite Find app. """ @@ -270,18 +270,14 @@ def search_query(self, data, token) -> requests.Response: Returns: dict: A JSON-serializable dictionary. """ - try: - response = requests.post( - self.search_url, - json=data, - headers={"Authorization": f"Bearer {token}"}, - timeout=10, - ) - response.raise_for_status() - return response.json() - except requests.exceptions.HTTPError as e: - logger.error("HTTPError: %s", e) - raise + response = requests.post( + self.search_url, + json=data, + headers={"Authorization": f"Bearer {token}"}, + timeout=10, + ) + response.raise_for_status() + return response.json() def push(self, data): """ Send documents to the Find indexer endpoint. Args: data (list): List of document dictionaries.
""" - try: - response = requests.post( - self.indexer_url, - json=data, - headers={"Authorization": f"Bearer {self.indexer_secret}"}, - timeout=10, - ) - response.raise_for_status() - except requests.exceptions.HTTPError as e: - logger.error("HTTPError: %s", e) - raise + response = requests.post( + self.indexer_url, + json=data, + headers={"Authorization": f"Bearer {self.indexer_secret}"}, + timeout=10, + ) + response.raise_for_status() diff --git a/src/backend/core/signals.py b/src/backend/core/signals.py index 112a42f5bc..1aa95f44be 100644 --- a/src/backend/core/signals.py +++ b/src/backend/core/signals.py @@ -9,7 +9,7 @@ from django.dispatch import receiver from . import models -from .tasks.find import trigger_document_indexer +from .tasks.search import trigger_document_indexer @receiver(signals.post_save, sender=models.Document) diff --git a/src/backend/core/tasks/find.py b/src/backend/core/tasks/search.py similarity index 100% rename from src/backend/core/tasks/find.py rename to src/backend/core/tasks/search.py diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py index 78d3024958..ad7d39e6e0 100644 --- a/src/backend/core/tests/commands/test_index.py +++ b/src/backend/core/tests/commands/test_index.py @@ -11,7 +11,7 @@ import pytest from core import factories -from core.services.search_indexers import FindDocumentIndexer +from core.services.search_indexers import SearchIndexer @pytest.mark.django_db @@ -19,7 +19,7 @@ def test_index(): """Test the command `index` that run the Find app indexer for all the available documents.""" user = factories.UserFactory() - indexer = FindDocumentIndexer() + indexer = SearchIndexer() with transaction.atomic(): doc = factories.DocumentFactory() @@ -36,7 +36,7 @@ def test_index(): str(no_title_doc.path): {"users": [user.sub]}, } - with mock.patch.object(FindDocumentIndexer, "push") as mock_push: + with mock.patch.object(SearchIndexer, "push") as mock_push: call_command("index") push_call_args = [call.args[0] for call in mock_push.call_args_list] diff --git a/src/backend/core/tests/conftest.py b/src/backend/core/tests/conftest.py index 7c5a59b863..ba607b4228 100644 --- a/src/backend/core/tests/conftest.py +++ b/src/backend/core/tests/conftest.py @@ -39,7 +39,7 @@ def indexer_settings_fixture(settings): get_document_indexer.cache_clear() - settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.FindDocumentIndexer" + settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.SearchIndexer" settings.SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" settings.SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/" settings.SEARCH_INDEXER_QUERY_URL = ( diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index b808c6c16d..46309bd357 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -21,7 +21,7 @@ import pytest from core import factories, models -from core.services.search_indexers import FindDocumentIndexer +from core.services.search_indexers import SearchIndexer pytestmark = pytest.mark.django_db @@ -1450,7 +1450,7 @@ def test_models_documents_compute_ancestors_links_paths_mapping_structure( } -@mock.patch.object(FindDocumentIndexer, "push") +@mock.patch.object(SearchIndexer, "push") @pytest.mark.django_db(transaction=True) def test_models_documents_post_save_indexer(mock_push, indexer_settings): """Test indexation task on document creation""" @@ -1462,7 +1462,7 
@@ def test_models_documents_post_save_indexer(mock_push, indexer_settings): accesses = {} data = [call.args[0] for call in mock_push.call_args_list] - indexer = FindDocumentIndexer() + indexer = SearchIndexer() assert sorted(data, key=itemgetter("id")) == sorted( [ @@ -1479,7 +1479,7 @@ def test_models_documents_post_save_indexer(mock_push, indexer_settings): assert cache.get(f"doc-indexer-debounce-{doc3.pk}") == 0 -@mock.patch.object(FindDocumentIndexer, "push") +@mock.patch.object(SearchIndexer, "push") @pytest.mark.django_db(transaction=True) def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_settings): """Task should not start an indexation when disabled""" @@ -1492,7 +1492,7 @@ def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_se assert mock_push.call_args_list == [] -@mock.patch.object(FindDocumentIndexer, "push") +@mock.patch.object(SearchIndexer, "push") @pytest.mark.django_db(transaction=True) def test_models_documents_post_save_indexer_with_accesses(mock_push, indexer_settings): """Test indexation task on document creation""" @@ -1515,7 +1515,7 @@ def test_models_documents_post_save_indexer_with_accesses(mock_push, indexer_set data = [call.args[0] for call in mock_push.call_args_list] - indexer = FindDocumentIndexer() + indexer = SearchIndexer() assert sorted(data, key=itemgetter("id")) == sorted( [ @@ -1532,7 +1532,7 @@ def test_models_documents_post_save_indexer_with_accesses(mock_push, indexer_set assert cache.get(f"doc-indexer-debounce-{doc3.pk}") == 0 -@mock.patch.object(FindDocumentIndexer, "push") +@mock.patch.object(SearchIndexer, "push") @pytest.mark.django_db(transaction=True) def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings): """Indexation task on deleted or ancestor_deleted documents""" @@ -1575,7 +1575,7 @@ def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings) data = [call.args[0] for call in mock_push.call_args_list] - indexer = FindDocumentIndexer() + indexer = SearchIndexer() # Even deleted document are re-indexed : only update their status in the future ? 
assert sorted(data, key=itemgetter("id")) == sorted( @@ -1594,7 +1594,7 @@ def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings) assert cache.get(f"doc-indexer-debounce-{doc_ancestor_deleted.pk}") == 0 -@mock.patch.object(FindDocumentIndexer, "push") +@mock.patch.object(SearchIndexer, "push") @pytest.mark.django_db(transaction=True) def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings): """Restart indexation task on restored documents""" @@ -1648,7 +1648,7 @@ def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings data = [call.args[0] for call in mock_push.call_args_list] - indexer = FindDocumentIndexer() + indexer = SearchIndexer() # All docs are re-indexed assert sorted(data, key=itemgetter("id")) == sorted( @@ -1668,10 +1668,10 @@ def test_models_documents_post_save_indexer_debounce(indexer_settings): """Test indexation task skipping on document update""" indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - indexer = FindDocumentIndexer() + indexer = SearchIndexer() user = factories.UserFactory() - with mock.patch.object(FindDocumentIndexer, "push"): + with mock.patch.object(SearchIndexer, "push"): with transaction.atomic(): doc = factories.DocumentFactory() factories.UserDocumentAccessFactory(document=doc, user=user) @@ -1680,7 +1680,7 @@ def test_models_documents_post_save_indexer_debounce(indexer_settings): str(doc.path): {"users": [user.sub]}, } - with mock.patch.object(FindDocumentIndexer, "push") as mock_push: + with mock.patch.object(SearchIndexer, "push") as mock_push: # Simulate 1 waiting task cache.set(f"doc-indexer-debounce-{doc.pk}", 1) @@ -1691,7 +1691,7 @@ def test_models_documents_post_save_indexer_debounce(indexer_settings): assert [call.args[0] for call in mock_push.call_args_list] == [] - with mock.patch.object(FindDocumentIndexer, "push") as mock_push: + with mock.patch.object(SearchIndexer, "push") as mock_push: # No waiting task cache.set(f"doc-indexer-debounce-{doc.pk}", 0) @@ -1709,10 +1709,10 @@ def test_models_documents_access_post_save_indexer(indexer_settings): """Test indexation task on DocumentAccess update""" indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - indexer = FindDocumentIndexer() + indexer = SearchIndexer() user = factories.UserFactory() - with mock.patch.object(FindDocumentIndexer, "push"): + with mock.patch.object(SearchIndexer, "push"): with transaction.atomic(): doc = factories.DocumentFactory() doc_access = factories.UserDocumentAccessFactory(document=doc, user=user) @@ -1721,9 +1721,9 @@ def test_models_documents_access_post_save_indexer(indexer_settings): str(doc.path): {"users": [user.sub]}, } - indexer = FindDocumentIndexer() + indexer = SearchIndexer() - with mock.patch.object(FindDocumentIndexer, "push") as mock_push: + with mock.patch.object(SearchIndexer, "push") as mock_push: with transaction.atomic(): doc_access.save() diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 15686b7744..a65f92ca06 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -15,7 +15,7 @@ from core import factories, models, utils from core.services.search_indexers import ( BaseDocumentIndexer, - FindDocumentIndexer, + SearchIndexer, get_document_indexer, get_visited_document_ids_of, ) @@ -78,7 +78,7 @@ def test_services_search_indexer_is_configured(indexer_settings): # Valid class indexer_settings.SEARCH_INDEXER_CLASS = ( - 
"core.services.search_indexers.FindDocumentIndexer" + "core.services.search_indexers.SearchIndexer" ) get_document_indexer.cache_clear() @@ -98,7 +98,7 @@ def test_services_search_indexer_url_is_none(indexer_settings): indexer_settings.SEARCH_INDEXER_URL = None with pytest.raises(ImproperlyConfigured) as exc_info: - FindDocumentIndexer() + SearchIndexer() assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) @@ -110,7 +110,7 @@ def test_services_search_indexer_url_is_empty(indexer_settings): indexer_settings.SEARCH_INDEXER_URL = "" with pytest.raises(ImproperlyConfigured) as exc_info: - FindDocumentIndexer() + SearchIndexer() assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value) @@ -122,7 +122,7 @@ def test_services_search_indexer_secret_is_none(indexer_settings): indexer_settings.SEARCH_INDEXER_SECRET = None with pytest.raises(ImproperlyConfigured) as exc_info: - FindDocumentIndexer() + SearchIndexer() assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str( exc_info.value @@ -136,7 +136,7 @@ def test_services_search_indexer_secret_is_empty(indexer_settings): indexer_settings.SEARCH_INDEXER_SECRET = "" with pytest.raises(ImproperlyConfigured) as exc_info: - FindDocumentIndexer() + SearchIndexer() assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str( exc_info.value @@ -150,7 +150,7 @@ def test_services_search_endpoint_is_none(indexer_settings): indexer_settings.SEARCH_INDEXER_QUERY_URL = None with pytest.raises(ImproperlyConfigured) as exc_info: - FindDocumentIndexer() + SearchIndexer() assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str( exc_info.value @@ -164,7 +164,7 @@ def test_services_search_endpoint_is_empty(indexer_settings): indexer_settings.SEARCH_INDEXER_QUERY_URL = "" with pytest.raises(ImproperlyConfigured) as exc_info: - FindDocumentIndexer() + SearchIndexer() assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." 
in str( exc_info.value @@ -192,7 +192,7 @@ def test_services_search_indexers_serialize_document_returns_expected_json(): } } - indexer = FindDocumentIndexer() + indexer = SearchIndexer() result = indexer.serialize_document(document, accesses) assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)} @@ -221,7 +221,7 @@ def test_services_search_indexers_serialize_document_deleted(): parent.soft_delete() document.refresh_from_db() - indexer = FindDocumentIndexer() + indexer = SearchIndexer() result = indexer.serialize_document(document, {}) assert result["is_active"] is False @@ -232,7 +232,7 @@ def test_services_search_indexers_serialize_document_empty(): """Empty documents returns empty content in the serialized json.""" document = factories.DocumentFactory(content="", title=None) - indexer = FindDocumentIndexer() + indexer = SearchIndexer() result = indexer.serialize_document(document, {}) assert result["content"] == "" @@ -256,10 +256,10 @@ def test_services_search_indexers_index_errors(indexer_settings): ) with pytest.raises(HTTPError): - FindDocumentIndexer().index() + SearchIndexer().index() -@patch.object(FindDocumentIndexer, "push") +@patch.object(SearchIndexer, "push") def test_services_search_indexers_batches_pass_only_batch_accesses( mock_push, indexer_settings ): @@ -276,7 +276,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses( access = factories.UserDocumentAccessFactory(document=document) expected_user_subs[str(document.id)] = str(access.user.sub) - assert FindDocumentIndexer().index() == 5 + assert SearchIndexer().index() == 5 # Should be 3 batches: 2 + 2 + 1 assert mock_push.call_count == 3 @@ -299,7 +299,7 @@ def test_services_search_indexers_batches_pass_only_batch_accesses( assert seen_doc_ids == {str(d.id) for d in documents} -@patch.object(FindDocumentIndexer, "push") +@patch.object(SearchIndexer, "push") @pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ignore_empty_documents(mock_push): """ @@ -311,7 +311,7 @@ def test_services_search_indexers_ignore_empty_documents(mock_push): empty_title = factories.DocumentFactory(title="") empty_content = factories.DocumentFactory(content="") - assert FindDocumentIndexer().index() == 3 + assert SearchIndexer().index() == 3 assert mock_push.call_count == 1 @@ -327,7 +327,7 @@ def test_services_search_indexers_ignore_empty_documents(mock_push): } -@patch.object(FindDocumentIndexer, "push") +@patch.object(SearchIndexer, "push") @pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ancestors_link_reach(mock_push): """Document accesses and reach should take into account ancestors link reaches.""" @@ -338,7 +338,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push): parent = factories.DocumentFactory(parent=grand_parent, link_reach="public") document = factories.DocumentFactory(parent=parent, link_reach="restricted") - assert FindDocumentIndexer().index() == 4 + assert SearchIndexer().index() == 4 results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 4 @@ -348,7 +348,7 @@ def test_services_search_indexers_ancestors_link_reach(mock_push): assert results[str(document.id)]["reach"] == "public" -@patch.object(FindDocumentIndexer, "push") +@patch.object(SearchIndexer, "push") @pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ancestors_users(mock_push): """Document accesses and reach should include users from ancestors.""" @@ -358,7 +358,7 @@ def 
test_services_search_indexers_ancestors_users(mock_push): parent = factories.DocumentFactory(parent=grand_parent, users=[user_p]) document = factories.DocumentFactory(parent=parent, users=[user_d]) - assert FindDocumentIndexer().index() == 3 + assert SearchIndexer().index() == 3 results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3 @@ -371,7 +371,7 @@ def test_services_search_indexers_ancestors_users(mock_push): } -@patch.object(FindDocumentIndexer, "push") +@patch.object(SearchIndexer, "push") @pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ancestors_teams(mock_push): """Document accesses and reach should include teams from ancestors.""" grand_parent = factories.DocumentFactory(teams=["team_gp"]) parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"]) document = factories.DocumentFactory(parent=parent, teams=["team_d"]) - assert FindDocumentIndexer().index() == 3 + assert SearchIndexer().index() == 3 results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} assert len(results) == 3 @@ -396,7 +396,7 @@ def test_push_uses_correct_url_and_data(mock_post, indexer_settings): """ indexer_settings.SEARCH_INDEXER_URL = "http://example.com/index" - indexer = FindDocumentIndexer() + indexer = SearchIndexer() sample_data = [{"id": "123", "title": "Test"}] mock_response = mock_post.return_value @@ -497,7 +497,7 @@ def test_services_search_indexers_search_errors(indexer_settings): ) with pytest.raises(HTTPError): - FindDocumentIndexer().search("alpha", token="mytoken") + SearchIndexer().search("alpha", token="mytoken") @patch("requests.post") @@ -507,7 +507,7 @@ def test_services_search_indexers_search(mock_post, indexer_settings): """ document ids from linktraces. """ user = factories.UserFactory() - indexer = FindDocumentIndexer() + indexer = SearchIndexer() mock_response = mock_post.return_value mock_response.raise_for_status.return_value = None # No error From 337129cb484b50427816fa15aa5cfdf97d9fe410 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Tue, 14 Oct 2025 11:30:01 +0200 Subject: [PATCH 16/22] =?UTF-8?q?=E2=9C=A8(backend)=20throttle=20indexatio?= =?UTF-8?q?n=20tasks=20instead=20of=20debounce=20(simpler)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the indexer_debounce_lock|release functions with indexer_throttle_acquire() Instead of a mutex-like mechanism, simply set a flag in the cache for an amount of time, which prevents any other task creation.
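The whole throttle rests on one cache primitive: `cache.add()` stores the key only when it is absent and reports whether it did, so the first caller within the window wins and every later caller becomes a no-op. A self-contained sketch of these semantics (the key format matches the one asserted in the tests):

```python
# Sketch of the throttle primitive described above: cache.add() writes the key
# only if it does not exist yet and returns True in that case, so at most one
# caller per `window` seconds gets to schedule the indexation task.
from django.core.cache import cache


def throttle_acquire(key: str, window: int) -> bool:
    """Return True for the first call within `window` seconds, False otherwise."""
    return cache.add(key, 1, timeout=window)


# Usage: schedule at most one indexation per document and per window.
# if throttle_acquire(f"doc-indexer-throttle-{document.pk}", window=countdown):
#     document_indexer_task.apply_async(args=[document.pk])
```

The Redis lock taken in the patch is an extra guard around this check; on cache backends where `add()` is already atomic it is purely defensive.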
Signed-off-by: Fabre Florian --- src/backend/core/tasks/search.py | 81 +++++++++--------- .../documents/test_api_documents_search.py | 2 + .../core/tests/test_models_documents.py | 84 ++++++++++++++----- 3 files changed, 105 insertions(+), 62 deletions(-) diff --git a/src/backend/core/tasks/search.py b/src/backend/core/tasks/search.py index c9d36b157e..9f3a8cf1a3 100644 --- a/src/backend/core/tasks/search.py +++ b/src/backend/core/tasks/search.py @@ -5,56 +5,50 @@ from django.conf import settings from django.core.cache import cache -from impress.celery_app import app +from django_redis.cache import RedisCache -logger = getLogger(__file__) +from core import models +from core.services.search_indexers import ( + get_batch_accesses_by_users_and_teams, + get_document_indexer, +) +from impress.celery_app import app -def indexer_debounce_lock(document_id): - """Increase or reset counter""" - key = f"doc-indexer-debounce-{document_id}" +logger = getLogger(__file__) - try: - return cache.incr(key) - except ValueError: - cache.set(key, 1) - return 1 +def indexer_throttle_acquire(document_id, timeout=0, atomic=True): + """ + Enable the task throttle flag for a delay. + Uses redis locks if available to ensure atomic changes + """ + key = f"doc-indexer-throttle-{document_id}" -def indexer_debounce_release(document_id): - """Decrease or reset counter""" - key = f"doc-indexer-debounce-{document_id}" + if isinstance(cache, RedisCache) and atomic: + with cache.locks(key): + return indexer_throttle_acquire(document_id, timeout, atomic=False) - try: - return cache.decr(key) - except ValueError: - cache.set(key, 0) - return 0 + # Use add() here : + # - set the flag and returns true if not exist + # - do nothing and return false if exist + return cache.add(key, 1, timeout=timeout) @app.task def document_indexer_task(document_id): """Celery Task : Sends indexation query for a document.""" - # Prevents some circular imports - # pylint: disable=import-outside-toplevel - from core import models # noqa : PLC0415 - from core.services.search_indexers import ( # noqa : PLC0415 - get_batch_accesses_by_users_and_teams, - get_document_indexer, - ) - - # check if the counter : if still up, skip the task. only the last one - # within the countdown delay will do the query. - if indexer_debounce_release(document_id) > 0: - logger.info("Skip document %s indexation", document_id) - return - indexer = get_document_indexer() if indexer is None: return - doc = models.Document.objects.get(pk=document_id) + try: + doc = models.Document.objects.get(pk=document_id) + except models.Document.DoesNotExist: + # Skip the task if the document does not exist. + return + accesses = get_batch_accesses_by_users_and_teams((doc.path,)) data = indexer.serialize_document(document=doc, accesses=accesses) @@ -76,14 +70,15 @@ def trigger_document_indexer(document): if not settings.SEARCH_INDEXER_CLASS: return - logger.info( - "Add task for document %s indexation in %.2f seconds", - document.pk, - countdown, - ) - - # Each time this method is called during the countdown, we increment the + # Each time this method is called during a countdown, we increment the # counter and each task decrease it, so the index be run only once. 
- indexer_debounce_lock(document.pk) - - document_indexer_task.apply_async(args=[document.pk], countdown=countdown) + if indexer_throttle_acquire(document.pk, timeout=countdown): + logger.info( + "Add task for document %s indexation in %.2f seconds", + document.pk, + countdown, + ) + + document_indexer_task.apply_async(args=[document.pk]) + else: + logger.info("Skip task for document %s indexation", document.pk) diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py index 0c46eb2b0f..869a8d5666 100644 --- a/src/backend/core/tests/documents/test_api_documents_search.py +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -93,6 +93,7 @@ def test_api_documents_search_endpoint_is_none(indexer_settings): "path": document.path, "title": document.title, "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"), + "deleted_at": None, "user_role": access.role, } @@ -184,6 +185,7 @@ def test_api_documents_search_format(indexer_settings): "path": document.path, "title": document.title, "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"), + "deleted_at": None, "user_role": access.role, } diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index 46309bd357..f15d470105 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -22,6 +22,7 @@ from core import factories, models from core.services.search_indexers import SearchIndexer +from core.tasks.search import document_indexer_task pytestmark = pytest.mark.django_db @@ -1473,10 +1474,10 @@ def test_models_documents_post_save_indexer(mock_push, indexer_settings): key=itemgetter("id"), ) - # The debounce counters should be reset - assert cache.get(f"doc-indexer-debounce-{doc1.pk}") == 0 - assert cache.get(f"doc-indexer-debounce-{doc2.pk}") == 0 - assert cache.get(f"doc-indexer-debounce-{doc3.pk}") == 0 + # The throttle counters should be reset + assert cache.get(f"doc-indexer-throttle-{doc1.pk}") is None + assert cache.get(f"doc-indexer-throttle-{doc2.pk}") is None + assert cache.get(f"doc-indexer-throttle-{doc3.pk}") is None @mock.patch.object(SearchIndexer, "push") @@ -1486,10 +1487,31 @@ def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_se indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 indexer_settings.SEARCH_INDEXER_CLASS = None + user = factories.UserFactory() + + with transaction.atomic(): + doc = factories.DocumentFactory() + factories.UserDocumentAccessFactory(document=doc, user=user) + + assert mock_push.assert_not_called + + +@mock.patch.object(SearchIndexer, "push") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_wrongly_configured( + mock_push, indexer_settings +): + """Task should not start an indexation when disabled""" + indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 + indexer_settings.SEARCH_INDEXER_URL = None + + user = factories.UserFactory() + with transaction.atomic(): - factories.DocumentFactory() + doc = factories.DocumentFactory() + factories.UserDocumentAccessFactory(document=doc, user=user) - assert mock_push.call_args_list == [] + assert mock_push.assert_not_called @mock.patch.object(SearchIndexer, "push") @@ -1526,10 +1548,10 @@ def test_models_documents_post_save_indexer_with_accesses(mock_push, indexer_set key=itemgetter("id"), ) - # The debounce counters should be reset - assert 
cache.get(f"doc-indexer-debounce-{doc1.pk}") == 0 - assert cache.get(f"doc-indexer-debounce-{doc2.pk}") == 0 - assert cache.get(f"doc-indexer-debounce-{doc3.pk}") == 0 + # The throttle counters should be reset + assert cache.get(f"doc-indexer-throttle-{doc1.pk}") is None + assert cache.get(f"doc-indexer-throttle-{doc2.pk}") is None + assert cache.get(f"doc-indexer-throttle-{doc3.pk}") is None @mock.patch.object(SearchIndexer, "push") @@ -1588,10 +1610,34 @@ def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings) key=itemgetter("id"), ) - # The debounce counters should be reset - assert cache.get(f"doc-indexer-debounce-{doc.pk}") == 0 - assert cache.get(f"doc-indexer-debounce-{doc_deleted.pk}") == 0 - assert cache.get(f"doc-indexer-debounce-{doc_ancestor_deleted.pk}") == 0 + # The throttle counters should be reset + assert cache.get(f"doc-indexer-throttle-{doc.pk}") is None + assert cache.get(f"doc-indexer-throttle-{doc_deleted.pk}") is None + assert cache.get(f"doc-indexer-throttle-{doc_ancestor_deleted.pk}") is None + + +@pytest.mark.django_db(transaction=True) +def test_models_documents_indexer_hard_deleted(indexer_settings): + """Indexation task on hard deleted document""" + indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 + + user = factories.UserFactory() + + with transaction.atomic(): + doc = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + factories.UserDocumentAccessFactory(document=doc, user=user) + + doc_id = doc.pk + doc.delete() + + # Call task on deleted document. + document_indexer_task.apply(args=[doc_id]) + + with mock.patch.object(SearchIndexer, "push") as mock_push: + # Hard delete document are not re-indexed. + assert mock_push.assert_not_called @mock.patch.object(SearchIndexer, "push") @@ -1664,7 +1710,7 @@ def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings @pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer_debounce(indexer_settings): +def test_models_documents_post_save_indexer_throttle(indexer_settings): """Test indexation task skipping on document update""" indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 @@ -1681,11 +1727,11 @@ def test_models_documents_post_save_indexer_debounce(indexer_settings): } with mock.patch.object(SearchIndexer, "push") as mock_push: - # Simulate 1 waiting task - cache.set(f"doc-indexer-debounce-{doc.pk}", 1) + # Simulate 1 running task + cache.set(f"doc-indexer-throttle-{doc.pk}", 1) # save doc to trigger the indexer, but nothing should be done since - # the counter is over 0 + # the flag is up with transaction.atomic(): doc.save() @@ -1693,7 +1739,7 @@ def test_models_documents_post_save_indexer_debounce(indexer_settings): with mock.patch.object(SearchIndexer, "push") as mock_push: # No waiting task - cache.set(f"doc-indexer-debounce-{doc.pk}", 0) + cache.delete(f"doc-indexer-throttle-{doc.pk}") with transaction.atomic(): doc = models.Document.objects.get(pk=doc.pk) From 4974b5cf31b7f068ffba1c28c8c5e45870fd10eb Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Fri, 31 Oct 2025 11:20:46 +0100 Subject: [PATCH 17/22] =?UTF-8?q?=E2=9C=A8(backend)=20keep=20ordering=20fr?= =?UTF-8?q?om=20fulltext=20search=20in=20results?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keep ordering by score from Find API on search/ results and fallback search still uses "-update_at" ordering as default Refactor pagination to work with a list instead of a queryset Signed-off-by: Fabre Florian --- 
src/backend/core/api/viewsets.py | 58 +++-- .../documents/test_api_documents_search.py | 234 ++++++++++++++++-- 2 files changed, 245 insertions(+), 47 deletions(-) diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index c232fd14c2..f2988dca55 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -1071,12 +1071,10 @@ def duplicate(self, request, *args, **kwargs): {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED ) - def _simple_search_queryset(self, params): + def _search_simple(self, request, text): """ Returns a queryset filtered by the content of the document title """ - text = params.validated_data["q"] - # As in the 'list' view, we get a prefiltered queryset (deleted docs are excluded) queryset = self.get_queryset() filterset = DocumentFilter({"title": text}, queryset=queryset) @@ -1084,25 +1082,47 @@ def _simple_search_queryset(self, params): if not filterset.is_valid(): raise drf.exceptions.ValidationError(filterset.errors) - return filterset.filter_queryset(queryset) + queryset = filterset.filter_queryset(queryset) + + return self.get_response_for_queryset( + queryset.order_by("-updated_at"), + context={ + "request": request, + }, + ) - def _fulltext_search_queryset(self, indexer, token, user, params): + def _search_fulltext(self, indexer, request, params): """ Returns a queryset from the results of the fulltext search of Find """ + access_token = request.session.get("oidc_access_token") + user = request.user text = params.validated_data["q"] queryset = models.Document.objects.all() # Retrieve the document ids from Find. results = indexer.search( text=text, - token=token, + token=access_token, visited=get_visited_document_ids_of(queryset, user), - page=params.validated_data.get("page", 1), - page_size=params.validated_data.get("page_size", 20), + page=1, + page_size=100, ) - return queryset.filter(pk__in=results) + docs_by_uuid = {str(d.pk): d for d in queryset.filter(pk__in=results)} + ordered_docs = [docs_by_uuid[id] for id in results] + + page = self.paginate_queryset(ordered_docs) + + serializer = self.get_serializer( + page if page else ordered_docs, + many=True, + context={ + "request": request, + }, + ) + + return self.get_paginated_response(serializer.data) @drf.decorators.action(detail=False, methods=["get"], url_path="search") @method_decorator(refresh_oidc_access_token) def search(self, request, *args, **kwargs): """ @@ -1118,29 +1138,17 @@ def search(self, request, *args, **kwargs): The ordering is always by the most recent first. """ - access_token = request.session.get("oidc_access_token") - user = request.user - params = serializers.SearchDocumentSerializer(data=request.query_params) + params = serializers.SearchDocumentSerializer(data=request.query_params) params.is_valid(raise_exception=True) indexer = get_document_indexer() if indexer: - queryset = self._fulltext_search_queryset( - indexer, token=access_token, user=user, params=params - ) - else: - # The indexer is not configured, we fall back on a simple icontains filter on the - # model field 'title'. - queryset = self._simple_search_queryset(params) + return self._search_fulltext(indexer, request, params=params) return self.get_response_for_queryset( - queryset.order_by("-updated_at"), - context={ - "request": request, - }, - ) + # The indexer is not configured, we fall back on a simple icontains filter on the + # model field 'title'.
+ return self._search_simple(request, text=params.validated_data["q"]) @drf.decorators.action(detail=True, methods=["get"], url_path="versions") def versions_list(self, request, *args, **kwargs): diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py index 869a8d5666..3476617822 100644 --- a/src/backend/core/tests/documents/test_api_documents_search.py +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -2,8 +2,11 @@ Tests for Documents API endpoint in impress's core app: list """ +import random from json import loads as json_loads +from django.test import RequestFactory + import pytest import responses from faker import Faker @@ -16,6 +19,15 @@ pytestmark = pytest.mark.django_db +def build_search_url(**kwargs): + """Build absolute uri for search endpoint with ORDERED query arguments""" + return ( + RequestFactory() + .get("/api/v1.0/documents/search/", dict(sorted(kwargs.items()))) + .build_absolute_uri() + ) + + @pytest.mark.parametrize("role", models.LinkRoleChoices.values) @pytest.mark.parametrize("reach", models.LinkReachChoices.values) @responses.activate @@ -191,8 +203,62 @@ def test_api_documents_search_format(indexer_settings): @responses.activate -def test_api_documents_search_pagination(indexer_settings): - """Documents should be ordered by descending "updated_at" by default""" +@pytest.mark.parametrize( + "pagination, status, expected", + ( + ( + {"page": 1, "page_size": 10}, + 200, + { + "count": 10, + "previous": None, + "next": None, + "range": (0, None), + }, + ), + ( + {}, + 200, + { + "count": 10, + "previous": None, + "next": None, + "range": (0, None), + "api_page_size": 21, # default page_size is 20 + }, + ), + ( + {"page": 2, "page_size": 10}, + 404, + {}, + ), + ( + {"page": 1, "page_size": 5}, + 200, + { + "count": 10, + "previous": None, + "next": {"page": 2, "page_size": 5}, + "range": (0, 5), + }, + ), + ( + {"page": 2, "page_size": 5}, + 200, + { + "count": 10, + "previous": {"page_size": 5}, + "next": None, + "range": (5, None), + }, + ), + ({"page": 3, "page_size": 5}, 404, {}), + ), +) +def test_api_documents_search_pagination( + indexer_settings, pagination, status, expected +): + """Documents should be ordered by descending "score" by default""" indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" assert get_document_indexer() is not None @@ -202,35 +268,159 @@ def test_api_documents_search_pagination(indexer_settings): client = APIClient() client.force_login(user) - docs = factories.DocumentFactory.create_batch(10) + docs = factories.DocumentFactory.create_batch(10, title="alpha", users=[user]) + + docs_by_uuid = {str(doc.pk): doc for doc in docs} + api_results = [{"_id": id} for id in docs_by_uuid.keys()] + + # reorder randomly to simulate score ordering + random.shuffle(api_results) # Find response # pylint: disable-next=assignment-from-none api_search = responses.add( responses.POST, "http://find/api/v1.0/search", - json=[{"_id": str(doc.pk)} for doc in docs], + json=api_results, status=200, ) response = client.get( - "/api/v1.0/documents/search/", data={"q": "alpha", "page": 2, "page_size": 5} + "/api/v1.0/documents/search/", + data={ + "q": "alpha", + **pagination, + }, ) - assert response.status_code == 200 - content = response.json() - results = content.pop("results") - assert len(results) == 5 - - # Check the query parameters. 
- assert api_search.call_count == 1 - assert api_search.calls[0].response.status_code == 200 - assert json_loads(api_search.calls[0].request.body) == { - "q": "alpha", - "visited": [], - "services": ["docs"], - "page_number": 2, - "page_size": 5, - "order_by": "updated_at", - "order_direction": "desc", - } + assert response.status_code == status + + if response.status_code < 300: + previous_url = ( + build_search_url(q="alpha", **expected["previous"]) + if expected["previous"] + else None + ) + next_url = ( + build_search_url(q="alpha", **expected["next"]) + if expected["next"] + else None + ) + start, end = expected["range"] + + content = response.json() + + assert content["count"] == expected["count"] + assert content["previous"] == previous_url + assert content["next"] == next_url + + results = content.pop("results") + + # The find api results ordering by score is kept + assert [r["id"] for r in results] == [r["_id"] for r in api_results[start:end]] + + # Check the query parameters. + assert api_search.call_count == 1 + assert api_search.calls[0].response.status_code == 200 + assert json_loads(api_search.calls[0].request.body) == { + "q": "alpha", + "visited": [], + "services": ["docs"], + "page_number": 1, + "page_size": 100, + "order_by": "updated_at", + "order_direction": "desc", + } + + +@responses.activate +@pytest.mark.parametrize( + "pagination, status, expected", + ( + ( + {"page": 1, "page_size": 10}, + 200, + {"count": 10, "previous": None, "next": None, "range": (0, None)}, + ), + ( + {}, + 200, + {"count": 10, "previous": None, "next": None, "range": (0, None)}, + ), + ( + {"page": 2, "page_size": 10}, + 404, + {}, + ), + ( + {"page": 1, "page_size": 5}, + 200, + { + "count": 10, + "previous": None, + "next": {"page": 2, "page_size": 5}, + "range": (0, 5), + }, + ), + ( + {"page": 2, "page_size": 5}, + 200, + { + "count": 10, + "previous": {"page_size": 5}, + "next": None, + "range": (5, None), + }, + ), + ({"page": 3, "page_size": 5}, 404, {}), + ), +) +def test_api_documents_search_pagination_endpoint_is_none( + indexer_settings, pagination, status, expected +): + """Documents should be ordered by descending "-updated_at" by default""" + indexer_settings.SEARCH_INDEXER_QUERY_URL = None + + assert get_document_indexer() is None + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + factories.DocumentFactory.create_batch(10, title="alpha", users=[user]) + + response = client.get( + "/api/v1.0/documents/search/", + data={ + "q": "alpha", + **pagination, + }, + ) + + assert response.status_code == status + + if response.status_code < 300: + previous_url = ( + build_search_url(q="alpha", **expected["previous"]) + if expected["previous"] + else None + ) + next_url = ( + build_search_url(q="alpha", **expected["next"]) + if expected["next"] + else None + ) + queryset = models.Document.objects.order_by("-updated_at") + start, end = expected["range"] + expected_results = [str(d.pk) for d in queryset[start:end]] + + content = response.json() + + assert content["count"] == expected["count"] + assert content["previous"] == previous_url + assert content["next"] == next_url + + results = content.pop("results") + + assert [r["id"] for r in results] == expected_results From 78e2bc690e5abfd74ce37d7d9daa2d74c7e63c01 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Mon, 3 Nov 2025 10:40:40 +0100 Subject: [PATCH 18/22] =?UTF-8?q?=F0=9F=94=A7(compose)=20disable=20indexer?= =?UTF-8?q?=20in=20default=20configuration?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Set SEARCH_INDEXER_CLASS=None as default configuration for dev. Rename docker network 'lasuite-net' as 'lasuite' to match with Drive configuration. Signed-off-by: Fabre Florian --- compose.yml | 10 +++++----- env.d/development/common | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/compose.yml b/compose.yml index 1873d604dc..cb5905c330 100644 --- a/compose.yml +++ b/compose.yml @@ -74,7 +74,7 @@ services: - "8071:8000" networks: default: {} - lasuite-net: + lasuite: aliases: - impress volumes: @@ -99,7 +99,7 @@ services: - DJANGO_CONFIGURATION=Development networks: - default - - lasuite-net + - lasuite env_file: - env.d/development/common - env.d/development/common.local @@ -117,7 +117,7 @@ services: - "8083:8083" networks: default: {} - lasuite-net: + lasuite: aliases: - nginx volumes: @@ -232,6 +232,6 @@ services: restart: true networks: - lasuite-net: - name: lasuite-net + lasuite: + name: lasuite-network driver: bridge diff --git a/env.d/development/common b/env.d/development/common index a6bfb65b51..5b1564adf5 100644 --- a/env.d/development/common +++ b/env.d/development/common @@ -78,8 +78,8 @@ Y_PROVIDER_API_KEY=yprovider-api-key # Theme customization THEME_CUSTOMIZATION_CACHE_TIMEOUT=15 -# Indexer -SEARCH_INDEXER_CLASS="core.services.search_indexers.SearchIndexer" -SEARCH_INDEXER_SECRET=find-api-key-for-docs-with-exactly-50-chars-length # Key generated by create_demo in Find app. +# Indexer (disabled) +# SEARCH_INDEXER_CLASS="core.services.search_indexers.SearchIndexer" +SEARCH_INDEXER_SECRET=find-api-key-for-docs-with-exactly-50-chars-length # Key generated by create_demo in Find app. SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/" SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/" From a83f140510cc44421c5eab4d9f46334c4e45585d Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Fri, 3 Oct 2025 10:35:50 +0200 Subject: [PATCH 19/22] =?UTF-8?q?=F0=9F=93=9D(backend)=20add=20fulltext=20?= =?UTF-8?q?search=20documentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add documentation for env & Find+Docs configuration in dev mode Signed-off-by: Fabre Florian --- docs/architecture.md | 1 + docs/env.md | 6 ++++++ docs/search.md | 39 +++++++++++++++++++++++++++++++++++++ docs/system-requirements.md | 11 +++++++++++ env.d/development/common | 1 + 5 files changed, 58 insertions(+) create mode 100644 docs/search.md diff --git a/docs/architecture.md b/docs/architecture.md index 230d32458b..f858eb01d3 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -12,6 +12,7 @@ flowchart TD Back --> DB("Database (PostgreSQL)") Back <--> Celery --> DB Back ----> S3("Minio (S3)") + Back -- REST API --> Find ``` ### Architecture decision records diff --git a/docs/env.md b/docs/env.md index 0b3f9b3bf6..6ccab5e578 100644 --- a/docs/env.md +++ b/docs/env.md @@ -93,6 +93,12 @@ These are the environment variables you can set for the `impress-backend` contai | OIDC_USERINFO_SHORTNAME_FIELD | OIDC token claims to create shortname | first_name | | POSTHOG_KEY | Posthog key for analytics | | | REDIS_URL | Cache url | redis://redis:6379/1 | +| SEARCH_INDEXER_CLASS | Class of the backend for document indexation & search | | +| SEARCH_INDEXER_BATCH_SIZE | Size of each batch for indexation of all documents | 100000 | +| SEARCH_INDEXER_COUNTDOWN | Minimum debounce delay of indexation jobs (in seconds) | 1 | +| SEARCH_INDEXER_URL | Find application 
endpoint for indexation | | | SEARCH_INDEXER_SECRET | Token for indexation queries | | | SEARCH_INDEXER_QUERY_URL | Find application endpoint for search | | | SENTRY_DSN | Sentry host | | | SESSION_COOKIE_AGE | duration of the cookie session | 60*60*12 | | SPECTACULAR_SETTINGS_ENABLE_DJANGO_DEPLOY_CHECK | | false | diff --git a/docs/search.md b/docs/search.md new file mode 100644 index 0000000000..635810812b --- /dev/null +++ b/docs/search.md @@ -0,0 +1,39 @@ +# Set up the Find search for Impress + +This configuration enables the fulltext search feature for Docs: +- Each save on **core.Document** or **core.DocumentAccess** will trigger the indexer +- The `api/v1.0/documents/search/` endpoint will work as a proxy to the Find API for fulltext search. + +## Create an index service for Docs + +Configure a **Service** for the Docs application with these settings: + +- **Name**: `docs`
_request.auth.name of the Docs application._ +- **Client id**: `impress`
_Name of the token audience or client_id of the Docs application._ See [how-to-use-indexer.md](how-to-use-indexer.md) for details. ## Configure settings of Docs Add these Django settings to the Docs application to enable the feature. ```shell SEARCH_INDEXER_CLASS="core.services.search_indexers.SearchIndexer" SEARCH_INDEXER_COUNTDOWN=10 # Debounce delay in seconds for the indexer calls. # The token from the "docs" service of the Find application (development). SEARCH_INDEXER_SECRET="find-api-key-for-docs-with-exactly-50-chars-length" SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/" # Search endpoint. Uses the OIDC token for authentication. SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/" ``` We also need to enable **OIDC token** refresh, otherwise authentication will quickly start failing. ```shell # Store OIDC tokens in the session OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session OIDC_STORE_REFRESH_TOKEN_KEY = "your-32-byte-encryption-key==" # Must be a valid Fernet key (32 url-safe base64-encoded bytes) ``` diff --git a/docs/system-requirements.md b/docs/system-requirements.md index 8f2daafb2c..db337d9b23 100644 --- a/docs/system-requirements.md +++ b/docs/system-requirements.md @@ -97,6 +97,17 @@ Production deployments differ significantly from development environments. The t | 5433 | PostgreSQL (Keycloak) | | 1081 | MailCatcher | +**With fulltext search service** + +| Port | Service | +| --------- | --------------------- | +| 8081 | Find (Django) | +| 9200 | Opensearch | +| 9600 | Opensearch admin | +| 5601 | Opensearch dashboard | +| 25432 | PostgreSQL (Find) | + + ## 6. Sizing Guidelines **RAM** – start at 8 GB dev / 16 GB staging / 32 GB prod. Postgres and Keycloak are the first to OOM; scale them first. diff --git a/env.d/development/common b/env.d/development/common index 5b1564adf5..cdd734b55f 100644 --- a/env.d/development/common +++ b/env.d/development/common @@ -36,6 +36,7 @@ OIDC_OP_JWKS_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/c OIDC_OP_AUTHORIZATION_ENDPOINT=http://localhost:8083/realms/impress/protocol/openid-connect/auth OIDC_OP_TOKEN_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/token OIDC_OP_USER_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/userinfo +OIDC_OP_INTROSPECTION_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/token/introspect OIDC_RP_CLIENT_ID=impress OIDC_RP_CLIENT_SECRET=ThisIsAnExampleKeyForDevPurposeOnly From 5427f18ec0321e885f27bb3022ffbec889d4760c Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Fri, 31 Oct 2025 17:06:46 +0100 Subject: [PATCH 20/22] =?UTF-8?q?=E2=9C=A8(backend)=20use=20batches=20in?= =?UTF-8?q?=20indexing=20task?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduce the number of Find API calls by grouping all the latest changes for indexation: send all the documents updated or deleted since the task was triggered.
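"All the latest changes" boils down to a single queryset over the three timestamp fields of `Document`; for readers skimming the message, this restates the filter the new batch task applies in the diff below:

```python
from django.db.models import Q

from core import models


def documents_touched_since(timestamp):
    """Documents updated, deleted or ancestor-deleted since `timestamp`."""
    return models.Document.objects.filter(
        Q(updated_at__gte=timestamp)
        | Q(deleted_at__gte=timestamp)
        | Q(ancestors_deleted_at__gte=timestamp)
    )
```

This is presumably also why `soft_delete()` now propagates `updated_at` to descendants: they land in the same batch as their deleted ancestor.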
Signed-off-by: Fabre Florian --- src/backend/core/models.py | 3 +- src/backend/core/services/search_indexers.py | 5 +- src/backend/core/signals.py | 8 +- src/backend/core/tasks/search.py | 79 ++-- src/backend/core/tests/conftest.py | 1 + .../core/tests/test_models_documents.py | 331 ------------- .../tests/test_models_documents_indexer.py | 441 ++++++++++++++++++ 7 files changed, 498 insertions(+), 370 deletions(-) create mode 100644 src/backend/core/tests/test_models_documents_indexer.py diff --git a/src/backend/core/models.py b/src/backend/core/models.py index dcb13f9b16..e7994d9212 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -904,7 +904,8 @@ def soft_delete(self): # Mark all descendants as soft deleted self.get_descendants().filter(ancestors_deleted_at__isnull=True).update( - ancestors_deleted_at=self.ancestors_deleted_at + ancestors_deleted_at=self.ancestors_deleted_at, + updated_at=self.updated_at, ) @transaction.atomic diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py index abaa4d454c..45184f556a 100644 --- a/src/backend/core/services/search_indexers.py +++ b/src/backend/core/services/search_indexers.py @@ -130,16 +130,17 @@ def __init__(self, batch_size=None): "SEARCH_INDEXER_QUERY_URL must be set in Django settings." ) - def index(self): + def index(self, queryset=None): """ Fetch documents in batches, serialize them, and push to the search backend. """ last_id = 0 count = 0 + queryset = queryset or models.Document.objects.all() while True: documents_batch = list( - models.Document.objects.filter( + queryset.filter( id__gt=last_id, ).order_by("id")[: self.batch_size] ) diff --git a/src/backend/core/signals.py b/src/backend/core/signals.py index 1aa95f44be..4bd9647798 100644 --- a/src/backend/core/signals.py +++ b/src/backend/core/signals.py @@ -9,7 +9,7 @@ from django.dispatch import receiver from . import models -from .tasks.search import trigger_document_indexer +from .tasks.search import trigger_batch_document_indexer @receiver(signals.post_save, sender=models.Document) @@ -19,7 +19,7 @@ def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-ar Note : Within the transaction we can have an empty content and a serialization error. """ - transaction.on_commit(partial(trigger_document_indexer, instance)) + transaction.on_commit(partial(trigger_batch_document_indexer, instance)) @receiver(signals.post_save, sender=models.DocumentAccess) @@ -28,4 +28,6 @@ def document_access_post_save(sender, instance, created, **kwargs): # pylint: d Asynchronous call to the document indexer at the end of the transaction. 
""" if not created: - transaction.on_commit(partial(trigger_document_indexer, instance.document)) + transaction.on_commit( + partial(trigger_batch_document_indexer, instance.document) + ) diff --git a/src/backend/core/tasks/search.py b/src/backend/core/tasks/search.py index 9f3a8cf1a3..ebdf6c15ed 100644 --- a/src/backend/core/tasks/search.py +++ b/src/backend/core/tasks/search.py @@ -4,12 +4,12 @@ from django.conf import settings from django.core.cache import cache +from django.db.models import Q from django_redis.cache import RedisCache from core import models from core.services.search_indexers import ( - get_batch_accesses_by_users_and_teams, get_document_indexer, ) @@ -18,16 +18,30 @@ logger = getLogger(__file__) -def indexer_throttle_acquire(document_id, timeout=0, atomic=True): +@app.task +def document_indexer_task(document_id): + """Celery Task : Sends indexation query for a document.""" + indexer = get_document_indexer() + + if indexer is None: + return + + logger.info("Start document %s indexation", document_id) + indexer.index(models.Document.objects.filter(pk=document_id)) + + +def batch_indexer_throttle_acquire(timeout: int = 0, atomic: bool = True): """ Enable the task throttle flag for a delay. Uses redis locks if available to ensure atomic changes """ - key = f"doc-indexer-throttle-{document_id}" + key = "document-batch-indexer-throttle" + # Redis is used as cache database (not in tests). Use the lock feature here + # to ensure atomicity of changes to the throttle flag. if isinstance(cache, RedisCache) and atomic: with cache.locks(key): - return indexer_throttle_acquire(document_id, timeout, atomic=False) + return batch_indexer_throttle_acquire(timeout, atomic=False) # Use add() here : # - set the flag and returns true if not exist @@ -36,49 +50,48 @@ def indexer_throttle_acquire(document_id, timeout=0, atomic=True): @app.task -def document_indexer_task(document_id): - """Celery Task : Sends indexation query for a document.""" +def batch_document_indexer_task(timestamp): + """Celery Task : Sends indexation query for a batch of documents.""" indexer = get_document_indexer() - if indexer is None: - return - - try: - doc = models.Document.objects.get(pk=document_id) - except models.Document.DoesNotExist: - # Skip the task if the document does not exist. - return - - accesses = get_batch_accesses_by_users_and_teams((doc.path,)) - - data = indexer.serialize_document(document=doc, accesses=accesses) + if indexer: + queryset = models.Document.objects.filter( + Q(updated_at__gte=timestamp) + | Q(deleted_at__gte=timestamp) + | Q(ancestors_deleted_at__gte=timestamp) + ) - logger.info("Start document %s indexation", document_id) - indexer.push(data) + count = indexer.index(queryset) + logger.info("Indexed %d documents", count) -def trigger_document_indexer(document): +def trigger_batch_document_indexer(item): """ Trigger indexation task with debounce a delay set by the SEARCH_INDEXER_COUNTDOWN setting. Args: document (Document): The document instance. """ - countdown = settings.SEARCH_INDEXER_COUNTDOWN + countdown = int(settings.SEARCH_INDEXER_COUNTDOWN) # DO NOT create a task if indexation if disabled if not settings.SEARCH_INDEXER_CLASS: return - # Each time this method is called during a countdown, we increment the - # counter and each task decrease it, so the index be run only once. 
- if indexer_throttle_acquire(document.pk, timeout=countdown): - logger.info( - "Add task for document %s indexation in %.2f seconds", - document.pk, - countdown, - ) - - document_indexer_task.apply_async(args=[document.pk]) + if countdown > 0: + # Each time this method is called during a countdown, we increment the + # counter and each task decrease it, so the index be run only once. + if batch_indexer_throttle_acquire(timeout=countdown): + logger.info( + "Add task for batch document indexation from updated_at=%s in %d seconds", + item.updated_at.isoformat(), + countdown, + ) + + batch_document_indexer_task.apply_async( + args=[item.updated_at], countdown=countdown + ) + else: + logger.info("Skip task for batch document %s indexation", item.pk) else: - logger.info("Skip task for document %s indexation", document.pk) + document_indexer_task.apply(args=[item.pk]) diff --git a/src/backend/core/tests/conftest.py b/src/backend/core/tests/conftest.py index ba607b4228..65e3926934 100644 --- a/src/backend/core/tests/conftest.py +++ b/src/backend/core/tests/conftest.py @@ -45,6 +45,7 @@ def indexer_settings_fixture(settings): settings.SEARCH_INDEXER_QUERY_URL = ( "http://localhost:8081/api/v1.0/documents/search/" ) + settings.SEARCH_INDEXER_COUNTDOWN = 1 yield settings diff --git a/src/backend/core/tests/test_models_documents.py b/src/backend/core/tests/test_models_documents.py index f15d470105..69236b6e96 100644 --- a/src/backend/core/tests/test_models_documents.py +++ b/src/backend/core/tests/test_models_documents.py @@ -6,7 +6,6 @@ import random import smtplib from logging import Logger -from operator import itemgetter from unittest import mock from django.contrib.auth.models import AnonymousUser @@ -14,15 +13,12 @@ from django.core.cache import cache from django.core.exceptions import ValidationError from django.core.files.storage import default_storage -from django.db import transaction from django.test.utils import override_settings from django.utils import timezone import pytest from core import factories, models -from core.services.search_indexers import SearchIndexer -from core.tasks.search import document_indexer_task pytestmark = pytest.mark.django_db @@ -1449,330 +1445,3 @@ def test_models_documents_compute_ancestors_links_paths_mapping_structure( {"link_reach": sibling.link_reach, "link_role": sibling.link_role}, ], } - - -@mock.patch.object(SearchIndexer, "push") -@pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer(mock_push, indexer_settings): - """Test indexation task on document creation""" - indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - - with transaction.atomic(): - doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3) - - accesses = {} - data = [call.args[0] for call in mock_push.call_args_list] - - indexer = SearchIndexer() - - assert sorted(data, key=itemgetter("id")) == sorted( - [ - indexer.serialize_document(doc1, accesses), - indexer.serialize_document(doc2, accesses), - indexer.serialize_document(doc3, accesses), - ], - key=itemgetter("id"), - ) - - # The throttle counters should be reset - assert cache.get(f"doc-indexer-throttle-{doc1.pk}") is None - assert cache.get(f"doc-indexer-throttle-{doc2.pk}") is None - assert cache.get(f"doc-indexer-throttle-{doc3.pk}") is None - - -@mock.patch.object(SearchIndexer, "push") -@pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_settings): - """Task should not start an indexation when disabled""" - 
indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - indexer_settings.SEARCH_INDEXER_CLASS = None - - user = factories.UserFactory() - - with transaction.atomic(): - doc = factories.DocumentFactory() - factories.UserDocumentAccessFactory(document=doc, user=user) - - assert mock_push.assert_not_called - - -@mock.patch.object(SearchIndexer, "push") -@pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer_wrongly_configured( - mock_push, indexer_settings -): - """Task should not start an indexation when disabled""" - indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - indexer_settings.SEARCH_INDEXER_URL = None - - user = factories.UserFactory() - - with transaction.atomic(): - doc = factories.DocumentFactory() - factories.UserDocumentAccessFactory(document=doc, user=user) - - assert mock_push.assert_not_called - - -@mock.patch.object(SearchIndexer, "push") -@pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer_with_accesses(mock_push, indexer_settings): - """Test indexation task on document creation""" - indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - - user = factories.UserFactory() - - with transaction.atomic(): - doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3) - - factories.UserDocumentAccessFactory(document=doc1, user=user) - factories.UserDocumentAccessFactory(document=doc2, user=user) - factories.UserDocumentAccessFactory(document=doc3, user=user) - - accesses = { - str(doc1.path): {"users": [user.sub]}, - str(doc2.path): {"users": [user.sub]}, - str(doc3.path): {"users": [user.sub]}, - } - - data = [call.args[0] for call in mock_push.call_args_list] - - indexer = SearchIndexer() - - assert sorted(data, key=itemgetter("id")) == sorted( - [ - indexer.serialize_document(doc1, accesses), - indexer.serialize_document(doc2, accesses), - indexer.serialize_document(doc3, accesses), - ], - key=itemgetter("id"), - ) - - # The throttle counters should be reset - assert cache.get(f"doc-indexer-throttle-{doc1.pk}") is None - assert cache.get(f"doc-indexer-throttle-{doc2.pk}") is None - assert cache.get(f"doc-indexer-throttle-{doc3.pk}") is None - - -@mock.patch.object(SearchIndexer, "push") -@pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer_deleted(mock_push, indexer_settings): - """Indexation task on deleted or ancestor_deleted documents""" - indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - - user = factories.UserFactory() - - with transaction.atomic(): - doc = factories.DocumentFactory( - link_reach=models.LinkReachChoices.AUTHENTICATED - ) - doc_deleted = factories.DocumentFactory( - link_reach=models.LinkReachChoices.AUTHENTICATED - ) - doc_ancestor_deleted = factories.DocumentFactory( - parent=doc_deleted, - link_reach=models.LinkReachChoices.AUTHENTICATED, - ) - doc_deleted.soft_delete() - doc_ancestor_deleted.ancestors_deleted_at = doc_deleted.deleted_at - - factories.UserDocumentAccessFactory(document=doc, user=user) - factories.UserDocumentAccessFactory(document=doc_deleted, user=user) - factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user) - - doc_deleted.refresh_from_db() - doc_ancestor_deleted.refresh_from_db() - - assert doc_deleted.deleted_at is not None - assert doc_deleted.ancestors_deleted_at is not None - - assert doc_ancestor_deleted.deleted_at is None - assert doc_ancestor_deleted.ancestors_deleted_at is not None - - accesses = { - str(doc.path): {"users": [user.sub]}, - str(doc_deleted.path): {"users": [user.sub]}, - str(doc_ancestor_deleted.path): 
{"users": [user.sub]}, - } - - data = [call.args[0] for call in mock_push.call_args_list] - - indexer = SearchIndexer() - - # Even deleted document are re-indexed : only update their status in the future ? - assert sorted(data, key=itemgetter("id")) == sorted( - [ - indexer.serialize_document(doc, accesses), - indexer.serialize_document(doc_deleted, accesses), - indexer.serialize_document(doc_ancestor_deleted, accesses), - indexer.serialize_document(doc_deleted, accesses), # soft_delete() - ], - key=itemgetter("id"), - ) - - # The throttle counters should be reset - assert cache.get(f"doc-indexer-throttle-{doc.pk}") is None - assert cache.get(f"doc-indexer-throttle-{doc_deleted.pk}") is None - assert cache.get(f"doc-indexer-throttle-{doc_ancestor_deleted.pk}") is None - - -@pytest.mark.django_db(transaction=True) -def test_models_documents_indexer_hard_deleted(indexer_settings): - """Indexation task on hard deleted document""" - indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - - user = factories.UserFactory() - - with transaction.atomic(): - doc = factories.DocumentFactory( - link_reach=models.LinkReachChoices.AUTHENTICATED - ) - factories.UserDocumentAccessFactory(document=doc, user=user) - - doc_id = doc.pk - doc.delete() - - # Call task on deleted document. - document_indexer_task.apply(args=[doc_id]) - - with mock.patch.object(SearchIndexer, "push") as mock_push: - # Hard delete document are not re-indexed. - assert mock_push.assert_not_called - - -@mock.patch.object(SearchIndexer, "push") -@pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer_restored(mock_push, indexer_settings): - """Restart indexation task on restored documents""" - indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - - user = factories.UserFactory() - - with transaction.atomic(): - doc = factories.DocumentFactory( - link_reach=models.LinkReachChoices.AUTHENTICATED - ) - doc_deleted = factories.DocumentFactory( - link_reach=models.LinkReachChoices.AUTHENTICATED - ) - doc_ancestor_deleted = factories.DocumentFactory( - parent=doc_deleted, - link_reach=models.LinkReachChoices.AUTHENTICATED, - ) - doc_deleted.soft_delete() - doc_ancestor_deleted.ancestors_deleted_at = doc_deleted.deleted_at - - factories.UserDocumentAccessFactory(document=doc, user=user) - factories.UserDocumentAccessFactory(document=doc_deleted, user=user) - factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user) - - doc_deleted.refresh_from_db() - doc_ancestor_deleted.refresh_from_db() - - assert doc_deleted.deleted_at is not None - assert doc_deleted.ancestors_deleted_at is not None - - assert doc_ancestor_deleted.deleted_at is None - assert doc_ancestor_deleted.ancestors_deleted_at is not None - - doc_restored = models.Document.objects.get(pk=doc_deleted.pk) - doc_restored.restore() - - doc_ancestor_restored = models.Document.objects.get(pk=doc_ancestor_deleted.pk) - - assert doc_restored.deleted_at is None - assert doc_restored.ancestors_deleted_at is None - - assert doc_ancestor_restored.deleted_at is None - assert doc_ancestor_restored.ancestors_deleted_at is None - - accesses = { - str(doc.path): {"users": [user.sub]}, - str(doc_deleted.path): {"users": [user.sub]}, - str(doc_ancestor_deleted.path): {"users": [user.sub]}, - } - - data = [call.args[0] for call in mock_push.call_args_list] - - indexer = SearchIndexer() - - # All docs are re-indexed - assert sorted(data, key=itemgetter("id")) == sorted( - [ - indexer.serialize_document(doc, accesses), - indexer.serialize_document(doc_deleted, 
accesses), - indexer.serialize_document(doc_deleted, accesses), # soft_delete() - indexer.serialize_document(doc_restored, accesses), # restore() - indexer.serialize_document(doc_ancestor_deleted, accesses), - ], - key=itemgetter("id"), - ) - - -@pytest.mark.django_db(transaction=True) -def test_models_documents_post_save_indexer_throttle(indexer_settings): - """Test indexation task skipping on document update""" - indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - - indexer = SearchIndexer() - user = factories.UserFactory() - - with mock.patch.object(SearchIndexer, "push"): - with transaction.atomic(): - doc = factories.DocumentFactory() - factories.UserDocumentAccessFactory(document=doc, user=user) - - accesses = { - str(doc.path): {"users": [user.sub]}, - } - - with mock.patch.object(SearchIndexer, "push") as mock_push: - # Simulate 1 running task - cache.set(f"doc-indexer-throttle-{doc.pk}", 1) - - # save doc to trigger the indexer, but nothing should be done since - # the flag is up - with transaction.atomic(): - doc.save() - - assert [call.args[0] for call in mock_push.call_args_list] == [] - - with mock.patch.object(SearchIndexer, "push") as mock_push: - # No waiting task - cache.delete(f"doc-indexer-throttle-{doc.pk}") - - with transaction.atomic(): - doc = models.Document.objects.get(pk=doc.pk) - doc.save() - - assert [call.args[0] for call in mock_push.call_args_list] == [ - indexer.serialize_document(doc, accesses), - ] - - -@pytest.mark.django_db(transaction=True) -def test_models_documents_access_post_save_indexer(indexer_settings): - """Test indexation task on DocumentAccess update""" - indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 - - indexer = SearchIndexer() - user = factories.UserFactory() - - with mock.patch.object(SearchIndexer, "push"): - with transaction.atomic(): - doc = factories.DocumentFactory() - doc_access = factories.UserDocumentAccessFactory(document=doc, user=user) - - accesses = { - str(doc.path): {"users": [user.sub]}, - } - - indexer = SearchIndexer() - - with mock.patch.object(SearchIndexer, "push") as mock_push: - with transaction.atomic(): - doc_access.save() - - assert [call.args[0] for call in mock_push.call_args_list] == [ - indexer.serialize_document(doc, accesses), - ] diff --git a/src/backend/core/tests/test_models_documents_indexer.py b/src/backend/core/tests/test_models_documents_indexer.py new file mode 100644 index 0000000000..9e171f724d --- /dev/null +++ b/src/backend/core/tests/test_models_documents_indexer.py @@ -0,0 +1,441 @@ +""" +Unit tests for the Document model +""" +# pylint: disable=too-many-lines + +from operator import itemgetter +from unittest import mock + +from django.core.cache import cache +from django.db import transaction + +import pytest + +from core import factories, models +from core.services.search_indexers import SearchIndexer + +pytestmark = pytest.mark.django_db + + +def reset_batch_indexer_throttle(): + """Reset throttle flag""" + cache.delete("document-batch-indexer-throttle") + + +@pytest.fixture(autouse=True) +def reset_throttle(): + """Reset throttle flag before each test""" + reset_batch_indexer_throttle() + yield + reset_batch_indexer_throttle() + + +@mock.patch.object(SearchIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer(mock_push): + """Test indexation task on document creation""" + with transaction.atomic(): + doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3) + + accesses = {} + data = [call.args[0] 
for call in mock_push.call_args_list]
+
+    indexer = SearchIndexer()
+
+    assert len(data) == 1
+
+    # One call
+    assert sorted(data[0], key=itemgetter("id")) == sorted(
+        [
+            indexer.serialize_document(doc1, accesses),
+            indexer.serialize_document(doc2, accesses),
+            indexer.serialize_document(doc3, accesses),
+        ],
+        key=itemgetter("id"),
+    )
+
+    # The throttle flag is still set until the countdown expires
+    assert cache.get("document-batch-indexer-throttle") == 1
+
+
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_no_batches(indexer_settings):
+    """Test indexation task on document creation, no throttle"""
+    indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
+
+    with mock.patch.object(SearchIndexer, "push") as mock_push:
+        with transaction.atomic():
+            doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
+
+    accesses = {}
+    data = [call.args[0] for call in mock_push.call_args_list]
+
+    indexer = SearchIndexer()
+
+    # 3 calls
+    assert len(data) == 3
+    # one document per call
+    assert [len(d) for d in data] == [1] * 3
+    # all documents are indexed
+    assert sorted([d[0] for d in data], key=itemgetter("id")) == sorted(
+        [
+            indexer.serialize_document(doc1, accesses),
+            indexer.serialize_document(doc2, accesses),
+            indexer.serialize_document(doc3, accesses),
+        ],
+        key=itemgetter("id"),
+    )
+
+    # The throttle flag should never be set when the countdown is 0
+    assert cache.get("document-batch-indexer-throttle") is None
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_settings):
+    """Task should not start an indexation when disabled"""
+    indexer_settings.SEARCH_INDEXER_CLASS = None
+
+    user = factories.UserFactory()
+
+    with transaction.atomic():
+        doc = factories.DocumentFactory()
+        factories.UserDocumentAccessFactory(document=doc, user=user)
+
+    mock_push.assert_not_called()
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_wrongly_configured(
+    mock_push, indexer_settings
+):
+    """Task should not start an indexation when misconfigured"""
+    indexer_settings.SEARCH_INDEXER_URL = None
+
+    user = factories.UserFactory()
+
+    with transaction.atomic():
+        doc = factories.DocumentFactory()
+        factories.UserDocumentAccessFactory(document=doc, user=user)
+
+    mock_push.assert_not_called()
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_with_accesses(mock_push):
+    """Test indexation task on document creation"""
+    user = factories.UserFactory()
+
+    with transaction.atomic():
+        doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
+
+        factories.UserDocumentAccessFactory(document=doc1, user=user)
+        factories.UserDocumentAccessFactory(document=doc2, user=user)
+        factories.UserDocumentAccessFactory(document=doc3, user=user)
+
+    accesses = {
+        str(doc1.path): {"users": [user.sub]},
+        str(doc2.path): {"users": [user.sub]},
+        str(doc3.path): {"users": [user.sub]},
+    }
+
+    data = [call.args[0] for call in mock_push.call_args_list]
+
+    indexer = SearchIndexer()
+
+    assert len(data) == 1
+    assert sorted(data[0], key=itemgetter("id")) == sorted(
+        [
+            indexer.serialize_document(doc1, accesses),
+            indexer.serialize_document(doc2, accesses),
+            indexer.serialize_document(doc3, accesses),
+        ],
+        key=itemgetter("id"),
+    )
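The tests above pin down the dispatch behaviour on each side of the countdown: with a positive countdown all pending documents land in a single batched push, while a zero countdown falls back to one synchronous push per document. Schematically (stand-in `push` and string documents, not project code):

```python
# Schematic dispatcher mirroring the behaviour the tests above assert.
pushed = []


def push(batch):
    pushed.append(list(batch))


def dispatch(docs, countdown):
    if countdown > 0:
        push(docs)          # one batched indexation per debounce window
    else:
        for doc in docs:
            push([doc])     # one indexation query per document


dispatch(["doc1", "doc2", "doc3"], countdown=1)
assert pushed == [["doc1", "doc2", "doc3"]]

pushed.clear()
dispatch(["doc1", "doc2", "doc3"], countdown=0)
assert pushed == [["doc1"], ["doc2"], ["doc3"]]
```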
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_deleted(mock_push):
+    """Indexation task on deleted or ancestor_deleted documents"""
+    user = factories.UserFactory()
+
+    with transaction.atomic():
+        doc = factories.DocumentFactory(
+            link_reach=models.LinkReachChoices.AUTHENTICATED
+        )
+        main_doc = factories.DocumentFactory(
+            link_reach=models.LinkReachChoices.AUTHENTICATED
+        )
+        child_doc = factories.DocumentFactory(
+            parent=main_doc,
+            link_reach=models.LinkReachChoices.AUTHENTICATED,
+        )
+
+        factories.UserDocumentAccessFactory(document=doc, user=user)
+        factories.UserDocumentAccessFactory(document=main_doc, user=user)
+        factories.UserDocumentAccessFactory(document=child_doc, user=user)
+
+    # Manually reset the throttle flag here or the next indexation will be ignored for 1 second
+    reset_batch_indexer_throttle()
+
+    with transaction.atomic():
+        main_doc_deleted = models.Document.objects.get(pk=main_doc.pk)
+        main_doc_deleted.soft_delete()
+
+        child_doc_deleted = models.Document.objects.get(pk=child_doc.pk)
+
+    main_doc_deleted.refresh_from_db()
+    child_doc_deleted.refresh_from_db()
+
+    assert main_doc_deleted.deleted_at is not None
+    assert main_doc_deleted.ancestors_deleted_at is not None
+
+    assert child_doc_deleted.deleted_at is None
+    assert child_doc_deleted.ancestors_deleted_at is not None
+
+    accesses = {
+        str(doc.path): {"users": [user.sub]},
+        str(main_doc_deleted.path): {"users": [user.sub]},
+        str(child_doc_deleted.path): {"users": [user.sub]},
+    }
+
+    data = [call.args[0] for call in mock_push.call_args_list]
+
+    indexer = SearchIndexer()
+
+    assert len(data) == 2
+
+    # First indexation on document creation
+    assert sorted(data[0], key=itemgetter("id")) == sorted(
+        [
+            indexer.serialize_document(doc, accesses),
+            indexer.serialize_document(main_doc, accesses),
+            indexer.serialize_document(child_doc, accesses),
+        ],
+        key=itemgetter("id"),
+    )
+
+    # Even deleted documents are re-indexed: a future improvement could only update their status
+    assert sorted(data[1], key=itemgetter("id")) == sorted(
+        [
+            indexer.serialize_document(main_doc_deleted, accesses),  # soft_delete()
+            indexer.serialize_document(child_doc_deleted, accesses),
+        ],
+        key=itemgetter("id"),
+    )
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.usefixtures("indexer_settings")
+def test_models_documents_indexer_hard_deleted():
+    """Indexation task on hard deleted document"""
+    user = factories.UserFactory()
+
+    with transaction.atomic():
+        doc = factories.DocumentFactory(
+            link_reach=models.LinkReachChoices.AUTHENTICATED
+        )
+        factories.UserDocumentAccessFactory(document=doc, user=user)
+
+    # Hard delete the document while watching the indexer.
+    with mock.patch.object(SearchIndexer, "push") as mock_push:
+        doc.delete()
+
+        # Hard-deleted documents are not re-indexed.
+        mock_push.assert_not_called()
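For readers skimming the deletion tests, the document states they rely on can be condensed as follows; this is a pytest-style sketch that assumes this repository's factories and a `django_db` test context, so it only runs inside the test suite:

```python
# Condensed view of the deletion states the indexer tests exercise.
from core import factories, models


def sketch_deletion_states():
    parent = factories.DocumentFactory()
    child = factories.DocumentFactory(parent=parent)

    parent.soft_delete()  # sets deleted_at and ancestors_deleted_at on parent
    child.refresh_from_db()
    assert child.deleted_at is None
    assert child.ancestors_deleted_at is not None  # flagged through its ancestor

    restored = models.Document.objects.get(pk=parent.pk)
    restored.restore()  # clears both timestamps down the subtree

    restored.delete()   # hard delete: the row is gone, no save() signal fires,
                        # so nothing is pushed to the index (see the test above)
```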
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_restored(mock_push):
+    """Restart indexation task on restored documents"""
+    user = factories.UserFactory()
+
+    with transaction.atomic():
+        doc = factories.DocumentFactory(
+            link_reach=models.LinkReachChoices.AUTHENTICATED
+        )
+        doc_deleted = factories.DocumentFactory(
+            link_reach=models.LinkReachChoices.AUTHENTICATED
+        )
+        doc_ancestor_deleted = factories.DocumentFactory(
+            parent=doc_deleted,
+            link_reach=models.LinkReachChoices.AUTHENTICATED,
+        )
+
+        factories.UserDocumentAccessFactory(document=doc, user=user)
+        factories.UserDocumentAccessFactory(document=doc_deleted, user=user)
+        factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user)
+
+        doc_deleted.soft_delete()
+
+    doc_deleted.refresh_from_db()
+    doc_ancestor_deleted.refresh_from_db()
+
+    assert doc_deleted.deleted_at is not None
+    assert doc_deleted.ancestors_deleted_at is not None
+
+    assert doc_ancestor_deleted.deleted_at is None
+    assert doc_ancestor_deleted.ancestors_deleted_at is not None
+
+    # Manually reset the throttle flag here or the next indexation will be ignored for 1 second
+    reset_batch_indexer_throttle()
+
+    with transaction.atomic():
+        doc_restored = models.Document.objects.get(pk=doc_deleted.pk)
+        doc_restored.restore()
+
+    doc_ancestor_restored = models.Document.objects.get(pk=doc_ancestor_deleted.pk)
+
+    assert doc_restored.deleted_at is None
+    assert doc_restored.ancestors_deleted_at is None
+
+    assert doc_ancestor_restored.deleted_at is None
+    assert doc_ancestor_restored.ancestors_deleted_at is None
+
+    accesses = {
+        str(doc.path): {"users": [user.sub]},
+        str(doc_deleted.path): {"users": [user.sub]},
+        str(doc_ancestor_deleted.path): {"users": [user.sub]},
+    }
+
+    data = [call.args[0] for call in mock_push.call_args_list]
+
+    indexer = SearchIndexer()
+
+    # All docs are re-indexed
+    assert len(data) == 2
+
+    # First indexation on documents creation & soft delete (in the same transaction)
+    assert sorted(data[0], key=itemgetter("id")) == sorted(
+        [
+            indexer.serialize_document(doc, accesses),
+            indexer.serialize_document(doc_deleted, accesses),
+            indexer.serialize_document(doc_ancestor_deleted, accesses),
+        ],
+        key=itemgetter("id"),
+    )
+
+    # Restored documents are re-indexed: a future improvement could only update their status
+    assert sorted(data[1], key=itemgetter("id")) == sorted(
+        [
+            indexer.serialize_document(doc_restored, accesses),  # restore()
+            indexer.serialize_document(doc_ancestor_restored, accesses),
+        ],
+        key=itemgetter("id"),
+    )
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.usefixtures("indexer_settings")
+def test_models_documents_post_save_indexer_throttle():
+    """Test indexation task skipping on document update"""
+    indexer = SearchIndexer()
+    user = factories.UserFactory()
+
+    with mock.patch.object(SearchIndexer, "push"):
+        with transaction.atomic():
+            docs = factories.DocumentFactory.create_batch(5, users=(user,))
+
+    accesses = {str(item.path): {"users": [user.sub]} for item in docs}
+
+    with mock.patch.object(SearchIndexer, "push") as mock_push:
+        # Simulate 1 running task
+        cache.set("document-batch-indexer-throttle", 1)
+
+        # Save docs to trigger the indexer; nothing should happen while
+        # the flag is up
+        with transaction.atomic():
+            docs[0].save()
+            docs[2].save()
+            docs[3].save()
+
+        assert [call.args[0] for call in
mock_push.call_args_list] == []
+
+    with mock.patch.object(SearchIndexer, "push") as mock_push:
+        # No waiting task
+        cache.delete("document-batch-indexer-throttle")
+
+        with transaction.atomic():
+            docs[0].save()
+            docs[2].save()
+            docs[3].save()
+
+    data = [call.args[0] for call in mock_push.call_args_list]
+
+    # One call
+    assert len(data) == 1
+
+    assert sorted(data[0], key=itemgetter("id")) == sorted(
+        [
+            indexer.serialize_document(docs[0], accesses),
+            indexer.serialize_document(docs[2], accesses),
+            indexer.serialize_document(docs[3], accesses),
+        ],
+        key=itemgetter("id"),
+    )
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.usefixtures("indexer_settings")
+def test_models_documents_access_post_save_indexer():
+    """Test indexation task on DocumentAccess update"""
+    users = factories.UserFactory.create_batch(3)
+
+    with mock.patch.object(SearchIndexer, "push"):
+        with transaction.atomic():
+            doc = factories.DocumentFactory(users=users)
+            doc_accesses = models.DocumentAccess.objects.filter(document=doc).order_by(
+                "user__sub"
+            )
+
+    reset_batch_indexer_throttle()
+
+    with mock.patch.object(SearchIndexer, "push") as mock_push:
+        with transaction.atomic():
+            for doc_access in doc_accesses:
+                doc_access.save()
+
+    data = [call.args[0] for call in mock_push.call_args_list]
+
+    # One call
+    assert len(data) == 1
+
+    assert [d["id"] for d in data[0]] == [str(doc.pk)]
+
+
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_access_post_save_indexer_no_throttle(indexer_settings):
+    """Test indexation task on DocumentAccess update, no throttle"""
+    indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
+
+    users = factories.UserFactory.create_batch(3)
+
+    with transaction.atomic():
+        doc = factories.DocumentFactory(users=users)
+        doc_accesses = models.DocumentAccess.objects.filter(document=doc).order_by(
+            "user__sub"
+        )
+
+    reset_batch_indexer_throttle()
+
+    with mock.patch.object(SearchIndexer, "push") as mock_push:
+        with transaction.atomic():
+            for doc_access in doc_accesses:
+                doc_access.save()
+
+    data = [call.args[0] for call in mock_push.call_args_list]
+
+    # 3 calls
+    assert len(data) == 3
+    # one document per call
+    assert [len(d) for d in data] == [1] * 3
+    # the same document is indexed 3 times
+    assert [d[0]["id"] for d in data] == [str(doc.pk)] * 3
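The batch task driven by these tests re-indexes everything touched since the `updated_at` timestamp captured when the throttle flag was first raised. Its selection query, restated on its own (this assumes the repository's `Document` model and a timezone-aware `timestamp`):

```python
# The batch task's document selection, taken from the diff above.
from django.db.models import Q

from core import models


def documents_touched_since(timestamp):
    """Documents updated, soft-deleted, or ancestor-deleted since `timestamp`."""
    return models.Document.objects.filter(
        Q(updated_at__gte=timestamp)
        | Q(deleted_at__gte=timestamp)
        | Q(ancestors_deleted_at__gte=timestamp)
    )
```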
From 3652732852af104cf2bed4709d41a1d787e40c9c Mon Sep 17 00:00:00 2001
From: Fabre Florian
Date: Fri, 14 Nov 2025 10:22:29 +0100
Subject: [PATCH 21/22] =?UTF-8?q?=F0=9F=A9=B9(backend)=20fix=20empty=20ind?=
 =?UTF-8?q?exation=20batch?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Since empty documents are filtered out of a batch during indexing, some
batches can end up empty and cause an error. Such batches are now skipped.

Also add a --batch-size argument to the index command.

Signed-off-by: Fabre Florian
---
 src/backend/core/management/commands/index.py | 14 ++++-
 src/backend/core/services/search_indexers.py  | 24 ++++----
 .../tests/test_services_search_indexers.py    | 57 +++++++++++++++++++
 3 files changed, 84 insertions(+), 11 deletions(-)

diff --git a/src/backend/core/management/commands/index.py b/src/backend/core/management/commands/index.py
index 7db787cf98..af046e0608 100644
--- a/src/backend/core/management/commands/index.py
+++ b/src/backend/core/management/commands/index.py
@@ -17,6 +17,17 @@ class Command(BaseCommand):

     help = __doc__

+    def add_arguments(self, parser):
+        """Add the --batch-size argument controlling the indexation batch size."""
+        parser.add_argument(
+            "--batch-size",
+            action="store",
+            dest="batch_size",
+            type=int,
+            default=50,
+            help="Indexation query batch size",
+        )
+
     def handle(self, *args, **options):
         """Launch and log search index generation."""
         indexer = get_document_indexer()
@@ -26,9 +37,10 @@ def handle(self, *args, **options):

         logger.info("Starting to regenerate Find index...")
         start = time.perf_counter()
+        batch_size = options["batch_size"]

         try:
-            count = indexer.index()
+            count = indexer.index(batch_size=batch_size)
         except Exception as err:
             raise CommandError("Unable to regenerate index") from err
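With the new flag, a full reindex can be tuned per run, e.g. `python manage.py index --batch-size 100` from a shell. The same can be done programmatically; this is illustrative only and requires a configured Django environment for this project:

```python
# Illustrative: call_command maps the keyword to the argument's dest="batch_size".
from django.core.management import call_command

# Regenerate the Find index with 100 documents per indexation request,
# instead of the command's default of 50.
call_command("index", batch_size=100)
```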
""" last_id = 0 count = 0 queryset = queryset or models.Document.objects.all() + batch_size = batch_size or self.batch_size while True: documents_batch = list( queryset.filter( id__gt=last_id, - ).order_by("id")[: self.batch_size] + ).order_by("id")[:batch_size] ) if not documents_batch: @@ -158,8 +161,9 @@ def index(self, queryset=None): if document.content or document.title ] - self.push(serialized_batch) - count += len(serialized_batch) + if serialized_batch: + self.push(serialized_batch) + count += len(serialized_batch) return count diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index a65f92ca06..0a6daf4045 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -299,6 +299,44 @@ def test_services_search_indexers_batches_pass_only_batch_accesses( assert seen_doc_ids == {str(d.id) for d in documents} +@patch.object(SearchIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +def test_services_search_indexers_batch_size_argument(mock_push): + """ + Documents indexing should be processed in batches, + batch_size overrides SEARCH_INDEXER_BATCH_SIZE + """ + documents = factories.DocumentFactory.create_batch(5) + + # Attach a single user access to each document + expected_user_subs = {} + for document in documents: + access = factories.UserDocumentAccessFactory(document=document) + expected_user_subs[str(document.id)] = str(access.user.sub) + + assert SearchIndexer().index(batch_size=2) == 5 + + # Should be 3 batches: 2 + 2 + 1 + assert mock_push.call_count == 3 + + seen_doc_ids = set() + + for call in mock_push.call_args_list: + batch = call.args[0] + assert isinstance(batch, list) + + for doc_json in batch: + doc_id = doc_json["id"] + seen_doc_ids.add(doc_id) + + # Only one user expected per document + assert doc_json["users"] == [expected_user_subs[doc_id]] + assert doc_json["groups"] == [] + + # Make sure all 5 documents were indexed + assert seen_doc_ids == {str(d.id) for d in documents} + + @patch.object(SearchIndexer, "push") @pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ignore_empty_documents(mock_push): @@ -327,6 +365,25 @@ def test_services_search_indexers_ignore_empty_documents(mock_push): } +@patch.object(SearchIndexer, "push") +def test_services_search_indexers_skip_empty_batches(mock_push, indexer_settings): + """ + Documents indexing batch can be empty if all the docs are empty. + """ + indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2 + + document = factories.DocumentFactory() + + # Only empty docs + factories.DocumentFactory.create_batch(5, content="", title="") + + assert SearchIndexer().index() == 1 + assert mock_push.call_count == 1 + + results = [doc["id"] for doc in mock_push.call_args[0][0]] + assert results == [str(document.id)] + + @patch.object(SearchIndexer, "push") @pytest.mark.usefixtures("indexer_settings") def test_services_search_indexers_ancestors_link_reach(mock_push): From f2106dd88098a803645233956f74081e1bbcab42 Mon Sep 17 00:00:00 2001 From: Fabre Florian Date: Fri, 14 Nov 2025 13:31:09 +0100 Subject: [PATCH 22/22] =?UTF-8?q?=E2=9C=A8(backend)=20adapt=20to=20Find=20?= =?UTF-8?q?new=20search=20pagination?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use nb_results instead of page/page_size argument for /search API. 
From f2106dd88098a803645233956f74081e1bbcab42 Mon Sep 17 00:00:00 2001
From: Fabre Florian
Date: Fri, 14 Nov 2025 13:31:09 +0100
Subject: [PATCH 22/22] =?UTF-8?q?=E2=9C=A8(backend)=20adapt=20to=20Find=20?=
 =?UTF-8?q?new=20search=20pagination?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use nb_results instead of the page/page_size arguments for the /search API.

Signed-off-by: Fabre Florian
---
 docs/env.md                                   |  1 +
 docs/search.md                                |  2 +
 src/backend/core/api/viewsets.py              |  2 -
 src/backend/core/services/search_indexers.py  | 14 +++----
 src/backend/core/tasks/search.py              |  8 ++--
 .../documents/test_api_documents_search.py    |  3 +-
 .../tests/test_services_search_indexers.py    | 42 ++++++++++++++++++-
 src/backend/impress/settings.py               |  3 ++
 8 files changed, 56 insertions(+), 19 deletions(-)

diff --git a/docs/env.md b/docs/env.md
index 6ccab5e578..cc09a7142f 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -99,6 +99,7 @@ These are the environment variables you can set for the `impress-backend` contai
 | SEARCH_INDEXER_URL | Find application endpoint for indexation | |
 | SEARCH_INDEXER_SECRET | Token for indexation queries | |
 | SEARCH_INDEXER_QUERY_URL | Find application endpoint for search | |
+| SEARCH_INDEXER_QUERY_LIMIT | Maximum number of results expected from search endpoint | 50 |
 | SENTRY_DSN | Sentry host | |
 | SESSION_COOKIE_AGE | duration of the cookie session | 60*60*12 |
 | SPECTACULAR_SETTINGS_ENABLE_DJANGO_DEPLOY_CHECK | | false |
diff --git a/docs/search.md b/docs/search.md
index 635810812b..416f972bd3 100644
--- a/docs/search.md
+++ b/docs/search.md
@@ -27,6 +27,8 @@ SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"

 # Search endpoint. Uses the OIDC token for authentication
 SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"
+# Maximum number of results expected from the search endpoint
+SEARCH_INDEXER_QUERY_LIMIT=50
 ```

 We also need to enable the **OIDC Token** refresh or the authentication will fail quickly.
diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py
index f2988dca55..0a9d16aaa9 100644
--- a/src/backend/core/api/viewsets.py
+++ b/src/backend/core/api/viewsets.py
@@ -1105,8 +1105,6 @@ def _search_fulltext(self, indexer, request, params):
             text=text,
             token=access_token,
             visited=get_visited_document_ids_of(queryset, user),
-            page=1,
-            page_size=100,
         )

         docs_by_uuid = {str(d.pk): d for d in queryset.filter(pk__in=results)}
diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py
index 424897c920..a4bb9eec6b 100644
--- a/src/backend/core/services/search_indexers.py
+++ b/src/backend/core/services/search_indexers.py
@@ -110,6 +110,7 @@ def __init__(self):
         self.indexer_url = settings.SEARCH_INDEXER_URL
         self.indexer_secret = settings.SEARCH_INDEXER_SECRET
         self.search_url = settings.SEARCH_INDEXER_QUERY_URL
+        self.search_limit = settings.SEARCH_INDEXER_QUERY_LIMIT

         if not self.indexer_url:
             raise ImproperlyConfigured(
@@ -184,7 +185,7 @@ def push(self, data):
         """

     # pylint: disable-next=too-many-arguments,too-many-positional-arguments
-    def search(self, text, token, visited=(), page=1, page_size=50):
+    def search(self, text, token, visited=(), nb_results=None):
         """
         Search for documents in Find app.
         Ensure the same default ordering as "Docs" list : -updated_at
@@ -197,20 +198,17 @@ def search(self, text, token, visited=(), page=1, page_size=50):
             visited (list, optional):
                 List of ids of active public documents with LinkTrace.
                 Defaults to an empty tuple.
-            page (int, optional):
-                The page number to retrieve.
-                Defaults to 1 if not specified.
-            page_size (int, optional):
-                The number of results to return per page.
+            nb_results (int, optional):
+                The number of results to return.
+                Defaults to settings.SEARCH_INDEXER_QUERY_LIMIT (50 unless overridden).
""" + nb_results = nb_results or self.search_limit response = self.search_query( data={ "q": text, "visited": visited, "services": ["docs"], - "page_number": page, - "page_size": page_size, + "nb_results": nb_results, "order_by": "updated_at", "order_direction": "desc", }, diff --git a/src/backend/core/tasks/search.py b/src/backend/core/tasks/search.py index ebdf6c15ed..4b30c6a7de 100644 --- a/src/backend/core/tasks/search.py +++ b/src/backend/core/tasks/search.py @@ -23,11 +23,9 @@ def document_indexer_task(document_id): """Celery Task : Sends indexation query for a document.""" indexer = get_document_indexer() - if indexer is None: - return - - logger.info("Start document %s indexation", document_id) - indexer.index(models.Document.objects.filter(pk=document_id)) + if indexer: + logger.info("Start document %s indexation", document_id) + indexer.index(models.Document.objects.filter(pk=document_id)) def batch_indexer_throttle_acquire(timeout: int = 0, atomic: bool = True): diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py index 3476617822..c6d0d8e3ac 100644 --- a/src/backend/core/tests/documents/test_api_documents_search.py +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -326,8 +326,7 @@ def test_api_documents_search_pagination( "q": "alpha", "visited": [], "services": ["docs"], - "page_number": 1, - "page_size": 100, + "nb_results": 50, "order_by": "updated_at", "order_direction": "desc", } diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py index 0a6daf4045..61488a921b 100644 --- a/src/backend/core/tests/test_services_search_indexers.py +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -588,10 +588,48 @@ def test_services_search_indexers_search(mock_post, indexer_settings): assert query_data["q"] == "alpha" assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)]) assert query_data["services"] == ["docs"] - assert query_data["page_number"] == 1 - assert query_data["page_size"] == 50 + assert query_data["nb_results"] == 50 assert query_data["order_by"] == "updated_at" assert query_data["order_direction"] == "desc" assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"} assert kwargs.get("timeout") == 10 + + +@patch("requests.post") +def test_services_search_indexers_search_nb_results(mock_post, indexer_settings): + """ + Find API call should have nb_results == SEARCH_INDEXER_QUERY_LIMIT + or the given nb_results argument. 
+ """ + indexer_settings.SEARCH_INDEXER_QUERY_LIMIT = 25 + + user = factories.UserFactory() + indexer = SearchIndexer() + + mock_response = mock_post.return_value + mock_response.raise_for_status.return_value = None # No error + + doc1, doc2, _ = factories.DocumentFactory.create_batch(3) + + create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False) + + create_link(document=doc1) + create_link(document=doc2) + + visited = get_visited_document_ids_of(models.Document.objects.all(), user) + + indexer.search("alpha", visited=visited, token="mytoken") + + args, kwargs = mock_post.call_args + + assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL + assert kwargs.get("json")["nb_results"] == 25 + + # The argument overrides the setting value + indexer.search("alpha", visited=visited, token="mytoken", nb_results=109) + + args, kwargs = mock_post.call_args + + assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL + assert kwargs.get("json")["nb_results"] == 109 diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index fdc81c6ceb..4e3c55c87a 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -120,6 +120,9 @@ class Base(Configuration): SEARCH_INDEXER_QUERY_URL = values.Value( default=None, environ_name="SEARCH_INDEXER_QUERY_URL", environ_prefix=None ) + SEARCH_INDEXER_QUERY_LIMIT = values.PositiveIntegerValue( + default=50, environ_name="SEARCH_INDEXER_QUERY_LIMIT", environ_prefix=None + ) # Static files (CSS, JavaScript, Images) STATIC_URL = "/static/"
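After this last patch, the query body sent to the Find /search endpoint looks as follows, as exercised by the tests above (the values shown are the illustrative defaults from those tests):

```python
# Query payload for the Find /search endpoint after switching to nb_results.
payload = {
    "q": "alpha",
    "visited": [],               # ids of public documents known via LinkTrace
    "services": ["docs"],
    "nb_results": 50,            # settings.SEARCH_INDEXER_QUERY_LIMIT, or the
                                 # explicit nb_results argument when given
    "order_by": "updated_at",
    "order_direction": "desc",
}
```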