diff --git a/.gitignore b/.gitignore
index 6eea8d7e7c..16d803b568 100644
--- a/.gitignore
+++ b/.gitignore
@@ -43,6 +43,10 @@ venv.bak/
env.d/development/*.local
env.d/terraform
+# Docker
+compose.override.yml
+docker/auth/*.local
+
# npm
node_modules
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a43001980c..55587ab44d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -97,6 +97,9 @@ and this project adheres to
- ♿ update labels and shared document icon accessibility #1442
- 🍱(frontend) Fonts GDPR compliants #1453
- ♻️(service-worker) improve SW registration and update handling #1473
+- ✨(backend) add async indexation of documents on save (or access save) #1276
+- ✨(backend) add debounce mechanism to limit indexation jobs #1276
+- ✨(api) add API route to search for indexed documents in Find #1276
### Fixed
diff --git a/Makefile b/Makefile
index 2655167912..c0ea3b59fd 100644
--- a/Makefile
+++ b/Makefile
@@ -247,6 +247,10 @@ demo: ## flush db then create a demo for load testing purpose
@$(MANAGE) create_demo
.PHONY: demo
+index: ## index all documents to remote search
+ @$(MANAGE) index
+.PHONY: index
+
# Nota bene: Black should come after isort just in case they don't agree...
lint: ## lint back-end python sources
lint: \
diff --git a/bin/fernetkey b/bin/fernetkey
new file mode 100755
index 0000000000..8bbac1096e
--- /dev/null
+++ b/bin/fernetkey
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+# shellcheck source=bin/_config.sh
+source "$(dirname "${BASH_SOURCE[0]}")/_config.sh"
+
+_dc_run app-dev python -c 'from cryptography.fernet import Fernet;import sys; sys.stdout.write("\n" + Fernet.generate_key().decode() + "\n");'
diff --git a/compose.yml b/compose.yml
index a774f11e07..cb5905c330 100644
--- a/compose.yml
+++ b/compose.yml
@@ -72,6 +72,11 @@ services:
- env.d/development/postgresql.local
ports:
- "8071:8000"
+ networks:
+ default: {}
+ lasuite:
+ aliases:
+ - impress
volumes:
- ./src/backend:/app
- ./data/static:/data/static
@@ -92,6 +97,9 @@ services:
command: ["celery", "-A", "impress.celery_app", "worker", "-l", "DEBUG"]
environment:
- DJANGO_CONFIGURATION=Development
+ networks:
+ - default
+ - lasuite
env_file:
- env.d/development/common
- env.d/development/common.local
@@ -107,6 +115,11 @@ services:
image: nginx:1.25
ports:
- "8083:8083"
+ networks:
+ default: {}
+ lasuite:
+ aliases:
+ - nginx
volumes:
- ./docker/files/etc/nginx/conf.d:/etc/nginx/conf.d:ro
depends_on:
@@ -217,3 +230,8 @@ services:
kc_postgresql:
condition: service_healthy
restart: true
+
+networks:
+ lasuite:
+ name: lasuite-network
+ driver: bridge
diff --git a/docs/architecture.md b/docs/architecture.md
index 230d32458b..f858eb01d3 100644
--- a/docs/architecture.md
+++ b/docs/architecture.md
@@ -12,6 +12,7 @@ flowchart TD
Back --> DB("Database (PostgreSQL)")
Back <--> Celery --> DB
Back ----> S3("Minio (S3)")
+ Back -- REST API --> Find
```
### Architecture decision records
diff --git a/docs/env.md b/docs/env.md
index 0b3f9b3bf6..cc09a7142f 100644
--- a/docs/env.md
+++ b/docs/env.md
@@ -93,6 +93,13 @@ These are the environment variables you can set for the `impress-backend` contai
| OIDC_USERINFO_SHORTNAME_FIELD | OIDC token claims to create shortname | first_name |
| POSTHOG_KEY | Posthog key for analytics | |
| REDIS_URL | Cache url | redis://redis:6379/1 |
+| SEARCH_INDEXER_CLASS | Class of the backend for document indexation & search | |
+| SEARCH_INDEXER_BATCH_SIZE | Size of each batch for indexation of all documents | 100000 |
+| SEARCH_INDEXER_COUNTDOWN | Minimum debounce delay of indexation jobs (in seconds) | 1 |
+| SEARCH_INDEXER_URL | Find application endpoint for indexation | |
+| SEARCH_INDEXER_SECRET | Token for indexation queries | |
+| SEARCH_INDEXER_QUERY_URL | Find application endpoint for search | |
+| SEARCH_INDEXER_QUERY_LIMIT | Maximum number of results expected from search endpoint | 50 |
| SENTRY_DSN | Sentry host | |
| SESSION_COOKIE_AGE | duration of the cookie session | 60*60*12 |
| SPECTACULAR_SETTINGS_ENABLE_DJANGO_DEPLOY_CHECK | | false |
diff --git a/docs/search.md b/docs/search.md
new file mode 100644
index 0000000000..416f972bd3
--- /dev/null
+++ b/docs/search.md
@@ -0,0 +1,41 @@
+# Setup the Find search for Impress
+
+This configuration enables the fulltext search feature for Docs:
+- Each save on **core.Document** or **core.DocumentAccess** triggers the indexer.
+- The `api/v1.0/documents/search/` route acts as a proxy to the Find API for fulltext search.
+
+## Create an index service for Docs
+
+Configure a **Service** for the Docs application with these settings:
+
+- **Name**: `docs`
+  _request.auth.name of the Docs application._
+- **Client id**: `impress`
+  _Name of the token audience or client_id of the Docs application._
+
+See [how-to-use-indexer.md](how-to-use-indexer.md) for details.
+
+## Configure settings of Docs
+
+Add these Django settings to the Docs application to enable the feature:
+
+```shell
+SEARCH_INDEXER_CLASS="core.services.search_indexers.FindDocumentIndexer"
+SEARCH_INDEXER_COUNTDOWN=10 # Debounce delay in seconds for the indexer calls.
+
+# The token from the "docs" service of the Find application (development).
+SEARCH_INDEXER_SECRET="find-api-key-for-docs-with-exactly-50-chars-length"
+SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"
+
+# Search endpoint. Uses the OIDC token for authentication
+SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"
+# Maximum number of results expected from the search endpoint
+SEARCH_INDEXER_QUERY_LIMIT=50
+```
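+
+Documents are only pushed to Find when they are saved, so content created before the
+indexer was enabled is not indexed yet. To backfill the index, run the `index`
+management command (also exposed as a Make target). A minimal sketch, assuming the
+backend container is running:
+
+```shell
+# Index all existing documents to the remote search service
+make index
+# or, from within the backend container, with an explicit batch size
+python manage.py index --batch-size 1000
+```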
+
+We also need to enable **OIDC token** refresh, otherwise authentication will fail quickly.
+
+```shell
+# Store OIDC tokens in the session
+OIDC_STORE_ACCESS_TOKEN=True # Store the access token in the session
+OIDC_STORE_REFRESH_TOKEN=True # Store the encrypted refresh token in the session
+OIDC_STORE_REFRESH_TOKEN_KEY="your-32-byte-encryption-key==" # Must be a valid Fernet key (32 url-safe base64-encoded bytes)
+```
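+
+Once this is in place, you can exercise the search route directly. A quick check with
+`curl`, assuming a logged-in session cookie (the endpoint authenticates through the
+OIDC session) and the backend exposed on `localhost:8071` as in the development
+compose file:
+
+```shell
+# Fulltext search through the Find proxy; `q` is required, `page` and `page_size` are optional
+curl --cookie "sessionid=<your-session-cookie>" \
+  "http://localhost:8071/api/v1.0/documents/search/?q=alpha&page=1&page_size=20"
+```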
diff --git a/docs/system-requirements.md b/docs/system-requirements.md
index 8f2daafb2c..db337d9b23 100644
--- a/docs/system-requirements.md
+++ b/docs/system-requirements.md
@@ -97,6 +97,17 @@ Production deployments differ significantly from development environments. The t
| 5433 | PostgreSQL (Keycloak) |
| 1081 | MailCatcher |
+**With fulltext search service**
+
+| Port | Service |
+| --------- | --------------------- |
+| 8081 | Find (Django) |
+| 9200 | Opensearch |
+| 9600 | Opensearch admin |
+| 5601 | Opensearch dashboard |
+| 25432 | PostgreSQL (Find) |
+
+
## 6. Sizing Guidelines
**RAM** – start at 8 GB dev / 16 GB staging / 32 GB prod. Postgres and Keycloak are the first to OOM; scale them first.
diff --git a/env.d/development/common b/env.d/development/common
index de857d5b2a..cdd734b55f 100644
--- a/env.d/development/common
+++ b/env.d/development/common
@@ -36,6 +36,7 @@ OIDC_OP_JWKS_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/c
OIDC_OP_AUTHORIZATION_ENDPOINT=http://localhost:8083/realms/impress/protocol/openid-connect/auth
OIDC_OP_TOKEN_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/token
OIDC_OP_USER_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/userinfo
+OIDC_OP_INTROSPECTION_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/token/introspect
OIDC_RP_CLIENT_ID=impress
OIDC_RP_CLIENT_SECRET=ThisIsAnExampleKeyForDevPurposeOnly
@@ -49,6 +50,14 @@ LOGOUT_REDIRECT_URL=http://localhost:3000
OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"]
OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"}
+# Store OIDC tokens in the session
+OIDC_STORE_ACCESS_TOKEN=True
+# Store the encrypted refresh token in the session.
+OIDC_STORE_REFRESH_TOKEN=True
+
+# Must be a valid Fernet key (32 url-safe base64-encoded bytes)
+# To create one, use the bin/fernetkey command.
+# OIDC_STORE_REFRESH_TOKEN_KEY="your-32-byte-encryption-key=="
+
# AI
AI_FEATURE_ENABLED=true
AI_BASE_URL=https://openaiendpoint.com
@@ -68,4 +77,10 @@ Y_PROVIDER_API_BASE_URL=http://y-provider-development:4444/api/
Y_PROVIDER_API_KEY=yprovider-api-key
# Theme customization
-THEME_CUSTOMIZATION_CACHE_TIMEOUT=15
\ No newline at end of file
+THEME_CUSTOMIZATION_CACHE_TIMEOUT=15
+
+# Indexer (disabled)
+# SEARCH_INDEXER_CLASS="core.services.search_indexers.SearchIndexer"
+# Key generated by create_demo in the Find app.
+SEARCH_INDEXER_SECRET=find-api-key-for-docs-with-exactly-50-chars-length
+SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"
+SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"
diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py
index 81b26d5e80..45ec34c68e 100644
--- a/src/backend/core/api/serializers.py
+++ b/src/backend/core/api/serializers.py
@@ -889,3 +889,13 @@ class MoveDocumentSerializer(serializers.Serializer):
choices=enums.MoveNodePositionChoices.choices,
default=enums.MoveNodePositionChoices.LAST_CHILD,
)
+
+
+class SearchDocumentSerializer(serializers.Serializer):
+ """Serializer for fulltext search requests through Find application"""
+
+ q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True)
+ page_size = serializers.IntegerField(
+ required=False, min_value=1, max_value=50, default=20
+ )
+ page = serializers.IntegerField(required=False, min_value=1, default=1)
diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py
index 84402ceaae..0a9d16aaa9 100644
--- a/src/backend/core/api/viewsets.py
+++ b/src/backend/core/api/viewsets.py
@@ -21,6 +21,7 @@
from django.db.models.functions import Left, Length
from django.http import Http404, StreamingHttpResponse
from django.urls import reverse
+from django.utils.decorators import method_decorator
from django.utils.functional import cached_property
from django.utils.text import capfirst, slugify
from django.utils.translation import gettext_lazy as _
@@ -31,6 +32,7 @@
from csp.constants import NONE
from csp.decorators import csp_update
from lasuite.malware_detection import malware_detection
+from lasuite.oidc_login.decorators import refresh_oidc_access_token
from rest_framework import filters, status, viewsets
from rest_framework import response as drf_response
from rest_framework.permissions import AllowAny
@@ -47,6 +49,10 @@
from core.services.converter_services import (
YdocConverter,
)
+from core.services.search_indexers import (
+ get_document_indexer,
+ get_visited_document_ids_of,
+)
from core.tasks.mail import send_ask_for_access_mail
from core.utils import extract_attachments, filter_descendants
@@ -373,6 +379,7 @@ class DocumentViewSet(
list_serializer_class = serializers.ListDocumentSerializer
trashbin_serializer_class = serializers.ListDocumentSerializer
tree_serializer_class = serializers.ListDocumentSerializer
+ search_serializer_class = serializers.ListDocumentSerializer
def get_queryset(self):
"""Get queryset performing all annotation and filtering on the document tree structure."""
@@ -1064,6 +1071,83 @@ def duplicate(self, request, *args, **kwargs):
{"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED
)
+ def _search_simple(self, request, text):
+ """
+        Returns a paginated response of documents filtered by title.
+ """
+        # As in the 'list' view, we get a prefiltered queryset (deleted docs are excluded)
+ queryset = self.get_queryset()
+ filterset = DocumentFilter({"title": text}, queryset=queryset)
+
+ if not filterset.is_valid():
+ raise drf.exceptions.ValidationError(filterset.errors)
+
+ queryset = filterset.filter_queryset(queryset)
+
+ return self.get_response_for_queryset(
+ queryset.order_by("-updated_at"),
+ context={
+ "request": request,
+ },
+ )
+
+ def _search_fulltext(self, indexer, request, params):
+ """
+        Returns a paginated response built from the Find fulltext search results.
+ """
+ access_token = request.session.get("oidc_access_token")
+ user = request.user
+ text = params.validated_data["q"]
+ queryset = models.Document.objects.all()
+
+ # Retrieve the documents ids from Find.
+ results = indexer.search(
+ text=text,
+ token=access_token,
+ visited=get_visited_document_ids_of(queryset, user),
+ )
+
+ docs_by_uuid = {str(d.pk): d for d in queryset.filter(pk__in=results)}
+ ordered_docs = [docs_by_uuid[id] for id in results]
+
+ page = self.paginate_queryset(ordered_docs)
+
+ serializer = self.get_serializer(
+ page if page else ordered_docs,
+ many=True,
+ context={
+ "request": request,
+ },
+ )
+
+ return self.get_paginated_response(serializer.data)
+
+ @drf.decorators.action(detail=False, methods=["get"], url_path="search")
+ @method_decorator(refresh_oidc_access_token)
+ def search(self, request, *args, **kwargs):
+ """
+ Returns a DRF response containing the filtered, annotated and ordered document list.
+
+ Applies filtering based on request parameter 'q' from `SearchDocumentSerializer`.
+        Depending on the configuration it can be:
+ - A fulltext search through the opensearch indexation app "find" if the backend is
+ enabled (see SEARCH_INDEXER_CLASS)
+ - A filtering by the model field 'title'.
+
+ The ordering is always by the most recent first.
+ """
+ params = serializers.SearchDocumentSerializer(data=request.query_params)
+ params.is_valid(raise_exception=True)
+
+ indexer = get_document_indexer()
+
+ if indexer:
+ return self._search_fulltext(indexer, request, params=params)
+
+        # The indexer is not configured, so we fall back to a simple icontains filter
+        # on the model field 'title'.
+ return self._search_simple(request, text=params.validated_data["q"])
+
@drf.decorators.action(detail=True, methods=["get"], url_path="versions")
def versions_list(self, request, *args, **kwargs):
"""
diff --git a/src/backend/core/apps.py b/src/backend/core/apps.py
index bba7de0f42..b8ce648834 100644
--- a/src/backend/core/apps.py
+++ b/src/backend/core/apps.py
@@ -1,11 +1,19 @@
"""Impress Core application"""
-# from django.apps import AppConfig
-# from django.utils.translation import gettext_lazy as _
+from django.apps import AppConfig
+from django.utils.translation import gettext_lazy as _
-# class CoreConfig(AppConfig):
-# """Configuration class for the impress core app."""
-# name = "core"
-# app_label = "core"
-# verbose_name = _("impress core application")
+class CoreConfig(AppConfig):
+ """Configuration class for the impress core app."""
+
+ name = "core"
+ app_label = "core"
+ verbose_name = _("Impress core application")
+
+ def ready(self):
+ """
+ Import signals when the app is ready.
+ """
+ # pylint: disable=import-outside-toplevel, unused-import
+ from . import signals # noqa: PLC0415
diff --git a/src/backend/core/management/commands/index.py b/src/backend/core/management/commands/index.py
new file mode 100644
index 0000000000..af046e0608
--- /dev/null
+++ b/src/backend/core/management/commands/index.py
@@ -0,0 +1,52 @@
+"""
+Handle search setup that needs to be done at bootstrap time.
+"""
+
+import logging
+import time
+
+from django.core.management.base import BaseCommand, CommandError
+
+from core.services.search_indexers import get_document_indexer
+
+logger = logging.getLogger("docs.search.bootstrap_search")
+
+
+class Command(BaseCommand):
+ """Index all documents to remote search service"""
+
+ help = __doc__
+
+ def add_arguments(self, parser):
+ """Add argument to require forcing execution when not in debug mode."""
+ parser.add_argument(
+ "--batch-size",
+ action="store",
+ dest="batch_size",
+ type=int,
+ default=50,
+ help="Indexation query batch size",
+ )
+
+ def handle(self, *args, **options):
+ """Launch and log search index generation."""
+ indexer = get_document_indexer()
+
+ if not indexer:
+ raise CommandError("The indexer is not enabled or properly configured.")
+
+ logger.info("Starting to regenerate Find index...")
+ start = time.perf_counter()
+ batch_size = options["batch_size"]
+
+ try:
+ count = indexer.index(batch_size=batch_size)
+ except Exception as err:
+ raise CommandError("Unable to regenerate index") from err
+
+ duration = time.perf_counter() - start
+ logger.info(
+ "Search index regenerated from %d document(s) in %.2f seconds.",
+ count,
+ duration,
+ )
diff --git a/src/backend/core/models.py b/src/backend/core/models.py
index 6e0ad69e4f..e7994d9212 100644
--- a/src/backend/core/models.py
+++ b/src/backend/core/models.py
@@ -432,32 +432,35 @@ def __init__(self, *args, **kwargs):
def save(self, *args, **kwargs):
"""Write content to object storage only if _content has changed."""
super().save(*args, **kwargs)
-
if self._content:
- file_key = self.file_key
- bytes_content = self._content.encode("utf-8")
+ self.save_content(self._content)
- # Attempt to directly check if the object exists using the storage client.
- try:
- response = default_storage.connection.meta.client.head_object(
- Bucket=default_storage.bucket_name, Key=file_key
- )
- except ClientError as excpt:
- # If the error is a 404, the object doesn't exist, so we should create it.
- if excpt.response["Error"]["Code"] == "404":
- has_changed = True
- else:
- raise
+ def save_content(self, content):
+ """Save content to object storage."""
+
+ file_key = self.file_key
+ bytes_content = content.encode("utf-8")
+
+ # Attempt to directly check if the object exists using the storage client.
+ try:
+ response = default_storage.connection.meta.client.head_object(
+ Bucket=default_storage.bucket_name, Key=file_key
+ )
+ except ClientError as excpt:
+ # If the error is a 404, the object doesn't exist, so we should create it.
+ if excpt.response["Error"]["Code"] == "404":
+ has_changed = True
else:
- # Compare the existing ETag with the MD5 hash of the new content.
- has_changed = (
- response["ETag"].strip('"')
- != hashlib.md5(bytes_content).hexdigest() # noqa: S324
- )
+ raise
+ else:
+ # Compare the existing ETag with the MD5 hash of the new content.
+ has_changed = (
+ response["ETag"].strip('"') != hashlib.md5(bytes_content).hexdigest() # noqa: S324
+ )
- if has_changed:
- content_file = ContentFile(bytes_content)
- default_storage.save(file_key, content_file)
+ if has_changed:
+ content_file = ContentFile(bytes_content)
+ default_storage.save(file_key, content_file)
def is_leaf(self):
"""
@@ -901,7 +904,8 @@ def soft_delete(self):
# Mark all descendants as soft deleted
self.get_descendants().filter(ancestors_deleted_at__isnull=True).update(
- ancestors_deleted_at=self.ancestors_deleted_at
+ ancestors_deleted_at=self.ancestors_deleted_at,
+ updated_at=self.updated_at,
)
@transaction.atomic
diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py
new file mode 100644
index 0000000000..a4bb9eec6b
--- /dev/null
+++ b/src/backend/core/services/search_indexers.py
@@ -0,0 +1,298 @@
+"""Document search index management utilities and indexers"""
+
+import logging
+from abc import ABC, abstractmethod
+from collections import defaultdict
+from functools import cache
+
+from django.conf import settings
+from django.contrib.auth.models import AnonymousUser
+from django.core.exceptions import ImproperlyConfigured
+from django.db.models import Subquery
+from django.utils.module_loading import import_string
+
+import requests
+
+from core import models, utils
+
+logger = logging.getLogger(__name__)
+
+
+@cache
+def get_document_indexer():
+ """Returns an instance of indexer service if enabled and properly configured."""
+ classpath = settings.SEARCH_INDEXER_CLASS
+
+    # For this use case, an empty indexer class is not an issue but a feature.
+ if not classpath:
+ logger.info("Document indexer is not configured (see SEARCH_INDEXER_CLASS)")
+ return None
+
+ try:
+ indexer_class = import_string(settings.SEARCH_INDEXER_CLASS)
+ return indexer_class()
+ except ImportError as err:
+        logger.error("SEARCH_INDEXER_CLASS setting is not valid: %s", err)
+ except ImproperlyConfigured as err:
+        logger.error("Document indexer is not properly configured: %s", err)
+
+ return None
+
+
+def get_batch_accesses_by_users_and_teams(paths):
+ """
+ Get accesses related to a list of document paths,
+ grouped by users and teams, including all ancestor paths.
+ """
+ ancestor_map = utils.get_ancestor_to_descendants_map(
+ paths, steplen=models.Document.steplen
+ )
+ ancestor_paths = list(ancestor_map.keys())
+
+ access_qs = models.DocumentAccess.objects.filter(
+ document__path__in=ancestor_paths
+ ).values("document__path", "user__sub", "team")
+
+ access_by_document_path = defaultdict(lambda: {"users": set(), "teams": set()})
+
+ for access in access_qs:
+ ancestor_path = access["document__path"]
+ user_sub = access["user__sub"]
+ team = access["team"]
+
+ for descendant_path in ancestor_map.get(ancestor_path, []):
+ if user_sub:
+ access_by_document_path[descendant_path]["users"].add(str(user_sub))
+ if team:
+ access_by_document_path[descendant_path]["teams"].add(team)
+
+ return dict(access_by_document_path)
+
+
+def get_visited_document_ids_of(queryset, user):
+ """
+    Returns the ids of the documents that have a link trace for the user but are NOT owned by them.
+    It is used to limit the OpenSearch responses to the public documents already
+ "visited" by the user.
+ """
+ if isinstance(user, AnonymousUser):
+ return []
+
+ qs = models.LinkTrace.objects.filter(user=user)
+
+ docs = (
+ queryset.exclude(accesses__user=user)
+ .filter(
+ deleted_at__isnull=True,
+ ancestors_deleted_at__isnull=True,
+ )
+ .filter(pk__in=Subquery(qs.values("document_id")))
+ .order_by("pk")
+ .distinct("pk")
+ )
+
+ return [str(id) for id in docs.values_list("pk", flat=True)]
+
+
+class BaseDocumentIndexer(ABC):
+ """
+ Base class for document indexers.
+
+ Handles batching and access resolution. Subclasses must implement both
+ `serialize_document()` and `push()` to define backend-specific behavior.
+ """
+
+ def __init__(self):
+ """
+ Initialize the indexer.
+ """
+ self.batch_size = settings.SEARCH_INDEXER_BATCH_SIZE
+ self.indexer_url = settings.SEARCH_INDEXER_URL
+ self.indexer_secret = settings.SEARCH_INDEXER_SECRET
+ self.search_url = settings.SEARCH_INDEXER_QUERY_URL
+ self.search_limit = settings.SEARCH_INDEXER_QUERY_LIMIT
+
+ if not self.indexer_url:
+ raise ImproperlyConfigured(
+ "SEARCH_INDEXER_URL must be set in Django settings."
+ )
+
+ if not self.indexer_secret:
+ raise ImproperlyConfigured(
+ "SEARCH_INDEXER_SECRET must be set in Django settings."
+ )
+
+ if not self.search_url:
+ raise ImproperlyConfigured(
+ "SEARCH_INDEXER_QUERY_URL must be set in Django settings."
+ )
+
+ def index(self, queryset=None, batch_size=None):
+ """
+ Fetch documents in batches, serialize them, and push to the search backend.
+
+ Args:
+ queryset (optional): Document queryset
+ Defaults to all documents without filter.
+ batch_size (int, optional): Number of documents per batch.
+ Defaults to settings.SEARCH_INDEXER_BATCH_SIZE.
+ """
+ last_id = 0
+ count = 0
+ queryset = queryset or models.Document.objects.all()
+ batch_size = batch_size or self.batch_size
+
+ while True:
+ documents_batch = list(
+ queryset.filter(
+ id__gt=last_id,
+ ).order_by("id")[:batch_size]
+ )
+
+ if not documents_batch:
+ break
+
+ doc_paths = [doc.path for doc in documents_batch]
+ last_id = documents_batch[-1].id
+ accesses_by_document_path = get_batch_accesses_by_users_and_teams(doc_paths)
+
+ serialized_batch = [
+ self.serialize_document(document, accesses_by_document_path)
+ for document in documents_batch
+ if document.content or document.title
+ ]
+
+ if serialized_batch:
+ self.push(serialized_batch)
+ count += len(serialized_batch)
+
+ return count
+
+ @abstractmethod
+ def serialize_document(self, document, accesses):
+ """
+ Convert a Document instance to a JSON-serializable format for indexing.
+
+ Must be implemented by subclasses.
+ """
+
+ @abstractmethod
+ def push(self, data):
+ """
+ Push a batch of serialized documents to the backend.
+
+ Must be implemented by subclasses.
+ """
+
+ # pylint: disable-next=too-many-arguments,too-many-positional-arguments
+ def search(self, text, token, visited=(), nb_results=None):
+ """
+ Search for documents in Find app.
+        Ensures the same default ordering as the "Docs" list: -updated_at.
+
+ Returns ids of the documents
+
+ Args:
+ text (str): Text search content.
+ token (str): OIDC Authentication token.
+ visited (list, optional):
+                List of ids of active public documents with a LinkTrace.
+                Defaults to an empty tuple.
+ nb_results (int, optional):
+ The number of results to return.
+                Defaults to settings.SEARCH_INDEXER_QUERY_LIMIT (50 by default).
+ """
+ nb_results = nb_results or self.search_limit
+ response = self.search_query(
+ data={
+ "q": text,
+ "visited": visited,
+ "services": ["docs"],
+ "nb_results": nb_results,
+ "order_by": "updated_at",
+ "order_direction": "desc",
+ },
+ token=token,
+ )
+
+ return [d["_id"] for d in response]
+
+ @abstractmethod
+ def search_query(self, data, token) -> dict:
+ """
+ Retrieve documents from the Find app API.
+
+ Must be implemented by subclasses.
+ """
+
+
+class SearchIndexer(BaseDocumentIndexer):
+ """
+ Document indexer that pushes documents to La Suite Find app.
+ """
+
+ def serialize_document(self, document, accesses):
+ """
+ Convert a Document to the JSON format expected by La Suite Find.
+
+ Args:
+ document (Document): The document instance.
+ accesses (dict): Mapping of document ID to user/team access.
+
+ Returns:
+ dict: A JSON-serializable dictionary.
+ """
+ doc_path = document.path
+ doc_content = document.content
+ text_content = utils.base64_yjs_to_text(doc_content) if doc_content else ""
+
+ return {
+ "id": str(document.id),
+ "title": document.title or "",
+ "content": text_content,
+ "depth": document.depth,
+ "path": document.path,
+ "numchild": document.numchild,
+ "created_at": document.created_at.isoformat(),
+ "updated_at": document.updated_at.isoformat(),
+ "users": list(accesses.get(doc_path, {}).get("users", set())),
+ "groups": list(accesses.get(doc_path, {}).get("teams", set())),
+ "reach": document.computed_link_reach,
+ "size": len(text_content.encode("utf-8")),
+ "is_active": not bool(document.ancestors_deleted_at),
+ }
+
+    def search_query(self, data, token) -> dict:
+ """
+ Retrieve documents from the Find app API.
+
+ Args:
+ data (dict): search data
+ token (str): OICD token
+
+ Returns:
+            The decoded JSON response from the Find API.
+ """
+ response = requests.post(
+ self.search_url,
+ json=data,
+ headers={"Authorization": f"Bearer {token}"},
+ timeout=10,
+ )
+ response.raise_for_status()
+ return response.json()
+
+ def push(self, data):
+ """
+ Push a batch of documents to the Find backend.
+
+ Args:
+ data (list): List of document dictionaries.
+ """
+ response = requests.post(
+ self.indexer_url,
+ json=data,
+ headers={"Authorization": f"Bearer {self.indexer_secret}"},
+ timeout=10,
+ )
+ response.raise_for_status()
diff --git a/src/backend/core/signals.py b/src/backend/core/signals.py
new file mode 100644
index 0000000000..4bd9647798
--- /dev/null
+++ b/src/backend/core/signals.py
@@ -0,0 +1,33 @@
+"""
+Declare and configure the signals for the impress core application
+"""
+
+from functools import partial
+
+from django.db import transaction
+from django.db.models import signals
+from django.dispatch import receiver
+
+from . import models
+from .tasks.search import trigger_batch_document_indexer
+
+
+@receiver(signals.post_save, sender=models.Document)
+def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument
+ """
+ Asynchronous call to the document indexer at the end of the transaction.
+    Note: within the transaction the content may still be empty, which could lead to a
+    serialization error.
+ """
+ transaction.on_commit(partial(trigger_batch_document_indexer, instance))
+
+
+@receiver(signals.post_save, sender=models.DocumentAccess)
+def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument
+ """
+ Asynchronous call to the document indexer at the end of the transaction.
+ """
+ if not created:
+ transaction.on_commit(
+ partial(trigger_batch_document_indexer, instance.document)
+ )
diff --git a/src/backend/core/tasks/search.py b/src/backend/core/tasks/search.py
new file mode 100644
index 0000000000..4b30c6a7de
--- /dev/null
+++ b/src/backend/core/tasks/search.py
@@ -0,0 +1,95 @@
+"""Trigger document indexation using celery task."""
+
+from logging import getLogger
+
+from django.conf import settings
+from django.core.cache import cache
+from django.db.models import Q
+
+from django_redis.cache import RedisCache
+
+from core import models
+from core.services.search_indexers import (
+ get_document_indexer,
+)
+
+from impress.celery_app import app
+
+logger = getLogger(__name__)
+
+
+@app.task
+def document_indexer_task(document_id):
+ """Celery Task : Sends indexation query for a document."""
+ indexer = get_document_indexer()
+
+ if indexer:
+ logger.info("Start document %s indexation", document_id)
+ indexer.index(models.Document.objects.filter(pk=document_id))
+
+
+def batch_indexer_throttle_acquire(timeout: int = 0, atomic: bool = True):
+ """
+    Set the task throttle flag for a delay.
+    Uses a Redis lock if available to ensure atomic changes.
+ """
+ key = "document-batch-indexer-throttle"
+
+ # Redis is used as cache database (not in tests). Use the lock feature here
+ # to ensure atomicity of changes to the throttle flag.
+ if isinstance(cache, RedisCache) and atomic:
+        with cache.lock(key):
+ return batch_indexer_throttle_acquire(timeout, atomic=False)
+
+    # Use add() here:
+    # - sets the flag and returns True if it does not exist
+    # - does nothing and returns False if it exists
+ return cache.add(key, 1, timeout=timeout)
+
+
+@app.task
+def batch_document_indexer_task(timestamp):
+ """Celery Task : Sends indexation query for a batch of documents."""
+ indexer = get_document_indexer()
+
+ if indexer:
+ queryset = models.Document.objects.filter(
+ Q(updated_at__gte=timestamp)
+ | Q(deleted_at__gte=timestamp)
+ | Q(ancestors_deleted_at__gte=timestamp)
+ )
+
+ count = indexer.index(queryset)
+ logger.info("Indexed %d documents", count)
+
+
+def trigger_batch_document_indexer(item):
+ """
+    Trigger the indexation task with a debounce delay set by the SEARCH_INDEXER_COUNTDOWN setting.
+
+ Args:
+        item (Document): The document instance.
+ """
+ countdown = int(settings.SEARCH_INDEXER_COUNTDOWN)
+
+    # DO NOT create a task if indexation is disabled
+ if not settings.SEARCH_INDEXER_CLASS:
+ return
+
+ if countdown > 0:
+        # While a countdown is running the throttle flag is already set, so further
+        # calls are skipped and the batch indexation runs only once per countdown.
+ if batch_indexer_throttle_acquire(timeout=countdown):
+ logger.info(
+ "Add task for batch document indexation from updated_at=%s in %d seconds",
+ item.updated_at.isoformat(),
+ countdown,
+ )
+
+ batch_document_indexer_task.apply_async(
+ args=[item.updated_at], countdown=countdown
+ )
+ else:
+ logger.info("Skip task for batch document %s indexation", item.pk)
+ else:
+ document_indexer_task.apply(args=[item.pk])
diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py
new file mode 100644
index 0000000000..ad7d39e6e0
--- /dev/null
+++ b/src/backend/core/tests/commands/test_index.py
@@ -0,0 +1,65 @@
+"""
+Unit test for `index` command.
+"""
+
+from operator import itemgetter
+from unittest import mock
+
+from django.core.management import CommandError, call_command
+from django.db import transaction
+
+import pytest
+
+from core import factories
+from core.services.search_indexers import SearchIndexer
+
+
+@pytest.mark.django_db
+@pytest.mark.usefixtures("indexer_settings")
+def test_index():
+    """Test the command `index` that runs the Find app indexer for all the available documents."""
+ user = factories.UserFactory()
+ indexer = SearchIndexer()
+
+ with transaction.atomic():
+ doc = factories.DocumentFactory()
+ empty_doc = factories.DocumentFactory(title=None, content="")
+ no_title_doc = factories.DocumentFactory(title=None)
+
+ factories.UserDocumentAccessFactory(document=doc, user=user)
+ factories.UserDocumentAccessFactory(document=empty_doc, user=user)
+ factories.UserDocumentAccessFactory(document=no_title_doc, user=user)
+
+ accesses = {
+ str(doc.path): {"users": [user.sub]},
+ str(empty_doc.path): {"users": [user.sub]},
+ str(no_title_doc.path): {"users": [user.sub]},
+ }
+
+ with mock.patch.object(SearchIndexer, "push") as mock_push:
+ call_command("index")
+
+ push_call_args = [call.args[0] for call in mock_push.call_args_list]
+
+ # called once but with a batch of docs
+ mock_push.assert_called_once()
+
+ assert sorted(push_call_args[0], key=itemgetter("id")) == sorted(
+ [
+ indexer.serialize_document(doc, accesses),
+ indexer.serialize_document(no_title_doc, accesses),
+ ],
+ key=itemgetter("id"),
+ )
+
+
+@pytest.mark.django_db
+@pytest.mark.usefixtures("indexer_settings")
+def test_index_improperly_configured(indexer_settings):
+ """The command should raise an exception if the indexer is not configured"""
+ indexer_settings.SEARCH_INDEXER_CLASS = None
+
+ with pytest.raises(CommandError) as err:
+ call_command("index")
+
+ assert str(err.value) == "The indexer is not enabled or properly configured."
diff --git a/src/backend/core/tests/conftest.py b/src/backend/core/tests/conftest.py
index 00e830e18f..65e3926934 100644
--- a/src/backend/core/tests/conftest.py
+++ b/src/backend/core/tests/conftest.py
@@ -24,3 +24,30 @@ def mock_user_teams():
"core.models.User.teams", new_callable=mock.PropertyMock
) as mock_teams:
yield mock_teams
+
+
+@pytest.fixture(name="indexer_settings")
+def indexer_settings_fixture(settings):
+ """
+ Setup valid settings for the document indexer. Clear the indexer cache.
+ """
+
+ # pylint: disable-next=import-outside-toplevel
+ from core.services.search_indexers import ( # noqa: PLC0415
+ get_document_indexer,
+ )
+
+ get_document_indexer.cache_clear()
+
+ settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.SearchIndexer"
+ settings.SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest"
+ settings.SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/"
+ settings.SEARCH_INDEXER_QUERY_URL = (
+ "http://localhost:8081/api/v1.0/documents/search/"
+ )
+ settings.SEARCH_INDEXER_COUNTDOWN = 1
+
+ yield settings
+
+ # clear cache to prevent issues with other tests
+ get_document_indexer.cache_clear()
diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py
new file mode 100644
index 0000000000..c6d0d8e3ac
--- /dev/null
+++ b/src/backend/core/tests/documents/test_api_documents_search.py
@@ -0,0 +1,425 @@
+"""
+Tests for Documents API endpoint in impress's core app: list
+"""
+
+import random
+from json import loads as json_loads
+
+from django.test import RequestFactory
+
+import pytest
+import responses
+from faker import Faker
+from rest_framework.test import APIClient
+
+from core import factories, models
+from core.services.search_indexers import get_document_indexer
+
+fake = Faker()
+pytestmark = pytest.mark.django_db
+
+
+def build_search_url(**kwargs):
+ """Build absolute uri for search endpoint with ORDERED query arguments"""
+ return (
+ RequestFactory()
+ .get("/api/v1.0/documents/search/", dict(sorted(kwargs.items())))
+ .build_absolute_uri()
+ )
+
+
+@pytest.mark.parametrize("role", models.LinkRoleChoices.values)
+@pytest.mark.parametrize("reach", models.LinkReachChoices.values)
+@responses.activate
+def test_api_documents_search_anonymous(reach, role, indexer_settings):
+ """
+    Anonymous users searching for documents should get no results, whatever the
+    link reach and link role.
+ """
+ indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
+
+ factories.DocumentFactory(link_reach=reach, link_role=role)
+
+ # Find response
+ responses.add(
+ responses.POST,
+ "http://find/api/v1.0/search",
+ json=[],
+ status=200,
+ )
+
+ response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"})
+
+ assert response.status_code == 200
+ assert response.json() == {
+ "count": 0,
+ "next": None,
+ "previous": None,
+ "results": [],
+ }
+
+
+def test_api_documents_search_endpoint_is_none(indexer_settings):
+ """
+ Missing SEARCH_INDEXER_QUERY_URL, so the indexer is not properly configured.
+    Should fall back to the title filter.
+ """
+ indexer_settings.SEARCH_INDEXER_QUERY_URL = None
+
+ assert get_document_indexer() is None
+
+ user = factories.UserFactory()
+ document = factories.DocumentFactory(title="alpha")
+ access = factories.UserDocumentAccessFactory(document=document, user=user)
+
+ client = APIClient()
+ client.force_login(user)
+
+ response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})
+
+ assert response.status_code == 200
+ content = response.json()
+ results = content.pop("results")
+ assert content == {
+ "count": 1,
+ "next": None,
+ "previous": None,
+ }
+ assert len(results) == 1
+ assert results[0] == {
+ "id": str(document.id),
+ "abilities": document.get_abilities(user),
+ "ancestors_link_reach": None,
+ "ancestors_link_role": None,
+ "computed_link_reach": document.computed_link_reach,
+ "computed_link_role": document.computed_link_role,
+ "created_at": document.created_at.isoformat().replace("+00:00", "Z"),
+ "creator": str(document.creator.id),
+ "depth": 1,
+ "excerpt": document.excerpt,
+ "link_reach": document.link_reach,
+ "link_role": document.link_role,
+ "nb_accesses_ancestors": 1,
+ "nb_accesses_direct": 1,
+ "numchild": 0,
+ "path": document.path,
+ "title": document.title,
+ "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
+ "deleted_at": None,
+ "user_role": access.role,
+ }
+
+
+@responses.activate
+def test_api_documents_search_invalid_params(indexer_settings):
+    """Invalid query parameters should be rejected with a 400 error."""
+ indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
+
+ user = factories.UserFactory()
+
+ client = APIClient()
+ client.force_login(user)
+
+ response = client.get("/api/v1.0/documents/search/")
+
+ assert response.status_code == 400
+ assert response.json() == {"q": ["This field is required."]}
+
+ response = client.get("/api/v1.0/documents/search/", data={"q": " "})
+
+ assert response.status_code == 400
+ assert response.json() == {"q": ["This field may not be blank."]}
+
+ response = client.get(
+ "/api/v1.0/documents/search/", data={"q": "any", "page": "NaN"}
+ )
+
+ assert response.status_code == 400
+ assert response.json() == {"page": ["A valid integer is required."]}
+
+
+@responses.activate
+def test_api_documents_search_format(indexer_settings):
+ """Validate the format of documents as returned by the search view."""
+ indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
+
+ assert get_document_indexer() is not None
+
+ user = factories.UserFactory()
+
+ client = APIClient()
+ client.force_login(user)
+
+ user_a, user_b, user_c = factories.UserFactory.create_batch(3)
+ document = factories.DocumentFactory(
+ title="alpha",
+ users=(user_a, user_c),
+ link_traces=(user, user_b),
+ )
+ access = factories.UserDocumentAccessFactory(document=document, user=user)
+
+ # Find response
+ responses.add(
+ responses.POST,
+ "http://find/api/v1.0/search",
+ json=[
+ {"_id": str(document.pk)},
+ ],
+ status=200,
+ )
+ response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"})
+
+ assert response.status_code == 200
+ content = response.json()
+ results = content.pop("results")
+ assert content == {
+ "count": 1,
+ "next": None,
+ "previous": None,
+ }
+ assert len(results) == 1
+ assert results[0] == {
+ "id": str(document.id),
+ "abilities": document.get_abilities(user),
+ "ancestors_link_reach": None,
+ "ancestors_link_role": None,
+ "computed_link_reach": document.computed_link_reach,
+ "computed_link_role": document.computed_link_role,
+ "created_at": document.created_at.isoformat().replace("+00:00", "Z"),
+ "creator": str(document.creator.id),
+ "depth": 1,
+ "excerpt": document.excerpt,
+ "link_reach": document.link_reach,
+ "link_role": document.link_role,
+ "nb_accesses_ancestors": 3,
+ "nb_accesses_direct": 3,
+ "numchild": 0,
+ "path": document.path,
+ "title": document.title,
+ "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"),
+ "deleted_at": None,
+ "user_role": access.role,
+ }
+
+
+@responses.activate
+@pytest.mark.parametrize(
+ "pagination, status, expected",
+ (
+ (
+ {"page": 1, "page_size": 10},
+ 200,
+ {
+ "count": 10,
+ "previous": None,
+ "next": None,
+ "range": (0, None),
+ },
+ ),
+ (
+ {},
+ 200,
+ {
+ "count": 10,
+ "previous": None,
+ "next": None,
+ "range": (0, None),
+ "api_page_size": 21, # default page_size is 20
+ },
+ ),
+ (
+ {"page": 2, "page_size": 10},
+ 404,
+ {},
+ ),
+ (
+ {"page": 1, "page_size": 5},
+ 200,
+ {
+ "count": 10,
+ "previous": None,
+ "next": {"page": 2, "page_size": 5},
+ "range": (0, 5),
+ },
+ ),
+ (
+ {"page": 2, "page_size": 5},
+ 200,
+ {
+ "count": 10,
+ "previous": {"page_size": 5},
+ "next": None,
+ "range": (5, None),
+ },
+ ),
+ ({"page": 3, "page_size": 5}, 404, {}),
+ ),
+)
+def test_api_documents_search_pagination(
+ indexer_settings, pagination, status, expected
+):
+ """Documents should be ordered by descending "score" by default"""
+ indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search"
+
+ assert get_document_indexer() is not None
+
+ user = factories.UserFactory()
+
+ client = APIClient()
+ client.force_login(user)
+
+ docs = factories.DocumentFactory.create_batch(10, title="alpha", users=[user])
+
+ docs_by_uuid = {str(doc.pk): doc for doc in docs}
+ api_results = [{"_id": id} for id in docs_by_uuid.keys()]
+
+ # reorder randomly to simulate score ordering
+ random.shuffle(api_results)
+
+ # Find response
+ # pylint: disable-next=assignment-from-none
+ api_search = responses.add(
+ responses.POST,
+ "http://find/api/v1.0/search",
+ json=api_results,
+ status=200,
+ )
+
+ response = client.get(
+ "/api/v1.0/documents/search/",
+ data={
+ "q": "alpha",
+ **pagination,
+ },
+ )
+
+ assert response.status_code == status
+
+ if response.status_code < 300:
+ previous_url = (
+ build_search_url(q="alpha", **expected["previous"])
+ if expected["previous"]
+ else None
+ )
+ next_url = (
+ build_search_url(q="alpha", **expected["next"])
+ if expected["next"]
+ else None
+ )
+ start, end = expected["range"]
+
+ content = response.json()
+
+ assert content["count"] == expected["count"]
+ assert content["previous"] == previous_url
+ assert content["next"] == next_url
+
+ results = content.pop("results")
+
+ # The find api results ordering by score is kept
+ assert [r["id"] for r in results] == [r["_id"] for r in api_results[start:end]]
+
+ # Check the query parameters.
+ assert api_search.call_count == 1
+ assert api_search.calls[0].response.status_code == 200
+ assert json_loads(api_search.calls[0].request.body) == {
+ "q": "alpha",
+ "visited": [],
+ "services": ["docs"],
+ "nb_results": 50,
+ "order_by": "updated_at",
+ "order_direction": "desc",
+ }
+
+
+@responses.activate
+@pytest.mark.parametrize(
+ "pagination, status, expected",
+ (
+ (
+ {"page": 1, "page_size": 10},
+ 200,
+ {"count": 10, "previous": None, "next": None, "range": (0, None)},
+ ),
+ (
+ {},
+ 200,
+ {"count": 10, "previous": None, "next": None, "range": (0, None)},
+ ),
+ (
+ {"page": 2, "page_size": 10},
+ 404,
+ {},
+ ),
+ (
+ {"page": 1, "page_size": 5},
+ 200,
+ {
+ "count": 10,
+ "previous": None,
+ "next": {"page": 2, "page_size": 5},
+ "range": (0, 5),
+ },
+ ),
+ (
+ {"page": 2, "page_size": 5},
+ 200,
+ {
+ "count": 10,
+ "previous": {"page_size": 5},
+ "next": None,
+ "range": (5, None),
+ },
+ ),
+ ({"page": 3, "page_size": 5}, 404, {}),
+ ),
+)
+def test_api_documents_search_pagination_endpoint_is_none(
+ indexer_settings, pagination, status, expected
+):
+    """Documents should be ordered by "-updated_at" (most recent first) by default"""
+ indexer_settings.SEARCH_INDEXER_QUERY_URL = None
+
+ assert get_document_indexer() is None
+
+ user = factories.UserFactory()
+
+ client = APIClient()
+ client.force_login(user)
+
+ factories.DocumentFactory.create_batch(10, title="alpha", users=[user])
+
+ response = client.get(
+ "/api/v1.0/documents/search/",
+ data={
+ "q": "alpha",
+ **pagination,
+ },
+ )
+
+ assert response.status_code == status
+
+ if response.status_code < 300:
+ previous_url = (
+ build_search_url(q="alpha", **expected["previous"])
+ if expected["previous"]
+ else None
+ )
+ next_url = (
+ build_search_url(q="alpha", **expected["next"])
+ if expected["next"]
+ else None
+ )
+ queryset = models.Document.objects.order_by("-updated_at")
+ start, end = expected["range"]
+ expected_results = [str(d.pk) for d in queryset[start:end]]
+
+ content = response.json()
+
+ assert content["count"] == expected["count"]
+ assert content["previous"] == previous_url
+ assert content["next"] == next_url
+
+ results = content.pop("results")
+
+ assert [r["id"] for r in results] == expected_results
diff --git a/src/backend/core/tests/test_models_documents_indexer.py b/src/backend/core/tests/test_models_documents_indexer.py
new file mode 100644
index 0000000000..9e171f724d
--- /dev/null
+++ b/src/backend/core/tests/test_models_documents_indexer.py
@@ -0,0 +1,441 @@
+"""
+Unit tests for the Document model
+"""
+# pylint: disable=too-many-lines
+
+from operator import itemgetter
+from unittest import mock
+
+from django.core.cache import cache
+from django.db import transaction
+
+import pytest
+
+from core import factories, models
+from core.services.search_indexers import SearchIndexer
+
+pytestmark = pytest.mark.django_db
+
+
+def reset_batch_indexer_throttle():
+ """Reset throttle flag"""
+ cache.delete("document-batch-indexer-throttle")
+
+
+@pytest.fixture(autouse=True)
+def reset_throttle():
+ """Reset throttle flag before each test"""
+ reset_batch_indexer_throttle()
+ yield
+ reset_batch_indexer_throttle()
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer(mock_push):
+ """Test indexation task on document creation"""
+ with transaction.atomic():
+ doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
+
+ accesses = {}
+ data = [call.args[0] for call in mock_push.call_args_list]
+
+ indexer = SearchIndexer()
+
+ assert len(data) == 1
+
+ # One call
+ assert sorted(data[0], key=itemgetter("id")) == sorted(
+ [
+ indexer.serialize_document(doc1, accesses),
+ indexer.serialize_document(doc2, accesses),
+ indexer.serialize_document(doc3, accesses),
+ ],
+ key=itemgetter("id"),
+ )
+
+ # The throttle counters should be reset
+    # The throttle flag should still be set for the countdown duration
+
+
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_no_batches(indexer_settings):
+    """Test indexation task on document creation, no throttle"""
+ indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
+
+ with mock.patch.object(SearchIndexer, "push") as mock_push:
+ with transaction.atomic():
+ doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
+
+ accesses = {}
+ data = [call.args[0] for call in mock_push.call_args_list]
+
+ indexer = SearchIndexer()
+
+ # 3 calls
+ assert len(data) == 3
+ # one document per call
+ assert [len(d) for d in data] == [1] * 3
+ # all documents are indexed
+ assert sorted([d[0] for d in data], key=itemgetter("id")) == sorted(
+ [
+ indexer.serialize_document(doc1, accesses),
+ indexer.serialize_document(doc2, accesses),
+ indexer.serialize_document(doc3, accesses),
+ ],
+ key=itemgetter("id"),
+ )
+
+    # The throttle flag should not have been set (no countdown)
+    assert cache.get("document-batch-indexer-throttle") is None
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_settings):
+ """Task should not start an indexation when disabled"""
+ indexer_settings.SEARCH_INDEXER_CLASS = None
+
+ user = factories.UserFactory()
+
+ with transaction.atomic():
+ doc = factories.DocumentFactory()
+ factories.UserDocumentAccessFactory(document=doc, user=user)
+
+    mock_push.assert_not_called()
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_wrongly_configured(
+ mock_push, indexer_settings
+):
+    """Task should not start an indexation when the indexer is misconfigured"""
+ indexer_settings.SEARCH_INDEXER_URL = None
+
+ user = factories.UserFactory()
+
+ with transaction.atomic():
+ doc = factories.DocumentFactory()
+ factories.UserDocumentAccessFactory(document=doc, user=user)
+
+    mock_push.assert_not_called()
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_with_accesses(mock_push):
+ """Test indexation task on document creation"""
+ user = factories.UserFactory()
+
+ with transaction.atomic():
+ doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3)
+
+ factories.UserDocumentAccessFactory(document=doc1, user=user)
+ factories.UserDocumentAccessFactory(document=doc2, user=user)
+ factories.UserDocumentAccessFactory(document=doc3, user=user)
+
+ accesses = {
+ str(doc1.path): {"users": [user.sub]},
+ str(doc2.path): {"users": [user.sub]},
+ str(doc3.path): {"users": [user.sub]},
+ }
+
+ data = [call.args[0] for call in mock_push.call_args_list]
+
+ indexer = SearchIndexer()
+
+ assert len(data) == 1
+ assert sorted(data[0], key=itemgetter("id")) == sorted(
+ [
+ indexer.serialize_document(doc1, accesses),
+ indexer.serialize_document(doc2, accesses),
+ indexer.serialize_document(doc3, accesses),
+ ],
+ key=itemgetter("id"),
+ )
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_deleted(mock_push):
+ """Indexation task on deleted or ancestor_deleted documents"""
+ user = factories.UserFactory()
+
+ with transaction.atomic():
+ doc = factories.DocumentFactory(
+ link_reach=models.LinkReachChoices.AUTHENTICATED
+ )
+ main_doc = factories.DocumentFactory(
+ link_reach=models.LinkReachChoices.AUTHENTICATED
+ )
+ child_doc = factories.DocumentFactory(
+ parent=main_doc,
+ link_reach=models.LinkReachChoices.AUTHENTICATED,
+ )
+
+ factories.UserDocumentAccessFactory(document=doc, user=user)
+ factories.UserDocumentAccessFactory(document=main_doc, user=user)
+ factories.UserDocumentAccessFactory(document=child_doc, user=user)
+
+ # Manually reset the throttle flag here or the next indexation will be ignored for 1 second
+ reset_batch_indexer_throttle()
+
+ with transaction.atomic():
+ main_doc_deleted = models.Document.objects.get(pk=main_doc.pk)
+ main_doc_deleted.soft_delete()
+
+ child_doc_deleted = models.Document.objects.get(pk=child_doc.pk)
+
+ main_doc_deleted.refresh_from_db()
+ child_doc_deleted.refresh_from_db()
+
+ assert main_doc_deleted.deleted_at is not None
+ assert child_doc_deleted.ancestors_deleted_at is not None
+
+ assert child_doc_deleted.deleted_at is None
+ assert child_doc_deleted.ancestors_deleted_at is not None
+
+ accesses = {
+ str(doc.path): {"users": [user.sub]},
+ str(main_doc_deleted.path): {"users": [user.sub]},
+ str(child_doc_deleted.path): {"users": [user.sub]},
+ }
+
+ data = [call.args[0] for call in mock_push.call_args_list]
+
+ indexer = SearchIndexer()
+
+ assert len(data) == 2
+
+ # First indexation on document creation
+ assert sorted(data[0], key=itemgetter("id")) == sorted(
+ [
+ indexer.serialize_document(doc, accesses),
+ indexer.serialize_document(main_doc, accesses),
+ indexer.serialize_document(child_doc, accesses),
+ ],
+ key=itemgetter("id"),
+ )
+
+    # Even deleted items are re-indexed: only their status will be updated in the future
+ assert sorted(data[1], key=itemgetter("id")) == sorted(
+ [
+ indexer.serialize_document(main_doc_deleted, accesses), # soft_delete()
+ indexer.serialize_document(child_doc_deleted, accesses),
+ ],
+ key=itemgetter("id"),
+ )
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.usefixtures("indexer_settings")
+def test_models_documents_indexer_hard_deleted():
+ """Indexation task on hard deleted document"""
+ user = factories.UserFactory()
+
+ with transaction.atomic():
+ doc = factories.DocumentFactory(
+ link_reach=models.LinkReachChoices.AUTHENTICATED
+ )
+ factories.UserDocumentAccessFactory(document=doc, user=user)
+
+ # Call task on deleted document.
+ with mock.patch.object(SearchIndexer, "push") as mock_push:
+ doc.delete()
+
+    # Hard deleted documents are not re-indexed.
+    mock_push.assert_not_called()
+
+
+@mock.patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_post_save_indexer_restored(mock_push):
+ """Restart indexation task on restored documents"""
+ user = factories.UserFactory()
+
+ with transaction.atomic():
+ doc = factories.DocumentFactory(
+ link_reach=models.LinkReachChoices.AUTHENTICATED
+ )
+ doc_deleted = factories.DocumentFactory(
+ link_reach=models.LinkReachChoices.AUTHENTICATED
+ )
+ doc_ancestor_deleted = factories.DocumentFactory(
+ parent=doc_deleted,
+ link_reach=models.LinkReachChoices.AUTHENTICATED,
+ )
+
+ factories.UserDocumentAccessFactory(document=doc, user=user)
+ factories.UserDocumentAccessFactory(document=doc_deleted, user=user)
+ factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user)
+
+ doc_deleted.soft_delete()
+
+ doc_deleted.refresh_from_db()
+ doc_ancestor_deleted.refresh_from_db()
+
+ assert doc_deleted.deleted_at is not None
+ assert doc_deleted.ancestors_deleted_at is not None
+
+ assert doc_ancestor_deleted.deleted_at is None
+ assert doc_ancestor_deleted.ancestors_deleted_at is not None
+
+ # Manually reset the throttle flag here or the next indexation will be ignored for 1 second
+ reset_batch_indexer_throttle()
+
+ with transaction.atomic():
+ doc_restored = models.Document.objects.get(pk=doc_deleted.pk)
+ doc_restored.restore()
+
+ doc_ancestor_restored = models.Document.objects.get(pk=doc_ancestor_deleted.pk)
+
+ assert doc_restored.deleted_at is None
+ assert doc_restored.ancestors_deleted_at is None
+
+ assert doc_ancestor_restored.deleted_at is None
+ assert doc_ancestor_restored.ancestors_deleted_at is None
+
+ accesses = {
+ str(doc.path): {"users": [user.sub]},
+ str(doc_deleted.path): {"users": [user.sub]},
+ str(doc_ancestor_deleted.path): {"users": [user.sub]},
+ }
+
+ data = [call.args[0] for call in mock_push.call_args_list]
+
+ indexer = SearchIndexer()
+
+ # All docs are re-indexed
+ assert len(data) == 2
+
+ # First indexation on items creation & soft delete (in the same transaction)
+ assert sorted(data[0], key=itemgetter("id")) == sorted(
+ [
+ indexer.serialize_document(doc, accesses),
+ indexer.serialize_document(doc_deleted, accesses),
+ indexer.serialize_document(doc_ancestor_deleted, accesses),
+ ],
+ key=itemgetter("id"),
+ )
+
+    # Restored items are re-indexed: only their status will be updated in the future
+ assert sorted(data[1], key=itemgetter("id")) == sorted(
+ [
+ indexer.serialize_document(doc_restored, accesses), # restore()
+ indexer.serialize_document(doc_ancestor_restored, accesses),
+ ],
+ key=itemgetter("id"),
+ )
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.usefixtures("indexer_settings")
+def test_models_documents_post_save_indexer_throttle():
+ """Test indexation task skipping on document update"""
+ indexer = SearchIndexer()
+ user = factories.UserFactory()
+
+ with mock.patch.object(SearchIndexer, "push"):
+ with transaction.atomic():
+ docs = factories.DocumentFactory.create_batch(5, users=(user,))
+
+ accesses = {str(item.path): {"users": [user.sub]} for item in docs}
+
+ with mock.patch.object(SearchIndexer, "push") as mock_push:
+ # Simulate 1 running task
+ cache.set("document-batch-indexer-throttle", 1)
+
+ # save doc to trigger the indexer, but nothing should be done since
+ # the flag is up
+ with transaction.atomic():
+ docs[0].save()
+ docs[2].save()
+ docs[3].save()
+
+ assert [call.args[0] for call in mock_push.call_args_list] == []
+
+ with mock.patch.object(SearchIndexer, "push") as mock_push:
+ # No waiting task
+ cache.delete("document-batch-indexer-throttle")
+
+ with transaction.atomic():
+ docs[0].save()
+ docs[2].save()
+ docs[3].save()
+
+ data = [call.args[0] for call in mock_push.call_args_list]
+
+ # One call
+ assert len(data) == 1
+
+ assert sorted(data[0], key=itemgetter("id")) == sorted(
+ [
+ indexer.serialize_document(docs[0], accesses),
+ indexer.serialize_document(docs[2], accesses),
+ indexer.serialize_document(docs[3], accesses),
+ ],
+ key=itemgetter("id"),
+ )
+
+
+@pytest.mark.django_db(transaction=True)
+@pytest.mark.usefixtures("indexer_settings")
+def test_models_documents_access_post_save_indexer():
+ """Test indexation task on DocumentAccess update"""
+ users = factories.UserFactory.create_batch(3)
+
+ with mock.patch.object(SearchIndexer, "push"):
+ with transaction.atomic():
+ doc = factories.DocumentFactory(users=users)
+ doc_accesses = models.DocumentAccess.objects.filter(document=doc).order_by(
+ "user__sub"
+ )
+
+ reset_batch_indexer_throttle()
+
+ with mock.patch.object(SearchIndexer, "push") as mock_push:
+ with transaction.atomic():
+ for doc_access in doc_accesses:
+ doc_access.save()
+
+ data = [call.args[0] for call in mock_push.call_args_list]
+
+ # One call
+ assert len(data) == 1
+
+ assert [d["id"] for d in data[0]] == [str(doc.pk)]
+
+
+@pytest.mark.django_db(transaction=True)
+def test_models_documents_access_post_save_indexer_no_throttle(indexer_settings):
+ """Test indexation task on DocumentAccess update, without throttling"""
+ indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0
+
+ users = factories.UserFactory.create_batch(3)
+
+ with transaction.atomic():
+ doc = factories.DocumentFactory(users=users)
+ doc_accesses = models.DocumentAccess.objects.filter(document=doc).order_by(
+ "user__sub"
+ )
+
+ reset_batch_indexer_throttle()
+
+ with mock.patch.object(SearchIndexer, "push") as mock_push:
+ with transaction.atomic():
+ for doc_access in doc_accesses:
+ doc_access.save()
+
+ data = [call.args[0] for call in mock_push.call_args_list]
+
+ # 3 calls
+ assert len(data) == 3
+ # one document per call
+ assert [len(d) for d in data] == [1] * 3
+ # the same document is indexed 3 times
+ assert [d[0]["id"] for d in data] == [str(doc.pk)] * 3
diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py
new file mode 100644
index 0000000000..61488a921b
--- /dev/null
+++ b/src/backend/core/tests/test_services_search_indexers.py
@@ -0,0 +1,635 @@
+"""Tests for Documents search indexers"""
+
+from functools import partial
+from json import dumps as json_dumps
+from unittest.mock import patch
+
+from django.contrib.auth.models import AnonymousUser
+from django.core.exceptions import ImproperlyConfigured
+from django.utils.module_loading import import_string
+
+import pytest
+import responses
+from requests import HTTPError
+
+from core import factories, models, utils
+from core.services.search_indexers import (
+ BaseDocumentIndexer,
+ SearchIndexer,
+ get_document_indexer,
+ get_visited_document_ids_of,
+)
+
+pytestmark = pytest.mark.django_db
+
+
+class FakeDocumentIndexer(BaseDocumentIndexer):
+ """Fake indexer for test purpose"""
+
+ def serialize_document(self, document, accesses):
+ return {}
+
+ def push(self, data):
+ pass
+
+ def search_query(self, data, token):
+ return {}
+
+
+def test_services_search_indexer_class_invalid(indexer_settings):
+ """
+ get_document_indexer() should return None if SEARCH_INDEXER_CLASS cannot be imported.
+ """
+ indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown"
+
+ assert get_document_indexer() is None
+
+
+def test_services_search_indexer_class(indexer_settings):
+ """
+ Import the indexer class defined in the SEARCH_INDEXER_CLASS setting.
+ """
+ indexer_settings.SEARCH_INDEXER_CLASS = (
+ "core.tests.test_services_search_indexers.FakeDocumentIndexer"
+ )
+
+ assert isinstance(
+ get_document_indexer(),
+ import_string("core.tests.test_services_search_indexers.FakeDocumentIndexer"),
+ )
+
+
+def test_services_search_indexer_is_configured(indexer_settings):
+ """
+ get_document_indexer() should return an indexer only when the indexer class and
+ the other configuration settings are valid.
+ """
+ indexer_settings.SEARCH_INDEXER_CLASS = None
+
+ # None
+ get_document_indexer.cache_clear()
+ assert not get_document_indexer()
+
+ # Empty
+ indexer_settings.SEARCH_INDEXER_CLASS = ""
+
+ get_document_indexer.cache_clear()
+ assert not get_document_indexer()
+
+ # Valid class
+ indexer_settings.SEARCH_INDEXER_CLASS = (
+ "core.services.search_indexers.SearchIndexer"
+ )
+
+ get_document_indexer.cache_clear()
+ assert get_document_indexer() is not None
+
+ indexer_settings.SEARCH_INDEXER_URL = ""
+
+ # Invalid url
+ get_document_indexer.cache_clear()
+ assert not get_document_indexer()
+
+
+def test_services_search_indexer_url_is_none(indexer_settings):
+ """
+ Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_URL is None.
+ """
+ indexer_settings.SEARCH_INDEXER_URL = None
+
+ with pytest.raises(ImproperlyConfigured) as exc_info:
+ SearchIndexer()
+
+ assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
+
+
+def test_services_search_indexer_url_is_empty(indexer_settings):
+ """
+ Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_URL is an empty string.
+ """
+ indexer_settings.SEARCH_INDEXER_URL = ""
+
+ with pytest.raises(ImproperlyConfigured) as exc_info:
+ SearchIndexer()
+
+ assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
+
+
+def test_services_search_indexer_secret_is_none(indexer_settings):
+ """
+ Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_SECRET is None.
+ """
+ indexer_settings.SEARCH_INDEXER_SECRET = None
+
+ with pytest.raises(ImproperlyConfigured) as exc_info:
+ SearchIndexer()
+
+ assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
+ exc_info.value
+ )
+
+
+def test_services_search_indexer_secret_is_empty(indexer_settings):
+ """
+ Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_SECRET is an empty string.
+ """
+ indexer_settings.SEARCH_INDEXER_SECRET = ""
+
+ with pytest.raises(ImproperlyConfigured) as exc_info:
+ SearchIndexer()
+
+ assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
+ exc_info.value
+ )
+
+
+def test_services_search_endpoint_is_none(indexer_settings):
+ """
+ Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_QUERY_URL is None.
+ """
+ indexer_settings.SEARCH_INDEXER_QUERY_URL = None
+
+ with pytest.raises(ImproperlyConfigured) as exc_info:
+ SearchIndexer()
+
+ assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
+ exc_info.value
+ )
+
+
+def test_services_search_endpoint_is_empty(indexer_settings):
+ """
+ Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_QUERY_URL is an empty string.
+ """
+ indexer_settings.SEARCH_INDEXER_QUERY_URL = ""
+
+ with pytest.raises(ImproperlyConfigured) as exc_info:
+ SearchIndexer()
+
+ assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
+ exc_info.value
+ )
+
+
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_serialize_document_returns_expected_json():
+ """
+ It should serialize documents with correct metadata and access control.
+ """
+ user_a, user_b = factories.UserFactory.create_batch(2)
+ document = factories.DocumentFactory()
+ factories.DocumentFactory(parent=document)
+
+ factories.UserDocumentAccessFactory(document=document, user=user_a)
+ factories.UserDocumentAccessFactory(document=document, user=user_b)
+ factories.TeamDocumentAccessFactory(document=document, team="team1")
+ factories.TeamDocumentAccessFactory(document=document, team="team2")
+
+ accesses = {
+ document.path: {
+ "users": {str(user_a.sub), str(user_b.sub)},
+ "teams": {"team1", "team2"},
+ }
+ }
+
+ indexer = SearchIndexer()
+ result = indexer.serialize_document(document, accesses)
+
+ assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
+ assert set(result.pop("groups")) == {"team1", "team2"}
+ assert result == {
+ "id": str(document.id),
+ "title": document.title,
+ "depth": 1,
+ "path": document.path,
+ "numchild": 1,
+ "content": utils.base64_yjs_to_text(document.content),
+ "created_at": document.created_at.isoformat(),
+ "updated_at": document.updated_at.isoformat(),
+ "reach": document.link_reach,
+ "size": 13,
+ "is_active": True,
+ }
+
+
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_serialize_document_deleted():
+ """Deleted documents are marked as just in the serialized json."""
+ parent = factories.DocumentFactory()
+ document = factories.DocumentFactory(parent=parent)
+
+ parent.soft_delete()
+ document.refresh_from_db()
+
+ indexer = SearchIndexer()
+ result = indexer.serialize_document(document, {})
+
+ assert result["is_active"] is False
+
+
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_serialize_document_empty():
+ """Empty documents returns empty content in the serialized json."""
+ document = factories.DocumentFactory(content="", title=None)
+
+ indexer = SearchIndexer()
+ result = indexer.serialize_document(document, {})
+
+ assert result["content"] == ""
+ assert result["title"] == ""
+
+
+@responses.activate
+def test_services_search_indexers_index_errors(indexer_settings):
+ """
+ Document indexing should raise an HTTPError on Find API error responses.
+ """
+ factories.DocumentFactory()
+
+ indexer_settings.SEARCH_INDEXER_URL = "http://app-find/api/v1.0/documents/index/"
+
+ responses.add(
+ responses.POST,
+ "http://app-find/api/v1.0/documents/index/",
+ status=401,
+ body=json_dumps({"message": "Authentication failed."}),
+ )
+
+ with pytest.raises(HTTPError):
+ SearchIndexer().index()
+
+
+@patch.object(SearchIndexer, "push")
+def test_services_search_indexers_batches_pass_only_batch_accesses(
+ mock_push, indexer_settings
+):
+ """
+ Documents indexing should be processed in batches,
+ and only the access data relevant to each batch should be used.
+ """
+ indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2
+ documents = factories.DocumentFactory.create_batch(5)
+
+ # Attach a single user access to each document
+ expected_user_subs = {}
+ for document in documents:
+ access = factories.UserDocumentAccessFactory(document=document)
+ expected_user_subs[str(document.id)] = str(access.user.sub)
+
+ assert SearchIndexer().index() == 5
+
+ # Should be 3 batches: 2 + 2 + 1
+ assert mock_push.call_count == 3
+
+ seen_doc_ids = set()
+
+ for call in mock_push.call_args_list:
+ batch = call.args[0]
+ assert isinstance(batch, list)
+
+ for doc_json in batch:
+ doc_id = doc_json["id"]
+ seen_doc_ids.add(doc_id)
+
+ # Only one user expected per document
+ assert doc_json["users"] == [expected_user_subs[doc_id]]
+ assert doc_json["groups"] == []
+
+ # Make sure all 5 documents were indexed
+ assert seen_doc_ids == {str(d.id) for d in documents}
+
+
+@patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_batch_size_argument(mock_push):
+ """
+ Documents indexing should be processed in batches; the batch_size argument
+ overrides SEARCH_INDEXER_BATCH_SIZE.
+ """
+ documents = factories.DocumentFactory.create_batch(5)
+
+ # Attach a single user access to each document
+ expected_user_subs = {}
+ for document in documents:
+ access = factories.UserDocumentAccessFactory(document=document)
+ expected_user_subs[str(document.id)] = str(access.user.sub)
+
+ assert SearchIndexer().index(batch_size=2) == 5
+
+ # Should be 3 batches: 2 + 2 + 1
+ assert mock_push.call_count == 3
+
+ seen_doc_ids = set()
+
+ for call in mock_push.call_args_list:
+ batch = call.args[0]
+ assert isinstance(batch, list)
+
+ for doc_json in batch:
+ doc_id = doc_json["id"]
+ seen_doc_ids.add(doc_id)
+
+ # Only one user expected per document
+ assert doc_json["users"] == [expected_user_subs[doc_id]]
+ assert doc_json["groups"] == []
+
+ # Make sure all 5 documents were indexed
+ assert seen_doc_ids == {str(d.id) for d in documents}
+
+
+@patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_ignore_empty_documents(mock_push):
+ """
+ Documents with both an empty title and empty content should not be indexed;
+ documents with only one of the two empty are still indexed.
+ """
+ document = factories.DocumentFactory()
+ factories.DocumentFactory(content="", title="")
+ empty_title = factories.DocumentFactory(title="")
+ empty_content = factories.DocumentFactory(content="")
+
+ assert SearchIndexer().index() == 3
+
+ assert mock_push.call_count == 1
+
+ # Make sure only non-empty documents are indexed
+ results = {doc["id"] for doc in mock_push.call_args[0][0]}
+ assert results == {
+ str(d.id)
+ for d in (
+ document,
+ empty_content,
+ empty_title,
+ )
+ }
+
+
+@patch.object(SearchIndexer, "push")
+def test_services_search_indexers_skip_empty_batches(mock_push, indexer_settings):
+ """
+ Indexing batches that contain only empty documents should be skipped.
+ """
+ indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2
+
+ document = factories.DocumentFactory()
+
+ # Only empty docs
+ factories.DocumentFactory.create_batch(5, content="", title="")
+
+ assert SearchIndexer().index() == 1
+ assert mock_push.call_count == 1
+
+ results = [doc["id"] for doc in mock_push.call_args[0][0]]
+ assert results == [str(document.id)]
+
+
+@patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_ancestors_link_reach(mock_push):
+ """Document accesses and reach should take into account ancestors link reaches."""
+ great_grand_parent = factories.DocumentFactory(link_reach="restricted")
+ grand_parent = factories.DocumentFactory(
+ parent=great_grand_parent, link_reach="authenticated"
+ )
+ parent = factories.DocumentFactory(parent=grand_parent, link_reach="public")
+ document = factories.DocumentFactory(parent=parent, link_reach="restricted")
+
+ assert SearchIndexer().index() == 4
+
+ results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
+ assert len(results) == 4
+ assert results[str(great_grand_parent.id)]["reach"] == "restricted"
+ assert results[str(grand_parent.id)]["reach"] == "authenticated"
+ assert results[str(parent.id)]["reach"] == "public"
+ assert results[str(document.id)]["reach"] == "public"
+
+
+@patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_ancestors_users(mock_push):
+ """Document accesses and reach should include users from ancestors."""
+ user_gp, user_p, user_d = factories.UserFactory.create_batch(3)
+
+ grand_parent = factories.DocumentFactory(users=[user_gp])
+ parent = factories.DocumentFactory(parent=grand_parent, users=[user_p])
+ document = factories.DocumentFactory(parent=parent, users=[user_d])
+
+ assert SearchIndexer().index() == 3
+
+ results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
+ assert len(results) == 3
+ assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)]
+ assert set(results[str(parent.id)]["users"]) == {str(user_gp.sub), str(user_p.sub)}
+ assert set(results[str(document.id)]["users"]) == {
+ str(user_gp.sub),
+ str(user_p.sub),
+ str(user_d.sub),
+ }
+
+
+@patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_ancestors_teams(mock_push):
+ """Document accesses and reach should include teams from ancestors."""
+ grand_parent = factories.DocumentFactory(teams=["team_gp"])
+ parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"])
+ document = factories.DocumentFactory(parent=parent, teams=["team_d"])
+
+ assert SearchIndexer().index() == 3
+
+ results = {doc["id"]: doc for doc in mock_push.call_args[0][0]}
+ assert len(results) == 3
+ assert results[str(grand_parent.id)]["groups"] == ["team_gp"]
+ assert set(results[str(parent.id)]["groups"]) == {"team_gp", "team_p"}
+ assert set(results[str(document.id)]["groups"]) == {"team_gp", "team_p", "team_d"}
+
+
+@patch("requests.post")
+def test_push_uses_correct_url_and_data(mock_post, indexer_settings):
+ """
+ push() should call requests.post with the correct URL from settings,
+ a 10-second timeout, and the data as JSON.
+ """
+ indexer_settings.SEARCH_INDEXER_URL = "http://example.com/index"
+
+ indexer = SearchIndexer()
+ sample_data = [{"id": "123", "title": "Test"}]
+
+ mock_response = mock_post.return_value
+ mock_response.raise_for_status.return_value = None # No error
+
+ indexer.push(sample_data)
+
+ mock_post.assert_called_once()
+ args, kwargs = mock_post.call_args
+
+ assert args[0] == indexer_settings.SEARCH_INDEXER_URL
+ assert kwargs.get("json") == sample_data
+ assert kwargs.get("timeout") == 10
+
+
+def test_get_visited_document_ids_of():
+ """
+ get_visited_document_ids_of() returns the ids of the documents visited by the
+ user for which they have no specific access configuration (e.g. public documents).
+ """
+ user = factories.UserFactory()
+ other = factories.UserFactory()
+ anonymous = AnonymousUser()
+ queryset = models.Document.objects.all()
+
+ assert not get_visited_document_ids_of(queryset, anonymous)
+ assert not get_visited_document_ids_of(queryset, user)
+
+ doc1, doc2, _ = factories.DocumentFactory.create_batch(3)
+
+ create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)
+
+ create_link(document=doc1)
+ create_link(document=doc2)
+
+ # The third document is not visited
+ assert sorted(get_visited_document_ids_of(queryset, user)) == sorted(
+ [str(doc1.pk), str(doc2.pk)]
+ )
+
+ factories.UserDocumentAccessFactory(user=other, document=doc1)
+ factories.UserDocumentAccessFactory(user=user, document=doc2)
+
+ # The second document now has a specific access for the user
+ assert get_visited_document_ids_of(queryset, user) == [str(doc1.pk)]
+
+
+@pytest.mark.usefixtures("indexer_settings")
+def test_get_visited_document_ids_of_deleted():
+ """
+ get_visited_document_ids_of() returns the ids of the documents viewed
+ by the user if they are not deleted.
+ """
+ user = factories.UserFactory()
+ anonymous = AnonymousUser()
+ queryset = models.Document.objects.all()
+
+ assert not get_visited_document_ids_of(queryset, anonymous)
+ assert not get_visited_document_ids_of(queryset, user)
+
+ doc = factories.DocumentFactory()
+ doc_deleted = factories.DocumentFactory()
+ doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted)
+
+ create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)
+
+ create_link(document=doc)
+ create_link(document=doc_deleted)
+ create_link(document=doc_ancestor_deleted)
+
+ # All documents are visited
+ assert sorted(get_visited_document_ids_of(queryset, user)) == sorted(
+ [str(doc.pk), str(doc_deleted.pk), str(doc_ancestor_deleted.pk)]
+ )
+
+ doc_deleted.soft_delete()
+
+ # Only the first document is not deleted
+ assert get_visited_document_ids_of(queryset, user) == [str(doc.pk)]
+
+
+@responses.activate
+def test_services_search_indexers_search_errors(indexer_settings):
+ """
+ Document search should raise an HTTPError on Find API error responses.
+ """
+ factories.DocumentFactory()
+
+ indexer_settings.SEARCH_INDEXER_QUERY_URL = (
+ "http://app-find/api/v1.0/documents/search/"
+ )
+
+ responses.add(
+ responses.POST,
+ "http://app-find/api/v1.0/documents/search/",
+ status=401,
+ body=json_dumps({"message": "Authentication failed."}),
+ )
+
+ with pytest.raises(HTTPError):
+ SearchIndexer().search("alpha", token="mytoken")
+
+
+@patch("requests.post")
+def test_services_search_indexers_search(mock_post, indexer_settings):
+ """
+ search() should call requests.post to SEARCH_INDEXER_QUERY_URL with the
+ document ids from linktraces.
+ """
+ user = factories.UserFactory()
+ indexer = SearchIndexer()
+
+ mock_response = mock_post.return_value
+ mock_response.raise_for_status.return_value = None # No error
+
+ doc1, doc2, _ = factories.DocumentFactory.create_batch(3)
+
+ create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)
+
+ create_link(document=doc1)
+ create_link(document=doc2)
+
+ visited = get_visited_document_ids_of(models.Document.objects.all(), user)
+
+ indexer.search("alpha", visited=visited, token="mytoken")
+
+ args, kwargs = mock_post.call_args
+
+ assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL
+
+ query_data = kwargs.get("json")
+ assert query_data["q"] == "alpha"
+ assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)])
+ assert query_data["services"] == ["docs"]
+ assert query_data["nb_results"] == 50
+ assert query_data["order_by"] == "updated_at"
+ assert query_data["order_direction"] == "desc"
+
+ assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"}
+ assert kwargs.get("timeout") == 10
+
+
+@patch("requests.post")
+def test_services_search_indexers_search_nb_results(mock_post, indexer_settings):
+ """
+ The Find API call should send nb_results == SEARCH_INDEXER_QUERY_LIMIT,
+ or the value of the nb_results argument when it is given.
+ """
+ indexer_settings.SEARCH_INDEXER_QUERY_LIMIT = 25
+
+ user = factories.UserFactory()
+ indexer = SearchIndexer()
+
+ mock_response = mock_post.return_value
+ mock_response.raise_for_status.return_value = None # No error
+
+ doc1, doc2, _ = factories.DocumentFactory.create_batch(3)
+
+ create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False)
+
+ create_link(document=doc1)
+ create_link(document=doc2)
+
+ visited = get_visited_document_ids_of(models.Document.objects.all(), user)
+
+ indexer.search("alpha", visited=visited, token="mytoken")
+
+ args, kwargs = mock_post.call_args
+
+ assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL
+ assert kwargs.get("json")["nb_results"] == 25
+
+ # The argument overrides the setting value
+ indexer.search("alpha", visited=visited, token="mytoken", nb_results=109)
+
+ args, kwargs = mock_post.call_args
+
+ assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL
+ assert kwargs.get("json")["nb_results"] == 109
diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py
index 37b2e32d5e..42d588c536 100644
--- a/src/backend/core/tests/test_utils.py
+++ b/src/backend/core/tests/test_utils.py
@@ -75,3 +75,28 @@ def test_utils_extract_attachments():
base64_string = base64.b64encode(update).decode("utf-8")
# image_key2 is missing the "/media/" part and shouldn't get extracted
assert utils.extract_attachments(base64_string) == [image_key1, image_key3]
+
+
+def test_utils_get_ancestor_to_descendants_map_single_path():
+ """Test ancestor mapping of a single path."""
+ paths = ["000100020005"]
+ result = utils.get_ancestor_to_descendants_map(paths, steplen=4)
+
+ assert result == {
+ "0001": {"000100020005"},
+ "00010002": {"000100020005"},
+ "000100020005": {"000100020005"},
+ }
+
+
+def test_utils_get_ancestor_to_descendants_map_multiple_paths():
+ """Test ancestor mapping of multiple paths with shared prefixes."""
+ paths = ["000100020005", "00010003"]
+ result = utils.get_ancestor_to_descendants_map(paths, steplen=4)
+
+ assert result == {
+ "0001": {"000100020005", "00010003"},
+ "00010002": {"000100020005"},
+ "000100020005": {"000100020005"},
+ "00010003": {"00010003"},
+ }
diff --git a/src/backend/core/utils.py b/src/backend/core/utils.py
index 780431f495..357ede03c3 100644
--- a/src/backend/core/utils.py
+++ b/src/backend/core/utils.py
@@ -2,6 +2,7 @@
import base64
import re
+from collections import defaultdict
import pycrdt
from bs4 import BeautifulSoup
@@ -9,6 +10,27 @@
from core import enums
+def get_ancestor_to_descendants_map(paths, steplen):
+ """
+ Given a list of document paths, return a mapping of ancestor_path -> set of descendant_paths.
+
+ Each path is assumed to use materialized path format with fixed-length segments.
+
+ Args:
+ paths (list of str): List of full document paths.
+ steplen (int): Length of each path segment.
+
+ Returns:
+ dict[str, set[str]]: Mapping from ancestor path to its descendant paths (including itself).
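+
+ Example (illustrative, mirroring the unit tests):
+ >>> dict(get_ancestor_to_descendants_map(["000100020005"], steplen=4))
+ {'0001': {'000100020005'}, '00010002': {'000100020005'}, '000100020005': {'000100020005'}}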
+ """
+ ancestor_map = defaultdict(set)
+ for path in paths:
+ for i in range(steplen, len(path) + 1, steplen):
+ ancestor = path[:i]
+ ancestor_map[ancestor].add(path)
+ return ancestor_map
+
+
def filter_descendants(paths, root_paths, skip_sorting=False):
"""
Filters paths to keep only those that are descendants of any path in root_paths.
diff --git a/src/backend/demo/management/commands/create_demo.py b/src/backend/demo/management/commands/create_demo.py
index 3681c378a0..ebce4eae25 100644
--- a/src/backend/demo/management/commands/create_demo.py
+++ b/src/backend/demo/management/commands/create_demo.py
@@ -1,16 +1,19 @@
# ruff: noqa: S311, S106
"""create_demo management command"""
+import base64
import logging
import math
import random
import time
from collections import defaultdict
+from uuid import uuid4
from django import db
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
+import pycrdt
from faker import Faker
from core import models
@@ -27,6 +30,16 @@ def random_true_with_probability(probability):
return random.random() < probability
+def get_ydoc_for_text(text):
+ """Return a ydoc from plain text for demo purposes."""
+ ydoc = pycrdt.Doc()
+ paragraph = pycrdt.XmlElement("p", {}, [pycrdt.XmlText(text)])
+ fragment = pycrdt.XmlFragment([paragraph])
+ ydoc["document-store"] = fragment
+ update = ydoc.get_update()
+ return base64.b64encode(update).decode("utf-8")
+
+
class BulkQueue:
"""A utility class to create Django model instances in bulk by just pushing to a queue."""
@@ -48,7 +61,7 @@ def _bulk_create(self, objects):
self.queue[objects[0]._meta.model.__name__] = [] # noqa: SLF001
def push(self, obj):
- """Add a model instance to queue to that it gets created in bulk."""
+ """Add a model instance to queue so that it gets created in bulk."""
objects = self.queue[obj._meta.model.__name__] # noqa: SLF001
objects.append(obj)
if len(objects) > self.BATCH_SIZE:
@@ -139,17 +152,19 @@ def create_demo(stdout):
# pylint: disable=protected-access
key = models.Document._int2str(i) # noqa: SLF001
padding = models.Document.alphabet[0] * (models.Document.steplen - len(key))
- queue.push(
- models.Document(
- depth=1,
- path=f"{padding}{key}",
- creator_id=random.choice(users_ids),
- title=fake.sentence(nb_words=4),
- link_reach=models.LinkReachChoices.AUTHENTICATED
- if random_true_with_probability(0.5)
- else random.choice(models.LinkReachChoices.values),
- )
+ title = fake.sentence(nb_words=4)
+ document = models.Document(
+ id=uuid4(),
+ depth=1,
+ path=f"{padding}{key}",
+ creator_id=random.choice(users_ids),
+ title=title,
+ link_reach=models.LinkReachChoices.AUTHENTICATED
+ if random_true_with_probability(0.5)
+ else random.choice(models.LinkReachChoices.values),
)
+ document.save_content(get_ydoc_for_text(f"Content for {title:s}"))
+ queue.push(document)
queue.flush()
diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py
index 2229036c8a..4e3c55c87a 100755
--- a/src/backend/impress/settings.py
+++ b/src/backend/impress/settings.py
@@ -99,6 +99,31 @@ class Base(Configuration):
}
DEFAULT_AUTO_FIELD = "django.db.models.AutoField"
+ # Search
+ SEARCH_INDEXER_CLASS = values.Value(
+ default=None,
+ environ_name="SEARCH_INDEXER_CLASS",
+ environ_prefix=None,
+ )
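+ # e.g. "core.services.search_indexers.SearchIndexer" to index documents in Find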
+ SEARCH_INDEXER_BATCH_SIZE = values.IntegerValue(
+ default=100_000, environ_name="SEARCH_INDEXER_BATCH_SIZE", environ_prefix=None
+ )
+ SEARCH_INDEXER_URL = values.Value(
+ default=None, environ_name="SEARCH_INDEXER_URL", environ_prefix=None
+ )
+ SEARCH_INDEXER_COUNTDOWN = values.IntegerValue(
+ default=1, environ_name="SEARCH_INDEXER_COUNTDOWN", environ_prefix=None
+ )
+ SEARCH_INDEXER_SECRET = values.Value(
+ default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None
+ )
+ SEARCH_INDEXER_QUERY_URL = values.Value(
+ default=None, environ_name="SEARCH_INDEXER_QUERY_URL", environ_prefix=None
+ )
+ SEARCH_INDEXER_QUERY_LIMIT = values.PositiveIntegerValue(
+ default=50, environ_name="SEARCH_INDEXER_QUERY_LIMIT", environ_prefix=None
+ )
+
# Static files (CSS, JavaScript, Images)
STATIC_URL = "/static/"
STATIC_ROOT = os.path.join(DATA_DIR, "static")