diff --git a/.gitignore b/.gitignore index 6eea8d7e7c..16d803b568 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,10 @@ venv.bak/ env.d/development/*.local env.d/terraform +# Docker +compose.override.yml +docker/auth/*.local + # npm node_modules diff --git a/CHANGELOG.md b/CHANGELOG.md index a43001980c..55587ab44d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -97,6 +97,9 @@ and this project adheres to - ♿ update labels and shared document icon accessibility #1442 - 🍱(frontend) Fonts GDPR compliants #1453 - ♻️(service-worker) improve SW registration and update handling #1473 +- ✨(backend) add async indexation of documents on save (or access save) #1276 +- ✨(backend) add debounce mechanism to limit indexation jobs #1276 +- ✨(api) add API route to search for indexed documents in Find #1276 ### Fixed diff --git a/Makefile b/Makefile index 2655167912..c0ea3b59fd 100644 --- a/Makefile +++ b/Makefile @@ -247,6 +247,10 @@ demo: ## flush db then create a demo for load testing purpose @$(MANAGE) create_demo .PHONY: demo +index: ## index all documents to remote search + @$(MANAGE) index +.PHONY: index + # Nota bene: Black should come after isort just in case they don't agree... lint: ## lint back-end python sources lint: \ diff --git a/bin/fernetkey b/bin/fernetkey new file mode 100755 index 0000000000..8bbac1096e --- /dev/null +++ b/bin/fernetkey @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +# shellcheck source=bin/_config.sh +source "$(dirname "${BASH_SOURCE[0]}")/_config.sh" + +_dc_run app-dev python -c 'from cryptography.fernet import Fernet;import sys; sys.stdout.write("\n" + Fernet.generate_key().decode() + "\n");' diff --git a/compose.yml b/compose.yml index a774f11e07..cb5905c330 100644 --- a/compose.yml +++ b/compose.yml @@ -72,6 +72,11 @@ services: - env.d/development/postgresql.local ports: - "8071:8000" + networks: + default: {} + lasuite: + aliases: + - impress volumes: - ./src/backend:/app - ./data/static:/data/static @@ -92,6 +97,9 @@ services: command: ["celery", "-A", "impress.celery_app", "worker", "-l", "DEBUG"] environment: - DJANGO_CONFIGURATION=Development + networks: + - default + - lasuite env_file: - env.d/development/common - env.d/development/common.local @@ -107,6 +115,11 @@ services: image: nginx:1.25 ports: - "8083:8083" + networks: + default: {} + lasuite: + aliases: + - nginx volumes: - ./docker/files/etc/nginx/conf.d:/etc/nginx/conf.d:ro depends_on: @@ -217,3 +230,8 @@ services: kc_postgresql: condition: service_healthy restart: true + +networks: + lasuite: + name: lasuite-network + driver: bridge diff --git a/docs/architecture.md b/docs/architecture.md index 230d32458b..f858eb01d3 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -12,6 +12,7 @@ flowchart TD Back --> DB("Database (PostgreSQL)") Back <--> Celery --> DB Back ----> S3("Minio (S3)") + Back -- REST API --> Find ``` ### Architecture decision records diff --git a/docs/env.md b/docs/env.md index 0b3f9b3bf6..cc09a7142f 100644 --- a/docs/env.md +++ b/docs/env.md @@ -93,6 +93,13 @@ These are the environment variables you can set for the `impress-backend` contai | OIDC_USERINFO_SHORTNAME_FIELD | OIDC token claims to create shortname | first_name | | POSTHOG_KEY | Posthog key for analytics | | | REDIS_URL | Cache url | redis://redis:6379/1 | +| SEARCH_INDEXER_CLASS | Class of the backend for document indexation & search | | +| SEARCH_INDEXER_BATCH_SIZE | Size of each batch for indexation of all documents | 100000 | +| SEARCH_INDEXER_COUNTDOWN | Minimum debounce delay of indexation jobs (in seconds) 
| 1 |
+| SEARCH_INDEXER_URL | Find application endpoint for indexation | |
+| SEARCH_INDEXER_SECRET | Token for indexation queries | |
+| SEARCH_INDEXER_QUERY_URL | Find application endpoint for search | |
+| SEARCH_INDEXER_QUERY_LIMIT | Maximum number of results expected from the search endpoint | 50 |
| SENTRY_DSN | Sentry host | |
| SESSION_COOKIE_AGE | duration of the cookie session | 60*60*12 |
| SPECTACULAR_SETTINGS_ENABLE_DJANGO_DEPLOY_CHECK | | false |
diff --git a/docs/search.md b/docs/search.md
new file mode 100644
index 0000000000..416f972bd3
--- /dev/null
+++ b/docs/search.md
@@ -0,0 +1,41 @@
+# Set up Find search for Impress
+
+This configuration enables the fulltext search feature for Docs:
+- Each save on **core.Document** or **core.DocumentAccess** triggers the indexer.
+- The `api/v1.0/documents/search/` endpoint acts as a proxy to the Find API for fulltext search.
+
+## Create an index service for Docs
+
+Configure a **Service** for the Docs application with these settings:
+
+- **Name**: `docs`
+  _The `request.auth.name` of the Docs application._
+- **Client id**: `impress`
+  _Name of the token audience, i.e. the `client_id` of the Docs application._
+
+See [how-to-use-indexer.md](how-to-use-indexer.md) for details.
+
+## Configure settings of Docs
+
+Add these Django settings to the Docs application to enable the feature.
+
+```shell
+SEARCH_INDEXER_CLASS="core.services.search_indexers.SearchIndexer"
+SEARCH_INDEXER_COUNTDOWN=10 # Debounce delay in seconds for the indexer calls.
+
+# The token of the "docs" service declared in the Find application (development value).
+SEARCH_INDEXER_SECRET="find-api-key-for-docs-with-exactly-50-chars-length"
+SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/"
+
+# Search endpoint. Uses the OIDC token for authentication.
+SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/"
+# Maximum number of results expected from the search endpoint
+SEARCH_INDEXER_QUERY_LIMIT=50
+```
+
+We also need to enable the **OIDC token** storage and refresh, otherwise authentication against Find will fail as soon as the access token expires.
+
+```shell
+# Store OIDC tokens in the session
+OIDC_STORE_ACCESS_TOKEN = True # Store the access token in the session
+OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session
+OIDC_STORE_REFRESH_TOKEN_KEY = "your-32-byte-encryption-key==" # Must be a valid Fernet key (32 url-safe base64-encoded bytes)
+```
diff --git a/docs/system-requirements.md b/docs/system-requirements.md
index 8f2daafb2c..db337d9b23 100644
--- a/docs/system-requirements.md
+++ b/docs/system-requirements.md
@@ -97,6 +97,17 @@ Production deployments differ significantly from development environments. The t
| 5433 | PostgreSQL (Keycloak) |
| 1081 | MailCatcher |

+**With fulltext search service**
+
+| Port | Service |
+| --------- | --------------------- |
+| 8081 | Find (Django) |
+| 9200 | OpenSearch |
+| 9600 | OpenSearch admin |
+| 5601 | OpenSearch dashboard |
+| 25432 | PostgreSQL (Find) |
+
+
## 6. Sizing Guidelines

**RAM** – start at 8 GB dev / 16 GB staging / 32 GB prod. Postgres and Keycloak are the first to OOM; scale them first.
diff --git a/env.d/development/common b/env.d/development/common
index de857d5b2a..cdd734b55f 100644
--- a/env.d/development/common
+++ b/env.d/development/common
@@ -36,6 +36,7 @@ OIDC_OP_JWKS_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/c
OIDC_OP_AUTHORIZATION_ENDPOINT=http://localhost:8083/realms/impress/protocol/openid-connect/auth
OIDC_OP_TOKEN_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/token
OIDC_OP_USER_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/userinfo
+OIDC_OP_INTROSPECTION_ENDPOINT=http://nginx:8083/realms/impress/protocol/openid-connect/token/introspect

OIDC_RP_CLIENT_ID=impress
OIDC_RP_CLIENT_SECRET=ThisIsAnExampleKeyForDevPurposeOnly
@@ -49,6 +50,14 @@ LOGOUT_REDIRECT_URL=http://localhost:3000
OIDC_REDIRECT_ALLOWED_HOSTS=["http://localhost:8083", "http://localhost:3000"]
OIDC_AUTH_REQUEST_EXTRA_PARAMS={"acr_values": "eidas1"}

+# Store OIDC tokens in the session
+OIDC_STORE_ACCESS_TOKEN = True
+OIDC_STORE_REFRESH_TOKEN = True # Store the encrypted refresh token in the session.
+
+# Must be a valid Fernet key (32 url-safe base64-encoded bytes)
+# To create one, use the bin/fernetkey command.
+# OIDC_STORE_REFRESH_TOKEN_KEY="your-32-byte-encryption-key==" + # AI AI_FEATURE_ENABLED=true AI_BASE_URL=https://openaiendpoint.com @@ -68,4 +77,10 @@ Y_PROVIDER_API_BASE_URL=http://y-provider-development:4444/api/ Y_PROVIDER_API_KEY=yprovider-api-key # Theme customization -THEME_CUSTOMIZATION_CACHE_TIMEOUT=15 \ No newline at end of file +THEME_CUSTOMIZATION_CACHE_TIMEOUT=15 + +# Indexer (disabled) +# SEARCH_INDEXER_CLASS="core.services.search_indexers.SearchIndexer" +SEARCH_INDEXER_SECRET=find-api-key-for-docs-with-exactly-50-chars-length # Key generated by create_demo in Find app. +SEARCH_INDEXER_URL="http://find:8000/api/v1.0/documents/index/" +SEARCH_INDEXER_QUERY_URL="http://find:8000/api/v1.0/documents/search/" diff --git a/src/backend/core/api/serializers.py b/src/backend/core/api/serializers.py index 81b26d5e80..45ec34c68e 100644 --- a/src/backend/core/api/serializers.py +++ b/src/backend/core/api/serializers.py @@ -889,3 +889,13 @@ class MoveDocumentSerializer(serializers.Serializer): choices=enums.MoveNodePositionChoices.choices, default=enums.MoveNodePositionChoices.LAST_CHILD, ) + + +class SearchDocumentSerializer(serializers.Serializer): + """Serializer for fulltext search requests through Find application""" + + q = serializers.CharField(required=True, allow_blank=False, trim_whitespace=True) + page_size = serializers.IntegerField( + required=False, min_value=1, max_value=50, default=20 + ) + page = serializers.IntegerField(required=False, min_value=1, default=1) diff --git a/src/backend/core/api/viewsets.py b/src/backend/core/api/viewsets.py index 84402ceaae..0a9d16aaa9 100644 --- a/src/backend/core/api/viewsets.py +++ b/src/backend/core/api/viewsets.py @@ -21,6 +21,7 @@ from django.db.models.functions import Left, Length from django.http import Http404, StreamingHttpResponse from django.urls import reverse +from django.utils.decorators import method_decorator from django.utils.functional import cached_property from django.utils.text import capfirst, slugify from django.utils.translation import gettext_lazy as _ @@ -31,6 +32,7 @@ from csp.constants import NONE from csp.decorators import csp_update from lasuite.malware_detection import malware_detection +from lasuite.oidc_login.decorators import refresh_oidc_access_token from rest_framework import filters, status, viewsets from rest_framework import response as drf_response from rest_framework.permissions import AllowAny @@ -47,6 +49,10 @@ from core.services.converter_services import ( YdocConverter, ) +from core.services.search_indexers import ( + get_document_indexer, + get_visited_document_ids_of, +) from core.tasks.mail import send_ask_for_access_mail from core.utils import extract_attachments, filter_descendants @@ -373,6 +379,7 @@ class DocumentViewSet( list_serializer_class = serializers.ListDocumentSerializer trashbin_serializer_class = serializers.ListDocumentSerializer tree_serializer_class = serializers.ListDocumentSerializer + search_serializer_class = serializers.ListDocumentSerializer def get_queryset(self): """Get queryset performing all annotation and filtering on the document tree structure.""" @@ -1064,6 +1071,83 @@ def duplicate(self, request, *args, **kwargs): {"id": str(duplicated_document.id)}, status=status.HTTP_201_CREATED ) + def _search_simple(self, request, text): + """ + Returns a queryset filtered by the content of the document title + """ + # As the 'list' view we get a prefiltered queryset (deleted docs are excluded) + queryset = self.get_queryset() + filterset = DocumentFilter({"title": 
text}, queryset=queryset) + + if not filterset.is_valid(): + raise drf.exceptions.ValidationError(filterset.errors) + + queryset = filterset.filter_queryset(queryset) + + return self.get_response_for_queryset( + queryset.order_by("-updated_at"), + context={ + "request": request, + }, + ) + + def _search_fulltext(self, indexer, request, params): + """ + Returns a queryset from the results the fulltext search of Find + """ + access_token = request.session.get("oidc_access_token") + user = request.user + text = params.validated_data["q"] + queryset = models.Document.objects.all() + + # Retrieve the documents ids from Find. + results = indexer.search( + text=text, + token=access_token, + visited=get_visited_document_ids_of(queryset, user), + ) + + docs_by_uuid = {str(d.pk): d for d in queryset.filter(pk__in=results)} + ordered_docs = [docs_by_uuid[id] for id in results] + + page = self.paginate_queryset(ordered_docs) + + serializer = self.get_serializer( + page if page else ordered_docs, + many=True, + context={ + "request": request, + }, + ) + + return self.get_paginated_response(serializer.data) + + @drf.decorators.action(detail=False, methods=["get"], url_path="search") + @method_decorator(refresh_oidc_access_token) + def search(self, request, *args, **kwargs): + """ + Returns a DRF response containing the filtered, annotated and ordered document list. + + Applies filtering based on request parameter 'q' from `SearchDocumentSerializer`. + Depending of the configuration it can be: + - A fulltext search through the opensearch indexation app "find" if the backend is + enabled (see SEARCH_INDEXER_CLASS) + - A filtering by the model field 'title'. + + The ordering is always by the most recent first. + """ + params = serializers.SearchDocumentSerializer(data=request.query_params) + params.is_valid(raise_exception=True) + + indexer = get_document_indexer() + + if indexer: + return self._search_fulltext(indexer, request, params=params) + + # The indexer is not configured, we fallback on a simple icontains filter by the + # model field 'title'. + return self._search_simple(request, text=params.validated_data["q"]) + @drf.decorators.action(detail=True, methods=["get"], url_path="versions") def versions_list(self, request, *args, **kwargs): """ diff --git a/src/backend/core/apps.py b/src/backend/core/apps.py index bba7de0f42..b8ce648834 100644 --- a/src/backend/core/apps.py +++ b/src/backend/core/apps.py @@ -1,11 +1,19 @@ """Impress Core application""" -# from django.apps import AppConfig -# from django.utils.translation import gettext_lazy as _ +from django.apps import AppConfig +from django.utils.translation import gettext_lazy as _ -# class CoreConfig(AppConfig): -# """Configuration class for the impress core app.""" -# name = "core" -# app_label = "core" -# verbose_name = _("impress core application") +class CoreConfig(AppConfig): + """Configuration class for the impress core app.""" + + name = "core" + app_label = "core" + verbose_name = _("Impress core application") + + def ready(self): + """ + Import signals when the app is ready. + """ + # pylint: disable=import-outside-toplevel, unused-import + from . import signals # noqa: PLC0415 diff --git a/src/backend/core/management/commands/index.py b/src/backend/core/management/commands/index.py new file mode 100644 index 0000000000..af046e0608 --- /dev/null +++ b/src/backend/core/management/commands/index.py @@ -0,0 +1,52 @@ +""" +Handle search setup that needs to be done at bootstrap time. 
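+
+Usage: python manage.py index [--batch-size N] (also exposed as `make index`).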
+""" + +import logging +import time + +from django.core.management.base import BaseCommand, CommandError + +from core.services.search_indexers import get_document_indexer + +logger = logging.getLogger("docs.search.bootstrap_search") + + +class Command(BaseCommand): + """Index all documents to remote search service""" + + help = __doc__ + + def add_arguments(self, parser): + """Add argument to require forcing execution when not in debug mode.""" + parser.add_argument( + "--batch-size", + action="store", + dest="batch_size", + type=int, + default=50, + help="Indexation query batch size", + ) + + def handle(self, *args, **options): + """Launch and log search index generation.""" + indexer = get_document_indexer() + + if not indexer: + raise CommandError("The indexer is not enabled or properly configured.") + + logger.info("Starting to regenerate Find index...") + start = time.perf_counter() + batch_size = options["batch_size"] + + try: + count = indexer.index(batch_size=batch_size) + except Exception as err: + raise CommandError("Unable to regenerate index") from err + + duration = time.perf_counter() - start + logger.info( + "Search index regenerated from %d document(s) in %.2f seconds.", + count, + duration, + ) diff --git a/src/backend/core/models.py b/src/backend/core/models.py index 6e0ad69e4f..e7994d9212 100644 --- a/src/backend/core/models.py +++ b/src/backend/core/models.py @@ -432,32 +432,35 @@ def __init__(self, *args, **kwargs): def save(self, *args, **kwargs): """Write content to object storage only if _content has changed.""" super().save(*args, **kwargs) - if self._content: - file_key = self.file_key - bytes_content = self._content.encode("utf-8") + self.save_content(self._content) - # Attempt to directly check if the object exists using the storage client. - try: - response = default_storage.connection.meta.client.head_object( - Bucket=default_storage.bucket_name, Key=file_key - ) - except ClientError as excpt: - # If the error is a 404, the object doesn't exist, so we should create it. - if excpt.response["Error"]["Code"] == "404": - has_changed = True - else: - raise + def save_content(self, content): + """Save content to object storage.""" + + file_key = self.file_key + bytes_content = content.encode("utf-8") + + # Attempt to directly check if the object exists using the storage client. + try: + response = default_storage.connection.meta.client.head_object( + Bucket=default_storage.bucket_name, Key=file_key + ) + except ClientError as excpt: + # If the error is a 404, the object doesn't exist, so we should create it. + if excpt.response["Error"]["Code"] == "404": + has_changed = True else: - # Compare the existing ETag with the MD5 hash of the new content. - has_changed = ( - response["ETag"].strip('"') - != hashlib.md5(bytes_content).hexdigest() # noqa: S324 - ) + raise + else: + # Compare the existing ETag with the MD5 hash of the new content. 
+ has_changed = ( + response["ETag"].strip('"') != hashlib.md5(bytes_content).hexdigest() # noqa: S324 + ) - if has_changed: - content_file = ContentFile(bytes_content) - default_storage.save(file_key, content_file) + if has_changed: + content_file = ContentFile(bytes_content) + default_storage.save(file_key, content_file) def is_leaf(self): """ @@ -901,7 +904,8 @@ def soft_delete(self): # Mark all descendants as soft deleted self.get_descendants().filter(ancestors_deleted_at__isnull=True).update( - ancestors_deleted_at=self.ancestors_deleted_at + ancestors_deleted_at=self.ancestors_deleted_at, + updated_at=self.updated_at, ) @transaction.atomic diff --git a/src/backend/core/services/search_indexers.py b/src/backend/core/services/search_indexers.py new file mode 100644 index 0000000000..a4bb9eec6b --- /dev/null +++ b/src/backend/core/services/search_indexers.py @@ -0,0 +1,298 @@ +"""Document search index management utilities and indexers""" + +import logging +from abc import ABC, abstractmethod +from collections import defaultdict +from functools import cache + +from django.conf import settings +from django.contrib.auth.models import AnonymousUser +from django.core.exceptions import ImproperlyConfigured +from django.db.models import Subquery +from django.utils.module_loading import import_string + +import requests + +from core import models, utils + +logger = logging.getLogger(__name__) + + +@cache +def get_document_indexer(): + """Returns an instance of indexer service if enabled and properly configured.""" + classpath = settings.SEARCH_INDEXER_CLASS + + # For this usecase an empty indexer class is not an issue but a feature. + if not classpath: + logger.info("Document indexer is not configured (see SEARCH_INDEXER_CLASS)") + return None + + try: + indexer_class = import_string(settings.SEARCH_INDEXER_CLASS) + return indexer_class() + except ImportError as err: + logger.error("SEARCH_INDEXER_CLASS setting is not valid : %s", err) + except ImproperlyConfigured as err: + logger.error("Document indexer is not properly configured : %s", err) + + return None + + +def get_batch_accesses_by_users_and_teams(paths): + """ + Get accesses related to a list of document paths, + grouped by users and teams, including all ancestor paths. + """ + ancestor_map = utils.get_ancestor_to_descendants_map( + paths, steplen=models.Document.steplen + ) + ancestor_paths = list(ancestor_map.keys()) + + access_qs = models.DocumentAccess.objects.filter( + document__path__in=ancestor_paths + ).values("document__path", "user__sub", "team") + + access_by_document_path = defaultdict(lambda: {"users": set(), "teams": set()}) + + for access in access_qs: + ancestor_path = access["document__path"] + user_sub = access["user__sub"] + team = access["team"] + + for descendant_path in ancestor_map.get(ancestor_path, []): + if user_sub: + access_by_document_path[descendant_path]["users"].add(str(user_sub)) + if team: + access_by_document_path[descendant_path]["teams"].add(team) + + return dict(access_by_document_path) + + +def get_visited_document_ids_of(queryset, user): + """ + Returns the ids of the documents that have a linktrace to the user and NOT owned. + It will be use to limit the opensearch responses to the public documents already + "visited" by the user. 
+ """ + if isinstance(user, AnonymousUser): + return [] + + qs = models.LinkTrace.objects.filter(user=user) + + docs = ( + queryset.exclude(accesses__user=user) + .filter( + deleted_at__isnull=True, + ancestors_deleted_at__isnull=True, + ) + .filter(pk__in=Subquery(qs.values("document_id"))) + .order_by("pk") + .distinct("pk") + ) + + return [str(id) for id in docs.values_list("pk", flat=True)] + + +class BaseDocumentIndexer(ABC): + """ + Base class for document indexers. + + Handles batching and access resolution. Subclasses must implement both + `serialize_document()` and `push()` to define backend-specific behavior. + """ + + def __init__(self): + """ + Initialize the indexer. + """ + self.batch_size = settings.SEARCH_INDEXER_BATCH_SIZE + self.indexer_url = settings.SEARCH_INDEXER_URL + self.indexer_secret = settings.SEARCH_INDEXER_SECRET + self.search_url = settings.SEARCH_INDEXER_QUERY_URL + self.search_limit = settings.SEARCH_INDEXER_QUERY_LIMIT + + if not self.indexer_url: + raise ImproperlyConfigured( + "SEARCH_INDEXER_URL must be set in Django settings." + ) + + if not self.indexer_secret: + raise ImproperlyConfigured( + "SEARCH_INDEXER_SECRET must be set in Django settings." + ) + + if not self.search_url: + raise ImproperlyConfigured( + "SEARCH_INDEXER_QUERY_URL must be set in Django settings." + ) + + def index(self, queryset=None, batch_size=None): + """ + Fetch documents in batches, serialize them, and push to the search backend. + + Args: + queryset (optional): Document queryset + Defaults to all documents without filter. + batch_size (int, optional): Number of documents per batch. + Defaults to settings.SEARCH_INDEXER_BATCH_SIZE. + """ + last_id = 0 + count = 0 + queryset = queryset or models.Document.objects.all() + batch_size = batch_size or self.batch_size + + while True: + documents_batch = list( + queryset.filter( + id__gt=last_id, + ).order_by("id")[:batch_size] + ) + + if not documents_batch: + break + + doc_paths = [doc.path for doc in documents_batch] + last_id = documents_batch[-1].id + accesses_by_document_path = get_batch_accesses_by_users_and_teams(doc_paths) + + serialized_batch = [ + self.serialize_document(document, accesses_by_document_path) + for document in documents_batch + if document.content or document.title + ] + + if serialized_batch: + self.push(serialized_batch) + count += len(serialized_batch) + + return count + + @abstractmethod + def serialize_document(self, document, accesses): + """ + Convert a Document instance to a JSON-serializable format for indexing. + + Must be implemented by subclasses. + """ + + @abstractmethod + def push(self, data): + """ + Push a batch of serialized documents to the backend. + + Must be implemented by subclasses. + """ + + # pylint: disable-next=too-many-arguments,too-many-positional-arguments + def search(self, text, token, visited=(), nb_results=None): + """ + Search for documents in Find app. + Ensure the same default ordering as "Docs" list : -updated_at + + Returns ids of the documents + + Args: + text (str): Text search content. + token (str): OIDC Authentication token. + visited (list, optional): + List of ids of active public documents with LinkTrace + Defaults to settings.SEARCH_INDEXER_BATCH_SIZE. + nb_results (int, optional): + The number of results to return. + Defaults to 50 if not specified. 
+ """ + nb_results = nb_results or self.search_limit + response = self.search_query( + data={ + "q": text, + "visited": visited, + "services": ["docs"], + "nb_results": nb_results, + "order_by": "updated_at", + "order_direction": "desc", + }, + token=token, + ) + + return [d["_id"] for d in response] + + @abstractmethod + def search_query(self, data, token) -> dict: + """ + Retrieve documents from the Find app API. + + Must be implemented by subclasses. + """ + + +class SearchIndexer(BaseDocumentIndexer): + """ + Document indexer that pushes documents to La Suite Find app. + """ + + def serialize_document(self, document, accesses): + """ + Convert a Document to the JSON format expected by La Suite Find. + + Args: + document (Document): The document instance. + accesses (dict): Mapping of document ID to user/team access. + + Returns: + dict: A JSON-serializable dictionary. + """ + doc_path = document.path + doc_content = document.content + text_content = utils.base64_yjs_to_text(doc_content) if doc_content else "" + + return { + "id": str(document.id), + "title": document.title or "", + "content": text_content, + "depth": document.depth, + "path": document.path, + "numchild": document.numchild, + "created_at": document.created_at.isoformat(), + "updated_at": document.updated_at.isoformat(), + "users": list(accesses.get(doc_path, {}).get("users", set())), + "groups": list(accesses.get(doc_path, {}).get("teams", set())), + "reach": document.computed_link_reach, + "size": len(text_content.encode("utf-8")), + "is_active": not bool(document.ancestors_deleted_at), + } + + def search_query(self, data, token) -> requests.Response: + """ + Retrieve documents from the Find app API. + + Args: + data (dict): search data + token (str): OICD token + + Returns: + dict: A JSON-serializable dictionary. + """ + response = requests.post( + self.search_url, + json=data, + headers={"Authorization": f"Bearer {token}"}, + timeout=10, + ) + response.raise_for_status() + return response.json() + + def push(self, data): + """ + Push a batch of documents to the Find backend. + + Args: + data (list): List of document dictionaries. + """ + response = requests.post( + self.indexer_url, + json=data, + headers={"Authorization": f"Bearer {self.indexer_secret}"}, + timeout=10, + ) + response.raise_for_status() diff --git a/src/backend/core/signals.py b/src/backend/core/signals.py new file mode 100644 index 0000000000..4bd9647798 --- /dev/null +++ b/src/backend/core/signals.py @@ -0,0 +1,33 @@ +""" +Declare and configure the signals for the impress core application +""" + +from functools import partial + +from django.db import transaction +from django.db.models import signals +from django.dispatch import receiver + +from . import models +from .tasks.search import trigger_batch_document_indexer + + +@receiver(signals.post_save, sender=models.Document) +def document_post_save(sender, instance, **kwargs): # pylint: disable=unused-argument + """ + Asynchronous call to the document indexer at the end of the transaction. + Note : Within the transaction we can have an empty content and a serialization + error. + """ + transaction.on_commit(partial(trigger_batch_document_indexer, instance)) + + +@receiver(signals.post_save, sender=models.DocumentAccess) +def document_access_post_save(sender, instance, created, **kwargs): # pylint: disable=unused-argument + """ + Asynchronous call to the document indexer at the end of the transaction. 
+ """ + if not created: + transaction.on_commit( + partial(trigger_batch_document_indexer, instance.document) + ) diff --git a/src/backend/core/tasks/search.py b/src/backend/core/tasks/search.py new file mode 100644 index 0000000000..4b30c6a7de --- /dev/null +++ b/src/backend/core/tasks/search.py @@ -0,0 +1,95 @@ +"""Trigger document indexation using celery task.""" + +from logging import getLogger + +from django.conf import settings +from django.core.cache import cache +from django.db.models import Q + +from django_redis.cache import RedisCache + +from core import models +from core.services.search_indexers import ( + get_document_indexer, +) + +from impress.celery_app import app + +logger = getLogger(__file__) + + +@app.task +def document_indexer_task(document_id): + """Celery Task : Sends indexation query for a document.""" + indexer = get_document_indexer() + + if indexer: + logger.info("Start document %s indexation", document_id) + indexer.index(models.Document.objects.filter(pk=document_id)) + + +def batch_indexer_throttle_acquire(timeout: int = 0, atomic: bool = True): + """ + Enable the task throttle flag for a delay. + Uses redis locks if available to ensure atomic changes + """ + key = "document-batch-indexer-throttle" + + # Redis is used as cache database (not in tests). Use the lock feature here + # to ensure atomicity of changes to the throttle flag. + if isinstance(cache, RedisCache) and atomic: + with cache.locks(key): + return batch_indexer_throttle_acquire(timeout, atomic=False) + + # Use add() here : + # - set the flag and returns true if not exist + # - do nothing and return false if exist + return cache.add(key, 1, timeout=timeout) + + +@app.task +def batch_document_indexer_task(timestamp): + """Celery Task : Sends indexation query for a batch of documents.""" + indexer = get_document_indexer() + + if indexer: + queryset = models.Document.objects.filter( + Q(updated_at__gte=timestamp) + | Q(deleted_at__gte=timestamp) + | Q(ancestors_deleted_at__gte=timestamp) + ) + + count = indexer.index(queryset) + logger.info("Indexed %d documents", count) + + +def trigger_batch_document_indexer(item): + """ + Trigger indexation task with debounce a delay set by the SEARCH_INDEXER_COUNTDOWN setting. + + Args: + document (Document): The document instance. + """ + countdown = int(settings.SEARCH_INDEXER_COUNTDOWN) + + # DO NOT create a task if indexation if disabled + if not settings.SEARCH_INDEXER_CLASS: + return + + if countdown > 0: + # Each time this method is called during a countdown, we increment the + # counter and each task decrease it, so the index be run only once. + if batch_indexer_throttle_acquire(timeout=countdown): + logger.info( + "Add task for batch document indexation from updated_at=%s in %d seconds", + item.updated_at.isoformat(), + countdown, + ) + + batch_document_indexer_task.apply_async( + args=[item.updated_at], countdown=countdown + ) + else: + logger.info("Skip task for batch document %s indexation", item.pk) + else: + document_indexer_task.apply(args=[item.pk]) diff --git a/src/backend/core/tests/commands/test_index.py b/src/backend/core/tests/commands/test_index.py new file mode 100644 index 0000000000..ad7d39e6e0 --- /dev/null +++ b/src/backend/core/tests/commands/test_index.py @@ -0,0 +1,65 @@ +""" +Unit test for `index` command. 
+""" + +from operator import itemgetter +from unittest import mock + +from django.core.management import CommandError, call_command +from django.db import transaction + +import pytest + +from core import factories +from core.services.search_indexers import SearchIndexer + + +@pytest.mark.django_db +@pytest.mark.usefixtures("indexer_settings") +def test_index(): + """Test the command `index` that run the Find app indexer for all the available documents.""" + user = factories.UserFactory() + indexer = SearchIndexer() + + with transaction.atomic(): + doc = factories.DocumentFactory() + empty_doc = factories.DocumentFactory(title=None, content="") + no_title_doc = factories.DocumentFactory(title=None) + + factories.UserDocumentAccessFactory(document=doc, user=user) + factories.UserDocumentAccessFactory(document=empty_doc, user=user) + factories.UserDocumentAccessFactory(document=no_title_doc, user=user) + + accesses = { + str(doc.path): {"users": [user.sub]}, + str(empty_doc.path): {"users": [user.sub]}, + str(no_title_doc.path): {"users": [user.sub]}, + } + + with mock.patch.object(SearchIndexer, "push") as mock_push: + call_command("index") + + push_call_args = [call.args[0] for call in mock_push.call_args_list] + + # called once but with a batch of docs + mock_push.assert_called_once() + + assert sorted(push_call_args[0], key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(doc, accesses), + indexer.serialize_document(no_title_doc, accesses), + ], + key=itemgetter("id"), + ) + + +@pytest.mark.django_db +@pytest.mark.usefixtures("indexer_settings") +def test_index_improperly_configured(indexer_settings): + """The command should raise an exception if the indexer is not configured""" + indexer_settings.SEARCH_INDEXER_CLASS = None + + with pytest.raises(CommandError) as err: + call_command("index") + + assert str(err.value) == "The indexer is not enabled or properly configured." diff --git a/src/backend/core/tests/conftest.py b/src/backend/core/tests/conftest.py index 00e830e18f..65e3926934 100644 --- a/src/backend/core/tests/conftest.py +++ b/src/backend/core/tests/conftest.py @@ -24,3 +24,30 @@ def mock_user_teams(): "core.models.User.teams", new_callable=mock.PropertyMock ) as mock_teams: yield mock_teams + + +@pytest.fixture(name="indexer_settings") +def indexer_settings_fixture(settings): + """ + Setup valid settings for the document indexer. Clear the indexer cache. 
+ """ + + # pylint: disable-next=import-outside-toplevel + from core.services.search_indexers import ( # noqa: PLC0415 + get_document_indexer, + ) + + get_document_indexer.cache_clear() + + settings.SEARCH_INDEXER_CLASS = "core.services.search_indexers.SearchIndexer" + settings.SEARCH_INDEXER_SECRET = "ThisIsAKeyForTest" + settings.SEARCH_INDEXER_URL = "http://localhost:8081/api/v1.0/documents/index/" + settings.SEARCH_INDEXER_QUERY_URL = ( + "http://localhost:8081/api/v1.0/documents/search/" + ) + settings.SEARCH_INDEXER_COUNTDOWN = 1 + + yield settings + + # clear cache to prevent issues with other tests + get_document_indexer.cache_clear() diff --git a/src/backend/core/tests/documents/test_api_documents_search.py b/src/backend/core/tests/documents/test_api_documents_search.py new file mode 100644 index 0000000000..c6d0d8e3ac --- /dev/null +++ b/src/backend/core/tests/documents/test_api_documents_search.py @@ -0,0 +1,425 @@ +""" +Tests for Documents API endpoint in impress's core app: list +""" + +import random +from json import loads as json_loads + +from django.test import RequestFactory + +import pytest +import responses +from faker import Faker +from rest_framework.test import APIClient + +from core import factories, models +from core.services.search_indexers import get_document_indexer + +fake = Faker() +pytestmark = pytest.mark.django_db + + +def build_search_url(**kwargs): + """Build absolute uri for search endpoint with ORDERED query arguments""" + return ( + RequestFactory() + .get("/api/v1.0/documents/search/", dict(sorted(kwargs.items()))) + .build_absolute_uri() + ) + + +@pytest.mark.parametrize("role", models.LinkRoleChoices.values) +@pytest.mark.parametrize("reach", models.LinkReachChoices.values) +@responses.activate +def test_api_documents_search_anonymous(reach, role, indexer_settings): + """ + Anonymous users should not be allowed to search documents whatever the + link reach and link role + """ + indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + factories.DocumentFactory(link_reach=reach, link_role=role) + + # Find response + responses.add( + responses.POST, + "http://find/api/v1.0/search", + json=[], + status=200, + ) + + response = APIClient().get("/api/v1.0/documents/search/", data={"q": "alpha"}) + + assert response.status_code == 200 + assert response.json() == { + "count": 0, + "next": None, + "previous": None, + "results": [], + } + + +def test_api_documents_search_endpoint_is_none(indexer_settings): + """ + Missing SEARCH_INDEXER_QUERY_URL, so the indexer is not properly configured. 
+ Should fallback on title filter + """ + indexer_settings.SEARCH_INDEXER_QUERY_URL = None + + assert get_document_indexer() is None + + user = factories.UserFactory() + document = factories.DocumentFactory(title="alpha") + access = factories.UserDocumentAccessFactory(document=document, user=user) + + client = APIClient() + client.force_login(user) + + response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"}) + + assert response.status_code == 200 + content = response.json() + results = content.pop("results") + assert content == { + "count": 1, + "next": None, + "previous": None, + } + assert len(results) == 1 + assert results[0] == { + "id": str(document.id), + "abilities": document.get_abilities(user), + "ancestors_link_reach": None, + "ancestors_link_role": None, + "computed_link_reach": document.computed_link_reach, + "computed_link_role": document.computed_link_role, + "created_at": document.created_at.isoformat().replace("+00:00", "Z"), + "creator": str(document.creator.id), + "depth": 1, + "excerpt": document.excerpt, + "link_reach": document.link_reach, + "link_role": document.link_role, + "nb_accesses_ancestors": 1, + "nb_accesses_direct": 1, + "numchild": 0, + "path": document.path, + "title": document.title, + "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"), + "deleted_at": None, + "user_role": access.role, + } + + +@responses.activate +def test_api_documents_search_invalid_params(indexer_settings): + """Validate the format of documents as returned by the search view.""" + indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + response = client.get("/api/v1.0/documents/search/") + + assert response.status_code == 400 + assert response.json() == {"q": ["This field is required."]} + + response = client.get("/api/v1.0/documents/search/", data={"q": " "}) + + assert response.status_code == 400 + assert response.json() == {"q": ["This field may not be blank."]} + + response = client.get( + "/api/v1.0/documents/search/", data={"q": "any", "page": "NaN"} + ) + + assert response.status_code == 400 + assert response.json() == {"page": ["A valid integer is required."]} + + +@responses.activate +def test_api_documents_search_format(indexer_settings): + """Validate the format of documents as returned by the search view.""" + indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + assert get_document_indexer() is not None + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + user_a, user_b, user_c = factories.UserFactory.create_batch(3) + document = factories.DocumentFactory( + title="alpha", + users=(user_a, user_c), + link_traces=(user, user_b), + ) + access = factories.UserDocumentAccessFactory(document=document, user=user) + + # Find response + responses.add( + responses.POST, + "http://find/api/v1.0/search", + json=[ + {"_id": str(document.pk)}, + ], + status=200, + ) + response = client.get("/api/v1.0/documents/search/", data={"q": "alpha"}) + + assert response.status_code == 200 + content = response.json() + results = content.pop("results") + assert content == { + "count": 1, + "next": None, + "previous": None, + } + assert len(results) == 1 + assert results[0] == { + "id": str(document.id), + "abilities": document.get_abilities(user), + "ancestors_link_reach": None, + "ancestors_link_role": None, + "computed_link_reach": document.computed_link_reach, + "computed_link_role": 
document.computed_link_role, + "created_at": document.created_at.isoformat().replace("+00:00", "Z"), + "creator": str(document.creator.id), + "depth": 1, + "excerpt": document.excerpt, + "link_reach": document.link_reach, + "link_role": document.link_role, + "nb_accesses_ancestors": 3, + "nb_accesses_direct": 3, + "numchild": 0, + "path": document.path, + "title": document.title, + "updated_at": document.updated_at.isoformat().replace("+00:00", "Z"), + "deleted_at": None, + "user_role": access.role, + } + + +@responses.activate +@pytest.mark.parametrize( + "pagination, status, expected", + ( + ( + {"page": 1, "page_size": 10}, + 200, + { + "count": 10, + "previous": None, + "next": None, + "range": (0, None), + }, + ), + ( + {}, + 200, + { + "count": 10, + "previous": None, + "next": None, + "range": (0, None), + "api_page_size": 21, # default page_size is 20 + }, + ), + ( + {"page": 2, "page_size": 10}, + 404, + {}, + ), + ( + {"page": 1, "page_size": 5}, + 200, + { + "count": 10, + "previous": None, + "next": {"page": 2, "page_size": 5}, + "range": (0, 5), + }, + ), + ( + {"page": 2, "page_size": 5}, + 200, + { + "count": 10, + "previous": {"page_size": 5}, + "next": None, + "range": (5, None), + }, + ), + ({"page": 3, "page_size": 5}, 404, {}), + ), +) +def test_api_documents_search_pagination( + indexer_settings, pagination, status, expected +): + """Documents should be ordered by descending "score" by default""" + indexer_settings.SEARCH_INDEXER_QUERY_URL = "http://find/api/v1.0/search" + + assert get_document_indexer() is not None + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + docs = factories.DocumentFactory.create_batch(10, title="alpha", users=[user]) + + docs_by_uuid = {str(doc.pk): doc for doc in docs} + api_results = [{"_id": id} for id in docs_by_uuid.keys()] + + # reorder randomly to simulate score ordering + random.shuffle(api_results) + + # Find response + # pylint: disable-next=assignment-from-none + api_search = responses.add( + responses.POST, + "http://find/api/v1.0/search", + json=api_results, + status=200, + ) + + response = client.get( + "/api/v1.0/documents/search/", + data={ + "q": "alpha", + **pagination, + }, + ) + + assert response.status_code == status + + if response.status_code < 300: + previous_url = ( + build_search_url(q="alpha", **expected["previous"]) + if expected["previous"] + else None + ) + next_url = ( + build_search_url(q="alpha", **expected["next"]) + if expected["next"] + else None + ) + start, end = expected["range"] + + content = response.json() + + assert content["count"] == expected["count"] + assert content["previous"] == previous_url + assert content["next"] == next_url + + results = content.pop("results") + + # The find api results ordering by score is kept + assert [r["id"] for r in results] == [r["_id"] for r in api_results[start:end]] + + # Check the query parameters. 
+ assert api_search.call_count == 1 + assert api_search.calls[0].response.status_code == 200 + assert json_loads(api_search.calls[0].request.body) == { + "q": "alpha", + "visited": [], + "services": ["docs"], + "nb_results": 50, + "order_by": "updated_at", + "order_direction": "desc", + } + + +@responses.activate +@pytest.mark.parametrize( + "pagination, status, expected", + ( + ( + {"page": 1, "page_size": 10}, + 200, + {"count": 10, "previous": None, "next": None, "range": (0, None)}, + ), + ( + {}, + 200, + {"count": 10, "previous": None, "next": None, "range": (0, None)}, + ), + ( + {"page": 2, "page_size": 10}, + 404, + {}, + ), + ( + {"page": 1, "page_size": 5}, + 200, + { + "count": 10, + "previous": None, + "next": {"page": 2, "page_size": 5}, + "range": (0, 5), + }, + ), + ( + {"page": 2, "page_size": 5}, + 200, + { + "count": 10, + "previous": {"page_size": 5}, + "next": None, + "range": (5, None), + }, + ), + ({"page": 3, "page_size": 5}, 404, {}), + ), +) +def test_api_documents_search_pagination_endpoint_is_none( + indexer_settings, pagination, status, expected +): + """Documents should be ordered by descending "-updated_at" by default""" + indexer_settings.SEARCH_INDEXER_QUERY_URL = None + + assert get_document_indexer() is None + + user = factories.UserFactory() + + client = APIClient() + client.force_login(user) + + factories.DocumentFactory.create_batch(10, title="alpha", users=[user]) + + response = client.get( + "/api/v1.0/documents/search/", + data={ + "q": "alpha", + **pagination, + }, + ) + + assert response.status_code == status + + if response.status_code < 300: + previous_url = ( + build_search_url(q="alpha", **expected["previous"]) + if expected["previous"] + else None + ) + next_url = ( + build_search_url(q="alpha", **expected["next"]) + if expected["next"] + else None + ) + queryset = models.Document.objects.order_by("-updated_at") + start, end = expected["range"] + expected_results = [str(d.pk) for d in queryset[start:end]] + + content = response.json() + + assert content["count"] == expected["count"] + assert content["previous"] == previous_url + assert content["next"] == next_url + + results = content.pop("results") + + assert [r["id"] for r in results] == expected_results diff --git a/src/backend/core/tests/test_models_documents_indexer.py b/src/backend/core/tests/test_models_documents_indexer.py new file mode 100644 index 0000000000..9e171f724d --- /dev/null +++ b/src/backend/core/tests/test_models_documents_indexer.py @@ -0,0 +1,441 @@ +""" +Unit tests for the Document model +""" +# pylint: disable=too-many-lines + +from operator import itemgetter +from unittest import mock + +from django.core.cache import cache +from django.db import transaction + +import pytest + +from core import factories, models +from core.services.search_indexers import SearchIndexer + +pytestmark = pytest.mark.django_db + + +def reset_batch_indexer_throttle(): + """Reset throttle flag""" + cache.delete("document-batch-indexer-throttle") + + +@pytest.fixture(autouse=True) +def reset_throttle(): + """Reset throttle flag before each test""" + reset_batch_indexer_throttle() + yield + reset_batch_indexer_throttle() + + +@mock.patch.object(SearchIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer(mock_push): + """Test indexation task on document creation""" + with transaction.atomic(): + doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3) + + accesses = {} + data = [call.args[0] for 
call in mock_push.call_args_list] + + indexer = SearchIndexer() + + assert len(data) == 1 + + # One call + assert sorted(data[0], key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(doc1, accesses), + indexer.serialize_document(doc2, accesses), + indexer.serialize_document(doc3, accesses), + ], + key=itemgetter("id"), + ) + + # The throttle counters should be reset + assert cache.get("document-batch-indexer-throttle") == 1 + + +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_no_batches(indexer_settings): + """Test indexation task on doculment creation, no throttle""" + indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 + + with mock.patch.object(SearchIndexer, "push") as mock_push: + with transaction.atomic(): + doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3) + + accesses = {} + data = [call.args[0] for call in mock_push.call_args_list] + + indexer = SearchIndexer() + + # 3 calls + assert len(data) == 3 + # one document per call + assert [len(d) for d in data] == [1] * 3 + # all documents are indexed + assert sorted([d[0] for d in data], key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(doc1, accesses), + indexer.serialize_document(doc2, accesses), + indexer.serialize_document(doc3, accesses), + ], + key=itemgetter("id"), + ) + + # The throttle counters should be reset + assert cache.get("file-batch-indexer-throttle") is None + + +@mock.patch.object(SearchIndexer, "push") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_not_configured(mock_push, indexer_settings): + """Task should not start an indexation when disabled""" + indexer_settings.SEARCH_INDEXER_CLASS = None + + user = factories.UserFactory() + + with transaction.atomic(): + doc = factories.DocumentFactory() + factories.UserDocumentAccessFactory(document=doc, user=user) + + assert mock_push.assert_not_called + + +@mock.patch.object(SearchIndexer, "push") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_wrongly_configured( + mock_push, indexer_settings +): + """Task should not start an indexation when disabled""" + indexer_settings.SEARCH_INDEXER_URL = None + + user = factories.UserFactory() + + with transaction.atomic(): + doc = factories.DocumentFactory() + factories.UserDocumentAccessFactory(document=doc, user=user) + + assert mock_push.assert_not_called + + +@mock.patch.object(SearchIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_with_accesses(mock_push): + """Test indexation task on document creation""" + user = factories.UserFactory() + + with transaction.atomic(): + doc1, doc2, doc3 = factories.DocumentFactory.create_batch(3) + + factories.UserDocumentAccessFactory(document=doc1, user=user) + factories.UserDocumentAccessFactory(document=doc2, user=user) + factories.UserDocumentAccessFactory(document=doc3, user=user) + + accesses = { + str(doc1.path): {"users": [user.sub]}, + str(doc2.path): {"users": [user.sub]}, + str(doc3.path): {"users": [user.sub]}, + } + + data = [call.args[0] for call in mock_push.call_args_list] + + indexer = SearchIndexer() + + assert len(data) == 1 + assert sorted(data[0], key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(doc1, accesses), + indexer.serialize_document(doc2, accesses), + indexer.serialize_document(doc3, accesses), + ], + key=itemgetter("id"), + ) + + +@mock.patch.object(SearchIndexer, "push") 
+@pytest.mark.usefixtures("indexer_settings") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_deleted(mock_push): + """Indexation task on deleted or ancestor_deleted documents""" + user = factories.UserFactory() + + with transaction.atomic(): + doc = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + main_doc = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + child_doc = factories.DocumentFactory( + parent=main_doc, + link_reach=models.LinkReachChoices.AUTHENTICATED, + ) + + factories.UserDocumentAccessFactory(document=doc, user=user) + factories.UserDocumentAccessFactory(document=main_doc, user=user) + factories.UserDocumentAccessFactory(document=child_doc, user=user) + + # Manually reset the throttle flag here or the next indexation will be ignored for 1 second + reset_batch_indexer_throttle() + + with transaction.atomic(): + main_doc_deleted = models.Document.objects.get(pk=main_doc.pk) + main_doc_deleted.soft_delete() + + child_doc_deleted = models.Document.objects.get(pk=child_doc.pk) + + main_doc_deleted.refresh_from_db() + child_doc_deleted.refresh_from_db() + + assert main_doc_deleted.deleted_at is not None + assert child_doc_deleted.ancestors_deleted_at is not None + + assert child_doc_deleted.deleted_at is None + assert child_doc_deleted.ancestors_deleted_at is not None + + accesses = { + str(doc.path): {"users": [user.sub]}, + str(main_doc_deleted.path): {"users": [user.sub]}, + str(child_doc_deleted.path): {"users": [user.sub]}, + } + + data = [call.args[0] for call in mock_push.call_args_list] + + indexer = SearchIndexer() + + assert len(data) == 2 + + # First indexation on document creation + assert sorted(data[0], key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(doc, accesses), + indexer.serialize_document(main_doc, accesses), + indexer.serialize_document(child_doc, accesses), + ], + key=itemgetter("id"), + ) + + # Even deleted items are re-indexed : only update their status in the future + assert sorted(data[1], key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(main_doc_deleted, accesses), # soft_delete() + indexer.serialize_document(child_doc_deleted, accesses), + ], + key=itemgetter("id"), + ) + + +@pytest.mark.django_db(transaction=True) +@pytest.mark.usefixtures("indexer_settings") +def test_models_documents_indexer_hard_deleted(): + """Indexation task on hard deleted document""" + user = factories.UserFactory() + + with transaction.atomic(): + doc = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + factories.UserDocumentAccessFactory(document=doc, user=user) + + # Call task on deleted document. + with mock.patch.object(SearchIndexer, "push") as mock_push: + doc.delete() + + # Hard delete document are not re-indexed. 
+ assert mock_push.assert_not_called + + +@mock.patch.object(SearchIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +@pytest.mark.django_db(transaction=True) +def test_models_documents_post_save_indexer_restored(mock_push): + """Restart indexation task on restored documents""" + user = factories.UserFactory() + + with transaction.atomic(): + doc = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + doc_deleted = factories.DocumentFactory( + link_reach=models.LinkReachChoices.AUTHENTICATED + ) + doc_ancestor_deleted = factories.DocumentFactory( + parent=doc_deleted, + link_reach=models.LinkReachChoices.AUTHENTICATED, + ) + + factories.UserDocumentAccessFactory(document=doc, user=user) + factories.UserDocumentAccessFactory(document=doc_deleted, user=user) + factories.UserDocumentAccessFactory(document=doc_ancestor_deleted, user=user) + + doc_deleted.soft_delete() + + doc_deleted.refresh_from_db() + doc_ancestor_deleted.refresh_from_db() + + assert doc_deleted.deleted_at is not None + assert doc_deleted.ancestors_deleted_at is not None + + assert doc_ancestor_deleted.deleted_at is None + assert doc_ancestor_deleted.ancestors_deleted_at is not None + + # Manually reset the throttle flag here or the next indexation will be ignored for 1 second + reset_batch_indexer_throttle() + + with transaction.atomic(): + doc_restored = models.Document.objects.get(pk=doc_deleted.pk) + doc_restored.restore() + + doc_ancestor_restored = models.Document.objects.get(pk=doc_ancestor_deleted.pk) + + assert doc_restored.deleted_at is None + assert doc_restored.ancestors_deleted_at is None + + assert doc_ancestor_restored.deleted_at is None + assert doc_ancestor_restored.ancestors_deleted_at is None + + accesses = { + str(doc.path): {"users": [user.sub]}, + str(doc_deleted.path): {"users": [user.sub]}, + str(doc_ancestor_deleted.path): {"users": [user.sub]}, + } + + data = [call.args[0] for call in mock_push.call_args_list] + + indexer = SearchIndexer() + + # All docs are re-indexed + assert len(data) == 2 + + # First indexation on items creation & soft delete (in the same transaction) + assert sorted(data[0], key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(doc, accesses), + indexer.serialize_document(doc_deleted, accesses), + indexer.serialize_document(doc_ancestor_deleted, accesses), + ], + key=itemgetter("id"), + ) + + # Restored items are re-indexed : only update their status in the future + assert sorted(data[1], key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(doc_restored, accesses), # restore() + indexer.serialize_document(doc_ancestor_restored, accesses), + ], + key=itemgetter("id"), + ) + + +@pytest.mark.django_db(transaction=True) +@pytest.mark.usefixtures("indexer_settings") +def test_models_documents_post_save_indexer_throttle(): + """Test indexation task skipping on document update""" + indexer = SearchIndexer() + user = factories.UserFactory() + + with mock.patch.object(SearchIndexer, "push"): + with transaction.atomic(): + docs = factories.DocumentFactory.create_batch(5, users=(user,)) + + accesses = {str(item.path): {"users": [user.sub]} for item in docs} + + with mock.patch.object(SearchIndexer, "push") as mock_push: + # Simulate 1 running task + cache.set("document-batch-indexer-throttle", 1) + + # save doc to trigger the indexer, but nothing should be done since + # the flag is up + with transaction.atomic(): + docs[0].save() + docs[2].save() + docs[3].save() + + assert [call.args[0] for call in 
mock_push.call_args_list] == [] + + with mock.patch.object(SearchIndexer, "push") as mock_push: + # No waiting task + cache.delete("document-batch-indexer-throttle") + + with transaction.atomic(): + docs[0].save() + docs[2].save() + docs[3].save() + + data = [call.args[0] for call in mock_push.call_args_list] + + # One call + assert len(data) == 1 + + assert sorted(data[0], key=itemgetter("id")) == sorted( + [ + indexer.serialize_document(docs[0], accesses), + indexer.serialize_document(docs[2], accesses), + indexer.serialize_document(docs[3], accesses), + ], + key=itemgetter("id"), + ) + + +@pytest.mark.django_db(transaction=True) +@pytest.mark.usefixtures("indexer_settings") +def test_models_documents_access_post_save_indexer(): + """Test indexation task on DocumentAccess update""" + users = factories.UserFactory.create_batch(3) + + with mock.patch.object(SearchIndexer, "push"): + with transaction.atomic(): + doc = factories.DocumentFactory(users=users) + doc_accesses = models.DocumentAccess.objects.filter(document=doc).order_by( + "user__sub" + ) + + reset_batch_indexer_throttle() + + with mock.patch.object(SearchIndexer, "push") as mock_push: + with transaction.atomic(): + for doc_access in doc_accesses: + doc_access.save() + + data = [call.args[0] for call in mock_push.call_args_list] + + # One call + assert len(data) == 1 + + assert [d["id"] for d in data[0]] == [str(doc.pk)] + + +@pytest.mark.django_db(transaction=True) +def test_models_items_access_post_save_indexer_no_throttle(indexer_settings): + """Test indexation task on ItemAccess update, no throttle""" + indexer_settings.SEARCH_INDEXER_COUNTDOWN = 0 + + users = factories.UserFactory.create_batch(3) + + with transaction.atomic(): + doc = factories.DocumentFactory(users=users) + doc_accesses = models.DocumentAccess.objects.filter(document=doc).order_by( + "user__sub" + ) + + reset_batch_indexer_throttle() + + with mock.patch.object(SearchIndexer, "push") as mock_push: + with transaction.atomic(): + for doc_access in doc_accesses: + doc_access.save() + + data = [call.args[0] for call in mock_push.call_args_list] + + # 3 calls + assert len(data) == 3 + # one document per call + assert [len(d) for d in data] == [1] * 3 + # the same document is indexed 3 times + assert [d[0]["id"] for d in data] == [str(doc.pk)] * 3 diff --git a/src/backend/core/tests/test_services_search_indexers.py b/src/backend/core/tests/test_services_search_indexers.py new file mode 100644 index 0000000000..61488a921b --- /dev/null +++ b/src/backend/core/tests/test_services_search_indexers.py @@ -0,0 +1,635 @@ +"""Tests for Documents search indexers""" + +from functools import partial +from json import dumps as json_dumps +from unittest.mock import patch + +from django.contrib.auth.models import AnonymousUser +from django.core.exceptions import ImproperlyConfigured +from django.utils.module_loading import import_string + +import pytest +import responses +from requests import HTTPError + +from core import factories, models, utils +from core.services.search_indexers import ( + BaseDocumentIndexer, + SearchIndexer, + get_document_indexer, + get_visited_document_ids_of, +) + +pytestmark = pytest.mark.django_db + + +class FakeDocumentIndexer(BaseDocumentIndexer): + """Fake indexer for test purpose""" + + def serialize_document(self, document, accesses): + return {} + + def push(self, data): + pass + + def search_query(self, data, token): + return {} + + +def test_services_search_indexer_class_invalid(indexer_settings): + """ + Should raise RuntimeError if 
SEARCH_INDEXER_CLASS cannot be imported.
+    """
+    indexer_settings.SEARCH_INDEXER_CLASS = "unknown.Unknown"
+
+    assert get_document_indexer() is None
+
+
+def test_services_search_indexer_class(indexer_settings):
+    """
+    Import the indexer class defined in the SEARCH_INDEXER_CLASS setting.
+    """
+    indexer_settings.SEARCH_INDEXER_CLASS = (
+        "core.tests.test_services_search_indexers.FakeDocumentIndexer"
+    )
+
+    assert isinstance(
+        get_document_indexer(),
+        import_string("core.tests.test_services_search_indexers.FakeDocumentIndexer"),
+    )
+
+
+def test_services_search_indexer_is_configured(indexer_settings):
+    """
+    Should return an indexer only when the indexer class and other configuration
+    settings are valid.
+    """
+    indexer_settings.SEARCH_INDEXER_CLASS = None
+
+    # None
+    get_document_indexer.cache_clear()
+    assert not get_document_indexer()
+
+    # Empty
+    indexer_settings.SEARCH_INDEXER_CLASS = ""
+
+    get_document_indexer.cache_clear()
+    assert not get_document_indexer()
+
+    # Valid class
+    indexer_settings.SEARCH_INDEXER_CLASS = (
+        "core.services.search_indexers.SearchIndexer"
+    )
+
+    get_document_indexer.cache_clear()
+    assert get_document_indexer() is not None
+
+    indexer_settings.SEARCH_INDEXER_URL = ""
+
+    # Invalid url
+    get_document_indexer.cache_clear()
+    assert not get_document_indexer()
+
+
+def test_services_search_indexer_url_is_none(indexer_settings):
+    """
+    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_URL is None.
+    """
+    indexer_settings.SEARCH_INDEXER_URL = None
+
+    with pytest.raises(ImproperlyConfigured) as exc_info:
+        SearchIndexer()
+
+    assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
+
+
+def test_services_search_indexer_url_is_empty(indexer_settings):
+    """
+    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_URL is an empty string.
+    """
+    indexer_settings.SEARCH_INDEXER_URL = ""
+
+    with pytest.raises(ImproperlyConfigured) as exc_info:
+        SearchIndexer()
+
+    assert "SEARCH_INDEXER_URL must be set in Django settings." in str(exc_info.value)
+
+
+def test_services_search_indexer_secret_is_none(indexer_settings):
+    """
+    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_SECRET is None.
+    """
+    indexer_settings.SEARCH_INDEXER_SECRET = None
+
+    with pytest.raises(ImproperlyConfigured) as exc_info:
+        SearchIndexer()
+
+    assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
+        exc_info.value
+    )
+
+
+def test_services_search_indexer_secret_is_empty(indexer_settings):
+    """
+    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_SECRET is an empty string.
+    """
+    indexer_settings.SEARCH_INDEXER_SECRET = ""
+
+    with pytest.raises(ImproperlyConfigured) as exc_info:
+        SearchIndexer()
+
+    assert "SEARCH_INDEXER_SECRET must be set in Django settings." in str(
+        exc_info.value
+    )
+
+
+def test_services_search_endpoint_is_none(indexer_settings):
+    """
+    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_QUERY_URL is None.
+    """
+    indexer_settings.SEARCH_INDEXER_QUERY_URL = None
+
+    with pytest.raises(ImproperlyConfigured) as exc_info:
+        SearchIndexer()
+
+    assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings." in str(
+        exc_info.value
+    )
+
+
+def test_services_search_endpoint_is_empty(indexer_settings):
+    """
+    Indexer should raise ImproperlyConfigured if SEARCH_INDEXER_QUERY_URL is empty.
+    """
+    indexer_settings.SEARCH_INDEXER_QUERY_URL = ""
+
+    with pytest.raises(ImproperlyConfigured) as exc_info:
+        SearchIndexer()
+
+    assert "SEARCH_INDEXER_QUERY_URL must be set in Django settings."
in str(
+        exc_info.value
+    )
+
+
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_serialize_document_returns_expected_json():
+    """
+    It should serialize documents with correct metadata and access control.
+    """
+    user_a, user_b = factories.UserFactory.create_batch(2)
+    document = factories.DocumentFactory()
+    factories.DocumentFactory(parent=document)
+
+    factories.UserDocumentAccessFactory(document=document, user=user_a)
+    factories.UserDocumentAccessFactory(document=document, user=user_b)
+    factories.TeamDocumentAccessFactory(document=document, team="team1")
+    factories.TeamDocumentAccessFactory(document=document, team="team2")
+
+    accesses = {
+        document.path: {
+            "users": {str(user_a.sub), str(user_b.sub)},
+            "teams": {"team1", "team2"},
+        }
+    }
+
+    indexer = SearchIndexer()
+    result = indexer.serialize_document(document, accesses)
+
+    assert set(result.pop("users")) == {str(user_a.sub), str(user_b.sub)}
+    assert set(result.pop("groups")) == {"team1", "team2"}
+    assert result == {
+        "id": str(document.id),
+        "title": document.title,
+        "depth": 1,
+        "path": document.path,
+        "numchild": 1,
+        "content": utils.base64_yjs_to_text(document.content),
+        "created_at": document.created_at.isoformat(),
+        "updated_at": document.updated_at.isoformat(),
+        "reach": document.link_reach,
+        "size": 13,
+        "is_active": True,
+    }
+
+
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_serialize_document_deleted():
+    """Deleted documents are marked as inactive in the serialized json."""
+    parent = factories.DocumentFactory()
+    document = factories.DocumentFactory(parent=parent)
+
+    parent.soft_delete()
+    document.refresh_from_db()
+
+    indexer = SearchIndexer()
+    result = indexer.serialize_document(document, {})
+
+    assert result["is_active"] is False
+
+
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_serialize_document_empty():
+    """Empty documents return empty content in the serialized json."""
+    document = factories.DocumentFactory(content="", title=None)
+
+    indexer = SearchIndexer()
+    result = indexer.serialize_document(document, {})
+
+    assert result["content"] == ""
+    assert result["title"] == ""
+
+
+@responses.activate
+def test_services_search_indexers_index_errors(indexer_settings):
+    """
+    Documents indexing response handling on Find API HTTP errors.
+    """
+    factories.DocumentFactory()
+
+    indexer_settings.SEARCH_INDEXER_URL = "http://app-find/api/v1.0/documents/index/"
+
+    responses.add(
+        responses.POST,
+        "http://app-find/api/v1.0/documents/index/",
+        status=401,
+        body=json_dumps({"message": "Authentication failed."}),
+    )
+
+    with pytest.raises(HTTPError):
+        SearchIndexer().index()
+
+
+@patch.object(SearchIndexer, "push")
+def test_services_search_indexers_batches_pass_only_batch_accesses(
+    mock_push, indexer_settings
+):
+    """
+    Documents indexing should be processed in batches,
+    and only the access data relevant to each batch should be used.
+    """
+    indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2
+    documents = factories.DocumentFactory.create_batch(5)
+
+    # Attach a single user access to each document
+    expected_user_subs = {}
+    for document in documents:
+        access = factories.UserDocumentAccessFactory(document=document)
+        expected_user_subs[str(document.id)] = str(access.user.sub)
+
+    assert SearchIndexer().index() == 5
+
+    # Should be 3 batches: 2 + 2 + 1
+    assert mock_push.call_count == 3
+
+    seen_doc_ids = set()
+
+    for call in mock_push.call_args_list:
+        batch = call.args[0]
+        assert isinstance(batch, list)
+
+        for doc_json in batch:
+            doc_id = doc_json["id"]
+            seen_doc_ids.add(doc_id)
+
+            # Only one user expected per document
+            assert doc_json["users"] == [expected_user_subs[doc_id]]
+            assert doc_json["groups"] == []
+
+    # Make sure all 5 documents were indexed
+    assert seen_doc_ids == {str(d.id) for d in documents}
+
+
+@patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_batch_size_argument(mock_push):
+    """
+    Documents indexing should be processed in batches;
+    the batch_size argument overrides SEARCH_INDEXER_BATCH_SIZE.
+    """
+    documents = factories.DocumentFactory.create_batch(5)
+
+    # Attach a single user access to each document
+    expected_user_subs = {}
+    for document in documents:
+        access = factories.UserDocumentAccessFactory(document=document)
+        expected_user_subs[str(document.id)] = str(access.user.sub)
+
+    assert SearchIndexer().index(batch_size=2) == 5
+
+    # Should be 3 batches: 2 + 2 + 1
+    assert mock_push.call_count == 3
+
+    seen_doc_ids = set()
+
+    for call in mock_push.call_args_list:
+        batch = call.args[0]
+        assert isinstance(batch, list)
+
+        for doc_json in batch:
+            doc_id = doc_json["id"]
+            seen_doc_ids.add(doc_id)
+
+            # Only one user expected per document
+            assert doc_json["users"] == [expected_user_subs[doc_id]]
+            assert doc_json["groups"] == []
+
+    # Make sure all 5 documents were indexed
+    assert seen_doc_ids == {str(d.id) for d in documents}
+
+
+@patch.object(SearchIndexer, "push")
+@pytest.mark.usefixtures("indexer_settings")
+def test_services_search_indexers_ignore_empty_documents(mock_push):
+    """
+    Documents with both an empty title and empty content should not be indexed.
+    """
+    document = factories.DocumentFactory()
+    factories.DocumentFactory(content="", title="")
+    empty_title = factories.DocumentFactory(title="")
+    empty_content = factories.DocumentFactory(content="")
+
+    assert SearchIndexer().index() == 3
+
+    assert mock_push.call_count == 1
+
+    # Make sure only non-empty documents are indexed
+    results = {doc["id"] for doc in mock_push.call_args[0][0]}
+    assert results == {
+        str(d.id)
+        for d in (
+            document,
+            empty_content,
+            empty_title,
+        )
+    }
+
+
+@patch.object(SearchIndexer, "push")
+def test_services_search_indexers_skip_empty_batches(mock_push, indexer_settings):
+    """
+    Batches that contain only empty documents should be skipped.
+ """ + indexer_settings.SEARCH_INDEXER_BATCH_SIZE = 2 + + document = factories.DocumentFactory() + + # Only empty docs + factories.DocumentFactory.create_batch(5, content="", title="") + + assert SearchIndexer().index() == 1 + assert mock_push.call_count == 1 + + results = [doc["id"] for doc in mock_push.call_args[0][0]] + assert results == [str(document.id)] + + +@patch.object(SearchIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +def test_services_search_indexers_ancestors_link_reach(mock_push): + """Document accesses and reach should take into account ancestors link reaches.""" + great_grand_parent = factories.DocumentFactory(link_reach="restricted") + grand_parent = factories.DocumentFactory( + parent=great_grand_parent, link_reach="authenticated" + ) + parent = factories.DocumentFactory(parent=grand_parent, link_reach="public") + document = factories.DocumentFactory(parent=parent, link_reach="restricted") + + assert SearchIndexer().index() == 4 + + results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} + assert len(results) == 4 + assert results[str(great_grand_parent.id)]["reach"] == "restricted" + assert results[str(grand_parent.id)]["reach"] == "authenticated" + assert results[str(parent.id)]["reach"] == "public" + assert results[str(document.id)]["reach"] == "public" + + +@patch.object(SearchIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +def test_services_search_indexers_ancestors_users(mock_push): + """Document accesses and reach should include users from ancestors.""" + user_gp, user_p, user_d = factories.UserFactory.create_batch(3) + + grand_parent = factories.DocumentFactory(users=[user_gp]) + parent = factories.DocumentFactory(parent=grand_parent, users=[user_p]) + document = factories.DocumentFactory(parent=parent, users=[user_d]) + + assert SearchIndexer().index() == 3 + + results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} + assert len(results) == 3 + assert results[str(grand_parent.id)]["users"] == [str(user_gp.sub)] + assert set(results[str(parent.id)]["users"]) == {str(user_gp.sub), str(user_p.sub)} + assert set(results[str(document.id)]["users"]) == { + str(user_gp.sub), + str(user_p.sub), + str(user_d.sub), + } + + +@patch.object(SearchIndexer, "push") +@pytest.mark.usefixtures("indexer_settings") +def test_services_search_indexers_ancestors_teams(mock_push): + """Document accesses and reach should include teams from ancestors.""" + grand_parent = factories.DocumentFactory(teams=["team_gp"]) + parent = factories.DocumentFactory(parent=grand_parent, teams=["team_p"]) + document = factories.DocumentFactory(parent=parent, teams=["team_d"]) + + assert SearchIndexer().index() == 3 + + results = {doc["id"]: doc for doc in mock_push.call_args[0][0]} + assert len(results) == 3 + assert results[str(grand_parent.id)]["groups"] == ["team_gp"] + assert set(results[str(parent.id)]["groups"]) == {"team_gp", "team_p"} + assert set(results[str(document.id)]["groups"]) == {"team_gp", "team_p", "team_d"} + + +@patch("requests.post") +def test_push_uses_correct_url_and_data(mock_post, indexer_settings): + """ + push() should call requests.post with the correct URL from settings + the timeout set to 10 seconds and the data as JSON. 
+ """ + indexer_settings.SEARCH_INDEXER_URL = "http://example.com/index" + + indexer = SearchIndexer() + sample_data = [{"id": "123", "title": "Test"}] + + mock_response = mock_post.return_value + mock_response.raise_for_status.return_value = None # No error + + indexer.push(sample_data) + + mock_post.assert_called_once() + args, kwargs = mock_post.call_args + + assert args[0] == indexer_settings.SEARCH_INDEXER_URL + assert kwargs.get("json") == sample_data + assert kwargs.get("timeout") == 10 + + +def test_get_visited_document_ids_of(): + """ + get_visited_document_ids_of() returns the ids of the documents viewed + by the user BUT without specific access configuration (like public ones) + """ + user = factories.UserFactory() + other = factories.UserFactory() + anonymous = AnonymousUser() + queryset = models.Document.objects.all() + + assert not get_visited_document_ids_of(queryset, anonymous) + assert not get_visited_document_ids_of(queryset, user) + + doc1, doc2, _ = factories.DocumentFactory.create_batch(3) + + create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False) + + create_link(document=doc1) + create_link(document=doc2) + + # The third document is not visited + assert sorted(get_visited_document_ids_of(queryset, user)) == sorted( + [str(doc1.pk), str(doc2.pk)] + ) + + factories.UserDocumentAccessFactory(user=other, document=doc1) + factories.UserDocumentAccessFactory(user=user, document=doc2) + + # The second document have an access for the user + assert get_visited_document_ids_of(queryset, user) == [str(doc1.pk)] + + +@pytest.mark.usefixtures("indexer_settings") +def test_get_visited_document_ids_of_deleted(): + """ + get_visited_document_ids_of() returns the ids of the documents viewed + by the user if they are not deleted. + """ + user = factories.UserFactory() + anonymous = AnonymousUser() + queryset = models.Document.objects.all() + + assert not get_visited_document_ids_of(queryset, anonymous) + assert not get_visited_document_ids_of(queryset, user) + + doc = factories.DocumentFactory() + doc_deleted = factories.DocumentFactory() + doc_ancestor_deleted = factories.DocumentFactory(parent=doc_deleted) + + create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False) + + create_link(document=doc) + create_link(document=doc_deleted) + create_link(document=doc_ancestor_deleted) + + # The all documents are visited + assert sorted(get_visited_document_ids_of(queryset, user)) == sorted( + [str(doc.pk), str(doc_deleted.pk), str(doc_ancestor_deleted.pk)] + ) + + doc_deleted.soft_delete() + + # Only the first document is not deleted + assert get_visited_document_ids_of(queryset, user) == [str(doc.pk)] + + +@responses.activate +def test_services_search_indexers_search_errors(indexer_settings): + """ + Documents indexing response handling on Find API HTTP errors. + """ + factories.DocumentFactory() + + indexer_settings.SEARCH_INDEXER_QUERY_URL = ( + "http://app-find/api/v1.0/documents/search/" + ) + + responses.add( + responses.POST, + "http://app-find/api/v1.0/documents/search/", + status=401, + body=json_dumps({"message": "Authentication failed."}), + ) + + with pytest.raises(HTTPError): + SearchIndexer().search("alpha", token="mytoken") + + +@patch("requests.post") +def test_services_search_indexers_search(mock_post, indexer_settings): + """ + search() should call requests.post to SEARCH_INDEXER_QUERY_URL with the + document ids from linktraces. 
+ """ + user = factories.UserFactory() + indexer = SearchIndexer() + + mock_response = mock_post.return_value + mock_response.raise_for_status.return_value = None # No error + + doc1, doc2, _ = factories.DocumentFactory.create_batch(3) + + create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False) + + create_link(document=doc1) + create_link(document=doc2) + + visited = get_visited_document_ids_of(models.Document.objects.all(), user) + + indexer.search("alpha", visited=visited, token="mytoken") + + args, kwargs = mock_post.call_args + + assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL + + query_data = kwargs.get("json") + assert query_data["q"] == "alpha" + assert sorted(query_data["visited"]) == sorted([str(doc1.pk), str(doc2.pk)]) + assert query_data["services"] == ["docs"] + assert query_data["nb_results"] == 50 + assert query_data["order_by"] == "updated_at" + assert query_data["order_direction"] == "desc" + + assert kwargs.get("headers") == {"Authorization": "Bearer mytoken"} + assert kwargs.get("timeout") == 10 + + +@patch("requests.post") +def test_services_search_indexers_search_nb_results(mock_post, indexer_settings): + """ + Find API call should have nb_results == SEARCH_INDEXER_QUERY_LIMIT + or the given nb_results argument. + """ + indexer_settings.SEARCH_INDEXER_QUERY_LIMIT = 25 + + user = factories.UserFactory() + indexer = SearchIndexer() + + mock_response = mock_post.return_value + mock_response.raise_for_status.return_value = None # No error + + doc1, doc2, _ = factories.DocumentFactory.create_batch(3) + + create_link = partial(models.LinkTrace.objects.create, user=user, is_masked=False) + + create_link(document=doc1) + create_link(document=doc2) + + visited = get_visited_document_ids_of(models.Document.objects.all(), user) + + indexer.search("alpha", visited=visited, token="mytoken") + + args, kwargs = mock_post.call_args + + assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL + assert kwargs.get("json")["nb_results"] == 25 + + # The argument overrides the setting value + indexer.search("alpha", visited=visited, token="mytoken", nb_results=109) + + args, kwargs = mock_post.call_args + + assert args[0] == indexer_settings.SEARCH_INDEXER_QUERY_URL + assert kwargs.get("json")["nb_results"] == 109 diff --git a/src/backend/core/tests/test_utils.py b/src/backend/core/tests/test_utils.py index 37b2e32d5e..42d588c536 100644 --- a/src/backend/core/tests/test_utils.py +++ b/src/backend/core/tests/test_utils.py @@ -75,3 +75,28 @@ def test_utils_extract_attachments(): base64_string = base64.b64encode(update).decode("utf-8") # image_key2 is missing the "/media/" part and shouldn't get extracted assert utils.extract_attachments(base64_string) == [image_key1, image_key3] + + +def test_utils_get_ancestor_to_descendants_map_single_path(): + """Test ancestor mapping of a single path.""" + paths = ["000100020005"] + result = utils.get_ancestor_to_descendants_map(paths, steplen=4) + + assert result == { + "0001": {"000100020005"}, + "00010002": {"000100020005"}, + "000100020005": {"000100020005"}, + } + + +def test_utils_get_ancestor_to_descendants_map_multiple_paths(): + """Test ancestor mapping of multiple paths with shared prefixes.""" + paths = ["000100020005", "00010003"] + result = utils.get_ancestor_to_descendants_map(paths, steplen=4) + + assert result == { + "0001": {"000100020005", "00010003"}, + "00010002": {"000100020005"}, + "000100020005": {"000100020005"}, + "00010003": {"00010003"}, + } diff --git a/src/backend/core/utils.py 
b/src/backend/core/utils.py index 780431f495..357ede03c3 100644 --- a/src/backend/core/utils.py +++ b/src/backend/core/utils.py @@ -2,6 +2,7 @@ import base64 import re +from collections import defaultdict import pycrdt from bs4 import BeautifulSoup @@ -9,6 +10,27 @@ from core import enums +def get_ancestor_to_descendants_map(paths, steplen): + """ + Given a list of document paths, return a mapping of ancestor_path -> set of descendant_paths. + + Each path is assumed to use materialized path format with fixed-length segments. + + Args: + paths (list of str): List of full document paths. + steplen (int): Length of each path segment. + + Returns: + dict[str, set[str]]: Mapping from ancestor path to its descendant paths (including itself). + """ + ancestor_map = defaultdict(set) + for path in paths: + for i in range(steplen, len(path) + 1, steplen): + ancestor = path[:i] + ancestor_map[ancestor].add(path) + return ancestor_map + + def filter_descendants(paths, root_paths, skip_sorting=False): """ Filters paths to keep only those that are descendants of any path in root_paths. diff --git a/src/backend/demo/management/commands/create_demo.py b/src/backend/demo/management/commands/create_demo.py index 3681c378a0..ebce4eae25 100644 --- a/src/backend/demo/management/commands/create_demo.py +++ b/src/backend/demo/management/commands/create_demo.py @@ -1,16 +1,19 @@ # ruff: noqa: S311, S106 """create_demo management command""" +import base64 import logging import math import random import time from collections import defaultdict +from uuid import uuid4 from django import db from django.conf import settings from django.core.management.base import BaseCommand, CommandError +import pycrdt from faker import Faker from core import models @@ -27,6 +30,16 @@ def random_true_with_probability(probability): return random.random() < probability +def get_ydoc_for_text(text): + """Return a ydoc from plain text for demo purposes.""" + ydoc = pycrdt.Doc() + paragraph = pycrdt.XmlElement("p", {}, [pycrdt.XmlText(text)]) + fragment = pycrdt.XmlFragment([paragraph]) + ydoc["document-store"] = fragment + update = ydoc.get_update() + return base64.b64encode(update).decode("utf-8") + + class BulkQueue: """A utility class to create Django model instances in bulk by just pushing to a queue.""" @@ -48,7 +61,7 @@ def _bulk_create(self, objects): self.queue[objects[0]._meta.model.__name__] = [] # noqa: SLF001 def push(self, obj): - """Add a model instance to queue to that it gets created in bulk.""" + """Add a model instance to queue so that it gets created in bulk.""" objects = self.queue[obj._meta.model.__name__] # noqa: SLF001 objects.append(obj) if len(objects) > self.BATCH_SIZE: @@ -139,17 +152,19 @@ def create_demo(stdout): # pylint: disable=protected-access key = models.Document._int2str(i) # noqa: SLF001 padding = models.Document.alphabet[0] * (models.Document.steplen - len(key)) - queue.push( - models.Document( - depth=1, - path=f"{padding}{key}", - creator_id=random.choice(users_ids), - title=fake.sentence(nb_words=4), - link_reach=models.LinkReachChoices.AUTHENTICATED - if random_true_with_probability(0.5) - else random.choice(models.LinkReachChoices.values), - ) + title = fake.sentence(nb_words=4) + document = models.Document( + id=uuid4(), + depth=1, + path=f"{padding}{key}", + creator_id=random.choice(users_ids), + title=title, + link_reach=models.LinkReachChoices.AUTHENTICATED + if random_true_with_probability(0.5) + else random.choice(models.LinkReachChoices.values), ) + 
document.save_content(get_ydoc_for_text(f"Content for {title:s}")) + queue.push(document) queue.flush() diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index 2229036c8a..4e3c55c87a 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -99,6 +99,31 @@ class Base(Configuration): } DEFAULT_AUTO_FIELD = "django.db.models.AutoField" + # Search + SEARCH_INDEXER_CLASS = values.Value( + default=None, + environ_name="SEARCH_INDEXER_CLASS", + environ_prefix=None, + ) + SEARCH_INDEXER_BATCH_SIZE = values.IntegerValue( + default=100_000, environ_name="SEARCH_INDEXER_BATCH_SIZE", environ_prefix=None + ) + SEARCH_INDEXER_URL = values.Value( + default=None, environ_name="SEARCH_INDEXER_URL", environ_prefix=None + ) + SEARCH_INDEXER_COUNTDOWN = values.IntegerValue( + default=1, environ_name="SEARCH_INDEXER_COUNTDOWN", environ_prefix=None + ) + SEARCH_INDEXER_SECRET = values.Value( + default=None, environ_name="SEARCH_INDEXER_SECRET", environ_prefix=None + ) + SEARCH_INDEXER_QUERY_URL = values.Value( + default=None, environ_name="SEARCH_INDEXER_QUERY_URL", environ_prefix=None + ) + SEARCH_INDEXER_QUERY_LIMIT = values.PositiveIntegerValue( + default=50, environ_name="SEARCH_INDEXER_QUERY_LIMIT", environ_prefix=None + ) + # Static files (CSS, JavaScript, Images) STATIC_URL = "/static/" STATIC_ROOT = os.path.join(DATA_DIR, "static")
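
Note on enabling the settings added above: every SEARCH_INDEXER_* value defaults to None or a conservative number, so document indexation and the Find search route stay disabled until they are configured. Below is a minimal sketch of a settings override that would turn the indexer on; the class path is the one exercised in the tests above, while the URLs and secret are illustrative placeholders, not real deployment values.

    # Hypothetical Django settings override enabling the Find indexer.
    # URLs and secret are placeholders; point them at the actual Find deployment.
    SEARCH_INDEXER_CLASS = "core.services.search_indexers.SearchIndexer"
    SEARCH_INDEXER_URL = "http://app-find/api/v1.0/documents/index/"  # push() POSTs document batches here
    SEARCH_INDEXER_QUERY_URL = "http://app-find/api/v1.0/documents/search/"  # search() POSTs queries here
    SEARCH_INDEXER_SECRET = "change-me"  # placeholder shared secret
    SEARCH_INDEXER_BATCH_SIZE = 100_000  # documents per indexation batch
    SEARCH_INDEXER_COUNTDOWN = 1  # debounce delay between indexation jobs, in seconds
    SEARCH_INDEXER_QUERY_LIMIT = 50  # default nb_results sent to the Find search API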