diff --git a/.docker/server_dockerfile b/.docker/server_dockerfile index d03bd6c57..cecd0b410 100644 --- a/.docker/server_dockerfile +++ b/.docker/server_dockerfile @@ -23,7 +23,7 @@ ENV UV_LINK_MODE=copy \ # Used to fake the version of the package in cases where datalab is only # available as a git submodule or worktree -ENV SETUPTOOLS_SCM_PRETEND_VERSION=${SETUPTOOLS_SCM_PRETEND_VERSION} +ENV SETUPTOOLS_SCM_PRETEND_VERSION="" WORKDIR /app COPY ./pydatalab/pyproject.toml . diff --git a/pydatalab/src/pydatalab/routes/v0_1/items.py b/pydatalab/src/pydatalab/routes/v0_1/items.py index a886f57e2..0caa872d8 100644 --- a/pydatalab/src/pydatalab/routes/v0_1/items.py +++ b/pydatalab/src/pydatalab/routes/v0_1/items.py @@ -1,5 +1,6 @@ import datetime import json +import re from bson import ObjectId from flask import Blueprint, jsonify, redirect, request @@ -327,9 +328,18 @@ def search_items(): pipeline.append({"$match": match_obj}) pipeline.append({"$sort": {"score": {"$meta": "textScore"}}}) else: + query_parts = [re.escape(part) for part in query.split(" ") if part.strip()] + + # Add word boundary to short parts to avoid excessive matches, i.e., search start of string + # for short parts, but allow match anywhere in string for longer parts + query_parts = [f"\\b{part}" if len(part) < 5 else part for part in query_parts] match_obj = { - "$or": [{field: {"$regex": query, "$options": "i"}} for field in ITEMS_FTS_FIELDS] + "$or": [ + {"$and": [{field: {"$regex": query, "$options": "i"}} for query in query_parts]} + for field in ITEMS_FTS_FIELDS + ] } + LOGGER.debug("Performing regex search for %s with full search %s", query_parts, match_obj) match_obj = {"$and": [get_default_permissions(user_only=False), match_obj]} if types is not None: match_obj["$and"].append({"type": {"$in": types}}) diff --git a/pydatalab/tests/server/conftest.py b/pydatalab/tests/server/conftest.py index 6621fb64d..75f879d92 100644 --- a/pydatalab/tests/server/conftest.py +++ b/pydatalab/tests/server/conftest.py @@ -494,6 +494,7 @@ def example_items(user_id, admin_user_id): Sample( **{ "item_id": "sample_2", + "chemform": "vanadium(II) oxide", "name": "other_sample", "date": "1970-02-01", "refcode": "grey:TEST3", @@ -577,6 +578,11 @@ def fixture_insert_default_equipment(default_equipment): yield from _insert_and_cleanup_item_from_model(default_equipment) +@pytest.fixture(scope="module", name="insert_example_items") +def fixture_insert_example_items(example_items, real_mongo_client): + real_mongo_client.get_database().items.insert_many(example_items) + + @pytest.fixture(scope="module", name="inserted_default_items") def fixture_inserted_default_items( insert_default_sample, diff --git a/pydatalab/tests/server/test_samples.py b/pydatalab/tests/server/test_samples.py index 33483cd96..92e3a7b86 100644 --- a/pydatalab/tests/server/test_samples.py +++ b/pydatalab/tests/server/test_samples.py @@ -152,95 +152,89 @@ def test_create_indices(real_mongo_client): assert all(name in names for name in expected_index_names) +@pytest.mark.parametrize( + "query,expected_result_ids", + [ + ("query=%2512345&types=samples", {"12345", "sample_1"}), + ("query=%new material", {"material", "12345"}), + ("query=%NaNiO2", {"test", "material", "12345"}), + ("query=%'grey:TEST4'", ["material", "test", "sample_1", "12345", "sample_2"]), + ], +) @pytest.mark.dependency(depends=["test_create_indices"]) -def test_item_search(client, admin_client, real_mongo_client, example_items): +def test_item_fts_search( + query, expected_result_ids, client, real_mongo_client, insert_example_items +): if real_mongo_client is None: pytest.skip("Skipping FTS tests, not connected to real MongoDB") - real_mongo_client.get_database().items.insert_many(example_items) - - response = client.get("/search-items/?query=%2512345&types=samples") - - assert response.status_code == 200 - assert response.json["status"] == "success" - assert len(response.json["items"]) == 2 - assert response.json["items"][0]["item_id"] == "12345" - assert response.json["items"][1]["item_id"] == "sample_1" - - response = client.get("/search-items/?query=%new material") - - assert response.status_code == 200 - assert response.json["status"] == "success" - assert len(response.json["items"]) == 2 - assert response.json["items"][0]["item_id"] == "material" - assert response.json["items"][1]["item_id"] == "12345" - - response = client.get("/search-items/?query=%NaNiO2") + response = client.get(f"/search-items/?{query}") assert response.status_code == 200 assert response.json["status"] == "success" - assert len(response.json["items"]) == 3 - assert response.json["items"][0]["item_id"] == "test" - assert response.json["items"][1]["item_id"] == "material" - assert response.json["items"][2]["item_id"] == "12345" - - response = client.get("/search-items/?query=%'grey:TEST4'") - - assert response.status_code == 200 - assert response.json["status"] == "success" - # should return all grey:TEST samples, with TEST4 first - assert len(response.json["items"]) == 5 - assert response.json["items"][0]["item_id"] == "material" - - # Test regex search - response = client.get("/search-items/?query=mater&types=samples,starting_materials") - - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert "material" in item_ids - assert "12345" in item_ids - assert len(item_ids) == 2 - - # Test regex search with different types - response = client.get("/search-items/?query=mater&types=equipment") - - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 0 - - # Test regex search still obeys permissions - response = admin_client.get("/search-items/?query=mater") + item_ids = [item["item_id"] for item in response.json["items"]] + if isinstance(expected_result_ids, set): + assert all(_id in item_ids for _id in expected_result_ids), ( + f"Some expected IDs not found for {query=}: expected {expected_result_ids}, found {item_ids}" + ) - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 3 - assert "material" in item_ids - assert "12345" in item_ids - assert "123456" in item_ids + else: + assert item_ids == expected_result_ids - # Search for single word in description - response = client.get("/search-items/?query='magic'") - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert "material" in item_ids - assert "test" in item_ids - assert len(item_ids) == 2 +@pytest.mark.dependency(depends=["test_create_indices"]) +@pytest.mark.parametrize( + "user,query,expected_result_ids", + [ + ("user", "query=mater&types=samples,starting_materials", ["material", "12345"]), + ("user", "query=mater&types=equipment", []), # Tests avoidance of different types + ("admin", "query=mater", ["material", "12345", "123456"]), # Test search obeys permissions + ( + "admin", + "query='magic'", + ["material", "test", "sample_admin"], + ), # Test simple word in description as admin + ( + "user", + "query='magic'", + ["material", "test"], + ), # Test simple word in description + ("admin", "query='vanadium('&types=samples", ["sample_2"]), # Test unclosed brackets + ("admin", "query='vanadium oxide'&types=samples", ["sample_2"]), # Test two words + ("admin", "query='oxide vanadium'&types=samples", ["sample_2"]), # Test reverse order + ("admin", "query='v'", ["sample_2"]), # Test single char at start of word + ("admin", "query='van'", ["sample_2"]), # Test prefix at start of word + ("admin", "query='oxid'", ["sample_2"]), # Test prefix at start of word + ( + "admin", + "query='anadium'&types=samples", + ["sample_2"], + ), # Test word ending that should return results if long enough + ( + "admin", + "query='dium'&types=samples", + [], + ), # Test word ending that should not return results if its too short + ], +) +def test_item_regex_search( + user, query, expected_result_ids, real_mongo_client, client, admin_client, insert_example_items +): + if real_mongo_client is None: + pytest.skip("Skipping FTS tests, not connected to real MongoDB") - # Search for single word in description, again checking extra results for admins - response = admin_client.get("/search-items/?query='magic'") + if user == "admin": + response = admin_client.get(f"/search-items/?{query}") + else: + response = client.get(f"/search-items/?{query}") assert response.status_code == 200 assert response.json["status"] == "success" item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 3 - assert "material" in item_ids - assert "test" in item_ids - assert "sample_admin" in item_ids + assert all(_id in item_ids for _id in expected_result_ids), ( + f"Some expected IDs not found for {query=} as {user=}: expected {expected_result_ids}, found {item_ids}" + ) + assert len(item_ids) == len(expected_result_ids) @pytest.mark.dependency(depends=["test_delete_sample"])