From 2697dbbf344345e977cb19f6fed8f5460cbcc1b6 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Fri, 29 Aug 2025 00:27:45 +0100 Subject: [PATCH 1/4] Improve/fix regex item search - sanitize user input with brackets - match on implicit word boundaries - chain whitespace delimited queries --- pydatalab/src/pydatalab/routes/v0_1/items.py | 8 +++- pydatalab/tests/server/conftest.py | 1 + pydatalab/tests/server/test_samples.py | 43 ++++++++++++++++++++ 3 files changed, 51 insertions(+), 1 deletion(-) diff --git a/pydatalab/src/pydatalab/routes/v0_1/items.py b/pydatalab/src/pydatalab/routes/v0_1/items.py index 58f512e84..4477e608e 100644 --- a/pydatalab/src/pydatalab/routes/v0_1/items.py +++ b/pydatalab/src/pydatalab/routes/v0_1/items.py @@ -1,5 +1,6 @@ import datetime import json +import re from bson import ObjectId from flask import Blueprint, jsonify, redirect, request @@ -327,9 +328,14 @@ def search_items(): pipeline.append({"$match": match_obj}) pipeline.append({"$sort": {"score": {"$meta": "textScore"}}}) else: + query_parts = [r"\b" + re.escape(part) for part in query.split(" ") if part] match_obj = { - "$or": [{field: {"$regex": query, "$options": "i"}} for field in ITEMS_FTS_FIELDS] + "$or": [ + {"$and": [{field: {"$regex": query, "$options": "i"}} for query in query_parts]} + for field in ITEMS_FTS_FIELDS + ] } + LOGGER.debug("Performing regex search for %s with full search %s", query_parts, match_obj) match_obj = {"$and": [get_default_permissions(user_only=False), match_obj]} if types is not None: match_obj["$and"].append({"type": {"$in": types}}) diff --git a/pydatalab/tests/server/conftest.py b/pydatalab/tests/server/conftest.py index 6621fb64d..e5baa8dbd 100644 --- a/pydatalab/tests/server/conftest.py +++ b/pydatalab/tests/server/conftest.py @@ -494,6 +494,7 @@ def example_items(user_id, admin_user_id): Sample( **{ "item_id": "sample_2", + "chemform": "vanadium(II) oxide", "name": "other_sample", "date": "1970-02-01", "refcode": "grey:TEST3", diff --git a/pydatalab/tests/server/test_samples.py b/pydatalab/tests/server/test_samples.py index 3dcc92d18..ab16179f8 100644 --- a/pydatalab/tests/server/test_samples.py +++ b/pydatalab/tests/server/test_samples.py @@ -242,6 +242,49 @@ def test_item_search(client, admin_client, real_mongo_client, example_items): assert "test" in item_ids assert "sample_admin" in item_ids + # Search for string with brackets + response = admin_client.get("/search-items/?query='vanadium('") + + assert response.status_code == 200 + assert response.json["status"] == "success" + item_ids = {item["item_id"] for item in response.json["items"]} + assert len(item_ids) == 1 + assert "sample_2" in item_ids + + # Search for two words present in sample in either order + response = admin_client.get("/search-items/?query='vanadium oxide'") + + assert response.status_code == 200 + assert response.json["status"] == "success" + item_ids = {item["item_id"] for item in response.json["items"]} + assert len(item_ids) == 1 + assert "sample_2" in item_ids + + response = admin_client.get("/search-items/?query='oxide vanadium'") + + assert response.status_code == 200 + assert response.json["status"] == "success" + item_ids = {item["item_id"] for item in response.json["items"]} + assert len(item_ids) == 1 + assert "sample_2" in item_ids + + # Search for single char at start of word + response = admin_client.get("/search-items/?query='v'") + + assert response.status_code == 200 + assert response.json["status"] == "success" + item_ids = {item["item_id"] for item in response.json["items"]} + assert len(item_ids) == 1 + assert "sample_2" in item_ids + + # Search for word ending that should not return results + response = admin_client.get("/search-items/?query='anadium'") + + assert response.status_code == 200 + assert response.json["status"] == "success" + item_ids = {item["item_id"] for item in response.json["items"]} + assert len(item_ids) == 0 + @pytest.mark.dependency(depends=["test_delete_sample"]) def test_new_sample_with_relationships(client, complicated_sample): From fe297396d62fbae3644042ca12b3063cbfd469e5 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sat, 30 Aug 2025 21:08:20 +0100 Subject: [PATCH 2/4] Only add word boundaries for short queries --- pydatalab/src/pydatalab/routes/v0_1/items.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pydatalab/src/pydatalab/routes/v0_1/items.py b/pydatalab/src/pydatalab/routes/v0_1/items.py index 4477e608e..839445069 100644 --- a/pydatalab/src/pydatalab/routes/v0_1/items.py +++ b/pydatalab/src/pydatalab/routes/v0_1/items.py @@ -328,7 +328,11 @@ def search_items(): pipeline.append({"$match": match_obj}) pipeline.append({"$sort": {"score": {"$meta": "textScore"}}}) else: - query_parts = [r"\b" + re.escape(part) for part in query.split(" ") if part] + query_parts = [re.escape(part) for part in query.split(" ") if part.strip()] + + # Add word boundary to short parts to avoid excessive matches, i.e., search start of string + # for short parts, but allow match anywhere in string for longer parts + query_parts = [f"\\b{part}" if len(part) < 5 else part for part in query_parts] match_obj = { "$or": [ {"$and": [{field: {"$regex": query, "$options": "i"}} for query in query_parts]} From 13ee001376c4b6e4019580c4d4c17c53013526af Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sun, 31 Aug 2025 18:37:04 +0100 Subject: [PATCH 3/4] Refactor item search tests --- pydatalab/tests/server/conftest.py | 5 + pydatalab/tests/server/test_samples.py | 185 +++++++++---------------- 2 files changed, 73 insertions(+), 117 deletions(-) diff --git a/pydatalab/tests/server/conftest.py b/pydatalab/tests/server/conftest.py index e5baa8dbd..75f879d92 100644 --- a/pydatalab/tests/server/conftest.py +++ b/pydatalab/tests/server/conftest.py @@ -578,6 +578,11 @@ def fixture_insert_default_equipment(default_equipment): yield from _insert_and_cleanup_item_from_model(default_equipment) +@pytest.fixture(scope="module", name="insert_example_items") +def fixture_insert_example_items(example_items, real_mongo_client): + real_mongo_client.get_database().items.insert_many(example_items) + + @pytest.fixture(scope="module", name="inserted_default_items") def fixture_inserted_default_items( insert_default_sample, diff --git a/pydatalab/tests/server/test_samples.py b/pydatalab/tests/server/test_samples.py index ab16179f8..1cbd02995 100644 --- a/pydatalab/tests/server/test_samples.py +++ b/pydatalab/tests/server/test_samples.py @@ -152,138 +152,89 @@ def test_create_indices(real_mongo_client): assert all(name in names for name in expected_index_names) +@pytest.mark.parametrize( + "query,expected_result_ids", + [ + ("query=%2512345&types=samples", {"12345", "sample_1"}), + ("query=%new material", {"material", "12345"}), + ("query=%NaNiO2", {"test", "material", "12345"}), + ("query=%'grey:TEST4'", ["material", "test", "sample_1", "12345", "sample_2"]), + ], +) @pytest.mark.dependency(depends=["test_create_indices"]) -def test_item_search(client, admin_client, real_mongo_client, example_items): +def test_item_fts_search( + query, expected_result_ids, client, real_mongo_client, insert_example_items +): if real_mongo_client is None: pytest.skip("Skipping FTS tests, not connected to real MongoDB") - real_mongo_client.get_database().items.insert_many(example_items) - - response = client.get("/search-items/?query=%2512345&types=samples") - - assert response.status_code == 200 - assert response.json["status"] == "success" - assert len(response.json["items"]) == 2 - assert response.json["items"][0]["item_id"] == "12345" - assert response.json["items"][1]["item_id"] == "sample_1" - - response = client.get("/search-items/?query=%new material") - - assert response.status_code == 200 - assert response.json["status"] == "success" - assert len(response.json["items"]) == 2 - assert response.json["items"][0]["item_id"] == "material" - assert response.json["items"][1]["item_id"] == "12345" - - response = client.get("/search-items/?query=%NaNiO2") - - assert response.status_code == 200 - assert response.json["status"] == "success" - assert len(response.json["items"]) == 3 - assert response.json["items"][0]["item_id"] == "test" - assert response.json["items"][1]["item_id"] == "material" - assert response.json["items"][2]["item_id"] == "12345" - - response = client.get("/search-items/?query=%'grey:TEST4'") - - assert response.status_code == 200 - assert response.json["status"] == "success" - # should return all grey:TEST samples, with TEST4 first - assert len(response.json["items"]) == 5 - assert response.json["items"][0]["item_id"] == "material" - - # Test regex search - response = client.get("/search-items/?query=mater&types=samples,starting_materials") - - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert "material" in item_ids - assert "12345" in item_ids - assert len(item_ids) == 2 - - # Test regex search with different types - response = client.get("/search-items/?query=mater&types=equipment") - - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 0 - - # Test regex search still obeys permissions - response = admin_client.get("/search-items/?query=mater") - - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 3 - assert "material" in item_ids - assert "12345" in item_ids - assert "123456" in item_ids - - # Search for single word in description - response = client.get("/search-items/?query='magic'") - - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert "material" in item_ids - assert "test" in item_ids - assert len(item_ids) == 2 - - # Search for single word in description, again checking extra results for admins - response = admin_client.get("/search-items/?query='magic'") - - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 3 - assert "material" in item_ids - assert "test" in item_ids - assert "sample_admin" in item_ids - - # Search for string with brackets - response = admin_client.get("/search-items/?query='vanadium('") + response = client.get(f"/search-items/?{query}") assert response.status_code == 200 assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 1 - assert "sample_2" in item_ids - - # Search for two words present in sample in either order - response = admin_client.get("/search-items/?query='vanadium oxide'") - - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 1 - assert "sample_2" in item_ids - - response = admin_client.get("/search-items/?query='oxide vanadium'") + item_ids = [item["item_id"] for item in response.json["items"]] + if isinstance(expected_result_ids, set): + assert all(_id in item_ids for _id in expected_result_ids), ( + f"Some expected IDs not found for {query=}: expected {expected_result_ids}, found {item_ids}" + ) - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 1 - assert "sample_2" in item_ids + else: + assert item_ids == expected_result_ids - # Search for single char at start of word - response = admin_client.get("/search-items/?query='v'") - assert response.status_code == 200 - assert response.json["status"] == "success" - item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 1 - assert "sample_2" in item_ids +@pytest.mark.dependency(depends=["test_create_indices"]) +@pytest.mark.parametrize( + "user,query,expected_result_ids", + [ + ("user", "query=mater&types=samples,starting_materials", ["material", "12345"]), + ("user", "query=mater&types=equipment", []), # Tests avoidance of different types + ("admin", "query=mater", ["material", "12345", "123456"]), # Test search obeys permissions + ( + "admin", + "query='magic'", + ["material", "test", "sample_admin"], + ), # Test simple word in description as admin + ( + "user", + "query='magic'", + ["material", "test"], + ), # Test simple word in description + ("admin", "query='vanadium('&types=samples", ["sample_2"]), # Test unclosed brackets + ("admin", "query='vanadium oxide'&types=samples", ["sample_2"]), # Test two words + ("admin", "query='oxide vanadium'&types=samples", ["sample_2"]), # Test reverse order + ("admin", "query='v'", ["sample_2"]), # Test single char at start of word + ("admin", "query='van'", ["sample_2"]), # Test prefix at start of word + ("admin", "query='oxid'", ["sample_2"]), # Test prefix at start of word + ( + "admin", + "query='anadium'&types=samples", + ["sample_2"], + ), # Test word ending that should return results if long enough + ( + "admin", + "query='dium'&types=samples", + [], + ), # Test word ending that should not return results if its too short + ], +) +def test_item_regex_search( + user, query, expected_result_ids, real_mongo_client, client, admin_client, insert_example_items +): + if real_mongo_client is None: + pytest.skip("Skipping FTS tests, not connected to real MongoDB") - # Search for word ending that should not return results - response = admin_client.get("/search-items/?query='anadium'") + if user == "admin": + response = admin_client.get(f"/search-items/?{query}") + else: + response = client.get(f"/search-items/?{query}") assert response.status_code == 200 assert response.json["status"] == "success" item_ids = {item["item_id"] for item in response.json["items"]} - assert len(item_ids) == 0 + assert all(_id in item_ids for _id in expected_result_ids), ( + f"Some expected IDs not found for {query=} as {user=}: expected {expected_result_ids}, found {item_ids}" + ) + assert len(item_ids) == len(expected_result_ids) @pytest.mark.dependency(depends=["test_delete_sample"]) From 52e4aad0c7ad9449af56686ac77e9613f1c80a2c Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Thu, 25 Sep 2025 15:49:13 +0100 Subject: [PATCH 4/4] Maintain the old regex search with a `#` prefix --- pydatalab/src/pydatalab/routes/v0_1/items.py | 61 ++++++++++++++++---- pydatalab/tests/server/test_samples.py | 35 +++++++++-- 2 files changed, 82 insertions(+), 14 deletions(-) diff --git a/pydatalab/src/pydatalab/routes/v0_1/items.py b/pydatalab/src/pydatalab/routes/v0_1/items.py index 839445069..b51e0a739 100644 --- a/pydatalab/src/pydatalab/routes/v0_1/items.py +++ b/pydatalab/src/pydatalab/routes/v0_1/items.py @@ -317,7 +317,9 @@ def search_items(): query = query.strip("'") if isinstance(query, str) and query.startswith("%"): + # Old FTS query style, using MongoDB text indexes query = query.lstrip("%") + query = query.strip("'") match_obj = { "$text": {"$search": query}, **get_default_permissions(user_only=False), @@ -327,19 +329,58 @@ def search_items(): pipeline.append({"$match": match_obj}) pipeline.append({"$sort": {"score": {"$meta": "textScore"}}}) - else: - query_parts = [re.escape(part) for part in query.split(" ") if part.strip()] + elif isinstance(query, str) and query.startswith("#"): + # Plain regex search, without word boundaries or splitting into parts + query = query.lstrip("#") + query = query.strip("'") - # Add word boundary to short parts to avoid excessive matches, i.e., search start of string - # for short parts, but allow match anywhere in string for longer parts - query_parts = [f"\\b{part}" if len(part) < 5 else part for part in query_parts] match_obj = { - "$or": [ - {"$and": [{field: {"$regex": query, "$options": "i"}} for query in query_parts]} - for field in ITEMS_FTS_FIELDS - ] + "$or": [{field: {"$regex": query, "$options": "i"}} for field in ITEMS_FTS_FIELDS] } - LOGGER.debug("Performing regex search for %s with full search %s", query_parts, match_obj) + match_obj = {"$and": [get_default_permissions(user_only=False), match_obj]} + if types is not None: + match_obj["$and"].append({"type": {"$in": types}}) + + pipeline.append({"$match": match_obj}) + + else: + # Heuristic + regex search, splitting the query into parts and adding word boundaries + # depending on length + def _generate_heuristic_regex_search(query: str, part_length: int = 4) -> dict: + """Generate a heuristic regex search object for MongoDB that uses + word boundaries for short parts of the query, but allows matches anywhere. + + Parameters: + query: The full search query string. + part_length: The length below which to add a word boundary to the start of the part. + + Returns: + A MongoDB query object that can be used in a $match stage. + + """ + query_parts = [re.escape(part) for part in query.split(" ") if part.strip()] + + # Add word boundary to short parts to avoid excessive matches, i.e., search start of string + # for short parts, but allow match anywhere in string for longer parts + query_parts = [ + f"\\b{part}" if len(part) <= part_length else part for part in query_parts + ] + match_obj = { + "$or": [ + {"$and": [{field: {"$regex": query, "$options": "i"}} for query in query_parts]} + for field in ITEMS_FTS_FIELDS + ] + } + LOGGER.debug( + "Performing regex search for %s with full search %s", query_parts, match_obj + ) + + return match_obj + + if not query: + return jsonify({"status": "error", "message": "No query provided."}), 400 + + match_obj = _generate_heuristic_regex_search(query) match_obj = {"$and": [get_default_permissions(user_only=False), match_obj]} if types is not None: match_obj["$and"].append({"type": {"$in": types}}) diff --git a/pydatalab/tests/server/test_samples.py b/pydatalab/tests/server/test_samples.py index 1cbd02995..41b0dbae7 100644 --- a/pydatalab/tests/server/test_samples.py +++ b/pydatalab/tests/server/test_samples.py @@ -158,7 +158,7 @@ def test_create_indices(real_mongo_client): ("query=%2512345&types=samples", {"12345", "sample_1"}), ("query=%new material", {"material", "12345"}), ("query=%NaNiO2", {"test", "material", "12345"}), - ("query=%'grey:TEST4'", ["material", "test", "sample_1", "12345", "sample_2"]), + ("query=%'grey:TEST4'", {"material", "test", "sample_1", "12345", "sample_2"}), ], ) @pytest.mark.dependency(depends=["test_create_indices"]) @@ -182,6 +182,36 @@ def test_item_fts_search( assert item_ids == expected_result_ids +@pytest.mark.parametrize( + "query,expected_result_ids", + [ + ("query=%23mater&types=samples,starting_materials", {"12345", "material"}), + ("query=%23mater&types=equipment", []), + ("query=%23mater", {"material", "12345"}), + ("query=%23'magic'", {"material", "test"}), + ], +) +@pytest.mark.dependency(depends=["test_create_indices"]) +def test_item_old_regex_search( + query, expected_result_ids, client, real_mongo_client, insert_example_items +): + if real_mongo_client is None: + pytest.skip("Skipping FTS tests, not connected to real MongoDB") + + response = client.get(f"/search-items/?{query}") + + assert response.status_code == 200 + assert response.json["status"] == "success" + item_ids = [item["item_id"] for item in response.json["items"]] + if isinstance(expected_result_ids, set): + assert all(_id in item_ids for _id in expected_result_ids), ( + f"Some expected IDs not found for {query=}: expected {expected_result_ids}, found {item_ids}" + ) + + else: + assert item_ids == expected_result_ids + + @pytest.mark.dependency(depends=["test_create_indices"]) @pytest.mark.parametrize( "user,query,expected_result_ids", @@ -220,9 +250,6 @@ def test_item_fts_search( def test_item_regex_search( user, query, expected_result_ids, real_mongo_client, client, admin_client, insert_example_items ): - if real_mongo_client is None: - pytest.skip("Skipping FTS tests, not connected to real MongoDB") - if user == "admin": response = admin_client.get(f"/search-items/?{query}") else: