Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .docker/server_dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ ENV UV_LINK_MODE=copy \

# Used to fake the version of the package in cases where datalab is only
# available as a git submodule or worktree
ENV SETUPTOOLS_SCM_PRETEND_VERSION=${SETUPTOOLS_SCM_PRETEND_VERSION}
ENV SETUPTOOLS_SCM_PRETEND_VERSION=""

WORKDIR /app
COPY ./pydatalab/pyproject.toml .
Expand Down
12 changes: 11 additions & 1 deletion pydatalab/src/pydatalab/routes/v0_1/items.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import datetime
import json
import re

from bson import ObjectId
from flask import Blueprint, jsonify, redirect, request
Expand Down Expand Up @@ -327,9 +328,18 @@ def search_items():
pipeline.append({"$match": match_obj})
pipeline.append({"$sort": {"score": {"$meta": "textScore"}}})
else:
query_parts = [re.escape(part) for part in query.split(" ") if part.strip()]

# Add word boundary to short parts to avoid excessive matches, i.e., search start of string
# for short parts, but allow match anywhere in string for longer parts
query_parts = [f"\\b{part}" if len(part) < 5 else part for part in query_parts]
match_obj = {
"$or": [{field: {"$regex": query, "$options": "i"}} for field in ITEMS_FTS_FIELDS]
"$or": [
{"$and": [{field: {"$regex": query, "$options": "i"}} for query in query_parts]}
for field in ITEMS_FTS_FIELDS
]
}
LOGGER.debug("Performing regex search for %s with full search %s", query_parts, match_obj)
match_obj = {"$and": [get_default_permissions(user_only=False), match_obj]}
if types is not None:
match_obj["$and"].append({"type": {"$in": types}})
Expand Down
6 changes: 6 additions & 0 deletions pydatalab/tests/server/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,7 @@ def example_items(user_id, admin_user_id):
Sample(
**{
"item_id": "sample_2",
"chemform": "vanadium(II) oxide",
"name": "other_sample",
"date": "1970-02-01",
"refcode": "grey:TEST3",
Expand Down Expand Up @@ -577,6 +578,11 @@ def fixture_insert_default_equipment(default_equipment):
yield from _insert_and_cleanup_item_from_model(default_equipment)


@pytest.fixture(scope="module", name="insert_example_items")
def fixture_insert_example_items(example_items, real_mongo_client):
real_mongo_client.get_database().items.insert_many(example_items)


@pytest.fixture(scope="module", name="inserted_default_items")
def fixture_inserted_default_items(
insert_default_sample,
Expand Down
142 changes: 68 additions & 74 deletions pydatalab/tests/server/test_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,95 +152,89 @@ def test_create_indices(real_mongo_client):
assert all(name in names for name in expected_index_names)


@pytest.mark.parametrize(
"query,expected_result_ids",
[
("query=%2512345&types=samples", {"12345", "sample_1"}),
("query=%new material", {"material", "12345"}),
("query=%NaNiO2", {"test", "material", "12345"}),
("query=%'grey:TEST4'", ["material", "test", "sample_1", "12345", "sample_2"]),
],
)
@pytest.mark.dependency(depends=["test_create_indices"])
def test_item_search(client, admin_client, real_mongo_client, example_items):
def test_item_fts_search(
query, expected_result_ids, client, real_mongo_client, insert_example_items
):
if real_mongo_client is None:
pytest.skip("Skipping FTS tests, not connected to real MongoDB")

real_mongo_client.get_database().items.insert_many(example_items)

response = client.get("/search-items/?query=%2512345&types=samples")

assert response.status_code == 200
assert response.json["status"] == "success"
assert len(response.json["items"]) == 2
assert response.json["items"][0]["item_id"] == "12345"
assert response.json["items"][1]["item_id"] == "sample_1"

response = client.get("/search-items/?query=%new material")

assert response.status_code == 200
assert response.json["status"] == "success"
assert len(response.json["items"]) == 2
assert response.json["items"][0]["item_id"] == "material"
assert response.json["items"][1]["item_id"] == "12345"

response = client.get("/search-items/?query=%NaNiO2")
response = client.get(f"/search-items/?{query}")

assert response.status_code == 200
assert response.json["status"] == "success"
assert len(response.json["items"]) == 3
assert response.json["items"][0]["item_id"] == "test"
assert response.json["items"][1]["item_id"] == "material"
assert response.json["items"][2]["item_id"] == "12345"

response = client.get("/search-items/?query=%'grey:TEST4'")

assert response.status_code == 200
assert response.json["status"] == "success"
# should return all grey:TEST<n> samples, with TEST4 first
assert len(response.json["items"]) == 5
assert response.json["items"][0]["item_id"] == "material"

# Test regex search
response = client.get("/search-items/?query=mater&types=samples,starting_materials")

assert response.status_code == 200
assert response.json["status"] == "success"
item_ids = {item["item_id"] for item in response.json["items"]}
assert "material" in item_ids
assert "12345" in item_ids
assert len(item_ids) == 2

# Test regex search with different types
response = client.get("/search-items/?query=mater&types=equipment")

assert response.status_code == 200
assert response.json["status"] == "success"
item_ids = {item["item_id"] for item in response.json["items"]}
assert len(item_ids) == 0

# Test regex search still obeys permissions
response = admin_client.get("/search-items/?query=mater")
item_ids = [item["item_id"] for item in response.json["items"]]
if isinstance(expected_result_ids, set):
assert all(_id in item_ids for _id in expected_result_ids), (
f"Some expected IDs not found for {query=}: expected {expected_result_ids}, found {item_ids}"
)

assert response.status_code == 200
assert response.json["status"] == "success"
item_ids = {item["item_id"] for item in response.json["items"]}
assert len(item_ids) == 3
assert "material" in item_ids
assert "12345" in item_ids
assert "123456" in item_ids
else:
assert item_ids == expected_result_ids

# Search for single word in description
response = client.get("/search-items/?query='magic'")

assert response.status_code == 200
assert response.json["status"] == "success"
item_ids = {item["item_id"] for item in response.json["items"]}
assert "material" in item_ids
assert "test" in item_ids
assert len(item_ids) == 2
@pytest.mark.dependency(depends=["test_create_indices"])
@pytest.mark.parametrize(
"user,query,expected_result_ids",
[
("user", "query=mater&types=samples,starting_materials", ["material", "12345"]),
("user", "query=mater&types=equipment", []), # Tests avoidance of different types
("admin", "query=mater", ["material", "12345", "123456"]), # Test search obeys permissions
(
"admin",
"query='magic'",
["material", "test", "sample_admin"],
), # Test simple word in description as admin
(
"user",
"query='magic'",
["material", "test"],
), # Test simple word in description
("admin", "query='vanadium('&types=samples", ["sample_2"]), # Test unclosed brackets
("admin", "query='vanadium oxide'&types=samples", ["sample_2"]), # Test two words
("admin", "query='oxide vanadium'&types=samples", ["sample_2"]), # Test reverse order
("admin", "query='v'", ["sample_2"]), # Test single char at start of word
("admin", "query='van'", ["sample_2"]), # Test prefix at start of word
("admin", "query='oxid'", ["sample_2"]), # Test prefix at start of word
(
"admin",
"query='anadium'&types=samples",
["sample_2"],
), # Test word ending that should return results if long enough
(
"admin",
"query='dium'&types=samples",
[],
), # Test word ending that should not return results if its too short
],
)
def test_item_regex_search(
user, query, expected_result_ids, real_mongo_client, client, admin_client, insert_example_items
):
if real_mongo_client is None:
pytest.skip("Skipping FTS tests, not connected to real MongoDB")

# Search for single word in description, again checking extra results for admins
response = admin_client.get("/search-items/?query='magic'")
if user == "admin":
response = admin_client.get(f"/search-items/?{query}")
else:
response = client.get(f"/search-items/?{query}")

assert response.status_code == 200
assert response.json["status"] == "success"
item_ids = {item["item_id"] for item in response.json["items"]}
assert len(item_ids) == 3
assert "material" in item_ids
assert "test" in item_ids
assert "sample_admin" in item_ids
assert all(_id in item_ids for _id in expected_result_ids), (
f"Some expected IDs not found for {query=} as {user=}: expected {expected_result_ids}, found {item_ids}"
)
assert len(item_ids) == len(expected_result_ids)


@pytest.mark.dependency(depends=["test_delete_sample"])
Expand Down
Loading