Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions src/ndi/cloud/api/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,17 @@

from __future__ import annotations

import re
from typing import Annotated, Any

from pydantic import SkipValidation, validate_call

from ..client import APIResponse, CloudClient, _auto_client
from ._validators import VALIDATE_CONFIG, CloudId, FilePath, PageNumber, PageSize, Scope

_HEX24 = re.compile(r"^[0-9a-fA-F]{24}$")
_BULK_FETCH_MAX = 500

_Client = Annotated[CloudClient | None, SkipValidation()]


Expand Down Expand Up @@ -178,6 +182,86 @@ def countDocuments(dataset_id: CloudId, *, client: _Client = None) -> int:
return ds.get("documentCount", 0)


@_auto_client
@validate_call(config=VALIDATE_CONFIG)
def bulkFetch(
    dataset_id: CloudId,
    doc_ids: list[str],
    *,
    client: _Client = None,
) -> list[dict[str, Any]]:
    """POST /datasets/{datasetId}/documents/bulk-fetch

    Fetch up to 500 documents (with full data) from a dataset in a
    single synchronous call. This is the fast companion to the
    asynchronous :func:`getBulkDownloadURL` pipeline and is meant for
    small ID sets (e.g. a subset of IDs returned by :func:`ndiquery`).

    Documents that do not exist, are soft-deleted, or belong to a
    different dataset are silently omitted from the response, and the
    returned order is not guaranteed to match the request order.

    MATLAB equivalent: +cloud/+api/+documents/bulkFetch.m

    Args:
        dataset_id: The ID of the dataset containing the documents.
        doc_ids: Document IDs to fetch. Must be non-empty, at most 500
            entries, and each entry must be a 24-character hex string.
        client: Authenticated cloud client (auto-created if omitted).

    Returns:
        A list of document dicts, each with fields ``id``, ``ndiId``,
        ``name``, ``className``, ``datasetId``, and ``data``.
    """
    # Mirror the MATLAB-side validation: non-empty, bounded, hex-only.
    if not doc_ids:
        raise ValueError("doc_ids must be non-empty")
    if len(doc_ids) > _BULK_FETCH_MAX:
        raise ValueError(f"doc_ids must have at most {_BULK_FETCH_MAX} entries")
    bad_id = next((entry for entry in doc_ids if not _HEX24.match(entry)), None)
    if bad_id is not None:
        raise ValueError(f"doc_ids entries must be 24-character hex strings: {bad_id!r}")
    response = client.post(
        "/datasets/{datasetId}/documents/bulk-fetch",
        json={"documentIds": list(doc_ids)},
        datasetId=dataset_id,
    )
    # Normalize: dict payloads carry a 'documents' key; anything else is
    # treated as a (possibly empty) sequence of documents.
    if isinstance(response, dict):
        return response.get("documents", [])
    return list(response or [])


@_auto_client
@validate_call(config=VALIDATE_CONFIG)
def documentClassCounts(
    dataset_id: CloudId,
    *,
    client: _Client = None,
) -> dict[str, Any]:
    """GET /datasets/{datasetId}/document-class-counts

    Retrieve a flat histogram of documents in a dataset, grouped by the
    leaf ``data.document_class.class_name``. No inheritance roll-up is
    performed; for class-aware drill-downs use :func:`ndiquery` with
    the ``isa`` operator.

    MATLAB equivalent: +cloud/+api/+documents/documentClassCounts.m

    Args:
        dataset_id: The ID of the dataset to query.
        client: Authenticated cloud client (auto-created if omitted).

    Returns:
        Dict with fields ``datasetId``, ``totalDocuments``, and
        ``classCounts`` (a mapping of class name to integer count).
        Documents with missing/empty ``class_name`` are bucketed under
        ``'unknown'``.
    """
    endpoint = "/datasets/{datasetId}/document-class-counts"
    return client.get(endpoint, datasetId=dataset_id)


@_auto_client
@validate_call(config=VALIDATE_CONFIG)
def bulkUpload(
Expand Down
47 changes: 47 additions & 0 deletions src/ndi/cloud/api/ndi_matlab_python_bridge.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,53 @@ functions:
Python convenience that combines getBulkUploadURL + putFiles.
MATLAB does these as separate steps.

- name: bulkFetch
matlab_path: "+ndi/+cloud/+api/+documents/bulkFetch.m"
matlab_last_sync_hash: "bacdd0c3"
python_path: "ndi/cloud/api/documents.py"
input_arguments:
- name: dataset_id
type_matlab: "string"
type_python: "CloudId"
- name: doc_ids
type_matlab: "string array"
type_python: "list[str]"
- name: client
type_python: "_Client"
default: "None"
output_arguments:
- name: documents
type_python: "list[dict[str, Any]]"
decision_log: >
Synchronized with MATLAB main as of 2026-04-20. Synchronous bulk
fetch of up to 500 documents by ID via POST /datasets/{datasetId}
/documents/bulk-fetch. Mirrors MATLAB input validation: non-empty,
<= 500 entries, each a 24-character hex string. MATLAB returns
(b, answer, apiResponse, apiURL); Python returns only the documents
list (the 'answer'), consistent with other api.* wrappers that
delegate HTTP metadata to CloudClient.

- name: documentClassCounts
matlab_path: "+ndi/+cloud/+api/+documents/documentClassCounts.m"
matlab_last_sync_hash: "12bfe81"
python_path: "ndi/cloud/api/documents.py"
input_arguments:
- name: dataset_id
type_matlab: "string"
type_python: "CloudId"
- name: client
type_python: "_Client"
default: "None"
output_arguments:
- name: result
type_python: "dict[str, Any]"
decision_log: >
Synchronized with MATLAB main as of 2026-04-20. GET /datasets/
{datasetId}/document-class-counts. Returns a flat histogram of
leaf data.document_class.class_name with fields datasetId,
totalDocuments, and classCounts (a mapping of class name to int).
No inheritance roll-up is performed.

- name: getBulkUploadURL
matlab_path: "+ndi/+cloud/+api/+documents/getBulkUploadURL.m"
matlab_last_sync_hash: "9b75c0fe"
Expand Down
4 changes: 1 addition & 3 deletions src/ndi/gui/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,9 +139,7 @@ def _update_db_list(self) -> None:
try:
from ndi.query import ndi_query

doc_list = self._session.database_search(
ndi_query("document_class.class_name", "regex", "(.*)", "")
)
doc_list = self._session.database_search(ndi_query.all())
except Exception:
doc_list = []

Expand Down
91 changes: 91 additions & 0 deletions tests/test_cloud_api_documents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
"""Unit tests for ndi.cloud.api.documents — no network required."""

from __future__ import annotations

from unittest.mock import MagicMock

import pytest


def _make_client() -> MagicMock:
"""Return a mock CloudClient."""
client = MagicMock()
client.config.org_id = "org-123"
client.config.api_url = "https://api.ndi-cloud.com/v1"
return client


# --- 24-char hex helper for bulkFetch --------------------------------------
_HEX24_A = "a" * 24
_HEX24_B = "b" * 24


class TestBulkFetch:
    """bulkFetch validates inputs and POSTs to /documents/bulk-fetch."""

    def test_returns_documents_list(self):
        from ndi.cloud.api.documents import bulkFetch

        mock_client = _make_client()
        mock_client.post.return_value = {
            "documents": [{"id": _HEX24_A, "name": "d1"}, {"id": _HEX24_B, "name": "d2"}]
        }

        fetched = bulkFetch("ds-1", [_HEX24_A, _HEX24_B], client=mock_client)

        mock_client.post.assert_called_once()
        post_args, post_kwargs = mock_client.post.call_args
        assert post_args[0] == "/datasets/{datasetId}/documents/bulk-fetch"
        assert post_kwargs["datasetId"] == "ds-1"
        assert post_kwargs["json"] == {"documentIds": [_HEX24_A, _HEX24_B]}
        assert [doc["name"] for doc in fetched] == ["d1", "d2"]

    def test_empty_doc_ids_raises(self):
        from ndi.cloud.api.documents import bulkFetch

        with pytest.raises(ValueError, match="non-empty"):
            bulkFetch("ds-1", [], client=_make_client())

    def test_over_500_raises(self):
        from ndi.cloud.api.documents import bulkFetch

        too_many = [_HEX24_A] * 501
        with pytest.raises(ValueError, match="at most 500"):
            bulkFetch("ds-1", too_many, client=_make_client())

    def test_non_hex_id_raises(self):
        from ndi.cloud.api.documents import bulkFetch

        with pytest.raises(ValueError, match="24-character hex"):
            bulkFetch("ds-1", ["not-a-hex-id"], client=_make_client())

    def test_missing_documents_field_returns_empty(self):
        from ndi.cloud.api.documents import bulkFetch

        mock_client = _make_client()
        mock_client.post.return_value = {}
        assert bulkFetch("ds-1", [_HEX24_A], client=mock_client) == []


class TestDocumentClassCounts:
    """documentClassCounts GETs /document-class-counts and returns the struct."""

    def test_returns_response_dict(self):
        from ndi.cloud.api.documents import documentClassCounts

        mock_client = _make_client()
        mock_client.get.return_value = {
            "datasetId": "ds-1",
            "totalDocuments": 3,
            "classCounts": {"ndi_document_probe": 2, "unknown": 1},
        }

        counts = documentClassCounts("ds-1", client=mock_client)

        mock_client.get.assert_called_once()
        get_args, get_kwargs = mock_client.get.call_args
        assert get_args[0] == "/datasets/{datasetId}/document-class-counts"
        assert get_kwargs["datasetId"] == "ds-1"
        assert counts["totalDocuments"] == 3
        assert counts["classCounts"]["ndi_document_probe"] == 2
20 changes: 6 additions & 14 deletions tests/test_cloud_live.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,13 +794,9 @@ def test_ndiquery_public(self, client):
"""ndiquery should return documents matching a search."""
from ndi.cloud.api.documents import ndiquery

search = [
{
"field": "document_class.class_name",
"operation": "exact_string",
"param1": "session",
}
]
# Class filtering on the cloud must go through the 'isa' operator;
# the document_class.class_name field path is no longer searchable.
search = [{"field": "", "operation": "isa", "param1": "session"}]
result = _retry_on_server_error(
lambda: ndiquery("public", search, page=1, page_size=5, client=client)
)
Expand Down Expand Up @@ -828,13 +824,9 @@ def test_ndiqueryAll_paginates(self, client):
"""ndiqueryAll should auto-paginate results."""
from ndi.cloud.api.documents import ndiqueryAll

search = [
{
"field": "document_class.class_name",
"operation": "exact_string",
"param1": "session",
}
]
# Class filtering on the cloud must go through the 'isa' operator;
# the document_class.class_name field path is no longer searchable.
search = [{"field": "", "operation": "isa", "param1": "session"}]
result = _retry_on_server_error(
lambda: ndiqueryAll("public", search, page_size=3, client=client)
)
Expand Down
Loading