Merged. 52 commits, showing changes from all commits:
0a7b3cb
Add cloud readIngested integration test
claude Mar 30, 2026
6163c25
Add explicit cloud login using NDI_CLOUD_USERNAME/PASSWORD env vars
claude Mar 30, 2026
f206c8b
Add cloud credentials to CI workflow so cloud tests run
claude Mar 30, 2026
c6a39a6
Fix black formatting in test_cloud_read_ingested.py
claude Mar 30, 2026
0d643af
Skip compute tests on permission error; downgrade silent doc failures…
claude Mar 30, 2026
9c6ec27
Simplify downloadDataset validation to only check for missing remote …
claude Mar 30, 2026
4c903c0
Print document_class of missing docs; skip session/dataset types
claude Mar 30, 2026
1c19008
Add NDIcalc-vis-matlab as dependency for document definitions
claude Mar 30, 2026
93aad30
Add dataset_session_info and session_in_a_dataset to allowed missing …
claude Mar 30, 2026
407ac13
Fix _find_matching_epochprobemap when epochprobemap is a single object
claude Mar 30, 2026
a7b8835
Add NDI-compress-python dependency; improve test diagnostics
claude Mar 30, 2026
25f2d9a
Route ingested epoch reads through _ingested methods and cloud fetch
claude Mar 30, 2026
2c2fafe
Add diagnostic prints to carbonfiber test for debugging
claude Mar 30, 2026
a29816b
Improve diagnostics: print probe class, MRO, underlying type, devinfo
claude Mar 30, 2026
a5900bf
Expose hidden errors in readtimeseries chain
claude Mar 30, 2026
1993749
Fix _resolve_device to get DAQ system from epoch table entry
claude Mar 30, 2026
01bc935
Fix _get_daqsystems to create correct DAQ system subclass
claude Mar 30, 2026
318bae7
Default to ndi_daq_system_mfdaq when daqsystem class name is missing
claude Mar 30, 2026
deb3405
Fix getepochfiles tuple unpacking in system_mfdaq methods
claude Mar 30, 2026
8b416e1
Fall back to all available channels for sample rate in ingested epochs
claude Mar 30, 2026
fa99aae
Add epochtable sample_rate fallback and diagnostic for ingested data
claude Mar 30, 2026
e54dd07
Add ingested document structure diagnostics to test
claude Mar 30, 2026
78a393c
Rewrite ingested data reading to match MATLAB approach
claude Mar 31, 2026
8eb5cd0
Fix ChannelInfo.from_dict and standardize channel type matching
claude Mar 31, 2026
de20d2a
Add readevents_epochsamples_ingested matching MATLAB implementation
claude Mar 31, 2026
2fbddaf
Fix existing tests for samplerate_ingested tuple return; add diagnostics
claude Mar 31, 2026
333850c
Put all diagnostics in pytest.fail message instead of print
claude Mar 31, 2026
30a4faa
Propagate cloud_client to sessions; raise on channel read failure
claude Mar 31, 2026
b5592a1
Fix readFromFile to use vlt.file.loadStructArray for binary channel f…
claude Mar 31, 2026
8651ced
Fix readFromFile format detection; log segment read failures
claude Mar 31, 2026
2c47352
Unpack expand_ephys tuple return value
claude Mar 31, 2026
d70a268
Add debug prints for raw data values and scale/offset
claude Mar 31, 2026
46fa1fc
Fix underlying2scaled formula: (data - offset) * scale
claude Mar 31, 2026
1dafeec
Print t0_t1 and scaled data values for debugging sample position
claude Mar 31, 2026
fd24520
Add t0_t1 diagnostic; keep underlying2scaled formula
claude Mar 31, 2026
b2cbeb0
Sort epoch tables by epoch_id alphanumerically
claude Mar 31, 2026
dc42c0a
Fix 1-based to 0-based sample index conversion
claude Mar 31, 2026
9ff9a37
Convert all sample indices to 0-based Python convention
claude Mar 31, 2026
58cc2c3
Debug: read near t=10 to check sample alignment
claude Mar 31, 2026
6ae8f47
Fix t0_t1_ingested unpacking of flat [t0, t1] pairs
claude Mar 31, 2026
e757c27
Print all epoch IDs to verify epoch ordering
claude Mar 31, 2026
3964240
Fix channelgroupdecoding to return column indices, not channel numbers
claude Apr 1, 2026
2ef3e19
Fix import path for standardize_channel_type in mfdaq_epoch_channel
claude Apr 1, 2026
6ff702b
Update channelgroupdecoding test for 0-based group indices
claude Apr 1, 2026
0e5ca63
Fix stimulator: use epoch_number not epoch_id for DAQ system calls
claude Apr 1, 2026
be25d12
Replace all silent exception handlers with logging in stimulator
claude Apr 1, 2026
ec9063e
Add detailed stimulator diagnostics: print devinfo, try readevents
claude Apr 1, 2026
84b52b8
Fix stimulator getchanneldevinfo to use device_epoch_number
claude Apr 1, 2026
46782ea
Improve stimulator readevents diagnostic output
claude Apr 1, 2026
ab1046a
Stimulator: scan ALL underlying epochprobemaps for channels
claude Apr 1, 2026
7a9a202
Fix test to exclude md channels from readevents call
claude Apr 1, 2026
6dce4b2
Fix stimid extraction for multi-dimensional numpy array
claude Apr 1, 2026
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
@@ -52,6 +52,9 @@ jobs:
run: python -m ndi check

- name: Run tests with coverage
env:
NDI_CLOUD_USERNAME: ${{ secrets.TEST_USER_2_USERNAME }}
NDI_CLOUD_PASSWORD: ${{ secrets.TEST_USER_2_PASSWORD }}
run: |
# Use sys.monitoring (PEP 669) on Python 3.12+ for faster coverage.
# CTracer (sys.settrace) is catastrophically slow on 3.12 when
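The CI change above injects `NDI_CLOUD_USERNAME` and `NDI_CLOUD_PASSWORD` from repository secrets so the cloud tests can run. A minimal sketch of the test-side counterpart, a guard that detects whether both credentials are present (the helper name is illustrative, not from the codebase):

```python
import os

def cloud_credentials_available(env=os.environ) -> bool:
    """True when both cloud credential variables are set and non-empty.

    Tests can use this to skip cloud integration tests locally while
    still running them in CI, where the workflow sets both variables.
    """
    return bool(env.get("NDI_CLOUD_USERNAME")) and bool(env.get("NDI_CLOUD_PASSWORD"))
```

A typical use would be a `pytest.mark.skipif(not cloud_credentials_available(), ...)` decorator, so forks without the secrets skip cleanly instead of failing.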
67 changes: 67 additions & 0 deletions ndi_install.py
@@ -41,6 +41,14 @@
"python_path": ".",
"description": "VH-Lab data utilities and file formats (not on PyPI)",
},
{
"name": "NDIcalc-vis-matlab",
"repo": "https://github.com/VH-Lab/NDIcalc-vis-matlab.git",
"branch": "main",
"python_path": "",
"ndi_common": True,
"description": "NDI calculator and visualization document definitions",
},
]

DEFAULT_TOOLS_DIR = Path.home() / ".ndi" / "tools"
@@ -268,6 +276,8 @@ def write_pth_file(site_packages: Path, tools_dir: Path) -> Path | None:
lines = []

for dep in DEPENDENCIES:
if not dep.get("python_path"):
continue # No Python code to add to path
dep_dir = tools_dir / dep["name"]
python_path = dep_dir / dep["python_path"] if dep["python_path"] != "." else dep_dir
if python_path.is_dir():
@@ -290,6 +300,56 @@ def write_pth_file(site_packages: Path, tools_dir: Path) -> Path | None:
return None


# ---------------------------------------------------------------------------
# ndi_common document definitions from external dependencies
# ---------------------------------------------------------------------------


def install_ndi_common_docs(tools_dir: Path, ndi_root: Path) -> bool:
"""Copy ndi_common/{database,schema}_documents from external deps.

Some dependencies (e.g. NDIcalc-vis-matlab) ship document type
definitions that NDI-python needs at runtime. This copies their
``ndi_common/database_documents`` and ``ndi_common/schema_documents``
trees into NDI-python's own ``ndi_common`` folder so they are
discoverable via ``ndi_common_PathConstants.DOCUMENT_PATH``.
"""
import shutil

ndi_common = ndi_root / "src" / "ndi" / "ndi_common"
ok = True

for dep in DEPENDENCIES:
if not dep.get("ndi_common"):
continue
dep_dir = tools_dir / dep["name"]
dep_common = dep_dir / "ndi_common"
if not dep_common.is_dir():
warn(f"{dep['name']}: ndi_common folder not found at {dep_common}")
ok = False
continue

for sub in ("database_documents", "schema_documents"):
src = dep_common / sub
dst = ndi_common / sub
if not src.is_dir():
continue
count = 0
for src_file in src.rglob("*"):
if src_file.is_dir():
continue
rel = src_file.relative_to(src)
dst_file = dst / rel
dst_file.parent.mkdir(parents=True, exist_ok=True)
shutil.copy2(src_file, dst_file)
count += 1
detail(f"Copied {count} {sub} files from {dep['name']}")

success(f"Installed document definitions from {dep['name']}")

return ok


# ---------------------------------------------------------------------------
# pip installation
# ---------------------------------------------------------------------------
@@ -529,6 +589,8 @@ def main() -> int:
fail("Could not find site-packages directory")
warn("You may need to set PYTHONPATH manually:")
for dep in DEPENDENCIES:
if not dep.get("python_path"):
continue
dep_dir = tools_dir / dep["name"]
python_path = dep_dir / dep["python_path"] if dep["python_path"] != "." else dep_dir
warn(f" {python_path}")
@@ -546,6 +608,8 @@ def main() -> int:
importlib.reload(site)
# Add paths directly for this process
for dep in DEPENDENCIES:
if not dep.get("python_path"):
continue # No Python code to add to path
dep_dir = tools_dir / dep["name"]
python_path = (
str(dep_dir / dep["python_path"]) if dep["python_path"] != "." else str(dep_dir)
@@ -564,6 +628,9 @@ def main() -> int:
if not install_ndi_and_deps(ndi_root, include_dev=args.dev):
warn("Some packages may not have installed correctly")

# Copy document definitions from external dependencies
install_ndi_common_docs(tools_dir, ndi_root)

# ── Step 5: Validate ───────────────────────────────────────────────
if args.no_validate:
print("\n[5/5] Validation skipped (--no-validate)")
1 change: 1 addition & 0 deletions pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
"did @ git+https://github.com/VH-Lab/DID-python.git@main",
"ndr @ git+https://github.com/VH-lab/NDR-python.git@main",
"vhlab-toolbox-python @ git+https://github.com/VH-Lab/vhlab-toolbox-python.git@main",
"ndi-compress @ git+https://github.com/Waltham-Data-Science/NDI-compress-python.git@main",
"numpy>=1.20.0",
"networkx>=2.6",
"jsonschema>=4.0.0",
120 changes: 55 additions & 65 deletions src/ndi/cloud/orchestration.py
@@ -84,7 +84,6 @@ def downloadDataset(
from ndi.dataset import ndi_dataset_dir

documents = jsons2documents(doc_jsons)
conversion_lost = len(doc_jsons) - len(documents)
dataset = ndi_dataset_dir("", target, documents=documents)

# Create remote link document if not already present
@@ -113,85 +112,76 @@
if verbose:
print(f' Files downloaded: {report["downloaded"]}, failed: {report["failed"]}')

# Collect failures: conversion + exception-tracked + silent (DID-python)
add_failures: list[tuple[str, str]] = list(getattr(dataset, "add_doc_failures", []))

# Cross-check using raw DID-python doc IDs (not isa('base') query,
# which might miss documents whose type info wasn't stored correctly).
# Verify every downloaded document made it into the local database.
# The local dataset may have *more* documents (e.g. session and
# session-in-a-dataset docs created internally), so we only check
# that every remote doc ID is present locally.
db_ids = set(
dataset._session._database._driver._db.get_doc_ids(
dataset._session._database._driver._branch_id
)
)

# Build a map from doc_id -> original JSON for missing-doc output
doc_json_by_id: dict[str, dict] = {}
missing: list[str] = []
missing_jsons: list[dict] = []
for dj in doc_jsons:
did = dj.get("base", {}).get("id", "") if isinstance(dj, dict) else ""
if did:
doc_json_by_id[did] = dj

# Find documents that were "added" (no exception) but aren't in the DB
tracked_ids = {f[0] for f in add_failures}
silent_failures: list[str] = []
for doc in documents:
doc_id = (
doc.document_properties.get("base", {}).get("id", "")
if hasattr(doc, "document_properties")
else doc.get("base", {}).get("id", "")
)
if doc_id and doc_id not in db_ids and doc_id not in tracked_ids:
silent_failures.append(doc_id)

total_lost = conversion_lost + len(add_failures) + len(silent_failures)
if did and did not in db_ids:
missing.append(did)
missing_jsons.append(dj)

if verbose:
print("Download complete.")

if total_lost > 0:
# Write missing documents to a JSON file for inspection
missing_docs_path = target / "missingDocuments.json"
missing_docs = []
for doc_id in silent_failures:
if doc_id in doc_json_by_id:
missing_docs.append(doc_json_by_id[doc_id])
if missing:
# Print the document_class of each missing doc for diagnostics.
# Session/dataset docs from older datasets are expected to be
# missing (superseded by docs created locally during dataset init).
session_dataset_types = {
"ndi_session",
"ndi_dataset",
"session",
"dataset",
"session_in_a_dataset",
"dataset_session_info",
}
real_missing: list[tuple[str, str]] = []
for doc_id, dj in zip(missing, missing_jsons):
doc_class = (
dj.get("document_class", {}).get("class_name", "") if isinstance(dj, dict) else ""
)
superclasses = (
dj.get("document_class", {}).get("superclasses", []) if isinstance(dj, dict) else []
)
all_types = {doc_class} | {
sc.get("class_name", "") if isinstance(sc, dict) else str(sc)
for sc in (superclasses if isinstance(superclasses, list) else [])
}
if all_types & session_dataset_types:
print(
f" Note: remote doc {doc_id} (class: {doc_class}) "
f"not in local DB — expected for session/dataset docs"
)
else:
missing_docs.append({"base": {"id": doc_id}})
for doc_id, reason in add_failures:
entry = dict(doc_json_by_id.get(doc_id, {"base": {"id": doc_id}}))
entry["_add_error"] = reason
missing_docs.append(entry)
if missing_docs:
import json
print(f" WARNING: remote doc {doc_id} (class: {doc_class}) missing from local DB")
real_missing.append((doc_id, doc_class))

missing_docs_path.write_text(json.dumps(missing_docs, indent=2, default=str))
if real_missing:
missing_docs_path = target / "missingDocuments.json"
import json

lines = [
f"Downloaded {len(doc_jsons)} documents but only "
f"{len(db_ids)} were added to the dataset. "
f"{total_lost} document(s) lost:"
]
if conversion_lost > 0:
lines.append(f"\n{conversion_lost} failed to convert from JSON" " to ndi_document")
if add_failures:
lines.append(f"\n{len(add_failures)} raised errors during" " database add:")
for doc_id, reason in add_failures[:50]:
lines.append(f"\n - {doc_id}: {reason}")
if len(add_failures) > 50:
lines.append(f"\n ... and {len(add_failures) - 50} more")
if silent_failures:
lines.append(
f"\n{len(silent_failures)} were passed to"
" database.add() without error but are NOT in the"
" database (possible DID-python bug):"
)
for doc_id in silent_failures[:50]:
lines.append(f"\n - {doc_id}")
if len(silent_failures) > 50:
lines.append(f"\n ... and {len(silent_failures) - 50} more")
if missing_docs:
lines.append(f"\nFull JSON of missing documents written to:" f"\n {missing_docs_path}")
raise RuntimeError("".join(lines))
missing_docs_path.write_text(json.dumps(missing_jsons, indent=2, default=str))

lines = [
f"Downloaded {len(doc_jsons)} documents but "
f"{len(real_missing)} are missing from the local dataset:"
]
for doc_id, doc_class in real_missing[:50]:
lines.append(f"\n - {doc_id} (class: {doc_class})")
if len(real_missing) > 50:
lines.append(f"\n ... and {len(real_missing) - 50} more")
lines.append(f"\nFull JSON of missing documents written to:\n {missing_docs_path}")
raise RuntimeError("".join(lines))

return dataset

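The classification step in `downloadDataset` above decides whether a missing remote document is expected (session/dataset bookkeeping docs superseded locally) by checking its class name and superclass names against a fixed set. A standalone sketch of that check, under the assumption that the document JSON follows the `document_class`/`superclasses` shape shown in the diff:

```python
SESSION_DATASET_TYPES = {
    "ndi_session", "ndi_dataset", "session", "dataset",
    "session_in_a_dataset", "dataset_session_info",
}

def is_expected_missing(doc_json: dict) -> bool:
    """True when the doc's class or any superclass is a session/dataset
    type, mirroring the set-intersection check in downloadDataset."""
    dc = doc_json.get("document_class", {}) if isinstance(doc_json, dict) else {}
    names = {dc.get("class_name", "")}
    for sc in dc.get("superclasses", []) or []:
        # superclasses may be dicts with a class_name key, or bare strings
        names.add(sc.get("class_name", "") if isinstance(sc, dict) else str(sc))
    return bool(names & SESSION_DATASET_TYPES)
```

Docs for which this returns True get a note; anything else is a real loss and triggers the `RuntimeError` with the `missingDocuments.json` dump.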