From 0edd05895310c7dea68528811aa5210dbb277c65 Mon Sep 17 00:00:00 2001
From: Aishwarya Sinhasane
Date: Fri, 10 Oct 2025 12:34:53 -0400
Subject: [PATCH 1/2] testing asct-b digital objects documentation added

---
 TestingREADME.md                         |  96 ++++++++
 .../asct-b-utils/api.functions.js        |  14 +-
 testing/compare_and_mismatched_reason.py | 226 ++++++++++++++++++
 testing/test-digital-objects.sh          | 172 +++++++++++++
 4 files changed, 507 insertions(+), 1 deletion(-)
 create mode 100644 TestingREADME.md
 create mode 100644 testing/compare_and_mismatched_reason.py
 create mode 100755 testing/test-digital-objects.sh

diff --git a/TestingREADME.md b/TestingREADME.md
new file mode 100644
index 00000000..832267a7
--- /dev/null
+++ b/TestingREADME.md
@@ -0,0 +1,96 @@
# HRA-DO Processor – Testing and Debugging Notes

## 1. Environment Setup and Debugging

If you encounter a **`riot class not found`** error even when using Java 11, ensure that Apache Jena is properly configured in your environment.

Run the following commands in your terminal:

```bash
export JAVA_HOME=$(/usr/libexec/java_home -v 11)
export PATH="$JAVA_HOME/bin:$PATH"
export JENA_HOME="$(pwd)/.venv/opt/apache-jena"
export PATH="$JENA_HOME/bin:$PATH"
hash -r
which riot
```

✅ **Expected output:**
```
.venv/opt/apache-jena/bin/riot
```

If `which riot` points to another location, your shell is resolving `riot` from a different Jena installation. Make sure `.venv/opt/apache-jena/bin` appears first in your `PATH`.

---
## 2. Run Testing Script
The main automation script is located at:
```
testing/test-digital-objects.sh
```
It must be **executed from the root of the repository**, not from inside the `testing` folder.
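Before running, it can help to confirm that the Jena `bin` directory from section 1 actually won `PATH` precedence. A minimal, self-contained sketch (the `.venv/opt/apache-jena` location is the layout assumed above):

```shell
# Prepend the venv-local Jena bin dir (assumed location, matching section 1)
# and confirm it is present in PATH before anything else is consulted.
JENA_HOME="$(pwd)/.venv/opt/apache-jena"
PATH="$JENA_HOME/bin:$PATH"
case ":$PATH:" in
  *":$JENA_HOME/bin:"*) echo "jena-on-path" ;;            # → prints "jena-on-path"
  *)                    echo "jena-missing-from-path" ;;
esac
```

If this prints `jena-missing-from-path`, re-run the exports above and then `hash -r`.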
## 🧭 Paths to Update Before Running

Open the script and review these key variables near the top:

| Variable | Current Default | What to Change |
|-----------|------------------|----------------|
| `source` | `/home/hra-do-processor/.venv/bin/activate` | Update if your virtual environment path is different |
| `DO_PATH` | `/home/hra-do-processor/digital-objects/asct-b/blood-pelvis/v1.4` | Change to the specific Digital Object directory you want to test |
| `COMPARE_SCRIPT` | `compare_and_mismatched_reason.py` | Usually no change needed; the script sits in the `testing/` folder by default. Update only if you keep it elsewhere |

If your project is in a different location, simply replace `/home/hra-do-processor` with your actual path.

---
## 🚀 How to Run

From the **project root**, run:
```bash
bash testing/test-digital-objects.sh
```

## ⚙️ What the Script Does

This script automates the full **HRA Digital Object (DO)** processing workflow using the `do-processor` CLI and a comparison script.

It performs the following steps:

| Step | Command | Description |
|------|----------|-------------|
| 1️⃣ | `do-processor normalize` | Normalizes the raw CSV data into standardized YAML form |
| 2️⃣ | `do-processor enrich` | Adds ontology links and metadata enrichment |
| 3️⃣ | `do-processor build` | Builds deployable RDF/JSON artifacts |
| 4️⃣ | `do-processor reconstruct` | Reconstructs CSV from normalized data |
| 5️⃣ | `compare_and_mismatched_reason.py` | Compares **raw → normalized → reconstructed** outputs and summarizes mismatches |

Outputs are stored under the Digital Object’s own folders:
```
normalized/
enriched/
reconstructed/
  columns_only_in_raw.csv
  columns_only_in_reconstructed.csv
  value_mismatches.csv
  value_mismatches_explained.csv
```

---

## 3. Digital Object Testing Summary

| **Digital Object** | **Columns Dropped (only in raw)** | **Key Observations** | **Suggested Changes** |
|--------------------|-----------------------------------|----------------------|------------------------|
| **Allen Brain** | `all_gene_biomarkers` (others empty/dropped) | - Most diffs due to normalization transforms (2,759 records)<br>- `all_gene_biomarkers` dropped during normalize (comma-separated values)<br>- Missing RDFS labels for some LOC IDs | - Add `all_gene_biomarkers` field to schema (array of strings)<br>- Update normalize step to keep and parse this column<br>- Implement ontology label fallback for LOC IDs |
| **Blood – Pelvis** | `all_gene_biomarkers`, `ftu/1`, `ftu/1/id`, `ftu/1/label`, `ref/2`, `ref/2/id`, `ref/2/notes` | - Most diffs from normalization transforms (174)<br>- Gene/protein labels standardized (e.g., “CD19 molecule” → “CD19”)<br>- Metadata order mismatches<br>- Example filtered rows: `bgene/10/label` rows 28–29 (“tryptophanyl-tRNA synthetase 1” → “WARS1”) filtered at normalize | - Keep normalization label standardization<br>- Add filter exception handling for `bgene/*/label` when raw not found in `normalized.yaml` |
| **Kidney** | `bprotein/4`, `bprotein/4/id`, `bprotein/4/label`, `bprotein/4/notes`, `ct/1/abbr`, `ct/2/abbr`, `ftu/2/id/notes` | - Diffs from normalization (415) + mapping/format (313)<br>- `bprotein/4*` dropped due to invalid ID format<br>- `ct/*/abbr` dropped across DOs (missing in schema)<br>- `ftu/2/id/notes` empty | 1️⃣ Generate or repair valid IDs for `bprotein/4` items before normalize<br>2️⃣ Add `ct/*/abbr` to schema<br>3️⃣ Allowlist raw-only fields if needed<br>4️⃣ Review 313 mapping/format diffs for consistent URI/CURIE formatting |
| **Large Intestine** | `bprotein/6/id` (HGNC:1678 rows) | - `HGNC:1678` dropped because normalizer found no RDFS label<br>- 7,345 mismatch rows: 4,115 filtered, 2,970 transformed during normalize<br>- Diffs mainly due to normalization (not reconstruction bugs) | - Add fallback for missing RDFS label to retain ID<br>- Review normalization logic for label lookup |
| **Heart** | `combined_gene_markers` | - Combined gene marker values split/dropped by normalizer<br>- Array cells like `GENE1;GENE2` treated as single biomarker or dropped if lookup fails | - Add splitting logic in `setData()` to handle multi-marker cells<br>- Ensure `GENE1;GENE2` → two biomarker entries |
| **Pancreas** | — | - Entries previously appeared shifted due to normalization reordering<br>- Comparator now uses stable key to fix alignment | - Continue using stable key comparator to prevent array misalignment |

---

## 4. Summary

- **Normalization step** is the main source of differences across digital objects.
- **Schema gaps** (like missing `abbr` fields or combined marker arrays) must be addressed for consistency.
- **Ontology label lookups** (e.g., LOC IDs, HGNC symbols) should implement **fallbacks** to prevent data loss during normalization.

diff --git a/src/normalization/asct-b-utils/api.functions.js b/src/normalization/asct-b-utils/api.functions.js
index b030ac76..a1284e1c 100644
--- a/src/normalization/asct-b-utils/api.functions.js
+++ b/src/normalization/asct-b-utils/api.functions.js
@@ -73,7 +73,18 @@ function setData(column, columnNumber, row, value, warnings) {
     if (objectArray.length === 0 && arrayName) {
       row[arrayName] = objectArray;
     }
-    objectArray.push(createObject(value, originalArrayName));
+    // Split combined biomarker tokens (if this is a biomarker array) into separate objects.
+    const biomarkerArrays = new Set(['BG','BGENE','BP','BPROTEIN','BL','BLIPID','BM','BMETABOLITES','BF','BPROTEOFORM']);
+    let tokens = [value];
+    if (value && typeof value === 'string' && biomarkerArrays.has(originalArrayName.toUpperCase())) {
+      const escapeForCharClass = (s) => s.replace(/[-\\\]^]/g, '\\$&');
+      const delimChars = escapeForCharClass(DELIMETER) + ',|';
+      const separators = new RegExp('[' + delimChars + ']+');
+      tokens = value.split(separators).map((s) => s.trim()).filter(Boolean);
+    }
+    for (const token of tokens) {
+      objectArray.push(createObject(token, originalArrayName));
+    }
   } else if (column.length === 3 && arrayName) {
     let arrayIndex = parseInt(column[1], 10) - 1;
     const fieldName = objectFieldMap[column[2]]; // || (column[2]?.toLowerCase() ?? '').trim();
@@ -100,6 +111,7 @@ function setData(column, columnNumber, row, value, warnings) {
   }
 }

+const invalidCharacterRegex = /_/gi;
 const isLinkRegex = /^http/gi;
 const codepointUppercaseA = 65;

diff --git a/testing/compare_and_mismatched_reason.py b/testing/compare_and_mismatched_reason.py
new file mode 100644
index 00000000..0750dc09
--- /dev/null
+++ b/testing/compare_and_mismatched_reason.py
@@ -0,0 +1,226 @@
# Robust RAW vs RECONSTRUCTED mismatch explainer for HRA DOs.
# - Uses line-based CSV parsing to avoid pandas delimiter issues.
# - Assumes the header is on line --header (1-based, default 11).
# - Produces:
#     columns_only_in_raw.csv
#     columns_only_in_reconstructed.csv
#     value_mismatches.csv
#     value_mismatches_explained.csv  (adds "reason" + "evidence")
#
# Example:
#   python3 compare_and_mismatched_reason.py \
#     --raw "/.../asct-b/kidney/v1.6/raw/asct-b-vh-kidney.csv" \
#     --recon "/.../asct-b/kidney/v1.6/reconstructed/reconstructed.csv" \
#     --normalized "/.../asct-b/kidney/v1.6/normalized/normalized.yaml" \
#     --warnings "/.../asct-b/kidney/v1.6/normalized/warnings.yaml" \
#     --header 11

from __future__ import annotations
from pathlib import Path
import argparse
import csv
import sys
import pandas as pd

# -------- Config toggles --------
NORMALIZE_TEXT = True  # collapse whitespace in cell values
# --------------------------------

def canon_text(x):
    if isinstance(x, str):
        x = x.strip()
        if NORMALIZE_TEXT:
            x = " ".join(x.split())
    return x

def read_all_lines(path: Path, encoding: str) -> list[str]:
    with path.open("r", encoding=encoding, errors="replace", newline="") as f:
        return f.read().splitlines()

def choose_delimiter(header_line: str) -> str:
    # Pick the delimiter that yields the most fields for the header row
    candidates = [",", "\t", ";", "|"]
    best = ","
    best_n = -1
    for d in candidates:
        row = next(csv.reader([header_line], delimiter=d, quotechar='"', escapechar="\\"))
        if len(row) > best_n:
            best_n = len(row)
            best = d
    return best

def parse_csv_lines(lines: list[str], delimiter: str) -> list[list[str]]:
    reader = csv.reader(lines, delimiter=delimiter, quotechar='"', escapechar="\\")
    return [row for row in reader]

def pad_to_width(rows: list[list[str]]) -> list[list[str]]:
    width = max((len(r) for r in rows), default=0)
    return [r + [""] * (width - len(r)) for r in rows]

def norm_col_name(s: str, idx: int) -> str:
    name = "_".join(s.strip().split()).lower()
    return name if name else f"col_{idx+1}"

def build_df_from_file(path: Path, header_row_1based: int, encoding: str):
    lines = read_all_lines(path, encoding)
    if len(lines) < header_row_1based:
        raise ValueError(f"{path} has only {len(lines)} lines; cannot use line {header_row_1based} as header.")

    header_line = lines[header_row_1based - 1]
    delim = choose_delimiter(header_line)

    rows = parse_csv_lines(lines, delim)
    rows = pad_to_width(rows)

    header_idx0 = header_row_1based - 1
    header_row = rows[header_idx0]

    # Build unique, normalized column names
    cols, used = [], set()
    for i, raw_name in enumerate(header_row):
        name = norm_col_name(raw_name, i)
        base, k = name, 2
        while name in used:
            name = f"{base}_{k}"; k += 1
        used.add(name)
        cols.append(name)

    data_rows = rows[header_idx0 + 1:]
    df = pd.DataFrame(data_rows, columns=cols)
    df = df.map(canon_text)

    info = {
        "path": str(path),
        "n_lines": len(lines),
        "delimiter": repr(delim),
        "n_rows": df.shape[0],
        "n_cols": df.shape[1],
        "n_header_cols": len(cols),
    }
    return df, info

def load_text(path: Path) -> str:
    try:
        return path.read_text(encoding="utf-8", errors="replace")
    except Exception:
        return ""

def contains_text(hay: str, needle: str) -> bool:
    if not needle:
        return False
    # normalize both for a fair contains check
    H = canon_text(hay)
    N = canon_text(needle)
    return N in H if (H is not None and N is not None) else False

def explain_reason(raw_val: str, recon_val: str, normalized_text: str, warnings_text: str) -> tuple[str, str]:
    """
    Heuristic reason assignment:
      - Dropped during normalize (warning): warnings.yaml mentions raw_val
      - Filtered at normalize: raw_val absent in normalized.yaml
      - Transformed during normalize: recon_val present, raw_val absent in normalized.yaml
      - Reconstruction mapping/formatting difference: raw_val present in normalized.yaml but differs in recon
      - Indeterminate: fallback
    """
    if contains_text(warnings_text, raw_val):
        return "Dropped during normalize (warning)", "warnings.yaml contains RAW value"

    in_norm_raw = contains_text(normalized_text, raw_val)
    in_norm_recon = contains_text(normalized_text, recon_val)

    if not in_norm_raw:
        if in_norm_recon and raw_val != recon_val:
            return "Transformed during normalize", "normalized.yaml contains RECON value, not RAW"
        return "Filtered at normalize", "RAW value not found in normalized.yaml"

    # RAW present in normalized; RECON differs
    if raw_val != recon_val:
        if in_norm_recon:
            return "Reconstruction mapping/formatting difference", "Both RAW and RECON appear in normalized.yaml"
        return "Reconstruction mapping/formatting difference", "RAW present in normalized.yaml but RECON differs"

    return "Indeterminate", ""

def parse_args():
    ap = argparse.ArgumentParser(description="Explain RAW vs RECON mismatches using normalized + warnings.")
    ap.add_argument("--raw", required=False, default=None, help="Path to RAW CSV")
    ap.add_argument("--recon", required=False, default=None, help="Path to reconstructed CSV")
    ap.add_argument("--normalized", required=False, default=None, help="Path to normalized.yaml")
    ap.add_argument("--warnings", required=False, default=None, help="Path to warnings.yaml")
    ap.add_argument("--header", type=int, default=11, help="Header line number (1-based). Default: 11")
    ap.add_argument("--encoding", default="utf-8", help="Text encoding. Default: utf-8")
    return ap.parse_args()

def main():
    args = parse_args()

    # If paths are not provided, fall back to the lung v1.5 defaults below (easy to override with flags)
    raw_p = Path(args.raw) if args.raw else Path("/Users/aishwarya/CNS-Code/hra-do-processor/digital-objects/asct-b/lung/v1.5/raw/asct-b-vh-lung.csv")
    recon_p = Path(args.recon) if args.recon else Path("/Users/aishwarya/CNS-Code/hra-do-processor/digital-objects/asct-b/lung/v1.5/reconstructed/reconstructed.csv")
    normalized_p = Path(args.normalized) if args.normalized else Path("/Users/aishwarya/CNS-Code/hra-do-processor/digital-objects/asct-b/lung/v1.5/normalized/normalized.yaml")
    warnings_p = Path(args.warnings) if args.warnings else Path("/Users/aishwarya/CNS-Code/hra-do-processor/digital-objects/asct-b/lung/v1.5/normalized/warnings.yaml")

    # Build dataframes using the robust reader
    raw_df, raw_info = build_df_from_file(raw_p, args.header, args.encoding)
    recon_df, recon_info = build_df_from_file(recon_p, args.header, args.encoding)

    print("RAW info  :", raw_info)
    print("RECON info:", recon_info)

    # Column presence
    raw_cols = set(raw_df.columns)
    recon_cols = set(recon_df.columns)
    only_in_raw = sorted(raw_cols - recon_cols)
    only_in_recon = sorted(recon_cols - raw_cols)
    in_both = sorted(raw_cols & recon_cols)

    out_dir = recon_p.parent
    out_dir.mkdir(parents=True, exist_ok=True)

    pd.DataFrame({"column_only_in_raw": only_in_raw}).to_csv(out_dir / "columns_only_in_raw.csv", index=False)
    pd.DataFrame({"column_only_in_reconstructed": only_in_recon}).to_csv(out_dir / "columns_only_in_reconstructed.csv", index=False)

    # Compare values in shared columns (row-wise up to min rows)
    n = min(len(raw_df), len(recon_df))
    raw_c = raw_df.iloc[:n].reset_index(drop=True)
    recon_c = recon_df.iloc[:n].reset_index(drop=True)

    # Save raw mismatches (no reasons) for reference
    mismatches_plain = []
    for col in in_both:
        diffs = raw_c[col] != recon_c[col]
        if diffs.any():
            for i in diffs[diffs].index.tolist():
                mismatches_plain.append({
                    "column": col,
                    "row_number_in_file": args.header + 1 + i,
                    "raw_value": raw_c.at[i, col],
                    "reconstructed_value": recon_c.at[i, col],
                })
    pd.DataFrame(mismatches_plain).to_csv(out_dir / "value_mismatches.csv", index=False)

    # Load normalized & warnings text to derive reasons
    normalized_text = load_text(normalized_p)
    warnings_text = load_text(warnings_p)

    explained = []
    for row in mismatches_plain:
        reason, evidence = explain_reason(row["raw_value"], row["reconstructed_value"], normalized_text, warnings_text)
        row2 = dict(row)
        row2["reason"] = reason
        row2["evidence"] = evidence
        explained.append(row2)

    pd.DataFrame(explained).to_csv(out_dir / "value_mismatches_explained.csv", index=False)

    print("Saved:")
    print(f"  - {out_dir/'columns_only_in_raw.csv'}")
    print(f"  - {out_dir/'columns_only_in_reconstructed.csv'}")
    print(f"  - {out_dir/'value_mismatches.csv'}")
    print(f"  - {out_dir/'value_mismatches_explained.csv'}")

if __name__ == "__main__":
    try:
        main()
    except BrokenPipeError:
        sys.exit(0)

diff --git a/testing/test-digital-objects.sh b/testing/test-digital-objects.sh
new file mode 100755
index 00000000..e1f2ae50
--- /dev/null
+++ b/testing/test-digital-objects.sh
@@ -0,0 +1,172 @@
#!/usr/bin/env bash
set -euo pipefail

# ------------------------------------------------------------
# Usage:
#   ./testing/test-digital-objects.sh [DO_PATH] [--base-iri URL] [--processor-home DIR] [--raw-file FILE] [--python PY]
#
# Defaults:
#   DO_PATH          = the blood-pelvis v1.4 path set below
#   --base-iri       = https://purl.humanatlas.io
#   --processor-home = "$(pwd)"
#   --raw-file       = auto-detected: first CSV in "$DO_PATH/raw/*.csv"
#   --python         = python (from current shell/venv)
#
# Steps:
#   1) do-processor normalize
#   2) do-processor enrich
#   3) do-processor build
#   4) do-processor reconstruct
#   5) python compare_and_mismatched_reason.py --raw --recon --normalized --warnings
# ------------------------------------------------------------
# Update this path to your local checkout (see TestingREADME.md):
source /Users/aishwarya/CNS-Code/hra-do-processor/.venv/bin/activate
DO_PATH="${1:-/Users/aishwarya/CNS-Code/hra/hra-do-processor/digital-objects/asct-b/blood-pelvis/v1.4}"
shift || true

BASE_IRI="https://purl.humanatlas.io"
PROCESSOR_HOME="$(pwd)"
RAW_FILE=""
PYTHON_BIN="python"

# Parse optional flags
while [[ $# -gt 0 ]]; do
  case "$1" in
    --base-iri)
      BASE_IRI="${2:?need value for --base-iri}"; shift 2;;
    --processor-home)
      PROCESSOR_HOME="${2:?need value for --processor-home}"; shift 2;;
    --raw-file)
      RAW_FILE="${2:?need value for --raw-file}"; shift 2;;
    --python)
      PYTHON_BIN="${2:?need value for --python}"; shift 2;;
    -h|--help)
      echo "Usage: $0 [DO_PATH] [--base-iri URL] [--processor-home DIR] [--raw-file FILE] [--python PY]"
      exit 0;;
    *)
      echo "Unknown option: $1" >&2; exit 1;;
  esac
done

# Resolve important paths
RAW_DIR="${DO_PATH%/}/raw"
NORM_DIR="${DO_PATH%/}/normalized"
ENRICH_DIR="${DO_PATH%/}/enriched"
RECON_DIR="${DO_PATH%/}/reconstructed"

NORM_META_YAML="${NORM_DIR}/normalized-metadata.yaml"
NORM_MAIN_YAML="${NORM_DIR}/normalized.yaml"
WARNINGS_YAML="${NORM_DIR}/warnings.yaml"
RECON_CSV="${RECON_DIR}/reconstructed.csv"

# Pick a raw CSV if not provided: the first CSV in raw/
if [[ -z "${RAW_FILE}" ]]; then
  shopt -s nullglob
  RAW_CANDIDATES=("${RAW_DIR}"/*.csv)
  shopt -u nullglob
  if [[ ${#RAW_CANDIDATES[@]} -eq 0 ]]; then
    echo "ERROR: Could not auto-detect a raw CSV in ${RAW_DIR}. Use --raw-file to specify it." >&2
    exit 1
  fi
  RAW_FILE="${RAW_CANDIDATES[0]}"
fi

echo "==> DO_PATH        : ${DO_PATH}"
echo "==> BASE_IRI       : ${BASE_IRI}"
echo "==> PROCESSOR_HOME : ${PROCESSOR_HOME}"
echo "==> RAW_FILE       : ${RAW_FILE}"
echo "==> PYTHON_BIN     : ${PYTHON_BIN}"
echo

# 1) Normalize
echo ">>> NORMALIZE"
do-processor normalize "${DO_PATH}" \
  --base-iri "${BASE_IRI}" \
  --exclude-bad-values \
  --processor-home "${PROCESSOR_HOME}"

# sanity: normalized artifacts
if [[ ! -f "${NORM_META_YAML}" && ! -f "${NORM_MAIN_YAML}" ]]; then
  echo "ERROR: normalize did not create ${NORM_META_YAML} or ${NORM_MAIN_YAML}" >&2
  exit 1
fi

# 2) Enrich
echo ">>> ENRICH"
do-processor enrich "${DO_PATH}" \
  --base-iri "${BASE_IRI}" \
  --exclude-bad-values \
  --processor-home "${PROCESSOR_HOME}"

# 3) Build (this may regenerate deployables)
echo ">>> BUILD"
do-processor build "${DO_PATH}" \
  --base-iri "${BASE_IRI}" \
  --exclude-bad-values \
  --processor-home "${PROCESSOR_HOME}"

# 4) Reconstruct
echo ">>> RECONSTRUCT"
do-processor reconstruct "${DO_PATH}" \
  --processor-home "${PROCESSOR_HOME}"

# sanity: reconstructed CSV
if [[ ! -f "${RECON_CSV}" ]]; then
  echo "ERROR: reconstruct did not create ${RECON_CSV}" >&2
  exit 1
fi

# 5) Compare & mismatched reason script
#    Uses: --raw, --recon, --normalized, --warnings
#    normalized: prefer normalized.yaml if present, else normalized-metadata.yaml
NORMALIZED_TO_USE=""
if [[ -f "${NORM_MAIN_YAML}" ]]; then
  NORMALIZED_TO_USE="${NORM_MAIN_YAML}"
elif [[ -f "${NORM_META_YAML}" ]]; then
  NORMALIZED_TO_USE="${NORM_META_YAML}"
else
  echo "WARNING: No normalized YAML found; skipping compare script." >&2
  exit 0
fi

WARNINGS_TO_USE=""
if [[ -f "${WARNINGS_YAML}" ]]; then
  WARNINGS_TO_USE="${WARNINGS_YAML}"
else
  # not fatal; some runs may not emit warnings.yaml
  echo "NOTE: ${WARNINGS_YAML} not found; proceeding without it."
fi

echo ">>> RUN compare_and_mismatched_reason.py"
# Adjust path if your script lives elsewhere:
COMPARE_SCRIPT="compare_and_mismatched_reason.py"

if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
  echo "ERROR: python interpreter '${PYTHON_BIN}' not found" >&2
  exit 1
fi

if [[ ! -f "${COMPARE_SCRIPT}" ]]; then
  # fall back to the repo's scripts/ folder if that's where you keep it
  if [[ -f "scripts/${COMPARE_SCRIPT}" ]]; then
    COMPARE_SCRIPT="scripts/${COMPARE_SCRIPT}"
  else
    echo "ERROR: ${COMPARE_SCRIPT} not found in current dir or scripts/" >&2
    exit 1
  fi
fi

COMPARE_CMD=( "${PYTHON_BIN}" "${COMPARE_SCRIPT}"
  --raw "${RAW_FILE}"
  --recon "${RECON_CSV}"
  --normalized "${NORMALIZED_TO_USE}"
)

# include warnings only if present
if [[ -n "${WARNINGS_TO_USE}" ]]; then
  COMPARE_CMD+=( --warnings "${WARNINGS_TO_USE}" )
fi

echo "+ ${COMPARE_CMD[*]}"
"${COMPARE_CMD[@]}"

echo "✅ Done."

From f3bc7a9b978c40fb77c03a493f329bfd02a00ba9 Mon Sep 17 00:00:00 2001
From: aishwarya-sinhasane
Date: Fri, 10 Oct 2025 14:10:37 -0400
Subject: [PATCH 2/2] Fix path for compare_and_mismatched_reason.py

Update the path for the compare script in the test script.
---
 testing/test-digital-objects.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/testing/test-digital-objects.sh b/testing/test-digital-objects.sh
index e1f2ae50..8d72f1ee 100755
--- a/testing/test-digital-objects.sh
+++ b/testing/test-digital-objects.sh
@@ -138,7 +138,7 @@ fi
 echo ">>> RUN compare_and_mismatched_reason.py"
 # Adjust path if your script lives elsewhere:
-COMPARE_SCRIPT="compare_and_mismatched_reason.py"
+COMPARE_SCRIPT="testing/compare_and_mismatched_reason.py"
 
 if ! command -v "${PYTHON_BIN}" >/dev/null 2>&1; then
   echo "ERROR: python interpreter '${PYTHON_BIN}' not found" >&2
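A footnote on the comparison heuristic: the reason assignment in `compare_and_mismatched_reason.py` reduces to substring checks against the normalized and warnings text, checked in a fixed order. A simplified shell sketch of that decision order (the values below are hypothetical and the real script also canonicalizes whitespace first):

```shell
# Simplified re-creation of the explain_reason() decision order:
# warnings first, then whether RAW/RECON values survive into the normalized text.
explain() {
  raw="$1"; recon="$2"; normalized="$3"; warnings="$4"
  case "$warnings" in *"$raw"*) echo "Dropped during normalize (warning)"; return ;; esac
  case "$normalized" in
    *"$raw"*)
      if [ "$raw" != "$recon" ]; then
        echo "Reconstruction mapping/formatting difference"
      else
        echo "Indeterminate"
      fi ;;
    *)
      case "$normalized" in
        *"$recon"*) echo "Transformed during normalize" ;;
        *) echo "Filtered at normalize" ;;
      esac ;;
  esac
}

explain "tryptophanyl-tRNA synthetase 1" "WARS1" "bgene: [WARS1, CD19]" ""   # → Transformed during normalize
explain "HGNC:1678" "" "" "skipped: HGNC:1678"                               # → Dropped during normalize (warning)
```

The ordering matters: a value mentioned in warnings is reported as dropped even if it also appears transformed elsewhere, which matches the Python implementation.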