From b3d2eea52543d7066de9813c50cc9e36e2747d28 Mon Sep 17 00:00:00 2001
From: Javier
Date: Fri, 17 Apr 2026 16:59:01 +0200
Subject: [PATCH 1/2] Add data models and SQLite database
---
.gitignore | 2 +
Dockerfile | 5 +
app.py | 539 +++++------
entrypoint.sh | 11 +-
patch.py | 46 -
requirements.txt | 35 +-
src/__init__.py | 0
src/api.py | 564 ++++++++++++
src/casestudy_resolver.py | 298 ++++++
src/db.py | 75 ++
src/models/__init__.py | 0
src/models/casestudy.py | 209 +++++
src/models/cloud/method.py | 134 +++
src/models/cloud/tool.py | 98 ++
src/models/compound.py | 75 ++
src/models/data/__init__.py | 50 +
src/models/data/biostudies.py | 867 ++++++++++++++++++
src/models/data/mapping.py | 526 +++++++++++
src/models/data/schemas.py | 245 +++++
src/models/data/zenodo.py | 484 ++++++++++
src/models/platform.py | 56 ++
src/scheduler.py | 61 ++
src/seed.py | 279 ++++++
src/services/__init__.py | 0
src/services/compound.py | 204 +++++
src/sitemap.py | 59 ++
templates/base.html | 4 +
templates/case_studies/casestudies.html | 36 +-
templates/case_studies/casestudy_server.html | 229 +++++
...w.html => safety_assessment_workflow.html} | 0
30 files changed, 4812 insertions(+), 379 deletions(-)
delete mode 100644 patch.py
create mode 100644 src/__init__.py
create mode 100644 src/api.py
create mode 100644 src/casestudy_resolver.py
create mode 100644 src/db.py
create mode 100644 src/models/__init__.py
create mode 100644 src/models/casestudy.py
create mode 100644 src/models/cloud/method.py
create mode 100644 src/models/cloud/tool.py
create mode 100644 src/models/compound.py
create mode 100644 src/models/data/__init__.py
create mode 100644 src/models/data/biostudies.py
create mode 100644 src/models/data/mapping.py
create mode 100644 src/models/data/schemas.py
create mode 100644 src/models/data/zenodo.py
create mode 100644 src/models/platform.py
create mode 100644 src/scheduler.py
create mode 100644 src/seed.py
create mode 100644 src/services/__init__.py
create mode 100644 src/services/compound.py
create mode 100644 src/sitemap.py
create mode 100644 templates/case_studies/casestudy_server.html
rename templates/{Safety_Assessment_Workflow.html => safety_assessment_workflow.html} (100%)
diff --git a/.gitignore b/.gitignore
index a64738a..58802be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@ __pycache__/
*.py[cod]
*$py.class
+# SQLite database
+data/*.db
# C extensions
*.so
diff --git a/Dockerfile b/Dockerfile
index 854e92a..90b4c14 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,9 +21,14 @@ COPY . .
# Install any needed packages specified in requirements.txt
RUN pip install -r requirements.txt
+# Create data directory for SQLite DB
+RUN mkdir -p /usr/src/app/data
+
# Copy entrypoint script
COPY entrypoint.sh /usr/src/app/entrypoint.sh
RUN chmod +x /usr/src/app/entrypoint.sh
+EXPOSE 5050
+
# Define the entrypoint script
ENTRYPOINT ["/usr/src/app/entrypoint.sh"]
diff --git a/app.py b/app.py
index e6aa3de..a3e0cec 100644
--- a/app.py
+++ b/app.py
@@ -1,6 +1,7 @@
################################################################################
### Loading the required modules
import json
+import os
import re
import requests
@@ -13,10 +14,15 @@
# from wikidataintegrator import wdi_core
from wikibaseintegrator import wbi_helpers
-# Import BioStudies extractor
-from data.biostudies.search import BioStudiesExtractor
-from data.zenodo.search import ZenodoExtractor
-from data.mapping import normalize_all
+# Data extractors (API wrappers — no DB needed)
+from src.models.data.biostudies import BioStudiesExtractor
+from src.models.data.zenodo import ZenodoExtractor
+from src.models.data.mapping import normalize_all
+
+# Database layer
+from src.db import get_conn, init_db
+from src.api import init_api
+from src.casestudy_resolver import resolve as resolve_casestudy
################################################################################
CACHE_TIMEOUT = 60 * 60 * 24 * 5 # 5 days -- [Ozan] I created a separate
@@ -62,7 +68,7 @@
},
"reg_q_2b": {
"label": "Parkinson Case Study (b)",
- "explanation": "What level of exposure to compound Dinoseb leads to risk for developing Parkinson’s disease?",
+ "explanation": "What level of exposure to compound Dinoseb leads to risk for developing Parkinson's disease?",
},
"reg_q_3a": {
"label": "Thyroid Case Study (a)",
@@ -103,9 +109,16 @@ def __init__(self, url_map, *items):
"CACHE_SERVICE_TIMEOUT": CACHE_TIMEOUT_SERVICE
}
app = Flask(__name__)
+app.secret_key = os.environ.get(
+ "FLASK_SECRET_KEY", "dev-insecure-key"
+)
app.config.from_mapping(cache_config)
cache = Cache(app)
+# Database init and API registration
+init_db()
+init_api(app)
+
@cache.memoize(timeout=CACHE_TIMEOUT)
def get_json_dict(url: str, timeout: int = 5) -> dict:
@@ -204,42 +217,25 @@ def get_repository_data(
# Provide methods list to all templates for the Methods dropdown in the navbar
@app.context_processor
def inject_methods_menu():
- """Fetch methods_index.json and expose a simple list of {id, title} to templates.
- Return an empty list on any error to avoid breaking pages.
- """
- data = get_json_dict(METHODS_URL)
- if data:
- items = []
- for key, val in data.items() if isinstance(data, dict) else []:
- title = (
- val.get("method")
- or val.get("method_name_content")
- or val.get("method_name")
- or key
- )
- items.append({"id": key, "title": title})
- # sort by title
- items = sorted(items, key=lambda x: x["title"].lower())
- return {"methods_menu": items}
- else:
+ """Expose methods list to all templates for navbar dropdown."""
+ try:
+ conn = get_conn()
+ rows = conn.execute("SELECT id, method FROM methods ORDER BY method").fetchall()
+ conn.close()
+ return {"methods_menu": [{"id": r["id"], "title": r["method"]} for r in rows]}
+ except Exception:
return {"methods_menu": []}
@app.context_processor
def inject_tools_menu():
- """Fetch methods_index.json and expose a simple list of {id, title} to templates.
- Return an empty list on any error to avoid breaking pages.
- """
- data = get_json_dict_service(SERVICES_URL)
- if data:
- items = []
- for key, val in data.items() if isinstance(data, dict) else []:
- title = val.get("service") or key
- items.append({"id": key, "title": title})
- # sort by title
- items = sorted(items, key=lambda x: x["title"].lower())
- return {"tools_menu": items}
- else:
+ """Expose tools list to all templates for navbar dropdown."""
+ try:
+ conn = get_conn()
+ rows = conn.execute("SELECT id, service FROM tools ORDER BY service").fetchall()
+ conn.close()
+ return {"tools_menu": [{"id": r["id"], "title": r["service"]} for r in rows]}
+ except Exception:
return {"tools_menu": []}
@@ -269,17 +265,12 @@ def inject_data_menu():
### The landing page
@app.route("/")
def home():
- try:
- tools = get_json_dict_service(
- SERVICES_URL
- ) # Geting the service_list.json in the dictionary format.
- tools = list(tools.values()) # Converting the dictionary to a list object.
- except Exception as e:
- return f"Error processing service data: {e}", 500
- num_tools = len(tools)
- num_case_studies = len(CASESTUDIES)
+ conn = get_conn()
+ num_tools = conn.execute("SELECT COUNT(*) FROM tools").fetchone()[0]
+ num_case_studies = conn.execute("SELECT COUNT(*) FROM case_studies").fetchone()[0]
+ conn.close()
bs_res, zen_res = get_repository_data(search_query="")
- num_datasets = bs_res["total"] + zen_res["total"]
+ num_datasets = bs_res.get("total", 0) + zen_res.get("total", 0)
return render_template(
"home.html",
num_tools=num_tools,
@@ -292,26 +283,34 @@ def home():
### The sitemap.xml for search engines
@app.route("/sitemap.xml")
def sitemap():
-    sitemapContent = """
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-    <url>
-        <loc>https://platform.vhp4safety.nl/</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/casestudies</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/tools</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/methods</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/data</loc>
-    </url>
-""";
-    return Response(sitemapContent, mimetype='text/xml');
+    # Prefer generated static sitemap if present (created by src.sitemap)
+    path = os.path.join(os.path.dirname(__file__), "static", "sitemap.xml")
+    if os.path.exists(path):
+        with open(path, "rb") as fh:
+            return Response(fh.read(), mimetype="application/xml")
+
+    # Fallback minimal sitemap
+    sitemapContent = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    <url>
+        <loc>https://platform.vhp4safety.nl/</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/casestudies</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/tools</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/methods</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/data</loc>
+    </url>
+"""
+    return Response(sitemapContent, mimetype="text/xml")
################################################################################
@@ -529,112 +528,94 @@ def models():
### Pages under 'Tools'
-### Here begins the updated version for creating the tool list page.
@app.route("/tools")
def tools():
try:
- tools = get_json_dict_service(
- SERVICES_URL
- ) # Geting the service_list.json in the dictionary format.
- tools = list(tools.values()) # Converting the dictionary to a list object.
-
- # Mapping the URLs with glossary IDs to their text values.
- stage_mapping = {
- "https://vhp4safety.github.io/glossary#VHP0000056": "ADME",
- "https://vhp4safety.github.io/glossary#VHP0000102": "Hazard Assessment",
- "https://vhp4safety.github.io/glossary#VHP0000148": "Chemical Information",
- "https://vhp4safety.github.io/glossary#VHP0000149": "General",
- }
-
- for tool in tools:
- full_stage_url = tool.get("stage", "")
-
- # Writing the service name and stage values in the logs for troubleshooting.
- # print(f"Tool: {tool['service']}, Stage URL: {full_stage_url}") # Log the full URL
-
- # Checking if the full URL is in the mapping and updating the stage.
- if full_stage_url in stage_mapping:
- # print(f"Mapping stage URL {full_stage_url} to {stage_mapping[full_stage_url]}") # Log the mapping
- tool["stage"] = stage_mapping[full_stage_url]
- elif tool["stage"] in ["NA", "Unknown"]:
- tool["stage"] = (
- "Other" # Combining "NA" and "Unknown" stages in a single stage-type, "Other".
- )
-
- html_name = tool.get("html_name")
- md_name = tool.get("md_file_name")
- png_name = tool.get("png_file_name")
-
- tool["url"] = f"https://cloud.vhp4safety.nl/service/{html_name}"
- tool["meta_data"] = (
- f"https://raw.githubusercontent.com/VHP4Safety/cloud/main/docs/service/{md_name}"
- if md_name
- else "md file not found"
- )
-
- # Check if the tool has the placeholder logo
- placeholder_logo = "https://github.com/VHP4Safety/ui-design/blob/main/static/images/logo.png"
- if png_name == placeholder_logo:
- tool["png"] = None # set to None if it's the common placeholder
- else:
- tool["png"] = (
- f"https://raw.githubusercontent.com/VHP4Safety/cloud/main/docs/service/{png_name}"
- if not png_name.startswith("http")
- else png_name
- )
-
- inst_url = tool.get("inst_url", "no_url")
- if not inst_url: # catches "" as well
- inst_url = "no_url"
- tool["inst_url"] = inst_url
+ conn = get_conn()
- # Getting selected stages from the URL.
selected_stages = request.args.getlist("stage")
+ search_query = request.args.get("search", "").strip().lower()
- # Filtering tools by selected stages.
+ sql = "SELECT * FROM tools WHERE 1=1"
+ params = []
if selected_stages:
- tools = [tool for tool in tools if tool.get("stage") in selected_stages]
-
- # Getting all unique stages from the tools for the filter options.
- stages = sorted(set(tool.get("stage") for tool in tools if tool.get("stage")))
-
- # Forcing "Other" to be the last item in the list of stages.
- if "Other" in stages:
- stages.remove("Other")
- stages.append("Other")
+ placeholders = ",".join("?" * len(selected_stages))
+ sql += f" AND stage IN ({placeholders})"
+ params.extend(selected_stages)
+ if search_query:
+ sql += " AND LOWER(service) LIKE ?"
+ params.append(f"%{search_query}%")
+ sql += " ORDER BY service"
+ rows = conn.execute(sql, params).fetchall()
- # Filtering over the regulatory questions.
- reg_questions = {v["label"]: k for k, v in REG_QUESTIONS.items()}
+ # Build reg_questions lookup from DB
+ rq_rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
+ reg_questions = {r["label"]: r["key"] for r in rq_rows}
+ # Apply regulatory question filters
selected_questions = request.args.getlist("reg_q")
+ tools_list = []
+ for row in [dict(r) for r in rows]:
+ raw = json.loads(row["raw_json"]) if row.get("raw_json") else {}
+ # Check reg question filters
+ skip = False
+ for question in selected_questions:
+ field = reg_questions.get(question)
+ if field and str(raw.get(field, "")).lower() != "true":
+ skip = True
+ break
+ if skip:
+ continue
+
+ html_name = row["html_name"]
+ png_name = row["png_file_name"]
+ placeholder = (
+ "https://github.com/VHP4Safety/ui-design"
+ "/blob/main/static/images/logo.png"
+ )
- for question in selected_questions:
- field = reg_questions.get(question)
- if field:
- tools = [
- tool for tool in tools if str(tool.get(field, "")).lower() == "true"
- ]
-
- # Getting the search query from URL to add a search bar based on tool names.
- search_query = request.args.get("search", "").strip().lower()
-
- # Filtering tools by search query.
- if search_query:
- tools = [
- tool
- for tool in tools
- if search_query in tool.get("service", "").lower()
- ]
+ tools_list.append({
+ "id": row["id"],
+ "service": row["service"],
+ "description": row["description"],
+ "stage": row["stage"],
+ "html_name": html_name,
+ "url": f"https://cloud.vhp4safety.nl/service/{html_name}",
+ "inst_url": row["inst_url"] or "no_url",
+ "png": (
+ None if png_name == placeholder else
+ f"https://raw.githubusercontent.com/VHP4Safety/cloud/main/docs/service/{png_name}"
+ if png_name and not png_name.startswith("http")
+ else png_name
+ ),
+ **raw,
+ })
+
+ # Collect stages for filter sidebar
+ all_stages = sorted(set(
+ t["stage"] for t in tools_list if t.get("stage")
+ ))
+ if "Other" in all_stages:
+ all_stages.remove("Other")
+ all_stages.append("Other")
+
+ # Stage / reg question explanations from DB
+ se_rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
+ stage_explanations = {s["name"]: s["explanation"] for s in se_rows}
+ reg_question_explanations = {
+ r["label"]: r["explanation"] for r in rq_rows
+ }
+ conn.close()
return render_template(
"tools/tools.html",
- tools=tools,
- stages=stages,
+ tools=tools_list,
+ stages=all_stages,
selected_stages=selected_stages,
reg_questions=reg_questions,
selected_questions=selected_questions,
- stage_explanations=STAGE_EXPLANATIONS,
- reg_question_explanations=REG_QUESTION_EXPLANATIONS,
+ stage_explanations=stage_explanations,
+ reg_question_explanations=reg_question_explanations,
)
except Exception as e:
@@ -645,100 +626,68 @@ def tools():
@app.route("/methods")
@app.route("/methods/")
def methods():
- """Fetch methods_index.json from the cloud repo, normalize fields and render a methods list page."""
- url = "https://raw.githubusercontent.com/VHP4Safety/cloud/refs/heads/main/cap/methods_index.json"
- response = requests.get(url)
-
- if response.status_code != 200:
- return f"Error fetching methods list: {response.status_code}", 503
-
+ """Render methods list page from DB."""
try:
- methods = response.json()
- methods = list(methods.values()) # convert dict to list
+ conn = get_conn()
- # Normalize fields for the template and collect stages
- stages_set = set()
- normalized = []
- for m in methods:
- norm = {}
- norm["id"] = m.get("id", "")
- # template expects 'service' and 'description'
- norm["service"] = (
- m.get("method")
- or m.get("method_name_content")
- or m.get("method_name")
- or ""
- )
- norm["description"] = (
- m.get("method_description_content") or m.get("method_description") or ""
- )
- # main_url used for method webpage (catalog page)
- norm["main_url"] = m.get("catalog_webpage_url") or "no_url"
- # interactive instance not present in methods index
- norm["inst_url"] = m.get("inst_url") or "no_url"
- # metadata md file not available in index; keep empty string
- norm["meta_data"] = m.get("meta_data") or ""
- # placeholder/no png
- norm["png"] = None
- # keep original raw data for potential details page
- norm["raw"] = m
-
- # collect stages (split comma-separated values)
- stage_field = (m.get("vhp4safety_workflow_stage_content") or "").strip()
- if stage_field:
- for part in [s.strip() for s in stage_field.split(",")]:
- if part:
- stages_set.add(part)
-
- normalized.append(norm)
-
- # Apply search and filters similar to /tools
selected_stages = request.args.getlist("stage")
- selected_questions = request.args.getlist("reg_q")
search_query = request.args.get("search", "").strip().lower()
- methods_filtered = normalized
+ sql = "SELECT * FROM methods WHERE 1=1"
+ params = []
+ if search_query:
+ sql += " AND LOWER(method) LIKE ?"
+ params.append(f"%{search_query}%")
+ sql += " ORDER BY method"
+ rows = [dict(r) for r in conn.execute(sql, params).fetchall()]
- if selected_stages:
- methods_filtered = [
- m
- for m in methods_filtered
- if any(
- s
- in (
- (m["raw"].get("vhp4safety_workflow_stage_content") or "").split(
- ","
- )
- )
- for s in selected_stages
- )
- ]
-
- # Filter by regulatory questions if provided (REG_QUESTIONS keys map to internal fields)
- reg_questions = {v["label"]: k for k, v in REG_QUESTIONS.items()}
- if selected_questions:
+ rq_rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
+ reg_questions = {r["label"]: r["key"] for r in rq_rows}
+ selected_questions = request.args.getlist("reg_q")
+
+ stages_set = set()
+ methods_filtered = []
+ for row in rows:
+ raw = json.loads(row["raw_json"]) if row.get("raw_json") else {}
+ stage_field = (row.get("stage") or "").strip()
+ parts = [s.strip() for s in stage_field.split(",") if s.strip()]
+ stages_set.update(parts)
+
+ if selected_stages and not any(s in parts for s in selected_stages):
+ continue
+
+ skip = False
for question in selected_questions:
field = reg_questions.get(question)
- if field:
- methods_filtered = [
- m
- for m in methods_filtered
- if str(m["raw"].get(field, "")).lower() == "true"
- ]
-
- if search_query:
- methods_filtered = [
- m
- for m in methods_filtered
- if search_query in m.get("service", "").lower()
- ]
+ if field and str(raw.get(field, "")).lower() != "true":
+ skip = True
+ break
+ if skip:
+ continue
+
+ methods_filtered.append({
+ "id": row["id"],
+ "service": row["method"],
+ "description": row.get("description") or "",
+ "main_url": row.get("catalog_webpage_url") or "no_url",
+ "inst_url": "no_url",
+ "meta_data": "",
+ "png": None,
+ "raw": raw,
+ })
stages = sorted(stages_set)
if "Other" in stages:
stages.remove("Other")
stages.append("Other")
- # Pass everything the template expects
+ se_rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
+ stage_explanations = {s["name"]: s["explanation"] for s in se_rows}
+ reg_question_explanations = {
+ r["label"]: r["explanation"] for r in rq_rows
+ }
+ conn.close()
+
return render_template(
"methods/methods.html",
methods=methods_filtered,
@@ -746,8 +695,8 @@ def methods():
selected_stages=selected_stages,
reg_questions=reg_questions,
selected_questions=selected_questions,
- stage_explanations=STAGE_EXPLANATIONS,
- reg_question_explanations=REG_QUESTION_EXPLANATIONS,
+ stage_explanations=stage_explanations,
+ reg_question_explanations=reg_question_explanations,
)
except Exception as e:
@@ -756,38 +705,29 @@ def methods():
@app.route("/methods/")
def method_page(methodid):
- """Render a single method page using templates/methods/method.html
- Method details are taken from methods_index.json (keyed by method id).
- """
- try:
- methods = get_json_dict(METHODS_URL)
- # methods_index.json is a dict keyed by method id
- if methodid not in methods:
- abort(404)
- method_details = methods[methodid]
- except Exception as e:
- return f"Error processing methods data: {e}", 500
+ """Render a single method detail page."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM methods WHERE id = ?", (methodid,)).fetchone()
+ conn.close()
+ if not row:
+ abort(404)
+
+ method_details = json.loads(row["raw_json"]) if row["raw_json"] else {}
- # Try to load the full method JSON from the docs/methods folder (raw github)
- method_json = None
- # URL-encode the filename part to be safe
+ # Try to load full JSON from GitHub docs/methods/
+ method_json = method_details
encoded = urllib.parse.quote(methodid, safe="")
raw_url = (
- "https://raw.githubusercontent.com/VHP4Safety/cloud/refs/heads/main/docs/methods/"
- + f"{encoded}.json"
+ "https://raw.githubusercontent.com/VHP4Safety/cloud"
+ f"/refs/heads/main/docs/methods/{encoded}.json"
)
try:
r = requests.get(raw_url, timeout=5)
if r.status_code == 200:
method_json = r.json()
- else:
- # fall back to using the index entry as minimal data
- method_json = method_details
- except Exception as exc:
- # on any error, fall back to index entry
- method_json = method_details
+ except Exception:
+ pass
- # Pass both to the template: some templates expect method_json, others method_details
return render_template(
"methods/method.html",
method=method_details,
@@ -798,37 +738,27 @@ def method_page(methodid):
@app.route("/tools/")
def tool_page(toolname):
- # get the tools metadata:
- try:
- tools = get_json_dict_service(SERVICES_URL)
- tools = dict(tools)
- # Geting the service_list.json in the dictionary format.
- # Converting the dictionary to a list object.
- except Exception as e:
- return f"Error processing service data: {e}", 500
-
- # Map toolname to the correct JSON file in the new tool folder
- if toolname not in tools:
+ """Render a single tool detail page."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM tools WHERE id = ?", (toolname,)).fetchone()
+ conn.close()
+ if not row:
abort(404)
- # get the tools metadata:
- url = "https://cloud.vhp4safety.nl/service/" + toolname + ".json"
- response = requests.get(url)
-
- if response.status_code != 200:
- return f"Error fetching service list: {response.status_code}", 503
+ tool_json = json.loads(row["raw_json"]) if row["raw_json"] else {}
+ # Fetch full details from cloud service JSON
+ url = f"https://cloud.vhp4safety.nl/service/{toolname}.json"
try:
- tool_details = response.json()
- tool_details = dict(tool_details)
- # Geting the service_list.json in the dictionary format.
- # Converting the dictionary to a list object.
- except Exception as e:
- return f"Error processing service data: {e}", 500
+ resp = requests.get(url, timeout=10)
+ tool_details = resp.json() if resp.status_code == 200 else tool_json
+ except Exception:
+ tool_details = tool_json
- # Pass the json filename to the template (for JS to pick up)
return render_template(
- "tools/tool.html", tool_json=tools[toolname], tool_details=tool_details
+ "tools/tool.html",
+ tool_json=tool_json,
+ tool_details=tool_details,
)
@@ -837,31 +767,45 @@ def tool_page(toolname):
# General Safety Assessment Workflow page
-@app.route("/Safety_Assessment_Workflow")
+@app.route("/safety_assessment_workflow")
def SafetyAssessmentWorkflow():
- return render_template("Safety_Assessment_Workflow.html")
+ return render_template("safety_assessment_workflow.html")
################################################################################
### Pages under 'Case Studies'
-# General case studies page
@app.route("/casestudies")
def workflows():
- return render_template("case_studies/casestudies.html")
+ conn = get_conn()
+ cards = conn.execute("SELECT * FROM case_studies").fetchall()
+ conn.close()
+ return render_template(
+ "case_studies/casestudies.html", cards=[dict(c) for c in cards]
+ )
-# Individual case study page, dynamically filled based on URL
-@app.route("/casestudies/", defaults={"step": ""})
-@app.route("/casestudies//")
-@app.route("/casestudies///")
-# additional routes are parsed client side via js to allow smooth animation
-def casestudy(case:str="", question:str="", step:str=""):
- if case not in CASESTUDIES:
+@app.route("/casestudies/")
+@app.route("/casestudies//")
+def casestudy(case: str, subpath: str = ""):
+ conn = get_conn()
+ cs = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (case,)).fetchone()
+ conn.close()
+ if not cs:
abort(404)
- # JS will handle steps via the URL
- return render_template("case_studies/casestudy.html", case=case)
+
+ parts = [
+ p for p in subpath.split("/") if p
+ ] if subpath else []
+
+ step = resolve_casestudy(case, parts)
+ if step is None:
+ abort(404)
+
+ return render_template(
+ "case_studies/casestudy_server.html", step=step
+ )
@app.route("/workflow/")
@@ -1121,5 +1065,8 @@ def privacy_policy():
return render_template("legal/privacypolicy.html")
+from src.scheduler import init_scheduler
+init_scheduler(app)
+
if __name__ == "__main__":
app.run(host="0.0.0.0", port=5050, debug=True)
diff --git a/entrypoint.sh b/entrypoint.sh
index cd96440..56a3e46 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -1,4 +1,11 @@
#!/bin/sh
+set -e
-# Start Flask app
-python app.py
+echo "==> Seeding database..."
+python -m src.seed
+
+echo "==> Generating sitemap..."
+python -m src.sitemap || echo "sitemap generation failed; continuing"
+
+echo "==> Starting Flask app..."
+exec python app.py
diff --git a/patch.py b/patch.py
deleted file mode 100644
index 5cd790a..0000000
--- a/patch.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from importlib import import_module
-from pathlib import Path
-
-def apply_patch():
- try:
- # Dynamically import the module and get its file path
- try:
- module = import_module('pyshexc.parser.ShExDocLexer')
- except ModuleNotFoundError as e:
- # Give a precise, actionable hint for installation in the active interpreter
- print(
- "Missing dependency: 'pyshexc' (PyShExC).\n"
- "Install it in the same environment you're using to run this script.\n"
- "Examples:\n"
- " python -m pip install PyShExC\n"
- " # or with uv: uv pip install PyShExC\n"
- " # or poetry: poetry add PyShExC\n"
- " # or conda: conda install -c conda-forge pyshexc\n"
- )
- return
-
- file_path = Path(module.__file__)
-
- if not file_path.exists():
- raise FileNotFoundError(f"Could not find the file: {file_path}")
-
- # Read the file content
- file_content = file_path.read_text()
-
- # Replace 'from typing.io import TextIO' with 'from typing import TextIO'
- new_content = file_content.replace("from typing.io import TextIO", "from typing import TextIO")
-
- # Only write if a change is needed
- if new_content != file_content:
- file_path.write_text(new_content)
- print("Patch applied successfully!")
- else:
- print("No patch needed; target text not found (already patched or different version).")
-
- except FileNotFoundError as e:
- print(e)
- except Exception as e:
- print(f"An error occurred: {e}")
-
-if __name__ == "__main__":
- apply_patch()
diff --git a/requirements.txt b/requirements.txt
index 3e7ed1c..95607e1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,30 @@
-flask>=3.1.3
+annotated-types==0.7.0
+backoff==2.2.1
+blinker==1.9.0
+cachelib==0.13.0
+certifi==2026.2.25
+charset-normalizer==3.4.7
+click==8.3.2
+flask==3.1.3
flask-caching==2.3.1
+idna==3.11
+itsdangerous==2.2.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mwoauth==0.4.0
+oauthlib==3.3.1
+pydantic==2.13.2
+pydantic-core==2.46.2
+pyjwt==2.12.1
requests==2.32.4
-#wikidataintegrator==0.9.30
-setuptools==78.1.1 # Provides pkg_resources module, required for wikidataintegrator
-werkzeug>=3.0.6
-#pyBiodatafuse @ git+https://github.com/BioDataFuse/pyBiodatafuse.git
-wikibaseintegrator>=0.12.14
-
+requests-oauthlib==2.0.0
+setuptools==78.1.1
+typing-extensions==4.15.0
+typing-inspection==0.4.2
+ujson==5.12.0
+urllib3==2.6.3
+werkzeug==3.1.8
+wikibaseintegrator==0.12.15
+flask-smorest>=0.44
+marshmallow>=3.20
+APScheduler>=3.10,<4
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/api.py b/src/api.py
new file mode 100644
index 0000000..646a88c
--- /dev/null
+++ b/src/api.py
@@ -0,0 +1,564 @@
+"""RESTful API with auto-generated OpenAPI documentation.
+
+Uses flask-smorest (marshmallow + OpenAPI 3) so Swagger UI is
+served automatically at /api/v1/docs.
+"""
+
+from __future__ import annotations
+
+import json
+
+from flask import Flask
+from flask_smorest import Api, Blueprint, abort
+from marshmallow import Schema, fields
+
+from src.db import get_conn
+from src.models.data.biostudies import BioStudiesExtractor
+from src.models.data.zenodo import ZenodoExtractor
+from src.models.data.mapping import normalize_all
+from src.services.compound import (
+ get_experimental_data,
+ get_full_compound,
+ get_identifiers,
+ get_properties,
+ get_toxicology,
+ is_valid_qid,
+)
+
+BIOSTUDIES_COLLECTION = "VHP4Safety"
+ZENODO_COMMUNITY = "vhp4safety"
+ZENODO_RECORD_TYPE = "dataset"
+
+
+# -- Marshmallow Schemas ---------------------------------------------------
+
+class ToolSchema(Schema):
+ id = fields.Str()
+ service = fields.Str()
+ description = fields.Str()
+ stage = fields.Str()
+ main_url = fields.Str()
+ inst_url = fields.Str()
+ html_name = fields.Str()
+ png_file_name = fields.Str()
+
+
+class MethodSchema(Schema):
+ id = fields.Str()
+ method = fields.Str()
+ description = fields.Str()
+ stage = fields.Str()
+ substage = fields.Str()
+ catalog_webpage_url = fields.Str()
+ raw = fields.Dict(load_default=None)
+
+
+class RegulatoryQuestionSchema(Schema):
+ key = fields.Str()
+ label = fields.Str()
+ explanation = fields.Str()
+
+
+class StageExplanationSchema(Schema):
+ name = fields.Str()
+ explanation = fields.Str()
+
+
+class CaseStudySchema(Schema):
+ slug = fields.Str()
+ title = fields.Str()
+ description = fields.Str()
+ image_src = fields.Str()
+ config_repo = fields.Str()
+ default_branch = fields.Str()
+
+
+class CaseStudyDetailSchema(CaseStudySchema):
+ content_json = fields.Raw(load_default=None)
+
+
+class CompoundSummarySchema(Schema):
+ wcid = fields.Str()
+ label = fields.Str()
+ inchi = fields.Str()
+ inchikey = fields.Str()
+ smiles = fields.Str(data_key="SMILES")
+ formula = fields.Str()
+ mass = fields.Str()
+
+
+class CompoundIdentifierSchema(Schema):
+ property_label = fields.Str(data_key="propertyLabel")
+ value = fields.Str()
+ formatter_url = fields.Str(data_key="formatterURL")
+
+
+class CompoundToxicologySchema(Schema):
+ property_label = fields.Str(data_key="propertyLabel")
+ value = fields.Str()
+
+
+class CompoundExpDataSchema(Schema):
+ property_label = fields.Str(data_key="propEntityLabel")
+ value = fields.Str()
+ units_label = fields.Str(data_key="unitsLabel")
+ source = fields.Str()
+ doi = fields.Str()
+ see_also = fields.Str(data_key="seeAlso")
+
+
+class CompoundDetailSchema(Schema):
+ summary = fields.Nested(CompoundSummarySchema)
+ identifiers = fields.List(fields.Nested(CompoundIdentifierSchema))
+ toxicology = fields.List(fields.Nested(CompoundToxicologySchema))
+ experimental_data = fields.List(fields.Nested(CompoundExpDataSchema))
+
+
+class DataSearchQuerySchema(Schema):
+ query = fields.Str(load_default="")
+ page = fields.Int(load_default=1)
+ size = fields.Int(load_default=18)
+
+
+class DataSourceResultSchema(Schema):
+ total = fields.Int()
+ hits = fields.List(fields.Dict())
+ error = fields.Str(allow_none=True)
+
+
+class DataResultSchema(Schema):
+ biostudies = fields.Nested(DataSourceResultSchema)
+ zenodo = fields.Nested(DataSourceResultSchema)
+
+
+class SearchQuerySchema(Schema):
+ stage = fields.Str(load_default=None)
+ search = fields.Str(load_default="")
+
+
+# -- Blueprints ------------------------------------------------------------
+
+tools_bp = Blueprint("tools", __name__, url_prefix="/api/tools",
+ description="Tool / service endpoints")
+methods_bp = Blueprint("methods", __name__, url_prefix="/api/methods",
+ description="Method endpoints")
+reg_q_bp = Blueprint("regulatory_questions", __name__,
+ url_prefix="/api/regulatory-questions",
+ description="Regulatory questions")
+stages_bp = Blueprint("stages", __name__, url_prefix="/api/stages",
+ description="Safety-assessment workflow stages")
+casestudies_bp = Blueprint("casestudies", __name__,
+ url_prefix="/api/casestudies",
+ description="Case study endpoints")
+compounds_bp = Blueprint("compounds", __name__, url_prefix="/api/compounds",
+ description="Compound data (SPARQL-backed)")
+data_bp = Blueprint("data", __name__, url_prefix="/api/data",
+ description="Dataset search (BioStudies + Zenodo)")
+
+
+# -- Tools -----------------------------------------------------------------
+
+@tools_bp.route("/")
+@tools_bp.arguments(SearchQuerySchema, location="query")
+@tools_bp.response(200, ToolSchema(many=True))
+def list_tools(args):
+ """List all tools, with optional stage/search filters."""
+ conn = get_conn()
+ sql = "SELECT * FROM tools WHERE 1=1"
+ params = []
+ if args.get("stage"):
+ sql += " AND stage = ?"
+ params.append(args["stage"])
+ if args.get("search"):
+ sql += " AND service LIKE ?"
+ params.append(f"%{args['search']}%")
+ sql += " ORDER BY service"
+ rows = conn.execute(sql, params).fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+@tools_bp.route("/")
+@tools_bp.response(200, ToolSchema)
+def get_tool(tool_id):
+ """Get a single tool by ID."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM tools WHERE id = ?", (tool_id,)).fetchone()
+ conn.close()
+ if not row:
+ abort(404, message="Tool not found")
+ return dict(row)
+
+
+# -- Methods ---------------------------------------------------------------
+
+@methods_bp.route("/")
+@methods_bp.arguments(SearchQuerySchema, location="query")
+@methods_bp.response(200, MethodSchema(many=True))
+def list_methods(args):
+ """List all methods, with optional stage/search filters."""
+ conn = get_conn()
+ sql = "SELECT * FROM methods WHERE 1=1"
+ params = []
+ if args.get("stage"):
+ sql += " AND stage LIKE ?"
+ params.append(f"%{args['stage']}%")
+ if args.get("search"):
+ sql += " AND method LIKE ?"
+ params.append(f"%{args['search']}%")
+ sql += " ORDER BY method"
+ rows = conn.execute(sql, params).fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+@methods_bp.route("/")
+@methods_bp.response(200, MethodSchema)
+def get_method(method_id):
+ """Get a single method by ID."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM methods WHERE id = ?", (method_id,)).fetchone()
+ conn.close()
+ if not row:
+ abort(404, message="Method not found")
+ d = dict(row)
+ if d.get("raw_json"):
+ d["raw"] = json.loads(d["raw_json"])
+ return d
+
+
+# -- Regulatory Questions --------------------------------------------------
+
+@reg_q_bp.route("/")
+@reg_q_bp.response(200, RegulatoryQuestionSchema(many=True))
+def list_regulatory_questions():
+ """List all regulatory questions."""
+ conn = get_conn()
+ rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+# -- Stages ----------------------------------------------------------------
+
+@stages_bp.route("/")
+@stages_bp.response(200, StageExplanationSchema(many=True))
+def list_stages():
+ """List all safety-assessment workflow stages."""
+ conn = get_conn()
+ rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+# -- Case Studies ----------------------------------------------------------
+
+@casestudies_bp.route("/")
+@casestudies_bp.response(200, CaseStudySchema(many=True))
+def list_case_studies():
+ """List all case studies."""
+ conn = get_conn()
+ rows = conn.execute("SELECT * FROM case_studies").fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+@casestudies_bp.route("/")
+@casestudies_bp.response(200, CaseStudyDetailSchema)
+def get_case_study(slug):
+ """Get a case study with its full content JSON."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (slug,)).fetchone()
+ conn.close()
+ if not row:
+ abort(404, message="Case study not found")
+ d = dict(row)
+ if d.get("content_json"):
+ d["content_json"] = json.loads(d["content_json"])
+ return d
+
+
+# -- Compounds (SPARQL-backed) ---------------------------------------------
+
+@compounds_bp.route("/")
+@compounds_bp.response(200, CompoundDetailSchema)
+def get_compound(cwid):
+ """Get full compound data."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ return get_full_compound(cwid).model_dump()
+ except Exception as e:
+ abort(502, message=str(e))
+
+
+@compounds_bp.route("//properties")
+@compounds_bp.response(200, CompoundSummarySchema)
+def get_compound_properties(cwid):
+ """Get core compound identifiers."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ summary = get_properties(cwid)
+ if not summary:
+ abort(404, message="No data found")
+ return summary.model_dump()
+ except Exception as e:
+ abort(502, message=str(e))
+
+
+@compounds_bp.route("//identifiers")
+@compounds_bp.response(200, CompoundIdentifierSchema(many=True))
+def get_compound_identifiers(cwid):
+ """Get external identifiers."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ return [i.model_dump() for i in get_identifiers(cwid)]
+ except Exception as e:
+ abort(502, message=str(e))
+
+
+@compounds_bp.route("//toxicology")
+@compounds_bp.response(200, CompoundToxicologySchema(many=True))
+def get_compound_toxicology(cwid):
+ """Get toxicology data."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ return [t.model_dump() for t in get_toxicology(cwid)]
+ except Exception as e:
+ abort(502, message=str(e))
+
+
+@compounds_bp.route("//experimental-data")
+@compounds_bp.response(200, CompoundExpDataSchema(many=True))
+def get_compound_exp_data(cwid):
+ """Get experimental measurements."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ return [d.model_dump() for d in get_experimental_data(cwid)]
+ except Exception as e:
+ abort(502, message=str(e))
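+
+# Illustrative request against the compound endpoints above. Here <qid> is a
+# placeholder for any identifier accepted by src.services.compound.is_valid_qid;
+# host/port follow the Dockerfile's EXPOSE 5050:
+#
+#     curl -s http://localhost:5050/api/compounds/<qid>/toxicology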
+
+
+# -- Data (BioStudies + Zenodo passthrough) --------------------------------
+
+@data_bp.route("/")
+@data_bp.arguments(DataSearchQuerySchema, location="query")
+@data_bp.response(200, DataResultSchema)
+def list_data(args):
+ """Search datasets across BioStudies and Zenodo."""
+ query = args.get("query", "")
+ page = args.get("page", 1)
+ size = args.get("size", 18)
+
+ bs = BioStudiesExtractor(collection=BIOSTUDIES_COLLECTION)
+ zen = ZenodoExtractor(community=ZENODO_COMMUNITY, record_type=ZENODO_RECORD_TYPE)
+
+ if query:
+ bs_res = bs.search_studies(query, page=page, page_size=size)
+ zen_res = zen.search_records(query, page=page, size=size)
+ else:
+ bs_res = bs.list_studies(page=page, page_size=size, include_urls=True)
+ zen_res = zen.list_records(page=page, size=size, include_urls=True)
+
+ studies = bs_res.get("hits", [])
+ datasets = zen_res.get("hits", [])
+ studies, datasets = normalize_all(studies, datasets)
+
+ return {
+ "biostudies": {
+ "total": bs_res.get("total", 0),
+ "hits": [h.get("norm_metadata", h) for h in studies],
+ "error": bs_res.get("error"),
+ },
+ "zenodo": {
+ "total": zen_res.get("total", 0),
+ "hits": [h.get("norm_metadata", h) for h in datasets],
+ "error": zen_res.get("error"),
+ },
+ }
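+
+# Illustrative search call (a sketch; the query term "thyroid" is just one of
+# the case-study topics seeded elsewhere in this patch):
+#
+#     curl -s "http://localhost:5050/api/data/?query=thyroid&page=1&size=5"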
+
+
+@data_bp.route("/")
+@data_bp.response(200)
+def get_data_detail(data_id):
+ """Get normalized metadata for a single dataset."""
+ bs = BioStudiesExtractor(collection=BIOSTUDIES_COLLECTION)
+ zen = ZenodoExtractor(community=ZENODO_COMMUNITY, record_type=ZENODO_RECORD_TYPE)
+ bs_res = bs.search_studies(data_id, page=1, page_size=1)
+ zen_res = zen.search_records(data_id, page=1, size=1)
+ studies = bs_res.get("hits", [])
+ datasets = zen_res.get("hits", [])
+ studies, datasets = normalize_all(studies, datasets)
+ if studies:
+ return studies[0].get("norm_metadata", studies[0])
+ if datasets:
+ return datasets[0].get("norm_metadata", datasets[0])
+ abort(404, message="Dataset not found")
+
+
+# -- Validation blueprint --------------------------------------------------
+
+validation_bp = Blueprint("validation", __name__, url_prefix="/api/validation",
+ description="Data completeness validation")
+
+from src.models.cloud.tool import ServiceIndexEntry as ToolModel
+from src.models.cloud.method import Method as MethodModel
+from src.models.platform import (
+ RegulatoryQuestion as RQModel,
+ StageExplanation as SEModel,
+)
+from src.models.casestudy import CaseStudyCard as CSModel
+
+_ENTITY_REGISTRY = {
+ "tools": ("tools", ToolModel, "id", "service"),
+ "methods": ("methods", MethodModel, "id", "method"),
+ "case_studies": ("case_studies", CSModel, "slug", "title"),
+ "regulatory_questions": ("regulatory_questions", RQModel, "key", "label"),
+ "stage_explanations": ("stage_explanations", SEModel, "name", "name"),
+}
+
+_SKIP_FIELDS = {
+ "raw_json", "updated_at", "model_config",
+ "timestamp", "https",
+ "reg_q_1a", "reg_q_1b", "reg_q_2a",
+ "reg_q_2b", "reg_q_3a", "reg_q_3b",
+}
+
+
+class FieldCompleteness(Schema):
+ field = fields.Str()
+ present = fields.Bool()
+ value_preview = fields.Str(allow_none=True)
+
+
+class EntryValidation(Schema):
+ id = fields.Str()
+ label = fields.Str()
+ fields_total = fields.Int()
+ fields_filled = fields.Int()
+ completeness_pct = fields.Float()
+ missing = fields.List(fields.Str())
+ details = fields.List(fields.Nested(FieldCompleteness))
+
+
+class EntitySummary(Schema):
+ entity = fields.Str()
+ total_entries = fields.Int()
+ schema_fields = fields.List(fields.Str())
+ avg_completeness_pct = fields.Float()
+ fully_complete = fields.Int()
+ entries = fields.List(fields.Nested(EntryValidation))
+
+
+class ValidationReport(Schema):
+ generated_at = fields.Str()
+ entities = fields.List(fields.Nested(EntitySummary))
+
+
+def _is_filled(val):
+ if val is None:
+ return False
+ if isinstance(val, str) and val.strip() == "":
+ return False
+ return True
+
+
+def _preview(val, max_len=80):
+ if val is None:
+ return None
+ s = str(val)
+ return s[:max_len] + ("..." if len(s) > max_len else "")
+
+
+def _validate_entity(entity_name, table, pydantic_model, id_attr, label_attr):
+ check_fields = [f for f in pydantic_model.model_fields if f not in _SKIP_FIELDS]
+ conn = get_conn()
+ rows = conn.execute(f"SELECT * FROM {table}").fetchall()
+ conn.close()
+
+ entries = []
+ for row in rows:
+ d = dict(row)
+ details = []
+ filled = 0
+ missing = []
+ for f in check_fields:
+ val = d.get(f)
+ ok = _is_filled(val)
+ if ok:
+ filled += 1
+ else:
+ missing.append(f)
+ details.append({"field": f, "present": ok, "value_preview": _preview(val)})
+
+ total = len(check_fields)
+ pct = round(filled / total * 100, 1) if total else 100.0
+ entries.append({
+ "id": str(d.get(id_attr, "?")),
+ "label": str(d.get(label_attr) or d.get(id_attr, "?")),
+ "fields_total": total,
+ "fields_filled": filled,
+ "completeness_pct": pct,
+ "missing": missing,
+ "details": details,
+ })
+
+ avg = round(sum(e["completeness_pct"] for e in entries) / len(entries), 1) if entries else 0.0
+ fully = sum(1 for e in entries if e["completeness_pct"] == 100.0)
+ return {
+ "entity": entity_name,
+ "total_entries": len(entries),
+ "schema_fields": check_fields,
+ "avg_completeness_pct": avg,
+ "fully_complete": fully,
+ "entries": entries,
+ }
+
+
+@validation_bp.route("/")
+@validation_bp.response(200, ValidationReport)
+def validate_all():
+ """Full data completeness report."""
+ from datetime import datetime, timezone
+ return {
+ "generated_at": datetime.now(timezone.utc).isoformat(),
+ "entities": [
+ _validate_entity(name, tbl, model, id_a, lbl_a)
+ for name, (tbl, model, id_a, lbl_a) in _ENTITY_REGISTRY.items()
+ ],
+ }
+
+
+@validation_bp.route("/")
+@validation_bp.response(200, EntitySummary)
+def validate_entity(entity):
+ """Data completeness report for a single entity type."""
+ if entity not in _ENTITY_REGISTRY:
+ abort(404, message=f"Unknown entity '{entity}'. Valid: {', '.join(_ENTITY_REGISTRY)}")
+ tbl, model, id_a, lbl_a = _ENTITY_REGISTRY[entity]
+ return _validate_entity(entity, tbl, model, id_a, lbl_a)
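+
+# Illustrative check (assumes the app is running and the DB is seeded; jq is
+# optional and only used here to pick one field out of the EntitySummary):
+#
+#     curl -s http://localhost:5050/api/validation/tools | jq .avg_completeness_pct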
+
+
+# -- Registration helper ---------------------------------------------------
+
+def init_api(app: Flask) -> Api:
+ """Configure flask-smorest and register all API blueprints."""
+ app.config.update({
+ "API_TITLE": "VHP4Safety Platform API",
+ "API_VERSION": "v1",
+ "OPENAPI_VERSION": "3.0.3",
+ "OPENAPI_URL_PREFIX": "/api/v1",
+ "OPENAPI_SWAGGER_UI_PATH": "/docs",
+ "OPENAPI_SWAGGER_UI_URL": "https://cdn.jsdelivr.net/npm/swagger-ui-dist/",
+ "OPENAPI_REDOC_PATH": "/redoc",
+ "OPENAPI_REDOC_URL": "https://cdn.jsdelivr.net/npm/redoc@latest/bundles/redoc.standalone.js",
+ })
+ smorest_api = Api(app)
+ for bp in (tools_bp, methods_bp, reg_q_bp, stages_bp,
+ casestudies_bp, compounds_bp, data_bp, validation_bp):
+ smorest_api.register_blueprint(bp)
+ return smorest_api
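+
+
+# Minimal wiring sketch (assumes the SQLite file is seeded via
+# ``python -m src.seed``; this mirrors what app.py does at import time):
+#
+#     from flask import Flask
+#     from src.db import init_db
+#     from src.api import init_api
+#
+#     app = Flask(__name__)
+#     init_db()
+#     init_api(app)  # Swagger UI at /api/v1/docs, ReDoc at /api/v1/redoc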
diff --git a/src/casestudy_resolver.py b/src/casestudy_resolver.py
new file mode 100644
index 0000000..561624f
--- /dev/null
+++ b/src/casestudy_resolver.py
@@ -0,0 +1,298 @@
+"""Resolve case-study content from the database step hierarchy.
+
+Case study content JSON is seeded into the ``case_studies`` table from
+the VHP4Safety/ui-casestudy-config GitHub repo at seed time.
+The JSON has up to 6 nesting levels:
+ step1Contents → intro + regulatory questions
+ step2Contents → dict[question_key → nav with process-flow steps]
+ step3Contents → dict[q → dict[step → node]]
+ step4Contents → dict[q → dict[step → dict[substep → node]]]
+ step5Contents → dict[q → dict[...]]
+ step6Contents → dict[q → dict[...]]
+
+Given a URL path like /casestudies/kidney/Q1/Kinetics we resolve the
+node at step3Contents["Q1"]["Kinetics"] and render it server-side.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+from src.db import get_conn
+
+
+# In-memory cache keyed by slug
+_content_cache: dict[str, dict] = {}
+
+
+def get_content(slug: str) -> dict | None:
+ """Load case-study content JSON from the database (cached)."""
+ if slug in _content_cache:
+ return _content_cache[slug]
+
+ conn = get_conn()
+ row = conn.execute("SELECT content_json FROM case_studies WHERE slug = ?", (slug,)).fetchone()
+ conn.close()
+ if not row or not row["content_json"]:
+ return None
+
+ data = json.loads(row["content_json"])
+ _content_cache[slug] = data
+ return data
+
+
+# ── Resolved result ──────────────────────────────────────────────────────
+
+STEP_TYPE_COLORS = {
+ "workflow step": "btn-vhpdarkteal",
+ "workflow-step": "btn-vhpdarkteal",
+ "workflow substep": "btn-vhplightteal",
+ "workflow-substep": "btn-vhplightteal",
+ "process flow step": "btn-vhpdarkpurple",
+ "process-flow-step": "btn-vhpdarkpurple",
+ "regulatory question": "btn-vhppink-distinct",
+ "regulatory-question": "btn-vhppink-distinct",
+ "tool": "btn-vhpblue",
+}
+
+# Workflow header definitions
+WORKFLOW_STEPS = [
+ {"number": 1, "type": "regulatory-question",
+ "label": "Regulatory Question"},
+ {"number": 2, "type": "workflow-step",
+ "label": "Safety Assessment Workflow Step"},
+ {"number": 3, "type": "process-flow-step",
+ "label": "Case Study Step"},
+ {"number": 4, "type": "workflow-substep",
+ "label": "Case Study Substep"},
+ {"number": 5, "type": "tool",
+ "label": "Tools, Models and Data"},
+]
+
+
+def btn_color(step_type: str | None) -> str:
+ """Return CSS class for a step button based on its type."""
+ if not step_type:
+ return "btn-vhpblue"
+ return STEP_TYPE_COLORS.get(step_type, "btn-vhpblue")
+
+
+@dataclass
+class Breadcrumb:
+ label: str
+ url: str
+ active: bool = False
+
+
+@dataclass
+class StepButtonResolved:
+ """A button ready to render in Jinja."""
+ label: str
+ description: str = ""
+ css_class: str = "btn-vhpblue"
+ url: str = ""
+ disabled: bool = False
+ is_tool_link: bool = False
+
+
+@dataclass
+class ResolvedStep:
+ """Everything the template needs to render one case-study page."""
+ case_slug: str = ""
+ case_title: str = ""
+ step_number: int = 1
+ nav_title: str = ""
+ nav_description: str = ""
+ image_html: str = ""
+ buttons: list[StepButtonResolved] = field(default_factory=list)
+ accordion_sections: list[dict] = field(default_factory=list)
+ content_html: str = ""
+ breadcrumbs: list[Breadcrumb] = field(default_factory=list)
+ workflow_steps: list[dict] = field(default_factory=list)
+ path_parts: list[str] = field(default_factory=list)
+
+
+def _slugify(value: str) -> str:
+ """Convert space-separated label to URL-safe slug."""
+ return value.replace(" ", "_")
+
+
+def _unslugify(value: str) -> str:
+ """Convert URL slug back to the key used in JSON."""
+ return value.replace("_", " ")
+
+
+def _make_url(case: str, parts: list[str]) -> str:
+ """Build an absolute URL from case slug and path parts."""
+ base = f"/casestudies/{case}"
+ if parts:
+ return base + "/" + "/".join(_slugify(p) for p in parts)
+ return base
+
+
+def _parse_content(raw: Any) -> tuple[str, list[dict]]:
+ """Split content into HTML string and accordion sections list."""
+ if raw is None:
+ return "", []
+ if isinstance(raw, str):
+ return raw, []
+ if isinstance(raw, list):
+ sections = []
+ for item in raw:
+ if isinstance(item, dict):
+ sections.append(item)
+ return "", sections
+ return str(raw), []
+
+
+def resolve(
+ slug: str,
+ path_parts: list[str],
+ branch: str = "main",
+) -> Optional[ResolvedStep]:
+ """Resolve a URL path to the correct step content.
+
+ Parameters
+ ----------
+ slug : str
+ Case study slug (kidney, parkinson, thyroid).
+ path_parts : list[str]
+        Path segments after /casestudies/<slug>/, e.g.
+ ["Q1", "Kinetics"] for step 3.
+
+ Returns
+ -------
+ ResolvedStep or None if the path doesn't resolve.
+ """
+ data = get_content(slug)
+ if data is None:
+ return None
+
+ step1 = data.get("step1Contents", {})
+ case_title = step1.get("navTitle", slug.title() + " Case Study")
+
+ result = ResolvedStep(
+ case_slug=slug,
+ case_title=case_title,
+ path_parts=list(path_parts),
+ )
+
+ # Build workflow header state
+ active_step = len(path_parts) + 1
+ result.step_number = active_step
+ for ws in WORKFLOW_STEPS:
+ state = "completed" if ws["number"] < active_step \
+ else "active" if ws["number"] == active_step \
+ else ""
+ result.workflow_steps.append({**ws, "state": state})
+
+ # ── Step 1: no path parts ─────────────────────────────────────
+ if not path_parts:
+ result.nav_title = step1.get("navTitle", "")
+ result.nav_description = step1.get("navDescription", "")
+ html, sections = _parse_content(step1.get("content"))
+ result.content_html = html
+ result.accordion_sections = sections
+ # Buttons = regulatory questions
+ for q in step1.get("questions", []):
+ result.buttons.append(StepButtonResolved(
+ label=q.get("label", ""),
+ description=q.get("description", ""),
+ css_class=btn_color(
+ q.get("type", "regulatory-question")
+ ),
+ url=_make_url(slug, [q["value"]]),
+ disabled=q.get("state") == "disabled",
+ ))
+ result.breadcrumbs = [
+ Breadcrumb("Case Studies", "/casestudies"),
+ Breadcrumb(case_title, "", active=True),
+ ]
+ return result
+
+ # ── Step 2+: walk the nested dicts ────────────────────────────
+ # path_parts[0] is the question key (e.g. "Q1")
+ # path_parts[1] is the step2 choice (e.g. "Kinetics")
+ # etc.
+ depth = len(path_parts)
+ step_key = f"step{depth + 1}Contents"
+
+ # Navigate to the correct node
+ container = data.get(step_key, {})
+ node = container
+    for part in path_parts:
+ key = _unslugify(part)
+ if isinstance(node, dict) and key in node:
+ node = node[key]
+ else:
+ # Try original (slugified) key as fallback
+ if isinstance(node, dict) and part in node:
+ node = node[part]
+ else:
+ return None
+
+ if not isinstance(node, dict):
+ return None
+
+ # Extract node fields
+ result.nav_title = node.get("navTitle", "")
+ result.nav_description = node.get("navDescription", "")
+ result.image_html = node.get("image", "")
+ html, sections = _parse_content(node.get("content"))
+ result.content_html = html
+ result.accordion_sections = sections
+
+ # Determine next-step buttons
+ base_url_parts = list(path_parts)
+
+ if node.get("steps"):
+ for s in node["steps"]:
+ val = s.get("value", s.get("label", ""))
+ result.buttons.append(StepButtonResolved(
+ label=s.get("label", ""),
+ description=s.get("description", ""),
+ css_class=btn_color(s.get("type")),
+ url=_make_url(slug, base_url_parts + [val]),
+ disabled=s.get("state") == "disabled",
+ ))
+ elif node.get("tools"):
+ for t in node["tools"]:
+ tool_id = t.get("id")
+ route = t.get("route", "tools")
+ if tool_id:
+ url = f"/{route}/{tool_id}"
+ is_tool = True
+ else:
+ url = ""
+ is_tool = False
+ result.buttons.append(StepButtonResolved(
+ label=t.get("label", ""),
+ description=t.get("description", ""),
+ css_class=btn_color(t.get("type", "tool")),
+ url=url,
+ disabled=t.get("state") == "disabled",
+ is_tool_link=is_tool,
+ ))
+
+ # Breadcrumbs
+ crumbs = [Breadcrumb("Case Studies", "/casestudies")]
+ crumbs.append(Breadcrumb(
+ case_title, _make_url(slug, []),
+ ))
+
+ # Build intermediate crumbs
+    # Step 2 label = "Regulatory Question <key>"
+ for i, part in enumerate(path_parts):
+ is_last = (i == len(path_parts) - 1)
+ label = _unslugify(part)
+ if i == 0:
+ label = f"Regulatory Question {label}"
+ url = _make_url(slug, path_parts[: i + 1])
+ crumbs.append(Breadcrumb(
+ label, url, active=is_last,
+ ))
+
+ result.breadcrumbs = crumbs
+ return result
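+
+
+# Illustrative use, with the path taken from the module docstring above:
+#
+#     step = resolve("kidney", ["Q1", "Kinetics"])
+#     if step is not None:
+#         print(step.nav_title, [b.label for b in step.buttons])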
diff --git a/src/db.py b/src/db.py
new file mode 100644
index 0000000..4affbf4
--- /dev/null
+++ b/src/db.py
@@ -0,0 +1,75 @@
+"""Thin sqlite3 helper. No ORM — just raw SQL."""
+
+from __future__ import annotations
+
+import os
+import sqlite3
+from contextlib import contextmanager
+
+DB_PATH = os.environ.get("DATABASE_PATH", "data/vhp4safety.db")
+
+_TABLES = [
+ """CREATE TABLE IF NOT EXISTS tools (
+ id TEXT PRIMARY KEY, service TEXT NOT NULL, description TEXT,
+ stage TEXT, html_name TEXT, md_file_name TEXT, png_file_name TEXT,
+ main_url TEXT, inst_url TEXT,
+ reg_q_1a INTEGER, reg_q_1b INTEGER, reg_q_2a INTEGER,
+ reg_q_2b INTEGER, reg_q_3a INTEGER, reg_q_3b INTEGER,
+ login TEXT, api_type TEXT, casestudy TEXT, provider TEXT,
+ provider_email TEXT, citation TEXT, version TEXT, license TEXT,
+ sourcecode TEXT, docker TEXT, bio_tools TEXT, tess TEXT,
+ raw_json TEXT, updated_at TEXT
+ )""",
+ """CREATE TABLE IF NOT EXISTS methods (
+ id TEXT PRIMARY KEY, method TEXT NOT NULL, issue_number INTEGER,
+ description TEXT, stage TEXT, substage TEXT,
+ catalog_webpage_url TEXT, case_study TEXT, regulatory_question TEXT,
+ reg_q_1a INTEGER, reg_q_1b INTEGER, reg_q_2a INTEGER,
+ reg_q_2b INTEGER, reg_q_3a INTEGER, reg_q_3b INTEGER,
+ data_producer TEXT, sop TEXT, vendor TEXT, catalog_number TEXT,
+ citation TEXT, type_iri TEXT, ontology TEXT,
+ key_event_id TEXT, aop_id TEXT, raw_json TEXT, updated_at TEXT
+ )""",
+ """CREATE TABLE IF NOT EXISTS regulatory_questions (
+ key TEXT PRIMARY KEY, label TEXT NOT NULL, explanation TEXT NOT NULL
+ )""",
+ """CREATE TABLE IF NOT EXISTS stage_explanations (
+ name TEXT PRIMARY KEY, explanation TEXT NOT NULL
+ )""",
+ """CREATE TABLE IF NOT EXISTS glossary_stage_mappings (
+ glossary_url TEXT PRIMARY KEY, stage_name TEXT NOT NULL
+ )""",
+ """CREATE TABLE IF NOT EXISTS case_studies (
+ slug TEXT PRIMARY KEY, title TEXT NOT NULL, description TEXT NOT NULL,
+ image_src TEXT, image_alt TEXT,
+ config_repo TEXT DEFAULT 'VHP4Safety/ui-casestudy-config',
+ default_branch TEXT DEFAULT 'main', content_json TEXT
+ )""",
+]
+
+
+def get_conn() -> sqlite3.Connection:
+ """Return a new connection with Row factory."""
+ conn = sqlite3.connect(DB_PATH)
+ conn.row_factory = sqlite3.Row
+ return conn
+
+
+@contextmanager
+def get_db():
+ """Context manager: yields a connection, auto-closes."""
+ conn = get_conn()
+ try:
+ yield conn
+ finally:
+ conn.close()
+
+
+def init_db() -> None:
+ """Create all tables (idempotent)."""
+ os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
+ conn = sqlite3.connect(DB_PATH)
+ for ddl in _TABLES:
+ conn.execute(ddl)
+ conn.commit()
+ conn.close()
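+
+
+# Example usage (a sketch; DATABASE_PATH may override the default location):
+#
+#     from src.db import init_db, get_db
+#     init_db()
+#     with get_db() as conn:
+#         names = [r["service"] for r in conn.execute(
+#             "SELECT service FROM tools ORDER BY service")]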
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/models/casestudy.py b/src/models/casestudy.py
new file mode 100644
index 0000000..430ef03
--- /dev/null
+++ b/src/models/casestudy.py
@@ -0,0 +1,209 @@
+"""Pydantic models for VHP4Safety case-study content JSON schemas.
+
+The JSON files originate from a separate GitHub repo
+(VHP4Safety/ui-casestudy-config) and are fetched once during database
+seeding (``python -m src.seed``). The full JSON blob is stored in
+the ``case_studies.content_json`` column and resolved server-side
+by ``src.casestudy_resolver`` into rendered Jinja templates.
+
+These models formalise the structure so it can be validated
+server-side, used in tests, and consumed by type-aware code.
+
+Hierarchy (up to 6 levels deep):
+ CaseStudyContent ← root of one *_content.json file
+ └ Step1Contents ← intro + regulatory-question buttons
+ └ step2Contents ← dict[question_key → ProcessFlowNav]
+ └ step3Contents ← dict[question_key → dict[step_label → WorkflowStepNode]]
+ └ step4–6Contents ← additional nesting (same WorkflowStepNode shape)
+
+Every "node" at step ≥ 2 follows the same recursive pattern captured
+by ``WorkflowStepNode``.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+
+# ── Enums ─────────────────────────────────────────────────────────────────
+
+
+class StepType(str, Enum):
+ """Button colour / role categories used by the JS renderer."""
+
+ WORKFLOW_STEP = "workflow step"
+ WORKFLOW_SUBSTEP = "workflow substep"
+ PROCESS_FLOW_STEP = "process flow step"
+ REGULATORY_QUESTION = "regulatory question"
+ TOOL = "tool"
+
+
+class CaseStudySlug(str, Enum):
+ """Known case-study URL slugs."""
+
+ KIDNEY = "kidney"
+ PARKINSON = "parkinson"
+ THYROID = "thyroid"
+
+
+# ── Leaf / reusable pieces ────────────────────────────────────────────────
+
+
+class StepButton(BaseModel):
+ """A single clickable button shown in a step panel.
+
+ Appears in ``questions``, ``steps``, and ``tools`` arrays.
+ """
+
+ label: str
+ value: Optional[str] = None
+ description: Optional[str] = None
+ type: Optional[StepType] = None
+ state: Optional[str] = None # e.g. "disabled"
+
+ # tool-specific fields
+ id: Optional[str] = None
+ route: Optional[str] = None # e.g. "tools" or "methods"
+
+ model_config = {"extra": "allow"}
+
+
+class AccordionSection(BaseModel):
+ """One collapsible section inside ``content`` when it is an array."""
+
+ section: Optional[str] = None
+ description: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+# Content can be a raw HTML string **or** a list of accordion sections.
+# We keep it as ``Any`` so both shapes validate; downstream code already
+# branches on ``Array.isArray(content)`` in JS.
+ContentBlock = str | list[AccordionSection] | None
+
+
+# ── Step 1 (intro + regulatory questions) ────────────────────────────────
+
+
+class Step1Contents(BaseModel):
+ """Top-level intro panel for a case study.
+
+ Shown on first load; contains the two regulatory-question buttons.
+ """
+
+ navTitle: str
+ navDescription: str = ""
+ questions: list[StepButton] = Field(default_factory=list)
+ content: Any = None # HTML string or accordion list
+
+ model_config = {"extra": "allow"}
+
+
+# ── Generic workflow node (steps 2–6) ─────────────────────────────────────
+
+
+class WorkflowStepNode(BaseModel):
+ """A single node at any depth in the step hierarchy.
+
+ Depending on what keys are present the JS renderer shows:
+ * ``steps`` → navigable sub-step buttons (goes deeper)
+ * ``tools`` → tool buttons (leaf, may link to /tools/)
+ * neither → plain content panel
+
+ Nodes may contain ``content`` as HTML **or** accordion JSON.
+    ``image`` is an optional raw HTML string (e.g. an ``<img>`` tag).
+ """
+
+ navTitle: Optional[str] = None
+ navDescription: Optional[str] = None
+ steps: Optional[list[StepButton]] = None
+ tools: Optional[list[StepButton]] = None
+ content: Any = None
+ image: Optional[str] = None
+
+ # Some step-3 entries carry a flag to signal step-4 exists
+ step4content: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+class ProcessFlowNav(BaseModel):
+ """Step-2 panel: safety-assessment workflow steps for one question.
+
+    ``steps`` lists the process-flow buttons; ``content`` is the
+    intro HTML.
+ """
+
+ navTitle: str = ""
+ navDescription: str = ""
+ steps: list[StepButton] = Field(default_factory=list)
+ content: Any = None
+ image: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+# ── Root document ─────────────────────────────────────────────────────────
+
+# Steps 3-6 are nested dicts whose keys are dynamic (question key,
+# step label, sub-step label …). We type them as deeply as
+# practical; the innermost values are always WorkflowStepNode.
+
+Step3Map = dict[str, dict[str, WorkflowStepNode]]
+Step4Map = dict[str, dict[str, dict[str, WorkflowStepNode]]]
+Step5Map = dict[str, dict[str, dict[str, dict[str, WorkflowStepNode]]]]
+Step6Map = dict[
+ str, dict[str, dict[str, dict[str, dict[str, WorkflowStepNode]]]]
+]
+
+
+class CaseStudyContent(BaseModel):
+ """Root schema for a ``_content.json`` file.
+
+ Mirrors exactly the shape consumed by ``casestudies.js``.
+ """
+
+ step1Contents: Step1Contents
+ step2Contents: dict[str, ProcessFlowNav] = Field(
+ default_factory=dict
+ )
+ step3Contents: Optional[Step3Map] = None
+ step4Contents: Optional[Step4Map] = None
+ step5Contents: Optional[Step5Map] = None
+ step6Contents: Optional[Step6Map] = None
+
+ model_config = {"extra": "allow"}
+
+
+# ── Case study card (listing page) ───────────────────────────────────────
+
+
+class CaseStudyCard(BaseModel):
+ """Metadata for one card on the /casestudies listing page."""
+
+ slug: CaseStudySlug
+ title: str
+ description: str
+ image_src: str = ""
+ image_alt: str = ""
+ url: str = ""
+ config_repo: Optional[str] = None
+ content_json: Optional[str] = None
+
+
+# ── Convenience: full registry ────────────────────────────────────────────
+
+
+class CaseStudyRegistry(BaseModel):
+ """All known case studies with their summary cards and loaded content."""
+
+ cards: list[CaseStudyCard] = Field(default_factory=list)
+ content: dict[CaseStudySlug, CaseStudyContent] = Field(
+ default_factory=dict,
+ )
+
+ model_config = {"extra": "allow"}
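+
+
+# Validation sketch (illustrative; the dict below is a minimal hypothetical
+# *_content.json, not a real case-study file):
+#
+#     minimal = {
+#         "step1Contents": {
+#             "navTitle": "Kidney case study",
+#             "questions": [{"label": "Q1", "type": "regulatory question"}],
+#         },
+#         "step2Contents": {},
+#     }
+#     content = CaseStudyContent.model_validate(minimal)
+#     assert content.step1Contents.questions[0].type is StepType.REGULATORY_QUESTION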
diff --git a/src/models/cloud/method.py b/src/models/cloud/method.py
new file mode 100644
index 0000000..48d37f9
--- /dev/null
+++ b/src/models/cloud/method.py
@@ -0,0 +1,134 @@
+"""Pydantic models for VHP4Safety Cloud method JSON schemas."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+class ServiceContact(BaseModel):
+ name: Optional[str] = None
+ email: Optional[str] = None
+
+
+class ServiceProvider(BaseModel):
+ contact: Optional[ServiceContact] = None
+ url: Optional[str] = None
+ name: Optional[str] = None
+
+
+class ServiceInstance(BaseModel):
+ type: Optional[str] = None
+ url: Optional[str] = None
+ license: Optional[str] = None
+ version: Optional[str] = None
+ source: Optional[str] = None
+ vhp_platform: Optional[str] = Field(None, alias="vhp-platform")
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class ServiceAccess(BaseModel):
+ API: Optional[str] = None
+ login: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+class ServiceIntro(BaseModel):
+ title: Optional[str] = None
+ url: Optional[str] = None
+
+
+class RegulatoryQuestion(BaseModel):
+ q1a: Optional[str] = Field(None, alias="1a")
+ q1b: Optional[str] = Field(None, alias="1b")
+ q2a: Optional[str] = Field(None, alias="2a")
+ q2b: Optional[str] = Field(None, alias="2b")
+ q3a: Optional[str] = Field(None, alias="3a")
+ q3b: Optional[str] = Field(None, alias="3b")
+
+ model_config = {"populate_by_name": True}
+
+
+class Service(BaseModel):
+ """A single service entry (docs/service/*.json)."""
+
+ id: str
+ service: str = Field(description="Service display name")
+ description: Optional[str] = None
+
+ stage: Optional[str] = None
+ substage: Optional[str] = None
+ screenshot: Optional[str] = None
+ url: Optional[str] = None
+
+ instance: Optional[ServiceInstance] = None
+ intro: Optional[ServiceIntro] = None
+ provider: Optional[ServiceProvider] = None
+ access: Optional[ServiceAccess] = None
+ regulatory_question: Optional[RegulatoryQuestion] = Field(
+ None, alias="regulatory-question"
+ )
+ ELIXIR: Optional[dict] = None
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class ServiceIndexEntry(BaseModel):
+ """A service as represented in the index (cap/service_index.json)."""
+
+ id: str
+ service: str
+ description: Optional[str] = None
+
+ html_name: Optional[str] = None
+ md_file_name: Optional[str] = None
+ png_file_name: Optional[str] = None
+ stage: Optional[str] = None
+ main_url: Optional[str] = None
+ inst_url: Optional[str] = None
+
+ # Regulatory question flags
+ reg_q_1a: Optional[str] = None
+ reg_q_1b: Optional[str] = None
+ reg_q_2a: Optional[str] = None
+ reg_q_2b: Optional[str] = None
+ reg_q_3a: Optional[str] = None
+ reg_q_3b: Optional[str] = None
+
+ # Upstream issue-template fields (new-tool-service-entry.yml)
+ login: Optional[str] = None
+ api_type: Optional[str] = Field(None, alias="api")
+ casestudy: Optional[str] = None
+ provider: Optional[str] = None
+ provider_email: Optional[str] = Field(
+ None, alias="provider-email"
+ )
+ citation: Optional[str] = None
+ version: Optional[str] = None
+ license: Optional[str] = None
+ sourcecode: Optional[str] = None
+ docker: Optional[str] = None
+ bio_tools: Optional[str] = Field(None, alias="bioTools")
+ tess: Optional[str] = None
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class ServiceIndex(BaseModel):
+ """The full service index (cap/service_index.json).
+
+ A mapping of service id → ServiceIndexEntry.
+ """
+
+ root: dict[str, ServiceIndexEntry] = Field(default_factory=dict)
+
+ model_config = {"extra": "allow"}
+
+ @classmethod
+ def from_dict(cls, data: dict) -> ServiceIndex:
+ return cls(
+ root={k: ServiceIndexEntry.model_validate(v) for k, v in data.items()}
+ )
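+
+
+# Usage sketch (illustrative; "qsprpred" stands in for a real service id):
+#
+#     raw = {"qsprpred": {"id": "qsprpred", "service": "QSPRpred"}}
+#     index = ServiceIndex.from_dict(raw)
+#     assert index.root["qsprpred"].service == "QSPRpred"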
diff --git a/src/models/cloud/tool.py b/src/models/cloud/tool.py
new file mode 100644
index 0000000..01b574a
--- /dev/null
+++ b/src/models/cloud/tool.py
@@ -0,0 +1,98 @@
+"""Pydantic models for VHP4Safety Cloud tool JSON schemas."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class Method(BaseModel):
+ """A single method entry (docs/methods/*.json).
+
+    Field names match the ``methods`` table columns in src/db.py.
+ Aliases map to the raw JSON keys from the cloud repo.
+ """
+
+ id: str
+ method: str = Field(description="Method title (from issue title)")
+ issue_number: Optional[int] = None
+ description: Optional[str] = Field(
+ None, alias="method_description_content"
+ )
+
+ # Upstream issue-template fields (new-tool-method-entry.yml)
+ data_producer: Optional[str] = Field(
+ None, alias="data_producer_content"
+ )
+ sop: Optional[str] = Field(
+ None, alias="available_sop_or_protocol_content"
+ )
+ vendor: Optional[str] = Field(
+ None, alias="vendor_content"
+ )
+ catalog_number: Optional[str] = Field(
+ None, alias="catalog_number_content"
+ )
+ catalog_webpage_url: Optional[str] = None
+ citation: Optional[str] = Field(
+ None, alias="citation_content"
+ )
+ stage: Optional[str] = Field(
+ None, alias="vhp4safety_workflow_stage_content"
+ )
+ substage: Optional[str] = Field(
+ None, alias="workflow_substage_content"
+ )
+ case_study: Optional[str] = Field(
+ None, alias="case_study_content"
+ )
+ regulatory_question: Optional[str] = Field(
+ None, alias="regulatory_question_content"
+ )
+ type_iri: Optional[str] = Field(
+ None, alias="ontology_term_content"
+ )
+ ontology: Optional[str] = Field(
+ None, alias="type_content"
+ )
+ key_event_id: Optional[str] = Field(
+ None,
+ alias="relevant_aop_wiki_key_event(s)_to_the_assay_content",
+ )
+ aop_id: Optional[str] = Field(
+ None,
+ alias="relevant_aop_wiki_adverse_outcome_pathway(s)"
+ "_to_the_assay_content",
+ )
+
+ # Regulatory question flags
+ reg_q_1a: Optional[str] = None
+ reg_q_1b: Optional[str] = None
+ reg_q_2a: Optional[str] = None
+ reg_q_2b: Optional[str] = None
+ reg_q_3a: Optional[str] = None
+ reg_q_3b: Optional[str] = None
+
+ timestamp: Optional[datetime] = None
+ https: Optional[str] = Field(
+ None, description="Broken URL fragment in some files"
+ )
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class MethodIndex(BaseModel):
+ """The full methods index (cap/methods_index.json).
+
+ A mapping of method id → Method.
+ """
+
+ root: dict[str, Method] = Field(default_factory=dict)
+
+ model_config = {"extra": "allow"}
+
+ @classmethod
+ def from_dict(cls, data: dict) -> MethodIndex:
+ return cls(root={k: Method.model_validate(v) for k, v in data.items()})
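+
+
+# Usage sketch (illustrative; "m-001" and the field values are made up):
+#
+#     raw = {
+#         "m-001": {
+#             "id": "m-001",
+#             "method": "Example assay",
+#             "method_description_content": "Short description.",
+#             "vhp4safety_workflow_stage_content": "Hazard assessment",
+#         }
+#     }
+#     index = MethodIndex.from_dict(raw)
+#     m = index.root["m-001"]
+#     assert m.description == "Short description."  # populated via alias
+#     assert m.stage == "Hazard assessment"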
diff --git a/src/models/compound.py b/src/models/compound.py
new file mode 100644
index 0000000..1b871f4
--- /dev/null
+++ b/src/models/compound.py
@@ -0,0 +1,75 @@
+"""Pydantic models for compound data from CompoundCloud SPARQL.
+
+These are not stored in the database — they model the responses from
+the CompoundCloud Wikibase SPARQL endpoint and from Wikidata QLever
+for experimental data.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class CompoundSummary(BaseModel):
+ """Core compound identifiers from CompoundCloud."""
+
+ wcid: str = Field(description="CompoundCloud entity URI")
+ label: str = Field(description="Human-readable compound name")
+ inchi: str = ""
+ inchikey: str = ""
+ smiles: str = Field("", alias="SMILES")
+ formula: str = ""
+ mass: str = ""
+
+ model_config = {"populate_by_name": True}
+
+
+class CompoundIdentifier(BaseModel):
+ """A single external identifier for a compound."""
+
+ property_label: str = Field(
+ "", description="Name of the identifier property"
+ )
+ value: str = ""
+ formatter_url: str = Field(
+ "", description="URL template for the identifier"
+ )
+
+
+class CompoundToxicology(BaseModel):
+ """A toxicology property row."""
+
+ property_label: str = ""
+ value: str = ""
+
+
+class CompoundExperimentalDatum(BaseModel):
+ """A single experimental measurement from Wikidata."""
+
+ property_label: str = Field(
+ "", description="Measured property name"
+ )
+ value: str = ""
+ units_label: str = ""
+ source: str = ""
+ doi: str = ""
+ see_also: str = Field(
+ "", description="Link to the Wikidata statement"
+ )
+
+
+class CompoundDetail(BaseModel):
+ """Full compound view combining all SPARQL query results."""
+
+ summary: Optional[CompoundSummary] = None
+ identifiers: list[CompoundIdentifier] = Field(
+ default_factory=list
+ )
+ toxicology: list[CompoundToxicology] = Field(
+ default_factory=list
+ )
+ experimental_data: list[CompoundExperimentalDatum] = Field(
+ default_factory=list
+ )
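+
+
+# Assembly sketch (illustrative; the URI and values below are hypothetical
+# example values, not real SPARQL output):
+#
+#     detail = CompoundDetail(
+#         summary=CompoundSummary(wcid="https://example.org/entity/Q1", label="caffeine"),
+#         identifiers=[CompoundIdentifier(property_label="CAS number", value="58-08-2")],
+#     )
+#     assert detail.toxicology == []  # list fields default to empty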
diff --git a/src/models/data/__init__.py b/src/models/data/__init__.py
new file mode 100644
index 0000000..6fd1d01
--- /dev/null
+++ b/src/models/data/__init__.py
@@ -0,0 +1,50 @@
+"""Data models & extractors for BioStudies and Zenodo datasets."""
+
+from src.models.data.biostudies import BioStudiesExtractor
+from src.models.data.zenodo import ZenodoExtractor
+from src.models.data.mapping import normalize_all
+from src.models.data.schemas import (
+ Author,
+ Attribute,
+ AuthorDetail,
+ BiologicalContext,
+ BioStudiesParsedMetadata,
+ DataFile,
+ ExperimentalDesign,
+ FileEntry,
+ Funding,
+ LinkEntry,
+ NormalizedMetadata,
+ ProtocolEntry,
+ Publication,
+ TechnicalDetails,
+ UrlExistsResult,
+ ZenodoFileEntry,
+ ZenodoParsedMetadata,
+)
+
+__all__ = [
+ # Extractors
+ "BioStudiesExtractor",
+ "ZenodoExtractor",
+ # Normalizer
+ "normalize_all",
+ # Pydantic models
+ "Author",
+ "Attribute",
+ "AuthorDetail",
+ "BiologicalContext",
+ "BioStudiesParsedMetadata",
+ "DataFile",
+ "ExperimentalDesign",
+ "FileEntry",
+ "Funding",
+ "LinkEntry",
+ "NormalizedMetadata",
+ "ProtocolEntry",
+ "Publication",
+ "TechnicalDetails",
+ "UrlExistsResult",
+ "ZenodoFileEntry",
+ "ZenodoParsedMetadata",
+]
diff --git a/src/models/data/biostudies.py b/src/models/data/biostudies.py
new file mode 100644
index 0000000..2756cb1
--- /dev/null
+++ b/src/models/data/biostudies.py
@@ -0,0 +1,867 @@
+import json
+import re
+import time
+from urllib.parse import quote
+
+import requests
+
+
+class BioStudiesExtractor:
+ """Class to handle BioStudies API interactions"""
+
+ _SPLIT_RE = re.compile(r"^(.*?)(\d+)$")
+
+ def __init__(self, collection: str = ""):
+ self.base_url = "https://www.ebi.ac.uk/biostudies/api/v1"
+ self.ftp_base = "https://ftp.ebi.ac.uk/pub/databases/biostudies/"
+ self.studies_url = self.base_url + "/studies"
+ self.search_url = (
+ f"{self.base_url}/{collection}/search"
+ if collection
+ else f"{self.base_url}/search"
+ )
+
+ # -----------------------------
+ # ID validation / URL building
+ # -----------------------------
+ def validate_study_id(self, study_id):
+ """
+ Validate BioStudies ID format
+
+ Args:
+ study_id (str): BioStudies accession ID
+
+ Returns:
+ tuple: (is_valid, cleaned_id, error_message)
+ """
+ if not study_id or not isinstance(study_id, str):
+ return False, None, "Study ID is required"
+
+ verified_id = study_id.strip().upper()
+
+ # Examples: S-ONTX26, E-MTAB-1234, S-BSST123, S-VHPS21, S-TOXR1735
+ patterns = [
+ r"^S-[A-Z0-9]+$", # Studies starting with S-
+ r"^E-[A-Z]+-\d+$", # Expression studies like E-MTAB-1234
+ r"^[A-Z]+-\d+$", # General pattern like ABC-123
+ ]
+
+ if not any(re.match(pattern, verified_id) for pattern in patterns):
+ return (
+ False,
+ verified_id,
+ "Invalid BioStudies ID format. Expected format: S-ONTX26, E-MTAB-1234, etc.",
+ )
+
+ return True, verified_id, None
+
+ def split_text_int(self, value: str):
+ """
+ Splits trailing integer from a string.
+ 'S-VHPS21' -> ('S-VHPS', 21)
+ 'ABC' -> ('ABC', None)
+ 'X-12A' -> ('X-12A', None)
+ """
+ if not value:
+ return value, None
+ m = self._SPLIT_RE.match(value)
+ if not m:
+ return value, None
+ prefix, num = m.group(1), int(m.group(2))
+ return prefix, num
+
+ def build_biostudies_https_file_url(self, accno: str, filename: str) -> str | None:
+ """
+ Constructs:
+ https://ftp.ebi.ac.uk/pub/databases/biostudies/{prefix}/{num3}/{accno}/Files/{filename}
+
+ Returns None if accno has no trailing integer.
+
+ Note:
+ - We keep "/" safe in case filename contains subfolders (rare, but possible).
+ """
+ prefix, num = self.split_text_int(accno)
+ if num is None or not filename:
+ return None
+
+ num3 = f"{num:03d}"
+
+ # Encode only the filename segment (allow "/" for potential subpaths)
+ safe_name = quote(filename, safe="/")
+
+ return (
+ self.ftp_base
+ + f"{prefix}/{num3}/{accno}/Files/{safe_name}"
+ )
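+
+    # Worked example (derived from the rules above; the filename is hypothetical):
+    #     build_biostudies_https_file_url("S-VHPS21", "data.csv") returns
+    #     "https://ftp.ebi.ac.uk/pub/databases/biostudies/S-VHPS/021/S-VHPS21/Files/data.csv"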
+
+ def url_exists_no_download(self, url: str, timeout=(3.05, 10)):
+ """
+ Returns a dict describing existence with minimal data transfer.
+ - tries HEAD
+ - falls back to GET Range bytes=0-0
+ """
+ result = {
+ "url": url,
+ "exists": False,
+ "status_code": None,
+ "content_length": None,
+ "final_url": None,
+ "error": None,
+ "method": None,
+ }
+
+ if not url:
+ result["error"] = "Empty URL"
+ return result
+
+ try:
+ # 1) HEAD (preferred: no body)
+ r = requests.head(url, allow_redirects=True, timeout=timeout)
+ result["status_code"] = r.status_code
+ result["final_url"] = str(r.url)
+ result["method"] = "HEAD"
+
+ if r.status_code == 200:
+ result["exists"] = True
+ result["content_length"] = r.headers.get("Content-Length")
+ return result
+
+ # 2) Fallback if HEAD not allowed or forbidden, etc.
+ if r.status_code in (403, 405):
+ rg = requests.get(
+ url,
+ stream=True,
+ allow_redirects=True,
+ headers={"Range": "bytes=0-0"},
+ timeout=timeout,
+ )
+ result["status_code"] = rg.status_code
+ result["final_url"] = str(rg.url)
+ result["method"] = "GET_RANGE"
+
+ # 206 Partial Content is a strong "exists"
+ if rg.status_code in (200, 206):
+ result["exists"] = True
+ result["content_length"] = rg.headers.get("Content-Length")
+
+ return result
+
+ # other codes (404, 410, 500...) treated as not found / not accessible
+ return result
+
+ except requests.RequestException as e:
+ result["error"] = str(e)
+ return result
+
+ def _pick_rocrate_file(self, files: list[dict]) -> dict | None:
+ """
+ Return the first file dict whose name/path contains 'rocrate' (case-insensitive).
+ Preference order:
+ 1) files where exists_check.exists is True (if exists_check present)
+ 2) otherwise first match
+ """
+ if not isinstance(files, list) or not files:
+ return None
+
+ def fname(f: dict) -> str:
+ if not isinstance(f, dict):
+ return ""
+ return str(f.get("name") or f.get("path") or "").lower()
+
+ # All matches by name/path
+ matches = [f for f in files if "rocrate" in fname(f)]
+ if not matches:
+ return None
+
+ # Prefer verified existing ones if available
+ verified = [
+ f for f in matches
+ if isinstance(f, dict)
+ and isinstance(f.get("exists_check"), dict)
+ and f["exists_check"].get("exists") is True
+ ]
+ return verified[0] if verified else matches[0]
+
+ # -----------------------------
+ # API operations
+ # -----------------------------
+ def get_study_metadata(self, study_id):
+ """
+ Extract metadata for a given BioStudies ID
+
+ Args:
+ study_id (str): BioStudies accession ID (e.g., S-ONTX26)
+
+ Returns:
+ dict: Parsed metadata or error information
+ """
+ try:
+ # Validate study ID format
+ is_valid, verified_id, validation_error = self.validate_study_id(study_id)
+ if not is_valid:
+ return {"error": validation_error}
+
+ url = self.studies_url + f"/{verified_id}"
+
+ headers = {
+ "Accept": "application/json",
+ "User-Agent": "BioStudies-VHP4Safety-App/1.0",
+ }
+
+ response = requests.get(url, headers=headers, timeout=30)
+
+ if response.status_code == 200:
+ try:
+ data = response.json()
+ if not data:
+ return {"error": f"Empty response received for study {verified_id}"}
+
+ # Parse metadata first, then build URL using the derived collection (no extra API calls)
+ md = self.parse_metadata(data)
+ collection = md.get("collection", "")
+ web_url = self.build_study_url(verified_id, collection).get("url", "")
+ return md | {"url": web_url}
+
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from BioStudies API: {str(e)}"}
+
+ elif response.status_code == 404:
+ return {
+ "error": f"Study '{verified_id}' not found in BioStudies database. Please check the ID and try again."
+ }
+ elif response.status_code == 403:
+ return {"error": "Access forbidden. The study may be restricted or private."}
+ elif response.status_code == 500:
+ return {"error": "BioStudies server error. Please try again later."}
+ elif response.status_code == 503:
+ return {"error": "BioStudies service temporarily unavailable. Please try again later."}
+ else:
+ return {"error": f"BioStudies API returned status {response.status_code}. Please try again later."}
+
+ except requests.exceptions.Timeout:
+ return {"error": "Request timed out. BioStudies server may be slow. Please try again."}
+ except requests.exceptions.ConnectionError:
+ return {"error": "Cannot connect to BioStudies server. Please check your internet connection."}
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error: {str(e)}"}
+ except Exception as e:
+ return {"error": f"Unexpected error occurred: {str(e)}"}
+
+ def get_study_collection(self, study_id):
+ """
+ Extract collection for a given BioStudies ID
+ """
+ metadata = self.get_study_metadata(study_id)
+ if "error" in metadata:
+ return metadata
+ collection = metadata.get("collection", "")
+ return {"accession": study_id, "collection": collection}
+
+ def build_study_url(self, study_id, collection: str = ""):
+ """
+ Build the URL to access the study in BioStudies web interface
+ """
+ is_valid, verified_id, validation_error = self.validate_study_id(study_id)
+ if not is_valid:
+ return {"error": validation_error}
+
+ if collection:
+ url = f"https://www.ebi.ac.uk/biostudies/{collection}/studies/{verified_id}"
+ else:
+ url = f"https://www.ebi.ac.uk/biostudies/studies/{verified_id}"
+
+ return {"accession": verified_id, "url": url}
+
+ # -----------------------------
+ # Search / list
+ # -----------------------------
+ def search_studies(
+ self,
+ query,
+ page=1,
+ page_size=10,
+ load_metadata: bool = True,
+ filters: tuple[tuple] | None = None,
+ ) -> dict:
+ """
+ Search for studies in BioStudies database
+ """
+ try:
+ if not query or not isinstance(query, str):
+ return {"error": "Search query must be a non-empty string."}
+
+ filters_applied = bool(filters)
+ if filters_applied:
+ load_metadata = True
+
+ params = {"query": query, "page": page, "pageSize": page_size}
+
+ headers = {
+ "Accept": "application/json",
+ "User-Agent": "BioStudies-VHP4Safety-App/1.0",
+ }
+
+ response = requests.get(self.search_url, headers=headers, params=params, timeout=30)
+
+ if response.status_code == 200:
+ try:
+ data = response.json()
+ hits = data.get("hits", [])
+ total_hits = data.get("totalHits", 0)
+
+ if not data or total_hits == 0:
+ return {"error": "No results found."}
+
+ if load_metadata:
+ hits = self._hit_metadata(hits)
+ hits = self._hit_url(hits)
+
+ if filters_applied:
+ hits = self._apply_filters(hits, filters)
+
+ page_size_met = len(hits) >= page_size
+ pages_fetched = 1
+
+ if not page_size_met:
+ hits, page_size_met, pages_fetched = self._backfill_filtered_results(
+ hits, page, page_size, filters, query
+ )
+
+ return {
+ "totalHits": total_hits,
+ "hits": hits,
+ "hits_returned": len(hits),
+ "page": page,
+ "pageSize": page_size,
+ "pages_fetched": pages_fetched,
+ "filters_applied": True,
+ "page_size_met": page_size_met,
+ }
+
+ return data | {"hits": hits, "total": total_hits}
+
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from BioStudies API: {str(e)}"}
+
+ elif response.status_code == 400:
+ return {"error": "Bad request. Please check your search parameters."}
+ elif response.status_code == 403:
+ return {"error": "Access forbidden. The collection may be restricted."}
+ elif response.status_code == 500:
+ return {"error": "BioStudies server error. Please try again later."}
+ elif response.status_code == 503:
+ return {"error": "BioStudies service temporarily unavailable. Please try again later."}
+ else:
+ return {"error": f"BioStudies API returned status {response.status_code}. Please try again later."}
+
+ except requests.exceptions.Timeout:
+ return {"error": "Request timed out. BioStudies server may be slow. Please try again."}
+ except requests.exceptions.ConnectionError:
+ return {"error": "Cannot connect to BioStudies server. Please check your internet connection."}
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error: {str(e)}"}
+ except Exception as e:
+ return {"error": f"Unexpected error occurred: {str(e)}"}
+
+ def list_studies(
+ self,
+ page=1,
+ page_size=50,
+ include_urls: bool = False,
+ load_metadata: bool = False,
+ filters: tuple[tuple] | None = None,
+ ) -> dict:
+ """
+ List studies in the configured BioStudies collection for a specific page.
+ """
+ filters_applied = bool(filters)
+ if filters_applied:
+ load_metadata = True
+ include_urls = True
+
+ headers = {
+ "Accept": "application/json",
+ "User-Agent": "BioStudies-VHP4Safety-App/1.0",
+ }
+ params = {"page": page, "pageSize": page_size}
+
+ try:
+ response = requests.get(self.search_url, headers=headers, params=params, timeout=30)
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error during listing: {e}", "total": 0, "hits": []}
+
+ if response.status_code != 200:
+ return {
+ "error": f"BioStudies API returned status {response.status_code} while listing.",
+ "total": 0,
+ "hits": [],
+ }
+
+ try:
+ data = response.json()
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from BioStudies API: {str(e)}", "total": 0, "hits": []}
+
+ total_hits = data.get("totalHits") or data.get("total") or 0
+ hits = data.get("hits", [])
+
+ if include_urls:
+ hits = self._hit_url(hits)
+ if load_metadata:
+ hits = self._hit_metadata(hits)
+
+ if filters_applied:
+ hits = self._apply_filters(hits, filters)
+
+ page_size_met = len(hits) >= page_size
+ pages_fetched = 1
+
+ if not page_size_met:
+ hits, page_size_met, pages_fetched = self._backfill_filtered_results(
+ hits, page, page_size, filters, query=None
+ )
+
+ return {
+ "totalHits": total_hits,
+ "total": total_hits,
+ "hits": hits,
+ "hits_returned": len(hits),
+ "page": page,
+ "pageSize": page_size,
+ "pages_fetched": pages_fetched,
+ "filters_applied": True,
+ "page_size_met": page_size_met,
+ }
+
+ return {"total": total_hits, "hits": hits}
+
+ def _hit_url(self, hits: list) -> list:
+ for hit in hits:
+ acc = hit.get("accession") or hit.get("accno")
+ if acc:
+ hit["url"] = self.build_study_url(acc).get("url", "")
+ return hits
+
+ def _hit_metadata(self, hits: list) -> list:
+ for hit in hits:
+ acc = hit.get("accession") or hit.get("accno")
+ if acc:
+ hit["metadata"] = self.get_study_metadata(acc)
+ return hits
+
+ def _apply_filters(self, hits: list, filters: list[tuple]) -> list:
+ """
+ Filter hits based on metadata field values (case-insensitive AND logic)
+ """
+ if not filters:
+ return hits
+
+ filtered = []
+ for hit in hits:
+ metadata = hit.get("metadata", {})
+ if not metadata:
+ continue
+
+ matches_all = True
+ for field, value in filters:
+ field_value = str(metadata.get(field, "")).strip().lower()
+ filter_value = str(value).strip().lower()
+ if field_value != filter_value:
+ matches_all = False
+ break
+
+ if matches_all:
+ filtered.append(hit)
+
+ return filtered
+
+ def _backfill_filtered_results(
+ self,
+ initial_hits: list,
+ page: int,
+ page_size: int,
+ filters: list[tuple],
+        query: str | None = None,
+ ) -> tuple:
+ """
+        Backfill filtered results by fetching additional pages until
+        page_size is met or a 30-second time budget is exhausted
+ """
+ filtered = initial_hits[:]
+ current_page = page
+ start_time = time.time()
+ pages_fetched = 1
+
+ while len(filtered) < page_size:
+ if time.time() - start_time > 30:
+ break
+
+ current_page += 1
+
+ try:
+ params = {"page": current_page, "pageSize": page_size}
+ headers = {"Accept": "application/json", "User-Agent": "BioStudies-VHP4Safety-App/1.0"}
+
+ if query:
+ params["query"] = query
+
+ response = requests.get(self.search_url, headers=headers, params=params, timeout=30)
+ if response.status_code != 200:
+ break
+
+ data = response.json()
+ next_hits = data.get("hits", [])
+ if not next_hits:
+ break
+
+ next_hits = self._hit_metadata(next_hits)
+ next_filtered = self._apply_filters(next_hits, filters)
+ filtered.extend(next_filtered)
+ pages_fetched += 1
+
+ except Exception:
+ break
+
+ page_size_met = len(filtered) >= page_size
+ return filtered[:page_size], page_size_met, pages_fetched
+
+ # -----------------------------
+    # Metadata parsing
+ # -----------------------------
+ def parse_metadata(self, raw_data: dict, *, validate_files: bool = True, file_timeout=(3.05, 10)):
+ """
+        Parse and structure the metadata from a BioStudies API response.
+
+        Files are extracted only here (enriched and deduplicated), not in
+        _extract_comprehensive_metadata(), which prevents duplicates and
+        keeps the structure consistent.
+ """
+ try:
+ metadata = {
+ "accession": raw_data.get("accno", "N/A"),
+ "title": raw_data.get("title", "N/A"),
+ "description": raw_data.get("description", "N/A"),
+ "release_date": raw_data.get("rdate", raw_data.get("ReleaseDate", "N/A")),
+ "modification_date": raw_data.get("mdate", "N/A"),
+ "type": raw_data.get("type", "N/A"),
+
+ # VHP4Safety filterable fields
+ "case_study": "",
+ "regulatory_question": "",
+ "flow_step": "",
+ "collection": "",
+
+ "attributes": [],
+ "authors": [],
+ "files": [],
+ "links": [],
+ "protocols": [],
+ "publications": [],
+ "organizations": [],
+
+ "biological_context": {},
+ "technical_details": {},
+ "experimental_design": {},
+
+ "raw_data": raw_data,
+ }
+
+ # ---- helpers
+ def _norm_attr_name(attr: dict) -> str:
+ return (attr.get("name") or "").strip().lower()
+
+ def _attr_value(attr: dict) -> str:
+ v = attr.get("value", "")
+ return "" if v is None else str(v)
+
+ def _capture_vhp_fields(attr_name: str, attr_value: str):
+ if attr_name == "attachto":
+ metadata["collection"] = attr_value
+ elif attr_name == "case study":
+ metadata["case_study"] = attr_value
+ elif attr_name == "regulatory question":
+ metadata["regulatory_question"] = attr_value
+ elif attr_name == "process flow step":
+ metadata["flow_step"] = attr_value
+
+ BIO_KEYS = {
+ "organism", "species", "organism part", "organ", "cell type",
+ "tissue", "disease", "disease state", "sample type",
+ }
+ TECH_KEYS = {
+ "platform", "instrument", "assay", "assay type", "library strategy",
+ "library source", "data type", "sequencing mode", "sequencing date",
+ "index adapters", "pipeline",
+ }
+ AUTHOR_KEYS = {"author", "authors", "contact", "submitter"}
+
+ def _categorize(attr_name: str, attr_value: str):
+ if attr_name in BIO_KEYS:
+ metadata["biological_context"][attr_name] = attr_value
+ elif attr_name in TECH_KEYS:
+ metadata["technical_details"][attr_name] = attr_value
+ elif attr_name in AUTHOR_KEYS:
+ if attr_value and attr_value not in metadata["authors"]:
+ metadata["authors"].append(attr_value)
+
+ def _file_attrs_map(fobj: dict) -> dict:
+ out = {}
+ for a in (fobj or {}).get("attributes", []) or []:
+ n = (a.get("name") or "").strip()
+ if n:
+ out[n] = a.get("value")
+ return out
+
+ def _iter_section_files(sec: dict):
+ if not isinstance(sec, dict):
+ return
+ if isinstance(sec.get("files"), list):
+ for f in sec["files"]:
+ yield f
+ if isinstance(sec.get("subsections"), list):
+ for s in sec["subsections"]:
+ yield from _iter_section_files(s)
+
+ seen_files = set()
+
+ def _add_files(files_list):
+ if not isinstance(files_list, list):
+ return
+
+ accno = metadata.get("accession") or raw_data.get("accno") or "N/A"
+
+ for f in files_list:
+ if not isinstance(f, dict):
+ continue
+
+ file_path = (f.get("path") or f.get("name") or f.get("filename") or "").strip()
+ if not file_path:
+ continue
+
+ dedupe_key = f"{accno}::{file_path}"
+ if dedupe_key in seen_files:
+ continue
+ seen_files.add(dedupe_key)
+
+ fam = _file_attrs_map(f)
+ url = self.build_biostudies_https_file_url(accno, file_path)
+
+ entry = {
+ "name": file_path,
+ "path": file_path,
+ "size": f.get("size"),
+ "type": f.get("type"),
+ "description": fam.get("Description") or fam.get("description") or "",
+ "file_kind": fam.get("Type") or fam.get("type") or "",
+ "attributes": f.get("attributes", []),
+ "url": url,
+ "exists_check": None,
+ "raw": f,
+ }
+
+ if validate_files and url:
+ entry["exists_check"] = self.url_exists_no_download(url, timeout=file_timeout)
+
+ metadata["files"].append(entry)
+
+ # ---- top-level attributes
+ if isinstance(raw_data.get("attributes"), list):
+ for attr in raw_data["attributes"]:
+ if not isinstance(attr, dict):
+ continue
+ name_raw = attr.get("name", "")
+ attr_name = _norm_attr_name(attr)
+ value = _attr_value(attr)
+
+ metadata["attributes"].append({"name": name_raw, "value": value})
+ _capture_vhp_fields(attr_name, value)
+ _categorize(attr_name, value)
+
+ # ---- org lookup
+ organization_lookup = {}
+ if isinstance(raw_data.get("section"), dict):
+ self._build_organization_lookup(raw_data["section"], organization_lookup)
+
+ # ---- section attributes
+ section = raw_data.get("section") if isinstance(raw_data.get("section"), dict) else None
+ if section and isinstance(section.get("attributes"), list):
+ for attr in section["attributes"]:
+ if not isinstance(attr, dict):
+ continue
+ name_raw = attr.get("name", "")
+ attr_name = _norm_attr_name(attr)
+ value = _attr_value(attr)
+
+ if attr_name == "title" and (metadata["title"] == "N/A" or not metadata["title"]):
+ metadata["title"] = value
+ elif attr_name == "description" and (metadata["description"] == "N/A" or not metadata["description"]):
+ metadata["description"] = value
+
+ _capture_vhp_fields(attr_name, value)
+ _categorize(attr_name, value)
+ metadata["attributes"].append({"name": name_raw, "value": value})
+
+ # ---- comprehensive extraction (NO FILES inside this anymore!)
+ if section:
+ self._extract_comprehensive_metadata(section, metadata, organization_lookup)
+
+ # ---- files (enriched, deduped)
+ if section:
+ _add_files(list(_iter_section_files(section)))
+ if isinstance(raw_data.get("files"), list):
+ _add_files(raw_data["files"])
+
+ # ---- links + publications
+ def _add_links(links_list):
+ if not isinstance(links_list, list):
+ return
+ for link in links_list:
+ if not isinstance(link, dict):
+ continue
+ link_data = {
+ "url": link.get("url", ""),
+ "type": link.get("type", ""),
+ "description": link.get("description", ""),
+ "attributes": link.get("attributes", []),
+ }
+ metadata["links"].append(link_data)
+
+ link_type = (link.get("type", "") or "").lower()
+ if ("doi" in link_type) or ("pubmed" in link_type) or ("publication" in link_type):
+ metadata["publications"].append(link_data)
+
+ _add_links(raw_data.get("links"))
+ if section:
+ _add_links(section.get("links"))
+
+            # pick the RO-Crate file from the collected files (filename must contain "rocrate")
+            rocrate = self._pick_rocrate_file(metadata.get("files", []))
+            metadata["rocrate_file"] = rocrate  # full dict (name/path/url/size/exists_check...)
+            metadata["rocrate_url"] = rocrate.get("url") if isinstance(rocrate, dict) else None
+
+ return metadata
+
+ except Exception as e:
+ return {"error": f"Failed to parse metadata: {str(e)}", "raw_data": raw_data}
+
+ # -----------------------------
+ # Organisation lookup / deep extraction
+ # -----------------------------
+ def _build_organization_lookup(self, section, org_lookup):
+ """Build a lookup table for organization references"""
+ if isinstance(section, dict):
+ if section.get("type", "").lower() in ["organization", "organisation"]:
+ org_id = section.get("accno", "")
+ if org_id and "attributes" in section:
+ org_data = {}
+ for attr in section["attributes"]:
+ attr_name = (attr.get("name", "") or "").lower()
+ attr_value = attr.get("value", "")
+ if attr_name in ["name", "organization", "email", "address", "department", "affiliation"]:
+ org_data[attr_name] = attr_value
+ if org_data:
+ org_lookup[org_id] = org_data
+
+ if "subsections" in section:
+ for subsection in section["subsections"]:
+ self._build_organization_lookup(subsection, org_lookup)
+
+ elif isinstance(section, list):
+ for item in section:
+ self._build_organization_lookup(item, org_lookup)
+
+ def _extract_comprehensive_metadata(self, section, metadata, organization_lookup=None):
+ """
+ Comprehensively extract metadata from sections/subsections.
+
+        Note: files are intentionally not appended here (to avoid
+        duplicates); they are collected in parse_metadata().
+ """
+ if organization_lookup is None:
+ organization_lookup = {}
+
+ if isinstance(section, dict):
+ # ---- protocols
+ if section.get("type", "").lower() == "protocols" or "protocol" in section.get("type", "").lower():
+ if "subsections" in section:
+ for protocol in section["subsections"]:
+ protocol_data = {
+ "type": protocol.get("type", ""),
+ "description": protocol.get("description", ""),
+ "attributes": [],
+ }
+
+ if "attributes" in protocol:
+ for attr in protocol["attributes"]:
+ protocol_data["attributes"].append(
+ {"name": attr.get("name", ""), "value": attr.get("value", "")}
+ )
+
+ metadata["protocols"].append(protocol_data)
+
+ # ---- author and organization information
+ if section.get("type", "").lower() in ["author", "contact", "person"]:
+ if "attributes" in section:
+ author_info = {}
+ author_affiliation_ref = None
+
+ for attr in section["attributes"]:
+ attr_name = (attr.get("name", "") or "").lower()
+ attr_value = attr.get("value", "")
+
+ if attr_name in ["name", "first name", "last name", "email", "e-mail", "orcid"]:
+ author_info[attr_name] = attr_value
+ elif attr_name == "affiliation" and attr.get("reference"):
+ author_affiliation_ref = attr_value
+
+ if author_info:
+ author_name = author_info.get("name", "")
+ if not author_name:
+ first = author_info.get("first name", "")
+ last = author_info.get("last name", "")
+ author_name = f"{first} {last}".strip()
+
+ email = author_info.get("email") or author_info.get("e-mail", "")
+ orcid = author_info.get("orcid") or None
+
+ author_entry = {
+ "name": author_name,
+ "email": email,
+ "orcid": orcid,
+ "affiliation_ref": author_affiliation_ref,
+ "affiliation_name": "",
+ }
+
+ if author_affiliation_ref and author_affiliation_ref in organization_lookup:
+ resolved_org = organization_lookup[author_affiliation_ref]
+ author_entry["affiliation_name"] = resolved_org.get("name", "")
+
+ if author_name:
+ existing_author = next(
+ (a for a in metadata.get("author_details", []) if a.get("name") == author_name),
+ None,
+ )
+ if not existing_author:
+ metadata.setdefault("author_details", []).append(author_entry)
+
+ if author_name not in metadata["authors"]:
+ metadata["authors"].append(author_name)
+
+ # ---- experimental design info
+ if "attributes" in section:
+ for attr in section["attributes"]:
+ attr_name = (attr.get("name", "") or "").lower()
+ attr_value = attr.get("value", "")
+
+ if attr_name in ["experimental factor", "variable", "treatment", "condition", "time point"]:
+ metadata["experimental_design"].setdefault("factors", []).append(
+ {"name": attr_name, "value": attr_value}
+ )
+
+ # ---- recurse
+ if "subsections" in section:
+ for subsection in section["subsections"]:
+ self._extract_comprehensive_metadata(subsection, metadata, organization_lookup)
+
+ elif isinstance(section, list):
+ for item in section:
+ self._extract_comprehensive_metadata(item, metadata, organization_lookup)
\ No newline at end of file
diff --git a/src/models/data/mapping.py b/src/models/data/mapping.py
new file mode 100644
index 0000000..b65fc69
--- /dev/null
+++ b/src/models/data/mapping.py
@@ -0,0 +1,526 @@
+from typing import Any, Dict, List, Optional, Tuple
+import re
+
+# ---------- small helpers ----------
+
+# Matches a DOI anywhere in text; the character class excludes whitespace,
+# quotes, and angle brackets so trailing markup is not captured.
+DOI_RE = re.compile(r'\b10\.\d{4,9}/[^\s"<>]+', re.IGNORECASE)
+
+def is_valid_doi(doi: Optional[str]) -> bool:
+ """Basic DOI sanity check. Rejects obvious redactions like '***'."""
+ if not doi or not isinstance(doi, str):
+ return False
+ d = doi.strip()
+ if "*" in d: # handles 10.5281/zenodo.*** etc.
+ return False
+ if not d.lower().startswith("10."):
+ return False
+ if "/" not in d:
+ return False
+ return True
+
+def g(d: Dict[str, Any], *path: str, default=None):
+ """Safe nested-get. Never raises KeyError."""
+ cur: Any = d
+ for key in path:
+ if isinstance(cur, dict) and key in cur:
+ cur = cur[key]
+ else:
+ return default
+ return cur
+
+def first(*vals, default=None):
+ """Return first non-empty (not None, not '' , not []) value."""
+ for v in vals:
+ if v is None:
+ continue
+ if v == "":
+ continue
+ if isinstance(v, (list, dict)) and len(v) == 0:
+ continue
+ return v
+ return default
+
+def find_attr(attrs: Any, name: str) -> Optional[str]:
+ """Find BioStudies attribute list entry with given name."""
+ if not isinstance(attrs, list):
+ return None
+ for a in attrs:
+ if isinstance(a, dict) and a.get("name") == name:
+ return a.get("value")
+ return None
+
+def extract_doi_from_text(text: Any) -> Optional[str]:
+ """Extract a DOI from a string (or return None)."""
+ if not isinstance(text, str) or not text:
+ return None
+ m = DOI_RE.search(text)
+ doi = m.group(0) if m else None
+ return doi if is_valid_doi(doi) else None
+
+def extract_all_dois(text: Any) -> List[str]:
+ """Extract all valid DOIs from a string."""
+ if not isinstance(text, str) or not text:
+ return []
+ dois = []
+ for m in DOI_RE.finditer(text):
+ d = m.group(0)
+ if is_valid_doi(d):
+ dois.append(d)
+ return dois
+
+def doi_url(doi: Optional[str]) -> Optional[str]:
+ """Convert DOI to https://doi.org/..."""
+ if not doi:
+ return None
+ d = doi.strip()
+ if d.lower().startswith("http"):
+ return d
+ return f"https://doi.org/{d}"
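+
+# Quick sanity checks for the helpers above (derived from their logic):
+#     is_valid_doi("10.5281/zenodo.1234567")  -> True
+#     is_valid_doi("10.5281/zenodo.***")      -> False (redacted placeholder)
+#     doi_url("10.1234/abcd")                 -> "https://doi.org/10.1234/abcd"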
+
+# ---------- DOI + publications extraction ----------
+
+def find_doi_anywhere(item: Dict[str, Any]) -> Optional[str]:
+ """
+ Best-effort *dataset DOI* extractor.
+    NOTE: intentionally does NOT search BioStudies raw_data publication
+    subsections, because those are *linked publications*, not the dataset DOI.
+ """
+ # direct keys first (dataset DOI)
+ doi = first(item.get("doi"), g(item, "metadata", "doi"))
+ doi = extract_doi_from_text(doi) or doi
+ if is_valid_doi(doi):
+ return doi
+
+ # Zenodo: related identifiers (sometimes contains dataset DOI, but usually pubs)
+ rel = g(item, "metadata", "related_identifiers", default=[]) or []
+ if isinstance(rel, list):
+ for r in rel:
+ if not isinstance(r, dict):
+ continue
+ ident = r.get("identifier")
+ scheme = (r.get("scheme") or "").lower()
+ if scheme == "doi":
+ found = extract_doi_from_text(ident) or ident
+ if is_valid_doi(found):
+ return found
+ found = extract_doi_from_text(ident)
+ if found:
+ return found
+
+ # BioStudies: attributes (dataset DOI if present)
+ attrs = g(item, "metadata", "attributes", default=[]) or []
+ for key in ("DOI", "doi", "Dataset DOI"):
+ v = find_attr(attrs, key)
+ found = extract_doi_from_text(v)
+ if found:
+ return found
+
+ # BioStudies: publications list (if present) - ambiguous; keep as last resort
+ pubs = g(item, "metadata", "publications", default=[]) or []
+ if isinstance(pubs, list):
+ for p in pubs:
+ if not isinstance(p, dict):
+ continue
+ for cand in (p.get("doi"), p.get("identifier"), p.get("url")):
+ found = extract_doi_from_text(cand)
+ if found:
+ return found
+
+ # last resort: description text
+ desc = first(g(item, "metadata", "description"), item.get("description"))
+ found = extract_doi_from_text(desc)
+ if found:
+ return found
+
+ return None
+
+def _dedup_publications(pubs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+ """Deduplicate publications by DOI (preferred) or URL."""
+ seen = set()
+ out = []
+ for p in pubs:
+ doi = (p.get("doi") or "").lower().strip()
+ url = (p.get("url") or "").lower().strip()
+ key = doi or url
+ if not key:
+ continue
+ if key in seen:
+ continue
+ seen.add(key)
+ out.append(p)
+ return out
+
+def extract_publications_zenodo(z: Dict[str, Any]) -> List[Dict[str, Any]]:
+ """
+ Extract linked publications from Zenodo record.
+ Sources:
+ - metadata.related_identifiers
+ - metadata.references (list of strings)
+ - DOIs embedded in metadata.description (optional, but useful)
+ """
+ pubs: List[Dict[str, Any]] = []
+
+ dataset_doi = find_doi_anywhere(z)
+ concept_doi = first(z.get("conceptdoi"), g(z, "metadata", "conceptdoi"))
+ concept_doi = extract_doi_from_text(concept_doi) or concept_doi
+ if not is_valid_doi(concept_doi):
+ concept_doi = None
+
+ rel = g(z, "metadata", "related_identifiers", default=[]) or []
+ if isinstance(rel, list):
+ for r in rel:
+ if not isinstance(r, dict):
+ continue
+ ident = r.get("identifier")
+ scheme = (r.get("scheme") or "").lower()
+ relation = (r.get("relation") or "").lower()
+
+            # NOTE: every DOI- or URL-like related identifier is accepted here,
+            # regardless of whether resource_type says "publication" or the
+            # relation is citation-like; the dataset DOI and concept DOI are
+            # excluded below, which filters out self-references.
+
+ doi = None
+ url = None
+
+ if scheme == "doi":
+ doi = extract_doi_from_text(ident) or (ident.strip() if isinstance(ident, str) else None)
+ if not is_valid_doi(doi):
+ doi = None
+ url = doi_url(doi) if doi else None
+ elif scheme == "url":
+ url = ident.strip() if isinstance(ident, str) else None
+ doi = extract_doi_from_text(url)
+ else:
+ # Unknown scheme: try DOI extraction
+ doi = extract_doi_from_text(ident)
+ url = doi_url(doi) if doi else (ident.strip() if isinstance(ident, str) else None)
+
+ # Exclude dataset DOI / concept DOI if they appear
+ if doi and (doi == dataset_doi or doi == concept_doi):
+ continue
+
+ if doi or url:
+ pubs.append({
+ "doi": doi,
+ "doi_url": doi_url(doi) if doi else None,
+ "url": url,
+ "relation": relation or None,
+ "resource_type": r.get("resource_type"),
+ "source": "zenodo.related_identifiers",
+ })
+
+ refs = g(z, "metadata", "references", default=[]) or []
+ if isinstance(refs, list):
+ for ref in refs:
+ doi = extract_doi_from_text(ref)
+ if doi and doi not in {dataset_doi, concept_doi}:
+ pubs.append({
+ "doi": doi,
+ "doi_url": doi_url(doi),
+ "url": doi_url(doi),
+ "relation": "references",
+ "resource_type": "publication",
+ "source": "zenodo.references",
+ })
+
+ # Optional: mine description for DOI links (often present as doi.org/10.xxxx/...)
+ desc = g(z, "metadata", "description")
+ for doi in extract_all_dois(desc):
+ if doi not in {dataset_doi, concept_doi}:
+ pubs.append({
+ "doi": doi,
+ "doi_url": doi_url(doi),
+ "url": doi_url(doi),
+ "relation": "mentions",
+ "resource_type": "publication",
+ "source": "zenodo.description",
+ })
+
+ return _dedup_publications(pubs)
+
+def extract_publications_biostudies(b: Dict[str, Any]) -> List[Dict[str, Any]]:
+ """
+ Extract linked publications from BioStudies record.
+ Sources:
+ - metadata.publications (if present)
+ - metadata.raw_data.section.subsections entries of type 'Publication'
+ """
+ pubs: List[Dict[str, Any]] = []
+ meta = b.get("metadata", {}) or {}
+
+ # 1) metadata.publications (sometimes already structured)
+ meta_pubs = meta.get("publications", []) or []
+ if isinstance(meta_pubs, list):
+ for p in meta_pubs:
+ if isinstance(p, dict):
+ doi = extract_doi_from_text(first(p.get("doi"), p.get("identifier"), p.get("url")))
+ url = first(p.get("url"), doi_url(doi))
+ if doi or url:
+ pubs.append({
+ "title": p.get("title"),
+ "doi": doi,
+ "doi_url": doi_url(doi) if doi else None,
+ "url": url,
+ "pmid": p.get("pmid") or p.get("PMID"),
+ "year": p.get("year") or p.get("Year"),
+ "authors": p.get("authors") or p.get("Authors"),
+ "source": "biostudies.metadata.publications",
+ })
+ elif isinstance(p, str):
+ doi = extract_doi_from_text(p)
+ if doi:
+ pubs.append({
+ "doi": doi,
+ "doi_url": doi_url(doi),
+ "url": doi_url(doi),
+ "source": "biostudies.metadata.publications",
+ })
+
+ # 2) raw_data.section.subsections: type == Publication
+ subs = g(b, "metadata", "raw_data", "section", "subsections", default=[]) or []
+ if isinstance(subs, list):
+ for s in subs:
+ if not isinstance(s, dict):
+ continue
+ stype = str(s.get("type", "")).strip().lower()
+ if stype != "publication":
+ continue
+
+ # flatten attributes into dict
+ attrs = s.get("attributes") or []
+ flat: Dict[str, Any] = {}
+ if isinstance(attrs, list):
+ for a in attrs:
+ if isinstance(a, dict) and a.get("name"):
+ flat[a["name"]] = a.get("value")
+
+ doi = extract_doi_from_text(flat.get("DOI") or flat.get("doi"))
+ pmid = flat.get("PMID") or flat.get("pmid")
+ title = flat.get("Title") or flat.get("title")
+ year = flat.get("Year") or flat.get("year")
+ authors = flat.get("Authors") or flat.get("Author") or flat.get("authors")
+
+ url = doi_url(doi) if doi else None
+
+ if doi or pmid or title:
+ pubs.append({
+ "title": title,
+ "doi": doi,
+ "doi_url": doi_url(doi) if doi else None,
+ "url": url,
+ "pmid": pmid,
+ "year": year,
+ "authors": authors,
+ "journal": flat.get("Journal") or flat.get("journal"),
+ "volume": flat.get("Volume") or flat.get("volume"),
+ "issue": flat.get("Issue") or flat.get("issue"),
+ "type": flat.get("Type") or flat.get("type"),
+ "issn": flat.get("Issn") or flat.get("ISSN"),
+ "source": "biostudies.raw_data.section.subsections",
+ })
+
+ return _dedup_publications(pubs)
+
+# ---------- Zenodo normalizer ----------
+
+def normalize_zenodo(z: Dict[str, Any]) -> Dict[str, Any]:
+ creators = g(z, "metadata", "creators", default=[]) or []
+ grants = g(z, "metadata", "grants", default=[]) or []
+ files = z.get("files", []) or []
+
+ doi = find_doi_anywhere(z)
+ if not is_valid_doi(doi):
+ doi = None
+
+ publications = extract_publications_zenodo(z)
+
+ return {
+ "title": first(g(z, "metadata", "title"), z.get("title")),
+ "description": first(g(z, "metadata", "description")),
+ "license": first(g(z, "metadata", "license", "id")),
+ "authors": [
+ {
+ "name": c.get("name"),
+ "orcid": c.get("orcid"),
+ "affiliation": c.get("affiliation"),
+ }
+ for c in creators
+ if isinstance(c, dict)
+ ],
+ "funding": [
+ {
+ "funder": g(gr, "funder", "name"),
+ "funder_doi": g(gr, "funder", "doi"),
+ "acronym": gr.get("acronym"),
+ "title": gr.get("title"),
+ "code": gr.get("code"),
+ "url": gr.get("url"),
+ }
+ for gr in grants
+ if isinstance(gr, dict)
+ ],
+ "ReleaseDate": first(g(z, "metadata", "publication_date"), z.get("created")),
+ "id": first(z.get("id"), z.get("recid")),
+ "type": first(g(z, "metadata", "resource_type", "type"), "dataset"),
+ "version": first(g(z, "metadata", "version")),
+ "files": [
+ {
+ "name": f.get("key"),
+ "size": f.get("size"),
+ "checksum": f.get("checksum"),
+ "url": g(f, "links", "self"),
+ }
+ for f in files
+ if isinstance(f, dict)
+ ],
+ "url": first(z.get("url"), g(z, "links", "self_html"), g(z, "links", "self")),
+
+ # dataset DOI
+ "doi": doi,
+ "doi_url": doi_url(doi),
+
+ "conceptdoi": first(z.get("conceptdoi"), g(z, "metadata", "conceptdoi")),
+ "conceptdoi_url": doi_url(first(z.get("conceptdoi"), g(z, "metadata", "conceptdoi"))),
+
+ # NEW: linked publications
+ "publications": publications,
+ }
+
+# ---------- BioStudies normalizer ----------
+
+def normalize_biostudies(b: Dict[str, Any]) -> Dict[str, Any]:
+ meta = b.get("metadata", {}) or {}
+ attrs = meta.get("attributes", []) or []
+ files = meta.get("files", []) or []
+
+ author_details = meta.get("author_details", []) or []
+ authors = meta.get("authors", []) or []
+
+ if isinstance(author_details, list) and len(author_details) > 0:
+ authors_norm = [
+ {
+ "name": a.get("name"),
+ "orcid": a.get("orcid"),
+ "affiliation": a.get("affiliation_name") or a.get("affiliation_ref"),
+ "email": a.get("email"),
+ }
+ for a in author_details
+ if isinstance(a, dict)
+ ]
+ else:
+ authors_norm = [
+ {"name": name, "orcid": None, "affiliation": None}
+ for name in authors
+ if isinstance(name, str)
+ ]
+
+ # funding best-effort (normalized)
+ funding: List[Dict[str, Any]] = []
+ subsections = g(b, "metadata", "raw_data", "section", "subsections", default=[]) or []
+ if isinstance(subsections, list):
+ for s in subsections:
+ if not isinstance(s, dict):
+ continue
+ if str(s.get("type", "")).strip().lower() != "funding":
+ continue
+
+ flat = {}
+ for a in s.get("attributes") or []:
+ if isinstance(a, dict) and a.get("name"):
+ flat[a["name"]] = a.get("value")
+
+ if not flat:
+ continue
+
+ funder = first(flat.get("Funder"), flat.get("Agency"), flat.get("Funding agency"), flat.get("Agency name"))
+ code = first(flat.get("Grant_id"), flat.get("Grant ID"), flat.get("Grant"), flat.get("Grant number"))
+ url = first(flat.get("URL"), flat.get("Url"), flat.get("Project URL"))
+
+ funding.append({
+ "funder": funder,
+ "code": code,
+ "url": url,
+ "acronym": flat.get("Acronym") or flat.get("Programme") or flat.get("Program"),
+ "raw": flat,
+ "source": "biostudies.raw_data.section.subsections",
+ })
+
+ doi = find_doi_anywhere(b)
+ if not is_valid_doi(doi):
+ doi = None
+
+ publications = extract_publications_biostudies(b)
+
+    # files: pass through the URL already built in metadata (do not rebuild it here)
+ files_norm: List[Dict[str, Any]] = []
+ for f in files:
+ if not isinstance(f, dict):
+ continue
+ files_norm.append({
+ "name": first(f.get("name"), f.get("path")),
+ "size": f.get("size"),
+ "path": f.get("path"),
+ "url": f.get("url"), # <-- do not rebuild
+ # optional (keep if useful)
+ "exists": g(f, "exists_check", "exists"),
+ "content_length": g(f, "exists_check", "content_length"),
+ })
+
+    # Optional strictness: attach a warning if any file is missing a URL
+ missing = [x.get("path") for x in files_norm if x.get("path") and not x.get("url")]
+ if missing:
+ meta.setdefault("warnings", []).append(
+ f"{len(missing)} BioStudies file(s) missing url in metadata.files (pass-through mode)."
+ )
+
+ return {
+ "title": first(meta.get("title"), b.get("title")),
+ "description": first(meta.get("description")),
+ "license": first(find_attr(attrs, "License")),
+ "authors": authors_norm,
+ "funding": funding,
+ "ReleaseDate": first(
+ b.get("release_date"),
+ find_attr(attrs, "ReleaseDate"),
+ find_attr(attrs, "Release Date"),
+ ),
+ "id": first(meta.get("accession"), b.get("accession"), b.get("id")),
+ "type": first(b.get("type"), meta.get("type"), "study"),
+ "version": first(meta.get("version")),
+ "files": files_norm,
+ "url": first(b.get("url")),
+ "doi": doi,
+ "doi_url": doi_url(doi),
+ "publications": publications,
+ }
+
+# ---------- combine ----------
+
+def normalize_all(
+ bs_entries: List[Dict[str, Any]],
+ zenodo_entries: List[Dict[str, Any]],
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ """
+ Adds 'norm_metadata' to each dict in both lists and returns a 2-tuple
+ (bs_entries, zenodo_entries) with 'norm_metadata' populated.
+ Robust: ignores non-dicts and missing lists.
+ """
+
+ for z in zenodo_entries or []:
+ if isinstance(z, dict):
+ z["norm_metadata"] = normalize_zenodo(z)
+
+ for b in bs_entries or []:
+ if isinstance(b, dict):
+ b["norm_metadata"] = normalize_biostudies(b)
+
+ return bs_entries, zenodo_entries
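+
+
+# Usage sketch (illustrative; real records come from the extractors):
+#
+#     bs_records = [{"accession": "S-VHPS21", "metadata": {"title": "Example"}}]
+#     bs, zen = normalize_all(bs_entries=bs_records, zenodo_entries=[])
+#     assert bs[0]["norm_metadata"]["title"] == "Example"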
\ No newline at end of file
diff --git a/src/models/data/schemas.py b/src/models/data/schemas.py
new file mode 100644
index 0000000..23dd05a
--- /dev/null
+++ b/src/models/data/schemas.py
@@ -0,0 +1,245 @@
+"""Pydantic models for normalized dataset metadata (BioStudies & Zenodo)."""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+
+# ── Shared / reusable sub-models ──────────────────────────────────────────
+
+
+class Author(BaseModel):
+ """Normalised author/creator."""
+
+ name: Optional[str] = None
+ orcid: Optional[str] = None
+ affiliation: Optional[str] = None
+ email: Optional[str] = None
+
+
+class Funding(BaseModel):
+ """Normalised funding entry."""
+
+ funder: Optional[str] = None
+ funder_doi: Optional[str] = None
+ acronym: Optional[str] = None
+ title: Optional[str] = None
+ code: Optional[str] = None
+ url: Optional[str] = None
+ raw: Optional[dict[str, Any]] = None
+ source: Optional[str] = None
+
+
+class DataFile(BaseModel):
+ """Normalised file entry (common to both sources)."""
+
+ name: Optional[str] = None
+ path: Optional[str] = None
+ size: Optional[int] = None
+ checksum: Optional[str] = None
+ url: Optional[str] = None
+ exists: Optional[bool] = None
+ content_length: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+class Publication(BaseModel):
+ """Linked publication extracted from a dataset record."""
+
+ title: Optional[str] = None
+ doi: Optional[str] = None
+ doi_url: Optional[str] = None
+ url: Optional[str] = None
+ pmid: Optional[str] = None
+ year: Optional[str] = None
+ authors: Optional[str] = None
+ journal: Optional[str] = None
+ volume: Optional[str] = None
+ issue: Optional[str] = None
+ type: Optional[str] = None
+ issn: Optional[str] = None
+ relation: Optional[str] = None
+ resource_type: Optional[str] = None
+ source: Optional[str] = None
+
+
+# ── Top-level normalised metadata ─────────────────────────────────────────
+
+
+class NormalizedMetadata(BaseModel):
+ """Unified normalised metadata for any dataset (Zenodo or BioStudies)."""
+
+ title: Optional[str] = None
+ description: Optional[str] = None
+ license: Optional[str] = None
+ authors: list[Author] = Field(default_factory=list)
+ funding: list[Funding] = Field(default_factory=list)
+ ReleaseDate: Optional[str] = Field(None, alias="ReleaseDate")
+ id: Optional[str | int] = None
+ type: Optional[str] = None
+ version: Optional[str] = None
+ files: list[DataFile] = Field(default_factory=list)
+ url: Optional[str] = None
+ doi: Optional[str] = None
+ doi_url: Optional[str] = None
+ publications: list[Publication] = Field(default_factory=list)
+
+ # Zenodo-specific
+ conceptdoi: Optional[str] = None
+ conceptdoi_url: Optional[str] = None
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+# ── BioStudies raw-metadata models ────────────────────────────────────────
+
+
+class Attribute(BaseModel):
+ name: str = ""
+ value: str = ""
+
+
+class BiologicalContext(BaseModel):
+ model_config = {"extra": "allow"}
+
+
+class TechnicalDetails(BaseModel):
+ model_config = {"extra": "allow"}
+
+
+class ExperimentalDesign(BaseModel):
+ factors: list[dict[str, Any]] = Field(default_factory=list)
+
+ model_config = {"extra": "allow"}
+
+
+class ProtocolEntry(BaseModel):
+ type: str = ""
+ description: str = ""
+ attributes: list[Attribute] = Field(default_factory=list)
+
+
+class LinkEntry(BaseModel):
+ url: str = ""
+ type: str = ""
+ description: str = ""
+ attributes: list[dict[str, Any]] = Field(default_factory=list)
+
+
+class FileEntry(BaseModel):
+ """Rich file entry from BioStudies parse_metadata."""
+
+ name: str = ""
+ path: str = ""
+ size: Optional[int] = None
+ type: Optional[str] = None
+ description: str = ""
+ file_kind: str = ""
+ attributes: list[dict[str, Any]] = Field(default_factory=list)
+ url: Optional[str] = None
+ exists_check: Optional[dict[str, Any]] = None
+ raw: Optional[dict[str, Any]] = None
+
+
+class AuthorDetail(BaseModel):
+ name: str = ""
+ email: str = ""
+ orcid: Optional[str] = None
+ affiliation_ref: Optional[str] = None
+ affiliation_name: str = ""
+
+
+class BioStudiesParsedMetadata(BaseModel):
+ """Full structured metadata returned by BioStudiesExtractor.parse_metadata."""
+
+ accession: str = "N/A"
+ title: str = "N/A"
+ description: str = "N/A"
+ release_date: str = "N/A"
+ modification_date: str = "N/A"
+ type: str = "N/A"
+
+ # VHP4Safety filterable fields
+ case_study: str = ""
+ regulatory_question: str = ""
+ flow_step: str = ""
+ collection: str = ""
+
+ attributes: list[Attribute] = Field(default_factory=list)
+ authors: list[str] = Field(default_factory=list)
+ author_details: list[AuthorDetail] = Field(default_factory=list)
+ files: list[FileEntry] = Field(default_factory=list)
+ links: list[LinkEntry] = Field(default_factory=list)
+ protocols: list[ProtocolEntry] = Field(default_factory=list)
+ publications: list[LinkEntry] = Field(default_factory=list)
+ organizations: list[dict[str, Any]] = Field(default_factory=list)
+
+ biological_context: BiologicalContext = Field(default_factory=BiologicalContext)
+ technical_details: TechnicalDetails = Field(default_factory=TechnicalDetails)
+ experimental_design: ExperimentalDesign = Field(default_factory=ExperimentalDesign)
+
+ rocrate_file: Optional[dict[str, Any]] = None
+ rocrate_url: Optional[str] = None
+
+ url: str = ""
+ raw_data: Optional[dict[str, Any]] = None
+
+ model_config = {"extra": "allow"}
+
+
+# ── Zenodo parsed-metadata model ──────────────────────────────────────────
+
+
+class ZenodoFileEntry(BaseModel):
+ id: Optional[str] = None
+ key: Optional[str] = None
+ size: Optional[int] = None
+ checksum: Optional[str] = None
+ links: dict[str, Any] = Field(default_factory=dict)
+
+
+class ZenodoParsedMetadata(BaseModel):
+ """Full structured metadata returned by ZenodoExtractor.parse_metadata."""
+
+ id: Optional[int | str] = None
+ recid: Optional[int | str] = None
+ doi: Optional[str] = None
+ doi_url: Optional[str] = None
+ title: str = "N/A"
+ description: str = "N/A"
+ publication_date: str = "N/A"
+ access_right: Optional[str] = None
+ creators: list[dict[str, Any]] = Field(default_factory=list)
+ keywords: list[str] = Field(default_factory=list)
+ resource_type: dict[str, Any] = Field(default_factory=dict)
+ license: dict[str, Any] = Field(default_factory=dict)
+ grants: list[dict[str, Any]] = Field(default_factory=list)
+ communities: list[dict[str, Any]] = Field(default_factory=list)
+ related_identifiers: list[dict[str, Any]] = Field(default_factory=list)
+ files: list[ZenodoFileEntry] = Field(default_factory=list)
+ links: dict[str, Any] = Field(default_factory=dict)
+ stats: dict[str, Any] = Field(default_factory=dict)
+ is_rocrate: bool = False
+
+ url: str = ""
+ raw: Optional[dict[str, Any]] = None
+
+ model_config = {"extra": "allow"}
+
+
+# ── URL-existence check result ────────────────────────────────────────────
+
+
+class UrlExistsResult(BaseModel):
+ """Result of a HEAD / Range probe to check file existence."""
+
+ url: Optional[str] = None
+ exists: bool = False
+ status_code: Optional[int] = None
+ content_length: Optional[str] = None
+ final_url: Optional[str] = None
+ error: Optional[str] = None
+ method: Optional[str] = None
diff --git a/src/models/data/zenodo.py b/src/models/data/zenodo.py
new file mode 100644
index 0000000..17f0820
--- /dev/null
+++ b/src/models/data/zenodo.py
@@ -0,0 +1,484 @@
+from __future__ import annotations
+
+import json
+import re
+import time
+from typing import Any
+
+import requests
+
+
+class ZenodoExtractor:
+ """Extractor for interacting with the Zenodo Records API.
+
+ Defaults to community 'vhp4safety' and record type 'dataset' to match
+ the user's request. Optional access_token may be provided for higher
+ rate limits or accessing private records.
+ """
+
+ def __init__(
+ self,
+ access_token: str | None = None,
+ community: str = "vhp4safety",
+ record_type: str = "dataset",
+ base_url: str = "https://zenodo.org/api/records",
+ ) -> None:
+ self.base_url = base_url
+ self.community = community
+ self.record_type = record_type
+ self.session = requests.Session()
+ self.headers = {
+ "Accept": "application/json",
+ "User-Agent": "Zenodo-VHP4Safety-App/1.0",
+ }
+ if access_token:
+ # Use Authorization header when token is provided
+ self.headers["Authorization"] = f"Bearer {access_token}"
+
+ def validate_record_id(self, record_id: Any) -> tuple[bool, Any, str | None]:
+ """Validate a Zenodo record identifier.
+
+ Accepts numeric recid (int or numeric string) or DOI (10.xxxx/...).
+
+ Returns:
+ (is_valid, normalized_id, error_message)
+ """
+ if record_id is None:
+ return False, None, "Record ID is required"
+
+ # numeric recid
+ try:
+ if isinstance(record_id, int):
+ return True, record_id, None
+ if isinstance(record_id, str) and record_id.isdigit():
+ return True, int(record_id), None
+ except Exception:
+ pass
+
+ # DOI pattern
+ if isinstance(record_id, str):
+ # strip DOI url wrapper
+ candidate = record_id.strip()
+ # DOI url like https://doi.org/10.5281/zenodo.1234
+ if candidate.startswith("http") and "doi.org" in candidate:
+ candidate = candidate.split("doi.org/", 1)[-1]
+
+ doi_regex = r"^10\.\d{4,9}/[-._;()/:A-Z0-9]+$"
+ if re.match(doi_regex, candidate, flags=re.IGNORECASE):
+ return True, candidate, None
+
+ return (
+ False,
+ record_id,
+ "Invalid Zenodo record identifier (expect recid or DOI)",
+ )
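+
+    # Accepted inputs, sketched (doctest-style, not executed):
+    #   validate_record_id(1234)    -> (True, 1234, None)
+    #   validate_record_id("1234")  -> (True, 1234, None)
+    #   validate_record_id("https://doi.org/10.5281/zenodo.1234")
+    #                               -> (True, "10.5281/zenodo.1234", None)
+    #   validate_record_id("nope")  -> (False, "nope", <error message>)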
+
+ def build_record_url(self, record_id: Any) -> dict[str, Any]:
+ """Build a public URL for a record identifier (recid or DOI)."""
+ is_valid, normalized, error = self.validate_record_id(record_id)
+ if not is_valid:
+ return {"error": error}
+
+ if isinstance(normalized, int):
+ url = f"https://zenodo.org/records/{normalized}"
+ else:
+ # DOI string
+ url = f"https://doi.org/{normalized}"
+
+ return {"id": normalized, "url": url}
+
+ def get_record_metadata(self, record_id: Any) -> dict[str, Any]:
+ """Retrieve and normalize metadata for a single record.
+
+ If record_id is a DOI string, perform a search for that DOI and
+ return the first match's parsed metadata.
+ """
+ try:
+ is_valid, normalized, validation_error = self.validate_record_id(record_id)
+ if not is_valid:
+ return {"error": validation_error}
+
+ # If numeric recid, retrieve directly
+ if isinstance(normalized, int):
+ url = f"{self.base_url}/{normalized}"
+ resp = self.session.get(url, headers=self.headers, timeout=30)
+ if resp.status_code == 200:
+ try:
+ data = resp.json()
+ parsed = self.parse_metadata(data)
+ parsed_url = self.build_record_url(normalized).get("url", "")
+ return parsed | {"url": parsed_url}
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from Zenodo API: {e}"}
+ elif resp.status_code == 404:
+ return {"error": f"Record '{normalized}' not found."}
+ else:
+ return {"error": f"Zenodo API returned status {resp.status_code}."}
+
+ # DOI case: search for DOI
+ doi = normalized
+ query = f'doi:"{doi}"'
+ search = self.search_records(
+ query=query, page=1, size=1, load_metadata=True
+ )
+ if "error" in search:
+ return search
+ hits = search.get("hits", [])
+ if not hits:
+ return {"error": f"Record with DOI '{doi}' not found."}
+ # return parsed metadata from first hit
+ first = hits[0]
+ # parsed metadata may be under 'parsed_metadata' or 'metadata'
+ parsed = first.get("parsed_metadata") or first.get("metadata")
+ parsed_url = self.build_record_url(
+ first.get("recid") or first.get("id") or doi
+ ).get(
+ "url",
+ "",
+ )
+ return parsed | {"url": parsed_url}
+
+ except requests.exceptions.Timeout:
+ return {"error": "Request timed out. Zenodo server may be slow."}
+ except requests.exceptions.ConnectionError:
+ return {
+ "error": "Cannot connect to Zenodo server. Check your internet connection."
+ }
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error: {e}"}
+ except Exception as e:
+ return {"error": f"Unexpected error: {e}"}
+
+ def search_records(
+ self,
+ query: str = "",
+ page: int = 1,
+ size: int = 25,
+ load_metadata: bool = True,
+        filters: tuple[tuple[str, str], ...] | None = None,
+ ) -> dict[str, Any]:
+ """Search Zenodo records.
+
+ Defaults to the configured community and record_type.
+ """
+ try:
+ if not isinstance(query, str):
+ return {"error": "Query must be a string."}
+
+ # If filters are provided, ensure metadata is loaded
+ filters_applied = bool(filters)
+ if filters_applied:
+ load_metadata = True
+
+ params = {
+ "q": query,
+ "page": page,
+ "size": size,
+ "communities": self.community,
+ "type": self.record_type,
+ }
+
+ resp = self.session.get(
+ self.base_url, headers=self.headers, params=params, timeout=30
+ )
+ if resp.status_code == 200:
+ try:
+ data = resp.json()
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from Zenodo API: {e}"}
+
+ hits = (
+ data.get("hits", {}).get("hits", [])
+ if isinstance(data.get("hits"), dict)
+ else data.get("hits", [])
+ )
+ total = (
+ data.get("hits", {}).get("total")
+ if isinstance(data.get("hits"), dict)
+ else data.get("total", 0)
+ )
+
+ if not data or (isinstance(total, int) and total == 0):
+ return {"error": "No results found.", "hits": []}
+
+ if load_metadata:
+ hits = self._hit_metadata(hits)
+
+ hits = self._hit_url(hits)
+
+ if filters_applied:
+ hits = self._apply_filters(hits, filters)
+
+ page_size_met = len(hits) >= size
+ pages_fetched = 1
+ if not page_size_met:
+ hits, page_size_met, pages_fetched = (
+ self._backfill_filtered_results(
+ hits, page, size, filters, query
+ )
+ )
+
+ return {
+ "totalHits": total,
+ "hits": hits,
+ "hits_returned": len(hits),
+ "page": page,
+ "pageSize": size,
+ "pages_fetched": pages_fetched,
+ "filters_applied": True,
+ "page_size_met": page_size_met,
+ }
+
+ return {"total": total, "hits": hits}
+
+ elif resp.status_code == 400:
+ return {"error": "Bad request. Check your search parameters."}
+ elif resp.status_code == 403:
+ return {
+ "error": "Access forbidden. Community or collection may be restricted."
+ }
+ elif resp.status_code in (500, 503):
+ return {"error": "Zenodo server error. Please try again later."}
+ else:
+ return {"error": f"Zenodo API returned status {resp.status_code}."}
+
+ except requests.exceptions.Timeout:
+ return {"error": "Request timed out. Zenodo server may be slow."}
+ except requests.exceptions.ConnectionError:
+ return {
+ "error": "Cannot connect to Zenodo server. Check your internet connection."
+ }
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error: {e}"}
+ except Exception as e:
+ return {"error": f"Unexpected error: {e}"}
+
+ def list_records(
+ self,
+ page: int = 1,
+ size: int = 25,
+ include_urls: bool = False,
+ load_metadata: bool = False,
+        filters: tuple[tuple[str, str], ...] | None = None,
+ ) -> dict[str, Any]:
+ """list records for the configured community/type (wrapper for search_records)."""
+ # If filters provided, require metadata and URLs
+ if filters:
+ load_metadata = True
+ include_urls = True
+
+ result = self.search_records(
+ query="", page=page, size=size, load_metadata=load_metadata, filters=filters
+ )
+
+ if include_urls and "hits" in result:
+ result["hits"] = self._hit_url(result["hits"])
+
+ return result
+
+ def _hit_url(self, hits: list[dict[str, Any]]) -> list[dict[str, Any]]:
+ for hit in hits:
+ # try recid present in different keys
+ recid = (
+ hit.get("recid")
+ or hit.get("id")
+ or (hit.get("metadata", {}).get("doi") if hit.get("metadata") else None)
+ )
+ if recid:
+ try:
+ recid_int = int(recid)
+ hit["url"] = self.build_record_url(recid_int).get("url", "")
+ except Exception:
+ # fallback to DOI url
+ doi = (
+ hit.get("metadata", {}).get("doi")
+ if hit.get("metadata")
+ else None
+ )
+ if doi:
+ hit["url"] = self.build_record_url(doi).get("url", "")
+ return hits
+
+ def _hit_metadata(self, hits: list[dict[str, Any]]) -> list[dict[str, Any]]:
+ """Attach parsed metadata to each hit as 'parsed_metadata'."""
+ for hit in hits:
+ try:
+ # some hits already include top-level fields, but parse consistently
+ parsed = self.parse_metadata(hit)
+ # preserve both raw and parsed
+ hit["parsed_metadata"] = parsed
+ except Exception:
+ hit["parsed_metadata"] = {}
+ return hits
+
+ def _apply_filters(
+        self, hits: list[dict[str, Any]], filters: tuple[tuple[str, str], ...] | None
+ ) -> list[dict[str, Any]]:
+ """Apply AND-filters to hits using parsed metadata when available.
+
+ Field matching is case-insensitive. For list fields (keywords, creators,
+ communities) we match if any element contains the filter value.
+ """
+ if not filters:
+ return hits
+
+ filtered: list[dict[str, Any]] = []
+ for hit in hits:
+ metadata = hit.get("parsed_metadata") or hit.get("metadata") or {}
+ if not metadata:
+ continue
+
+ matches_all = True
+ for field, value in filters:
+ filter_value = value.lower()
+ field_value = metadata.get(field, "")
+
+ if isinstance(field_value, list):
+ # normalize list values to strings
+ found = False
+ for item in field_value:
+ # item may be dict (e.g., creators)
+ if isinstance(item, dict):
+ # try to match on common text fields
+ text = " ".join(
+ str(v) for v in item.values() if isinstance(v, str)
+ )
+ else:
+ text = str(item)
+ if filter_value in text.lower():
+ found = True
+ break
+ if not found:
+ matches_all = False
+ break
+
+ else:
+ if not isinstance(field_value, str):
+ field_value = str(field_value)
+ if (
+ filter_value != field_value.lower()
+ and filter_value not in field_value.lower()
+ ):
+ matches_all = False
+ break
+
+ if matches_all:
+ filtered.append(hit)
+
+ return filtered
+
+ def _backfill_filtered_results(
+ self,
+ initial_hits: list[dict[str, Any]],
+ page: int,
+ page_size: int,
+        filters: tuple[tuple[str, str], ...] | None,
+        query: str | None = None,
+ ) -> tuple[list[dict[str, Any]], bool, int]:
+ """Fetch subsequent pages until page_size filtered results are collected or timeout.
+
+ Returns (filtered_hits_trimmed, page_size_met, pages_fetched).
+ """
+ filtered = initial_hits[:]
+ current_page = page
+ start_time = time.time()
+ pages_fetched = 1
+
+ while len(filtered) < page_size:
+ if time.time() - start_time > 30:
+ break
+
+ current_page += 1
+ try:
+ params = {
+ "q": query or "",
+ "page": current_page,
+ "size": page_size,
+ "communities": self.community,
+ "type": self.record_type,
+ }
+ resp = self.session.get(
+ self.base_url, headers=self.headers, params=params, timeout=30
+ )
+ if resp.status_code != 200:
+ break
+ data = resp.json()
+ next_hits = (
+ data.get("hits", {}).get("hits", [])
+ if isinstance(data.get("hits"), dict)
+ else data.get("hits", [])
+ )
+ if not next_hits:
+ break
+
+ next_hits = self._hit_metadata(next_hits)
+ next_hits = self._hit_url(next_hits)
+ next_filtered = self._apply_filters(next_hits, filters)
+ filtered.extend(next_filtered)
+ pages_fetched += 1
+
+ except Exception:
+ break
+
+ page_size_met = len(filtered) >= page_size
+ return filtered[:page_size], page_size_met, pages_fetched
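+
+    # Filter sketch (field names illustrative): filters AND together and
+    # are matched against each hit's parsed metadata.
+    #
+    #     ZenodoExtractor().search_records(
+    #         query="thyroid",
+    #         filters=(("keywords", "vhp4safety"),),
+    #     )
+    #
+    # When filtering drops a page below `size`, later pages are fetched
+    # (bounded at ~30 s) until the page fills or results run out.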
+
+ def parse_metadata(self, raw_record: dict[str, Any]) -> dict[str, Any]:
+ """Normalize Zenodo record structure into a simpler metadata dict.
+
+ Accepts either a full record returned from /api/records/:id or a hit
+ element from a search response.
+ """
+ try:
+ # Zenodo typically nests useful fields under 'metadata'
+ raw = raw_record.get("metadata", raw_record)
+
+ metadata: dict[str, Any] = {
+ "id": raw_record.get("id")
+ or raw_record.get("recid")
+ or raw.get("recid"),
+ "recid": raw_record.get("recid") or raw_record.get("id"),
+ "doi": raw.get("doi"),
+ "doi_url": raw_record.get("doi_url") or raw.get("doi_url"),
+ "title": raw.get("title", "N/A"),
+ "description": raw.get("description", "N/A"),
+ "publication_date": raw.get(
+ "publication_date", raw.get("publication_date", "N/A")
+ ),
+ "access_right": raw.get("access_right"),
+ "creators": raw.get("creators", []),
+ "keywords": raw.get("keywords", []),
+ "resource_type": raw.get("resource_type", {}),
+ "license": raw.get("license", {}),
+ "grants": raw.get("grants", []),
+ "communities": raw.get("communities", []),
+ "related_identifiers": raw.get(
+ "related_identifiers", raw.get("related_identifiers", [])
+ ),
+ "files": [],
+ "links": raw_record.get("links", {}),
+ "stats": raw_record.get("stats", {}),
+ "raw": raw_record,
+ }
+
+ # Extract files if available at top-level or under raw
+ files = raw_record.get("files") or raw.get("files") or []
+ is_rocrate = False
+ for f in files:
+ if f.get("key", "").lower() == "rocrate-metadata.json":
+ is_rocrate = True
+ metadata["files"].append(
+ {
+ "id": f.get("id"),
+ "key": f.get("key") or f.get("name"),
+ "size": f.get("size"),
+ "checksum": f.get("checksum"),
+ "links": f.get("links", {}),
+ }
+ )
+ metadata["is_rocrate"] = is_rocrate
+
+ return metadata
+
+ except Exception as e:
+ return {"error": f"Failed to parse metadata: {e}", "raw": raw_record}
diff --git a/src/models/platform.py b/src/models/platform.py
new file mode 100644
index 0000000..4438abe
--- /dev/null
+++ b/src/models/platform.py
@@ -0,0 +1,56 @@
+"""Pydantic models for VHP4Safety platform configuration and domain objects."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class RegulatoryQuestion(BaseModel):
+ """A regulatory question tied to a case study."""
+
+ key: str = Field(description="Internal key, e.g. reg_q_1a")
+ label: str
+ explanation: str
+ case_study: Optional[str] = None
+
+
+class StageExplanation(BaseModel):
+ """Safety-assessment workflow stage with a short explanation."""
+
+ name: str
+ explanation: str
+
+
+class CompoundProperty(BaseModel):
+ """Single property row returned by a SPARQL compound query."""
+
+ property_label: str = ""
+ value: str = ""
+ units_label: Optional[str] = None
+ formatter_url: Optional[str] = None
+ source: Optional[str] = None
+ doi: Optional[str] = None
+ see_also: Optional[str] = None
+
+
+class CompoundSummary(BaseModel):
+ """Core identifiers for a compound from CompoundCloud."""
+
+ wcid: str
+ label: str
+ inchi: str = ""
+ inchikey: str = ""
+ smiles: str = Field("", alias="SMILES")
+ formula: str = ""
+ mass: str = ""
+
+ model_config = {"populate_by_name": True}
+
+
+class GlossaryStageMapping(BaseModel):
+ """Maps a glossary URL to a human-readable stage name."""
+
+ glossary_url: str
+ stage_name: str
diff --git a/src/scheduler.py b/src/scheduler.py
new file mode 100644
index 0000000..e5e5654
--- /dev/null
+++ b/src/scheduler.py
@@ -0,0 +1,61 @@
+"""
+Nightly background job that re-seeds the database from upstream GitHub sources.
+
+Uses APScheduler's BackgroundScheduler so it runs inside the same Flask /
+SQLite process — no external cron or second container needed.
+"""
+
+import logging
+import os
+
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers.cron import CronTrigger
+
+log = logging.getLogger(__name__)
+
+_scheduler: BackgroundScheduler | None = None
+
+
+def _reseed_job() -> None:
+ """Drop + re-seed all tables from upstream YAML indexes."""
+ from src.seed import seed_all # late import to avoid circular deps
+ log.info("⏳ Nightly re-seed started …")
+ try:
+ seed_all()
+ log.info("✅ Nightly re-seed complete")
+ except Exception:
+ log.exception("❌ Nightly re-seed failed")
+
+
+def init_scheduler(app=None) -> BackgroundScheduler | None:
+ """
+ Start (or return) the background scheduler.
+
+ Environment knobs (all optional):
+ RESEED_HOUR – hour to run (0-23, default 3)
+ RESEED_MINUTE – minute to run (0-59, default 0)
+ RESEED_ENABLED – set to "false" to disable entirely
+ """
+ global _scheduler
+ if _scheduler is not None:
+ return _scheduler
+
+ enabled = os.environ.get("RESEED_ENABLED", "true").lower()
+ if enabled == "false":
+ log.info("🔕 Nightly re-seed disabled (RESEED_ENABLED=false)")
+ return None
+
+ hour = int(os.environ.get("RESEED_HOUR", "3"))
+ minute = int(os.environ.get("RESEED_MINUTE", "0"))
+
+ _scheduler = BackgroundScheduler(daemon=True)
+ _scheduler.add_job(
+ _reseed_job,
+ trigger=CronTrigger(hour=hour, minute=minute),
+ id="nightly_reseed",
+ name="Re-seed DB from upstream",
+ replace_existing=True,
+ )
+ _scheduler.start()
+ log.info("🕐 Nightly re-seed scheduled at %02d:%02d UTC", hour, minute)
+ return _scheduler
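+
+
+# Wiring sketch (assumed call site in app.py; adjust as needed):
+#
+#     from src.scheduler import init_scheduler
+#     init_scheduler(app)  # returns None when RESEED_ENABLED=false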
diff --git a/src/seed.py b/src/seed.py
new file mode 100644
index 0000000..8ab0032
--- /dev/null
+++ b/src/seed.py
@@ -0,0 +1,279 @@
+"""Seed the database from upstream GitHub JSON indexes.
+
+Run: python -m src.seed
+Idempotent — uses INSERT OR REPLACE (upsert).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from datetime import datetime, timezone
+
+import requests
+
+from src.db import get_conn, init_db
+
+SERVICES_URL = os.environ.get(
+ "SERVICES_URL",
+ "https://raw.githubusercontent.com/VHP4Safety/cloud"
+ "/refs/heads/main/cap/service_index.json",
+)
+METHODS_URL = os.environ.get(
+ "METHODS_URL",
+ "https://raw.githubusercontent.com/VHP4Safety/cloud"
+ "/refs/heads/main/cap/methods_index.json",
+)
+
+# ── Static reference data ────────────────────────────────────────────────
+
+REG_QUESTIONS = {
+ "reg_q_1a": {
+ "label": "Kidney Case Study (a)",
+ "explanation": "What is the safe cisplatin dose in cancer patients?",
+ },
+ "reg_q_1b": {
+ "label": "Kidney Case Study (b)",
+ "explanation": (
+ "What is the intrinsic hazard of tacrolimus "
+ "for nephrotoxicity?"
+ ),
+ },
+ "reg_q_2a": {
+ "label": "Parkinson Case Study (a)",
+ "explanation": "Can compound Dinoseb cause Parkinson's Disease?",
+ },
+ "reg_q_2b": {
+ "label": "Parkinson Case Study (b)",
+ "explanation": (
+ "What level of exposure to compound Dinoseb leads to "
+ "risk for developing Parkinson's disease?"
+ ),
+ },
+ "reg_q_3a": {
+ "label": "Thyroid Case Study (a)",
+ "explanation": (
+ "What information about silychristin do we need to give "
+ "an advice to women in their early pregnancy to decide "
+ "whether the substance can be used?"
+ ),
+ },
+ "reg_q_3b": {
+ "label": "Thyroid Case Study (b)",
+ "explanation": (
+ "Does silychristin influence the thyroid-mediated brain "
+ "development in the fetus resulting in cognitive "
+ "impairment in children?"
+ ),
+ },
+}
+
+STAGE_EXPLANATIONS = {
+ "ADME": (
+ "Absorption, distribution, metabolism, and excretion of a "
+ "substance in a living organism, following exposure."
+ ),
+ "Hazard Assessment": (
+ "The process of assessing the intrinsic hazard a substance "
+ "poses to human health and/or the environment."
+ ),
+ "Chemical Information": (
+ "Information about chemical properties and identity."
+ ),
+ "General": "Not specific to a flow step.",
+ "(External) exposure": "External exposure assessment.",
+ "Generic": "Generic category.",
+ "Other": "Other or unknown category.",
+}
+
+GLOSSARY_STAGE_MAPPINGS = {
+ "https://vhp4safety.github.io/glossary#VHP0000056": "ADME",
+ "https://vhp4safety.github.io/glossary#VHP0000102": "Hazard Assessment",
+ "https://vhp4safety.github.io/glossary#VHP0000148": "Chemical Information",
+ "https://vhp4safety.github.io/glossary#VHP0000149": "General",
+}
+
+CASE_STUDIES = [
+ {
+ "slug": "kidney",
+ "title": "Kidney case study",
+ "description": "To study kidney disease and pharmacovigilance.",
+ "image_src": "/static/images/image43_hexagon.svg",
+ "image_alt": "Kidney case study",
+ },
+ {
+ "slug": "parkinson",
+ "title": "Parkinson case study",
+ "description": (
+ "To study life course pesticide exposure and "
+ "neurodegenerative disease."
+ ),
+ "image_src": "/static/images/image45_hexagon.svg",
+ "image_alt": "Parkinson case study",
+ },
+ {
+ "slug": "thyroid",
+ "title": "Thyroid case study",
+ "description": (
+ "To study health effects discriminated by age and sex on "
+ "thyroid-mediated neurodevelopment."
+ ),
+ "image_src": "/static/images/image47_hexagon.svg",
+ "image_alt": "Thyroid case study",
+ },
+]
+
+CASESTUDY_CONTENT_URL = (
+ "https://raw.githubusercontent.com/"
+ "VHP4Safety/ui-casestudy-config/main/{slug}_content.json"
+)
+
+
+def _bool_flag(val):
+ if val is None or val == "":
+ return None
+ return 1 if str(val).strip().lower() == "true" else 0
+
+
+def _now():
+ return datetime.now(timezone.utc).isoformat()
+
+
+def seed_reference_data(conn) -> None:
+ for key, data in REG_QUESTIONS.items():
+ conn.execute(
+ "INSERT OR REPLACE INTO regulatory_questions (key, label, explanation) VALUES (?, ?, ?)",
+ (key, data["label"], data["explanation"]),
+ )
+ for name, explanation in STAGE_EXPLANATIONS.items():
+ conn.execute(
+ "INSERT OR REPLACE INTO stage_explanations (name, explanation) VALUES (?, ?)",
+ (name, explanation),
+ )
+ for url, stage in GLOSSARY_STAGE_MAPPINGS.items():
+ conn.execute(
+ "INSERT OR REPLACE INTO glossary_stage_mappings (glossary_url, stage_name) VALUES (?, ?)",
+ (url, stage),
+ )
+ for cs in CASE_STUDIES:
+ content_json = None
+ try:
+ url = CASESTUDY_CONTENT_URL.format(slug=cs["slug"])
+ resp = requests.get(url, timeout=15)
+ resp.raise_for_status()
+ content_json = resp.text
+ print(f" ok fetched {cs['slug']}_content.json")
+ except Exception as exc:
+ print(f" x could not fetch {cs['slug']}: {exc}")
+ conn.execute(
+ """INSERT OR REPLACE INTO case_studies
+ (slug, title, description, image_src, image_alt, content_json)
+ VALUES (?, ?, ?, ?, ?, ?)""",
+ (cs["slug"], cs["title"], cs["description"],
+ cs.get("image_src"), cs.get("image_alt"), content_json),
+ )
+ conn.commit()
+ print("ok reference data seeded")
+
+
+def seed_tools(conn) -> None:
+ resp = requests.get(SERVICES_URL, timeout=15)
+ resp.raise_for_status()
+ data = resp.json()
+
+ # Build glossary lookup
+ cur = conn.execute("SELECT glossary_url, stage_name FROM glossary_stage_mappings")
+ glossary = {r["glossary_url"]: r["stage_name"] for r in cur}
+
+ now = _now()
+ for tool_id, raw in data.items():
+ stage = raw.get("stage", "")
+ stage = glossary.get(stage, stage)
+ if stage in ("NA", "Unknown"):
+ stage = "Other"
+
+ conn.execute(
+ """INSERT OR REPLACE INTO tools
+ (id, service, description, stage, html_name, md_file_name,
+ png_file_name, main_url, inst_url,
+ reg_q_1a, reg_q_1b, reg_q_2a, reg_q_2b, reg_q_3a, reg_q_3b,
+ login, api_type, casestudy, provider, provider_email,
+ citation, version, license, sourcecode, docker,
+ bio_tools, tess, raw_json, updated_at)
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
+ (tool_id, raw.get("service", tool_id), raw.get("description"),
+ stage, raw.get("html_name"), raw.get("md_file_name"),
+ raw.get("png_file_name"), raw.get("main_url"),
+ raw.get("inst_url") or None,
+ _bool_flag(raw.get("reg_q_1a")), _bool_flag(raw.get("reg_q_1b")),
+ _bool_flag(raw.get("reg_q_2a")), _bool_flag(raw.get("reg_q_2b")),
+ _bool_flag(raw.get("reg_q_3a")), _bool_flag(raw.get("reg_q_3b")),
+ raw.get("login"), raw.get("api"), raw.get("casestudy"),
+ raw.get("provider"), raw.get("provider-email"),
+ raw.get("citation"), raw.get("version"), raw.get("license"),
+ raw.get("sourcecode"), raw.get("docker"),
+ raw.get("bioTools"), raw.get("tess"),
+ json.dumps(raw), now),
+ )
+ conn.commit()
+ print(f"ok {len(data)} tools seeded")
+
+
+def seed_methods(conn) -> None:
+ resp = requests.get(METHODS_URL, timeout=15)
+ resp.raise_for_status()
+ data = resp.json()
+
+ now = _now()
+ for method_id, raw in data.items():
+ conn.execute(
+ """INSERT OR REPLACE INTO methods
+ (id, method, issue_number, description, stage, substage,
+ catalog_webpage_url, case_study, regulatory_question,
+ reg_q_1a, reg_q_1b, reg_q_2a, reg_q_2b, reg_q_3a, reg_q_3b,
+ data_producer, sop, vendor, catalog_number, citation,
+ type_iri, ontology, key_event_id, aop_id,
+ raw_json, updated_at)
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
+ (method_id,
+ raw.get("method") or raw.get("method_name_content", method_id),
+ raw.get("issue_number"),
+ raw.get("method_description_content"),
+ raw.get("vhp4safety_workflow_stage_content"),
+ raw.get("workflow_substage_content"),
+ raw.get("catalog_webpage_url"),
+ raw.get("case_study_content"),
+ raw.get("regulatory_question_content"),
+ _bool_flag(raw.get("reg_q_1a")), _bool_flag(raw.get("reg_q_1b")),
+ _bool_flag(raw.get("reg_q_2a")), _bool_flag(raw.get("reg_q_2b")),
+ _bool_flag(raw.get("reg_q_3a")), _bool_flag(raw.get("reg_q_3b")),
+ raw.get("data_producer_content"),
+ raw.get("available_sop_or_protocol_content"),
+ raw.get("vendor_content"),
+ raw.get("catalog_number_content"),
+ raw.get("citation_content"),
+ raw.get("ontology_term_content"),
+ raw.get("type_content"),
+ raw.get("relevant_aop_wiki_key_event(s)_to_the_assay_content"),
+ raw.get("relevant_aop_wiki_adverse_outcome_pathway(s)_to_the_assay_content"),
+ json.dumps(raw), now),
+ )
+ conn.commit()
+ print(f"ok {len(data)} methods seeded")
+
+
+def seed_all() -> None:
+ init_db()
+ conn = get_conn()
+ try:
+ seed_reference_data(conn)
+ seed_tools(conn)
+ seed_methods(conn)
+ print("ok seeding complete")
+ finally:
+ conn.close()
+
+
+if __name__ == "__main__":
+ seed_all()
diff --git a/src/services/__init__.py b/src/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/services/compound.py b/src/services/compound.py
new file mode 100644
index 0000000..cce8323
--- /dev/null
+++ b/src/services/compound.py
@@ -0,0 +1,204 @@
+"""Compound data service — encapsulates all CompoundCloud SPARQL queries.
+
+All SPARQL logic is centralised here; Flask routes just call these
+functions and get back typed Pydantic models or plain dicts.
+"""
+
+from __future__ import annotations
+
+import re
+import urllib.parse
+from typing import Optional
+
+import requests
+from wikibaseintegrator import wbi_helpers
+
+from src.models.compound import (
+ CompoundDetail,
+ CompoundExperimentalDatum,
+ CompoundIdentifier,
+ CompoundSummary,
+ CompoundToxicology,
+)
+
+COMPOUND_EP = "https://compoundcloud.wikibase.cloud/query/sparql"
+QLEVER_EP = (
+ "https://qlever.cs.uni-freiburg.de/api/wikidata"
+ "?format=json&query="
+)
+
+_QID_RE = re.compile(r"^Q\d+$")
+
+
+def is_valid_qid(qid: str) -> bool:
+ return bool(_QID_RE.fullmatch(qid))
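+
+
+# Sketch:
+#   is_valid_qid("Q2270") -> True
+#   is_valid_qid("2270")  -> False
+#   is_valid_qid("Q12 ")  -> False  (fullmatch: no stray whitespace)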
+
+
+# ── Individual queries ────────────────────────────────────────────────────
+
+
+def get_properties(cwid: str) -> Optional[CompoundSummary]:
+ """Fetch core identifiers (InChI, SMILES, formula, mass)."""
+ q = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n\n"
+ "SELECT ?cmp ?cmpLabel ?formula ?mass ?inchi ?inchiKey ?SMILES WHERE {\n"
+ f" VALUES ?cmp {{ wd:{cwid} }}\n"
+ " ?cmp wdt:P9 ?inchi ;\n"
+ " wdt:P10 ?inchiKey .\n"
+ " OPTIONAL { ?cmp wdt:P2 ?mass }\n"
+ " OPTIONAL { ?cmp wdt:P3 ?formula }\n"
+ " OPTIONAL { ?cmp wdt:P7 ?chiralSMILES }\n"
+ " OPTIONAL { ?cmp wdt:P12 ?nonchiralSMILES }\n"
+ ' BIND (COALESCE(IF(BOUND(?chiralSMILES), ?chiralSMILES, 1/0),'
+ ' IF(BOUND(?nonchiralSMILES), ?nonchiralSMILES, 1/0), "")'
+ " AS ?SMILES)\n"
+ " SERVICE wikibase:label {"
+ ' bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n'
+ "}"
+ )
+ result = wbi_helpers.execute_sparql_query(q, endpoint=COMPOUND_EP)
+ bindings = result.get("results", {}).get("bindings", [])
+ if not bindings:
+ return None
+ b = bindings[0]
+ return CompoundSummary(
+ wcid=b["cmp"]["value"],
+ label=b["cmpLabel"]["value"],
+ inchi=b["inchi"]["value"],
+ inchikey=b["inchiKey"]["value"],
+ SMILES=b.get("SMILES", {}).get("value", ""),
+ formula=b.get("formula", {}).get("value", ""),
+ mass=b.get("mass", {}).get("value", ""),
+ )
+
+
+def get_identifiers(cwid: str) -> list[CompoundIdentifier]:
+ """Fetch external identifiers (CAS, PubChem, …)."""
+ q = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n\n"
+ "SELECT DISTINCT ?propertyLabel ?value ?formatterURL\n"
+ "WHERE {\n"
+ " VALUES ?property { wd:P13 wd:P22 wd:P23 wd:P26 wd:P27"
+ " wd:P28 wd:P36 wd:P41 wd:P43 wd:P44 wd:P45 }\n"
+ " ?property wikibase:directClaim ?valueProp .\n"
+ f" OPTIONAL {{ wd:{cwid} ?valueProp ?value }}\n"
+ " OPTIONAL { ?property wdt:P6 ?formatterURL }\n"
+ " SERVICE wikibase:label {"
+ ' bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n'
+ "}"
+ )
+ result = wbi_helpers.execute_sparql_query(q, endpoint=COMPOUND_EP)
+ bindings = result.get("results", {}).get("bindings", [])
+ out: list[CompoundIdentifier] = []
+ for b in bindings:
+ out.append(CompoundIdentifier(
+ property_label=b.get("propertyLabel", {}).get("value", ""),
+ value=b.get("value", {}).get("value", ""),
+ formatter_url=b.get("formatterURL", {}).get("value", ""),
+ ))
+ return out
+
+
+def get_toxicology(cwid: str) -> list[CompoundToxicology]:
+ """Fetch toxicology properties."""
+ q = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n\n"
+ "SELECT DISTINCT ?propertyLabel ?value ?formatterURL\n"
+ "WHERE {\n"
+ " VALUES ?property { wd:P17 wd:P19 wd:P4 }\n"
+ " ?property wikibase:directClaim ?valueProp .\n"
+ f" OPTIONAL {{ wd:{cwid} ?valueProp ?value }}\n"
+ " OPTIONAL { ?property wdt:P6 ?formatterURL }\n"
+ " SERVICE wikibase:label {"
+ ' bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n'
+ "}"
+ )
+ result = wbi_helpers.execute_sparql_query(q, endpoint=COMPOUND_EP)
+ bindings = result.get("results", {}).get("bindings", [])
+ out: list[CompoundToxicology] = []
+ for b in bindings:
+ out.append(CompoundToxicology(
+ property_label=b.get("propertyLabel", {}).get("value", ""),
+ value=b.get("value", {}).get("value", ""),
+ ))
+ return out
+
+
+def get_experimental_data(
+ cwid: str,
+) -> list[CompoundExperimentalDatum]:
+ """Fetch experimental data via Wikidata QLever."""
+ # Step 1: resolve CompoundCloud QID → Wikidata QID
+ q1 = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n\n"
+ "SELECT ?qid WHERE {\n"
+ " wd:P5 wikibase:directClaim ?identifierProp .\n"
+ f" wd:{cwid} ?identifierProp ?wikidata .\n"
+ " BIND (iri(CONCAT("
+ '"http://www.wikidata.org/entity/", ?wikidata)) AS ?qid)\n'
+ "}"
+ )
+ r1 = wbi_helpers.execute_sparql_query(q1, endpoint=COMPOUND_EP)
+ bindings = r1.get("results", {}).get("bindings", [])
+ if not bindings:
+ return []
+ qid = bindings[0]["qid"]["value"]
+
+ # Step 2: query Wikidata QLever for experimental properties
+ q2 = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n"
+ "PREFIX prov: \n"
+ "PREFIX rdfs: \n"
+ "PREFIX pr: \n"
+ "PREFIX wikibase: \n\n"
+ "SELECT DISTINCT ?propEntityLabel ?value"
+ " ?unitsLabel ?source ?doi ?statement\n"
+ "WHERE {\n"
+ f" <{qid}> ?propp ?statement .\n"
+ " ?statement a wikibase:BestRank ;\n"
+ " ?proppsv ["
+ " wikibase:quantityAmount ?value ;"
+ " wikibase:quantityUnit ?units ] .\n"
+ " ?property wikibase:claim ?propp ;"
+ " wikibase:statementValue ?proppsv ;"
+ " wdt:P1629 ?propEntity ;"
+ " wdt:P31 wd:Q21077852 .\n"
+ " ?propEntity @en@rdfs:label ?propEntityLabel .\n"
+ " ?units @en@rdfs:label ?unitsLabel .\n"
+ " BIND (COALESCE(IF(BOUND(?sourceTmp),"
+ ' ?sourceTmp, 1/0), "") AS ?source)\n'
+ " BIND (COALESCE(IF(BOUND(?doiTmp),"
+ ' ?doiTmp, 1/0), "") AS ?doi)\n'
+ "}"
+ )
+ url = QLEVER_EP + urllib.parse.quote_plus(q2)
+    resp = requests.get(url, timeout=15)
+    resp.raise_for_status()
+    data = resp.json()
+ bindings = data.get("results", {}).get("bindings", [])
+
+ out: list[CompoundExperimentalDatum] = []
+ for b in bindings:
+ out.append(CompoundExperimentalDatum(
+ property_label=b.get("propEntityLabel", {}).get("value", ""),
+ value=b.get("value", {}).get("value", ""),
+ units_label=b.get("unitsLabel", {}).get("value", ""),
+ source=b.get("source", {}).get("value", ""),
+ doi=b.get("doi", {}).get("value", ""),
+ see_also=b.get("statement", {}).get("value", ""),
+ ))
+ return out
+
+
+def get_full_compound(cwid: str) -> CompoundDetail:
+ """Fetch everything about a compound."""
+ return CompoundDetail(
+ summary=get_properties(cwid),
+ identifiers=get_identifiers(cwid),
+ toxicology=get_toxicology(cwid),
+ experimental_data=get_experimental_data(cwid),
+ )
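+
+
+# Example usage (sketch — performs live SPARQL queries; Q2270 is the
+# compound used in this repo's API checks):
+#
+#     detail = get_full_compound("Q2270")
+#     if detail.summary:
+#         print(detail.summary.label, detail.summary.formula)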
diff --git a/src/sitemap.py b/src/sitemap.py
new file mode 100644
index 0000000..5e7ee36
--- /dev/null
+++ b/src/sitemap.py
@@ -0,0 +1,59 @@
+"""Generate a static sitemap.xml file from DB contents."""
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Iterable
+import os
+from xml.etree import ElementTree as ET
+
+from src.db import get_conn
+
+BASE_URL = os.environ.get("BASE_URL", "http://localhost:5050")
+OUT_PATH = os.path.join(os.path.dirname(__file__), "..", "static", "sitemap.xml")
+
+
+def _add_url(root, loc, lastmod=None, changefreq="monthly", priority="0.5"):
+ url = ET.SubElement(root, "url")
+ ET.SubElement(url, "loc").text = loc
+ if lastmod:
+ ET.SubElement(url, "lastmod").text = lastmod
+ ET.SubElement(url, "changefreq").text = changefreq
+ ET.SubElement(url, "priority").text = priority
+
+
+def gather_urls() -> Iterable[tuple[str, str | None]]:
+ conn = get_conn()
+ try:
+ yield (f"{BASE_URL}/", datetime.utcnow().isoformat())
+ for path in ("/tools", "/methods", "/data", "/casestudies", "/api/v1/docs"):
+ yield (f"{BASE_URL}{path}", None)
+ for t in conn.execute("SELECT id, updated_at FROM tools").fetchall():
+ if t["id"]:
+ yield (f"{BASE_URL}/tools/{t['id']}", t["updated_at"])
+ for m in conn.execute("SELECT id, updated_at FROM methods").fetchall():
+ if m["id"]:
+ yield (f"{BASE_URL}/methods/{m['id']}", m["updated_at"])
+ for cs in conn.execute("SELECT slug FROM case_studies").fetchall():
+ if cs["slug"]:
+ yield (f"{BASE_URL}/casestudies/{cs['slug']}", None)
+ finally:
+ conn.close()
+
+
+def build_sitemap(out_path: str = OUT_PATH) -> str:
+ root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
+ for loc, last in gather_urls():
+ _add_url(root, loc, lastmod=last)
+ tree = ET.ElementTree(root)
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
+ tree.write(out_path, encoding="utf-8", xml_declaration=True)
+ return out_path
+
+
+def main() -> None:
+ path = build_sitemap()
+ print(f"Wrote sitemap to: {path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/templates/base.html b/templates/base.html
index 4c103ac..5983caa 100644
--- a/templates/base.html
+++ b/templates/base.html
@@ -162,6 +162,9 @@
+
+
@@ -281,6 +284,7 @@
EXPLORE
diff --git a/templates/case_studies/casestudies.html b/templates/case_studies/casestudies.html
index 0854f95..3bbd7fe 100644
--- a/templates/case_studies/casestudies.html
+++ b/templates/case_studies/casestudies.html
@@ -11,41 +11,21 @@ Case Studies
-
-
-
-
+ {% for card in cards %}
+ {% endfor %}
diff --git a/templates/case_studies/casestudy_server.html b/templates/case_studies/casestudy_server.html
new file mode 100644
index 0000000..eeadd59
--- /dev/null
+++ b/templates/case_studies/casestudy_server.html
@@ -0,0 +1,229 @@
+{% extends "base.html" %} {% block content %}
+
+
+
+
+{# ── Breadcrumbs ── #}
+
+
+{# ── Workflow Header ── #}
+
+
+{# ── Main Content ── #}
+
+
+ {% if step.nav_title %}
+        {{ step.nav_title }}
+ {% endif %}
+
+ {% if step.nav_description %}
+        {{ step.nav_description }}
+ {% endif %}
+
+ {% if step.image_html %}
+ {{ step.image_html | safe }}
+ {% endif %}
+
+ {# ── Step Buttons ── #}
+ {% if step.buttons %}
+
+ {% for btn in step.buttons %}
+
+ {% endfor %}
+
+ {% endif %}
+
+ {# ── HTML Content Block ── #}
+ {% if step.content_html %}
+
+ {{ step.content_html | safe }}
+
+ {% endif %}
+
+ {# ── Accordion Sections ── #}
+ {% if step.accordion_sections %}
+
+ {% for item in step.accordion_sections %}
+ {% set item_id = "accordionItem" ~ loop.index0 %}
+
+
+
+
+ {{ item.description | default("") | safe }}
+
+
+
+ {% endfor %}
+
+ {% endif %}
+
+
+
+{# ── Feedback Button ── #}
+
+
+
+
+
+
+{% endblock %}
diff --git a/templates/Safety_Assessment_Workflow.html b/templates/safety_assessment_workflow.html
similarity index 100%
rename from templates/Safety_Assessment_Workflow.html
rename to templates/safety_assessment_workflow.html
From 245719bb28d18cd2344e24da5fad5cf64373490d Mon Sep 17 00:00:00 2001
From: Javier
Date: Fri, 17 Apr 2026 17:36:31 +0200
Subject: [PATCH 2/2] Add examples to endpoints and API check action
---
.github/scripts/api_check.py | 122 ++++++++++++++++
.github/workflows/pr-api-check.yml | 45 ++++++
src/api.py | 219 ++++++++++++++++++++++-------
3 files changed, 336 insertions(+), 50 deletions(-)
create mode 100644 .github/scripts/api_check.py
create mode 100644 .github/workflows/pr-api-check.yml
diff --git a/.github/scripts/api_check.py b/.github/scripts/api_check.py
new file mode 100644
index 0000000..9e3dbc7
--- /dev/null
+++ b/.github/scripts/api_check.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""API check: counts, validation summary, and route health."""
+
+import json
+import sys
+import urllib.error
+import urllib.request
+from datetime import datetime, timezone
+
+BASE = "http://localhost:5050/api"
+
+
+def get(path):
+ url = f"{BASE}{path}"
+ try:
+ req = urllib.request.Request(url)
+ with urllib.request.urlopen(req, timeout=15) as r:
+ return r.status, json.loads(r.read())
+ except urllib.error.HTTPError as e:
+ return e.code, None
+ except Exception:
+ return 0, None
+
+
+errors = []
+
+# 1. Entity counts
+ENTITIES = {
+ "Tools": "/tools/",
+ "Methods": "/methods/",
+ "Case studies": "/casestudies/",
+ "Regulatory questions": "/regulatory-questions/",
+ "Stage explanations": "/stages/",
+}
+
+counts = {}
+for label, path in ENTITIES.items():
+ status, data = get(path)
+ if status == 200 and isinstance(data, list):
+ counts[label] = len(data)
+ else:
+ counts[label] = None
+ errors.append(f"GET {path} -> {status}")
+
+# 2. Validation summary
+status, validation = get("/validation/")
+if status != 200:
+ errors.append(f"GET /validation/ -> {status}")
+ validation = None
+
+# 3. Health check every route
+ROUTES = [
+ ("GET", "/tools/"),
+ ("GET", "/tools/cdkdepict"),
+ ("GET", "/methods/"),
+ ("GET", "/methods/5_cfda_assay_to_determine_cytotoxicity"),
+ ("GET", "/regulatory-questions/"),
+ ("GET", "/stages/"),
+ ("GET", "/casestudies/"),
+ ("GET", "/casestudies/kidney"),
+ ("GET", "/compounds/Q2270"),
+ ("GET", "/compounds/Q2270/properties"),
+ ("GET", "/compounds/Q2270/identifiers"),
+ ("GET", "/compounds/Q2270/toxicology"),
+ ("GET", "/compounds/Q2270/experimental-data"),
+ ("GET", "/data/"),
+ ("GET", "/validation/"),
+ ("GET", "/validation/tools"),
+]
+
+health = []
+for method, path in ROUTES:
+ status, _ = get(path)
+ ok = 200 <= status < 300
+ health.append((method, path, status, ok))
+ if not ok:
+ errors.append(f"{method} {path} -> {status}")
+
+# ── build report ──────────────────────────────────────────────────
+
+now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+lines = [f"## API check -- {now}", ""]
+
+# counts
+lines.append("### Entity counts")
+lines.append("")
+lines.append("| Entity | Count |")
+lines.append("|--------|------:|")
+for label, n in counts.items():
+ lines.append(f"| {label} | {n if n is not None else 'ERR'} |")
+lines.append("")
+
+# validation
+if validation and "entities" in validation:
+ lines.append("### Validation (field completeness)")
+ lines.append("")
+ lines.append("| Entity | Entries | Avg complete | Full |")
+ lines.append("|--------|--------:|-------------:|-----:|")
+ for e in validation["entities"]:
+ lines.append(
+ f"| {e['entity']} | {e['total_entries']}"
+ f" | {e['avg_completeness_pct']}%"
+ f" | {e['fully_complete']}/{e['total_entries']} |"
+ )
+ lines.append("")
+
+# health
+lines.append("### Route health")
+lines.append("")
+lines.append("| Method | Route | Status |")
+lines.append("|--------|-------|-------:|")
+for method, path, status, ok in health:
+ mark = "ok" if ok else f"FAIL ({status})"
+ lines.append(f"| {method} | `{path}` | {mark} |")
+lines.append("")
+
+# result
+all_ok = not errors
+lines.append(f"**Result: {'PASS' if all_ok else 'FAIL'}**")
+
+print("\n".join(lines))
+if not all_ok:
+ sys.exit(1)
diff --git a/.github/workflows/pr-api-check.yml b/.github/workflows/pr-api-check.yml
new file mode 100644
index 0000000..b9680cd
--- /dev/null
+++ b/.github/workflows/pr-api-check.yml
@@ -0,0 +1,45 @@
+name: API check
+
+on:
+ pull_request:
+
+permissions:
+ contents: read
+ pull-requests: write
+
+jobs:
+ api-check:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Build Docker image
+ run: docker build -t vhp4safety .
+
+ - name: Start container
+ run: |
+ docker run -d --name vhp4safety -p 5050:5050 vhp4safety
+ for i in $(seq 1 30); do
+ curl -sf http://localhost:5050/api/tools/ && break
+ sleep 2
+ done
+
+ - name: Run API checks
+ id: report
+ run: |
+ python3 .github/scripts/api_check.py > report.md
+          {
+            echo 'REPORT<<EOF'
+            cat report.md
+            echo 'EOF'
+          } >> "$GITHUB_OUTPUT"
+
+ - name: Post PR comment
+ uses: marocchino/sticky-pull-request-comment@v2
+ with:
+ header: api-check
+ message: ${{ steps.report.outputs.REPORT }}
+
+ - name: Stop container
+ if: always()
+ run: docker stop vhp4safety && docker rm vhp4safety
diff --git a/src/api.py b/src/api.py
index 646a88c..d2b59a6 100644
--- a/src/api.py
+++ b/src/api.py
@@ -33,48 +33,87 @@
# -- Marshmallow Schemas ---------------------------------------------------
class ToolSchema(Schema):
- id = fields.Str()
- service = fields.Str()
- description = fields.Str()
- stage = fields.Str()
- main_url = fields.Str()
- inst_url = fields.Str()
- html_name = fields.Str()
- png_file_name = fields.Str()
+ id = fields.Str(metadata={"example": "cdkdepict"})
+ service = fields.Str(metadata={
+ "example": "CDK Depict",
+ "description": "Human-readable tool name"})
+ description = fields.Str(metadata={
+ "example": "A webservice for generating chemical "
+ "structure images from SMILES inputs."})
+ stage = fields.Str(metadata={
+ "example": "Other",
+ "description": "Safety-assessment workflow stage"})
+ main_url = fields.Str(metadata={
+ "example": "https://www.simolecule.com/cdkdepict/depict.html"})
+ inst_url = fields.Str(metadata={
+ "example": "https://cdkdepict.cloud.vhp4safety.nl/"})
+ html_name = fields.Str(metadata={"example": "cdkdepict.html"})
+ png_file_name = fields.Str(metadata={"example": "cdkdepict.png"})
class MethodSchema(Schema):
- id = fields.Str()
- method = fields.Str()
- description = fields.Str()
- stage = fields.Str()
- substage = fields.Str()
- catalog_webpage_url = fields.Str()
- raw = fields.Dict(load_default=None)
+ id = fields.Str(metadata={
+ "example": "5_cfda_assay_to_determine_cytotoxicity"})
+ method = fields.Str(metadata={
+ "example": "5-CFDA assay to determine cytotoxicity",
+ "description": "Human-readable method name"})
+ description = fields.Str(metadata={
+ "example": "Fluorescence-based determination "
+ "of cell membrane damage"})
+ stage = fields.Str(metadata={"example": "Adverse Outcome"})
+ substage = fields.Str(metadata={
+ "example": "Cell death, Adverse outcome"})
+ catalog_webpage_url = fields.Str(metadata={
+ "example": "https://www.thermofisher.com/order/"
+ "catalog/product/C1354"})
+    raw = fields.Dict(load_default=None, metadata={
+        "description": "Full upstream fields "
+        "from the methods catalog index"})
class RegulatoryQuestionSchema(Schema):
- key = fields.Str()
- label = fields.Str()
- explanation = fields.Str()
+ key = fields.Str(metadata={"example": "reg_q_1a"})
+ label = fields.Str(metadata={
+ "example": "Kidney Case Study (a)"})
+ explanation = fields.Str(metadata={
+ "example": "What is the safe cisplatin dose "
+ "in cancer patients?"})
class StageExplanationSchema(Schema):
- name = fields.Str()
- explanation = fields.Str()
+ name = fields.Str(metadata={"example": "ADME"})
+ explanation = fields.Str(metadata={
+ "example": "Absorption, distribution, metabolism, "
+ "and excretion of a substance in a living organism, "
+ "following exposure."})
class CaseStudySchema(Schema):
- slug = fields.Str()
- title = fields.Str()
- description = fields.Str()
- image_src = fields.Str()
- config_repo = fields.Str()
- default_branch = fields.Str()
+ name = fields.Str(
+ attribute="slug",
+ metadata={"description": "Short identifier used in URLs",
+ "example": "kidney"})
+ title = fields.Str(metadata={
+ "example": "Kidney case study"})
+ description = fields.Str(metadata={
+ "example": "To study kidney disease "
+ "and pharmacovigilance."})
+ image_src = fields.Str(metadata={
+ "example": "/static/images/image43_hexagon.svg"})
+ config_repo = fields.Str(metadata={
+ "example": "VHP4Safety/ui-casestudy-config"})
+ default_branch = fields.Str(metadata={
+ "example": "main"})
class CaseStudyDetailSchema(CaseStudySchema):
- content_json = fields.Raw(load_default=None)
+ content_json = fields.Raw(
+ load_default=None,
+ metadata={
+ "description":
+ "Full nested JSON driving the case-study UI "
+ "(intro text, regulatory questions, "
+ "process-flow steps)"})
class CompoundSummarySchema(Schema):
@@ -115,7 +154,9 @@ class CompoundDetailSchema(Schema):
class DataSearchQuerySchema(Schema):
- query = fields.Str(load_default="")
+ query = fields.Str(
+ load_default="",
+ metadata={"example": "kidney"})
page = fields.Int(load_default=1)
size = fields.Int(load_default=18)
@@ -132,8 +173,12 @@ class DataResultSchema(Schema):
class SearchQuerySchema(Schema):
- stage = fields.Str(load_default=None)
- search = fields.Str(load_default="")
+ stage = fields.Str(
+ load_default=None,
+ metadata={"example": "Other"})
+ search = fields.Str(
+ load_default="",
+ metadata={"example": ""})
# -- Blueprints ------------------------------------------------------------
@@ -162,7 +207,11 @@ class SearchQuerySchema(Schema):
@tools_bp.arguments(SearchQuerySchema, location="query")
@tools_bp.response(200, ToolSchema(many=True))
def list_tools(args):
- """List all tools, with optional stage/search filters."""
+ """List all tools, with optional stage/search filters.
+
+ Returns every tool (service) registered on the platform.
+ Filter by workflow stage or free-text search on the tool name.
+ """
conn = get_conn()
sql = "SELECT * FROM tools WHERE 1=1"
params = []
@@ -179,9 +228,12 @@ def list_tools(args):
@tools_bp.route("/")
+@tools_bp.doc(parameters=[{
+ "name": "tool_id", "in": "path",
+ "example": "cdkdepict"}])
@tools_bp.response(200, ToolSchema)
def get_tool(tool_id):
- """Get a single tool by ID."""
+ """Get a single tool by its ID."""
conn = get_conn()
row = conn.execute("SELECT * FROM tools WHERE id = ?", (tool_id,)).fetchone()
conn.close()
@@ -196,7 +248,11 @@ def get_tool(tool_id):
@methods_bp.arguments(SearchQuerySchema, location="query")
@methods_bp.response(200, MethodSchema(many=True))
def list_methods(args):
- """List all methods, with optional stage/search filters."""
+ """List all methods, with optional stage/search filters.
+
+ Methods describe experimental or computational procedures
+ used in safety-assessment workflows.
+ """
conn = get_conn()
sql = "SELECT * FROM methods WHERE 1=1"
params = []
@@ -213,9 +269,16 @@ def list_methods(args):
@methods_bp.route("/")
+@methods_bp.doc(parameters=[{
+ "name": "method_id", "in": "path",
+ "example": "5_cfda_assay_to_determine_cytotoxicity"}])
@methods_bp.response(200, MethodSchema)
def get_method(method_id):
- """Get a single method by ID."""
+ """Get a single method by ID, including full upstream fields.
+
+    The ``raw`` field contains every field from the upstream
+    methods catalog (AOP references, key events, etc.).
+ """
conn = get_conn()
row = conn.execute("SELECT * FROM methods WHERE id = ?", (method_id,)).fetchone()
conn.close()
@@ -232,7 +295,11 @@ def get_method(method_id):
@reg_q_bp.route("/")
@reg_q_bp.response(200, RegulatoryQuestionSchema(many=True))
def list_regulatory_questions():
- """List all regulatory questions."""
+ """List the six regulatory questions that link tools to case studies.
+
+ Each question is tied to a case study pair (a/b).
+ For example, ``reg_q_1a`` = *"Kidney Case Study (a)"*.
+ """
conn = get_conn()
rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
conn.close()
@@ -244,7 +311,11 @@ def list_regulatory_questions():
@stages_bp.route("/")
@stages_bp.response(200, StageExplanationSchema(many=True))
def list_stages():
- """List all safety-assessment workflow stages."""
+ """List all safety-assessment workflow stages.
+
+ Stages are the high-level phases of the VHP4Safety
+ process flow: ADME, Hazard Assessment, etc.
+ """
conn = get_conn()
rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
conn.close()
@@ -256,19 +327,32 @@ def list_stages():
@casestudies_bp.route("/")
@casestudies_bp.response(200, CaseStudySchema(many=True))
def list_case_studies():
- """List all case studies."""
+ """List the three VHP4Safety case studies (summary only).
+
+ Returns name, title, description, and image for each.
+ Use the detail endpoint for the full content JSON.
+
+ Available names: ``kidney``, ``parkinson``, ``thyroid``.
+ """
conn = get_conn()
rows = conn.execute("SELECT * FROM case_studies").fetchall()
conn.close()
return [dict(r) for r in rows]
-@casestudies_bp.route("/")
+@casestudies_bp.route("/")
+@casestudies_bp.doc(parameters=[{
+ "name": "name", "in": "path",
+ "example": "kidney"}])
@casestudies_bp.response(200, CaseStudyDetailSchema)
-def get_case_study(slug):
- """Get a case study with its full content JSON."""
+def get_case_study(name):
+ """Get a case study by name, including its full content JSON.
+
+ The content JSON contains the intro text, regulatory questions,
+ and process-flow workflow steps that drive the case-study UI.
+ """
conn = get_conn()
- row = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (slug,)).fetchone()
+ row = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (name,)).fetchone()
conn.close()
if not row:
abort(404, message="Case study not found")
@@ -281,9 +365,17 @@ def get_case_study(slug):
# -- Compounds (SPARQL-backed) ---------------------------------------------
@compounds_bp.route("/<cwid>")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path",
+ "description": "Wikidata compound ID",
+ "example": "Q2270"}])
@compounds_bp.response(200, CompoundDetailSchema)
def get_compound(cwid):
- """Get full compound data."""
+ """Get full compound data from Wikidata via SPARQL.
+
+ Returns summary properties, external identifiers,
+ toxicology data, and experimental measurements.
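+
+ Example (assuming the blueprint is mounted at ``/compounds``)::
+
+     GET /compounds/Q2270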
+ """
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -293,9 +385,11 @@ def get_compound(cwid):
@compounds_bp.route("//properties")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path", "example": "Q2270"}])
@compounds_bp.response(200, CompoundSummarySchema)
def get_compound_properties(cwid):
- """Get core compound identifiers."""
+ """Get core compound properties (formula, mass, InChI, SMILES)."""
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -308,9 +402,11 @@ def get_compound_properties(cwid):
@compounds_bp.route("//identifiers")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path", "example": "Q2270"}])
@compounds_bp.response(200, CompoundIdentifierSchema(many=True))
def get_compound_identifiers(cwid):
- """Get external identifiers."""
+ """Get external database identifiers (CAS, PubChem, ChEBI, etc.)."""
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -320,9 +416,11 @@ def get_compound_identifiers(cwid):
@compounds_bp.route("//toxicology")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path", "example": "Q2270"}])
@compounds_bp.response(200, CompoundToxicologySchema(many=True))
def get_compound_toxicology(cwid):
- """Get toxicology data."""
+ """Get toxicology data (LD50, LC50, etc.)."""
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -332,9 +430,11 @@ def get_compound_toxicology(cwid):
@compounds_bp.route("//experimental-data")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path", "example": "Q2270"}])
@compounds_bp.response(200, CompoundExpDataSchema(many=True))
def get_compound_exp_data(cwid):
- """Get experimental measurements."""
+ """Get experimental measurements (EC50, IC50, etc.)."""
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -349,7 +449,10 @@ def get_compound_exp_data(cwid):
@data_bp.arguments(DataSearchQuerySchema, location="query")
@data_bp.response(200, DataResultSchema)
def list_data(args):
- """Search datasets across BioStudies and Zenodo."""
+ """Search datasets across BioStudies and Zenodo repositories.
+
+ Returns paginated results from both sources with normalised metadata.
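+
+ Example (assuming the blueprint is mounted at ``/data``; ``query``,
+ ``page``, and ``size`` are the parameters read below)::
+
+     GET /data/?query=thyroid&page=1&size=18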
+ """
query = args.get("query", "")
page = args.get("page", 1)
size = args.get("size", 18)
@@ -383,9 +486,14 @@ def list_data(args):
@data_bp.route("/<data_id>")
+@data_bp.doc(parameters=[{
+ "name": "data_id", "in": "path", "example": "S-BSST1503"}])
@data_bp.response(200)
def get_data_detail(data_id):
- """Get normalized metadata for a single dataset."""
+ """Get normalised metadata for a single dataset by its accession ID.
+
+ Searches both BioStudies and Zenodo for the given identifier.
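+
+ Example (assuming the blueprint is mounted at ``/data``; ``S-BSST1503``
+ is the BioStudies accession used as the documented example)::
+
+     GET /data/S-BSST1503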
+ """
bs = BioStudiesExtractor(collection=BIOSTUDIES_COLLECTION)
zen = ZenodoExtractor(community=ZENODO_COMMUNITY, record_type=ZENODO_RECORD_TYPE)
bs_res = bs.search_studies(data_id, page=1, page_size=1)
@@ -522,7 +630,12 @@ def _validate_entity(entity_name, table, pydantic_model, id_attr, label_attr):
@validation_bp.route("/")
@validation_bp.response(200, ValidationReport)
def validate_all():
- """Full data completeness report."""
+ """Full data completeness report across all entity types.
+
+ Checks every row in tools, methods, case_studies,
+ regulatory_questions, and stage_explanations for missing fields.
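+
+ Example response (abridged; the full shape follows ``ValidationReport``,
+ and the timestamp is illustrative)::
+
+     {"generated_at": "2026-04-17T15:00:00+00:00", ...}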
+ """
from datetime import datetime, timezone
return {
"generated_at": datetime.now(timezone.utc).isoformat(),
@@ -534,9 +647,15 @@ def validate_all():
@validation_bp.route("/<entity>")
+@validation_bp.doc(parameters=[{
+ "name": "entity", "in": "path", "example": "tools"}])
@validation_bp.response(200, EntitySummary)
def validate_entity(entity):
- """Data completeness report for a single entity type."""
+ """Data completeness report for a single entity type.
+
+ Valid entity names: ``tools``, ``methods``, ``case_studies``,
+ ``regulatory_questions``, ``stage_explanations``.
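+
+ Example (assuming the blueprint is mounted at ``/validation``)::
+
+     GET /validation/tools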
+ """
if entity not in _ENTITY_REGISTRY:
abort(404, message=f"Unknown entity '{entity}'. Valid: {', '.join(_ENTITY_REGISTRY)}")
tbl, model, id_a, lbl_a = _ENTITY_REGISTRY[entity]