From b3d2eea52543d7066de9813c50cc9e36e2747d28 Mon Sep 17 00:00:00 2001
From: Javier
Date: Fri, 17 Apr 2026 16:59:01 +0200
Subject: [PATCH 1/2] Add data models and SQLite database
---
.gitignore | 2 +
Dockerfile | 5 +
app.py | 539 +++++------
entrypoint.sh | 11 +-
patch.py | 46 -
requirements.txt | 35 +-
src/__init__.py | 0
src/api.py | 564 ++++++++++++
src/casestudy_resolver.py | 298 ++++++
src/db.py | 75 ++
src/models/__init__.py | 0
src/models/casestudy.py | 209 +++++
src/models/cloud/method.py | 134 +++
src/models/cloud/tool.py | 98 ++
src/models/compound.py | 75 ++
src/models/data/__init__.py | 50 +
src/models/data/biostudies.py | 867 ++++++++++++++++++
src/models/data/mapping.py | 526 +++++++++++
src/models/data/schemas.py | 245 +++++
src/models/data/zenodo.py | 484 ++++++++++
src/models/platform.py | 56 ++
src/scheduler.py | 61 ++
src/seed.py | 279 ++++++
src/services/__init__.py | 0
src/services/compound.py | 204 +++++
src/sitemap.py | 59 ++
templates/base.html | 4 +
templates/case_studies/casestudies.html | 36 +-
templates/case_studies/casestudy_server.html | 229 +++++
...w.html => safety_assessment_workflow.html} | 0
30 files changed, 4812 insertions(+), 379 deletions(-)
delete mode 100644 patch.py
create mode 100644 src/__init__.py
create mode 100644 src/api.py
create mode 100644 src/casestudy_resolver.py
create mode 100644 src/db.py
create mode 100644 src/models/__init__.py
create mode 100644 src/models/casestudy.py
create mode 100644 src/models/cloud/method.py
create mode 100644 src/models/cloud/tool.py
create mode 100644 src/models/compound.py
create mode 100644 src/models/data/__init__.py
create mode 100644 src/models/data/biostudies.py
create mode 100644 src/models/data/mapping.py
create mode 100644 src/models/data/schemas.py
create mode 100644 src/models/data/zenodo.py
create mode 100644 src/models/platform.py
create mode 100644 src/scheduler.py
create mode 100644 src/seed.py
create mode 100644 src/services/__init__.py
create mode 100644 src/services/compound.py
create mode 100644 src/sitemap.py
create mode 100644 templates/case_studies/casestudy_server.html
rename templates/{Safety_Assessment_Workflow.html => safety_assessment_workflow.html} (100%)
diff --git a/.gitignore b/.gitignore
index a64738a..58802be 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@ __pycache__/
*.py[cod]
*$py.class
+# SQLite database
+data/*.db
# C extensions
*.so
diff --git a/Dockerfile b/Dockerfile
index 854e92a..90b4c14 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -21,9 +21,14 @@ COPY . .
# Install any needed packages specified in requirements.txt
RUN pip install -r requirements.txt
+# Create data directory for SQLite DB
+RUN mkdir -p /usr/src/app/data
+
# Copy entrypoint script
COPY entrypoint.sh /usr/src/app/entrypoint.sh
RUN chmod +x /usr/src/app/entrypoint.sh
+EXPOSE 5050
+
# Define the entrypoint script
ENTRYPOINT ["/usr/src/app/entrypoint.sh"]
diff --git a/app.py b/app.py
index e6aa3de..a3e0cec 100644
--- a/app.py
+++ b/app.py
@@ -1,6 +1,7 @@
################################################################################
### Loading the required modules
import json
+import os
import re
import requests
@@ -13,10 +14,15 @@
# from wikidataintegrator import wdi_core
from wikibaseintegrator import wbi_helpers
-# Import BioStudies extractor
-from data.biostudies.search import BioStudiesExtractor
-from data.zenodo.search import ZenodoExtractor
-from data.mapping import normalize_all
+# Data extractors (API wrappers — no DB needed)
+from src.models.data.biostudies import BioStudiesExtractor
+from src.models.data.zenodo import ZenodoExtractor
+from src.models.data.mapping import normalize_all
+
+# Database layer
+from src.db import get_conn, init_db
+from src.api import init_api
+from src.casestudy_resolver import resolve as resolve_casestudy
################################################################################
CACHE_TIMEOUT = 60 * 60 * 24 * 5 # 5 days -- [Ozan] I created a separate
@@ -62,7 +68,7 @@
},
"reg_q_2b": {
"label": "Parkinson Case Study (b)",
- "explanation": "What level of exposure to compound Dinoseb leads to risk for developing Parkinson’s disease?",
+ "explanation": "What level of exposure to compound Dinoseb leads to risk for developing Parkinson's disease?",
},
"reg_q_3a": {
"label": "Thyroid Case Study (a)",
@@ -103,9 +109,16 @@ def __init__(self, url_map, *items):
"CACHE_SERVICE_TIMEOUT": CACHE_TIMEOUT_SERVICE
}
app = Flask(__name__)
+app.secret_key = os.environ.get(
+ "FLASK_SECRET_KEY", "dev-insecure-key"
+)
app.config.from_mapping(cache_config)
cache = Cache(app)
+# Database init and API registration
+init_db()
+init_api(app)
+
@cache.memoize(timeout=CACHE_TIMEOUT)
def get_json_dict(url: str, timeout: int = 5) -> dict:
@@ -204,42 +217,25 @@ def get_repository_data(
# Provide methods list to all templates for the Methods dropdown in the navbar
@app.context_processor
def inject_methods_menu():
- """Fetch methods_index.json and expose a simple list of {id, title} to templates.
- Return an empty list on any error to avoid breaking pages.
- """
- data = get_json_dict(METHODS_URL)
- if data:
- items = []
- for key, val in data.items() if isinstance(data, dict) else []:
- title = (
- val.get("method")
- or val.get("method_name_content")
- or val.get("method_name")
- or key
- )
- items.append({"id": key, "title": title})
- # sort by title
- items = sorted(items, key=lambda x: x["title"].lower())
- return {"methods_menu": items}
- else:
+ """Expose methods list to all templates for navbar dropdown."""
+ try:
+ conn = get_conn()
+ rows = conn.execute("SELECT id, method FROM methods ORDER BY method").fetchall()
+ conn.close()
+ return {"methods_menu": [{"id": r["id"], "title": r["method"]} for r in rows]}
+ except Exception:
return {"methods_menu": []}
@app.context_processor
def inject_tools_menu():
- """Fetch methods_index.json and expose a simple list of {id, title} to templates.
- Return an empty list on any error to avoid breaking pages.
- """
- data = get_json_dict_service(SERVICES_URL)
- if data:
- items = []
- for key, val in data.items() if isinstance(data, dict) else []:
- title = val.get("service") or key
- items.append({"id": key, "title": title})
- # sort by title
- items = sorted(items, key=lambda x: x["title"].lower())
- return {"tools_menu": items}
- else:
+ """Expose tools list to all templates for navbar dropdown."""
+ try:
+ conn = get_conn()
+ rows = conn.execute("SELECT id, service FROM tools ORDER BY service").fetchall()
+ conn.close()
+ return {"tools_menu": [{"id": r["id"], "title": r["service"]} for r in rows]}
+ except Exception:
return {"tools_menu": []}
@@ -269,17 +265,12 @@ def inject_data_menu():
### The landing page
@app.route("/")
def home():
- try:
- tools = get_json_dict_service(
- SERVICES_URL
- ) # Geting the service_list.json in the dictionary format.
- tools = list(tools.values()) # Converting the dictionary to a list object.
- except Exception as e:
- return f"Error processing service data: {e}", 500
- num_tools = len(tools)
- num_case_studies = len(CASESTUDIES)
+ conn = get_conn()
+ num_tools = conn.execute("SELECT COUNT(*) FROM tools").fetchone()[0]
+ num_case_studies = conn.execute("SELECT COUNT(*) FROM case_studies").fetchone()[0]
+ conn.close()
bs_res, zen_res = get_repository_data(search_query="")
- num_datasets = bs_res["total"] + zen_res["total"]
+ num_datasets = bs_res.get("total", 0) + zen_res.get("total", 0)
return render_template(
"home.html",
num_tools=num_tools,
@@ -292,26 +283,34 @@ def home():
### The sitemap.xml for search engines
@app.route("/sitemap.xml")
def sitemap():
-    sitemapContent = """
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-    <url>
-        <loc>https://platform.vhp4safety.nl/</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/casestudies</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/tools</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/methods</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/data</loc>
-    </url>
-""";
-    return Response(sitemapContent, mimetype='text/xml');
+    # Prefer generated static sitemap if present (created by src.sitemap)
+    path = os.path.join(os.path.dirname(__file__), "static", "sitemap.xml")
+    if os.path.exists(path):
+        with open(path, "rb") as fh:
+            return Response(fh.read(), mimetype="application/xml")
+
+    # Fallback minimal sitemap
+    sitemapContent = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    <url>
+        <loc>https://platform.vhp4safety.nl/</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/casestudies</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/tools</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/methods</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/data</loc>
+    </url>
+"""
+    return Response(sitemapContent, mimetype="text/xml")
################################################################################
@@ -529,112 +528,94 @@ def models():
### Pages under 'Tools'
-### Here begins the updated version for creating the tool list page.
@app.route("/tools")
def tools():
try:
- tools = get_json_dict_service(
- SERVICES_URL
- ) # Geting the service_list.json in the dictionary format.
- tools = list(tools.values()) # Converting the dictionary to a list object.
-
- # Mapping the URLs with glossary IDs to their text values.
- stage_mapping = {
- "https://vhp4safety.github.io/glossary#VHP0000056": "ADME",
- "https://vhp4safety.github.io/glossary#VHP0000102": "Hazard Assessment",
- "https://vhp4safety.github.io/glossary#VHP0000148": "Chemical Information",
- "https://vhp4safety.github.io/glossary#VHP0000149": "General",
- }
-
- for tool in tools:
- full_stage_url = tool.get("stage", "")
-
- # Writing the service name and stage values in the logs for troubleshooting.
- # print(f"Tool: {tool['service']}, Stage URL: {full_stage_url}") # Log the full URL
-
- # Checking if the full URL is in the mapping and updating the stage.
- if full_stage_url in stage_mapping:
- # print(f"Mapping stage URL {full_stage_url} to {stage_mapping[full_stage_url]}") # Log the mapping
- tool["stage"] = stage_mapping[full_stage_url]
- elif tool["stage"] in ["NA", "Unknown"]:
- tool["stage"] = (
- "Other" # Combining "NA" and "Unknown" stages in a single stage-type, "Other".
- )
-
- html_name = tool.get("html_name")
- md_name = tool.get("md_file_name")
- png_name = tool.get("png_file_name")
-
- tool["url"] = f"https://cloud.vhp4safety.nl/service/{html_name}"
- tool["meta_data"] = (
- f"https://raw.githubusercontent.com/VHP4Safety/cloud/main/docs/service/{md_name}"
- if md_name
- else "md file not found"
- )
-
- # Check if the tool has the placeholder logo
- placeholder_logo = "https://github.com/VHP4Safety/ui-design/blob/main/static/images/logo.png"
- if png_name == placeholder_logo:
- tool["png"] = None # set to None if it's the common placeholder
- else:
- tool["png"] = (
- f"https://raw.githubusercontent.com/VHP4Safety/cloud/main/docs/service/{png_name}"
- if not png_name.startswith("http")
- else png_name
- )
-
- inst_url = tool.get("inst_url", "no_url")
- if not inst_url: # catches "" as well
- inst_url = "no_url"
- tool["inst_url"] = inst_url
+ conn = get_conn()
- # Getting selected stages from the URL.
selected_stages = request.args.getlist("stage")
+ search_query = request.args.get("search", "").strip().lower()
- # Filtering tools by selected stages.
+ sql = "SELECT * FROM tools WHERE 1=1"
+ params = []
if selected_stages:
- tools = [tool for tool in tools if tool.get("stage") in selected_stages]
-
- # Getting all unique stages from the tools for the filter options.
- stages = sorted(set(tool.get("stage") for tool in tools if tool.get("stage")))
-
- # Forcing "Other" to be the last item in the list of stages.
- if "Other" in stages:
- stages.remove("Other")
- stages.append("Other")
+ placeholders = ",".join("?" * len(selected_stages))
+ sql += f" AND stage IN ({placeholders})"
+ params.extend(selected_stages)
+ if search_query:
+ sql += " AND LOWER(service) LIKE ?"
+ params.append(f"%{search_query}%")
+ sql += " ORDER BY service"
+ rows = conn.execute(sql, params).fetchall()
- # Filtering over the regulatory questions.
- reg_questions = {v["label"]: k for k, v in REG_QUESTIONS.items()}
+ # Build reg_questions lookup from DB
+ rq_rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
+ reg_questions = {r["label"]: r["key"] for r in rq_rows}
+ # Apply regulatory question filters
selected_questions = request.args.getlist("reg_q")
+ tools_list = []
+ for row in [dict(r) for r in rows]:
+ raw = json.loads(row["raw_json"]) if row.get("raw_json") else {}
+ # Check reg question filters
+ skip = False
+ for question in selected_questions:
+ field = reg_questions.get(question)
+ if field and str(raw.get(field, "")).lower() != "true":
+ skip = True
+ break
+ if skip:
+ continue
+
+ html_name = row["html_name"]
+ png_name = row["png_file_name"]
+ placeholder = (
+ "https://github.com/VHP4Safety/ui-design"
+ "/blob/main/static/images/logo.png"
+ )
- for question in selected_questions:
- field = reg_questions.get(question)
- if field:
- tools = [
- tool for tool in tools if str(tool.get(field, "")).lower() == "true"
- ]
-
- # Getting the search query from URL to add a search bar based on tool names.
- search_query = request.args.get("search", "").strip().lower()
-
- # Filtering tools by search query.
- if search_query:
- tools = [
- tool
- for tool in tools
- if search_query in tool.get("service", "").lower()
- ]
+ tools_list.append({
+ "id": row["id"],
+ "service": row["service"],
+ "description": row["description"],
+ "stage": row["stage"],
+ "html_name": html_name,
+ "url": f"https://cloud.vhp4safety.nl/service/{html_name}",
+ "inst_url": row["inst_url"] or "no_url",
+ "png": (
+ None if png_name == placeholder else
+ f"https://raw.githubusercontent.com/VHP4Safety/cloud/main/docs/service/{png_name}"
+ if png_name and not png_name.startswith("http")
+ else png_name
+ ),
+ **raw,
+ })
+
+ # Collect stages for filter sidebar
+ all_stages = sorted(set(
+ t["stage"] for t in tools_list if t.get("stage")
+ ))
+ if "Other" in all_stages:
+ all_stages.remove("Other")
+ all_stages.append("Other")
+
+ # Stage / reg question explanations from DB
+ se_rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
+ stage_explanations = {s["name"]: s["explanation"] for s in se_rows}
+ reg_question_explanations = {
+ r["label"]: r["explanation"] for r in rq_rows
+ }
+ conn.close()
return render_template(
"tools/tools.html",
- tools=tools,
- stages=stages,
+ tools=tools_list,
+ stages=all_stages,
selected_stages=selected_stages,
reg_questions=reg_questions,
selected_questions=selected_questions,
- stage_explanations=STAGE_EXPLANATIONS,
- reg_question_explanations=REG_QUESTION_EXPLANATIONS,
+ stage_explanations=stage_explanations,
+ reg_question_explanations=reg_question_explanations,
)
except Exception as e:
@@ -645,100 +626,68 @@ def tools():
@app.route("/methods")
@app.route("/methods/")
def methods():
- """Fetch methods_index.json from the cloud repo, normalize fields and render a methods list page."""
- url = "https://raw.githubusercontent.com/VHP4Safety/cloud/refs/heads/main/cap/methods_index.json"
- response = requests.get(url)
-
- if response.status_code != 200:
- return f"Error fetching methods list: {response.status_code}", 503
-
+ """Render methods list page from DB."""
try:
- methods = response.json()
- methods = list(methods.values()) # convert dict to list
+ conn = get_conn()
- # Normalize fields for the template and collect stages
- stages_set = set()
- normalized = []
- for m in methods:
- norm = {}
- norm["id"] = m.get("id", "")
- # template expects 'service' and 'description'
- norm["service"] = (
- m.get("method")
- or m.get("method_name_content")
- or m.get("method_name")
- or ""
- )
- norm["description"] = (
- m.get("method_description_content") or m.get("method_description") or ""
- )
- # main_url used for method webpage (catalog page)
- norm["main_url"] = m.get("catalog_webpage_url") or "no_url"
- # interactive instance not present in methods index
- norm["inst_url"] = m.get("inst_url") or "no_url"
- # metadata md file not available in index; keep empty string
- norm["meta_data"] = m.get("meta_data") or ""
- # placeholder/no png
- norm["png"] = None
- # keep original raw data for potential details page
- norm["raw"] = m
-
- # collect stages (split comma-separated values)
- stage_field = (m.get("vhp4safety_workflow_stage_content") or "").strip()
- if stage_field:
- for part in [s.strip() for s in stage_field.split(",")]:
- if part:
- stages_set.add(part)
-
- normalized.append(norm)
-
- # Apply search and filters similar to /tools
selected_stages = request.args.getlist("stage")
- selected_questions = request.args.getlist("reg_q")
search_query = request.args.get("search", "").strip().lower()
- methods_filtered = normalized
+ sql = "SELECT * FROM methods WHERE 1=1"
+ params = []
+ if search_query:
+ sql += " AND LOWER(method) LIKE ?"
+ params.append(f"%{search_query}%")
+ sql += " ORDER BY method"
+ rows = [dict(r) for r in conn.execute(sql, params).fetchall()]
- if selected_stages:
- methods_filtered = [
- m
- for m in methods_filtered
- if any(
- s
- in (
- (m["raw"].get("vhp4safety_workflow_stage_content") or "").split(
- ","
- )
- )
- for s in selected_stages
- )
- ]
-
- # Filter by regulatory questions if provided (REG_QUESTIONS keys map to internal fields)
- reg_questions = {v["label"]: k for k, v in REG_QUESTIONS.items()}
- if selected_questions:
+ rq_rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
+ reg_questions = {r["label"]: r["key"] for r in rq_rows}
+ selected_questions = request.args.getlist("reg_q")
+
+ stages_set = set()
+ methods_filtered = []
+ for row in rows:
+ raw = json.loads(row["raw_json"]) if row.get("raw_json") else {}
+ stage_field = (row.get("stage") or "").strip()
+ parts = [s.strip() for s in stage_field.split(",") if s.strip()]
+ stages_set.update(parts)
+
+ if selected_stages and not any(s in parts for s in selected_stages):
+ continue
+
+ skip = False
for question in selected_questions:
field = reg_questions.get(question)
- if field:
- methods_filtered = [
- m
- for m in methods_filtered
- if str(m["raw"].get(field, "")).lower() == "true"
- ]
-
- if search_query:
- methods_filtered = [
- m
- for m in methods_filtered
- if search_query in m.get("service", "").lower()
- ]
+ if field and str(raw.get(field, "")).lower() != "true":
+ skip = True
+ break
+ if skip:
+ continue
+
+ methods_filtered.append({
+ "id": row["id"],
+ "service": row["method"],
+ "description": row.get("description") or "",
+ "main_url": row.get("catalog_webpage_url") or "no_url",
+ "inst_url": "no_url",
+ "meta_data": "",
+ "png": None,
+ "raw": raw,
+ })
stages = sorted(stages_set)
if "Other" in stages:
stages.remove("Other")
stages.append("Other")
- # Pass everything the template expects
+ se_rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
+ stage_explanations = {s["name"]: s["explanation"] for s in se_rows}
+ reg_question_explanations = {
+ r["label"]: r["explanation"] for r in rq_rows
+ }
+ conn.close()
+
return render_template(
"methods/methods.html",
methods=methods_filtered,
@@ -746,8 +695,8 @@ def methods():
selected_stages=selected_stages,
reg_questions=reg_questions,
selected_questions=selected_questions,
- stage_explanations=STAGE_EXPLANATIONS,
- reg_question_explanations=REG_QUESTION_EXPLANATIONS,
+ stage_explanations=stage_explanations,
+ reg_question_explanations=reg_question_explanations,
)
except Exception as e:
@@ -756,38 +705,29 @@ def methods():
@app.route("/methods/")
def method_page(methodid):
- """Render a single method page using templates/methods/method.html
- Method details are taken from methods_index.json (keyed by method id).
- """
- try:
- methods = get_json_dict(METHODS_URL)
- # methods_index.json is a dict keyed by method id
- if methodid not in methods:
- abort(404)
- method_details = methods[methodid]
- except Exception as e:
- return f"Error processing methods data: {e}", 500
+ """Render a single method detail page."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM methods WHERE id = ?", (methodid,)).fetchone()
+ conn.close()
+ if not row:
+ abort(404)
+
+ method_details = json.loads(row["raw_json"]) if row["raw_json"] else {}
- # Try to load the full method JSON from the docs/methods folder (raw github)
- method_json = None
- # URL-encode the filename part to be safe
+ # Try to load full JSON from GitHub docs/methods/
+ method_json = method_details
encoded = urllib.parse.quote(methodid, safe="")
raw_url = (
- "https://raw.githubusercontent.com/VHP4Safety/cloud/refs/heads/main/docs/methods/"
- + f"{encoded}.json"
+ "https://raw.githubusercontent.com/VHP4Safety/cloud"
+ f"/refs/heads/main/docs/methods/{encoded}.json"
)
try:
r = requests.get(raw_url, timeout=5)
if r.status_code == 200:
method_json = r.json()
- else:
- # fall back to using the index entry as minimal data
- method_json = method_details
- except Exception as exc:
- # on any error, fall back to index entry
- method_json = method_details
+ except Exception:
+ pass
- # Pass both to the template: some templates expect method_json, others method_details
return render_template(
"methods/method.html",
method=method_details,
@@ -798,37 +738,27 @@ def method_page(methodid):
@app.route("/tools/")
def tool_page(toolname):
- # get the tools metadata:
- try:
- tools = get_json_dict_service(SERVICES_URL)
- tools = dict(tools)
- # Geting the service_list.json in the dictionary format.
- # Converting the dictionary to a list object.
- except Exception as e:
- return f"Error processing service data: {e}", 500
-
- # Map toolname to the correct JSON file in the new tool folder
- if toolname not in tools:
+ """Render a single tool detail page."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM tools WHERE id = ?", (toolname,)).fetchone()
+ conn.close()
+ if not row:
abort(404)
- # get the tools metadata:
- url = "https://cloud.vhp4safety.nl/service/" + toolname + ".json"
- response = requests.get(url)
-
- if response.status_code != 200:
- return f"Error fetching service list: {response.status_code}", 503
+ tool_json = json.loads(row["raw_json"]) if row["raw_json"] else {}
+ # Fetch full details from cloud service JSON
+ url = f"https://cloud.vhp4safety.nl/service/{toolname}.json"
try:
- tool_details = response.json()
- tool_details = dict(tool_details)
- # Geting the service_list.json in the dictionary format.
- # Converting the dictionary to a list object.
- except Exception as e:
- return f"Error processing service data: {e}", 500
+ resp = requests.get(url, timeout=10)
+ tool_details = resp.json() if resp.status_code == 200 else tool_json
+ except Exception:
+ tool_details = tool_json
- # Pass the json filename to the template (for JS to pick up)
return render_template(
- "tools/tool.html", tool_json=tools[toolname], tool_details=tool_details
+ "tools/tool.html",
+ tool_json=tool_json,
+ tool_details=tool_details,
)
@@ -837,31 +767,45 @@ def tool_page(toolname):
# General Safety Assessment Workflow page
-@app.route("/Safety_Assessment_Workflow")
+@app.route("/safety_assessment_workflow")
def SafetyAssessmentWorkflow():
- return render_template("Safety_Assessment_Workflow.html")
+ return render_template("safety_assessment_workflow.html")
################################################################################
### Pages under 'Case Studies'
-# General case studies page
@app.route("/casestudies")
def workflows():
- return render_template("case_studies/casestudies.html")
+ conn = get_conn()
+ cards = conn.execute("SELECT * FROM case_studies").fetchall()
+ conn.close()
+ return render_template(
+ "case_studies/casestudies.html", cards=[dict(c) for c in cards]
+ )
-# Individual case study page, dynamically filled based on URL
-@app.route("/casestudies/", defaults={"step": ""})
-@app.route("/casestudies//")
-@app.route("/casestudies///")
-# additional routes are parsed client side via js to allow smooth animation
-def casestudy(case:str="", question:str="", step:str=""):
- if case not in CASESTUDIES:
+@app.route("/casestudies/")
+@app.route("/casestudies//")
+def casestudy(case: str, subpath: str = ""):
+ conn = get_conn()
+ cs = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (case,)).fetchone()
+ conn.close()
+ if not cs:
abort(404)
- # JS will handle steps via the URL
- return render_template("case_studies/casestudy.html", case=case)
+
+ parts = [
+ p for p in subpath.split("/") if p
+ ] if subpath else []
+
+ step = resolve_casestudy(case, parts)
+ if step is None:
+ abort(404)
+
+ return render_template(
+ "case_studies/casestudy_server.html", step=step
+ )
@app.route("/workflow/")
@@ -1121,5 +1065,8 @@ def privacy_policy():
return render_template("legal/privacypolicy.html")
+from src.scheduler import init_scheduler
+init_scheduler(app)
+
if __name__ == "__main__":
app.run(host="0.0.0.0", port=5050, debug=True)
diff --git a/entrypoint.sh b/entrypoint.sh
index cd96440..56a3e46 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -1,4 +1,11 @@
#!/bin/sh
+set -e
-# Start Flask app
-python app.py
+echo "==> Seeding database..."
+python -m src.seed
+
+echo "==> Generating sitemap..."
+python -m src.sitemap || echo "sitemap generation failed; continuing"
+
+echo "==> Starting Flask app..."
+exec python app.py
diff --git a/patch.py b/patch.py
deleted file mode 100644
index 5cd790a..0000000
--- a/patch.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from importlib import import_module
-from pathlib import Path
-
-def apply_patch():
- try:
- # Dynamically import the module and get its file path
- try:
- module = import_module('pyshexc.parser.ShExDocLexer')
- except ModuleNotFoundError as e:
- # Give a precise, actionable hint for installation in the active interpreter
- print(
- "Missing dependency: 'pyshexc' (PyShExC).\n"
- "Install it in the same environment you're using to run this script.\n"
- "Examples:\n"
- " python -m pip install PyShExC\n"
- " # or with uv: uv pip install PyShExC\n"
- " # or poetry: poetry add PyShExC\n"
- " # or conda: conda install -c conda-forge pyshexc\n"
- )
- return
-
- file_path = Path(module.__file__)
-
- if not file_path.exists():
- raise FileNotFoundError(f"Could not find the file: {file_path}")
-
- # Read the file content
- file_content = file_path.read_text()
-
- # Replace 'from typing.io import TextIO' with 'from typing import TextIO'
- new_content = file_content.replace("from typing.io import TextIO", "from typing import TextIO")
-
- # Only write if a change is needed
- if new_content != file_content:
- file_path.write_text(new_content)
- print("Patch applied successfully!")
- else:
- print("No patch needed; target text not found (already patched or different version).")
-
- except FileNotFoundError as e:
- print(e)
- except Exception as e:
- print(f"An error occurred: {e}")
-
-if __name__ == "__main__":
- apply_patch()
diff --git a/requirements.txt b/requirements.txt
index 3e7ed1c..95607e1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,30 @@
-flask>=3.1.3
+annotated-types==0.7.0
+backoff==2.2.1
+blinker==1.9.0
+cachelib==0.13.0
+certifi==2026.2.25
+charset-normalizer==3.4.7
+click==8.3.2
+flask==3.1.3
flask-caching==2.3.1
+idna==3.11
+itsdangerous==2.2.0
+jinja2==3.1.6
+markupsafe==3.0.3
+mwoauth==0.4.0
+oauthlib==3.3.1
+pydantic==2.13.2
+pydantic-core==2.46.2
+pyjwt==2.12.1
requests==2.32.4
-#wikidataintegrator==0.9.30
-setuptools==78.1.1 # Provides pkg_resources module, required for wikidataintegrator
-werkzeug>=3.0.6
-#pyBiodatafuse @ git+https://github.com/BioDataFuse/pyBiodatafuse.git
-wikibaseintegrator>=0.12.14
-
+requests-oauthlib==2.0.0
+setuptools==78.1.1
+typing-extensions==4.15.0
+typing-inspection==0.4.2
+ujson==5.12.0
+urllib3==2.6.3
+werkzeug==3.1.8
+wikibaseintegrator==0.12.15
+flask-smorest>=0.44
+marshmallow>=3.20
+APScheduler>=3.10,<4
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/api.py b/src/api.py
new file mode 100644
index 0000000..646a88c
--- /dev/null
+++ b/src/api.py
@@ -0,0 +1,564 @@
+"""RESTful API with auto-generated OpenAPI documentation.
+
+Uses flask-smorest (marshmallow + OpenAPI 3) so Swagger UI is
+served automatically at /api/v1/docs.
+"""
+
+from __future__ import annotations
+
+import json
+
+from flask import Flask
+from flask_smorest import Api, Blueprint, abort
+from marshmallow import Schema, fields
+
+from src.db import get_conn
+from src.models.data.biostudies import BioStudiesExtractor
+from src.models.data.zenodo import ZenodoExtractor
+from src.models.data.mapping import normalize_all
+from src.services.compound import (
+ get_experimental_data,
+ get_full_compound,
+ get_identifiers,
+ get_properties,
+ get_toxicology,
+ is_valid_qid,
+)
+
+BIOSTUDIES_COLLECTION = "VHP4Safety"
+ZENODO_COMMUNITY = "vhp4safety"
+ZENODO_RECORD_TYPE = "dataset"
+
+
+# -- Marshmallow Schemas ---------------------------------------------------
+
+class ToolSchema(Schema):
+ id = fields.Str()
+ service = fields.Str()
+ description = fields.Str()
+ stage = fields.Str()
+ main_url = fields.Str()
+ inst_url = fields.Str()
+ html_name = fields.Str()
+ png_file_name = fields.Str()
+
+
+class MethodSchema(Schema):
+ id = fields.Str()
+ method = fields.Str()
+ description = fields.Str()
+ stage = fields.Str()
+ substage = fields.Str()
+ catalog_webpage_url = fields.Str()
+ raw = fields.Dict(load_default=None)
+
+
+class RegulatoryQuestionSchema(Schema):
+ key = fields.Str()
+ label = fields.Str()
+ explanation = fields.Str()
+
+
+class StageExplanationSchema(Schema):
+ name = fields.Str()
+ explanation = fields.Str()
+
+
+class CaseStudySchema(Schema):
+ slug = fields.Str()
+ title = fields.Str()
+ description = fields.Str()
+ image_src = fields.Str()
+ config_repo = fields.Str()
+ default_branch = fields.Str()
+
+
+class CaseStudyDetailSchema(CaseStudySchema):
+ content_json = fields.Raw(load_default=None)
+
+
+class CompoundSummarySchema(Schema):
+ wcid = fields.Str()
+ label = fields.Str()
+ inchi = fields.Str()
+ inchikey = fields.Str()
+ smiles = fields.Str(data_key="SMILES")
+ formula = fields.Str()
+ mass = fields.Str()
+
+
+class CompoundIdentifierSchema(Schema):
+ property_label = fields.Str(data_key="propertyLabel")
+ value = fields.Str()
+ formatter_url = fields.Str(data_key="formatterURL")
+
+
+class CompoundToxicologySchema(Schema):
+ property_label = fields.Str(data_key="propertyLabel")
+ value = fields.Str()
+
+
+class CompoundExpDataSchema(Schema):
+ property_label = fields.Str(data_key="propEntityLabel")
+ value = fields.Str()
+ units_label = fields.Str(data_key="unitsLabel")
+ source = fields.Str()
+ doi = fields.Str()
+ see_also = fields.Str(data_key="seeAlso")
+
+
+class CompoundDetailSchema(Schema):
+ summary = fields.Nested(CompoundSummarySchema)
+ identifiers = fields.List(fields.Nested(CompoundIdentifierSchema))
+ toxicology = fields.List(fields.Nested(CompoundToxicologySchema))
+ experimental_data = fields.List(fields.Nested(CompoundExpDataSchema))
+
+
+class DataSearchQuerySchema(Schema):
+ query = fields.Str(load_default="")
+ page = fields.Int(load_default=1)
+ size = fields.Int(load_default=18)
+
+
+class DataSourceResultSchema(Schema):
+ total = fields.Int()
+ hits = fields.List(fields.Dict())
+ error = fields.Str(allow_none=True)
+
+
+class DataResultSchema(Schema):
+ biostudies = fields.Nested(DataSourceResultSchema)
+ zenodo = fields.Nested(DataSourceResultSchema)
+
+
+class SearchQuerySchema(Schema):
+ stage = fields.Str(load_default=None)
+ search = fields.Str(load_default="")
+
+
+# -- Blueprints ------------------------------------------------------------
+
+tools_bp = Blueprint("tools", __name__, url_prefix="/api/tools",
+ description="Tool / service endpoints")
+methods_bp = Blueprint("methods", __name__, url_prefix="/api/methods",
+ description="Method endpoints")
+reg_q_bp = Blueprint("regulatory_questions", __name__,
+ url_prefix="/api/regulatory-questions",
+ description="Regulatory questions")
+stages_bp = Blueprint("stages", __name__, url_prefix="/api/stages",
+ description="Safety-assessment workflow stages")
+casestudies_bp = Blueprint("casestudies", __name__,
+ url_prefix="/api/casestudies",
+ description="Case study endpoints")
+compounds_bp = Blueprint("compounds", __name__, url_prefix="/api/compounds",
+ description="Compound data (SPARQL-backed)")
+data_bp = Blueprint("data", __name__, url_prefix="/api/data",
+ description="Dataset search (BioStudies + Zenodo)")
+
+
+# -- Tools -----------------------------------------------------------------
+
+@tools_bp.route("/")
+@tools_bp.arguments(SearchQuerySchema, location="query")
+@tools_bp.response(200, ToolSchema(many=True))
+def list_tools(args):
+ """List all tools, with optional stage/search filters."""
+ conn = get_conn()
+ sql = "SELECT * FROM tools WHERE 1=1"
+ params = []
+ if args.get("stage"):
+ sql += " AND stage = ?"
+ params.append(args["stage"])
+ if args.get("search"):
+ sql += " AND service LIKE ?"
+ params.append(f"%{args['search']}%")
+ sql += " ORDER BY service"
+ rows = conn.execute(sql, params).fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+@tools_bp.route("/")
+@tools_bp.response(200, ToolSchema)
+def get_tool(tool_id):
+ """Get a single tool by ID."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM tools WHERE id = ?", (tool_id,)).fetchone()
+ conn.close()
+ if not row:
+ abort(404, message="Tool not found")
+ return dict(row)
+
+
+# -- Methods ---------------------------------------------------------------
+
+@methods_bp.route("/")
+@methods_bp.arguments(SearchQuerySchema, location="query")
+@methods_bp.response(200, MethodSchema(many=True))
+def list_methods(args):
+ """List all methods, with optional stage/search filters."""
+ conn = get_conn()
+ sql = "SELECT * FROM methods WHERE 1=1"
+ params = []
+ if args.get("stage"):
+ sql += " AND stage LIKE ?"
+ params.append(f"%{args['stage']}%")
+ if args.get("search"):
+ sql += " AND method LIKE ?"
+ params.append(f"%{args['search']}%")
+ sql += " ORDER BY method"
+ rows = conn.execute(sql, params).fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+@methods_bp.route("/")
+@methods_bp.response(200, MethodSchema)
+def get_method(method_id):
+ """Get a single method by ID."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM methods WHERE id = ?", (method_id,)).fetchone()
+ conn.close()
+ if not row:
+ abort(404, message="Method not found")
+ d = dict(row)
+ if d.get("raw_json"):
+ d["raw"] = json.loads(d["raw_json"])
+ return d
+
+
+# -- Regulatory Questions --------------------------------------------------
+
+@reg_q_bp.route("/")
+@reg_q_bp.response(200, RegulatoryQuestionSchema(many=True))
+def list_regulatory_questions():
+ """List all regulatory questions."""
+ conn = get_conn()
+ rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+# -- Stages ----------------------------------------------------------------
+
+@stages_bp.route("/")
+@stages_bp.response(200, StageExplanationSchema(many=True))
+def list_stages():
+ """List all safety-assessment workflow stages."""
+ conn = get_conn()
+ rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+# -- Case Studies ----------------------------------------------------------
+
+@casestudies_bp.route("/")
+@casestudies_bp.response(200, CaseStudySchema(many=True))
+def list_case_studies():
+ """List all case studies."""
+ conn = get_conn()
+ rows = conn.execute("SELECT * FROM case_studies").fetchall()
+ conn.close()
+ return [dict(r) for r in rows]
+
+
+@casestudies_bp.route("/")
+@casestudies_bp.response(200, CaseStudyDetailSchema)
+def get_case_study(slug):
+ """Get a case study with its full content JSON."""
+ conn = get_conn()
+ row = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (slug,)).fetchone()
+ conn.close()
+ if not row:
+ abort(404, message="Case study not found")
+ d = dict(row)
+ if d.get("content_json"):
+ d["content_json"] = json.loads(d["content_json"])
+ return d
+
+
+# -- Compounds (SPARQL-backed) ---------------------------------------------
+
+@compounds_bp.route("/")
+@compounds_bp.response(200, CompoundDetailSchema)
+def get_compound(cwid):
+ """Get full compound data."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ return get_full_compound(cwid).model_dump()
+ except Exception as e:
+ abort(502, message=str(e))
+
+
+@compounds_bp.route("//properties")
+@compounds_bp.response(200, CompoundSummarySchema)
+def get_compound_properties(cwid):
+ """Get core compound identifiers."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ summary = get_properties(cwid)
+ if not summary:
+ abort(404, message="No data found")
+ return summary.model_dump()
+ except Exception as e:
+ abort(502, message=str(e))
+
+
+@compounds_bp.route("//identifiers")
+@compounds_bp.response(200, CompoundIdentifierSchema(many=True))
+def get_compound_identifiers(cwid):
+ """Get external identifiers."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ return [i.model_dump() for i in get_identifiers(cwid)]
+ except Exception as e:
+ abort(502, message=str(e))
+
+
+@compounds_bp.route("//toxicology")
+@compounds_bp.response(200, CompoundToxicologySchema(many=True))
+def get_compound_toxicology(cwid):
+ """Get toxicology data."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ return [t.model_dump() for t in get_toxicology(cwid)]
+ except Exception as e:
+ abort(502, message=str(e))
+
+
+@compounds_bp.route("//experimental-data")
+@compounds_bp.response(200, CompoundExpDataSchema(many=True))
+def get_compound_exp_data(cwid):
+ """Get experimental measurements."""
+ if not is_valid_qid(cwid):
+ abort(400, message="Invalid compound identifier")
+ try:
+ return [d.model_dump() for d in get_experimental_data(cwid)]
+ except Exception as e:
+ abort(502, message=str(e))
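+
+# Illustrative request against the compound endpoints above. Here <qid> is a
+# placeholder for any identifier accepted by src.services.compound.is_valid_qid;
+# host/port follow the Dockerfile's EXPOSE 5050:
+#
+#     curl -s http://localhost:5050/api/compounds/<qid>/toxicology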
+
+
+# -- Data (BioStudies + Zenodo passthrough) --------------------------------
+
+@data_bp.route("/")
+@data_bp.arguments(DataSearchQuerySchema, location="query")
+@data_bp.response(200, DataResultSchema)
+def list_data(args):
+ """Search datasets across BioStudies and Zenodo."""
+ query = args.get("query", "")
+ page = args.get("page", 1)
+ size = args.get("size", 18)
+
+ bs = BioStudiesExtractor(collection=BIOSTUDIES_COLLECTION)
+ zen = ZenodoExtractor(community=ZENODO_COMMUNITY, record_type=ZENODO_RECORD_TYPE)
+
+ if query:
+ bs_res = bs.search_studies(query, page=page, page_size=size)
+ zen_res = zen.search_records(query, page=page, size=size)
+ else:
+ bs_res = bs.list_studies(page=page, page_size=size, include_urls=True)
+ zen_res = zen.list_records(page=page, size=size, include_urls=True)
+
+ studies = bs_res.get("hits", [])
+ datasets = zen_res.get("hits", [])
+ studies, datasets = normalize_all(studies, datasets)
+
+ return {
+ "biostudies": {
+ "total": bs_res.get("total", 0),
+ "hits": [h.get("norm_metadata", h) for h in studies],
+ "error": bs_res.get("error"),
+ },
+ "zenodo": {
+ "total": zen_res.get("total", 0),
+ "hits": [h.get("norm_metadata", h) for h in datasets],
+ "error": zen_res.get("error"),
+ },
+ }
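+
+# Illustrative search call (a sketch; the query term "thyroid" is just one of
+# the case-study topics seeded elsewhere in this patch):
+#
+#     curl -s "http://localhost:5050/api/data/?query=thyroid&page=1&size=5"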
+
+
+@data_bp.route("/")
+@data_bp.response(200)
+def get_data_detail(data_id):
+ """Get normalized metadata for a single dataset."""
+ bs = BioStudiesExtractor(collection=BIOSTUDIES_COLLECTION)
+ zen = ZenodoExtractor(community=ZENODO_COMMUNITY, record_type=ZENODO_RECORD_TYPE)
+ bs_res = bs.search_studies(data_id, page=1, page_size=1)
+ zen_res = zen.search_records(data_id, page=1, size=1)
+ studies = bs_res.get("hits", [])
+ datasets = zen_res.get("hits", [])
+ studies, datasets = normalize_all(studies, datasets)
+ if studies:
+ return studies[0].get("norm_metadata", studies[0])
+ if datasets:
+ return datasets[0].get("norm_metadata", datasets[0])
+ abort(404, message="Dataset not found")
+
+
+# -- Validation blueprint --------------------------------------------------
+
+validation_bp = Blueprint("validation", __name__, url_prefix="/api/validation",
+ description="Data completeness validation")
+
+from src.models.cloud.tool import ServiceIndexEntry as ToolModel
+from src.models.cloud.method import Method as MethodModel
+from src.models.platform import (
+ RegulatoryQuestion as RQModel,
+ StageExplanation as SEModel,
+)
+from src.models.casestudy import CaseStudyCard as CSModel
+
+_ENTITY_REGISTRY = {
+ "tools": ("tools", ToolModel, "id", "service"),
+ "methods": ("methods", MethodModel, "id", "method"),
+ "case_studies": ("case_studies", CSModel, "slug", "title"),
+ "regulatory_questions": ("regulatory_questions", RQModel, "key", "label"),
+ "stage_explanations": ("stage_explanations", SEModel, "name", "name"),
+}
+
+_SKIP_FIELDS = {
+ "raw_json", "updated_at", "model_config",
+ "timestamp", "https",
+ "reg_q_1a", "reg_q_1b", "reg_q_2a",
+ "reg_q_2b", "reg_q_3a", "reg_q_3b",
+}
+
+
+class FieldCompleteness(Schema):
+ field = fields.Str()
+ present = fields.Bool()
+ value_preview = fields.Str(allow_none=True)
+
+
+class EntryValidation(Schema):
+ id = fields.Str()
+ label = fields.Str()
+ fields_total = fields.Int()
+ fields_filled = fields.Int()
+ completeness_pct = fields.Float()
+ missing = fields.List(fields.Str())
+ details = fields.List(fields.Nested(FieldCompleteness))
+
+
+class EntitySummary(Schema):
+ entity = fields.Str()
+ total_entries = fields.Int()
+ schema_fields = fields.List(fields.Str())
+ avg_completeness_pct = fields.Float()
+ fully_complete = fields.Int()
+ entries = fields.List(fields.Nested(EntryValidation))
+
+
+class ValidationReport(Schema):
+ generated_at = fields.Str()
+ entities = fields.List(fields.Nested(EntitySummary))
+
+
+def _is_filled(val):
+ if val is None:
+ return False
+ if isinstance(val, str) and val.strip() == "":
+ return False
+ return True
+
+
+def _preview(val, max_len=80):
+ if val is None:
+ return None
+ s = str(val)
+ return s[:max_len] + ("..." if len(s) > max_len else "")
+
+
+def _validate_entity(entity_name, table, pydantic_model, id_attr, label_attr):
+ check_fields = [f for f in pydantic_model.model_fields if f not in _SKIP_FIELDS]
+ conn = get_conn()
+ rows = conn.execute(f"SELECT * FROM {table}").fetchall()
+ conn.close()
+
+ entries = []
+ for row in rows:
+ d = dict(row)
+ details = []
+ filled = 0
+ missing = []
+ for f in check_fields:
+ val = d.get(f)
+ ok = _is_filled(val)
+ if ok:
+ filled += 1
+ else:
+ missing.append(f)
+ details.append({"field": f, "present": ok, "value_preview": _preview(val)})
+
+ total = len(check_fields)
+ pct = round(filled / total * 100, 1) if total else 100.0
+ entries.append({
+ "id": str(d.get(id_attr, "?")),
+ "label": str(d.get(label_attr) or d.get(id_attr, "?")),
+ "fields_total": total,
+ "fields_filled": filled,
+ "completeness_pct": pct,
+ "missing": missing,
+ "details": details,
+ })
+
+ avg = round(sum(e["completeness_pct"] for e in entries) / len(entries), 1) if entries else 0.0
+ fully = sum(1 for e in entries if e["completeness_pct"] == 100.0)
+ return {
+ "entity": entity_name,
+ "total_entries": len(entries),
+ "schema_fields": check_fields,
+ "avg_completeness_pct": avg,
+ "fully_complete": fully,
+ "entries": entries,
+ }
+
+
+@validation_bp.route("/")
+@validation_bp.response(200, ValidationReport)
+def validate_all():
+ """Full data completeness report."""
+ from datetime import datetime, timezone
+ return {
+ "generated_at": datetime.now(timezone.utc).isoformat(),
+ "entities": [
+ _validate_entity(name, tbl, model, id_a, lbl_a)
+ for name, (tbl, model, id_a, lbl_a) in _ENTITY_REGISTRY.items()
+ ],
+ }
+
+
+@validation_bp.route("/")
+@validation_bp.response(200, EntitySummary)
+def validate_entity(entity):
+ """Data completeness report for a single entity type."""
+ if entity not in _ENTITY_REGISTRY:
+ abort(404, message=f"Unknown entity '{entity}'. Valid: {', '.join(_ENTITY_REGISTRY)}")
+ tbl, model, id_a, lbl_a = _ENTITY_REGISTRY[entity]
+ return _validate_entity(entity, tbl, model, id_a, lbl_a)
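+
+# Illustrative check (assumes the app is running and the DB is seeded; jq is
+# optional and only used here to pick one field out of the EntitySummary):
+#
+#     curl -s http://localhost:5050/api/validation/tools | jq .avg_completeness_pct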
+
+
+# -- Registration helper ---------------------------------------------------
+
+def init_api(app: Flask) -> Api:
+ """Configure flask-smorest and register all API blueprints."""
+ app.config.update({
+ "API_TITLE": "VHP4Safety Platform API",
+ "API_VERSION": "v1",
+ "OPENAPI_VERSION": "3.0.3",
+ "OPENAPI_URL_PREFIX": "/api/v1",
+ "OPENAPI_SWAGGER_UI_PATH": "/docs",
+ "OPENAPI_SWAGGER_UI_URL": "https://cdn.jsdelivr.net/npm/swagger-ui-dist/",
+ "OPENAPI_REDOC_PATH": "/redoc",
+ "OPENAPI_REDOC_URL": "https://cdn.jsdelivr.net/npm/redoc@latest/bundles/redoc.standalone.js",
+ })
+ smorest_api = Api(app)
+ for bp in (tools_bp, methods_bp, reg_q_bp, stages_bp,
+ casestudies_bp, compounds_bp, data_bp, validation_bp):
+ smorest_api.register_blueprint(bp)
+ return smorest_api
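+
+
+# Minimal wiring sketch (assumes the SQLite file is seeded via
+# ``python -m src.seed``; this mirrors what app.py does at import time):
+#
+#     from flask import Flask
+#     from src.db import init_db
+#     from src.api import init_api
+#
+#     app = Flask(__name__)
+#     init_db()
+#     init_api(app)  # Swagger UI at /api/v1/docs, ReDoc at /api/v1/redoc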
diff --git a/src/casestudy_resolver.py b/src/casestudy_resolver.py
new file mode 100644
index 0000000..561624f
--- /dev/null
+++ b/src/casestudy_resolver.py
@@ -0,0 +1,298 @@
+"""Resolve case-study content from the database step hierarchy.
+
+Case study content JSON is seeded into the ``case_studies`` table from
+the VHP4Safety/ui-casestudy-config GitHub repo at seed time.
+The JSON has up to 6 nesting levels:
+ step1Contents → intro + regulatory questions
+ step2Contents → dict[question_key → nav with process-flow steps]
+ step3Contents → dict[q → dict[step → node]]
+ step4Contents → dict[q → dict[step → dict[substep → node]]]
+ step5Contents → dict[q → dict[...]]
+ step6Contents → dict[q → dict[...]]
+
+Given a URL path like /casestudies/kidney/Q1/Kinetics we resolve the
+node at step3Contents["Q1"]["Kinetics"] and render it server-side.
+"""
+
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import Any, Optional
+
+from src.db import get_conn
+
+
+# In-memory cache keyed by slug
+_content_cache: dict[str, dict] = {}
+
+
+def get_content(slug: str) -> dict | None:
+ """Load case-study content JSON from the database (cached)."""
+ if slug in _content_cache:
+ return _content_cache[slug]
+
+ conn = get_conn()
+ row = conn.execute("SELECT content_json FROM case_studies WHERE slug = ?", (slug,)).fetchone()
+ conn.close()
+ if not row or not row["content_json"]:
+ return None
+
+ data = json.loads(row["content_json"])
+ _content_cache[slug] = data
+ return data
+
+
+# ── Resolved result ──────────────────────────────────────────────────────
+
+STEP_TYPE_COLORS = {
+ "workflow step": "btn-vhpdarkteal",
+ "workflow-step": "btn-vhpdarkteal",
+ "workflow substep": "btn-vhplightteal",
+ "workflow-substep": "btn-vhplightteal",
+ "process flow step": "btn-vhpdarkpurple",
+ "process-flow-step": "btn-vhpdarkpurple",
+ "regulatory question": "btn-vhppink-distinct",
+ "regulatory-question": "btn-vhppink-distinct",
+ "tool": "btn-vhpblue",
+}
+
+# Workflow header definitions
+WORKFLOW_STEPS = [
+ {"number": 1, "type": "regulatory-question",
+ "label": "Regulatory Question"},
+ {"number": 2, "type": "workflow-step",
+ "label": "Safety Assessment Workflow Step"},
+ {"number": 3, "type": "process-flow-step",
+ "label": "Case Study Step"},
+ {"number": 4, "type": "workflow-substep",
+ "label": "Case Study Substep"},
+ {"number": 5, "type": "tool",
+ "label": "Tools, Models and Data"},
+]
+
+
+def btn_color(step_type: str | None) -> str:
+ """Return CSS class for a step button based on its type."""
+ if not step_type:
+ return "btn-vhpblue"
+ return STEP_TYPE_COLORS.get(step_type, "btn-vhpblue")
+
+
+@dataclass
+class Breadcrumb:
+ label: str
+ url: str
+ active: bool = False
+
+
+@dataclass
+class StepButtonResolved:
+ """A button ready to render in Jinja."""
+ label: str
+ description: str = ""
+ css_class: str = "btn-vhpblue"
+ url: str = ""
+ disabled: bool = False
+ is_tool_link: bool = False
+
+
+@dataclass
+class ResolvedStep:
+ """Everything the template needs to render one case-study page."""
+ case_slug: str = ""
+ case_title: str = ""
+ step_number: int = 1
+ nav_title: str = ""
+ nav_description: str = ""
+ image_html: str = ""
+ buttons: list[StepButtonResolved] = field(default_factory=list)
+ accordion_sections: list[dict] = field(default_factory=list)
+ content_html: str = ""
+ breadcrumbs: list[Breadcrumb] = field(default_factory=list)
+ workflow_steps: list[dict] = field(default_factory=list)
+ path_parts: list[str] = field(default_factory=list)
+
+
+def _slugify(value: str) -> str:
+ """Convert space-separated label to URL-safe slug."""
+ return value.replace(" ", "_")
+
+
+def _unslugify(value: str) -> str:
+ """Convert URL slug back to the key used in JSON."""
+ return value.replace("_", " ")
+
+
+def _make_url(case: str, parts: list[str]) -> str:
+ """Build an absolute URL from case slug and path parts."""
+ base = f"/casestudies/{case}"
+ if parts:
+ return base + "/" + "/".join(_slugify(p) for p in parts)
+ return base
+
+
+def _parse_content(raw: Any) -> tuple[str, list[dict]]:
+ """Split content into HTML string and accordion sections list."""
+ if raw is None:
+ return "", []
+ if isinstance(raw, str):
+ return raw, []
+ if isinstance(raw, list):
+ sections = []
+ for item in raw:
+ if isinstance(item, dict):
+ sections.append(item)
+ return "", sections
+ return str(raw), []
+
+
+def resolve(
+ slug: str,
+ path_parts: list[str],
+ branch: str = "main",
+) -> Optional[ResolvedStep]:
+ """Resolve a URL path to the correct step content.
+
+ Parameters
+ ----------
+ slug : str
+ Case study slug (kidney, parkinson, thyroid).
+ path_parts : list[str]
+        Path segments after /casestudies/<slug>/, e.g.
+ ["Q1", "Kinetics"] for step 3.
+
+ Returns
+ -------
+ ResolvedStep or None if the path doesn't resolve.
+ """
+ data = get_content(slug)
+ if data is None:
+ return None
+
+ step1 = data.get("step1Contents", {})
+ case_title = step1.get("navTitle", slug.title() + " Case Study")
+
+ result = ResolvedStep(
+ case_slug=slug,
+ case_title=case_title,
+ path_parts=list(path_parts),
+ )
+
+ # Build workflow header state
+ active_step = len(path_parts) + 1
+ result.step_number = active_step
+ for ws in WORKFLOW_STEPS:
+ state = "completed" if ws["number"] < active_step \
+ else "active" if ws["number"] == active_step \
+ else ""
+ result.workflow_steps.append({**ws, "state": state})
+
+ # ── Step 1: no path parts ─────────────────────────────────────
+ if not path_parts:
+ result.nav_title = step1.get("navTitle", "")
+ result.nav_description = step1.get("navDescription", "")
+ html, sections = _parse_content(step1.get("content"))
+ result.content_html = html
+ result.accordion_sections = sections
+ # Buttons = regulatory questions
+ for q in step1.get("questions", []):
+ result.buttons.append(StepButtonResolved(
+ label=q.get("label", ""),
+ description=q.get("description", ""),
+ css_class=btn_color(
+ q.get("type", "regulatory-question")
+ ),
+ url=_make_url(slug, [q["value"]]),
+ disabled=q.get("state") == "disabled",
+ ))
+ result.breadcrumbs = [
+ Breadcrumb("Case Studies", "/casestudies"),
+ Breadcrumb(case_title, "", active=True),
+ ]
+ return result
+
+ # ── Step 2+: walk the nested dicts ────────────────────────────
+ # path_parts[0] is the question key (e.g. "Q1")
+ # path_parts[1] is the step2 choice (e.g. "Kinetics")
+ # etc.
+ depth = len(path_parts)
+ step_key = f"step{depth + 1}Contents"
+
+ # Navigate to the correct node
+ container = data.get(step_key, {})
+ node = container
+    for part in path_parts:
+ key = _unslugify(part)
+ if isinstance(node, dict) and key in node:
+ node = node[key]
+ else:
+ # Try original (slugified) key as fallback
+ if isinstance(node, dict) and part in node:
+ node = node[part]
+ else:
+ return None
+
+ if not isinstance(node, dict):
+ return None
+
+ # Extract node fields
+ result.nav_title = node.get("navTitle", "")
+ result.nav_description = node.get("navDescription", "")
+ result.image_html = node.get("image", "")
+ html, sections = _parse_content(node.get("content"))
+ result.content_html = html
+ result.accordion_sections = sections
+
+ # Determine next-step buttons
+ base_url_parts = list(path_parts)
+
+ if node.get("steps"):
+ for s in node["steps"]:
+ val = s.get("value", s.get("label", ""))
+ result.buttons.append(StepButtonResolved(
+ label=s.get("label", ""),
+ description=s.get("description", ""),
+ css_class=btn_color(s.get("type")),
+ url=_make_url(slug, base_url_parts + [val]),
+ disabled=s.get("state") == "disabled",
+ ))
+ elif node.get("tools"):
+ for t in node["tools"]:
+ tool_id = t.get("id")
+ route = t.get("route", "tools")
+ if tool_id:
+ url = f"/{route}/{tool_id}"
+ is_tool = True
+ else:
+ url = ""
+ is_tool = False
+ result.buttons.append(StepButtonResolved(
+ label=t.get("label", ""),
+ description=t.get("description", ""),
+ css_class=btn_color(t.get("type", "tool")),
+ url=url,
+ disabled=t.get("state") == "disabled",
+ is_tool_link=is_tool,
+ ))
+
+ # Breadcrumbs
+ crumbs = [Breadcrumb("Case Studies", "/casestudies")]
+ crumbs.append(Breadcrumb(
+ case_title, _make_url(slug, []),
+ ))
+
+ # Build intermediate crumbs
+    # Step 2 label = "Regulatory Question <key>"
+ for i, part in enumerate(path_parts):
+ is_last = (i == len(path_parts) - 1)
+ label = _unslugify(part)
+ if i == 0:
+ label = f"Regulatory Question {label}"
+ url = _make_url(slug, path_parts[: i + 1])
+ crumbs.append(Breadcrumb(
+ label, url, active=is_last,
+ ))
+
+ result.breadcrumbs = crumbs
+ return result
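+
+
+# Illustrative use, with the path taken from the module docstring above:
+#
+#     step = resolve("kidney", ["Q1", "Kinetics"])
+#     if step is not None:
+#         print(step.nav_title, [b.label for b in step.buttons])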
diff --git a/src/db.py b/src/db.py
new file mode 100644
index 0000000..4affbf4
--- /dev/null
+++ b/src/db.py
@@ -0,0 +1,75 @@
+"""Thin sqlite3 helper. No ORM — just raw SQL."""
+
+from __future__ import annotations
+
+import os
+import sqlite3
+from contextlib import contextmanager
+
+DB_PATH = os.environ.get("DATABASE_PATH", "data/vhp4safety.db")
+
+_TABLES = [
+ """CREATE TABLE IF NOT EXISTS tools (
+ id TEXT PRIMARY KEY, service TEXT NOT NULL, description TEXT,
+ stage TEXT, html_name TEXT, md_file_name TEXT, png_file_name TEXT,
+ main_url TEXT, inst_url TEXT,
+ reg_q_1a INTEGER, reg_q_1b INTEGER, reg_q_2a INTEGER,
+ reg_q_2b INTEGER, reg_q_3a INTEGER, reg_q_3b INTEGER,
+ login TEXT, api_type TEXT, casestudy TEXT, provider TEXT,
+ provider_email TEXT, citation TEXT, version TEXT, license TEXT,
+ sourcecode TEXT, docker TEXT, bio_tools TEXT, tess TEXT,
+ raw_json TEXT, updated_at TEXT
+ )""",
+ """CREATE TABLE IF NOT EXISTS methods (
+ id TEXT PRIMARY KEY, method TEXT NOT NULL, issue_number INTEGER,
+ description TEXT, stage TEXT, substage TEXT,
+ catalog_webpage_url TEXT, case_study TEXT, regulatory_question TEXT,
+ reg_q_1a INTEGER, reg_q_1b INTEGER, reg_q_2a INTEGER,
+ reg_q_2b INTEGER, reg_q_3a INTEGER, reg_q_3b INTEGER,
+ data_producer TEXT, sop TEXT, vendor TEXT, catalog_number TEXT,
+ citation TEXT, type_iri TEXT, ontology TEXT,
+ key_event_id TEXT, aop_id TEXT, raw_json TEXT, updated_at TEXT
+ )""",
+ """CREATE TABLE IF NOT EXISTS regulatory_questions (
+ key TEXT PRIMARY KEY, label TEXT NOT NULL, explanation TEXT NOT NULL
+ )""",
+ """CREATE TABLE IF NOT EXISTS stage_explanations (
+ name TEXT PRIMARY KEY, explanation TEXT NOT NULL
+ )""",
+ """CREATE TABLE IF NOT EXISTS glossary_stage_mappings (
+ glossary_url TEXT PRIMARY KEY, stage_name TEXT NOT NULL
+ )""",
+ """CREATE TABLE IF NOT EXISTS case_studies (
+ slug TEXT PRIMARY KEY, title TEXT NOT NULL, description TEXT NOT NULL,
+ image_src TEXT, image_alt TEXT,
+ config_repo TEXT DEFAULT 'VHP4Safety/ui-casestudy-config',
+ default_branch TEXT DEFAULT 'main', content_json TEXT
+ )""",
+]
+
+
+def get_conn() -> sqlite3.Connection:
+ """Return a new connection with Row factory."""
+ conn = sqlite3.connect(DB_PATH)
+ conn.row_factory = sqlite3.Row
+ return conn
+
+
+@contextmanager
+def get_db():
+ """Context manager: yields a connection, auto-closes."""
+ conn = get_conn()
+ try:
+ yield conn
+ finally:
+ conn.close()
+
+
+def init_db() -> None:
+ """Create all tables (idempotent)."""
+ os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
+ conn = sqlite3.connect(DB_PATH)
+ for ddl in _TABLES:
+ conn.execute(ddl)
+ conn.commit()
+ conn.close()
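+
+
+# Example usage (a sketch; DATABASE_PATH may override the default location):
+#
+#     from src.db import init_db, get_db
+#     init_db()
+#     with get_db() as conn:
+#         names = [r["service"] for r in conn.execute(
+#             "SELECT service FROM tools ORDER BY service")]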
diff --git a/src/models/__init__.py b/src/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/models/casestudy.py b/src/models/casestudy.py
new file mode 100644
index 0000000..430ef03
--- /dev/null
+++ b/src/models/casestudy.py
@@ -0,0 +1,209 @@
+"""Pydantic models for VHP4Safety case-study content JSON schemas.
+
+The JSON files originate from a separate GitHub repo
+(VHP4Safety/ui-casestudy-config) and are fetched once during database
+seeding (``python -m src.seed``). The full JSON blob is stored in
+the ``case_studies.content_json`` column and resolved server-side
+by ``src.casestudy_resolver`` into rendered Jinja templates.
+
+These models formalise the structure so it can be validated
+server-side, used in tests, and consumed by type-aware code.
+
+Hierarchy (up to 6 levels deep):
+ CaseStudyContent ← root of one *_content.json file
+ └ Step1Contents ← intro + regulatory-question buttons
+ └ step2Contents ← dict[question_key → ProcessFlowNav]
+ └ step3Contents ← dict[question_key → dict[step_label → WorkflowStepNode]]
+ └ step4–6Contents ← additional nesting (same WorkflowStepNode shape)
+
+Every "node" at step ≥ 2 follows the same recursive pattern captured
+by ``WorkflowStepNode``.
+"""
+
+from __future__ import annotations
+
+from enum import Enum
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+
+# ── Enums ─────────────────────────────────────────────────────────────────
+
+
+class StepType(str, Enum):
+ """Button colour / role categories used by the JS renderer."""
+
+ WORKFLOW_STEP = "workflow step"
+ WORKFLOW_SUBSTEP = "workflow substep"
+ PROCESS_FLOW_STEP = "process flow step"
+ REGULATORY_QUESTION = "regulatory question"
+ TOOL = "tool"
+
+
+class CaseStudySlug(str, Enum):
+ """Known case-study URL slugs."""
+
+ KIDNEY = "kidney"
+ PARKINSON = "parkinson"
+ THYROID = "thyroid"
+
+
+# ── Leaf / reusable pieces ────────────────────────────────────────────────
+
+
+class StepButton(BaseModel):
+ """A single clickable button shown in a step panel.
+
+ Appears in ``questions``, ``steps``, and ``tools`` arrays.
+ """
+
+ label: str
+ value: Optional[str] = None
+ description: Optional[str] = None
+ type: Optional[StepType] = None
+ state: Optional[str] = None # e.g. "disabled"
+
+ # tool-specific fields
+ id: Optional[str] = None
+ route: Optional[str] = None # e.g. "tools" or "methods"
+
+ model_config = {"extra": "allow"}
+
+
+class AccordionSection(BaseModel):
+ """One collapsible section inside ``content`` when it is an array."""
+
+ section: Optional[str] = None
+ description: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+# Content can be a raw HTML string **or** a list of accordion sections.
+# We keep it as ``Any`` so both shapes validate; downstream code already
+# branches on ``Array.isArray(content)`` in JS.
+ContentBlock = str | list[AccordionSection] | None
+
+
+# ── Step 1 (intro + regulatory questions) ────────────────────────────────
+
+
+class Step1Contents(BaseModel):
+ """Top-level intro panel for a case study.
+
+ Shown on first load; contains the two regulatory-question buttons.
+ """
+
+ navTitle: str
+ navDescription: str = ""
+ questions: list[StepButton] = Field(default_factory=list)
+ content: Any = None # HTML string or accordion list
+
+ model_config = {"extra": "allow"}
+
+
+# ── Generic workflow node (steps 2–6) ─────────────────────────────────────
+
+
+class WorkflowStepNode(BaseModel):
+ """A single node at any depth in the step hierarchy.
+
+ Depending on what keys are present the JS renderer shows:
+ * ``steps`` → navigable sub-step buttons (goes deeper)
+ * ``tools`` → tool buttons (leaf, may link to /tools/)
+ * neither → plain content panel
+
+ Nodes may contain ``content`` as HTML **or** accordion JSON.
+    ``image`` is an optional raw HTML string (e.g. an ``<img>`` tag).
+ """
+
+ navTitle: Optional[str] = None
+ navDescription: Optional[str] = None
+ steps: Optional[list[StepButton]] = None
+ tools: Optional[list[StepButton]] = None
+ content: Any = None
+ image: Optional[str] = None
+
+ # Some step-3 entries carry a flag to signal step-4 exists
+ step4content: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+class ProcessFlowNav(BaseModel):
+ """Step-2 panel: safety-assessment workflow steps for one question.
+
+    ``steps`` lists the process-flow buttons; ``content`` is the
+    intro HTML.
+ """
+
+ navTitle: str = ""
+ navDescription: str = ""
+ steps: list[StepButton] = Field(default_factory=list)
+ content: Any = None
+ image: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+# ── Root document ─────────────────────────────────────────────────────────
+
+# Steps 3-6 are nested dicts whose keys are dynamic (question key,
+# step label, sub-step label …). We type them as deeply as
+# practical; the innermost values are always WorkflowStepNode.
+
+Step3Map = dict[str, dict[str, WorkflowStepNode]]
+Step4Map = dict[str, dict[str, dict[str, WorkflowStepNode]]]
+Step5Map = dict[str, dict[str, dict[str, dict[str, WorkflowStepNode]]]]
+Step6Map = dict[
+ str, dict[str, dict[str, dict[str, dict[str, WorkflowStepNode]]]]
+]
+
+
+class CaseStudyContent(BaseModel):
+ """Root schema for a ``_content.json`` file.
+
+ Mirrors exactly the shape consumed by ``casestudies.js``.
+ """
+
+ step1Contents: Step1Contents
+ step2Contents: dict[str, ProcessFlowNav] = Field(
+ default_factory=dict
+ )
+ step3Contents: Optional[Step3Map] = None
+ step4Contents: Optional[Step4Map] = None
+ step5Contents: Optional[Step5Map] = None
+ step6Contents: Optional[Step6Map] = None
+
+ model_config = {"extra": "allow"}
+
+
+# ── Case study card (listing page) ───────────────────────────────────────
+
+
+class CaseStudyCard(BaseModel):
+ """Metadata for one card on the /casestudies listing page."""
+
+ slug: CaseStudySlug
+ title: str
+ description: str
+ image_src: str = ""
+ image_alt: str = ""
+ url: str = ""
+ config_repo: Optional[str] = None
+ content_json: Optional[str] = None
+
+
+# ── Convenience: full registry ────────────────────────────────────────────
+
+
+class CaseStudyRegistry(BaseModel):
+ """All known case studies with their summary cards and loaded content."""
+
+ cards: list[CaseStudyCard] = Field(default_factory=list)
+ content: dict[CaseStudySlug, CaseStudyContent] = Field(
+ default_factory=dict,
+ )
+
+ model_config = {"extra": "allow"}
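+
+
+# Validation sketch (illustrative; the dict below is a minimal hypothetical
+# *_content.json, not a real case-study file):
+#
+#     minimal = {
+#         "step1Contents": {
+#             "navTitle": "Kidney case study",
+#             "questions": [{"label": "Q1", "type": "regulatory question"}],
+#         },
+#         "step2Contents": {},
+#     }
+#     content = CaseStudyContent.model_validate(minimal)
+#     assert content.step1Contents.questions[0].type is StepType.REGULATORY_QUESTION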
diff --git a/src/models/cloud/method.py b/src/models/cloud/method.py
new file mode 100644
index 0000000..48d37f9
--- /dev/null
+++ b/src/models/cloud/method.py
@@ -0,0 +1,134 @@
+"""Pydantic models for VHP4Safety Cloud method JSON schemas."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+class ServiceContact(BaseModel):
+ name: Optional[str] = None
+ email: Optional[str] = None
+
+
+class ServiceProvider(BaseModel):
+ contact: Optional[ServiceContact] = None
+ url: Optional[str] = None
+ name: Optional[str] = None
+
+
+class ServiceInstance(BaseModel):
+ type: Optional[str] = None
+ url: Optional[str] = None
+ license: Optional[str] = None
+ version: Optional[str] = None
+ source: Optional[str] = None
+ vhp_platform: Optional[str] = Field(None, alias="vhp-platform")
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class ServiceAccess(BaseModel):
+ API: Optional[str] = None
+ login: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+class ServiceIntro(BaseModel):
+ title: Optional[str] = None
+ url: Optional[str] = None
+
+
+class RegulatoryQuestion(BaseModel):
+ q1a: Optional[str] = Field(None, alias="1a")
+ q1b: Optional[str] = Field(None, alias="1b")
+ q2a: Optional[str] = Field(None, alias="2a")
+ q2b: Optional[str] = Field(None, alias="2b")
+ q3a: Optional[str] = Field(None, alias="3a")
+ q3b: Optional[str] = Field(None, alias="3b")
+
+ model_config = {"populate_by_name": True}
+
+
+class Service(BaseModel):
+ """A single service entry (docs/service/*.json)."""
+
+ id: str
+ service: str = Field(description="Service display name")
+ description: Optional[str] = None
+
+ stage: Optional[str] = None
+ substage: Optional[str] = None
+ screenshot: Optional[str] = None
+ url: Optional[str] = None
+
+ instance: Optional[ServiceInstance] = None
+ intro: Optional[ServiceIntro] = None
+ provider: Optional[ServiceProvider] = None
+ access: Optional[ServiceAccess] = None
+ regulatory_question: Optional[RegulatoryQuestion] = Field(
+ None, alias="regulatory-question"
+ )
+ ELIXIR: Optional[dict] = None
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class ServiceIndexEntry(BaseModel):
+ """A service as represented in the index (cap/service_index.json)."""
+
+ id: str
+ service: str
+ description: Optional[str] = None
+
+ html_name: Optional[str] = None
+ md_file_name: Optional[str] = None
+ png_file_name: Optional[str] = None
+ stage: Optional[str] = None
+ main_url: Optional[str] = None
+ inst_url: Optional[str] = None
+
+ # Regulatory question flags
+ reg_q_1a: Optional[str] = None
+ reg_q_1b: Optional[str] = None
+ reg_q_2a: Optional[str] = None
+ reg_q_2b: Optional[str] = None
+ reg_q_3a: Optional[str] = None
+ reg_q_3b: Optional[str] = None
+
+ # Upstream issue-template fields (new-tool-service-entry.yml)
+ login: Optional[str] = None
+ api_type: Optional[str] = Field(None, alias="api")
+ casestudy: Optional[str] = None
+ provider: Optional[str] = None
+ provider_email: Optional[str] = Field(
+ None, alias="provider-email"
+ )
+ citation: Optional[str] = None
+ version: Optional[str] = None
+ license: Optional[str] = None
+ sourcecode: Optional[str] = None
+ docker: Optional[str] = None
+ bio_tools: Optional[str] = Field(None, alias="bioTools")
+ tess: Optional[str] = None
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class ServiceIndex(BaseModel):
+ """The full service index (cap/service_index.json).
+
+ A mapping of service id → ServiceIndexEntry.
+ """
+
+ root: dict[str, ServiceIndexEntry] = Field(default_factory=dict)
+
+ model_config = {"extra": "allow"}
+
+ @classmethod
+ def from_dict(cls, data: dict) -> ServiceIndex:
+ return cls(
+ root={k: ServiceIndexEntry.model_validate(v) for k, v in data.items()}
+ )
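+
+
+# Usage sketch (illustrative; "qsprpred" stands in for a real service id):
+#
+#     raw = {"qsprpred": {"id": "qsprpred", "service": "QSPRpred"}}
+#     index = ServiceIndex.from_dict(raw)
+#     assert index.root["qsprpred"].service == "QSPRpred"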
diff --git a/src/models/cloud/tool.py b/src/models/cloud/tool.py
new file mode 100644
index 0000000..01b574a
--- /dev/null
+++ b/src/models/cloud/tool.py
@@ -0,0 +1,98 @@
+"""Pydantic models for VHP4Safety Cloud tool JSON schemas."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class Method(BaseModel):
+ """A single method entry (docs/methods/*.json).
+
+    Field names match the ``methods`` table columns in src/db.py.
+ Aliases map to the raw JSON keys from the cloud repo.
+ """
+
+ id: str
+ method: str = Field(description="Method title (from issue title)")
+ issue_number: Optional[int] = None
+ description: Optional[str] = Field(
+ None, alias="method_description_content"
+ )
+
+ # Upstream issue-template fields (new-tool-method-entry.yml)
+ data_producer: Optional[str] = Field(
+ None, alias="data_producer_content"
+ )
+ sop: Optional[str] = Field(
+ None, alias="available_sop_or_protocol_content"
+ )
+ vendor: Optional[str] = Field(
+ None, alias="vendor_content"
+ )
+ catalog_number: Optional[str] = Field(
+ None, alias="catalog_number_content"
+ )
+ catalog_webpage_url: Optional[str] = None
+ citation: Optional[str] = Field(
+ None, alias="citation_content"
+ )
+ stage: Optional[str] = Field(
+ None, alias="vhp4safety_workflow_stage_content"
+ )
+ substage: Optional[str] = Field(
+ None, alias="workflow_substage_content"
+ )
+ case_study: Optional[str] = Field(
+ None, alias="case_study_content"
+ )
+ regulatory_question: Optional[str] = Field(
+ None, alias="regulatory_question_content"
+ )
+ type_iri: Optional[str] = Field(
+ None, alias="ontology_term_content"
+ )
+ ontology: Optional[str] = Field(
+ None, alias="type_content"
+ )
+ key_event_id: Optional[str] = Field(
+ None,
+ alias="relevant_aop_wiki_key_event(s)_to_the_assay_content",
+ )
+ aop_id: Optional[str] = Field(
+ None,
+ alias="relevant_aop_wiki_adverse_outcome_pathway(s)"
+ "_to_the_assay_content",
+ )
+
+ # Regulatory question flags
+ reg_q_1a: Optional[str] = None
+ reg_q_1b: Optional[str] = None
+ reg_q_2a: Optional[str] = None
+ reg_q_2b: Optional[str] = None
+ reg_q_3a: Optional[str] = None
+ reg_q_3b: Optional[str] = None
+
+ timestamp: Optional[datetime] = None
+ https: Optional[str] = Field(
+ None, description="Broken URL fragment in some files"
+ )
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class MethodIndex(BaseModel):
+ """The full methods index (cap/methods_index.json).
+
+ A mapping of method id → Method.
+ """
+
+ root: dict[str, Method] = Field(default_factory=dict)
+
+ model_config = {"extra": "allow"}
+
+ @classmethod
+ def from_dict(cls, data: dict) -> MethodIndex:
+ return cls(root={k: Method.model_validate(v) for k, v in data.items()})
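+
+
+# Usage sketch (illustrative; "m-001" and the field values are made up):
+#
+#     raw = {
+#         "m-001": {
+#             "id": "m-001",
+#             "method": "Example assay",
+#             "method_description_content": "Short description.",
+#             "vhp4safety_workflow_stage_content": "Hazard assessment",
+#         }
+#     }
+#     index = MethodIndex.from_dict(raw)
+#     m = index.root["m-001"]
+#     assert m.description == "Short description."  # populated via alias
+#     assert m.stage == "Hazard assessment"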
diff --git a/src/models/compound.py b/src/models/compound.py
new file mode 100644
index 0000000..1b871f4
--- /dev/null
+++ b/src/models/compound.py
@@ -0,0 +1,75 @@
+"""Pydantic models for compound data from CompoundCloud SPARQL.
+
+These are not stored in the database — they model the responses from
+the CompoundCloud Wikibase SPARQL endpoint and from Wikidata QLever
+for experimental data.
+"""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class CompoundSummary(BaseModel):
+ """Core compound identifiers from CompoundCloud."""
+
+ wcid: str = Field(description="CompoundCloud entity URI")
+ label: str = Field(description="Human-readable compound name")
+ inchi: str = ""
+ inchikey: str = ""
+ smiles: str = Field("", alias="SMILES")
+ formula: str = ""
+ mass: str = ""
+
+ model_config = {"populate_by_name": True}
+
+
+class CompoundIdentifier(BaseModel):
+ """A single external identifier for a compound."""
+
+ property_label: str = Field(
+ "", description="Name of the identifier property"
+ )
+ value: str = ""
+ formatter_url: str = Field(
+ "", description="URL template for the identifier"
+ )
+
+
+class CompoundToxicology(BaseModel):
+ """A toxicology property row."""
+
+ property_label: str = ""
+ value: str = ""
+
+
+class CompoundExperimentalDatum(BaseModel):
+ """A single experimental measurement from Wikidata."""
+
+ property_label: str = Field(
+ "", description="Measured property name"
+ )
+ value: str = ""
+ units_label: str = ""
+ source: str = ""
+ doi: str = ""
+ see_also: str = Field(
+ "", description="Link to the Wikidata statement"
+ )
+
+
+class CompoundDetail(BaseModel):
+ """Full compound view combining all SPARQL query results."""
+
+ summary: Optional[CompoundSummary] = None
+ identifiers: list[CompoundIdentifier] = Field(
+ default_factory=list
+ )
+ toxicology: list[CompoundToxicology] = Field(
+ default_factory=list
+ )
+ experimental_data: list[CompoundExperimentalDatum] = Field(
+ default_factory=list
+ )
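+
+
+# Assembly sketch (illustrative; the URI and values below are hypothetical
+# example values, not real SPARQL output):
+#
+#     detail = CompoundDetail(
+#         summary=CompoundSummary(wcid="https://example.org/entity/Q1", label="caffeine"),
+#         identifiers=[CompoundIdentifier(property_label="CAS number", value="58-08-2")],
+#     )
+#     assert detail.toxicology == []  # list fields default to empty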
diff --git a/src/models/data/__init__.py b/src/models/data/__init__.py
new file mode 100644
index 0000000..6fd1d01
--- /dev/null
+++ b/src/models/data/__init__.py
@@ -0,0 +1,50 @@
+"""Data models & extractors for BioStudies and Zenodo datasets."""
+
+from src.models.data.biostudies import BioStudiesExtractor
+from src.models.data.zenodo import ZenodoExtractor
+from src.models.data.mapping import normalize_all
+from src.models.data.schemas import (
+ Author,
+ Attribute,
+ AuthorDetail,
+ BiologicalContext,
+ BioStudiesParsedMetadata,
+ DataFile,
+ ExperimentalDesign,
+ FileEntry,
+ Funding,
+ LinkEntry,
+ NormalizedMetadata,
+ ProtocolEntry,
+ Publication,
+ TechnicalDetails,
+ UrlExistsResult,
+ ZenodoFileEntry,
+ ZenodoParsedMetadata,
+)
+
+__all__ = [
+ # Extractors
+ "BioStudiesExtractor",
+ "ZenodoExtractor",
+ # Normalizer
+ "normalize_all",
+ # Pydantic models
+ "Author",
+ "Attribute",
+ "AuthorDetail",
+ "BiologicalContext",
+ "BioStudiesParsedMetadata",
+ "DataFile",
+ "ExperimentalDesign",
+ "FileEntry",
+ "Funding",
+ "LinkEntry",
+ "NormalizedMetadata",
+ "ProtocolEntry",
+ "Publication",
+ "TechnicalDetails",
+ "UrlExistsResult",
+ "ZenodoFileEntry",
+ "ZenodoParsedMetadata",
+]
diff --git a/src/models/data/biostudies.py b/src/models/data/biostudies.py
new file mode 100644
index 0000000..2756cb1
--- /dev/null
+++ b/src/models/data/biostudies.py
@@ -0,0 +1,867 @@
+import json
+import re
+import time
+from urllib.parse import quote
+
+import requests
+
+
+class BioStudiesExtractor:
+ """Class to handle BioStudies API interactions"""
+
+ _SPLIT_RE = re.compile(r"^(.*?)(\d+)$")
+
+ def __init__(self, collection: str = ""):
+ self.base_url = "https://www.ebi.ac.uk/biostudies/api/v1"
+ self.ftp_base = "https://ftp.ebi.ac.uk/pub/databases/biostudies/"
+ self.studies_url = self.base_url + "/studies"
+ self.search_url = (
+ f"{self.base_url}/{collection}/search"
+ if collection
+ else f"{self.base_url}/search"
+ )
+
+ # -----------------------------
+ # ID validation / URL building
+ # -----------------------------
+ def validate_study_id(self, study_id):
+ """
+ Validate BioStudies ID format
+
+ Args:
+ study_id (str): BioStudies accession ID
+
+ Returns:
+ tuple: (is_valid, cleaned_id, error_message)
+ """
+ if not study_id or not isinstance(study_id, str):
+ return False, None, "Study ID is required"
+
+ verified_id = study_id.strip().upper()
+
+ # Examples: S-ONTX26, E-MTAB-1234, S-BSST123, S-VHPS21, S-TOXR1735
+ patterns = [
+ r"^S-[A-Z0-9]+$", # Studies starting with S-
+ r"^E-[A-Z]+-\d+$", # Expression studies like E-MTAB-1234
+ r"^[A-Z]+-\d+$", # General pattern like ABC-123
+ ]
+
+ if not any(re.match(pattern, verified_id) for pattern in patterns):
+ return (
+ False,
+ verified_id,
+ "Invalid BioStudies ID format. Expected format: S-ONTX26, E-MTAB-1234, etc.",
+ )
+
+ return True, verified_id, None
+
+ def split_text_int(self, value: str):
+ """
+ Splits trailing integer from a string.
+ 'S-VHPS21' -> ('S-VHPS', 21)
+ 'ABC' -> ('ABC', None)
+ 'X-12A' -> ('X-12A', None)
+ """
+ if not value:
+ return value, None
+ m = self._SPLIT_RE.match(value)
+ if not m:
+ return value, None
+ prefix, num = m.group(1), int(m.group(2))
+ return prefix, num
+
+ def build_biostudies_https_file_url(self, accno: str, filename: str) -> str | None:
+ """
+ Constructs:
+ https://ftp.ebi.ac.uk/pub/databases/biostudies/{prefix}/{num3}/{accno}/Files/{filename}
+
+ Returns None if accno has no trailing integer.
+
+ Note:
+ - We keep "/" safe in case filename contains subfolders (rare, but possible).
+ """
+ prefix, num = self.split_text_int(accno)
+ if num is None or not filename:
+ return None
+
+ num3 = f"{num:03d}"
+
+ # Encode only the filename segment (allow "/" for potential subpaths)
+ safe_name = quote(filename, safe="/")
+
+ return (
+ self.ftp_base
+ + f"{prefix}/{num3}/{accno}/Files/{safe_name}"
+ )
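+
+    # Worked example (derived from the rules above; the filename is hypothetical):
+    #     build_biostudies_https_file_url("S-VHPS21", "data.csv") returns
+    #     "https://ftp.ebi.ac.uk/pub/databases/biostudies/S-VHPS/021/S-VHPS21/Files/data.csv"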
+
+ def url_exists_no_download(self, url: str, timeout=(3.05, 10)):
+ """
+ Returns a dict describing existence with minimal data transfer.
+ - tries HEAD
+ - falls back to GET Range bytes=0-0
+ """
+ result = {
+ "url": url,
+ "exists": False,
+ "status_code": None,
+ "content_length": None,
+ "final_url": None,
+ "error": None,
+ "method": None,
+ }
+
+ if not url:
+ result["error"] = "Empty URL"
+ return result
+
+ try:
+ # 1) HEAD (preferred: no body)
+ r = requests.head(url, allow_redirects=True, timeout=timeout)
+ result["status_code"] = r.status_code
+ result["final_url"] = str(r.url)
+ result["method"] = "HEAD"
+
+ if r.status_code == 200:
+ result["exists"] = True
+ result["content_length"] = r.headers.get("Content-Length")
+ return result
+
+ # 2) Fallback if HEAD not allowed or forbidden, etc.
+ if r.status_code in (403, 405):
+ rg = requests.get(
+ url,
+ stream=True,
+ allow_redirects=True,
+ headers={"Range": "bytes=0-0"},
+ timeout=timeout,
+ )
+ result["status_code"] = rg.status_code
+ result["final_url"] = str(rg.url)
+ result["method"] = "GET_RANGE"
+
+ # 206 Partial Content is a strong "exists"
+ if rg.status_code in (200, 206):
+ result["exists"] = True
+ result["content_length"] = rg.headers.get("Content-Length")
+
+ return result
+
+ # other codes (404, 410, 500...) treated as not found / not accessible
+ return result
+
+ except requests.RequestException as e:
+ result["error"] = str(e)
+ return result
+
+ def _pick_rocrate_file(self, files: list[dict]) -> dict | None:
+ """
+ Return the first file dict whose name/path contains 'rocrate' (case-insensitive).
+ Preference order:
+ 1) files where exists_check.exists is True (if exists_check present)
+ 2) otherwise first match
+ """
+ if not isinstance(files, list) or not files:
+ return None
+
+ def fname(f: dict) -> str:
+ if not isinstance(f, dict):
+ return ""
+ return str(f.get("name") or f.get("path") or "").lower()
+
+ # All matches by name/path
+ matches = [f for f in files if "rocrate" in fname(f)]
+ if not matches:
+ return None
+
+ # Prefer verified existing ones if available
+ verified = [
+ f for f in matches
+ if isinstance(f, dict)
+ and isinstance(f.get("exists_check"), dict)
+ and f["exists_check"].get("exists") is True
+ ]
+ return verified[0] if verified else matches[0]
+
+ # -----------------------------
+ # API operations
+ # -----------------------------
+ def get_study_metadata(self, study_id):
+ """
+ Extract metadata for a given BioStudies ID
+
+ Args:
+ study_id (str): BioStudies accession ID (e.g., S-ONTX26)
+
+ Returns:
+ dict: Parsed metadata or error information
+ """
+ try:
+ # Validate study ID format
+ is_valid, verified_id, validation_error = self.validate_study_id(study_id)
+ if not is_valid:
+ return {"error": validation_error}
+
+ url = self.studies_url + f"/{verified_id}"
+
+ headers = {
+ "Accept": "application/json",
+ "User-Agent": "BioStudies-VHP4Safety-App/1.0",
+ }
+
+ response = requests.get(url, headers=headers, timeout=30)
+
+ if response.status_code == 200:
+ try:
+ data = response.json()
+ if not data:
+ return {"error": f"Empty response received for study {verified_id}"}
+
+ # Parse metadata first, then build URL using the derived collection (no extra API calls)
+ md = self.parse_metadata(data)
+ collection = md.get("collection", "")
+ web_url = self.build_study_url(verified_id, collection).get("url", "")
+ return md | {"url": web_url}
+
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from BioStudies API: {str(e)}"}
+
+ elif response.status_code == 404:
+ return {
+ "error": f"Study '{verified_id}' not found in BioStudies database. Please check the ID and try again."
+ }
+ elif response.status_code == 403:
+ return {"error": "Access forbidden. The study may be restricted or private."}
+ elif response.status_code == 500:
+ return {"error": "BioStudies server error. Please try again later."}
+ elif response.status_code == 503:
+ return {"error": "BioStudies service temporarily unavailable. Please try again later."}
+ else:
+ return {"error": f"BioStudies API returned status {response.status_code}. Please try again later."}
+
+ except requests.exceptions.Timeout:
+ return {"error": "Request timed out. BioStudies server may be slow. Please try again."}
+ except requests.exceptions.ConnectionError:
+ return {"error": "Cannot connect to BioStudies server. Please check your internet connection."}
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error: {str(e)}"}
+ except Exception as e:
+ return {"error": f"Unexpected error occurred: {str(e)}"}
+
+ def get_study_collection(self, study_id):
+ """
+ Extract collection for a given BioStudies ID
+ """
+ metadata = self.get_study_metadata(study_id)
+ if "error" in metadata:
+ return metadata
+ collection = metadata.get("collection", "")
+ return {"accession": study_id, "collection": collection}
+
+ def build_study_url(self, study_id, collection: str = ""):
+ """
+ Build the URL to access the study in BioStudies web interface
+ """
+ is_valid, verified_id, validation_error = self.validate_study_id(study_id)
+ if not is_valid:
+ return {"error": validation_error}
+
+ if collection:
+ url = f"https://www.ebi.ac.uk/biostudies/{collection}/studies/{verified_id}"
+ else:
+ url = f"https://www.ebi.ac.uk/biostudies/studies/{verified_id}"
+
+ return {"accession": verified_id, "url": url}
+
+ # -----------------------------
+ # Search / list
+ # -----------------------------
+ def search_studies(
+ self,
+ query,
+ page=1,
+ page_size=10,
+ load_metadata: bool = True,
+ filters: tuple[tuple] | None = None,
+ ) -> dict:
+ """
+ Search for studies in BioStudies database
+ """
+ try:
+ if not query or not isinstance(query, str):
+ return {"error": "Search query must be a non-empty string."}
+
+ filters_applied = bool(filters)
+ if filters_applied:
+ load_metadata = True
+
+ params = {"query": query, "page": page, "pageSize": page_size}
+
+ headers = {
+ "Accept": "application/json",
+ "User-Agent": "BioStudies-VHP4Safety-App/1.0",
+ }
+
+ response = requests.get(self.search_url, headers=headers, params=params, timeout=30)
+
+ if response.status_code == 200:
+ try:
+ data = response.json()
+ hits = data.get("hits", [])
+ total_hits = data.get("totalHits", 0)
+
+ if not data or total_hits == 0:
+ return {"error": "No results found."}
+
+ if load_metadata:
+ hits = self._hit_metadata(hits)
+ hits = self._hit_url(hits)
+
+ if filters_applied:
+ hits = self._apply_filters(hits, filters)
+
+ page_size_met = len(hits) >= page_size
+ pages_fetched = 1
+
+ if not page_size_met:
+ hits, page_size_met, pages_fetched = self._backfill_filtered_results(
+ hits, page, page_size, filters, query
+ )
+
+ return {
+ "totalHits": total_hits,
+ "hits": hits,
+ "hits_returned": len(hits),
+ "page": page,
+ "pageSize": page_size,
+ "pages_fetched": pages_fetched,
+ "filters_applied": True,
+ "page_size_met": page_size_met,
+ }
+
+ return data | {"hits": hits, "total": total_hits}
+
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from BioStudies API: {str(e)}"}
+
+ elif response.status_code == 400:
+ return {"error": "Bad request. Please check your search parameters."}
+ elif response.status_code == 403:
+ return {"error": "Access forbidden. The collection may be restricted."}
+ elif response.status_code == 500:
+ return {"error": "BioStudies server error. Please try again later."}
+ elif response.status_code == 503:
+ return {"error": "BioStudies service temporarily unavailable. Please try again later."}
+ else:
+ return {"error": f"BioStudies API returned status {response.status_code}. Please try again later."}
+
+ except requests.exceptions.Timeout:
+ return {"error": "Request timed out. BioStudies server may be slow. Please try again."}
+ except requests.exceptions.ConnectionError:
+ return {"error": "Cannot connect to BioStudies server. Please check your internet connection."}
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error: {str(e)}"}
+ except Exception as e:
+ return {"error": f"Unexpected error occurred: {str(e)}"}
+
+ def list_studies(
+ self,
+ page=1,
+ page_size=50,
+ include_urls: bool = False,
+ load_metadata: bool = False,
+ filters: tuple[tuple] | None = None,
+ ) -> dict:
+ """
+ List studies in the configured BioStudies collection for a specific page.
+ """
+ filters_applied = bool(filters)
+ if filters_applied:
+ load_metadata = True
+ include_urls = True
+
+ headers = {
+ "Accept": "application/json",
+ "User-Agent": "BioStudies-VHP4Safety-App/1.0",
+ }
+ params = {"page": page, "pageSize": page_size}
+
+ try:
+ response = requests.get(self.search_url, headers=headers, params=params, timeout=30)
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error during listing: {e}", "total": 0, "hits": []}
+
+ if response.status_code != 200:
+ return {
+ "error": f"BioStudies API returned status {response.status_code} while listing.",
+ "total": 0,
+ "hits": [],
+ }
+
+ try:
+ data = response.json()
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from BioStudies API: {str(e)}", "total": 0, "hits": []}
+
+ total_hits = data.get("totalHits") or data.get("total") or 0
+ hits = data.get("hits", [])
+
+ if include_urls:
+ hits = self._hit_url(hits)
+ if load_metadata:
+ hits = self._hit_metadata(hits)
+
+ if filters_applied:
+ hits = self._apply_filters(hits, filters)
+
+ page_size_met = len(hits) >= page_size
+ pages_fetched = 1
+
+ if not page_size_met:
+ hits, page_size_met, pages_fetched = self._backfill_filtered_results(
+ hits, page, page_size, filters, query=None
+ )
+
+ return {
+ "totalHits": total_hits,
+ "total": total_hits,
+ "hits": hits,
+ "hits_returned": len(hits),
+ "page": page,
+ "pageSize": page_size,
+ "pages_fetched": pages_fetched,
+ "filters_applied": True,
+ "page_size_met": page_size_met,
+ }
+
+ return {"total": total_hits, "hits": hits}
+
+ def _hit_url(self, hits: list) -> list:
+ for hit in hits:
+ acc = hit.get("accession") or hit.get("accno")
+ if acc:
+ hit["url"] = self.build_study_url(acc).get("url", "")
+ return hits
+
+ def _hit_metadata(self, hits: list) -> list:
+ for hit in hits:
+ acc = hit.get("accession") or hit.get("accno")
+ if acc:
+ hit["metadata"] = self.get_study_metadata(acc)
+ return hits
+
+ def _apply_filters(self, hits: list, filters: list[tuple]) -> list:
+ """
+ Filter hits based on metadata field values (case-insensitive AND logic)
+ """
+ if not filters:
+ return hits
+
+ filtered = []
+ for hit in hits:
+ metadata = hit.get("metadata", {})
+ if not metadata:
+ continue
+
+ matches_all = True
+ for field, value in filters:
+ field_value = str(metadata.get(field, "")).strip().lower()
+ filter_value = str(value).strip().lower()
+ if field_value != filter_value:
+ matches_all = False
+ break
+
+ if matches_all:
+ filtered.append(hit)
+
+ return filtered
+
+ def _backfill_filtered_results(
+ self,
+ initial_hits: list,
+ page: int,
+ page_size: int,
+ filters: list[tuple],
+        query: str | None = None,
+ ) -> tuple:
+ """
+        Backfill filtered results by fetching additional pages until
+        page_size is met or a 30-second time budget is exhausted
+ """
+ filtered = initial_hits[:]
+ current_page = page
+ start_time = time.time()
+ pages_fetched = 1
+
+ while len(filtered) < page_size:
+ if time.time() - start_time > 30:
+ break
+
+ current_page += 1
+
+ try:
+ params = {"page": current_page, "pageSize": page_size}
+ headers = {"Accept": "application/json", "User-Agent": "BioStudies-VHP4Safety-App/1.0"}
+
+ if query:
+ params["query"] = query
+
+ response = requests.get(self.search_url, headers=headers, params=params, timeout=30)
+ if response.status_code != 200:
+ break
+
+ data = response.json()
+ next_hits = data.get("hits", [])
+ if not next_hits:
+ break
+
+ next_hits = self._hit_metadata(next_hits)
+ next_filtered = self._apply_filters(next_hits, filters)
+ filtered.extend(next_filtered)
+ pages_fetched += 1
+
+ except Exception:
+ break
+
+ page_size_met = len(filtered) >= page_size
+ return filtered[:page_size], page_size_met, pages_fetched
+
+ # -----------------------------
+    # Metadata parsing
+ # -----------------------------
+ def parse_metadata(self, raw_data: dict, *, validate_files: bool = True, file_timeout=(3.05, 10)):
+ """
+        Parse and structure the metadata from a BioStudies API response.
+
+        Files are extracted only here (enriched and deduplicated), not in
+        _extract_comprehensive_metadata(), which prevents duplicates and
+        keeps the structure consistent.
+ """
+ try:
+ metadata = {
+ "accession": raw_data.get("accno", "N/A"),
+ "title": raw_data.get("title", "N/A"),
+ "description": raw_data.get("description", "N/A"),
+ "release_date": raw_data.get("rdate", raw_data.get("ReleaseDate", "N/A")),
+ "modification_date": raw_data.get("mdate", "N/A"),
+ "type": raw_data.get("type", "N/A"),
+
+ # VHP4Safety filterable fields
+ "case_study": "",
+ "regulatory_question": "",
+ "flow_step": "",
+ "collection": "",
+
+ "attributes": [],
+ "authors": [],
+ "files": [],
+ "links": [],
+ "protocols": [],
+ "publications": [],
+ "organizations": [],
+
+ "biological_context": {},
+ "technical_details": {},
+ "experimental_design": {},
+
+ "raw_data": raw_data,
+ }
+
+ # ---- helpers
+ def _norm_attr_name(attr: dict) -> str:
+ return (attr.get("name") or "").strip().lower()
+
+ def _attr_value(attr: dict) -> str:
+ v = attr.get("value", "")
+ return "" if v is None else str(v)
+
+ def _capture_vhp_fields(attr_name: str, attr_value: str):
+ if attr_name == "attachto":
+ metadata["collection"] = attr_value
+ elif attr_name == "case study":
+ metadata["case_study"] = attr_value
+ elif attr_name == "regulatory question":
+ metadata["regulatory_question"] = attr_value
+ elif attr_name == "process flow step":
+ metadata["flow_step"] = attr_value
+
+ BIO_KEYS = {
+ "organism", "species", "organism part", "organ", "cell type",
+ "tissue", "disease", "disease state", "sample type",
+ }
+ TECH_KEYS = {
+ "platform", "instrument", "assay", "assay type", "library strategy",
+ "library source", "data type", "sequencing mode", "sequencing date",
+ "index adapters", "pipeline",
+ }
+ AUTHOR_KEYS = {"author", "authors", "contact", "submitter"}
+
+ def _categorize(attr_name: str, attr_value: str):
+ if attr_name in BIO_KEYS:
+ metadata["biological_context"][attr_name] = attr_value
+ elif attr_name in TECH_KEYS:
+ metadata["technical_details"][attr_name] = attr_value
+ elif attr_name in AUTHOR_KEYS:
+ if attr_value and attr_value not in metadata["authors"]:
+ metadata["authors"].append(attr_value)
+
+ def _file_attrs_map(fobj: dict) -> dict:
+ out = {}
+ for a in (fobj or {}).get("attributes", []) or []:
+ n = (a.get("name") or "").strip()
+ if n:
+ out[n] = a.get("value")
+ return out
+
+ def _iter_section_files(sec: dict):
+ if not isinstance(sec, dict):
+ return
+ if isinstance(sec.get("files"), list):
+ for f in sec["files"]:
+ yield f
+ if isinstance(sec.get("subsections"), list):
+ for s in sec["subsections"]:
+ yield from _iter_section_files(s)
+
+ seen_files = set()
+
+ def _add_files(files_list):
+ if not isinstance(files_list, list):
+ return
+
+ accno = metadata.get("accession") or raw_data.get("accno") or "N/A"
+
+ for f in files_list:
+ if not isinstance(f, dict):
+ continue
+
+ file_path = (f.get("path") or f.get("name") or f.get("filename") or "").strip()
+ if not file_path:
+ continue
+
+ dedupe_key = f"{accno}::{file_path}"
+ if dedupe_key in seen_files:
+ continue
+ seen_files.add(dedupe_key)
+
+ fam = _file_attrs_map(f)
+ url = self.build_biostudies_https_file_url(accno, file_path)
+
+ entry = {
+ "name": file_path,
+ "path": file_path,
+ "size": f.get("size"),
+ "type": f.get("type"),
+ "description": fam.get("Description") or fam.get("description") or "",
+ "file_kind": fam.get("Type") or fam.get("type") or "",
+ "attributes": f.get("attributes", []),
+ "url": url,
+ "exists_check": None,
+ "raw": f,
+ }
+
+ if validate_files and url:
+ entry["exists_check"] = self.url_exists_no_download(url, timeout=file_timeout)
+
+ metadata["files"].append(entry)
+
+ # ---- top-level attributes
+ if isinstance(raw_data.get("attributes"), list):
+ for attr in raw_data["attributes"]:
+ if not isinstance(attr, dict):
+ continue
+ name_raw = attr.get("name", "")
+ attr_name = _norm_attr_name(attr)
+ value = _attr_value(attr)
+
+ metadata["attributes"].append({"name": name_raw, "value": value})
+ _capture_vhp_fields(attr_name, value)
+ _categorize(attr_name, value)
+
+ # ---- org lookup
+ organization_lookup = {}
+ if isinstance(raw_data.get("section"), dict):
+ self._build_organization_lookup(raw_data["section"], organization_lookup)
+
+ # ---- section attributes
+ section = raw_data.get("section") if isinstance(raw_data.get("section"), dict) else None
+ if section and isinstance(section.get("attributes"), list):
+ for attr in section["attributes"]:
+ if not isinstance(attr, dict):
+ continue
+ name_raw = attr.get("name", "")
+ attr_name = _norm_attr_name(attr)
+ value = _attr_value(attr)
+
+ if attr_name == "title" and (metadata["title"] == "N/A" or not metadata["title"]):
+ metadata["title"] = value
+ elif attr_name == "description" and (metadata["description"] == "N/A" or not metadata["description"]):
+ metadata["description"] = value
+
+ _capture_vhp_fields(attr_name, value)
+ _categorize(attr_name, value)
+ metadata["attributes"].append({"name": name_raw, "value": value})
+
+ # ---- comprehensive extraction (NO FILES inside this anymore!)
+ if section:
+ self._extract_comprehensive_metadata(section, metadata, organization_lookup)
+
+ # ---- files (enriched, deduped)
+ if section:
+ _add_files(list(_iter_section_files(section)))
+ if isinstance(raw_data.get("files"), list):
+ _add_files(raw_data["files"])
+
+ # ---- links + publications
+ def _add_links(links_list):
+ if not isinstance(links_list, list):
+ return
+ for link in links_list:
+ if not isinstance(link, dict):
+ continue
+ link_data = {
+ "url": link.get("url", ""),
+ "type": link.get("type", ""),
+ "description": link.get("description", ""),
+ "attributes": link.get("attributes", []),
+ }
+ metadata["links"].append(link_data)
+
+ link_type = (link.get("type", "") or "").lower()
+ if ("doi" in link_type) or ("pubmed" in link_type) or ("publication" in link_type):
+ metadata["publications"].append(link_data)
+
+ _add_links(raw_data.get("links"))
+ if section:
+ _add_links(section.get("links"))
+
+            # pick the RO-Crate file from the collected files (filename must contain "rocrate")
+            rocrate = self._pick_rocrate_file(metadata.get("files", []))
+            metadata["rocrate_file"] = rocrate  # full dict (name/path/url/size/exists_check...)
+            metadata["rocrate_url"] = rocrate.get("url") if isinstance(rocrate, dict) else None
+
+ return metadata
+
+ except Exception as e:
+ return {"error": f"Failed to parse metadata: {str(e)}", "raw_data": raw_data}
+
+ # -----------------------------
+ # Organisation lookup / deep extraction
+ # -----------------------------
+ def _build_organization_lookup(self, section, org_lookup):
+ """Build a lookup table for organization references"""
+ if isinstance(section, dict):
+ if section.get("type", "").lower() in ["organization", "organisation"]:
+ org_id = section.get("accno", "")
+ if org_id and "attributes" in section:
+ org_data = {}
+ for attr in section["attributes"]:
+ attr_name = (attr.get("name", "") or "").lower()
+ attr_value = attr.get("value", "")
+ if attr_name in ["name", "organization", "email", "address", "department", "affiliation"]:
+ org_data[attr_name] = attr_value
+ if org_data:
+ org_lookup[org_id] = org_data
+
+ if "subsections" in section:
+ for subsection in section["subsections"]:
+ self._build_organization_lookup(subsection, org_lookup)
+
+ elif isinstance(section, list):
+ for item in section:
+ self._build_organization_lookup(item, org_lookup)
+
+ def _extract_comprehensive_metadata(self, section, metadata, organization_lookup=None):
+ """
+ Comprehensively extract metadata from sections/subsections.
+
+        Note: files are intentionally not appended here (to avoid
+        duplicates); they are collected in parse_metadata().
+ """
+ if organization_lookup is None:
+ organization_lookup = {}
+
+ if isinstance(section, dict):
+ # ---- protocols
+ if section.get("type", "").lower() == "protocols" or "protocol" in section.get("type", "").lower():
+ if "subsections" in section:
+ for protocol in section["subsections"]:
+ protocol_data = {
+ "type": protocol.get("type", ""),
+ "description": protocol.get("description", ""),
+ "attributes": [],
+ }
+
+ if "attributes" in protocol:
+ for attr in protocol["attributes"]:
+ protocol_data["attributes"].append(
+ {"name": attr.get("name", ""), "value": attr.get("value", "")}
+ )
+
+ metadata["protocols"].append(protocol_data)
+
+ # ---- author and organization information
+ if section.get("type", "").lower() in ["author", "contact", "person"]:
+ if "attributes" in section:
+ author_info = {}
+ author_affiliation_ref = None
+
+ for attr in section["attributes"]:
+ attr_name = (attr.get("name", "") or "").lower()
+ attr_value = attr.get("value", "")
+
+ if attr_name in ["name", "first name", "last name", "email", "e-mail", "orcid"]:
+ author_info[attr_name] = attr_value
+ elif attr_name == "affiliation" and attr.get("reference"):
+ author_affiliation_ref = attr_value
+
+ if author_info:
+ author_name = author_info.get("name", "")
+ if not author_name:
+ first = author_info.get("first name", "")
+ last = author_info.get("last name", "")
+ author_name = f"{first} {last}".strip()
+
+ email = author_info.get("email") or author_info.get("e-mail", "")
+ orcid = author_info.get("orcid") or None
+
+ author_entry = {
+ "name": author_name,
+ "email": email,
+ "orcid": orcid,
+ "affiliation_ref": author_affiliation_ref,
+ "affiliation_name": "",
+ }
+
+ if author_affiliation_ref and author_affiliation_ref in organization_lookup:
+ resolved_org = organization_lookup[author_affiliation_ref]
+ author_entry["affiliation_name"] = resolved_org.get("name", "")
+
+ if author_name:
+ existing_author = next(
+ (a for a in metadata.get("author_details", []) if a.get("name") == author_name),
+ None,
+ )
+ if not existing_author:
+ metadata.setdefault("author_details", []).append(author_entry)
+
+ if author_name not in metadata["authors"]:
+ metadata["authors"].append(author_name)
+
+ # ---- experimental design info
+ if "attributes" in section:
+ for attr in section["attributes"]:
+ attr_name = (attr.get("name", "") or "").lower()
+ attr_value = attr.get("value", "")
+
+ if attr_name in ["experimental factor", "variable", "treatment", "condition", "time point"]:
+ metadata["experimental_design"].setdefault("factors", []).append(
+ {"name": attr_name, "value": attr_value}
+ )
+
+ # ---- recurse
+ if "subsections" in section:
+ for subsection in section["subsections"]:
+ self._extract_comprehensive_metadata(subsection, metadata, organization_lookup)
+
+ elif isinstance(section, list):
+ for item in section:
+ self._extract_comprehensive_metadata(item, metadata, organization_lookup)
\ No newline at end of file
diff --git a/src/models/data/mapping.py b/src/models/data/mapping.py
new file mode 100644
index 0000000..b65fc69
--- /dev/null
+++ b/src/models/data/mapping.py
@@ -0,0 +1,526 @@
+from typing import Any, Dict, List, Optional, Tuple
+import re
+
+# ---------- small helpers ----------
+
+# Matches a DOI anywhere in text; the character class excludes whitespace,
+# quotes, and angle brackets so trailing markup is not captured.
+DOI_RE = re.compile(r'\b10\.\d{4,9}/[^\s"<>]+', re.IGNORECASE)
+
+def is_valid_doi(doi: Optional[str]) -> bool:
+ """Basic DOI sanity check. Rejects obvious redactions like '***'."""
+ if not doi or not isinstance(doi, str):
+ return False
+ d = doi.strip()
+ if "*" in d: # handles 10.5281/zenodo.*** etc.
+ return False
+ if not d.lower().startswith("10."):
+ return False
+ if "/" not in d:
+ return False
+ return True
+
+def g(d: Dict[str, Any], *path: str, default=None):
+ """Safe nested-get. Never raises KeyError."""
+ cur: Any = d
+ for key in path:
+ if isinstance(cur, dict) and key in cur:
+ cur = cur[key]
+ else:
+ return default
+ return cur
+
+def first(*vals, default=None):
+ """Return first non-empty (not None, not '' , not []) value."""
+ for v in vals:
+ if v is None:
+ continue
+ if v == "":
+ continue
+ if isinstance(v, (list, dict)) and len(v) == 0:
+ continue
+ return v
+ return default
+
+def find_attr(attrs: Any, name: str) -> Optional[str]:
+ """Find BioStudies attribute list entry with given name."""
+ if not isinstance(attrs, list):
+ return None
+ for a in attrs:
+ if isinstance(a, dict) and a.get("name") == name:
+ return a.get("value")
+ return None
+
+def extract_doi_from_text(text: Any) -> Optional[str]:
+ """Extract a DOI from a string (or return None)."""
+ if not isinstance(text, str) or not text:
+ return None
+ m = DOI_RE.search(text)
+ doi = m.group(0) if m else None
+ return doi if is_valid_doi(doi) else None
+
+def extract_all_dois(text: Any) -> List[str]:
+ """Extract all valid DOIs from a string."""
+ if not isinstance(text, str) or not text:
+ return []
+ dois = []
+ for m in DOI_RE.finditer(text):
+ d = m.group(0)
+ if is_valid_doi(d):
+ dois.append(d)
+ return dois
+
+def doi_url(doi: Optional[str]) -> Optional[str]:
+ """Convert DOI to https://doi.org/..."""
+ if not doi:
+ return None
+ d = doi.strip()
+ if d.lower().startswith("http"):
+ return d
+ return f"https://doi.org/{d}"
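+
+# Quick sanity checks for the helpers above (derived from their logic):
+#     is_valid_doi("10.5281/zenodo.1234567")  -> True
+#     is_valid_doi("10.5281/zenodo.***")      -> False (redacted placeholder)
+#     doi_url("10.1234/abcd")                 -> "https://doi.org/10.1234/abcd"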
+
+# ---------- DOI + publications extraction ----------
+
+def find_doi_anywhere(item: Dict[str, Any]) -> Optional[str]:
+ """
+ Best-effort *dataset DOI* extractor.
+    NOTE: intentionally does NOT search BioStudies raw_data publication
+    subsections, because those are *linked publications*, not the dataset DOI.
+ """
+ # direct keys first (dataset DOI)
+ doi = first(item.get("doi"), g(item, "metadata", "doi"))
+ doi = extract_doi_from_text(doi) or doi
+ if is_valid_doi(doi):
+ return doi
+
+ # Zenodo: related identifiers (sometimes contains dataset DOI, but usually pubs)
+ rel = g(item, "metadata", "related_identifiers", default=[]) or []
+ if isinstance(rel, list):
+ for r in rel:
+ if not isinstance(r, dict):
+ continue
+ ident = r.get("identifier")
+ scheme = (r.get("scheme") or "").lower()
+ if scheme == "doi":
+ found = extract_doi_from_text(ident) or ident
+ if is_valid_doi(found):
+ return found
+ found = extract_doi_from_text(ident)
+ if found:
+ return found
+
+ # BioStudies: attributes (dataset DOI if present)
+ attrs = g(item, "metadata", "attributes", default=[]) or []
+ for key in ("DOI", "doi", "Dataset DOI"):
+ v = find_attr(attrs, key)
+ found = extract_doi_from_text(v)
+ if found:
+ return found
+
+ # BioStudies: publications list (if present) - ambiguous; keep as last resort
+ pubs = g(item, "metadata", "publications", default=[]) or []
+ if isinstance(pubs, list):
+ for p in pubs:
+ if not isinstance(p, dict):
+ continue
+ for cand in (p.get("doi"), p.get("identifier"), p.get("url")):
+ found = extract_doi_from_text(cand)
+ if found:
+ return found
+
+ # last resort: description text
+ desc = first(g(item, "metadata", "description"), item.get("description"))
+ found = extract_doi_from_text(desc)
+ if found:
+ return found
+
+ return None
+
+def _dedup_publications(pubs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+ """Deduplicate publications by DOI (preferred) or URL."""
+ seen = set()
+ out = []
+ for p in pubs:
+ doi = (p.get("doi") or "").lower().strip()
+ url = (p.get("url") or "").lower().strip()
+ key = doi or url
+ if not key:
+ continue
+ if key in seen:
+ continue
+ seen.add(key)
+ out.append(p)
+ return out
+
+def extract_publications_zenodo(z: Dict[str, Any]) -> List[Dict[str, Any]]:
+ """
+ Extract linked publications from Zenodo record.
+ Sources:
+ - metadata.related_identifiers
+ - metadata.references (list of strings)
+ - DOIs embedded in metadata.description (optional, but useful)
+ """
+ pubs: List[Dict[str, Any]] = []
+
+ dataset_doi = find_doi_anywhere(z)
+ concept_doi = first(z.get("conceptdoi"), g(z, "metadata", "conceptdoi"))
+ concept_doi = extract_doi_from_text(concept_doi) or concept_doi
+ if not is_valid_doi(concept_doi):
+ concept_doi = None
+
+ rel = g(z, "metadata", "related_identifiers", default=[]) or []
+ if isinstance(rel, list):
+ for r in rel:
+ if not isinstance(r, dict):
+ continue
+ ident = r.get("identifier")
+ scheme = (r.get("scheme") or "").lower()
+ relation = (r.get("relation") or "").lower()
+
+            # NOTE: every DOI- or URL-like related identifier is accepted here,
+            # regardless of whether resource_type says "publication" or the
+            # relation is citation-like; the dataset DOI and concept DOI are
+            # excluded below, which filters out self-references.
+
+ doi = None
+ url = None
+
+ if scheme == "doi":
+ doi = extract_doi_from_text(ident) or (ident.strip() if isinstance(ident, str) else None)
+ if not is_valid_doi(doi):
+ doi = None
+ url = doi_url(doi) if doi else None
+ elif scheme == "url":
+ url = ident.strip() if isinstance(ident, str) else None
+ doi = extract_doi_from_text(url)
+ else:
+ # Unknown scheme: try DOI extraction
+ doi = extract_doi_from_text(ident)
+ url = doi_url(doi) if doi else (ident.strip() if isinstance(ident, str) else None)
+
+ # Exclude dataset DOI / concept DOI if they appear
+ if doi and (doi == dataset_doi or doi == concept_doi):
+ continue
+
+ if doi or url:
+ pubs.append({
+ "doi": doi,
+ "doi_url": doi_url(doi) if doi else None,
+ "url": url,
+ "relation": relation or None,
+ "resource_type": r.get("resource_type"),
+ "source": "zenodo.related_identifiers",
+ })
+
+ refs = g(z, "metadata", "references", default=[]) or []
+ if isinstance(refs, list):
+ for ref in refs:
+ doi = extract_doi_from_text(ref)
+ if doi and doi not in {dataset_doi, concept_doi}:
+ pubs.append({
+ "doi": doi,
+ "doi_url": doi_url(doi),
+ "url": doi_url(doi),
+ "relation": "references",
+ "resource_type": "publication",
+ "source": "zenodo.references",
+ })
+
+ # Optional: mine description for DOI links (often present as doi.org/10.xxxx/...)
+ desc = g(z, "metadata", "description")
+ for doi in extract_all_dois(desc):
+ if doi not in {dataset_doi, concept_doi}:
+ pubs.append({
+ "doi": doi,
+ "doi_url": doi_url(doi),
+ "url": doi_url(doi),
+ "relation": "mentions",
+ "resource_type": "publication",
+ "source": "zenodo.description",
+ })
+
+ return _dedup_publications(pubs)
+
+def extract_publications_biostudies(b: Dict[str, Any]) -> List[Dict[str, Any]]:
+ """
+ Extract linked publications from BioStudies record.
+ Sources:
+ - metadata.publications (if present)
+ - metadata.raw_data.section.subsections entries of type 'Publication'
+ """
+ pubs: List[Dict[str, Any]] = []
+ meta = b.get("metadata", {}) or {}
+
+ # 1) metadata.publications (sometimes already structured)
+ meta_pubs = meta.get("publications", []) or []
+ if isinstance(meta_pubs, list):
+ for p in meta_pubs:
+ if isinstance(p, dict):
+ doi = extract_doi_from_text(first(p.get("doi"), p.get("identifier"), p.get("url")))
+ url = first(p.get("url"), doi_url(doi))
+ if doi or url:
+ pubs.append({
+ "title": p.get("title"),
+ "doi": doi,
+ "doi_url": doi_url(doi) if doi else None,
+ "url": url,
+ "pmid": p.get("pmid") or p.get("PMID"),
+ "year": p.get("year") or p.get("Year"),
+ "authors": p.get("authors") or p.get("Authors"),
+ "source": "biostudies.metadata.publications",
+ })
+ elif isinstance(p, str):
+ doi = extract_doi_from_text(p)
+ if doi:
+ pubs.append({
+ "doi": doi,
+ "doi_url": doi_url(doi),
+ "url": doi_url(doi),
+ "source": "biostudies.metadata.publications",
+ })
+
+ # 2) raw_data.section.subsections: type == Publication
+ subs = g(b, "metadata", "raw_data", "section", "subsections", default=[]) or []
+ if isinstance(subs, list):
+ for s in subs:
+ if not isinstance(s, dict):
+ continue
+ stype = str(s.get("type", "")).strip().lower()
+ if stype != "publication":
+ continue
+
+ # flatten attributes into dict
+ attrs = s.get("attributes") or []
+ flat: Dict[str, Any] = {}
+ if isinstance(attrs, list):
+ for a in attrs:
+ if isinstance(a, dict) and a.get("name"):
+ flat[a["name"]] = a.get("value")
+
+ doi = extract_doi_from_text(flat.get("DOI") or flat.get("doi"))
+ pmid = flat.get("PMID") or flat.get("pmid")
+ title = flat.get("Title") or flat.get("title")
+ year = flat.get("Year") or flat.get("year")
+ authors = flat.get("Authors") or flat.get("Author") or flat.get("authors")
+
+ url = doi_url(doi) if doi else None
+
+ if doi or pmid or title:
+ pubs.append({
+ "title": title,
+ "doi": doi,
+ "doi_url": doi_url(doi) if doi else None,
+ "url": url,
+ "pmid": pmid,
+ "year": year,
+ "authors": authors,
+ "journal": flat.get("Journal") or flat.get("journal"),
+ "volume": flat.get("Volume") or flat.get("volume"),
+ "issue": flat.get("Issue") or flat.get("issue"),
+ "type": flat.get("Type") or flat.get("type"),
+ "issn": flat.get("Issn") or flat.get("ISSN"),
+ "source": "biostudies.raw_data.section.subsections",
+ })
+
+ return _dedup_publications(pubs)
+
+# ---------- Zenodo normalizer ----------
+
+def normalize_zenodo(z: Dict[str, Any]) -> Dict[str, Any]:
+ creators = g(z, "metadata", "creators", default=[]) or []
+ grants = g(z, "metadata", "grants", default=[]) or []
+ files = z.get("files", []) or []
+
+ doi = find_doi_anywhere(z)
+ if not is_valid_doi(doi):
+ doi = None
+
+ publications = extract_publications_zenodo(z)
+
+ return {
+ "title": first(g(z, "metadata", "title"), z.get("title")),
+ "description": first(g(z, "metadata", "description")),
+ "license": first(g(z, "metadata", "license", "id")),
+ "authors": [
+ {
+ "name": c.get("name"),
+ "orcid": c.get("orcid"),
+ "affiliation": c.get("affiliation"),
+ }
+ for c in creators
+ if isinstance(c, dict)
+ ],
+ "funding": [
+ {
+ "funder": g(gr, "funder", "name"),
+ "funder_doi": g(gr, "funder", "doi"),
+ "acronym": gr.get("acronym"),
+ "title": gr.get("title"),
+ "code": gr.get("code"),
+ "url": gr.get("url"),
+ }
+ for gr in grants
+ if isinstance(gr, dict)
+ ],
+ "ReleaseDate": first(g(z, "metadata", "publication_date"), z.get("created")),
+ "id": first(z.get("id"), z.get("recid")),
+ "type": first(g(z, "metadata", "resource_type", "type"), "dataset"),
+ "version": first(g(z, "metadata", "version")),
+ "files": [
+ {
+ "name": f.get("key"),
+ "size": f.get("size"),
+ "checksum": f.get("checksum"),
+ "url": g(f, "links", "self"),
+ }
+ for f in files
+ if isinstance(f, dict)
+ ],
+ "url": first(z.get("url"), g(z, "links", "self_html"), g(z, "links", "self")),
+
+ # dataset DOI
+ "doi": doi,
+ "doi_url": doi_url(doi),
+
+ "conceptdoi": first(z.get("conceptdoi"), g(z, "metadata", "conceptdoi")),
+ "conceptdoi_url": doi_url(first(z.get("conceptdoi"), g(z, "metadata", "conceptdoi"))),
+
+ # NEW: linked publications
+ "publications": publications,
+ }
+
+# ---------- BioStudies normalizer ----------
+
+def normalize_biostudies(b: Dict[str, Any]) -> Dict[str, Any]:
+ meta = b.get("metadata", {}) or {}
+ attrs = meta.get("attributes", []) or []
+ files = meta.get("files", []) or []
+
+ author_details = meta.get("author_details", []) or []
+ authors = meta.get("authors", []) or []
+
+ if isinstance(author_details, list) and len(author_details) > 0:
+ authors_norm = [
+ {
+ "name": a.get("name"),
+ "orcid": a.get("orcid"),
+ "affiliation": a.get("affiliation_name") or a.get("affiliation_ref"),
+ "email": a.get("email"),
+ }
+ for a in author_details
+ if isinstance(a, dict)
+ ]
+ else:
+ authors_norm = [
+ {"name": name, "orcid": None, "affiliation": None}
+ for name in authors
+ if isinstance(name, str)
+ ]
+
+ # funding best-effort (normalized)
+ funding: List[Dict[str, Any]] = []
+ subsections = g(b, "metadata", "raw_data", "section", "subsections", default=[]) or []
+ if isinstance(subsections, list):
+ for s in subsections:
+ if not isinstance(s, dict):
+ continue
+ if str(s.get("type", "")).strip().lower() != "funding":
+ continue
+
+ flat = {}
+ for a in s.get("attributes") or []:
+ if isinstance(a, dict) and a.get("name"):
+ flat[a["name"]] = a.get("value")
+
+ if not flat:
+ continue
+
+ funder = first(flat.get("Funder"), flat.get("Agency"), flat.get("Funding agency"), flat.get("Agency name"))
+ code = first(flat.get("Grant_id"), flat.get("Grant ID"), flat.get("Grant"), flat.get("Grant number"))
+ url = first(flat.get("URL"), flat.get("Url"), flat.get("Project URL"))
+
+ funding.append({
+ "funder": funder,
+ "code": code,
+ "url": url,
+ "acronym": flat.get("Acronym") or flat.get("Programme") or flat.get("Program"),
+ "raw": flat,
+ "source": "biostudies.raw_data.section.subsections",
+ })
+
+ doi = find_doi_anywhere(b)
+ if not is_valid_doi(doi):
+ doi = None
+
+ publications = extract_publications_biostudies(b)
+
+    # files: pass through the URL already built in metadata (do not rebuild it here)
+ files_norm: List[Dict[str, Any]] = []
+ for f in files:
+ if not isinstance(f, dict):
+ continue
+ files_norm.append({
+ "name": first(f.get("name"), f.get("path")),
+ "size": f.get("size"),
+ "path": f.get("path"),
+ "url": f.get("url"), # <-- do not rebuild
+ # optional (keep if useful)
+ "exists": g(f, "exists_check", "exists"),
+ "content_length": g(f, "exists_check", "content_length"),
+ })
+
+    # Optional strictness: attach a warning if any file is missing a URL
+ missing = [x.get("path") for x in files_norm if x.get("path") and not x.get("url")]
+ if missing:
+ meta.setdefault("warnings", []).append(
+ f"{len(missing)} BioStudies file(s) missing url in metadata.files (pass-through mode)."
+ )
+
+ return {
+ "title": first(meta.get("title"), b.get("title")),
+ "description": first(meta.get("description")),
+ "license": first(find_attr(attrs, "License")),
+ "authors": authors_norm,
+ "funding": funding,
+ "ReleaseDate": first(
+ b.get("release_date"),
+ find_attr(attrs, "ReleaseDate"),
+ find_attr(attrs, "Release Date"),
+ ),
+ "id": first(meta.get("accession"), b.get("accession"), b.get("id")),
+ "type": first(b.get("type"), meta.get("type"), "study"),
+ "version": first(meta.get("version")),
+ "files": files_norm,
+ "url": first(b.get("url")),
+ "doi": doi,
+ "doi_url": doi_url(doi),
+ "publications": publications,
+ }
+
+# ---------- combine ----------
+
+def normalize_all(
+ bs_entries: List[Dict[str, Any]],
+ zenodo_entries: List[Dict[str, Any]],
+) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
+ """
+ Adds 'norm_metadata' to each dict in both lists and returns a 2-tuple
+ (bs_entries, zenodo_entries) with 'norm_metadata' populated.
+ Robust: ignores non-dicts and missing lists.
+ """
+
+ for z in zenodo_entries or []:
+ if isinstance(z, dict):
+ z["norm_metadata"] = normalize_zenodo(z)
+
+ for b in bs_entries or []:
+ if isinstance(b, dict):
+ b["norm_metadata"] = normalize_biostudies(b)
+
+ return bs_entries, zenodo_entries
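+
+
+# Usage sketch (illustrative; real records come from the extractors):
+#
+#     bs_records = [{"accession": "S-VHPS21", "metadata": {"title": "Example"}}]
+#     bs, zen = normalize_all(bs_entries=bs_records, zenodo_entries=[])
+#     assert bs[0]["norm_metadata"]["title"] == "Example"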
\ No newline at end of file
diff --git a/src/models/data/schemas.py b/src/models/data/schemas.py
new file mode 100644
index 0000000..23dd05a
--- /dev/null
+++ b/src/models/data/schemas.py
@@ -0,0 +1,245 @@
+"""Pydantic models for normalized dataset metadata (BioStudies & Zenodo)."""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from pydantic import BaseModel, Field
+
+
+# ── Shared / reusable sub-models ──────────────────────────────────────────
+
+
+class Author(BaseModel):
+ """Normalised author/creator."""
+
+ name: Optional[str] = None
+ orcid: Optional[str] = None
+ affiliation: Optional[str] = None
+ email: Optional[str] = None
+
+
+class Funding(BaseModel):
+ """Normalised funding entry."""
+
+ funder: Optional[str] = None
+ funder_doi: Optional[str] = None
+ acronym: Optional[str] = None
+ title: Optional[str] = None
+ code: Optional[str] = None
+ url: Optional[str] = None
+ raw: Optional[dict[str, Any]] = None
+ source: Optional[str] = None
+
+
+class DataFile(BaseModel):
+ """Normalised file entry (common to both sources)."""
+
+ name: Optional[str] = None
+ path: Optional[str] = None
+ size: Optional[int] = None
+ checksum: Optional[str] = None
+ url: Optional[str] = None
+ exists: Optional[bool] = None
+ content_length: Optional[str] = None
+
+ model_config = {"extra": "allow"}
+
+
+class Publication(BaseModel):
+ """Linked publication extracted from a dataset record."""
+
+ title: Optional[str] = None
+ doi: Optional[str] = None
+ doi_url: Optional[str] = None
+ url: Optional[str] = None
+ pmid: Optional[str] = None
+ year: Optional[str] = None
+ authors: Optional[str] = None
+ journal: Optional[str] = None
+ volume: Optional[str] = None
+ issue: Optional[str] = None
+ type: Optional[str] = None
+ issn: Optional[str] = None
+ relation: Optional[str] = None
+ resource_type: Optional[str] = None
+ source: Optional[str] = None
+
+
+# ── Top-level normalised metadata ─────────────────────────────────────────
+
+
+class NormalizedMetadata(BaseModel):
+ """Unified normalised metadata for any dataset (Zenodo or BioStudies)."""
+
+ title: Optional[str] = None
+ description: Optional[str] = None
+ license: Optional[str] = None
+ authors: list[Author] = Field(default_factory=list)
+ funding: list[Funding] = Field(default_factory=list)
+ ReleaseDate: Optional[str] = Field(None, alias="ReleaseDate")
+ id: Optional[str | int] = None
+ type: Optional[str] = None
+ version: Optional[str] = None
+ files: list[DataFile] = Field(default_factory=list)
+ url: Optional[str] = None
+ doi: Optional[str] = None
+ doi_url: Optional[str] = None
+ publications: list[Publication] = Field(default_factory=list)
+
+ # Zenodo-specific
+ conceptdoi: Optional[str] = None
+ conceptdoi_url: Optional[str] = None
+
+ model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+# ── BioStudies raw-metadata models ────────────────────────────────────────
+
+
+class Attribute(BaseModel):
+ name: str = ""
+ value: str = ""
+
+
+class BiologicalContext(BaseModel):
+ model_config = {"extra": "allow"}
+
+
+class TechnicalDetails(BaseModel):
+ model_config = {"extra": "allow"}
+
+
+class ExperimentalDesign(BaseModel):
+ factors: list[dict[str, Any]] = Field(default_factory=list)
+
+ model_config = {"extra": "allow"}
+
+
+class ProtocolEntry(BaseModel):
+ type: str = ""
+ description: str = ""
+ attributes: list[Attribute] = Field(default_factory=list)
+
+
+class LinkEntry(BaseModel):
+ url: str = ""
+ type: str = ""
+ description: str = ""
+ attributes: list[dict[str, Any]] = Field(default_factory=list)
+
+
+class FileEntry(BaseModel):
+ """Rich file entry from BioStudies parse_metadata."""
+
+ name: str = ""
+ path: str = ""
+ size: Optional[int] = None
+ type: Optional[str] = None
+ description: str = ""
+ file_kind: str = ""
+ attributes: list[dict[str, Any]] = Field(default_factory=list)
+ url: Optional[str] = None
+ exists_check: Optional[dict[str, Any]] = None
+ raw: Optional[dict[str, Any]] = None
+
+
+class AuthorDetail(BaseModel):
+ name: str = ""
+ email: str = ""
+ orcid: Optional[str] = None
+ affiliation_ref: Optional[str] = None
+ affiliation_name: str = ""
+
+
+class BioStudiesParsedMetadata(BaseModel):
+ """Full structured metadata returned by BioStudiesExtractor.parse_metadata."""
+
+ accession: str = "N/A"
+ title: str = "N/A"
+ description: str = "N/A"
+ release_date: str = "N/A"
+ modification_date: str = "N/A"
+ type: str = "N/A"
+
+ # VHP4Safety filterable fields
+ case_study: str = ""
+ regulatory_question: str = ""
+ flow_step: str = ""
+ collection: str = ""
+
+ attributes: list[Attribute] = Field(default_factory=list)
+ authors: list[str] = Field(default_factory=list)
+ author_details: list[AuthorDetail] = Field(default_factory=list)
+ files: list[FileEntry] = Field(default_factory=list)
+ links: list[LinkEntry] = Field(default_factory=list)
+ protocols: list[ProtocolEntry] = Field(default_factory=list)
+ publications: list[LinkEntry] = Field(default_factory=list)
+ organizations: list[dict[str, Any]] = Field(default_factory=list)
+
+ biological_context: BiologicalContext = Field(default_factory=BiologicalContext)
+ technical_details: TechnicalDetails = Field(default_factory=TechnicalDetails)
+ experimental_design: ExperimentalDesign = Field(default_factory=ExperimentalDesign)
+
+ rocrate_file: Optional[dict[str, Any]] = None
+ rocrate_url: Optional[str] = None
+
+ url: str = ""
+ raw_data: Optional[dict[str, Any]] = None
+
+ model_config = {"extra": "allow"}
+
+
+# ── Zenodo parsed-metadata model ──────────────────────────────────────────
+
+
+class ZenodoFileEntry(BaseModel):
+ id: Optional[str] = None
+ key: Optional[str] = None
+ size: Optional[int] = None
+ checksum: Optional[str] = None
+ links: dict[str, Any] = Field(default_factory=dict)
+
+
+class ZenodoParsedMetadata(BaseModel):
+ """Full structured metadata returned by ZenodoExtractor.parse_metadata."""
+
+ id: Optional[int | str] = None
+ recid: Optional[int | str] = None
+ doi: Optional[str] = None
+ doi_url: Optional[str] = None
+ title: str = "N/A"
+ description: str = "N/A"
+ publication_date: str = "N/A"
+ access_right: Optional[str] = None
+ creators: list[dict[str, Any]] = Field(default_factory=list)
+ keywords: list[str] = Field(default_factory=list)
+ resource_type: dict[str, Any] = Field(default_factory=dict)
+ license: dict[str, Any] = Field(default_factory=dict)
+ grants: list[dict[str, Any]] = Field(default_factory=list)
+ communities: list[dict[str, Any]] = Field(default_factory=list)
+ related_identifiers: list[dict[str, Any]] = Field(default_factory=list)
+ files: list[ZenodoFileEntry] = Field(default_factory=list)
+ links: dict[str, Any] = Field(default_factory=dict)
+ stats: dict[str, Any] = Field(default_factory=dict)
+ is_rocrate: bool = False
+
+ url: str = ""
+ raw: Optional[dict[str, Any]] = None
+
+ model_config = {"extra": "allow"}
+
+
+# ── URL-existence check result ────────────────────────────────────────────
+
+
+class UrlExistsResult(BaseModel):
+ """Result of a HEAD / Range probe to check file existence."""
+
+ url: Optional[str] = None
+ exists: bool = False
+ status_code: Optional[int] = None
+ content_length: Optional[str] = None
+ final_url: Optional[str] = None
+ error: Optional[str] = None
+ method: Optional[str] = None
diff --git a/src/models/data/zenodo.py b/src/models/data/zenodo.py
new file mode 100644
index 0000000..17f0820
--- /dev/null
+++ b/src/models/data/zenodo.py
@@ -0,0 +1,484 @@
+from __future__ import annotations
+
+import json
+import re
+import time
+from typing import Any
+
+import requests
+
+
+class ZenodoExtractor:
+ """Extractor for interacting with the Zenodo Records API.
+
+ Defaults to community 'vhp4safety' and record type 'dataset' to match
+ the user's request. Optional access_token may be provided for higher
+ rate limits or accessing private records.
+ """
+
+ def __init__(
+ self,
+ access_token: str | None = None,
+ community: str = "vhp4safety",
+ record_type: str = "dataset",
+ base_url: str = "https://zenodo.org/api/records",
+ ) -> None:
+ self.base_url = base_url
+ self.community = community
+ self.record_type = record_type
+ self.session = requests.Session()
+ self.headers = {
+ "Accept": "application/json",
+ "User-Agent": "Zenodo-VHP4Safety-App/1.0",
+ }
+ if access_token:
+ # Use Authorization header when token is provided
+ self.headers["Authorization"] = f"Bearer {access_token}"
+
+ def validate_record_id(self, record_id: Any) -> tuple[bool, Any, str | None]:
+ """Validate a Zenodo record identifier.
+
+ Accepts numeric recid (int or numeric string) or DOI (10.xxxx/...).
+
+ Returns:
+ (is_valid, normalized_id, error_message)
+ """
+ if record_id is None:
+ return False, None, "Record ID is required"
+
+ # numeric recid
+ try:
+ if isinstance(record_id, int):
+ return True, record_id, None
+ if isinstance(record_id, str) and record_id.isdigit():
+ return True, int(record_id), None
+ except Exception:
+ pass
+
+ # DOI pattern
+ if isinstance(record_id, str):
+ # strip DOI url wrapper
+ candidate = record_id.strip()
+ # DOI url like https://doi.org/10.5281/zenodo.1234
+ if candidate.startswith("http") and "doi.org" in candidate:
+ candidate = candidate.split("doi.org/", 1)[-1]
+
+ doi_regex = r"^10\.\d{4,9}/[-._;()/:A-Z0-9]+$"
+ if re.match(doi_regex, candidate, flags=re.IGNORECASE):
+ return True, candidate, None
+
+ return (
+ False,
+ record_id,
+ "Invalid Zenodo record identifier (expect recid or DOI)",
+ )
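+
+    # Accepted inputs, sketched (doctest-style, not executed):
+    #   validate_record_id(1234)    -> (True, 1234, None)
+    #   validate_record_id("1234")  -> (True, 1234, None)
+    #   validate_record_id("https://doi.org/10.5281/zenodo.1234")
+    #                               -> (True, "10.5281/zenodo.1234", None)
+    #   validate_record_id("nope")  -> (False, "nope", <error message>)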
+
+ def build_record_url(self, record_id: Any) -> dict[str, Any]:
+ """Build a public URL for a record identifier (recid or DOI)."""
+ is_valid, normalized, error = self.validate_record_id(record_id)
+ if not is_valid:
+ return {"error": error}
+
+ if isinstance(normalized, int):
+ url = f"https://zenodo.org/records/{normalized}"
+ else:
+ # DOI string
+ url = f"https://doi.org/{normalized}"
+
+ return {"id": normalized, "url": url}
+
+ def get_record_metadata(self, record_id: Any) -> dict[str, Any]:
+ """Retrieve and normalize metadata for a single record.
+
+ If record_id is a DOI string, perform a search for that DOI and
+ return the first match's parsed metadata.
+ """
+ try:
+ is_valid, normalized, validation_error = self.validate_record_id(record_id)
+ if not is_valid:
+ return {"error": validation_error}
+
+ # If numeric recid, retrieve directly
+ if isinstance(normalized, int):
+ url = f"{self.base_url}/{normalized}"
+ resp = self.session.get(url, headers=self.headers, timeout=30)
+ if resp.status_code == 200:
+ try:
+ data = resp.json()
+ parsed = self.parse_metadata(data)
+ parsed_url = self.build_record_url(normalized).get("url", "")
+ return parsed | {"url": parsed_url}
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from Zenodo API: {e}"}
+ elif resp.status_code == 404:
+ return {"error": f"Record '{normalized}' not found."}
+ else:
+ return {"error": f"Zenodo API returned status {resp.status_code}."}
+
+ # DOI case: search for DOI
+ doi = normalized
+ query = f'doi:"{doi}"'
+ search = self.search_records(
+ query=query, page=1, size=1, load_metadata=True
+ )
+ if "error" in search:
+ return search
+ hits = search.get("hits", [])
+ if not hits:
+ return {"error": f"Record with DOI '{doi}' not found."}
+ # return parsed metadata from first hit
+ first = hits[0]
+ # parsed metadata may be under 'parsed_metadata' or 'metadata'
+ parsed = first.get("parsed_metadata") or first.get("metadata")
+ parsed_url = self.build_record_url(
+ first.get("recid") or first.get("id") or doi
+ ).get(
+ "url",
+ "",
+ )
+ return parsed | {"url": parsed_url}
+
+ except requests.exceptions.Timeout:
+ return {"error": "Request timed out. Zenodo server may be slow."}
+ except requests.exceptions.ConnectionError:
+ return {
+ "error": "Cannot connect to Zenodo server. Check your internet connection."
+ }
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error: {e}"}
+ except Exception as e:
+ return {"error": f"Unexpected error: {e}"}
+
+ def search_records(
+ self,
+ query: str = "",
+ page: int = 1,
+ size: int = 25,
+ load_metadata: bool = True,
+        filters: tuple[tuple[str, str], ...] | None = None,
+ ) -> dict[str, Any]:
+ """Search Zenodo records.
+
+ Defaults to the configured community and record_type.
+ """
+ try:
+ if not isinstance(query, str):
+ return {"error": "Query must be a string."}
+
+ # If filters are provided, ensure metadata is loaded
+ filters_applied = bool(filters)
+ if filters_applied:
+ load_metadata = True
+
+ params = {
+ "q": query,
+ "page": page,
+ "size": size,
+ "communities": self.community,
+ "type": self.record_type,
+ }
+
+ resp = self.session.get(
+ self.base_url, headers=self.headers, params=params, timeout=30
+ )
+ if resp.status_code == 200:
+ try:
+ data = resp.json()
+ except json.JSONDecodeError as e:
+ return {"error": f"Invalid JSON response from Zenodo API: {e}"}
+
+ hits = (
+ data.get("hits", {}).get("hits", [])
+ if isinstance(data.get("hits"), dict)
+ else data.get("hits", [])
+ )
+ total = (
+ data.get("hits", {}).get("total")
+ if isinstance(data.get("hits"), dict)
+ else data.get("total", 0)
+ )
+
+ if not data or (isinstance(total, int) and total == 0):
+ return {"error": "No results found.", "hits": []}
+
+ if load_metadata:
+ hits = self._hit_metadata(hits)
+
+ hits = self._hit_url(hits)
+
+ if filters_applied:
+ hits = self._apply_filters(hits, filters)
+
+ page_size_met = len(hits) >= size
+ pages_fetched = 1
+ if not page_size_met:
+ hits, page_size_met, pages_fetched = (
+ self._backfill_filtered_results(
+ hits, page, size, filters, query
+ )
+ )
+
+ return {
+ "totalHits": total,
+ "hits": hits,
+ "hits_returned": len(hits),
+ "page": page,
+ "pageSize": size,
+ "pages_fetched": pages_fetched,
+ "filters_applied": True,
+ "page_size_met": page_size_met,
+ }
+
+ return {"total": total, "hits": hits}
+
+ elif resp.status_code == 400:
+ return {"error": "Bad request. Check your search parameters."}
+ elif resp.status_code == 403:
+ return {
+ "error": "Access forbidden. Community or collection may be restricted."
+ }
+ elif resp.status_code in (500, 503):
+ return {"error": "Zenodo server error. Please try again later."}
+ else:
+ return {"error": f"Zenodo API returned status {resp.status_code}."}
+
+ except requests.exceptions.Timeout:
+ return {"error": "Request timed out. Zenodo server may be slow."}
+ except requests.exceptions.ConnectionError:
+ return {
+ "error": "Cannot connect to Zenodo server. Check your internet connection."
+ }
+ except requests.exceptions.RequestException as e:
+ return {"error": f"Network error: {e}"}
+ except Exception as e:
+ return {"error": f"Unexpected error: {e}"}
+
+ def list_records(
+ self,
+ page: int = 1,
+ size: int = 25,
+ include_urls: bool = False,
+ load_metadata: bool = False,
+        filters: tuple[tuple[str, str], ...] | None = None,
+ ) -> dict[str, Any]:
+ """list records for the configured community/type (wrapper for search_records)."""
+ # If filters provided, require metadata and URLs
+ if filters:
+ load_metadata = True
+ include_urls = True
+
+ result = self.search_records(
+ query="", page=page, size=size, load_metadata=load_metadata, filters=filters
+ )
+
+ if include_urls and "hits" in result:
+ result["hits"] = self._hit_url(result["hits"])
+
+ return result
+
+ def _hit_url(self, hits: list[dict[str, Any]]) -> list[dict[str, Any]]:
+ for hit in hits:
+ # try recid present in different keys
+ recid = (
+ hit.get("recid")
+ or hit.get("id")
+ or (hit.get("metadata", {}).get("doi") if hit.get("metadata") else None)
+ )
+ if recid:
+ try:
+ recid_int = int(recid)
+ hit["url"] = self.build_record_url(recid_int).get("url", "")
+ except Exception:
+ # fallback to DOI url
+ doi = (
+ hit.get("metadata", {}).get("doi")
+ if hit.get("metadata")
+ else None
+ )
+ if doi:
+ hit["url"] = self.build_record_url(doi).get("url", "")
+ return hits
+
+ def _hit_metadata(self, hits: list[dict[str, Any]]) -> list[dict[str, Any]]:
+ """Attach parsed metadata to each hit as 'parsed_metadata'."""
+ for hit in hits:
+ try:
+ # some hits already include top-level fields, but parse consistently
+ parsed = self.parse_metadata(hit)
+ # preserve both raw and parsed
+ hit["parsed_metadata"] = parsed
+ except Exception:
+ hit["parsed_metadata"] = {}
+ return hits
+
+ def _apply_filters(
+        self, hits: list[dict[str, Any]], filters: tuple[tuple[str, str], ...] | None
+ ) -> list[dict[str, Any]]:
+ """Apply AND-filters to hits using parsed metadata when available.
+
+ Field matching is case-insensitive. For list fields (keywords, creators,
+ communities) we match if any element contains the filter value.
+ """
+ if not filters:
+ return hits
+
+ filtered: list[dict[str, Any]] = []
+ for hit in hits:
+ metadata = hit.get("parsed_metadata") or hit.get("metadata") or {}
+ if not metadata:
+ continue
+
+ matches_all = True
+ for field, value in filters:
+ filter_value = value.lower()
+ field_value = metadata.get(field, "")
+
+ if isinstance(field_value, list):
+ # normalize list values to strings
+ found = False
+ for item in field_value:
+ # item may be dict (e.g., creators)
+ if isinstance(item, dict):
+ # try to match on common text fields
+ text = " ".join(
+ str(v) for v in item.values() if isinstance(v, str)
+ )
+ else:
+ text = str(item)
+ if filter_value in text.lower():
+ found = True
+ break
+ if not found:
+ matches_all = False
+ break
+
+ else:
+ if not isinstance(field_value, str):
+ field_value = str(field_value)
+ if (
+ filter_value != field_value.lower()
+ and filter_value not in field_value.lower()
+ ):
+ matches_all = False
+ break
+
+ if matches_all:
+ filtered.append(hit)
+
+ return filtered
+
+ def _backfill_filtered_results(
+ self,
+ initial_hits: list[dict[str, Any]],
+ page: int,
+ page_size: int,
+        filters: tuple[tuple[str, str], ...] | None,
+        query: str | None = None,
+ ) -> tuple[list[dict[str, Any]], bool, int]:
+ """Fetch subsequent pages until page_size filtered results are collected or timeout.
+
+ Returns (filtered_hits_trimmed, page_size_met, pages_fetched).
+ """
+ filtered = initial_hits[:]
+ current_page = page
+ start_time = time.time()
+ pages_fetched = 1
+
+ while len(filtered) < page_size:
+ if time.time() - start_time > 30:
+ break
+
+ current_page += 1
+ try:
+ params = {
+ "q": query or "",
+ "page": current_page,
+ "size": page_size,
+ "communities": self.community,
+ "type": self.record_type,
+ }
+ resp = self.session.get(
+ self.base_url, headers=self.headers, params=params, timeout=30
+ )
+ if resp.status_code != 200:
+ break
+ data = resp.json()
+ next_hits = (
+ data.get("hits", {}).get("hits", [])
+ if isinstance(data.get("hits"), dict)
+ else data.get("hits", [])
+ )
+ if not next_hits:
+ break
+
+ next_hits = self._hit_metadata(next_hits)
+ next_hits = self._hit_url(next_hits)
+ next_filtered = self._apply_filters(next_hits, filters)
+ filtered.extend(next_filtered)
+ pages_fetched += 1
+
+ except Exception:
+ break
+
+ page_size_met = len(filtered) >= page_size
+ return filtered[:page_size], page_size_met, pages_fetched
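+
+    # Filter sketch (field names illustrative): filters AND together and
+    # are matched against each hit's parsed metadata.
+    #
+    #     ZenodoExtractor().search_records(
+    #         query="thyroid",
+    #         filters=(("keywords", "vhp4safety"),),
+    #     )
+    #
+    # When filtering drops a page below `size`, later pages are fetched
+    # (bounded at ~30 s) until the page fills or results run out.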
+
+ def parse_metadata(self, raw_record: dict[str, Any]) -> dict[str, Any]:
+ """Normalize Zenodo record structure into a simpler metadata dict.
+
+ Accepts either a full record returned from /api/records/:id or a hit
+ element from a search response.
+ """
+ try:
+ # Zenodo typically nests useful fields under 'metadata'
+ raw = raw_record.get("metadata", raw_record)
+
+ metadata: dict[str, Any] = {
+ "id": raw_record.get("id")
+ or raw_record.get("recid")
+ or raw.get("recid"),
+ "recid": raw_record.get("recid") or raw_record.get("id"),
+ "doi": raw.get("doi"),
+ "doi_url": raw_record.get("doi_url") or raw.get("doi_url"),
+ "title": raw.get("title", "N/A"),
+ "description": raw.get("description", "N/A"),
+ "publication_date": raw.get(
+ "publication_date", raw.get("publication_date", "N/A")
+ ),
+ "access_right": raw.get("access_right"),
+ "creators": raw.get("creators", []),
+ "keywords": raw.get("keywords", []),
+ "resource_type": raw.get("resource_type", {}),
+ "license": raw.get("license", {}),
+ "grants": raw.get("grants", []),
+ "communities": raw.get("communities", []),
+ "related_identifiers": raw.get(
+ "related_identifiers", raw.get("related_identifiers", [])
+ ),
+ "files": [],
+ "links": raw_record.get("links", {}),
+ "stats": raw_record.get("stats", {}),
+ "raw": raw_record,
+ }
+
+ # Extract files if available at top-level or under raw
+ files = raw_record.get("files") or raw.get("files") or []
+ is_rocrate = False
+ for f in files:
+ if f.get("key", "").lower() == "rocrate-metadata.json":
+ is_rocrate = True
+ metadata["files"].append(
+ {
+ "id": f.get("id"),
+ "key": f.get("key") or f.get("name"),
+ "size": f.get("size"),
+ "checksum": f.get("checksum"),
+ "links": f.get("links", {}),
+ }
+ )
+ metadata["is_rocrate"] = is_rocrate
+
+ return metadata
+
+ except Exception as e:
+ return {"error": f"Failed to parse metadata: {e}", "raw": raw_record}
diff --git a/src/models/platform.py b/src/models/platform.py
new file mode 100644
index 0000000..4438abe
--- /dev/null
+++ b/src/models/platform.py
@@ -0,0 +1,56 @@
+"""Pydantic models for VHP4Safety platform configuration and domain objects."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class RegulatoryQuestion(BaseModel):
+ """A regulatory question tied to a case study."""
+
+ key: str = Field(description="Internal key, e.g. reg_q_1a")
+ label: str
+ explanation: str
+ case_study: Optional[str] = None
+
+
+class StageExplanation(BaseModel):
+ """Safety-assessment workflow stage with a short explanation."""
+
+ name: str
+ explanation: str
+
+
+class CompoundProperty(BaseModel):
+ """Single property row returned by a SPARQL compound query."""
+
+ property_label: str = ""
+ value: str = ""
+ units_label: Optional[str] = None
+ formatter_url: Optional[str] = None
+ source: Optional[str] = None
+ doi: Optional[str] = None
+ see_also: Optional[str] = None
+
+
+class CompoundSummary(BaseModel):
+ """Core identifiers for a compound from CompoundCloud."""
+
+ wcid: str
+ label: str
+ inchi: str = ""
+ inchikey: str = ""
+ smiles: str = Field("", alias="SMILES")
+ formula: str = ""
+ mass: str = ""
+
+ model_config = {"populate_by_name": True}
+
+
+class GlossaryStageMapping(BaseModel):
+ """Maps a glossary URL to a human-readable stage name."""
+
+ glossary_url: str
+ stage_name: str
diff --git a/src/scheduler.py b/src/scheduler.py
new file mode 100644
index 0000000..e5e5654
--- /dev/null
+++ b/src/scheduler.py
@@ -0,0 +1,61 @@
+"""
+Nightly background job that re-seeds the database from upstream GitHub sources.
+
+Uses APScheduler's BackgroundScheduler so it runs inside the same Flask /
+SQLite process — no external cron or second container needed.
+"""
+
+import logging
+import os
+
+from apscheduler.schedulers.background import BackgroundScheduler
+from apscheduler.triggers.cron import CronTrigger
+
+log = logging.getLogger(__name__)
+
+_scheduler: BackgroundScheduler | None = None
+
+
+def _reseed_job() -> None:
+ """Drop + re-seed all tables from upstream YAML indexes."""
+ from src.seed import seed_all # late import to avoid circular deps
+ log.info("⏳ Nightly re-seed started …")
+ try:
+ seed_all()
+ log.info("✅ Nightly re-seed complete")
+ except Exception:
+ log.exception("❌ Nightly re-seed failed")
+
+
+def init_scheduler(app=None) -> BackgroundScheduler | None:
+ """
+ Start (or return) the background scheduler.
+
+ Environment knobs (all optional):
+ RESEED_HOUR – hour to run (0-23, default 3)
+ RESEED_MINUTE – minute to run (0-59, default 0)
+ RESEED_ENABLED – set to "false" to disable entirely
+ """
+ global _scheduler
+ if _scheduler is not None:
+ return _scheduler
+
+ enabled = os.environ.get("RESEED_ENABLED", "true").lower()
+ if enabled == "false":
+ log.info("🔕 Nightly re-seed disabled (RESEED_ENABLED=false)")
+ return None
+
+ hour = int(os.environ.get("RESEED_HOUR", "3"))
+ minute = int(os.environ.get("RESEED_MINUTE", "0"))
+
+ _scheduler = BackgroundScheduler(daemon=True)
+ _scheduler.add_job(
+ _reseed_job,
+ trigger=CronTrigger(hour=hour, minute=minute),
+ id="nightly_reseed",
+ name="Re-seed DB from upstream",
+ replace_existing=True,
+ )
+ _scheduler.start()
+ log.info("🕐 Nightly re-seed scheduled at %02d:%02d UTC", hour, minute)
+ return _scheduler
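+
+
+# Wiring sketch (assumed call site in app.py; adjust as needed):
+#
+#     from src.scheduler import init_scheduler
+#     init_scheduler(app)  # returns None when RESEED_ENABLED=false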
diff --git a/src/seed.py b/src/seed.py
new file mode 100644
index 0000000..8ab0032
--- /dev/null
+++ b/src/seed.py
@@ -0,0 +1,279 @@
+"""Seed the database from upstream GitHub JSON indexes.
+
+Run: python -m src.seed
+Idempotent — uses INSERT OR REPLACE (upsert).
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import sys
+from datetime import datetime, timezone
+
+import requests
+
+from src.db import get_conn, init_db
+
+SERVICES_URL = os.environ.get(
+ "SERVICES_URL",
+ "https://raw.githubusercontent.com/VHP4Safety/cloud"
+ "/refs/heads/main/cap/service_index.json",
+)
+METHODS_URL = os.environ.get(
+ "METHODS_URL",
+ "https://raw.githubusercontent.com/VHP4Safety/cloud"
+ "/refs/heads/main/cap/methods_index.json",
+)
+
+# ── Static reference data ────────────────────────────────────────────────
+
+REG_QUESTIONS = {
+ "reg_q_1a": {
+ "label": "Kidney Case Study (a)",
+ "explanation": "What is the safe cisplatin dose in cancer patients?",
+ },
+ "reg_q_1b": {
+ "label": "Kidney Case Study (b)",
+ "explanation": (
+ "What is the intrinsic hazard of tacrolimus "
+ "for nephrotoxicity?"
+ ),
+ },
+ "reg_q_2a": {
+ "label": "Parkinson Case Study (a)",
+ "explanation": "Can compound Dinoseb cause Parkinson's Disease?",
+ },
+ "reg_q_2b": {
+ "label": "Parkinson Case Study (b)",
+ "explanation": (
+ "What level of exposure to compound Dinoseb leads to "
+ "risk for developing Parkinson's disease?"
+ ),
+ },
+ "reg_q_3a": {
+ "label": "Thyroid Case Study (a)",
+ "explanation": (
+ "What information about silychristin do we need to give "
+ "an advice to women in their early pregnancy to decide "
+ "whether the substance can be used?"
+ ),
+ },
+ "reg_q_3b": {
+ "label": "Thyroid Case Study (b)",
+ "explanation": (
+ "Does silychristin influence the thyroid-mediated brain "
+ "development in the fetus resulting in cognitive "
+ "impairment in children?"
+ ),
+ },
+}
+
+STAGE_EXPLANATIONS = {
+ "ADME": (
+ "Absorption, distribution, metabolism, and excretion of a "
+ "substance in a living organism, following exposure."
+ ),
+ "Hazard Assessment": (
+ "The process of assessing the intrinsic hazard a substance "
+ "poses to human health and/or the environment."
+ ),
+ "Chemical Information": (
+ "Information about chemical properties and identity."
+ ),
+ "General": "Not specific to a flow step.",
+ "(External) exposure": "External exposure assessment.",
+ "Generic": "Generic category.",
+ "Other": "Other or unknown category.",
+}
+
+GLOSSARY_STAGE_MAPPINGS = {
+ "https://vhp4safety.github.io/glossary#VHP0000056": "ADME",
+ "https://vhp4safety.github.io/glossary#VHP0000102": "Hazard Assessment",
+ "https://vhp4safety.github.io/glossary#VHP0000148": "Chemical Information",
+ "https://vhp4safety.github.io/glossary#VHP0000149": "General",
+}
+
+CASE_STUDIES = [
+ {
+ "slug": "kidney",
+ "title": "Kidney case study",
+ "description": "To study kidney disease and pharmacovigilance.",
+ "image_src": "/static/images/image43_hexagon.svg",
+ "image_alt": "Kidney case study",
+ },
+ {
+ "slug": "parkinson",
+ "title": "Parkinson case study",
+ "description": (
+ "To study life course pesticide exposure and "
+ "neurodegenerative disease."
+ ),
+ "image_src": "/static/images/image45_hexagon.svg",
+ "image_alt": "Parkinson case study",
+ },
+ {
+ "slug": "thyroid",
+ "title": "Thyroid case study",
+ "description": (
+ "To study health effects discriminated by age and sex on "
+ "thyroid-mediated neurodevelopment."
+ ),
+ "image_src": "/static/images/image47_hexagon.svg",
+ "image_alt": "Thyroid case study",
+ },
+]
+
+CASESTUDY_CONTENT_URL = (
+ "https://raw.githubusercontent.com/"
+ "VHP4Safety/ui-casestudy-config/main/{slug}_content.json"
+)
+
+
+def _bool_flag(val):
+ if val is None or val == "":
+ return None
+ return 1 if str(val).strip().lower() == "true" else 0
+
+
+def _now():
+ return datetime.now(timezone.utc).isoformat()
+
+
+def seed_reference_data(conn) -> None:
+ for key, data in REG_QUESTIONS.items():
+ conn.execute(
+ "INSERT OR REPLACE INTO regulatory_questions (key, label, explanation) VALUES (?, ?, ?)",
+ (key, data["label"], data["explanation"]),
+ )
+ for name, explanation in STAGE_EXPLANATIONS.items():
+ conn.execute(
+ "INSERT OR REPLACE INTO stage_explanations (name, explanation) VALUES (?, ?)",
+ (name, explanation),
+ )
+ for url, stage in GLOSSARY_STAGE_MAPPINGS.items():
+ conn.execute(
+ "INSERT OR REPLACE INTO glossary_stage_mappings (glossary_url, stage_name) VALUES (?, ?)",
+ (url, stage),
+ )
+ for cs in CASE_STUDIES:
+ content_json = None
+ try:
+ url = CASESTUDY_CONTENT_URL.format(slug=cs["slug"])
+ resp = requests.get(url, timeout=15)
+ resp.raise_for_status()
+ content_json = resp.text
+ print(f" ok fetched {cs['slug']}_content.json")
+ except Exception as exc:
+ print(f" x could not fetch {cs['slug']}: {exc}")
+ conn.execute(
+ """INSERT OR REPLACE INTO case_studies
+ (slug, title, description, image_src, image_alt, content_json)
+ VALUES (?, ?, ?, ?, ?, ?)""",
+ (cs["slug"], cs["title"], cs["description"],
+ cs.get("image_src"), cs.get("image_alt"), content_json),
+ )
+ conn.commit()
+ print("ok reference data seeded")
+
+
+def seed_tools(conn) -> None:
+ resp = requests.get(SERVICES_URL, timeout=15)
+ resp.raise_for_status()
+ data = resp.json()
+
+ # Build glossary lookup
+ cur = conn.execute("SELECT glossary_url, stage_name FROM glossary_stage_mappings")
+ glossary = {r["glossary_url"]: r["stage_name"] for r in cur}
+
+ now = _now()
+ for tool_id, raw in data.items():
+ stage = raw.get("stage", "")
+ stage = glossary.get(stage, stage)
+ if stage in ("NA", "Unknown"):
+ stage = "Other"
+
+ conn.execute(
+ """INSERT OR REPLACE INTO tools
+ (id, service, description, stage, html_name, md_file_name,
+ png_file_name, main_url, inst_url,
+ reg_q_1a, reg_q_1b, reg_q_2a, reg_q_2b, reg_q_3a, reg_q_3b,
+ login, api_type, casestudy, provider, provider_email,
+ citation, version, license, sourcecode, docker,
+ bio_tools, tess, raw_json, updated_at)
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
+ (tool_id, raw.get("service", tool_id), raw.get("description"),
+ stage, raw.get("html_name"), raw.get("md_file_name"),
+ raw.get("png_file_name"), raw.get("main_url"),
+ raw.get("inst_url") or None,
+ _bool_flag(raw.get("reg_q_1a")), _bool_flag(raw.get("reg_q_1b")),
+ _bool_flag(raw.get("reg_q_2a")), _bool_flag(raw.get("reg_q_2b")),
+ _bool_flag(raw.get("reg_q_3a")), _bool_flag(raw.get("reg_q_3b")),
+ raw.get("login"), raw.get("api"), raw.get("casestudy"),
+ raw.get("provider"), raw.get("provider-email"),
+ raw.get("citation"), raw.get("version"), raw.get("license"),
+ raw.get("sourcecode"), raw.get("docker"),
+ raw.get("bioTools"), raw.get("tess"),
+ json.dumps(raw), now),
+ )
+ conn.commit()
+ print(f"ok {len(data)} tools seeded")
+
+
+def seed_methods(conn) -> None:
+ resp = requests.get(METHODS_URL, timeout=15)
+ resp.raise_for_status()
+ data = resp.json()
+
+ now = _now()
+ for method_id, raw in data.items():
+ conn.execute(
+ """INSERT OR REPLACE INTO methods
+ (id, method, issue_number, description, stage, substage,
+ catalog_webpage_url, case_study, regulatory_question,
+ reg_q_1a, reg_q_1b, reg_q_2a, reg_q_2b, reg_q_3a, reg_q_3b,
+ data_producer, sop, vendor, catalog_number, citation,
+ type_iri, ontology, key_event_id, aop_id,
+ raw_json, updated_at)
+ VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""",
+ (method_id,
+ raw.get("method") or raw.get("method_name_content", method_id),
+ raw.get("issue_number"),
+ raw.get("method_description_content"),
+ raw.get("vhp4safety_workflow_stage_content"),
+ raw.get("workflow_substage_content"),
+ raw.get("catalog_webpage_url"),
+ raw.get("case_study_content"),
+ raw.get("regulatory_question_content"),
+ _bool_flag(raw.get("reg_q_1a")), _bool_flag(raw.get("reg_q_1b")),
+ _bool_flag(raw.get("reg_q_2a")), _bool_flag(raw.get("reg_q_2b")),
+ _bool_flag(raw.get("reg_q_3a")), _bool_flag(raw.get("reg_q_3b")),
+ raw.get("data_producer_content"),
+ raw.get("available_sop_or_protocol_content"),
+ raw.get("vendor_content"),
+ raw.get("catalog_number_content"),
+ raw.get("citation_content"),
+ raw.get("ontology_term_content"),
+ raw.get("type_content"),
+ raw.get("relevant_aop_wiki_key_event(s)_to_the_assay_content"),
+ raw.get("relevant_aop_wiki_adverse_outcome_pathway(s)_to_the_assay_content"),
+ json.dumps(raw), now),
+ )
+ conn.commit()
+ print(f"ok {len(data)} methods seeded")
+
+
+def seed_all() -> None:
+ init_db()
+ conn = get_conn()
+ try:
+ seed_reference_data(conn)
+ seed_tools(conn)
+ seed_methods(conn)
+ print("ok seeding complete")
+ finally:
+ conn.close()
+
+
+if __name__ == "__main__":
+ seed_all()
diff --git a/src/services/__init__.py b/src/services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/services/compound.py b/src/services/compound.py
new file mode 100644
index 0000000..cce8323
--- /dev/null
+++ b/src/services/compound.py
@@ -0,0 +1,204 @@
+"""Compound data service — encapsulates all CompoundCloud SPARQL queries.
+
+All SPARQL logic is centralised here; Flask routes just call these
+functions and get back typed Pydantic models or plain dicts.
+"""
+
+from __future__ import annotations
+
+import re
+import urllib.parse
+from typing import Optional
+
+import requests
+from wikibaseintegrator import wbi_helpers
+
+from src.models.compound import (
+ CompoundDetail,
+ CompoundExperimentalDatum,
+ CompoundIdentifier,
+ CompoundSummary,
+ CompoundToxicology,
+)
+
+COMPOUND_EP = "https://compoundcloud.wikibase.cloud/query/sparql"
+QLEVER_EP = (
+ "https://qlever.cs.uni-freiburg.de/api/wikidata"
+ "?format=json&query="
+)
+
+_QID_RE = re.compile(r"^Q\d+$")
+
+
+def is_valid_qid(qid: str) -> bool:
+ return bool(_QID_RE.fullmatch(qid))
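+
+
+# Sketch:
+#   is_valid_qid("Q2270") -> True
+#   is_valid_qid("2270")  -> False
+#   is_valid_qid("Q12 ")  -> False  (fullmatch: no stray whitespace)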
+
+
+# ── Individual queries ────────────────────────────────────────────────────
+
+
+def get_properties(cwid: str) -> Optional[CompoundSummary]:
+ """Fetch core identifiers (InChI, SMILES, formula, mass)."""
+ q = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n\n"
+ "SELECT ?cmp ?cmpLabel ?formula ?mass ?inchi ?inchiKey ?SMILES WHERE {\n"
+ f" VALUES ?cmp {{ wd:{cwid} }}\n"
+ " ?cmp wdt:P9 ?inchi ;\n"
+ " wdt:P10 ?inchiKey .\n"
+ " OPTIONAL { ?cmp wdt:P2 ?mass }\n"
+ " OPTIONAL { ?cmp wdt:P3 ?formula }\n"
+ " OPTIONAL { ?cmp wdt:P7 ?chiralSMILES }\n"
+ " OPTIONAL { ?cmp wdt:P12 ?nonchiralSMILES }\n"
+ ' BIND (COALESCE(IF(BOUND(?chiralSMILES), ?chiralSMILES, 1/0),'
+ ' IF(BOUND(?nonchiralSMILES), ?nonchiralSMILES, 1/0), "")'
+ " AS ?SMILES)\n"
+ " SERVICE wikibase:label {"
+ ' bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n'
+ "}"
+ )
+ result = wbi_helpers.execute_sparql_query(q, endpoint=COMPOUND_EP)
+ bindings = result.get("results", {}).get("bindings", [])
+ if not bindings:
+ return None
+ b = bindings[0]
+ return CompoundSummary(
+ wcid=b["cmp"]["value"],
+ label=b["cmpLabel"]["value"],
+ inchi=b["inchi"]["value"],
+ inchikey=b["inchiKey"]["value"],
+ SMILES=b.get("SMILES", {}).get("value", ""),
+ formula=b.get("formula", {}).get("value", ""),
+ mass=b.get("mass", {}).get("value", ""),
+ )
+
+
+def get_identifiers(cwid: str) -> list[CompoundIdentifier]:
+ """Fetch external identifiers (CAS, PubChem, …)."""
+ q = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n\n"
+ "SELECT DISTINCT ?propertyLabel ?value ?formatterURL\n"
+ "WHERE {\n"
+ " VALUES ?property { wd:P13 wd:P22 wd:P23 wd:P26 wd:P27"
+ " wd:P28 wd:P36 wd:P41 wd:P43 wd:P44 wd:P45 }\n"
+ " ?property wikibase:directClaim ?valueProp .\n"
+ f" OPTIONAL {{ wd:{cwid} ?valueProp ?value }}\n"
+ " OPTIONAL { ?property wdt:P6 ?formatterURL }\n"
+ " SERVICE wikibase:label {"
+ ' bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n'
+ "}"
+ )
+ result = wbi_helpers.execute_sparql_query(q, endpoint=COMPOUND_EP)
+ bindings = result.get("results", {}).get("bindings", [])
+ out: list[CompoundIdentifier] = []
+ for b in bindings:
+ out.append(CompoundIdentifier(
+ property_label=b.get("propertyLabel", {}).get("value", ""),
+ value=b.get("value", {}).get("value", ""),
+ formatter_url=b.get("formatterURL", {}).get("value", ""),
+ ))
+ return out
+
+
+def get_toxicology(cwid: str) -> list[CompoundToxicology]:
+ """Fetch toxicology properties."""
+ q = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n\n"
+ "SELECT DISTINCT ?propertyLabel ?value ?formatterURL\n"
+ "WHERE {\n"
+ " VALUES ?property { wd:P17 wd:P19 wd:P4 }\n"
+ " ?property wikibase:directClaim ?valueProp .\n"
+ f" OPTIONAL {{ wd:{cwid} ?valueProp ?value }}\n"
+ " OPTIONAL { ?property wdt:P6 ?formatterURL }\n"
+ " SERVICE wikibase:label {"
+ ' bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n'
+ "}"
+ )
+ result = wbi_helpers.execute_sparql_query(q, endpoint=COMPOUND_EP)
+ bindings = result.get("results", {}).get("bindings", [])
+ out: list[CompoundToxicology] = []
+ for b in bindings:
+ out.append(CompoundToxicology(
+ property_label=b.get("propertyLabel", {}).get("value", ""),
+ value=b.get("value", {}).get("value", ""),
+ ))
+ return out
+
+
+def get_experimental_data(
+ cwid: str,
+) -> list[CompoundExperimentalDatum]:
+ """Fetch experimental data via Wikidata QLever."""
+ # Step 1: resolve CompoundCloud QID → Wikidata QID
+ q1 = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n\n"
+ "SELECT ?qid WHERE {\n"
+ " wd:P5 wikibase:directClaim ?identifierProp .\n"
+ f" wd:{cwid} ?identifierProp ?wikidata .\n"
+ " BIND (iri(CONCAT("
+ '"http://www.wikidata.org/entity/", ?wikidata)) AS ?qid)\n'
+ "}"
+ )
+ r1 = wbi_helpers.execute_sparql_query(q1, endpoint=COMPOUND_EP)
+ bindings = r1.get("results", {}).get("bindings", [])
+ if not bindings:
+ return []
+ qid = bindings[0]["qid"]["value"]
+
+ # Step 2: query Wikidata QLever for experimental properties
+ q2 = (
+ "PREFIX wd: \n"
+ "PREFIX wdt: \n"
+ "PREFIX prov: \n"
+ "PREFIX rdfs: \n"
+ "PREFIX pr: \n"
+ "PREFIX wikibase: \n\n"
+ "SELECT DISTINCT ?propEntityLabel ?value"
+ " ?unitsLabel ?source ?doi ?statement\n"
+ "WHERE {\n"
+ f" <{qid}> ?propp ?statement .\n"
+ " ?statement a wikibase:BestRank ;\n"
+ " ?proppsv ["
+ " wikibase:quantityAmount ?value ;"
+ " wikibase:quantityUnit ?units ] .\n"
+ " ?property wikibase:claim ?propp ;"
+ " wikibase:statementValue ?proppsv ;"
+ " wdt:P1629 ?propEntity ;"
+ " wdt:P31 wd:Q21077852 .\n"
+ " ?propEntity @en@rdfs:label ?propEntityLabel .\n"
+ " ?units @en@rdfs:label ?unitsLabel .\n"
+ " BIND (COALESCE(IF(BOUND(?sourceTmp),"
+ ' ?sourceTmp, 1/0), "") AS ?source)\n'
+ " BIND (COALESCE(IF(BOUND(?doiTmp),"
+ ' ?doiTmp, 1/0), "") AS ?doi)\n'
+ "}"
+ )
+ url = QLEVER_EP + urllib.parse.quote_plus(q2)
+    resp = requests.get(url, timeout=15)
+    resp.raise_for_status()
+    data = resp.json()
+ bindings = data.get("results", {}).get("bindings", [])
+
+ out: list[CompoundExperimentalDatum] = []
+ for b in bindings:
+ out.append(CompoundExperimentalDatum(
+ property_label=b.get("propEntityLabel", {}).get("value", ""),
+ value=b.get("value", {}).get("value", ""),
+ units_label=b.get("unitsLabel", {}).get("value", ""),
+ source=b.get("source", {}).get("value", ""),
+ doi=b.get("doi", {}).get("value", ""),
+ see_also=b.get("statement", {}).get("value", ""),
+ ))
+ return out
+
+
+def get_full_compound(cwid: str) -> CompoundDetail:
+ """Fetch everything about a compound."""
+ return CompoundDetail(
+ summary=get_properties(cwid),
+ identifiers=get_identifiers(cwid),
+ toxicology=get_toxicology(cwid),
+ experimental_data=get_experimental_data(cwid),
+ )
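+
+
+# Example usage (sketch — performs live SPARQL queries; Q2270 is the
+# compound used in this repo's API checks):
+#
+#     detail = get_full_compound("Q2270")
+#     if detail.summary:
+#         print(detail.summary.label, detail.summary.formula)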
diff --git a/src/sitemap.py b/src/sitemap.py
new file mode 100644
index 0000000..5e7ee36
--- /dev/null
+++ b/src/sitemap.py
@@ -0,0 +1,59 @@
+"""Generate a static sitemap.xml file from DB contents."""
+from __future__ import annotations
+
+from datetime import datetime, timezone
+from typing import Iterable
+import os
+from xml.etree import ElementTree as ET
+
+from src.db import get_conn
+
+BASE_URL = os.environ.get("BASE_URL", "http://localhost:5050")
+OUT_PATH = os.path.join(os.path.dirname(__file__), "..", "static", "sitemap.xml")
+
+
+def _add_url(root, loc, lastmod=None, changefreq="monthly", priority="0.5"):
+ url = ET.SubElement(root, "url")
+ ET.SubElement(url, "loc").text = loc
+ if lastmod:
+ ET.SubElement(url, "lastmod").text = lastmod
+ ET.SubElement(url, "changefreq").text = changefreq
+ ET.SubElement(url, "priority").text = priority
+
+
+def gather_urls() -> Iterable[tuple[str, str | None]]:
+ conn = get_conn()
+ try:
+ yield (f"{BASE_URL}/", datetime.utcnow().isoformat())
+ for path in ("/tools", "/methods", "/data", "/casestudies", "/api/v1/docs"):
+ yield (f"{BASE_URL}{path}", None)
+ for t in conn.execute("SELECT id, updated_at FROM tools").fetchall():
+ if t["id"]:
+ yield (f"{BASE_URL}/tools/{t['id']}", t["updated_at"])
+ for m in conn.execute("SELECT id, updated_at FROM methods").fetchall():
+ if m["id"]:
+ yield (f"{BASE_URL}/methods/{m['id']}", m["updated_at"])
+ for cs in conn.execute("SELECT slug FROM case_studies").fetchall():
+ if cs["slug"]:
+ yield (f"{BASE_URL}/casestudies/{cs['slug']}", None)
+ finally:
+ conn.close()
+
+
+def build_sitemap(out_path: str = OUT_PATH) -> str:
+ root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9")
+ for loc, last in gather_urls():
+ _add_url(root, loc, lastmod=last)
+ tree = ET.ElementTree(root)
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
+ tree.write(out_path, encoding="utf-8", xml_declaration=True)
+ return out_path
+
+
+def main() -> None:
+ path = build_sitemap()
+ print(f"Wrote sitemap to: {path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/templates/base.html b/templates/base.html
index 4c103ac..5983caa 100644
--- a/templates/base.html
+++ b/templates/base.html
@@ -162,6 +162,9 @@
+
+
@@ -281,6 +284,7 @@
EXPLORE
diff --git a/templates/case_studies/casestudies.html b/templates/case_studies/casestudies.html
index 0854f95..3bbd7fe 100644
--- a/templates/case_studies/casestudies.html
+++ b/templates/case_studies/casestudies.html
@@ -11,41 +11,21 @@ Case Studies
-
-
-
-
+ {% for card in cards %}
+ {% endfor %}
diff --git a/templates/case_studies/casestudy_server.html b/templates/case_studies/casestudy_server.html
new file mode 100644
index 0000000..eeadd59
--- /dev/null
+++ b/templates/case_studies/casestudy_server.html
@@ -0,0 +1,229 @@
+{% extends "base.html" %} {% block content %}
+
+
+
+
+{# ── Breadcrumbs ── #}
+
+
+{# ── Workflow Header ── #}
+
+
+{# ── Main Content ── #}
+
+
+ {% if step.nav_title %}
+        {{ step.nav_title }}
+ {% endif %}
+
+ {% if step.nav_description %}
+        {{ step.nav_description }}
+ {% endif %}
+
+ {% if step.image_html %}
+ {{ step.image_html | safe }}
+ {% endif %}
+
+ {# ── Step Buttons ── #}
+ {% if step.buttons %}
+
+ {% for btn in step.buttons %}
+
+ {% endfor %}
+
+ {% endif %}
+
+ {# ── HTML Content Block ── #}
+ {% if step.content_html %}
+
+ {{ step.content_html | safe }}
+
+ {% endif %}
+
+ {# ── Accordion Sections ── #}
+ {% if step.accordion_sections %}
+
+ {% for item in step.accordion_sections %}
+ {% set item_id = "accordionItem" ~ loop.index0 %}
+
+
+
+
+ {{ item.description | default("") | safe }}
+
+
+
+ {% endfor %}
+
+ {% endif %}
+
+
+
+{# ── Feedback Button ── #}
+
+
+
+
+
+
+{% endblock %}
diff --git a/templates/Safety_Assessment_Workflow.html b/templates/safety_assessment_workflow.html
similarity index 100%
rename from templates/Safety_Assessment_Workflow.html
rename to templates/safety_assessment_workflow.html
From 245719bb28d18cd2344e24da5fad5cf64373490d Mon Sep 17 00:00:00 2001
From: Javier
Date: Fri, 17 Apr 2026 17:36:31 +0200
Subject: [PATCH 2/2] Add examples to endpoints and API check action
---
.github/scripts/api_check.py | 122 ++++++++++++++++
.github/workflows/pr-api-check.yml | 45 ++++++
src/api.py | 219 ++++++++++++++++++++++-------
3 files changed, 336 insertions(+), 50 deletions(-)
create mode 100644 .github/scripts/api_check.py
create mode 100644 .github/workflows/pr-api-check.yml
diff --git a/.github/scripts/api_check.py b/.github/scripts/api_check.py
new file mode 100644
index 0000000..9e3dbc7
--- /dev/null
+++ b/.github/scripts/api_check.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python3
+"""API check: counts, validation summary, and route health."""
+
+import json
+import sys
+import urllib.error
+import urllib.request
+from datetime import datetime, timezone
+
+BASE = "http://localhost:5050/api"
+
+
+def get(path):
+ url = f"{BASE}{path}"
+ try:
+ req = urllib.request.Request(url)
+ with urllib.request.urlopen(req, timeout=15) as r:
+ return r.status, json.loads(r.read())
+ except urllib.error.HTTPError as e:
+ return e.code, None
+ except Exception:
+ return 0, None
+
+
+errors = []
+
+# 1. Entity counts
+ENTITIES = {
+ "Tools": "/tools/",
+ "Methods": "/methods/",
+ "Case studies": "/casestudies/",
+ "Regulatory questions": "/regulatory-questions/",
+ "Stage explanations": "/stages/",
+}
+
+counts = {}
+for label, path in ENTITIES.items():
+ status, data = get(path)
+ if status == 200 and isinstance(data, list):
+ counts[label] = len(data)
+ else:
+ counts[label] = None
+ errors.append(f"GET {path} -> {status}")
+
+# 2. Validation summary
+status, validation = get("/validation/")
+if status != 200:
+ errors.append(f"GET /validation/ -> {status}")
+ validation = None
+
+# 3. Health check every route
+ROUTES = [
+ ("GET", "/tools/"),
+ ("GET", "/tools/cdkdepict"),
+ ("GET", "/methods/"),
+ ("GET", "/methods/5_cfda_assay_to_determine_cytotoxicity"),
+ ("GET", "/regulatory-questions/"),
+ ("GET", "/stages/"),
+ ("GET", "/casestudies/"),
+ ("GET", "/casestudies/kidney"),
+ ("GET", "/compounds/Q2270"),
+ ("GET", "/compounds/Q2270/properties"),
+ ("GET", "/compounds/Q2270/identifiers"),
+ ("GET", "/compounds/Q2270/toxicology"),
+ ("GET", "/compounds/Q2270/experimental-data"),
+ ("GET", "/data/"),
+ ("GET", "/validation/"),
+ ("GET", "/validation/tools"),
+]
+
+health = []
+for method, path in ROUTES:
+ status, _ = get(path)
+ ok = 200 <= status < 300
+ health.append((method, path, status, ok))
+ if not ok:
+ errors.append(f"{method} {path} -> {status}")
+
+# ── build report ──────────────────────────────────────────────────
+
+now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
+lines = [f"## API check -- {now}", ""]
+
+# counts
+lines.append("### Entity counts")
+lines.append("")
+lines.append("| Entity | Count |")
+lines.append("|--------|------:|")
+for label, n in counts.items():
+ lines.append(f"| {label} | {n if n is not None else 'ERR'} |")
+lines.append("")
+
+# validation
+if validation and "entities" in validation:
+ lines.append("### Validation (field completeness)")
+ lines.append("")
+ lines.append("| Entity | Entries | Avg complete | Full |")
+ lines.append("|--------|--------:|-------------:|-----:|")
+ for e in validation["entities"]:
+ lines.append(
+ f"| {e['entity']} | {e['total_entries']}"
+ f" | {e['avg_completeness_pct']}%"
+ f" | {e['fully_complete']}/{e['total_entries']} |"
+ )
+ lines.append("")
+
+# health
+lines.append("### Route health")
+lines.append("")
+lines.append("| Method | Route | Status |")
+lines.append("|--------|-------|-------:|")
+for method, path, status, ok in health:
+ mark = "ok" if ok else f"FAIL ({status})"
+ lines.append(f"| {method} | `{path}` | {mark} |")
+lines.append("")
+
+# result
+all_ok = not errors
+lines.append(f"**Result: {'PASS' if all_ok else 'FAIL'}**")
+
+print("\n".join(lines))
+if not all_ok:
+ sys.exit(1)
diff --git a/.github/workflows/pr-api-check.yml b/.github/workflows/pr-api-check.yml
new file mode 100644
index 0000000..b9680cd
--- /dev/null
+++ b/.github/workflows/pr-api-check.yml
@@ -0,0 +1,45 @@
+name: API check
+
+on:
+ pull_request:
+
+permissions:
+ contents: read
+ pull-requests: write
+
+jobs:
+ api-check:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Build Docker image
+ run: docker build -t vhp4safety .
+
+ - name: Start container
+ run: |
+ docker run -d --name vhp4safety -p 5050:5050 vhp4safety
+ for i in $(seq 1 30); do
+ curl -sf http://localhost:5050/api/tools/ && break
+ sleep 2
+ done
+
+ - name: Run API checks
+ id: report
+ run: |
+ python3 .github/scripts/api_check.py > report.md
+          {
+            echo 'REPORT<<EOF'
+            cat report.md
+            echo 'EOF'
+          } >> "$GITHUB_OUTPUT"
+
+ - name: Post PR comment
+ uses: marocchino/sticky-pull-request-comment@v2
+ with:
+ header: api-check
+ message: ${{ steps.report.outputs.REPORT }}
+
+ - name: Stop container
+ if: always()
+ run: docker stop vhp4safety && docker rm vhp4safety
diff --git a/src/api.py b/src/api.py
index 646a88c..d2b59a6 100644
--- a/src/api.py
+++ b/src/api.py
@@ -33,48 +33,87 @@
# -- Marshmallow Schemas ---------------------------------------------------
class ToolSchema(Schema):
- id = fields.Str()
- service = fields.Str()
- description = fields.Str()
- stage = fields.Str()
- main_url = fields.Str()
- inst_url = fields.Str()
- html_name = fields.Str()
- png_file_name = fields.Str()
+ id = fields.Str(metadata={"example": "cdkdepict"})
+ service = fields.Str(metadata={
+ "example": "CDK Depict",
+ "description": "Human-readable tool name"})
+ description = fields.Str(metadata={
+ "example": "A webservice for generating chemical "
+ "structure images from SMILES inputs."})
+ stage = fields.Str(metadata={
+ "example": "Other",
+ "description": "Safety-assessment workflow stage"})
+ main_url = fields.Str(metadata={
+ "example": "https://www.simolecule.com/cdkdepict/depict.html"})
+ inst_url = fields.Str(metadata={
+ "example": "https://cdkdepict.cloud.vhp4safety.nl/"})
+ html_name = fields.Str(metadata={"example": "cdkdepict.html"})
+ png_file_name = fields.Str(metadata={"example": "cdkdepict.png"})
class MethodSchema(Schema):
- id = fields.Str()
- method = fields.Str()
- description = fields.Str()
- stage = fields.Str()
- substage = fields.Str()
- catalog_webpage_url = fields.Str()
- raw = fields.Dict(load_default=None)
+ id = fields.Str(metadata={
+ "example": "5_cfda_assay_to_determine_cytotoxicity"})
+ method = fields.Str(metadata={
+ "example": "5-CFDA assay to determine cytotoxicity",
+ "description": "Human-readable method name"})
+ description = fields.Str(metadata={
+ "example": "Fluorescence-based determination "
+ "of cell membrane damage"})
+ stage = fields.Str(metadata={"example": "Adverse Outcome"})
+ substage = fields.Str(metadata={
+ "example": "Cell death, Adverse outcome"})
+ catalog_webpage_url = fields.Str(metadata={
+ "example": "https://www.thermofisher.com/order/"
+ "catalog/product/C1354"})
+    raw = fields.Dict(load_default=None, metadata={
+        "description": "Full upstream fields "
+        "from the methods catalog index"})
class RegulatoryQuestionSchema(Schema):
- key = fields.Str()
- label = fields.Str()
- explanation = fields.Str()
+ key = fields.Str(metadata={"example": "reg_q_1a"})
+ label = fields.Str(metadata={
+ "example": "Kidney Case Study (a)"})
+ explanation = fields.Str(metadata={
+ "example": "What is the safe cisplatin dose "
+ "in cancer patients?"})
class StageExplanationSchema(Schema):
- name = fields.Str()
- explanation = fields.Str()
+ name = fields.Str(metadata={"example": "ADME"})
+ explanation = fields.Str(metadata={
+ "example": "Absorption, distribution, metabolism, "
+ "and excretion of a substance in a living organism, "
+ "following exposure."})
class CaseStudySchema(Schema):
- slug = fields.Str()
- title = fields.Str()
- description = fields.Str()
- image_src = fields.Str()
- config_repo = fields.Str()
- default_branch = fields.Str()
+ name = fields.Str(
+ attribute="slug",
+ metadata={"description": "Short identifier used in URLs",
+ "example": "kidney"})
+ title = fields.Str(metadata={
+ "example": "Kidney case study"})
+ description = fields.Str(metadata={
+ "example": "To study kidney disease "
+ "and pharmacovigilance."})
+ image_src = fields.Str(metadata={
+ "example": "/static/images/image43_hexagon.svg"})
+ config_repo = fields.Str(metadata={
+ "example": "VHP4Safety/ui-casestudy-config"})
+ default_branch = fields.Str(metadata={
+ "example": "main"})
class CaseStudyDetailSchema(CaseStudySchema):
- content_json = fields.Raw(load_default=None)
+ content_json = fields.Raw(
+ load_default=None,
+ metadata={
+ "description":
+ "Full nested JSON driving the case-study UI "
+ "(intro text, regulatory questions, "
+ "process-flow steps)"})
class CompoundSummarySchema(Schema):
@@ -115,7 +154,9 @@ class CompoundDetailSchema(Schema):
class DataSearchQuerySchema(Schema):
- query = fields.Str(load_default="")
+ query = fields.Str(
+ load_default="",
+ metadata={"example": "kidney"})
page = fields.Int(load_default=1)
size = fields.Int(load_default=18)
@@ -132,8 +173,12 @@ class DataResultSchema(Schema):
class SearchQuerySchema(Schema):
- stage = fields.Str(load_default=None)
- search = fields.Str(load_default="")
+ stage = fields.Str(
+ load_default=None,
+ metadata={"example": "Other"})
+ search = fields.Str(
+ load_default="",
+ metadata={"example": ""})
# -- Blueprints ------------------------------------------------------------
@@ -162,7 +207,11 @@ class SearchQuerySchema(Schema):
@tools_bp.arguments(SearchQuerySchema, location="query")
@tools_bp.response(200, ToolSchema(many=True))
def list_tools(args):
- """List all tools, with optional stage/search filters."""
+ """List all tools, with optional stage/search filters.
+
+ Returns every tool (service) registered on the platform.
+ Filter by workflow stage or free-text search on the tool name.
+ """
conn = get_conn()
sql = "SELECT * FROM tools WHERE 1=1"
params = []
@@ -179,9 +228,12 @@ def list_tools(args):
@tools_bp.route("/")
+@tools_bp.doc(parameters=[{
+ "name": "tool_id", "in": "path",
+ "example": "cdkdepict"}])
@tools_bp.response(200, ToolSchema)
def get_tool(tool_id):
- """Get a single tool by ID."""
+ """Get a single tool by its ID."""
conn = get_conn()
row = conn.execute("SELECT * FROM tools WHERE id = ?", (tool_id,)).fetchone()
conn.close()
@@ -196,7 +248,11 @@ def get_tool(tool_id):
@methods_bp.arguments(SearchQuerySchema, location="query")
@methods_bp.response(200, MethodSchema(many=True))
def list_methods(args):
- """List all methods, with optional stage/search filters."""
+ """List all methods, with optional stage/search filters.
+
+ Methods describe experimental or computational procedures
+ used in safety-assessment workflows.
+ """
conn = get_conn()
sql = "SELECT * FROM methods WHERE 1=1"
params = []
@@ -213,9 +269,16 @@ def list_methods(args):
@methods_bp.route("/")
+@methods_bp.doc(parameters=[{
+ "name": "method_id", "in": "path",
+ "example": "5_cfda_assay_to_determine_cytotoxicity"}])
@methods_bp.response(200, MethodSchema)
def get_method(method_id):
- """Get a single method by ID."""
+ """Get a single method by ID, including full upstream fields.
+
+    The ``raw`` field contains every field from the upstream
+    methods catalog (AOP references, key events, etc.).
+ """
conn = get_conn()
row = conn.execute("SELECT * FROM methods WHERE id = ?", (method_id,)).fetchone()
conn.close()
@@ -232,7 +295,11 @@ def get_method(method_id):
@reg_q_bp.route("/")
@reg_q_bp.response(200, RegulatoryQuestionSchema(many=True))
def list_regulatory_questions():
- """List all regulatory questions."""
+ """List the six regulatory questions that link tools to case studies.
+
+ Each question is tied to a case study pair (a/b).
+ For example, ``reg_q_1a`` = *"Kidney Case Study (a)"*.
+ """
conn = get_conn()
rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
conn.close()
@@ -244,7 +311,11 @@ def list_regulatory_questions():
@stages_bp.route("/")
@stages_bp.response(200, StageExplanationSchema(many=True))
def list_stages():
- """List all safety-assessment workflow stages."""
+ """List all safety-assessment workflow stages.
+
+ Stages are the high-level phases of the VHP4Safety
+ process flow: ADME, Hazard Assessment, etc.
+ """
conn = get_conn()
rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
conn.close()
@@ -256,19 +327,32 @@ def list_stages():
@casestudies_bp.route("/")
@casestudies_bp.response(200, CaseStudySchema(many=True))
def list_case_studies():
- """List all case studies."""
+ """List the three VHP4Safety case studies (summary only).
+
+ Returns name, title, description, and image for each.
+ Use the detail endpoint for the full content JSON.
+
+ Available names: ``kidney``, ``parkinson``, ``thyroid``.
+ """
conn = get_conn()
rows = conn.execute("SELECT * FROM case_studies").fetchall()
conn.close()
return [dict(r) for r in rows]
-@casestudies_bp.route("/")
+@casestudies_bp.route("/")
+@casestudies_bp.doc(parameters=[{
+ "name": "name", "in": "path",
+ "example": "kidney"}])
@casestudies_bp.response(200, CaseStudyDetailSchema)
-def get_case_study(slug):
- """Get a case study with its full content JSON."""
+def get_case_study(name):
+ """Get a case study by name, including its full content JSON.
+
+ The content JSON contains the intro text, regulatory questions,
+ and process-flow workflow steps that drive the case-study UI.
+ """
conn = get_conn()
- row = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (slug,)).fetchone()
+ row = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (name,)).fetchone()
conn.close()
if not row:
abort(404, message="Case study not found")
@@ -281,9 +365,17 @@ def get_case_study(slug):
# -- Compounds (SPARQL-backed) ---------------------------------------------
@compounds_bp.route("/<cwid>")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path",
+ "description": "Wikidata compound ID",
+ "example": "Q2270"}])
@compounds_bp.response(200, CompoundDetailSchema)
def get_compound(cwid):
- """Get full compound data."""
+ """Get full compound data from Wikidata via SPARQL.
+
+ Returns summary properties, external identifiers,
+ toxicology data, and experimental measurements.
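+
+ Example (assuming the blueprint is mounted at ``/compounds``)::
+
+     GET /compounds/Q2270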
+ """
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -293,9 +385,11 @@ def get_compound(cwid):
@compounds_bp.route("//properties")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path", "example": "Q2270"}])
@compounds_bp.response(200, CompoundSummarySchema)
def get_compound_properties(cwid):
- """Get core compound identifiers."""
+ """Get core compound properties (formula, mass, InChI, SMILES)."""
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -308,9 +402,11 @@ def get_compound_properties(cwid):
@compounds_bp.route("//identifiers")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path", "example": "Q2270"}])
@compounds_bp.response(200, CompoundIdentifierSchema(many=True))
def get_compound_identifiers(cwid):
- """Get external identifiers."""
+ """Get external database identifiers (CAS, PubChem, ChEBI, etc.)."""
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -320,9 +416,11 @@ def get_compound_identifiers(cwid):
@compounds_bp.route("//toxicology")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path", "example": "Q2270"}])
@compounds_bp.response(200, CompoundToxicologySchema(many=True))
def get_compound_toxicology(cwid):
- """Get toxicology data."""
+ """Get toxicology data (LD50, LC50, etc.)."""
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -332,9 +430,11 @@ def get_compound_toxicology(cwid):
@compounds_bp.route("//experimental-data")
+@compounds_bp.doc(parameters=[{
+ "name": "cwid", "in": "path", "example": "Q2270"}])
@compounds_bp.response(200, CompoundExpDataSchema(many=True))
def get_compound_exp_data(cwid):
- """Get experimental measurements."""
+ """Get experimental measurements (EC50, IC50, etc.)."""
if not is_valid_qid(cwid):
abort(400, message="Invalid compound identifier")
try:
@@ -349,7 +449,10 @@ def get_compound_exp_data(cwid):
@data_bp.arguments(DataSearchQuerySchema, location="query")
@data_bp.response(200, DataResultSchema)
def list_data(args):
- """Search datasets across BioStudies and Zenodo."""
+ """Search datasets across BioStudies and Zenodo repositories.
+
+ Returns paginated results from both sources with normalised metadata.
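+
+ Example (assuming the blueprint is mounted at ``/data``; ``query``,
+ ``page``, and ``size`` are the parameters read below)::
+
+     GET /data/?query=thyroid&page=1&size=18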
+ """
query = args.get("query", "")
page = args.get("page", 1)
size = args.get("size", 18)
@@ -383,9 +486,14 @@ def list_data(args):
@data_bp.route("/<data_id>")
+@data_bp.doc(parameters=[{
+ "name": "data_id", "in": "path", "example": "S-BSST1503"}])
@data_bp.response(200)
def get_data_detail(data_id):
- """Get normalized metadata for a single dataset."""
+ """Get normalised metadata for a single dataset by its accession ID.
+
+ Searches both BioStudies and Zenodo for the given identifier.
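+
+ Example (assuming the blueprint is mounted at ``/data``; ``S-BSST1503``
+ is the BioStudies accession used as the documented example)::
+
+     GET /data/S-BSST1503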
+ """
bs = BioStudiesExtractor(collection=BIOSTUDIES_COLLECTION)
zen = ZenodoExtractor(community=ZENODO_COMMUNITY, record_type=ZENODO_RECORD_TYPE)
bs_res = bs.search_studies(data_id, page=1, page_size=1)
@@ -522,7 +630,12 @@ def _validate_entity(entity_name, table, pydantic_model, id_attr, label_attr):
@validation_bp.route("/")
@validation_bp.response(200, ValidationReport)
def validate_all():
- """Full data completeness report."""
+ """Full data completeness report across all entity types.
+
+ Checks every row in tools, methods, case_studies,
+ regulatory_questions, and stage_explanations for missing fields.
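+
+ Example response (abridged; the full shape follows ``ValidationReport``,
+ and the timestamp is illustrative)::
+
+     {"generated_at": "2026-04-17T15:00:00+00:00", ...}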
+ """
from datetime import datetime, timezone
return {
"generated_at": datetime.now(timezone.utc).isoformat(),
@@ -534,9 +647,15 @@ def validate_all():
@validation_bp.route("/<entity>")
+@validation_bp.doc(parameters=[{
+ "name": "entity", "in": "path", "example": "tools"}])
@validation_bp.response(200, EntitySummary)
def validate_entity(entity):
- """Data completeness report for a single entity type."""
+ """Data completeness report for a single entity type.
+
+ Valid entity names: ``tools``, ``methods``, ``case_studies``,
+ ``regulatory_questions``, ``stage_explanations``.
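+
+ Example (assuming the blueprint is mounted at ``/validation``)::
+
+     GET /validation/tools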
+ """
if entity not in _ENTITY_REGISTRY:
abort(404, message=f"Unknown entity '{entity}'. Valid: {', '.join(_ENTITY_REGISTRY)}")
tbl, model, id_a, lbl_a = _ENTITY_REGISTRY[entity]