From b3d2eea52543d7066de9813c50cc9e36e2747d28 Mon Sep 17 00:00:00 2001 From: Javier Date: Fri, 17 Apr 2026 16:59:01 +0200 Subject: [PATCH 1/2] Add data models and SQLite database --- .gitignore | 2 + Dockerfile | 5 + app.py | 539 +++++------ entrypoint.sh | 11 +- patch.py | 46 - requirements.txt | 35 +- src/__init__.py | 0 src/api.py | 564 ++++++++++++ src/casestudy_resolver.py | 298 ++++++ src/db.py | 75 ++ src/models/__init__.py | 0 src/models/casestudy.py | 209 +++++ src/models/cloud/method.py | 134 +++ src/models/cloud/tool.py | 98 ++ src/models/compound.py | 75 ++ src/models/data/__init__.py | 50 + src/models/data/biostudies.py | 867 ++++++++++++++++++ src/models/data/mapping.py | 526 +++++++++++ src/models/data/schemas.py | 245 +++++ src/models/data/zenodo.py | 484 ++++++++++ src/models/platform.py | 56 ++ src/scheduler.py | 61 ++ src/seed.py | 279 ++++++ src/services/__init__.py | 0 src/services/compound.py | 204 +++++ src/sitemap.py | 59 ++ templates/base.html | 4 + templates/case_studies/casestudies.html | 36 +- templates/case_studies/casestudy_server.html | 229 +++++ ...w.html => safety_assessment_workflow.html} | 0 30 files changed, 4812 insertions(+), 379 deletions(-) delete mode 100644 patch.py create mode 100644 src/__init__.py create mode 100644 src/api.py create mode 100644 src/casestudy_resolver.py create mode 100644 src/db.py create mode 100644 src/models/__init__.py create mode 100644 src/models/casestudy.py create mode 100644 src/models/cloud/method.py create mode 100644 src/models/cloud/tool.py create mode 100644 src/models/compound.py create mode 100644 src/models/data/__init__.py create mode 100644 src/models/data/biostudies.py create mode 100644 src/models/data/mapping.py create mode 100644 src/models/data/schemas.py create mode 100644 src/models/data/zenodo.py create mode 100644 src/models/platform.py create mode 100644 src/scheduler.py create mode 100644 src/seed.py create mode 100644 src/services/__init__.py create mode 100644 src/services/compound.py create mode 100644 src/sitemap.py create mode 100644 templates/case_studies/casestudy_server.html rename templates/{Safety_Assessment_Workflow.html => safety_assessment_workflow.html} (100%) diff --git a/.gitignore b/.gitignore index a64738a..58802be 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,8 @@ __pycache__/ *.py[cod] *$py.class +# SQLite database +data/*.db # C extensions *.so diff --git a/Dockerfile b/Dockerfile index 854e92a..90b4c14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,9 +21,14 @@ COPY . . 
# Install any needed packages specified in requirements.txt RUN pip install -r requirements.txt +# Create data directory for SQLite DB +RUN mkdir -p /usr/src/app/data + # Copy entrypoint script COPY entrypoint.sh /usr/src/app/entrypoint.sh RUN chmod +x /usr/src/app/entrypoint.sh +EXPOSE 5050 + # Define the entrypoint script ENTRYPOINT ["/usr/src/app/entrypoint.sh"] diff --git a/app.py b/app.py index e6aa3de..a3e0cec 100644 --- a/app.py +++ b/app.py @@ -1,6 +1,7 @@ ################################################################################ ### Loading the required modules import json +import os import re import requests @@ -13,10 +14,15 @@ # from wikidataintegrator import wdi_core from wikibaseintegrator import wbi_helpers -# Import BioStudies extractor -from data.biostudies.search import BioStudiesExtractor -from data.zenodo.search import ZenodoExtractor -from data.mapping import normalize_all +# Data extractors (API wrappers — no DB needed) +from src.models.data.biostudies import BioStudiesExtractor +from src.models.data.zenodo import ZenodoExtractor +from src.models.data.mapping import normalize_all + +# Database layer +from src.db import get_conn, init_db +from src.api import init_api +from src.casestudy_resolver import resolve as resolve_casestudy ################################################################################ CACHE_TIMEOUT = 60 * 60 * 24 * 5 # 5 days -- [Ozan] I created a separate @@ -62,7 +68,7 @@ }, "reg_q_2b": { "label": "Parkinson Case Study (b)", - "explanation": "What level of exposure to compound Dinoseb leads to risk for developing Parkinson’s disease?", + "explanation": "What level of exposure to compound Dinoseb leads to risk for developing Parkinson's disease?", }, "reg_q_3a": { "label": "Thyroid Case Study (a)", @@ -103,9 +109,16 @@ def __init__(self, url_map, *items): "CACHE_SERVICE_TIMEOUT": CACHE_TIMEOUT_SERVICE } app = Flask(__name__) +app.secret_key = os.environ.get( + "FLASK_SECRET_KEY", "dev-insecure-key" +) app.config.from_mapping(cache_config) cache = Cache(app) +# Database init and API registration +init_db() +init_api(app) + @cache.memoize(timeout=CACHE_TIMEOUT) def get_json_dict(url: str, timeout: int = 5) -> dict: @@ -204,42 +217,25 @@ def get_repository_data( # Provide methods list to all templates for the Methods dropdown in the navbar @app.context_processor def inject_methods_menu(): - """Fetch methods_index.json and expose a simple list of {id, title} to templates. - Return an empty list on any error to avoid breaking pages. - """ - data = get_json_dict(METHODS_URL) - if data: - items = [] - for key, val in data.items() if isinstance(data, dict) else []: - title = ( - val.get("method") - or val.get("method_name_content") - or val.get("method_name") - or key - ) - items.append({"id": key, "title": title}) - # sort by title - items = sorted(items, key=lambda x: x["title"].lower()) - return {"methods_menu": items} - else: + """Expose methods list to all templates for navbar dropdown.""" + try: + conn = get_conn() + rows = conn.execute("SELECT id, method FROM methods ORDER BY method").fetchall() + conn.close() + return {"methods_menu": [{"id": r["id"], "title": r["method"]} for r in rows]} + except Exception: return {"methods_menu": []} @app.context_processor def inject_tools_menu(): - """Fetch methods_index.json and expose a simple list of {id, title} to templates. - Return an empty list on any error to avoid breaking pages. 
-    """
-    data = get_json_dict_service(SERVICES_URL)
-    if data:
-        items = []
-        for key, val in data.items() if isinstance(data, dict) else []:
-            title = val.get("service") or key
-            items.append({"id": key, "title": title})
-        # sort by title
-        items = sorted(items, key=lambda x: x["title"].lower())
-        return {"tools_menu": items}
-    else:
+    """Expose tools list to all templates for navbar dropdown."""
+    try:
+        conn = get_conn()
+        rows = conn.execute("SELECT id, service FROM tools ORDER BY service").fetchall()
+        conn.close()
+        return {"tools_menu": [{"id": r["id"], "title": r["service"]} for r in rows]}
+    except Exception:
         return {"tools_menu": []}
 
 
@@ -269,17 +265,12 @@ def inject_data_menu():
 ### The landing page
 @app.route("/")
 def home():
-    try:
-        tools = get_json_dict_service(
-            SERVICES_URL
-        )  # Geting the service_list.json in the dictionary format.
-        tools = list(tools.values())  # Converting the dictionary to a list object.
-    except Exception as e:
-        return f"Error processing service data: {e}", 500
-    num_tools = len(tools)
-    num_case_studies = len(CASESTUDIES)
+    conn = get_conn()
+    num_tools = conn.execute("SELECT COUNT(*) FROM tools").fetchone()[0]
+    num_case_studies = conn.execute("SELECT COUNT(*) FROM case_studies").fetchone()[0]
+    conn.close()
     bs_res, zen_res = get_repository_data(search_query="")
-    num_datasets = bs_res["total"] + zen_res["total"]
+    num_datasets = bs_res.get("total", 0) + zen_res.get("total", 0)
     return render_template(
         "home.html",
         num_tools=num_tools,
@@ -292,26 +283,34 @@
 ### The sitemap.xml for search engines
 @app.route("/sitemap.xml")
 def sitemap():
-    sitemapContent = """<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-    <url>
-        <loc>https://platform.vhp4safety.nl/</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/casestudies</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/tools</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/methods</loc>
-    </url>
-    <url>
-        <loc>https://platform.vhp4safety.nl/data</loc>
-    </url>
-</urlset>""";
-    return Response(sitemapContent, mimetype='text/xml');
+    # Prefer generated static sitemap if present (created by src.sitemap);
+    # os is already imported at module level.
+    path = os.path.join(os.path.dirname(__file__), "static", "sitemap.xml")
+    if os.path.exists(path):
+        with open(path, "rb") as fh:
+            return Response(fh.read(), mimetype="application/xml")
+
+    # Fallback minimal sitemap
+    sitemapContent = """<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
+    <url>
+        <loc>https://platform.vhp4safety.nl/</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/casestudies</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/tools</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/methods</loc>
+    </url>
+    <url>
+        <loc>https://platform.vhp4safety.nl/data</loc>
+    </url>
+</urlset>"""
+    return Response(sitemapContent, mimetype="text/xml")
 
 
 ################################################################################
@@ -529,112 +528,94 @@ def models():
 
 
 ### Pages under 'Tools'
-### Here begins the updated version for creating the tool list page.
 @app.route("/tools")
 def tools():
     try:
-        tools = get_json_dict_service(
-            SERVICES_URL
-        )  # Geting the service_list.json in the dictionary format.
-        tools = list(tools.values())  # Converting the dictionary to a list object.
-
-        # Mapping the URLs with glossary IDs to their text values.
-        stage_mapping = {
-            "https://vhp4safety.github.io/glossary#VHP0000056": "ADME",
-            "https://vhp4safety.github.io/glossary#VHP0000102": "Hazard Assessment",
-            "https://vhp4safety.github.io/glossary#VHP0000148": "Chemical Information",
-            "https://vhp4safety.github.io/glossary#VHP0000149": "General",
-        }
-
-        for tool in tools:
-            full_stage_url = tool.get("stage", "")
-
-            # Writing the service name and stage values in the logs for troubleshooting.
- # print(f"Tool: {tool['service']}, Stage URL: {full_stage_url}") # Log the full URL - - # Checking if the full URL is in the mapping and updating the stage. - if full_stage_url in stage_mapping: - # print(f"Mapping stage URL {full_stage_url} to {stage_mapping[full_stage_url]}") # Log the mapping - tool["stage"] = stage_mapping[full_stage_url] - elif tool["stage"] in ["NA", "Unknown"]: - tool["stage"] = ( - "Other" # Combining "NA" and "Unknown" stages in a single stage-type, "Other". - ) - - html_name = tool.get("html_name") - md_name = tool.get("md_file_name") - png_name = tool.get("png_file_name") - - tool["url"] = f"https://cloud.vhp4safety.nl/service/{html_name}" - tool["meta_data"] = ( - f"https://raw.githubusercontent.com/VHP4Safety/cloud/main/docs/service/{md_name}" - if md_name - else "md file not found" - ) - - # Check if the tool has the placeholder logo - placeholder_logo = "https://github.com/VHP4Safety/ui-design/blob/main/static/images/logo.png" - if png_name == placeholder_logo: - tool["png"] = None # set to None if it's the common placeholder - else: - tool["png"] = ( - f"https://raw.githubusercontent.com/VHP4Safety/cloud/main/docs/service/{png_name}" - if not png_name.startswith("http") - else png_name - ) - - inst_url = tool.get("inst_url", "no_url") - if not inst_url: # catches "" as well - inst_url = "no_url" - tool["inst_url"] = inst_url + conn = get_conn() - # Getting selected stages from the URL. selected_stages = request.args.getlist("stage") + search_query = request.args.get("search", "").strip().lower() - # Filtering tools by selected stages. + sql = "SELECT * FROM tools WHERE 1=1" + params = [] if selected_stages: - tools = [tool for tool in tools if tool.get("stage") in selected_stages] - - # Getting all unique stages from the tools for the filter options. - stages = sorted(set(tool.get("stage") for tool in tools if tool.get("stage"))) - - # Forcing "Other" to be the last item in the list of stages. - if "Other" in stages: - stages.remove("Other") - stages.append("Other") + placeholders = ",".join("?" * len(selected_stages)) + sql += f" AND stage IN ({placeholders})" + params.extend(selected_stages) + if search_query: + sql += " AND LOWER(service) LIKE ?" + params.append(f"%{search_query}%") + sql += " ORDER BY service" + rows = conn.execute(sql, params).fetchall() - # Filtering over the regulatory questions. - reg_questions = {v["label"]: k for k, v in REG_QUESTIONS.items()} + # Build reg_questions lookup from DB + rq_rows = conn.execute("SELECT * FROM regulatory_questions").fetchall() + reg_questions = {r["label"]: r["key"] for r in rq_rows} + # Apply regulatory question filters selected_questions = request.args.getlist("reg_q") + tools_list = [] + for row in [dict(r) for r in rows]: + raw = json.loads(row["raw_json"]) if row.get("raw_json") else {} + # Check reg question filters + skip = False + for question in selected_questions: + field = reg_questions.get(question) + if field and str(raw.get(field, "")).lower() != "true": + skip = True + break + if skip: + continue + + html_name = row["html_name"] + png_name = row["png_file_name"] + placeholder = ( + "https://github.com/VHP4Safety/ui-design" + "/blob/main/static/images/logo.png" + ) - for question in selected_questions: - field = reg_questions.get(question) - if field: - tools = [ - tool for tool in tools if str(tool.get(field, "")).lower() == "true" - ] - - # Getting the search query from URL to add a search bar based on tool names. 
-        search_query = request.args.get("search", "").strip().lower()
-
-        # Filtering tools by search query.
-        if search_query:
-            tools = [
-                tool
-                for tool in tools
-                if search_query in tool.get("service", "").lower()
-            ]
+            tools_list.append({
+                "id": row["id"],
+                "service": row["service"],
+                "description": row["description"],
+                "stage": row["stage"],
+                "html_name": html_name,
+                "url": f"https://cloud.vhp4safety.nl/service/{html_name}",
+                "inst_url": row["inst_url"] or "no_url",
+                "png": (
+                    None if png_name == placeholder else
+                    f"https://raw.githubusercontent.com/VHP4Safety/cloud/main/docs/service/{png_name}"
+                    if png_name and not png_name.startswith("http")
+                    else png_name
+                ),
+                **raw,
+            })
+
+        # Collect stages for filter sidebar
+        all_stages = sorted(set(
+            t["stage"] for t in tools_list if t.get("stage")
+        ))
+        if "Other" in all_stages:
+            all_stages.remove("Other")
+            all_stages.append("Other")
+
+        # Stage / reg question explanations from DB
+        se_rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
+        stage_explanations = {s["name"]: s["explanation"] for s in se_rows}
+        reg_question_explanations = {
+            r["label"]: r["explanation"] for r in rq_rows
+        }
+        conn.close()
 
         return render_template(
             "tools/tools.html",
-            tools=tools,
-            stages=stages,
+            tools=tools_list,
+            stages=all_stages,
             selected_stages=selected_stages,
             reg_questions=reg_questions,
             selected_questions=selected_questions,
-            stage_explanations=STAGE_EXPLANATIONS,
-            reg_question_explanations=REG_QUESTION_EXPLANATIONS,
+            stage_explanations=stage_explanations,
+            reg_question_explanations=reg_question_explanations,
         )
 
     except Exception as e:
@@ -645,100 +626,68 @@
 @app.route("/methods")
 @app.route("/methods/")
 def methods():
-    """Fetch methods_index.json from the cloud repo, normalize fields and render a methods list page."""
-    url = "https://raw.githubusercontent.com/VHP4Safety/cloud/refs/heads/main/cap/methods_index.json"
-    response = requests.get(url)
-
-    if response.status_code != 200:
-        return f"Error fetching methods list: {response.status_code}", 503
-
+    """Render methods list page from DB."""
     try:
-        methods = response.json()
-        methods = list(methods.values())  # convert dict to list
+        conn = get_conn()
 
-        # Normalize fields for the template and collect stages
-        stages_set = set()
-        normalized = []
-        for m in methods:
-            norm = {}
-            norm["id"] = m.get("id", "")
-            # template expects 'service' and 'description'
-            norm["service"] = (
-                m.get("method")
-                or m.get("method_name_content")
-                or m.get("method_name")
-                or ""
-            )
-            norm["description"] = (
-                m.get("method_description_content") or m.get("method_description") or ""
-            )
-            # main_url used for method webpage (catalog page)
-            norm["main_url"] = m.get("catalog_webpage_url") or "no_url"
-            # interactive instance not present in methods index
-            norm["inst_url"] = m.get("inst_url") or "no_url"
-            # metadata md file not available in index; keep empty string
-            norm["meta_data"] = m.get("meta_data") or ""
-            # placeholder/no png
-            norm["png"] = None
-            # keep original raw data for potential details page
-            norm["raw"] = m
-
-            # collect stages (split comma-separated values)
-            stage_field = (m.get("vhp4safety_workflow_stage_content") or "").strip()
-            if stage_field:
-                for part in [s.strip() for s in stage_field.split(",")]:
-                    if part:
-                        stages_set.add(part)
-
-            normalized.append(norm)
-
-        # Apply search and filters similar to /tools
         selected_stages = request.args.getlist("stage")
-        selected_questions = request.args.getlist("reg_q")
         search_query = request.args.get("search", "").strip().lower()
 
-        methods_filtered = normalized
+        sql = "SELECT * FROM methods WHERE 1=1"
+        params = []
+        if search_query:
+            sql += " AND LOWER(method) LIKE ?"
+            params.append(f"%{search_query}%")
+        sql += " ORDER BY method"
+        rows = [dict(r) for r in conn.execute(sql, params).fetchall()]
 
-        if selected_stages:
-            methods_filtered = [
-                m
-                for m in methods_filtered
-                if any(
-                    s
-                    in (
-                        (m["raw"].get("vhp4safety_workflow_stage_content") or "").split(
-                            ","
-                        )
-                    )
-                    for s in selected_stages
-                )
-            ]
-
-        # Filter by regulatory questions if provided (REG_QUESTIONS keys map to internal fields)
-        reg_questions = {v["label"]: k for k, v in REG_QUESTIONS.items()}
-        if selected_questions:
+        rq_rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
+        reg_questions = {r["label"]: r["key"] for r in rq_rows}
+        selected_questions = request.args.getlist("reg_q")
+
+        stages_set = set()
+        methods_filtered = []
+        for row in rows:
+            raw = json.loads(row["raw_json"]) if row.get("raw_json") else {}
+            stage_field = (row.get("stage") or "").strip()
+            parts = [s.strip() for s in stage_field.split(",") if s.strip()]
+            stages_set.update(parts)
+
+            if selected_stages and not any(s in parts for s in selected_stages):
+                continue
+
+            skip = False
             for question in selected_questions:
                 field = reg_questions.get(question)
-                if field:
-                    methods_filtered = [
-                        m
-                        for m in methods_filtered
-                        if str(m["raw"].get(field, "")).lower() == "true"
-                    ]
-
-        if search_query:
-            methods_filtered = [
-                m
-                for m in methods_filtered
-                if search_query in m.get("service", "").lower()
-            ]
+                if field and str(raw.get(field, "")).lower() != "true":
+                    skip = True
+                    break
+            if skip:
+                continue
+
+            methods_filtered.append({
+                "id": row["id"],
+                "service": row["method"],
+                "description": row.get("description") or "",
+                "main_url": row.get("catalog_webpage_url") or "no_url",
+                "inst_url": "no_url",
+                "meta_data": "",
+                "png": None,
+                "raw": raw,
+            })
 
         stages = sorted(stages_set)
         if "Other" in stages:
             stages.remove("Other")
             stages.append("Other")
 
-        # Pass everything the template expects
+        se_rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
+        stage_explanations = {s["name"]: s["explanation"] for s in se_rows}
+        reg_question_explanations = {
+            r["label"]: r["explanation"] for r in rq_rows
+        }
+        conn.close()
+
        return render_template(
             "methods/methods.html",
             methods=methods_filtered,
@@ -746,8 +695,8 @@
             selected_stages=selected_stages,
             reg_questions=reg_questions,
             selected_questions=selected_questions,
-            stage_explanations=STAGE_EXPLANATIONS,
-            reg_question_explanations=REG_QUESTION_EXPLANATIONS,
+            stage_explanations=stage_explanations,
+            reg_question_explanations=reg_question_explanations,
         )
 
     except Exception as e:
@@ -756,38 +705,29 @@
 
 @app.route("/methods/<methodid>")
 def method_page(methodid):
-    """Render a single method page using templates/methods/method.html
-    Method details are taken from methods_index.json (keyed by method id).
-    """
-    try:
-        methods = get_json_dict(METHODS_URL)
-        # methods_index.json is a dict keyed by method id
-        if methodid not in methods:
-            abort(404)
-        method_details = methods[methodid]
-    except Exception as e:
-        return f"Error processing methods data: {e}", 500
+    """Render a single method detail page."""
+    conn = get_conn()
+    row = conn.execute("SELECT * FROM methods WHERE id = ?", (methodid,)).fetchone()
+    conn.close()
+    if not row:
+        abort(404)
+
+    method_details = json.loads(row["raw_json"]) if row["raw_json"] else {}
 
-    # Try to load the full method JSON from the docs/methods folder (raw github)
-    method_json = None
-    # URL-encode the filename part to be safe
+    # Try to load full JSON from GitHub docs/methods/
+    method_json = method_details
     encoded = urllib.parse.quote(methodid, safe="")
     raw_url = (
-        "https://raw.githubusercontent.com/VHP4Safety/cloud/refs/heads/main/docs/methods/"
-        + f"{encoded}.json"
+        "https://raw.githubusercontent.com/VHP4Safety/cloud"
+        f"/refs/heads/main/docs/methods/{encoded}.json"
     )
     try:
         r = requests.get(raw_url, timeout=5)
         if r.status_code == 200:
             method_json = r.json()
-        else:
-            # fall back to using the index entry as minimal data
-            method_json = method_details
-    except Exception as exc:
-        # on any error, fall back to index entry
-        method_json = method_details
+    except Exception:
+        pass
 
-    # Pass both to the template: some templates expect method_json, others method_details
     return render_template(
         "methods/method.html",
         method=method_details,
@@ -798,37 +738,27 @@
 
 @app.route("/tools/<toolname>")
 def tool_page(toolname):
-    # get the tools metadata:
-    try:
-        tools = get_json_dict_service(SERVICES_URL)
-        tools = dict(tools)
-        # Geting the service_list.json in the dictionary format.
-        # Converting the dictionary to a list object.
-    except Exception as e:
-        return f"Error processing service data: {e}", 500
-
-    # Map toolname to the correct JSON file in the new tool folder
-    if toolname not in tools:
+    """Render a single tool detail page."""
+    conn = get_conn()
+    row = conn.execute("SELECT * FROM tools WHERE id = ?", (toolname,)).fetchone()
+    conn.close()
+    if not row:
         abort(404)
 
-    # get the tools metadata:
-    url = "https://cloud.vhp4safety.nl/service/" + toolname + ".json"
-    response = requests.get(url)
-
-    if response.status_code != 200:
-        return f"Error fetching service list: {response.status_code}", 503
+    tool_json = json.loads(row["raw_json"]) if row["raw_json"] else {}
 
+    # Fetch full details from cloud service JSON
+    url = f"https://cloud.vhp4safety.nl/service/{toolname}.json"
     try:
-        tool_details = response.json()
-        tool_details = dict(tool_details)
-        # Geting the service_list.json in the dictionary format.
-        # Converting the dictionary to a list object.
-    except Exception as e:
-        return f"Error processing service data: {e}", 500
+        resp = requests.get(url, timeout=10)
+        tool_details = resp.json() if resp.status_code == 200 else tool_json
+    except Exception:
+        tool_details = tool_json
 
-    # Pass the json filename to the template (for JS to pick up)
     return render_template(
-        "tools/tool.html", tool_json=tools[toolname], tool_details=tool_details
+        "tools/tool.html",
+        tool_json=tool_json,
+        tool_details=tool_details,
     )
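For reference: the rewritten tool_page and method_page still hit GitHub or the cloud host on every request. A minimal sketch of memoizing that fetch with the flask-caching setup already used elsewhere in app.py; the helper name and timeout are illustrative and not part of this patch:

    @cache.memoize(timeout=CACHE_TIMEOUT_SERVICE)
    def fetch_tool_details(toolname: str):
        """Cached fetch of the per-tool JSON from cloud.vhp4safety.nl."""
        url = f"https://cloud.vhp4safety.nl/service/{toolname}.json"
        try:
            resp = requests.get(url, timeout=10)
            return resp.json() if resp.status_code == 200 else None
        except Exception:
            return None

    # tool_page() could then reduce its remote call to:
    #     tool_details = fetch_tool_details(toolname) or tool_json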
@@ -837,31 +767,45 @@
 
 # General Safety Assessment Workflow page
-@app.route("/Safety_Assessment_Workflow")
+@app.route("/safety_assessment_workflow")
 def SafetyAssessmentWorkflow():
-    return render_template("Safety_Assessment_Workflow.html")
+    return render_template("safety_assessment_workflow.html")
 
 
 ################################################################################
 ### Pages under 'Case Studies'
 
-# General case studies page
 @app.route("/casestudies")
 def workflows():
-    return render_template("case_studies/casestudies.html")
+    conn = get_conn()
+    cards = conn.execute("SELECT * FROM case_studies").fetchall()
+    conn.close()
+    return render_template(
+        "case_studies/casestudies.html", cards=[dict(c) for c in cards]
+    )
 
 
-# Individual case study page, dynamically filled based on URL
-@app.route("/casestudies/<case>", defaults={"step": ""})
-@app.route("/casestudies/<case>/<question>")
-@app.route("/casestudies/<case>/<question>/<step>")
-# additional routes are parsed client side via js to allow smooth animation
-def casestudy(case:str="", question:str="", step:str=""):
-    if case not in CASESTUDIES:
+@app.route("/casestudies/<case>")
+@app.route("/casestudies/<case>/<path:subpath>")
+def casestudy(case: str, subpath: str = ""):
+    conn = get_conn()
+    cs = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (case,)).fetchone()
+    conn.close()
+    if not cs:
         abort(404)
-    # JS will handle steps via the URL
-    return render_template("case_studies/casestudy.html", case=case)
+
+    parts = [
+        p for p in subpath.split("/") if p
+    ] if subpath else []
+
+    step = resolve_casestudy(case, parts)
+    if step is None:
+        abort(404)
+
+    return render_template(
+        "case_studies/casestudy_server.html", step=step
+    )
 
 
 @app.route("/workflow/")
@@ -1121,5 +1065,8 @@ def privacy_policy():
     return render_template("legal/privacypolicy.html")
 
 
+from src.scheduler import init_scheduler
+init_scheduler(app)
+
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=5050, debug=True)
diff --git a/entrypoint.sh b/entrypoint.sh
index cd96440..56a3e46 100644
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -1,4 +1,11 @@
 #!/bin/sh
+set -e
 
-# Start Flask app
-python app.py
+echo "==> Seeding database..."
+python -m src.seed
+
+echo "==> Generating sitemap..."
+python -m src.sitemap || echo "sitemap generation failed; continuing"
+
+echo "==> Starting Flask app..."
+exec python app.py diff --git a/patch.py b/patch.py deleted file mode 100644 index 5cd790a..0000000 --- a/patch.py +++ /dev/null @@ -1,46 +0,0 @@ -from importlib import import_module -from pathlib import Path - -def apply_patch(): - try: - # Dynamically import the module and get its file path - try: - module = import_module('pyshexc.parser.ShExDocLexer') - except ModuleNotFoundError as e: - # Give a precise, actionable hint for installation in the active interpreter - print( - "Missing dependency: 'pyshexc' (PyShExC).\n" - "Install it in the same environment you're using to run this script.\n" - "Examples:\n" - " python -m pip install PyShExC\n" - " # or with uv: uv pip install PyShExC\n" - " # or poetry: poetry add PyShExC\n" - " # or conda: conda install -c conda-forge pyshexc\n" - ) - return - - file_path = Path(module.__file__) - - if not file_path.exists(): - raise FileNotFoundError(f"Could not find the file: {file_path}") - - # Read the file content - file_content = file_path.read_text() - - # Replace 'from typing.io import TextIO' with 'from typing import TextIO' - new_content = file_content.replace("from typing.io import TextIO", "from typing import TextIO") - - # Only write if a change is needed - if new_content != file_content: - file_path.write_text(new_content) - print("Patch applied successfully!") - else: - print("No patch needed; target text not found (already patched or different version).") - - except FileNotFoundError as e: - print(e) - except Exception as e: - print(f"An error occurred: {e}") - -if __name__ == "__main__": - apply_patch() diff --git a/requirements.txt b/requirements.txt index 3e7ed1c..95607e1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,30 @@ -flask>=3.1.3 +annotated-types==0.7.0 +backoff==2.2.1 +blinker==1.9.0 +cachelib==0.13.0 +certifi==2026.2.25 +charset-normalizer==3.4.7 +click==8.3.2 +flask==3.1.3 flask-caching==2.3.1 +idna==3.11 +itsdangerous==2.2.0 +jinja2==3.1.6 +markupsafe==3.0.3 +mwoauth==0.4.0 +oauthlib==3.3.1 +pydantic==2.13.2 +pydantic-core==2.46.2 +pyjwt==2.12.1 requests==2.32.4 -#wikidataintegrator==0.9.30 -setuptools==78.1.1 # Provides pkg_resources module, required for wikidataintegrator -werkzeug>=3.0.6 -#pyBiodatafuse @ git+https://github.com/BioDataFuse/pyBiodatafuse.git -wikibaseintegrator>=0.12.14 - +requests-oauthlib==2.0.0 +setuptools==78.1.1 +typing-extensions==4.15.0 +typing-inspection==0.4.2 +ujson==5.12.0 +urllib3==2.6.3 +werkzeug==3.1.8 +wikibaseintegrator==0.12.15 +flask-smorest>=0.44 +marshmallow>=3.20 +APScheduler>=3.10,<4 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/api.py b/src/api.py new file mode 100644 index 0000000..646a88c --- /dev/null +++ b/src/api.py @@ -0,0 +1,564 @@ +"""RESTful API with auto-generated OpenAPI documentation. + +Uses flask-smorest (marshmallow + OpenAPI 3) so Swagger UI is +served automatically at /api/v1/docs. 
+""" + +from __future__ import annotations + +import json + +from flask import Flask +from flask_smorest import Api, Blueprint, abort +from marshmallow import Schema, fields + +from src.db import get_conn +from src.models.data.biostudies import BioStudiesExtractor +from src.models.data.zenodo import ZenodoExtractor +from src.models.data.mapping import normalize_all +from src.services.compound import ( + get_experimental_data, + get_full_compound, + get_identifiers, + get_properties, + get_toxicology, + is_valid_qid, +) + +BIOSTUDIES_COLLECTION = "VHP4Safety" +ZENODO_COMMUNITY = "vhp4safety" +ZENODO_RECORD_TYPE = "dataset" + + +# -- Marshmallow Schemas --------------------------------------------------- + +class ToolSchema(Schema): + id = fields.Str() + service = fields.Str() + description = fields.Str() + stage = fields.Str() + main_url = fields.Str() + inst_url = fields.Str() + html_name = fields.Str() + png_file_name = fields.Str() + + +class MethodSchema(Schema): + id = fields.Str() + method = fields.Str() + description = fields.Str() + stage = fields.Str() + substage = fields.Str() + catalog_webpage_url = fields.Str() + raw = fields.Dict(load_default=None) + + +class RegulatoryQuestionSchema(Schema): + key = fields.Str() + label = fields.Str() + explanation = fields.Str() + + +class StageExplanationSchema(Schema): + name = fields.Str() + explanation = fields.Str() + + +class CaseStudySchema(Schema): + slug = fields.Str() + title = fields.Str() + description = fields.Str() + image_src = fields.Str() + config_repo = fields.Str() + default_branch = fields.Str() + + +class CaseStudyDetailSchema(CaseStudySchema): + content_json = fields.Raw(load_default=None) + + +class CompoundSummarySchema(Schema): + wcid = fields.Str() + label = fields.Str() + inchi = fields.Str() + inchikey = fields.Str() + smiles = fields.Str(data_key="SMILES") + formula = fields.Str() + mass = fields.Str() + + +class CompoundIdentifierSchema(Schema): + property_label = fields.Str(data_key="propertyLabel") + value = fields.Str() + formatter_url = fields.Str(data_key="formatterURL") + + +class CompoundToxicologySchema(Schema): + property_label = fields.Str(data_key="propertyLabel") + value = fields.Str() + + +class CompoundExpDataSchema(Schema): + property_label = fields.Str(data_key="propEntityLabel") + value = fields.Str() + units_label = fields.Str(data_key="unitsLabel") + source = fields.Str() + doi = fields.Str() + see_also = fields.Str(data_key="seeAlso") + + +class CompoundDetailSchema(Schema): + summary = fields.Nested(CompoundSummarySchema) + identifiers = fields.List(fields.Nested(CompoundIdentifierSchema)) + toxicology = fields.List(fields.Nested(CompoundToxicologySchema)) + experimental_data = fields.List(fields.Nested(CompoundExpDataSchema)) + + +class DataSearchQuerySchema(Schema): + query = fields.Str(load_default="") + page = fields.Int(load_default=1) + size = fields.Int(load_default=18) + + +class DataSourceResultSchema(Schema): + total = fields.Int() + hits = fields.List(fields.Dict()) + error = fields.Str(allow_none=True) + + +class DataResultSchema(Schema): + biostudies = fields.Nested(DataSourceResultSchema) + zenodo = fields.Nested(DataSourceResultSchema) + + +class SearchQuerySchema(Schema): + stage = fields.Str(load_default=None) + search = fields.Str(load_default="") + + +# -- Blueprints ------------------------------------------------------------ + +tools_bp = Blueprint("tools", __name__, url_prefix="/api/tools", + description="Tool / service endpoints") +methods_bp = Blueprint("methods", 
__name__, url_prefix="/api/methods",
+                        description="Method endpoints")
+reg_q_bp = Blueprint("regulatory_questions", __name__,
+                     url_prefix="/api/regulatory-questions",
+                     description="Regulatory questions")
+stages_bp = Blueprint("stages", __name__, url_prefix="/api/stages",
+                      description="Safety-assessment workflow stages")
+casestudies_bp = Blueprint("casestudies", __name__,
+                           url_prefix="/api/casestudies",
+                           description="Case study endpoints")
+compounds_bp = Blueprint("compounds", __name__, url_prefix="/api/compounds",
+                         description="Compound data (SPARQL-backed)")
+data_bp = Blueprint("data", __name__, url_prefix="/api/data",
+                    description="Dataset search (BioStudies + Zenodo)")
+
+
+# -- Tools -----------------------------------------------------------------
+
+@tools_bp.route("/")
+@tools_bp.arguments(SearchQuerySchema, location="query")
+@tools_bp.response(200, ToolSchema(many=True))
+def list_tools(args):
+    """List all tools, with optional stage/search filters."""
+    conn = get_conn()
+    sql = "SELECT * FROM tools WHERE 1=1"
+    params = []
+    if args.get("stage"):
+        sql += " AND stage = ?"
+        params.append(args["stage"])
+    if args.get("search"):
+        sql += " AND service LIKE ?"
+        params.append(f"%{args['search']}%")
+    sql += " ORDER BY service"
+    rows = conn.execute(sql, params).fetchall()
+    conn.close()
+    return [dict(r) for r in rows]
+
+
+@tools_bp.route("/<tool_id>")
+@tools_bp.response(200, ToolSchema)
+def get_tool(tool_id):
+    """Get a single tool by ID."""
+    conn = get_conn()
+    row = conn.execute("SELECT * FROM tools WHERE id = ?", (tool_id,)).fetchone()
+    conn.close()
+    if not row:
+        abort(404, message="Tool not found")
+    return dict(row)
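For reference, the list/detail endpoints above can be exercised like this (a sketch; host and port assume the dev server from app.py, and the filter values are illustrative):

    import requests

    resp = requests.get(
        "http://localhost:5050/api/tools/",
        params={"stage": "ADME", "search": "kinet"},
        timeout=10,
    )
    resp.raise_for_status()
    for tool in resp.json():
        print(tool["id"], "->", tool["service"])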
+
+
+# -- Methods ---------------------------------------------------------------
+
+@methods_bp.route("/")
+@methods_bp.arguments(SearchQuerySchema, location="query")
+@methods_bp.response(200, MethodSchema(many=True))
+def list_methods(args):
+    """List all methods, with optional stage/search filters."""
+    conn = get_conn()
+    sql = "SELECT * FROM methods WHERE 1=1"
+    params = []
+    if args.get("stage"):
+        sql += " AND stage LIKE ?"
+        params.append(f"%{args['stage']}%")
+    if args.get("search"):
+        sql += " AND method LIKE ?"
+        params.append(f"%{args['search']}%")
+    sql += " ORDER BY method"
+    rows = conn.execute(sql, params).fetchall()
+    conn.close()
+    return [dict(r) for r in rows]
+
+
+@methods_bp.route("/<method_id>")
+@methods_bp.response(200, MethodSchema)
+def get_method(method_id):
+    """Get a single method by ID."""
+    conn = get_conn()
+    row = conn.execute("SELECT * FROM methods WHERE id = ?", (method_id,)).fetchone()
+    conn.close()
+    if not row:
+        abort(404, message="Method not found")
+    d = dict(row)
+    if d.get("raw_json"):
+        d["raw"] = json.loads(d["raw_json"])
+    return d
+
+
+# -- Regulatory Questions --------------------------------------------------
+
+@reg_q_bp.route("/")
+@reg_q_bp.response(200, RegulatoryQuestionSchema(many=True))
+def list_regulatory_questions():
+    """List all regulatory questions."""
+    conn = get_conn()
+    rows = conn.execute("SELECT * FROM regulatory_questions").fetchall()
+    conn.close()
+    return [dict(r) for r in rows]
+
+
+# -- Stages ----------------------------------------------------------------
+
+@stages_bp.route("/")
+@stages_bp.response(200, StageExplanationSchema(many=True))
+def list_stages():
+    """List all safety-assessment workflow stages."""
+    conn = get_conn()
+    rows = conn.execute("SELECT * FROM stage_explanations").fetchall()
+    conn.close()
+    return [dict(r) for r in rows]
+
+
+# -- Case Studies ----------------------------------------------------------
+
+@casestudies_bp.route("/")
+@casestudies_bp.response(200, CaseStudySchema(many=True))
+def list_case_studies():
+    """List all case studies."""
+    conn = get_conn()
+    rows = conn.execute("SELECT * FROM case_studies").fetchall()
+    conn.close()
+    return [dict(r) for r in rows]
+
+
+@casestudies_bp.route("/<slug>")
+@casestudies_bp.response(200, CaseStudyDetailSchema)
+def get_case_study(slug):
+    """Get a case study with its full content JSON."""
+    conn = get_conn()
+    row = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (slug,)).fetchone()
+    conn.close()
+    if not row:
+        abort(404, message="Case study not found")
+    d = dict(row)
+    if d.get("content_json"):
+        d["content_json"] = json.loads(d["content_json"])
+    return d
+
+
+# -- Compounds (SPARQL-backed) ---------------------------------------------
+
+@compounds_bp.route("/<cwid>")
+@compounds_bp.response(200, CompoundDetailSchema)
+def get_compound(cwid):
+    """Get full compound data."""
+    if not is_valid_qid(cwid):
+        abort(400, message="Invalid compound identifier")
+    try:
+        return get_full_compound(cwid).model_dump()
+    except Exception as e:
+        abort(502, message=str(e))
+
+
+@compounds_bp.route("/<cwid>/properties")
+@compounds_bp.response(200, CompoundSummarySchema)
+def get_compound_properties(cwid):
+    """Get core compound identifiers."""
+    if not is_valid_qid(cwid):
+        abort(400, message="Invalid compound identifier")
+    try:
+        summary = get_properties(cwid)
+    except Exception as e:
+        abort(502, message=str(e))
+    # Raised outside the try block so the 404 (an HTTPException) is not
+    # caught above and re-reported as a 502.
+    if not summary:
+        abort(404, message="No data found")
+    return summary.model_dump()
+
+
+@compounds_bp.route("/<cwid>/identifiers")
+@compounds_bp.response(200, CompoundIdentifierSchema(many=True))
+def get_compound_identifiers(cwid):
+    """Get external identifiers."""
+    if not is_valid_qid(cwid):
+        abort(400, message="Invalid compound identifier")
+    try:
+        return [i.model_dump() for i in get_identifiers(cwid)]
+    except Exception as e:
+        abort(502, message=str(e))
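A minimal endpoint test for the guard clauses above, using Flask's built-in test client (pytest-style sketch; it assumes the module-level app object can be imported in the test environment and that is_valid_qid rejects the placeholder string):

    from app import app

    def test_invalid_compound_is_rejected():
        client = app.test_client()
        resp = client.get("/api/compounds/not-a-qid/identifiers")
        assert resp.status_code == 400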
+
+
+@compounds_bp.route("/<cwid>/toxicology")
+@compounds_bp.response(200, CompoundToxicologySchema(many=True))
+def get_compound_toxicology(cwid):
+    """Get toxicology data."""
+    if not is_valid_qid(cwid):
+        abort(400, message="Invalid compound identifier")
+    try:
+        return [t.model_dump() for t in get_toxicology(cwid)]
+    except Exception as e:
+        abort(502, message=str(e))
+
+
+@compounds_bp.route("/<cwid>/experimental-data")
+@compounds_bp.response(200, CompoundExpDataSchema(many=True))
+def get_compound_exp_data(cwid):
+    """Get experimental measurements."""
+    if not is_valid_qid(cwid):
+        abort(400, message="Invalid compound identifier")
+    try:
+        return [d.model_dump() for d in get_experimental_data(cwid)]
+    except Exception as e:
+        abort(502, message=str(e))
+
+
+# -- Data (BioStudies + Zenodo passthrough) --------------------------------
+
+@data_bp.route("/")
+@data_bp.arguments(DataSearchQuerySchema, location="query")
+@data_bp.response(200, DataResultSchema)
+def list_data(args):
+    """Search datasets across BioStudies and Zenodo."""
+    query = args.get("query", "")
+    page = args.get("page", 1)
+    size = args.get("size", 18)
+
+    bs = BioStudiesExtractor(collection=BIOSTUDIES_COLLECTION)
+    zen = ZenodoExtractor(community=ZENODO_COMMUNITY, record_type=ZENODO_RECORD_TYPE)
+
+    if query:
+        bs_res = bs.search_studies(query, page=page, page_size=size)
+        zen_res = zen.search_records(query, page=page, size=size)
+    else:
+        bs_res = bs.list_studies(page=page, page_size=size, include_urls=True)
+        zen_res = zen.list_records(page=page, size=size, include_urls=True)
+
+    studies = bs_res.get("hits", [])
+    datasets = zen_res.get("hits", [])
+    studies, datasets = normalize_all(studies, datasets)
+
+    return {
+        "biostudies": {
+            "total": bs_res.get("total", 0),
+            "hits": [h.get("norm_metadata", h) for h in studies],
+            "error": bs_res.get("error"),
+        },
+        "zenodo": {
+            "total": zen_res.get("total", 0),
+            "hits": [h.get("norm_metadata", h) for h in datasets],
+            "error": zen_res.get("error"),
+        },
+    }
+
+
+@data_bp.route("/<data_id>")
+@data_bp.response(200)
+def get_data_detail(data_id):
+    """Get normalized metadata for a single dataset."""
+    bs = BioStudiesExtractor(collection=BIOSTUDIES_COLLECTION)
+    zen = ZenodoExtractor(community=ZENODO_COMMUNITY, record_type=ZENODO_RECORD_TYPE)
+    bs_res = bs.search_studies(data_id, page=1, page_size=1)
+    zen_res = zen.search_records(data_id, page=1, size=1)
+    studies = bs_res.get("hits", [])
+    datasets = zen_res.get("hits", [])
+    studies, datasets = normalize_all(studies, datasets)
+    if studies:
+        return studies[0].get("norm_metadata", studies[0])
+    if datasets:
+        return datasets[0].get("norm_metadata", datasets[0])
+    abort(404, message="Dataset not found")
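The same extractor classes work outside the request cycle, e.g. in a one-off script (a sketch; the query term is illustrative, and the exact keys inside norm_metadata are defined by src/models/data/mapping.py):

    from src.models.data.biostudies import BioStudiesExtractor
    from src.models.data.mapping import normalize_all
    from src.models.data.zenodo import ZenodoExtractor

    bs = BioStudiesExtractor(collection="VHP4Safety")
    zen = ZenodoExtractor(community="vhp4safety", record_type="dataset")

    bs_res = bs.search_studies("thyroid", page=1, page_size=5)
    zen_res = zen.search_records("thyroid", page=1, size=5)
    studies, datasets = normalize_all(bs_res.get("hits", []), zen_res.get("hits", []))
    for hit in studies + datasets:
        print(hit.get("norm_metadata", hit))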
+
+
+# -- Validation blueprint --------------------------------------------------
+
+validation_bp = Blueprint("validation", __name__, url_prefix="/api/validation",
+                          description="Data completeness validation")
+
+# Note: the module names are historical. src/models/cloud/method.py defines
+# the service-index models and src/models/cloud/tool.py defines the Method
+# model, so the cross-module aliases below are intentional.
+from src.models.cloud.method import ServiceIndexEntry as ToolModel
+from src.models.cloud.tool import Method as MethodModel
+from src.models.platform import (
+    RegulatoryQuestion as RQModel,
+    StageExplanation as SEModel,
+)
+from src.models.casestudy import CaseStudyCard as CSModel
+
+_ENTITY_REGISTRY = {
+    "tools": ("tools", ToolModel, "id", "service"),
+    "methods": ("methods", MethodModel, "id", "method"),
+    "case_studies": ("case_studies", CSModel, "slug", "title"),
+    "regulatory_questions": ("regulatory_questions", RQModel, "key", "label"),
+    "stage_explanations": ("stage_explanations", SEModel, "name", "name"),
+}
+
+_SKIP_FIELDS = {
+    "raw_json", "updated_at", "model_config",
+    "timestamp", "https",
+    "reg_q_1a", "reg_q_1b", "reg_q_2a",
+    "reg_q_2b", "reg_q_3a", "reg_q_3b",
+}
+
+
+class FieldCompleteness(Schema):
+    field = fields.Str()
+    present = fields.Bool()
+    value_preview = fields.Str(allow_none=True)
+
+
+class EntryValidation(Schema):
+    id = fields.Str()
+    label = fields.Str()
+    fields_total = fields.Int()
+    fields_filled = fields.Int()
+    completeness_pct = fields.Float()
+    missing = fields.List(fields.Str())
+    details = fields.List(fields.Nested(FieldCompleteness))
+
+
+class EntitySummary(Schema):
+    entity = fields.Str()
+    total_entries = fields.Int()
+    schema_fields = fields.List(fields.Str())
+    avg_completeness_pct = fields.Float()
+    fully_complete = fields.Int()
+    entries = fields.List(fields.Nested(EntryValidation))
+
+
+class ValidationReport(Schema):
+    generated_at = fields.Str()
+    entities = fields.List(fields.Nested(EntitySummary))
+
+
+def _is_filled(val):
+    if val is None:
+        return False
+    if isinstance(val, str) and val.strip() == "":
+        return False
+    return True
+
+
+def _preview(val, max_len=80):
+    if val is None:
+        return None
+    s = str(val)
+    return s[:max_len] + ("..." if len(s) > max_len else "")
+
+
+def _validate_entity(entity_name, table, pydantic_model, id_attr, label_attr):
+    check_fields = [f for f in pydantic_model.model_fields if f not in _SKIP_FIELDS]
+    conn = get_conn()
+    rows = conn.execute(f"SELECT * FROM {table}").fetchall()
+    conn.close()
+
+    entries = []
+    for row in rows:
+        d = dict(row)
+        details = []
+        filled = 0
+        missing = []
+        for f in check_fields:
+            val = d.get(f)
+            ok = _is_filled(val)
+            if ok:
+                filled += 1
+            else:
+                missing.append(f)
+            details.append({"field": f, "present": ok, "value_preview": _preview(val)})
+
+        total = len(check_fields)
+        pct = round(filled / total * 100, 1) if total else 100.0
+        entries.append({
+            "id": str(d.get(id_attr, "?")),
+            "label": str(d.get(label_attr) or d.get(id_attr, "?")),
+            "fields_total": total,
+            "fields_filled": filled,
+            "completeness_pct": pct,
+            "missing": missing,
+            "details": details,
+        })
+
+    avg = round(sum(e["completeness_pct"] for e in entries) / len(entries), 1) if entries else 0.0
+    fully = sum(1 for e in entries if e["completeness_pct"] == 100.0)
+    return {
+        "entity": entity_name,
+        "total_entries": len(entries),
+        "schema_fields": check_fields,
+        "avg_completeness_pct": avg,
+        "fully_complete": fully,
+        "entries": entries,
+    }
+
+
+@validation_bp.route("/")
+@validation_bp.response(200, ValidationReport)
+def validate_all():
+    """Full data completeness report."""
+    from datetime import datetime, timezone
+    return {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "entities": [
+            _validate_entity(name, tbl, model, id_a, lbl_a)
+            for name, (tbl, model, id_a, lbl_a) in _ENTITY_REGISTRY.items()
+        ],
+    }
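For reference, a small consumer of the completeness report produced by validate_all() (a sketch, not part of the patch; host and port assume the dev server):

    import requests

    report = requests.get("http://localhost:5050/api/validation/", timeout=30).json()
    for entity in report["entities"]:
        print(
            f"{entity['entity']}: {entity['avg_completeness_pct']}% average, "
            f"{entity['fully_complete']}/{entity['total_entries']} entries complete"
        )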
+
+
+@validation_bp.route("/<entity>")
+@validation_bp.response(200, EntitySummary)
+def validate_entity(entity):
+    """Data completeness report for a single entity type."""
+    if entity not in _ENTITY_REGISTRY:
+        abort(404, message=f"Unknown entity '{entity}'. Valid: {', '.join(_ENTITY_REGISTRY)}")
+    tbl, model, id_a, lbl_a = _ENTITY_REGISTRY[entity]
+    return _validate_entity(entity, tbl, model, id_a, lbl_a)
+
+
+# -- Registration helper ---------------------------------------------------
+
+def init_api(app: Flask) -> Api:
+    """Configure flask-smorest and register all API blueprints."""
+    app.config.update({
+        "API_TITLE": "VHP4Safety Platform API",
+        "API_VERSION": "v1",
+        "OPENAPI_VERSION": "3.0.3",
+        "OPENAPI_URL_PREFIX": "/api/v1",
+        "OPENAPI_SWAGGER_UI_PATH": "/docs",
+        "OPENAPI_SWAGGER_UI_URL": "https://cdn.jsdelivr.net/npm/swagger-ui-dist/",
+        "OPENAPI_REDOC_PATH": "/redoc",
+        "OPENAPI_REDOC_URL": "https://cdn.jsdelivr.net/npm/redoc@latest/bundles/redoc.standalone.js",
+    })
+    smorest_api = Api(app)
+    for bp in (tools_bp, methods_bp, reg_q_bp, stages_bp,
+               casestudies_bp, compounds_bp, data_bp, validation_bp):
+        smorest_api.register_blueprint(bp)
+    return smorest_api
diff --git a/src/casestudy_resolver.py b/src/casestudy_resolver.py
new file mode 100644
index 0000000..561624f
--- /dev/null
+++ b/src/casestudy_resolver.py
@@ -0,0 +1,298 @@
+"""Resolve case-study content from the database step hierarchy.
+
+Case study content JSON is seeded into the ``case_studies`` table from
+the VHP4Safety/ui-casestudy-config GitHub repo at seed time.
+The JSON has up to 6 nesting levels:
+    step1Contents → intro + regulatory questions
+    step2Contents → dict[question_key → nav with process-flow steps]
+    step3Contents → dict[q → dict[step → node]]
+    step4Contents → dict[q → dict[step → dict[substep → node]]]
+    step5Contents → dict[q → dict[...]]
+    step6Contents → dict[q → dict[...]]
+
+Given a URL path like /casestudies/kidney/Q1/Kinetics we resolve the
+node at step3Contents["Q1"]["Kinetics"] and render it server-side.
+""" + +from __future__ import annotations + +import json +from dataclasses import dataclass, field +from typing import Any, Optional + +from src.db import get_conn + + +# In-memory cache keyed by slug +_content_cache: dict[str, dict] = {} + + +def get_content(slug: str) -> dict | None: + """Load case-study content JSON from the database (cached).""" + if slug in _content_cache: + return _content_cache[slug] + + conn = get_conn() + row = conn.execute("SELECT content_json FROM case_studies WHERE slug = ?", (slug,)).fetchone() + conn.close() + if not row or not row["content_json"]: + return None + + data = json.loads(row["content_json"]) + _content_cache[slug] = data + return data + + +# ── Resolved result ────────────────────────────────────────────────────── + +STEP_TYPE_COLORS = { + "workflow step": "btn-vhpdarkteal", + "workflow-step": "btn-vhpdarkteal", + "workflow substep": "btn-vhplightteal", + "workflow-substep": "btn-vhplightteal", + "process flow step": "btn-vhpdarkpurple", + "process-flow-step": "btn-vhpdarkpurple", + "regulatory question": "btn-vhppink-distinct", + "regulatory-question": "btn-vhppink-distinct", + "tool": "btn-vhpblue", +} + +# Workflow header definitions +WORKFLOW_STEPS = [ + {"number": 1, "type": "regulatory-question", + "label": "Regulatory Question"}, + {"number": 2, "type": "workflow-step", + "label": "Safety Assessment Workflow Step"}, + {"number": 3, "type": "process-flow-step", + "label": "Case Study Step"}, + {"number": 4, "type": "workflow-substep", + "label": "Case Study Substep"}, + {"number": 5, "type": "tool", + "label": "Tools, Models and Data"}, +] + + +def btn_color(step_type: str | None) -> str: + """Return CSS class for a step button based on its type.""" + if not step_type: + return "btn-vhpblue" + return STEP_TYPE_COLORS.get(step_type, "btn-vhpblue") + + +@dataclass +class Breadcrumb: + label: str + url: str + active: bool = False + + +@dataclass +class StepButtonResolved: + """A button ready to render in Jinja.""" + label: str + description: str = "" + css_class: str = "btn-vhpblue" + url: str = "" + disabled: bool = False + is_tool_link: bool = False + + +@dataclass +class ResolvedStep: + """Everything the template needs to render one case-study page.""" + case_slug: str = "" + case_title: str = "" + step_number: int = 1 + nav_title: str = "" + nav_description: str = "" + image_html: str = "" + buttons: list[StepButtonResolved] = field(default_factory=list) + accordion_sections: list[dict] = field(default_factory=list) + content_html: str = "" + breadcrumbs: list[Breadcrumb] = field(default_factory=list) + workflow_steps: list[dict] = field(default_factory=list) + path_parts: list[str] = field(default_factory=list) + + +def _slugify(value: str) -> str: + """Convert space-separated label to URL-safe slug.""" + return value.replace(" ", "_") + + +def _unslugify(value: str) -> str: + """Convert URL slug back to the key used in JSON.""" + return value.replace("_", " ") + + +def _make_url(case: str, parts: list[str]) -> str: + """Build an absolute URL from case slug and path parts.""" + base = f"/casestudies/{case}" + if parts: + return base + "/" + "/".join(_slugify(p) for p in parts) + return base + + +def _parse_content(raw: Any) -> tuple[str, list[dict]]: + """Split content into HTML string and accordion sections list.""" + if raw is None: + return "", [] + if isinstance(raw, str): + return raw, [] + if isinstance(raw, list): + sections = [] + for item in raw: + if isinstance(item, dict): + sections.append(item) + return "", sections + return 
str(raw), [] + + +def resolve( + slug: str, + path_parts: list[str], + branch: str = "main", +) -> Optional[ResolvedStep]: + """Resolve a URL path to the correct step content. + + Parameters + ---------- + slug : str + Case study slug (kidney, parkinson, thyroid). + path_parts : list[str] + Path segments after /casestudies// — e.g. + ["Q1", "Kinetics"] for step 3. + + Returns + ------- + ResolvedStep or None if the path doesn't resolve. + """ + data = get_content(slug) + if data is None: + return None + + step1 = data.get("step1Contents", {}) + case_title = step1.get("navTitle", slug.title() + " Case Study") + + result = ResolvedStep( + case_slug=slug, + case_title=case_title, + path_parts=list(path_parts), + ) + + # Build workflow header state + active_step = len(path_parts) + 1 + result.step_number = active_step + for ws in WORKFLOW_STEPS: + state = "completed" if ws["number"] < active_step \ + else "active" if ws["number"] == active_step \ + else "" + result.workflow_steps.append({**ws, "state": state}) + + # ── Step 1: no path parts ───────────────────────────────────── + if not path_parts: + result.nav_title = step1.get("navTitle", "") + result.nav_description = step1.get("navDescription", "") + html, sections = _parse_content(step1.get("content")) + result.content_html = html + result.accordion_sections = sections + # Buttons = regulatory questions + for q in step1.get("questions", []): + result.buttons.append(StepButtonResolved( + label=q.get("label", ""), + description=q.get("description", ""), + css_class=btn_color( + q.get("type", "regulatory-question") + ), + url=_make_url(slug, [q["value"]]), + disabled=q.get("state") == "disabled", + )) + result.breadcrumbs = [ + Breadcrumb("Case Studies", "/casestudies"), + Breadcrumb(case_title, "", active=True), + ] + return result + + # ── Step 2+: walk the nested dicts ──────────────────────────── + # path_parts[0] is the question key (e.g. "Q1") + # path_parts[1] is the step2 choice (e.g. "Kinetics") + # etc. 
+ depth = len(path_parts) + step_key = f"step{depth + 1}Contents" + + # Navigate to the correct node + container = data.get(step_key, {}) + node = container + for i, part in enumerate(path_parts): + key = _unslugify(part) + if isinstance(node, dict) and key in node: + node = node[key] + else: + # Try original (slugified) key as fallback + if isinstance(node, dict) and part in node: + node = node[part] + else: + return None + + if not isinstance(node, dict): + return None + + # Extract node fields + result.nav_title = node.get("navTitle", "") + result.nav_description = node.get("navDescription", "") + result.image_html = node.get("image", "") + html, sections = _parse_content(node.get("content")) + result.content_html = html + result.accordion_sections = sections + + # Determine next-step buttons + base_url_parts = list(path_parts) + + if node.get("steps"): + for s in node["steps"]: + val = s.get("value", s.get("label", "")) + result.buttons.append(StepButtonResolved( + label=s.get("label", ""), + description=s.get("description", ""), + css_class=btn_color(s.get("type")), + url=_make_url(slug, base_url_parts + [val]), + disabled=s.get("state") == "disabled", + )) + elif node.get("tools"): + for t in node["tools"]: + tool_id = t.get("id") + route = t.get("route", "tools") + if tool_id: + url = f"/{route}/{tool_id}" + is_tool = True + else: + url = "" + is_tool = False + result.buttons.append(StepButtonResolved( + label=t.get("label", ""), + description=t.get("description", ""), + css_class=btn_color(t.get("type", "tool")), + url=url, + disabled=t.get("state") == "disabled", + is_tool_link=is_tool, + )) + + # Breadcrumbs + crumbs = [Breadcrumb("Case Studies", "/casestudies")] + crumbs.append(Breadcrumb( + case_title, _make_url(slug, []), + )) + + # Build intermediate crumbs + # Step 2 label = "Regulatory Question " + for i, part in enumerate(path_parts): + is_last = (i == len(path_parts) - 1) + label = _unslugify(part) + if i == 0: + label = f"Regulatory Question {label}" + url = _make_url(slug, path_parts[: i + 1]) + crumbs.append(Breadcrumb( + label, url, active=is_last, + )) + + result.breadcrumbs = crumbs + return result diff --git a/src/db.py b/src/db.py new file mode 100644 index 0000000..4affbf4 --- /dev/null +++ b/src/db.py @@ -0,0 +1,75 @@ +"""Thin sqlite3 helper. 
No ORM — just raw SQL.""" + +from __future__ import annotations + +import os +import sqlite3 +from contextlib import contextmanager + +DB_PATH = os.environ.get("DATABASE_PATH", "data/vhp4safety.db") + +_TABLES = [ + """CREATE TABLE IF NOT EXISTS tools ( + id TEXT PRIMARY KEY, service TEXT NOT NULL, description TEXT, + stage TEXT, html_name TEXT, md_file_name TEXT, png_file_name TEXT, + main_url TEXT, inst_url TEXT, + reg_q_1a INTEGER, reg_q_1b INTEGER, reg_q_2a INTEGER, + reg_q_2b INTEGER, reg_q_3a INTEGER, reg_q_3b INTEGER, + login TEXT, api_type TEXT, casestudy TEXT, provider TEXT, + provider_email TEXT, citation TEXT, version TEXT, license TEXT, + sourcecode TEXT, docker TEXT, bio_tools TEXT, tess TEXT, + raw_json TEXT, updated_at TEXT + )""", + """CREATE TABLE IF NOT EXISTS methods ( + id TEXT PRIMARY KEY, method TEXT NOT NULL, issue_number INTEGER, + description TEXT, stage TEXT, substage TEXT, + catalog_webpage_url TEXT, case_study TEXT, regulatory_question TEXT, + reg_q_1a INTEGER, reg_q_1b INTEGER, reg_q_2a INTEGER, + reg_q_2b INTEGER, reg_q_3a INTEGER, reg_q_3b INTEGER, + data_producer TEXT, sop TEXT, vendor TEXT, catalog_number TEXT, + citation TEXT, type_iri TEXT, ontology TEXT, + key_event_id TEXT, aop_id TEXT, raw_json TEXT, updated_at TEXT + )""", + """CREATE TABLE IF NOT EXISTS regulatory_questions ( + key TEXT PRIMARY KEY, label TEXT NOT NULL, explanation TEXT NOT NULL + )""", + """CREATE TABLE IF NOT EXISTS stage_explanations ( + name TEXT PRIMARY KEY, explanation TEXT NOT NULL + )""", + """CREATE TABLE IF NOT EXISTS glossary_stage_mappings ( + glossary_url TEXT PRIMARY KEY, stage_name TEXT NOT NULL + )""", + """CREATE TABLE IF NOT EXISTS case_studies ( + slug TEXT PRIMARY KEY, title TEXT NOT NULL, description TEXT NOT NULL, + image_src TEXT, image_alt TEXT, + config_repo TEXT DEFAULT 'VHP4Safety/ui-casestudy-config', + default_branch TEXT DEFAULT 'main', content_json TEXT + )""", +] + + +def get_conn() -> sqlite3.Connection: + """Return a new connection with Row factory.""" + conn = sqlite3.connect(DB_PATH) + conn.row_factory = sqlite3.Row + return conn + + +@contextmanager +def get_db(): + """Context manager: yields a connection, auto-closes.""" + conn = get_conn() + try: + yield conn + finally: + conn.close() + + +def init_db() -> None: + """Create all tables (idempotent).""" + os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True) + conn = sqlite3.connect(DB_PATH) + for ddl in _TABLES: + conn.execute(ddl) + conn.commit() + conn.close() diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/models/casestudy.py b/src/models/casestudy.py new file mode 100644 index 0000000..430ef03 --- /dev/null +++ b/src/models/casestudy.py @@ -0,0 +1,209 @@ +"""Pydantic models for VHP4Safety case-study content JSON schemas. + +The JSON files originate from a separate GitHub repo +(VHP4Safety/ui-casestudy-config) and are fetched once during database +seeding (``python -m src.seed``). The full JSON blob is stored in +the ``case_studies.content_json`` column and resolved server-side +by ``src.casestudy_resolver`` into rendered Jinja templates. + +These models formalise the structure so it can be validated +server-side, used in tests, and consumed by type-aware code. 
+ +Hierarchy (up to 6 levels deep): + CaseStudyContent ← root of one *_content.json file + └ Step1Contents ← intro + regulatory-question buttons + └ step2Contents ← dict[question_key → ProcessFlowNav] + └ step3Contents ← dict[question_key → dict[step_label → WorkflowStepNode]] + └ step4–6Contents ← additional nesting (same WorkflowStepNode shape) + +Every "node" at step ≥ 2 follows the same recursive pattern captured +by ``WorkflowStepNode``. +""" + +from __future__ import annotations + +from enum import Enum +from typing import Any, Optional + +from pydantic import BaseModel, Field + + +# ── Enums ───────────────────────────────────────────────────────────────── + + +class StepType(str, Enum): + """Button colour / role categories used by the JS renderer.""" + + WORKFLOW_STEP = "workflow step" + WORKFLOW_SUBSTEP = "workflow substep" + PROCESS_FLOW_STEP = "process flow step" + REGULATORY_QUESTION = "regulatory question" + TOOL = "tool" + + +class CaseStudySlug(str, Enum): + """Known case-study URL slugs.""" + + KIDNEY = "kidney" + PARKINSON = "parkinson" + THYROID = "thyroid" + + +# ── Leaf / reusable pieces ──────────────────────────────────────────────── + + +class StepButton(BaseModel): + """A single clickable button shown in a step panel. + + Appears in ``questions``, ``steps``, and ``tools`` arrays. + """ + + label: str + value: Optional[str] = None + description: Optional[str] = None + type: Optional[StepType] = None + state: Optional[str] = None # e.g. "disabled" + + # tool-specific fields + id: Optional[str] = None + route: Optional[str] = None # e.g. "tools" or "methods" + + model_config = {"extra": "allow"} + + +class AccordionSection(BaseModel): + """One collapsible section inside ``content`` when it is an array.""" + + section: Optional[str] = None + description: Optional[str] = None + + model_config = {"extra": "allow"} + + +# Content can be a raw HTML string **or** a list of accordion sections. +# We keep it as ``Any`` so both shapes validate; downstream code already +# branches on ``Array.isArray(content)`` in JS. +ContentBlock = str | list[AccordionSection] | None + + +# ── Step 1 (intro + regulatory questions) ──────────────────────────────── + + +class Step1Contents(BaseModel): + """Top-level intro panel for a case study. + + Shown on first load; contains the two regulatory-question buttons. + """ + + navTitle: str + navDescription: str = "" + questions: list[StepButton] = Field(default_factory=list) + content: Any = None # HTML string or accordion list + + model_config = {"extra": "allow"} + + +# ── Generic workflow node (steps 2–6) ───────────────────────────────────── + + +class WorkflowStepNode(BaseModel): + """A single node at any depth in the step hierarchy. + + Depending on what keys are present the JS renderer shows: + * ``steps`` → navigable sub-step buttons (goes deeper) + * ``tools`` → tool buttons (leaf, may link to /tools/) + * neither → plain content panel + + Nodes may contain ``content`` as HTML **or** accordion JSON. + ``image`` is an optional raw HTML string (e.g. an tag). + """ + + navTitle: Optional[str] = None + navDescription: Optional[str] = None + steps: Optional[list[StepButton]] = None + tools: Optional[list[StepButton]] = None + content: Any = None + image: Optional[str] = None + + # Some step-3 entries carry a flag to signal step-4 exists + step4content: Optional[str] = None + + model_config = {"extra": "allow"} + + +class ProcessFlowNav(BaseModel): + """Step-2 panel: safety-assessment workflow steps for one question. 
+
+    Keys ``steps`` list the process-flow buttons; ``content`` is the
+    intro HTML.
+    """
+
+    navTitle: str = ""
+    navDescription: str = ""
+    steps: list[StepButton] = Field(default_factory=list)
+    content: Any = None
+    image: Optional[str] = None
+
+    model_config = {"extra": "allow"}
+
+
+# ── Root document ─────────────────────────────────────────────────────────
+
+# Steps 3-6 are nested dicts whose keys are dynamic (question key,
+# step label, sub-step label …). We type them as deeply as
+# practical; the innermost values are always WorkflowStepNode.
+
+Step3Map = dict[str, dict[str, WorkflowStepNode]]
+Step4Map = dict[str, dict[str, dict[str, WorkflowStepNode]]]
+Step5Map = dict[str, dict[str, dict[str, dict[str, WorkflowStepNode]]]]
+Step6Map = dict[
+    str, dict[str, dict[str, dict[str, dict[str, WorkflowStepNode]]]]
+]
+
+
+class CaseStudyContent(BaseModel):
+    """Root schema for a ``_content.json`` file.
+
+    Mirrors exactly the shape consumed by ``casestudies.js``.
+    """
+
+    step1Contents: Step1Contents
+    step2Contents: dict[str, ProcessFlowNav] = Field(
+        default_factory=dict
+    )
+    step3Contents: Optional[Step3Map] = None
+    step4Contents: Optional[Step4Map] = None
+    step5Contents: Optional[Step5Map] = None
+    step6Contents: Optional[Step6Map] = None
+
+    model_config = {"extra": "allow"}
+
+
+# ── Case study card (listing page) ───────────────────────────────────────
+
+
+class CaseStudyCard(BaseModel):
+    """Metadata for one card on the /casestudies listing page."""
+
+    slug: CaseStudySlug
+    title: str
+    description: str
+    image_src: str = ""
+    image_alt: str = ""
+    url: str = ""
+    config_repo: Optional[str] = None
+    content_json: Optional[str] = None
+
+
+# ── Convenience: full registry ────────────────────────────────────────────
+
+
+class CaseStudyRegistry(BaseModel):
+    """All known case studies with their summary cards and loaded content."""
+
+    cards: list[CaseStudyCard] = Field(default_factory=list)
+    content: dict[CaseStudySlug, CaseStudyContent] = Field(
+        default_factory=dict,
+    )
+
+    model_config = {"extra": "allow"}
diff --git a/src/models/cloud/method.py b/src/models/cloud/method.py
new file mode 100644
index 0000000..48d37f9
--- /dev/null
+++ b/src/models/cloud/method.py
@@ -0,0 +1,134 @@
+"""Pydantic models for VHP4Safety Cloud service JSON schemas (docs/service/*.json)."""
+
+from __future__ import annotations
+
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class ServiceContact(BaseModel):
+    name: Optional[str] = None
+    email: Optional[str] = None
+
+
+class ServiceProvider(BaseModel):
+    contact: Optional[ServiceContact] = None
+    url: Optional[str] = None
+    name: Optional[str] = None
+
+
+class ServiceInstance(BaseModel):
+    type: Optional[str] = None
+    url: Optional[str] = None
+    license: Optional[str] = None
+    version: Optional[str] = None
+    source: Optional[str] = None
+    vhp_platform: Optional[str] = Field(None, alias="vhp-platform")
+
+    model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class ServiceAccess(BaseModel):
+    API: Optional[str] = None
+    login: Optional[str] = None
+
+    model_config = {"extra": "allow"}
+
+
+class ServiceIntro(BaseModel):
+    title: Optional[str] = None
+    url: Optional[str] = None
+
+
+class RegulatoryQuestion(BaseModel):
+    q1a: Optional[str] = Field(None, alias="1a")
+    q1b: Optional[str] = Field(None, alias="1b")
+    q2a: Optional[str] = Field(None, alias="2a")
+    q2b: Optional[str] = Field(None, alias="2b")
+    q3a: Optional[str] = Field(None, alias="3a")
+    q3b: Optional[str] = Field(None, alias="3b")
+
+    model_config = {"populate_by_name": True}
+
+
+class Service(BaseModel):
+    """A single service entry (docs/service/*.json)."""
+
+    id: str
+    service: str = Field(description="Service display name")
+    description: Optional[str] = None
+
+    stage: Optional[str] = None
+    substage: Optional[str] = None
+    screenshot: Optional[str] = None
+    url: Optional[str] = None
+
+    instance: Optional[ServiceInstance] = None
+    intro: Optional[ServiceIntro] = None
+    provider: Optional[ServiceProvider] = None
+    access: Optional[ServiceAccess] = None
+    regulatory_question: Optional[RegulatoryQuestion] = Field(
+        None, alias="regulatory-question"
+    )
+    ELIXIR: Optional[dict] = None
+
+    model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class ServiceIndexEntry(BaseModel):
+    """A service as represented in the index (cap/service_index.json)."""
+
+    id: str
+    service: str
+    description: Optional[str] = None
+
+    html_name: Optional[str] = None
+    md_file_name: Optional[str] = None
+    png_file_name: Optional[str] = None
+    stage: Optional[str] = None
+    main_url: Optional[str] = None
+    inst_url: Optional[str] = None
+
+    # Regulatory question flags
+    reg_q_1a: Optional[str] = None
+    reg_q_1b: Optional[str] = None
+    reg_q_2a: Optional[str] = None
+    reg_q_2b: Optional[str] = None
+    reg_q_3a: Optional[str] = None
+    reg_q_3b: Optional[str] = None
+
+    # Upstream issue-template fields (new-tool-service-entry.yml)
+    login: Optional[str] = None
+    api_type: Optional[str] = Field(None, alias="api")
+    casestudy: Optional[str] = None
+    provider: Optional[str] = None
+    provider_email: Optional[str] = Field(
+        None, alias="provider-email"
+    )
+    citation: Optional[str] = None
+    version: Optional[str] = None
+    license: Optional[str] = None
+    sourcecode: Optional[str] = None
+    docker: Optional[str] = None
+    bio_tools: Optional[str] = Field(None, alias="bioTools")
+    tess: Optional[str] = None
+
+    model_config = {"populate_by_name": True, "extra": "allow"}
+
+
+class ServiceIndex(BaseModel):
+    """The full service index (cap/service_index.json).
+
+    A mapping of service id → ServiceIndexEntry.
+    """
+
+    root: dict[str, ServiceIndexEntry] = Field(default_factory=dict)
+
+    model_config = {"extra": "allow"}
+
+    @classmethod
+    def from_dict(cls, data: dict) -> ServiceIndex:
+        return cls(
+            root={k: ServiceIndexEntry.model_validate(v) for k, v in data.items()}
+        )
diff --git a/src/models/cloud/tool.py b/src/models/cloud/tool.py
new file mode 100644
index 0000000..01b574a
--- /dev/null
+++ b/src/models/cloud/tool.py
@@ -0,0 +1,98 @@
+"""Pydantic models for VHP4Safety Cloud method JSON schemas (docs/methods/*.json)."""
+
+from __future__ import annotations
+
+from datetime import datetime
+from typing import Optional
+
+from pydantic import BaseModel, Field
+
+
+class Method(BaseModel):
+    """A single method entry (docs/methods/*.json).
+
+    Field names match the column names of the ``methods`` table in
+    ``src/db.py``; aliases map to the raw JSON keys from the cloud repo.
+ """ + + id: str + method: str = Field(description="Method title (from issue title)") + issue_number: Optional[int] = None + description: Optional[str] = Field( + None, alias="method_description_content" + ) + + # Upstream issue-template fields (new-tool-method-entry.yml) + data_producer: Optional[str] = Field( + None, alias="data_producer_content" + ) + sop: Optional[str] = Field( + None, alias="available_sop_or_protocol_content" + ) + vendor: Optional[str] = Field( + None, alias="vendor_content" + ) + catalog_number: Optional[str] = Field( + None, alias="catalog_number_content" + ) + catalog_webpage_url: Optional[str] = None + citation: Optional[str] = Field( + None, alias="citation_content" + ) + stage: Optional[str] = Field( + None, alias="vhp4safety_workflow_stage_content" + ) + substage: Optional[str] = Field( + None, alias="workflow_substage_content" + ) + case_study: Optional[str] = Field( + None, alias="case_study_content" + ) + regulatory_question: Optional[str] = Field( + None, alias="regulatory_question_content" + ) + type_iri: Optional[str] = Field( + None, alias="ontology_term_content" + ) + ontology: Optional[str] = Field( + None, alias="type_content" + ) + key_event_id: Optional[str] = Field( + None, + alias="relevant_aop_wiki_key_event(s)_to_the_assay_content", + ) + aop_id: Optional[str] = Field( + None, + alias="relevant_aop_wiki_adverse_outcome_pathway(s)" + "_to_the_assay_content", + ) + + # Regulatory question flags + reg_q_1a: Optional[str] = None + reg_q_1b: Optional[str] = None + reg_q_2a: Optional[str] = None + reg_q_2b: Optional[str] = None + reg_q_3a: Optional[str] = None + reg_q_3b: Optional[str] = None + + timestamp: Optional[datetime] = None + https: Optional[str] = Field( + None, description="Broken URL fragment in some files" + ) + + model_config = {"populate_by_name": True, "extra": "allow"} + + +class MethodIndex(BaseModel): + """The full methods index (cap/methods_index.json). + + A mapping of method id → Method. + """ + + root: dict[str, Method] = Field(default_factory=dict) + + model_config = {"extra": "allow"} + + @classmethod + def from_dict(cls, data: dict) -> MethodIndex: + return cls(root={k: Method.model_validate(v) for k, v in data.items()}) diff --git a/src/models/compound.py b/src/models/compound.py new file mode 100644 index 0000000..1b871f4 --- /dev/null +++ b/src/models/compound.py @@ -0,0 +1,75 @@ +"""Pydantic models for compound data from CompoundCloud SPARQL. + +These are not stored in the database — they model the responses from +the CompoundCloud Wikibase SPARQL endpoint and from Wikidata QLever +for experimental data. 
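+
+A minimal construction sketch (the entity URI and field values are
+hypothetical)::
+
+    summary = CompoundSummary(
+        wcid="https://compoundcloud.wikibase.cloud/entity/Q42",
+        label="Dinoseb",
+    )
+    detail = CompoundDetail(summary=summary)
+    assert detail.identifiers == []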
+""" + +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel, Field + + +class CompoundSummary(BaseModel): + """Core compound identifiers from CompoundCloud.""" + + wcid: str = Field(description="CompoundCloud entity URI") + label: str = Field(description="Human-readable compound name") + inchi: str = "" + inchikey: str = "" + smiles: str = Field("", alias="SMILES") + formula: str = "" + mass: str = "" + + model_config = {"populate_by_name": True} + + +class CompoundIdentifier(BaseModel): + """A single external identifier for a compound.""" + + property_label: str = Field( + "", description="Name of the identifier property" + ) + value: str = "" + formatter_url: str = Field( + "", description="URL template for the identifier" + ) + + +class CompoundToxicology(BaseModel): + """A toxicology property row.""" + + property_label: str = "" + value: str = "" + + +class CompoundExperimentalDatum(BaseModel): + """A single experimental measurement from Wikidata.""" + + property_label: str = Field( + "", description="Measured property name" + ) + value: str = "" + units_label: str = "" + source: str = "" + doi: str = "" + see_also: str = Field( + "", description="Link to the Wikidata statement" + ) + + +class CompoundDetail(BaseModel): + """Full compound view combining all SPARQL query results.""" + + summary: Optional[CompoundSummary] = None + identifiers: list[CompoundIdentifier] = Field( + default_factory=list + ) + toxicology: list[CompoundToxicology] = Field( + default_factory=list + ) + experimental_data: list[CompoundExperimentalDatum] = Field( + default_factory=list + ) diff --git a/src/models/data/__init__.py b/src/models/data/__init__.py new file mode 100644 index 0000000..6fd1d01 --- /dev/null +++ b/src/models/data/__init__.py @@ -0,0 +1,50 @@ +"""Data models & extractors for BioStudies and Zenodo datasets.""" + +from src.models.data.biostudies import BioStudiesExtractor +from src.models.data.zenodo import ZenodoExtractor +from src.models.data.mapping import normalize_all +from src.models.data.schemas import ( + Author, + Attribute, + AuthorDetail, + BiologicalContext, + BioStudiesParsedMetadata, + DataFile, + ExperimentalDesign, + FileEntry, + Funding, + LinkEntry, + NormalizedMetadata, + ProtocolEntry, + Publication, + TechnicalDetails, + UrlExistsResult, + ZenodoFileEntry, + ZenodoParsedMetadata, +) + +__all__ = [ + # Extractors + "BioStudiesExtractor", + "ZenodoExtractor", + # Normalizer + "normalize_all", + # Pydantic models + "Author", + "Attribute", + "AuthorDetail", + "BiologicalContext", + "BioStudiesParsedMetadata", + "DataFile", + "ExperimentalDesign", + "FileEntry", + "Funding", + "LinkEntry", + "NormalizedMetadata", + "ProtocolEntry", + "Publication", + "TechnicalDetails", + "UrlExistsResult", + "ZenodoFileEntry", + "ZenodoParsedMetadata", +] diff --git a/src/models/data/biostudies.py b/src/models/data/biostudies.py new file mode 100644 index 0000000..2756cb1 --- /dev/null +++ b/src/models/data/biostudies.py @@ -0,0 +1,867 @@ +import requests +import json +import time +import re +from urllib.parse import quote + + +class BioStudiesExtractor: + """Class to handle BioStudies API interactions""" + + _SPLIT_RE = re.compile(r"^(.*?)(\d+)$") + + def __init__(self, collection: str = ""): + self.base_url = "https://www.ebi.ac.uk/biostudies/api/v1" + self.ftp_base = "https://ftp.ebi.ac.uk/pub/databases/biostudies/" + self.studies_url = self.base_url + "/studies" + self.search_url = ( + f"{self.base_url}/{collection}/search" + if collection 
+ else f"{self.base_url}/search" + ) + + # ----------------------------- + # ID validation / URL building + # ----------------------------- + def validate_study_id(self, study_id): + """ + Validate BioStudies ID format + + Args: + study_id (str): BioStudies accession ID + + Returns: + tuple: (is_valid, cleaned_id, error_message) + """ + if not study_id or not isinstance(study_id, str): + return False, None, "Study ID is required" + + verified_id = study_id.strip().upper() + + # Examples: S-ONTX26, E-MTAB-1234, S-BSST123, S-VHPS21, S-TOXR1735 + patterns = [ + r"^S-[A-Z0-9]+$", # Studies starting with S- + r"^E-[A-Z]+-\d+$", # Expression studies like E-MTAB-1234 + r"^[A-Z]+-\d+$", # General pattern like ABC-123 + ] + + if not any(re.match(pattern, verified_id) for pattern in patterns): + return ( + False, + verified_id, + "Invalid BioStudies ID format. Expected format: S-ONTX26, E-MTAB-1234, etc.", + ) + + return True, verified_id, None + + def split_text_int(self, value: str): + """ + Splits trailing integer from a string. + 'S-VHPS21' -> ('S-VHPS', 21) + 'ABC' -> ('ABC', None) + 'X-12A' -> ('X-12A', None) + """ + if not value: + return value, None + m = self._SPLIT_RE.match(value) + if not m: + return value, None + prefix, num = m.group(1), int(m.group(2)) + return prefix, num + + def build_biostudies_https_file_url(self, accno: str, filename: str) -> str | None: + """ + Constructs: + https://ftp.ebi.ac.uk/pub/databases/biostudies/{prefix}/{num3}/{accno}/Files/{filename} + + Returns None if accno has no trailing integer. + + Note: + - We keep "/" safe in case filename contains subfolders (rare, but possible). + """ + prefix, num = self.split_text_int(accno) + if num is None or not filename: + return None + + num3 = f"{num:03d}" + + # Encode only the filename segment (allow "/" for potential subpaths) + safe_name = quote(filename, safe="/") + + return ( + self.ftp_base + + f"{prefix}/{num3}/{accno}/Files/{safe_name}" + ) + + def url_exists_no_download(self, url: str, timeout=(3.05, 10)): + """ + Returns a dict describing existence with minimal data transfer. + - tries HEAD + - falls back to GET Range bytes=0-0 + """ + result = { + "url": url, + "exists": False, + "status_code": None, + "content_length": None, + "final_url": None, + "error": None, + "method": None, + } + + if not url: + result["error"] = "Empty URL" + return result + + try: + # 1) HEAD (preferred: no body) + r = requests.head(url, allow_redirects=True, timeout=timeout) + result["status_code"] = r.status_code + result["final_url"] = str(r.url) + result["method"] = "HEAD" + + if r.status_code == 200: + result["exists"] = True + result["content_length"] = r.headers.get("Content-Length") + return result + + # 2) Fallback if HEAD not allowed or forbidden, etc. + if r.status_code in (403, 405): + rg = requests.get( + url, + stream=True, + allow_redirects=True, + headers={"Range": "bytes=0-0"}, + timeout=timeout, + ) + result["status_code"] = rg.status_code + result["final_url"] = str(rg.url) + result["method"] = "GET_RANGE" + + # 206 Partial Content is a strong "exists" + if rg.status_code in (200, 206): + result["exists"] = True + result["content_length"] = rg.headers.get("Content-Length") + + return result + + # other codes (404, 410, 500...) 
treated as not found / not accessible + return result + + except requests.RequestException as e: + result["error"] = str(e) + return result + + def _pick_rocrate_file(self, files: list[dict]) -> dict | None: + """ + Return the first file dict whose name/path contains 'rocrate' (case-insensitive). + Preference order: + 1) files where exists_check.exists is True (if exists_check present) + 2) otherwise first match + """ + if not isinstance(files, list) or not files: + return None + + def fname(f: dict) -> str: + if not isinstance(f, dict): + return "" + return str(f.get("name") or f.get("path") or "").lower() + + # All matches by name/path + matches = [f for f in files if "rocrate" in fname(f)] + if not matches: + return None + + # Prefer verified existing ones if available + verified = [ + f for f in matches + if isinstance(f, dict) + and isinstance(f.get("exists_check"), dict) + and f["exists_check"].get("exists") is True + ] + return verified[0] if verified else matches[0] + + # ----------------------------- + # API operations + # ----------------------------- + def get_study_metadata(self, study_id): + """ + Extract metadata for a given BioStudies ID + + Args: + study_id (str): BioStudies accession ID (e.g., S-ONTX26) + + Returns: + dict: Parsed metadata or error information + """ + try: + # Validate study ID format + is_valid, verified_id, validation_error = self.validate_study_id(study_id) + if not is_valid: + return {"error": validation_error} + + url = self.studies_url + f"/{verified_id}" + + headers = { + "Accept": "application/json", + "User-Agent": "BioStudies-VHP4Safety-App/1.0", + } + + response = requests.get(url, headers=headers, timeout=30) + + if response.status_code == 200: + try: + data = response.json() + if not data: + return {"error": f"Empty response received for study {verified_id}"} + + # Parse metadata first, then build URL using the derived collection (no extra API calls) + md = self.parse_metadata(data) + collection = md.get("collection", "") + web_url = self.build_study_url(verified_id, collection).get("url", "") + return md | {"url": web_url} + + except json.JSONDecodeError as e: + return {"error": f"Invalid JSON response from BioStudies API: {str(e)}"} + + elif response.status_code == 404: + return { + "error": f"Study '{verified_id}' not found in BioStudies database. Please check the ID and try again." + } + elif response.status_code == 403: + return {"error": "Access forbidden. The study may be restricted or private."} + elif response.status_code == 500: + return {"error": "BioStudies server error. Please try again later."} + elif response.status_code == 503: + return {"error": "BioStudies service temporarily unavailable. Please try again later."} + else: + return {"error": f"BioStudies API returned status {response.status_code}. Please try again later."} + + except requests.exceptions.Timeout: + return {"error": "Request timed out. BioStudies server may be slow. Please try again."} + except requests.exceptions.ConnectionError: + return {"error": "Cannot connect to BioStudies server. 
Please check your internet connection."} + except requests.exceptions.RequestException as e: + return {"error": f"Network error: {str(e)}"} + except Exception as e: + return {"error": f"Unexpected error occurred: {str(e)}"} + + def get_study_collection(self, study_id): + """ + Extract collection for a given BioStudies ID + """ + metadata = self.get_study_metadata(study_id) + if "error" in metadata: + return metadata + collection = metadata.get("collection", "") + return {"accession": study_id, "collection": collection} + + def build_study_url(self, study_id, collection: str = ""): + """ + Build the URL to access the study in BioStudies web interface + """ + is_valid, verified_id, validation_error = self.validate_study_id(study_id) + if not is_valid: + return {"error": validation_error} + + if collection: + url = f"https://www.ebi.ac.uk/biostudies/{collection}/studies/{verified_id}" + else: + url = f"https://www.ebi.ac.uk/biostudies/studies/{verified_id}" + + return {"accession": verified_id, "url": url} + + # ----------------------------- + # Search / list + # ----------------------------- + def search_studies( + self, + query, + page=1, + page_size=10, + load_metadata: bool = True, + filters: tuple[tuple] | None = None, + ) -> dict: + """ + Search for studies in BioStudies database + """ + try: + if not query or not isinstance(query, str): + return {"error": "Search query must be a non-empty string."} + + filters_applied = bool(filters) + if filters_applied: + load_metadata = True + + params = {"query": query, "page": page, "pageSize": page_size} + + headers = { + "Accept": "application/json", + "User-Agent": "BioStudies-VHP4Safety-App/1.0", + } + + response = requests.get(self.search_url, headers=headers, params=params, timeout=30) + + if response.status_code == 200: + try: + data = response.json() + hits = data.get("hits", []) + total_hits = data.get("totalHits", 0) + + if not data or total_hits == 0: + return {"error": "No results found."} + + if load_metadata: + hits = self._hit_metadata(hits) + hits = self._hit_url(hits) + + if filters_applied: + hits = self._apply_filters(hits, filters) + + page_size_met = len(hits) >= page_size + pages_fetched = 1 + + if not page_size_met: + hits, page_size_met, pages_fetched = self._backfill_filtered_results( + hits, page, page_size, filters, query + ) + + return { + "totalHits": total_hits, + "hits": hits, + "hits_returned": len(hits), + "page": page, + "pageSize": page_size, + "pages_fetched": pages_fetched, + "filters_applied": True, + "page_size_met": page_size_met, + } + + return data | {"hits": hits, "total": total_hits} + + except json.JSONDecodeError as e: + return {"error": f"Invalid JSON response from BioStudies API: {str(e)}"} + + elif response.status_code == 400: + return {"error": "Bad request. Please check your search parameters."} + elif response.status_code == 403: + return {"error": "Access forbidden. The collection may be restricted."} + elif response.status_code == 500: + return {"error": "BioStudies server error. Please try again later."} + elif response.status_code == 503: + return {"error": "BioStudies service temporarily unavailable. Please try again later."} + else: + return {"error": f"BioStudies API returned status {response.status_code}. Please try again later."} + + except requests.exceptions.Timeout: + return {"error": "Request timed out. BioStudies server may be slow. Please try again."} + except requests.exceptions.ConnectionError: + return {"error": "Cannot connect to BioStudies server. 
Please check your internet connection."} + except requests.exceptions.RequestException as e: + return {"error": f"Network error: {str(e)}"} + except Exception as e: + return {"error": f"Unexpected error occurred: {str(e)}"} + + def list_studies( + self, + page=1, + page_size=50, + include_urls: bool = False, + load_metadata: bool = False, + filters: tuple[tuple] | None = None, + ) -> dict: + """ + List studies in the configured BioStudies collection for a specific page. + """ + filters_applied = bool(filters) + if filters_applied: + load_metadata = True + include_urls = True + + headers = { + "Accept": "application/json", + "User-Agent": "BioStudies-VHP4Safety-App/1.0", + } + params = {"page": page, "pageSize": page_size} + + try: + response = requests.get(self.search_url, headers=headers, params=params, timeout=30) + except requests.exceptions.RequestException as e: + return {"error": f"Network error during listing: {e}", "total": 0, "hits": []} + + if response.status_code != 200: + return { + "error": f"BioStudies API returned status {response.status_code} while listing.", + "total": 0, + "hits": [], + } + + try: + data = response.json() + except json.JSONDecodeError as e: + return {"error": f"Invalid JSON response from BioStudies API: {str(e)}", "total": 0, "hits": []} + + total_hits = data.get("totalHits") or data.get("total") or 0 + hits = data.get("hits", []) + + if include_urls: + hits = self._hit_url(hits) + if load_metadata: + hits = self._hit_metadata(hits) + + if filters_applied: + hits = self._apply_filters(hits, filters) + + page_size_met = len(hits) >= page_size + pages_fetched = 1 + + if not page_size_met: + hits, page_size_met, pages_fetched = self._backfill_filtered_results( + hits, page, page_size, filters, query=None + ) + + return { + "totalHits": total_hits, + "total": total_hits, + "hits": hits, + "hits_returned": len(hits), + "page": page, + "pageSize": page_size, + "pages_fetched": pages_fetched, + "filters_applied": True, + "page_size_met": page_size_met, + } + + return {"total": total_hits, "hits": hits} + + def _hit_url(self, hits: list) -> list: + for hit in hits: + acc = hit.get("accession") or hit.get("accno") + if acc: + hit["url"] = self.build_study_url(acc).get("url", "") + return hits + + def _hit_metadata(self, hits: list) -> list: + for hit in hits: + acc = hit.get("accession") or hit.get("accno") + if acc: + hit["metadata"] = self.get_study_metadata(acc) + return hits + + def _apply_filters(self, hits: list, filters: list[tuple]) -> list: + """ + Filter hits based on metadata field values (case-insensitive AND logic) + """ + if not filters: + return hits + + filtered = [] + for hit in hits: + metadata = hit.get("metadata", {}) + if not metadata: + continue + + matches_all = True + for field, value in filters: + field_value = str(metadata.get(field, "")).strip().lower() + filter_value = str(value).strip().lower() + if field_value != filter_value: + matches_all = False + break + + if matches_all: + filtered.append(hit) + + return filtered + + def _backfill_filtered_results( + self, + initial_hits: list, + page: int, + page_size: int, + filters: list[tuple], + query: str = None, + ) -> tuple: + """ + Backfill filtered results by fetching additional pages until page_size is met or timeout + """ + filtered = initial_hits[:] + current_page = page + start_time = time.time() + pages_fetched = 1 + + while len(filtered) < page_size: + if time.time() - start_time > 30: + break + + current_page += 1 + + try: + params = {"page": current_page, "pageSize": page_size} 
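+                # Fetch the next page with the same parameters (the query is
+                # re-attached below when searching); hits are re-enriched with
+                # metadata so the filters can be evaluated against them.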
+ headers = {"Accept": "application/json", "User-Agent": "BioStudies-VHP4Safety-App/1.0"} + + if query: + params["query"] = query + + response = requests.get(self.search_url, headers=headers, params=params, timeout=30) + if response.status_code != 200: + break + + data = response.json() + next_hits = data.get("hits", []) + if not next_hits: + break + + next_hits = self._hit_metadata(next_hits) + next_filtered = self._apply_filters(next_hits, filters) + filtered.extend(next_filtered) + pages_fetched += 1 + + except Exception: + break + + page_size_met = len(filtered) >= page_size + return filtered[:page_size], page_size_met, pages_fetched + + # ----------------------------- + # Metadata parsing (FIXED) + # ----------------------------- + def parse_metadata(self, raw_data: dict, *, validate_files: bool = True, file_timeout=(3.05, 10)): + """ + Parse and structure the metadata from BioStudies API response. + + FIX: + - Files are extracted ONLY here (enriched), not in _extract_comprehensive_metadata(). + This prevents duplicates and ensures consistent structure. + """ + try: + metadata = { + "accession": raw_data.get("accno", "N/A"), + "title": raw_data.get("title", "N/A"), + "description": raw_data.get("description", "N/A"), + "release_date": raw_data.get("rdate", raw_data.get("ReleaseDate", "N/A")), + "modification_date": raw_data.get("mdate", "N/A"), + "type": raw_data.get("type", "N/A"), + + # VHP4Safety filterable fields + "case_study": "", + "regulatory_question": "", + "flow_step": "", + "collection": "", + + "attributes": [], + "authors": [], + "files": [], + "links": [], + "protocols": [], + "publications": [], + "organizations": [], + + "biological_context": {}, + "technical_details": {}, + "experimental_design": {}, + + "raw_data": raw_data, + } + + # ---- helpers + def _norm_attr_name(attr: dict) -> str: + return (attr.get("name") or "").strip().lower() + + def _attr_value(attr: dict) -> str: + v = attr.get("value", "") + return "" if v is None else str(v) + + def _capture_vhp_fields(attr_name: str, attr_value: str): + if attr_name == "attachto": + metadata["collection"] = attr_value + elif attr_name == "case study": + metadata["case_study"] = attr_value + elif attr_name == "regulatory question": + metadata["regulatory_question"] = attr_value + elif attr_name == "process flow step": + metadata["flow_step"] = attr_value + + BIO_KEYS = { + "organism", "species", "organism part", "organ", "cell type", + "tissue", "disease", "disease state", "sample type", + } + TECH_KEYS = { + "platform", "instrument", "assay", "assay type", "library strategy", + "library source", "data type", "sequencing mode", "sequencing date", + "index adapters", "pipeline", + } + AUTHOR_KEYS = {"author", "authors", "contact", "submitter"} + + def _categorize(attr_name: str, attr_value: str): + if attr_name in BIO_KEYS: + metadata["biological_context"][attr_name] = attr_value + elif attr_name in TECH_KEYS: + metadata["technical_details"][attr_name] = attr_value + elif attr_name in AUTHOR_KEYS: + if attr_value and attr_value not in metadata["authors"]: + metadata["authors"].append(attr_value) + + def _file_attrs_map(fobj: dict) -> dict: + out = {} + for a in (fobj or {}).get("attributes", []) or []: + n = (a.get("name") or "").strip() + if n: + out[n] = a.get("value") + return out + + def _iter_section_files(sec: dict): + if not isinstance(sec, dict): + return + if isinstance(sec.get("files"), list): + for f in sec["files"]: + yield f + if isinstance(sec.get("subsections"), list): + for s in sec["subsections"]: + 
yield from _iter_section_files(s) + + seen_files = set() + + def _add_files(files_list): + if not isinstance(files_list, list): + return + + accno = metadata.get("accession") or raw_data.get("accno") or "N/A" + + for f in files_list: + if not isinstance(f, dict): + continue + + file_path = (f.get("path") or f.get("name") or f.get("filename") or "").strip() + if not file_path: + continue + + dedupe_key = f"{accno}::{file_path}" + if dedupe_key in seen_files: + continue + seen_files.add(dedupe_key) + + fam = _file_attrs_map(f) + url = self.build_biostudies_https_file_url(accno, file_path) + + entry = { + "name": file_path, + "path": file_path, + "size": f.get("size"), + "type": f.get("type"), + "description": fam.get("Description") or fam.get("description") or "", + "file_kind": fam.get("Type") or fam.get("type") or "", + "attributes": f.get("attributes", []), + "url": url, + "exists_check": None, + "raw": f, + } + + if validate_files and url: + entry["exists_check"] = self.url_exists_no_download(url, timeout=file_timeout) + + metadata["files"].append(entry) + + # ---- top-level attributes + if isinstance(raw_data.get("attributes"), list): + for attr in raw_data["attributes"]: + if not isinstance(attr, dict): + continue + name_raw = attr.get("name", "") + attr_name = _norm_attr_name(attr) + value = _attr_value(attr) + + metadata["attributes"].append({"name": name_raw, "value": value}) + _capture_vhp_fields(attr_name, value) + _categorize(attr_name, value) + + # ---- org lookup + organization_lookup = {} + if isinstance(raw_data.get("section"), dict): + self._build_organization_lookup(raw_data["section"], organization_lookup) + + # ---- section attributes + section = raw_data.get("section") if isinstance(raw_data.get("section"), dict) else None + if section and isinstance(section.get("attributes"), list): + for attr in section["attributes"]: + if not isinstance(attr, dict): + continue + name_raw = attr.get("name", "") + attr_name = _norm_attr_name(attr) + value = _attr_value(attr) + + if attr_name == "title" and (metadata["title"] == "N/A" or not metadata["title"]): + metadata["title"] = value + elif attr_name == "description" and (metadata["description"] == "N/A" or not metadata["description"]): + metadata["description"] = value + + _capture_vhp_fields(attr_name, value) + _categorize(attr_name, value) + metadata["attributes"].append({"name": name_raw, "value": value}) + + # ---- comprehensive extraction (NO FILES inside this anymore!) 
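+            # This pass gathers authors, protocols and experimental-design
+            # attributes recursively; file entries are added separately below
+            # so they are enriched and de-duplicated exactly once.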
+ if section: + self._extract_comprehensive_metadata(section, metadata, organization_lookup) + + # ---- files (enriched, deduped) + if section: + _add_files(list(_iter_section_files(section))) + if isinstance(raw_data.get("files"), list): + _add_files(raw_data["files"]) + + # ---- links + publications + def _add_links(links_list): + if not isinstance(links_list, list): + return + for link in links_list: + if not isinstance(link, dict): + continue + link_data = { + "url": link.get("url", ""), + "type": link.get("type", ""), + "description": link.get("description", ""), + "attributes": link.get("attributes", []), + } + metadata["links"].append(link_data) + + link_type = (link.get("type", "") or "").lower() + if ("doi" in link_type) or ("pubmed" in link_type) or ("publication" in link_type): + metadata["publications"].append(link_data) + + _add_links(raw_data.get("links")) + if section: + _add_links(section.get("links")) + + # pick ro-crate link from available files -> requires filename to contain "rocrate" + rocrate = self._pick_rocrate_file(metadata.get("files", [])) + metadata["rocrate_file"] = rocrate # full dict (name/path/url/size/exists_check...) + metadata["rocrate_url"] = rocrate.get("url") if isinstance(rocrate, dict) else None + + + return metadata + + except Exception as e: + return {"error": f"Failed to parse metadata: {str(e)}", "raw_data": raw_data} + + # ----------------------------- + # Organisation lookup / deep extraction + # ----------------------------- + def _build_organization_lookup(self, section, org_lookup): + """Build a lookup table for organization references""" + if isinstance(section, dict): + if section.get("type", "").lower() in ["organization", "organisation"]: + org_id = section.get("accno", "") + if org_id and "attributes" in section: + org_data = {} + for attr in section["attributes"]: + attr_name = (attr.get("name", "") or "").lower() + attr_value = attr.get("value", "") + if attr_name in ["name", "organization", "email", "address", "department", "affiliation"]: + org_data[attr_name] = attr_value + if org_data: + org_lookup[org_id] = org_data + + if "subsections" in section: + for subsection in section["subsections"]: + self._build_organization_lookup(subsection, org_lookup) + + elif isinstance(section, list): + for item in section: + self._build_organization_lookup(item, org_lookup) + + def _extract_comprehensive_metadata(self, section, metadata, organization_lookup=None): + """ + Comprehensively extract metadata from sections/subsections. + + IMPORTANT FIX: + - DO NOT append files here (to avoid duplicates). Files are handled in parse_metadata(). 
+ """ + if organization_lookup is None: + organization_lookup = {} + + if isinstance(section, dict): + # ---- protocols + if section.get("type", "").lower() == "protocols" or "protocol" in section.get("type", "").lower(): + if "subsections" in section: + for protocol in section["subsections"]: + protocol_data = { + "type": protocol.get("type", ""), + "description": protocol.get("description", ""), + "attributes": [], + } + + if "attributes" in protocol: + for attr in protocol["attributes"]: + protocol_data["attributes"].append( + {"name": attr.get("name", ""), "value": attr.get("value", "")} + ) + + metadata["protocols"].append(protocol_data) + + # ---- author and organization information + if section.get("type", "").lower() in ["author", "contact", "person"]: + if "attributes" in section: + author_info = {} + author_affiliation_ref = None + + for attr in section["attributes"]: + attr_name = (attr.get("name", "") or "").lower() + attr_value = attr.get("value", "") + + if attr_name in ["name", "first name", "last name", "email", "e-mail", "orcid"]: + author_info[attr_name] = attr_value + elif attr_name == "affiliation" and attr.get("reference"): + author_affiliation_ref = attr_value + + if author_info: + author_name = author_info.get("name", "") + if not author_name: + first = author_info.get("first name", "") + last = author_info.get("last name", "") + author_name = f"{first} {last}".strip() + + email = author_info.get("email") or author_info.get("e-mail", "") + orcid = author_info.get("orcid") or None + + author_entry = { + "name": author_name, + "email": email, + "orcid": orcid, + "affiliation_ref": author_affiliation_ref, + "affiliation_name": "", + } + + if author_affiliation_ref and author_affiliation_ref in organization_lookup: + resolved_org = organization_lookup[author_affiliation_ref] + author_entry["affiliation_name"] = resolved_org.get("name", "") + + if author_name: + existing_author = next( + (a for a in metadata.get("author_details", []) if a.get("name") == author_name), + None, + ) + if not existing_author: + metadata.setdefault("author_details", []).append(author_entry) + + if author_name not in metadata["authors"]: + metadata["authors"].append(author_name) + + # ---- experimental design info + if "attributes" in section: + for attr in section["attributes"]: + attr_name = (attr.get("name", "") or "").lower() + attr_value = attr.get("value", "") + + if attr_name in ["experimental factor", "variable", "treatment", "condition", "time point"]: + metadata["experimental_design"].setdefault("factors", []).append( + {"name": attr_name, "value": attr_value} + ) + + # ---- recurse + if "subsections" in section: + for subsection in section["subsections"]: + self._extract_comprehensive_metadata(subsection, metadata, organization_lookup) + + elif isinstance(section, list): + for item in section: + self._extract_comprehensive_metadata(item, metadata, organization_lookup) \ No newline at end of file diff --git a/src/models/data/mapping.py b/src/models/data/mapping.py new file mode 100644 index 0000000..b65fc69 --- /dev/null +++ b/src/models/data/mapping.py @@ -0,0 +1,526 @@ +from typing import Any, Dict, List, Optional, Tuple +import re + +# ---------- small helpers ---------- + +# Prefer literal "<>" in real code (not HTML-escaped < >) +DOI_RE = re.compile(r'\b10\.\d{4,9}/[^\s"<>]+', re.IGNORECASE) + +def is_valid_doi(doi: Optional[str]) -> bool: + """Basic DOI sanity check. 
Rejects obvious redactions like '***'.""" + if not doi or not isinstance(doi, str): + return False + d = doi.strip() + if "*" in d: # handles 10.5281/zenodo.*** etc. + return False + if not d.lower().startswith("10."): + return False + if "/" not in d: + return False + return True + +def g(d: Dict[str, Any], *path: str, default=None): + """Safe nested-get. Never raises KeyError.""" + cur: Any = d + for key in path: + if isinstance(cur, dict) and key in cur: + cur = cur[key] + else: + return default + return cur + +def first(*vals, default=None): + """Return first non-empty (not None, not '' , not []) value.""" + for v in vals: + if v is None: + continue + if v == "": + continue + if isinstance(v, (list, dict)) and len(v) == 0: + continue + return v + return default + +def find_attr(attrs: Any, name: str) -> Optional[str]: + """Find BioStudies attribute list entry with given name.""" + if not isinstance(attrs, list): + return None + for a in attrs: + if isinstance(a, dict) and a.get("name") == name: + return a.get("value") + return None + +def extract_doi_from_text(text: Any) -> Optional[str]: + """Extract a DOI from a string (or return None).""" + if not isinstance(text, str) or not text: + return None + m = DOI_RE.search(text) + doi = m.group(0) if m else None + return doi if is_valid_doi(doi) else None + +def extract_all_dois(text: Any) -> List[str]: + """Extract all valid DOIs from a string.""" + if not isinstance(text, str) or not text: + return [] + dois = [] + for m in DOI_RE.finditer(text): + d = m.group(0) + if is_valid_doi(d): + dois.append(d) + return dois + +def doi_url(doi: Optional[str]) -> Optional[str]: + """Convert DOI to https://doi.org/...""" + if not doi: + return None + d = doi.strip() + if d.lower().startswith("http"): + return d + return f"https://doi.org/{d}" + +# ---------- DOI + publications extraction ---------- + +def find_doi_anywhere(item: Dict[str, Any]) -> Optional[str]: + """ + Best-effort *dataset DOI* extractor. + NOTE: Intentionally does NOT search BioStudies raw_data publication subsections, + because those are *linked publications*, not dataset DOI. 
+ """ + # direct keys first (dataset DOI) + doi = first(item.get("doi"), g(item, "metadata", "doi")) + doi = extract_doi_from_text(doi) or doi + if is_valid_doi(doi): + return doi + + # Zenodo: related identifiers (sometimes contains dataset DOI, but usually pubs) + rel = g(item, "metadata", "related_identifiers", default=[]) or [] + if isinstance(rel, list): + for r in rel: + if not isinstance(r, dict): + continue + ident = r.get("identifier") + scheme = (r.get("scheme") or "").lower() + if scheme == "doi": + found = extract_doi_from_text(ident) or ident + if is_valid_doi(found): + return found + found = extract_doi_from_text(ident) + if found: + return found + + # BioStudies: attributes (dataset DOI if present) + attrs = g(item, "metadata", "attributes", default=[]) or [] + for key in ("DOI", "doi", "Dataset DOI"): + v = find_attr(attrs, key) + found = extract_doi_from_text(v) + if found: + return found + + # BioStudies: publications list (if present) - ambiguous; keep as last resort + pubs = g(item, "metadata", "publications", default=[]) or [] + if isinstance(pubs, list): + for p in pubs: + if not isinstance(p, dict): + continue + for cand in (p.get("doi"), p.get("identifier"), p.get("url")): + found = extract_doi_from_text(cand) + if found: + return found + + # last resort: description text + desc = first(g(item, "metadata", "description"), item.get("description")) + found = extract_doi_from_text(desc) + if found: + return found + + return None + +def _dedup_publications(pubs: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Deduplicate publications by DOI (preferred) or URL.""" + seen = set() + out = [] + for p in pubs: + doi = (p.get("doi") or "").lower().strip() + url = (p.get("url") or "").lower().strip() + key = doi or url + if not key: + continue + if key in seen: + continue + seen.add(key) + out.append(p) + return out + +def extract_publications_zenodo(z: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Extract linked publications from Zenodo record. + Sources: + - metadata.related_identifiers + - metadata.references (list of strings) + - DOIs embedded in metadata.description (optional, but useful) + """ + pubs: List[Dict[str, Any]] = [] + + dataset_doi = find_doi_anywhere(z) + concept_doi = first(z.get("conceptdoi"), g(z, "metadata", "conceptdoi")) + concept_doi = extract_doi_from_text(concept_doi) or concept_doi + if not is_valid_doi(concept_doi): + concept_doi = None + + rel = g(z, "metadata", "related_identifiers", default=[]) or [] + if isinstance(rel, list): + for r in rel: + if not isinstance(r, dict): + continue + ident = r.get("identifier") + scheme = (r.get("scheme") or "").lower() + relation = (r.get("relation") or "").lower() + rtype = (r.get("resource_type") or "").lower() + + # Heuristic: treat as publication if resource_type contains "publication" + # or relation indicates citation-like linkage. 
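+            # (The relation values compared here follow the DataCite
+            # relationType vocabulary; they were lower-cased above.)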
+ looks_like_pub = ( + "publication" in rtype + or relation in {"references", "iscitedby", "isreferencedby", "issupplementto", "isdocumentedby"} + ) + + if not looks_like_pub: + # still accept DOI-looking identifiers if they are clearly *not* Zenodo dataset DOIs + pass + + doi = None + url = None + + if scheme == "doi": + doi = extract_doi_from_text(ident) or (ident.strip() if isinstance(ident, str) else None) + if not is_valid_doi(doi): + doi = None + url = doi_url(doi) if doi else None + elif scheme == "url": + url = ident.strip() if isinstance(ident, str) else None + doi = extract_doi_from_text(url) + else: + # Unknown scheme: try DOI extraction + doi = extract_doi_from_text(ident) + url = doi_url(doi) if doi else (ident.strip() if isinstance(ident, str) else None) + + # Exclude dataset DOI / concept DOI if they appear + if doi and (doi == dataset_doi or doi == concept_doi): + continue + + if doi or url: + pubs.append({ + "doi": doi, + "doi_url": doi_url(doi) if doi else None, + "url": url, + "relation": relation or None, + "resource_type": r.get("resource_type"), + "source": "zenodo.related_identifiers", + }) + + refs = g(z, "metadata", "references", default=[]) or [] + if isinstance(refs, list): + for ref in refs: + doi = extract_doi_from_text(ref) + if doi and doi not in {dataset_doi, concept_doi}: + pubs.append({ + "doi": doi, + "doi_url": doi_url(doi), + "url": doi_url(doi), + "relation": "references", + "resource_type": "publication", + "source": "zenodo.references", + }) + + # Optional: mine description for DOI links (often present as doi.org/10.xxxx/...) + desc = g(z, "metadata", "description") + for doi in extract_all_dois(desc): + if doi not in {dataset_doi, concept_doi}: + pubs.append({ + "doi": doi, + "doi_url": doi_url(doi), + "url": doi_url(doi), + "relation": "mentions", + "resource_type": "publication", + "source": "zenodo.description", + }) + + return _dedup_publications(pubs) + +def extract_publications_biostudies(b: Dict[str, Any]) -> List[Dict[str, Any]]: + """ + Extract linked publications from BioStudies record. 
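+    Each returned dict mirrors the ``Publication`` model in
+    ``src/models/data/schemas.py`` (title, doi, doi_url, url, pmid, ...).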
+ Sources: + - metadata.publications (if present) + - metadata.raw_data.section.subsections entries of type 'Publication' + """ + pubs: List[Dict[str, Any]] = [] + meta = b.get("metadata", {}) or {} + + # 1) metadata.publications (sometimes already structured) + meta_pubs = meta.get("publications", []) or [] + if isinstance(meta_pubs, list): + for p in meta_pubs: + if isinstance(p, dict): + doi = extract_doi_from_text(first(p.get("doi"), p.get("identifier"), p.get("url"))) + url = first(p.get("url"), doi_url(doi)) + if doi or url: + pubs.append({ + "title": p.get("title"), + "doi": doi, + "doi_url": doi_url(doi) if doi else None, + "url": url, + "pmid": p.get("pmid") or p.get("PMID"), + "year": p.get("year") or p.get("Year"), + "authors": p.get("authors") or p.get("Authors"), + "source": "biostudies.metadata.publications", + }) + elif isinstance(p, str): + doi = extract_doi_from_text(p) + if doi: + pubs.append({ + "doi": doi, + "doi_url": doi_url(doi), + "url": doi_url(doi), + "source": "biostudies.metadata.publications", + }) + + # 2) raw_data.section.subsections: type == Publication + subs = g(b, "metadata", "raw_data", "section", "subsections", default=[]) or [] + if isinstance(subs, list): + for s in subs: + if not isinstance(s, dict): + continue + stype = str(s.get("type", "")).strip().lower() + if stype != "publication": + continue + + # flatten attributes into dict + attrs = s.get("attributes") or [] + flat: Dict[str, Any] = {} + if isinstance(attrs, list): + for a in attrs: + if isinstance(a, dict) and a.get("name"): + flat[a["name"]] = a.get("value") + + doi = extract_doi_from_text(flat.get("DOI") or flat.get("doi")) + pmid = flat.get("PMID") or flat.get("pmid") + title = flat.get("Title") or flat.get("title") + year = flat.get("Year") or flat.get("year") + authors = flat.get("Authors") or flat.get("Author") or flat.get("authors") + + url = doi_url(doi) if doi else None + + if doi or pmid or title: + pubs.append({ + "title": title, + "doi": doi, + "doi_url": doi_url(doi) if doi else None, + "url": url, + "pmid": pmid, + "year": year, + "authors": authors, + "journal": flat.get("Journal") or flat.get("journal"), + "volume": flat.get("Volume") or flat.get("volume"), + "issue": flat.get("Issue") or flat.get("issue"), + "type": flat.get("Type") or flat.get("type"), + "issn": flat.get("Issn") or flat.get("ISSN"), + "source": "biostudies.raw_data.section.subsections", + }) + + return _dedup_publications(pubs) + +# ---------- Zenodo normalizer ---------- + +def normalize_zenodo(z: Dict[str, Any]) -> Dict[str, Any]: + creators = g(z, "metadata", "creators", default=[]) or [] + grants = g(z, "metadata", "grants", default=[]) or [] + files = z.get("files", []) or [] + + doi = find_doi_anywhere(z) + if not is_valid_doi(doi): + doi = None + + publications = extract_publications_zenodo(z) + + return { + "title": first(g(z, "metadata", "title"), z.get("title")), + "description": first(g(z, "metadata", "description")), + "license": first(g(z, "metadata", "license", "id")), + "authors": [ + { + "name": c.get("name"), + "orcid": c.get("orcid"), + "affiliation": c.get("affiliation"), + } + for c in creators + if isinstance(c, dict) + ], + "funding": [ + { + "funder": g(gr, "funder", "name"), + "funder_doi": g(gr, "funder", "doi"), + "acronym": gr.get("acronym"), + "title": gr.get("title"), + "code": gr.get("code"), + "url": gr.get("url"), + } + for gr in grants + if isinstance(gr, dict) + ], + "ReleaseDate": first(g(z, "metadata", "publication_date"), z.get("created")), + "id": first(z.get("id"), 
z.get("recid")), + "type": first(g(z, "metadata", "resource_type", "type"), "dataset"), + "version": first(g(z, "metadata", "version")), + "files": [ + { + "name": f.get("key"), + "size": f.get("size"), + "checksum": f.get("checksum"), + "url": g(f, "links", "self"), + } + for f in files + if isinstance(f, dict) + ], + "url": first(z.get("url"), g(z, "links", "self_html"), g(z, "links", "self")), + + # dataset DOI + "doi": doi, + "doi_url": doi_url(doi), + + "conceptdoi": first(z.get("conceptdoi"), g(z, "metadata", "conceptdoi")), + "conceptdoi_url": doi_url(first(z.get("conceptdoi"), g(z, "metadata", "conceptdoi"))), + + # NEW: linked publications + "publications": publications, + } + +# ---------- BioStudies normalizer ---------- + +def normalize_biostudies(b: Dict[str, Any]) -> Dict[str, Any]: + meta = b.get("metadata", {}) or {} + attrs = meta.get("attributes", []) or [] + files = meta.get("files", []) or [] + + author_details = meta.get("author_details", []) or [] + authors = meta.get("authors", []) or [] + + if isinstance(author_details, list) and len(author_details) > 0: + authors_norm = [ + { + "name": a.get("name"), + "orcid": a.get("orcid"), + "affiliation": a.get("affiliation_name") or a.get("affiliation_ref"), + "email": a.get("email"), + } + for a in author_details + if isinstance(a, dict) + ] + else: + authors_norm = [ + {"name": name, "orcid": None, "affiliation": None} + for name in authors + if isinstance(name, str) + ] + + # funding best-effort (normalized) + funding: List[Dict[str, Any]] = [] + subsections = g(b, "metadata", "raw_data", "section", "subsections", default=[]) or [] + if isinstance(subsections, list): + for s in subsections: + if not isinstance(s, dict): + continue + if str(s.get("type", "")).strip().lower() != "funding": + continue + + flat = {} + for a in s.get("attributes") or []: + if isinstance(a, dict) and a.get("name"): + flat[a["name"]] = a.get("value") + + if not flat: + continue + + funder = first(flat.get("Funder"), flat.get("Agency"), flat.get("Funding agency"), flat.get("Agency name")) + code = first(flat.get("Grant_id"), flat.get("Grant ID"), flat.get("Grant"), flat.get("Grant number")) + url = first(flat.get("URL"), flat.get("Url"), flat.get("Project URL")) + + funding.append({ + "funder": funder, + "code": code, + "url": url, + "acronym": flat.get("Acronym") or flat.get("Programme") or flat.get("Program"), + "raw": flat, + "source": "biostudies.raw_data.section.subsections", + }) + + doi = find_doi_anywhere(b) + if not is_valid_doi(doi): + doi = None + + publications = extract_publications_biostudies(b) + + # ✅ files: PASS THROUGH URL only (no rebuilding) + files_norm: List[Dict[str, Any]] = [] + for f in files: + if not isinstance(f, dict): + continue + files_norm.append({ + "name": first(f.get("name"), f.get("path")), + "size": f.get("size"), + "path": f.get("path"), + "url": f.get("url"), # <-- do not rebuild + # optional (keep if useful) + "exists": g(f, "exists_check", "exists"), + "content_length": g(f, "exists_check", "content_length"), + }) + + # OPTIONAL strictness: attach warning if any url missing + missing = [x.get("path") for x in files_norm if x.get("path") and not x.get("url")] + if missing: + meta.setdefault("warnings", []).append( + f"{len(missing)} BioStudies file(s) missing url in metadata.files (pass-through mode)." 
+ ) + + return { + "title": first(meta.get("title"), b.get("title")), + "description": first(meta.get("description")), + "license": first(find_attr(attrs, "License")), + "authors": authors_norm, + "funding": funding, + "ReleaseDate": first( + b.get("release_date"), + find_attr(attrs, "ReleaseDate"), + find_attr(attrs, "Release Date"), + ), + "id": first(meta.get("accession"), b.get("accession"), b.get("id")), + "type": first(b.get("type"), meta.get("type"), "study"), + "version": first(meta.get("version")), + "files": files_norm, + "url": first(b.get("url")), + "doi": doi, + "doi_url": doi_url(doi), + "publications": publications, + } + +# ---------- combine ---------- + +def normalize_all( + bs_entries: List[Dict[str, Any]], + zenodo_entries: List[Dict[str, Any]], +) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """ + Adds 'norm_metadata' to each dict in both lists and returns a 2-tuple + (bs_entries, zenodo_entries) with 'norm_metadata' populated. + Robust: ignores non-dicts and missing lists. + """ + + for z in zenodo_entries or []: + if isinstance(z, dict): + z["norm_metadata"] = normalize_zenodo(z) + + for b in bs_entries or []: + if isinstance(b, dict): + b["norm_metadata"] = normalize_biostudies(b) + + return bs_entries, zenodo_entries \ No newline at end of file diff --git a/src/models/data/schemas.py b/src/models/data/schemas.py new file mode 100644 index 0000000..23dd05a --- /dev/null +++ b/src/models/data/schemas.py @@ -0,0 +1,245 @@ +"""Pydantic models for normalized dataset metadata (BioStudies & Zenodo).""" + +from __future__ import annotations + +from typing import Any, Optional + +from pydantic import BaseModel, Field + + +# ── Shared / reusable sub-models ────────────────────────────────────────── + + +class Author(BaseModel): + """Normalised author/creator.""" + + name: Optional[str] = None + orcid: Optional[str] = None + affiliation: Optional[str] = None + email: Optional[str] = None + + +class Funding(BaseModel): + """Normalised funding entry.""" + + funder: Optional[str] = None + funder_doi: Optional[str] = None + acronym: Optional[str] = None + title: Optional[str] = None + code: Optional[str] = None + url: Optional[str] = None + raw: Optional[dict[str, Any]] = None + source: Optional[str] = None + + +class DataFile(BaseModel): + """Normalised file entry (common to both sources).""" + + name: Optional[str] = None + path: Optional[str] = None + size: Optional[int] = None + checksum: Optional[str] = None + url: Optional[str] = None + exists: Optional[bool] = None + content_length: Optional[str] = None + + model_config = {"extra": "allow"} + + +class Publication(BaseModel): + """Linked publication extracted from a dataset record.""" + + title: Optional[str] = None + doi: Optional[str] = None + doi_url: Optional[str] = None + url: Optional[str] = None + pmid: Optional[str] = None + year: Optional[str] = None + authors: Optional[str] = None + journal: Optional[str] = None + volume: Optional[str] = None + issue: Optional[str] = None + type: Optional[str] = None + issn: Optional[str] = None + relation: Optional[str] = None + resource_type: Optional[str] = None + source: Optional[str] = None + + +# ── Top-level normalised metadata ───────────────────────────────────────── + + +class NormalizedMetadata(BaseModel): + """Unified normalised metadata for any dataset (Zenodo or BioStudies).""" + + title: Optional[str] = None + description: Optional[str] = None + license: Optional[str] = None + authors: list[Author] = Field(default_factory=list) + funding: list[Funding] = 
Field(default_factory=list) + ReleaseDate: Optional[str] = Field(None, alias="ReleaseDate") + id: Optional[str | int] = None + type: Optional[str] = None + version: Optional[str] = None + files: list[DataFile] = Field(default_factory=list) + url: Optional[str] = None + doi: Optional[str] = None + doi_url: Optional[str] = None + publications: list[Publication] = Field(default_factory=list) + + # Zenodo-specific + conceptdoi: Optional[str] = None + conceptdoi_url: Optional[str] = None + + model_config = {"populate_by_name": True, "extra": "allow"} + + +# ── BioStudies raw-metadata models ──────────────────────────────────────── + + +class Attribute(BaseModel): + name: str = "" + value: str = "" + + +class BiologicalContext(BaseModel): + model_config = {"extra": "allow"} + + +class TechnicalDetails(BaseModel): + model_config = {"extra": "allow"} + + +class ExperimentalDesign(BaseModel): + factors: list[dict[str, Any]] = Field(default_factory=list) + + model_config = {"extra": "allow"} + + +class ProtocolEntry(BaseModel): + type: str = "" + description: str = "" + attributes: list[Attribute] = Field(default_factory=list) + + +class LinkEntry(BaseModel): + url: str = "" + type: str = "" + description: str = "" + attributes: list[dict[str, Any]] = Field(default_factory=list) + + +class FileEntry(BaseModel): + """Rich file entry from BioStudies parse_metadata.""" + + name: str = "" + path: str = "" + size: Optional[int] = None + type: Optional[str] = None + description: str = "" + file_kind: str = "" + attributes: list[dict[str, Any]] = Field(default_factory=list) + url: Optional[str] = None + exists_check: Optional[dict[str, Any]] = None + raw: Optional[dict[str, Any]] = None + + +class AuthorDetail(BaseModel): + name: str = "" + email: str = "" + orcid: Optional[str] = None + affiliation_ref: Optional[str] = None + affiliation_name: str = "" + + +class BioStudiesParsedMetadata(BaseModel): + """Full structured metadata returned by BioStudiesExtractor.parse_metadata.""" + + accession: str = "N/A" + title: str = "N/A" + description: str = "N/A" + release_date: str = "N/A" + modification_date: str = "N/A" + type: str = "N/A" + + # VHP4Safety filterable fields + case_study: str = "" + regulatory_question: str = "" + flow_step: str = "" + collection: str = "" + + attributes: list[Attribute] = Field(default_factory=list) + authors: list[str] = Field(default_factory=list) + author_details: list[AuthorDetail] = Field(default_factory=list) + files: list[FileEntry] = Field(default_factory=list) + links: list[LinkEntry] = Field(default_factory=list) + protocols: list[ProtocolEntry] = Field(default_factory=list) + publications: list[LinkEntry] = Field(default_factory=list) + organizations: list[dict[str, Any]] = Field(default_factory=list) + + biological_context: BiologicalContext = Field(default_factory=BiologicalContext) + technical_details: TechnicalDetails = Field(default_factory=TechnicalDetails) + experimental_design: ExperimentalDesign = Field(default_factory=ExperimentalDesign) + + rocrate_file: Optional[dict[str, Any]] = None + rocrate_url: Optional[str] = None + + url: str = "" + raw_data: Optional[dict[str, Any]] = None + + model_config = {"extra": "allow"} + + +# ── Zenodo parsed-metadata model ────────────────────────────────────────── + + +class ZenodoFileEntry(BaseModel): + id: Optional[str] = None + key: Optional[str] = None + size: Optional[int] = None + checksum: Optional[str] = None + links: dict[str, Any] = Field(default_factory=dict) + + +class ZenodoParsedMetadata(BaseModel): + """Full 
structured metadata returned by ZenodoExtractor.parse_metadata.""" + + id: Optional[int | str] = None + recid: Optional[int | str] = None + doi: Optional[str] = None + doi_url: Optional[str] = None + title: str = "N/A" + description: str = "N/A" + publication_date: str = "N/A" + access_right: Optional[str] = None + creators: list[dict[str, Any]] = Field(default_factory=list) + keywords: list[str] = Field(default_factory=list) + resource_type: dict[str, Any] = Field(default_factory=dict) + license: dict[str, Any] = Field(default_factory=dict) + grants: list[dict[str, Any]] = Field(default_factory=list) + communities: list[dict[str, Any]] = Field(default_factory=list) + related_identifiers: list[dict[str, Any]] = Field(default_factory=list) + files: list[ZenodoFileEntry] = Field(default_factory=list) + links: dict[str, Any] = Field(default_factory=dict) + stats: dict[str, Any] = Field(default_factory=dict) + is_rocrate: bool = False + + url: str = "" + raw: Optional[dict[str, Any]] = None + + model_config = {"extra": "allow"} + + +# ── URL-existence check result ──────────────────────────────────────────── + + +class UrlExistsResult(BaseModel): + """Result of a HEAD / Range probe to check file existence.""" + + url: Optional[str] = None + exists: bool = False + status_code: Optional[int] = None + content_length: Optional[str] = None + final_url: Optional[str] = None + error: Optional[str] = None + method: Optional[str] = None diff --git a/src/models/data/zenodo.py b/src/models/data/zenodo.py new file mode 100644 index 0000000..17f0820 --- /dev/null +++ b/src/models/data/zenodo.py @@ -0,0 +1,484 @@ +from __future__ import annotations + +import json +import re +import time +from typing import Any + +import requests + + +class ZenodoExtractor: + """Extractor for interacting with the Zenodo Records API. + + Defaults to the 'vhp4safety' community and the 'dataset' record type. + An optional access_token may be provided for higher rate limits or + for access to restricted records. + """ + + def __init__( + self, + access_token: str | None = None, + community: str = "vhp4safety", + record_type: str = "dataset", + base_url: str = "https://zenodo.org/api/records", + ) -> None: + self.base_url = base_url + self.community = community + self.record_type = record_type + self.session = requests.Session() + self.headers = { + "Accept": "application/json", + "User-Agent": "Zenodo-VHP4Safety-App/1.0", + } + if access_token: + # Use Authorization header when token is provided + self.headers["Authorization"] = f"Bearer {access_token}" + + def validate_record_id(self, record_id: Any) -> tuple[bool, Any, str | None]: + """Validate a Zenodo record identifier. + + Accepts numeric recid (int or numeric string) or DOI (10.xxxx/...).
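+
+        Examples (illustrative values, derived from the rules below):
+            1234567                -> (True, 1234567, None)
+            "10.5281/zenodo.42"    -> (True, "10.5281/zenodo.42", None)
+            "not-an-id"            -> (False, "not-an-id", error message)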
+ + Returns: + (is_valid, normalized_id, error_message) + """ + if record_id is None: + return False, None, "Record ID is required" + + # numeric recid + try: + if isinstance(record_id, int): + return True, record_id, None + if isinstance(record_id, str) and record_id.isdigit(): + return True, int(record_id), None + except Exception: + pass + + # DOI pattern + if isinstance(record_id, str): + # strip DOI url wrapper + candidate = record_id.strip() + # DOI url like https://doi.org/10.5281/zenodo.1234 + if candidate.startswith("http") and "doi.org" in candidate: + candidate = candidate.split("doi.org/", 1)[-1] + + doi_regex = r"^10\.\d{4,9}/[-._;()/:A-Z0-9]+$" + if re.match(doi_regex, candidate, flags=re.IGNORECASE): + return True, candidate, None + + return ( + False, + record_id, + "Invalid Zenodo record identifier (expect recid or DOI)", + ) + + def build_record_url(self, record_id: Any) -> dict[str, Any]: + """Build a public URL for a record identifier (recid or DOI).""" + is_valid, normalized, error = self.validate_record_id(record_id) + if not is_valid: + return {"error": error} + + if isinstance(normalized, int): + url = f"https://zenodo.org/records/{normalized}" + else: + # DOI string + url = f"https://doi.org/{normalized}" + + return {"id": normalized, "url": url} + + def get_record_metadata(self, record_id: Any) -> dict[str, Any]: + """Retrieve and normalize metadata for a single record. + + If record_id is a DOI string, perform a search for that DOI and + return the first match's parsed metadata. + """ + try: + is_valid, normalized, validation_error = self.validate_record_id(record_id) + if not is_valid: + return {"error": validation_error} + + # If numeric recid, retrieve directly + if isinstance(normalized, int): + url = f"{self.base_url}/{normalized}" + resp = self.session.get(url, headers=self.headers, timeout=30) + if resp.status_code == 200: + try: + data = resp.json() + parsed = self.parse_metadata(data) + parsed_url = self.build_record_url(normalized).get("url", "") + return parsed | {"url": parsed_url} + except json.JSONDecodeError as e: + return {"error": f"Invalid JSON response from Zenodo API: {e}"} + elif resp.status_code == 404: + return {"error": f"Record '{normalized}' not found."} + else: + return {"error": f"Zenodo API returned status {resp.status_code}."} + + # DOI case: search for DOI + doi = normalized + query = f'doi:"{doi}"' + search = self.search_records( + query=query, page=1, size=1, load_metadata=True + ) + if "error" in search: + return search + hits = search.get("hits", []) + if not hits: + return {"error": f"Record with DOI '{doi}' not found."} + # return parsed metadata from first hit + first = hits[0] + # parsed metadata may be under 'parsed_metadata' or 'metadata' + parsed = first.get("parsed_metadata") or first.get("metadata") + parsed_url = self.build_record_url( + first.get("recid") or first.get("id") or doi + ).get( + "url", + "", + ) + return parsed | {"url": parsed_url} + + except requests.exceptions.Timeout: + return {"error": "Request timed out. Zenodo server may be slow."} + except requests.exceptions.ConnectionError: + return { + "error": "Cannot connect to Zenodo server. Check your internet connection." 
+ } + except requests.exceptions.RequestException as e: + return {"error": f"Network error: {e}"} + except Exception as e: + return {"error": f"Unexpected error: {e}"} + + def search_records( + self, + query: str = "", + page: int = 1, + size: int = 25, + load_metadata: bool = True, + filters: tuple[tuple[str, str]] | None= None, + ) -> dict[str, Any]: + """Search Zenodo records. + + Defaults to the configured community and record_type. + """ + try: + if not isinstance(query, str): + return {"error": "Query must be a string."} + + # If filters are provided, ensure metadata is loaded + filters_applied = bool(filters) + if filters_applied: + load_metadata = True + + params = { + "q": query, + "page": page, + "size": size, + "communities": self.community, + "type": self.record_type, + } + + resp = self.session.get( + self.base_url, headers=self.headers, params=params, timeout=30 + ) + if resp.status_code == 200: + try: + data = resp.json() + except json.JSONDecodeError as e: + return {"error": f"Invalid JSON response from Zenodo API: {e}"} + + hits = ( + data.get("hits", {}).get("hits", []) + if isinstance(data.get("hits"), dict) + else data.get("hits", []) + ) + total = ( + data.get("hits", {}).get("total") + if isinstance(data.get("hits"), dict) + else data.get("total", 0) + ) + + if not data or (isinstance(total, int) and total == 0): + return {"error": "No results found.", "hits": []} + + if load_metadata: + hits = self._hit_metadata(hits) + + hits = self._hit_url(hits) + + if filters_applied: + hits = self._apply_filters(hits, filters) + + page_size_met = len(hits) >= size + pages_fetched = 1 + if not page_size_met: + hits, page_size_met, pages_fetched = ( + self._backfill_filtered_results( + hits, page, size, filters, query + ) + ) + + return { + "totalHits": total, + "hits": hits, + "hits_returned": len(hits), + "page": page, + "pageSize": size, + "pages_fetched": pages_fetched, + "filters_applied": True, + "page_size_met": page_size_met, + } + + return {"total": total, "hits": hits} + + elif resp.status_code == 400: + return {"error": "Bad request. Check your search parameters."} + elif resp.status_code == 403: + return { + "error": "Access forbidden. Community or collection may be restricted." + } + elif resp.status_code in (500, 503): + return {"error": "Zenodo server error. Please try again later."} + else: + return {"error": f"Zenodo API returned status {resp.status_code}."} + + except requests.exceptions.Timeout: + return {"error": "Request timed out. Zenodo server may be slow."} + except requests.exceptions.ConnectionError: + return { + "error": "Cannot connect to Zenodo server. Check your internet connection." 
+ } + except requests.exceptions.RequestException as e: + return {"error": f"Network error: {e}"} + except Exception as e: + return {"error": f"Unexpected error: {e}"} + + def list_records( + self, + page: int = 1, + size: int = 25, + include_urls: bool = False, + load_metadata: bool = False, + filters: tuple[tuple[str, str]]|None = None, + ) -> dict[str, Any]: + """list records for the configured community/type (wrapper for search_records).""" + # If filters provided, require metadata and URLs + if filters: + load_metadata = True + include_urls = True + + result = self.search_records( + query="", page=page, size=size, load_metadata=load_metadata, filters=filters + ) + + if include_urls and "hits" in result: + result["hits"] = self._hit_url(result["hits"]) + + return result + + def _hit_url(self, hits: list[dict[str, Any]]) -> list[dict[str, Any]]: + for hit in hits: + # try recid present in different keys + recid = ( + hit.get("recid") + or hit.get("id") + or (hit.get("metadata", {}).get("doi") if hit.get("metadata") else None) + ) + if recid: + try: + recid_int = int(recid) + hit["url"] = self.build_record_url(recid_int).get("url", "") + except Exception: + # fallback to DOI url + doi = ( + hit.get("metadata", {}).get("doi") + if hit.get("metadata") + else None + ) + if doi: + hit["url"] = self.build_record_url(doi).get("url", "") + return hits + + def _hit_metadata(self, hits: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Attach parsed metadata to each hit as 'parsed_metadata'.""" + for hit in hits: + try: + # some hits already include top-level fields, but parse consistently + parsed = self.parse_metadata(hit) + # preserve both raw and parsed + hit["parsed_metadata"] = parsed + except Exception: + hit["parsed_metadata"] = {} + return hits + + def _apply_filters( + self, hits: list[dict[str, Any]], filters: tuple[tuple[str, str]]|None + ) -> list[dict[str, Any]]: + """Apply AND-filters to hits using parsed metadata when available. + + Field matching is case-insensitive. For list fields (keywords, creators, + communities) we match if any element contains the filter value. + """ + if not filters: + return hits + + filtered: list[dict[str, Any]] = [] + for hit in hits: + metadata = hit.get("parsed_metadata") or hit.get("metadata") or {} + if not metadata: + continue + + matches_all = True + for field, value in filters: + filter_value = value.lower() + field_value = metadata.get(field, "") + + if isinstance(field_value, list): + # normalize list values to strings + found = False + for item in field_value: + # item may be dict (e.g., creators) + if isinstance(item, dict): + # try to match on common text fields + text = " ".join( + str(v) for v in item.values() if isinstance(v, str) + ) + else: + text = str(item) + if filter_value in text.lower(): + found = True + break + if not found: + matches_all = False + break + + else: + if not isinstance(field_value, str): + field_value = str(field_value) + if ( + filter_value != field_value.lower() + and filter_value not in field_value.lower() + ): + matches_all = False + break + + if matches_all: + filtered.append(hit) + + return filtered + + def _backfill_filtered_results( + self, + initial_hits: list[dict[str, Any]], + page: int, + page_size: int, + filters: tuple[tuple[str, str]]|None, + query: None | str = None, + ) -> tuple[list[dict[str, Any]], bool, int]: + """Fetch subsequent pages until page_size filtered results are collected or timeout. + + Returns (filtered_hits_trimmed, page_size_met, pages_fetched). 
+ """ + filtered = initial_hits[:] + current_page = page + start_time = time.time() + pages_fetched = 1 + + while len(filtered) < page_size: + if time.time() - start_time > 30: + break + + current_page += 1 + try: + params = { + "q": query or "", + "page": current_page, + "size": page_size, + "communities": self.community, + "type": self.record_type, + } + resp = self.session.get( + self.base_url, headers=self.headers, params=params, timeout=30 + ) + if resp.status_code != 200: + break + data = resp.json() + next_hits = ( + data.get("hits", {}).get("hits", []) + if isinstance(data.get("hits"), dict) + else data.get("hits", []) + ) + if not next_hits: + break + + next_hits = self._hit_metadata(next_hits) + next_hits = self._hit_url(next_hits) + next_filtered = self._apply_filters(next_hits, filters) + filtered.extend(next_filtered) + pages_fetched += 1 + + except Exception: + break + + page_size_met = len(filtered) >= page_size + return filtered[:page_size], page_size_met, pages_fetched + + def parse_metadata(self, raw_record: dict[str, Any]) -> dict[str, Any]: + """Normalize Zenodo record structure into a simpler metadata dict. + + Accepts either a full record returned from /api/records/:id or a hit + element from a search response. + """ + try: + # Zenodo typically nests useful fields under 'metadata' + raw = raw_record.get("metadata", raw_record) + + metadata: dict[str, Any] = { + "id": raw_record.get("id") + or raw_record.get("recid") + or raw.get("recid"), + "recid": raw_record.get("recid") or raw_record.get("id"), + "doi": raw.get("doi"), + "doi_url": raw_record.get("doi_url") or raw.get("doi_url"), + "title": raw.get("title", "N/A"), + "description": raw.get("description", "N/A"), + "publication_date": raw.get( + "publication_date", raw.get("publication_date", "N/A") + ), + "access_right": raw.get("access_right"), + "creators": raw.get("creators", []), + "keywords": raw.get("keywords", []), + "resource_type": raw.get("resource_type", {}), + "license": raw.get("license", {}), + "grants": raw.get("grants", []), + "communities": raw.get("communities", []), + "related_identifiers": raw.get( + "related_identifiers", raw.get("related_identifiers", []) + ), + "files": [], + "links": raw_record.get("links", {}), + "stats": raw_record.get("stats", {}), + "raw": raw_record, + } + + # Extract files if available at top-level or under raw + files = raw_record.get("files") or raw.get("files") or [] + is_rocrate = False + for f in files: + if f.get("key", "").lower() == "rocrate-metadata.json": + is_rocrate = True + metadata["files"].append( + { + "id": f.get("id"), + "key": f.get("key") or f.get("name"), + "size": f.get("size"), + "checksum": f.get("checksum"), + "links": f.get("links", {}), + } + ) + metadata["is_rocrate"] = is_rocrate + + return metadata + + except Exception as e: + return {"error": f"Failed to parse metadata: {e}", "raw": raw_record} diff --git a/src/models/platform.py b/src/models/platform.py new file mode 100644 index 0000000..4438abe --- /dev/null +++ b/src/models/platform.py @@ -0,0 +1,56 @@ +"""Pydantic models for VHP4Safety platform configuration and domain objects.""" + +from __future__ import annotations + +from typing import Optional + +from pydantic import BaseModel, Field + + +class RegulatoryQuestion(BaseModel): + """A regulatory question tied to a case study.""" + + key: str = Field(description="Internal key, e.g. 
reg_q_1a") + label: str + explanation: str + case_study: Optional[str] = None + + +class StageExplanation(BaseModel): + """Safety-assessment workflow stage with a short explanation.""" + + name: str + explanation: str + + +class CompoundProperty(BaseModel): + """Single property row returned by a SPARQL compound query.""" + + property_label: str = "" + value: str = "" + units_label: Optional[str] = None + formatter_url: Optional[str] = None + source: Optional[str] = None + doi: Optional[str] = None + see_also: Optional[str] = None + + +class CompoundSummary(BaseModel): + """Core identifiers for a compound from CompoundCloud.""" + + wcid: str + label: str + inchi: str = "" + inchikey: str = "" + smiles: str = Field("", alias="SMILES") + formula: str = "" + mass: str = "" + + model_config = {"populate_by_name": True} + + +class GlossaryStageMapping(BaseModel): + """Maps a glossary URL to a human-readable stage name.""" + + glossary_url: str + stage_name: str diff --git a/src/scheduler.py b/src/scheduler.py new file mode 100644 index 0000000..e5e5654 --- /dev/null +++ b/src/scheduler.py @@ -0,0 +1,61 @@ +""" +Nightly background job that re-seeds the database from upstream GitHub sources. + +Uses APScheduler's BackgroundScheduler so it runs inside the same Flask / +SQLite process — no external cron or second container needed. +""" + +import logging +import os + +from apscheduler.schedulers.background import BackgroundScheduler +from apscheduler.triggers.cron import CronTrigger + +log = logging.getLogger(__name__) + +_scheduler: BackgroundScheduler | None = None + + +def _reseed_job() -> None: + """Drop + re-seed all tables from upstream YAML indexes.""" + from src.seed import seed_all # late import to avoid circular deps + log.info("⏳ Nightly re-seed started …") + try: + seed_all() + log.info("✅ Nightly re-seed complete") + except Exception: + log.exception("❌ Nightly re-seed failed") + + +def init_scheduler(app=None) -> BackgroundScheduler: + """ + Start (or return) the background scheduler. + + Environment knobs (all optional): + RESEED_HOUR – hour to run (0-23, default 3) + RESEED_MINUTE – minute to run (0-59, default 0) + RESEED_ENABLED – set to "false" to disable entirely + """ + global _scheduler + if _scheduler is not None: + return _scheduler + + enabled = os.environ.get("RESEED_ENABLED", "true").lower() + if enabled == "false": + log.info("🔕 Nightly re-seed disabled (RESEED_ENABLED=false)") + return None + + hour = int(os.environ.get("RESEED_HOUR", "3")) + minute = int(os.environ.get("RESEED_MINUTE", "0")) + + _scheduler = BackgroundScheduler(daemon=True) + _scheduler.add_job( + _reseed_job, + trigger=CronTrigger(hour=hour, minute=minute), + id="nightly_reseed", + name="Re-seed DB from upstream", + replace_existing=True, + ) + _scheduler.start() + log.info("🕐 Nightly re-seed scheduled at %02d:%02d UTC", hour, minute) + return _scheduler diff --git a/src/seed.py b/src/seed.py new file mode 100644 index 0000000..8ab0032 --- /dev/null +++ b/src/seed.py @@ -0,0 +1,279 @@ +"""Seed the database from upstream GitHub JSON indexes. + +Run: python -m src.seed +Idempotent — uses INSERT OR REPLACE (upsert). 
+""" + +from __future__ import annotations + +import json +import os +import sys +from datetime import datetime, timezone + +import requests + +from src.db import get_conn, init_db + +SERVICES_URL = os.environ.get( + "SERVICES_URL", + "https://raw.githubusercontent.com/VHP4Safety/cloud" + "/refs/heads/main/cap/service_index.json", +) +METHODS_URL = os.environ.get( + "METHODS_URL", + "https://raw.githubusercontent.com/VHP4Safety/cloud" + "/refs/heads/main/cap/methods_index.json", +) + +# ── Static reference data ──────────────────────────────────────────────── + +REG_QUESTIONS = { + "reg_q_1a": { + "label": "Kidney Case Study (a)", + "explanation": "What is the safe cisplatin dose in cancer patients?", + }, + "reg_q_1b": { + "label": "Kidney Case Study (b)", + "explanation": ( + "What is the intrinsic hazard of tacrolimus " + "for nephrotoxicity?" + ), + }, + "reg_q_2a": { + "label": "Parkinson Case Study (a)", + "explanation": "Can compound Dinoseb cause Parkinson's Disease?", + }, + "reg_q_2b": { + "label": "Parkinson Case Study (b)", + "explanation": ( + "What level of exposure to compound Dinoseb leads to " + "risk for developing Parkinson's disease?" + ), + }, + "reg_q_3a": { + "label": "Thyroid Case Study (a)", + "explanation": ( + "What information about silychristin do we need to give " + "an advice to women in their early pregnancy to decide " + "whether the substance can be used?" + ), + }, + "reg_q_3b": { + "label": "Thyroid Case Study (b)", + "explanation": ( + "Does silychristin influence the thyroid-mediated brain " + "development in the fetus resulting in cognitive " + "impairment in children?" + ), + }, +} + +STAGE_EXPLANATIONS = { + "ADME": ( + "Absorption, distribution, metabolism, and excretion of a " + "substance in a living organism, following exposure." + ), + "Hazard Assessment": ( + "The process of assessing the intrinsic hazard a substance " + "poses to human health and/or the environment." + ), + "Chemical Information": ( + "Information about chemical properties and identity." + ), + "General": "Not specific to a flow step.", + "(External) exposure": "External exposure assessment.", + "Generic": "Generic category.", + "Other": "Other or unknown category.", +} + +GLOSSARY_STAGE_MAPPINGS = { + "https://vhp4safety.github.io/glossary#VHP0000056": "ADME", + "https://vhp4safety.github.io/glossary#VHP0000102": "Hazard Assessment", + "https://vhp4safety.github.io/glossary#VHP0000148": "Chemical Information", + "https://vhp4safety.github.io/glossary#VHP0000149": "General", +} + +CASE_STUDIES = [ + { + "slug": "kidney", + "title": "Kidney case study", + "description": "To study kidney disease and pharmacovigilance.", + "image_src": "/static/images/image43_hexagon.svg", + "image_alt": "Kidney case study", + }, + { + "slug": "parkinson", + "title": "Parkinson case study", + "description": ( + "To study life course pesticide exposure and " + "neurodegenerative disease." + ), + "image_src": "/static/images/image45_hexagon.svg", + "image_alt": "Parkinson case study", + }, + { + "slug": "thyroid", + "title": "Thyroid case study", + "description": ( + "To study health effects discriminated by age and sex on " + "thyroid-mediated neurodevelopment." 
+ ), + "image_src": "/static/images/image47_hexagon.svg", + "image_alt": "Thyroid case study", + }, +] + +CASESTUDY_CONTENT_URL = ( + "https://raw.githubusercontent.com/" + "VHP4Safety/ui-casestudy-config/main/{slug}_content.json" +) + + +def _bool_flag(val): + if val is None or val == "": + return None + return 1 if str(val).strip().lower() == "true" else 0 + + +def _now(): + return datetime.now(timezone.utc).isoformat() + + +def seed_reference_data(conn) -> None: + for key, data in REG_QUESTIONS.items(): + conn.execute( + "INSERT OR REPLACE INTO regulatory_questions (key, label, explanation) VALUES (?, ?, ?)", + (key, data["label"], data["explanation"]), + ) + for name, explanation in STAGE_EXPLANATIONS.items(): + conn.execute( + "INSERT OR REPLACE INTO stage_explanations (name, explanation) VALUES (?, ?)", + (name, explanation), + ) + for url, stage in GLOSSARY_STAGE_MAPPINGS.items(): + conn.execute( + "INSERT OR REPLACE INTO glossary_stage_mappings (glossary_url, stage_name) VALUES (?, ?)", + (url, stage), + ) + for cs in CASE_STUDIES: + content_json = None + try: + url = CASESTUDY_CONTENT_URL.format(slug=cs["slug"]) + resp = requests.get(url, timeout=15) + resp.raise_for_status() + content_json = resp.text + print(f" ok fetched {cs['slug']}_content.json") + except Exception as exc: + print(f" x could not fetch {cs['slug']}: {exc}") + conn.execute( + """INSERT OR REPLACE INTO case_studies + (slug, title, description, image_src, image_alt, content_json) + VALUES (?, ?, ?, ?, ?, ?)""", + (cs["slug"], cs["title"], cs["description"], + cs.get("image_src"), cs.get("image_alt"), content_json), + ) + conn.commit() + print("ok reference data seeded") + + +def seed_tools(conn) -> None: + resp = requests.get(SERVICES_URL, timeout=15) + resp.raise_for_status() + data = resp.json() + + # Build glossary lookup + cur = conn.execute("SELECT glossary_url, stage_name FROM glossary_stage_mappings") + glossary = {r["glossary_url"]: r["stage_name"] for r in cur} + + now = _now() + for tool_id, raw in data.items(): + stage = raw.get("stage", "") + stage = glossary.get(stage, stage) + if stage in ("NA", "Unknown"): + stage = "Other" + + conn.execute( + """INSERT OR REPLACE INTO tools + (id, service, description, stage, html_name, md_file_name, + png_file_name, main_url, inst_url, + reg_q_1a, reg_q_1b, reg_q_2a, reg_q_2b, reg_q_3a, reg_q_3b, + login, api_type, casestudy, provider, provider_email, + citation, version, license, sourcecode, docker, + bio_tools, tess, raw_json, updated_at) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + (tool_id, raw.get("service", tool_id), raw.get("description"), + stage, raw.get("html_name"), raw.get("md_file_name"), + raw.get("png_file_name"), raw.get("main_url"), + raw.get("inst_url") or None, + _bool_flag(raw.get("reg_q_1a")), _bool_flag(raw.get("reg_q_1b")), + _bool_flag(raw.get("reg_q_2a")), _bool_flag(raw.get("reg_q_2b")), + _bool_flag(raw.get("reg_q_3a")), _bool_flag(raw.get("reg_q_3b")), + raw.get("login"), raw.get("api"), raw.get("casestudy"), + raw.get("provider"), raw.get("provider-email"), + raw.get("citation"), raw.get("version"), raw.get("license"), + raw.get("sourcecode"), raw.get("docker"), + raw.get("bioTools"), raw.get("tess"), + json.dumps(raw), now), + ) + conn.commit() + print(f"ok {len(data)} tools seeded") + + +def seed_methods(conn) -> None: + resp = requests.get(METHODS_URL, timeout=15) + resp.raise_for_status() + data = resp.json() + + now = _now() + for method_id, raw in data.items(): + conn.execute( + """INSERT OR REPLACE 
INTO methods + (id, method, issue_number, description, stage, substage, + catalog_webpage_url, case_study, regulatory_question, + reg_q_1a, reg_q_1b, reg_q_2a, reg_q_2b, reg_q_3a, reg_q_3b, + data_producer, sop, vendor, catalog_number, citation, + type_iri, ontology, key_event_id, aop_id, + raw_json, updated_at) + VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)""", + (method_id, + raw.get("method") or raw.get("method_name_content", method_id), + raw.get("issue_number"), + raw.get("method_description_content"), + raw.get("vhp4safety_workflow_stage_content"), + raw.get("workflow_substage_content"), + raw.get("catalog_webpage_url"), + raw.get("case_study_content"), + raw.get("regulatory_question_content"), + _bool_flag(raw.get("reg_q_1a")), _bool_flag(raw.get("reg_q_1b")), + _bool_flag(raw.get("reg_q_2a")), _bool_flag(raw.get("reg_q_2b")), + _bool_flag(raw.get("reg_q_3a")), _bool_flag(raw.get("reg_q_3b")), + raw.get("data_producer_content"), + raw.get("available_sop_or_protocol_content"), + raw.get("vendor_content"), + raw.get("catalog_number_content"), + raw.get("citation_content"), + raw.get("ontology_term_content"), + raw.get("type_content"), + raw.get("relevant_aop_wiki_key_event(s)_to_the_assay_content"), + raw.get("relevant_aop_wiki_adverse_outcome_pathway(s)_to_the_assay_content"), + json.dumps(raw), now), + ) + conn.commit() + print(f"ok {len(data)} methods seeded") + + +def seed_all() -> None: + init_db() + conn = get_conn() + try: + seed_reference_data(conn) + seed_tools(conn) + seed_methods(conn) + print("ok seeding complete") + finally: + conn.close() + + +if __name__ == "__main__": + seed_all() diff --git a/src/services/__init__.py b/src/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/services/compound.py b/src/services/compound.py new file mode 100644 index 0000000..cce8323 --- /dev/null +++ b/src/services/compound.py @@ -0,0 +1,204 @@ +"""Compound data service — encapsulates all CompoundCloud SPARQL queries. + +All SPARQL logic is centralised here; Flask routes just call these +functions and get back typed Pydantic models or plain dicts. 
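+
+Minimal usage sketch (Q2270 is an illustrative CompoundCloud QID, the one
+exercised by the CI route check):
+
+    from src.services.compound import get_full_compound
+    detail = get_full_compound("Q2270")
+    print(detail.summary.label if detail.summary else "compound not found")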
+""" + +from __future__ import annotations + +import re +import urllib.parse +from typing import Optional + +import requests +from wikibaseintegrator import wbi_helpers + +from src.models.compound import ( + CompoundDetail, + CompoundExperimentalDatum, + CompoundIdentifier, + CompoundSummary, + CompoundToxicology, +) + +COMPOUND_EP = "https://compoundcloud.wikibase.cloud/query/sparql" +QLEVER_EP = ( + "https://qlever.cs.uni-freiburg.de/api/wikidata" + "?format=json&query=" +) + +_QID_RE = re.compile(r"^Q\d+$") + + +def is_valid_qid(qid: str) -> bool: + return bool(_QID_RE.fullmatch(qid)) + + +# ── Individual queries ──────────────────────────────────────────────────── + + +def get_properties(cwid: str) -> Optional[CompoundSummary]: + """Fetch core identifiers (InChI, SMILES, formula, mass).""" + q = ( + "PREFIX wd: \n" + "PREFIX wdt: \n\n" + "SELECT ?cmp ?cmpLabel ?formula ?mass ?inchi ?inchiKey ?SMILES WHERE {\n" + f" VALUES ?cmp {{ wd:{cwid} }}\n" + " ?cmp wdt:P9 ?inchi ;\n" + " wdt:P10 ?inchiKey .\n" + " OPTIONAL { ?cmp wdt:P2 ?mass }\n" + " OPTIONAL { ?cmp wdt:P3 ?formula }\n" + " OPTIONAL { ?cmp wdt:P7 ?chiralSMILES }\n" + " OPTIONAL { ?cmp wdt:P12 ?nonchiralSMILES }\n" + ' BIND (COALESCE(IF(BOUND(?chiralSMILES), ?chiralSMILES, 1/0),' + ' IF(BOUND(?nonchiralSMILES), ?nonchiralSMILES, 1/0), "")' + " AS ?SMILES)\n" + " SERVICE wikibase:label {" + ' bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n' + "}" + ) + result = wbi_helpers.execute_sparql_query(q, endpoint=COMPOUND_EP) + bindings = result.get("results", {}).get("bindings", []) + if not bindings: + return None + b = bindings[0] + return CompoundSummary( + wcid=b["cmp"]["value"], + label=b["cmpLabel"]["value"], + inchi=b["inchi"]["value"], + inchikey=b["inchiKey"]["value"], + SMILES=b.get("SMILES", {}).get("value", ""), + formula=b.get("formula", {}).get("value", ""), + mass=b.get("mass", {}).get("value", ""), + ) + + +def get_identifiers(cwid: str) -> list[CompoundIdentifier]: + """Fetch external identifiers (CAS, PubChem, …).""" + q = ( + "PREFIX wd: \n" + "PREFIX wdt: \n\n" + "SELECT DISTINCT ?propertyLabel ?value ?formatterURL\n" + "WHERE {\n" + " VALUES ?property { wd:P13 wd:P22 wd:P23 wd:P26 wd:P27" + " wd:P28 wd:P36 wd:P41 wd:P43 wd:P44 wd:P45 }\n" + " ?property wikibase:directClaim ?valueProp .\n" + f" OPTIONAL {{ wd:{cwid} ?valueProp ?value }}\n" + " OPTIONAL { ?property wdt:P6 ?formatterURL }\n" + " SERVICE wikibase:label {" + ' bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }\n' + "}" + ) + result = wbi_helpers.execute_sparql_query(q, endpoint=COMPOUND_EP) + bindings = result.get("results", {}).get("bindings", []) + out: list[CompoundIdentifier] = [] + for b in bindings: + out.append(CompoundIdentifier( + property_label=b.get("propertyLabel", {}).get("value", ""), + value=b.get("value", {}).get("value", ""), + formatter_url=b.get("formatterURL", {}).get("value", ""), + )) + return out + + +def get_toxicology(cwid: str) -> list[CompoundToxicology]: + """Fetch toxicology properties.""" + q = ( + "PREFIX wd: \n" + "PREFIX wdt: \n\n" + "SELECT DISTINCT ?propertyLabel ?value ?formatterURL\n" + "WHERE {\n" + " VALUES ?property { wd:P17 wd:P19 wd:P4 }\n" + " ?property wikibase:directClaim ?valueProp .\n" + f" OPTIONAL {{ wd:{cwid} ?valueProp ?value }}\n" + " OPTIONAL { ?property wdt:P6 ?formatterURL }\n" + " SERVICE wikibase:label {" + ' bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". 
}\n' + "}" + ) + result = wbi_helpers.execute_sparql_query(q, endpoint=COMPOUND_EP) + bindings = result.get("results", {}).get("bindings", []) + out: list[CompoundToxicology] = [] + for b in bindings: + out.append(CompoundToxicology( + property_label=b.get("propertyLabel", {}).get("value", ""), + value=b.get("value", {}).get("value", ""), + )) + return out + + +def get_experimental_data( + cwid: str, +) -> list[CompoundExperimentalDatum]: + """Fetch experimental data via Wikidata QLever.""" + # Step 1: resolve CompoundCloud QID → Wikidata QID + q1 = ( + "PREFIX wd: <https://compoundcloud.wikibase.cloud/entity/>\n" + "PREFIX wdt: <https://compoundcloud.wikibase.cloud/prop/direct/>\n\n" + "SELECT ?qid WHERE {\n" + " wd:P5 wikibase:directClaim ?identifierProp .\n" + f" wd:{cwid} ?identifierProp ?wikidata .\n" + " BIND (iri(CONCAT(" + '"http://www.wikidata.org/entity/", ?wikidata)) AS ?qid)\n' + "}" + ) + r1 = wbi_helpers.execute_sparql_query(q1, endpoint=COMPOUND_EP) + bindings = r1.get("results", {}).get("bindings", []) + if not bindings: + return [] + qid = bindings[0]["qid"]["value"] + + # Step 2: query Wikidata QLever for experimental properties + q2 = ( + "PREFIX wd: <http://www.wikidata.org/entity/>\n" + "PREFIX wdt: <http://www.wikidata.org/prop/direct/>\n" + "PREFIX prov: <http://www.w3.org/ns/prov#>\n" + "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n" + "PREFIX pr: <http://www.wikidata.org/prop/reference/>\n" + "PREFIX wikibase: <http://wikiba.se/ontology#>\n\n" + "SELECT DISTINCT ?propEntityLabel ?value" + " ?unitsLabel ?source ?doi ?statement\n" + "WHERE {\n" + f" <{qid}> ?propp ?statement .\n" + " ?statement a wikibase:BestRank ;\n" + " ?proppsv [" + " wikibase:quantityAmount ?value ;" + " wikibase:quantityUnit ?units ] .\n" + " ?property wikibase:claim ?propp ;" + " wikibase:statementValue ?proppsv ;" + " wdt:P1629 ?propEntity ;" + " wdt:P31 wd:Q21077852 .\n" + " ?propEntity @en@rdfs:label ?propEntityLabel .\n" + " ?units @en@rdfs:label ?unitsLabel .\n" + # provenance: reference "stated in" (pr:P248) and its DOI (wdt:P356) + " OPTIONAL {\n" + " ?statement prov:wasDerivedFrom/pr:P248 ?sourceTmp .\n" + " OPTIONAL { ?sourceTmp wdt:P356 ?doiTmp }\n" + " }\n" + " BIND (COALESCE(IF(BOUND(?sourceTmp)," + ' ?sourceTmp, 1/0), "") AS ?source)\n' + " BIND (COALESCE(IF(BOUND(?doiTmp)," + ' ?doiTmp, 1/0), "") AS ?doi)\n' + "}" + ) + url = QLEVER_EP + urllib.parse.quote_plus(q2) + resp = requests.get(url, timeout=15) + data = resp.json() + bindings = data.get("results", {}).get("bindings", []) + + out: list[CompoundExperimentalDatum] = [] + for b in bindings: + out.append(CompoundExperimentalDatum( + property_label=b.get("propEntityLabel", {}).get("value", ""), + value=b.get("value", {}).get("value", ""), + units_label=b.get("unitsLabel", {}).get("value", ""), + source=b.get("source", {}).get("value", ""), + doi=b.get("doi", {}).get("value", ""), + see_also=b.get("statement", {}).get("value", ""), + )) + return out + + +def get_full_compound(cwid: str) -> CompoundDetail: + """Fetch everything about a compound.""" + return CompoundDetail( + summary=get_properties(cwid), + identifiers=get_identifiers(cwid), + toxicology=get_toxicology(cwid), + experimental_data=get_experimental_data(cwid), + ) diff --git a/src/sitemap.py b/src/sitemap.py new file mode 100644 index 0000000..5e7ee36 --- /dev/null +++ b/src/sitemap.py @@ -0,0 +1,59 @@ +"""Generate a static sitemap.xml file from DB contents.""" +from __future__ import annotations + +from datetime import datetime +from typing import Iterable +import os +from xml.etree import ElementTree as ET + +from src.db import get_conn + +BASE_URL = os.environ.get("BASE_URL", "http://localhost:5050") +OUT_PATH = os.path.join(os.path.dirname(__file__), "..", "static", "sitemap.xml") + + +def _add_url(root, loc, lastmod=None, changefreq="monthly", priority="0.5"): + url = ET.SubElement(root, "url") + ET.SubElement(url, "loc").text = loc + if lastmod: + ET.SubElement(url, "lastmod").text = lastmod + ET.SubElement(url, "changefreq").text = changefreq +
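    # changefreq/priority are advisory sitemap hints; crawlers may ignore them
+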
ET.SubElement(url, "priority").text = priority + + +def gather_urls() -> Iterable[tuple[str, str | None]]: + conn = get_conn() + try: + yield (f"{BASE_URL}/", datetime.utcnow().isoformat()) + for path in ("/tools", "/methods", "/data", "/casestudies", "/api/v1/docs"): + yield (f"{BASE_URL}{path}", None) + for t in conn.execute("SELECT id, updated_at FROM tools").fetchall(): + if t["id"]: + yield (f"{BASE_URL}/tools/{t['id']}", t["updated_at"]) + for m in conn.execute("SELECT id, updated_at FROM methods").fetchall(): + if m["id"]: + yield (f"{BASE_URL}/methods/{m['id']}", m["updated_at"]) + for cs in conn.execute("SELECT slug FROM case_studies").fetchall(): + if cs["slug"]: + yield (f"{BASE_URL}/casestudies/{cs['slug']}", None) + finally: + conn.close() + + +def build_sitemap(out_path: str = OUT_PATH) -> str: + root = ET.Element("urlset", xmlns="http://www.sitemaps.org/schemas/sitemap/0.9") + for loc, last in gather_urls(): + _add_url(root, loc, lastmod=last) + tree = ET.ElementTree(root) + os.makedirs(os.path.dirname(out_path), exist_ok=True) + tree.write(out_path, encoding="utf-8", xml_declaration=True) + return out_path + + +def main() -> None: + path = build_sitemap() + print(f"Wrote sitemap to: {path}") + + +if __name__ == "__main__": + main() diff --git a/templates/base.html b/templates/base.html index 4c103ac..5983caa 100644 --- a/templates/base.html +++ b/templates/base.html @@ -162,6 +162,9 @@ + + @@ -281,6 +284,7 @@
Menu
EXPLORE
diff --git a/templates/case_studies/casestudies.html b/templates/case_studies/casestudies.html index 0854f95..3bbd7fe 100644 --- a/templates/case_studies/casestudies.html +++ b/templates/case_studies/casestudies.html @@ -11,41 +11,21 @@

Case Studies

diff --git a/templates/case_studies/casestudy_server.html b/templates/case_studies/casestudy_server.html new file mode 100644 index 0000000..eeadd59 --- /dev/null +++ b/templates/case_studies/casestudy_server.html @@ -0,0 +1,229 @@ +{% extends "base.html" %} {% block content %} + + + + +{# ── Breadcrumbs ── #} + + +{# ── Workflow Header ── #} +
+
Process Flow
+
+ {% for ws in step.workflow_steps %} + {% if not loop.first %} +
+ {% endif %} +
+
{{ ws.number }}
+ {{ ws.label }} +
+ {% endfor %} +
+
+ +{# ── Main Content ── #} +
+ + {% if step.nav_title %} +

{{ step.nav_title }}

+ {% endif %} + + {% if step.nav_description %} +

{{ step.nav_description }}

+ {% endif %} + + {% if step.image_html %} + {{ step.image_html | safe }} + {% endif %} + + {# ── Step Buttons ── #} + {% if step.buttons %} +
+ {% for btn in step.buttons %} +
+ {% if btn.disabled %} + + {% elif btn.url %} + + {{ btn.label }} + {% if btn.description %}
{{ btn.description }}{% endif %} +
+ {% else %} + + {% endif %} +
+ {% endfor %} +
+ {% endif %} + + {# ── HTML Content Block ── #} + {% if step.content_html %} +
+ {{ step.content_html | safe }} +
+ {% endif %} + + {# ── Accordion Sections ── #} + {% if step.accordion_sections %} +
+ {% for item in step.accordion_sections %} + {% set item_id = "accordionItem" ~ loop.index0 %} +
+

+ +

+
+
+ {{ item.description | default("") | safe }} +
+
+
+ {% endfor %} +
+ {% endif %} + +
+ +{# ── Feedback Button ── #} + + + + + + +{% endblock %} diff --git a/templates/Safety_Assessment_Workflow.html b/templates/safety_assessment_workflow.html similarity index 100% rename from templates/Safety_Assessment_Workflow.html rename to templates/safety_assessment_workflow.html From 245719bb28d18cd2344e24da5fad5cf64373490d Mon Sep 17 00:00:00 2001 From: Javier Date: Fri, 17 Apr 2026 17:36:31 +0200 Subject: [PATCH 2/2] Add examples to endpoints and API check action --- .github/scripts/api_check.py | 122 ++++++++++++++++ .github/workflows/pr-api-check.yml | 45 ++++++ src/api.py | 219 ++++++++++++++++++++++------- 3 files changed, 336 insertions(+), 50 deletions(-) create mode 100644 .github/scripts/api_check.py create mode 100644 .github/workflows/pr-api-check.yml diff --git a/.github/scripts/api_check.py b/.github/scripts/api_check.py new file mode 100644 index 0000000..9e3dbc7 --- /dev/null +++ b/.github/scripts/api_check.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +"""API check: counts, validation summary, and route health.""" + +import json +import sys +import urllib.request +from datetime import datetime, timezone + +BASE = "http://localhost:5050/api" + + +def get(path): + url = f"{BASE}{path}" + try: + req = urllib.request.Request(url) + with urllib.request.urlopen(req, timeout=15) as r: + return r.status, json.loads(r.read()) + except urllib.error.HTTPError as e: + return e.code, None + except Exception: + return 0, None + + +errors = [] + +# 1. Entity counts +ENTITIES = { + "Tools": "/tools/", + "Methods": "/methods/", + "Case studies": "/casestudies/", + "Regulatory questions": "/regulatory-questions/", + "Stage explanations": "/stages/", +} + +counts = {} +for label, path in ENTITIES.items(): + status, data = get(path) + if status == 200 and isinstance(data, list): + counts[label] = len(data) + else: + counts[label] = None + errors.append(f"GET {path} -> {status}") + +# 2. Validation summary +status, validation = get("/validation/") +if status != 200: + errors.append(f"GET /validation/ -> {status}") + validation = None + +# 3. 
Health check every route +ROUTES = [ + ("GET", "/tools/"), + ("GET", "/tools/cdkdepict"), + ("GET", "/methods/"), + ("GET", "/methods/5_cfda_assay_to_determine_cytotoxicity"), + ("GET", "/regulatory-questions/"), + ("GET", "/stages/"), + ("GET", "/casestudies/"), + ("GET", "/casestudies/kidney"), + ("GET", "/compounds/Q2270"), + ("GET", "/compounds/Q2270/properties"), + ("GET", "/compounds/Q2270/identifiers"), + ("GET", "/compounds/Q2270/toxicology"), + ("GET", "/compounds/Q2270/experimental-data"), + ("GET", "/data/"), + ("GET", "/validation/"), + ("GET", "/validation/tools"), +] + +health = [] +for method, path in ROUTES: + status, _ = get(path) + ok = 200 <= status < 300 + health.append((method, path, status, ok)) + if not ok: + errors.append(f"{method} {path} -> {status}") + +# ── build report ────────────────────────────────────────────────── + +now = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") +lines = [f"## API check -- {now}", ""] + +# counts +lines.append("### Entity counts") +lines.append("") +lines.append("| Entity | Count |") +lines.append("|--------|------:|") +for label, n in counts.items(): + lines.append(f"| {label} | {n if n is not None else 'ERR'} |") +lines.append("") + +# validation +if validation and "entities" in validation: + lines.append("### Validation (field completeness)") + lines.append("") + lines.append("| Entity | Entries | Avg complete | Full |") + lines.append("|--------|--------:|-------------:|-----:|") + for e in validation["entities"]: + lines.append( + f"| {e['entity']} | {e['total_entries']}" + f" | {e['avg_completeness_pct']}%" + f" | {e['fully_complete']}/{e['total_entries']} |" + ) + lines.append("") + +# health +lines.append("### Route health") +lines.append("") +lines.append("| Method | Route | Status |") +lines.append("|--------|-------|-------:|") +for method, path, status, ok in health: + mark = "ok" if ok else f"FAIL ({status})" + lines.append(f"| {method} | `{path}` | {mark} |") +lines.append("") + +# result +all_ok = not errors +lines.append(f"**Result: {'PASS' if all_ok else 'FAIL'}**") + +print("\n".join(lines)) +if not all_ok: + sys.exit(1) diff --git a/.github/workflows/pr-api-check.yml b/.github/workflows/pr-api-check.yml new file mode 100644 index 0000000..b9680cd --- /dev/null +++ b/.github/workflows/pr-api-check.yml @@ -0,0 +1,45 @@ +name: API check + +on: + pull_request: + +permissions: + contents: read + pull-requests: write + +jobs: + api-check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Build Docker image + run: docker build -t vhp4safety . 
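+      # Assumption: the entrypoint seeds the SQLite DB on boot, so the
+      # retry loop in the next step just waits for the API to answer
+      # before the checks run.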
+ + - name: Start container + run: | + docker run -d --name vhp4safety -p 5050:5050 vhp4safety + for i in $(seq 1 30); do + curl -sf http://localhost:5050/api/tools/ && break + sleep 2 + done + + - name: Run API checks + id: report + run: | + python3 .github/scripts/api_check.py > report.md + { + echo 'REPORT<<EOF' + cat report.md + echo 'EOF' + } >> "$GITHUB_OUTPUT" + + - name: Post PR comment + uses: marocchino/sticky-pull-request-comment@v2 + with: + header: api-check + message: ${{ steps.report.outputs.REPORT }} + + - name: Stop container + if: always() + run: docker stop vhp4safety && docker rm vhp4safety diff --git a/src/api.py b/src/api.py index 646a88c..d2b59a6 100644 --- a/src/api.py +++ b/src/api.py @@ -33,48 +33,87 @@ # -- Marshmallow Schemas --------------------------------------------------- class ToolSchema(Schema): - id = fields.Str() - service = fields.Str() - description = fields.Str() - stage = fields.Str() - main_url = fields.Str() - inst_url = fields.Str() - html_name = fields.Str() - png_file_name = fields.Str() + id = fields.Str(metadata={"example": "cdkdepict"}) + service = fields.Str(metadata={ + "example": "CDK Depict", + "description": "Human-readable tool name"}) + description = fields.Str(metadata={ + "example": "A webservice for generating chemical " + "structure images from SMILES inputs."}) + stage = fields.Str(metadata={ + "example": "Other", + "description": "Safety-assessment workflow stage"}) + main_url = fields.Str(metadata={ + "example": "https://www.simolecule.com/cdkdepict/depict.html"}) + inst_url = fields.Str(metadata={ + "example": "https://cdkdepict.cloud.vhp4safety.nl/"}) + html_name = fields.Str(metadata={"example": "cdkdepict.html"}) + png_file_name = fields.Str(metadata={"example": "cdkdepict.png"}) class MethodSchema(Schema): - id = fields.Str() - method = fields.Str() - description = fields.Str() - stage = fields.Str() - substage = fields.Str() - catalog_webpage_url = fields.Str() - raw = fields.Dict(load_default=None) + id = fields.Str(metadata={ + "example": "5_cfda_assay_to_determine_cytotoxicity"}) + method = fields.Str(metadata={ + "example": "5-CFDA assay to determine cytotoxicity", + "description": "Human-readable method name"}) + description = fields.Str(metadata={ + "example": "Fluorescence-based determination " + "of cell membrane damage"}) + stage = fields.Str(metadata={"example": "Adverse Outcome"}) + substage = fields.Str(metadata={ + "example": "Cell death, Adverse outcome"}) + catalog_webpage_url = fields.Str(metadata={ + "example": "https://www.thermofisher.com/order/" + "catalog/product/C1354"}) + raw = fields.Dict(load_default=None, metadata={ + "description": "Full upstream YAML fields " + "from the methods catalog"}) class RegulatoryQuestionSchema(Schema): - key = fields.Str() - label = fields.Str() - explanation = fields.Str() + key = fields.Str(metadata={"example": "reg_q_1a"}) + label = fields.Str(metadata={ + "example": "Kidney Case Study (a)"}) + explanation = fields.Str(metadata={ + "example": "What is the safe cisplatin dose " + "in cancer patients?"}) class StageExplanationSchema(Schema): - name = fields.Str() - explanation = fields.Str() + name = fields.Str(metadata={"example": "ADME"}) + explanation = fields.Str(metadata={ + "example": "Absorption, distribution, metabolism, " + "and excretion of a substance in a living organism, " + "following exposure."}) class CaseStudySchema(Schema): - slug = fields.Str() - title = fields.Str() - description = fields.Str() - image_src = fields.Str() - config_repo = fields.Str() - default_branch = fields.Str() +
name = fields.Str( + attribute="slug", + metadata={"description": "Short identifier used in URLs", + "example": "kidney"}) + title = fields.Str(metadata={ + "example": "Kidney case study"}) + description = fields.Str(metadata={ + "example": "To study kidney disease " + "and pharmacovigilance."}) + image_src = fields.Str(metadata={ + "example": "/static/images/image43_hexagon.svg"}) + config_repo = fields.Str(metadata={ + "example": "VHP4Safety/ui-casestudy-config"}) + default_branch = fields.Str(metadata={ + "example": "main"}) class CaseStudyDetailSchema(CaseStudySchema): - content_json = fields.Raw(load_default=None) + content_json = fields.Raw( + load_default=None, + metadata={ + "description": + "Full nested JSON driving the case-study UI " + "(intro text, regulatory questions, " + "process-flow steps)"}) class CompoundSummarySchema(Schema): @@ -115,7 +154,9 @@ class CompoundDetailSchema(Schema): class DataSearchQuerySchema(Schema): - query = fields.Str(load_default="") + query = fields.Str( + load_default="", + metadata={"example": "kidney"}) page = fields.Int(load_default=1) size = fields.Int(load_default=18) @@ -132,8 +173,12 @@ class DataResultSchema(Schema): class SearchQuerySchema(Schema): - stage = fields.Str(load_default=None) - search = fields.Str(load_default="") + stage = fields.Str( + load_default=None, + metadata={"example": "Other"}) + search = fields.Str( + load_default="", + metadata={"example": ""}) # -- Blueprints ------------------------------------------------------------ @@ -162,7 +207,11 @@ class SearchQuerySchema(Schema): @tools_bp.arguments(SearchQuerySchema, location="query") @tools_bp.response(200, ToolSchema(many=True)) def list_tools(args): - """List all tools, with optional stage/search filters.""" + """List all tools, with optional stage/search filters. + + Returns every tool (service) registered on the platform. + Filter by workflow stage or free-text search on the tool name. + """ conn = get_conn() sql = "SELECT * FROM tools WHERE 1=1" params = [] @@ -179,9 +228,12 @@ def list_tools(args): @tools_bp.route("/<tool_id>") +@tools_bp.doc(parameters=[{ + "name": "tool_id", "in": "path", + "example": "cdkdepict"}]) @tools_bp.response(200, ToolSchema) def get_tool(tool_id): - """Get a single tool by ID.""" + """Get a single tool by its ID.""" conn = get_conn() row = conn.execute("SELECT * FROM tools WHERE id = ?", (tool_id,)).fetchone() conn.close() @@ -196,7 +248,11 @@ @methods_bp.arguments(SearchQuerySchema, location="query") @methods_bp.response(200, MethodSchema(many=True)) def list_methods(args): - """List all methods, with optional stage/search filters.""" + """List all methods, with optional stage/search filters. + + Methods describe experimental or computational procedures + used in safety-assessment workflows. + """ conn = get_conn() sql = "SELECT * FROM methods WHERE 1=1" params = [] @@ -213,9 +269,16 @@ @methods_bp.route("/<method_id>") +@methods_bp.doc(parameters=[{ + "name": "method_id", "in": "path", + "example": "5_cfda_assay_to_determine_cytotoxicity"}]) @methods_bp.response(200, MethodSchema) def get_method(method_id): - """Get a single method by ID.""" + """Get a single method by ID, including full upstream fields. + + The ``raw`` field contains every field from the upstream + methods catalog YAML (AOP references, key events, etc.).
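+
+    Example: ``GET /api/methods/5_cfda_assay_to_determine_cytotoxicity``
+    (the method ID used by the CI route check).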
+ """ conn = get_conn() row = conn.execute("SELECT * FROM methods WHERE id = ?", (method_id,)).fetchone() conn.close() @@ -232,7 +295,11 @@ def get_method(method_id): @reg_q_bp.route("/") @reg_q_bp.response(200, RegulatoryQuestionSchema(many=True)) def list_regulatory_questions(): - """List all regulatory questions.""" + """List the six regulatory questions that link tools to case studies. + + Each question is tied to a case study pair (a/b). + For example, ``reg_q_1a`` = *"Kidney Case Study (a)"*. + """ conn = get_conn() rows = conn.execute("SELECT * FROM regulatory_questions").fetchall() conn.close() @@ -244,7 +311,11 @@ def list_regulatory_questions(): @stages_bp.route("/") @stages_bp.response(200, StageExplanationSchema(many=True)) def list_stages(): - """List all safety-assessment workflow stages.""" + """List all safety-assessment workflow stages. + + Stages are the high-level phases of the VHP4Safety + process flow: ADME, Hazard Assessment, etc. + """ conn = get_conn() rows = conn.execute("SELECT * FROM stage_explanations").fetchall() conn.close() @@ -256,19 +327,32 @@ def list_stages(): @casestudies_bp.route("/") @casestudies_bp.response(200, CaseStudySchema(many=True)) def list_case_studies(): - """List all case studies.""" + """List the three VHP4Safety case studies (summary only). + + Returns name, title, description, and image for each. + Use the detail endpoint for the full content JSON. + + Available names: ``kidney``, ``parkinson``, ``thyroid``. + """ conn = get_conn() rows = conn.execute("SELECT * FROM case_studies").fetchall() conn.close() return [dict(r) for r in rows] -@casestudies_bp.route("/") +@casestudies_bp.route("/") +@casestudies_bp.doc(parameters=[{ + "name": "name", "in": "path", + "example": "kidney"}]) @casestudies_bp.response(200, CaseStudyDetailSchema) -def get_case_study(slug): - """Get a case study with its full content JSON.""" +def get_case_study(name): + """Get a case study by name, including its full content JSON. + + The content JSON contains the intro text, regulatory questions, + and process-flow workflow steps that drive the case-study UI. + """ conn = get_conn() - row = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (slug,)).fetchone() + row = conn.execute("SELECT * FROM case_studies WHERE slug = ?", (name,)).fetchone() conn.close() if not row: abort(404, message="Case study not found") @@ -281,9 +365,17 @@ def get_case_study(slug): # -- Compounds (SPARQL-backed) --------------------------------------------- @compounds_bp.route("/") +@compounds_bp.doc(parameters=[{ + "name": "cwid", "in": "path", + "description": "Wikidata compound ID", + "example": "Q2270"}]) @compounds_bp.response(200, CompoundDetailSchema) def get_compound(cwid): - """Get full compound data.""" + """Get full compound data from Wikidata via SPARQL. + + Returns summary properties, external identifiers, + toxicology data, and experimental measurements. 
+ """ if not is_valid_qid(cwid): abort(400, message="Invalid compound identifier") try: @@ -293,9 +385,11 @@ def get_compound(cwid): @compounds_bp.route("//properties") +@compounds_bp.doc(parameters=[{ + "name": "cwid", "in": "path", "example": "Q2270"}]) @compounds_bp.response(200, CompoundSummarySchema) def get_compound_properties(cwid): - """Get core compound identifiers.""" + """Get core compound properties (formula, mass, InChI, SMILES).""" if not is_valid_qid(cwid): abort(400, message="Invalid compound identifier") try: @@ -308,9 +402,11 @@ def get_compound_properties(cwid): @compounds_bp.route("//identifiers") +@compounds_bp.doc(parameters=[{ + "name": "cwid", "in": "path", "example": "Q2270"}]) @compounds_bp.response(200, CompoundIdentifierSchema(many=True)) def get_compound_identifiers(cwid): - """Get external identifiers.""" + """Get external database identifiers (CAS, PubChem, ChEBI, etc.).""" if not is_valid_qid(cwid): abort(400, message="Invalid compound identifier") try: @@ -320,9 +416,11 @@ def get_compound_identifiers(cwid): @compounds_bp.route("//toxicology") +@compounds_bp.doc(parameters=[{ + "name": "cwid", "in": "path", "example": "Q2270"}]) @compounds_bp.response(200, CompoundToxicologySchema(many=True)) def get_compound_toxicology(cwid): - """Get toxicology data.""" + """Get toxicology data (LD50, LC50, etc.).""" if not is_valid_qid(cwid): abort(400, message="Invalid compound identifier") try: @@ -332,9 +430,11 @@ def get_compound_toxicology(cwid): @compounds_bp.route("//experimental-data") +@compounds_bp.doc(parameters=[{ + "name": "cwid", "in": "path", "example": "Q2270"}]) @compounds_bp.response(200, CompoundExpDataSchema(many=True)) def get_compound_exp_data(cwid): - """Get experimental measurements.""" + """Get experimental measurements (EC50, IC50, etc.).""" if not is_valid_qid(cwid): abort(400, message="Invalid compound identifier") try: @@ -349,7 +449,10 @@ def get_compound_exp_data(cwid): @data_bp.arguments(DataSearchQuerySchema, location="query") @data_bp.response(200, DataResultSchema) def list_data(args): - """Search datasets across BioStudies and Zenodo.""" + """Search datasets across BioStudies and Zenodo repositories. + + Returns paginated results from both sources with normalised metadata. + """ query = args.get("query", "") page = args.get("page", 1) size = args.get("size", 18) @@ -383,9 +486,14 @@ def list_data(args): @data_bp.route("/") +@data_bp.doc(parameters=[{ + "name": "data_id", "in": "path", "example": "S-BSST1503"}]) @data_bp.response(200) def get_data_detail(data_id): - """Get normalized metadata for a single dataset.""" + """Get normalised metadata for a single dataset by its accession ID. + + Searches both BioStudies and Zenodo for the given identifier. + """ bs = BioStudiesExtractor(collection=BIOSTUDIES_COLLECTION) zen = ZenodoExtractor(community=ZENODO_COMMUNITY, record_type=ZENODO_RECORD_TYPE) bs_res = bs.search_studies(data_id, page=1, page_size=1) @@ -522,7 +630,12 @@ def _validate_entity(entity_name, table, pydantic_model, id_attr, label_attr): @validation_bp.route("/") @validation_bp.response(200, ValidationReport) def validate_all(): - """Full data completeness report.""" + """Full data completeness report across all entity types. + + Checks every row in tools, methods, case_studies, + regulatory_questions, and stage_explanations for missing fields. 
+ + """ from datetime import datetime, timezone return { "generated_at": datetime.now(timezone.utc).isoformat(), @@ -534,9 +647,15 @@ def validate_all(): @validation_bp.route("/") +@validation_bp.doc(parameters=[{ + "name": "entity", "in": "path", "example": "tools"}]) @validation_bp.response(200, EntitySummary) def validate_entity(entity): - """Data completeness report for a single entity type.""" + """Data completeness report for a single entity type. + + Valid entity names: ``tools``, ``methods``, ``case_studies``, + ``regulatory_questions``, ``stage_explanations``. + """ if entity not in _ENTITY_REGISTRY: abort(404, message=f"Unknown entity '{entity}'. Valid: {', '.join(_ENTITY_REGISTRY)}") tbl, model, id_a, lbl_a = _ENTITY_REGISTRY[entity]