Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
334 changes: 334 additions & 0 deletions CLAUDE.md

Large diffs are not rendered by default.

427 changes: 427 additions & 0 deletions DATA_ANALYSIS_AGENT_PROMPT.md

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions PULL_REQUEST.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Prerequisite: merge PR #197 first

**This PR is based on PR #197 (analysis modes) and must be merged after it.**

---

## Data Tab: Downloadable Datasets

Adds a new **Data** tab to the UI where users can download all datasets generated during a session. Also renames "Additional information" → "Figures".

### What's new

- **`downloadable_datasets` tracking** — a new field on `AgentState` that accumulates dataset entries (`{label, path, source}`) as they're created throughout the pipeline
- **Climate model CSVs** — tracked after `write_climate_data_manifest()` in `data_agent`
- **ERA5 climatology JSON** — tracked in `prepare_predefined_data()` after extraction
- **ERA5 time series Zarr** — tracked in `data_analysis_agent` after `retrieve_era5_data` tool execution
- **DestinE time series Zarr** — tracked in `data_analysis_agent` after `retrieve_destine_data` tool execution
- **Data tab in UI** — lists all tracked datasets with download buttons; Zarr directories are zipped on the fly, JSON/CSV files download directly
- **Tab rename** — "Additional information" → "Figures"
- **Data tab always visible** — shown regardless of whether figures are available

### Pipeline fix

Each agent node now **returns** `downloadable_datasets` in its return dict so LangGraph properly merges state across stages (in-place mutation alone is not enough).

### Files changed

| File | Change |
|------|--------|
| `climsight_classes.py` | Add `downloadable_datasets: list = []` to `AgentState` |
| `climsight_engine.py` | Track datasets in `data_agent`, `prepare_predefined_data`, pass through `combine_agent` |
| `data_analysis_agent.py` | Track ERA5/DestinE Zarr outputs from tool intermediate steps |
| `streamlit_interface.py` | Rename tab, add Data tab with download buttons |

### Works in all modes

- **fast** — climate model CSVs + ERA5 climatology JSON
- **smart** — above + ERA5 time series Zarr
- **deep** — above + DestinE time series Zarr
1 change: 1 addition & 0 deletions src/climsight/climsight_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,5 @@ class AgentState(BaseModel):
hazard_data: Optional[Any] = None # filtered_events_square for disaster plotting
population_config: dict = {} # {'pop_path': str, 'country': str} for population plotting
predefined_plots: list = [] # List of paths to auto-generated plots
downloadable_datasets: list = [] # List of {"label": str, "path": str, "source": str}
# stream_handler: StreamHandler # Uncomment if needed
38 changes: 35 additions & 3 deletions src/climsight/climsight_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,22 @@ def data_agent(state: AgentState, data={}, df={}):
state.input_params.update(sandbox_paths)
state.input_params["climate_data_manifest"] = manifest_path

# Track climate CSVs as downloadable datasets
state.downloadable_datasets.append({
"label": f"Climate Data Manifest ({climate_source})",
"path": manifest_path,
"source": climate_source,
})
climate_dir = sandbox_paths["climate_data_dir"]
if os.path.isdir(climate_dir):
for fname in sorted(os.listdir(climate_dir)):
if fname.endswith(".csv"):
state.downloadable_datasets.append({
"label": f"Climate Model CSV: {fname}",
"path": os.path.join(climate_dir, fname),
"source": climate_source,
})

# Add appropriate references based on data source
ref_key_map = {
'nextGEMS': 'high_resolution_climate_model',
Expand All @@ -909,7 +925,11 @@ def data_agent(state: AgentState, data={}, df={}):

logger.info(f"Data agent in work (source: {climate_source}).")

respond = {'data_agent_response': data_agent_response, 'df_list': df_list}
respond = {
'data_agent_response': data_agent_response,
'df_list': df_list,
'downloadable_datasets': state.downloadable_datasets,
}

logger.info(f"data_agent_response: {data_agent_response}")
return respond
Expand Down Expand Up @@ -954,6 +974,14 @@ def prepare_predefined_data(state: AgentState):
state.era5_climatology_response = era5_result
if "reference" in era5_result:
collected_references.append(era5_result["reference"])
# Track ERA5 climatology JSON as downloadable
era5_json_path = os.path.join(state.uuid_main_dir, "era5_climatology.json")
if os.path.exists(era5_json_path):
state.downloadable_datasets.append({
"label": "ERA5 Climatology (monthly, 2015-2025)",
"path": era5_json_path,
"source": "ERA5",
})
logger.info(f"Extracted ERA5 climatology for ({lat}, {lon})")
else:
logger.warning(f"ERA5 climatology: {era5_result.get('error', 'unknown error')}")
Expand Down Expand Up @@ -1019,6 +1047,7 @@ def prepare_predefined_data(state: AgentState):
'predefined_plots': predefined_plot_paths,
'era5_climatology_response': era5_data or {},
'data_analysis_images': predefined_plot_paths, # For UI display
'downloadable_datasets': state.downloadable_datasets,
}

def route_after_prepare(state: AgentState) -> str:
Expand Down Expand Up @@ -1279,9 +1308,12 @@ def combine_agent(state: AgentState):
#print("chat_prompt_text: ", chat_prompt_text)
#print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

# Pass downloadable datasets to input_params for UI access
state.input_params['downloadable_datasets'] = state.downloadable_datasets

return {
'final_answer': output_content,
'input_params': state.input_params,
'final_answer': output_content,
'input_params': state.input_params,
'content_message': state.content_message,
'combine_agent_prompt_text': chat_prompt_text
}
Expand Down
34 changes: 21 additions & 13 deletions src/climsight/data_analysis_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,6 @@ def _create_tool_prompt(datasets_text: str, config: dict, lat: float = None, lon
ideal_calls = mode_config.get("ideal_tool_calls", "6-7")
max_per_resp = mode_config.get("max_per_response", 4)
max_reflect = mode_config.get("max_reflect", 2)
has_era5_download = config.get("use_era5_data", False)
has_destine = config.get("use_destine_data", False)

# --- Build prompt without f-strings for code blocks to avoid brace escaping ---
sections = []
Expand Down Expand Up @@ -344,17 +342,6 @@ def _create_tool_prompt(datasets_text: str, config: dict, lat: float = None, lon
" But don't go overboard with tiny one-liner calls either — find a reasonable balance.\n"
" Each script should be self-contained: import what it needs, do meaningful work, print results."
)
tools_list.append(
"- **Python_REPL** — execute Python code in a sandboxed environment.\n"
" All files are relative to the sandbox root.\n"
" The `results/` directory is pre-created for saving plots.\n"
" Datasets are pre-loaded into the sandbox (see paths below).\n"
" STRATEGY: DIVIDE AND CONQUER. Split your work into a few focused scripts,\n"
" each tackling ONE logical task (e.g., load+explore, then analyze+plot-set-1,\n"
" then analyze+plot-set-2). This avoids cascading errors from monolithic scripts.\n"
" But don't go overboard with tiny one-liner calls either — find a reasonable balance.\n"
" Each script should be self-contained: import what it needs, do meaningful work, print results."
)
tools_list.append("- **list_plotting_data_files** — discover files in sandbox directories")
tools_list.append("- **image_viewer** — view and analyze plots in `results/` (use relative paths)")
if has_python_repl and max_reflect > 0:
Expand Down Expand Up @@ -562,6 +549,10 @@ def _create_tool_prompt(datasets_text: str, config: dict, lat: float = None, lon
"Splitting into reasonable chunks lets you catch and fix errors between steps.\n\n"
)

budget_lines.append(
"ANTI-SPAM RULES:\n"
f"- Never call more than {max_per_resp} tools in a single response.\n"
)
if has_python_repl and max_reflect > 0:
budget_lines.append("- Never call reflect_on_image in the same response as Python_REPL.\n")
budget_lines.append(f"- Never call reflect_on_image more than {max_reflect} times total.\n")
Expand Down Expand Up @@ -825,6 +816,14 @@ def data_analysis_agent(
# Collect reference from ERA5 retrieval
if "reference" in obs:
agent_references.append(obs["reference"])
# Track downloaded Zarr for Data tab
if "output_path_zarr" in obs:
variable = obs.get("variable", "unknown")
state.downloadable_datasets.append({
"label": f"ERA5 Time Series: {variable}",
"path": obs["output_path_zarr"],
"source": "ERA5",
})
elif hasattr(obs, 'content'):
era5_output = obs.content
else:
Expand All @@ -837,6 +836,14 @@ def data_analysis_agent(
if isinstance(obs, dict):
if "reference" in obs:
agent_references.append(obs["reference"])
# Track downloaded Zarr for Data tab
if "output_path_zarr" in obs:
variable = obs.get("variable", obs.get("parameter", "unknown"))
state.downloadable_datasets.append({
"label": f"DestinE Time Series: {variable}",
"path": obs["output_path_zarr"],
"source": "DestinE",
})
state.destine_tool_response = str(obs)
state.input_params.setdefault("destine_results", []).append(obs)

Expand Down Expand Up @@ -882,4 +889,5 @@ def data_analysis_agent(
"era5_tool_response": getattr(state, 'era5_tool_response', None),
"destine_tool_response": getattr(state, 'destine_tool_response', None),
"references": state.references, # Propagate collected references
"downloadable_datasets": state.downloadable_datasets,
}
71 changes: 50 additions & 21 deletions src/climsight/streamlit_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,22 +196,6 @@ def _on_mode_change():
with col1:
# Always show additional information (removed toggle per user request)
show_add_info = True
smart_agent = st.toggle("Use extra search", value=False, help="""If this is activated, ClimSight will make additional requests to Wikipedia and RAG, which can significantly increase response time.""")
use_era5_data = st.toggle(
"Enable ERA5 data",
value=config.get("use_era5_data", False),
help="Allow the data analysis agent to retrieve ERA5 data into the sandbox.",
)
use_destine_data = st.toggle(
"Enable DestinE data",
value=config.get("use_destine_data", False),
help="Allow retrieval of DestinE Climate DT projections (SSP3-7.0, 82 parameters).",
)
use_powerful_data_analysis = st.toggle(
"Enable Python analysis",
value=config.get("use_powerful_data_analysis", False),
help="Allow the data analysis agent to use the Python REPL and generate plots.",
)
# remove the llmModeKey_box from the form, as we tend to run the agent mode, direct mode is for development only
#llmModeKey_box = st.radio("Select LLM mode 👉", key="visibility", options=["Direct", "Agent (experimental)"])

Expand Down Expand Up @@ -480,9 +464,9 @@ def update_progress_ui(message):
show_add_info_display = st.session_state.get('last_show_add_info', False)

if show_add_info_display:
tab_text, tab_add, tab_refs = st.tabs(["Report", "Additional information", "References"])
tab_text, tab_figs, tab_data, tab_refs = st.tabs(["Report", "Figures", "Data", "References"])
else:
tab_text, tab_refs = st.tabs(["Report", "References"])
tab_text, tab_data, tab_refs = st.tabs(["Report", "Data", "References"])

with tab_text:
st.markdown(st.session_state['last_output'])
Expand All @@ -493,12 +477,12 @@ def update_progress_ui(message):
st.markdown(f"- {ref}")

if show_add_info_display:
with tab_add:
with tab_figs:
stored_input_params = st.session_state.get('last_input_params', {})
stored_figs = st.session_state.get('last_figs', {})
stored_climatemodel_name = st.session_state.get('last_climatemodel_name', 'unknown')
st.subheader("Additional information", divider='rainbow')

st.subheader("Figures", divider='rainbow')
if 'lat' in stored_input_params and 'lon' in stored_input_params:
st.markdown(f"**Coordinates:** {stored_input_params['lat']}, {stored_input_params['lon']}")
if 'elevation' in stored_input_params:
Expand Down Expand Up @@ -632,6 +616,51 @@ def update_progress_ui(message):
for image_path in other_plots:
st.image(image_path)

# Data tab - downloadable datasets
with tab_data:
stored_input_params_data = st.session_state.get('last_input_params', {})
datasets = stored_input_params_data.get('downloadable_datasets', [])
if datasets:
st.subheader("Available Datasets", divider='rainbow')
for idx, ds_entry in enumerate(datasets):
path = ds_entry.get("path", "")
label = ds_entry.get("label", "Dataset")
source = ds_entry.get("source", "")
if path and os.path.exists(path):
col_label, col_btn = st.columns([3, 1])
with col_label:
st.markdown(f"**{label}**")
st.caption(f"{source} — {os.path.basename(path)}")
with col_btn:
if os.path.isdir(path):
# Zarr directories: zip on the fly
import io
import zipfile
buf = io.BytesIO()
with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
for root, dirs, files in os.walk(path):
for f in files:
fp = os.path.join(root, f)
zf.write(fp, os.path.relpath(fp, os.path.dirname(path)))
st.download_button(
"Download",
buf.getvalue(),
file_name=os.path.basename(path) + ".zip",
mime="application/zip",
key=f"dl_data_{idx}",
)
else:
with open(path, "rb") as f:
file_data = f.read()
st.download_button(
"Download",
file_data,
file_name=os.path.basename(path),
key=f"dl_data_{idx}",
)
else:
st.info("No datasets were generated for this query.")

# Download buttons
st.markdown("---") # Add a separator

Expand Down
14 changes: 12 additions & 2 deletions test/plot_destine_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
"""Quick script to inspect and plot DestinE Zarr data."""
"""Quick script to inspect and plot DestinE Zarr data.

Usage:
python plot_destine_data.py path/to/destine_167_sfc_20200101_20211231.zarr
"""

import argparse
import sys

import xarray as xr
import matplotlib.pyplot as plt

zarr_path = "/Users/ikuznets/work/projects/climsight/code/climsight/tmp/sandbox/38c864498d174b8a90ebb24ac67cf70e/destine_data/destine_167_sfc_20200101_20211231.zarr"
parser = argparse.ArgumentParser(description="Inspect and plot a DestinE Zarr dataset.")
parser.add_argument("zarr_path", help="Path to the DestinE .zarr directory")
args = parser.parse_args()
zarr_path = args.zarr_path

ds = xr.open_dataset(zarr_path, engine="zarr")

Expand Down