Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
334 changes: 334 additions & 0 deletions CLAUDE.md

Large diffs are not rendered by default.

427 changes: 427 additions & 0 deletions DATA_ANALYSIS_AGENT_PROMPT.md

Large diffs are not rendered by default.

39 changes: 39 additions & 0 deletions PULL_REQUEST.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Prerequisite: merge PR #197 first

**This PR is based on PR #197 (analysis modes) and must be merged after it.**

---

## Data Tab: Downloadable Datasets

Adds a new **Data** tab to the UI where users can download all datasets generated during a session. Also renames "Additional information" → "Figures".

### What's new

- **`downloadable_datasets` tracking** — a new field on `AgentState` that accumulates dataset entries (`{label, path, source}`) as they're created throughout the pipeline
- **Climate model CSVs** — tracked after `write_climate_data_manifest()` in `data_agent`
- **ERA5 climatology JSON** — tracked in `prepare_predefined_data()` after extraction
- **ERA5 time series Zarr** — tracked in `data_analysis_agent` after `retrieve_era5_data` tool execution
- **DestinE time series Zarr** — tracked in `data_analysis_agent` after `retrieve_destine_data` tool execution
- **Data tab in UI** — lists all tracked datasets with download buttons; Zarr directories are zipped on the fly, JSON/CSV files download directly
- **Tab rename** — "Additional information" → "Figures"
- **Data tab always visible** — shown regardless of whether figures are available

### Pipeline fix

Each agent node now **returns** `downloadable_datasets` in its return dict so LangGraph properly merges state across stages (in-place mutation alone is not enough).

### Files changed

| File | Change |
|------|--------|
| `climsight_classes.py` | Add `downloadable_datasets: list = []` to `AgentState` |
| `climsight_engine.py` | Track datasets in `data_agent`, `prepare_predefined_data`, pass through `combine_agent` |
| `data_analysis_agent.py` | Track ERA5/DestinE Zarr outputs from tool intermediate steps |
| `streamlit_interface.py` | Rename tab, add Data tab with download buttons |

### Works in all modes

- **fast** — climate model CSVs + ERA5 climatology JSON
- **smart** — above + ERA5 time series Zarr
- **deep** — above + DestinE time series Zarr
1 change: 1 addition & 0 deletions src/climsight/climsight_classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,4 +41,5 @@ class AgentState(BaseModel):
hazard_data: Optional[Any] = None # filtered_events_square for disaster plotting
population_config: dict = {} # {'pop_path': str, 'country': str} for population plotting
predefined_plots: list = [] # List of paths to auto-generated plots
downloadable_datasets: list = [] # List of {"label": str, "path": str, "source": str}
# stream_handler: StreamHandler # Uncomment if needed
38 changes: 35 additions & 3 deletions src/climsight/climsight_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -888,6 +888,22 @@ def data_agent(state: AgentState, data={}, df={}):
state.input_params.update(sandbox_paths)
state.input_params["climate_data_manifest"] = manifest_path

# Track climate CSVs as downloadable datasets
state.downloadable_datasets.append({
"label": f"Climate Data Manifest ({climate_source})",
"path": manifest_path,
"source": climate_source,
})
climate_dir = sandbox_paths["climate_data_dir"]
if os.path.isdir(climate_dir):
for fname in sorted(os.listdir(climate_dir)):
if fname.endswith(".csv"):
state.downloadable_datasets.append({
"label": f"Climate Model CSV: {fname}",
"path": os.path.join(climate_dir, fname),
"source": climate_source,
})

# Add appropriate references based on data source
ref_key_map = {
'nextGEMS': 'high_resolution_climate_model',
Expand All @@ -909,7 +925,11 @@ def data_agent(state: AgentState, data={}, df={}):

logger.info(f"Data agent in work (source: {climate_source}).")

respond = {'data_agent_response': data_agent_response, 'df_list': df_list}
respond = {
'data_agent_response': data_agent_response,
'df_list': df_list,
'downloadable_datasets': state.downloadable_datasets,
}

logger.info(f"data_agent_response: {data_agent_response}")
return respond
Expand Down Expand Up @@ -954,6 +974,14 @@ def prepare_predefined_data(state: AgentState):
state.era5_climatology_response = era5_result
if "reference" in era5_result:
collected_references.append(era5_result["reference"])
# Track ERA5 climatology JSON as downloadable
era5_json_path = os.path.join(state.uuid_main_dir, "era5_climatology.json")
if os.path.exists(era5_json_path):
state.downloadable_datasets.append({
"label": "ERA5 Climatology (monthly, 2015-2025)",
"path": era5_json_path,
"source": "ERA5",
})
logger.info(f"Extracted ERA5 climatology for ({lat}, {lon})")
else:
logger.warning(f"ERA5 climatology: {era5_result.get('error', 'unknown error')}")
Expand Down Expand Up @@ -1019,6 +1047,7 @@ def prepare_predefined_data(state: AgentState):
'predefined_plots': predefined_plot_paths,
'era5_climatology_response': era5_data or {},
'data_analysis_images': predefined_plot_paths, # For UI display
'downloadable_datasets': state.downloadable_datasets,
}

def route_after_prepare(state: AgentState) -> str:
Expand Down Expand Up @@ -1279,9 +1308,12 @@ def combine_agent(state: AgentState):
#print("chat_prompt_text: ", chat_prompt_text)
#print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")

# Pass downloadable datasets to input_params for UI access
state.input_params['downloadable_datasets'] = state.downloadable_datasets

return {
'final_answer': output_content,
'input_params': state.input_params,
'final_answer': output_content,
'input_params': state.input_params,
'content_message': state.content_message,
'combine_agent_prompt_text': chat_prompt_text
}
Expand Down
34 changes: 21 additions & 13 deletions src/climsight/data_analysis_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,6 @@ def _create_tool_prompt(datasets_text: str, config: dict, lat: float = None, lon
ideal_calls = mode_config.get("ideal_tool_calls", "6-7")
max_per_resp = mode_config.get("max_per_response", 4)
max_reflect = mode_config.get("max_reflect", 2)
has_era5_download = config.get("use_era5_data", False)
has_destine = config.get("use_destine_data", False)

# --- Build prompt without f-strings for code blocks to avoid brace escaping ---
sections = []
Expand Down Expand Up @@ -344,17 +342,6 @@ def _create_tool_prompt(datasets_text: str, config: dict, lat: float = None, lon
" But don't go overboard with tiny one-liner calls either — find a reasonable balance.\n"
" Each script should be self-contained: import what it needs, do meaningful work, print results."
)
tools_list.append(
"- **Python_REPL** — execute Python code in a sandboxed environment.\n"
" All files are relative to the sandbox root.\n"
" The `results/` directory is pre-created for saving plots.\n"
" Datasets are pre-loaded into the sandbox (see paths below).\n"
" STRATEGY: DIVIDE AND CONQUER. Split your work into a few focused scripts,\n"
" each tackling ONE logical task (e.g., load+explore, then analyze+plot-set-1,\n"
" then analyze+plot-set-2). This avoids cascading errors from monolithic scripts.\n"
" But don't go overboard with tiny one-liner calls either — find a reasonable balance.\n"
" Each script should be self-contained: import what it needs, do meaningful work, print results."
)
tools_list.append("- **list_plotting_data_files** — discover files in sandbox directories")
tools_list.append("- **image_viewer** — view and analyze plots in `results/` (use relative paths)")
if has_python_repl and max_reflect > 0:
Expand Down Expand Up @@ -562,6 +549,10 @@ def _create_tool_prompt(datasets_text: str, config: dict, lat: float = None, lon
"Splitting into reasonable chunks lets you catch and fix errors between steps.\n\n"
)

budget_lines.append(
"ANTI-SPAM RULES:\n"
f"- Never call more than {max_per_resp} tools in a single response.\n"
)
if has_python_repl and max_reflect > 0:
budget_lines.append("- Never call reflect_on_image in the same response as Python_REPL.\n")
budget_lines.append(f"- Never call reflect_on_image more than {max_reflect} times total.\n")
Expand Down Expand Up @@ -825,6 +816,14 @@ def data_analysis_agent(
# Collect reference from ERA5 retrieval
if "reference" in obs:
agent_references.append(obs["reference"])
# Track downloaded Zarr for Data tab
if "output_path_zarr" in obs:
variable = obs.get("variable", "unknown")
state.downloadable_datasets.append({
"label": f"ERA5 Time Series: {variable}",
"path": obs["output_path_zarr"],
"source": "ERA5",
})
elif hasattr(obs, 'content'):
era5_output = obs.content
else:
Expand All @@ -837,6 +836,14 @@ def data_analysis_agent(
if isinstance(obs, dict):
if "reference" in obs:
agent_references.append(obs["reference"])
# Track downloaded Zarr for Data tab
if "output_path_zarr" in obs:
variable = obs.get("variable", obs.get("parameter", "unknown"))
state.downloadable_datasets.append({
"label": f"DestinE Time Series: {variable}",
"path": obs["output_path_zarr"],
"source": "DestinE",
})
state.destine_tool_response = str(obs)
state.input_params.setdefault("destine_results", []).append(obs)

Expand Down Expand Up @@ -882,4 +889,5 @@ def data_analysis_agent(
"era5_tool_response": getattr(state, 'era5_tool_response', None),
"destine_tool_response": getattr(state, 'destine_tool_response', None),
"references": state.references, # Propagate collected references
"downloadable_datasets": state.downloadable_datasets,
}
71 changes: 50 additions & 21 deletions src/climsight/streamlit_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,22 +196,6 @@ def _on_mode_change():
with col1:
# Always show additional information (removed toggle per user request)
show_add_info = True
smart_agent = st.toggle("Use extra search", value=False, help="""If this is activated, ClimSight will make additional requests to Wikipedia and RAG, which can significantly increase response time.""")
use_era5_data = st.toggle(
"Enable ERA5 data",
value=config.get("use_era5_data", False),
help="Allow the data analysis agent to retrieve ERA5 data into the sandbox.",
)
use_destine_data = st.toggle(
"Enable DestinE data",
value=config.get("use_destine_data", False),
help="Allow retrieval of DestinE Climate DT projections (SSP3-7.0, 82 parameters).",
)
use_powerful_data_analysis = st.toggle(
"Enable Python analysis",
value=config.get("use_powerful_data_analysis", False),
help="Allow the data analysis agent to use the Python REPL and generate plots.",
)
# remove the llmModeKey_box from the form, as we tend to run the agent mode, direct mode is for development only
#llmModeKey_box = st.radio("Select LLM mode 👉", key="visibility", options=["Direct", "Agent (experimental)"])

Expand Down Expand Up @@ -480,9 +464,9 @@ def update_progress_ui(message):
show_add_info_display = st.session_state.get('last_show_add_info', False)

if show_add_info_display:
tab_text, tab_add, tab_refs = st.tabs(["Report", "Additional information", "References"])
tab_text, tab_figs, tab_data, tab_refs = st.tabs(["Report", "Figures", "Data", "References"])
else:
tab_text, tab_refs = st.tabs(["Report", "References"])
tab_text, tab_data, tab_refs = st.tabs(["Report", "Data", "References"])

with tab_text:
st.markdown(st.session_state['last_output'])
Expand All @@ -493,12 +477,12 @@ def update_progress_ui(message):
st.markdown(f"- {ref}")

if show_add_info_display:
with tab_add:
with tab_figs:
stored_input_params = st.session_state.get('last_input_params', {})
stored_figs = st.session_state.get('last_figs', {})
stored_climatemodel_name = st.session_state.get('last_climatemodel_name', 'unknown')
st.subheader("Additional information", divider='rainbow')

st.subheader("Figures", divider='rainbow')
if 'lat' in stored_input_params and 'lon' in stored_input_params:
st.markdown(f"**Coordinates:** {stored_input_params['lat']}, {stored_input_params['lon']}")
if 'elevation' in stored_input_params:
Expand Down Expand Up @@ -632,6 +616,51 @@ def update_progress_ui(message):
for image_path in other_plots:
st.image(image_path)

# Data tab - downloadable datasets
with tab_data:
stored_input_params_data = st.session_state.get('last_input_params', {})
datasets = stored_input_params_data.get('downloadable_datasets', [])
if datasets:
st.subheader("Available Datasets", divider='rainbow')
for idx, ds_entry in enumerate(datasets):
path = ds_entry.get("path", "")
label = ds_entry.get("label", "Dataset")
source = ds_entry.get("source", "")
if path and os.path.exists(path):
col_label, col_btn = st.columns([3, 1])
with col_label:
st.markdown(f"**{label}**")
st.caption(f"{source} — {os.path.basename(path)}")
with col_btn:
if os.path.isdir(path):
# Zarr directories: zip on the fly
import io
import zipfile
buf = io.BytesIO()
with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf:
for root, dirs, files in os.walk(path):
for f in files:
fp = os.path.join(root, f)
zf.write(fp, os.path.relpath(fp, os.path.dirname(path)))
st.download_button(
"Download",
buf.getvalue(),
file_name=os.path.basename(path) + ".zip",
mime="application/zip",
key=f"dl_data_{idx}",
)
else:
with open(path, "rb") as f:
file_data = f.read()
st.download_button(
"Download",
file_data,
file_name=os.path.basename(path),
key=f"dl_data_{idx}",
)
else:
st.info("No datasets were generated for this query.")

# Download buttons
st.markdown("---") # Add a separator

Expand Down
14 changes: 12 additions & 2 deletions test/plot_destine_data.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,19 @@
"""Quick script to inspect and plot DestinE Zarr data."""
"""Quick script to inspect and plot DestinE Zarr data.

Usage:
python plot_destine_data.py path/to/destine_167_sfc_20200101_20211231.zarr
"""

import argparse
import sys

import xarray as xr
import matplotlib.pyplot as plt

zarr_path = "/Users/ikuznets/work/projects/climsight/code/climsight/tmp/sandbox/38c864498d174b8a90ebb24ac67cf70e/destine_data/destine_167_sfc_20200101_20211231.zarr"
parser = argparse.ArgumentParser(description="Inspect and plot a DestinE Zarr dataset.")
parser.add_argument("zarr_path", help="Path to the DestinE .zarr directory")
args = parser.parse_args()
zarr_path = args.zarr_path

ds = xr.open_dataset(zarr_path, engine="zarr")

Expand Down