diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..c4923ef --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,334 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## Project Overview + +ClimSight is a climate decision support system that integrates Large Language Models (LLMs) with climate data to provide localized climate insights. It uses a multi-agent architecture built on LangGraph to combine climate model data, geographic information, RAG (Retrieval Augmented Generation) from scientific reports, and LLM reasoning. + +**Key Technologies:** +- **LangChain/LangGraph**: Multi-agent orchestration framework +- **Streamlit**: Web UI framework +- **xarray/NetCDF4**: Climate data processing +- **Chroma**: Vector database for RAG +- **GeoPandas/OSMnx**: Geospatial analysis +- **OpenAI API**: LLM backend (supports custom models via AITTA platform) +- **earthkit.data**: DestinE data retrieval via polytope +- **Arraylake**: ERA5 reanalysis data access + +## Common Commands + +### Environment Setup + +```bash +# Using conda/mamba (recommended) +mamba env create -f environment.yml +conda activate climsight +python download_data.py # Downloads ~8GB of climate data + +# Using pip +python -m venv venv +source venv/bin/activate # On Windows: venv\Scripts\activate +pip install -e . 
+python download_data.py +``` + +### Running the Application + +```bash +# Development mode (from repo root) +streamlit run src/climsight/climsight.py + +# If installed via pip +climsight + +# Testing mode (no OpenAI API calls) +streamlit run src/climsight/climsight.py skipLLMCall +``` + +### Testing + +```bash +# Run all tests (DestinE tests excluded by default) +cd test +pytest + +# Run specific test categories (see test/pytest.ini for all markers) +pytest -m geo # Geographic functions +pytest -m climate # Climate data functions +pytest -m env # Environmental functions +pytest -m "not request" # Skip tests requiring HTTP requests + +# DestinE tool tests (require ~/.polytopeapirc token + OPENAI_API_KEY) +pytest -m destine -v # All DestinE tests +pytest -m destine -v -k search # RAG search only (fast) +pytest -m destine -v -k retrieve # Data retrieval only + +# Run single test file +pytest test_geofunctions.py + +# Run with verbose output +pytest -v +``` + +### Linting + +```bash +# Syntax errors and undefined names only (CI uses this) +flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics +``` + +### Batch Processing (sequential/) + +```bash +cd sequential + +# Generate climate questions +python question_generator.py + +# Visualize questions on map +streamlit run question_map.py + +# Process questions through ClimSight +python question_runner.py --questions_file Q_1.json --llm_model gpt-4.1-nano +``` + +## Architecture + +### Multi-Agent Workflow (LangGraph) + +ClimSight uses a state machine with specialized agents that process user questions: + +1. **intro_agent** ([climsight_engine.py:1079](src/climsight/climsight_engine.py#L1079)) + - Entry point that filters invalid requests + - Uses exclusion-based logic to determine if query is climate-related + - Routes to either FINISH (invalid) or CONTINUE (parallel agents) + +2. 
**Parallel Information Gathering Agents:** + - **ipcc_rag_agent** ([climsight_engine.py:1031](src/climsight/climsight_engine.py#L1031)): Searches IPCC reports via RAG + - **general_rag_agent** ([climsight_engine.py:1052](src/climsight/climsight_engine.py#L1052)): Searches general climate literature via RAG + - **data_agent** ([climsight_engine.py:854](src/climsight/climsight_engine.py#L854)): Extracts climate model data for location + - **zero_rag_agent** ([climsight_engine.py:723](src/climsight/climsight_engine.py#L723)): Gathers geographic/environmental context + - **smart_agent** ([smart_agent.py:71](src/climsight/smart_agent.py#L71)) — OPTIONAL: + - Information gathering only (Wikipedia, RAG, ECOCROP) + - Controlled by `use_smart_agent` in config + - No Python REPL — that is now in data_analysis_agent + +3. **prepare_predefined_data** ([climsight_engine.py:917](src/climsight/climsight_engine.py#L917)) + - Runs after all parallel agents complete + - Extracts ERA5 climatology and generates predefined plots (climate comparison with ERA5 overlay, disaster summary, population projection) + - Routes via `route_after_prepare`: if `use_powerful_data_analysis` → data_analysis_agent, otherwise → combine_agent + +4. **data_analysis_agent** ([data_analysis_agent.py:476](src/climsight/data_analysis_agent.py#L476)) — OPTIONAL: + - Receives all outputs from parallel agents + predefined plots + - Performs data extraction, post-processing, and visualization using tool-calling + - Tools: Python REPL (Jupyter kernel), ERA5 retrieval, DestinE search + retrieval, image viewer, reflection, visualization strategy + - Controlled by `use_powerful_data_analysis` in config + +5. 
**combine_agent** ([climsight_engine.py:1181](src/climsight/climsight_engine.py#L1181)) + - Synthesizes all agent outputs into final answer + - Generates references and formatted response + +**Routing Logic:** +- `route_fromintro` launches parallel agents (including smart_agent if enabled) +- All parallel agents converge to `prepare_predefined_data` +- `route_after_prepare` conditionally invokes `data_analysis_agent` or skips to `combine_agent` +- `data_analysis_agent` flows to `combine_agent` + +### Key Modules + +**Core Engine:** +- [climsight_engine.py](src/climsight/climsight_engine.py): Main orchestration, agent definitions, workflow setup +- [climsight_classes.py](src/climsight/climsight_classes.py): `AgentState` Pydantic model for state sharing between agents +- [data_container.py](src/climsight/data_container.py): Container for DataFrames, xarray datasets, and matplotlib figures + +**Data Processing:** +- [climate_functions.py](src/climsight/climate_functions.py): Load and extract climate model data (temperature, precipitation, wind) +- [extract_climatedata_functions.py](src/climsight/extract_climatedata_functions.py): Request climate data with model-specific handling +- [geo_functions.py](src/climsight/geo_functions.py): Geocoding, land/water detection, elevation, soil, land use +- [environmental_functions.py](src/climsight/environmental_functions.py): Biodiversity, natural hazards +- [economic_functions.py](src/climsight/economic_functions.py): Population data and projections + +**RAG System:** +- [rag.py](src/climsight/rag.py): RAG query interface supporting multiple embedding backends (OpenAI, AITTA, Mistral) +- [embedding_utils.py](src/climsight/embedding_utils.py): Embedding model initialization for different backends + +**Information Gathering Agent:** +- [smart_agent.py](src/climsight/smart_agent.py): LangChain agent with tools for Wikipedia, RAG, and ECOCROP database queries (information gathering only — no Python REPL) + +**Data Analysis Agent:** 
+- [data_analysis_agent.py](src/climsight/data_analysis_agent.py): Full tool-calling agent for data extraction, analysis, and visualization + - `_create_tool_prompt()` (line 120): Dynamically builds system prompt based on config + - `data_analysis_agent()` (line 476): Main entry point + - Registers tools: Python REPL, ERA5 retrieval, DestinE search/retrieval, file listing (`list_plotting_data_files`), image viewer, reflect, wise_agent + +**Tools (src/climsight/tools/):** +- [python_repl.py](src/climsight/tools/python_repl.py): Sandboxed Python execution with persistent Jupyter kernel (used by data_analysis_agent) +- [image_viewer.py](src/climsight/tools/image_viewer.py): View and analyze generated plots +- [reflection_tools.py](src/climsight/tools/reflection_tools.py): Quality feedback on generated plots (7/10 threshold) +- [visualization_tools.py](src/climsight/tools/visualization_tools.py): List sandbox files and visualization strategy advice +- [predefined_plots.py](src/climsight/tools/predefined_plots.py): Standard climate visualizations (climate comparison with ERA5 overlay, disaster summary, population projection) +- [era5_climatology_tool.py](src/climsight/tools/era5_climatology_tool.py): Extract ERA5 ground truth (10-year climatology) +- [era5_retrieval_tool.py](src/climsight/tools/era5_retrieval_tool.py): Download ERA5 time series via Arraylake +- [destine_retrieval_tool.py](src/climsight/tools/destine_retrieval_tool.py): DestinE parameter search (RAG over 82 params) + data retrieval via earthkit.data/polytope + +**Interfaces:** +- [streamlit_interface.py](src/climsight/streamlit_interface.py): Web UI with map selection +- [terminal_interface.py](src/climsight/terminal_interface.py): CLI interface +- [stream_handler.py](src/climsight/stream_handler.py): Progress updates for UI + +## Configuration + +**Primary Config:** [config.yml](config.yml) + +Key configuration sections: +- `model_type`: "openai" | "local" | "aitta" +- `model_name*`: Different models for RAG, tools, agents, combine step +- 
`use_smart_agent`: Enable/disable information gathering agent +- `use_powerful_data_analysis`: Enable/disable data_analysis_agent (Python REPL + tools) +- `use_high_resolution_climate_model`: Use high-res nextGEMS data +- `climate_model_input_files`: Map of NetCDF files with metadata +- `rag_settings`: Embedding model type, Chroma DB paths per backend +- `rag_template`: System prompt for RAG queries +- `era5_climatology`: ERA5 ground truth settings +- `use_destine_data`: Enable/disable DestinE Climate DT data retrieval +- `destine_settings`: Chroma DB path and collection name for parameter search + +**Data Sources:** [data_sources.yml](data_sources.yml) +- Defines remote data URLs and local extraction paths +- Used by `download_data.py` to fetch climate data, natural hazards, population, geographic boundaries + +**Reference Data:** [references.yml](references.yml) +- Citation information for datasets and reports +- Automatically added to outputs + +## Important Patterns + +### LangChain Import Strategy + +The codebase handles LangChain 1.0+ migration with try/except blocks: + +```python +try: + from langchain.chains import LLMChain +except ImportError: + from langchain_classic.chains import LLMChain +``` + +This pattern appears in [climsight_engine.py](src/climsight/climsight_engine.py#L24-27) and [smart_agent.py](src/climsight/smart_agent.py#L11-14). Maintain this pattern when adding LangChain imports. + +### Climate Data Structure + +Climate data uses xarray Datasets loaded via [climate_functions.py](src/climsight/climate_functions.py): +- Historical data: `data['hist']` (typically 1995-2014) +- Future projections: `data['future']` (2020-2049 in decadal chunks) +- Variables mapped via `config['variable_mappings']`: Temperature→tas, Precipitation→pr, etc. +- Coordinates mapped via `config['dimension_mappings']`: lat, lon, month + +High-resolution nextGEMS data uses HEALPix coordinate system (see `coordinate_system: 'healpix'` in config). 
+ +### Agent State Management + +All agents receive `AgentState` ([climsight_classes.py](src/climsight/climsight_classes.py)) and return dictionaries that update state: + +```python +def my_agent(state: AgentState): + # Access shared state + user_query = state.user + lat = float(state.input_params['lat']) + + # Process... + + # Return updates (merged into state) + return {'my_agent_response': result} +``` + +Key AgentState fields: +- **Agent outputs:** `data_agent_response`, `zero_agent_response`, `ipcc_rag_agent_response`, `general_rag_agent_response`, `smart_agent_response`, `data_analysis_response` +- **Sandbox paths:** `thread_id`, `uuid_main_dir`, `results_dir`, `climate_data_dir`, `era5_data_dir`, `destine_data_dir` +- **Artifacts:** `df_list` (climate DataFrames), `predefined_plots`, `data_analysis_images`, `references` +- **ERA5/DestinE:** `era5_climatology_response`, `era5_tool_response`, `destine_tool_response` + +The workflow automatically merges return values into state for downstream agents. + +### Predefined Plots with ERA5 Overlay + +[predefined_plots.py](src/climsight/tools/predefined_plots.py) generates standard climate visualizations with ERA5 observational data overlay: +- Supports multiple climate model variable naming conventions (nextGEMS: mean2t/tp, AWI-CM: tas/pr, DestinE: avg_2t/avg_tprate) +- Maps model variables to ERA5 equivalents via `era5_var_map` and `descriptive_era5_map` +- Handles cross-dataframe column matching for models with different historical/future column names (e.g., AWI-CM) +- Computes ERA5 wind speed from u10/v10 components + +### DestinE Data Retrieval + +The DestinE tool uses a two-step workflow: +1. **Parameter search** — RAG semantic search over 82 DestinE Climate DT parameters via Chroma vector store (`data/destine/chroma_db/`) +2. 
**Data retrieval** — Download via `earthkit.data.from_source("polytope", "destination-earth", ...)` using token from `~/.polytopeapirc` + +Authentication: Run `desp-authentication.py` to obtain a token (written to `~/.polytopeapirc`). No username/password passed at runtime. + +### RAG Database Initialization + +RAG databases are initialized in [climsight_engine.py](src/climsight/climsight_engine.py) before workflow creation: +- Checks if Chroma DB exists with `is_valid_rag_db()` +- Falls back to creating new DB if invalid +- Supports multiple backends (OpenAI, AITTA) with separate DB paths + +Embedding backend is selected via `config['rag_settings']['embedding_model_type']`. + +### Location Validation + +Before processing any query, [location_request()](src/climsight/climsight_engine.py#L113) validates the point is on land: +- Returns `(None, None)` if point is in ocean (line 139) +- Distinguishes between ocean and inland water bodies +- Fetches address, elevation, soil, land use data + +This critical check prevents wasted processing on invalid locations. + +## Development Notes + +### Adding New Climate Variables + +1. Add NetCDF file to `config['climate_model_input_files']` +2. Add variable mapping to `config['climate_model_variable_mapping']` or `config['variable_mappings']` +3. Update [extract_climatedata_functions.py](src/climsight/extract_climatedata_functions.py) to handle new variable +4. Update `data_agent` in [climsight_engine.py](src/climsight/climsight_engine.py#L854) to include in prompt +5. Update ERA5 variable maps in [predefined_plots.py](src/climsight/tools/predefined_plots.py) for overlay support + +### Adding New Tools to Data Analysis Agent + +1. Create tool function in [tools/](src/climsight/tools/) directory +2. Import in [data_analysis_agent.py](src/climsight/data_analysis_agent.py) +3. Register tool in the tools list (around line 595) +4. 
Add tool description to `_create_tool_prompt()` (line 120) so the agent knows when to use it + +### Adding New Tools to Smart Agent + +1. Create tool function in [tools/](src/climsight/tools/) directory +2. Import and register in [smart_agent.py](src/climsight/smart_agent.py) +3. Update system prompt to describe when to use the tool + +### Testing with Mock Data + +Use `skipLLMCall` mode to bypass OpenAI API: +- Test files in [test/](test/) use mock configs +- Expected outputs stored as CSV files (e.g., `expected_df_climate_data.csv`) +- Use pytest markers to skip network-dependent tests +- DestinE tests require `-m destine` flag and `~/.polytopeapirc` token + +### Logging + +All modules log to `climsight.log` in the working directory. Check this file for detailed execution traces when debugging agent behavior. + +### Model Compatibility + +The config supports multiple LLM model types: +- Standard OpenAI models (gpt-4o, gpt-4.1-nano, etc.) +- o1 models (automatically sets temperature=1, see [smart_agent.py:77](src/climsight/smart_agent.py#L77)) +- AITTA platform models via `get_aitta_chat_model()` function + +When adding new models, check temperature requirements and tool-calling compatibility. + +### Prompt Template Safety + +When adding code examples to agent prompts (e.g., in `_create_tool_prompt()`), escape literal curly braces by doubling them (`{` → `{{`, `}` → `}}`) — otherwise `ChatPromptTemplate` interprets `{...}` as a template variable placeholder. diff --git a/DATA_ANALYSIS_AGENT_PROMPT.md b/DATA_ANALYSIS_AGENT_PROMPT.md new file mode 100644 index 0000000..6b0bdfd --- /dev/null +++ b/DATA_ANALYSIS_AGENT_PROMPT.md @@ -0,0 +1,427 @@ +# Data Analysis Agent — Full Prompt Overview + +This document shows the complete prompt sent to the data analysis agent LLM. +The prompt is dynamically assembled in `data_analysis_agent.py` from multiple sections. 
+ +## How the Prompt is Assembled + +The agent uses `create_standard_agent_executor()` from `agent_helpers.py`, which creates a `ChatPromptTemplate` with: + +``` +[system] → tool_prompt (built by _create_tool_prompt()) +[user] → analysis_brief (filtered context from upstream agents) +[messages] → conversation history +[agent_scratchpad] → tool call/response pairs (managed by LangChain) +``` + +--- + +## Part 1: System Prompt (`_create_tool_prompt()`) + +Built dynamically based on config flags. Below is the **full prompt with all features enabled**. + +--- + +### ROLE + +``` +You are ClimSight's data analysis agent. +Your job: provide ADDITIONAL quantitative climate analysis beyond the standard plots. +You have a persistent Python REPL, pre-extracted data files, and optional ERA5 download access. + +CRITICAL EFFICIENCY RULES: +- HARD LIMIT: 30 tool calls total for the entire session. +- MAX 3-4 tool calls per response. Never fire 5+ tools in a single response. +- Write focused Python scripts — each one should accomplish a meaningful chunk of work. +- SEQUENTIAL ordering: first Python_REPL to generate plots, THEN reflect_on_image in a LATER response. + Never call reflect_on_image in the same response as Python_REPL. +- Ideal session: 3-4 REPL calls → 1-2 reflect calls → final answer. That is 6-7 tool calls total. +``` + +### 1. DATA ALREADY IN THE SANDBOX (do not re-extract) + +#### ERA5 Climatology (observational ground truth) +*Included when: ERA5 climatology is available* + +``` +- File: `era5_climatology.json` in sandbox root +- Content: monthly averages of t2m (°C), cp+lsp (precipitation, m), u10/v10 (m/s) — period 2015-2025 +- Load: `era5 = json.load(open('era5_climatology.json'))` +- Role: treat as GROUND TRUTH for validating model data. 
+``` + +#### Climate Model Data +*Included when: climate model data exists in sandbox* + +``` +- Manifest: `climate_data/climate_data_manifest.json` — READ FIRST to discover all simulations +- Data files: `climate_data/simulation_N.csv` + `simulation_N_meta.json` +- Shortcut: `climate_data/data.csv` = baseline simulation only +- Columns: Month, mean2t (°C), cp (convective precip, m), lsp (large-scale precip, m), + wind_u, wind_v, wind_speed, wind_direction +``` + +### 2. PRE-GENERATED PLOTS (already created — DO NOT recreate) + +*Included when: predefined plots exist* + +``` +- `results/climate_*.png` — temperature, precipitation, wind comparison with ERA5 overlay +- `results/disaster_counts.png` — historical disaster events by type +- `results/population_projection.png` — population trends + +Analyze the underlying DATA directly for your own insights. +Only use `image_viewer` on plots YOU create, not on these predefined ones. +``` + +### 3. ERA5 TIME SERIES DOWNLOAD (year-by-year data) + +*Included when: `use_era5_data: true`* + +``` +Use `retrieve_era5_data` to download full annual time series (2015-2024) from Earthmover. +This gives you YEAR-BY-YEAR values — far richer than the 10-year climatology average above. + +When to download: +- You need to detect warming/drying TRENDS → download t2, cp, and lsp (sum cp+lsp for total precip) +- You need interannual variability or extreme-year identification → download the relevant variables +- You only need monthly climatology for comparison → skip, use era5_climatology.json + +Tool parameters: +- Variable codes: `t2` (temperature), `cp` (convective precip), `lsp` (large-scale precip), + `u10`/`v10` (wind), `mslp` (pressure) +- NOTE: `tp` (total precipitation) is NOT available. Use `cp` + `lsp` and sum them. 
+- Always pass `work_dir='.'` +- Output: Zarr store saved to `era5_data/` folder + +Loading ERA5 Zarr in Python_REPL: +```python +import xarray as xr, glob +era5_files = glob.glob('era5_data/*.zarr') +print(era5_files) +ds = xr.open_dataset(era5_files[0], engine='zarr', chunks={}) +data = ds['t2'].to_series() +``` + +### 3b. DESTINE CLIMATE PROJECTIONS (SSP3-7.0, 82 parameters) + +*Included when: `use_destine_data: true`* + +``` +You have access to the DestinE Climate DT — high-resolution projections (IFS-NEMO, 2020-2039). +Use a TWO-STEP workflow: + +**Step 1: Search for parameters** +Call `search_destine_parameters` with a natural language query to find relevant parameters. +Example: search_destine_parameters('temperature at 2 meters') → returns candidates with param_id, levtype. + +**Step 2: Download data** +Call `retrieve_destine_data` with param_id and levtype from search results. +- Dates: YYYYMMDD format, range 20200101-20391231 +- **By default request the FULL period**: start_date=20200101, end_date=20391231 (20 years of projections) +- Only use a shorter range if the user explicitly asks for a specific period +- Output: Zarr store saved to `destine_data/` folder + +Loading DestinE data in Python_REPL: +```python +import xarray as xr, glob +destine_files = glob.glob('destine_data/*.zarr') +print(destine_files) +ds = xr.open_dataset(destine_files[0], engine='zarr', chunks={}) +print(ds) +``` + +### 4. AVAILABLE TOOLS + +``` +- **retrieve_era5_data** — download ERA5 year-by-year time series (see section 3) +- **search_destine_parameters** — find DestinE parameters via RAG search (see section 3b) +- **retrieve_destine_data** — download DestinE time series (see section 3b) +- **Python_REPL** — execute Python code in a sandboxed environment. + All files are relative to the sandbox root. + The `results/` directory is pre-created for saving plots. + Datasets are pre-loaded into the sandbox (see paths below). + STRATEGY: DIVIDE AND CONQUER. 
Split your work into a few focused scripts, + each tackling ONE logical task (e.g., load+explore, then analyze+plot-set-1, + then analyze+plot-set-2). This avoids cascading errors from monolithic scripts. + But don't go overboard with tiny one-liner calls either — find a reasonable balance. + Each script should be self-contained: import what it needs, do meaningful work, print results. +- **list_plotting_data_files** — discover files in sandbox directories +- **image_viewer** — view and analyze plots in `results/` (use relative paths) +- **reflect_on_image** — get quality feedback on a plot you created. + Call once per plot — reflect on ALL generated plots, not just one. + MUST be called in a SEPARATE response AFTER the Python_REPL that created the plots. + Always verify the file exists (via os.path.exists in REPL) BEFORE calling this tool. + MINIMUM ACCEPTABLE SCORE: 7/10. If score < 7, you MUST re-plot with fixes applied. + Read the fix suggestions from the reviewer and apply them in your next REPL call. +- **wise_agent** — ask for visualization strategy advice before coding +``` + +### 5. REQUIRED WORKFLOW + +``` +**Step 1 — Explore and load data:** +MANDATORY FIRST STEP: Load data files AND print their structure before any analysis. +This prevents cascading errors from wrong column names or data formats. + +CRITICAL DATA FORMAT WARNINGS: +- Month column may contain STRING NAMES ('January', 'February') — convert before using as int +- CSV paths in manifest are FILENAMES ONLY — always prepend 'climate_data/' +- Precipitation column may be 'tp' (total precip in mm) OR separate 'cp'/'lsp' (in meters) +- Always print df.columns.tolist() and df.head(2) BEFORE writing analysis code +``` + +```python +import os, json +import pandas as pd + +# 1. 
Load manifest and print structure +manifest = json.load(open('climate_data/climate_data_manifest.json')) +for e in manifest['entries']: + csv_path = os.path.join('climate_data', os.path.basename(e['csv'])) + print(e['years_of_averaging'], csv_path, '(baseline)' if e.get('main') else '') + +# 2. Load one CSV and inspect columns/dtypes +csv_path = os.path.join('climate_data', os.path.basename(manifest['entries'][0]['csv'])) +df = pd.read_csv(csv_path) +print('Columns:', df.columns.tolist()) +print('Dtypes:', df.dtypes.to_dict()) +print(df.head(2)) + +# 3. Convert Month column (handles both 'January' strings and integers) +month_map = {name: i+1 for i, name in enumerate( + ['January','February','March','April','May','June', + 'July','August','September','October','November','December'])} +if not pd.api.types.is_numeric_dtype(df['Month']): + df['Month'] = df['Month'].map(month_map) +df['Month'] = df['Month'].astype(int) +``` + +```python +# ERA5 observations (ground truth) +era5 = json.load(open('era5_climatology.json')) +era5_temp = era5['variables']['t2m']['monthly_values'] # dict: month_name → value +``` + +``` +**Step 2 — (Optional) Download ERA5 time series:** +Call `retrieve_era5_data` for `t2`, `cp`, and/or `lsp` if year-by-year analysis is needed. +Load the resulting Zarr files in Python_REPL (see section 3 for loading pattern). + +**Step 3 — Climatology analysis + comparison plots:** +Load ALL climate model CSVs from Step 1. Use the EXACT column names you printed in Step 1. +REMINDER: prepend 'climate_data/' to CSV filenames from manifest. +REMINDER: Convert Month strings ('January'→1) if needed (see Step 1 code). +REMINDER: Precipitation may be column 'tp' (already in mm) or 'cp'/'lsp' (in meters, multiply by 1000). +Compute monthly means, deltas between decades. +Create 2-3 comparison plots (temperature, precipitation, wind) saved to `results/`. +Print a concise summary of baseline values and projected changes. 
+ +**Step 4 — Threshold & risk analysis + additional plots:** +If ERA5 time series were downloaded: compute threshold exceedances (heat days, frost days, +dry spells, wind extremes). Create 2-3 threshold/risk plots saved to `results/`. +Print quantitative risk metrics. If no ERA5 time series, skip this step. + +**Step 5 — Verify ALL plots (SEPARATE response, after plots exist):** +In a NEW response (never in the same response as Python_REPL), call `reflect_on_image` +once per generated plot — QA ALL of them, not just one. +MINIMUM SCORE: 7/10. If any plot scores below 7: +- Read the reviewer's fix suggestions carefully +- Write a NEW Python_REPL script applying those exact fixes +- Do NOT give up or skip re-plotting — the fixes are usually simple (font sizes, legend position) +``` + +### 6. PLOTTING CONVENTIONS + +``` +Every plot you create MUST follow these rules: +- `plt.figure(figsize=(12, 6))` — wide-format for readability +- `plt.savefig('results/filename.png', dpi=150, bbox_inches='tight')` +- `plt.close()` — ALWAYS close to prevent memory leaks +- Font sizes: title 14pt, axis labels 12pt, tick labels 10pt, legend 10pt +- Color palette: use scientific defaults — blue=#2196F3 cold, red=#F44336 hot, + green=#4CAF50 precipitation; use 'tab10' for multi-series +- ERA5 observations: always plot as BLACK solid line with circle markers ('k-o') +- Model projections: colored dashed lines, labeled by decade +- Include units on EVERY axis (°C, mm/month, m/s) +- Use `plt.tight_layout()` or `bbox_inches='tight'` to prevent label clipping +``` + +### 7. ERROR RECOVERY + +``` +MOST COMMON ERRORS (fix these FIRST): +- `ValueError: invalid literal for int()` on Month → Month column has string names like 'January'. + FIX: Use month_map dict to convert (see Step 1 code example). +- `FileNotFoundError: simulation_1.csv` → Manifest paths are filenames only. 
+ FIX: Prepend 'climate_data/': `os.path.join('climate_data', os.path.basename(e['csv']))` +- `KeyError: 'cp'` or `'lsp'` → CSV column is 'tp' (total precip in mm), not cp/lsp. + FIX: Check df.columns.tolist() first, use whatever precipitation column exists. + +Other errors: +- File not found? → Run `list_plotting_data_files` to see available files and adapt paths. +- Zarr load fails? → Check `era5_data/` contents with `glob.glob('era5_data/*')`. +- Plot save fails? → Ensure `results/` dir exists: `os.makedirs('results', exist_ok=True)`. +- JSON parse error? → Print the file contents first, then fix the loading code. +- Empty DataFrame? → Print `df.head()` and `df.columns.tolist()` to inspect structure. +``` + +### 8. SANDBOX PATHS AND DATA + +*Dynamically generated from `_build_datasets_text(state)`:* + +``` +Available data directories: +- Climate data: 'climate_data/' +- ERA5 data: 'era5_data/' +- DestinE data: 'destine_data/' + +## Climate Data Files Available (in 'climate_data/' folder) +Files: data.csv, simulation_1.csv, simulation_1_meta.json, ... +Note: Load with `pd.read_csv('climate_data/data.csv')` +``` + +### 9. PROACTIVE ANALYSIS + +``` +Even if the user's query is vague, you SHOULD proactively: +- Create a temperature trend visualization (all decades + ERA5 baseline) +- Create a precipitation comparison chart +- Highlight the 3 months with the largest projected changes +- Identify potential climate risks relevant to the query +``` + +### 10. OUTPUT FORMAT + +``` +Your final response MUST include: +1. **Observed Climate** — current conditions from ERA5 (2015-2025 baseline) +2. **Model Performance** — how well projections match ERA5 observations +3. **Projected Changes** — future vs baseline, with magnitude and timing +4. **Critical Months** — months with largest changes or highest risk +5. **Visualizations** — list of created plot files in `results/` +6. 
**Implications** — interpretation relevant to the user's query +``` + +### TOOL BUDGET (HARD LIMIT: 30 tool calls total, max 3-4 per response) + +``` +Plan your session carefully — you have at most 30 tool calls: +- Python_REPL: a few calls, each focused on ONE logical task +- retrieve_era5_data: 0-3 calls (one per variable: t2, cp, lsp) +- search_destine_parameters: 1-2 calls (find param_ids before downloading) +- retrieve_destine_data: 0-3 calls (use full 2020-2039 range by default) +- reflect_on_image: one call per plot — QA ALL generated plots, not just one +- list_plotting_data_files / image_viewer: 0-2 calls +- wise_agent: 0-1 calls + +DIVIDE AND CONQUER — Python_REPL strategy: +- Script 1: Load ALL data, explore structure, print column names and shapes +- Script 2: Climatology analysis + comparison plots (temp, precip, wind) +- Script 3: Threshold/risk analysis + additional plots (if ERA5 time series available) +- Script 4 (if needed): Fix any errors from previous scripts, create missing plots + +WHY: One massive all-in-one script causes cascading errors — one bug kills everything. +Splitting into reasonable chunks lets you catch and fix errors between steps. + +ANTI-SPAM RULES: +- Never call more than 3-4 tools in a single response. +- Never call reflect_on_image in the same response as Python_REPL. +- Never call reflect_on_image more than twice total. +- Don't spam tiny one-liner REPL calls — each script should do meaningful work. +``` + +--- + +## Part 2: User Message (`analysis_brief`) + +The user message sent to the agent is either: + +### With filter LLM (default) + +A two-step process: +1. All upstream agent outputs are concatenated and sent to a filter LLM +2. The filter extracts actionable analysis requirements + +Filter prompt (`_build_filter_prompt()`): + +``` +You are a context filter for ClimSight's data analysis agent. +Your output will be consumed by an agent that has Python REPL, ERA5 data access, +and climate model data. 
Focus on what it should COMPUTE and PLOT. + +Extract ONLY actionable analysis requirements as concise bullets: +- Target variables with units (e.g., 'Temperature (°C)', 'Precipitation (mm/month)') +- Quantitative thresholds or criteria (e.g., 'days above 35°C', 'monthly rainfall < 50mm') +- Time ranges or scenario labels (e.g., '2020-2029 vs 2040-2049', 'SSP5-8.5') +- Spatial specifics (location name, coordinates, search radius) +- Requested analyses (trend detection, seasonal comparison, anomaly identification, custom plots) +- Mentioned crops, infrastructure, or decision topics (e.g., 'wheat cultivation', 'solar panel siting') + +Rules: +- Do NOT include raw climate data values or lengthy text passages. +- Do NOT include RAG or Wikipedia excerpts — only summarize their KEY requirements. +- Omit vague statements that cannot be translated into a computation or plot. +- If no specific analysis is requested, default to: temperature trends, precipitation comparison, + wind assessment, and a climate change signal summary. 
+``` + +The resulting filtered context is wrapped as: + +``` +USER QUESTION: {user's original question} + +Location: {location name} +Coordinates: {lat}, {lon} + +ANALYSIS REQUIREMENTS: +{filtered bullets from filter LLM} +``` + +### Without filter (fallback) + +``` +USER QUESTION: {user's original question} + +Location: {location name} +Coordinates: {lat}, {lon} + +Available climatology: +{ERA5 climatology summary} + +Required analysis: +- Extract Temperature and Precipitation data +- Compare historical vs future projections +- Create visualizations if Python_REPL is available +``` + +--- + +## Part 3: Agent Execution + +- **LLM**: `ChatOpenAI` with model from `config["llm_combine"]["model_name"]` (default: `gpt-4.1-nano`) +- **Agent type**: `create_openai_tools_agent` (OpenAI tools/function calling) +- **Max iterations**: 20 (each iteration = one LLM call + tool execution) +- **Tool calls per response**: LLM can call multiple tools in parallel (OpenAI native feature) + +### Registered Tools (when all features enabled) + +| Tool | Source | Purpose | +|------|--------|---------| +| `retrieve_era5_data` | `era5_retrieval_tool.py` | Download ERA5 time series via Arraylake | +| `search_destine_parameters` | `destine_retrieval_tool.py` | RAG search over 82 DestinE parameters | +| `retrieve_destine_data` | `destine_retrieval_tool.py` | Download DestinE projections via polytope | +| `Python_REPL` | `python_repl.py` | Sandboxed Python execution (Jupyter kernel) | +| `list_plotting_data_files` | `visualization_tools.py` | List files in sandbox directories | +| `image_viewer` | `image_viewer.py` | View and analyze plot images | +| `reflect_on_image` | `reflection_tools.py` | Quality feedback on generated plots | +| `wise_agent` | `visualization_tools.py` | Visualization strategy advice | + +### Post-Processing + +After agent execution, intermediate steps are scanned for: +- ERA5 climatology outputs → stored in state +- Data component outputs → collected with references +- 
Python REPL outputs → plot images collected +- ERA5/DestinE retrieval outputs → references collected +- All collected references → added to final state for combine_agent diff --git a/PULL_REQUEST.md b/PULL_REQUEST.md new file mode 100644 index 0000000..4c48479 --- /dev/null +++ b/PULL_REQUEST.md @@ -0,0 +1,39 @@ +# FIRST ACCEPT PREVIOUS PR ;) + +**This PR is based on PR #197 (analysis modes) and must be merged after it.** + +--- + +## Data Tab: Downloadable Datasets + +Adds a new **Data** tab to the UI where users can download all datasets generated during a session. Also renames "Additional information" → "Figures". + +### What's new + +- **`downloadable_datasets` tracking** — a new field on `AgentState` that accumulates dataset entries (`{label, path, source}`) as they're created throughout the pipeline +- **Climate model CSVs** — tracked after `write_climate_data_manifest()` in `data_agent` +- **ERA5 climatology JSON** — tracked in `prepare_predefined_data()` after extraction +- **ERA5 time series Zarr** — tracked in `data_analysis_agent` after `retrieve_era5_data` tool execution +- **DestinE time series Zarr** — tracked in `data_analysis_agent` after `retrieve_destine_data` tool execution +- **Data tab in UI** — lists all tracked datasets with download buttons; Zarr directories are zipped on the fly, JSON/CSV files download directly +- **Tab rename** — "Additional information" → "Figures" +- **Data tab always visible** — shown regardless of whether figures are available + +### Pipeline fix + +Each node that tracks datasets (`data_agent`, `prepare_predefined_data`, `data_analysis_agent`) now **returns** `downloadable_datasets` in its return dict so LangGraph properly merges state across stages (in-place mutation alone is not enough); `combine_agent` then copies the list into `input_params` so the UI can read it.
+ +### Files changed + +| File | Change | +|------|--------| +| `climsight_classes.py` | Add `downloadable_datasets: list = []` to `AgentState` | +| `climsight_engine.py` | Track datasets in `data_agent`, `prepare_predefined_data`, pass through `combine_agent` | +| `data_analysis_agent.py` | Track ERA5/DestinE Zarr outputs from tool intermediate steps | +| `streamlit_interface.py` | Rename tab, add Data tab with download buttons | + +### Works in all modes + +- **fast** — climate model CSVs + ERA5 climatology JSON +- **smart** — above + ERA5 time series Zarr +- **deep** — above + DestinE time series Zarr diff --git a/src/climsight/climsight_classes.py b/src/climsight/climsight_classes.py index 6a04573..3960ff6 100644 --- a/src/climsight/climsight_classes.py +++ b/src/climsight/climsight_classes.py @@ -41,4 +41,5 @@ class AgentState(BaseModel): hazard_data: Optional[Any] = None # filtered_events_square for disaster plotting population_config: dict = {} # {'pop_path': str, 'country': str} for population plotting predefined_plots: list = [] # List of paths to auto-generated plots + downloadable_datasets: list = [] # List of {"label": str, "path": str, "source": str} # stream_handler: StreamHandler # Uncomment if needed diff --git a/src/climsight/climsight_engine.py b/src/climsight/climsight_engine.py index 3dcb61a..5a02a0a 100644 --- a/src/climsight/climsight_engine.py +++ b/src/climsight/climsight_engine.py @@ -888,6 +888,22 @@ def data_agent(state: AgentState, data={}, df={}): state.input_params.update(sandbox_paths) state.input_params["climate_data_manifest"] = manifest_path + # Track climate CSVs as downloadable datasets + state.downloadable_datasets.append({ + "label": f"Climate Data Manifest ({climate_source})", + "path": manifest_path, + "source": climate_source, + }) + climate_dir = sandbox_paths["climate_data_dir"] + if os.path.isdir(climate_dir): + for fname in sorted(os.listdir(climate_dir)): + if fname.endswith(".csv"): + 
state.downloadable_datasets.append({ + "label": f"Climate Model CSV: {fname}", + "path": os.path.join(climate_dir, fname), + "source": climate_source, + }) + # Add appropriate references based on data source ref_key_map = { 'nextGEMS': 'high_resolution_climate_model', @@ -909,7 +925,11 @@ def data_agent(state: AgentState, data={}, df={}): logger.info(f"Data agent in work (source: {climate_source}).") - respond = {'data_agent_response': data_agent_response, 'df_list': df_list} + respond = { + 'data_agent_response': data_agent_response, + 'df_list': df_list, + 'downloadable_datasets': state.downloadable_datasets, + } logger.info(f"data_agent_response: {data_agent_response}") return respond @@ -954,6 +974,14 @@ def prepare_predefined_data(state: AgentState): state.era5_climatology_response = era5_result if "reference" in era5_result: collected_references.append(era5_result["reference"]) + # Track ERA5 climatology JSON as downloadable + era5_json_path = os.path.join(state.uuid_main_dir, "era5_climatology.json") + if os.path.exists(era5_json_path): + state.downloadable_datasets.append({ + "label": "ERA5 Climatology (monthly, 2015-2025)", + "path": era5_json_path, + "source": "ERA5", + }) logger.info(f"Extracted ERA5 climatology for ({lat}, {lon})") else: logger.warning(f"ERA5 climatology: {era5_result.get('error', 'unknown error')}") @@ -1019,6 +1047,7 @@ def prepare_predefined_data(state: AgentState): 'predefined_plots': predefined_plot_paths, 'era5_climatology_response': era5_data or {}, 'data_analysis_images': predefined_plot_paths, # For UI display + 'downloadable_datasets': state.downloadable_datasets, } def route_after_prepare(state: AgentState) -> str: @@ -1279,9 +1308,12 @@ def combine_agent(state: AgentState): #print("chat_prompt_text: ", chat_prompt_text) #print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") + # Pass downloadable datasets to input_params for UI access + state.input_params['downloadable_datasets'] = 
state.downloadable_datasets + return { - 'final_answer': output_content, - 'input_params': state.input_params, + 'final_answer': output_content, + 'input_params': state.input_params, 'content_message': state.content_message, 'combine_agent_prompt_text': chat_prompt_text } diff --git a/src/climsight/data_analysis_agent.py b/src/climsight/data_analysis_agent.py index 16b55e6..712bc8b 100644 --- a/src/climsight/data_analysis_agent.py +++ b/src/climsight/data_analysis_agent.py @@ -190,8 +190,6 @@ def _create_tool_prompt(datasets_text: str, config: dict, lat: float = None, lon ideal_calls = mode_config.get("ideal_tool_calls", "6-7") max_per_resp = mode_config.get("max_per_response", 4) max_reflect = mode_config.get("max_reflect", 2) - has_era5_download = config.get("use_era5_data", False) - has_destine = config.get("use_destine_data", False) # --- Build prompt without f-strings for code blocks to avoid brace escaping --- sections = [] @@ -344,17 +342,6 @@ def _create_tool_prompt(datasets_text: str, config: dict, lat: float = None, lon " But don't go overboard with tiny one-liner calls either — find a reasonable balance.\n" " Each script should be self-contained: import what it needs, do meaningful work, print results." ) - tools_list.append( - "- **Python_REPL** — execute Python code in a sandboxed environment.\n" - " All files are relative to the sandbox root.\n" - " The `results/` directory is pre-created for saving plots.\n" - " Datasets are pre-loaded into the sandbox (see paths below).\n" - " STRATEGY: DIVIDE AND CONQUER. Split your work into a few focused scripts,\n" - " each tackling ONE logical task (e.g., load+explore, then analyze+plot-set-1,\n" - " then analyze+plot-set-2). This avoids cascading errors from monolithic scripts.\n" - " But don't go overboard with tiny one-liner calls either — find a reasonable balance.\n" - " Each script should be self-contained: import what it needs, do meaningful work, print results." 
- ) tools_list.append("- **list_plotting_data_files** — discover files in sandbox directories") tools_list.append("- **image_viewer** — view and analyze plots in `results/` (use relative paths)") if has_python_repl and max_reflect > 0: @@ -562,6 +549,10 @@ def _create_tool_prompt(datasets_text: str, config: dict, lat: float = None, lon "Splitting into reasonable chunks lets you catch and fix errors between steps.\n\n" ) + budget_lines.append( + "ANTI-SPAM RULES:\n" + f"- Never call more than {max_per_resp} tools in a single response.\n" + ) if has_python_repl and max_reflect > 0: budget_lines.append("- Never call reflect_on_image in the same response as Python_REPL.\n") budget_lines.append(f"- Never call reflect_on_image more than {max_reflect} times total.\n") @@ -825,6 +816,14 @@ def data_analysis_agent( # Collect reference from ERA5 retrieval if "reference" in obs: agent_references.append(obs["reference"]) + # Track downloaded Zarr for Data tab + if "output_path_zarr" in obs: + variable = obs.get("variable", "unknown") + state.downloadable_datasets.append({ + "label": f"ERA5 Time Series: {variable}", + "path": obs["output_path_zarr"], + "source": "ERA5", + }) elif hasattr(obs, 'content'): era5_output = obs.content else: @@ -837,6 +836,14 @@ def data_analysis_agent( if isinstance(obs, dict): if "reference" in obs: agent_references.append(obs["reference"]) + # Track downloaded Zarr for Data tab + if "output_path_zarr" in obs: + variable = obs.get("variable", obs.get("parameter", "unknown")) + state.downloadable_datasets.append({ + "label": f"DestinE Time Series: {variable}", + "path": obs["output_path_zarr"], + "source": "DestinE", + }) state.destine_tool_response = str(obs) state.input_params.setdefault("destine_results", []).append(obs) @@ -882,4 +889,5 @@ def data_analysis_agent( "era5_tool_response": getattr(state, 'era5_tool_response', None), "destine_tool_response": getattr(state, 'destine_tool_response', None), "references": state.references, # Propagate 
collected references + "downloadable_datasets": state.downloadable_datasets, } diff --git a/src/climsight/streamlit_interface.py b/src/climsight/streamlit_interface.py index d10bfd5..8b181f3 100644 --- a/src/climsight/streamlit_interface.py +++ b/src/climsight/streamlit_interface.py @@ -196,22 +196,6 @@ def _on_mode_change(): with col1: # Always show additional information (removed toggle per user request) show_add_info = True - smart_agent = st.toggle("Use extra search", value=False, help="""If this is activated, ClimSight will make additional requests to Wikipedia and RAG, which can significantly increase response time.""") - use_era5_data = st.toggle( - "Enable ERA5 data", - value=config.get("use_era5_data", False), - help="Allow the data analysis agent to retrieve ERA5 data into the sandbox.", - ) - use_destine_data = st.toggle( - "Enable DestinE data", - value=config.get("use_destine_data", False), - help="Allow retrieval of DestinE Climate DT projections (SSP3-7.0, 82 parameters).", - ) - use_powerful_data_analysis = st.toggle( - "Enable Python analysis", - value=config.get("use_powerful_data_analysis", False), - help="Allow the data analysis agent to use the Python REPL and generate plots.", - ) # remove the llmModeKey_box from the form, as we tend to run the agent mode, direct mode is for development only #llmModeKey_box = st.radio("Select LLM mode 👉", key="visibility", options=["Direct", "Agent (experimental)"]) @@ -480,9 +464,9 @@ def update_progress_ui(message): show_add_info_display = st.session_state.get('last_show_add_info', False) if show_add_info_display: - tab_text, tab_add, tab_refs = st.tabs(["Report", "Additional information", "References"]) + tab_text, tab_figs, tab_data, tab_refs = st.tabs(["Report", "Figures", "Data", "References"]) else: - tab_text, tab_refs = st.tabs(["Report", "References"]) + tab_text, tab_data, tab_refs = st.tabs(["Report", "Data", "References"]) with tab_text: st.markdown(st.session_state['last_output']) @@ -493,12 
+477,12 @@ def update_progress_ui(message): st.markdown(f"- {ref}") if show_add_info_display: - with tab_add: + with tab_figs: stored_input_params = st.session_state.get('last_input_params', {}) stored_figs = st.session_state.get('last_figs', {}) stored_climatemodel_name = st.session_state.get('last_climatemodel_name', 'unknown') - - st.subheader("Additional information", divider='rainbow') + + st.subheader("Figures", divider='rainbow') if 'lat' in stored_input_params and 'lon' in stored_input_params: st.markdown(f"**Coordinates:** {stored_input_params['lat']}, {stored_input_params['lon']}") if 'elevation' in stored_input_params: @@ -632,6 +616,51 @@ def update_progress_ui(message): for image_path in other_plots: st.image(image_path) + # Data tab - downloadable datasets + with tab_data: + stored_input_params_data = st.session_state.get('last_input_params', {}) + datasets = stored_input_params_data.get('downloadable_datasets', []) + if datasets: + st.subheader("Available Datasets", divider='rainbow') + for idx, ds_entry in enumerate(datasets): + path = ds_entry.get("path", "") + label = ds_entry.get("label", "Dataset") + source = ds_entry.get("source", "") + if path and os.path.exists(path): + col_label, col_btn = st.columns([3, 1]) + with col_label: + st.markdown(f"**{label}**") + st.caption(f"{source} — {os.path.basename(path)}") + with col_btn: + if os.path.isdir(path): + # Zarr directories: zip on the fly + import io + import zipfile + buf = io.BytesIO() + with zipfile.ZipFile(buf, 'w', zipfile.ZIP_DEFLATED) as zf: + for root, dirs, files in os.walk(path): + for f in files: + fp = os.path.join(root, f) + zf.write(fp, os.path.relpath(fp, os.path.dirname(path))) + st.download_button( + "Download", + buf.getvalue(), + file_name=os.path.basename(path) + ".zip", + mime="application/zip", + key=f"dl_data_{idx}", + ) + else: + with open(path, "rb") as f: + file_data = f.read() + st.download_button( + "Download", + file_data, + file_name=os.path.basename(path), + 
key=f"dl_data_{idx}", + ) + else: + st.info("No datasets were generated for this query.") + # Download buttons st.markdown("---") # Add a separator diff --git a/test/plot_destine_data.py b/test/plot_destine_data.py index 87911dd..8690f29 100644 --- a/test/plot_destine_data.py +++ b/test/plot_destine_data.py @@ -1,9 +1,19 @@ -"""Quick script to inspect and plot DestinE Zarr data.""" +"""Quick script to inspect and plot DestinE Zarr data. + +Usage: + python plot_destine_data.py path/to/destine_167_sfc_20200101_20211231.zarr +""" + +import argparse +import sys import xarray as xr import matplotlib.pyplot as plt -zarr_path = "/Users/ikuznets/work/projects/climsight/code/climsight/tmp/sandbox/38c864498d174b8a90ebb24ac67cf70e/destine_data/destine_167_sfc_20200101_20211231.zarr" +parser = argparse.ArgumentParser(description="Inspect and plot a DestinE Zarr dataset.") +parser.add_argument("zarr_path", help="Path to the DestinE .zarr directory") +args = parser.parse_args() +zarr_path = args.zarr_path ds = xr.open_dataset(zarr_path, engine="zarr")