diff --git a/.gitignore b/.gitignore index 828fb59..77ffd89 100644 --- a/.gitignore +++ b/.gitignore @@ -154,4 +154,7 @@ data_mid_2/ data_mid_3/ data_prep/ data_prep_1/ -data_prep_2/ \ No newline at end of file +data_prep_2/ + +# static files +static_data/ \ No newline at end of file diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e5b0496..589fe01 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,8 @@ exclude: | src/cerf/data_acquisition_scrape\.py| src/disaster_charter/data_acquisition_scrape\.py| src/glide/data_acquisition_scrape\.py| - docs/NOTEBOOK_DATASETS\.md + docs/NOTEBOOK_DATASETS\.md| + README\.md )$ repos: diff --git a/Makefile b/Makefile index 90a0c28..143c2bd 100644 --- a/Makefile +++ b/Makefile @@ -44,7 +44,7 @@ run_idus_download: - @echo "Downloading IDUS dump → data_raw/idmc_idu/idus_all.json" - @mkdir -p data_raw/idmc_idu + @echo "Downloading IDUS dump → data/idmc_idu/idus_all.json" + @mkdir -p data/idmc_idu @curl -L --compressed \ - -o data_raw/idmc_idu/idus_all.json \ + -o data/idmc_idu/idus_all.json \ "https://helix-copilot-prod-helix-media-external.s3.amazonaws.com/external-media/api-dump/idus-all/2025-06-04-10-00-32/5mndO/idus_all.json" - @echo "✅ Saved (decompressed): data_raw/idmc_idu/idus_all.json" + @echo "✅ Saved (decompressed): data/idmc_idu/idus_all.json" diff --git a/README.md b/README.md index 9f49049..e28eb8e 100644 --- a/README.md +++ b/README.md @@ -1,167 +1,152 @@ # Disaster Impact Database -**Disaster Impact Database** is an open-source project designed to ingest, -process, and analyse disaster-related data from multiple sources. -The project reads raw data from Azure Blob Storage, normalises CSV files, -and lays the groundwork for future data consolidation and analysis. -Data sources include **GLIDE**, **GDACS**, **CERF**, **EMDAT**, -**IDMC**, **IFRC** and more. +The **Disaster Impact Database** is an open‑source initiative that collects, cleans, and harmonises disaster‑related data from multiple global providers. 
It produces a unified, analysis‑ready dataset that supports the Anticipatory Action Framework and broader humanitarian research. -## Project Purpose +> Supported sources: **GDACS · GLIDE · CERF · EM‑DAT · IDMC · IFRC‑DREF · Disaster Charter** -The primary goal is to build a unified disaster impact database that: +--- -- **Downloads raw data** from Azure Blob Storage. -- **Curates and normalises** data from various humanitarian and disaster sources. -- **Standardixes** data into consistent formats using JSON schemas. -- **Exports data** as normalised CSV files. -- **Prepares for future consolidation** by grouping events by type, country, -and event date (with a ±7 days window). +## Key Goals -## Project Structure +- **Automated downloads** of raw data where APIs exist. +- **Headless scraping** for semi‑automated sources. +- **Normalisation** into consistent, JSON‑schema‑validated tables (**under development**). +- **Export** of tidy CSVs for each provider. +- **Event matching** across feeds by hazard, country, and date (±7 days). +- **Exploratory analytics** that quantify overlap and uniqueness across sources. -```bash -. 
-├── docs # Documentation -├── LICENSE # Project license -├── Makefile # Automation commands -├── notebooks # Jupyter notebooks for data inspection and experimentation -├── poetry.lock # Poetry lock file for dependencies -├── poetry.toml # Poetry configuration -├── pyproject.toml # Project metadata and dependency management -├── README.md # This file -├── src # Source code modules -│ ├── cerf # CERF data processing (downloader, normalization, schema) -│ ├── data_consolidation # Future module for data consolidation tasks -│ ├── disaster_charter # Disaster Charter data processing -│ ├── emdat # EM-DAT data processing -│ ├── gdacs # GDACS data processing -│ ├── glide # GLIDE data processing -│ ├── idmc # IDMC data processing -│ ├── ifrc_eme # IFRC data processing -│ ├── unified # Unified schema, consolidated data, and blob upload utilities -│ └── utils # Utility scripts -├── static_data # Static reference data (e.g., country codes, event codes) -└── tests # Unit and integration tests -``` - -## Key Features +--- -- **Data Download**: Retrieve raw data directly from Azure Blob Storage. -- **Data Curation**: Clean and preprocess raw data. -- **Normalisation & Standardisation**: Process and flatten, -ensuring data from different sources is standardised. -- **Data Schemas**: Use JSON schemas to validate and enforce data structure consistency. -- **CSV Output**: Export normalized data -to CSV for downstream analysis. -- **Future Data Consolidation**: Group events by type, country, and event date -(with a ±7 days window) to create a consolidated dataset. -- **Automation**: Utilise Makefile commands for environment setup, -testing, linting, and more. 
+## Prerequisites -## Usage Instructions +| Requirement | Purpose | +|-------------|---------| +| **Python ≥ 3.10** | Core language | +| **Poetry** | Virtual‑env and dependency management | +| **Firefox** | Headless scraping engine | +| **GeckoDriver** | WebDriver interface for Firefox | +| **Unix‑like shell** | Tested on macOS, Linux, and Windows WSL2 | -### Environment Setup +> **Tip — GLIDE scraper profile** +> The GLIDE portal occasionally shows CAPTCHAs. Edit `src/glide/data_acquisition_scrape.py` and set `FIREFOX_PROFILE` to the absolute path of a persistent Firefox profile (e.g. `~/.mozilla/firefox/abcd1234.default-release`). -This project uses [Poetry](https://python-poetry.org/) for dependency management. -To set up your development environment: +--- -**Create and activate the virtual environment:** +## Quick Start - ```bash - make .venv - ``` +```bash +# 0 Install Poetry (skip if you already have it) +$ curl -sSL https://install.python-poetry.org | python3 - -### Running Normalisation Scripts +# 1  Clone the repo and enter it +$ git clone https://github.com/mapaction/disaster-impact.git +$ cd disaster-impact -Each data source module under `src` contains scripts for data normalisation. -For example, to run the normalisation process for GLIDE data: +# 2  Create the virtual environment (Poetry will be installed if missing) +$ make .venv -```bash -python -m src.glide.data_normalisation_glide +# 3  Activate the environment (Poetry makes this automatic in new shells) +$ poetry shell ``` -Replace `glide` with the appropriate module name for other data sources -(e.g., `gdacs`, `cerf`, etc.). +All Makefile targets below assume the environment is active. 
-### Automation with Makefile - - The included `Makefile` provides several automation commands: - - **Set up the environment:** +--- - ```bash - make .venv - ``` +## Data Acquisition -- **Run tests:** +| Dataset | Access Method | Historical Coverage | Makefile Target | Status | +|---------|---------------|---------------------|-----------------|--------| +| **GDACS** | REST API | 2000 – present | `make run_gdacs_download` | Automated | +| **IDMC IDU** | REST API | 2016 – present | `make run_idus_download` | Automated | +| **GLIDE** | Headless scrape | 1930 – present | `make run_glide_scrape` | Semi‑automated | +| **CERF** | Headless scrape | 2006 – present | `make run_cerf_scrape` | Semi‑automated | +| **Disaster Charter** | Headless scrape | 2000 – present | `make run_charter_scrape` | Semi‑automated | +| **EM‑DAT** | Manual download | 2000 – present | — | Manual | +| **IFRC DREF** | Manual download | 2018 – present | — | Manual | - ```bash - make test - ``` +Raw files are stored in `data/<provider>/`, preserving provenance and update timestamps. -- **Lint the code:** +--- - ```bash - make lint - ``` +## Processing & Analysis Workflow -- **Clean the environment:** +1. **Load** raw datasets (see `notebooks/process_sandbox.ipynb`). +2. **Pre‑process**: select columns, rename, parse dates, harmonise hazard labels. +3. **Match events** by hazard, ISO‑3 country code, and date window (±7 days). +4. **Generate analytics**: bar charts of retention/overlap and a chord diagram of pairwise matches. - ```bash - make clean - ``` +The notebook is fully reproducible; rerun it after refreshing data to obtain an updated master table. -## Testing, Linting, and Environment Cleanup +--- -- **Testing**: Run unit and integration tests located in the `tests` directory. +## Project Structure - ```bash - make test - ``` +```text +. 
+├── data/ # Raw datasets (one sub‑folder per provider) +├── docs/ # Additional documentation +├── notebooks/ # Jupyter notebooks for ETL and analysis +├── src/ # Source code modules +│   ├── cerf/ +│   ├── disaster_charter/ +│   ├── emdat/ +│   ├── gdacs/ +│   ├── glide/ +│   ├── idmc/ +│   ├── ifrc_dref/ +│   ├── unified/ # Unified schema & helpers +│   └── utils/ +├── static_data/ # Reference tables (e.g., country & hazard codes) +├── tests/ # Unit & integration tests +├── Makefile # Automation commands +├── pyproject.toml # Project metadata +└── README.md # This file +``` -- **Linting**: Check code quality with linting tools. +--- - ```bash - make lint - ``` +## Common Make Targets -- **Clean Environment**: Remove temporary files and reset the environment as needed. +| Target | Action | +|--------|--------| +| `.venv` | Bootstrap the Poetry virtual‑env | +| `test` | Run the test suite (`pytest`) | +| `lint` | Run `ruff` and `mypy` checks | +| `clean` | Remove virtual‑env, caches & temporary files | +| `run_<source>_download` | Refresh a specific feed (see table above) | - ```bash - make clean - ``` +--- -## Development Notes & Key Scripts +## Limitations & Roadmap -- **Key Scripts:** - - **Normalization:** `src/*/data_normalisation*.py` - - **JSON Schemas:** Located in each module (e.g., `src/cerf/cerf_schema.json`) - - **CSV Processing:** `src/utils/combine_csv.py`, `src/utils/splitter.py` - - **Future Consolidation:** `src/data_consolidation/` +- **Matching logic** is intentionally conservative; multi‑country or slow‑onset events may be under‑linked. +- **ETL pipeline** is notebook‑driven; migration to a parametric workflow (e.g., Airflow, Dagster) is planned. +- **Manual feeds** (EM‑DAT, IFRC‑DREF) need scripted ingestion once stable APIs become available. +- **Funding gap** stalled development after the HNPW 2025 demo; contributions are welcome to resume full ETL work. -- **Development Notes:** - - Update JSON schemas as the data structure evolves. 
- - Extend the Makefile for additional automation tasks. - - Contributions to enhance data consolidation features are highly encouraged. +--- ## Contributing -Contributions are welcome! To contribute: -- Clone the repository and create a branch from `main`. -- Submit pull requests with detailed descriptions of your changes. +1. Fork the repository and create a feature branch from `main`. +2. Commit logical, well‑documented changes. +3. Ensure `make test lint` passes. +4. Open a pull request; the CI pipeline will run automatically. + +--- ## License -This project is licensed under the GNU GENERAL PUBLIC license. -See the [LICENSE](./LICENSE) file for details. +Distributed under the **GNU GPL v3**. See the [LICENSE](LICENSE) file for details. -## Author Information +--- + +## Author -- **Author:** ediakatos -- **Contact:** ediakatos@mapaction.org +**Evangelos Diakatos** · ediakatos@mapaction.org --- -Thank you for using the Disaster Impact Database! -For issues or feature requests, please open an issue on GitHub. Happy coding! +*Happy coding & stay safe!* + diff --git a/docs/TABLES.md b/docs/TABLES.md deleted file mode 100644 index fa5bee9..0000000 --- a/docs/TABLES.md +++ /dev/null @@ -1,41 +0,0 @@ - -# Event Pairing and Linking Logic - -This document outlines the logic for pairing and linking disaster events across -multiple datasets to build a unified global disaster database. - ---- - ---- - -## **Output Schema** - -| **Field** | **Description** -|-------------------------|--------------------------- -| `event_ID` | Unified event identifier. -| `related_event_ID` | For linked but distinct events. -| `ev_sdate`, `ev_fdate` | Start and end dates for the event. -| `ev_ISO3s` | ISO3 country code of the event. -| `all_hazs_Ab` | Standardized hazard abbreviation. -| `imp_value` | Consolidated impact metrics. -| `approved_amount` | Consolidated funding data. -| `sources` | List of datasets contributing to the record. 
-| `relationship_type` | Describes the relationship (e.g., "caused_by", "funded_by"). - ---- - -## **Manual Pairing Process** - -For each event: - -1. Review `event_ID` (Blue) and align identical events. -2. Cross-check dates (Green) and hazard types (Orange) for temporal and hazard matching. -3. Compare names/descriptions (Yellow) for fuzzy matches. -4. Validate spatial proximity using locations (Pink) or calculated distances (Cyan). -5. Ensure financial and impact metrics align for linked funding (Purple). -6. Document pairing in a QA column for tracking. - ---- - -This **README.md** provides a comprehensive structure and color-coded guide -for pairing and linking disaster events across datasets. diff --git a/notebooks/exploration.ipynb b/notebooks/exploration.ipynb deleted file mode 100644 index 5b5fb07..0000000 --- a/notebooks/exploration.ipynb +++ /dev/null @@ -1,97 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Notebook: Exploration of csv files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"Exploration notebook for data analysis.\n", - "\n", - "This notebook contains data exploration steps for disaster analysis.\n", - "\"\"\"\n", - "\n", - "import sys\n", - "from pathlib import Path\n", - "\n", - "import pandas as pd\n", - "\n", - "module_path = Path(\"..\").resolve()\n", - "sys.path.append(str(module_path))" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "metadata": {}, - "outputs": [], - "source": [ - "def read_dat(dat_file: str) -> pd:\n", - " \"\"\"Reads a CSV file from the data_prep directory.\"\"\"\n", - " dat_dir = Path(\"../data_prep/\").resolve()\n", - " dat_path = dat_dir / dat_file\n", - " return pd.read_csv(dat_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "glide_prep_df = read_dat(\"glide_prep.csv\")\n", - "gdacs_prep_df = 
read_dat(\"gdacs_prep.csv\")\n", - "emdat_prep_df = read_dat(\"emdat_prep.csv\")\n", - "disaster_charter_df = read_dat(\"disaster_charter_prep.csv\")\n", - "cerf_df = read_dat(\"cerf_prep.csv\")\n", - "idmc_df = read_dat(\"idmc_prep.csv\")\n", - "ifrc_df = read_dat(\"ifrc_prep.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": {}, - "outputs": [], - "source": [ - "pre_dfs = [\n", - " glide_prep_df,\n", - " gdacs_prep_df,\n", - " emdat_prep_df,\n", - " disaster_charter_df,\n", - " cerf_df,\n", - " idmc_df,\n", - " ifrc_df,\n", - "]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}