mapaction · ediakatos · Feb 28, 2025 · Feb 28, 2025 · Feb 28, 2025 · Feb 28, 2025
diff --git a/README.md b/README.md
@@ -66,18 +66,12 @@ testing, linting, and more.
 This project uses [Poetry](https://python-poetry.org/) for dependency management.
 To set up your development environment:
 
-1. **Create and activate the virtual environment:**
+**Create and activate the virtual environment:**
 
    ```bash
    make .venv
    ```
 
-2. **Install project dependencies:**
-
-   ```bash
-   poetry install
-   ```
-
 ### Running Normalisation Scripts
 
 Each data source module under `src` contains scripts for data normalisation.

diff --git a/notebooks/exploration.ipynb b/notebooks/exploration.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Notebook: Exploration of CSV files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\"\"\"Exploration notebook for data analysis.\n",
+    "\n",
+    "This notebook contains data exploration steps for disaster analysis.\n",
+    "\"\"\"\n",
+    "\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import pandas as pd\n",
+    "\n",
+    "from src.data_consolidation.dictionary import STANDARD_COLUMNS\n",
+    "\n",
+    "module_path = Path(\"..\").resolve()\n",
+    "sys.path.append(str(module_path))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def read_dat(dat_file: str) -> pd.DataFrame:\n",
+    "    \"\"\"Read a CSV file from the data_prep directory into a DataFrame.\"\"\"\n",
+    "    dat_dir = Path(\"../data_prep/\").resolve()\n",
+    "    dat_path = dat_dir / dat_file\n",
+    "    return pd.read_csv(dat_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "glide_prep_df = read_dat(\"glide_prep.csv\")\n",
+    "gdacs_prep_df = read_dat(\"gdacs_prep.csv\")\n",
+    "emdat_prep_df = read_dat(\"emdat_prep.csv\")\n",
+    "disaster_charter_df = read_dat(\"disaster_charter_prep.csv\")\n",
+    "cerf_df = read_dat(\"cerf_prep.csv\")\n",
+    "idmc_df = read_dat(\"idmc_prep.csv\")\n",
+    "ifrc_df = read_dat(\"ifrc_prep.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pre_dfs = [\n",
+    "    glide_prep_df,\n",
+    "    gdacs_prep_df,\n",
+    "    emdat_prep_df,\n",
+    "    disaster_charter_df,\n",
+    "    cerf_df,\n",
+    "    idmc_df,\n",
+    "    ifrc_df,\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_18473/1884474460.py:8: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n",
+      "  all_data = pd.concat(pre_dfs, ignore_index=True)\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i, df in enumerate(pre_dfs):\n",
+    "    missing_cols = set(STANDARD_COLUMNS) - set(df.columns)\n",
+    "    for col in missing_cols:\n",
+    "        df[col] = None\n",
+    "    df_standard = df[STANDARD_COLUMNS]\n",
+    "    pre_dfs[i] = df_standard\n",
+    "\n",
+    "all_data = pd.concat(pre_dfs, ignore_index=True)\n",
+    "all_data[\"Date\"] = pd.to_datetime(all_data[\"Date\"], errors=\"coerce\")\n",
+    "group_key = [\"Event_Type\", \"Country\"]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -29,6 +29,7 @@ pycirclize = "^1.8.0"
 circlify = "^0.15.0"
 azure-storage-blob = "^12.24.1"
 azure-identity = "^1.19.0"
+nbqa = "^1.9.1"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = "^3.8.0"