diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a239a1d..fc1d04d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,6 +32,9 @@ jobs: curl -sSL https://install.python-poetry.org | python3 - --version $POETRY_VERSION poetry --version + - name: Add Poetry to PATH + run: echo "$HOME/.local/bin" >> $GITHUB_PATH + - name: Cache Poetry virtual environment uses: actions/cache@v3 with: diff --git a/README.md b/README.md index 81098bc..9f49049 100644 --- a/README.md +++ b/README.md @@ -66,18 +66,12 @@ testing, linting, and more. This project uses [Poetry](https://python-poetry.org/) for dependency management. To set up your development environment: -1. **Create and activate the virtual environment:** +**Create and activate the virtual environment:** ```bash make .venv ``` -2. **Install project dependencies:** - - ```bash - poetry install - ``` - ### Running Normalisation Scripts Each data source module under `src` contains scripts for data normalisation. diff --git a/notebooks/exploration.ipynb b/notebooks/exploration.ipynb new file mode 100644 index 0000000..7d8339c --- /dev/null +++ b/notebooks/exploration.ipynb @@ -0,0 +1,221 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notebook: Exploration of csv files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"Exploration notebook for data analysis.\n", + "\n", + "This notebook contains data exploration steps for disaster analysis.\n", + "\"\"\"\n", + "\n", + "import hashlib\n", + "import sys\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "\n", + "from src.data_consolidation.dictionary import STANDARD_COLUMNS\n", + "\n", + "module_path = Path(\"..\").resolve()\n", + "sys.path.append(str(module_path))" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [], + "source": [ + "def read_dat(dat_file: str) -> pd:\n", + " \"\"\"Reads a CSV file from the data_prep directory.\"\"\"\n", + " dat_dir = Path(\"../data_prep/\").resolve()\n", + " dat_path = dat_dir / dat_file\n", + " return pd.read_csv(dat_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "glide_prep_df = read_dat(\"glide_prep.csv\")\n", + "gdacs_prep_df = read_dat(\"gdacs_prep.csv\")\n", + "emdat_prep_df = read_dat(\"emdat_prep.csv\")\n", + "disaster_charter_df = read_dat(\"disaster_charter_prep.csv\")\n", + "cerf_df = read_dat(\"cerf_prep.csv\")\n", + "idmc_df = read_dat(\"idmc_prep.csv\")\n", + "ifrc_df = read_dat(\"ifrc_prep.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "pre_dfs = [\n", + " glide_prep_df,\n", + " gdacs_prep_df,\n", + " emdat_prep_df,\n", + " disaster_charter_df,\n", + " cerf_df,\n", + " idmc_df,\n", + " ifrc_df,\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_18473/1884474460.py:8: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.\n", + " all_data = pd.concat(pre_dfs, ignore_index=True)\n" + ] + } + ], + "source": [ + "for i, df in enumerate(pre_dfs):\n", + " missing_cols = set(STANDARD_COLUMNS) - set(df.columns)\n", + " for col in missing_cols:\n", + " df[col] = None\n", + " df_standard = df[STANDARD_COLUMNS]\n", + " pre_dfs[i] = df_standard\n", + "\n", + "all_data = pd.concat(pre_dfs, ignore_index=True)\n", + "all_data[\"Date\"] = pd.to_datetime(all_data[\"Date\"], errors=\"coerce\")\n", + "group_key = [\"Event_Type\", \"Country\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [], + "source": [ + "def consolidate_group(group: pd.DataFrame) -> dict:\n", + " \"\"\"Consolidates a group of data.\"\"\"\n", + " consolidated_row = {}\n", + " event_ids = sorted(set(group[\"Source_Event_IDs\"].dropna().astype(str).tolist()))\n", + " consolidated_row[\"Event_ID\"] = event_ids\n", + " unique_str = \"|\".join(event_ids)\n", + " disaster_impact_id = \"DI_\" + hashlib.sha256(unique_str.encode(\"utf-8\")).hexdigest()\n", + " consolidated_row[\"Disaster_Impact_ID\"] = disaster_impact_id\n", + " for column in group.columns:\n", + " if column in group_key or column in [\"Event_ID\", \"Disaster_Impact_ID\"]:\n", + " if column == \"Disaster_Impact_ID\":\n", + " continue\n", + " consolidated_row[column] = sorted(\n", + " set(group[column].dropna().astype(str).tolist()),\n", + " )\n", + " else:\n", + " values = group[column].dropna().tolist()\n", + " if values:\n", + " if all(isinstance(val, list) for val in values):\n", + " flat_values = [item for sublist in values for item in sublist]\n", + " consolidated_row[column] = sorted(set(map(str, flat_values)))\n", + " else:\n", + " consolidated_row[column] = sorted(set(map(str, values)))\n", + " else:\n", + " consolidated_row[column] = None\n", + " return consolidated_row" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": {}, + "outputs": [], + "source": [ + "all_data[\"Date_Group\"] = all_data[\"Date\"].apply(\n", + " lambda x: (\n", + " x - pd.Timedelta(days=7),\n", + " x + pd.Timedelta(days=7),\n", + " )\n", + " if pd.notna(x)\n", + " else (None, None),\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def group_by_date_range(data: pd.DataFrame, date_col: str) -> list:\n", + " \"\"\"Groups data by date range.\"\"\"\n", + " rows = []\n", + " used_indices = set()\n", + " for idx, row in data.iterrows():\n", + " if idx in used_indices or pd.isna(row[date_col]):\n", + " continue\n", + " matching_rows = data[\n", + " (data[date_col] >= row[\"Date_Group\"][0])\n", + " & (data[date_col] <= row[\"Date_Group\"][1])\n", + " & (data[\"Event_Type\"] == row[\"Event_Type\"])\n", + " & (data[\"Country\"] == row[\"Country\"])\n", + " ]\n", + " used_indices.update(matching_rows.index)\n", + " rows.append(consolidate_group(matching_rows))\n", + " return rows\n", + "\n", + "\n", + "unified_rows = group_by_date_range(all_data, \"Date\")\n", + "unified_df = pd.DataFrame(unified_rows)\n", + "unified_df = unified_df[STANDARD_COLUMNS]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Matrix is empty (all zeros); no Circos plot to display.\n" + ] + } + ], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/poetry.lock b/poetry.lock index 142c6cd..bbe54ed 100644 --- a/poetry.lock +++ b/poetry.lock @@ -72,6 +72,21 @@ docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphi tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] tests-mypy = ["mypy (>=1.11.1)", "pytest-mypy-plugins"] +[[package]] +name = "autopep8" +version = "2.3.2" +description = "A tool that automatically formats Python code to conform to the PEP 8 style guide" +optional = false +python-versions = ">=3.9" +files = [ + {file = "autopep8-2.3.2-py2.py3-none-any.whl", hash = "sha256:ce8ad498672c845a0c3de2629c15b635ec2b05ef8177a6e7c91c74f3e9b51128"}, + {file = "autopep8-2.3.2.tar.gz", hash = "sha256:89440a4f969197b69a995e4ce0661b031f455a9f776d2c5ba3dbd83466931758"}, +] + +[package.dependencies] +pycodestyle = ">=2.12.0" +tomli = {version = "*", markers = "python_version < \"3.11\""} + [[package]] name = "azure-core" version = "1.32.0" @@ -1948,6 +1963,26 @@ files = [ msal = ">=1.29,<2" portalocker = ">=1.4,<3" +[[package]] +name = "nbqa" +version = "1.9.1" +description = "Run any standard Python code quality tool on a Jupyter Notebook" +optional = false +python-versions = ">=3.9" +files = [ + {file = "nbqa-1.9.1-py3-none-any.whl", hash = "sha256:95552d2f6c2c038136252a805aa78d85018aef922586270c3a074332737282e5"}, + {file = "nbqa-1.9.1.tar.gz", hash = "sha256:a1f4bcf587c597302fed295951001fc4e1be4ce0e77e1ab1b25ac2fbe3db0cdd"}, +] + +[package.dependencies] +autopep8 = ">=1.5" +ipython = ">=7.8.0" +tokenize-rt = ">=3.2.0" +tomli = "*" + +[package.extras] +toolchain = ["black", "blacken-docs", "flake8", "isort", "jupytext", "mypy", "pylint", "pyupgrade", "ruff"] + [[package]] name = "nest-asyncio" version = "1.6.0" @@ -3994,6 +4029,17 @@ files = [ {file = "text_unidecode-1.3-py2.py3-none-any.whl", hash = "sha256:1311f10e8b895935241623731c2ba64f4c455287888b18189350b67134a822e8"}, ] +[[package]] +name = "tokenize-rt" +version = "6.1.0" +description = "A wrapper around the stdlib `tokenize` which roundtrips." +optional = false +python-versions = ">=3.9" +files = [ + {file = "tokenize_rt-6.1.0-py2.py3-none-any.whl", hash = "sha256:d706141cdec4aa5f358945abe36b911b8cbdc844545da99e811250c0cee9b6fc"}, + {file = "tokenize_rt-6.1.0.tar.gz", hash = "sha256:e8ee836616c0877ab7c7b54776d2fefcc3bde714449a206762425ae114b53c86"}, +] + [[package]] name = "tomli" version = "2.0.2" @@ -4373,4 +4419,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "f9a97deb62b63308062f23af2146692b8c76ba4f5e1abdc2dc03292e6aa07c41" +content-hash = "a4df48d04e23f1ebe6f2e8f0517f4945124c081a90a696503731b5f9b28f60a7" diff --git a/pyproject.toml b/pyproject.toml index d3ed39b..16a8d7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ pycirclize = "^1.8.0" circlify = "^0.15.0" azure-storage-blob = "^12.24.1" azure-identity = "^1.19.0" +nbqa = "^1.9.1" [tool.poetry.group.dev.dependencies] pre-commit = "^3.8.0"