Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ data/*
data1
data
data_raw/
data_v2

data_mid/
data_out/
Expand Down
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ run_disaster_charter_download:
@poetry run python -m src.disaster_charter.data_acquisition_scrape

run_idus_download:
@echo "Downloading IDUS dump → data_raw/idmc_idu/idus_all.json"
@mkdir -p data_raw/idmc_idu
@echo "Downloading IDUS dump → data/idmc_idu/idus_all.json"
@mkdir -p data/idmc_idu
@curl -L --compressed \
-o data/idmc_idu/idus_all.json \
"https://helix-copilot-prod-helix-media-external.s3.amazonaws.com/external-media/api-dump/idus-all/2025-06-04-10-00-32/5mndO/idus_all.json"
@echo "✅ Saved (decompressed): data_raw/idmc_idu/idus_all.json"
@echo "✅ Saved (decompressed): data/idmc_idu/idus_all.json"

run_glide_normal:
@echo "Running Glide normalisation"
Expand Down
4 changes: 2 additions & 2 deletions docs/DATASETS.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ update any of the datasets:
1. Visit the source link listed in the table below.
2. Locate and download any new records since the last extraction.
3. Either:
- Append new entries to the existing CSVs stored in the Azure blob, or
- Append new entries to the existing CSVs stored in the Azure blob or locally, or
- Replace the file entirely with a newly exported version.
4. Upload updated files to the correct path inside the
`disaster-impact/raw/` container.
`disaster-impact/data/` container.

**Important**: Always preserve the folder structure to avoid breaking downstream
processes.
Expand Down
146 changes: 41 additions & 105 deletions notebooks/process_sandbox.ipynb

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion src/cerf/data_acquisition_scrape.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from datetime import datetime
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

os.makedirs("./data/cerf/", exist_ok=True)

BASE_URL = "https://cerf.un.org/what-we-do/allocation/all/emergency/"
DETAILS_BASE_URL = "https://cerf.un.org/what-we-do/allocation/"

Expand Down Expand Up @@ -155,7 +158,7 @@ def main():
all_table_data.extend(table_data)

df = pd.DataFrame(all_table_data)
output_csv = "./data_raw/cerf/cerf_emergency_data.csv"
output_csv = "./data/cerf/cerf_emergency_data_dynamic_web_scrape.csv"
df.to_csv(output_csv, index=False)

print(f"Data saved to {output_csv}")
Expand Down
2 changes: 1 addition & 1 deletion src/disaster_charter/data_acquisition_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
HEADERS = {"User-Agent": "Mozilla/5.0"}
SLEEP_SEC = 0.35 # polite delay between page hits

OUT_DIR = "./data_raw/disaster_charter/"
OUT_DIR = "./data/disaster-charter/"
CSV_PATH = os.path.join(OUT_DIR, "disaster_activations_web_scrape_2000_2025.csv")
os.makedirs(OUT_DIR, exist_ok=True)

Expand Down
2 changes: 1 addition & 1 deletion src/gdacs/data_acquisition_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import requests

SEARCH_URL = "https://www.gdacs.org/gdacsapi/api/events/geteventlist/SEARCH"
OUTPUT_DIR = "./data_raw/gdacs/"
OUTPUT_DIR = "./data/gdacs/"
pathlib.Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)


Expand Down
21 changes: 11 additions & 10 deletions src/glide/data_acquisition_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@
from selenium.webdriver.support.ui import WebDriverWait

URL = "https://glidenumber.net/glide/public/result/report.jsp"
GECKODRIVER_PATH = "/usr/local/bin/geckodriver"
PROFILE_PATH = (
"/home/evangelos/snap/firefox/common/.mozilla/firefox/"
"cf7shfvv.selenium_profile"
)
# GECKODRIVER_PATH = "/usr/local/bin/geckodriver"
GECKODRIVER_PATH = "/opt/homebrew/bin/geckodriver"
# PROFILE_PATH = (
# "/home/evangelos/snap/firefox/common/.mozilla/firefox/"
# "cf7shfvv.selenium_profile"
# )
PROFILE_PATH = "/Users/evangelosdiakatossaoulas/Library/Application Support/Firefox/Profiles/r0zabgcj.default-release"


Path("./data_raw/glide_v2/").mkdir(parents=True, exist_ok=True)
CSV_OUTPUT = "./data_raw/glide/glide_data_combined_all.csv"
Path("./data/glide/").mkdir(parents=True, exist_ok=True)
CSV_OUTPUT = "./data/glide/glide_events.csv"

def scrape_with_selenium() -> str:
"""Use Selenium to interact with the Glide Number website and return the rendered.
Expand All @@ -29,8 +30,8 @@ def scrape_with_selenium() -> str:
"""
options = FirefoxOptions()
options.headless = False # type: ignore[attr-defined]
options.add_argument("-profile")
options.add_argument(PROFILE_PATH)
# options.add_argument("-profile")
# options.add_argument(PROFILE_PATH)

service = FirefoxService(GECKODRIVER_PATH)
driver = webdriver.Firefox(service=service, options=options)
Expand Down