Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ data/*
data1
data
data_raw/
data_v2

data_mid/
data_out/
Expand Down
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ run_disaster_charter_download:
@poetry run python -m src.disaster_charter.data_acquisition_scrape

run_idus_download:
@echo "Downloading IDUS dump → data_raw/idmc_idu/idus_all.json"
@mkdir -p data_raw/idmc_idu
@echo "Downloading IDUS dump → data/idmc_idu/idus_all.json"
@mkdir -p data/idmc_idu
@curl -L --compressed \
-o data/idmc_idu/idus_all.json \
"https://helix-copilot-prod-helix-media-external.s3.amazonaws.com/external-media/api-dump/idus-all/2025-06-04-10-00-32/5mndO/idus_all.json"
@echo "✅ Saved (decompressed): data_raw/idmc_idu/idus_all.json"
@echo "✅ Saved (decompressed): data/idmc_idu/idus_all.json"

run_glide_normal:
@echo "Running Glide normalisation"
Expand Down
4 changes: 2 additions & 2 deletions docs/DATASETS.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ update any of the datasets:
1. Visit the source link listed in the table below.
2. Locate and download any new records since the last extraction.
3. Either:
- Append new entries to the existing CSVs stored in the Azure blob, or
- Append new entries to the existing CSVs stored in the Azure blob or locally, or
- Replace the file entirely with a newly exported version.
4. Upload updated files to the correct path inside the
`disaster-impact/raw/` container.
`disaster-impact/data/` container.

**Important**: Always preserve the folder structure to avoid breaking downstream
processes.
Expand Down
146 changes: 41 additions & 105 deletions notebooks/process_sandbox.ipynb

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion src/cerf/data_acquisition_scrape.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from datetime import datetime
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

os.makedirs("./data/cerf/", exist_ok=True)

BASE_URL = "https://cerf.un.org/what-we-do/allocation/all/emergency/"
DETAILS_BASE_URL = "https://cerf.un.org/what-we-do/allocation/"

Expand Down Expand Up @@ -155,7 +158,7 @@ def main():
all_table_data.extend(table_data)

df = pd.DataFrame(all_table_data)
output_csv = "./data_raw/cerf/cerf_emergency_data.csv"
output_csv = "./data/cerf/cerf_emergency_data_dynamic_web_scrape.csv"
df.to_csv(output_csv, index=False)

print(f"Data saved to {output_csv}")
Expand Down
2 changes: 1 addition & 1 deletion src/disaster_charter/data_acquisition_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
HEADERS = {"User-Agent": "Mozilla/5.0"}
SLEEP_SEC = 0.35 # polite delay between page hits

OUT_DIR = "./data_raw/disaster_charter/"
OUT_DIR = "./data/disaster-charter/"
CSV_PATH = os.path.join(OUT_DIR, "disaster_activations_web_scrape_2000_2025.csv")
os.makedirs(OUT_DIR, exist_ok=True)

Expand Down
2 changes: 1 addition & 1 deletion src/gdacs/data_acquisition_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import requests

SEARCH_URL = "https://www.gdacs.org/gdacsapi/api/events/geteventlist/SEARCH"
OUTPUT_DIR = "./data_raw/gdacs/"
OUTPUT_DIR = "./data/gdacs/"
pathlib.Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)


Expand Down
21 changes: 11 additions & 10 deletions src/glide/data_acquisition_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,16 @@
from selenium.webdriver.support.ui import WebDriverWait

URL = "https://glidenumber.net/glide/public/result/report.jsp"
GECKODRIVER_PATH = "/usr/local/bin/geckodriver"
PROFILE_PATH = (
"/home/evangelos/snap/firefox/common/.mozilla/firefox/"
"cf7shfvv.selenium_profile"
)
# GECKODRIVER_PATH = "/usr/local/bin/geckodriver"
GECKODRIVER_PATH = "/opt/homebrew/bin/geckodriver"
# PROFILE_PATH = (
# "/home/evangelos/snap/firefox/common/.mozilla/firefox/"
# "cf7shfvv.selenium_profile"
# )
PROFILE_PATH = "/Users/evangelosdiakatossaoulas/Library/Application Support/Firefox/Profiles/r0zabgcj.default-release"


Path("./data_raw/glide_v2/").mkdir(parents=True, exist_ok=True)
CSV_OUTPUT = "./data_raw/glide/glide_data_combined_all.csv"
Path("./data/glide/").mkdir(parents=True, exist_ok=True)
CSV_OUTPUT = "./data/glide/glide_events.csv"

def scrape_with_selenium() -> str:
"""Use Selenium to interact with the Glide Number website and return the rendered.
Expand All @@ -29,8 +30,8 @@ def scrape_with_selenium() -> str:
"""
options = FirefoxOptions()
options.headless = False # type: ignore[attr-defined]
options.add_argument("-profile")
options.add_argument(PROFILE_PATH)
# options.add_argument("-profile")
# options.add_argument(PROFILE_PATH)

service = FirefoxService(GECKODRIVER_PATH)
driver = webdriver.Firefox(service=service, options=options)
Expand Down