Skip to content

[REQUEST] Add Bosnia and Herz. #85

@thiagovmdon

Description

@thiagovmdon

Data source
Website: https://vodostaji.voda.ba/#2031
Terms of use: https://vodostaji.voda.ba/#Impressum

Type of data portal
No API access, but data files can be downloaded from fixed endpoints. No registration is required, but data are only available for up to one year back.

Country/Countries/Regions
Bosnia and Herzegovina

Variables
Hourly discharge, water temperature, and stage.

Station list
https://vodostaji.voda.ba/data/internet/layers/20/index.json
(But also implemented in the code below)

Code snippets
Metadata/list of stations:

import requests
import pandas as pd
import numpy as np


def get_vodostaji_metadata() -> pd.DataFrame:
    """
    Fetch the latest snapshot data (value + timestamp + metadata)
    from vodostaji.voda.ba.

    - Keeps ALL original columns
    - Renames only the selected identifier/location fields
    - Converts coordinate and catchment-area columns to numbers

    Returns
    -------
    pd.DataFrame
        One row per record of the layer index, with the renamed columns
        ('gauge_id', 'station_name', 'river', 'catchment', 'latitude',
        'longitude', 'area') where the source provides them, plus the
        constant columns 'country' and 'source'.
        An empty DataFrame is returned (and a message printed) when the
        request fails or the payload is not a non-empty JSON list.
    """

    url = "https://vodostaji.voda.ba/data/internet/layers/20/index.json"

    # --- Step 1: Fetch JSON ---
    # Only the network/decoding calls live in the try body; ValueError covers
    # JSON decoding errors raised by requests versions that do not wrap them.
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        data = r.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Failed to fetch snapshot data: {e}")
        return pd.DataFrame()

    if not isinstance(data, list) or not data:
        print("No snapshot records returned.")
        return pd.DataFrame()

    # --- Step 2: Normalize JSON ---
    df_all = pd.json_normalize(data)

    # --- Step 3: Rename only selected fields ---
    rename_map = {
        # identifiers
        "metadata_station_no": "gauge_id",
        "metadata_station_name": "station_name",
        "metadata_river_name": "river",
        "metadata_catchment_name": "catchment",
        "metadata_station_latitude": "latitude",
        "metadata_station_longitude": "longitude",
        "metadata_CATCHMENT_SIZE": "area",
    }
    df_all = df_all.rename(columns=rename_map)

    # --- Step 4: Catchment area as a number ---
    # The catchment size has already been renamed to "area" above, so the
    # unit suffix must be stripped from that column; a plain to_numeric
    # would coerce values such as "123 km²" to NaN.
    if "area" in df_all.columns:
        area_text = (
            df_all["area"]
            .astype(str)
            .str.replace("km²", "", regex=False)
            .str.strip()
        )
        df_all["area"] = pd.to_numeric(area_text, errors="coerce")

    # --- Step 5: Remaining numeric conversions ---
    numeric_cols = [
        "latitude",
        "longitude",
        "metadata_station_carteasting",
        "metadata_station_cartnorthing",
        "metadata_station_local_x",
        "metadata_station_local_y",
    ]

    for col in numeric_cols:
        if col in df_all.columns:
            df_all[col] = pd.to_numeric(df_all[col], errors="coerce")

    # --- Step 6: Add static metadata ---
    df_all["country"] = "Bosnia and Herzegovina"
    df_all["source"] = "vodostaji.voda.ba"

    # --- Step 7: Basic cleanup ---
    df_all = df_all.reset_index(drop=True)

    return df_all

Time-series:

import requests
import pandas as pd
import numpy as np
from typing import Optional
from io import BytesIO


def get_vodostaji_data(
    gauge_id: str,
    variable: str,
    frequency: str = "instantaneous",
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    station_groups=range(1, 11),
) -> pd.DataFrame:
    """
    Download hydrological time series data from vodostaji.voda.ba,
    automatically detecting the correct station group.

    Parameters
    ----------
    gauge_id : str
        Station number (e.g. '4411')
    variable : str
        'discharge', 'stage', or 'temperature' (case-insensitive)
    frequency : str
        'instantaneous' or 'daily' (case-insensitive); 'daily' returns
        the mean of the readings of each calendar day
    start_date, end_date : str, optional
        ISO dates ('YYYY-MM-DD'). Both bounds are inclusive; an end date
        without a time component covers that whole day.
    station_groups : iterable
        Station group IDs to try (default: 1..10)

    Returns
    -------
    pd.DataFrame
        Columns: ['time', '<variable>'], sorted by time. Empty (with the
        same columns) when no group yields a usable file. On success,
        ``df.attrs`` carries 'station_group', 'station_id' and 'variable'.

    Raises
    ------
    ValueError
        If ``variable`` or ``frequency`` is not one of the accepted values.
    """

    variable = variable.lower()
    frequency = frequency.lower()

    if variable not in ("discharge", "stage", "temperature"):
        raise ValueError("variable must be 'discharge', 'stage', or 'temperature'.")

    if frequency not in ("instantaneous", "daily"):
        raise ValueError("frequency must be 'instantaneous' or 'daily'.")

    # --- Variable mapping ---
    var_map = {
        "discharge": {"code": "Q", "file": "Q_1Y.xlsx"},
        "stage": {"code": "H", "file": "H_1Y.xlsx"},
        "temperature": {"code": "WT", "file": "Tvode_1Y.xlsx"},
    }

    var_code = var_map[variable]["code"]
    filename = var_map[variable]["file"]

    # --- Try station groups until one works ---
    content = None
    used_group = None

    for group in station_groups:
        base_url = f"https://vodostaji.voda.ba/data/internet/stations/{group}"
        url = f"{base_url}/{gauge_id}/{var_code}/{filename}"

        try:
            r = requests.get(url, timeout=20)
        except requests.RequestException:
            # Best effort: a network failure for one group must not stop
            # the search through the remaining groups.
            continue

        # Size threshold presumably filters out placeholder/error
        # responses served with HTTP 200 -- TODO confirm the 10 kB limit.
        if r.status_code == 200 and len(r.content) > 10_000:
            content = r.content
            used_group = group
            break

    if content is None:
        return pd.DataFrame(columns=["time", variable])

    # --- Read Excel (skip metadata header) ---
    df = pd.read_excel(
        BytesIO(content),
        skiprows=8,
        names=["time", variable],
    )

    if df.empty:
        return pd.DataFrame(columns=["time", variable])

    # --- Parse types ---
    df["time"] = pd.to_datetime(df["time"], dayfirst=True, errors="coerce")
    df[variable] = pd.to_numeric(df[variable], errors="coerce")
    df = df.dropna(subset=["time", variable])

    # De-duplicate before any aggregation so a repeated timestamp is not
    # counted twice in a daily mean.
    df = df.drop_duplicates(subset="time", keep="first")

    # --- Date filtering ---
    if start_date:
        df = df[df["time"] >= pd.to_datetime(start_date)]
    if end_date:
        end = pd.to_datetime(end_date)
        if end == end.normalize():
            # A date-only bound parses to midnight; comparing with <= would
            # drop every sub-daily reading of the end day except 00:00.
            df = df[df["time"] < end + pd.Timedelta(days=1)]
        else:
            df = df[df["time"] <= end]

    # --- Aggregate if needed ---
    if frequency == "daily":
        df = (
            df.set_index("time")
            .resample("D")
            .mean()
            .dropna()
            .reset_index()
        )

    # --- Final cleanup ---
    df = df.sort_values("time").reset_index(drop=True)

    # Optional: attach metadata
    df.attrs["station_group"] = used_group
    df.attrs["station_id"] = gauge_id
    df.attrs["variable"] = variable

    return df

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancement: New feature or request

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions