-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Labels
enhancement — New feature or request
Description
Data source
Website: https://vodostaji.voda.ba/#2031
Terms of use: https://vodostaji.voda.ba/#Impressum
Type of data portal
No formal API access, but direct endpoint downloads. No registration required; data is available up to one year back.
Country/Countries/Regions
Bosnia and Herzegovina
Variables
hourly discharge, water temperature and stage.
Station list
https://vodostaji.voda.ba/data/internet/layers/20/index.json
(But also implemented in the code below)
Code snippets
Metadata/list of stations:
import requests
import pandas as pd
import numpy as np
def get_vodostaji_metadata() -> pd.DataFrame:
    """
    Fetch the latest station snapshot (value + timestamp + metadata)
    from vodostaji.voda.ba.

    - Keeps ALL original columns
    - Renames only user-selected fields
    - Converts numeric columns and extracts the catchment area in km²

    Returns
    -------
    pd.DataFrame
        One row per station. An empty DataFrame is returned on any
        fetch/parse failure (best-effort behaviour, no exception raised).
    """
    url = "https://vodostaji.voda.ba/data/internet/layers/20/index.json"

    # --- Step 1: Fetch JSON ---
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        data = r.json()
        if not isinstance(data, list) or not data:
            print("No snapshot records returned.")
            return pd.DataFrame()
    except Exception as e:
        # Deliberately broad: callers get an empty frame instead of a crash.
        print(f"Failed to fetch snapshot data: {e}")
        return pd.DataFrame()

    # --- Step 2: Normalize JSON ---
    df_all = pd.json_normalize(data)

    # --- Step 3: Rename only selected fields ---
    rename_map = {
        # identifiers
        "metadata_station_no": "gauge_id",
        "metadata_station_name": "station_name",
        "metadata_river_name": "river",
        "metadata_catchment_name": "catchment",
        "metadata_station_latitude": "latitude",
        "metadata_station_longitude": "longitude",
        "metadata_CATCHMENT_SIZE": "area",
    }
    df_all = df_all.rename(columns=rename_map)

    # --- Step 4: Extract catchment area as a number ---
    # BUG FIX: this previously checked for "metadata_CATCHMENT_SIZE", which
    # no longer exists after the rename above, so the extraction never ran.
    # It must also happen BEFORE the generic numeric coercion below, which
    # would turn strings like "123 km²" into NaN.
    if "area" in df_all.columns:
        df_all["catchment_area_km2"] = (
            df_all["area"]
            .astype(str)  # tolerate mixed/numeric cells
            .str.replace("km²", "", regex=False)
            .str.strip()
            .pipe(pd.to_numeric, errors="coerce")
        )

    # --- Step 5: Type conversions ---
    numeric_cols = [
        "latitude",
        "longitude",
        "area",
        "metadata_station_carteasting",
        "metadata_station_cartnorthing",
        "metadata_station_local_x",
        "metadata_station_local_y",
    ]
    for col in numeric_cols:
        if col in df_all.columns:
            df_all[col] = pd.to_numeric(df_all[col], errors="coerce")

    # --- Step 6: Add static metadata ---
    df_all["country"] = "Bosnia and Herzegovina"
    df_all["source"] = "vodostaji.voda.ba"

    # --- Step 7: Basic cleanup ---
    return df_all.reset_index(drop=True)
Time-series:
import requests
import pandas as pd
import numpy as np
from typing import Optional
from io import BytesIO
def get_vodostaji_data(
    gauge_id: str,
    variable: str,
    frequency: str = "instantaneous",
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    station_groups=range(1, 11),
) -> pd.DataFrame:
    """
    Download hydrological time series data from vodostaji.voda.ba,
    automatically detecting the correct station group.

    Parameters
    ----------
    gauge_id : str
        Station number (e.g. '4411')
    variable : str
        'discharge', 'stage', or 'temperature'
    frequency : str
        'instantaneous' or 'daily' (daily = mean of instantaneous values)
    start_date, end_date : str, optional
        ISO dates ('YYYY-MM-DD'), inclusive bounds
    station_groups : iterable
        Station group IDs to try (default: 1..10)

    Returns
    -------
    pd.DataFrame
        Columns: ['time', '<variable>']; empty if no group served the file.
        The detected group / station id / variable are attached in ``.attrs``.

    Raises
    ------
    ValueError
        If `variable` or `frequency` is not one of the accepted values.
    """
    variable = variable.lower()
    frequency = frequency.lower()
    if variable not in ("discharge", "stage", "temperature"):
        raise ValueError("variable must be 'discharge', 'stage', or 'temperature'.")
    if frequency not in ("instantaneous", "daily"):
        raise ValueError("frequency must be 'instantaneous' or 'daily'.")

    # --- Variable mapping: URL path code + 1-year export filename ---
    var_map = {
        "discharge": {"code": "Q", "file": "Q_1Y.xlsx"},
        "stage": {"code": "H", "file": "H_1Y.xlsx"},
        "temperature": {"code": "WT", "file": "Tvode_1Y.xlsx"},
    }
    var_code = var_map[variable]["code"]
    filename = var_map[variable]["file"]

    # --- Try station groups until one works ---
    content = None
    used_group = None
    for group in station_groups:
        base_url = f"https://vodostaji.voda.ba/data/internet/stations/{group}"
        # BUG FIX: the URL previously ended with a literal "(unknown)"
        # placeholder; it must end with the per-variable export filename.
        url = f"{base_url}/{gauge_id}/{var_code}/{filename}"
        try:
            r = requests.get(url, timeout=20)
            # Size guard: error pages are small; a real xlsx export is not.
            if r.status_code == 200 and len(r.content) > 10_000:
                content = r.content
                used_group = group
                break
        except Exception:
            # Network hiccup on one group: keep probing the rest.
            continue
    if content is None:
        return pd.DataFrame(columns=["time", variable])

    # --- Read Excel (skip the 8-row metadata header) ---
    df = pd.read_excel(
        BytesIO(content),
        skiprows=8,
        names=["time", variable],
    )
    if df.empty:
        return pd.DataFrame(columns=["time", variable])

    # --- Parse types (source uses day-first timestamps) ---
    df["time"] = pd.to_datetime(df["time"], dayfirst=True, errors="coerce")
    df[variable] = pd.to_numeric(df[variable], errors="coerce")
    df = df.dropna(subset=["time", variable])

    # --- Date filtering (inclusive) ---
    if start_date:
        df = df[df["time"] >= pd.to_datetime(start_date)]
    if end_date:
        df = df[df["time"] <= pd.to_datetime(end_date)]

    # --- Aggregate if needed ---
    if frequency == "daily":
        df = (
            df.set_index("time")
            .resample("D")
            .mean()
            .dropna()
            .reset_index()
        )

    # --- Final cleanup ---
    df = df.drop_duplicates(subset="time", keep="first")
    df = df.sort_values("time").reset_index(drop=True)

    # Attach provenance metadata (does not survive most pandas ops).
    df.attrs["station_group"] = used_group
    df.attrs["station_id"] = gauge_id
    df.attrs["variable"] = variable
    return df
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
enhancement — New feature or request