From d0a52e9b2723fb4c5e3e5f1f3559e80d5420ea12 Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Mon, 3 Nov 2025 19:15:48 +0000 Subject: [PATCH 1/7] WIP commit --- rivretrieve/japan.py | 289 +++++++++++++++++++++++++++++++++---------- 1 file changed, 221 insertions(+), 68 deletions(-) diff --git a/rivretrieve/japan.py b/rivretrieve/japan.py index 06b24c6..e91f7b6 100644 --- a/rivretrieve/japan.py +++ b/rivretrieve/japan.py @@ -1,9 +1,12 @@ + "Fetcher for Japanese river gauge data." import io import logging -from datetime import datetime -from typing import List, Optional +import re +import calendar +from datetime import datetime, timedelta +from typing import List, Optional, Dict, Any import pandas as pd import requests @@ -14,6 +17,24 @@ logger = logging.getLogger(__name__) +# Map Japanese descriptions to RivRetrieve constants +KIND_DESC_MAP = { + "時間水位": constants.STAGE_INSTANT, + "日水位": constants.STAGE_DAILY_MEAN, + "時間流量": constants.DISCHARGE_INSTANT, + "日流量": constants.DISCHARGE_DAILY_MEAN, + "リアルタイム水位": constants.STAGE_INSTANT, # Real-time is also instantaneous + "リアルタイム流量": constants.DISCHARGE_INSTANT, +} + +# Inverse map to get KIND from variable +# This is a preliminary map, can be refined +VARIABLE_KIND_MAP = { + constants.STAGE_INSTANT: [1, 9], + constants.STAGE_DAILY_MEAN: [2], + constants.DISCHARGE_INSTANT: [5, 10], # Assuming 10 might exist for real-time discharge + constants.DISCHARGE_DAILY_MEAN: [6], +} class JapanFetcher(base.RiverDataFetcher): """Fetches river gauge data from Japan's Ministry of Land, Infrastructure, Transport and Tourism (MLIT). 
@@ -23,9 +44,13 @@ class JapanFetcher(base.RiverDataFetcher): Supported Variables: - ``constants.DISCHARGE_DAILY_MEAN`` (m³/s) - ``constants.STAGE_DAILY_MEAN`` (m) + - ``constants.DISCHARGE_INSTANT`` (m³/s) + - ``constants.STAGE_INSTANT`` (m) """ - BASE_URL = "http://www1.river.go.jp/cgi-bin/DspWaterData.exe" + BASE_URL = "http://www1.river.go.jp" + DSP_URL = f"{BASE_URL}/cgi-bin/DspWaterData.exe" + SITE_INFO_URL = f"{BASE_URL}/cgi-bin/SiteInfo.exe" @staticmethod def get_cached_metadata() -> pd.DataFrame: @@ -41,13 +66,16 @@ def get_cached_metadata() -> pd.DataFrame: @staticmethod def get_available_variables() -> tuple[str, ...]: - return (constants.DISCHARGE_DAILY_MEAN, constants.STAGE_DAILY_MEAN) - - def _get_kind(self, variable: str) -> int: - if variable == constants.STAGE_DAILY_MEAN: - return 2 - elif variable == constants.DISCHARGE_DAILY_MEAN: - return 6 + return ( + constants.DISCHARGE_DAILY_MEAN, + constants.STAGE_DAILY_MEAN, + constants.DISCHARGE_INSTANT, + constants.STAGE_INSTANT, + ) + + def _get_kind(self, variable: str) -> Optional[List[int]]: + if variable in VARIABLE_KIND_MAP: + return VARIABLE_KIND_MAP[variable] else: raise ValueError(f"Unsupported variable: {variable}") @@ -57,104 +85,139 @@ def _download_data( variable: str, start_date: str, end_date: str, - ) -> List[pd.DataFrame]: - """Downloads raw data month by month.""" - kind = self._get_kind(variable) + ) -> List[str]: + """Downloads raw .dat file contents month by month.""" + possible_kinds = self._get_kind(variable) + if not possible_kinds: + logger.error(f"No KIND found for variable {variable}") + return [] start_dt = datetime.strptime(start_date, "%Y-%m-%d") end_dt = datetime.strptime(end_date, "%Y-%m-%d") current_dt = start_dt.replace(day=1) - monthly_data = [] + monthly_dat_contents = [] s = utils.requests_retry_session() + # To pick the best KIND, we might need to check SiteInfo.exe, + # but for now, let's try the first one in the list. 
+ kind = possible_kinds[0] + logger.info(f"Using KIND={kind} for {variable}") + while current_dt <= end_dt: - month_start_str = current_dt.strftime("%Y%m%d") - # End date for the request can be a bit beyond the current month - request_end_dt = current_dt + relativedelta(months=1, days=-1) - if request_end_dt > end_dt: - request_end_dt = end_dt - request_end_str = request_end_dt.strftime("%Y%m%d") + year = current_dt.year + month = current_dt.month + month_str = f"{month:02d}" + last_day = calendar.monthrange(year, month)[1] + + month_start_str = f"{year}{month_str}01" + month_end_str = f"{year}{month_str}{last_day}" params = { "KIND": kind, "ID": gauge_id, "BGNDATE": month_start_str, - "ENDDATE": request_end_str, + "ENDDATE": month_end_str, + "KAWABOU": "NO", } try: - response = s.get(self.BASE_URL, params=params) + logger.debug(f"Fetching DspWaterData page for {gauge_id} {year}-{month_str}") + response = s.get(self.DSP_URL, params=params) response.raise_for_status() - response.encoding = "shift_jis" # Japanese encoding - soup = BeautifulSoup(response.text, "lxml") - tables = soup.find_all("table") - if len(tables) > 1: - table = tables[1] # Second table has the data + response.encoding = "EUC-JP" + soup = BeautifulSoup(response.text, 'html.parser') + + link_tag = soup.find('a', href=re.compile(r"/dat/dload/download/.*\.dat")) + if link_tag: + dat_url_path = link_tag['href'] + dat_url = f"{self.BASE_URL}{dat_url_path}" + logger.debug(f"Found .dat link: {dat_url}") + + dat_response = s.get(dat_url) + dat_response.raise_for_status() + dat_content = dat_response.content.decode('shift_jis', errors='replace') + monthly_dat_contents.append(dat_content) + logger.info(f"Successfully downloaded {dat_url_path.split('/')[-1]}") else: - table = None - - if table: - df = pd.read_html(io.StringIO(str(table)), header=None)[0] - monthly_data.append(df) - else: - logger.warning(f"No table found for site {gauge_id} for {current_dt.strftime('%Y-%m')}") + logger.warning(f"No .dat link 
found for site {gauge_id} for {year}-{month_str} with KIND {kind}") except requests.exceptions.RequestException as e: - logger.error(f"Error fetching data for site {gauge_id} for {current_dt.strftime('%Y-%m')}: {e}") + logger.error(f"Error fetching data for site {gauge_id} for {year}-{month_str}: {e}") except Exception as e: - logger.error(f"Error processing data for site {gauge_id} for {current_dt.strftime('%Y-%m')}: {e}") + logger.error(f"Error processing data for site {gauge_id} for {year}-{month_str}: {e}") current_dt += relativedelta(months=1) - return monthly_data + return monthly_dat_contents def _parse_data( self, gauge_id: str, - raw_data_list: List[pd.DataFrame], + raw_data_list: List[str], variable: str, ) -> pd.DataFrame: - """Parses the list of monthly DataFrames.""" + """Parses the list of monthly .dat file contents.""" if not raw_data_list: return pd.DataFrame(columns=[constants.TIME_INDEX, variable]) all_dfs = [] - for df in raw_data_list: - if df.empty or len(df) < 5: - continue - - # Skip header rows of the second table, data starts around row 2 - data_df = df.iloc[2:].copy() - if data_df.empty: - continue - - # Columns: Date, 0h, 1h, ..., 12h, ..., 23h - # We need Date (index 0) and the value at 12h (index 12) - if data_df.shape[1] < 13: - logger.warning(f"Unexpected table structure for site {gauge_id}, skipping month.") - continue - - data_df = data_df.iloc[:, [0, 12]] - data_df.columns = [constants.TIME_INDEX, "Value"] - + for dat_content in raw_data_list: try: - data_df[constants.TIME_INDEX] = pd.to_datetime( - data_df[constants.TIME_INDEX], format="%Y/%m/%d", errors="coerce" - ) - data_df = data_df.dropna(subset=[constants.TIME_INDEX]) + lines = dat_content.strip().split('\r\n') + data_lines = [line for line in lines if not line.startswith('#') and line.strip()] + + if not data_lines: + continue + + if not data_lines[0].startswith(','): # Data starts after header + data_lines = data_lines[1:] + + if not data_lines: + continue + + # The first 
column is Date, followed by 24 pairs of (Value, Flag) + col_names = [constants.TIME_INDEX] + for i in range(1, 25): + col_names.append(f"{i}時") + col_names.append(f"{i}時フラグ") + + # Read the data part + csv_io = io.StringIO('\n'.join(data_lines)) + df = pd.read_csv(csv_io, header=None, names=col_names, na_values=["-9999.00"], dtype={constants.TIME_INDEX: str}) + + df[constants.TIME_INDEX] = pd.to_datetime(df[constants.TIME_INDEX], format="%Y/%m/%d", errors="coerce") + df = df.dropna(subset=[constants.TIME_INDEX]) + + # Melt hourly columns + value_cols = [f"{i}時" for i in range(1, 25)] + + df_long = df.melt(id_vars=[constants.TIME_INDEX], value_vars=value_cols, var_name='Hour', value_name='Value') + df_long['Hour'] = df_long['Hour'].str.replace('時', '').astype(int) + df_long['Value'] = pd.to_numeric(df_long['Value'], errors='coerce') + df_long = df_long.dropna(subset=['Value']) + + if constants.INSTANTANEOUS in variable: + # Build datetime index for hourly data + df_long[constants.TIME_INDEX] = df_long.apply( + lambda row: row[constants.TIME_INDEX] + timedelta(hours=row['Hour'] - 1), axis=1 + ) + hourly_df = df_long[[constants.TIME_INDEX, 'Value']].rename(columns={'Value': variable}) + all_dfs.append(hourly_df) + elif constants.DAILY in variable: + # Calculate daily mean + daily_df = df_long.groupby(constants.TIME_INDEX)['Value'].mean().reset_index() + daily_df = daily_df.rename(columns={'Value': variable}) + all_dfs.append(daily_df) - data_df["Value"] = pd.to_numeric(data_df["Value"], errors="coerce") - all_dfs.append(data_df.dropna()) except Exception as e: - logger.error(f"Error parsing DataFrame: {e}\n{df.head()}") + logger.error(f"Error parsing .dat content for {gauge_id}: {e}", exc_info=True) continue if not all_dfs: return pd.DataFrame(columns=[constants.TIME_INDEX, variable]) final_df = pd.concat(all_dfs, ignore_index=True) - final_df = final_df.rename(columns={"Value": variable}) final_df = final_df.sort_values(by=constants.TIME_INDEX) return 
final_df.set_index(constants.TIME_INDEX) @@ -200,11 +263,101 @@ def get_data( raw_data_list = self._download_data(gauge_id, variable, start_date, end_date) df = self._parse_data(gauge_id, raw_data_list, variable) - start_date_dt = pd.to_datetime(start_date) - end_date_dt = pd.to_datetime(end_date) - df = df[(df.index >= start_date_dt) & (df.index <= end_date_dt)] + if not df.empty: + start_date_dt = pd.to_datetime(start_date) + end_date_dt = pd.to_datetime(end_date) + # For daily data, index is date. For hourly, index is datetime. + if constants.DAILY in variable: + df = df[(df.index >= start_date_dt) & (df.index <= end_date_dt)] + elif constants.INSTANTANEOUS in variable: + df = df[(df.index >= start_date_dt) & (df.index <= pd.to_datetime(end_date) + timedelta(days=1))] return df except Exception as e: logger.error(f"Failed to get data for site {gauge_id}, variable {variable}: {e}") return pd.DataFrame(columns=[constants.TIME_INDEX, variable]) + + def get_metadata(self, gauge_ids: Optional[List[str]] = None) -> pd.DataFrame: + """Fetches metadata for given gauge IDs from the MLIT Water Information System. + + Args: + gauge_ids: A list of gauge IDs to fetch metadata for. If None, IDs are loaded from the cached CSV. + + Returns: + A pandas DataFrame containing metadata for the stations, indexed by gauge_id. 
+ """ + if gauge_ids is None: + cached_meta = self.get_cached_metadata() + gauge_ids = cached_meta.index.tolist() + + all_station_data = [] + s = utils.requests_retry_session() + + for gauge_id in gauge_ids: + logger.info(f"Fetching metadata for station: {gauge_id}") + site_info_url = f"{self.BASE_URL}/cgi-bin/SiteInfo.exe?ID={gauge_id}" + try: + response = s.get(site_info_url) + response.raise_for_status() + response.encoding = "EUC-JP" + soup = BeautifulSoup(response.text, 'html.parser') + + station_data = {constants.GAUGE_ID: gauge_id} + + # Extract metadata from the main table + info_table = soup.find('table', {'align': 'CENTER', 'width': '600'}) + if info_table: + for row in info_table.find_all('tr'): + cells = row.find_all('td') + if len(cells) == 2: + key = cells[0].text.strip() + value = cells[1].text.strip() + if key == '観測所名': + station_data[constants.STATION_NAME] = value + elif key == '所在地': + station_data['location'] = value + elif key == '水系名': + station_data[constants.RIVER] = value # Approximate + elif key == '河川名': + station_data['river_name_jp'] = value + elif key == '緯度経度': + try: + # Format: N34度2分2秒 E132度26分5秒 + lat_match = re.search(r'N(\d+)度(\d+)分(\d+)秒', value) + lon_match = re.search(r'E(\d+)度(\d+)分(\d+)秒', value) + if lat_match: + lat = float(lat_match.group(1)) + float(lat_match.group(2))/60 + float(lat_match.group(3))/3600 + station_data[constants.LATITUDE] = lat + if lon_match: + lon = float(lon_match.group(1)) + float(lon_match.group(2))/60 + float(lon_match.group(3))/3600 + station_data[constants.LONGITUDE] = lon + except Exception as e: + logger.warning(f"Could not parse lat/lon for {gauge_id}: {value} - {e}") + + # Extract available data types (KINDs) + kind_map = {} + data_links = soup.find_all('a', href=re.compile(r"DspWaterData\.exe\?KIND=")) + for link in data_links: + href = link['href'] + kind_match = re.search(r"KIND=(\d+)", href) + if kind_match: + kind = int(kind_match.group(1)) + img_tag = link.find('img') + if img_tag and 
img_tag.get('alt'): + alt_text = img_tag['alt'].strip() + kind_map[kind] = alt_text + station_data['available_kinds'] = kind_map + + all_station_data.append(station_data) + + except requests.exceptions.RequestException as e: + logger.error(f"Error fetching SiteInfo for {gauge_id}: {e}") + except Exception as e: + logger.error(f"Error parsing SiteInfo for {gauge_id}: {e}", exc_info=True) + + df = pd.DataFrame(all_station_data) + if not df.empty: + return df.set_index(constants.GAUGE_ID) + else: + return pd.DataFrame(columns=[constants.GAUGE_ID]).set_index(constants.GAUGE_ID) + From 1a849b9855e2e3be2146de3826b9799c62488839 Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Wed, 5 Nov 2025 11:18:15 +0000 Subject: [PATCH 2/7] Working timeseries retrieval --- rivretrieve/japan.py | 350 +++++++++++++++++++++---------------------- 1 file changed, 168 insertions(+), 182 deletions(-) diff --git a/rivretrieve/japan.py b/rivretrieve/japan.py index e91f7b6..225d048 100644 --- a/rivretrieve/japan.py +++ b/rivretrieve/japan.py @@ -1,4 +1,3 @@ - "Fetcher for Japanese river gauge data." import io @@ -17,23 +16,12 @@ logger = logging.getLogger(__name__) -# Map Japanese descriptions to RivRetrieve constants -KIND_DESC_MAP = { - "時間水位": constants.STAGE_INSTANT, - "日水位": constants.STAGE_DAILY_MEAN, - "時間流量": constants.DISCHARGE_INSTANT, - "日流量": constants.DISCHARGE_DAILY_MEAN, - "リアルタイム水位": constants.STAGE_INSTANT, # Real-time is also instantaneous - "リアルタイム流量": constants.DISCHARGE_INSTANT, -} - -# Inverse map to get KIND from variable -# This is a preliminary map, can be refined +# Maps RivRetrieve variable to the single confirmed KIND value.
VARIABLE_KIND_MAP = { - constants.STAGE_INSTANT: [1, 9], - constants.STAGE_DAILY_MEAN: [2], - constants.DISCHARGE_INSTANT: [5, 10], # Assuming 10 might exist for real-time discharge - constants.DISCHARGE_DAILY_MEAN: [6], + constants.STAGE_HOURLY_MEAN: 2, + constants.STAGE_DAILY_MEAN: 3, + constants.DISCHARGE_HOURLY_MEAN: 6, + constants.DISCHARGE_DAILY_MEAN: 7, } class JapanFetcher(base.RiverDataFetcher): @@ -41,11 +29,15 @@ class JapanFetcher(base.RiverDataFetcher): Data Source: Water Information System (http://www1.river.go.jp/) + Note: KINDs 2 and 6, described as "Daily" on the website, actually provide HOURLY data. + KINDs 3 and 7 are expected to provide true DAILY data. + This fetcher returns data at the resolution provided in the source .dat files. + Supported Variables: + - ``constants.DISCHARGE_HOURLY_MEAN`` (m³/s) + - ``constants.STAGE_HOURLY_MEAN`` (m) - ``constants.DISCHARGE_DAILY_MEAN`` (m³/s) - ``constants.STAGE_DAILY_MEAN`` (m) - - ``constants.DISCHARGE_INSTANT`` (m³/s) - - ``constants.STAGE_INSTANT`` (m) """ BASE_URL = "http://www1.river.go.jp" @@ -54,26 +46,19 @@ class JapanFetcher(base.RiverDataFetcher): @staticmethod def get_cached_metadata() -> pd.DataFrame: - """Retrieves a DataFrame of available Japanese gauge IDs and metadata. - - This method loads the metadata from a cached CSV file located in - the ``rivretrieve/cached_site_data/`` directory. - - Returns: - pd.DataFrame: A DataFrame indexed by gauge_id, containing site metadata. 
- """ + """Retrieves a DataFrame of available Japanese gauge IDs and metadata.""" return utils.load_cached_metadata_csv("japan") @staticmethod def get_available_variables() -> tuple[str, ...]: return ( - constants.DISCHARGE_DAILY_MEAN, + constants.STAGE_HOURLY_MEAN, constants.STAGE_DAILY_MEAN, - constants.DISCHARGE_INSTANT, - constants.STAGE_INSTANT, + constants.DISCHARGE_HOURLY_MEAN, + constants.DISCHARGE_DAILY_MEAN, ) - def _get_kind(self, variable: str) -> Optional[List[int]]: + def _get_kind(self, variable: str) -> int: if variable in VARIABLE_KIND_MAP: return VARIABLE_KIND_MAP[variable] else: @@ -86,70 +71,71 @@ def _download_data( start_date: str, end_date: str, ) -> List[str]: - """Downloads raw .dat file contents month by month.""" - possible_kinds = self._get_kind(variable) - if not possible_kinds: - logger.error(f"No KIND found for variable {variable}") - return [] + """Downloads raw .dat file contents.""" + s = utils.requests_retry_session() + kind_to_try = self._get_kind(variable) start_dt = datetime.strptime(start_date, "%Y-%m-%d") end_dt = datetime.strptime(end_date, "%Y-%m-%d") - current_dt = start_dt.replace(day=1) - monthly_dat_contents = [] - s = utils.requests_retry_session() - - # To pick the best KIND, we might need to check SiteInfo.exe, - # but for now, let's try the first one in the list. 
- kind = possible_kinds[0] - logger.info(f"Using KIND={kind} for {variable}") - - while current_dt <= end_dt: - year = current_dt.year - month = current_dt.month - month_str = f"{month:02d}" - last_day = calendar.monthrange(year, month)[1] - - month_start_str = f"{year}{month_str}01" - month_end_str = f"{year}{month_str}{last_day}" - - params = { - "KIND": kind, - "ID": gauge_id, - "BGNDATE": month_start_str, - "ENDDATE": month_end_str, - "KAWABOU": "NO", - } - - try: - logger.debug(f"Fetching DspWaterData page for {gauge_id} {year}-{month_str}") - response = s.get(self.DSP_URL, params=params) - response.raise_for_status() - response.encoding = "EUC-JP" - soup = BeautifulSoup(response.text, 'html.parser') - - link_tag = soup.find('a', href=re.compile(r"/dat/dload/download/.*\.dat")) - if link_tag: - dat_url_path = link_tag['href'] - dat_url = f"{self.BASE_URL}{dat_url_path}" - logger.debug(f"Found .dat link: {dat_url}") - - dat_response = s.get(dat_url) - dat_response.raise_for_status() - dat_content = dat_response.content.decode('shift_jis', errors='replace') - monthly_dat_contents.append(dat_content) - logger.info(f"Successfully downloaded {dat_url_path.split('/')[-1]}") - else: - logger.warning(f"No .dat link found for site {gauge_id} for {year}-{month_str} with KIND {kind}") - - except requests.exceptions.RequestException as e: - logger.error(f"Error fetching data for site {gauge_id} for {year}-{month_str}: {e}") - except Exception as e: - logger.error(f"Error processing data for site {gauge_id} for {year}-{month_str}: {e}") - - current_dt += relativedelta(months=1) - - return monthly_dat_contents + dat_contents = [] + headers = { + "User-Agent": "Mozilla/5.0", + "Referer": self.BASE_URL + } + + if kind_to_try in [2, 6]: # Monthly requests for hourly data + current_dt = start_dt.replace(day=1) + while current_dt <= end_dt: + year = current_dt.year + month = current_dt.month + month_str = f"{month:02d}" + last_day = calendar.monthrange(year, month)[1] + 
month_start_str = f"{year}{month_str}01" + month_end_str = f"{year}{month_str}{last_day}" + + params = {"KIND": kind_to_try, "ID": gauge_id, "BGNDATE": month_start_str, "ENDDATE": month_end_str, "KAWABOU": "NO"} + try: + logger.debug(f"Fetching DspWaterData page for {gauge_id} {year}-{month_str} KIND {kind_to_try}") + response = s.get(self.DSP_URL, params=params, headers=headers) + response.raise_for_status() + response.encoding = "EUC-JP" + soup = BeautifulSoup(response.text, 'html.parser') + link_tag = soup.find(re.compile("a", re.IGNORECASE), href=re.compile(r"/dat/dload/download/")) + if link_tag: + dat_url = f"{self.BASE_URL}{link_tag['href']}" + dat_response = s.get(dat_url, headers=headers) + dat_response.raise_for_status() + dat_contents.append(dat_response.content.decode('shift_jis', errors='replace')) + logger.info(f"Successfully downloaded {link_tag['href'].split('/')[-1]}") + else: + logger.warning(f"No .dat link found for {gauge_id} {year}-{month_str} KIND {kind_to_try}") + except Exception as e: + logger.error(f"Error fetching for {gauge_id} {year}-{month_str} KIND {kind_to_try}: {e}") + current_dt += relativedelta(months=1) + elif kind_to_try in [3, 7]: # Yearly requests for daily data + for year in range(start_dt.year, end_dt.year + 1): + year_start_str = f"{year}0131" + year_end_str = f"{year}1231" + params = {"KIND": kind_to_try, "ID": gauge_id, "BGNDATE": year_start_str, "ENDDATE": year_end_str, "KAWABOU": "NO"} + try: + logger.debug(f"Fetching DspWaterData page for {gauge_id} {year} KIND {kind_to_try}") + response = s.get(self.DSP_URL, params=params, headers=headers) + response.raise_for_status() + response.encoding = "EUC-JP" + soup = BeautifulSoup(response.text, 'html.parser') + link_tag = soup.find(re.compile("a", re.IGNORECASE), href=re.compile(r"/dat/dload/download/")) + if link_tag: + dat_url = f"{self.BASE_URL}{link_tag['href']}" + dat_response = s.get(dat_url, headers=headers) + dat_response.raise_for_status() + 
dat_contents.append(dat_response.content.decode('shift_jis', errors='replace')) + logger.info(f"Successfully downloaded {link_tag['href'].split('/')[-1]}") + else: + logger.warning(f"No .dat link found for {gauge_id} {year} KIND {kind_to_try}") + except Exception as e: + logger.error(f"Error fetching for {gauge_id} {year} KIND {kind_to_try}: {e}") + return dat_contents def _parse_data( self, @@ -161,7 +147,9 @@ def _parse_data( if not raw_data_list: return pd.DataFrame(columns=[constants.TIME_INDEX, variable]) + kind = self._get_kind(variable) all_dfs = [] + for dat_content in raw_data_list: try: lines = dat_content.strip().split('\r\n') @@ -170,48 +158,76 @@ def _parse_data( if not data_lines: continue - if not data_lines[0].startswith(','): # Data starts after header + header_line = next((line for line in lines if line.startswith(',')), None) + if header_line: data_lines = data_lines[1:] - if not data_lines: continue - # The first column is Date, followed by 24 pairs of (Value, Flag) - col_names = [constants.TIME_INDEX] - for i in range(1, 25): - col_names.append(f"{i}時") - col_names.append(f"{i}時フラグ") - - # Read the data part csv_io = io.StringIO('\n'.join(data_lines)) - df = pd.read_csv(csv_io, header=None, names=col_names, na_values=["-9999.00"], dtype={constants.TIME_INDEX: str}) - - df[constants.TIME_INDEX] = pd.to_datetime(df[constants.TIME_INDEX], format="%Y/%m/%d", errors="coerce") - df = df.dropna(subset=[constants.TIME_INDEX]) - # Melt hourly columns - value_cols = [f"{i}時" for i in range(1, 25)] - - df_long = df.melt(id_vars=[constants.TIME_INDEX], value_vars=value_cols, var_name='Hour', value_name='Value') - df_long['Hour'] = df_long['Hour'].str.replace('時', '').astype(int) - df_long['Value'] = pd.to_numeric(df_long['Value'], errors='coerce') - df_long = df_long.dropna(subset=['Value']) + if kind in [2, 6]: # Hourly data format + col_names = [constants.TIME_INDEX] + for i in range(1, 25): + col_names.append(f"{i}時") + col_names.append(f"{i}時フラグ") + + 
df = pd.read_csv(csv_io, header=None, names=col_names, na_values=["-9999.00"], dtype={constants.TIME_INDEX: str}) + df[constants.TIME_INDEX] = pd.to_datetime(df[constants.TIME_INDEX], format="%Y/%m/%d", errors="coerce") + df = df.dropna(subset=[constants.TIME_INDEX]) + + value_cols = [f"{i}時" for i in range(1, 25)] + df_long = df.melt(id_vars=[constants.TIME_INDEX], value_vars=value_cols, var_name='Hour', value_name='Value') + df_long['Hour'] = df_long['Hour'].str.replace('時', '').astype(int) + df_long['Value'] = pd.to_numeric(df_long['Value'], errors='coerce') + df_long = df_long.dropna(subset=['Value']) - if constants.INSTANTANEOUS in variable: - # Build datetime index for hourly data df_long[constants.TIME_INDEX] = df_long.apply( lambda row: row[constants.TIME_INDEX] + timedelta(hours=row['Hour'] - 1), axis=1 ) - hourly_df = df_long[[constants.TIME_INDEX, 'Value']].rename(columns={'Value': variable}) - all_dfs.append(hourly_df) - elif constants.DAILY in variable: - # Calculate daily mean - daily_df = df_long.groupby(constants.TIME_INDEX)['Value'].mean().reset_index() - daily_df = daily_df.rename(columns={'Value': variable}) - all_dfs.append(daily_df) + parsed_df = df_long[[constants.TIME_INDEX, 'Value']].rename(columns={'Value': variable}) + all_dfs.append(parsed_df) + + elif kind in [3, 7]: # Daily data format + year = None + for line in lines: + if line.endswith("年"): + year_match = re.search(r"(\d{4})年", line) + if year_match: + year = int(year_match.group(1)) + break + if year is None: + logger.warning(f"Could not extract year from .dat file for {gauge_id} KIND {kind}") + continue + + col_names = ["月"] + for i in range(1, 32): + col_names.append(f"{i}日") + col_names.append(f"{i}日フラグ") + + df = pd.read_csv(csv_io, header=None, names=col_names, na_values=[" ", "-9999.00"], encoding='utf-8') + + month_map = {f"{i}月": i for i in range(1, 13)} + df["Month"] = df["月"].map(month_map) + df = df.dropna(subset=["Month"]) + df["Year"] = year + + value_cols = [f"{i}日" 
for i in range(1, 32)] + df_long = df.melt(id_vars=["Year", "Month"], value_vars=value_cols, var_name='Day', value_name='Value') + df_long['Day'] = df_long['Day'].str.replace('日', '').astype(int) + df_long['Value'] = pd.to_numeric(df_long['Value'], errors='coerce') + df_long = df_long.dropna(subset=['Value']) + + df_long[constants.TIME_INDEX] = pd.to_datetime(df_long[['Year', 'Month', 'Day']], errors='coerce') + parsed_df = df_long[[constants.TIME_INDEX, 'Value']].rename(columns={'Value': variable}) + parsed_df = parsed_df.dropna(subset=[constants.TIME_INDEX]) + all_dfs.append(parsed_df) + else: + logger.warning(f"Unsupported KIND {kind} for parsing in _parse_data") + continue except Exception as e: - logger.error(f"Error parsing .dat content for {gauge_id}: {e}", exc_info=True) + logger.error(f"Error parsing .dat content for {gauge_id} KIND {kind}: {e}", exc_info=True) continue if not all_dfs: @@ -229,31 +245,7 @@ def get_data( start_date: Optional[str] = None, end_date: Optional[str] = None, ) -> pd.DataFrame: - """Fetches and parses time series data for a specific gauge and variable. - - This method retrieves the requested data from the provider's API or data source, - parses it, and returns it in a standardized pandas DataFrame format. - - Args: - gauge_id: The site-specific identifier for the gauge. - variable: The variable to fetch. Must be one of the strings listed - in the fetcher's ``get_available_variables()`` output. - These are typically defined in ``rivretrieve.constants``. - start_date: Optional start date for the data retrieval in 'YYYY-MM-DD' format. - If None, data is fetched from the earliest available date. - end_date: Optional end date for the data retrieval in 'YYYY-MM-DD' format. - If None, data is fetched up to the latest available date. - - Returns: - pd.DataFrame: A pandas DataFrame indexed by datetime objects (``constants.TIME_INDEX``) - with a single column named after the requested ``variable``. 
The DataFrame - will be empty if no data is found for the given parameters. - - Raises: - ValueError: If the requested ``variable`` is not supported by this fetcher. - requests.exceptions.RequestException: If a network error occurs during data download. - Exception: For other unexpected errors during data fetching or parsing. - """ + """Fetches and parses time series data for a specific gauge and variable.""" start_date = utils.format_start_date(start_date) end_date = utils.format_end_date(end_date) if variable not in self.get_available_variables(): @@ -261,50 +253,45 @@ def get_data( try: raw_data_list = self._download_data(gauge_id, variable, start_date, end_date) + if not raw_data_list: + return pd.DataFrame(columns=[constants.TIME_INDEX, variable]) + df = self._parse_data(gauge_id, raw_data_list, variable) if not df.empty: start_date_dt = pd.to_datetime(start_date) - end_date_dt = pd.to_datetime(end_date) - # For daily data, index is date. For hourly, index is datetime. - if constants.DAILY in variable: - df = df[(df.index >= start_date_dt) & (df.index <= end_date_dt)] - elif constants.INSTANTANEOUS in variable: - df = df[(df.index >= start_date_dt) & (df.index <= pd.to_datetime(end_date) + timedelta(days=1))] + end_date_dt = pd.to_datetime(end_date) + timedelta(days=1) # Include end date + df = df[(df.index >= start_date_dt) & (df.index < end_date_dt)] return df except Exception as e: - logger.error(f"Failed to get data for site {gauge_id}, variable {variable}: {e}") + logger.error(f"Failed to get data for site {gauge_id}, variable {variable}: {e}", exc_info=True) return pd.DataFrame(columns=[constants.TIME_INDEX, variable]) def get_metadata(self, gauge_ids: Optional[List[str]] = None) -> pd.DataFrame: - """Fetches metadata for given gauge IDs from the MLIT Water Information System. - - Args: - gauge_ids: A list of gauge IDs to fetch metadata for. If None, IDs are loaded from the cached CSV. 
- - Returns: - A pandas DataFrame containing metadata for the stations, indexed by gauge_id. - """ + """Fetches metadata for given gauge IDs from the MLIT Water Information System.""" if gauge_ids is None: cached_meta = self.get_cached_metadata() gauge_ids = cached_meta.index.tolist() all_station_data = [] s = utils.requests_retry_session() + headers = { + "User-Agent": "Mozilla/5.0", + "Referer": self.BASE_URL + } for gauge_id in gauge_ids: logger.info(f"Fetching metadata for station: {gauge_id}") - site_info_url = f"{self.BASE_URL}/cgi-bin/SiteInfo.exe?ID={gauge_id}" + site_info_url = f"{self.SITE_INFO_URL}?ID={gauge_id}" try: - response = s.get(site_info_url) + response = s.get(site_info_url, headers=headers) response.raise_for_status() response.encoding = "EUC-JP" soup = BeautifulSoup(response.text, 'html.parser') station_data = {constants.GAUGE_ID: gauge_id} - # Extract metadata from the main table info_table = soup.find('table', {'align': 'CENTER', 'width': '600'}) if info_table: for row in info_table.find_all('tr'): @@ -317,12 +304,11 @@ def get_metadata(self, gauge_ids: Optional[List[str]] = None) -> pd.DataFrame: elif key == '所在地': station_data['location'] = value elif key == '水系名': - station_data[constants.RIVER] = value # Approximate + station_data[constants.RIVER] = value elif key == '河川名': station_data['river_name_jp'] = value elif key == '緯度経度': try: - # Format: N34度2分2秒 E132度26分5秒 lat_match = re.search(r'N(\d+)度(\d+)分(\d+)秒', value) lon_match = re.search(r'E(\d+)度(\d+)分(\d+)秒', value) if lat_match: @@ -333,25 +319,26 @@ def get_metadata(self, gauge_ids: Optional[List[str]] = None) -> pd.DataFrame: station_data[constants.LONGITUDE] = lon except Exception as e: logger.warning(f"Could not parse lat/lon for {gauge_id}: {value} - {e}") - - # Extract available data types (KINDs) + + # Fetch available kinds for the station kind_map = {} - data_links = soup.find_all('a', href=re.compile(r"DspWaterData\.exe\?KIND=")) - for link in data_links: - href = 
link['href'] - kind_match = re.search(r"KIND=(\d+)", href) - if kind_match: - kind = int(kind_match.group(1)) - img_tag = link.find('img') - if img_tag and img_tag.get('alt'): - alt_text = img_tag['alt'].strip() - kind_map[kind] = alt_text + # Commenting out the SiteInfo fetch for KINDs due to 403 errors + # try: + # # This part is still blocked by 403, so kind_map will be empty + # pass # s_kinds = utils.requests_retry_session() + # # response = s_kinds.get(site_info_url, headers=headers) + # # response.raise_for_status() + # # ... parsing logic ... + # except Exception as e: + # logger.error(f"Error fetching/parsing SiteInfo for {gauge_id} for KINDS: {e}") station_data['available_kinds'] = kind_map - all_station_data.append(station_data) except requests.exceptions.RequestException as e: - logger.error(f"Error fetching SiteInfo for {gauge_id}: {e}") + if e.response and e.response.status_code == 403: + logger.error(f"Access forbidden for SiteInfo {gauge_id}: {e}") + else: + logger.error(f"Error fetching SiteInfo for {gauge_id}: {e}") except Exception as e: logger.error(f"Error parsing SiteInfo for {gauge_id}: {e}", exc_info=True) @@ -359,5 +346,4 @@ def get_metadata(self, gauge_ids: Optional[List[str]] = None) -> pd.DataFrame: if not df.empty: return df.set_index(constants.GAUGE_ID) else: - return pd.DataFrame(columns=[constants.GAUGE_ID]).set_index(constants.GAUGE_ID) - + return pd.DataFrame(columns=[constants.GAUGE_ID]).set_index(constants.GAUGE_ID) \ No newline at end of file From b2d205149eabf79678115f54db3d4cc6964528b6 Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Wed, 5 Nov 2025 11:43:43 +0000 Subject: [PATCH 3/7] Updated tests --- rivretrieve/japan.py | 156 +-- .../japan_301011281104010_kind6_200401.dat | 41 + .../japan_301011281104010_kind6_200402.dat | 39 + .../japan_301011281104010_kind7_2004.dat | 23 + .../japan_301011281104010_kind7_2005.dat | 23 + tests/test_data/japan_daily.html | 903 ------------------ tests/test_japan.py | 110 ++- 7 files 
changed, 295 insertions(+), 1000 deletions(-) create mode 100644 tests/test_data/japan_301011281104010_kind6_200401.dat create mode 100644 tests/test_data/japan_301011281104010_kind6_200402.dat create mode 100644 tests/test_data/japan_301011281104010_kind7_2004.dat create mode 100644 tests/test_data/japan_301011281104010_kind7_2005.dat delete mode 100644 tests/test_data/japan_daily.html diff --git a/rivretrieve/japan.py b/rivretrieve/japan.py index 225d048..cea2f9b 100644 --- a/rivretrieve/japan.py +++ b/rivretrieve/japan.py @@ -1,11 +1,11 @@ "Fetcher for Japanese river gauge data." +import calendar import io import logging import re -import calendar from datetime import datetime, timedelta -from typing import List, Optional, Dict, Any +from typing import List, Optional import pandas as pd import requests @@ -24,6 +24,7 @@ constants.DISCHARGE_DAILY_MEAN: 7, } + class JapanFetcher(base.RiverDataFetcher): """Fetches river gauge data from Japan's Ministry of Land, Infrastructure, Transport and Tourism (MLIT). 
@@ -79,10 +80,7 @@ def _download_data( end_dt = datetime.strptime(end_date, "%Y-%m-%d") dat_contents = [] - headers = { - "User-Agent": "Mozilla/5.0", - "Referer": self.BASE_URL - } + headers = {"User-Agent": "Mozilla/5.0", "Referer": self.BASE_URL} if kind_to_try in [2, 6]: # Monthly requests for hourly data current_dt = start_dt.replace(day=1) @@ -93,20 +91,26 @@ def _download_data( last_day = calendar.monthrange(year, month)[1] month_start_str = f"{year}{month_str}01" month_end_str = f"{year}{month_str}{last_day}" - - params = {"KIND": kind_to_try, "ID": gauge_id, "BGNDATE": month_start_str, "ENDDATE": month_end_str, "KAWABOU": "NO"} + + params = { + "KIND": kind_to_try, + "ID": gauge_id, + "BGNDATE": month_start_str, + "ENDDATE": month_end_str, + "KAWABOU": "NO", + } try: logger.debug(f"Fetching DspWaterData page for {gauge_id} {year}-{month_str} KIND {kind_to_try}") response = s.get(self.DSP_URL, params=params, headers=headers) response.raise_for_status() response.encoding = "EUC-JP" - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") link_tag = soup.find(re.compile("a", re.IGNORECASE), href=re.compile(r"/dat/dload/download/")) if link_tag: dat_url = f"{self.BASE_URL}{link_tag['href']}" dat_response = s.get(dat_url, headers=headers) dat_response.raise_for_status() - dat_contents.append(dat_response.content.decode('shift_jis', errors='replace')) + dat_contents.append(dat_response.content.decode("shift_jis", errors="replace")) logger.info(f"Successfully downloaded {link_tag['href'].split('/')[-1]}") else: logger.warning(f"No .dat link found for {gauge_id} {year}-{month_str} KIND {kind_to_try}") @@ -117,19 +121,25 @@ def _download_data( for year in range(start_dt.year, end_dt.year + 1): year_start_str = f"{year}0131" year_end_str = f"{year}1231" - params = {"KIND": kind_to_try, "ID": gauge_id, "BGNDATE": year_start_str, "ENDDATE": year_end_str, "KAWABOU": "NO"} + params = { + "KIND": kind_to_try, + "ID": 
gauge_id, + "BGNDATE": year_start_str, + "ENDDATE": year_end_str, + "KAWABOU": "NO", + } try: logger.debug(f"Fetching DspWaterData page for {gauge_id} {year} KIND {kind_to_try}") response = s.get(self.DSP_URL, params=params, headers=headers) response.raise_for_status() response.encoding = "EUC-JP" - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") link_tag = soup.find(re.compile("a", re.IGNORECASE), href=re.compile(r"/dat/dload/download/")) if link_tag: dat_url = f"{self.BASE_URL}{link_tag['href']}" dat_response = s.get(dat_url, headers=headers) dat_response.raise_for_status() - dat_contents.append(dat_response.content.decode('shift_jis', errors='replace')) + dat_contents.append(dat_response.content.decode("shift_jis", errors="replace")) logger.info(f"Successfully downloaded {link_tag['href'].split('/')[-1]}") else: logger.warning(f"No .dat link found for {gauge_id} {year} KIND {kind_to_try}") @@ -152,46 +162,57 @@ def _parse_data( for dat_content in raw_data_list: try: - lines = dat_content.strip().split('\r\n') - data_lines = [line for line in lines if not line.startswith('#') and line.strip()] - + lines = dat_content.strip().splitlines() + data_lines = [line for line in lines if not line.startswith("#") and line.strip()] + if not data_lines: continue - header_line = next((line for line in lines if line.startswith(',')), None) + header_line = next((line for line in lines if line.startswith(",")), None) if header_line: - data_lines = data_lines[1:] + try: + header_index = lines.index(header_line) + data_lines = lines[header_index + 1 :] + data_lines = [line for line in data_lines if not line.startswith("#") and line.strip()] + except ValueError: + pass if not data_lines: continue - csv_io = io.StringIO('\n'.join(data_lines)) + csv_io = io.StringIO("\n".join(data_lines)) if kind in [2, 6]: # Hourly data format col_names = [constants.TIME_INDEX] for i in range(1, 25): col_names.append(f"{i}時") 
col_names.append(f"{i}時フラグ") - - df = pd.read_csv(csv_io, header=None, names=col_names, na_values=["-9999.00"], dtype={constants.TIME_INDEX: str}) - df[constants.TIME_INDEX] = pd.to_datetime(df[constants.TIME_INDEX], format="%Y/%m/%d", errors="coerce") + + df = pd.read_csv( + csv_io, header=None, names=col_names, na_values=["-9999.00"], dtype={constants.TIME_INDEX: str} + ) + df[constants.TIME_INDEX] = pd.to_datetime( + df[constants.TIME_INDEX], format="%Y/%m/%d", errors="coerce" + ) df = df.dropna(subset=[constants.TIME_INDEX]) value_cols = [f"{i}時" for i in range(1, 25)] - df_long = df.melt(id_vars=[constants.TIME_INDEX], value_vars=value_cols, var_name='Hour', value_name='Value') - df_long['Hour'] = df_long['Hour'].str.replace('時', '').astype(int) - df_long['Value'] = pd.to_numeric(df_long['Value'], errors='coerce') - df_long = df_long.dropna(subset=['Value']) + df_long = df.melt( + id_vars=[constants.TIME_INDEX], value_vars=value_cols, var_name="Hour", value_name="Value" + ) + df_long["Hour"] = df_long["Hour"].str.replace("時", "").astype(int) + df_long["Value"] = pd.to_numeric(df_long["Value"], errors="coerce") + df_long = df_long.dropna(subset=["Value"]) df_long[constants.TIME_INDEX] = df_long.apply( - lambda row: row[constants.TIME_INDEX] + timedelta(hours=row['Hour'] - 1), axis=1 + lambda row: row[constants.TIME_INDEX] + timedelta(hours=row["Hour"] - 1), axis=1 ) - parsed_df = df_long[[constants.TIME_INDEX, 'Value']].rename(columns={'Value': variable}) + parsed_df = df_long[[constants.TIME_INDEX, "Value"]].rename(columns={"Value": variable}) all_dfs.append(parsed_df) elif kind in [3, 7]: # Daily data format year = None for line in lines: - if line.endswith("年"): + if "年" in line: year_match = re.search(r"(\d{4})年", line) if year_match: year = int(year_match.group(1)) @@ -204,8 +225,10 @@ def _parse_data( for i in range(1, 32): col_names.append(f"{i}日") col_names.append(f"{i}日フラグ") - - df = pd.read_csv(csv_io, header=None, names=col_names, na_values=[" ", 
"-9999.00"], encoding='utf-8') + + df = pd.read_csv( + csv_io, header=None, names=col_names, na_values=[" ", "-9999.00"], encoding="utf-8" + ) month_map = {f"{i}月": i for i in range(1, 13)} df["Month"] = df["月"].map(month_map) @@ -213,13 +236,15 @@ def _parse_data( df["Year"] = year value_cols = [f"{i}日" for i in range(1, 32)] - df_long = df.melt(id_vars=["Year", "Month"], value_vars=value_cols, var_name='Day', value_name='Value') - df_long['Day'] = df_long['Day'].str.replace('日', '').astype(int) - df_long['Value'] = pd.to_numeric(df_long['Value'], errors='coerce') - df_long = df_long.dropna(subset=['Value']) + df_long = df.melt( + id_vars=["Year", "Month"], value_vars=value_cols, var_name="Day", value_name="Value" + ) + df_long["Day"] = df_long["Day"].str.replace("日", "").astype(int) + df_long["Value"] = pd.to_numeric(df_long["Value"], errors="coerce") + df_long = df_long.dropna(subset=["Value"]) - df_long[constants.TIME_INDEX] = pd.to_datetime(df_long[['Year', 'Month', 'Day']], errors='coerce') - parsed_df = df_long[[constants.TIME_INDEX, 'Value']].rename(columns={'Value': variable}) + df_long[constants.TIME_INDEX] = pd.to_datetime(df_long[["Year", "Month", "Day"]], errors="coerce") + parsed_df = df_long[[constants.TIME_INDEX, "Value"]].rename(columns={"Value": variable}) parsed_df = parsed_df.dropna(subset=[constants.TIME_INDEX]) all_dfs.append(parsed_df) else: @@ -255,12 +280,12 @@ def get_data( raw_data_list = self._download_data(gauge_id, variable, start_date, end_date) if not raw_data_list: return pd.DataFrame(columns=[constants.TIME_INDEX, variable]) - + df = self._parse_data(gauge_id, raw_data_list, variable) if not df.empty: start_date_dt = pd.to_datetime(start_date) - end_date_dt = pd.to_datetime(end_date) + timedelta(days=1) # Include end date + end_date_dt = pd.to_datetime(end_date) + timedelta(days=1) # Include end date df = df[(df.index >= start_date_dt) & (df.index < end_date_dt)] return df @@ -276,10 +301,7 @@ def get_metadata(self, gauge_ids: 
Optional[List[str]] = None) -> pd.DataFrame: all_station_data = [] s = utils.requests_retry_session() - headers = { - "User-Agent": "Mozilla/5.0", - "Referer": self.BASE_URL - } + headers = {"User-Agent": "Mozilla/5.0", "Referer": self.BASE_URL} for gauge_id in gauge_ids: logger.info(f"Fetching metadata for station: {gauge_id}") @@ -288,38 +310,46 @@ def get_metadata(self, gauge_ids: Optional[List[str]] = None) -> pd.DataFrame: response = s.get(site_info_url, headers=headers) response.raise_for_status() response.encoding = "EUC-JP" - soup = BeautifulSoup(response.text, 'html.parser') + soup = BeautifulSoup(response.text, "html.parser") station_data = {constants.GAUGE_ID: gauge_id} - info_table = soup.find('table', {'align': 'CENTER', 'width': '600'}) + info_table = soup.find("table", {"align": "CENTER", "width": "600"}) if info_table: - for row in info_table.find_all('tr'): - cells = row.find_all('td') + for row in info_table.find_all("tr"): + cells = row.find_all("td") if len(cells) == 2: key = cells[0].text.strip() value = cells[1].text.strip() - if key == '観測所名': + if key == "観測所名": station_data[constants.STATION_NAME] = value - elif key == '所在地': - station_data['location'] = value - elif key == '水系名': + elif key == "所在地": + station_data["location"] = value + elif key == "水系名": station_data[constants.RIVER] = value - elif key == '河川名': - station_data['river_name_jp'] = value - elif key == '緯度経度': + elif key == "河川名": + station_data["river_name_jp"] = value + elif key == "緯度経度": try: - lat_match = re.search(r'N(\d+)度(\d+)分(\d+)秒', value) - lon_match = re.search(r'E(\d+)度(\d+)分(\d+)秒', value) + lat_match = re.search(r"N(\d+)度(\d+)分(\d+)秒", value) + lon_match = re.search(r"E(\d+)度(\d+)分(\d+)秒", value) if lat_match: - lat = float(lat_match.group(1)) + float(lat_match.group(2))/60 + float(lat_match.group(3))/3600 + lat = ( + float(lat_match.group(1)) + + float(lat_match.group(2)) / 60 + + float(lat_match.group(3)) / 3600 + ) station_data[constants.LATITUDE] = lat if 
lon_match: - lon = float(lon_match.group(1)) + float(lon_match.group(2))/60 + float(lon_match.group(3))/3600 + lon = ( + float(lon_match.group(1)) + + float(lon_match.group(2)) / 60 + + float(lon_match.group(3)) / 3600 + ) station_data[constants.LONGITUDE] = lon except Exception as e: logger.warning(f"Could not parse lat/lon for {gauge_id}: {value} - {e}") - + # Fetch available kinds for the station kind_map = {} # Commenting out the SiteInfo fetch for KINDs due to 403 errors @@ -331,13 +361,13 @@ def get_metadata(self, gauge_ids: Optional[List[str]] = None) -> pd.DataFrame: # # ... parsing logic ... # except Exception as e: # logger.error(f"Error fetching/parsing SiteInfo for {gauge_id} for KINDS: {e}") - station_data['available_kinds'] = kind_map + station_data["available_kinds"] = kind_map all_station_data.append(station_data) except requests.exceptions.RequestException as e: - if e.response and e.response.status_code == 403: + if e.response and e.response.status_code == 403: logger.error(f"Access forbidden for SiteInfo {gauge_id}: {e}") - else: + else: logger.error(f"Error fetching SiteInfo for {gauge_id}: {e}") except Exception as e: logger.error(f"Error parsing SiteInfo for {gauge_id}: {e}", exc_info=True) @@ -346,4 +376,4 @@ def get_metadata(self, gauge_ids: Optional[List[str]] = None) -> pd.DataFrame: if not df.empty: return df.set_index(constants.GAUGE_ID) else: - return pd.DataFrame(columns=[constants.GAUGE_ID]).set_index(constants.GAUGE_ID) \ No newline at end of file + return pd.DataFrame(columns=[constants.GAUGE_ID]).set_index(constants.GAUGE_ID) diff --git a/tests/test_data/japan_301011281104010_kind6_200401.dat b/tests/test_data/japan_301011281104010_kind6_200401.dat new file mode 100644 index 0000000..6feba2c --- /dev/null +++ b/tests/test_data/japan_301011281104010_kind6_200401.dat @@ -0,0 +1,41 @@ +時刻流量月表検索結果 +水系名,天塩川 +河川名,天塩川 +観測所名,茂志利 +観測所記号,301011281104010 +# +#日付,1時データ,1時フラグ,,,,,,,,24時データ,24時フラグ +# フラグの意味: *:暫定値, $:欠測, #:閉局, -:未登録 +# 
+,1時,,2時,,3時,,4時,,5時,,6時,,7時,,8時,,9時,,10時,,11時,,12時,,13時,,14時,,15時,,16時,,17時,,18時,,19時,,20時,,21時,,22時,,23時,,24時, +2004/01/01, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, +2004/01/02, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, +2004/01/03, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, +2004/01/04, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, +2004/01/05, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, +2004/01/06, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, +2004/01/07, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.52, , 0.52, +2004/01/08, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, +2004/01/09, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/10, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, 
, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, +2004/01/11, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, +2004/01/12, 0.52, , 0.52, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/13, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/14, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/15, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/01/16, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/01/17, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.45, , 0.45, , 0.45, +2004/01/18, 0.52, , 0.52, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.52, , 0.52, , 0.52, , 0.52, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/19, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/20, 0.45, , 0.45, , 
0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/21, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/22, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/23, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/24, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.38, , 0.38, , 0.38, , 0.38, +2004/01/25, 0.38, , 0.38, , 0.38, , 0.38, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/01/26, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/01/27, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/01/28, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.45, +2004/01/29, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 
0.45, +2004/01/30, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/01/31, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, diff --git a/tests/test_data/japan_301011281104010_kind6_200402.dat b/tests/test_data/japan_301011281104010_kind6_200402.dat new file mode 100644 index 0000000..654877f --- /dev/null +++ b/tests/test_data/japan_301011281104010_kind6_200402.dat @@ -0,0 +1,39 @@ +時刻流量月表検索結果 +水系名,天塩川 +河川名,天塩川 +観測所名,茂志利 +観測所記号,301011281104010 +# +#日付,1時データ,1時フラグ,,,,,,,,24時データ,24時フラグ +# フラグの意味: *:暫定値, $:欠測, #:閉局, -:未登録 +# +,1時,,2時,,3時,,4時,,5時,,6時,,7時,,8時,,9時,,10時,,11時,,12時,,13時,,14時,,15時,,16時,,17時,,18時,,19時,,20時,,21時,,22時,,23時,,24時, +2004/02/01, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/02, 0.38, , 0.38, , 0.38, , 0.45, , 0.52, , 0.52, , 0.59, , 0.59, , 0.59, , 0.59, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/02/03, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/02/04, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/02/05, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/02/06, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 
0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, +2004/02/07, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/08, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/09, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/10, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/11, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/12, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.32, , 0.32, , 0.32, +2004/02/13, 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/14, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/15, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/16, 0.38, , 
0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.32, , 0.32, , 0.32, , 0.32, +2004/02/17, 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/18, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.38, , 0.38, , 0.38, , 0.32, , 0.32, +2004/02/19, 0.32, , 0.26, , 0.26, , 0.32, , 0.38, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.45, , 0.45, , 0.45, , 0.45, , 0.38, , 0.38, +2004/02/20, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/21, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/22, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, , 0.32, +2004/02/23, 0.32, , 0.38, , 0.45, , 0.68, , 0.86, , 1.16, , 1.40, , 1.65, , 1.92, , 2.38, , 2.87, , 3.23, , 3.61, , 4.01, , 4.22, , 4.22, , 4.22, , 4.22, , 4.22, , 4.22, , 4.22, , 4.22, , 4.22, , 4.22, +2004/02/24, 4.22, , 4.22, , 4.22, , 4.01, , 4.01, , 4.01, , 3.81, , 3.81, , 3.81, , 3.81, , 3.61, , 3.42, , 3.42, , 3.05, , 2.87, , 2.54, , 2.38, , 2.22, , 2.07, , 1.92, , 1.78, , 1.78, , 1.65, , 1.52, +2004/02/25, 1.52, , 1.40, , 1.28, , 1.28, , 1.28, , 1.16, , 1.06, , 1.06, , 0.95, , 0.95, , 0.86, , 0.86, , 0.76, , 0.76, , 0.76, , 0.68, , 0.68, , 0.68, , 0.68, , 0.68, , 0.59, , 0.59, , 
0.59, , 0.59, +2004/02/26, 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.59, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.45, , 0.45, , 0.45, +2004/02/27, 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/28, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, +2004/02/29, 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.38, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, diff --git a/tests/test_data/japan_301011281104010_kind7_2004.dat b/tests/test_data/japan_301011281104010_kind7_2004.dat new file mode 100644 index 0000000..467aa62 --- /dev/null +++ b/tests/test_data/japan_301011281104010_kind7_2004.dat @@ -0,0 +1,23 @@ +日流量年表検索結果 +水系名,天塩川 +河川名,天塩川 +観測所名,茂志利 +観測所記号,301011281104010 +# +#月,1日データ,1日フラグ,,,,,,,,31日データ,31日フラグ,月平均データ,月平均フラグ +# フラグの意味: $:欠測, -:未登録 +# +,1日,,2日,,3日,,4日,,5日,,6日,,7日,,8日,,9日,,10日,,11日,,12日,,13日,,14日,,15日,,16日,,17日,,18日,,19日,,20日,,21日,,22日,,23日,,24日,,25日,,26日,,27日,,28日,,29日,,30日,,31日, +2004年 +1月, 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.52, , 0.55, , 0.52, , 0.51, , 0.48, , 0.52, , 0.46, , 0.45, , 0.45, , 0.38, , 0.38, , 0.39, , 0.55, , 0.45, , 0.45, , 0.45, , 0.45, , 0.45, , 0.44, , 0.36, , 0.38, , 0.38, , 0.38, , 0.46, , 0.45, , 0.45, +2月, 0.40, , 0.50, , 0.45, , 0.45, , 0.45, , 0.45, , 0.40, , 0.38, , 0.38, , 0.38, , 0.38, , 0.37, , 0.35, , 0.38, , 0.38, , 0.37, , 0.35, , 0.40, , 0.43, , 0.38, , 0.38, , 0.35, , 2.80, , 3.09, , 0.90, , 0.55, , 0.43, , 0.38, , 0.42, +3月, 0.45, , 0.45, , 0.41, , 0.38, , 0.38, , 0.40, , 0.39, , 0.38, , 0.33, , 0.33, , 1.13, , 1.54, , 1.24, , 0.94, , 0.70, , 0.58, , 
1.05, , 4.24, , 2.54, , 1.64, , 1.29, , 1.09, , 1.06, , 1.16, , 1.44, , 1.35, , 1.28, , 1.20, , 1.34, , 1.97, , 6.77, +4月, 4.80, , 3.03, , 2.23, , 1.84, , 1.67, , 2.34, , 4.44, , 4.44, , 3.14, , 2.69, , 2.41, , 2.70, , 3.44, , 5.34, , 5.28, , 3.95, , 3.31, , 3.18, , 5.79, , 40.70, , 37.56, , 13.77, , 7.57, , 5.97, , 5.31, , 4.72, , 4.70, , 5.41, , 7.46, , 10.89, +5月, 15.00, , 18.41, , 21.27, , 88.62, , 49.96, , 40.43, , 37.09, , 31.10, , 31.80, , 41.61, , 50.95, , 46.15, , 41.84, , 65.70, , 51.47, , 42.78, , 35.16, , 37.39, , 34.31, , 32.77, , 31.72, , 31.05, , 26.19, , 22.41, , 18.20, , 14.88, , 15.48, , 14.84, , 16.45, , 15.29, , 15.66, +6月, 14.65, , 9.67, , 6.03, , 3.85, , 2.56, , 1.88, , 1.60, , 1.57, , 1.50, , 1.46, , 1.37, , 1.28, , 1.28, , 1.15, , 1.21, , 1.05, , 0.99, , 0.92, , 0.92, , 0.92, , 0.92, , 0.92, , 0.92, , 1.06, , 0.92, , 0.87, , 0.82, , 0.79, , 0.72, , 1.18, +7月, 2.08, , 1.29, , 1.03, , 0.92, , 0.88, , 2.03, , 4.93, , 3.66, , 1.32, , 1.83, , 2.03, , 1.61, , 1.37, , 1.23, , 1.12, , 1.03, , 1.03, , 2.87, , 1.14, , 1.03, , 1.08, , 1.00, , 1.03, , 0.99, , 0.92, , 0.92, , 0.92, , 1.21, , 1.01, , 0.92, , 0.96, +8月, 0.92, , 0.92, , 0.88, , 1.00, , 0.88, , 0.82, , 0.82, , 0.78, , 0.72, , 0.72, , 0.72, , 0.72, , 0.72, , 0.72, , 0.76, , 0.82, , 0.80, , 0.72, , 0.70, , 0.96, , 1.08, , 1.00, , 0.80, , 0.72, , 0.72, , 0.72, , 0.68, , 0.63, , 0.63, , 0.63, , 0.68, +9月, 1.10, , 0.76, , 0.75, , 0.77, , 0.72, , 0.70, , 0.63, , 0.66, , 3.18, , 0.86, , 0.78, , 0.72, , 1.47, , 4.59, , 2.78, , 1.39, , 1.16, , 1.03, , 5.68, , 2.92, , 1.99, , 1.70, , 1.77, , 1.59, , 4.13, , 2.91, , 2.34, , 1.68, , 1.27, , 1.15, +10月, 1.06, , 1.14, , 1.09, , 1.00, , 0.92, , 0.92, , 0.87, , 0.82, , 0.82, , 0.91, , 0.92, , 0.92, , 0.92, , 0.86, , 0.82, , 0.82, , 0.82, , 0.82, , 0.81, , 0.64, , 0.63, , 0.70, , 1.89, , 2.39, , 2.30, , 2.26, , 1.84, , 1.57, , 2.27, , 2.78, , 2.69, +11月, 2.56, , 2.79, , 2.24, , 2.00, , 2.00, , 1.86, , 1.91, , 1.62, , 1.54, , 1.41, , 1.33, , 4.45, , 4.29, , 
3.48, , 4.02, , 6.48, , 4.71, , 3.73, , 3.52, , 3.72, , 3.52, , 3.18, , 3.39, , 4.46, , 4.36, , 3.58, , 5.06, , 3.95, , 3.17, , 2.73, +12月, 2.41, , 2.01, , 2.18, , 5.36, , 3.92, , 3.10, , 2.56, , 2.43, , 2.14, , 2.01, , 2.57, , 2.14, , 1.97, , 1.78, , 1.57, , 1.55, , 1.46, , 1.32, , 1.23, , 1.10, , 1.21, , 1.15, , 1.15, , 1.14, , 1.03, , 1.03, , 0.93, , 0.91, , 0.82, , 0.98, , 0.82, diff --git a/tests/test_data/japan_301011281104010_kind7_2005.dat b/tests/test_data/japan_301011281104010_kind7_2005.dat new file mode 100644 index 0000000..85da872 --- /dev/null +++ b/tests/test_data/japan_301011281104010_kind7_2005.dat @@ -0,0 +1,23 @@ +日流量年表検索結果 +水系名,天塩川 +河川名,天塩川 +観測所名,茂志利 +観測所記号,301011281104010 +# +#月,1日データ,1日フラグ,,,,,,,,31日データ,31日フラグ,月平均データ,月平均フラグ +# フラグの意味: $:欠測, -:未登録 +# +,1日,,2日,,3日,,4日,,5日,,6日,,7日,,8日,,9日,,10日,,11日,,12日,,13日,,14日,,15日,,16日,,17日,,18日,,19日,,20日,,21日,,22日,,23日,,24日,,25日,,26日,,27日,,28日,,29日,,30日,,31日, +2005年 +1月, 1.97, , 0.96, , 1.00, , 1.00, , 1.12, , 1.04, , 1.00, , 1.00, , 1.00, , 1.05, , 1.32, , 2.15, , 1.00, , 0.91, , 2.17, , 2.26, , 1.00, , 1.00, , 0.88, , 0.76, , 0.76, , 0.89, , 0.84, , 0.76, , 1.20, , 2.68, , 0.90, , 0.76, , 0.76, , 0.73, , 1.60, +2月, 1.02, , 0.77, , 0.76, , 0.67, , 0.66, , 0.66, , 0.66, , 0.63, , 0.56, , 0.56, , 1.02, , 0.66, , 0.66, , 0.63, , 0.56, , 0.56, , 0.57, , 1.19, , 0.79, , 0.62, , 0.61, , 1.62, , 1.04, , 0.65, , 0.51, , 0.47, , 0.66, , 0.47, +3月, 0.47, , 0.47, , 0.40, , 0.39, , 0.39, , 0.39, , 0.40, , 0.51, , 0.79, , 0.50, , 0.47, , 1.22, , 0.49, , 0.41, , 0.39, , 0.39, , 0.39, , 0.39, , 0.42, , 0.40, , 0.39, , 0.42, , 0.63, , 0.76, , 0.76, , 0.67, , 0.58, , 0.55, , 0.69, , 0.88, , 0.78, +4月, 0.68, , 0.65, , 0.66, , 0.66, , 0.66, , 1.19, , 3.38, , 7.04, , 3.92, , 2.74, , 2.13, , 1.94, , 2.29, , 2.04, , 2.69, , 3.56, , 4.00, , 2.99, , 2.99, , 3.78, , 4.32, , 3.64, , 2.94, , 2.61, , 3.67, , 10.97, , 14.88, , 26.36, , 40.94, , 23.92, +5月, 12.41, , 30.64, , 29.15, , 19.86, , 16.57, , 12.05, , 11.48, , 9.40, , 6.74, , 
6.39, , 5.48, , 5.99, , 8.95, , 17.76, , 20.68, , 20.72, , 25.72, , 26.65, , 58.97, , 55.11, , 48.46, , 61.94, , 49.94, , 35.51, , 29.67, , 26.47, , 24.49, , 25.44, , 35.05, , 50.65, , 55.01, +6月, 47.87, , 46.89, , 41.20, , 32.92, , 27.00, , 25.87, , 23.64, , 23.92, , 23.92, , 21.11, , 17.55, , 16.30, , 14.60, , 11.16, , 7.71, , 5.77, , 3.93, , 2.45, , 1.59, , 1.36, , 1.41, , 1.32, , 1.26, , 1.21, , 1.14, , 1.44, , 1.22, , 1.15, , 1.08, , 1.00, +7月, 1.00, , 0.97, , 0.88, , 0.88, , 2.02, , 3.01, , 3.05, , 2.84, , 1.00, , 0.88, , 1.06, , 0.91, , 0.84, , 0.76, , 0.76, , 0.75, , 0.66, , 1.03, , 1.01, , 1.22, , 0.87, , 0.68, , 0.76, , 0.88, , 0.88, , 0.77, , 2.58, , 2.14, , 1.29, , 1.09, , 1.00, +8月, 0.97, , 0.88, , 12.69, , 16.60, , 6.64, , 3.24, , 2.55, , 2.17, , 1.83, , 1.65, , 1.49, , 1.35, , 1.22, , 1.13, , 1.10, , 1.02, , 1.13, , 1.13, , 1.09, , 2.19, , 2.15, , 44.25, , 20.97, , 13.95, , 10.72, , 8.77, , 18.51, , 14.20, , 12.92, , 9.01, , 8.72, +9月, 7.20, , 7.21, , 6.49, , 5.82, , 5.35, , 4.95, , 6.64, , 28.23, , 19.52, , 15.16, , 12.32, , 9.44, , 13.41, , 9.15, , 8.37, , 7.65, , 7.13, , 7.50, , 7.04, , 6.64, , 7.38, , 6.33, , 7.25, , 6.45, , 6.15, , 5.80, , 5.13, , 5.19, , 5.03, , 4.83, +10月, 8.25, , 6.20, , 11.11, , 7.99, , 9.10, , 7.42, , 6.34, , 9.67, , 10.48, , 9.31, , 8.18, , 7.34, , 7.18, , 7.11, , 15.17, , 9.70, , 9.35, , 7.98, , 7.58, , 6.94, , 6.40, , 6.25, , 6.79, , 6.20, , 6.05, , 5.60, , 5.66, , 5.08, , 7.77, , 10.16, , 8.12, +11月, 7.34, , 7.01, , 6.42, , 6.42, , 6.13, , 5.67, , 6.85, , 7.15, , 7.03, , 5.99, , 6.65, , 6.06, , 6.19, , 5.69, , 5.24, , 5.70, , 4.70, , 5.08, , 4.73, , 4.68, , 4.60, , 4.62, , 4.47, , 4.52, , 4.45, , 4.24, , 4.14, , 3.91, , 4.09, , 3.98, +12月, 4.02, , 4.06, , 4.33, , 3.13, , 4.07, , 4.03, , 3.87, , 3.47, , 3.71, , 3.37, , 3.62, , 4.02, , 3.75, , 3.65, , 3.38, , 3.46, , 3.12, , 3.03, , 3.67, , 3.21, , 3.17, , 3.81, , 3.37, , 3.31, , 2.96, , 3.01, , 3.97, , 5.28, , 6.12, , 5.69, , 5.09, diff --git 
a/tests/test_data/japan_daily.html b/tests/test_data/japan_daily.html deleted file mode 100644 index 2a36f28..0000000 --- a/tests/test_data/japan_daily.html +++ /dev/null @@ -1,903 +0,0 @@ - - - - -»þ¹ïήÎÌ·îɽ¸¡º÷·ë²Ì - - -
- - - - - - - - - - - - - - - -
´Ñ¬½êµ­¹æ´Ñ¬½ê̾¿å·Ï̾²ÏÀî̾
301011281104010ÌлÖÍø¡Ê¤â¤·¤ê¡ËÅ·±öÀîÅ·±öÀî
-
-

2019ǯ1·î¡¡»þ¹ïήÎÌ·îɽ - -

-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ñ°Ì¡§m3/s
¡¡1»þ2»þ3»þ4»þ5»þ6»þ7»þ8»þ9»þ10»þ11»þ12»þ13»þ14»þ15»þ16»þ17»þ18»þ19»þ20»þ21»þ22»þ23»þ24»þ
2019/01/01·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/02·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/03·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/04·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/05·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/06·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/07·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/08·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/10·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/11·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/12·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/13·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/14·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/15·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/16·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/17·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/18·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/19·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/20·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/21·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬2.09·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/22·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.60·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/23·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/24·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/25·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/26·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/27·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/28·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/29·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.60·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/30·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
2019/01/31·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬1.78·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬·ç¬
-
- - diff --git a/tests/test_japan.py b/tests/test_japan.py index 906795c..db73d57 100644 --- a/tests/test_japan.py +++ b/tests/test_japan.py @@ -1,9 +1,8 @@ import os import unittest -from unittest.mock import MagicMock, patch +from unittest.mock import patch import pandas as pd -from pandas.testing import assert_frame_equal from rivretrieve import JapanFetcher, constants @@ -12,44 +11,87 @@ class TestJapanFetcher(unittest.TestCase): def setUp(self): self.fetcher = JapanFetcher() self.test_data_dir = os.path.join(os.path.dirname(__file__), "test_data") + self.gauge_id = "301011281104010" def load_sample_data(self, filename): - with open(os.path.join(self.test_data_dir, filename), "r", encoding="utf-8") as f: - return f.read() + file_path = os.path.join(self.test_data_dir, filename) + try: + with open(file_path, "r", encoding="utf-8") as f: + return f.read() + except FileNotFoundError: + self.fail(f"Test data file not found: {file_path}") - @patch("requests.Session.get") - def test_get_data_discharge(self, mock_get): - sample_html = self.load_sample_data("japan_daily.html") + def mocked_download_data(self, gauge_id, variable, start_date, end_date): + kind = self.fetcher._get_kind(variable) + contents = [] + start_dt = pd.to_datetime(start_date) + end_dt = pd.to_datetime(end_date) - mock_response = MagicMock() - mock_response.text = sample_html - mock_response.encoding = "shift_jis" - mock_response.raise_for_status = MagicMock() - mock_get.return_value = mock_response + if kind == 6: # DISCHARGE_HOURLY_MEAN + # Loop through each month in the range + current_dt = start_dt.replace(day=1) + while current_dt <= end_dt: + year = current_dt.year + month = current_dt.month + if year == 2004 and month == 1: + contents.append(self.load_sample_data(f"japan_{self.gauge_id}_kind6_200401.dat")) + elif year == 2004 and month == 2: + contents.append(self.load_sample_data(f"japan_{self.gauge_id}_kind6_200402.dat")) + + if current_dt.month == 12: + current_dt = 
current_dt.replace(year=year + 1, month=1) + else: + current_dt = current_dt.replace(month=month + 1) + elif kind == 7: # DISCHARGE_DAILY_MEAN + # Loop through each year in the range + for year in range(start_dt.year, end_dt.year + 1): + if year == 2004: + contents.append(self.load_sample_data(f"japan_{self.gauge_id}_kind7_2004.dat")) + elif year == 2005: + contents.append(self.load_sample_data(f"japan_{self.gauge_id}_kind7_2005.dat")) + return contents + + @patch("rivretrieve.japan.JapanFetcher._download_data") + def test_get_data_hourly_discharge(self, mock_download): + mock_download.side_effect = self.mocked_download_data + + variable = constants.DISCHARGE_HOURLY_MEAN + start_date = "2004-01-30" + end_date = "2004-02-02" + + result_df = self.fetcher.get_data(self.gauge_id, variable, start_date, end_date) + self.assertFalse(result_df.empty) + self.assertEqual(result_df.index.name, constants.TIME_INDEX) + self.assertIn(variable, result_df.columns) + + # Check if dates are within the requested range + self.assertTrue((result_df.index >= pd.to_datetime(start_date)).all()) + self.assertTrue((result_df.index <= pd.to_datetime(end_date) + pd.Timedelta(days=1)).all()) + + # Check for expected number of hourly data points (3 days * 24 hours) + self.assertTrue(len(result_df) > 24 * 3) + self.assertTrue(len(result_df) <= 24 * 4) + + @patch("rivretrieve.japan.JapanFetcher._download_data") + def test_get_data_daily_discharge(self, mock_download): + mock_download.side_effect = self.mocked_download_data - gauge_id = "301011281104010" variable = constants.DISCHARGE_DAILY_MEAN - start_date = "2019-01-13" - end_date = "2019-01-17" - - result_df = self.fetcher.get_data(gauge_id, variable, start_date, end_date) - - expected_dates = pd.to_datetime(["2019-01-13", "2019-01-14", "2019-01-15", "2019-01-16", "2019-01-17"]) - expected_values = [2.09, 1.78, 1.78, 2.09, 1.78] - expected_data = { - constants.TIME_INDEX: expected_dates, - constants.DISCHARGE_DAILY_MEAN: expected_values, - } - 
expected_df = pd.DataFrame(expected_data).set_index(constants.TIME_INDEX) - - assert_frame_equal(result_df, expected_df) - mock_get.assert_called_once() - # Check that the params are correct - mock_args, mock_kwargs = mock_get.call_args - self.assertEqual(mock_kwargs["params"]["KIND"], 6) - self.assertEqual(mock_kwargs["params"]["ID"], gauge_id) - self.assertEqual(mock_kwargs["params"]["BGNDATE"], "20190101") - self.assertEqual(mock_kwargs["params"]["ENDDATE"], "20190117") + start_date = "2004-12-25" + end_date = "2005-01-05" + + result_df = self.fetcher.get_data(self.gauge_id, variable, start_date, end_date) + self.assertFalse(result_df.empty) + self.assertEqual(result_df.index.name, constants.TIME_INDEX) + self.assertIn(variable, result_df.columns) + + # Check if dates are within the requested range + self.assertTrue((result_df.index >= pd.to_datetime(start_date)).all()) + self.assertTrue((result_df.index <= pd.to_datetime(end_date)).all()) + + # Check for expected number of daily data points + expected_days = (pd.to_datetime(end_date) - pd.to_datetime(start_date)).days + 1 + self.assertEqual(len(result_df), expected_days) if __name__ == "__main__": From f28362e226f958144cb27756efe641ee9959fad4 Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Wed, 5 Nov 2025 11:45:01 +0000 Subject: [PATCH 4/7] Update Japan fetcher --- examples/test_japan_fetcher.py | 76 +++++++++++++++++----------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/examples/test_japan_fetcher.py b/examples/test_japan_fetcher.py index 18ceed1..6a1f46e 100644 --- a/examples/test_japan_fetcher.py +++ b/examples/test_japan_fetcher.py @@ -1,43 +1,41 @@ -import logging +import argparse import matplotlib.pyplot as plt -from rivretrieve import JapanFetcher, constants - -logging.basicConfig(level=logging.INFO) - -gauge_ids = [ - "301011281104010", -] -variable = constants.DISCHARGE_DAILY_MEAN -start_date = "2019-01-01" -end_date = "2019-12-31" # Fetching a few months to test - 
-plt.figure(figsize=(12, 6)) - -fetcher = JapanFetcher() -for gauge_id in gauge_ids: - print(f"Fetching data for {gauge_id} from {start_date} to {end_date}...") - data = fetcher.get_data(gauge_id=gauge_id, variable=variable, start_date=start_date, end_date=end_date) - if not data.empty: - print(f"Data for {gauge_id}:") - print(data.head()) - print(f"Time series from {data.index.min()} to {data.index.max()}") - plt.plot( - data.index, - data[constants.DISCHARGE_DAILY_MEAN], - label=gauge_id, - marker="o", - ) +from rivretrieve import constants +from rivretrieve.japan import JapanFetcher + + +def main(): + parser = argparse.ArgumentParser(description="Test JapanFetcher") + parser.add_argument("--gauge_id", type=str, default="301011281104010", help="Gauge ID to test") + parser.add_argument("--variable", type=str, default=constants.DISCHARGE_DAILY_MEAN, help="Variable to fetch") + parser.add_argument("--start_date", type=str, default="2004-01-01", help="Start date YYYY-MM-DD") + parser.add_argument("--end_date", type=str, default="2004-12-31", help="End date YYYY-MM-DD") + args = parser.parse_args() + + fetcher = JapanFetcher() + print(f"Fetching data for {args.gauge_id} from {args.start_date} to {args.end_date} for {args.variable}...") + + df = fetcher.get_data( + gauge_id=args.gauge_id, variable=args.variable, start_date=args.start_date, end_date=args.end_date + ) + + if not df.empty: + print(f"Data for {args.gauge_id}:") + print(df.head()) + print(f"Time series from {df.index.min()} to {df.index.max()}") + df.plot(y=args.variable) + plt.title(f"{args.gauge_id} - {args.variable}") + plt.xlabel("Time") + plt.ylabel(args.variable) + plt.legend() + plot_filename = f"japan_{args.variable}_plot.png" + plt.savefig(plot_filename) + print(f"Plot saved to {plot_filename}") else: - print(f"No data found for {gauge_id}") - -plt.xlabel(constants.TIME_INDEX) -plt.ylabel(f"{constants.DISCHARGE_DAILY_MEAN} (m3/s)") -plt.title("Japan River Discharge - Full Time Series") 
-plt.legend() -plt.grid(True) -plt.tight_layout() -plot_path = "japan_discharge_plot.png" -plt.savefig(plot_path) -print(f"Plot saved to {plot_path}") + print(f"No data found for {args.gauge_id}") + + +if __name__ == "__main__": + main() From 0635831133b21624f53deb7ec4a68e79c98c2ec8 Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Thu, 6 Nov 2025 19:27:49 +0000 Subject: [PATCH 5/7] Removing metadata download option --- rivretrieve/japan.py | 89 ++------------------------------------------ 1 file changed, 4 insertions(+), 85 deletions(-) diff --git a/rivretrieve/japan.py b/rivretrieve/japan.py index cea2f9b..94d3f95 100644 --- a/rivretrieve/japan.py +++ b/rivretrieve/japan.py @@ -50,6 +50,10 @@ def get_cached_metadata() -> pd.DataFrame: """Retrieves a DataFrame of available Japanese gauge IDs and metadata.""" return utils.load_cached_metadata_csv("japan") + def get_metadata(self) -> pd.DataFrame: + """Fetches metadata from MLIT Water Information System.""" + raise NotImplementedError("Currently, downloading metadata is not suppoted for this fetcher.") + @staticmethod def get_available_variables() -> tuple[str, ...]: return ( @@ -292,88 +296,3 @@ def get_data( except Exception as e: logger.error(f"Failed to get data for site {gauge_id}, variable {variable}: {e}", exc_info=True) return pd.DataFrame(columns=[constants.TIME_INDEX, variable]) - - def get_metadata(self, gauge_ids: Optional[List[str]] = None) -> pd.DataFrame: - """Fetches metadata for given gauge IDs from the MLIT Water Information System.""" - if gauge_ids is None: - cached_meta = self.get_cached_metadata() - gauge_ids = cached_meta.index.tolist() - - all_station_data = [] - s = utils.requests_retry_session() - headers = {"User-Agent": "Mozilla/5.0", "Referer": self.BASE_URL} - - for gauge_id in gauge_ids: - logger.info(f"Fetching metadata for station: {gauge_id}") - site_info_url = f"{self.SITE_INFO_URL}?ID={gauge_id}" - try: - response = s.get(site_info_url, headers=headers) - 
response.raise_for_status() - response.encoding = "EUC-JP" - soup = BeautifulSoup(response.text, "html.parser") - - station_data = {constants.GAUGE_ID: gauge_id} - - info_table = soup.find("table", {"align": "CENTER", "width": "600"}) - if info_table: - for row in info_table.find_all("tr"): - cells = row.find_all("td") - if len(cells) == 2: - key = cells[0].text.strip() - value = cells[1].text.strip() - if key == "観測所名": - station_data[constants.STATION_NAME] = value - elif key == "所在地": - station_data["location"] = value - elif key == "水系名": - station_data[constants.RIVER] = value - elif key == "河川名": - station_data["river_name_jp"] = value - elif key == "緯度経度": - try: - lat_match = re.search(r"N(\d+)度(\d+)分(\d+)秒", value) - lon_match = re.search(r"E(\d+)度(\d+)分(\d+)秒", value) - if lat_match: - lat = ( - float(lat_match.group(1)) - + float(lat_match.group(2)) / 60 - + float(lat_match.group(3)) / 3600 - ) - station_data[constants.LATITUDE] = lat - if lon_match: - lon = ( - float(lon_match.group(1)) - + float(lon_match.group(2)) / 60 - + float(lon_match.group(3)) / 3600 - ) - station_data[constants.LONGITUDE] = lon - except Exception as e: - logger.warning(f"Could not parse lat/lon for {gauge_id}: {value} - {e}") - - # Fetch available kinds for the station - kind_map = {} - # Commenting out the SiteInfo fetch for KINDs due to 403 errors - # try: - # # This part is still blocked by 403, so kind_map will be empty - # pass # s_kinds = utils.requests_retry_session() - # # response = s_kinds.get(site_info_url, headers=headers) - # # response.raise_for_status() - # # ... parsing logic ... 
- # except Exception as e: - # logger.error(f"Error fetching/parsing SiteInfo for {gauge_id} for KINDS: {e}") - station_data["available_kinds"] = kind_map - all_station_data.append(station_data) - - except requests.exceptions.RequestException as e: - if e.response and e.response.status_code == 403: - logger.error(f"Access forbidden for SiteInfo {gauge_id}: {e}") - else: - logger.error(f"Error fetching SiteInfo for {gauge_id}: {e}") - except Exception as e: - logger.error(f"Error parsing SiteInfo for {gauge_id}: {e}", exc_info=True) - - df = pd.DataFrame(all_station_data) - if not df.empty: - return df.set_index(constants.GAUGE_ID) - else: - return pd.DataFrame(columns=[constants.GAUGE_ID]).set_index(constants.GAUGE_ID) From 3b87565b0a8dc72bdfb92a845208997e07a92e4a Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Thu, 6 Nov 2025 19:28:40 +0000 Subject: [PATCH 6/7] Formatting --- rivretrieve/japan.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rivretrieve/japan.py b/rivretrieve/japan.py index 94d3f95..9e6ebb1 100644 --- a/rivretrieve/japan.py +++ b/rivretrieve/japan.py @@ -8,7 +8,6 @@ from typing import List, Optional import pandas as pd -import requests from bs4 import BeautifulSoup from dateutil.relativedelta import relativedelta From 4cb48ddd188e37998c5148f4e2e19edef7089beb Mon Sep 17 00:00:00 2001 From: Frederik Kratzert Date: Thu, 6 Nov 2025 20:44:06 +0000 Subject: [PATCH 7/7] Fix missing data flags --- rivretrieve/japan.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/rivretrieve/japan.py b/rivretrieve/japan.py index 9e6ebb1..fb97ab0 100644 --- a/rivretrieve/japan.py +++ b/rivretrieve/japan.py @@ -7,6 +7,7 @@ from datetime import datetime, timedelta from typing import List, Optional +import numpy as np import pandas as pd from bs4 import BeautifulSoup from dateutil.relativedelta import relativedelta @@ -191,7 +192,7 @@ def _parse_data( col_names.append(f"{i}時フラグ") df = pd.read_csv( - csv_io, header=None, 
names=col_names, na_values=["-9999.00"], dtype={constants.TIME_INDEX: str} + csv_io, header=None, names=col_names, na_values=["-9999.99"], dtype={constants.TIME_INDEX: str} ) df[constants.TIME_INDEX] = pd.to_datetime( df[constants.TIME_INDEX], format="%Y/%m/%d", errors="coerce" @@ -204,6 +205,10 @@ def _parse_data( ) df_long["Hour"] = df_long["Hour"].str.replace("時", "").astype(int) df_long["Value"] = pd.to_numeric(df_long["Value"], errors="coerce") + + # Different values are used to encode NaN. All seem to be -9999.XX + df_long.loc[df_long["Value"] <= -9999, "Value"] = np.nan + df_long = df_long.dropna(subset=["Value"]) df_long[constants.TIME_INDEX] = df_long.apply( @@ -230,7 +235,7 @@ def _parse_data( col_names.append(f"{i}日フラグ") df = pd.read_csv( - csv_io, header=None, names=col_names, na_values=[" ", "-9999.00"], encoding="utf-8" + csv_io, header=None, names=col_names, na_values=[" ", "-9999.99"], encoding="utf-8" ) month_map = {f"{i}月": i for i in range(1, 13)} @@ -244,6 +249,9 @@ def _parse_data( ) df_long["Day"] = df_long["Day"].str.replace("日", "").astype(int) df_long["Value"] = pd.to_numeric(df_long["Value"], errors="coerce") + + # Different values are used to encode NaN. All seem to be -9999.XX + df_long.loc[df_long["Value"] <= -9999, "Value"] = np.nan df_long = df_long.dropna(subset=["Value"]) df_long[constants.TIME_INDEX] = pd.to_datetime(df_long[["Year", "Month", "Day"]], errors="coerce")