From 79e54d3a04a903bad8853519bc8e1e24b1a9e474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Thu, 10 Apr 2025 12:52:01 +0200 Subject: [PATCH 01/53] Minimal viable product: partial bulk download --- open_mastr/mastr.py | 14 ++- open_mastr/utils/helpers.py | 12 ++- .../xml_download/utils_download_bulk.py | 94 +++++++++++++++++++ 3 files changed, 112 insertions(+), 8 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 646a50b1..1ce558c6 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -2,7 +2,10 @@ from sqlalchemy import inspect, create_engine # import xml dependencies -from open_mastr.xml_download.utils_download_bulk import download_xml_Mastr +from open_mastr.xml_download.utils_download_bulk import ( + download_xml_Mastr, + download_xml_Mastr_partial +) from open_mastr.xml_download.utils_write_to_database import ( write_mastr_xml_to_database, ) @@ -224,7 +227,7 @@ def download( date = transform_date_parameter(self, method, date, **kwargs) - if method == "bulk": + if method == "bulk" or method == 'partial bulk': # Find the name of the zipped xml folder bulk_download_date = parse_date_string(date) xml_folder_path = os.path.join(self.output_dir, "data", "xml_download") @@ -233,7 +236,10 @@ def download( xml_folder_path, f"Gesamtdatenexport_{bulk_download_date}.zip", ) - download_xml_Mastr(zipped_xml_file_path, date, xml_folder_path) + if method == 'bulk': + download_xml_Mastr(zipped_xml_file_path, date, xml_folder_path) + else: + download_xml_Mastr_partial(zipped_xml_file_path, date, data, xml_folder_path) print( f"\nWould you like to speed up the bulk download?\n" @@ -248,7 +254,7 @@ def download( bulk_cleansing=bulk_cleansing, bulk_download_date=bulk_download_date, ) - + if method == "API": validate_api_credentials() diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index ad4f4dd8..f08a7049 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -122,8 +122,8 @@ def 
validate_parameter_format_for_download_method( def validate_parameter_method(method) -> None: - if method not in ["bulk", "API"]: - raise ValueError("parameter method has to be either 'bulk' or 'API'.") + if method not in ["bulk", "partial bulk", "API"]: + raise ValueError("parameter method has to be either 'bulk', 'partial bulk' or 'API'.") def validate_parameter_api_location_types(api_location_types) -> None: @@ -172,7 +172,7 @@ def validate_parameter_api_limit(api_limit) -> None: def validate_parameter_date(method, date) -> None: if date is None: # default return - if method == "bulk": + if method == "bulk" or method == "partial bulk": if date not in ["today", "existing"]: try: _ = parse(date) @@ -216,6 +216,10 @@ def validate_parameter_data(method, data) -> None: raise ValueError( f"Allowed values for parameter data with bulk method are {BULK_DATA}" ) + if method == "partial bulk" and value not in BULK_DATA: + raise ValueError( + f"Allowed values for parameter data with bulk method are {BULK_DATA}" + ) if method == "API" and value not in API_DATA: raise ValueError( f"Allowed values for parameter data with API method are {API_DATA}" @@ -298,7 +302,7 @@ def transform_data_parameter( def transform_date_parameter(self, method, date, **kwargs): - if method == "bulk": + if method == "bulk" or method == "partial bulk": date = kwargs.get("bulk_date", date) date = "today" if date is None else date if date == "existing": diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 02e69e84..42861cef 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -3,13 +3,17 @@ import time from importlib.metadata import PackageNotFoundError, version from zipfile import BadZipfile, ZipFile +import shutil +from pathlib import Path import numpy as np import requests from tqdm import tqdm +import unzip_http # setup logger from open_mastr.utils.config import setup_logger 
+from open_mastr.utils.constants import BULK_INCLUDE_TABLES_MAP try: USER_AGENT = ( @@ -203,3 +207,93 @@ def download_xml_Mastr( time_b = time.perf_counter() print(f"Download is finished. It took {int(np.around(time_b - time_a))} seconds.") print(f"MaStR was successfully downloaded to {xml_folder_path}.") + + +def download_xml_Mastr_partial( + save_path: str, bulk_date_string: str, bulk_data_list: list, xml_folder_path: str +) -> None: + """Downloads the zipped MaStR. + + Parameters + ----------- + save_path: str + The path where the downloaded MaStR zipped folder will be saved. + """ + + if os.path.exists(save_path): + try: + _ = ZipFile(save_path) + except BadZipfile: + log.info(f"Bad Zip file is deleted: {save_path}") + os.remove(save_path) + else: + print("MaStR already downloaded.") + return None + + if bulk_date_string != "today": + raise OSError( + "There exists no file for given date. MaStR can only be downloaded " + "from the website if today's date is given." + ) + shutil.rmtree(xml_folder_path, ignore_errors=True) + os.makedirs(xml_folder_path, exist_ok=True) + + print_message = ( + "Download has started, this can take several minutes." + "The download bar is only a rough estimate." + ) + warning_message = ( + "Warning: The servers from MaStR restrict the download speed." + " You may want to download it another time." + ) + print(print_message) + + now = time.localtime() + url = gen_url(now) + + time_a = time.perf_counter() + r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) + if r.status_code == 404: + log.warning( + "Download file was not found. Assuming that the new file was not published yet and retrying with yesterday." 
+ ) + now = time.localtime( + time.mktime(now) - (24 * 60 * 60) + ) # subtract 1 day from the date + url = gen_url(now) + r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) + if r.status_code == 404: + url = gen_url(now, use_version="before") # Use lower MaStR Version + log.warning( + f"Download file was not found. Assuming that the version of MaStR has changed and retrying with download link: {url}" + ) + r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) + if r.status_code == 404: + url = gen_url(now, use_version="after") # Use higher MaStR Version + log.warning( + f"Download file was not found. Assuming that the version of MaStR has changed and retrying with download link: {url}" + ) + r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) + + if r.status_code == 404: + log.error("Could not download file: download URL not found") + return + + remote_zip_file = unzip_http.RemoteZipFile(url) + remote_zip_names = [remote_zip_name.lower().split('_')[0].split('.')[0] for remote_zip_name in remote_zip_file.namelist()] + + remote_index_list = [] + for bulk_data_name in bulk_data_list: + for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: + remote_index_list = [remote_index for remote_index, remote_zip_name in enumerate(remote_zip_names) if remote_zip_name == bulk_file_name] + for remote_index in remote_index_list: + remote_zip_file.extract(remote_zip_file.namelist()[remote_index],path=Path(save_path[:-4])) + + remote_zip_file.extract('Katalogwerte.xml',path=Path(save_path[:-4])) + + shutil.make_archive(save_path[:-4], 'zip', save_path[:-4]) + shutil.rmtree(save_path[:-4]) + + time_b = time.perf_counter() + print(f"Download is finished. 
It took {int(np.around(time_b - time_a))} seconds.") + print(f"MaStR was successfully downloaded to {xml_folder_path}.") From 6a86ac533a80f3dc1cfbc404685ddfa9ca266f5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Fri, 25 Apr 2025 12:14:28 +0200 Subject: [PATCH 02/53] Remove "partial-bulk" from helpers functions --- open_mastr/utils/helpers.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index f08a7049..1543e222 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -122,8 +122,8 @@ def validate_parameter_format_for_download_method( def validate_parameter_method(method) -> None: - if method not in ["bulk", "partial bulk", "API"]: - raise ValueError("parameter method has to be either 'bulk', 'partial bulk' or 'API'.") + if method not in ["bulk", "API"]: + raise ValueError("parameter method has to be either 'bulk', or 'API'.") def validate_parameter_api_location_types(api_location_types) -> None: @@ -172,7 +172,7 @@ def validate_parameter_api_limit(api_limit) -> None: def validate_parameter_date(method, date) -> None: if date is None: # default return - if method == "bulk" or method == "partial bulk": + if method == "bulk": if date not in ["today", "existing"]: try: _ = parse(date) @@ -216,10 +216,6 @@ def validate_parameter_data(method, data) -> None: raise ValueError( f"Allowed values for parameter data with bulk method are {BULK_DATA}" ) - if method == "partial bulk" and value not in BULK_DATA: - raise ValueError( - f"Allowed values for parameter data with bulk method are {BULK_DATA}" - ) if method == "API" and value not in API_DATA: raise ValueError( f"Allowed values for parameter data with API method are {API_DATA}" @@ -302,7 +298,7 @@ def transform_data_parameter( def transform_date_parameter(self, method, date, **kwargs): - if method == "bulk" or method == "partial bulk": + if method == "bulk": date = kwargs.get("bulk_date", 
date) date = "today" if date is None else date if date == "existing": From 3580bfd2096bb04cf198b74716e80363bdef52f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Fri, 25 Apr 2025 12:17:08 +0200 Subject: [PATCH 03/53] Remove "partial bulk" from Mastr.download function --- open_mastr/mastr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 1ce558c6..03165375 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -227,7 +227,7 @@ def download( date = transform_date_parameter(self, method, date, **kwargs) - if method == "bulk" or method == 'partial bulk': + if method == "bulk": # Find the name of the zipped xml folder bulk_download_date = parse_date_string(date) xml_folder_path = os.path.join(self.output_dir, "data", "xml_download") @@ -236,7 +236,7 @@ def download( xml_folder_path, f"Gesamtdatenexport_{bulk_download_date}.zip", ) - if method == 'bulk': + if data is None: download_xml_Mastr(zipped_xml_file_path, date, xml_folder_path) else: download_xml_Mastr_partial(zipped_xml_file_path, date, data, xml_folder_path) From 6c5a052201cc6a4e5c9aca541b5128e5fdbe072a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Fri, 25 Apr 2025 14:12:07 +0200 Subject: [PATCH 04/53] Add download completeness check, add sequential download functionality --- .../xml_download/utils_download_bulk.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 42861cef..1a8af87d 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -208,6 +208,21 @@ def download_xml_Mastr( print(f"Download is finished. 
It took {int(np.around(time_b - time_a))} seconds.") print(f"MaStR was successfully downloaded to {xml_folder_path}.") +def check_download_completeness( + save_path: str,bulk_data_list: list +) -> list: + """Checks if an existing download contains the xml-files corresponding to the bulk_data_list. + """ + with ZipFile(save_path, 'r') as zip_ref: + existing_files = [zip_name.lower().split('_')[0].split('.')[0] for zip_name in zip_ref.namelist()] + + missing_data_set = set() + for bulk_data_name in bulk_data_list: + for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: + if bulk_file_name not in existing_files: + missing_data_set.add(bulk_data_name) + return list(missing_data_set) + def download_xml_Mastr_partial( save_path: str, bulk_date_string: str, bulk_data_list: list, xml_folder_path: str @@ -227,8 +242,12 @@ def download_xml_Mastr_partial( log.info(f"Bad Zip file is deleted: {save_path}") os.remove(save_path) else: - print("MaStR already downloaded.") - return None + bulk_data_list = check_download_completeness(save_path,bulk_data_list) + if bool(bulk_data_list): + print(f"MaStR is missing the following data: {bulk_data_list}") + else: + print("MaStR already downloaded.") + return None if bulk_date_string != "today": raise OSError( From 17e8347767343736d1a94bc47923c5039ff933d1 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 29 Apr 2025 06:02:18 +0200 Subject: [PATCH 05/53] Remove default branch for test pypi publication Branch was always set on workflow trigger and manually selected branch ignored --- .github/workflows/test-pypi-publish.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/test-pypi-publish.yml b/.github/workflows/test-pypi-publish.yml index 5fae8627..83c19222 100644 --- a/.github/workflows/test-pypi-publish.yml +++ b/.github/workflows/test-pypi-publish.yml @@ -13,8 +13,6 @@ jobs: environment: pypi-publish steps: - uses: actions/checkout@v4 - with: - ref: release - name: Set up Python 3.10 uses: 
actions/setup-python@v3 with: From c00d6057256b9bcf6c36a1a4923bb503753616ff Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 29 Apr 2025 06:21:22 +0200 Subject: [PATCH 06/53] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1dbe5e5b..44934ba5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ ## [v0.XX.X] unreleased - 202X-XX-XX ### Added ### Changed +- Fix package publication workflow + [#636](https://github.com/OpenEnergyPlatform/open-MaStR/pull/636) ### Removed From aa4bafc91abba7b05dde75a7a0fa5b0296e5867c Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 29 Apr 2025 06:24:54 +0200 Subject: [PATCH 07/53] Remove trailing white space in changelog to trigger tests --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44934ba5..53591623 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,7 +37,7 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ [#621](https://github.com/OpenEnergyPlatform/open-MaStR/pull/621) ### Removed - Moved old code artefacts from `scripts` folder to paper specific - [repository](https://github.com/FlorianK13/verify-marktstammdaten) + [repository](https://github.com/FlorianK13/verify-marktstammdaten) [#561](https://github.com/OpenEnergyPlatform/open-MaStR/pull/561) - Remove old dependencies and broken README links [#619](https://github.com/OpenEnergyPlatform/open-MaStR/pull/619) From d18fafe7b1cc1c0ef80166a939ed9a16a55457ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Mon, 5 May 2025 16:20:30 +0200 Subject: [PATCH 08/53] Add unzip_http as own function instead of install and import --- open_mastr/utils/unzip_http.py | 408 +++++++++++++++++++++++++++++++++ 1 file changed, 408 insertions(+) create mode 100644 open_mastr/utils/unzip_http.py diff --git 
a/open_mastr/utils/unzip_http.py b/open_mastr/utils/unzip_http.py new file mode 100644 index 00000000..548a7890 --- /dev/null +++ b/open_mastr/utils/unzip_http.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2022 Saul Pwanson +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +# Originally from +# https://github.com/saulpw/unzip-http +# Adjusted for our use case + +""" +usage: unzip_http [-h] [-l] [-f] [-o] url [files ...] + +Extract individual files from .zip files over http without downloading the +entire archive. HTTP server must send `Accept-Ranges: bytes` and +`Content-Length` in headers. + +positional arguments: + url URL of the remote zip file + files Files to extract. If no filenames given, displays .zip + contents (filenames and sizes). Each filename can be a + wildcard glob. 
+ +options: + -h, --help show this help message and exit + -l, --list List files in the remote zip file + -f, --full-filepaths Recreate folder structure from zip file when extracting + (instead of extracting the files to the current + directory) + -o, --stdout Write files to stdout (if multiple files: concatenate + them to stdout, in zipfile order) +""" + +import sys +import os +import io +import math +import time +import zlib +import struct +import fnmatch +import argparse +import pathlib +import urllib.parse +import zipfile + + +__version__ = '0.6' + + +def error(s): + raise Exception(s) + +def warning(s): + print(s, file=sys.stderr) + +def get_bits(val:int, *args): + 'Generate bitfields (one for each arg) from LSB to MSB.' + for n in args: + x = val & (2**n-1) + val >>= n + yield x + + +class RemoteZipInfo: + def __init__(self, filename:str='', + date_time:int = 0, + header_offset:int = 0, + compress_type:int = 0, + compress_size:int = 0, + file_size:int = 0): + self.filename = filename + self.header_offset = header_offset + self.compress_type = compress_type + self.compress_size = compress_size + self.file_size = file_size + + sec, mins, hour, day, mon, year = get_bits(date_time, 5, 6, 5, 5, 4, 7) + self.date_time = (year+1980, mon, day, hour, mins, sec) + + def is_dir(self): + return self.filename.endswith('/') + + def parse_extra(self, extra): + i = 0 + while i < len(extra): + fieldid, fieldsz = struct.unpack_from('= 0: + magic, eocd_sz, create_ver, min_ver, disk_num, disk_start, disk_num_records, total_num_records, \ + cdir_bytes, cdir_start = struct.unpack_from(self.fmt_eocd64, resp.data, offset=i) + else: + i = resp.data.rfind(self.magic_eocd) + if i >= 0: + magic, \ + disk_num, disk_start, disk_num_records, total_num_records, \ + cdir_bytes, cdir_start, comment_len = struct.unpack_from(self.fmt_eocd, resp.data, offset=i) + + if cdir_start < 0 or cdir_start >= self.zip_size: + error('cannot find central directory') + + if self.zip_size <= 65536: + 
filehdr_index = cdir_start + else: + filehdr_index = 65536 - (self.zip_size - cdir_start) + + if filehdr_index < 0: + resp = self.get_range(cdir_start, self.zip_size - cdir_start) + filehdr_index = 0 + + cdir_end = filehdr_index + cdir_bytes + while filehdr_index < cdir_end: + sizeof_cdirentry = struct.calcsize(self.fmt_cdirentry) + + magic, ver, ver_needed, flags, method, date_time, crc, \ + complen, uncomplen, fnlen, extralen, commentlen, \ + disknum_start, internal_attr, external_attr, local_header_ofs = \ + struct.unpack_from(self.fmt_cdirentry, resp.data, offset=filehdr_index) + + filehdr_index += sizeof_cdirentry + + filename = resp.data[filehdr_index:filehdr_index+fnlen] + filehdr_index += fnlen + + extra = resp.data[filehdr_index:filehdr_index+extralen] + filehdr_index += extralen + + # comment = resp.data[filehdr_index:filehdr_index+commentlen] + filehdr_index += commentlen + + rzi = RemoteZipInfo(filename.decode(), date_time, local_header_ofs, method, complen, uncomplen) + + rzi.parse_extra(extra) + yield rzi + + def extract(self, member, path=None, pwd=None): + if pwd: + raise NotImplementedError('Passwords not supported yet') + + path = path or pathlib.Path('.') + + outpath = path/member + os.makedirs(outpath.parent, exist_ok=True) + with self.open(member) as fpin: + with open(path/member, mode='wb') as fpout: + while True: + r = fpin.read(65536) + if not r: + break + fpout.write(r) + + + def extractzip(self, member, path=None, pwd=None): + if pwd: + raise NotImplementedError('Passwords not supported yet') + + path = path or pathlib.Path('.') + outpath = path + os.makedirs(outpath.parent, exist_ok=True) + with self.open(member) as fpin: + with zipfile.ZipFile(outpath, 'a', zipfile.ZIP_DEFLATED) as zout: + with zout.open(member,'w') as fpout: + while True: + r = fpin.read(65536) + if not r: + break + fpout.write(r) + + + def extractall(self, path=None, members=None, pwd=None): + for fn in members or self.namelist(): + self.extract(fn, path, pwd=pwd) + + 
def get_range(self, start, n): + return self.http.request('GET', self.url, headers={'Range': f'bytes={start}-{start+n-1}'}, preload_content=False) + + def matching_files(self, *globs): + for f in self.files.values(): + if any(fnmatch.fnmatch(f.filename, g) for g in globs): + yield f + + def open(self, fn): + if isinstance(fn, str): + f = list(self.matching_files(fn)) + if not f: + error(f'no files matching {fn}') + f = f[0] + else: + f = fn + + sizeof_localhdr = struct.calcsize(self.fmt_localhdr) + r = self.get_range(f.header_offset, sizeof_localhdr) + localhdr = struct.unpack_from(self.fmt_localhdr, r.data) + magic, ver, flags, method, dos_datetime, _, _, uncomplen, fnlen, extralen = localhdr + if method == 0: # none + return self.get_range(f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size) + elif method == 8: # DEFLATE + resp = self.get_range(f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size) + return io.BufferedReader(RemoteZipStream(resp, f)) + else: + error(f'unknown compression method {method}') + + def open_text(self, fn): + return io.TextIOWrapper(self.open(fn)) + + +class RemoteZipStream(io.RawIOBase): + def __init__(self, fp, info): + super().__init__() + self.raw = fp + self._decompressor = zlib.decompressobj(-15) + self._buffer = bytes() + + def readable(self): + return True + + def readinto(self, b): + r = self.read(len(b)) + b[:len(r)] = r + return len(r) + + def read(self, n): + while n > len(self._buffer): + r = self.raw.read(2**18) + if not r: + self._buffer += self._decompressor.flush() + break + self._buffer += self._decompressor.decompress(r) + + ret = self._buffer[:n] + self._buffer = self._buffer[n:] + + return ret + + + ### script start + +class StreamProgress: + def __init__(self, fp, name='', total=0): + self.name = name + self.fp = fp + self.total = total + self.start_time = time.time() + self.last_update = 0 + self.amtread = 0 + + def read(self, n): + r = self.fp.read(n) + self.amtread += len(r) + 
now = time.time() + if now - self.last_update > 0.1: + self.last_update = now + + elapsed_s = now - self.start_time + sys.stderr.write(f'\r{elapsed_s:.0f}s {self.amtread/10**6:.02f}/{self.total/10**6:.02f}MB ({self.amtread/10**6/elapsed_s:.02f} MB/s) {self.name}') + + if not r: + sys.stderr.write('\n') + + return r + + +def list_files(rzf): + def safelog(x): + return 1 if x == 0 else math.ceil(math.log10(x)) + + digits_compr = max(safelog(f.compress_size) for f in rzf.infolist()) + digits_plain = max(safelog(f.file_size ) for f in rzf.infolist()) + fmtstr = f'%{digits_compr}d -> %{digits_plain}d\t%s' + for f in rzf.infolist(): + print(fmtstr % (f.compress_size, f.file_size, f.filename), file=sys.stderr) + + +def extract_one(outfile, rzf, f, ofname): + print(f'Extracting {f.filename} to {ofname}...', file=sys.stderr) + + fp = StreamProgress(rzf.open(f), name=f.filename, total=f.compress_size) + while r := fp.read(2**18): + outfile.write(r) + + +def download_file(f, rzf, args): + if not any(fnmatch.fnmatch(f.filename, g) for g in args.files): + return + + if args.stdout: + extract_one(sys.stdout.buffer, rzf, f, "stdout") + else: + path = pathlib.Path(f.filename) + if args.full_filepaths: + path.parent.mkdir(parents=True, exist_ok=True) + else: + path = path.name + + with open(str(path), 'wb') as of: + extract_one(of, rzf, f, str(path)) + + +def main(): + parser = argparse.ArgumentParser(prog='unzip-http', \ + description="Extract individual files from .zip files over http without downloading the entire archive. 
HTTP server must send `Accept-Ranges: bytes` and `Content-Length` in headers.") + + parser.add_argument('-l', '--list', action='store_true', default=False, + help="List files in the remote zip file") + parser.add_argument('-f', '--full-filepaths', action='store_true', default=False, + help="Recreate folder structure from zip file when extracting (instead of extracting the files to the current directory)") + parser.add_argument('-o', '--stdout', action='store_true', default=False, + help="Write files to stdout (if multiple files: concatenate them to stdout, in zipfile order)") + + parser.add_argument("url", nargs=1, help="URL of the remote zip file") + parser.add_argument("files", nargs='*', help="Files to extract. If no filenames given, displays .zip contents (filenames and sizes). Each filename can be a wildcard glob.") + + args = parser.parse_args() + + rzf = RemoteZipFile(args.url[0]) + if args.list or len(args.files) == 0: + list_files(rzf) + else: + for f in rzf.infolist(): + download_file(f, rzf, args) + + + +if __name__ == '__main__': + main() From c579ab3efd9b70dd83f5d0d781e39a5103653355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Mon, 5 May 2025 16:22:48 +0200 Subject: [PATCH 09/53] Add katalogwerte_bool and fit code to new utils.unzip_http --- open_mastr/mastr.py | 2 +- .../xml_download/utils_download_bulk.py | 26 +++++++++++-------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 03165375..79917a7c 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -239,7 +239,7 @@ def download( if data is None: download_xml_Mastr(zipped_xml_file_path, date, xml_folder_path) else: - download_xml_Mastr_partial(zipped_xml_file_path, date, data, xml_folder_path) + data = download_xml_Mastr_partial(zipped_xml_file_path, date, data, xml_folder_path) print( f"\nWould you like to speed up the bulk download?\n" diff --git a/open_mastr/xml_download/utils_download_bulk.py 
b/open_mastr/xml_download/utils_download_bulk.py index 1a8af87d..51883dcc 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -3,17 +3,16 @@ import time from importlib.metadata import PackageNotFoundError, version from zipfile import BadZipfile, ZipFile -import shutil from pathlib import Path import numpy as np import requests from tqdm import tqdm -import unzip_http # setup logger from open_mastr.utils.config import setup_logger from open_mastr.utils.constants import BULK_INCLUDE_TABLES_MAP +from open_mastr.utils import unzip_http try: USER_AGENT = ( @@ -208,9 +207,10 @@ def download_xml_Mastr( print(f"Download is finished. It took {int(np.around(time_b - time_a))} seconds.") print(f"MaStR was successfully downloaded to {xml_folder_path}.") + def check_download_completeness( save_path: str,bulk_data_list: list -) -> list: +) -> (list, bool): """Checks if an existing download contains the xml-files corresponding to the bulk_data_list. """ with ZipFile(save_path, 'r') as zip_ref: @@ -221,12 +221,16 @@ def check_download_completeness( for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: if bulk_file_name not in existing_files: missing_data_set.add(bulk_data_name) - return list(missing_data_set) + + katalogwerte_bool = 0 + if 'katalogwerte' in existing_files: + katalogwerte_bool = True + return list(missing_data_set), katalogwerte_bool def download_xml_Mastr_partial( save_path: str, bulk_date_string: str, bulk_data_list: list, xml_folder_path: str -) -> None: +) -> list: """Downloads the zipped MaStR. Parameters @@ -235,6 +239,7 @@ def download_xml_Mastr_partial( The path where the downloaded MaStR zipped folder will be saved. 
""" + katalogwerte_bool = False if os.path.exists(save_path): try: _ = ZipFile(save_path) @@ -242,7 +247,7 @@ def download_xml_Mastr_partial( log.info(f"Bad Zip file is deleted: {save_path}") os.remove(save_path) else: - bulk_data_list = check_download_completeness(save_path,bulk_data_list) + bulk_data_list, katalogwerte_bool = check_download_completeness(save_path,bulk_data_list) if bool(bulk_data_list): print(f"MaStR is missing the following data: {bulk_data_list}") else: @@ -306,13 +311,12 @@ def download_xml_Mastr_partial( for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: remote_index_list = [remote_index for remote_index, remote_zip_name in enumerate(remote_zip_names) if remote_zip_name == bulk_file_name] for remote_index in remote_index_list: - remote_zip_file.extract(remote_zip_file.namelist()[remote_index],path=Path(save_path[:-4])) - - remote_zip_file.extract('Katalogwerte.xml',path=Path(save_path[:-4])) + remote_zip_file.extractzip(remote_zip_file.namelist()[remote_index],path=Path(save_path)) - shutil.make_archive(save_path[:-4], 'zip', save_path[:-4]) - shutil.rmtree(save_path[:-4]) + if not katalogwerte_bool: + remote_zip_file.extractzip('Katalogwerte.xml',path=Path(save_path)) time_b = time.perf_counter() print(f"Download is finished. 
It took {int(np.around(time_b - time_a))} seconds.") print(f"MaStR was successfully downloaded to {xml_folder_path}.") + return bulk_data_list From 4fe0a0af971af4a2ab0f77ac91965a61d97ad9f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Mon, 12 May 2025 09:41:38 +0200 Subject: [PATCH 10/53] Remove unnecessary imports from unzip_http --- open_mastr/utils/unzip_http.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/open_mastr/utils/unzip_http.py b/open_mastr/utils/unzip_http.py index 548a7890..ef4140f8 100644 --- a/open_mastr/utils/unzip_http.py +++ b/open_mastr/utils/unzip_http.py @@ -55,7 +55,6 @@ import zlib import struct import fnmatch -import argparse import pathlib import urllib.parse import zipfile @@ -378,31 +377,3 @@ def download_file(f, rzf, args): with open(str(path), 'wb') as of: extract_one(of, rzf, f, str(path)) - -def main(): - parser = argparse.ArgumentParser(prog='unzip-http', \ - description="Extract individual files from .zip files over http without downloading the entire archive. HTTP server must send `Accept-Ranges: bytes` and `Content-Length` in headers.") - - parser.add_argument('-l', '--list', action='store_true', default=False, - help="List files in the remote zip file") - parser.add_argument('-f', '--full-filepaths', action='store_true', default=False, - help="Recreate folder structure from zip file when extracting (instead of extracting the files to the current directory)") - parser.add_argument('-o', '--stdout', action='store_true', default=False, - help="Write files to stdout (if multiple files: concatenate them to stdout, in zipfile order)") - - parser.add_argument("url", nargs=1, help="URL of the remote zip file") - parser.add_argument("files", nargs='*', help="Files to extract. If no filenames given, displays .zip contents (filenames and sizes). 
Each filename can be a wildcard glob.") - - args = parser.parse_args() - - rzf = RemoteZipFile(args.url[0]) - if args.list or len(args.files) == 0: - list_files(rzf) - else: - for f in rzf.infolist(): - download_file(f, rzf, args) - - - -if __name__ == '__main__': - main() From e3d3b3d644f7848c6c139cf0edd004bad41e1676 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Wed, 16 Jul 2025 16:36:34 +0200 Subject: [PATCH 11/53] Prepare metadata file creation --- open_mastr/mastr.py | 2 ++ open_mastr/utils/helpers.py | 14 ++++++++++++++ open_mastr/xml_download/utils_download_bulk.py | 3 +-- 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 79917a7c..0a9cc0a4 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -26,6 +26,7 @@ create_db_query, db_query_to_csv, reverse_fill_basic_units, + create_metadata_file ) from open_mastr.utils.config import ( create_data_dir, @@ -254,6 +255,7 @@ def download( bulk_cleansing=bulk_cleansing, bulk_download_date=bulk_download_date, ) + create_metadata_file(self, date, data) if method == "API": validate_api_credentials() diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index 1543e222..a38a6467 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -4,6 +4,7 @@ from contextlib import contextmanager from datetime import date, datetime from warnings import warn +import csv import dateutil import sqlalchemy @@ -322,6 +323,19 @@ def transform_date_parameter(self, method, date, **kwargs): return date +# def create_metadata_file(self, date, data): +# log_file = os.path.join(self.output_dir, "data", "metadata_log_file.csv") +# if not os.path.isfile(log_file): +# with open(log_file, "w", newline="") as file: +# writer = csv.writer(file) +# writer.writerow(["date", "date_input", "data_tables"]) +# if date == "today": +# actual_date = datetime.today().strftime("%Y%m%d") +# with open(log_file, "a", newline="") as file: +# 
writer = csv.writer(file) +# writer.writerow([actual_date, date, data]) + + @contextmanager def session_scope(engine): """Provide a transactional scope around a series of operations.""" diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 51883dcc..9210665f 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -313,8 +313,7 @@ def download_xml_Mastr_partial( for remote_index in remote_index_list: remote_zip_file.extractzip(remote_zip_file.namelist()[remote_index],path=Path(save_path)) - if not katalogwerte_bool: - remote_zip_file.extractzip('Katalogwerte.xml',path=Path(save_path)) + remote_zip_file.extractzip('Katalogwerte.xml',path=Path(save_path)) time_b = time.perf_counter() print(f"Download is finished. It took {int(np.around(time_b - time_a))} seconds.") From 84b8647c2a738a867bc478af51d79dee1aa558c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Fri, 18 Jul 2025 12:17:18 +0200 Subject: [PATCH 12/53] Moving two check functions outside download_xml function #616 --- open_mastr/mastr.py | 20 ++--- open_mastr/utils/helpers.py | 16 +++- .../xml_download/utils_download_bulk.py | 75 ++++++++----------- 3 files changed, 56 insertions(+), 55 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 0a9cc0a4..021bc143 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -4,7 +4,8 @@ # import xml dependencies from open_mastr.xml_download.utils_download_bulk import ( download_xml_Mastr, - download_xml_Mastr_partial + download_xml_Mastr_partial, + delete_xml_files_not_from_given_date ) from open_mastr.xml_download.utils_write_to_database import ( write_mastr_xml_to_database, @@ -26,7 +27,10 @@ create_db_query, db_query_to_csv, reverse_fill_basic_units, - create_metadata_file + delete_zip_file_if_corrupted, + create_database_engine, + rename_table, + create_translated_database_engine, ) from 
open_mastr.utils.config import ( create_data_dir, @@ -37,13 +41,6 @@ ) import open_mastr.utils.orm as orm -# import initialize_database dependencies -from open_mastr.utils.helpers import ( - create_database_engine, - rename_table, - create_translated_database_engine, -) - # constants from open_mastr.utils.constants import TECHNOLOGIES, ADDITIONAL_TABLES @@ -237,6 +234,10 @@ def download( xml_folder_path, f"Gesamtdatenexport_{bulk_download_date}.zip", ) + + delete_zip_file_if_corrupted(zipped_xml_file_path) + delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) + if data is None: download_xml_Mastr(zipped_xml_file_path, date, xml_folder_path) else: @@ -255,7 +256,6 @@ def download( bulk_cleansing=bulk_cleansing, bulk_download_date=bulk_download_date, ) - create_metadata_file(self, date, data) if method == "API": validate_api_credentials() diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index a38a6467..8af859f3 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -4,7 +4,7 @@ from contextlib import contextmanager from datetime import date, datetime from warnings import warn -import csv +from zipfile import BadZipfile, ZipFile import dateutil import sqlalchemy @@ -824,3 +824,17 @@ def create_translated_database_engine(engine, folder_path) -> sqlalchemy.engine. ) return create_engine(f"sqlite:///{db_path}") + + +def delete_zip_file_if_corrupted(save_path: str): + """ + Check if existing zip file is corrupted and if yes, delete it, if no, zipfile exists. 
+ """ + if os.path.exists(save_path): + try: + with ZipFile(save_path) as _: + pass + except BadZipfile: + log.info(f"Bad Zip file is deleted: {save_path}") + os.remove(save_path) + \ No newline at end of file diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 9210665f..a9f0f286 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -2,7 +2,7 @@ import shutil import time from importlib.metadata import PackageNotFoundError, version -from zipfile import BadZipfile, ZipFile +from zipfile import ZipFile from pathlib import Path import numpy as np @@ -125,24 +125,7 @@ def download_xml_Mastr( save_path: str The path where the downloaded MaStR zipped folder will be saved. """ - - if os.path.exists(save_path): - try: - _ = ZipFile(save_path) - except BadZipfile: - log.info(f"Bad Zip file is deleted: {save_path}") - os.remove(save_path) - else: - print("MaStR already downloaded.") - return None - - if bulk_date_string != "today": - raise OSError( - "There exists no file for given date. MaStR can only be downloaded " - "from the website if today's date is given." - ) - shutil.rmtree(xml_folder_path, ignore_errors=True) - os.makedirs(xml_folder_path, exist_ok=True) + print_message = ( "Download has started, this can take several minutes." @@ -210,7 +193,7 @@ def download_xml_Mastr( def check_download_completeness( save_path: str,bulk_data_list: list -) -> (list, bool): +) -> tuple[list, bool]: """Checks if an existing download contains the xml-files corresponding to the bulk_data_list. 
""" with ZipFile(save_path, 'r') as zip_ref: @@ -222,10 +205,10 @@ def check_download_completeness( if bulk_file_name not in existing_files: missing_data_set.add(bulk_data_name) - katalogwerte_bool = 0 + is_katalogwerte_existing = False if 'katalogwerte' in existing_files: - katalogwerte_bool = True - return list(missing_data_set), katalogwerte_bool + is_katalogwerte_existing = True + return list(missing_data_set), is_katalogwerte_existing def download_xml_Mastr_partial( @@ -239,28 +222,14 @@ def download_xml_Mastr_partial( The path where the downloaded MaStR zipped folder will be saved. """ - katalogwerte_bool = False + is_katalogwerte_existing = False if os.path.exists(save_path): - try: - _ = ZipFile(save_path) - except BadZipfile: - log.info(f"Bad Zip file is deleted: {save_path}") - os.remove(save_path) + bulk_data_list, is_katalogwerte_existing = check_download_completeness(save_path,bulk_data_list) + if bool(bulk_data_list): + print(f"MaStR is missing the following data: {bulk_data_list}") else: - bulk_data_list, katalogwerte_bool = check_download_completeness(save_path,bulk_data_list) - if bool(bulk_data_list): - print(f"MaStR is missing the following data: {bulk_data_list}") - else: - print("MaStR already downloaded.") - return None - - if bulk_date_string != "today": - raise OSError( - "There exists no file for given date. MaStR can only be downloaded " - "from the website if today's date is given." - ) - shutil.rmtree(xml_folder_path, ignore_errors=True) - os.makedirs(xml_folder_path, exist_ok=True) + print("MaStR already downloaded.") + return None print_message = ( "Download has started, this can take several minutes." 
@@ -308,14 +277,32 @@ def download_xml_Mastr_partial( remote_index_list = [] for bulk_data_name in bulk_data_list: + # Example: ['wind','solar'] for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: + # Example: From "wind" we get ["anlageneegwind", "einheitenwind"], and from "solar" we get ["anlageneegsolar", "einheitensolar"] + # and we have to find the corresponding index in the remote_zip_file list in order to fetch the correct file remote_index_list = [remote_index for remote_index, remote_zip_name in enumerate(remote_zip_names) if remote_zip_name == bulk_file_name] + # for remote_index in tqdm(remote_index_list): for remote_index in remote_index_list: + # Example: remote_zip_file.namelist()[remote_index] corresponds to e.g. 'AnlagenEegSolar_1.xml' remote_zip_file.extractzip(remote_zip_file.namelist()[remote_index],path=Path(save_path)) - remote_zip_file.extractzip('Katalogwerte.xml',path=Path(save_path)) + if not is_katalogwerte_existing: + remote_zip_file.extractzip('Katalogwerte.xml',path=Path(save_path)) time_b = time.perf_counter() print(f"Download is finished. It took {int(np.around(time_b - time_a))} seconds.") print(f"MaStR was successfully downloaded to {xml_folder_path}.") return bulk_data_list + + +def delete_xml_files_not_from_given_date(save_path: str, xml_folder_path: str): + """ + Delete xml files that are not corresponding to the given date. + Assumes that the xml folder only contains one zipfile. 
+ """ + if os.path.exists(save_path): + return + else: + shutil.rmtree(xml_folder_path) + os.makedirs(xml_folder_path) From 2d681cfd1d7ae61273226e9992f8222d1efa03d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Fri, 18 Jul 2025 15:46:02 +0200 Subject: [PATCH 13/53] Deprecation of date='existing', merging of partial and full download in existing download() function #616 --- environment.yml | 2 +- open_mastr/mastr.py | 14 +- open_mastr/utils/helpers.py | 65 +++---- .../xml_download/utils_download_bulk.py | 174 ++++++++---------- 4 files changed, 101 insertions(+), 154 deletions(-) diff --git a/environment.yml b/environment.yml index 104130fe..66f2bd03 100644 --- a/environment.yml +++ b/environment.yml @@ -3,4 +3,4 @@ channels: - conda-forge - defaults dependencies: - - python=3.10 + - python=3.11 diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 021bc143..ada4311e 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -4,8 +4,7 @@ # import xml dependencies from open_mastr.xml_download.utils_download_bulk import ( download_xml_Mastr, - download_xml_Mastr_partial, - delete_xml_files_not_from_given_date + delete_xml_files_not_from_given_date, ) from open_mastr.xml_download.utils_write_to_database import ( write_mastr_xml_to_database, @@ -158,7 +157,7 @@ def download( |-----------------------|------|------| | "today" | latest files are downloaded from marktstammdatenregister.de | - | | "20230101" | If file from this date exists locally, it is used. 
Otherwise it throws an error (You can only receive todays data from the server) | - | - | "existing" | Use latest downloaded zipped xml files, throws an error if the bulk download folder is empty | - | + | "existing" | Deprecated since 0.16, see [#616](https://github.com/OpenEnergyPlatform/open-MaStR/issues/616#issuecomment-3089377062)mkdo | - | | "latest" | - | Retrieve data that is newer than the newest data already in the table | | datetime.datetime(2020, 11, 27) | - | Retrieve data that is newer than this time stamp | | None | set date="today" | set date="latest" | @@ -237,11 +236,8 @@ def download( delete_zip_file_if_corrupted(zipped_xml_file_path) delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) - - if data is None: - download_xml_Mastr(zipped_xml_file_path, date, xml_folder_path) - else: - data = download_xml_Mastr_partial(zipped_xml_file_path, date, data, xml_folder_path) + + download_xml_Mastr(zipped_xml_file_path, date, data, xml_folder_path) print( f"\nWould you like to speed up the bulk download?\n" @@ -256,7 +252,7 @@ def download( bulk_cleansing=bulk_cleansing, bulk_download_date=bulk_download_date, ) - + if method == "API": validate_api_credentials() diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index 8af859f3..a4a1f525 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -245,18 +245,16 @@ def raise_warning_for_invalid_parameter_combinations( ) if method == "bulk" and ( - ( - any( - parameter is not None - for parameter in [ - api_processes, - api_data_types, - api_location_types, - ] - ) - or api_limit != 50 - or api_chunksize != 1000 + any( + parameter is not None + for parameter in [ + api_processes, + api_data_types, + api_location_types, + ] ) + or api_limit != 50 + or api_chunksize != 1000 ): warn( "For method = 'bulk', API related parameters (with prefix api_) are ignored." 
@@ -303,39 +301,23 @@ def transform_date_parameter(self, method, date, **kwargs): date = kwargs.get("bulk_date", date) date = "today" if date is None else date if date == "existing": - existing_files_list = os.listdir( - os.path.join(self.output_dir, "data", "xml_download") + log.warning( + """ + The date parameter 'existing' is deprecated and will be removed in the future. + The date parameter is set to `today`. + + If this change causes problems for you, please comment in this issue on github: + https://github.com/OpenEnergyPlatform/open-MaStR/issues/616#issuecomment-3089377062 + + """ ) - if not existing_files_list: - date = "today" - print( - "By choosing `date`='existing' you want to use an existing " - "xml download." - "However no xml_files were downloaded yet. The parameter `date` is" - "therefore set to 'today'." - ) - # we assume that there is only one file in the folder which is the - # zipped xml folder - date = existing_files_list[0].split("_")[1].split(".")[0] + date = "today" elif method == "API": date = kwargs.get("api_date", date) return date -# def create_metadata_file(self, date, data): -# log_file = os.path.join(self.output_dir, "data", "metadata_log_file.csv") -# if not os.path.isfile(log_file): -# with open(log_file, "w", newline="") as file: -# writer = csv.writer(file) -# writer.writerow(["date", "date_input", "data_tables"]) -# if date == "today": -# actual_date = datetime.today().strftime("%Y%m%d") -# with open(log_file, "a", newline="") as file: -# writer = csv.writer(file) -# writer.writerow([actual_date, date, data]) - - @contextmanager def session_scope(engine): """Provide a transactional scope around a series of operations.""" @@ -369,7 +351,7 @@ def print_api_settings( ) if "permit" in harmonisation_log: print( - f"data_types: {api_data_types}" "\033[31m", + f"data_types: {api_data_types}\033[31m", "Attention, 'permit_data' was automatically set in api_data_types, " "as you defined 'permit' in parameter data_api.", "\033[m", @@ 
-494,9 +476,7 @@ def create_db_query( unit_type_map_reversed = reverse_unit_type_map() with session_scope(engine=engine) as session: - if tech: - # Select orm tables for specified additional_data. orm_tables = { f"{dat}": getattr(orm, ORM_MAP[tech].get(dat, "KeyNotAvailable"), None) @@ -567,7 +547,6 @@ def create_db_query( return query_tech if additional_table: - orm_table = getattr(orm, ORM_MAP[additional_table], None) query_additional_tables = Query(orm_table, session=session) @@ -755,7 +734,6 @@ def db_query_to_csv(db_query, data_table: str, chunksize: int) -> None: chunk_df[col] = chunk_df[col].str.replace("\r", "") if not chunk_df.empty: - if chunk_number == 0: chunk_df.to_csv( csv_file, @@ -836,5 +814,4 @@ def delete_zip_file_if_corrupted(save_path: str): pass except BadZipfile: log.info(f"Bad Zip file is deleted: {save_path}") - os.remove(save_path) - \ No newline at end of file + os.remove(save_path) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index a9f0f286..202f2b32 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -11,7 +11,7 @@ # setup logger from open_mastr.utils.config import setup_logger -from open_mastr.utils.constants import BULK_INCLUDE_TABLES_MAP +from open_mastr.utils.constants import BULK_INCLUDE_TABLES_MAP, BULK_DATA from open_mastr.utils import unzip_http try: @@ -116,7 +116,7 @@ def gen_url(when: time.struct_time = time.localtime(), use_version="current") -> def download_xml_Mastr( - save_path: str, bulk_date_string: str, xml_folder_path: str + save_path: str, bulk_date_string: str, bulk_data_list: list, xml_folder_path: str ) -> None: """Downloads the zipped MaStR. @@ -125,7 +125,6 @@ def download_xml_Mastr( save_path: str The path where the downloaded MaStR zipped folder will be saved. """ - print_message = ( "Download has started, this can take several minutes." 
@@ -168,141 +167,116 @@ def download_xml_Mastr( log.error("Could not download file: download URL not found") return - total_length = int(18000 * 1024 * 1024) - with ( - open(save_path, "wb") as zfile, - tqdm(desc=save_path, total=(total_length / 1024 / 1024), unit="") as bar, - ): - for chunk in r.iter_content(chunk_size=1024 * 1024): - # chunk size of 1024 * 1024 needs 9min 11 sek = 551sek - # chunk size of 1024 needs 9min 11 sek as well - if chunk: - zfile.write(chunk) - zfile.flush() - bar.update() - # if the rate falls below 100 kB/s -> prompt warning - if bar.format_dict["rate"] and bar.format_dict["rate"] < 2: - bar.set_postfix_str(s=warning_message) - else: - # remove warning - bar.set_postfix_str(s="") + if bulk_data_list == BULK_DATA: + full_download_without_unzip_http(save_path, r) + else: + try: + partial_download_with_unzip_http(save_path, url, bulk_data_list) + except Exception as e: + log.warning(f"Partial download failed, fallback to full download: {e}") + full_download_without_unzip_http(save_path, r) + time_b = time.perf_counter() print(f"Download is finished. It took {int(np.around(time_b - time_a))} seconds.") print(f"MaStR was successfully downloaded to {xml_folder_path}.") def check_download_completeness( - save_path: str,bulk_data_list: list + save_path: str, bulk_data_list: list ) -> tuple[list, bool]: - """Checks if an existing download contains the xml-files corresponding to the bulk_data_list. 
- """ - with ZipFile(save_path, 'r') as zip_ref: - existing_files = [zip_name.lower().split('_')[0].split('.')[0] for zip_name in zip_ref.namelist()] + """Checks if an existing download contains the xml-files corresponding to the bulk_data_list.""" + with ZipFile(save_path, "r") as zip_ref: + existing_files = [ + zip_name.lower().split("_")[0].split(".")[0] + for zip_name in zip_ref.namelist() + ] missing_data_set = set() for bulk_data_name in bulk_data_list: - for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: - if bulk_file_name not in existing_files: - missing_data_set.add(bulk_data_name) + for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: + if bulk_file_name not in existing_files: + missing_data_set.add(bulk_data_name) is_katalogwerte_existing = False - if 'katalogwerte' in existing_files: + if "katalogwerte" in existing_files: is_katalogwerte_existing = True return list(missing_data_set), is_katalogwerte_existing -def download_xml_Mastr_partial( - save_path: str, bulk_date_string: str, bulk_data_list: list, xml_folder_path: str -) -> list: - """Downloads the zipped MaStR. - - Parameters - ----------- - save_path: str - The path where the downloaded MaStR zipped folder will be saved. +def delete_xml_files_not_from_given_date(save_path: str, xml_folder_path: str): """ + Delete xml files that are not corresponding to the given date. + Assumes that the xml folder only contains one zipfile. 
+ """ + if os.path.exists(save_path): + return + else: + shutil.rmtree(xml_folder_path) + os.makedirs(xml_folder_path) + +def partial_download_with_unzip_http(save_path: str, url: str, bulk_data_list: list): is_katalogwerte_existing = False if os.path.exists(save_path): - bulk_data_list, is_katalogwerte_existing = check_download_completeness(save_path,bulk_data_list) + bulk_data_list, is_katalogwerte_existing = check_download_completeness( + save_path, bulk_data_list + ) if bool(bulk_data_list): print(f"MaStR is missing the following data: {bulk_data_list}") else: print("MaStR already downloaded.") return None - print_message = ( - "Download has started, this can take several minutes." - "The download bar is only a rough estimate." - ) - warning_message = ( - "Warning: The servers from MaStR restrict the download speed." - " You may want to download it another time." - ) - print(print_message) - - now = time.localtime() - url = gen_url(now) - - time_a = time.perf_counter() - r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) - if r.status_code == 404: - log.warning( - "Download file was not found. Assuming that the new file was not published yet and retrying with yesterday." - ) - now = time.localtime( - time.mktime(now) - (24 * 60 * 60) - ) # subtract 1 day from the date - url = gen_url(now) - r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) - if r.status_code == 404: - url = gen_url(now, use_version="before") # Use lower MaStR Version - log.warning( - f"Download file was not found. Assuming that the version of MaStR has changed and retrying with download link: {url}" - ) - r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) - if r.status_code == 404: - url = gen_url(now, use_version="after") # Use higher MaStR Version - log.warning( - f"Download file was not found. 
Assuming that the version of MaStR has changed and retrying with download link: {url}" - ) - r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) - - if r.status_code == 404: - log.error("Could not download file: download URL not found") - return - remote_zip_file = unzip_http.RemoteZipFile(url) - remote_zip_names = [remote_zip_name.lower().split('_')[0].split('.')[0] for remote_zip_name in remote_zip_file.namelist()] + remote_zip_names = [ + remote_zip_name.lower().split("_")[0].split(".")[0] + for remote_zip_name in remote_zip_file.namelist() + ] remote_index_list = [] + download_files_list = [] for bulk_data_name in bulk_data_list: # Example: ['wind','solar'] for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: # Example: From "wind" we get ["anlageneegwind", "einheitenwind"], and from "solar" we get ["anlageneegsolar", "einheitensolar"] # and we have to find the corresponding index in the remote_zip_file list in order to fetch the correct file - remote_index_list = [remote_index for remote_index, remote_zip_name in enumerate(remote_zip_names) if remote_zip_name == bulk_file_name] + remote_index_list = [ + remote_index + for remote_index, remote_zip_name in enumerate(remote_zip_names) + if remote_zip_name == bulk_file_name + ] # for remote_index in tqdm(remote_index_list): for remote_index in remote_index_list: # Example: remote_zip_file.namelist()[remote_index] corresponds to e.g. 'AnlagenEegSolar_1.xml' - remote_zip_file.extractzip(remote_zip_file.namelist()[remote_index],path=Path(save_path)) + download_files_list.append(remote_zip_file.namelist()[remote_index]) - if not is_katalogwerte_existing: - remote_zip_file.extractzip('Katalogwerte.xml',path=Path(save_path)) + for zipfile_name in tqdm(download_files_list, unit=" file"): + remote_zip_file.extractzip(zipfile_name, path=Path(save_path)) - time_b = time.perf_counter() - print(f"Download is finished. 
It took {int(np.around(time_b - time_a))} seconds.") - print(f"MaStR was successfully downloaded to {xml_folder_path}.") - return bulk_data_list + if not is_katalogwerte_existing: + remote_zip_file.extractzip("Katalogwerte.xml", path=Path(save_path)) -def delete_xml_files_not_from_given_date(save_path: str, xml_folder_path: str): - """ - Delete xml files that are not corresponding to the given date. - Assumes that the xml folder only contains one zipfile. - """ - if os.path.exists(save_path): - return - else: - shutil.rmtree(xml_folder_path) - os.makedirs(xml_folder_path) +def full_download_without_unzip_http(save_path: str, r: requests.models.Response): + warning_message = ( + "Warning: The servers from MaStR restrict the download speed." + " You may want to download it another time." + ) + total_length = int(23000) + with ( + open(save_path, "wb") as zfile, + tqdm(desc=save_path, total=total_length, unit="") as bar, + ): + for chunk in r.iter_content(chunk_size=1024 * 1024): + # chunk size of 1024 * 1024 needs 9min 11 sek = 551sek + # chunk size of 1024 needs 9min 11 sek as well + if chunk: + zfile.write(chunk) + zfile.flush() + bar.update() + # if the rate falls below 100 kB/s -> prompt warning + if bar.format_dict["rate"] and bar.format_dict["rate"] < 2: + bar.set_postfix_str(s=warning_message) + else: + # remove warning + bar.set_postfix_str(s="") From 118f75070aa7446de6163f036b6e34cf7a05ee20 Mon Sep 17 00:00:00 2001 From: Florian Kotthoff Date: Mon, 21 Jul 2025 09:20:14 +0200 Subject: [PATCH 14/53] Change print statements #616 --- open_mastr/mastr.py | 6 +++--- open_mastr/xml_download/utils_download_bulk.py | 5 +---- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index ada4311e..ba0cd86e 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -240,9 +240,9 @@ def download( download_xml_Mastr(zipped_xml_file_path, date, data, xml_folder_path) print( - f"\nWould you like to speed up the bulk 
download?\n" - f"Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True " - f"or configure your own number of processes via os.environ['NUMBER_OF_PROCESSES'] = your_number\n" + "\nWould you like to speed up the creation of your MaStR database?\n" + "Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True " + "or configure your own number of processes via os.environ['NUMBER_OF_PROCESSES'] = your_number\n" ) write_mastr_xml_to_database( diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 202f2b32..fcc604f4 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -126,10 +126,7 @@ def download_xml_Mastr( The path where the downloaded MaStR zipped folder will be saved. """ - print_message = ( - "Download has started, this can take several minutes." - "The download bar is only a rough estimate." - ) + print_message = "Starting the Download from marktstammdatenregister.de." warning_message = ( "Warning: The servers from MaStR restrict the download speed." " You may want to download it another time." 
From c8177fb92eaa6b52a6647bfd6175cdec362e62b1 Mon Sep 17 00:00:00 2001 From: Florian Kotthoff Date: Mon, 21 Jul 2025 09:46:02 +0200 Subject: [PATCH 15/53] Create test function for delete_xl_files #616 --- .../xml_download/test_utils_download_bulk.py | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/tests/xml_download/test_utils_download_bulk.py b/tests/xml_download/test_utils_download_bulk.py index e1f60bb0..8f650933 100644 --- a/tests/xml_download/test_utils_download_bulk.py +++ b/tests/xml_download/test_utils_download_bulk.py @@ -1,5 +1,10 @@ import time -from open_mastr.xml_download.utils_download_bulk import gen_url +from open_mastr.xml_download.utils_download_bulk import ( + gen_url, + delete_xml_files_not_from_given_date, +) +import os +import shutil def test_gen_url(): @@ -84,3 +89,27 @@ def test_gen_url(): url == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20240402_24.2.zip" ) + + +def test_delete_xml_files_not_from_given_date(): + xml_folder_path = os.path.join("tests", "test_utils_download") + expected_file = os.path.join(xml_folder_path, "20250102.txt") + os.makedirs(xml_folder_path) + + # Case where expected file exists + open(expected_file, "w").close() + delete_xml_files_not_from_given_date( + save_path=expected_file, xml_folder_path=xml_folder_path + ) + assert os.path.exists(expected_file) + os.remove(expected_file) + + # Case where old date is deleted + path_old_file = os.path.join(xml_folder_path, "20250101.txt") + open(path_old_file, "w").close() + delete_xml_files_not_from_given_date( + save_path=expected_file, xml_folder_path=xml_folder_path + ) + assert not os.path.exists(path_old_file) + # clean up test folder + shutil.rmtree(xml_folder_path) From e219fa1dfbb98c094af6ce1954601a1a40d91ccc Mon Sep 17 00:00:00 2001 From: Florian Kotthoff Date: Mon, 21 Jul 2025 10:24:53 +0200 Subject: [PATCH 16/53] Add test for partial download #616 --- tests/test_mastr.py | 21 ++++++++++++++++----- 1 file 
changed, 16 insertions(+), 5 deletions(-) diff --git a/tests/test_mastr.py b/tests/test_mastr.py index 9fe8883b..df4a3a20 100644 --- a/tests/test_mastr.py +++ b/tests/test_mastr.py @@ -14,11 +14,6 @@ _xml_file_exists = True -@pytest.fixture -def db(): - return Mastr() - - @pytest.fixture def db_path(): return os.path.join( @@ -26,6 +21,11 @@ def db_path(): ) +@pytest.fixture +def db(db_path): + return Mastr(engine=sqlalchemy.create_engine(f"sqlite:///{db_path}")) + + @pytest.fixture def db_translated(db_path): engine = sqlalchemy.create_engine(f"sqlite:///{db_path}") @@ -71,3 +71,14 @@ def test_Mastr_translate(db_translated, db_path): for table in table_names: assert pd.read_sql(sql=table, con=db_empty.engine).shape[0] == 0 + + +def test_mastr_download(db): + db.download(data="wind") + df_wind = pd.read_sql("wind_extended", con=db.engine) + assert len(df_wind) > 10000 + + db.download(data="biomass") + df_biomass = pd.read_sql("biomass_extended", con=db.engine) + assert len(df_wind) > 10000 + assert len(df_biomass) > 10000 From 566c75fc5faf781d3fa405d417e152dea2caa612 Mon Sep 17 00:00:00 2001 From: Florian Kotthoff Date: Mon, 21 Jul 2025 10:31:12 +0200 Subject: [PATCH 17/53] Remove "cleansing" from print statements #644 --- open_mastr/xml_download/utils_write_to_database.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index 4b220909..11bbe015 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -28,7 +28,7 @@ def write_mastr_xml_to_database( bulk_download_date: str, ) -> None: """Write the Mastr in xml format into a database defined by the engine parameter.""" - print("Starting bulk download and data cleansing...") + print("Starting bulk download...") include_tables = data_to_include_tables(data, mapping="write_xml") threads_data = [] @@ -71,7 +71,7 @@ def 
write_mastr_xml_to_database( for item in interleaved_files: process_xml_file(*item) - print("Bulk download and data cleansing were successful.") + print("Bulk download was successful.") def get_number_of_processes(): From 7251b3385ed67ec1c1dcafba7c774530fe6e2357 Mon Sep 17 00:00:00 2001 From: Florian Kotthoff Date: Mon, 21 Jul 2025 10:33:00 +0200 Subject: [PATCH 18/53] Update Changelog #644 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53591623..e06dc604 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ ### Changed - Fix package publication workflow [#636](https://github.com/OpenEnergyPlatform/open-MaStR/pull/636) +- Change print statement about data cleansing + [#650](https://github.com/OpenEnergyPlatform/open-MaStR/pull/650) ### Removed From 99af202957edae5287f78c443f194cbc2d2ec17d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Wed, 23 Jul 2025 11:21:34 +0200 Subject: [PATCH 19/53] Add test for delete_zip_file_if_corrupted #616 --- tests/test_helpers.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 4a19f4fb..7779a9c8 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -7,6 +7,7 @@ from datetime import datetime import pandas as pd from open_mastr import Mastr +from zipfile import ZipFile from open_mastr.utils import orm from open_mastr.utils.constants import ( @@ -25,6 +26,7 @@ create_db_query, db_query_to_csv, reverse_unit_type_map, + delete_zip_file_if_corrupted, ) @@ -398,6 +400,18 @@ def test_db_query_to_csv(tmpdir, engine): os.rmdir(get_data_version_dir()) +def test_delete_zip_file_if_corrupted(): + test_zip_path = os.path.join("tests", "test.zip") + with ZipFile(test_zip_path, "w") as zf: + zf.writestr(os.path.join("tests", "file.txt"), "Hello, world!") + with open(test_zip_path, "wb+") as f: + 
f.seek(10) + f.write(b"\xff\xff\xff\xff") + + delete_zip_file_if_corrupted(test_zip_path) + assert not os.path.exists(test_zip_path) + + def test_save_metadata(): # FIXME: implement in #386 pass From 797486502734bd7602793f302593cbd17a2c2be8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Thu, 31 Jul 2025 11:05:12 +0200 Subject: [PATCH 20/53] =?UTF-8?q?Add=20Kevin=20Kr=C3=A4mer=20to=20CITATION?= =?UTF-8?q?.cff?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CITATION.cff | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CITATION.cff b/CITATION.cff index d2fe6752..99458ea3 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -34,6 +34,10 @@ authors: given-names: "Alexandra-Andreea" alias: "@AlexandraImbrisca" affiliation: "Technical University of Munich" + - family-names: 'Krämer' + given-names: "Kevin" + alias: "pt-kkraemer" + affiliation: "ProjectTogether gGmbH" title: "open-MaStR" type: software license: AGPL-3.0 From e3927147b31edbabfc89e6653f09241efa36213a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Thu, 31 Jul 2025 11:13:23 +0200 Subject: [PATCH 21/53] Add PR to CHANGELOG #616 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53591623..8929b086 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ ## [v0.XX.X] unreleased - 202X-XX-XX ### Added +- Add partial bulk download + [#652](https://github.com/OpenEnergyPlatform/open-MaStR/pull/652) ### Changed - Fix package publication workflow [#636](https://github.com/OpenEnergyPlatform/open-MaStR/pull/636) From ed3f249278080c5a8b2507dd91bff744c7fa61e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Thu, 31 Jul 2025 14:01:21 +0200 Subject: [PATCH 22/53] Add "Einheittyp" to system_catalog #651 --- open_mastr/xml_download/colums_to_replace.py | 14 ++++++++++++++ 1 file 
changed, 14 insertions(+) diff --git a/open_mastr/xml_download/colums_to_replace.py b/open_mastr/xml_download/colums_to_replace.py index 8e6ead17..421ac44c 100644 --- a/open_mastr/xml_download/colums_to_replace.py +++ b/open_mastr/xml_download/colums_to_replace.py @@ -23,6 +23,20 @@ 3: "Gaserzeugungslokation", 4: "Gasverbrauchslokation", }, + "Einheittyp": { + 1: "Solareinheit", + 2: "Windeinheit", + 3: "Biomasse", + 4: "Wasser", + 5: "Geothermie", + 6: "Verbrennung", + 7: "Kernenergie", + 8: "Stromspeichereinheit", + 9: "Stromverbrauchseinheit", + 10: "Gasverbrauchseinheit", + 11: "Gaserzeugungseinheit", + 12: "Gasspeichereinheit", + }, } # columns to replace lists all columns where the entries have From 09fc84e5bb365a6bd38923ca99e9e62a429b99af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Thu, 31 Jul 2025 14:10:45 +0200 Subject: [PATCH 23/53] Add PR to changelog #651 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 53591623..5d2ecb47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ ## [v0.XX.X] unreleased - 202X-XX-XX ### Added ### Changed +- Updates the system_catalog dict with missing Einheittyp values + [#653](https://github.com/OpenEnergyPlatform/open-MaStR/pull/653) - Fix package publication workflow [#636](https://github.com/OpenEnergyPlatform/open-MaStR/pull/636) ### Removed From 49e6c42e07b62ef49e68880fb2d9626f3098e52f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kevin=20Kr=C3=A4mer?= Date: Mon, 18 Aug 2025 12:24:56 +0200 Subject: [PATCH 24/53] Update docstring description of partial download when using "data" #616 --- open_mastr/mastr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index ba0cd86e..fd71fc32 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -126,8 +126,8 @@ def download( from
marktstammdatenregister.de, (see :ref:`Configuration `). Default to 'bulk'. data : str or list or None, optional - Determines which types of data are written to the database. If None, all data is - used. If it is a list, possible entries are listed below with respect to the download method. Missing categories are + Determines which data is partially downloaded from the bulk download and written to the database. If None, all data is downloaded and written to the database. + If it is a list, possible entries are listed below with respect to the download method. Missing categories are being developed. If only one data is of interest, this can be given as a string. Default to None, where all data is included. | Data | Bulk | API | @@ -157,7 +157,7 @@ def download( |-----------------------|------|------| | "today" | latest files are downloaded from marktstammdatenregister.de | - | | "20230101" | If file from this date exists locally, it is used. Otherwise it throws an error (You can only receive todays data from the server) | - | - | "existing" | Deprecated since 0.16, see [#616](https://github.com/OpenEnergyPlatform/open-MaStR/issues/616#issuecomment-3089377062)mkdo | - | + | "existing" | Deprecated since 0.16, see [#616](https://github.com/OpenEnergyPlatform/open-MaStR/issues/616#issuecomment-3089377062) | - | | "latest" | - | Retrieve data that is newer than the newest data already in the table | | datetime.datetime(2020, 11, 27) | - | Retrieve data that is newer than this time stamp | | None | set date="today" | set date="latest" | From 258724a5b9960c2572916fc421840d7afc29bdbf Mon Sep 17 00:00:00 2001 From: FlorianK13 Date: Tue, 19 Aug 2025 13:51:26 +0200 Subject: [PATCH 25/53] Delete unused print message #616 --- open_mastr/xml_download/utils_download_bulk.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index fcc604f4..10f60e3d 100644 --- 
a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -127,12 +127,9 @@ def download_xml_Mastr( """ print_message = "Starting the Download from marktstammdatenregister.de." - warning_message = ( - "Warning: The servers from MaStR restrict the download speed." - " You may want to download it another time." - ) print(print_message) + # TODO this should take bulk_date_string now = time.localtime() url = gen_url(now) From a2e3dc7d9f090079d1138540da15d618b38214f5 Mon Sep 17 00:00:00 2001 From: FlorianK13 Date: Tue, 19 Aug 2025 15:19:03 +0200 Subject: [PATCH 26/53] Extend docs #616 --- docs/getting_started.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/docs/getting_started.md b/docs/getting_started.md index 5a3dd671..891efbbe 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -35,7 +35,16 @@ db = Mastr() db.download() ``` -When a `Mastr` object is initialized, a sqlite database is created in `$HOME/.open-MaStR/data/sqlite`. With the function `Mastr.download()`, the **whole MaStR is downloaded** in the zipped xml file format. It is then read into the sqlite database and simple data cleansing functions are started. +When a `Mastr` object is initialized, a sqlite database is created in `$HOME/.open-MaStR/data/sqlite`. With the function [`Mastr.download()`][open_mastr.Mastr.download], the **whole MaStR is downloaded** in the zipped xml file format. It is then read into the sqlite database and simple data cleansing functions are started. + +If you are interested in a specific part of the dataset, you can specify this by using the `data` parameter: + +```python +from open_mastr import Mastr + +db = Mastr() +db.download(data=["wind","hydro"]) +``` More detailed information can be found in the section [bulk download](advanced.md#bulk-download). 
From d603388f092499fea63761d7f6e5712bc7017a2e Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 14:06:03 +0200 Subject: [PATCH 27/53] Replace print() statements by logging #657 --- open_mastr/mastr.py | 18 +++++++------ open_mastr/soap_api/metadata/description.py | 11 +++++--- open_mastr/utils/helpers.py | 25 +++++++----------- .../xml_download/utils_download_bulk.py | 13 +++++----- .../xml_download/utils_write_to_database.py | 26 ++++++++++--------- 5 files changed, 49 insertions(+), 44 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index fd71fc32..f5cffd7d 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -92,7 +92,7 @@ def __init__(self, engine="sqlite", connect_to_translated_db=False) -> None: else: self.engine = create_database_engine(engine, self._sqlite_folder_path) - print( + log.info( f"Data will be written to the following database: {self.engine.url}\n" "If you run into problems, try to " "delete the database and update the package by running " @@ -239,7 +239,7 @@ def download( download_xml_Mastr(zipped_xml_file_path, date, data, xml_folder_path) - print( + log.info( "\nWould you like to speed up the creation of your MaStR database?\n" "Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True " "or configure your own number of processes via os.environ['NUMBER_OF_PROCESSES'] = your_number\n" @@ -259,8 +259,8 @@ def download( # Set api_processes to None in order to avoid the malfunctioning usage if api_processes: api_processes = None - print( - "Warning: The implementation of parallel processes " + log.warning( + "The implementation of parallel processes " "is currently under construction. Please let " "the argument api_processes at the default value None." 
) @@ -429,9 +429,11 @@ def translate(self) -> None: try: os.remove(new_path) except Exception as e: - print(f"An error occurred: {e}") + log.error( + f"An error occurred while removing old translated database: {e}" + ) - print("Replacing previous version of the translated database...") + log.info("Replacing previous version of the translated database...") for table in inspector.get_table_names(): rename_table(table, inspector.get_columns(table), self.engine) @@ -440,9 +442,9 @@ def translate(self) -> None: try: os.rename(old_path, new_path) - print(f"Database '{old_path}' changed to '{new_path}'") + log.info(f"Database '{old_path}' changed to '{new_path}'") except Exception as e: - print(f"An error occurred: {e}") + log.error(f"An error occurred while renaming database: {e}") self.engine = create_engine(f"sqlite:///{new_path}") self.is_translated = True diff --git a/open_mastr/soap_api/metadata/description.py b/open_mastr/soap_api/metadata/description.py index a4986959..728aec23 100644 --- a/open_mastr/soap_api/metadata/description.py +++ b/open_mastr/soap_api/metadata/description.py @@ -1,10 +1,13 @@ from io import BytesIO +import logging import re from urllib.request import urlopen from zipfile import ZipFile import xmltodict from collections import OrderedDict +log = logging.getLogger(__name__) + class DataDescription(object): """ @@ -150,9 +153,11 @@ def functions_data_documentation(self): fcn["sequence"]["element"]["@type"].split(":")[1] ]["sequence"]["element"] else: - print(type(fcn["sequence"])) - print(fcn["sequence"]) - raise ValueError + log.error(f"Unexpected sequence type: {type(fcn['sequence'])}") + log.error(f"Sequence content: {fcn['sequence']}") + raise ValueError( + f"Unexpected sequence structure in function metadata" + ) # Add data for inherited columns from base types if "@base" in fcn: diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index a4a1f525..9ac2492b 100644 --- a/open_mastr/utils/helpers.py +++ 
b/open_mastr/utils/helpers.py @@ -343,37 +343,32 @@ def print_api_settings( api_processes, api_location_types, ): - print( + log.info( f"Downloading with soap_API.\n\n -- API settings -- \nunits after date: " f"{date}\nunit download limit per data: " f"{api_limit}\nparallel_processes: {api_processes}\nchunksize: " f"{api_chunksize}\ndata_api: {data}" ) if "permit" in harmonisation_log: - print( - f"data_types: {api_data_types}\033[31m", + log.warning( + f"data_types: {api_data_types} - " "Attention, 'permit_data' was automatically set in api_data_types, " - "as you defined 'permit' in parameter data_api.", - "\033[m", + "as you defined 'permit' in parameter data_api." ) else: - print(f"data_types: {api_data_types}") + log.info(f"data_types: {api_data_types}") if "location" in harmonisation_log: - print( - "location_types:", - "\033[31m", - "Attention, 'location' is in parameter data. location_types are set to", - "\033[m", - f"{api_location_types}" - "\n If you want to change location_types, please remove 'location' " + log.warning( + f"location_types: {api_location_types} - " + "Attention, 'location' is in parameter data. location_types are set accordingly. " + "If you want to change location_types, please remove 'location' " "from data_api and specify api_location_types." - "\n ------------------ \n", ) else: - print( + log.info( f"location_types: {api_location_types}", "\n ------------------ \n", ) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 10f60e3d..785d2a3d 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -126,8 +126,7 @@ def download_xml_Mastr( The path where the downloaded MaStR zipped folder will be saved. """ - print_message = "Starting the Download from marktstammdatenregister.de." 
- print(print_message) + log.info("Starting the Download from marktstammdatenregister.de.") # TODO this should take bulk_date_string now = time.localtime() @@ -171,8 +170,10 @@ def download_xml_Mastr( full_download_without_unzip_http(save_path, r) time_b = time.perf_counter() - print(f"Download is finished. It took {int(np.around(time_b - time_a))} seconds.") - print(f"MaStR was successfully downloaded to {xml_folder_path}.") + log.info( + f"Download is finished. It took {int(np.around(time_b - time_a))} seconds." + ) + log.info(f"MaStR was successfully downloaded to {xml_folder_path}.") def check_download_completeness( @@ -216,9 +217,9 @@ def partial_download_with_unzip_http(save_path: str, url: str, bulk_data_list: l save_path, bulk_data_list ) if bool(bulk_data_list): - print(f"MaStR is missing the following data: {bulk_data_list}") + log.info(f"MaStR is missing the following data: {bulk_data_list}") else: - print("MaStR already downloaded.") + log.info("MaStR already downloaded.") return None remote_zip_file = unzip_http.RemoteZipFile(url) diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index 11bbe015..e71abc18 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -19,6 +19,8 @@ from open_mastr.utils.orm import tablename_mapping from open_mastr.xml_download.utils_cleansing_bulk import cleanse_bulk_data +log = setup_logger() + def write_mastr_xml_to_database( engine: sqlalchemy.engine.Engine, @@ -28,7 +30,7 @@ def write_mastr_xml_to_database( bulk_download_date: str, ) -> None: """Write the Mastr in xml format into a database defined by the engine parameter.""" - print("Starting bulk download...") + log.info("Starting bulk download...") include_tables = data_to_include_tables(data, mapping="write_xml") threads_data = [] @@ -71,7 +73,7 @@ def write_mastr_xml_to_database( for item in interleaved_files: process_xml_file(*item) - 
print("Bulk download was successful.") + log.info("Bulk download was successful.") def get_number_of_processes(): @@ -82,11 +84,11 @@ def get_number_of_processes(): try: number_of_processes = int(os.environ.get("NUMBER_OF_PROCESSES")) except ValueError: - print("Warning: Invalid value for NUMBER_OF_PROCESSES. Fallback to 1.") + log.warning("Invalid value for NUMBER_OF_PROCESSES. Fallback to 1.") return 1 if number_of_processes >= cpu_count(): - print( - f"Warning: Your system supports {cpu_count()} CPUs. Using " + log.warning( + f"Your system supports {cpu_count()} CPUs. Using " f"more processes than available CPUs may cause excessive " f"context-switching overhead." ) @@ -118,9 +120,9 @@ def process_xml_file( # The connection url obfuscates the password. We must replace the masked password with the actual password. engine = create_efficient_engine(connection_url) with ZipFile(zipped_xml_file_path, "r") as f: - print(f"Processing file '{file_name}'...") + log.info(f"Processing file '{file_name}'...") if is_first_file(file_name): - print(f"Creating table '{sql_table_name}'...") + log.info(f"Creating table '{sql_table_name}'...") create_database_table(engine, xml_table_name) df = read_xml_file(f, file_name) df = process_table_before_insertion( @@ -137,7 +139,7 @@ def process_xml_file( df, xml_table_name, sql_table_name, engine ) except Exception as e: - print(f"Error processing file '{file_name}': '{e}'") + log.error(f"Error processing file '{file_name}': '{e}'") def create_efficient_engine(connection_url: str) -> sqlalchemy.engine.Engine: @@ -224,7 +226,7 @@ def is_table_relevant(xml_table_name: str, include_tables: list) -> bool: tablename_mapping[xml_table_name]["__class__"] is not None ) except KeyError: - print( + log.warning( f"Table '{xml_table_name}' is not supported by your open-mastr version and " f"will be skipped." 
) @@ -451,7 +453,7 @@ def write_single_entries_until_not_unique_comes_up( labels=key_list, errors="ignore" ) # drop primary keys that already exist in the table df = df.reset_index() - print(f"{len_df_before - len(df)} entries already existed in the database.") + log.warning(f"{len_df_before - len(df)} entries already existed in the database.") return df @@ -509,7 +511,7 @@ def add_missing_columns_to_table( def delete_wrong_xml_entry(err: Error, df: pd.DataFrame) -> pd.DataFrame: delete_entry = str(err).split("«")[0].split("»")[1] - print(f"The entry {delete_entry} was deleted due to its false data type.") + log.warning(f"The entry {delete_entry} was deleted due to its false data type.") return df.replace(delete_entry, np.nan) @@ -548,7 +550,7 @@ def find_nearest_brackets(xml_string: str, position: int) -> tuple[int, int]: row_with_error[: left_bracket + 1] + row_with_error[right_bracket:] ) try: - print("One invalid xml expression was deleted.") + log.warning("One invalid xml expression was deleted.") df = pd.read_xml(StringIO("\n".join(data))) return df except lxml.etree.XMLSyntaxError as e: From a9081addd6abd3eee4aadcf3e9724846785a0284 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 14:23:17 +0200 Subject: [PATCH 28/53] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 725f1ce6..9ef592cf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ [#636](https://github.com/OpenEnergyPlatform/open-MaStR/pull/636) - Change print statement about data cleansing [#650](https://github.com/OpenEnergyPlatform/open-MaStR/pull/650) +- Improve logging + [#666](https://github.com/OpenEnergyPlatform/open-MaStR/pull/666) ### Removed From ed96ecbcbbb02b185cd7e84a1c2f543acf6f3f0d Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 14:35:47 +0200 Subject: [PATCH 29/53] Replace print() statements by logging 
in unzip_http.py and apply black #657 --- open_mastr/utils/unzip_http.py | 245 +++++++++++++++++++++------------ 1 file changed, 157 insertions(+), 88 deletions(-) diff --git a/open_mastr/utils/unzip_http.py b/open_mastr/utils/unzip_http.py index ef4140f8..0674e130 100644 --- a/open_mastr/utils/unzip_http.py +++ b/open_mastr/utils/unzip_http.py @@ -58,32 +58,39 @@ import pathlib import urllib.parse import zipfile +import logging +log = logging.getLogger(__name__) -__version__ = '0.6' +__version__ = "0.6" def error(s): raise Exception(s) + def warning(s): - print(s, file=sys.stderr) + log.warning(s) + -def get_bits(val:int, *args): - 'Generate bitfields (one for each arg) from LSB to MSB.' +def get_bits(val: int, *args): + "Generate bitfields (one for each arg) from LSB to MSB." for n in args: - x = val & (2**n-1) + x = val & (2**n - 1) val >>= n yield x class RemoteZipInfo: - def __init__(self, filename:str='', - date_time:int = 0, - header_offset:int = 0, - compress_type:int = 0, - compress_size:int = 0, - file_size:int = 0): + def __init__( + self, + filename: str = "", + date_time: int = 0, + header_offset: int = 0, + compress_type: int = 0, + compress_size: int = 0, + file_size: int = 0, + ): self.filename = filename self.header_offset = header_offset self.compress_type = compress_type @@ -91,46 +98,51 @@ def __init__(self, filename:str='', self.file_size = file_size sec, mins, hour, day, mon, year = get_bits(date_time, 5, 6, 5, 5, 4, 7) - self.date_time = (year+1980, mon, day, hour, mins, sec) + self.date_time = (year + 1980, mon, day, hour, mins, sec) def is_dir(self): - return self.filename.endswith('/') + return self.filename.endswith("/") def parse_extra(self, extra): i = 0 while i < len(extra): - fieldid, fieldsz = struct.unpack_from('= 0: - magic, eocd_sz, create_ver, min_ver, disk_num, disk_start, disk_num_records, total_num_records, \ - cdir_bytes, cdir_start = struct.unpack_from(self.fmt_eocd64, resp.data, offset=i) + ( + magic, + eocd_sz, + 
create_ver, + min_ver, + disk_num, + disk_start, + disk_num_records, + total_num_records, + cdir_bytes, + cdir_start, + ) = struct.unpack_from(self.fmt_eocd64, resp.data, offset=i) else: i = resp.data.rfind(self.magic_eocd) if i >= 0: - magic, \ - disk_num, disk_start, disk_num_records, total_num_records, \ - cdir_bytes, cdir_start, comment_len = struct.unpack_from(self.fmt_eocd, resp.data, offset=i) + ( + magic, + disk_num, + disk_start, + disk_num_records, + total_num_records, + cdir_bytes, + cdir_start, + comment_len, + ) = struct.unpack_from(self.fmt_eocd, resp.data, offset=i) if cdir_start < 0 or cdir_start >= self.zip_size: - error('cannot find central directory') + error("cannot find central directory") if self.zip_size <= 65536: filehdr_index = cdir_start @@ -194,67 +222,91 @@ def infoiter(self): while filehdr_index < cdir_end: sizeof_cdirentry = struct.calcsize(self.fmt_cdirentry) - magic, ver, ver_needed, flags, method, date_time, crc, \ - complen, uncomplen, fnlen, extralen, commentlen, \ - disknum_start, internal_attr, external_attr, local_header_ofs = \ - struct.unpack_from(self.fmt_cdirentry, resp.data, offset=filehdr_index) + ( + magic, + ver, + ver_needed, + flags, + method, + date_time, + crc, + complen, + uncomplen, + fnlen, + extralen, + commentlen, + disknum_start, + internal_attr, + external_attr, + local_header_ofs, + ) = struct.unpack_from(self.fmt_cdirentry, resp.data, offset=filehdr_index) filehdr_index += sizeof_cdirentry - filename = resp.data[filehdr_index:filehdr_index+fnlen] + filename = resp.data[filehdr_index : filehdr_index + fnlen] filehdr_index += fnlen - extra = resp.data[filehdr_index:filehdr_index+extralen] + extra = resp.data[filehdr_index : filehdr_index + extralen] filehdr_index += extralen # comment = resp.data[filehdr_index:filehdr_index+commentlen] filehdr_index += commentlen - rzi = RemoteZipInfo(filename.decode(), date_time, local_header_ofs, method, complen, uncomplen) + rzi = RemoteZipInfo( + filename.decode(), + 
date_time, + local_header_ofs, + method, + complen, + uncomplen, + ) rzi.parse_extra(extra) yield rzi def extract(self, member, path=None, pwd=None): - if pwd: - raise NotImplementedError('Passwords not supported yet') + if pwd: + raise NotImplementedError("Passwords not supported yet") - path = path or pathlib.Path('.') + path = path or pathlib.Path(".") - outpath = path/member - os.makedirs(outpath.parent, exist_ok=True) - with self.open(member) as fpin: - with open(path/member, mode='wb') as fpout: - while True: - r = fpin.read(65536) - if not r: - break - fpout.write(r) + outpath = path / member + os.makedirs(outpath.parent, exist_ok=True) + with self.open(member) as fpin: + with open(path / member, mode="wb") as fpout: + while True: + r = fpin.read(65536) + if not r: + break + fpout.write(r) - def extractzip(self, member, path=None, pwd=None): if pwd: - raise NotImplementedError('Passwords not supported yet') + raise NotImplementedError("Passwords not supported yet") - path = path or pathlib.Path('.') + path = path or pathlib.Path(".") outpath = path os.makedirs(outpath.parent, exist_ok=True) with self.open(member) as fpin: - with zipfile.ZipFile(outpath, 'a', zipfile.ZIP_DEFLATED) as zout: - with zout.open(member,'w') as fpout: + with zipfile.ZipFile(outpath, "a", zipfile.ZIP_DEFLATED) as zout: + with zout.open(member, "w") as fpout: while True: r = fpin.read(65536) if not r: break fpout.write(r) - def extractall(self, path=None, members=None, pwd=None): for fn in members or self.namelist(): self.extract(fn, path, pwd=pwd) def get_range(self, start, n): - return self.http.request('GET', self.url, headers={'Range': f'bytes={start}-{start+n-1}'}, preload_content=False) + return self.http.request( + "GET", + self.url, + headers={"Range": f"bytes={start}-{start+n-1}"}, + preload_content=False, + ) def matching_files(self, *globs): for f in self.files.values(): @@ -265,7 +317,7 @@ def open(self, fn): if isinstance(fn, str): f = list(self.matching_files(fn)) if not 
f: - error(f'no files matching {fn}') + error(f"no files matching {fn}") f = f[0] else: f = fn @@ -273,14 +325,29 @@ def open(self, fn): sizeof_localhdr = struct.calcsize(self.fmt_localhdr) r = self.get_range(f.header_offset, sizeof_localhdr) localhdr = struct.unpack_from(self.fmt_localhdr, r.data) - magic, ver, flags, method, dos_datetime, _, _, uncomplen, fnlen, extralen = localhdr - if method == 0: # none - return self.get_range(f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size) - elif method == 8: # DEFLATE - resp = self.get_range(f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size) + ( + magic, + ver, + flags, + method, + dos_datetime, + _, + _, + uncomplen, + fnlen, + extralen, + ) = localhdr + if method == 0: # none + return self.get_range( + f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size + ) + elif method == 8: # DEFLATE + resp = self.get_range( + f.header_offset + sizeof_localhdr + fnlen + extralen, f.compress_size + ) return io.BufferedReader(RemoteZipStream(resp, f)) else: - error(f'unknown compression method {method}') + error(f"unknown compression method {method}") def open_text(self, fn): return io.TextIOWrapper(self.open(fn)) @@ -298,7 +365,7 @@ def readable(self): def readinto(self, b): r = self.read(len(b)) - b[:len(r)] = r + b[: len(r)] = r return len(r) def read(self, n): @@ -315,10 +382,11 @@ def read(self, n): return ret - ### script start +### script start + class StreamProgress: - def __init__(self, fp, name='', total=0): + def __init__(self, fp, name="", total=0): self.name = name self.fp = fp self.total = total @@ -334,10 +402,12 @@ def read(self, n): self.last_update = now elapsed_s = now - self.start_time - sys.stderr.write(f'\r{elapsed_s:.0f}s {self.amtread/10**6:.02f}/{self.total/10**6:.02f}MB ({self.amtread/10**6/elapsed_s:.02f} MB/s) {self.name}') + sys.stderr.write( + f"\r{elapsed_s:.0f}s {self.amtread/10**6:.02f}/{self.total/10**6:.02f}MB 
({self.amtread/10**6/elapsed_s:.02f} MB/s) {self.name}" + ) if not r: - sys.stderr.write('\n') + sys.stderr.write("\n") return r @@ -347,14 +417,14 @@ def safelog(x): return 1 if x == 0 else math.ceil(math.log10(x)) digits_compr = max(safelog(f.compress_size) for f in rzf.infolist()) - digits_plain = max(safelog(f.file_size ) for f in rzf.infolist()) - fmtstr = f'%{digits_compr}d -> %{digits_plain}d\t%s' + digits_plain = max(safelog(f.file_size) for f in rzf.infolist()) + fmtstr = f"%{digits_compr}d -> %{digits_plain}d\t%s" for f in rzf.infolist(): - print(fmtstr % (f.compress_size, f.file_size, f.filename), file=sys.stderr) + log.info(fmtstr % (f.compress_size, f.file_size, f.filename)) def extract_one(outfile, rzf, f, ofname): - print(f'Extracting {f.filename} to {ofname}...', file=sys.stderr) + log.info(f"Extracting {f.filename} to {ofname}...") fp = StreamProgress(rzf.open(f), name=f.filename, total=f.compress_size) while r := fp.read(2**18): @@ -374,6 +444,5 @@ def download_file(f, rzf, args): else: path = path.name - with open(str(path), 'wb') as of: + with open(str(path), "wb") as of: extract_one(of, rzf, f, str(path)) - From e7ca8862faa6cdd9b39e21906bd29c4c2888c64c Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 17:02:39 +0200 Subject: [PATCH 30/53] Logging: add formatter for debug messages #664 --- open_mastr/utils/config/logging.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/open_mastr/utils/config/logging.yml b/open_mastr/utils/config/logging.yml index 64a5ac75..67ce82b3 100644 --- a/open_mastr/utils/config/logging.yml +++ b/open_mastr/utils/config/logging.yml @@ -4,6 +4,8 @@ disable_existing_loggers: False formatters: standard: format: "%(asctime)s [%(levelname)s] %(message)s" + debug: + format: "%(asctime)s [%(levelname)s] %(name)s:%(funcName)s:%(lineno)d - %(message)s" handlers: console: From 2f7ba6c17de6349e8b5c95462b6387c38d660cbe Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 17:03:49 +0200 Subject: [PATCH 31/53] 
Logging: set package log level instead of global log level #664 --- open_mastr/utils/config/logging.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/open_mastr/utils/config/logging.yml b/open_mastr/utils/config/logging.yml index 67ce82b3..64b038ec 100644 --- a/open_mastr/utils/config/logging.yml +++ b/open_mastr/utils/config/logging.yml @@ -14,14 +14,12 @@ handlers: class: "logging.StreamHandler" stream: "ext://sys.stdout" file: - class: "logging.FileHandler" level: "DEBUG" - formatter: "standard" + formatter: "debug" + class: "logging.FileHandler" mode: "a" -root: - level: "DEBUG" - loggers: open-MaStR: + level: "DEBUG" handlers: ["console", "file"] From 57a58969dc586f504493995b9d0d3af305fe62f5 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 17:07:15 +0200 Subject: [PATCH 32/53] Logging: do not propagate messages to global logger #664 --- open_mastr/utils/config/logging.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/open_mastr/utils/config/logging.yml b/open_mastr/utils/config/logging.yml index 64b038ec..68a6999a 100644 --- a/open_mastr/utils/config/logging.yml +++ b/open_mastr/utils/config/logging.yml @@ -23,3 +23,4 @@ loggers: open-MaStR: level: "DEBUG" handlers: ["console", "file"] + propagate: no From 5d6d4d52aca92e5a48fda549fbea39206b230cff Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 17:37:30 +0200 Subject: [PATCH 33/53] Logging: set default console log level to info #664 --- open_mastr/utils/config/logging.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_mastr/utils/config/logging.yml b/open_mastr/utils/config/logging.yml index 68a6999a..c1b4c29b 100644 --- a/open_mastr/utils/config/logging.yml +++ b/open_mastr/utils/config/logging.yml @@ -21,6 +21,6 @@ handlers: loggers: open-MaStR: - level: "DEBUG" + level: "INFO" handlers: ["console", "file"] propagate: no From bf7e46362dc29f8b1858d46b49403ec16967580c Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 
17:37:52 +0200 Subject: [PATCH 34/53] Add splash screen --- open_mastr/mastr.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index f5cffd7d..0cc16f21 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -93,6 +93,9 @@ def __init__(self, engine="sqlite", connect_to_translated_db=False) -> None: self.engine = create_database_engine(engine, self._sqlite_folder_path) log.info( + "\n==================================================\n" + "---------> open-MaStR started <---------\n" + "==================================================\n" f"Data will be written to the following database: {self.engine.url}\n" "If you run into problems, try to " "delete the database and update the package by running " From 0ff6c8154b54a9250cef74a65005616fc1b26891 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 17:49:30 +0200 Subject: [PATCH 35/53] Logging: extend instructions in docs #664 --- docs/advanced.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/advanced.md b/docs/advanced.md index ec4ffd39..b5441632 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -83,6 +83,19 @@ The project home directory is structured as follows (files and folders below `da For the download via the API, logs are stored in a single file in `/$HOME//.open-MaStR/logs/open_mastr.log`. New logging messages are appended. It is recommended to delete the log file from time to time because of its required disk space. +By default, the log level is set to `INFO`. You can increase or decrease the verbosity by either changing `logging.yml` (see above) +or adjusting it manually in your code. E.g. 
to enable `DEBUG` messages in `open_mastr.log` you can use the following snippet: + +```python + + import logging + from open_mastr import Mastr + + # Increase to DEBUG to show more details in open_mastr.log + # Must be called after importing open_mastr to have the open-MaStR logger imported + logging.getLogger("open-MaStR").setLevel(logging.DEBUG) +``` + ### Data From 2899552918cb0c621c1456497017279c66a9ac00 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 20 Oct 2025 17:52:06 +0200 Subject: [PATCH 36/53] Logging: extend instructions in docs #664 --- docs/advanced.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced.md b/docs/advanced.md index b5441632..7aa5d9b7 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -63,7 +63,7 @@ The project home directory is structured as follows (files and folders below `da File names are defined here. * `logging.yml`
Logging configuration. For changing the log level to increase or decrease details of log - messages, edit the level of the handlers. + messages, edit the level of the handlers. See below for details on logging. * **data** * `dataversion-`
Contains exported data as csv files from method [`to_csv`][open_mastr.Mastr.to_csv] From ca2229cc2f1f7c8bf4e5b67fbedf6520b39bba67 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 07:05:52 +0200 Subject: [PATCH 37/53] Add option to keep old zip files on download #564 --- open_mastr/mastr.py | 9 ++++++- .../xml_download/utils_download_bulk.py | 26 ++++++++++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index fd71fc32..a9c9d999 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -107,6 +107,7 @@ def download( data=None, date=None, bulk_cleansing=True, + keep_old_downloads: bool = False, api_processes=None, api_limit=50, api_chunksize=1000, @@ -168,6 +169,8 @@ def download( In its original format, many entries in the MaStR are encoded with IDs. Columns like `state` or `fueltype` do not contain entries such as "Hessen" or "Braunkohle", but instead only contain IDs. Cleansing replaces these IDs with their corresponding original entries. + keep_old_downloads: bool + If set to True, prior downloaded MaStR zip files will be kept. api_processes : int or None or "max", optional Number of parallel processes used to download additional data. Defaults to `None`. 
If set to "max", the maximum number of possible processes @@ -235,7 +238,11 @@ def download( ) delete_zip_file_if_corrupted(zipped_xml_file_path) - delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) + delete_xml_files_not_from_given_date( + zipped_xml_file_path, + xml_folder_path, + keep_old_downloads, + ) download_xml_Mastr(zipped_xml_file_path, date, data, xml_folder_path) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 10f60e3d..aeaaccff 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -123,7 +123,11 @@ def download_xml_Mastr( Parameters ----------- save_path: str - The path where the downloaded MaStR zipped folder will be saved. + Full file path where the downloaded MaStR zip file will be saved. + bulk_date_string: str + Date for which the file should be downloaded. + xml_folder_path: str + Path where the downloaded MaStR zip file will be saved. """ print_message = "Starting the Download from marktstammdatenregister.de." @@ -197,16 +201,30 @@ def check_download_completeness( return list(missing_data_set), is_katalogwerte_existing -def delete_xml_files_not_from_given_date(save_path: str, xml_folder_path: str): +def delete_xml_files_not_from_given_date( + save_path: str, + xml_folder_path: str, + keep_old_downloads: bool = False, +) -> None: """ Delete xml files that are not corresponding to the given date. Assumes that the xml folder only contains one zipfile. + + Parameters + ---------- + save_path: str + Full file path where the downloaded MaStR zip file will be saved. + xml_folder_path: str + Path where the downloaded MaStR zip file will be saved. + keep_old_downloads: bool + If set to True, prior downloaded MaStR zip files will be kept. 
""" if os.path.exists(save_path): return else: - shutil.rmtree(xml_folder_path) - os.makedirs(xml_folder_path) + if not keep_old_downloads: + shutil.rmtree(xml_folder_path) + os.makedirs(xml_folder_path) def partial_download_with_unzip_http(save_path: str, url: str, bulk_data_list: list): From 8a6911fb930683682ed220e1c25e423469dcf628 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 09:07:47 +0200 Subject: [PATCH 38/53] Complete docstring #564 --- open_mastr/xml_download/utils_download_bulk.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index aeaaccff..9069908c 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -126,6 +126,8 @@ def download_xml_Mastr( Full file path where the downloaded MaStR zip file will be saved. bulk_date_string: str Date for which the file should be downloaded. + bulk_data_list: list + List of tables/technologis to be downloaded. xml_folder_path: str Path where the downloaded MaStR zip file will be saved. 
""" From 7200b29dfa8e515120d555e419d43a6f3de4e29b Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 09:10:09 +0200 Subject: [PATCH 39/53] Add technology checks to full bulk download: do not download if data is present #668 --- .../xml_download/utils_download_bulk.py | 37 +++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 9069908c..2469141d 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -168,13 +168,13 @@ def download_xml_Mastr( return if bulk_data_list == BULK_DATA: - full_download_without_unzip_http(save_path, r) + full_download_without_unzip_http(save_path, r, bulk_data_list) else: try: partial_download_with_unzip_http(save_path, url, bulk_data_list) except Exception as e: log.warning(f"Partial download failed, fallback to full download: {e}") - full_download_without_unzip_http(save_path, r) + full_download_without_unzip_http(save_path, r, bulk_data_list) time_b = time.perf_counter() print(f"Download is finished. It took {int(np.around(time_b - time_a))} seconds.") @@ -271,7 +271,38 @@ def partial_download_with_unzip_http(save_path: str, url: str, bulk_data_list: l remote_zip_file.extractzip("Katalogwerte.xml", path=Path(save_path)) -def full_download_without_unzip_http(save_path: str, r: requests.models.Response): +def full_download_without_unzip_http( + save_path: str, + r: requests.models.Response, + bulk_data_list: list, +) -> None: + """ + + Parameters + ---------- + save_path: str + Full file path where the downloaded MaStR zip file will be saved. + r: requests.models.Response + Response from making a request to MaStR. + bulk_data_list: list + List of tables/technologis to be downloaded. 
+ + Returns + ------- + None + """ + if os.path.exists(save_path): + bulk_data_list, is_katalogwerte_existing = check_download_completeness( + save_path, bulk_data_list + ) + if bool(bulk_data_list): + print( + f"MaStR file already present but missing the following data: {bulk_data_list}" + ) + else: + print(f"MaStR file already present: {save_path}") + return None + warning_message = ( "Warning: The servers from MaStR restrict the download speed." " You may want to download it another time." From 33903e033079e0dcb0ba729eb746bb6abb6b2259 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 09:11:04 +0200 Subject: [PATCH 40/53] Complete docstring and adjust messages #564 --- .../xml_download/utils_download_bulk.py | 21 +++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 2469141d..a18f4111 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -230,15 +230,32 @@ def delete_xml_files_not_from_given_date( def partial_download_with_unzip_http(save_path: str, url: str, bulk_data_list: list): + """ + + Parameters + ---------- + save_path: str + Full file path where the downloaded MaStR zip file will be saved. + url: str + URL path to bulk file. + bulk_data_list: list + List of tables/technologies to be downloaded.
+ + Returns + ------- + None + """ is_katalogwerte_existing = False if os.path.exists(save_path): bulk_data_list, is_katalogwerte_existing = check_download_completeness( save_path, bulk_data_list ) if bool(bulk_data_list): - print(f"MaStR is missing the following data: {bulk_data_list}") + print( + f"MaStR file already present but missing the following data: {bulk_data_list}" + ) else: - print("MaStR already downloaded.") + print(f"MaStR file already present: {save_path}") return None remote_zip_file = unzip_http.RemoteZipFile(url) From 95f2dab2746c2ff852addde55fff6c09b274eb6e Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 09:14:44 +0200 Subject: [PATCH 41/53] Update changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 725f1ce6..66ff8971 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,10 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ [#636](https://github.com/OpenEnergyPlatform/open-MaStR/pull/636) - Change print statement about data cleansing [#650](https://github.com/OpenEnergyPlatform/open-MaStR/pull/650) +- Several improvements in XML download: Support retaining old bulk XML files; + Prevent XML file deletion on full download; Add technology checks to full + bulk download + [#667](https://github.com/OpenEnergyPlatform/open-MaStR/pull/667) ### Removed From d1206f0f8a1885200229b74bdeaeb5a021c74a24 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 09:30:15 +0200 Subject: [PATCH 42/53] Adjust docs on partial downloads --- docs/advanced.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/advanced.md b/docs/advanced.md index ec4ffd39..f78aafeb 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -148,7 +148,7 @@ If needed, the tables in the database can be obtained as csv files. 
Those files === "Disadvantages" * No single tables or entries can be downloaded - * Download takes long time + * Download takes long time (you can use the partial download though, see [Getting Started](getting_started.md#bulk-download)) ## SOAP API download From b0193548e4b78e1d17ad46c8616f59e1ba046760 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 09:45:44 +0200 Subject: [PATCH 43/53] Extend docs #564 --- docs/advanced.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/advanced.md b/docs/advanced.md index f78aafeb..0cbc9ae6 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -150,6 +150,9 @@ If needed, the tables in the database can be obtained as csv files. Those files * No single tables or entries can be downloaded * Download takes long time (you can use the partial download though, see [Getting Started](getting_started.md#bulk-download)) +**Note**: By default, existing zip files in `$HOME/.open-MaStR/data/xml_download` are deleted when a new file is +downloaded. You can change this behavior by setting `keep_old_downloads`=True in +[`Mastr.download()`][open_mastr.Mastr.download]. 
## SOAP API download From c51f3791ca928ef369fcc5f984f745e2c1fe7dee Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 09:49:32 +0200 Subject: [PATCH 44/53] Update changelog --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 66ff8971..dbd12aa2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,8 +17,8 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ [#636](https://github.com/OpenEnergyPlatform/open-MaStR/pull/636) - Change print statement about data cleansing [#650](https://github.com/OpenEnergyPlatform/open-MaStR/pull/650) -- Several improvements in XML download: Support retaining old bulk XML files; - Prevent XML file deletion on full download; Add technology checks to full +- Several improvements in bulk download: Support retaining old zip bulk files; + Prevent zip file deletion on full download; Add technology checks to full bulk download [#667](https://github.com/OpenEnergyPlatform/open-MaStR/pull/667) ### Removed From b5a4aacbe917d417831892d542b529c722dd385b Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 15:34:55 +0200 Subject: [PATCH 45/53] Add fixture zipped_xml_file_path to test_mastr.py --- tests/test_mastr.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/test_mastr.py b/tests/test_mastr.py index df4a3a20..4fe446de 100644 --- a/tests/test_mastr.py +++ b/tests/test_mastr.py @@ -14,6 +14,16 @@ _xml_file_exists = True +@pytest.fixture(scope="module") +def zipped_xml_file_path(): + zipped_xml_file_path = None + for entry in os.scandir(path=_xml_folder_path): + if "Gesamtdatenexport" in entry.name: + zipped_xml_file_path = os.path.join(_xml_folder_path, entry.name) + + return zipped_xml_file_path + + @pytest.fixture def db_path(): return os.path.join( From 9a007cabdc54e0c4b420d7815685df2bf9fa63c1 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 15:36:43 +0200 Subject: [PATCH 46/53] Add test: check if 
keeping old downloads works #564 --- tests/test_mastr.py | 17 +++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_mastr.py b/tests/test_mastr.py index 4fe446de..16f7c1f6 100644 --- a/tests/test_mastr.py +++ b/tests/test_mastr.py @@ -1,10 +1,14 @@ +import shutil + from open_mastr.mastr import Mastr import os +import re import sqlalchemy import pytest from os.path import expanduser import pandas as pd from open_mastr.utils.constants import TRANSLATIONS +from datetime import date, timedelta _xml_file_exists = False _xml_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "xml_download") @@ -83,6 +87,7 @@ def test_Mastr_translate(db_translated, db_path): assert pd.read_sql(sql=table, con=db_empty.engine).shape[0] == 0 +@pytest.mark.dependency(name="bulk_downloaded") def test_mastr_download(db): db.download(data="wind") df_wind = pd.read_sql("wind_extended", con=db.engine) @@ -92,3 +97,15 @@ def test_mastr_download(db): df_biomass = pd.read_sql("biomass_extended", con=db.engine) assert len(df_wind) > 10000 assert len(df_biomass) > 10000 + + +@pytest.mark.dependency(depends=["bulk_downloaded"]) +def test_mastr_download_keep_old_files(db, zipped_xml_file_path): + file_today = zipped_xml_file_path + yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d") + file_old = re.sub(r"\d{8}", yesterday, os.path.basename(file_today)) + file_old = os.path.join(os.path.dirname(zipped_xml_file_path), file_old) + shutil.copy(file_today, file_old) + db.download(data="gsgk", keep_old_downloads=True) + + assert os.path.exists(file_old) From d6ef3308aff9823e1e9a0e94c6fb191814e1d5c1 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 16:06:21 +0200 Subject: [PATCH 47/53] Set number of parallel CI jobs to 1 This prevents being blocked by the MaStR server due to parallel HTTP requests by GH actions --- .github/workflows/ci-develop.yml | 1 + .github/workflows/ci-production.yml | 1 + 2 files changed, 2 insertions(+) diff --git
a/.github/workflows/ci-develop.yml b/.github/workflows/ci-develop.yml index 6a7a6457..5794d864 100644 --- a/.github/workflows/ci-develop.yml +++ b/.github/workflows/ci-develop.yml @@ -13,6 +13,7 @@ jobs: runs-on: ${{ matrix.os }} if: ${{ !github.event.pull_request.draft }} strategy: + max-parallel: 1 matrix: os: [macos-latest, ubuntu-latest, windows-latest] python-version: ['3.10', '3.11', '3.12'] diff --git a/.github/workflows/ci-production.yml b/.github/workflows/ci-production.yml index 16065860..5c4ffc3b 100644 --- a/.github/workflows/ci-production.yml +++ b/.github/workflows/ci-production.yml @@ -13,6 +13,7 @@ jobs: runs-on: ${{ matrix.os }} if: ${{ !github.event.pull_request.draft }} strategy: + max-parallel: 1 matrix: os: [macos-latest, ubuntu-latest, windows-latest] python-version: ['3.10', '3.11', '3.12'] From 0f53502c9b18353064009ef653694a80ece26dbc Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 16:11:33 +0200 Subject: [PATCH 48/53] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 725f1ce6..0eb6fa2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ [#636](https://github.com/OpenEnergyPlatform/open-MaStR/pull/636) - Change print statement about data cleansing [#650](https://github.com/OpenEnergyPlatform/open-MaStR/pull/650) +- Limit number of parallel CI jobs + [#669](https://github.com/OpenEnergyPlatform/open-MaStR/pull/669) ### Removed From 846d500e436be1064e54ea00d4b077ca571ef310 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Sat, 25 Oct 2025 07:02:53 +0200 Subject: [PATCH 49/53] Move check of keep_old_downloads outside of function #564 --- open_mastr/mastr.py | 10 +++++----- open_mastr/xml_download/utils_download_bulk.py | 8 ++------ 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index a9c9d999..2f0ffe9f 100644 --- 
a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -238,11 +238,11 @@ def download( ) delete_zip_file_if_corrupted(zipped_xml_file_path) - delete_xml_files_not_from_given_date( - zipped_xml_file_path, - xml_folder_path, - keep_old_downloads, - ) + if not keep_old_downloads: + delete_xml_files_not_from_given_date( + zipped_xml_file_path, + xml_folder_path, + ) download_xml_Mastr(zipped_xml_file_path, date, data, xml_folder_path) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index a18f4111..50158117 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -206,7 +206,6 @@ def check_download_completeness( def delete_xml_files_not_from_given_date( save_path: str, xml_folder_path: str, - keep_old_downloads: bool = False, ) -> None: """ Delete xml files that are not corresponding to the given date. @@ -218,15 +217,12 @@ def delete_xml_files_not_from_given_date( Full file path where the downloaded MaStR zip file will be saved. xml_folder_path: str Path where the downloaded MaStR zip file will be saved. - keep_old_downloads: bool - If set to True, prior downloaded MaStR zip files will be kept. 
""" if os.path.exists(save_path): return else: - if not keep_old_downloads: - shutil.rmtree(xml_folder_path) - os.makedirs(xml_folder_path) + shutil.rmtree(xml_folder_path) + os.makedirs(xml_folder_path) def partial_download_with_unzip_http(save_path: str, url: str, bulk_data_list: list): From 2d1b7151ed7836cff8c60e0ecd4a1feaa59d6b08 Mon Sep 17 00:00:00 2001 From: Florian Kotthoff <74312290+FlorianK13@users.noreply.github.com> Date: Tue, 11 Nov 2025 09:49:37 +0100 Subject: [PATCH 50/53] Repair or delete broken links #679 --- README.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 9bbc4dbb..7e15968e 100644 --- a/README.rst +++ b/README.rst @@ -108,7 +108,6 @@ These projects already use open-mastr: - `Wasserstoffatlas `_ - `EE-Status App `_ - `Digiplan Anhalt `_ -- `Data Quality Assessment of the MaStR `_ - `EmPowerPlan `_ - `Goal100 Monitor `_ @@ -119,7 +118,6 @@ changes in a `Pull Request `_. - The `bundesAPI/Marktstammdaten-API `_ is another implementation to access data via an official API. Collaboration @@ -146,7 +144,7 @@ Data .. |badge_license| image:: https://img.shields.io/github/license/OpenEnergyPlatform/open-MaStR - :target: LICENSE.txt + :target: LICENSE.md :alt: License .. |badge_rtd| image:: https://readthedocs.org/projects/open-mastr/badge/?style=flat From cffe9eb130d0b728693e51bb2f75c6b55e0f18e0 Mon Sep 17 00:00:00 2001 From: Florian Kotthoff <74312290+FlorianK13@users.noreply.github.com> Date: Tue, 11 Nov 2025 09:50:40 +0100 Subject: [PATCH 51/53] Fix formatting in README #679 --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 7e15968e..80250d3a 100644 --- a/README.rst +++ b/README.rst @@ -118,6 +118,7 @@ changes in a `Pull Request `_ is another implementation to access data via an official API. 
Collaboration From 0bce4ea51a8f0805faa02f4a65c3b9fa3520b85c Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 25 Nov 2025 09:28:12 +0100 Subject: [PATCH 52/53] Version update v0.16.0 --- .bumpversion.cfg | 2 +- .github/workflows/ci-production.yml | 2 +- CHANGELOG.md | 2 +- CITATION.cff | 4 ++-- pyproject.toml | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 6e37a578..f22ed637 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.15.0 +current_version = 0.16.0 parse = (?P\d+)\.(?P\d+)\.(?P\d+)((?P(a|na))+(?P\d+))? serialize = {major}.{minor}.{patch}{release}{build} diff --git a/.github/workflows/ci-production.yml b/.github/workflows/ci-production.yml index 5c4ffc3b..ce18c2e9 100644 --- a/.github/workflows/ci-production.yml +++ b/.github/workflows/ci-production.yml @@ -33,7 +33,7 @@ jobs: - name: create package run: python -m build --sdist - name: import open-mastr - run: python -m pip install ./dist/open_mastr-0.15.0.tar.gz + run: python -m pip install ./dist/open_mastr-0.16.0.tar.gz - name: Create credentials file env: MASTR_TOKEN: ${{ secrets.MASTR_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index e950fd7d..e9e1281a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ For each version important additions, changes and removals are listed here. The format is inspired from [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
-## [v0.XX.X] unreleased - 202X-XX-XX +## [v0.16.0] PartialPumpkinPull - 2025-11-26 ### Added - Add partial bulk download [#652](https://github.com/OpenEnergyPlatform/open-MaStR/pull/652) diff --git a/CITATION.cff b/CITATION.cff index 99458ea3..d496ecf2 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -41,7 +41,7 @@ authors: title: "open-MaStR" type: software license: AGPL-3.0 -version: 0.15.0 +version: 0.16.0 doi: -date-released: 2025-04-19 +date-released: 2025-11-26 url: "https://github.com/OpenEnergyPlatform/open-MaStR/" diff --git a/pyproject.toml b/pyproject.toml index a4fcb367..5871bfbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "open_mastr" -version = "0.15.0" +version = "0.16.0" dependencies = [ "pandas>=2.2.2", "numpy", @@ -79,4 +79,4 @@ open_mastr = [ include = ["open_mastr", "open_mastr.soap_api", "open_mastr.soap_api.metadata", "open_mastr.utils", "open_mastr.utils.config", "open_mastr.xml_download"] # package names should match these glob patterns (["*"] by default) # from setup.py - not yet included in here -# download_url="https://github.com/OpenEnergyPlatform/open-MaStR/archive""/refs/tags/v0.15.0.tar.gz", +# download_url="https://github.com/OpenEnergyPlatform/open-MaStR/archive""/refs/tags/v0.16.0.tar.gz", From dde75165584a2450ecdcf47bd7fcb0ec3e480eee Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 25 Nov 2025 15:02:39 +0100 Subject: [PATCH 53/53] Change release title --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e9e1281a..a6382314 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ For each version important additions, changes and removals are listed here. The format is inspired from [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
-## [v0.16.0] PartialPumpkinPull - 2025-11-26 +## [v0.16.0] Partial downloads with open-MaStR PartialPumpkinPull - 2025-11-26 ### Added - Add partial bulk download [#652](https://github.com/OpenEnergyPlatform/open-MaStR/pull/652)