"""Dynamic creation of SQLAlchemy model classes for MaStR tables.

Model classes are generated at runtime from ``MastrTableDescription``
objects that were parsed out of the official MaStR XSD files.
"""
import datetime
from dataclasses import dataclass
from typing import Any, Union

from sqlalchemy import Boolean, Column, Date, DateTime, Float, Integer, String
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column

import xmlschema
from xmlschema.validators.simple_types import XsdAtomicBuiltin, XsdAtomicRestriction

from open_mastr.utils.xsd_tables import MastrColumnType, MastrTableDescription


# Maps the abstract MaStR column types onto concrete SQLAlchemy column types.
MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE = {
    MastrColumnType.STRING: String,
    MastrColumnType.INTEGER: Integer,
    MastrColumnType.FLOAT: Float,
    MastrColumnType.DATE: Date,
    MastrColumnType.DATETIME: DateTime(timezone=True),
    MastrColumnType.BOOLEAN: Boolean,
    MastrColumnType.CATALOG_VALUE: Integer,  # TODO: Think about how to deal with mapping catalog values
}

# Primary-key column(s) per MaStR table name. Tables not listed here are
# rejected with a KeyError when building a model.
MASTR_TABLE_NAME_TO_PRIMARY_KEY_COLUMNS = {
    "AnlagenEegBiomasse": {"EegMastrNummer"},
    "AnlagenEegGeothermieGrubengasDruckentspannung": {"EegMastrNummer"},
    "AnlagenEegSolar": {"EegMastrNummer"},
    "AnlagenEegSpeicher": {"EegMastrNummer"},
    "AnlagenEegWasser": {"EegMastrNummer"},
    "AnlagenEegWind": {"EegMastrNummer"},
    "AnlagenGasSpeicher": {"MastrNummer"},
    "AnlagenKwk": {"KwkMastrNummer"},
    "AnlagenStromSpeicher": {"MastrNummer"},
    "Bilanzierungsgebiete": {"Id"},
    "EinheitenAenderungNetzbetreiberzuordnungen": {"EinheitMastrNummer"},  # TODO: Is not a primary key on its own!
    "EinheitenBiomasse": {"EinheitMastrNummer"},
    "EinheitenGasErzeuger": {"EinheitMastrNummer"},
    "EinheitenGasSpeicher": {"EinheitMastrNummer"},
    "EinheitenGasverbraucher": {"EinheitMastrNummer"},
    "EinheitenGenehmigung": {"GenMastrNummer"},
    "EinheitenGeothermieGrubengasDruckentspannung": {"EinheitMastrNummer"},
    "EinheitenKernkraft": {"EinheitMastrNummer"},
    "EinheitenSolar": {"EinheitMastrNummer"},
    "EinheitenStromSpeicher": {"EinheitMastrNummer"},
    "EinheitenStromVerbraucher": {"EinheitMastrNummer"},
    "Einheitentypen": {"Id"},
    "EinheitenVerbrennung": {"EinheitMastrNummer"},
    "EinheitenWasser": {"EinheitMastrNummer"},
    "EinheitenWind": {"EinheitMastrNummer"},
    "Ertuechtigungen": {"Id"},
    "GeloeschteUndDeaktivierteEinheiten": {"EinheitMastrNummer"},
    "GeloeschteUndDeaktivierteMarktakteure": {"MarktakteurMastrNummer"},
    "Katalogkategorien": {"Id"},
    "Katalogwerte": {"Id"},
    "Lokationen": {"MastrNummer"},
    "Lokationstypen": {"Id"},
    "MarktakteureUndRollen": {"MarktakteurMastrNummer"},
    "Marktakteure": {"MastrNummer"},
    "Marktfunktionen": {"Id"},
    "Marktrollen": {"Id"},
    "Netzanschlusspunkte": {"NetzanschlusspunktMastrNummer"},
    "Netze": {"MastrNummer"},
}


class Base(DeclarativeBase):
    """Declarative base shared by all dynamically generated MaStR models."""


class ParentAllTables(object):
    """Mixin adding the bookkeeping columns present on every MaStR table."""

    DatenQuelle: Mapped[str] = mapped_column(String)
    DatumDownload: Mapped[datetime.date] = mapped_column(Date)


def make_sqlalchemy_model_from_mastr_table_description(
    table_description: MastrTableDescription,
    base: type = Base,
    mixins: tuple[type, ...] = (ParentAllTables,),
):
    """Create a SQLAlchemy model class for the given table description.

    Parameters
    ----------
    table_description : MastrTableDescription
        Table layout parsed from the official MaStR XSD files.
    base : type, optional
        Declarative base the generated class derives from.
    mixins : tuple of type, optional
        Extra mixin classes added to the generated class.

    Raises
    ------
    KeyError
        If no primary-key columns are registered for the table name, or a
        column has a type without a SQLAlchemy mapping.
    """
    return _make_sqlalchemy_model(
        class_name=table_description.instance_name,
        table_name=table_description.table_name,
        column_name_to_column_type={
            column.name: MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE[column.type]
            for column in table_description.columns
        },
        primary_key_columns=MASTR_TABLE_NAME_TO_PRIMARY_KEY_COLUMNS[table_description.table_name],
        # Bug fix: the caller-supplied base and mixins were previously ignored
        # in favour of the hard-coded defaults.
        base=base,
        mixins=mixins,
    )


def _make_sqlalchemy_model(
    class_name: str,
    table_name: str,
    column_name_to_column_type: dict[str, Any],
    primary_key_columns: set[str],
    base: type,
    mixins: tuple[type, ...] = tuple(),
):
    """Dynamically build a declarative model class via ``type()``."""
    namespace = {
        "__tablename__": table_name,
        "__annotations__": {},
    }

    for column_name, column_type in column_name_to_column_type.items():
        # Primary-key columns are implicitly NOT NULL; all others stay nullable.
        kwargs = {"primary_key": True} if column_name in primary_key_columns else {"nullable": True}
        namespace[column_name] = mapped_column(column_type, **kwargs)

    return type(class_name, (base,) + mixins, namespace)


if __name__ == "__main__":
    import os
    import sys
    import traceback

    from sqlalchemy import create_engine

    print("Parsing XSD files")
    for xsd_path in sys.argv[1:]:
        schema = xmlschema.XMLSchema(xsd_path)
        try:
            table_description = MastrTableDescription.from_xml_schema(schema)
        except ValueError:
            traceback.print_exc()
            print("Failed for ", xsd_path)
            sys.exit(1)

        model = make_sqlalchemy_model_from_mastr_table_description(
            table_description=table_description,
        )

    db_path = os.path.join(os.getcwd(), "test.db")
    print(f"Creating SQLite database at {db_path}")
    engine = create_engine(f"sqlite:///{db_path}")
    Base.metadata.create_all(engine)
import re
from dataclasses import dataclass
from enum import Enum, auto
from typing import TYPE_CHECKING, Union

if TYPE_CHECKING:
    # xmlschema is only needed for type annotations in this module; keeping it
    # out of the runtime path lets the module be imported without the dependency.
    import xmlschema
    from xmlschema.validators.simple_types import XsdAtomicBuiltin, XsdAtomicRestriction

_XML_SCHEMA_PREFIX = "{http://www.w3.org/2001/XMLSchema}"


def normalize_column_name(original_mastr_column_name: str) -> str:
    """Normalize a raw MaStR column name ("MaStR" -> "Mastr")."""
    return original_mastr_column_name.replace("MaStR", "Mastr")


class MastrColumnType(Enum):
    """Abstract column types occurring in the MaStR XSD files."""

    STRING = auto()
    INTEGER = auto()
    FLOAT = auto()
    DATE = auto()
    DATETIME = auto()
    BOOLEAN = auto()
    CATALOG_VALUE = auto()

    @classmethod
    def from_xsd_type(
        cls, xsd_type: Union["XsdAtomicBuiltin", "XsdAtomicRestriction"]
    ) -> "MastrColumnType":
        """Map an XSD simple type to a ``MastrColumnType``.

        Fix: the return annotation previously (and wrongly) named
        ``MastrColumnDescription``.

        Raises
        ------
        ValueError
            If no mapping for the XSD type is known.
        """
        xsd_type_to_mastr_column_type = {
            f"{_XML_SCHEMA_PREFIX}string": cls.STRING,
            f"{_XML_SCHEMA_PREFIX}decimal": cls.INTEGER,
            f"{_XML_SCHEMA_PREFIX}int": cls.INTEGER,
            f"{_XML_SCHEMA_PREFIX}short": cls.INTEGER,
            f"{_XML_SCHEMA_PREFIX}byte": cls.INTEGER,
            f"{_XML_SCHEMA_PREFIX}float": cls.FLOAT,
            f"{_XML_SCHEMA_PREFIX}date": cls.DATE,
            f"{_XML_SCHEMA_PREFIX}dateTime": cls.DATETIME,
        }
        if xsd_type.is_restriction():
            if xsd_type.enumeration:
                # A 0/1 enumeration encodes a boolean; any other enumeration
                # refers to an entry of the MaStR catalog tables.
                if set(xsd_type.enumeration) == {0, 1}:
                    return cls.BOOLEAN
                return cls.CATALOG_VALUE
            # Ertuechtigungen.xsd has some normal types defined as restrictions
            # for some reason. We cope with that by extracting the primitive
            # type it's restricted to.
            inner_xsd_type = xsd_type.primitive_type
            if mastr_column_type := xsd_type_to_mastr_column_type.get(inner_xsd_type.name):
                return mastr_column_type

        if mastr_column_type := xsd_type_to_mastr_column_type.get(xsd_type.name):
            return mastr_column_type

        raise ValueError(f"Could not determine MastrColumnType from XSD type {xsd_type!r}")


@dataclass
class MastrColumnDescription:
    """A single column of a MaStR table."""

    name: str
    type: MastrColumnType

    @classmethod
    def from_xsd_element(cls, xsd_element: "xmlschema.XsdElement") -> "MastrColumnDescription":
        """Build a column description from an XSD element declaration."""
        return cls(
            name=normalize_column_name(xsd_element.name),
            type=MastrColumnType.from_xsd_type(xsd_element.type),
        )


@dataclass
class MastrTableDescription:
    """Description of one MaStR table parsed from its XSD schema."""

    table_name: str
    instance_name: str
    # Fix: variadic tuple; the annotation previously described a 1-tuple.
    columns: tuple

    @classmethod
    def from_xml_schema(cls, schema: "xmlschema.XMLSchema") -> "MastrTableDescription":
        """Extract the table layout from a parsed XSD schema.

        Raises
        ------
        ValueError
            If the schema does not have exactly one root element or the
            column elements cannot be located.
        """
        if len(schema.root_elements) != 1:
            raise ValueError(
                "XML schema must have exactly one root element,"
                f" but has {len(schema.root_elements)} ({schema.root_elements!r})"
            )
        root = schema.root_elements[0]

        try:
            main_element = root.content.content[0]
            column_elements = main_element.content.content
        except (AttributeError, IndexError, TypeError) as e:
            raise ValueError(f"Could not find columns in XML schema {schema!r}") from e

        columns = tuple(
            MastrColumnDescription.from_xsd_element(element)
            for element in column_elements
        )

        return cls(
            table_name=root.name,
            instance_name=main_element.name,
            columns=columns,
        )
import os
from collections.abc import Mapping
from pathlib import Path
from typing import Literal, Optional, Type, Union

from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import Engine

# import xml dependencies
from open_mastr.xml_download.utils_download_bulk import (
    download_documentation,
    download_xml_Mastr,
    delete_xml_files_not_from_given_date,
)
from open_mastr.xml_download.utils_write_to_database import (
    write_mastr_xml_to_database,
)

from open_mastr.utils.helpers import (
    validate_parameter_format_for_download_method,
    validate_parameter_format_for_mastr_init,
    transform_data_parameter,
    parse_date_string,
    transform_date_parameter,
    delete_zip_file_if_corrupted,
    create_database_engine,
)
from open_mastr.utils.config import (
    get_project_home_dir,
    get_output_dir,
    setup_logger,
)
from open_mastr.utils.sqlalchemy_tables import (
    MastrBase,
    make_sqlalchemy_model_from_mastr_table_description,
)
from open_mastr.utils.xsd_tables import (
    MastrTableDescription,
    read_mastr_table_descriptions_from_xsd,
)

# setup logger
log = setup_logger()


class Mastr:
    """
    `Mastr` is used to download the MaStR database and keep it up-to-date.

    An SQL database is used to mirror the MaStR database. It is filled by
    downloading and parsing the MaStR via bulk download.

    !!! example

        ```python
        from open_mastr import Mastr

        db = Mastr()
        db.download()
        ```

    Parameters
    ----------
    engine : {'sqlite', sqlalchemy.engine.Engine}, optional
        Defines the engine of the database where the MaStR is mirrored to.
        Default is 'sqlite'.
    mastr_table_to_db_table_name : dict, optional
        Mapping of MaStR table names to database table names.
        Currently unused.  # TODO(review): wire this up or drop it.
    output_dir : str or Path, optional
        Directory for downloaded and generated data; defaults to the
        configured output directory.
    home_dir : str or Path, optional
        Project home directory; defaults to the configured home directory.
    """

    def __init__(
        self,
        engine: Union[Engine, Literal["sqlite"]] = "sqlite",
        mastr_table_to_db_table_name: Optional[dict] = None,
        output_dir: Optional[Union[str, Path]] = None,
        home_dir: Optional[Union[str, Path]] = None,
    ) -> None:
        validate_parameter_format_for_mastr_init(engine)

        self.output_dir = output_dir or get_output_dir()
        self.home_directory = home_dir or get_project_home_dir()

        self._sqlite_folder_path = os.path.join(self.output_dir, "data", "sqlite")
        os.makedirs(self._sqlite_folder_path, exist_ok=True)

        self.engine = create_database_engine(engine, self._sqlite_folder_path)

        log.info(
            "\n==================================================\n"
            "---------> open-MaStR started <---------\n"
            "==================================================\n"
            f"Data will be written to the following database: {self.engine.url}\n"
            "If you run into problems, try to "
            "delete the database and update the package by running "
            "'pip install --upgrade open-mastr'\n"
        )

    def generate_data_model(
        self, data: Optional[list] = None
    ) -> dict:
        """Download the MaStR documentation and build one SQLAlchemy model
        per requested table from the bundled XSD files.

        Returns a dict mapping each ``MastrTableDescription`` to its
        generated model class.
        """
        docs_folder_path = os.path.join(self.output_dir, "data", "docs_download")
        os.makedirs(docs_folder_path, exist_ok=True)
        zipped_docs_file_path = os.path.join(
            # Bug fix: previously referenced the undefined name
            # ``xml_folder_path``.
            docs_folder_path,
            "Dokumentation MaStR Gesamtdatenexport.zip"
        )
        download_documentation(zipped_docs_file_path)

        mastr_table_descriptions = read_mastr_table_descriptions_from_xsd(
            zipped_docs_file_path=zipped_docs_file_path, data=data
        )
        mastr_table_to_db_model = {}
        for mastr_table_description in mastr_table_descriptions:
            mastr_table_to_db_model[mastr_table_description] = (
                make_sqlalchemy_model_from_mastr_table_description(mastr_table_description)
            )

        # Bug fix: previously returned the last loop variable instead of the
        # accumulated mapping.
        return mastr_table_to_db_model

    def download(
        self,
        method="bulk",
        data=None,
        date=None,
        bulk_cleansing=True,
        keep_old_downloads: bool = False,
        mastr_table_to_db_model: Optional[Mapping] = None,
        **kwargs,
    ) -> None:
        """
        Downloads the MaStR registry and writes it to a local database.

        Parameters
        ----------
        method : 'bulk', optional
            Only "bulk" is a valid value; the SOAP API download is deprecated.
        data : str or list or None, optional
            Which tables to download (e.g. "wind", ["wind", "solar"]);
            ``None`` downloads everything.
        date : None or datetime.datetime or str, optional
            "today" (default when ``None``) downloads the latest export;
            a date string like "20230101" reuses a matching local file.
        bulk_cleansing : bool, optional
            If True (recommended), catalog IDs are replaced by their
            original entries after the download.
        keep_old_downloads : bool, optional
            If True, previously downloaded MaStR zip files are kept.
        mastr_table_to_db_model : Mapping, optional
            Pre-generated table-description-to-model mapping; generated via
            ``generate_data_model`` when omitted.
        """
        if method == "API":
            log.warning(
                "Downloading the whole registry via the MaStR SOAP-API is deprecated. "
                "You can still use the open_mastr.soap_api.download.MaStRAPI class "
                "to construct single calls."
            )
            log.warning("Attention: method='API' changed to method='bulk'.")
            method = "bulk"

        if not mastr_table_to_db_model:
            # Bug fix: this was a bare ``generate_data_model()`` call, which is
            # a NameError for an instance method.
            mastr_table_to_db_model = self.generate_data_model(data=data)
            log.info("Ensuring database tables for MaStR are present")
            # Iterate the model classes, not the mapping keys.
            for db_model in mastr_table_to_db_model.values():
                db_model.__table__.drop(self.engine, checkfirst=True)
                db_model.__table__.create(self.engine)

        validate_parameter_format_for_download_method(
            method=method,
            data=data,
            date=date,
            bulk_cleansing=bulk_cleansing,
            **kwargs,
        )
        data = transform_data_parameter(data, **kwargs)
        date = transform_date_parameter(self, date, **kwargs)

        # Find the name of the zipped xml folder
        bulk_download_date = parse_date_string(date)
        xml_folder_path = os.path.join(self.output_dir, "data", "xml_download")
        os.makedirs(xml_folder_path, exist_ok=True)
        zipped_xml_file_path = os.path.join(
            xml_folder_path,
            f"Gesamtdatenexport_{bulk_download_date}.zip",
        )

        delete_zip_file_if_corrupted(zipped_xml_file_path)
        if not keep_old_downloads:
            delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path)

        # NOTE(review): utils_download_bulk expects a datetime.date here —
        # confirm parse_date_string's return type matches.
        download_xml_Mastr(zipped_xml_file_path, bulk_download_date, data, xml_folder_path)

        # Bug fix: the corrupted-zip check / stale-file deletion and this hint
        # were duplicated after the download; the duplicate block is removed.
        log.info(
            "\nWould you like to speed up the creation of your MaStR database?\n"
            "Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True "
            "or configure your own number of processes via os.environ['NUMBER_OF_PROCESSES'] = your_number\n"
        )

        write_mastr_xml_to_database(
            engine=self.engine,
            zipped_xml_file_path=zipped_xml_file_path,
            data=data,
            bulk_cleansing=bulk_cleansing,
            bulk_download_date=bulk_download_date,
        )

    def to_csv(
        self, tables: list = None, chunksize: int = 500000, limit: int = None
    ) -> None:
        """Export database tables to CSV. Not implemented yet."""
        pass
        # TODO: Think about this.
import os
from pathlib import Path
from typing import Optional, Union
from zipfile import ZipFile, ZipInfo


def read_mastr_table_descriptions_from_xsd(
    zipped_docs_file_path: Union[Path, str], data: Optional[list] = None
) -> set:
    """Parse table descriptions from the XSD files bundled in the zipped
    MaStR documentation download.

    Parameters
    ----------
    zipped_docs_file_path : Path or str
        Path to the downloaded documentation ZIP, which contains a nested
        ``xsd.zip`` archive.
    data : list of str, optional
        Restricts parsing to the given data categories; ``None`` means all.

    Returns
    -------
    set of MastrTableDescription

    Raises
    ------
    RuntimeError
        If no ``xsd.zip`` entry exists in the documentation archive.
    """
    # Imported lazily so this module stays importable without the optional
    # xmlschema dependency installed.
    from xmlschema import XMLSchema
    from open_mastr.utils.helpers import data_to_include_tables

    include_tables = set(data_to_include_tables(data, mapping="write_xml"))

    mastr_table_descriptions = set()
    with ZipFile(zipped_docs_file_path, "r") as docs_z:
        # Bug fix: was calling the undefined name ``_find_xsd_zip_file``.
        xsd_zip_entry = _find_xsd_zip_entry(docs_z)
        with ZipFile(docs_z.open(xsd_zip_entry)) as xsd_z:
            # Bug fix: ZipFile objects are not iterable; iterate infolist().
            for entry in xsd_z.infolist():
                if entry.is_dir() or not entry.filename.endswith(".xsd"):
                    continue

                normalized_name = os.path.basename(entry.filename).removesuffix(".xsd").lower()
                if normalized_name in include_tables:
                    with xsd_z.open(entry) as xsd_file:
                        mastr_table_description = MastrTableDescription.from_xml_schema(
                            XMLSchema(xsd_file)
                        )
                        mastr_table_descriptions.add(mastr_table_description)

    return mastr_table_descriptions


def _find_xsd_zip_entry(docs_zip_file: ZipFile) -> ZipInfo:
    """Return the ZipInfo of the nested ``xsd.zip`` archive, or raise
    RuntimeError if it is missing."""
    desired_filename = "xsd.zip"
    for entry in docs_zip_file.filelist:
        if os.path.basename(entry.filename) == desired_filename:
            return entry
    raise RuntimeError(
        f"Did not find XSD files in the form of {desired_filename!r} in the documentation"
        f" ZIP file {docs_zip_file.filename!r}"
    )
@@ -116,7 +118,7 @@ def gen_url(when: time.struct_time = time.localtime(), use_version="current") -> def download_xml_Mastr( - save_path: str, bulk_date_string: str, bulk_data_list: list, xml_folder_path: str + save_path: str, bulk_date: datetime.date, bulk_data_list: list, xml_folder_path: str ) -> None: """Downloads the zipped MaStR. @@ -124,7 +126,7 @@ def download_xml_Mastr( ----------- save_path: str Full file path where the downloaded MaStR zip file will be saved. - bulk_date_string: str + bulk_date_string: datetime.date Date for which the file should be downloaded. bulk_data_list: list List of tables/technologis to be downloaded. @@ -134,9 +136,7 @@ def download_xml_Mastr( log.info("Starting the Download from marktstammdatenregister.de.") - # TODO this should take bulk_date_string - now = time.localtime() - url = gen_url(now) + url = gen_url(bulk_date) time_a = time.perf_counter() r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) @@ -144,19 +144,17 @@ def download_xml_Mastr( log.warning( "Download file was not found. Assuming that the new file was not published yet and retrying with yesterday." ) - now = time.localtime( - time.mktime(now) - (24 * 60 * 60) - ) # subtract 1 day from the date - url = gen_url(now) + bulk_date -= datetime.timedelta(days=1) + url = gen_url(bulk_date) r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) if r.status_code == 404: - url = gen_url(now, use_version="before") # Use lower MaStR Version + url = gen_url(bulk_date, use_version="before") # Use lower MaStR Version log.warning( f"Download file was not found. Assuming that the version of MaStR has changed and retrying with download link: {url}" ) r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) if r.status_code == 404: - url = gen_url(now, use_version="after") # Use higher MaStR Version + url = gen_url(bulk_date, use_version="after") # Use higher MaStR Version log.warning( f"Download file was not found. 
def download_documentation(
    save_path: str, xml_folder_path: str = None
) -> None:
    """Downloads the zipped MaStR documentation ("Gesamtdatenexport" docs).

    Parameters
    ----------
    save_path : str
        Full file path where the downloaded documentation ZIP will be saved.
    xml_folder_path : str, optional
        Folder path used only for the final log message. Defaults to the
        directory of ``save_path`` so callers may pass the save path alone.
    """
    if xml_folder_path is None:
        xml_folder_path = os.path.dirname(save_path)

    log.info("Starting the MaStR documentation download from marktstammdatenregister.de.")
    url = "https://www.marktstammdatenregister.de/MaStRHilfe/files/gesamtdatenexport/Dokumentation%20MaStR%20Gesamtdatenexport.zip"

    time_a = time.perf_counter()
    r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT})

    r.raise_for_status()

    chunk_size = 1024 * 1024
    content_length = r.headers.get("Content-Length")
    # Bug fix: the Content-Length header is a string (and may be absent);
    # dividing it directly raised a TypeError. Without it, the total number
    # of progress steps is simply unknown.
    expected_steps = math.ceil(int(content_length) / chunk_size) if content_length else None
    with (
        open(save_path, "wb") as zfile,
        tqdm(desc=save_path, total=expected_steps) as bar,
    ):
        for chunk in r.iter_content(chunk_size=chunk_size):
            if chunk:
                zfile.write(chunk)
                zfile.flush()
                bar.update()

    time_b = time.perf_counter()
    log.info(
        f"MaStR documentation download is finished. It took {round(time_b - time_a)} seconds."
    )
    log.info(f"MaStR was successfully downloaded to {xml_folder_path}.")
+DeclarativeBase_T = TypeVar("DeclarativeBase_T", bound=DeclarativeBase) + class Mastr: """ @@ -105,12 +113,17 @@ def __init__( ) def generate_data_model( - self, data: Optional[list[str]] = None - ) -> dict[MastrTableDescription, Model]: + self, + data: Optional[list[str]] = None, + catalog_value_as_str: bool = True, + base: Type[DeclarativeBase_T] = MastrBase, + ) -> dict[str, Type[DeclarativeBase_T]]: + data = transform_data_parameter(data) + docs_folder_path = os.path.join(self.output_dir, "data", "docs_download") os.makedirs(docs_folder_path, exist_ok=True) zipped_docs_file_path = os.path.join( - xml_folder_path, + docs_folder_path, "Dokumentation MaStR Gesamtdatenexport.zip" ) download_documentation(zipped_docs_file_path) @@ -118,12 +131,16 @@ def generate_data_model( mastr_table_descriptions = read_mastr_table_descriptions_from_xsd( zipped_docs_file_path=zipped_docs_file_path, data=data ) - mastr_table_to_db_model: dict[MastrTableDescription, MastrBase] = {} + mastr_table_to_db_model: dict[str, DeclarativeBase_T] = {} for mastr_table_description in mastr_table_descriptions: - sqlalchemy_model = make_sqlalchemy_model_from_mastr_table_description(mastr_table_description) - mastr_table_to_db_model[mastr_table_description] = sqlalchemy_model + sqlalchemy_model = make_sqlalchemy_model_from_mastr_table_description( + table_description=mastr_table_description, + catalog_value_as_str=catalog_value_as_str, + base=base + ) + mastr_table_to_db_model[mastr_table_description.table_name] = sqlalchemy_model - return mastr_table_description + return mastr_table_to_db_model def download( self, @@ -132,7 +149,7 @@ def download( date=None, bulk_cleansing=True, keep_old_downloads: bool = False, - mastr_table_to_db_model: Optional[Mapping[MastrTableDescription, Model]] = None, + mastr_table_to_db_model: Optional[Mapping[str, Type[DeclarativeBase_T]]] = None, **kwargs, ) -> None: """ @@ -202,7 +219,11 @@ def download( method = "bulk" if not mastr_table_to_db_model: - 
mastr_table_to_db_model = generate_data_model() + mastr_table_to_db_model = self.generate_data_model(data=data, catalog_value_as_str=bulk_cleansing) + log.info("Ensuring database tables for MaStR are present") + for db_model in mastr_table_to_db_model: + db_model.__table__.drop(self.engine, checkfirst=True) + db_model.__table__.create(self.engine) validate_parameter_format_for_download_method( method=method, @@ -221,14 +242,14 @@ def download( os.makedirs(xml_folder_path, exist_ok=True) zipped_xml_file_path = os.path.join( xml_folder_path, - f"Gesamtdatenexport_{bulk_download_date}.zip", + f"Gesamtdatenexport_{bulk_download_date.strftime('%Y%m%d')}.zip", ) delete_zip_file_if_corrupted(zipped_xml_file_path) if not keep_old_downloads: delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) - download_xml_Mastr(zipped_xml_file_path, date, data, xml_folder_path) + download_xml_Mastr(zipped_xml_file_path, bulk_download_date, data, xml_folder_path) log.info( "\nWould you like to speed up the creation of your MaStR database?\n" @@ -239,7 +260,6 @@ def download( delete_zip_file_if_corrupted(zipped_xml_file_path) delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) - print( "\nWould you like to speed up the creation of your MaStR database?\n" "Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True " diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index 1e8b1365..544dc879 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -1,9 +1,11 @@ import os import json from contextlib import contextmanager -from datetime import date +import datetime from warnings import warn +from typing import Literal, Union from zipfile import BadZipfile, ZipFile +from zoneinfo import ZoneInfo import dateutil import sqlalchemy @@ -33,6 +35,8 @@ TRANSLATIONS, ) +MASTR_TIMEZONE = ZoneInfo("Europe/Berlin") + def chunks(lst, n): """Yield successive n-sized chunks 
from lst. @@ -58,11 +62,14 @@ def create_database_engine(engine, sqlite_db_path) -> sqlalchemy.engine.Engine: return engine -def parse_date_string(bulk_date_string: str) -> str: +def parse_date_string(bulk_date_string: str) -> datetime.date: if bulk_date_string == "today": - return date.today().strftime("%Y%m%d") + dt = datetime.datetime.now(tz=MASTR_TIMEZONE) else: - return parse(bulk_date_string).strftime("%Y%m%d") + dt = parse(bulk_date_string) + if dt.tzinfo: + dt = dt.astimezone(MASTR_TIMEZONE) + return dt.date() def validate_parameter_format_for_mastr_init(engine) -> None: @@ -158,7 +165,7 @@ def transform_data_parameter(data, **kwargs): return data -def transform_date_parameter(self, date, **kwargs): +def transform_date_parameter(self, date: Union[datetime.date, Literal["today"]], **kwargs) -> Union[datetime.date, Literal["today"]]: date = kwargs.get("bulk_date", date) date = "today" if date is None else date if date == "existing": diff --git a/open_mastr/utils/sqlalchemy_tables.py b/open_mastr/utils/sqlalchemy_tables.py index 0a6265b3..14a9f337 100644 --- a/open_mastr/utils/sqlalchemy_tables.py +++ b/open_mastr/utils/sqlalchemy_tables.py @@ -1,6 +1,6 @@ import datetime from dataclasses import dataclass -from typing import Any, Union +from typing import Any, Union, Type, TypeVar from sqlalchemy import Column, Integer, String, Float, Boolean, Date, DateTime from sqlalchemy.orm import DeclarativeBase, mapped_column, Mapped @@ -9,16 +9,6 @@ from open_mastr.utils.xsd_tables import MastrColumnType, MastrTableDescription -MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE = { - MastrColumnType.STRING: String, - MastrColumnType.INTEGER: Integer, - MastrColumnType.FLOAT: Float, - MastrColumnType.DATE: Date, - MastrColumnType.DATETIME: DateTime(timezone=True), - MastrColumnType.BOOLEAN: Boolean, - MastrColumnType.CATALOG_VALUE: Integer, # TODO: Think about how to deal with mapping catalog values -} - # Potential hierarchy # Id -> MastrNummer -> EinheitMastrNummer # -> 
EegMastrNummer -> KwkMastrNummer -> GenMastrNummer @@ -74,20 +64,27 @@ class ParentAllTables(object): DatumDownload: Mapped[datetime.date] = mapped_column(Date) +DeclarativeBase_T = TypeVar("DeclarativeBase_T", bound=DeclarativeBase) + + def make_sqlalchemy_model_from_mastr_table_description( table_description: MastrTableDescription, - base: DeclarativeBase = MastrBase, + catalog_value_as_str: bool, + base: Type[DeclarativeBase_T] = MastrBase, mixins: tuple[type, ...] = (ParentAllTables,), -): +) -> Type[DeclarativeBase_T]: return _make_sqlalchemy_model( class_name=table_description.instance_name, table_name=table_description.table_name, column_name_to_column_type={ - column.name: MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE[column.type] + column.name: _get_sqlalchemy_type_for_mastr_column_type( + mastr_column_type=column.type, + catalog_value_as_str=catalog_value_as_str, + ) for column in table_description.columns }, primary_key_columns=MASTR_TABLE_NAME_TO_PRIMARY_KEY_COLUMNS[table_description.table_name], - base=MastrBase, + base=base, mixins=(ParentAllTables,) ) @@ -97,9 +94,9 @@ def _make_sqlalchemy_model( table_name: str, column_name_to_column_type: dict[str, Any], primary_key_columns: set[str], - base: DeclarativeBase, + base: Type[DeclarativeBase_T], mixins: tuple[type, ...] = tuple(), -): +) -> Type[DeclarativeBase_T]: # TODO: Is there a way to say that the returned model is a sub-type of DeclarativeBase_T? 
namespace = { "__tablename__": table_name, "__annotations__": {}, @@ -113,6 +110,36 @@ def _make_sqlalchemy_model( return type(class_name, bases, namespace) +_MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE = { + MastrColumnType.STRING: String, + MastrColumnType.INTEGER: Integer, + MastrColumnType.FLOAT: Float, + MastrColumnType.DATE: Date, + MastrColumnType.DATETIME: DateTime(timezone=True), + MastrColumnType.BOOLEAN: Boolean, +} + + +# We're creating special column types for the catalog columns here so that +# we can identify the catalog columns later when processing the XML files. +class CatalogInteger(Integer): + pass + + +class CatalogString(String): + pass + + +def _get_sqlalchemy_type_for_mastr_column_type( + mastr_column_type: MastrColumnType, catalog_value_as_str: bool, +) -> Union[Type[String], Type[Integer], Type[Float], Type[Date], Type[DateTime], Type[Boolean]]: + if mastr_column_type is MastrColumnType.CATALOG_VALUE: + return CatalogString if catalog_value_as_str else CatalogInteger + return _MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE[mastr_column_type] + + + +# TODO: Remove this or make it useful for outsiders. 
if __name__ == "__main__": import os import sys diff --git a/open_mastr/utils/xsd_tables.py b/open_mastr/utils/xsd_tables.py index b3764f80..a97ed158 100644 --- a/open_mastr/utils/xsd_tables.py +++ b/open_mastr/utils/xsd_tables.py @@ -1,11 +1,15 @@ +import os import re from enum import auto, Enum from dataclasses import dataclass -from typing import Union - +from pathlib import Path +from typing import Optional, Union +from zipfile import ZipFile, ZipInfo import xmlschema from xmlschema.validators.simple_types import XsdAtomicBuiltin, XsdAtomicRestriction +from open_mastr.utils.helpers import data_to_include_tables + _XML_SCHEMA_PREFIX = "{http://www.w3.org/2001/XMLSchema}" @@ -106,16 +110,16 @@ def read_mastr_table_descriptions_from_xsd( mastr_table_descriptions = set() with ZipFile(zipped_docs_file_path, "r") as docs_z: - xsd_zip_entry = _find_xsd_zip_file(docs_z) + xsd_zip_entry = _find_xsd_zip_entry(docs_z) with ZipFile(docs_z.open(xsd_zip_entry)) as xsd_z: - for entry in xsd_z: + for entry in xsd_z.filelist: if entry.is_dir() or not entry.filename.endswith(".xsd"): continue normalized_name = os.path.basename(entry.filename).removesuffix(".xsd").lower() if normalized_name in include_tables: with xsd_z.open(entry) as xsd_file: - mastr_table_description = MastrTableDescription.from_xml_schema(XMLSchema(xsd_file)) + mastr_table_description = MastrTableDescription.from_xml_schema(xmlschema.XMLSchema(xsd_file)) mastr_table_descriptions.add(mastr_table_description) return mastr_table_descriptions diff --git a/open_mastr/xml_download/colums_to_replace.py b/open_mastr/xml_download/colums_to_replace.py index 421ac44c..e7608e8a 100644 --- a/open_mastr/xml_download/colums_to_replace.py +++ b/open_mastr/xml_download/colums_to_replace.py @@ -1,6 +1,5 @@ -# system catalog is the mapping for the entries within the two columns -# Marktfunktionen und Lokationstyp (entry 1 is mapped to Stromnetzbetreiber -# in the column Marktfunktionen) +# system catalog is the mapping for the 
entries within the columns +# Marktfunktion, Lokationtyp and Einheittyp # The values for the system catalog can be found in the pdf of the bulk download # documentation: https://www.marktstammdatenregister.de/MaStR/Datendownload diff --git a/open_mastr/xml_download/utils_cleansing_bulk.py b/open_mastr/xml_download/utils_cleansing_bulk.py index b48a50f1..8cc9428b 100644 --- a/open_mastr/xml_download/utils_cleansing_bulk.py +++ b/open_mastr/xml_download/utils_cleansing_bulk.py @@ -1,23 +1,27 @@ import pandas as pd import numpy as np +from collections.abc import Collection +from zipfile import ZipFile + from open_mastr.xml_download.colums_to_replace import ( system_catalog, - columns_replace_list, ) -from zipfile import ZipFile -def cleanse_bulk_data(df: pd.DataFrame, zipped_xml_file_path: str) -> pd.DataFrame: - df = replace_ids_with_names(df, system_catalog) - # Katalogeintraege: int -> string value +def cleanse_bulk_data( + df: pd.DataFrame, + catalog_columns: Collection[str], + zipped_xml_file_path: str, +) -> pd.DataFrame: + df = replace_system_catalog_ids(df, system_catalog) df = replace_mastr_katalogeintraege( - zipped_xml_file_path=zipped_xml_file_path, df=df + zipped_xml_file_path=zipped_xml_file_path, df=df, catalog_columns=catalog_columns, ) return df -def replace_ids_with_names(df: pd.DataFrame, system_catalog: dict) -> pd.DataFrame: - """Replaces ids with names according to the system catalog. This is +def replace_system_catalog_ids(df: pd.DataFrame, system_catalog: dict[int, str]) -> pd.DataFrame: + """Replaces IDs with names according to the system catalog. 
This is necessary since the data from the bulk download encodes columns with IDs instead of the actual values.""" for column_name, name_mapping_dictionary in system_catalog.items(): @@ -29,14 +33,16 @@ def replace_ids_with_names(df: pd.DataFrame, system_catalog: dict) -> pd.DataFra def replace_mastr_katalogeintraege( zipped_xml_file_path: str, df: pd.DataFrame, + catalog_columns: Collection[str], ) -> pd.DataFrame: """Replaces the IDs from the mastr database by its mapped string values from - the table katalogwerte""" + the table Katalogwerte""" + # TODO: Create Katalogwerte dict once for whole download, not once per processed file. katalogwerte = create_katalogwerte_from_bulk_download(zipped_xml_file_path) for column_name in df.columns: - if column_name in columns_replace_list: + if column_name in catalog_columns: if df[column_name].dtype == "O": - # Handle comma seperated strings from catalog values + # Handle comma-separated strings from catalog values df[column_name] = ( df[column_name] .str.split(",", expand=True) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index d92fefc4..e01507b1 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -112,7 +112,7 @@ def gen_url(when: datetime.date, use_version="current") -> str: Defaults to "current". """ version = gen_version(when, use_version) - date = time.strftime("%Y%m%d", when) + date = when.strftime("%Y%m%d") return f"https://download.marktstammdatenregister.de/Gesamtdatenexport_{date}_{version}.zip" @@ -340,9 +340,7 @@ def full_download_without_unzip_http( bar.set_postfix_str(s="") -def download_documentation( - save_path: str, xml_folder_path: str -) -> None: +def download_documentation(save_path: str) -> None: """Downloads the zipped MaStR. 
Parameters @@ -356,26 +354,28 @@ def download_documentation( url = "https://www.marktstammdatenregister.de/MaStRHilfe/files/gesamtdatenexport/Dokumentation%20MaStR%20Gesamtdatenexport.zip" time_a = time.perf_counter() - r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) + r = requests.get(url, headers={"User-Agent": USER_AGENT}) r.raise_for_status() - - chunk_size = 1024 * 1024 - content_length = r.headers.get("Content-Length") - expected_steps = math.ceil(content_length / chunk_size) - with ( - open(save_path, "wb") as zfile, - tqdm(desc=save_path, total=expected_steps) as bar, - ): - for chunk in r.iter_content(chunk_size=chunk_size): - if chunk: - zfile.write(chunk) - zfile.flush() - bar.update() + with open(save_path, "wb") as zfile: + zfile.write(r.content) + + #chunk_size = 1024 * 1024 + #content_length = r.headers.get("Content-Length") + #expected_steps = math.ceil(content_length / chunk_size) + #with ( + # open(save_path, "wb") as zfile, + # tqdm(desc=save_path, total=expected_steps) as bar, + #): + # for chunk in r.iter_content(chunk_size=chunk_size): + # if chunk: + # zfile.write(chunk) + # zfile.flush() + # bar.update() time_b = time.perf_counter() log.info( f"MaStR documentation download is finished. It took {round(time_b - time_a)} seconds." 
) - log.info(f"MaStR was successfully downloaded to {xml_folder_path}.") + log.info(f"MaStR was successfully downloaded to {save_path!r}.") diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index e71abc18..13e9f804 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -1,8 +1,10 @@ import os +from collections.abc import Mapping from concurrent.futures import ProcessPoolExecutor, wait from io import StringIO from multiprocessing import cpu_count from shutil import Error +from typing import Type, TypeVar from zipfile import ZipFile import re @@ -10,7 +12,8 @@ import numpy as np import pandas as pd import sqlalchemy -from sqlalchemy import select, create_engine, inspect +from sqlalchemy import Column, String, select, create_engine, inspect +from sqlalchemy.orm import DeclarativeBase from sqlalchemy.sql import text from sqlalchemy.sql.sqltypes import Date, DateTime @@ -21,6 +24,8 @@ log = setup_logger() +DeclarativeBase_T = TypeVar("DeclarativeBase_T", bound=DeclarativeBase) + def write_mastr_xml_to_database( engine: sqlalchemy.engine.Engine, @@ -28,6 +33,7 @@ def write_mastr_xml_to_database( data: list, bulk_cleansing: bool, bulk_download_date: str, + mastr_table_to_db_model: Mapping[str, Type[DeclarativeBase_T]], ) -> None: """Write the Mastr in xml format into a database defined by the engine parameter.""" log.info("Starting bulk download...") @@ -44,17 +50,23 @@ def write_mastr_xml_to_database( if not is_table_relevant(xml_table_name, include_tables): continue - sql_table_name = extract_sql_table_name(xml_table_name) + db_model = mastr_table_to_db_model.get(xml_table_name) + if not db_model: + # TODO Warning or error? 
+ log.warning(f"Skipping MaStR file {file_name!r} because no database table was found for {xml_table_name=}") + continue + threads_data.append( ( file_name, xml_table_name, - sql_table_name, + db_model, str(engine.url), engine.url.password, zipped_xml_file_path, bulk_download_date, bulk_cleansing, + mastr_table_to_db_model, ) ) @@ -101,7 +113,7 @@ def get_number_of_processes(): def process_xml_file( file_name: str, xml_table_name: str, - sql_table_name: str, + db_model: Type[DeclarativeBase_T], connection_url: str, password: str, zipped_xml_file_path: str, @@ -122,26 +134,69 @@ def process_xml_file( with ZipFile(zipped_xml_file_path, "r") as f: log.info(f"Processing file '{file_name}'...") if is_first_file(file_name): - log.info(f"Creating table '{sql_table_name}'...") - create_database_table(engine, xml_table_name) + delete_all_existing_entries(db_model) df = read_xml_file(f, file_name) + df = check_for_column_mismatch_and_try_to_solve_it( + df=df, + db_model=db_model, + ) df = process_table_before_insertion( - df, - xml_table_name, - zipped_xml_file_path, - bulk_download_date, - bulk_cleansing, + df=df, + xml_table_name=xml_table_name, + db_model=db_model, + zipped_xml_file_path=zipped_xml_file_path, + bulk_download_date=bulk_download_date, + bulk_cleansing=bulk_cleansing, ) if engine.dialect.name == "sqlite": - add_table_to_sqlite_database(df, xml_table_name, sql_table_name, engine) + add_table_to_sqlite_database( + df=df, + db_model=db_model, + engine=engine, + ) else: add_table_to_non_sqlite_database( - df, xml_table_name, sql_table_name, engine + df=df, + db_model=db_model, + engine=engine, ) except Exception as e: log.error(f"Error processing file '{file_name}': '{e}'") +def delete_all_existing_rows(engine: Engine, db_model: Type[DeclarativeBase_T]) -> None: + with engine.begin() as con: + con.execute(delete(db_model)) + + +def check_for_column_mismatch_and_try_to_solve_it(df: pd.DataFrame, db_model: Type[DeclarativeBase_T]) -> pd.DataFrame: + 
df_column_names = set(df.columns) + db_column_names = {column.name for column in db_model.__table__.columns} + if additional_db_column_names := db_column_names - df_column_names: + log.warning( + f"Database table {db_model.__table__.name} has some columns that weren't found in the XML file." + f" Proceeding and trying to insert anyway. Additional DB columns:" + f" {', '.join(additional_db_column_names)}" + ) + if additional_df_column_names := df_column_names - db_column_names: + # TODO: Check here if the user specified not to issue DDL statements before trying to insert. + log.warning( + f"XML file has some columns that aren't present in the database table {db_model.__table__.name}." + f" Trying to add the columns to the table. Additional XML columns:" + f" {', '.join(additional_df_column_names)}" + ) + try: + add_missing_columns_to_table( + engine=engine, + db_model=db_model, + missing_columns=missing_columns, + ) + except: + log.exception("Could not add at least some columns to the database. 
Ignoring the columns from the XML file instead.")
+            df = df.drop(columns=additional_df_column_names)
+    return df
+
+
 def create_efficient_engine(connection_url: str) -> sqlalchemy.engine.Engine:
     """Create an efficient engine for the SQLite database."""
     is_sqlite = connection_url.startswith("sqlite://")
@@ -254,44 +309,35 @@ def is_first_file(file_name: str) -> bool:
 
 
 def cast_date_columns_to_datetime(
-    xml_table_name: str, df: pd.DataFrame
+    db_model: Type[DeclarativeBase_T], df: pd.DataFrame
 ) -> pd.DataFrame:
-    sqlalchemy_columnlist = tablename_mapping[xml_table_name][
-        "__class__"
-    ].__table__.columns.items()
-    for column in sqlalchemy_columnlist:
-        column_name = column[0]
-        if is_date_column(column, df):
+    for column in db_model.__table__.columns:
+        if is_date_column_and_in_df(column, df):
             # Convert column to datetime64, invalid string -> NaT
-            df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
+            df[column.name] = pd.to_datetime(df[column.name], errors="coerce")
     return df
 
 
-def cast_date_columns_to_string(xml_table_name: str, df: pd.DataFrame) -> pd.DataFrame:
-    column_list = tablename_mapping[xml_table_name][
-        "__class__"
-    ].__table__.columns.items()
-    for column in column_list:
-        column_name = column[0]
-
-        if not (column[0] in df.columns and is_date_column(column, df)):
+def cast_date_columns_to_string(db_model: Type[DeclarativeBase_T], df: pd.DataFrame) -> pd.DataFrame:
+    for column in db_model.__table__.columns:
+        if not is_date_column_and_in_df(column, df):
             continue
-        df[column_name] = pd.to_datetime(df[column_name], errors="coerce")
-        if type(column[1].type) is Date:
-            df[column_name] = (
-                df[column_name].dt.strftime("%Y-%m-%d").replace("NaT", None)
-            )
-        elif type(column[1].type) is DateTime:
-            df[column_name] = (
-                df[column_name].dt.strftime("%Y-%m-%d %H:%M:%S.%f").replace("NaT", None)
-            )
+        df[column.name] = pd.to_datetime(df[column.name], errors="coerce")
+        if type(column.type) is Date:
+            df[column.name] = (
+                df[column.name].dt.strftime("%Y-%m-%d").replace("NaT", None)
+            )
+        elif type(column.type) is DateTime:
+            df[column.name] = (
+                df[column.name].dt.strftime("%Y-%m-%d %H:%M:%S.%f").replace("NaT", None)
+            )
     return df
 
 
-def is_date_column(column, df: pd.DataFrame) -> bool:
-    return type(column[1].type) in [Date,
DateTime] and column[0] in df.columns
+def is_date_column_and_in_df(column: Column, df: pd.DataFrame) -> bool:
+    return type(column.type) in [Date, DateTime] and column.name in df.columns
 
 
 def correct_ordering_of_filelist(files_list: list) -> list:
@@ -329,46 +375,27 @@ def read_xml_file(f: ZipFile, file_name: str) -> pd.DataFrame:
         return handle_xml_syntax_error(xml_file.read().decode("utf-16"), error)
 
 
-def change_column_names_to_orm_format(
-    df: pd.DataFrame, xml_table_name: str
-) -> pd.DataFrame:
-    if tablename_mapping[xml_table_name]["replace_column_names"]:
-        df.rename(
-            columns=tablename_mapping[xml_table_name]["replace_column_names"],
-            inplace=True,
-        )
-    return df
-
-
 def add_table_to_non_sqlite_database(
     df: pd.DataFrame,
-    xml_table_name: str,
-    sql_table_name: str,
+    db_model: Type[DeclarativeBase_T],
     engine: sqlalchemy.engine.Engine,
 ) -> None:
     # get a dictionary for the data types
-    table_columns_list = list(
-        tablename_mapping[xml_table_name]["__class__"].__table__.columns
-    )
     dtypes_for_writing_sql = {
         column.name: column.type
-        for column in table_columns_list
+        for column in db_model.__table__.columns
         if column.name in df.columns
     }
 
     # Convert date and datetime columns into the datatype datetime.
- df = cast_date_columns_to_datetime(xml_table_name, df) - - add_missing_columns_to_table( - engine, xml_table_name, column_list=df.columns.tolist() - ) + df = cast_date_columns_to_datetime(db_model, df) for _ in range(10000): try: with engine.connect() as con: with con.begin(): df.to_sql( - sql_table_name, + db_model.__table__.name, con=con, index=False, if_exists="append", @@ -382,7 +409,7 @@ def add_table_to_non_sqlite_database( except sqlalchemy.exc.IntegrityError: # error resulting from Unique constraint failed df = write_single_entries_until_not_unique_comes_up( - df, xml_table_name, engine + df, db_model, engine ) @@ -419,7 +446,7 @@ def add_zero_as_first_character_for_too_short_string(df: pd.DataFrame) -> pd.Dat def write_single_entries_until_not_unique_comes_up( - df: pd.DataFrame, xml_table_name: str, engine: sqlalchemy.engine.Engine + df: pd.DataFrame, db_model: Type[DeclarativeBase_T], engine: sqlalchemy.engine.Engine ) -> pd.DataFrame: """ Remove from dataframe these rows, which are already existing in the database table @@ -433,15 +460,14 @@ def write_single_entries_until_not_unique_comes_up( ------- Filtered dataframe """ + # TODO: Check if we need to support composite primary keys for the MaStR changes table. + # Because this here assumes single-column primary keys. 
+ primary_key = next(c for c in db_model.__table__.columns if c.primary_key) - table = tablename_mapping[xml_table_name]["__class__"].__table__ - primary_key = next(c for c in table.columns if c.primary_key) - - with engine.connect() as con: - with con.begin(): - key_list = ( - pd.read_sql(sql=select(primary_key), con=con).values.squeeze().tolist() - ) + with engine.begin() as con: + key_list = ( + pd.read_sql(sql=select(primary_key), con=con).values.squeeze().tolist() + ) len_df_before = len(df) df = df.drop_duplicates( @@ -460,8 +486,8 @@ def write_single_entries_until_not_unique_comes_up( def add_missing_columns_to_table( engine: sqlalchemy.engine.Engine, - xml_table_name: str, - column_list: list, + db_model: Type[DeclarativeBase_T], + missing_columns: Collection[str], ) -> None: """ Some files introduce new columns for existing tables. @@ -477,36 +503,27 @@ def add_missing_columns_to_table( ------- """ - log = setup_logger() - - # get the columns name from the existing database - inspector = sqlalchemy.inspect(engine) - table_name = tablename_mapping[xml_table_name]["__class__"].__table__.name - columns = inspector.get_columns(table_name) - column_names_from_database = [column["name"] for column in columns] - - missing_columns = set(column_list) - set(column_names_from_database) - + table_name = db_model.__table__.name for column_name in missing_columns: - if not column_exists(engine, table_name, column_name): - alter_query = 'ALTER TABLE %s ADD "%s" VARCHAR NULL;' % ( - table_name, - column_name, - ) - try: - with engine.connect().execution_options(autocommit=True) as con: - with con.begin(): - con.execute( - text(alter_query).execution_options(autocommit=True) - ) - except sqlalchemy.exc.OperationalError as err: - # If the column already exists, we can ignore the error. 
- if "duplicate column name" not in str(err): - raise err - log.info( - "From the downloaded xml files following new attribute was " - f"introduced: {table_name}.{column_name}" - ) + alter_query = 'ALTER TABLE %s ADD "%s" VARCHAR NULL;' % ( + table_name, + column_name, + ) + try: + with engine.connect().execution_options(autocommit=True) as con: + with con.begin(): + con.execute( + text(alter_query).execution_options(autocommit=True) + ) + except sqlalchemy.exc.OperationalError as err: + # If the column already exists, we can ignore the error. + if "duplicate column name" not in str(err): + raise err + log.info( + f"Added the following columns to database table {table_name}:" + f" {', '.join(missing_columns)}" + ) + def delete_wrong_xml_entry(err: Error, df: pd.DataFrame) -> pd.DataFrame: @@ -563,52 +580,57 @@ def find_nearest_brackets(xml_string: str, position: int) -> tuple[int, int]: def process_table_before_insertion( df: pd.DataFrame, xml_table_name: str, + db_model: Type[DeclarativeBase_T], zipped_xml_file_path: str, bulk_download_date: str, bulk_cleansing: bool, ) -> pd.DataFrame: df = add_zero_as_first_character_for_too_short_string(df) - df = change_column_names_to_orm_format(df, xml_table_name) # Add Column that refers to the source of the data df["DatenQuelle"] = "bulk" df["DatumDownload"] = bulk_download_date if bulk_cleansing: - df = cleanse_bulk_data(df, zipped_xml_file_path) + catalog_columns = { + column.name + for column in db_model.__table__.columns + # TODO: Is it okay to rely so heavily on the SQLALchemy model to decide how to process the table? 
+ if isinstance(column.type, (CatalogInteger, CatalogString)) + } + df = cleanse_bulk_data( + df=df, catalog_columns=catalog_columns, zipped_xml_file_path=zipped_xml_file_path + ) return df def add_table_to_sqlite_database( df: pd.DataFrame, - xml_table_name: str, - sql_table_name: str, + db_model: Type[DeclarativeBase_T], engine: sqlalchemy.engine.Engine, ) -> None: column_list = df.columns.tolist() - add_missing_columns_to_table(engine, xml_table_name, column_list) # Convert NaNs to None. df = df.where(pd.notnull(df), None) # Convert date columns to strings. Dates are not supported directly by SQLite. - df = cast_date_columns_to_string(xml_table_name, df) + df = cast_date_columns_to_string(db_model, df) # Create SQL statement for bulk insert. ON CONFLICT DO NOTHING prevents duplicates. - insert_stmt = f"INSERT INTO {sql_table_name} ({','.join(column_list)}) VALUES ({','.join(['?' for _ in column_list])}) ON CONFLICT DO NOTHING" + insert_stmt = f"INSERT INTO {db_model.__table__.name} ({','.join(column_list)}) VALUES ({','.join(['?' for _ in column_list])}) ON CONFLICT DO NOTHING" for _ in range(10000): try: - with engine.connect() as con: - with con.begin(): - con.connection.executemany(insert_stmt, df.to_numpy()) - break + with engine.begin() as con: + con.connection.executemany(insert_stmt, df.to_numpy()) + break except sqlalchemy.exc.DataError as err: delete_wrong_xml_entry(err, df) except sqlalchemy.exc.IntegrityError: # error resulting from Unique constraint failed df = write_single_entries_until_not_unique_comes_up( - df, xml_table_name, engine + df, db_model, engine ) except: # If any unexpected error occurs, we'll switch back to the non-SQLite method. 
From 5f0a7e44c1475203d8691f4c428aae9de86e661f Mon Sep 17 00:00:00 2001 From: Simon Will Date: Sat, 27 Dec 2025 18:06:24 +0100 Subject: [PATCH 4/7] Continue --- open_mastr/mastr_2.py | 4 +- open_mastr/utils/xsd_tables.py | 5 +- .../xml_download/utils_write_to_database.py | 64 +++++++++++-------- 3 files changed, 44 insertions(+), 29 deletions(-) diff --git a/open_mastr/mastr_2.py b/open_mastr/mastr_2.py index 681c8403..0d8bc360 100644 --- a/open_mastr/mastr_2.py +++ b/open_mastr/mastr_2.py @@ -150,6 +150,7 @@ def download( bulk_cleansing=True, keep_old_downloads: bool = False, mastr_table_to_db_model: Optional[Mapping[str, Type[DeclarativeBase_T]]] = None, + create_and_alter_database_tables: bool = True, **kwargs, ) -> None: """ @@ -221,7 +222,7 @@ def download( if not mastr_table_to_db_model: mastr_table_to_db_model = self.generate_data_model(data=data, catalog_value_as_str=bulk_cleansing) log.info("Ensuring database tables for MaStR are present") - for db_model in mastr_table_to_db_model: + for db_model in mastr_table_to_db_model.values(): db_model.__table__.drop(self.engine, checkfirst=True) db_model.__table__.create(self.engine) @@ -272,6 +273,7 @@ def download( data=data, bulk_cleansing=bulk_cleansing, bulk_download_date=bulk_download_date, + mastr_table_to_db_model=mastr_table_to_db_model, ) def to_csv( diff --git a/open_mastr/utils/xsd_tables.py b/open_mastr/utils/xsd_tables.py index a97ed158..cf3523c0 100644 --- a/open_mastr/utils/xsd_tables.py +++ b/open_mastr/utils/xsd_tables.py @@ -13,8 +13,11 @@ _XML_SCHEMA_PREFIX = "{http://www.w3.org/2001/XMLSchema}" +# TODO: Should we really mess with the original column names? +# The BNetzA "choice" to sometimes write MaStR and sometimes Mastr is certainly confusing, +# but are we the ones who should change that? 
def normalize_column_name(original_mastr_column_name: str) -> str: - return original_mastr_column_name.replace("MaStR", "Mastr") + return original_mastr_column_name.replace("MaStR", "Mastr").replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss").strip() class MastrColumnType(Enum): diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index 13e9f804..b6685d11 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -1,5 +1,5 @@ import os -from collections.abc import Mapping +from collections.abc import Collection, Mapping from concurrent.futures import ProcessPoolExecutor, wait from io import StringIO from multiprocessing import cpu_count @@ -12,7 +12,7 @@ import numpy as np import pandas as pd import sqlalchemy -from sqlalchemy import Column, String, select, create_engine, inspect +from sqlalchemy import Column, Engine, delete, select, create_engine, inspect from sqlalchemy.orm import DeclarativeBase from sqlalchemy.sql import text from sqlalchemy.sql.sqltypes import Date, DateTime @@ -20,6 +20,8 @@ from open_mastr.utils.config import setup_logger from open_mastr.utils.helpers import data_to_include_tables from open_mastr.utils.orm import tablename_mapping +from open_mastr.utils.xsd_tables import normalize_column_name +from open_mastr.utils.sqlalchemy_tables import CatalogInteger, CatalogString from open_mastr.xml_download.utils_cleansing_bulk import cleanse_bulk_data log = setup_logger() @@ -40,6 +42,7 @@ def write_mastr_xml_to_database( include_tables = data_to_include_tables(data, mapping="write_xml") threads_data = [] + lower_mastr_table_to_db_model = {table_name.lower(): db_model for table_name, db_model in mastr_table_to_db_model.items()} with ZipFile(zipped_xml_file_path, "r") as f: files_list = correct_ordering_of_filelist(f.namelist()) @@ -50,23 +53,20 @@ def write_mastr_xml_to_database( if not 
is_table_relevant(xml_table_name, include_tables): continue - db_model = mastr_table_to_db_model.get(xml_table_name) + db_model = lower_mastr_table_to_db_model.get(xml_table_name) if not db_model: - # TODO Warning or error? log.warning(f"Skipping MaStR file {file_name!r} because no database table was found for {xml_table_name=}") continue threads_data.append( ( file_name, - xml_table_name, db_model, str(engine.url), engine.url.password, zipped_xml_file_path, bulk_download_date, bulk_cleansing, - mastr_table_to_db_model, ) ) @@ -112,7 +112,6 @@ def get_number_of_processes(): def process_xml_file( file_name: str, - xml_table_name: str, db_model: Type[DeclarativeBase_T], connection_url: str, password: str, @@ -134,20 +133,20 @@ def process_xml_file( with ZipFile(zipped_xml_file_path, "r") as f: log.info(f"Processing file '{file_name}'...") if is_first_file(file_name): - delete_all_existing_entries(db_model) + delete_all_existing_rows(db_model=db_model, engine=engine) df = read_xml_file(f, file_name) - df = check_for_column_mismatch_and_try_to_solve_it( - df=df, - db_model=db_model, - ) df = process_table_before_insertion( df=df, - xml_table_name=xml_table_name, db_model=db_model, zipped_xml_file_path=zipped_xml_file_path, bulk_download_date=bulk_download_date, bulk_cleansing=bulk_cleansing, ) + df = check_for_column_mismatch_and_try_to_solve_it( + df=df, + db_model=db_model, + engine=engine, + ) if engine.dialect.name == "sqlite": add_table_to_sqlite_database( df=df, @@ -164,20 +163,24 @@ def process_xml_file( log.error(f"Error processing file '{file_name}': '{e}'") -def delete_all_existing_rows(engine: Engine, db_model: Type[DeclarativeBase_T]) -> None: +def delete_all_existing_rows(db_model: Type[DeclarativeBase_T], engine: Engine) -> None: with engine.begin() as con: con.execute(delete(db_model)) -def check_for_column_mismatch_and_try_to_solve_it(df: pd.DataFrame, db_model: Type[DeclarativeBase_T]) -> pd.DataFrame: +def 
check_for_column_mismatch_and_try_to_solve_it(df: pd.DataFrame, db_model: Type[DeclarativeBase_T], engine: Engine) -> pd.DataFrame: df_column_names = set(df.columns) db_column_names = {column.name for column in db_model.__table__.columns} + if additional_db_column_names := db_column_names - df_column_names: - log.warning( + # Many columns are optional and it's perfectly normal to have an XML file / a dataframe that doesn't have + # a column that is present in the database. So this is only worth a debug message. + log.debug( f"Database table {db_model.__table__.name} has some columns that weren't found in the XML file." f" Proceeding and trying to insert anyway. Additional DB columns:" f" {', '.join(additional_db_column_names)}" ) + if additional_df_column_names := df_column_names - db_column_names: # TODO: Check here if the user specified not to issue DDL statements before trying to insert. log.warning( @@ -185,15 +188,17 @@ def check_for_column_mismatch_and_try_to_solve_it(df: pd.DataFrame, db_model: Ty f" Trying to add the columns to the table. Additional XML columns:" f" {', '.join(additional_df_column_names)}" ) + # TODO: What if we can add some columns and not others? We should then return the columns for which we succeeded. try: add_missing_columns_to_table( engine=engine, db_model=db_model, - missing_columns=missing_columns, + missing_columns=additional_df_column_names, ) - except Exception: log.exception("Could not add at least some columns to the database. 
Ignoring the columns from the XML file instead.") df = df.drop(columns=additional_df_column_names) + return df @@ -319,19 +324,19 @@ def cast_date_columns_to_datetime( def cast_date_columns_to_string(db_model: Type[DeclarativeBase_T], df: pd.DataFrame) -> pd.DataFrame: - for column in columns: + for column in db_model.__table__.columns: if not is_date_column_and_in_df(column, df): continue - df[column_name] = pd.to_datetime(df[column_name], errors="coerce") + df[column.name] = pd.to_datetime(df[column.name], errors="coerce") if type(column.type) is Date: - df[column_name] = ( - df[column_name].dt.strftime("%Y-%m-%d").replace("NaT", None) + df[column.name] = ( + df[column.name].dt.strftime("%Y-%m-%d").replace("NaT", None) ) elif type(column.type) is DateTime: - df[column_name] = ( - df[column_name].dt.strftime("%Y-%m-%d %H:%M:%S.%f").replace("NaT", None) + df[column.name] = ( + df[column.name].dt.strftime("%Y-%m-%d %H:%M:%S.%f").replace("NaT", None) ) return df @@ -579,7 +584,6 @@ def find_nearest_brackets(xml_string: str, position: int) -> tuple[int, int]: def process_table_before_insertion( df: pd.DataFrame, - xml_table_name: str, db_model: Type[DeclarativeBase_T], zipped_xml_file_path: str, bulk_download_date: str, @@ -591,6 +595,8 @@ def process_table_before_insertion( df["DatenQuelle"] = "bulk" df["DatumDownload"] = bulk_download_date + df = normalize_column_names_in_df(df) + if bulk_cleansing: catalog_columns = { column.name @@ -604,6 +610,10 @@ def process_table_before_insertion( return df +def normalize_column_names_in_df(df: pd.DataFrame) -> pd.DataFrame: + return df.rename(columns={column_name: normalize_column_name(column_name) for column_name in df.columns}) + + def add_table_to_sqlite_database( df: pd.DataFrame, db_model: Type[DeclarativeBase_T], @@ -632,9 +642,9 @@ def add_table_to_sqlite_database( df = write_single_entries_until_not_unique_comes_up( df, db_model, engine ) - except: + except Exception: # If any unexpected error occurs, we'll switch 
back to the non-SQLite method. - add_table_to_non_sqlite_database(df, xml_table_name, sql_table_name, engine) + add_table_to_non_sqlite_database(df, db_model, engine) break From 8a3a5950a89003ac1a627c621a9b42a88c43ee99 Mon Sep 17 00:00:00 2001 From: Simon Will Date: Sat, 27 Dec 2025 18:51:27 +0100 Subject: [PATCH 5/7] Rename mastr_2.py to mastr.py --- open_mastr/mastr.py | 231 +++++++++++----------------------- open_mastr/mastr_2.py | 284 ------------------------------------------ 2 files changed, 70 insertions(+), 445 deletions(-) delete mode 100644 open_mastr/mastr_2.py diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index be617eb8..0d8bc360 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -1,14 +1,20 @@ import os -from sqlalchemy import inspect, create_engine +from pathlib import Path +from sqlalchemy import inspect, create_engine, Engine +from sqlalchemy.orm import DeclarativeBase +from typing import Literal, Optional, Type, TypeVar, Union +from collections.abc import Mapping # import xml dependencies from open_mastr.xml_download.utils_download_bulk import ( + download_documentation, download_xml_Mastr, delete_xml_files_not_from_given_date, ) from open_mastr.xml_download.utils_write_to_database import ( write_mastr_xml_to_database, ) +from open_mastr.utils.xsd_tables import MastrTableDescription, read_mastr_table_descriptions_from_xsd from open_mastr.utils.helpers import ( validate_parameter_format_for_download_method, @@ -34,6 +40,10 @@ setup_logger, ) import open_mastr.utils.orm as orm +from open_mastr.utils.sqlalchemy_tables import ( + make_sqlalchemy_model_from_mastr_table_description, + MastrBase +) # constants from open_mastr.utils.constants import TECHNOLOGIES, ADDITIONAL_TABLES @@ -41,6 +51,9 @@ # setup logger log = setup_logger() +# TODO: Repeating Type[DeclarativeBase_T] in function signatures is strange. There must be a better option. 
+DeclarativeBase_T = TypeVar("DeclarativeBase_T", bound=DeclarativeBase) + class Mastr: """ @@ -71,21 +84,23 @@ class Mastr: """ - def __init__(self, engine="sqlite", connect_to_translated_db=False) -> None: + def __init__( + self, + engine: Union[Engine, Literal["sqlite"]] = "sqlite", + mastr_table_to_db_table_name: Optional[dict[str, str]] = None, + output_dir: Optional[Union[str, Path]] = None, + home_dir: Optional[Union[str, Path]] = None, + ) -> None: validate_parameter_format_for_mastr_init(engine) - self.output_dir = get_output_dir() - self.home_directory = get_project_home_dir() + self.output_dir = output_dir or get_output_dir() + self.home_directory = home_dir or get_project_home_dir() + self._sqlite_folder_path = os.path.join(self.output_dir, "data", "sqlite") + os.makedirs(self._sqlite_folder_path, exist_ok=True) - self.is_translated = connect_to_translated_db - if connect_to_translated_db: - self.engine = create_translated_database_engine( - engine, self._sqlite_folder_path - ) - else: - self.engine = create_database_engine(engine, self._sqlite_folder_path) + self.engine = create_database_engine(engine, self._sqlite_folder_path) log.info( "\n==================================================\n" @@ -97,7 +112,35 @@ def __init__(self, engine="sqlite", connect_to_translated_db=False) -> None: "'pip install --upgrade open-mastr'\n" ) - orm.Base.metadata.create_all(self.engine) + def generate_data_model( + self, + data: Optional[list[str]] = None, + catalog_value_as_str: bool = True, + base: Type[DeclarativeBase_T] = MastrBase, + ) -> dict[str, Type[DeclarativeBase_T]]: + data = transform_data_parameter(data) + + docs_folder_path = os.path.join(self.output_dir, "data", "docs_download") + os.makedirs(docs_folder_path, exist_ok=True) + zipped_docs_file_path = os.path.join( + docs_folder_path, + "Dokumentation MaStR Gesamtdatenexport.zip" + ) + download_documentation(zipped_docs_file_path) + + mastr_table_descriptions = read_mastr_table_descriptions_from_xsd( + 
zipped_docs_file_path=zipped_docs_file_path, data=data + ) + mastr_table_to_db_model: dict[str, DeclarativeBase_T] = {} + for mastr_table_description in mastr_table_descriptions: + sqlalchemy_model = make_sqlalchemy_model_from_mastr_table_description( + table_description=mastr_table_description, + catalog_value_as_str=catalog_value_as_str, + base=base + ) + mastr_table_to_db_model[mastr_table_description.table_name] = sqlalchemy_model + + return mastr_table_to_db_model def download( self, @@ -106,6 +149,8 @@ def download( date=None, bulk_cleansing=True, keep_old_downloads: bool = False, + mastr_table_to_db_model: Optional[Mapping[str, Type[DeclarativeBase_T]]] = None, + create_and_alter_database_tables: bool = True, **kwargs, ) -> None: """ @@ -165,13 +210,6 @@ def download( keep_old_downloads: bool If set to True, prior downloaded MaStR zip files will be kept. """ - - if self.is_translated: - raise TypeError( - "You are currently connected to a translated database.\n" - "A translated database cannot be further processed." - ) - if method == "API": log.warning( "Downloading the whole registry via the MaStR SOAP-API is deprecated. 
" @@ -181,6 +219,13 @@ def download( log.warning("Attention: method='API' changed to method='bulk'.") method = "bulk" + if not mastr_table_to_db_model: + mastr_table_to_db_model = self.generate_data_model(data=data, catalog_value_as_str=bulk_cleansing) + log.info("Ensuring database tables for MaStR are present") + for db_model in mastr_table_to_db_model.values(): + db_model.__table__.drop(self.engine, checkfirst=True) + db_model.__table__.create(self.engine) + validate_parameter_format_for_download_method( method=method, data=data, @@ -192,21 +237,20 @@ def download( date = transform_date_parameter(self, date, **kwargs) - # Find the name of the zipped xml folder bulk_download_date = parse_date_string(date) xml_folder_path = os.path.join(self.output_dir, "data", "xml_download") os.makedirs(xml_folder_path, exist_ok=True) zipped_xml_file_path = os.path.join( xml_folder_path, - f"Gesamtdatenexport_{bulk_download_date}.zip", + f"Gesamtdatenexport_{bulk_download_date.strftime('%Y%m%d')}.zip", ) delete_zip_file_if_corrupted(zipped_xml_file_path) if not keep_old_downloads: - delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) + delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) - download_xml_Mastr(zipped_xml_file_path, date, data, xml_folder_path) + download_xml_Mastr(zipped_xml_file_path, bulk_download_date, data, xml_folder_path) log.info( "\nWould you like to speed up the creation of your MaStR database?\n" @@ -217,7 +261,6 @@ def download( delete_zip_file_if_corrupted(zipped_xml_file_path) delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) - print( "\nWould you like to speed up the creation of your MaStR database?\n" "Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True " @@ -230,146 +273,12 @@ def download( data=data, bulk_cleansing=bulk_cleansing, bulk_download_date=bulk_download_date, + mastr_table_to_db_model=mastr_table_to_db_model, ) def 
to_csv( self, tables: list = None, chunksize: int = 500000, limit: int = None ) -> None: - """ - Save the database as csv files along with the metadata file. - If 'tables=None' all possible tables will be exported. - - Parameters - ------------ - tables: None or list - For exporting selected tables choose from: - ["wind", "solar", "biomass", "hydro", "gsgk", "combustion", "nuclear", "storage", - "balancing_area", "electricity_consumer", "gas_consumer", "gas_producer", - "gas_storage", "gas_storage_extended", - "grid_connections", "grids", "market_actors", "market_roles", - "locations_extended", "permit", "deleted_units", "storage_units"] - chunksize: int - Defines the chunksize of the tables export. - Default value is 500.000 rows to include in each chunk. - limit: None or int - Limits the number of exported data rows. - """ - - if self.is_translated: - raise TypeError( - "You are currently connected to a translated database.\n" - "A translated database cannot be used for the csv export." 
- ) - - log.info("Starting csv-export") - - data_path = get_data_version_dir() - - create_data_dir() - - # Validate and parse tables parameter - validate_parameter_data(method="csv_export", data=tables) - data = transform_data_parameter( - method="bulk", data=tables, api_data_types=None, api_location_types=None - ) - - # Determine tables to export - technologies_to_export = [] - additional_tables_to_export = [] - for table in data: - if table in TECHNOLOGIES: - technologies_to_export.append(table) - elif table in ADDITIONAL_TABLES: - additional_tables_to_export.append(table) - else: - additional_tables_to_export.extend( - data_to_include_tables([table], mapping="export_db_tables") - ) - - if technologies_to_export: - log.info(f"Technology tables: {technologies_to_export}") - if additional_tables_to_export: - log.info(f"Additional tables: {additional_tables_to_export}") - - log.info(f"Tables are saved to: {data_path}") - - reverse_fill_basic_units(technology=technologies_to_export, engine=self.engine) - - # Export technologies to csv - for tech in technologies_to_export: - db_query_to_csv( - db_query=create_db_query(tech=tech, limit=limit, engine=self.engine), - data_table=tech, - chunksize=chunksize, - ) - # Export additional tables to csv - for addit_table in additional_tables_to_export: - db_query_to_csv( - db_query=create_db_query( - additional_table=addit_table, limit=limit, engine=self.engine - ), - data_table=addit_table, - chunksize=chunksize, - ) - - # FIXME: Currently metadata is only created for technology data, Fix in #386 - # Configure and save data package metadata file along with data - # save_metadata(data=technologies_to_export, engine=self.engine) - - def translate(self) -> None: - """ - A database can be translated only once. - - Deletes translated versions of the currently connected database. - - Translates currently connected database,renames it with '-translated' - suffix and updates self.engine's path accordingly. - - !!! 
example - ```python - - from open_mastr import Mastr - import pandas as pd - - db = Mastr() - db.download(data='biomass') - db.translate() - - df = pd.read_sql(sql='biomass_extended', con=db.engine) - print(df.head(10)) - ``` - - """ - - if "sqlite" not in self.engine.dialect.name: - raise ValueError("engine has to be of type 'sqlite'") - if self.is_translated: - raise TypeError("The currently connected database is already translated.") - - inspector = inspect(self.engine) - old_path = r"{}".format(self.engine.url.database) - new_path = old_path[:-3] + "-translated.db" - - if os.path.exists(new_path): - try: - os.remove(new_path) - except Exception as e: - log.error( - f"An error occurred while removing old translated database: {e}" - ) - - log.info("Replacing previous version of the translated database...") - - for table in inspector.get_table_names(): - rename_table(table, inspector.get_columns(table), self.engine) - - self.engine.dispose() - - try: - os.rename(old_path, new_path) - log.info(f"Database '{old_path}' changed to '{new_path}'") - except Exception as e: - log.error(f"An error occurred while renaming database: {e}") + pass + # TODO: Think about this. 
- self.engine = create_engine(f"sqlite:///{new_path}") - self.is_translated = True diff --git a/open_mastr/mastr_2.py b/open_mastr/mastr_2.py deleted file mode 100644 index 0d8bc360..00000000 --- a/open_mastr/mastr_2.py +++ /dev/null @@ -1,284 +0,0 @@ -import os -from pathlib import Path -from sqlalchemy import inspect, create_engine, Engine -from sqlalchemy.orm import DeclarativeBase -from typing import Literal, Optional, Type, TypeVar, Union -from collections.abc import Mapping - -# import xml dependencies -from open_mastr.xml_download.utils_download_bulk import ( - download_documentation, - download_xml_Mastr, - delete_xml_files_not_from_given_date, -) -from open_mastr.xml_download.utils_write_to_database import ( - write_mastr_xml_to_database, -) -from open_mastr.utils.xsd_tables import MastrTableDescription, read_mastr_table_descriptions_from_xsd - -from open_mastr.utils.helpers import ( - validate_parameter_format_for_download_method, - validate_parameter_format_for_mastr_init, - validate_parameter_data, - transform_data_parameter, - parse_date_string, - transform_date_parameter, - data_to_include_tables, - create_db_query, - db_query_to_csv, - reverse_fill_basic_units, - delete_zip_file_if_corrupted, - create_database_engine, - rename_table, - create_translated_database_engine, -) -from open_mastr.utils.config import ( - create_data_dir, - get_data_version_dir, - get_project_home_dir, - get_output_dir, - setup_logger, -) -import open_mastr.utils.orm as orm -from open_mastr.utils.sqlalchemy_tables import ( - make_sqlalchemy_model_from_mastr_table_description, - MastrBase -) - -# constants -from open_mastr.utils.constants import TECHNOLOGIES, ADDITIONAL_TABLES - -# setup logger -log = setup_logger() - -# TODO: Repeating Type[DeclarativeBase_T] in function signatures is strange. There must be a better option. 
-DeclarativeBase_T = TypeVar("DeclarativeBase_T", bound=DeclarativeBase) - - -class Mastr: - """ - `Mastr` is used to download the MaStR database and keep it up-to-date. - - An SQL database is used to mirror the MaStR database. It is filled by - downloading and parsing the MaStR via bulk download. - - !!! example - - ```python - from open_mastr import Mastr - - db = Mastr() - db.download() - ``` - - Parameters - ---------- - engine : {'sqlite', sqlalchemy.engine.Engine}, optional - Defines the engine of the database where the MaStR is mirrored to. - Default is 'sqlite'. - connect_to_translated_db: boolean, optional - Allows connection to an existing translated database. Default is 'False'. - Only for 'sqlite'-type engines. - - - - """ - - def __init__( - self, - engine: Union[Engine, Literal["sqlite"]] = "sqlite", - mastr_table_to_db_table_name: Optional[dict[str, str]] = None, - output_dir: Optional[Union[str, Path]] = None, - home_dir: Optional[Union[str, Path]] = None, - ) -> None: - validate_parameter_format_for_mastr_init(engine) - - self.output_dir = output_dir or get_output_dir() - self.home_directory = home_dir or get_project_home_dir() - - self._sqlite_folder_path = os.path.join(self.output_dir, "data", "sqlite") - - os.makedirs(self._sqlite_folder_path, exist_ok=True) - - self.engine = create_database_engine(engine, self._sqlite_folder_path) - - log.info( - "\n==================================================\n" - "---------> open-MaStR started <---------\n" - "==================================================\n" - f"Data will be written to the following database: {self.engine.url}\n" - "If you run into problems, try to " - "delete the database and update the package by running " - "'pip install --upgrade open-mastr'\n" - ) - - def generate_data_model( - self, - data: Optional[list[str]] = None, - catalog_value_as_str: bool = True, - base: Type[DeclarativeBase_T] = MastrBase, - ) -> dict[str, Type[DeclarativeBase_T]]: - data = 
transform_data_parameter(data) - - docs_folder_path = os.path.join(self.output_dir, "data", "docs_download") - os.makedirs(docs_folder_path, exist_ok=True) - zipped_docs_file_path = os.path.join( - docs_folder_path, - "Dokumentation MaStR Gesamtdatenexport.zip" - ) - download_documentation(zipped_docs_file_path) - - mastr_table_descriptions = read_mastr_table_descriptions_from_xsd( - zipped_docs_file_path=zipped_docs_file_path, data=data - ) - mastr_table_to_db_model: dict[str, DeclarativeBase_T] = {} - for mastr_table_description in mastr_table_descriptions: - sqlalchemy_model = make_sqlalchemy_model_from_mastr_table_description( - table_description=mastr_table_description, - catalog_value_as_str=catalog_value_as_str, - base=base - ) - mastr_table_to_db_model[mastr_table_description.table_name] = sqlalchemy_model - - return mastr_table_to_db_model - - def download( - self, - method="bulk", - data=None, - date=None, - bulk_cleansing=True, - keep_old_downloads: bool = False, - mastr_table_to_db_model: Optional[Mapping[str, Type[DeclarativeBase_T]]] = None, - create_and_alter_database_tables: bool = True, - **kwargs, - ) -> None: - """ - Downloads the MaStR registry and writes it to a local database. - - Parameters - ---------- - method : 'bulk', optional - Only "bulk" is a valid value. The download via the MaStR SOAP API is deprecated. - Default to 'bulk'. - data : str or list or None, optional - Specifies which tables to download. - - **Possible values:** - - - "wind" - - "solar" - - "biomass" - - "hydro" - - "gsgk" - - "combustion" - - "nuclear" - - "gas" - - "storage" - - "storage_units" - - "electricity_consumer" - - "location" - - "market" - - "grid" - - "balancing_area" - - "permit" - - "deleted_units" - - "deleted_market_actors" - - "retrofit_units" - - **Usage:** - - - If `None`, all data is downloaded. - - If a string, only the specified table is downloaded (e.g., `"wind"`). - - If a list, multiple tables are downloaded (e.g., `["wind", "solar"]`). 
- - date : None or `datetime.datetime` or str, optional - - | date | description | - |-----------------------|------| - | "today" | latest files are downloaded from marktstammdatenregister.de | - | "20230101" | If file from this date exists locally, it is used. Otherwise it throws an error (You can only receive todays data from the server) | - | "existing" | Deprecated since 0.16, see [#616](https://github.com/OpenEnergyPlatform/open-MaStR/issues/616#issuecomment-3089377062) | - | None | set date="today" | - - Default to `None`. - bulk_cleansing : bool, optional - If set to True, data cleansing is applied after the download (which is recommended). - In its original format, many entries in the MaStR are encoded with IDs. Columns like - `state` or `fueltype` do not contain entries such as "Hessen" or "Braunkohle", but instead - only contain IDs. Cleansing replaces these IDs with their corresponding original entries. - keep_old_downloads: bool - If set to True, prior downloaded MaStR zip files will be kept. - """ - if method == "API": - log.warning( - "Downloading the whole registry via the MaStR SOAP-API is deprecated. " - "You can still use the open_mastr.soap_api.download.MaStRAPI class " - "to construct single calls." 
- ) - log.warning("Attention: method='API' changed to method='bulk'.") - method = "bulk" - - if not mastr_table_to_db_model: - mastr_table_to_db_model = self.generate_data_model(data=data, catalog_value_as_str=bulk_cleansing) - log.info("Ensuring database tables for MaStR are present") - for db_model in mastr_table_to_db_model.values(): - db_model.__table__.drop(self.engine, checkfirst=True) - db_model.__table__.create(self.engine) - - validate_parameter_format_for_download_method( - method=method, - data=data, - date=date, - bulk_cleansing=bulk_cleansing, - **kwargs, - ) - data = transform_data_parameter(data, **kwargs) - - date = transform_date_parameter(self, date, **kwargs) - - # Find the name of the zipped xml folder - bulk_download_date = parse_date_string(date) - xml_folder_path = os.path.join(self.output_dir, "data", "xml_download") - os.makedirs(xml_folder_path, exist_ok=True) - zipped_xml_file_path = os.path.join( - xml_folder_path, - f"Gesamtdatenexport_{bulk_download_date.strftime('%Y%m%d')}.zip", - ) - - delete_zip_file_if_corrupted(zipped_xml_file_path) - if not keep_old_downloads: - delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) - - download_xml_Mastr(zipped_xml_file_path, bulk_download_date, data, xml_folder_path) - - log.info( - "\nWould you like to speed up the creation of your MaStR database?\n" - "Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True " - "or configure your own number of processes via os.environ['NUMBER_OF_PROCESSES'] = your_number\n" - ) - - delete_zip_file_if_corrupted(zipped_xml_file_path) - delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) - - print( - "\nWould you like to speed up the creation of your MaStR database?\n" - "Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True " - "or configure your own number of processes via os.environ['NUMBER_OF_PROCESSES'] = 
your_number\n" - ) - - write_mastr_xml_to_database( - engine=self.engine, - zipped_xml_file_path=zipped_xml_file_path, - data=data, - bulk_cleansing=bulk_cleansing, - bulk_download_date=bulk_download_date, - mastr_table_to_db_model=mastr_table_to_db_model, - ) - - def to_csv( - self, tables: list = None, chunksize: int = 500000, limit: int = None - ) -> None: - pass - # TODO: Think about this. - From d1660b37844c964f2367b21904ca27646d4364c3 Mon Sep 17 00:00:00 2001 From: Simon Will Date: Sun, 4 Jan 2026 19:28:25 +0100 Subject: [PATCH 6/7] Get to working state --- open_mastr/mastr.py | 84 +++++++--- ...tR-Gesamtdatenexport-20251227-Fallback.zip | Bin 0 -> 38958 bytes open_mastr/utils/sqlalchemy_tables.py | 2 +- open_mastr/utils/xsd_tables.py | 5 +- open_mastr/xml_download/colums_to_replace.py | 95 ------------ open_mastr/xml_download/parse.py | 144 ++++++++++++++++++ open_mastr/xml_download/schema.py | 49 ++++++ .../xml_download/utils_cleansing_bulk.py | 3 +- .../xml_download/utils_download_bulk.py | 17 +-- .../xml_download/utils_write_to_database.py | 127 ++++++++------- open_mastr/xml_download/xsd_to_table.py | 61 ++++++++ tests/test_helpers.py | 22 +-- tests/test_mastr.py | 84 +++------- .../xml_download/test_utils_cleansing_bulk.py | 38 ++++- .../xml_download/test_utils_download_bulk.py | 21 +-- .../test_utils_write_to_database.py | 134 ++++++++-------- 16 files changed, 541 insertions(+), 345 deletions(-) create mode 100644 open_mastr/resources/Dokumentation-MaStR-Gesamtdatenexport-20251227-Fallback.zip create mode 100644 open_mastr/xml_download/parse.py create mode 100644 open_mastr/xml_download/schema.py create mode 100644 open_mastr/xml_download/xsd_to_table.py diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 0d8bc360..34f98684 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -1,6 +1,6 @@ import os from pathlib import Path -from sqlalchemy import inspect, create_engine, Engine +from sqlalchemy import inspect, create_engine, Engine, 
Table from sqlalchemy.orm import DeclarativeBase from typing import Literal, Optional, Type, TypeVar, Union from collections.abc import Mapping @@ -53,6 +53,7 @@ # TODO: Repeating Type[DeclarativeBase_T] in function signatures is strange. There must be a better option. DeclarativeBase_T = TypeVar("DeclarativeBase_T", bound=DeclarativeBase) +FALLBACK_DOCS_PATH = Path(__file__).parent / "resources" / "Dokumentation-MaStR-Gesamtdatenexport-20251227-Fallback.zip" class Mastr: @@ -116,6 +117,10 @@ def generate_data_model( self, data: Optional[list[str]] = None, catalog_value_as_str: bool = True, + # TODO: A _repeated_ call to this function with the same base and overlapping data will fail with something like: + # sqlalchemy.exc.InvalidRequestError: Table 'AnlagenEegBiomasse' is already defined for this MetaData instance. + # Specify 'extend_existing=True' to redefine options and columns on an existing Table object. + # Is this expected behavior for us? Should we re-raise with a more understandable message? 
base: Type[DeclarativeBase_T] = MastrBase, ) -> dict[str, Type[DeclarativeBase_T]]: data = transform_data_parameter(data) @@ -126,21 +131,25 @@ def generate_data_model( docs_folder_path, "Dokumentation MaStR Gesamtdatenexport.zip" ) - download_documentation(zipped_docs_file_path) - - mastr_table_descriptions = read_mastr_table_descriptions_from_xsd( - zipped_docs_file_path=zipped_docs_file_path, data=data - ) - mastr_table_to_db_model: dict[str, DeclarativeBase_T] = {} - for mastr_table_description in mastr_table_descriptions: - sqlalchemy_model = make_sqlalchemy_model_from_mastr_table_description( - table_description=mastr_table_description, + try: + download_documentation(zipped_docs_file_path) + return _download_docs_and_generate_data_model( + zipped_docs_file_path=zipped_docs_file_path, + data=data, + catalog_value_as_str=catalog_value_as_str, + base=base, + ) + except Exception as e: + log.exception( + f"Encountered {e} when downloading or processing MaStR documentation." + f" Falling back to stored docs at {FALLBACK_DOCS_PATH}" + ) + return _download_docs_and_generate_data_model( + zipped_docs_file_path=FALLBACK_DOCS_PATH, + data=data, catalog_value_as_str=catalog_value_as_str, base=base ) - mastr_table_to_db_model[mastr_table_description.table_name] = sqlalchemy_model - - return mastr_table_to_db_model def download( self, @@ -149,8 +158,8 @@ def download( date=None, bulk_cleansing=True, keep_old_downloads: bool = False, - mastr_table_to_db_model: Optional[Mapping[str, Type[DeclarativeBase_T]]] = None, - create_and_alter_database_tables: bool = True, + mastr_table_to_db_table: Optional[Mapping[str, Table]] = None, + alter_database_tables: bool = True, **kwargs, ) -> None: """ @@ -219,12 +228,22 @@ def download( log.warning("Attention: method='API' changed to method='bulk'.") method = "bulk" - if not mastr_table_to_db_model: - mastr_table_to_db_model = self.generate_data_model(data=data, catalog_value_as_str=bulk_cleansing) - log.info("Ensuring database tables 
for MaStR are present") - for db_model in mastr_table_to_db_model.values(): - db_model.__table__.drop(self.engine, checkfirst=True) - db_model.__table__.create(self.engine) + if not mastr_table_to_db_table: + class TemporaryBase(DeclarativeBase): + pass + mastr_table_to_db_model = self.generate_data_model( + data=data, + catalog_value_as_str=bulk_cleansing, + base=TemporaryBase, + ) + mastr_table_to_db_table = { + mastr_table: db_model.__table__ + for mastr_table, db_model in mastr_table_to_db_model.items() + } + log.info("Ensuring database tables for MaStR are present: Dropping old tables if existing and creating new ones.") + for db_table in mastr_table_to_db_table.values(): + db_table.drop(self.engine, checkfirst=True) + db_table.create(self.engine) validate_parameter_format_for_download_method( method=method, @@ -273,7 +292,8 @@ def download( data=data, bulk_cleansing=bulk_cleansing, bulk_download_date=bulk_download_date, - mastr_table_to_db_model=mastr_table_to_db_model, + mastr_table_to_db_table=mastr_table_to_db_table, + alter_database_tables=alter_database_tables, ) def to_csv( @@ -282,3 +302,23 @@ def to_csv( pass # TODO: Think about this. 
+ +def _download_docs_and_generate_data_model( + zipped_docs_file_path: Path, + data: list[str], + catalog_value_as_str: bool = True, + base: Type[DeclarativeBase_T] = MastrBase, +): + mastr_table_descriptions = read_mastr_table_descriptions_from_xsd( + zipped_docs_file_path=zipped_docs_file_path, data=data + ) + mastr_table_to_db_model: dict[str, DeclarativeBase_T] = {} + for mastr_table_description in mastr_table_descriptions: + sqlalchemy_model = make_sqlalchemy_model_from_mastr_table_description( + table_description=mastr_table_description, + catalog_value_as_str=catalog_value_as_str, + base=base + ) + mastr_table_to_db_model[mastr_table_description.table_name] = sqlalchemy_model + + return mastr_table_to_db_model diff --git a/open_mastr/resources/Dokumentation-MaStR-Gesamtdatenexport-20251227-Fallback.zip b/open_mastr/resources/Dokumentation-MaStR-Gesamtdatenexport-20251227-Fallback.zip new file mode 100644 index 0000000000000000000000000000000000000000..242b39ca9daf5b1d1b6970bb87f74ecaa277b1dc GIT binary patch literal 38958 zcma%iW0+&>wsqOItGaC4wr!hTw(aV&jV`;(wr#V^Hoxk9&%Jw}d+v`f&q}hACs`}W zobMQ8jyWSQ1q=cO@aLj!m#OvF&cAL*0dN3>?W~+_Ol+MDoGk2Y31keEoD>PfOdJht zoQw^eOl(cuzS}uC(Wxjy0szNRC|CecDp)wXK?4AS903CW{Bw~+we*3qt0rK_i*8%AMe2_n{fB*<~ zFjB^q{`oQv1OU*C1OR~buU81zS{s;|*ov5#30m0M7&tnb(0;w~Tz$-TjUJ&L{fG~u zm8}jNg(iSqAb{A$uaM@3JVF;rQdc5qAgPbUXh?6X$XdgPH2&2O`pHyjePtu<8^y_$ z+fT+zMCK73E+5b815ZDN&7@SPx7W5vHvG4}f~9J>P$zx`C`q?o3>qP;ZNjA!7()Z5 zxoa9;^&cbZf+>iM@_weaz2io*;ZF30F*#Usd=LRW`Za9JTVu{h?KX|8J&&j5YlLZ2 zr94w4Is6{vC{Az4};A0aifv0(9KX^z& z^zw%-9c)5c2C7a4C8));K2UC2u(B(STk!g7adW_1d;Rz*F2Rr_r~%9)A+M3k(Ig>H z0yiaCKzSYI)$~QdWIYDMMfQkg0Ho-PFOh`4JBN@ff{Nrh|Iqf$Z0c=AYdSovSAn`S z#X#<=31otO=Wxa3%sgUj%7k_uZ(tsPGY7X8=kc*F(7MKq(ZyAu71NbP0wL0; z(1WdZ>R&P2pPoA7cD5uV{VL_r4~)9t;ki;M;@&m|4eU^~PwI(!jQ}LB(EC#-YIr@U zj{~K;dVVkY$y_Aj0O>a^jX{hEj*_$L+qW1Kp)~V($c`5;Uxw?XLq>3pR;A0+tIt(4F|sr+2XqSuV^s((T!Jfg*Ogb$&6h%f^zj>ZV1RMGV 
zJ*hw#9;-49Or2>jD58gDv(cr9`CD*3I?ddf1fp@=LPC2;t@eSAFhzc0Oa$eJ`dunrl-9z~ytGt6O%ukC~)d zl^smPPfHHYHc>6;wc5p5(epg^X9W>%_fO~Bshd9UYxKXKIZHWRwyRhoA3@0ueWs# z&aR)!e#c*!jnKwrw>jRPcCV{Fk#tv_8!cWlkWL1B2pBRVuRm|d_tJzvxx8@j>Wc2@ zXxoc{^4Q#{2{PGhlfM8P)tvBa0Ed^8c0jy44uP1Qzs+HLfVD-~k(Z4IO^b`;@qNYG zg^`O#!fD^vnmPTW+O{Nx!)Sl?R`EAzVW_r1q)x#;qCR%ME@8`L>;ZV6unh6|s~qyM zD*7oH94xG}5{lrQx7y=VCf>>1JdDZ#` zI$JFKTJj%l?PmBFS&F%?Mku7N!!yzi(0fs%enauSt-lSV^)sl{O5?;&o2a*niLey& z^NO;sfovTvdXNtanM;vG#1yW^%}0xf%1LieF;T)R@E+APY@jCeVr{&;qz|nkR2qn3 zAq_2I#3i4)5Cr_W!1~P?u>5J!oEG77t)7?RZ`fo%Blr02ql|lLv|Ng6wI$OT&^Y-y zsu(OKS(HNwOapgb8=xH^5`+^H)P{GeQ-|_GMRjE!y}%} z$n3GDZ7)!@q+cozhOQ55>95@Z1E;9Qr<-oi)=r&Hyc(i$3H`znmfywM5_$hsRLdAS z1{PF}K)g7;0%56QJ7!H=mJ&M+OnXT8+*HVrY-|;i`%;o~lKgpB@oV7sDw*YrXxg`AvHNBiILl^ovC@dq*kIZk>ZZjs& z>QrLFm-}>@1uE$egxKD5);`tFRKK<;^i9bQN-`2g$Ylc}6&%A1*k59p_k~W)^=#+k zCST>KI7W=1Whh#KuntU@kt9cd*v3-O)ZWmYG6V=rSX=-oy)r1;h%&rY<0vjVY9ynR zFJl6$eoeF(>5~+q>b+XFt%-c_XT$H7U8=9_6&%K6IDp({BJRKeDJnBLWMrE(&K8`> z`CSv1Y!8bp^@ZYnM>2I}?$>xjsuoM8i5fV~w1wS_ST!s+si6h3Cq6J1LZbeFY~894 zt#J@C&;TY3Gfsvix1TY`&@G}o9QN4bvHnA~_NOyaXB6E?qQIU55SjH5&*Hrji`rt! 
zO@_{HQ%+kIiMf+T6}Au?EAs)1aY%RUr6`&*grJ1^5?)#_GP&<-RW>TkS=DNtbRz!s znAjdRy8ZXF?#v||s`0G?4b0%32r?Y^Lhwn?C6EEY0m)tGmU>nnwsvj57Z9d2R5*;I zKbIfLellndhk|8;#_(fwM@t1+Ssr*i;;NE0K6(}|a2^8`6c=|Lh~0nu1GR9tjFkft zl^;OAPzwbD0084ZP)qHP8vHBT6G?x;R>)=23qJK$WZqB9pzrz062!!_l92DsDppLa zR4zltu4v*u-a}wUhQI?H2S+PeW`|ksHt^Mpn(k8cs-_q(9kNT00DIPy46yGx32k!d z9}5RV@O_QA%WhzrZUMu?!-(r9qFelgnfcSMPy0df;XxG0DCYbg&IXsMQu(+QHb+xo zXn_Rkm8w}PT?uK-PwVFo0ep{pL;0ywMLcwLogyCO%t0S4ls|wr1z)t(p-mm_9GNEX z1baMg+WiPM2WhxDl2t)t1F5P?242K)xQrab6%N0rQ%xg{nJ;Yt$~k|pi#28IZ^@hT zFD1BYn&9E5q=Cne>YKj_pKDYNxXnW@b=}UuVFB~_)|{=>N!~^=3#Q)d=0+4naLFXd zFelIKK})eT~L;r7LFvR3qVM%Ep@7byBJwxUGf+=m^MN$GXI4qDpt;8Rd%Uu=#rR8ws|5qI$^@r~|)lPhC zyuO5z0ue~KVz79rxEA7zJlz5o*dL+v02rjV#W0Am=SLq$3cjW9@;hUQjcIdZa6uZ% z3`p`8A;b2*Vwcmeqh^Z1yp^xD37>YoeC+WOi%DmximGoTW zlX==UFr2p_0s)|mY2PYiny^mJaWNOOsrF<0v6BaDWmYid6|X9#%H^Z?tAW8d$y^LF zrk)_@FfsFn6$&+gwZK)R5#J{2?vR-a({FblUP6g8@CsEj=Bi0PIGPq`CH zHooSW5b;5(IMoG|ZOVI=?!Gre;hK}gv>DiXkrggPb8&xQ9frAP?d<(S=&?gfKx=s7 zn*JeJl^AKh$WV<405ZMgBJK-<`ymmwB>BE33gU`P?Zryqu~(yF04I!*1||`I=4}ta zykaieGtoSZZ=1~-GTE$h*Au}CuqSTDfcHW)9n+>tV?smcmjm5I5)~x2+Ek?2#-3!q{PS!+Gx4PXY_-~ zX$OSNCYo|7iz-Ag!o$+D;I2PmkpqhB{aL4LDCi5B`1u52F(CrAn-UeMr{}ezcmY9{ z0kJIn9K+DyEb8BT#w6T#fi$GNS?3rKf9oyF_t+=e4$-W8;_9d|2byUPC9rUCyX9!YOc7Smqj9YR;EgxwIm$0jIs z(M>E`6)}jG3uf8;PB9&YofmxF%^$gk&UiQAp@yR>HlJlTolk-olfjC&R>DwFkH zMt6kv6Y%e>b>lNu9`y7HDfuEek^dw(F$2ec5?oTEj$AK2O30uljzXM&m0W%87mY#_ zG;Yl|h0TZeW2eJrXA_)BK zx2>^_00LO0Syl@$v)U@9>6NQfcAU2!%X;g?Hww zxcl-r{<1*+WpGHjTK!eIB`Mgg(jyGzq%~Y3JbQzL6sv?aoXo4HgJsRvS*&Ix;Dn2u z$-k}_zTa^rgun>O=h1mfO}ciyZs|;>X(uw?;t62?DEEVX1wQnTxKaN}tlxFWmNU6L z@H_;cFdHS47_g5gHc!|9!Pb(jRxO#@eArW|a69-W&A7lI9>kow6vhTpyIN(SnFXg z9-t(WdoSF(VJ2wVKws_d8Y2B&yYf}F^e(Nn3-`Hxr4VH{jmpr5Im=Jy{#2sz;GlH( zftgW?ZKmD>q>UF0z;sd%&Q4KZwQq4g5U6q_{j@V{d9HB~Cz3@o?SQSr4lyjw*#XJD z8$3;DIZU}IWKH_eURgazjYMsgIWB>o9lDRf*Oey)4C43ct~DwJbVF4O=59{?D>I0C zTbCCfU>(Y6s?b4Jd5Q=SmV=kqRju|aLrsf?TOHdO35&5ieuu23fin@sOD~sRAMk&t 
z$s@n9a*br5N6lB7sD7aU&cCOLl9Pj-&A;QoKZUO1GzM%QRo^yawdYdNXn@_k=pjnDY&i{3Au+zBB+)gi8+!}NlG>11$E=eK zqbMjKy%na`V5oLE7moh~J5q8dwXp-S{SgEvC9y7*xKdea{;l0N1GQC`<0S{fSYFP&rV|6$ms7#Ua z<{rA!!ScYhozpXZf#-N)Rct=XOf9SziNr$6RWzw1T6C@sX5892(O$KB7cxZXNV)0X zOJ?PX^=KF17(hlOz5~i01eQ{z_L${xJurou`4{a^i0-3qw<8~erZF4n?4u6b5f9ex zSb>9KzgBejedeG;^!l*NV(QFPVK> z`R^QpvEAO=pp9@gKIflE?~z<=vq)%Rz&5^WcoSCp`~&Ze1dNq6MtU|qz9PN)Wr+TF zqy;Uk4QxFuOdP(<97i(~Lkkn9zdX+@FAK!xHCl4WqDkJK6Ef8Mj`|aFesSjO^Rj7wPfvkCe+-PMTsrv zIp*^sHJ)!qkF&R4kEaLEMjJ3Ii!ljobQHKTm>@^~^5beCg*&wo29o?j3R=pc0VF7O zBX&Ti=BR2l^e*k3ZiE~T7?Yd5)8a|7oTwpU&%{wzlqY$x#|)@qjm)SPk+AZ*Jm`uA ziaSa{>&OP0s|gj~xA=Q?W;Wb3aXdgfde^HUSR_e8K977^DBZTcfwdyxXf8{-^@+rL zw%0aS3JX^8&b#zwQK$~^2iYmHI@|a0@3NSNm(~(&jq%~P;OciS0UmCx>$n7L=;Hgq zuIq5EayviDY)mbcsFI$}f2b~{4cf5xE*W!n&+xBpoupxX{3B-2Usg)w)3?g{ub2sf z0RS-mYjuiP*qWPIe3|b8Cbq_ZMNZbl$-~gZ$-%_J(8R&R+0McEk2z;z`v>G3RW@Yn zzwEis_)Fa3O)dEX2ALUVh3W=UO=LTCjJ4=9>y50&qdBg5#Z|%ZMYoZ+Oh?m0fWSz> z-#yssB`|}#HR$00^ekz;64ZD@OWiC(0>HDPJ(}qhGxyNwkrXo(_#wB??QDD(m|T#t`A76#tsU;_agfDu0r6gt zn7Q)^kyOD-@*wOWWGn5zG3%AqG zS8R!d({;^KN_ILeN&RlLQ|x6m5u%0z^CXo@`bwnscGRPmo$#m2^WIb(+&7wms5`vu zY=SgisYh;uo^_LJr-%vb?g`)*IjV3vcj&0cuKfDz;=q8e?j)0 zBR1XMqjkB~Nh3uig`Ai_(km1N42ga44VfMRmu?r@Va*DQ#XjP^V$Fvq#YJaCa_k~G&+8|MICIO;!-mmT#_idF zdph%E_cHrjX!d8}0SiBTy+TES(B;^1`B2{D?sGxt)a1DmS0;ZM)+h%uyaRYOV**~E z`aw9?ox?kongn-@K@bFFVXn?`FdyDt+$t|mQ|M-Y%gUUoBX1608n_%IGK)z00b7@^ z+m;^8;m&llD8mz~u~0b*Tt`Z6Hvbv_<1V$zV(-nsL3tl_7Rh%X4~D>j>hdj`w^4hW zQ+!tfy**Y%=MQ`K@o1Vw_F)Ge%H+5~k9(`JHn4S;z^vQl%xIw%p=!4^%!XjK>`m}s zzMOpvs6neuII}{W%t4BZxR|7|xyNlK0}Xa|2nm~9OJ=C zflo%JtUbbc!4qi`j)-tuEf1MG=?qHb2F^9(Y?d+TmkiZ(wgrdVXU)qdCg&bpyOeM= zW5N|r=;#=ye8TA^VCt~R;yKSEG3RX=aZ{Nlk)*4(Zz}uSev~`-P9ub0Z6?C25yB`1 zubWO3@C_{MF_I?FV6wXKS+;fs+@}kv9BVkeUm!xH`onQ6!HjqBl3D@!==&BYOlN1a@iRkt$49bpa-m+LAQ97@ZfN$ zGQ@(%La2GUG-t@c^%P4*NZl>Waq2S78b+n4t}+16r2XP2D;>DLQr&*-bP#RLEeY<- z<|TUC^~6p+be87t{CDzqM8%!2*U_VuhatmOJ2@}bcEyzAPD-_}S@JCrO(wBKI9s}h 
zsx4Bpy2}BNGwgWd>?4U@x5VsF#IzP{wGCvA{@lDF}at!#1aICZEdhucvZTm)tG1uHS-8PAa_T2A-DXsZs7J}_7v;VAxKHE7;&CzHiFWjv3YK73ga?AWl=-^(? z*RQZcIsK~rsAt)Yc*9yzdmn?OrUL5K#>WY^rj<8kz}~~1mNsg~pRyZ%GDtyn4DvZZ z%p?@8EvUi%vCuw?IPvseO;Cc}~vHTDwaKl!tF zq8c*_s3hrs7!Kf8*MJ7zr{atQVDn(?WWzipm-jko&3kHhp+RtvTP%on4DhDzP^cQ4 zk2b%m(2yN+L5%E73ScbqcWJ%$q-)VO+iFg%cl!<6{D>a9uTRxq??#BXpNq5)gfuz& zk$i`V7$;X;85iIsTLANXC4ec)N`Rrzs(-~#??!;ZjR1oO0R|5e2o4ws9`%~asDDN9 z*XFmW%8v;Ke^d$j?HCp&(cp@o(TxC;I}Qd<91NZ~5InvIcx2n(_=CVLRpDzz z+(n&W>k!m2+}DHhxQi;m)~1^^0bJQ>8xq2>X1!6HQ-_I?Hbi`Gw;GVuw|zKgz(W|H zeMDo^!kug-XHnE3+Yinap8~m$&{;dXQdi4jTBDNgm5!c|`W{xNsWa)%y~TY6O|M93 zw@>-UNGt9YKDu1mrOuE1=DYpFlwmHEy6^^2BO$EMwinj7XS;u=6tgu*2Es2Dh)M9b z68s-k@m2eaICz*ioBfyFd9I=H$F=gQp8xcF+YE@t7FW>(CCN*QZ=gKBR;6*PQL9*T zisgU0&tfTQ>9^&-5h@~9w4P$CXWF{JdEJ>KHc28wh&OFNc($|B1$E<#B#+hFj2&8I zlewH5TB_!i-uGOQBOpP|Pt4`1+-5~SW`PMaEjzIIP(v04McUEC@^Az2!flO@A6fFzgSJpz9b=9i7gQWYm$TYL zx{X5wmsw#I&$Aw5(xdW6ypBNIMD}{iokO9izzao21dU}52AyS5;OqcqQTfiIOe!27 zFwM=2R%5=b!%7U5<|Nm8@;Y9Zp90r6sem_~jYkgH0B%Uk3^ScuC8%`JJEpZ+IJhLG zcKIUgSKDYeUVayrnUP~GSjq3IKFC5Fu&*c=WPt)ryA`avpm~^8Dc!NSOm>^6sXa|w zt<1Jj_v~}=AWK^Ac9}PWYDNs!sr;RL<~K7jHzlxA`XYBr#fs!pij3Be zNN6p{&)r$gLeol;pRbFAaGGAHSs2>d$@)Z-%^R#OVvN0PH*%Oy7onZb>~A$_?8Ec8 z#uZ;w^6<(~p>gTK*HFSRs(y=Mf(6I8_{Am5Ip}$p+g4ltT%EwKUZJTka9+1kN!REa z?nx@4sc}VGikI735oSwcg|(0ZY7&s+u)noPQTtZiC6UB}s*phLbMhy9L=YMJM~_kT zo1~#{sggoI;(jk6H3899T>3vCFZlaQu)VdH7C%;WGtMpJqaJLi<%B1%H>znue(uP9 z_Bln++i*U7%CfgaKHhsHo}?(zgr{qs2lKe;1#_x??`F(bO@nib;tLSAHAR_T1-hM( znqI`dyhKpa-nSMfBAT;r>-Oq7T+O{>mrx8_$9{C4hKs79Qy`cM$d*I5bwjbzj~JX* zNn`-4b$nR|vMaFT_)Obj2M&c}Anq<;`uQ9zB)^{E5$9RLbDN0jVgR-5oeZU@k$$bd zTvIjD7vd~|w>qUN__{ONCMq8=SBG%Hp||f2bP?xwvS{=3J+)+$Tma*NIE~eEXi_}h z%RDWXa)WHEc)lB${9EzjQ}3mPb&SV2-WYZ)lJL3*c2di}zNO>p8CLH**$2z;1CTe& zJ4lQa&v#B95i*|bEjT37E;uG<@O^9U*5+07Trodp@asCMd@*hODRnd@b~~Ai zc;F2G`@ChC3S>>Dnrtjq;8Q^TuboepK*sSgSG7GhDI})uEXVjdTd@w$r1V$i?$IBv zG9L6etBds=7YmhFFBz3<`@oaSqN{rS_(`AJzO_n`@9flpno{+?PTeUrajtQ<(B!!}u 
zL<0{TTwF|CO%n^RCgYTk1jI6KL)W@(vY=1IkmT@OlTB?IYId2Am%5$Jnxjn)NCMK7 z!W05SnmsavyH=@46Usv~Z%w38vJ@c0-2`A&Q25pzgCs3pZHc&Kik2V0GaIhkJ$0aWBn+%y$Qt}bDwI}z1vqK8|BYEvb*P1oxCMhO=?s7*@+Pkf2A1Py!9-gHweY~a_SiW~Pw`P`CQ3R^r zdyW?$sNwY?{z1|w4jFKYUQmsTY8w%TxhVOYpv^$D@Uv)^KJB_fb62!K_#L*kO~E_0 z_&6kot0f8_1>+FycKdq^*)h?Kg9!ojt95tR(|f(a8BBfosF@vKpkl1EBVZLBek1Y8 z%C^t_@<};J95;RU#D3RHvae=A0TDS&hL= zVFvZNJA)XWARu?YsjnZACMJgWieSDQ1=gUD+z3YSRJloPQ^>~QgQv#;7!E=(%n$ri zKMkKfu|J`>B+bQxN(R#&z1X1uyp$jrovHmsYw*5)i;*_R`<5fjju*nB!4e$-82V86 zHS&!OtVx{8c`td!i}6sC zsKW1>!WACu=m(gZ2l#WVW_IwUCNwvALYPvKx@af~+iz%cy({#oUU-zzBWWA`u8{q8 zu^qnHP4_qo-%qVypT~p8@67rKQ(_h#azvAz3y+VjEhbw&^{!mLf`G5mV(bsoPgY+t z;Xj;jpZQHa#4iXkzy|>M+WvCB|0!==OdJdy44nVFeSfZD^T!7N;I;S!Ja`oavY=2| zQSM)rq1<3@O|eV(a>0Z5=(|uxtG77#EpiFY6vE+?RwZR_xLrHBUUj{$?bGX5c_1X9 zH?IR8JJo>k&>;^jhFpwQe~HE9$a z1@RXOfmc!DbGt_qws7Z+#-vcN@K!I8C5*gE+^C5rzh3l9G2=^v1H$%F3jPi zit!XUy8u~~zH=xJD*RD68!0tq%Q6gvPtS1YEgy<(`!SYnIX3Tq7=mL`SJO)?>(GL)=ws-*{*tO$l!Kq(S0L}2FaPBHyy%APJ84N4-Fc@gSqwWC%F=Na)$J?ShqIsE#p5a4^9n z5fbfGlCAYn5Q!=Z-~?O-8tA(eEyBdwE3%qV?M=L!m?!61U}Rc%LS<=Z0)fWX4Sm^h zE@fY&8}_RP&Y|8yx4Jk7Cm2qPy`mq0F`TvRQgXdG=Hh>*skGbeyi(40c7LM^(>$Qt zrucw}XWwZ9x(&houmFC?Rg8d6gV!=P*M>oSR!VZ(PVaND<=7B?pJ&bN&mNP z{x_O@jX;~*SeX557V=zG#_>?rOeHcUhGdHJZo8MjR`Be zN2;r%7?#=XD;q*_9E4XNU2-1J^RG=tjm&@*YlYnkmyVIZgK|KmgYcd^Q#-KZ$0D2T z21~Ih_<&?_Bgj`2iN=veN^0^sMPtqFODx7!VSxG&fWiHf{AL>^DCWsk@WK(6h?X{N zo3hT`$$sQ{E}lQE>)n=T=r9^8a>*up$sI$C_7LTyh53C8qaz^LrJg4oH@Y?F+d0LU z9eBrbDH&)9;Ii4rg~uKqn};hoX4_g)G?Dx=tprGem=rQyM*Qnb!v+;kK=XP$h$C6I z*5_-hrxRIA4v)$Yaft2L>_Jcrde~8@l_Jvh0C@?dZ5ga$4GE&xO?!Df2y5fC69yS~ zz&B!p#fMRz&9#41m##)e#Y0(i0OdvNwhGypSHM~?`^sr+4TEcmcbP?EJb?d-tMBQ? 
zn!8*j40+F^*?%jb=@oY`qcLh){TSlj&rUzO`#2hWSgc;|JRev7eSs_tISh~;$|?l5 z3z)iMo!`o}dXr=_POM}uqSX72mhs8cKIREBw$k`udr9J6wz{69Oc96l2BFfKTTmJy~ab9G5d-gJs3Xj=FCGmp~kBmEihM%7%s8UYSIi=(6GbX`54 z#>(bYMnD!s5{oj3omJ&yK+5RxF3+L8l;=AZuVL0PWDM5o8#P`wy6^vL(^8(va>sx5 z-tw6M0GR)Stp5K-aL=_)?dJGlKEIq_p`^>{!x+3iZg^&wK>?IjhF?!=}B03u8v`5?ki+eWqmCNrz?o&0=nUWk9|l zK@x%+6u96Ez}U2PbWW_@a)*BQCGPB-REO{guf(2#l>-RUf?In?J{DR}2DhQYQ|XoA zl>rMBi*M1U9>Is+^;g^nw=>C(JZeebrQ{L_qy5I++$o@1F4;{-%taC#+J6~^^pSB+ z39wJxKi#DT)H8>2lz10A`GA~A_FTwg7L1{QP@3q3nDoR%d{a8g7^*1ett44VyZ|9N z@<)n$;ExIl0gRYdA+{q)FW*#9Sh0&5+Nk zmnKSEvH+J&ke(}^eX2aCOjnz@39y}$bkv|3n?m={n`$1^%50Nnw#VmETRFgt^w+D` z-byp88g%P|>qLy_DVWPwY|e_4tZNi0`rgE89FOLXHsv}(j8@;g-|^eV*|O%oxU<6y zDR>;?)28m}_TvJKb_K_F`i9!v3mU7=^&6}=9~jL|l`C*7HWBPf><`nmfDj!0xDY%# zpe4bjs^H6Jw9Y-v<@v@dwkA~$pX?-b4d*aOja8C+A+L19vsQeTWZIvU6yNIiZfoNx zg}l55kfh*rNKz9Y#`F!tj(*~_pQqv)Yo_XzG7Agw_K0QnH#g)bFiX~uVx@xgv1Q46 zq(FtO1mVG*w7G6F&^*5YxLm)$`{)8I#Sv+ggB7c*9u)Z6@t3NgYT^G*QPX5}HbEbk z4c1Kg9)~JTQ-{QAdnH!WiTa-1ZOvcMrSakuz&;QxjZIW1SL$w8=N>yywKKa%J<^jt zQF!Px_F5YvUBuv|l5T?8zX)j-{ESzXa!Ut__N<;d6bEq0*UhX~+Yz=|4UX)6Cgber zyBSK~J1XJdF0w^@QKYN&s7N_%Vxy_p!-|OtzGf_sGu?&_*wHbw`Cy4~FL{{lUf_ih zs6~vFqYd%0A3sTk9^(@|w}3Ypvk+yRAi5X*kS~Ie+6@I02FcXC%flyYlmJIxGX8de zXl=dh(Uzupfv}cd!wGyIxSrvNFO;Z1K*84E8idKpCZ{1m_Jj`I2E^h3x>{%oiR`P5 zZJPA#9n4HP(nwG)2760)4p6Z0u1uKP!t6r2QWS6i-UDU{V@S^7jZIVgQ|0_cZPgo$ zon7z@r2se(qrceM0+@y;vZ@%}X?T0K{)p2P4c9m^I?9kI^5U1-Oz8u4>wUI}>smd# z%r&(v2X`bgbVDpC>^n24-2QRvMFWjjgHQ@hF8(#)gbQr1y2_Xcm5Q++LD=_iIglD@>naSu^WD&#`}+| zep$c;NhKwj9;jz9ecW_)Kn70l7h?^;jO5nf#ubZj!}685aSCc&1i%^7WYr{hBUX^d z=QGny>d=^cA$PRaM+t0B7}G|UYt{A$PxN?IzQN5B$6_d3Zh-J)ldDlmy&%**(!%MlkeTJc>RNLv#Vp%8vz6YW-$ zcG817StaeiPT^^*a;OK@_~z7AhP-OtY$QNqQ7zwGKUyYE_5PL+e>@RG7}!X;FN`Po zYT^F(e~tb}LP(i7*jhOlm^%G6Bly+B`%{awOT5C9Z?TDsyQd8n=U1K1C(9Oi&U8wX z;z$VZ(y#gd>hk#wP(%T{r_E`Ctf*BgadG|JQhzxxlHsamubdVTXVA-){9b0N;^#R+ zzwa~j;^6*PXVT&My8E0#^{nXu*k93`h_pkX@;x)`2||=QT6swFxjIo~hdk&Tp)dll 
zS_}Ba`rx+~FK-eq)!g}g=SI$q3Ehd4YNQ>tEX|23A3nZQvp@}Yr=omtJ$_a8$ULJC zcz1w2ARnlJSWxWJ=Y(6c4&rR#@6PT#{%0pQtwDMevbN6lz;{s1NcH8=Q&Pe6^*F>7 z_$Lzm-}H1VHJZXQl$U=Fw{k+6a3%YE*&-oHOP85D2fA281QjPP>_Wxcj8O*RPW{SQ zIsUB*-`Q1HxfwxzIQVLHw;P&`q*C1FJ1(l_{7yg6ScPfhKhEO?mIvwu3=0=G2!gst zEu&1KuCB$jtgoPDrmK7hx#z-quv98wXS= z*0n+#X)iq_yF;K_cx_nk>e}_@QnK2rWlbY@BxM@n*hjqe-x4&fvN_{7i-I59rl6_I zX|Pg;Y}?3>V8;5opCOH$A1+{b&Y(7=kC4#~mo3l<2xfmguZrPeGBQT>a$i`Z-S}$Woeh!CF~@zgqbC>IqSIdUvs&wRwFcg*g|)WC ze#+1(3+sxSDgVhictt%S_y*pNy}NnOZb8i-_l4Sweb0&Fs0J5aeLzA%ok-G>w-)pa zgWBPV3n+fLVx`kvibdPe?#VRvNZ)NmJEo!M!7+{j`do2k!Iq%HW`a6#6ZOx>F?kG1 zGuQSk0viC!_hm*XzLoPmH-IRApu@H5-E$E%>IZfrSgvJxqmW2f64HFf_XWYm8gT^` zWlw+G*!^I7`X+0<2@x+Osp5-%NR)hwWzwyI5Xd0CsNyyi>I3xy%KNCCK7#NnI`9a~ z4gL4n^AgA!#J_BJ!?1;%%u>T|_NUBLv5&BVv4u;L)_VSD9-!u`v4Y_v66R+aj^;USGcvU~!Lpo& za;aikf7GiFgwsvQ(Ke5+JESP?MPh0uMh3T&7)kotGma7t)WLxEx~6B}MdQ6LyLATt0(L zBI#qz>3!Tvn3?h?73z`s2^5w?%a9(j-q{iIIrhOm?6a0a0x&n>APyV5nx=JHYBm1L znFgo3__;Vui=%~8{eaixs*zNAIjKsCCmxnxwcC{CiCMe_;6Uza?p3 z+J22ue2M8x`2#Lyi8fLZn0ZTU0(AitjDL(#Xyt^m?F_J&d?r_!a8wKZ_}L(170<=x z`up|vwuMetXF%ZqDF_DH$C_ixx*VuI2SyMNM@x@4hxfixjV_1V$DSl->;uA3)j%@5 zCWVq+M%W8TC|UGmS@L=63x?@~03E`<+-V2+_2y_Hhvzp*JT0Ok(y8)=W2YBuP0!@8 z+>_N`O7AWoof)vEJJU%KcwRkqy&5n>i>Gn11j^lQ68-&erO%Gb2z>_n(52dZ?dDy7gXmkdAD%YEUO>beVTm%VFZkT(I-W@d%p#Za;j z1&O?zs?Wo37&_Nkb}$2#$ZJe`(40Pul0PNi*6&6bvs%kEOOrQH%C z8ckbGOt}Xv>Z4BLMJVE1CJI_rNak=9EG&ODMIMTemLx!D{!Ez$+hGnZeeHMX34MElwar5Lcl)oyUT~ z8(UQ$JoRQ##s;W!~RNYbKbt1l!zBfo5)NX-)c3yG)kZ>jlqi3eNQPJ-Tl za0ceV(R*#}6uMUWVaOa^X+=ru^BZ;X-b27W`d))(GF!+3;&0#GGSD7<`3H+5y{N~5F#z$oyXny(5cBU#`lRvB zTdXgpUSUYpIVj?^_GNan#i0IzVzBr!&;5$#iysNQoVXi5=XRorrfB?^#!5X6qHhdP zY!SpQ_XXqN8s>*Z^e3qNcU?d-SY!T(XUu6gU|r_#KpL9dn z>qA9{Y9TA4p%Usv3_lscn$(5TQEbg_qNicj80G_;tgbljTV$|K08*MjnNWSuI39F~ zhPw|s!h^)EW&*Bds&_R2%|e2_A){mEz4YuDT=FFdb{R!j?f#ufL(i^!+osxOWKf_{ z1a7{oK7!wJ6Sa(@)Dpmg)2WPu!!cIwWEEgQlp~Q@Z53eaITpb}6`1djQLR;MKh6Hiu3Jow`j&)%fhi%ySP%>ohPt^M3N+7H*wgsOuW$y;C{FN 
zZGd*NH|WZPeW!Rid8D_685TuQZw5}3BV#98M9ue|EsZTEk zI-6iHsf%?GEqk~Y&Bvf)nzdWwI8piW6=-K5!qeT%K5i5FU%tX}jp?v~i5YN{FJk*J zjize{0%luu)G`kQ2;FAz2ourFvT{}4fiXXcI3dpFl-#yxg?uI9qGP8E_^Fas>&@?d zcLL#pFgICXpE2^ivHb-0$vZ%x43sJ5Fwn&I9ng5m}E& z>;rY3J)F7@X1J)ai^!FlGG6*x5s~{Ij{#EThGx7t+Yjat(R1L4UtHXyiUQB!+ns!V zSmTLGC0AGhlbLP%V978BsjZ67nkSRZj7oQ``^@wk{ctwit(C@Mb$3r1r7N~%rRxRB z?MUq56Lb6`-Me50Y_B zX=I6c;iK`%$??gtdL`ZV)|i4pR0vGcw+0IyD>m$Iu@UzD9*f(xfwdB{iT@%W>T{FZd%fy zMVdkX)*uW>-mulG#Lt-oUon)?Zk?Hqtkz&dGap$h(*kF4Xzlu+e_dDixL0 ziz1>Z?zuWIN9C_-acJ_^f3@d>HAHS;#m8$@yiFKo&{sP2RjmYSQo8XdtxA7YD>wRj zA?`hs37ASpGC9lq-a23EY1a0a6|Hxo8H6N1I|u_hg|vLGzo5c_=oqL?h~`pL*^18j7TlE4a}Q(Yx(zqj9^zD$ua+&J z@2Z!tX_~LHjw#`cW`rwk{n5I&VW8c1=)*W0%UP3{_)9%@vctM_HcAu~7m!JXb9svD zS+r?|GP(mU)L%qNiT13$nq>R8!`T0t&JNcSa1mL7APdRySqCC z3+_<3O9&p^f=eJkfFKF(?(XguAXsny8T=Z`za9rqhwQPe2izrAFyz4n~X znse>^iz30zf<5gQPMRufykQP#7NY|p+$%T7xpie=jU&4l?iZrx;XnalBnAoLNUUPx zOv}Dz7aGtN3xbse&P9r9;Z0M|$lNEcwi6mjoErZb?i;pt2{m$htCtGlNf;T}d!^EmC+5}Lmt$|o2-I8V>l z#&pjFI+&B>nD$bOSVULT#5sT;-e^oht2ubtpYC6+uP)?dGtdzP9(mS5k1(0;S2T6E zVb2pk;am@lM}py8H<>uH)T@pucML2voFab-9r=TRG8uyc`%A8jIt9^19>Z|YljDKC zTPZ|Of6v1m?GGXa)Y^4CAV*5Nbv3;nOUM3HsUUsnpeo8lQLMTW(cM7tq5uuaM31u2 zW~2v}k(+mdtFL&BrF5Xz$PuK9J9xR8$NP4J1L!@OHf$*kZ*;3HOk$JGG>`{eybf>9 zOUKrh=8bR|5&}4i>h9==@^Zvs^|`&6Q0X}^f-1x)o49c8j#+5Rdh6F8=b)?LDxbgs z@#e3lp{p1xTL$1}yIu!+@A@f2IqJCZLJ;kZ8ar&Ok4PvNe!nKfkEM;5o2FQTtsf;~9o)Mp&*N=Nf(z;rmXhv`hx~9L3HTafyXc zxmhrHRN1v_w2RZk;IVJ8R~t~&s&7|D>{T_2MH(8`Sj7W&Cbsk<6lGk@WInbPOn%th zO^xbH8|;xRItjG{tQI<|iWWOL)(zG*)HYTBQj!I(h! 
zPTO9i)w@*3&q{@>zg@Ov*9nh#C!7y(%qjBpKto1JI|!-Nu~rW8TgATWkthL5aErcQ zcPynKN3l28nYFuT*|zu)S>0G$urTs-XRprg>yVhM$p9$d*RKhxH*a2X9K-67TuL7(=LsfKfV%kGF9ck%YChkpZb&q|U{+o{`6%ee1U+P5O2$axAt zg!3W4Ex%`gixMW=1^mLkvL=W7Q!(Ov(x)vhN;i4W+Nb%Ww#;jR&x1M7_{YFd883C zhApx(QKu8LfFq{NAVCV4R5qx~=A+X(^MI)4L{Ya*$%>#IUhJIjiwLl5<(67nQX-m3 zr4{ZF=*e#R@wo-AHqWW>x>4>s*Cf9P_{9z+(N`pauJu)}js~EQ+{Qk8K!*Oz`zX>p zR6;0=n}baXd)(T_N5tjU-ak3$YtvZnX4!A_GwA24Ol+2J| z%4eensZ6QHn!DJb3FCba zhc5~a<%edY6+iJ3MKUxXX4YD#lH3vt?(yZ#MExoCnqv00oA;XOokG%2U7mbFWsxU9Qm`PGQo?oi3aNtx?`&#(+>_P7AS*LKaTf=ekvx{DASsMnM(&Ws14=2G?@?1-s<^fLWJufz>%X5g# z^xs?0;A*cUy6D_LgeWohHGioW2GSF3UNatsxikBS;q!zs$1tZba8t5&6`Mq!&w?JXjmIMRh|~ADTW#2!flw5(f#CG6edO@ugM=71rjt8sc+)q+jFa zPNzts_*mYq3VhdR+8dD2beu4Z-cT{K`a?iabsq?+`tm2fNQ03;;IK-2XlD1-+ zyz2dh-KD*h(TbyJw4#2;*V+D^KI~Z6k915`9!4s#1l=L9pv7N)AoanoumXSCs~p{I z|5)G^9wRI7_h(Ue3p9*yiNs9a+-=Y;*BVV{HHHy_()pAYq4GwR10 z7gC@3PD*AgCk~P7eRNQkXpmiTYN)LZ*jW8eYOeF`J8HaMtH+D7Wjo_fwQb=r@$w{J zbx`8xC1|?e3kA&HXPlsoWwx|2>9o0YLLa!i9$V`pL7+_jOjloJ zq)92WqPv5dLs43yNK_T!QKRS-u$O${i!{GNH?05sW(YG8b2B>V6Pn{lyt)!H z%NzPCv&d)=h(Y=#@&pQ0JJ`ncdw#Nu_1Bu=j7j}Z}~ zv3emLm>4)9^{v7LmT8j|7r5O6P`F2?vzbNXH>{L zLS;bNAa5kDYRFYY%WEI3-7DEjt=zRFXwVY$Red8XZI5-mzvXecN-;DTTWt@W6rtp# z;zT%6h@sF>PLzN6@#=CcG)VAtX6)&CM(S=?ExW&n+6Ts$;{%BG5@qX<&`gm7imdAV z4mjd|Fjw6qfsk=8-SoEE8kM%Ywyr*Mv6$RctEI>QA_3vz=9rTr4g2Gb4IXn;x@sNj z>eOc&%)RI~h}bYO#gwoV<0~hVb*Ms$C(D zjVI$p(wh!Wzst-eIge?d?uo=m43tb3s`FGK8tm;egu4rv)bknn%|{kETYSX8-`>Gr zl&tqmrhqk?n6>e|8g0&_^ySP$GylF$np5*(i5L5gD7y!Nk?=|G%;y8 zNsi9P52A%HV8+>!0BELb72RLa$Y$>Yv)z=m(uSRs@{2-^Z+{5w*HKMn<+#Vbg}XS+ z998>_AgJ?so}i!kqvQlV1RFRLlcp1Oz!=;|vH?JRt9fNNnbUVIkv5W8hQ-yWsdTeS zsnV0&X}-z`p)RG{$(J`?_1wk6YC&t0P=2{BnIzfp=3jS5K-1)gwNd>r79;%U;X|1u zc-SA_&%saWchBnUb=}{;j_WTEP{l`Pcx(iy6TBCm08FiCJ%*XMP6BtU7 z#P;UW9O)5I8ekXj<6QRYV`|juo|cAo`Ft$%qjoLOz}k4SDe^YU(9BOk~d1@ z3HK4@&iKnv7XRSzyRokcXj73WVjR)tFNM#kaCVterV2%10U`uDvagH4(kR5_$ z(S^E1?C^!Vov}a>d+Zveyd)V#yUMxzNk0A{ave~oIU{jC79@#W%eS%_x+V`}JYdW$ 
ziDnPEmdg3$@E(z@mIaD@ilVE$SRTn-$KAR$2aAH|Ra~=4(MOL!v0GF0L0bq!d#Ooc z3U5SvxGBodWt^Q=hICoRiU4hDB&INdXXu%N_)>Tn^=b!M!?h46RLu4GLIJ#%B?Kv_ zS9IzDbG*Z_U)MO#VNU`R>34WP>xST*ou{jkq^-)!^eM%4ldn&*T2dZXJ0w>09=s6Y z9j(SR3PLbideJO(N;Zn!udTrwd-5e=1Y8LiCV7qsFonD5w6KK8*1@QQsOC4+!8?;d<+jxDJa8(q(W(<2=l9XxzZ79Hs8 z2Y`5b5-78Bg@Av7fIrM$3TgNh-W;3b5a-by2Akv0J8sUAY?(CxLh%NbcYMQ)kmcK) z^a5?d-GohnanvpClssyzo})Wl!ieJF*EliUKt(%ju?~G~48cIMQHHR$de|5(wsdcN zAzGS!^_B$~Y>BZsP!RB0usQTR&sW&6Im{SrhxL|UFJ}6(fc8gA++P|9C5Q`+r2T*2 zkwT7082QLLgW4~mu$M;KT^l|$CXW)lB6lcN7QJO*ojt4WVdy80^H_4MdEs_inO@ox z>^|9Q)rG~p_1#OG2AB4Gj5CkB#md-oqYOe~ZbDw)qwUL3dE|UA_x1fTv}YNFRo5q_ z3wfqMKHgy$o(O~PACq}`(0sgwC)+*#^85qsc|^&p9*!g)j!`2HE=$=ZjFF9%3Qhg5p*cW2`)mUc@8GqJNM8#XGgInc z?>BKo$0mQ}^tAk%gWW4>4(wf*em=LP zU1!;4O;LYc2UZn6(wPEn>N6wL;r$q-G0n|UDowV%AYoj7#V<|n!t%I`)7E4+AwoG=J-UVW=lE{IG;DpIYZZ}d3dFM@GFV_P~#8U0kUjBM&~m!R#U zfrJO|MRF2eQd4QU6OP~4*Db+tm08;KsT6UBxvBMffC^==Jv|lHwf4=K%X`}_^)o_K=*vFon`M%oIskL2?S*LdKAaIpg zz#i(gXXTLhPhlKhKn)%WQ?T2D-7EA|{6pP&YhzU>$psKDS=@qMoe62N+I$0w)WD3mfTOp?p~ZOo!@KVhG61$nQBm6 zvEeIB+#`*&v!|`ZoZ5uKjdb@utgrX5P^csuy#ErpsmfQAd0if}q}F$KT(LNYGvyk3 z92H`6zt#kikckN7!*e&)t;SuurMVD`|Atcm9^YD|noqXD6yHia)lCNVokn+rJXWKB zHRe2Bz1Uc9cfhO895ny|2Xqp=*TB8g{=zx0)KZp7r8ykc06Lvte^wzraIX?^G5kPs z%{Zz;W7hoyi}gm`;;; zI223)+l;4swuenRYYd*$LC`$Rb^Ta^SNcUo;@qzO55+)MQ( zNCBrzo0L2oP$K<)6cwp8G2dzP$^KfTp4W@3+~#2U>?Il?zc10U(B=U0Ox}1!iq)c_ zpk=iqC!Uzzz$SWU@DGe`h@+H=ZJ^uI6iPvJPy}2n10>N@@+XE*N$iR~sXif}8DDqQ zBMUFziYIR3;)PbHoap67ZfdiWEruXzZpl#x8=2tB=D^|XzN0q!sFih2BXyG7A=%L< zkSd+%u|ogd4#6gx>+SmYh($iC80k6#a)`jWSlwg}?n%m+f!QEDS>aG(1_)H_FM<8U zA?G)whqBXH)y_K;oVVa26s#Xi4v9P!eT>N!rxvxdML6nv7ORzjiigJ z&y;=rl#1NF#?e%!kQx5{t~ko3~r*Y_O8eT5CmqWk)k=1g8S5kSkZR zgMqPz(bF_e+4QrZ!VYTTby+q0w=kTo?#$T8Vqa+GJT^m0^(3g8|TxFT4{+$brPb>dnxGZv9TS}(yYjclXSARyLu4a3B8 zy$ys2G?UoOinPfg;_WMvpM>x8Zm(~bSo13E6~o1=yw4+1fp5Rr-|SWx^bmZ>U>m;( zC>nhNjE`t3VEryvJQ}UYibia@Pmbjpt2dcDW0r&eqytFpG~%U-;-3e8;aEOjEnYF3 z&q|(bYsFG!{_qUSTMS654c@x`3pmWA}# 
z9AQ}$dXaHvsJ(H)EyhyI3%uO@3*pkOt(iJOU(5PaSr6B53TsV$L)E%>kdKG=nP@nu z6OcDbL<$THQIpcBm%3@>B_tR6--boYswh%r?1q9a4-Y^i-b}jh`9w_*oE%MKH;e_wYm) zdu&Kl<9(J+8sX*X^`9nk$F@Vw*F>q?l3$g0R2*asx?k$4;@2uM`q8VXK}s)|UcC#2 zsfM=e>9GCr{;~_&oGD}lorRjspr_PCOJP1iDZA%pYYU_E*GresE`NgGs?)Gye#6C8HzHmUWTP%O=CzSH_bG~ zjp&@k7PX5?f*~p><@j9Y*w1n`NHIC~YLodzR=LiekTllT*i%~Hb3r*H=;x^l@@py+ zs2i*-H7!?|JJ)=qE&Wb{IjCGSM zdzLJhM!n)Rs}ijiO1)CSWfaZjhTgOiElMMH)NQw2#f9X*F<%ua3s7Q2bNLbt|6?DD za0Xvj;hfe87n2F(e!_9*fp)SV_jvNBV9r60L|9i3q@&Y?Ooy5TEje=q$(&zKkCz>? z59Qx9^^X*O;_5v4B!T1R_r=9EVNFUcm(EH7MJ$JyPIz}qWgFXaA1735B&f{DK1@^@ zr&n%GCizavlmR=mUIEpZa^JisE-NXAcxJ;-qRB5>QrV!2s=z1PH}U;f>9D$^i*#eD z>37{PR)|(DfVoWlj#$%PWN}DGx%+m(YlSa!6Vsy&^!*wg^7qEWR^zr+8zG;q^7jOi zn{mHO3?_4E^LWY1qOul;T^i5#wu4O@6S>KDgtCBlHZR z4R|>uih~L^El+B`AR~1LcnO%AfpdwD@f;FpaD$c;x@OOYx6qC%`px&Vbr;d3=~|Q% zFw6&%IfZ2#isx#`rSU~D3`k>WqdYgUh(}&ULzcB~_H?KlTYViF1u?V{Lq2J2x0u2_ z4Eu2{=Zp%0^jjEh$~o&q09;nR-CJU(BZ|s+DpDrD{tqTl4nE~$F0h^g$x~7YbsxMS z<8YrqPoNc-Ce{dYk6{NPQu9l(jVebOUN>G8zin73+H0ibfP6QNul^L8r2EU~aw=Lc zc#}zFuJzT?D!oB3cuY{Vi(&&W_`t;~y*aN?f~QMw-k;Mt`+J6_;Vv0WLG2lm2roTr zMSJS_iCY$)Xm5B6bGRyBakJUi*b0px);z5fdlvsZMEno>3nfWJg3Zu-DQHoTt~g2f`q{V(>R6D&C1>zb^RocL>##{ z3KZgGhRg%4Hwp~^dZ89aavq}_>n${IIVvBV9Tq?|d@WRBK7!?}8Q%0_ekk6Vb(S{k zd1-J&GbtB%=vSETkrfxDZtTpbwWVY3T0lFyxo=HRLlz|P(WI+t4x!vjB-U8gg-3Gxt>Y5W!QX9826h$j{e`JmSjOg?=EG8XS!qQ`kjvQCmG;fuf$I(+$_sV&Oi z>2iKJn`Nzh86CM$?0u)M93~r;zYuvPG4gp@!QB+zLjIH_j@ZP*-3IG*=n~37(kp{> zGVLT6O^n6vlYVq?P0pvhoNYC2CA`>I@R$WRnG1xsY1zPc=jH(MLBU&K5hIrCH{<>J z!6&!C%M-2QRL3nJ_zlA1iR6Q1hw|5#^t!Cy-x!9a)VZmDUV2B1wXoKoI+qZYK|6bG zP1U`)2826x%|WtqyI z+`8Mx}tc_9ZMM?8( zHd=u%!6dlp9R4q?*P+4^(Xi3-bP3Qr+ewaLz{x06xfx>5;(E`d@nXU4=JOnxk{kiz z1lIQM(|c!M#|D-gNuQ^4y#>F-GWw9{pQw%N7=_vGkY}$jW;o`T|IYBq#-SI2YI%f~ ztJC63v>S{Bk(hWF^PMonAE5r;0S=!QTljZ=us?8{F$jEb0$;LJ$w@I(EWD#7l=1PrzzYG|58_EtMHxTnCO0o=b9-&Nm&&zsU>`lw!9_H(Zl*^2G+#dmX8otmweOI zYivPc@fmJExqbZc^R9v{b^tyk;t3FnSIFhV0>m<=uuBU%E?i#a{6h4wm8r?z$br{k 
zlRJ1i8fpg|?1b*D4f2la1(Ee(et^@Gk>*f4Ts_+;GIo*0-An%2yo|r6L@|c48cmnb zg8iVj-U7v^rv7tEByMa4Gk%+OrrK7nsD43t4@n4t1KjVD9dlq6yot8X@TE|_nd7O99x(LM+wTo#iC%gcwPl01Zm#n7ou^q=by zek||y-|6`F*`vaxe^AcFWj4n=sT?`R179(;om3k`(|I{*4&TsZqHv?e6*DSY2niLP zPUpf=hAQ4X9$@0tF4%W2o~>wz&I-41eC|hll3@cR1oU9%oYhlb%8V*&3-uS1(7(PR z<|+6DaRb$%@LH@Iy#N_p5F|WL!L*MOo*_WiLrv>KtkWz`) z^9|&w5((&cfr!xdHB?`2#|YN;^-^CR#(+t?#)U3!_kDrlt?DxOD*dCA;daU3#?Ga| zU{ezWvY{U_;;3$NhA#9NJqwpjo8Xw+JEWAm;zBF|DiX$~u_;YC(5NpiqQKc!04XUpI zi~TlAXNcl+4R|cW?o>XGXMGj^7*4^0zMV162`MhVuZ&8p=_^R)NcTNbGvx9nw?D@x zBKnwBao-qnC8dSWEN>YE#^6C}Qo_uO!ZxLA0d?* z=L>ZFQ4=KHp-zjBzY!IWTkd_#f}!lN(F;yKdQNHIY4VH~El>TWfYi=v`G9?-SXzOd zu2VWSY7CE25&|MGejg4m45Q-q02Ng3>L9T4lS|@z0aM_64~e9fSJPfC0??Qw0~)31 zmF>yWysw=hsRn1Wg9$=1YIQpAK6ub=U)6u}6773Y-MGwrp+5OsN?aDK2}RdAaiBKG z+?MQPmySmK5-a~FWI=*L@3}~}CeFEd=yKX0$IEzsAKMW^rYgFwq8IR>b_@l1`hLau zlh$|Cw*s14g3>IR0`1NJboMFrfJSnZhUZ7flk@q?)tQtw^dC_TxE08XZJ1Obg8 zTzI6ikqC?zU`cy317p)@wa9xYG%3PN7&6EO6t;3vYK;ynb#)e6?d{y6E9jiu0^XwS zH(;dv;p?MC9%Y7c$Errx@)cEy`lxWt58u=|qEuJzgg||;8#kJ3ZZa^?KOLh>(UaQ9 z&)c{^K&**2nsgo`;t+s9SiLnGM>KC}{z7|N&^NX>Ta*xr z^| z;*)5{M8y~(Rz;XjO)!#Jy5@AD9{M1pRSKvEC#X6$FhMog397;SBu}8om-K(upv(P$ zcs~w?Kdd}Sr9UeVyxo_yy0(Yq^E&ydQ0vlKIqZAL*>swwq^kbunXLFXo|&JuU%-`7 zK6yhu1*J|NFg5Xg2NUzSFZ5GI{=kd4ForFJR~|vqpykpwiC7VD)U}2nmp~e+Hv%EO z1S~g{Po<5=CY3&>7LIY)Bm*pLNUeJ>xFk)E?j5Cr2ya+P(%k4J(PJf6d^rJ*GnsbP z2Qw}knv>vj6Q<$Zj?J_-Uf7n9^1=>T$<0;MjOW=nxsuw; zN-$vRNLSkDQ0-gMpO{w`pcl2U*R4!)gpXH9TYYob@b6%vI+`eOU&P9W<&4w@;Qy;P7;Tu=hDqBG>KOu`&6c!(j2)Aj`s0{{RvE4WN&C2NKEnpeL~;p7tOZ=?LKq~FMh~ZjlIG`)1E#4vZ^J8oD(SExSB+! 
zhEUxzz3%^6j#T1lA>`x;kB$C*b-vdjZq8x341en|PKpz27(oQnh^9UvZ79K5#i%+I zx-p6&M2l}U88#)NLE$Izo^7OI`j)vcCcGl>HCubq-aDd(jUz^9>BV@@ea@nmdGX;j zAbdYqD^-D&D1*y)fLoXFvSv*e6us)^$>3K9DrlwODQAf?^c_`G$y%Uw@ZKl?%qJ&Dd1eb}@AE;al?`~oU_5-l&| z2&en8<)Mn;SBpA4P!Y7;4=f@F5;uMoF01Htlv1 z*s}mb6Bm<3YB{W;f@y9|its@_%#LOStlT*Fl$K+6Hz)Ye%MH=l+I9?Xl{c`YFC65n zS=1I*#4g@49l9TglCz`sMJc=&l3l`V3=__AmfU5Uel`KSB|9cEAZzsP-K(ynuqM7w z=@G^#1vtCC*Alq_tq5wHvI@k`jvo~>D&;%^?qvjAU&Wn<3HH4>tkCb?Lgk8FYoIUI z?$^${>Np=t?FfdiFz;#Kz6_?4`iev;*p2Kz9J`Rve79w9)YgRONpEJpDsw6S&W@=@ zA3xl>fk*a=+;9~sqs=4v;re_iEmIX$guu>cNT8_6tA7tH|2&iKEocGPM6f%Lc1e#2 z3g(568;eb~MFcl!p1{A^AhwB0k<%&+QTEj9^L9_Qk!IJCI`*l=iasQ_?v%Wl-oH9$ zSVQJ-)a-Y;labY;-E9kL;7|LO>t4x|zf;!MBR|yST9W8_Gg5cp={n&(JUBR(HBVbt zoF8#b=k?hNGb3m0ONL8OY~zn?`mG7`c)38@{xQ;tg;{vr>(j$`PS-7}9vqwpwSCBE zXS(&SM?XTp-5NVoZjI&d-Cpgn^XoRv7`g5)EKH;E2tZ)omBoZqvkj)?EeNY8#&{gn;dZi@5iL_~s@*O#mH(1J!# zp#JYeorLb^9!ydFs=WkKr@neSj?komS z)Hv}852dCdc;ltyj`zjv#_31?J~w`ZzM{l$BP2(@Bt-i~KIJ~w-X7;o z%ePBQd!G@{HZ6lz6&F6lmV5hKwGo~fNS&8rbp~+|+M|FIW!K9`!y2Ad9T9_kJm!sF zU5T=J7sPm-%h^`gYg#CMx6;=soVWE>YW!OA7Gt}&ep+5?gM%3sWv^5Q4tISPX0IyQ zc=+V6uM8@ds#a}3bYV2+F2nco5k!qWxzNPj2D?ePdW}@N&6c*Hy0DN=%l~1+F2t$Wq*C4_j){^~JZjtvn*WyR#$M>d2vH zFHgArH3wUw41dla%IRzHHWAE2cAK1oJY7j!9LM;k>1+KCscy|lPaiyRHd(@Q9KU8p z<~R;!j{J`g4mM9x!7m87n)vB$H=zdWo0G6Oj=@cJm-~@zH=lAGL!@3l8N2~A^?uFF z-<(AB=i3g-JkWVI%J_A%cKd7{EpzA;KxO;}S-Uf~j+ME%JRpzP+BTm2lIvCH7<+o- z(MfpTP#+R=dgJj)THXPV*VZ<-{F2+%;F!xFZ)F^vFsFNWjt#k9;mGr^ZJYAkhUNWy z^9P^2Pe$HQ&tUZojyU(4g=y+uSY4W3H8Y({=Qb#yAx;LVO+zO`P{)XOmL47;z)bY7kL zL)E98ReW2MTk}_`S{V+z4mMtcMME|J|NJJ@F51}_*t!}2TdURxHhuY&(n8tUDezBl zV-&K9{N{Ai1;CE_&ZnDYH&KHHi1*b?bYy7veh=snBlQ^t>&g^78#!o5xrwV zv?SF=z)CxFoi?HU5yF|AvpKfWW1?P@f)%M=L&0C}t@+7jG!P=2h)#*34L(S8ovxUl zrzXkU@r9u+kZ2x)Lcf5BH)P+m$$3{RZ-wsK%Jt=vXH4Ky-r7J65t}wc9+x+N7jVn9iDJ5g@>jelJIy9y8hgZe1^l~pI`a(>njJpzT)}wD{?;fq?YZ-KYnQA z_2&<@mj3+VO(snQ}>?Gjr4jXvr-0eB&++&cc zJV#=uus&*}#Ik}O~GHB^6ba%@q0jtfZ$0S=L(BbPB4@K4mNp5a>`MR@> zP?a15DX-iUQz_qE7Bi`UTt_phklb=JsqowrGpXoY 
z7IUe1fTKAe&Kyu~4j4BFoR|a9EdVSQ02K>>qXi(&0#I%N7`FhNSOCy10W6jP6-$7l zB_PfcP;Lnrw*;J60?@4hELH#&D}bXFAkGR(0-RU@(5(S1)&Lc2fTJ}a&Kgi| z4H&luoLB?UZ2&Ab02Ld6qYWU=22gGT7`FkO*Z|Oh02UxX1qg5i0^)#xav)$F2si-( z&}{)Mwg44dfTJxS&K6K^3mCTroY(@;?Eoxx02Mocqa7g54p43f7`FqQ*a6V(0W9{y zRi%gqXRj(xjmKW;+H|~;Pd`@R*?R9vck-b|A#PJhIr$iM$U%42ZBUwL$Sq`xlm3Wj zi#&a+zWq2g!8QHYhtX3G5%uW~>!+#DOSbUcc(=NDImLB{IKR6O{@0Hn#C4HDU$EolE=c{_VKMDehl($xT+hY>Cv8%jRx z4TV?>+W-6ZA9eB>h3EeV_im(0&(-CTqr+ED6HapsF2iwK@>Ix~O6dYQJa44jpyR)z zP;XGF^csGQ+ftxH&QeMj$>Dh`<%Svmoq>A8T%~sn^noH3@&~1Ki5#973s=Czq&7c) zVY1`MZBlZF1#9cXtiRVnPOjWmTWI>#&ZW;wsP7ZSw`2o_j_wW1YeRikDdP>=ds}O{ z-%PjK2npYr>dQ7^R zL&^nFa)Y!WeV{^kr>i21qD*~~HTP=ZKDvwP`>WkF+DSOt;P&jU!LF3^tK6{`^Y>+X z(47VN&qsD0MZpr+^IPr62*nbfcCDTm`2ZEB8)bC``UFsMFu33J&3}b=eMgrq!Qtt| zq&x(tB-SkeF{;#3#S$qpM{h-@8=QN~kUF%LcBx3gzIIqTw-*rEwtH=#xI7}b9ONSe zP01=^Ksn&Q?%#~#+I&Jc#MCT$l*Q9?%Hn5m!WO`!*+E;+HL3p8N1&uxG}G8Fg=|ox zgmsW0SNE-&hG^X?ydzp@xOQ?_u*dV2!k$u!EJ-^`PN%hU=;y6_rAm?vO1RYJ@RD+` zAR5q8WxMf=F|fPV-a*>HH_13X^`_MfchRr<;cyTy`i`PYd$XU@me3%SJpYyXJlqki z>0ZjnG#Ds0x+U+d6oM-w?9moZ`zYN>2N06g1YXNgM);j9a}p;6gNzaonP&)H5U1AgKi+G$z!@np2C5eJOE^l{NFI!r; zlDxbh1A0@+1wpb4R-!I~@1DWC#A;Vtu@WdkE#}!TXR6axRaV5*cKg#y9!);m(wIe` zrk~=zD3{X56lqr2s70}eZQXTub3hKgJXIZ?o!-=nOSL0?&LmzvF2<+Vqa#YiQQ=aL zn$z6+LWNRsBr*VK2+X&dpGB_}al#Lj9fKUu$s7RGhiARKXmoeyO!loZ`NVucp0tXp ztYKG{J(^E&IsH-2GOt6sueR0M>*G-{Qguhe@qpOjD;>R zt9UH`cl8;&kZ7G*mH0meRi8Jg`an4~e_C^YR-gS}$H^mO?q8O0RE(!@mv_jcUCd*A(j8p8)PbmUhB(7l$7?u3~SEL)`P!p&HU@zCtpdjZ$(TEz_dTt&wu-6iLh zApZxVGOKS8KxQk|;~;g6dyFt+5Osw&Ym_IRcXH8^5!pBusRsXax~XL&QrOx3`|aG3 z4AHi07_6&a*lIMZYf`~F_(2QYn4ub<2FgwPS`;=UfKt}DwQvqyX(u~r|MO?Yndk%g zys-xJ(u9KOrX`Dt+q>J{I#TTH;q-*3KJQP+$fgb1%{GatNc1ZPKc!-8%jn2NjFF*n z;PDu1guPqCC8ANS!kv#uaf9vu(f(PLi=IWQ2!GRwO#59nZRNTreTa*4K7|p?oZ+Qa zwgk$VNuXCI4m+Eq0^^+n<&a$j523U)X!)@Xo*oSJQ=S&4cGds}*T8b;w-Qx3`I~NO za?Mey*S4lM{3;AO^a_YkQtLHk#*%v~)~qkys$OO0vJWkkvb5z=!5yQorFMr3(0_?Y zm$C#xzb`L`R(MS)s38Oe@*3h-gU=SfzGVmN@~+sI4c#g!S-iT9$DhMl(fyz&MRcOy 
ziH33!DxF3$MfueLH^uw;dC~;Yvaandw2q?!gjvDikaH?wUN47cF`lyy?FeF<4wGmF z_lctANIlVsplmlHTWAd-NlxXUp;rv%JdH6v`R)ZbWf@V>re-b)2QiAf?jmOB-(e;!_0=>JF4fzLn{tcy z$8~{AJr#o|(Bg=)K7A!f6hnWH4EGCiqg6@yn?NH&Fz8&0_@IaX(Wg~gZ010&3FK*h z>{@#J%085KS}ybjeQOspwr|vPpPs8^DTi3hd)|JX4#BGHna!fKXQ665zYrN28|zK} z`XqIV9ALonRunQZzNVA(S(6D?M&sMD*hr$>_>2oHBsI~u-PtTA$;G=`SKBF=r-b$Au{UMlWR0(|>W|zh)wL;zzYoBi%oZQL)Xfgz%tZwi) z@4rL^TV`Ug^Z?T0o%J)5dFk{B7TS5|pYG?u?+c?vt4<8jzzodScj{#GxSHdt+IpdiPwyRQ#0Q-PK+XHt4HAs(&*z|_?dR>@ z!v_aO{PXz}P{!^eVC}ndqKz} zk_aJvF#UgT#oyC=KP1^8{6_jyoBW6~_l`X5UpT)fWPZpY12x=(68$4R^CK210#f+D zut)}Oh8A2POcXW@{Z|Mw&9_tdh#828`PjC8=h7MAY-_!oDWT%g4HPl)EM*BT6=R;aIXsPx+?Jo(||CFZl(LR*8 z-uC{>VSV3*k26F5+K2l#?8E-YKKx$;Sy11|zc%GP>T$-fhbTf&hK)zX{YU<=hXpg{ zp7@_=k8=n;q`5!;o%S%J&_i0*)c;0%oG#=c?Uv$qn)Bm$9}j`pDU<(Nyx%hl`~uz| z7t%EU1U}9$@Q?=fW=?9{>%R~F_f7XW!2B=TeJ?4X`;7*4G&8ohHa8ZxcQP=xG1Yev zv3D}GG`4Ycu+{%-x%|Vuz^WYm*D~Ml#p4jj5BDMh6!rd52YM6``5`Op1t{|Tzl=Ng ztjBRaAF?pG{=xc>XS=ksq5p~VIAq#G4wK+NIsZ(h{jglq;N$-j?{P4ihdc?1|HS(l z<>2o;HxH37GX5R;dkBhO$osC^A@dL9Us95X9Iz+&JAMB$c;7eK<7f@PIQN|tT<#y7 zpQZXccEdwjg~0#T86PhmeMnnT{Rizo&m5k@{BOhN<7G?_LCv7q?W11*Yf;m~@=V7J z|7*wiT?YOObKgo^27h9H7ll6rfmPx`cmK;7=XVYAL(rJfAE5tDk^G3g+GAp>JNsYf zd%tU4AF{Vh{)PR}*m^Gne1wPPV3+=vVf1&2=0kjm*&p~n<%S@w+ylYpBb2Uz?Jnbg zpKt!IWqgP_GJk~nOWF8{lDFdXZ}mVZ!wsl@&;Og~~cGwz=MOG$p$p8aCn zAJ*{Jj~Ra}(H?T>g{-u={^gqPcahdF&iw%(W%CE;zjgr}a9k*+|K2XY+wp%;$C|{z&*AoKLjCO#I^og?fq_TeF$puc>wy`_WBSa z!cE8WZSz9s z@fHfS78MMP0`!*zvOly1+`qavED%C~t~Eh7ZD8sna|Hlokoe<<0=5U541iX$0xg}a zK@+K;N?KxBePu@_VsT>!eQQS}(12lM>}m_NccfQUfdYewqEa+>a)k$b0{InWCHVR9 zkfjP@Rfsiu*}nq=yZ@*CivMNCD2AW6F?7O F{~trdUgrP+ literal 0 HcmV?d00001 diff --git a/open_mastr/utils/sqlalchemy_tables.py b/open_mastr/utils/sqlalchemy_tables.py index 14a9f337..cc2e20f2 100644 --- a/open_mastr/utils/sqlalchemy_tables.py +++ b/open_mastr/utils/sqlalchemy_tables.py @@ -24,7 +24,7 @@ "AnlagenKwk": {"KwkMastrNummer"}, "AnlagenStromSpeicher": {"MastrNummer"}, "Bilanzierungsgebiete": {"Id"}, - 
"EinheitenAenderungNetzbetreiberzuordnungen": {"EinheitMastrNummer"}, # TODO: Is not a primary key on its own! + "EinheitenAenderungNetzbetreiberzuordnungen": {"EinheitMastrNummer"}, # TODO: May not be a primary key on its own. Check this. "EinheitenBiomasse": {"EinheitMastrNummer"}, "EinheitenGasErzeuger": {"EinheitMastrNummer"}, "EinheitenGasSpeicher": {"EinheitMastrNummer"}, diff --git a/open_mastr/utils/xsd_tables.py b/open_mastr/utils/xsd_tables.py index cf3523c0..8d76739c 100644 --- a/open_mastr/utils/xsd_tables.py +++ b/open_mastr/utils/xsd_tables.py @@ -17,6 +17,8 @@ # The BNetzA "choice" to sometimes write MaStR and sometimes Mastr is certainly confusing, # but are we the ones who should change that? def normalize_column_name(original_mastr_column_name: str) -> str: + # BNethA sometimes has MaStR, other times MaStR. We normalize that. + # Also, in case the column names in the XSD contain äöüß, we replace them. This is probably a BNetzA oversight, but has happened at least once. 
return original_mastr_column_name.replace("MaStR", "Mastr").replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss").strip() @@ -107,8 +109,9 @@ def from_xml_schema(cls, schema: xmlschema.XMLSchema) -> "MastrTableDescription" def read_mastr_table_descriptions_from_xsd( - zipped_docs_file_path: Union[Path, str], data: Optional[list[str]] = None + zipped_docs_file_path: Union[Path, str], data: list[str] ) -> set[MastrTableDescription]: + print(data) include_tables = set(data_to_include_tables(data, mapping="write_xml")) mastr_table_descriptions = set() diff --git a/open_mastr/xml_download/colums_to_replace.py b/open_mastr/xml_download/colums_to_replace.py index e7608e8a..fde8d5e7 100644 --- a/open_mastr/xml_download/colums_to_replace.py +++ b/open_mastr/xml_download/colums_to_replace.py @@ -37,98 +37,3 @@ 12: "Gasspeichereinheit", }, } - -# columns to replace lists all columns where the entries have -# to be replaced according to the tables katalogwerte and katalogeinträge -# from the bulk download of the MaStR - -columns_replace_list = [ - # anlageneegsolar - "AnlageBetriebsstatus", - # anlageneegspeicher - # anlagenstromspeicher - # einheitensolar - "Land", - "Bundesland", - "EinheitSystemstatus", - "EinheitBetriebsstatus", - "Energietraeger", - "Einspeisungsart", - "GemeinsamerWechselrichterMitSpeicher", - "Lage", - "Leistungsbegrenzung", - "Hauptausrichtung", - "HauptausrichtungNeigungswinkel", - "Nutzungsbereich", - "Nebenausrichtung", - "NebenausrichtungNeigungswinkel", - "ArtDerFlaecheIds", - # einheitenstromspeicher - "AcDcKoppelung", - "Batterietechnologie", - "Technologie", - "Pumpspeichertechnologie", - "Einsatzort", - # geloeschteunddeaktivierteEinheiten - # geloeschteunddeaktivierteMarktAkteure - "MarktakteurStatus", - # lokationen - # marktakteure - "Personenart", - "Rechtsform", - "HauptwirtdschaftszweigAbteilung", - "HauptwirtdschaftszweigGruppe", - "HauptwirtdschaftszweigAbschnitt", - "Registergericht", - "LandAnZustelladresse", - 
# netzanschlusspunkte - "Gasqualitaet", - "Spannungsebene", - # anlageneegbiomasse - # anlageneeggeosolarthermiegrubenklaerschlammdruckentspannung - # anlageneegwasser - # anlageneegwind - # anlagengasspeicher - # anlagenkwk - # bilanzierungsgebiete - # einheitenaenderungnetzbetreiberzuordnungen - "ArtDerAenderung", - # einheitenbiomasse - "Hauptbrennstoff", - "Biomasseart", - # einheitengaserzeuger - # einheitengasspeicher - "Speicherart", - # einheitengasverbraucher - # einheitengenehmigung - "Art", - # einheitengeosolarthermiegrubenklaerschlammdruckentspannung - # einheitenkernkraft - # einheitenstromverbraucher - "ArtAbschaltbareLast", - # einheitentypen - # einheitenverbrennung - "WeitererHauptbrennstoff", - "WeitereBrennstoffe", - "ArtDerStilllegung", - # einheitenwasser - "ArtDesZuflusses", - "ArtDerWasserkraftanlage", - # marktrollen - # netze - "Sparte", - # einheitenwind - "Lage", - "Hersteller", - "Seelage", - "ClusterNordsee", - "ClusterOstsee", - # various tables - "NetzbetreiberpruefungStatus", - "WindAnLandOderAufSee", - "TechnologieFlugwindenergieanlage", - "Flughoehe", - "Flugradius", - "ArtDerSolaranlage", - "SpeicherAmGleichenOrt", -] diff --git a/open_mastr/xml_download/parse.py b/open_mastr/xml_download/parse.py new file mode 100644 index 00000000..6573d1ee --- /dev/null +++ b/open_mastr/xml_download/parse.py @@ -0,0 +1,144 @@ +import xmlschema +from xmlschema.validators import XsdComplexType, XsdSimpleType, XsdElement +from typing import Dict, List, Optional + +# ---------------------------------------------- +# 1. 
Mapping XSD builtin types → SQLAlchemy types +# ---------------------------------------------- +XSD_TO_SQLA = { + "string": "String", + "integer": "Integer", + "int": "Integer", + "short": "Integer", + "long": "BigInteger", + "decimal": "Float", + "float": "Float", + "double": "Float", + "boolean": "Boolean", + "date": "Date", + "dateTime": "DateTime", + "time": "Time", +} + + +def map_xsd_type(xsd_type: XsdSimpleType) -> str: + """Map XSD builtin type to SQLAlchemy column type.""" + if xsd_type.is_simple() and xsd_type.primitive_type: + name = xsd_type.primitive_type.local_name + return XSD_TO_SQLA.get(name, "String") # default fallback + return "String" + + +# ---------------------------------------------- +# 2. Main model generation +# ---------------------------------------------- +def generate_sqlalchemy_models(xsd_file: str) -> str: + schema = xmlschema.XMLSchema(xsd_file) + output = [] + + output.append("from sqlalchemy import Column, Integer, String, Float, Boolean, Date, DateTime, BigInteger, ForeignKey") + output.append("from sqlalchemy.orm import declarative_base, relationship") + output.append("\nBase = declarative_base()\n") + + processed_types = {} + + # Iterate over all global elements (entry points) + for element_name, element in schema.elements.items(): + output.append(generate_class_from_element(element, processed_types)) + + return "\n".join(output) + + +# ---------------------------------------------- +# 3. 
Generate a class for an element +# ---------------------------------------------- +def generate_class_from_element( + element: XsdElement, + processed_types: Dict[str, str] +) -> str: + """Generate a SQLAlchemy class for the top-level element.""" + cls_name = to_class_name(element.name) + + # If it is a complexType element → + if isinstance(element.type, XsdComplexType): + return generate_class_from_complex_type(cls_name, element.type, processed_types) + + return f"# Skipped simple element {element.name}\n" + + +# ---------------------------------------------- +# 4. Generate class for a complex type +# ---------------------------------------------- +def generate_class_from_complex_type( + cls_name: str, + complex_type: XsdComplexType, + processed_types: Dict[str, str] +) -> str: + + if cls_name in processed_types: + return "" # already generated + + processed_types[cls_name] = cls_name + + lines = [] + lines.append(f"class {cls_name}(Base):") + lines.append(f" __tablename__ = '{camel_to_snake(cls_name)}'") + lines.append(" id = Column(Integer, primary_key=True)\n") + + # Iterate through child elements (sequence, choice, etc.) 
+ for child in complex_type.content.iter_elements(): + + child_name = child.name + col_name = camel_to_snake(child_name) + + if isinstance(child.type, XsdComplexType): + # Nested complex type → child table with relationship + child_class_name = to_class_name(child_name) + lines.append( + f" {col_name}_id = Column(Integer, ForeignKey('{camel_to_snake(child_class_name)}.id'))" + ) + lines.append( + f" {col_name} = relationship('{child_class_name}')" + ) + # Generate nested class too + nested = generate_class_from_complex_type(child_class_name, child.type, processed_types) + lines.append("\n" + nested) + + else: + # Simple child element + sqlalchemy_type = map_xsd_type(child.type) + + nullable = "True" if child.min_occurs == 0 else "False" + lines.append( + f" {col_name} = Column({sqlalchemy_type}, nullable={nullable})" + ) + + lines.append("") + return "\n".join(lines) + + +# ---------------------------------------------- +# 5. Helpers +# ---------------------------------------------- +def to_class_name(name: str) -> str: + return "".join(part.capitalize() for part in name.split("_")) + + +def camel_to_snake(name: str) -> str: + out = "" + for i, ch in enumerate(name): + if ch.isupper() and i > 0: + out += "_" + out += ch.lower() + return out + + +# ---------------------------------------------- +# 6. 
Run example +# ---------------------------------------------- +if __name__ == "__main__": + import sys + xsd_path = sys.argv[1] + models = generate_sqlalchemy_models(xsd_path) + print(models) + diff --git a/open_mastr/xml_download/schema.py b/open_mastr/xml_download/schema.py new file mode 100644 index 00000000..2b223d7d --- /dev/null +++ b/open_mastr/xml_download/schema.py @@ -0,0 +1,49 @@ +from pathlib import Path +import glob +from xml.etree import ElementTree + +import xmlschema + + +def check_if_files_valid_under_schema(xsd_file, xml_files): + schema = xmlschema.XMLSchema(xsd_file) + for xml_file in xml_files: + xml_resource = xmlschema.XMLResource(xml_file, lazy=True) + errors = schema.iter_errors(xml_resource) + error_count = 0 + for error in errors: + error_count += 1 + breakpoint() + print(" -", error) + if error_count == 0: + print(f"{xml_file}\tValid.") + + +def check_if_files_valid_under_schema_et(xsd_file, xml_files): + schema = xmlschema.XMLSchema(xsd_file) + for xml_file in xml_files: + xt = ElementTree.parse(xml_file) + errors = schema.iter_errors(xt) + error_count = 0 + for error in errors: + error_count += 1 + breakpoint() + print(" -", error) + if error_count == 0: + print(f"{xml_file}\tValid.") + + + +def main(): + xsd_root = Path("/home/gorgor/.open-MaStR/data/xml_download/Dokumentation MaStR Gesamtdatenexport/xsd") + xml_root = Path("/home/gorgor/.open-MaStR/data/xml_download/Gesamtdatenexport_20251129") + xsd_file = xsd_root / "EinheitenWind.xsd" + xml_files = [xml_root / basename for basename in glob.glob("EinheitenWind*.xml", root_dir=xml_root)] + xml_files =["/home/gorgor/.open-MaStR/data/xml_download/EinheitenWind_formatted.xml"] + print(xsd_file) + print(xml_files) + check_if_files_valid_under_schema_et(xsd_file=xsd_file, xml_files=xml_files) + + +if __name__ == "__main__": + main() diff --git a/open_mastr/xml_download/utils_cleansing_bulk.py b/open_mastr/xml_download/utils_cleansing_bulk.py index 8cc9428b..b38da277 100644 --- 
a/open_mastr/xml_download/utils_cleansing_bulk.py +++ b/open_mastr/xml_download/utils_cleansing_bulk.py @@ -14,6 +14,7 @@ def cleanse_bulk_data( zipped_xml_file_path: str, ) -> pd.DataFrame: df = replace_system_catalog_ids(df, system_catalog) + catalog_columns = set(catalog_columns) - system_catalog.keys() df = replace_mastr_katalogeintraege( zipped_xml_file_path=zipped_xml_file_path, df=df, catalog_columns=catalog_columns, ) @@ -31,9 +32,9 @@ def replace_system_catalog_ids(df: pd.DataFrame, system_catalog: dict[int, str]) def replace_mastr_katalogeintraege( - zipped_xml_file_path: str, df: pd.DataFrame, catalog_columns: Collection[str], + zipped_xml_file_path: str, ) -> pd.DataFrame: """Replaces the IDs from the mastr database by its mapped string values from the table Katalogwerte""" diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index e01507b1..52af4422 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -346,9 +346,7 @@ def download_documentation(save_path: str) -> None: Parameters ----------- save_path: str - Full file path where the downloaded MaStR zip file will be saved. - xml_folder_path: str - Path where the downloaded MaStR zip file will be saved. + Full file path where the downloaded MaStR documentation zip file will be saved. 
""" log.info("Starting the MaStR documentation download from marktstammdatenregister.de.") url = "https://www.marktstammdatenregister.de/MaStRHilfe/files/gesamtdatenexport/Dokumentation%20MaStR%20Gesamtdatenexport.zip" @@ -360,19 +358,6 @@ def download_documentation(save_path: str) -> None: with open(save_path, "wb") as zfile: zfile.write(r.content) - #chunk_size = 1024 * 1024 - #content_length = r.headers.get("Content-Length") - #expected_steps = math.ceil(content_length / chunk_size) - #with ( - # open(save_path, "wb") as zfile, - # tqdm(desc=save_path, total=expected_steps) as bar, - #): - # for chunk in r.iter_content(chunk_size=chunk_size): - # if chunk: - # zfile.write(chunk) - # zfile.flush() - # bar.update() - time_b = time.perf_counter() log.info( f"MaStR documentation download is finished. It took {round(time_b - time_a)} seconds." diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index b6685d11..3b2769b8 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -12,7 +12,7 @@ import numpy as np import pandas as pd import sqlalchemy -from sqlalchemy import Column, Engine, delete, select, create_engine, inspect +from sqlalchemy import Column, Engine, Table, delete, select, create_engine, inspect from sqlalchemy.orm import DeclarativeBase from sqlalchemy.sql import text from sqlalchemy.sql.sqltypes import Date, DateTime @@ -35,14 +35,15 @@ def write_mastr_xml_to_database( data: list, bulk_cleansing: bool, bulk_download_date: str, - mastr_table_to_db_model: Mapping[str, Type[DeclarativeBase_T]], + mastr_table_to_db_table: Mapping[str, Table], + alter_database_tables: bool, ) -> None: """Write the Mastr in xml format into a database defined by the engine parameter.""" log.info("Starting bulk download...") include_tables = data_to_include_tables(data, mapping="write_xml") threads_data = [] - lower_mastr_table_to_db_model = 
{table_name.lower(): db_model for table_name, db_model in mastr_table_to_db_model.items()} + lower_mastr_table_to_db_table = {table_name.lower(): db_table for table_name, db_table in mastr_table_to_db_table.items()} with ZipFile(zipped_xml_file_path, "r") as f: files_list = correct_ordering_of_filelist(f.namelist()) @@ -53,20 +54,21 @@ def write_mastr_xml_to_database( if not is_table_relevant(xml_table_name, include_tables): continue - db_model = lower_mastr_table_to_db_model.get(xml_table_name) - if not db_model: + db_table = lower_mastr_table_to_db_table.get(xml_table_name) + if db_table is None: log.warning(f"Skipping MaStR file {file_name!r} because no database table was found for {xml_table_name=}") continue threads_data.append( ( file_name, - db_model, + db_table, str(engine.url), engine.url.password, zipped_xml_file_path, bulk_download_date, bulk_cleansing, + alter_database_tables, ) ) @@ -112,12 +114,13 @@ def get_number_of_processes(): def process_xml_file( file_name: str, - db_model: Type[DeclarativeBase_T], + db_table: Table, connection_url: str, password: str, zipped_xml_file_path: str, bulk_download_date: str, bulk_cleansing: bool, + alter_database_tables: bool, ) -> None: """Process a single xml file and write it to the database.""" try: @@ -133,70 +136,84 @@ def process_xml_file( with ZipFile(zipped_xml_file_path, "r") as f: log.info(f"Processing file '{file_name}'...") if is_first_file(file_name): - delete_all_existing_rows(db_model=db_model, engine=engine) + delete_all_existing_rows(db_table=db_table, engine=engine) df = read_xml_file(f, file_name) df = process_table_before_insertion( df=df, - db_model=db_model, + db_table=db_table, zipped_xml_file_path=zipped_xml_file_path, bulk_download_date=bulk_download_date, bulk_cleansing=bulk_cleansing, ) df = check_for_column_mismatch_and_try_to_solve_it( df=df, - db_model=db_model, + db_table=db_table, engine=engine, + alter_database_tables=alter_database_tables, ) if engine.dialect.name == "sqlite": 
add_table_to_sqlite_database( df=df, - db_model=db_model, + db_table=db_table, engine=engine, ) else: add_table_to_non_sqlite_database( df=df, - db_model=db_model, + db_table=db_table, engine=engine, ) except Exception as e: log.error(f"Error processing file '{file_name}': '{e}'") -def delete_all_existing_rows(db_model: Type[DeclarativeBase_T], engine: Engine) -> None: +def delete_all_existing_rows(db_table: Table, engine: Engine) -> None: with engine.begin() as con: - con.execute(delete(db_model)) + con.execute(delete(db_table)) -def check_for_column_mismatch_and_try_to_solve_it(df: pd.DataFrame, db_model: Type[DeclarativeBase_T], engine: Engine) -> pd.DataFrame: +def check_for_column_mismatch_and_try_to_solve_it( + df: pd.DataFrame, + db_table: Table, + engine: Engine, + alter_database_tables: bool, +) -> pd.DataFrame: df_column_names = set(df.columns) - db_column_names = {column.name for column in db_model.__table__.columns} + db_column_names = {column.name for column in db_table.columns} if additional_db_column_names := db_column_names - df_column_names: # Many columns are optional and it's perfectly normal to have and XML file / a dataframe that doesn't have # a column that is present in the database. So this is only worth a debug message. log.debug( - f"Database table {db_model.__table__.name} has some columns that weren't found in the XML file." + f"Database table {db_table.name} has some columns that weren't found in the XML file." f" Proceeding and trying to insert anyway. Additional DB columns:" f" {', '.join(additional_db_column_names)}" ) if additional_df_column_names := df_column_names - db_column_names: - # TODO: Check here if the user specified not to issue DDL statements before trying to insert. - log.warning( - f"XML file has some columns that aren't present in the database table {db_model.__table__.name}." - f" Trying to add the columns to the table. 
Additional XML columns:" - f" {', '.join(additional_df_column_names)}" - ) - # TODO: What if we can add some columns and not others? We should then return the columns for which we succeeded. - try: - add_missing_columns_to_table( - engine=engine, - db_model=db_model, - missing_columns=additional_df_column_names, + if alter_database_tables: + # TODO: Check here if the user specified not to issue DDL statements before trying to insert. + log.warning( + f"XML file has some columns that aren't present in the database table {db_table.name}." + f" Trying to add the columns to the table. Additional XML columns:" + f" {', '.join(additional_df_column_names)}" + ) + # TODO: What if we can add some columns and not others? We should then return the columns for which we succeeded. + try: + add_missing_columns_to_table( + engine=engine, + db_table=db_table, + missing_columns=additional_df_column_names, + ) + except Exception: + log.exception("Could not add at least some columns to the database. Ignoring the columns from the XML file instead.") + df = df.drop(columns=additional_df_column_names) + else: + log.warning( + f"XML file has some columns that aren't present in the database table {db_table.name}." + f" Ignoring those columns since you asked not to alter tables. Additional XML columns:" + f" {', '.join(additional_df_column_names)}" ) - except Exception: - log.exception("Could not add at least some columns to the database. 
Ignoring the columns from the XML file instead.") df = df.drop(columns=additional_df_column_names) return df @@ -314,18 +331,18 @@ def is_first_file(file_name: str) -> bool: def cast_date_columns_to_datetime( - db_model: Type[DeclarativeBase_T], df: pd.DataFrame + db_table: Table, df: pd.DataFrame ) -> pd.DataFrame: - for column in db_model.__table__.columns: - if is_date_column_and_in_df(column, df): + for column in db_table.columns: + if is_date_column(column) and column.name in df.columns: # Convert column to datetime64, invalid string -> NaT df[column.name] = pd.to_datetime(df[column.name], errors="coerce") return df -def cast_date_columns_to_string(db_model: Type[DeclarativeBase_T], df: pd.DataFrame) -> pd.DataFrame: - for column in db_model.__table__.columns: - if not is_date_column_and_in_df(column, df): +def cast_date_columns_to_string(db_table: Table, df: pd.DataFrame) -> pd.DataFrame: + for column in db_table.columns: + if not is_date_column(column) or column.name not in df.columns: continue df[column.name] = pd.to_datetime(df[column.name], errors="coerce") @@ -341,8 +358,8 @@ def cast_date_columns_to_string(db_model: Type[DeclarativeBase_T], df: pd.DataFr return df -def is_date_column_and_in_df(column: Column, df: pd.DataFrame) -> bool: - return type(column.type) in [Date, DateTime] and column.name in df.columns +def is_date_column(column: Column) -> bool: + return type(column.type) in [Date, DateTime] def correct_ordering_of_filelist(files_list: list) -> list: @@ -382,25 +399,25 @@ def read_xml_file(f: ZipFile, file_name: str) -> pd.DataFrame: def add_table_to_non_sqlite_database( df: pd.DataFrame, - db_model: Type[DeclarativeBase_T], + db_table: Table, engine: sqlalchemy.engine.Engine, ) -> None: # get a dictionary for the data types dtypes_for_writing_sql = { column.name: column.type - for column in db_model.__table__columns + for column in db_table.columns if column.name in df.columns } # Convert date and datetime columns into the datatype datetime. 
- df = cast_date_columns_to_datetime(db_model, df) + df = cast_date_columns_to_datetime(db_table, df) for _ in range(10000): try: with engine.connect() as con: with con.begin(): df.to_sql( - db_model.__table__.name, + db_table.name, con=con, index=False, if_exists="append", @@ -414,7 +431,7 @@ def add_table_to_non_sqlite_database( except sqlalchemy.exc.IntegrityError: # error resulting from Unique constraint failed df = write_single_entries_until_not_unique_comes_up( - df, db_model, engine + df, db_table, engine ) @@ -451,7 +468,7 @@ def add_zero_as_first_character_for_too_short_string(df: pd.DataFrame) -> pd.Dat def write_single_entries_until_not_unique_comes_up( - df: pd.DataFrame, db_model: Type[DeclarativeBase_T], engine: sqlalchemy.engine.Engine + df: pd.DataFrame, db_table: Table, engine: sqlalchemy.engine.Engine ) -> pd.DataFrame: """ Remove from dataframe these rows, which are already existing in the database table @@ -467,7 +484,7 @@ def write_single_entries_until_not_unique_comes_up( """ # TODO: Check if we need to support composite primary keys for the MaStR changes table. # Because this here assumes single-column primary keys. 
- primary_key = next(c for c in db_model.__table__.columns if c.primary_key) + primary_key = next(c for c in db_table.columns if c.primary_key) with engine.begin() as con: key_list = ( @@ -491,7 +508,7 @@ def write_single_entries_until_not_unique_comes_up( def add_missing_columns_to_table( engine: sqlalchemy.engine.Engine, - db_model: Type[DeclarativeBase_T], + db_table: Table, missing_columns: Collection[str], ) -> None: """ @@ -508,7 +525,7 @@ def add_missing_columns_to_table( ------- """ - table_name = db_model.__table__.name + table_name = db_table.name for column_name in missing_columns: alter_query = 'ALTER TABLE %s ADD "%s" VARCHAR NULL;' % ( table_name, @@ -584,7 +601,7 @@ def find_nearest_brackets(xml_string: str, position: int) -> tuple[int, int]: def process_table_before_insertion( df: pd.DataFrame, - db_model: Type[DeclarativeBase_T], + db_table: Table, zipped_xml_file_path: str, bulk_download_date: str, bulk_cleansing: bool, @@ -600,7 +617,7 @@ def process_table_before_insertion( if bulk_cleansing: catalog_columns = { column.name - for column in db_model.__table__.columns + for column in db_table.columns # TODO: Is it okay to rely so heavily on the SQLALchemy model to decide how to process the table? if isinstance(column.type, (CatalogInteger, CatalogString)) } @@ -616,7 +633,7 @@ def normalize_column_names_in_df(df: pd.DataFrame) -> pd.DataFrame: def add_table_to_sqlite_database( df: pd.DataFrame, - db_model: Type[DeclarativeBase_T], + db_table: Table, engine: sqlalchemy.engine.Engine, ) -> None: column_list = df.columns.tolist() @@ -625,10 +642,10 @@ def add_table_to_sqlite_database( df = df.where(pd.notnull(df), None) # Convert date columns to strings. Dates are not supported directly by SQLite. - df = cast_date_columns_to_string(db_model, df) + df = cast_date_columns_to_string(db_table, df) # Create SQL statement for bulk insert. ON CONFLICT DO NOTHING prevents duplicates. 
- insert_stmt = f"INSERT INTO {db_model.__table__.name} ({','.join(column_list)}) VALUES ({','.join(['?' for _ in column_list])}) ON CONFLICT DO NOTHING" + insert_stmt = f"INSERT INTO {db_table.name} ({','.join(column_list)}) VALUES ({','.join(['?' for _ in column_list])}) ON CONFLICT DO NOTHING" for _ in range(10000): try: @@ -640,11 +657,11 @@ def add_table_to_sqlite_database( except sqlalchemy.exc.IntegrityError: # error resulting from Unique constraint failed df = write_single_entries_until_not_unique_comes_up( - df, db_model, engine + df, db_table, engine ) except Exception: # If any unexpected error occurs, we'll switch back to the non-SQLite method. - add_table_to_non_sqlite_database(df, db_model, engine) + add_table_to_non_sqlite_database(df, db_table, engine) break diff --git a/open_mastr/xml_download/xsd_to_table.py b/open_mastr/xml_download/xsd_to_table.py new file mode 100644 index 00000000..49fb63f7 --- /dev/null +++ b/open_mastr/xml_download/xsd_to_table.py @@ -0,0 +1,61 @@ + + +class SqlalchemyMastrModelMaker: + MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE = { + MastrColumnType.STRING: String, + MastrColumnType.INTEGER: Integer, + MastrColumnType.FLOAT: Float, + MastrColumnType.DATE: Date, + MastrColumnType.DATETIME: DateTime(timezone=True), + MastrColumnType.BOOLEAN: Boolean, + MastrColumnType.CATALOG_VALUE: Integer, + } + + @classmethod + def make_sqlalchemy_mastr_model( + cls, + table: MastrTableDescription, + primary_key_columns: set[str], + base: DeclarativeBase, + mixins: tuple[type, ...] 
= tuple(), + ): + namespace = { + "__tablename__": table.table_name, + "__annotations__": {}, + } + + for col in table.columns: + sa_type = cls.MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE[col.type] + kwargs = {"primary_key": True} if col.name in primary_key_columns else {"nullable": True} + namespace[col.name] = mapped_column(sa_type, **kwargs) + + bases = (base,) + mixins + return type(table.instance_name, bases, namespace) + + +class Base(DeclarativeBase): + pass + + +class ParentAllTables(object): + DatenQuelle = Column(String) + DatumDownload = Column(Date) + + +def generate_sqlalchemy_models(xsd_file: str) -> str: + schema = xmlschema.XMLSchema(xsd_file) + table = MastrTableDescription.from_xml_schema(schema) + + model = SqlalchemyMastrModelMaker.make_sqlalchemy_mastr_model( + table=table, + primary_key_columns={"EinheitMastrNummer"}, + base=Base, + mixins=(ParentAllTables,) + ) + + +if __name__ == "__main__": + import sys + xsd_path = sys.argv[1] + generate_sqlalchemy_models(xsd_path) + diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 5f8cfa81..f635b387 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,6 +1,7 @@ import pytest import os from os.path import expanduser +from pathlib import Path import itertools import random @@ -26,21 +27,10 @@ ) -# Check if db is empty -_db_exists = False -_db_folder_path = os.path.join( - expanduser("~"), ".open-MaStR", "data", "sqlite" -) # FIXME: use path in tmpdir when implemented -if os.path.isdir(_db_folder_path): - for entry in os.scandir(path=_db_folder_path): - _db_path = os.path.join(_db_folder_path, "open-mastr.db") - if os.path.getsize(_db_path) > 1000000: # empty db = 327.7kB < 1 MB - _db_exists = True - - @pytest.fixture -def db(): - return Mastr() +def mastr(tmp_path: Path): + output_dir = tmp_path / "output_dir" + return Mastr(output_dir=output_dir) def test_Mastr_validate_working_parameter(): @@ -119,8 +109,8 @@ def test_Mastr_validate_not_working_parameter(): ) -def 
test_validate_parameter_format_for_mastr_init(db): - engine_list_working = ["sqlite", db.engine] +def test_validate_parameter_format_for_mastr_init(mastr): + engine_list_working = ["sqlite", mastr.engine] engine_list_failing = ["HI", 12] for engine in engine_list_working: diff --git a/tests/test_mastr.py b/tests/test_mastr.py index ce7cd6fa..c2355f49 100644 --- a/tests/test_mastr.py +++ b/tests/test_mastr.py @@ -1,4 +1,5 @@ import shutil +from pathlib import Path from open_mastr.mastr import Mastr import os @@ -14,97 +15,58 @@ _xml_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "xml_download") if os.path.isdir(_xml_folder_path): for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: + if "Gesamtdatenexport" in entry.name and entry.name.endswith(".zip"): _xml_file_exists = True @pytest.fixture(scope="module") -def zipped_xml_file_path(): +def zipped_xml_file_path() -> str: zipped_xml_file_path = None for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: + if "Gesamtdatenexport" in entry.name and entry.name.endswith(".zip"): zipped_xml_file_path = os.path.join(_xml_folder_path, entry.name) return zipped_xml_file_path @pytest.fixture -def db_path(): - return os.path.join( - os.path.expanduser("~"), ".open-MaStR", "data", "sqlite", "mastr-test.db" - ) +def mastr(tmp_path: Path) -> Mastr: + output_dir = tmp_path / "output_dir" + return Mastr(output_dir=output_dir) -@pytest.fixture -def db(db_path): - return Mastr(engine=sqlalchemy.create_engine(f"sqlite:///{db_path}")) - - -@pytest.fixture -def db_translated(db_path): - engine = sqlalchemy.create_engine(f"sqlite:///{db_path}") - db_api = Mastr(engine=engine) - - db_api.download(date="existing", data=["wind", "hydro", "biomass", "combustion"]) - db_api.translate() - - return db_api - - -def test_Mastr_init(db): +def test_mastr_init(mastr: Mastr) -> None: # test if folder structure exists - assert os.path.exists(db.home_directory) - 
assert os.path.exists(db._sqlite_folder_path) + assert os.path.exists(mastr.home_directory) + assert os.path.exists(mastr._sqlite_folder_path) # test if engine and connection were created - assert type(db.engine) == sqlalchemy.engine.Engine - - -@pytest.mark.skipif( - not _xml_file_exists, reason="The zipped xml file could not be found." -) -def test_Mastr_translate(db_translated, db_path): - # test if database was renamed correctly - transl_path = db_path[:-3] + "-translated.db" - assert os.path.exists(transl_path) - - # test if columns got translated - inspector = sqlalchemy.inspect(db_translated.engine) - table_names = inspector.get_table_names() - - for table in table_names: - for column in inspector.get_columns(table): - column = column["name"] - assert column in TRANSLATIONS.values() or column not in TRANSLATIONS.keys() - - # test if new translated version replaces previous one - db_translated.engine.dispose() - engine = sqlalchemy.create_engine(f"sqlite:///{db_path}") - db_empty = Mastr(engine=engine) - db_empty.translate() - - for table in table_names: - assert pd.read_sql(sql=table, con=db_empty.engine).shape[0] == 0 + assert type(mastr.engine) == sqlalchemy.engine.Engine @pytest.mark.dependency(name="bulk_downloaded") -def test_mastr_download(db): - db.download(data="wind") - df_wind = pd.read_sql("wind_extended", con=db.engine) +def test_mastr_download(mastr: Mastr) -> None: + mastr.download(data="wind") + df_wind = pd.read_sql("EinheitenWind", con=mastr.engine) + assert len(df_wind) > 10000 + + mastr.download(data="biomass") + df_biomass = pd.read_sql("EinheitenBiomasse", con=mastr.engine) assert len(df_wind) > 10000 + assert len(df_biomass) > 10000 - db.download(data="biomass") - df_biomass = pd.read_sql("biomass_extended", con=db.engine) + mastr.download(data=["wind", "nuclear"]) + df_biomass = pd.read_sql("EinheitenBiomasse", con=mastr.engine) assert len(df_wind) > 10000 assert len(df_biomass) > 10000 
@pytest.mark.dependency(depends=["bulk_downloaded"]) -def test_mastr_download_keep_old_files(db, zipped_xml_file_path): +def test_mastr_download_keep_old_files(mastr: Mastr, zipped_xml_file_path: str) -> None: file_today = zipped_xml_file_path yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d") file_old = re.sub(r"\d{8}", yesterday, os.path.basename(file_today)) file_old = os.path.join(os.path.dirname(zipped_xml_file_path), file_old) shutil.copy(file_today, file_old) - db.download(data="gsgk", keep_old_files=True) + mastr.download(data="gsgk", keep_old_files=True) assert os.path.exists(file_old) diff --git a/tests/xml_download/test_utils_cleansing_bulk.py b/tests/xml_download/test_utils_cleansing_bulk.py index 9a29ad76..f3c01abc 100644 --- a/tests/xml_download/test_utils_cleansing_bulk.py +++ b/tests/xml_download/test_utils_cleansing_bulk.py @@ -7,6 +7,7 @@ import pytest from open_mastr.xml_download.utils_cleansing_bulk import ( + cleanse_bulk_data, create_katalogwerte_from_bulk_download, replace_mastr_katalogeintraege, ) @@ -16,7 +17,7 @@ _xml_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "xml_download") if os.path.isdir(_xml_folder_path): for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: + if "Gesamtdatenexport" in entry.name and entry.name.endswith(".zip"): _xml_file_exists = True _sqlite_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "sqlite") @@ -42,12 +43,40 @@ def con(): def zipped_xml_file_path(): zipped_xml_file_path = None for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: + if "Gesamtdatenexport" in entry.name and entry.name.endswith(".zip"): zipped_xml_file_path = os.path.join(_xml_folder_path, entry.name) return zipped_xml_file_path + +@pytest.mark.skipif( + not _xml_file_exists, reason="The zipped xml file could not be found." 
+) +def test_cleanse_bulk_data(zipped_xml_file_path): + df_raw = pd.DataFrame( + { + "ID": [0, 1, 2], + "Bundesland": [335, 335, 336], + "Einheittyp": [1, 8, 5], + } + ) + df_replaced = pd.DataFrame( + { + "ID": [0, 1, 2], + "Bundesland": ["Bayern", "Bayern", "Bremen"], + "Einheittyp": ["Solareinheit", "Stromspeichereinheit", "Geothermie"], + } + ) + + pd.testing.assert_frame_equal( + cleanse_bulk_data( + df=df_raw, zipped_xml_file_path=zipped_xml_file_path, catalog_columns={"Bundesland", "Einheittyp"}, + ), + df_replaced, + ) + + @pytest.mark.skipif( not _xml_file_exists, reason="The zipped xml file could not be found." ) @@ -57,7 +86,10 @@ def test_replace_mastr_katalogeintraege(zipped_xml_file_path): {"ID": [0, 1, 2], "Bundesland": ["Bayern", "Bayern", "Bremen"]} ) pd.testing.assert_frame_equal( - df_replaced, replace_mastr_katalogeintraege(zipped_xml_file_path, df_raw) + replace_mastr_katalogeintraege( + zipped_xml_file_path=zipped_xml_file_path, df=df_raw, catalog_columns={"Bundesland", "Einheittyp"}, + ), + df_replaced, ) diff --git a/tests/xml_download/test_utils_download_bulk.py b/tests/xml_download/test_utils_download_bulk.py index 8f650933..4557dbe8 100644 --- a/tests/xml_download/test_utils_download_bulk.py +++ b/tests/xml_download/test_utils_download_bulk.py @@ -1,3 +1,4 @@ +from datetime import date import time from open_mastr.xml_download.utils_download_bulk import ( gen_url, @@ -8,7 +9,7 @@ def test_gen_url(): - when = time.strptime("2024-01-01", "%Y-%m-%d") + when = date(2024, 1, 1) url = gen_url(when) assert type(url) == str assert ( @@ -16,7 +17,7 @@ def test_gen_url(): == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20240101_23.2.zip" ) - when = time.strptime("2024-04-01", "%Y-%m-%d") + when = date(2024, 4, 1) url = gen_url(when) assert type(url) == str assert ( @@ -24,7 +25,7 @@ def test_gen_url(): == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20240401_23.2.zip" ) - when = time.strptime("2024-04-02", 
"%Y-%m-%d") + when = date(2024, 4, 2) url = gen_url(when) assert type(url) == str assert ( @@ -32,7 +33,7 @@ def test_gen_url(): == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20240402_24.1.zip" ) - when = time.strptime("2024-10-01", "%Y-%m-%d") + when = date(2024, 10, 1) url = gen_url(when) assert type(url) == str assert ( @@ -40,7 +41,7 @@ def test_gen_url(): == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20241001_24.1.zip" ) - when = time.strptime("2024-10-02", "%Y-%m-%d") + when = date(2024, 10, 2) url = gen_url(when) assert type(url) == str assert ( @@ -48,7 +49,7 @@ def test_gen_url(): == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20241002_24.2.zip" ) - when = time.strptime("2024-12-31", "%Y-%m-%d") + when = date(2024, 12, 31) url = gen_url(when) assert type(url) == str assert ( @@ -58,7 +59,7 @@ def test_gen_url(): # Tests for use_version parameter - when = time.strptime("2024-12-31", "%Y-%m-%d") + when = date(2024, 12, 31) url = gen_url(when, use_version="before") assert type(url) == str assert ( @@ -66,7 +67,7 @@ def test_gen_url(): == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20241231_24.1.zip" ) - when = time.strptime("2024-12-31", "%Y-%m-%d") + when = date(2024, 12, 31) url = gen_url(when, use_version="after") assert type(url) == str assert ( @@ -74,7 +75,7 @@ def test_gen_url(): == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20241231_25.1.zip" ) - when = time.strptime("2024-04-02", "%Y-%m-%d") + when = date(2024, 4, 2) url = gen_url(when, use_version="before") assert type(url) == str assert ( @@ -82,7 +83,7 @@ def test_gen_url(): == "https://download.marktstammdatenregister.de/Gesamtdatenexport_20240402_23.2.zip" ) - when = time.strptime("2024-04-02", "%Y-%m-%d") + when = date(2024, 4, 2) url = gen_url(when, use_version="after") assert type(url) == str assert ( diff --git a/tests/xml_download/test_utils_write_to_database.py 
b/tests/xml_download/test_utils_write_to_database.py index bf54f16d..3b26ef42 100644 --- a/tests/xml_download/test_utils_write_to_database.py +++ b/tests/xml_download/test_utils_write_to_database.py @@ -8,7 +8,20 @@ import numpy as np import pandas as pd import pytest -from sqlalchemy import create_engine, inspect +from sqlalchemy import ( + Boolean, + Column, + create_engine, + Date, + DateTime, + Double, + inspect, + Integer, + MetaData, + String, + Table, +) + from sqlalchemy.sql import text from open_mastr.utils import orm @@ -17,7 +30,6 @@ add_missing_columns_to_table, add_zero_as_first_character_for_too_short_string, cast_date_columns_to_string, - change_column_names_to_orm_format, correct_ordering_of_filelist, create_database_table, extract_sql_table_name, @@ -37,7 +49,7 @@ _xml_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "xml_download") if os.path.isdir(_xml_folder_path): for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: + if "Gesamtdatenexport" in entry.name and entry.name.endswith(".zip"): _xml_file_exists = True @@ -51,9 +63,9 @@ def capture_wrap(): @pytest.fixture(scope="module") def zipped_xml_file_path(): zipped_xml_file_path = None for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: + if "Gesamtdatenexport" in entry.name and entry.name.endswith(".zip"): zipped_xml_file_path = os.path.join(_xml_folder_path, entry.name) return zipped_xml_file_path @@ -97,16 +111,6 @@ def test_is_table_relevant(): assert is_table_relevant("netzanschlusspunkte", include_tables) is False -def test_create_database_table(engine_testdb): - orm.Base.metadata.create_all(engine_testdb) - xml_table_name = "einheitenkernkraft" - sql_table_name = "nuclear_extended" - - create_database_table(engine_testdb, xml_table_name) - - assert inspect(engine_testdb).has_table(sql_table_name) is True - - 
def test_is_first_file(): assert is_first_file("EinheitenKernkraft.xml") is True assert is_first_file("EinheitenKernkraft_1.xml") is True @@ -114,9 +118,16 @@ def test_is_first_file(): def test_cast_date_columns_to_string(): + table = Table( + "anlageneegwasser", + MetaData(), + Column("EegMastrNummer", String, primary_key=True), + Column("Registrierungsdatum", Date), + Column("DatumLetzteAktualisierung", DateTime), + ) initial_df = pd.DataFrame( { - "EegMastrNummer": [1, 2, 3], + "EegMastrNummer": ["1", "2", "3"], "Registrierungsdatum": [ datetime(2024, 3, 11).date(), datetime(1999, 2, 1).date(), @@ -131,7 +142,7 @@ def test_cast_date_columns_to_string(): ) expected_df = pd.DataFrame( { - "EegMastrNummer": [1, 2, 3], + "EegMastrNummer": ["1", "2", "3"], "Registrierungsdatum": ["2024-03-11", "1999-02-01", np.nan], "DatumLetzteAktualisierung": [ "2022-03-22 00:00:00.000000", @@ -142,32 +153,14 @@ def test_cast_date_columns_to_string(): ) pd.testing.assert_frame_equal( - expected_df, cast_date_columns_to_string("anlageneegwasser", initial_df) + expected_df, cast_date_columns_to_string(table, initial_df) ) def test_is_date_column(): - columns = RetrofitUnits.__table__.columns.items() - df = pd.DataFrame( - { - "Id": [1], - "DatumLetzteAktualisierung": [datetime(2022, 3, 22)], - "WiederinbetriebnahmeDatum": [datetime(2024, 3, 11).date()], - } - ) - - date_column = list(filter(lambda col: col[0] == "Id", columns))[0] - assert is_date_column(date_column, df) is False - - datetime_column = list( - filter(lambda col: col[0] == "DatumLetzteAktualisierung", columns) - )[0] - assert is_date_column(datetime_column, df) is True - - date_column = list( - filter(lambda col: col[0] == "WiederinbetriebnahmeDatum", columns) - )[0] - assert is_date_column(date_column, df) is True + assert is_date_column(Column("Id", Integer, primary_key=True)) is False + assert is_date_column(Column("DatumLetzteAktualisierung", DateTime)) is True + assert 
is_date_column(Column("WiederinbetriebnahmeDatum", Date)) is True def test_correct_ordering_of_filelist(): @@ -226,15 +219,6 @@ def test_read_xml_file(zipped_xml_file_path): assert df.shape[0] > 0 - # Since the file is from the latest download, its content can vary over time. To make sure that the table is - # correctly created, we check that all of its columns are associated are included in our mapping. - for column in df.columns: - if column in tablename_mapping[file_name.lower()]["replace_column_names"]: - column = tablename_mapping[file_name.lower()]["replace_column_names"][ - column - ] - assert column in ElectricityConsumer.__table__.columns.keys() - def test_add_zero_as_first_character_for_too_short_string(): # Prepare @@ -251,6 +235,8 @@ def test_add_zero_as_first_character_for_too_short_string(): pd.testing.assert_frame_equal(df_edited, df_correct) +# TODO: Do we want to keep this kind of renaming? +@pytest.mark.skip def test_change_column_names_to_orm_format(): initial_df = pd.DataFrame( { @@ -307,12 +293,17 @@ def test_process_table_before_insertion(zipped_xml_file_path): def test_add_missing_columns_to_table(engine_testdb): + table = Table( + "einheitengasverbraucher", + MetaData(), + Column("EinheitMastrNummer", String, primary_key=True), + Column("DatumLetzteAktualisierung", DateTime), + ) + # We must recreate the table to be sure that the new column is not present. + table.drop(engine_testdb, checkfirst=True) + table.create(engine_testdb) with engine_testdb.connect() as con: with con.begin(): - # We must recreate the table to be sure that the new colum is not present. 
- con.execute(text("DROP TABLE IF EXISTS gas_consumer")) - create_database_table(engine_testdb, "einheitengasverbraucher") - initial_data_in_db = pd.DataFrame( { "EinheitMastrNummer": ["id1"], @@ -320,11 +311,11 @@ def test_add_missing_columns_to_table(engine_testdb): } ) initial_data_in_db.to_sql( - "gas_consumer", con=con, if_exists="append", index=False + table.name, con=con, if_exists="append", index=False ) add_missing_columns_to_table( - engine_testdb, "einheitengasverbraucher", ["NewColumn"] + engine_testdb, table, ["NewColumn"] ) expected_df = pd.DataFrame( @@ -336,7 +327,7 @@ def test_add_missing_columns_to_table(engine_testdb): ) with engine_testdb.connect() as con: with con.begin(): - actual_df = pd.read_sql_table("gas_consumer", con=con) + actual_df = pd.read_sql_table(table.name, con=con) # The actual_df will contain more columns than the expected_df, so we can't use assert_frame_equal. assert expected_df.index.isin(actual_df.index).all() @@ -346,13 +337,28 @@ def test_add_missing_columns_to_table(engine_testdb): [add_table_to_sqlite_database, add_table_to_non_sqlite_database], ) def test_add_table_to_sqlite_database(engine_testdb, add_table_to_database_function): - with engine_testdb.connect() as con: - with con.begin(): - # We must recreate the table to be sure that no other data is present. 
- con.execute(text("DROP TABLE IF EXISTS gsgk_eeg")) - create_database_table( - engine_testdb, "anlageneeggeothermiegrubengasdruckentspannung" - ) + table = Table( + "anlageneeggeothermiegrubengasdruckentspannung", + MetaData(), + Column("EegMastrNummer", String, primary_key=True), + Column("InstallierteLeistung", Double), + Column("AnlageBetriebsstatus", String), + Column("Registrierungsdatum", Date), + Column("Meldedatum", DateTime), + Column("DatumLetzteAktualisierung", DateTime), + Column("EegInbetriebnahmedatum", DateTime), + Column("VerknuepfteEinheit", String), + Column("AnlagenschluesselEeg", String), + Column("AusschreibungZuschlag", Boolean), + Column("AnlagenkennzifferAnlagenregister", String), + Column("AnlagenkennzifferAnlagenregister_nv", String), + Column("Netzbetreiberzuordnungen", String), + Column("DatenQuelle", String), + Column("DatumDownload", DateTime), + ) + # We must recreate the table to be sure that no other data is present. + table.drop(engine_testdb, checkfirst=True) + table.create(engine_testdb) df = pd.DataFrame( { @@ -369,10 +375,10 @@ def test_add_table_to_sqlite_database(engine_testdb, add_table_to_database_funct ) expected_df = pd.DataFrame( { + "EegMastrNummer": ["id1", "id2"], "InstallierteLeistung": [1.0, 100.4], "AnlageBetriebsstatus": [None, None], "Registrierungsdatum": [datetime(2022, 2, 2), datetime(2024, 3, 20)], - "EegMastrNummer": ["id1", "id2"], "Meldedatum": [np.datetime64("NaT"), np.datetime64("NaT")], "DatumLetzteAktualisierung": [ datetime(2022, 12, 2, 10, 10, 10, 300), @@ -391,12 +397,12 @@ def test_add_table_to_sqlite_database(engine_testdb, add_table_to_database_funct ) add_table_to_database_function( - df, "anlageneeggeothermiegrubengasdruckentspannung", "gsgk_eeg", engine_testdb + df, table, engine_testdb ) with engine_testdb.connect() as con: with con.begin(): pd.testing.assert_frame_equal( - expected_df, pd.read_sql_table("gsgk_eeg", con=con) + expected_df, pd.read_sql_table(table.name, con=con) ) From 
5625087a42a4994fbe4a53bd5f859a78ea06128d Mon Sep 17 00:00:00 2001 From: Simon Will Date: Sun, 4 Jan 2026 19:59:23 +0100 Subject: [PATCH 7/7] Make a couple small adjustments --- open_mastr/mastr.py | 23 ++++++++----------- open_mastr/utils/sqlalchemy_tables.py | 6 +---- open_mastr/utils/xsd_tables.py | 2 ++ .../xml_download/utils_write_to_database.py | 1 - 4 files changed, 13 insertions(+), 19 deletions(-) diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 34f98684..5221e75b 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -40,10 +40,7 @@ setup_logger, ) import open_mastr.utils.orm as orm -from open_mastr.utils.sqlalchemy_tables import ( - make_sqlalchemy_model_from_mastr_table_description, - MastrBase -) +from open_mastr.utils.sqlalchemy_tables import make_sqlalchemy_model_from_mastr_table_description # constants from open_mastr.utils.constants import TECHNOLOGIES, ADDITIONAL_TABLES @@ -117,11 +114,7 @@ def generate_data_model( self, data: Optional[list[str]] = None, catalog_value_as_str: bool = True, - # TODO: A _repeated_ call to this function with the same base and overlapping data will fail with something like: - # sqlalchemy.exc.InvalidRequestError: Table 'AnlagenEegBiomasse' is already defined for this MetaData instance. - # Specify 'extend_existing=True' to redefine options and columns on an existing Table object. - # Is this expected behavior for us? Should we re-raise with a more understandable message? 
- base: Type[DeclarativeBase_T] = MastrBase, + base: Optional[Type[DeclarativeBase_T]] = None, ) -> dict[str, Type[DeclarativeBase_T]]: data = transform_data_parameter(data) @@ -229,12 +222,9 @@ def download( method = "bulk" if not mastr_table_to_db_table: - class TemporaryBase(DeclarativeBase): - pass mastr_table_to_db_model = self.generate_data_model( data=data, catalog_value_as_str=bulk_cleansing, - base=TemporaryBase, ) mastr_table_to_db_table = { mastr_table: db_model.__table__ @@ -307,8 +297,15 @@ def _download_docs_and_generate_data_model( zipped_docs_file_path: Path, data: list[str], catalog_value_as_str: bool = True, - base: Type[DeclarativeBase_T] = MastrBase, + base: Optional[Type[DeclarativeBase_T]] = None, ): + if base is None: + + class MastrBase(DeclarativeBase): + pass + + base = MastrBase + mastr_table_descriptions = read_mastr_table_descriptions_from_xsd( zipped_docs_file_path=zipped_docs_file_path, data=data ) diff --git a/open_mastr/utils/sqlalchemy_tables.py b/open_mastr/utils/sqlalchemy_tables.py index cc2e20f2..4f671deb 100644 --- a/open_mastr/utils/sqlalchemy_tables.py +++ b/open_mastr/utils/sqlalchemy_tables.py @@ -55,10 +55,6 @@ } -class MastrBase(DeclarativeBase): - pass - - class ParentAllTables(object): DatenQuelle: Mapped[str] = mapped_column(String) DatumDownload: Mapped[datetime.date] = mapped_column(Date) @@ -70,7 +66,7 @@ class ParentAllTables(object): def make_sqlalchemy_model_from_mastr_table_description( table_description: MastrTableDescription, catalog_value_as_str: bool, - base: Type[DeclarativeBase_T] = MastrBase, + base: Type[DeclarativeBase_T], mixins: tuple[type, ...] = (ParentAllTables,), ) -> Type[DeclarativeBase_T]: return _make_sqlalchemy_model( diff --git a/open_mastr/utils/xsd_tables.py b/open_mastr/utils/xsd_tables.py index 8d76739c..57bdf0b0 100644 --- a/open_mastr/utils/xsd_tables.py +++ b/open_mastr/utils/xsd_tables.py @@ -16,6 +16,8 @@ # TODO: Should we really mess with the original column names? 
# The BNetzA "choice" to sometimes write MaStR and sometimes Mastr is certainly confusing, # but are we the ones who should change that? +# Also TODO: Should we also apply the more opinionated normalization/renaming that is currently stored in orm.py? +# E.g. "VerknuepfteEinheitenMaStRNummern" -> "VerknuepfteEinheiten", "NetzanschlusspunkteMaStRNummern" -> "Netzanschlusspunkte", etc. def normalize_column_name(original_mastr_column_name: str) -> str: # BNethA sometimes has MaStR, other times MaStR. We normalize that. # Also, in case the column names in the XSD contain äöüß, we replace them. This is probably a BNetzA oversight, but has happened at least once. diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index 3b2769b8..d45945cd 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -192,7 +192,6 @@ def check_for_column_mismatch_and_try_to_solve_it( if additional_df_column_names := df_column_names - db_column_names: if alter_database_tables: - # TODO: Check here if the user specified not to issue DDL statements before trying to insert. log.warning( f"XML file has some columns that aren't present in the database table {db_table.name}." f" Trying to add the columns to the table. Additional XML columns:"