diff --git a/CHANGELOG.md b/CHANGELOG.md index d71344c4..16346ef1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,8 +8,14 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ ## [v0.xx.x] Unreleased - 202x-xx-xx ### Added +- Add the option to pass a custom database schema + [#718](https://github.com/OpenEnergyPlatform/open-MaStR/pull/718) ### Changed +- Switch to dynamic table generation based on parsing of XSD files; + change table names and column names to align more closely with original names; + simplify CSV export by removing table joins + [#718](https://github.com/OpenEnergyPlatform/open-MaStR/pull/718) ### Removed @@ -46,7 +52,6 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ [#685](https://github.com/OpenEnergyPlatform/open-MaStR/pull/685) - ## [v0.16.0] Partial downloads with open-MaStR PartialPumpkinPull - 2025-11-26 ### Added - Add partial bulk download diff --git a/CITATION.cff b/CITATION.cff index 2a59b914..f877e2bb 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -14,30 +14,33 @@ authors: given-names: "Christoph" alias: "@chrwm" affiliation: "Reiner Lemoine Institut" - orcid: " https://orcid.org/0000-0001-8144-5260" + orcid: "https://orcid.org/0000-0001-8144-5260" - family-names: "Kotthoff" given-names: "Florian" alias: "@FlorianK13" affiliation: "fortiss" - orcid: " https://orcid.org/0000-0003-3666-6122" + orcid: "https://orcid.org/0000-0003-3666-6122" - family-names: "Tepe" given-names: "Deniz" alias: "@deniztepe" affiliation: "fortiss" - orcid: " https://orcid.org/0000-0002-7605-0173" + orcid: "https://orcid.org/0000-0002-7605-0173" - family-names: "Amme" given-names: "Jonathan" alias: "@nesnoj" affiliation: "Reiner Lemoine Institut" - orcid: " https://orcid.org/0000-0002-8563-5261" + orcid: "https://orcid.org/0000-0002-8563-5261" - family-names: "Imbrisca" given-names: "Alexandra-Andreea" alias: "@AlexandraImbrisca" affiliation: "Technical University of Munich" - family-names: 
'Krämer' given-names: "Kevin" - alias: "pt-kkraemer" + alias: "@pt-kkraemer" affiliation: "ProjectTogether gGmbH" + - family-names: "Will" + given-names: "Simon" + alias: "@Simon-Will" title: "open-MaStR" type: software license: AGPL-3.0 diff --git a/docs/advanced.md b/docs/advanced.md index 041bd66a..32f8f4b4 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -4,6 +4,7 @@ or the [SOAP API download](#soap-api-download). ## Configuration ### Database settings +#### Using a custom database Configure your database with the `engine` parameter of [`Mastr`][open_mastr.Mastr]. @@ -12,22 +13,70 @@ It defines the engine of the database where the MaStR is mirrored to. Default is The possible databases are: * **sqlite**: By default the database will be stored in `$HOME/.open-MaStR/data/sqlite/open-mastr.db`. -* **own database**: The Mastr class accepts a sqlalchemy.engine.Engine object as engine which enables the user to +* **own database**: The Mastr class accepts a `sqlalchemy.engine.Engine` object as engine which enables the user to use any other desired database such as PostgreSQL. The tables are created in the default DB schema, in PostgreSQL this is `public`. If you use an own database so, you need to insert the connection parameter into the engine variable. In the example below, the following parameters are used: user `open-mastr`, password `open-mastr-pw`, database `open-mastr-db`. Make sure it exists and the user has sufficient permissions. +!!! warning "MySQL needs special table definitions" + You can pass an engine for a MySQL database, but MySQL demands maximum lengths for its `VARCHAR` fields. + Since open-mastr generates its database string fields without maximum length, using MySQL will fail by default. + You can make it work by defining your own tables beforehand and [passing your own database schema](#using-a-custom-database-schema). 
+ ```python +from sqlalchemy import create_engine + +# SQLite DB +engine_sqlite = create_engine("sqlite:///path/to/sqlite/database.db") +# PostgreSQL DB +engine_postgres = create_engine("postgresql+psycopg2://open-mastr:open-mastr-pw@localhost:55443/open-mastr-db") +mastr = Mastr(engine=engine_sqlite) # or engine=engine_postgres +mastr.download() +``` + +#### Using a custom database schema + +By default, `Mastr.download` will download the MaStR documentation, generate a database schema from the contained XSD +files and create all database tables necessary for storing MaStR data. + +If you want to prepare the database yourself, you can pass your own mapping from the original MaStR table name to your +database table to `Mastr.download` with the `mastr_table_to_db_table` parameter. +To get started with the default database schema, we recommend generating it from the MaStR docs using +`Mastr.generate_data_model` and then adjusting it: + +```python +from sqlalchemy import create_engine +from open_mastr import Mastr, format_mastr_table_to_db_table + +engine_postgres = create_engine("postgresql+psycopg2://open-mastr:open-mastr-pw@localhost:55443/open-mastr-db") +mastr = Mastr(engine=engine_postgres) + +# Generate SQLAlchemy table definitions without creating the tables +mastr_table_to_db_table = mastr.generate_data_model() +# Print the tables so that you can see what was generated. +print(format_mastr_table_to_db_table(mastr_table_to_db_table)) + +# Now you need to go and create the tables in your database and adjust them to your needs. +# It's best to use the table definitions we generated and adjust them. +# Finally, you need your custom version of mastr_table_to_db_table. +mastr_table_to_your_custom_db_table = ... + +# Download MaStR data into your custom tables. 
+mastr.download(mastr_table_to_db_table=mastr_table_to_your_custom_db_table) +``` - from sqlalchemy import create_engine +When open-mastr encounters XML files in the MaStR download that have additional columns when compared to the database +tables, it will issue `ALTER` statements to add the columns to the database on the fly. To avoid this and to instead +skip the additional columns during import, you can pass the parameter `alter_database_tables=False`: - # SQLite DB - engine_sqlite = create_engine("sqlite:///path/to/sqlite/database.db") - # postgreSQL DB - engine_postgres = create_engine("postgresql+psycopg2://open-mastr:open-mastr-pw@localhost:55443/open-mastr-db") - db = Mastr(engine=engine_sqlite) +```python +# Download MaStR data into your custom tables. +mastr.download( + mastr_table_to_db_table=mastr_table_to_your_custom_db_table, + alter_database_tables=False, +) ``` ### Project directory @@ -37,8 +86,7 @@ You can change this default path, see [environment variables](#environment-varia Default config files are copied to this directory which can be modified - but with caution. The project home directory is structured as follows (files and folders below `data/` just an example). -```bash - +``` .open-MaStR/ ├── config │   ├── credentials.cfg @@ -46,15 +94,16 @@ The project home directory is structured as follows (files and folders below `da │   ├── logging.yml ├── data │   ├── dataversion- + │ ├── docs_download + │ │ └── Dokumentation MaStR Gesamtdatenexport_.zip │   ├── sqlite - │      └── open-mastr.db - └── xml_download - └── Gesamtdatenexport_.zip + │   │   └── open-mastr.db + │ └── xml_download + │ └── Gesamtdatenexport_.zip └── logs └── open_mastr.log ``` - * **config** * `credentials.cfg`
Credentials used to access @@ -69,15 +118,15 @@ The project home directory is structured as follows (files and folders below `da Contains exported data as csv files from method [`to_csv`][open_mastr.Mastr.to_csv] * `sqlite`
Contains the sqlite database in `open-mastr.db` + * `docs_download`
+ Contains the documentation of the MaStR download. * `xml_download`
Contains the bulk download in `Gesamtdatenexport_.zip`
New bulk download versions overwrite older versions. * **logs** - * `open_mastr.log`
+ * `open_mastr.log`
The files stores the logging information from executing open-mastr. - - ### Logs For the download via the API, logs are stored in a single file in `/$HOME//.open-MaStR/logs/open_mastr.log`. @@ -87,7 +136,6 @@ By default, the log level is set to `INFO`. You can increase or decrease the ver or adjusting it manually in your code. E.g. to enable `DEBUG` messages in `open_mastr.log` you can use the following snippet: ```python - import logging from open_mastr import Mastr @@ -96,7 +144,6 @@ or adjusting it manually in your code. E.g. to enable `DEBUG` messages in `open_ logging.getLogger("open-MaStR").setLevel(logging.DEBUG) ``` - ### Data If the zipped dump of the MaStR is downloaded, it is saved in the folder `$HOME/.open-MaStR/data/xml_download`. @@ -122,8 +169,8 @@ There are some environment variables to customize open-MaStR: ## Bulk download On the homepage [MaStR/Datendownload](https://www.marktstammdatenregister.de/MaStR/Datendownload) a zipped folder containing the whole -MaStR is offered. The data is delivered as xml-files. The official documentation can be found -on the same page (in german). This data is updated on a daily base. +MaStR is offered. The data is delivered as XML files. The official documentation can be found +on the same page (in German). This data is updated on a daily basis. 
``` mermaid flowchart LR @@ -132,9 +179,8 @@ flowchart LR id2 --> id3[("📗 open-mastr database")] id3 --> id4("🔧 Decode and cleanse data") id4 --> id3 - id3 --> id5("Merge corresponding tables - and save as csv") - id5 --> id6>"📜 open-mastr csv files"] + id3 --> id5("Export to CSV") + id5 --> id6>"📜 open-mastr CSV files"] click id1 "https://www.marktstammdatenregister.de/MaStR/Datendownload" _blank click id2 "https://github.com/OpenEnergyPlatform/open-MaStR/blob/7b155a9ebdd5204de8ae6ba7a96036775a1f4aec/open_mastr/xml_download/utils_write_to_database.py#L17C6-L17C6" _blank click id4 "https://github.com/OpenEnergyPlatform/open-MaStR/blob/7b155a9ebdd5204de8ae6ba7a96036775a1f4aec/open_mastr/xml_download/utils_cleansing_bulk.py#L10" _blank @@ -143,17 +189,24 @@ flowchart LR ``` -In the following, the process is described that is started when calling the [`Mastr.download`][open_mastr.Mastr.download] function with the parameter `method`="bulk". -First, the zipped files are downloaded and saved in `$HOME/.open-MaStR/data/xml_download`. The zipped folder contains many xml files, -which represent the different tables from the MaStR. Those tables are then parsed to a sqlite database. If only some specific -tables are of interest, they can be specified with the parameter `data`. Every table that is selected in `data` will be deleted from the local database, if existent, and then filled with data from the xml files. +In the following, the process is described that is started when calling the [`Mastr.download`][open_mastr.Mastr.download] without parameters. +First, the zipped documentation is downloaded and saved in `$HOME/.open-MaStR/data/docs_download`. The zipped documentation contains +XSD files that describe the MaStR XML files that contain the data. open-mastr reads the XSD files and generates a database schema for +importing the data. I.e., for each MaStR table, it defines a database table and then creates it in a SQLite database. 
+ +Then, the zipped files are downloaded and saved in `$HOME/.open-MaStR/data/xml_download`. The zipped folder contains +many XML files, which represent the different tables from the MaStR. Those XML files are then read and imported into the +previously created SQLite database tables. + +If only some specific tables are of interest, they can be specified with the parameter `data`. Every table that is +selected in `data` will be deleted from the local database, if existent, and then filled with data from the xml files. In the next step, a basic data cleansing is performed. Many entries in the MaStR from the bulk download are replaced by numbers. -As an example, instead of writing the german states where the unit is registered (Saxony, Brandenburg, Bavaria, ...) the MaStR states +As an example, instead of writing the German states where the unit is registered (Saxony, Brandenburg, Bavaria, ...) the MaStR states corresponding digits (7, 2, 9, ...). One major step of cleansing is therefore to replace those digits with their original meaning. Moreover, the datatypes of different entries are set in the data cleansing process and corrupted files are repaired. -If needed, the tables in the database can be obtained as csv files. Those files are created by first merging corresponding tables (e.g all tables that contain information about solar) and then dumping those tables to `.csv` files with the [`to_csv`][open_mastr.Mastr.to_csv] method. +The tables in the database can be exported to CSV files using the [`to_csv`][open_mastr.Mastr.to_csv] method. **Note**: By default, existing zip files in `$HOME/.open-MaStR/data/xml_download` are deleted when a new file is downloaded. 
You can change this behavior by setting `keep_old_downloads`=True in diff --git a/docs/dataset.md b/docs/dataset.md index 3cf4a0af..c6860e7c 100644 --- a/docs/dataset.md +++ b/docs/dataset.md @@ -19,12 +19,6 @@ The German Federal Network Agency regularly updates the dataset and adds new tab As you may have noticed, we distinguish between `bulk` and `API` datasets. The `bulk` dataset refers to the data obtained from the zipped XML files downloaded from [here](https://www.marktstammdatenregister.de/MaStR/Datendownload) using the [`Mastr.download`][open_mastr.Mastr.download] function. The `API` data is obtained by requesting information via the SOAP-API and the [`soap_api.download.MaStRAPI`][open_mastr.soap_api.download.MaStRAPI] module. -??? question "Why is the table structure in the open-mastr database as it is?" - - The structure of the database is historically determined by the data retrieved via API. (open-mastr existed before the XML-dump was provided). -
See [MaStR data model](#mastr-data-model) - - ## Tables in the database !!! question "Confused by all the tables?" @@ -34,59 +28,77 @@ After downloading the MaStR, you will find a database with a large number of tab ### Tables in the local database - -=== "_extended tables" - The main information about the different technologies lies in the `_extended` tables. You can find the capacity, location, and other technology-specific attributes here. - - | Table name | Comments | - |------|------| - | biomass_extended | | - | combustion_extended | *Conventional powerplants: Gas, Oil, Coal, ...* | - | gsgk_extended | *gsgk is short for: Geothermal, Mine gas, and Pressure relaxation* | - | hydro_extended | | - | nuclear_extended | | - | solar_extended | | - | storage_extended | | - | wind_extended | | - -=== "_eeg tables" - In germany, renewable energies were subsidized by the state - according to a law called 'EEG'. Relevant information like the 'EEG ID' are in the `_eeg` tables. - - | Table name | Comments | - |------|------| - | biomass_eeg | | - | gsgk_eeg | *gsgk is short for: Geothermal, Mine gas, and Pressure relaxation* | - | hydro_eeg | | - | solar_eeg | | - | storage_eeg | | - | wind_eeg | | +=== "Units related to electric power" + The main information about power plants producing power/gas and other units is in tables prefixed with + "Einheiten"/"units". You can find the capacity, location, and other technology-specific attributes here. 
+ + | Original German name | English name | Comments | + |------|------|------| + | EinheitenBiomasse | units_biomass | Biomass combustion power plants | + | EinheitenGeothermieGrubengasDruckentspannung | units_gsgk | Geothermal, mine gas and pressure relaxation units | + | EinheitenKernkraft | units_nuclear | Nuclear power plants | + | EinheitenSolar | units_solar | Solar power plants | + | EinheitenStromSpeicher | units_electricity_storage | Electric power storage units | + | EinheitenStromVerbraucher | units_electricity_consumers | *Large* electric power consumers | + | EinheitenVerbrennung | units_combustion | Conventional combustion power plants: gas, oil, coal, … | + | EinheitenWasser | units_hydro | Hydroelectric power plants | + | EinheitenWind | units_wind | Wind power plants | + +=== "Units related to gas" + The tables prefixed with "EinheitenGas"/"units_gas" refer to units related to gas. + + | Original German name | English name | Comments | + |------|------|------| + | EinheitenGasErzeuger | units_gas_producers | Gas production units (natural gas extraction, biomethane production, …)| + | EinheitenGasSpeicher | units_gas_storage | Gas storage units | + | EinheitenGasverbraucher | units_gas_consumers | *Large* gas consumers | + +=== "Groups of units" + Tables prefixed with "Anlagen"/"installations" define groups of units; they have a column called + "VerknuepfteEinheitenMastrNummern"/"linkedUnitsMastrNumbers", which you can use to look up the connected units. + + Some of them are special subsidy groups: In Germany, renewable energies as well as combined heat and power (CHP/KWK) + plants are subsidized by the state according to laws called 'EEG' (for renewable energies) and 'KWK' (for CHP + plants). These tables contain information about the subsidies such as the 'EEG ID'. 
+ + | Original German name | English name | Comments | + |------|------|------| + | AnlagenEegBiomasse | installations_eeg_biomass | | + | AnlagenEegGeothermieGrubengasDruckentspannung | installations_eeg_gsgk | | + | AnlagenEegSolar | installations_eeg_solar | | + | AnlagenEegSpeicher | installations_eeg_storage | | + | AnlagenEegWasser | installations_eeg_hydro | | + | AnlagenEegWind | installations_eeg_wind | | + | AnlagenGasSpeicher | installations_gas_storage | | + | AnlagenKwk | installations_kwk | | + | AnlagenStromSpeicher | installations_electricity_storage | | === "Other tables" - Other tables contain information about the grid, the energy market, or gas consumers and producers: - - | Table name | Comments | - |------|------| - | balancing_area | *Related to the energy market* | - | changed_dso_assignment | *Units where the DSO responsibility changed* | - | electricity_consumer | *Only large consumers* | - | gas_consumer | *Only large consumers* | - | gas_producer | | - | gas_storage | | - | gas_storage_extended | | - | grid_connections | *Does not contain geoinformation* | - | grids | *Does not contain geoinformation* | - | locations_extended | *Connects units with grids - to get coordinates of units use the _extended tables*| - | market_actors | | - | market_actors_and_roles | | - | permit | | - | storage_units | | - | kwk | *short for: Combined heat and power (CHP)* | - | deleted_units | Units from all technologies that were deleted or deactivated | - | deleted_market_actors | Market actors that were deleted or deactivated | - + Other tables contain information about the grid, the energy market and changes to units and market actors. 
+ + | Original German name | English name | Comments | + |------|------|------| + | Bilanzierungsgebiete | balancing_areas | Balancing areas | + | EinheitenAenderungNetzbetreiberzuordnungen | changes_dso_assignment | Changes of DSO assignment of units | + | EinheitenGenehmigung | permits | Unit permits | + | Einheitentypen | unit_types | Meta information about unit types. **Not imported by open-mastr** | + | Ertuechtigungen | retrofits | Retrofits of units | + | GeloeschteUndDeaktivierteEinheiten | deleted_and_deactivated_units | Deleted & deactivated units | + | GeloeschteUndDeaktivierteMarktakteure | deleted_and_deactivated_market_actors | Deleted & deactivated market actors | + | Katalogkategorien | catalog_categories | Meta information about MaStR values. **Not imported by open-mastr** | + | Katalogwerte | catalog_values | Meta information about MaStR values. **Not imported by open-mastr** | + | Lokationen | locations | Connects units with grid connections | + | Lokationstypen | location_types | Meta information about location types. **Not imported by open-mastr** | + | Marktakteure | market_actors | Market actors | + | MarktakteureUndRollen | market_actors_and_roles | Roles filled by market actors | + | Marktfunktionen | market_functions | Meta information about market functions. **Not imported by open-mastr** | + | Marktrollen | market_roles | Meta information about market roles. **Not imported by open-mastr** | + | Netzanschlusspunkte | grid_connections | Connects locations with grids | + | Netze | grids | Grids | ### MaStR data model -A useful overview of the MaStR data model can be found at the MaStR [help page](https://www.marktstammdatenregister.de/MaStRHilfe/subpages/faq.html). A translated version using the names from the tables you can find in your local database is presented here: + +A useful overview of the MaStR data model can be found at the MaStR [help page](https://www.marktstammdatenregister.de/MaStRHilfe/subpages/faq.html). 
A translated version using the names from the tables you can find in your local database is presented here: === "translated image (english)" ![Data model of the MaStR](images/DetailAnlagen_english.PNG) @@ -97,8 +109,10 @@ A useful overview of the MaStR data model can be found at the MaStR [help page]( ## Tables as CSV -Tables from the database can be exported to csv files. By default, all available power plant unit data will be exported -to csv files. +Tables from the database can be exported to CSV files. By default, all available power plant unit data will be exported +to csv files. + +!!! warning "Joining of tables for CSV export has been removed" + In versions > `v1.0.0`, the database tables are exported to CSV as they are. Joins between unit, CHP/EEG data and permits are not done anymore. -For exported csv's additional available data is joined on basic unit data. For example: For biomass power plants one csv -is exported consisting of the join of four database tables (unit data, chp data, permit data, eeg data). We regularly run the whole download and cleansing pipeline and upload the dataset as csv files at [zenodo](https://doi.org/10.5281/zenodo.6807425). +We occasionally run the whole download and cleansing pipeline and upload the dataset as csv files at [zenodo](https://doi.org/10.5281/zenodo.6807425). diff --git a/docs/getting_started.md b/docs/getting_started.md index 0c17fcb4..877316ca 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -18,11 +18,7 @@ pip install open-mastr --upgrade ## Downloading the MaStR data -The MaStR dataset is updated on a daily basis. To download todays MaStR and save it in a sqlite database, you will use the [`Mastr`][open_mastr.Mastr] class and its [`download`][open_mastr.Mastr.download] method. The [`download`][open_mastr.Mastr.download] method offers two different ways to get the data by changing the `method` parameter (if not specified, `method` defaults to "bulk"): - -1. 
`method` = "bulk": Get data via the bulk download from [MaStR/Datendownload](https://www.marktstammdatenregister.de/MaStR/Datendownload). Use this if you want to download the whole dataset (few Gigabite) or if you want to download all units of a given technology (e.g. all wind turbines in Germany). -2. `method` = "API": Get data via the MaStR SOAP-API. Use this if you want specific information about single units and if you have registerd to get an API token. - +The MaStR dataset is updated on a daily basis. To download today's MaStR and save it in a sqlite database, you will use the [`Mastr`][open_mastr.Mastr] class and its [`download`][open_mastr.Mastr.download] method. This will get the bulk download from [MaStR/Datendownload](https://www.marktstammdatenregister.de/MaStR/Datendownload). You can either download the whole dataset (a few GB) or all units of a given technology (e.g. all wind turbines in Germany). ### Bulk download @@ -86,4 +82,4 @@ additional tables are mirrored from database to csv as they are. To export the d tables=["wind", "grids"] db.to_csv(tables) -``` \ No newline at end of file +``` diff --git a/docs/index.md b/docs/index.md index 698e224a..c57695bc 100644 --- a/docs/index.md +++ b/docs/index.md @@ -21,7 +21,7 @@ In particular, `open-mastr` facilitates access to the daily provided MaStR dumps | ------------------------- | ---------------------------------------------------------------------------------------------------------------- | | Data download and parsing | Download, decode, and write data to a local database | | Translation to English | Translate table names and columns from German to English as well as an English documentation page of the dataset | -| Data processing | Merge relevant information about different technologies to single csv files | +| CSV export | Export downloaded data as CSV | !!! question "Just here for the data?" 
:sparkles: We regularly run the whole download and cleansing pipeline and upload the dataset as csv files at [zenodo](https://doi.org/10.5281/zenodo.6807425)! @@ -29,4 +29,4 @@ In particular, `open-mastr` facilitates access to the daily provided MaStR dumps ## License The original dataset is licensed under the **Datenlizenz Deutschland – Namensnennung – Version 2.0** (DL-DE-BY-2.0) -[Marktstammdatenregister](https://www.marktstammdatenregister.de/MaStR) - © Bundesnetzagentur für Elektrizität, Gas, Telekommunikation, Post und Eisenbahnen | [DL-DE-BY-2.0](https://www.govdata.de/dl-de/by-2-0) \ No newline at end of file +[Marktstammdatenregister](https://www.marktstammdatenregister.de/MaStR) - © Bundesnetzagentur für Elektrizität, Gas, Telekommunikation, Post und Eisenbahnen | [DL-DE-BY-2.0](https://www.govdata.de/dl-de/by-2-0) diff --git a/open_mastr/__init__.py b/open_mastr/__init__.py index 819ebc17..0a400501 100644 --- a/open_mastr/__init__.py +++ b/open_mastr/__init__.py @@ -4,3 +4,4 @@ # This import should be called after the setup project home, since .open_mastr folder must be defined first from .mastr import Mastr # noqa: E402 ignore import order warning of flake8 +from .utils.sqlalchemy_tables import format_mastr_table_to_db_table # noqa: E402 diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index fbb382db..66eb6f73 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -1,51 +1,59 @@ import os -from sqlalchemy import inspect, create_engine +from pathlib import Path +from typing import Any, Literal, Optional, Union +from collections.abc import Iterable, Mapping + +import pandas as pd +from sqlalchemy import inspect, Engine, Table, MetaData # import xml dependencies from open_mastr.xml_download.utils_download_bulk import ( + download_documentation, download_xml_Mastr, select_download_date, delete_xml_files_not_from_given_date, + list_available_downloads, + get_date_from_docs_url, ) from open_mastr.xml_download.utils_write_to_database import ( 
write_mastr_xml_to_database, ) +from open_mastr.utils.xsd_tables import ( + read_mastr_table_descriptions_from_xsd, +) from open_mastr.utils.helpers import ( validate_parameter_format_for_download_method, validate_parameter_format_for_mastr_init, - validate_parameter_data, transform_data_parameter, parse_date_string, transform_date_parameter, - data_to_include_tables, - create_db_query, - db_query_to_csv, - reverse_fill_basic_units, delete_zip_file_if_corrupted, create_database_engine, - rename_table, - create_translated_database_engine, ) from open_mastr.utils.config import ( - create_data_dir, get_data_version_dir, - get_project_home_dir, get_output_dir, setup_logger, ) -import open_mastr.utils.orm as orm +from open_mastr.utils.sqlalchemy_tables import ( + make_sqlalchemy_table_from_mastr_table_description, +) +from open_mastr.utils.sqlalchemy_views import create_views -# constants -from open_mastr.utils.constants import TECHNOLOGIES, ADDITIONAL_TABLES # setup logger log = setup_logger() +FALLBACK_DOCS_PATH = ( + Path(__file__).parent + / "resources" + / "Dokumentation-MaStR-Gesamtdatenexport-20260216-Fallback.zip" +) + class Mastr: - """ - `Mastr` is used to download the MaStR database and keep it up-to-date. + """`Mastr` is used to download the MaStR database. An SQL database is used to mirror the MaStR database. It is filled by downloading and parsing the MaStR via bulk download. @@ -64,29 +72,24 @@ class Mastr: engine : {'sqlite', sqlalchemy.engine.Engine}, optional Defines the engine of the database where the MaStR is mirrored to. Default is 'sqlite'. - connect_to_translated_db: boolean, optional - Allows connection to an existing translated database. Default is 'False'. - Only for 'sqlite'-type engines. 
- - - + output_dir : Top-level directory of produced output (downloaded files, created database) + Default is the content of env var OUTPUT_PATH, else ~/.open-MaStR """ - def __init__(self, engine="sqlite", connect_to_translated_db=False) -> None: + def __init__( + self, + engine: Union[Engine, Literal["sqlite"]] = "sqlite", + output_dir: Optional[Union[str, Path]] = None, + ) -> None: validate_parameter_format_for_mastr_init(engine) - self.output_dir = get_output_dir() - self.home_directory = get_project_home_dir() + self.output_dir = output_dir or get_output_dir() + self._sqlite_folder_path = os.path.join(self.output_dir, "data", "sqlite") + os.makedirs(self._sqlite_folder_path, exist_ok=True) - self.is_translated = connect_to_translated_db - if connect_to_translated_db: - self.engine = create_translated_database_engine( - engine, self._sqlite_folder_path - ) - else: - self.engine = create_database_engine(engine, self._sqlite_folder_path) + self.engine = create_database_engine(engine, self._sqlite_folder_path) log.info( "\n==================================================\n" @@ -94,24 +97,131 @@ def __init__(self, engine="sqlite", connect_to_translated_db=False) -> None: "==================================================\n" f"Data will be written to the following database: {self.engine.url}\n" "If you run into problems, try to " - "delete the database and update the package by running " + "delete the open-mastr directory & database and update the package by running " "'pip install --upgrade open-mastr'\n" ) - orm.Base.metadata.create_all(self.engine) + def generate_data_model( + self, + data: Optional[list[str]] = None, + date: Optional[str] = None, + url: Optional[str] = None, + metadata: Optional[MetaData] = None, + catalog_value_as_str: bool = True, + english: bool = False, + ) -> dict[str, Table]: + """Generate data model from MaStR documentation. 
+ + Download the MaStR documentation, extract the XSD files that describe the format of the + MaStR XML files, and generate SQLAlchemy tables from those XSD files. The tables are not + created in the database. + + You can use this method to create a mapping from the MaStR data model to your own data model. + This mapping should then be passed to the `Mastr.download` method. + + !!! example + + ```python + from open_mastr import Mastr, format_mastr_table_to_db_table + db = Mastr() + mapping = db.generate_data_model() + # Print the tables so that you can see what was generated. + print(format_mastr_table_to_db_table(mapping)) + # edit your mapping here + db.download(mastr_table_to_db_table=mapping) + ``` + + Parameters + ---------- + data : str or list or None, optional + Specifies which tables to generate. See the `download` method for details. + + date : None or `datetime.datetime` or str, optional + Specifies date for the docs download. See the `download` method for details. + + url : str or None, optional + The URL to download the MaStR documentation from. If this is given, the `date` parameter + is not used. + + metadata : SQLAlchemy MetaData or None, optional + SQLAlchemy MetaData object to use for the tables. If not given, a new MetaData object + is created. + + catalog_value_as_str : bool, optional + If set to True, columns which contain values from the MaStR catalog (Katalogwerte) are + generated as string/VARCHAR columns. If set to False, they are generated as int columns. + This should usually be set to True if you want to use the `bulk_cleansing` option of + the `download` method to convert the catalog IDs to their values. + Defaults to True. + + english: bool, optional + If set to True, table and column names are translated from their MaStR name to an + English name if open-mastr already has a translation stored for that table/column. + + The English name and original MaStR name are always stored in the table's/column's + `info` attribute. 
+ + Defaults to False. + + Returns + ------- + dict from str to SQLAlchemy Table + Dict mapping original MaStR table name to SQLAlchemy table + Example: {"EinheitenWind": Table(...), "EinheitenSolar": Table(...), ...} + """ + data = transform_data_parameter(data) + date = parse_date_string(transform_date_parameter(date)) + if url: + # This is awkward. We want to give the option to call this function with just a URL. + # But in our download file path, we want to have the date. So we need to get the date + # from the URL now. + if parsed_date := get_date_from_docs_url(url): + date = parsed_date + + docs_folder_path = os.path.join(self.output_dir, "data", "docs_download") + os.makedirs(docs_folder_path, exist_ok=True) + zipped_docs_file_path = os.path.join( + docs_folder_path, f"Dokumentation MaStR Gesamtdatenexport_{date}.zip" + ) + try: + download_documentation( + zipped_docs_file_path, bulk_date_string=date, url=url + ) + return _generate_data_model_from_downloaded_docs( + zipped_docs_file_path=zipped_docs_file_path, + data=data, + catalog_value_as_str=catalog_value_as_str, + metadata=metadata, + english=english, + ) + except Exception as e: + log.exception( + f"Encountered {e!r} when downloading or processing MaStR documentation." 
+ f" Falling back to stored docs at {FALLBACK_DOCS_PATH}" + ) + return _generate_data_model_from_downloaded_docs( + zipped_docs_file_path=FALLBACK_DOCS_PATH, + data=data, + catalog_value_as_str=catalog_value_as_str, + metadata=metadata, + english=english, + ) def download( self, - method="bulk", + method: Literal["bulk"] = "bulk", data=None, date=None, - bulk_cleansing=True, - keep_old_downloads: bool = False, select_date_interactively: bool = False, - **kwargs, + bulk_cleansing: bool = True, + keep_old_downloads: bool = False, + mastr_table_to_db_table: Optional[Mapping[str, Table]] = None, + alter_database_tables: bool = True, + english: bool = False, + add_views_for_old_table_names: bool = True, + **kwargs: Any, ) -> None: - """ - Downloads the MaStR registry and writes it to a local database. + """Download the MaStR registry and write it to a local database. Parameters ---------- @@ -151,34 +261,66 @@ def download( date : None or `datetime.datetime` or str, optional - | date | description | - |-----------------------|------| - | "today" | latest files are downloaded from marktstammdatenregister.de | - | "20230101" | If file from this date exists locally, it is used. Otherwise it throws an error (You can only receive todays data from the server) | - | "existing" | Deprecated since 0.16, see [#616](https://github.com/OpenEnergyPlatform/open-MaStR/issues/616#issuecomment-3089377062) | - | None | set date="today" | + | date | description | + |------------|-------------| + | "20230101" | If file from this date exists locally, it is used. Otherwise, it tries to get it from marktstammdatenregister.de | + | "today" | Shorthand for specifying today's date in YYYYMMDD format | + | None | set date="today" | + | "existing" | Deprecated since 0.16, see [#616](https://github.com/OpenEnergyPlatform/open-MaStR/issues/616#issuecomment-3089377062) | + + Defaults to `None`. - Default to `None`. 
select_date_interactively : bool, optional If set to True, the user will be presented with a list of available download dates from the MaStR website and can interactively select which date to download. This allows downloading historical data instead of just the latest available data. When True, the `date` parameter is ignored. Defaults to False. + bulk_cleansing : bool, optional If set to True, data cleansing is applied after the download (which is recommended). In its original format, many entries in the MaStR are encoded with IDs. Columns like `state` or `fueltype` do not contain entries such as "Hessen" or "Braunkohle", but instead only contain IDs. Cleansing replaces these IDs with their corresponding original entries. - keep_old_downloads: bool - If set to True, prior downloaded MaStR zip files will be kept. - """ + Defaults to True. - if self.is_translated: - raise TypeError( - "You are currently connected to a translated database.\n" - "A translated database cannot be further processed." - ) + keep_old_downloads : bool, optional + If set to True, prior downloaded MaStR zip files will be kept. Defaults to False. + + mastr_table_to_db_table : Mapping from MaStR table name (str) to SQLAlchemy Table, or None, optional + If given, downloaded data from a MaStR file will be stored in the SQLAlchemy table + associated with that file. The tables must exist already; they are not created. + Example: {"EinheitenWind": Table(...), "EinheitenSolar": Table(...), ...} + + If None / not given, the mapping will be generated by calling + `Mastr.generate_data_model` (and thus downloading the MaStR documentation). In this + case, the tables will be created. + + You can pass this parameter to use your own mapping. + + Defaults to None. + + alter_database_tables : bool, optional + If set to True, ALTER statements to add database columns will be issued if there are + fields in the downloaded XML that are not yet present in the database tables that have + been generated or given. 
If set to False and such unexpected fields are found, those + fields are not imported. + + Defaults to True. + + english : bool, optional + If set to True and no `mastr_table_to_db_table` mapping is given, the generated tables + will have English names and English columns. (Some untranslated German names may remain + if they haven't been added to open-mastr's translation info yet.) + + Defaults to False. + add_views_for_old_table_names : bool, optional + If set to True, database views will be generated for tables renamed in version 1.0 so that + the previous table names still work. Only has an effect if mastr_table_to_db_table is not + given. + + Defaults to True. + """ if method == "API": log.warning( "Downloading the whole registry via the MaStR SOAP-API has been removed. " @@ -195,29 +337,52 @@ def download( bulk_cleansing=bulk_cleansing, **kwargs, ) - data = transform_data_parameter(data, **kwargs) - date = transform_date_parameter(self, date, **kwargs) + date = transform_date_parameter(date, **kwargs) # Handle interactive date selection if requested if select_date_interactively: log.info( "Interactive date selection enabled. Fetching available downloads..." 
) - selected_date, selected_url = select_download_date() + selected_link = select_download_date() - if selected_date is None: - log.info("Download cancelled by user.") + if selected_link is None: + log.info("Download cancelled by user or no download links found.") return # Update the date and use the selected URL - date = selected_date - bulk_download_date = selected_date - custom_url = selected_url + bulk_download_date = selected_link["date"] + custom_xml_url = selected_link["url"] + custom_docs_url = selected_link["docs_url"] else: # Find the name of the zipped xml folder bulk_download_date = parse_date_string(date) - custom_url = None + custom_xml_url = None + custom_docs_url = None + + if not mastr_table_to_db_table: + mastr_table_to_db_table = self.generate_data_model( + data=data, + date=bulk_download_date, + url=custom_docs_url, + catalog_value_as_str=bulk_cleansing, + english=english, + ) + log.info( + "Ensuring database tables for MaStR are present:" + " Dropping old tables if existing and creating new ones." 
+ ) + for db_table in mastr_table_to_db_table.values(): + db_table.drop(self.engine, checkfirst=True) + db_table.create(self.engine) + + if add_views_for_old_table_names: + create_views( + engine=self.engine, mastr_table_to_db_table=mastr_table_to_db_table + ) + + data = transform_data_parameter(data, **kwargs) xml_folder_path = os.path.join(self.output_dir, "data", "xml_download") os.makedirs(xml_folder_path, exist_ok=True) @@ -231,7 +396,11 @@ def download( delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) download_xml_Mastr( - zipped_xml_file_path, bulk_download_date, data, xml_folder_path, custom_url + zipped_xml_file_path, + bulk_download_date, + data, + xml_folder_path, + custom_xml_url, ) log.info( "\nWould you like to speed up the creation of your MaStR database?\n" @@ -245,151 +414,56 @@ def download( data=data, bulk_cleansing=bulk_cleansing, bulk_download_date=bulk_download_date, + mastr_table_to_db_table=mastr_table_to_db_table, + alter_database_tables=alter_database_tables, ) def to_csv( - self, tables: list = None, chunksize: int = 500000, limit: int = None + self, + db_table_names: Iterable[str] = None, + chunksize: int = 500000, ) -> None: - """ - Save the database as csv files along with the metadata file. - If 'tables=None' all possible tables will be exported. + """Export tables from existing database to CSV. Parameters - ------------ - tables: None or list - For exporting selected tables choose from: - ["wind", "solar", "biomass", "hydro", "gsgk", "combustion", "nuclear", "storage", - "balancing_area", "electricity_consumer", "gas_consumer", "gas_producer", - "gas_storage", "gas_storage_extended", - "grid_connections", "grids", "market_actors", "market_actors_and_roles", - "locations_extended", "permit", "deleted_units", "storage_units"] - chunksize: int - Defines the chunksize of the tables export. - Default value is 500.000 rows to include in each chunk. - limit: None or int - Limits the number of exported data rows. 
- """ - - if self.is_translated: - raise TypeError( - "You are currently connected to a translated database.\n" - "A translated database cannot be used for the csv export." - ) - - log.info("Starting csv-export") - - data_path = get_data_version_dir() - - create_data_dir() - - # Validate and parse tables parameter - validate_parameter_data(method="csv_export", data=tables) - data = transform_data_parameter(tables, **kwargs) - - # Determine tables to export - technologies_to_export = [] - additional_tables_to_export = [] - for table in data: - if table in TECHNOLOGIES: - technologies_to_export.append(table) - elif table in ADDITIONAL_TABLES: - additional_tables_to_export.append(table) - else: - additional_tables_to_export.extend( - data_to_include_tables([table], mapping="export_db_tables") - ) - - if technologies_to_export: - log.info(f"Technology tables: {technologies_to_export}") - if additional_tables_to_export: - log.info(f"Additional tables: {additional_tables_to_export}") - - log.info(f"Tables are saved to: {data_path}") - - reverse_fill_basic_units(technology=technologies_to_export, engine=self.engine) - - # Export technologies to csv - for tech in technologies_to_export: - db_query_to_csv( - db_query=create_db_query(tech=tech, limit=limit, engine=self.engine), - data_table=tech, - chunksize=chunksize, - ) - # Export additional tables to csv - for addit_table in additional_tables_to_export: - db_query_to_csv( - db_query=create_db_query( - additional_table=addit_table, limit=limit, engine=self.engine - ), - data_table=addit_table, - chunksize=chunksize, - ) - - # FIXME: Currently metadata is only created for technology data, Fix in #386 - # Configure and save data package metadata file along with data - # save_metadata(data=technologies_to_export, engine=self.engine) - - def translate(self) -> None: - """ - A database can be translated only once. - - Deletes translated versions of the currently connected database. 
- - Translates currently connected database,renames it with '-translated' - suffix and updates self.engine's path accordingly. - - !!! example - ```python - - from open_mastr import Mastr - import pandas as pd - - db = Mastr() - db.download(data='biomass') - db.translate() - - df = pd.read_sql(sql='biomass_extended', con=db.engine) - print(df.head(10)) - ``` + ---------- + db_table_names : Iterable of str or None, optional + The names of the database tables to export. If None, all tables in the database will be + exported. Defaults to None. + chunksize : int, optional + Number of rows to retrieve from the database before dumping them to the CSV file. + Defaults to 500000. """ - - if "sqlite" not in self.engine.dialect.name: - raise ValueError("engine has to be of type 'sqlite'") - if self.is_translated: - raise TypeError("The currently connected database is already translated.") + data_path = get_data_version_dir() + os.makedirs(data_path, exist_ok=True) inspector = inspect(self.engine) - old_path = r"{}".format(self.engine.url.database) - new_path = old_path[:-3] + "-translated.db" - - if os.path.exists(new_path): - try: - os.remove(new_path) - except Exception as e: - log.error( - f"An error occurred while removing old translated database: {e}" - ) - - log.info("Replacing previous version of the translated database...") - - for table in inspector.get_table_names(): - rename_table(table, inspector.get_columns(table), self.engine) - - self.engine.dispose() - - try: - os.rename(old_path, new_path) - log.info(f"Database '{old_path}' changed to '{new_path}'") - except Exception as e: - log.error(f"An error occurred while renaming database: {e}") - - self.engine = create_engine(f"sqlite:///{new_path}") - self.is_translated = True + existing_table_names = set(inspector.get_table_names()) + if db_table_names is None: + db_table_names = existing_table_names - def browse_available_downloads(self): - """ - Browse available MaStR downloads from the website without starting 
the download. + log.info( + f"Exporting the following database tables to CSV: {', '.join(db_table_names)}" + ) + with self.engine.connect() as conn: + for requested_table_name in db_table_names: + if requested_table_name not in existing_table_names: + log.warning( + f"Table {requested_table_name} does not exist. Skipping." + ) + continue + csv_path = os.path.join(data_path, f"{requested_table_name}.csv") + if os.path.exists(csv_path): + log.info(f"Deleting existing file {csv_path}") + os.unlink(csv_path) + for i, chunk in enumerate( + pd.read_sql_table(requested_table_name, conn, chunksize=chunksize) + ): + chunk.to_csv(csv_path, mode="a", index=False, header=i == 0) + + def browse_available_downloads(self) -> list[dict[str, Optional[str]]]: + """Browse available MaStR downloads from the website without starting the download. This method fetches and displays all available download dates from the MaStR website, allowing users to see what historical data is available before deciding to download. @@ -404,10 +478,44 @@ def browse_available_downloads(self): >>> from open_mastr import Mastr >>> db = Mastr() >>> available_downloads = db.browse_available_downloads() - >>> # User can then choose a date and download with: - >>> # db.download(select_date_interactively=True) """ - from open_mastr.xml_download.utils_download_bulk import list_available_downloads - log.info("Browsing available MaStR downloads...") return list_available_downloads() + + def translate(self) -> None: + """ + The translate method has been removed. You can use the `english` option + in the `Mastr.download` method to get English table and column names. + """ + raise NotImplementedError( + "The translate method has been removed. You can use the `english` option" + " in the `Mastr.download` method to get English table and column names." 
+ ) + + +def _generate_data_model_from_downloaded_docs( + zipped_docs_file_path: Path, + data: list[str], + catalog_value_as_str: bool = True, + metadata: Optional[MetaData] = None, + english: bool = False, +) -> dict[str, Table]: + if metadata is None: + metadata = MetaData() + + mastr_table_descriptions = read_mastr_table_descriptions_from_xsd( + zipped_docs_file_path=zipped_docs_file_path, data=data + ) + mastr_table_to_db_table = {} + for mastr_table_description in mastr_table_descriptions: + sqlalchemy_model = make_sqlalchemy_table_from_mastr_table_description( + table_description=mastr_table_description, + catalog_value_as_str=catalog_value_as_str, + metadata=metadata, + english=english, + ) + mastr_table_to_db_table[ + mastr_table_description.original_table_name + ] = sqlalchemy_model + + return mastr_table_to_db_table diff --git a/open_mastr/resources/Dokumentation-MaStR-Gesamtdatenexport-20260216-Fallback.zip b/open_mastr/resources/Dokumentation-MaStR-Gesamtdatenexport-20260216-Fallback.zip new file mode 100644 index 00000000..0e29b97f Binary files /dev/null and b/open_mastr/resources/Dokumentation-MaStR-Gesamtdatenexport-20260216-Fallback.zip differ diff --git a/open_mastr/soap_api/metadata/create.py b/open_mastr/soap_api/metadata/create.py index 87b74233..ea4939ac 100644 --- a/open_mastr/soap_api/metadata/create.py +++ b/open_mastr/soap_api/metadata/create.py @@ -1,11 +1,9 @@ import csv import datetime -import json import os import uuid -from open_mastr.soap_api.metadata.description import DataDescription -from open_mastr.utils.config import get_data_config, get_filenames, column_renaming +from open_mastr.utils.config import get_data_config, get_filenames # TODO: We should not describe the data in both metadata folder and orm.py diff --git a/open_mastr/soap_api/metadata/description.py b/open_mastr/soap_api/metadata/description.py index 728aec23..2a19926f 100644 --- a/open_mastr/soap_api/metadata/description.py +++ 
b/open_mastr/soap_api/metadata/description.py @@ -156,7 +156,7 @@ def functions_data_documentation(self): log.error(f"Unexpected sequence type: {type(fcn['sequence'])}") log.error(f"Sequence content: {fcn['sequence']}") raise ValueError( - f"Unexpected sequence structure in function metadata" + "Unexpected sequence structure in function metadata" ) # Add data for inherited columns from base types diff --git a/open_mastr/utils/config.py b/open_mastr/utils/config.py index 40f67ec8..77367d98 100644 --- a/open_mastr/utils/config.py +++ b/open_mastr/utils/config.py @@ -151,17 +151,6 @@ def create_project_home_dir(): ) -def create_data_dir(): - """ - Create direct for current data version - - The directory that is created for this fata version can - be returned by :func:`~.get_data_version_dir`. - """ - - os.makedirs(get_data_version_dir(), exist_ok=True) - - def _filenames_generator(): """Write default file names .yml to project home dir""" diff --git a/open_mastr/utils/config/logging.yml b/open_mastr/utils/config/logging.yml index c1b4c29b..b3d4db32 100644 --- a/open_mastr/utils/config/logging.yml +++ b/open_mastr/utils/config/logging.yml @@ -22,5 +22,9 @@ handlers: loggers: open-MaStR: level: "INFO" - handlers: ["console", "file"] - propagate: no + handlers: [] + propagate: yes + +root: + level: "INFO" + handlers: ["console", "file"] diff --git a/open_mastr/utils/constants.py b/open_mastr/utils/constants.py index 5059df5b..59000a22 100644 --- a/open_mastr/utils/constants.py +++ b/open_mastr/utils/constants.py @@ -48,7 +48,6 @@ "grids", "locations_extended", "market_actors", - "market_roles", "market_actors_and_roles", "permit", "deleted_units", @@ -118,86 +117,48 @@ "changed_dso_assignment": ["changed_dso_assignment"], } -# used to map the parameter options in open-mastr to the exact table class names in orm.py -ORM_MAP = { - "wind": { - "unit_data": "WindExtended", - "eeg_data": "WindEeg", - "permit_data": "Permit", - }, - "solar": { - "unit_data": "SolarExtended", - 
"eeg_data": "SolarEeg", - "permit_data": "Permit", - }, - "biomass": { - "unit_data": "BiomassExtended", - "eeg_data": "BiomassEeg", - "kwk_data": "Kwk", - "permit_data": "Permit", - }, - "combustion": { - "unit_data": "CombustionExtended", - "kwk_data": "Kwk", - "permit_data": "Permit", - }, - "gsgk": { - "unit_data": "GsgkExtended", - "eeg_data": "GsgkEeg", - "kwk_data": "Kwk", - "permit_data": "Permit", - }, - "hydro": { - "unit_data": "HydroExtended", - "eeg_data": "HydroEeg", - "permit_data": "Permit", - }, - "nuclear": {"unit_data": "NuclearExtended", "permit_data": "Permit"}, - "storage": { - "unit_data": "StorageExtended", - "eeg_data": "StorageEeg", - "permit_data": "Permit", - }, - "gas_consumer": "GasConsumer", - "gas_producer": "GasProducer", - "gas_storage": "GasStorage", - "gas_storage_extended": "GasStorageExtended", - "electricity_consumer": "ElectricityConsumer", - "locations_extended": "LocationExtended", - "market_actors": "MarketActors", - "market_roles": "MarketRoles", - "market_actors_and_roles": "MarketActorsAndRoles", - "grid_connections": "GridConnections", - "grids": "Grids", - "balancing_area": "BalancingArea", - "permit": "Permit", - "deleted_units": "DeletedUnits", - "deleted_market_actors": "DeletedMarketActors", - "retrofit_units": "RetrofitUnits", - "changed_dso_assignment": "ChangedDSOAssignment", - "storage_units": "StorageUnits", +TABLE_TRANSLATIONS = { + "AnlagenEegBiomasse": "installations_eeg_biomass", + "AnlagenEegGeothermieGrubengasDruckentspannung": "installations_eeg_gsgk", + "AnlagenEegSolar": "installations_eeg_solar", + "AnlagenEegSpeicher": "installations_eeg_storage", + "AnlagenEegWasser": "installations_eeg_hydro", + "AnlagenEegWind": "installations_eeg_wind", + "AnlagenGasSpeicher": "installations_gas_storage", + "AnlagenKwk": "installations_kwk", + "AnlagenStromSpeicher": "installations_electricity_storage", + "Bilanzierungsgebiete": "balancing_areas", + "EinheitenAenderungNetzbetreiberzuordnungen": 
"changes_dso_assignment", + "EinheitenBiomasse": "units_biomass", + "EinheitenGasErzeuger": "units_gas_producers", + "EinheitenGasSpeicher": "units_gas_storage", + "EinheitenGasverbraucher": "units_gas_consumers", + "EinheitenGenehmigung": "permits", + "EinheitenGeothermieGrubengasDruckentspannung": "units_gsgk", + "EinheitenKernkraft": "units_nuclear", + "EinheitenSolar": "units_solar", + "EinheitenStromSpeicher": "units_electricity_storage", + "EinheitenStromVerbraucher": "units_electricity_consumers", + "Einheitentypen": "unit_types", + "EinheitenVerbrennung": "units_combustion", + "EinheitenWasser": "units_hydro", + "EinheitenWind": "units_wind", + "Ertuechtigungen": "retrofits", + "GeloeschteUndDeaktivierteEinheiten": "deleted_and_deactivated_units", + "GeloeschteUndDeaktivierteMarktakteure": "deleted_and_deactivated_market_actors", + "Katalogkategorien": "catalog_categories", + "Katalogwerte": "catalog_values", + "Lokationen": "locations", + "Lokationstypen": "location_types", + "Marktakteure": "market_actors", + "MarktakteureUndRollen": "market_actors_and_roles", + "Marktfunktionen": "market_functions", + "Marktrollen": "market_roles", + "Netzanschlusspunkte": "grid_connections", + "Netze": "grids", } -UNIT_TYPE_MAP = { - "Windeinheit": "wind", - "Solareinheit": "solar", - "Biomasse": "biomass", - "Wasser": "hydro", - "Geothermie": "gsgk", - "Verbrennung": "combustion", - "Kernenergie": "nuclear", - "Stromspeichereinheit": "storage", - "Gasspeichereinheit": "gas_storage", - "Gasverbrauchseinheit": "gas_consumer", - "Stromverbrauchseinheit": "electricity_consumer", - "Gaserzeugungseinheit": "gas_producer", - "Stromerzeugungslokation": "location_elec_generation", - "Stromverbrauchslokation": "location_elec_consumption", - "Gaserzeugungslokation": "location_gas_generation", - "Gasverbrauchslokation": "location_gas_consumption", -} - -TRANSLATIONS = { +COLUMN_TRANSLATIONS = { "RegisternummerAusland": "foreignRegisterNumber", "PumpbetriebKontinuierlichRegelbar": 
"continuousControlOfPumpOperation", "AuflagenAbschaltungSonstige": "requirementShutdownOther", @@ -342,6 +303,7 @@ "LandAnZustelladresse": "countryForDeliveryAddress", "VerhaeltnisErtragsschaetzungReferenzertrag": "yieldEstimationReferenceYieldRatio", "VerknuepfteEinheiten": "linkedUnits", + "VerknuepfteEinheitenMastrNummern": "linkedUnitsMastrNumbers", "InanspruchnahmeZahlungNachEeg": "UseOfPaymentAccordingToEeg", "Marktgebiet": "marketArea", "Stromgrosshaendler": "electricityWholesaler", diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index f4361a0e..0ef8c7b5 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -1,66 +1,52 @@ import os -import json from contextlib import contextmanager -from datetime import date +import datetime from warnings import warn +from typing import Literal, Optional, Union from zipfile import BadZipfile, ZipFile +from zoneinfo import ZoneInfo import dateutil import sqlalchemy -from sqlalchemy.sql import insert, literal_column, text from dateutil.parser import parse from sqlalchemy import create_engine -from sqlalchemy.orm import Query, sessionmaker - -import pandas as pd -from tqdm import tqdm -from open_mastr.utils import orm -from open_mastr.utils.config import ( - get_filenames, - get_data_version_dir, - column_renaming, -) +from sqlalchemy.orm import sessionmaker + from open_mastr.soap_api.download import log from open_mastr.utils.constants import ( BULK_DATA, TECHNOLOGIES, BULK_INCLUDE_TABLES_MAP, - BULK_ADDITIONAL_TABLES_CSV_EXPORT_MAP, - ORM_MAP, - UNIT_TYPE_MAP, ADDITIONAL_TABLES, - TRANSLATIONS, ) - -def chunks(lst, n): - """Yield successive n-sized chunks from lst. 
- - `Credits - `_ - """ - length = lst.count() if isinstance(lst, Query) else len(lst) - for i in range(0, length, n): - yield lst[i : i + n] +MASTR_TIMEZONE = ZoneInfo("Europe/Berlin") -def create_database_engine(engine, sqlite_db_path) -> sqlalchemy.engine.Engine: - if engine == "sqlite": - sqlite_database_path = os.environ.get( - "SQLITE_DATABASE_PATH", - os.path.join(sqlite_db_path, "open-mastr.db"), +def create_database_engine( + engine: Union[Literal["sqlite"], sqlalchemy.engine.Engine], + sqlite_db_path: Optional[str], +) -> sqlalchemy.engine.Engine: + if isinstance(engine, sqlalchemy.engine.Engine): + return engine + if engine != "sqlite": + log.warning( + "engine parameter is neither 'sqlite' nor an SQLAlchemy engine." + " Creating SQLite engine." ) - db_url = f"sqlite:///{sqlite_database_path}" - return create_engine(db_url) - if type(engine) == sqlalchemy.engine.Engine: - return engine + sqlite_database_path = os.environ.get( + "SQLITE_DATABASE_PATH", + os.path.join(sqlite_db_path, "open-mastr.db"), + ) + db_url = f"sqlite:///{sqlite_database_path}" + return create_engine(db_url) def parse_date_string(bulk_date_string: str) -> str: if bulk_date_string == "today": - return date.today().strftime("%Y%m%d") + return datetime.datetime.now(tz=MASTR_TIMEZONE).strftime("%Y%m%d") else: return parse(bulk_date_string).strftime("%Y%m%d") @@ -153,18 +139,20 @@ def transform_data_parameter(data, **kwargs): if isinstance(data, str): data = [data] elif data is None: + # TODO: This should be adapted so that all tables are downloaded if no data is given. + # Right now, it would skip new tables. 
data = BULK_DATA return data -def transform_date_parameter(self, date, **kwargs): +def transform_date_parameter(date: Union[datetime.date, Literal["today"]], **kwargs: Optional[str]) -> str: date = kwargs.get("bulk_date", date) date = "today" if date is None else date if date == "existing": log.warning( """ - The date parameter 'existing' is deprecated and will be removed in the future. + The date parameter 'existing' is deprecated and will be removed in the future. The date parameter is set to `today`. If this change causes problems for you, please comment in this issue on github: @@ -192,427 +180,24 @@ def session_scope(engine): session.close() -def data_to_include_tables(data: list, mapping: str = None) -> list: +def data_to_include_tables(data: list[str]) -> set[str]: """ - Convert user input 'data' to the list 'include_tables'. - It contains file names from zipped bulk download, if mapping="write_xml". - It contains database table names, if mapping="export_db_tables". + Convert user input 'data' to the set 'include_tables'. + It contains file names from zipped bulk download. Parameters ---------- data: list The user input for data selection - mapping: str - Specify the mapping dict for the function and thus the list output. - Returns - ------- - list - List of file names | List of database table names - ------- - - """ - if mapping == "write_xml": - # Map data selection to include tables in xml - include_tables = [ - table for tech in data for table in BULK_INCLUDE_TABLES_MAP[tech] - ] - return include_tables - - if mapping == "export_db_tables": - # Map data selection to include tables for csv export - include_tables = [ - table - for possible_data_bulk in data - for table in BULK_ADDITIONAL_TABLES_CSV_EXPORT_MAP[possible_data_bulk] - ] - return include_tables - - raise NotImplementedError( - "This function is only implemented for 'write_xml' and 'export_db_tables', " - "please specify when calling the function." 
- ) - - -def reverse_unit_type_map(): - return {v: k for k, v in UNIT_TYPE_MAP.items()} - - -# EXPORT RELEVANT FUNCTIONS - - -def create_db_query( - tech=None, - additional_table=None, - additional_data=["unit_data", "eeg_data", "kwk_data", "permit_data"], - limit=None, - engine=None, -): - """ - Create a database query to export a snapshot MaStR data from database to CSV. - - For technologies, during the query creation, additional available data is joined on - list of basic units. A query for a single technology is created separately because - of multiple non-overlapping columns. Duplicate columns for a single technology - (a results on data from different sources) are suffixed. - - The data in the database probably has duplicates because - of the history how data was collected in the - Marktstammdatenregister. - - Along with the data, metadata is saved in the file `datapackage.json`. - - Parameters - ---------- - technology: `list` of `str` - See list of available technologies in - `open_mastr.utils.constants.TECHNOLOGIES` - additional_table: `list` of `str` - See list of available technologies or additional tables in - `open_mastr.utils.constants.ADDITIONAL_TABLES` - engine: - User-defined database engine. - limit: int - Limit number of rows. - additional_data: `list` - Defaults to "export all available additional data" which is: - `["unit_data", "eeg_data", "kwk_data", "permit_data"]`. - chunksize: int or None - Defines the chunksize of the tables export. Default to 500.000 which is roughly 2.5 GB. - """ - - renaming = column_renaming() - - unit_type_map_reversed = reverse_unit_type_map() - - with session_scope(engine=engine) as session: - if tech: - # Select orm tables for specified additional_data. 
- orm_tables = { - f"{dat}": getattr(orm, ORM_MAP[tech].get(dat, "KeyNotAvailable"), None) - for dat in additional_data - } - - # Filter for possible orm-additional_data combinations (not None) - orm_tables = {k: v for k, v in orm_tables.items() if v is not None} - - # Build query based on available tables for tech and user input; always use basic units - subtables = partially_suffixed_columns( - orm.BasicUnit, - renaming["basic_data"]["columns"], - renaming["basic_data"]["suffix"], - ) - - # Extend table with columns from selected additional_data orm - for addit_data_type, addit_data_orm in orm_tables.items(): - subtables.extend( - partially_suffixed_columns( - addit_data_orm, - renaming[addit_data_type]["columns"], - renaming[addit_data_type]["suffix"], - ) - ) - - query_tech = Query(subtables, session=session) - - # Define joins based on available tables for data and user input - if "unit_data" in orm_tables: - query_tech = query_tech.join( - orm_tables["unit_data"], - orm.BasicUnit.EinheitMastrNummer - == orm_tables["unit_data"].EinheitMastrNummer, - isouter=True, - ) - if "eeg_data" in orm_tables: - query_tech = query_tech.join( - orm_tables["eeg_data"], - orm.BasicUnit.EegMastrNummer - == orm_tables["eeg_data"].EegMastrNummer, - isouter=True, - ) - if "kwk_data" in orm_tables: - query_tech = query_tech.join( - orm_tables["kwk_data"], - orm.BasicUnit.KwkMastrNummer - == orm_tables["kwk_data"].KwkMastrNummer, - isouter=True, - ) - if "permit_data" in orm_tables: - query_tech = query_tech.join( - orm_tables["permit_data"], - orm.BasicUnit.GenMastrNummer - == orm_tables["permit_data"].GenMastrNummer, - isouter=True, - ) - - # Restricted to technology - query_tech = query_tech.filter( - orm.BasicUnit.Einheittyp == unit_type_map_reversed[tech] - ) - - # Limit returned rows of query - if limit: - query_tech = query_tech.limit(limit) - - return query_tech - - if additional_table: - orm_table = getattr(orm, ORM_MAP[additional_table], None) - - query_additional_tables 
= Query(orm_table, session=session) - - # Limit returned rows of query - if limit: - query_additional_tables = query_additional_tables.limit(limit) - - return query_additional_tables - - -# At the time of commenting this, the call of this function in mastr.py was already -# commented out for more than a year - -# def save_metadata(data: list = None, engine=None) -> None: -# """ -# Save metadata during csv export. -# -# Parameters -# ---------- -# data: list -# List of exported technologies for which metadata is needed. -# engine: -# User-defined database engine. -# -# Returns -# ------- -# -# """ -# data_path = get_data_version_dir() -# filenames = get_filenames() -# metadata_file = os.path.join(data_path, filenames["metadata"]) -# unit_type_map_reversed = reverse_unit_type_map() -# -# with session_scope(engine=engine) as session: -# # check for latest db entry for exported technologies -# mastr_technologies = [unit_type_map_reversed[tech] for tech in data] -# newest_date = ( -# session.query(orm.BasicUnit.DatumLetzteAktualisierung) -# .filter(orm.BasicUnit.Einheittyp.in_(mastr_technologies)) -# .order_by(orm.BasicUnit.DatumLetzteAktualisierung.desc()) -# .first()[0] -# ) -# -# metadata = create_datapackage_meta_json(newest_date, data, json_serialize=False) -# -# with open(metadata_file, "w", encoding="utf-8") as f: -# json.dump(metadata, f, ensure_ascii=False, indent=4) -# -# log.info("Saved metadata") - - -def reverse_fill_basic_units(technology=None, engine=None): - """ - The basic_units table is empty after bulk download. - To enable csv export, the table is filled from extended - tables reversely. - - .. warning:: - The basic_units table will be dropped and then recreated. 
- Returns ------- - - Parameters - ---------- - technology: list of str - Available technologies are in open_mastr.Mastr.to_csv() - """ - - with session_scope(engine=engine) as session: - # Empty the basic_units table, because it will be filled entirely from extended tables - session.query(getattr(orm, "BasicUnit", None)).delete() - - for tech in tqdm(technology, desc="Performing reverse fill of basic units: "): - # Get the class of extended table - unit_data_orm = getattr(orm, ORM_MAP[tech]["unit_data"], None) - basic_unit_column_names = [ - column.name - for column in getattr(orm, "BasicUnit", None).__mapper__.columns - ] - - unit_columns_to_reverse_fill = [ - column - for column in unit_data_orm.__mapper__.columns - if column.name in basic_unit_column_names - ] - unit_column_names_to_reverse_fill = [ - column.name for column in unit_columns_to_reverse_fill - ] - - unit_type_map_reversed = reverse_unit_type_map() - - # Add Einheittyp artificially - unit_typ = "'" + unit_type_map_reversed.get(tech, None) + "'" - unit_columns_to_reverse_fill.append( - literal_column(unit_typ).label("Einheittyp") - ) - unit_column_names_to_reverse_fill.append("Einheittyp") - - # Build query - query = Query(unit_columns_to_reverse_fill, session=session) - insert_query = insert(orm.BasicUnit).from_select( - unit_column_names_to_reverse_fill, query - ) - - session.execute(insert_query) - - -def partially_suffixed_columns(mapper, column_names, suffix): - """ - Add a suffix to a subset of ORM map tables for a query - - Parameters - ---------- - mapper: - SQLAlchemy ORM table mapper - column_names: list - Names of columns to be suffixed - suffix: str - Suffix that is append like + "_" + suffix - Returns ------- - list - List of ORM table mapper instance - """ - columns = list(mapper.__mapper__.columns) - return [ - _.label(f"{_.name}_{suffix}") if _.name in column_names else _ for _ in columns - ] - - -def db_query_to_csv(db_query, data_table: str, chunksize: int) -> None: - """ - Export 
database query to CSV file - - Save CSV files to the respective data version directory, see - :meth:`open_mastr.utils.config.get_data_version_dir`. - - Parameters - ---------- - db_query: - QueryORM-level SQL construction object that holds tables for export. - data_table: str - See list of available technologies or additional tables in - `open_mastr.utils.constants.TECHNOLOGIES` and - `open_mastr.utils.constants.ADDITIONAL_TABLES` - chunksize: int - Defines the size of the chunks that are saved to csv. - Useful when export fails due to memory issues. - """ - data_path = get_data_version_dir() - filenames = get_filenames() - - # Set export settings per table type - if data_table in TECHNOLOGIES: - index = True - index_col = "EinheitMastrNummer" - index_label = "EinheitMastrNummer" - csv_file = os.path.join(data_path, filenames["raw"][data_table]["joined"]) - if data_table in ADDITIONAL_TABLES: - index = False - index_col = None - index_label = None - csv_file = os.path.join( - data_path, filenames["raw"]["additional_table"][data_table] - ) - - with db_query.session.bind.connect() as con: - with con.begin(): - # Read data into pandas.DataFrame in chunks of max. 
500000 rows of ~2.5 GB RAM - for chunk_number, chunk_df in enumerate( - pd.read_sql( - sql=db_query.statement, - con=con, - index_col=index_col, - chunksize=chunksize, - ) - ): - # For debugging purposes, check RAM usage of chunk_df - # chunk_df.info(memory_usage='deep') - - # Make sure no duplicate column names exist - assert not any(chunk_df.columns.duplicated()) - - # Remove newline statements from certain strings - if data_table in TECHNOLOGIES: - for col in ["Aktenzeichen", "Behoerde"]: - chunk_df[col] = chunk_df[col].str.replace("\r", "") - - if not chunk_df.empty: - if chunk_number == 0: - chunk_df.to_csv( - csv_file, - index=index, - index_label=index_label, - encoding="utf-8", - ) - log.info(f"Created csv: {csv_file.split('/')[-1:]} ") - else: - chunk_df.to_csv( - csv_file, - mode="a", - header=False, - index=index, - index_label=index_label, - encoding="utf-8", - ) - log.info( - f"Appended {len(chunk_df)} rows to: {csv_file.split('/')[-1:]}" - ) - - -def rename_table(table, columns, engine) -> None: - """ - Rename table based on translation dictionary. - """ - alter_statements = [] - - for column in columns: - column = column["name"] - - if column in TRANSLATIONS: - alter_statement = text( - f"ALTER TABLE {table} RENAME COLUMN {column} TO {TRANSLATIONS[column]}" - ) - alter_statements.append(alter_statement) - - with engine.connect() as connection: - for statement in alter_statements: - try: - connection.execute(statement) - except sqlalchemy.exc.OperationalError: - continue - - -def create_translated_database_engine(engine, folder_path) -> sqlalchemy.engine.Engine: - """ - Check if translated version of the database, as defined with engine parameter, exists. - Return sqlite engine connected with the translated database. 
- """ - - if engine == "sqlite": - db_path = os.path.join(folder_path, "open-mastr-translated.db") - else: - if "sqlite" not in engine.dialect.name: - raise ValueError("engine has to be of type 'sqlite'") - - prev_path = r"{}".format(engine.url.database) - engine.dispose() - db_path = prev_path[:-3] + "-translated.db" - - if not os.path.exists(db_path): - raise FileNotFoundError( - f"no database at {db_path} found.\n" - "make sure the database has been translated before with translate()" - ) - - return create_engine(f"sqlite:///{db_path}") + set + Set of file names + """ + # Map data selection to include tables in xml + include_tables = { + table for tech in data for table in BULK_INCLUDE_TABLES_MAP[tech] + } + return include_tables def delete_zip_file_if_corrupted(save_path: str): diff --git a/open_mastr/utils/orm.py b/open_mastr/utils/orm.py deleted file mode 100644 index 3d4c9f19..00000000 --- a/open_mastr/utils/orm.py +++ /dev/null @@ -1,1026 +0,0 @@ -from sqlalchemy.orm import DeclarativeBase -from sqlalchemy import ( - Column, - Integer, - String, - Float, - Sequence, - DateTime, - Boolean, - func, - Date, - JSON, -) - - -class Base(DeclarativeBase): - pass - - -class ParentAllTables(object): - DatenQuelle = Column(String) - DatumDownload = Column(Date) - - -class BasicUnit(Base): - __tablename__ = "basic_units" - - EinheitMastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - Name = Column(String) - Einheitart = Column(String) - Einheittyp = Column(String) - Standort = Column(String) - Bruttoleistung = Column(Float) - Erzeugungsleistung = Column(Float) - EinheitBetriebsstatus = Column(String) - Anlagenbetreiber = Column(String) - EegMastrNummer = Column(String) - KwkMastrNummer = Column(String) - SpeMastrNummer = Column(String) - GenMastrNummer = Column(String) - BestandsanlageMastrNummer = Column(String) - NichtVorhandenInMigriertenEinheiten = Column(String) - EinheitSystemstatus = Column(String) - - 
-class Extended(object): - NetzbetreiberMastrNummer = Column(String) - Registrierungsdatum = Column(Date) - EinheitMastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - LokationMastrNummer = Column(String) - NetzbetreiberpruefungStatus = Column(String) - NetzbetreiberpruefungDatum = Column(Date) - AnlagenbetreiberMastrNummer = Column(String) - Land = Column(String) - Bundesland = Column(String) - Landkreis = Column(String) - Gemeinde = Column(String) - Gemeindeschluessel = Column(String) - Postleitzahl = Column(String) - Gemarkung = Column(String) - FlurFlurstuecknummern = Column(String) - Strasse = Column(String) - StrasseNichtGefunden = Column(Boolean) - Hausnummer = Column(String) - HausnummerNichtGefunden = Column(Boolean) - Adresszusatz = Column(String) - Ort = Column(String) - Laengengrad = Column(Float) - Breitengrad = Column(Float) - UtmZonenwert = Column(String) - UtmEast = Column(Float) - UtmNorth = Column(Float) - GaussKruegerHoch = Column(Float) - GaussKruegerRechts = Column(Float) - Meldedatum = Column(Date) - GeplantesInbetriebnahmedatum = Column(Date) - Inbetriebnahmedatum = Column(Date) - DatumEndgueltigeStilllegung = Column(Date) - DatumBeginnVoruebergehendeStilllegung = Column(Date) - DatumBeendigungVorlaeufigenStilllegung = Column(Date) - DatumWiederaufnahmeBetrieb = Column(Date) - EinheitSystemstatus = Column(String) - EinheitBetriebsstatus = Column(String) - BestandsanlageMastrNummer = Column(String) - NichtVorhandenInMigriertenEinheiten = Column(Boolean) - AltAnlagenbetreiberMastrNummer = Column(String) - DatumDesBetreiberwechsels = Column(Date) - DatumRegistrierungDesBetreiberwechsels = Column(Date) - NameStromerzeugungseinheit = Column(String) - Weic = Column(String) - WeicDisplayName = Column(String) - Kraftwerksnummer = Column(String) - Energietraeger = Column(String) - Bruttoleistung = Column(Float) - Nettonennleistung = Column(Float) - AnschlussAnHoechstOderHochSpannung = 
Column(Boolean) - Schwarzstartfaehigkeit = Column(Boolean) - Inselbetriebsfaehigkeit = Column(Boolean) - Einsatzverantwortlicher = Column(String) - FernsteuerbarkeitNb = Column(Boolean) - FernsteuerbarkeitDv = Column(Boolean) - FernsteuerbarkeitDr = Column(Boolean) - Einspeisungsart = Column(String) - PraequalifiziertFuerRegelenergie = Column(Boolean) - GenMastrNummer = Column(String) - Netzbetreiberzuordnungen = Column(String) - ReserveartNachDemEnWG = Column(String) - DatumUeberfuehrungInReserve = Column(Date) - # from bulk download - Hausnummer_nv = Column(Boolean) - Weic_nv = Column(Boolean) - Kraftwerksnummer_nv = Column(Boolean) - - -class WindExtended(Extended, ParentAllTables, Base): - __tablename__ = "wind_extended" - - # wind specific attributes - NameWindpark = Column(String) - Lage = Column(String) - Seelage = Column(String) - ClusterOstsee = Column(String) - ClusterNordsee = Column(String) - Hersteller = Column(String) - HerstellerId = Column(String) - Technologie = Column(String) - Typenbezeichnung = Column(String) - Nabenhoehe = Column(Float) - Rotordurchmesser = Column(Float) - Rotorblattenteisungssystem = Column(Boolean) - AuflageAbschaltungLeistungsbegrenzung = Column(Boolean) - AuflagenAbschaltungSchallimmissionsschutzNachts = Column(Boolean) - AuflagenAbschaltungSchallimmissionsschutzTagsueber = Column(Boolean) - AuflagenAbschaltungSchattenwurf = Column(Boolean) - AuflagenAbschaltungTierschutz = Column(Boolean) - AuflagenAbschaltungEiswurf = Column(Boolean) - AuflagenAbschaltungSonstige = Column(Boolean) - Wassertiefe = Column(Float) - Kuestenentfernung = Column(Float) - Buergerenergie = Column(Boolean) - Nachtkennzeichen = Column(Boolean) - EegMastrNummer = Column(String) - WindAnLandOderAufSee = Column(String) - TechnologieFlugwind = Column(String) - Flughoehe = Column(Float) - Flugradius = Column(Float) - - -class SolarExtended(Extended, ParentAllTables, Base): - __tablename__ = "solar_extended" - - ZugeordneteWirkleistungWechselrichter = 
Column(Float) - GemeinsamerWechselrichterMitSpeicher = Column(String) - AnzahlModule = Column(Integer) - Lage = Column(String) - Leistungsbegrenzung = Column(String) - EinheitlicheAusrichtungUndNeigungswinkel = Column(Boolean) - Hauptausrichtung = Column(String) - HauptausrichtungNeigungswinkel = Column(String) - Nebenausrichtung = Column(String) - NebenausrichtungNeigungswinkel = Column(String) - InAnspruchGenommeneFlaeche = Column(Float) - ArtDerFlaeche = Column(String) - InAnspruchGenommeneLandwirtschaftlichGenutzteFlaeche = Column(Float) - Nutzungsbereich = Column(String) - Buergerenergie = Column(Boolean) - EegMastrNummer = Column(String) - ArtDerFlaecheIds = Column(String) - ArtDerSolaranlage = Column(String) - Zaehlernummer = Column(String) - InAnspruchGenommeneAckerflaeche = Column(Float) - SpeicherAmGleichenOrt = Column(String) - - -class BiomassExtended(Extended, ParentAllTables, Base): - __tablename__ = "biomass_extended" - - Hauptbrennstoff = Column(String) - Biomasseart = Column(String) - Technologie = Column(String) - EegMastrNummer = Column(String) - KwkMastrNummer = Column(String) - - -class CombustionExtended(Extended, ParentAllTables, Base): - __tablename__ = "combustion_extended" - - NameKraftwerk = Column(String) - NameKraftwerksblock = Column(String) - DatumBaubeginn = Column(Date) - SteigerungNettonennleistungKombibetrieb = Column(Float) - AnlageIstImKombibetrieb = Column(Boolean) - MastrNummernKombibetrieb = Column(String) - NetzreserveAbDatum = Column(Date) - SicherheitsbereitschaftAbDatum = Column(Date) - Hauptbrennstoff = Column(String) - WeitererHauptbrennstoff = Column(String) - WeitereBrennstoffe = Column(String) - VerknuepfteErzeugungseinheiten = Column(String) - BestandteilGrenzkraftwerk = Column(Boolean) - NettonennleistungDeutschland = Column(Float) - AnteiligNutzungsberechtigte = Column(String) - Notstromaggregat = Column(Boolean) - Einsatzort = Column(String) - KwkMastrNummer = Column(String) - Technologie = Column(String) - 
AusschliesslicheVerwendungImKombibetrieb = Column(Boolean) - - -class GsgkExtended(Extended, ParentAllTables, Base): - __tablename__ = "gsgk_extended" - - Technologie = Column(String) - KwkMastrNummer = Column(String) - EegMastrNummer = Column(String) - - -class HydroExtended(Extended, ParentAllTables, Base): - __tablename__ = "hydro_extended" - - NameKraftwerk = Column(String) - ArtDerWasserkraftanlage = Column(String) - MinderungStromerzeugung = Column(Boolean) - BestandteilGrenzkraftwerk = Column(Boolean) - NettonennleistungDeutschland = Column(Float) - ArtDesZuflusses = Column(String) - EegMastrNummer = Column(String) - - -class NuclearExtended(Extended, ParentAllTables, Base): - __tablename__ = "nuclear_extended" - - NameKraftwerk = Column(String) - NameKraftwerksblock = Column(String) - Technologie = Column(String) - - -class StorageExtended(Extended, ParentAllTables, Base): - __tablename__ = "storage_extended" - - Einsatzort = Column(String) - AcDcKoppelung = Column(String) - Batterietechnologie = Column(String) - PumpbetriebLeistungsaufnahme = Column(Float) - PumpbetriebKontinuierlichRegelbar = Column(Boolean) - Pumpspeichertechnologie = Column(String) - Notstromaggregat = Column(Boolean) - BestandteilGrenzkraftwerk = Column(Boolean) - NettonennleistungDeutschland = Column(Float) - ZugeordneteWirkleistungWechselrichter = Column(Float) - NutzbareSpeicherkapazitaet = Column(Float) - SpeMastrNummer = Column(String) - EegMastrNummer = Column(String) - EegAnlagentyp = Column(String) - Technologie = Column(String) - LeistungsaufnahmeBeimEinspeichern = Column(Float) - GemeinsamRegistrierteSolareinheitMastrNummer = Column(String) - - -class Eeg(object): - Registrierungsdatum = Column(Date) - EegMastrNummer = Column(String, primary_key=True) - Meldedatum = Column(Date) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - EegInbetriebnahmedatum = Column(Date) - VerknuepfteEinheit = Column(String) - AnlagenschluesselEeg = Column(String) - 
AusschreibungZuschlag = Column(Boolean) - AnlagenkennzifferAnlagenregister = Column(String) - AnlagenkennzifferAnlagenregister_nv = Column(Boolean) - Netzbetreiberzuordnungen = Column(String) - - -class WindEeg(Eeg, ParentAllTables, Base): - __tablename__ = "wind_eeg" - - PrototypAnlage = Column(Boolean) - PilotAnlage = Column(Boolean) - InstallierteLeistung = Column(Float) - VerhaeltnisErtragsschaetzungReferenzertrag = Column(Float) - VerhaeltnisReferenzertragErtrag5Jahre = Column(Float) - VerhaeltnisReferenzertragErtrag10Jahre = Column(Float) - VerhaeltnisReferenzertragErtrag15Jahre = Column(Float) - Zuschlagsnummer = Column(String) - AnlageBetriebsstatus = Column(String) - VerhaeltnisErtragsschaetzungReferenzertrag_nv = Column(Boolean) - VerhaeltnisReferenzertragErtrag5Jahre_nv = Column(Boolean) - VerhaeltnisReferenzertragErtrag10Jahre_nv = Column(Boolean) - VerhaeltnisReferenzertragErtrag15Jahre_nv = Column(Boolean) - - -class SolarEeg(Eeg, ParentAllTables, Base): - __tablename__ = "solar_eeg" - - InanspruchnahmeZahlungNachEeg = Column(Boolean) - InstallierteLeistung = Column(Float) - RegistrierungsnummerPvMeldeportal = Column(String) - MieterstromRegistrierungsdatum = Column(Date) - MieterstromZugeordnet = Column(Boolean) - MieterstromMeldedatum = Column(Date) - MieterstromErsteZuordnungZuschlag = Column(Date) - ZugeordneteGebotsmenge = Column(Float) - Zuschlagsnummer = Column(String) - AnlageBetriebsstatus = Column(String) - RegistrierungsnummerPvMeldeportal_nv = Column(Boolean) - - -class BiomassEeg(Eeg, ParentAllTables, Base): - __tablename__ = "biomass_eeg" - - InstallierteLeistung = Column(Float) - AusschliesslicheVerwendungBiomasse = Column(Boolean) - Zuschlagsnummer = Column(String) - BiogasInanspruchnahmeFlexiPraemie = Column(Boolean) - BiogasDatumInanspruchnahmeFlexiPraemie = Column(Date) - BiogasLeistungserhoehung = Column(Boolean) - BiogasDatumLeistungserhoehung = Column(Date) - BiogasUmfangLeistungserhoehung = Column(Float) - 
BiogasGaserzeugungskapazitaet = Column(Float) - Hoechstbemessungsleistung = Column(Float) - BiomethanErstmaligerEinsatz = Column(Date) - AnlageBetriebsstatus = Column(String) - BiogasGaserzeugungskapazitaet_nv = Column(Boolean) - BiomethanErstmaligerEinsatz_nv = Column(Boolean) - - -class GsgkEeg(Eeg, ParentAllTables, Base): - __tablename__ = "gsgk_eeg" - - InstallierteLeistung = Column(Float) - AnlageBetriebsstatus = Column(String) - - -class HydroEeg(Eeg, ParentAllTables, Base): - __tablename__ = "hydro_eeg" - - InstallierteLeistung = Column(Float) - AnlageBetriebsstatus = Column(String) - Ertuechtigung = Column(JSON) - ErtuechtigungIds = Column(String) - - -class StorageEeg(Eeg, ParentAllTables, Base): - __tablename__ = "storage_eeg" - - eegAnlagenschluessel = Column(String) - eegZuschlagsnummer = Column(String) - eegAusschreibungZuschlag = Column(Boolean) - - -class Kwk(ParentAllTables, Base): - __tablename__ = "kwk" - - Registrierungsdatum = Column(Date) - KwkMastrNummer = Column(String, primary_key=True) - Zuschlagnummer = Column(String) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - Inbetriebnahmedatum = Column(Date) - Meldedatum = Column(Date) - ThermischeNutzleistung = Column(Float) - ElektrischeKwkLeistung = Column(Float) - VerknuepfteEinheiten = Column(String) - AnlageBetriebsstatus = Column(String) - AusschreibungZuschlag = Column(Boolean) - Netzbetreiberzuordnungen = Column(String) - - -class Permit(ParentAllTables, Base): - __tablename__ = "permit" - - Registrierungsdatum = Column(Date) - GenMastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - Art = Column(String) - Datum = Column(Date) - Behoerde = Column(String) - Aktenzeichen = Column(String) - Frist = Column(Date) - WasserrechtsNummer = Column(String) - WasserrechtAblaufdatum = Column(Date) - Meldedatum = Column(Date) - VerknuepfteEinheiten = Column(String) - Frist_nv = Column(Boolean) - WasserrechtAblaufdatum_nv = 
Column(Boolean) - Netzbetreiberzuordnungen = Column(String) - DatumAntragstellung = Column(Date) - - -class LocationBasic(Base): - __tablename__ = "locations_basic" - - LokationMastrNummer = Column(String, primary_key=True) - NameDerTechnischenLokation = Column(String) - Lokationtyp = Column(String) - AnzahlNetzanschlusspunkte = Column(Integer) - - -class LocationExtended(ParentAllTables, Base): - __tablename__ = "locations_extended" - - MastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - NameDerTechnischenLokation = Column(String) - VerknuepfteEinheiten = Column(String) - Netzanschlusspunkte = Column(String) - Lokationtyp = Column(String) - - -class GasStorage(ParentAllTables, Base): - __tablename__ = "gas_storage" - - MastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - Speichername = Column(String) - Registrierungsdatum = Column(Date) - AnlageBetriebsstatus = Column(String) - VerknuepfteEinheit = Column(String) - - -class GasStorageExtended(ParentAllTables, Base): - __tablename__ = "gas_storage_extended" - EinheitMastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - LokationMastrNummer = Column(String) - NetzbetreiberpruefungStatus = Column(String) - NetzbetreiberpruefungDatum = Column(Date) - AnlagenbetreiberMastrNummer = Column(String) - Land = Column(String) - Bundesland = Column(String) - Landkreis = Column(String) - Gemeinde = Column(String) - Gemeindeschluessel = Column(String) - Postleitzahl = Column(String) - Ort = Column(String) - Strasse = Column(String) - StrasseNichtGefunden = Column(Integer) - Hausnummer = Column(String) - Hausnummer_nv = Column(Integer) - HausnummerNichtGefunden = Column(Integer) - Laengengrad = Column(Float) - Breitengrad = Column(Float) - Registrierungsdatum = Column(String) - Inbetriebnahmedatum = Column(String) - EinheitSystemstatus = Column(String) - 
EinheitBetriebsstatus = Column(String) - NichtVorhandenInMigriertenEinheiten = Column(Integer) - NameGasspeicher = Column(String) - Speicherart = Column(String) - MaximalNutzbaresArbeitsgasvolumen = Column(Float) - MaximaleEinspeicherleistung = Column(Float) - MaximaleAusspeicherleistung = Column(Float) - DurchschnittlicherBrennwert = Column(Float) - Weic = Column(String) - Weic_Na = Column(Integer) - SpeicherMastrNummer = Column(String) - Gemarkung = Column(String) - FlurFlurstuecknummern = Column(String) - Adresszusatz = Column(String) - DatumBeginnVoruebergehendeStilllegung = Column(Date) - DatumDesBetreiberwechsels = Column(Date) - DatumRegistrierungDesBetreiberwechsels = Column(Date) - DatumEndgueltigeStilllegung = Column(Date) - ZugeordnenteWirkleistungWechselrichter = Column(Float) - - -class StorageUnits(ParentAllTables, Base): - __tablename__ = "storage_units" - MastrNummer = Column(String, primary_key=True) - Registrierungsdatum = Column(Date) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - NutzbareSpeicherkapazitaet = Column(Float) - VerknuepfteEinheit = Column(String) - AnlageBetriebsstatus = Column(String) - - -class BalancingArea(ParentAllTables, Base): - __tablename__ = "balancing_area" - - Id = Column(Integer, primary_key=True) - Yeic = Column(String) - RegelzoneNetzanschlusspunkt = Column(String) - BilanzierungsgebietNetzanschlusspunkt = Column(String) - - -class GasProducer(ParentAllTables, Base): - __tablename__ = "gas_producer" - - EinheitMastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - LokationMastrNummer = Column(String) - NetzbetreiberpruefungStatus = Column(String) - NetzbetreiberpruefungDatum = Column(Date) - AnlagenbetreiberMastrNummer = Column(String) - Land = Column(String) - Bundesland = Column(String) - Landkreis = Column(String) - Gemeinde = Column(String) - Gemeindeschluessel = Column(String) - Postleitzahl = Column(String) - Ort = Column(String) - 
Registrierungsdatum = Column(Date) - Inbetriebnahmedatum = Column(Date) - EinheitSystemstatus = Column(String) - EinheitBetriebsstatus = Column(String) - NichtVorhandenInMigriertenEinheiten = Column(Integer) - NameGaserzeugungseinheit = Column(String) - SpeicherMastrNummer = Column(String) - Strasse = Column(String) - StrasseNichtGefunden = Column(Integer) - Hausnummer = Column(String) - Hausnummer_nv = Column(Integer) - HausnummerNichtGefunden = Column(Integer) - Adresszusatz = Column(String) - Laengengrad = Column(Float) - Breitengrad = Column(Float) - Technologie = Column(String) - Erzeugungsleistung = Column(Float) - DatumDesBetreiberwechsels = Column(Date) - DatumRegistrierungDesBetreiberwechsels = Column(Date) - Gemarkung = Column(String) - FlurFlurstuecknummern = Column(String) - GeplantesInbetriebnahmedatum = Column(Date) - DatumBeginnVoruebergehendeStilllegung = Column(Date) - DatumEndgueltigeStilllegung = Column(Date) - - -class GasConsumer(ParentAllTables, Base): - __tablename__ = "gas_consumer" - - EinheitMastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - LokationMastrNummer = Column(String) - NetzbetreiberpruefungStatus = Column(String) - NetzbetreiberpruefungDatum = Column(Date) - AnlagenbetreiberMastrNummer = Column(String) - Land = Column(String) - Bundesland = Column(String) - Landkreis = Column(String) - Gemeinde = Column(String) - Gemeindeschluessel = Column(String) - Postleitzahl = Column(String) - Ort = Column(String) - Strasse = Column(String) - StrasseNichtGefunden = Column(Integer) - Hausnummer = Column(String) - Hausnummer_nv = Column(Integer) - HausnummerNichtGefunden = Column(Integer) - Laengengrad = Column(Float) - Breitengrad = Column(Float) - Registrierungsdatum = Column(String) - Inbetriebnahmedatum = Column(String) - EinheitSystemstatus = Column(String) - EinheitBetriebsstatus = Column(String) - NichtVorhandenInMigriertenEinheiten = Column(Integer) - 
NameGasverbrauchsseinheit = Column(String) - EinheitDientDerStromerzeugung = Column(String) - MaximaleGasbezugsleistung = Column(Float) - VerknuepfteEinheit = Column(String) - GeplantesInbetriebnahmedatum = Column(Date) - Adresszusatz = Column(String) - Gemarkung = Column(String) - FlurFlurstuecknummern = Column(String) - DatumDesBetreiberwechsels = Column(Date) - DatumRegistrierungDesBetreiberwechsels = Column(Date) - DatumEndgueltigeStilllegung = Column(Date) - DatumBeginnVoruebergehendeStilllegung = Column(Date) - - -class ElectricityConsumer(ParentAllTables, Base): - __tablename__ = "electricity_consumer" - - EinheitMastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - LokationMastrNummer = Column(String) - NetzbetreiberpruefungStatus = Column(String) - NetzbetreiberpruefungDatum = Column(Date) - AnlagenbetreiberMastrNummer = Column(String) - Land = Column(String) - Bundesland = Column(String) - Landkreis = Column(String) - Gemeinde = Column(String) - Gemeindeschluessel = Column(String) - Postleitzahl = Column(String) - Ort = Column(String) - Strasse = Column(String) - StrasseNichtGefunden = Column(Integer) - Hausnummer = Column(String) - Hausnummer_nv = Column(Integer) - HausnummerNichtGefunden = Column(Integer) - Adresszusatz = Column(String) - Gemarkung = Column(String) - FlurFlurstuecknummern = Column(String) - Laengengrad = Column(Float) - Breitengrad = Column(Float) - Registrierungsdatum = Column(String) - Inbetriebnahmedatum = Column(String) - EinheitSystemstatus = Column(String) - EinheitBetriebsstatus = Column(String) - NichtVorhandenInMigriertenEinheiten = Column(Integer) - Einsatzverantwortlicher = Column(String) - NameStromverbrauchseinheit = Column(String) - AnzahlStromverbrauchseinheitenGroesser50Mw = Column(Integer) - PraequalifiziertGemaessAblav = Column(Boolean) - AnteilBeinflussbareLast = Column(Float) - ArtAbschaltbareLast = Column(String) - DatumDesBetreiberwechsels = Column(Date) - 
DatumRegistrierungDesBetreiberwechsels = Column(Date) - DatumBeginnVoruebergehendeStilllegung = Column(Date) - DatumEndgueltigeStilllegung = Column(Date) - GeplantesInbetriebnahmedatum = Column(Date) - - -class MarketActorsAndRoles(ParentAllTables, Base): - __tablename__ = "market_actors_and_roles" - - MastrNummer = Column(String, primary_key=True) - MarktakteurMastrNummer = Column(String) - Marktrolle = Column(String) - Marktpartneridentifikationsnummer_nv = Column(Boolean) - BundesnetzagenturBetriebsnummer = Column(String) - BundesnetzagenturBetriebsnummer_nv = Column(Boolean) - Marktpartneridentifikationsnummer = Column(String) - KontaktdatenMarktrolle = Column(String) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - - -class MarketActors(ParentAllTables, Base): - __tablename__ = "market_actors" - - MastrNummer = Column(String, primary_key=True) - Personenart = Column(String) - Marktfunktion = Column(String) - RegistergerichtAusland = Column(String) - Registernummer = Column(String) - DatumLetzeAktualisierung = Column(DateTime(timezone=True)) - Firmenname = Column(String) - Rechtsform = Column(String) - Land = Column(String) - Strasse = Column(String) - Hausnummer = Column(String) - Hausnummer_nv = Column(Boolean) - Postleitzahl = Column(String) - Ort = Column(String) - Bundesland = Column(String) - Nuts2 = Column(String) - Email = Column(String) - Telefon = Column(String) - Fax_nv = Column(Boolean) - Webseite_nv = Column(Boolean) - Taetigkeitsbeginn = Column(Date) - AcerCode_nv = Column(Boolean) - Umsatzsteueridentifikationsnummer_nv = Column(Boolean) - BundesnetzagenturBetriebsnummer = Column(String) - BundesnetzagenturBetriebsnummer_nv = Column(Boolean) - HausnummerAnZustelladresse_nv = Column(Boolean) - Kmu = Column(Integer) - RegistrierungsdatumMarktakteur = Column(DateTime(timezone=True)) - Fax = Column(String) - HauptwirtdschaftszweigAbteilung = Column(String) - HauptwirtdschaftszweigGruppe = Column(String) - HauptwirtdschaftszweigAbschnitt 
= Column(String) - Webseite = Column(String) - Umsatzsteueridentifikationsnummer = Column(String) - Registergericht = Column(String) - Adresszusatz = Column(String) - LandAnZustelladresse = Column(String) - PostleitzahlAnZustelladresse = Column(String) - OrtAnZustelladresse = Column(String) - StrasseAnZustelladresse = Column(String) - HausnummerAnZustelladresse = Column(String) - RegisternummerAusland = Column(String) - SonstigeRechtsform = Column(String) - AcerCode = Column(String) - AdresszusatzAnZustelladresse = Column(String) - Taetigkeitsende = Column(Date) - Region = Column(String) - Taetigkeitsende_nv = Column(Boolean) - Marktrollen = Column(String) - Gasgrosshaendler = Column(Boolean) - BelieferungVonLetztverbrauchernGas = Column(Boolean) - BelieferungHaushaltskundenGas = Column(Boolean) - Netz = Column(String) - Direktvermarktungsunternehmen = Column(Boolean) - BelieferungVonLetztverbrauchernStrom = Column(Boolean) - BelieferungHaushaltskundenStrom = Column(Boolean) - Stromgrosshaendler = Column(Boolean) - MarktakteurVorname = Column(String) - MarktakteurNachname = Column(String) - WebportalDesNetzbetreibers = Column(String) - RegisternummerPraefix = Column(String) - - -class Grids(ParentAllTables, Base): - __tablename__ = "grids" - - MastrNummer = Column(String, primary_key=True) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - Sparte = Column(String) - KundenAngeschlossen = Column(String) - GeschlossenesVerteilnetz = Column(String) - Bezeichnung = Column(String) - Marktgebiet = Column(String) - Bundesland = Column(String) - - -class GridConnections(ParentAllTables, Base): - __tablename__ = "grid_connections" - - NetzanschlusspunktMastrNummer = Column(String, primary_key=True) - NetzanschlusspunktBezeichnung = Column(String) - LetzteAenderung = Column(DateTime(timezone=True)) - LokationMastrNummer = Column(String) - Lokationtyp = Column(String) - MaximaleEinspeiseleistung = Column(Float) - Gasqualitaet = Column(String) - NetzMastrNummer = 
Column(String) - NochInPlanung = Column(Boolean) - NameDerTechnischenLokation = Column(String) - MaximaleAusspeiseleistung = Column(Float) - Messlokation = Column(String) - Spannungsebene = Column(String) - BilanzierungsgebietNetzanschlusspunktId = Column(Integer) - Nettoengpassleistung = Column(Float) - Netzanschlusskapazitaet = Column(Float) - RegelzoneNetzanschlusspunkt = Column(String) - - -class DeletedUnits(ParentAllTables, Base): - __tablename__ = "deleted_units" - - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - EinheitMastrNummer = Column(String, primary_key=True) - Einheittyp = Column(String) - EinheitSystemstatus = Column(String) - EinheitBetriebsstatus = Column(String) - - -class DeletedMarketActors(ParentAllTables, Base): - __tablename__ = "deleted_market_actors" - - MarktakteurMastrNummer = Column(String, primary_key=True) - MarktakteurStatus = Column(String) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - - -class RetrofitUnits(ParentAllTables, Base): - __tablename__ = "retrofit_units" - - Id = Column(Integer, primary_key=True) - EegMastrNummer = Column(String) - Leistungserhoehung = Column(Float) - WiederinbetriebnahmeDatum = Column(Date) - DatumLetzteAktualisierung = Column(DateTime(timezone=True)) - Ertuechtigungsart = Column(String) - ErtuechtigungIstZulassungspflichtig = Column(Boolean) - - -class ChangedDSOAssignment(ParentAllTables, Base): - __tablename__ = "changed_dso_assignment" - - EinheitMastrNummer = Column(String, primary_key=True) - LokationMastrNummer = Column(String) - NetzanschlusspunktMastrNummer = Column(String) - NetzbetreiberMastrNummerNeu = Column(String) - NetzbetreiberMastrNummerAlt = Column(String) - ArtDerAenderung = Column(String) - RegistrierungsdatumNetzbetreiberzuordnungsaenderung = Column( - DateTime(timezone=True) - ) - Netzbetreiberzuordnungsaenderungsdatum = Column(DateTime(timezone=True)) - - -tablename_mapping = { - "anlageneegbiomasse": { - "__name__": BiomassEeg.__tablename__, - 
"__class__": BiomassEeg, - "replace_column_names": { - "EegMaStRNummer": "EegMastrNummer", - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheit", - "BiogasHoechstbemessungsleistung": "Hoechstbemessungsleistung", - }, - }, - "einheitenbiomasse": { - "__name__": BiomassExtended.__tablename__, - "__class__": BiomassExtended, - "replace_column_names": { - "EegMaStRNummer": "EegMastrNummer", - "KwkMaStRNummer": "KwkMastrNummer", - "LokationMaStRNummer": "LokationMastrNummer", - }, - }, - "anlageneeggeothermiegrubengasdruckentspannung": { - "__name__": GsgkEeg.__tablename__, - "__class__": GsgkEeg, - "replace_column_names": { - "EegMaStRNummer": "EegMastrNummer", - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheit", - }, - }, - "einheitengeothermiegrubengasdruckentspannung": { - "__name__": GsgkExtended.__tablename__, - "__class__": GsgkExtended, - "replace_column_names": { - "EegMaStRNummer": "EegMastrNummer", - "KwkMaStRNummer": "KwkMastrNummer", - "LokationMaStRNummer": "LokationMastrNummer", - }, - }, - "anlageneegsolar": { - "__name__": SolarEeg.__tablename__, - "__class__": SolarEeg, - "replace_column_names": { - "EegMaStRNummer": "EegMastrNummer", - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheit", - }, - }, - "einheitensolar": { - "__name__": SolarExtended.__tablename__, - "__class__": SolarExtended, - "replace_column_names": { - "EegMaStRNummer": "EegMastrNummer", - "LokationMaStRNummer": "LokationMastrNummer", - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheit", - }, - }, - "anlageneegspeicher": { - "__name__": StorageEeg.__tablename__, - "__class__": StorageEeg, - "replace_column_names": { - "EegMaStRNummer": "EegMastrNummer", - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheit", - "Zuschlagsnummer": "eegZuschlagsnummer", - }, - }, - "anlageneegwasser": { - "__name__": HydroEeg.__tablename__, - "__class__": HydroEeg, - "replace_column_names": { - "EegMaStRNummer": "EegMastrNummer", - "VerknuepfteEinheitenMaStRNummern": 
"VerknuepfteEinheit", - }, - }, - "einheitenwasser": { - "__name__": HydroExtended.__tablename__, - "__class__": HydroExtended, - "replace_column_names": { - "EegMaStRNummer": "EegMastrNummer", - "LokationMaStRNummer": "LokationMastrNummer", - }, - }, - "anlageneegwind": { - "__name__": WindEeg.__tablename__, - "__class__": WindEeg, - "replace_column_names": { - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheit", - "EegMaStRNummer": "EegMastrNummer", - }, - }, - "einheitenwind": { - "__name__": WindExtended.__tablename__, - "__class__": WindExtended, - "replace_column_names": { - "LokationMaStRNummer": "LokationMastrNummer", - "EegMaStRNummer": "EegMastrNummer", - "Nachtkennzeichnung": "Nachtkennzeichen", - }, - }, - "anlagengasspeicher": { - "__name__": GasStorage.__tablename__, - "__class__": GasStorage, - "replace_column_names": { - "MaStRNummer": "MastrNummer", - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheit", - }, - }, - "einheitengasspeicher": { - "__name__": GasStorageExtended.__tablename__, - "__class__": GasStorageExtended, - "replace_column_names": { - "LokationMaStRNummer": "LokationMastrNummer", - "SpeicherMaStRNummer": "SpeicherMastrNummer", - }, - }, - "anlagenkwk": { - "__name__": Kwk.__tablename__, - "__class__": Kwk, - "replace_column_names": { - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheiten" - }, - }, - "anlagenstromspeicher": { - "__name__": StorageUnits.__tablename__, - "__class__": StorageUnits, - "replace_column_names": { - "MaStRNummer": "MastrNummer", - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheit", - }, - }, - "bilanzierungsgebiete": { - "__name__": BalancingArea.__tablename__, - "__class__": BalancingArea, - "replace_column_names": None, - }, - "einheitenaenderungnetzbetreiberzuordnungen": { - "__name__": ChangedDSOAssignment.__tablename__, - "__class__": ChangedDSOAssignment, - "replace_column_names": None, - }, - "einheitengaserzeuger": { - "__name__": GasProducer.__tablename__, - "__class__": 
GasProducer, - "replace_column_names": { - "LokationMaStRNummer": "LokationMastrNummer", - "SpeicherMaStRNummer": "SpeicherMastrNummer", - }, - }, - "einheitengasverbraucher": { - "__name__": GasConsumer.__tablename__, - "__class__": GasConsumer, - "replace_column_names": { - "LokationMaStRNummer": "LokationMastrNummer", - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheit", - }, - }, - "einheitengenehmigung": { - "__name__": Permit.__tablename__, - "__class__": Permit, - "replace_column_names": { - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheiten", - "DatumDerAntragstellung": "DatumAntragstellung", - }, - }, - "einheitenkernkraft": { - "__name__": NuclearExtended.__tablename__, - "__class__": NuclearExtended, - "replace_column_names": { - "LokationMaStRNummer": "LokationMastrNummer", - }, - }, - "einheitenstromverbraucher": { - "__name__": ElectricityConsumer.__tablename__, - "__class__": ElectricityConsumer, - "replace_column_names": { - "LokationMaStRNummer": "LokationMastrNummer", - }, - }, - "einheitenstromspeicher": { - "__name__": StorageExtended.__tablename__, - "__class__": StorageExtended, - "replace_column_names": { - "LokationMaStRNummer": "LokationMastrNummer", - "EegMaStRNummer": "EegMastrNummer", - }, - }, - "einheitenverbrennung": { - "__name__": CombustionExtended.__tablename__, - "__class__": CombustionExtended, - "replace_column_names": { - "LokationMaStRNummer": "LokationMastrNummer", - "KwkMaStRNummer": "KwkMastrNummer", - }, - }, - "ertuechtigungen": { - "__name__": RetrofitUnits.__tablename__, - "__class__": RetrofitUnits, - "replace_column_names": None, - }, - "geloeschteunddeaktivierteeinheiten": { - "__name__": DeletedUnits.__tablename__, - "__class__": DeletedUnits, - "replace_column_names": None, - }, - "geloeschteunddeaktiviertemarktakteure": { - "__name__": DeletedMarketActors.__tablename__, - "__class__": DeletedMarketActors, - "replace_column_names": None, - }, - "marktakteure": { - "__name__": 
MarketActors.__tablename__, - "__class__": MarketActors, - "replace_column_names": None, - }, - "marktakteureundrollen": { - "__name__": MarketActorsAndRoles.__tablename__, - "__class__": MarketActorsAndRoles, - "replace_column_names": None, - }, - "netze": { - "__name__": Grids.__tablename__, - "__class__": Grids, - "replace_column_names": None, - }, - "netzanschlusspunkte": { - "__name__": GridConnections.__tablename__, - "__class__": GridConnections, - "replace_column_names": { - "LokationMaStRNummer": "LokationMastrNummer", - "NetzMaStRNummer": "NetzMastrNummer", - }, - }, - "katalogkategorien": { - "__name__": "katalogkategorien", - "__class__": None, - "replace_column_names": None, - }, - "katalogwerte": { - "__name__": "katalogwerte", - "__class__": None, - "replace_column_names": None, - }, - "lokationen": { - "__name__": LocationExtended.__tablename__, - "__class__": LocationExtended, - "replace_column_names": { - "VerknuepfteEinheitenMaStRNummern": "VerknuepfteEinheiten", - "NetzanschlusspunkteMaStRNummern": "Netzanschlusspunkte", - }, - }, - "einheitentypen": { - "__name__": "einheitentypen", - "__class__": None, - "replace_column_names": None, - }, -} diff --git a/open_mastr/utils/sqlalchemy_tables.py b/open_mastr/utils/sqlalchemy_tables.py new file mode 100644 index 00000000..23d92dc2 --- /dev/null +++ b/open_mastr/utils/sqlalchemy_tables.py @@ -0,0 +1,308 @@ +import logging +from typing import Any, Type, Union +from sqlalchemy import Column, Integer, String, Float, Boolean, Date, DateTime, Table, MetaData + +from open_mastr.utils.xsd_tables import ( + MastrColumnType, + MastrTableDescription, + translate_mastr_column_name, +) + +log = logging.getLogger("open-MaStR") + +# Potential hierarchy +# Id -> MastrNummer -> EinheitMastrNummer +# -> EegMastrNummer -> KwkMastrNummer -> GenMastrNummer +# -> MarktakteurMastrNummer -> NetzanschlusspunktMastrNummer +# in case we want to auto-detect the primary key. 
+MASTR_TABLE_NAME_TO_PRIMARY_KEY_COLUMNS = { + "AnlagenEegBiomasse": {"EegMastrNummer"}, + "AnlagenEegGeothermieGrubengasDruckentspannung": {"EegMastrNummer"}, + "AnlagenEegSolar": {"EegMastrNummer"}, + "AnlagenEegSpeicher": {"EegMastrNummer"}, + "AnlagenEegWasser": {"EegMastrNummer"}, + "AnlagenEegWind": {"EegMastrNummer"}, + "AnlagenGasSpeicher": {"MastrNummer"}, + "AnlagenKwk": {"KwkMastrNummer"}, + "AnlagenStromSpeicher": {"MastrNummer"}, + "Bilanzierungsgebiete": {"Id"}, + + # There is no unique key for this table. So we will have to insert one. + # Check for example the entries for SEE990510388975. We could use + # EinheitMastrNummer + RegistrierungsdatumNetzbetreiberzuordnungsaenderung, + # but the MaStR docs say that RegistrierungsdatumNetzbetreiberzuordnungsaenderung + # can be NULL. + # Plus, SEE904666329300 would additionally need Netzbetreiberzuordnungsaenderungsdatum + # to make it unique. But Netzbetreiberzuordnungsaenderungsdatum actually has NULL values for + # some rows, so it cannot be used in a composite primary key because it must be nullable. + # (Nullable columns in a primary key are OK in SQLite, but not in PostgreSQL & MySQL.) 
+ "EinheitenAenderungNetzbetreiberzuordnungen": None, + + "EinheitenBiomasse": {"EinheitMastrNummer"}, + "EinheitenGasErzeuger": {"EinheitMastrNummer"}, + "EinheitenGasSpeicher": {"EinheitMastrNummer"}, + "EinheitenGasverbraucher": {"EinheitMastrNummer"}, + "EinheitenGenehmigung": {"GenMastrNummer"}, + "EinheitenGeothermieGrubengasDruckentspannung": {"EinheitMastrNummer"}, + "EinheitenKernkraft": {"EinheitMastrNummer"}, + "EinheitenSolar": {"EinheitMastrNummer"}, + "EinheitenStromSpeicher": {"EinheitMastrNummer"}, + "EinheitenStromVerbraucher": {"EinheitMastrNummer"}, + "Einheitentypen": {"Id"}, + "EinheitenVerbrennung": {"EinheitMastrNummer"}, + "EinheitenWasser": {"EinheitMastrNummer"}, + "EinheitenWind": {"EinheitMastrNummer"}, + "Ertuechtigungen": {"Id"}, + "GeloeschteUndDeaktivierteEinheiten": {"EinheitMastrNummer"}, + "GeloeschteUndDeaktivierteMarktakteure": {"MarktakteurMastrNummer"}, + "Katalogkategorien": {"Id"}, + "Katalogwerte": {"Id"}, + "Lokationen": {"MastrNummer"}, + "Lokationstypen": {"Id"}, + "MarktakteureUndRollen": {"MastrNummer"}, + "Marktakteure": {"MastrNummer"}, + "Marktfunktionen": {"Id"}, + "Marktrollen": {"Id"}, + "Netzanschlusspunkte": {"NetzanschlusspunktMastrNummer"}, + "Netze": {"MastrNummer"}, +} + + +def make_sqlalchemy_table_from_mastr_table_description( + table_description: MastrTableDescription, + catalog_value_as_str: bool, + metadata: MetaData, + english: bool = False, + mixins: tuple[type, ...] = tuple(), + include_download_metadata: bool = True, +) -> Table: + if english: + if table_description.english_table_name: + table_name = table_description.english_table_name + else: + table_name = table_description.original_table_name + english = False + log.warning( + f"English table name not available for {table_name}." + " Using German for the whole table." 
+ ) + else: + table_name = table_description.original_table_name + + try: + primary_key_columns = MASTR_TABLE_NAME_TO_PRIMARY_KEY_COLUMNS[ + table_description.original_table_name + ] or set() + artificial_primary_key_name = "OpenMastrId" + except KeyError: + # This table is not yet known to open-mastr. We insert a temporary + # primary key, but make that clear in the name so that users don't + # rely on it. + primary_key_columns = set() + artificial_primary_key_name = "TempOpenMastrIdForUnknownTable" + + if primary_key_columns and english: + primary_key_columns = {translate_mastr_column_name(column) for column in primary_key_columns} + + db_column_kwargs = [] + for mastr_column in table_description.columns: + column_name = ( + mastr_column.english_name or mastr_column.normalized_name + if english + else mastr_column.normalized_name + + ) + column_type = _get_sqlalchemy_type_for_mastr_column_type( + mastr_column_type=mastr_column.type, + catalog_value_as_str=catalog_value_as_str, + ) + kwargs = ( + {"primary_key": True} + if column_name in primary_key_columns + else {"nullable": True} + ) + db_column_kwargs.append( + { + "name": column_name, + "type_": column_type, + "info": { + "original_name": mastr_column.original_name, + "normalized_name": mastr_column.normalized_name, + "english_name": mastr_column.english_name, + }, + **kwargs, + } + ) + db_column_kwargs = _prepend_primary_key_if_missing( + expected_primary_key_columns=primary_key_columns, + db_column_kwargs=db_column_kwargs, + table_name=table_description.original_table_name, + new_primary_key_name=artificial_primary_key_name, + ) + + if include_download_metadata: + data_source_col_info = { + "normalized_name": "DatenQuelle", + "english_name": "dataSource", + } + data_source_col_name = data_source_col_info["english_name" if english else "normalized_name"] + db_column_kwargs.append( + { + "name": data_source_col_name, + "type_": String, + "info": data_source_col_info, + } + ) + download_date_col_info = { + 
"normalized_name": "DatumDownload", + "english_name": "downloadDate", + } + download_date_col_name = download_date_col_info["english_name" if english else "normalized_name"] + db_column_kwargs.append( + { + "name": download_date_col_name, + "type_": String, + "info": download_date_col_info, + } + ) + + db_columns = [Column(**kwargs) for kwargs in db_column_kwargs] + + return Table( + table_name, + metadata, + *db_columns, + info={ + "original_name": table_description.original_table_name, + "english_name": table_description.english_table_name, + }, + ) + + +def _prepend_primary_key_if_missing( + expected_primary_key_columns: set[str], + db_column_kwargs: list[dict[str, Any]], + table_name: str, + new_primary_key_name: str, +) -> list[dict[str, Any]]: + realized_primary_key_columns = { + kwargs["name"] + for kwargs in db_column_kwargs + if kwargs["name"] in expected_primary_key_columns + } + if expected_primary_key_columns and ( + realized_primary_key_columns == expected_primary_key_columns + ): + return db_column_kwargs.copy() + + id_column_name = "OpenMastrId" + log.info( + f"Missing primary key column for table {table_name}." + f" Inserting custom ID column {new_primary_key_name!r}" + ) + return [ + {"name": new_primary_key_name, "type_": Integer, "primary_key": True, "autoincrement": True} + ] + [ + kwargs | {"primary_key": False, "nullable": True} + for kwargs in db_column_kwargs + ] + + +_MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE = { + MastrColumnType.STRING: String, + MastrColumnType.INTEGER: Integer, + MastrColumnType.FLOAT: Float, + MastrColumnType.DATE: Date, + MastrColumnType.DATETIME: DateTime(timezone=True), + MastrColumnType.BOOLEAN: Boolean, +} + + +# We're creating special column types for the catalog columns here so that +# we can identify the catalog columns later when processing the XML files. 
+class CatalogInteger(Integer): + pass + + +class CatalogString(String): + pass + + +def _get_sqlalchemy_type_for_mastr_column_type( + mastr_column_type: MastrColumnType, catalog_value_as_str: bool, +) -> Union[Type[String], Type[Integer], Type[Float], Type[Date], Type[DateTime], Type[Boolean]]: + if mastr_column_type is MastrColumnType.CATALOG_VALUE: + return CatalogString if catalog_value_as_str else CatalogInteger + return _MASTR_COLUMN_TYPE_TO_SQLALCHEMY_TYPE[mastr_column_type] + + +def format_sqlalchemy_column(column: Column) -> str: + """Format SQLAlchemy column + + This is an almost exact copy of sqlalchemy.Column.__repr__ with the difference + that "info" is also formatted. + """ + kwarg = [] + if column.key != column.name: + kwarg.append("key") + if column.primary_key: + kwarg.append("primary_key") + if not column.nullable: + kwarg.append("nullable") + if column.onupdate: + kwarg.append("onupdate") + if column.default: + kwarg.append("default") + if column.server_default: + kwarg.append("server_default") + if column.comment: + kwarg.append("comment") + if column.info: + kwarg.append("info") + return "Column(%s)" % ", ".join( + [repr(column.name)] + + [repr(column.type)] + + [repr(x) for x in column.foreign_keys if x is not None] + + [repr(x) for x in column.constraints] + + [ + ( + column.table is not None + and "table=<%s>" % column.table.description + or "table=None" + ) + ] + + ["%s=%s" % (k, repr(getattr(column, k))) for k in kwarg] + ) + + +def format_sqlalchemy_table(table: Table) -> str: + """Format SQLAlchemy table + + This is an almost exact copy of sqlalchemy.Table.__repr__ with two differences: + - "info" is also formatted + - more whitespace (especially linebreaks) to make it more easily readable + """ + return "Table(\n %s\n)" % ",\n ".join( + [repr(table.name)] + + [repr(table.metadata)] + + [format_sqlalchemy_column(x) for x in table.columns] + + ["%s=%s" % (k, repr(getattr(table, k))) for k in ["info", "schema"]] + ) + + +def
format_mastr_table_to_db_table(mastr_table_to_db_table: dict[str, Table]) -> str: + """Format mapping from MaStR table to SQLAlchemy table. + + Parameters + ---------- + mastr_table_to_db_table : Mapping from MaStR table name (str) to SQLAlchemy Table + + Returns + ------- + str + The formatted mapping as a string. + """ + parts = [] + for mastr_table, db_table in mastr_table_to_db_table.items(): + parts.append(f"{mastr_table}: {format_sqlalchemy_table(db_table)}") + return "\n\n".join(parts) diff --git a/open_mastr/utils/sqlalchemy_views.py b/open_mastr/utils/sqlalchemy_views.py new file mode 100644 index 00000000..89546554 --- /dev/null +++ b/open_mastr/utils/sqlalchemy_views.py @@ -0,0 +1,98 @@ +import logging +from collections.abc import Mapping + +from sqlalchemy import Engine, MetaData, Table, inspect, select, text +from sqlalchemy.ext.compiler import compiles +from sqlalchemy.sql.ddl import DDLElement + +log = logging.getLogger("open-MaStR") + + +class CreateView(DDLElement): + def __init__(self, name: str, selectable): + self.name = name + self.selectable = selectable + + +@compiles(CreateView) +def _compile_create_view(element, compiler, **kw): + selectable_sql = compiler.sql_compiler.process( + element.selectable, literal_binds=True + ) + return f"CREATE VIEW {element.name} AS {selectable_sql}" + + +OLD_MASTR_TABLE_TO_OLD_DB_TABLE_NAME = { + "AnlagenEegBiomasse": "biomass_eeg", + "AnlagenEegGeothermieGrubengasDruckentspannung": "gsgk_eeg", + "AnlagenEegSolar": "solar_eeg", + "AnlagenEegSpeicher": "storage_eeg", + "AnlagenEegWasser": "hydro_eeg", + "AnlagenEegWind": "wind_eeg", + "AnlagenGasSpeicher": "gas_storage", + "AnlagenKwk": "kwk", + "AnlagenStromSpeicher": "storage_units", + "Bilanzierungsgebiete": "balancing_area", + "EinheitenAenderungNetzbetreiberzuordnungen": "changed_dso_assignment", + "EinheitenBiomasse": "biomass_extended", + "EinheitenGasErzeuger": "gas_producer", + "EinheitenGasSpeicher": "gas_storage_extended", +
"EinheitenGasverbraucher": "gas_consumer", + "EinheitenGenehmigung": "permit", + "EinheitenGeothermieGrubengasDruckentspannung": "gsgk_extended", + "EinheitenKernkraft": "nuclear_extended", + "EinheitenSolar": "solar_extended", + "EinheitenStromSpeicher": "storage_extended", + "EinheitenStromVerbraucher": "electricity_consumer", + "EinheitenVerbrennung": "combustion_extended", + "EinheitenWasser": "hydro_extended", + "EinheitenWind": "wind_extended", + "Ertuechtigungen": "retrofit_units", + "GeloeschteUndDeaktivierteEinheiten": "deleted_units", + "GeloeschteUndDeaktivierteMarktakteure": "deleted_market_actors", + "Katalogkategorien": "catalog_categories", + "Katalogwerte": "catalog_values", + "Lokationen": "locations", + "Marktakteure": "market_actors", + "MarktakteureUndRollen": "market_actors_and_roles", + "Netzanschlusspunkte": "grid_connections", + "Netze": "grids", +} + + +def _create_view( + engine: Engine, + db_table: Table, + view_name: str, +) -> None: + inspector = inspect(engine) + tables = inspector.get_table_names() + views = inspector.get_view_names() + + with engine.begin() as conn: + if view_name in views: + conn.execute(text(f'DROP VIEW "{view_name}"')) + elif view_name in tables: + conn.execute(text(f'DROP TABLE "{view_name}"')) + conn.execute(CreateView(view_name, select(db_table))) + + +def create_views(engine: Engine, mastr_table_to_db_table: Mapping[str, Table]) -> None: + for mastr_table, db_table in mastr_table_to_db_table.items(): + old_table_name = OLD_MASTR_TABLE_TO_OLD_DB_TABLE_NAME.get(mastr_table) + if not old_table_name: + log.debug( + f"No old table name known for MaStR table {mastr_table}." + " Not creating any view." + ) + continue + if old_table_name == db_table.name: + log.debug( + f"Old table name {old_table_name} was the same as the current name." + " Not creating any view." 
+ ) + continue + + log.info(f"Creating view {old_table_name} mirroring table {db_table.name}") + _create_view(engine=engine, db_table=db_table, view_name=old_table_name) + diff --git a/open_mastr/utils/xsd_tables.py b/open_mastr/utils/xsd_tables.py new file mode 100644 index 00000000..e8195aa8 --- /dev/null +++ b/open_mastr/utils/xsd_tables.py @@ -0,0 +1,181 @@ +import logging +import os +from enum import auto, Enum +from dataclasses import dataclass +from pathlib import Path +from typing import Optional, Union +from zipfile import ZipFile, ZipInfo +import xmlschema +from xmlschema.validators.simple_types import XsdAtomicBuiltin, XsdAtomicRestriction +from xmlschema.validators.exceptions import XMLSchemaModelError + +from open_mastr.utils.helpers import data_to_include_tables +from open_mastr.utils.constants import COLUMN_TRANSLATIONS, TABLE_TRANSLATIONS + +_XML_SCHEMA_PREFIX = "{http://www.w3.org/2001/XMLSchema}" + +log = logging.getLogger("open-MaStR") + + +def normalize_mastr_name(original_mastr_name: str) -> str: + """Normalize original MaStR column name. + + BNetzA sometimes has "MaStR", other times "Mastr". We normalize that. + Also, in case the column names in the XSD contain äöüß, we replace them. + This is probably a BNetzA oversight, but has happened at least once. 
+ """ + return original_mastr_name.replace("MaStR", "Mastr").replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss").strip() + + +def translate_mastr_column_name(normalized_mastr_column_name: str) -> Optional[str]: + translated = COLUMN_TRANSLATIONS.get(normalized_mastr_column_name) + if not translated: + log.warning(f"No translation available for column {normalized_mastr_column_name!r}") + return translated + + +def translate_mastr_table_name(normalized_mastr_table_name: str) -> Optional[str]: + translated = TABLE_TRANSLATIONS.get(normalized_mastr_table_name) + if not translated: + log.warning(f"No translation available for table {normalized_mastr_table_name!r}") + return translated + + +class MastrColumnType(Enum): + STRING = auto() + INTEGER = auto() + FLOAT = auto() + DATE = auto() + DATETIME = auto() + BOOLEAN = auto() + CATALOG_VALUE = auto() + + @classmethod + def from_xsd_type(cls, xsd_type: Union[XsdAtomicBuiltin, XsdAtomicRestriction]) -> "MastrColumnDescription": + xsd_type_to_mastr_column_type = { + f"{_XML_SCHEMA_PREFIX}string": cls.STRING, + f"{_XML_SCHEMA_PREFIX}decimal": cls.INTEGER, + f"{_XML_SCHEMA_PREFIX}int": cls.INTEGER, + f"{_XML_SCHEMA_PREFIX}short": cls.INTEGER, + f"{_XML_SCHEMA_PREFIX}byte": cls.INTEGER, + f"{_XML_SCHEMA_PREFIX}float": cls.FLOAT, + f"{_XML_SCHEMA_PREFIX}date": cls.DATE, + f"{_XML_SCHEMA_PREFIX}dateTime": cls.DATETIME, + } + if xsd_type.is_restriction(): + if enumeration := xsd_type.enumeration: + if set(enumeration) == {0, 1}: + return cls.BOOLEAN + return cls.CATALOG_VALUE + # Ertuechtigungen.xsd has some normal types defined as restrictions for some reason. + # We cope with that by extracting the primitive type it's restricted to. 
+ inner_xsd_type = xsd_type.primitive_type + if mastr_column_type := xsd_type_to_mastr_column_type.get(inner_xsd_type.name): + return mastr_column_type + + if mastr_column_type := xsd_type_to_mastr_column_type.get(xsd_type.name): + return mastr_column_type + + raise ValueError(f"Could not determine MastrColumnType from XSD type {xsd_type!r}") + + +@dataclass(frozen=True) +class MastrColumnDescription: + original_name: str + normalized_name: str + english_name: Optional[str] + type: MastrColumnType + + @classmethod + def from_xsd_element(cls, xsd_element: xmlschema.XsdElement) -> "MastrColumnDescription": + normalized_name = normalize_mastr_name(xsd_element.name) + return cls( + original_name=xsd_element.name, + normalized_name=normalized_name, + english_name=translate_mastr_column_name(normalized_name), + type=MastrColumnType.from_xsd_type(xsd_element.type) + ) + + +@dataclass(frozen=True) +class MastrTableDescription: + original_table_name: str + english_table_name: Optional[str] + instance_name: str + columns: tuple[MastrColumnDescription, ...] + + @classmethod + def from_xml_schema(cls, schema: xmlschema.XMLSchema) -> "MastrTableDescription": + if len(schema.root_elements) != 1: + raise ValueError( + "XML schema must have exactly one root element," + f" but has {len(schema.root_elements)} ({schema.root_elements!r})" + ) + root = schema.root_elements[0] + + try: + main_element = root.content.content[0] + column_elements = main_element.content.content + except (AttributeError, IndexError, TypeError) as e: + raise ValueError(f"Could not find columns in XML schema {schema!r}") from e + + columns = tuple( + MastrColumnDescription.from_xsd_element(element) + for element in column_elements + ) + + # We don't normalize the table name because + # - it would introduce too much complexity to have two German table names + # - the normalization would leave the table name as is (at least as of Feb 2026) + original_table_name = root.name + english_table_name =
translate_mastr_table_name(original_table_name) + + return cls( + original_table_name=original_table_name, + english_table_name=english_table_name, + instance_name=main_element.name, + columns=columns, + ) + + +class InvalidXmlSchemaError(Exception): + pass + + +def read_mastr_table_descriptions_from_xsd( + zipped_docs_file_path: Union[Path, str], data: list[str] +) -> set[MastrTableDescription]: + include_tables = data_to_include_tables(data) + + mastr_table_descriptions = set() + with ZipFile(zipped_docs_file_path, "r") as docs_z: + xsd_zip_entry = _find_xsd_zip_entry(docs_z) + with ZipFile(docs_z.open(xsd_zip_entry)) as xsd_z: + for entry in xsd_z.filelist: + if entry.is_dir() or not entry.filename.endswith(".xsd"): + continue + + normalized_name = os.path.basename(entry.filename).removesuffix(".xsd").lower() + if normalized_name in include_tables: + with xsd_z.open(entry) as xsd_file: + try: + schema = xmlschema.XMLSchema(xsd_file) + except XMLSchemaModelError as e: + raise InvalidXmlSchemaError( + f"Invalid XML Schema in {os.path.basename(entry.filename)}" + ) from e + mastr_table_description = MastrTableDescription.from_xml_schema(schema) + mastr_table_descriptions.add(mastr_table_description) + + return mastr_table_descriptions + + +def _find_xsd_zip_entry(docs_zip_file: ZipFile) -> ZipInfo: + desired_filename = "xsd.zip" + for entry in docs_zip_file.filelist: + if os.path.basename(entry.filename) == desired_filename: + return entry + raise RuntimeError( + f"Did not find XSD files in the form of {desired_filename!r} in the documentation" + f" ZIP file {docs_zip_file.filename!r}" + ) diff --git a/open_mastr/xml_download/colums_to_replace.py b/open_mastr/xml_download/colums_to_replace.py index 421ac44c..fde8d5e7 100644 --- a/open_mastr/xml_download/colums_to_replace.py +++ b/open_mastr/xml_download/colums_to_replace.py @@ -1,6 +1,5 @@ -# system catalog is the mapping for the entries within the two columns -# Marktfunktionen und Lokationstyp (entry 1 is mapped 
to Stromnetzbetreiber -# in the column Marktfunktionen) +# system catalog is the mapping for the entries within the columns +# Marktfunktion, Lokationtyp and Einheittyp # The values for the system catalog can be found in the pdf of the bulk download # documentation: https://www.marktstammdatenregister.de/MaStR/Datendownload @@ -38,98 +37,3 @@ 12: "Gasspeichereinheit", }, } - -# columns to replace lists all columns where the entries have -# to be replaced according to the tables katalogwerte and katalogeinträge -# from the bulk download of the MaStR - -columns_replace_list = [ - # anlageneegsolar - "AnlageBetriebsstatus", - # anlageneegspeicher - # anlagenstromspeicher - # einheitensolar - "Land", - "Bundesland", - "EinheitSystemstatus", - "EinheitBetriebsstatus", - "Energietraeger", - "Einspeisungsart", - "GemeinsamerWechselrichterMitSpeicher", - "Lage", - "Leistungsbegrenzung", - "Hauptausrichtung", - "HauptausrichtungNeigungswinkel", - "Nutzungsbereich", - "Nebenausrichtung", - "NebenausrichtungNeigungswinkel", - "ArtDerFlaecheIds", - # einheitenstromspeicher - "AcDcKoppelung", - "Batterietechnologie", - "Technologie", - "Pumpspeichertechnologie", - "Einsatzort", - # geloeschteunddeaktivierteEinheiten - # geloeschteunddeaktivierteMarktAkteure - "MarktakteurStatus", - # lokationen - # marktakteure - "Personenart", - "Rechtsform", - "HauptwirtdschaftszweigAbteilung", - "HauptwirtdschaftszweigGruppe", - "HauptwirtdschaftszweigAbschnitt", - "Registergericht", - "LandAnZustelladresse", - # netzanschlusspunkte - "Gasqualitaet", - "Spannungsebene", - # anlageneegbiomasse - # anlageneeggeosolarthermiegrubenklaerschlammdruckentspannung - # anlageneegwasser - # anlageneegwind - # anlagengasspeicher - # anlagenkwk - # bilanzierungsgebiete - # einheitenaenderungnetzbetreiberzuordnungen - "ArtDerAenderung", - # einheitenbiomasse - "Hauptbrennstoff", - "Biomasseart", - # einheitengaserzeuger - # einheitengasspeicher - "Speicherart", - # einheitengasverbraucher - # 
einheitengenehmigung - "Art", - # einheitengeosolarthermiegrubenklaerschlammdruckentspannung - # einheitenkernkraft - # einheitenstromverbraucher - "ArtAbschaltbareLast", - # einheitentypen - # einheitenverbrennung - "WeitererHauptbrennstoff", - "WeitereBrennstoffe", - "ArtDerStilllegung", - # einheitenwasser - "ArtDesZuflusses", - "ArtDerWasserkraftanlage", - # marktrollen - # netze - "Sparte", - # einheitenwind - "Lage", - "Hersteller", - "Seelage", - "ClusterNordsee", - "ClusterOstsee", - # various tables - "NetzbetreiberpruefungStatus", - "WindAnLandOderAufSee", - "TechnologieFlugwindenergieanlage", - "Flughoehe", - "Flugradius", - "ArtDerSolaranlage", - "SpeicherAmGleichenOrt", -] diff --git a/open_mastr/xml_download/utils_cleansing_bulk.py b/open_mastr/xml_download/utils_cleansing_bulk.py index 928f7c68..6f4592db 100644 --- a/open_mastr/xml_download/utils_cleansing_bulk.py +++ b/open_mastr/xml_download/utils_cleansing_bulk.py @@ -1,24 +1,29 @@ import pandas as pd import numpy as np +from collections.abc import Collection +from zipfile import ZipFile +import io + from open_mastr.xml_download.colums_to_replace import ( system_catalog, - columns_replace_list, ) -from zipfile import ZipFile -import io -def cleanse_bulk_data(df: pd.DataFrame, zipped_xml_file_path: str) -> pd.DataFrame: - df = replace_ids_with_names(df, system_catalog) - # Katalogeintraege: int -> string value +def cleanse_bulk_data( + df: pd.DataFrame, + catalog_columns: Collection[str], + zipped_xml_file_path: str, +) -> pd.DataFrame: + df = replace_system_catalog_ids(df, system_catalog) + catalog_columns = set(catalog_columns) - system_catalog.keys() df = replace_mastr_katalogeintraege( - zipped_xml_file_path=zipped_xml_file_path, df=df + zipped_xml_file_path=zipped_xml_file_path, df=df, catalog_columns=catalog_columns, ) return df -def replace_ids_with_names(df: pd.DataFrame, system_catalog: dict) -> pd.DataFrame: - """Replaces ids with names according to the system catalog. 
This is +def replace_system_catalog_ids(df: pd.DataFrame, system_catalog: dict[int, str]) -> pd.DataFrame: + """Replaces IDs with names according to the system catalog. This is necessary since the data from the bulk download encodes columns with IDs instead of the actual values.""" for column_name, name_mapping_dictionary in system_catalog.items(): @@ -28,16 +33,18 @@ def replace_ids_with_names(df: pd.DataFrame, system_catalog: dict) -> pd.DataFra def replace_mastr_katalogeintraege( - zipped_xml_file_path: str, df: pd.DataFrame, + catalog_columns: Collection[str], + zipped_xml_file_path: str, ) -> pd.DataFrame: """Replaces the IDs from the mastr database by its mapped string values from - the table katalogwerte""" + the table Katalogwerte""" + # TODO: Create Katalogwerte dict once for whole download, not once per processed file. katalogwerte = create_katalogwerte_from_bulk_download(zipped_xml_file_path) for column_name in df.columns: - if column_name in columns_replace_list: + if column_name in catalog_columns: if df[column_name].dtype == "O": - # Handle comma seperated strings from catalog values + # Handle comma-separated strings from catalog values df[column_name] = ( df[column_name] .str.split(",", expand=True) diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index 9eedebab..54f05db6 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -1,13 +1,14 @@ import os import shutil import time -from datetime import datetime as dt +from collections import defaultdict from importlib.metadata import PackageNotFoundError, version from zipfile import ZipFile from pathlib import Path import urllib.request import re -from datetime import datetime +from datetime import date, datetime +from typing import Optional import numpy as np import requests @@ -88,7 +89,7 @@ def gen_version( return f"{year}.{release}" -def gen_url( +def gen_xml_download_url( when: 
time.struct_time = time.localtime(), use_version="current", use_stichtag=False ) -> str: """Generates the download URL for the specified date. @@ -128,7 +129,7 @@ def gen_url( def download_xml_Mastr( save_path: str, bulk_date_string: str, - bulk_data_list: list, + bulk_data_list: list[str], xml_folder_path: str, url: str = None, ) -> None: @@ -147,15 +148,18 @@ def download_xml_Mastr( url: str, optional Custom download URL. If None, generates URL based on bulk_date_string. """ + missing_tables = _find_missing_tables(save_path, bulk_data_list) + if not missing_tables: + return - log.info("Starting the Download from marktstammdatenregister.de.") + log.info("Starting the download from marktstammdatenregister.de.") # Helper function to convert date string to time.struct_time def _parse_date_string(date_str): """Convert YYYYMMDD string to time.struct_time object.""" try: # Use datetime.strptime for robust date parsing - parsed_date = dt.strptime(date_str, "%Y%m%d") + parsed_date = datetime.strptime(date_str, "%Y%m%d") # Convert to time.struct_time using timetuple() return parsed_date.timetuple() except (ValueError, IndexError) as e: @@ -168,7 +172,7 @@ def _parse_date_string(date_str): # Determine the URL to use if url is None: # Generate URL from date string if no custom URL provided - url = gen_url(url_time) + url = gen_xml_download_url(url_time) # else: custom URL is already provided, use it as-is time_a = time.perf_counter() @@ -180,23 +184,23 @@ def _parse_date_string(date_str): now = time.localtime( time.mktime(url_time) - (24 * 60 * 60) ) # subtract 1 day from the date - url = gen_url(now) + url = gen_xml_download_url(now) r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) if r.status_code == 404: - url = gen_url(url_time, use_version="before") # Use lower MaStR Version + url = gen_xml_download_url(url_time, use_version="before") # Use lower MaStR Version log.warning( f"Download file was not found. 
Assuming that the version of MaStR has changed and retrying with download link: {url}"
         )
         r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT})
         if r.status_code == 404:
-            url = gen_url(url_time, use_version="after")  # Use higher MaStR Version
+            url = gen_xml_download_url(url_time, use_version="after")  # Use higher MaStR Version
             log.warning(
                 f"Download file was not found. Assuming that the version of MaStR has changed and retrying with download link: {url}"
             )
             r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT})
             if r.status_code == 404:
-                url = gen_url(
+                url = gen_xml_download_url(
                     url_time, use_stichtag=True
                 )  # Use different url-structure for older downloads
                 log.warning(
@@ -207,14 +211,15 @@ def _parse_date_string(date_str):
                     log.error("Could not download file: download URL not found")
                     return
 
-    if bulk_data_list == BULK_DATA:
-        full_download_without_unzip_http(save_path, r, bulk_data_list)
+    all_tables = {table for tables in BULK_INCLUDE_TABLES_MAP.values() for table in tables}
+    if missing_tables.issuperset(all_tables):
+        full_download_without_unzip_http(save_path, r)
     else:
         try:
-            partial_download_with_unzip_http(save_path, url, bulk_data_list)
+            partial_download_with_unzip_http(save_path, url, missing_tables)
         except Exception as e:
             log.warning(f"Partial download failed, fallback to full download: {e}")
-            full_download_without_unzip_http(save_path, r, bulk_data_list)
+            full_download_without_unzip_http(save_path, r)
 
     time_b = time.perf_counter()
     log.info(
@@ -223,26 +228,36 @@ def _parse_date_string(date_str):
     log.info(f"MaStR was successfully downloaded to {xml_folder_path}.")
 
 
-def check_download_completeness(
-    save_path: str, bulk_data_list: list
-) -> tuple[list, bool]:
-    """Checks if an existing download contains the xml-files corresponding to the bulk_data_list."""
+def _find_missing_tables(
+    save_path: str, bulk_data_list: list[str]
+) -> set[str]:
+    """Check if an existing download contains the XML files corresponding to the 
bulk_data_list.""" + needed_tables = { + bulk_table_name + for bulk_data_name in bulk_data_list + for bulk_table_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name] + } | {"katalogwerte"} # We always need Katalogwerte! + if not os.path.exists(save_path): + return needed_tables + with ZipFile(save_path, "r") as zip_ref: - existing_files = [ + existing_tables = { zip_name.lower().split("_")[0].split(".")[0] for zip_name in zip_ref.namelist() - ] + } - missing_data_set = set() - for bulk_data_name in bulk_data_list: - for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: - if bulk_file_name not in existing_files: - missing_data_set.add(bulk_data_name) + missing_tables = needed_tables - existing_tables + if missing_tables: + log.info( + f"MaStR XML ZIP file already present but missing the following data: {missing_tables}" + ) + else: + log.info( + "MaStR XML ZIP file already present and has all info. Not downloading again." + f" Existing file: {save_path}" + ) - is_katalogwerte_existing = False - if "katalogwerte" in existing_files: - is_katalogwerte_existing = True - return list(missing_data_set), is_katalogwerte_existing + return missing_tables def delete_xml_files_not_from_given_date( @@ -267,7 +282,7 @@ def delete_xml_files_not_from_given_date( os.makedirs(xml_folder_path) -def partial_download_with_unzip_http(save_path: str, url: str, bulk_data_list: list): +def partial_download_with_unzip_http(save_path: str, url: str, names_to_download: set[str]): """ Parameters @@ -276,60 +291,35 @@ def partial_download_with_unzip_http(save_path: str, url: str, bulk_data_list: l Full file path where the downloaded MaStR zip file will be saved. url: str URL path to bulk file. - bulk_data_list: list - List of tables/technologis to be downloaded. + names_to_download: set + List of tables to be downloaded. + E.g. 
{"anlageneegsolar", "einheitensolar", "katalogwerte"} Returns ------- None """ - is_katalogwerte_existing = False - if os.path.exists(save_path): - bulk_data_list, is_katalogwerte_existing = check_download_completeness( - save_path, bulk_data_list - ) - if bool(bulk_data_list): - log.info( - f"MaStR file already present but missing the following data: {bulk_data_list}" - ) - else: - log.info(f"MaStR file already present: {save_path}") - return None - remote_zip_file = unzip_http.RemoteZipFile(url) remote_zip_names = [ remote_zip_name.lower().split("_")[0].split(".")[0] for remote_zip_name in remote_zip_file.namelist() ] - remote_index_list = [] download_files_list = [] - for bulk_data_name in bulk_data_list: - # Example: ['wind','solar'] - for bulk_file_name in BULK_INCLUDE_TABLES_MAP[bulk_data_name]: - # Example: From "wind" we get ["anlageneegwind", "einheitenwind"], and from "solar" we get ["anlageneegsolar", "einheitensolar"] - # and we have to find the corresponding index in the remote_zip_file list in order to fetch the correct file - remote_index_list = [ - remote_index - for remote_index, remote_zip_name in enumerate(remote_zip_names) - if remote_zip_name == bulk_file_name - ] - # for remote_index in tqdm(remote_index_list): - for remote_index in remote_index_list: + for name in names_to_download: + # we have to find the corresponding index in the remote_zip_file list in order to fetch the correct file + for remote_index, remote_zip_name in enumerate(remote_zip_names): + if remote_zip_name == name: # Example: remote_zip_file.namelist()[remote_index] corresponds to e.g. 
'AnlagenEegSolar_1.xml' download_files_list.append(remote_zip_file.namelist()[remote_index]) for zipfile_name in tqdm(download_files_list, unit=" file"): remote_zip_file.extractzip(zipfile_name, path=Path(save_path)) - if not is_katalogwerte_existing: - remote_zip_file.extractzip("Katalogwerte.xml", path=Path(save_path)) - def full_download_without_unzip_http( save_path: str, r: requests.models.Response, - bulk_data_list: list, ) -> None: """ @@ -339,29 +329,18 @@ def full_download_without_unzip_http( Full file path where the downloaded MaStR zip file will be saved. r: requests.models.Response Response from making a request to MaStR. - bulk_data_list: list - List of tables/technologis to be downloaded. Returns ------- None """ - if os.path.exists(save_path): - bulk_data_list, is_katalogwerte_existing = check_download_completeness( - save_path, bulk_data_list - ) - if bool(bulk_data_list): - print( - f"MaStR file already present but missing the following data: {bulk_data_list}" - ) - else: - print(f"MaStR file already present: {save_path}") - return None - warning_message = ( "Warning: The servers from MaStR restrict the download speed." " You may want to download it another time." ) + # We could get rid of this magic number by first making a request to get the file size + # and then using that as total length for the progress bar. + # See https://github.com/OpenEnergyPlatform/open-MaStR/issues/570 total_length = int(23000) with ( open(save_path, "wb") as zfile, @@ -401,7 +380,8 @@ def get_available_download_links( list of dict A list of dictionaries containing information about available downloads. 
Each dictionary contains: - - 'url': The download URL + - 'url': The download URL of the XML containing the MaStR data + - 'docs_url': The download URL of the docs describing the MaStR data format - 'date': The date of the export (YYYYMMDD format) - 'version': The MaStR version (e.g., '24.1', '24.2') - 'type': 'current' for current exports, 'stichtag' for historical exports @@ -427,45 +407,82 @@ def get_available_download_links( log.error(f"Failed to fetch download page: {e}") return [] - # Pattern for current exports - pattern_current = re.compile( - r"https://download\.marktstammdatenregister\.de/Gesamtdatenexport_([0-9]{8})_([0-9]{2}\.[0-9])\.zip" - ) - # Pattern for historical exports (Stichtag) - pattern_stichtag = re.compile( - r"https://download\.marktstammdatenregister\.de/Stichtag/Gesamtdatenexport_([0-9]{8})_([0-9]{2}\.[0-9])\.zip" - ) + # We have in principle two ways of finding the URLs in the HTML of the MaStR Datendownload page: + # 1. Depend on some parts of the HTML structure, identify the tags and get the href attributes and somehow the date. + # 2. Depend on the URL structure and search for them using regex. Pull the date from the URL. + # We guess that the URL structure is more stable than the HTML structure, so we implement approach 2. 
- # Find all current export links - current_matches = pattern_current.findall(html) - current_links = [ - { - "url": f"https://download.marktstammdatenregister.de/Gesamtdatenexport_{date}_{version}.zip", - "date": date, - "version": version, - "type": "current", - } - for date, version in current_matches - ] + current_link = _find_current_download_link(html) + stichtag_links = _find_stichtag_download_links(html) + + # Combine and sort by date (newest first) + all_links = [current_link] + stichtag_links + all_links.sort(key=lambda x: (x["date"], x["version"]), reverse=True) - # Find all historical export links - stichtag_matches = pattern_stichtag.findall(html) - stichtag_links = [ - { - "url": f"https://download.marktstammdatenregister.de/Stichtag/Gesamtdatenexport_{date}_{version}.zip", + log.info(f"Found {len(all_links)} available download links") + return all_links + + +def _find_current_download_link(html: str) -> dict[str, Optional[str]]: + pattern_current_xml = r"https://download\.marktstammdatenregister\.de/Gesamtdatenexport_(?P[0-9]{8})_(?P[0-9]{2}\.[0-9])\.zip" + match_xml = re.search(pattern_current_xml, html) + if not match_xml: + log.error("Found no link for the current XML download in MaStR download list HTML") + return {} + link = { + "url": match_xml.group(), + "docs_url": None, + "date": match_xml.group("date"), + "version": match_xml.group("version"), + "type": "current", + } + + mastr_origin = "https://www.marktstammdatenregister.de" + # The URL origin is actually omitted in the HTML. We make it work with and without it in case BNetzA adds the origin in the link. 
+ pattern_current_docs = rf"(?P{re.escape(mastr_origin)})?/MaStRHilfe/files/gesamtdatenexport/Dokumentation%20MaStR%20Gesamtdatenexport.zip" + if match_docs := re.search(pattern_current_docs, html): + matched_string = match_docs.group() + link["docs_url"] = matched_string if match_docs.group("origin") else mastr_origin + matched_string + else: + log.error("Found no link for the current docs download in MaStR download list HTML") + + return link + + +def _find_stichtag_download_links(html: str) -> list[dict[str, Optional[str]]]: + pattern_stichtag_xml = r"https://download\.marktstammdatenregister\.de/Stichtag/Gesamtdatenexport_(?P[0-9]{8})_(?P[0-9]{2}\.[0-9])\.zip" + date_to_links = defaultdict(list) + for match_xml in re.finditer(pattern_stichtag_xml, html): + date = match_xml.group("date") + link = { + "url": match_xml.group(), + "docs_url": None, "date": date, - "version": version, + "version": match_xml.group("version"), "type": "stichtag", } - for date, version in stichtag_matches - ] + date_to_links[date].append(link) - # Combine and sort by date (newest first) - all_links = current_links + stichtag_links - all_links.sort(key=lambda x: x["date"], reverse=True) + pattern_stichtag_docs = ( + r"https://download\.marktstammdatenregister\.de/Stichtag/" + r"Dokumentation(?:%20| )MaStR(?:%20| )Gesamtdatenexport(?:%20| )" + r"(?P[0-9]{2})-(?P[0-9]{2})-(?P[0-9]{4}).zip" + ) + for match_docs in re.finditer(pattern_stichtag_docs, html): + # When there are two XML downloads with different versions for the same day, + # there is still (strangely) only one docs download. So we assign it to multiple links. 
+ date = match_docs.group("year") + match_docs.group("month") + match_docs.group("day") + if links := date_to_links.get(date): + for link in links: + link["docs_url"] = match_docs.group() + else: + log.error( + f"Found a docs download link for {date} in MaStR download list HTML" + ", but not a corresponding XML download link" + ) - log.info(f"Found {len(all_links)} available download links") - return all_links + links = [link for links in date_to_links.values() for link in links] + return links def list_available_downloads(): @@ -483,17 +500,20 @@ def list_available_downloads(): print("No download links found. Please check your internet connection.") return [] + url_pad = max(len(link["url"]) for link in links) + 2 + print("\n" + "=" * 80) print("AVAILABLE MAStR DOWNLOADS") print("=" * 80) - print(f"{'#':<4} {'Date':<12} {'Version':<10} {'Type':<12} {'URL'}") + print(f"{'#':<4} {'Date':<12} {'Version':<10} {'Type':<12} {'XML URL':<{url_pad}} Docs URL") print("-" * 80) for i, link in enumerate(links, 1): # Format date for better readability date_formatted = f"{link['date'][:4]}-{link['date'][4:6]}-{link['date'][6:]}" print( - f"{i:<4} {date_formatted:<12} {link['version']:<10} {link['type']:<12} {link['url']}" + f"{i:<4} {date_formatted:<12} {link['version']:<10}" + f" {link['type']:<12} {link['url']:<{url_pad}} {link['docs_url']}" ) print("=" * 80) @@ -503,7 +523,7 @@ def list_available_downloads(): return links -def select_download_date(): +def select_download_date() -> Optional[dict[str, Optional[str]]]: """ Interactive function to let the user select a download date. @@ -511,14 +531,13 @@ def select_download_date(): Returns ------- - tuple - (date_string, url) where date_string is in YYYYMMDD format and url is the download URL - Returns (None, None) if user cancels or no valid selection is made + list of dict + List of available downloads with formatted dates and versions. 
""" links = list_available_downloads() if not links: - return None, None + return None print("\nOptions:") print("1. Select from the list above (enter the number)") @@ -537,7 +556,7 @@ def select_download_date(): print( f"\nSelected: {selected['date']} (Version {selected['version']}, Type: {selected['type']})" ) - return selected["date"], selected["url"] + return selected else: print(f"Please enter a number between 1 and {len(links)}") except ValueError: @@ -545,7 +564,85 @@ def select_download_date(): elif choice == "2": print("Download selection cancelled.") - return None, None + return None else: print("Invalid choice. Please enter 1, or 2.") + + +def get_date_from_docs_url(url: str) -> Optional[str]: + if m := re.search(r"(?P[0-9]{2})-(?P[0-9]{2})-(?P[0-9]{4})", url): + return f"{m.group('year')}{m.group('month')}{m.group('day')}" + return None + + +def gen_docs_download_urls( + bulk_date_string: Optional[str] = None, + url: Optional[str] = None +) -> tuple[str, Optional[str]]: + newest_url = "https://www.marktstammdatenregister.de/MaStRHilfe/files/gesamtdatenexport/Dokumentation%20MaStR%20Gesamtdatenexport.zip" + if url: + preferred_url = url + fallback_url = None + else: + if bulk_date_string: + dt = datetime.strptime(bulk_date_string, "%Y%m%d") + stichtag_url = ( + "https://download.marktstammdatenregister.de/Stichtag/" + "Dokumentation%20MaStR%20Gesamtdatenexport%20" + f"{dt.day:0>2}-{dt.month:0>2}-{dt.year:0>4}.zip" + ) + if dt.date() == date.today(): + preferred_url = newest_url + fallback_url = stichtag_url + else: + preferred_url = stichtag_url + fallback_url = newest_url + else: + preferred_url = newest_url + fallback_url = None + + return preferred_url, fallback_url + + +def download_documentation( + save_path: str, + bulk_date_string: Optional[str] = None, + url: Optional[str] = None, +) -> None: + """Downloads the zipped MaStR. 
+ + Parameters + ----------- + save_path: str + Full file path where the downloaded MaStR documentation zip file will be saved. + """ + if os.path.exists(save_path): + log.info( + "MaStR docs file already present and has all info. Not downloading again." + f" Existing file: {save_path}" + ) + return + + log.info("Starting MaStR documentation download from marktstammdatenregister.de.") + preferred_url, fallback_url = gen_docs_download_urls(bulk_date_string, url) + + time_a = time.perf_counter() + log.info(f"Downloading MaStR documentation from {preferred_url}") + r = requests.get(preferred_url, headers={"User-Agent": USER_AGENT}) + if r.status_code == 404: + log.warning( + "MaStR documentation download file was not found." + f" Trying to download from {fallback_url}" + ) + r = requests.get(fallback_url, headers={"User-Agent": USER_AGENT}) + + r.raise_for_status() + with open(save_path, "wb") as zfile: + zfile.write(r.content) + + time_b = time.perf_counter() + log.info( + f"MaStR documentation was successfully downloaded to {save_path!r}." + f" It took {round(time_b - time_a)} seconds." 
+ ) diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index 3cff7a43..7f143752 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -1,4 +1,5 @@ import os +from collections.abc import Collection, Mapping from concurrent.futures import ProcessPoolExecutor, wait from io import StringIO from multiprocessing import cpu_count @@ -10,13 +11,17 @@ import numpy as np import pandas as pd import sqlalchemy -from sqlalchemy import select, create_engine, inspect +from sqlalchemy import Column, Engine, Table, delete, select, create_engine from sqlalchemy.sql import text from sqlalchemy.sql.sqltypes import Date, DateTime from open_mastr.utils.config import setup_logger from open_mastr.utils.helpers import data_to_include_tables -from open_mastr.utils.orm import tablename_mapping +from open_mastr.utils.xsd_tables import ( + normalize_mastr_name, + translate_mastr_column_name, +) +from open_mastr.utils.sqlalchemy_tables import CatalogInteger, CatalogString from open_mastr.xml_download.utils_cleansing_bulk import cleanse_bulk_data log = setup_logger() @@ -28,12 +33,18 @@ def write_mastr_xml_to_database( data: list, bulk_cleansing: bool, bulk_download_date: str, + mastr_table_to_db_table: Mapping[str, Table], + alter_database_tables: bool, ) -> None: """Write the Mastr in xml format into a database defined by the engine parameter.""" log.info("Starting bulk download...") - include_tables = data_to_include_tables(data, mapping="write_xml") + include_tables = data_to_include_tables(data) threads_data = [] + lower_mastr_table_to_db_table = { + db_table.info.get("original_name", mastr_table_name).lower(): db_table + for mastr_table_name, db_table in mastr_table_to_db_table.items() + } with ZipFile(zipped_xml_file_path, "r") as f: files_list = correct_ordering_of_filelist(f.namelist()) @@ -41,20 +52,26 @@ def write_mastr_xml_to_database( for file_name 
in files_list: xml_table_name = extract_xml_table_name(file_name) - if not is_table_relevant(xml_table_name, include_tables): + if xml_table_name not in include_tables: + continue + + db_table = lower_mastr_table_to_db_table.get(xml_table_name) + if db_table is None: + log.warning( + f"Skipping MaStR file {file_name!r} because no database table was found for {xml_table_name=}" + ) continue - sql_table_name = extract_sql_table_name(xml_table_name) threads_data.append( ( file_name, - xml_table_name, - sql_table_name, + db_table, str(engine.url), engine.url.password, zipped_xml_file_path, bulk_download_date, bulk_cleansing, + alter_database_tables, ) ) @@ -100,13 +117,13 @@ def get_number_of_processes(): def process_xml_file( file_name: str, - xml_table_name: str, - sql_table_name: str, + db_table: Table, connection_url: str, password: str, zipped_xml_file_path: str, bulk_download_date: str, bulk_cleansing: bool, + alter_database_tables: bool, ) -> None: """Process a single xml file and write it to the database.""" try: @@ -122,26 +139,90 @@ def process_xml_file( with ZipFile(zipped_xml_file_path, "r") as f: log.info(f"Processing file '{file_name}'...") if is_first_file(file_name): - log.info(f"Creating table '{sql_table_name}'...") - create_database_table(engine, xml_table_name) + delete_all_existing_rows(db_table=db_table, engine=engine) df = read_xml_file(f, file_name) df = process_table_before_insertion( - df, - xml_table_name, - zipped_xml_file_path, - bulk_download_date, - bulk_cleansing, + df=df, + db_table=db_table, + zipped_xml_file_path=zipped_xml_file_path, + bulk_download_date=bulk_download_date, + bulk_cleansing=bulk_cleansing, + ) + df = check_for_column_mismatch_and_try_to_solve_it( + df=df, + db_table=db_table, + engine=engine, + alter_database_tables=alter_database_tables, ) if engine.dialect.name == "sqlite": - add_table_to_sqlite_database(df, xml_table_name, sql_table_name, engine) + add_table_to_sqlite_database( + df=df, + db_table=db_table, + 
engine=engine,
+            )
        else:
            add_table_to_non_sqlite_database(
-                df, xml_table_name, sql_table_name, engine
+                df=df,
+                db_table=db_table,
+                engine=engine,
            )
    except Exception as e:
        log.error(f"Error processing file '{file_name}': '{e}'")
 
 
+def delete_all_existing_rows(db_table: Table, engine: Engine) -> None:
+    with engine.begin() as con:
+        con.execute(delete(db_table))
+
+
+def check_for_column_mismatch_and_try_to_solve_it(
+    df: pd.DataFrame,
+    db_table: Table,
+    engine: Engine,
+    alter_database_tables: bool,
+) -> pd.DataFrame:
+    df_column_names = set(df.columns)
+    db_column_names = {column.name for column in db_table.columns}
+
+    if additional_db_column_names := db_column_names - df_column_names:
+        # Many columns are optional and it's perfectly normal to have an XML file / a dataframe that doesn't have
+        # a column that is present in the database. So this is only worth a debug message.
+        log.debug(
+            f"Database table {db_table.name} has some columns that weren't found in the XML file."
+            f" Proceeding and trying to insert anyway. Additional DB columns:"
+            f" {', '.join(additional_db_column_names)}"
+        )
+
+    if additional_df_column_names := df_column_names - db_column_names:
+        if alter_database_tables:
+            log.warning(
+                f"XML file has some columns that aren't present in the database table {db_table.name}."
+                f" Trying to add the columns to the table. Additional XML columns:"
+                f" {', '.join(additional_df_column_names)}"
+            )
+            # TODO: What if we can add some columns and not others? We should then return the columns for which we succeeded.
+            try:
+                add_missing_columns_to_table(
+                    engine=engine,
+                    db_table=db_table,
+                    missing_columns=additional_df_column_names,
+                )
+            except Exception:
+                log.exception(
+                    "Could not add at least some columns to the database. Ignoring the columns from the XML file instead."
+                )
+                df = df.drop(columns=additional_df_column_names)
+        else:
+            log.warning(
+                f"XML file has some columns that aren't present in the database table {db_table.name}." 
+ f" Ignoring those columns since you asked not to alter tables. Additional XML columns:" + f" {', '.join(additional_df_column_names)}" + ) + df = df.drop(columns=additional_df_column_names) + + return df + + def create_efficient_engine(connection_url: str) -> sqlalchemy.engine.Engine: """Create an efficient engine for the SQLite database.""" is_sqlite = connection_url.startswith("sqlite://") @@ -211,40 +292,6 @@ def extract_xml_table_name(file_name: str) -> str: return file_name.split("_")[0].split(".")[0].lower() -def extract_sql_table_name(xml_table_name: str) -> str: - """Extract the SQL table name from the xml table name.""" - return tablename_mapping[xml_table_name]["__name__"] - - -def is_table_relevant(xml_table_name: str, include_tables: list) -> bool: - """Checks if the table contains relevant data and if the user wants to - have it in the database.""" - # few tables are only needed for data cleansing of the xml files and contain no - # information of relevance - try: - boolean_write_table_to_sql_database = ( - tablename_mapping[xml_table_name]["__class__"] is not None - ) - except KeyError: - log.warning( - f"Table '{xml_table_name}' is not supported by your open-mastr version and " - f"will be skipped." 
- ) - return False - # check if the table should be written to sql database (depends on user input) - include_count = include_tables.count(xml_table_name) - - return include_count == 1 and boolean_write_table_to_sql_database - - -def create_database_table( - engine: sqlalchemy.engine.Engine, xml_table_name: str -) -> None: - orm_class = tablename_mapping[xml_table_name]["__class__"] - orm_class.__table__.drop(engine, checkfirst=True) - orm_class.__table__.create(engine) - - def is_first_file(file_name: str) -> bool: """check if the file name indicates that it is the first file from the table""" return ( @@ -253,47 +300,35 @@ def is_first_file(file_name: str) -> bool: ) -def cast_date_columns_to_datetime( - xml_table_name: str, df: pd.DataFrame -) -> pd.DataFrame: - sqlalchemy_columnlist = tablename_mapping[xml_table_name][ - "__class__" - ].__table__.columns.items() - for column in sqlalchemy_columnlist: - column_name = column[0] - if is_date_column(column, df): +def cast_date_columns_to_datetime(db_table: Table, df: pd.DataFrame) -> pd.DataFrame: + for column in db_table.columns: + if is_date_column(column) and column.name in df.columns: # Convert column to datetime64, invalid string -> NaT - df[column_name] = pd.to_datetime(df[column_name], errors="coerce") + df[column.name] = pd.to_datetime(df[column.name], errors="coerce") return df -def cast_date_columns_to_string(xml_table_name: str, df: pd.DataFrame) -> pd.DataFrame: - column_list = tablename_mapping[xml_table_name][ - "__class__" - ].__table__.columns.items() - for column in column_list: - column_name = column[0] - - if not (column[0] in df.columns and is_date_column(column, df)): +def cast_date_columns_to_string(db_table: Table, df: pd.DataFrame) -> pd.DataFrame: + for column in db_table.columns: + if not is_date_column(column) or column.name not in df.columns: continue - df[column_name] = pd.to_datetime(df[column_name], errors="coerce") + df[column.name] = pd.to_datetime(df[column.name], errors="coerce") 
- if type(column[1].type) is Date: - mask = df[column_name].notna() - df[column_name] = df[column_name].dt.strftime("%Y-%m-%d") - df.loc[mask, column_name] = df.loc[mask, column_name].str.zfill(10) - df[column_name] = df[column_name].replace("NaT", None) - - elif type(column[1].type) is DateTime: - df[column_name] = ( - df[column_name].dt.strftime("%Y-%m-%d %H:%M:%S.%f").replace("NaT", None) + if type(column.type) is Date: + mask = df[column.name].notna() + df[column.name] = df[column.name].dt.strftime("%Y-%m-%d") + df.loc[mask, column.name] = df.loc[mask, column.name].str.zfill(10) + df[column.name] = df[column.name].replace("NaT", None) + elif type(column.type) is DateTime: + df[column.name] = ( + df[column.name].dt.strftime("%Y-%m-%d %H:%M:%S.%f").replace("NaT", None) ) return df -def is_date_column(column, df: pd.DataFrame) -> bool: - return type(column[1].type) in [Date, DateTime] and column[0] in df.columns +def is_date_column(column: Column) -> bool: + return type(column.type) in [Date, DateTime] def correct_ordering_of_filelist(files_list: list) -> list: @@ -331,46 +366,27 @@ def read_xml_file(f: ZipFile, file_name: str) -> pd.DataFrame: return handle_xml_syntax_error(xml_file.read().decode("utf-16"), error) -def change_column_names_to_orm_format( - df: pd.DataFrame, xml_table_name: str -) -> pd.DataFrame: - if tablename_mapping[xml_table_name]["replace_column_names"]: - df.rename( - columns=tablename_mapping[xml_table_name]["replace_column_names"], - inplace=True, - ) - return df - - def add_table_to_non_sqlite_database( df: pd.DataFrame, - xml_table_name: str, - sql_table_name: str, + db_table: Table, engine: sqlalchemy.engine.Engine, ) -> None: # get a dictionary for the data types - table_columns_list = list( - tablename_mapping[xml_table_name]["__class__"].__table__.columns - ) dtypes_for_writing_sql = { column.name: column.type - for column in table_columns_list + for column in db_table.columns if column.name in df.columns } # Convert date and 
datetime columns into the datatype datetime. - df = cast_date_columns_to_datetime(xml_table_name, df) - - add_missing_columns_to_table( - engine, xml_table_name, column_list=df.columns.tolist() - ) + df = cast_date_columns_to_datetime(db_table, df) for _ in range(10000): try: with engine.connect() as con: with con.begin(): df.to_sql( - sql_table_name, + db_table.name, con=con, index=False, if_exists="append", @@ -383,9 +399,9 @@ def add_table_to_non_sqlite_database( except sqlalchemy.exc.IntegrityError: # error resulting from Unique constraint failed - df = write_single_entries_until_not_unique_comes_up( - df, xml_table_name, engine - ) + # FIXME: This error can also indicate other problems than non-unique. + # We should differentiate more and show it to the user for cases we cannot solve. + df = write_single_entries_until_not_unique_comes_up(df, db_table, engine) def add_zero_as_first_character_for_too_short_string(df: pd.DataFrame) -> pd.DataFrame: @@ -421,7 +437,7 @@ def add_zero_as_first_character_for_too_short_string(df: pd.DataFrame) -> pd.Dat def write_single_entries_until_not_unique_comes_up( - df: pd.DataFrame, xml_table_name: str, engine: sqlalchemy.engine.Engine + df: pd.DataFrame, db_table: Table, engine: sqlalchemy.engine.Engine ) -> pd.DataFrame: """ Remove from dataframe these rows, which are already existing in the database table @@ -435,15 +451,14 @@ def write_single_entries_until_not_unique_comes_up( ------- Filtered dataframe """ + # TODO: Check if we need to support composite primary keys for the MaStR changes table. + # Because this here assumes single-column primary keys. 
+ primary_key = next(c for c in db_table.columns if c.primary_key) - table = tablename_mapping[xml_table_name]["__class__"].__table__ - primary_key = next(c for c in table.columns if c.primary_key) - - with engine.connect() as con: - with con.begin(): - key_list = ( - pd.read_sql(sql=select(primary_key), con=con).values.squeeze().tolist() - ) + with engine.begin() as con: + key_list = ( + pd.read_sql(sql=select(primary_key), con=con).values.squeeze().tolist() + ) len_df_before = len(df) df = df.drop_duplicates( @@ -462,8 +477,8 @@ def write_single_entries_until_not_unique_comes_up( def add_missing_columns_to_table( engine: sqlalchemy.engine.Engine, - xml_table_name: str, - column_list: list, + db_table: Table, + missing_columns: Collection[str], ) -> None: """ Some files introduce new columns for existing tables. @@ -479,36 +494,24 @@ def add_missing_columns_to_table( ------- """ - log = setup_logger() - - # get the columns name from the existing database - inspector = sqlalchemy.inspect(engine) - table_name = tablename_mapping[xml_table_name]["__class__"].__table__.name - columns = inspector.get_columns(table_name) - column_names_from_database = [column["name"] for column in columns] - - missing_columns = set(column_list) - set(column_names_from_database) - + table_name = db_table.name for column_name in missing_columns: - if not column_exists(engine, table_name, column_name): - alter_query = 'ALTER TABLE %s ADD "%s" VARCHAR NULL;' % ( - table_name, - column_name, - ) - try: - with engine.connect().execution_options(autocommit=True) as con: - with con.begin(): - con.execute( - text(alter_query).execution_options(autocommit=True) - ) - except sqlalchemy.exc.OperationalError as err: - # If the column already exists, we can ignore the error. 
- if "duplicate column name" not in str(err): - raise err - log.info( - "From the downloaded xml files following new attribute was " - f"introduced: {table_name}.{column_name}" - ) + alter_query = 'ALTER TABLE %s ADD "%s" VARCHAR NULL;' % ( + table_name, + column_name, + ) + try: + with engine.connect().execution_options(autocommit=True) as con: + with con.begin(): + con.execute(text(alter_query).execution_options(autocommit=True)) + except sqlalchemy.exc.OperationalError as err: + # If the column already exists, we can ignore the error. + if "duplicate column name" not in str(err): + raise err + log.info( + f"Added the following columns to database table {table_name}:" + f" {', '.join(missing_columns)}" + ) def delete_wrong_xml_entry(err: Error, df: pd.DataFrame) -> pd.DataFrame: @@ -564,61 +567,86 @@ def find_nearest_brackets(xml_string: str, position: int) -> tuple[int, int]: def process_table_before_insertion( df: pd.DataFrame, - xml_table_name: str, + db_table: Table, zipped_xml_file_path: str, bulk_download_date: str, bulk_cleansing: bool, ) -> pd.DataFrame: df = add_zero_as_first_character_for_too_short_string(df) - df = change_column_names_to_orm_format(df, xml_table_name) # Add Column that refers to the source of the data df["DatenQuelle"] = "bulk" df["DatumDownload"] = bulk_download_date + df = align_df_column_names_to_db_column_names(df=df, db_table=db_table) + if bulk_cleansing: - df = cleanse_bulk_data(df, zipped_xml_file_path) + catalog_columns = { + column.name + for column in db_table.columns + if isinstance(column.type, (CatalogInteger, CatalogString)) + } + df = cleanse_bulk_data( + df=df, + catalog_columns=catalog_columns, + zipped_xml_file_path=zipped_xml_file_path, + ) return df +def align_df_column_names_to_db_column_names( + df: pd.DataFrame, + db_table: Table, +) -> pd.DataFrame: + old_column_name_to_new_column_name = { + column_name: normalize_mastr_name(column_name) for column_name in df.columns + } + if db_table.name == 
db_table.info.get("english_name"): + # Database is in English. We must translate the df columns + english_updates = {} + for ( + old_column_name, + normalized_column_name, + ) in old_column_name_to_new_column_name.items(): + if english_column_name := translate_mastr_column_name( + normalized_column_name + ): + english_updates[old_column_name] = english_column_name + old_column_name_to_new_column_name.update(english_updates) + + renamed_df = df.rename(columns=old_column_name_to_new_column_name) + return renamed_df + + def add_table_to_sqlite_database( df: pd.DataFrame, - xml_table_name: str, - sql_table_name: str, + db_table: Table, engine: sqlalchemy.engine.Engine, ) -> None: column_list = df.columns.tolist() - add_missing_columns_to_table(engine, xml_table_name, column_list) # Convert NaNs to None. df = df.where(pd.notnull(df), None) # Convert date columns to strings. Dates are not supported directly by SQLite. - df = cast_date_columns_to_string(xml_table_name, df) + df = cast_date_columns_to_string(db_table, df) # Create SQL statement for bulk insert. ON CONFLICT DO NOTHING prevents duplicates. - insert_stmt = f"INSERT INTO {sql_table_name} ({','.join(column_list)}) VALUES ({','.join(['?' for _ in column_list])}) ON CONFLICT DO NOTHING" + insert_stmt = f"INSERT INTO {db_table.name} ({','.join(column_list)}) VALUES ({','.join(['?' for _ in column_list])}) ON CONFLICT DO NOTHING" for _ in range(10000): try: - with engine.connect() as con: - with con.begin(): - con.connection.executemany(insert_stmt, df.to_numpy()) - break + with engine.begin() as con: + con.connection.executemany(insert_stmt, df.to_numpy()) + break except sqlalchemy.exc.DataError as err: delete_wrong_xml_entry(err, df) except sqlalchemy.exc.IntegrityError: # error resulting from Unique constraint failed - df = write_single_entries_until_not_unique_comes_up( - df, xml_table_name, engine - ) - except: + # FIXME: This error can also indicate other problems than non-unique. 
+ # We should differentiate more and show it to the user for cases we cannot solve. + df = write_single_entries_until_not_unique_comes_up(df, db_table, engine) + except Exception: # If any unexpected error occurs, we'll switch back to the non-SQLite method. - add_table_to_non_sqlite_database(df, xml_table_name, sql_table_name, engine) + add_table_to_non_sqlite_database(df, db_table, engine) break - - -def column_exists(engine, table_name, column_name): - inspector = inspect(engine) - columns = [col["name"] for col in inspector.get_columns(table_name)] - return column_name in columns diff --git a/pyproject.toml b/pyproject.toml index f5550d09..69e6b25f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ dependencies = [ "keyring", "pyyaml", "xmltodict", + "xmlschema", ] requires-python = ">=3.9, <4" @@ -26,6 +27,7 @@ authors = [ {name = "Kotthoff Florian"}, {name = "Tepe Deniz"}, {name = "Amme Jonathan"}, + {name = "Simon Will"}, {name = "Open Energy Family"}, ] @@ -60,6 +62,8 @@ dev = [ "mkdocs-material", "mkdocs-include-markdown-plugin", "black", + "responses", + "pytest-responses", ] [project.urls] diff --git a/tests/conftest.py b/tests/conftest.py index eb5ce0fa..9d1d833e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,47 +1,83 @@ -""" -The conftest.py file serves as a means of providing fixtures for an entire directory. -Fixtures defined in a conftest.py can be used by any test in that package without -needing to import them (pytest will automatically discover them). - -You can have multiple nested directories/packages containing your tests, -and each directory can have its own conftest.py with its own fixtures, -adding on to the ones provided by the conftest.py files in parent directories. 
- -https://docs.pytest.org/en/7.2.x/reference/fixtures.html -""" +import os +import shutil +from datetime import date +from pathlib import Path +from typing import Optional import pytest -from open_mastr import Mastr +from open_mastr import Mastr from open_mastr.utils.config import get_project_home_dir -from open_mastr.utils.helpers import create_database_engine -import os -@pytest.fixture(scope="function") -def make_Mastr_class(): - """ - Factory to create different Mastr class objects. +_data_dir = Path(get_project_home_dir()) / "data" - Parameters - ---------- - engine_type: str - Define type of engine, for details see - :meth: `~.open_mastr.utils.helpers.create_database_engine` +EXISTING_XML_ZIP: Optional[Path] = None +_xml_data_dir = _data_dir / "xml_download" +if _xml_data_dir.is_dir(): + for entry in os.scandir(_xml_data_dir): + if "Gesamtdatenexport" in entry.name and entry.name.endswith(".zip"): + EXISTING_XML_ZIP = Path(entry.path) + # We don't break here in case there are multiple files. + # We take the last matching entry in the hope of getting the most recent file. - Returns - ------- - Mastr class object - """ +EXISTING_DOCS_ZIP: Optional[Path] = None +_docs_data_dir = _data_dir / "docs_download" +if _docs_data_dir.is_dir(): + for entry in os.scandir(_docs_data_dir): + if "Dokumentation MaStR Gesamtdatenexport" in entry.name and entry.name.endswith(".zip"): + EXISTING_DOCS_ZIP = Path(entry.path) + # We don't break here in case there are multiple files. + # We take the last matching entry in the hope of getting the most recent file. - def _make_Mastr_class(engine_type): - return Mastr(engine=engine_type) - return _make_Mastr_class +@pytest.fixture +def output_dir(tmp_path: Path) -> Path: + dir_ = tmp_path / "output" + yield dir_ + # Clean up so we don't leave this huge file lying around in some tmp dir. 
+ shutil.rmtree(dir_) @pytest.fixture -def engine(): - return create_database_engine( - "sqlite", os.path.join(get_project_home_dir(), "data", "sqlite") - ) +def mastr(output_dir: Path) -> Mastr: + return Mastr(output_dir=str(output_dir)) + + +@pytest.fixture +def existing_xml_zip_in_output_dir(output_dir: Path) -> Path: + if not EXISTING_XML_ZIP: + raise ValueError( + "There is no existing XML ZIP file to copy to the output dir." + " This indicates faulty test setup. This fixture must only be used" + " when EXISTING_XML_ZIP is not None." + ) + + xml_dir = output_dir / "data" / "xml_download" + xml_dir.mkdir(parents=True, exist_ok=True) + # We pretend that this file is from "today". + dest_path = xml_dir / f"Gesamtdatenexport_{date.today().strftime('%Y%m%d')}.zip" + + # The XML file is pretty large, making this copy operation a bit costly for a unit test. + # So, use this fixture sparingly. + # Would be nice to have a hard link with copy-on-write semantics. Is there such a thing? + shutil.copy(EXISTING_XML_ZIP, dest_path) + return dest_path + + +@pytest.fixture +def existing_docs_zip_in_output_dir(output_dir: Path) -> Path: + if not EXISTING_DOCS_ZIP: + raise ValueError( + "There is no existing docs ZIP file to copy to the output dir." + " This indicates faulty test setup. This fixture must only be used" + " when EXISTING_DOCS_ZIP is not None." + ) + + docs_dir = output_dir / "data" / "docs_download" + docs_dir.mkdir(parents=True, exist_ok=True) + # We pretend that this file is from "today". 
+ dest_path = docs_dir / f"Dokumentation MaStR Gesamtdatenexport_{date.today().strftime('%Y%m%d')}.zip" + + shutil.copy(EXISTING_DOCS_ZIP, dest_path) + return dest_path diff --git a/tests/test_helpers.py b/tests/test_helpers.py index df9de681..88124c4f 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,48 +1,22 @@ import pytest import os -from os.path import expanduser +from pathlib import Path import itertools -import random -from os.path import join -import pandas as pd from open_mastr import Mastr from zipfile import ZipFile -from open_mastr.utils import orm -from open_mastr.utils.constants import TECHNOLOGIES, ADDITIONAL_TABLES, BULK_DATA -from open_mastr.utils.config import get_data_version_dir, create_data_dir +from open_mastr.utils.constants import BULK_DATA from open_mastr.utils.helpers import ( validate_parameter_format_for_download_method, validate_parameter_format_for_mastr_init, transform_data_parameter, data_to_include_tables, - session_scope, - create_db_query, - db_query_to_csv, - reverse_unit_type_map, delete_zip_file_if_corrupted, ) -# Check if db is empty -_db_exists = False -_db_folder_path = os.path.join( - expanduser("~"), ".open-MaStR", "data", "sqlite" -) # FIXME: use path in tmpdir when implemented -if os.path.isdir(_db_folder_path): - for entry in os.scandir(path=_db_folder_path): - _db_path = os.path.join(_db_folder_path, "open-mastr.db") - if os.path.getsize(_db_path) > 1000000: # empty db = 327.7kB < 1 MB - _db_exists = True - - -@pytest.fixture -def db(): - return Mastr() - - def test_Mastr_validate_working_parameter(): valid_params = { "method": ["bulk"], @@ -119,8 +93,8 @@ def test_Mastr_validate_not_working_parameter(): ) -def test_validate_parameter_format_for_mastr_init(db): - engine_list_working = ["sqlite", db.engine] +def test_validate_parameter_format_for_mastr_init(mastr): + engine_list_working = ["sqlite", mastr.engine] engine_list_failing = ["HI", 12] for engine in engine_list_working: @@ -145,40 +119,17 @@ 
def test_transform_data_parameter(): def test_data_to_include_tables(): # Prepare - include_tables_list = [ + include_tables_list = { "anlageneegwind", "einheitenwind", "anlageneegwasser", "einheitenwasser", - ] - include_tables_str = ["einheitenstromverbraucher"] - - map_to_db_table_list = ["market_actors", "market_actors_and_roles"] - map_to_db_table_str = ["locations_extended"] + } + include_tables_str = {"einheitenstromverbraucher"} # Assert - assert include_tables_list == data_to_include_tables( - data=["wind", "hydro"], mapping="write_xml" - ) - assert include_tables_str == data_to_include_tables( - data=["electricity_consumer"], mapping="write_xml" - ) - assert map_to_db_table_list == data_to_include_tables( - data=["market"], mapping="export_db_tables" - ) - assert map_to_db_table_str == data_to_include_tables( - data=["location"], mapping="export_db_tables" - ) - - -def test_data_to_include_tables_error(): - # test for non-existent 'mapping' parameter input - with pytest.raises( - NotImplementedError, - match="This function is only implemented for 'write_xml' and 'export_db_tables'," - " please specify when calling the function.", - ): - data_to_include_tables(data=["wind", "hydro"], mapping="X32J_22") + assert include_tables_list == data_to_include_tables(data={"wind", "hydro"}) + assert include_tables_str == data_to_include_tables(data={"electricity_consumer"}) def test_delete_zip_file_if_corrupted(): diff --git a/tests/test_interactive_download.py b/tests/test_interactive_download.py index 0fd6b2e0..b7bf8511 100644 --- a/tests/test_interactive_download.py +++ b/tests/test_interactive_download.py @@ -1,4 +1,3 @@ -import pytest from unittest.mock import patch, MagicMock from open_mastr.xml_download.utils_download_bulk import ( get_available_download_links, @@ -11,9 +10,14 @@ SAMPLE_HTML = """ - - - + + + + + + + + """ @@ -21,21 +25,31 @@ # Sample download links for mocking SAMPLE_LINKS = [ { - "url": 
"https://download.marktstammdatenregister.de/Gesamtdatenexport_20250103_24.2.zip", - "date": "20250103", - "version": "24.2", + "url": "https://download.marktstammdatenregister.de/Gesamtdatenexport_20260103_25.2.zip", + "docs_url": "https://www.marktstammdatenregister.de/MaStRHilfe/files/gesamtdatenexport/Dokumentation%20MaStR%20Gesamtdatenexport.zip", + "date": "20260103", + "version": "25.2", "type": "current", }, { - "url": "https://download.marktstammdatenregister.de/Gesamtdatenexport_20241231_24.2.zip", - "date": "20241231", - "version": "24.2", - "type": "current", + "url": "https://download.marktstammdatenregister.de/Stichtag/Gesamtdatenexport_20260101_25.2.zip", + "docs_url": "https://download.marktstammdatenregister.de/Stichtag/Dokumentation%20MaStR%20Gesamtdatenexport%2001-01-2026.zip", + "date": "20260101", + "version": "25.2", + "type": "stichtag", + }, + { + "url": "https://download.marktstammdatenregister.de/Stichtag/Gesamtdatenexport_20251001_25.2.zip", + "docs_url": "https://download.marktstammdatenregister.de/Stichtag/Dokumentation%20MaStR%20Gesamtdatenexport%2001-10-2025.zip", + "date": "20251001", + "version": "25.2", + "type": "stichtag", }, { - "url": "https://download.marktstammdatenregister.de/Stichtag/Gesamtdatenexport_20241130_24.1.zip", - "date": "20241130", - "version": "24.1", + "url": "https://download.marktstammdatenregister.de/Stichtag/Gesamtdatenexport_20251001_25.1.zip", + "docs_url": "https://download.marktstammdatenregister.de/Stichtag/Dokumentation%20MaStR%20Gesamtdatenexport%2001-10-2025.zip", + "date": "20251001", + "version": "25.1", "type": "stichtag", }, ] @@ -50,14 +64,7 @@ def test_get_available_download_links(mock_urlopen): mock_urlopen.return_value = mock_response links = get_available_download_links() - - assert len(links) == 3 - assert links[0]["date"] == "20250103" - assert links[0]["version"] == "24.2" - assert links[0]["type"] == "current" - assert links[2]["date"] == "20241130" - assert links[2]["version"] == 
"24.1" - assert links[2]["type"] == "stichtag" + assert links == SAMPLE_LINKS @patch("open_mastr.xml_download.utils_download_bulk.get_available_download_links") @@ -73,8 +80,10 @@ def test_list_available_downloads(mock_print, mock_get_links): mock_print.assert_any_call("=" * 80) mock_print.assert_any_call("AVAILABLE MAStR DOWNLOADS") mock_print.assert_any_call( - f"{'#':<4} {'Date':<12} {'Version':<10} {'Type':<12} {'URL'}" + "# Date Version Type XML URL" + " Docs URL" ) + mock_print.assert_any_call("Total: 4 downloads available") @patch("open_mastr.xml_download.utils_download_bulk.list_available_downloads") @@ -84,9 +93,8 @@ def test_select_download_date_valid_selection(mock_list_downloads): # Simulate user choosing option 1, then selecting the 2nd item with patch("builtins.input", side_effect=["1", "2"]): - date, url = select_download_date() - assert date == "20241231" - assert url == SAMPLE_LINKS[1]["url"] + link = select_download_date() + assert link == SAMPLE_LINKS[1] @patch("open_mastr.xml_download.utils_download_bulk.list_available_downloads") @@ -96,37 +104,47 @@ def test_select_download_date_cancel(mock_list_downloads): # Simulate user choosing option 2 (Cancel) with patch("builtins.input", side_effect=["2"]): - date, url = select_download_date() - assert date is None - assert url is None + link = select_download_date() + assert link is None +@patch("open_mastr.mastr.Mastr.generate_data_model") @patch("open_mastr.mastr.write_mastr_xml_to_database") @patch("open_mastr.mastr.select_download_date") @patch("open_mastr.mastr.download_xml_Mastr") -def test_mastr_download_interactive(mock_download, mock_select_date, mock_write_db): +def test_mastr_download_interactive( + mock_download, + mock_select_date, + mock_write_db, + mock_generate_data_model, + mastr: Mastr, +): """Test the main download method with interactive selection.""" - mock_select_date.return_value = ("20241231", "http://example.com/file.zip") - db = Mastr() - 
db.download(select_date_interactively=True) + link = SAMPLE_LINKS[0] + mock_select_date.return_value = link + mastr.download(select_date_interactively=True) # Assert that select_download_date was called mock_select_date.assert_called_once() + # Assert that generate_data_model was called with the correct URL + mock_generate_data_model.assert_called_once() + _, kwargs = mock_generate_data_model.call_args + assert kwargs["url"] == link["docs_url"] + # Assert that download_xml_Mastr was called with the correct URL mock_download.assert_called_once() - args, kwargs = mock_download.call_args - assert args[4] == "http://example.com/file.zip" - assert args[1] == "20241231" # date argument + args, _ = mock_download.call_args + assert args[4] == link["url"] + assert args[1] == link["date"] @patch("open_mastr.mastr.select_download_date") @patch("open_mastr.mastr.download_xml_Mastr") -def test_mastr_download_interactive_cancel(mock_download, mock_select_date): +def test_mastr_download_interactive_cancel(mock_download, mock_select_date, mastr: Mastr): """Test the main download method when interactive selection is cancelled.""" - mock_select_date.return_value = (None, None) - db = Mastr() - db.download(select_date_interactively=True) + mock_select_date.return_value = None + mastr.download(select_date_interactively=True) # Assert that select_download_date was called mock_select_date.assert_called_once() @@ -135,12 +153,11 @@ def test_mastr_download_interactive_cancel(mock_download, mock_select_date): mock_download.assert_not_called() -@patch("open_mastr.xml_download.utils_download_bulk.list_available_downloads") -def test_mastr_browse_available_downloads(mock_list_downloads): +@patch("open_mastr.mastr.list_available_downloads") +def test_mastr_browse_available_downloads(mock_list_downloads, mastr: Mastr): """Test the browse_available_downloads method.""" mock_list_downloads.return_value = SAMPLE_LINKS - db = Mastr() - result = db.browse_available_downloads() + result = 
mastr.browse_available_downloads() mock_list_downloads.assert_called_once() assert result == SAMPLE_LINKS diff --git a/tests/test_mastr.py b/tests/test_mastr.py index ce7cd6fa..ca7f5c20 100644 --- a/tests/test_mastr.py +++ b/tests/test_mastr.py @@ -1,110 +1,336 @@ +import io import shutil +import zipfile +from pathlib import Path +from typing import Optional -from open_mastr.mastr import Mastr import os import re import sqlalchemy import pytest +import responses from os.path import expanduser import pandas as pd -from open_mastr.utils.constants import TRANSLATIONS from datetime import date, timedelta +from open_mastr.mastr import Mastr +from open_mastr.utils.constants import TABLE_TRANSLATIONS +from open_mastr.utils.sqlalchemy_tables import CatalogString -_xml_file_exists = False -_xml_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "xml_download") -if os.path.isdir(_xml_folder_path): - for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: - _xml_file_exists = True - +from tests.conftest import EXISTING_DOCS_ZIP, EXISTING_XML_ZIP -@pytest.fixture(scope="module") -def zipped_xml_file_path(): - zipped_xml_file_path = None - for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: - zipped_xml_file_path = os.path.join(_xml_folder_path, entry.name) - return zipped_xml_file_path +def test_mastr_init(mastr: Mastr) -> None: + # test if folder structure exists + assert os.path.exists(mastr._sqlite_folder_path) + # test if engine and connection were created + assert type(mastr.engine) == sqlalchemy.engine.Engine -@pytest.fixture -def db_path(): - return os.path.join( - os.path.expanduser("~"), ".open-MaStR", "data", "sqlite", "mastr-test.db" - ) +@pytest.mark.dependency(name="bulk_downloaded") +@pytest.mark.skipif( + not EXISTING_XML_ZIP or not EXISTING_DOCS_ZIP, + reason="The zipped XML or docs could not be found." 
+) +def test_mastr_download_latest_real_xml( + mastr: Mastr, + existing_xml_zip_in_output_dir: Path, + existing_docs_zip_in_output_dir: Path, +) -> None: + mastr.download(data="wind") + df_wind = pd.read_sql("EinheitenWind", con=mastr.engine) + assert len(df_wind) > 10000 + mastr.download(data="biomass") + df_biomass = pd.read_sql("EinheitenBiomasse", con=mastr.engine) + # Test that old biomass data is not deleted. + assert len(df_wind) > 10000 + assert len(df_biomass) > 10000 -@pytest.fixture -def db(db_path): - return Mastr(engine=sqlalchemy.create_engine(f"sqlite:///{db_path}")) + # Test that we can pass a list of data. + mastr.download(data=["wind", "nuclear"]) + df_wind = pd.read_sql("EinheitenWind", con=mastr.engine) + df_biomass = pd.read_sql("EinheitenBiomasse", con=mastr.engine) + df_nuclear = pd.read_sql("EinheitenKernkraft", con=mastr.engine) + assert len(df_wind) > 10000 + assert len(df_biomass) > 10000 + assert len(df_nuclear) > 1 + with mastr.engine.connect() as con: + query = sqlalchemy.text( + "SELECT Gemeinde, Bruttoleistung, WindAnlandOderAufSee" + " FROM EinheitenWind" + " WHERE EinheitMastrNummer = 'SEE909443729526'" + ) + result = con.execute(query) + rows = result.all() + assert rows == [("Badbergen", 6800.0, "Windkraft an Land")] -@pytest.fixture -def db_translated(db_path): - engine = sqlalchemy.create_engine(f"sqlite:///{db_path}") - db_api = Mastr(engine=engine) + # Check that the view wind_extended works. + query = sqlalchemy.text( + "SELECT Gemeinde, Bruttoleistung, WindAnlandOderAufSee" + " FROM wind_extended" + " WHERE EinheitMastrNummer = 'SEE909443729526'" + ) + result = con.execute(query) + rows = result.all() + assert rows == [("Badbergen", 6800.0, "Windkraft an Land")] - db_api.download(date="existing", data=["wind", "hydro", "biomass", "combustion"]) - db_api.translate() - return db_api +@pytest.mark.skipif( + not EXISTING_XML_ZIP or not EXISTING_DOCS_ZIP, + reason="The zipped XML or docs could not be found." 
+) +def test_mastr_download_latest_real_xml_english( + mastr: Mastr, + existing_xml_zip_in_output_dir: Path, + existing_docs_zip_in_output_dir: Path, +) -> None: + mastr.download(data="wind", english=True) + df_wind = pd.read_sql("units_wind", con=mastr.engine) + assert len(df_wind) > 10000 + with mastr.engine.connect() as con: + query = sqlalchemy.text( + "SELECT municipality, grossCapacity, windOnshoreOrOffshore" + " FROM units_wind" + " WHERE unitMastrNumber = 'SEE909443729526'" + ) + result = con.execute(query) + rows = result.all() + assert rows == [("Badbergen", 6800.0, "Windkraft an Land")] -def test_Mastr_init(db): - # test if folder structure exists - assert os.path.exists(db.home_directory) - assert os.path.exists(db._sqlite_folder_path) - # test if engine and connection were created - assert type(db.engine) == sqlalchemy.engine.Engine + # Check that the view wind_extended works. + query = sqlalchemy.text( + "SELECT municipality, grossCapacity, windOnshoreOrOffshore" + " FROM wind_extended" + " WHERE unitMastrNumber = 'SEE909443729526'" + ) + result = con.execute(query) + rows = result.all() + assert rows == [("Badbergen", 6800.0, "Windkraft an Land")] @pytest.mark.skipif( - not _xml_file_exists, reason="The zipped xml file could not be found." + not EXISTING_XML_ZIP, + reason="The zipped XML or docs could not be found." 
) -def test_Mastr_translate(db_translated, db_path): - # test if database was renamed correctly - transl_path = db_path[:-3] + "-translated.db" - assert os.path.exists(transl_path) +def test_download_latest_without_altering_tables( + mastr: Mastr, + existing_xml_zip_in_output_dir: Path, +) -> None: + db_table = sqlalchemy.Table( + 'balancing_area', + sqlalchemy.MetaData(), + sqlalchemy.Column('id', sqlalchemy.Integer(), primary_key=True, nullable=False, info={'original_name': 'Id', 'normalized_name': 'Id', 'english_name': 'id'}), + sqlalchemy.Column('yeic', sqlalchemy.String(), info={'original_name': 'Yeic', 'normalized_name': 'Yeic', 'english_name': 'yeic'}), + sqlalchemy.Column('accountingAreaNetworkConnectionPoint', sqlalchemy.String(), info={'original_name': 'BilanzierungsgebietNetzanschlusspunkt', 'normalized_name': 'BilanzierungsgebietNetzanschlusspunkt', 'english_name': 'accountingAreaNetworkConnectionPoint'}), + sqlalchemy.Column('dataSource', sqlalchemy.String(), info={'normalized_name': 'DatenQuelle', 'english_name': 'dataSource'}), + sqlalchemy.Column('downloadDate', sqlalchemy.String(), info={'normalized_name': 'DatumDownload', 'english_name': 'downloadDate'}), + info={'original_name': 'Bilanzierungsgebiete', 'english_name': 'balancing_area'}, + ) + db_table.create(mastr.engine) + + mastr.download( + data="balancing_area", + english=True, + mastr_table_to_db_table={"Bilanzierungsgebiete": db_table}, + alter_database_tables=False, + ) - # test if columns got translated - inspector = sqlalchemy.inspect(db_translated.engine) - table_names = inspector.get_table_names() + # Check that column RegelzoneNetzanschlusspunkt/controlZoneNetworkConnectionPoint has not been created. 
+ db_column_names = {column["name"] for column in sqlalchemy.inspect(mastr.engine).get_columns(db_table.name)} + assert db_column_names == {'id', 'yeic', 'accountingAreaNetworkConnectionPoint', 'dataSource', 'downloadDate'} - for table in table_names: - for column in inspector.get_columns(table): - column = column["name"] - assert column in TRANSLATIONS.values() or column not in TRANSLATIONS.keys() + # Check that data has been imported. + with mastr.engine.connect() as con: + row_count_query = sqlalchemy.text("SELECT COUNT(*) FROM balancing_area") + row_count = con.scalar(row_count_query) + assert row_count > 1000 + query = sqlalchemy.text( + "SELECT id, yeic, accountingAreaNetworkConnectionPoint" + " FROM balancing_area" + " WHERE yeic = '11YW-FREUDENST-L'" + ) + result = con.execute(query) + rows = result.all() + assert rows == [(428, "11YW-FREUDENST-L", "Stromnetz Freudenstadt")] - # test if new translated version replaces previous one - db_translated.engine.dispose() - engine = sqlalchemy.create_engine(f"sqlite:///{db_path}") - db_empty = Mastr(engine=engine) - db_empty.translate() - for table in table_names: - assert pd.read_sql(sql=table, con=db_empty.engine).shape[0] == 0 +@pytest.mark.skipif( + not EXISTING_DOCS_ZIP, + reason="The zipped docs could not be found." +) +def test_mastr_generate_data_model( + mastr: Mastr, + existing_docs_zip_in_output_dir: Path, +) -> None: + mastr_table_to_db_table = mastr.generate_data_model() + expected_keys = set(TABLE_TRANSLATIONS.keys()) - { + # A couple of tables with meta information we do not create. 
+ "Einheitentypen", + "Katalogkategorien", + "Katalogwerte", + "Lokationstypen", + "Marktrollen", + "Marktfunktionen", + } + assert set(mastr_table_to_db_table.keys()) == expected_keys + # Check some samples + solar_table = mastr_table_to_db_table["EinheitenSolar"] + assert solar_table.name == "EinheitenSolar" + solar_table.info == {"original_name": "EinheitenSolar", "english_name": "units_solar"} + # Check a couple of columns + assert solar_table.c.EinheitMastrNummer.primary_key is True + assert isinstance(solar_table.c.EinheitMastrNummer.type, sqlalchemy.String) + assert isinstance(solar_table.c.Bruttoleistung.type, sqlalchemy.Float) + assert isinstance(solar_table.c.Hauptausrichtung.type, CatalogString) + assert isinstance(solar_table.c.EinheitlicheAusrichtungUndNeigungswinkel.type, sqlalchemy.Boolean) + changed_dso_assignment_table = mastr_table_to_db_table["EinheitenAenderungNetzbetreiberzuordnungen"] + assert changed_dso_assignment_table.c.OpenMastrId.primary_key is True + assert isinstance(changed_dso_assignment_table.c.OpenMastrId.type, sqlalchemy.Integer) -@pytest.mark.dependency(name="bulk_downloaded") -def test_mastr_download(db): - db.download(data="wind") - df_wind = pd.read_sql("wind_extended", con=db.engine) - assert len(df_wind) > 10000 - db.download(data="biomass") - df_biomass = pd.read_sql("biomass_extended", con=db.engine) - assert len(df_wind) > 10000 - assert len(df_biomass) > 10000 +@pytest.mark.skipif( + not EXISTING_DOCS_ZIP, + reason="The zipped docs could not be found." +) +def test_mastr_generate_data_model_english( + mastr: Mastr, + existing_docs_zip_in_output_dir: Path, +) -> None: + mastr_table_to_db_table = mastr.generate_data_model(english=True) + expected_keys = set(TABLE_TRANSLATIONS.keys()) - { + # A couple of tables with meta information we do not create. 
+ "Einheitentypen", + "Katalogkategorien", + "Katalogwerte", + "Lokationstypen", + "Marktrollen", + "Marktfunktionen", + } + assert set(mastr_table_to_db_table.keys()) == expected_keys + # Check some samples + solar_table = mastr_table_to_db_table["EinheitenSolar"] + assert solar_table.name == "units_solar" + solar_table.info == {"original_name": "EinheitenSolar", "english_name": "units_solar"} + # Check a couple of columns + assert solar_table.c.unitMastrNumber.primary_key is True + assert isinstance(solar_table.c.unitMastrNumber.type, sqlalchemy.String) + assert isinstance(solar_table.c.grossCapacity.type, sqlalchemy.Float) + assert isinstance(solar_table.c.mainOrientation.type, CatalogString) + assert isinstance(solar_table.c.uniformOrientationAndTiltAngle.type, sqlalchemy.Boolean) + + changed_dso_assignment_table = mastr_table_to_db_table["EinheitenAenderungNetzbetreiberzuordnungen"] + assert changed_dso_assignment_table.c.OpenMastrId.primary_key is True + assert isinstance(changed_dso_assignment_table.c.OpenMastrId.type, sqlalchemy.Integer) @pytest.mark.dependency(depends=["bulk_downloaded"]) -def test_mastr_download_keep_old_files(db, zipped_xml_file_path): - file_today = zipped_xml_file_path +@pytest.mark.skipif( + not EXISTING_XML_ZIP or not EXISTING_DOCS_ZIP, + reason="The zipped XML or docs could not be found." +) +def test_mastr_download_keep_old_downloads( + mastr: Mastr, + existing_xml_zip_in_output_dir: Path, + existing_docs_zip_in_output_dir: Path, +) -> None: + file_today = existing_xml_zip_in_output_dir + if not file_today: + raise ValueError( + "Zip file is missing. This should never happen and indicates a faulty test." + " The file has somehow been deleted between test discovery time and this test" + " being started." 
+ ) yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d") - file_old = re.sub(r"\d{8}", yesterday, os.path.basename(file_today)) - file_old = os.path.join(os.path.dirname(zipped_xml_file_path), file_old) + file_old_basename = re.sub(r"\d{8}", yesterday, os.path.basename(file_today)) + file_old = os.path.join(os.path.dirname(existing_xml_zip_in_output_dir), file_old_basename) shutil.copy(file_today, file_old) - db.download(data="gsgk", keep_old_files=True) + mastr.download(data="gsgk", keep_old_downloads=True) assert os.path.exists(file_old) + + +def test_mastr_generate_data_model_fallback_to_included_docs( + mastr: Mastr, + output_dir: Path, + responses: responses.RequestsMock, + caplog: pytest.LogCaptureFixture, +) -> None: + caplog.set_level("ERROR") + + invalid_xsd = """ + + + + + + + + + + + + + + + + + + """ + # Create inner ZIP file xsd.zip containing Netze.xsd + inner_zip_content = io.BytesIO() + with zipfile.ZipFile(inner_zip_content, 'w') as inner_zip: + inner_zip.writestr('Netze.xsd', invalid_xsd) + inner_zip_content.seek(0) + + # Create outer ZIP file containing xsd.zip + outer_zip_content = io.BytesIO() + with zipfile.ZipFile(outer_zip_content, 'w') as outer_zip: + outer_zip.writestr('xsd.zip', inner_zip_content.getvalue()) + outer_zip_content.seek(0) + + expected_url = ( + "https://download.marktstammdatenregister.de/Stichtag/" + "Dokumentation%20MaStR%20Gesamtdatenexport%2001-03-2026.zip" + ) + # Mock the GET request + responses.add( + responses.GET, + expected_url, + body=outer_zip_content.getvalue(), + content_type='application/zip' + ) + + mastr_table_to_db_table = mastr.generate_data_model( + date="20260301" + ) + # We expect that reading the invalid XSD Netze.xsd fails, triggering the fallback. + assert len(caplog.messages) == 1 + assert "Falling back to stored docs" in caplog.messages[0] + + expected_keys = set(TABLE_TRANSLATIONS.keys()) - { + # A couple of tables with meta information we do not create. 
+ "Einheitentypen",
+ "Katalogkategorien",
+ "Katalogwerte",
+ "Lokationstypen",
+ "Marktrollen",
+ "Marktfunktionen",
+ }
+ assert set(mastr_table_to_db_table.keys()) == expected_keys
+ # Check some samples
+ solar_table = mastr_table_to_db_table["EinheitenSolar"]
+ assert solar_table.name == "EinheitenSolar"
+ assert solar_table.info == {"original_name": "EinheitenSolar", "english_name": "units_solar"}
+ # Check a couple of columns
+ assert solar_table.c.EinheitMastrNummer.primary_key is True
+ assert isinstance(solar_table.c.EinheitMastrNummer.type, sqlalchemy.String)
+ assert isinstance(solar_table.c.Bruttoleistung.type, sqlalchemy.Float)
+ assert isinstance(solar_table.c.Hauptausrichtung.type, CatalogString)
+ assert isinstance(solar_table.c.EinheitlicheAusrichtungUndNeigungswinkel.type, sqlalchemy.Boolean)
+
+ changed_dso_assignment_table = mastr_table_to_db_table["EinheitenAenderungNetzbetreiberzuordnungen"]
+ assert changed_dso_assignment_table.c.OpenMastrId.primary_key is True
+ assert isinstance(changed_dso_assignment_table.c.OpenMastrId.type, sqlalchemy.Integer)
diff --git a/tests/xml_download/test_utils_cleansing_bulk.py b/tests/xml_download/test_utils_cleansing_bulk.py
index 9a29ad76..0fc6587f 100644
--- a/tests/xml_download/test_utils_cleansing_bulk.py
+++ b/tests/xml_download/test_utils_cleansing_bulk.py
@@ -1,72 +1,78 @@
-import sys
-import sqlite3
-from os.path import expanduser
import os
import pandas as pd
-import numpy as np
import pytest
+from os.path import expanduser
+from pathlib import Path
from open_mastr.xml_download.utils_cleansing_bulk import (
+ cleanse_bulk_data,
create_katalogwerte_from_bulk_download,
replace_mastr_katalogeintraege,
)
+from tests.conftest import EXISTING_XML_ZIP
+
# Check if xml file exists
_xml_file_exists = False
_xml_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "xml_download")
if os.path.isdir(_xml_folder_path):
for entry in os.scandir(path=_xml_folder_path):
- if "Gesamtdatenexport" in entry.name:
+ if "Gesamtdatenexport" in entry.name and entry.name.endswith(".zip"): _xml_file_exists = True -_sqlite_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "sqlite") -_sqlite_file_path = os.path.join(_sqlite_folder_path, "open-mastr.db") -_sqlite_db_exists = bool(os.path.exists(_sqlite_file_path)) - -# Silence ValueError caused by logger https://github.com/pytest-dev/pytest/issues/5502 -@pytest.fixture(autouse=True) -def capture_wrap(): - sys.stderr.close = lambda *args: None - sys.stdout.close = lambda *args: None - yield +@pytest.mark.skipif( + not EXISTING_XML_ZIP, + reason="The zipped XML could not be found." +) +def test_cleanse_bulk_data(existing_xml_zip_in_output_dir: Path) -> None: + df_raw = pd.DataFrame( + { + "ID": [0, 1, 2], + "Bundesland": [335, 335, 336], + "Einheittyp": [1, 8, 5], + } + ) + df_replaced = pd.DataFrame( + { + "ID": [0, 1, 2], + "Bundesland": ["Bayern", "Bayern", "Bremen"], + "Einheittyp": ["Solareinheit", "Stromspeichereinheit", "Geothermie"], + } + ) -@pytest.fixture(scope="module") -def con(): - con = sqlite3.connect(_sqlite_file_path) - yield con - con.close() - - -@pytest.fixture(scope="module") -def zipped_xml_file_path(): - zipped_xml_file_path = None - for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: - zipped_xml_file_path = os.path.join(_xml_folder_path, entry.name) - - return zipped_xml_file_path + pd.testing.assert_frame_equal( + cleanse_bulk_data( + df=df_raw, zipped_xml_file_path=str(existing_xml_zip_in_output_dir), catalog_columns={"Bundesland", "Einheittyp"}, + ), + df_replaced, + ) @pytest.mark.skipif( - not _xml_file_exists, reason="The zipped xml file could not be found." + not EXISTING_XML_ZIP, + reason="The zipped XML could not be found." 
) -def test_replace_mastr_katalogeintraege(zipped_xml_file_path): +def test_replace_mastr_katalogeintraege(existing_xml_zip_in_output_dir: Path) -> None: df_raw = pd.DataFrame({"ID": [0, 1, 2], "Bundesland": [335, 335, 336]}) df_replaced = pd.DataFrame( {"ID": [0, 1, 2], "Bundesland": ["Bayern", "Bayern", "Bremen"]} ) pd.testing.assert_frame_equal( - df_replaced, replace_mastr_katalogeintraege(zipped_xml_file_path, df_raw) + replace_mastr_katalogeintraege( + zipped_xml_file_path=str(existing_xml_zip_in_output_dir), df=df_raw, catalog_columns={"Bundesland", "Einheittyp"}, + ), + df_replaced, ) @pytest.mark.skipif( - not _xml_file_exists, reason="The zipped xml file could not be found." + not EXISTING_XML_ZIP, + reason="The zipped XML could not be found." ) -def test_create_katalogwerte_from_bulk_download(zipped_xml_file_path): +def test_create_katalogwerte_from_bulk_download(existing_xml_zip_in_output_dir: Path) -> None: katalogwerte = create_katalogwerte_from_bulk_download( - zipped_xml_file_path=zipped_xml_file_path + zipped_xml_file_path=existing_xml_zip_in_output_dir ) assert type(katalogwerte) == dict assert len(katalogwerte) > 1000 diff --git a/tests/xml_download/test_utils_download_bulk.py b/tests/xml_download/test_utils_download_bulk.py index 8f650933..5017e37c 100644 --- a/tests/xml_download/test_utils_download_bulk.py +++ b/tests/xml_download/test_utils_download_bulk.py @@ -1,15 +1,15 @@ import time from open_mastr.xml_download.utils_download_bulk import ( - gen_url, + gen_xml_download_url, delete_xml_files_not_from_given_date, ) import os import shutil -def test_gen_url(): +def test_gen_xml_download_url(): when = time.strptime("2024-01-01", "%Y-%m-%d") - url = gen_url(when) + url = gen_xml_download_url(when) assert type(url) == str assert ( url @@ -17,7 +17,7 @@ def test_gen_url(): ) when = time.strptime("2024-04-01", "%Y-%m-%d") - url = gen_url(when) + url = gen_xml_download_url(when) assert type(url) == str assert ( url @@ -25,7 +25,7 @@ def 
test_gen_url(): ) when = time.strptime("2024-04-02", "%Y-%m-%d") - url = gen_url(when) + url = gen_xml_download_url(when) assert type(url) == str assert ( url @@ -33,7 +33,7 @@ def test_gen_url(): ) when = time.strptime("2024-10-01", "%Y-%m-%d") - url = gen_url(when) + url = gen_xml_download_url(when) assert type(url) == str assert ( url @@ -41,7 +41,7 @@ def test_gen_url(): ) when = time.strptime("2024-10-02", "%Y-%m-%d") - url = gen_url(when) + url = gen_xml_download_url(when) assert type(url) == str assert ( url @@ -49,7 +49,7 @@ def test_gen_url(): ) when = time.strptime("2024-12-31", "%Y-%m-%d") - url = gen_url(when) + url = gen_xml_download_url(when) assert type(url) == str assert ( url @@ -59,7 +59,7 @@ def test_gen_url(): # Tests for use_version parameter when = time.strptime("2024-12-31", "%Y-%m-%d") - url = gen_url(when, use_version="before") + url = gen_xml_download_url(when, use_version="before") assert type(url) == str assert ( url @@ -67,7 +67,7 @@ def test_gen_url(): ) when = time.strptime("2024-12-31", "%Y-%m-%d") - url = gen_url(when, use_version="after") + url = gen_xml_download_url(when, use_version="after") assert type(url) == str assert ( url @@ -75,7 +75,7 @@ def test_gen_url(): ) when = time.strptime("2024-04-02", "%Y-%m-%d") - url = gen_url(when, use_version="before") + url = gen_xml_download_url(when, use_version="before") assert type(url) == str assert ( url @@ -83,7 +83,7 @@ def test_gen_url(): ) when = time.strptime("2024-04-02", "%Y-%m-%d") - url = gen_url(when, use_version="after") + url = gen_xml_download_url(when, use_version="after") assert type(url) == str assert ( url diff --git a/tests/xml_download/test_utils_write_to_database.py b/tests/xml_download/test_utils_write_to_database.py index 4f466157..bf16585f 100644 --- a/tests/xml_download/test_utils_write_to_database.py +++ b/tests/xml_download/test_utils_write_to_database.py @@ -3,28 +3,37 @@ import sys from datetime import datetime from os.path import expanduser +from pathlib 
import Path +from typing import Any, Callable from zipfile import ZipFile import numpy as np import pandas as pd import pytest -from sqlalchemy import create_engine, inspect -from sqlalchemy.sql import text +from sqlalchemy import ( + Boolean, + Column, + create_engine, + Date, + DateTime, + Double, + Engine, + Integer, + MetaData, + String, + Table, +) + -from open_mastr.utils import orm -from open_mastr.utils.orm import RetrofitUnits, ElectricityConsumer, tablename_mapping +from open_mastr.utils.sqlalchemy_tables import CatalogString from open_mastr.xml_download.utils_write_to_database import ( add_missing_columns_to_table, add_zero_as_first_character_for_too_short_string, cast_date_columns_to_string, - change_column_names_to_orm_format, correct_ordering_of_filelist, - create_database_table, - extract_sql_table_name, extract_xml_table_name, is_date_column, is_first_file, - is_table_relevant, process_table_before_insertion, read_xml_file, add_table_to_non_sqlite_database, @@ -32,51 +41,12 @@ interleave_files, ) -# Check if xml file exists -_xml_file_exists = False -_xml_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "xml_download") -if os.path.isdir(_xml_folder_path): - for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: - _xml_file_exists = True - - -# Silence ValueError caused by logger https://github.com/pytest-dev/pytest/issues/5502 -@pytest.fixture(autouse=True) -def capture_wrap(): - sys.stderr.close = lambda *args: None - sys.stdout.close = lambda *args: None - yield - - -@pytest.fixture(scope="module") -def zipped_xml_file_path(): - zipped_xml_file_path = None - for entry in os.scandir(path=_xml_folder_path): - if "Gesamtdatenexport" in entry.name: - zipped_xml_file_path = os.path.join(_xml_folder_path, entry.name) - - return zipped_xml_file_path +from tests.conftest import EXISTING_XML_ZIP -@pytest.fixture(scope="module") -def con_testdb(): - testdb_file_path = os.path.join( - expanduser("~"), 
".open-MaStR", "data", "sqlite", "test-open-mastr.db" - ) - # Create testdb - con_testdb = sqlite3.connect(testdb_file_path) - yield con_testdb - con_testdb.close() - # Remove testdb - os.remove(testdb_file_path) - - -@pytest.fixture(scope="module") -def engine_testdb(): - testdb_file_path = os.path.join( - expanduser("~"), ".open-MaStR", "data", "sqlite", "test-open-mastr.db" - ) +@pytest.fixture +def engine_testdb(tmp_path: Path) -> Engine: + testdb_file_path = tmp_path / "test-open-mastr.db" testdb_url = f"sqlite:///{testdb_file_path}" yield create_engine(testdb_url) @@ -86,27 +56,6 @@ def test_extract_xml_table_name(): assert extract_xml_table_name(file_name) == "netzanschlusspunkte" -def text_extract_sql_table_name(): - xml_table_name = "netzanschlusspunkte" - assert extract_sql_table_name(xml_table_name) == "network_connection_points" - - -def test_is_table_relevant(): - include_tables = ["anlagengasspeicher", "marktakteure"] - assert is_table_relevant("anlagengasspeicher", include_tables) is True - assert is_table_relevant("netzanschlusspunkte", include_tables) is False - - -def test_create_database_table(engine_testdb): - orm.Base.metadata.create_all(engine_testdb) - xml_table_name = "einheitenkernkraft" - sql_table_name = "nuclear_extended" - - create_database_table(engine_testdb, xml_table_name) - - assert inspect(engine_testdb).has_table(sql_table_name) is True - - def test_is_first_file(): assert is_first_file("EinheitenKernkraft.xml") is True assert is_first_file("EinheitenKernkraft_1.xml") is True @@ -114,9 +63,16 @@ def test_is_first_file(): def test_cast_date_columns_to_string(): + table = Table( + "anlageneegwasser", + MetaData(), + Column("EegMastrNummer", String, primary_key=True), + Column("Registrierungsdatum", Date), + Column("DatumLetzteAktualisierung", DateTime), + ) initial_df = pd.DataFrame( { - "EegMastrNummer": [1, 2, 3], + "EegMastrNummer": ["1", "2", "3"], "Registrierungsdatum": [ datetime(2024, 3, 11).date(), datetime(1999, 2, 
1).date(), @@ -131,7 +87,7 @@ def test_cast_date_columns_to_string(): ) expected_df = pd.DataFrame( { - "EegMastrNummer": [1, 2, 3], + "EegMastrNummer": ["1", "2", "3"], "Registrierungsdatum": ["2024-03-11", "1999-02-01", np.nan], "DatumLetzteAktualisierung": [ "2022-03-22 00:00:00.000000", @@ -142,32 +98,14 @@ def test_cast_date_columns_to_string(): ) pd.testing.assert_frame_equal( - expected_df, cast_date_columns_to_string("anlageneegwasser", initial_df) + expected_df, cast_date_columns_to_string(table, initial_df) ) def test_is_date_column(): - columns = RetrofitUnits.__table__.columns.items() - df = pd.DataFrame( - { - "Id": [1], - "DatumLetzteAktualisierung": [datetime(2022, 3, 22)], - "WiederinbetriebnahmeDatum": [datetime(2024, 3, 11).date()], - } - ) - - date_column = list(filter(lambda col: col[0] == "Id", columns))[0] - assert is_date_column(date_column, df) is False - - datetime_column = list( - filter(lambda col: col[0] == "DatumLetzteAktualisierung", columns) - )[0] - assert is_date_column(datetime_column, df) is True - - date_column = list( - filter(lambda col: col[0] == "WiederinbetriebnahmeDatum", columns) - )[0] - assert is_date_column(date_column, df) is True + assert is_date_column(Column("Id", Integer, primary_key=True)) is False + assert is_date_column(Column("DatumLetzteAktualisierung", DateTime)) is True + assert is_date_column(Column("WiederinbetriebnahmeDatum", Date)) is True def test_correct_ordering_of_filelist(): @@ -216,27 +154,16 @@ def test_correct_ordering_of_filelist(): ] -@pytest.mark.skipif( - not _xml_file_exists, reason="The zipped xml file could not be found." 
-) -def test_read_xml_file(zipped_xml_file_path): +@pytest.mark.skipif(not EXISTING_XML_ZIP, reason="The zipped XML could not be found.") +def test_read_xml_file(existing_xml_zip_in_output_dir: Path) -> None: file_name = "EinheitenStromVerbraucher" - with ZipFile(zipped_xml_file_path, "r") as f: + with ZipFile(existing_xml_zip_in_output_dir, "r") as f: df = read_xml_file(f, f"{file_name}.xml") assert df.shape[0] > 0 - # Since the file is from the latest download, its content can vary over time. To make sure that the table is - # correctly created, we check that all of its columns are associated are included in our mapping. - for column in df.columns: - if column in tablename_mapping[file_name.lower()]["replace_column_names"]: - column = tablename_mapping[file_name.lower()]["replace_column_names"][ - column - ] - assert column in ElectricityConsumer.__table__.columns.keys() - -def test_add_zero_as_first_character_for_too_short_string(): +def test_add_zero_as_first_character_for_too_short_string() -> None: # Prepare df_raw = pd.DataFrame( {"ID": [0, 1, 2], "Gemeindeschluessel": [9162000, np.nan, 19123456]} @@ -251,68 +178,58 @@ def test_add_zero_as_first_character_for_too_short_string(): pd.testing.assert_frame_equal(df_edited, df_correct) -def test_change_column_names_to_orm_format(): - initial_df = pd.DataFrame( - { - "VerknuepfteEinheitenMaStRNummern": ["test1", "test2"], - "NetzanschlusspunkteMaStRNummern": [1, 2], - } - ) - expected_df = pd.DataFrame( - { - "VerknuepfteEinheiten": ["test1", "test2"], - "Netzanschlusspunkte": [1, 2], - } - ) - - pd.testing.assert_frame_equal( - expected_df, change_column_names_to_orm_format(initial_df, "lokationen") - ) - - -@pytest.mark.skipif( - not _xml_file_exists, reason="The zipped xml file could not be found." 
-) -def test_process_table_before_insertion(zipped_xml_file_path): +@pytest.mark.skipif(not EXISTING_XML_ZIP, reason="The zipped XML could not be found.") +def test_process_table_before_insertion(existing_xml_zip_in_output_dir: Path) -> None: bulk_download_date = datetime.now().date().strftime("%Y%m%d") initial_df = pd.DataFrame( { "Gemeindeschluessel": [9162000, 19123456], "Postleitzahl": [1234, 54321], "NameKraftwerk": ["test1", "test2"], - "LokationMaStRNummer": ["test3", "test4"], + "NetzbetreiberpruefungStatus": [2954, 2955], } ) + db_table = Table( + "einheitenkernkraft", + MetaData(), + Column("EinheitMastrNummer", String, primary_key=True), + Column("DatumLetzteAktualisierung", DateTime), + Column("Gemeindeschluessel", String), + Column("Postleitzahl", String), + Column("NameKraftwerk", String), + Column("NetzbetreiberpruefungStatus", CatalogString), + ) + actual_df = process_table_before_insertion( + initial_df, + db_table, + existing_xml_zip_in_output_dir, + bulk_download_date, + bulk_cleansing=True, + ) expected_df = pd.DataFrame( { "Gemeindeschluessel": ["09162000", "19123456"], "Postleitzahl": ["01234", "54321"], "NameKraftwerk": ["test1", "test2"], - "LokationMastrNummer": ["test3", "test4"], + "NetzbetreiberpruefungStatus": ["Geprüft", "In Prüfung"], "DatenQuelle": ["bulk", "bulk"], "DatumDownload": [bulk_download_date, bulk_download_date], } ) - pd.testing.assert_frame_equal( - expected_df, - process_table_before_insertion( - initial_df, - "einheitenkernkraft", - zipped_xml_file_path, - bulk_download_date, - bulk_cleansing=False, - ), - ) + pd.testing.assert_frame_equal(actual_df, expected_df) -def test_add_missing_columns_to_table(engine_testdb): +def test_add_missing_columns_to_table(engine_testdb: Engine) -> None: + table = Table( + "einheitengasverbraucher", + MetaData(), + Column("EinheitMastrNummer", String, primary_key=True), + Column("DatumLetzteAktualisierung", DateTime), + ) + table.create(engine_testdb) with engine_testdb.connect() as 
con: with con.begin(): - # We must recreate the table to be sure that the new colum is not present. - con.execute(text("DROP TABLE IF EXISTS gas_consumer")) - create_database_table(engine_testdb, "einheitengasverbraucher") - initial_data_in_db = pd.DataFrame( { "EinheitMastrNummer": ["id1"], @@ -320,12 +237,10 @@ def test_add_missing_columns_to_table(engine_testdb): } ) initial_data_in_db.to_sql( - "gas_consumer", con=con, if_exists="append", index=False + table.name, con=con, if_exists="append", index=False ) - add_missing_columns_to_table( - engine_testdb, "einheitengasverbraucher", ["NewColumn"] - ) + add_missing_columns_to_table(engine_testdb, table, ["NewColumn"]) expected_df = pd.DataFrame( { @@ -336,7 +251,7 @@ def test_add_missing_columns_to_table(engine_testdb): ) with engine_testdb.connect() as con: with con.begin(): - actual_df = pd.read_sql_table("gas_consumer", con=con) + actual_df = pd.read_sql_table(table.name, con=con) # The actual_df will contain more columns than the expected_df, so we can't use assert_frame_equal. assert expected_df.index.isin(actual_df.index).all() @@ -345,14 +260,32 @@ def test_add_missing_columns_to_table(engine_testdb): "add_table_to_database_function", [add_table_to_sqlite_database, add_table_to_non_sqlite_database], ) -def test_add_table_to_sqlite_database(engine_testdb, add_table_to_database_function): - with engine_testdb.connect() as con: - with con.begin(): - # We must recreate the table to be sure that no other data is present. 
- con.execute(text("DROP TABLE IF EXISTS gsgk_eeg")) - create_database_table( - engine_testdb, "anlageneeggeothermiegrubengasdruckentspannung" - ) +def test_add_table_to_sqlite_database( + engine_testdb: Engine, + add_table_to_database_function: Callable[[pd.DataFrame, Table, Engine], Any], +) -> None: + table = Table( + "anlageneeggeothermiegrubengasdruckentspannung", + MetaData(), + Column("EegMastrNummer", String, primary_key=True), + Column("InstallierteLeistung", Double), + Column("AnlageBetriebsstatus", String), + Column("Registrierungsdatum", Date), + Column("Meldedatum", DateTime), + Column("DatumLetzteAktualisierung", DateTime), + Column("EegInbetriebnahmedatum", DateTime), + Column("VerknuepfteEinheit", String), + Column("AnlagenschluesselEeg", String), + Column("AusschreibungZuschlag", Boolean), + Column("AnlagenkennzifferAnlagenregister", String), + Column("AnlagenkennzifferAnlagenregister_nv", String), + Column("Netzbetreiberzuordnungen", String), + Column("DatenQuelle", String), + Column("DatumDownload", DateTime), + ) + # We must recreate the table to be sure that no other data is present. 
+ table.drop(engine_testdb, checkfirst=True) + table.create(engine_testdb) df = pd.DataFrame( { @@ -369,10 +302,10 @@ def test_add_table_to_sqlite_database(engine_testdb, add_table_to_database_funct ) expected_df = pd.DataFrame( { + "EegMastrNummer": ["id1", "id2"], "InstallierteLeistung": [1.0, 100.4], "AnlageBetriebsstatus": [None, None], "Registrierungsdatum": [datetime(2022, 2, 2), datetime(2024, 3, 20)], - "EegMastrNummer": ["id1", "id2"], "Meldedatum": [np.datetime64("NaT"), np.datetime64("NaT")], "DatumLetzteAktualisierung": [ datetime(2022, 12, 2, 10, 10, 10, 300), @@ -390,14 +323,11 @@ def test_add_table_to_sqlite_database(engine_testdb, add_table_to_database_funct } ) - add_table_to_database_function( - df, "anlageneeggeothermiegrubengasdruckentspannung", "gsgk_eeg", engine_testdb - ) + add_table_to_database_function(df, table, engine_testdb) with engine_testdb.connect() as con: with con.begin(): - actual_df = pd.read_sql_table("gsgk_eeg", con=con) pd.testing.assert_frame_equal( - expected_df[df.columns], actual_df[df.columns], check_dtype=False + expected_df, pd.read_sql_table(table.name, con=con) )