From abc21ce3b2687aaaaaff5b1a5324667dd19cda0c Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 18:12:04 +0200 Subject: [PATCH 1/8] Update pre-commit config and modernize linting setup - Consolidate linting with ruff, replaces black and extents linting - Additional file checks (case-conflict, docstring-first, mixed-line-ending) - Configure global excludes for build artifacts and cache directories - Enable ruff docstring checking with numpy convention --- .pre-commit-config.yaml | 45 +++++++++++++++++++++++++++++++++++++---- pyproject.toml | 27 ++++++++++++++++++++++--- 2 files changed, 65 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 821c6237..e0e21c44 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,43 @@ +# Global excludes - apply to all hooks +exclude: ^(build/|dist/|.*\.egg-info/|\.tox/|\.pytest_cache/|\.mypy_cache/|__pycache__/|postprocessing/) + repos: - - repo: https://github.com/psf/black - rev: 22.6.0 + # Base pre-commit hooks for common issues + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 hooks: - - id: black - language_version: python3.11 + - id: trailing-whitespace + exclude: \.ipynb$ + - id: end-of-file-fixer + exclude: \.ipynb$ + - id: check-yaml + - id: check-added-large-files + args: ['--maxkb=1000'] + - id: check-json + - id: check-toml + - id: check-merge-conflict + - id: debug-statements + - id: check-case-conflict + - id: check-docstring-first + - id: mixed-line-ending + args: ['--fix=lf'] + + # Ruff - fast, comprehensive linter and formatter (replaces flake8, isort, pylint, pyupgrade, black, pydocstyle) + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.14.1 + hooks: + - id: ruff + args: [--fix, --exit-non-zero-on-fix] + - id: ruff-format + + # mypy - static type checking + - repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.18.2 + hooks: + - id: mypy + additional_dependencies: + - types-requests + - 
types-PyYAML + - types-tqdm + args: [--ignore-missing-imports, --show-error-codes] + exclude: ^(tests/|scripts/|docs/|postprocessing/) diff --git a/pyproject.toml b/pyproject.toml index a4fcb367..57017066 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,8 +49,6 @@ classifiers = [ [project.optional-dependencies] dev = [ - "flake8", - "pylint", "pytest", "pytest-dependency", "xmltodict", @@ -59,7 +57,8 @@ dev = [ "mkdocstrings[python]", "mkdocs-material", "mkdocs-include-markdown-plugin", - "black", + "ruff", + "mypy", ] [project.urls] @@ -80,3 +79,25 @@ include = ["open_mastr", "open_mastr.soap_api", "open_mastr.soap_api.metadata", # from setup.py - not yet included in here # download_url="https://github.com/OpenEnergyPlatform/open-MaStR/archive""/refs/tags/v0.15.0.tar.gz", + +[tool.ruff] +line-length = 88 +target-version = "py39" + +[tool.ruff.lint] +# Enable: +# E - pycodestyle errors +# F - pyflakes +# I - isort (import sorting) +# N - pep8-naming +# W - pycodestyle warnings +# UP - pyupgrade (modernize Python code) +# D - pydocstyle (docstring conventions) +select = ["E", "F", "I", "N", "W", "UP", "D"] +ignore = [] + +[tool.ruff.lint.pydocstyle] +convention = "numpy" + +[tool.ruff.lint.isort] +known-first-party = ["open_mastr"] From b8a76c144b45192963def6a23395998224034b70 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 18:14:08 +0200 Subject: [PATCH 2/8] Add GH Actions workflow for automated pre-commit checks in CI Ensures consistent checking locally and on GH --- .github/workflows/pre-commit.yml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 .github/workflows/pre-commit.yml diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml new file mode 100644 index 00000000..a0319cee --- /dev/null +++ b/.github/workflows/pre-commit.yml @@ -0,0 +1,32 @@ +name: Pre-commit Checks + +on: + pull_request: + branches: + - production + - develop + push: + branches: + - production + - develop 
+ +jobs: + pre-commit: + runs-on: ubuntu-latest + name: Run pre-commit hooks + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pre-commit + + - name: Run pre-commit + run: pre-commit run --all-files --show-diff-on-failure From 296486825d507f0df63cc4c81adce9a618232af1 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 18:38:09 +0200 Subject: [PATCH 3/8] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0eb6fa2c..a743fa3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,8 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ ### Added - Add partial bulk download [#652](https://github.com/OpenEnergyPlatform/open-MaStR/pull/652) +- Modernize linting setup and add pre-commit checks in CI + [#671](https://github.com/OpenEnergyPlatform/open-MaStR/pull/671) ### Changed - Updates the system_catalog dict with missing Einheittyp values [#653](https://github.com/OpenEnergyPlatform/open-MaStR/pull/653) From f91e00fb58269610eec305d7a5e9c3125a0aaf9b Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 18:48:40 +0200 Subject: [PATCH 4/8] Delete old .flake8 cfg --- .flake8 | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 .flake8 diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 7b59df75..00000000 --- a/.flake8 +++ /dev/null @@ -1,4 +0,0 @@ -[flake8] -exclude = meta/migrations/ -max-line-length = 100 -extend-ignore = E203 \ No newline at end of file From d6513ef36f5e498ebcced40d994197b7f1cf442f Mon Sep 17 00:00:00 2001 From: nesnoj Date: Tue, 21 Oct 2025 18:49:07 +0200 Subject: [PATCH 5/8] Exclude 'open_mastr/soap_api/' and 'scripts/' from linting --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml 
b/.pre-commit-config.yaml index e0e21c44..6dbcdb4d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ # Global excludes - apply to all hooks -exclude: ^(build/|dist/|.*\.egg-info/|\.tox/|\.pytest_cache/|\.mypy_cache/|__pycache__/|postprocessing/) +exclude: ^(build/|dist/|.*\.egg-info/|\.tox/|\.pytest_cache/|\.mypy_cache/|__pycache__/|postprocessing/|open_mastr/soap_api/|scripts/) repos: # Base pre-commit hooks for common issues From 65b1fadd1b0e9e388173ff06befc11ce742a03e0 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Thu, 11 Dec 2025 23:36:24 +0100 Subject: [PATCH 6/8] Fix merge conflict error --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 616e7e29..3ca0cf3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,7 +78,7 @@ open_mastr = [ include = ["open_mastr", "open_mastr.soap_api", "open_mastr.soap_api.metadata", "open_mastr.utils", "open_mastr.utils.config", "open_mastr.xml_download"] # package names should match these glob patterns (["*"] by default) # from setup.py - not yet included in here -# download_url="https://github.com/OpenEnergyPlatform/open-MaStR/archive""/refs/tags/v0.15.0.tar.gz", +# download_url="https://github.com/OpenEnergyPlatform/open-MaStR/archive""/refs/tags/v0.16.0.tar.gz", [tool.ruff] line-length = 88 From 630771cbf90d1f79f21bddb86676284f136b51ec Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 15 Dec 2025 11:54:23 +0100 Subject: [PATCH 7/8] Update pre-commit config --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6dbcdb4d..9537c213 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -24,7 +24,7 @@ repos: # Ruff - fast, comprehensive linter and formatter (replaces flake8, isort, pylint, pyupgrade, black, pydocstyle) - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.1 + rev: v0.14.9 hooks: - id: 
ruff args: [--fix, --exit-non-zero-on-fix] @@ -32,7 +32,7 @@ repos: # mypy - static type checking - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.18.2 + rev: v1.19.1 hooks: - id: mypy additional_dependencies: From 1051986aaa7af7c510668f8657a731f109416d55 Mon Sep 17 00:00:00 2001 From: nesnoj Date: Mon, 15 Dec 2025 12:24:31 +0100 Subject: [PATCH 8/8] Apply pre-commit hooks for testing: auto fixes --- .bumpversion.cfg | 4 +- .github/ISSUE_TEMPLATE/issue_template_bug.md | 8 +- .../ISSUE_TEMPLATE/issue_template_feature.md | 2 +- .../ISSUE_TEMPLATE/issue_template_release.md | 2 +- .../issue_template_user_kudos.md | 10 +-- .github/workflows/extend_user_cff.yml | 8 +- .github/workflows/pypi-publish.yml | 2 +- .github/workflows/test-pypi-publish.yml | 2 +- .readthedocs.yml | 2 +- CHANGELOG.md | 52 ++++++------ CITATION.cff | 4 +- CONTRIBUTING.md | 30 +++---- LICENSE.md | 1 - README.rst | 14 ++-- USERS.cff | 6 +- docs/_data/raw_data.rst | 12 ++- docs/_static/custom.css | 2 +- docs/advanced.md | 79 +++++++++---------- docs/data-release-notes.md | 9 +-- docs/dataset.md | 22 +++--- docs/development/changelog_mirror.md | 2 +- docs/development/contributing_mirror.md | 2 +- docs/getting_started.md | 6 +- docs/images/MaStR_Mirror.svg | 2 +- docs/images/MaStR_downloading.svg | 2 +- docs/index.md | 24 +++--- docs/mastr_structure.graphml | 6 +- docs/reference/basic.md | 2 - docs/requirements.txt | 1 - main.py | 4 +- mkdocs.yml | 8 +- open_mastr/mastr.py | 70 ++++++++-------- open_mastr/utils/config.py | 25 ++---- open_mastr/utils/credentials.py | 22 ++---- open_mastr/utils/helpers.py | 34 ++++---- open_mastr/utils/orm.py | 20 ++--- open_mastr/utils/unzip_http.py | 22 +++--- .../xml_download/utils_cleansing_bulk.py | 17 ++-- .../xml_download/utils_download_bulk.py | 16 ++-- .../xml_download/utils_write_to_database.py | 23 +++--- tests/conftest.py | 5 +- tests/preparation.py | 3 +- tests/soap_api/test_download.py | 8 +- tests/test_credentials.py | 3 +- 
tests/test_helpers.py | 38 +++++---- tests/test_mastr.py | 13 +-- tests/test_requirements.txt | 2 +- .../xml_download/test_utils_cleansing_bulk.py | 7 +- .../xml_download/test_utils_download_bulk.py | 7 +- .../test_utils_write_to_database.py | 8 +- 50 files changed, 326 insertions(+), 347 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index f22ed637..cf284529 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,14 +1,14 @@ [bumpversion] current_version = 0.16.0 parse = (?P\d+)\.(?P\d+)\.(?P\d+)((?P(a|na))+(?P\d+))? -serialize = +serialize = {major}.{minor}.{patch}{release}{build} {major}.{minor}.{patch} [bumpversion:part:release] first_value = a optional_value = na -values = +values = a na diff --git a/.github/ISSUE_TEMPLATE/issue_template_bug.md b/.github/ISSUE_TEMPLATE/issue_template_bug.md index fc0ce68b..c7ff7fd4 100644 --- a/.github/ISSUE_TEMPLATE/issue_template_bug.md +++ b/.github/ISSUE_TEMPLATE/issue_template_bug.md @@ -10,7 +10,7 @@ assignees: '' ## Description of the issue Describe the problem in as much detail as possible. -Focus on the expected and current behavior. +Focus on the expected and current behavior. If necessary, create a screenshot and insert below. ## Steps to Reproduce @@ -23,9 +23,9 @@ If necessary, create a screenshot and insert below. Describe possible ideas for solution and evaluate advantages and disadvantages. ## Context and Environment -* Version used: -* Operating system: -* Environment setup and (python) version: +* Version used: +* Operating system: +* Environment setup and (python) version: ## Workflow checklist - [ ] I have checked [the documentation](https://open-mastr.readthedocs.io/en/latest/) and confirmed this issue is not already addressed there. 
diff --git a/.github/ISSUE_TEMPLATE/issue_template_feature.md b/.github/ISSUE_TEMPLATE/issue_template_feature.md index 8d2b4037..d80ce96d 100644 --- a/.github/ISSUE_TEMPLATE/issue_template_feature.md +++ b/.github/ISSUE_TEMPLATE/issue_template_feature.md @@ -9,7 +9,7 @@ assignees: '' ## Description of the issue -Describe the problem in as much detail as possible. +Describe the problem in as much detail as possible. ## Ideas of solution diff --git a/.github/ISSUE_TEMPLATE/issue_template_release.md b/.github/ISSUE_TEMPLATE/issue_template_release.md index 2c35176e..56d4056e 100644 --- a/.github/ISSUE_TEMPLATE/issue_template_release.md +++ b/.github/ISSUE_TEMPLATE/issue_template_release.md @@ -15,7 +15,7 @@ assignees: '' ## Preparation - [ ] 🐙 Create a `Draft GitHub Release` with the release version number `vx.x.x` as title - [ ] Merge all open Pull Requests to `develop` -- [ ] Run tests locally with `pytest` and apply linting with `pre-commit run -a` +- [ ] Run tests locally with `pytest` and apply linting with `pre-commit run -a` ## Create a `release` branch - [ ] Checkout `develop` and branch with `git checkout -b release-vx.x.x` - [ ] Update version for test release with `bump2version --current-version current_version> --new-version patch` diff --git a/.github/ISSUE_TEMPLATE/issue_template_user_kudos.md b/.github/ISSUE_TEMPLATE/issue_template_user_kudos.md index 86890bed..b2344336 100644 --- a/.github/ISSUE_TEMPLATE/issue_template_user_kudos.md +++ b/.github/ISSUE_TEMPLATE/issue_template_user_kudos.md @@ -16,10 +16,10 @@ Please, insert your information below - fill out at minimum affiliation :purple_ :pencil2: **Spaces** and the following special characters are allowed: @ ? ! | . , : ; - _ [ / ( ) \ ] § $ % & = + < > -family-names: -given-names: -alias: -affiliation: -orcid: +family-names: +given-names: +alias: +affiliation: +orcid: Thank you! 
diff --git a/.github/workflows/extend_user_cff.yml b/.github/workflows/extend_user_cff.yml index e7db64fb..774d88a3 100644 --- a/.github/workflows/extend_user_cff.yml +++ b/.github/workflows/extend_user_cff.yml @@ -8,7 +8,7 @@ on: permissions: contents: write pull-requests: write - + jobs: create-pr-to-add-new-user: if: contains(join(github.event.issue.labels.*.name, ','), 'user') @@ -67,10 +67,10 @@ jobs: orcid: ${{env.ORCID}} EOF - + - name: Print updated USERS.cff run: cat USERS.cff - + - name: Create Pull Request uses: peter-evans/create-pull-request@v6 with: @@ -82,7 +82,7 @@ jobs: Add new user to USERS.cff body: | This pull request updates the USERS.cff file with new user information extracted from issue #${{ github.event.issue.number }} - + Closes #${{ github.event.issue.number }} Many thanks @${{ github.actor }}! diff --git a/.github/workflows/pypi-publish.yml b/.github/workflows/pypi-publish.yml index d4a7bea9..932fec54 100644 --- a/.github/workflows/pypi-publish.yml +++ b/.github/workflows/pypi-publish.yml @@ -35,4 +35,4 @@ jobs: if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@release/v1 with: - password: ${{ secrets.PYPI }} \ No newline at end of file + password: ${{ secrets.PYPI }} diff --git a/.github/workflows/test-pypi-publish.yml b/.github/workflows/test-pypi-publish.yml index 83c19222..a4aae1e3 100644 --- a/.github/workflows/test-pypi-publish.yml +++ b/.github/workflows/test-pypi-publish.yml @@ -36,4 +36,4 @@ jobs: with: password: ${{ secrets.PYPI_TEST }} repository-url: https://test.pypi.org/legacy/ - verbose: true \ No newline at end of file + verbose: true diff --git a/.readthedocs.yml b/.readthedocs.yml index 5944fdae..f5a6c3d5 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -12,4 +12,4 @@ mkdocs: python: install: - - requirements: docs/requirements.txt \ No newline at end of file + - requirements: docs/requirements.txt diff --git a/CHANGELOG.md b/CHANGELOG.md index b06c4eae..f69791b4 100644 --- a/CHANGELOG.md 
+++ b/CHANGELOG.md @@ -1,7 +1,7 @@ # Changelog All notable changes to this project will be documented in this file. -For each version important additions, changes and removals are listed here. +For each version important additions, changes and removals are listed here. The format is inspired from [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/v2.0.0.html). @@ -103,7 +103,7 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/ - Fixed missing call to gen_url in case first bulk download fails as xml file for today is not yet available [#534](https://github.com/OpenEnergyPlatform/open-MaStR/pull/534) - Repair links in the documentation page [#536](https://github.com/OpenEnergyPlatform/open-MaStR/pull/536) -## [v0.14.3] Fix Pypi Release - 2024-04-24 +## [v0.14.3] Fix Pypi Release - 2024-04-24 ### Added - Add new table `changed_dso_assignment` [#510](https://github.com/OpenEnergyPlatform/open-MaStR/pull/510) - Add deprecation warning for `MaStRMirror` and `MaStRDownload` [#492](https://github.com/OpenEnergyPlatform/open-MaStR/pull/492) @@ -267,27 +267,27 @@ Additionally, a new datasource was identified and can be used: the xml bulk down The API was updated to the newest version and the data model was adapted. 
### Added -- The class :class:`open_mastr.mastr.Matr` - was introduced as the entrypoint for users, the API download was included in this entrypoint +- The class :class:`open_mastr.mastr.Matr` + was introduced as the entrypoint for users, the API download was included in this entrypoint [#203](https://github.com/OpenEnergyPlatform/open-MaStR/issues/203) - A method for downloading and parsing the xml dump from the MaStR website - was implemented + was implemented [#202](https://github.com/OpenEnergyPlatform/open-MaStR/issues/202) -- New data classes and attributes were introduced to orm.py +- New data classes and attributes were introduced to orm.py [#209](https://github.com/OpenEnergyPlatform/open-MaStR/issues/209) - The documentation page was updated -- Unit tests were created +- Unit tests were created [#207](https://github.com/OpenEnergyPlatform/open-MaStR/issues/207) -- A CI pipeline was introduced +- A CI pipeline was introduced [#208](https://github.com/OpenEnergyPlatform/open-MaStR/issues/208) -- The metadata was updated +- The metadata was updated [#219](https://github.com/OpenEnergyPlatform/open-MaStR/issues/219) ## [v0.10.0] Unreleased - Refactoring - 2020-10-08 -A complete refactoring took place! Downloading data was entirely changed; introducing layers of code and removing -duplicated code while more of less following DRY. -Moreover, post-processing was changed to be more accessible and easier to execute. For example, docker now helps to +A complete refactoring took place! Downloading data was entirely changed; introducing layers of code and removing +duplicated code while more of less following DRY. +Moreover, post-processing was changed to be more accessible and easier to execute. For example, docker now helps to spin up a database container. The documention on RTD was extended, update and improved to be more helpful for new users. 
Read more about the details: @@ -297,24 +297,24 @@ Read more about the details: - added documentation for ReadTheDocs - improved parallel download - merged all stale branches -- The class :class:`open_mastr.soap_api.mirror.MaStRMirror` - was introduced for mirroring MaStR data with latest updates +- The class :class:`open_mastr.soap_api.mirror.MaStRMirror` + was introduced for mirroring MaStR data with latest updates [#149](https://github.com/OpenEnergyPlatform/open-MaStR/issues/149) - Introduce project home `~/.open-MaStR/config/` [#120](https://github.com/OpenEnergyPlatform/open-MaStR/issues/120) - Documentation of post-processing [#117](https://github.com/OpenEnergyPlatform/open-MaStR/issues/117) - Updated documentation of downloading data [#124](https://github.com/OpenEnergyPlatform/open-MaStR/issues/124) which is harmonized with the other parts of docs and with GitHubs README [#135](https://github.com/OpenEnergyPlatform/open-MaStR/issues/135) -- Local execution of post-processing now possible, optionally in dockered database +- Local execution of post-processing now possible, optionally in dockered database [#116](https://github.com/OpenEnergyPlatform/open-MaStR/issues/116) - Post-processing adapted to CSV data from :class:`open_mastr.soap_api.mirror.MaStRMirror` [#172](https://github.com/OpenEnergyPlatform/open-MaStR/issues/172) - Tests for changed download code are added [#131](https://github.com/OpenEnergyPlatform/open-MaStR/issues/131) -- Metadata added for raw data as frictionless data package +- Metadata added for raw data as frictionless data package [#160](https://github.com/OpenEnergyPlatform/open-MaStR/issues/160) - Suffix columns instead of deferring in database CSV export [#157](https://github.com/OpenEnergyPlatform/open-MaStR/issues/157) -- Code examples added for :class:`open_mastr.soap_api.mirror.MaStRMirror` explaining basic use of +- Code examples added for :class:`open_mastr.soap_api.mirror.MaStRMirror` explaining basic use of mirroring 
database [#164](https://github.com/OpenEnergyPlatform/open-MaStR/issues/164) - CSV file reader for MaStR raw data added [#181](https://github.com/OpenEnergyPlatform/open-MaStR/issues/181) @@ -331,27 +331,27 @@ Read more about the details: - The CHANGELOG is now included in the documentation ### Changed -- Download of raw data has entirely been refactored. A - [python wrapper](https://open-mastr.readthedocs.io/en/latest/download.html#mastr-api-wrapper) for querying +- Download of raw data has entirely been refactored. A + [python wrapper](https://open-mastr.readthedocs.io/en/latest/download.html#mastr-api-wrapper) for querying the MaStR API was introduced [#83](https://github.com/OpenEnergyPlatform/open-MaStR/issues/83) -- Based on that, for bulk data download, - [MaStRDownload](https://open-mastr.readthedocs.io/en/latest/download.html#bulk-download) provides handy query +- Based on that, for bulk data download, + [MaStRDownload](https://open-mastr.readthedocs.io/en/latest/download.html#bulk-download) provides handy query functions for power unit data [#86](https://github.com/OpenEnergyPlatform/open-MaStR/issues/86). 
See also [#128](https://github.com/OpenEnergyPlatform/open-MaStR/issues/128) - configuration through config filen in `~/.open-MaStR/config/` with less hard-coded parameters in source files - [#120](https://github.com/OpenEnergyPlatform/open-MaStR/issues/120), + [#120](https://github.com/OpenEnergyPlatform/open-MaStR/issues/120), [#112](https://github.com/OpenEnergyPlatform/open-MaStR/issues/112) - move code into one package named `open_mastr` [#123](https://github.com/OpenEnergyPlatform/open-MaStR/issues/123) - Switch to GitHub Actions for CI instead of Travis [#143](https://github.com/OpenEnergyPlatform/open-MaStR/issues/143) -- Fixed unexpected line breaks during CSV export that corrupted data +- Fixed unexpected line breaks during CSV export that corrupted data [#170](https://github.com/OpenEnergyPlatform/open-MaStR/issues/170) -- Filtering of duplicates in MaStR data (see +- Filtering of duplicates in MaStR data (see `MaStR help `_) got changed to filter units by leading three characters and select only directly entered data [#180](https://github.com/OpenEnergyPlatform/open-MaStR/issues/180) -- Generalize CSV reading function +- Generalize CSV reading function [#188](https://github.com/OpenEnergyPlatform/open-MaStR/issues/188) ### Removed @@ -365,7 +365,7 @@ Read more about the details: - tests - setup.py file - added update function (based on latest timestamp in powerunits csv) -- added wind functions +- added wind functions * only download power units for wind to avoid massive download * changed : process units wind ("one-click solution") - added loop to retry failed power unit downloads, currently one retry diff --git a/CITATION.cff b/CITATION.cff index d496ecf2..6ec75415 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -37,11 +37,11 @@ authors: - family-names: 'KrĂ€mer' given-names: "Kevin" alias: "pt-kkraemer" - affiliation: "ProjectTogether gGmbH" + affiliation: "ProjectTogether gGmbH" title: "open-MaStR" type: software license: AGPL-3.0 version: 0.16.0 
-doi: +doi: date-released: 2025-11-26 url: "https://github.com/OpenEnergyPlatform/open-MaStR/" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8c023f6a..81f6ae8a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -27,7 +27,7 @@ And you don't just have to write code. You can help out by writing documentation, tests, or even by giving feedback about this work. (And yes, that includes giving feedback about the contribution guidelines.) -([Adrienne Friend](https://github.com/adriennefriend/imposter-syndrome-disclaimer) +([Adrienne Friend](https://github.com/adriennefriend/imposter-syndrome-disclaimer) came up with this disclaimer language.) ## Prerequisites @@ -39,22 +39,22 @@ This repository is following the [Contributor Covenant Code of Conduct](https:// Please be self-reflective and always maintain a good culture of discussion and active participation. ### A. Use -Since the open license allows free use, no notification is required. -However, for the authors it is valuable information who uses the software for what purpose. -Indicators are `Watch`, `Fork` and `Starred` of the repository. +Since the open license allows free use, no notification is required. +However, for the authors it is valuable information who uses the software for what purpose. +Indicators are `Watch`, `Fork` and `Starred` of the repository. If you are a user, please add your name and details in [USERS.cff](https://github.com/OpenEnergyPlatform/open-MaStR/blob/production/USERS.cff) by using the [issue template](https://github.com/OpenEnergyPlatform/open-MaStR/issues/new?assignees=&labels=user&projects=&template=user-kudos.md&title=Add+new+user+to+USERS.cff). ### B. Comment -You can give ideas, hints or report bugs in issues, in PR, at meetings or other channels. -This is no development but can be considered a notable contribution. +You can give ideas, hints or report bugs in issues, in PR, at meetings or other channels. +This is no development but can be considered a notable contribution. 
### C. Contribute and Review -You add code and become an author of the repository. +You add code and become an author of the repository. You must follow the workflow! ### D. Mantain and Release -You contribute and take care of the repository. -You review and answer questions. +You contribute and take care of the repository. +You review and answer questions. You coordinate and carry out the release. ## Workflow @@ -62,7 +62,7 @@ The workflow for contributing to this project has been inspired by the workflow ### 1. Describe the issue on GitHub Create [an issue](https://help.github.com/en/articles/creating-an-issue) -in the GitHub repository. +in the GitHub repository. The `issue title` describes the problem you will address.
This is an important step as it forces one to think about the "issue". Make a checklist for all needed steps if possible. @@ -100,10 +100,10 @@ Naming convention for branches: `type`-`issue-nr`-`short-description` The majority of the development will be done in `feature` branches. ##### `issue-nr` -The `issueNumber` should be taken from Step 1. Do not use the "#". +The `issueNumber` should be taken from Step 1. Do not use the "#". ##### `short-description` -Describe shortly what the branch is about. +Describe shortly what the branch is about. Avoid long and short descriptive names for branches, 2-4 words are optimal. ##### Other hints @@ -128,7 +128,7 @@ Check branch status: git status ``` -#### 2.3. Commit your changes +#### 2.3. Commit your changes First, make sure you have the pre-commit hooks installed to have your code automatically checked on commit for programmatic and stylistic errors: ```bash @@ -153,7 +153,7 @@ Write a good `commit message`: - Keep the subject line [shorter than 50 characters](https://chris.beams.io/posts/git-commit/#limit-50) - Do not commit more than a few changes at the time: [atomic commits](https://en.wikipedia.org/wiki/Atomic_commit) - Use [imperative](https://chris.beams.io/posts/git-commit/#imperative) -- Do not end the commit message with a [period](https://chris.beams.io/posts/git-commit/#end) ~~.~~ +- Do not end the commit message with a [period](https://chris.beams.io/posts/git-commit/#end) ~~.~~ - Allways end the commit message with the `issueNumber` including the "#" Examples of commit message: `Added function with some method #42` or `Update documentation for commit messages #1` @@ -194,7 +194,7 @@ If you are the reviewer: - Check the changes in all corresponding files. - Checkout the branch and run code. - Comment if you would like to change something (Use `Request changes`) -- If all tests pass and all changes are good, `Approve` the PR. +- If all tests pass and all changes are good, `Approve` the PR. 
- Leave a comment and some nice words! #### 4.1. Merge the PR diff --git a/LICENSE.md b/LICENSE.md index 9591157b..dbbe3558 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -659,4 +659,3 @@ specific requirements. if any, to sign a "copyright disclaimer" for the program, if necessary. For more information on this, and how to apply and follow the GNU AGPL, see . - diff --git a/README.rst b/README.rst index 80250d3a..ba242dea 100644 --- a/README.rst +++ b/README.rst @@ -27,7 +27,7 @@ open-mastr - |badge_issue_open| |badge_issue_closes| |badge_pr_open| |badge_pr_closes| * - Community - |badge_contributing| |PyPI download month| |Total PyPI downloads| - + .. contents:: :depth: 2 @@ -37,16 +37,16 @@ open-mastr Introduction ============ -The `Marktstammdatenregister (MaStR) `_ is a German register +The `Marktstammdatenregister (MaStR) `_ is a German register provided by the German Federal Network Agency (Bundesnetzagentur / BNetza) that keeps track of all power and gas units located in Germany. The MaStR data can be - + #. browsed and filtered `online `_ #. taken from `daily provided dumps `_ #. be accessed via the `web service `_ -| The python package ``open-mastr`` provides an interface for accessing the data. +| The python package ``open-mastr`` provides an interface for accessing the data. | It contains methods to download and parse the xml files (bulk) and the SOAP web service (API). | In this repository we are developing methods to analyze, validate and enrich the data. | We want to collect and compile post processing scripts to improve data quality. @@ -111,7 +111,7 @@ These projects already use open-mastr: - `EmPowerPlan `_ - `Goal100 Monitor `_ -If you want to see your project in this list, write an +If you want to see your project in this list, write an `Issue `_ or add changes in a `Pull Request `_. @@ -178,7 +178,7 @@ Data .. |badge_contributing| image:: https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat :alt: contributions - + .. 
|PyPI download month| image:: https://img.shields.io/pypi/dm/open-mastr?label=PyPi%20Downloads :target: https://pypistats.org/packages/open-mastr @@ -187,5 +187,3 @@ Data .. |badge_joss| image:: https://joss.theoj.org/papers/dc0d33e7dc74f7233e15a7b6fe0c7a3e/status.svg :target: https://joss.theoj.org/papers/dc0d33e7dc74f7233e15a7b6fe0c7a3e - - diff --git a/USERS.cff b/USERS.cff index 0527701d..6d71b187 100644 --- a/USERS.cff +++ b/USERS.cff @@ -1,8 +1,8 @@ cff-version: 1.2.0 message: "If you use this software, please leave your name for reference. authors: - - family-names: - given-names: - alias: + - family-names: + given-names: + alias: affiliation: Reiner Lemoine Institut orcid: diff --git a/docs/_data/raw_data.rst b/docs/_data/raw_data.rst index 274e1be2..912e7441 100644 --- a/docs/_data/raw_data.rst +++ b/docs/_data/raw_data.rst @@ -1,24 +1,24 @@ Data Description ===================== -In the following, we will describe the data retrieved from the MaStR database. The two figures showing the data model are +In the following, we will describe the data retrieved from the MaStR database. The two figures showing the data model are taken from `here `_. .. figure:: /images/ObjektmodellMaStR.png :width: 70% :align: center - + Overview of MaStR data model and its sub-categories. As can be seen from the first figure, the MaStR can be divided into three sub-categories: Actor (Marktakteur), grid (Netz), -and unit (Einheit). Since the technological units are the core of the MaStR, their data model is shown in the next figure -with a higher level of detail. +and unit (Einheit). Since the technological units are the core of the MaStR, their data model is shown in the next figure +with a higher level of detail. .. figure:: /images/DetailAnlagenModellMaStR.png :width: 90% :align: center - + Overview of MaStR data model with a focus on the electricity and gas units. 
@@ -97,5 +97,3 @@ storage :file: raw/bnetza_mastr_storage_raw.csv :widths: 20, 35, 15, 15 :header-rows: 1 - - diff --git a/docs/_static/custom.css b/docs/_static/custom.css index d55f0e15..9c5cfc20 100644 --- a/docs/_static/custom.css +++ b/docs/_static/custom.css @@ -4,4 +4,4 @@ :not([data-md-state="blur"]) + nav { display: none; -} \ No newline at end of file +} diff --git a/docs/advanced.md b/docs/advanced.md index e9471f1a..7a89c19e 100644 --- a/docs/advanced.md +++ b/docs/advanced.md @@ -1,4 +1,4 @@ -For most users, the functionalites described in [Getting Started](getting_started.md) are sufficient. If you want +For most users, the functionalites described in [Getting Started](getting_started.md) are sufficient. If you want to examine how you can configure the package's behavior for your own needs, check out [Configuration](#configuration). Or you can explore the two main functionalities of the package, namely the [Bulk Download](#bulk-download) or the [SOAP API download](#soap-api-download). @@ -53,8 +53,8 @@ The project home directory is structured as follows (files and folders below `da └── logs └── open_mastr.log ``` - - + + * **config** * `credentials.cfg`
Credentials used to access @@ -71,7 +71,7 @@ The project home directory is structured as follows (files and folders below `da Contains the sqlite database in `open-mastr.db` * `xml_download`
Contains the bulk download in `Gesamtdatenexport_<date>.zip`
- New bulk download versions overwrite older versions. + New bulk download versions overwrite older versions. * **logs** * `open_mastr.log`
The files stores the logging information from executing open-mastr. @@ -99,7 +99,7 @@ or adjusting it manually in your code. E.g. to enable `DEBUG` messages in `open_ ### Data -If the zipped dump of the MaStR is downloaded, it is saved in the folder `$HOME/.open-MaStR/data/xml_download`. +If the zipped dump of the MaStR is downloaded, it is saved in the folder `$HOME/.open-MaStR/data/xml_download`. The data can then be written to any sql database supported by [sqlalchemy](https://docs.sqlalchemy.org/). The type of the sql database is determined by the parameter `engine` in the [Mastr][open_mastr.Mastr] class. @@ -122,8 +122,8 @@ There are some environment variables to customize open-MaStR: ## Bulk download On the homepage [MaStR/Datendownload](https://www.marktstammdatenregister.de/MaStR/Datendownload) a zipped folder containing the whole -MaStR is offered. The data is delivered as xml-files. The official documentation can be found -on the same page (in german). This data is updated on a daily base. +MaStR is offered. The data is delivered as xml-files. The official documentation can be found +on the same page (in german). This data is updated on a daily base. ``` mermaid flowchart LR @@ -143,14 +143,14 @@ flowchart LR ``` -In the following, the process is described that is started when calling the [`Mastr.download`][open_mastr.Mastr.download] function with the parameter `method`="bulk". +In the following, the process is described that is started when calling the [`Mastr.download`][open_mastr.Mastr.download] function with the parameter `method`="bulk". First, the zipped files are downloaded and saved in `$HOME/.open-MaStR/data/xml_download`. The zipped folder contains many xml files, which represent the different tables from the MaStR. Those tables are then parsed to a sqlite database. If only some specific tables are of interest, they can be specified with the parameter `data`. 
Every table that is selected in `data` will be deleted from the local database, if existent, and then filled with data from the xml files. In the next step, a basic data cleansing is performed. Many entries in the MaStR from the bulk download are replaced by numbers. -As an example, instead of writing the german states where the unit is registered (Saxony, Brandenburg, Bavaria, ...) the MaStR states -corresponding digits (7, 2, 9, ...). One major step of cleansing is therefore to replace those digits with their original meaning. +As an example, instead of writing the german states where the unit is registered (Saxony, Brandenburg, Bavaria, ...) the MaStR states +corresponding digits (7, 2, 9, ...). One major step of cleansing is therefore to replace those digits with their original meaning. Moreover, the datatypes of different entries are set in the data cleansing process and corrupted files are repaired. If needed, the tables in the database can be obtained as csv files. Those files are created by first merging corresponding tables (e.g all tables that contain information about solar) and then dumping those tables to `.csv` files with the [`to_csv`][open_mastr.Mastr.to_csv] method. @@ -178,7 +178,7 @@ via its API a [registration](https://www.marktstammdatenregister.de/MaStRHilfe/f To download data from the MaStR API using the `open-MaStR`, the credentials (MaStR user and token) need to be provided in a certain way. Three options exist: -1. **Credentials file:** +1. **Credentials file:** Both, user and token, are stored in plain text in the credentials file. For storing the credentials in the credentials file (plus optionally using keyring for the token) simply instantiate [`MaStRDownload`][open_mastr.soap_api.download.MaStRDownload] once and you get asked for a user name and a token. 
The @@ -196,13 +196,13 @@ To download data from the MaStR API using the `open-MaStR`, the credentials (MaS The credentials file needs to be stored at: `$HOME/.open-MaStR/config/credentials.cfg` -2. **Credentials file + keyring:** +2. **Credentials file + keyring:** The user is stored in the credentials file, while the token is stored encrypted in the [keyring](https://pypi.org/project/keyring/). Read in the documentation of the keyring library how to store your token in the keyring. -3. **Don't store:** +3. **Don't store:** Just use the password for one query and forget it The latter option is only available when using [`MaStRAPI`][open_mastr.soap_api.download.MaStRAPI]. @@ -217,7 +217,7 @@ To download data from the MaStR API using the `open-MaStR`, the credentials (MaS ### MaStRAPI -You can access the MaStR data via API by using the class `MaStRAPI` directly if you have the API credentials +You can access the MaStR data via API by using the class `MaStRAPI` directly if you have the API credentials configured correctly. Use the code snippet below for queries. @@ -242,8 +242,8 @@ For API calls, models and optional parameters refer to the ???+ example "Example queries and their responses (for model 'Anlage')" === "mastr_api.GetLokaleUhrzeit()" - - Response: + + Response: ```python { 'Ergebniscode': 'OK', @@ -257,12 +257,12 @@ For API calls, models and optional parameters refer to the API function name: `GetLokaleUhrzeit`
Example query: `mastr_api.GetLokaleUhrzeit()`
Parameter: `None` - - + + === "mastr_api.GetListeAlleEinheiten(limit=1)" - - Response: + + Response: ```python { "Ergebniscode": "OkWeitereDatenVorhanden", @@ -297,7 +297,7 @@ For API calls, models and optional parameters refer to the API function name: `GetEinheitSolar`
Example query: `mastr_api.GetListeAlleEinheiten(limit=1)` - + | Parameter | Description | |------------------------|-----------------------------------------------------------------------------------------------------------| | marktakteurMastrNummer | The MaStR number of the requested unit | @@ -305,10 +305,10 @@ For API calls, models and optional parameters refer to the | datumAb | Restrict the amount of data to be retrieved to changed data from the specified date [Default value: NULL] | | limit | Limit of the maximum data records to be delivered [default/maximum value: maximum of own limit] | | einheitMastrNummern[] | | - + === "mastr_api.GetEinheitSolar(einheitMastrNummer="SEE984033548619")" - - Response: + + Response: ```python { "Ergebniscode": "OK", @@ -405,37 +405,37 @@ For API calls, models and optional parameters refer to the |--------------------------|-------------------------------------------------------------------| | `apiKey` | The web service key for validation | | `marktakteurMastrNummer` | The MaStR number of the market actor used by the web service user | - | `einheitMastrNummer` | The MaStR number of the requested unit | + | `einheitMastrNummer` | The MaStR number of the requested unit | ??? note "Why can't I just query all information of all units of a specific power plant type?" - As the example queries above demonstrate, the API is structured so that units of power plants types (e.g. wind - turbine, solar PV systems, gas power plant) have to be queried directly by their unique identifier ( - `EinheitMastrNummer"`) and a distinct API query. To download all unit information of a specific power plant + As the example queries above demonstrate, the API is structured so that units of power plants types (e.g. wind + turbine, solar PV systems, gas power plant) have to be queried directly by their unique identifier ( + `EinheitMastrNummer"`) and a distinct API query. 
To download all unit information of a specific power plant you need to know the "EinheitMastrNummer".
- Firstly, by querying for all units with `mastr_api.GetListeAlleEinheiten()` you'll get all units, their unique - identifier (`EinheitMastrNummer`) and their power plant type (`Einheitentyp`). You can then sort them by power + Firstly, by querying for all units with `mastr_api.GetListeAlleEinheiten()` you'll get all units, their unique + identifier (`EinheitMastrNummer`) and their power plant type (`Einheitentyp`). You can then sort them by power plant type and use the power plant type specific API query to retrieve information about it.
Cumbersome?
- Luckily, `open-MaStR` has you covered and provides methods to just query for all units of a power - plant type. + Luckily, `open-MaStR` has you covered and provides methods to just query for all units of a power + plant type. ### MaStRDownload -The class `MaStRDownload` builds upon methods provided in the class `MaStRAPI`.
+The class `MaStRDownload` builds upon methods provided in the class `MaStRAPI`.
-It provides methods to download power plant unit types and additional information -for each unit type, such as extended unit data, permit data, chp-specific data, location data +It provides methods to download power plant unit types and additional information +for each unit type, such as extended unit data, permit data, chp-specific data, location data or eeg-specific data.
-The class handles the querying logic and knows which additional data for each unit type is available -and which SOAP service has to be used to query it. +The class handles the querying logic and knows which additional data for each unit type is available +and which SOAP service has to be used to query it. ### MaStRMirror @@ -444,8 +444,5 @@ The class `MaStRMirror` builds upon methods provided in the class `MaStRDownload The aim of the class has been to mirror the Marktstammdatenregister database and keep it up-to-date. Historically, `open-mastr` has been developed before the owner of the dataset, BNetzA, offered the `bulk` download. -The class can still be used for use-cases where only the most recent changes to a local database are of interest. +The class can still be used for use-cases where only the most recent changes to a local database are of interest. For downloading the entire MaStR database we recommend the bulk download functionalities by specifying `donwload(method="bulk")`. 
- - - diff --git a/docs/data-release-notes.md b/docs/data-release-notes.md index e59b599e..1771cfab 100644 --- a/docs/data-release-notes.md +++ b/docs/data-release-notes.md @@ -12,20 +12,15 @@ - Code version [v0.9.0](https://github.com/OpenEnergyPlatform/open-MaStR/releases/tag/v0.9.0) - Includes: wind; hydro; biomass - new power-unit download: true -#### dataversion-2.1.2 +#### dataversion-2.1.2 - Code version [v0.9.0](https://github.com/OpenEnergyPlatform/open-MaStR/releases/tag/v0.9.0) - Includes: wind; hydro; biomass #### dataversion-1.5 - Includes: wind-permits; storages; solar; basic postprocessing -#### dataversion-1.4 +#### dataversion-1.4 - Includes: permit data for wind and updated metadata #### dataversion-1.3 #### dataversion-1.2 #### dataversion-1.1 #### dataversion-1.0 Test version - - - - - diff --git a/docs/dataset.md b/docs/dataset.md index 2063cdf0..19f48c45 100644 --- a/docs/dataset.md +++ b/docs/dataset.md @@ -28,7 +28,7 @@ As you may have noticed, we distinguish between `bulk` and `API` datasets. The ` ## Tables in the database !!! question "Confused by all the tables?" - :sparkles: We regularly run the whole download and cleansing pipeline and upload the dataset as csv files at [zenodo](https://doi.org/10.5281/zenodo.6807425)! + :sparkles: We regularly run the whole download and cleansing pipeline and upload the dataset as csv files at [zenodo](https://doi.org/10.5281/zenodo.6807425)! After downloading the MaStR, you will find a database with a large number of tables. Here we give a brief overview of what you can find in those tables: @@ -39,33 +39,33 @@ After downloading the MaStR, you will find a database with a large number of tab The main information about the different technologies lies in the `_extended` tables. You can find the capacity, location, and other technology-specific attributes here. 
| Table name | Comments | - |------|------| + |------|------| | biomass_extended | | - | combustion_extended | *Conventional powerplants: Gas, Oil, Coal, ...* | + | combustion_extended | *Conventional powerplants: Gas, Oil, Coal, ...* | | gsgk_extended | *gsgk is short for: Geothermal, Mine gas, and Pressure relaxation* | | hydro_extended | | - | nuclear_extended | | + | nuclear_extended | | | solar_extended | | - | storage_extended | | + | storage_extended | | | wind_extended | | === "_eeg tables" In germany, renewable energies were subsidized by the state - according to a law called 'EEG'. Relevant information like the 'EEG ID' are in the `_eeg` tables. | Table name | Comments | - |------|------| + |------|------| | biomass_eeg | | | gsgk_eeg | *gsgk is short for: Geothermal, Mine gas, and Pressure relaxation* | | hydro_eeg | | | solar_eeg | | - | storage_eeg | | + | storage_eeg | | | wind_eeg | | === "Other tables" Other tables contain information about the grid, the energy market, or gas consumers and producers: | Table name | Comments | - |------|------| + |------|------| | balancing_area | *Related to the energy market* | | changed_dso_assignment | *Units where the DSO responsibility changed* | | electricity_consumer | *Only large consumers* | @@ -86,7 +86,7 @@ After downloading the MaStR, you will find a database with a large number of tab ### MaStR data model -A useful overview of the MaStR data model can be found [here (in german)](https://www.marktstammdatenregister.de/MaStRHilfe/files/webdienst/Objektmodell%20-%20Fachliche%20Ansicht%20V1.2.0.pdf). A translated version using the names from the tables you can find in your local database is presented here: +A useful overview of the MaStR data model can be found [here (in german)](https://www.marktstammdatenregister.de/MaStRHilfe/files/webdienst/Objektmodell%20-%20Fachliche%20Ansicht%20V1.2.0.pdf). 
A translated version using the names from the tables you can find in your local database is presented here: === "translated image (english)" ![Data model of the MaStR](images/DetailAnlagen_english.PNG) @@ -98,7 +98,7 @@ A useful overview of the MaStR data model can be found [here (in german)](https: ## Tables as CSV Tables from the database can be exported to csv files. By default, all available power plant unit data will be exported -to csv files. +to csv files. For exported csv's additional available data is joined on basic unit data. For example: For biomass power plants one csv -is exported consisting of the join of four database tables (unit data, chp data, permit data, eeg data). We regularly run the whole download and cleansing pipeline and upload the dataset as csv files at [zenodo](https://doi.org/10.5281/zenodo.6807425). +is exported consisting of the join of four database tables (unit data, chp data, permit data, eeg data). We regularly run the whole download and cleansing pipeline and upload the dataset as csv files at [zenodo](https://doi.org/10.5281/zenodo.6807425). 
diff --git a/docs/development/changelog_mirror.md b/docs/development/changelog_mirror.md index a1df8959..55f67f37 100644 --- a/docs/development/changelog_mirror.md +++ b/docs/development/changelog_mirror.md @@ -4,4 +4,4 @@ hide: --- {% include-markdown "../../CHANGELOG.md" -%} \ No newline at end of file +%} diff --git a/docs/development/contributing_mirror.md b/docs/development/contributing_mirror.md index 66369d0c..386cf55c 100644 --- a/docs/development/contributing_mirror.md +++ b/docs/development/contributing_mirror.md @@ -5,4 +5,4 @@ hide: {% include-markdown "../../CONTRIBUTING.md" -%} \ No newline at end of file +%} diff --git a/docs/getting_started.md b/docs/getting_started.md index 891efbbe..79d579ac 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -50,7 +50,7 @@ More detailed information can be found in the section [bulk download](advanced.m API download ----------------------------------- -When using `download(method="API")`, the data is retrieved from the MaStR API. For using the MaStR API, credentials +When using `download(method="API")`, the data is retrieved from the MaStR API. For using the MaStR API, credentials are needed (see [SOAP API download](advanced.md#soap-api-download)). ```python @@ -64,7 +64,7 @@ The default settings will save retrieved data into the sqlite database. The func ## Accessing the database -For accessing and working with the MaStR database after you have downloaded it, you can use sqlite browsers +For accessing and working with the MaStR database after you have downloaded it, you can use sqlite browsers such as [DB Browser for SQLite](https://sqlitebrowser.org/) or any python module which can process sqlite data. Pandas, for example, comes with the function [read_sql](https://pandas.pydata.org/docs/reference/api/pandas.read_sql.html). @@ -94,4 +94,4 @@ additional tables are mirrored from database to csv as they are. 
To export the d tables=["wind", "grids"] db.to_csv(tables) -``` \ No newline at end of file +``` diff --git a/docs/images/MaStR_Mirror.svg b/docs/images/MaStR_Mirror.svg index 4d34d54e..a7d0f132 100644 --- a/docs/images/MaStR_Mirror.svg +++ b/docs/images/MaStR_Mirror.svg @@ -864,4 +864,4 @@ id="tspan1316" x="568.73059" y="58.005318">data for export - \ No newline at end of file + diff --git a/docs/images/MaStR_downloading.svg b/docs/images/MaStR_downloading.svg index 15e332e8..838a2f6c 100644 --- a/docs/images/MaStR_downloading.svg +++ b/docs/images/MaStR_downloading.svg @@ -239,4 +239,4 @@ inkscape:connector-curvature="0" id="path110-0" style="fill:#000000;fill-opacity:1;fill-rule:evenodd;stroke:none" - d="m 0,0 v -16.538 c 3.429,0.806 5.31,1.135 8.425,1.257 -4.792,-1.913 -7.62,-4.162 -9.938,-7.005 -2.42,2.843 -4.992,5.196 -9.934,7.005 2.866,-0.04 4.968,-0.436 8.193,-1.232 V 0 Z" /> \ No newline at end of file + d="m 0,0 v -16.538 c 3.429,0.806 5.31,1.135 8.425,1.257 -4.792,-1.913 -7.62,-4.162 -9.938,-7.005 -2.42,2.843 -4.992,5.196 -9.934,7.005 2.866,-0.04 4.968,-0.436 8.193,-1.232 V 0 Z" /> diff --git a/docs/index.md b/docs/index.md index cb644717..6bb42a3d 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,34 +1,34 @@ # Introduction -The [Marktstammdatenregister (MaStR)](https://www.marktstammdatenregister.de/MaStR) is a German register +The [Marktstammdatenregister (MaStR)](https://www.marktstammdatenregister.de/MaStR) is a German register provided by the German Federal Network Agency (Bundesnetzagentur / BNetza) that keeps track of all power and gas units located in Germany. It is a daily growing dataset with more than 8.2 million data points covering electricity and gas production units, electricity and gas consumers, storages, grids, and energy market participants (as of spring 2024). Generally, the MaStR data can be accessed via various options: - + 1. browse, filter and download [online](https://www.marktstammdatenregister.de/MaStR) 1. 
download [daily provided dumps](https://www.marktstammdatenregister.de/MaStR/Datendownload) 1. access via the [web service](https://www.marktstammdatenregister.de/MaStRHilfe/subpages/webdienst.html) -The python package `open-mastr` provides an interface for accessing the data and contributes to improving the -usability of the access options above. This repository is intended for people who wish to simply work with the +The python package `open-mastr` provides an interface for accessing the data and contributes to improving the +usability of the access options above. This repository is intended for people who wish to simply work with the MaStR data and do not want to deal with the individual obstacles to data access of the three options above.

-It facilitates access to the daily provided MaStR dumps with download methods (bulk) and by -parsing the XML files to a relational database. Furthermore, the software provides a Python wrapper to access the MaStR +It facilitates access to the daily provided MaStR dumps with download methods (bulk) and by +parsing the XML files to a relational database. Furthermore, the software provides a Python wrapper to access the MaStR SOAP web service (API). ## Benefits provided by `open-mastr` -Benefit | Description -------- | ------ -Data download and parsing | Download, decode, and write data to a local database -Translation to English | Translate table names and columns from German to English as well as an English documentation page of the dataset +Benefit | Description +------- | ------ +Data download and parsing | Download, decode, and write data to a local database +Translation to English | Translate table names and columns from German to English as well as an English documentation page of the dataset Data processing | Merge relevant information about different technologies to single csv files !!! question "Just here for the data?" - :sparkles: We regularly run the whole download and cleansing pipeline and upload the dataset as csv files at [zenodo](https://doi.org/10.5281/zenodo.6807425)! + :sparkles: We regularly run the whole download and cleansing pipeline and upload the dataset as csv files at [zenodo](https://doi.org/10.5281/zenodo.6807425)! 
## License The original dataset is licensed under the **Datenlizenz Deutschland – Namensnennung – Version 2.0** (DL-DE-BY-2.0) -[Marktstammdatenregister](https://www.marktstammdatenregister.de/MaStR) - © Bundesnetzagentur fĂŒr ElektrizitĂ€t, Gas, Telekommunikation, Post und Eisenbahnen | [DL-DE-BY-2.0](https://www.govdata.de/dl-de/by-2-0) \ No newline at end of file +[Marktstammdatenregister](https://www.marktstammdatenregister.de/MaStR) - © Bundesnetzagentur fĂŒr ElektrizitĂ€t, Gas, Telekommunikation, Post und Eisenbahnen | [DL-DE-BY-2.0](https://www.govdata.de/dl-de/by-2-0) diff --git a/docs/mastr_structure.graphml b/docs/mastr_structure.graphml index f8db7182..7eeee261 100644 --- a/docs/mastr_structure.graphml +++ b/docs/mastr_structure.graphml @@ -820,9 +820,9 @@ GrubenKlaerschlamm energietraeger -'AndereGase' or 'Braunkohle' or -'Erdgas' or 'Mineraloelprodukte' or -'NichtBiogenerAbfall' or +'AndereGase' or 'Braunkohle' or +'Erdgas' or 'Mineraloelprodukte' or +'NichtBiogenerAbfall' or 'Steinkohle' or 'Waerme' diff --git a/docs/reference/basic.md b/docs/reference/basic.md index bf47274f..fad23e1a 100644 --- a/docs/reference/basic.md +++ b/docs/reference/basic.md @@ -1,4 +1,2 @@ # Basic functions ::: open_mastr.Mastr - - diff --git a/docs/requirements.txt b/docs/requirements.txt index 47cb3193..e6dce0e7 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,3 @@ mkdocstrings[python] mkdocs-material mkdocs-include-markdown-plugin - diff --git a/main.py b/main.py index 88730b4e..8166f17d 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- """ open-MaStR - Main file @@ -11,7 +10,6 @@ """ from open_mastr import Mastr -import os ## specify download parameter @@ -96,6 +94,6 @@ ## export to csv """ Technology-related tables are exported as joined, whereas additional tables - are duplicated as they are in the database. + are duplicated as they are in the database. 
""" db.to_csv() diff --git a/mkdocs.yml b/mkdocs.yml index 82276c0f..0ad97def 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,5 +1,5 @@ site_name: open-mastr -theme: +theme: name: material features: - navigation.tabs @@ -21,7 +21,7 @@ theme: # Palette toggle for light mode - scheme: default toggle: - icon: material/brightness-7 + icon: material/brightness-7 name: Switch to dark mode icon: repo: fontawesome/brands/github @@ -42,7 +42,7 @@ plugins: docstring_style: numpy docstring_section_style: spacy - include-markdown - + watch: - open_mastr @@ -78,7 +78,7 @@ nav: - Reference: - Basic Usage: reference/basic.md - Advanced Usage of the MaStR SOAP-API: reference/advanced.md - - Development: + - Development: - Contributing: development/contributing_mirror.md - Changelog: development/changelog_mirror.md diff --git a/open_mastr/mastr.py b/open_mastr/mastr.py index 1be39e2f..eac0ff68 100644 --- a/open_mastr/mastr.py +++ b/open_mastr/mastr.py @@ -1,47 +1,48 @@ import os -from sqlalchemy import inspect, create_engine -# import xml dependencies -from open_mastr.xml_download.utils_download_bulk import ( - download_xml_Mastr, - delete_xml_files_not_from_given_date, -) -from open_mastr.xml_download.utils_write_to_database import ( - write_mastr_xml_to_database, -) +from sqlalchemy import create_engine, inspect + +import open_mastr.utils.orm as orm # import soap_API dependencies from open_mastr.soap_api.mirror import MaStRMirror - -from open_mastr.utils.helpers import ( - print_api_settings, - validate_api_credentials, - validate_parameter_format_for_download_method, - validate_parameter_format_for_mastr_init, - validate_parameter_data, - transform_data_parameter, - parse_date_string, - transform_date_parameter, - data_to_include_tables, - create_db_query, - db_query_to_csv, - reverse_fill_basic_units, - delete_zip_file_if_corrupted, - create_database_engine, - rename_table, - create_translated_database_engine, -) from open_mastr.utils.config import ( create_data_dir, 
get_data_version_dir, - get_project_home_dir, get_output_dir, + get_project_home_dir, setup_logger, ) -import open_mastr.utils.orm as orm # constants -from open_mastr.utils.constants import TECHNOLOGIES, ADDITIONAL_TABLES +from open_mastr.utils.constants import ADDITIONAL_TABLES, TECHNOLOGIES +from open_mastr.utils.helpers import ( + create_database_engine, + create_db_query, + create_translated_database_engine, + data_to_include_tables, + db_query_to_csv, + delete_zip_file_if_corrupted, + parse_date_string, + print_api_settings, + rename_table, + reverse_fill_basic_units, + transform_data_parameter, + transform_date_parameter, + validate_api_credentials, + validate_parameter_data, + validate_parameter_format_for_download_method, + validate_parameter_format_for_mastr_init, +) + +# import xml dependencies +from open_mastr.xml_download.utils_download_bulk import ( + delete_xml_files_not_from_given_date, + download_xml_Mastr, +) +from open_mastr.xml_download.utils_write_to_database import ( + write_mastr_xml_to_database, +) # setup logger log = setup_logger() @@ -200,7 +201,6 @@ def download( "location_elec_generation", "location_elec_consumption", "location_gas_generation", "location_gas_consumption". Defaults to all. """ - if self.is_translated: raise TypeError( "You are currently connected to a translated database.\n" @@ -320,7 +320,7 @@ def to_csv( If 'tables=None' all possible tables will be exported. Parameters - ------------ + ---------- tables: None or list For exporting selected tables choose from: ["wind", "solar", "biomass", "hydro", "gsgk", "combustion", "nuclear", "storage", @@ -334,7 +334,6 @@ def to_csv( limit: None or int Limits the number of exported data rows. 
""" - if self.is_translated: raise TypeError( "You are currently connected to a translated database.\n" @@ -425,14 +424,13 @@ def translate(self) -> None: ``` """ - if "sqlite" not in self.engine.dialect.name: raise ValueError("engine has to be of type 'sqlite'") if self.is_translated: raise TypeError("The currently connected database is already translated.") inspector = inspect(self.engine) - old_path = r"{}".format(self.engine.url.database) + old_path = rf"{self.engine.url.database}" new_path = old_path[:-3] + "-translated.db" if os.path.exists(new_path): diff --git a/open_mastr/utils/config.py b/open_mastr/utils/config.py index 40f67ec8..88506b73 100644 --- a/open_mastr/utils/config.py +++ b/open_mastr/utils/config.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- """ @@ -17,21 +16,21 @@ __issue__ = "https://github.com/OpenEnergyPlatform/examples/issues/52" __version__ = "v0.10.0" +import logging +import logging.config import os -import yaml -import shutil import pathlib +import shutil from datetime import date -import logging -import logging.config +import yaml + from open_mastr.utils.constants import ( - TECHNOLOGIES, - API_LOCATION_TYPES, ADDITIONAL_TABLES, + API_LOCATION_TYPES, + TECHNOLOGIES, ) - log = logging.getLogger(__name__) @@ -46,7 +45,6 @@ def get_project_home_dir(): path-like object Absolute path to root dir of open-MaStR project home """ - return os.path.join(os.path.expanduser("~"), ".open-MaStR") @@ -58,7 +56,6 @@ def get_output_dir(): path-like object Absolute path to output path """ - if "OUTPUT_PATH" in os.environ: return os.environ.get("OUTPUT_PATH") @@ -110,10 +107,9 @@ def get_data_config(): str dataversion """ - today = date.today() - data_config = f'dataversion-{today.strftime("%Y-%m-%d")}' + data_config = f"dataversion-{today.strftime('%Y-%m-%d')}" return data_config @@ -158,13 +154,11 @@ def create_data_dir(): The directory that is created for this fata version can be returned by :func:`~.get_data_version_dir`. 
""" - os.makedirs(get_data_version_dir(), exist_ok=True) def _filenames_generator(): """Write default file names .yml to project home dir""" - filenames_file = os.path.join(get_project_home_dir(), "config", "filenames.yml") # How files are prefixed @@ -207,7 +201,6 @@ def _filenames_generator(): for section, section_filenames in filenames_template.items(): filenames[section] = {} for tech in TECHNOLOGIES: - # Files for all technologies files = ["joined", "basic", "extended", "extended_fail"] @@ -264,7 +257,6 @@ def setup_project_home(): Create PROJECTHOME returned by :func:`~.get_project_home_dir`. In addition, default config files are copied to `PROJECTHOME/config/`. """ - # Create directory structure of project home dir create_project_home_dir() @@ -280,7 +272,6 @@ def setup_logger(): logging.Logger Logger with two handlers: console and file. """ - # Read logging config with open( os.path.join(get_project_home_dir(), "config", "logging.yml") diff --git a/open_mastr/utils/credentials.py b/open_mastr/utils/credentials.py index ee818828..7dfbd836 100644 --- a/open_mastr/utils/credentials.py +++ b/open_mastr/utils/credentials.py @@ -14,18 +14,18 @@ __issue__ = "https://github.com/OpenEnergyPlatform/examples/issues/83" __version__ = "v0.10.0" -import os import configparser as cp -from open_mastr.utils.config import get_project_home_dir +import logging +import os + import keyring -import logging +from open_mastr.utils.config import get_project_home_dir log = logging.getLogger(__name__) def _load_config_file(): - config_file = os.path.join(get_project_home_dir(), "config", "credentials.cfg") cfg = cp.ConfigParser() @@ -78,7 +78,6 @@ def get_mastr_user(): def check_and_set_mastr_user(): """Checks if MaStR user is stored, otherwise asks for it.""" - user = get_mastr_user() if not user: @@ -88,9 +87,8 @@ def check_and_set_mastr_user(): cfg = _load_config_file() user = input( - "\n\nCannot not find a MaStR user name in {config_file}.\n\n" + f"\n\nCannot not find a MaStR 
user name in {credentials_file}.\n\n" "Please enter MaStR-ID (pattern: SOM123456789012): " - "".format(config_file=credentials_file) ) cfg["MaStR"] = {"user": user} @@ -116,7 +114,6 @@ def get_mastr_token(user): ------- str : Token (password) """ - # Try to get password from keyring keyring.get_keyring() # Retrieving password from keyring does currently fail on headless systems @@ -146,7 +143,6 @@ def get_mastr_token(user): def check_and_set_mastr_token(user): """Checks if MaStR token is stored, otherwise asks for it.""" - password = get_mastr_token(user) if not password: @@ -158,13 +154,11 @@ def check_and_set_mastr_token(user): # If also no password in credentials file, ask the user to input password # Two options: (1) storing in keyring; (2) storing in config file password = input( - "\n\nCannot not find a MaStR password, neither in keyring nor in {config_file}.\n\n" + f"\n\nCannot not find a MaStR password, neither in keyring nor in {credentials_file}.\n\n" "Please enter a valid access token of a role (Benutzerrolle) " - "associated to the user {user}.\n" + f"associated to the user {user}.\n" "The token might look like: " - "koo5eixeiQuoi'w8deighai8ahsh1Ha3eib3coqu7ceeg%ies...\n".format( - config_file=credentials_file, user=user - ) + "koo5eixeiQuoi'w8deighai8ahsh1Ha3eib3coqu7ceeg%ies...\n" ) # let the user decide where to store the password diff --git a/open_mastr/utils/helpers.py b/open_mastr/utils/helpers.py index 9ac2492b..9b83e788 100644 --- a/open_mastr/utils/helpers.py +++ b/open_mastr/utils/helpers.py @@ -1,5 +1,5 @@ -import os import json +import os import sys from contextlib import contextmanager from datetime import date, datetime @@ -7,35 +7,34 @@ from zipfile import BadZipfile, ZipFile import dateutil +import pandas as pd import sqlalchemy -from sqlalchemy.sql import insert, literal_column, text from dateutil.parser import parse from sqlalchemy import create_engine from sqlalchemy.orm import Query, sessionmaker - -import pandas as pd +from 
sqlalchemy.sql import insert, literal_column, text from tqdm import tqdm + +from open_mastr.soap_api.download import MaStRAPI, log from open_mastr.soap_api.metadata.create import create_datapackage_meta_json from open_mastr.utils import orm from open_mastr.utils.config import ( - get_filenames, - get_data_version_dir, column_renaming, + get_data_version_dir, + get_filenames, ) - -from open_mastr.soap_api.download import MaStRAPI, log from open_mastr.utils.constants import ( - BULK_DATA, - TECHNOLOGIES, + ADDITIONAL_TABLES, API_DATA, API_DATA_TYPES, API_LOCATION_TYPES, - BULK_INCLUDE_TABLES_MAP, BULK_ADDITIONAL_TABLES_CSV_EXPORT_MAP, + BULK_DATA, + BULK_INCLUDE_TABLES_MAP, ORM_MAP, - UNIT_TYPE_MAP, - ADDITIONAL_TABLES, + TECHNOLOGIES, TRANSLATIONS, + UNIT_TYPE_MAP, ) @@ -303,7 +302,7 @@ def transform_date_parameter(self, method, date, **kwargs): if date == "existing": log.warning( """ - The date parameter 'existing' is deprecated and will be removed in the future. + The date parameter 'existing' is deprecated and will be removed in the future. The date parameter is set to `today`. If this change causes problems for you, please comment in this issue on github: @@ -384,12 +383,14 @@ def data_to_include_tables(data: list, mapping: str = None) -> list: Convert user input 'data' to the list 'include_tables'. It contains file names from zipped bulk download, if mapping="write_xml". It contains database table names, if mapping="export_db_tables". + Parameters ---------- data: list The user input for data selection mapping: str Specify the mapping dict for the function and thus the list output. + Returns ------- list @@ -465,7 +466,6 @@ def create_db_query( chunksize: int or None Defines the chunksize of the tables export. Default to 500.000 which is roughly 2.5 GB. 
""" - renaming = column_renaming() unit_type_map_reversed = reverse_unit_type_map() @@ -606,7 +606,6 @@ def reverse_fill_basic_units(technology=None, engine=None): technology: list of str Available technologies are in open_mastr.Mastr.to_csv() """ - with session_scope(engine=engine) as session: # Empty the basic_units table, because it will be filled entirely from extended tables session.query(getattr(orm, "BasicUnit", None)).delete() @@ -779,14 +778,13 @@ def create_translated_database_engine(engine, folder_path) -> sqlalchemy.engine. Check if translated version of the database, as defined with engine parameter, exists. Return sqlite engine connected with the translated database. """ - if engine == "sqlite": db_path = os.path.join(folder_path, "open-mastr-translated.db") else: if "sqlite" not in engine.dialect.name: raise ValueError("engine has to be of type 'sqlite'") - prev_path = r"{}".format(engine.url.database) + prev_path = rf"{engine.url.database}" engine.dispose() db_path = prev_path[:-3] + "-translated.db" diff --git a/open_mastr/utils/orm.py b/open_mastr/utils/orm.py index 667b415e..240eb621 100644 --- a/open_mastr/utils/orm.py +++ b/open_mastr/utils/orm.py @@ -1,23 +1,23 @@ -from sqlalchemy.orm import DeclarativeBase from sqlalchemy import ( + JSON, + Boolean, Column, - Integer, - String, + Date, + DateTime, Float, + Integer, Sequence, - DateTime, - Boolean, + String, func, - Date, - JSON, ) +from sqlalchemy.orm import DeclarativeBase class Base(DeclarativeBase): pass -class ParentAllTables(object): +class ParentAllTables: DatenQuelle = Column(String) DatumDownload = Column(Date) @@ -72,7 +72,7 @@ class MissedAdditionalData(Base): download_date = Column(DateTime(timezone=True), default=func.now()) -class Extended(object): +class Extended: NetzbetreiberMastrNummer = Column(String) Registrierungsdatum = Column(Date) EinheitMastrNummer = Column(String, primary_key=True) @@ -287,7 +287,7 @@ class StorageExtended(Extended, ParentAllTables, Base): 
GemeinsamRegistrierteSolareinheitMastrNummer = Column(String) -class Eeg(object): +class Eeg: Registrierungsdatum = Column(Date) EegMastrNummer = Column(String, primary_key=True) Meldedatum = Column(Date) diff --git a/open_mastr/utils/unzip_http.py b/open_mastr/utils/unzip_http.py index 0674e130..6d67d80b 100644 --- a/open_mastr/utils/unzip_http.py +++ b/open_mastr/utils/unzip_http.py @@ -47,18 +47,18 @@ them to stdout, in zipfile order) """ -import sys -import os +import fnmatch import io +import logging import math -import time -import zlib -import struct -import fnmatch +import os import pathlib +import struct +import sys +import time import urllib.parse import zipfile -import logging +import zlib log = logging.getLogger(__name__) @@ -74,7 +74,7 @@ def warning(s): def get_bits(val: int, *args): - "Generate bitfields (one for each arg) from LSB to MSB." + """Generate bitfields (one for each arg) from LSB to MSB.""" for n in args: x = val & (2**n - 1) val >>= n @@ -304,7 +304,7 @@ def get_range(self, start, n): return self.http.request( "GET", self.url, - headers={"Range": f"bytes={start}-{start+n-1}"}, + headers={"Range": f"bytes={start}-{start + n - 1}"}, preload_content=False, ) @@ -358,7 +358,7 @@ def __init__(self, fp, info): super().__init__() self.raw = fp self._decompressor = zlib.decompressobj(-15) - self._buffer = bytes() + self._buffer = b"" def readable(self): return True @@ -403,7 +403,7 @@ def read(self, n): elapsed_s = now - self.start_time sys.stderr.write( - f"\r{elapsed_s:.0f}s {self.amtread/10**6:.02f}/{self.total/10**6:.02f}MB ({self.amtread/10**6/elapsed_s:.02f} MB/s) {self.name}" + f"\r{elapsed_s:.0f}s {self.amtread / 10**6:.02f}/{self.total / 10**6:.02f}MB ({self.amtread / 10**6 / elapsed_s:.02f} MB/s) {self.name}" ) if not r: diff --git a/open_mastr/xml_download/utils_cleansing_bulk.py b/open_mastr/xml_download/utils_cleansing_bulk.py index b48a50f1..6121f730 100644 --- a/open_mastr/xml_download/utils_cleansing_bulk.py +++ 
b/open_mastr/xml_download/utils_cleansing_bulk.py @@ -1,10 +1,12 @@ -import pandas as pd +from zipfile import ZipFile + import numpy as np +import pandas as pd + from open_mastr.xml_download.colums_to_replace import ( - system_catalog, columns_replace_list, + system_catalog, ) -from zipfile import ZipFile def cleanse_bulk_data(df: pd.DataFrame, zipped_xml_file_path: str) -> pd.DataFrame: @@ -19,7 +21,8 @@ def cleanse_bulk_data(df: pd.DataFrame, zipped_xml_file_path: str) -> pd.DataFra def replace_ids_with_names(df: pd.DataFrame, system_catalog: dict) -> pd.DataFrame: """Replaces ids with names according to the system catalog. This is necessary since the data from the bulk download encodes columns with - IDs instead of the actual values.""" + IDs instead of the actual values. + """ for column_name, name_mapping_dictionary in system_catalog.items(): if column_name in df.columns: df[column_name] = df[column_name].replace(name_mapping_dictionary) @@ -31,7 +34,8 @@ def replace_mastr_katalogeintraege( df: pd.DataFrame, ) -> pd.DataFrame: """Replaces the IDs from the mastr database by its mapped string values from - the table katalogwerte""" + the table katalogwerte + """ katalogwerte = create_katalogwerte_from_bulk_download(zipped_xml_file_path) for column_name in df.columns: if column_name in columns_replace_list: @@ -57,7 +61,8 @@ def replace_mastr_katalogeintraege( def create_katalogwerte_from_bulk_download(zipped_xml_file_path) -> dict: """Creates a dictionary from the id -> value mapping defined in the table - katalogwerte from MaStR.""" + katalogwerte from MaStR. 
+ """ with ZipFile(zipped_xml_file_path, "r") as f: data = f.read("Katalogwerte.xml") df_katalogwerte = pd.read_xml(data, encoding="UTF-16", compression="zip") diff --git a/open_mastr/xml_download/utils_download_bulk.py b/open_mastr/xml_download/utils_download_bulk.py index a8d37ae3..c13da81d 100644 --- a/open_mastr/xml_download/utils_download_bulk.py +++ b/open_mastr/xml_download/utils_download_bulk.py @@ -2,17 +2,18 @@ import shutil import time from importlib.metadata import PackageNotFoundError, version -from zipfile import ZipFile from pathlib import Path +from zipfile import ZipFile import numpy as np import requests from tqdm import tqdm +from open_mastr.utils import unzip_http + # setup logger from open_mastr.utils.config import setup_logger -from open_mastr.utils.constants import BULK_INCLUDE_TABLES_MAP, BULK_DATA -from open_mastr.utils import unzip_http +from open_mastr.utils.constants import BULK_DATA, BULK_INCLUDE_TABLES_MAP try: USER_AGENT = ( @@ -43,7 +44,8 @@ def gen_version( see - Examples: + Examples + -------- 2024-01-01 = version 23.2 2024-04-01 = version 23.2 2024-04-02 = version 24.1 @@ -52,7 +54,6 @@ def gen_version( 2024-10-02 = version 24.2 2024-31-12 = version 24.2 """ - year = when.tm_year release = 1 @@ -121,7 +122,7 @@ def download_xml_Mastr( """Downloads the zipped MaStR. Parameters - ----------- + ---------- save_path: str Full file path where the downloaded MaStR zip file will be saved. bulk_date_string: str @@ -131,7 +132,6 @@ def download_xml_Mastr( xml_folder_path: str Path where the downloaded MaStR zip file will be saved. """ - log.info("Starting the Download from marktstammdatenregister.de.") # TODO this should take bulk_date_string @@ -321,7 +321,7 @@ def full_download_without_unzip_http( "Warning: The servers from MaStR restrict the download speed." " You may want to download it another time." 
) - total_length = int(23000) + total_length = 23000 with ( open(save_path, "wb") as zfile, tqdm(desc=save_path, total=total_length, unit="") as bar, diff --git a/open_mastr/xml_download/utils_write_to_database.py b/open_mastr/xml_download/utils_write_to_database.py index e71abc18..757dca69 100644 --- a/open_mastr/xml_download/utils_write_to_database.py +++ b/open_mastr/xml_download/utils_write_to_database.py @@ -1,16 +1,16 @@ import os +import re from concurrent.futures import ProcessPoolExecutor, wait from io import StringIO from multiprocessing import cpu_count from shutil import Error from zipfile import ZipFile -import re import lxml import numpy as np import pandas as pd import sqlalchemy -from sqlalchemy import select, create_engine, inspect +from sqlalchemy import create_engine, inspect, select from sqlalchemy.sql import text from sqlalchemy.sql.sqltypes import Date, DateTime @@ -79,7 +79,8 @@ def write_mastr_xml_to_database( def get_number_of_processes(): """Get the number of processes to use for the bulk download. Returns -1 if the user has not opted for the parallelized implementation. Otherwise, we recommend using the number of available CPUs - 1. If the user wants to - use more processes, they can set the custom environment variable.""" + use more processes, they can set the custom environment variable. + """ if "NUMBER_OF_PROCESSES" in os.environ: try: number_of_processes = int(os.environ.get("NUMBER_OF_PROCESSES")) @@ -218,7 +219,8 @@ def extract_sql_table_name(xml_table_name: str) -> str: def is_table_relevant(xml_table_name: str, include_tables: list) -> bool: """Checks if the table contains relevant data and if the user wants to - have it in the database.""" + have it in the database. 
+ """ # few tables are only needed for data cleansing of the xml files and contain no # information of relevance try: @@ -246,7 +248,7 @@ def create_database_table( def is_first_file(file_name: str) -> bool: - """check if the file name indicates that it is the first file from the table""" + """Check if the file name indicates that it is the first file from the table""" return ( file_name.split(".")[0].split("_")[-1] == "1" or len(file_name.split(".")[0].split("_")) == 1 @@ -296,7 +298,8 @@ def is_date_column(column, df: pd.DataFrame) -> bool: def correct_ordering_of_filelist(files_list: list) -> list: """Files that end with a single digit number get a 0 prefixed to this number - to correct the list ordering. Afterwards the 0 is deleted again.""" + to correct the list ordering. Afterwards the 0 is deleted again. + """ files_list_ordered = [] count_if_zeros_are_prefixed = 0 for file_name in files_list: @@ -390,7 +393,6 @@ def add_zero_as_first_character_for_too_short_string(df: pd.DataFrame) -> pd.Dat """Some columns are read as integer even though they are actually strings starting with a 0. This function converts those columns back to strings and adds a 0 as first character. """ - dict_of_columns_and_string_length = { "Gemeindeschluessel": 8, "Postleitzahl": 5, @@ -423,6 +425,7 @@ def write_single_entries_until_not_unique_comes_up( ) -> pd.DataFrame: """ Remove from dataframe these rows, which are already existing in the database table + Parameters ---------- df @@ -433,7 +436,6 @@ def write_single_entries_until_not_unique_comes_up( ------- Filtered dataframe """ - table = tablename_mapping[xml_table_name]["__class__"].__table__ primary_key = next(c for c in table.columns if c.primary_key) @@ -467,6 +469,7 @@ def add_missing_columns_to_table( Some files introduce new columns for existing tables. If the pandas dataframe contains columns that do not exist in the database, they are added to the database. 
+ Parameters ---------- engine @@ -519,14 +522,14 @@ def handle_xml_syntax_error(data: str, err: Error) -> pd.DataFrame: """Deletes entries that cause an xml syntax error and produces DataFrame. Parameters - ----------- + ---------- data : str Decoded xml file as one string err : ErrorMessage Error message that appeared when trying to use pd.read_xml on invalid xml file. Returns - ---------- + ------- df : pandas.DataFrame DataFrame which is read from the changed xml data. """ diff --git a/tests/conftest.py b/tests/conftest.py index eb5ce0fa..b521739c 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,12 +10,13 @@ https://docs.pytest.org/en/7.2.x/reference/fixtures.html """ +import os + import pytest -from open_mastr import Mastr +from open_mastr import Mastr from open_mastr.utils.config import get_project_home_dir from open_mastr.utils.helpers import create_database_engine -import os @pytest.fixture(scope="function") diff --git a/tests/preparation.py b/tests/preparation.py index 0f58bd3f..f8d07d64 100644 --- a/tests/preparation.py +++ b/tests/preparation.py @@ -1,4 +1,5 @@ import os + from open_mastr.utils.config import get_project_home_dir @@ -13,7 +14,7 @@ def create_credentials_file(): user = os.getenv("MASTR_USER") section_title = "[MaStR]" - file_content = f"{section_title}\n" f"user = {user}\n" f"token = {token}\n" + file_content = f"{section_title}\nuser = {user}\ntoken = {token}\n" with open(credentials_file, "w") as credentials_fh: credentials_fh.write(file_content) diff --git a/tests/soap_api/test_download.py b/tests/soap_api/test_download.py index e4dc4c0d..0d187474 100644 --- a/tests/soap_api/test_download.py +++ b/tests/soap_api/test_download.py @@ -1,7 +1,9 @@ -from open_mastr.soap_api.download import MaStRAPI, MaStRDownload, flatten_dict -import pytest import datetime +import pytest + +from open_mastr.soap_api.download import MaStRAPI, MaStRDownload, flatten_dict + @pytest.fixture def mastr_api_fake_credentials(): @@ -73,7 +75,6 @@ def 
test_basic_unit_data(mastr_download): def test_additional_data_nuclear(mastr_download): - data_fcns = [ ("SME963513379837", "extended_unit_data"), ("SGE951929415553", "permit_unit_data"), @@ -88,7 +89,6 @@ def test_additional_data_nuclear(mastr_download): def test_additional_data_biomass(mastr_download): - data_fcns = [ ("SEE936595511945", "extended_unit_data"), ("EEG929630520224", "extended_unit_data"), diff --git a/tests/test_credentials.py b/tests/test_credentials.py index 989d9889..813aac3c 100644 --- a/tests/test_credentials.py +++ b/tests/test_credentials.py @@ -1,8 +1,7 @@ -from open_mastr.utils.credentials import get_mastr_user, get_mastr_token +from open_mastr.utils.credentials import get_mastr_token, get_mastr_user def test_get_mastr_user_and_token(): - user = get_mastr_user() assert len(user) == 15 diff --git a/tests/test_helpers.py b/tests/test_helpers.py index 7779a9c8..ec5d4373 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -1,35 +1,34 @@ -import pytest import os -from os.path import expanduser -import sys import random -from os.path import join +import sys from datetime import datetime -import pandas as pd -from open_mastr import Mastr +from os.path import expanduser, join from zipfile import ZipFile +import pandas as pd +import pytest + +from open_mastr import Mastr from open_mastr.utils import orm +from open_mastr.utils.config import create_data_dir, get_data_version_dir from open_mastr.utils.constants import ( + ADDITIONAL_TABLES, API_LOCATION_TYPES, TECHNOLOGIES, - ADDITIONAL_TABLES, ) -from open_mastr.utils.config import get_data_version_dir, create_data_dir from open_mastr.utils.helpers import ( - validate_parameter_format_for_download_method, - validate_parameter_format_for_mastr_init, - validate_api_credentials, - transform_data_parameter, - data_to_include_tables, - session_scope, create_db_query, + data_to_include_tables, db_query_to_csv, - reverse_unit_type_map, delete_zip_file_if_corrupted, + reverse_unit_type_map, + 
session_scope, + transform_data_parameter, + validate_api_credentials, + validate_parameter_format_for_download_method, + validate_parameter_format_for_mastr_init, ) - # Check if db is empty _db_exists = False _db_folder_path = os.path.join( @@ -253,7 +252,12 @@ def test_validate_parameter_format_for_mastr_init(db): def test_transform_data_parameter(): - (data, api_data_types, api_location_types, harm_log,) = transform_data_parameter( + ( + data, + api_data_types, + api_location_types, + harm_log, + ) = transform_data_parameter( method="API", data=["wind", "location"], api_data_types=["eeg_data"], diff --git a/tests/test_mastr.py b/tests/test_mastr.py index 16f7c1f6..18074955 100644 --- a/tests/test_mastr.py +++ b/tests/test_mastr.py @@ -1,14 +1,15 @@ -import shutil - -from open_mastr.mastr import Mastr import os import re -import sqlalchemy -import pytest +import shutil +from datetime import date, timedelta from os.path import expanduser + import pandas as pd +import pytest +import sqlalchemy + +from open_mastr.mastr import Mastr from open_mastr.utils.constants import TRANSLATIONS -from datetime import date, timedelta _xml_file_exists = False _xml_folder_path = os.path.join(expanduser("~"), ".open-MaStR", "data", "xml_download") diff --git a/tests/test_requirements.txt b/tests/test_requirements.txt index e34f71ba..2781d2e5 100644 --- a/tests/test_requirements.txt +++ b/tests/test_requirements.txt @@ -2,4 +2,4 @@ flake8 pylint pytest pytest-dependency -pandas \ No newline at end of file +pandas diff --git a/tests/xml_download/test_utils_cleansing_bulk.py b/tests/xml_download/test_utils_cleansing_bulk.py index 9a29ad76..e7753350 100644 --- a/tests/xml_download/test_utils_cleansing_bulk.py +++ b/tests/xml_download/test_utils_cleansing_bulk.py @@ -1,9 +1,9 @@ -import sys +import os import sqlite3 +import sys from os.path import expanduser -import os + import pandas as pd -import numpy as np import pytest from open_mastr.xml_download.utils_cleansing_bulk import ( @@ 
-23,6 +23,7 @@ _sqlite_file_path = os.path.join(_sqlite_folder_path, "open-mastr.db") _sqlite_db_exists = bool(os.path.exists(_sqlite_file_path)) + # Silence ValueError caused by logger https://github.com/pytest-dev/pytest/issues/5502 @pytest.fixture(autouse=True) def capture_wrap(): diff --git a/tests/xml_download/test_utils_download_bulk.py b/tests/xml_download/test_utils_download_bulk.py index 8f650933..2ba0d4d7 100644 --- a/tests/xml_download/test_utils_download_bulk.py +++ b/tests/xml_download/test_utils_download_bulk.py @@ -1,10 +1,11 @@ +import os +import shutil import time + from open_mastr.xml_download.utils_download_bulk import ( - gen_url, delete_xml_files_not_from_given_date, + gen_url, ) -import os -import shutil def test_gen_url(): diff --git a/tests/xml_download/test_utils_write_to_database.py b/tests/xml_download/test_utils_write_to_database.py index 75243b1a..1251c1cd 100644 --- a/tests/xml_download/test_utils_write_to_database.py +++ b/tests/xml_download/test_utils_write_to_database.py @@ -12,9 +12,11 @@ from sqlalchemy.sql import text from open_mastr.utils import orm -from open_mastr.utils.orm import RetrofitUnits, NuclearExtended, tablename_mapping +from open_mastr.utils.orm import NuclearExtended, RetrofitUnits, tablename_mapping from open_mastr.xml_download.utils_write_to_database import ( add_missing_columns_to_table, + add_table_to_non_sqlite_database, + add_table_to_sqlite_database, add_zero_as_first_character_for_too_short_string, cast_date_columns_to_string, change_column_names_to_orm_format, @@ -22,14 +24,12 @@ create_database_table, extract_sql_table_name, extract_xml_table_name, + interleave_files, is_date_column, is_first_file, is_table_relevant, process_table_before_insertion, read_xml_file, - add_table_to_non_sqlite_database, - add_table_to_sqlite_database, - interleave_files, ) # Check if xml file exists