-
Notifications
You must be signed in to change notification settings - Fork 30
Add interactive download functionality for MaStR date selection #696 #697
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
FlorianK13
merged 6 commits into
OpenEnergyPlatform:develop
from
fharookshaik:feature-696-add-interactive-download
Feb 5, 2026
Merged
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
dfd0cf2
Add interactive download functionality for MaStR date selection #696
fharookshaik e4cddae
Add tests for interactive download functionality
fharookshaik 0f2876f
Merge branch 'develop' into feature-696-add-interactive-download
fharookshaik ecd8ba0
Refactor date parsing in download_xml_Mastr to improve robustness and…
fharookshaik 8c4e56c
Fix prompt message in select_download_date function to reflect valid …
fharookshaik c291b37
Merge branch 'develop' into feature-696-add-interactive-download
FlorianK13 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -4,6 +4,7 @@ | |||
| # import xml dependencies | ||||
| from open_mastr.xml_download.utils_download_bulk import ( | ||||
| download_xml_Mastr, | ||||
| select_download_date, | ||||
| delete_xml_files_not_from_given_date, | ||||
| ) | ||||
| from open_mastr.xml_download.utils_write_to_database import ( | ||||
|
|
@@ -106,6 +107,7 @@ def download( | |||
| date=None, | ||||
| bulk_cleansing=True, | ||||
| keep_old_downloads: bool = False, | ||||
| select_date_interactively: bool = False, | ||||
| **kwargs, | ||||
| ) -> None: | ||||
| """ | ||||
|
|
@@ -157,6 +159,11 @@ def download( | |||
| | None | set date="today" | | ||||
|
|
||||
| Default to `None`. | ||||
| select_date_interactively : bool, optional | ||||
| If set to True, the user will be presented with a list of available download dates | ||||
| from the MaStR website and can interactively select which date to download. | ||||
| This allows downloading historical data instead of just the latest available data. | ||||
| When True, the `date` parameter is ignored. Defaults to False. | ||||
| bulk_cleansing : bool, optional | ||||
| If set to True, data cleansing is applied after the download (which is recommended). | ||||
| In its original format, many entries in the MaStR are encoded with IDs. Columns like | ||||
|
|
@@ -192,8 +199,26 @@ def download( | |||
|
|
||||
| date = transform_date_parameter(self, date, **kwargs) | ||||
|
|
||||
| # Find the name of the zipped xml folder | ||||
| bulk_download_date = parse_date_string(date) | ||||
| # Handle interactive date selection if requested | ||||
| if select_date_interactively: | ||||
| log.info( | ||||
| "Interactive date selection enabled. Fetching available downloads..." | ||||
| ) | ||||
| selected_date, selected_url = select_download_date() | ||||
|
|
||||
| if selected_date is None: | ||||
| log.info("Download cancelled by user.") | ||||
| return | ||||
|
|
||||
| # Update the date and use the selected URL | ||||
| date = selected_date | ||||
| bulk_download_date = selected_date | ||||
| custom_url = selected_url | ||||
| else: | ||||
| # Find the name of the zipped xml folder | ||||
| bulk_download_date = parse_date_string(date) | ||||
| custom_url = None | ||||
|
|
||||
| xml_folder_path = os.path.join(self.output_dir, "data", "xml_download") | ||||
| os.makedirs(xml_folder_path, exist_ok=True) | ||||
| zipped_xml_file_path = os.path.join( | ||||
|
|
@@ -206,9 +231,8 @@ def download( | |||
| delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path) | ||||
|
|
||||
| download_xml_Mastr( | ||||
| zipped_xml_file_path, bulk_download_date, data, xml_folder_path | ||||
| zipped_xml_file_path, bulk_download_date, data, xml_folder_path, custom_url | ||||
| ) | ||||
|
|
||||
| log.info( | ||||
| "\nWould you like to speed up the creation of your MaStR database?\n" | ||||
| "Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True " | ||||
|
|
@@ -371,3 +395,28 @@ def translate(self) -> None: | |||
|
|
||||
| self.engine = create_engine(f"sqlite:///{new_path}") | ||||
| self.is_translated = True | ||||
|
|
||||
def browse_available_downloads(self):
    """
    Show which MaStR bulk exports can be downloaded, without downloading any.

    The work is delegated to ``list_available_downloads``, which scrapes the
    MaStR download page, prints a numbered table of the exports it finds and
    hands back the underlying records.

    Returns
    -------
    list of dict
        One dictionary per available export with the keys ``'url'``,
        ``'date'`` (YYYYMMDD), ``'version'`` and ``'type'``. The list is
        empty when no download links could be retrieved.

    Examples
    --------
    >>> from open_mastr import Mastr
    >>> db = Mastr()
    >>> available_downloads = db.browse_available_downloads()
    >>> # User can then choose a date and download with:
    >>> # db.download(select_date_interactively=True)
    """
    # NOTE(review): a reviewer suggested moving this import to the top of
    # the file instead of importing inside the method.
    from open_mastr.xml_download.utils_download_bulk import list_available_downloads

    log.info("Browsing available MaStR downloads...")
    downloads = list_available_downloads()
    return downloads
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,9 @@ | |
| from importlib.metadata import PackageNotFoundError, version | ||
| from zipfile import ZipFile | ||
| from pathlib import Path | ||
| import urllib.request | ||
| import re | ||
| from datetime import datetime | ||
|
|
||
| import numpy as np | ||
| import requests | ||
|
|
@@ -123,7 +126,11 @@ def gen_url( | |
|
|
||
|
|
||
| def download_xml_Mastr( | ||
| save_path: str, bulk_date_string: str, bulk_data_list: list, xml_folder_path: str | ||
| save_path: str, | ||
| bulk_date_string: str, | ||
| bulk_data_list: list, | ||
| xml_folder_path: str, | ||
| url: str = None, | ||
| ) -> None: | ||
| """Downloads the zipped MaStR. | ||
|
|
||
|
|
@@ -137,12 +144,32 @@ def download_xml_Mastr( | |
| List of tables/technologies to be downloaded. | ||
| xml_folder_path: str | ||
| Path where the downloaded MaStR zip file will be saved. | ||
| url: str, optional | ||
| Custom download URL. If None, generates URL based on bulk_date_string. | ||
| """ | ||
|
|
||
| log.info("Starting the Download from marktstammdatenregister.de.") | ||
|
|
||
| url_time = dt.strptime(bulk_date_string, "%Y%m%d").date().timetuple() | ||
| url = gen_url(url_time) | ||
| # Helper function to convert date string to time.struct_time | ||
| def _parse_date_string(date_str): | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you reuse |
||
| """Convert YYYYMMDD string to time.struct_time object.""" | ||
| try: | ||
| # Use datetime.strptime for robust date parsing | ||
| parsed_date = dt.strptime(date_str, "%Y%m%d") | ||
| # Convert to time.struct_time using timetuple() | ||
| return parsed_date.timetuple() | ||
| except (ValueError, IndexError) as e: | ||
| log.warning(f"Invalid date format '{date_str}': {e}. Using current date.") | ||
| return time.localtime() | ||
|
|
||
| # Parse the date string to time.struct_time (needed for both cases) | ||
| url_time = _parse_date_string(bulk_date_string) | ||
|
|
||
| # Determine the URL to use | ||
| if url is None: | ||
| # Generate URL from date string if no custom URL provided | ||
| url = gen_url(url_time) | ||
| # else: custom URL is already provided, use it as-is | ||
|
|
||
| time_a = time.perf_counter() | ||
| r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT}) | ||
|
|
@@ -353,3 +380,172 @@ def full_download_without_unzip_http( | |
| else: | ||
| # remove warning | ||
| bar.set_postfix_str(s="") | ||
|
|
||
|
|
||
def get_available_download_links(
    url="https://www.marktstammdatenregister.de/MaStR/Datendownload",
    timeout=30,
):
    """
    Fetch all available download links from the MaStR website.

    This function retrieves all available Gesamtdatenexport files from the MaStR
    download page, including both current and historical exports. Each distinct
    download URL is reported once, even if it occurs several times in the page.

    Parameters
    ----------
    url : str, optional
        The URL of the MaStR download page. Defaults to the official download page.
    timeout : float, optional
        Seconds to wait for the download page before giving up, so that an
        unresponsive server cannot block the caller forever. Defaults to 30.

    Returns
    -------
    list of dict
        A list of dictionaries containing information about available downloads,
        sorted by date (newest first). An empty list is returned if the page
        could not be fetched or decoded.
        Each dictionary contains:
        - 'url': The download URL
        - 'date': The date of the export (YYYYMMDD format)
        - 'version': The MaStR version (e.g., '24.1', '24.2')
        - 'type': 'current' for current exports, 'stichtag' for historical exports

    Examples
    --------
    >>> links = get_available_download_links()
    >>> for link in links[:3]:
    ...     print(f"Date: {link['date']}, Version: {link['version']}, Type: {link['type']}")
    Date: 20250103, Version: 24.2, Type: current
    Date: 20241231, Version: 24.2, Type: current
    Date: 20241230, Version: 24.2, Type: current
    """
    log.info("Fetching available download links from MaStR website...")

    request = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})

    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        # Deliberate best-effort boundary: any network or decoding problem is
        # reported and signalled to the caller as "no links available".
        log.error(f"Failed to fetch download page: {e}")
        return []

    base_url = "https://download.marktstammdatenregister.de/"
    # (type label, sub-directory on the download server). Current exports are
    # collected before historical ones so that, for equal dates, the stable
    # sort below lists the current export first.
    export_sources = (("current", ""), ("stichtag", "Stichtag/"))

    # Keyed by URL so that a link occurring several times in the HTML
    # (e.g. in both the href and the link text) is listed only once.
    links_by_url = {}
    for export_type, subdir in export_sources:
        prefix = f"{base_url}{subdir}Gesamtdatenexport_"
        pattern = re.compile(
            re.escape(prefix) + r"([0-9]{8})_([0-9]{2}\.[0-9])\.zip"
        )
        for date, version in pattern.findall(html):
            link_url = f"{prefix}{date}_{version}.zip"
            links_by_url.setdefault(
                link_url,
                {
                    "url": link_url,
                    "date": date,
                    "version": version,
                    "type": export_type,
                },
            )

    # YYYYMMDD strings sort chronologically as plain strings; newest first.
    all_links = sorted(
        links_by_url.values(), key=lambda link: link["date"], reverse=True
    )

    log.info(f"Found {len(all_links)} available download links")
    return all_links
|
|
||
|
|
||
def list_available_downloads():
    """
    Print a numbered table of the available MaStR downloads and return them.

    The entries come from ``get_available_download_links``. When that yields
    nothing, a hint is printed instead of the table.

    Returns
    -------
    list of dict
        The download records exactly as returned by
        ``get_available_download_links`` (empty list if none were found).
    """
    available = get_available_download_links()

    if not available:
        print("No download links found. Please check your internet connection.")
        return []

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("AVAILABLE MAStR DOWNLOADS")
    print(heavy_rule)
    print(f"{'#':<4} {'Date':<12} {'Version':<10} {'Type':<12} {'URL'}")
    print(light_rule)

    for position, entry in enumerate(available, start=1):
        # Turn YYYYMMDD into YYYY-MM-DD for readability.
        raw_date = entry["date"]
        pretty_date = "-".join((raw_date[:4], raw_date[4:6], raw_date[6:]))
        print(
            f"{position:<4} {pretty_date:<12} {entry['version']:<10} {entry['type']:<12} {entry['url']}"
        )

    print(heavy_rule)
    print(f"Total: {len(available)} downloads available")
    print(heavy_rule)

    return available
|
|
||
|
|
||
def select_download_date():
    """
    Interactive function to let the user select a download date.

    Shows the available downloads (via ``list_available_downloads``) and then
    prompts the user to either pick one of them by its number or cancel.
    Invalid input is rejected with a message and the prompt is repeated.

    Returns
    -------
    tuple
        (date_string, url) where date_string is in YYYYMMDD format and url is
        the download URL of the chosen export.
        Returns (None, None) if no downloads are available, if the user
        cancels, or if standard input is closed / not interactive.
    """
    links = list_available_downloads()

    if not links:
        return None, None

    print("\nOptions:")
    print("1. Select from the list above (enter the number)")
    print("2. Cancel")

    try:
        while True:
            choice = input("\nPlease enter your choice (1-2): ").strip()

            if choice == "2":
                print("Download selection cancelled.")
                return None, None

            if choice != "1":
                print("Invalid choice. Please enter 1, or 2.")
                continue

            # Option 1: keep asking until a valid list position is entered.
            while True:
                raw_index = input(f"Enter a number (1-{len(links)}): ").strip()
                try:
                    index = int(raw_index)
                except ValueError:
                    print("Please enter a valid number")
                    continue

                if not 1 <= index <= len(links):
                    print(f"Please enter a number between 1 and {len(links)}")
                    continue

                selected = links[index - 1]
                print(
                    f"\nSelected: {selected['date']} (Version {selected['version']}, Type: {selected['type']})"
                )
                return selected["date"], selected["url"]
    except EOFError:
        # input() raises EOFError when stdin is closed or not interactive
        # (e.g. piped input, CI). Treat that as a cancellation instead of
        # crashing the surrounding download call.
        print("\nNo input available. Download selection cancelled.")
        return None, None
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.