Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ and the versioning aims to respect [Semantic Versioning](http://semver.org/spec/

## [v0.XX.X] unreleased - 202X-XX-XX
### Added
- Add interactive download functionality for MaStR date selection
[#696](https://github.com/OpenEnergyPlatform/open-MaStR/pull/696)
- Add trusted publishing for test releases
[#713](https://github.com/OpenEnergyPlatform/open-MaStR/pull/713)
### Changed
Expand Down
57 changes: 53 additions & 4 deletions open_mastr/mastr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# import xml dependencies
from open_mastr.xml_download.utils_download_bulk import (
download_xml_Mastr,
select_download_date,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
select_download_date,
select_download_date,
list_available_downloads,

delete_xml_files_not_from_given_date,
)
from open_mastr.xml_download.utils_write_to_database import (
Expand Down Expand Up @@ -106,6 +107,7 @@ def download(
date=None,
bulk_cleansing=True,
keep_old_downloads: bool = False,
select_date_interactively: bool = False,
**kwargs,
) -> None:
"""
Expand Down Expand Up @@ -157,6 +159,11 @@ def download(
| None | set date="today" |

Default to `None`.
select_date_interactively : bool, optional
If set to True, the user will be presented with a list of available download dates
from the MaStR website and can interactively select which date to download.
This allows downloading historical data instead of just the latest available data.
When True, the `date` parameter is ignored. Defaults to False.
bulk_cleansing : bool, optional
If set to True, data cleansing is applied after the download (which is recommended).
In its original format, many entries in the MaStR are encoded with IDs. Columns like
Expand Down Expand Up @@ -192,8 +199,26 @@ def download(

date = transform_date_parameter(self, date, **kwargs)

# Find the name of the zipped xml folder
bulk_download_date = parse_date_string(date)
# Handle interactive date selection if requested
if select_date_interactively:
log.info(
"Interactive date selection enabled. Fetching available downloads..."
)
selected_date, selected_url = select_download_date()

if selected_date is None:
log.info("Download cancelled by user.")
return

# Update the date and use the selected URL
date = selected_date
bulk_download_date = selected_date
custom_url = selected_url
else:
# Find the name of the zipped xml folder
bulk_download_date = parse_date_string(date)
custom_url = None

xml_folder_path = os.path.join(self.output_dir, "data", "xml_download")
os.makedirs(xml_folder_path, exist_ok=True)
zipped_xml_file_path = os.path.join(
Expand All @@ -206,9 +231,8 @@ def download(
delete_xml_files_not_from_given_date(zipped_xml_file_path, xml_folder_path)

download_xml_Mastr(
zipped_xml_file_path, bulk_download_date, data, xml_folder_path
zipped_xml_file_path, bulk_download_date, data, xml_folder_path, custom_url
)

log.info(
"\nWould you like to speed up the creation of your MaStR database?\n"
"Try our new parallelized processing by setting os.environ['USE_RECOMMENDED_NUMBER_OF_PROCESSES'] = True "
Expand Down Expand Up @@ -371,3 +395,28 @@ def translate(self) -> None:

self.engine = create_engine(f"sqlite:///{new_path}")
self.is_translated = True

def browse_available_downloads(self):
"""
Browse available MaStR downloads from the website without starting the download.

This method fetches and displays all available download dates from the MaStR website,
allowing users to see what historical data is available before deciding to download.

Returns
-------
list of dict
List of available downloads with date, version, and type information.

Examples
--------
>>> from open_mastr import Mastr
>>> db = Mastr()
>>> available_downloads = db.browse_available_downloads()
>>> # User can then choose a date and download with:
>>> # db.download(select_date_interactively=True)
"""
from open_mastr.xml_download.utils_download_bulk import list_available_downloads
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
from open_mastr.xml_download.utils_download_bulk import list_available_downloads

This is better imported at the top of the file I guess.


log.info("Browsing available MaStR downloads...")
return list_available_downloads()
202 changes: 199 additions & 3 deletions open_mastr/xml_download/utils_download_bulk.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@
from importlib.metadata import PackageNotFoundError, version
from zipfile import ZipFile
from pathlib import Path
import urllib.request
import re
from datetime import datetime

import numpy as np
import requests
Expand Down Expand Up @@ -123,7 +126,11 @@ def gen_url(


def download_xml_Mastr(
save_path: str, bulk_date_string: str, bulk_data_list: list, xml_folder_path: str
save_path: str,
bulk_date_string: str,
bulk_data_list: list,
xml_folder_path: str,
url: str = None,
) -> None:
"""Downloads the zipped MaStR.

Expand All @@ -137,12 +144,32 @@ def download_xml_Mastr(
List of tables/technologies to be downloaded.
xml_folder_path: str
Path where the downloaded MaStR zip file will be saved.
url: str, optional
Custom download URL. If None, generates URL based on bulk_date_string.
"""

log.info("Starting the Download from marktstammdatenregister.de.")

url_time = dt.strptime(bulk_date_string, "%Y%m%d").date().timetuple()
url = gen_url(url_time)
# Helper function to convert date string to time.struct_time
def _parse_date_string(date_str):
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you reuse parse_date_string() from helpers.py?

"""Convert YYYYMMDD string to time.struct_time object."""
try:
# Use datetime.strptime for robust date parsing
parsed_date = dt.strptime(date_str, "%Y%m%d")
# Convert to time.struct_time using timetuple()
return parsed_date.timetuple()
except (ValueError, IndexError) as e:
log.warning(f"Invalid date format '{date_str}': {e}. Using current date.")
return time.localtime()

# Parse the date string to time.struct_time (needed for both cases)
url_time = _parse_date_string(bulk_date_string)

# Determine the URL to use
if url is None:
# Generate URL from date string if no custom URL provided
url = gen_url(url_time)
# else: custom URL is already provided, use it as-is

time_a = time.perf_counter()
r = requests.get(url, stream=True, headers={"User-Agent": USER_AGENT})
Expand Down Expand Up @@ -353,3 +380,172 @@ def full_download_without_unzip_http(
else:
# remove warning
bar.set_postfix_str(s="")


def get_available_download_links(
    url="https://www.marktstammdatenregister.de/MaStR/Datendownload",
    timeout=30,
):
    """
    Fetch all available download links from the MaStR website.

    The download page is fetched once and scanned for links to
    Gesamtdatenexport zip files, covering both the current exports and the
    historical ("Stichtag") exports. A URL that occurs several times in the
    page is reported only once.

    Parameters
    ----------
    url : str, optional
        The URL of the MaStR download page. Defaults to the official download page.
    timeout : float, optional
        Number of seconds to wait for the server before giving up.
        Defaults to 30.

    Returns
    -------
    list of dict
        A list of dictionaries containing information about available downloads,
        sorted by date (newest first). Each dictionary contains:
        - 'url': The download URL
        - 'date': The date of the export (YYYYMMDD format)
        - 'version': The MaStR version (e.g., '24.1', '24.2')
        - 'type': 'current' for current exports, 'stichtag' for historical exports

        An empty list is returned if the page cannot be fetched or decoded;
        the failure is logged instead of being raised.

    Examples
    --------
    >>> links = get_available_download_links()
    >>> for link in links[:3]:
    ...     print(f"Date: {link['date']}, Version: {link['version']}, Type: {link['type']}")
    Date: 20250103, Version: 24.2, Type: current
    Date: 20241231, Version: 24.2, Type: current
    Date: 20241230, Version: 24.2, Type: current
    """
    log.info("Fetching available download links from MaStR website...")

    req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})

    try:
        # Without a timeout an unresponsive server would block the caller forever.
        with urllib.request.urlopen(req, timeout=timeout) as response:
            html = response.read().decode("utf-8")
    except Exception as e:
        # Best-effort boundary: any fetch/decode problem means "no links".
        log.error(f"Failed to fetch download page: {e}")
        return []

    # One pattern covers both link kinds; the optional group is only set for
    # historical (Stichtag) exports.
    export_pattern = re.compile(
        r"https://download\.marktstammdatenregister\.de/"
        r"(Stichtag/)?Gesamtdatenexport_([0-9]{8})_([0-9]{2}\.[0-9])\.zip"
    )

    # Keyed by URL so that a link repeated in the HTML (e.g. in the href and
    # in the visible text) yields a single entry.
    links_by_url = {}
    for match in export_pattern.finditer(html):
        link_url = match.group(0)
        if link_url in links_by_url:
            continue
        stichtag_prefix, date, version = match.groups()
        links_by_url[link_url] = {
            "url": link_url,
            "date": date,
            "version": version,
            "type": "stichtag" if stichtag_prefix else "current",
        }

    # Newest first; for equal dates "current" sorts before "stichtag".
    all_links = sorted(
        links_by_url.values(),
        key=lambda link: (-int(link["date"]), link["type"]),
    )

    log.info(f"Found {len(all_links)} available download links")
    return all_links


def list_available_downloads():
    """
    Print a table of the available MaStR downloads and return them.

    The links are obtained from `get_available_download_links()`. Each entry
    is printed as one numbered row (numbering starts at 1) with its date
    rendered as YYYY-MM-DD, followed by its version, type and URL.

    Returns
    -------
    list of dict
        The download entries exactly as returned by
        `get_available_download_links()`, or an empty list if none were found.
    """
    available = get_available_download_links()

    # Guard clause: nothing to tabulate.
    if not available:
        print("No download links found. Please check your internet connection.")
        return []

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("AVAILABLE MAStR DOWNLOADS")
    print(heavy_rule)
    print(f"{'#':<4} {'Date':<12} {'Version':<10} {'Type':<12} {'URL'}")
    print(light_rule)

    for position, entry in enumerate(available, start=1):
        # Turn the compact YYYYMMDD stamp into YYYY-MM-DD for readability.
        stamp = entry["date"]
        pretty_date = f"{stamp[:4]}-{stamp[4:6]}-{stamp[6:]}"
        print(
            f"{position:<4} {pretty_date:<12} {entry['version']:<10} {entry['type']:<12} {entry['url']}"
        )

    print(heavy_rule)
    print(f"Total: {len(available)} downloads available")
    print(heavy_rule)

    return available


def select_download_date():
    """
    Interactively let the user select one of the available downloads.

    The available downloads are listed via `list_available_downloads()`.
    The user is then asked to either pick an entry from that list by its
    number or to cancel. Invalid input is rejected and the prompt is repeated.

    Returns
    -------
    tuple
        (date_string, url) where date_string is in YYYYMMDD format and url is
        the download URL of the chosen entry.
        Returns (None, None) if no downloads are available, if the user
        cancels, or if standard input is closed before a selection was made
        (e.g. when running non-interactively).
    """
    links = list_available_downloads()

    if not links:
        return None, None

    print("\nOptions:")
    print("1. Select from the list above (enter the number)")
    print("2. Cancel")

    try:
        while True:
            choice = input("\nPlease enter your choice (1-2): ").strip()

            if choice == "1":
                # Keep asking until a valid list position is entered.
                while True:
                    raw_index = input(f"Enter a number (1-{len(links)}): ").strip()
                    try:
                        index = int(raw_index)
                    except ValueError:
                        print("Please enter a valid number")
                        continue

                    if 1 <= index <= len(links):
                        selected = links[index - 1]
                        print(
                            f"\nSelected: {selected['date']} (Version {selected['version']}, Type: {selected['type']})"
                        )
                        return selected["date"], selected["url"]

                    print(f"Please enter a number between 1 and {len(links)}")

            elif choice == "2":
                print("Download selection cancelled.")
                return None, None

            else:
                print("Invalid choice. Please enter 1 or 2.")
    except EOFError:
        # input() raises EOFError when stdin is closed or not interactive;
        # treat that as "no selection made" rather than crashing the caller.
        print("\nDownload selection cancelled.")
        return None, None
Loading
Loading