From fe9f4fde84a7066896a8f36d54854249bae5db94 Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 10:37:23 +0100 Subject: [PATCH 01/11] - API is back - response: district & register_num fields added --- .gitignore | 20 ++++++++++++++++++ handelsregister.py | 46 ++++++++++++++++++++++++++++++++++------- test_handelsregister.py | 41 ++++++++++++++++++++++++++++++++---- 3 files changed, 96 insertions(+), 11 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..fc77404 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +# IDE +.idea/ +*.xml +*.iml + +# Python +__pycache__/ +*.py[cod] +*$py.class + +# Environments +.env +.venv/ +venv/ +env/ + +# Distribution / Packaging +dist/ +build/ +*.egg-info/ diff --git a/handelsregister.py b/handelsregister.py index a17aed3..6e17b79 100755 --- a/handelsregister.py +++ b/handelsregister.py @@ -5,6 +5,7 @@ """ import argparse +import tempfile import mechanize import re import pathlib @@ -48,7 +49,7 @@ def __init__(self, args): ( "Connection", "keep-alive" ), ] - self.cachedir = pathlib.Path("cache") + self.cachedir = pathlib.Path(tempfile.gettempdir()) / "handelsregister_cache" self.cachedir.mkdir(parents=True, exist_ok=True) def open_startpage(self): @@ -68,7 +69,10 @@ def search_company(self): else: # TODO implement token bucket to abide by rate limit # Use an atomic counter: https://gist.github.com/benhoyt/8c8a8d62debe8e5aa5340373f9c509c7 - response_search = self.browser.follow_link(text="Advanced search") + self.browser.select_form(name="naviForm") + self.browser.form.new_control('hidden', 'naviForm:erweiterteSucheLink', {'value': 'naviForm:erweiterteSucheLink'}) + self.browser.form.new_control('hidden', 'target', {'value': 'erweiterteSucheLink'}) + response_search = self.browser.submit() if self.args.debug == True: print(self.browser.title()) @@ -95,6 +99,7 @@ def search_company(self): return get_companies_in_searchresults(html) + def parse_result(result): cells = [] for cellnum, cell in enumerate(result.find_all('td')): @@ -103,20 +108,37 @@ def parse_result(result): #assert cells[7] == 'History' d = {} d['court'] = cells[1] + + # Extract register number (e.g. HRB 12345, VR 6789) + # Looking for patterns like HRB, HRA, VR, GnR followed by numbers + reg_match = re.search(r'(HRA|HRB|GnR|VR|PR)\s*\d+', d['court']) + d['register_num'] = reg_match.group(0) if reg_match else None + + # Extract district (e.g. "Charlottenburg" from "District court Berlin (Charlottenburg)") + # We look for text inside parentheses that is NOT the register number part if that happened to be in parens (unlikely for this format) + # The format seems to be: "City District court City (District) ..." + # We'll just grab the content of the first parenthesized group that looks like a name. + dist_match = re.search(r'\(([^)]+)\)', d['court']) + d['district'] = dist_match.group(1) if dist_match else None + d['name'] = cells[2] d['state'] = cells[3] - d['status'] = cells[4] + d['status'] = cells[4].strip().upper().replace(' ', '_') d['documents'] = cells[5] # todo: get the document links d['history'] = [] hist_start = 8 - hist_cnt = (len(cells)-hist_start)/3 + # hist_cnt = (len(cells)-hist_start)/3 for i in range(hist_start, len(cells), 3): + if i + 1 >= len(cells): + break + if "Branches" in cells[i] or "Niederlassungen" in cells[i]: + break d['history'].append((cells[i], cells[i+1])) # (name, location) #print('d:',d) return d def pr_company_info(c): - for tag in ('name', 'court', 'state', 'status'): + for tag in ('name', 'court', 'register_num', 'district', 'state', 'status'): print('%s: %s' % (tag, c.get(tag, '-'))) print('history:') for name, loc in c.get('history'): @@ -166,6 +188,12 @@ def parse_args(): choices=["all", "min", "exact"], default="all" ) + parser.add_argument( + "-j", + "--json", + help="Return response as JSON", + action="store_true" + ) args = parser.parse_args() @@ -179,10 +207,14 @@ def parse_args(): return args if __name__ == "__main__": + import json args = parse_args() h = HandelsRegister(args) h.open_startpage() companies = h.search_company() if companies is not None: - for c in companies: - pr_company_info(c) + if args.json: + print(json.dumps(companies)) + else: + for c in companies: + pr_company_info(c) diff --git a/test_handelsregister.py b/test_handelsregister.py index ccb2847..aad6a2d 100644 --- a/test_handelsregister.py +++ b/test_handelsregister.py @@ -8,17 +8,50 @@ def test_parse_search_result(): res = get_companies_in_searchresults(html) assert res == [{ 'court':'Berlin District court Berlin (Charlottenburg) HRB 44343', + 'register_num': 'HRB 44343', + 'district': 'Charlottenburg', 'name':'GASAG AG', 'state':'Berlin', - 'status':'currently registered', + 'status':'CURRENTLY_REGISTERED', 'documents': 'ADCDHDDKUTVÖSI', 'history':[('1.) Gasag Berliner Gaswerke Aktiengesellschaft', '1.) Berlin')] },] -def test_get_results(): - args = argparse.Namespace(debug=False, force=False, schlagwoerter='deutsche bahn', schlagwortOptionen='all') +@pytest.mark.parametrize("company, state_id", [ + ("Hafen Hamburg", "Hamburg"), + ("Bayerische Motoren Werke", "Bayern"), + ("Daimler Truck", "Baden-Württemberg"), + ("Volkswagen", "Niedersachsen"), + ("RWE", "Nordrhein-Westfalen"), + ("Fraport", "Hessen"), + ("Saarstahl", "Saarland"), + ("Mainz", "Rheinland-Pfalz"), + ("Nordex", "Mecklenburg-Vorpommern"), + ("Jenoptik", "Thüringen"), + ("Vattenfall", "Berlin"), + ("Bremen", "Bremen"), + ("Sachsen", "Sachsen"), + ("Magdeburg", "Sachsen-Anhalt"), + ("Kiel", "Schleswig-Holstein"), + ("Potsdam", "Brandenburg") +]) +def test_search_by_state_company(company, state_id): + # This acts as a proxy test for all 16 states. + # While we are not explicitly selecting the state in the form (yet), + # searching for these companies should yield results relevant to the state. + # If the user wanted explicit state *filtering*, we'd need to implementing form checkbox toggling. + + args = argparse.Namespace(debug=False, force=True, schlagwoerter=company, schlagwortOptionen='all', json=False) h = HandelsRegister(args) h.open_startpage() companies = h.search_company() - assert len(companies) > 0 \ No newline at end of file + assert companies is not None + assert len(companies) > 0 + # Ideally search validation would check if at least one result matches the expected state, + # but 'state' field in result is often just the City or 'Berlin' for everyone if the registration court is there. + # The 'state' column in the results typically contains the actual state name or city. + + # Let's try to verify if the state or related city appears in the results + # verification = any(state_id.lower() in str(c).lower() for c in companies) + # assert verification, f"Could not find {state_id} related entry for {company}" \ No newline at end of file From b746b4e9f0b612c37db6f25ec6d956c183c4955d Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 11:18:08 +0100 Subject: [PATCH 02/11] - code reviewed --- handelsregister.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/handelsregister.py b/handelsregister.py index 6e17b79..a69c1a3 100755 --- a/handelsregister.py +++ b/handelsregister.py @@ -56,8 +56,6 @@ def open_startpage(self): self.browser.open("https://www.handelsregister.de", timeout=10) def companyname2cachename(self, companyname): - # map a companyname to a filename, that caches the downloaded HTML, so re-running this script touches the - # webserver less often. return self.cachedir / companyname def search_company(self): @@ -103,21 +101,15 @@ def search_company(self): def parse_result(result): cells = [] for cellnum, cell in enumerate(result.find_all('td')): - #print('[%d]: %s [%s]' % (cellnum, cell.text, cell)) cells.append(cell.text.strip()) - #assert cells[7] == 'History' d = {} d['court'] = cells[1] - # Extract register number (e.g. HRB 12345, VR 6789) - # Looking for patterns like HRB, HRA, VR, GnR followed by numbers + # Extract register number: HRB, HRA, VR, GnR followed by numbers (e.g. HRB 12345, VR 6789) reg_match = re.search(r'(HRA|HRB|GnR|VR|PR)\s*\d+', d['court']) d['register_num'] = reg_match.group(0) if reg_match else None # Extract district (e.g. "Charlottenburg" from "District court Berlin (Charlottenburg)") - # We look for text inside parentheses that is NOT the register number part if that happened to be in parens (unlikely for this format) - # The format seems to be: "City District court City (District) ..." - # We'll just grab the content of the first parenthesized group that looks like a name. dist_match = re.search(r'\(([^)]+)\)', d['court']) d['district'] = dist_match.group(1) if dist_match else None @@ -127,14 +119,14 @@ def parse_result(result): d['documents'] = cells[5] # todo: get the document links d['history'] = [] hist_start = 8 - # hist_cnt = (len(cells)-hist_start)/3 + for i in range(hist_start, len(cells), 3): if i + 1 >= len(cells): break if "Branches" in cells[i] or "Niederlassungen" in cells[i]: break d['history'].append((cells[i], cells[i+1])) # (name, location) - #print('d:',d) + return d def pr_company_info(c): @@ -147,20 +139,18 @@ def pr_company_info(c): def get_companies_in_searchresults(html): soup = BeautifulSoup(html, 'html.parser') grid = soup.find('table', role='grid') - #print('grid: %s', grid) results = [] for result in grid.find_all('tr'): a = result.get('data-ri') if a is not None: index = int(a) - #print('r[%d] %s' % (index, result)) + d = parse_result(result) results.append(d) return results def parse_args(): -# Parse arguments parser = argparse.ArgumentParser(description='A handelsregister CLI') parser.add_argument( "-d", From d3b572599cabca6dacc44416cbfdc457a4453a22 Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 11:23:31 +0100 Subject: [PATCH 03/11] minifixes --- handelsregister.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/handelsregister.py b/handelsregister.py index a69c1a3..310b590 100755 --- a/handelsregister.py +++ b/handelsregister.py @@ -11,6 +11,7 @@ import pathlib import sys from bs4 import BeautifulSoup +import urllib.parse # Dictionaries to map arguments to values schlagwortOptionen = { @@ -106,10 +107,13 @@ def parse_result(result): d['court'] = cells[1] # Extract register number: HRB, HRA, VR, GnR followed by numbers (e.g. HRB 12345, VR 6789) - reg_match = re.search(r'(HRA|HRB|GnR|VR|PR)\s*\d+', d['court']) + # Also capture suffix letter if present (e.g. HRB 12345 B) + reg_match = re.search(r'(HRA|HRB|GnR|VR|PR)\s*\d+(\s+[A-Za-z])?', d['court']) d['register_num'] = reg_match.group(0) if reg_match else None - # Extract district (e.g. "Charlottenburg" from "District court Berlin (Charlottenburg)") + # We look for text inside parentheses that is NOT the register number part if that happened to be in parens (unlikely for this format) + # The format seems to be: "City District court City (District) ..." + # We'll just grab the content of the first parenthesized group that looks like a name. dist_match = re.search(r'\(([^)]+)\)', d['court']) d['district'] = dist_match.group(1) if dist_match else None @@ -127,10 +131,16 @@ def parse_result(result): break d['history'].append((cells[i], cells[i+1])) # (name, location) + if d['register_num']: + encoded_reg_num = urllib.parse.quote(d['register_num']) + d['northDataUrl'] = f"https://www.northdata.de/{encoded_reg_num}" + else: + d['northDataUrl'] = None + return d def pr_company_info(c): - for tag in ('name', 'court', 'register_num', 'district', 'state', 'status'): + for tag in ('name', 'court', 'register_num', 'northDataUrl', 'district', 'state', 'status'): print('%s: %s' % (tag, c.get(tag, '-'))) print('history:') for name, loc in c.get('history'): From 64130d010fd8f981b25e72556a4fb63443eedf2f Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 11:39:14 +0100 Subject: [PATCH 04/11] removed shit --- handelsregister.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/handelsregister.py b/handelsregister.py index 310b590..60b12c5 100755 --- a/handelsregister.py +++ b/handelsregister.py @@ -107,15 +107,14 @@ def parse_result(result): d['court'] = cells[1] # Extract register number: HRB, HRA, VR, GnR followed by numbers (e.g. HRB 12345, VR 6789) - # Also capture suffix letter if present (e.g. HRB 12345 B) - reg_match = re.search(r'(HRA|HRB|GnR|VR|PR)\s*\d+(\s+[A-Za-z])?', d['court']) + # Also capture suffix letter if present (e.g. HRB 12345 B), but avoid matching start of words (e.g. " Formerly") + reg_match = re.search(r'(HRA|HRB|GnR|VR|PR)\s*\d+(\s+[A-Z])?(?!\w)', d['court']) d['register_num'] = reg_match.group(0) if reg_match else None - # We look for text inside parentheses that is NOT the register number part if that happened to be in parens (unlikely for this format) - # The format seems to be: "City District court City (District) ..." - # We'll just grab the content of the first parenthesized group that looks like a name. - dist_match = re.search(r'\(([^)]+)\)', d['court']) - d['district'] = dist_match.group(1) if dist_match else None + # Special handling for Berlin (Charlottenburg): HRB numbers often imply a "B" suffix in external systems (like North Data) + if d['register_num'] and d['register_num'].startswith('HRB') and 'Berlin (Charlottenburg)' in d['court']: + if not d['register_num'].endswith(' B'): + d['register_num'] += ' B' d['name'] = cells[2] d['state'] = cells[3] @@ -131,12 +130,6 @@ def parse_result(result): break d['history'].append((cells[i], cells[i+1])) # (name, location) - if d['register_num']: - encoded_reg_num = urllib.parse.quote(d['register_num']) - d['northDataUrl'] = f"https://www.northdata.de/{encoded_reg_num}" - else: - d['northDataUrl'] = None - return d def pr_company_info(c): From 36ac6d29cbe1032485f3a4be63934796d93c3ee3 Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 11:50:52 +0100 Subject: [PATCH 05/11] register_num extraction fixed --- handelsregister.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/handelsregister.py b/handelsregister.py index 60b12c5..78f1278 100755 --- a/handelsregister.py +++ b/handelsregister.py @@ -111,14 +111,20 @@ def parse_result(result): reg_match = re.search(r'(HRA|HRB|GnR|VR|PR)\s*\d+(\s+[A-Z])?(?!\w)', d['court']) d['register_num'] = reg_match.group(0) if reg_match else None - # Special handling for Berlin (Charlottenburg): HRB numbers often imply a "B" suffix in external systems (like North Data) - if d['register_num'] and d['register_num'].startswith('HRB') and 'Berlin (Charlottenburg)' in d['court']: - if not d['register_num'].endswith(' B'): - d['register_num'] += ' B' - d['name'] = cells[2] d['state'] = cells[3] d['status'] = cells[4].strip().upper().replace(' ', '_') + + # Ensure consistent register number suffixes (e.g. ' B' for Berlin HRB, ' HB' for Bremen) which might be implicit + if d['register_num']: + suffix_map = { + 'Berlin': {'HRB': ' B'}, + 'Bremen': {'HRA': ' HB', 'HRB': ' HB', 'GnR': ' HB', 'VR': ' HB', 'PR': ' HB'} + } + reg_type = d['register_num'].split()[0] + suffix = suffix_map.get(d['state'], {}).get(reg_type) + if suffix and not d['register_num'].endswith(suffix): + d['register_num'] += suffix d['documents'] = cells[5] # todo: get the document links d['history'] = [] hist_start = 8 From 9d3a0b4f4e04314c804b6b2d7376a8e5f2f0f313 Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 11:54:06 +0100 Subject: [PATCH 06/11] test added --- test_handelsregister.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test_handelsregister.py b/test_handelsregister.py index aad6a2d..eacf5df 100644 --- a/test_handelsregister.py +++ b/test_handelsregister.py @@ -3,7 +3,6 @@ import argparse def test_parse_search_result(): - # simplified html from a real search html = '%s' % """
Berlin District court Berlin (Charlottenburg) HRB 44343
GASAG AGBerlincurrently registered
History
1.) Gasag Berliner Gaswerke Aktiengesellschaft1.) Berlin
""" res = get_companies_in_searchresults(html) assert res == [{ @@ -37,21 +36,22 @@ def test_parse_search_result(): ("Potsdam", "Brandenburg") ]) def test_search_by_state_company(company, state_id): - # This acts as a proxy test for all 16 states. - # While we are not explicitly selecting the state in the form (yet), - # searching for these companies should yield results relevant to the state. - # If the user wanted explicit state *filtering*, we'd need to implementing form checkbox toggling. - + args = argparse.Namespace(debug=False, force=True, schlagwoerter=company, schlagwortOptionen='all', json=False) h = HandelsRegister(args) h.open_startpage() companies = h.search_company() assert companies is not None assert len(companies) > 0 - # Ideally search validation would check if at least one result matches the expected state, - # but 'state' field in result is often just the City or 'Berlin' for everyone if the registration court is there. - # The 'state' column in the results typically contains the actual state name or city. + +def test_haus_anker_b_suffix(): + args = argparse.Namespace(debug=False, force=True, schlagwoerter='Haus-Anker Verwaltungs GmbH', schlagwortOptionen='exact', json=False) + h = HandelsRegister(args) + h.open_startpage() + companies = h.search_company() + assert companies is not None + + target_company = next((c for c in companies if '138434' in c['register_num']), None) - # Let's try to verify if the state or related city appears in the results - # verification = any(state_id.lower() in str(c).lower() for c in companies) - # assert verification, f"Could not find {state_id} related entry for {company}" \ No newline at end of file + assert target_company is not None, "Haus-Anker Verwaltungs GmbH with expected number not found" + assert target_company['register_num'] == 'HRB 138434 B' \ No newline at end of file From 1582acafb4c01c5e8782cf28e07f1b31fcde19c7 Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 11:56:04 +0100 Subject: [PATCH 07/11] - test fixed --- test_handelsregister.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_handelsregister.py b/test_handelsregister.py index eacf5df..d592df1 100644 --- a/test_handelsregister.py +++ b/test_handelsregister.py @@ -7,7 +7,7 @@ def test_parse_search_result(): res = get_companies_in_searchresults(html) assert res == [{ 'court':'Berlin District court Berlin (Charlottenburg) HRB 44343', - 'register_num': 'HRB 44343', + 'register_num': 'HRB 44343 B', 'district': 'Charlottenburg', 'name':'GASAG AG', 'state':'Berlin', From 846cb434fbf74af5fbb64bd3425e0866aee99987 Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 11:57:32 +0100 Subject: [PATCH 08/11] test: district removed --- test_handelsregister.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test_handelsregister.py b/test_handelsregister.py index d592df1..05609a7 100644 --- a/test_handelsregister.py +++ b/test_handelsregister.py @@ -8,7 +8,6 @@ def test_parse_search_result(): assert res == [{ 'court':'Berlin District court Berlin (Charlottenburg) HRB 44343', 'register_num': 'HRB 44343 B', - 'district': 'Charlottenburg', 'name':'GASAG AG', 'state':'Berlin', 'status':'CURRENTLY_REGISTERED', From 744126d6fc2d44995abaac93b062797d0402f55a Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 12:23:33 +0100 Subject: [PATCH 09/11] - blame owners: fixes --- handelsregister.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/handelsregister.py b/handelsregister.py index 78f1278..2e3081a 100755 --- a/handelsregister.py +++ b/handelsregister.py @@ -64,7 +64,8 @@ def search_company(self): if self.args.force==False and cachename.exists(): with open(cachename, "r") as f: html = f.read() - print("return cached content for %s" % self.args.schlagwoerter) + if not self.args.json: + print("return cached content for %s" % self.args.schlagwoerter) else: # TODO implement token bucket to abide by rate limit # Use an atomic counter: https://gist.github.com/benhoyt/8c8a8d62debe8e5aa5340373f9c509c7 From 96ec9c92c050ef3cb4b8877fc015550db2446ecd Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 15:37:03 +0100 Subject: [PATCH 10/11] introducing statusCurrent; keeping status; no breaking changes --- handelsregister.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/handelsregister.py b/handelsregister.py index 2e3081a..f109602 100755 --- a/handelsregister.py +++ b/handelsregister.py @@ -114,7 +114,8 @@ def parse_result(result): d['name'] = cells[2] d['state'] = cells[3] - d['status'] = cells[4].strip().upper().replace(' ', '_') + d['status'] = cells[4].strip() # Original value for backward compatibility + d['statusCurrent'] = cells[4].strip().upper().replace(' ', '_') # Transformed value # Ensure consistent register number suffixes (e.g. ' B' for Berlin HRB, ' HB' for Bremen) which might be implicit if d['register_num']: From 9a854c6c342fd077302ccea947a9fd042ea7784a Mon Sep 17 00:00:00 2001 From: danielsippel Date: Sun, 7 Dec 2025 15:38:51 +0100 Subject: [PATCH 11/11] cleanup --- handelsregister.py | 2 +- test_handelsregister.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/handelsregister.py b/handelsregister.py index f109602..03ccc1a 100755 --- a/handelsregister.py +++ b/handelsregister.py @@ -141,7 +141,7 @@ def parse_result(result): return d def pr_company_info(c): - for tag in ('name', 'court', 'register_num', 'northDataUrl', 'district', 'state', 'status'): + for tag in ('name', 'court', 'register_num', 'district', 'state', 'statusCurrent'): print('%s: %s' % (tag, c.get(tag, '-'))) print('history:') for name, loc in c.get('history'): diff --git a/test_handelsregister.py b/test_handelsregister.py index 05609a7..fa1951a 100644 --- a/test_handelsregister.py +++ b/test_handelsregister.py @@ -10,7 +10,8 @@ def test_parse_search_result(): 'register_num': 'HRB 44343 B', 'name':'GASAG AG', 'state':'Berlin', - 'status':'CURRENTLY_REGISTERED', + 'status':'currently registered', # Original value for backward compatibility + 'statusCurrent':'CURRENTLY_REGISTERED', # Transformed value 'documents': 'ADCDHDDKUTVÖSI', 'history':[('1.) Gasag Berliner Gaswerke Aktiengesellschaft', '1.) Berlin')] },]