Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# IDE
.idea/
*.xml
*.iml

# Python
__pycache__/
*.py[cod]
*$py.class

# Environments
.env
.venv/
venv/
env/

# Distribution / Packaging
dist/
build/
*.egg-info/
65 changes: 49 additions & 16 deletions handelsregister.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,13 @@
"""

import argparse
import tempfile
import mechanize
import re
import pathlib
import sys
from bs4 import BeautifulSoup
import urllib.parse

# Dictionaries to map arguments to values
schlagwortOptionen = {
Expand Down Expand Up @@ -48,27 +50,29 @@ def __init__(self, args):
( "Connection", "keep-alive" ),
]

self.cachedir = pathlib.Path("cache")
self.cachedir = pathlib.Path(tempfile.gettempdir()) / "handelsregister_cache"
self.cachedir.mkdir(parents=True, exist_ok=True)

def open_startpage(self):
    """Open the handelsregister.de start page with this instance's browser.

    The request is issued with ``timeout=10``.
    """
    start_url = "https://www.handelsregister.de"
    self.browser.open(start_url, timeout=10)

def companyname2cachename(self, companyname):
    """Map a company name to the cache file holding its downloaded HTML.

    Caching the HTML means re-running the script touches the webserver
    less often.

    The name is percent-encoded into a single path component. Without
    that, a search term containing a path separator, a leading "/" or
    ".." would point outside the cache directory (or into a subdirectory
    that does not exist). Spaces are kept as-is so that cache files for
    plain names keep their existing filenames.

    Args:
        companyname: The search term used as cache key.

    Returns:
        A path directly inside ``self.cachedir``.
    """
    encoded = urllib.parse.quote(companyname, safe=" ")
    if encoded in ("", ".", ".."):
        # These would resolve to the cache directory itself or its parent.
        encoded = encoded.replace(".", "%2E") or "%00"
    return self.cachedir / encoded

def search_company(self):
cachename = self.companyname2cachename(self.args.schlagwoerter)
if self.args.force==False and cachename.exists():
with open(cachename, "r") as f:
html = f.read()
print("return cached content for %s" % self.args.schlagwoerter)
if not self.args.json:
print("return cached content for %s" % self.args.schlagwoerter)
else:
# TODO implement token bucket to abide by rate limit
# Use an atomic counter: https://gist.github.com/benhoyt/8c8a8d62debe8e5aa5340373f9c509c7
response_search = self.browser.follow_link(text="Advanced search")
self.browser.select_form(name="naviForm")
self.browser.form.new_control('hidden', 'naviForm:erweiterteSucheLink', {'value': 'naviForm:erweiterteSucheLink'})
self.browser.form.new_control('hidden', 'target', {'value': 'erweiterteSucheLink'})
response_search = self.browser.submit()

if self.args.debug == True:
print(self.browser.title())
Expand All @@ -95,28 +99,49 @@ def search_company(self):
return get_companies_in_searchresults(html)



# Register type (HRA, HRB, GnR, VR, PR) followed by a number, e.g. "HRB 12345"
# or "VR 6789". A single-letter suffix is captured too ("HRB 12345 B"); the
# trailing lookahead keeps it from swallowing the first letter of a following
# word (e.g. " Formerly").
_REGISTER_NUM_RE = re.compile(r'(HRA|HRB|GnR|VR|PR)\s*\d+(\s+[A-Z])?(?!\w)')

# Register number suffixes that may be left implicit in the search result
# (e.g. ' B' for Berlin HRB, ' HB' for Bremen), keyed by state, then by
# register type.
_IMPLICIT_REGISTER_SUFFIXES = {
    'Berlin': {'HRB': ' B'},
    'Bremen': {'HRA': ' HB', 'HRB': ' HB', 'GnR': ' HB', 'VR': ' HB', 'PR': ' HB'},
}

# Index of the first history cell, and the number of cells per history entry.
_HISTORY_START = 8
_HISTORY_STRIDE = 3


def parse_result(result):
    """Turn one search result row into a dict describing the company.

    Args:
        result: An element whose ``find_all('td')`` yields the cells of the
            row; each cell must expose its text as ``.text``.

    Returns:
        A dict with the keys ``court``, ``register_num`` (``None`` if no
        register number is found in the court cell), ``name``, ``state``,
        ``status`` (text as shown), ``statusCurrent`` (upper-cased, spaces
        replaced by underscores), ``documents`` and ``history`` (a list of
        ``(name, location)`` tuples).

    Raises:
        IndexError: If the row has fewer than six cells.
    """
    cells = [cell.text.strip() for cell in result.find_all('td')]

    court = cells[1]
    name = cells[2]
    state = cells[3]
    status = cells[4]
    documents = cells[5]  # todo: get the document links

    register_num = None
    reg_match = _REGISTER_NUM_RE.search(court)
    if reg_match:
        register_num = reg_match.group(0)
        # Take the register type from the regex group rather than splitting
        # on whitespace, so "HRB12345" (no space) still gets its suffix.
        suffix = _IMPLICIT_REGISTER_SUFFIXES.get(state, {}).get(reg_match.group(1))
        if suffix and not register_num.endswith(suffix):
            register_num += suffix

    history = []
    # Stop one short of the end so cells[i + 1] always exists.
    for i in range(_HISTORY_START, len(cells) - 1, _HISTORY_STRIDE):
        if "Branches" in cells[i] or "Niederlassungen" in cells[i]:
            break
        history.append((cells[i], cells[i + 1]))  # (name, location)

    return {
        'court': court,
        'register_num': register_num,
        'name': name,
        'state': state,
        'status': status,  # Original value for backward compatibility
        'statusCurrent': status.upper().replace(' ', '_'),  # Transformed value
        'documents': documents,
        'history': history,
    }

def pr_company_info(c):
for tag in ('name', 'court', 'state', 'status'):
for tag in ('name', 'court', 'register_num', 'district', 'state', 'statusCurrent'):
print('%s: %s' % (tag, c.get(tag, '-')))
print('history:')
for name, loc in c.get('history'):
Expand All @@ -125,20 +150,18 @@ def pr_company_info(c):
def get_companies_in_searchresults(html):
    """Extract all companies from the HTML of a search result page.

    Args:
        html: The HTML of the result page as a string.

    Returns:
        A list with one ``parse_result`` dict per result row. The list is
        empty when the page contains no ``<table role="grid">``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    grid = soup.find('table', role='grid')
    if grid is None:
        # find() returns None when nothing matches; presumably this is a
        # page without results, so report "no companies" instead of failing
        # with an AttributeError on the next line.
        return []

    # Only rows carrying a data-ri attribute are parsed as results; other
    # <tr> elements inside the grid are skipped.
    return [
        parse_result(row)
        for row in grid.find_all('tr')
        if row.get('data-ri') is not None
    ]

def parse_args():
# Parse arguments
parser = argparse.ArgumentParser(description='A handelsregister CLI')
parser.add_argument(
"-d",
Expand Down Expand Up @@ -166,6 +189,12 @@ def parse_args():
choices=["all", "min", "exact"],
default="all"
)
parser.add_argument(
"-j",
"--json",
help="Return response as JSON",
action="store_true"
)
args = parser.parse_args()


Expand All @@ -179,10 +208,14 @@ def parse_args():
return args

if __name__ == "__main__":
    import json
    args = parse_args()
    h = HandelsRegister(args)
    h.open_startpage()
    companies = h.search_company()
    if companies is not None:
        if args.json:
            # Print only the JSON document in this mode so that stdout can
            # be parsed by another program.
            print(json.dumps(companies))
        else:
            for c in companies:
                pr_company_info(c)
43 changes: 38 additions & 5 deletions test_handelsregister.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,55 @@
import argparse

def test_parse_search_result():
    """Parse a single-result page and compare every extracted field."""
    # simplified html from a real search
    html = '<html><body>%s</body></html>' % """<table role="grid"><thead></thead><tbody id="ergebnissForm:selectedSuchErgebnisFormTable_data" class="ui-datatable-data ui-widget-content"><tr data-ri="0" class="ui-widget-content ui-datatable-even" role="row"><td role="gridcell" colspan="9" class="borderBottom3"><table id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt147" class="ui-panelgrid ui-widget" role="grid"><tbody><tr class="ui-widget-content ui-panelgrid-even borderBottom1" role="row"><td role="gridcell" class="ui-panelgrid-cell fontTableNameSize" colspan="5">Berlin <span class="fontWeightBold"> District court Berlin (Charlottenburg) HRB 44343 </span></td></tr><tr class="ui-widget-content ui-panelgrid-odd" role="row"><td role="gridcell" class="ui-panelgrid-cell paddingBottom20Px" colspan="5"><span class="marginLeft20">GASAG AG</span></td><td role="gridcell" class="ui-panelgrid-cell sitzSuchErgebnisse"><span class="verticalText ">Berlin</span></td><td role="gridcell" class="ui-panelgrid-cell" style="text-align: center;padding-bottom: 20px;"><span class="verticalText">currently registered</span></td><td role="gridcell" class="ui-panelgrid-cell textAlignLeft paddingBottom20Px" colspan="2"><div id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt160" class="ui-outputpanel ui-widget linksPanel"><script type="text/javascript" src="/rp_web/javax.faces.resource/jsf.js.xhtml?ln=javax.faces"></script><a id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:0:fade" href="#" class="dokumentList" aria-describedby="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:0:toolTipFade"><span id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:0:popupLink" class="underlinedText">AD</span></a><a id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:1:fade" href="#" class="dokumentList" aria-describedby="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:1:toolTipFade"><span id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:1:popupLink" 
class="underlinedText">CD</span></a><a id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:2:fade" href="#" class="dokumentList" aria-describedby="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:2:toolTipFade"><span id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:2:popupLink" class="underlinedText">HD</span></a><a id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:3:fade" href="#" class="dokumentList" aria-describedby="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:3:toolTipFade"><span id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:3:popupLink" class="underlinedText">DK</span></a><a id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:4:fade" href="#" class="dokumentList" aria-describedby="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:4:toolTipFade"><span id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:4:popupLink" class="underlinedText">UT</span></a><a id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:5:fade" href="#" class="dokumentList" aria-describedby="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:5:toolTipFade"><span id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:5:popupLink" class="underlinedText">VÖ</span></a><a id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:6:fade" href="#" class="dokumentList" aria-describedby="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:6:toolTipFade"><span id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt161:6:popupLink" class="underlinedText">SI</span></a></div></td></tr><tr class="ui-widget-content ui-panelgrid-even" role="row"><td role="gridcell" class="ui-panelgrid-cell" colspan="7"><table id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt172" class="ui-panelgrid ui-widget marginLeft20" role="grid"><tbody><tr class="ui-widget-content ui-panelgrid-even borderBottom1 RegPortErg_Klein" role="row"><td role="gridcell" class="ui-panelgrid-cell padding0Px">History</td></tr></tbody></table><table 
id="ergebnissForm:selectedSuchErgebnisFormTable:0:j_idt176" class="ui-panelgrid ui-widget" role="grid"><tbody><tr class="ui-widget-content" role="row"><td role="gridcell" class="ui-panelgrid-cell RegPortErg_HistorieZn marginLeft20 padding0Px" colspan="5"><span class="marginLeft20 fontSize85">1.) Gasag Berliner Gaswerke Aktiengesellschaft</span></td><td role="gridcell" class="ui-panelgrid-cell RegPortErg_SitzStatus "><span class="fontSize85">1.) Berlin</span></td><td role="gridcell" class="ui-panelgrid-cell textAlignCenter"></td></tr></tbody></table></td></tr></tbody></table></td></tr></tbody></table>"""
    res = get_companies_in_searchresults(html)
    # Each key appears exactly once; a repeated key in a dict literal is
    # silently overwritten by the last occurrence.
    assert res == [{
        'court':'Berlin District court Berlin (Charlottenburg) HRB 44343',
        'register_num': 'HRB 44343 B',
        'name':'GASAG AG',
        'state':'Berlin',
        'status':'currently registered',  # Original value for backward compatibility
        'statusCurrent':'CURRENTLY_REGISTERED',  # Transformed value
        'documents': 'ADCDHDDKUTVÖSI',
        'history':[('1.) Gasag Berliner Gaswerke Aktiengesellschaft', '1.) Berlin')]
    },]


def test_get_results():
    """Search for 'deutsche bahn' and expect at least one company.

    ``force=False`` allows a cached result page to be used. The namespace
    carries ``json=False`` because ``search_company`` reads ``args.json``
    on the cached path.
    """
    args = argparse.Namespace(debug=False, force=False, schlagwoerter='deutsche bahn', schlagwortOptionen='all', json=False)
    h = HandelsRegister(args)
    h.open_startpage()
    companies = h.search_company()
    assert companies is not None
    assert len(companies) > 0
@pytest.mark.parametrize(
    "company, state_id",
    [
        ("Hafen Hamburg", "Hamburg"),
        ("Bayerische Motoren Werke", "Bayern"),
        ("Daimler Truck", "Baden-Württemberg"),
        ("Volkswagen", "Niedersachsen"),
        ("RWE", "Nordrhein-Westfalen"),
        ("Fraport", "Hessen"),
        ("Saarstahl", "Saarland"),
        ("Mainz", "Rheinland-Pfalz"),
        ("Nordex", "Mecklenburg-Vorpommern"),
        ("Jenoptik", "Thüringen"),
        ("Vattenfall", "Berlin"),
        ("Bremen", "Bremen"),
        ("Sachsen", "Sachsen"),
        ("Magdeburg", "Sachsen-Anhalt"),
        ("Kiel", "Schleswig-Holstein"),
        ("Potsdam", "Brandenburg"),
    ],
)
def test_search_by_state_company(company, state_id):
    """Run a forced (uncached) search per keyword and expect results.

    ``state_id`` only labels the parametrized case; the test body does not
    check it against the returned companies.
    """
    search_args = argparse.Namespace(
        debug=False,
        force=True,
        schlagwoerter=company,
        schlagwortOptionen='all',
        json=False,
    )
    register = HandelsRegister(search_args)
    register.open_startpage()
    found = register.search_company()
    assert found is not None
    assert len(found) > 0

def test_haus_anker_b_suffix():
    """Check that a Berlin HRB number is reported with its ' B' suffix."""
    args = argparse.Namespace(debug=False, force=True, schlagwoerter='Haus-Anker Verwaltungs GmbH', schlagwortOptionen='exact', json=False)
    h = HandelsRegister(args)
    h.open_startpage()
    companies = h.search_company()
    # Check for None first: len(None) would raise a TypeError instead of
    # producing a clean assertion failure.
    assert companies is not None
    assert len(companies) > 0

    # register_num is None for rows without a recognizable register number;
    # fall back to '' so the substring check cannot raise.
    target_company = next((c for c in companies if '138434' in (c['register_num'] or '')), None)

    assert target_company is not None, "Haus-Anker Verwaltungs GmbH with expected number not found"
    assert target_company['register_num'] == 'HRB 138434 B'
Loading