#!/bin/env python3
"""Generate ``gemeinden.sql`` from two downloaded data sets.

The script downloads

* the "Ortsverzeichnis 2001" PDFs (one per federal state) from statistik.at
  and extracts ``(gkz, okz, name)`` rows from their text, and
* a zip archive containing a CSV file from www.bev.gv.at, from which
  ``(kgnr, kg_name, gkz, gem_name)`` rows are read,

and writes ``INSERT`` statements for the tables ``AT_gem``, ``AT_kg`` and
``AT_ort`` into ``gemeinden.sql`` in the current working directory.
"""

from typing import BinaryIO, Dict, List, Optional, Tuple
import argparse
import os
import re
import tempfile
import zipfile

import requests
import PyPDF2


# Line that introduces a municipality in the PDF text; group 1 is the
# municipality code written as "DDD DD".
GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) X')
# Line that starts with a 5-digit locality code followed by a name.  Group 4
# is either the trailing pair of numeric columns or the empty string when the
# line ends right after the name.
ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?'
                 r'([()X0-9.]+ [()X0-9.]+ ?|$)')
# Trailing numeric columns of a locality line.
STRIP_NUM = re.compile(r'[X0-9. -]+$')
# Trailing abbreviation (optionally followed by a number in parentheses) that
# is removed from a locality name.
STRIP_CODE = re.compile(r' *(Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|'
                        r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH) *(\([0-9]+\) *)?$')

KGV_URL = 'https://www.bev.gv.at/portal/page?_pageid=713,2601283&_dad=portal&_schema=PORTAL'
# NOTE(review): this pattern is empty, so ``findall`` only ever yields empty
# strings and no download URL can be extracted.  Presumably the original
# expression was lost; it has to be restored before ``download_kgv`` can work.
KGV_LINK = re.compile(r'')

OV_URL = 'https://statistik.at/fileadmin/publications/Ortsverzeichnis_2001__{}.pdf'
OV_NAMES = ['Wien', 'Niederoesterreich', 'Oberoesterreich', 'Kaernten', 'Steiermark', 'Vorarlberg', 'Burgenland',
            'Tirol', 'Salzburg']

# Seconds to wait for a server to connect / send data before giving up.
HTTP_TIMEOUT = 60

# Replacement municipality codes for codes in the range 32400..32499
# (former district Wien-Umgebung).
WIEN_UMGEBUNG = {
    32401: 30729, 32402: 30730, 32403: 31949, 32404: 31235, 32405: 30731, 32406: 30732,
    32407: 30733, 32408: 32144, 32409: 30734, 32410: 30735, 32411: 30736, 32412: 31950,
    32413: 30737, 32415: 31951, 32416: 31952, 32417: 30738, 32418: 30739, 32419: 30740,
    32421: 31953, 32423: 31954, 32424: 30741,
}

# Replacement municipality codes after municipal reforms.
GEMEINDEN = {
    # Upper Austria
    40803: 40835, 40819: 40835, 41308: 41344, 41330: 41344, 41519: 41344, 41625: 41628,
    # Styria
    60204: 62139, 60207: 62142, 60209: 62140, 60211: 62139, 60213: 62125, 60216: 62128,
    60217: 62146, 60219: 62147, 60301: 60345, 60307: 60344, 60309: 60350, 60310: 60348,
    60313: 60345, 60315: 60349, 60327: 60347, 60330: 60348, 60331: 60349, 60336: 60347,
    60339: 60346, 60407: 62311, 60410: 62378, 60416: 62380, 60426: 61057, 60435: 62382,
    61504: 62383, 61508: 62390, 61510: 62383, 61514: 62377, 61517: 62380, 61009: 61054,
    61011: 61012, 61015: 61052, 60512: 62273, 61034: 61052, 61036: 61061, 61041: 61050,
    61602: 61631, 61609: 61630, 61104: 61107, 61620: 61628, 60601: 60665, 60603: 60666,
    60607: 60664, 60614: 60664, 60620: 60661, 60634: 60619, 60658: 60660, 61706: 61711,
    61712: 61759, 61713: 62266, 61720: 61763, 61721: 61727, 60703: 62272, 60706: 62211,
    60708: 62270, 61218: 61222, 61223: 61260, 61734: 61763, 61737: 61762, 61226: 61254,
    60723: 62275, 60726: 62277, 61238: 61266, 60729: 62266, 61244: 61256, 60736: 62265,
    60738: 62275, 60741: 62268, 61301: 62144, 61306: 62115, 61308: 62144, 61311: 62143,
    60801: 62044, 61314: 62132, 60804: 62010, 60809: 62048, 60810: 62044, 60814: 62044,
    60821: 62042, 60824: 62039, 61411: 61439, 60907: 62034, 60417: 62380, 60422: 62378,
    60425: 62381, 60438: 60668, 60442: 62380, 60449: 62381, 61505: 62383, 61509: 62335,
    61520: 61060, 60501: 62267, 60502: 62206, 60504: 62267, 60507: 62269, 60509: 62273,
    60511: 62232, 61022: 61054, 60514: 62267, 61028: 61049, 61040: 61059, 61046: 61059,
    61047: 61059, 61048: 61057, 61601: 61626, 61607: 61630, 61613: 61627, 61117: 61120,
    60606: 60661, 60622: 60664, 60625: 60668, 60652: 60670, 61201: 61254, 60702: 62205,
    60705: 62277, 60711: 62266, 60715: 62272, 60717: 62274, 60210: 62142, 61235: 61267,
    61753: 61765, 61242: 61265, 61754: 61757, 60732: 62245, 61755: 61766, 60748: 62266,
    61305: 62141, 61315: 62145, 61316: 62145, 60302: 60345, 60822: 62048, 60320: 60344,
    60322: 60345, 60325: 60350, 60333: 60350, 60334: 60350, 60338: 60344, 60343: 62380,
    61417: 61445, 60403: 62386, 60411: 62389, 60739: 62266, 60644: 60669, 60409: 62314,
}

# (kgnr, kg_name, gkz, gem_name) as read from the CSV file.
KgvRow = Tuple[int, str, int, str]
# (gkz, okz, name) as extracted from the PDF text.
OvRow = Tuple[int, int, str]


def get_kvg_zip_url() -> str:
    """Return the first ``KGV_LINK`` match found in the page at ``KGV_URL``.

    Raises:
        RuntimeError: if the page does not answer with status 200, or if no
            non-empty match is found.
    """
    r = requests.get(KGV_URL, timeout=HTTP_TIMEOUT)
    if r.status_code != 200:
        raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
    matches = KGV_LINK.findall(r.text)
    # An empty match is useless as a URL; report it here instead of letting
    # the subsequent download fail with an unrelated error.
    if not matches or not matches[0]:
        raise RuntimeError('Unable to find url of zip file')
    return matches[0]


def download_kgv() -> List[KgvRow]:
    """Download the zip archive and parse the first ``.csv`` file inside it.

    The first line of the CSV file is skipped as a header.  Fields are
    separated by ``;``; a field that starts with ``"`` has its first and last
    character removed, every other field is converted with ``int``.  Columns
    0, 1, 3 and 4 of each line form the returned row.

    Raises:
        RuntimeError: if the download does not answer with status 200 or the
            archive contains no ``.csv`` file.
    """
    with tempfile.NamedTemporaryFile() as f:
        with requests.get(get_kvg_zip_url(), stream=True, timeout=HTTP_TIMEOUT) as r:
            if r.status_code != 200:
                raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

        rows = []
        # The archive has to be read while the temporary file still exists.
        with zipfile.ZipFile(f, 'r') as zip_file:
            files = [name for name in zip_file.namelist() if name.endswith('.csv')]
            if not files:
                raise RuntimeError('Unable to find csv file in zip')
            with zip_file.open(files[0], 'r') as csv_file:
                for index, r_line in enumerate(csv_file):
                    if index == 0:
                        # header line
                        continue
                    line = r_line.decode('utf8').rstrip()
                    if not line:
                        # A blank line has no fields to parse.
                        continue
                    row = [c[1:-1] if c[0] == '"' else int(c) for c in line.split(';')]
                    rows.append((int(row[0]), str(row[1]), int(row[3]), str(row[4])))
        return rows


# Filled by download_ov_land(): municipality code (after remapping) -> list of
# names taken from that municipality's "Katastralgemeinden:" line.
GKZ: Dict[Optional[int], List[str]] = {}


def _ort_row(gkz: Optional[int], val: str) -> OvRow:
    """Build an output row from *val*, a "<5-digit code> <name>" string."""
    return gkz, int(val[:5]), STRIP_CODE.sub('', val[6:])


def _remap_gkz(gkz: int) -> int:
    """Translate a municipality code from the PDF to the code used for output."""
    if gkz > 90000:
        # Vienna: every code above 90000 becomes 90001.
        return 90001
    if 32400 <= gkz <= 32499:
        # Former district Wien-Umgebung; raises KeyError for unmapped codes.
        return WIEN_UMGEBUNG[gkz]
    # Municipal reforms (Upper Austria, Styria); other codes stay unchanged.
    return GEMEINDEN.get(gkz, gkz)


def download_ov_land(bundesland: str) -> List[OvRow]:
    """Download one Ortsverzeichnis PDF and extract its locality rows.

    Pages are only evaluated between a short page whose text (ignoring
    spaces) is ``Ortsverzeichnis`` and a short page whose text is
    ``ALPHABETISCHEVERZEICHNISSE``.  The text of every evaluated page is also
    written to ``out/<bundesland>.<page>.txt``.  As a side effect the
    module-level ``GKZ`` mapping is filled from "Katastralgemeinden:" lines.

    Args:
        bundesland: value inserted into ``OV_URL`` (see ``OV_NAMES``).

    Returns:
        ``(gkz, okz, name)`` rows in the order they appear in the document.

    Raises:
        RuntimeError: if the download does not answer with status 200.
    """
    rows = []
    with tempfile.NamedTemporaryFile() as f:
        with requests.get(OV_URL.format(bundesland), stream=True, timeout=HTTP_TIMEOUT) as r:
            if r.status_code != 200:
                raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

        # The page dumps below go into this directory; create it if missing.
        os.makedirs('out', exist_ok=True)

        # NOTE(review): PdfFileReader / getPageNumber / extractText are the
        # names this file was written against; confirm they exist in the
        # installed PyPDF2 version.
        pdf = PyPDF2.PdfFileReader(f)
        gkz = None    # code of the municipality the current lines belong to
        last = None   # locality line whose name may continue on the next line
        valid = False  # True once the start marker page has been seen
        for page in pdf.pages:
            page_num = pdf.getPageNumber(page)
            text = page.extractText()
            if len(text) < 100:
                marker = text.strip().replace(' ', '')
                if marker == 'Ortsverzeichnis':
                    valid = True
                    continue
                elif valid and marker == 'ALPHABETISCHEVERZEICHNISSE':
                    break
            if not valid:
                continue

            with open(f'out/{bundesland}.{page_num}.txt', 'w+', encoding='utf-8') as o:
                o.write(text)

            for line in text.splitlines():
                m1 = ORT.match(line)
                m2 = GEM.match(line)

                if last is not None:
                    if m1 is None:
                        # Not a new locality line: treat it as the rest of the
                        # pending one and parse the joined text.
                        # NOTE(review): ORT.match() may return None here if the
                        # joined text no longer matches; that raises
                        # AttributeError.
                        last = f'{last} {line}'
                        val = STRIP_NUM.sub('', ORT.match(last).group(0))
                        rows.append(_ort_row(gkz, val))
                        last = None
                        continue
                    else:
                        # A new locality line follows directly, so the pending
                        # one was complete as it stood.
                        rows.append(_ort_row(gkz, last))
                        last = None

                if 'Katastralgemeinden:' in line:
                    p1 = line.find('Katastralgemeinden:')
                    p2 = line.find('Postleitzahl')
                    # +20 skips the 19-character label plus one more
                    # character; the last two space-separated tokens of every
                    # entry are dropped.
                    val = [' '.join(a.split(' ')[:-2]) for a in line[p1 + 20:p2].split(', ')]
                    GKZ[gkz] = val
                    continue

                if m1:
                    val = STRIP_NUM.sub('', m1.group(0))
                    if m1.group(4) == '':
                        # No numeric columns on this line: decide with the
                        # next line whether the name continues.
                        last = val
                    else:
                        rows.append(_ort_row(gkz, val))
                elif m2:
                    gkz = _remap_gkz(int(m2.group(1).replace(' ', '')))

        if last is not None:
            # The document ended while a locality line was still pending;
            # emit it the same way as when another locality line follows.
            rows.append(_ort_row(gkz, last))
    return rows


def download_ov() -> List[OvRow]:
    """Return the concatenated rows of all PDFs listed in ``OV_NAMES``."""
    rows = []
    for name in OV_NAMES:
        rows += download_ov_land(name)
    return rows


def _sql_str(value: str) -> str:
    """Return *value* as a single-quoted SQL literal with ``'`` doubled."""
    return "'" + value.replace("'", "''") + "'"


def _write_insert(f: BinaryIO, table: str, values: List[str]) -> None:
    """Write one multi-row ``INSERT`` statement; write nothing if *values* is empty."""
    if not values:
        return
    f.write(f"\nINSERT INTO {table} VALUES\n".encode('utf8'))
    f.write(',\n'.join(values).encode('utf8'))
    f.write(b';\n')


def write_sql(kgv_rows: List[KgvRow], ov_rows: List[OvRow]) -> None:
    """Write ``gemeinden.sql`` and print diagnostics for unmatched codes.

    Three statements are written (UTF-8 encoded):

    * ``AT_gem``: one ``(gkz, name)`` row per distinct gkz in *kgv_rows*,
    * ``AT_kg``: one ``(kgnr, gkz, name)`` row per entry of *kgv_rows*,
    * ``AT_ort``: one ``(okz, kgnr or NULL, name)`` row per entry of
      *ov_rows* whose gkz occurs in *kgv_rows*.  The kgnr is that of the last
      KG of the same municipality whose name contains, or is contained in,
      the locality name.

    Single quotes inside names are doubled so the generated literals stay
    valid.  A statement is omitted when it would have no rows.

    Afterwards, for every gkz of *ov_rows* that is missing from *kgv_rows*, a
    candidate replacement is printed if exactly one is found; finally the set
    of codes without a candidate is printed.

    Args:
        kgv_rows: ``(kgnr, kg_name, gkz, gem_name)`` rows.
        ov_rows: ``(gkz, okz, name)`` rows.
    """
    kgv = {kgnr: (kg_name, gkz, gem_name) for kgnr, kg_name, gkz, gem_name in kgv_rows}
    # For a gkz that occurs several times the last gem_name wins, while the
    # position of its first occurrence determines the output order.
    gem_names = {gkz: gem_name for _, _, gkz, gem_name in kgv_rows}
    kg_by_gkz: Dict[int, List[int]] = {gkz: [] for gkz in gem_names}

    gem_values = [f"({gkz:5}, {_sql_str(name)})" for gkz, name in gem_names.items()]

    kg_values = []
    for kgnr, name, gkz, _ in kgv_rows:
        kg_by_gkz[gkz].append(kgnr)
        kg_values.append(f"({kgnr:5}, {gkz:5}, {_sql_str(name)})")

    ort_values = []
    pr = set()  # gkz values of ov_rows that are unknown to kgv_rows
    for gkz, okz, name in ov_rows:
        if gkz not in kg_by_gkz:
            pr.add(gkz)
            continue
        kgnr_o = None
        for kgnr in kg_by_gkz[gkz]:
            kg_name = kgv[kgnr][0]
            if kg_name in name or name in kg_name:
                kgnr_o = kgnr
        ort_values.append(f"({okz:5}, {kgnr_o if kgnr_o is not None else 'NULL':5}, {_sql_str(name)})")

    with open('gemeinden.sql', 'wb') as f:
        _write_insert(f, 'AT_gem', gem_values)
        _write_insert(f, 'AT_kg', kg_values)
        _write_insert(f, 'AT_ort', ort_values)

    p = set()  # unknown gkz values for which a candidate was printed
    for e in pr:
        possible = filter(lambda a: 60000 <= a[2] < 70000, kgv_rows)
        # A municipality without a recorded "Katastralgemeinden:" line simply
        # contributes no name filter.
        for name in GKZ.get(e, []):
            if name == '':
                continue
            # NOTE(review): filter() is lazy and the lambda reads `name` when
            # list() runs below, so every stacked filter compares against the
            # LAST value of `name`.  Kept unchanged because the intended
            # matching rule is not evident from this file.
            possible = filter(lambda a: a[1] == name, possible)
        possible = list(possible)
        if len(possible) == 1:
            print(f' {e}: {possible[0][2]},')
            p.add(e)
    print(pr - p)


def main() -> None:
    """Download both data sets and write ``gemeinden.sql``."""
    parser = argparse.ArgumentParser()
    parser.parse_args()

    print('Downloading and parsing Ortsverzeichnis from statistik.at')
    ov_data = download_ov()
    print('Downloading Katastralgemeindenverzeichnis from www.bev.gv.at')
    kgv_data = download_kgv()
    write_sql(kgv_data, ov_data)
    print('Successfully created gemeinden.sql!')


if __name__ == '__main__':
    main()