#!/usr/bin/env python3
"""Generate ``gemeinden.sql`` from public Austrian directory data.

The script downloads the "Ortsverzeichnis 2001" PDFs (one per federal state)
from statistik.at and the Katastralgemeinden directory (a zip file containing
a CSV) from bev.gv.at, and writes ``INSERT`` statements for the tables
``AT_gem``, ``AT_kg`` and ``AT_ort`` to ``gemeinden.sql`` in the current
working directory.
"""

from typing import List, Optional, Tuple
import argparse
import csv
import io
import os
import re
import tempfile
import zipfile

import PyPDF2
import requests

# Line starting with a municipality code written as "NNN NN"; group 1 is the
# code including the embedded space.
GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) [X0-9]+')
# Line starting with a five-digit locality code followed by a name; group 4 is
# the trailing pair of number columns, or '' when the line ends after the name.
ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?'
                 r'([()X0-9.]+ [()X0-9.]+ ?|$)')
# Trailing run of digits, 'X', dots, spaces and hyphens.
STRIP_NUM = re.compile(r'[X0-9. -]+$')
# Trailing abbreviation code, optionally followed by a parenthesised number.
STRIP_CODE = re.compile(r' *(Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|'
                        r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH) *(\([0-9]+\) *)?$')

KGV_URL = 'https://www.bev.gv.at/portal/page?_pageid=713,2601283&_dad=portal&_schema=PORTAL'
# NOTE(review): the pattern is empty in this copy of the file. An empty
# pattern only produces empty matches, so get_kvg_zip_url() cannot find a
# link until the real expression is restored.
KGV_LINK = re.compile(r'')

OV_URL = 'https://statistik.at/fileadmin/publications/Ortsverzeichnis_2001__{}.pdf'
OV_NAMES = ['Wien', 'Niederoesterreich', 'Oberoesterreich', 'Kaernten', 'Steiermark', 'Vorarlberg', 'Burgenland',
            'Tirol', 'Salzburg']

# Seconds to wait for the server before giving up on an HTTP request.
REQUEST_TIMEOUT = 60

# Replacement municipality codes for the former district "Wien Umgebung"
# (codes 32400-32499).
WIEN_UMGEBUNG = {
    32401: 30729, 32402: 30730, 32403: 31949, 32404: 31235, 32405: 30731, 32406: 30732,
    32407: 30733, 32408: 32144, 32409: 30734, 32410: 30735, 32411: 30736, 32412: 31950,
    32413: 30737, 32415: 31951, 32416: 31952, 32417: 30738, 32418: 30739, 32419: 30740,
    32421: 31953, 32423: 31954, 32424: 30741,
}

# Replacement municipality codes after municipal reforms: old code -> code
# used instead.
GEMEINDEN = {
    # Tirol
    70330: 70370, 70327: 70370, 70341: 70370,
    # Oberösterreich
    40803: 40835, 40819: 40835, 41308: 41344, 41330: 41344, 41519: 41344, 41625: 41628,
    41301: 41346, 41303: 41343, 41310: 41345, 41335: 41346, 41339: 41343, 41340: 41628,
    41520: 41522,
    # Steiermark
    60204: 62139, 60207: 62142, 60209: 62140, 60211: 62139, 60213: 62125, 60216: 62128,
    60217: 62146, 60219: 62147, 60301: 60345, 60307: 60344, 60309: 60350, 60310: 60348,
    60313: 60345, 60315: 60349, 60327: 60347, 60330: 60348, 60331: 60349, 60336: 60347,
    60339: 60346, 60407: 62311, 60410: 62378, 60416: 62380, 60426: 61057, 60435: 62382,
    61504: 62383, 61508: 62390, 61510: 62383, 61514: 62377, 61517: 62380, 61009: 61054,
    61011: 61012, 61015: 61052, 60512: 62273, 61034: 61052, 61036: 61061, 61041: 61050,
    61602: 61631, 61609: 61630, 61104: 61107, 61620: 61628, 60601: 60665, 60603: 60666,
    60607: 60664, 60614: 60664, 60620: 60661, 60634: 60619, 60658: 60660, 61706: 61711,
    61712: 61759, 61713: 62266, 61720: 61763, 61721: 61727, 60703: 62272, 60706: 62211,
    60708: 62270, 61218: 61222, 61223: 61260, 61734: 61763, 61737: 61762, 61226: 61254,
    60723: 62275, 60726: 62277, 61238: 61266, 60729: 62266, 61244: 61256, 60736: 62265,
    60738: 62275, 60741: 62268, 61301: 62144, 61306: 62115, 61308: 62144, 61311: 62143,
    60801: 62044, 61314: 62132, 60804: 62010, 60809: 62048, 60810: 62044, 60814: 62044,
    60821: 62042, 60824: 62039, 61411: 61439, 60907: 62034, 60417: 62380, 60422: 62378,
    60425: 62381, 60438: 60668, 60442: 62380, 60449: 62381, 61505: 62383, 61509: 62335,
    61520: 61060, 60501: 62267, 60502: 62206, 60504: 62267, 60507: 62269, 60509: 62273,
    60511: 62232, 61022: 61054, 60514: 62267, 61028: 61049, 61040: 61059, 61046: 61059,
    61047: 61059, 61048: 61057, 61601: 61626, 61607: 61630, 61613: 61627, 61117: 61120,
    60606: 60661, 60622: 60664, 60625: 60668, 60652: 60670, 61201: 61254, 60702: 62205,
    60705: 62277, 60711: 62266, 60715: 62272, 60717: 62274, 60210: 62142, 61235: 61267,
    61753: 61765, 61242: 61265, 61754: 61757, 60732: 62245, 61755: 61766, 60748: 62266,
    61305: 62141, 61315: 62145, 61316: 62145, 60302: 60345, 60822: 62048, 60320: 60344,
    60322: 60345, 60325: 60350, 60333: 60350, 60334: 60350, 60338: 60344, 60343: 62380,
    61417: 61445, 60403: 62386, 60411: 62389, 60739: 62266, 60644: 60669, 60409: 62314,
    61511: 62383, 61513: 62376, 61515: 62377, 61519: 62380, 61603: 61630, 61605: 61632,
    61606: 61631, 61608: 61630, 61610: 61630, 61614: 61627, 61616: 61629, 61617: 61626,
    61619: 61632, 61622: 61633, 61623: 61628, 61705: 61757, 61707: 61756, 61714: 61757,
    61715: 61761, 61717: 61757, 61718: 62266, 61725: 61760, 61726: 61760, 61732: 61756,
    61733: 61763, 61735: 61759, 61736: 61763, 61739: 61762, 61742: 61764, 61749: 61761,
    60201: 62138, 60202: 62138, 60205: 62147, 60206: 62146, 60208: 62142, 60212: 62140,
    60214: 62147, 60218: 62142, 60221: 62135, 60303: 60345, 60306: 60344, 60308: 60349,
    60311: 60349, 60314: 60348, 60316: 60344, 60319: 60351, 60321: 60344, 60328: 60345,
    60332: 60345, 60335: 60350, 60340: 60351, 60342: 60351, 60402: 62380, 60404: 62375,
    60406: 62386, 60408: 62385, 60412: 62382, 60413: 62385, 60414: 62387, 60415: 62389,
    60418: 62379, 60419: 62380, 60423: 62378, 60428: 62386, 60429: 62390, 60431: 62386,
    60432: 62380, 60433: 62375, 60434: 61057, 60436: 62382, 60437: 62382, 60439: 62384,
    60440: 62378, 60441: 60668, 60443: 62380, 60444: 62379, 60445: 62380, 60450: 61057,
    60452: 62382, 60453: 62375, 60454: 62380, 60455: 62372, 60456: 62381, 60503: 62206,
    60513: 62273, 60604: 60660, 60605: 60666, 60609: 60662, 60612: 60667, 60615: 60659,
    60616: 60664, 60621: 60661, 60630: 60662, 60633: 60669, 60635: 60667, 60636: 60663,
    60640: 60668, 60649: 61758, 60650: 61758, 60657: 60670, 60701: 62266, 60704: 62272,
    60709: 62270, 60713: 62272, 60714: 62266, 60716: 62277, 60718: 62274, 60719: 62279,
    60720: 62274, 60721: 62233, 60724: 62278, 60725: 62275, 60727: 62276, 60728: 62277,
    60730: 62242, 60731: 62266, 60733: 62245, 60737: 62265, 60740: 62266, 60742: 62268,
    60743: 62256, 60744: 62270, 60745: 62278, 60746: 62278, 60747: 62279, 60749: 62262,
    60802: 62044, 60805: 62010, 60807: 62042, 60808: 62042, 60811: 62043, 60812: 62040,
    60813: 62044, 60815: 62021, 60816: 62040, 60817: 62048, 60818: 62026, 60819: 62044,
    60820: 62044, 60823: 62048, 60901: 62041, 60903: 62047, 60904: 62008, 60905: 62039,
    60906: 62039, 60908: 62014, 60909: 62046, 60912: 62046, 60913: 62034, 61003: 61049,
    61005: 61049, 61006: 61054, 61010: 61054, 61014: 61057, 61018: 61053, 61023: 61054,
    61025: 61061, 61026: 61051, 61029: 61049, 61031: 61055, 61035: 61052, 61037: 61054,
    61038: 61053, 61039: 61061, 61042: 61050, 61044: 61061, 61102: 61120, 61103: 61120,
    61202: 61254, 61208: 61257, 61209: 61257, 61210: 61256, 61212: 61254, 61214: 61266,
    61216: 61253, 61219: 61253, 61220: 61266, 61221: 61258, 61224: 61260, 61227: 61262,
    61229: 61263, 61230: 61258, 61232: 61265, 61234: 61260, 61239: 61264, 61241: 61266,
    61245: 61255, 61246: 61256, 61248: 61264, 61249: 61259, 61250: 61253, 61304: 62144,
    61309: 62141, 61310: 62144, 61401: 61439, 61402: 61442, 61403: 61446, 61404: 61437,
    61405: 61437, 61406: 61437, 61407: 61439, 61408: 61438, 61412: 61439, 61414: 61440,
    61415: 61440, 61416: 61439, 61418: 61441, 61419: 61441, 61420: 61443, 61422: 61443,
    61426: 61442, 61427: 61444, 61430: 61445, 61431: 61438, 61433: 61438, 61434: 61440,
    61435: 61439,
}

# (KG number, KG name, municipality code, municipality name)
KgvRow = Tuple[int, str, int, str]
# (municipality code, locality code, locality name)
OvRow = Tuple[int, int, str]

# Filled by download_ov_land(): municipality code -> list of names parsed from
# the "Katastralgemeinden:" line of that municipality. Read by write_sql().
GKZ = {}


def get_kvg_zip_url() -> str:
    """Return the first non-empty match of ``KGV_LINK`` in the page at ``KGV_URL``.

    Raises:
        RuntimeError: if the page is not served with status 200 or no
            non-empty match is found.
    """
    r = requests.get(KGV_URL, timeout=REQUEST_TIMEOUT)
    if r.status_code != 200:
        raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
    # Discard empty matches so that a pattern which matches nothing useful is
    # reported here rather than surfacing later as a request for an empty URL.
    matches = [m for m in KGV_LINK.findall(r.text) if m]
    if not matches:
        raise RuntimeError('Unable to find url of zip file')
    return matches[0]


def download_kgv() -> List[KgvRow]:
    """Download the Katastralgemeinden zip file and parse its first CSV member.

    The CSV is ';'-separated with a header line. Columns 0, 1, 3 and 4 are
    returned as (KG number, KG name, municipality code, municipality name).

    Raises:
        RuntimeError: on a non-200 response or when the zip has no ``.csv``.
    """
    rows: List[KgvRow] = []
    with tempfile.NamedTemporaryFile() as f:
        with requests.get(get_kvg_zip_url(), stream=True, timeout=REQUEST_TIMEOUT) as r:
            if r.status_code != 200:
                raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        with zipfile.ZipFile(f, 'r') as zip_file:
            files = [name for name in zip_file.namelist() if name.endswith('.csv')]
            if not files:
                raise RuntimeError('Unable to find csv file in zip')
            with zip_file.open(files[0], 'r') as raw:
                reader = csv.reader(io.TextIOWrapper(raw, encoding='utf8', newline=''), delimiter=';')
                next(reader, None)  # skip the header line
                for row in reader:
                    if not row:
                        continue  # tolerate blank lines
                    rows.append((int(row[0]), row[1], int(row[3]), row[4]))
    return rows


def _ort_row(gkz: Optional[int], val: str) -> OvRow:
    """Build an OvRow from '<5-digit code> <name>' text, stripping the trailing code."""
    return gkz, int(val[:5]), STRIP_CODE.sub('', val[6:])


def _resolve_gkz(raw: str) -> int:
    """Convert a 'NNN NN' municipality code to the code used in the output."""
    gkz = int(raw.replace(' ', ''))
    if gkz > 90000:
        # Municipality of Vienna
        return 90001
    if 32400 <= gkz <= 32499:
        # Former district Wien Umgebung; an unmapped code raises KeyError.
        return WIEN_UMGEBUNG[gkz]
    # Municipal reforms; unchanged when no replacement is listed.
    return GEMEINDEN.get(gkz, gkz)


def download_ov_land(bundesland: str) -> List[OvRow]:
    """Download and parse the Ortsverzeichnis PDF of one federal state.

    Pages are processed from the short page reading 'Ortsverzeichnis' up to
    the short page reading 'ALPHABETISCHE VERZEICHNISSE'. The extracted text
    of every processed page is also written to ``out/<bundesland>.<page>.txt``.
    As a side effect, ``GKZ`` is filled with the names found on
    'Katastralgemeinden:' lines.

    Args:
        bundesland: name inserted into ``OV_URL``.

    Returns:
        One (municipality code, locality code, locality name) tuple per
        locality line.

    Raises:
        RuntimeError: on a non-200 response, or when a locality line that
            continues on the next line cannot be parsed after joining.
    """
    rows: List[OvRow] = []
    os.makedirs('out', exist_ok=True)
    with tempfile.NamedTemporaryFile() as f:
        with requests.get(OV_URL.format(bundesland), stream=True, timeout=REQUEST_TIMEOUT) as r:
            if r.status_code != 200:
                raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
        # NOTE(review): PdfFileReader/extractText are the legacy PyPDF2 names;
        # presumably newer PyPDF2 releases need PdfReader/extract_text - confirm
        # against the installed version.
        pdf = PyPDF2.PdfFileReader(f)
        gkz = None
        last = None  # locality line whose name may continue on the next line
        valid = False
        for page_num, page in enumerate(pdf.pages):
            text = page.extractText()
            if len(text) < 100:
                marker = text.strip().replace(' ', '')
                if marker == 'Ortsverzeichnis':
                    valid = True
                    continue
                if valid and marker == 'ALPHABETISCHEVERZEICHNISSE':
                    break
            if not valid:
                continue
            with open(f'out/{bundesland}.{page_num + 1}.txt', 'w+', encoding='utf8') as o:
                o.write(text)
            for line in text.splitlines():
                m1 = ORT.match(line)
                if last is not None:
                    if line == 'Gemeindename':
                        # Stop reading this page; `last` stays pending for the
                        # next page.
                        break
                    if m1 is None:
                        # Not a new locality line: treat it as the continuation
                        # of the pending one.
                        joined = f'{last} {line}'
                        m_joined = ORT.match(joined)
                        if m_joined is None:
                            raise RuntimeError(f'Unable to parse continued line: {joined!r}')
                        rows.append(_ort_row(gkz, STRIP_NUM.sub('', m_joined.group(0))))
                        last = None
                        continue
                    # A new locality line follows, so the pending one was complete.
                    rows.append(_ort_row(gkz, last))
                    last = None
                if 'Katastralgemeinden:' in line:
                    p1 = line.find('Katastralgemeinden:')
                    p2 = line.find('Postleitzahl')
                    # Each comma-separated entry loses its last two
                    # space-separated tokens; the remainder is kept as the name.
                    GKZ[gkz] = [' '.join(a.split(' ')[:-2]) for a in line[p1 + 20:p2].split(', ')]
                    continue
                if m1:
                    val = STRIP_NUM.sub('', m1.group(0))
                    if m1.group(4) == '':
                        # No number columns on this line: wait for the next line.
                        last = val
                    else:
                        rows.append(_ort_row(gkz, val))
                else:
                    m2 = GEM.match(line)
                    if m2:
                        gkz = _resolve_gkz(m2.group(1))
    return rows


def download_ov() -> List[OvRow]:
    """Download and parse the Ortsverzeichnis of every state in ``OV_NAMES``."""
    rows: List[OvRow] = []
    for name in OV_NAMES:
        rows.extend(download_ov_land(name))
    return rows


def _sql_quote(text: str) -> str:
    """Return ``text`` as a single-quoted SQL literal with embedded quotes doubled."""
    return "'" + text.replace("'", "''") + "'"


def _insert_statement(table: str, values: List[str]) -> str:
    """Return one multi-row INSERT for ``table``, or '' when there are no rows."""
    if not values:
        return ''
    return f"\nINSERT INTO {table} VALUES\n" + ',\n'.join(values) + ';\n'


def _name_variants(name: str) -> Tuple[str, str]:
    """Return the two normalised spellings used for fuzzy name comparison."""
    lowered = name.lower()
    return (lowered.replace('-', '').replace('th', 't'),
            lowered.replace('-', ' ').replace('th', 't'))


def write_sql(kgv_rows: List[KgvRow], ov_rows: List[OvRow]) -> None:
    """Write ``gemeinden.sql`` and print hints for unknown municipality codes.

    Three INSERT statements are written (``AT_gem``, ``AT_kg``, ``AT_ort``); a
    statement is omitted when it would have no rows. Localities whose
    municipality code does not occur in ``kgv_rows`` are left out of
    ``AT_ort``; their codes are reported on stdout instead.

    Args:
        kgv_rows: rows as returned by ``download_kgv``.
        ov_rows: rows as returned by ``download_ov``.
    """
    kg_names = {kgnr: kg_name for kgnr, kg_name, _, _ in kgv_rows}
    gem_names = {gkz: gem_name for _, _, gkz, gem_name in kgv_rows}
    kg_by_gem = {gkz: [] for gkz in gem_names}

    gem_values = [f"({gkz:5}, {_sql_quote(name)})" for gkz, name in gem_names.items()]

    kg_values = []
    for kgnr, name, gkz, _ in kgv_rows:
        kg_by_gem[gkz].append(kgnr)
        kg_values.append(f"({kgnr:5}, {gkz:5}, {_sql_quote(name)})")

    ort_values = []
    unknown = set()
    for gkz, okz, name in ov_rows:
        if name.startswith('Wien '):
            name = name.replace('Wien ', 'Wien, ').replace('.', '. ')
        elif name.startswith('Graz,'):
            name = name.replace('Graz,', 'Graz, ').replace('.Bez.:', '. Bezirk: ').replace('0', '')
        if gkz not in gem_names:
            unknown.add(gkz)
            continue
        kgnr_o = None
        candidates = kg_by_gem[gkz]
        if len(candidates) == 1:
            kgnr_o = candidates[0]
        else:
            variants = _name_variants(name)
            for kgnr in candidates:
                kg_variants = _name_variants(kg_names[kgnr])
                # Match when either normalised spelling contains the other;
                # the last matching KG wins.
                if any(a in b or b in a for a in variants for b in kg_variants):
                    kgnr_o = kgnr
        kgnr_sql = kgnr_o if kgnr_o is not None else 'NULL'
        ort_values.append(f"({okz:5}, {gkz:5}, {kgnr_sql:>5}, {_sql_quote(name)})")

    with open('gemeinden.sql', 'wb') as f:
        for table, values in (('AT_gem', gem_values), ('AT_kg', kg_values), ('AT_ort', ort_values)):
            f.write(_insert_statement(table, values).encode('utf8'))

    # Diagnostics: for every unknown municipality code, print a candidate
    # replacement when exactly one KG row of the same leading digit remains.
    resolved = set()
    for e in unknown:
        possible = [row for row in kgv_rows if row[2] // 10000 == e // 10000]
        names = GKZ.get(e, [])
        if any(n != '' for n in names):
            # NOTE(review): only the last parsed name is compared. This mirrors
            # the previous chained-lambda filters, which all saw the final loop
            # value; confirm whether matching any/all names was intended.
            possible = [row for row in possible if row[1] == names[-1]]
        if len(possible) == 1:
            print(f' {e}: {possible[0][2]},')
            resolved.add(e)
    print(sorted(unknown - resolved))


def main() -> None:
    """Download both sources and write ``gemeinden.sql``."""
    parser = argparse.ArgumentParser()
    parser.parse_args()
    print('Downloading and parsing Ortsverzeichnis from statistik.at')
    ov_data = download_ov()
    print('Downloading Katastralgemeindenverzeichnis from www.bev.gv.at')
    kgv_data = download_kgv()
    write_sql(kgv_data, ov_data)
    print('Successfully created gemeinden.sql!')


if __name__ == '__main__':
    main()