diff --git a/data/gemeinden.py b/data/gemeinden.py
deleted file mode 100755
index a88ef9e..0000000
--- a/data/gemeinden.py
+++ /dev/null
@@ -1,645 +0,0 @@
-#!/bin/env python3
-
-from typing import List, Tuple
-import argparse
-import requests
-import re
-import tempfile
-import zipfile
-import PyPDF2
-
-
-GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) [X0-9]+')
-ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?'
- r'([()X0-9.]+ [()X0-9.]+ ?|$)')
-STRIP_NUM = re.compile(r'[X0-9. -]+$')
-STRIP_CODE = re.compile(r' *(Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|'
- r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH) *(\([0-9]+\) *)?$')
-
-KGV_URL = 'https://www.bev.gv.at/portal/page?_pageid=713,2601283&_dad=portal&_schema=PORTAL'
-KGV_LINK = re.compile(r'')
-
-OV_URL = 'https://statistik.at/fileadmin/publications/Ortsverzeichnis_2001__{}.pdf'
-OV_NAMES = ['Wien', 'Niederoesterreich', 'Oberoesterreich', 'Kaernten',
- 'Steiermark', 'Vorarlberg', 'Burgenland', 'Tirol', 'Salzburg']
-
-WIEN_UMGEBUNG = {
- 32401: 30729,
- 32402: 30730,
- 32403: 31949,
- 32404: 31235,
- 32405: 30731,
- 32406: 30732,
- 32407: 30733,
- 32408: 32144,
- 32409: 30734,
- 32410: 30735,
- 32411: 30736,
- 32412: 31950,
- 32413: 30737,
- 32415: 31951,
- 32416: 31952,
- 32417: 30738,
- 32418: 30739,
- 32419: 30740,
- 32421: 31953,
- 32423: 31954,
- 32424: 30741,
-}
-
-GEMEINDEN = {
- # Tirol
- 70330: 70370,
- 70327: 70370,
- 70341: 70370,
- # Oberösterreich
- 40803: 40835,
- 40819: 40835,
- 41308: 41344,
- 41330: 41344,
- 41519: 41344,
- 41625: 41628,
- 41301: 41346,
- 41303: 41343,
- 41310: 41345,
- 41335: 41346,
- 41339: 41343,
- 41340: 41628,
- 41520: 41522,
- # Steiermark
- 60204: 62139,
- 60207: 62142,
- 60209: 62140,
- 60211: 62139,
- 60213: 62125,
- 60216: 62128,
- 60217: 62146,
- 60219: 62147,
- 60301: 60345,
- 60307: 60344,
- 60309: 60350,
- 60310: 60348,
- 60313: 60345,
- 60315: 60349,
- 60327: 60347,
- 60330: 60348,
- 60331: 60349,
- 60336: 60347,
- 60339: 60346,
- 60407: 62311,
- 60410: 62378,
- 60416: 62380,
- 60426: 61057,
- 60435: 62382,
- 61504: 62383,
- 61508: 62390,
- 61510: 62383,
- 61514: 62377,
- 61517: 62380,
- 61009: 61054,
- 61011: 61012,
- 61015: 61052,
- 60512: 62273,
- 61034: 61052,
- 61036: 61061,
- 61041: 61050,
- 61602: 61631,
- 61609: 61630,
- 61104: 61107,
- 61620: 61628,
- 60601: 60665,
- 60603: 60666,
- 60607: 60664,
- 60614: 60664,
- 60620: 60661,
- 60634: 60619,
- 60658: 60660,
- 61706: 61711,
- 61712: 61759,
- 61713: 62266,
- 61720: 61763,
- 61721: 61727,
- 60703: 62272,
- 60706: 62211,
- 60708: 62270,
- 61218: 61222,
- 61223: 61260,
- 61734: 61763,
- 61737: 61762,
- 61226: 61254,
- 60723: 62275,
- 60726: 62277,
- 61238: 61266,
- 60729: 62266,
- 61244: 61256,
- 60736: 62265,
- 60738: 62275,
- 60741: 62268,
- 61301: 62144,
- 61306: 62115,
- 61308: 62144,
- 61311: 62143,
- 60801: 62044,
- 61314: 62132,
- 60804: 62010,
- 60809: 62048,
- 60810: 62044,
- 60814: 62044,
- 60821: 62042,
- 60824: 62039,
- 61411: 61439,
- 60907: 62034,
- 60417: 62380,
- 60422: 62378,
- 60425: 62381,
- 60438: 60668,
- 60442: 62380,
- 60449: 62381,
- 61505: 62383,
- 61509: 62335,
- 61520: 61060,
- 60501: 62267,
- 60502: 62206,
- 60504: 62267,
- 60507: 62269,
- 60509: 62273,
- 60511: 62232,
- 61022: 61054,
- 60514: 62267,
- 61028: 61049,
- 61040: 61059,
- 61046: 61059,
- 61047: 61059,
- 61048: 61057,
- 61601: 61626,
- 61607: 61630,
- 61613: 61627,
- 61117: 61120,
- 60606: 60661,
- 60622: 60664,
- 60625: 60668,
- 60652: 60670,
- 61201: 61254,
- 60702: 62205,
- 60705: 62277,
- 60711: 62266,
- 60715: 62272,
- 60717: 62274,
- 60210: 62142,
- 61235: 61267,
- 61753: 61765,
- 61242: 61265,
- 61754: 61757,
- 60732: 62245,
- 61755: 61766,
- 60748: 62266,
- 61305: 62141,
- 61315: 62145,
- 61316: 62145,
- 60302: 60345,
- 60822: 62048,
- 60320: 60344,
- 60322: 60345,
- 60325: 60350,
- 60333: 60350,
- 60334: 60350,
- 60338: 60344,
- 60343: 62380,
- 61417: 61445,
- 60403: 62386,
- 60411: 62389,
- 60739: 62266,
- 60644: 60669,
- 60409: 62314,
- 61511: 62383,
- 61513: 62376,
- 61515: 62377,
- 61519: 62380,
- 61603: 61630,
- 61605: 61632,
- 61606: 61631,
- 61608: 61630,
- 61610: 61630,
- 61614: 61627,
- 61616: 61629,
- 61617: 61626,
- 61619: 61632,
- 61622: 61633,
- 61623: 61628,
- 61705: 61757,
- 61707: 61756,
- 61714: 61757,
- 61715: 61761,
- 61717: 61757,
- 61718: 62266,
- 61725: 61760,
- 61726: 61760,
- 61732: 61756,
- 61733: 61763,
- 61735: 61759,
- 61736: 61763,
- 61739: 61762,
- 61742: 61764,
- 61749: 61761,
- 60201: 62138,
- 60202: 62138,
- 60205: 62147,
- 60206: 62146,
- 60208: 62142,
- 60212: 62140,
- 60214: 62147,
- 60218: 62142,
- 60221: 62135,
- 60303: 60345,
- 60306: 60344,
- 60308: 60349,
- 60311: 60349,
- 60314: 60348,
- 60316: 60344,
- 60319: 60351,
- 60321: 60344,
- 60328: 60345,
- 60332: 60345,
- 60335: 60350,
- 60340: 60351,
- 60342: 60351,
- 60402: 62380,
- 60404: 62375,
- 60406: 62386,
- 60408: 62385,
- 60412: 62382,
- 60413: 62385,
- 60414: 62387,
- 60415: 62389,
- 60418: 62379,
- 60419: 62380,
- 60423: 62378,
- 60428: 62386,
- 60429: 62390,
- 60431: 62386,
- 60432: 62380,
- 60433: 62375,
- 60434: 61057,
- 60436: 62382,
- 60437: 62382,
- 60439: 62384,
- 60440: 62378,
- 60441: 60668,
- 60443: 62380,
- 60444: 62379,
- 60445: 62380,
- 60450: 61057,
- 60452: 62382,
- 60453: 62375,
- 60454: 62380,
- 60455: 62372,
- 60456: 62381,
- 60503: 62206,
- 60513: 62273,
- 60604: 60660,
- 60605: 60666,
- 60609: 60662,
- 60612: 60667,
- 60615: 60659,
- 60616: 60664,
- 60621: 60661,
- 60630: 60662,
- 60633: 60669,
- 60635: 60667,
- 60636: 60663,
- 60640: 60668,
- 60649: 61758,
- 60650: 61758,
- 60657: 60670,
- 60701: 62266,
- 60704: 62272,
- 60709: 62270,
- 60713: 62272,
- 60714: 62266,
- 60716: 62277,
- 60718: 62274,
- 60719: 62279,
- 60720: 62274,
- 60721: 62233,
- 60724: 62278,
- 60725: 62275,
- 60727: 62276,
- 60728: 62277,
- 60730: 62242,
- 60731: 62266,
- 60733: 62245,
- 60737: 62265,
- 60740: 62266,
- 60742: 62268,
- 60743: 62256,
- 60744: 62270,
- 60745: 62278,
- 60746: 62278,
- 60747: 62279,
- 60749: 62262,
- 60802: 62044,
- 60805: 62010,
- 60807: 62042,
- 60808: 62042,
- 60811: 62043,
- 60812: 62040,
- 60813: 62044,
- 60815: 62021,
- 60816: 62040,
- 60817: 62048,
- 60818: 62026,
- 60819: 62044,
- 60820: 62044,
- 60823: 62048,
- 60901: 62041,
- 60903: 62047,
- 60904: 62008,
- 60905: 62039,
- 60906: 62039,
- 60908: 62014,
- 60909: 62046,
- 60912: 62046,
- 60913: 62034,
- 61003: 61049,
- 61005: 61049,
- 61006: 61054,
- 61010: 61054,
- 61014: 61057,
- 61018: 61053,
- 61023: 61054,
- 61025: 61061,
- 61026: 61051,
- 61029: 61049,
- 61031: 61055,
- 61035: 61052,
- 61037: 61054,
- 61038: 61053,
- 61039: 61061,
- 61042: 61050,
- 61044: 61061,
- 61102: 61120,
- 61103: 61120,
- 61202: 61254,
- 61208: 61257,
- 61209: 61257,
- 61210: 61256,
- 61212: 61254,
- 61214: 61266,
- 61216: 61253,
- 61219: 61253,
- 61220: 61266,
- 61221: 61258,
- 61224: 61260,
- 61227: 61262,
- 61229: 61263,
- 61230: 61258,
- 61232: 61265,
- 61234: 61260,
- 61239: 61264,
- 61241: 61266,
- 61245: 61255,
- 61246: 61256,
- 61248: 61264,
- 61249: 61259,
- 61250: 61253,
- 61304: 62144,
- 61309: 62141,
- 61310: 62144,
- 61401: 61439,
- 61402: 61442,
- 61403: 61446,
- 61404: 61437,
- 61405: 61437,
- 61406: 61437,
- 61407: 61439,
- 61408: 61438,
- 61412: 61439,
- 61414: 61440,
- 61415: 61440,
- 61416: 61439,
- 61418: 61441,
- 61419: 61441,
- 61420: 61443,
- 61422: 61443,
- 61426: 61442,
- 61427: 61444,
- 61430: 61445,
- 61431: 61438,
- 61433: 61438,
- 61434: 61440,
- 61435: 61439,
-}
-
-KgvRow = Tuple[int, str, int, str]
-OvRow = Tuple[int, int, str]
-
-
-def get_kvg_zip_url() -> str:
- r = requests.get(KGV_URL)
- if r.status_code != 200:
- raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
-
- matches = KGV_LINK.findall(r.text)
- if len(matches) == 0:
- raise RuntimeError('Unable to find url of zip file')
-
- return matches[0]
-
-
-def download_kgv() -> List[KgvRow]:
- with tempfile.NamedTemporaryFile() as f:
- with requests.get(get_kvg_zip_url(), stream=True) as r:
- if r.status_code != 200:
- raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
- for chunk in r.iter_content(chunk_size=8192):
- f.write(chunk)
-
- rows = []
- with zipfile.ZipFile(f, 'r') as zip_file:
- files = [name for name in zip_file.namelist() if name.endswith('.csv')]
- if len(files) == 0:
- raise RuntimeError('Unable to find csv file in zip')
-
- with zip_file.open(files[0], 'r') as csv:
- first = True
- for r_line in csv:
- if first:
- first = False
- continue
- line = r_line.decode('utf8').rstrip()
- row = [c[1:-1] if c[0] == '"' else int(c) for c in line.split(';')]
- rows.append((int(row[0]), str(row[1]), int(row[3]), str(row[4])))
- return rows
-
-
-GKZ = {}
-
-
-def download_ov_land(bundesland: str) -> List[OvRow]:
- rows = []
- with tempfile.NamedTemporaryFile() as f:
- r = requests.get(OV_URL.format(bundesland))
- if r.status_code != 200:
- raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
- for chunk in r.iter_content(chunk_size=8192):
- f.write(chunk)
-
- pdf = PyPDF2.PdfFileReader(f)
- gkz = None
- last = None
- valid = False
- for page in pdf.pages:
- page_num = pdf.getPageNumber(page)
- text = page.extractText()
-
- if len(text) < 100:
- if text.strip().replace(' ', '') == 'Ortsverzeichnis':
- valid = True
- continue
- elif valid and text.strip().replace(' ', '') == 'ALPHABETISCHEVERZEICHNISSE':
- break
- if not valid:
- continue
-
- with open(f'out/{bundesland}.{page_num + 1}.txt', 'w+') as o:
- o.write(text)
-
- lines = text.splitlines()
- cont = False
- for line in lines:
- m1 = ORT.match(line)
- m2 = GEM.match(line)
-
- if last is not None:
- if line == 'Gemeindename':
- cont = True
- break
- elif m1 is None:
- last = f'{last} {line}'
- val = STRIP_NUM.sub('', ORT.match(last).group(0))
- okz = int(val[:5])
- name = STRIP_CODE.sub('', val[6:])
- rows.append((gkz, okz, name))
- last = None
- continue
- else:
- val = last
- okz = int(val[:5])
- name = STRIP_CODE.sub('', val[6:])
- rows.append((gkz, okz, name))
- last = None
-
- if 'Katastralgemeinden:' in line:
- p1 = line.find('Katastralgemeinden:')
- p2 = line.find('Postleitzahl')
- val = [' '.join(a.split(' ')[:-2]) for a in line[p1 + 20:p2].split(', ')]
- GKZ[gkz] = val
- continue
-
- if m1:
- val = STRIP_NUM.sub('', m1.group(0))
- if m1.group(4) == '':
- last = val
- else:
- okz = int(val[:5])
- name = STRIP_CODE.sub('', val[6:])
- rows.append((gkz, okz, name))
- elif m2:
- gkz = int(m2.group(1).replace(' ', ''))
- if gkz > 90000:
- # Gemeinde Wien
- gkz = 90001
- elif gkz >= 32400 and gkz <= 32499:
- # ehem. Bezirk Wien Umgebung
- gkz = WIEN_UMGEBUNG[gkz]
- elif gkz in GEMEINDEN:
- # Gemeindereformen (OÖ, Stmk.)
- gkz = GEMEINDEN[gkz]
- if cont:
- continue
- return rows
-
-
-def download_ov() -> List[OvRow]:
- rows = []
- for name in OV_NAMES:
- rows += download_ov_land(name)
- return rows
-
-
-def write_sql(kgv_rows: List[KgvRow], ov_rows: List[OvRow]) -> None:
- kgv = {kgnr: (kg_name, gkz, gem_name) for kgnr, kg_name, gkz, gem_name in kgv_rows}
- ov = {okz: (name, gkz) for gkz, okz, name in ov_rows}
- gemeinden = {gkz: (gem_name, []) for kgnr, kg_name, gkz, gem_name in kgv_rows}
-
- with open('gemeinden.sql', 'wb') as f:
- f.write(b"\nINSERT INTO AT_gem VALUES\n")
- for gkz, (name, _) in gemeinden.items():
- f.write(f"({gkz:5}, '{name}'),\n".encode('utf8'))
- f.seek(-2, 1)
- f.write(b';\n')
-
- f.write(b"\nINSERT INTO AT_kg VALUES\n")
- for kgnr, name, gkz, _ in kgv_rows:
- gemeinden[gkz][1].append(kgnr)
- f.write(f"({kgnr:5}, {gkz:5}, '{name}'),\n".encode('utf8'))
- f.seek(-2, 1)
- f.write(b';\n')
-
- f.write(b"\nINSERT INTO AT_ort VALUES\n")
- pr = set()
- for gkz, okz, name in ov_rows:
- kgnr_o = None
-
- if name.startswith('Wien '):
- name = name.replace('Wien ', 'Wien, ').replace('.', '. ')
- elif name.startswith('Graz,'):
- name = name.replace('Graz,', 'Graz, ').replace('.Bez.:', '. Bezirk: ').replace('0', '')
-
- if gkz not in gemeinden:
- pr.add(gkz)
- continue
-
- if len(gemeinden[gkz][1]) == 1:
- kgnr_o = gemeinden[gkz][1][0]
- else:
- for kgnr in gemeinden[gkz][1]:
- n11 = name.lower().replace('-', '').replace('th', 't')
- n12 = name.lower().replace('-', ' ').replace('th', 't')
- n21 = kgv[kgnr][0].lower().replace('-', '').replace('th', 't')
- n22 = kgv[kgnr][0].lower().replace('-', ' ').replace('th', 't')
- if n11 in n21 or n11 in n22 or n12 in n21 or n12 in n22 or n21 in n11 or n21 in n12 or n22 in n11 or n22 in n12:
- kgnr_o = kgnr
-
- f.write(f"({okz:5}, {gkz:5}, {kgnr_o if kgnr_o is not None else 'NULL':>5}, '{name}'),\n".encode('utf8'))
- f.seek(-2, 1)
- f.write(b';\n')
-
- p = set()
- for e in pr:
- possible = filter(lambda a: a[2] // 10000 == e // 10000, kgv_rows.copy())
- if e in GKZ:
- for name in GKZ[e]:
- if name == '':
- continue
- possible = filter(lambda a: a[1] == name, possible)
- possible = list(possible)
- if len(possible) == 1:
- print(f' {e}: {possible[0][2]},')
- p.add(e)
-
- u = list(pr - p)
- u.sort()
- print(u)
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser()
- args = parser.parse_args()
-
- print('Downloading and parsing Ortsverzeichnis from statistik.at')
- ov_data = download_ov()
- print('Downloading Katastralgemeindenverzeichnis from www.bev.gv.at')
- kgv_data = download_kgv()
- write_sql(kgv_data, ov_data)
- print('Successfully created gemeinden.sql!')
diff --git a/data/plz.py b/data/plz.py
index 0232cbd..1334172 100755
--- a/data/plz.py
+++ b/data/plz.py
@@ -7,31 +7,74 @@ import re
import xlrd
import tempfile
import os
+import zipfile
+import PyPDF2
-URL = 'https://www.post.at/g/c/postlexikon'
-BUTTON = re.compile(r'title="PLZ Bestimmungsort" href="(.*?)"')
+PLZ_URL = 'https://www.post.at/g/c/postlexikon'
+PLZ_BUTTON = re.compile(r'title="PLZ Bestimmungsort" href="(.*?)"')
-Row = Tuple[int, str, int, str, int, str]
+KGV_URL = 'https://www.bev.gv.at/portal/page?_pageid=713,2601283&_dad=portal&_schema=PORTAL'
+KGV_LINK = re.compile(r'')
+
+OV_URL = 'https://statistik.gv.at/fileadmin/publications/Ortsverzeichnis_2001__{}.pdf'
+OV_NAMES = ['Burgenland', 'Niederoesterreich', 'Wien']
+
+GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) [X0-9]+')
+ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?'
+ r'([()X0-9.]+ ?|$)')
+STRIP_NUM = re.compile(r'[X0-9. -]+$')
+STRIP_CODE = re.compile(r' *(Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|'
+ r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH) *(\([0-9]+\) *)?$')
+
+PlzRow = Tuple[int, str, int, str, int, str]
+KgvRow = Tuple[int, str, int, str]
+OvRow = Tuple[int, int, str]
+
+WIEN_UMGEBUNG = {
+ 32401: 30729,
+ 32402: 30730,
+ 32403: 31949,
+ 32404: 31235,
+ 32405: 30731,
+ 32406: 30732,
+ 32407: 30733,
+ 32408: 32144,
+ 32409: 30734,
+ 32410: 30735,
+ 32411: 30736,
+ 32412: 31950,
+ 32413: 30737,
+ 32415: 31951,
+ 32416: 31952,
+ 32417: 30738,
+ 32418: 30739,
+ 32419: 30740,
+ 32421: 31953,
+ 32423: 31954,
+ 32424: 30741,
+}
+
+GKZ = {}
-def get_excel_url() -> str:
- r = requests.get(URL, headers={'User-Agent': 'Mozilla/5.0'})
+def get_plz_url() -> str:
+ r = requests.get(PLZ_URL, headers={'User-Agent': 'Mozilla/5.0'})
if r.status_code != 200:
raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
- matches = BUTTON.findall(r.text)
+ matches = PLZ_BUTTON.findall(r.text)
if len(matches) == 0:
raise RuntimeError('Unable to find url of file')
return matches[0]
-def download_excel() -> List[Row]:
+def download_plz() -> List[PlzRow]:
f_name = None
try:
f = tempfile.NamedTemporaryFile(delete=False)
- with requests.get(get_excel_url(), stream=True) as r:
+ with requests.get(get_plz_url(), stream=True) as r:
if r.status_code != 200:
raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
for chunk in r.iter_content(chunk_size=8192):
@@ -51,19 +94,235 @@ def download_excel() -> List[Row]:
os.remove(f_name)
-def write_sql(data: List[Row]) -> None:
+def get_kvg_zip_url() -> str:
+ r = requests.get(KGV_URL)
+ if r.status_code != 200:
+ raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
+
+ matches = KGV_LINK.findall(r.text)
+ if len(matches) == 0:
+ raise RuntimeError('Unable to find url of zip file')
+
+ return matches[0]
+
+
+def download_kgv() -> List[KgvRow]:
+ with tempfile.NamedTemporaryFile() as f:
+ with requests.get(get_kvg_zip_url(), stream=True) as r:
+ if r.status_code != 200:
+ raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
+ for chunk in r.iter_content(chunk_size=8192):
+ f.write(chunk)
+
+ rows = []
+ with zipfile.ZipFile(f, 'r') as zip_file:
+ files = [name for name in zip_file.namelist() if name.endswith('.csv')]
+ if len(files) == 0:
+ raise RuntimeError('Unable to find csv file in zip')
+
+ with zip_file.open(files[0], 'r') as csv:
+ first = True
+ for r_line in csv:
+ if first:
+ first = False
+ continue
+ line = r_line.decode('utf8').rstrip()
+ row = [c[1:-1] if c[0] == '"' else int(c) for c in line.split(';')]
+ rows.append((int(row[0]), str(row[1]), int(row[3]), str(row[4])))
+ return rows
+
+
+def download_ov_land(bundesland: str) -> List[OvRow]:
+ rows = []
+ with tempfile.NamedTemporaryFile() as f:
+ r = requests.get(OV_URL.format(bundesland))
+ if r.status_code != 200:
+ raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
+ for chunk in r.iter_content(chunk_size=8192):
+ f.write(chunk)
+
+ pdf = PyPDF2.PdfFileReader(f)
+ valid = False
+ for page in pdf.pages:
+ page_num = pdf.getPageNumber(page)
+ text = page.extractText()
+
+ if len(text) < 100:
+ if text.strip().replace(' ', '') == 'Ortsverzeichnis':
+ valid = True
+ continue
+ elif valid and text.strip().replace(' ', '') == 'ALPHABETISCHEVERZEICHNISSE':
+ break
+ if not valid:
+ continue
+
+ with open(f'out/{bundesland}/{page_num + 1}.txt', 'w+') as o:
+ o.write(text)
+
+ return rows
+
+
+def download_ov() -> None:
+ try:
+ os.mkdir('out')
+ for name in OV_NAMES:
+ os.mkdir(f'out/{name}')
+ download_ov_land(name)
+ except FileExistsError:
+ print('Using cache')
+ return
+
+
+def parse_ov() -> List[OvRow]:
+ rows = []
+ for bundesland in sorted(os.listdir('out')):
+ gkz = None
+ last = None
+ for page_name in sorted(os.listdir(f'out/{bundesland}')):
+ with open(f'out/{bundesland}/{page_name}', 'r') as f:
+ cont = False
+ for line in f:
+ line = line.rstrip()
+ m1 = ORT.match(line)
+ m2 = GEM.match(line)
+
+ if last is not None:
+ if line == 'Gemeindename':
+ cont = True
+ break
+ elif m1 is None:
+ last = f'{last} {line}'
+ print(last)
+ val = STRIP_NUM.sub('', ORT.match(last).group(0))
+ okz = int(val[:5])
+ name = STRIP_CODE.sub('', val[6:])
+ rows.append((gkz, okz, name))
+ last = None
+ continue
+ else:
+ val = last
+ okz = int(val[:5])
+ name = STRIP_CODE.sub('', val[6:])
+ rows.append((gkz, okz, name))
+ last = None
+
+ if 'Katastralgemeinden:' in line:
+ p1 = line.find('Katastralgemeinden:')
+ p2 = line.find('Postleitzahl')
+ val = [' '.join(a.split(' ')[:-2]) for a in line[p1 + 20:p2].split(', ')]
+ GKZ[gkz] = val
+ continue
+
+ if m1:
+ val = STRIP_NUM.sub('', m1.group(0))
+ if m1.group(4) == '':
+ last = val
+ else:
+ okz = int(val[:5])
+ name = STRIP_CODE.sub('', val[6:])
+ rows.append((gkz, okz, name))
+ print(rows[-1])
+ elif m2:
+ if len(line.split(' ')) <= 9:
+ continue
+ gkz = int(m2.group(1).replace(' ', ''))
+ if gkz > 90000:
+ # Gemeinde Wien
+ gkz = 90001
+ elif gkz >= 32400 and gkz <= 32499:
+ # ehem. Bezirk Wien Umgebung
+ gkz = WIEN_UMGEBUNG[gkz]
+ if cont:
+ continue
+ return rows
+
+
+def write_sql(plz_rows: List[PlzRow], kgv_rows: List[KgvRow], ov_rows: List[OvRow]) -> None:
+ kgv = {kgnr: (kg_name, gkz, gem_name) for kgnr, kg_name, gkz, gem_name in kgv_rows}
+ ov = {okz: (o_name, gkz) for plz, _, okz, o_name, gkz, _ in plz_rows}
+ ov.update({okz: (name, gkz) for gkz, okz, name in ov_rows})
+ gemeinden = {gkz: (gem_name, [], []) for kgnr, kg_name, gkz, gem_name in kgv_rows}
+
+ pr = set()
with open('plz.sql', 'wb') as f:
+ f.write(b"\nINSERT INTO AT_gem VALUES\n")
+ for gkz, (name, _, _) in gemeinden.items():
+ f.write(f"({gkz:5}, '{name}'),\n".encode('utf8'))
+ f.seek(-2, 1)
+ f.write(b';\n')
+
+ f.write(b"\nINSERT INTO AT_kg VALUES\n")
+ for kgnr, name, gkz, _ in kgv_rows:
+ gemeinden[gkz][1].append(kgnr)
+ f.write(f"({kgnr:5}, {gkz:5}, '{name}'),\n".encode('utf8'))
+ f.seek(-2, 1)
+ f.write(b';\n')
+
+ f.write(b"\nINSERT INTO AT_ort VALUES\n")
+ for okz, (name, gkz) in ov.items():
+ kgnr_o = None
+
+ if name.startswith('Wien '):
+ name = name.replace('Wien ', 'Wien, ').replace('.', '. ')
+ elif 'Bez.' in name:
+ name = name.replace(',', ', ', 1).replace('.Bez.:', '. Bezirk: ').replace('0', '')
+
+ if gkz not in gemeinden:
+ print(okz, name, gkz)
+ pr.add(gkz)
+ continue
+
+ if len(gemeinden[gkz][1]) == 1:
+ kgnr_o = gemeinden[gkz][1][0]
+ else:
+ for kgnr in gemeinden[gkz][1]:
+ n11 = name.lower().replace('-', '').replace('th', 't')
+ n12 = name.lower().replace('-', ' ').replace('th', 't')
+ n21 = kgv[kgnr][0].lower().replace('-', '').replace('th', 't')
+ n22 = kgv[kgnr][0].lower().replace('-', ' ').replace('th', 't')
+ if n11 in n21 or n11 in n22 or n12 in n21 or n12 in n22 or n21 in n11 or n21 in n12 or n22 in n11 or n22 in n12:
+ kgnr_o = kgnr
+
+ f.write(f"({okz:5}, {gkz:5}, {kgnr_o if kgnr_o is not None else 'NULL':>5}, '{name}'),\n".encode('utf8'))
+ f.seek(-2, 1)
+ f.write(b';\n')
+
f.write(b"\nINSERT INTO AT_plz VALUES\n")
- for plz, dest, okz, _, _, _ in data:
+ for plz, dest, okz, _, _, _ in plz_rows:
f.write(f"({plz:4}, {okz:5}, '{dest}'),\n".encode('utf8'))
f.seek(-2, 1)
f.write(b';\n')
+ p = set()
+ for e in pr:
+ possible = filter(lambda a: a[2] // 10000 == e // 10000, kgv_rows.copy())
+ if e in GKZ:
+ for name in GKZ[e]:
+ if name == '':
+ continue
+ possible = filter(lambda a: a[1] == name, possible)
+ possible = list(possible)
+ if len(possible) == 1:
+ print(f' {e}: {possible[0][2]},')
+ p.add(e)
+
+ u = list(pr - p)
+ u.sort()
+ print(u)
+
if __name__ == '__main__':
parser = argparse.ArgumentParser()
args = parser.parse_args()
+ print('Downloading Ortsverzeichnis from statistik.gv.at')
+ download_ov()
+ print('Parsing Ortsverzeichnis')
+ ov_data = parse_ov()
print('Downloading PLZ data from www.post.at')
- write_sql(download_excel())
+ plz_data = download_plz()
+ print('Downloading Katastralgemeindenverzeichnis from www.bev.gv.at')
+ kgv_data = download_kgv()
+ print('Generating plz.sql')
+ write_sql(plz_data, kgv_data, ov_data)
print('Successfully created plz.sql!')