396 lines
11 KiB
Python
Executable File
396 lines
11 KiB
Python
Executable File
#!/bin/env python3
|
|
|
|
from typing import List, Tuple
|
|
import argparse
|
|
import requests
|
|
import re
|
|
import tempfile
|
|
import zipfile
|
|
import PyPDF2
|
|
|
|
|
|
# Line that starts a new municipality: a five-digit code printed as "NNN NN"
# (first digit non-zero) followed by " X". Group 1 is the code, including the
# embedded space.
GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) X')
# Locality line: a five-digit code, a space, the name (one or more word-like
# tokens, or a token starting with "<digits>."), then either two columns made
# of digits/X/dots/parentheses or the end of the line.
# Group 1 is the code, group 2 the name, group 4 the trailing columns (the
# empty string when the line ends right after the name).
ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?'
                 r'([()X0-9.]+ [()X0-9.]+ ?|$)')
# Trailing run of X, digits, dots, spaces and hyphens (the numeric columns).
STRIP_NUM = re.compile(r'[X0-9. -]+$')
# Trailing abbreviation from a fixed list, optionally followed by a number in
# parentheses. Presumably the settlement-type codes printed after a name in
# the source PDF -- TODO confirm against the PDF legend.
STRIP_CODE = re.compile(r' *(Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|'
                        r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH) *(\([0-9]+\) *)?$')
|
|
|
|
# Page that is fetched to discover the download link of the zip archive.
KGV_URL = 'https://www.bev.gv.at/portal/page?_pageid=713,2601283&_dad=portal&_schema=PORTAL'
# Anchor tag whose only attribute is href; group 1 is the link target.
KGV_LINK = re.compile(r'<a\s+href="(.*?)"\s*>')

# URL template of the per-state PDF; the placeholder takes an entry of OV_NAMES.
OV_URL = 'https://statistik.at/fileadmin/publications/Ortsverzeichnis_2001__{}.pdf'
# State names as they are substituted into OV_URL, in download order.
OV_NAMES = ['Wien', 'Niederoesterreich', 'Oberoesterreich', 'Kaernten',
            'Steiermark', 'Vorarlberg', 'Burgenland', 'Tirol', 'Salzburg']
|
|
|
|
# Replacement municipality codes for codes in the range 32400-32499 (the
# former district "Wien Umgebung"): code found in the PDF -> code to use.
# NOTE(review): the lookup in download_ov_land() is unguarded, so a code in
# that range that is missing here (e.g. 32414) raises KeyError.
WIEN_UMGEBUNG = {
    32401: 30729,
    32402: 30730,
    32403: 31949,
    32404: 31235,
    32405: 30731,
    32406: 30732,
    32407: 30733,
    32408: 32144,
    32409: 30734,
    32410: 30735,
    32411: 30736,
    32412: 31950,
    32413: 30737,
    32415: 31951,
    32416: 31952,
    32417: 30738,
    32418: 30739,
    32419: 30740,
    32421: 31953,
    32423: 31954,
    32424: 30741,
}
|
|
|
|
# Replacement municipality codes applied after municipal reforms: code found
# in the PDF -> code to use instead. Codes that are not listed are kept as-is.
# The entries have the same shape as the " <old>: <new>," lines printed by
# write_sql() for codes it could not resolve.
GEMEINDEN = {
    # Upper Austria
    40803: 40835,
    40819: 40835,
    41308: 41344,
    41330: 41344,
    41519: 41344,
    41625: 41628,
    # Styria
    60204: 62139,
    60207: 62142,
    60209: 62140,
    60211: 62139,
    60213: 62125,
    60216: 62128,
    60217: 62146,
    60219: 62147,
    60301: 60345,
    60307: 60344,
    60309: 60350,
    60310: 60348,
    60313: 60345,
    60315: 60349,
    60327: 60347,
    60330: 60348,
    60331: 60349,
    60336: 60347,
    60339: 60346,
    60407: 62311,
    60410: 62378,
    60416: 62380,
    60426: 61057,
    60435: 62382,
    61504: 62383,
    61508: 62390,
    61510: 62383,
    61514: 62377,
    61517: 62380,
    61009: 61054,
    61011: 61012,
    61015: 61052,
    60512: 62273,
    61034: 61052,
    61036: 61061,
    61041: 61050,
    61602: 61631,
    61609: 61630,
    61104: 61107,
    61620: 61628,
    60601: 60665,
    60603: 60666,
    60607: 60664,
    60614: 60664,
    60620: 60661,
    60634: 60619,
    60658: 60660,
    61706: 61711,
    61712: 61759,
    61713: 62266,
    61720: 61763,
    61721: 61727,
    60703: 62272,
    60706: 62211,
    60708: 62270,
    61218: 61222,
    61223: 61260,
    61734: 61763,
    61737: 61762,
    61226: 61254,
    60723: 62275,
    60726: 62277,
    61238: 61266,
    60729: 62266,
    61244: 61256,
    60736: 62265,
    60738: 62275,
    60741: 62268,
    61301: 62144,
    61306: 62115,
    61308: 62144,
    61311: 62143,
    60801: 62044,
    61314: 62132,
    60804: 62010,
    60809: 62048,
    60810: 62044,
    60814: 62044,
    60821: 62042,
    60824: 62039,
    61411: 61439,
    60907: 62034,
    60417: 62380,
    60422: 62378,
    60425: 62381,
    60438: 60668,
    60442: 62380,
    60449: 62381,
    61505: 62383,
    61509: 62335,
    61520: 61060,
    60501: 62267,
    60502: 62206,
    60504: 62267,
    60507: 62269,
    60509: 62273,
    60511: 62232,
    61022: 61054,
    60514: 62267,
    61028: 61049,
    61040: 61059,
    61046: 61059,
    61047: 61059,
    61048: 61057,
    61601: 61626,
    61607: 61630,
    61613: 61627,
    61117: 61120,
    60606: 60661,
    60622: 60664,
    60625: 60668,
    60652: 60670,
    61201: 61254,
    60702: 62205,
    60705: 62277,
    60711: 62266,
    60715: 62272,
    60717: 62274,
    60210: 62142,
    61235: 61267,
    61753: 61765,
    61242: 61265,
    61754: 61757,
    60732: 62245,
    61755: 61766,
    60748: 62266,
    61305: 62141,
    61315: 62145,
    61316: 62145,
    60302: 60345,
    60822: 62048,
    60320: 60344,
    60322: 60345,
    60325: 60350,
    60333: 60350,
    60334: 60350,
    60338: 60344,
    60343: 62380,
    61417: 61445,
    60403: 62386,
    60411: 62389,
    60739: 62266,
    60644: 60669,
    60409: 62314,
}
|
|
|
|
# One row of the cadastral CSV, in the order write_sql() unpacks it:
# (kgnr, kg_name, gkz, gem_name).
KgvRow = Tuple[int, str, int, str]
# One locality parsed from the PDFs, in the order write_sql() unpacks it:
# (gkz, okz, name).
OvRow = Tuple[int, int, str]
|
|
|
|
|
|
def get_kvg_zip_url() -> str:
    """Return the download URL of the Katastralgemeindenverzeichnis zip file.

    Fetches ``KGV_URL`` and returns the ``href`` of the first anchor tag that
    ``KGV_LINK`` matches in the response body.

    Raises:
        RuntimeError: if the page is not answered with HTTP 200, or if it
            contains no matching link.
    """
    # Without a timeout a stalled server would block the script forever.
    r = requests.get(KGV_URL, timeout=60)
    if r.status_code != 200:
        raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')

    link = KGV_LINK.search(r.text)
    if link is None:
        raise RuntimeError('Unable to find url of zip file')

    return link.group(1)
|
|
|
|
|
|
def download_kgv() -> List[KgvRow]:
    """Download the Katastralgemeindenverzeichnis and return its rows.

    The zip archive located by ``get_kvg_zip_url()`` is streamed into a
    temporary file. The first member whose name ends in ``.csv`` is read as
    UTF-8, semicolon-separated text; its first line is treated as a header
    and skipped.

    Returns:
        One ``(kgnr, kg_name, gkz, gem_name)`` tuple per data line, taken
        from columns 0, 1, 3 and 4 of the CSV.

    Raises:
        RuntimeError: if the download is not answered with HTTP 200, or if
            the archive contains no ``.csv`` member.
    """
    # Imported locally so the block does not depend on the file's import list.
    import csv
    import io

    with tempfile.NamedTemporaryFile() as f:
        with requests.get(get_kvg_zip_url(), stream=True, timeout=60) as r:
            if r.status_code != 200:
                raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

        rows = []
        with zipfile.ZipFile(f, 'r') as zip_file:
            files = [name for name in zip_file.namelist() if name.endswith('.csv')]
            if not files:
                raise RuntimeError('Unable to find csv file in zip')

            with zip_file.open(files[0], 'r') as raw:
                # The csv module copes with empty fields and with quoted
                # fields that contain ';', both of which broke the manual
                # split-and-strip parsing.
                reader = csv.reader(io.TextIOWrapper(raw, encoding='utf8', newline=''),
                                    delimiter=';')
                next(reader, None)  # header line
                for record in reader:
                    if not record:
                        # blank line
                        continue
                    rows.append((int(record[0]), record[1], int(record[3]), record[4]))
        return rows
|
|
|
|
|
|
# Filled as a side effect of download_ov_land(): municipality code (gkz) ->
# list of names parsed from that municipality's "Katastralgemeinden:" line.
# Read by write_sql() when it reports municipality codes it could not resolve.
GKZ = {}
|
|
|
|
|
|
def _ov_row(gkz, val):
    """Build a ``(gkz, okz, name)`` row from a cleaned locality line.

    *val* starts with the five-digit locality code, followed by one separator
    character and the name; a trailing code matched by ``STRIP_CODE`` is
    removed from the name.
    """
    return gkz, int(val[:5]), STRIP_CODE.sub('', val[6:])


def download_ov_land(bundesland: str) -> List[OvRow]:
    """Download the Ortsverzeichnis PDF of one state and parse its localities.

    Pages are skipped until a short page (fewer than 100 characters) whose
    text, with spaces removed, reads ``Ortsverzeichnis``; parsing stops at the
    short page reading ``ALPHABETISCHEVERZEICHNISSE``. The extracted text of
    every parsed page is also written to ``out/<bundesland>.<page>.txt``.

    Side effects:
        Fills the module-level ``GKZ`` dict from ``Katastralgemeinden:`` lines
        and creates the ``out`` directory if it does not exist.

    Args:
        bundesland: state name as it appears in the PDF file name
            (see ``OV_NAMES``).

    Returns:
        One ``(gkz, okz, name)`` tuple per locality line, where ``gkz`` is the
        municipality code most recently seen (after remapping through
        ``WIEN_UMGEBUNG`` / ``GEMEINDEN``), or ``None`` if none was seen yet.

    Raises:
        RuntimeError: if the download is not answered with HTTP 200.
    """
    # Imported locally so the block does not depend on the file's import list.
    import os

    rows = []
    with tempfile.NamedTemporaryFile() as f:
        # stream=True so that iter_content() really streams; the timeout keeps
        # a stalled server from blocking the script forever.
        with requests.get(OV_URL.format(bundesland), stream=True, timeout=60) as r:
            if r.status_code != 200:
                raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

        # The per-page dumps below used to fail when 'out' was missing.
        os.makedirs('out', exist_ok=True)

        pdf = PyPDF2.PdfFileReader(f)
        gkz = None     # municipality code the following locality lines belong to
        last = None    # locality line whose name may continue on the next line
        valid = False  # True once the 'Ortsverzeichnis' title page was seen
        for page in pdf.pages:
            page_num = pdf.getPageNumber(page)
            text = page.extractText()

            if len(text) < 100:
                marker = text.strip().replace(' ', '')
                if marker == 'Ortsverzeichnis':
                    valid = True
                    continue
                elif valid and marker == 'ALPHABETISCHEVERZEICHNISSE':
                    break
            if not valid:
                continue

            with open(f'out/{bundesland}.{page_num}.txt', 'w+') as o:
                o.write(text)

            for line in text.splitlines():
                m1 = ORT.match(line)
                m2 = GEM.match(line)

                if last is not None:
                    if m1 is None:
                        # Continuation line: join it to the pending line and
                        # parse the result as one locality.
                        # NOTE(review): ORT.match() returning None here would
                        # raise AttributeError; unchanged from the original.
                        last = f'{last} {line}'
                        val = STRIP_NUM.sub('', ORT.match(last).group(0))
                        rows.append(_ov_row(gkz, val))
                        last = None
                        continue
                    else:
                        # The next locality starts here, so the pending line
                        # was already complete.
                        rows.append(_ov_row(gkz, last))
                        last = None

                if 'Katastralgemeinden:' in line:
                    p1 = line.find('Katastralgemeinden:')
                    p2 = line.find('Postleitzahl')
                    # Comma-separated entries; the last two space-separated
                    # tokens of each entry are dropped to leave the name.
                    val = [' '.join(a.split(' ')[:-2]) for a in line[p1 + 20:p2].split(', ')]
                    GKZ[gkz] = val
                    continue

                if m1:
                    val = STRIP_NUM.sub('', m1.group(0))
                    if m1.group(4) == '':
                        # No numeric columns on this line: the name may
                        # continue on the next line, so hold it back.
                        last = val
                    else:
                        rows.append(_ov_row(gkz, val))
                elif m2:
                    gkz = int(m2.group(1).replace(' ', ''))
                    if gkz > 90000:
                        # municipality of Vienna
                        gkz = 90001
                    elif gkz >= 32400 and gkz <= 32499:
                        # former district Wien Umgebung
                        gkz = WIEN_UMGEBUNG[gkz]
                    elif gkz in GEMEINDEN:
                        # municipal reforms (Upper Austria, Styria)
                        gkz = GEMEINDEN[gkz]
    return rows
|
|
|
|
|
|
def download_ov() -> List[OvRow]:
    """Return the parsed rows of every state in ``OV_NAMES``, in list order."""
    return [row for land in OV_NAMES for row in download_ov_land(land)]
|
|
|
|
|
|
def _sql_quote(text: str) -> str:
    """Return *text* as a single-quoted SQL string literal (``'`` doubled)."""
    return "'" + text.replace("'", "''") + "'"


def _write_insert(f, table: str, tuples) -> None:
    """Write one multi-row INSERT for *table*; write nothing if *tuples* is empty."""
    if not tuples:
        # "INSERT ... VALUES;" without rows would not be valid SQL.
        return
    f.write(f"\nINSERT INTO {table} VALUES\n".encode('utf8'))
    f.write((',\n'.join(tuples) + ';\n').encode('utf8'))


def write_sql(kgv_rows: List[KgvRow], ov_rows: List[OvRow]) -> None:
    """Write ``gemeinden.sql`` with INSERTs for AT_gem, AT_kg and AT_ort.

    * ``AT_gem`` gets one ``(gkz, name)`` row per distinct municipality code
      in *kgv_rows*.
    * ``AT_kg`` gets one ``(kgnr, gkz, name)`` row per entry of *kgv_rows*.
    * ``AT_ort`` gets one ``(okz, kgnr, name)`` row per entry of *ov_rows*
      whose municipality code is known. ``kgnr`` is the last cadastral
      community of that municipality whose name contains, or is contained
      in, the locality name, or ``NULL`` if there is none.

    Names are written as SQL string literals with embedded quotes doubled.
    A table without rows produces no statement at all.

    Afterwards, for every municipality code of *ov_rows* that is unknown to
    *kgv_rows*, a candidate ``old: new`` mapping is printed if exactly one can
    be derived, followed by the set of codes that remain unresolved.

    Args:
        kgv_rows: ``(kgnr, kg_name, gkz, gem_name)`` tuples.
        ov_rows: ``(gkz, okz, name)`` tuples.
    """
    kgv = {kgnr: (kg_name, gkz, gem_name) for kgnr, kg_name, gkz, gem_name in kgv_rows}
    # gkz -> (municipality name, cadastral community numbers; filled below)
    gemeinden = {gkz: (gem_name, []) for kgnr, kg_name, gkz, gem_name in kgv_rows}

    gem_tuples = [f"({gkz:5}, {_sql_quote(gem_name)})" for gkz, (gem_name, _) in gemeinden.items()]

    kg_tuples = []
    for kgnr, kg_name, gkz, _ in kgv_rows:
        gemeinden[gkz][1].append(kgnr)
        kg_tuples.append(f"({kgnr:5}, {gkz:5}, {_sql_quote(kg_name)})")

    ort_tuples = []
    pr = set()  # municipality codes of ov_rows that kgv_rows does not know
    for gkz, okz, ort_name in ov_rows:
        if gkz not in gemeinden:
            pr.add(gkz)
            continue

        kgnr_o = None
        for kgnr in gemeinden[gkz][1]:
            candidate = kgv[kgnr][0]
            if candidate in ort_name or ort_name in candidate:
                kgnr_o = kgnr

        kg_ref = kgnr_o if kgnr_o is not None else 'NULL'
        ort_tuples.append(f"({okz:5}, {kg_ref:5}, {_sql_quote(ort_name)})")

    with open('gemeinden.sql', 'wb') as f:
        _write_insert(f, 'AT_gem', gem_tuples)
        _write_insert(f, 'AT_kg', kg_tuples)
        _write_insert(f, 'AT_ort', ort_tuples)

    # Diagnostics: try to propose a replacement code for each unknown one.
    p = set()
    for e in pr:
        possible = filter(lambda a: 60000 <= a[2] < 70000, kgv_rows)
        # .get(): a code without a parsed "Katastralgemeinden:" line (or a
        # None code) is simply reported as unresolved instead of raising.
        for wanted in GKZ.get(e, []):
            if wanted == '':
                continue
            # NOTE(review): filter() is lazy and the lambda binds `wanted`
            # late, so when the chain is finally evaluated every stage
            # compares against the LAST value of `wanted`. Kept as-is because
            # the intended matching rule is unclear (requiring a[1] to equal
            # every name would never match for more than one distinct name).
            possible = filter(lambda a: a[1] == wanted, possible)
        possible = list(possible)
        if len(possible) == 1:
            print(f' {e}: {possible[0][2]},')
            p.add(e)
    print(pr - p)
|
|
|
|
|
|
def main() -> None:
    """Download both data sets and write ``gemeinden.sql``."""
    # No options are defined; parsing still provides --help and rejects
    # unexpected arguments.
    argparse.ArgumentParser().parse_args()

    print('Downloading and parsing Ortsverzeichnis from statistik.at')
    localities = download_ov()
    print('Downloading Katastralgemeindenverzeichnis from www.bev.gv.at')
    cadastral = download_kgv()
    write_sql(cadastral, localities)
    print('Successfully created gemeinden.sql!')


if __name__ == '__main__':
    main()
|