This commit is contained in:
2022-11-30 00:51:54 +01:00
parent c51d7673a7
commit db07458847

View File

@ -9,7 +9,7 @@ import zipfile
import PyPDF2 import PyPDF2
GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) X') GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) [X0-9]+')
ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?' ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?'
r'([()X0-9.]+ [()X0-9.]+ ?|$)') r'([()X0-9.]+ [()X0-9.]+ ?|$)')
STRIP_NUM = re.compile(r'[X0-9. -]+$') STRIP_NUM = re.compile(r'[X0-9. -]+$')
@ -48,6 +48,10 @@ WIEN_UMGEBUNG = {
} }
GEMEINDEN = { GEMEINDEN = {
# Tirol
70330: 70370,
70327: 70370,
70341: 70370,
# Oberösterreich # Oberösterreich
40803: 40835, 40803: 40835,
40819: 40835, 40819: 40835,
@ -55,6 +59,13 @@ GEMEINDEN = {
41330: 41344, 41330: 41344,
41519: 41344, 41519: 41344,
41625: 41628, 41625: 41628,
41301: 41346,
41303: 41343,
41310: 41345,
41335: 41346,
41339: 41343,
41340: 41628,
41520: 41522,
# Steiermark # Steiermark
60204: 62139, 60204: 62139,
60207: 62142, 60207: 62142,
@ -200,6 +211,223 @@ GEMEINDEN = {
60739: 62266, 60739: 62266,
60644: 60669, 60644: 60669,
60409: 62314, 60409: 62314,
61511: 62383,
61513: 62376,
61515: 62377,
61519: 62380,
61603: 61630,
61605: 61632,
61606: 61631,
61608: 61630,
61610: 61630,
61614: 61627,
61616: 61629,
61617: 61626,
61619: 61632,
61622: 61633,
61623: 61628,
61705: 61757,
61707: 61756,
61714: 61757,
61715: 61761,
61717: 61757,
61718: 62266,
61725: 61760,
61726: 61760,
61732: 61756,
61733: 61763,
61735: 61759,
61736: 61763,
61739: 61762,
61742: 61764,
61749: 61761,
60201: 62138,
60202: 62138,
60205: 62147,
60206: 62146,
60208: 62142,
60212: 62140,
60214: 62147,
60218: 62142,
60221: 62135,
60303: 60345,
60306: 60344,
60308: 60349,
60311: 60349,
60314: 60348,
60316: 60344,
60319: 60351,
60321: 60344,
60328: 60345,
60332: 60345,
60335: 60350,
60340: 60351,
60342: 60351,
60402: 62380,
60404: 62375,
60406: 62386,
60408: 62385,
60412: 62382,
60413: 62385,
60414: 62387,
60415: 62389,
60418: 62379,
60419: 62380,
60423: 62378,
60428: 62386,
60429: 62390,
60431: 62386,
60432: 62380,
60433: 62375,
60434: 61057,
60436: 62382,
60437: 62382,
60439: 62384,
60440: 62378,
60441: 60668,
60443: 62380,
60444: 62379,
60445: 62380,
60450: 61057,
60452: 62382,
60453: 62375,
60454: 62380,
60455: 62372,
60456: 62381,
60503: 62206,
60513: 62273,
60604: 60660,
60605: 60666,
60609: 60662,
60612: 60667,
60615: 60659,
60616: 60664,
60621: 60661,
60630: 60662,
60633: 60669,
60635: 60667,
60636: 60663,
60640: 60668,
60649: 61758,
60650: 61758,
60657: 60670,
60701: 62266,
60704: 62272,
60709: 62270,
60713: 62272,
60714: 62266,
60716: 62277,
60718: 62274,
60719: 62279,
60720: 62274,
60721: 62233,
60724: 62278,
60725: 62275,
60727: 62276,
60728: 62277,
60730: 62242,
60731: 62266,
60733: 62245,
60737: 62265,
60740: 62266,
60742: 62268,
60743: 62256,
60744: 62270,
60745: 62278,
60746: 62278,
60747: 62279,
60749: 62262,
60802: 62044,
60805: 62010,
60807: 62042,
60808: 62042,
60811: 62043,
60812: 62040,
60813: 62044,
60815: 62021,
60816: 62040,
60817: 62048,
60818: 62026,
60819: 62044,
60820: 62044,
60823: 62048,
60901: 62041,
60903: 62047,
60904: 62008,
60905: 62039,
60906: 62039,
60908: 62014,
60909: 62046,
60912: 62046,
60913: 62034,
61003: 61049,
61005: 61049,
61006: 61054,
61010: 61054,
61014: 61057,
61018: 61053,
61023: 61054,
61025: 61061,
61026: 61051,
61029: 61049,
61031: 61055,
61035: 61052,
61037: 61054,
61038: 61053,
61039: 61061,
61042: 61050,
61044: 61061,
61102: 61120,
61103: 61120,
61202: 61254,
61208: 61257,
61209: 61257,
61210: 61256,
61212: 61254,
61214: 61266,
61216: 61253,
61219: 61253,
61220: 61266,
61221: 61258,
61224: 61260,
61227: 61262,
61229: 61263,
61230: 61258,
61232: 61265,
61234: 61260,
61239: 61264,
61241: 61266,
61245: 61255,
61246: 61256,
61248: 61264,
61249: 61259,
61250: 61253,
61304: 62144,
61309: 62141,
61310: 62144,
61401: 61439,
61402: 61442,
61403: 61446,
61404: 61437,
61405: 61437,
61406: 61437,
61407: 61439,
61408: 61438,
61412: 61439,
61414: 61440,
61415: 61440,
61416: 61439,
61418: 61441,
61419: 61441,
61420: 61443,
61422: 61443,
61426: 61442,
61427: 61444,
61430: 61445,
61431: 61438,
61433: 61438,
61434: 61440,
61435: 61439,
} }
KgvRow = Tuple[int, str, int, str] KgvRow = Tuple[int, str, int, str]
@ -273,16 +501,20 @@ def download_ov_land(bundesland: str) -> List[OvRow]:
if not valid: if not valid:
continue continue
with open(f'out/{bundesland}.{page_num}.txt', 'w+') as o: with open(f'out/{bundesland}.{page_num + 1}.txt', 'w+') as o:
o.write(text) o.write(text)
lines = text.splitlines() lines = text.splitlines()
cont = False
for line in lines: for line in lines:
m1 = ORT.match(line) m1 = ORT.match(line)
m2 = GEM.match(line) m2 = GEM.match(line)
if last is not None: if last is not None:
if m1 is None: if line == 'Gemeindename':
cont = True
break
elif m1 is None:
last = f'{last} {line}' last = f'{last} {line}'
val = STRIP_NUM.sub('', ORT.match(last).group(0)) val = STRIP_NUM.sub('', ORT.match(last).group(0))
okz = int(val[:5]) okz = int(val[:5])
@ -323,6 +555,8 @@ def download_ov_land(bundesland: str) -> List[OvRow]:
elif gkz in GEMEINDEN: elif gkz in GEMEINDEN:
# Gemeindereformen (OÖ, Stmk.) # Gemeindereformen (OÖ, Stmk.)
gkz = GEMEINDEN[gkz] gkz = GEMEINDEN[gkz]
if cont:
continue
return rows return rows
@ -357,30 +591,46 @@ def write_sql(kgv_rows: List[KgvRow], ov_rows: List[OvRow]) -> None:
for gkz, okz, name in ov_rows: for gkz, okz, name in ov_rows:
kgnr_o = None kgnr_o = None
if name.startswith('Wien '):
name = name.replace('Wien ', 'Wien, ').replace('.', '. ')
elif name.startswith('Graz,'):
name = name.replace('Graz,', 'Graz, ').replace('.Bez.:', '. Bezirk: ').replace('0', '')
if gkz not in gemeinden: if gkz not in gemeinden:
pr.add(gkz) pr.add(gkz)
continue continue
for kgnr in gemeinden[gkz][1]: if len(gemeinden[gkz][1]) == 1:
if kgv[kgnr][0] in name or name in kgv[kgnr][0]: kgnr_o = gemeinden[gkz][1][0]
kgnr_o = kgnr else:
for kgnr in gemeinden[gkz][1]:
n11 = name.lower().replace('-', '').replace('th', 't')
n12 = name.lower().replace('-', ' ').replace('th', 't')
n21 = kgv[kgnr][0].lower().replace('-', '').replace('th', 't')
n22 = kgv[kgnr][0].lower().replace('-', ' ').replace('th', 't')
if n11 in n21 or n11 in n22 or n12 in n21 or n12 in n22 or n21 in n11 or n21 in n12 or n22 in n11 or n22 in n12:
kgnr_o = kgnr
f.write(f"({okz:5}, {kgnr_o if kgnr_o is not None else 'NULL':5}, '{name}'),\n".encode('utf8')) f.write(f"({okz:5}, {gkz:5}, {kgnr_o if kgnr_o is not None else 'NULL':>5}, '{name}'),\n".encode('utf8'))
f.seek(-2, 1) f.seek(-2, 1)
f.write(b';\n') f.write(b';\n')
p = set() p = set()
for e in pr: for e in pr:
possible = filter(lambda a: 60000 <= a[2] < 70000, kgv_rows.copy()) possible = filter(lambda a: a[2] // 10000 == e // 10000, kgv_rows.copy())
for name in GKZ[e]: if e in GKZ:
if name == '': for name in GKZ[e]:
continue if name == '':
possible = filter(lambda a: a[1] == name, possible) continue
possible = filter(lambda a: a[1] == name, possible)
possible = list(possible) possible = list(possible)
if len(possible) == 1: if len(possible) == 1:
print(f' {e}: {possible[0][2]},') print(f' {e}: {possible[0][2]},')
p.add(e) p.add(e)
print(pr - p)
u = list(pr - p)
u.sort()
print(u)
if __name__ == '__main__': if __name__ == '__main__':