diff --git a/data/gemeinden.py b/data/gemeinden.py index 984f340..1cbdfc2 100755 --- a/data/gemeinden.py +++ b/data/gemeinden.py @@ -10,7 +10,8 @@ import PyPDF2 GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) X') -ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü][A-Za-z0-9äöüÄÖÜßẞ:.-]* |[0-9]+\..*?)+)( ?[0-9]+ |$)') +ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?' + r'([()X0-9.]+ [()X0-9.]+ ?|$)') STRIP_NUM = re.compile(r'[X0-9. -]+$') STRIP_CODE = re.compile(r' *(Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|' r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH) *(\([0-9]+\) *)?$') @@ -137,6 +138,68 @@ GEMEINDEN = { 60824: 62039, 61411: 61439, 60907: 62034, + 60417: 62380, + 60422: 62378, + 60425: 62381, + 60438: 60668, + 60442: 62380, + 60449: 62381, + 61505: 62383, + 61509: 62335, + 61520: 61060, + 60501: 62267, + 60502: 62206, + 60504: 62267, + 60507: 62269, + 60509: 62273, + 60511: 62232, + 61022: 61054, + 60514: 62267, + 61028: 61049, + 61040: 61059, + 61046: 61059, + 61047: 61059, + 61048: 61057, + 61601: 61626, + 61607: 61630, + 61613: 61627, + 61117: 61120, + 60606: 60661, + 60622: 60664, + 60625: 60668, + 60652: 60670, + 61201: 61254, + 60702: 62205, + 60705: 62277, + 60711: 62266, + 60715: 62272, + 60717: 62274, + 60210: 62142, + 61235: 61267, + 61753: 61765, + 61242: 61265, + 61754: 61757, + 60732: 62245, + 61755: 61766, + 60748: 62266, + 61305: 62141, + 61315: 62145, + 61316: 62145, + 60302: 60345, + 60822: 62048, + 60320: 60344, + 60322: 60345, + 60325: 60350, + 60333: 60350, + 60334: 60350, + 60338: 60344, + 60343: 62380, + 61417: 61445, + 60403: 62386, + 60411: 62389, + 60739: 62266, + 60644: 60669, + 60409: 62314, } KgvRow = Tuple[int, str, int, str] @@ -181,6 +244,9 @@ def download_kgv() -> List[KgvRow]: return rows +GKZ = {} + + def download_ov_land(bundesland: str) -> List[OvRow]: rows = [] with tempfile.NamedTemporaryFile() as f: @@ -195,7 +261,7 @@ def download_ov_land(bundesland: str) -> List[OvRow]: last = None valid = False for page in pdf.pages: - # page_num = pdf.getPageNumber(page) + page_num = pdf.getPageNumber(page) text = page.extractText() if len(text) < 100: @@ -207,19 +273,37 @@ def download_ov_land(bundesland: str) -> List[OvRow]: if not valid: continue + with open(f'out/{bundesland}.{page_num}.txt', 'w+') as o: + o.write(text) + lines = text.splitlines() for line in lines: - if last is not None: - last += ' ' + line - val = STRIP_NUM.sub('', ORT.match(last).group(0)) - okz = int(val[:5]) - name = STRIP_CODE.sub('', val[6:]) - rows.append((gkz, okz, name)) - last = None - continue - m1 = ORT.match(line) m2 = GEM.match(line) + + if last is not None: + if m1 is None: + last = f'{last} {line}' + val = STRIP_NUM.sub('', ORT.match(last).group(0)) + okz = int(val[:5]) + name = STRIP_CODE.sub('', val[6:]) + rows.append((gkz, okz, name)) + last = None + continue + else: + val = last + okz = int(val[:5]) + name = STRIP_CODE.sub('', val[6:]) + rows.append((gkz, okz, name)) + last = None + + if 'Katastralgemeinden:' in line: + p1 = line.find('Katastralgemeinden:') + p2 = line.find('Postleitzahl') + val = [' '.join(a.split(' ')[:-2]) for a in line[p1 + 20:p2].split(', ')] + GKZ[gkz] = val + continue + if m1: val = STRIP_NUM.sub('', m1.group(0)) if m1.group(4) == '': @@ -271,8 +355,6 @@ def write_sql(kgv_rows: List[KgvRow], ov_rows: List[OvRow]) -> None: f.write(b"\nINSERT INTO AT_ort VALUES\n") pr = set() for gkz, okz, name in ov_rows: - if name.lower() == 'kollnbrunn': - print(name) kgnr_o = None if gkz not in gemeinden: @@ -287,7 +369,18 @@ def write_sql(kgv_rows: List[KgvRow], ov_rows: List[OvRow]) -> None: f.seek(-2, 1) f.write(b';\n') - print(pr) + p = set() + for e in pr: + possible = filter(lambda a: 60000 <= a[2] < 70000, kgv_rows.copy()) + for name in GKZ[e]: + if name == '': + continue + possible = filter(lambda a: a[1] == name, possible) + possible = list(possible) + if len(possible) == 1: + print(f' {e}: {possible[0][2]},') + p.add(e) + print(pr - p) if __name__ == '__main__':