From 216e195102e8ae264c44d275ce736311dce7b108 Mon Sep 17 00:00:00 2001 From: Lorenz Stechauner Date: Wed, 30 Nov 2022 13:22:00 +0100 Subject: [PATCH] Working --- data/plz.py | 46 ++++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/data/plz.py b/data/plz.py index 1334172..c2e9895 100755 --- a/data/plz.py +++ b/data/plz.py @@ -21,11 +21,14 @@ OV_URL = 'https://statistik.gv.at/fileadmin/publications/Ortsverzeichnis_2001__{ OV_NAMES = ['Burgenland', 'Niederoesterreich', 'Wien'] GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) [X0-9]+') -ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?' - r'([()X0-9.]+ ?|$)') +ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+)' + r'(( [()X0-9.-]+)*)?([A-Za-zÄÖÜäöüßẞ ]+([0-9]*))?$') STRIP_NUM = re.compile(r'[X0-9. -]+$') -STRIP_CODE = re.compile(r' *(Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|' - r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH) *(\([0-9]+\) *)?$') + +CODES = re.compile(r'Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|' + r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH') +STRIP_CODE = re.compile(r' *(' + CODES.pattern + r')[()X0-9. -]*$') +STRIP_CODE_ALL = re.compile(r' +\b(' + CODES.pattern + r')\b.*$') PlzRow = Tuple[int, str, int, str, int, str] KgvRow = Tuple[int, str, int, str] @@ -74,7 +77,7 @@ def download_plz() -> List[PlzRow]: f_name = None try: f = tempfile.NamedTemporaryFile(delete=False) - with requests.get(get_plz_url(), stream=True) as r: + with requests.get(get_plz_url(), stream=True, headers={'User-Agent': 'Mozilla/5.0'}) as r: if r.status_code != 200: raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}') for chunk in r.iter_content(chunk_size=8192): @@ -156,7 +159,7 @@ def download_ov_land(bundesland: str) -> List[OvRow]: if not valid: continue - with open(f'out/{bundesland}/{page_num + 1}.txt', 'w+') as o: + with open(f'out/{bundesland}/{page_num + 1:03}.txt', 'w+') as o: o.write(text) return rows @@ -191,18 +194,18 @@ def parse_ov() -> List[OvRow]: cont = True break elif m1 is None: - last = f'{last} {line}' - print(last) - val = STRIP_NUM.sub('', ORT.match(last).group(0)) - okz = int(val[:5]) - name = STRIP_CODE.sub('', val[6:]) + if ',' not in line and ':' not in line: + last = f'{last} {line}' + m3 = ORT.match(last) + okz = int(m3.group(1)) + name = STRIP_CODE_ALL.sub('', m3.group(2)) rows.append((gkz, okz, name)) last = None continue else: - val = last - okz = int(val[:5]) - name = STRIP_CODE.sub('', val[6:]) + m3 = ORT.match(last) + okz = int(m3.group(1)) + name = STRIP_CODE.sub('', m3.group(2)) rows.append((gkz, okz, name)) last = None @@ -215,13 +218,16 @@ def parse_ov() -> List[OvRow]: if m1: val = STRIP_NUM.sub('', m1.group(0)) - if m1.group(4) == '': - last = val + + if m1.group(4) == '' or (m1.group(6) is not None and m1.group(6) != ''): + last = m1.group(1) + ' ' + m1.group(2) else: - okz = int(val[:5]) - name = STRIP_CODE.sub('', val[6:]) - rows.append((gkz, okz, name)) - print(rows[-1]) + okz = int(m1.group(1)) + name = STRIP_CODE.sub('', m1.group(2)) + if len(name) == len(m1.group(2)): + last = val + else: + rows.append((gkz, okz, name)) elif m2: if len(line.split(' ')) <= 9: continue