Working
This commit is contained in:
40
data/plz.py
40
data/plz.py
@ -21,11 +21,14 @@ OV_URL = 'https://statistik.gv.at/fileadmin/publications/Ortsverzeichnis_2001__{
|
|||||||
OV_NAMES = ['Burgenland', 'Niederoesterreich', 'Wien']
|
OV_NAMES = ['Burgenland', 'Niederoesterreich', 'Wien']
|
||||||
|
|
||||||
GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) [X0-9]+')
|
GEM = re.compile(r'^([1-9][0-9]{2} [0-9]{2}) [X0-9]+')
|
||||||
ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+) ?'
|
ORT = re.compile(r'^([0-9]{5}) (([A-Za-zÄÖÜäöü(][A-Za-z0-9äöüÄÖÜßẞ():,.-]* ?|[0-9]+\..*?)+)'
|
||||||
r'([()X0-9.]+ ?|$)')
|
r'(( [()X0-9.-]+)*)?([A-Za-zÄÖÜäöüßẞ ]+([0-9]*))?$')
|
||||||
STRIP_NUM = re.compile(r'[X0-9. -]+$')
|
STRIP_NUM = re.compile(r'[X0-9. -]+$')
|
||||||
STRIP_CODE = re.compile(r' *(Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|'
|
|
||||||
r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH) *(\([0-9]+\) *)?$')
|
CODES = re.compile(r'Agh|Alm|Alpe|B|Bd|Bwg|Burg|Camp|D|E|Fbk|Fhei|Gh|Hgr|Hot|Indz|Jh|Jhtt|Ki|Kl|Krwk|Ks|M|Mh|'
|
||||||
|
r'Mü|R|Ru|Sa|Sä|Sb|Schh|Schih|Schl|Sdlg|Sgr|St|Stbr|Stt|V|W|We|Ek|Z|Zgl|ZH')
|
||||||
|
STRIP_CODE = re.compile(r' *(' + CODES.pattern + r')[()X0-9. -]*$')
|
||||||
|
STRIP_CODE_ALL = re.compile(r' +\b(' + CODES.pattern + r')\b.*$')
|
||||||
|
|
||||||
PlzRow = Tuple[int, str, int, str, int, str]
|
PlzRow = Tuple[int, str, int, str, int, str]
|
||||||
KgvRow = Tuple[int, str, int, str]
|
KgvRow = Tuple[int, str, int, str]
|
||||||
@ -74,7 +77,7 @@ def download_plz() -> List[PlzRow]:
|
|||||||
f_name = None
|
f_name = None
|
||||||
try:
|
try:
|
||||||
f = tempfile.NamedTemporaryFile(delete=False)
|
f = tempfile.NamedTemporaryFile(delete=False)
|
||||||
with requests.get(get_plz_url(), stream=True) as r:
|
with requests.get(get_plz_url(), stream=True, headers={'User-Agent': 'Mozilla/5.0'}) as r:
|
||||||
if r.status_code != 200:
|
if r.status_code != 200:
|
||||||
raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
|
raise RuntimeError(f'Unexpected response: {r.status_code} {r.reason}')
|
||||||
for chunk in r.iter_content(chunk_size=8192):
|
for chunk in r.iter_content(chunk_size=8192):
|
||||||
@ -156,7 +159,7 @@ def download_ov_land(bundesland: str) -> List[OvRow]:
|
|||||||
if not valid:
|
if not valid:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
with open(f'out/{bundesland}/{page_num + 1}.txt', 'w+') as o:
|
with open(f'out/{bundesland}/{page_num + 1:03}.txt', 'w+') as o:
|
||||||
o.write(text)
|
o.write(text)
|
||||||
|
|
||||||
return rows
|
return rows
|
||||||
@ -191,18 +194,18 @@ def parse_ov() -> List[OvRow]:
|
|||||||
cont = True
|
cont = True
|
||||||
break
|
break
|
||||||
elif m1 is None:
|
elif m1 is None:
|
||||||
|
if ',' not in line and ':' not in line:
|
||||||
last = f'{last} {line}'
|
last = f'{last} {line}'
|
||||||
print(last)
|
m3 = ORT.match(last)
|
||||||
val = STRIP_NUM.sub('', ORT.match(last).group(0))
|
okz = int(m3.group(1))
|
||||||
okz = int(val[:5])
|
name = STRIP_CODE_ALL.sub('', m3.group(2))
|
||||||
name = STRIP_CODE.sub('', val[6:])
|
|
||||||
rows.append((gkz, okz, name))
|
rows.append((gkz, okz, name))
|
||||||
last = None
|
last = None
|
||||||
continue
|
continue
|
||||||
else:
|
else:
|
||||||
val = last
|
m3 = ORT.match(last)
|
||||||
okz = int(val[:5])
|
okz = int(m3.group(1))
|
||||||
name = STRIP_CODE.sub('', val[6:])
|
name = STRIP_CODE.sub('', m3.group(2))
|
||||||
rows.append((gkz, okz, name))
|
rows.append((gkz, okz, name))
|
||||||
last = None
|
last = None
|
||||||
|
|
||||||
@ -215,13 +218,16 @@ def parse_ov() -> List[OvRow]:
|
|||||||
|
|
||||||
if m1:
|
if m1:
|
||||||
val = STRIP_NUM.sub('', m1.group(0))
|
val = STRIP_NUM.sub('', m1.group(0))
|
||||||
if m1.group(4) == '':
|
|
||||||
|
if m1.group(4) == '' or (m1.group(6) is not None and m1.group(6) != ''):
|
||||||
|
last = m1.group(1) + ' ' + m1.group(2)
|
||||||
|
else:
|
||||||
|
okz = int(m1.group(1))
|
||||||
|
name = STRIP_CODE.sub('', m1.group(2))
|
||||||
|
if len(name) == len(m1.group(2)):
|
||||||
last = val
|
last = val
|
||||||
else:
|
else:
|
||||||
okz = int(val[:5])
|
|
||||||
name = STRIP_CODE.sub('', val[6:])
|
|
||||||
rows.append((gkz, okz, name))
|
rows.append((gkz, okz, name))
|
||||||
print(rows[-1])
|
|
||||||
elif m2:
|
elif m2:
|
||||||
if len(line.split(' ')) <= 9:
|
if len(line.split(' ')) <= 9:
|
||||||
continue
|
continue
|
||||||
|
Reference in New Issue
Block a user