Files
elwig-www/www/organic/external/bioqs/.operators.py

93 lines
3.3 KiB
Python
Executable File

#!/bin/env python3
import re
import argparse
import requests
import html
import json
import urllib.parse
# Site root; form action paths extracted from the page are appended to it.
BASE_URL = 'https://www.bioqs.at'
# Portal page that main() requests first to obtain the search form.
URL = f'{BASE_URL}/ACM/faces/form/cms/portal/index.jsp'
# Captures the value of an action="..." attribute.
ACTION_RE = re.compile(r'action="([^"]*)"')
# Captures (name, value) of a hidden <input>; expects type, name, value in
# that attribute order.
HIDDEN_RE = re.compile(r'<input type="hidden" name="([^"]*)" .*?value="([^"]*)"')
# Captures the whitespace-trimmed inner markup of a <tr> element.
ROW_RE = re.compile(r'<tr[^>]*>\s*(.*?)\s*</tr>', re.DOTALL)
# Matches a <tr style=""> row made of exactly seven attribute-less <td>
# cells. The groups only keep the last repetition, so use the whole match.
UNCOLLAPSED_ROW_RE = re.compile(r'<tr style="">(\s*<td>\s*(.*?)\s*</td>\s*){7}</tr>', re.DOTALL)
# Matches a <table width=...> element up to the nearest </table> and captures
# its whitespace-trimmed inner markup.
COLLAPSED_ROW_RE = re.compile(r'<table width=[^>]*>\s*(.*?)\s*</table>', re.DOTALL)
# Captures the whitespace-trimmed inner markup of a <td> element.
TD_RE = re.compile(r'<td[^>]*>\s*(.*?)\s*</td>', re.DOTALL)
# Matches any single tag-like span from '<' to the next '>'.
TAG_RE = re.compile(r'<[^>]*>')
# Matches a run of one or more whitespace characters.
SPACE_RE = re.compile(r'\s+')
# Captures the value paired with 'cert_attachment_sid' in a
# [['cert_attachment_sid','...']] literal (presumably inline JavaScript of the
# attachment link -- confirm against a real response).
ATTACHMENT_RE = re.compile(r"\[\['cert_attachment_sid','([^']*)'\]\]")
def remove_tags(text: str) -> str:
    """Reduce an HTML fragment to plain, single-spaced text.

    Every match of TAG_RE is replaced by a space, HTML entities are decoded,
    each match of SPACE_RE is collapsed into one space and the result is
    stripped of leading and trailing whitespace.
    """
    without_tags = TAG_RE.sub(' ', text)
    # Decode entities only after the tags are gone, so that an escaped '<'
    # in the text is never mistaken for the start of a tag.
    decoded = html.unescape(without_tags)
    single_spaced = SPACE_RE.sub(' ', decoded)
    return single_spaced.strip()
# Seconds to wait for the remote server before a request is aborted.
REQUEST_TIMEOUT = 30


def _to_iso_date(date: str) -> str:
    """Return *date* with its dash-separated parts in reverse order.

    Presumably converts the site's DD-MM-YYYY into YYYY-MM-DD -- confirm
    against a real response.
    """
    return '-'.join(reversed(date.split('-')))


def _parse_detail_table(rows: list) -> tuple:
    """Split the rows of one detail table into metadata and certificates.

    Each row is a tuple of cell texts. A single-cell row of the form
    ``key: value`` becomes a metadata entry; a row without cells is ignored;
    every other row is read as a certificate and must have at least five
    cells (``IndexError`` otherwise).

    Returns a ``(meta, certificates)`` tuple of a dict and a list of dicts.
    """
    meta = {}
    certificates = []
    for cells in rows:
        if not cells:
            continue
        if len(cells) == 1:
            key, sep, value = cells[0].partition(':')
            # A lone cell without a colon (e.g. an empty spacer cell) carries
            # no key/value pair; skip it instead of failing on the unpack.
            if sep:
                meta[key.strip()] = value.strip()
            continue
        certificates.append({
            'nr': cells[0],
            'validFrom': _to_iso_date(cells[1]),
            'validTo': _to_iso_date(cells[2]),
            'type': cells[3],
            'attachmentSid': cells[4],
            # NOTE(review): the URL is keyed by the certificate number, not by
            # attachmentSid -- confirm this is what the attachments endpoint expects.
            'url': f'https://elwig.at/organic/external/bioqs/attachments/{urllib.parse.quote(cells[0])}',
        })
    return meta, certificates


def main() -> None:
    """Run a partner certificate search and print the hits as a JSON array.

    The single command line argument is a query string (``a=1&b=2``). Every
    parameter name is prefixed with ``PartnerCertSearchForm:pcs_`` and its
    value is percent-decoded before being posted to the search form.

    The result is streamed to stdout as a JSON array with one object per
    line.

    Raises:
        requests.RequestException: on timeouts, connection errors or HTTP
            error statuses.
        RuntimeError: if the portal page does not contain the search form.
        KeyError: if a result lacks the 'Kontrollstelle' or 'Bereiche' entry.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('query', type=str)
    args = parser.parse_args()

    query = {}
    for param in args.query.split('&'):
        # A parameter without '=' uses the same text as both name and value.
        parts = param.split('=', 1)
        query['PartnerCertSearchForm:pcs_' + parts[0]] = urllib.parse.unquote(parts[-1])

    # The session keeps the cookies set by the first request for the form
    # submission; the context manager closes its connections afterwards.
    with requests.Session() as session:
        response = session.get(f'{URL}?menu_sid=5002', timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        form_page = response.text
        action = ACTION_RE.search(form_page)
        hidden = {m[1]: m[2] for m in HIDDEN_RE.finditer(form_page)}
        if action is None or 'javax.faces.ViewState' not in hidden:
            raise RuntimeError('search form not found in the portal page')

        response = session.post(f'{BASE_URL}{action[1]}', data={
            **query,
            'PartnerCertSearchForm:button_search': 'Suche starten...',
            'PartnerCertSearchForm_SUBMIT': '1',
            'javax.faces.ViewState': hidden['javax.faces.ViewState'],
        }, timeout=REQUEST_TIMEOUT)
        # Without this check an error page would be reported as "no results".
        response.raise_for_status()
        result_page = response.text

    # Everything from the first <table to the end of the last </table>.
    result_table = result_page[result_page.find('<table'):result_page.rfind('</table>') + len('</table>')]
    uncollapsed_rows = [
        tuple(remove_tags(cell[1]) for cell in TD_RE.finditer(row[0]))
        for row in UNCOLLAPSED_ROW_RE.finditer(result_table)
    ]
    # A cell that contains an attachment reference is reduced to the
    # referenced sid; all other cells are reduced to their plain text.
    collapsed_rows = [
        [
            tuple(remove_tags((ATTACHMENT_RE.search(cell[1]) or cell)[1])
                  for cell in TD_RE.finditer(detail_row[1]))
            for detail_row in ROW_RE.finditer(table[0])
        ]
        for table in COLLAPSED_ROW_RE.finditer(result_table)
    ]

    print('[')
    first = True
    # Summary rows and detail tables are paired purely by position.
    for row, detail_rows in zip(uncollapsed_rows, collapsed_rows):
        meta, certificates = _parse_detail_table(detail_rows)
        if not first:
            print(',', flush=True)
        print(' ', json.dumps({
            'idNr': row[0],
            'lfbisNr': row[1] or None,
            'name': row[2],
            'postalCode': row[3],
            'city': row[4],
            'address': row[5],
            'autorityName': meta['Kontrollstelle'],
            'productGroups': meta['Bereiche'],
            'certificates': certificates,
        }, ensure_ascii=False), end='')
        first = False
    print('\n]')
# Run the search only when executed as a script, not when imported.
if __name__ == '__main__':
    main()