#!/usr/bin/env python3
"""Search the bioqs.at partner/certificate form and print the hits as JSON.

The script loads the search form, re-submits it with the fields given on the
command line and scrapes the result table with regular expressions.  The
result is written to stdout as a JSON array with one object per line.
"""

import re
import argparse
import html
import json
import urllib.parse
from typing import Dict, List, Sequence, Tuple

BASE_URL = 'https://www.bioqs.at'
URL = f'{BASE_URL}/ACM/faces/form/cms/portal/index.jsp'

# Prefix the search form puts in front of every query field name.
FIELD_PREFIX = 'PartnerCertSearchForm:pcs_'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

ACTION_RE = re.compile(r'action="([^"]*)"')

# NOTE(review): in the copy of this script that was reviewed, the literal
# markup inside the following five patterns and inside RESULT_START /
# RESULT_END had been stripped out, and ROW_RE was missing altogether although
# it is used.  Everything between here and TAG_RE is reconstructed from the
# surviving fragments and from how the matches are consumed below.  Verify
# each pattern against the live page before relying on the output.
HIDDEN_RE = re.compile(
    r'<input[^>]*type="hidden"[^>]*name="([^"]*)"[^>]*value="([^"]*)"')
ROW_RE = re.compile(r'<tr[^>]*>\s*(.*?)\s*</tr>', re.DOTALL)
UNCOLLAPSED_ROW_RE = re.compile(r'(\s*<td>\s*(.*?)\s*</td>\s*){7}', re.DOTALL)
COLLAPSED_ROW_RE = re.compile(r'<table[^>]*>\s*(.*?)\s*</table>', re.DOTALL)
TD_RE = re.compile(r'<td[^>]*>\s*(.*?)\s*</td>', re.DOTALL)
RESULT_START = '<tbody'
RESULT_END = '</tbody>'

TAG_RE = re.compile(r'<[^>]*>')
SPACE_RE = re.compile(r'\s+')
ATTACHMENT_RE = re.compile(r"\[\['cert_attachment_sid','([^']*)'\]\]")


def remove_tags(text: str) -> str:
    """Return *text* without markup.

    Tags are replaced by spaces, HTML entities are decoded, runs of
    whitespace collapse to a single space and the result is trimmed.
    """
    without_tags = TAG_RE.sub(' ', text)
    return SPACE_RE.sub(' ', html.unescape(without_tags)).strip()


def parse_query(raw: str) -> Dict[str, str]:
    """Turn ``name=value&name=value`` into the form fields to submit.

    Each name gets ``FIELD_PREFIX`` prepended and each value is
    percent-decoded.  Empty segments (as in ``a=1&&b=2``) are ignored.
    """
    fields = {}
    for part in raw.split('&'):
        if not part:
            continue
        name, sep, value = part.partition('=')
        # A segment without '=' has always been submitted with the name as
        # its own value; that behaviour is kept as-is.
        fields[FIELD_PREFIX + name] = urllib.parse.unquote(value if sep else name)
    return fields


def fetch_result_page(query: Dict[str, str]) -> str:
    """Submit the search form with *query* and return the result page HTML.

    Raises:
        RuntimeError: if the form page lacks the form action or the JSF
            view state needed for the submission.
        requests.RequestException: on network errors, timeouts or HTTP
            error statuses.
    """
    # Imported here so the parsing helpers stay importable (and testable)
    # on systems without the HTTP dependency installed.
    import requests

    with requests.Session() as session:
        form_page = session.get(f'{URL}?menu_sid=5002', timeout=REQUEST_TIMEOUT)
        form_page.raise_for_status()

        action = ACTION_RE.search(form_page.text)
        if action is None:
            raise RuntimeError('search form action not found in form page')
        hidden = {m[1]: m[2] for m in HIDDEN_RE.finditer(form_page.text)}
        view_state = hidden.get('javax.faces.ViewState')
        if view_state is None:
            raise RuntimeError('javax.faces.ViewState not found in form page')

        result_page = session.post(
            f'{BASE_URL}{action[1]}',
            data={
                **query,
                'PartnerCertSearchForm:button_search': 'Suche starten...',
                'PartnerCertSearchForm_SUBMIT': '1',
                'javax.faces.ViewState': view_state,
            },
            timeout=REQUEST_TIMEOUT,
        )
        result_page.raise_for_status()
        return result_page.text


def extract_result_table(page: str) -> str:
    """Return the slice of *page* holding the result rows, or ``''`` if absent."""
    start = page.find(RESULT_START)
    end = page.rfind(RESULT_END)
    if start < 0 or end < start:
        return ''
    return page[start:end + len(RESULT_END)]


def cell_text(cell: str) -> str:
    """Return the attachment id embedded in *cell*, else its plain text."""
    attachment = ATTACHMENT_RE.search(cell)
    return remove_tags(attachment[1] if attachment else cell)


def iso_date(value: str) -> str:
    """Reverse the dash-separated parts of *value*.

    Presumably this turns ``DD-MM-YYYY`` into ``YYYY-MM-DD``; the input
    format is not validated.
    """
    return '-'.join(reversed(value.split('-')))


def build_record(row: Sequence[str],
                 detail_rows: Sequence[Tuple[str, ...]]) -> Dict[str, object]:
    """Combine one summary row with the rows of its detail table.

    *row* needs at least six cells.  In *detail_rows*, a single-cell row of
    the form ``key: value`` is metadata, a row with five or more cells is a
    certificate, and anything else (empty rows, rows without a colon, rows
    with two to four cells) is skipped.
    """
    meta = {}
    certificates = []
    for cells in detail_rows:
        if not cells:
            continue
        if len(cells) == 1:
            key, sep, value = cells[0].partition(':')
            if sep:
                meta[key.strip()] = value.strip()
            continue
        if len(cells) < 5:
            continue
        certificates.append({
            'nr': cells[0],
            'validFrom': iso_date(cells[1]),
            'validTo': iso_date(cells[2]),
            'type': cells[3],
            'attachmentSid': cells[4],
        })
    return {
        'idNr': row[0],
        'lfbisNr': row[1] or None,
        'name': row[2],
        'postalCode': row[3],
        'city': row[4],
        'address': row[5],
        # The key spelling "autorityName" is part of the emitted format and
        # is kept unchanged for existing consumers.
        'autorityName': meta.get('Kontrollstelle'),
        'productGroups': meta.get('Bereiche'),
        'certificates': certificates,
    }


def parse_results(result_table: str) -> List[Dict[str, object]]:
    """Extract all records from the result table markup.

    Summary rows and detail tables are collected separately and paired up
    in document order; a surplus on either side is dropped by ``zip``.
    """
    summary_rows = [
        tuple(remove_tags(cell[1]) for cell in TD_RE.finditer(row[0]))
        for row in UNCOLLAPSED_ROW_RE.finditer(result_table)
    ]
    detail_tables = [
        [tuple(cell_text(cell[1]) for cell in TD_RE.finditer(row[1]))
         for row in ROW_RE.finditer(tbl[0])]
        for tbl in COLLAPSED_ROW_RE.finditer(result_table)
    ]
    return [build_record(row, tbl) for row, tbl in zip(summary_rows, detail_tables)]


def format_json(records: Sequence[Dict[str, object]]) -> str:
    """Render *records* as a JSON array with one indented object per line."""
    lines = ',\n'.join(
        '  ' + json.dumps(record, ensure_ascii=False) for record in records)
    return f'[\n{lines}\n]'


def main() -> None:
    """Run the search given on the command line and print the JSON result."""
    parser = argparse.ArgumentParser()
    parser.add_argument('query', type=str)
    args = parser.parse_args()

    page = fetch_result_page(parse_query(args.query))
    print(format_json(parse_results(extract_result_table(page))))


if __name__ == '__main__':
    main()