From a3c401ead2eb9fdbd9a3d0c39c1822300fee78ed Mon Sep 17 00:00:00 2001
From: Lorenz Stechauner
Date: Sun, 13 Jul 2025 20:15:33 +0200
Subject: [PATCH] organic: Add pdf endpoint

---
 www/organic/external/easy-cert/operators.php | 154 ++++++++++++++--
 www/organic/pdf.php                          | 182 +++++++++++++++++++
 2 files changed, 321 insertions(+), 15 deletions(-)
 create mode 100644 www/organic/pdf.php

diff --git a/www/organic/external/easy-cert/operators.php b/www/organic/external/easy-cert/operators.php
index 462505a..bb59a1f 100644
--- a/www/organic/external/easy-cert/operators.php
+++ b/www/organic/external/easy-cert/operators.php
@@ -1,12 +1,5 @@ {'totalCount'} !== 1) {
+            header('Status: 404');
+            header('Content-Length: 0');
+            exit;
+        }
+        header('Status: 303');
+        header('Location: ./' . $json->{'data'}[0]->{'db'} . ':' . $json->{'data'}[0]->{'id'});
+        exit;
+    }
+
+
+    $refDate = is_string($_GET['referenceDate'] ?? null) && preg_match('/^\d{4}-\d{1,2}-\d{1,2}$/', $_GET['referenceDate']) === 1 ? $_GET['referenceDate'] : null; // ignore malformed dates instead of indexing missing explode() parts
+    if ($refDate !== null) {
+        $refDate = explode('-', $refDate);
+        $refDate = $refDate[2] . '.' . $refDate[1] . '.' . $refDate[0];
+    }
+    $url = "https://www.easy-cert.com/htm/suchresultat-detail.htm?sprache=de&db=" . urlencode($parts[0]) . "&id=" . urlencode(str_replace('-', '_', $parts[1])) . "&historyDate=" . urlencode($refDate ?? ''); // urlencode(null) is deprecated since PHP 8.1
+    $s = curl_init($url);
+    curl_setopt($s, CURLOPT_RETURNTRANSFER, true);
+    if (($html = curl_exec($s)) === false) {
+        header('Status: 500');
+        header('Content-Length: 0');
+        exit;
+    }
+
+    $html = preg_replace('//s', ' ', $html);
+    preg_match_all('@\s*]*>([^<]*)\s*]*>([^<]*)\s*@', $html, $matches, PREG_SET_ORDER);
+
+    $data = [];
+    foreach ($matches as $m) {
+        $data[$m[1]] = trim(html_entity_decode($m[2]));
+        if ($data[$m[1]] === '') {
+            $data[$m[1]] = null;
+        }
+    }
+    $plzOrt = $data['PLZ / Ort'] ?? null;
+    if ($plzOrt === null) {
+        $postalCode = null;
+        $city = null;
+    } else {
+        $p = explode(' ', $plzOrt, 2);
+        $postalCode = trim($p[0]);
+        $city = isset($p[1]) ? trim($p[1]) : null; // value may hold a postal code only (no space to split on)
+    }
+
+    if (($data['Name'] ?? null) === null) { // key is absent when the page contained no data rows
+        header('Status: 404');
+        header('Content-Length: 0');
+        exit;
+    }
+
+    if (preg_match('@name="historydate_input" value="(\d{1,2}\.\d{1,2}\.\d{4})"@', $html, $matches) === 1) { // only accept d.m.Y, the shape the lines below rely on
+        $refDate = $matches[1];
+        $refDate = explode('.', $refDate);
+        $refDate = $refDate[2] . '-' . $refDate[1] . '-' . $refDate[0];
+    } else {
+        $refDate = null;
+    }
+
+    $certs = [];
+    preg_match_all('@\s*([^<]*)\s*]*>([^<]*)\s*]*>\s*([^<]*)\s*.*?\s*]*>[^<]*\s*@', $html, $matches, PREG_SET_ORDER);
+    foreach ($matches as $m) {
+        $certs[] = [
+            'nr' => $m[1],
+            'validUntil' => implode('-', array_reverse(explode('.', $m[2]))),
+            'type' => $m[3] === '' ? null : $m[3],
+            'pdfUrl' => str_replace('http://', 'https://', $m[4]),
+        ];
+    }
+
+    $labels = [];
+    preg_match_all('@\s*([^<]*)\s*]*>([^<]*)\s*]*>\s*([^<]*)\s*.*?\s*]*>[^<]*\s*@', $html, $matches, PREG_SET_ORDER);
+    foreach ($matches as $m) {
+        $labels[] = [
+            'nr' => $m[1],
+            'validUntil' => implode('-', array_reverse(explode('.', $m[2]))),
+            'type' => $m[3] === '' ? null : $m[3],
+            'pdfUrl' => str_replace('http://', 'https://', $m[4]),
+        ];
+    }
+
+    header("Content-Type: application/json; charset=UTF-8");
+
+    echo '{"db":' . jenc($parts[0]) .
+        ',"id":' . jenc($parts[1]) .
+        ",\n \"idNr\":" . jenc($data['ID-Nummer'] ?? null) .
+        ',"name":' . jenc($data['Name']) .
+        ',"address":' . jenc($data['Strasse'] ?? null) .
+        ',"postalCode":' . jenc($postalCode) .
+        ',"city":' . jenc($city) .
+        ',"countryCode":' . jenc($data['Land'] ?? null) .
+        ",\n \"referenceDate\":" . jenc($refDate) .
+        ",\n \"certificates\":" . jenc($certs) .
+        ",\n \"privateStandardApprovals\":" . jenc($labels) .
+        '}';
+
+    exit;
+}
+
 $search_url = null;
 $url = null;
 $query_id = null;
@@ -43,9 +152,10 @@ if (isset($_GET['queryId'])) {
     $country = $_GET['country'] ?? null;
     $postalCode = $_GET['postalCode'] ?? null;
     $name = $_GET['name'] ?? null;
+    $idNr = $_GET['idNr'] ?? null;
     $renew = ($_GET['renew'] ?? 'false') === 'true';
 
-    $search_url = "https://www.easy-cert.com/htm/suchergebnis.htm?suchtyp=einfach&CountryCode=$country&PostalCode=$postalCode&Name=$name";
+    $search_url = "https://www.easy-cert.com/htm/suchergebnis.htm?suchtyp=einfach&CountryCode=" . urlencode($country ?? '') . "&PostalCode=" . urlencode($postalCode ?? '') . "&Name=" . urlencode($name ?? '') . "&CustomerNumber=" . urlencode($idNr ?? ''); // encode request input so it cannot inject extra query parameters
 
     if (!$renew && $cache = fopen('.cache.csv', 'r')) {
         while (($line = fgets($cache)) !== false) {
@@ -111,15 +221,28 @@ $sed = [
     's/^/ /',
     's/}$/},/',
 ];
+$replace = [
+    '"DB":' => '"db":',
+    '"Name":' => '"name":',
+    '"PostalCode":' => '"postalCode":',
+    '"Town":' => '"city":',
+    '"CustomerNumber":' => '"idNr":',
+    '"ID":' => '"id":',
+    '"CountryCode":' => '"countryCode":',
+    '"xx"' => 'null',
+    '"XX"' => 'null',
+    '""' => 'null',
+];
+$replaceSed = array_map(fn($v, $k): string => "s/$k/$v/", $replace, array_keys($replace));
 
 header("Content-Type: application/json; charset=UTF-8");
 
-echo '{"searchUrl":' . json_encode($search_url) .
-    ',"queryId":' . json_encode($query_id) .
-    ',"rawFileUrl":' . json_encode($url) .
-    ',"timestamp":' . json_encode(gmdate('Y-m-d\TH:i:s\Z', $timestamp)) .
-    ',"limit":' . json_encode($limit) .
-    ',"offset":' . json_encode($offset) . ",\"data\":[\n";
+echo '{"searchUrl":' . jenc($search_url) .
+    ',"queryId":' . jenc($query_id) .
+    ',"rawFileUrl":' . jenc($url) .
+    ',"timestamp":' . jenc(gmdate('Y-m-d\TH:i:s\Z', $timestamp)) .
+    ',"limit":' . jenc($limit) .
+    ',"offset":' . jenc($offset) . ",\"data\":[\n";
 
 $fd_spec = [
     0 => ["pipe", "r"], // stdin
@@ -138,6 +261,7 @@ $process = proc_open(
         "tee >(wc -l 1>&3) | " . // copy stdout into wc and write result into fd 3
         "tail -n +$offset | " . // apply offset
         ($limit !== null ? " head -n $limit | " : "") . // optionally apply limit
+        "sed '" . implode(';', $replaceSed) . "' | " . // replace strings in json
         "sed '\$s/.$//'"], // remove last comma of last line
     $fd_spec,
     $pipes
@@ -153,4 +277,4 @@ $count = intval(trim(stream_get_contents($pipes[3])));
 fclose($pipes[3]);
 $return_value = proc_close($process);
 
-echo '],"totalCount":' . json_encode($count) . "}\n";
+echo '],"totalCount":' . jenc($count) . "}\n";
diff --git a/www/organic/pdf.php b/www/organic/pdf.php
new file mode 100644
index 0000000..177441b
--- /dev/null
+++ b/www/organic/pdf.php
@@ -0,0 +1,182 @@
+= $from; $i--) {
+        $el = $array[$i];
+        if (sizeof($postalCode) > 0) {
+            if (sizeof($address) === 0) $el = rtrim($el, ", \n\r\t\v\0");
+            if (strlen($el) === 0) continue;
+            array_unshift($address, $el);
+        } else if (preg_match("/^[A-Z0-9.\-]{3,},?$/", $el)) {
+            array_unshift($postalCode, trim($el, ", \n\r\t\v\0"));
+        } else {
+            array_unshift($city, $el);
+        }
+    }
+    return [implode(' ', $address), implode(' ', $postalCode), implode(' ', $city)];
+}
+
+function jenc($data): string {
+    return json_encode($data, JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE);
+}
+
+if ($format === 'text') {
+    header('Content-Type: text/plain; charset=UTF-8');
+    passthru("curl -s " . escapeshellarg($url) . " | pdftotext -raw - -"); // escapeshellarg() already quotes; extra quotes left the URL unquoted for the shell
+} else if ($format === 'json') {
+    header('Content-Type: application/json; charset=UTF-8');
+
+    $fd_spec = [
+        0 => ["pipe", "r"], // stdin
+        1 => ["pipe", "w"], // stdout
+        2 => ["pipe", "w"], // stderr
+    ];
+    $process = proc_open(
+        ['bash', '-c',
+            "curl -s " . escapeshellarg($url) . " | " .
+            "pdftotext -raw - -"],
+        $fd_spec,
+        $pipes
+    );
+    fclose($pipes[0]);
+    $text = stream_get_contents($pipes[1]);
+    fclose($pipes[1]);
+    $stderr = stream_get_contents($pipes[2]);
+    fclose($pipes[2]);
+    $return_value = proc_close($process);
+
+    if ($stderr !== '') {
+        header('Status: 500');
+        header('Content-Length: ' . strlen($stderr));
+        header('Content-Type: text/plain');
+        exit($stderr);
+    }
+
+    $r = preg_match('@([a-z]{2}) (https://webgate\.ec\.europa\.eu/tracesnt/directory/publication/organic-operator/(.*?)\.pdf) (\d+) / (\d+)@', $text, $matches);
+    if ($r === 1) {
+        // TRACES certificate
+
+        $data = [];
+        $parts = preg_split('@\n(I+\.\d+) ([^\n]*)@', $text, -1, PREG_SPLIT_DELIM_CAPTURE);
+        $status = str_replace("\n", '', $parts[0]);
+        for ($i = 3; $i < sizeof($parts); $i += 3) {
+            $data[$parts[$i - 2]] = trim($parts[$i]);
+        }
+
+        $lang = $matches[1];
+        $splitAddr = [
+            'de' => 'Adresse',
+            'en' => 'Address',
+        ][$lang];
+        $splitCountry = [
+            'de' => 'Land',
+            'en' => 'Country',
+        ][$lang];
+        $statusMap = [
+            'de' => [
+                'AUSGESTELLT' => 'issued',
+            ],
+            'en' => [
+                'ISSUED' => 'issued',
+            ]
+        ][$lang];
+        $activityMap = [
+            'de' => [
+                'Aufbereitung' => 'preparation',
+                'Ausfuhr' => 'export',
+                'Einfuhr' => 'import',
+                'Lagerung' => 'storing',
+                'Produktion' => 'production',
+                'Vertrieb' => 'distribution',
+                'Vertrieb/Inverkehrbringen' => 'distribution_placing_on_the_market',
+            ],
+            'en' => [
+                'Distribution' => 'distribution',
+                'Distribution/Placing on the market' => 'distribution_placing_on_the_market',
+                'Export' => 'export',
+                'Import' => 'import',
+                'Preparation' => 'preparation',
+                'Production' => 'production',
+                'Storing' => 'storing',
+            ],
+        ][$lang];
+
+        $certUrl = $matches[2];
+        $certId = $matches[3];
+        $authorityId = explode('.', $certId)[0];
+        $operatorId = explode('.', $certId)[1];
+        $operator = preg_split('@\s+@', trim($data['I.3']));
+        $p1 = array_search($splitAddr, $operator);
+        $p2 = array_search($splitCountry, $operator);
+        $operatorName = implode(' ', array_filter($operator, fn($k,$i) => $i > 0 && $i < $p1, ARRAY_FILTER_USE_BOTH));
+        [$opAddr, $opPostal, $opCity] = get_address($operator, $p1 + 1, $p2 - 1);
+
+        $authority = preg_split('@\s+@', trim($data['I.4']));
+        $until = array_search("($authorityId)", $authority);
+        $p1 = array_search($splitAddr, $authority);
+        $p2 = array_search($splitCountry, $authority);
+        $authorityName = implode(' ', array_filter($authority, fn($k,$i) => $i > 0 && $i < $p1 - 1 && ($i !== $p1 - 2 || !str_starts_with($k, '(')), ARRAY_FILTER_USE_BOTH));
+        [$aAddr, $aPostal, $aCity] = get_address($authority, $p1 + 1, $p2 - 1);
+
+        $activities = [];
+        foreach (explode("\n", $data['I.5']) as $a) {
+            $activities[] = $activityMap[trim($a, '• ')] ?? null; // unmapped label stays null without an undefined-key warning
+        }
+
+        preg_match_all('/\([a-g]\)/', $data['I.6'], $matches, PREG_SET_ORDER);
+        $products = [];
+        foreach ($matches as $m) {
+            $products[] = $m[0];
+        }
+
+        preg_match_all('@\d+/\d+/\d+@', $data['I.8'], $matches, PREG_SET_ORDER);
+        $valid1 = implode('-', array_reverse(explode('/', $matches[0][0])));
+        $valid2 = implode('-', array_reverse(explode('/', $matches[1][0])));
+
+        echo '{"type":"traces","lang":' . jenc($lang) . ',"id":' . jenc($certId) . ',"status":' . jenc($statusMap[$status] ?? null); // JSON-encode PDF-derived text; unknown status becomes null
+        echo ",\n \"operator\":{\"id\":" . jenc($operatorId).
+            ',"groupOfOperators":' . jenc(!str_starts_with($data['I.2'], '☑')) .
+            ',"name":' . jenc($operatorName) .
+            ',"address":' . jenc($opAddr) .
+            ',"postalCode":' . jenc($opPostal) .
+            ',"city":' . jenc($opCity) .
+            ',"countryCode":' . jenc($operator[sizeof($operator) - 1]) .
+            "},\n \"authority\":{\"id\":" . jenc($authorityId) .
+            ',"name":' . jenc($authorityName) .
+            ',"address":' . jenc($aAddr) .
+            ',"postalCode":' . jenc($aPostal) .
+            ',"city":' . jenc($aCity) .
+            ',"countryCode":' . jenc($authority[sizeof($authority) - 1]) .
+            "},\n \"activities\":" . jenc($activities) .
+            ",\n \"productCategories\":" . jenc($products) .
+            ",\n \"validFrom\":" . jenc($valid1) .
+            ',"validTo":' . jenc($valid2) .
+            ",\n \"url\":" . jenc($certUrl) . "\n}\n"; // encoded like every other field
+    } else {
+        echo "{\"type\":\"unknown\"}\n";
+    }
+} else {
+    header('Content-Type: application/pdf');
+    $s = curl_init($url);
+    curl_exec($s);
+}