#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Render HTML files with WeasyPrint and merge them into one PDF with pypdf.

Run with ``INPUT [INPUT...] OUTPUT`` on the command line, or with ``-`` to
read one semicolon-separated job (inputs followed by the output name) per
line from stdin.
"""

import os
import sys
import time
import traceback
import gc

import weasyprint
import pypdf

# Number of input documents rendered and written together as one partial PDF.
BATCH_SIZE = 10


def convert(input_file_names: list[str], output_file_name: str,
            encoding: str | None = None) -> list[int]:
    """Render the given HTML files and merge them into a single PDF.

    The inputs are processed in batches of ``BATCH_SIZE``. Each batch is
    written to a temporary ``<output_file_name>.part.NNNN`` file, and the
    partial files are merged into ``output_file_name`` at the end. Rendered
    documents are released after every batch (presumably to bound peak
    memory use on large jobs).

    Args:
        input_file_names: Names of the HTML files, in output order.
        output_file_name: Name of the PDF file to write.
        encoding: Encoding passed to ``weasyprint.HTML`` for every input.

    Returns:
        The page count of each input document, in input order.

    Raises:
        Exception: Whatever WeasyPrint, pypdf or the file system raises.
            Temporary partial files are removed in that case as well.
    """
    # it takes roughly 100ms to generate one document
    page_nums: list[int] = []
    tmp_file_names: list[str] = []
    try:
        for i in range(0, len(input_file_names), BATCH_SIZE):
            batch = input_file_names[i:i + BATCH_SIZE]
            documents = [
                weasyprint.HTML(file_name, encoding=encoding).render()
                for file_name in batch
            ]
            all_pages = [p for doc in documents for p in doc.pages]

            tmp_file_name = f'{output_file_name}.part.{i:04d}'
            # Register the name before writing so that a partially written
            # file is cleaned up too if write_pdf() fails midway.
            tmp_file_names.append(tmp_file_name)
            documents[0].copy(all_pages).write_pdf(tmp_file_name)

            page_nums += [len(doc.pages) for doc in documents]

            # Drop the rendered batch before starting the next one.
            del documents
            del all_pages
            gc.collect()

        merger = pypdf.PdfWriter()
        try:
            for pdf in tmp_file_names:
                merger.append(pdf)
            merger.write(output_file_name)
        finally:
            merger.close()
    finally:
        # Remove the partial files on success and on failure alike.
        for pdf in tmp_file_names:
            try:
                os.remove(pdf)
            except FileNotFoundError:
                # The file was never created (write_pdf failed early).
                pass
    return page_nums


def _wrapper_convert(args: list[str], encoding: str | None = None) -> None:
    """Run one conversion job and report the result on stdout.

    Args:
        args: Input file names followed by the output file name (last item).
        encoding: Encoding of the input files, forwarded to ``convert``.

    On success a one-line summary (document count, page counts and CPU time
    as measured by ``time.process_time``) is printed. On any exception a
    single ``error: ...`` line is printed to stdout and the traceback is
    written to stderr; the exception is not re-raised, so further jobs can
    still be processed.
    """
    try:
        t0 = time.process_time()
        pages = convert(args[:-1], args[-1], encoding=encoding)
        t1 = time.process_time()
        print(f'{len(args) - 1} documents, '
              f'{sum(pages)} pages ({", ".join(str(p) for p in pages)}), '
              f'{t1 - t0:.1f} sec', flush=True)
    except Exception as e:
        # Keep the error report on a single line of stdout.
        msg = str(e).replace('\n', ' ')
        print(f'error: {msg}', flush=True)
        traceback.print_exception(e)


def usage() -> None:
    """Print the command-line help to stderr and exit with status 1."""
    print(f'usage: {sys.argv[0]} [-h] [-d DIR] [-e ENCODING] '
          '[ - | INPUT [INPUT...] OUTPUT ]\n\n'
          'options:\n'
          ' -h, --help show this help message and exit\n'
          ' -d, --directory set the working directory\n'
          ' -e, --encoding encoding of the input files\n'
          '\n'
          ' - use stdin for retrieving input and output file names '
          '(semi-colon-seperated)\n'
          ' INPUT name of an html input file\n'
          ' OUTPUT name of an pdf output file', file=sys.stderr)
    sys.exit(1)


def _get_arg(args: list[str], n1: str, n2: str | None = None) -> str | None:
    """Extract the value of an option from ``args``, removing it in place.

    Args:
        args: Argument list; the option and its value are deleted from it.
        n1: Option name to look for (e.g. ``-d``).
        n2: Optional alternative name (e.g. ``--directory``).

    Returns:
        The value following the option, or ``None`` if neither name is
        present. If both names are present, the value given for ``n2`` is
        returned.

    If an option is the last argument and therefore has no value,
    ``usage()`` is called, which exits the process.
    """
    v = None
    names = [n1, n2] if n2 else [n1]
    for n in names:
        if n in args:
            i = args.index(n)
            if i + 1 >= len(args):
                usage()
            v = args[i + 1]
            # Drop the option name and its value.
            del args[i:i + 2]
    return v


def main() -> None:
    """Parse the command line and run the requested conversion job(s)."""
    args = sys.argv[1:]
    if not args or '-h' in args or '--help' in args:
        usage()

    working_dir = _get_arg(args, '-d', '--directory')
    if working_dir:
        os.chdir(working_dir)
    encoding = _get_arg(args, '-e', '--encoding')

    if args == ['-']:
        # One job per stdin line: "INPUT;INPUT;...;OUTPUT".
        for line in sys.stdin:
            _wrapper_convert(line.strip().split(';'), encoding=encoding)
    elif len(args) < 2:
        usage()
    else:
        _wrapper_convert(args, encoding=encoding)


if __name__ == '__main__':
    main()
    sys.exit(0)