#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""winziprint: convert HTML files into a single merged PDF.

Input files are rendered with WeasyPrint in batches of ``BATCH_SIZE``.  Each
batch is written to a temporary ``<output>.part.<n>`` PDF and the parts are
then merged into the final output file with pypdf.

The script can be used as a one-shot command line tool, or -- with ``-`` as
its only positional argument -- as a filter that reads one
semicolon-separated job (``INPUT[;INPUT...];OUTPUT``) per line from stdin and
prints exactly one result line per job to stdout.
"""

import gc
import os
import sys
import time
import traceback
from typing import Optional

import weasyprint
import pypdf


# Number of input documents rendered and held in memory at the same time.
BATCH_SIZE = 10


def convert(input_file_names: list[str], output_file_name: str,
            encoding: Optional[str] = None) -> list[int]:
    """Render HTML files and merge them into one PDF.

    Args:
        input_file_names: HTML files to render, in output order.
        output_file_name: Path of the merged PDF to write.
        encoding: Encoding passed to ``weasyprint.HTML`` for every input file.

    Returns:
        The number of pages of each input document, in input order.

    Any exception raised by WeasyPrint, pypdf or the file system propagates to
    the caller; the temporary part files are removed in every case.
    """
    # it takes roughly 100ms to generate one document
    page_nums: list[int] = []
    tmp_file_names: list[str] = []

    try:
        for start in range(0, len(input_file_names), BATCH_SIZE):
            batch = input_file_names[start:start + BATCH_SIZE]
            documents = [
                weasyprint.HTML(file_name, encoding=encoding).render()
                for file_name in batch
            ]
            all_pages = [page for doc in documents for page in doc.pages]

            # Zero-padded batch offset; registered *before* writing so that a
            # partially written part file is cleaned up as well.
            tmp_file_name = f'{output_file_name}.part.{start:04}'
            tmp_file_names.append(tmp_file_name)
            documents[0].copy(all_pages).write_pdf(tmp_file_name)

            page_nums += [len(doc.pages) for doc in documents]

            # Drop the rendered batch before starting the next one to keep
            # peak memory usage bounded by a single batch.
            del documents
            del all_pages
            gc.collect()

        merger = pypdf.PdfWriter()
        try:
            for tmp_file_name in tmp_file_names:
                merger.append(tmp_file_name)
            merger.write(output_file_name)
        finally:
            merger.close()
    finally:
        # Remove the part files on success and on failure alike.
        for tmp_file_name in tmp_file_names:
            try:
                os.remove(tmp_file_name)
            except FileNotFoundError:
                # The part was registered but never created.
                pass

    return page_nums


def _wrapper_convert(args: list[str], encoding: Optional[str] = None) -> None:
    """Run one conversion job and report the outcome on stdout.

    Args:
        args: Input file names followed by the output file name (last item).
        encoding: Encoding of the input files, forwarded to ``convert``.

    Prints a single summary line on success.  On failure prints a single
    ``error: ...`` line to stdout and the traceback to stderr; exceptions are
    not propagated, so a stdin-driven loop keeps running after a failed job.
    """
    try:
        # Same requirement that main() enforces for command line jobs: at
        # least one input plus the output, and no empty names.
        if len(args) < 2 or not all(args):
            raise ValueError('expected at least one input and one output file name')

        # process_time() counts CPU time of this process, not wall-clock time.
        t0 = time.process_time()
        pages = convert(args[:-1], args[-1], encoding=encoding)
        t1 = time.process_time()
        print(f'{len(args) - 1} documents, '
              f'{sum(pages)} pages ({", ".join(str(p) for p in pages)}), '
              f'{t1 - t0:.1f} sec',
              flush=True)
    except Exception as e:
        msg = str(e).replace('\n', ' ')
        print(f'error: {msg}', flush=True)
        # print_exc() works on every Python 3 version; the single-argument
        # form of print_exception() requires Python 3.10+.
        traceback.print_exc()


def usage() -> None:
    """Print the command line help to stderr and exit with status 1."""
    print(f'usage: {sys.argv[0]} [-h] [-d DIR] [-e ENCODING] [ - | INPUT [INPUT...] OUTPUT ]\n\n'
          'options:\n'
          '  -h, --help       show this help message and exit\n'
          '  -d, --directory  set the working directory\n'
          '  -e, --encoding   encoding of the input files\n'
          '\n'
          '  -                use stdin for retrieving input and output file names (semicolon-separated)\n'
          '  INPUT            name of an html input file\n'
          '  OUTPUT           name of a pdf output file', file=sys.stderr)
    sys.exit(1)


def _get_arg(args: list[str], n1: str, n2: Optional[str] = None) -> Optional[str]:
    """Extract the value of an option from ``args``, removing it in place.

    Args:
        args: Argument list; every occurrence of the option and its value is
            removed from this list.
        n1: Option name, e.g. ``-d``.
        n2: Optional alternative name, e.g. ``--directory``.

    Returns:
        The value following the option, or ``None`` if the option is absent.
        If the option occurs more than once, the last value processed wins
        (all ``n1`` occurrences are handled before ``n2``).

    Calls ``usage()`` (which exits) when the option is the last argument and
    therefore has no value.
    """
    value = None
    names = [n1, n2] if n2 else [n1]
    for name in names:
        # Loop so that repeated options do not leave stray tokens behind that
        # would later be mistaken for file names.
        while name in args:
            i = args.index(name)
            if i + 1 >= len(args):
                usage()
            value = args[i + 1]
            del args[i:i + 2]
    return value


def main() -> None:
    """Parse ``sys.argv`` and run either a single job or the stdin loop."""
    args = sys.argv[1:]
    if not args or '-h' in args or '--help' in args:
        usage()

    working_dir = _get_arg(args, '-d', '--directory')
    if working_dir:
        os.chdir(working_dir)
    encoding = _get_arg(args, '-e', '--encoding')

    if args == ['-']:
        # Filter mode: one semicolon-separated job per stdin line.
        for line in sys.stdin:
            _wrapper_convert(line.strip().split(';'), encoding=encoding)
    elif len(args) < 2:
        usage()
    else:
        _wrapper_convert(args, encoding=encoding)


if __name__ == '__main__':
    main()
    sys.exit(0)