diff --git a/src/winziprint.py b/src/winziprint.py
index 55ae86e..01f1084 100755
--- a/src/winziprint.py
+++ b/src/winziprint.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-from typing import TextIO
+from typing import TextIO, Callable
 import os
 import sys
 import time
@@ -10,6 +10,7 @@ import gc
 import socketserver
 import io
 import signal
+import math
 
 import weasyprint
 import pypdf
@@ -21,6 +22,33 @@ SOCKET_ADDRESS = ('127.0.0.1', 30983)
 BATCH_SIZE = 10
 
 
+def convert_part(output_file: str,
+                 batch: list[str],
+                 step_cb: Callable[[], None],
+                 encoding: str = None) -> list[int]:
+    """Render a batch of HTML files and write them as one PDF part file.
+
+    step_cb is called once after each file has been rendered and once more
+    after the part file has been written. batch must not be empty.
+    Returns the number of pages of each rendered document, in batch order.
+    """
+    documents = []
+    for file_name in batch:
+        html = weasyprint.HTML(filename=file_name, encoding=encoding)
+        doc = html.render()
+        documents.append(doc)
+        del html
+        step_cb()
+    all_pages = [p for doc in documents for p in doc.pages]
+    documents[0].copy(all_pages).write_pdf(output_file)
+    tmp_page_nums = [len(doc.pages) for doc in documents]
+    del documents
+    del all_pages
+    gc.collect()
+    step_cb()
+    return tmp_page_nums
+
+
 def convert(input_files: list[str],
             output_files: str,
             encoding: str = None,
@@ -28,34 +56,28 @@ def convert(input_files: list[str],
             progress: bool = False,
             out: TextIO = sys.stdout) -> list[int]:
     # it takes roughly 100ms to generate one document
-    tmp_page_nums = []
     tmp_file_names = []
     page_nums = []
 
     html_files = [file.lstrip('!') for file in input_files if not file.endswith('.pdf')]
-    steps = len(html_files) + len(html_files) // BATCH_SIZE + 1
+    total_steps = len(html_files) + math.ceil(len(html_files) / BATCH_SIZE) + 1
+    # counted in the closure rather than on the function object, so that
+    # overlapping calls to convert() each keep their own counter
+    steps_done = 0
+
+    def next_step() -> None:
+        nonlocal steps_done
+        steps_done += 1
+        if progress:
+            print(f'progress: {steps_done}/{total_steps}', file=out, flush=True)
 
     try:
+        tmp_page_nums = []
         for i in range(0, len(html_files), BATCH_SIZE):
+            tmp_file = f'{output_files}.{i:04}.part'
+            tmp_file_names.append(tmp_file)
             batch = html_files[i:i + BATCH_SIZE]
-            documents = []
-            for n, file_name in enumerate(batch):
-                html = weasyprint.HTML(filename=file_name, encoding=encoding)
-                doc = html.render()
-                documents.append(doc)
-                del html
-                if progress:
-                    print(f'progress: {i + n + i // BATCH_SIZE + 1}/{steps}', file=out, flush=True)
-            all_pages = [p for doc in documents for p in doc.pages]
-            tmp_file_name = f'{output_files}.{i:04}.part'
-            documents[0].copy(all_pages).write_pdf(tmp_file_name)
-            tmp_file_names.append(tmp_file_name)
-            tmp_page_nums += [len(doc.pages) for doc in documents]
-            del documents
-            del all_pages
-            gc.collect()
-            if progress and i + BATCH_SIZE < len(html_files):
-                print(f'progress: {i + BATCH_SIZE + i // BATCH_SIZE + 1}/{steps}', file=out, flush=True)
+            tmp_page_nums += convert_part(tmp_file, batch, next_step, encoding=encoding)
 
         merger = pypdf.PdfWriter()
         i = 0
@@ -80,8 +102,7 @@ def convert(input_files: list[str],
             if os.path.isfile(pdf):
                 os.remove(pdf)
 
-    if progress:
-        print(f'progress: {steps}/{steps}', file=out, flush=True)
+    next_step()
 
     return page_nums
 
@@ -200,7 +221,7 @@ def main() -> None:
         usage(True)
     # a tcp server is used due to the lack of unix sockets on Windows
     with socketserver.ThreadingTCPServer(SOCKET_ADDRESS, ConnectionHandler) as server:
-        def exit_gracefully(signum: int, frame) -> None:
+        def exit_gracefully(_signum: int, _frame) -> None:
             raise KeyboardInterrupt()
         signal.signal(signal.SIGINT, exit_gracefully)
         signal.signal(signal.SIGTERM, exit_gracefully)