winziprint.py: Extract convert_part() from convert()

2024-01-14 19:48:02 +01:00
parent 5b3b96fb35
commit 81848ac767
1 changed file with 33 additions and 24 deletions
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

-from typing import TextIO
+from typing import TextIO, Callable
 import os
 import sys
 import time
@@ -10,6 +10,7 @@ import gc
 import socketserver
 import io
 import signal
+import math

 import weasyprint
 import pypdf
@@ -21,6 +22,24 @@ SOCKET_ADDRESS = ('127.0.0.1', 30983)
 BATCH_SIZE = 10


+def convert_part(output_file: str, batch: list[str], step_cb: Callable, encoding: str = None) -> list[int]:  # render one batch of HTML files into a single PDF part
+    documents = []  # rendered documents, one per entry of batch, in batch order
+    for n, file_name in enumerate(batch):  # NOTE(review): n is never read in this function
+        html = weasyprint.HTML(filename=file_name, encoding=encoding)
+        doc = html.render()
+        documents.append(doc)
+        del html  # presumably drops the parsed source early to limit memory use; only the rendered doc is kept
+        step_cb()  # step_cb is called with no arguments, once per rendered file
+    all_pages = [p for doc in documents for p in doc.pages]  # pages of every document, flattened in batch order
+    documents[0].copy(all_pages).write_pdf(output_file)  # indexes documents[0], so an empty batch raises IndexError
+    tmp_page_nums = [len(doc.pages) for doc in documents]  # page count per input file, taken before documents is deleted
+    del documents
+    del all_pages
+    gc.collect()  # explicit collection after dropping the references above
+    step_cb()  # one additional call after the part file has been written
+    return tmp_page_nums  # one int per entry of batch, same order
+
+
 def convert(input_files: list[str],
            output_files: str,
            encoding: str = None,
@@ -28,34 +47,25 @@ def convert(input_files: list[str],
            progress: bool = False,
            out: TextIO = sys.stdout) -> list[int]:
    # it takes roughly 100ms to generate one document
-    tmp_page_nums = []
    tmp_file_names = []
    page_nums = []

    html_files = [file.lstrip('!') for file in input_files if not file.endswith('.pdf')]
-    steps = len(html_files) + len(html_files) // BATCH_SIZE + 1
+    total_steps = len(html_files) + math.ceil(len(html_files) / BATCH_SIZE) + 1
+    convert.steps = 0
+
+    def next_step() -> None:
+        convert.steps += 1
+        if progress:
+            print(f'progress: {convert.steps}/{total_steps}', file=out, flush=True)

    try:
+        tmp_page_nums = []
        for i in range(0, len(html_files), BATCH_SIZE):
+            tmp_file = f'{output_files}.{i:04}.part'
+            tmp_file_names.append(tmp_file)
            batch = html_files[i:i + BATCH_SIZE]
-            documents = []
-            for n, file_name in enumerate(batch):
-                html = weasyprint.HTML(filename=file_name, encoding=encoding)
-                doc = html.render()
-                documents.append(doc)
-                del html
-                if progress:
-                    print(f'progress: {i + n + i // BATCH_SIZE + 1}/{steps}', file=out, flush=True)
-            all_pages = [p for doc in documents for p in doc.pages]
-            tmp_file_name = f'{output_files}.{i:04}.part'
-            documents[0].copy(all_pages).write_pdf(tmp_file_name)
-            tmp_file_names.append(tmp_file_name)
-            tmp_page_nums += [len(doc.pages) for doc in documents]
-            del documents
-            del all_pages
-            gc.collect()
-            if progress and i + BATCH_SIZE < len(html_files):
-                print(f'progress: {i + BATCH_SIZE + i // BATCH_SIZE + 1}/{steps}', file=out, flush=True)
+            tmp_page_nums += convert_part(tmp_file, batch, next_step, encoding=encoding)

        merger = pypdf.PdfWriter()
        i = 0
@@ -80,8 +90,7 @@ def convert(input_files: list[str],
            if os.path.isfile(pdf):
                os.remove(pdf)

-    if progress:
-        print(f'progress: {steps}/{steps}', file=out, flush=True)
+    next_step()

    return page_nums

@@ -200,7 +209,7 @@ def main() -> None:
            usage(True)
        # a tcp server is used due to the lack of unix sockets on Windows
        with socketserver.ThreadingTCPServer(SOCKET_ADDRESS, ConnectionHandler) as server:
-            def exit_gracefully(signum: int, frame) -> None:
+            def exit_gracefully(_signum: int, _frame) -> None:
                raise KeyboardInterrupt()
            signal.signal(signal.SIGINT, exit_gracefully)
            signal.signal(signal.SIGTERM, exit_gracefully)