winziprint.py: Extract convert_part() from convert()

This commit is contained in:
2024-01-14 19:48:02 +01:00
parent 5b3b96fb35
commit 81848ac767

View File

@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
from typing import TextIO from typing import TextIO, Callable
import os import os
import sys import sys
import time import time
@ -10,6 +10,7 @@ import gc
import socketserver import socketserver
import io import io
import signal import signal
import math
import weasyprint import weasyprint
import pypdf import pypdf
@ -21,6 +22,24 @@ SOCKET_ADDRESS = ('127.0.0.1', 30983)
BATCH_SIZE = 10 BATCH_SIZE = 10
def convert_part(output_file: str, batch: list[str], step_cb: Callable, encoding: str = None) -> list[int]:
    """Render a batch of HTML files and write them as one combined PDF part.

    Every file in ``batch`` is rendered with WeasyPrint; the pages of all
    rendered documents are then copied into the first document and written
    to ``output_file`` as a single PDF.

    :param output_file: path of the PDF part file to write
    :param batch: paths of the HTML files to render, in output order
    :param step_cb: progress callback taking no arguments; called once after
        each file has been rendered and once more after the part file has
        been written and the rendered documents have been released
    :param encoding: passed through to ``weasyprint.HTML`` for every file
    :return: number of pages of each rendered document, in ``batch`` order;
        an empty list (and no output file, no callback calls) if ``batch``
        is empty
    """
    if not batch:
        # Without at least one document there is nothing to copy the pages
        # into (documents[0] below would raise an IndexError).
        return []

    # The render results are appended directly instead of being bound to
    # loop-local names: a leftover local would keep the last document
    # referenced and defeat the explicit clean-up below.
    documents = []
    for file_name in batch:
        documents.append(weasyprint.HTML(filename=file_name, encoding=encoding).render())
        step_cb()

    all_pages = [page for document in documents for page in document.pages]
    documents[0].copy(all_pages).write_pdf(output_file)
    page_counts = [len(document.pages) for document in documents]

    # Drop the rendered documents before reporting the final step, so their
    # memory can be reclaimed before the caller starts on the next batch.
    del documents
    del all_pages
    gc.collect()

    step_cb()
    return page_counts
def convert(input_files: list[str], def convert(input_files: list[str],
output_files: str, output_files: str,
encoding: str = None, encoding: str = None,
@ -28,34 +47,25 @@ def convert(input_files: list[str],
progress: bool = False, progress: bool = False,
out: TextIO = sys.stdout) -> list[int]: out: TextIO = sys.stdout) -> list[int]:
# it takes roughly 100ms to generate one document # it takes roughly 100ms to generate one document
tmp_page_nums = []
tmp_file_names = [] tmp_file_names = []
page_nums = [] page_nums = []
html_files = [file.lstrip('!') for file in input_files if not file.endswith('.pdf')] html_files = [file.lstrip('!') for file in input_files if not file.endswith('.pdf')]
steps = len(html_files) + len(html_files) // BATCH_SIZE + 1 total_steps = len(html_files) + math.ceil(len(html_files) / BATCH_SIZE) + 1
convert.steps = 0
def next_step() -> None:
convert.steps += 1
if progress:
print(f'progress: {convert.steps}/{total_steps}', file=out, flush=True)
try: try:
tmp_page_nums = []
for i in range(0, len(html_files), BATCH_SIZE): for i in range(0, len(html_files), BATCH_SIZE):
tmp_file = f'{output_files}.{i:04}.part'
tmp_file_names.append(tmp_file)
batch = html_files[i:i + BATCH_SIZE] batch = html_files[i:i + BATCH_SIZE]
documents = [] tmp_page_nums += convert_part(tmp_file, batch, next_step, encoding=encoding)
for n, file_name in enumerate(batch):
html = weasyprint.HTML(filename=file_name, encoding=encoding)
doc = html.render()
documents.append(doc)
del html
if progress:
print(f'progress: {i + n + i // BATCH_SIZE + 1}/{steps}', file=out, flush=True)
all_pages = [p for doc in documents for p in doc.pages]
tmp_file_name = f'{output_files}.{i:04}.part'
documents[0].copy(all_pages).write_pdf(tmp_file_name)
tmp_file_names.append(tmp_file_name)
tmp_page_nums += [len(doc.pages) for doc in documents]
del documents
del all_pages
gc.collect()
if progress and i + BATCH_SIZE < len(html_files):
print(f'progress: {i + BATCH_SIZE + i // BATCH_SIZE + 1}/{steps}', file=out, flush=True)
merger = pypdf.PdfWriter() merger = pypdf.PdfWriter()
i = 0 i = 0
@ -80,8 +90,7 @@ def convert(input_files: list[str],
if os.path.isfile(pdf): if os.path.isfile(pdf):
os.remove(pdf) os.remove(pdf)
if progress: next_step()
print(f'progress: {steps}/{steps}', file=out, flush=True)
return page_nums return page_nums
@ -200,7 +209,7 @@ def main() -> None:
usage(True) usage(True)
# a tcp server is used due to the lack of unix sockets on Windows # a tcp server is used due to the lack of unix sockets on Windows
with socketserver.ThreadingTCPServer(SOCKET_ADDRESS, ConnectionHandler) as server: with socketserver.ThreadingTCPServer(SOCKET_ADDRESS, ConnectionHandler) as server:
def exit_gracefully(signum: int, frame) -> None: def exit_gracefully(_signum: int, _frame) -> None:
raise KeyboardInterrupt() raise KeyboardInterrupt()
signal.signal(signal.SIGINT, exit_gracefully) signal.signal(signal.SIGINT, exit_gracefully)
signal.signal(signal.SIGTERM, exit_gracefully) signal.signal(signal.SIGTERM, exit_gracefully)