winziprint.py: Extract convert_part() from convert()
This commit is contained in:
		@@ -1,7 +1,7 @@
 | 
				
			|||||||
#!/usr/bin/env python3
 | 
					#!/usr/bin/env python3
 | 
				
			||||||
# -*- coding: utf-8 -*-
 | 
					# -*- coding: utf-8 -*-
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from typing import TextIO
 | 
					from typing import TextIO, Callable
 | 
				
			||||||
import os
 | 
					import os
 | 
				
			||||||
import sys
 | 
					import sys
 | 
				
			||||||
import time
 | 
					import time
 | 
				
			||||||
@@ -10,6 +10,7 @@ import gc
 | 
				
			|||||||
import socketserver
 | 
					import socketserver
 | 
				
			||||||
import io
 | 
					import io
 | 
				
			||||||
import signal
 | 
					import signal
 | 
				
			||||||
 | 
					import math
 | 
				
			||||||
 | 
					
 | 
				
			||||||
import weasyprint
 | 
					import weasyprint
 | 
				
			||||||
import pypdf
 | 
					import pypdf
 | 
				
			||||||
@@ -21,6 +22,24 @@ SOCKET_ADDRESS = ('127.0.0.1', 30983)
 | 
				
			|||||||
BATCH_SIZE = 10
 | 
					BATCH_SIZE = 10
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def convert_part(output_file: str, batch: list[str], step_cb: Callable, encoding: str = None) -> list[int]:
					    """Render one batch of HTML files into a single PDF part.

					    Every file named in ``batch`` is loaded and rendered with WeasyPrint,
					    and the pages of all rendered documents are written together to
					    ``output_file``.  ``step_cb`` is invoked once after each file has
					    been rendered and once more after the part has been written.

					    :param output_file: path the combined PDF part is written to
					    :param batch: HTML file names to render, in output order
					    :param step_cb: zero-argument callback invoked after each step
					    :param encoding: passed through to ``weasyprint.HTML``
					    :return: page count of each rendered document, in batch order
					    """
					    rendered = []
					    for source in batch:
					        rendered.append(weasyprint.HTML(filename=source, encoding=encoding).render())
					        step_cb()

					    # Collect the pages of every document and write them through a
					    # copy of the first rendered document.
					    pages = [page for document in rendered for page in document.pages]
					    rendered[0].copy(pages).write_pdf(output_file)
					    page_counts = [len(document.pages) for document in rendered]

					    # Drop the references to the rendered documents before forcing a
					    # garbage collection pass.
					    del rendered
					    del pages
					    gc.collect()

					    step_cb()
					    return page_counts
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def convert(input_files: list[str],
 | 
					def convert(input_files: list[str],
 | 
				
			||||||
            output_files: str,
 | 
					            output_files: str,
 | 
				
			||||||
            encoding: str = None,
 | 
					            encoding: str = None,
 | 
				
			||||||
@@ -28,34 +47,25 @@ def convert(input_files: list[str],
 | 
				
			|||||||
            progress: bool = False,
 | 
					            progress: bool = False,
 | 
				
			||||||
            out: TextIO = sys.stdout) -> list[int]:
 | 
					            out: TextIO = sys.stdout) -> list[int]:
 | 
				
			||||||
    # it takes roughly 100ms to generate one document
 | 
					    # it takes roughly 100ms to generate one document
 | 
				
			||||||
    tmp_page_nums = []
 | 
					 | 
				
			||||||
    tmp_file_names = []
 | 
					    tmp_file_names = []
 | 
				
			||||||
    page_nums = []
 | 
					    page_nums = []
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    html_files = [file.lstrip('!') for file in input_files if not file.endswith('.pdf')]
 | 
					    html_files = [file.lstrip('!') for file in input_files if not file.endswith('.pdf')]
 | 
				
			||||||
    steps = len(html_files) + len(html_files) // BATCH_SIZE + 1
 | 
					    total_steps = len(html_files) + math.ceil(len(html_files) / BATCH_SIZE) + 1
 | 
				
			||||||
 | 
					    convert.steps = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def next_step() -> None:
 | 
				
			||||||
 | 
					        convert.steps += 1
 | 
				
			||||||
 | 
					        if progress:
 | 
				
			||||||
 | 
					            print(f'progress: {convert.steps}/{total_steps}', file=out, flush=True)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    try:
 | 
					    try:
 | 
				
			||||||
 | 
					        tmp_page_nums = []
 | 
				
			||||||
        for i in range(0, len(html_files), BATCH_SIZE):
 | 
					        for i in range(0, len(html_files), BATCH_SIZE):
 | 
				
			||||||
 | 
					            tmp_file = f'{output_files}.{i:04}.part'
 | 
				
			||||||
 | 
					            tmp_file_names.append(tmp_file)
 | 
				
			||||||
            batch = html_files[i:i + BATCH_SIZE]
 | 
					            batch = html_files[i:i + BATCH_SIZE]
 | 
				
			||||||
            documents = []
 | 
					            tmp_page_nums += convert_part(tmp_file, batch, next_step, encoding=encoding)
 | 
				
			||||||
            for n, file_name in enumerate(batch):
 | 
					 | 
				
			||||||
                html = weasyprint.HTML(filename=file_name, encoding=encoding)
 | 
					 | 
				
			||||||
                doc = html.render()
 | 
					 | 
				
			||||||
                documents.append(doc)
 | 
					 | 
				
			||||||
                del html
 | 
					 | 
				
			||||||
                if progress:
 | 
					 | 
				
			||||||
                    print(f'progress: {i + n + i // BATCH_SIZE + 1}/{steps}', file=out, flush=True)
 | 
					 | 
				
			||||||
            all_pages = [p for doc in documents for p in doc.pages]
 | 
					 | 
				
			||||||
            tmp_file_name = f'{output_files}.{i:04}.part'
 | 
					 | 
				
			||||||
            documents[0].copy(all_pages).write_pdf(tmp_file_name)
 | 
					 | 
				
			||||||
            tmp_file_names.append(tmp_file_name)
 | 
					 | 
				
			||||||
            tmp_page_nums += [len(doc.pages) for doc in documents]
 | 
					 | 
				
			||||||
            del documents
 | 
					 | 
				
			||||||
            del all_pages
 | 
					 | 
				
			||||||
            gc.collect()
 | 
					 | 
				
			||||||
            if progress and i + BATCH_SIZE < len(html_files):
 | 
					 | 
				
			||||||
                print(f'progress: {i + BATCH_SIZE + i // BATCH_SIZE + 1}/{steps}', file=out, flush=True)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
        merger = pypdf.PdfWriter()
 | 
					        merger = pypdf.PdfWriter()
 | 
				
			||||||
        i = 0
 | 
					        i = 0
 | 
				
			||||||
@@ -80,8 +90,7 @@ def convert(input_files: list[str],
 | 
				
			|||||||
            if os.path.isfile(pdf):
 | 
					            if os.path.isfile(pdf):
 | 
				
			||||||
                os.remove(pdf)
 | 
					                os.remove(pdf)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if progress:
 | 
					    next_step()
 | 
				
			||||||
        print(f'progress: {steps}/{steps}', file=out, flush=True)
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return page_nums
 | 
					    return page_nums
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -200,7 +209,7 @@ def main() -> None:
 | 
				
			|||||||
            usage(True)
 | 
					            usage(True)
 | 
				
			||||||
        # a tcp server is used due to the lack of unix sockets on Windows
 | 
					        # a tcp server is used due to the lack of unix sockets on Windows
 | 
				
			||||||
        with socketserver.ThreadingTCPServer(SOCKET_ADDRESS, ConnectionHandler) as server:
 | 
					        with socketserver.ThreadingTCPServer(SOCKET_ADDRESS, ConnectionHandler) as server:
 | 
				
			||||||
            def exit_gracefully(signum: int, frame) -> None:
 | 
					            def exit_gracefully(_signum: int, _frame) -> None:
 | 
				
			||||||
                raise KeyboardInterrupt()
 | 
					                raise KeyboardInterrupt()
 | 
				
			||||||
            signal.signal(signal.SIGINT, exit_gracefully)
 | 
					            signal.signal(signal.SIGINT, exit_gracefully)
 | 
				
			||||||
            signal.signal(signal.SIGTERM, exit_gracefully)
 | 
					            signal.signal(signal.SIGTERM, exit_gracefully)
 | 
				
			||||||
 
 | 
				
			|||||||
		Reference in New Issue
	
	Block a user