diff --git a/winziprint/winziprint.py b/winziprint/winziprint.py index 9bd346a..03edcea 100755 --- a/winziprint/winziprint.py +++ b/winziprint/winziprint.py @@ -17,36 +17,26 @@ VERSION = __version__ = '0.1.0' BATCH_SIZE = 10 -def _get_blank_page() -> weasyprint.Page: - html = weasyprint.HTML(string='') - doc = html.render() - blank_page = doc.pages[0] - del html - del doc - return blank_page - - def convert(input_files: list[str], output_files: str, encoding: str = None, padding: bool = False, progress: bool = False) -> list[int]: # it takes roughly 100ms to generate one document - page_nums = [] + tmp_page_nums = [] tmp_file_names = [] + page_nums = [] - steps = len(input_files) + len(input_files) // BATCH_SIZE + 1 - blank_page = _get_blank_page() if padding else None + html_files = [file for file in input_files if not file.endswith('.pdf')] + steps = len(html_files) + len(html_files) // BATCH_SIZE + 1 try: - for i in range(0, len(input_files), BATCH_SIZE): - batch = input_files[i:i + BATCH_SIZE] + for i in range(0, len(html_files), BATCH_SIZE): + batch = html_files[i:i + BATCH_SIZE] documents = [] for n, file_name in enumerate(batch): html = weasyprint.HTML(filename=file_name, encoding=encoding) doc = html.render() - if padding and len(doc.pages) % 2 != 0: - doc.pages.append(blank_page) documents.append(doc) del html if progress: @@ -55,16 +45,28 @@ def convert(input_files: list[str], tmp_file_name = f'{output_files}.{i:04}.part' documents[0].copy(all_pages).write_pdf(tmp_file_name) tmp_file_names.append(tmp_file_name) - page_nums += [len(doc.pages) for doc in documents] + tmp_page_nums += [len(doc.pages) for doc in documents] del documents del all_pages gc.collect() - if progress: + if progress and i < BATCH_SIZE: print(f'progress: {i + BATCH_SIZE + i // BATCH_SIZE + 1}/{steps}', flush=True) merger = pypdf.PdfWriter() - for pdf in tmp_file_names: - merger.append(pdf) + i = 0 + for n, file_name in enumerate(input_files): + p0 = len(merger.pages) + if file_name.endswith('.pdf'): + merger.append(file_name) + else: + batch_page_nums = tmp_page_nums[i // BATCH_SIZE * BATCH_SIZE:(i // BATCH_SIZE + 1) * BATCH_SIZE] + page_start = sum(batch_page_nums[:i % BATCH_SIZE]) + merger.append(tmp_file_names[n // BATCH_SIZE], pages=(page_start, page_start + tmp_page_nums[i])) + i += 1 + p1 = len(merger.pages) + page_nums.append(p1 - p0) + if padding and len(merger.pages) % 2 != 0: + merger.add_blank_page() merger.write(output_files) merger.close() del merger @@ -91,10 +93,11 @@ def _wrapper_convert(args: list[str], encoding: str = None, padding: bool = Fals padding = True t0 = time.process_time() pages = convert(inputs, output, encoding=encoding, padding=padding, progress=progress) + total = sum(p + 1 if padding and p % 2 != 0 else p for p in pages) t1 = time.process_time() print(f'success: ' f'{len(args) - 1} documents, ' - f'{sum(pages)} pages ({", ".join(str(p) for p in pages)}), ' + f'{total} pages ({", ".join(str(p) for p in pages)}), ' f'{t1 - t0:.1f} sec', flush=True) except Exception as e: