winziprint: allow pdf files to be merged too

2023-10-23 19:42:36 +02:00
parent 3157719549
commit 2db4ed1cc3
1 changed files with 24 additions and 21 deletions
@@ -17,36 +17,26 @@ VERSION = __version__ = '0.1.0'
 BATCH_SIZE = 10


-def _get_blank_page() -> weasyprint.Page:
-    html = weasyprint.HTML(string='')
-    doc = html.render()
-    blank_page = doc.pages[0]
-    del html
-    del doc
-    return blank_page
-
-
 def convert(input_files: list[str],
            output_files: str,
            encoding: str = None,
            padding: bool = False,
            progress: bool = False) -> list[int]:
    # it takes roughly 100ms to generate one document
-    page_nums = []
+    tmp_page_nums = []
    tmp_file_names = []
+    page_nums = []

-    steps = len(input_files) + len(input_files) // BATCH_SIZE + 1
-    blank_page = _get_blank_page() if padding else None
+    html_files = [file for file in input_files if not file.endswith('.pdf')]
+    steps = len(html_files) + len(html_files) // BATCH_SIZE + 1

    try:
-        for i in range(0, len(input_files), BATCH_SIZE):
-            batch = input_files[i:i + BATCH_SIZE]
+        for i in range(0, len(html_files), BATCH_SIZE):
+            batch = html_files[i:i + BATCH_SIZE]
            documents = []
            for n, file_name in enumerate(batch):
                html = weasyprint.HTML(filename=file_name, encoding=encoding)
                doc = html.render()
-                if padding and len(doc.pages) % 2 != 0:
-                    doc.pages.append(blank_page)
                documents.append(doc)
                del html
                if progress:
@@ -55,16 +45,28 @@ def convert(input_files: list[str],
            tmp_file_name = f'{output_files}.{i:04}.part'
            documents[0].copy(all_pages).write_pdf(tmp_file_name)
            tmp_file_names.append(tmp_file_name)
-            page_nums += [len(doc.pages) for doc in documents]
+            tmp_page_nums += [len(doc.pages) for doc in documents]
            del documents
            del all_pages
            gc.collect()
-            if progress:
+            if progress and i < BATCH_SIZE:
                print(f'progress: {i + BATCH_SIZE + i // BATCH_SIZE + 1}/{steps}', flush=True)

        merger = pypdf.PdfWriter()
-        for pdf in tmp_file_names:
-            merger.append(pdf)
+        i = 0
+        for n, file_name in enumerate(input_files):
+            p0 = len(merger.pages)
+            if file_name.endswith('.pdf'):
+                merger.append(file_name)
+            else:
+                batch_page_nums = tmp_page_nums[i // BATCH_SIZE * BATCH_SIZE:(i // BATCH_SIZE + 1) * BATCH_SIZE]
+                page_start = sum(batch_page_nums[:i % BATCH_SIZE])
+                merger.append(tmp_file_names[n // BATCH_SIZE], pages=(page_start, page_start + tmp_page_nums[i]))
+                i += 1
+            p1 = len(merger.pages)
+            page_nums.append(p1 - p0)
+            if padding and len(merger.pages) % 2 != 0:
+                merger.add_blank_page()
        merger.write(output_files)
        merger.close()
        del merger
@@ -91,10 +93,11 @@ def _wrapper_convert(args: list[str], encoding: str = None, padding: bool = Fals
            padding = True
        t0 = time.process_time()
        pages = convert(inputs, output, encoding=encoding, padding=padding, progress=progress)
+        total = sum(p + 1 if padding and p % 2 != 0 else p for p in pages)
        t1 = time.process_time()
        print(f'success: '
              f'{len(args) - 1} documents, '
-              f'{sum(pages)} pages ({", ".join(str(p) for p in pages)}), '
+              f'{total} pages ({", ".join(str(p) for p in pages)}), '
              f'{t1 - t0:.1f} sec',
              flush=True)
    except Exception as e: