winziprint: allow PDF files to be merged too
This commit is contained in:
@ -17,36 +17,26 @@ VERSION = __version__ = '0.1.0'
|
|||||||
BATCH_SIZE = 10
|
BATCH_SIZE = 10
|
||||||
|
|
||||||
|
|
||||||
def _get_blank_page() -> weasyprint.Page:
|
|
||||||
html = weasyprint.HTML(string='')
|
|
||||||
doc = html.render()
|
|
||||||
blank_page = doc.pages[0]
|
|
||||||
del html
|
|
||||||
del doc
|
|
||||||
return blank_page
|
|
||||||
|
|
||||||
|
|
||||||
def convert(input_files: list[str],
|
def convert(input_files: list[str],
|
||||||
output_files: str,
|
output_files: str,
|
||||||
encoding: str = None,
|
encoding: str = None,
|
||||||
padding: bool = False,
|
padding: bool = False,
|
||||||
progress: bool = False) -> list[int]:
|
progress: bool = False) -> list[int]:
|
||||||
# it takes roughly 100ms to generate one document
|
# it takes roughly 100ms to generate one document
|
||||||
page_nums = []
|
tmp_page_nums = []
|
||||||
tmp_file_names = []
|
tmp_file_names = []
|
||||||
|
page_nums = []
|
||||||
|
|
||||||
steps = len(input_files) + len(input_files) // BATCH_SIZE + 1
|
html_files = [file for file in input_files if not file.endswith('.pdf')]
|
||||||
blank_page = _get_blank_page() if padding else None
|
steps = len(html_files) + len(html_files) // BATCH_SIZE + 1
|
||||||
|
|
||||||
try:
|
try:
|
||||||
for i in range(0, len(input_files), BATCH_SIZE):
|
for i in range(0, len(html_files), BATCH_SIZE):
|
||||||
batch = input_files[i:i + BATCH_SIZE]
|
batch = html_files[i:i + BATCH_SIZE]
|
||||||
documents = []
|
documents = []
|
||||||
for n, file_name in enumerate(batch):
|
for n, file_name in enumerate(batch):
|
||||||
html = weasyprint.HTML(filename=file_name, encoding=encoding)
|
html = weasyprint.HTML(filename=file_name, encoding=encoding)
|
||||||
doc = html.render()
|
doc = html.render()
|
||||||
if padding and len(doc.pages) % 2 != 0:
|
|
||||||
doc.pages.append(blank_page)
|
|
||||||
documents.append(doc)
|
documents.append(doc)
|
||||||
del html
|
del html
|
||||||
if progress:
|
if progress:
|
||||||
@ -55,16 +45,28 @@ def convert(input_files: list[str],
|
|||||||
tmp_file_name = f'{output_files}.{i:04}.part'
|
tmp_file_name = f'{output_files}.{i:04}.part'
|
||||||
documents[0].copy(all_pages).write_pdf(tmp_file_name)
|
documents[0].copy(all_pages).write_pdf(tmp_file_name)
|
||||||
tmp_file_names.append(tmp_file_name)
|
tmp_file_names.append(tmp_file_name)
|
||||||
page_nums += [len(doc.pages) for doc in documents]
|
tmp_page_nums += [len(doc.pages) for doc in documents]
|
||||||
del documents
|
del documents
|
||||||
del all_pages
|
del all_pages
|
||||||
gc.collect()
|
gc.collect()
|
||||||
if progress:
|
if progress and i < BATCH_SIZE:
|
||||||
print(f'progress: {i + BATCH_SIZE + i // BATCH_SIZE + 1}/{steps}', flush=True)
|
print(f'progress: {i + BATCH_SIZE + i // BATCH_SIZE + 1}/{steps}', flush=True)
|
||||||
|
|
||||||
merger = pypdf.PdfWriter()
|
merger = pypdf.PdfWriter()
|
||||||
for pdf in tmp_file_names:
|
i = 0
|
||||||
merger.append(pdf)
|
for n, file_name in enumerate(input_files):
|
||||||
|
p0 = len(merger.pages)
|
||||||
|
if file_name.endswith('.pdf'):
|
||||||
|
merger.append(file_name)
|
||||||
|
else:
|
||||||
|
batch_page_nums = tmp_page_nums[i // BATCH_SIZE * BATCH_SIZE:(i // BATCH_SIZE + 1) * BATCH_SIZE]
|
||||||
|
page_start = sum(batch_page_nums[:i % BATCH_SIZE])
|
||||||
|
merger.append(tmp_file_names[n // BATCH_SIZE], pages=(page_start, page_start + tmp_page_nums[i]))
|
||||||
|
i += 1
|
||||||
|
p1 = len(merger.pages)
|
||||||
|
page_nums.append(p1 - p0)
|
||||||
|
if padding and len(merger.pages) % 2 != 0:
|
||||||
|
merger.add_blank_page()
|
||||||
merger.write(output_files)
|
merger.write(output_files)
|
||||||
merger.close()
|
merger.close()
|
||||||
del merger
|
del merger
|
||||||
@ -91,10 +93,11 @@ def _wrapper_convert(args: list[str], encoding: str = None, padding: bool = Fals
|
|||||||
padding = True
|
padding = True
|
||||||
t0 = time.process_time()
|
t0 = time.process_time()
|
||||||
pages = convert(inputs, output, encoding=encoding, padding=padding, progress=progress)
|
pages = convert(inputs, output, encoding=encoding, padding=padding, progress=progress)
|
||||||
|
total = sum(p + 1 if padding and p % 2 != 0 else p for p in pages)
|
||||||
t1 = time.process_time()
|
t1 = time.process_time()
|
||||||
print(f'success: '
|
print(f'success: '
|
||||||
f'{len(args) - 1} documents, '
|
f'{len(args) - 1} documents, '
|
||||||
f'{sum(pages)} pages ({", ".join(str(p) for p in pages)}), '
|
f'{total} pages ({", ".join(str(p) for p in pages)}), '
|
||||||
f'{t1 - t0:.1f} sec',
|
f'{t1 - t0:.1f} sec',
|
||||||
flush=True)
|
flush=True)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
Reference in New Issue
Block a user