Split and Combine PDFs in R and Python

Program
Published

October 24, 2021

Suppose there are 100 PDF files. The task is to extract the first page of each file and then combine those pages into a single 100-page PDF.

R code using the qpdf package:

# Extract the first page of every PDF in input_path and merge those pages,
# in input order, into temp_dir/output.pdf.
input_path <- "/Users/apple/Documents/test/input"
temp_dir <- "/Users/apple/Documents/test/output"

library(qpdf)

# Keep only PDF files; full.names = TRUE returns paths that are ready to use.
all_file_path <- list.files(input_path, pattern = "\\.pdf$",
                            ignore.case = TRUE, full.names = TRUE)
if (length(all_file_path) == 0) {
  stop("No PDF files found in ", input_path)
}

# Make sure the folder for the intermediate and final files exists.
dir.create(temp_dir, showWarnings = FALSE, recursive = TRUE)

# Record each single-page file as it is written. Re-listing temp_dir instead
# would sort the names alphabetically ("temp10.pdf" before "temp2.pdf"), which
# scrambles the page order, and would also pick up an output.pdf left over
# from an earlier run.
temp_file_path <- file.path(temp_dir,
                            paste0("temp", seq_along(all_file_path), ".pdf"))
for (i in seq_along(all_file_path)) {
  pdf_subset(all_file_path[i], pages = 1, output = temp_file_path[i])
}

pdf_combine(temp_file_path, output = file.path(temp_dir, "output.pdf"))

JIKE@无声 wrote a Python program for me using the PyPDF2 package, and it worked very well.

import os

from pathlib import Path

from PyPDF2 import PdfFileMerger, PdfFileReader, PdfFileWriter


def walk_target_folder(target_folder_path):
    """Return the paths of all PDF files found under a folder.

    The folder is walked recursively. Sub-folders and file names are
    visited in sorted order so that the result (and therefore the page
    order of the combined PDF) is the same on every run; ``os.walk``
    itself makes no ordering promise. The ``.pdf`` extension is matched
    case-insensitively, so ``report.PDF`` is included as well.

    Args:
        target_folder_path: Folder to search.

    Returns:
        A list of path strings, one per PDF file. The list is empty when
        the folder does not exist or contains no PDF files.
    """
    pdf_file_list = []
    for root, folders, files in os.walk(target_folder_path):
        # Sorting in place fixes the order in which os.walk descends
        # into the sub-folders.
        folders.sort()
        pdf_file_list.extend(
            os.path.join(root, name)
            for name in sorted(files)
            if name.lower().endswith('.pdf')
        )

    return pdf_file_list


def main(target_folder_path, save_file_path):
    """Combine the first page of every PDF in a folder into one PDF.

    Each PDF returned by ``walk_target_folder`` is opened and its first
    page is appended to a single writer. A file that cannot be read is
    reported on stdout and skipped, so one bad file does not abort the
    whole run.

    Args:
        target_folder_path: Folder that holds the source PDFs.
        save_file_path: Path of the combined PDF to write.

    Returns:
        None. If ``target_folder_path`` does not exist, a message is
        printed and nothing is written.
    """
    if not os.path.exists(target_folder_path):
        print('文件夹地址不存在')
        return

    pdf_file_list = walk_target_folder(target_folder_path)

    pdf_writer = PdfFileWriter()

    for index, pdf_file_path in enumerate(pdf_file_list):
        # Reset for every file: if PdfFileReader() itself fails, the error
        # report below must not touch an unbound name or the reader left
        # over from the previous file.
        reader = None
        try:
            reader = PdfFileReader(pdf_file_path)

            # Take the first page
            page = reader.getPage(0)
            print('读取第{}个pdf'.format(index + 1))

            pdf_writer.addPage(page)
            print('加入PdfFileWriter')
        except Exception as e:
            # Encryption status is only known when the file could be opened;
            # it is reported as None otherwise.
            encrypted = reader.isEncrypted if reader is not None else None
            print('==============================\n读取第{}个pdf 失败 文件加密状态:{}'.format(index + 1, encrypted))
            print('Pdf: {} 执行失败'.format(pdf_file_path))
            print('错误信息: ' + str(e) + '\n==============================\n')

    print('开始写文件...')
    with Path(save_file_path).open(mode="wb") as output_file:
        pdf_writer.write(output_file)

    print("导出成功")


if __name__ == '__main__':
    # Folder that holds the source PDFs
    target_folder_path = '/Users/apple/Documents/PDFs'
    # Destination path of the combined PDF
    save_file_path = '/Users/apple/Desktop/combine.pdf'

    main(
        target_folder_path=target_folder_path,
        save_file_path=save_file_path,
    )