I have a large PDF file that I need to split every 'X' pages, where 'X' can vary. Every page contains the text 'Name:', and the file should be split wherever the text after 'Name: ' changes from one page to the next.
For example, page 1 might have 'Name: Sachin' and page 2 might also have 'Name: Sachin', but page 3 has 'Name: Sarah'; in that case the output should be one file containing pages 1 to 2 and a second file containing page 3.
Here is a script I found, but it splits on every page regardless of the page content:
https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs-with-python/
Thanks in advance,
Sachin
UPDATE:
Here is some code that still splits on every page, but it detects the name that follows the text 'Name:' and puts that name in the filename of each split file.
How do I update the code so that, when two (or more) consecutive pages have the same name after the 'Name:' field, it DOES NOT SPLIT between those pages but instead merges the pages with the same name into one PDF file?
Thanks again,
Sachin
import os
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
pdf_file_path = 'Payslips.pdf'
# Use only the file's base name so a path with directories cannot leak
# path separators into the generated filenames.
file_base_name = os.path.splitext(os.path.basename(pdf_file_path))[0]
output_folder_path = os.path.join(os.getcwd(), 'Output')

# 'Name:' followed by the shortest run of non-digit characters that ends
# at a whitespace character.
NAME_PATTERN = re.compile(r"Name:[^0-9]+?\s")


def extract_name(text):
    """Return the text that follows 'Name:' in *text*.

    Only the first match is used. The 'Name:' label, newlines and any
    surrounding whitespace are removed from the result.

    Raises:
        ValueError: if *text* contains no 'Name:' field.
    """
    matches = NAME_PATTERN.findall(text)
    if not matches:
        raise ValueError("no 'Name:' field found in the page text")
    return matches[0].replace('Name:', '').replace('\n', '').strip()


def split_on_uppercase(text):
    """Split *text* into chunks that each begin at an uppercase letter.

    Characters before the first uppercase letter are dropped, so
    'MrJohnSmith' becomes ['Mr', 'John', 'Smith'].
    """
    starts = [i for i, ch in enumerate(text) if ch.isupper()]
    # Sentinel end position so the last chunk runs to the end of the text.
    starts.append(len(text))
    return [text[begin:end] for begin, end in zip(starts, starts[1:])]


def main():
    """Write every page of the input PDF to its own file.

    Each output file is named '<SURNAME>, <FIRSTNAME> - <INPUT NAME>.pdf',
    using the name found after 'Name:' on that page.
    """
    os.makedirs(output_folder_path, exist_ok=True)
    pdf = PdfFileReader(pdf_file_path)

    for page_num in range(pdf.numPages):
        page = pdf.getPage(page_num)
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(page)

        chunks = split_on_uppercase(extract_name(page.extractText()))
        if len(chunks) < 2:
            raise ValueError(
                'page {}: cannot find a first name in {!r}'.format(
                    page_num + 1, chunks))

        # The first chunk is discarded (presumably a title such as 'Mr' --
        # confirm against your PDFs); the second is the first name and
        # everything after it is the surname.
        firstname = chunks[1]
        surname = ''.join(chunks[2:])

        output_name = '{0}, {1} - {2}.pdf'.format(
            surname.upper(), firstname.upper(), file_base_name.upper())
        # The with-statement closes the file; no explicit close() needed.
        with open(os.path.join(output_folder_path, output_name), 'wb') as f:
            pdf_writer.write(f)

        print("Split Page " + str(page_num))


if __name__ == '__main__':
    main()
CodePudding user response:
Something like this should work:
import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_splitter(path, your_condition=None):
    """Split the PDF at *path* into one output file per page.

    Output files are written to the current working directory and named
    '<input base name>_page_<N>.pdf', where N is the 1-based page number.

    Args:
        path: Path of the PDF file to split.
        your_condition: Optional callable that receives the zero-based page
            index and returns True when that page must NOT be written.
            The default (None) writes every page.
    """
    fname = os.path.splitext(os.path.basename(path))[0]
    pdf = PdfFileReader(path)
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
        output_filename = '{}_page_{}.pdf'.format(
            fname, page + 1)
        # Only write if the condition isn't met (anymore).
        if your_condition is not None and your_condition(page):
            continue
        with open(output_filename, 'wb') as out:
            pdf_writer.write(out)
        print('Created: {}'.format(output_filename))
# Script entry point: split the sample PDF when run directly.
if __name__ == '__main__':
    path = 'w9.pdf'
    pdf_splitter(path)
CodePudding user response:
Ok, I think I solved it:
import os
import re
from PyPDF2 import PdfFileReader, PdfFileWriter
pdf_file_path = 'Payslips.pdf'
# Base name of the input file; kept from the original script, not used below.
file_base_name = os.path.splitext(os.path.basename(pdf_file_path))[0]
output_folder_path = os.path.join(os.getcwd(), 'Output')

# Text placed before and after 'SURNAME, FIRSTNAME' in each output filename.
output_prefix = 'WE 25JAN 2022 - '
output_suffix = ' - PAYSLIP.pdf'

# 'Name:' followed by the shortest run of non-digit characters that ends
# at a whitespace character.
NAME_PATTERN = re.compile(r"Name:[^0-9]+?\s")


def extract_name(text):
    """Return the text that follows 'Name:' in *text*.

    Only the first match is used. The 'Name:' label, newlines and any
    surrounding whitespace are removed from the result.

    Raises:
        ValueError: if *text* contains no 'Name:' field.
    """
    matches = NAME_PATTERN.findall(text)
    if not matches:
        raise ValueError("no 'Name:' field found in the page text")
    return matches[0].replace('Name:', '').replace('\n', '').strip()


def split_on_uppercase(text):
    """Split *text* into chunks that each begin at an uppercase letter.

    Characters before the first uppercase letter are dropped, so
    'MrJohnSmith' becomes ['Mr', 'John', 'Smith'].
    """
    starts = [i for i, ch in enumerate(text) if ch.isupper()]
    # Sentinel end position so the last chunk runs to the end of the text.
    starts.append(len(text))
    return [text[begin:end] for begin, end in zip(starts, starts[1:])]


def join_surname(parts):
    """Join surname chunks into one string.

    Chunks are separated by a single space, except after a chunk ending in
    '-' or "'" (e.g. ['O'', 'Brien'] -> "O'Brien").
    """
    surname = ''
    for part in parts:
        if surname and not surname.endswith(('-', "'")):
            surname += ' '
        surname += part
    return surname


def parse_name(name_text):
    """Return (firstname, surname) parsed from the text after 'Name:'.

    The first capitalised chunk is discarded (presumably a title such as
    'Mr' -- confirm against your PDFs), the second is the first name and
    the remaining chunks form the surname.

    Raises:
        ValueError: if fewer than three capitalised chunks are present.
    """
    chunks = split_on_uppercase(name_text)
    if len(chunks) < 3:
        raise ValueError(
            'cannot find a first name and surname in {!r}'.format(name_text))
    return chunks[1], join_surname(chunks[2:])


def group_consecutive(names):
    """Group the indices of adjacent equal entries in *names*.

    Returns a list of (name, [indices]) tuples in page order, e.g.
    ['a', 'a', 'b'] -> [('a', [0, 1]), ('b', [2])].
    """
    groups = []
    for index, name in enumerate(names):
        if groups and groups[-1][0] == name:
            groups[-1][1].append(index)
        else:
            groups.append((name, [index]))
    return groups


def main():
    """Split the input PDF into one file per run of same-name pages.

    Consecutive pages whose 'Name:' field is identical are merged into a
    single output file named
    'WE 25JAN 2022 - <SURNAME>, <FIRSTNAME> - PAYSLIP.pdf'.
    """
    os.makedirs(output_folder_path, exist_ok=True)
    pdf = PdfFileReader(pdf_file_path)

    # Read each page's name once, reporting the page number on failure.
    page_names = []
    for page_num in range(pdf.numPages):
        try:
            page_names.append(extract_name(pdf.getPage(page_num).extractText()))
        except ValueError as err:
            raise ValueError('page {}: {}'.format(page_num + 1, err)) from err

    # Grouping up front means a run of any length is written exactly once;
    # no skip counter is needed to step over already-merged pages.
    # NOTE: if the same name appears again later (non-adjacent), that later
    # run is written to the same filename and replaces the earlier file.
    for name, page_numbers in group_consecutive(page_names):
        firstname, surname = parse_name(name)

        pdf_writer = PdfFileWriter()
        for page_num in page_numbers:
            pdf_writer.addPage(pdf.getPage(page_num))

        # Write the final filename directly rather than renaming every file
        # in the output folder afterwards, which would also rename files
        # left over from a previous run.
        output_name = '{0}{1}, {2}{3}'.format(
            output_prefix, surname.upper(), firstname.upper(), output_suffix)
        with open(os.path.join(output_folder_path, output_name), 'wb') as f:
            pdf_writer.write(f)


if __name__ == '__main__':
    main()
