Home > Blockchain >  How do I split a PDF using Python, every page that contains a set of specific unique text
How do I split a PDF using Python, every page that contains a set of specific unique text

Time:01-30

I have a large PDF file and need to split it every 'X' pages, where 'X' can vary. Each page contains the text 'Name:', and I need a split wherever the text that follows 'Name: ' changes from one page to the next.

So page 1 might have 'Name: Sachin', then page 2 might also have 'Name: Sachin', but page 3 has 'Name: Sarah', so it should split from Pages 1 to 2, and then Page 3.

Here is a script I found, except it splits on every page, regardless.

https://www.blog.pythonlibrary.org/2018/04/11/splitting-and-merging-pdfs-with-python/

Thanks in advance,

Sachin

UPDATE:

Here is some code that still splits on every page regardless, but it detects the name that follows the text 'Name:' and names each split file accordingly, so the name appears in the filename.

How do I update the code so that, if two consecutive pages contain the same name (after the text field 'Name:'), it DOES NOT SPLIT at that page but instead merges the pages with the same name into one PDF file?

Thanks again,

Sachin

import os
import re
from PyPDF2 import PdfFileReader, PdfFileWriter

# Split 'Payslips.pdf' into one PDF per page, naming each output file after
# the person whose name follows the text 'Name:' on that page.
pdf_file_path = 'Payslips.pdf'
file_base_name = pdf_file_path.replace('.pdf', '')
output_folder_path = os.path.join(os.getcwd(), 'Output')

# Make sure the destination exists so open() below cannot fail on a
# missing directory.
os.makedirs(output_folder_path, exist_ok=True)

pdf = PdfFileReader(pdf_file_path)

for page_num in range(pdf.numPages):

    # Setup Objects & Classes (a fresh writer per page: every page becomes
    # its own output file)
    pdfWriter = PdfFileWriter()
    pageObj = pdf.getPage(page_num)
    pdfWriter.addPage(pageObj)

    # Extract Text
    Text = pageObj.extractText()

    # 'Name:' followed by the shortest run of non-digit characters that ends
    # in whitespace. Indexing [0] raises IndexError when the page has no
    # such match.
    MatchedTextArray = re.findall(r"Name:[^0-9]+?\s", Text)
    MatchedText = (MatchedTextArray[0].replace('Name:', '')).replace('\n', '')

    # Splitting on UpperCase: the appended 'A' is a sentinel so the last
    # capitalised chunk also gets an end position.
    res_pos = [i for i, e in enumerate(MatchedText + 'A') if e.isupper()]
    res_list = [MatchedText[res_pos[j]:res_pos[j + 1]]
            for j in range(len(res_pos)-1)]

    # Extracting Firstname (the second capitalised chunk; presumably the
    # first chunk is a title -- confirm against the real documents)
    firstname = res_list[1]

    # Extracting Surname (everything after the first two chunks)
    del res_list[0:2]
    surname = ''.join(res_list)

    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join(output_folder_path,
        '{0}, {1} - {2}.pdf'.format(surname.upper(), firstname.upper(), file_base_name.upper())),
        'wb') as f:
        pdfWriter.write(f)

    print("Split Page " + str(page_num))

CodePudding user response:

Something like this should work:

import os
from PyPDF2 import PdfFileReader, PdfFileWriter
def pdf_splitter(path):
    """Sketch of a page-by-page PDF splitter with a user-supplied condition.

    Opens the PDF at ``path``, wraps each page in its own ``PdfFileWriter``
    and writes it out whenever ``your_condition`` is falsy.

    NOTE(review): ``your_condition`` is a placeholder that is not defined
    anywhere in the visible code; it must be replaced with a real test
    before this function can run. The output name "Give_it_a_name.txt" is
    likewise a placeholder, and ``output_filename`` is computed but not
    used yet.
    """
    fname = os.path.splitext(os.path.basename(path))[0]
    pdf = PdfFileReader(path)
    for page in range(pdf.getNumPages()):
        pdf_writer = PdfFileWriter()
        pdf_writer.addPage(pdf.getPage(page))
        # 1-based page number for the suggested file name.
        output_filename = '{}_page_{}.pdf'.format(
            fname, page + 1)
        if not your_condition: # only write if condition isn't met (anymore)
            with open("Give_it_a_name.txt", 'wb') as out:
                pdf_writer.write(out)
            print('Created: {}'.format("Give_it_a_name.txt"))
if __name__ == '__main__':
    # Executed as a script: run the splitter on the sample document.
    path = 'w9.pdf'
    pdf_splitter(path=path)

CodePudding user response:

Ok, I think I solved it:

import os
import re
from PyPDF2 import PdfFileReader, PdfFileWriter

# Split 'Payslips.pdf' into one PDF per person: consecutive pages whose
# 'Name:' text is identical are merged into the same output file.
pdf_file_path = 'Payslips.pdf'
file_base_name = pdf_file_path.replace('.pdf', '')
output_folder_path = os.path.join(os.getcwd(), 'Output')

# Make sure the destination exists so open() and os.listdir() below cannot
# fail on a missing directory.
os.makedirs(output_folder_path, exist_ok=True)

pdf = PdfFileReader(pdf_file_path)

# Split Files
# `count` is the number of upcoming pages already merged into the previous
# output file; those pages must be skipped by the outer loop.
count = 0
for page_num in range(pdf.numPages):

    # Skip Parent Loop if needed. Decrement by one per skipped page so that
    # runs of three or more same-name pages are skipped completely.
    if count > 0:
        count -= 1
        continue

    # Setup Objects & Classes
    pdfWriter = PdfFileWriter()
    pageObj = pdf.getPage(page_num)
    pdfWriter.addPage(pageObj)

    # Search on Current Page: 'Name:' followed by the shortest run of
    # non-digit characters that ends in whitespace. Indexing [0] raises
    # IndexError when the page has no such match.
    Text = pageObj.extractText()
    MatchedTextArray = re.findall(r"Name:[^0-9]+?\s", Text)
    MatchedText = (MatchedTextArray[0].replace('Name:', '')).replace('\n', '')

    # Search on following Pages: keep appending pages while the name matches.
    i = page_num + 1
    while i < pdf.numPages:
        pageObjNext = pdf.getPage(i)
        TextNext = pageObjNext.extractText()
        MatchedTextArrayNext = re.findall(r"Name:[^0-9]+?\s", TextNext)
        MatchedTextNext = (MatchedTextArrayNext[0].replace('Name:', '')).replace('\n', '')

        if MatchedText == MatchedTextNext:
            i += 1
            count += 1
            pdfWriter.addPage(pageObjNext)
        else:
            break

    # Splitting on UpperCase: the appended 'A' is a sentinel so the last
    # capitalised chunk also gets an end position.
    res_pos = [i for i, e in enumerate(MatchedText + 'A') if e.isupper()]
    res_list = [MatchedText[res_pos[j]:res_pos[j + 1]] for j in range(len(res_pos)-1)]

    # Extracting Firstname (the second capitalised chunk; presumably the
    # first chunk is a title -- confirm against the real documents)
    firstname = res_list[1]

    # Extracting Surname: join the remaining chunks with spaces, except
    # directly after a hyphen or apostrophe (e.g. "Smith-" "Jones",
    # "O'" "Brien"), where the chunks belong to one word.
    del res_list[0:2]
    surname = res_list[0] if res_list else ''
    for previous_part, part in zip(res_list, res_list[1:]):
        if previous_part[-1] == "-" or previous_part[-1] == "'":
            surname = surname + part
        else:
            surname = surname + " " + part

    # Write PDF File (no extension yet; the rename step below adds it).
    # The with-statement closes the file; no explicit close() is needed.
    with open(os.path.join(output_folder_path,
        '{0}, {1}'.format(surname.upper(), firstname.upper())), 'wb') as f:
        pdfWriter.write(f)

# Rename Files in Output Directory
# NOTE: this renames every entry in the folder, including files left over
# from earlier runs.
files = os.listdir(output_folder_path)
for file in files:
    os.rename(os.path.join(output_folder_path, file),
    os.path.join(output_folder_path, 'WE 25JAN 2022 - ' + file + ' - PAYSLIP' + '.pdf'))
  •  Tags:  
  • Related