In Python, how to loop through all clickable elements of a given Xpath group when length of list of-CodePudding

I am trying to generate code that will loop through each year on this site of New York public employee salaries and individually select each "Subagency/Type" option that is available. The challenge is that the number of options changes from year to year. I have thought of a few ways one might do this, but haven't been able to figure any of them out. The XPATHs of these elements follow the pattern of

So before I realized that the number of options changed from year to year, I had been using a loop that worked like this:

for box in tqdm(range(1,74)):
    link_to_use = '//*[@id="subagencyGroup"]/ul/li['   str(box)   ']/span[1]'
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, link_to_use)))
    driver.find_element(By.XPATH, link_to_use).click()

I would then execute some code on the results and finish the iteration by unclicking that element so that the next iteration would only click the next element. My thought was perhaps I could use beautifulsoup to get all of the li[N] values and find them maximum of N on the page and then used that 1 for the upper value of my range iterator.

But then I started looking into a more direct route of just getting from the page a list of those elements to click, so the code would look something more like:

elem = driver.find_elements_by_xpath('//*[@id="subagencyGroup"]')
all_li = elem.find_elements_by_tag_name("li")

for el in tqdm(all_li):
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(el)
    el.click()

But this seems to stall for some reason unknown to me - despite clicking the first option successfully.

The full code is below, and I would appreciate any guidance.

# check if "page next" button is disabled
def check_for_break_condition(driver):

    page_next_button = driver.find_element(By.ID, "data_loader")
    styles = page_next_button.get_attribute("style")
    if('none' in styles):
        return True
    else:
        return False

# Loop Through Years 
for yr in range(1, 15):

    # Store URL to Use
    url = 'https://www.seethroughny.net/payrolls'

    # Initialize WebDriver
    options = Options()
    # options.headless = True
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-application-cache')
    options.add_argument('--disable-gpu')
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1560,840") 
    driver = webdriver.Chrome('C:/Windows/System32/chromedriver.exe', options=options)
    driver.get(url)

    try:
        driver.find_element(By.XPATH, '//*[@id="donate-banner"]/button/span').click()
    except:
        pass

    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[@id="stny_payYear_2020"]/div')))
    driver.find_element(By.CSS_SELECTOR, "#yearGroupHeading a").click()

    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[@id="stny_payYear_2020"]/div')))
    driver.find_element(By.XPATH, '//*[@id="stny_payYear_2020"]/div').click()

    yr_path = '//*[@id="yearGroup"]/ul/li['   str(yr)   ']'
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, yr_path)))
    driver.find_element(By.XPATH, yr_path).click()

    # Select Colleges 
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[@id="subagencyGroupHeading"]/h4/a')))
    driver.find_element(By.CSS_SELECTOR, "#subagencyGroupHeading a").click()
    driver.find_element(By.CSS_SELECTOR, "#subagencyGroup .form-control").click()

    # WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[@id="subagencyGroupHeading"]/h4/a')))
    driver.find_element(By.XPATH, '//*[@id="subagencyGroup"]/div/input').send_keys("College")

    time.sleep(5)

    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="subagencyGroup"]/ul/li[1]/span[1]')))

    elem = driver.find_elements_by_xpath('//*[@id="subagencyGroup"]')
    all_li = elem.find_elements_by_tag_name("li")

    # Select SUNY and CUNY 
    for el in tqdm(all_li):

        # link_to_use = '//*[@id="subagencyGroup"]/ul/li['   str(box)   ']/span[1]'
        WebDriverWait(driver, 10).until(EC.presence_of_element_located(el)
        el.click()
        time.sleep(1.5)

        #for i in tqdm(range(1,2500)):
        while not check_for_break_condition(driver):

            try: 
                WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, '//*[@id="data_loader"]')))
                driver.execute_script("stnyEvent.getNextPage();")
            except:
                captureError(driver)

        el.click()

CodePudding user response：

Give this a try. Looks like you can get it through post requests.

import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time

s = requests.Session()
s.get('https://www.seethroughny.net/')

cookies = s.cookies.get_dict()
cookieStr = ''
for k,v in cookies.items():
    cookieStr  = f'{k}={v};'

url = 'https://www.seethroughny.net/tools/required/reports/payroll'
headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
        'cookie':cookieStr,
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'referer': 'https://www.seethroughny.net/payrolls'}
payload = {'action':'get'}



def get_data(s, formData, payload):
    retry = 0
    try:
        jsonData = s.post(url, headers=headers, data=formData, params=payload, timeout=10).json()
        success = True
    except:
        retry =1
        print('Retry: %s' %retry)
        jsonData = None
        success = False
        time.sleep(10)
        
    return jsonData, success
        
    
def parse_data(jsonData):
    rows = []
    html = jsonData['html']   
    soup = BeautifulSoup(html, 'html.parser')    
        
    trs = soup.find_all('tr', {'id':re.compile("^resultRow")})
    for tr in trs:
        tds = tr.find_all('td')
        
        row = {
            'Name':tds[1].text,
            'Employer/Agency':tds[2].text,
            'Total Pay':tds[3].text,
            'Subagency/Type':tds[4].text
            }
        
        expandedRow = tr.find_next('tr', {'id':re.compile("^expandRow")})
        td = expandedRow.find_all('td')[1]
        divs = td.find_all('div', {'class':re.compile("^row")})
        for each in divs:
            header = each.find_all('div')[0].text
            value = each.find_all('div')[1].text
            
            row.update({header:value})
            
        rows.append(row)
        
    return rows
    

rows = []
for year in [2021]:
    completedYear = False
    # Get Total pages 
    formData = {
    'PayYear[]': f'{year}',
    'SortBy': 'YTDPay DESC',
    'current_page': '0',
    'result_id': '0',
    'url': '/tools/required/reports/payroll?action=get',
    'nav_request': '0'}   
    
    jsonData, success = get_data(s, formData, payload)
    total_pages = jsonData['total_pages']
    print(f'Year: {year}\tPage: 0 of {total_pages}')
    
    rows  = parse_data(jsonData)
    
    page = 1
    while completedYear == False:
        success = False
        while success == False:
            formData.update({'current_page': '%s' %page})
            jsonData, success = get_data(s, formData, payload)
            
        rows  = parse_data(jsonData)   
        print(f'Year: {year}\tPage: {page} of {total_pages}')
        page =1
        if page > total_pages:
            completedYear = True
            
df = pd.DataFrame(rows)

Output:

print(df.head(5).to_string())
                   Name                            Employer/Agency   Total Pay                             Subagency/Type                             SubAgency/Type            Title Rate of Pay Pay Year Pay Basis Branch/Major Category
0      Johnson, Candace  Roswell Park Cancer Institute Corporation  $1,622,807  Roswell Park Cancer Institute Corporation  Roswell Park Cancer Institute Corporation  President & Ceo          $0     2021    Annual    Public Authorities
1      Kuettel, Michael  Roswell Park Cancer Institute Corporation    $958,373  Roswell Park Cancer Institute Corporation  Roswell Park Cancer Institute Corporation            Chair          $0     2021    Annual    Public Authorities
2      Odunsi, Adekunle  Roswell Park Cancer Institute Corporation    $918,079  Roswell Park Cancer Institute Corporation  Roswell Park Cancer Institute Corporation  Deputy Director          $0     2021    Annual    Public Authorities
3  Fenstermaker, Robert  Roswell Park Cancer Institute Corporation    $914,887  Roswell Park Cancer Institute Corporation  Roswell Park Cancer Institute Corporation            Chair          $0     2021    Annual    Public Authorities
4        Guru, Khurshid  Roswell Park Cancer Institute Corporation    $856,979  Roswell Park Cancer Institute Corporation  Roswell Park Cancer Institute Corporation            Chair          $0     2021    Annual    Public Authorities