I am trying to generate code that will loop through each year on this site of New York public employee salaries and individually select each "Subagency/Type" option that is available. The challenge is that the number of options changes from year to year. I have thought of a few ways one might do this, but haven't been able to figure any of them out. The XPATHs of these elements follow the pattern of
So before I realized that the number of options changed from year to year, I had been using a loop that worked like this:
for box in tqdm(range(1,74)):
link_to_use = '//*[@id="subagencyGroup"]/ul/li[' str(box) ']/span[1]'
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, link_to_use)))
driver.find_element(By.XPATH, link_to_use).click()
I would then execute some code on the results and finish the iteration by unclicking that element so that the next iteration would only click the next element. My thought was perhaps I could use beautifulsoup to get all of the li[N] values and find them maximum of N on the page and then used that 1 for the upper value of my range iterator.
But then I started looking into a more direct route of just getting from the page a list of those elements to click, so the code would look something more like:
elem = driver.find_elements_by_xpath('//*[@id="subagencyGroup"]')
all_li = elem.find_elements_by_tag_name("li")
for el in tqdm(all_li):
WebDriverWait(driver, 10).until(EC.presence_of_element_located(el)
el.click()
But this seems to stall for some reason unknown to me - despite clicking the first option successfully.
The full code is below, and I would appreciate any guidance.
# check if "page next" button is disabled
def check_for_break_condition(driver):
page_next_button = driver.find_element(By.ID, "data_loader")
styles = page_next_button.get_attribute("style")
if('none' in styles):
return True
else:
return False
# Loop Through Years
for yr in range(1, 15):
# Store URL to Use
url = 'https://www.seethroughny.net/payrolls'
# Initialize WebDriver
options = Options()
# options.headless = True
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument('--no-sandbox')
options.add_argument('--disable-application-cache')
options.add_argument('--disable-gpu')
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--window-size=1560,840")
driver = webdriver.Chrome('C:/Windows/System32/chromedriver.exe', options=options)
driver.get(url)
try:
driver.find_element(By.XPATH, '//*[@id="donate-banner"]/button/span').click()
except:
pass
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[@id="stny_payYear_2020"]/div')))
driver.find_element(By.CSS_SELECTOR, "#yearGroupHeading a").click()
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[@id="stny_payYear_2020"]/div')))
driver.find_element(By.XPATH, '//*[@id="stny_payYear_2020"]/div').click()
yr_path = '//*[@id="yearGroup"]/ul/li[' str(yr) ']'
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, yr_path)))
driver.find_element(By.XPATH, yr_path).click()
# Select Colleges
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[@id="subagencyGroupHeading"]/h4/a')))
driver.find_element(By.CSS_SELECTOR, "#subagencyGroupHeading a").click()
driver.find_element(By.CSS_SELECTOR, "#subagencyGroup .form-control").click()
# WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, '//*[@id="subagencyGroupHeading"]/h4/a')))
driver.find_element(By.XPATH, '//*[@id="subagencyGroup"]/div/input').send_keys("College")
time.sleep(5)
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="subagencyGroup"]/ul/li[1]/span[1]')))
elem = driver.find_elements_by_xpath('//*[@id="subagencyGroup"]')
all_li = elem.find_elements_by_tag_name("li")
# Select SUNY and CUNY
for el in tqdm(all_li):
# link_to_use = '//*[@id="subagencyGroup"]/ul/li[' str(box) ']/span[1]'
WebDriverWait(driver, 10).until(EC.presence_of_element_located(el)
el.click()
time.sleep(1.5)
#for i in tqdm(range(1,2500)):
while not check_for_break_condition(driver):
try:
WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, '//*[@id="data_loader"]')))
driver.execute_script("stnyEvent.getNextPage();")
except:
captureError(driver)
el.click()
CodePudding user response:
Give this a try. Looks like you can get it through post requests.
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import time
s = requests.Session()
s.get('https://www.seethroughny.net/')
cookies = s.cookies.get_dict()
cookieStr = ''
for k,v in cookies.items():
cookieStr = f'{k}={v};'
url = 'https://www.seethroughny.net/tools/required/reports/payroll'
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
'cookie':cookieStr,
'accept': 'application/json, text/javascript, */*; q=0.01',
'referer': 'https://www.seethroughny.net/payrolls'}
payload = {'action':'get'}
def get_data(s, formData, payload):
retry = 0
try:
jsonData = s.post(url, headers=headers, data=formData, params=payload, timeout=10).json()
success = True
except:
retry =1
print('Retry: %s' %retry)
jsonData = None
success = False
time.sleep(10)
return jsonData, success
def parse_data(jsonData):
rows = []
html = jsonData['html']
soup = BeautifulSoup(html, 'html.parser')
trs = soup.find_all('tr', {'id':re.compile("^resultRow")})
for tr in trs:
tds = tr.find_all('td')
row = {
'Name':tds[1].text,
'Employer/Agency':tds[2].text,
'Total Pay':tds[3].text,
'Subagency/Type':tds[4].text
}
expandedRow = tr.find_next('tr', {'id':re.compile("^expandRow")})
td = expandedRow.find_all('td')[1]
divs = td.find_all('div', {'class':re.compile("^row")})
for each in divs:
header = each.find_all('div')[0].text
value = each.find_all('div')[1].text
row.update({header:value})
rows.append(row)
return rows
rows = []
for year in [2021]:
completedYear = False
# Get Total pages
formData = {
'PayYear[]': f'{year}',
'SortBy': 'YTDPay DESC',
'current_page': '0',
'result_id': '0',
'url': '/tools/required/reports/payroll?action=get',
'nav_request': '0'}
jsonData, success = get_data(s, formData, payload)
total_pages = jsonData['total_pages']
print(f'Year: {year}\tPage: 0 of {total_pages}')
rows = parse_data(jsonData)
page = 1
while completedYear == False:
success = False
while success == False:
formData.update({'current_page': '%s' %page})
jsonData, success = get_data(s, formData, payload)
rows = parse_data(jsonData)
print(f'Year: {year}\tPage: {page} of {total_pages}')
page =1
if page > total_pages:
completedYear = True
df = pd.DataFrame(rows)
Output:
print(df.head(5).to_string())
Name Employer/Agency Total Pay Subagency/Type SubAgency/Type Title Rate of Pay Pay Year Pay Basis Branch/Major Category
0 Johnson, Candace Roswell Park Cancer Institute Corporation $1,622,807 Roswell Park Cancer Institute Corporation Roswell Park Cancer Institute Corporation President & Ceo $0 2021 Annual Public Authorities
1 Kuettel, Michael Roswell Park Cancer Institute Corporation $958,373 Roswell Park Cancer Institute Corporation Roswell Park Cancer Institute Corporation Chair $0 2021 Annual Public Authorities
2 Odunsi, Adekunle Roswell Park Cancer Institute Corporation $918,079 Roswell Park Cancer Institute Corporation Roswell Park Cancer Institute Corporation Deputy Director $0 2021 Annual Public Authorities
3 Fenstermaker, Robert Roswell Park Cancer Institute Corporation $914,887 Roswell Park Cancer Institute Corporation Roswell Park Cancer Institute Corporation Chair $0 2021 Annual Public Authorities
4 Guru, Khurshid Roswell Park Cancer Institute Corporation $856,979 Roswell Park Cancer Institute Corporation Roswell Park Cancer Institute Corporation Chair $0 2021 Annual Public Authorities
