Following https://medium.com/analytics-vidhya/how-to-scrape-data-from-a-website-using-python-for-beginner-5c770a1fbe2d, I have started scraping data from a website that requires a login. My website is a bit different from the one in the tutorial, and I do get a result, but it is not in the format I want. Code:
import datetime
from pprint import pprint

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Cookies sent with the search request (passed to requests.get via cookies=).
# NOTE(review): these look like values captured from a logged-in browser
# session - presumably short-lived, so confirm they are still valid before
# reuse, and avoid publishing real session tokens.
# NOTE(review): the spaces inside some values (e.g. 'reese84') may be '+'
# characters lost in transcription - verify against the browser.
cookies = {
'CFID': '180615757',
'CFTOKEN': '64089929988eb934-58E2ACC9-AD21-785B-2AFBCE86106B41FE',
'visid_incap_2388351': '0Vmr7QpDRvmVw8fbXUJFkB5XEWMAAAAAQUIPAAAAAADtlXunU/D8GLU5VofHHier',
'_ga_6ZQNJ4ELG2': 'GS1.1.1662315508.15.1.1662315668.0.0.0',
'_ga': 'GA1.2.147261521.1662080801',
'_gid': 'GA1.2.1149490171.1662080801',
'reese84': '3:yMGXsdMquwoCj3IoSFRCMg==:Vf20HwL77P8oWYTTKbE0XigwyQE3d2lLQpPVoZYcoL8SJTmLeqAani 7GspfC2BiJYOOytBlkIp9MewLgs/XbkaiLrSvLnMdZ0aT8/M9FvBohByybnJXNl25ya/yfpGhL9oT1HKMZYnKqSR0Sg8 nHTUEO0/YErJgQmfoeYIT4kmE01S8cndGIemtuGjvq1hzB/D9VAQL7S3idutOumBNu84j5FyCdOBClCJTriE X9j40lj1swIxFlryTmBAtLHnEvN9M57N4LMb13yuSBaCawrv4fnron0JnUvfKpLU0CXTnpcM9hJNGv9Ekb4Ap43CZDPdeLVzEmj 39wCVtXPtMqBNCU6mPVBSeJCRHyRuQjY y0Sv5w7ME2LXhT8bEGHyE8yeuxddxvoG51STebu pb0mSp5n iKotUEn9h sA=:WH64twwKGqtE4pUorYOeGylONeXRsfG 3Qe3zAfpdrs=',
'__atuvc': '65|35,2|36',
'COOKIESTATUS': 'ON',
'HIDECOOKIEBANNER': 'TRUE',
'nlbi_2388351': 'jGGxMFazFBqnU x okRrFAAAAAC/AJ/k R2U vs5Q4LIRTS7',
'nlbi_2388351_2147483392': 'PUildkEvtiZ9uje3okRrFAAAAABv1NR/7gPLX7Lc/iS5ei8N',
'incap_ses_989_2388351': 'mWy Uq7aLX000xomDaO5DfTrFGMAAAAA6XmB42vG5CO6i609/RhyKg==',
'incap_ses_468_2388351': 'sDNcR2labTHyNXYlUqx BipAFGMAAAAAImV2A07lGANZGfpvhvPlLg==',
'__atuvs': '6314ec0cdbe92a78001',
'_gat_gtag_UA_12825325_1': '1',
}
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:104.0) Gecko/20100101 Firefox/104.0',
'Accept': 'text/html,application/xhtml xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.5',
# 'Accept-Encoding': 'gzip, deflate, br',
'Referer': 'https://www.higheredjobs.com/admin/',
'Connection': 'keep-alive',
# Requests sorts cookies= alphabetically
# 'Cookie': 'CFID=180615757; CFTOKEN=64089929988eb934-58E2ACC9-AD21-785B-2AFBCE86106B41FE; visid_incap_2388351=0Vmr7QpDRvmVw8fbXUJFkB5XEWMAAAAAQUIPAAAAAADtlXunU/D8GLU5VofHHier; _ga_6ZQNJ4ELG2=GS1.1.1662315508.15.1.1662315668.0.0.0; _ga=GA1.2.147261521.1662080801; _gid=GA1.2.1149490171.1662080801; reese84=3:yMGXsdMquwoCj3IoSFRCMg==:Vf20HwL77P8oWYTTKbE0XigwyQE3d2lLQpPVoZYcoL8SJTmLeqAani 7GspfC2BiJYOOytBlkIp9MewLgs/XbkaiLrSvLnMdZ0aT8/M9FvBohByybnJXNl25ya/yfpGhL9oT1HKMZYnKqSR0Sg8 nHTUEO0/YErJgQmfoeYIT4kmE01S8cndGIemtuGjvq1hzB/D9VAQL7S3idutOumBNu84j5FyCdOBClCJTriE X9j40lj1swIxFlryTmBAtLHnEvN9M57N4LMb13yuSBaCawrv4fnron0JnUvfKpLU0CXTnpcM9hJNGv9Ekb4Ap43CZDPdeLVzEmj 39wCVtXPtMqBNCU6mPVBSeJCRHyRuQjY y0Sv5w7ME2LXhT8bEGHyE8yeuxddxvoG51STebu pb0mSp5n iKotUEn9h sA=:WH64twwKGqtE4pUorYOeGylONeXRsfG 3Qe3zAfpdrs=; __atuvc=65|35,2|36; COOKIESTATUS=ON; HIDECOOKIEBANNER=TRUE; nlbi_2388351=jGGxMFazFBqnU x okRrFAAAAAC/AJ/k R2U vs5Q4LIRTS7; nlbi_2388351_2147483392=PUildkEvtiZ9uje3okRrFAAAAABv1NR/7gPLX7Lc/iS5ei8N; incap_ses_989_2388351=mWy Uq7aLX000xomDaO5DfTrFGMAAAAA6XmB42vG5CO6i609/RhyKg==; incap_ses_468_2388351=sDNcR2labTHyNXYlUqx BipAFGMAAAAAImV2A07lGANZGfpvhvPlLg==; __atuvs=6314ec0cdbe92a78001; _gat_gtag_UA_12825325_1=1',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
# Requests doesn't support trailers
# 'TE': 'trailers',
}
# Query-string parameters selecting the job category to list.
params = {
    'JobCat': '141',
    'CatName': 'Academic Advising',
}

# Fetch the search page with the cookies and headers defined earlier in the script.
response = requests.get('https://www.higheredjobs.com/admin/search.cfm', params=params, cookies=cookies, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')  # alternative parser tried: 'lxml'

# Select the <div> elements by the two class strings of interest.
job_title = soup.find_all('div', class_=["row record", "col-sm-5 text-sm-right"])

jobs_list = []
for i in job_title:
    # .text keeps every embedded whitespace character, which is where the
    # \n, \r and \t runs in the resulting column come from.
    name = i.text
    jobs_list.append(name)

# One row per selected <div>, all in a single 'Jobs title' column.
df = pd.DataFrame({'Jobs title': jobs_list})
Present output:
df =
Jobs title
0 \n\nRe-Sort\n\n\r\n\t\t\tResults 1 - 70 of 70\...
1 \n\n\r\n\t\t\t\t\t\t\t\t\t\t\tAssistant Profes...
2 \r\n\t\t\t\t\t\t\t\t\t\t\r\n\t\t\t\t\t\t\t\t\t...
Expected output:
df =
Jobs title Company name location Posted
0 Assistant Professor/Associate University of Southern Indiana Evansville, IN 09/02/22
Professor of Engineering,
Pott College of Science,
Engineering, and Education - F22057F1
CodePudding user response:
The main issue is that you are trying to create your DataFrame from the unstructured data collected in your list.
So structure it first, e.g. as a dict per record, append each dict to your list, and then create your DataFrame:
jobs_list = []
for i in soup.select('.row.record'):
    # Pair the column names with the record's non-empty text fragments, in
    # document order; zip() stops at the shorter of the two sequences.
    jobs_list.append(dict(zip(['title','university','location','study','date'],i.stripped_strings)))
pd.DataFrame(jobs_list)
Note: If you would like to change the column headers, change this list -> ['title','university','location','study','date']
Example
import pandas as pd
from bs4 import BeautifulSoup

# Two sample records shaped like the search results page. Each record is a
# <div class="row record"> wrapper, which is what the selector below matches.
html ='''
<div class="row record">
<div><a href="details.cfm?JobCode=178085874&Title=Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1">
Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1</a>
<br/>
University of Southern Indiana <br/>
Evansville, IN
</div>
<div class="col-sm-5 text-sm-right">
Electrical Engineering
<br/> Posted 09/02/22<br/>
</div>
</div>
<div class="row record">
<div>
<a href="details.cfm?JobCode=178085843&Title=Assistant Professor of Engineering F99507">
Assistant Professor of Engineering F99507</a>
<br/>
McNeese State University <br/>
Lake Charles, LA
</div>
<div class="col-sm-5 text-sm-right">
Electrical Engineering
<br/> Posted 09/02/22<br/>
</div>
</div>
'''
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(html, 'html.parser')
jobs_list = []
for i in soup.select('.row.record'):
    # Pair the column names with the record's non-empty text fragments, in
    # document order; zip() stops at the shorter of the two sequences.
    jobs_list.append(dict(zip(['title','university','location','study','date'],i.stripped_strings)))
pd.DataFrame(jobs_list)
Output
| | title | university | location | study | date |
|---|---|---|---|---|---|
| 0 | Assistant Professor/Associate Professor of Engineering, Pott College of Science, Engineering, and Education - F22057F1 | University of Southern Indiana | Evansville, IN | Electrical Engineering | Posted 09/02/22 |
| 1 | Assistant Professor of Engineering F99507 | McNeese State University | Lake Charles, LA | Electrical Engineering | Posted 09/02/22 |
CodePudding user response:
The following is a complete example of how you can extract the jobs under 'Academic Advising' from that website:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from tqdm import tqdm

SEARCH_URL = 'https://www.higheredjobs.com/admin/search.cfm'
TOTAL_JOBS = 1337  # number of postings to page through
PAGE_SIZE = 100    # postings requested per page (NumJobs)

headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}

# One session so the connection and headers are reused for every page.
s = requests.Session()
s.headers.update(headers)

big_list = []
# StartRow is 1-based; the +1 keeps the last page when TOTAL_JOBS lands exactly
# on a page start.
for x in tqdm(range(1, TOTAL_JOBS + 1, PAGE_SIZE)):
    r = s.get(
        SEARCH_URL,
        params={'JobCat': 141, 'StartRow': x, 'SortBy': 4, 'NumJobs': PAGE_SIZE},
        timeout=30,
    )
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    soup = bs(r.text, 'html.parser')
    # A single descendant selector returns an empty list (rather than raising)
    # when the results container is missing from the page.
    for job in soup.select('div#js-results div.row.record'):
        link = job.select_one('a')
        if link is None:
            continue  # record without a title link
        big_list.append((link.get_text(strip=True), link.get('href')))

# Drop duplicate (title, url) pairs while keeping the order they were found in.
df = pd.DataFrame(list(dict.fromkeys(big_list)), columns=['Job', 'Url'])
print(df)
The result is a DataFrame with all of those jobs (1337 rows):
Job Url
0 Director, Usha Kundu, MD College of Health Adv... details.cfm?JobCode=178071028&Title=Director%2...
1 Academic Advisor, College of Natural, Behavior... details.cfm?JobCode=178061977&Title=Academic%2...
2 Part-Time Academic Advisor for EAP & Foreign L... details.cfm?JobCode=177870235&Title=Part-Tim...
3 Student Service Assistant ll (Temp) details.cfm?JobCode=178044985&Title=Student ...
4 On-Call Academic Advisor (Applicant Pool) details.cfm?JobCode=177522145&Title=On-Call%...
... ... ...
1332 Part-Time Academic Support Coach details.cfm?JobCode=178060131&Title=Part-Tim...
1333 Academic Advisor details.cfm?JobCode=178005430&Title=Academic%2...
1334 Retention Coordinator/Academic Advisor details.cfm?JobCode=178077784&Title=Retention%...
1335 P220178 - Academic Advisor, School of Public H... details.cfm?JobCode=177930648&Title=P220178 ...
1336 Director of Academic Advising - Georgetown Uni... details.cfm?JobCode=178021588&Title=Director%2...
CodePudding user response:
To remove the surrounding whitespace characters (\n, \r, \t), call the get_text(strip=True) method instead of using the .text property:
# strip=True trims leading/trailing whitespace from each text fragment
name = i.get_text(strip=True)
