I'm trying to build my first web scraper, but I can't figure out how to stop my program from looking for "next-page" links.
#get URLs for all pages
def page_parse(main_url, url_list):
    """Collect result-page URLs by following the "next page" link recursively.

    Args:
        main_url: URL of the page to fetch and inspect.
        url_list: List to which each discovered next-page URL is appended
            (mutated in place).

    Returns:
        The same ``url_list`` object that was passed in.
    """
    page = requests.get(main_url);
    soup = BeautifulSoup(page.content, 'html.parser');
    # Intended check: stop when the "next page" button is inactive.
    # NOTE(review): find() treats its first argument as a tag name, not a CSS
    # selector, so 'a.next.ajax-page' matches nothing and this branch is taken
    # on the very first page.
    if soup.find('a.next.ajax-page', href=True) == None:
        print('debug');
        return url_list;
    # NOTE(review): select_one() returns None when no element matches;
    # subscripting that result with ['href'] raises the reported TypeError.
    next_page = soup.select_one('a.next.ajax-page', href=True)['href']
    # Prefix the href with the site root to form the full URL.
    next_page = (f'http://www.yellowpages.com{next_page}')
    url_list.append(next_page);
    print(str(url_list))
    # Recurse into the next page; the return value is ignored because
    # url_list is mutated in place.
    page_parse(next_page, url_list);
    return url_list;
I know what the error is; I just have no idea how to check whether the "next page" button is active. I've tried looking for differences in the HTML between the first and last pages' "next page" buttons (the first page uses a.next.ajax-page, while the last uses div.next). Depending on what I change, my code either hits the print('debug') or gets to the last page and hits a TypeError (see below). I think the issue is that I can't check whether an element exists without accessing it.
Error code:
['http://www.yellowpages.com/omaha-ne/towing?page=2']
['http://www.yellowpages.com/omaha-ne/towing?page=2', 'http://www.yellowpages.com/omaha-ne/towing?page=3']
['http://www.yellowpages.com/omaha-ne/towing?page=2', 'http://www.yellowpages.com/omaha-ne/towing?page=3', 'http://www.yellowpages.com/omaha-ne/towing?page=4']
['http://www.yellowpages.com/omaha-ne/towing?page=2', 'http://www.yellowpages.com/omaha-ne/towing?page=3', 'http://www.yellowpages.com/omaha-ne/towing?page=4', 'http://www.yellowpages.com/omaha-ne/towing?page=5']
['http://www.yellowpages.com/omaha-ne/towing?page=2', 'http://www.yellowpages.com/omaha-ne/towing?page=3', 'http://www.yellowpages.com/omaha-ne/towing?page=4', 'http://www.yellowpages.com/omaha-ne/towing?page=5', 'http://www.yellowpages.com/omaha-ne/towing?page=6']
['http://www.yellowpages.com/omaha-ne/towing?page=2', 'http://www.yellowpages.com/omaha-ne/towing?page=3', 'http://www.yellowpages.com/omaha-ne/towing?page=4', 'http://www.yellowpages.com/omaha-ne/towing?page=5', 'http://www.yellowpages.com/omaha-ne/towing?page=6', 'http://www.yellowpages.com/omaha-ne/towing?page=7']
Traceback (most recent call last):
File "c:\Users\-\Documents\code\Python Projects\webscrape2.py", line 49, in <module>
url_list = page_parse(main_url, url_list);
File "c:\Users\-\Documents\code\Python Projects\webscrape2.py", line 19, in page_parse
page_parse(next_page, url_list);
File "c:\Users\-\Documents\code\Python Projects\webscrape2.py", line 19, in page_parse
page_parse(next_page, url_list);
File "c:\Users\-\Documents\code\Python Projects\webscrape2.py", line 19, in page_parse
page_parse(next_page, url_list);
[Previous line repeated 3 more times]
File "c:\Users\-\Documents\code\Python Projects\webscrape2.py", line 15, in page_parse
next_page = soup.select_one('a.next.ajax-page', href=True)['href']
TypeError: 'NoneType' object is not subscriptable
Sorry if this is confusing; this is my first time posting a question.
CodePudding user response:
The problem here is that you are trying to subscript a None value. On the last page, next_page = soup.select_one('a.next.ajax-page', href=True) finds no matching element and returns None, so you can't access ['href'] on the result.
CodePudding user response:
What happens?
Your selection soup.find('a.next.ajax-page', href=True) never finds the element you are searching for, because it mixes two syntaxes: find() expects a tag name, but it is being given a CSS selector. It therefore always returns None, so the attribute value can never be accessed either.
How to fix?
Change your line checking the next page element from:
if soup.find('a.next.ajax-page', href=True) == None:
to:
if soup.find('a',{'class':'next ajax-page'}) == None:
or
if soup.select_one('a.next.ajax-page') == None:
You should also be able to scrape all the basic information from the search results and store it in one step, instead of returning a list of URLs for the search pages:
def page_parse(url):
    """Scrape every search result, following "next page" links to the end.

    Starting at ``url``, each page is fetched and one entry is recorded per
    ``div.result`` element. The loop continues with the page referenced by
    the ``a.next.ajax-page`` link and stops on the first page without one.

    Relies on the module-level ``baseUrl`` to turn scraped hrefs into
    absolute URLs.

    Args:
        url: URL of the first search-results page.

    Returns:
        A list of dicts with the keys ``'title'`` and ``'url'``, in the
        order the results were encountered.

    Raises:
        requests.RequestException: If a page cannot be fetched, the request
            times out, or the server answers with an HTTP error status.
    """
    data = []
    while True:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        page = requests.get(url, timeout=30)
        # Fail loudly instead of parsing an error page and silently
        # returning truncated results.
        page.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page.text, 'html.parser')
        for item in soup.select('div.result'):
            data.append({
                'title': item.h2.text,
                'url': f"{baseUrl}{item.a['href']}",
            })
        # The last page has no active "next" anchor; that ends the crawl.
        next_link = soup.select_one('a.next.ajax-page')
        if next_link is None:
            return data
        url = f"{baseUrl}{next_link['href']}"
Example
import requests
from bs4 import BeautifulSoup
# Site root that page_parse() prepends to scraped hrefs to build full URLs.
baseUrl = 'http://www.yellowpages.com'
def page_parse(url):
    """Scrape every search result, following "next page" links to the end.

    Starting at ``url``, each page is fetched and one entry is recorded per
    ``div.result`` element. The loop continues with the page referenced by
    the ``a.next.ajax-page`` link and stops on the first page without one.

    Args:
        url: URL of the first search-results page.

    Returns:
        A list of dicts with the keys ``'title'`` and ``'url'``, in the
        order the results were encountered.

    Raises:
        requests.RequestException: If a page cannot be fetched, the request
            times out, or the server answers with an HTTP error status.
    """
    data = []
    while True:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        page = requests.get(url, timeout=30)
        # Fail loudly instead of parsing an error page and silently
        # returning truncated results.
        page.raise_for_status()
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(page.text, 'html.parser')
        for item in soup.select('div.result'):
            data.append({
                'title': item.h2.text,
                'url': f"{baseUrl}{item.a['href']}",
            })
        # The last page has no active "next" anchor; that ends the crawl.
        next_link = soup.select_one('a.next.ajax-page')
        if next_link is None:
            return data
        url = f"{baseUrl}{next_link['href']}"
# Run the scraper from the first results page. The returned list is only
# echoed in an interactive session; assign or print it when run as a script.
page_parse('http://www.yellowpages.com/omaha-ne/towing')
Output
[{'title': "1. Keith's BP",
'url': 'http://www.yellowpages.com/omaha-ne/mip/keiths-bp-460502890?lid=1002059325385'},
{'title': '2. Neff Towing Svc',
'url': 'http://www.yellowpages.com/omaha-ne/mip/neff-towing-svc-21969600?lid=1000282974083#gallery'},
{'title': '3. A & A Towing',
'url': 'http://www.yellowpages.com/omaha-ne/mip/a-a-towing-505777665?lid=1002056319136'},
{'title': '4. Cross Electronic Recycling',
'url': 'http://www.yellowpages.com/omaha-ne/mip/cross-electronic-recycling-473693798?lid=1000236876513'},
{'title': '5. 24 Hour Towing',
'url': 'http://www.yellowpages.com/omaha-ne/mip/24-hour-towing-521607477?lid=1001918028003'},
{'title': '6. A & A Towing Fast Friendly',
'url': 'http://www.yellowpages.com/omaha-ne/mip/a-a-towing-fast-friendly-478453697?lid=1000090213043'},
{'title': '7. Austin David Towing',
'url': 'http://www.yellowpages.com/omaha-ne/mip/austin-david-towing-465037110?lid=1001788338357'},...]
