I am giving the URL as input: url = "https://www.amazon.in/s?k=headphones&page=1". This works fine, but it stops at page 19. Instead of just breaking at page 19, I want the script to continue with the next input, appended to "https://www.amazon.in/s?k=":
- "speakers&page=1"
- "earbuds&page=1" and so on, running in a loop.
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
# Accumulates one dict per search result across all fetched pages.
data = []


def getdata(url):
    """Fetch one search-result page and record the title of every result.

    One ``{'title': ...}`` dict per search result is appended to the
    module-level ``data`` list.

    Args:
        url: Address of the search-result page to download.

    Returns:
        The parsed page, so the caller can look up the link to the next page.
    """
    # Imported locally: the file-level imports do not bring in urllib.request.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')
    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        heading = e.find('h2')
        # A result without an <h2> heading is recorded with a missing title.
        title = heading.text if heading is not None else None
        data.append({'title': title})
    return a_soup
def getnextpage(a_soup):
    """Return the absolute URL of the next result page, or None.

    Args:
        a_soup: Parsed search-result page exposing a ``find`` method.

    Returns:
        The next page's URL as a string, or ``None`` when the page has no
        matching "next" link, which lets the caller's ``if not url`` check
        end the crawl instead of crashing.
    """
    page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})
    # No matching anchor (or one without an href) means there is no next page.
    href = page.get('href') if page is not None else None
    if not href:
        return None
    return 'http://www.amazon.in' + str(href)
# Starting page of the crawl; every iteration then follows the "next" link.
url = "https://www.amazon.in/s?k=headphones&page=1"
while True:
    geturl = getdata(url)
    url = getnextpage(geturl)
    # getnextpage yields a falsy value once there is no further page.
    if not url:
        break
    print(url)

# Collect everything gathered by getdata into a single table.
output = pd.DataFrame(data)
output
This code returns the correct results, but instead of giving it a new URL by hand every time, I want it to take a list of items, append each one to the end of the base URL in turn, and add the fetched results to the DataFrame. Note: the search results stop at the 19th page.
CodePudding user response:
Make a list of your keywords, iterate over it, and run the while loop inside each iteration.
keywords = ['speakers', 'earbuds']
for k in keywords:
    # Start every keyword at its first result page.
    url = 'https://www.amazon.in/s?k=' + k
    # Follow the "next" links until getnextpage reports there are none left.
    while True:
        geturl = getdata(url)
        url = getnextpage(geturl)
        if not url:
            break
        print(url)
Be aware that Amazon does not like such automated access to its pages and recognises the access patterns quite quickly. To reduce the frequency of the requests a bit, you should at least add a delay with time.sleep(). Of course, it would be even better to use an official API.
Example
from bs4 import BeautifulSoup as soup
import pandas as pd
import requests
import urllib
# Accumulates one dict per search result across all fetched pages.
data = []


def getdata(url):
    """Fetch one search-result page and record title and link of each result.

    One ``{'title': ..., 'url': ...}`` dict per search result is appended to
    the module-level ``data`` list. A result that lacks a heading or a link
    is still recorded, with ``None`` in the missing field, so a single odd
    entry does not abort the crawl.

    Args:
        url: Address of the search-result page to download.

    Returns:
        The parsed page, so the caller can look up the link to the next page.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly here.
    import urllib.request

    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}
    req = urllib.request.Request(url, headers=header)
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(req) as response:
        amazon_html = response.read()
    a_soup = soup(amazon_html, 'html.parser')
    for e in a_soup.select('div[data-component-type="s-search-result"]'):
        heading = e.find('h2')
        link = heading.find('a') if heading is not None else None
        href = link.get('href') if link is not None else None
        data.append({
            'title': heading.text if heading is not None else None,
            'url': ('http://www.amazon.in' + href) if href else None,
        })
    return a_soup
def getnextpage(a_soup):
    """Return the absolute URL of the next result page, or None.

    Args:
        a_soup: Parsed search-result page exposing a ``find`` method.

    Returns:
        The next page's URL as a string, or ``None`` when the page has no
        matching "next" link (or that link carries no ``href``).
    """
    page = a_soup.find('a', attrs={"class": 's-pagination-item s-pagination-next s-pagination-button s-pagination-separator'})
    # Explicit checks replace the former catch-all `except`, which would
    # also have hidden unrelated errors.
    href = page.get('href') if page is not None else None
    if not href:
        return None
    return 'http://www.amazon.in' + str(href)
import time
from urllib.parse import quote_plus

keywords = ['speakers', 'earbuds']
for k in keywords:
    # Escape the keyword so multi-word searches still form a valid URL.
    url = 'https://www.amazon.in/s?k=' + quote_plus(k)
    # Follow the "next" links until getnextpage reports there are none left.
    while True:
        geturl = getdata(url)
        url = getnextpage(geturl)
        if not url:
            break
        print(url)
        # Pause between requests to keep the access rate down.
        time.sleep(1)
Output (print)
http://www.amazon.in/s?k=speakers&page=2&qid=1649420352&ref=sr_pg_1
...
http://www.amazon.in/s?k=speakers&page=20&qid=1649420373&ref=sr_pg_19
http://www.amazon.in/s?k=earbuds&page=2&qid=1649420375&ref=sr_pg_1
...
http://www.amazon.in/s?k=earbuds&page=20&qid=1649420394&ref=sr_pg_19
