how to scrape page inside the result card using Bs4?-CodePudding

<img data-src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium" alt="Biryani By Kilo" data-gatype="RestaurantImageClick" data-url="/delhi/biryani-by-kilo-connaught-place-central-delhi-40178" data-w-onclick="cardClickHandler" src="https://im1.dineout.co.in/images/uploads/restaurant/sharpen/4/h/u/p4059-15500352575c63a9394c209.jpg?tr=tr:n-medium">

page url - https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p=1

This page contains several restaurant cards. While scraping the page in a loop, I want to follow each restaurant card's URL (the `data-url` attribute in the HTML above) and scrape the number of reviews from the page it points to. I don't know how to do this. My current code for scraping the normal front page is:

def extract(page):
    """Download one page of the biryani search results and parse it.

    Args:
        page: Page number substituted into the ``p`` query parameter.

    Returns:
        A ``BeautifulSoup`` tree built from the response body with the
        ``html.parser`` backend.

    Raises:
        requests.exceptions.Timeout: If the server does not respond within
            the timeout below.
    """
    url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}"  # URL of the website
    header = {'User-Agent':'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'} # Temporary user agent
    # requests waits forever by default; bound the wait so one stalled
    # connection cannot hang the whole multi-page scrape.
    r = requests.get(url, headers=header, timeout=30)
    return BeautifulSoup(r.content, 'html.parser')

def transform(soup):
    """Extract one record per restaurant card and append it to ``rest_list``.

    For every ``div`` with class ``restnt-card restaurant`` in ``soup`` a dict
    with the keys ``name``, ``location``, ``rating`` and ``price`` is appended
    to the module-level ``rest_list``. Nothing is returned.

    Args:
        soup: Parsed search-results page, as returned by ``extract``.
    """
    cards = soup.find_all('div', class_='restnt-card restaurant')
    for item in cards:
        title = item.find('a').text.strip()  # restaurant name
        loc = item.find('div', class_='restnt-loc ellipsis').text.strip()  # restaurant location

        # Some restaurants are unrated. Check for the missing element
        # explicitly instead of using a bare ``except``, which would also
        # hide unrelated errors (and swallow KeyboardInterrupt).
        rating_div = item.find('div', class_="img-wrap")
        if rating_div is None:
            rating = None
        else:
            # Keep only digits, commas and dots from the element's text.
            rating = re.sub(r"[^0-9,.]", "", rating_div.text)

        price_text = item.find('span', class_="double-line-ellipsis").text.strip()  # price for biriyani
        # Keep the digits only, then drop the last one.
        # NOTE(review): presumably the dropped digit is the party size from
        # text such as "... for 2" - confirm against the live markup.
        price = re.sub(r"[^0-9]", "", price_text)[:-1]

        biry_del = {
            'name': title,
            'location': loc,
            'rating': rating,
            'price': price,
        }
        rest_list.append(biry_del)

        
# Shared accumulator: transform() appends one dict per restaurant card to it.
rest_list = []

# Fetch and parse result pages 1 through 17 in order.
for page_number in range(1, 18):
    print(f'getting page, {page_number}')
    transform(extract(page_number))

I hope that is clear; please ask in the comments if anything is confusing.

CodePudding user response：

It's not very fast, but it looks like you can get all the details you want, including the review count (not 232!), if you hit this backend API endpoint: https://www.dineout.co.in/get_rdp_data_main/delhi/69676/restaurant_detail_main

import requests
from bs4 import BeautifulSoup
import pandas as pd

# One dict of header details per restaurant, written to CSV at the end.
rest_list = []
header = {'User-Agent':'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36'} # Temporary user agent

# Use a single Session for the whole run (instead of a new one per page) so
# connections and cookies are reused; the ``with`` block closes it afterwards.
with requests.Session() as s:
    for page in range(1,3):
        print(f'getting page, {page}')

        url = f"https://www.dineout.co.in/delhi-restaurants?search_str=biryani&p={page}"  # URL of the website
        # Bound every request so a stalled connection cannot hang the scrape.
        r = s.get(url, headers=header, timeout=30)
        soup = BeautifulSoup(r.content, 'html.parser')

        divs = soup.find_all('div', class_ = 'restnt-card restaurant')

        for item in divs:
            # The restaurant code is the last '-'-separated piece of the
            # card's first link.
            code = item.find('a')['href'].split('-')[-1] # restaurant code
            print(f'Getting details for {code}')
            data = s.get(f'https://www.dineout.co.in/get_rdp_data_main/delhi/{code}/restaurant_detail_main', timeout=30).json()

            info = data['header']
            # Drop these two entries to clean up the CSV; the default keeps a
            # response that lacks either key from raising KeyError.
            info.pop('share', None)
            info.pop('options', None)
            rest_list.append(info)

df = pd.DataFrame(rest_list)
df.to_csv('dehli_rest.csv',index=False)