The website gives me the same results for different URLs. I suspect the reason is that Selenium is not letting the page load completely before returning the page source. I originally wrote my code using Beautiful Soup alone, but according to the SO community, Selenium had to be used to get the final rendered webpage to scrape. I now use Selenium to fetch the page and Beautiful Soup to parse it, but the same problem still persists. The code is given below:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import datetime
import os
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# One entry per calendar day from 1971-02-01 through today.
date_list = pd.date_range(start = "1971-02-01", end=datetime.date.today(), freq='1d')
# Options object passed to the Chrome instances opened by the scraper functions.
chrome_options = Options()
chrome_options.add_argument("--headless") # Opens the browser up in background
# NOTE(review): this driver is created without chrome_options and is not used
# by any code visible here -- confirm whether it is needed.
driver = webdriver.Chrome()
def get_batsmen(date):
    """Scrape the ODI batting rankings page for `date`.

    Returns a one-column DataFrame ('Player Name') with the banner
    (top-ranked) player first, followed by the names found in the
    rankings table cells.
    """
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date}'
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        html = browser.page_source
        # NOTE(review): the page source is captured before implicitly_wait is
        # set, so this wait cannot influence the HTML parsed below.
        browser.implicitly_wait(10)
    doc = BeautifulSoup(html, "html.parser")
    find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns = ['Player Name'])
    return df
def get_bowler(date):
    """Scrape the ODI bowling rankings page for `date`.

    Returns a one-column DataFrame ('Player Name') with the banner
    (top-ranked) player first, followed by the names found in the
    rankings table cells.
    """
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling?at={date}'
    # page = requests.get(url).text
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        # NOTE(review): the page source is read immediately after get(),
        # with no wait of any kind.
        html = browser.page_source
    doc = BeautifulSoup(html, "html.parser")
    find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns = ['Player Name'])
    return df
def get_allrounder(date):
    """Scrape the ODI all-rounder rankings page for `date`.

    Returns a one-column DataFrame ('Player Name') with the banner
    (top-ranked) player first, followed by the names found in the
    rankings table cells.
    """
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/all-rounder?at={date}'
    # page = requests.get(url).text
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        # NOTE(review): the page source is read immediately after get(),
        # with no wait of any kind.
        html = browser.page_source
    doc = BeautifulSoup(html, "html.parser")
    find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
    player_list = []
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    player_list.append(find_top.text)
    for item in find_class:
        player_name = item.find("a")
        # print(player_name.text)
        player_list.append(player_name.text)
    df = pd.DataFrame(player_list, columns = ['Player Name'])
    return df
# Storing the data into multiple CSVs: one folder per year/month/day.
for date in date_list:
    year = date.year
    month = date.month
    day = date.day
    # NOTE(review): `date` is handed to the scrapers as a pandas Timestamp, so
    # the URL receives its full string form (date and time) -- confirm the
    # site accepts that.
    newpath = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}'
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    newpath1 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}'
    if not os.path.exists(newpath1):
        os.makedirs(newpath1)
    newpath2 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}\{day}'
    if not os.path.exists(newpath2):
        os.makedirs(newpath2)
    get_batsmen(date).to_csv(newpath2 + '/batsmen.csv')
    get_bowler(date).to_csv(newpath2 + '/bowler.csv')
    get_allrounder(date).to_csv(newpath2 + '/allrounder.csv')
I will be eternally grateful to anyone who can help.
CodePudding user response:
Using a different waiting method may help; try an explicit wait such as the following:
WebDriverWait(browser, delay)
Refer to this Answer
CodePudding user response:
Call browser.implicitly_wait(10) before assigning browser.page_source to html:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import re
import datetime
import os
import time
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# One entry per calendar day from 1971-02-01 through today.
date_list = pd.date_range(start = "1971-02-01", end=datetime.date.today(), freq='1d')
# Options object passed to the Chrome instances opened by the scraper functions.
chrome_options = Options()
chrome_options.add_argument("--headless") # Opens the browser up in background
# NOTE(review): this driver is created without chrome_options and is not used
# by any code visible here -- confirm whether it is needed.
driver = webdriver.Chrome()
def get_batsmen(date):
    """Scrape the ODI batting rankings page for `date`.

    Args:
        date: Value interpolated into the ``at`` query parameter of the URL.

    Returns:
        A one-column DataFrame ('Player Name') with the banner (top-ranked)
        player first, followed by the names found in the rankings table.

    Raises:
        selenium.common.exceptions.TimeoutException: if the banner element
            does not appear within 10 seconds of loading the page.
    """
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/batting?at={date}'
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        # implicitly_wait only applies to element lookups, not to page_source,
        # so block explicitly until the banner element is in the DOM.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'rankings-block__banner--name-large')
            )
        )
        html = browser.page_source
    doc = BeautifulSoup(html, "html.parser")
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
    player_list = [find_top.text]
    for item in find_class:
        player_name = item.find("a")
        # Skip cells that carry no link instead of failing on None.text.
        if player_name is not None:
            player_list.append(player_name.text)
    return pd.DataFrame(player_list, columns = ['Player Name'])
def get_bowler(date):
    """Scrape the ODI bowling rankings page for `date`.

    Args:
        date: Value interpolated into the ``at`` query parameter of the URL.

    Returns:
        A one-column DataFrame ('Player Name') with the banner (top-ranked)
        player first, followed by the names found in the rankings table.

    Raises:
        selenium.common.exceptions.TimeoutException: if the banner element
            does not appear within 10 seconds of loading the page.
    """
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/bowling?at={date}'
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        # Block until the banner element is in the DOM before reading the
        # page source; page_source itself does not wait for anything.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'rankings-block__banner--name-large')
            )
        )
        html = browser.page_source
    doc = BeautifulSoup(html, "html.parser")
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
    player_list = [find_top.text]
    for item in find_class:
        player_name = item.find("a")
        # Skip cells that carry no link instead of failing on None.text.
        if player_name is not None:
            player_list.append(player_name.text)
    return pd.DataFrame(player_list, columns = ['Player Name'])
def get_allrounder(date):
    """Scrape the ODI all-rounder rankings page for `date`.

    Args:
        date: Value interpolated into the ``at`` query parameter of the URL.

    Returns:
        A one-column DataFrame ('Player Name') with the banner (top-ranked)
        player first, followed by the names found in the rankings table.

    Raises:
        selenium.common.exceptions.TimeoutException: if the banner element
            does not appear within 10 seconds of loading the page.
    """
    url = f'https://www.icc-cricket.com/rankings/mens/player-rankings/odi/all-rounder?at={date}'
    with Chrome(options=chrome_options) as browser:
        browser.get(url)
        # Block until the banner element is in the DOM before reading the
        # page source; page_source itself does not wait for anything.
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, 'rankings-block__banner--name-large')
            )
        )
        html = browser.page_source
    doc = BeautifulSoup(html, "html.parser")
    find_top = doc.find('div', class_='rankings-block__banner--name-large')
    find_class = doc.find_all("td", class_ = 'table-body__cell rankings-table__name name')
    player_list = [find_top.text]
    for item in find_class:
        player_name = item.find("a")
        # Skip cells that carry no link instead of failing on None.text.
        if player_name is not None:
            player_list.append(player_name.text)
    return pd.DataFrame(player_list, columns = ['Player Name'])
# Store each day's rankings as CSV files under <base>\<year>\<month>\<day>.
for date in date_list:
    year = date.year
    month = date.month
    day = date.day
    # Reduce the Timestamp to YYYY-MM-DD so the URLs built by the scraper
    # functions contain only the date, without a time component.
    date = date.strftime("%Y-%m-%d")
    newpath2 = rf'C:\Users\divya\OneDrive\Desktop\8th Sem\ISB assignment\{year}\{month}\{day}'
    # makedirs creates the year and month folders as needed; exist_ok makes
    # re-runs safe without a separate existence check per level.
    os.makedirs(newpath2, exist_ok=True)
    get_batsmen(date).to_csv(os.path.join(newpath2, 'batsmen.csv'))
    get_bowler(date).to_csv(os.path.join(newpath2, 'bowler.csv'))
    get_allrounder(date).to_csv(os.path.join(newpath2, 'allrounder.csv'))
CodePudding user response:
You can use time.sleep(10) to pause for 10 seconds. If you want to wait until a specific part of the page has loaded, you can use:
WebDriverWait(driver, 500).until(EC.presence_of_element_located((By.XPATH, 'XPath of that element/Path')))
It will wait up to 500 seconds for that element to load.
Instead of browser.implicitly_wait(10), use time.sleep(10); it will work.
