I am trying to scrape a bunch of baseball statistics and get all of that data into separate DataFrames so I can use it for my project. I am able to get all of the data, but I am having trouble figuring out how to store it in variables and slice it accordingly.
def parse_row(rows):
    """Return the text of every ``<td>`` cell in one table row.

    Args:
        rows: A single row element exposing ``find_all`` (despite the plural
            name, callers pass one ``<tr>`` at a time).

    Returns:
        A list with one string per ``<td>`` cell, in document order. A row
        without ``<td>`` cells yields an empty list.
    """
    # ``str()`` is applied to whatever ``.string`` holds, so a cell whose
    # ``.string`` is None ends up as the literal text 'None'.
    return [str(x.string) for x in rows.find_all('td')]
def soop(url):
    """Download one stats page and return its table rows as a DataFrame.

    The page is fetched with ``requests``, parsed with ``soup`` (presumably
    BeautifulSoup imported under that alias -- confirm against the file's
    imports) using the ``lxml`` parser, and every ``<tr>`` is converted to a
    list of cell strings by ``parse_row``.

    The column names are chosen by looking for a stat identifier inside the
    URL. The ``*_num`` / ``*_col`` names are expected to be defined at module
    level elsewhere in the file.

    Args:
        url: Address of the stats page to scrape.

    Returns:
        A ``pandas.DataFrame`` holding the rows that have a value in every
        column. If no stat identifier matches the URL, 'error' is printed and
        the frame is returned with pandas' default column labels.
    """
    page = requests.get(url)
    text = soup(page.text, features='lxml')
    row = text.find_all('tr')
    data = [parse_row(rows) for rows in row]
    df = pd.DataFrame(data)
    # Rows shorter than the widest row (e.g. rows without any <td> cells)
    # are padded with missing values by pandas; drop them here.
    df = df.dropna()
    if dp_num in url:
        df.columns = dp_col
    elif sb_num in url:
        df.columns = sb_col
    elif hr_num in url:
        df.columns = hr_col
    elif obp_num in url:
        df.columns = obp_col
    elif b2_num in url:
        df.columns = b2_col
    elif b3_num in url:
        df.columns = b3_col
    elif era_num in url:
        df.columns = era_col
    elif fld_num in url:
        df.columns = fld_col
    else:
        print('error')
    return df
# ncaa scraping function
def scrape(id_num):
    """Scrape and print every page of one team stat for divisions 1, 2 and 3.

    For each division the un-suffixed first page is fetched, followed by the
    numbered pages ``/p2`` .. ``/pN``. Each resulting DataFrame is printed.

    Args:
        id_num: Stat identifier as a string; it is appended to the base URL.

    Returns:
        None. The DataFrames are only printed.
    """
    # (division, last page number). These reproduce the page counts that
    # were hard-coded in the original counter-based loops: 6 pages for d1,
    # 5 for d2 and 8 for d3.
    divisions = (('d1', 6), ('d2', 5), ('d3', 8))
    for division, last_page in divisions:
        url = 'https://www.ncaa.com/stats/baseball/' + division + '/current/team/' + id_num
        # The first page has no '/pN' suffix.
        df = soop(url)
        print(df)
        for page_num in range(2, last_page + 1):
            df = soop(url + '/p' + str(page_num))
            print(df)
All of the code works, and I get no errors, but I would like to store the data in variables instead of printing it, so that I end up with a separate DataFrame for each stat page I scraped. I have no clue where to start, though. I have seen on here that maybe I should try appending it to a list? I am a statistics major in college, and I am pretty new to programming. Any help is appreciated.
CodePudding user response:
To store DataFrames in variables, you would have to construct a list or dictionary to hold them.
With that being said, I probably wouldn't store the tables in variables, but rather write them to a database or to CSV files so that you have the data available locally. Otherwise, you'd have to run the scrape every time to get the data. Pandas can handle that for you (as well as parse the tables with .read_html()).
I'm not sure exactly what data you want or how you want it (I'm also surprised not to see an API here for getting that data), but this will grab it and store it in folders with the following structure:
-data
-d1
-INDIVIDUAL STATISTICS
csv files
...
...
-TEAM STATISTICS
.csv files
...
...
-d2
-INDIVIDUAL STATISTICS
csv files
...
...
-TEAM STATISTICS
csv files
...
...
-d3
-INDIVIDUAL STATISTICS
csv files
...
...
-TEAM STATISTICS
csv files
...
...
So looks like this:
Code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
# Step 1: for every division, read the stats filter drop-downs and record the
# URL of each statistic as statsIds_dict[division][stats type][stat title].
statsIds_dict = {}
for division in [1, 2, 3]:
    statsIds_dict[f'd{division}'] = {}
    url = f'https://www.ncaa.com/stats/baseball/d{division}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    statsIds = soup.find_all('div', {'class': 'stats-header__filter'})
    for each in statsIds:
        # The first line of the filter's text is used as the stats type.
        statsType = each.text.split('\n')[0]
        statsIds_dict[f'd{division}'][statsType] = {}
        options = each.find_all('option')
        for option in options:
            # Skip options that carry no link (missing or empty value).
            if option.get('value'):
                statsIds_dict[f'd{division}'][statsType][option.text] = 'https://www.ncaa.com' + option['value']

# Step 2: download every page of every statistic and save one CSV per stat.
for division, v1 in statsIds_dict.items():
    for statsType, v2 in v1.items():
        for statTitle, link in v2.items():
            response = requests.get(link)
            soup = BeautifulSoup(response.text, 'html.parser')
            try:
                # The second-to-last pager entry is read as the page count.
                totPages = int(soup.find('ul', {'class': 'stats-pager'}).find_all('li')[-2].text)
            except (AttributeError, IndexError, ValueError):
                # No pager, too few pager entries, or a non-numeric entry:
                # treat the stat as a single page.
                totPages = 1
            frames = [pd.read_html(link)[0]]
            print(link)
            for page in range(2, totPages + 1):
                frames.append(pd.read_html(link + f'/p{page}')[0])
                print(link + f'/p{page}')
            # DataFrame.append was removed in pandas 2.0; concatenate all
            # pages once and renumber the rows.
            df = pd.concat(frames, ignore_index=True)
            path = f'data/{division}/{statsType}'
            # Check whether the specified path exists or not
            isExist = os.path.exists(path)
            if not isExist:
                # Create a new directory because it does not exist
                os.makedirs(path)
                print(f"The data/{division}/{statsType} directory is created!")
            df.to_csv(f'data/{division}/{statsType}/{division}_{statsType}_{statTitle}.csv', index=False)
            print(f'Saved: {division} {statsType} {statTitle}')

