Scraping data, appending to list, and making it a dataframe-CodePudding

I am trying to scrape a bunch of baseball statistics and get all of that data into separate DataFrames so I can use it for my project. I am able to get all of the data, but I am having trouble figuring out how to store it in variables and slice it accordingly.

def parse_row(rows):
    """Return the contents of every ``<td>`` cell in one table row as strings.

    Args:
        rows: A single parsed row element exposing ``find_all``.

    Returns:
        A list with ``str(cell.string)`` for each ``<td>`` cell, in document
        order. A cell whose ``.string`` is ``None`` becomes the text ``'None'``.
    """
    cell_texts = []
    for cell in rows.find_all('td'):
        cell_texts.append(str(cell.string))
    return cell_texts

def soop(url):
    """Download *url*, turn its table rows into a DataFrame, and label the columns.

    Every ``<tr>`` on the page is converted with ``parse_row``; rows that end up
    with a missing value are dropped. The column labels are chosen by checking
    which stat identifier occurs in *url*; if none does, ``'error'`` is printed
    and the frame is returned with its default columns.

    Args:
        url: Address of the stats page to fetch.

    Returns:
        The parsed DataFrame.
    """
    response = requests.get(url)
    document = soup(response.text, features='lxml')
    parsed_rows = [parse_row(tr) for tr in document.find_all('tr')]

    # dropna() discards any row containing a missing cell, e.g. rows that have
    # fewer <td> cells than the widest row on the page.
    frame = pd.DataFrame(parsed_rows).dropna()

    # Checked in order; the first identifier found in the URL wins.
    column_sets = (
        (dp_num, dp_col),
        (sb_num, sb_col),
        (hr_num, hr_col),
        (obp_num, obp_col),
        (b2_num, b2_col),
        (b3_num, b3_col),
        (era_num, era_col),
        (fld_num, fld_col),
    )
    for stat_id, labels in column_sets:
        if stat_id in url:
            frame.columns = labels
            break
    else:
        print('error')
    return frame

# ncaa scraping function
def scrape(id_num):
    """Scrape every page of one NCAA baseball team-stat table for D1, D2 and D3.

    Each page is fetched with ``soop`` and printed, as before; the frames are
    additionally collected and returned so callers can keep working with them
    (for example by combining one division's pages with ``pd.concat``).

    Args:
        id_num: Stat identifier appended to the ``.../current/team/`` URL.
            It is concatenated onto the URL, so it must be a string.

    Returns:
        A dict mapping ``'d1'``, ``'d2'`` and ``'d3'`` to a list of DataFrames,
        one per page, in page order.
    """
    # (division, number of pages) -- the same page counts the original
    # hand-maintained loop counters encoded: 6 for D1, 5 for D2, 8 for D3.
    page_counts = (('d1', 6), ('d2', 5), ('d3', 8))

    frames = {}
    for division, total_pages in page_counts:
        base_url = ('https://www.ncaa.com/stats/baseball/' + division
                    + '/current/team/' + id_num)
        division_frames = []
        for page_num in range(1, total_pages + 1):
            # The first page has no suffix; later pages are /p2, /p3, ...
            if page_num == 1:
                page_url = base_url
            else:
                page_url = base_url + '/p' + str(page_num)
            df = soop(page_url)
            print(df)
            division_frames.append(df)
        frames[division] = division_frames
    return frames

All of the code works, and I get no errors, but I would like to store the data in variables instead of printing it, so that I end up with a separate DataFrame for each stat page I scraped. I have no clue where to start. I have seen on here that maybe I should try appending the frames to a list? I am a statistics major in college, and I am pretty new to programming. Any help is appreciated.

CodePudding user response：

To store dataframes into variables, you would have to construct a list or dictionary to store the dataframes.

With that being said, I probably wouldn't store the tables into variables, but rather write to a database or csv files so that you have the data locally available. Otherwise you'd have to run the scrape every time to get the data. Pandas can handle that for you (as well as parse the tables with .read_html()).

Not sure exactly what data you want or how you want it (I'm also surprised to not see an api here to get that data), but this will grab it and store it into folders with the structure of:

-data
    -d1
        -INDIVIDUAL STATISTICS
             csv files
             ...
             ...
        -TEAM STATISTICS
             csv files
             ...
             ...
    -d2
        -INDIVIDUAL STATISTICS
             csv files
             ...
             ...
        -TEAM STATISTICS
             csv files
             ...
             ...
    -d3
        -INDIVIDUAL STATISTICS
             csv files
             ...
             ...
        -TEAM STATISTICS
             csv files
             ...
             ...

So looks like this:

Code:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

# Step 1: build {division: {stats type: {stat title: url}}} from the drop-down
# filters found on each division's stats landing page.
statsIds_dict = {}
for division in [1,2,3]:
    statsIds_dict[f'd{division}'] = {}
    url = f'https://www.ncaa.com/stats/baseball/d{division}/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    statsIds = soup.find_all('div', {'class':'stats-header__filter'})
    for each in statsIds:
        # The first line of the filter's text is used as the stats-type name.
        statsType = each.text.split('\n')[0]
        statsIds_dict[f'd{division}'][statsType] = {}
        options = each.find_all('option')
        for option in options:
            # Options with an empty value carry no link and are skipped.
            if option['value']:
                statsIds_dict[f'd{division}'][statsType][option.text] = 'https://www.ncaa.com' + option['value']

# Step 2: download every page of every stat table and write one CSV per stat.
for division, v1 in statsIds_dict.items():
    for statsType, v2 in v1.items():
        for statTitle, link in v2.items():
            response = requests.get(link)
            soup = BeautifulSoup(response.text, 'html.parser')
            try:
                # The second-to-last pager item is read as the last page number.
                totPages = int(soup.find('ul', {'class':'stats-pager'}).find_all('li')[-2].text)
            except (AttributeError, IndexError, ValueError):
                # No pager, too few pager items, or non-numeric text: one page.
                totPages = 1

            frames = [pd.read_html(link)[0]]
            print(link)
            for page in range(2, totPages + 1):
                page_link = link + f'/p{page}'
                frames.append(pd.read_html(page_link)[0])
                print(page_link)
            # DataFrame.append was removed in pandas 2.0; concatenating once
            # also avoids re-copying the growing frame on every page.
            df = pd.concat(frames).reset_index(drop=True)

            path = f'data/{division}/{statsType}'
            if not os.path.exists(path):
                # exist_ok guards against the directory appearing between the
                # check above and the creation below.
                os.makedirs(path, exist_ok=True)
                print(f"The data/{division}/{statsType} directory is created!")

            df.to_csv(f'data/{division}/{statsType}/{division}_{statsType}_{statTitle}.csv' , index=False)
            print(f'Saved: {division} {statsType} {statTitle}')