web-scraping and pagination with python, csv, beautifulsoup and Pandas-CodePudding

The database at https://aviation-safety.net/wikibase/ covers the years 1902 to 2022. The code presented here captures some years but misses others: the years before 1912 and the years after 2021 are not captured. I want to scrape all accidents for each type of aircraft, either for all years or for selected year(s). The web DB starts at https://aviation-safety.net/wikibase/dblist.php?Year=1902 and should end at https://aviation-safety.net/wikibase/dblist.php?Year=2022. Currently, the code dumps the results into a .csv file, but they could also be stored in SQLite.

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):
    """Scrape the accident listing for one year and write it to a CSV file.

    Requests page 1 of the wikibase listing for ``year`` to find out how many
    result pages exist, then walks every page and builds one record per table
    row. The records are written to ``{year}_aviation-safety.csv``; nothing is
    returned.

    Args:
        year: Year to query; interpolated into the ``Year=`` URL parameter.

    Raises:
        AttributeError: If a page has no ``div.pagenumbers`` or no
            ``table.hp`` element (``soup.find`` returns ``None`` for a
            missing element and the following attribute access fails).
        ValueError: If the pagination div contains no links, or a link's
            href does not end in an integer.
    """
    # Browser-like headers, in case the site blocks requests that lack
    # "accept" and "user-agent".
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }

    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    req = requests.get(url, headers=headers)

    soup = BeautifulSoup(req.text, 'html.parser')

    # NOTE: when the page has no pagination div, find() returns None and the
    # expression below raises AttributeError, so that year produces no CSV.
    page_container = soup.find('div', {'class': 'pagenumbers'})

    # Every pagination link's href ends in "...=<page number>"; the largest of
    # those numbers is the last page.
    pages = max(
        int(link['href'].split('=')[-1])
        for link in page_container.find_all('a')
    )

    # Loop-invariant: matches the class attribute of the table rows.
    row_class = re.compile('list.*')

    info = []
    # range() excludes its upper bound, hence the + 1.
    for page in range(1, pages + 1):

        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)

        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')

        table = soup.find('table', {'class': 'hp'})

        for index, row in enumerate(table.find_all('tr', {'class': row_class})):
            # The first matching row on each page is always skipped.
            if index == 0:
                continue

            link = row.find('a')
            acc_link = 'https://aviation-safety.net/' + link['href']

            # Try the link text as a full date first, then retry with "01" and
            # "01-01" prepended (presumably to pad dates whose day or month
            # is missing; verify against the site's formats).
            date_text = link.text.strip()
            acc_date = None
            for prefix in ('', '01', '01-01'):
                try:
                    acc_date = datetime.strptime(prefix + date_text, '%d-%b-%Y').strftime('%Y-%m-%d')
                except ValueError:
                    continue
                break
            if acc_date is None:
                # None of the attempts parsed: drop the row.
                continue

            # Look the cells up once; column 6 is not used.
            cells = row.find_all('td')

            info.append({
                'acc_link': acc_link,
                'acc_date': acc_date,
                'acc_type': cells[1].text,
                'acc_reg': cells[2].text,
                'acc_operator': cells[3].text,
                'acc_fat': cells[4].text,
                'acc_location': cells[5].text,
                'acc_dmg': cells[7].text,
            })

    df = pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv', encoding='utf-8-sig', index=False)


if __name__ == "__main__":

    START = 1901
    STOP = 2023

    # range() excludes its upper bound, so add 1 to make STOP inclusive.
    years = list(range(START, STOP + 1))

    print(f'Scraping {len(years)} years of data')

    # One task per year. Leaving the with-block waits for every task to finish.
    # NOTE: the iterator returned by map() is never consumed, so an exception
    # raised inside a worker is never re-raised here; that year is skipped
    # silently.
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)

CodePudding user response：

Lmao, I wrote that code for someone on this site once before. I've edited it to work for the missing years here:

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures

def scrape_year(year):
    """Scrape the accident listing for one year and return its records.

    Requests page 1 of the wikibase listing for ``year`` to find out how many
    result pages exist (falling back to a single page when no page count can
    be read), then walks every page and builds one dict per table row.

    Args:
        year: Year to query; interpolated into the ``Year=`` URL parameter.

    Returns:
        A list of dicts with the keys ``acc_link``, ``acc_date``,
        ``acc_type``, ``acc_reg``, ``acc_operator``, ``acc_fat``,
        ``acc_location`` and ``acc_dmg``. If anything fails while scraping,
        the error and the URL being processed are printed and an empty list
        is returned, discarding rows already collected for that year.
    """
    # Browser-like headers, in case the site blocks requests that lack
    # "accept" and "user-agent".
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }

    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    # Tracks the URL in flight so the error message names the failing page.
    current_url = url

    try:
        req = requests.get(url, headers=headers)

        soup = BeautifulSoup(req.text, 'html.parser')

        page_container = soup.find('div', {'class': 'pagenumbers'})

        # Every pagination link's href ends in "...=<page number>"; the
        # largest of those numbers is the last page.
        try:
            pages = max(
                int(link['href'].split('=')[-1])
                for link in page_container.find_all('a')
            )
        except (AttributeError, KeyError, ValueError):
            # No pagination div (find() returned None), a link without an
            # href, no links at all, or a non-numeric page value: treat the
            # year as having a single page.
            pages = 1

        # Loop-invariant: matches the class attribute of the table rows.
        row_class = re.compile('list.*')

        info = []
        # range() excludes its upper bound, hence the + 1.
        for page in range(1, pages + 1):

            new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
            current_url = new_url
            print(new_url)

            data = requests.get(new_url, headers=headers)
            soup = BeautifulSoup(data.text, 'html.parser')

            table = soup.find('table', {'class': 'hp'})

            for index, row in enumerate(table.find_all('tr', {'class': row_class})):
                # The first matching row on each page is always skipped.
                if index == 0:
                    continue

                link = row.find('a')
                acc_link = 'https://aviation-safety.net/' + link['href']

                # Try the link text as a full date first, then retry with
                # "01" and "01-01" prepended (presumably to pad dates whose
                # day or month is missing; verify against the site's formats).
                date_text = link.text.strip()
                acc_date = None
                for prefix in ('', '01', '01-01'):
                    try:
                        acc_date = datetime.strptime(prefix + date_text, '%d-%b-%Y').strftime('%Y-%m-%d')
                    except ValueError:
                        continue
                    break
                if acc_date is None:
                    # None of the attempts parsed: drop the row.
                    continue

                # Look the cells up once; column 6 is not used.
                cells = row.find_all('td')

                info.append({
                    'acc_link': acc_link,
                    'acc_date': acc_date,
                    'acc_type': cells[1].text,
                    'acc_reg': cells[2].text,
                    'acc_operator': cells[3].text,
                    'acc_fat': cells[4].text,
                    'acc_location': cells[5].text,
                    'acc_dmg': cells[7].text,
                })

        return info

    except Exception as e:
        # Worker boundary: report the failure and let the other years proceed.
        print(e, current_url)
        return []


if __name__ == "__main__":

    START = 2022
    STOP = 2023

    # range() excludes its upper bound, so add 1 to make STOP inclusive.
    years = list(range(START, STOP + 1))

    print(f'Scraping {len(years)} years of data')

    # One task per year. Leaving the with-block waits for every task to finish.
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)

    # map() yields one list of records per year, in the order of `years`.
    records_per_year = list(final_list)

    # Flatten the per-year lists into one list of record dicts.
    flat_list = [item for sublist in records_per_year for item in sublist]

    df = pd.DataFrame(flat_list)
    df.to_csv('all_years_aviation-safety.csv', index=False)