This website's database (https://aviation-safety.net/wikibase/) covers the years 1902 to 2022. The code presented here captures some years but misses others: the years before 1912 and the years after 2021 are not captured. I want to scrape all accidents for each type of aircraft, either for all years or for selected year(s). The web DB starts at https://aviation-safety.net/wikibase/dblist.php?Year=1902 and should end at https://aviation-safety.net/wikibase/dblist.php?Year=2022. Currently, the code dumps the results to a .csv file, but the output could also go to SQLite.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
def scrape_year(year):
    """Scrape all wikibase accident rows for ``year`` and write them to CSV.

    Fetches every result page of
    ``https://aviation-safety.net/wikibase/dblist.php?Year=<year>`` and saves
    the parsed rows to ``<year>_aviation-safety.csv`` in the working
    directory.

    Args:
        year: Year to request; interpolated into the ``Year`` query parameter.

    Returns:
        None. The result is the CSV file written as a side effect.

    Raises:
        requests.RequestException: If a page cannot be fetched (network
            error, timeout, or an HTTP error status).
    """
    # Browser-like headers, in case the site rejects requests that lack
    # "accept" / "user-agent".
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    # A timeout keeps one stalled connection from blocking a worker forever.
    req = requests.get(url, headers=headers, timeout=30)
    req.raise_for_status()
    pages = _count_pages(BeautifulSoup(req.text, 'html.parser'))

    row_class = re.compile('list.*')  # loop-invariant, so compiled once
    info = []
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)
        data = requests.get(new_url, headers=headers, timeout=30)
        data.raise_for_status()
        soup = BeautifulSoup(data.text, 'html.parser')
        table = soup.find('table', {'class': 'hp'})
        if table is None:
            # No results table on this page: nothing to collect.
            continue
        rows = table.find_all('tr', {'class': row_class})
        # The first matching row is skipped, as in the original code
        # (presumably a header row -- confirm against the live markup).
        for row in rows[1:]:
            item = _parse_row(row)
            if item is not None:
                info.append(item)

    df = pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv', encoding='utf-8-sig', index=False)


def _count_pages(soup):
    """Return the highest page number linked in the pager, or 1 if none."""
    container = soup.find('div', {'class': 'pagenumbers'})
    if container is None:
        # No pager div at all: treat the listing as a single page instead of
        # crashing on ``None.find_all``.
        return 1
    numbers = []
    for link in container.find_all('a'):
        # The page number is whatever follows the last "=" in the href.
        try:
            numbers.append(int(link.get('href', '').split('=')[-1]))
        except ValueError:
            continue
    return max(numbers, default=1)


def _parse_acc_date(text):
    """Return ``text`` as ``YYYY-MM-DD``, or None if no known layout fits."""
    # Try the text as-is, then with a default day, then with a default day
    # and month prepended, for dates whose leading parts are missing.
    for prefix in ('', '01', '01-01'):
        try:
            parsed = datetime.strptime(prefix + text, '%d-%b-%Y')
        except ValueError:
            continue
        return parsed.strftime('%Y-%m-%d')
    return None


def _parse_row(row):
    """Return the accident record for one table row, or None to skip it."""
    link = row.find('a')
    cells = row.find_all('td')
    if link is None or len(cells) < 8:
        # Not a complete data row (no link, or too few columns).
        return None
    acc_date = _parse_acc_date(link.text.strip())
    if acc_date is None:
        return None
    return {
        'acc_link': 'https://aviation-safety.net/' + link['href'],
        'acc_date': acc_date,
        'acc_type': cells[1].text,
        'acc_reg': cells[2].text,
        'acc_operator': cells[3].text,
        'acc_fat': cells[4].text,
        'acc_location': cells[5].text,
        # Column 6 is intentionally not collected, matching the original.
        'acc_dmg': cells[7].text,
    }
if __name__ == "__main__":
    # Inclusive range of years to scrape.
    START = 1901
    STOP = 2023
    years = list(range(START, STOP + 1))
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        # Submit one task per year and inspect every future: an unconsumed
        # ``executor.map`` result hides worker exceptions, so failed years
        # would otherwise be skipped without any message.
        futures = {executor.submit(scrape_year, year): year for year in years}
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                # Report and keep going so one bad year does not stop the rest.
                print(f'Year {futures[future]} failed: {exc}')
CodePudding user response:
Lmao, I wrote that code for someone on this site once before. I've edited it to work for the missing years here:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
def scrape_year(year):
    """Scrape all wikibase accident rows for ``year`` and return them.

    Fetches every result page of
    ``https://aviation-safety.net/wikibase/dblist.php?Year=<year>`` and
    collects one dict per parsed row.

    Args:
        year: Year to request; interpolated into the ``Year`` query parameter.

    Returns:
        A list of dicts with the keys ``acc_link``, ``acc_date``,
        ``acc_type``, ``acc_reg``, ``acc_operator``, ``acc_fat``,
        ``acc_location`` and ``acc_dmg``. If anything fails while scraping
        the year, the error is printed and an empty list is returned.
    """
    # Browser-like headers, in case the site rejects requests that lack
    # "accept" / "user-agent".
    headers = {
        'accept': '*/*',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    # Bound before the ``try`` so the error handler can always print it.
    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    try:
        # A timeout keeps one stalled connection from blocking a worker forever.
        req = requests.get(url, headers=headers, timeout=30)
        req.raise_for_status()
        pages = _count_pages(BeautifulSoup(req.text, 'html.parser'))

        row_class = re.compile('list.*')  # loop-invariant, so compiled once
        info = []
        for page in range(1, pages + 1):
            new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
            print(new_url)
            data = requests.get(new_url, headers=headers, timeout=30)
            data.raise_for_status()
            soup = BeautifulSoup(data.text, 'html.parser')
            table = soup.find('table', {'class': 'hp'})
            if table is None:
                # No results table on this page: nothing to collect.
                continue
            rows = table.find_all('tr', {'class': row_class})
            # The first matching row is skipped, as in the original code
            # (presumably a header row -- confirm against the live markup).
            for row in rows[1:]:
                item = _parse_row(row)
                if item is not None:
                    info.append(item)
        return info
    except Exception as e:
        # Deliberate best-effort boundary: report the failing year and let
        # the other years continue.
        print(e, url)
        return []


def _count_pages(soup):
    """Return the highest page number linked in the pager, or 1 if none."""
    container = soup.find('div', {'class': 'pagenumbers'})
    if container is None:
        # No pager div at all: treat the listing as a single page.
        return 1
    numbers = []
    for link in container.find_all('a'):
        # The page number is whatever follows the last "=" in the href.
        try:
            numbers.append(int(link.get('href', '').split('=')[-1]))
        except ValueError:
            continue
    return max(numbers, default=1)


def _parse_acc_date(text):
    """Return ``text`` as ``YYYY-MM-DD``, or None if no known layout fits."""
    # Try the text as-is, then with a default day, then with a default day
    # and month prepended, for dates whose leading parts are missing.
    for prefix in ('', '01', '01-01'):
        try:
            parsed = datetime.strptime(prefix + text, '%d-%b-%Y')
        except ValueError:
            continue
        return parsed.strftime('%Y-%m-%d')
    return None


def _parse_row(row):
    """Return the accident record for one table row, or None to skip it."""
    link = row.find('a')
    cells = row.find_all('td')
    if link is None or len(cells) < 8:
        # Not a complete data row (no link, or too few columns).
        return None
    acc_date = _parse_acc_date(link.text.strip())
    if acc_date is None:
        return None
    return {
        'acc_link': 'https://aviation-safety.net/' + link['href'],
        'acc_date': acc_date,
        'acc_type': cells[1].text,
        'acc_reg': cells[2].text,
        'acc_operator': cells[3].text,
        'acc_fat': cells[4].text,
        'acc_location': cells[5].text,
        # Column 6 is intentionally not collected, matching the original.
        'acc_dmg': cells[7].text,
    }
if __name__ == "__main__":
    # Inclusive range of years to scrape.
    START = 2022
    STOP = 2023
    years = list(range(START, STOP + 1))
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        # ``map`` yields one list of records per year, in the order of ``years``.
        final_list = executor.map(scrape_year, years)
        list_of_dicts = list(final_list)
    # Flatten the per-year lists into one list of records.
    flat_list = [item for sublist in list_of_dicts for item in sublist]
    df = pd.DataFrame(flat_list)
    df.to_csv('all_years_aviation-safety.csv', index=False)
