I had a problem when I took out html files and imported them into excel.
This is the site i need to get information: https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html
As you can see, in the GDP table I have a row named : 年份 separated from 2 lines
That's why after i exported the excel file it gave unexpected results
The result I want is that the first line in excel will only have : 年份 , GDP(美元), 占世界%
Sorry for my confusing explanation, I really don't know how to explain it in detail.
Here is my python code
import requests
from bs4 import BeautifulSoup
import lxml
import csv
def get_html(url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
r = "fail"
return r
def getGDP(ulist,html):
soup = BeautifulSoup(html,"html.parser")
trs = soup.find_all('tr')
for tr in trs:
list = []
for th in tr:
ts = th.string
if ts == '\n':
continue
list.append(ts)
ulist.append(list)
def saveGDP(ulist):
file_name = '21095010 胡碧玉 GDP.csv'
with open(file_name,'w',errors='ignore',newline='') as f:
f_csv = csv.writer(f)
f_csv.writerows(ulist)
def main():
unifo=[]
url='https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
html=get_html(url)
getGDP(unifo,html)
saveGDP(unifo)
if __name__=="__main__":
main()
Thank you so much!
CodePudding user response:
Using pandas scraping tables and cleaning of results in most cases is mutch easier - under the hood beautifulsoup is working for you.
In this case read_html() the table, drop the unwanted header level and filter out the rows containings ads:
import pandas as pd
df = pd.read_html('https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html')[0].droplevel(0, axis=1)
df[~df.iloc[:,0].str.contains('ads')].to_csv('21095010 胡碧玉 GDP.csv', index=False)
Answering your question
You have to select your elements more specific e.g. with css selectors.
So first get the thead information from all th witout colspan, than collect the data from all tr in tbody that do not contains ads:
def getGDP(html):
soup = BeautifulSoup(html,"html.parser")
data = []
data.append([th.text for th in soup.select('thead th:not([colspan])')])
for row in soup.select('tbody tr:not(:-soup-contains("ads"))'):
data.append(list(row.stripped_strings))
return data
Example
import requests
from bs4 import BeautifulSoup
import lxml
import csv
def get_html(url):
try:
r = requests.get(url)
r.raise_for_status()
r.encoding = r.apparent_encoding
return r.text
except:
r = "fail"
return r
def getGDP(html):
soup = BeautifulSoup(html,"html.parser")
data = []
data.append([th.text for th in soup.select('thead th:not([colspan])')])
for x in soup.select('tbody tr:not(:-soup-contains("ads"))'):
data.append(list(x.stripped_strings))
return data
def saveGDP(ulist):
file_name = '21095010 胡碧玉 GDP.csv'
print(ulist)
with open(file_name,'w',errors='ignore', encoding='utf-8') as f:
f_csv = csv.writer(f)
f_csv.writerows(ulist)
def main():
url='https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html'
html=get_html(url)
saveGDP(getGDP(html))
if __name__=="__main__":
main()
