Handling HTML in python-CodePudding

I ran into a problem when I extracted an HTML table and imported it into Excel.

This is the site I need to get information from: https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html

As you can see, in the GDP table the header row named 年份 is split across two lines.

That's why, after I exported the data, the Excel file showed unexpected results.

What I want is for the first line in Excel to contain only: 年份, GDP(美元), 占世界%

Sorry for my confusing explanation, I really don't know how to explain it in detail.

Here is my python code

import requests
from bs4 import BeautifulSoup
import lxml
import csv

def get_html(url):
    """Download *url* and return the response body as text.

    The response encoding is set from ``apparent_encoding`` (the encoding
    guessed from the body) before the text is decoded.

    Returns the literal string ``"fail"`` when the request itself fails
    (connection problem, timeout, invalid URL, or an HTTP error status).
    """
    try:
        # Without a timeout a stalled server would block the script forever.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures map to the "fail" sentinel; Ctrl-C and
        # programming errors are no longer silently swallowed.
        return "fail"
    r.encoding = r.apparent_encoding
    return r.text
    
def getGDP(ulist,html):
    """Append one list per ``<tr>`` found in *html* to *ulist*.

    Every child node of a row contributes its ``.string`` value, except
    children whose ``.string`` is a lone newline, which are skipped.
    """
    parsed = BeautifulSoup(html, "html.parser")
    for table_row in parsed.find_all('tr'):
        cells = [
            child.string
            for child in table_row
            if child.string != '\n'
        ]
        ulist.append(cells)
        
def saveGDP(ulist):
    """Write the rows in *ulist* to '21095010 胡碧玉 GDP.csv' in the current directory.

    The file is opened with the platform default encoding; characters that
    cannot be encoded are dropped (``errors='ignore'``).
    """
    with open('21095010 胡碧玉 GDP.csv', mode='w', errors='ignore', newline='') as out:
        csv.writer(out).writerows(ulist)
        
def main():
    """Fetch the GDP page, collect its table rows, and save them as CSV."""
    page = get_html('https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html')
    rows = []
    getGDP(rows, page)
    saveGDP(rows)

# Run the scraper only when this file is executed as a script, not on import.
if __name__=="__main__":
    main()

Thank you so much!

CodePudding user response：

Scraping tables and cleaning the results is in most cases much easier with pandas - under the hood, BeautifulSoup is doing the work for you.

In this case, read the table with read_html(), drop the unwanted header level and filter out the rows containing ads:

import pandas as pd
# read_html() returns a list of DataFrames, one per table found on the page.
# Take the first one and drop the top level of its two-level column header.
df = pd.read_html('https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html')[0].droplevel(0, axis=1)

# Keep only the rows whose first column does not contain 'ads', then write
# them to CSV without the DataFrame index column.
df[~df.iloc[:,0].str.contains('ads')].to_csv('21095010 胡碧玉 GDP.csv', index=False)

Answering your question

You have to select your elements more specifically, e.g. with CSS selectors.

So first get the thead information from all th without colspan, then collect the data from all tr in tbody that do not contain ads:

def getGDP(html):
    """Return the table in *html* as a list of rows, header row first.

    The first row holds the text of every ``<th>`` inside ``<thead>`` that
    has no ``colspan`` attribute. Each following row holds the stripped
    strings of one ``<tbody>`` ``<tr>`` whose text does not contain "ads".
    """
    soup = BeautifulSoup(html, "html.parser")
    header = [cell.text for cell in soup.select('thead th:not([colspan])')]
    body_rows = soup.select('tbody tr:not(:-soup-contains("ads"))')
    return [header, *(list(tr.stripped_strings) for tr in body_rows)]

Example

import requests
from bs4 import BeautifulSoup
import lxml
import csv

def get_html(url):
    """Download *url* and return the response body as text.

    The response encoding is set from ``apparent_encoding`` (the encoding
    guessed from the body) before the text is decoded.

    Returns the literal string ``"fail"`` when the request itself fails
    (connection problem, timeout, invalid URL, or an HTTP error status).
    """
    try:
        # Without a timeout a stalled server would block the script forever.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures map to the "fail" sentinel; Ctrl-C and
        # programming errors are no longer silently swallowed.
        return "fail"
    r.encoding = r.apparent_encoding
    return r.text
    
def getGDP(html):
    """Return the table in *html* as a list of rows, header row first.

    The first row holds the text of every ``<th>`` inside ``<thead>`` that
    has no ``colspan`` attribute. Each following row holds the stripped
    strings of one ``<tbody>`` ``<tr>`` whose text does not contain "ads".
    """
    soup = BeautifulSoup(html, "html.parser")
    header = [cell.text for cell in soup.select('thead th:not([colspan])')]
    body_rows = soup.select('tbody tr:not(:-soup-contains("ads"))')
    return [header, *(list(tr.stripped_strings) for tr in body_rows)]
        
def saveGDP(ulist):
    """Print *ulist* and write its rows to '21095010 胡碧玉 GDP.csv' as UTF-8 CSV.

    The file is created in the current working directory and overwritten
    if it already exists.
    """
    file_name = '21095010 胡碧玉 GDP.csv'
    print(ulist)
    # newline='' leaves line endings to the csv module. Without it, Windows
    # turns the writer's '\r\n' into '\r\r\n' and Excel shows a blank row
    # after every record.
    with open(file_name, 'w', errors='ignore', encoding='utf-8', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerows(ulist)
        
def main():
    """Fetch the GDP page, extract its table, and save it as CSV."""
    page = get_html('https://www.kylc.com/stats/global/yearly_per_country/g_gdp/vnm.html')
    table = getGDP(page)
    saveGDP(table)

# Run the scraper only when this file is executed as a script, not on import.
if __name__=="__main__":
    main()