Download and extract only news from BBC-CodePudding

I need to get all articles from the BBC main page using Selenium in Python. After going through the website's HTML, I was able to extract the sections for the whole page. The problem is that I'm trying to filter out the non-relevant sections, such as the language-switching and "skip to content" links.

My goal is to enter each article and grab its content only using Selenium.

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By

# Filesystem path of the chromedriver executable handed to webdriver.Chrome.
PATH = "C:/Program Files (x86)/chromedriver.exe"
driver = webdriver.Chrome(PATH)
url = 'https://www.bbc.com/'

# Load the BBC front page in the browser session.
driver.get(url)


# Never filled in this snippet; presumably meant to hold the article URLs.
hrefs = []

# Wait up to 10 seconds for <section> elements to be present on the page
# and keep the elements the wait returns.
media_list = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.TAG_NAME, "section"))
)

for item in media_list:
    # The selector string is a placeholder: which CSS selector to use here
    # is exactly what the question asks.
    print(item.find_element_by_css_selector('trying to figure what to write here'))

# Shut the browser down once the sections have been printed.
driver.close()
driver.quit()

CodePudding user response：

If you want to open each article, scrape its data, go back and move on to the next article, your code will not be as simple as what you presented here.
You will need to locate all the article elements again each time you return to the main page.
You will need to scroll the desired article element into view before clicking it.
BTW, presence_of_all_elements_located waits for at least one matching element to appear, not for all the elements matching the passed locator, as you might think. So it's better to use visibility_of_element_located, which waits for a more mature state of the element, and then add a short extra delay so the other elements can finish loading. 1 second will be more than enough here.
Also your locator was wrong.
This should work:

import time

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

# Filesystem path of the chromedriver executable handed to webdriver.Chrome.
PATH = "C:/Program Files (x86)/chromedriver.exe"
# Locator for the article links on the front page.
ARTICLE_LINK = (By.CSS_SELECTOR, "a.block-link__overlay-link")


def _find_article_links(driver):
    """Wait until an article link is visible, then return all article links."""
    WebDriverWait(driver, 10).until(EC.visibility_of_element_located(ARTICLE_LINK))
    # Only one link is guaranteed to be visible at this point; give the
    # remaining ones a moment to load before collecting them.
    time.sleep(1)
    return driver.find_elements(*ARTICLE_LINK)


driver = webdriver.Chrome(PATH)
url = 'https://www.bbc.com/'

try:
    driver.get(url)

    # Placeholder for whatever the scraping step below collects.
    hrefs = []

    # Visit the articles by position: the elements found on one page load
    # cannot be reused after navigating away, so they are looked up again
    # on every pass and picked by index.
    article_count = len(_find_article_links(driver))
    for idx in range(article_count):
        media_list = _find_article_links(driver)
        if idx >= len(media_list):
            # The front page now shows fewer links than on the first load.
            break
        item = media_list[idx]
        # Bring the link into view before clicking it. A new chain is built
        # for each element so it starts with no queued actions.
        ActionChains(driver).move_to_element(item).perform()
        time.sleep(0.5)
        item.click()
        #scrape your data
        driver.execute_script("window.history.go(-1)")
finally:
    # Always end the browser session, even if a wait or click fails.
    driver.quit()

CodePudding user response：

I'll provide an example without Selenium, but I hope you get the idea. First goal: get every news link from the main page.

def get_links():
    """Collect article URLs from the BBC front page.

    Downloads ``https://www.bbc.com`` and takes the ``href`` of the first
    ``<a>`` inside every ``<li class="media-list__item">``. Relative hrefs
    are resolved against the site URL; absolute ones are kept.

    Returns:
        list[str]: Absolute URLs in page order. List items without a link
        or without an ``href`` are skipped.
    """
    from urllib.parse import urljoin

    url = 'https://www.bbc.com'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features='lxml')
    links = []
    for html in soup.find_all('li', class_='media-list__item'):
        anchor = html.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Nothing to follow in this list item.
            continue
        # urljoin leaves absolute URLs alone and inserts the missing "/"
        # for relative paths, which plain concatenation would not.
        links.append(urljoin(url, href))
    return links

Output:

    https://www.bbc.com/news/world-asia-60034170
    https://www.bbc.com/news/world-australia-60027360
    https://www.bbc.com/news/world-europe-60030615
    https://www.bbc.com/worklife/article/20220114-can-sleep-leadership-help-banish-burnout
    https://www.bbc.com/culture/article/20220117-what-happens-to-fascist-architecture-after-fascism
    https://www.bbc.com/news/world-latin-america-59944126
    https://www.bbc.com/news/business-60036831
    https://www.bbc.com/news/uk-60033012
    https://www.bbc.com/sport/live/tennis/58856643
    https://www.bbc.com/sport/live/football/60036185
    ...
    https://www.bbc.com/news/world-africa-59703123

Now that we have every href from the main page, let's scrape the heading and body of the articles:

def get_page(url):
    """Print the headline and body text of one article page.

    Args:
        url: Address of the page to download.

    The headline is read from ``<h1 id="main-heading">`` and the body from
    every ``<div data-component="text-block">``. A page without that
    heading element prints no headline instead of raising.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features='lxml')
    heading = soup.find('h1', {"id": "main-heading"})
    # find() gives None when nothing matches, so check before reading text.
    if heading is not None:
        print(heading.getText()) #or write it to file
    for block in soup.find_all('div', {"data-component": "text-block"}):
        print(block.getText()) #or write it to file

Last goal: we only need the news links, so simply filter for them:

# Keep only the front-page links whose URL contains "/news/", then print
# each of those articles.
news_links = [candidate for candidate in get_links() if "/news/" in candidate]
for news_link in news_links:
    get_page(news_link)

Full code:

import requests
from bs4 import BeautifulSoup


def get_links():
    """Collect article URLs from the BBC front page.

    Downloads ``https://www.bbc.com`` and takes the ``href`` of the first
    ``<a>`` inside every ``<li class="media-list__item">``. Relative hrefs
    are resolved against the site URL; absolute ones are kept.

    Returns:
        list[str]: Absolute URLs in page order. List items without a link
        or without an ``href`` are skipped.
    """
    from urllib.parse import urljoin

    url = 'https://www.bbc.com'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features='lxml')
    links = []
    for html in soup.find_all('li', class_='media-list__item'):
        anchor = html.find('a')
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Nothing to follow in this list item.
            continue
        # urljoin leaves absolute URLs alone and inserts the missing "/"
        # for relative paths, which plain concatenation would not.
        links.append(urljoin(url, href))
    return links


def get_page(url):
    """Print the headline and body text of one article page.

    Args:
        url: Address of the page to download.

    The headline is read from ``<h1 id="main-heading">`` and the body from
    every ``<div data-component="text-block">``. A page without that
    heading element prints no headline instead of raising.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, features='lxml')
    heading = soup.find('h1', {"id": "main-heading"})
    # find() gives None when nothing matches, so check before reading text.
    if heading is not None:
        print(heading.getText())
    for block in soup.find_all('div', {"data-component": "text-block"}):
        print(block.getText())

# Keep only the front-page links whose URL contains "/news/", then print
# each of those articles.
news_links = [candidate for candidate in get_links() if "/news/" in candidate]
for news_link in news_links:
    get_page(news_link)