Home > Back-end >  AttributeError: 'NoneType' object has no attribute 'find_all' (BeautifulSoup) (C
AttributeError: 'NoneType' object has no attribute 'find_all' (BeautifulSoup) (C

Time:01-17

I am trying to run some code, but I am getting the error message AttributeError: 'NoneType' object has no attribute 'find_all' in the part that loops over the pages. I think that JavaScript on the site is detecting that I'm using ChromeDriver and is blocking the request to the web page.

Suggestions for what to do?

cards = []  # one dict per scraped ad
pages = 5  # number of result pages to request

for i in range(pages):  # requests ?o=1 through ?o=pages
    url = 'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o=' + str(i + 1)
    driver.get(url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")
    # NOTE: soup.find() returns None when the page source contains no matching
    # <div>; the .find_all() chained onto it above is then what raises the
    # AttributeError reported below.

    for anuncio in anuncios:
        card = {}

        # get_text / get_link return '' when the element is missing from the ad.
        card['value'] = get_text(anuncio.find('p', {'class':"OLXad-list-price"}))
        card['location'] = get_text(anuncio.find('p', class_="detail-region"))
        card['metragem'] = get_text(anuncio.find('p', class_="detail-specific"))
        card['link'] = get_link(anuncio.find('a', href=True))

        # Keep only the ads for which a price text was found.
        if len(card['value']):
            cards.append(card)

dataset = pd.DataFrame(cards)

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
/tmp/ipykernel_11539/2840841130.py in <module>
      7 
      8     soup = BeautifulSoup(driver.page_source, 'html.parser')
----> 9     anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")
     10 
     11     for anuncio in anuncios:

AttributeError: 'NoneType' object has no attribute 'find_all'

COMPLETE CODE

from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd

def get_text(bs_tag):
    """Return the tag's text with surrounding whitespace, newlines and tabs removed.

    A falsy ``bs_tag`` (for example the ``None`` produced by a failed
    ``find``) yields an empty string instead of raising.
    """
    if not bs_tag:
        return ''

    stripped = bs_tag.get_text().strip()
    without_newlines = stripped.replace('\n', '')
    return without_newlines.replace('\t', '')

def get_link(bs_tag):
    """Return the ``href`` value of ``bs_tag``, or '' when ``bs_tag`` is falsy."""
    return bs_tag['href'] if bs_tag else ''

# Configure a headless, incognito Chrome session.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# NOTE(review): `chromedriver` is not defined anywhere in this listing;
# presumably it holds the path to the ChromeDriver executable. Confirm.
driver = webdriver.Chrome(chromedriver, options=options)

driver.implicitly_wait(5)

cards = []  # one dict per scraped ad
pages = 5  # number of result pages to request

# Request result pages ?o=1 through ?o=pages and collect the ads on each one.
for i in range(pages):
    url = 'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o=' + str(i + 1)
    driver.get(url)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # NOTE: soup.find() returns None when the page source contains no matching
    # <div>; the chained .find_all() is then the call that raises
    # "AttributeError: 'NoneType' object has no attribute 'find_all'".
    anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")

    for anuncio in anuncios:
        card = {}

        # get_text / get_link return '' when the element is missing from the ad.
        card['value'] = get_text(anuncio.find('p', {'class':"OLXad-list-price"}))
        card['location'] = get_text(anuncio.find('p', class_="detail-region"))
        card['metragem'] = get_text(anuncio.find('p', class_="detail-specific"))
        card['link'] = get_link(anuncio.find('a', href=True))

        # Keep only the ads for which a price text was found.
        if len(card['value']):
            cards.append(card)

dataset = pd.DataFrame(cards)

CodePudding user response:

anuncios = soup.find('div', {'class' : 'section_OLXad-list'}).find_all('li', class_="item")

The soup.find() call did not find anything, so it returned None.

So you're effectively trying to say this:

anuncios = None.find_all('li', class_="item")

which of course does not work.

CodePudding user response:

You don't have to use Selenium for this; all the data is stored in a script tag, which can easily be scraped like this:

import requests
from bs4 import BeautifulSoup
import json
import pandas as pd

# Browser-like User-Agent header sent with every request.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"}

final = []  # one dict per listing, across all scraped pages
for page in range(1,5):  # pages 1 through 4

    url = f'https://rj.olx.com.br/rio-de-janeiro-e-regiao/imoveis/aluguel?o={page}'

    landing_page = requests.get(url,headers=headers)
    print(f'Scraping page {page}')
    soup = BeautifulSoup(landing_page.text,'html.parser')

    # The listings are read from the JSON held in the data-json attribute of
    # the <script id="initial-data"> tag.
    script_tag = soup.find('script',{'id':'initial-data'})
    if script_tag is None:
        # soup.find() returns None when nothing matches; fail with a clear
        # message instead of "'NoneType' object is not subscriptable".
        raise RuntimeError(f'initial-data script tag not found on page {page}')
    dirty = script_tag['data-json']
    clean = json.loads(dirty.replace('&quot;','"'))

    data = clean['listingProps']['adList']
    for listing in data:

        # Catch only the KeyError raised by dict.pop() for a missing key, so
        # that KeyboardInterrupt and unrelated errors are no longer swallowed.
        try:
            listing.pop('images') #clean up csv
            listing.pop('properties')
        except KeyError:
            continue #some listings don't have images/properties

        final.append(listing)

df = pd.DataFrame(final)
df.to_csv('output.csv',index=False)
  •  Tags:  
  • Related