I have run into a problem and I can't figure out how to get any further.
I have scraped multiple pages for a company's name, location and province, along with a link to additional information on another page. The page behind each collected link provides 3 more pieces of information that I require.
I need to access the link, and take out the address, phone number (if it has one) and a CNAE code, and append that to the previous data.
The working script for the first scrape I currently have is as follows:
import requests
from bs4 import BeautifulSoup
# Listing pages to scrape: the index page followed by the numbered pages 2..64.
baseurl = ["https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html"]
urls = [
    f'https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{i}.html'
    for i in range(2, 65)
]
allurls = baseurl + urls
print(allurls)

for url in allurls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    # Rows are the <ul> elements found under div#simulacion_tabla.
    lists = soup.select("div#simulacion_tabla ul")

    # scrape the pages
    for lis in lists:
        title = lis.find('li', class_="col1").text
        location = lis.find('li', class_="col2").text
        province = lis.find('li', class_="col3").text
        # The first column also carries the anchor pointing at the detail page.
        link = lis.select("li.col1 a")[0]['href']
        info = [title, location, province, link]
        print(info)
On the second page the data is in a table with the id names below. This is the code I thought I would need to use, but it isn't working and I am going round in circles trying to figure out why:
# Non-working attempt from the question, kept verbatim; the notes below flag what goes wrong.
# `soup` is still the listing page parsed earlier; the page behind `link` is never requested here.
section = soup.select("section#datos_empresa")
# `link` is a single href string, so iterating over it would walk its characters, not URLs.
lslinks = link
# NOTE(review): the `for` header has no trailing colon, so this does not parse as Python.
for ls in lslinks
# The lookups below run against `lis` (a listing-page <ul>), not against `section`.
# NOTE(review): BeautifulSoup filters on the id attribute via `id=`; `id_=` presumably
# looks for an attribute literally named "id_" -- confirm against the bs4 documentation.
location = lis.find('tr', id_="tamano_empresa").text
cnae = lis.find('tr', id_="cnae_codigo_empresa").text
phone = lis.find('tr', id_="telefono_empresa").text
addinfo = [location, cnae, phone]
# append() adds the whole list as one nested element rather than three extra fields.
info.append(addinfo)
Here's an example of one of the links
Ideally the output would be:
['AGRICOLA CALLEJA SL', 'CARPIO', 'VALLADOLID', 'https://www.expansion.com/directorio-empresas/agricola-calleja-sl_1480101_A02_47.html', 'C/ LA TORRE, 2.', '150', '983863247']
which I would write to a text file so I can import it to excel.
Any help would be greatly appreciated!
Cheers!
CodePudding user response:
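In your sub page, you were selecting the section by ID (`section#datos_empresa`) rather than by class (`section.datos_empresa`), so it was failing to match any entries. The values are also held in `td` cells, so look those up with `id=` instead of searching for `tr`.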
Your logic for the sub page needs to be combined with your main page. Try the following:
import requests
from bs4 import BeautifulSoup
import csv
# Scrape every listing page, follow each company link, and write one CSV row per company.
with open('output.csv', 'w', newline='', encoding='utf-8') as f_output:
    csv_output = csv.writer(f_output)
    # NOTE: the second "Location" column receives the `tamano_empresa` cell of the detail page.
    csv_output.writerow(["Title", "Location", "Province", "Link", "Location", "cnae", "Phone"])

    # Index page first, then the numbered listing pages 2..64.
    urls = ["https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html"]
    urls.extend(f'https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{i}.html' for i in range(2, 65))

    for url in urls:
        print(url)
        r_main = requests.get(url)
        soup_main = BeautifulSoup(r_main.content, "html.parser")

        for lis in soup_main.select("div#simulacion_tabla ul"):
            title = lis.find('li', class_="col1").text
            location = lis.find('li', class_="col2").text
            province = lis.find('li', class_="col3").text
            link = lis.select("li.col1 a")[0]['href']
            print(' ', link)

            # Fetch the detail page referenced by the listing row.
            r_sub = requests.get(link)
            soup_sub = BeautifulSoup(r_sub.content, "html.parser")
            section = soup_sub.select_one("section.datos_empresa")

            # Use a separate name so the listing-page `location` above is not
            # overwritten before the row is written.
            size = section.find('td', id="tamano_empresa").text
            cnae = section.find('td', id="cnae_codigo_empresa").text
            # The phone cell is optional; write an empty field when it is absent.
            phone_cell = section.find('td', id="telefono_empresa")
            phone = phone_cell.text if phone_cell is not None else ""

            csv_output.writerow([title, location, province, link, size, cnae, phone])
This will create a CSV output file starting:
Title,Location,Province,Link,Location,cnae,Phone
A CORTIÑA DOS ACIVROS SL,DESCONOCIDO,LUGO,https://www.expansion.com/directorio-empresas/a-cortina-dos-acivros-sl_9163006_A02_27.html,DESCONOCIDO,150,
A CORTIÑA DOS ACIVROS SL,DESCONOCIDO,LUGO,https://www.expansion.com/directorio-empresas/a-cortina-dos-acivros-sl_9163006_A02_27.html,DESCONOCIDO,150,
A P V 19 32 SL,MICROEMPRESA,VALENCIA,https://www.expansion.com/directorio-empresas/a-p-v-19-32-sl_672893_A02_46.html,MICROEMPRESA,150,
ABADIA DE JABUGO SL,DESCONOCIDO,HUELVA,https://www.expansion.com/directorio-empresas/abadia-de-jabugo-sl_5442689_A02_21.html,DESCONOCIDO,150,
ABALOS REAL SLL,MICROEMPRESA,CUENCA,https://www.expansion.com/directorio-empresas/abalos-real-sll_1239004_A02_16.html,MICROEMPRESA,150,969142092
CodePudding user response:
Here is the minimal working solution so far.
Code:
import requests
from bs4 import BeautifulSoup
# Listing pages to scrape: the index page followed by the numbered pages 2..64.
baseurl = ["https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/index.html"]
urls = [
    f'https://www.expansion.com/empresas-de/ganaderia/granjas-en-general/{i}.html'
    for i in range(2, 65)
]
allurls = baseurl + urls

for url in allurls:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    lists = soup.select("div#simulacion_tabla ul")

    # scrape the pages
    for lis in lists:
        title = lis.find('li', class_="col1").text
        location = lis.find('li', class_="col2").text
        province = lis.find('li', class_="col3").text
        link = lis.select_one("li.col1 a")['href']

        # Follow the company link and read the extra fields by element id.
        sub_page = requests.get(link)
        soup2 = BeautifulSoup(sub_page.content, "html.parser")
        direction = soup2.select_one('#direccion_empresa').text
        cnae = soup2.select_one('#cnae_codigo_empresa').text
        # The phone element may be missing; report None in that case.
        phone = soup2.select_one('#telefono_empresa')
        telephone = phone.text if phone else None

        print([title, location, province, link, direction, cnae, telephone])
Output:
['A CORTIÑA DOS ACIVROS SL', 'LUGO', 'LUGO', 'https://www.expansion.com/directorio-empresas/a-cortina-dos-acivros-sl_9163006_A02_27.html', 'CRTA. A CORUÑA, 16.', '150', '']
['A CORTIÑA DOS ACIVROS SL', 'LUGO', 'LUGO', 'https://www.expansion.com/directorio-empresas/a-cortina-dos-acivros-sl_9163006_A02_27.html', 'CRTA. A CORUÑA, 16.', '150', '']
['A P V 19 32 SL', 'VALENCIA', 'VALENCIA', 'https://www.expansion.com/directorio-empresas/a-p-v-19-32-sl_672893_A02_46.html', 'CALLE SALVA, 8 1 2B.', '150', '']
['ABADIA DE JABUGO SL', 'CARTAYA', 'HUELVA', 'https://www.expansion.com/directorio-empresas/abadia-de-jabugo-sl_5442689_A02_21.html', 'URB. MARINA EL ROMPIDO, 31 VILLA M-31. CRTA. EL RO.', '150', '']
['ABALOS REAL SLL', 'CARBONERAS DE GUADAZAON', 'CUENCA', 'https://www.expansion.com/directorio-empresas/abalos-real-sll_1239004_A02_16.html', 'C/ DON CRUZ, 23.', '150', '969142092']
... so on
