I'm trying to scrap data from this website: https://www.aliexpress.com/wholesale?catId=0&initiative_id=AS_20220313071939&SearchText=bluetooth earphones Especially I want to get all reviews from each product page. The main issue is that I'm struggling to get this surrounded bottom in order to scrape each comment and customer country: Here is a photo showing that: enter image description here
This is my code :
from selenium import webdriver
from lxml import html
import cssselect
from time import sleep
from itertools import zip_longest
import csv
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones<ype=wholesale&SortType=default&page={}'
with open ("data.csv", "w", encoding="utf-8") as csvfile:
wr = csv.writer(csvfile)
wr.writerow(["Title","Price", "Currency", "Reviews", "Number of orders", "Shipping Cost", "Product links", "Country","Comments"])
for page_nb in range(1, 4):
print('---', page_nb, '---')
driver.get(url.format(page_nb))
sleep(2)
current_offset = 0
while True:
driver.execute_script("window.scrollBy(0, window.innerHeight);")
sleep(.5)
new_offset = driver.execute_script("return window.pageYOffset;")
print(new_offset,current_offset)
if new_offset <= current_offset:
break
current_offset = new_offset
sleep(3)
tree = html.fromstring(driver.page_source)
results = []
for product in tree.xpath('//div[@]//a'):
title = product.xpath('.//h1/text()')
if title:
title = title[0]
price = product.cssselect('div.mGXnE._37W_B span')
price = [x.text for x in price]
currency = price[0]
price = ''.join(price[1:])
review = product.xpath('.//span[@]/text()')
if review:
review = review[0]
else:
review = ''
nb_sold = product.xpath('.//span[@]/text()')
if nb_sold:
nb_sold = nb_sold[0]
else:
nb_sold = ''
ship_cost = product.xpath('.//span[@]/text()')
if ship_cost:
ship_cost = ship_cost[0]
else:
ship_cost = ''
###########################################
links = product.xpath('//div[@]//a/@href')
if links:
links = links[0]
else:
links = ''
# scraping data from each inner page
for link in links :
driver.get(link)
sleep(2)
current_offset = 0
while True:
driver.execute_script("window.scrollBy(0, window.innerHeight);")
sleep(.5)
new_offset = driver.execute_script("return window.pageYOffset;")
print(new_offset,current_offset)
if new_offset <= current_offset:
break
current_offset = new_offset
sleep(3)
tree = html.fromstring(driver.page_source)
for cmt in tree.xpath('//*[@id="transction-feedback"]/div[5]/div[1]'):
country = cmt.xpath('.//div[@]//b/text()')
if country:
country = country[0]
else:
country = ''
comment = cmt.xpath('.//span[@id="0.0.0.i4.5dc4sSFDsSFD5B"]/text()')
if comment:
comment = comment[0]
else:
comment = ''
row = [title, price, currency, review, nb_sold, ship_cost, links,country, comment]
results.append(row)
print('len(results):', len(results))
wr.writerows(results)
driver.close()
CodePudding user response:
There are two problems:
First:
You have to use html.fromstring(driver.page_source) AFTER you scroll down.
Second:
It adds items only when they are displayed inside window (in viewport) so you can't jump directly to the end of page. You have to scroll partially (in loop) using i.e. window.innerHeight.
current_offset = 0
while True:
driver.execute_script("window.scrollBy(0, window.innerHeight);")
sleep(.5) # JavaScript has time to add elements
new_offset = driver.execute_script("return window.pageYOffset;")
#print(new_offset,current_offset)
if new_offset <= current_offset:
break
current_offset = new_offset
Full working code with other changes in xpath.
It gives me 60 items on every page.
from selenium import webdriver
from lxml import html
from time import sleep
from itertools import zip_longest
import csv
driver = webdriver.Edge(executable_path=r"C:/Users/OUISSAL/Desktop/wscraping/XEW/scraping/codes/msedgedriver")
#driver = webdriver.Firefox()
url = 'https://www.aliexpress.com/wholesale?trafficChannel=main&d=y&CatId=0&SearchText=bluetooth earphones<ype=wholesale&SortType=default&page={}'
with open ("data.csv", "w", encoding="utf-8") as csvfile:
wr = csv.writer(csvfile)
wr.writerow(["Title","Price", "Currency", "Reviews", "Number of orders"])
for page_nb in range(1, 4):
print('---', page_nb, '---')
driver.get(url.format(page_nb))
sleep(2)
# jump to the end of page
#driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
# scroll partially
current_offset = 0
while True:
driver.execute_script("window.scrollBy(0, window.innerHeight);")
sleep(.5) # JavaScript has time to add elements
new_offset = driver.execute_script("return window.pageYOffset;")
print(new_offset,current_offset)
if new_offset <= current_offset:
break
current_offset = new_offset
sleep(3)
tree = html.fromstring(driver.page_source)
results = []
for product in tree.xpath('//div[@]//a'):
title = product.xpath('.//h1/text()')
#print('[DEBUG] title:', title)
if title:
title = title[0]
#print('[DEBUG] title:', title)
price = product.cssselect('div.mGXnE._37W_B span')
price = [x.text for x in price]
# for `$ 35.00`
currency = price[0]
price = ''.join(price[1:])
# for `35.00 zł`
#currency = price[-1]
#price = ''.join(price[:-1])
#print('[DEBUG] price:', price)
#print('[DEBUG] currency:', currency)
review = product.xpath('.//span[@]/text()')
if review:
review = review[0]
else:
review = ''
#print('[DEBUG] review:', review)
nb_sold = product.xpath('.//span[@]/text()')
if nb_sold:
nb_sold = nb_sold[0]
else:
nb_sold = ''
#print('[DEBUG] nb_sold:', nb_sold)
row = [title, price, currency, review, nb_sold]
results.append(row)
#print('[DEBUG] row:', row)
print('len(results):', len(results))
wr.writerows(results)
driver.close()
