Home > Back-end >  Only click next page button after the first page has been scraped and then scrape next page
Only click next page button after the first page has been scraped and then scrape next page

Time:01-23

How would I make Selenium wait for Scrapy to scrape the information needed from the first page, and only then click the next-page button and scrape the next page? Ultimately, I am trying to repeat this process until the last page is reached, which is page 301.

# -*- coding: utf-8 -*-
from typing_extensions import Self
import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which 

# Login credentials that the spider below types into the sign-in form.
# "xxx" is a placeholder value; substitute real credentials locally and
# keep them out of version control.
username = "xxx"
password = "xxx"

class HtSpiderSelenium(scrapy.Spider):
    """Log in with Selenium, click the pager once and scrape the resulting page.

    All browser work happens in ``__init__``: the spider signs in, clicks the
    "next page" link a single time and stores the rendered HTML in
    ``self.html``.  ``parse`` then extracts the startups from that stored
    snapshot, so exactly one page of results is scraped per run.
    """

    name = 'ht_selenium1'
    # Scrapy expects bare domain names here, not URLs with a scheme.
    allowed_domains = ['app.xxx.bootstart.tech']
    start_urls = ['https://app.xxx.bootstart.tech']

    def __init__(self, *args, **kwargs):
        """Sign in through a Chrome session and capture the listing HTML.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``
        so the spider can still be built the way Scrapy builds it.
        """
        super().__init__(*args, **kwargs)

        chrome_options = Options()
        #chrome_options.add_argument("--headless")

        # Start a single browser; a second instance would never be closed.
        driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        try:
            #get login page
            driver.get("https://auth.bootstart.tech/auth/realms/xxxPlatform/protocol/openid-connect/auth?client_id=xxx-platform&redirect_uri=https://app.xxx.bootstart.tech/?redirect_fragment=%2Fstartup&state=8780862b-1eaf-4b6e-92e5-fd9ab464c57f&nonce=79d66ef5-f0bb-4e75-8db2-6402114b9aa8&response_mode=fragment&response_type=code")

            #login
            driver.find_element(By.ID, "username").send_keys(username)
            driver.find_element(By.ID, "password").send_keys(password)
            driver.find_element(By.NAME, "login").click()
            # Give the post-login redirect and the listing time to render.
            sleep(15)

            #next page button
            driver.find_element(By.XPATH, "/html/body/div[2]/div[2]/nav/div/div[2]/ul/li[14]/a").click()
            sleep(10)

            self.html = driver.page_source
        finally:
            # quit() ends the whole browser session even when a step above fails.
            driver.quit()

    #scrape needed info
    def parse(self, response):
        """Yield one item per startup container found in the captured HTML.

        ``response`` is not read; the items come from ``self.html``, the
        page source saved by ``__init__``.
        """
        resp = Selector(text=self.html)
        for startup in resp.xpath("//div[contains(@class, 'col-sm-12')]"):
            yield {
                'startup name': startup.xpath(".//span[contains(@class, 'no-outline ng-binding')]/text()").get(),
                # Relative path: an absolute '//div...' would collect the
                # description text of every startup on the page for each item.
                'startup descript': ''.join(startup.xpath('.//p//div//text()').getall()),
                'startup location': startup.xpath(".//h4//small[@class='ng-binding']//text()").get(),
                'startup industry': startup.xpath(".//h4//span[@class='ng-binding']/text()").get(),
            }
    

CodePudding user response:

You can try to do something like this:

# -*- coding: utf-8 -*-
# from typing_extensions import Self
import scrapy
from scrapy.selector import Selector
# from scrapy_selenium import SeleniumRequest
from time import sleep
from turtle import delay
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from shutil import which

# Login credentials that the spider below types into the sign-in form.
# "xxx" is a placeholder value; substitute real credentials locally and
# keep them out of version control.
username = "xxx"
password = "xxx"


class HtSpiderSelenium(scrapy.Spider):
    """Scrape every page of the startup listing through a logged-in browser.

    ``__init__`` signs in with Selenium and uses the post-login URL as the
    start URL.  ``parse`` scrapes the page currently shown in the browser,
    clicks the "next page" link, waits until a different set of startups is
    rendered and repeats.  It stops when the link cannot be clicked or the
    listing no longer changes.
    """

    name = 'ht_selenium1'
    allowed_domains = ['app.xxx.bootstart.tech']

    # Absolute XPath of the pager's "next page" link.
    next_page_xpath = "/html/body/div[2]/div[2]/nav/div/div[2]/ul/li[14]/a"
    # Page-wide XPath of the startup names; used to detect a newly rendered page.
    name_xpath = "//span[contains(@class, 'no-outline ng-binding')]/text()"
    # Seconds to wait for the login redirect and for each new page of results.
    page_timeout = 15

    def __init__(self, *args, **kwargs):
        """Open Chrome, sign in and record the landing URL as the start URL.

        Positional and keyword arguments are forwarded to ``scrapy.Spider``.
        A Selenium ``TimeoutException`` is raised if the browser has not
        reached the application within ``page_timeout`` seconds.
        """
        super().__init__(*args, **kwargs)

        chrome_options = Options()
        #chrome_options.add_argument("--headless")

        # Start a single browser; a second instance would never be closed.
        self.driver = webdriver.Chrome(executable_path="./chromedriver", options=chrome_options)
        try:
            #get login page
            self.driver.get("https://auth.bootstart.tech/auth/realms/xxxPlatform/protocol/openid-connect/auth?client_id=xxx-platform&redirect_uri=https://app.xxx.bootstart.tech/?redirect_fragment=%2Fstartup&state=8780862b-1eaf-4b6e-92e5-fd9ab464c57f&nonce=79d66ef5-f0bb-4e75-8db2-6402114b9aa8&response_mode=fragment&response_type=code")

            #login
            self.driver.find_element(By.ID, "username").send_keys(username)
            self.driver.find_element(By.ID, "password").send_keys(password)
            self.driver.find_element(By.NAME, "login").click()

            # current_url is only meaningful once the redirect has happened.
            # Compare the URL prefix: the auth URL itself carries the app's
            # address in its query string, so a substring test would pass
            # before the redirect.
            app_root = "https://" + self.allowed_domains[0]
            WebDriverWait(self.driver, float(self.page_timeout)).until(
                lambda browser: browser.current_url.startswith(app_root)
            )
        except Exception:
            # Do not leave an orphaned browser behind when login fails.
            self.driver.quit()
            raise

        self.start_urls = [self.driver.current_url]

    #scrape needed info
    def parse(self, response):
        """Yield one item per startup, page after page.

        Each page is scraped completely before the "next page" link is
        clicked, and the next scrape starts only once the browser shows a
        different, non-empty set of startup names.
        """
        # Local import: the surrounding file does not import the exception type.
        from selenium.common.exceptions import WebDriverException

        self.driver.get(response.url)
        wait = WebDriverWait(self.driver, float(self.page_timeout))
        previous_names = []

        def fresh_listing(browser):
            """Return (selector, names) once a new listing is rendered, else False."""
            page = Selector(text=browser.page_source)
            names = page.xpath(self.name_xpath).getall()
            if names and names != previous_names:
                return page, names
            return False

        while True:
            try:
                page, previous_names = wait.until(fresh_listing)
            except WebDriverException:
                # Timed out: nothing new was rendered, so the last page is done.
                self.logger.info("No new listing appeared; stopping pagination.")
                break

            for startup in page.xpath("//div[contains(@class, 'col-sm-12')]"):
                yield {
                    'startup name': startup.xpath(".//span[contains(@class, 'no-outline ng-binding')]/text()").get(),
                    # Relative path: an absolute '//div...' would collect the
                    # description text of every startup on the page for each item.
                    'startup description': ''.join(startup.xpath('.//p//div//text()').getall()),
                    'startup location': startup.xpath(".//h4//small[@class='ng-binding']//text()").get(),
                    'startup industry': startup.xpath(".//h4//span[@class='ng-binding']/text()").get(),
                }

            #next page button
            try:
                self.driver.find_element(By.XPATH, self.next_page_xpath).click()
            except WebDriverException:
                # Link missing or not clickable: there is no further page.
                self.logger.info("Next-page link unavailable; stopping pagination.")
                break

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider, whatever the reason."""
        self.driver.quit()
  •  Tags:  
  • Related