requests_html infinite scrolling on div instead of entire page-CodePudding

Hello, I am trying to get all the links from the web page below. This page loads new products as we scroll down, and I am trying to get the links for all the products by scrolling to the bottom of the page. I am using the scrolldown option of requests_html's render method, as shown below:

from requests_html import HTML, HTMLSession

# Category listing page; per the question, more products load as the user scrolls.
baseurl = "https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002"

session = HTMLSession()
page = session.get(baseurl)
# Execute the page's JavaScript, paging down 50 times with a 3 second pause
# after each scroll so lazily loaded products have time to arrive.
# NOTE(review): scrolldown pages the whole document; if the product list
# scrolls inside its own div this may not trigger further loading - confirm.
page.html.render(scrolldown=50, sleep=3)

# render() updates page.html in place, while page.text still holds the
# original un-rendered response body. Read the links from the rendered
# document instead of re-parsing page.text, which would discard the render.
html = page.html
all_links = html.links
for ln in all_links:
    print(ln)
print(len(all_links))

# Keep only the site-relative product links.
filtered_links = [link for link in all_links if link.startswith("/product")]
print(len(filtered_links))

CodePudding user response：

You could just mimic the POST requests the page does and keep requesting batches of 20 results, extracting the links, until you have gathered the total specified number of results.

import requests
import math
from bs4 import BeautifulSoup as bs


def add_product_links(soup):
    """Append the absolute URL of every product link in *soup* to ``product_links``.

    Selects the elements whose ``href`` starts with ``/product`` inside the
    first child ``div`` of each ``.productInfoDiv`` and prefixes each href
    with the site origin. The module-level ``product_links`` list is extended
    in place; nothing is returned.

    Args:
        soup: A parsed document exposing a BeautifulSoup-style ``select``.
    """
    # The attribute value is quoted so the selector needs no backslash escape
    # (``\/`` is an invalid escape sequence in a normal Python string).
    anchors = soup.select(
        '.productInfoDiv > div:nth-child(1) > [href^="/product"]')
    product_links.extend(
        'https://www.medplusmart.com' + anchor['href'] for anchor in anchors)


# Accumulates the absolute product URLs gathered by add_product_links().
product_links = []
n = 0  # value sent as 'startIndex' in the next load-more request
results_per_page = 20  # batch size of each load-more response
page = 1  # 1-based counter of load-more requests issued so far

# Form fields for the load-more POST, mimicking the request the page itself
# sends. 'startIndex' and 'productIdString' are filled in at runtime below.
data = {
    'sortField': '',
    'startIndex': n,
    'productCategoryId': 'MART_20002',
    'startPrice': '',
    'endPrice': '',
    'minPrice': '0',
    'maxPrice': '2650',
    'excludeNoStock': 'N',
    'pCatName': 'personal-care_10102',
    'catName': 'skin-care_20002',
    'productIdString': '',
    'Brand Search': ''
}

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    r = s.get(
        'https://www.medplusmart.com/categories/personal-care_10102/skin-care_20002')
    soup = bs(r.content, 'lxml')
    # The landing page carries hidden inputs with the product id string that
    # the load-more endpoint expects and the total number of results.
    data['productIdString'] = soup.select_one('#productIdString')['value']
    num_results = int(soup.select_one('#totalProductFound')['value'])
    num_pages = math.ceil(num_results / results_per_page)
    add_product_links(soup)
    s.headers.update({'x-kl-ajax-request': 'Ajax_Request'})

    # Request one batch per page, advancing the offset each time. Both
    # counters must be incremented or the loop would never terminate.
    # NOTE(review): the first POST uses startIndex 0 although the initial GET
    # has already contributed links - confirm this does not yield duplicates.
    while page <= num_pages:
        data['startIndex'] = n
        r = s.post('https://www.medplusmart.com/loadMoreProduct.mart', data=data)
        soup = bs(r.content, 'lxml')
        add_product_links(soup)
        n += results_per_page
        page += 1

print(len(product_links))