Home > Blockchain >  How can I pass the datapoints as items via meta?
How can I pass the datapoints as items via meta?

Time:01-21

The following spider is a working example. However, I cannot work out how to pass the scraped data as items via meta across the multiple parse callbacks.

I would appreciate an example of how to pass the datapoints via meta as items.

Here is the code:

import scrapy

class GsmSpider(scrapy.Spider):
    """Crawl gsmarena.com in three levels: brands, device lists, detail pages.

    Values scraped on one level are forwarded to the next callback through
    the request's ``meta`` dict; the final record is yielded as a plain dict
    from ``parse_detailpage``.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    # LEVEL 1 | all brands

    def parse(self, response):
        """Follow the brand link found in each selected node.

        Yields one request per node, carrying the brand name in ``meta``.
        """
        # NOTE(review): the attribute predicate in '[@]' looks truncated --
        # confirm the intended selector.
        gsms = response.xpath('//div[@]/table')
        for gsm in gsms:
            # .get() returns only the first match below the selected node.
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            brandname = gsm.xpath('.//a/text()').get()

            # A node without a link has nothing to follow.
            if allbranddevicesurl is None:
                continue

            yield response.follow(allbranddevicesurl, callback=self.parse_allbranddevicesurl,
                                    meta= {'brandname': brandname})

    # LEVEL 2 | all devices

    def parse_allbranddevicesurl(self, response):
        """Follow every device on a brand listing page, then the next page.

        Reads ``brandname`` from ``response.meta`` and forwards it, together
        with the per-device thumbnail and detail URL, to ``parse_detailpage``.
        """
        brandname = response.meta['brandname']

        phones = response.xpath('//*[@id="review-body"]//li')
        for phone in phones:
            thumbnailurl = phone.xpath('.//a/img/@src').get()
            detailpageurl = phone.xpath('.//a/@href').get()

            # An entry without a link has no detail page to request.
            if detailpageurl is None:
                continue

            yield response.follow(detailpageurl,
                                    callback=self.parse_detailpage,
                                    meta= {'brandname': brandname,
                                           'thumbnailurl': thumbnailurl,
                                           'detailpageurl': detailpageurl})

        # NOTE(review): the attribute predicate in '[@]' looks truncated --
        # confirm the intended selector.
        next_page = response.xpath('//a[@]/@href').get()
        if next_page is not None:
            # The next listing page only reads 'brandname'. The per-device
            # values are loop variables: they are unbound when the page lists
            # no devices, so they must not be forwarded here.
            yield response.follow(next_page, callback=self.parse_allbranddevicesurl,
                                    meta= {'brandname': brandname})

    # LEVEL 3 | detailpage

    def parse_detailpage(self, response):
        """Combine the forwarded values with the detail-page fields.

        Yields one dict per selected detail node.
        """
        brandname = response.meta['brandname']
        thumbnailurl = response.meta['thumbnailurl']
        detailpageurl = response.meta['detailpageurl']

        # NOTE(review): the attribute predicate in '[@]' looks truncated --
        # confirm the intended selector.
        details = response.xpath('//div[@]')
        for detail in details:
            phonename = detail.xpath('.//h1/text()').get()
            released = detail.xpath('.//ul/li[1]/span[1]/span/text()').get()

            yield {'brandname': brandname,
                   'thumbnailurl': thumbnailurl,
                   'detailpageurl': detailpageurl,

                   'phonename': phonename,
                   'released': released}

Here is the corrected code according to instructions from @SuperUser:

import scrapy
from gsm.items import GsmItem

class GsmSpider(scrapy.Spider):
    """Crawl gsmarena.com in three levels and yield one ``GsmItem`` per phone.

    A partially filled item travels from callback to callback under the
    single ``'item'`` key of the request's ``meta`` dict. Each request gets
    its own item object so concurrent callbacks never overwrite each other.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    custom_settings = {
        'CONCURRENT_REQUESTS': 4,
        'DOWNLOAD_DELAY': 0.5
    }

    # LEVEL 1 | all brands

    def parse(self, response):
        """Create one item per brand cell and follow the brand link."""
        # NOTE(review): the attribute predicate in '[@]' looks truncated --
        # confirm the intended selector.
        gsms = response.xpath('//div[@]/table//td')
        for gsm in gsms:
            allbranddevicesurl = gsm.xpath('.//a/@href').get()
            brandname = gsm.xpath('.//a/text()').get()
            devicecount = gsm.xpath('.//span/text()').get()

            # A cell without a link has nothing to follow.
            if allbranddevicesurl is None:
                continue

            # A fresh item per brand: requests are processed later, so a
            # single shared item would end up holding only the last brand.
            item = GsmItem()
            item['brandname'] = brandname
            item['devicecount'] = devicecount

            yield response.follow(allbranddevicesurl, callback=self.parse_allbranddevicesurl,
                                    meta= {'item': item})

    # LEVEL 2 | all devices

    def parse_allbranddevicesurl(self, response):
        """Follow every device on a brand listing page, then the next page."""
        item = response.meta['item']

        phones = response.xpath('//*[@id="review-body"]//li')
        for phone in phones:
            detailpageurl = phone.xpath('.//a/@href').get()

            # An entry without a link has no detail page to request.
            if detailpageurl is None:
                continue

            # Every detail page fills in its own fields, so each request
            # needs its own copy of the brand-level item.
            yield response.follow(detailpageurl,
                                    callback=self.parse_detailpage,
                                    meta= {'item': item.copy()})

        # NOTE(review): the attribute predicate in '[@]' looks truncated --
        # confirm the intended selector.
        next_page = response.xpath('//a[@]/@href').get()
        if next_page is not None:
            # This callback never modifies the brand-level item, so the next
            # listing page can receive it as is.
            yield response.follow(next_page, callback=self.parse_allbranddevicesurl,
                                    meta= {'item': item})

    # LEVEL 3 | detailpage

    def parse_detailpage(self, response):
        """Add the detail-page fields to the forwarded item and yield it."""
        item = response.meta['item']

        # NOTE(review): the attribute predicate in '[@]' looks truncated --
        # confirm the intended selector.
        details = response.xpath('//div[@]')
        for detail in details:
            phonename = detail.xpath('.//h1/text()').get()
            released = detail.xpath('.//ul/li[1]/span[1]/span/text()').get()

            # Copy per match so that an item already yielded is not changed
            # by a later iteration.
            result = item.copy()
            result['phonename'] = phonename
            result['released'] = released

            yield result

CodePudding user response:

Your xpath for the brands was not correct. See the comments in the code.

import scrapy


# you may want to move this class to "items.py" and import it
class GsmItem(scrapy.Item):
    """Item holding the values the spider collects for a single phone."""

    # Filled in parse_allbranddevicesurl from the value forwarded via meta.
    brandname = scrapy.Field()
    # Filled in parse_allbranddevicesurl from the listing entry's image src.
    thumbnailurl = scrapy.Field()
    # Filled in parse_allbranddevicesurl from the listing entry's link href.
    detailpageurl = scrapy.Field()
    # Filled in parse_detailpage from the detail page's <h1> text.
    phonename = scrapy.Field()
    # Filled in parse_detailpage from a nested <span> in the first list entry.
    released = scrapy.Field()


class GsmSpider(scrapy.Spider):
    """Three-level gsmarena.com crawler: brands -> device lists -> detail pages.

    The brand name is handed to the listing callback through ``meta``; from
    there a per-phone ``GsmItem`` is handed to the detail callback the same
    way and yielded once its remaining fields are filled in.
    """

    name = 'gsm'
    allowed_domains = ['gsmarena.com']
    start_urls = ['https://gsmarena.com/makers.php3']

    custom_settings = {
        'CONCURRENT_REQUESTS': 4,
        'DOWNLOAD_DELAY': 0.5
    }

    # LEVEL 1 | all brands

    def parse(self, response):
        """Request the device listing of every brand cell."""
        # The earlier selector, '//div[@]/table', stopped at the table and so
        # never reached the individual cells. Selecting the cells themselves
        # gives one node per brand.
        for cell in response.xpath('//div[@]/table//td'):
            listing_href = cell.xpath('.//a/@href').get()
            brand_label = cell.xpath('.//a/text()').get()

            yield response.follow(
                listing_href,
                callback=self.parse_allbranddevicesurl,
                meta={'brandname': brand_label},
            )

    # LEVEL 2 | all devices

    def parse_allbranddevicesurl(self, response):
        """Request every phone's detail page, then the next listing page."""
        brand_label = response.meta['brandname']

        for entry in response.xpath('//*[@id="review-body"]//li'):
            thumb_src = entry.xpath('.//a/img/@src').get()
            detail_href = entry.xpath('.//a/@href').get()

            # Build a separate item on each pass; reusing one object would
            # let later phones overwrite earlier ones.
            record = GsmItem(
                thumbnailurl=thumb_src,
                detailpageurl=detail_href,
                brandname=brand_label,
            )

            yield response.follow(
                detail_href,
                callback=self.parse_detailpage,
                meta={'item': record},
            )

        following = response.xpath('//a[@]/@href').get()
        if following is None:
            return

        # Pagination re-enters this callback, which only needs the brand name.
        yield response.follow(
            following,
            callback=self.parse_allbranddevicesurl,
            meta={'brandname': brand_label},
        )

    # LEVEL 3 | detailpage

    def parse_detailpage(self, response):
        """Fill in the detail-page fields of the forwarded item and yield it."""
        record = response.meta['item']

        for block in response.xpath('//div[@]'):
            record['phonename'] = block.xpath('.//h1/text()').get()
            record['released'] = block.xpath('.//ul/li[1]/span[1]/span/text()').get()

            yield record
  •  Tags:  
  • Related