I recently started learning Scrapy and web scraping. I'm working on my first project and I got stuck. I would appreciate it if someone could help me with the problem :)
I'm scraping the page http://esg.krx.co.kr/contents/02/02020000/ESG02020000.jsp
So far I have got to the point where my program scrapes all 77 pages (I know it's a bit hardcoded; I will try to change that later) and gets the company_name and company_share_id. Now I'm trying to go to the company_page_url and send another POST request to get the data from the graph (not every company has a graph). However, it seems that parse_company_result is never called.
My code is below:
import scrapy
import json
from scrapy.http import Request
class EsgKrx1Spider(scrapy.Spider):
    """Spider from the question: list companies, then try to fetch chart data.

    This is the asker's original logic with only the paste damage repaired
    (indentation restored, ``str(page + 1)`` instead of ``str(page 1)``).
    The request flow itself is intentionally left as posted; the
    ``NOTE(review)`` comments mark the spots that relate to the question.
    """

    name = 'esg_krx1'
    allowed_domains = ['esg.krx.co.kr']

    def start_requests(self):
        """Send the initial POST request to the listing endpoint."""
        # sending a post request to the web
        return [scrapy.FormRequest(
            "http://esg.krx.co.kr/contents/99/ESG99000001.jspx",
            formdata={'sch_com_nm': '',
                      'sch_yy': '2021',
                      'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                      'code': '02/02020000/esg02020000',
                      'pageFirstCall': 'Y'},
            callback=self.parse)]

    def parse(self, response):
        """Request every listing page; the incoming response body is unused."""
        url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"
        total_pages = 77
        for page in range(total_pages):
            payload = {
                'sch_com_nm': '',
                'sch_yy': '2021',
                'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                'code': '02/02020000/esg02020000',
                # range() starts at 0, so +1 yields curPage values 1..77.
                'curPage': str(page + 1)
            }
            yield scrapy.FormRequest(url=url,
                                     method='POST',
                                     formdata=payload,
                                     callback=self.parse_result)

    def parse_result(self, response):
        """Read one listing page (JSON) and issue follow-up requests."""
        dict_data = json.loads(response.text)
        # looping in the result and assigning the company name
        for i in dict_data['result']:
            company_name = i['com_abbrv']
            company_share_id = i['isu_cd']
            print(company_name, company_share_id)
            company_page_url = f"http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd={company_share_id}"
            # NOTE(review): no callback is given here, so Scrapy hands this
            # response to self.parse rather than to a company-specific parser.
            yield Request(company_page_url)
            data_url = "http://esg.krx.co.kr/contents/99/ESG99000001.jspx"
            headers = {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'
            }
            # NOTE(review): this POST carries no formdata, so the endpoint is
            # not told which company or chart is wanted. Every such request is
            # also identical, so presumably the duplicate filter drops all but
            # the first one -- confirm against the crawl log.
            yield scrapy.FormRequest(url=data_url,
                                     method='POST',
                                     headers=headers,
                                     callback=self.parse_company_result)

    def parse_company_result(self, response):
        """Decode the response body as JSON and print it."""
        graph_data = json.loads(response.text)
        print(graph_data)
All the functions are, of course, inside the class; the code just didn't paste the way I expected.
So my question is:
How do I go to the company page URL?
Or maybe the request is correct, but I do something wrong later?
Maybe I don't get a response from the data_url?
I will appreciate all the help.
CodePudding user response:
I have updated your script as there were quite a few errors, namely:
- In `parse_result`, it's best to create another function to parse the company URLs, as opposed to parsing them in the same one.
- You need to include the payload to parse the JSON from the Request URL; again, it's best to split these into separate parsers so that you can see what is happening at each step.
I have built a scraper that does this in a hierarchical way so that you can understand what's happening top-down.
Additional note:
`cb_kwargs` allows you to pass variables from one parser to another. Therefore, I can grab the company id and name from `parse_result` and yield them in the last parser. Note: the company id is important for the payload in `parse_company`, so you should get used to how `cb_kwargs` works.
import scrapy
import json
from scrapy.http import Request
# Browser-style request headers (presumably copied from a browser session on
# a company page -- the Referer points at one specific isu_cd).
# NOTE: the spider below only mentions this dict in commented-out
# ``headers=`` arguments, so as written it is not attached to any request.
headers = {
    # Client identification
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:97.0) '
        'Gecko/20100101 Firefox/97.0'
    ),
    # Content negotiation
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'en-GB,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    # Body encoding of the form POST, flagged as an XMLHttpRequest
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    # Request origin / connection details
    'Origin': 'http://esg.krx.co.kr',
    'Connection': 'keep-alive',
    'Referer': (
        'http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp'
        '?isu_cd=004710'
    ),
}
class EsgKrx1Spider(scrapy.Spider):
    """Collect per-company chart data from esg.krx.co.kr.

    The callbacks form a top-down chain:

    1. ``start_requests`` -- initial POST to the data endpoint.
    2. ``parse`` -- one POST per listing page.
    3. ``parse_result`` -- one GET per company found on a listing page.
    4. ``parse_company`` -- read the chart id from the company page and
       POST once per chart data set.
    5. ``parse_company_result`` -- yield the decoded chart JSON together
       with the company name and share id.

    Company name and share id travel between callbacks via ``cb_kwargs``.
    """

    name = 'esg_krx1'
    allowed_domains = ['esg.krx.co.kr']

    # Single endpoint that serves both the listing JSON and the chart JSON.
    data_url = 'http://esg.krx.co.kr/contents/99/ESG99000001.jspx'
    # Number of listing pages to request. Hard-coded by the original author;
    # kept as an attribute so it can be overridden without editing ``parse``.
    total_pages = 77

    def start_requests(self):
        """Return the initial POST request whose response triggers ``parse``."""
        # sending a post request to the web
        return [scrapy.FormRequest(
            self.data_url,
            formdata={
                'sch_com_nm': '',
                'sch_yy': '2021',
                'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                'code': '02/02020000/esg02020000',
                'pageFirstCall': 'Y',
            },
            callback=self.parse,
        )]

    def parse(self, response):
        """Yield one listing-page request for each page 1..``total_pages``.

        The body of ``response`` is not read; it only kicks off pagination.
        """
        # int() so a string value (e.g. from a spider argument) also works.
        for page in range(int(self.total_pages)):
            payload = {
                'sch_com_nm': '',
                'sch_yy': '2021',
                'pagePath': '/contents/02/02020000/ESG02020000.jsp',
                'code': '02/02020000/esg02020000',
                # range() is zero-based; +1 gives curPage values from 1.
                'curPage': str(page + 1),
            }
            yield scrapy.FormRequest(
                url=self.data_url,
                method='POST',
                formdata=payload,
                callback=self.parse_result,
            )

    def parse_result(self, response):
        """Request the company page for every entry of a listing response.

        Reads ``com_abbrv`` and ``isu_cd`` from each element of the JSON
        ``result`` list and forwards both to ``parse_company``.
        """
        dict_data = json.loads(response.text)
        # looping in the result and assigning the company name
        for entry in dict_data['result']:
            company_name = entry['com_abbrv']
            company_share_id = entry['isu_cd']
            company_page_url = f"http://esg.krx.co.kr/contents/02/02010000/ESG02010000.jsp?isu_cd={company_share_id}"
            yield Request(
                company_page_url,
                callback=self.parse_company,
                cb_kwargs={
                    'company_share_id': company_share_id,
                    'company_name': company_name,
                },
            )

    def parse_company(self, response, company_share_id, company_name):
        """Find the chart id on a company page and request its data sets.

        Yields nothing when the page has no chart element, instead of
        failing on a missing id.
        """
        chart_id = response.xpath(
            "(//div[@class='CHART-AREA'])[1]//div//@id").get()
        if not chart_id:
            # .get() returns None when nothing matches the XPath.
            self.logger.debug(
                "No chart found for %s (%s); skipping",
                company_name, company_share_id)
            return
        # Keep only the part of the id after the last "chart".
        chart_no = chart_id.split("chart")[-1]
        # Notice that the number at the end of `code` in the payload changes
        # for each chart data set (_01, _02).
        for code_no in range(1, 3):
            yield scrapy.FormRequest(
                url=self.data_url,
                method='POST',
                formdata={
                    'url_isu_cd': str(company_share_id),
                    'isu_cd': '',
                    'sch_com_nm': '',
                    'pagePath': '/contents/02/02010000/ESG02010000.jsp',
                    'code': f'02/02010000/esg02010000_0{code_no}',
                    'chartNo': chart_no,
                },
                callback=self.parse_company_result,
                cb_kwargs={
                    'company_share_id': company_share_id,
                    'company_name': company_name,
                },
            )

    def parse_company_result(self, response, company_share_id, company_name):
        """Yield the decoded chart JSON plus the company's name and share id."""
        graph_data = json.loads(response.text)
        yield {
            'data': graph_data,
            'company_name': company_name,
            'company_share_id': company_share_id,
        }
Output:
{'data': {'block1': [{'yy': '2019', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}, {'yy': '2020', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}, {'yy': '2021', 'pnt0': '7', 'pnt1': '2', 'pnt2': 'null'}]}, 'company_name': '아남전자', 'company_share_id': '008700'}
...
...
