Home > Enterprise > When getting data through web scraping, the previous data is replaced by the new data
When getting data through web scraping, the previous data is replaced by the new data

Time:01-25

I am scraping the NSE website to get its corporate announcements. The problem is that the URL I am using in this code returns only 20 items at a time. There are many hundreds of announcements each day, so most of them are missed because each request only contains the latest 20.

I want to solve this problem so that I get all of the announcements, the earlier ones as well as the most recent ones. Here is my code:

import logging
from datetime import date
from datetime import datetime

import pandas as pd
import requests

# Script: download the NSE corporate-announcements feed for equities, tidy the
# columns and save the result as '<day>-<month>-CA.csv' in the working directory.
today = date.today()

# Module-level logger used by the error handler at the bottom of the script.
logger = logging.getLogger(__name__)

# Browser-like headers sent with both requests below.
__request_headers = {
    'Host': 'www.nseindia.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    # NOTE(review): advertising 'br' means the server may reply with Brotli,
    # which requests can only decode when a brotli package is installed - confirm.
    'Accept-Encoding': 'gzip, deflate, br',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# Seconds to wait for the server; without a timeout requests.get() can block
# indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10


try:
    nse_url = 'https://www.nseindia.com/'
    url = 'https://www.nseindia.com/api/corporate-announcements?index=equities'

    # Load the landing page first: its cookies are forwarded to the API request.
    resp = requests.get(url=nse_url, headers=__request_headers, timeout=REQUEST_TIMEOUT)
    if resp.ok:
        req_cookies = dict(
            nsit=resp.cookies['nsit'],
            nseappid=resp.cookies['nseappid'],
            ak_bmsc=resp.cookies['ak_bmsc'],
        )
        tresp = requests.get(
            url=url,
            headers=__request_headers,
            cookies=req_cookies,
            timeout=REQUEST_TIMEOUT,
        )
        result = pd.DataFrame(tresp.json())

        # Discard the columns that are not wanted in the exported CSV.
        result.drop(
            ['difference', 'dt', 'exchdisstime', 'csvName', 'old_new', 'orgid',
             'seq_id', 'sm_isin', 'bflag', 'symbol', 'sort_date'],
            axis=1,
            inplace=True,
        )
        result.rename(
            columns={
                'an_dt': 'DateandTime',
                'attchmntFile': 'Source',
                'attchmntText': 'Topic',
                'desc': 'Type',
                'smIndustry': 'Sector',
                'sm_name': 'Company Name',
            },
            inplace=True,
        )
        # Whitespace-split the timestamp string into separate Date and Time columns.
        result[['Date', 'Time']] = result.DateandTime.str.split(expand=True)

        # File name is '<day>-<month>-CA.csv', e.g. '25-1-CA.csv'.
        result.to_csv(f'{today.day}-{today.month}-CA.csv', index=True)
        print(result)

        # NOTE(review): `result` is a DataFrame at this point, so this tests for
        # a column named "NIFTY"; it looks like a leftover from a different
        # endpoint - confirm whether it is still needed.
        res_data = result["NIFTY"]["data"] if "NIFTY" in result and "data" in result["NIFTY"] else []
        if res_data is not None and len(res_data) > 0:
            __top_list = res_data
            print(__top_list)
except OSError:
    # requests' exceptions derive from OSError (IOError), so connection
    # failures and timeouts end up here; the traceback is logged as well.
    logger.exception('Unable to fetch data')

CodePudding user response:

You can build the request with the date-range form of the URL, passing today's date as both `from_date` and `to_date` (or whatever date range you want):

import requests
import pandas as pd
from datetime import datetime

# Script: fetch today's NSE corporate announcements (equities) and save them
# to 'nseindia.csv'.

# A single Session is reused so that cookies received from the landing page
# are carried over to the API request.
session = requests.Session()
browser_headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
}

# Visit the home page first; only its effect on the session matters, the
# response itself is not used.
home_url = 'https://www.nseindia.com/'
session.get(home_url, headers=browser_headers)

# The same day is used as both ends of the date range, formatted DD-MM-YYYY.
query_date = datetime.now().strftime('%d-%m-%Y')
announcements_url = (
    'https://www.nseindia.com/api/corporate-announcements'
    f'?index=equities&from_date={query_date}&to_date={query_date}'
)

api_response = session.get(announcements_url, headers=browser_headers)
announcements = api_response.json()

# Write the parsed JSON out as a CSV without the index column.
frame = pd.DataFrame(announcements)
frame.to_csv('nseindia.csv', index=False)

print('Saved to nseindia.csv')
  •  Tags:  
  • Related