Scraping data from cninfo (http://www.cninfo.com.cn/)

'''
    Scraping data from cninfo (http://www.cninfo.com.cn/)

    Latest announcements page:
    http://www.cninfo.com.cn/new/disclosure/stock?stockCode=601519&orgId=9900017431
    Periodic reports page:
    http://www.cninfo.com.cn/new/disclosure/stock?stockCode=601519&orgId=9900017431#periodicReports
'''

import os
import re

import requests

url = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'

# Request headers copied from the browser's developer tools.
# The Cookie below is session-bound and will likely need to be refreshed;
# ideally also vary the browser being simulated (see the sketch after the headers).
headers = {
  "Accept": "application/json, text/javascript, */*; q=0.01",
  "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
  "Connection": "keep-alive",
  "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
  "Cookie": "JSESSIONID=DF28F2A2C38640EC78531475CEDABBDB; insert_cookie=37836164; routeId=.uc2; cninfo_user_browse=601519,9900017431,%E5%A4%A7%E6%99%BA%E6%85%A7|600519,gssh0600519,%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0|600030,gssh0600030,%E4%B8%AD%E4%BF%A1%E8%AF%81%E5%88%B8; SID=f3a336a7-e913-4de8-8dd0-1e8e3017bfbb; _sp_id.2141=9802fb76-2236-47f2-9215-9dfa6496e964.1697805509.7.1697884826.1697880028.64529e1c-582f-4bb7-acb8-73b2231212a5",
  "Origin": "http://www.cninfo.com.cn",
  "Referer": "http://www.cninfo.com.cn/new/disclosure/stock?stockCode=601519&orgId=9900017431",
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57",
  "X-Requested-With": "XMLHttpRequest"
}
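# The note above suggests varying the browser fingerprint. A minimal sketch of
# rotating the User-Agent per request (the extra UA strings below are
# illustrative assumptions, not taken from the original post):
import random

user_agents = [
    headers["User-Agent"],
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:118.0) Gecko/20100101 Firefox/118.0",
]

def pick_headers():
    # Return a copy of the base headers with a randomly chosen User-Agent.
    h = dict(headers)
    h["User-Agent"] = random.choice(user_agents)
    return h
# (The requests below could then pass headers=pick_headers() instead of headers=headers.)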

# POST form data
data={
  "stock": "601519,9900017431",
  "tabName": "fulltext",
  "pageSize": "30",
  "pageNum": "1",
  "column": "sse",
  "category": "category_ndbg_szsh;category_bndbg_szsh;category_yjdbg_szsh;category_sjdbg_szsh;",
  "plate": "sh",
  "seDate": "",
  "searchkey": "",
  "secid": "",
  "sortName": "",
  "sortType": "",
  "isHLtitle": "true"
}
# Security code: the "stock" field is "<stockCode>,<orgId>"
# "stock": "601519,9900017431",      # 大智慧
# "stock": "600030,gssh0600030",     # 中信证券

# Make sure the output directory exists before any downloads
os.makedirs('./download', exist_ok=True)

# Initial POST request (not a GET), only used to learn the total number of pages
response = requests.post(url, headers=headers, data=data).json()
# 'totalpages' in the response appears to be zero-based, so add 1 (kept from the original script)
totalpages = response['totalpages'] + 1

# Outer loop over the result pages
for i in range(1, totalpages+1):
    # Fetch one page of announcements (page 1 is fetched a second time here)
    page_no = str(i)
    data["pageNum"] = page_no
    response = requests.post(url, headers=headers, data=data).json()
    #print(response)  # uncomment to inspect the raw JSON
    # Stop early if the page-count arithmetic overshoots and a page comes back empty
    if not response.get('announcements'):
        break

    # Inner loop over every announcement on this page
    for item in response['announcements']:
        file_url = 'https://static.cninfo.com.cn/' + item['adjunctUrl']
        # Drop the <em> highlight tags added when isHLtitle is true and replace
        # characters that are not allowed in file names
        title = item['announcementTitle'].replace('<em>', '').replace('</em>', '')
        file_name = re.sub(r'[\\/:*?"<>|]', '_', title)
        print(file_name, 'downloading...', file_url)
        # Write the PDF data to a local file (separate variable so the page JSON is kept)
        pdf_resp = requests.get(file_url, headers=headers)
        with open('./download/' + page_no + '--' + file_name + '.pdf', 'wb') as file:
            file.write(pdf_resp.content)
    print('-' * 100)
    print(f'Page {page_no} finished')
    #break   # uncomment to test a single pass of the outer loop

print("All announcements downloaded.")
 
