'''
Scraper for company filings on cninfo (巨潮资讯, http://www.cninfo.com.cn/).
Latest announcements page:
http://www.cninfo.com.cn/new/disclosure/stock?stockCode=601519&orgId=9900017431
Periodic reports page:
http://www.cninfo.com.cn/new/disclosure/stock?stockCode=601519&orgId=9900017431#periodicReports
'''
import os
import re
import requests

url = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'
os.makedirs('./download', exist_ok=True)  # ensure the output directory exists
# Headers captured from the browser's dev tools; the Cookie is session-specific
# and may expire. Varying the User-Agent also helps (see the sketch below).
headers = {
    "Accept": "application/json, text/javascript, */*; q=0.01",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
    "Connection": "keep-alive",
    "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
    "Cookie": "JSESSIONID=DF28F2A2C38640EC78531475CEDABBDB; insert_cookie=37836164; routeId=.uc2; cninfo_user_browse=601519,9900017431,%E5%A4%A7%E6%99%BA%E6%85%A7|600519,gssh0600519,%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0|600030,gssh0600030,%E4%B8%AD%E4%BF%A1%E8%AF%81%E5%88%B8; SID=f3a336a7-e913-4de8-8dd0-1e8e3017bfbb; _sp_id.2141=9802fb76-2236-47f2-9215-9dfa6496e964.1697805509.7.1697884826.1697880028.64529e1c-582f-4bb7-acb8-73b2231212a5",
    "Origin": "http://www.cninfo.com.cn",
    "Referer": "http://www.cninfo.com.cn/new/disclosure/stock?stockCode=601519&orgId=9900017431",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.57",
    "X-Requested-With": "XMLHttpRequest"
}
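
# --- Optional hardening (sketch) -------------------------------------------
# As noted above, simulating different browsers helps avoid blocking. A minimal
# approach is to pick a random User-Agent per request; the strings below are
# illustrative examples, not a vetted list.
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
]

def pick_user_agent():
    """Return a random User-Agent, e.g. headers["User-Agent"] = pick_user_agent()."""
    return random.choice(USER_AGENTS)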
# POST form payload, copied from the page's XHR request
data = {
    "stock": "601519,9900017431",
    "tabName": "fulltext",
    "pageSize": "30",
    "pageNum": "1",
    "column": "sse",
    "category": "category_ndbg_szsh;category_bndbg_szsh;category_yjdbg_szsh;category_sjdbg_szsh;",
    "plate": "sh",
    "seDate": "",
    "searchkey": "",
    "secid": "",
    "sortName": "",
    "sortType": "",
    "isHLtitle": "true"
}
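
# An empty "seDate" above means no date filter. In captured requests the field
# carries a "start~end" string; that format is observed behavior, not a
# documented API. A small helper (hypothetical) to build a date-filtered query:
def with_date_range(query, start, end):
    """Return a copy of `query` limited to [start, end], e.g. ("2022-01-01", "2022-12-31")."""
    q = dict(query)
    q["seDate"] = f"{start}~{end}"
    return q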
# Security identifier: a "stockCode,orgId" pair. To scrape several securities
# in one run, see the sketch below.
# "stock": "601519,9900017431",  # 大智慧
# "stock": "600030,gssh0600030", # 中信证券
# First POST request, used only to learn the total page count
response = requests.post(url, headers=headers, data=data).json()
# The +1 follows the original capture and assumes the API's 'totalpages' is
# zero-based; the empty-page guard in the loop below covers any off-by-one
totalpages = response['totalpages'] + 1
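
# Downloads occasionally fail mid-run; a small retry wrapper (a sketch, not
# part of the original flow) could be swapped in for the plain requests.get
# used in the loop below.
import time

def get_with_retries(file_url, headers, tries=3, backoff=2.0):
    """GET `file_url`, retrying on network/HTTP errors with a linear backoff."""
    for attempt in range(1, tries + 1):
        try:
            resp = requests.get(file_url, headers=headers, timeout=30)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            if attempt == tries:
                raise
            time.sleep(backoff * attempt)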
# Outer loop over result pages (note: page 1 was already fetched above, so it
# is requested twice)
for i in range(1, totalpages + 1):
    page_no = str(i)
    data["pageNum"] = page_no
    response = requests.post(url, headers=headers, data=data).json()
    #print(response)  # debug
    announcements = response['announcements']
    if not announcements:  # past the last page (the API returns null)
        break
    # Inner loop over each announcement on the current page
    for item in announcements:
        file_url = 'https://static.cninfo.com.cn/' + item['adjunctUrl']
        # Strip characters that are invalid in file names
        file_name = re.sub(r'[\\/:*?"<>|]', '_', item['announcementTitle'])
        print(file_name, 'downloading...', file_url)
        # Fetch the PDF and write it to a local file
        pdf = requests.get(file_url, headers=headers)
        with open('./download/' + page_no + '--' + file_name + '.pdf', 'wb') as file:
            file.write(pdf.content)
    print('-' * 100)
    print(f'Page {page_no} done')
    #break  # uncomment to test a single pass of the outer loop
print("数据全部抓取,完美!")