# -*- coding: utf-8 -*-
import json
import os
import re
import sys

import jsonpath
import requests

from readTxt import readTxt
# Python 2 only: the interpreter removes sys.setdefaultencoding at startup,
# so sys is reloaded to get it back, and UTF-8 is then made the default codec
# for implicit str <-> unicode conversions used throughout this script.
reload(sys)
sys.setdefaultencoding('utf-8')
class getAnnualReport(object):
    """Query www.szse.cn for a stock's announcements and download report PDFs.

    The flow implemented by ``run`` is: read stock codes from ``./code.txt``,
    POST the announcement-list query for each code, keep the titles whose
    year falls inside the wanted range (skipping summaries), resolve each
    announcement's attachment path and save the PDF under ``./PDF/``.

    The file targets Python 2 (byte-string titles, ``str.decode``); the
    syntax used here is kept parseable by Python 3 as well.
    """

    # Must stay equal to the "pageSize" value embedded in ``self.data``.
    PAGE_SIZE = 30
    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30
    # Directory the downloaded PDFs are written to.
    PDF_DIR = './PDF/'
    # Announcement-detail endpoint; ``{}`` receives the announcement id.
    DETAIL_URL = 'http://www.szse.cn/api/disc/announcement/bulletin_detail/{}'
    # Host serving the PDF attachments; ``{}`` receives the attachment path.
    PDF_URL = 'http://disc.static.szse.cn{}'
    # Year that follows the colon in a title, e.g. "name:2019年...".
    # NOTE(review): both the ASCII and the full-width colon are accepted. The
    # original code replaced the colon twice in the file name, which suggests
    # both forms were intended -- confirm against live titles.
    _TITLE_YEAR_RE = re.compile(r'(?::|:)(\d+)年')

    def __init__(self):
        """Set up the request headers, the query template and the list URL."""
        # Headers sent with the JSON POST to the announcement-list API.
        self.headers = {
            "Host": "www.szse.cn",
            "Connection": "keep-alive",
            "Content-Length": "126",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Request-Type": "ajax",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4021.2 Safari/537.36",
            "Content-Type": "application/json",
            "Origin": "http://www.szse.cn",
            "Referer": "http://www.szse.cn/disclosure/listed/notice/index.html",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        # Headers sent with plain GET requests (detail JSON and PDF files).
        self.get_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4021.2 Safari/537.36"
        }
        # Query template; "{code}" and "{pageNum}" are substituted with
        # str.replace because the JSON braces rule out str.format.
        self.data = '''{"seDate":["",""],"stock":["{code}"],"channelCode":["listedNotice_disc"],"bigCategoryId":["010301"],"pageSize":30,"pageNum":{pageNum}}'''
        self.url = 'http://www.szse.cn/api/disc/announcement/annList?random=0.6619654039313971'

    def __request(self, url, post_data):
        """Fetch ``url`` and return the response text, or '' on any failure.

        A truthy ``post_data`` is sent as the body of a POST using
        ``self.headers``; otherwise a GET with ``self.get_headers`` is made.
        Network errors and non-200 statuses are reported on stdout.
        """
        try:
            if post_data:
                response = requests.post(url=url, headers=self.headers,
                                         data=post_data,
                                         timeout=self.REQUEST_TIMEOUT)
            else:
                response = requests.get(url=url, headers=self.get_headers,
                                        timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Same contract as a bad status code: report and return ''.
            print('请求异常:{}'.format(exc))
            return ''
        if response.status_code != 200:
            print('请求失败,状态码:{}'.format(response.status_code))
            return ''
        print('页面请求成功!')
        return response.text

    def getNeedPdf(self, titleAndUrlList, startYear=2015, endYear=2020):
        """Return the rows whose title year lies in [startYear, endYear].

        Each row is ``[code, title, url]``. Rows whose title contains '摘要'
        are skipped silently; rows without a recognisable year or with a
        year outside the range are reported on stdout and dropped.
        """
        needList = []
        for row in titleAndUrlList:
            title = row[1]
            if '摘要' in title:
                continue
            found = self._TITLE_YEAR_RE.search(title)
            if not found:
                print('正则匹配失败:{}'.format(title))
                continue
            # Compare numerically: a string comparison would let values such
            # as '202' or '20155' slip through the range check.
            if startYear <= int(found.group(1)) <= endYear:
                needList.append(row)
            else:
                print('该条件不符合年份要求:{}'.format(title))
        return needList

    def getPageNum(self, code, response):
        """Parse one list response into ``(pageNum, titleAndUrlList)``.

        ``pageNum`` is the number of result pages derived from
        "announceCount"; ``titleAndUrlList`` holds one
        ``[code, utf-8 encoded title, detail url]`` entry per record of
        "data" that carries both a title and an id. An empty or unparsable
        response yields ``(0, [])``.
        """
        titleAndUrlList = []
        try:
            payload = json.loads(response)
        except (TypeError, ValueError):
            print('响应解析失败:{}'.format(code))
            return 0, titleAndUrlList
        if not isinstance(payload, dict):
            return 0, titleAndUrlList
        total = int(payload.get('announceCount') or 0)
        # Ceiling division: an exact multiple of the page size must not
        # produce an extra, empty page.
        pageNum = (total + self.PAGE_SIZE - 1) // self.PAGE_SIZE
        for item in payload.get('data') or []:
            # Title and id are read from the same record so the two can
            # never get out of step when a record lacks one of them.
            title = item.get('title')
            annId = item.get('id')
            if title is None or annId is None:
                continue
            titleAndUrlList.append(
                [code, title.encode('utf-8'), self.DETAIL_URL.format(annId)])
        return pageNum, titleAndUrlList

    def downloadPdf(self, needList):
        """Download the PDF of every ``[code, title, url]`` row in needList.

        For each row the detail URL is fetched, its "attachPath" is turned
        into the PDF address and the file is stored as
        ``./PDF/<code>-<title>.pdf``. Failures are reported on stdout and the
        remaining rows are still processed.
        """
        if not needList:
            return
        if not os.path.isdir(self.PDF_DIR):
            os.makedirs(self.PDF_DIR)
        for row in needList:
            code, title, url = row[0], row[1], row[2]
            # Resolve the attachment path from the detail endpoint.
            try:
                detail = json.loads(self.__request(url, ''))
            except (TypeError, ValueError):
                detail = None
            pdfId = detail.get('attachPath') if isinstance(detail, dict) else None
            if not pdfId:
                print('{}, {}下载失败'.format(code, title))
                continue
            try:
                response = requests.get(url=self.PDF_URL.format(pdfId),
                                        headers=self.get_headers,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException:
                print('{}, {}下载失败'.format(code, title))
                continue
            if response.status_code != 200:
                print('{}, {}下载失败'.format(code, title))
                continue
            # A colon is not allowed in Windows file names.
            # NOTE(review): the full-width form is replaced too, see the
            # note on _TITLE_YEAR_RE.
            safeTitle = title.replace(':', '-').replace(':', '-')
            file_path = self.PDF_DIR + '{}-{}.pdf'.format(code, safeTitle)
            if isinstance(file_path, bytes):
                # Python 2 byte string: hand a unicode path to open().
                file_path = file_path.decode('utf-8')
            with open(file_path, 'wb') as f:
                f.write(response.content)
            # Report success only once the file is actually on disk.
            print('{}, {}下载成功'.format(code, title))

    def parsePdf(self):
        """Placeholder; PDF parsing is not implemented."""
        pass

    def run(self, maxCodes=1):
        """Process the stock codes listed in ``./code.txt``.

        ``maxCodes`` caps how many codes are handled; ``None`` means all.
        The default of 1 keeps the original behaviour, which stopped after
        the first code (NOTE(review): that looked like a debugging leftover
        -- confirm before raising the default).
        """
        codeList = readTxt(r'./code.txt')
        for index, code in enumerate(codeList):
            if maxCodes is not None and index >= maxCodes:
                break
            post_data = self.data.replace('{code}', code).replace('{pageNum}', '1')
            response = self.__request(self.url, post_data)
            pageNum, titleAndUrlList = self.getPageNum(code, response)
            needList = self.getNeedPdf(titleAndUrlList)
            for row in needList:
                print('{} {} {}'.format(row[0], row[1], row[2]))
            # TODO: only the first result page is processed; pageNum is
            # computed above but the remaining pages are never requested.
            self.downloadPdf(needList)
if __name__ == '__main__':
    # Script entry point: build the scraper and start it right away.
    getAnnualReport().run()
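# 企业年报数据抓取 (enterprise annual-report data scraping -- title of the article this script was taken from)
# 最新推荐文章于 2025-02-28 23:08:45 发布 (web-page footer captured with the code; not part of the program)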