企业年报数据抓取

最新推荐文章于 2025-02-28 23:08:45 发布
weixin_42405511
最新推荐文章于 2025-02-28 23:08:45 发布
阅读量570
点赞数
文章标签： python
本文链接：https://blog.csdn.net/weixin_42405511/article/details/118029342
版权
# -*- coding: utf-8 -*-
import sys
import requests
import json
import jsonpath
import re
from readTxt import readTxt
# Python 2 only: switch the implicit str <-> unicode codec to UTF-8.
# The interpreter removes sys.setdefaultencoding during start-up, so sys has
# to be reloaded to get the function back. Python 3 has no builtin reload()
# and already uses UTF-8 as its default encoding, so the hack is skipped there
# instead of failing with NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')


class getAnnualReport():
    """Fetch announcement listings from the szse.cn API and download the PDFs.

    Workflow (see ``run``): read stock codes from ``./code.txt``, POST the
    listing query for a code, keep the announcements whose title names a
    report year inside [MIN_YEAR, MAX_YEAR] and is not a summary, then
    download each announcement's attachment into ``PDF_DIR``.

    The code is written so that it runs on both Python 2 and Python 3.
    """

    # Announcements per listing page; must match "pageSize" in ``self.data``.
    PAGE_SIZE = 30
    # Inclusive range of report years that ``getNeedPdf`` keeps.
    MIN_YEAR = 2015
    MAX_YEAR = 2020
    # Seconds to wait for a server before giving up on a request.
    REQUEST_TIMEOUT = 30
    # Directory the PDFs are written to (created on demand).
    PDF_DIR = './PDF/'

    def __init__(self):
        """Set up the request headers, the POST body template and the API URL."""
        # Headers sent with the JSON POST to the listing API.
        self.headers = {
            "Host": "www.szse.cn",
            "Connection": "keep-alive",
            # NOTE(review): requests is expected to overwrite this with the
            # real body length when it prepares the request -- confirm and drop.
            "Content-Length": "126",
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "X-Request-Type": "ajax",
            "X-Requested-With": "XMLHttpRequest",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4021.2 Safari/537.36",
            "Content-Type": "application/json",
            "Origin": "http://www.szse.cn",
            "Referer": "http://www.szse.cn/disclosure/listed/notice/index.html",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }

        # Headers sent with plain GET requests (detail JSON and PDF files).
        # No "Host" entry: the GETs go to more than one host.
        self.get_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4021.2 Safari/537.36"
        }

        # POST body template; "{code}" and "{pageNum}" are substituted with
        # str.replace (not str.format, because of the literal JSON braces).
        # NOTE(review): bigCategoryId "010301" is presumably the annual-report
        # category -- confirm against the site.
        self.data = '''{"seDate":["",""],"stock":["{code}"],"channelCode":["listedNotice_disc"],"bigCategoryId":["010301"],"pageSize":30,"pageNum":{pageNum}}'''

        self.url = 'http://www.szse.cn/api/disc/announcement/annList?random=0.6619654039313971'

    def __request(self, url, post_data):
        """Send a POST (when ``post_data`` is non-empty) or a GET to ``url``.

        Returns the response text on HTTP 200 and an empty string on any
        other status code or on a network error, so callers only have to
        check for a falsy result.
        """
        try:
            if post_data:
                response = requests.post(url=url, headers=self.headers,
                                         data=post_data,
                                         timeout=self.REQUEST_TIMEOUT)
            else:
                response = requests.get(url=url, headers=self.get_headers,
                                        timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            # Timeouts / connection errors are reported like a bad status
            # instead of aborting the whole run.
            print('请求失败，异常：{}'.format(e))
            return ''

        if response.status_code != 200:
            print('请求失败，状态码：{}'.format(response.status_code))
            return ''
        print('页面请求成功！')
        return response.text

    def getNeedPdf(self, titleAndUrlList):
        """Filter ``[code, title, url]`` rows down to the wanted reports.

        A row is kept when its title does not contain '摘要' (summary) and
        the digits between a full-width colon and '年' form a year within
        [MIN_YEAR, MAX_YEAR]. Rejected rows are reported on stdout.

        Returns a new list holding the kept rows in their original order.
        """
        needList = []
        for row in titleAndUrlList:
            title = row[1]
            if '摘要' in title:
                continue
            matched = re.search(r'：(\d+)年', title)
            if matched is None:
                print('正则匹配失败：{}'.format(title))
                continue
            # Compare as integers: a string comparison would accept values
            # such as '20155' because it orders lexicographically.
            if self.MIN_YEAR <= int(matched.group(1)) <= self.MAX_YEAR:
                needList.append(row)
            else:
                print('该条件不符合年份要求：{}'.format(title))
        return needList

    def _countPages(self, total):
        """Return how many PAGE_SIZE-sized pages are needed for ``total`` items."""
        # Ceiling division; floor-plus-one over-counts exact multiples.
        return (int(total) + self.PAGE_SIZE - 1) // self.PAGE_SIZE

    def getPageNum(self, code, response):
        """Parse the first listing page returned for ``code``.

        ``response`` is the raw JSON text of the listing API. Returns a tuple
        ``(pageNum, titleAndUrlList)`` where ``pageNum`` is the total number
        of pages and ``titleAndUrlList`` holds one ``[code, title, url]`` row
        per announcement on this page. An empty or unparsable response gives
        ``(0, [])``.
        """
        titleAndUrlList = []
        if not response:
            # The request failed; there is nothing to parse.
            return 0, titleAndUrlList
        try:
            payload = json.loads(response)
        except ValueError:
            print('公告列表解析失败：{}'.format(code))
            return 0, titleAndUrlList

        # Total page count. jsonpath returns False when nothing matches.
        counts = jsonpath.jsonpath(payload, "$.announceCount")
        pageNum = self._countPages(counts[0]) if counts else 0

        # Rows of the first page.
        titles = jsonpath.jsonpath(payload, "$.data[*].title") or []
        ids = jsonpath.jsonpath(payload, "$.data[*].id") or []
        for title, annId in zip(titles, ids):
            url = 'http://www.szse.cn/api/disc/announcement/bulletin_detail/{}'.format(annId)
            # Python 2: turn the unicode title into a UTF-8 byte string so it
            # matches the byte-string literals used elsewhere. On Python 3
            # the title already is a str and is left alone.
            if not isinstance(title, str):
                title = title.encode('utf-8')
            titleAndUrlList.append([code, title, url])
        return pageNum, titleAndUrlList

    def downloadPdf(self, needList):
        """Download the PDF attachment of every ``[code, title, url]`` row.

        For each row the detail URL is fetched to learn the attachment path,
        then the PDF is saved as ``PDF_DIR/<code>-<title>.pdf`` (colons in
        the title replaced by '-'). A failure on one row is reported on
        stdout and the remaining rows are still processed.
        """
        import os  # local import: the module header does not import os

        for row in needList:
            failed = '{}， {}下载失败'.format(row[0], row[1])

            # Ask the detail endpoint for the attachment path.
            detailResponse = self.__request(row[2], '')
            try:
                detail = json.loads(detailResponse) if detailResponse else None
            except ValueError:
                detail = None
            paths = jsonpath.jsonpath(detail, '$.attachPath') if detail else False
            if not paths:
                print(failed)
                continue
            pdfHref = 'http://disc.static.szse.cn{}'.format(paths[0])

            # Fetch the PDF itself.
            try:
                response = requests.get(url=pdfHref, headers=self.get_headers,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException:
                print(failed)
                continue
            if response.status_code != 200:
                print(failed)
                continue

            if not os.path.isdir(self.PDF_DIR):
                os.makedirs(self.PDF_DIR)
            file_path = self.PDF_DIR + '{}-{}.pdf'.format(
                row[0], row[1].replace('：', '-').replace(':', '-'))
            # Python 2: the path is a UTF-8 byte string; decode it so the
            # file gets the intended Unicode name.
            if isinstance(file_path, bytes):
                file_path = file_path.decode('utf-8')
            with open(file_path, 'wb') as f:
                f.write(response.content)
            # Report success only once the file is actually on disk.
            print('{}， {}下载成功'.format(row[0], row[1]))

    def parsePdf(self):
        """Placeholder; PDF parsing is not implemented."""
        pass

    def run(self):
        """Read the stock codes and download the matching reports.

        NOTE(review): only the first listing page of the first code is
        processed -- ``pageNum`` is not used and the loop ends after one
        iteration. This looks like a debugging leftover; confirm before
        lifting the restriction.
        """
        codeList = readTxt(r'./code.txt')
        for code in codeList:
            post_data = self.data.replace('{code}', code).replace('{pageNum}', '1')
            response = self.__request(self.url, post_data)
            pageNum, titleAndUrlList = self.getPageNum(code, response)

            needList = self.getNeedPdf(titleAndUrlList)
            for row in needList:
                print('{} {} {}'.format(row[0], row[1], row[2]))
            self.downloadPdf(needList)
            break


if __name__ == '__main__':
    # Script entry point: build the scraper and start the download run.
    scraper = getAnnualReport()
    scraper.run()