python 东方财富行业研报下载脚本(练习)

#!coding=utf-8
#VX:china_awen
import os
import time
import requests
import random
import json
import datetime
from datetime import timedelta
# Wall-clock time captured once at start-up.  ``time2`` (whole seconds since
# the epoch) is appended to every listing URL as the ``_`` query parameter.
time1 = time.time()
time2 = int(time1)
print(time2)
def re_name(excel_name):
    """Return ``excel_name`` with the special characters removed.

    The characters dropped are ``/``, backslash, ``:``, ``*``, ``?``, ``"``,
    ``<``, ``>`` and ``|``; all other characters are kept in their original
    order.
    """
    forbidden = ('/', '\\', ':', '*', '?', '"', '<', '>', '|')
    return ''.join(ch for ch in excel_name if ch not in forbidden)
# Pool of browser User-Agent strings; one entry is picked at random below.
my_headers = [
    "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14",
    "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)",
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Opera/9.25 (Windows NT 5.1; U; en)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7",
    "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "
]
# Request headers used by get_page().  The User-Agent is chosen once, when
# this line runs, so every get_page() call in a single run sends the same one.
headers = {'User-Agent':random.choice(my_headers)}
def get_page(url, timeout=10):
    """Fetch ``url`` with the module-level ``headers`` and return its body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.  Without a timeout a stalled
            connection would block the script indefinitely.

    Returns:
        The response body decoded as UTF-8 when the status code is 200,
        otherwise the literal string ``'爬取失败!'``.

    Raises:
        requests.exceptions.RequestException: on connection errors, or when
            ``timeout`` elapses.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return '爬取失败!'
    # Decode the raw bytes explicitly as UTF-8 to avoid garbled text.
    return response.content.decode("utf-8")
def down_load(report=None):
    """Download the PDF of one research-report record into a per-date folder.

    Args:
        report: Mapping describing one report.  It must provide ``title``,
            ``publishDate``, ``orgSName``, ``industryName`` and ``infoCode``.
            When omitted, the module-level ``data_end`` (the loop variable of
            the main script) is used, so the original zero-argument call
            keeps working.

    Side effects:
        * Creates a directory named after the first 10 characters of
          ``publishDate`` (if it does not exist yet).
        * Writes ``<date>-<industry>-<title>-<organisation>.pdf`` inside it,
          unless that file already exists.
        * Sets the module-level ``file_name`` to the target path.

    Returns:
        None.  Records with missing or null fields are skipped after
        printing ``'没有链接'``.

    Raises:
        requests.exceptions.RequestException: if the PDF request fails at the
            network level or times out.
    """
    global file_name
    if report is None:
        # Legacy calling convention: the caller's loop variable.
        report = data_end
    try:
        # Read every field before doing anything else so an incomplete record
        # is rejected as a whole.  Missing keys raise KeyError and null values
        # raise TypeError; the original only caught AttributeError.
        title = report['title']
        excel_time = report['publishDate'][0:10]
        organ = report['orgSName']
        industry = report['industryName']
        down_loadurl = 'https://pdf.dfcfw.com/pdf/H3_' + report['infoCode'] + '_1.pdf'
        excel_name = re_name(title)
        excel_organ = re_name(organ)
        industryName = re_name(industry)
    except (KeyError, TypeError, AttributeError):
        print('没有链接')
        return

    # Build the path with os.path.join so the file lands in the directory
    # created below (the old ``excel_time + './'`` prefix named a different
    # directory, ``<date>.``, on non-Windows systems).
    file_name = os.path.join(
        excel_time,
        excel_time + '-' + industryName + '-' + excel_name + '-' + excel_organ + '.pdf',
    )
    print(industryName)
    print(excel_name)

    os.makedirs(excel_time, exist_ok=True)
    if os.path.isfile(file_name):
        print("文件已下载")
        return

    # Download first and only then open the file, so a failed request never
    # leaves an empty .pdf behind that would later count as "already downloaded".
    download_pdf = requests.get(down_loadurl, timeout=60)
    if download_pdf.status_code != 200:
        print('下载失败:', down_loadurl)
        return
    with open(file_name, 'wb') as code:
        code.write(download_pdf.content)
# Random 7-digit suffix for the JSONP callback name ("datatable<suffix>").
num_random_7 = random.randint(1000000, 9999999)
# Query window: from 730 days before today up to today.
# NOTE(review): the name says "one year ago" but the offset is 730 days —
# confirm which of the two is intended.
datetimeNow = datetime.date.today()
datetimeOneYearAgo = datetimeNow - timedelta(days=730)
print(datetimeOneYearAgo)
pageno = '1'
# Listing URL for page 1, assembled from the values above.
html_url = (
    'https://reportapi.eastmoney.com/report/list'
    '?cb=datatable' + str(num_random_7)
    + '&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*'
    + '&beginTime=' + str(datetimeOneYearAgo)
    + '&endTime=' + str(datetimeNow)
    + '&pageNo=' + pageno
    + '&fields=&qType=1&orgCode=&code=*&rcode='
    + '&_=' + str(time2)
)
def page_data(pageno_num):
    """Fetch one page of the report listing and return it as parsed JSON.

    Args:
        pageno_num: Page number to request (1-based in the calling script).
            A string or an int is accepted; it is converted with ``str``.

    Returns:
        The decoded JSON payload.  The calling script reads its ``data``,
        ``size`` and ``TotalPage`` entries.

    Raises:
        ValueError: if the response is not of the form ``callback(...)``
            (for example when ``get_page`` returned its failure message) or
            if the wrapped payload is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    callback_suffix = random.randint(1000000, 9999999)
    today = datetime.date.today()
    begin = today - timedelta(days=730)
    listing_url = (
        'https://reportapi.eastmoney.com/report/list'
        '?cb=datatable' + str(callback_suffix)
        + '&industryCode=*&pageSize=50&industry=*&rating=*&ratingChange=*'
        + '&beginTime=' + str(begin)
        + '&endTime=' + str(today)
        + '&pageNo=' + str(pageno_num)
        + '&fields=&qType=1&orgCode=&code=*&rcode='
        + '&_=' + str(time2)
    )
    body = get_page(listing_url)

    # The reply is JSONP: "datatable<suffix>(<json>)".  Slice out what sits
    # between the outermost parentheses.  The previous str.strip(prefix) call
    # treated the prefix as a *set of characters* and only worked by accident.
    open_at = body.find('(')
    close_at = body.rfind(')')
    if open_at == -1 or close_at <= open_at:
        raise ValueError('unexpected (non-JSONP) response: ' + body[:200])
    return json.loads(body[open_at + 1:close_at])
# Fetch page 1 first: it carries the total page count that drives the loop.
data_frist = page_data(str(1))
value_list = data_frist['data']
size_long = data_frist['size']
TotalPage = data_frist['TotalPage']
print(value_list)
print(type(size_long))
for f in range(1, TotalPage + 1):
    if f > 1:
        # Page 1 is already in hand (it used to be requested a second time
        # here); only the following pages need a new request.
        data_frist = page_data(str(f))
        value_list = data_frist['data']
    # down_load() reads the module-level name ``data_end``, so the loop
    # variable must keep exactly this name.
    for data_end in value_list:
        down_load()

很早前写的,应该是用来下载东方财富行业研报列表页面(见下图)中的研报。
在这里插入图片描述
下载后会按发布日期建立文件夹,并在文件名中标明日期、行业、题目和研究机构。具体功能我也忘了,从代码看应该是按页码逐页下载,时间范围是从当前日期向前追溯730天。
在这里插入图片描述

  • 2
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

我文非相

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值