python宏观研究_【python爬虫】抛砖引玉--批量爬取某财富股票网证券机构宏观研报...

# [Python] plain-text view / copy code
import requests, json, time, random,os

from bs4 import BeautifulSoup

def UserAgent():
    """Return request headers carrying a randomly chosen User-Agent string.

    Returns:
        dict: A single-entry mapping ``{'User-Agent': <browser string>}``
        suitable for the ``headers=`` argument of ``requests.get``.
    """
    # Each string appears exactly once so every candidate is equally likely.
    candidates = (
        'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36',
        'Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:17.0) Gecko/20100101 Firefox/17.0.6',
        'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36',
    )
    # A distinct local name avoids shadowing this function inside its own body.
    headers = {'User-Agent': random.choice(candidates)}
    return headers

def timenow():
    """Return the current time as a millisecond stamp and a local date string.

    Returns:
        tuple: ``(last_para, end_Time)`` where ``last_para`` is the current
        Unix time in whole milliseconds and ``end_Time`` is today's local
        date formatted as ``YYYY-MM-DD``.
    """
    # Read the clock once so both return values describe the same instant.
    now = time.time()
    last_para = int(now * 1000)
    end_Time = time.strftime("%Y-%m-%d", time.localtime(now))
    return last_para, end_Time

def macResearch_page(end_Time, last_para, UserAgent):
    """Fetch the first page of the report listing and return its raw text.

    Args:
        end_Time: Value sent as the ``endTime`` query parameter.
        last_para: Value sent as the trailing ``_`` query parameter.
        UserAgent: Header mapping passed to ``requests.get``.

    Returns:
        tuple: ``(text, macResearch)`` where ``text`` is the response body
        and ``macResearch`` is the detail-page URL prefix to which an
        ``encodeUrl`` value gets appended.

    Raises:
        requests.HTTPError: If the listing endpoint answers with a 4xx/5xx
            status.
    """
    macResearch = 'http://data.eastmoney.com/report/zw_macresearch.jshtml?encodeUrl='

    # Testing showed that 100 is the largest pageSize the endpoint accepts.
    query = {
        'pageSize': 100,
        'beginTime': '2018-03-19',
        'endTime': str(end_Time),
        'pageNo': 1,
        'fields': '',
        'qType': 3,
        'orgCode': '',
        'author': '',
        '_': str(last_para),
    }
    # Letting requests build the query string keeps the values escaped.
    response = requests.get(
        'http://reportapi.eastmoney.com/report/jg',
        params=query,
        timeout=3,
        headers=UserAgent,
    )
    # Fail here rather than hand an HTTP error page to the JSON parser.
    response.raise_for_status()
    return response.text, macResearch

def parser_json(text, sub_url):
    """Map every report in the listing JSON to the URL of its detail page.

    Args:
        text: JSON document whose ``data`` member is a list of objects, each
            with ``publishDate``, ``title`` and ``encodeUrl`` members.
        sub_url: Prefix prepended to every ``encodeUrl`` value.

    Returns:
        dict: Keys are the first ten characters of ``publishDate`` with the
        dashes removed, followed by ``title``; values are
        ``sub_url + encodeUrl``. Entries that produce the same key overwrite
        one another, so the last one wins.

    Raises:
        json.JSONDecodeError: If ``text`` is not valid JSON.
        KeyError: If ``data`` or one of the per-item members is missing.
    """
    # json.loads no longer accepts an ``encoding`` keyword (removed in 3.9).
    payload = json.loads(text)

    pages = {}
    # A null ``data`` member is treated the same as an empty listing.
    for item in payload['data'] or ():
        day = item['publishDate'][:10].replace('-', '')
        pages[day + item['title']] = sub_url + item['encodeUrl']
    return pages

def save_file(dic, UserAgent, num):
    """Download the file linked from each report page into the working directory.

    For every entry the report page is fetched, its ``<a class="pdf-link">``
    anchor is located, and the target of that anchor is saved under the
    entry's key plus the extension taken from the download URL.

    Args:
        dic: Mapping of file-name stem to report page URL.
        UserAgent: Header mapping passed to every ``requests.get`` call.
        num: Maximum number of files to save; an integer no larger than 100.
            A value of zero or less saves nothing.

    Raises:
        requests.HTTPError: If a file download answers with a 4xx/5xx status.
    """
    # Characters that are rejected in file names on common platforms.
    unsafe = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    saved = 0
    for title, page_url in dic.items():
        # Checked before any network work so that num <= 0 downloads nothing.
        if saved >= num:
            break

        page_html = requests.get(page_url, headers=UserAgent, timeout=10).text
        soup = BeautifulSoup(page_html, 'html.parser')

        # attrs must be a dict; the earlier set literal only matched by accident.
        link = soup.find('a', attrs={'class': 'pdf-link'})
        if link is None or not link.get('href'):
            # One page without a download link should not abort the batch.
            print('跳过:未找到下载链接 {}'.format(page_url))
            continue
        file_url = link['href']

        response = requests.get(file_url, headers=UserAgent, timeout=10)
        # Avoid saving an HTTP error page under a report's name.
        response.raise_for_status()
        payload = response.content

        file_name = (title + '.' + file_url.split('.')[-1]).translate(unsafe)
        # Measure the bytes actually received; Content-Length may be absent.
        content_size = len(payload) / (1024 * 1024)
        print(file_url)

        with open(file_name, 'wb') as f:
            f.write(payload)
        print('成功!保存完成,文件大小:{:0.3f}MB'.format(content_size))
        saved += 1

def main():
    """Prompt for a download count, then fetch the listing and save the files.

    Reads an integer from standard input, builds the request headers and
    time parameters, downloads the report listing, and saves up to the
    requested number of files.

    Raises:
        ValueError: If the input is not an integer or lies outside 1-100.
    """
    num = int(input('注意:请输入1-100的整数:\n'))
    # Enforce the range the prompt asks for instead of trusting the input.
    if not 1 <= num <= 100:
        raise ValueError('num must be between 1 and 100, got {}'.format(num))

    user_agent = UserAgent()
    last_para, end_Time = timenow()
    text, sub_url = macResearch_page(end_Time, last_para, user_agent)
    dic = parser_json(text, sub_url)
    save_file(dic, user_agent, num)
    print('全部完成!')


# Run only when executed as a script, so importing the module has no side effects.
if __name__ == '__main__':
    main()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值