利用Python爬取天天基金的数据。
先打开天天基金网,然后选择感兴趣的主题基金,这里以煤炭为例。
进入主题页面之后,复制列表中任意一个基金名称,然后打开浏览器开发者工具(F12),在 Network 面板中搜索这个名称,找到返回基金列表数据的那条请求。
点击搜索到的 js 请求,切换到 Headers 选项卡,记录后面代码要用到的几项信息:General 里的 Request URL,以及 Request Headers 里的 Cookie、Host、Referer 和 User-Agent。
import pandas as pd
import requests
import re
import json
import time
import math
# Request settings copied from the browser's Network panel; FundInfo reads
# them to build the request URLs and headers.
fund_obj = dict(
    # Query-string parameters of the fund-list request.
    isbuy=1,
    sort='TRY',
    sorttype='desc',
    # JSONP callback name sent with the fund-list request.
    callback='jQuery18306680153539612363_1681216767557',
    # Header values replayed with every request.
    cookie='intellpositionL=1152px; em_hq_fls=js; intellpositionT=455px; qgqp_b_id=2c2759821488d533c53576ad44a41e5d; HAList=ty-1-000001-%u4E0A%u8BC1%u6307%u6570%2Cty-90-BK1036-%u534A%u5BFC%u4F53%2Ca-sh-603713-%u5BC6%u5C14%u514B%u536B%2Cf-0-000001-%u4E0A%u8BC1%u6307%u6570%2Ca-sh-600559-%u8001%u767D%u5E72%u9152%2Ca-sh-603195-%u516C%u725B%u96C6%u56E2%2Ca-sz-002236-%u5927%u534E%u80A1%u4EFD%2Ca-sh-600759-%u6D32%u9645%u6CB9%u6C14%2Cf-0-399439-%u56FD%u8BC1%u6CB9%u6C14%2Ca-sz-000049-%u5FB7%u8D5B%u7535%u6C60; EMFUND1=null; EMFUND2=null; EMFUND3=null; EMFUND4=null; EMFUND5=null; EMFUND6=null; EMFUND7=null; EMFUND0=null; EMFUND8=04-11%2015%3A07%3A30@%23%24%u534E%u5546%u4FE1%u7528%u589E%u5F3A%u503A%u5238A@%23%24001751; AUTH_FUND.EASTMONEY.COM_GSJZ=AUTH*TTJJ*TOKEN; EMFUND9=04-11 15:32:10@#$%u62DB%u5546%u4E2D%u8BC1%u767D%u9152%u6307%u6570%28LOF%29A@%23%24161725; st_si=97748096283622; st_pvi=48514956678552; st_sp=2019-09-26%2018%3A30%3A34; st_inirUrl=https%3A%2F%2Fwww.baidu.com%2Flink; st_sn=1; st_psi=2023041120385018-112200312942-2229925924; st_asi=delete',
    host='api.fund.eastmoney.com',
    referer='http://fund.eastmoney.com/',
    useragent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    # JSONP callback name sent with the theme-detail request.
    themecallback='jQuery18306592616272463223_1681284141591',
)
# Fund information crawler
class FundInfo:
    """Fetch a paginated theme-fund listing from api.fund.eastmoney.com and export it.

    Typical flow: call ``get_fund_data`` once to obtain the first page, pass
    it to ``get_total_page`` to learn how many pages exist, then call
    ``change_page`` to crawl every page and write the rows to an .xlsx file.

    Attributes are kept in camelCase because callers may already read them:
    ``pageIndex``, ``pageSize``, ``totalPage``, ``totalCount``, ``dataList``
    (collected rows) and ``themeName`` (used in the output file name).
    """

    # A record field is kept when its key contains one of these substrings.
    # 'SOURCERATE' is already covered by 'RATE'; it is listed for readability.
    _FIELD_MARKERS = (
        'SHORTNAME', 'FCODE', 'DWJZ', 'RZDF', 'SYRQ', 'FTYPE', 'RELATION',
        'SYL_Z', 'SYL_Y', 'SYL_3Y', 'SYL_6Y', 'SYL_JN', 'SYL_1N', 'SYL_2N',
        'SYL_3N', 'SYL_LN', 'RATE', 'SOURCERATE', 'MINSG', 'ISSALES', 'ISBUY',
    )

    # Header row of the exported sheet.
    # NOTE(review): rows are built in the key order of the API records, so
    # this order is assumed to match the response's key order - confirm.
    _COLUMNS = ['基金代码', '', '日期', '原手续费', '净值', '日增长率', '手续费', 'ISBUY', '购买起点', '基金名称', '近1周','近1月','近3月','近6月','近1年','近2年','近3年','今年来','成立来', 'ISSALES','基金类型']

    def __init__(self, page_index = 1, page_size = 10, total_page = 0, total_count = 0, tp = '801950'):
        """Set up paging state.

        Args:
            page_index: Page number requested by the next ``get_fund_data`` call.
            page_size: Number of records requested per page.
            total_page: Initial page count (normally set by ``get_total_page``).
            total_count: Initial record count (normally set by ``get_total_page``).
            tp: Value sent as the ``tp`` query parameter of both API calls
                (presumably the theme/board code - confirm). The default
                keeps the previously hard-coded value.
        """
        self.pageIndex = page_index
        self.pageSize = page_size
        self.totalPage = total_page
        self.totalCount = total_count
        self.tp = tp
        self.dataList = []
        self.themeName = ''

    def get_response(self, parame_obj, parame_url, timeout = 10):
        """Send a GET request carrying the browser headers stored in ``parame_obj``.

        Args:
            parame_obj: Mapping with 'cookie', 'host', 'referer' and 'useragent'.
            parame_url: Fully built request URL.
            timeout: Seconds to wait before giving up; without a timeout
                ``requests`` may block indefinitely on a stalled connection.

        Returns:
            The ``requests`` response object.
        """
        # Replay the Cookie, Host, Referer and User-Agent request headers.
        headers = {
            'Cookie': parame_obj['cookie'],
            'Host': parame_obj['host'],
            'Referer': parame_obj['referer'],
            'User-Agent': parame_obj['useragent'],
        }
        return requests.get(url=parame_url, headers=headers, timeout=timeout)

    @staticmethod
    def _parse_jsonp(callback, text):
        """Return the decoded JSON wrapped in ``callback(...)`` inside ``text``.

        Raises:
            ValueError: If ``text`` has no ``callback(...)`` wrapper or the
                wrapped payload is not valid JSON.
        """
        # Escape the callback so it is matched literally, and use a raw
        # pattern so '\(' is a regex escape rather than a string escape.
        match = re.search(re.escape(callback) + r'\((.*)\)', text, re.DOTALL)
        if match is None:
            raise ValueError(f'no JSONP payload for callback {callback!r} in response')
        return json.loads(match.group(1))

    def get_fund_theme(self, parame_obj):
        """Fetch the theme details and store the theme name in ``self.themeName``.

        Args:
            parame_obj: Request settings; 'themecallback' plus the header keys
                read by ``get_response``.
        """
        url = f'http://api.fund.eastmoney.com/ztjj//GetBKDetailInfoNew?callback={parame_obj["themecallback"]}&tp={self.tp}&_=1681284141906'
        response = self.get_response(parame_obj, url)
        data = response.text
        print('theme===', data)
        res = self._parse_jsonp(parame_obj["themecallback"], data)
        self.themeName = res['Data']['INDEXNAME']

    def get_fund_data(self, parame_obj):
        """Fetch the page at ``self.pageIndex`` of the fund list.

        Args:
            parame_obj: Request settings; 'callback', 'sort', 'sorttype' and
                'isbuy' plus the header keys read by ``get_response``.

        Returns:
            The decoded JSON payload of the JSONP response.
        """
        # callback, sort, sorttype, pageindex, pagesize and isbuy are the
        # variable parts of the Request URL.
        url = f'http://api.fund.eastmoney.com/ztjj/GetBKRelTopicFundNew?callback={parame_obj["callback"]}&sort={parame_obj["sort"]}&sorttype={parame_obj["sorttype"]}&pageindex={self.pageIndex}&pagesize={self.pageSize}&tp={self.tp}&isbuy={parame_obj["isbuy"]}'
        response = self.get_response(parame_obj, url)
        data = response.text
        print(data)
        return self._parse_jsonp(parame_obj["callback"], data)

    def get_total_page(self, parame_res):
        """Set ``totalCount`` and ``totalPage`` from a fund-list response.

        Args:
            parame_res: Decoded response containing 'TotalCount'.
        """
        self.totalCount = int(parame_res['TotalCount'])
        self.totalPage = math.ceil(self.totalCount / self.pageSize)

    def get_fund_info(self, parame_res):
        """Append one row per record in ``parame_res['Data']`` to ``self.dataList``.

        A row holds ``str(value)`` for every field whose key contains one of
        ``_FIELD_MARKERS``, in the record's own key order. A null 'Data' is
        treated as an empty page.
        """
        markers = self._FIELD_MARKERS
        for record in parame_res['Data'] or []:
            row = [
                str(val)
                for key, val in record.items()
                if any(marker in key for marker in markers)
            ]
            self.dataList.append(row)

    def _export(self, timestamp):
        """Write the collected rows to '<themeName>基金<timestamp>.xlsx'."""
        df = pd.DataFrame(self.dataList, columns=self._COLUMNS)
        df.to_excel(f'{self.themeName}基金{timestamp}.xlsx')

    def change_page(self, parame_obj, parame_res):
        """Crawl the pages up to ``totalPage`` and export the rows to Excel.

        Args:
            parame_obj: Request settings passed on to the fetch methods.
            parame_res: Kept for backward compatibility and no longer read.
                Each iteration parses the page it just fetched; previously
                this first-page response was parsed on every iteration, so
                the export contained page 1 repeated.
        """
        self.get_fund_theme(parame_obj)
        timestamp = int(time.time())
        for page in range(self.totalPage):
            print(f'正在爬取第{page+1}页的数据内容')
            # Pause between requests to avoid hammering the API.
            time.sleep(1)
            page_res = self.get_fund_data(parame_obj)
            self.get_fund_info(page_res)
            self.pageIndex += 1
            if self.pageIndex > self.totalPage:
                self._export(timestamp)
                break
# Create a crawler with the default paging (page 1, 10 records per page).
fund_info = FundInfo()
# Fetch the first page; its 'TotalCount' field is needed to size the crawl.
res = fund_info.get_fund_data(fund_obj)
# Derive the total record count and the number of pages from that response.
fund_info.get_total_page(res)
# Walk the pages and write the collected rows to an .xlsx file.
fund_info.change_page(fund_obj, res)
运行上面的代码,程序会逐页爬取该主题下的基金列表,并在当前目录下生成一个名为“{主题名}基金{时间戳}.xlsx”的 Excel 文件,里面就是我们要的数据。