基金数据爬取与分析评估

最新推荐文章于 2024-04-26 11:28:48 发布

Theory D

最新推荐文章于 2024-04-26 11:28:48 发布

阅读量2k

点赞数 2

分类专栏：量化文章标签： python 数据分析

本文链接：https://blog.csdn.net/duansirui/article/details/105091119

版权

量化专栏收录该内容

5 篇文章 0 订阅

订阅专栏

对量化投资感兴趣的朋友，可关注微信公众号：Quant_Reserch ，与我们交流。公众号中有每日的复盘，研究策略分享。
在这里插入图片描述

基金数据爬取

以前聚宽还没有全部基金的历史数据，于是利用网上的代码，利用爬虫方式从东方财富抓取数据，虽然网上代码很多，不过错误也很多，坑不少。

代码

##参考代码链接  https://www.jianshu.com/p/d79d3cd62560
##修改内容：日期改为datetime，数据采用dataframe，value为float。原网页page参数错误，实际为当前页数。最长一页40条数据，需要组合。
import requests
import pandas as pd
from bs4 import BeautifulSoup
import random
import datetime
import re
import numpy as np
import json
from six import StringIO
from six import BytesIO


## Fetch one proxy from the proxy pool, see https://github.com/1again/SmartProxyPool
def get_proxy(timeout=10):
    """Ask the proxy-pool service for a single proxy address.

    Args:
        timeout: Seconds to wait for the pool service before giving up.

    Returns:
        The value stored under ``data -> proxy`` in the service's JSON reply.

    Raises:
        requests.RequestException: on connection problems, timeouts, or a
            non-2xx HTTP status.
        KeyError / ValueError: if the reply is not the expected JSON shape.
    """
    rsp = requests.get("http://proxy.1again.cc:35050/api/v1/proxy/?region=中国", timeout=timeout)
    # Fail loudly on HTTP errors instead of trying to decode an error page.
    rsp.raise_for_status()
    data = json.loads(rsp.text)
    return data['data']['proxy']


def get_url(url, params=None, proxies=None, header=None, timeout=10):
    """GET *url* and return the response body as text.

    Args:
        url: Address to request.
        params: Optional query-string parameters.
        proxies: A single proxy address used for plain-HTTP traffic, or
            None to connect directly.
        header: Optional dict of HTTP request headers.
        timeout: Seconds to wait before giving up. Without a timeout a
            stalled proxy would block the caller forever.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    rsp = requests.get(
        url,
        params=params,
        proxies={"http": proxies},
        headers=header,
        timeout=timeout,
    )
    rsp.raise_for_status()
    return rsp.text


## Fetch the complete fund list
url = 'http://fund.eastmoney.com/js/fundcode_search.js'
html = get_url(url, proxies=None)
# Kept so that the module-level name `soup` still exists for later code.
soup = BeautifulSoup(html, 'html.parser')
# The payload is a JavaScript assignment whose right-hand side is an array
# literal. Parse only that literal as data instead of exec()-ing text that
# was downloaded from the network.
# NOTE(review): assumes the array literal is valid JSON (double-quoted
# strings, no trailing commas) -- confirm against the live payload.
securities = json.loads(html[html.index('['):html.rindex(']') + 1])
funds = pd.DataFrame(data=securities, index=None, columns=['code', '2', 'name', 'type', '5'])
# Fund `type` values include: 股票型, 混合型, 债券型, 指数型, 保本型, 理财型, 货币型, 混合-FOF ...

def get_header():
    """Build request headers with a randomly chosen User-Agent and Referer.

    Returns:
        A dict with exactly two keys, 'User-Agent' and 'Referer', each
        drawn at random from the fixed candidates below.
    """
    # Candidate browser identification strings.
    browsers = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
        'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36',
    )
    # Candidate referring pages.
    origins = (
        'http://fund.eastmoney.com/110022.html',
        'http://fund.eastmoney.com/110023.html',
        'http://fund.eastmoney.com/110024.html',
        'http://fund.eastmoney.com/110025.html',
        'https://www.baidu.com/s?wd=%E5%A4%A9%E5%A4%A9%E5%9F%BA%E9%87%91%E7%BD%91',
    )
    # Draw the User-Agent first and the Referer second.
    picked_agent = random.choice(browsers)
    picked_referer = random.choice(origins)
    return {'User-Agent': picked_agent, 'Referer': picked_referer}


def get_fund_data(code, start='', end='', proxy_list=0):
    """Download the net-value history of one fund between two dates.

    The endpoint is paged (40 rows per request here). Page 1 is fetched
    first, the total page count is read from the ``pages:<n>`` marker in the
    reply, and the remaining pages are fetched and merged.

    Args:
        code: Fund code.
        start: First date of the range, sent as ``sdate``.
        end: Last date of the range, sent as ``edate``.
        proxy_list: Proxies to pick from at random for every request. An
            entry may be None, meaning a direct connection. Any falsy value
            (the default 0, None, or an empty list) means "always connect
            directly".

    Returns:
        A DataFrame indexed by date with columns 'Code', 'NetAssetValue',
        'AccumulatedNetValue' and 'ChangePercent'. Empty table cells are
        stored as 0.
    """
    url = 'http://fund.eastmoney.com/f10/F10DataApi.aspx'
    params = {'type': 'lsjz', 'code': code, 'page': 1, 'per': 40, 'sdate': start, 'edate': end}
    records = pd.DataFrame(data=None, index=None, columns=['Code', 'NetAssetValue', 'AccumulatedNetValue', 'ChangePercent'])

    html = _fetch_fund_page(url, params, proxy_list)
    _store_fund_rows(BeautifulSoup(html, 'html.parser'), code, records)

    # The reply announces how many pages exist; without the marker, assume one.
    match = re.search(r"(?<=pages:)\d+", html)
    pages = int(match.group(0)) if match else 1

    for page in range(2, pages + 1):
        page_params = dict(params, page=page)
        # Later pages use the same proxy/header handling as page 1.
        html = _fetch_fund_page(url, page_params, proxy_list)
        _store_fund_rows(BeautifulSoup(html, 'html.parser'), code, records)
    return records


def _fetch_fund_page(url, params, proxy_list):
    """Fetch one page, directly or by retrying through random proxies."""
    if not proxy_list:
        return get_url(url, params, proxies=None, header=get_header())
    # Deliberately unbounded: keep trying proxies until one answers.
    # `except Exception` (not a bare except) lets Ctrl-C stop the loop.
    while True:
        try:
            return get_url(url, params, proxies=random.choice(proxy_list), header=get_header())
        except Exception:
            continue


def _cell_to_float(text):
    """Convert a table cell's text to float; an empty cell becomes 0."""
    text = text.strip().rstrip('%').strip()
    return float(text) if text else 0


def _store_fund_rows(soup, code, records):
    """Write every 7-column row of the first <tbody> into *records* in place."""
    bodies = soup.find_all('tbody')
    if not bodies:
        return
    for tr in bodies[0].find_all('tr'):
        cells = tr.find_all('td')
        # Anything that is not a 7-column row is skipped.
        if len(cells) != 7:
            continue
        date = datetime.datetime.strptime(cells[0].get_text().strip(), '%Y-%m-%d')
        nav = _cell_to_float(cells[1].get_text())
        aav = _cell_to_float(cells[2].get_text())
        cpt = _cell_to_float(cells[3].get_text())
        records.loc[date, :] = [code, nav, aav, cpt]




## 获取数据示例：get_fund_data('163402','2018-09-18','2019-09-18')

遇到的坑

网上一般都使用 http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code=110022&sdate=2018-02-22&edate=2019-03-02&per=40&page=1 这个接口，参数一目了然。关键问题出在 per 这个参数：它表示每页显示的数据条数，经过测试每页最多 49 条（上面的代码取 40 条），因此数据量一大，就必须分页抓取——循环 page 变量，遍历所有页数后再把结果组合起来。
如果一段时间内频繁抓取数据，会被禁止连接。已经做了随机 HTTP 头的处理，但貌似没有用，关键还是代理，下面提供了一段自动生成代理列表的代码。由于代理池质量不好，需要试很久。如果有自建代理更好：有两三个代理，配合本机 IP 随机访问，基本就不会被封了。

# Build the proxy list
proxy_list = []
test_url = 'http://fund.eastmoney.com/js/fundcode_search.js'
# Keep asking the pool until five proxies have passed the test request.
while len(proxy_list) < 5:
    try:
        # A timeout on the pool request too, so a dead pool cannot hang the loop.
        data_json = requests.get("http://proxy.1again.cc:35050/api/v1/proxy/?region=中国", timeout=5).text
        data = json.loads(data_json)
        proxy = data['data']['proxy']
        # A proxy is accepted only if a real request through it succeeds.
        rsp = requests.get(test_url, params=None, proxies={"http": proxy}, headers=get_header(), timeout=5)
        rsp.raise_for_status()
    except Exception:
        # `except Exception` (not a bare except) keeps Ctrl-C working.
        # Print how many proxies have been collected so far, then retry.
        print(len(proxy_list))
    else:
        proxy_list.append(proxy)
# Three None entries, so a random pick sometimes uses the direct connection.
proxy_list.extend([None, None, None])

基金数据分析

代码

## Study periods:
## '2014-06-30'  '2015-06-01' sharp rise
## '2015-06-01'  '2016-03-10' sharp fall
## '2016-03-10'  '2018-01-18' slow rise
## '2018-01-18'  '2019-01-18' slow fall
## '2019-01-18'  '2020-01-18' slow rise (d6 below uses 2020-01-17; 2020-01-18 fell on a Saturday)
## '2020-01-18'  '2020-03-16' pandemic swings
d1='2014-06-30'
d2='2015-06-01'
d3='2016-03-10'
d4='2018-01-18'
d5='2019-01-18'
d6='2020-01-17'
d7='2020-03-16'


# Skip 理财型, 货币型 and 混合-FOF funds.
excluded_types = ['理财型', '货币型', '混合-FOF']
funds_list = list(funds[~funds['type'].isin(excluded_types)]['code'])
start_list = []  # unused here; kept in case later cells rely on the name
price_columns = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7']
history_price = pd.DataFrame(data=None, index=None, columns=['code'] + price_columns)
total = len(funds_list)
div = round(total / 10)  # unused here; kept in case later cells rely on the name
## Manually configured proxy list; comment this out when using the
## automatically collected list instead.
proxy_list = ['127.0.0.1:1080', None, ]
for i, f in enumerate(funds_list):
    print('%4.2f %%\r' %(float(i/total*100)),end="")  # progress percentage
    history_price.loc[f, 'code'] = str(f)
    # One single-day query per boundary date: p1..p7 hold the accumulated
    # net value on d1..d7.
    for column, day in zip(price_columns, (d1, d2, d3, d4, d5, d6, d7)):
        values = get_fund_data(f, day, day, proxy_list)['AccumulatedNetValue'].values
        # Store a plain scalar, with 0 meaning "no quote for that day".
        # NOTE(review): assigning a possibly-empty ndarray to a single cell
        # looks pandas-version dependent -- confirm on the target version.
        history_price.loc[f, column] = values[0] if len(values) else 0

## Clean the data
price_columns = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7']
washed_price = pd.DataFrame(data=None, index=history_price.index, columns=['code'] + price_columns)
washed_price['code'] = history_price['code']
for i in range(len(history_price)):
    for j in range(1, 8):
        cell = history_price.iloc[i, j]
        if np.size(cell) == 0:
            # No quote for that date.
            washed_price.iloc[i, j] = 0
        else:
            # Take the first element explicitly, so both scalars and
            # one-element arrays convert cleanly.
            washed_price.iloc[i, j] = float(np.ravel(cell)[0])

# Drop funds that have no price on the last date (p7).
washed_price = washed_price.drop(washed_price[washed_price['p7'] == 0].index)

# Treat the remaining zeros as missing, then fill each gap from the next
# available price to its right. The original discarded the result of
# fillna(), so the back-fill never took effect.
prices = washed_price[price_columns].astype(float).replace(0, np.nan)
washed_price[price_columns] = prices.bfill(axis=1)


## Compute scores
# Indexed by washed_price, so funds dropped during cleaning do not come back
# as all-zero rows once NaN is replaced by 0 below. (The original referenced
# the misspelled name `histroy_price`, which raises NameError.)
point = pd.DataFrame(data=None, index=washed_price.index, columns=['p1','p2','p3','p4','p5','p6','p_4year','fall','rise','total'])

# Return over each consecutive period: point['pK'] = (p(K+1) - pK) / pK.
for k in range(1, 7):
    earlier = washed_price['p%d' % k]
    later = washed_price['p%d' % (k + 1)]
    point['p%d' % k] = (later - earlier) / earlier

# Return from the p3 snapshot to the p7 snapshot.
point['p_4year'] = (washed_price['p7'] - washed_price['p3']) / washed_price['p3']
# Sum of the two periods labelled as falling, and of the three labelled as rising.
point['fall'] = point['p2'] + point['p4']
point['rise'] = point['p1'] + point['p3'] + point['p5']
# Weighted total: falling periods x3, rising periods x1, p3-to-p7 return x5.
point['total'] = point['fall'] * 3 + point['rise'] + point['p_4year'] * 5
# fillna() returns a new frame; the original discarded it.
point = point.astype(float).fillna(0)


## View the scores
# Keep only funds whose 'fall' score is above -0.2 ...
show = point.loc[point['fall'] > -0.2]
# ... and rank them by 'total', highest first.
show = show.sort_values(by='total', ascending=False)
show