用python来获取基金历史收益率,估算出哪些基金可能比较好。这只是自己的一些愚见,纯粹是为了学习python,里面的内容不能作为任何参考。刚开始学习python,写得不好,希望各位大牛能帮忙指出不足,谢谢。
1、获取所有基金的代码
import requests
import re
def getAllFundCode(timeout=10):
    """Download the Eastmoney fund index script and extract the codes in it.

    The response body is scanned for double-quoted numeric tokens (digits,
    optionally followed by a dot and more digits); each token is returned
    without its surrounding quotes, in order of appearance.
    NOTE(review): presumably every such token is a fund code -- the response
    format is not visible here, confirm against the live endpoint.

    Args:
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot block the caller forever.

    Returns:
        A list of strings, one per matched token.
    """
    url = "http://fund.eastmoney.com/js/fundcode_search.js"
    all_text = requests.get(url, timeout=timeout).text
    # The capture group returns the token without its quotes, so no second
    # pass is needed to strip them.
    return re.findall(r'"(\d+\.?\d*)"', all_text)
2、获取基金信息
import requests
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import logging
def getHtml(code, start_date, end_date, page=1, per=20, timeout=10):
    """Fetch one page of the Eastmoney F10DataApi ``lsjz`` query as text.

    Args:
        code: Fund code, sent as the ``code`` query parameter.
        start_date: Sent as ``sdate``.
        end_date: Sent as ``edate``.
        page: Page number to request.
        per: Page size, sent as ``per``.
        timeout: Seconds to wait for the server before ``requests`` raises,
            so a stalled connection cannot block a worker thread forever.

    Returns:
        The response body as a string.
    """
    url = "http://fund.eastmoney.com/f10/F10DataApi.aspx?type=lsjz&code={0}&page={1}&sdate={2}&edate={3}&per={4}".format(
        code, page, start_date, end_date, per)
    rsp = requests.get(url, timeout=timeout)
    return rsp.text
def getFund(name, code, start_date, end_date, page=1, per=20):
    """Download a fund's paged history table and return it as a DataFrame.

    The first page is fetched once to read the table headers and the
    ``pages:<n>`` counter embedded in the response; the remaining pages are
    then fetched in order and every ``<td>`` of every ``<tr>`` is collected.

    Args:
        name: Label used only in the log messages (the caller's thread name).
        code: Fund code passed to ``getHtml``.
        start_date: Start of the date range passed to ``getHtml``.
        end_date: End of the date range passed to ``getHtml``.
        page: Unused; kept so existing callers keep working. All pages are
            always downloaded.
        per: Page size passed to ``getHtml``.

    Returns:
        A DataFrame indexed by '净值日期' (ascending), with '单位净值',
        '累计净值' and '日增长率' converted to float, or ``None`` when no
        rows were found or any step failed. Failures are printed and logged
        to ``exec.log``; they are never raised to the caller.
    """
    log_format = "%(asctime)s - %(message)s"
    logging.basicConfig(filename="exec.log", format=log_format, level=logging.INFO)
    try:
        # Pass page/per by keyword: passing ``per`` positionally would land
        # in the ``page`` slot and request the wrong page at the wrong size.
        html = getHtml(code, start_date, end_date, page=1, per=per)
        soup = BeautifulSoup(html, "html.parser")

        # Total number of pages, read from the "pages:<n>" counter.
        total_page = int(re.search(r'pages:\s*(\d+)', html).group(1))

        # Column names come from the header cells of the first page.
        heads = [th.contents[0] for th in soup.find_all("th")]

        records = []
        for current_page in range(1, total_page + 1):
            # Page 1 is already parsed above; only later pages need a request.
            if current_page > 1:
                html = getHtml(code, start_date, end_date, page=current_page, per=per)
                soup = BeautifulSoup(html, "html.parser")
            for row in soup.find_all("tbody")[0].find_all("tr"):
                # An empty cell becomes NaN so every row keeps its width.
                records.append([
                    cell.contents[0] if cell.contents else np.nan
                    for cell in row.find_all("td")
                ])

        if not records:
            return None

        # Build the frame column by column from the collected rows.
        np_records = np.array(records)
        fund_df = pd.DataFrame()
        for col, col_name in enumerate(heads):
            fund_df[col_name] = np_records[:, col]

        # Sort chronologically and index by date.
        fund_df['净值日期'] = pd.to_datetime(fund_df['净值日期'], format='%Y/%m/%d')
        fund_df = fund_df.sort_values(by='净值日期', axis=0, ascending=True).set_index('净值日期')

        # Numeric columns arrive as text; the growth rate carries a '%' sign.
        fund_df['单位净值'] = fund_df['单位净值'].astype(float)
        fund_df['累计净值'] = fund_df['累计净值'].astype(float)
        fund_df['日增长率'] = fund_df['日增长率'].str.strip('%').astype(float)

        finish_info = "thread_name: {} code: {} finish!".format(name, code)
        logging.info(finish_info)
        return fund_df
    except Exception as e:
        # Best-effort crawl: report the failure and let the caller skip it.
        print(e)
        error_info = "thread_name: {} code: {} error!".format(name, code)
        logging.info(error_info)
        return None
3、开始分析基金数据并排序。我使用了“日增长率方差 × 日增长率标准差 / 日增长率平均值²”(即 $\mathrm{Var}(r)\cdot\mathrm{Std}(r)/\bar{r}^{2}$)作为基准进行排序:算出来的值越低,说明偏离和离散程度越低、收益率越高。
import GetFundInfo as gf
import GetFundCode as gd
import datetime
import numpy as np
import threading as tr
import os
# Module-level lock; nothing in the visible code acquires it.
threadLock = tr.Lock()
def takeSecond(elem):
    """Return the item at index 2 of ``elem``.

    Despite the function name, this is the third item, not the second.
    """
    return elem[2]
def compareTime(time):
    """Return True when ``time`` is on or before 2015-01-01 00:00.

    The caller passes the earliest date in a fund's history and skips the
    fund when this returns False, i.e. when its data starts after the cutoff
    and there is not enough history to back-test.

    Args:
        time: A datetime-like value comparable with ``datetime.datetime``.

    Returns:
        ``True`` if ``time <= 2015-01-01``, otherwise ``False``.
    """
    cutoff = datetime.datetime(2015, 1, 1)
    return bool(time <= cutoff)
def analysis(name, codes, analysis_list):
    """Score every fund in ``codes`` and append the results to a shared list.

    For each code the history from 2014-01-01 to 2021-01-01 is requested via
    ``gf.getFund``. A fund is skipped when no data comes back, when
    ``compareTime`` rejects its earliest date, when its mean daily growth
    rate is not positive, or when its score is not a finite number.

    The score is ``variance * standard deviation / mean ** 2`` of the
    '日增长率' column.

    Args:
        name: Label forwarded to ``gf.getFund`` (the worker's thread name).
        codes: Iterable of fund codes to process.
        analysis_list: List that receives one
            ``(code, mean, variance, std, score)`` tuple per accepted fund.
    """
    for code in codes:
        # The try sits inside the loop so one failing fund is reported and
        # skipped instead of dropping the rest of the batch.
        try:
            fund_df = gf.getFund(name, code, start_date='2014-01-01', end_date='2021-01-01')
            if fund_df is None:
                continue
            if not compareTime(fund_df.index.min()):
                continue

            growth = fund_df['日增长率']
            df_mean = growth.mean(axis=0)
            # Written as "not > 0" so a NaN mean is rejected as well.
            if not df_mean > 0:
                continue
            df_var = growth.var(axis=0)
            df_td = growth.std(axis=0)

            # Ranking score: variance * std / mean^2.
            df_calculate = df_var * df_td / np.square(df_mean)
            # A NaN/inf score would make the later sort order meaningless.
            if not np.isfinite(df_calculate):
                continue
            analysis_list.append((code, df_mean, df_var, df_td, df_calculate))
        except Exception as e:
            print("analysis error: " + str(e))
def getTheCode():
    """Fetch all fund codes and split them into at most 20 batches.

    Returns:
        A list of lists of codes as returned by ``gd.getAllFundCode``. Every
        batch except possibly the last has the same length. An empty code
        list yields an empty result.
    """
    codes = gd.getAllFundCode()
    if not codes:
        return []

    batch_count = 20
    # Ceiling division: the batch size is never 0 (which would make range()
    # raise) and the number of batches never exceeds batch_count.
    batch_size = -(-len(codes) // batch_count)
    return [codes[i:i + batch_size] for i in range(0, len(codes), batch_size)]
def writeDocument(data):
    """Append ``str(data)`` plus a newline to ``my_info.log`` in the cwd.

    Args:
        data: Any object; it is converted with ``str`` before writing.
    """
    file_name = os.path.join(os.getcwd(), 'my_info.log')
    # The context manager closes the file even if the write fails. UTF-8 is
    # set explicitly because the messages written here are not ASCII.
    with open(file_name, "a", encoding="utf-8") as log_file:
        log_file.write(str(data) + '\n')
def start():
    """Run the whole analysis: one thread per code batch, then rank and log.

    Each batch from ``getTheCode`` is handed to ``analysis`` on its own
    thread, named "thread1", "thread2", ... All threads append to one shared
    result list. After every thread has finished, the results are sorted by
    the score at index 4 (ascending) and written line by line through
    ``writeDocument``. Any exception is printed, not raised.
    """
    try:
        analysis_list = []
        threads = []

        for thread_number, codes in enumerate(getTheCode(), start=1):
            t = tr.Thread(target=analysis, args=("thread" + str(thread_number), codes, analysis_list))
            t.start()
            threads.append(t)
        writeDocument("开始运行!")

        # Block until every worker has finished appending.
        for t in threads:
            t.join()
        writeDocument("完成添加开始排序-------------")

        for infos in sorted(analysis_list, key=lambda item: item[4]):
            writeDocument(infos)
        writeDocument("所有线程任务完成")
    except Exception as e:
        print("start error: " + str(e))