主要步骤:
1、爬取天天基金的评论和发布评论的时间
2、统计每天有几条评论
3、取7日移动平均值
4、从 Wind 下载易方达蓝筹的净值数据,画图
结论
1、基本没啥意义, 天天基金能取到的数据有限。
2、不过很好爬,是个好网站
3、或许这个关注度可以证明损失厌恶效应:投资者对损失更加敏感。
import requests
import re
import xlwt
import pandas as pd
# HTTP request headers sent with every page fetch (a desktop Chrome User-Agent).
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
}

# Captures everything between `<div class="articleh` and the next `</div>`;
# DOTALL lets the capture run across line breaks.
findcom = re.compile(
    r'<div class="articleh(.*?)</div>',
    flags=re.DOTALL,
)
# Captures the value of a `title="..."` attribute that is immediately
# followed by `>`.
findcomment = re.compile(r'title="(.*?)">')
# Captures the text inside `<span class="l5">...</span>`.
findtime = re.compile(
    r'<span class="l5">(.*?)</span>',
    flags=re.DOTALL,
)

# Output workbook path.
savepath = "易方达蓝筹.xls"
# Probe request: fetch list page 1 and extract the title and time string of
# its first post row (presumably a quick check that the patterns still match
# the page markup before the full crawl).
url = 'http://guba.eastmoney.com/list,of005827,f_1.html'
# Without a timeout, requests waits indefinitely on a stalled connection.
result = requests.get(url, headers=header, timeout=10)
text = result.text
com = re.findall(findcom, text)
if com:
    # Only the first matched row is inspected.
    first_row = com[0]
    comment = re.findall(findcomment, first_row)[0]
    # NOTE: `time` is a plain string variable here, not the stdlib module.
    time = re.findall(findtime, first_row)[0]
# Crawl the list pages of the fund's discussion board and collect, for every
# post row, its title and its time string into two parallel lists.
commentlist = []
timelist = []
# Pages 1..1246 are fetched (presumably the page count when the script was
# written - adjust as needed).
last_page = 1246
page_url = 'http://guba.eastmoney.com/list,of005827,f_%d.html'
# A single Session reuses the connection across the ~1.2k requests.
with requests.Session() as session:
    session.headers.update(header)
    for page in range(1, last_page + 1):
        # The timeout keeps one stalled connection from hanging the crawl.
        response = session.get(page_url % page, timeout=10)
        for row in findcom.findall(response.text):
            title_match = findcomment.search(row)
            time_match = findtime.search(row)
            if title_match is None or time_match is None:
                # A row missing either field is skipped instead of aborting
                # the whole crawl with an IndexError; appending both values
                # together keeps the two lists aligned.
                continue
            commentlist.append(title_match.group(1))
            timelist.append(time_match.group(1))
# Build the frame directly from named columns; this also yields a valid
# (empty) two-column frame when nothing was scraped.
data = pd.DataFrame(
    {'comment': commentlist, 'time': timelist},
    columns=['comment', 'time'],
)
def saveData(datalist1, datalist2, savepath):
    """Write two parallel lists into an .xls workbook, using extra sheets as needed.

    ``datalist1`` goes to column 1 and ``datalist2`` to column 2, starting at
    row 1 of each sheet (row 0 and column 0 are left empty). Because an .xls
    sheet cannot hold more than 65536 rows, the data is spread evenly over the
    smallest number of sheets that can hold it. The sheets are named
    '基金', '基金1', '基金2', ...

    Args:
        datalist1: Sliceable sequence of cell values for column 1.
        datalist2: Sliceable sequence of cell values for column 2.
        savepath: Destination path of the workbook.
    """
    print("save.......")
    # Row indices 0..65535 are valid in .xls; row 0 stays empty, which leaves
    # room for 65535 data rows per sheet.
    max_rows = 65535
    # Use the longer list so no value is dropped if the lengths differ.
    total = max(len(datalist1), len(datalist2))
    # Ceiling divisions; a workbook needs at least one sheet even when empty.
    sheet_count = max(1, -(-total // max_rows))
    rows_per_sheet = -(-total // sheet_count)

    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    for sheet_no in range(sheet_count):
        sheet_name = '基金' if sheet_no == 0 else '基金' + str(sheet_no)
        sheet = book.add_sheet(sheet_name, cell_overwrite_ok=True)
        start = sheet_no * rows_per_sheet
        stop = start + rows_per_sheet
        for col, values in ((1, datalist1), (2, datalist2)):
            # Slicing simply yields fewer items for a shorter list.
            for row, value in enumerate(values[start:stop], 1):
                sheet.write(row, col, value)
    book.save(savepath)
# Export the collected titles and their time strings to the workbook at `savepath`.
saveData(datalist1=commentlist, datalist2=timelist, savepath=savepath)