tushare ID:441914
我是用jupyter做的分析,先导入相关的库,记得设置tushare的token。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.collections import LineCollection
import matplotlib.cm as cm #colormap
from sklearn.preprocessing import StandardScaler
from sklearn import cluster, covariance, manifold
from sklearn.metrics import silhouette_score # 轮廓系数:评价聚类好坏 越接近1越好
from sklearn.metrics import silhouette_samples
from sklearn.metrics import calinski_harabasz_score# 卡林斯基-哈拉巴斯指数,越高越好,评价聚类好坏
from sklearn.covariance import ShrunkCovariance
from sklearn.covariance import LedoitWolf #收缩协方差估计
import tushare as ts
# Read the tushare API token from token.txt and create the pro API client.
# strip() removes the trailing newline most editors leave at the end of the
# file, which would otherwise make the token invalid.
with open('token.txt') as f:
    token = f.read().strip()
ts.set_token(token)
pro = ts.pro_api()
第一步:定义相关的方法获取数据,并截取 2017-12-31 前上市的股票。
# No.1
# Data acquisition
# (%matplotlib inline is Jupyter-only; keep it commented out in a plain script.)
# Configure matplotlib so Chinese labels and minus signs render correctly.
plt.rcParams.update({'font.sans-serif': ['SimHei'],
                     'axes.unicode_minus': False})
#获取股票代码和名称
def get_code():
    """Return a dict mapping ts_code -> stock name for all listed stocks
    whose list_date is on or before 2017-12-31.

    Relies on the module-level tushare ``pro`` client.
    """
    from datetime import date
    code = pro.stock_basic(exchange='', list_status='L',
                           fields='ts_code,symbol,name,area,industry,list_date')
    # list_date comes back as an 8-digit string (e.g. '19901219');
    # convert it to datetime.date and use it as the index for slicing.
    code['list_date'] = pd.to_datetime(code['list_date'].astype(str)).dt.date
    code.set_index('list_date', inplace=True)
    cutoff = date(2017, 12, 31)
    code = code.loc[code.index <= cutoff, :]
    print('len', len(code))
    codes = code['ts_code'].values
    # Fix: the column is 'name' (lower-case); the original 'code.Name'
    # raised AttributeError.
    names = code['name'].values
    return dict(zip(codes, names))
def get_data(code, start='20130101', end='20210331'):
    """Download forward-adjusted (qfq) daily bars for one stock and return
    them indexed and sorted by trade date."""
    bars = ts.pro_bar(ts_code=code, adj='qfq', start_date=start, end_date=end)
    bars.index = pd.to_datetime(bars.trade_date)
    return bars.sort_index()
def data_process(data):
    """Clean the per-stock variation table and normalise each column.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a 'trade_date' column; every other column is one
        stock's daily variation (close - open).

    Returns
    -------
    X : np.ndarray
        Variations with each column scaled to unit standard deviation.
    indexs : pd.Index
        Labels of the columns dropped for having too many missing values.
    df1 : pd.DataFrame
        The cleaned frame (bad columns dropped, rows with any NaN dropped).
    """
    df1 = data.copy()
    df1.set_index(['trade_date'], inplace=True)
    # NaN count per stock, most-missing first; describe() is printed to help
    # justify the threshold below.
    dd = df1.isna().sum().sort_values(ascending=False)
    print(dd.describe())
    # Drop stocks with more than 112 missing days (threshold chosen so the
    # NaN-count variance sits near its mean), then keep only the trading
    # days common to every remaining stock.
    # (The original also computed an unused 'mid' value here with a
    # precedence bug — dead code, removed.)
    indexs = dd[dd > 112].index
    df1 = df1.drop(indexs, axis=1).dropna()
    X = df1.values.copy()
    X /= X.std(axis=0)  # scale each stock to unit std for the estimator
    print(X.shape)
    return X, indexs, df1
def get_key(d, value):
    """Return every key in *d* whose value equals *value*.

    Renamed the first parameter from 'dict' (it shadowed the builtin);
    all call sites in this file pass it positionally.
    """
    return [k for k, v in d.items() if v == value]
# Sort (code, name) pairs and split into two parallel arrays.
codes, names = np.array(sorted(get_code().items())).T
# Build a (trade_date x stock-name) table of daily close-open variation.
# Fetch each stock's bars ONCE — the original called get_data(code) twice
# per stock (once for .close, once for .open), doubling the API traffic.
series = {}
for code, name in zip(codes, names):
    bars = get_data(code)
    series[name] = bars.close - bars.open
data = pd.DataFrame(series)
第二步进行数据处理,剔除空数据,调整输出格式
# No.2
# Data processing
X, indexs, df = data_process(data)
# Adjust the code->name dict so the later output only contains the stocks
# that survived data_process.  Two fixes versus the original:
#  * the line introducing this section started with '%', which is a
#    SyntaxError outside IPython — it is now a comment;
#  * the loop unpacked items() as (name, code) although stocks maps
#    code -> name, so the comparison never matched and nothing was deleted.
stocks = get_code()
for index in indexs:
    for code, name in list(stocks.items()):
        if name == index:
            del stocks[code]
            break
第三步训练数据
# No.3
# Estimate the covariance structure with Ledoit-Wolf shrinkage, then
# cluster the stocks with affinity propagation on the covariance matrix.
# Note: the original had a bare statement 'assume_centered=True' which did
# nothing — to take effect the option must be passed to the constructor,
# e.g. LedoitWolf(assume_centered=True); the data here is not centered, so
# the default (False) is kept.
edge_model = LedoitWolf()
edge_model.fit(X)
_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = labels.max()
第四步查看结果
# No.4
# Print each cluster's members as 'code - name' pairs.
print(n_labels)
for cluster_id in range(n_labels + 1):
    print('Cluster %i:' % (cluster_id + 1), end='')
    for member in names[labels == cluster_id]:
        # get_key returns every ts_code mapped to this stock name.
        for ts_code in get_key(stocks, member):
            print(' ', ts_code, '-', stocks[ts_code], end=',')
    print()
第五步:对聚类结果进行评价,并将评价结果可视化。
# No.5
# Evaluate the clustering with silhouette scores and visualise them.
n_clusters=n_labels+1
# The silhouette metrics treat rows as samples, so transpose: one row per stock.
X=X.T
# Create the figure holding the silhouette bar chart.
fig,ax1=plt.subplots(1,1)
fig.set_size_inches(20,65)
# x-axis: silhouette coefficient (computed per sample); y-axis: samples
# grouped by cluster.  Silhouette values lie in [-1, 1]; only the useful
# [-0.1, 0.5] range is shown.
ax1.set_xlim([-0.1,0.5])
# Add (n_clusters+1)*20 vertical units on top of the sample count so that
# consecutive clusters are separated by visible gaps.
ax1.set_ylim([0,X.shape[0]+(n_clusters+1)*20])
print('silhouette_score: ',silhouette_score(X,labels))
print('silhouette_samples>0: ',(silhouette_samples(X,labels)>0).sum())
print('calinski_harabasz_score: ',calinski_harabasz_score(X,labels))
silhouette_score_=silhouette_score(X,labels)
print('For n_clusters = ',n_clusters,'The average silhouette_score is :',silhouette_score_)
# Per-sample silhouette coefficients: the bar lengths along the x-axis.
silhouette_samples_=silhouette_samples(X,labels)
# Starting y position of the first cluster's bars.
y_lower=10
for i in range(n_clusters):
    ith_cluster_silhouette_samples_ = silhouette_samples_[labels == i]  # coefficients of cluster i
    ith_cluster_silhouette_samples_.sort()
    size_cluster_i =ith_cluster_silhouette_samples_.shape[0] # number of samples in cluster i
    y_upper = y_lower + size_cluster_i
    # print(y_upper)
    # nipy_spectral maps a float in [0, 1] to a distinct colour per cluster.
    color = cm.nipy_spectral(float(i)/n_clusters)
    # fill_betweenx draws horizontal bars: the y range selects the rows for
    # this cluster, and each silhouette value gives that bar's length on x.
    ax1.fill_betweenx(np.arange(y_lower,y_upper)
    ,ith_cluster_silhouette_samples_
    ,facecolor=color
    ,alpha=0.7
    )
    # Write the cluster index next to its block of bars.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
    y_lower = y_upper + 20 # gap before the next cluster's bars
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# Dashed red line marking the mean silhouette score.
ax1.axvline(x=silhouette_score_, color="red", linestyle="--")
# Hide the y ticks: individual sample indices are not meaningful.
ax1.set_yticks([])
ax1.set_xticks([-0.1,0, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30, 0.35, 0.40])
plt.suptitle(("Silhouette analysis for GraphicalLassoCV clustering on sample data "
"with n_clusters = %d" % n_clusters),
fontsize=14, fontweight='bold')
plt.show()
第六步对数据进行降维且对聚类结果可视化
# No.6
# Reduce the stocks to 2-D and plot the clusters as a graph.
# Compute the low-dimensional embedding (one 2-D point per stock).
node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver='dense', n_neighbors=6)
embedding = node_position_model.fit_transform(X).T
# Visualisation
plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')
# Partial correlations, obtained by normalising the precision matrix by the
# square roots of its diagonal.
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
# Keep only upper-triangular entries above the 0.02 threshold as graph edges.
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
# Plot the stocks at their embedded coordinates, coloured by cluster label.
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,cmap=plt.cm.nipy_spectral)
# Draw an edge between each pair of partially-correlated stocks, with line
# width and colour proportional to the correlation strength.
start_idx, end_idx = np.where(non_zero)
segments = [[embedding[:, start], embedding[:, stop]]
            for start, stop in zip(start_idx, end_idx)]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(segments,
                    zorder=0, cmap=plt.cm.hot_r,
                    norm=plt.Normalize(0, .7 * values.max()))
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)
# Add a label to each node.  The tricky part is positioning the labels so
# they do not overlap with neighbouring labels.
for index, (name, label, (x, y)) in enumerate(
        zip(names, labels, embedding.T)):
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    # Nudge the label away from the nearest neighbour in each direction.
    if this_dx > 0:
        horizontalalignment = 'left'
        x = x + .002
    else:
        horizontalalignment = 'right'
        x = x - .002
    if this_dy > 0:
        verticalalignment = 'bottom'
        y = y + .002
    else:
        verticalalignment = 'top'
        y = y - .002
    plt.text(x, y, name, size=10,
             horizontalalignment=horizontalalignment,
             verticalalignment=verticalalignment,
             bbox=dict(facecolor='w',
                       edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
                       alpha=.6))
# Pad the axis limits so labels near the border stay fully visible.
plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
         embedding[0].max() + .10 * embedding[0].ptp(),)
plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
         embedding[1].max() + .03 * embedding[1].ptp())
plt.show()
最终成果部分截图
学习不易,数据分析新人初来乍到,请大家多多指教。
最后感谢tushare平台,让我们获取股票的数据和进行研究更方便啦!