import pandas as pd
import re
import urllib.request
import tushare as ts
import os
# Accumulators for the scraped CSI 300 constituent table.
index_list = []           # 6-digit stock codes
index_list1 = []          # unused here; kept for backward compatibility
company_list = []         # company names
stock_exchange_list = []  # "Shanghai" / "Shenzhen"
weighting_list = []       # index weightings
segment_list = []         # sector names

url = "https://en.wikipedia.org/wiki/CSI_300_Index"
# Send a browser-like User-Agent, otherwise Wikipedia may reject the request.
# Using a Request object avoids the global install_opener() side effect.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
request = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(request) as file:
    # BUG FIX: str(file.read()) produced the bytes repr "b'...'" with
    # literal \n escape sequences; decode the payload instead so the
    # regexes below see the real HTML text.
    data = file.read().decode("utf-8")
# Cut the constituents table out of the page: everything between the
# "wikitable sortable" table tag and the "Sub-Indicies" section heading.
# BUG FIX: the original patterns were missing spaces inside the HTML tags
# ('<tableclass=', '<spanclass=') and could never match real markup; re.S
# is required because the captured span crosses line boundaries.
pat = r'<table class="wikitable sortable">(.*)<h2><span class="mw-headline" id="Sub-Indicies">Sub-Indicies</span><span class="mw-editsection">'
data1 = str(re.compile(pat, re.S).findall(data))
# Narrow to the table body.
csi_1 = r'<tbody><tr>(.*)</td></tr></tbody></table>'
csi = str(re.findall(csi_1, data1, re.S))
# Split the body into individual <tr> rows (the groupless second
# alternative swallows the table's closing tags).
hang = r'<tr>(.*?)</td></tr>|</td></tr></tbody></table>'
zhenghang = re.findall(hang, csi, re.S)
# Per-field patterns, compiled once outside the loop instead of rebuilt
# on every iteration.
index_re = re.compile(r'<td>(\d{6})')
# BUG FIX: '<ahref'/'"title' were missing spaces, and the '(|)' alternation
# used unescaped parentheses (nested groups made findall return tuples).
# A character class matches names containing word chars, whitespace, '&',
# parentheses and '-' and yields plain strings.
company_re = re.compile(r'<td><a href="/.*?" title=".*?">([\w\s&()-]{1,100})</a>')
stock_exchange_re = re.compile(r'<td>(Shanghai|Shenzhen)')
# NOTE(review): en.wikipedia weightings use a decimal point ("1.23"); the
# original '\d,\d{1,2}' expected a comma -- both separators are accepted.
weighting_re = re.compile(r'<td>(\d+[.,]\d{1,2})')
segment_re = re.compile(r'<td>(Financials|ConsumerStaples|Consumer Discretionary|Utilities|Industrials|HealthCare|IT|Energy|Materials|Telecommunication Services)')
# BUG FIX: the loop body had lost its indentation; restored here.
for row in zhenghang:
    index_list.extend(index_re.findall(row))
    company_list.extend(company_re.findall(row))
    stock_exchange_list.extend(stock_exchange_re.findall(row))
    weighting_list.extend(weighting_re.findall(row))
    segment_list.extend(segment_re.findall(row))
# Persist the scraped constituent codes.
# BUG FIX: 'name' was never defined anywhere in this file (NameError);
# label the single column of stock codes explicitly instead.
test = pd.DataFrame(columns=['code'], data=index_list)
test.to_csv('H:/day/index1.csv')
# Pull the CSI 300 constituent list from tushare. (The duplicate
# 'import tushare as ts' was removed; it is already imported at the top
# of the file.)
csi = ts.get_hs300s()
names = csi.name
# BUG FIX: 'pro' was used but never created, and a stray copy of the
# pro.daily(...) call referenced the loop variable before the loop began.
# NOTE(review): ts.pro_api() needs an API token configured beforehand via
# ts.set_token() -- confirm the runtime environment provides one.
pro = ts.pro_api()
variation = pd.DataFrame()
# Collect each constituent's daily close-open spread as one column.
for code in csi.code:
    df = pro.daily(ts_code=code, start_date='20121101', end_date='20181101')
    var = df['close'] - df['open']
    var.name = code
    variation = pd.concat([variation, var], axis=1)
# Cache the spread matrix on disk, then reload it so the later analysis
# steps can be re-run without refetching the price data.
variation.to_csv('cis_300.csv')
variation = pd.read_csv('cis_300.csv', index_col=0)
# Missing prices become 0 so the covariance estimator sees no NaNs.
# (A forward-fill was previously considered here instead.)
variation = variation.fillna(0)
# Estimate a sparse inverse-covariance structure over the stock spread
# series, then cluster the stocks on the learned covariance matrix.
# NOTE(review): 'covariance' and 'cluster' (sklearn) are never imported in
# this file -- as written this raises NameError; also GraphLassoCV was
# renamed GraphicalLassoCV in scikit-learn 0.20 and removed in 0.22 --
# confirm the target sklearn version.
edge_model= covariance.GraphLassoCV()
X =variation.copy()
# Normalise each stock's series to unit variance before fitting.
X /=X.std(axis=0)
edge_model.fit(X)
# Affinity propagation chooses the number of clusters by itself;
# the first return value (cluster centre indices) is discarded.
_,labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels= labels.max()
print(labels)
# Step 4: project the stocks onto a 2-D plane for plotting.
# BUG FIX: the model construction and the fit_transform call were fused
# onto a single line (a syntax error); split into two statements.
# NOTE(review): 'manifold' (sklearn.manifold) is never imported in this file.
node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver='dense', n_neighbors=6)
embedding = node_position_model.fit_transform(X.T).T
# Step 5: visualisation.
# NOTE(review): 'plt' (matplotlib.pyplot) and 'np' (numpy) are never
# imported in this file -- confirm they are provided by the session.
font = {'family': 'SimHei',
'color': 'black',
'weight': 'normal',
'size': 18,
}
plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')
# Convert the precision matrix into partial correlations and keep only
# strong off-diagonal links (|corr| > 0.02) as graph edges.
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)
# Dot size tracks each node's strength; colour encodes its cluster label.
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
cmap=plt.cm.nipy_spectral)
# Build one line segment per pair of stocks whose partial correlation
# survived the threshold above.
start_idx, end_idx = np.where(non_zero)
segments = [[embedding[:, start], embedding[:, stop]]
for start, stop in zip(start_idx, end_idx)]
values = np.abs(partial_correlations[non_zero])
# Edge colour and width both encode the strength of the link.
# NOTE(review): LineCollection (matplotlib.collections) is never imported
# in this file.
lc = LineCollection(segments,
zorder=0, cmap=plt.cm.hot_r,
norm=plt.Normalize(0, .7 * values.max()))
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)
# Place each stock's name next to its node, nudging the label away from
# the nearest neighbour so labels overlap less (the same placement trick
# as sklearn's stock-market-structure example this plot follows).
# BUG FIX: the body of this loop had lost all indentation (a syntax
# error as written); restored here with the logic unchanged.
for index, (name, label, (x, y)) in enumerate(
        zip(names, labels, embedding.T)):
    dx = x - embedding[0]
    dx[index] = 1
    dy = y - embedding[1]
    dy[index] = 1
    # Direction of the nearest neighbour decides which side the label goes.
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = 'left'
        x = x + .002
    else:
        horizontalalignment = 'right'
        x = x - .002
    if this_dy > 0:
        verticalalignment = 'bottom'
        y = y + .002
    else:
        verticalalignment = 'top'
        y = y - .002
    # Label box border colour matches the node's cluster colour.
    plt.text(x, y, name, fontdict=font, size=10,
             horizontalalignment=horizontalalignment,
             verticalalignment=verticalalignment,
             bbox=dict(facecolor='w',
                       edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
                       alpha=.6))
# Pad the axis limits slightly beyond the embedding's extent (extra room
# on the left where labels are right-aligned), then render the figure.
plt.xlim(embedding[0].min() - .15 * embedding[0].ptp(),
embedding[0].max() + .10 * embedding[0].ptp(),)
plt.ylim(embedding[1].min() - .03 * embedding[1].ptp(),
embedding[1].max() + .03 * embedding[1].ptp())
plt.show()