python爬虫实战

import pandas as pd
import re
import urllib.request
import tushare as ts
import os

# Scrape the CSI 300 constituent table from Wikipedia with regular
# expressions and store the six-digit stock codes in a CSV file.

# Accumulators filled by the row loop below (one entry per regex hit).
index_list = []
index_list1 = []
company_list = []
stock_exchange_list = []
weighting_list = []
segment_list = []

url = "https://en.wikipedia.org/wiki/CSI_300_Index"

# Install a global opener so every urllib request carries this User-Agent.
headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36")
opener = urllib.request.build_opener()
opener.addheaders = [headers]
urllib.request.install_opener(opener)

# The response is closed as soon as the body has been read.
# str() on the raw bytes is deliberate: it yields the b'...' repr, in which
# line breaks appear as the two characters "\n", so "." in the patterns
# below can run across what were separate lines of HTML.
with urllib.request.urlopen(url) as file:
    data = str(file.read())

# HTML separates a tag name from its attributes with whitespace, so the
# tag patterns accept optional whitespace (\s*) at those positions.
pat = (r'<table\s*class="wikitable sortable">(.*)'
       r'<h2><span\s*class="mw-headline"\s*id="Sub-Indicies">Sub-Indicies</span>'
       r'<span\s*class="mw-editsection">')
data1 = str(re.findall(pat, data))

# Narrow the text down to the table body, then split it into rows.
csi_1 = r'<tbody><tr>(.*)<\/td><\/tr><\/tbody><\/table>'
csi = str(re.findall(csi_1, data1, re.S))
hang = r'<tr>(.*?)<\/td><\/tr>|<\/td><\/tr><\/tbody><\/table>'
zhenghang = re.findall(hang, csi)

# Per-column patterns; they do not depend on the row, so they are built once.
index_1 = r'<td>(\d{6})'
# A single capture group makes findall return the company name as a string;
# the character class accepts letters, digits, whitespace, "&", "(", ")", "-".
company_1 = r'<td><a\s*href="/.*?"\s*title=".*?">([\w\s&()\-]{1,100})</a>'
stock_exchange_1 = r'<td>(Shanghai|Shenzhen)'
# NOTE(review): this expects a comma as the decimal separator; confirm
# against the page, which may use "." instead.
weighting_1 = r'<td>(\d,\d{1,2})'
segment_1 = (r'<td>(Financials|Consumer\s*Staples|Consumer Discretionary|Utilities'
             r'|Industrials|Health\s*Care|IT|Energy|Materials'
             r'|Telecommunication Services)')

# `i` stays a module-level name because later top-level code reads it.
for i, row in enumerate(zhenghang):
    index_list.extend(re.findall(index_1, row))
    company_list.extend(re.findall(company_1, row))
    stock_exchange_list.extend(re.findall(stock_exchange_1, row))
    weighting_list.extend(re.findall(weighting_1, row))
    segment_list.extend(re.findall(segment_1, row))

# Only the stock codes are persisted; the column label is given explicitly.
test = pd.DataFrame(data=index_list, columns=["index"])
# Save to disk.
test.to_csv('H:/day/index1.csv')


# Download daily quotes for every CSI 300 constituent and store the
# per-day (close - open) variation of each stock as one CSV column.
import tushare as ts

csi = ts.get_hs300s()
names = csi.name

# Handle to the Tushare Pro API. This requires a Tushare token to have
# been registered beforehand (e.g. with ts.set_token).
pro = ts.pro_api()
# NOTE(review): `csi.code` comes from the legacy get_hs300s() interface;
# confirm its values are in the ts_code format pro.daily expects
# (presumably with an exchange suffix such as ".SH" / ".SZ").

# Fetch the quotes stock by stock.
per_stock_variation = []
for code in csi.code:
    df = pro.daily(ts_code=code, start_date='20121101', end_date='20181101')
    # Index by trading date so that rows of different stocks line up on the
    # same day even when a stock has fewer trading days than the others.
    df = df.set_index('trade_date')
    var = df['close'] - df['open']
    var.name = code
    per_stock_variation.append(var)

# Concatenate once (rows: trading dates, columns: stock codes); an empty
# constituent list yields an empty frame.
if per_stock_variation:
    variation = pd.concat(per_stock_variation, axis=1).sort_index()
else:
    variation = pd.DataFrame()

# Save to disk.
variation.to_csv('cis_300.csv')




# Learn a sparse graph structure from the stock variations and cluster the
# stocks on the estimated covariance.
# NOTE(review): `covariance` and `cluster` are not imported in the visible
# part of this file; presumably `from sklearn import cluster, covariance`
# is required - confirm.

# Load the stored variations (first CSV column is the row index).
variation = pd.read_csv('cis_300.csv', index_col=0)
# Replace missing values with 0.
variation.fillna(0, inplace=True)

# scikit-learn renamed GraphLassoCV to GraphicalLassoCV in 0.20 and removed
# the old name in 0.22; use whichever the installed version provides.
_graph_lasso_cv = (getattr(covariance, 'GraphicalLassoCV', None)
                   or covariance.GraphLassoCV)
edge_model = _graph_lasso_cv()

# Scale every column to unit standard deviation before fitting.
# NOTE(review): a column with zero standard deviation (e.g. one that is all
# zeros after fillna) turns into NaN/inf here - confirm this cannot occur.
X = variation.copy()
X /= X.std(axis=0)
edge_model.fit(X)

# Affinity propagation returns (cluster centre indices, labels); only the
# labels are used.
_, labels = cluster.affinity_propagation(edge_model.covariance_)
n_labels = labels.max()

print(labels)



# Step 4: compute a 2-D embedding of the columns of X for plotting.
# NOTE(review): `manifold` is not imported in the visible part of this
# file; presumably `from sklearn import manifold` is required - confirm.
node_position_model = manifold.LocallyLinearEmbedding(
    n_components=2, eigen_solver='dense', n_neighbors=6)
# X.T is passed in, and the result is transposed back so that embedding[0]
# and embedding[1] hold the two coordinates.
embedding = node_position_model.fit_transform(X.T).T




# Step 5: visualisation.
# NOTE(review): `np`, `plt` and `LineCollection` are not imported in the
# visible part of this file; presumably numpy, matplotlib.pyplot and
# matplotlib.collections.LineCollection are intended - confirm.

# Text style for the node labels.
# NOTE(review): SimHei is presumably chosen so Chinese names render - confirm.
font = {'family': 'SimHei',
          'color': 'black',
          'weight': 'normal',
          'size': 18,
          }

plt.figure(1, facecolor='w', figsize=(10, 8))
plt.clf()
ax = plt.axes([0., 0., 1., 1.])
plt.axis('off')

# Scale the precision matrix by d along both axes (d is 1/sqrt of its
# diagonal), then keep only the upper-triangle entries whose magnitude
# exceeds 0.02 as edges.
partial_correlations = edge_model.precision_.copy()
d = 1 / np.sqrt(np.diag(partial_correlations))
partial_correlations *= d
partial_correlations *= d[:, np.newaxis]
non_zero = (np.abs(np.triu(partial_correlations, k=1)) > 0.02)

# Nodes: position from the embedding, colour from the cluster label.
plt.scatter(embedding[0], embedding[1], s=100 * d ** 2, c=labels,
            cmap=plt.cm.nipy_spectral)

# Edges: one line segment per retained matrix entry, with colour and width
# driven by the entry's magnitude.
start_idx, end_idx = np.where(non_zero)

segments = [[embedding[:, start], embedding[:, stop]]
            for start, stop in zip(start_idx, end_idx)]
values = np.abs(partial_correlations[non_zero])
lc = LineCollection(segments,
                    zorder=0, cmap=plt.cm.hot_r,
                    norm=plt.Normalize(0, .7 * values.max()))
lc.set_array(values)
lc.set_linewidths(15 * values)
ax.add_collection(lc)

# Labels: for each node, look at the node closest along the other axis and
# place the text on the side pointing away from it.
for index, (name, label, (x, y)) in enumerate(
        zip(names, labels, embedding.T)):
    dx = x - embedding[0]
    dx[index] = 1  # exclude the node itself from the argmin below
    dy = y - embedding[1]
    dy[index] = 1
    this_dx = dx[np.argmin(np.abs(dy))]
    this_dy = dy[np.argmin(np.abs(dx))]
    if this_dx > 0:
        horizontalalignment = 'left'
        x = x + .002
    else:
        horizontalalignment = 'right'
        x = x - .002
    if this_dy > 0:
        verticalalignment = 'bottom'
        y = y + .002
    else:
        verticalalignment = 'top'
        y = y - .002
    plt.text(x, y, name, fontdict=font, size=10,
             horizontalalignment=horizontalalignment,
             verticalalignment=verticalalignment,
             bbox=dict(facecolor='w',
                       edgecolor=plt.cm.nipy_spectral(label / float(n_labels)),
                       alpha=.6))

# Pad the axis limits by a fraction of the coordinate range. np.ptp() is
# used because the ndarray.ptp() method was removed in NumPy 2.0.
x_span = np.ptp(embedding[0])
y_span = np.ptp(embedding[1])
plt.xlim(embedding[0].min() - .15 * x_span,
         embedding[0].max() + .10 * x_span)
plt.ylim(embedding[1].min() - .03 * y_span,
         embedding[1].max() + .03 * y_span)

plt.show()

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值