7.6 沪深300指数走势预测
import os
# Make the desktop folder the working directory so that the relative Excel
# file names passed to pd.read_excel further down resolve against it.
# NOTE(review): this absolute path is machine-specific; adjust it before
# running on another computer.
os. chdir( "C:\\Users\\Administrator\\Desktop" )
import pandas as pd
import numpy as np
7.6.1 读取数据
# Load the CSI 300 index quotes from the workbook in the current working
# directory, then preview the first six rows (notebook display expression).
td = pd.read_excel('index300.xlsx')
td.head(6)
   Indexcd    Idxtrd01  Idxtrd02  Idxtrd03  Idxtrd04  Idxtrd05   Idxtrd06    Idxtrd07  Idxtrd08
0      300  2014-01-02   2323.43   2325.99   2310.65   2321.98  451942.91  4901221.11   -0.3454
1      300  2014-01-03   2311.97   2314.84   2280.89   2290.78  597826.45  5773970.99   -1.3436
2      300  2014-01-06   2286.37   2286.37   2229.33   2238.64  663004.03  5997936.01   -2.2762
3      300  2014-01-07   2222.31   2246.79   2218.65   2238.00  437531.03  4256564.81   -0.0284
4      300  2014-01-08   2240.64   2262.58   2228.42   2241.91  513488.54  5069148.89    0.1747
5      300  2014-01-09   2236.97   2258.89   2220.80   2222.22  559870.41  5439949.13   -0.8783
7.6.2 计算各种指标
# Build the seven indicator series A1..A7 from the quote columns.
# The 10-row rolling means are computed once and reused; their first nine
# entries are NaN because the window is not yet full.
# NOTE(review): judging from the preview, Idxtrd02..Idxtrd05 look like
# open/high/low/close prices and Idxtrd06 like volume -- confirm against
# the data dictionary.
ma10_idxtrd05 = td['Idxtrd05'].rolling(10).mean()
ma10_idxtrd06 = td['Idxtrd06'].rolling(10).mean()

A1 = td['Idxtrd05'].values / ma10_idxtrd05   # Idxtrd05 relative to its own 10-row mean
A2 = td['Idxtrd06'].values / ma10_idxtrd06   # Idxtrd06 relative to its own 10-row mean
A3 = td['Idxtrd08'].values                   # Idxtrd08 taken as-is
A4 = td['Idxtrd03'].values / ma10_idxtrd05   # Idxtrd03 relative to the Idxtrd05 mean
A5 = td['Idxtrd04'].values / ma10_idxtrd05   # Idxtrd04 relative to the Idxtrd05 mean
A6 = td['Idxtrd03'].values - td['Idxtrd04'].values
A7 = td['Idxtrd05'].values - td['Idxtrd02'].values

print(A1)
print(A5)
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
240 1.008022
241 1.033503
242 1.029035
243 1.024952
244 1.042181
Name: Idxtrd05, Length: 245, dtype: float64
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
240 0.975050
241 1.000262
242 1.014258
243 1.014464
244 1.018230
Name: Idxtrd05, Length: 245, dtype: float64
# Gather the seven indicators into one feature table (columns A1..A7, in
# that order) and preview the first six rows.
X = pd.DataFrame(dict(A1=A1, A2=A2, A3=A3, A4=A4, A5=A5, A6=A6, A7=A7))
X.head(6)
   A1  A2      A3  A4  A5     A6      A7
0 NaN NaN -0.3454 NaN NaN  15.34   -1.45
1 NaN NaN -1.3436 NaN NaN  33.95  -21.19
2 NaN NaN -2.2762 NaN NaN  57.04  -47.73
3 NaN NaN -0.0284 NaN NaN  28.14   15.69
4 NaN NaN  0.1747 NaN NaN  34.16    1.27
5 NaN NaN -0.8783 NaN NaN  38.09  -14.75
# Drop the first nine rows (the 10-row rolling means are NaN there) and the
# final row, keeping positions 9 .. len-2.
X = X.iloc[9:-1]
计算决策变量Y
# Decision variable: direction of the next-row change in Idxtrd05.
# Element k of the difference is value[k + 1] - value[k]; the first nine
# differences are skipped, then positive changes become +1 and zero or
# negative changes become -1.
idxtrd05 = td['Idxtrd05'].values
Y = (idxtrd05[1:] - idxtrd05[:-1])[9:]
went_up = Y > 0
not_up = Y <= 0
Y[went_up] = 1
Y[not_up] = -1
Y = Y.reshape(-1, 1)   # column vector, one label per row
print(Y)
[[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[-1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[-1.]
[ 1.]
[ 1.]
[-1.]
[-1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]
[ 1.]]
7.6.3 模型求解
# Chronological split: the last 30 samples form the test set, everything
# before them is used for training.
n_test = 30
x_train, x_test = X.iloc[:len(X) - n_test, :], X.iloc[len(X) - n_test:, :]
Y_train, Y_test = Y[:len(Y) - n_test], Y[len(Y) - n_test:]
支持向量机模型
# Support vector machine classifier with an RBF kernel.
from sklearn import svm

clf = svm.SVC(kernel='rbf')
# scikit-learn expects a 1-D target of shape (n_samples,). Y_train is an
# (n, 1) column vector, which triggers a DataConversionWarning, so it is
# flattened for fit/score. The fitted model is unchanged.
clf.fit(x_train, Y_train.ravel())
rv1 = clf.score(x_train, Y_train.ravel())   # accuracy on the training set

R = clf.predict(x_test)
R = R.reshape(len(R), 1)        # back to a column so it lines up with Y_test
Z = R - Y_test                  # zero wherever prediction equals the label
Rs1 = len(Z[Z == 0]) / len(Z)   # accuracy on the held-out test set
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
逻辑回归
# Logistic regression classifier with default settings.
from sklearn.linear_model import LogisticRegression as LR

lr = LR()
# scikit-learn expects a 1-D target of shape (n_samples,). Y_train is an
# (n, 1) column vector, which triggers a DataConversionWarning, so it is
# flattened for fit/score. The fitted model is unchanged.
lr.fit(x_train, Y_train.ravel())
rv2 = lr.score(x_train, Y_train.ravel())   # accuracy on the training set

R = lr.predict(x_test)
R = R.reshape(len(R), 1)        # back to a column so it lines up with Y_test
Z = R - Y_test                  # zero wherever prediction equals the label
Rs2 = len(Z[Z == 0]) / len(Z)   # accuracy on the held-out test set
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
神经网络模型
# Multi-layer perceptron classifier: two hidden layers (5 and 2 units),
# L-BFGS solver, fixed random_state for reproducibility.
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
# scikit-learn expects a 1-D target of shape (n_samples,). Y_train is an
# (n, 1) column vector, which triggers a DataConversionWarning, so it is
# flattened for fit/score. The fitted model is unchanged.
clf.fit(x_train, Y_train.ravel())
rv3 = clf.score(x_train, Y_train.ravel())   # accuracy on the training set

R = clf.predict(x_test)
R = R.reshape(len(R), 1)        # back to a column so it lines up with Y_test
Z = R - Y_test                  # zero wherever prediction equals the label
Rs3 = len(Z[Z == 0]) / len(Z)   # accuracy on the held-out test set
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:63: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
return f(*args, **kwargs)
# Report training-set accuracy for the three models, a separator line, then
# the accuracy on the 30 held-out samples.
for caption, score in (
    ('支持向量机模型准确率:', rv1),
    ('逻辑模型准确率:', rv2),
    ('神经网络模型准确率:', rv3),
):
    print(caption, score)
print('---------------------------------------------')
for caption, score in (
    ('支持向量机模型预测准确率:', Rs1),
    ('逻辑模型预测准确率:', Rs2),
    ('神经网络模型预测准确率:', Rs3),
):
    print(caption, score)
支持向量机模型准确率: 0.5707317073170731
逻辑模型准确率: 0.5658536585365853
神经网络模型准确率: 0.5073170731707317
---------------------------------------------
支持向量机模型预测准确率: 0.7666666666666667
逻辑模型预测准确率: 0.7666666666666667
神经网络模型预测准确率: 0.8
7.7 基于主成分聚类的上市公司盈利能力分析
7.7.1 数据获取
# Load the financial-indicator workbook from the working directory and
# preview the first six rows (notebook display expression).
data = pd.read_excel('财务指标数据.xlsx')
data.head(6)
    Stkcd      Accper  F050502B  F050102B  F050202B  F051201B  F051501B  F053301B  F051401B  F052101B
0     667  2015-12-31  0.072496  0.030630  0.025459  0.045300  0.090794  0.287946  0.125722  0.160675
1     838  2015-12-31  0.075536  0.020626  0.019593  0.025699  0.116062  0.348287  0.066612  0.112519
2  600816  2015-12-31  0.424511  0.284350  0.284350  0.188029  0.666531       NaN  0.892360       NaN
3  600358  2015-12-31  0.042172  0.047514  0.017612  0.051383  0.145928  0.867484  0.393601  0.241220
4  601155  2015-12-31  0.183725  0.042560  0.040358  0.093632  0.101813  0.268085  0.138837  0.172821
5    2231  2015-12-31  0.007149  0.005577  0.005543  0.006671  0.012131  0.251830  0.003421  0.021860
7.7.2 数据清理
# Keep the stock code (column 0) and the eight indicator columns (2..9),
# skipping column 1. Non-positive entries are masked to NaN, every row that
# then contains a NaN is dropped, and the result is converted to a plain
# NumPy array.
data2 = data.iloc[:, [0] + list(range(2, 10))]
data2 = data2[data2 > 0].dropna().values
print(data2)
[[6.670000e+02 7.249600e-02 3.063000e-02 ... 2.879460e-01 1.257220e-01
1.606750e-01]
[8.380000e+02 7.553600e-02 2.062600e-02 ... 3.482870e-01 6.661200e-02
1.125190e-01]
[6.003580e+05 4.217200e-02 4.751400e-02 ... 8.674840e-01 3.936010e-01
2.412200e-01]
...
[6.100000e+01 7.233000e-03 1.635800e-02 ... 4.855560e-01 2.796000e-03
8.079900e-02]
[4.020000e+02 8.117500e-02 3.283800e-02 ... 3.005470e-01 2.048840e-01
2.474820e-01]
[2.000540e+05 1.753463e+00 1.647230e-01 ... 6.664500e-02 1.749230e-01
1.550150e-01]]
# Outlier removal, one indicator column at a time: keep only rows whose
# value is below eight times the column mean. The filter is sequential, so
# each column's mean is taken over the rows that survived earlier columns.
for col in range(1, 9):
    upper_bound = 8 * np.mean(data2[:, col])
    keep = data2[:, col] < upper_bound
    data2 = data2[keep, :]
# Restrict the cleaned indicator matrix to companies in the '计算机'
# industry of the industry-classification workbook.
dta = pd.read_excel('申万行业分类.xlsx')
stkcd = dta.loc[dta['行业名称'].values == '计算机', '股票代码'].values

s = data2[:, 0]          # stock codes of the cleaned rows
# Vectorised membership test replaces the element-by-element OR loop; it
# also yields an all-False mask (instead of an IndexError) when the
# industry has no codes.
I = np.isin(s, stkcd)
ddata = data2[I, :]      # rows for the selected industry, code column included
X = ddata[:, 1:]         # indicator columns only
# Rescale every indicator column of X with MinMaxScaler (default range).
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X)   # fit() returns the fitted scaler itself
X = scaler.transform(X)
7.7.3 主成分分析
# Principal component analysis: keep as many components as are needed to
# explain 95% of the variance, then min-max rescale the component scores.
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
Y = pca.fit_transform(X)               # component scores
tzxl = pca.components_                 # principal axes
tz = pca.explained_variance_           # variance explained per component
gxl = pca.explained_variance_ratio_    # share of variance per component

scaler = MinMaxScaler().fit(Y)         # fit() returns the fitted scaler itself
Y = scaler.transform(Y)
print(Y)
[[1. 0.35271541 0. ]
[0.39016096 0.53879807 0.57018667]
[0.30010244 0.39406656 0.56879495]
[0.27039046 0.2964222 0.46070532]
[0.48672129 0.35397777 0.38666498]
[0.21309201 0.25788551 0.45081498]
[0.27912283 0.54280988 0.51318843]
[0.1572944 0.50142123 0.36082199]
[0.19677837 0.3616009 0.48022782]
[0.38521072 0.49713145 0.72720359]
[0.14710215 0.39917128 0.39988137]
[0.16162013 0.48137942 0.3416155 ]
[0.52232947 0.52497807 0.41118528]
[0.45481913 0.48211103 0.61319047]
[0.02613532 0.40745569 0.46097522]
[0.19409062 0.62824552 0.47880802]
[0.21853564 0.53107475 0.459496 ]
[0.15451249 0.2584657 0.27652781]
[0.08716958 0.45388591 0.41742003]
[0.36001074 0.16638344 0.0750454 ]
[0.25176958 0.3232513 0.30290889]
[0.56960641 0.59926141 0.6247297 ]
[0.1351583 0.29402457 0.53914207]
[0.27232579 0.28188235 0.72458263]
[0.16228433 0.61384212 0.46062233]
[0.2289046 0.66079344 0.48278713]
[0.398054 0.48727234 0.4361368 ]
[0.30570314 0.57964964 0.67379527]
[0.42098929 0. 0.42194875]
[0.12282552 0.31680236 0.39509641]
[0.11417217 0.50357034 0.40670324]
[0.4014705 0.64036035 0.47896145]
[0.25641914 0.75649382 0.50332891]
[0.46193257 0.74745698 0.55891119]
[0.19872465 0.42003761 0.42507615]
[0.06461358 0.49796433 0.41328105]
[0.13887415 0.19118031 0.59227063]
[0.33953678 0.20816981 0.53728959]
[0.28806008 0.5661635 0.37056341]
[0.58050128 0.36931633 1. ]
[0.31451271 0.1343195 0.46261752]
[0.23753148 0.36934836 0.23481502]
[0.03163158 0.32978406 0.49743078]
[0. 0.50050413 0.24093191]
[0.37779225 0.54069871 0.56592613]
[0.28105514 0.40015422 0.49895377]
[0.18592627 0.25201196 0.85142195]
[0.11816703 0.245758 0.58946425]
[0.52577689 1. 0.61892863]
[0.2766132 0.40461408 0.61805954]
[0.36167024 0.13410407 0.99959069]
[0.24300298 0.33816989 0.36047089]
[0.35341 0.41121232 0.35831831]
[0.27428105 0.57993231 0.62323712]
[0.31181231 0.3417833 0.58027724]
[0.2306254 0.46532333 0.49240407]
[0.31088556 0.63316711 0.38927324]
[0.27945894 0.34491699 0.46414593]
[0.1415173 0.33861322 0.55011358]
[0.42270587 0.40673737 0.32336965]
[0.17331604 0.48086027 0.32196112]
[0.39871826 0.65131775 0.61251796]
[0.19795737 0.44479349 0.57140924]
[0.18988548 0.27265764 0.59527533]
[0.64756809 0.26874946 0.20169429]
[0.20789073 0.60402229 0.43318674]
[0.24534499 0.47924495 0.42494554]
[0.21588013 0.62823678 0.47726656]
[0.29152982 0.56142069 0.57746981]
[0.23964301 0.3610168 0.31973727]
[0.14006785 0.38229718 0.51898427]
[0.24160422 0.39956274 0.30514966]
[0.43254606 0.43522082 0.85266599]
[0.10428289 0.42424925 0.51831267]
[0.18135159 0.41260571 0.32491939]
[0.29927567 0.37596953 0.36773508]
[0.28565189 0.15749429 0.48514758]
[0.35741342 0.35168311 0.47948626]
[0.21023831 0.13466273 0.79021532]
[0.27476166 0.52728366 0.42556418]
[0.43864979 0.38382313 0.34667911]
[0.48912006 0.51053069 0.25666985]
[0.5792899 0.48315188 0.47186856]
[0.08946935 0.60192197 0.34586447]
[0.08836171 0.24211162 0.58207393]
[0.43032257 0.56941608 0.54411348]
[0.39972691 0.51887879 0.3129126 ]
[0.45183958 0.14529451 0.38886917]
[0.22710544 0.30796581 0.44015714]
[0.38234097 0.27028947 0.58782501]
[0.37808409 0.29362605 0.30143069]
[0.16424462 0.53304101 0.46728502]
[0.3915905 0.52237116 0.49260511]
[0.15294694 0.41858487 0.43664275]
[0.42325175 0.03416457 0.56357409]
[0.09925005 0.33448112 0.58506441]
[0.38732987 0.00316802 0.12895581]
[0.44630218 0.56498237 0.35338581]
[0.24619225 0.27918856 0.61966313]
[0.18725085 0.23401407 0.49428424]
[0.26648399 0.3242053 0.57301473]
[0.25558567 0.35664406 0.51600676]
[0.24587741 0.15197482 0.56010903]
[0.43851468 0.0645047 0.40736009]
[0.31288722 0.34411938 0.37381343]
[0.20092546 0.37832537 0.43608698]
[0.2744214 0.43821201 0.46568662]
[0.23056008 0.23317136 0.66354976]
[0.20773999 0.49743118 0.43396559]
[0.29313224 0.48525078 0.49252522]
[0.2341254 0.3597684 0.37329531]
[0.05266023 0.50083512 0.39115158]
[0.32669163 0.84759859 0.60215241]
[0.30555832 0.34529031 0.25144697]
[0.37541006 0.4471835 0.41326947]
[0.50835938 0.23696001 0.68037798]
[0.47968784 0.31902553 0.30531284]
[0.47824868 0.50241261 0.53507055]
[0.24190473 0.51344189 0.4013775 ]
[0.09110503 0.27711092 0.53113462]
[0.31704437 0.4750707 0.28918813]
[0.17131301 0.41297434 0.35863972]
[0.46122029 0.28651035 0.76438713]
[0.21306854 0.57312665 0.5193129 ]
[0.31135553 0.35684707 0.46800267]
[0.21752449 0.70726552 0.49147669]
[0.27467357 0.6547978 0.47292997]
[0.5016703 0.42772846 0.51741645]
[0.05422398 0.40111387 0.23208408]
[0.11352917 0.66592656 0.35955919]]
7.7.4 K-Means聚类
# K-Means clustering of the rescaled principal-component scores, followed by
# one Excel export per cluster listing its member companies.
from sklearn.cluster import KMeans

model = KMeans(n_clusters=5, random_state=0, max_iter=1000)
model.fit(Y)
c = model.labels_
center = pd.DataFrame(model.cluster_centers_)
# PCA(n_components=0.95) picks the number of components from the data, so
# the labels Y1..Yk are derived from the actual width instead of being
# hard-coded to three.
center.columns = ['Y' + str(k + 1) for k in range(center.shape[1])]

# Cluster label per company, indexed by stock code and sorted by label.
Fs = pd.Series(c, index=ddata[:, 0])
Fs = Fs.sort_values()

# Lookup table from stock code to company name.
co = pd.read_excel('公司基本信息表.xlsx')
co1 = pd.Series(co['Stknme'].values, index=co['Stkcd'].values)

# Write the members of cluster i to c<i>.xlsx; the loop bound follows the
# model's configured cluster count.
for i in range(model.n_clusters):
    q = co1[Fs[Fs == i].index]
    q = pd.DataFrame(q)
    q.to_excel('c' + str(i) + '.xlsx')
7.7.5 计算每类公司总利润平均增长率
# For every cluster, compare the mean of column B002000101 across its
# member companies between the 2014-12-31 and 2015-12-31 periods, and
# append the relative change to the table of cluster centres.
rd = pd.read_excel('利润数据.xlsx')

# The period masks do not depend on the company, so they are built once
# instead of once per company per cluster.
I1 = rd['Accper'].values == '2014-12-31'
I2 = rd['Accper'].values == '2015-12-31'
stock_codes = rd['Stkcd'].values

r_c = []
for n in range(len(center)):          # one pass per cluster centre
    cn = list(Fs[Fs == n].index)      # stock codes assigned to cluster n
    r1_n = 0                          # running 2014 total
    r2_n = 0                          # running 2015 total
    for t in cn:
        I3 = stock_codes == t
        index1 = I1 & I3
        index2 = I2 & I3
        r1 = rd.loc[index1, 'B002000101'].values
        r2 = rd.loc[index2, 'B002000101'].values
        # A company with no record for a period contributes nothing to
        # that period's total.
        if len(r1) > 0:
            r1_n = r1_n + r1
        if len(r2) > 0:
            r2_n = r2_n + r2
    p2 = r2_n / len(cn)
    p1 = r1_n / len(cn)
    r_c.append((p2 - p1) / p1)        # relative change of the cluster mean

r_c = np.array(r_c)
dt = np.hstack((center.values, r_c))
dtt = pd.DataFrame(dt)
# Reuse the centre table's own labels so the header always matches the
# number of principal components.
dtt.columns = list(center.columns) + ['r_c']
print(dtt)
Y1 Y2 Y3 r_c
0 0.229527 0.289527 0.537123 0.055348
1 0.492402 0.251426 0.272615 1.358886
2 0.196778 0.450577 0.376927 0.239998
3 0.367587 0.600588 0.529996 0.340677
4 0.376598 0.266334 0.832905 -0.160602