from time import time

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
C:\Users\shaoqiu\Anaconda3\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
"This module will be removed in 0.20.", DeprecationWarning)
# Load the Bank of China data (GBK-encoded CSV).
df = pd.read_csv(r'G:\\Project\\data\\CB.csv', encoding='gbk')
# Index the rows by date and sort them in ascending date order.
df = df.set_index('date')
df = df.sort_index()
# Turn the percentage change ``p_change`` into a binary label:
# 1 means rise (p_change >= 0), 0 means fall (p_change < 0).
# The column is assigned back explicitly instead of being mutated through
# a ``pd.Series`` alias: that only changes ``df`` when pandas happens to
# share memory and silently does nothing under copy-on-write.
# Missing values compare False and therefore get label 0.
value = (df['p_change'] >= 0).astype(float)
df['p_change'] = value
print(df.head())
open high close low volume price_change p_change \
date
2015-01-05 4.18 4.50 4.42 4.18 23084548.0 0.27 1.0
2015-01-06 4.38 4.74 4.56 4.28 23127260.0 0.14 1.0
2015-01-07 4.46 4.64 4.54 4.44 15485755.0 -0.02 0.0
2015-01-08 4.55 4.57 4.33 4.31 14892726.0 -0.21 0.0
2015-01-09 4.28 4.76 4.47 4.23 22776194.0 0.14 1.0
ma5 ma10 ma20 v_ma5 v_ma10 v_ma20 turnover
date
2015-01-05 4.036 3.862 3.736 20236083.4 20700121.8 18068874.18 1.13
2015-01-06 4.184 3.964 3.791 21016213.4 21901637.0 18406099.90 1.13
2015-01-07 4.322 4.029 3.838 20103937.2 19875380.1 18223578.34 0.76
2015-01-08 4.400 4.090 3.867 19874622.2 18551524.5 18027645.85 0.73
2015-01-09 4.464 4.180 3.901 19873296.6 19332925.8 18291454.08 1.11
# Remove the absolute price change from the table (presumably because it
# carries the same information as the label -- confirm with the author).
# ``axis`` is passed by keyword: pandas 2.0 no longer accepts it
# positionally.
df = df.drop(['price_change'], axis=1)
# Replace missing values with 0 and cast every column to float.
df = df.fillna(0)
df = df.astype(float)
# Feature matrix: every remaining column except ``p_change``,
# standardised per column by ``preprocessing.scale``.
X = np.array(df.drop(['p_change'], axis=1))
X = preprocessing.scale(X)
# Label vector: the ``p_change`` column.
y = np.array(df['p_change'])
print(X.shape)
print(y.shape)
(690, 12)
(690,)
# Train on the first 80% of the rows and test on the last 20%.
# The rows are ordered by date, so the split is done by position: a
# shuffled split would mix later days into the training set and let the
# model be evaluated on days that precede some of its training data.
n_train = len(X) * 4 // 5
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]
# Search for the best SVM hyper-parameters.
print("开始建模")
t0 = time()
# Candidate values: C is the penalty applied to misclassified samples,
# gamma is the RBF kernel coefficient.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5, 5e5, 1e6],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
# class_weight='balanced' weights each class inversely to its sample
# count, so the model does not over-fit whichever class is larger.
svc = SVC(kernel='rbf', class_weight='balanced')
# Every (C, gamma) pair from the grid is tried to find the best one.
clf = GridSearchCV(svc, param_grid)
print(clf)
# Fit the search on the training data.
clf.fit(X_train, y_train)
elapsed = time() - t0
print("time:%0.3fs" % elapsed)
# Details of the best model found by the search.
print(clf.best_estimator_)
开始建模
GridSearchCV(cv=None, error_score='raise',
estimator=SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False),
fit_params=None, iid=True, n_jobs=1,
param_grid={'C': [1000.0, 5000.0, 10000.0, 50000.0, 100000.0, 500000.0, 1000000.0], 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]},
pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
scoring=None, verbose=0)
time:34.609s
SVC(C=5000.0, cache_size=200, class_weight='balanced', coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.005, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
根据网格寻参,得到最优模型 SVC(C=5000.0, cache_size=200, class_weight='balanced', coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.005, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
# Predict labels for the test samples with the fitted search object.
y_pred = clf.predict(X_test)
# Print the prediction score report (precision / recall / f1 per class).
report = classification_report(y_test, y_pred)
print(report)
precision recall f1-score support
0.0 0.83 0.85 0.84 62
1.0 0.88 0.86 0.87 76
avg / total 0.86 0.86 0.86 138
# Print the confusion matrix of the predictions
# (rows: true labels, columns: predicted labels).
cm = confusion_matrix(y_test, y_pred)
print(cm)
[[53 9]
[11 65]]