from sklearn import svm
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tushare as ts
from sklearn import tree
# F1 score is the harmonic mean of precision and recall
from sklearn.metrics import f1_score,accuracy_score
from sklearn.decomposition import PCA
# Disable numpy's default scientific notation when printing (suppress=True turns it off)
np.set_printoptions(suppress=True)
# NOTE(review): hard-coded API token committed to source — move it to an
# environment variable or config file and rotate the token.
ts.set_token('462fc78ba2417e9a79a5ac00d8b71b2959b2a8875a0457952921ade4')
pro = ts.pro_api()
# Load the stock dataset (pandas reads the first sheet of the workbook by default)
# and order it chronologically so "next day" rows follow "current day" rows.
df = pd.read_excel(r'E:\权限管理\wq\works\HData\HDatas000001SZ.xlsx')
df = df.sort_values('trade_date')
# Total number of records.
counts = len(df)
# Half the record count — threshold for a "mostly zero" column.
avg_counts = counts / 2
# All column names.
columns_list = df.columns.values.tolist()
# Columns to drop because every value is zero (no information).
cancel_factor = []
for col in columns_list:
    values = df[col].values.tolist()
    # Hoisted: count the zeros once per column instead of twice.
    zero_count = values.count(0)
    if zero_count == counts:
        # Entirely zero -> mark the column for removal.
        cancel_factor.append(col)
    elif avg_counts < zero_count < counts:
        # More than half zeros: overwrite the whole column with its mean
        # (zeros included), turning it into a constant column.
        # NOTE(review): averaging over the zeros biases the value — confirm intent.
        df[col] = sum(values) / counts
# Columns that must never enter the model: the index artifact plus every
# date field; the feature side additionally drops the ticker symbol.
_meta_cols = ['Unnamed: 0', 'trade_date', 'ann_date_x', 'f_ann_date_x', 'end_date_x',
              'ann_date_y', 'f_ann_date_y', 'end_date_y',
              'ann_date', 'f_ann_date', 'end_date']
x_cancel_factor = ['ts_code'] + _meta_cols + cancel_factor
y_cancel_factor = _meta_cols + cancel_factor
print(cancel_factor)
data_x = df.drop(x_cancel_factor, axis=1)
data_y = df.drop(y_cancel_factor, axis=1)
# Features come from day t; the label ('change') is taken from day t+1.
x = data_x[:len(data_x) - 2].values
y_list = data_y[1:len(data_y) - 1]['change'].values
# Direction label for each next-day price change: -1 for a drop, +1 for flat/up.
# The explicit double condition preserves the original if/elif behavior,
# where a NaN change matched neither branch and produced no label.
y = [(-1 if change < 0 else 1) for change in y_list if change < 0 or change >= 0]
# 降低到20维度
estimator = PCA(n_components=5)
estimator.fit(x[:1800])
print('降维训练数据')
pca_x_train = estimator.fit_transform(x[:1800])
print('降维测试数据')
pca_x_test = estimator.transform(x[1801:len(x)-1])
#数据标准化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(pca_x_train)
X_train_scaled = scaler.transform(pca_x_train)
X_test_scaled = scaler.transform(pca_x_test)
#SVM算法
clf=svm.SVC(C=0.8, kernel='rbf', gamma=20, decision_function_shape='ovr') ##默认参数:kernel='rbf'
# clf.fit(x[:1800],y[:1800])
clf.fit(X_train_scaled,y[:1800])
print("预测...")
# res=clf.predict(rdm_arr) ##两个方括号表面传入的参数是矩阵而不是list
res=clf.predict(X_test_scaled) ##两个方括号表面传入的参数是矩阵而不是list
# 预测的真实值
a=y[1801:len(y)-1]
# 决策树算法
clf1 = tree.DecisionTreeClassifier(criterion='entropy')
clf1.fit(X_train_scaled,y[:1800])
answer = clf1.predict(X_test_scaled)
# 随机森林算法
clf2 = tree.DecisionTreeClassifier()
clf2.fit(X_train_scaled,y[:1800])
answer2 = clf2.predict(X_test_scaled)
# AdaBoost boosting ensemble.
from sklearn.ensemble import AdaBoostClassifier  # For Classification
from sklearn.ensemble import AdaBoostRegressor  # For Regression
from sklearn.tree import DecisionTreeClassifier
# A decision tree serves as the base learner; any estimator that accepts
# sample weights could be substituted.
# NOTE(review): `base_estimator` was renamed `estimator` in sklearn >= 1.2 —
# confirm the installed version still accepts this keyword.
clf3 = AdaBoostClassifier(n_estimators=100,
                          base_estimator=DecisionTreeClassifier(),
                          learning_rate=1)
clf3.fit(X_train_scaled, y[:1800])
answer3 = clf3.predict(X_test_scaled)
# Disabled draft: scatter-plot the training samples colored by clf1's prediction
# (red star = positive class, green star = negative class).
# for i in pca_x_train:
#     res=clf1.predict(np.array(i).reshape(1, -1))
# # print(i,res)
#     if res > 0:
#         plt.scatter(i[0],i[1],c='r',marker='*')
#     else :
#         plt.scatter(i[0],i[1],c='g',marker='*')
# # Disabled draft: plot the test data points the same way.
# # NOTE(review): references an undefined list `b` — would need b=[] if revived.
# for i in pca_x_test:
# # Predict each test point and plot it (dot markers for the test set).
#     res=clf1.predict(np.array(i).reshape(1, -1))
#     b.append(res[0])
# # print(i,res)
#     if res > 0:
# # Positive prediction: red dot.
#         plt.scatter(i[0],i[1],c='r',marker='.')
#     else :
#         plt.scatter(i[0],i[1],c='g',marker='.')
from sklearn.ensemble import GradientBoostingClassifier  # For Classification
from sklearn.ensemble import GradientBoostingRegressor  # For Regression
# Gradient boosting: 100 depth-1 stumps with learning rate 1.0.
clf4 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1)
# fit() returns the estimator, so training and prediction chain in one line.
answer4 = clf4.fit(X_train_scaled, y[:1800]).predict(X_test_scaled)
# Binary classification with scikit-learn's LogisticRegression.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# fit() returns the estimator, so training and prediction chain in one line.
answer5 = model.fit(X_train_scaled, y[:1800]).predict(X_test_scaled)
# Gaussian naive Bayes.
from sklearn.naive_bayes import GaussianNB
model1 = GaussianNB()
model1.fit(X_train_scaled, y[:1800])
# BUG FIX: the original called `model.predict(...)` — the LogisticRegression —
# so the printed "贝叶斯 score" silently duplicated the logistic result.
answer6 = model1.predict(X_test_scaled)
# Report accuracy for every model. The SVM also gets its train/test score
# straight from the estimator for comparison.
print('svm结果集 score:', accuracy_score(a, res))
print('SVM训练集 score:', clf.score(X_train_scaled, y[:1800]))
print('SVM测试集 score:', clf.score(X_test_scaled, y[1801:len(y) - 1]))
# Held-out accuracy of the remaining models against the true labels `a`.
for model_name, predictions in (('决策树', answer),
                                ('随机森林', answer2),
                                ('AdaBoost', answer3),
                                ('Gradient Boosting', answer4),
                                ('LogisticRegression', answer5),
                                ('贝叶斯', answer6)):
    print(model_name + ' score:', accuracy_score(a, predictions))
# Regression algorithms summary
# (blog footer text: latest recommended article published 2020-09-17 17:43:00)