A Binary Classification Case Study

Importing the Data

import numpy as np
from matplotlib import pyplot
from pandas import read_csv
from pandas.plotting import scatter_matrix
from pandas import set_option
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
# Load the dataset (header=None because the file has no header row)
filename = '/home/hadoop/下载/sonar.all-data.csv'
data = read_csv(filename, header=None)

Analyzing the Data

Descriptive Statistics

# Dataset dimensions
print(data.shape)
print(data.dtypes)
print(data.head(20))
# Descriptive statistics
set_option('display.precision', 3)
print(data.describe())
# Class distribution (column 60 holds the label)
print(data.groupby(60).size())

Data Visualization
Viewing the data through a variety of charts reveals how it is distributed, which can suggest ways to tackle the problem.

# Histograms
data.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
pyplot.show()

# Density plots
data.plot(kind='density', subplots=True, layout=(8,8), sharex=False, legend=False, fontsize=1)
pyplot.show()

The density plots show that most attributes exhibit some degree of skew, so a Box-Cox transform might improve model accuracy. Box-Cox is a common statistical transform for continuous variables that do not follow a normal distribution; after the transform, the data are closer to normal, which can reduce unexplained error to some extent and strengthen the relationship between the predictors and the response.
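As an aside, here is a minimal sketch of a Box-Cox-style transform using scikit-learn's PowerTransformer (not part of the original workflow; Box-Cox requires strictly positive inputs, so a small offset is added on the assumption that some sonar attributes can be zero):

from sklearn.preprocessing import PowerTransformer

# Box-Cox needs strictly positive values; the 1e-6 offset is an assumption
# to guard against zeros in the sonar attributes.
attributes = data.values[:, 0:60].astype(float)
pt = PowerTransformer(method='box-cox')
transformed = pt.fit_transform(attributes + 1e-6)

PowerTransformer standardizes its output by default, so the transformed attributes are also zero-mean and unit-variance.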

# Correlation matrix plot
fig = pyplot.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(data.corr(),vmin=-1,vmax=1,interpolation='none')
fig.colorbar(cax)
pyplot.show()

Separating Out a Validation Dataset

# Split off a validation set (80% train / 20% validation)
array = data.values
X = array[:, 0:60].astype(float)
Y = array[:, 60]
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)

Evaluating Algorithms

# Baseline settings: 10-fold cross-validation, scored by accuracy
num_folds = 10
seed = 7
scoring = 'accuracy'

Linear algorithms: logistic regression (LR) and linear discriminant analysis (LDA).
Nonlinear algorithms: classification and regression trees (CART), support vector machines (SVM), naive Bayes (NB), and k-nearest neighbors (KNN).

# Evaluate algorithms -- baseline on the raw data
models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier()
models['NB'] = GaussianNB()
models['SVM'] = SVC()
results = []
for key in models:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))
# Compare algorithms -- box plot
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(models.keys())
pyplot.show()

One guess is that the raw attributes are unevenly distributed and on differing scales, which hurts some algorithms, so the data are standardized and the algorithms evaluated again. To keep the transform consistent within each cross-validation fold, a Pipeline is used, which refits the scaler on the training portion of every fold.

# Evaluate algorithms -- standardized data
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LogisticRegression())])
pipelines['ScalerLDA'] = Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeClassifier())])
pipelines['ScalerNB'] = Pipeline([('Scaler', StandardScaler()), ('NB', GaussianNB())])
pipelines['ScalerSVM'] = Pipeline([('Scaler', StandardScaler()), ('SVM', SVC())])
results = []
for key in pipelines:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_results = cross_val_score(pipelines[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    print('%s: %f (%f)' % (key, cv_results.mean(), cv_results.std()))
# Compare algorithms -- box plot
fig = pyplot.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(pipelines.keys())
pyplot.show()

Algorithm Tuning
The evaluations above suggest that k-nearest neighbors (KNN) and support vector machines (SVM) are worth tuning further.

Tuning KNN

# Tuning -- KNN
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_neighbors': [1,3,5,7,9,11,13,15,17,19,21]}
model = KNeighborsClassifier()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))

Tuning SVM
SVM has two important parameters: C (the penalty parameter) and kernel (the kernel function). The defaults are C=1.0 and kernel='rbf'. Both parameters are tuned below.

# Tuning -- SVM
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train).astype(float)
param_grid = {}
param_grid['C'] = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 2.0]
# 'precomputed' is omitted: it expects a precomputed kernel matrix, not raw features
param_grid['kernel'] = ['linear', 'poly', 'rbf', 'sigmoid']
model = SVC()
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('Best: %s using %s' % (grid_result.best_score_, grid_result.best_params_))
cv_results = zip(grid_result.cv_results_['mean_test_score'],
                 grid_result.cv_results_['std_test_score'],
                 grid_result.cv_results_['params'])
for mean, std, param in cv_results:
    print('%f (%f) with %r' % (mean, std, param))
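Note that the scaler above is fit on all of X_train before cross-validation, which leaks a small amount of information across folds. A minimal sketch of an alternative (an assumption, not part of the original workflow) wraps the scaler and the SVC in a Pipeline so the scaler is refit inside every fold; grid parameter names are then prefixed with the step name:

# Sketch: grid search over a Pipeline so scaling happens inside each fold.
pipe = Pipeline([('Scaler', StandardScaler()), ('SVM', SVC())])
pipe_param_grid = {'SVM__C': [0.1, 0.5, 1.0, 1.5, 2.0],
                   'SVM__kernel': ['linear', 'rbf']}
pipe_grid = GridSearchCV(estimator=pipe, param_grid=pipe_param_grid, scoring=scoring, cv=kfold)
pipe_result = pipe_grid.fit(X_train, Y_train)
print('Best: %s using %s' % (pipe_result.best_score_, pipe_result.best_params_))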

Ensemble Methods
Besides tuning, accuracy can also be improved with ensemble methods. Four of them are compared below to see whether accuracy can be pushed further.
Bagging: random forest (RF) and extra trees (ET)
Boosting: AdaBoost (AB) and stochastic gradient boosting (GBM)

# Ensemble methods
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostClassifier())])
ensembles['ScaledGBM'] = Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingClassifier())])
ensembles['ScaledRF'] = Pipeline([('Scaler', StandardScaler()), ('RF', RandomForestClassifier())])
ensembles['ScaledET'] = Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesClassifier())])
results = []
for key in ensembles:
    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
    cv_result = cross_val_score(ensembles[key], X_train, Y_train, cv=kfold, scoring=scoring)
    results.append(cv_result)
    print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))

Finalizing the Model
The evaluations above show that support vector machines (SVM) achieve the best accuracy, so SVM is chosen as the final model. It is trained on the training set and assessed on the held-out validation set. Since SVM performed best on standardized data during evaluation, the training data are standardized and the identical transform is applied to the validation set.

# Finalize the model
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = SVC(C=1.5, kernel='rbf')
model.fit(X=rescaledX, y=Y_train)
# Evaluate the model on the validation set
rescaled_validationX = scaler.transform(X_validation)
predictions = model.predict(rescaled_validationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))

PS: the higher the exponent of a norm, the more it emphasizes large values and ignores small ones.
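A quick numeric illustration of this (a sketch using numpy, which is already imported above):

# As the order p of the norm grows, the largest entry dominates the result.
v = np.array([0.1, 0.2, 3.0])
for p in [1, 2, 10, np.inf]:
    print(p, np.linalg.norm(v, ord=p))
# ord=1 sums all magnitudes; ord=inf keeps only the largest (3.0).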
