# 利用sklearn自己构建一组含有1000个样本点,6个特征,3个类别的分类数据集(随机种子取666),并利用学过的分类器模型(例如传统的分类模型:决策树、朴素贝叶斯、K近邻、BP神经网络、逻辑回归、支持向量机等,以及集成学习模型:Voting、Bagging、随机森林、AdaBoost、GBDT)对数据进行训练,充分利用交叉验证及网格搜索调优,尽可能地提高模型的分类效果。模型分类效果请通过混淆矩阵、ROC曲线、学习曲线、验证曲线等方式进行展示。
'''
sklearn.datasets.make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=2,
n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None,
flip_y=0.01, class_sep=1.0, hypercube=True,shift=0.0, scale=1.0,
shuffle=True, random_state=None)
通常用于分类算法。
n_features :特征个数= n_informative + n_redundant + n_repeated
n_informative:多信息特征的个数
n_redundant:冗余信息,informative特征的随机线性组合
n_repeated :重复信息,随机提取n_informative和n_redundant 特征
n_classes:分类类别
n_clusters_per_class :某一个类别是由几个cluster构成的
'''
from sklearn import datasets
import matplotlib.pyplot as plt

# Synthetic classification set: 1000 samples, 6 features, 3 classes,
# fixed seed 666 so the experiment is reproducible.
data, target = datasets.make_classification(
    n_samples=1000,
    n_features=6,
    n_classes=3,
    random_state=666,
    n_clusters_per_class=1,
)
print(data.shape)
print(target.shape)

# Quick visual check: first two features, points colored by class label.
plt.scatter(data[:, 0], data[:, 1], c=target, cmap=plt.cm.spring, edgecolor='k')

# Short aliases kept for later cells.
x = data
y = target
# ----- 数据拆分 (train/test split) -----
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set (split is unseeded,
# so it differs between runs).
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2
)
# ----- 学习曲线 (learning curve) -----
from sklearn.model_selection import learning_curve  # learning-curve utility
# BUG FIX: numpy was first imported only further down the file (after this
# line already used np.linspace), which raises NameError when the script is
# run top to bottom — import it here before use.
import numpy as np

# Training-set fractions for each point of the learning curve: 10%..100%.
size = np.linspace(0.1, 1, 10)
def xuexiquxian(model, size):
    """Plot a learning curve for *model* on the global (data, target) arrays.

    For each training-set size in *size*, 10-fold CV train and test scores
    are computed and their per-size means are scattered against the number
    of training samples actually used.
    """
    x, y = data, target
    train_sizes, train_scores, test_scores = learning_curve(
        model, x, y, train_sizes=size, cv=10
    )
    print(train_sizes)
    # One point per training-set size: mean score over the 10 folds.
    plt.scatter(train_sizes, np.mean(train_scores, axis=1))
    plt.scatter(train_sizes, np.mean(test_scores, axis=1))
    plt.legend(['train_scores', 'test_scores'])
    plt.show()
# ----- 交叉验证 (cross-validation) -----
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score
def jiaochayanzheng(model):
    """Evaluate *model* on the global (data, target) arrays via cross-validation.

    Prints: 10-fold CV scores (mean/std), multi-metric CV results, and an
    r2 score of out-of-fold cross_val_predict predictions.
    """
    x, y = data, target
    # Shuffle once so the CV folds are not biased by any ordering in the data.
    indices = np.arange(y.shape[0])
    np.random.shuffle(indices)
    x, y = x[indices], y[indices]
    scores = cross_val_score(model, x, y, cv=10)
    print('将拆分与评价合并执行')
    print(scores)
    print(scores.mean(), scores.std())
    print('同时使用多个评价指标')
    # NOTE(review): r2/explained_variance are regression metrics; for this
    # 3-class task classification scorers (e.g. 'accuracy', 'f1_macro') would
    # be more appropriate — kept as-is to preserve the original output.
    scoring = ['r2', 'explained_variance']
    scores = cross_validate(model, x, y, cv=10, scoring=scoring,
                            return_train_score=False)
    print(scores)
    print(scores['test_r2'].mean())
    print('使用交互验证后的模型进行预测')
    pred = cross_val_predict(model, x, y, cv=10)
    # BUG FIX: pred is aligned with the SHUFFLED labels y, not with the
    # original global target, so it must be scored against y.
    print(r2_score(y, pred))
# ----- 评价指标 (evaluation metrics) -----
def pinjia(model):
    """Fit *model* on the global train split and print evaluation metrics.

    Prints micro/macro/weighted f1 and precision scores, a classification
    report, and the confusion matrix (also rendered as a seaborn heatmap)
    for predictions on the global test split.
    """
    from sklearn.metrics import f1_score
    from sklearn.metrics import precision_score
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import seaborn as sns

    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print('f1_score')
    print(f1_score(y_test, y_pred, average='micro'))
    print(f1_score(y_test, y_pred, average='macro'))
    print(f1_score(y_test, y_pred, average='weighted'))
    print('precision_score')
    print(precision_score(y_test, y_pred, average='micro'))
    print(precision_score(y_test, y_pred, average='macro'))
    print(precision_score(y_test, y_pred, average='weighted'))
    print('分类汇总报告')
    # Per-class summary report.
    print(classification_report(y_test, y_pred, digits=3,  # decimals shown
                                labels=[0, 1, 2],  # class ordering
                                target_names=['第0类', '第1类', '第2类'],  # class names
                                output_dict=False)  # return text, not a dict
          )
    print('混淆矩阵')
    # Confusion matrix, printed and shown as a heatmap.
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    # BUG FIX: the original had the IPython magic `%matplotlib inline` here,
    # which is a syntax error inside a function in a plain .py file — removed
    # (it belongs at notebook top level only).
    sns.heatmap(cm, cmap=sns.color_palette("Blues"), annot=True)
    plt.show()  # display the heatmap when run as a script
# ----- 决策树 (decision tree) -----
from sklearn.tree import DecisionTreeClassifier

# Learning curve for an unconstrained decision tree.
clf1 = DecisionTreeClassifier()
xuexiquxian(clf1, size)

# Re-split the data (unseeded, so this differs from the earlier split).
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

# Baseline: default (fully grown) tree.
# NOTE: duplicate `from sklearn.tree import DecisionTreeClassifier` that was
# repeated here has been removed — the import above is still in effect.
clf1 = DecisionTreeClassifier()
clf1.fit(x_train, y_train)
y_pred_clf1 = clf1.predict(x_test)
print(clf1.score(x_train, y_train), clf1.score(x_test, y_test))

# Heavily constrained tree.
# NOTE(review): min_samples_split=500 exceeds what most nodes can hold on an
# ~800-sample train set, so this tree barely splits (underfitting demo?) —
# confirm this is intentional.
clf1_1 = DecisionTreeClassifier(max_depth=25, min_samples_split=500,
                                max_leaf_nodes=20)
clf1_1.fit(x_train, y_train)
y_pred_clf1_1 = clf1_1.predict(x_test)
print(clf1_1.score(x_train, y_train), clf1_1.score(x_test, y_test))

# Moderately constrained tree.
clf1_2 = DecisionTreeClassifier(max_depth=9, min_samples_split=10,
                                max_leaf_nodes=20)
clf1_2.fit(x_train, y_train)
y_pred_clf1_2 = clf1_2.predict(x_test)
print(clf1_2.score(x_train, y_train), clf1_2.score(x_test, y_test))

# Full metric report and cross-validation for the baseline tree.
pinjia(clf1)
jiaochayanzheng(clf1)
# ----- K-nearest neighbors -----
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Learning curve for a default KNN classifier.
knc = KNeighborsClassifier()
xuexiquxian(knc, size)

# Re-split with a larger test fraction (45%) for this model.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.45)

# Fit, predict, and report train/test accuracy.
knc.fit(x_train, y_train)
y_pred_knc = knc.predict(x_test)
print(knc.score(x_train, y_train))
print(knc.score(x_test, y_test))

# Full metric report and cross-validation.
pinjia(knc)
jiaochayanzheng(knc)
# ----- BP neural network (MLP) -----
from sklearn.neural_network import MLPClassifier

# Learning curve for a default MLP classifier.
# NOTE(review): default max_iter=200 often triggers a ConvergenceWarning on
# this data — consider raising max_iter if warnings appear.
mlp = MLPClassifier()
xuexiquxian(mlp, size)

# Re-split with a 45% test fraction for this model.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data, target, test_size=0.45)

mlp.fit(x_train, y_train)
# BUG FIX: the MLP predictions were stored into y_pred_knc (copy-paste from
# the KNN cell), silently clobbering the KNN predictions — renamed.
y_pred_mlp = mlp.predict(x_test)
print(mlp.score(x_train, y_train))
print(mlp.score(x_test, y_test))

# Full metric report and cross-validation.
pinjia(mlp)
jiaochayanzheng(mlp)