day03-交叉验证-混淆矩阵-分类报告-验证曲线-学习曲线
2.朴素贝叶斯定理
模块:
import sklearn.naive_bayes as nb
model = nb.GaussianNB()
基于高斯分布即正态分布的朴素贝叶斯分类器
代码示例:nb.py
# nb.py -- Gaussian naive Bayes demo: train on the whole dataset, then
# draw the decision surface together with the labelled samples.
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Naive Bayes classifier assuming Gaussian (normal) feature distributions.
model = nb.GaussianNB()
model.fit(x, y)

# Build a fine grid covering the sample range to visualise the decision surface.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]  # column-wise merge
# Predict (not train) a class for every grid point; the dead
# pre-allocation of flat_y that was immediately overwritten is removed.
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

pred_y = model.predict(x)  # predictions on the training samples
print((pred_y == y).sum() / pred_y.size)  # training-set accuracy

mp.figure('Naive Bayes Classification', facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(x[:, 0], x[:, 1], c=y, cmap='brg', s=80)
mp.tight_layout()
mp.show()
3.划分训练集测试集
import sklearn.model_selection as ms
ms.train_test_split(输入集,输出集,
test_size=测试集占比,random_state=随机种子)
-->得到:训练输入,测试输入,训练输出,测试输出
代码示例:split.py
# split.py -- train/test split demo: fit a Gaussian naive Bayes model on
# the training part and evaluate + plot on the held-out test part.
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
import sklearn.model_selection as ms

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Hold out 25% of the samples as a test set (fixed seed for repeatability).
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=7)

# Gaussian naive Bayes classifier, trained on the training set only.
model = nb.GaussianNB()
model.fit(train_x, train_y)

# Grid covering the full sample range, used to draw the decision surface.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]  # column-wise merge
# Predict (not train) a class for every grid point; removed the dead
# flat_y pre-allocation that was immediately overwritten.
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

pred_test_y = model.predict(test_x)  # predictions on the test set
print((pred_test_y == test_y).sum() / pred_test_y.size)  # test accuracy

mp.figure('Naive Bayes Classification', facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)
mp.tight_layout()
mp.show()
4.交叉验证
import sklearn.model_selection as ms
ms.cross_val_score(模型,输入集,输出集,
cv=验证(折叠)次数,scoring=指标名) 得到指标数组
指标名scoring:
1)精确度accuracy:分类正确的样本数/总样本数 总样本数是测试集
2)查准率precision_weighted:预测正确数量/预测出来的总数
如100个测试样本中,预测有20个相似,最终只有5个是正确的,
则查准率=5/20=0.25
3)召回率recall_weighted: 预测正确的数量/实际存在的样本数
4)f1得分f1_weighted:
2*查准率*召回率/(查准率+召回率) 越高越好
在交叉验证过程中,针对每一个折叠,计算所有类别的查准率、
召回率或者f1得分,然后取各类别相应指标值的平均数,作为
这一个折叠的评估指标,然后再将所有折叠的评估指标以数组
的形式返回调用对象。
代码示例:cv.py
# cv.py -- cross-validation demo: score a Gaussian naive Bayes model with
# several metrics on the training set, then train and evaluate on a
# held-out test set and plot the decision surface.
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
import sklearn.model_selection as ms

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Split into training and test sets.
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=7)

# Naive Bayes classifier (fit per fold inside cross_val_score).
model = nb.GaussianNB()

# 5-fold cross validation with four different scoring metrics.
ac = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='accuracy')
print(ac.mean())
pw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='precision_weighted')
print(pw.mean())
rw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='recall_weighted')
print(rw.mean())
fw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='f1_weighted')
print(fw.mean())

model.fit(train_x, train_y)  # final fit on the full training set

# Grid covering the sample range, used to draw the decision surface.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]  # column-wise merge
# Predict (not train) a class for every grid point; removed the dead
# flat_y pre-allocation that was immediately overwritten.
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

pred_test_y = model.predict(test_x)  # predictions on the test set
print((pred_test_y == test_y).sum() / pred_test_y.size)  # test accuracy

mp.figure('Naive Bayes Classification', facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)
mp.tight_layout()
mp.show()
5.混淆矩阵
每行和每列分别对应样本输出中的每个类别,行表示实际类别
列表示预测类别
引例:
预测类别
实 A B C
际 A 5 0 0->5,表示实际类别和预测类别都为A的样本有5个,比较好
类 B 0 6 0
别 C 0 0 7
以上非对角线上的元素都是0,是比较理想完美的混淆矩阵
预测类别
实 A B C
际 A 3 1 1 实际类别:A5个,B6个,C7个
类 B 0 4 2 但预测类别为:A4个,B7个,C7个
别 C 1 2 4
以上是比较糟糕的混淆矩阵
查准率:A,3/4; B,4/7;C,4/7
召回率:A,3/5; B,4/6;C,4/7
混淆矩阵中主对角线上的值,除以所在整行之和,为召回率。
混淆矩阵中主对角线上的值,除以所在整列之和,为查准率。
从而计算出F1得分
模块:
import sklearn.metrics as sm
sm.confusion_matrix(实际类别,预测出的类别)
代码示例:cm.py
# cm.py -- confusion matrix demo: cross-validate, train and evaluate a
# Gaussian naive Bayes model, then print and visualise the confusion
# matrix of the test-set predictions.
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
import sklearn.model_selection as ms
import sklearn.metrics as sm

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Split into training and test sets.
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=7)

# Naive Bayes classifier (fit per fold inside cross_val_score).
model = nb.GaussianNB()

# 5-fold cross validation with four different scoring metrics.
ac = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='accuracy')
print(ac.mean())
pw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='precision_weighted')
print(pw.mean())
rw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='recall_weighted')
print(rw.mean())
fw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='f1_weighted')
print(fw.mean())

model.fit(train_x, train_y)  # final fit on the full training set

# Grid covering the sample range, used to draw the decision surface.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]  # column-wise merge
# Predict (not train) a class for every grid point; removed the dead
# flat_y pre-allocation that was immediately overwritten.
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

pred_test_y = model.predict(test_x)  # predictions on the test set
print((pred_test_y == test_y).sum() / pred_test_y.size)  # test accuracy

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = sm.confusion_matrix(test_y, pred_test_y)
print(cm)

mp.figure('Naive Bayes Classification', facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)

# Render the confusion matrix as an image in a second figure.
mp.figure('Confusion Matrix', facecolor='lightgray')
mp.title('Confusion Matrix', fontsize=20)
mp.xlabel('Predicted', fontsize=14)
mp.ylabel('True', fontsize=14)
mp.xticks(np.unique(pred_test_y))
mp.yticks(np.unique(test_y))
mp.tick_params(labelsize=10)
mp.imshow(cm, interpolation='nearest', cmap='jet')
mp.tight_layout()
mp.show()
6.分类报告
import sklearn.metrics as sm
sm.classification_report(实际类别,预测出的类别)
代码示例:cr.py
# cr.py -- classification report demo: cross-validate, train and evaluate
# a Gaussian naive Bayes model, then print the confusion matrix and the
# per-class precision/recall/f1 report for the test set.
# (The decision-surface grid computation and the matplotlib import from
# the earlier demos were dead code here -- nothing was plotted -- and
# have been removed; the printed output is unchanged.)
import numpy as np
import sklearn.naive_bayes as nb
import sklearn.model_selection as ms
import sklearn.metrics as sm

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Split into training and test sets.
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=7)

# Naive Bayes classifier (fit per fold inside cross_val_score).
model = nb.GaussianNB()

# 5-fold cross validation with four different scoring metrics.
ac = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='accuracy')
print(ac.mean())
pw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='precision_weighted')
print(pw.mean())
rw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='recall_weighted')
print(rw.mean())
fw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='f1_weighted')
print(fw.mean())

model.fit(train_x, train_y)  # final fit on the full training set

pred_test_y = model.predict(test_x)  # predictions on the test set
print((pred_test_y == test_y).sum() / pred_test_y.size)  # test accuracy

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = sm.confusion_matrix(test_y, pred_test_y)
print(cm)

# Classification report: precision, recall, f1 and support per class.
cr = sm.classification_report(test_y, pred_test_y)
print(cr)
十、决策树分类
1.随机森林分类
以各棵子决策树的投票结果确定预测分类。
代码示例:car.py
# car.py -- random forest classification of car acceptability.
# Each categorical column gets its own LabelEncoder; the same encoders
# are reused to transform the hand-built test samples.
import numpy as np
import sklearn.preprocessing as sp    # label encoders
import sklearn.ensemble as se         # random forest
import sklearn.model_selection as ms  # cross validation

data = []
with open('../../day01/data/car.txt', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))  # line[:-1] strips the newline
data = np.array(data).T  # transpose: one row per feature column

# Build one encoder per column; the last column is the target label.
encoders, train_x = [], []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        train_x.append(encoder.fit_transform(data[row]))
    else:
        train_y = encoder.fit_transform(data[row])
    encoders.append(encoder)
train_x = np.array(train_x).T

# Random forest classifier model.
model = se.RandomForestClassifier(max_depth=6, n_estimators=200, random_state=7)
# Cross-validated f1 score. BUGFIX: the original assigned this result to
# `ms`, shadowing the sklearn.model_selection module alias; keep it in
# its own variable instead.
f1_scores = ms.cross_val_score(model, train_x, train_y, cv=2, scoring='f1_weighted')
print(f1_scores.mean())
model.fit(train_x, train_y)

# Hand-built test data: the last element of each row is the expected label.
data = [
    ['high', 'med', '5more', '4', 'big', 'low', 'unacc'],
    ['high', 'high', '4', '4', 'med', 'med', 'acc'],
    ['low', 'low', '2', '4', 'small', 'high', 'good'],
    ['low', 'med', '3', '4', 'med', 'high', 'vgood']]
data = np.array(data).T
test_x = []
for row in range(len(data)):
    encoder = encoders[row]
    if row < len(data) - 1:
        # transform (not fit_transform): reuse the vocabulary learned on
        # the training data instead of building a new one.
        test_x.append(encoder.transform(data[row]))
    else:
        test_y = encoder.transform(data[row])
test_x = np.array(test_x).T

pred_test_y = model.predict(test_x)  # predict on the test samples
print((pred_test_y == test_y).sum() / pred_test_y.size)
# Decode the integer labels back to strings for inspection
# (3 of the 4 samples are classified correctly, i.e. 75%).
print(encoders[-1].inverse_transform(test_y))
print(encoders[-1].inverse_transform(pred_test_y))
2.验证曲线
验证曲线显示模型性能与超参数之间的函数关系
获得超参数的最佳值
import sklearn.model_selection as ms
ms.validation_curve(模型,输入集,输出集,'超参数名称',
超参数取值序列,cv=折叠数)
得到训练集得分和测试集得分
代码示例:vc.py
# vc.py -- validation curves: model score as a function of one
# hyper-parameter of a random forest (n_estimators, then max_depth).
import numpy as np
import sklearn.preprocessing as sp    # label encoders
import sklearn.ensemble as se         # random forest
import sklearn.model_selection as ms  # validation curves
import matplotlib.pyplot as mp

data = []
with open('../../day01/data/car.txt', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))  # line[:-1] strips the newline
data = np.array(data).T  # transpose: one row per feature column

# One LabelEncoder per column; the last column is the target label.
encoders, x = [], []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        x.append(encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
    encoders.append(encoder)
x = np.array(x).T

# Validation curve over n_estimators. The scanned hyper-parameter is
# deliberately left out of the constructor.
model = se.RandomForestClassifier(max_depth=6, random_state=7)
n_estimators = np.arange(50, 1000, 50)
# FIX: pass param_name/param_range as keyword arguments -- the positional
# form was deprecated and removed in recent scikit-learn releases.
train_sco, test_sco = ms.validation_curve(
    model, x, y, param_name='n_estimators', param_range=n_estimators, cv=5)
train_means1 = train_sco.mean(axis=1)  # mean training score per value
# train_means1 = test_sco.mean(axis=1)  # use this for validation scores
for param, score in zip(n_estimators, train_means1):
    print(param, '-->', score)

# Validation curve over max_depth.
model = se.RandomForestClassifier(n_estimators=200, random_state=7)
max_depth = np.arange(1, 11)
train_sco, test_sco = ms.validation_curve(
    model, x, y, param_name='max_depth', param_range=max_depth, cv=5)
train_means2 = train_sco.mean(axis=1)  # mean training score per value
for param, score in zip(max_depth, train_means2):
    print(param, '-max->', score)

# Plot the n_estimators curve.
mp.figure('n_estimators', facecolor='lightgray')
mp.title('n_estimators', fontsize=20)
mp.xlabel('n_estimators', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(n_estimators, train_means1, 'o-', color='skyblue', label='Training')
mp.legend()

# Plot the max_depth curve.
mp.figure('max_depth', facecolor='lightgray')
mp.title('max_depth', fontsize=20)
mp.xlabel('max_depth', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(max_depth, train_means2, 'o-', color='skyblue', label='Training')
mp.legend()
mp.tight_layout()
mp.show()
3.学习曲线
反映模型性能与训练集大小之间的函数关系
import sklearn.model_selection as ms
ms.learning_curve(模型,输入集,输出集,
train_sizes=训练集大小序列,cv=折叠数)
获得训练集大小,训练集得分,测试集得分
训练集大小序列 = [0.9 0.8 0.7 0.6 0.5]
代码示例:lc.py
# lc.py -- learning curve: model score as a function of training-set size.
import numpy as np
import sklearn.preprocessing as sp #label encoder
import sklearn.ensemble as se #random forest
import sklearn.model_selection as ms #learning curve
import matplotlib.pyplot as mp
data = []
with open('../../day01/data/car.txt','r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(',')) #line[:-1] strips the trailing newline
data = np.array(data).T
# One LabelEncoder per column; the last column is the target label.
encoders,x=[],[]
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) -1 :
        x.append(
            encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
    encoders.append(encoder)
x = np.array(x).T
# Compute the learning curve for a fixed random forest configuration.
model = se.RandomForestClassifier(max_depth=9,n_estimators=200,random_state=7)
train_sizes = np.linspace(0.1,1,10)# fractions of the training set to use
_,train_sco,test_sco = ms.learning_curve(model,x,y,
    train_sizes=train_sizes,cv=5)
train_means= train_sco.mean(axis=1)# mean training score per training-set size
for size,score in zip(train_sizes,train_means):
    print(size,'-->', score)
mp.figure('learning_curve',facecolor='lightgray')
mp.title('learning_curve', fontsize=20)
mp.xlabel('train_size', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(train_sizes,train_means,'o-',color='skyblue',label='Training')
mp.legend()
mp.tight_layout()
mp.show()
4.针对不同形式的特征选择不同类型的编码器
代码示例:inc.py 不同工资收入分类器
# inc.py -- income classification (<=50K vs >50K) with Gaussian naive
# Bayes. Columns that are already numeric keep their integer value
# (DigitEncoder); string columns are label-encoded.
import numpy as np
import sklearn.preprocessing as sp    # label encoders
import sklearn.naive_bayes as nb      # naive Bayes
import sklearn.model_selection as ms  # split / cross validation

class DigitEncoder():
    """Pass-through encoder for columns whose values are numeric strings."""
    def fit_transform(self, y):
        return y.astype(int)
    def transform(self, y):
        return y.astype(int)
    def inverse_transform(self, y):
        return y.astype(str)

# Read at most max_each samples of each class to keep the set balanced;
# rows containing '?' (missing values) are skipped.
num_less, num_more, max_each = 0, 0, 7500
data = []
with open('../../day01/data/adult.txt', 'r') as f:
    for line in f.readlines():
        if '?' not in line:
            line_data = line[:-1].split(', ')
            if line_data[-1] == '<=50K' and num_less < max_each:
                data.append(line_data)
                num_less += 1
            elif line_data[-1] == '>50K' and num_more < max_each:
                data.append(line_data)
                num_more += 1
        if num_less >= max_each and num_more >= max_each:
            break
data = np.array(data).T  # transpose: one row per feature column

# Pick an encoder per column by probing the first value; the last
# column is the target label.
encoders, x = [], []
for row in range(len(data)):
    if data[row, 0].isdigit():
        encoder = DigitEncoder()
    else:
        encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        x.append(encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
    encoders.append(encoder)
x = np.array(x).T

train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=5)
model = nb.GaussianNB()
# Cross-validated f1 is mediocre on this data (around 0.585).
print(ms.cross_val_score(model, x, y, cv=10, scoring='f1_weighted').mean())
model.fit(train_x, train_y)                     # train
pred_test_y = model.predict(test_x)             # test
print((pred_test_y == test_y).sum() / pred_test_y.size)

# Predict the income class of one hand-built sample. All fields are
# strings so every column encodes uniformly (the original mixed in the
# ints 0 and 40, which numpy silently coerced to strings anyway).
data = [['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married',
         'Adm-clerical', 'Not-in-family', 'White', 'Male',
         '2174', '0', '40', 'United-States']]
data = np.array(data).T
x = []
for row in range(len(data)):
    encoder = encoders[row]
    # Reuse the fitted encoder; the original called transform twice and
    # discarded the first result.
    x.append(encoder.transform(data[row]))
x = np.array(x).T
pred_y = model.predict(x)
print(encoders[-1].inverse_transform(pred_y))