day03-交叉验证-混淆矩阵-分类报告-验证曲线-学习曲线
2.朴素贝叶斯定理
模块:
import sklearn.naive_bayes as nb
model = nb.GaussianNB()
基于高斯分布即正态分布的朴素贝叶斯分类器
代码示例:nb.py
# nb.py -- Gaussian naive Bayes demo: train on the whole dataset, then
# draw the decision surface together with the labelled samples.
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Naive Bayes classifier assuming Gaussian (normal) feature distributions.
model = nb.GaussianNB()
model.fit(x, y)

# Build a fine grid covering the sample range to visualise the decision surface.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]  # column-wise merge
# Predict (not train) a class for every grid point; the dead
# pre-allocation of flat_y that was immediately overwritten is removed.
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

pred_y = model.predict(x)  # predictions on the training samples
print((pred_y == y).sum() / pred_y.size)  # training-set accuracy

mp.figure('Naive Bayes Classification', facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(x[:, 0], x[:, 1], c=y, cmap='brg', s=80)
mp.tight_layout()
mp.show()
3.划分训练集测试集
import sklearn.model_selection as ms
ms.train_test_split(输入集,输出集,
test_size=测试集占比,random_state=随机种子)
-->得到:训练输入,测试输入,训练输出,测试输出
代码示例:split.py
# split.py -- train/test split demo: fit a Gaussian naive Bayes model on
# the training part and evaluate + plot on the held-out test part.
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
import sklearn.model_selection as ms

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Hold out 25% of the samples as a test set (fixed seed for repeatability).
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=7)

# Gaussian naive Bayes classifier, trained on the training set only.
model = nb.GaussianNB()
model.fit(train_x, train_y)

# Grid covering the full sample range, used to draw the decision surface.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]  # column-wise merge
# Predict (not train) a class for every grid point; removed the dead
# flat_y pre-allocation that was immediately overwritten.
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

pred_test_y = model.predict(test_x)  # predictions on the test set
print((pred_test_y == test_y).sum() / pred_test_y.size)  # test accuracy

mp.figure('Naive Bayes Classification', facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)
mp.tight_layout()
mp.show()
4.交叉验证
import sklearn.model_selection as ms
ms.cross_val_score(模型,输入集,输出集,
cv=验证(折叠)次数,scoring=指标名) 得到指标数组
指标名scoring:
1)精确度accuracy:分类正确的样本数/总样本数 总样本数是测试集
2)查准率precision_weighted:预测正确数量/预测出来的总数
如100个测试样本中,预测有20个相似,最终只有5个是正确的,
则查准率=5/20=0.25
3)召回率recall_weighted: 预测正确的数量/实际存在的样本数
4)f1得分f1_weighted:
2*查准率*召回率/(查准率+召回率) 越高越好
在交叉验证过程中,针对每一个折叠,计算所有类别的查准率、
召回率或者f1得分,然后取各类别相应指标值的平均数,作为
这一个折叠的评估指标,然后再将所有折叠的评估指标以数组
的形式返回调用对象。
代码示例:cv.py
# cv.py -- cross-validation demo: score a Gaussian naive Bayes model with
# several metrics on the training set, then train and evaluate on a
# held-out test set and plot the decision surface.
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
import sklearn.model_selection as ms

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Split into training and test sets.
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=7)

# Naive Bayes classifier (fit per fold inside cross_val_score).
model = nb.GaussianNB()

# 5-fold cross validation with four different scoring metrics.
ac = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='accuracy')
print(ac.mean())
pw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='precision_weighted')
print(pw.mean())
rw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='recall_weighted')
print(rw.mean())
fw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='f1_weighted')
print(fw.mean())

model.fit(train_x, train_y)  # final fit on the full training set

# Grid covering the sample range, used to draw the decision surface.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]  # column-wise merge
# Predict (not train) a class for every grid point; removed the dead
# flat_y pre-allocation that was immediately overwritten.
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

pred_test_y = model.predict(test_x)  # predictions on the test set
print((pred_test_y == test_y).sum() / pred_test_y.size)  # test accuracy

mp.figure('Naive Bayes Classification', facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)
mp.tight_layout()
mp.show()
5.混淆矩阵
每行和每列分别对应样本输出中的每个类别,行表示实际类别
列表示预测类别
引例:
预测类别
实 A B C
际 A 5 0 0->5,表示实际类别和预测类别都为A的样本有5个,比较好
类 B 0 6 0
别 C 0 0 7
以上非对角线上的元素都是0,是比较理想完美的混淆矩阵
预测类别
实 A B C
际 A 3 1 1 实际类别:A5个,B6个,C7个
类 B 0 4 2 但预测类别为:A4个,B7个,C7个
别 C 1 2 4
以上是比较糟糕的混淆矩阵
查准率:A,3/4; B,4/7;C,4/7
召回率:A,3/5; B,4/6;C,4/7
混淆矩阵中主对角线上的值,除以所在整行之和,为召回率。
混淆矩阵中主对角线上的值,除以所在整列之和,为查准率。
从而计算出F1得分
模块:
import sklearn.metrics as sm
sm.confusion_matrix(实际类别,预测出的类别)
代码示例:cm.py
# cm.py -- confusion matrix demo: cross-validate, train and evaluate a
# Gaussian naive Bayes model, then print and visualise the confusion
# matrix of the test-set predictions.
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
import sklearn.model_selection as ms
import sklearn.metrics as sm

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Split into training and test sets.
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=7)

# Naive Bayes classifier (fit per fold inside cross_val_score).
model = nb.GaussianNB()

# 5-fold cross validation with four different scoring metrics.
ac = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='accuracy')
print(ac.mean())
pw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='precision_weighted')
print(pw.mean())
rw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='recall_weighted')
print(rw.mean())
fw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='f1_weighted')
print(fw.mean())

model.fit(train_x, train_y)  # final fit on the full training set

# Grid covering the sample range, used to draw the decision surface.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]  # column-wise merge
# Predict (not train) a class for every grid point; removed the dead
# flat_y pre-allocation that was immediately overwritten.
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)

pred_test_y = model.predict(test_x)  # predictions on the test set
print((pred_test_y == test_y).sum() / pred_test_y.size)  # test accuracy

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = sm.confusion_matrix(test_y, pred_test_y)
print(cm)

mp.figure('Naive Bayes Classification', facecolor='lightgray')
mp.title('Naive Bayes Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)

# Render the confusion matrix as an image in a second figure.
mp.figure('Confusion Matrix', facecolor='lightgray')
mp.title('Confusion Matrix', fontsize=20)
mp.xlabel('Predicted', fontsize=14)
mp.ylabel('True', fontsize=14)
mp.xticks(np.unique(pred_test_y))
mp.yticks(np.unique(test_y))
mp.tick_params(labelsize=10)
mp.imshow(cm, interpolation='nearest', cmap='jet')
mp.tight_layout()
mp.show()
6.分类报告
import sklearn.metrics as sm
sm.classification_report(实际类别,预测出的类别)
代码示例:cr.py
# cr.py -- classification report demo: cross-validate, train and evaluate
# a Gaussian naive Bayes model, then print the confusion matrix and the
# per-class precision/recall/f1 report for the test set.
# (The decision-surface grid computation and the matplotlib import from
# the earlier demos were dead code here -- nothing was plotted -- and
# have been removed; the printed output is unchanged.)
import numpy as np
import sklearn.naive_bayes as nb
import sklearn.model_selection as ms
import sklearn.metrics as sm

# Load samples: each line is "f1,f2,label"; the last column is the class.
x, y = [], []
with open('../../day01/data/multiple1.txt', 'r') as f:
    for line in f.readlines():
        data = [float(substr) for substr in line.split(',')]
        x.append(data[:-1])
        y.append(data[-1])
x, y = np.array(x), np.array(y, dtype=int)

# Split into training and test sets.
train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=7)

# Naive Bayes classifier (fit per fold inside cross_val_score).
model = nb.GaussianNB()

# 5-fold cross validation with four different scoring metrics.
ac = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='accuracy')
print(ac.mean())
pw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='precision_weighted')
print(pw.mean())
rw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='recall_weighted')
print(rw.mean())
fw = ms.cross_val_score(model, train_x, train_y, cv=5, scoring='f1_weighted')
print(fw.mean())

model.fit(train_x, train_y)  # final fit on the full training set

pred_test_y = model.predict(test_x)  # predictions on the test set
print((pred_test_y == test_y).sum() / pred_test_y.size)  # test accuracy

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = sm.confusion_matrix(test_y, pred_test_y)
print(cm)

# Classification report: precision, recall, f1 and support per class.
cr = sm.classification_report(test_y, pred_test_y)
print(cr)
十、决策树分类
1.随机森林分类
以各棵子决策树的投票结果确定预测分类。
代码示例:car.py
# car.py -- random forest classification of car acceptability.
# Each categorical column gets its own LabelEncoder; the same encoders
# are reused to transform the hand-built test samples.
import numpy as np
import sklearn.preprocessing as sp    # label encoders
import sklearn.ensemble as se         # random forest
import sklearn.model_selection as ms  # cross validation

data = []
with open('../../day01/data/car.txt', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))  # line[:-1] strips the newline
data = np.array(data).T  # transpose: one row per feature column

# Build one encoder per column; the last column is the target label.
encoders, train_x = [], []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        train_x.append(encoder.fit_transform(data[row]))
    else:
        train_y = encoder.fit_transform(data[row])
    encoders.append(encoder)
train_x = np.array(train_x).T

# Random forest classifier model.
model = se.RandomForestClassifier(max_depth=6, n_estimators=200, random_state=7)
# Cross-validated f1 score. BUGFIX: the original assigned this result to
# `ms`, shadowing the sklearn.model_selection module alias; keep it in
# its own variable instead.
f1_scores = ms.cross_val_score(model, train_x, train_y, cv=2, scoring='f1_weighted')
print(f1_scores.mean())
model.fit(train_x, train_y)

# Hand-built test data: the last element of each row is the expected label.
data = [
    ['high', 'med', '5more', '4', 'big', 'low', 'unacc'],
    ['high', 'high', '4', '4', 'med', 'med', 'acc'],
    ['low', 'low', '2', '4', 'small', 'high', 'good'],
    ['low', 'med', '3', '4', 'med', 'high', 'vgood']]
data = np.array(data).T
test_x = []
for row in range(len(data)):
    encoder = encoders[row]
    if row < len(data) - 1:
        # transform (not fit_transform): reuse the vocabulary learned on
        # the training data instead of building a new one.
        test_x.append(encoder.transform(data[row]))
    else:
        test_y = encoder.transform(data[row])
test_x = np.array(test_x).T

pred_test_y = model.predict(test_x)  # predict on the test samples
print((pred_test_y == test_y).sum() / pred_test_y.size)
# Decode the integer labels back to strings for inspection
# (3 of the 4 samples are classified correctly, i.e. 75%).
print(encoders[-1].inverse_transform(test_y))
print(encoders[-1].inverse_transform(pred_test_y))
2.验证曲线
验证曲线显示模型性能与超参数之间的函数关系
获得超参数的最佳值
import sklearn.model_selection as ms
ms.validation_curve(模型,输入集,输出集,'超参数名称',
超参数取值序列,cv=折叠数)
得到训练集得分和测试集得分
代码示例:vc.py
# vc.py -- validation curves: model score as a function of one
# hyper-parameter of a random forest (n_estimators, then max_depth).
import numpy as np
import sklearn.preprocessing as sp    # label encoders
import sklearn.ensemble as se         # random forest
import sklearn.model_selection as ms  # validation curves
import matplotlib.pyplot as mp

data = []
with open('../../day01/data/car.txt', 'r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(','))  # line[:-1] strips the newline
data = np.array(data).T  # transpose: one row per feature column

# One LabelEncoder per column; the last column is the target label.
encoders, x = [], []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        x.append(encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
    encoders.append(encoder)
x = np.array(x).T

# Validation curve over n_estimators. The scanned hyper-parameter is
# deliberately left out of the constructor.
model = se.RandomForestClassifier(max_depth=6, random_state=7)
n_estimators = np.arange(50, 1000, 50)
# FIX: pass param_name/param_range as keyword arguments -- the positional
# form was deprecated and removed in recent scikit-learn releases.
train_sco, test_sco = ms.validation_curve(
    model, x, y, param_name='n_estimators', param_range=n_estimators, cv=5)
train_means1 = train_sco.mean(axis=1)  # mean training score per value
# train_means1 = test_sco.mean(axis=1)  # use this for validation scores
for param, score in zip(n_estimators, train_means1):
    print(param, '-->', score)

# Validation curve over max_depth.
model = se.RandomForestClassifier(n_estimators=200, random_state=7)
max_depth = np.arange(1, 11)
train_sco, test_sco = ms.validation_curve(
    model, x, y, param_name='max_depth', param_range=max_depth, cv=5)
train_means2 = train_sco.mean(axis=1)  # mean training score per value
for param, score in zip(max_depth, train_means2):
    print(param, '-max->', score)

# Plot the n_estimators curve.
mp.figure('n_estimators', facecolor='lightgray')
mp.title('n_estimators', fontsize=20)
mp.xlabel('n_estimators', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(n_estimators, train_means1, 'o-', color='skyblue', label='Training')
mp.legend()

# Plot the max_depth curve.
mp.figure('max_depth', facecolor='lightgray')
mp.title('max_depth', fontsize=20)
mp.xlabel('max_depth', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(max_depth, train_means2, 'o-', color='skyblue', label='Training')
mp.legend()
mp.tight_layout()
mp.show()
3.学习曲线
反映模型性能与训练集大小之间的函数关系
import sklearn.model_selection as ms
ms.learning_curve(模型,输入集,输出集,
train_sizes=训练集大小序列,cv=折叠数)
获得训练集大小,训练集得分,测试集得分
训练集大小序列 = [0.9 0.8 0.7 0.6 0.5]
代码示例:lc.py
# lc.py -- learning curve: model score as a function of training-set size.
import numpy as np
import sklearn.preprocessing as sp #label encoder
import sklearn.ensemble as se #random forest
import sklearn.model_selection as ms #learning curve
import matplotlib.pyplot as mp
data = []
with open('../../day01/data/car.txt','r') as f:
    for line in f.readlines():
        data.append(line[:-1].split(',')) #line[:-1] strips the trailing newline
data = np.array(data).T
# One LabelEncoder per column; the last column is the target label.
encoders,x=[],[]
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) -1 :
        x.append(
            encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
    encoders.append(encoder)
x = np.array(x).T
# Compute the learning curve for a fixed random forest configuration.
model = se.RandomForestClassifier(max_depth=9,n_estimators=200,random_state=7)
train_sizes = np.linspace(0.1,1,10)# fractions of the training set to use
_,train_sco,test_sco = ms.learning_curve(model,x,y,
    train_sizes=train_sizes,cv=5)
train_means= train_sco.mean(axis=1)# mean training score per training-set size
for size,score in zip(train_sizes,train_means):
    print(size,'-->', score)
mp.figure('learning_curve',facecolor='lightgray')
mp.title('learning_curve', fontsize=20)
mp.xlabel('train_size', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(train_sizes,train_means,'o-',color='skyblue',label='Training')
mp.legend()
mp.tight_layout()
mp.show()
4.针对不同形式的特征选择不同类型的编码器
代码示例:inc.py 不同工资收入分类器
# inc.py -- income classification (<=50K vs >50K) with Gaussian naive
# Bayes. Columns that are already numeric keep their integer value
# (DigitEncoder); string columns are label-encoded.
import numpy as np
import sklearn.preprocessing as sp    # label encoders
import sklearn.naive_bayes as nb      # naive Bayes
import sklearn.model_selection as ms  # split / cross validation

class DigitEncoder():
    """Pass-through encoder for columns whose values are numeric strings."""
    def fit_transform(self, y):
        return y.astype(int)
    def transform(self, y):
        return y.astype(int)
    def inverse_transform(self, y):
        return y.astype(str)

# Read at most max_each samples of each class to keep the set balanced;
# rows containing '?' (missing values) are skipped.
num_less, num_more, max_each = 0, 0, 7500
data = []
with open('../../day01/data/adult.txt', 'r') as f:
    for line in f.readlines():
        if '?' not in line:
            line_data = line[:-1].split(', ')
            if line_data[-1] == '<=50K' and num_less < max_each:
                data.append(line_data)
                num_less += 1
            elif line_data[-1] == '>50K' and num_more < max_each:
                data.append(line_data)
                num_more += 1
        if num_less >= max_each and num_more >= max_each:
            break
data = np.array(data).T  # transpose: one row per feature column

# Pick an encoder per column by probing the first value; the last
# column is the target label.
encoders, x = [], []
for row in range(len(data)):
    if data[row, 0].isdigit():
        encoder = DigitEncoder()
    else:
        encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        x.append(encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
    encoders.append(encoder)
x = np.array(x).T

train_x, test_x, train_y, test_y = \
    ms.train_test_split(x, y, test_size=0.25, random_state=5)
model = nb.GaussianNB()
# Cross-validated f1 is mediocre on this data (around 0.585).
print(ms.cross_val_score(model, x, y, cv=10, scoring='f1_weighted').mean())
model.fit(train_x, train_y)                     # train
pred_test_y = model.predict(test_x)             # test
print((pred_test_y == test_y).sum() / pred_test_y.size)

# Predict the income class of one hand-built sample. All fields are
# strings so every column encodes uniformly (the original mixed in the
# ints 0 and 40, which numpy silently coerced to strings anyway).
data = [['39', 'State-gov', '77516', 'Bachelors', '13', 'Never-married',
         'Adm-clerical', 'Not-in-family', 'White', 'Male',
         '2174', '0', '40', 'United-States']]
data = np.array(data).T
x = []
for row in range(len(data)):
    encoder = encoders[row]
    # Reuse the fitted encoder; the original called transform twice and
    # discarded the first result.
    x.append(encoder.transform(data[row]))
x = np.array(x).T
pred_y = model.predict(x)
print(encoders[-1].inverse_transform(pred_y))