Naive Bayes

1. Introduction

Text classification experiment
Download the data from http://qwone.com/~jason/20Newsgroups/
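
As a quick refresher (standard definitions, nothing specific to this data set): a naive Bayes classifier predicts the class with the largest posterior probability, treating the features as conditionally independent given the class,

\hat{y} = \arg\max_{c} P(c) \prod_{i=1}^{n} P(x_i \mid c)

GaussianNB below models each P(x_i | c) with a per-feature Gaussian; MultinomialNB models counts, with Laplace smoothing controlled by alpha.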

2. Code

Iris
Iris_GaussianNB.py

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split  # splits data into train/test sets


def iris_type(s):
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s]


if __name__ == "__main__":
    data = np.loadtxt('../8.Regression/8.iris.data', dtype=float, delimiter=',', converters={4: iris_type})
    print data
    x, y = np.split(data, (4,), axis=1)     # first four columns are features, the last is the label
    x = x[:, :2]                            # keep only the first two features so the result can be plotted in 2-D
    print x
    print y

    # train_test_split(x, y, train_size=0.8)
    gnb = Pipeline([
        ('sc', StandardScaler()),   # standardize the features
        ('clf', GaussianNB())])     # naive Bayes assuming Gaussian class-conditional densities
    gnb.fit(x, y.ravel())           # ravel the label column into a 1-D vector

    # gnb = MultinomialNB().fit(x, y.ravel())

    # gnb = KNeighborsClassifier(n_neighbors=5).fit(x, y.ravel())  # k-nearest neighbors with k=5

    # plot the decision regions
    N, M = 500, 500     # number of grid points along each axis
    x1_min, x1_max = x[:, 0].min(), x[:, 0].max()   # range of column 0
    x2_min, x2_max = x[:, 1].min(), x[:, 1].max()   # range of column 1
    t1 = np.linspace(x1_min, x1_max, N)
    t2 = np.linspace(x2_min, x2_max, M)
    x1, x2 = np.meshgrid(t1, t2)                    # grid of sampling points
    x_test = np.stack((x1.flat, x2.flat), axis=1)   # points to classify

    # not meaningful in itself: only pads the other two dimensions if all four features are used
    # x3 = np.ones(x1.size) * np.average(x[:, 2])
    # x4 = np.ones(x1.size) * np.average(x[:, 3])
    # x_test = np.stack((x1.flat, x2.flat, x3, x4), axis=1)  # points to classify

    mpl.rcParams['font.sans-serif'] = [u'simHei']   # a CJK-capable font (optional now that the labels are in English)
    mpl.rcParams['axes.unicode_minus'] = False      # render minus signs correctly with this font
    
    cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
    cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
    
    y_hat = gnb.predict(x_test)                     # predictions on the grid
    y_hat = y_hat.reshape(x1.shape)                 # reshape to match the grid

    plt.figure(facecolor='w')
    plt.pcolormesh(x1, x2, y_hat, cmap=cm_light)    # predicted decision regions
    plt.scatter(x[:, 0], x[:, 1], c=y.ravel(), edgecolors='k', s=50, cmap=cm_dark)  # training samples
    plt.xlabel(u'Sepal length', fontsize=14)
    plt.ylabel(u'Sepal width', fontsize=14)
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.title(u'GaussianNB classification of the iris data', fontsize=18)
    plt.grid(True)
    plt.show()

    # predictions on the training set
    y_hat = gnb.predict(x)
    y = y.reshape(-1)
    result = y_hat == y
    print y_hat
    print result
    acc = np.mean(result)
    print 'Accuracy: %.2f%%' % (100 * acc)
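
The accuracy above is computed on the training data itself, so it is optimistic. A minimal held-out evaluation, assuming the x and y arrays from the script above (train_size=0.8 mirrors the commented-out split in the code):

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB

# hold out 20% of the samples; random_state fixes the split for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y.ravel(), train_size=0.8, random_state=0)
model = Pipeline([('sc', StandardScaler()), ('clf', GaussianNB())])
model.fit(x_train, y_train)
print 'Held-out accuracy: %.2f%%' % (100 * model.score(x_test, y_test))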

MultinomialNB_intro.py

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB


if __name__ == "__main__":
    np.random.seed(0)   # fix the seed so the random data is reproducible
    M = 20              # 20 samples
    N = 5               # 5 features
    x = np.random.randint(2, size=(M, N))   # M x N matrix with entries drawn from [0, 2), i.e. 0 or 1
    x = np.array(list(set([tuple(t) for t in x])))  # deduplicate: rows -> tuples -> set -> array (20 rows become 17 here)
    M = len(x)          # number of distinct rows
    y = np.arange(M)    # each distinct row gets its own class label
    print 'Number of samples: %d, number of features: %d' % x.shape
    print 'Samples:\n', x
    mnb = MultinomialNB(alpha=1)    # naive Bayes with Laplace smoothing; exercise: swap in GaussianNB() and compare the predictions
    # mnb = GaussianNB()
    mnb.fit(x, y)               # train
    y_hat = mnb.predict(x)      # predict on the training data and check y_hat against y
    print 'Predicted classes:', y_hat
    print 'Accuracy: %.2f%%' % (100*np.mean(y_hat == y))
    print 'Model score:', mnb.score(x, y)
    # from sklearn import metrics
    # print metrics.accuracy_score(y, y_hat)
    err = y_hat != y
    for i, e in enumerate(err):     # walk the errors: y holds the true labels, y_hat the predictions
        if e:
            print y[i], ':\t', x[i], 'is classified into the same class as', x[y_hat[i]]
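
Because alpha=1 applies Laplace smoothing, a feature value that never occurs in a class still gets a small non-zero probability. A minimal sketch to inspect this, reusing the fitted mnb from the script above (feature_log_prob_ is MultinomialNB's per-class log P(feature | class)):

import numpy as np

probs = np.exp(mnb.feature_log_prob_)   # back from log space to probabilities
print 'P(feature | class 0):', probs[0]
print 'row sums (each class distribution sums to 1):', probs.sum(axis=1)
# with smoothing, each entry is (count + alpha) / (total + alpha * n_features), so none is exactly 0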

Text classification with six models
text_classification.py

#!/usr/bin/python
# -*- coding:utf-8 -*-

import numpy as np
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.datasets import fetch_20newsgroups # downloads/loads the 20 Newsgroups data
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from time import time
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib as mpl


def test_clf(clf):  # evaluate one classifier
    print u'Classifier:', clf
    alpha_can = np.logspace(-3, 2, 10)  # geometric sequence of 10 candidate values
    model = GridSearchCV(clf, param_grid={'alpha': alpha_can}, cv=5)    # grid search with 5-fold cross-validation (10 values x 5 folds = 50 fits)
    m = alpha_can.size  # number of parameter settings, used for the timing below
    if hasattr(clf, 'alpha'):           # naive Bayes / ridge hyperparameter
        model.set_params(param_grid={'alpha': alpha_can})
        m = alpha_can.size
    if hasattr(clf, 'n_neighbors'):     # k-nearest-neighbors hyperparameter
        neighbors_can = np.arange(1, 15)
        model.set_params(param_grid={'n_neighbors': neighbors_can})
        m = neighbors_can.size
    if hasattr(clf, 'C'):               # SVM hyperparameters
        C_can = np.logspace(1, 3, 3)    # only a few candidates, since SVM training is slow
        gamma_can = np.logspace(-3, 0, 3)
        model.set_params(param_grid={'C': C_can, 'gamma': gamma_can})
        m = C_can.size * gamma_can.size
    if hasattr(clf, 'max_depth'):       # random forest hyperparameter
        max_depth_can = np.arange(4, 10)
        model.set_params(param_grid={'max_depth': max_depth_can})
        m = max_depth_can.size
    t_start = time()
    model.fit(x_train, y_train)         # run the search and keep the best parameters
    t_end = time()
    t_train = (t_end - t_start) / (5*m) # divide by the total number of fits (5 folds x m settings)
    print u'Training time for 5-fold cross-validation: %.3f s / (5*%d) = %.3f s per fit' % ((t_end - t_start), m, t_train)
    print u'Best hyperparameters:', model.best_params_
    t_start = time()
    y_hat = model.predict(x_test)
    t_end = time()
    t_test = t_end - t_start
    print u'Test time: %.3f s' % t_test
    acc = metrics.accuracy_score(y_test, y_hat)
    print u'Test-set accuracy: %.2f%%' % (100 * acc)
    name = str(clf).split('(')[0]       # classifier name from its repr
    index = name.find('Classifier')
    if index != -1:                     # if the name ends in 'Classifier', strip that suffix
        name = name[:index]
    if name == 'SVC':                   # report SVC as SVM
        name = 'SVM'
    return t_train, t_test, 1-acc, name # per-fit train time, test time, error rate, display name


if __name__ == "__main__":
    print u'Downloading/loading the data...'
    t_start = time()
    # remove = ('headers', 'footers', 'quotes')   # optionally strip headers, footers and quoted text
    remove = ()
    categories = 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space'  # the four categories used here
    # categories = None     # to use all categories, make sure there is enough memory
    data_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=0, remove=remove)
    data_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True, random_state=0, remove=remove)
    t_end = time()  # report the download/load time
    print u'Data ready, elapsed time: %.3f s' % (t_end - t_start)
    print u'Data type:', type(data_train)
    print u'Number of documents in the training set:', len(data_train.data)
    print u'Number of documents in the test set:', len(data_test.data)
    print u'Names of the %d categories used for training and testing:' % len(categories)
    categories = data_train.target_names
    pprint(categories)  # pprint formats the list a bit more readably
    y_train = data_train.target     # class labels
    y_test = data_test.target
    # inspect the data
    print u' -- first 10 documents -- '
    for i in np.arange(10):
        print u'Document %d (category: %s):' % (i+1, categories[y_train[i]])
        print data_train.data[i]    # raw text of the training document
        print '\n\n'
    # turn the raw text into TF-IDF features (one possible featurization, not the only one)
    vectorizer = TfidfVectorizer(input='content', stop_words='english', max_df=0.5, sublinear_tf=True)  # TF-IDF with English stop words; drop terms in over 50% of documents
    x_train = vectorizer.fit_transform(data_train.data)    # fit the vocabulary and transform; x_train is sparse (scipy.sparse.csr.csr_matrix)
    x_test = vectorizer.transform(data_test.data)          # transform the test data with the same vocabulary

    print u'Training samples: %d, features: %d' % x_train.shape     # samples = documents, features = distinct terms
    print u'Stop words:\n',
    pprint(vectorizer.get_stop_words())     # the (non-informative) stop-word list
    feature_names = np.asarray(vectorizer.get_feature_names())      # vocabulary terms, kept for inspection
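    # TF-IDF recap (sklearn defaults with sublinear_tf=True): each term weight is
    #   tfidf(t, d) = (1 + log tf(t, d)) * (log((1 + n) / (1 + df(t))) + 1)
    # followed by L2 normalization of every document vector; max_df=0.5 additionally
    # drops terms that occur in more than half of the training documents.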

    print u'\n\n===================\nComparison of classifiers:\n'
    # per classifier: total search time (per-fit time), test time, accuracy
    clfs = (MultinomialNB(),                # 0.87s (0.017), 0.002s, 90.39%
            BernoulliNB(),                  # 1.592s (0.032), 0.010s, 88.54%
            KNeighborsClassifier(),         # 19.737s (0.282), 0.208s, 86.03%
            RidgeClassifier(),              # 25.6s (0.512), 0.003s, 89.73%
            RandomForestClassifier(n_estimators=200),   # 59.319s (1.977), 0.248s, 77.01%
            SVC()                           # 236.59s (5.258), 1.574s, 90.10%, soft-margin SVM
            )
    result = []
    for clf in clfs:
        a = test_clf(clf)
        result.append(a)
        print '\n'
    result = np.array(result)
    time_train, time_test, err, names = result.T
    # np.array promoted the mixed tuples to strings; convert the numeric columns back
    time_train = time_train.astype(float)
    time_test = time_test.astype(float)
    err = err.astype(float)
    x = np.arange(len(time_train))
    mpl.rcParams['font.sans-serif'] = [u'simHei']   # a CJK-capable font (optional now that the labels are in English)
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(10, 7), facecolor='w')
    ax = plt.axes()
    b1 = ax.bar(x, err, width=0.25, color='#77E0A0')    # error rate on the left axis
    ax_t = ax.twinx()                                   # second y-axis for the timings
    b2 = ax_t.bar(x+0.25, time_train, width=0.25, color='#FFA0A0')
    b3 = ax_t.bar(x+0.5, time_test, width=0.25, color='#FF8080')
    plt.xticks(x+0.5, names, fontsize=10)
    leg = plt.legend([b1[0], b2[0], b3[0]], (u'error rate', u'training time', u'test time'), loc='upper left', shadow=True)
    # for lt in leg.get_texts():
    #     lt.set_fontsize(14)
    plt.title(u'Comparison of classifiers on the newsgroup text data', fontsize=18)
    plt.xlabel(u'classifier')
    plt.grid(True)
    plt.tight_layout(pad=2)
    plt.show()
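
To use the winning model on raw text directly, the vectorizer and classifier can be chained into a single Pipeline. A minimal sketch, assuming the data_train/data_test objects loaded above; the alpha value and the sample sentence are illustrative, not tuned:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.5, sublinear_tf=True)),
    ('nb', MultinomialNB(alpha=0.01))])     # tune alpha with GridSearchCV as in test_clf above
text_clf.fit(data_train.data, data_train.target)
print 'Pipeline accuracy on the test set: %.2f%%' % (100 * text_clf.score(data_test.data, data_test.target))
# classify a brand-new document end to end
doc = ['NASA launched a new probe to study the solar wind']
print 'Predicted category:', data_train.target_names[text_clf.predict(doc)[0]]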
