A Python Implementation of Naive Bayes Classification

Bayesian classification

Given two groups of texts (one belonging to class a, the other to class b), a naive Bayes classifier (NBC) classifies a sample by computing its probability under each class and choosing the class with the higher score.
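
Concretely, under the naive assumption that words are conditionally independent given the class, the posterior probability of class c for a document with words w1, ..., wn is proportional to

P(c) * P(w1|c) * P(w2|c) * ... * P(wn|c)

The code below compares these scores in log space, which turns the product into a sum and avoids floating-point underflow.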

1. Import the required modules
import numpy as np
import re
import random
2. Load the data
def str_strlist(String):
    ## string -> list of lowercase tokens
    listOfTokens = re.split(r"\W+", String)  # split on runs of non-word characters
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]  # drop tokens of 2 characters or fewer
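
As a quick sanity check (the sample sentence is made up for illustration):

print(str_strlist("Naive Bayes is simple, but it works!"))
# ['naive', 'bayes', 'simple', 'but', 'works']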

def open_test():
    ## load the corpus: 25 texts of class a (label 1) and 25 texts of class b (label 0)
    dataSet = []; classList = []  # dataSet holds token lists [["c1","c2",...],...]; classList holds labels
    for i in range(1, 26):
        # read a text belonging to class a (raw string avoids the \a and \b escape bugs in the path)
        wordList = str_strlist(open(r'算法数据\贝叶斯\a\%d.txt' % i, "rb").read().decode("gbk", "ignore"))
        dataSet.append(wordList)
        classList.append(1)
        # read a text belonging to class b
        wordList = str_strlist(open(r'算法数据\贝叶斯\b\%d.txt' % i, "rb").read().decode("gbk", "ignore"))
        dataSet.append(wordList)
        classList.append(0)
    return dataSet, classList
1. Test the open_test() function

dataSet,classList=open_test()
print(dataSet)
print(classList)

2. Test output (partial)

[['codeine', '15mg', …, 'only'], ['peter', 'with', …, 'eugene']]
[1, 0, …]

3. Convert texts to word vectors
def wordSet(dataSet):
    ## merge the per-text token lists into one deduplicated vocabulary
    vocab = set([])  # create an empty set
    for wordData in dataSet:
        vocab = vocab | set(wordData)  # set union merges and removes duplicates
    return list(vocab)

def wordCount(wordList, word0):
    ## wordList: vocabulary; word0: token list of a single document
    ## returns a set-of-words vector: 1 if the vocabulary word occurs in the document, else 0
    wordVec = [0]*len(wordList)
    for word1 in word0:
        if word1 in wordList:
            wordVec[wordList.index(word1)] = 1
    return wordVec
1. Test the wordSet() and wordCount() functions

dataSet,classList=open_test()
wordList=wordSet(dataSet)
wordVec=wordCount(wordList, dataSet[1])
print(wordList) ## vocabulary
print(wordVec) ## word vector

2. Test output

['can', 'bags', 'sent', 'express', 'school', …, 'assigning', 'inform', 'income', 'decision', 'mathematics']
[0, 0, 1, 1, 0, …, 0, 0, 1, 0, 0]

4. The Bayes classifier
def drill_bayes(traindata, trainClass):
    ## train the model
    numtxt = len(traindata)               # number of training documents
    numWords = len(traindata[0])          # vocabulary size
    p = sum(trainClass)/float(numtxt)     # prior probability of class a
    paNum = np.ones(numWords)             # per-word counts, initialized to 1
    pbNum = np.ones(numWords)             # (Laplace smoothing: no word gets probability 0)
    paDenom = 2.0                         # per-class totals, initialized to 2 to match
    pbDenom = 2.0
    for i in range(numtxt):               # accumulate counts document by document
        if trainClass[i] == 1:
            paNum += traindata[i]
            paDenom += sum(traindata[i])
        else:
            pbNum += traindata[i]
            pbDenom += sum(traindata[i])
    pa = np.log(paNum/paDenom)            # log P(word | class a); logs prevent underflow
    pb = np.log(pbNum/pbDenom)            # log P(word | class b)
    return pa, pb, p
    
def class_bayes(data_txt, pa, pb, p):
    ## classify a single word vector
    p1 = sum(data_txt*pa) + np.log(p)       # log-posterior score for class a
    p0 = sum(data_txt*pb) + np.log(1.0-p)   # log-posterior score for class b
    if p1 > p0:
        return 1
    else:
        return 0
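
As a minimal sanity check on a made-up three-word vocabulary (both training vectors and the query vector below are invented for illustration), the classifier should pick the class whose training document the query resembles:

pa, pb, p = drill_bayes([[1, 0, 1], [0, 1, 1]], [1, 0])
print(class_bayes(np.array([1, 0, 1]), pa, pb, p))  # prints 1: the query matches the class-a document
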
5. Training and testing
## hold out 10 random documents for testing, then train on the remaining 40
## (holding out before training avoids evaluating on documents the model has already seen)
dataSet, classList = open_test()
wordList = wordSet(dataSet)
trainingSet = list(range(50))
testSet = []
for i in range(10):
    randIndex = int(random.uniform(0, len(trainingSet)))
    testSet.append(trainingSet[randIndex])
    del(trainingSet[randIndex])
## train the model
trainMat = []; trainClasses = []
for docIndex in trainingSet:
    trainMat.append(wordCount(wordList, dataSet[docIndex]))
    trainClasses.append(classList[docIndex])
pa, pb, p = drill_bayes(trainMat, trainClasses)
## test the model
errorCount = 0
for docIndex in testSet:
    wordVector = wordCount(wordList, dataSet[docIndex])
    if class_bayes(np.array(wordVector), pa, pb, p) != classList[docIndex]:
        errorCount += 1
        print("Misclassified: document %s" % (docIndex+1))
print('Error rate:', float(errorCount)/len(testSet))

Error rate: 0.0
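
Because the 10 test documents are drawn at random, the error rate of a single run is noisy. A small helper (a hypothetical addition, not part of the original post; it only reuses the functions defined above) can average the error over several random splits:

def repeated_holdout(runs=10):
    ## repeat the random 40/10 split and return the mean error rate (illustrative helper)
    dataSet, classList = open_test()
    wordList = wordSet(dataSet)
    rates = []
    for _ in range(runs):
        trainingSet = list(range(50)); testSet = []
        for i in range(10):
            randIndex = int(random.uniform(0, len(trainingSet)))
            testSet.append(trainingSet.pop(randIndex))
        trainMat = [wordCount(wordList, dataSet[i]) for i in trainingSet]
        trainClasses = [classList[i] for i in trainingSet]
        pa, pb, p = drill_bayes(trainMat, trainClasses)
        errors = sum(class_bayes(np.array(wordCount(wordList, dataSet[i])), pa, pb, p) != classList[i]
                     for i in testSet)
        rates.append(errors/10.0)
    return sum(rates)/runs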

6. Using the classifier
txt36 = str_strlist(open(r'算法数据\贝叶斯\%d.txt' % 36, "rb").read().decode("gbk", "ignore"))
wordVector = wordCount(wordList, txt36)
cb = class_bayes(np.array(wordVector), pa, pb, p)
if cb == 1:
    print("This text belongs to class a")
else:
    print("This text belongs to class b")

This text belongs to class a
