手写实现朴素贝叶斯分类:
# 手写贝叶斯分类算法实现
import numpy as np
import jieba
# Naive Bayes is also a supervised learning model (can be used for text sentiment analysis)
# Build the training samples: each string is one document
texts = ['中国是一个伟大的国家, 中国很美好, 我喜欢中国。',
'总有些坏人,做一些坏事',
'今天是美好的一天, 心情特别好',
'生活很美好, 人工智能是实现更加美好生活的核心技术',
'天气坏透了, 一直下雨, 心情很糟糕, 坏天气, 坏心情',
'你真坏,你是一个大坏蛋, 坏人']
# Per-document class labels; from the sample contents, 1 appears to mean
# positive sentiment and 0 negative sentiment — TODO confirm with the author
label = np.array([1, 0, 1, 1, 0, 0])
# Build the dictionary: a duplicate-free list of all tokens in the corpus.
def makeWordList(tests):
    """Tokenize each document with jieba and build the vocabulary.

    tests: iterable of document strings.
    Returns (wordList, wordSplit): the vocabulary (distinct tokens in
    first-seen order) and the per-document token lists.
    """
    wordList = []   # vocabulary: every distinct token, in first-seen order
    wordSplit = []  # the token list of each document (original typo: worfSplit)
    for text in tests:  # BUG FIX: original iterated the global `texts`, ignoring the parameter
        words = list(jieba.cut(text))
        print(words)
        wordSplit.append(words)
        for w in words:
            if w not in wordList:
                wordList.append(w)
    # Return the results (the original version built nothing and returned None,
    # inconsistent with the complete version below).
    return wordList, wordSplit

makeWordList(texts)
完整算法如下:
# 手写贝叶斯分类算法实现
import numpy as np
import jieba
# Naive Bayes is also a supervised learning model (can be used for text sentiment analysis)
# Build the training samples: each string is one document
texts = ['中国是一个伟大的国家, 中国很美好, 我喜欢中国。',
'总有些坏人,做一些坏事',
'今天是美好的一天, 心情特别好',
'生活很美好, 人工智能是实现更加美好生活的核心技术',
'天气坏透了, 一直下雨, 心情很糟糕, 坏天气, 坏心情',
'你真坏,你是一个大坏蛋, 坏人']
# Per-document class labels; from the sample contents, 1 appears to mean
# positive sentiment and 0 negative sentiment — TODO confirm with the author
label = np.array([1, 0, 1, 1, 0, 0])
# 1. Build the dictionary: a duplicate-free list of all tokens in the corpus.
def makeWordList(tests):
    """Tokenize each document with jieba and build the vocabulary.

    Parameters
    ----------
    tests : iterable of str
        The raw documents.

    Returns
    -------
    wordList : list of str
        Distinct tokens, in first-seen order.
    wordSplit : list of list of str
        The token list of each document.
    """
    wordList = []
    wordSplit = []
    for text in tests:  # BUG FIX: original iterated the global `texts`, ignoring the parameter
        words = list(jieba.cut(text))
        wordSplit.append(words)
        for w in words:
            if w not in wordList:
                wordList.append(w)
    print(wordList)
    return wordList, wordSplit
# 2. Count term frequencies of each document against the dictionary.
def makeWordCount(tests, wordList):
    """Return a (n_docs, n_vocab) term-frequency matrix.

    Parameters
    ----------
    tests : iterable of str
        Documents to vectorize.
    wordList : list of str
        The vocabulary; tokens not in it are ignored.

    Returns
    -------
    numpy.ndarray of shape (len(tests), len(wordList)).
    """
    # Prebuilt token -> column map: O(1) lookups instead of the original
    # wordList.index(w), which re-scanned the vocabulary for every token.
    index = {w: i for i, w in enumerate(wordList)}
    wordCount = []
    for test in tests:
        wc = np.zeros(len(wordList))
        for w in jieba.cut(test):
            col = index.get(w)
            if col is not None:
                wc[col] += 1
        wordCount.append(wc)
    return np.array(wordCount)
# Build the vocabulary and the term-frequency matrix for the training corpus
wordList, wordSplit = makeWordList(texts)
wordCount = makeWordCount(texts, wordList)
print(wordCount)
# Training: estimate the Naive Bayes parameters.
def train(wordCount, wordList, labels):
    """Fit a multinomial Naive Bayes model from a term-frequency matrix.

    Parameters
    ----------
    wordCount : numpy.ndarray, shape (n_docs, n_vocab)
        Per-document term frequencies.
    wordList : list of str
        The vocabulary (only its length is used).
    labels : array-like of {0, 1}
        Class label of each document.

    Returns
    -------
    py0, py1 : float
        Class priors P(y=0) and P(y=1).
    pxy0, pxy1 : numpy.ndarray
        Smoothed conditional word probabilities P(w | y).
    """
    labels = np.asarray(labels)
    # BUG FIX: priors were hard-coded as 3/6 for both classes, which is only
    # correct for this particular six-sample dataset; compute them instead.
    py0 = np.mean(labels == 0)
    py1 = np.mean(labels == 1)
    # Laplace smoothing: start every count at 1 so no word gets probability 0
    # (a zero would wipe out the whole product / send the log-sum to -inf).
    pxy0 = np.ones(len(wordList))
    pxy1 = np.ones(len(wordList))
    for y, wd in zip(labels, wordCount):
        if y == 0:
            pxy0 += wd
        elif y == 1:
            pxy1 += wd
    # Normalize the smoothed counts into per-class word probabilities.
    pxy0 = pxy0 / pxy0.sum()
    pxy1 = pxy1 / pxy1.sum()
    # Model parameters
    return py0, py1, pxy0, pxy1
# Fit the model on the training term-frequency matrix
py0, py1, pxy0, pxy1 = train(wordCount, wordList, label)
# Prediction: score a document against both classes.
def predict(py0, py1, pxy0, pxy1, test, wordList):
    """Classify the single document in `test` (a one-element list of str).

    Returns 0 or 1, whichever class has the larger log-posterior.
    """
    wd = makeWordCount(test, wordList)
    print(wd)
    # Work in log space: a product of many small probabilities would
    # underflow the smallest representable float, so sum logs instead
    # of multiplying probabilities.
    counts = wd[0]
    score0 = np.log(py0) + np.sum(counts * np.log(pxy0))
    score1 = np.log(py1) + np.sum(counts * np.log(pxy1))
    return 0 if score0 > score1 else 1
# Sanity checks: one clearly positive phrase, one clearly negative phrase
result = predict(py0, py1, pxy0, pxy1, ['中国很美好'], wordList)
print(result)
result = predict(py0, py1, pxy0, pxy1, ['大坏蛋'], wordList)
print(result)
加入停用词后的朴素贝叶斯算法如下:
# 手写贝叶斯分类算法实现
import numpy as np
import jieba
# Naive Bayes is also a supervised learning model (can be used for text sentiment analysis)
# Build the training samples: each string is one document
texts = ['中国是一个伟大的国家, 中国很美好, 我喜欢中国。',
'总有些坏人,做一些坏事',
'今天是美好的一天, 心情特别好',
'生活很美好, 人工智能是实现更加美好生活的核心技术',
'天气坏透了, 一直下雨, 心情很糟糕, 坏天气, 坏心情',
'你真坏,你是一个大坏蛋, 坏人']
# Per-document class labels; from the sample contents, 1 appears to mean
# positive sentiment and 0 negative sentiment — TODO confirm with the author
label = np.array([1, 0, 1, 1, 0, 0])
# Stop words: high-frequency function words excluded from the vocabulary
stopWords = [',', '我', '是', '个', '一个', '你', '他', '了', '的', '一天', '一些']
# 1. Build the dictionary: a duplicate-free token list, excluding stop words.
def makeWordList(tests, stopWords):
    """Tokenize each document with jieba and build the vocabulary.

    Parameters
    ----------
    tests : iterable of str
        The raw documents.
    stopWords : iterable of str
        Tokens to exclude from the vocabulary.

    Returns
    -------
    wordList : list of str
        Distinct non-stop-word tokens, in first-seen order.
    wordSplit : list of list of str
        The token list of each document (stop words are kept here).
    """
    stop = set(stopWords)  # O(1) membership tests
    wordList = []
    wordSplit = []
    for text in tests:  # BUG FIX: original iterated the global `texts`, ignoring the parameter
        words = list(jieba.cut(text))
        wordSplit.append(words)
        for w in words:
            if w not in wordList and w not in stop:
                wordList.append(w)
    print(wordList)
    return wordList, wordSplit
# 2. Count term frequencies of each document against the dictionary.
def makeWordCount(tests, wordList):
    """Return a (n_docs, n_vocab) term-frequency matrix.

    Parameters
    ----------
    tests : iterable of str
        Documents to vectorize.
    wordList : list of str
        The vocabulary; tokens not in it are ignored.

    Returns
    -------
    numpy.ndarray of shape (len(tests), len(wordList)).
    """
    # Prebuilt token -> column map: O(1) lookups instead of the original
    # wordList.index(w), which re-scanned the vocabulary for every token.
    index = {w: i for i, w in enumerate(wordList)}
    wordCount = []
    for test in tests:
        wc = np.zeros(len(wordList))
        for w in jieba.cut(test):
            col = index.get(w)
            if col is not None:
                wc[col] += 1
        wordCount.append(wc)
    return np.array(wordCount)
# Build the stop-word-filtered vocabulary and the term-frequency matrix
wordList, wordSplit = makeWordList(texts, stopWords)
wordCount = makeWordCount(texts, wordList)
print(wordCount)
# Training: estimate the Naive Bayes parameters.
def train(wordCount, wordList, labels):
    """Fit a multinomial Naive Bayes model from a term-frequency matrix.

    Parameters
    ----------
    wordCount : numpy.ndarray, shape (n_docs, n_vocab)
        Per-document term frequencies.
    wordList : list of str
        The vocabulary (only its length is used).
    labels : array-like of {0, 1}
        Class label of each document.

    Returns
    -------
    py0, py1 : float
        Class priors P(y=0) and P(y=1).
    pxy0, pxy1 : numpy.ndarray
        Smoothed conditional word probabilities P(w | y).
    """
    labels = np.asarray(labels)
    # BUG FIX: priors were hard-coded as 3/6 for both classes, which is only
    # correct for this particular six-sample dataset; compute them instead.
    py0 = np.mean(labels == 0)
    py1 = np.mean(labels == 1)
    # Laplace smoothing: start every count at 1 so no word gets probability 0
    # (a zero would wipe out the whole product / send the log-sum to -inf).
    pxy0 = np.ones(len(wordList))
    pxy1 = np.ones(len(wordList))
    for y, wd in zip(labels, wordCount):
        if y == 0:
            pxy0 += wd
        elif y == 1:
            pxy1 += wd
    # Normalize the smoothed counts into per-class word probabilities.
    pxy0 = pxy0 / pxy0.sum()
    pxy1 = pxy1 / pxy1.sum()
    # Model parameters
    return py0, py1, pxy0, pxy1
# Fit the model on the training term-frequency matrix
py0, py1, pxy0, pxy1 = train(wordCount, wordList, label)
# Prediction: score a document against both classes.
def predict(py0, py1, pxy0, pxy1, test, wordList):
    """Classify the single document in `test` (a one-element list of str).

    Returns 0 or 1, whichever class has the larger log-posterior.
    """
    wd = makeWordCount(test, wordList)
    print(wd)
    # Work in log space: a product of many small probabilities would
    # underflow the smallest representable float, so sum logs instead
    # of multiplying probabilities.
    counts = wd[0]
    score0 = np.log(py0) + np.sum(counts * np.log(pxy0))
    score1 = np.log(py1) + np.sum(counts * np.log(pxy1))
    return 0 if score0 > score1 else 1
# Sanity checks: one clearly positive phrase, one clearly negative phrase
result = predict(py0, py1, pxy0, pxy1, ['中国很美好'], wordList)
print(result)
result = predict(py0, py1, pxy0, pxy1, ['大坏蛋'], wordList)
print(result)