***1.案例简介
利用朴素贝叶斯分类方法对文本数据进行情感分析,进而设计一个留言板过滤系统.(以下代码实现不要使用SKLEARN模块)
2.数据采集
以下表格中是一个文本分类的问题,区分一句话是粗鲁的还是文明的,类别标签只有Yes或No,表示是粗鲁的和不是粗鲁的语句。
3.数据预处理
给定一个词典 ["my","name","is","Devin","you","are","stupid","boyfriend","SB","looks","very","smart","like","much"],需要将每一段文字进行离散化,即进行空间向量化,
4.建模与分析
利用朴素贝叶斯分类方法测试语句 "I like you.",判断它是粗鲁的还是不粗鲁的?(答案:不粗鲁)***
import pandas as pd
import numpy as np
import numpy
import jieba
# Training corpus: four sentences labelled rude ("Yes") / not rude ("No").
data = pd.DataFrame(
    {
        "Words": [
            "my name is Devin.",
            "you are stupid.",
            "my boyfriend is SB.",
            "you looks very smart,I like you very much.",
        ],
        "Label": ["No", "Yes", "Yes", "No"],
    }
)
# Clean the text. regex=False makes "." a literal dot; with the old pandas
# default (regex=True) "." is the regex wildcard and would erase every
# character of every sentence.
data["Words"] = data["Words"].str.replace(",", " ", regex=False)
data["Words"] = data["Words"].str.replace(".", "", regex=False)
# Vocabulary: the set of distinct tokens in the whole corpus.
text = data.Words.values.tolist()
colname = set(" ".join(text).split(" "))
# Bag-of-words matrix: one row per sentence, one column per vocabulary word.
# Count whole tokens rather than using str.count(word), which would also
# match occurrences of `word` embedded inside longer words.
data1 = pd.DataFrame(0, columns=sorted(colname), index=range(len(data)))
for i in range(len(data)):
    for token in data["Words"][i].split(" "):
        data1.loc[i, token] += 1
# Numeric class label: 1 = rude ("Yes"), 0 = not rude ("No").
data1["classLabel"] = [0, 1, 1, 0]
# Feature matrix without the label column.
data2 = data1.drop("classLabel", axis=1)
# Total number of tokens in the corpus (duplicates included).
numWords = len(" ".join(text).split(" "))
print("单词的总数:", numWords)
# p1Num: per-word occurrence counts within the positive (rude, label 1) class
# p0Num: per-word occurrence counts within the negative (label 0) class
# Working directly on the Series avoids the list -> ndarray round-trip.
p1Num = data2[data1["classLabel"] == 1].sum()
p0Num = data2[data1["classLabel"] == 0].sum()
print("表示正样本中各单词出现的次数矩阵:", list(p1Num))
print("表示负样本中各单词出现的次数矩阵:", list(p0Num))
# Conditional probabilities P(word|class) with Laplace (add-one) smoothing:
# a word never seen in a class would otherwise get probability 0 and zero
# out the whole naive Bayes product for that class.
vocabSize = len(data2.columns)
p1Vec = (p1Num + 1) / (p1Num.sum() + vocabSize)
p0Vec = (p0Num + 1) / (p0Num.sum() + vocabSize)
print("表示正样本条件下的各单词的出现的概率:", list(p1Vec))
print("表示负样本条件下的各单词的出现的概率:", list(p0Vec))
# Single-column lookup tables indexed by word, used by the classifier below.
p1 = pd.DataFrame(p1Vec.values, index=data2.columns)
p0 = pd.DataFrame(p0Vec.values, index=data2.columns)
# Naive Bayes decision rule: compare
#   P(rude) * prod P(word|rude)   vs   P(not rude) * prod P(word|not rude)
# (the shared denominator P(sentence) cancels and can be ignored).
# The original loop summed prior * P(word|class) per word, which is not
# Bayes' rule: the prior must be applied once and the word likelihoods
# multiplied together.
sentence = "I like you"
tokens = sentence.split(" ")
totalDocs = len(data2.index)
priorPos = len(data2[data1["classLabel"] == 1].index) / totalDocs
priorNeg = len(data2[data1["classLabel"] == 0].index) / totalDocs
positive = priorPos
negative = priorNeg
for tok in tokens:
    # p1.loc[tok] is a one-element Series (single column); take the scalar
    # explicitly instead of the deprecated float(Series) conversion.
    positive *= float(p1.loc[tok].iloc[0])
    negative *= float(p0.loc[tok].iloc[0])
if positive > negative:
    print("粗鲁")
else:
    print("不粗鲁")
***5.使用 sklearn 模块的简化对照实现***
import pandas as pd
import numpy as np
import numpy
import jieba
from sklearn.feature_extraction.text import CountVectorizer
word1 = "my name is Devin."
# NOTE: the original line ended with a stray comma, turning word2 into a
# 1-tuple, so the punctuation filter silently left the "." in place.
word2 = "you are stupid."
word3 = "my boyfriend is SB."
word4 = "you looks very smart,I like you very much."
# Punctuation characters to strip before tokenisation.
punct = set(u''',.''')
# Remove every punctuation character from a string.
filterpunt = lambda s: ''.join(filter(lambda x: x not in punct, s))
# Tokenise with jieba and re-join with spaces so CountVectorizer can split.
con1 = " ".join(jieba.cut(filterpunt(word1)))
con2 = " ".join(jieba.cut(filterpunt(word2)))
con3 = " ".join(jieba.cut(filterpunt(word3)))
con4 = " ".join(jieba.cut(filterpunt(word4)))
# Bag-of-words counts for the four sentences.
vect = CountVectorizer()
result = vect.fit_transform([con1, con2, con3, con4])
print(result.toarray())
# get_feature_names() was removed in scikit-learn 1.2; use the
# get_feature_names_out() replacement.
print(vect.get_feature_names_out())
data1 = pd.DataFrame(result.toarray(), columns=vect.get_feature_names_out())
data1["label"] = [0, 1, 1, 0]
data2 = data1.drop("label", axis=1)