import jieba
import numpy as np
import pandas as pd
#朴素贝叶斯实现中文垃圾邮件分类
#loadLabelFile(),加载index文件,使用字典来存储文件对应的标签
def loadLabelFile(labelFile='full/index'):
    """Load the trec06c index file and map each mail's path suffix to its label.

    Each index line looks like ``spam ../data/000/000``.  The line is split on
    the literal ``../data``, so keys are path suffixes such as ``/000/000``
    and values are the label strings ('spam' or 'ham').

    :param labelFile: path to the index file (default 'full/index')
    :return: dict mapping mail path suffix -> label string
    """
    labelDict = {}
    # 'with' guarantees the file handle is closed (the original leaked it).
    with open(labelFile, encoding='utf_8') as f:
        for line in f:
            if line.strip() != '':
                parts = line.strip().split('../data')
                # parts[0] is the label, parts[1] the path relative to the data dir
                labelDict[parts[1]] = parts[0].strip()
    return labelDict
# Build the path-suffix -> label mapping from the default index file and show it.
c=loadLabelFile()
print(c)
#readDatalFile(),读取data目录下的邮件数据,将垃圾邮件放入spam列表,正常邮件放入ham列表
def readDatalFile(dataFilePath, labelDict):
    """Read every mail file listed in labelDict and split them by label.

    :param dataFilePath: directory prefix concatenated with each path suffix
        from labelDict (the suffixes start with '/').
    :param labelDict: dict mapping path suffix -> label; 'spam' goes to the
        spam list, everything else to the ham list.
    :return: (spam, ham) - two lists of mail contents, one string per mail
        with the per-line whitespace/newlines stripped out.
    """
    spam = []
    ham = []
    for path, label in labelDict.items():
        filePath = dataFilePath + path
        lines = []
        with open(filePath, 'rb') as f:
            for line in f:
                # Decode as GBK; 'ignore' drops undecodable bytes instead of
                # raising (the corpus contains malformed sequences).
                lines.append(line.decode('gbk', 'ignore').strip())
        # Join once instead of the original quadratic `temp = temp + online`.
        content = ''.join(lines)
        if label == 'spam':
            spam.append(content)
        else:
            ham.append(content)
    return spam, ham
# Split the corpus into spam and ham mail bodies.
# NOTE(review): 'am' is almost certainly a typo for 'ham', but it is used
# under this name later in the file, so it is kept as-is here.
spam,am=readDatalFile('data/trec06c/data',c)
print(spam)
def loadStopWord(stopWordPath):
    """Load a stop-word list, one word per line.

    Bug fix: the original ignored ``stopWordPath`` and always opened the
    hard-coded ``'chineseStopWords.txt'``; the parameter is now honored
    (backward-compatible, since the only caller passes that same path).

    :param stopWordPath: path to a UTF-8 stop-word file
    :return: list of non-empty, stripped stop words in file order
    """
    stopWordList = []
    with open(stopWordPath, encoding='utf-8') as f:
        for word in f:
            word = word.strip()
            if word != '':  # skip blank lines
                stopWordList.append(word)
    return stopWordList
# Load the Chinese stop words used to filter tokens during preprocessing.
stopWordList=loadStopWord('chineseStopWords.txt')
import re
def dataProcess(mailList, stopWordList):
    """Clean and tokenize each mail: keep only Chinese characters, segment
    with jieba, drop stop words, and join tokens with spaces so that a
    downstream TfidfVectorizer can split them apart again.

    Bug fixes vs. the original:
    - ``re.findall`` returns a LIST of matches; it must be joined into a
      single string before being handed to ``jieba.cut`` (which expects str).
    - tokens are joined with spaces; the original used ``''`` which glued
      every token back into one unbreakable string, defeating segmentation.

    :param mailList: list of raw mail strings
    :param stopWordList: iterable of stop words to remove
    :return: list of processed mail strings (space-separated tokens)
    """
    stopWords = set(stopWordList)  # O(1) membership instead of O(n) list scans
    mailProcessedList = []
    for mail in mailList:
        # Keep only CJK unified ideographs (drops ASCII, digits, punctuation).
        chineseOnly = ''.join(re.findall(r'[\u4e00-\u9fa5]+', mail))
        cutword = jieba.cut(chineseOnly)
        mailProcessed = ' '.join(word for word in cutword if word not in stopWords)
        mailProcessedList.append(mailProcessed)
    return mailProcessedList
# Preprocess both mail classes.
# NOTE(review): 'spanList' looks like a typo for 'spamList'; kept because it
# is referenced by this name below.
spanList = dataProcess(spam,stopWordList)
hamList = dataProcess(am,stopWordList)
print(spanList)
def getDataAndLabel(spamList, hamlist):
    """Concatenate spam and ham mails into one data list with parallel labels.

    :param spamList: processed spam mails (labelled 1)
    :param hamlist: processed ham mails (labelled 0)
    :return: (dataList, labelList) - all mails (spam first) and their 1/0 labels
    """
    # List concatenation + repetition instead of element-wise append loops.
    dataList = list(spamList) + list(hamlist)
    labelList = [1] * len(spamList) + [0] * len(hamlist)
    return dataList, labelList
dataList,labelList = getDataAndLabel(spanList,hamList)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Hold out 20% of the mails for evaluation; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(dataList, labelList, test_size=0.2, random_state=9)

# Fit the TF-IDF vocabulary on the TRAINING split only, then reuse the same
# vocabulary to transform the test split.
tfidf = TfidfVectorizer(max_features=4000)
x_train_tfidf = tfidf.fit_transform(x_train)
print(type(x_train_tfidf))
print(x_train_tfidf.shape)
x_test_tfidf = tfidf.transform(x_test)
print(x_test_tfidf.shape)

# Bug fix: the original called mnb.fit(x_test_tfidf, y_test), i.e. it trained
# on the held-out TEST set, never used the training split, and never evaluated
# anything ('scores' was just the fitted estimator returned by fit()).
# Train on the training split and report accuracy on the test split.
mnb = MultinomialNB()
mnb.fit(x_train_tfidf, y_train)
scores = mnb.score(x_test_tfidf, y_test)
print(scores)