Machine Learning: A Text Classification Example with Naive Bayes
1. Prepare the training samples
   The Fudan University text classification sample data set is used.
2. Train the model
3. Prepare the test data
4. Classify
Training the model
import os
import pickle

import jieba
# Bunch container class (in newer scikit-learn versions it lives in sklearn.utils)
from sklearn.datasets.base import Bunch
from sklearn.feature_extraction.text import TfidfVectorizer  # tokenizing, counting and TF-IDF weighting in one step
# Two helper functions for reading and writing text files
# Write content to a file
def savefile(savepath, content):
    with open(savepath, "w", encoding="GBK") as fp:
        fp.write(content)

# Read a file with the given encoding; returns None if the file cannot be decoded
def readfile(path, encode):
    try:
        with open(path, "r", encoding=encode) as fp:
            return fp.read()
    except UnicodeDecodeError:
        print("Error: failed to read " + path)
        return None
# Read a pickled Bunch object
def readbunchobj(path):
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)

# Write a Bunch object to disk with pickle
def writebunchobj(path, bunchobj):
    with open(path, "wb") as file_obj:
        pickle.dump(bunchobj, file_obj)
# Main routine: segment the whole corpus with jieba and store the segmented training texts
def segment(corpus_path, seg_path):
    # Each sub-directory of corpus_path is one category
    cateList = os.listdir(corpus_path)
    for myDir in cateList:
        if not myDir.startswith("."):
            # Path of the category directory in the raw corpus
            class_path = corpus_path + myDir + "/"
            # Matching category directory for the segmented corpus
            seg_dir = seg_path + myDir + "/"
            # Create the output directory if it does not exist yet
            if not os.path.exists(seg_dir):
                os.makedirs(seg_dir)
            # All files of this category
            file_list = os.listdir(class_path)
            for file_path in file_list:
                # Full path of the file
                fullname = class_path + file_path
                print("path:" + fullname)
                # Read the file content
                content = readfile(fullname, "GBK")
                if content is not None:
                    # Remove line breaks and redundant whitespace
                    content = content.replace("\r\n", "").strip()
                    # Segment the content with jieba
                    content_seg = jieba.cut(content)
                    # Save the tokens separated by spaces
                    # (a space separator is required so TfidfVectorizer can tokenize later)
                    savefile(seg_dir + file_path, " ".join(content_seg))
    print("Finished segmenting the Chinese corpus!")
# Convert the segmented text files into a Bunch object and persist it
def bunchObj(wordbag_path, seg_path):
    bunch = Bunch(target_name=[], label=[], filename=[], contents=[])
    # Category names are the sub-directories of the segmented corpus (skip hidden entries)
    catelist = [d for d in os.listdir(seg_path) if not d.startswith(".")]
    bunch.target_name.extend(catelist)
    for myDir in catelist:
        class_path = seg_path + myDir + "/"
        file_list = os.listdir(class_path)
        for file_path in file_list:
            fullname = class_path + file_path
            print(fullname)
            # Class label of the current file
            bunch.label.append(myDir)
            # Full path of the current file
            bunch.filename.append(fullname)
            # Segmented content of the current file
            bunch.contents.append(readfile(fullname, "GBK").strip())
    # Persist the Bunch object
    writebunchobj(wordbag_path, bunch)
    print("Finished building the text Bunch object!")
# Train: build the TF-IDF vector space from the training set
def startTrain(stopword_path, wordbag_path, space_path):
    # 1. Load the stop word list
    stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
    # 2. Load the segmented training corpus from its Bunch object
    bunch = readbunchobj(wordbag_path)
    # 3. Build the TF-IDF vector space model
    tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                       filename=bunch.filename, tdm=[], vocabulary={})
    # TfidfVectorizer performs tokenizing, counting and TF-IDF weighting in one step
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
    # 4. Convert the documents into a TF-IDF weighted term-document matrix
    tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
    # Keep the fitted vocabulary so the test set can be mapped into the same feature space
    tfidfspace.vocabulary = vectorizer.vocabulary_
    # 5. Persist the vector space ("word bag")
    writebunchobj(space_path, tfidfspace)
    print("Finished training the text classification model")
# Raw (unsegmented) training corpus, one sub-directory per category
corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/文本集/train/"
# Output directory for the segmented training corpus
segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_segment/"
# Persisted Bunch object of the segmented training corpus
wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/train_set.dat"
# Stop word list
stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
# Persisted TF-IDF vector space of the training set
space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

# Step 1: segment the training texts (uncomment on the first run)
# segment(corpus_path, segment_path)
# Step 2: convert the segmented texts into a persisted Bunch object (uncomment on the first run)
# bunchObj(wordbag_path, segment_path)
# Step 3: build and persist the TF-IDF vector space
startTrain(stop_words_path, wordbag_path, space_path)
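After the script runs, the persisted TF-IDF space can be reloaded for a quick sanity check. A minimal sketch, reusing the readbunchobj helper and the space_path variable defined above:

# Reload the persisted training vector space and inspect its dimensions
tfidfspace = readbunchobj(space_path)
print("categories:", tfidfspace.target_name)
print("documents:", len(tfidfspace.label))
print("tdm shape (documents, vocabulary size):", tfidfspace.tdm.shape)
print("vocabulary size:", len(tfidfspace.vocabulary))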
Preparing the test data
The test-preparation script begins with the same imports and the same helper functions as the training script above (savefile, readfile, readbunchobj, writebunchobj, segment and bunchObj), so they are not repeated here. Only the test-specific part follows.
# Build the TF-IDF vector space for the test set, reusing the training vocabulary
def buildTestSpace(stopword_path, wordbag_path, space_path, train_space_path):
    # 1. Load the stop word list
    stpwrdlst = readfile(stopword_path, "UTF-8").splitlines()
    # 2. Load the segmented test corpus from its Bunch object
    bunch = readbunchobj(wordbag_path)
    # 3. Build the test TF-IDF vector space
    testspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                      filenames=bunch.filename, tdm=[], vocabulary={})
    # 4. Load the training vector space to reuse its vocabulary
    trainbunch = readbunchobj(train_space_path)
    # 5. Initialize the vectorizer with the training vocabulary so that test documents
    #    are mapped into the same feature space as the training documents.
    #    Note: fit_transform still recomputes IDF statistics on the test set;
    #    only the vocabulary is shared with the training set.
    vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5,
                                 vocabulary=trainbunch.vocabulary)
    testspace.tdm = vectorizer.fit_transform(bunch.contents)
    testspace.vocabulary = trainbunch.vocabulary
    # 6. Persist the test vector space
    writebunchobj(space_path, testspace)
    print("Finished building the test TF-IDF vector space")
# Raw (unsegmented) test corpus, one sub-directory per category
corpus_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/测试文本集/"
# Output directory for the segmented test corpus
segment_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_segment/"
# Persisted Bunch object of the segmented test corpus
wordbag_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/test_set.dat"
# Stop word list
stop_words_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/中文停用词表.txt"
# Persisted test TF-IDF vector space
space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"
# Persisted training TF-IDF vector space (provides the shared vocabulary)
train_space_path = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"

# Step 1: segment the test texts (uncomment on the first run)
# segment(corpus_path, segment_path)
# Step 2: convert the segmented texts into a persisted Bunch object (uncomment on the first run)
# bunchObj(wordbag_path, segment_path)
# Step 3: build and persist the test TF-IDF vector space
buildTestSpace(stop_words_path, wordbag_path, space_path, train_space_path)
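As a quick consistency check (only a sketch, assuming the helper functions above and that both .dat files were written to the paths above): the test matrix must have exactly as many columns as the training vocabulary, otherwise a classifier trained on the training matrix cannot score it.

# The test vectorizer was built from the training vocabulary, so column counts must match
testspace = readbunchobj(space_path)
trainspace = readbunchobj(train_space_path)
assert testspace.tdm.shape[1] == trainspace.tdm.shape[1]
print("test documents:", testspace.tdm.shape[0], "shared features:", testspace.tdm.shape[1])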
Testing
import pickle

from sklearn.naive_bayes import MultinomialNB  # multinomial Naive Bayes classifier

def readbunchobj(path):
    with open(path, "rb") as file_obj:
        return pickle.load(file_obj)

# Load the training vector space
trainpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/train_word_bag/tfidfspace.dat"
train_set = readbunchobj(trainpath)

# Load the test vector space
testpath = "/Users/FengZhen/Desktop/accumulate/机器学习/classify_test/test_word_bag/testspace.dat"
test_set = readbunchobj(testpath)

# Train a multinomial Naive Bayes classifier.
# alpha is the additive (Laplace/Lidstone) smoothing parameter; smaller values mean
# weaker smoothing and usually a closer fit to the training data.
clf = MultinomialNB(alpha=0.001).fit(train_set.tdm, train_set.label)

# Predict the class of every test document
predicted = clf.predict(test_set.tdm)
total = len(predicted)
error_count = 0
for flabel, file_name, expct_cate in zip(test_set.label, test_set.filenames, predicted):
    print(file_name, "actual class:", flabel, "--> predicted class:", expct_cate)
    if flabel != expct_cate:
        error_count += 1

# Error rate over the test set
print("error rate:", float(error_count) * 100 / float(total), "%")