The corpus file has one article per line. Each line is first segmented and stripped of stopwords, and then jieba.analyse.extract_tags(text, topK=10) extracts the 10 most important keywords from each text. The corpus_tags file holds the label for the corresponding line of the corpus.
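For reference, the two input files are expected to look something like this (the sample articles and labels below are made up for illustration):

corpus (data16.txt), one raw article per line:
    新学期开学第一课,老师和家长应该怎样帮助孩子收心……
    这款新上市的SUV试驾体验到底怎么样……
corpus_tags (data14.txt), the matching label on each line:
    教育
    汽车控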
The jieba segmentation and stopword-removal code:
import jieba

# build the stopword list
def stopwordslist():
    # specify the encoding explicitly so the Chinese stopword file loads correctly
    stopwords = [line.strip() for line in open('C:/Users/yin/Desktop/chinesestopwords.txt', 'r', encoding='utf-8').readlines()]
    return stopwords

# segment a sentence and remove its stopwords
def seg_depart(sentence):
    # segment this line of the document
    sentence_depart = jieba.cut(sentence.strip())
    # build the stopword list (loading it once, outside the per-line loop, would be faster)
    stopwords = stopwordslist()
    # the result is accumulated in outstr
    outstr = ''
    # drop stopwords
    for word in sentence_depart:
        if word not in stopwords:
            if word != '\t':
                outstr += word
                outstr += " "
    # convert the string to bytes if needed
    # outstr = outstr.encode()
    return outstr

# document paths
filename = "C:/Users/yin/Desktop/data16.txt"    # input file
outfilename = "C:/Users/yin/Desktop/data17.txt" # output file
inputs = open(filename, 'r', encoding='utf-8')  # mind the file encoding
outputs = open(outfilename, 'w+', encoding='utf-8')
# write the results to the output file
for line in inputs:
    line_seg = seg_depart(line)
    # one line of segmented, stopword-free text per input line
    outputs.write(line_seg + '\n')
outputs.close()
inputs.close()
print("Segmentation and stopword removal finished!")
Once segmentation and stopword removal are done, extract the keyword features:
import jieba.analyse

with open('C:/Users/yin/Desktop/data17.txt', 'r', encoding='utf-8') as fr, open('C:/Users/yin/Desktop/data16.txt', 'w', encoding='utf-8') as fd:
    for text in fr.readlines():
        if text.split():  # skip blank lines between texts; unnecessary if there are none
            keywords = jieba.analyse.extract_tags(text, topK=10)
            for item in keywords:
                # extract_tags returns plain strings here, so write the whole keyword;
                # item[0]+item[1] would only write its first two characters
                fd.write(item)
                fd.write(' ')
            fd.write('\n')
print('Keywords written....')
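The commented-out fd.write(item[1]) hints that the TF-IDF weights were also of interest at some point; passing withWeight=True makes extract_tags return (word, weight) pairs instead of plain strings, which is when indexing into item makes sense. A minimal sketch (the sample line is made up):

import jieba.analyse

sample = "老师 学生 考试 成绩 高考 复习 课堂 作业"  # a made-up line of segmented text
for word, weight in jieba.analyse.extract_tags(sample, topK=5, withWeight=True):
    print(word, round(weight, 4))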
The result is roughly as shown in the figure below (the feature words of each text, plus its label).
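Concretely, each line of data16.txt now holds the top-10 keywords of one text separated by spaces, and the matching line of data14.txt holds its label; for example (made-up values):

data16.txt: 考试 老师 学生 成绩 高考 复习 课堂 作业 家长 试卷
data14.txt: 教育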
These two files are then fed into the code below for vectorization and classification (the data are randomly split 7:3 into training and test sets):
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split  # split the data into train + test

corpus = open("C:/Users/yin/Desktop/data16.txt", "r", encoding="utf-8-sig")
corpus_tags = open("C:/Users/yin/Desktop/data14.txt", "r", encoding="utf-8-sig")

# stop_words='english' only filters English tokens, so it is harmless on the Chinese keywords
cv = TfidfVectorizer(binary=False, decode_error='ignore', stop_words='english')
vec = cv.fit_transform(corpus.readlines())
arr = vec.toarray()  # dense TF-IDF feature matrix (GaussianNB cannot take a sparse matrix)
# print(arr)

# map each Chinese category name to an integer label
dicts = {"教育":0,"生活家":1,"汽车控":2,"私房话":3,"养生堂":4,"育儿":5,"财经迷":6,"职场":7,
         "旅游":8,"搞笑":9,"八卦精":10,"星座":11,"体育":12,"美食":13,"时尚圈":14,"游戏":15,
         "萌宠":16,"科技咖":17,"军事":18,"历史":19}
a = np.array(list(map(lambda x: dicts[x.strip()], corpus_tags)))  # label vector a

# split the feature matrix and the label vector 7:3 into training and test sets
X_train, X_test, y_train, y_test = train_test_split(arr, a, test_size=0.3, random_state=0)

def test_gaussian_nb():
    X = X_train
    Y = y_train
    gnb = GaussianNB()
    gnb.fit(X, Y)
    result = gnb.predict(X_test)
    print(classification_report(y_test, result))

if __name__ == '__main__':
    test_gaussian_nb()
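A note on the model choice: GaussianNB needs the dense arr, which can eat a lot of memory once the vocabulary grows, and MultinomialNB is the more common Naive Bayes variant for TF-IDF features; it accepts the sparse matrix directly, so toarray() can be skipped. A minimal sketch reusing vec and a from the script above:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# split the sparse TF-IDF matrix directly, no vec.toarray() needed
X_train, X_test, y_train, y_test = train_test_split(vec, a, test_size=0.3, random_state=0)
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
print(classification_report(y_test, mnb.predict(X_test)))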
This yields a simple evaluation of the classification:
That is the whole simple process; feedback and discussion are welcome, thanks!