1.环境介绍
(1)python
(2)jieba,分词
(3)sklearn,机器学习
(4)scipy,数学工具
安装jieba, pip install jieba --upgrade
安装sklearn, pip install scikit-learn --upgrade (PyPI上的包名是scikit-learn,导入时仍写import sklearn)
安装scipy, pip install scipy --upgrade
2.准备数据:收集宾馆和旅游两类文本(UTF-8编码),分别放在dataset下的train(训练集)和test(测试集)目录中,每个目录下再按类别分为hotel、travel两个子目录
3.编写代码
import os
import jieba
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.externals import joblib
import time
def preprocess(path):
    """Read a UTF-8 text file and return its jieba tokens joined by spaces.

    The file content is segmented with ``jieba.cut`` and every token is
    followed by a single space, so a non-empty file yields a string that
    ends with a trailing space and an empty file yields ``""``.

    Args:
        path: Path of the text file to read; it is decoded as UTF-8.

    Returns:
        The segmented text as a single string, e.g. ``"word1 word2 "``.
    """
    # Use a context manager so the file handle is always closed, even if
    # reading or decoding raises.
    with open(path, "r", encoding="utf-8") as text_file:
        raw_text = text_file.read()
    # Build the result in one pass instead of repeated string concatenation.
    return "".join(word + " " for word in jieba.cut(raw_text))
def loadtrainset(path, classtag):
    """Load and segment every file in a directory, labelling all with one class.

    Args:
        path: Directory whose files all belong to the same class.
        classtag: Label assigned to every document loaded from ``path``.

    Returns:
        A ``(texts, tags)`` tuple of two equally long lists: the segmented
        text of each file as returned by ``preprocess``, and ``classtag``
        repeated once per loaded file. Both lists are empty when the
        directory contains no regular files.
    """
    processed_textset = []
    allclasstags = []
    for thisfile in os.listdir(path):
        path_name = os.path.join(path, thisfile)
        # Only regular files can be read; skip sub-directories and other
        # non-file entries instead of crashing inside open().
        if not os.path.isfile(path_name):
            continue
        print(thisfile)  # progress output: name of the file being loaded
        processed_textset.append(preprocess(path_name))
        allclasstags.append(classtag)
    return processed_textset, allclasstags
# --- Training -------------------------------------------------------------
# Build the training set: one directory of documents per class.
processed_textdata1, class1 = loadtrainset("D:/learning/rickyChatobot/dataset/train/hotel", "宾馆")
processed_textdata2, class2 = loadtrainset("D:/learning/rickyChatobot/dataset/train/travel", "旅游")
train_data = processed_textdata1 + processed_textdata2
classtags_list = class1 + class2

# fit_transform() first learns the transformation rule (here: the vocabulary)
# from the training texts and then converts them with that rule.
# NOTE: CountVectorizer's documented default token pattern only keeps tokens
# of two or more characters, so single-character words are ignored.
count_vector = CountVectorizer()
vector_matrix = count_vector.fit_transform(train_data)

# TF-IDF step. Keep the fitted transformer so the test documents go through
# exactly the same transformation as the training data instead of a freshly
# fitted one (which would give wrong weights if use_idf were ever enabled).
tfidf_transformer = TfidfTransformer(use_idf=False)
train_tfidf = tfidf_transformer.fit_transform(vector_matrix)
clf = MultinomialNB().fit(train_tfidf, classtags_list)

# --- Evaluation -----------------------------------------------------------
testset = []  # not used by the code below; kept so the name stays defined
path = "D:/learning/rickyChatobot/dataset/test/hotel"
allfiles = os.listdir(path)
hotel = 0   # number of test documents predicted as "宾馆"
travel = 0  # number of test documents predicted as "旅游"
for thisfile in allfiles:
    path_name = path + "/" + thisfile
    # Reuse the vocabulary learned during training; transform() never refits.
    new_count_vector = count_vector.transform([preprocess(path_name)])
    new_tfidf = tfidf_transformer.transform(new_count_vector)
    predict_result = clf.predict(new_tfidf)
    print(predict_result)
    # predict() returns an array with one label per input document; exactly
    # one document was passed in, so compare its single label explicitly.
    predicted_tag = predict_result[0]
    if predicted_tag == "宾馆":
        hotel += 1
    elif predicted_tag == "旅游":
        travel += 1
print("宾馆" + str(hotel))
print("旅游" + str(travel))
4.运行结果