import jieba
import re
from gensim import corpora,models,similarities
import pandas as pd
# Load the user dictionary
# Register the extra vocabulary from userdict.txt with jieba at import time;
# the path is relative, so it is resolved against the current working directory.
jieba.load_userdict('userdict.txt')
# Build the stop-word list
def stopwordslist(filepath):
    """Read a stop-word file and return its lines as a list of strings.

    Args:
        filepath: Path to a UTF-8 text file containing one stop word per line.

    Returns:
        A list with one entry per line of the file, each stripped of leading
        and trailing whitespace. Blank lines are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even if decoding
    # fails part-way through the file.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]
# Segment a sentence into words
# Matches every character that is not an ASCII digit, an ASCII letter, or a
# CJK ideograph in the range U+4E00..U+9FA5.
_NON_WORD_RE = re.compile("[^0-9A-Za-z\u4e00-\u9fa5]")

# Stop-word sets keyed by file path, filled lazily on first use.
_STOPWORDS_CACHE = {}


def _load_stopwords(filepath):
    """Return the stop words in *filepath* as a frozenset, reading the file once."""
    if filepath not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[filepath] = frozenset(stopwordslist(filepath))
    return _STOPWORDS_CACHE[filepath]


def seg_sentence(sentence):
    """Segment a sentence with jieba and drop stop words.

    The input is converted with ``str()``, every character other than ASCII
    letters, ASCII digits and CJK ideographs is removed, and the remainder is
    cut into words by jieba. Words listed in ``stopwords.txt`` are discarded.

    Args:
        sentence: The text to segment; any object is accepted and stringified.

    Returns:
        The kept words, each followed by a single space (so a non-empty result
        ends with a space), or ``''`` when no word survives.

    Note:
        ``stopwords.txt`` is read on the first call and cached for the life of
        the process, so later edits to the file are not picked up.
    """
    # After this substitution no whitespace (including tabs) can remain, so no
    # extra strip() or tab filtering is required further down.
    cleaned = _NON_WORD_RE.sub("", str(sentence))
    stopwords = _load_stopwords('stopwords.txt')
    kept_words = [word for word in jieba.cut(cleaned) if word not in stopwords]
    # Keep the historical output format: every word is followed by one space.
    return "".join(word + " " for word in kept_words)
# Model training
def train_model():
    """Build and persist the dictionary, TF-IDF model and similarity index.

    Reads the ``question`` column of ``covid_qa.csv`` (``|``-separated),
    segments every question with ``seg_sentence`` and writes three artefacts
    to the working directory: ``covid_qa.dic``, ``covid_qa.tfidf`` and
    ``covid_qa.model``.
    """
    # Corpus: one tokenised document per question in the CSV.
    questions = pd.read_csv('covid_qa.csv', sep='|', usecols=['question'])
    tokenised_docs = [seg_sentence(text).split() for text in questions.question]

    # Vocabulary (token <-> id mapping) over the whole corpus.
    dictionary = corpora.Dictionary(tokenised_docs)
    dictionary.save('covid_qa.dic')

    # Bag-of-words vector for each document.
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in tokenised_docs]

    # TF-IDF weighting fitted on the bag-of-words corpus.
    tfidf = models.TfidfModel(bow_corpus)
    tfidf.save('covid_qa.tfidf')

    # Sparse similarity index over the TF-IDF-weighted corpus.
    vocabulary_size = len(dictionary.token2id)
    index_model = similarities.SparseMatrixSimilarity(
        tfidf[bow_corpus],
        num_features=vocabulary_size,
    )
    index_model.save('covid_qa.model')
# Match a question to its answer
def match(question):
    """Find the stored answer whose question is most similar to *question*.

    Loads the artefacts written by ``train_model`` (``covid_qa.dic``,
    ``covid_qa.tfidf``, ``covid_qa.model``), scores *question* against every
    indexed question and looks up the answer on the best-scoring row of
    ``covid_qa.csv``. The question and the chosen answer are also printed.

    Args:
        question: The user's question text.

    Returns:
        The ``answer`` value of the best-matching row. When several rows tie
        for the highest score (including the all-zero case where no word of
        the query is known), the first such row is used.
    """
    # Tokenise the query the same way the training corpus was tokenised.
    target_words = seg_sentence(question).split()

    # Convert the query to a bag-of-words vector using the saved vocabulary.
    dictionary = corpora.Dictionary.load('covid_qa.dic')
    target_corpus = dictionary.doc2bow(target_words)

    # Load the persisted TF-IDF model and similarity index. Both must be
    # loaded from disk: the objects built in train_model() are local to it.
    tfidf = models.TfidfModel.load('covid_qa.tfidf')
    index_model = similarities.SparseMatrixSimilarity.load('covid_qa.model')
    sim = index_model[tfidf[target_corpus]]

    # max() returns the first maximal item, which is the same row a stable
    # descending sort would put first, without sorting all the scores.
    result, _ = max(enumerate(sim), key=lambda item: item[1])

    df = pd.read_csv('covid_qa.csv', sep='|', usecols=['answer'])
    answer = df.iloc[result].answer
    print('问题:', question)
    print('答案:', answer)
    return answer
def QA():
    """Run the interactive question-answer loop on the console.

    Prints a banner, then repeatedly prompts for a question and passes it to
    ``match`` until the user enters the single letter ``a``.
    """
    print('*'*20,'\n新冠知识问答系统(V1.0)')
    print('*'*20)
    while True:
        question = input('请输入问题(按字母a结束):')
        if question == 'a':
            break
        # match() prints the question and the answer itself, so printing its
        # return value here would show the answer twice.
        match(question)
train_model()  # running this once is enough
QA()