文本相似度在问答系统中有很重要的应用,如基于知识的问答系统(Knowledge-based QA)、基于文档的问答系统(Document-based QA),以及基于FAQ的问答系统(Community-QA)等。对于问题的内容,需要进行相似度匹配,从而选择出与问题最接近、同时最合理的答案。本节介绍基于词袋(bow)表示、用余弦距离计算相似度的方法。
基于gensim的方式如下:
import logging
import os
import pickle
import re

import jieba
import numpy as np
from gensim import corpora, models, similarities
class StopWords(object):
    """Stop-word filter backed by a plain-text word list.

    Every line of the list file is treated as one stop word; surrounding
    whitespace on each line is stripped before the word is stored.
    """

    def __init__(self, stopwords_file=None):
        """Load the stop-word set.

        Args:
            stopwords_file: Path to a UTF-8 text file holding one stop word
                per line. When ``None`` (the default) no file is read and
                the stop-word set is empty, so nothing is filtered out.

        Raises:
            OSError: If ``stopwords_file`` is given but cannot be opened.
        """
        # The previous default referred to a module-level ``stopwords_file``
        # that is never defined, so the class statement itself raised
        # NameError. ``None`` keeps ``StopWords()`` callable without a file.
        if stopwords_file is None:
            self.stopwords = set()
        else:
            # ``with`` closes the handle; the explicit encoding avoids
            # depending on the platform default for Chinese text.
            with open(stopwords_file, 'r', encoding='utf-8') as handle:
                self.stopwords = {line.strip() for line in handle}

    def del_stopwords(self, words):
        """Return the items of ``words`` that are not stop words, in order."""
        return [word for word in words if word not in self.stopwords]


# Shared filter instance used by _seg_word.
stop_word = StopWords()
# gen 3-gram
def _list_3_ngram(words, n=3, m=2):
pattern1 = re.compile(r'[0-9]')
if len(words) < n:
n = len(words)
temp=[words[i - k:i] for k in range(m, n + 1) for i in range(k, len(words) + 1) ]
return [item for item in temp if len(''.join(item).strip())>0 and len(pattern1.findall(''.join(item).strip()))==0]
# Tokenise sentences (optionally with jieba and stop-word removal) into n-grams.
def _seg_word(words_list, jieba_flag=True, del_stopword=False):
    """Turn each sentence into a list of ``'_'``-joined 2- and 3-gram tokens.

    Args:
        words_list: Iterable of sentence strings.
        jieba_flag: When true, segment each sentence with ``jieba.cut``;
            otherwise every character of the sentence is one token.
        del_stopword: When true, tokens found in the module-level
            ``stop_word`` filter are removed before n-grams are built.

    Returns:
        One list per input sentence, holding its n-gram tokens. A sentence
        that yields fewer than two tokens produces an empty list.
    """
    ngram_docs = []
    for sentence in words_list:
        tokens = list(jieba.cut(sentence)) if jieba_flag else list(sentence)
        if del_stopword:
            # Filter the token list itself. The previous code replaced every
            # token with a character-filtered copy of the whole sentence.
            tokens = stop_word.del_stopwords(tokens)
        # Build n-grams from the tokens. The previous code iterated the raw
        # input here, which discarded the segmentation done above.
        ngram_docs.append(
            ['_'.join(gram) for gram in _list_3_ngram(tokens, n=3, m=2)]
        )
    return ngram_docs
# Toy corpus: each sentence becomes one document in the similarity index.
word_list = ['我爱北京天安门', '你好,在干嘛呢', '这个什么价钱']
# Replace every sentence with its list of '_'-joined n-gram tokens.
word_list = _seg_word(word_list)
# Build the gensim dictionary (token -> integer id) from the tokenised corpus.
dic = corpora.Dictionary(word_list, prune_at=2000000)
# Save the dictionary to disk.
dic_path = './bow.model'
dic.save( dic_path )
# Load the dictionary back from disk.
dic = corpora.Dictionary.load(dic_path)
# Build the TF-IDF model.
tfidf_model_path = './tfidf_model.model'
# Bag-of-words vector for every corpus document.
corpus_model= [dic.doc2bow(word) for word in word_list]
# Fit the TF-IDF model on the bag-of-words corpus and save it.
tfidf_model = models.TfidfModel(corpus_model)
tfidf_model.save(tfidf_model_path)
# TF-IDF weighted view of the corpus.
corpus_tfidf = tfidf_model[corpus_model]
# Build the retrieval index over the TF-IDF corpus and save it.
tfidf_index_path = './tfidf_index.model'
tfidf_index = similarities.MatrixSimilarity(corpus_tfidf)
tfidf_index.save(tfidf_index_path)
# Vectorise a query sentence and query the index directly; the scores are
# computed against the documents built from word_list.
words = '你好,在干嘛呢'
word_bow = dic.doc2bow(_seg_word([words])[0])
word_tfidf = tfidf_model[word_bow]
# NOTE(review): bare expression; the similarity scores are only shown in an
# interactive session. Assign or print the result when running as a script.
tfidf_index[word_tfidf]