import json
from matplotlib import pyplot as plt
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from queue import PriorityQueue as PQueue
from functools import reduce

# 1. Read the corpus file
def read_corpus(path="data/train-v2.0.json"):
    """Read the SQuAD-style corpus and return (qlist, alist).

    qlist = ["question 1", "question 2", ...]
    alist = ["answer 1", "answer 2", ...]
    The two lists are index-aligned: alist[i] is the (first) answer to
    qlist[i].  Questions without any answer are skipped.  No text
    normalization is applied here.

    Args:
        path: JSON corpus file (SQuAD v2.0 layout); defaults to the
            original hard-coded location so existing callers are unaffected.

    Returns:
        (qlist, alist) tuple of equal-length lists of strings.
    """
    qlist = []
    alist = []
    with open(path, 'r', encoding='utf-8') as fh:
        file_json = json.load(fh)
    # SQuAD layout: data -> [article] -> paragraphs -> [paragraph] -> qas.
    # The original scanned every dict key looking for "paragraphs"/"qas";
    # direct access with a default is equivalent and clearer.
    for article in file_json['data']:
        for paragraph in article.get('paragraphs', []):
            for qa in paragraph.get('qas', []):
                # Keep only answerable questions; take the first answer text.
                if len(qa['answers']) > 0:
                    qlist.append(qa['question'])
                    alist.append(qa['answers'][0]['text'])
    print("qlist len:" + str(len(qlist)))
    print("alist len:" + str(len(alist)))
    assert len(qlist) == len(alist)  # alignment invariant
    return qlist, alist
# 2. Visualize the data (statistics, understanding the data)
# Count how many words appear in qlist in total, and their frequencies.
def data_analysis(data):
    """Count word frequencies over *data* and plot the top-100 counts.

    Prints the number of distinct words, then plots the frequency curve of
    the 100 most frequent words with matplotlib.

    Args:
        data: iterable of sentence strings (e.g. qlist).

    Returns:
        List of (word, count) pairs sorted by descending count.  The
        original returned None, so returning the statistics is a
        backward-compatible addition.
    """
    word_dic = {}
    for sentence in data:
        # The original drops the final character of each sentence
        # (presumably the trailing '?' of a question) — behavior preserved.
        for word in sentence[:len(sentence) - 1].strip().split(" "):
            # dict.get replaces the original `if word in word_dic.keys()`
            # double lookup; the separate qlist_word list the original kept
            # only to compute len(set(...)) is redundant — the distinct
            # words are exactly the dict keys.
            word_dic[word] = word_dic.get(word, 0) + 1
    word_total = len(word_dic)
    print("word_total:", word_total)
    sorted_counts = sorted(word_dic.items(), key=lambda kv: kv[1], reverse=True)
    # Visualize the 100 most frequent words.
    x = range(100)
    y = [c[1] for c in sorted_counts[:100]]
    plt.figure()
    plt.plot(x, y)
    plt.show()
    return sorted_counts
# Load the Q/A corpus and plot word-frequency statistics of the questions.
qlist,alist = read_corpus()
data_analysis(qlist)
# Sample output of the cell above:
#   qlist len:86821
#   alist len:86821
#   word_total: 53306
# 3. Data preprocessing
# Text preprocessing applied to qlist / alist:
#   1. stop-word filtering
#   2. conversion to lower case
#   3. removal of useless symbols, e.g. runs of "!!!"
#   4. dropping very low-frequency words
#   5. digits are all treated as one token, "#number"
#   6. stemming (Porter stemmer)
def data_pre(temp_list):
    """Preprocess every sentence in *temp_list*.

    Pipeline per sentence: strip punctuation, lower-case, drop stop words,
    map pure-digit tokens to "#number", Porter-stem.  Word frequencies are
    accumulated across the whole input.

    Args:
        temp_list: iterable of raw sentence strings.

    Returns:
        (word_dict, word_list_list) where word_dict maps each kept token to
        its total count and word_list_list holds one token list per input
        sentence (index-aligned with temp_list).
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    punct_re = re.compile('[{}]'.format(re.escape(string.punctuation)))  # punctuation matcher
    word_list_list = []
    word_dict = {}
    for raw_line in temp_list:
        cleaned = punct_re.sub("", raw_line).lower()  # 1. drop symbols, 2. lower-case
        kept = []
        for token in cleaned.split():
            if token in stop_words:  # 3. stop-word filtering
                continue
            # 4. digit normalization, then 5. stemming
            token = stemmer.stem("#number" if token.isdigit() else token)
            word_dict[token] = word_dict.get(token, 0) + 1
            kept.append(token)
        word_list_list.append(kept)
    return word_dict, word_list_list
# 6. Drop words with very low (or very high) frequency.
def filter_words(in_list=None, in_dict=None, lower=0, upper=0):
    """Keep only words whose corpus frequency lies in [lower, upper].

    Args:
        in_list: list of token lists (one per sentence).  Defaults to []
            (the original used a mutable default argument, which is shared
            across calls — replaced by the None sentinel).
        in_dict: word -> frequency mapping.  Defaults to {} (same fix).
        lower: minimum frequency (inclusive).
        upper: maximum frequency (inclusive).

    Returns:
        List of strings: each sentence re-joined with spaces, containing
        only the words whose frequency is within the bounds.
    """
    in_list = [] if in_list is None else in_list
    in_dict = {} if in_dict is None else in_dict
    # Build the keep-set once; the original tested membership against a
    # list, which is O(vocabulary) per token instead of O(1).
    keep = {word for word, count in in_dict.items() if lower <= count <= upper}
    return [' '.join(w for w in line if w in keep) for line in in_list]
# 4. Represent the text with tf-idf
# Represent the questions with tf-idf features.
# NOTE(review): TfidfVectorizer is already imported at the top of the file —
# this re-import is a leftover notebook cell.  X is fitted on the *raw*
# qlist, while the retrieval functions below refit their own vectorizer on
# the preprocessed new_qlist, so this X is not reused by them.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(qlist) # the result is stored in the sparse matrix X
# 5. Find the 5 stored questions that best match the user's input and return their answers
def top5results(input_q):
    """Return the answers of the TOP-5 stored questions most similar to *input_q*.

    Steps:
    1. preprocess input_q the same way the corpus was preprocessed (strip
       punctuation, lower-case, drop stop words, map digits to "#number",
       Porter-stem) and turn it into a tf-idf vector;
    2. compute cosine similarity against every preprocessed question;
    3. return the answers of the 5 most similar questions.

    Relies on the module-level globals new_qlist (preprocessed questions)
    and alist (answers, index-aligned with the questions).
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))  # punctuation matcher
    sentence = pattern.sub("", input_q).lower()  # drop symbols, lower-case
    tokens = []
    for word in sentence.split():
        if word not in stop_words:  # stop-word filtering
            word = "#number" if word.isdigit() else word  # digit normalization
            tokens.append(stemmer.stem(word))  # stemming
    new_input = ' '.join(tokens)
    vectorizer = TfidfVectorizer(smooth_idf=False)
    # fit_transform learns the vocabulary/idf from the corpus;
    # transform (below) only projects the new query into that space.
    X = vectorizer.fit_transform(new_qlist)
    input_vec = vectorizer.transform([new_input])
    res = cosine_similarity(input_vec, X)[0]
    # Indices of the 5 highest similarities: argsort ascending, reversed,
    # first five.  Replaces the original PriorityQueue, which pushed all n
    # scores only to pop 5 — same O(n log n) with far more overhead.
    top_idxs = [int(i) for i in res.argsort()[::-1][:5]]
    print(top_idxs)  # indices (into qlist) of the most similar questions
    return [alist[i] for i in top_idxs]  # answers of the TOP-5 matches
# Build the retrieval corpus once: read the Q/A pairs, preprocess the
# questions, keep only words with frequency in [2, 1000], then run two
# sample queries through the tf-idf retriever.
qlist, alist = read_corpus()
q_dict, q_list_list = data_pre(qlist)
new_qlist = filter_words(q_list_list, q_dict, 2, 1000)
print(top5results("when did Beyonce start becoming popular?"))
print(top5results("what languge does the word of 'symbiosis' come from"))
# Sample output of the cell above:
#   qlist len:86821
#   alist len:86821
#   [0, 60835, 39267, 23136, 693]
#   ['in the late 1990s', 'mandolin-based guitar programs', 'Particularly since the 1950s, pro wrestling events have frequently been responsible for sellout crowds at large arenas', 'early DJs creating music in their own homes', 'Agnèz Deréon']
#   [7786, 41967, 8154, 27470, 7844]
#   ['Greek', 'living together', 'Persian and Sanskrit', '1570s', 'the evolution of all eukaryotes']
# 6. Matching optimization with an inverted index
# Core idea: hierarchical filtering.
# The matching above compares the user's question against every question in
# the library, so its cost grows with the library size.  The optimization:
# use an inverted index to pre-select candidate questions, then compute
# cosine similarity only on those candidates.
# Build a simple inverted index over the raw questions:
# word -> sorted list of the indices of the questions that contain it.
from functools import reduce
inverted_idx = {}
for idx, question in enumerate(qlist):
    for token in question.split():
        inverted_idx.setdefault(token, []).append(idx)
for token in inverted_idx:
    inverted_idx[token] = sorted(inverted_idx[token])
# Intersect two sets (binary helper passed to functools.reduce).
def intersections(set1, set2):
    """Return the elements common to *set1* and *set2*."""
    return set1 & set2
def top5results_invidx(input_q):
    """Return the answers of the TOP-5 questions most similar to *input_q*,
    using the inverted index to pre-filter candidates.

    Steps:
    1. preprocess input_q (punctuation, case, stop words, digits, stemming);
    2. intersect the inverted-index posting lists of all query terms to get
       candidate question indices;
    3. compute tf-idf cosine similarity only against the candidates;
    4. return the answers of the 5 best candidates.

    Relies on the module-level globals inverted_idx, new_qlist and alist.
    Returns [] when no query term occurs in the index.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))  # punctuation matcher
    sentence = pattern.sub("", input_q).lower()
    result_list = []
    for word in sentence.split():
        if word not in stop_words:
            word = "#number" if word.isdigit() else word
            result_list.append(stemmer.stem(word))
    # Posting lists for every query term that occurs in the index.
    candidate_sets = [set(inverted_idx[w]) for w in result_list if w in inverted_idx]
    if not candidate_sets:
        # fix: reduce() over an empty sequence raises TypeError; with no
        # indexed query term there are simply no candidates to rank.
        return []
    candidate_idx = list(reduce(intersections, candidate_sets))
    input_seg = ' '.join(result_list)
    vectorizer = TfidfVectorizer(smooth_idf=False)
    # fit_transform learns the vocabulary/idf; transform only projects.
    X = vectorizer.fit_transform(new_qlist)
    input_vec = vectorizer.transform([input_seg])
    # Similarity computed only for the candidates, not the whole corpus.
    similarity_list = [(i, cosine_similarity(input_vec, X[i])[0][0])
                       for i in candidate_idx]
    res_sorted = sorted(similarity_list, key=lambda kv: kv[1], reverse=True)
    # Top-5 answers: a slice replaces the original manual counter loop
    # (which also kept iterating the full list after collecting 5), and the
    # leftover debug print(type(res_sorted)) is removed.
    return [alist[idx] for idx, _score in res_sorted[:5]]
# 7. Represent the text with word vectors
def load_glove(path, dim=100):
    """Load GloVe word vectors from a text file.

    Index 0 is reserved for an all-zero "UNK" vector representing words
    that are not in the vocabulary.

    Args:
        path: GloVe text file; each line is "word v1 v2 ... vN".
        dim: dimensionality of the zero UNK vector (default 100, matching
            glove.6B.100d; pass the file's dimension for other variants).

    Returns:
        (vocab, embedding): vocab maps word -> row index, embedding is a
        list of float vectors, row-aligned with vocab.
    """
    vocab = {"UNK": 0}
    embedding = [[0.0] * dim]
    with open(path, 'r', encoding='utf8') as f:
        for idx, line in enumerate(f, start=1):
            row = line.strip().split()
            vocab[row[0]] = idx
            # fix: the original appended row[1:] unconverted, so the
            # "vectors" were lists of *strings* — useless for vector math.
            embedding.append([float(x) for x in row[1:]])
    return vocab, embedding
def top5results_emb(input_q=''):
    """Return the answers of the TOP-5 questions most similar to *input_q*,
    representing sentences as averaged GloVe word vectors.

    Steps:
    1. load GloVe vectors and rebuild the preprocessed corpus (done on
       every call, as in the original — expensive but behavior-preserving);
    2. preprocess input_q and use a local inverted index to collect
       candidate question indices;
    3. score candidates by cosine similarity of averaged word vectors;
    4. return the answers of the 5 best candidates.

    Returns [] when no query term occurs in the inverted index.
    """
    def sentence_to_vec(text, vocab, embedding):
        # Average of the word vectors of *text*; out-of-vocabulary words
        # map to index 0, the all-zero "UNK" row.  This fixes the original
        # call to the undefined name word_to_vec (a NameError at runtime).
        words = text.split()
        dim = len(embedding[0])
        if not words:
            return [0.0] * dim
        acc = [0.0] * dim
        for w in words:
            for j, v in enumerate(embedding[vocab.get(w, 0)]):
                acc[j] += float(v)
        return [a / len(words) for a in acc]

    path = "./data/glove.6B.100d.txt"
    # vocab: word -> row index; embedding: len(vocab) x 100 float vectors.
    vocab, embedding = load_glove(path)
    stop_words = set(stopwords.words('english'))
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    sentence = pattern.sub("", input_q).lower()
    result_list = []
    for word in sentence.split():
        if word not in stop_words:
            result_list.append("#number" if word.isdigit() else word)
    # NOTE(review): the query terms are NOT stemmed here while new_qlist
    # below IS stemmed by data_pre, so some query terms may miss their
    # postings — preserved from the original pipeline; confirm intent.
    input_q = " ".join(result_list)
    qlist, alist = read_corpus()
    q_dict, q_list_list = data_pre(qlist)
    new_qlist = filter_words(q_list_list, q_dict, 2, 1000)
    # Local inverted index over the *preprocessed* questions
    # (shadows the module-level one built on the raw qlist).
    inverted_idx = {}
    for i, question in enumerate(new_qlist):
        for word in question.split():
            inverted_idx.setdefault(word, []).append(i)
    for key in inverted_idx:
        inverted_idx[key] = sorted(inverted_idx[key])
    candidates = [set(inverted_idx[w]) for w in result_list if w in inverted_idx]
    if not candidates:
        # fix: reduce() over an empty sequence raises TypeError.
        return []
    candidate_idx = list(reduce(intersections, candidates))
    input_q_vec = sentence_to_vec(input_q, vocab, embedding)
    scores = []
    for i in candidate_idx:
        vec = sentence_to_vec(new_qlist[i], vocab, embedding)
        # Query-vs-candidate similarity; replaces the original
        # cosine_similarity([a, b])[0][1] self-similarity-matrix form.
        score = cosine_similarity([input_q_vec], [vec])[0][0]
        scores.append((i, score))
    scores_sorted = sorted(scores, key=lambda kv: kv[1], reverse=True)
    # Top-5 answers via a slice instead of the manual counter loop.
    return [alist[idx] for idx, _score in scores_sorted[:5]]
# Sample queries answered with the embedding-based retriever.
print(top5results_emb("when did Beyonce start becoming popular?"))
print(top5results_emb("what languge does the word of 'symbiosis' come from"))
print(top5results_emb("In her music, what are some?"))