Preface: all the packages used
import json
from matplotlib import pyplot as plt
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from queue import PriorityQueue as PQueue
from functools import reduce
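Note: word_tokenize and the stopword list depend on NLTK data packages that are not bundled with the library itself. If you have never downloaded them, a one-time setup along these lines is needed (a minimal sketch using the standard NLTK resource names):

import nltk
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stopword lists used by stopwords.words('english')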
1. Reading the corpus
Read the given corpus and store the question list and the answer list in qlist and alist respectively.
def read_corpus():
    """
    Read the given corpus and store the question list and the answer list
    in qlist and alist respectively. No string preprocessing is needed at
    this stage.
        qlist = ["question 1", "question 2", "question 3", ...]
        alist = ["answer 1", "answer 2", "answer 3", ...]
    Every question must stay aligned with its answer (same list index).
    """
    qlist = []
    alist = []
    with open("data/train-v2.0.json", 'r', encoding='utf-8') as f:
        file_json = json.load(f)
    for data_dict in file_json['data']:
        for content_dict in data_dict["paragraphs"]:
            for q_a_dict in content_dict["qas"]:
                # SQuAD v2.0 contains unanswerable questions;
                # keep only those with at least one answer.
                if len(q_a_dict["answers"]) > 0:
                    qlist.append(q_a_dict["question"])
                    alist.append(q_a_dict["answers"][0]["text"])
    print("qlist len: " + str(len(qlist)))
    print("alist len: " + str(len(alist)))
    assert len(qlist) == len(alist)  # make sure the lengths match
    return qlist, alist
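A quick sanity check after loading (a minimal usage sketch; the exact strings printed depend on your copy of train-v2.0.json):

qlist, alist = read_corpus()
print(qlist[0])  # the first question in the corpus
print(alist[0])  # the answer aligned with it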
2. Understanding the data (visualization / statistics)
Understanding the data is the first step of any AI project; you need a thorough, intuitive grasp of the data at hand.
def data_analysis(data):
    # TODO: count how many words appear in qlist in total,
    #       and how many distinct words there are.
    # TODO: count the frequency of each word in qlist, sort the
    #       frequencies, and plot them.
    qlist_word = []
    word_dic = {}
    for sentence in data:
        # Drop the trailing question mark, then split on spaces.
        cur_words = sentence[:len(sentence) - 1].strip().split(" ")
        qlist_word += cur_words
        for word in cur_words:
            if word in word_dic:
                word_dic[word] += 1
            else:
                word_dic[word] = 1
    # Number of distinct words that appear in qlist.
    word_total = len(word_dic)
    print("total words: " + str(len(qlist_word)))
    print("distinct words: " + str(word_total))
    # Sort the word frequencies in descending order and plot the curve.
    word_freq = sorted(word_dic.values(), reverse=True)
    plt.plot(word_freq)
    plt.show()
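Putting the two steps together (a minimal usage sketch, assuming read_corpus and data_analysis are defined as above):

qlist, alist = read_corpus()
data_analysis(qlist)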