自然语言处理:分词(断词)与关键词提取方法(一)
自然语言处理:scrapy爬取关键词信息(二)
1 获取语料
小黄鸡语料:
链接:https://pan.baidu.com/s/1eTuNwfFuCzSsggV4btMweA
提取码:suj1
未分词语料:
E
M 呵呵
M 是王若猫的。
E
M 不是
M 那是什么?
E
M 怎么了
M 我很难过,安慰我~
E
M 开心点哈,一切都会好起来
M 嗯 会的
E
M 我还喜欢她,怎么办
M 我帮你告诉她?发短信还是打电话?
E
M 短信
M 嗯嗯。我也相信
E
分词语料:
E
M 呵/呵
M 是/王/若/猫/的/。
E
M 不/是
M 那/是/什/么/?
E
M 怎/么/了
M 我/很/难/过/,/安/慰/我/~
E
M 开/心/点/哈/,/一/切/都/会/好/起/来
M 嗯/ /会/的
E
M 我/还/喜/欢/她/,/怎/么/办
M 我/帮/你/告/诉/她/?/发/短/信/还/是/打/电/话/?
E
M 短/信
M 嗯/嗯/。/我/也/相/信
E
2 预处理语料
2.1 拆分问答数据集
本文使用未分词的语料进行处理(即上文的"未分词语料")。
- Demo
import os

# Path to the raw (unsegmented) xiaohuangji corpus.
conv_path = "./data/xiaohuangji.conv"
if not os.path.exists(conv_path):
    print("数据集不存在,请检查数据集路径")
    exit()

# convs collects one list of utterances per conversation.
convs = []
with open(conv_path, encoding="utf8") as f:
    # Utterances of the conversation currently being read.
    one_conv = []
    for line in f:
        # For the pre-segmented corpus, additionally strip the '/'
        # separators:  line = line.strip('\n').replace('/', '')
        line = line.strip('\n')
        print("Line vocabulary: {}".format(line))
        if line == '':
            continue
        if line[0] == 'E':
            # 'E' marks the end of one conversation: flush the buffer.
            if one_conv:
                convs.append(one_conv)
            one_conv = []
        elif line[0] == 'M':
            # 'M' lines look like "M <utterance>"; keep the utterance part.
            parts = line.split(' ')
            if len(parts) > 1:  # guard against bare "M" lines (IndexError)
                one_conv.append(parts[1])
            print("One conversation: {}".format(one_conv))
# Bug fix: keep the last conversation even when the corpus does not end
# with an 'E' separator line.
if one_conv:
    convs.append(one_conv)

print("Full conversations: {}".format(convs))
print("Length of full conversations: {}".format(len(convs)))

# Split each conversation into alternating question/answer pairs:
# even-indexed utterances are questions, odd-indexed ones are answers.
questions = []
answers = []
for conv in convs:
    if len(conv) == 1:
        # A single utterance has no reply; drop it.
        continue
    if len(conv) % 2 != 0:
        # Odd number of utterances: discard the trailing unanswered one.
        conv = conv[:-1]
    for i, utterance in enumerate(conv):
        if i % 2 == 0:
            questions.append(utterance)
        else:
            answers.append(utterance)

print("Questions: {}".format(questions))
print("Answers: {}".format(answers))
print("Length of questions: {}".format(len(questions)))
print("Length of answers: {}".format(len(answers)))
- 部分结果
Line vocabulary: M 短信
One conversation: ['短信']
Line vocabulary: M 嗯嗯。我也相信
One conversation: ['短信', '嗯嗯。我也相信']
Line vocabulary: E
Full conversations: [['呵呵', '是王若猫的。'], ['不是', '那是什么?'], ['怎么了', '我很难过,安慰我~'], ['开心点哈,一切都会好起来', '嗯'], ['我还喜欢她,怎么办', '我帮你告诉她?发短信还是打电话?'], ['短信', '嗯嗯。我也相信']]
Length of full conversations: 6
Questions: ['呵呵', '不是', '怎么了', '开心点哈,一切都会好起来', '我还喜欢她,怎么办', '短信']
Answers: ['是王若猫的。', '那是什么?', '我很难过,安慰我~', '嗯', '我帮你告诉她?发短信还是打电话?', '嗯嗯。我也相信']
Lenght of questions: 6
Lenght of answers: 6
2.2 整理训练数据集和测试数据集
- Demo
import os
import random


def question_answer_dataset(questions, answers, TESTSET_SIZE=2):
    """Split parallel question/answer lists into train/test files.

    TESTSET_SIZE pairs are sampled uniformly at random for the test
    split; the remaining pairs go to the training split.  Four files
    are produced under ./data: question_train.enc, answer_train.dec,
    question_test.enc and answer_test.dec (one utterance per line).
    """
    # Make sure the output directory exists before opening files in it.
    os.makedirs('./data', exist_ok=True)
    # Indices reserved for the test set; a set makes the per-item
    # membership check O(1).
    test_index = set(random.sample(range(len(questions)), TESTSET_SIZE))
    # Context managers guarantee the files are closed even on error
    # (the original leaked all four handles on any exception).
    with open('./data/question_train.enc', 'w') as question_train_enc, \
         open('./data/answer_train.dec', 'w') as answer_train_dec, \
         open('./data/question_test.enc', 'w') as question_test_enc, \
         open('./data/answer_test.dec', 'w') as answer_test_dec:
        for i in range(len(questions)):
            if i in test_index:
                question_test_enc.write(questions[i] + '\n')
                answer_test_dec.write(answers[i] + '\n')
            else:
                question_train_enc.write(questions[i] + '\n')
                answer_train_dec.write(answers[i] + '\n')
            if i % 1000 == 0:
                print(len(questions), '处理进度:', i)


if __name__ == "__main__":
    # `questions` / `answers` come from the preceding corpus-splitting
    # step (section 2.1); guard the call so importing this module is safe.
    question_answer_dataset(questions, answers)
- 目录结构
|-- data
| |-- answer_test.dec
| |-- answer_train.dec
| |-- question_test.enc
| |-- question_train.enc
| `-- xiaohuangji.conv
2.3 完整代码
import os
import random


def load_conversations(conv_path):
    """Parse the xiaohuangji corpus into a list of conversations.

    The corpus uses 'E' lines as conversation separators and
    'M <utterance>' lines for the utterances.  Returns a list of
    conversations, each a list of utterance strings.
    """
    convs = []
    one_conv = []
    with open(conv_path, encoding="utf8") as f:
        for line in f:
            # For the pre-segmented corpus, additionally strip the '/'
            # separators:  line = line.strip('\n').replace('/', '')
            line = line.strip('\n')
            if line == '':
                continue
            if line[0] == 'E':
                # End-of-conversation marker: flush the current buffer.
                if one_conv:
                    convs.append(one_conv)
                one_conv = []
            elif line[0] == 'M':
                parts = line.split(' ')
                if len(parts) > 1:  # skip malformed bare "M" lines
                    one_conv.append(parts[1])
    # Bug fix: keep the last conversation even when the file does not
    # end with an 'E' separator.
    if one_conv:
        convs.append(one_conv)
    return convs


def split_questions_answers(convs):
    """Split conversations into parallel question/answer lists.

    Even-indexed utterances are treated as questions, odd-indexed ones
    as answers.  Single-utterance conversations are dropped, and a
    trailing unanswered utterance is discarded.
    """
    questions = []
    answers = []
    for conv in convs:
        if len(conv) == 1:
            continue
        if len(conv) % 2 != 0:
            conv = conv[:-1]
        for i, utterance in enumerate(conv):
            (questions if i % 2 == 0 else answers).append(utterance)
    return questions, answers


def question_answer_dataset(questions, answers, TESTSET_SIZE=2):
    """Write the question/answer lists to train/test files under ./data.

    TESTSET_SIZE pairs are sampled at random for the test split.
    """
    os.makedirs('./data', exist_ok=True)
    test_index = set(random.sample(range(len(questions)), TESTSET_SIZE))
    # Context managers close all four files even on error.
    with open('./data/question_train.enc', 'w') as question_train_enc, \
         open('./data/answer_train.dec', 'w') as answer_train_dec, \
         open('./data/question_test.enc', 'w') as question_test_enc, \
         open('./data/answer_test.dec', 'w') as answer_test_dec:
        for i in range(len(questions)):
            if i in test_index:
                question_test_enc.write(questions[i] + '\n')
                answer_test_dec.write(answers[i] + '\n')
            else:
                question_train_enc.write(questions[i] + '\n')
                answer_train_dec.write(answers[i] + '\n')


if __name__ == "__main__":
    conv_path = "./data/xiaohuangji.conv"
    if not os.path.exists(conv_path):
        print("数据集不存在,请检查数据集路径")
        exit()
    convs = load_conversations(conv_path)
    print("Length of full conversations: {}".format(len(convs)))
    questions, answers = split_questions_answers(convs)
    question_answer_dataset(questions, answers)
2.4 词转向量(word2vec)
2.4.1 生成词汇表
- Demo
import os

# Paths of the question/answer files produced by the previous step.
question_train_encode_file = './data/question_train.enc'
answer_train_decode_file = './data/answer_train.dec'
question_test_encode_file = './data/question_test.enc'
answer_test_decode_file = './data/answer_test.dec'

# Special tokens used to pad and delimit dialogues.
PAD = "__PAD__"  # padding
GO = "__GO__"    # decoder start-of-sequence
EOS = "__EOS__"  # end of dialogue
UNK = "__UNK__"  # characters missing from the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3


def gen_vocabulary_file(input_file, output_file):
    """Build a character vocabulary file from *input_file*.

    Counts every character of every (stripped) line, then writes the
    four special tokens followed by the characters, most frequent
    first, one per line to *output_file*.
    """
    vocabulary = {}
    with open(input_file) as f:
        for line in f:
            for word in line.strip():
                vocabulary[word] = vocabulary.get(word, 0) + 1
    # sorted() is stable, so equally frequent characters keep their
    # first-seen order.
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    print(input_file + " 词汇表大小:", len(vocabulary_list))
    # Bug fix: create the output directory before writing (the original
    # crashed when ./data/word2vec/ did not exist).
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    with open(output_file, "w") as ff:
        for word in vocabulary_list:
            ff.write(word + "\n")


if __name__ == "__main__":
    gen_vocabulary_file(question_train_encode_file, "./data/word2vec/train_question_encode_vocabulary")
    gen_vocabulary_file(answer_train_decode_file, "./data/word2vec/train_answer_decode_vocabulary")
- Result(问题部分词汇表)
__PAD__
__GO__
__EOS__
__UNK__
呵
开
心
点
哈
一
切
都
会
好
起
来
我
还
喜
欢
她
怎
么
办
短
信
2.4.2 生成向量
- Demo
# ID of the __UNK__ token, used for characters missing from the
# vocabulary (defined here so this snippet also runs standalone).
UNK_ID = 3


def convert_to_vector(input_file, vocabulary_file, output_file):
    """Convert each line of *input_file* into a character-ID vector.

    *vocabulary_file* holds one token per line; a token's 0-based line
    number is its ID.  Unknown characters map to UNK_ID.  One
    space-separated ID sequence is written per input line.
    """
    with open(vocabulary_file, "r") as f:
        tmp_vocab = [line.strip() for line in f]
    # Map token -> ID, e.g. {'__PAD__': 0, '__GO__': 1, ...}
    vocab = {token: idx for idx, token in enumerate(tmp_vocab)}
    # Both files are managed by `with`, so the output handle is closed
    # even on error (the original left output_f open on exceptions).
    with open(output_file, 'w') as output_f, open(input_file, 'r') as f:
        for line in f:
            line_vec = [vocab.get(ch, UNK_ID) for ch in line.strip()]
            output_f.write(" ".join(str(num) for num in line_vec) + "\n")


if __name__ == "__main__":
    # Bug fix: the original called convert_to_vector with
    # train_question_encode_vocabulary_file / train_answer_decode_vocabulary_file
    # without ever defining them (NameError); define all paths here.
    question_train_encode_file = './data/question_train.enc'
    answer_train_decode_file = './data/answer_train.dec'
    train_question_encode_vocabulary_file = './data/word2vec/train_question_encode_vocabulary'
    train_answer_decode_vocabulary_file = './data/word2vec/train_answer_decode_vocabulary'
    convert_to_vector(question_train_encode_file, train_question_encode_vocabulary_file, './data/word2vec/train_question_encode.vec')
    convert_to_vector(answer_train_decode_file, train_answer_decode_vocabulary_file, './data/word2vec/train_answer_decode.vec')
- Result(问题部分向量)
4 4
5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25
2.4.3 完整代码
import os

# Paths of the question/answer files produced by the previous step.
question_train_encode_file = './data/question_train.enc'
answer_train_decode_file = './data/answer_train.dec'
question_test_encode_file = './data/question_test.enc'
answer_test_decode_file = './data/answer_test.dec'

# Special tokens used to pad and delimit dialogues.
PAD = "__PAD__"  # padding
GO = "__GO__"    # decoder start-of-sequence
EOS = "__EOS__"  # end of dialogue
UNK = "__UNK__"  # characters missing from the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
# Maximum vocabulary size.  Bug fix: the original declared this but
# never applied it; it now truncates the frequency-sorted vocabulary.
vocabulary_size = 5000


def gen_vocabulary_file(input_file, output_file):
    """Build a character vocabulary file from *input_file*.

    Counts every character of every (stripped) line, then writes the
    four special tokens followed by the most frequent characters (at
    most `vocabulary_size` lines in total) to *output_file*.
    """
    vocabulary = {}
    with open(input_file) as f:
        for line in f:
            for word in line.strip():
                vocabulary[word] = vocabulary.get(word, 0) + 1
    # sorted() is stable, so equally frequent characters keep their
    # first-seen order.
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # Keep only the most frequent entries.
    vocabulary_list = vocabulary_list[:vocabulary_size]
    print(input_file + " 词汇表大小:", len(vocabulary_list))
    # Create the output directory before writing.
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    with open(output_file, "w") as ff:
        for word in vocabulary_list:
            ff.write(word + "\n")


def convert_to_vector(input_file, vocabulary_file, output_file):
    """Convert each line of *input_file* into a character-ID vector.

    *vocabulary_file* holds one token per line; a token's 0-based line
    number is its ID.  Unknown characters map to UNK_ID.  One
    space-separated ID sequence is written per input line.
    """
    with open(vocabulary_file, "r") as f:
        tmp_vocab = [line.strip() for line in f]
    # Map token -> ID, e.g. {'__PAD__': 0, '__GO__': 1, ...}
    vocab = {token: idx for idx, token in enumerate(tmp_vocab)}
    # `with` closes the output handle even on error.
    with open(output_file, 'w') as output_f, open(input_file, 'r') as f:
        for line in f:
            line_vec = [vocab.get(ch, UNK_ID) for ch in line.strip()]
            output_f.write(" ".join(str(num) for num in line_vec) + "\n")


if __name__ == "__main__":
    print('开始创建词汇表...')
    train_question_encode_vocabulary_file = './data/word2vec/train_question_encode_vocabulary'
    train_answer_decode_vocabulary_file = './data/word2vec/train_answer_decode_vocabulary'
    gen_vocabulary_file(question_train_encode_file, train_question_encode_vocabulary_file)
    gen_vocabulary_file(answer_train_decode_file, train_answer_decode_vocabulary_file)
    print("对话转向量...")
    convert_to_vector(question_train_encode_file, train_question_encode_vocabulary_file, './data/word2vec/train_question_encode.vec')
    convert_to_vector(answer_train_decode_file, train_answer_decode_vocabulary_file, './data/word2vec/train_answer_decode.vec')
3 总结
(1) 语料处理分四部分:获取源数据,生成问答语料集,生成词汇表,生成词向量;各数据源长这样:
源数据:
E
M 呵呵
M 是王若猫的。
E
M 不是
M 那是什么?
E
M 怎么了
M 我很难过,安慰我~
问答语料集:
呵呵
开心点哈一切都会好起来
我还喜欢她怎么办
短信
词汇表:
__PAD__
__GO__
__EOS__
__UNK__
呵
开
心
点
哈
一
切
都
会
好
起
来
我
词向量:
4 4
5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25
(2) 源数据处理成向量,作为训练的源数据;
[参考文献]
[1]http://blog.topspeedsnail.com/archives/10735/comment-page-1#comment-1161%E3%80%82
[2]https://blog.csdn.net/mach_learn/article/details/41744487