自然语言处理:分词(断词)与关键词提取方法(一)
自然语言处理:scrapy爬取关键词信息(二)
1 获取语料
小黄鸡语料:
链接:https://pan.baidu.com/s/1eTuNwfFuCzSsggV4btMweA
提取码:suj1
未分词语料:
E
M 呵呵
M 是王若猫的。
E
M 不是
M 那是什么?
E
M 怎么了
M 我很难过,安慰我~
E
M 开心点哈,一切都会好起来
M 嗯 会的
E
M 我还喜欢她,怎么办
M 我帮你告诉她?发短信还是打电话?
E
M 短信
M 嗯嗯。我也相信
E
分词语料:
E
M 呵/呵
M 是/王/若/猫/的/。
E
M 不/是
M 那/是/什/么/?
E
M 怎/么/了
M 我/很/难/过/,/安/慰/我/~
E
M 开/心/点/哈/,/一/切/都/会/好/起/来
M 嗯/ /会/的
E
M 我/还/喜/欢/她/,/怎/么/办
M 我/帮/你/告/诉/她/?/发/短/信/还/是/打/电/话/?
E
M 短/信
M 嗯/嗯/。/我/也/相/信
E
2 预处理语料
2.1 拆分问答数据集
本文使用未分词的语料进行处理(即上文的"未分词语料")。
- Demo
import os

# Path to the raw (unsegmented) xiaohuangji corpus.
conv_path = "./data/xiaohuangji.conv"
if not os.path.exists(conv_path):
    print("数据集不存在,请检查数据集路径")
    exit()

# convs collects one list of utterances per conversation.
convs = []
with open(conv_path, encoding="utf8") as f:
    # Utterances of the conversation currently being read.
    one_conv = []
    for line in f:
        # For the pre-segmented corpus, additionally strip the '/'
        # separators:  line = line.strip('\n').replace('/', '')
        line = line.strip('\n')
        print("Line vocabulary: {}".format(line))
        if line == '':
            continue
        if line[0] == 'E':
            # 'E' marks the end of one conversation: flush the buffer.
            if one_conv:
                convs.append(one_conv)
            one_conv = []
        elif line[0] == 'M':
            # 'M' lines look like "M <utterance>"; keep the utterance part.
            parts = line.split(' ')
            if len(parts) > 1:  # guard against bare "M" lines (IndexError)
                one_conv.append(parts[1])
            print("One conversation: {}".format(one_conv))
# Bug fix: keep the last conversation even when the corpus does not end
# with an 'E' separator line.
if one_conv:
    convs.append(one_conv)

print("Full conversations: {}".format(convs))
print("Length of full conversations: {}".format(len(convs)))

# Split each conversation into alternating question/answer pairs:
# even-indexed utterances are questions, odd-indexed ones are answers.
questions = []
answers = []
for conv in convs:
    if len(conv) == 1:
        # A single utterance has no reply; drop it.
        continue
    if len(conv) % 2 != 0:
        # Odd number of utterances: discard the trailing unanswered one.
        conv = conv[:-1]
    for i, utterance in enumerate(conv):
        if i % 2 == 0:
            questions.append(utterance)
        else:
            answers.append(utterance)

print("Questions: {}".format(questions))
print("Answers: {}".format(answers))
print("Length of questions: {}".format(len(questions)))
print("Length of answers: {}".format(len(answers)))
- 部分结果
Line vocabulary: M 短信
One conversation: ['短信']
Line vocabulary: M 嗯嗯。我也相信
One conversation: ['短信', '嗯嗯。我也相信']
Line vocabulary: E
Full conversations: [['呵呵', '是王若猫的。'], ['不是', '那是什么?'], ['怎么了', '我很难过,安慰我~'], ['开心点哈,一切都会好起来', '嗯'], ['我还喜欢她,怎么办', '我帮你告诉她?发短信还是打电话?'], ['短信', '嗯嗯。我也相信']]
Length of full conversations: 6
Questions: ['呵呵', '不是', '怎么了', '开心点哈,一切都会好起来', '我还喜欢她,怎么办', '短信']
Answers: ['是王若猫的。', '那是什么?', '我很难过,安慰我~', '嗯', '我帮你告诉她?发短信还是打电话?', '嗯嗯。我也相信']
Lenght of questions: 6
Lenght of answers: 6
2.2 整理训练数据集和测试数据集
- Demo
import os
import random


def question_answer_dataset(questions, answers, TESTSET_SIZE=2):
    """Split parallel question/answer lists into train/test files.

    TESTSET_SIZE pairs are sampled uniformly at random for the test
    split; the remaining pairs go to the training split.  Four files
    are produced under ./data: question_train.enc, answer_train.dec,
    question_test.enc and answer_test.dec (one utterance per line).
    """
    # Make sure the output directory exists before opening files in it.
    os.makedirs('./data', exist_ok=True)
    # Indices reserved for the test set; a set makes the per-item
    # membership check O(1).
    test_index = set(random.sample(range(len(questions)), TESTSET_SIZE))
    # Context managers guarantee the files are closed even on error
    # (the original leaked all four handles on any exception).
    with open('./data/question_train.enc', 'w') as question_train_enc, \
         open('./data/answer_train.dec', 'w') as answer_train_dec, \
         open('./data/question_test.enc', 'w') as question_test_enc, \
         open('./data/answer_test.dec', 'w') as answer_test_dec:
        for i in range(len(questions)):
            if i in test_index:
                question_test_enc.write(questions[i] + '\n')
                answer_test_dec.write(answers[i] + '\n')
            else:
                question_train_enc.write(questions[i] + '\n')
                answer_train_dec.write(answers[i] + '\n')
            if i % 1000 == 0:
                print(len(questions), '处理进度:', i)


if __name__ == "__main__":
    # `questions` / `answers` come from the preceding corpus-splitting
    # step (section 2.1); guard the call so importing this module is safe.
    question_answer_dataset(questions, answers)
- 目录结构
|-- data
| |-- answer_test.dec
| |-- answer_train.dec
| |-- question_test.enc
| |-- question_train.enc
| `-- xiaohuangji.conv
2.3 完整代码
import os
import random


def load_conversations(conv_path):
    """Parse the xiaohuangji corpus into a list of conversations.

    The corpus uses 'E' lines as conversation separators and
    'M <utterance>' lines for the utterances.  Returns a list of
    conversations, each a list of utterance strings.
    """
    convs = []
    one_conv = []
    with open(conv_path, encoding="utf8") as f:
        for line in f:
            # For the pre-segmented corpus, additionally strip the '/'
            # separators:  line = line.strip('\n').replace('/', '')
            line = line.strip('\n')
            if line == '':
                continue
            if line[0] == 'E':
                # End-of-conversation marker: flush the current buffer.
                if one_conv:
                    convs.append(one_conv)
                one_conv = []
            elif line[0] == 'M':
                parts = line.split(' ')
                if len(parts) > 1:  # skip malformed bare "M" lines
                    one_conv.append(parts[1])
    # Bug fix: keep the last conversation even when the file does not
    # end with an 'E' separator.
    if one_conv:
        convs.append(one_conv)
    return convs


def split_questions_answers(convs):
    """Split conversations into parallel question/answer lists.

    Even-indexed utterances are treated as questions, odd-indexed ones
    as answers.  Single-utterance conversations are dropped, and a
    trailing unanswered utterance is discarded.
    """
    questions = []
    answers = []
    for conv in convs:
        if len(conv) == 1:
            continue
        if len(conv) % 2 != 0:
            conv = conv[:-1]
        for i, utterance in enumerate(conv):
            (questions if i % 2 == 0 else answers).append(utterance)
    return questions, answers


def question_answer_dataset(questions, answers, TESTSET_SIZE=2):
    """Write the question/answer lists to train/test files under ./data.

    TESTSET_SIZE pairs are sampled at random for the test split.
    """
    os.makedirs('./data', exist_ok=True)
    test_index = set(random.sample(range(len(questions)), TESTSET_SIZE))
    # Context managers close all four files even on error.
    with open('./data/question_train.enc', 'w') as question_train_enc, \
         open('./data/answer_train.dec', 'w') as answer_train_dec, \
         open('./data/question_test.enc', 'w') as question_test_enc, \
         open('./data/answer_test.dec', 'w') as answer_test_dec:
        for i in range(len(questions)):
            if i in test_index:
                question_test_enc.write(questions[i] + '\n')
                answer_test_dec.write(answers[i] + '\n')
            else:
                question_train_enc.write(questions[i] + '\n')
                answer_train_dec.write(answers[i] + '\n')


if __name__ == "__main__":
    conv_path = "./data/xiaohuangji.conv"
    if not os.path.exists(conv_path):
        print("数据集不存在,请检查数据集路径")
        exit()
    convs = load_conversations(conv_path)
    print("Length of full conversations: {}".format(len(convs)))
    questions, answers = split_questions_answers(convs)
    question_answer_dataset(questions, answers)
2.4 词转向量(word2vec)
2.4.1 生成词汇表
- Demo
import os

# Paths of the question/answer files produced by the previous step.
question_train_encode_file = './data/question_train.enc'
answer_train_decode_file = './data/answer_train.dec'
question_test_encode_file = './data/question_test.enc'
answer_test_decode_file = './data/answer_test.dec'

# Special tokens used to pad and delimit dialogues.
PAD = "__PAD__"  # padding
GO = "__GO__"    # decoder start-of-sequence
EOS = "__EOS__"  # end of dialogue
UNK = "__UNK__"  # characters missing from the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3


def gen_vocabulary_file(input_file, output_file):
    """Build a character vocabulary file from *input_file*.

    Counts every character of every (stripped) line, then writes the
    four special tokens followed by the characters, most frequent
    first, one per line to *output_file*.
    """
    vocabulary = {}
    with open(input_file) as f:
        for line in f:
            for word in line.strip():
                vocabulary[word] = vocabulary.get(word, 0) + 1
    # sorted() is stable, so equally frequent characters keep their
    # first-seen order.
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    print(input_file + " 词汇表大小:", len(vocabulary_list))
    # Bug fix: create the output directory before writing (the original
    # crashed when ./data/word2vec/ did not exist).
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    with open(output_file, "w") as ff:
        for word in vocabulary_list:
            ff.write(word + "\n")


if __name__ == "__main__":
    gen_vocabulary_file(question_train_encode_file, "./data/word2vec/train_question_encode_vocabulary")
    gen_vocabulary_file(answer_train_decode_file, "./data/word2vec/train_answer_decode_vocabulary")
- Result(问题部分词汇表)
__PAD__
__GO__
__EOS__
__UNK__
呵
开
心
点
哈
一
切
都
会
好
起
来
我
还
喜
欢
她
怎
么
办
短
信
2.4.2 生成向量
- Demo
# ID of the __UNK__ token, used for characters missing from the
# vocabulary (defined here so this snippet also runs standalone).
UNK_ID = 3


def convert_to_vector(input_file, vocabulary_file, output_file):
    """Convert each line of *input_file* into a character-ID vector.

    *vocabulary_file* holds one token per line; a token's 0-based line
    number is its ID.  Unknown characters map to UNK_ID.  One
    space-separated ID sequence is written per input line.
    """
    with open(vocabulary_file, "r") as f:
        tmp_vocab = [line.strip() for line in f]
    # Map token -> ID, e.g. {'__PAD__': 0, '__GO__': 1, ...}
    vocab = {token: idx for idx, token in enumerate(tmp_vocab)}
    # Both files are managed by `with`, so the output handle is closed
    # even on error (the original left output_f open on exceptions).
    with open(output_file, 'w') as output_f, open(input_file, 'r') as f:
        for line in f:
            line_vec = [vocab.get(ch, UNK_ID) for ch in line.strip()]
            output_f.write(" ".join(str(num) for num in line_vec) + "\n")


if __name__ == "__main__":
    # Bug fix: the original called convert_to_vector with
    # train_question_encode_vocabulary_file / train_answer_decode_vocabulary_file
    # without ever defining them (NameError); define all paths here.
    question_train_encode_file = './data/question_train.enc'
    answer_train_decode_file = './data/answer_train.dec'
    train_question_encode_vocabulary_file = './data/word2vec/train_question_encode_vocabulary'
    train_answer_decode_vocabulary_file = './data/word2vec/train_answer_decode_vocabulary'
    convert_to_vector(question_train_encode_file, train_question_encode_vocabulary_file, './data/word2vec/train_question_encode.vec')
    convert_to_vector(answer_train_decode_file, train_answer_decode_vocabulary_file, './data/word2vec/train_answer_decode.vec')
- Result(问题部分向量)
4 4
5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25
2.4.3 完整代码
import os

# Paths of the question/answer files produced by the previous step.
question_train_encode_file = './data/question_train.enc'
answer_train_decode_file = './data/answer_train.dec'
question_test_encode_file = './data/question_test.enc'
answer_test_decode_file = './data/answer_test.dec'

# Special tokens used to pad and delimit dialogues.
PAD = "__PAD__"  # padding
GO = "__GO__"    # decoder start-of-sequence
EOS = "__EOS__"  # end of dialogue
UNK = "__UNK__"  # characters missing from the vocabulary
START_VOCABULART = [PAD, GO, EOS, UNK]
PAD_ID = 0
GO_ID = 1
EOS_ID = 2
UNK_ID = 3
# Maximum vocabulary size.  Bug fix: the original declared this but
# never applied it; it now truncates the frequency-sorted vocabulary.
vocabulary_size = 5000


def gen_vocabulary_file(input_file, output_file):
    """Build a character vocabulary file from *input_file*.

    Counts every character of every (stripped) line, then writes the
    four special tokens followed by the most frequent characters (at
    most `vocabulary_size` lines in total) to *output_file*.
    """
    vocabulary = {}
    with open(input_file) as f:
        for line in f:
            for word in line.strip():
                vocabulary[word] = vocabulary.get(word, 0) + 1
    # sorted() is stable, so equally frequent characters keep their
    # first-seen order.
    vocabulary_list = START_VOCABULART + sorted(vocabulary, key=vocabulary.get, reverse=True)
    # Keep only the most frequent entries.
    vocabulary_list = vocabulary_list[:vocabulary_size]
    print(input_file + " 词汇表大小:", len(vocabulary_list))
    # Create the output directory before writing.
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    with open(output_file, "w") as ff:
        for word in vocabulary_list:
            ff.write(word + "\n")


def convert_to_vector(input_file, vocabulary_file, output_file):
    """Convert each line of *input_file* into a character-ID vector.

    *vocabulary_file* holds one token per line; a token's 0-based line
    number is its ID.  Unknown characters map to UNK_ID.  One
    space-separated ID sequence is written per input line.
    """
    with open(vocabulary_file, "r") as f:
        tmp_vocab = [line.strip() for line in f]
    # Map token -> ID, e.g. {'__PAD__': 0, '__GO__': 1, ...}
    vocab = {token: idx for idx, token in enumerate(tmp_vocab)}
    # `with` closes the output handle even on error.
    with open(output_file, 'w') as output_f, open(input_file, 'r') as f:
        for line in f:
            line_vec = [vocab.get(ch, UNK_ID) for ch in line.strip()]
            output_f.write(" ".join(str(num) for num in line_vec) + "\n")


if __name__ == "__main__":
    print('开始创建词汇表...')
    train_question_encode_vocabulary_file = './data/word2vec/train_question_encode_vocabulary'
    train_answer_decode_vocabulary_file = './data/word2vec/train_answer_decode_vocabulary'
    gen_vocabulary_file(question_train_encode_file, train_question_encode_vocabulary_file)
    gen_vocabulary_file(answer_train_decode_file, train_answer_decode_vocabulary_file)
    print("对话转向量...")
    convert_to_vector(question_train_encode_file, train_question_encode_vocabulary_file, './data/word2vec/train_question_encode.vec')
    convert_to_vector(answer_train_decode_file, train_answer_decode_vocabulary_file, './data/word2vec/train_answer_decode.vec')
3 总结
(1) 语料处理分四部分:获取源数据,生成问答语料集,生成词汇表,生成词向量;各数据源长这样:
源数据:
E
M 呵呵
M 是王若猫的。
E
M 不是
M 那是什么?
E
M 怎么了
M 我很难过,安慰我~
问答语料集:
呵呵
开心点哈一切都会好起来
我还喜欢她怎么办
短信
词汇表:
__PAD__
__GO__
__EOS__
__UNK__
呵
开
心
点
哈
一
切
都
会
好
起
来
我
词向量:
4 4
5 6 7 8 9 10 11 12 13 14 15
16 17 18 19 20 21 22 23
24 25
(2) 源数据处理成向量,作为训练的源数据;
[参考文献]
[1]http://blog.topspeedsnail.com/archives/10735/comment-page-1#comment-1161%E3%80%82
[2]https://blog.csdn.net/mach_learn/article/details/41744487