本文基于《TensorFlow:实战Google深度学习框架》第九章的部分内容,手写一遍代码以加深对模型的理解。
代码在 PyCharm 中编写并运行。
1、获取词汇表
# -*- coding:utf-8 -*-
'''
@Author:zhangy
@Modify:2019.7.5
'''
import codecs
import collections
from operator import itemgetter
# Choose which side of the parallel corpus to process. Each language has its
# own raw corpus file, vocabulary output file and vocabulary size limit.
DATA_TYPE = 'english'
if DATA_TYPE == 'chinese':
    RAW_DATA = 'train.txt.zh'
    VOCAB_OUTPUT = 'zh.vocab'
    VOCAB_SIZE = 4000
elif DATA_TYPE == 'english':
    RAW_DATA = 'train.txt.en'
    VOCAB_OUTPUT = 'en.vocab'
    VOCAB_SIZE = 10000
else:
    # Fail fast: without this branch a mistyped DATA_TYPE would leave the
    # three settings undefined and only surface later as a NameError.
    raise ValueError("unsupported DATA_TYPE: %r" % (DATA_TYPE,))
# Tally how often each whitespace-separated token occurs in the raw corpus.
counter = collections.Counter()
with codecs.open(RAW_DATA, 'r', 'utf-8') as f:
    for line in f:
        # Counter.update adds one to the count of every token in the list.
        counter.update(line.strip().split())
# Rank (word, count) pairs from most to least frequent; pairs with equal
# counts keep the order in which the words were first counted.
sorted_word_to_cnt = counter.most_common()
# Put the sentence-start, unknown-word and sentence-end markers in front of
# the ranked words, then cap the list at VOCAB_SIZE entries (the slice leaves
# the list unchanged when it is already short enough).
# NOTE(review): a corpus that already contains these literal marker strings
# would get them listed twice - confirm the raw data never does.
sorted_word_list = (
    ["<sos>", "<unk>", "<eos>"] + [pair[0] for pair in sorted_word_to_cnt]
)[:VOCAB_SIZE]
# Write one vocabulary entry per line.
with codecs.open(VOCAB_OUTPUT, 'w', 'utf-8') as file_output:
    file_output.writelines(word + '\n' for word in sorted_word_list)
2、根据词汇表将中英文文件中的单词转换为对应的编号(number)
# -*- coding:utf-8 -*-
'''
@Author:zhangy
@Modify:2019.7.5
'''
import codecs
# Choose which side of the parallel corpus to convert. Each language has its
# own raw corpus file, vocabulary file and id-encoded output file.
DATA_TYPE = "english"
if DATA_TYPE == 'chinese':
    RAW_DATA = 'train.txt.zh'
    VOCAB = 'zh.vocab'
    OUTPUT_DATA = 'train.zh'
elif DATA_TYPE == 'english':
    RAW_DATA = 'train.txt.en'
    VOCAB = 'en.vocab'
    OUTPUT_DATA = 'train.en'
else:
    # Fail fast: without this branch a mistyped DATA_TYPE would leave the
    # three paths undefined and only surface later as a NameError.
    raise ValueError("unsupported DATA_TYPE: %r" % (DATA_TYPE,))
# Load the vocabulary file as a list with one stripped entry per line.
with codecs.open(VOCAB, 'r', 'utf-8') as f_vocab:
    vocab = [w.strip() for w in f_vocab.readlines()]
# Map every vocabulary entry to its zero-based position in that list.
word_to_id = {entry: index for index, entry in enumerate(vocab)}
def get_id(word):
    """Return the vocabulary id of ``word``.

    Words that are not present in ``word_to_id`` are mapped to the id stored
    under ``'<unk>'``.

    Raises:
        KeyError: if ``word`` is not in ``word_to_id`` and ``'<unk>'`` is
            missing from it as well.
    """
    if word in word_to_id:
        return word_to_id[word]
    return word_to_id['<unk>']
# Convert the raw corpus to ids: every input line becomes one output line of
# space-separated word ids, with the id of '<eos>' appended to mark the end
# of the sentence.
# Both files are managed by a single `with` statement so they are closed (and
# the output is flushed) even if decoding or an id lookup raises part-way
# through the corpus.
with codecs.open(RAW_DATA, 'r', 'utf-8') as fin, \
        codecs.open(OUTPUT_DATA, 'w', 'utf-8') as fout:
    for line in fin:
        words = line.strip().split() + ['<eos>']
        out_line = " ".join([str(get_id(w)) for w in words]) + '\n'
        fout.write(out_line)