import os import sys import math from collections import Counter import numpy as np import random import torch import torch.nn as nn import torch.nn.functional as F import nltk
读入中英文数据
- 英文我们使用nltk的word tokenizer来分词,并且使用小写字母
- 中文我们直接使用单个汉字作为基本单元
In [3]:
def load_data(in_file):
    """Load parallel English-Chinese sentence pairs from a tab-separated file.

    Each input line is expected to be ``<english>\t<chinese>``. English is
    lower-cased and tokenized with nltk's word tokenizer; Chinese is split
    into individual characters. Every sentence is wrapped with "BOS"/"EOS"
    boundary markers.

    Args:
        in_file: path to the tab-separated bilingual text file.

    Returns:
        (en, cn): two parallel lists of token lists.
    """
    en = []
    cn = []
    # Explicit utf-8: the file contains Chinese text and the platform
    # default encoding (e.g. cp1252 on Windows) would fail to decode it.
    with open(in_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip().split("\t")
            en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
            # split chinese sentence into characters
            cn.append(["BOS"] + [c for c in line[1]] + ["EOS"])
    return en, cn

train_file = "nmt/en-cn/train.txt"
dev_file = "nmt/en-cn/dev.txt"
train_en, train_cn = load_data(train_file)
dev_en, dev_cn = load_data(dev_file)
构建单词表
In [4]:
UNK_IDX = 0
PAD_IDX = 1

def build_dict(sentences, max_words=50000):
    """Build a token -> index dictionary from tokenized sentences.

    Indices 0 and 1 are reserved for "UNK" and "PAD"; the ``max_words``
    most frequent tokens are assigned indices starting at 2.

    Args:
        sentences: iterable of token lists.
        max_words: cap on the number of distinct tokens kept.

    Returns:
        (word_dict, total_words): the mapping and the vocabulary size
        including the two reserved entries.
    """
    word_count = Counter()
    for sentence in sentences:
        # Counter.update counts the whole sentence at C speed instead of
        # incrementing one token at a time in a Python inner loop.
        word_count.update(sentence)
    ls = word_count.most_common(max_words)
    total_words = len(ls) + 2  # +2 for the reserved UNK/PAD entries
    # Shift real tokens up by two so 0/1 stay free for UNK/PAD.
    word_dict = {w[0]: index + 2 for index, w in enumerate(ls)}
    word_dict["UNK"] = UNK_IDX
    word_dict["PAD"] = PAD_IDX
    return word_dict, total_words

en_dict, en_total_words = build_dict(train_en)
cn_dict, cn_total_words = build_dict(train_cn)
inv_en_dict = {v: k for k, v in en_dict.items()}
inv_cn_dict = {v: k for k, v in cn_dict.items()}
把单词全部转变成数字
In [5]:
def encode(en_sentences, cn_sentences, en_dict, cn_dict, sort_by_len=True): ''' Encode the sequences. ''' length = len(en_sentences) out_en_sentences = [[en_dict.get(w, 0) for w in sent] for sent in en_sentences] out_cn_sentences = [[cn_dict.get(w, 0) for w in sent] for sent in cn_sentences] # sort sentences by english lengths def len_argsort(seq): return sorted(range(len(seq)), key=lambda x: len(seq[x])) # 把中文和英文按照同样的顺序排序 if sort_by_len: sorted_index = len_argsort(out_en_sentences) out_en_sentences = [out_en_senten