运用神经网络完成机器翻译。使用英语和法语语句组成的数据集,训练一个序列到序列模型(sequence to sequence model),该模型能够将新的英语句子翻译成法语。
获取数据
# Imports for data loading, checkpointing, and corpus statistics.
import os
import pickle  # was "import picker" — not a real module; pickle is the intended stdlib serializer
import copy
import numpy as np
def load_data(path):
    """Load a dataset from a text file.

    Args:
        path: Path to the dataset file.

    Returns:
        The entire file contents as a single string.
    """
    # NOTE: the original wrapped path in os.path.join(path), which is a
    # no-op with a single argument — removed.
    # Decode explicitly as UTF-8 so the corpus reads identically on any platform.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
# Paths to the two corpora: English is the source language, French the target.
# Presumably the files are aligned line-by-line (sentence i of the English file
# corresponds to sentence i of the French file) — confirm against the dataset.
source_path = './data/small_vocab_en'
target_path = './data/small_vocab_fr'
# Read each corpus into one big string; sentences are separated by '\n'.
source_text = load_data(source_path)
target_text = load_data(target_path)
探索数据
调整 view_sentence_range 的取值,以查看并熟悉该数据集的不同部分。
# Half-open range of sentence indices to preview from each corpus.
view_sentence_range = (0, 10)

print("Dataset Stats")
# A set is the idiomatic structure for counting unique tokens; the original
# built a throwaway {word: None} dict just to take its length.
print('Roughly the number of unique words: {}'.format(len(set(source_text.split()))))

sentences = source_text.split("\n")
# Per-sentence token counts, used for the average-length statistic below.
word_counts = [len(sentence.split()) for sentence in sentences]
print('Number of sentences: {}'.format(len(sentences)))
print('Average number of words in a sentence: {}'.format(np.average(word_counts)))

print()
print('English sentences {} to {}:'.format(*view_sentence_range))
# Reuse the already-computed split instead of re-splitting source_text.
print('\n'.join(sentences[view_sentence_range[0]:view_sentence_range[1]]))
print()
print('French sentences {} to {}:'.format(*view_sentence_range))
print('\n'.join(target_text.split('\n')[view_sentence_range[0]:view_sentence_range[1]]))