实现预处理函数
文本到单词id
首先将文本转换成数字。在函数text_to_ids()中，请将source_text和target_text中的单词转为id。注意：需要在target_text中每个句子的末尾添加<EOS>单词id，这样模型才能预测句子应该在什么地方结束。
通过以下代码获取<EOS>单词id：
target_vocab_to_int['<EOS>']
使用source_vocab_to_int和target_vocab_to_int获得其他单词id。
# Reserved special tokens and their fixed ids. '<EOS>' marks the end of a
# target sentence and '<UNK>' stands in for out-of-vocabulary words.
CODES = {
    '<PAD>': 0,
    '<EOS>': 1,
    '<UNK>': 2,
    '<GO>': 3,
}
def text_to_ids(source_text, target_text, source_vocab_to_int, target_vocab_to_int):
    """Convert source and target text into nested lists of word ids.

    Each text is split into sentences on ``'\\n'`` and each sentence into
    words on whitespace. A word that is missing from a vocabulary is mapped to
    that same vocabulary's ``'<UNK>'`` id. The target ``'<EOS>'`` id is
    appended to every target sentence so the end of a sentence can be
    predicted.

    Args:
        source_text: Source sentences separated by newlines.
        target_text: Target sentences separated by newlines.
        source_vocab_to_int: Mapping from source word to id.
        target_vocab_to_int: Mapping from target word to id.

    Returns:
        A ``(source_id_text, target_id_text)`` tuple; each element is a list
        with one list of ids per sentence.

    Raises:
        KeyError: If ``target_vocab_to_int`` has no ``'<EOS>'`` entry, or if a
            vocabulary has no ``'<UNK>'`` entry while its text contains at
            least one word.
    """
    def encode(line, vocab_to_int):
        # Fall back to this vocabulary's own <UNK> id: source and target
        # vocabularies are built independently and need not share ids.
        return [vocab_to_int.get(word, vocab_to_int['<UNK>'])
                for word in line.split()]

    # The special-token key includes the angle brackets.
    eos_id = target_vocab_to_int['<EOS>']

    source_id_text = [encode(line, source_vocab_to_int)
                      for line in source_text.split('\n')]
    target_id_text = [encode(line, target_vocab_to_int) + [eos_id]
                      for line in target_text.split('\n')]
    return source_id_text, target_id_text
def create_lookup_tables(text):
    """Build word-to-id and id-to-word lookup tables for a corpus.

    The reserved tokens in ``CODES`` keep their fixed ids. Every other
    distinct whitespace-separated token in *text* receives the next free id,
    starting at ``len(CODES)``. Tokens are numbered in sorted order so the
    mapping is the same on every interpreter run.

    Args:
        text: The corpus as one string; tokens are separated by whitespace.

    Returns:
        A ``(vocab_to_int, int_to_vocab)`` tuple of dicts that are inverses
        of each other.
    """
    # Drop tokens spelled like a reserved token so they cannot overwrite a
    # reserved id, then sort: iteration order of a set of strings is not
    # stable between runs.
    vocab = sorted(set(text.split()).difference(CODES))

    # Work on a shallow copy so the shared CODES constant is never mutated.
    vocab_to_int = dict(CODES)
    for word_id, word in enumerate(vocab, len(CODES)):
        vocab_to_int[word] = word_id

    int_to_vocab = {word_id: word for word, word_id in vocab_to_int.items()}
    return vocab_to_int, int_to_vocab
def test_text_to_ids(text_to_ids):
test_source_text = 'new jersey is sometimes quiet during autumn , and it is snowy in april .\nthe united states is usually chilly during july , and it is usually freezing in november .\ncalifornia is usually quiet during march , and it is usually hot in june .\nthe united states is sometimes mild during june , and it is cold in september .'
test_target_text = 'new jersey est parfois calme pendant l\' automne , et il est neigeux en avril .\nles états-unis est généralement froid en juillet , et il gèle habituellement en novembre .\ncalifornia est généralement calme en mars , et il est généralement chaud en juin .\nles états-unis est parfois légère en juin , et il fait froid en septembre .'
test_source_text = test_source_text.lower()
test_target_text = text_target_text.lower()
source_vocab_to_int, source_int_to_vocab = create_lookup_tables(test_source_text)
target_vocab_to_int, target_int_to_vocab = create_lookup_tables(test_target_text)