This is the source code from my personal comprehensive lab manual, now hosted on this platform.
# Download the dataset
! wget https://huawei-ai-certification.obs.cn-north-4.myhuaweicloud.com/CHS/HCIP-AI%20EI%20Developer/V2.1/machine_translation/data.zip
# Unzip the dataset
! unzip data.zip
# Import the required libraries
import re
import os
import io
import time
import jieba
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
# Path to the data file
path_to_file = "data/cmn.txt"
# Preprocessing functions
def preprocess_eng(w):
    w = w.lower().strip()
    # Put a space around punctuation so it becomes a separate token
    w = re.sub(r"([?.!,])", r" \1 ", w)
    # Collapse quotes and repeated spaces into a single space
    w = re.sub(r'[" "]+', " ", w)
    # Replace everything except letters and basic punctuation with a space
    w = re.sub(r"[^a-zA-Z?.!,]+", " ", w)
    w = w.strip()
    # Mark the start and end of every sentence
    w = '<start> ' + w + ' <end>'
    return w
def preprocess_chinese(w):
    w = w.lower().strip()
    # Segment the Chinese sentence into words with jieba
    w = jieba.cut(w, cut_all=False, HMM=True)
    w = " ".join(list(w))
    w = '<start> ' + w + ' <end>'
    return w
# Quick test of the preprocessing functions
en_sentence = "May I borrow this book?"
chn_sentence = "我可以借这本书吗?"
print(preprocess_eng(en_sentence))
print(preprocess_chinese(chn_sentence))
# Read the data; each element has the form [English, Chinese]
def create_dataset(path, num_examples=None):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    # Each line is tab-separated: English sentence, Chinese sentence
    word_pairs = [l.split('\t') for l in lines[:num_examples]]
    word_pairs = [[preprocess_eng(w[0]), preprocess_chinese(w[1])]
                  for w in word_pairs]
    return word_pairs
word_pairs = create_dataset(path_to_file)
word_pairs[:20]
# Separate the English and Chinese sentences
en, chn = zip(*create_dataset(path_to_file))
print(en[-1])
print(chn[-1])
print(len(en), len(chn))
# Find the maximum sequence length in the data; every text is padded to a uniform
# length so that the model can be trained properly
def max_length(tensor):
    return max(len(t) for t in tensor)
def tokenize(lang):
    # filters='' keeps every character, so the <start>/<end> markers survive
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    text_ids = lang_tokenizer.texts_to_sequences(lang)
    # Pad every sequence at the end ('post') up to the length of the longest one
    padded_text_ids = tf.keras.preprocessing.sequence.pad_sequences(text_ids, padding='post')
    return padded_text_ids, lang_tokenizer
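# Illustrative sketch (an addition, not part of the original lab): run tokenize()
# on two toy preprocessed sentences to see the padded id matrix and the vocabulary;
# the demo sentences and variable names here are made up for demonstration.
demo_ids, demo_tok = tokenize(['<start> hi . <end>', '<start> how are you ? <end>'])
print(demo_ids)             # shorter sentence is padded with 0 at the end
print(demo_tok.word_index)  # token -> integer id mapping learned by the tokenizer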
# Chinese is the source (input) language, English is the target language
def load_dataset(path, num_examples=None):
    # create_dataset returns [English, Chinese] pairs, so English becomes the target
    targ_lang, inp_lang = zip(*create_dataset(path, num_examples))
    input_data, inp_lang_tokenizer = tokenize(inp_lang)
    target_data, targ_lang_tokenizer = tokenize(targ_lang)
    return input_data, target_data, inp_lang_tokenizer, targ_lang_tokenizer
# Set a limit on the number of training examples (None uses the full dataset)
num_examples = None
input_data, target_data, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)
# Maximum sequence lengths of the target and input data
max_length_targ, max_length_inp = max_length(target_data), max_length(input_data)
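# Optional sanity check (an assumed addition, not in the original lab): map the first
# padded input sequence back to tokens through the tokenizer's index_word dictionary,
# skipping the padding id 0.
print(' '.join(inp_lang.index_word[i] for i in input_data[0] if i != 0))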