This is the source code from my personal comprehensive lab manual, now hosted on this platform.
# Download the dataset
! wget https://huawei-ai-certification.obs.cn-north-4.myhuaweicloud.com/CHS/HCIP-AI%20EI%20Developer/V2.1/machine_translation/data.zip
# Unzip the dataset
! unzip data.zip
# Import the required libraries
import re
import os
import io
import time
import jieba
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
# Path to the data file
path_to_file = "data/cmn.txt"
# Preprocessing functions
def preprocess_eng(w):
    w = w.lower().strip()
    # Put a space around punctuation so it becomes a separate token
    w = re.sub(r"([?.!,])", r" \1 ", w)
    # Collapse quotes and repeated spaces into a single space
    w = re.sub(r'[" "]+', " ", w)
    # Replace everything except letters and basic punctuation with a space
    w = re.sub(r"[^a-zA-Z?.!,]+", " ", w)
    w = w.strip()
    # Mark the start and end of every sentence
    w = '<start> ' + w + ' <end>'
    return w
def preprocess_chinese(w):
    w = w.lower().strip()
    # Segment the Chinese sentence into words with jieba
    w = jieba.cut(w, cut_all=False, HMM=True)
    w = " ".join(list(w))
    w = '<start> ' + w + ' <end>'
    return w
# Quick test of the preprocessing functions
en_sentence = "May I borrow this book?"
chn_sentence = "我可以借这本书吗?"
print(preprocess_eng(en_sentence))
print(preprocess_chinese(chn_sentence))
# Read the data; each element has the form [English, Chinese]
def create_dataset(path, num_examples=None):
    lines = open(path, encoding='UTF-8').read().strip().split('\n')
    # Each line is tab-separated: English sentence, Chinese sentence
    word_pairs = [l.split('\t') for l in lines[:num_examples]]
    word_pairs = [[preprocess_eng(w[0]), preprocess_chinese(w[1])]
                  for w in word_pairs]
    return word_pairs
word_pairs = create_dataset(path_to_file)
word_pairs[:20]
# Separate the English and Chinese sentences
en, chn = zip(*create_dataset(path_to_file))
print(en[-1])
print(chn[-1])
print(len(en), len(chn))
# Find the maximum sequence length in the data; every text is padded to a uniform
# length so that the model can be trained properly
def max_length(tensor):
    return max(len(t) for t in tensor)
def tokenize(lang):
    # filters='' keeps every character, so the <start>/<end> markers survive
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    text_ids = lang_tokenizer.texts_to_sequences(lang)
    # Pad every sequence at the end ('post') up to the length of the longest one
    padded_text_ids = tf.keras.preprocessing.sequence.pad_sequences(text_ids, padding='post')
    return padded_text_ids, lang_tokenizer
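# Illustrative sketch (an addition, not part of the original lab): run tokenize()
# on two toy preprocessed sentences to see the padded id matrix and the vocabulary;
# the demo sentences and variable names here are made up for demonstration.
demo_ids, demo_tok = tokenize(['<start> hi . <end>', '<start> how are you ? <end>'])
print(demo_ids)             # shorter sentence is padded with 0 at the end
print(demo_tok.word_index)  # token -> integer id mapping learned by the tokenizer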
# Chinese is the source (input) language, English is the target language
def load_dataset(path, num_examples=None):
    # create_dataset returns [English, Chinese] pairs, so English becomes the target
    targ_lang, inp_lang = zip(*create_dataset(path, num_examples))
    input_data, inp_lang_tokenizer = tokenize(inp_lang)
    target_data, targ_lang_tokenizer = tokenize(targ_lang)
    return input_data, target_data, inp_lang_tokenizer, targ_lang_tokenizer
# Set a limit on the number of training examples (None uses the full dataset)
num_examples = None
input_data, target_data, inp_lang, targ_lang = load_dataset(path_to_file, num_examples)
# Maximum sequence lengths of the target and input data
max_length_targ, max_length_inp = max_length(target_data), max_length(input_data)
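# Optional sanity check (an assumed addition, not in the original lab): map the first
# padded input sequence back to tokens through the tokenizer's index_word dictionary,
# skipping the padding id 0.
print(' '.join(inp_lang.index_word[i] for i in input_data[0] if i != 0))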