import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import backend as K
from tensorflow.keras import activations
from tensorflow.keras.layers import Layer, Input, Embedding, LSTM, Dense, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
import numpy as np
import pandas as pd
import sys
# Report the interpreter and library versions this notebook was run with.
print(tf.__version__)
print(sys.version_info)
for module in (np, pd, tf):
    print(module.__name__, module.__version__)
2.0.0
sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)
numpy 1.18.1
pandas 1.0.3
tensorflow 2.0.0
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up
# front. Looping over the device list (rather than indexing element 0) keeps
# this a no-op on CPU-only machines, where the list is empty, and applies the
# same setting to every visible GPU.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
Luong(点积)attention 的解释
# A single query vector of width 4.
query = tf.constant([[[1., 1., 1., 3.]]], dtype=tf.float32)
# Two batches, each holding three vectors of width 4.
key_list = tf.constant(
    [
        [[1., 1., 2., 4.], [4., 1., 1., 3.], [1., 1., 2., 1.]],
        [[1., 0., 2., 1.], [1., 2., 1., 2.], [1., 0., 2., 1.]],
    ],
    dtype=tf.float32,
)
# The layer is called with [query, value] only, so no separate key is given.
luong_attention = tf.keras.layers.Attention()
query_value_attention_seq = luong_attention([query, key_list])
query.shape, key_list.shape, query_value_attention_seq.shape
(TensorShape([1, 1, 4]), TensorShape([2, 3, 4]), TensorShape([2, 1, 4]))
# Reproduce the Attention layer by hand:
#   1. dot-product score between the query and every key vector,
#   2. softmax over the last axis to turn scores into weights,
#   3. weighted sum of the key vectors.
scores = tf.linalg.matmul(query, key_list, transpose_b=True)
distribution = tf.nn.softmax(scores, axis=-1)
result = tf.linalg.matmul(distribution, key_list)
scores.shape, distribution.shape, result.shape
print(scores, distribution, result)
tf.Tensor(
[[[16. 15. 7.]]
[[ 6. 10. 6.]]], shape=(2, 1, 3), dtype=float32) tf.Tensor(
[[[7.3099267e-01 2.6891717e-01 9.0211659e-05]]
[[1.7668420e-02 9.6466309e-01 1.7668420e-02]]], shape=(2, 1, 3), dtype=float32) tf.Tensor(
[[[1.8067516 1.0000001 1.731083 3.7308123 ]]
[[0.99999994 1.9293262 1.0353367 1.9646629 ]]], shape=(2, 1, 4), dtype=float32)
distribution 是对 scores 做 softmax 之后得到的注意力权重,每一行给出各个输入向量的权重值。
以第一个 batch 为例:[0.731, 0.269, 9.02e-5] 分别是 [1., 1., 2., 4.]、[4., 1., 1., 3.]、[1., 1., 2., 1.] 这三个向量的权重值。
-------分割线-------------
def generate_sequence(length, n_unique):
    """Return a list of ``length`` random integers.

    Each value is drawn independently with ``np.random.randint(1, n_unique - 1)``.
    The upper bound is exclusive, so values lie in ``[1, n_unique - 2]``.

    Args:
        length: number of integers to draw.
        n_unique: controls the (exclusive) upper bound ``n_unique - 1``.
    """
    sequence = []
    for _ in range(length):
        sequence.append(np.random.randint(1, n_unique - 1))
    return sequence
def get_dataset(n_in, n_out, cardinality, n_samples):
    """Build a toy seq2seq dataset of integer sequences.

    For every sample a random source sequence of ``n_in`` integers is drawn
    with ``generate_sequence(n_in, cardinality)``. The target is the first
    ``n_out`` source values in reverse order, and the decoder input is that
    target shifted right by one step with a leading 0.

    Args:
        n_in: length of each source sequence.
        n_out: number of leading source values that form the target.
        cardinality: forwarded to ``generate_sequence`` as ``n_unique``.
        n_samples: number of samples to generate.

    Returns:
        Three ``np.ndarray`` objects: sources, shifted decoder inputs, targets.
    """
    encoder_rows, decoder_rows, label_rows = [], [], []
    for _ in range(n_samples):
        # Random source sequence.
        source = generate_sequence(n_in, cardinality)
        # Target: the first n_out source values, reversed.
        target = source[:n_out][::-1]
        # Decoder input: the target delayed by one time step, starting with 0.
        target_in = [0] + target[:-1]
        encoder_rows.append(source)
        decoder_rows.append(target_in)
        label_rows.append(target)
    return np.array(encoder_rows), np.array(decoder_rows), np.array(label_rows)
# 500 samples: 6-value sources whose first 3 values (reversed) are the target.
X1, X2, y = get_dataset(n_in=6, n_out=3, cardinality=1000, n_samples=500)
print(X1.shape, X2.shape, y.shape)
(500, 6) (500, 3) (500, 3)
# Force the source ids to int64, then look at the largest id on each side.
X1 = X1.astype(np.int64)
X1.max(), y.max()
(998, 998)
# Hyper-parameters for the shape walkthrough below.
vocab_size = 999  # largest id + 1
max_len_enc = 6
max_len_dec = 3
embedding_dim = 32
hidden_units = 64
# Embed the source ids; id 0 is masked (mask_zero=True).
encoder_embedding_demo = Embedding(vocab_size, embedding_dim, mask_zero=True)
x = encoder_embedding_demo(X1)
x.shape
TensorShape([500, 6, 32])
# Encoder LSTM: keep the per-step outputs and the final hidden/cell states.
encoder_lstm_demo = LSTM(hidden_units, return_state=True, return_sequences=True)
enc_outputs, enc_state_h, enc_state_c = encoder_lstm_demo(x)
enc_outputs.shape, enc_state_h.shape, enc_state_c.shape
(TensorShape([500, 6, 64]), TensorShape([500, 64]), TensorShape([500, 64]))
# The decoder LSTM will start from the encoder's final hidden and cell states.
dec_states_inputs = [enc_state_h, enc_state_c]
# NOTE(review): this embeds the targets `y` themselves; a teacher-forced
# decoder would normally be fed the shifted sequence `X2` - confirm intent.
decoder_embedding_demo = Embedding(vocab_size, embedding_dim, mask_zero=True)
Y = decoder_embedding_demo(y)
Y.shape
TensorShape([500, 3, 32])
# Decoder LSTM, initialised with the encoder's final states.
decoder_lstm_demo = LSTM(hidden_units, return_state=True, return_sequences=True)
dec_outputs, dec_state_h, dec_state_c = decoder_lstm_demo(
    Y, initial_state=dec_states_inputs
)
dec_outputs.shape, dec_state_h.shape, dec_state_c.shape
(TensorShape([500, 3, 64]), TensorShape([500, 64]), TensorShape([500, 64]))
# Decoder outputs act as queries over the encoder outputs.
attention_layer_demo = Attention()
attention_output = attention_layer_demo([dec_outputs, enc_outputs])
attention_output.shape
TensorShape([500, 3, 64])
# Project every decoder step onto a softmax distribution over the vocabulary.
output_projection_demo = Dense(vocab_size, activation='softmax', name="dense")
dense_outputs = output_projection_demo(attention_output)
dense_outputs.shape
TensorShape([500, 3, 999])
y.shape
(500, 3)
class Encoder_1(tf.keras.Model):
    """Seq2seq encoder: token ids -> embeddings -> LSTM outputs and final states."""

    def __init__(self, vocab_size, embedding_dim, hidden_units):
        """Create the embedding and LSTM layers.

        Args:
            vocab_size: size of the embedding table.
            embedding_dim: width of each embedding vector (free choice).
            hidden_units: number of LSTM units.
        """
        super().__init__()
        # Embedding and LSTM only transform the last axis of their input.
        self.embedding = Embedding(vocab_size, embedding_dim, mask_zero=True)
        self.encoder_lstm = LSTM(
            hidden_units,
            return_state=True,
            return_sequences=True,
            name="encode_lstm",
        )

    def call(self, inputs):
        """Encode ``inputs`` and return ``(outputs, state_h, state_c)``."""
        # presumably (batch, seq_len, embedding_dim) - TODO confirm
        embedded = self.embedding(inputs)
        # presumably (batch, seq_len, hidden_units) plus two (batch, hidden_units) states
        sequence_out, final_h, final_c = self.encoder_lstm(embedded)
        return sequence_out, final_h, final_c
class Decoder_1(tf.keras.Model):
    """Seq2seq decoder: embedding -> LSTM -> attention over the encoder outputs."""

    def __init__(self, vocab_size, embedding_dim, hidden_units):
        """Create the embedding, LSTM and attention layers.

        Args:
            vocab_size: size of the embedding table.
            embedding_dim: width of each embedding vector.
            hidden_units: number of LSTM units.
        """
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, embedding_dim, mask_zero=True
        )
        self.decoder_lstm = tf.keras.layers.LSTM(
            hidden_units,
            return_state=True,
            return_sequences=True,
            name="decode_lstm",
        )
        self.attention = tf.keras.layers.Attention()

    def call(self, enc_outputs, dec_inputs, states_inputs):
        """Decode ``dec_inputs`` and attend over ``enc_outputs``.

        Args:
            enc_outputs: encoder output sequence, attended over.
            dec_inputs: decoder token ids.
            states_inputs: initial state passed to the decoder LSTM.

        Returns:
            ``(attention_output, dec_state_h, dec_state_c)``. Only the
            attention output is returned for the sequence; the raw LSTM
            outputs are used as the attention query.
        """
        embedded_targets = self.embedding(dec_inputs)
        lstm_result = self.decoder_lstm(embedded_targets, initial_state=states_inputs)
        dec_outputs, dec_state_h, dec_state_c = lstm_result
        # The layer receives [decoder outputs, encoder outputs], in that order.
        context = self.attention([dec_outputs, enc_outputs])
        return context, dec_state_h, dec_state_c
def Seq2Seq_1(maxlen, embedding_dim, hidden_units, vocab_size):
    """Assemble the encoder / attention-decoder / softmax seq2seq Keras model.

    Args:
        maxlen: fixed length of the encoder input sequences.
        embedding_dim: embedding width used by encoder and decoder.
        hidden_units: LSTM units used by encoder and decoder.
        vocab_size: vocabulary size; also the width of the softmax output.

    Returns:
        A ``Model`` taking ``[encoder_inputs, decoder_inputs]`` and producing a
        softmax over the vocabulary for every decoder step.
    """
    # Inputs: fixed-length encoder ids, variable-length decoder ids.
    encoder_inputs = tf.keras.Input(shape=(maxlen,), name="encode_input")
    decoder_inputs = tf.keras.Input(shape=(None,), name="decode_input")

    # Encoder: sequence outputs plus the final [h, c] states.
    encoder = Encoder_1(vocab_size, embedding_dim, hidden_units)
    enc_outputs, *encoder_states = encoder(encoder_inputs)

    # Decoder starts from the encoder states; its own final states are unused here.
    decoder = Decoder_1(vocab_size, embedding_dim, hidden_units)
    attention_output, _, _ = decoder(enc_outputs, decoder_inputs, encoder_states)

    # Per-step distribution over the vocabulary.
    dense_outputs = Dense(vocab_size, activation='softmax', name="dense")(attention_output)

    return Model(inputs=[encoder_inputs, decoder_inputs], outputs=dense_outputs)
def load_data(in_file, encoding='latin-1'):
    """Load a tab-separated parallel corpus (English <TAB> Chinese per line).

    Every non-blank line is split on tabs. The first field is lower-cased and
    word-tokenised with ``nltk.word_tokenize``; the second field is split into
    single characters. Both token lists are wrapped in "BOS" / "EOS" markers
    (beginning / end of sentence).

    Args:
        in_file: path of the corpus file.
        encoding: text encoding used to open the file. Defaults to 'latin-1',
            the value that used to be hard-coded. Under latin-1 every byte
            decodes to one character, so multi-byte UTF-8 Chinese characters
            are split into several pseudo-characters; pass ``encoding="utf-8"``
            for a UTF-8 corpus.

    Returns:
        ``(en, cn)``: two parallel lists of token lists.

    Raises:
        IndexError: if a non-blank line has no tab-separated second field.
    """
    # NOTE(review): `nltk` is not imported in the visible part of this file -
    # confirm the module imports it before this function is called.
    en, cn = [], []
    with open(in_file, 'r', encoding=encoding) as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) carry no sentence pair.
                continue
            fields = stripped.split("\t")
            en.append(["BOS"] + nltk.word_tokenize(fields[0].lower()) + ["EOS"])
            # Split the Chinese sentence into characters.
            cn.append(["BOS"] + list(fields[1]) + ["EOS"])
    return en, cn
# Demo of str.split with an explicit single-space separator.
# The variable is named `sample_text` so the builtin `str` is not shadowed.
sample_text = "0000000321QQQQQ 00000Runoob01230000000"
sample_text.split(" ")
['0000000321QQQQQ', '', '', '', '00000Runoob01230000000']
def read_vocab(vocab_path):
    """Read a UTF-8 vocabulary file and return one entry per line.

    Each line is passed through ``str.strip()``, which removes leading and
    trailing whitespace (including the newline) but never characters in the
    middle of the line. Blank lines therefore yield empty strings.
    """
    with open(vocab_path, "r", encoding="utf8") as f:
        return [entry.strip() for entry in f]
def read_data(data_path):
    """Read a UTF-8 text file and return one list of whitespace-separated words per line.

    Blank lines produce an empty list.
    """
    with open(data_path, "r", encoding="utf8") as f:
        return [row.strip().split() for row in f]
def process_data_index(datas, vocab2id):
    """Map every word of every sentence to its id.

    Words that are not in ``vocab2id`` are mapped to ``vocab2id["<UNK>"]``;
    that key is only looked up when such a word is actually encountered.

    Args:
        datas: iterable of word sequences.
        vocab2id: mapping from word to integer id.

    Returns:
        A list with one list of ids per input sentence.
    """
    return [
        [vocab2id[token if token in vocab2id else "<UNK>"] for token in sentence]
        for sentence in datas
    ]
# Load the raw word list and peek at its first ten entries.
VOCAB_PATH = "data/ch_word_vocab.txt"
vocab_words = read_vocab(VOCAB_PATH)
vocab_words[:10]
['呵呵', '不是', '怎么', '了', '开心', '点', '哈', ',', '一切', '都会']
len(vocab_words)
70130
# Build the vocabulary with the special tokens in front, so that
# <PAD>, <UNK>, <GO>, <EOS> take ids 0-3 and real words start at 4.
# The word list is re-read here so the cell can be re-run without prepending
# the special tokens a second time. (An earlier copy of `special_words` /
# `vocab2id` built without the special tokens was dead code and is dropped.)
special_words = ["<PAD>", "<UNK>", "<GO>", "<EOS>"]
vocab_words = special_words + read_vocab("data/ch_word_vocab.txt")
vocab2id = {word: i for i, word in enumerate(vocab_words)}
id2vocab = {i: word for i, word in enumerate(vocab_words)}

# Keep only the first `num_sample` question/answer pairs and convert them to ids.
num_sample = 1000
source_data = read_data("data/ch_source_data_seg.txt")[:num_sample]
source_data_ids = process_data_index(source_data, vocab2id)
target_data = read_data("data/ch_target_data_seg.txt")[:num_sample]
target_data_ids = process_data_index(target_data, vocab2id)

# Spot-check the vocabulary and one source/target pair.
print("vocab test: ", [id2vocab[i] for i in range(10)])
print("source test: ", source_data[10])
print("source index: ", source_data_ids[10])
print("target test: ", target_data[10])
print("target index: ", target_data_ids[10])
vocab test: ['<PAD>', '<UNK>', '<GO>', '<EOS>', '呵呵', '不是', '怎么', '了', '开心', '点']
source test: ['许兵', '是', '谁']
source index: [26, 27, 24]
target test: ['是', '我', '善良', '可爱', '的', '主人', '的', '老公', '啊']
target index: [27, 16, 9572, 436, 45, 452, 45, 274, 111]
vocab2id
{'<PAD>': 0,
'<UNK>': 1,
'<GO>': 2,
'<EOS>': 3,
'呵呵': 4,
'不是': 5,
'怎么': 6,
.
.
.
.
'吐出来': 998,
'丧尸': 999,
...}
id2vocab
{0: '<PAD>',
1: '<UNK>',
2: '<GO>',
3: '<EOS>',
4: '呵呵',
5: '不是',
6: '怎么',
7: '了',
8: '开心',
9: '点',
10: '哈',
.
.
.
.
995: '哪里找',
996: '稀',
997: '浓',
998: '吐出来',
999: '丧尸',
...}
source_data
[['呵呵'],
['不是'],
['怎么', '了'],
['开心', '点', '哈', ',', '一切', '都会', '好', '起来'],
['我', '还', '喜欢', '她', ',', '怎么办'],
['短信'],
['你', '知道', '谁', '么'],
['许兵', '是', '谁'],
['这么', '假'],
['许兵', '是', '傻', '逼'],
['许兵', '是', '谁'],
['许兵', '是', '谁'],
['许兵', '是', '谁'],
['许兵', '到底', '是', '谁'],
['尼玛', ',', '许兵', '到底', '是', '谁'],
['小黄', '鸭', ',', '你', '有', '女朋友', '么'],
['那', '你', '有', '男朋友', '么'],
['那', '你', '在', '哪'],
['你', '妈', '是', '谁'],
['去', '你', '大爷', '的'],
['你', '在', '骂', '我', '一', '句'],
['你', '大爷', '的'],
['你', '是', '屌丝', '鸡'],
['高富帅'],
['你'],
['呵呵'],
['今天', '是', '谁', '的', '生日'],
['你', '敢不敢'],
['呵呵'],
['呵', '呵呵'],
['你', '是', '女', '的', '了', '?', '怎么回事'],
['呵呵'],
['天王', '盖地虎'],
['小通'],
['在', '监考', ',', '你', '在', '干么'],
['哼', '!', '你', '不想', '我', '我', '就', '不', '和', '你', '玩'],
['你', '要', '气死我', '吗', '?', '坏蛋'],
['恩',
',',
'也',
'是',
'!',
'那',
'我',
'不能',
'生气',
'啦',
'!',
'你',
'生气',
'就',
'行'],
['你', '谈', '过', '恋爱', '么'],
['什么', '让', '你', '这么', '伤心'],
['敢问', '你', '的', '性别'],
['小', '受'],
['是', '吗'],
['你', '干嘛'],
['为什么'],
['你', '有', '爱情', '了'],
['那', '同时', '有', '两', '个', '爱人'],
['那', '你', '不行'],
.
.
.
.
['主人', '我', '想', '炖', '了', '你'],
['好', '无聊'],
['怎么办', '?', '要', '不要', '去', '吃', '翔']]
source_data_ids
[[4],
[5],
[6, 7],
[8, 9, 10, 11, 12, 13, 14, 15],
[16, 17, 18, 19, 11, 20],
[21],
[22, 23, 24, 25],
[26, 27, 24],
[28, 29],
[26, 27, 30, 31],
[26, 27, 24],
[26, 27, 24],
[26, 27, 24],
[26, 32, 27, 24],
[33, 11, 26, 32, 27, 24],
[34, 35, 11, 22, 36, 37, 25],
[38, 22, 36, 39, 25],
[38, 22, 40, 41],
[22, 42, 27, 24],
[43, 22, 44, 45],
[22, 40, 46, 16, 47, 48],
[22, 44, 45],
[22, 27, 49, 50],
[51],
[22],
[4],
[52, 27, 24, 45, 53],
[22, 54],
[4],
[55, 4],
[22, 27, 56, 45, 7, 57, 58],
.
.
.
.
.
.
[171, 993, 9, 498],
[571, 50],
[452, 16, 103, 269, 7, 22],
[14, 248],
[20, 57, 71, 219, 43, 207, 994]]
def process_input_data(source_data_ids, target_indexs, vocab2id):
    """Add <GO>/<EOS> markers to paired source and target id sequences.

    Args:
        source_data_ids: list of source id lists.
        target_indexs: list of target id lists, paired with the sources by
            position (pairing stops at the shorter of the two).
        vocab2id: word-to-id mapping containing "<GO>" and "<EOS>".

    Returns:
        ``(source_inputs, decoder_inputs, decoder_outputs)`` where
        each source is wrapped as ``<GO> + source + <EOS>``,
        each decoder input is ``<GO> + target`` and
        each decoder output is ``target + <EOS>``.
    """
    pairs = list(zip(source_data_ids, target_indexs))
    source_inputs = [
        [vocab2id["<GO>"]] + src + [vocab2id["<EOS>"]] for src, _ in pairs
    ]
    decoder_inputs = [[vocab2id["<GO>"]] + tgt for _, tgt in pairs]
    decoder_outputs = [tgt + [vocab2id["<EOS>"]] for _, tgt in pairs]
    return source_inputs, decoder_inputs, decoder_outputs
# Wrap every pair with <GO>/<EOS> markers and preview the first two rows of each.
source_input_ids, target_input_ids, target_output_ids = process_input_data(
    source_data_ids=source_data_ids,
    target_indexs=target_data_ids,
    vocab2id=vocab2id,
)
for label, rows in (
    ("encoder inputs: ", source_input_ids),
    ("decoder inputs: ", target_input_ids),
    ("decoder outputs: ", target_output_ids),
):
    print(label, rows[:2])
encoder inputs: [[2, 4, 3], [2, 5, 3]]
decoder inputs: [[2, 27, 37846, 756, 45, 180], [2, 38, 27, 84, 49272]]
decoder outputs: [[27, 37846, 756, 45, 180, 3], [38, 27, 84, 49272, 3]]
# Show every decoder input: <GO> followed by the target ids.
for _source, target in zip(source_data_ids, target_data_ids):
    print([vocab2id["<GO>"], *target])
[2, 27, 37846, 756, 45, 180]
[2, 38, 27, 84, 49272]
[2, 16, 6692, 82, 49273, 320, 16, 518]
[2, 526]
[2, 16, 438, 22, 328, 19, 49272, 15817, 254, 1764, 49272]
[2, 928, 180, 16, 76, 855]
[2, 2143, 5, 16, 49273, 27, 49274]
[2, 49275, 465, 3504, 89, 762]
[2, 9100, 9101, 76, 29, 49273, 68, 1715, 45, 2222, 111]
[2, 684, 22, 2699, 7, 180]
[2, 27, 16, 9572, 436, 45, 452, 45, 274, 111]
[2, 27, 49276, 45, 1460, 111]
[2, 8347]
[2, 219, 5550, 16, 49277, 1929, 6887, 6999, 27, 49278, 49277]
[2, 16, 144, 16, 376, 328, 22, 16, 27, 33674, 45, 49279, 2893, 49280, 2080]
[2, 8990, 27, 56, 45, 520, 49279]
[2, 277, 282, 49273, 16, 1391, 452, 440, 102, 10569, 16, 514, 180, 180]
[2, 16, 248]
[2, 16, 1251, 27, 851, 777, 293, 2252, 45, 452, 22, 111]
[2, 16, 43, 49273, 346, 22, 44, 45, 49279]
[2, 22, 416, 417, 31072, 518]
[2, 49281, 20183, 7, 180, 4123, 27, 19, 491, 1474, 45, 2501, 26132, 7]
…
…
…
…
[2, 452, 68, 71, 514, 511]
[2, 4389, 111, 1080, 1080, 111]
[2, 22, 5492, 152, 111, 180]
# Show every decoder output: the target ids followed by <EOS>.
for _source, target in zip(source_data_ids, target_data_ids):
    print([*target, vocab2id["<EOS>"]])
[27, 37846, 756, 45, 180, 3]
[38, 27, 84, 49272, 3]
[16, 6692, 82, 49273, 320, 16, 518, 3]
[526, 3]
[16, 438, 22, 328, 19, 49272, 15817, 254, 1764, 49272, 3]
[928, 180, 16, 76, 855, 3]
[2143, 5, 16, 49273, 27, 49274, 3]
[49275, 465, 3504, 89, 762, 3]
[9100, 9101, 76, 29, 49273, 68, 1715, 45, 2222, 111, 3]
[684, 22, 2699, 7, 180, 3]
[27, 16, 9572, 436, 45, 452, 45, 274, 111, 3]
[27, 49276, 45, 1460, 111, 3]
[8347, 3]
[219, 5550, 16, 49277, 1929, 6887, 6999, 27, 49278, 49277, 3]
[16, 144, 16, 376, 328, 22, 16, 27, 33674, 45, 49279, 2893, 49280, 2080, 3]
[8990, 27, 56, 45, 520, 49279, 3]
[277, 282, 49273, 16, 1391, 452, 440, 102, 10569, 16, 514, 180, 180, 3]
[16, 248, 3]
[16, 1251, 27, 851, 777, 293, 2252, 45, 452, 22, 111, 3]
[16, 43, 49273, 346, 22, 44, 45, 49279, 3]
[22, 416, 417, 31072, 518, 3]
[49281, 20183, 7, 180, 4123, 27, 19, 491, 1474, 45, 2501, 26132, 7, 3]
[928, 3]
[49282, 111, 3]
[7870, 3]
[22, 148, 3]
[27, 16, 22695, 566, 6615, 45, 53, 49279, 3]
[22, 892, 16, 67, 892, 49273, 748, 49273, 892, 731, 16, 41188, 3]
[4, 22, 128, 151, 3]
[317, 16, 3546, 3]
[22, 2921, 16, 45, 3]
[55, 22, 148, 49273, 22, 4, 7, 16, 71, 6, 473, 22, 180, 407, 49272, 3]
[4285, 1037, 4286, 180, 3]
[31723, 40, 3]
[40, 49283, 3702, 22316, 9153, 180, 3]
[38, 16, 67, 167, 595, 70, 43, 49279, 3]
[78, 27, 3355, 595, 45, 4446, 7488, 483, 180, 8753, 40, 22, 78, 45, 442, 49273, 24, 40, 49284, 282, 49272, 49285, 85, 367, 49286, 7, 49279, 3]
[16, 101, 7, 49279, 16, 2949, 1730, 7, 49273, 5457, 3]
[36011, 49273, 372, 49273, 9958, 7, 49273, 86, 3546, 3]
[847, 16, 45, 38140, 852, 81, 5182, 22, 25818, 49273, 38, 16, 69, 22, 296, 49273, 16, 216, 22, 180, 3]
.
.
.
.
.
.
.
[452, 68, 71, 514, 511, 3]
[4389, 111, 1080, 1080, 111, 3]
[22, 5492, 152, 111, 180, 3]
# Pad (with 0 on the right) or truncate every sequence to exactly `maxlen` ids.
# NOTE(review): pad_sequences truncates from the FRONT by default
# (truncating='pre'), so an over-long decoder input loses its leading <GO>
# marker - confirm this is intended or pass truncating='post'.
maxlen = 10
source_input_ids = keras.preprocessing.sequence.pad_sequences(source_input_ids, padding='post', maxlen=maxlen)
target_input_ids = keras.preprocessing.sequence.pad_sequences(target_input_ids, padding='post', maxlen=maxlen)
target_output_ids = keras.preprocessing.sequence.pad_sequences(target_output_ids, padding='post', maxlen=maxlen)
# Preview the three PADDED arrays (the first print previously showed the raw,
# unpadded `source_data_ids` by mistake).
print(source_input_ids[:5])
print(target_input_ids[:5])
print(target_output_ids[:5])
[[4], [5], [6, 7], [8, 9, 10, 11, 12, 13, 14, 15], [16, 17, 18, 19, 11, 20]]
[[ 2 27 37846 756 45 180 0 0 0 0]
[ 2 38 27 84 49272 0 0 0 0 0]
[ 2 16 6692 82 49273 320 16 518 0 0]
[ 2 526 0 0 0 0 0 0 0 0]
[ 16 438 22 328 19 49272 15817 254 1764 49272]]
[[ 27 37846 756 45 180 3 0 0 0 0]
[ 38 27 84 49272 3 0 0 0 0 0]
[ 16 6692 82 49273 320 16 518 3 0 0]
[ 526 3 0 0 0 0 0 0 0 0]
[ 438 22 328 19 49272 15817 254 1764 49272 3]]
# Hyper-parameters of the chatbot model; the vocabulary includes the special tokens.
maxlen, embedding_dim, hidden_units = 10, 50, 128
vocab_size = len(vocab2id)
model = Seq2Seq_1(
    maxlen=maxlen,
    embedding_dim=embedding_dim,
    hidden_units=hidden_units,
    vocab_size=vocab_size,
)
model.summary()
Model: "model"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
encode_input (InputLayer) [(None, 10)] 0
__________________________________________________________________________________________________
encoder_1_1 (Encoder_1) ((None, 10, 128), (N 3598348 encode_input[0][0]
__________________________________________________________________________________________________
decode_input (InputLayer) [(None, None)] 0
__________________________________________________________________________________________________
decoder_1_1 (Decoder_1) ((None, None, 128), 3598348 encoder_1_1[0][0]
__________________________________________________________________________________________________
dense (Dense) (None, None, 70134) 9047286 decoder_1_1[0][0]
==================================================================================================
Total params: 16,243,982
Trainable params: 16,243,982
Non-trainable params: 0
__________________________________________________________________________________________________
# Render the model graph, annotated with tensor shapes.
keras.utils.plot_model(model, show_shapes=True)
def generate_sequence(length, n_unique):
    """Return a list of ``length`` random integers.

    Each value is drawn independently with ``np.random.randint(1, n_unique - 1)``.
    The upper bound is exclusive, so values lie in ``[1, n_unique - 2]``.
    """
    drawn = []
    for _ in range(length):
        drawn.append(np.random.randint(1, n_unique - 1))
    return drawn
def get_dataset(n_in, n_out, cardinality, n_samples):
    """Build a toy seq2seq dataset of integer sequences.

    Each sample is a random source of ``n_in`` integers obtained from
    ``generate_sequence(n_in, cardinality)``. Its target is the first
    ``n_out`` source values reversed, and its decoder input is that target
    shifted right by one step with a leading 0.

    Returns:
        Three ``np.ndarray`` objects: sources, shifted decoder inputs, targets.
    """
    sources, shifted_targets, targets = [], [], []
    for _ in range(n_samples):
        # Random source sequence.
        source = generate_sequence(n_in, cardinality)
        # Target: the first n_out source values, reversed.
        target = list(reversed(source[:n_out]))
        # Decoder input: the target delayed by one time step, starting with 0.
        target_in = [0] + target[:-1]
        sources.append(source)
        shifted_targets.append(target_in)
        targets.append(target)
    return np.array(sources), np.array(shifted_targets), np.array(targets)
# 20 samples of length 10 whose target is the whole source reversed.
X1, X2, y = get_dataset(n_in=10, n_out=10, cardinality=100, n_samples=20)
X1.shape
(20, 10)
# Symbolic (functional-API) walkthrough of Embedding -> LSTM shapes.
Embedding_dim = 32
# Bound to `input_tensor` rather than `Input`, so the `Input` imported from
# tensorflow.keras.layers at the top of the file is not overwritten by a tensor.
input_tensor = tf.keras.Input(shape=X1.shape[1:])
# NOTE(review): the embedding table has only 20 rows while X1 was generated
# with cardinality 100 - fine for inspecting shapes, but confirm before
# feeding real data through this layer.
x = tf.keras.layers.Embedding(20, Embedding_dim)(input_tensor)
xx, h, c = tf.keras.layers.LSTM(64, return_sequences=True, return_state=True)(x)
# A second, independent LSTM layer, built only to display its three outputs.
tf.keras.layers.LSTM(64, return_sequences=True, return_state=True)(x)
[<tf.Tensor 'lstm_1/Identity:0' shape=(None, 10, 64) dtype=float32>,
<tf.Tensor 'lstm_1/Identity_1:0' shape=(None, 64) dtype=float32>,
<tf.Tensor 'lstm_1/Identity_2:0' shape=(None, 64) dtype=float32>]
x.shape
TensorShape([None, 10, 32])