Learning Objectives
- Learn how to use Word2Vec and understand its underlying principles
- Learn to use TextCNN and TextRNN for text representation
- Learn to use the HAN network architecture for text classification
Text Representation Methods
Word2Vec: Skip-Gram Implementation
1. Read the data and build the corpus dictionary
from collections import Counter, deque
import random
import math
import gc

import numpy as np
import pandas as pd
import tensorflow as tf

# Read the data and collect the tokens of the first 50,000 training and
# 10,000 test documents (stop words already removed) into one corpus.
train_df = pd.read_csv('train_set.csv')
test_df = pd.read_csv('test_a.csv')
text = pd.concat([train_df['text_nostopwords'][:50000], test_df['text_nostopwords'][:10000]], axis=0).values
# del train_df, test_df  # train_df is still needed later to build the labels

words = []
for line in text:
    words += line.split(' ')
vocabulary_size = 5000

def build_dataset(words):
    # Keep the vocabulary_size - 1 most frequent tokens; everything else is
    # mapped to UNK (id 0).
    count = [['UNK', -1]]
    count.extend(Counter(words).most_common(vocabulary_size - 1))
    gc.collect()
    dictionary = dict()
    for idx, (word, _) in enumerate(count):
        dictionary[word] = idx
    gc.collect()
    data = list()
    unk_count = 0
    print('Building dictionary...')
    for word in words:
        if word in dictionary:
            data.append(dictionary[word])
        else:
            unk_count += 1
            data.append(0)
    print('Dictionary built')
    gc.collect()
    count[0][1] = unk_count
    # Map integer ids back to their tokens
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    print(len(dictionary))
    return data, count, dictionary, reverse_dictionary
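The call to the function is not shown explicitly, but the printed output below and the later use of data, dictionary and reverse_dictionary imply it; a minimal sketch:
data, count, dictionary, reverse_dictionary = build_dataset(words)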
Building dictionary...
Dictionary built
5000
2. Generate Skip-Gram batch data
data_index = 0

def generate_batch_skip_gram(batch_size, window_size):
    # The global data_index records how far into the corpus we have read; it is
    # incremented after every token and wraps back to the start at the end.
    global data_index
    # In Skip-Gram, batch holds the center words and labels holds the
    # corresponding context words.
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
    # span is the total length of the center word plus the windows on both
    # sides: [ skip_window target skip_window ]
    span = 2 * window_size + 1
    # buffer holds the tokens inside the current span
    buffer = deque(maxlen=span)
    # Fill the buffer and advance data_index
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    # Number of context words to take for each center word
    num_samples = 2 * window_size
    # The inner loop yields num_samples (center, context) pairs from the current
    # span; the outer loop repeats batch_size // num_samples times to fill the batch.
    for i in range(batch_size // num_samples):
        k = 0
        # Fill batch and labels; the center (target) word itself never appears in labels
        for j in list(range(window_size)) + list(range(window_size + 1, 2 * window_size + 1)):
            batch[i * num_samples + k] = buffer[window_size]
            labels[i * num_samples + k, 0] = buffer[j]
            k += 1
        # After producing num_samples pairs, slide the span one token to the right
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return batch, labels
print('data:', [reverse_dictionary[di] for di in data[:8]])

for window_size in [1, 2]:
    data_index = 0
    batch, labels = generate_batch_skip_gram(batch_size=8, window_size=window_size)
    print('\nwith window_size = %d:' % window_size)
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])
data: ['2967', '6758', '339', '2021', '1854', '3731', '4109', '3792']
with window_size = 1:
    batch: ['6758', '6758', '339', '339', '2021', '2021', '1854', '1854']
    labels: ['2967', '339', '6758', '2021', '339', '1854', '2021', '3731']
with window_size = 2:
    batch: ['339', '339', '339', '339', '2021', '2021', '2021', '2021']
    labels: ['2967', '6758', '2021', '1854', '6758', '339', '1854', '3731']
3. Set the hyperparameters
batch_size = 128
embedding_size = 200   # dimensionality of the embedding vectors
window_size = 2        # take 2 words on each side of the center word

valid_size = 4         # number of words randomly picked to evaluate similarity
valid_window = 50      # the validation words are sampled from a window of this size
# For the validation set, choose some high-frequency words as well as some
# moderately rare ones.
valid_examples = np.array(random.sample(range(valid_window), valid_size))
valid_examples = np.append(valid_examples, random.sample(range(1000, 1000 + valid_window), valid_size), axis=0)
num_sampled = 32       # number of negative samples
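As a quick sanity check (not part of the original code), the sampled validation ids can be mapped back to tokens with the reverse_dictionary built in step 1:
# Show the tokens behind the validation ids: some frequent, some moderately rare
print([reverse_dictionary[int(i)] for i in valid_examples])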
4. Define the model parameters
The embedding variable stores the word-embedding vectors. It is initialized with random values between -1 and 1 and is adjusted continuously during training: when the loss function is optimized, every variable it depends on is updated.
embedding = tf.Variable(tf.random.uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# softmax_weights and softmax_biases are the parameters of the output classifier
softmax_weights = tf.Variable(
    tf.random.truncated_normal([vocabulary_size, embedding_size],
                               stddev=0.5 / math.sqrt(embedding_size))
)
softmax_biases = tf.Variable(tf.random.uniform([vocabulary_size], 0.0, 0.01))
5. Define the loss function
# Look up the embeddings of a batch of token ids
def get_embedding(x):
    with tf.device('/cpu:0'):
        # For every sample in x, fetch the corresponding embedding vector
        x_embed = tf.nn.embedding_lookup(embedding, x)
        return x_embed

# Compute a sampled softmax loss using a handful of negative samples each step;
# this loss is used to optimize the weights, biases and embeddings.
def nce_loss(x_embed, y):
    y = tf.cast(y, tf.int64)
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(
            weights=softmax_weights, biases=softmax_biases, inputs=x_embed,
            labels=y, num_sampled=num_sampled, num_classes=vocabulary_size)
    )
    return loss
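For reference (an aside, not part of the original code), sampled softmax approximates the full softmax cross-entropy over all vocabulary_size classes; a sketch of the exact loss it approximates:
def full_softmax_loss(x_embed, y):
    # Exact softmax cross-entropy over the whole vocabulary (slow; for comparison only)
    logits = tf.matmul(x_embed, softmax_weights, transpose_b=True) + softmax_biases
    y = tf.reshape(tf.cast(y, tf.int64), [-1])
    return tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))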
6. Define the optimizer
learning_rate = 0.1
optimizer = tf.optimizers.SGD(learning_rate)

# One optimization step
def run_optimization(x, y):
    with tf.device('/cpu:0'):
        # Wrap the computation in a GradientTape for automatic differentiation
        with tf.GradientTape() as g:
            emb = get_embedding(x)
            loss = nce_loss(emb, y)
        # Compute the gradients
        gradients = g.gradient(loss, [embedding, softmax_weights, softmax_biases])
        # Update the variables along their gradients
        optimizer.apply_gradients(zip(gradients, [embedding, softmax_weights, softmax_biases]))
7. Define a function to compute word similarity
Cosine similarity is used to measure how similar the validation words are to every other word. The embedding matrix is divided by its L2 norm to obtain a normalized matrix (embedding_norm below) whose row vectors keep only their direction; the validation words' vectors are normalized in the same way, and the similarity of two words is then the cosine of the angle between their vectors: the larger the value, the more similar the words.
# Evaluation
def evaluate(x_embed):
    # Compute the cosine similarity between the input embeddings and every
    # embedding vector in the vocabulary.
    x_embed = tf.cast(x_embed, tf.float32)
    x_embed_norm = x_embed / tf.sqrt(tf.reduce_sum(tf.square(x_embed)))
    embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
    cosine_sim_op = tf.matmul(x_embed_norm, embedding_norm, transpose_b=True)
    return cosine_sim_op
8. Train the Skip-Gram model
num_steps = 30000

# Train for the given number of steps (the validation words chosen above can be
# used to spot-check similarities after training).
for step in range(1, num_steps + 1):
    batch_x, batch_y = generate_batch_skip_gram(batch_size, window_size)
    run_optimization(batch_x, batch_y)
    if step % 10000 == 0 or step == 1:
        loss = nce_loss(get_embedding(batch_x), batch_y)
        print("step: %i, loss: %f" % (step, loss))
step: 1, loss: 3.963947
step: 10000, loss: 3.031795
step: 20000, loss: 2.525633
step: 30000, loss: 0.617142
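The evaluate function and valid_examples defined above are not actually called in the training loop; a minimal sketch (assuming the trained embedding and the reverse_dictionary from step 1) of how they can be used to inspect the nearest neighbours of the validation words:
# Look up the trained embeddings of the validation words and print their
# nearest neighbours by cosine similarity.
sim = evaluate(get_embedding(valid_examples)).numpy()   # (len(valid_examples), vocabulary_size)
top_k = 8                                               # neighbours to show
for i, valid_id in enumerate(valid_examples):
    nearest = (-sim[i, :]).argsort()[1:top_k + 1]       # skip the word itself
    neighbours = [reverse_dictionary[idx] for idx in nearest]
    print('Nearest to %s: %s' % (reverse_dictionary[valid_id], ', '.join(neighbours)))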
np.save('skip_embeddings', embedding.numpy())  # written to skip_embeddings.npy
9. Use skip_embeddings for text classification
embedding1 = embedding.numpy()
embeddings_matrix = np.zeros((vocabulary_size, 200))
for idx, emb in enumerate(embedding1):
    embeddings_matrix[idx] = emb.tolist()

# Convert every document into a fixed-length sequence of 2000 token ids
# (unknown tokens map to 0, shorter documents are padded with 0).
seqs = []
gc.collect()
for line in text:
    word = line.split(' ')
    seq = [dictionary[i] if i in dictionary else 0 for i in word]
    seq = seq[:2000] + (2000 - len(seq)) * [0]
    seqs.append(seq)
train_seqs = np.array(seqs[:45000])
val_seqs = np.array(seqs[45000:50000])
test_seqs = np.array(seqs[50000:])

# One-hot encode the 14 class labels of the 50,000 training documents
labels = np.zeros((50000, 14))
for i in range(50000):
    labels[i][train_df.label[i]] = 1
train_labels = np.array(labels[:45000])
val_labels = np.array(labels[45000:])
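As an aside (not in the original), the same one-hot labels could also be produced with the Keras utility:
# Equivalent one-hot encoding of the 14 classes, assuming the same train_df
labels = tf.keras.utils.to_categorical(train_df.label[:50000].values, num_classes=14)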
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabulary_size, 200, input_length=2000, weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(32, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    # Note: with categorical_crossentropy, softmax is the standard activation for
    # a single-label 14-class output; sigmoid still trains but does not yield a
    # normalized class distribution.
    tf.keras.layers.Dense(14, activation='sigmoid')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 2000, 200) 1000000
_________________________________________________________________
dropout (Dropout) (None, 2000, 200) 0
_________________________________________________________________
conv1d (Conv1D) (None, 1996, 32) 32032
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 499, 32) 0
_________________________________________________________________
bidirectional (Bidirectional (None, 32) 6272
_________________________________________________________________
dense (Dense) (None, 14) 462
=================================================================
Total params: 1,038,766
Trainable params: 38,766
Non-trainable params: 1,000,000
_________________________________________________________________
num_epochs = 3
history = model.fit(train_seqs, train_labels, epochs=num_epochs, validation_data=(val_seqs, val_labels), verbose=1)
Epoch 1/3
1407/1407 [==============================] - 692s 492ms/step - loss: 0.2831 - accuracy: 0.9187 - val_loss: 0.3185 - val_accuracy: 0.9090
Epoch 2/3
1000/1407 [====================>.........] - ETA: 3:45 - loss: 0.2534 - accuracy: 0.9258
model.save('sk_self_model.h5')
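The test sequences test_seqs built in step 9 are never used above; a minimal sketch of how predictions for them could be generated (the output file name and column name are assumptions):
# Predict a class for each test document and save the results
# ('submission.csv' and the 'label' column name are assumptions).
preds = model.predict(test_seqs, verbose=1)   # shape: (10000, 14)
pred_labels = preds.argmax(axis=1)            # most probable class id per document
pd.DataFrame({'label': pred_labels}).to_csv('submission.csv', index=False)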