Task4 - Word2Vec

Learning Objectives

  • Learn how to use Word2Vec and understand its basic principles
  • Learn to use TextCNN and TextRNN for text representation
  • Learn to use the HAN network architecture for text classification

Text Representation Methods

Word2Vec Skip-Gram Implementation

1. Read the data and build the corpus dictionary


from collections import Counter, deque
import random
import tensorflow as tf
import math
import pandas as pd
import numpy as np
import gc

train_df = pd.read_csv('train_set.csv')
test_df = pd.read_csv('test_a.csv')
text = pd.concat([train_df['text_nostopwords'][:50000], test_df['text_nostopwords'][:10000]], axis=0).values
# del train_df, test_df

words = []
for line in text:
    words += line.split(' ')

vocabulary_size = 5000

def build_dataset(words):
    # keep the (vocabulary_size - 1) most frequent words; everything else maps to 'UNK'
    count = [['UNK', -1]]
    count.extend(Counter(words).most_common(vocabulary_size - 1))
    gc.collect()
    dictionary = dict()

    # word -> index, in order of decreasing frequency
    for idx, (word, _) in enumerate(count):
        dictionary[word] = idx
    gc.collect()

    # data is the corpus encoded as word indices
    data = list()
    unk_count = 0

    print('Building dictionary...')
    for word in words:
        if word in dictionary:
            data.append(dictionary[word])
        else:
            unk_count += 1
            data.append(0)
    print('Dictionary built')
    gc.collect()
    count[0][1] = unk_count

    # index -> (word, index) tuple, which is why the samples below print as tuples
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.items()))
    print(len(dictionary))
    return data, count, dictionary, reverse_dictionary

data, count, dictionary, reverse_dictionary = build_dataset(words)
Building dictionary...
Dictionary built
5000
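
A quick sanity check of the structures returned by build_dataset (a minimal sketch; '3750' is just an example token that may or may not occur in this corpus):

# Most frequent entries and a round-trip lookup through the two dictionaries
print(count[:5])                   # [['UNK', unk_count], (word, freq), ...]
print(dictionary.get('3750', 0))   # index of the token, 0 (i.e. UNK) if it is absent
print(reverse_dictionary[1])       # (word, index) tuple of the most frequent word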

2. Generate Skip-Gram batch data

data_index = 0

def generate_batch_skip_gram(batch_size, window_size):

    # the global variable data_index tracks the current position in the corpus;
    # it advances by one after each read and wraps around when it reaches the end
    global data_index

    # in Skip-Gram, batch holds the center words
    # and labels holds the corresponding context words
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)

    # span is the total length of the center word plus its window on both sides:
    # [ skip_window target skip_window ]
    span = 2 * window_size + 1

    # buffer holds the words inside the current span
    buffer = deque(maxlen=span)

    # fill the buffer and advance data_index
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    # number of context words sampled for each center word
    num_samples = 2 * window_size

    # the inner loop produces num_samples data points from the current span;
    # the outer loop repeats batch_size // num_samples times to fill the whole batch
    for i in range(batch_size // num_samples):
        k = 0
        # build batch and labels; note that labels never contains the target word itself
        for j in list(range(window_size)) + list(range(window_size + 1, 2 * window_size + 1)):
            batch[i * num_samples + k] = buffer[window_size]
            labels[i * num_samples + k, 0] = buffer[j]
            k += 1

    # after the batch is read, slide the span one position to the right
    buffer.append(data[data_index])
    data_index = (data_index + 1) % len(data)
    return batch, labels

print('data:', [reverse_dictionary[di] for di in data[:8]])

for window_size in [1, 2]:
    data_index = 0
    batch, labels = generate_batch_skip_gram(batch_size=8, window_size=window_size)
    print('\nwith window_size = %d:' %window_size)
    print('    batch:', [reverse_dictionary[bi] for bi in batch])
    print('    labels:', [reverse_dictionary[li] for li in labels.reshape(8)])
data: [('2967', 281), ('6758', 1118), ('339', 1521), ('2021', 510), ('1854', 191), ('3731', 707), ('4109', 203), ('3792', 258)]

with window_size = 1:
    batch: [('6758', 1118), ('6758', 1118), ('6758', 1118), ('6758', 1118), ('6758', 1118), ('6758', 1118), ('6758', 1118), ('6758', 1118)]
    labels: [('2967', 281), ('339', 1521), ('2967', 281), ('339', 1521), ('2967', 281), ('339', 1521), ('2967', 281), ('339', 1521)]

with window_size = 2:
    batch: [('339', 1521), ('339', 1521), ('339', 1521), ('339', 1521), ('339', 1521), ('339', 1521), ('339', 1521), ('339', 1521)]
    labels: [('2967', 281), ('6758', 1118), ('2021', 510), ('1854', 191), ('2967', 281), ('6758', 1118), ('2021', 510), ('1854', 191)]

3. Set the hyperparameters

batch_size = 128
embedding_size = 200             # dimensionality of the embedding vectors

window_size = 2                  # take 2 words on each side of the center word

valid_size = 4                   # number of words randomly chosen to evaluate word similarity

valid_window = 50                # sample the validation words from a window of indices

# when choosing the validation samples, take some high-frequency words
# as well as some moderately rare ones
valid_examples = np.array(random.sample(range(valid_window), valid_size))
valid_examples = np.append(valid_examples, random.sample(range(1000, 1000 + valid_window), valid_size), axis=0)

num_sampled = 32                 # number of negative samples
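
Since valid_examples holds word indices, the reverse dictionary from step 1 can be used to see which words were actually sampled (a minimal sketch; the entries print as (word, index) tuples):

# Quick look at the validation words chosen above
print([reverse_dictionary[idx] for idx in valid_examples])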

4. Define the model parameters

The embedding variable stores the word embedding vectors. It is initialized with random values between -1 and 1 and is adjusted continuously during training: when the loss function is optimized, all of the variables it depends on, including the embedding, are updated.

embedding = tf.Variable(tf.random.uniform([vocabulary_size, embedding_size], -1.0, 1.0))

# softmax_weights and softmax_biases are the output-layer (classification) parameters
softmax_weights = tf.Variable(
    tf.random.truncated_normal([vocabulary_size, embedding_size],
                        stddev=0.5 / math.sqrt(embedding_size))
)
softmax_biases = tf.Variable(tf.random.uniform([vocabulary_size],0.0,0.01))

5. Define the loss function

# look up the embeddings of the training inputs
def get_embedding(x):
    with tf.device('/cpu:0'):
        # for each sample in x, look up the corresponding embedding vector
        x_embed = tf.nn.embedding_lookup(embedding, x)
        return x_embed

# compute a sampled softmax loss using a handful of negative samples each step,
# and use this loss to optimize the weights, biases and embeddings
def nce_loss(x_embed, y):
    y = tf.cast(y, tf.int64)
    loss = tf.reduce_mean(
        tf.nn.sampled_softmax_loss(
            weights=softmax_weights, biases=softmax_biases, inputs=x_embed,
            labels=y, num_sampled=num_sampled, num_classes=vocabulary_size)
    )
    return loss
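
A quick way to check that the two functions fit together is to run the loss on a tiny hand-made batch (a minimal sketch; the indices 1-6 are arbitrary example word ids, not real training pairs):

# Sanity check: loss on a tiny hand-made batch of (center, context) index pairs
x_dummy = np.array([1, 2, 3], dtype=np.int32)         # three center-word indices
y_dummy = np.array([[4], [5], [6]], dtype=np.int32)   # one context-word index per center word
print(nce_loss(get_embedding(x_dummy), y_dummy))      # prints a scalar loss tensor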

6. Define the optimization algorithm

learning_rate = 0.1
optimizer = tf.optimizers.SGD(learning_rate)

# one optimization step
def run_optimization(x, y):
    with tf.device('/cpu:0'):
        # wrap the computation in a GradientTape for automatic differentiation
        with tf.GradientTape() as g:
            emb = get_embedding(x)
            loss = nce_loss(emb, y)

        # compute the gradients
        gradients = g.gradient(loss, [embedding, softmax_weights, softmax_biases])

        # apply the gradients to the embedding and the softmax parameters
        optimizer.apply_gradients(zip(gradients, [embedding, softmax_weights, softmax_biases]))

7. Define a function to compute word similarity

Cosine similarity is used to measure how similar the validation words are to every other word. The norm here is the L2 norm: the embedding matrix is normalized row by row (embedding_norm in the code below), so that only the direction of each word vector matters. The vectors of the validation words are likewise normalized, and the similarity between two words is then their cosine similarity; the larger the value, the more similar the words.

# evaluation
def evaluate(x_embed):
    # compute the cosine similarity between the input embeddings and every word embedding
    x_embed = tf.cast(x_embed, tf.float32)
    x_embed_norm = x_embed / tf.sqrt(tf.reduce_sum(tf.square(x_embed), 1, keepdims=True))
    embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
    cosine_sim_op = tf.matmul(x_embed_norm, embedding_norm, transpose_b=True)
    return cosine_sim_op
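
The valid_examples chosen in step 3 can be fed through evaluate() to inspect nearest neighbours, which is most meaningful after training; this is only a sketch of one possible usage (top_k is an arbitrary choice, and the reverse dictionary entries print as (word, index) tuples):

# Print the nearest neighbours of the validation words
sim = evaluate(get_embedding(np.array(valid_examples))).numpy()
top_k = 8
for i, idx in enumerate(valid_examples):
    nearest = (-sim[i, :]).argsort()[1:top_k + 1]    # skip position 0, the word itself
    print(reverse_dictionary[idx], '->', [reverse_dictionary[k] for k in nearest])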

8. Train the Skip-Gram model

num_steps = 30000

# train for the given number of steps
for step in range(1, num_steps + 1):
    batch_x, batch_y = generate_batch_skip_gram(batch_size, window_size)
    run_optimization(batch_x, batch_y)
    if step % 10000 == 0 or step == 1:
        loss = nce_loss(get_embedding(batch_x), batch_y)
        print("step: %i, loss: %f" % (step, loss))

step: 1, loss: 3.963947
step: 10000, loss: 3.031795
step: 20000, loss: 2.525633
step: 30000, loss: 0.617142
np.save('skip_embeddings',embedding)
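np.save writes the matrix to skip_embeddings.npy, so it can be reloaded later without retraining (a minimal sketch):

# Reload the saved embedding matrix (np.save adds the .npy extension)
skip_embeddings = np.load('skip_embeddings.npy')
print(skip_embeddings.shape)    # expected (vocabulary_size, embedding_size) = (5000, 200)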

9. Use skip_embeddings for text classification

embedding1 = embedding.numpy()
embeddings_matrix = np.zeros((vocabulary_size, 200))
for idx, emb in enumerate(embedding1):
    embeddings_matrix[idx] = emb.tolist()

# convert each document to a fixed-length sequence of word indices (pad/truncate to 2000)
seqs = []
gc.collect()
for line in text:
    word = line.split(' ')
    seq = [dictionary[i] if i in dictionary.keys() else 0 for i in word]
    seq = seq[:2000] + (2000 - len(seq)) * [0]
    seqs.append(seq)
train_seqs = np.array(seqs[:45000])
val_seqs = np.array(seqs[45000:50000])
test_seqs = np.array(seqs[50000:])

# one-hot encode the 14 class labels for the 50000 training documents
labels = np.zeros((50000, 14))
for i in range(50000):
    labels[i][train_df.label[i]] = 1

train_labels = np.array(labels[:45000])
val_labels = np.array(labels[45000:])
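
The same one-hot labels could also be produced with the Keras utility (a sketch over the same 50000-sample slice; the result is equivalent to the loop above):

# Equivalent one-hot encoding using tf.keras.utils.to_categorical
labels = tf.keras.utils.to_categorical(train_df.label[:50000].values, num_classes=14)
train_labels, val_labels = np.array(labels[:45000]), np.array(labels[45000:])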


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocabulary_size, 200, input_length=2000, weights=[embeddings_matrix], trainable=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Conv1D(32, 5, activation='relu'),
    tf.keras.layers.MaxPooling1D(pool_size=4),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(14, activation='sigmoid')
])
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 2000, 200)         1000000   
_________________________________________________________________
dropout (Dropout)            (None, 2000, 200)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, 1996, 32)          32032     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 499, 32)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 32)                6272      
_________________________________________________________________
dense (Dense)                (None, 14)                462       
=================================================================
Total params: 1,038,766
Trainable params: 38,766
Non-trainable params: 1,000,000
_________________________________________________________________
num_epochs = 3
history = model.fit(train_seqs, train_labels, epochs=num_epochs, validation_data=(val_seqs, val_labels), verbose=1)
Epoch 1/3
1407/1407 [==============================] - 692s 492ms/step - loss: 0.2831 - accuracy: 0.9187 - val_loss: 0.3185 - val_accuracy: 0.9090
Epoch 2/3
1000/1407 [====================>.........] - ETA: 3:45 - loss: 0.2534 - accuracy: 0.9258
model.save('sk_self_model.h5')
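
With the model saved, a natural next step is to reload it and predict on the held-out test sequences; this is only a sketch, and the submission file name and format are assumptions:

# Reload the trained model and predict labels for the test split
loaded_model = tf.keras.models.load_model('sk_self_model.h5')
test_pred = loaded_model.predict(test_seqs)                      # shape (10000, 14)
submission = pd.DataFrame({'label': test_pred.argmax(axis=1)})   # assumed submission format
submission.to_csv('submission.csv', index=False)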