[Embedding Series] Embedding Behavioral Sequence Data with an LSTM Model

Using an LSTM model to embed variable-length sequence data
An LSTM is a variant of the recurrent neural network (RNN) that can effectively capture and model long-term dependencies in sequence data. With its memory cells and gating mechanisms, an LSTM can adaptively store and forget information based on the context of the sequence, which lets it handle long-range dependencies well.
The resulting embeddings can be used to measure the similarity between users' behavior sequences and to make more accurate predictions of behaviors such as payment and churn.

import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
# Create a small test dataset
data = {'user_id': [1, 2, 3, 4],
        'sequences': [
            ['click', 'play', 'download'],
            ['upload', 'click'],
            ['play'],
            ['add', 'upload', 'click', 'play']
        ],
        'labels': [1, 0, 0, 2]}
# Build a DataFrame from the dictionary
df = pd.DataFrame(data)

df.head()
   user_id                   sequences  labels
0        1     [click, play, download]       1
1        2             [upload, click]       0
2        3                      [play]       0
3        4  [add, upload, click, play]       2
# Build the vocabulary (word -> index); index 0 is reserved for padding
word2idx = {}
idx = 1
for seq in df["sequences"]:
    for word in seq:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1
word2idx
{'click': 1, 'play': 2, 'download': 3, 'upload': 4, 'add': 5}
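The vocabulary above only covers words seen in the training data. As a minimal sketch (the helper name and the choice of mapping unseen words to the padding index 0 are assumptions for illustration), a new sequence could be encoded like this:
# Hypothetical helper: encode a new behavior sequence with the existing vocabulary;
# words missing from word2idx fall back to 0, which the model treats as padding
def encode_sequence(seq, word2idx):
    return [word2idx.get(word, 0) for word in seq]

encode_sequence(['click', 'share', 'play'], word2idx)  # 'share' is unseen -> [1, 0, 2]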
# Convert the sequences to their index encodings
df["sequences"] = [[word2idx[word] for word in seq] for seq in df["sequences"]]
df.head()
   user_id     sequences  labels
0        1     [1, 2, 3]       1
1        2        [4, 1]       0
2        3           [2]       0
3        4  [5, 4, 1, 2]       2
# Pad the variable-length sequences to a common length
padded_sequences = pad_sequence([torch.tensor(seq) for seq in df["sequences"]], batch_first=True, padding_value=0)
# Expand each label to match the model's output dimension (10 here), so MSE can be used as the training signal
expanded_labels = torch.zeros(len(df), 10, dtype=torch.float32)
for i, label in enumerate(df['labels']):
    expanded_labels[i, :] = torch.tensor([label] * 10)
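To make the shapes concrete: there are 4 users, the longest sequence has 4 events, and the target dimension is 10, so an optional sanity check would show the following:
# Optional sanity check on the padded inputs and expanded labels
print(padded_sequences.shape)  # torch.Size([4, 4])  -> (num_users, max_sequence_length)
print(expanded_labels.shape)   # torch.Size([4, 10]) -> (num_users, target_dim)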
# Define the LSTM model; adapt the architecture to your data and requirements
class LSTMEmbedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(LSTMEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 10)
    
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, (h_n, c_n) = self.lstm(x)
        # h_n has shape (num_layers, batch, hidden_dim); use the last layer's hidden state as the sequence representation
        x = self.fc(h_n.squeeze(0))
        return x
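One caveat: the padded zeros are fed straight through the LSTM, so they can influence the final hidden state h_n. A minimal sketch of an alternative (not what this post trains; the class name LSTMEmbeddingPacked and the lengths argument are assumptions for illustration) packs the sequences with their true lengths so the LSTM stops at each sequence's last real event:
from torch.nn.utils.rnn import pack_padded_sequence

class LSTMEmbeddingPacked(nn.Module):
    # Variant of LSTMEmbedding that ignores padding by packing the sequences
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 10)

    def forward(self, x, lengths):
        # lengths: true (unpadded) length of each sequence, e.g. [len(seq) for seq in df['sequences']]
        emb = self.embedding(x)
        packed = pack_padded_sequence(emb, lengths, batch_first=True, enforce_sorted=False)
        _, (h_n, _) = self.lstm(packed)  # h_n is taken at each sequence's last real time step
        return self.fc(h_n.squeeze(0))
The rest of this post keeps the simpler unpacked model above.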
# Vocabulary size (+1 to account for the padding index 0)
dictionary_size = len(word2idx) + 1
embedding_dim = 128
hidden_dim = 50
model = LSTMEmbedding(dictionary_size, embedding_dim, hidden_dim)
# Loss function
criterion = nn.MSELoss()
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Train the model
Y_train = expanded_labels

for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    outputs = model(padded_sequences)
    loss = criterion(outputs, Y_train)
    
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/100], Loss: {loss.item():.4f}')
Epoch [10/100], Loss: 0.7141
Epoch [20/100], Loss: 0.2935
Epoch [30/100], Loss: 0.0737
Epoch [40/100], Loss: 0.0163
Epoch [50/100], Loss: 0.0036
Epoch [60/100], Loss: 0.0022
Epoch [70/100], Loss: 0.0007
Epoch [80/100], Loss: 0.0004
Epoch [90/100], Loss: 0.0002
Epoch [100/100], Loss: 0.0001
# Inference
model.eval()
# Disable gradient computation during prediction to reduce memory use and speed up execution
with torch.no_grad():
    embeddings = model(padded_sequences)
# Convert the tensor to a nested list so it can be stored as a DataFrame column
tensor_list = embeddings.tolist()
df['Embedding'] = tensor_list
df.head()
   user_id     sequences  labels                                          Embedding
0        1     [1, 2, 3]       1  [0.9973136186599731, 0.9970695376396179, 1.003...
1        2        [4, 1]       0  [0.007720008492469788, 0.01210467517375946, -0...
2        3           [2]       0  [-0.0053714364767074585, -0.001586258411407470...
3        4  [5, 4, 1, 2]       2  [1.993236780166626, 2.0014617443084717, 1.9920...
embeddings
tensor([[ 9.9731e-01,  9.9707e-01,  1.0039e+00,  9.9692e-01,  9.9279e-01,
          9.9808e-01,  9.9333e-01,  1.0054e+00,  1.0042e+00,  9.9489e-01],
        [ 7.7200e-03,  1.2105e-02, -6.0443e-03,  2.2997e-02,  2.2623e-02,
         -1.7220e-02, -6.5562e-03, -5.7095e-03, -1.0300e-03,  1.3729e-02],
        [-5.3714e-03, -1.5863e-03,  1.4794e-02, -2.6556e-02, -2.1868e-02,
          8.8659e-03, -9.2685e-06,  9.5384e-03,  1.4281e-04, -1.4242e-02],
        [ 1.9932e+00,  2.0015e+00,  1.9920e+00,  2.0031e+00,  1.9985e+00,
          1.9993e+00,  2.0001e+00,  1.9908e+00,  1.9968e+00,  1.9987e+00]])
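As mentioned in the introduction, these vectors can be compared to measure how similar two users' behavior sequences are. A minimal sketch using cosine similarity (the choice of metric is an assumption; any vector distance would do):
import torch.nn.functional as F

# Pairwise cosine similarity between every pair of user embeddings -> a 4 x 4 matrix
similarity = F.cosine_similarity(embeddings.unsqueeze(1), embeddings.unsqueeze(0), dim=-1)
print(similarity)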