Reproducing FM in PyTorch

Overview

Factorization machines (FM) cross features with one another so that more useful information can be learned from them. The core idea is to learn a latent vector for each feature; for the details, please see the post on factorization machines. This article records the process of reproducing FM on the MovieLens dataset.
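For reference, the second-order FM model and the O(kn) reformulation of its pairwise term (which the forward pass below computes as sum_square - square_sum) are:

$$\hat{y}(\mathbf{x}) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle x_i x_j$$

$$\sum_{i=1}^{n}\sum_{j=i+1}^{n} \langle \mathbf{v}_i, \mathbf{v}_j \rangle x_i x_j = \frac{1}{2} \sum_{f=1}^{k} \left[ \left( \sum_{i=1}^{n} v_{i,f} x_i \right)^2 - \sum_{i=1}^{n} v_{i,f}^2 x_i^2 \right]$$

The implementation below keeps only the linear and pairwise terms (no global bias $w_0$) and realizes both through embedding lookups, since every feature here is categorical.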

Data preprocessing

To make it easier to inspect the data after each processing step, the preprocessing is done in a Jupyter notebook.
First, import the required packages:

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

Processing the users data

  1. Read in the users data
users = pd.read_table('../../dataset/ml-1m/users.dat', sep='::', header=None, engine='python', encoding='utf-8').to_numpy()
  2. Keep only the first 3 digits of the zip code
for user in users:  
    user[4] = int(user[4][0:3]) # keep only the first 3 digits of the zip code
  3. Label-encode the userId, gender, age, occupation, and zipCode columns (a short illustration of LabelEncoder follows the screenshot below)
le = LabelEncoder()
for i in range(users.shape[1]):
    users[:, i] = le.fit_transform(users[:, i])

Processing result: (screenshot of the processed users array)
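As a side note on step 3: LabelEncoder maps each distinct value in a column to an integer in [0, number of distinct values), which is exactly what the embedding lookups in the model expect. A minimal sketch (the concrete codes depend on the sorted unique values of the column):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
print(le.fit_transform(['F', 'M', 'M', 'F']))      # [0 1 1 0]
print(le.fit_transform([48067, 70072, 48067]))     # [0 1 0]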

Processing the ratings data

  1. Read in the ratings data
ratings = pd.read_table('../../dataset/ml-1m/ratings.dat', sep='::', header=None, engine='python', encoding='utf-8').to_numpy()[:, :3]
  2. Split into positive and negative samples (a vectorized alternative is sketched below)
for i in range(len(ratings)):
    ratings[i][2] = 1 if ratings[i][2] > 2 else 0
  3. Label-encode the columns
le = LabelEncoder()
for i in range(ratings.shape[1]):
    ratings[:, i] = le.fit_transform(ratings[:, i])

Processing result: (screenshot of the processed ratings array)
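Incidentally, the rating binarization in step 2 can also be written without the Python loop; an equivalent vectorized sketch (assuming ratings is the integer NumPy array loaded above):

ratings[:, 2] = (ratings[:, 2] > 2).astype(int)  # 1 = positive sample, 0 = negative sample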

Merging the users and ratings data

  1. Merge the users and ratings data (the resulting column order is noted below)
unames = ['userId', 'gender', 'age', 'occupation', 'zipCode']
rnames = ['userId', 'movieId', 'rating']

users = pd.DataFrame(users, columns=unames)
ratings = pd.DataFrame(ratings, columns=rnames)
data = pd.merge(users, ratings, on=['userId'])

Processing result: (screenshot of the merged DataFrame)
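Because the merge is keyed on userId, the merged DataFrame contains seven columns in the order below; this is what the label/feature split in the Dataset class later relies on (the rating column comes last):

print(data.columns.tolist())
# ['userId', 'gender', 'age', 'occupation', 'zipCode', 'movieId', 'rating']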

  2. Write the merged data to a file
data.reset_index(drop=True, inplace=True)
data.to_csv('../data/processed.csv', index=False)

Result after writing the file: (screenshot of processed.csv)

Dataset (dataset)

import numpy as np
import pandas as pd
from torch.utils.data import Dataset as TorchDataset


class Dataset(TorchDataset):
    def __init__(self):
        data = pd.read_csv("/workspace/myFM/data/processed.csv", sep=',', engine='python').to_numpy()
        self.x, self.y = self._process_data(data)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, index):
        return (self.x[index], self.y[index])

    def _process_data(self, data):
        y = data[:, -1]                  # the last column (rating) is the label
        x = np.delete(data, -1, axis=1)  # drop the rating column, keep the feature columns
        return x, y

    def get_shape(self):
        return self.x.shape

    def get_vocabs_size(self):  # number of categories for each feature
        return np.max(self.x, axis=0) + 1
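A quick sanity check (a sketch, assuming processed.csv sits at the path above): get_vocabs_size returns max + 1 per column, which equals the number of categories only because LabelEncoder produces consecutive integer codes starting at 0.

dataset = Dataset()
print(dataset.get_shape())        # (number of samples, 6 feature columns)
print(dataset.get_vocabs_size())  # one entry per feature: userId, gender, age, occupation, zipCode, movieId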

Model

import torch


class FM(torch.nn.Module):
    '''
    @param n: the number of features
    @param k: the dim of the latent vectors
    @param vocabs_size: the number of categories of every feature
    '''
    def __init__(self, n, k, vocabs_size):
        super(FM, self).__init__()
        self.n = n
        # first-order (linear) weights: one scalar embedding per feature field
        self.linearEmbedList = torch.nn.ModuleList([torch.nn.Embedding(vocab_size, 1)
            for vocab_size in vocabs_size])
        # second-order latent vectors: one k-dim embedding per feature field
        self.interEmbedList = torch.nn.ModuleList()
        for i in range(n):
            self.interEmbedList.append(torch.nn.Embedding(vocabs_size[i], k))
        self.sigmoid = torch.nn.Sigmoid()

    '''
    @param x: (batch_size * n)
    '''
    def forward(self, x):
        linear_list = []
        inter_list = []
        for i in range(self.n):
            linear_list.append(self.linearEmbedList[i](x[:, i]))  # look up the linear weight
            inter_list.append(self.interEmbedList[i](x[:, i]))    # look up the latent vector
        linear = torch.stack(linear_list, dim=0).sum(dim=0)  # (bs, 1)
        # pairwise term: 0.5 * sum_f [ (sum_i v_if)^2 - sum_i v_if^2 ]
        sum_square = torch.square(torch.stack(inter_list, dim=0).sum(dim=0))  # (bs, k)
        square_sum = torch.stack([torch.square(item) for item in inter_list], dim=0).sum(dim=0)  # (bs, k)
        inter = 0.5 * torch.sum(sum_square - square_sum, dim=1, keepdim=True)  # (bs, 1)
        logit = self.sigmoid(linear + inter)  # (bs, 1)
        return logit
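A minimal smoke test of the model on made-up field sizes (purely illustrative values):

import torch

vocabs_size = [10, 2, 7, 21, 100, 50]  # hypothetical category counts, one per feature
model = FM(n=len(vocabs_size), k=16, vocabs_size=vocabs_size)
x = torch.stack([torch.randint(0, v, (4,)) for v in vocabs_size], dim=1)  # (4, 6) batch of ids
print(model(x).shape)  # torch.Size([4, 1]); outputs are probabilities in (0, 1)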

Training & testing

import torch
import numpy as np
from torch.utils.data import DataLoader
from sklearn import metrics

class Train(object):
    def __init__(self, model):
        self.batch_size = 512
        self.epochs = 20
        self._model = model
        self._optimizer = torch.optim.Adam(self._model.parameters(), lr=0.0005, weight_decay=1e-3)
        self._loss_func = torch.nn.BCELoss()
    
    def train(self, train_dataset, test_dataset):
        train_data_loader = DataLoader(dataset=train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2, drop_last=True)
        test_data_loader = DataLoader(dataset=test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=2)
        if self._use_cuda():
            self._model.cuda()
        for epoch in range(self.epochs):
            train_loss = self._train_an_epoch(train_data_loader)
            test_loss, auc = self._test_an_epoch(test_data_loader)
            print('epoch:%2d/%d train_loss:%f test_loss:%f auc:%.3f' % (epoch + 1, self.epochs, train_loss, test_loss, auc))

    # train for one epoch
    def _train_an_epoch(self, data_loader):
        self._model.train()  # switch back to training mode (the test pass sets eval mode)
        running_loss = 0.0
        for batch, (x, y) in enumerate(data_loader):
            if self._use_cuda():
                x, y = x.cuda(), y.cuda()
            loss = self._train_single_batch(x, y)
            running_loss += loss
        return running_loss / (batch + 1)

    # train on a single batch
    def _train_single_batch(self, x, y):
        self._optimizer.zero_grad()
        y_predict = self._model(x)
        loss = self._loss_func(y_predict.squeeze(1), y.float())
        loss.backward()
        self._optimizer.step()
        return loss.item()

    # evaluate on the test set
    def _test_an_epoch(self, test_data_loader):
        self._model.eval()
        running_loss = 0.0
        ys = []
        y_predicts = []
        with torch.no_grad():
            for batch, (x, y) in enumerate(test_data_loader):
                if self._use_cuda():
                    x, y = x.cuda(), y.cuda()
                y_predict = self._model(x)
                loss = self._loss_func(y_predict.squeeze(1), y.float())
                running_loss += loss.item()

                ys = np.append(ys, y.cpu().numpy(), axis=0)
                y_predicts = np.append(y_predicts, y_predict.squeeze(1).cpu().numpy(), axis=0)
        epoch_mean_loss = running_loss / (batch + 1)
        auc = metrics.roc_auc_score(ys, y_predicts)
        return epoch_mean_loss, auc

    def _use_cuda(self):
        return torch.cuda.is_available()
    

Main file

from dataset.dataset import Dataset
from torch.utils.data import random_split
from model.fm import FM
from train.fm_train import Train
import os

if __name__ == "__main__":
    os.environ["CUDA_VISIBLE_DEVICES"] = "3"
    print('data loading...')
    dataset = Dataset()
    print('data loading finished')

    train_size = int(len(dataset) * 0.8)
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
    
    print('training...')
    _, n = dataset.get_shape()
    vocabs_size = dataset.get_vocabs_size()
    fm = FM(n, 16, vocabs_size)
    trainer = Train(model=fm)
    trainer.train(train_dataset, test_dataset)


Results

(screenshot of the per-epoch train loss, test loss, and AUC output)

Source code: myFM
