深度推荐模型包DeepCTR

DeepCTR包主要是对目前的一些“基于深度学习的点击率预测算法”进行了实现,官方文档参考
本文主要记录DeepFM算法的相关操作细节。

实验数据

prefix:用户输入(query前缀)
query_prediction:预测的用户完整需求查询词,最多10条;预测的查询词可能是前缀本身,数字为统计概率
title:文章标题
tag:文章类型
label:是否点击0/1

import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

train_data = pd.read_table('./data/oppo_round1_train_20180929.txt', 
        names= ['prefix','query_prediction','title','tag','label'], header= None, encoding='utf-8').astype(str)
print(train_data.shape)
val_data = pd.read_table('./data/oppo_round1_vali_20180929.txt', 
        names = ['prefix','query_prediction','title','tag','label'], header = None, encoding='utf-8').astype(str)
print(val_data.shape)

# O:(1999999, 5)
# O:(50000, 5)
vctr = train_data['label'].value_counts()   #可以看到标签异常值和样本不均衡
vcvl = val_data['label'].value_counts() 
vctr[0]/vctr[1], vcvl[0]/vcvl[1]

# O:(1.6875350690893836, 1.690341673392521)
train_data = train_data[train_data['label'] != '音乐' ]

train_data = pd.concat([train_data,val_data])   #把train和val集合拼接在一起
train_data['label'] = train_data['label'].apply(lambda x: int(x))
items = ['prefix', 'title', 'tag']   #query前缀、文章标题、文章类型

for item in items:   #分别按各item列分组 按label取值0/1 统计搜索次数、点击次数、点击率
    temp = train_data.groupby(item, as_index = False)['label'].agg({item+'_click':'sum', item+'_count':'count'})
    temp[item+'_ctr'] = temp[item+'_click']/(temp[item+'_count'])
    train_data = pd.merge(train_data, temp, on=item, how='left')
for i in range(len(items)):
    for j in range(i+1, len(items)):
        item_g = [items[i], items[j]]   #三选二,作为分组指标
        temp = train_data.groupby(item_g, as_index=False)['label'].agg({'_'.join(item_g)+'_click': 'sum','_'.join(item_g)+'count':'count'})
        temp['_'.join(item_g)+'_ctr'] = temp['_'.join(item_g)+'_click']/(temp['_'.join(item_g)+'count']+3)
        train_data = pd.merge(train_data, temp, on=item_g, how='left')
from sklearn import preprocessing
import time

# 转换tag
encoder = preprocessing.LabelEncoder()
train_data['tag'] = encoder.fit_transform(train_data['tag'])
encoder = preprocessing.LabelEncoder()
train_data['prefix'] = encoder.fit_transform(train_data['prefix'])
encoder = preprocessing.LabelEncoder()
train_data['title'] = encoder.fit_transform(train_data['title'])
train_data_ = train_data.drop(['query_prediction'], axis = 1)

dense_features = ['prefix_click', 'prefix_count', 'prefix_ctr', 
                   'title_click', 'title_count', 'title_ctr',
                   'tag_click', 'tag_count', 'tag_ctr', 
                   'prefix_title_click', 'prefix_titlecount', 'prefix_title_ctr', 
                   'prefix_tag_click', 'prefix_tagcount', 'prefix_tag_ctr', 
                   'title_tag_click', 'title_tagcount', 'title_tag_ctr']   #类别型特征
sparse_features = ['prefix', 'title', 'tag']   #数值型特征
train_data_[sparse_features] = train_data_[sparse_features].fillna('-1', )
train_data_[dense_features] = train_data_[dense_features].fillna(0, )

target = ['label']
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

for feat in sparse_features:
    lbe = LabelEncoder()
    train_data_[feat] = lbe.fit_transform(train_data_[feat])   #将类别转化成id编码
    
mms = MinMaxScaler(feature_range=(0, 1))
train_data_[dense_features] = mms.fit_transform(train_data_[dense_features])
train_data_.head()

在这里插入图片描述

from deepctr.models import DeepFM
from deepctr.inputs import  SparseFeat, DenseFeat, get_feature_names

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=train_data_[feat].nunique(),embedding_dim=4)
                       for i,feat in enumerate(sparse_features)] + [DenseFeat(feat, 1,)
                      for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns   #类别型特征DNN
linear_feature_columns = fixlen_feature_columns   #连续型特征线性FM

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
train, test = train_test_split(train_data_, test_size=0.2)
train_model_input = {name:train[name] for name in feature_names}   #每项index:该列取值
test_model_input = {name:test[name] for name in feature_names}
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'], )

history = model.fit(train_model_input, train[target].values,
                    batch_size=256, epochs=10, verbose=2, validation_split=0.2, )

在这里插入图片描述

from sklearn.metrics import f1_score

pred_ans = model.predict(test_model_input, batch_size=256)
f1_score(test[target].values, np.where(pred_ans>0.5, 1, 0))

# O:0.7944898855019384
特征

类别特征embedding:(1)FM二阶项(2)DNN网络
连续值特征:FM一阶项

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值