- 简述了模型部分的各个操作
- 对代码进行了结果展示
- 向量的特征交叉测试(FM部分)
1.模型部分
- 数值型特征的输入层
- 类别型特征的输入层
- 数值型特征不用Embedding
- 类别型特征的Embedding
- 与linear部分结合 - Embedding为1维->Flatten->Add->与Dense后的数值型Add()
- FM部分:类别型特征的特征交叉 - 不需要Flatten
- deep: Embedding为多维(4维),然后特征交叉
- 一阶特征相加
- 数值型特征的concat ->Dense + 与类别型特征的add后的add
- fm二阶交叉
- 文章最后自己测试了下特征交叉(即FM二阶项的恒等式 ((Σa)² − Σa²)/2 ) – 主要是掌握
- 个人认为可以用layers.Dot()层实现特征交叉,水平有限,有待尝试
- deep部分
- 多维EmbeddingFlatten后->拼接concat->多层神经网络迭代
- 三合一
- Add()
2.代码展示
# Imports
import warnings
warnings.filterwarnings("ignore")  # silence all library warnings for cleaner notebook output
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
# from utils import SparseFeat, DenseFeat, VarLenSparseFeat
# Load the data: a 200-row sample of the Criteo CTR dataset
# (label + dense columns I1-I13 + categorical columns C1-C26).
data = pd.read_csv('./data/criteo_sample.txt')
data.head()
label | I1 | I2 | I3 | I4 | I5 | I6 | I7 | I8 | I9 | ... | C17 | C18 | C19 | C20 | C21 | C22 | C23 | C24 | C25 | C26 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | NaN | 3 | 260.0 | NaN | 17668.0 | NaN | NaN | 33.0 | NaN | ... | e5ba7672 | 87c6f83c | NaN | NaN | 0429f84b | NaN | 3a171ecb | c0d61a5c | NaN | NaN |
1 | 0 | NaN | -1 | 19.0 | 35.0 | 30251.0 | 247.0 | 1.0 | 35.0 | 160.0 | ... | d4bb7bd8 | 6fc84bfb | NaN | NaN | 5155d8a3 | NaN | be7c41b4 | ded4aac9 | NaN | NaN |
2 | 0 | 0.0 | 0 | 2.0 | 12.0 | 2013.0 | 164.0 | 6.0 | 35.0 | 523.0 | ... | e5ba7672 | 675c9258 | NaN | NaN | 2e01979f | NaN | bcdee96c | 6d5d1302 | NaN | NaN |
3 | 0 | NaN | 13 | 1.0 | 4.0 | 16836.0 | 200.0 | 5.0 | 4.0 | 29.0 | ... | e5ba7672 | 52e44668 | NaN | NaN | e587c466 | NaN | 32c7478e | 3b183c5c | NaN | NaN |
4 | 0 | 0.0 | 0 | 104.0 | 27.0 | 1990.0 | 142.0 | 4.0 | 32.0 | 37.0 | ... | e5ba7672 | 25c88e42 | 21ddcdc9 | b1252a9d | 0e8585d2 | NaN | 32c7478e | 0d4a6d1a | 001f3601 | 92c878de |
5 rows × 40 columns
# Split the columns into dense (numeric, I1-I13) and sparse (categorical,
# C1-C26) feature name lists.
# Use startswith rather than substring membership: the original
# `'I' in f` / `'C' in f` tests would misclassify any column that merely
# contains those letters anywhere in its name.
columns = data.columns.values
dense_features = [f for f in columns if f.startswith('I')]
sparse_features = [f for f in columns if f.startswith('C')]
columns
array(['label', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9',
'I10', 'I11', 'I12', 'I13', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6',
'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16',
'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25',
'C26'], dtype=object)
dense_features
['I1',
'I2',
'I3',
'I4',
'I5',
'I6',
'I7',
'I8',
'I9',
'I10',
'I11',
'I12',
'I13']
# Preprocess dense and sparse feature columns.
def data_process(data_df, dense_features, sparse_features):
    """
    Preprocess the raw Criteo frame in place and return the feature columns.

    Bug fix: the original signature misspelled the second parameter as
    `dense_festures` and then read the module-level `dense_features` /
    `sparse_features` globals instead of its own parameters, so the function
    silently ignored what the caller passed in. It now uses only its
    parameters.

    Parameters:
        data_df: DataFrame holding the raw columns (mutated in place).
        dense_features: names of the numeric columns.
        sparse_features: names of the categorical columns.

    Returns:
        data_df restricted to dense_features + sparse_features
        (any other columns, e.g. the label, are dropped from the result).
    """
    # Dense: fill missing values with 0, then smooth with log(x+1);
    # values at or below -1 (where log(x+1) is undefined) are clamped to -1.
    data_df[dense_features] = data_df[dense_features].fillna(0.0)
    for f in dense_features:
        data_df[f] = data_df[f].apply(lambda x: np.log(x + 1) if x > -1 else -1)
    # Sparse: fill missing values with the sentinel string '-1', then map each
    # column's categories to integer codes. pd.factorize(..., sort=True)
    # yields the same codes as sklearn's LabelEncoder (indices into the
    # sorted unique values) without needing sklearn here.
    data_df[sparse_features] = data_df[sparse_features].fillna('-1')
    for f in sparse_features:
        data_df[f] = pd.factorize(data_df[f], sort=True)[0]
    return data_df[dense_features + sparse_features]
# Apply preprocessing; the returned frame contains only the feature columns.
train_data = data_process(data,dense_features,sparse_features)
train_data
I1 | I2 | I3 | I4 | I5 | I6 | I7 | I8 | I9 | I10 | ... | C17 | C18 | C19 | C20 | C21 | C22 | C23 | C24 | C25 | C26 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 1.386294 | 5.564520 | 0.000000 | 9.779567 | 0.000000 | 0.000000 | 3.526361 | 0.000000 | 0.000000 | ... | 8 | 66 | 0 | 0 | 3 | 0 | 1 | 96 | 0 | 0 |
1 | 0.000000 | -1.000000 | 2.995732 | 3.583519 | 10.317318 | 5.513429 | 0.693147 | 3.583519 | 5.081404 | 0.000000 | ... | 7 | 52 | 0 | 0 | 47 | 0 | 7 | 112 | 0 | 0 |
2 | 0.000000 | 0.000000 | 1.098612 | 2.564949 | 7.607878 | 5.105945 | 1.945910 | 3.583519 | 6.261492 | 0.000000 | ... | 8 | 49 | 0 | 0 | 25 | 0 | 6 | 53 | 0 | 0 |
3 | 0.000000 | 2.639057 | 0.693147 | 1.609438 | 9.731334 | 5.303305 | 1.791759 | 1.609438 | 3.401197 | 0.000000 | ... | 8 | 37 | 0 | 0 | 156 | 0 | 0 | 32 | 0 | 0 |
4 | 0.000000 | 0.000000 | 4.653960 | 3.332205 | 7.596392 | 4.962845 | 1.609438 | 3.496508 | 3.637586 | 0.000000 | ... | 8 | 14 | 5 | 3 | 9 | 0 | 0 | 5 | 1 | 47 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
195 | 0.000000 | 0.000000 | 4.736198 | 1.386294 | 8.018625 | 6.356108 | 1.098612 | 1.386294 | 5.370638 | 0.000000 | ... | 0 | 74 | 5 | 1 | 30 | 5 | 0 | 118 | 17 | 48 |
196 | 0.000000 | 0.693147 | 0.693147 | 0.693147 | 7.382746 | 2.564949 | 0.693147 | 2.564949 | 2.772589 | 0.000000 | ... | 1 | 25 | 0 | 0 | 138 | 0 | 0 | 68 | 0 | 0 |
197 | 0.693147 | 0.000000 | 1.945910 | 1.386294 | 0.000000 | 0.000000 | 2.995732 | 1.386294 | 1.386294 | 0.693147 | ... | 4 | 40 | 17 | 2 | 41 | 0 | 0 | 12 | 16 | 11 |
198 | 0.000000 | 3.135494 | 1.945910 | 3.135494 | 5.318120 | 5.036953 | 4.394449 | 2.944439 | 6.232448 | 0.000000 | ... | 4 | 7 | 18 | 1 | 123 | 0 | 0 | 10 | 16 | 49 |
199 | 0.693147 | -1.000000 | 0.000000 | 0.000000 | 4.934474 | 0.000000 | 0.693147 | 0.000000 | 0.000000 | 0.693147 | ... | 7 | 72 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
200 rows × 39 columns
# Re-attach the target column, which data_process dropped from its return value.
train_data['label'] = data['label']
train_data
I1 | I2 | I3 | I4 | I5 | I6 | I7 | I8 | I9 | I10 | ... | C18 | C19 | C20 | C21 | C22 | C23 | C24 | C25 | C26 | label | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 1.386294 | 5.564520 | 0.000000 | 9.779567 | 0.000000 | 0.000000 | 3.526361 | 0.000000 | 0.000000 | ... | 66 | 0 | 0 | 3 | 0 | 1 | 96 | 0 | 0 | 0 |
1 | 0.000000 | -1.000000 | 2.995732 | 3.583519 | 10.317318 | 5.513429 | 0.693147 | 3.583519 | 5.081404 | 0.000000 | ... | 52 | 0 | 0 | 47 | 0 | 7 | 112 | 0 | 0 | 0 |
2 | 0.000000 | 0.000000 | 1.098612 | 2.564949 | 7.607878 | 5.105945 | 1.945910 | 3.583519 | 6.261492 | 0.000000 | ... | 49 | 0 | 0 | 25 | 0 | 6 | 53 | 0 | 0 | 0 |
3 | 0.000000 | 2.639057 | 0.693147 | 1.609438 | 9.731334 | 5.303305 | 1.791759 | 1.609438 | 3.401197 | 0.000000 | ... | 37 | 0 | 0 | 156 | 0 | 0 | 32 | 0 | 0 | 0 |
4 | 0.000000 | 0.000000 | 4.653960 | 3.332205 | 7.596392 | 4.962845 | 1.609438 | 3.496508 | 3.637586 | 0.000000 | ... | 14 | 5 | 3 | 9 | 0 | 0 | 5 | 1 | 47 | 0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
195 | 0.000000 | 0.000000 | 4.736198 | 1.386294 | 8.018625 | 6.356108 | 1.098612 | 1.386294 | 5.370638 | 0.000000 | ... | 74 | 5 | 1 | 30 | 5 | 0 | 118 | 17 | 48 | 0 |
196 | 0.000000 | 0.693147 | 0.693147 | 0.693147 | 7.382746 | 2.564949 | 0.693147 | 2.564949 | 2.772589 | 0.000000 | ... | 25 | 0 | 0 | 138 | 0 | 0 | 68 | 0 | 0 | 1 |
197 | 0.693147 | 0.000000 | 1.945910 | 1.386294 | 0.000000 | 0.000000 | 2.995732 | 1.386294 | 1.386294 | 0.693147 | ... | 40 | 17 | 2 | 41 | 0 | 0 | 12 | 16 | 11 | 1 |
198 | 0.000000 | 3.135494 | 1.945910 | 3.135494 | 5.318120 | 5.036953 | 4.394449 | 2.944439 | 6.232448 | 0.000000 | ... | 7 | 18 | 1 | 123 | 0 | 0 | 10 | 16 | 49 | 0 |
199 | 0.693147 | -1.000000 | 0.000000 | 0.000000 | 4.934474 | 0.000000 | 0.693147 | 0.000000 | 0.000000 | 0.693147 | ... | 72 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
200 rows × 40 columns
# Sanity-check dtypes: 13 float64 dense columns, 26 int sparse columns, plus the int label.
train_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 40 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 I1 200 non-null float64
1 I2 200 non-null float64
2 I3 200 non-null float64
3 I4 200 non-null float64
4 I5 200 non-null float64
5 I6 200 non-null float64
6 I7 200 non-null float64
7 I8 200 non-null float64
8 I9 200 non-null float64
9 I10 200 non-null float64
10 I11 200 non-null float64
11 I12 200 non-null float64
12 I13 200 non-null float64
13 C1 200 non-null int32
14 C2 200 non-null int32
15 C3 200 non-null int32
16 C4 200 non-null int32
17 C5 200 non-null int32
18 C6 200 non-null int32
19 C7 200 non-null int32
20 C8 200 non-null int32
21 C9 200 non-null int32
22 C10 200 non-null int32
23 C11 200 non-null int32
24 C12 200 non-null int32
25 C13 200 non-null int32
26 C14 200 non-null int32
27 C15 200 non-null int32
28 C16 200 non-null int32
29 C17 200 non-null int32
30 C18 200 non-null int32
31 C19 200 non-null int32
32 C20 200 non-null int32
33 C21 200 non-null int32
34 C22 200 non-null int32
35 C23 200 non-null int32
36 C24 200 non-null int32
37 C25 200 non-null int32
38 C26 200 non-null int32
39 label 200 non-null int64
dtypes: float64(13), int32(26), int64(1)
memory usage: 42.3 KB
模型的创建
# 1. Input layer for the dense features: one (None, 1) Input per numeric
#    column, each named after the column it feeds.
dense_inputs = [Input([1], name=feat) for feat in dense_features]
dense_inputs
[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I1')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I2')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I3')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I4')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I5')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I6')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I7')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I8')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I9')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I10')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I11')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I12')>,
<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'I13')>]
# 2. 类别型特征的输入层
sparse_inputs=[]
for f in sparse_features:
_input = Input([