面试项目1:基于电商评论数据集的多模态混合推荐系统
推荐之前:基于某旅游攻略网站的文章用户画像
混合推荐系统:推荐系统的 离线与在线、召回与排序
1、数据格式介绍
字段 | 描述 |
---|---|
cid | 衣服ID |
age | 年龄(18-99) |
review_text | 评论 |
rating | 分数(1-5) |
recommended | 是否推荐(0、1) |
positive_count | 支持这条评论的人数 |
category_name | 一级分类名(均码、小码、内衣) |
new_class_name | 二级分类名+三级分类名(特征组合) |
2、模型更小点、速度更快点:FTRL
2.1 特征交叉
# Wide-part feature crosses: age bucket x category id.
# NOTE: crossed_column accepts raw feature keys (strings) or categorical
# columns -- numeric_column objects are rejected with a ValueError, so the
# integer id features are crossed via their string feature keys instead.
cross_columns = [
    tf.feature_column.crossed_column([age_bucket, 'category_name2num'], hash_bucket_size=16),
    tf.feature_column.crossed_column([age_bucket, 'class_name2num'], hash_bucket_size=128)
]
2.2 正则化
linear_optimizer=tf.train.FtrlOptimizer(0.1, l2_regularization_strength=1.0)
3、数据分析&特征工程
3.1 去空值
feature_nona = tmp.dropna(axis=0)
3.2 去重复
feature_nona_duplicated = feature_nona[~feature_nona['Review Text'].duplicated()]
3.3 连续值离散化
import matplotlib.pyplot as plt

# Plot the age distribution to choose bucket boundaries for discretization.
fig, axis = plt.subplots()
feature_nona_duplicated['age'].hist(bins=17, color='#A9C5D3')
axis.set_xticks(list(range(15, 100, 5)))  # a tick every 5 years, 15-95
axis.set_xlabel('Age')
axis.set_ylabel('Frequency')
axis.set_title('Age bins')
plt.show()
3.4 分类转哑变量
dummy_category_name = pd.get_dummies(feature_nona_duplicated['category_name'])
3.5 假设性检验
# Point plot of recommendation rate per age bucket, split by the
# 'General Petite' dummy column. sns.factorplot was renamed to catplot
# (seaborn 0.9) and later removed, and positional x/y args are no longer
# accepted -- use keyword arguments.
sns.catplot(x='age_bins', y='recommended', hue='General Petite',
            kind='point', data=feature_dummies_category_name)
plt.show()
3.6 相关性分析
4、拥抱深度学习:Wide&Deep
4.1 tf.data
def make_dataset(data_file, re_time, shuffle, batch_size, predict):
    """Build a tf.data input pipeline from the review CSV.

    Args:
        data_file: path to a CSV file whose first line is a header row.
        re_time: number of times to repeat the dataset (epochs).
        shuffle: whether to shuffle lines before parsing.
        batch_size: number of examples per batch.
        predict: if True, parse unlabeled rows (no 'recommended' column).

    Returns:
        A batched tf.data.Dataset yielding (features, label) pairs, or
        plain feature dicts when predict=True.
    """
    # Column order must match the CSV header; each default fixes the dtype.
    _CSV_COLUMN_DEFAULTS = [[0], [0], [''], [0], [0], [''], [''], [0], [0], [0]]
    _CSV_COLUMNS = [
        'cid', 'age', 'review_text', 'rating', 'positive_count', 'category_name',
        'new_class_name', 'category_name2num', 'class_name2num', 'recommended'
    ]

    def parse_csv(line):
        # use_quote_delim=False: quote characters are treated literally.
        columns = tf.io.decode_csv(line, use_quote_delim=False,
                                   record_defaults=_CSV_COLUMN_DEFAULTS)
        features = dict(zip(_CSV_COLUMNS, columns))
        labels = features.pop('recommended')
        return features, labels

    def parse_csv_predict(line):
        # Fix: this parser was referenced but never defined, so predict=True
        # raised NameError. Serving rows carry no trailing 'recommended'
        # column -- TODO(review): confirm the serving CSV schema is exactly
        # the training schema minus the label.
        columns = tf.io.decode_csv(line, use_quote_delim=False,
                                   record_defaults=_CSV_COLUMN_DEFAULTS[:-1])
        return dict(zip(_CSV_COLUMNS[:-1], columns))

    dataset = tf.data.TextLineDataset(data_file).skip(1)  # skip the header row
    if shuffle:
        # Buffer covers the full cleaned dataset for a uniform shuffle.
        dataset = dataset.shuffle(buffer_size=12723)
    if predict:
        dataset = dataset.map(parse_csv_predict, num_parallel_calls=5)
    else:
        dataset = dataset.map(parse_csv, num_parallel_calls=5)
    dataset = dataset.repeat(re_time)
    dataset = dataset.batch(batch_size)
    return dataset
4.2 tf.feature_column
def make_feature_column():
    """Define the wide (linear) and deep (DNN) feature columns.

    Returns:
        A (wide_columns, deep_columns) tuple for DNNLinearCombinedClassifier.
    """
    age = tf.feature_column.numeric_column("age")
    # Bucket boundaries chosen from the age histogram in the EDA step.
    age_bucket = tf.feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 100])
    category_name2num = tf.feature_column.numeric_column("category_name2num")
    class_name2num = tf.feature_column.numeric_column("class_name2num")

    # NOTE: 'Initmates' is presumably the dataset's own spelling of the
    # division name -- do not "fix" it without checking the raw data.
    category_name_cat = tf.feature_column.categorical_column_with_vocabulary_list(
        'category_name',['Initmates','General','General Petite'])
    class_name_cat = tf.feature_column.categorical_column_with_vocabulary_list(
        'new_class_name',['Intimate-Intimates','Dresses-Dresses','Bottoms-Pants','Tops-Blouses',
                          'Tops-Knits','Jackets-Outerwear','Intimate-Lounge','Tops-Sweaters',
                          'Bottoms-Skirts','Tops-Fine gauge','Intimate-Sleep','Jackets-Jackets',
                          'Intimate-Swim','Trend-Trend','Bottoms-Jeans','Intimate-Legwear',
                          'Bottoms-Shorts','Intimate-Layering','Bottoms-Casual bottoms',
                          'Intimate-Chemises'])

    # Fix: crossed_column accepts raw feature keys (strings) or categorical
    # columns, not numeric_column objects -- cross via the string keys.
    cross_columns = [
        tf.feature_column.crossed_column([age_bucket, 'category_name2num'], hash_bucket_size=16),
        tf.feature_column.crossed_column([age_bucket, 'class_name2num'], hash_bucket_size=128)
    ]
    wide_columns = [age_bucket, category_name2num, class_name2num] + cross_columns

    # Fix: embedding_column must wrap a *categorical* column (wrapping an
    # indicator_column raises) and requires an explicit embedding dimension.
    # Dimensions sized roughly to the vocabulary (3 and 20 classes).
    deep_columns = [
        age,
        tf.feature_column.embedding_column(category_name_cat, dimension=2),
        tf.feature_column.embedding_column(class_name_cat, dimension=8)
    ]
    return wide_columns, deep_columns
4.3 tf.estimator
def make_estimator(wide_column, deep_column):
    """Assemble the Wide&Deep model: FTRL linear part + ProximalAdagrad DNN."""
    # Wide part: FTRL with L2 keeps the linear weights small.
    ftrl = tf.train.FtrlOptimizer(0.1, l2_regularization_strength=1.0)
    # Deep part: ProximalAdagrad with both L1 and L2 regularization.
    adagrad = tf.train.ProximalAdagradOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=0.001,
        l2_regularization_strength=0.001)
    return tf.estimator.DNNLinearCombinedClassifier(
        linear_feature_columns=wide_column,
        linear_optimizer=ftrl,
        dnn_feature_columns=deep_column,
        dnn_optimizer=adagrad,
        dnn_hidden_units=[128, 64, 32, 16])
5、优化点(评论极性分析)
# --- Review-polarity preprocessing ---------------------------------------
# Fixed sequence length for the Conv1D/LSTM model below.
maxlen = 500
# Keep only the 10k most frequent tokens; rarer words are dropped.
max_words = 10000
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=max_words)
# Build the vocabulary from the cleaned, de-duplicated review texts.
tokenizer.fit_on_texts(feature_nona_duplicated.review_text)
sequences = tokenizer.texts_to_sequences(feature_nona_duplicated.review_text)
# X: integer matrix of shape (num_reviews, maxlen), padded/truncated
# using pad_sequences defaults ('pre' padding and truncation).
X = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
def make_classes(label):
    """Binarize a 1-5 star rating: only a 5-star review counts as positive.

    Args:
        label: integer rating, expected in [1, 5].

    Returns:
        1 if the rating is exactly 5, otherwise 0.
    """
    return int(label == 5)
y = feature_nona_duplicated['rating'].apply(make_classes).values
# --- CNN + LSTM polarity classifier --------------------------------------
embedding_dim = 16
batch_size = 128

# Embed tokens, extract local n-gram features with Conv1D, pool and drop,
# summarize the sequence with an LSTM, and emit a sigmoid polarity score.
model = keras.models.Sequential()
model.add(keras.layers.Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(keras.layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(keras.layers.MaxPool1D(pool_size=2))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.LSTM(units=100))
model.add(keras.layers.Dense(1, activation='sigmoid'))
model.summary()

# Binary cross-entropy matches the 5-star-vs-rest target in y.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# NOTE(review): bare `keras` is used here while earlier code uses tf.keras --
# presumably `from tensorflow import keras` was run elsewhere; confirm.
history = model.fit(X, y, epochs=2, batch_size=batch_size, validation_split=0.2)