特征工程操作丰富而繁琐,记一下编写时的代码,以便日后重复调用
import pandas as pd
import numpy as np
import logging
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
# 训练模型文件名为 utils,训练函数为 base_train,详情见下一篇博文
from utils import base_train
1. 设置输出格式、logging 日志模块
实例:
# Render every float in numpy array output with exactly 3 decimal places
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
# Emit log records of level INFO and above on the root logger
logging.getLogger().setLevel(logging.INFO)
1.1 设置输出格式
① x: "{0:0.3f}".format(x)
format 部分
# -*- coding: cp936 -*-
## {0} picks the first argument ("age"); "^" CENTERS it in a 30-char field
## {1} picks the second argument ("name"); with no align flag, str defaults to left-aligned
print("{0:^30}\n{1:^30}\n{1:10}".format("age","name"))
0.3f 代表保留小数点后3位小数
对于 dic 中 float 的值处理
② formatter
对所有 float 数进行处理,只保留 3 位小数
np.set_printoptions(precision=None, threshold=None, edgeitems=None, linewidth=None, suppress=None, nanstr=None, infstr=None, formatter=None)
函数参数详细介绍可参考库内说明或者以下链接:
1.2 logging 日志模块
① logging.getLogger
返回指定名称的 logger;不传名称时返回根 logger(root logger)
② getLogger.setLevel(logging.INFO)
设定日志警告等级为 INFO
2. 读取、合并数据并输出数据
2.2 数据读取与合并
实例:(reduce_mem_usage 为数据压缩函数,不对数据进行任何变化,详情可参考 reduce_mem_usage)
读取三个文件
# Root directory of the raw competition data
txdir = "E:/ML/"
# reduce_mem_usage only downcasts dtypes to save RAM; values are unchanged
df_ad = pd.read_csv(txdir + "train_preliminary/ad.csv")
df_ad = reduce_mem_usage(df_ad)
df_click_log = pd.read_csv(txdir + "train_preliminary/click_log.csv")
df_click_log = reduce_mem_usage(df_click_log)
df_user = pd.read_csv(txdir + "train_preliminary/user.csv")
df_user = reduce_mem_usage(df_user)
2.3 空值处理
# The raw data marks missing values with the literal string '\N'; replace with 0
df_ad.loc[df_ad['product_id'] == '\\N', 'product_id'] = 0
df_ad.loc[df_ad['industry'] == '\\N', 'industry'] = 0
# Shift gender to a {0, 1} encoding for binary classification
# (presumably encoded 1/2 in the raw data — confirm against user.csv)
df_user['gender'] = df_user['gender'] - 1
将值为 \\N ,且竖轴为 product_id 和 industry 的值替换为 0
将性别设定为 0 和 1
2.4 数据集分割
# Hold out 20% of the users as a dev set; fixed seed for reproducibility
df_train, df_dev = train_test_split(df_user, test_size=0.2, random_state=2020)
五分之一作为验证集(dev set),用于训练过程中的评估
2.5 数据合并
将点击数据与广告数据按素材 id(creative_id)左连接合并:点击日志提供 user_id,广告表提供广告属性。(训练集中部分点击记录的广告属性信息缺失,左连接可保留全部点击记录;测试集最好同样处理)
# Attach ad attributes to every click, keyed on the creative id.
# BUG FIX: the original assigned to `f_click_log` (typo), so the merged ad
# columns never reached get_features(df_click_log) below.
df_click_log = df_click_log.merge(df_ad, how="left", on="creative_id")
2.6 列处理,生成自定义特征列
特征列处理函数
def get_features(df_click_log):
    """Build one row of aggregate features per user from the click log.

    For every ad attribute column, two per-user statistics are produced:
    'unique' (how many distinct values the user clicked) and 'count'
    (how many clicks in total). All feature frames are left-joined on user_id.
    """
    merged = pd.DataFrame()
    for stat_col in ('creative_id', 'ad_id', 'product_id',
                     'product_category', 'advertiser_id', 'industry'):
        for aggfunc in ('unique', 'count'):
            feat = agg_features(df_click_log, ['user_id'], stat_col, aggfunc)
            if merged.empty:
                # first feature frame seeds the result
                merged = feat
            else:
                merged = merged.merge(feat, how='left', on='user_id')
    return merged
def agg_features(df_click_log, groupby_cols, stat_col, aggfunc):
    """Group df_click_log by groupby_cols and aggregate stat_col.

    Parameters
    ----------
    df_click_log : pd.DataFrame
        Source frame containing both the group keys and the stat column.
    groupby_cols : str or list[str]
        Key column(s); a bare string is promoted to a one-element list.
    stat_col : str
        Column to aggregate.
    aggfunc : str
        One of: size, count, mean, unique, max, min, sum, std, median,
        skew, unique_mean, unique_var.

    Returns
    -------
    pd.DataFrame
        Key columns plus one feature column named
        "<keys>_<stat_col>_<aggfunc>".

    Raises
    ------
    ValueError
        For an unsupported aggfunc (was a bare Exception; ValueError is a
        subclass, so existing `except Exception` callers still work).
    """
    if isinstance(groupby_cols, str):  # was `type(...) == str`
        groupby_cols = [groupby_cols]
    data = df_click_log[groupby_cols + [stat_col]]
    grouped = data.groupby(groupby_cols)

    # Aggregations that pandas exposes directly under the same name.
    _SIMPLE = {"count", "mean", "max", "min", "sum", "std", "median", "skew"}

    if aggfunc == "size":
        tmp = grouped.size().reset_index()
    elif aggfunc == "unique":
        # number of distinct stat_col values per group
        tmp = grouped[stat_col].nunique().reset_index()
    elif aggfunc in _SIMPLE:
        tmp = grouped[stat_col].agg(aggfunc).reset_index()
    elif aggfunc == "unique_mean":
        # mean repetition count: average clicks per distinct value
        tmp = grouped[stat_col].apply(
            lambda s: np.mean(list(Counter(s).values()))).reset_index()
    elif aggfunc == "unique_var":
        # variance of the per-distinct-value click counts
        tmp = grouped[stat_col].apply(
            lambda s: np.var(list(Counter(s).values()))).reset_index()
    else:
        raise ValueError("aggfunc error: unsupported %r" % (aggfunc,))

    feat_name = '_'.join(groupby_cols) + "_" + stat_col + "_" + aggfunc
    tmp.columns = groupby_cols + [feat_name]
    print(feat_name)  # progress log, kept from the original
    return tmp
调用函数进行处理
# Build one row of aggregate features per user
df_feat = get_features(df_click_log)
2.7 根据关键值进行 训练集、测试集 与 自定义特征列合并
# Attach the per-user aggregate features to each split, keyed on user_id
df_train = df_train.merge(df_feat, how='left', on='user_id')
df_dev = df_dev.merge(df_feat, how='left', on='user_id')
2.8 确定 y 值(label),x 值(根据 user 的特征数据),测试集也同样处理
性别训练数据集
# Gender dataset: label is gender; drop both labels and the id from the inputs
y_train = df_train['gender']
x_train = df_train.drop(['gender', 'age', 'user_id'], axis=1)
y_dev = df_dev['gender']
x_dev = df_dev.drop(['gender', 'age', 'user_id'], axis=1)
年龄训练数据集
# Age dataset: label is age; the feature matrix is identical to the gender one
# NOTE(review): this rebinds the same x_train/y_train names as the gender split
# above — train the gender model before running this block, or rename these.
y_train = df_train['age']
x_train = df_train.drop(['gender', 'age', 'user_id'], axis=1)
y_dev = df_dev['age']
x_dev = df_dev.drop(['gender', 'age', 'user_id'], axis=1)
3. 训练数据
训练预测性别的模型 gbm_gender
# Train the gender classifier
# NOTE(review): as the snippets are ordered on the page, x_train/y_train were
# last rebound to the AGE data — run this right after the gender split above.
gbm_gender = base_train(x_train, y_train, x_dev, y_dev, job='classification')
训练预测年龄的模型 gbm_age
# Train the age model as a regression; its predictions are cast to int downstream
gbm_age = base_train(x_train, y_train, x_dev, y_dev, job='regression')
这里的训练函数就不多写,主要写后面的后处理环节
训练函数放在另一篇文章:数据处理(2.1)点击数据处理-lgb 训练实战
4. 数据后处理
4.1 读取、处理异常值、合并测试集
# Load the test-split ad and click tables
df_ad_test = pd.read_csv(txdir + "test/ad.csv")
df_click_log_test = pd.read_csv(txdir + "test/click_log.csv")
# Same '\N' missing-value cleanup as applied to the training data
df_ad_test.loc[df_ad_test['product_id'] == '\\N', 'product_id'] = 0
df_ad_test.loc[df_ad_test['industry'] == '\\N', 'industry'] = 0
# No `on=` given: pandas joins on the columns shared by both frames
# (presumably creative_id — confirm against the csv headers)
df_click_log_test = df_click_log_test.merge(df_ad_test, how='left')
4.2 进行与训练集一致的特征列处理,保持一致
# Rebuild exactly the same feature columns for the test users
df_feat_test = get_features(df_click_log_test)
# Cache the expensive aggregation result to disk
df_feat_test.to_csv("df_feat_test.csv",index=False,encoding='utf-8-sig')
# Keep user_id for the submission frame; drop it from the model input
df_res = df_feat_test[['user_id']]
df_test = df_feat_test.drop(['user_id'], axis=1)
4.3 预测
# Predict on the test features (column order must match training)
pre_age = gbm_age.predict(df_test)
pre_gender = gbm_gender.predict(df_test)
4.4 配置输出结果,调整为与预期一致的格式,并输出为 csv 文件
# Assemble the submission frame: raw model outputs first
df_res['predicted_age'] = pre_age
df_res['predicted_gender'] = pre_gender
# Threshold the gender probability back to the competition's {1, 2} labels
df_res.loc[df_res['predicted_gender'] >= 0.5, 'predicted_gender'] = 2
df_res.loc[df_res['predicted_gender'] < 0.5, 'predicted_gender'] = 1
# Truncate the regression output to an integer age.
# BUG FIX: the original read/wrote df_ad['predicted_age'], but the prediction
# column lives on df_res (df_ad never had it — this line would raise KeyError).
df_res['predicted_age'] = df_res['predicted_age'].apply(int)
df_res.to_csv("submission.csv", index=False,encoding='utf-8-sig')
完结