Data Visualization
Data visualization is a common way to explore a dataset; here we use a correlation heatmap and bar plots.
Heatmap analysis
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Read the training and test set files
train_data = pd.read_csv('用户新增预测挑战赛公开数据/train.csv')
test_data = pd.read_csv('用户新增预测挑战赛公开数据/test.csv')
# common_ts is a millisecond Unix timestamp; convert it so the .dt accessors used below work
train_data['common_ts'] = pd.to_datetime(train_data['common_ts'], unit='ms')
test_data['common_ts'] = pd.to_datetime(test_data['common_ts'], unit='ms')
# Correlation heatmap (numeric_only avoids errors on string columns such as udmap)
sns.heatmap(train_data.corr(numeric_only=True).abs(), cmap='YlOrRd')
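If the map is hard to read at the default size, a larger annotated variant can help; a minimal sketch (the figsize and number format are arbitrary choices, not from the original run):
# Optional: larger figure with the correlation values written into each cell
plt.figure(figsize=(10, 8))
sns.heatmap(train_data.corr(numeric_only=True).abs(), cmap='YlOrRd', annot=True, fmt='.2f')
plt.show()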
The xx features (x1–x8)
# Bar plots of target grouped by each xx feature
fig, axs = plt.subplots(2, 4, figsize=(16, 8))
for i in range(8):
    sns.barplot(x=f'x{i+1}', y='target', data=train_data, ax=axs[i//4, i%4])
From these plots it is clear that x1, x2, x6, x7, and x8 are categorical features; next, check the remaining three.
# Number of distinct values in x3, x4, x5
print(len(train_data.x3.value_counts()))
print(len(train_data.x4.value_counts()))
print(len(train_data.x5.value_counts()))
'''
72
138
408
'''
With 72, 138, and 408 distinct values, these look categorical as well, so one-hot encode all eight directly.
xx_onehot = pd.get_dummies(train_data, columns = ['x1','x2','x3','x4','x5','x6','x7','x8'])
xx_onehot.head()
common_ts
Extract the hour:
train_data['common_ts_hour'] = train_data['common_ts'].dt.hour
test_data['common_ts_hour'] = test_data['common_ts'].dt.hour
Plot it:
sns.barplot(x='common_ts_hour', y='target', data=train_data)
The hourly plot shows no particularly distinctive pattern, so in the same way also look at the day of the month (year and month carry no information here, since every record is from 2023-07).
train_data['common_ts_day'] = train_data['common_ts'].dt.day
test_data['common_ts_day'] = test_data['common_ts'].dt.day
sns.barplot(x='common_ts_day', y='target', data=train_data)
udmap
First encode it the way the baseline does:
# Expand the udmap string into nine key1..key9 values (0 when a key is absent)
def udmap_onethot(d):
    v = np.zeros(9)
    if d == 'unknown':
        return v
    d = eval(d)  # udmap is a dict literal in string form
    for i in range(1, 10):
        if 'key' + str(i) in d:
            v[i-1] = d['key' + str(i)]
    return v
train_udmap_df = pd.DataFrame(np.vstack(train_data['udmap'].apply(udmap_onethot))).astype('int')
test_udmap_df = pd.DataFrame(np.vstack(test_data['udmap'].apply(udmap_onethot))).astype('int')
train_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
test_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
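A side note: eval executes arbitrary code, so if the udmap strings are valid JSON (an assumption, not verified here), json.loads is a safer drop-in. A minimal sketch; udmap_onehot_json is a hypothetical name, not part of the baseline:
import json

def udmap_onehot_json(d):
    # Hypothetical safer variant of udmap_onethot; assumes udmap strings parse as JSON
    v = np.zeros(9)
    if d == 'unknown':
        return v
    d = json.loads(d)
    for i in range(1, 10):
        v[i - 1] = d.get('key' + str(i), 0)
    return v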
Merge the new columns in and check the values:
train_data = pd.concat([train_data, train_udmap_df], axis=1)
test_data = pd.concat([test_data, test_udmap_df], axis=1)
# Number of distinct values in each key column
for i in range(1, 10):
    print(len(train_data['key' + str(i)].value_counts()))
'''
1424
1522
83370
20
20
1356
4
3
5
'''
key4, key5, key7, key8, and key9 have few distinct values, so they are good candidates for one-hot encoding (done in the Feature Engineering section below).
Cross-Validation
The cross-validation here runs on the dataset after applying the time-series extraction, xx encoding, and udmap encoding described under Feature Engineering below.
# Import cross-validation and evaluation metrics
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
# Import models
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
# Train and validate SGDClassifier
pred = cross_val_predict(
    SGDClassifier(max_iter=10),
    train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1),
    train_data['target']
)
print(classification_report(train_data['target'], pred, digits=3))
# Train and validate DecisionTreeClassifier
pred = cross_val_predict(
    DecisionTreeClassifier(),
    train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1),
    train_data['target']
)
print(classification_report(train_data['target'], pred, digits=3))
# Train and validate MultinomialNB
pred = cross_val_predict(
    MultinomialNB(),
    train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1),
    train_data['target']
)
print(classification_report(train_data['target'], pred, digits=3))
# Train and validate RandomForestClassifier
pred = cross_val_predict(
    RandomForestClassifier(n_estimators=5),
    train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1),
    train_data['target']
)
print(classification_report(train_data['target'], pred, digits=3))
Results
# SGD
precision recall f1-score support
0 0.870 0.733 0.796 533155
1 0.169 0.333 0.224 87201
accuracy 0.677 620356
macro avg 0.520 0.533 0.510 620356
weighted avg 0.772 0.677 0.716 620356
# Decision Tree
precision recall f1-score support
0 0.947 0.943 0.945 533155
1 0.659 0.674 0.667 87201
accuracy 0.905 620356
macro avg 0.803 0.809 0.806 620356
weighted avg 0.906 0.905 0.906 620356
# MultinomialNB
precision recall f1-score support
0 0.902 0.419 0.572 533155
1 0.169 0.723 0.274 87201
accuracy 0.461 620356
macro avg 0.536 0.571 0.423 620356
weighted avg 0.799 0.461 0.530 620356
# Random Forest
precision recall f1-score support
0 0.928 0.976 0.951 533155
1 0.783 0.539 0.638 87201
accuracy 0.914 620356
macro avg 0.856 0.757 0.795 620356
weighted avg 0.908 0.914 0.907 620356
The decision tree has the best macro F1, while the random forest does better on weighted F1. Given the large class imbalance, the random forest is probably the more stable choice.
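For reference, macro F1 averages the per-class F1 scores equally, while weighted F1 weights them by class support, so with roughly 86% negatives the weighted score tracks the majority class. A toy illustration (the labels here are made up for the demo):
from sklearn.metrics import f1_score

y_true = [0] * 90 + [1] * 10            # imbalanced toy labels
y_pred = [0] * 90 + [1] * 2 + [0] * 8   # classifier that mostly misses the minority class

print(f1_score(y_true, y_pred, average='macro'))     # ~0.645, dragged down by the weak minority F1
print(f1_score(y_true, y_pred, average='weighted'))  # ~0.895, dominated by the majority class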
Let's try a few more models.
# XGBClassifier
from xgboost import XGBClassifier
pred = cross_val_predict(
    XGBClassifier(),
    train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1),
    train_data['target']
)
print(classification_report(train_data['target'], pred, digits=3))
precision recall f1-score support
0 0.889 0.988 0.936 533155
1 0.768 0.245 0.371 87201
accuracy 0.883 620356
macro avg 0.829 0.616 0.654 620356
weighted avg 0.872 0.883 0.856 620356
# CatBoostClassifier (note: the original used CatBoostRegressor, but a classifier is needed for classification_report)
from catboost import CatBoostClassifier
pred = cross_val_predict(
    CatBoostClassifier(iterations=10000, **params),  # params: CatBoost hyperparameter dict assumed defined earlier
    train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1),
    train_data['target']
)
print(classification_report(train_data['target'], pred, digits=3))
precision recall f1-score support
0 0.945 0.976 0.960 533155
1 0.819 0.649 0.724 87201
accuracy 0.931 620356
macro avg 0.882 0.813 0.842 620356
weighted avg 0.927 0.931 0.927 620356
Feature Engineering
Time-series extraction
train_data['common_ts_day'] = train_data['common_ts'].dt.day
test_data['common_ts_day'] = test_data['common_ts'].dt.day
train_data['common_ts_hour'] = train_data['common_ts'].dt.hour
test_data['common_ts_hour'] = test_data['common_ts'].dt.hour
train_data['common_ts_min'] = train_data['common_ts'].dt.minute
test_data['common_ts_min'] = test_data['common_ts'].dt.minute
train_data['common_ts_sec'] = train_data['common_ts'].dt.second
test_data['common_ts_sec'] = test_data['common_ts'].dt.second
The finer granularities probably won't help much; we'll filter features later.
xx encoding
train_data = pd.get_dummies(train_data, columns = ['x1','x2','x3','x4','x5','x6','x7','x8'])
test_data = pd.get_dummies(test_data, columns = ['x1','x2','x3','x4','x5','x6','x7','x8'])
udmap encoding
def udmap_onethot(d):
    v = np.zeros(9)
    if d == 'unknown':
        return v
    d = eval(d)
    for i in range(1, 10):
        if 'key' + str(i) in d:
            v[i-1] = d['key' + str(i)]
    return v
train_udmap_df = pd.DataFrame(np.vstack(train_data['udmap'].apply(udmap_onethot))).astype('int')
test_udmap_df = pd.DataFrame(np.vstack(test_data['udmap'].apply(udmap_onethot))).astype('int')
train_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
test_udmap_df.columns = ['key' + str(i) for i in range(1, 10)]
train_data = pd.concat([train_data, train_udmap_df], axis=1)
test_data = pd.concat([test_data, test_udmap_df], axis=1)
train_data = pd.get_dummies(train_data, columns = ['key4','key5','key7','key8','key9'])
test_data = pd.get_dummies(test_data, columns = ['key4','key5','key7','key8','key9'])
Column alignment
Because get_dummies runs on train and test separately, their column sets differ and must be synchronized.
# Columns in test that are missing from train
train_nothave = test_data.columns[[(col not in train_data.columns) for col in test_data.columns]]
train_nothave.to_numpy()
# Columns in train that are missing from test
test_nothave = train_data.columns[[(col not in test_data.columns) for col in train_data.columns]]
test_nothave = test_nothave.drop('target')
test_nothave
# Add the missing columns, filled with zeros
train_zeros = pd.DataFrame(np.zeros((train_data.shape[0], len(train_nothave))), columns=train_nothave)
train_data = pd.concat([train_data, train_zeros], axis=1, sort=False)
del train_zeros
test_zeros = pd.DataFrame(np.zeros((test_data.shape[0], len(test_nothave))), columns=test_nothave)
test_data = pd.concat([test_data, test_zeros], axis=1, sort=False)
del test_zeros
# Align column order
train_data = train_data.sort_index(axis=1)
test_data = test_data.sort_index(axis=1)
# Verify the columns now match
(train_data.drop('target', axis=1).columns == test_data.columns).all()
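As an aside, pandas can do this whole synchronization in one call with DataFrame.align. A sketch of the equivalent (not what was run above): an outer join over the column sets with zero fill.
# Hypothetical one-call alternative to the manual column sync above
train_aligned, test_aligned = train_data.align(test_data, join='outer', axis=1, fill_value=0)
test_aligned = test_aligned.drop('target', axis=1)  # align copies train-only columns (e.g. target) into test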
Export and Submission
Run CatBoostClassifier under 10-fold stratified cross-validation, average the fold predictions on the test set, and submit.
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
skf = StratifiedKFold(n_splits=10)
X = train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1)
y = train_data['target']
preds = np.zeros((test_data.shape[0]))
total_score = 0
fea_imp = np.zeros(X.shape[1])
for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]
    clf = CatBoostClassifier(iterations=30000, **params)  # params: hyperparameter dict assumed defined earlier
    clf.fit(X_train, y_train,
            eval_set=(X_val, y_val),
            use_best_model=True,
            metric_period=500)
    total_score += f1_score(y_val, clf.predict(X_val)) / 10  # mean validation F1 across the 10 folds
    fea_imp += clf.feature_importances_  # accumulate feature importances
    test_pred = clf.predict(test_data.drop(['udmap', 'common_ts', 'uuid'], axis=1))
    preds += test_pred / 10  # average the fold predictions
print(f'Total f1 score: {total_score}')
pd.DataFrame({
    'uuid': test_data['uuid'],
    'target': preds.round().astype('int')
}).to_csv('submit.csv', index=False)
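Before uploading, a quick sanity check on the written file can catch shape or dtype mistakes; a minimal sketch:
# Read the submission back and eyeball its shape and label balance
sub = pd.read_csv('submit.csv')
print(sub.shape)
print(sub['target'].value_counts())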
Score after uploading:
Feature selection with the tree model
Plot the feature importances:
import matplotlib.pyplot as plt
# Histogram of normalized feature importances, 100 bins across [0, 1]
plt.hist(fea_imp / fea_imp.max(), np.linspace(0, 1, 100))
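It can also help to see the most important features by name; a small sketch using the X defined in the training loop above:
# Top 20 features by accumulated importance
print(pd.Series(fea_imp, index=X.columns).sort_values(ascending=False).head(20))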
Try a 0.005 threshold on the normalized importances:
sel = fea_imp / fea_imp.max() > 0.005  # threshold the normalized importances plotted above
skf = StratifiedKFold(n_splits=10)
X = train_data.drop(['udmap', 'common_ts', 'uuid', 'target'], axis=1).iloc[:, sel]
y = train_data['target']
preds = np.zeros((test_data.shape[0]))
total_score = 0
for i, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val, y_train, y_val = X.iloc[train_idx], X.iloc[val_idx], y.iloc[train_idx], y.iloc[val_idx]
    clf = CatBoostClassifier(iterations=30000, **params)
    clf.fit(X_train, y_train,
            eval_set=(X_val, y_val),
            use_best_model=True,
            metric_period=500)
    total_score += f1_score(y_val, clf.predict(X_val)) / 10
    test_pred = clf.predict(test_data.drop(['udmap', 'common_ts', 'uuid'], axis=1).iloc[:, sel])
    preds += test_pred / 10
print(f'Total f1 score: {total_score}')
The score improves; not bad.