作者的主要思路是:先按数据类型把特征列分类,对类别型特征值进行整数编码,放入模型训练,最后观察各特征的重要性程度。
作者先将特征列按数据类型分为三类:二分类型、字符型、数字型。
随后作者对字符型特征列中的特征值进行特征编码,使用了下面的操作:
# Frequency-encode the very high-cardinality string columns, then remove each
# one from categorical_columns so the factorize pass below does not encode it
# a second time.  tqdm wraps the iterable to display a progress bar.
for col in tqdm(frequency_encoded_variables):
    mapping = frequency_encoding(col)
    # Any value absent from the train-derived mapping becomes NaN
    # (this is what turns unseen test values into NaN as well).
    for frame in (train, test):
        frame[col] = frame[col].map(lambda value: mapping.get(value, np.nan))
    categorical_columns.remove(col)
其中 frequency_encoding(variable)函数的定义如下
def frequency_encoding(variable, series=None):
    """Encode the values of a categorical column by descending-frequency rank.

    The most common value gets code 0.0, the next 1.0, and so on.  Every
    value that occurs exactly once is collapsed into a single shared code
    (one past the last rank of the repeated values), so singletons do not
    each receive their own label.

    Parameters
    ----------
    variable : str
        Column name; used to look the column up in the module-level
        ``train`` frame when ``series`` is not given.
    series : pandas.Series, optional
        Explicit data to encode instead of ``train[variable]``.

    Returns
    -------
    dict
        Mapping of original value -> float rank code.  If *every* value is
        a singleton the shared code is NaN (same as the original logic).

    Notes
    -----
    The original implementation chained ``value_counts().reset_index()``
    and addressed the count column as ``t[variable]``; pandas 2.0 renamed
    those columns ('count'), which silently breaks the singleton check.
    This version avoids reset_index column naming entirely.
    """
    if series is None:
        series = train[variable]  # relies on the module-level train frame
    counts = series.value_counts()  # sorted by descending frequency
    # Rank positions as floats so singletons can be flagged with NaN.
    ranks = pd.Series(np.arange(len(counts), dtype=float), index=counts.index)
    ranks[counts == 1] = np.nan
    # All singletons share one code just past the last repeated-value rank.
    max_label = ranks.max() + 1
    ranks = ranks.fillna(max_label)
    return ranks.to_dict()
下一段:
# First pass: learn, from train only, the distinct values of every
# categorical column.  pd.factorize returns (codes, uniques); only the
# uniques Index is kept, the codes are discarded.
indexer = {}
for col in tqdm(categorical_columns):
    if col == 'MachineIdentifier':
        continue
    indexer[col] = pd.factorize(train[col])[1]

# Second pass: replace each value in train and test by its position in the
# learned Index (Index.get_indexer yields -1 for values never seen in train).
for col in tqdm(categorical_columns):
    if col == 'MachineIdentifier':
        continue
    for frame in (train, test):
        frame[col] = indexer[col].get_indexer(frame[col])
然后
# Downcast dtypes in both frames to shrink their memory footprint.
train, test = map(reduce_mem_usage, (train, test))
然后
# Pull the label column out of the training frame: pop returns the
# 'HasDetections' Series and removes it from train in one step.
target = train.pop('HasDetections')
然后是训练模型的参数
# LightGBM hyper-parameters for the binary HasDetections task.
param = dict(
    num_leaves=60,
    min_data_in_leaf=60,
    objective='binary',
    max_depth=-1,            # -1 = no depth limit
    learning_rate=0.1,
    boosting='gbdt',
    feature_fraction=0.8,
    bagging_freq=1,
    bagging_fraction=0.8,
    bagging_seed=11,
    metric='auc',
    lambda_l1=0.1,
    random_state=133,
    verbosity=-1,            # silence LightGBM's per-iteration logging
)
We set the maximum number of iterations over the folds:
# Cap how many of the CV folds are actually trained (the fold loop breaks
# once this many folds have run).
max_iter = 5
# Collect garbage now to free memory before the heavy training stage.
gc.collect()
下面是训练过程
# 5-fold cross-validation with LightGBM; only the first max_iter folds are
# actually trained (see the break at the bottom of the loop).
folds = KFold(n_splits=5, shuffle=True, random_state=15)
# Out-of-fold predictions over the whole train set, filled one fold at a time.
oof = np.zeros(len(train))
categorical_columns = [c for c in categorical_columns if c not in ['MachineIdentifier']]
features = [c for c in train.columns if c not in ['MachineIdentifier']]
# Test-set predictions accumulated (averaged) over the trained folds.
predictions = np.zeros(len(test))
start = time.time()  # NOTE(review): never read; start_time below is the one used
feature_importance_df = pd.DataFrame()
start_time = time.time()
score = [0 for _ in range(folds.n_splits)]  # per-fold validation AUC
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    # Wrap this fold's rows as LightGBM datasets, flagging the categorical columns.
    trn_data = lgb.Dataset(train.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature = categorical_columns
                           )
    val_data = lgb.Dataset(train.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature = categorical_columns
                           )
    # Upper bound on boosting rounds; early stopping (200 rounds without
    # validation-AUC improvement) normally ends training earlier.
    num_round = 10000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets = [trn_data, val_data],
                    verbose_eval=100,
                    early_stopping_rounds = 200)
    # Out-of-fold prediction for this fold's validation rows.
    oof[val_idx] = clf.predict(train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    fold_importance_df = pd.DataFrame()
    # Record each feature's name ...
    fold_importance_df["feature"] = features
    # ... its gain-based importance ...
    fold_importance_df["importance"] = clf.feature_importance(importance_type='gain')
    # ... and which fold (1-based) produced it.
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    # We perform predictions by chunks: score the test set chunk_size rows
    # at a time to bound peak memory.
    initial_idx = 0
    chunk_size = 1000000
    current_pred = np.zeros(len(test))
    while initial_idx < test.shape[0]:
        final_idx = min(initial_idx + chunk_size, test.shape[0])
        idx = range(initial_idx, final_idx)
        # Predict this chunk of test rows with the fold's model.
        current_pred[idx] = clf.predict(test.iloc[idx][features], num_iteration=clf.best_iteration)
        # Advance the window to the next chunk.
        initial_idx = final_idx
    # Average fold predictions; the divisor is the number of folds actually trained.
    predictions += current_pred / min(folds.n_splits, max_iter)
    # NOTE(review): divides by 3600 (i.e. hours) but the label says "s" — misleading.
    print("time elapsed: {:<5.2}s".format((time.time() - start_time) / 3600))
    score[fold_] = metrics.roc_auc_score(target.iloc[val_idx], oof[val_idx])
    if fold_ == max_iter - 1: break
if (folds.n_splits == max_iter):
    # Every fold was trained: AUC over the complete out-of-fold vector.
    print("CV score: {:<8.5f}".format(metrics.roc_auc_score(target, oof)))
else:
    # Only max_iter folds ran: average their per-fold AUCs instead.
    print("CV score: {:<8.5f}".format(sum(score) / max_iter))
然后通过画图来表现出其特征的重要性:
# Rank features by their importance averaged over the trained folds and
# plot the ranking as a horizontal bar chart.
mean_importance = (feature_importance_df[["feature", "importance"]]
                   .groupby("feature")
                   .mean())
cols = mean_importance.sort_values(by="importance", ascending=False).head(1000).index
best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]
plt.figure(figsize=(14, 25))
ordered = best_features.sort_values(by="importance", ascending=False)
sns.barplot(x="importance", y="feature", data=ordered)
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')
然后便是保存结果的操作:
# Write the submission file: one HasDetections probability per MachineIdentifier.
sub_df = pd.DataFrame()
sub_df["MachineIdentifier"] = test["MachineIdentifier"].values
sub_df["HasDetections"] = predictions
sub_df.to_csv("submit.csv", index=False)