Preface
My internship has settled into a routine and I finally have some spare time, so I want to write up the competitions I took part in during the first half of the year. I entered quite a few, won some prize money, and the competition experience also landed me my first internship, so it has been a fairly productive stretch. I will post the code for the other competitions later. The main point here is to consolidate what I learned: in every competition I try a lot of approaches but never summarize them afterwards, which wastes a lot of time.
Code
Import the packages we will use (a few of them end up unused).
import random, os, sys,gc
import pandas as pd
# pd.set_option('display.max_columns',None)
# pd.set_option('display.max_rows',None)
import numpy as np
# np.set_printoptions(threshold=sys.maxsize)
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split,cross_validate, cross_val_score, GridSearchCV,StratifiedKFold
from lightgbm import LGBMClassifier,log_evaluation,early_stopping
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score  # roc_auc_score is used as the CV metric
from scipy import signal,fftpack
from scipy.signal import cwt, ricker
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
def seed_everything(seed=2024):
    # Fix the hash seed and the random / numpy generators for reproducibility
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
seed_everything()
train_data = pd.read_csv("../output/测试集A/train.csv")
display(train_data.shape, train_data.head())
test_data = pd.read_csv("../output/测试集B/test_b_x.csv")
display(test_data.shape, test_data.head())
total=pd.concat((train_data, test_data),axis=0).reset_index(drop=True)
display(total.shape)
Here we build some strong features based on business understanding.
total['avg_bandwidth_usage'] = total['PROD_BANDWIDTH'] / total['GOTONE_ID']
total['long_OVER'] = (total['FLAG_OVER_GPRS'] + total['FLAG_OVER_CALL'] + total["FLAG_CQ_CT"]) / 3
total['duan_OVER'] = (total['FLAG_OF_GPRS'] + total['FLAG_OF_CALL'] + total["FLAG_CQ_CT"]) / 3
total['long_zc'] = (total['FLAG_PHONE_CALL_ZC'] + total["FLAG_PHONE_NET_ZC"] + total["FLAG_PHONE_ZC"]) / 3
display(total.shape, total.head())
for e1, cols in enumerate([["TYPE1_1_MONTH", "TYPE1_3_MONTH"], ["TYPE2_1_MONTH", "TYPE2_3_MONTH"], ["TYPE3_1_MONTH", "TYPE3_3_MONTH"], ["ZWFW_1_ACCT1", "ZWFW_3_ACCT1"]]):
    # add 1 to numerator and denominator so the ratios are defined when the 3-month count is 0
    total[f"type{e1+1}_ratio"] = (total[cols[0]] + 1) / (total[cols[1]] + 1)
    total[f"type{e1+1}_furatio"] = (total[cols[1]] + 1) / (total[cols[0]] + 1)
    total[f"type{e1+1}_sub"] = total[cols[0]] - total[cols[1]]
    total[f"type{e1+1}_fusub"] = total[cols[1]] - total[cols[0]]
    total[f"type{e1+1}_all"] = total[cols[0]] + total[cols[1]]
total["1_访问"] = total["TYPE1_1_MONTH"] + total["TYPE2_1_MONTH"] + total["TYPE3_1_MONTH"]
total["3_访问"] = total["TYPE1_3_MONTH"] + total["TYPE2_3_MONTH"] + total["TYPE3_3_MONTH"]
total["1_访问/3_访问"] = total["1_访问"] / total["3_访问"]
total["1_访问-3_访问"] = total["1_访问"] - total["3_访问"]
total["1_访问+3_访问"] = total["1_访问"] + total["3_访问"]
total["1_访问*3_访问"] = total["1_访问"] * total["3_访问"]
total["1访问/1反馈"] = total["1_访问"] / total["ZWFW_1_ACCT1"]
total["1访问-1反馈"] = total["1_访问"] - total["ZWFW_1_ACCT1"]
total["3访问/3反馈"] = total["1_访问"] / total["ZWFW_3_ACCT1"]
total["3访问-3反馈"] = total["1_访问"] - total["ZWFW_3_ACCT1"]
Group-by aggregations keyed on the categorical features.
for group_col in ["GOTONE_ID", "FLAG_AGE", "FLAG_YSF", "long_OVER", "FLAG_FLOW_OVER", "long_zc", "duan_OVER", "FLAG_KD"]:
    for col in ["ARPU", "MOU", "DOU", "FLOW_BHD", "PROD_BANDWIDTH"]:
        for opt in ["min", "max", "mean", "median", "std", "skew", "nunique"]:
            # include the aggregated column in the new feature name so that each col gets its own features
            total[f"{group_col}_{col}_group_{opt}"] = total.groupby(group_col)[col].transform(opt)
One-hot encode the categorical features.
for col in ["TYPE1_1_MONTH", "TYPE1_3_MONTH", "TYPE2_1_MONTH", "TYPE2_3_MONTH", "TYPE3_1_MONTH", "TYPE3_3_MONTH", "ZWFW_1_ACCT1","ZWFW_3_ACCT1"]:
total[f'long_{col}'] = (total[col] >= 1).astype(int)
display(total.shape, total.head())
for col in ["long_TYPE1_1_MONTH", "long_TYPE1_3_MONTH", "long_TYPE2_1_MONTH", "long_TYPE2_3_MONTH", "long_TYPE3_1_MONTH", "long_TYPE3_3_MONTH", "long_ZWFW_1_ACCT1","long_ZWFW_3_ACCT1"]:
total = pd.get_dummies(total, columns=[col], prefix=f'{col}_Category')*1
display(total.shape, total.head())
for col in ["GOTONE_ID", "FLAG_AGE", "FLAG_YSF", "FLAG_FLOW_OVER", "FLAG_OVER_GPRS", "FLAG_OVER_CALL", "FLAG_CQ_CT", "FLAG_OF_GPRS", "FLAG_OF_CALL", "FLAG_PHONE_ZC",
"FLAG_PHONE_CALL_ZC", "FLAG_PHONE_NET_ZC", "FLAG_KD"]:
total = pd.get_dummies(total, columns=[col], prefix=f'{col}_Category')*1
display(total.shape, total.head())
Compute the skewness of the numeric features and correct it.
# # Compute skewness (and kurtosis) of the raw numeric columns
# from scipy.stats import skew, kurtosis
# for col in ["ARPU", "MOU", "DOU", "FLOW_BHD", "PROD_BANDWIDTH"]:
#     col_data = total[col].dropna()
#     print(col_data.shape)
#     skewness = skew(col_data)
#     print(skewness)
for col in ["ARPU", "MOU", "DOU", "FLOW_BHD", "PROD_BANDWIDTH"]:
total[col + '_log'] = np.log1p(total[col])
display(total.shape, total.head())
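If you prefer to transform only the columns that are actually skewed, here is a minimal sketch of my own variant (not part of the original notebook); the SKEW_THRESHOLD value is an assumption you would want to tune.
from scipy.stats import skew

SKEW_THRESHOLD = 1.0  # assumed cut-off for "strongly right-skewed"
for col in ["ARPU", "MOU", "DOU", "FLOW_BHD", "PROD_BANDWIDTH"]:
    col_skew = skew(total[col].dropna())
    # only add the log1p version when the measured skewness exceeds the threshold
    if col_skew > SKEW_THRESHOLD:
        total[col + '_log'] = np.log1p(total[col])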
Memory management
# Walk over every column of the dataframe and downcast its dtype to reduce memory usage
def reduce_mem_usage(df, float16_as32=True):
    # memory_usage() gives the bytes used per column; sum them and convert B -> KB -> MB
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    for col in df.columns:  # iterate over the column names
        col_type = df[col].dtype  # dtype of this column
        if col_type != object and str(col_type) != 'category':  # only numeric columns are handled here
            c_min, c_max = df[col].min(), df[col].max()  # min and max of this column
            if str(col_type)[:3] == 'int':  # any integer type: int8, int16, int32 or int64
                # if the values fit into int8 (-128 to 127), downcast to int8
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                # if the values fit into int16 (-32,768 to 32,767), downcast to int16
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                # if the values fit into int32 (-2,147,483,648 to 2,147,483,647), downcast to int32
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                # if the values fit into int64 (-9,223,372,036,854,775,808 to 9,223,372,036,854,775,807), use int64
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:  # floating-point columns
                # values fit into float16; use float32 instead if more precision is required
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    if float16_as32:  # caller asked for the higher-precision float32
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float16)
                # values fit into float32, downcast to float32
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                # otherwise keep float64
                else:
                    df[col] = df[col].astype(np.float64)
    # memory usage after the conversion
    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # percentage reduction relative to the original memory usage
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df
total=reduce_mem_usage(total, float16_as32=False)
Model training
train=total[~total['FLAG_USER'].isna()].reset_index(drop=True)
test=total[total['FLAG_USER'].isna()].reset_index(drop=True)
print(train.shape, test.shape)
del total
gc.collect()
from sklearn.model_selection import GroupKFold
def fit_and_predict(train_feats=train, test_feats=test, model=None, fold=10, seed=2024, name='lgb'):
    # Drop the id, the raw (un-logged) numeric columns and the target from the feature list
    choose_cols = [i for i in train_feats.columns if i not in ["USER_ID_MD5", 'ARPU', 'MOU', 'DOU', 'PROD_BANDWIDTH', 'FLAG_USER']]
    X = train_feats[choose_cols].copy()
    y = train_feats['FLAG_USER'].copy()
    test_X = test_feats[choose_cols].copy()
    oof_pred_pro = np.zeros((len(X), 2))
    test_pred_pro = np.zeros((fold, len(test_X), 2))
    # GroupKFold keyed on USER_ID_MD5 so rows of the same user never appear in both train and valid;
    # the split is deterministic, so seed is unused here
    gkf = GroupKFold(n_splits=fold)
    for fold_id, (train_index, valid_index) in enumerate(gkf.split(X, y, train_feats["USER_ID_MD5"].values)):
        print(f"name {name}, fold: {fold_id}")
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], callbacks=[log_evaluation(100), early_stopping(10)])
        # out-of-fold probabilities for the validation rows, test probabilities for every fold
        oof_pred_pro[valid_index] = model.predict_proba(X_valid)
        test_pred_pro[fold_id] = model.predict_proba(test_X)
    # average the test predictions over all folds
    test_preds = test_pred_pro.mean(axis=0)
    print(f"roc_auc:{roc_auc_score(y.values, oof_pred_pro[:, 1])}")
    return oof_pred_pro, test_preds
lgb_params1 = {"boosting_type": "gbdt",
"objective": "binary","metric": "auc",
"max_depth": 13,
'random_state': 2024, 'n_estimators': 1024,
'reg_alpha': 0.1, 'reg_lambda': 10,
'colsample_bytree': 0.8, 'subsample': 0.8,
'learning_rate': 0.05, 'num_leaves': 64, 'min_child_samples': 62,
'max_bin':245, "extra_trees": True,
# 'device':'gpu','gpu_use_dp':True,  # parameters for a GPU environment
}
lgb_oof_pred_pro1,test_preds1=fit_and_predict(model=LGBMClassifier(**lgb_params1),fold=10,seed=2024,name='lgb1')
Because the evaluation metric is accuracy and the class distribution is imbalanced, I simply tune the decision threshold to obtain the best classification accuracy.
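The 0.345 used below presumably comes from a search of this kind; here is a minimal sketch of my own reconstruction (not the original code) that scans candidate thresholds over the out-of-fold class-0 probabilities and keeps the one with the highest accuracy on the training labels. The search range and step size are assumptions.
y_true = train['FLAG_USER'].values
oof_p0 = lgb_oof_pred_pro1[:, 0]           # out-of-fold probability of class 0

best_thr, best_acc = 0.5, 0.0
for thr in np.arange(0.20, 0.60, 0.005):   # assumed search range / step
    pred = np.where(oof_p0 > thr, 0, 1)    # same decision rule as the submission below
    acc = accuracy_score(y_true, pred)
    if acc > best_acc:
        best_thr, best_acc = thr, acc
print(best_thr, best_acc)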
submission = test[['USER_ID_MD5', 'FLAG_USER']].copy()
# predict class 0 when its averaged probability exceeds the tuned threshold, otherwise class 1
submission['FLAG_USER'] = [0 if i > 0.345 else 1 for i in test_preds1[:, 0]]
submission["FLAG_USER"].value_counts()