import pandas as pd
import numpy as np
from tqdm import tqdm
# Load the raw competition data.
train_df = pd.read_csv(r'C:\Users\86130\Desktop\ML算法竞赛\讯飞\5G移动用户使用预测挑战赛公开数据\train.csv')
test_df = pd.read_csv(r'C:\Users\86130\Desktop\ML算法竞赛\讯飞\5G移动用户使用预测挑战赛公开数据\test.csv')
# Tag each frame so the rows can be split apart again after shared
# feature engineering, then stack them into one frame.
train_df['is_test'] = 0
test_df['is_test'] = 1
df_all = pd.concat([train_df, test_df], axis=0, ignore_index=True)
#拼接特征
from sklearn.preprocessing import LabelEncoder
def concat_feature(data, col1, col2):
    """Add one label-encoded pairwise-concatenation column per (col1, col2) pair.

    For every column in *col1* crossed with every column in *col2*, the two
    columns' string forms are joined with '-' and label-encoded into a new
    integer column named '<left>_<right>_concat'. Mutates and returns *data*.
    """
    for left in tqdm(col1):
        for right in col2:
            combined = data[left].astype(str) + '-' + data[right].astype(str)
            data[f'{left}_{right}_concat'] = LabelEncoder().fit_transform(combined)
    return data
# Pairwise concatenation feature for the cat_0 / cat_9 pair.
df_all = concat_feature(df_all,['cat_0'],['cat_9'])
# Row-wise combination features over small groups of numeric columns.
df_all['num_22_num_37_num_16_mean'] = df_all[['num_22','num_37','num_16']].mean(axis=1)
df_all['num_22_num_37_num_16_skew'] = df_all[['num_22','num_37','num_16']].skew(axis=1)
# NOTE(review): the two columns below are *named* after num_14/num_0/num_23
# but are computed from ['num_14','num_37','num_16'] — looks like a
# copy-paste of the lines above; confirm which column set was intended.
df_all['num_14_num_0_num_23_mean'] = df_all[['num_14','num_37','num_16']].mean(axis=1)
df_all['num_14_num_0_num_23_skew'] = df_all[['num_14','num_37','num_16']].skew(axis=1)
# Cross statistics: per-category mean/skew of selected numeric columns,
# broadcast back onto every row of that category via map.
groupby_cols = ['cat_10', 'cat_4', 'cat_1', 'cat_9', 'cat_11']
num_cols = ['num_28', 'num_6', 'num_10', 'num_37', 'num_14', 'num_21']
for cat in tqdm(groupby_cols):
    grouped = df_all.groupby(cat)  # build the groupby object once per category
    for num in num_cols:
        df_all[f"{cat}_{num}_mean"] = df_all[cat].map(grouped[num].mean())
        df_all[f"{cat}_{num}_skew"] = df_all[cat].map(grouped[num].skew())
# Equal-width binning: discretise each numeric column into 20 buckets
# labelled 0..19, stored as an integer companion column.
num_bins = 20
cut_labels = list(range(num_bins))
for col in tqdm(num_cols):
    binned = pd.cut(df_all[col], num_bins, labels=cut_labels)
    df_all[f'{col}_bin'] = binned.astype(int)
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import MiniBatchKMeans

# Cluster features: min-max scale each column group to [0, 1], then attach
# the MiniBatchKMeans cluster id as a new categorical feature per group.
group_cluster_cols = {
    'group1': ['cat_0', 'cat_1', 'cat_2'],
    'group2': ['num_3', 'num_30', 'num_22'],
    'group3': ['num_6', 'num_0', 'num_2'],
    'group4': ['num_37', 'num_4', 'num_35'],
}
for group, cols in tqdm(group_cluster_cols.items()):
    scaled = MinMaxScaler().fit_transform(df_all[cols].values)
    kmeans = MiniBatchKMeans(
        init="k-means++",
        n_clusters=50,
        batch_size=2048,
        n_init=10,
        max_no_improvement=10,
        verbose=0,
        random_state=512,  # fixed seed keeps cluster ids reproducible
    )
    df_all[f'{group}_cluster'] = kmeans.fit_predict(scaled)
# Neutralise infinities produced upstream (e.g. skew on constant groups),
# then split the combined frame back into its train / test partitions.
df_all = df_all.replace([np.inf, -np.inf], 0)
train_df = df_all.loc[df_all['is_test'] == 0].reset_index(drop=True)
test_df = df_all.loc[df_all['is_test'] == 1].reset_index(drop=True)
# Every engineered column except the target and bookkeeping fields is a feature.
feature_cols = [c for c in train_df if c not in ('target', 'id', 'is_test')]
len(feature_cols)  # notebook-style cell output; no effect as a script
#构建模型
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
import warnings
# Silence library warning chatter (sklearn / xgboost deprecations) during CV.
warnings.filterwarnings('ignore')
def xgb_model(train_x, train_y, test_x):
    """Train an XGBoost classifier with seed-averaged stratified K-fold CV.

    Parameters
    ----------
    train_x : pd.DataFrame — training features.
    train_y : pd.Series    — binary target aligned with train_x.
    test_x  : pd.DataFrame — test features, scored by every fold model.

    Returns
    -------
    oof : np.ndarray, shape (n_train, 2)
        Out-of-fold class probabilities, averaged over seeds.
    test_predict : np.ndarray, shape (n_test, 2)
        Test probabilities averaged over all folds and seeds.
    feat_imp_df : pd.DataFrame
        Mean feature importance over folds/seeds, sorted descending,
        with a dense 'rank' column.
    """
    seeds = [512]
    oof = np.zeros([train_x.shape[0], 2])
    test_predict = np.zeros([test_x.shape[0], 2])
    feat_imp_df = pd.DataFrame()
    feat_imp_df['feature'] = train_x.columns
    feat_imp_df['imp'] = 0
    for seed in seeds:
        print('Seed:', seed)
        folds = 5
        kf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=seed)
        auc_scores = []
        for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
            print("|-----------------------------------------|")
            print("| XGB Fold {} Training Start |".format(str(i + 1)))
            print("|-----------------------------------------|")
            trn_x, trn_y = train_x.values[train_index], train_y.values[train_index]
            val_x, val_y = train_x.values[valid_index], train_y.values[valid_index]
            xgb_params = {
                'booster': 'gbtree',
                'objective': 'binary:logistic',
                'eval_metric': 'auc',
                'n_estimators': 500,
                'max_depth': 8,
                'lambda': 10,          # L2 regularisation
                'subsample': 0.7,
                'colsample_bytree': 0.8,
                'colsample_bylevel': 0.7,
                'eta': 0.1,
                'tree_method': 'hist',
                'seed': seed,
                'nthread': 16
            }
            # Renamed local ('model') so it no longer shadows this function.
            model = xgb.XGBClassifier(**xgb_params)
            model.fit(trn_x, trn_y,
                      eval_set=[(trn_x, trn_y), (val_x, val_y)],
                      early_stopping_rounds=20, verbose=20)
            val_pred = model.predict_proba(val_x)
            test_pred = model.predict_proba(test_x)
            # BUG FIX: accumulate importances while the frame is still in
            # train_x.columns order. The original sorted + reset_index inside
            # the loop, so the positional '+=' misaligned feature/importance
            # pairs from fold 2 onwards.
            feat_imp_df['imp'] += model.feature_importances_ / folds / len(seeds)
            # BUG FIX: each OOF row is produced exactly once per seed, so it
            # is averaged over seeds only ('+=' supports multi-seed runs).
            # The original overwrote with '=' and also divided by n_splits,
            # shrinking every OOF probability by a factor of 5.
            oof[valid_index] += val_pred / len(seeds)
            test_predict += test_pred / kf.n_splits / len(seeds)
            auc_score = roc_auc_score(np.array(val_y), np.array(val_pred[:, 1]))
            print(auc_score)
            auc_scores.append(auc_score)
        print('AVG_auc :', sum(auc_scores) / len(auc_scores))
    # Sort and rank once, after all folds and seeds have been accumulated.
    feat_imp_df = feat_imp_df.sort_values(by='imp', ascending=False).reset_index(drop=True)
    feat_imp_df['rank'] = range(feat_imp_df.shape[0])
    return oof, test_predict, feat_imp_df
# Train the XGB model.
# NOTE(review): X_resampled / y_resampled are not defined anywhere in this
# file — the names suggest an oversampling step (e.g. SMOTE) from another
# notebook cell. Confirm they exist upstream, or pass
# train_df[feature_cols] / train_df['target'] instead.
xgb_oof, xgb_test, xgb_imp_df = xgb_model(X_resampled, y_resampled, test_df[feature_cols])
# Keep only the positive-class probability column.
xgb_pre_train = xgb_oof[:,1]
xgb_pre_test = xgb_test[:,1]
####################################### Submission #################################
submission = pd.read_csv(r'C:\Users\86130\Desktop\ML算法竞赛\讯飞\5G移动用户使用预测挑战赛公开数据\submit.csv')
# NOTE(review): assigning by position — assumes submit.csv rows are in the
# same order as test.csv; confirm no id-based join is required.
submission['target'] = pd.DataFrame(xgb_pre_test)
submission.to_csv(r'C:\Users\86130\Desktop\ML算法竞赛\讯飞\5G移动用户使用预测挑战赛公开数据\sub\sub.csv', index = False)
# Load the raw data for the second pipeline variant.
train_df = pd.read_csv(r'C:\Users\lenovo\Desktop\train.csv')
test_df = pd.read_csv(r'C:\Users\lenovo\Desktop\test.csv')
# Mark provenance, then stack train and test for shared feature engineering.
train_df['is_test'] = 0
test_df['is_test'] = 1
df_all = pd.concat([train_df, test_df], axis=0, ignore_index=True)
#拼接特征
from sklearn.preprocessing import LabelEncoder
def concat_feature(data, col1, col2):
    """Label-encode every pairwise '-'-joined string concatenation of a
    column from *col1* with a column from *col2*, writing each result to a
    new '<left>_<right>_concat' column. Mutates and returns *data*."""
    for left in tqdm(col1):
        for right in col2:
            joined = data[left].astype(str) + '-' + data[right].astype(str)
            data[f'{left}_{right}_concat'] = LabelEncoder().fit_transform(joined)
    return data
# Pairwise concatenation features over two categorical column sets.
df_all = concat_feature(df_all,['cat_10', 'cat_7', 'cat_5'],['cat_11', 'cat_0', 'cat_1'])
# Row-wise combination features over small groups of numeric columns.
df_all['num_22_num_37_num_16_mean'] = df_all[['num_22','num_37','num_16']].mean(axis=1)
df_all['num_22_num_37_num_16_skew'] = df_all[['num_22','num_37','num_16']].skew(axis=1)
# NOTE(review): the two columns below are *named* after num_14/num_0/num_23
# but are computed from ['num_14','num_37','num_16'] — looks like a
# copy-paste of the lines above; confirm which column set was intended.
df_all['num_14_num_0_num_23_mean'] = df_all[['num_14','num_37','num_16']].mean(axis=1)
df_all['num_14_num_0_num_23_skew'] = df_all[['num_14','num_37','num_16']].skew(axis=1)
# Cross statistics: per-category aggregates of selected numeric columns,
# broadcast back onto every row via map on the category column.
groupby_cols = [
    'cat_10',
    'cat_7',
    'cat_5',
    'cat_8',
    'cat_12',
    'cat_11',
    'cat_0',
    'cat_1',
    'cat_4',
    'cat_9'
]
num_cols = [
    'num_37',
    'num_3',
    'num_21',
    'num_4',
    'num_24',
    'num_35',
    'num_26',
    'num_17',
    'num_1',
    'num_10'
]
# One groupby-agg pass computes all five statistics per (cat, num) pair;
# the original issued five separate groupby calls per pair (same values,
# ~5x the work). Column names and creation order are unchanged.
for col_i in tqdm(groupby_cols):
    grouped = df_all.groupby(col_i)
    for col_j in num_cols:
        stats = grouped[col_j].agg(['mean', 'skew', 'max', 'min', 'count'])
        for stat_name in ('mean', 'skew', 'max', 'min', 'count'):
            df_all[f"{col_i}_{col_j}_{stat_name}"] = df_all[col_i].map(stats[stat_name])
# Equal-width binning: discretise each numeric column into 35 buckets
# labelled 0..34, stored as an integer companion column.
num_bins = 35
cut_labels = list(range(num_bins))
for col in tqdm(num_cols):
    binned = pd.cut(df_all[col], num_bins, labels=cut_labels)
    df_all[f'{col}_bin'] = binned.astype(int)
# Cross statistics between the bin columns and the categorical columns.
bin_cols = [c for c in df_all if 'bin' in c]
for col_i in tqdm(bin_cols):
    grouped = df_all.groupby(col_i)
    for col_j in groupby_cols:
        df_all[f"{col_i}_{col_j}_nunique"] = df_all[col_i].map(grouped[col_j].nunique())
        # NOTE(review): mean/max/min over cat_* columns only make sense if
        # the categories are numerically encoded — confirm upstream dtypes.
        df_all[f"{col_i}_{col_j}_mean"] = df_all[col_i].map(grouped[col_j].mean())
        df_all[f"{col_i}_{col_j}_max"] = df_all[col_i].map(grouped[col_j].max())
        # BUG FIX: the original wrote the min aggregate into the *_max key,
        # overwriting the max column and silently dropping the min feature.
        df_all[f"{col_i}_{col_j}_min"] = df_all[col_i].map(grouped[col_j].min())