This competition actually took place quite a while ago. I kept putting off the write-up, and only remembered it when I recently stumbled over the files. I hope it offers a little help to anyone tackling similar problems.
1. Competition Overview
Given users' historical system-access logs and their risk labels, the task is to combine domain knowledge with the necessary feature engineering, build a machine learning / data mining model, and use it to predict whether future system accesses are risky. Official competition page: https://aistudio.baidu.com/projectdetail/4478045?ad-from=1449. The fields are described below:
| Field | Description | Notes |
| --- | --- | --- |
| id | Sample ID | |
| user_name | Username | Empty if the log entry was generated before the user logged in |
| department | User's department | |
| ip_transform | Authentication IP (encrypted) | Real IPs are anonymized via a one-to-one mapping to encrypted strings |
| device_num_transform | Authentication device ID (encrypted) | Real device IDs are anonymized via a one-to-one mapping to encrypted strings |
| browser_version | Browser version | |
| browser | Browser | |
| os_type | Operating system type | |
| os_version | Operating system version | |
| op_datetime | Authentication datetime | |
| http_status_code | HTTP status code | |
| op_city | Authentication city | |
| log_system_transform | Accessed system (encrypted) | Real system IDs are anonymized via a one-to-one mapping to encrypted strings |
| url | Accessed URL | |
| op_month | Authentication month | |
| is_risk | Risk label | 1: risky; 0: not risky. Only train.csv contains this field |
2. Feature Engineering
First, user behavior patterns were mined in depth: the characteristics of risky accesses were extracted and regular user behavior was analyzed. On that basis, several families of features were constructed: basic login information, features derived from the authentication time, same-day login information, behavioral-regularity features, and cross features. Feature selection was done with the mean variance test (mvtest). All of the code below operates on a single combined frame `data`; the original never shows how it is assembled, so a sketch follows.
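A minimal sketch of the assumed setup (the file names follow the data dictionary above; `istest` and `id_by_me` are referenced throughout but never defined in the original, so this reconstruction is my assumption), together with the imports the later code relies on:

import copy
import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score as auc  # 'auc' as used in the training loop

# Assumed assembly: concatenate train and test so features are computed
# consistently, flag the test rows, and add a stable per-row merge key.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
train_df['istest'] = 0
test_df['istest'] = 1
data = pd.concat([train_df, test_df], axis=0, ignore_index=True)
data['id_by_me'] = data.index  # used as the merge key below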
# Time-based features
data['op_datetime'] = pd.to_datetime(data['op_datetime'])
data['day'] = data['op_datetime'].dt.strftime('%m-%d')   # e.g. '02-01'
data['hour'] = data['op_datetime'].dt.hour
data['weekday'] = data['op_datetime'].dt.weekday + 1     # 1 = Monday, 7 = Sunday
data = data.sort_values(by=['user_name', 'op_datetime']).reset_index(drop=True)
# Cyclical encoding: map hour/minute onto the unit circle so that
# 23:00 and 00:00 end up close together in feature space.
data['hour_sin'] = np.sin(data['hour'] / 24 * 2 * np.pi)
data['hour_cos'] = np.cos(data['hour'] / 24 * 2 * np.pi)
data['op_day'] = data['op_datetime'].dt.strftime('%d')   # day of month
data['min'] = data['op_datetime'].dt.minute
data['min_sin'] = np.sin(data['min'] / 60 * 2 * np.pi)
data['min_cos'] = np.cos(data['min'] / 60 * 2 * np.pi)
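As an aside on the sin/cos trick: with the raw hour, 23:00 and 00:00 are 23 units apart, but on the unit circle they are nearly neighbors. An illustrative check, not part of the pipeline:

def hour_to_circle(h):
    # Map an hour (0-23) to a point on the unit circle.
    angle = h / 24 * 2 * np.pi
    return np.array([np.sin(angle), np.cos(angle)])

print(np.linalg.norm(hour_to_circle(23) - hour_to_circle(0)))  # ~0.26, vs. 23 on the raw scale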
# Minutes since the user's previous accesses (lags 1 and 2)
data['diff_last_1'] = data.groupby('user_name')['op_datetime'].diff(1).dt.total_seconds() / 60
data['diff_last_2'] = data.groupby('user_name')['op_datetime'].diff(2).dt.total_seconds() / 60
Handling the forward diff: `diff_next` (minutes until the user's next access) is computed on the training rows only and merged back, so test rows are left missing (filled with -999 later):
train_data = data[data['istest'] == 0].copy()
test = data[data['istest'] == 1].copy()
# train only: minutes until the same user's next access
train_data['diff_next'] = -train_data.groupby('user_name')['op_datetime'].diff(-1).dt.total_seconds() / 60
data = pd.merge(data, train_data[['diff_next', 'id_by_me']], how='left', on='id_by_me')
# Per-value statistics of the backward/forward time gaps for each categorical field
fea = ['user_name', 'department', 'ip_transform', 'device_num_transform', 'browser_version', 'browser',
       'os_type', 'os_version', 'op_city', 'log_system_transform', 'url']
for col in fea:
data[col+'_diff1_mean'] = data.groupby(col)['diff_last_1'].transform('mean')
data[col+'_diff1_std'] = data.groupby(col)['diff_last_1'].transform('std')
data[col+'_diff1_max'] = data.groupby(col)['diff_last_1'].transform('max')
data[col+'_diff1_min'] = data.groupby(col)['diff_last_1'].transform('min')
for col in fea:
data[col+'_diff_next_mean'] = data.groupby(col)['diff_next'].transform('mean')
data[col+'_diff_next_std'] = data.groupby(col)['diff_next'].transform('std')
data[col+'_diff_next_max'] = data.groupby(col)['diff_next'].transform('max')
data[col+'_diff_next_min'] = data.groupby(col)['diff_next'].transform('min')
data=data.fillna(-999)
# data['http_status_code'].unique(): [200, 404, 400, 500, 502]; only 200 is a normal code
data['is_fail_code'] = (data['http_status_code'] != 200).astype(int)
# weekday runs 1-7 (Monday = 1), so 6 and 7 are the weekend
data['isweekend'] = (data['weekday'] >= 6).astype(int)
# hours 8-19 count as daytime; everything else as night
data['isnight'] = (~data['hour'].between(8, 19)).astype(int)
# Public holidays in the training period (2022)
holiday = ['01-31', '02-01', '02-02', '02-03', '02-04', '02-05', '02-06',
           '04-03', '04-04', '04-05', '05-01', '05-02', '05-03', '05-04',
           '06-03', '06-04', '06-05']
data['isholiday'] = data['day'].isin(holiday).astype(int)
# Make-up workdays (weekends worked in lieu of holidays)
adjust = ['01-29', '01-30', '04-02', '04-24', '05-07']
data['is_adjust'] = data['day'].isin(adjust).astype(int)
# A day off is a weekend that is not a make-up workday, or a public holiday
# (e.g. 2022-04-02 is a Saturday but a make-up workday, so is_not_work = 0)
data['is_not_work'] = ((data['isweekend'].astype(bool) & ~data['is_adjust'].astype(bool))
                       | data['isholiday'].astype(bool)).astype(int)
time_fea=['hour','weekday','min','isnight','isholiday','is_not_work']
for col in time_fea:
data[col+'_diff1_mean_u'] = data.groupby(['user_name',col])['diff_last_1'].transform('mean')
data[col+'_diff1_std_u'] = data.groupby(['user_name',col])['diff_last_1'].transform('std')
for col in time_fea:
data[col+'_diff1_next_mean_u'] = data.groupby(['user_name',col])['diff_next'].transform('mean')
data[col+'_diff1_next_std_u'] = data.groupby(['user_name',col])['diff_next'].transform('std')
del data['diff_next']
Running count, from midnight up to the current access, of how many times the user has used each attribute value:
cols = ['id_by_me','user_name','ip_transform', 'device_num_transform',
'browser_version', 'browser', 'os_type', 'os_version','http_status_code','op_city',
'log_system_transform','url','op_datetime','is_fail_code']
tmp = data[cols].copy()
tmp['op_day'] = tmp['op_datetime'].dt.date
tmp = tmp.groupby(['user_name','op_day'],as_index=False).agg({'id_by_me':list,'ip_transform':list, 'device_num_transform':list,
'browser_version':list, 'browser':list, 'os_type':list, 'os_version':list,'http_status_code':list,'op_city':list,
'log_system_transform':list,'url':list,'is_fail_code':list})
def get_which_time(col_unique, fea):
    """For each element of `fea`, return how many times that value has
    appeared in `fea` so far (1 on first occurrence, 2 on second, ...)."""
    fea_dict = dict.fromkeys(col_unique, 0)
    count_list = []
    for v in fea:
        fea_dict[v] += 1
        count_list.append(fea_dict[v])
    return count_list
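A quick illustration of what `get_which_time` returns (toy values, not competition data):

# Running occurrence count: the third 'a' in the sequence gets a 3.
print(get_which_time(['a', 'b'], ['a', 'a', 'b', 'a']))  # [1, 2, 1, 3]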
for col in tqdm(['ip_transform', 'device_num_transform',
'browser_version', 'browser', 'os_type', 'os_version','http_status_code','op_city',
'log_system_transform','url','is_fail_code']):
col_unique=data[col].unique()
tmp[col+'_countls'] = tmp[col].apply(lambda x:get_which_time(col_unique,x))
# explode the per-day lists back to one row per event (multi-column explode needs pandas >= 1.3)
tmp = tmp.explode(['id_by_me', 'ip_transform',
'device_num_transform', 'browser_version', 'browser', 'os_type',
'os_version', 'http_status_code', 'op_city', 'log_system_transform',
'url', 'is_fail_code', 'ip_transform_countls',
'device_num_transform_countls', 'browser_version_countls',
'browser_countls', 'os_type_countls', 'os_version_countls',
'http_status_code_countls', 'op_city_countls',
'log_system_transform_countls', 'url_countls', 'is_fail_code_countls'])
tmp = tmp.reset_index(drop=True)
cols=['id_by_me','ip_transform_countls', 'device_num_transform_countls',
'browser_version_countls', 'browser_countls', 'os_type_countls',
'os_version_countls', 'http_status_code_countls', 'op_city_countls',
'log_system_transform_countls', 'url_countls','is_fail_code_countls']
data=pd.merge(data,tmp[cols],on='id_by_me',how='left')
for col in ['ip_transform_countls', 'device_num_transform_countls',
'browser_version_countls', 'browser_countls', 'os_type_countls',
'os_version_countls', 'http_status_code_countls', 'op_city_countls',
'log_system_transform_countls', 'url_countls','is_fail_code_countls']:
data[col] = data[col].astype(int)
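For reference, the same running count can be computed far more directly with `cumcount`, since `data` is already sorted by user and time; a sketch that should match the list/explode pipeline above under that sort order (`_countls_alt` is my name, not the original's):

# For each (user, day, attribute value), number the events 1, 2, 3, ... in time order.
day_key = data['op_datetime'].dt.date
for col in ['ip_transform', 'device_num_transform', 'browser_version', 'browser',
            'os_type', 'os_version', 'http_status_code', 'op_city',
            'log_system_transform', 'url', 'is_fail_code']:
    data[col + '_countls_alt'] = data.groupby(['user_name', day_key, col]).cumcount() + 1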
nunique() features: the number of distinct IPs among the user's recent logins:
cols = ['id_by_me','user_name','ip_transform', 'device_num_transform','browser_version', 'browser', 'os_type',
'os_version','http_status_code','op_city','log_system_transform','is_fail_code','url','op_datetime']
tmp = data[cols].copy()
# Time gaps (minutes) to the account's previous logins, lags 1-29
for x in range(1, 30):
    tmp['usr_diff_last_' + str(x)] = tmp.groupby('user_name')['op_datetime'].diff(x).dt.total_seconds() / 60
merge_cols = [col for col in tmp.columns if '_diff_last_' in col]
tmp['ip_diff_list_30'] = tmp[merge_cols].values.tolist()
tmp.drop(merge_cols, axis=1, inplace=True)
# IPs used at the account's previous logins, lags 1-29
for x in range(1, 30):
    tmp['usr_last_ip' + str(x)] = tmp.groupby('user_name')['ip_transform'].shift(x)
merge_cols = [col for col in tmp.columns if col.startswith('usr_last_ip')]
tmp['usr_ip_list_30'] = tmp[merge_cols].values.tolist()
tmp.drop(merge_cols, axis=1, inplace=True)
def get_nunique_minute(diff_list, uni_list, minute):
    """Count distinct values among the previous logins that happened within
    `minute` minutes of the current one. The lag lists are ordered nearest
    first, so we can stop at the first gap outside the window.
    E.g. diff_list=[5, 30, 200], uni_list=['a', 'b', 'a'], minute=60 -> 2."""
    ls = []
    for i in range(len(diff_list)):
        if diff_list[i] < minute:
            ls.append(uni_list[i])
        else:
            break
    return pd.Series(ls).nunique()
tmp['ip_time_nui_6'] = tmp.apply(lambda row:get_nunique_minute(row['ip_diff_list_30'],row['usr_ip_list_30'],60*6),axis=1)
tmp['ip_time_nui_12'] = tmp.apply(lambda row:get_nunique_minute(row['ip_diff_list_30'],row['usr_ip_list_30'],60*12),axis=1)
tmp['ip_time_nui_24'] = tmp.apply(lambda row:get_nunique_minute(row['ip_diff_list_30'],row['usr_ip_list_30'],60*24),axis=1)
cols=[col for col in tmp.columns if 'ip_time_nui_'in col]
cols.append('id_by_me')
data=pd.merge(data,tmp[cols],on='id_by_me',how='left')
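An aside: roughly the same signal is available through a time-based rolling window once the IP strings are factorized to numeric codes. A sketch; note it counts the current event as well, unlike the lag-list version, and relies on `data` being sorted by user and time (which the earlier sort_values guarantees):

# Distinct IPs per user over a trailing 6-hour window (current row included).
data['ip_code'] = pd.factorize(data['ip_transform'])[0].astype(float)
roll = (data.set_index('op_datetime')
            .groupby('user_name')['ip_code']
            .rolling('6h')
            .apply(lambda a: len(np.unique(a)), raw=True))
data['ip_nui_6h_alt'] = roll.reset_index(drop=True).values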
Cross features whose combinations, identified through visualization, are closely related to the label:
cross_pairs = [('department', 'op_city'),
               ('department', 'log_system_transform'),
               ('browser_version', 'op_city'),
               ('browser', 'op_city'),
               ('browser', 'log_system_transform'),
               ('os_type', 'op_city'),
               ('os_type', 'log_system_transform'),
               ('os_version', 'op_city'),
               ('op_city', 'log_system_transform'),
               ('department', 'url')]
cross_cols = []
for a, b in cross_pairs:
    name = a + '_' + b
    data[name] = data[a].astype(str) + data[b].astype(str)
    cross_cols.append(name)
Count of how often the user used a given attribute value in each hour (computed on training rows only):
cols = ['ip_transform', 'device_num_transform',
'browser_version', 'browser', 'os_type', 'os_version',
'http_status_code', 'op_city', 'log_system_transform', 'url']
for col in cols:
tmp = data[data['istest']==0].groupby(['user_name',col,'hour'])['is_risk'].count().reset_index()
tmp.columns=['user_name',col,'hour',col+'_hour_count']
data=pd.merge(data,tmp,on=['user_name',col,'hour'],how='left')
Usual login hours on workdays versus days off:
# dict-renaming agg was removed in newer pandas; use named aggregation instead
tmp = data[data['istest'] == 0].groupby(['user_name', 'is_not_work', 'hour'], as_index=False).agg(work_hour_count=('is_risk', 'count'))
data = pd.merge(data, tmp, how='left', on=['user_name', 'is_not_work', 'hour'])
date_fea = ['weekday', 'isholiday', 'isweekend']
for col in date_fea:
    tmp = data[data['istest'] == 0].groupby(['user_name', col, 'hour'], as_index=False).agg(**{col + '_count': ('is_risk', 'count')})
    data = pd.merge(data, tmp, how='left', on=['user_name', col, 'hour'])
# Notebook checkpoint: keep a pristine copy of the full feature table
data_copy = copy.deepcopy(data)
data = copy.deepcopy(data_copy)
3. Model
The model is mvtest feature selection + LightGBM + post-processing:
train_data = data[data['istest']==0]
test = data[data['istest']==1]
from mvtpy.mvtest import mvtest
fea = [col for col in data.columns if col not in['id','id_by_me','op_datetime', 'op_day','day', 'op_month','is_risk', 'istest', 'ts', 'ts1', 'ts2', 'diff_next']]
model = mvtest()
mvtest_sc=[]
for col in tqdm(fea):
mvtest_sc.append(model.test(train_data[col],train_data['is_risk']))
# Drop features whose mvtest dependence statistic Tn is below 0.1
del_index = []
for i in range(len(mvtest_sc)):
    if mvtest_sc[i]['Tn'] < 0.1:
        del_index.append(i)
pd.Series(mvtest_sc).to_csv("D:/数据挖掘竞赛/ccf/mv_test.csv")
for i in del_index:
del data[fea[i]]
data.drop('is_fail_code',axis=1,inplace=True)
Training
train_data = data[data['istest']==0]
test = data[data['istest']==1]
# Time-based holdout: everything before April 2022 for training, the rest for validation
train = train_data[train_data['op_datetime'] < '2022-04-01'].reset_index(drop=True)
var = train_data[train_data['op_datetime'] >= '2022-04-01'].reset_index(drop=True)
fea = [col for col in data.columns if col not in['id','id_by_me','op_datetime', 'op_day','day', 'op_month','is_risk', 'istest', 'ts', 'ts1', 'ts2', 'diff_next']]
x_train = train[fea]
y_train = train['is_risk']
# x_var = var[fea]
# y_var = var['is_risk']
x_test = test[fea]
y_test = test['is_risk']
importance = 0
pred_y = pd.DataFrame()
score = []
seeds=2022
params_lgb = {
'learning_rate': 0.05,
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'num_leaves': 64,
'verbose': -1,
'seed': 2022,
'n_jobs': -1,
'feature_fraction': 0.8,
'bagging_fraction': 0.9,
'bagging_freq': 4,
# 'min_child_weight': 10,
"min_data_in_leaf":20
}
#kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
kf = KFold(n_splits=5, shuffle=True, random_state=2022)
for i, (train_idx, val_idx) in enumerate(kf.split(x_train, y_train)):
    print('************************************ {} {}************************************'.format(str(i+1), str(seeds)))
    trn_x, trn_y = x_train.iloc[train_idx], y_train.iloc[train_idx]
    val_x, val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]
    train_set = lgb.Dataset(trn_x, trn_y)
    val_set = lgb.Dataset(val_x, val_y)
    model = lgb.train(params_lgb, train_set, valid_sets=[val_set], num_boost_round=20000,
                      callbacks=[lgb.early_stopping(100), lgb.log_evaluation(2000)])
    pred_y['fold_%d_seed_%d' % (i, seeds)] = model.predict(x_test)
    importance += model.feature_importance(importance_type='gain') / 5
    score.append(auc(val_y, model.predict(val_x)))  # auc = sklearn's roc_auc_score
test['is_risk'] = pred_y.mean(axis=1).values
test = test.sort_values('id').reset_index(drop=True)
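The write-up ends here without showing the post-processing it mentions or the submission step; presumably the sorted predictions were written out roughly like this (the file name and column choice are my guess, based on the `id`/`is_risk` fields in the data dictionary):

# Hypothetical submission step: sample id plus predicted risk probability.
test[['id', 'is_risk']].to_csv('submission.csv', index=False)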