摘要
AI法官队友弃赛,吾心甚痛,欲置ML于身外,奈何毕业将至,身无长物,故,重新振作看论文,打比赛。。。
不扯了,上代码
特征工程
说明 : 粗简版Feature,未OneHot,很多不合理特征,没有不平衡数据处理
得分: 多模型Voting 0.79, 单模型0.75左右,未仔细调参
(不平衡数据的处理看这篇文章的最下边:
http://blog.csdn.net/leyounger/article/details/78667538)
import pandas as pd
import numpy as np
import datetime
import time
import copy
import os
from collections import Counter
from model.view_risk import get_risk_ip_dic
from model.view_risk import get_most_risk_ip_dic
def get_datetime(timestamp):
    """Convert a POSIX timestamp (float, or anything float() accepts,
    e.g. a numeric string) to a local-time datetime."""
    ts = timestamp if isinstance(timestamp, float) else float(timestamp)
    return datetime.datetime.fromtimestamp(ts)
def get_timestamp(dt):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string (e.g. '2015-01-01 00:00:41')
    and return its local-time POSIX timestamp as a string."""
    parsed = datetime.datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
    stamp = time.mktime(parsed.timetuple())
    return str(stamp)
def mytime(time):
    """Map a 'YYYY-MM-DD HH:MM:SS' string to a second count built from a
    fixed per-month day offset plus the day/hour/minute/second fields.

    Only months 1-8 are supported by the offset table.
    NOTE(review): the offsets imply 31/30/31/30/... day months (e.g. a
    30-day February), which does not match the real calendar — presumably
    only used as a monotonic ordinal within the competition data; confirm.
    """
    month_dict = {1: 0, 2: 31, 3: 61, 4: 92, 5: 122, 6: 153, 7: 183, 8: 214}
    day_count = month_dict[int(time[5:7])] + int(time[8:10])
    seconds_within_day = (int(time[11:13]) * 60 * 60
                          + int(time[14:16]) * 60
                          + int(time[17:19]))
    return day_count * 24 * 60 * 60 + seconds_within_day
# 一系列提取特征的函数
# 判断target是否为array中的大多数
def is_majority(target, array, n):
    """Return 1 if `target` is among the `n` most common values of `array`,
    0 otherwise. When `array` holds a single candidate, the question is
    undecidable and 2 is returned as a 'pending' marker."""
    if len(array) == 1:
        return 2
    top_n = Counter(array).most_common(n)
    return 1 if any(value == target for value, _count in top_n) else 0
# 判断target是否在array中至少出现n次
def is_exists(target, array, n):
    """Return 1 if `target` occurs at least `n` times in the list `array`,
    else 0."""
    occurrences = sum(1 for item in array if item == target)
    return int(occurrences >= n)
# 预处理步骤
def preprocess():
    """Load the train/test login and trade CSVs, sort them, and normalise
    trade times from datetime strings to POSIX-timestamp strings.

    Returns (df_train_login, df_test_login, df_train_trade, df_test_trade),
    each sorted ascending by user id and time.
    """
    def _load_login(path):
        # Login files already carry a numeric 'timestamp' column.
        df = pd.read_csv(path)
        df.sort_values(by=['id', 'timestamp', 'result'], inplace=True, ascending=True)
        return df

    def _load_trade(path):
        # Trade 'time' is a datetime string; drop the trailing two
        # characters before parsing, then convert to a timestamp string.
        df = pd.read_csv(path)
        df['time'] = df['time'].apply(lambda dt: get_timestamp(dt[:-2]))
        df.sort_values(by=['id', 'time'], inplace=True, ascending=True)
        return df

    df_train_login = _load_login('../data/train/t_login.csv')
    df_test_login = _load_login('../data/train/t_login_test.csv')
    df_train_trade = _load_trade('../data/train/t_trade.csv')
    df_test_trade = _load_trade('../data/train/t_trade_test.csv')
    return df_train_login, df_test_login, df_train_trade, df_test_trade
def _load_or_build_feature(path, builder):
    # Cache helper: reuse a previously saved feature frame from `path`;
    # otherwise build it via `builder()` and persist it (HDF key 'w').
    if os.path.exists(path):
        return pd.read_hdf(path, 'w')
    feats = builder()
    feats.to_hdf(path, 'w')
    return feats


def build_feat(login, trade, type, mode, train_trade):
    """Assemble the full feature matrix for one dataset.

    Parameters
    ----------
    login, trade : pd.DataFrame
        Login / trade records, already sorted by id and time.
    type : str
        'train' or 'test'. For 'test', the training-set trades are
        prepared separately so feature 5 can use historical trade data.
    mode : str
        Name of the HDF cache subdirectory under ../data/other/hdf/.
    train_trade : pd.DataFrame
        Training trade records; only consumed when type == 'test'.

    Returns
    -------
    pd.DataFrame — features 0-6 concatenated column-wise, one row per
    trade record.
    """
    print('Start Initiating Feautures Array ... ')
    # Decompose login/trade timestamps into calendar components used by
    # the downstream feature builders.
    login['login_real_month'] = login['timestamp'].apply(lambda x: get_datetime(x).month)
    login['login_real_day'] = login['timestamp'].apply(lambda x: get_datetime(x).day)
    login['login_real_hour'] = login['timestamp'].apply(lambda x: get_datetime(x).hour)
    login['login_real_minute'] = login['timestamp'].apply(lambda x: get_datetime(x).minute)
    login['login_real_second'] = login['timestamp'].apply(lambda x: get_datetime(x).second)
    trade['trade_real_month'] = trade['time'].apply(lambda x: get_datetime(x).month)
    trade['trade_real_day'] = trade['time'].apply(lambda x: get_datetime(x).day)
    trade['trade_real_hour'] = trade['time'].apply(lambda x: get_datetime(x).hour)
    trade['trade_real_minute'] = trade['time'].apply(lambda x: get_datetime(x).minute)
    trade['trade_real_second'] = trade['time'].apply(lambda x: get_datetime(x).second)
    # Unify the time column name/type across both frames.
    login['time'] = login['timestamp']
    trade['time'] = trade['time'].apply(lambda x: float(x))
    del login['timestamp']
    del trade['rowkey']
    # result > 0 is treated as a successful login; coerce flags to 0/1.
    login['login_result'] = login['result'].apply(lambda x: 1 if x > 0 else 0)
    login['is_scan'] = login['is_scan'].apply(lambda x: 1 if x else 0)
    login['is_sec'] = login['is_sec'].apply(lambda x: 1 if x else 0)
    # When building test features, bring in the training-set trade history
    # so feature 5 can look at pre-test trades.
    if type == 'test':
        train_trade['trade_real_month'] = train_trade['time'].apply(lambda x: get_datetime(x).month)
        train_trade['trade_real_day'] = train_trade['time'].apply(lambda x: get_datetime(x).day)
        train_trade['trade_real_hour'] = train_trade['time'].apply(lambda x: get_datetime(x).hour)
        train_trade['trade_real_minute'] = train_trade['time'].apply(lambda x: get_datetime(x).minute)
        train_trade['trade_real_second'] = train_trade['time'].apply(lambda x: get_datetime(x).second)
        train_trade['time'] = train_trade['time'].apply(lambda x: float(x))
        del train_trade['rowkey']
        previous_train_trade = train_trade
    else:
        previous_train_trade = trade
    feature_0 = build_feat_0(login, trade)
    # Features 1-6 are expensive to compute; each is cached to HDF and
    # reused on subsequent runs.
    prefix = '../data/other/hdf/' + mode + '/' + type
    print('Building Feature 1 ... ', mode, type)
    feature_1 = _load_or_build_feature(prefix + '_feature_1.hdf', lambda: build_feat_1(login, trade))
    print('Building Feature 2 ... ', mode, type)
    feature_2 = _load_or_build_feature(prefix + '_feature_2.hdf', lambda: build_feat_2(login, trade))
    print('Building Feature 3 ... ', mode, type)
    feature_3 = _load_or_build_feature(prefix + '_feature_3.hdf', lambda: build_feat_3(login, trade))
    print('Building Feature 4 ... ', mode, type)
    feature_4 = _load_or_build_feature(prefix + '_feature_4.hdf', lambda: build_feat_4(login, trade))
    print('Building Feature 5 ... ', mode, type)
    feature_5 = _load_or_build_feature(prefix + '_feature_5.hdf', lambda: build_feat_5(trade, previous_train_trade))
    print('Building Feature 6 ... ', mode, type)
    feature_6 = _load_or_build_feature(prefix + '_feature_6.hdf', lambda: build_feat_6(login, trade))
    all_features = pd.concat([feature_0, feature_1, feature_2, feature_3,
                              feature_4, feature_5, feature_6], axis=1)
    print('Shape of All Features:', all_features.shape)
    print('----------------------- End of Feature Extracting ----------------------')
    return all_features
def build_feat_0(login, trade):
    """Base features: the trade timestamp (as float) plus its calendar
    components, copied straight from `trade`.

    `login` is unused; it is accepted only so all feature builders share
    the same signature.
    """
    calendar_columns = ['trade_real_month', 'trade_real_day',
                        'trade_real_hour', 'trade_real_minute',
                        'trade_real_second']
    features = pd.DataFrame()
    features['time'] = trade['time'].apply(float)
    for column in calendar_columns:
        features[column] = trade[column]
    print('Shape of Features 0:', features.shape)
    return features
def build_feat_1(login, trade):
    """Per-trade features from the user's login/trade history.

    For each trade row: attributes of the most recent pre-trade login
    (time gap, result, device, ip, ...), login/trade counts, timelong
    statistics relative to the last login, global frequency counts of the
    last-login ip/device/city, and 'usual'/'seen before' flags.

    Sentinel convention (per feature list):
      -1 : the user has login records, but none before this trade
      -2 : the user has no login records at all
      (-40 / -41 for last_login_result in those two cases respectively)
    Returns one feature row per row of `trade`, in the same order.
    """
    features = pd.DataFrame()
    # Initialise the feature lists. Exactly one value is appended to each
    # list per trade row, so all lists stay aligned with `trade`.
    last_login_time = []
    last_login_result = []
    last_login_timelong = []
    last_login_device = []
    last_login_from = []
    last_login_ip = []
    last_login_city = []
    last_login_type = []
    last_login_is_scan = []
    last_login_is_sec = []
    last_trade_time = []
    # Count features
    count_login_all = []  # total number of logins for this user
    count_login_previous = []  # how many logins happened before this trade
    count_trade_all = []
    count_trade_previous = []
    count_login_fail_all = []  # total number of failed logins (result != 1)
    count_login_fail_previous = []
    count_login_succ_all = []
    count_login_succ_previous = []
    avg_login_previous = []  # user's mean timelong minus the last login's timelong
    max_login_previous = []
    min_login_previous = []
    count_login_ip_all = []  # occurrences of the last-login ip across ALL login records; very frequent ips look anomalous
    count_login_device_all = []
    count_login_city_all = []
    # Categorical features
    is_login_device_usual = []
    is_login_from_usual = []
    is_login_ip_usual = []
    is_login_city_usual = []
    is_login_type_usual = []
    is_login_device_exists_previous = []  # did the last-login device appear in earlier logins
    is_login_from_exists_previous = []
    is_login_ip_exists_previous = []
    is_login_city_exists_previous = []
    is_login_type_exists_previous = []
    # Pre-compute global frequency tables over every login record (train
    # and test logins both contribute, since `login` holds this dataset's
    # full login table).
    print('Start Pre-Calculating ... ')
    set_ip = Counter(login['ip'].tolist())
    set_device = Counter(login['device'].tolist())
    set_city = Counter(login['city'].tolist())
    # Iterate over every trade record
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000==0:
            print("Processing till line : ", index)
        # Current trade row
        each_line = trade.iloc[index]
        # Its user id and timestamp
        trade_time = float(each_line['time'])
        id = each_line['id']
        # All login records belonging to the same user id
        related_rows = login.loc[login['id'] == id]
        count_login_all.append(related_rows.shape[0])
        count_login_fail_all.append(len([1 for num in related_rows['result'].tolist() if num != 1]))
        count_login_succ_all.append(len([1 for num in related_rows['result'].tolist() if num == 1]))
        if related_rows.shape[0] != 0:
            # Logins that happened strictly before this trade
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            count_login_previous.append(previous_record.shape[0])
            count_login_fail_previous.append(len([1 for num in previous_record['result'].tolist() if num != 1]))
            count_login_succ_previous.append(len([1 for num in previous_record['result'].tolist() if num == 1]))
            if previous_record.shape[0] != 0:
                first_login_record = previous_record.iloc[0]  # the user's earliest pre-trade login
                last_login_record = previous_record.iloc[-1]  # the most recent pre-trade login
                last_login_time.append(trade_time - last_login_record['time'])
                last_login_result.append(last_login_record['result'])
                last_login_timelong.append(last_login_record['timelong'])
                last_login_device.append(last_login_record['device'])
                last_login_from.append(last_login_record['log_from'])
                last_login_ip.append(last_login_record['ip'])
                last_login_city.append(last_login_record['city'])
                last_login_type.append(last_login_record['type'])
                last_login_is_scan.append(last_login_record['is_scan'])
                last_login_is_sec.append(last_login_record['is_sec'])
                # Count features
                avg_login_previous.append(np.average(related_rows['timelong']) - last_login_record['timelong'])
                max_login_previous.append(np.max(related_rows['timelong']) - last_login_record['timelong'])
                min_login_previous.append(np.min(related_rows['timelong']) - last_login_record['timelong'])
                count_login_ip_all.append(set_ip[last_login_record['ip']])
                count_login_device_all.append(set_device[last_login_record['device']])
                count_login_city_all.append(set_city[last_login_record['city']])
                # Categorical features
                is_login_device_usual.append(
                    is_majority(last_login_record['device'], related_rows['device'], 2))  # is the last-login device among this user's usual devices
                is_login_from_usual.append(is_majority(last_login_record['log_from'], related_rows['log_from'], 2))
                is_login_ip_usual.append(is_majority(last_login_record['ip'], related_rows['ip'], 2))
                is_login_city_usual.append(is_majority(last_login_record['city'], related_rows['city'], 2))
                is_login_type_usual.append(is_majority(last_login_record['type'], related_rows['type'], 2))
                is_login_device_exists_previous.append(
                    is_exists(last_login_record['device'], previous_record['device'].tolist(), 2))  # did the last-login device log in before
                is_login_from_exists_previous.append(
                    is_exists(last_login_record['log_from'], previous_record['log_from'].tolist(), 2))
                is_login_ip_exists_previous.append(
                    is_exists(last_login_record['ip'], previous_record['ip'].tolist(), 2))
                is_login_city_exists_previous.append(
                    is_exists(last_login_record['city'], previous_record['city'].tolist(), 2))
                is_login_type_exists_previous.append(
                    is_exists(last_login_record['type'], previous_record['type'].tolist(), 2))
            else:
                last_login_time.append(-1)  # no login before this trade but the user logs in later; deemed fairly trustworthy, encoded -1
                last_login_result.append(-40)
                last_login_timelong.append(-1)
                last_login_device.append(0)
                last_login_from.append(0)
                last_login_ip.append(0)
                last_login_city.append(0)
                last_login_type.append(0)
                last_login_is_scan.append(-1)
                last_login_is_sec.append(-1)
                # Count features
                avg_login_previous.append(0)
                max_login_previous.append(0)
                min_login_previous.append(0)
                count_login_ip_all.append(0)
                count_login_device_all.append(0)
                count_login_city_all.append(0)
                # Categorical features
                is_login_device_usual.append(0)
                is_login_from_usual.append(0)
                is_login_ip_usual.append(0)
                is_login_city_usual.append(0)
                is_login_type_usual.append(0)
                is_login_device_exists_previous.append(-1)  # did the last-login device appear before
                is_login_from_exists_previous.append(-1)
                is_login_ip_exists_previous.append(-1)
                is_login_city_exists_previous.append(-1)
                is_login_type_exists_previous.append(-1)
        else:
            # Users reaching this branch have no login records at all:
            # never logged in before the trade, nor after.
            # print(id, 'Cannot Find', train_y[index])
            last_login_time.append(-2)
            last_login_result.append(-41)
            last_login_timelong.append(-2)
            last_login_device.append(0)
            last_login_from.append(0)
            last_login_ip.append(0)
            last_login_city.append(-1)
            last_login_type.append(-1)
            last_login_is_scan.append(-1)
            last_login_is_sec.append(-1)
            # Count features
            count_login_previous.append(0)
            avg_login_previous.append(0)
            max_login_previous.append(0)
            min_login_previous.append(0)
            count_login_succ_previous.append(0)
            count_login_fail_previous.append(0)
            count_login_ip_all.append(0)
            count_login_device_all.append(0)
            count_login_city_all.append(0)
            # Categorical features
            is_login_device_usual.append(0)
            is_login_from_usual.append(0)
            is_login_ip_usual.append(0)
            is_login_city_usual.append(0)
            is_login_type_usual.append(0)
            is_login_device_exists_previous.append(-2)
            is_login_from_exists_previous.append(-2)
            is_login_ip_exists_previous.append(-2)
            is_login_city_exists_previous.append(-2)
            is_login_type_exists_previous.append(-2)
        # Trade records belonging to the same user id
        related_rows = trade.loc[trade['id'] == id]
        count_trade_all.append(related_rows.shape[0])
        if related_rows.shape[0] != 0:
            # Trades that happened before this one
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            count_trade_previous.append(previous_record.shape[0])
        else:
            count_trade_previous.append(0)
        # Gap to the immediately preceding trade row (trade is sorted by
        # id then time, so the previous row is the previous trade iff the
        # id matches).
        if index > 0:
            last_trade_record = trade.iloc[index - 1]
            if last_trade_record['id'] == id:
                last_trade_time.append(trade_time - float(last_trade_record['time']))
            else:
                last_trade_time.append(-1)
        else:
            last_trade_time.append(-2)
    # Stitch the collected lists into the feature frame
    features['last_login_time'] = last_login_time
    features['last_login_result'] = last_login_result
    features['last_login_timelong'] = last_login_timelong
    features['last_login_device'] = last_login_device
    features['last_login_from'] = last_login_from
    features['last_login_ip'] = last_login_ip
    features['last_login_city'] = last_login_city
    features['last_login_type'] = last_login_type
    features['last_login_is_scan'] = last_login_is_scan
    features['last_login_is_sec'] = last_login_is_sec
    features['last_trade_time'] = last_trade_time
    # Count features
    features['count_login_all'] = count_login_all
    features['count_login_previous'] = count_login_previous
    features['count_trade_all'] = count_trade_all
    features['count_trade_previous'] = count_trade_previous
    features['avg_login_previous'] = avg_login_previous
    features['max_login_previous'] = max_login_previous
    features['min_login_previous'] = min_login_previous
    features['count_login_fail_all'] = count_login_fail_all
    features['count_login_fail_previous'] = count_login_fail_previous
    features['count_login_succ_all'] = count_login_succ_all
    features['count_login_succ_previous'] = count_login_succ_previous
    features['count_login_ip_all'] = count_login_ip_all  # occurrences of the last-login ip across all login records
    features['count_login_device_all'] = count_login_device_all
    features['count_login_city_all'] = count_login_city_all
    # Categorical features
    features['is_login_device_usual'] = is_login_device_usual
    features['is_login_from_usual'] = is_login_from_usual
    features['is_login_ip_usual'] = is_login_ip_usual
    features['is_login_city_usual'] = is_login_city_usual
    features['is_login_type_usual'] = is_login_type_usual
    features['is_login_device_exists_previous'] = is_login_device_exists_previous
    features['is_login_from_exists_previous'] = is_login_from_exists_previous
    features['is_login_ip_exists_previous'] = is_login_ip_exists_previous
    features['is_login_city_exists_previous'] = is_login_city_exists_previous
    features['is_login_type_exists_previous'] = is_login_type_exists_previous
    print('Shape of Features 1:', features.shape)
    return features
def build_feat_2(login, trade):
    """Per-trade statistical features over the user's login history.

    For each trade row, looks at all logins with the same user id and
    computes: counts/ratios of timelong values that are exact multiples
    of 10 (many such values suggest scripted logins), city / log_from
    diversity, and min/max/mean/median/std aggregates of login time,
    timelong and city codes — over all logins and over pre-trade logins.

    Sentinels: -1 = the user has logins but none before this trade;
    -2 = the user has no login records at all.
    Returns one feature row per trade row, in trade order.
    """
    features = pd.DataFrame()
    # Count features
    count_10_times_timelong_all = []  # logins whose timelong is a multiple of 10; many suggest automated logins
    count_not_10_times_timelong_all = []
    count_10_times_timelong_previous = []
    count_not_10_times_timelong_previous = []
    count_city_types = []  # number of distinct login cities; many cities means multi-location logins
    count_city_types_previous = []  # occurrences of the last-login city among pre-trade logins
    count_log_from_types = []  # number of distinct login sources
    count_log_from_types_previous = []  # occurrences of the last login source among pre-trade logins
    avg_login_time_dis = []  # average gap between logins; a small value means very frequent logins
    std_login_time_dis = []
    std_login_city_dis = []  # std of the city codes
    max_login_time = []
    min_login_time = []
    median_login_time = []
    max_min_login_time_dis = []
    max_login_city_dis = []
    min_login_city_dis = []
    median_login_city = []
    min_login_timelong_all = []
    max_login_timelong_all = []
    avg_login_timelong_all = []
    std_login_timelong_all = []
    min_login_timelong_previous = []
    max_login_timelong_previous = []
    avg_login_timelong_previous = []
    std_login_timelong_previous = []
    min_login_city_all = []
    max_login_city_all = []
    avg_login_city_all = []
    std_login_city_all = []
    min_login_city_previous = []
    max_login_city_previous = []
    avg_login_city_previous = []
    std_login_city_previous = []
    # Ratio features
    ratio_log_from_all = []
    ratio_log_from_previous = []  # share of the last login source among pre-trade logins
    ratio_city_all = []
    ratio_city_previous = []  # share of the last-login city
    ratio_10_times_timelong = []  # share of timelong values that are multiples of 10
    # Categorical features
    is_login_10_times_timelong = []  # was the last pre-trade login's timelong an exact multiple of 10
    is_login_city_the_very_first_time = []  # was the last login from the user's very first login city
    # Iterate over every trade record
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000 == 0:
            print("Processing till line : ", index)
        # Current trade row, its user id and timestamp
        each_line = trade.iloc[index]
        trade_time = float(each_line['time'])
        id = each_line['id']
        # All login records belonging to the same user id
        related_rows = login.loc[login['id'] == id]
        count_10_times_timelong_all.append(len([num for num in related_rows['timelong'].values if num % 10 == 0]))
        count_not_10_times_timelong_all.append(len([num for num in related_rows['timelong'].values if num % 10 != 0]))
        count_city_types.append(len(set(related_rows['city'].values)))
        count_log_from_types.append(len(set(related_rows['log_from'].values)))
        ratio_10_times_timelong.append(0 if related_rows.shape[0] == 0 else len([1 for num in related_rows['timelong'].values if num % 10 == 0]) / related_rows.shape[0])
        if related_rows.shape[0] != 0:
            avg_login_time_dis.append((related_rows['time'].max() - related_rows['time'].min()) / related_rows.shape[0])
            # std of a single value would be NaN; encode it as 0 instead.
            std_login_time_dis.append(0 if related_rows.shape[0] == 1 else related_rows['time'].std(skipna=True))
            std_login_city_dis.append(0 if related_rows.shape[0] == 1 else related_rows['city'].std(skipna=True))
            max_login_time.append(related_rows['time'].max())
            min_login_time.append(related_rows['time'].min())
            median_login_time.append(related_rows['time'].median())
            max_min_login_time_dis.append(related_rows['time'].max() - related_rows['time'].min())
            max_login_city_dis.append(related_rows['city'].max())
            min_login_city_dis.append(related_rows['city'].min())
            median_login_city.append(related_rows['city'].median())
            # Fixed: the min_* and max_* aggregates below were swapped
            # (min lists stored .max() and vice versa).
            min_login_timelong_all.append(related_rows['timelong'].min())
            max_login_timelong_all.append(related_rows['timelong'].max())
            avg_login_timelong_all.append(related_rows['timelong'].mean())
            std_login_timelong_all.append(0 if related_rows.shape[0] == 1 else related_rows['timelong'].std(skipna=True))
            min_login_city_all.append(related_rows['city'].min())
            max_login_city_all.append(related_rows['city'].max())
            avg_login_city_all.append(related_rows['city'].mean())
            std_login_city_all.append(0 if related_rows.shape[0] == 1 else related_rows['city'].std(skipna=True))
        else:
            avg_login_time_dis.append(0)
            std_login_time_dis.append(0)
            std_login_city_dis.append(0)
            max_login_time.append(0)
            min_login_time.append(0)
            median_login_time.append(0)
            max_min_login_time_dis.append(0)
            max_login_city_dis.append(0)
            min_login_city_dis.append(0)
            median_login_city.append(0)
            min_login_timelong_all.append(0)
            max_login_timelong_all.append(0)
            avg_login_timelong_all.append(0)
            std_login_timelong_all.append(0)
            min_login_city_all.append(0)
            max_login_city_all.append(0)
            avg_login_city_all.append(0)
            std_login_city_all.append(0)
        if related_rows.shape[0] != 0:
            # Logins that happened strictly before this trade
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            if previous_record.shape[0] != 0:
                first_login_record = previous_record.iloc[0]  # the user's earliest pre-trade login
                last_login_record = previous_record.iloc[-1]  # the most recent pre-trade login
                # Count features
                count_10_times_timelong_previous.append(len([num for num in previous_record['timelong'].values if num % 10 == 0]))
                count_not_10_times_timelong_previous.append(len([num for num in previous_record['timelong'].values if num % 10 != 0]))
                count_city_types_previous.append(len([1 for city in previous_record['city'].values if city == last_login_record['city']]))
                count_log_from_types_previous.append(len([1 for log_from in previous_record['log_from'].values if log_from == last_login_record['log_from']]))
                # Ratio features
                ratio_city_all.append(len([1 for city in related_rows['city'].values if city == last_login_record['city']]) / related_rows.shape[0])
                ratio_city_previous.append(len([1 for city in previous_record['city'].values if city == last_login_record['city']]) / previous_record.shape[0])
                ratio_log_from_all.append(len([1 for log_from in related_rows['log_from'].values if log_from == last_login_record['log_from']]) / related_rows.shape[0])
                ratio_log_from_previous.append(len([1 for log_from in previous_record['log_from'].values if log_from == last_login_record['log_from']]) / previous_record.shape[0])
                # Categorical features
                is_login_10_times_timelong.append(1 if last_login_record['timelong'] % 10 == 0 else 0)
                is_login_city_the_very_first_time.append(1 if last_login_record['city'] == first_login_record['city'] else 0)
                # Fixed: min/max were swapped here as well.
                min_login_timelong_previous.append(previous_record['timelong'].min())
                max_login_timelong_previous.append(previous_record['timelong'].max())
                avg_login_timelong_previous.append(previous_record['timelong'].mean())
                std_login_timelong_previous.append(0 if previous_record.shape[0] == 1 else previous_record['timelong'].std(skipna=True))
                min_login_city_previous.append(previous_record['city'].min())
                max_login_city_previous.append(previous_record['city'].max())
                avg_login_city_previous.append(previous_record['city'].mean())
                std_login_city_previous.append(0 if previous_record.shape[0] == 1 else previous_record['city'].std(skipna=True))
            else:
                # Logins exist, but none before this trade
                # Count features
                count_10_times_timelong_previous.append(-1)
                count_not_10_times_timelong_previous.append(-1)
                count_city_types_previous.append(-1)
                count_log_from_types_previous.append(-1)
                # Ratio features
                ratio_city_all.append(-1)
                ratio_city_previous.append(-1)
                ratio_log_from_all.append(-1)
                ratio_log_from_previous.append(-1)
                # Categorical features
                is_login_10_times_timelong.append(-1)
                is_login_city_the_very_first_time.append(-1)
                min_login_timelong_previous.append(-1)
                max_login_timelong_previous.append(-1)
                avg_login_timelong_previous.append(-1)
                std_login_timelong_previous.append(-1)
                min_login_city_previous.append(-1)
                max_login_city_previous.append(-1)
                avg_login_city_previous.append(-1)
                std_login_city_previous.append(-1)
        else:
            # No login records at all for this user id
            # Count features
            count_10_times_timelong_previous.append(-2)
            count_not_10_times_timelong_previous.append(-2)
            count_city_types_previous.append(-2)
            count_log_from_types_previous.append(-2)
            # Ratio features
            ratio_city_all.append(-2)
            ratio_city_previous.append(-2)
            ratio_log_from_all.append(-2)
            ratio_log_from_previous.append(-2)
            # Categorical features
            is_login_10_times_timelong.append(-2)
            is_login_city_the_very_first_time.append(-2)
            min_login_timelong_previous.append(-2)
            max_login_timelong_previous.append(-2)
            avg_login_timelong_previous.append(-2)
            std_login_timelong_previous.append(-2)
            min_login_city_previous.append(-2)
            max_login_city_previous.append(-2)
            avg_login_city_previous.append(-2)
            std_login_city_previous.append(-2)
    # Stitch the collected lists into the feature frame
    # Count features
    features['count_10_times_timelong_all'] = count_10_times_timelong_all
    features['count_not_10_times_timelong_all'] = count_not_10_times_timelong_all
    features['count_10_times_timelong_previous'] = count_10_times_timelong_previous
    features['count_not_10_times_timelong_previous'] = count_not_10_times_timelong_previous
    features['count_city_types'] = count_city_types
    features['count_city_types_previous'] = count_city_types_previous
    features['count_log_from_types'] = count_log_from_types
    features['count_log_from_types_previous'] = count_log_from_types_previous
    features['avg_login_time_dis'] = avg_login_time_dis
    features['std_login_time_dis'] = std_login_time_dis
    features['std_login_city_dis'] = std_login_city_dis
    # Ratio features
    features['ratio_log_from_all'] = ratio_log_from_all
    features['ratio_log_from_previous'] = ratio_log_from_previous
    features['ratio_city_all'] = ratio_city_all
    features['ratio_city_previous'] = ratio_city_previous
    features['ratio_10_times_timelong'] = ratio_10_times_timelong
    # Categorical features
    features['is_login_10_times_timelong'] = is_login_10_times_timelong
    features['is_login_city_the_very_first_time'] = is_login_city_the_very_first_time
    features['max_login_time'] = max_login_time
    features['min_login_time'] = min_login_time
    features['median_login_time'] = median_login_time
    features['max_min_login_time_dis'] = max_min_login_time_dis
    features['max_login_city_dis'] = max_login_city_dis
    features['min_login_city_dis'] = min_login_city_dis
    features['median_login_city'] = median_login_city
    features['min_login_timelong_all'] = min_login_timelong_all
    features['max_login_timelong_all'] = max_login_timelong_all
    features['avg_login_timelong_all'] = avg_login_timelong_all
    features['std_login_timelong_all'] = std_login_timelong_all
    features['min_login_timelong_previous'] = min_login_timelong_previous
    features['max_login_timelong_previous'] = max_login_timelong_previous
    features['avg_login_timelong_previous'] = avg_login_timelong_previous
    features['std_login_timelong_previous'] = std_login_timelong_previous
    features['min_login_city_all'] = min_login_city_all
    features['max_login_city_all'] = max_login_city_all
    features['avg_login_city_all'] = avg_login_city_all
    features['std_login_city_all'] = std_login_city_all
    features['min_login_city_previous'] = min_login_city_previous
    features['max_login_city_previous'] = max_login_city_previous
    features['avg_login_city_previous'] = avg_login_city_previous
    features['std_login_city_previous'] = std_login_city_previous
    print('Shape of Features 2:', features.shape)
    return features
def build_feat_3(login, trade):
    """Per-trade features comparing the user's most recent logins.

    For each trade row: whether the last two / last three pre-trade
    logins agree on log_from, device, ip, city and result; scan/sec
    counts and ratios over all logins and over pre-trade logins; and
    calendar-component gaps between the last login and the 2nd / 3rd
    most recent ones.

    Sentinels: -1 = not enough pre-trade logins for the comparison;
    -2 = the user has no login records at all.
    Returns one feature row per trade row, in trade order.
    """
    features = pd.DataFrame()
    # Categorical features
    is_last2_same_log_from = []  # do the last two logins share the same source
    is_last2_same_device = []
    is_last2_same_ip = []
    is_last2_same_city = []
    is_last2_same_result = []
    is_last3_same_log_from = []  # do the last three logins all share the same source
    is_last3_same_device = []
    is_last3_same_ip = []
    is_last3_same_city = []
    is_last3_same_result = []
    # scan/sec counts and ratios over all logins and pre-trade logins
    count_login_is_scan_all = []
    count_login_not_scan_all = []
    count_login_is_sec_all = []
    count_login_not_sec_all = []
    count_login_is_scan_previous = []
    count_login_not_scan_previous = []
    count_login_is_sec_previous = []
    count_login_not_sec_previous = []
    ratio_login_is_scan = []
    ratio_login_is_sec = []
    # Calendar gaps between the last login and the 2nd / 3rd most recent
    last1_login_month_dis = []
    last1_login_day_dis = []
    last1_login_hour_dis = []
    last1_login_minute_dis = []
    last2_login_month_dis = []
    last2_login_day_dis = []
    last2_login_hour_dis = []
    last2_login_minute_dis = []
    # Iterate over every trade record
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000 == 0:
            print("Processing till line : ", index)
        # Current trade row, its user id and timestamp
        each_line = trade.iloc[index]
        trade_time = float(each_line['time'])
        id = each_line['id']
        # All login records belonging to the same user id
        related_rows = login.loc[login['id'] == id]
        count_login_is_scan_all.append(len([1 for scan in related_rows['is_scan'].values if scan == 1]))
        count_login_not_scan_all.append(len([1 for scan in related_rows['is_scan'].values if scan != 1]))
        count_login_is_sec_all.append(len([1 for sec in related_rows['is_sec'].values if sec == 1]))
        count_login_not_sec_all.append(len([1 for sec in related_rows['is_sec'].values if sec != 1]))
        # Fixed: guard against users with no login records — the original
        # divided by related_rows.shape[0] unconditionally and raised
        # ZeroDivisionError.
        ratio_login_is_scan.append(0 if related_rows.shape[0] == 0 else len([1 for scan in related_rows['is_scan'].values if scan == 1]) / related_rows.shape[0])
        ratio_login_is_sec.append(0 if related_rows.shape[0] == 0 else len([1 for sec in related_rows['is_sec'].values if sec == 1]) / related_rows.shape[0])
        if related_rows.shape[0] != 0:
            # Logins that happened strictly before this trade
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            if previous_record.shape[0] != 0:
                count_login_is_scan_previous.append(len([1 for scan in previous_record['is_scan'].values if scan == 1]))
                count_login_not_scan_previous.append(len([1 for scan in previous_record['is_scan'].values if scan != 1]))
                count_login_is_sec_previous.append(len([1 for sec in previous_record['is_sec'].values if sec == 1]))
                # Fixed: the original counted sec == 1 here; 'not sec'
                # must count sec != 1.
                count_login_not_sec_previous.append(len([1 for sec in previous_record['is_sec'].values if sec != 1]))
            else:
                count_login_is_scan_previous.append(-1)
                count_login_not_scan_previous.append(-1)
                count_login_is_sec_previous.append(-1)
                count_login_not_sec_previous.append(-1)
            if previous_record.shape[0] >= 2:
                last1_login_record = previous_record.iloc[-1]  # most recent login
                last2_login_record = previous_record.iloc[-2]  # 2nd most recent login
                is_last2_same_log_from.append(1 if last1_login_record['log_from'] == last2_login_record['log_from'] else 0)
                is_last2_same_device.append(1 if last1_login_record['device'] == last2_login_record['device'] else 0)
                is_last2_same_ip.append(1 if last1_login_record['ip'] == last2_login_record['ip'] else 0)
                is_last2_same_city.append(1 if last1_login_record['city'] == last2_login_record['city'] else 0)
                is_last2_same_result.append(1 if last1_login_record['result'] == last2_login_record['result'] else 0)
                last1_login_month_dis.append(last1_login_record['login_real_month'] - last2_login_record['login_real_month'])
                last1_login_day_dis.append(last1_login_record['login_real_day'] - last2_login_record['login_real_day'])
                last1_login_hour_dis.append(last1_login_record['login_real_hour'] - last2_login_record['login_real_hour'])
                last1_login_minute_dis.append(last1_login_record['login_real_minute'] - last2_login_record['login_real_minute'])
            else:
                is_last2_same_log_from.append(-1)
                is_last2_same_device.append(-1)
                is_last2_same_ip.append(-1)
                is_last2_same_city.append(-1)
                is_last2_same_result.append(-1)
                last1_login_month_dis.append(-1)
                last1_login_day_dis.append(-1)
                last1_login_hour_dis.append(-1)
                last1_login_minute_dis.append(-1)
            if previous_record.shape[0] >= 3:
                last1_login_record = previous_record.iloc[-1]  # most recent login
                last2_login_record = previous_record.iloc[-2]  # 2nd most recent login
                last3_login_record = previous_record.iloc[-3]  # 3rd most recent login
                is_last3_same_log_from.append(1 if len(set([last1_login_record['log_from'], last2_login_record['log_from'], last3_login_record['log_from']])) == 1 else 0)
                is_last3_same_device.append(1 if len(set([last1_login_record['device'], last2_login_record['device'], last3_login_record['device']])) == 1 else 0)
                is_last3_same_ip.append(1 if len(set([last1_login_record['ip'], last2_login_record['ip'], last3_login_record['ip']])) == 1 else 0)
                is_last3_same_city.append(1 if len(set([last1_login_record['city'], last2_login_record['city'], last3_login_record['city']])) == 1 else 0)
                is_last3_same_result.append(1 if len(set([last1_login_record['result'], last2_login_record['result'], last3_login_record['result']])) == 1 else 0)
                last2_login_month_dis.append(last1_login_record['login_real_month'] - last3_login_record['login_real_month'])
                last2_login_day_dis.append(last1_login_record['login_real_day'] - last3_login_record['login_real_day'])
                last2_login_hour_dis.append(last1_login_record['login_real_hour'] - last3_login_record['login_real_hour'])
                last2_login_minute_dis.append(last1_login_record['login_real_minute'] - last3_login_record['login_real_minute'])
            else:
                is_last3_same_log_from.append(-1)
                is_last3_same_device.append(-1)
                is_last3_same_ip.append(-1)
                is_last3_same_city.append(-1)
                is_last3_same_result.append(-1)
                last2_login_month_dis.append(-1)
                last2_login_day_dis.append(-1)
                last2_login_hour_dis.append(-1)
                last2_login_minute_dis.append(-1)
        else:
            # No login records at all for this user id
            is_last2_same_log_from.append(-2)
            is_last2_same_device.append(-2)
            is_last2_same_ip.append(-2)
            is_last2_same_city.append(-2)
            is_last2_same_result.append(-2)
            is_last3_same_log_from.append(-2)
            is_last3_same_device.append(-2)
            is_last3_same_ip.append(-2)
            is_last3_same_city.append(-2)
            is_last3_same_result.append(-2)
            count_login_is_scan_previous.append(-2)
            count_login_not_scan_previous.append(-2)
            count_login_is_sec_previous.append(-2)
            count_login_not_sec_previous.append(-2)
            last1_login_month_dis.append(-2)
            last1_login_day_dis.append(-2)
            last1_login_hour_dis.append(-2)
            last1_login_minute_dis.append(-2)
            last2_login_month_dis.append(-2)
            last2_login_day_dis.append(-2)
            last2_login_hour_dis.append(-2)
            last2_login_minute_dis.append(-2)
    # Stitch the collected lists into the feature frame
    features['is_last2_same_log_from'] = is_last2_same_log_from
    features['is_last2_same_device'] = is_last2_same_device
    features['is_last2_same_ip'] = is_last2_same_ip
    features['is_last2_same_city'] = is_last2_same_city
    features['is_last2_same_result'] = is_last2_same_result
    features['is_last3_same_log_from'] = is_last3_same_log_from
    features['is_last3_same_device'] = is_last3_same_device
    features['is_last3_same_ip'] = is_last3_same_ip
    features['is_last3_same_city'] = is_last3_same_city
    features['is_last3_same_result'] = is_last3_same_result
    features['count_login_is_scan_all'] = count_login_is_scan_all
    features['count_login_not_scan_all'] = count_login_not_scan_all
    features['count_login_is_sec_all'] = count_login_is_sec_all
    features['count_login_not_sec_all'] = count_login_not_sec_all
    features['count_login_is_scan_previous'] = count_login_is_scan_previous
    features['count_login_not_scan_previous'] = count_login_not_scan_previous
    features['count_login_is_sec_previous'] = count_login_is_sec_previous
    features['count_login_not_sec_previous'] = count_login_not_sec_previous
    features['ratio_login_is_scan'] = ratio_login_is_scan
    features['ratio_login_is_sec'] = ratio_login_is_sec
    features['last1_login_month_dis'] = last1_login_month_dis
    features['last1_login_day_dis'] = last1_login_day_dis
    features['last1_login_hour_dis'] = last1_login_hour_dis
    features['last1_login_minute_dis'] = last1_login_minute_dis
    features['last2_login_month_dis'] = last2_login_month_dis
    features['last2_login_day_dis'] = last2_login_day_dis
    features['last2_login_hour_dis'] = last2_login_hour_dis
    features['last2_login_minute_dis'] = last2_login_minute_dis
    print('Shape of Features 3:', features.shape)
    return features
# Features that carry temporal (login-history) information.
def build_feat_4(login, trade):
    """Build risk-IP lookup features for each trade.

    For every row of ``trade``, look at the same user's last two logins that
    happened strictly before the trade and record whether their IPs appear in
    the precomputed risk-IP dictionaries.

    Sentinels: -1 = not enough earlier logins, -2 = user has no logins at all.
    Returns a DataFrame aligned positionally with ``trade``.
    """
    features = pd.DataFrame()
    # NOTE(review): these presumably map ip -> risk flag; the direct dict[ip]
    # lookups below raise KeyError for IPs absent from the dicts — confirm
    # get_risk_ip_dic() covers every ip occurring in `login`.
    last1_risk_ip_dic, last2_risk_ip_dic, all_previous_ip_dic = get_risk_ip_dic()
    # Initialise the feature lists.
    # Whether the previous login IPs appeared among the risk IPs.
    last1_in_last1_risk_ip = []
    last1_in_last2_risk_ip = []
    last2_in_last1_risk_ip = []
    last2_in_last2_risk_ip = []
    last1_in_all_previous_ip = []
    last2_in_all_previous_ip = []
    # Whether the user logged in / traded within a given window, login counts,
    # and whether a risky login already exists in the purchase history.
    # TODO:
    # Iterate over every trade record.
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000 == 0:
            print("Processing till line : ", index)
        # Current trade row.
        each_line = trade.iloc[index]
        # Trade id/time fields.
        trade_time = float(each_line['time'])
        id = each_line['id']
        # All logins of the same user.
        related_rows = login.loc[login['id'] == id]
        if related_rows.shape[0] != 0:
            # Logins that happened before this trade.
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            if previous_record.shape[0] >= 1:
                last1_login_record = previous_record.iloc[-1]  # most recent login
                last1_in_last1_risk_ip.append(last1_risk_ip_dic[last1_login_record['ip']])
                last1_in_last2_risk_ip.append(last2_risk_ip_dic[last1_login_record['ip']])
                last1_in_all_previous_ip.append(all_previous_ip_dic[last1_login_record['ip']])
            else:
                last1_in_last1_risk_ip.append(-1)
                last1_in_last2_risk_ip.append(-1)
                last1_in_all_previous_ip.append(-1)
            if previous_record.shape[0] >= 2:
                last2_login_record = previous_record.iloc[-2]  # second most recent login
                last2_in_last1_risk_ip.append(last1_risk_ip_dic[last2_login_record['ip']])
                last2_in_last2_risk_ip.append(last2_risk_ip_dic[last2_login_record['ip']])
                last2_in_all_previous_ip.append(all_previous_ip_dic[last2_login_record['ip']])
            else:
                last2_in_last1_risk_ip.append(-1)
                last2_in_last2_risk_ip.append(-1)
                last2_in_all_previous_ip.append(-1)
        else:
            last1_in_last1_risk_ip.append(-2)
            last1_in_last2_risk_ip.append(-2)
            last1_in_all_previous_ip.append(-2)
            last2_in_last1_risk_ip.append(-2)
            last2_in_last2_risk_ip.append(-2)
            last2_in_all_previous_ip.append(-2)
    features['last1_in_last1_risk_ip'] = last1_in_last1_risk_ip
    features['last1_in_last2_risk_ip'] = last1_in_last2_risk_ip
    features['last2_in_last1_risk_ip'] = last2_in_last1_risk_ip
    features['last2_in_last2_risk_ip'] = last2_in_last2_risk_ip
    features['last1_in_all_previous_ip'] = last1_in_all_previous_ip
    features['last2_in_all_previous_ip'] = last2_in_all_previous_ip
    print('Shape of Features 4:', features.shape)
    return features
# Features extracted from the training-set trade history.
def build_feat_5(trade, previous_train_trade):
    """Build label-history features for each trade.

    For every row of ``trade``, look up the same user's earlier trades in
    ``previous_train_trade`` (which carries the ``is_risk`` label) and
    summarise how risky the recent history was.

    Sentinels: -1 = not enough earlier trades, -2 = user absent from the
    history (trustee uses -10/-20 instead), -3 = no risky trade found
    (``last_trade_risk_dis`` only).  Returns a DataFrame aligned
    positionally with ``trade``.
    """
    features = pd.DataFrame()
    # Initialise the feature lists.
    is_last1_trade_risk = []
    is_last2_trade_risk = []
    is_last3_trade_risk = []
    last_trade_risk_dis = []
    last3_risk_trustee = []  # weight derived from the last three trades
    count_trade_risk = []
    ratio_trade_risk = []
    # Time distance to the previous trades.
    last1_trade_dis = []
    last2_trade_dis = []
    last3_trade_dis = []
    # Iterate over every trade record.
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000 == 0:
            print("Processing till line : ", index)
        # Current trade row.
        each_line = trade.iloc[index]
        # Trade id/time fields.
        trade_time = float(each_line['time'])
        id = each_line['id']
        # Earlier trades of the same user.
        related_rows = previous_train_trade.loc[previous_train_trade['id'] == id]
        if related_rows.shape[0] >= 1:
            # Trades that happened before the current one.
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            risk_array = previous_record['is_risk'].values
            if previous_record.shape[0] >= 1:
                # print(previous_record['is_risk'].values)
                last1_record = previous_record.iloc[-1]
                is_last1_trade_risk.append(1 if last1_record['is_risk']==1 else 0)
                # How many trades ago the most recent risky trade happened.
                last_trade_risk_dis.append((len(risk_array) - np.where(risk_array == 1)[0][-1]) if (1 in risk_array) else -3)
                count_trade_risk.append(np.sum(risk_array == 1))
                ratio_trade_risk.append(np.sum(risk_array == 1) / len(risk_array))
                last1_trade_dis.append(trade_time - float(last1_record['time']))
            else:
                is_last1_trade_risk.append(-1)
                last_trade_risk_dis.append(-1)
                count_trade_risk.append(-1)
                # NOTE(review): this list uses 0 where its siblings use -1 —
                # presumably intentional ("no risk observed"); confirm.
                ratio_trade_risk.append(0)
                last1_trade_dis.append(-1)
            if previous_record.shape[0] >= 2:
                last2_record = previous_record.iloc[-2]
                is_last2_trade_risk.append(1 if last2_record['is_risk'] == 1 else 0)
                last2_trade_dis.append(trade_time - float(last2_record['time']))
            else:
                is_last2_trade_risk.append(-1)
                last2_trade_dis.append(-1)
            if previous_record.shape[0] >= 3:
                last1_record = previous_record.iloc[-1]
                last2_record = previous_record.iloc[-2]
                last3_record = previous_record.iloc[-3]
                is_last3_trade_risk.append(1 if last3_record['is_risk'] == 1 else 0)
                last3_trade_dis.append(trade_time - float(last3_record['time']))
                # Hand-tuned trust score: more recent risky trades weigh more.
                if last1_record['is_risk'] == 1 and last2_record['is_risk'] == 1 and last3_record['is_risk'] == 1:
                    last3_risk_trustee.append(100)
                elif last1_record['is_risk'] == 1 and last2_record['is_risk'] == 1 and last3_record['is_risk'] == 0:
                    last3_risk_trustee.append(70)
                elif last1_record['is_risk'] == 1 and last2_record['is_risk'] == 0 and last3_record['is_risk'] == 1:
                    last3_risk_trustee.append(60)
                elif last1_record['is_risk'] == 1 and last2_record['is_risk'] == 0 and last3_record['is_risk'] == 0:
                    last3_risk_trustee.append(40)
                elif last1_record['is_risk'] == 0 and last2_record['is_risk'] == 1 and last3_record['is_risk'] == 1:
                    last3_risk_trustee.append(30)
                elif last1_record['is_risk'] == 0 and last2_record['is_risk'] == 0 and last3_record['is_risk'] == 1:
                    last3_risk_trustee.append(20)
                else:
                    # NOTE(review): the (0, 1, 0) pattern also lands here — confirm intended.
                    last3_risk_trustee.append(0)
            else:
                is_last3_trade_risk.append(-1)
                last3_risk_trustee.append(-10)
                last3_trade_dis.append(-1)
        else:
            is_last1_trade_risk.append(-2)
            is_last2_trade_risk.append(-2)
            is_last3_trade_risk.append(-2)
            last_trade_risk_dis.append(-2)
            last3_risk_trustee.append(-20)
            count_trade_risk.append(-2)
            ratio_trade_risk.append(-2)
            last1_trade_dis.append(-2)
            last2_trade_dis.append(-2)
            last3_trade_dis.append(-2)
    features['is_last1_trade_risk'] = is_last1_trade_risk
    features['is_last2_trade_risk'] = is_last2_trade_risk
    features['is_last3_trade_risk'] = is_last3_trade_risk
    features['last_trade_risk_dis'] = last_trade_risk_dis
    features['last3_risk_trustee'] = last3_risk_trustee
    features['count_trade_risk'] = count_trade_risk
    features['ratio_trade_risk'] = ratio_trade_risk
    features['last1_trade_dis'] = last1_trade_dis
    features['last2_trade_dis'] = last2_trade_dis
    features['last3_trade_dis'] = last3_trade_dis
    print('Shape of Features 5:', features.shape)
    return features
def build_feat_6(login, trade):
    """Build risk features from the last two *distinct* login IPs before each trade.

    For every row of ``trade``, find the most recent login IP (``last1_ip``)
    and the most recent login IP that differs from it (``last2_ip``), then
    record their risk std/type scores and risk-dictionary membership.
    All lists use 0 when a value is unavailable (no logins, no second IP,
    or IP not scored).  Returns a DataFrame aligned positionally with
    ``trade``.

    Bug fix: in the branch where a second distinct IP exists, the
    ``real_last1_in_*`` lists were filled from ``last2_ip`` instead of
    ``last1_ip`` (copy-paste error); they now use ``last1_ip``, matching
    the single-IP branch.
    """
    features = pd.DataFrame()
    ip_to_risk_std, ip_to_risk_type = get_most_risk_ip_dic()
    # NOTE(review): the real_last*_ lookups below index these dicts directly
    # and raise KeyError for unseen IPs — confirm full coverage.
    last1_risk_ip_dic, last2_risk_ip_dic, all_previous_ip_dic = get_risk_ip_dic()
    # Risk degree of the risk IPs.
    last1_ip_risk_std = []
    last1_ip_risk_type = []
    last2_ip_risk_std = []
    last2_ip_risk_type = []
    # Whether the last two distinct IPs are known risk IPs.
    real_last1_in_last1_risk_ip = []
    real_last1_in_last2_risk_ip = []
    real_last2_in_last1_risk_ip = []
    real_last2_in_last2_risk_ip = []
    real_last1_in_all_previous_ip = []
    real_last2_in_all_previous_ip = []
    # Iterate over every trade record.
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000 == 0:
            print("Processing till line : ", index)
        # Current trade row.
        each_line = trade.iloc[index]
        # Trade id/time fields.
        trade_time = float(each_line['time'])
        id = each_line['id']
        # All logins of the same user.
        related_rows = login.loc[login['id'] == id]
        if related_rows.shape[0] != 0:
            # Logins that happened before this trade.
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            if previous_record.shape[0] == 0:
                # No logins before this trade: neutral zeros everywhere.
                last1_ip_risk_std.append(0)
                last1_ip_risk_type.append(0)
                last2_ip_risk_std.append(0)
                last2_ip_risk_type.append(0)
                real_last1_in_last1_risk_ip.append(0)
                real_last1_in_last2_risk_ip.append(0)
                real_last1_in_all_previous_ip.append(0)
                real_last2_in_last1_risk_ip.append(0)
                real_last2_in_last2_risk_ip.append(0)
                real_last2_in_all_previous_ip.append(0)
            else:
                ips = previous_record['ip'].tolist()
                last1_ip = ips[-1]
                # Scan backwards for the most recent IP that differs from last1_ip.
                # NOTE(review): 0 is the "not found" sentinel — assumes 0 is
                # never a real IP value; confirm against the data.
                last2_ip = 0
                for ip in ips[::-1]:
                    if ip != last1_ip:
                        last2_ip = ip
                        break
                # No second distinct IP exists.
                if last2_ip == 0:
                    if last1_ip in ip_to_risk_std.keys():
                        last1_ip_risk_std.append(ip_to_risk_std[last1_ip])
                        last1_ip_risk_type.append(ip_to_risk_type[last1_ip])
                    else:
                        last1_ip_risk_std.append(0)
                        last1_ip_risk_type.append(0)
                    last2_ip_risk_std.append(0)
                    last2_ip_risk_type.append(0)
                    real_last1_in_last1_risk_ip.append(last1_risk_ip_dic[last1_ip])
                    real_last1_in_last2_risk_ip.append(last2_risk_ip_dic[last1_ip])
                    real_last1_in_all_previous_ip.append(all_previous_ip_dic[last1_ip])
                    real_last2_in_last1_risk_ip.append(0)
                    real_last2_in_last2_risk_ip.append(0)
                    real_last2_in_all_previous_ip.append(0)
                else:
                    if last1_ip in ip_to_risk_std.keys():
                        last1_ip_risk_std.append(ip_to_risk_std[last1_ip])
                        last1_ip_risk_type.append(ip_to_risk_type[last1_ip])
                    else:
                        last1_ip_risk_std.append(0)
                        last1_ip_risk_type.append(0)
                    if last2_ip in ip_to_risk_std.keys():
                        last2_ip_risk_std.append(ip_to_risk_std[last2_ip])
                        last2_ip_risk_type.append(ip_to_risk_type[last2_ip])
                    else:
                        last2_ip_risk_std.append(0)
                        last2_ip_risk_type.append(0)
                    # Fixed: look up last1_ip here (was last2_ip — copy-paste bug).
                    real_last1_in_last1_risk_ip.append(last1_risk_ip_dic[last1_ip])
                    real_last1_in_last2_risk_ip.append(last2_risk_ip_dic[last1_ip])
                    real_last1_in_all_previous_ip.append(all_previous_ip_dic[last1_ip])
                    real_last2_in_last1_risk_ip.append(last1_risk_ip_dic[last2_ip])
                    real_last2_in_last2_risk_ip.append(last2_risk_ip_dic[last2_ip])
                    real_last2_in_all_previous_ip.append(all_previous_ip_dic[last2_ip])
        else:
            # User never logged in: neutral zeros everywhere.
            last1_ip_risk_std.append(0)
            last1_ip_risk_type.append(0)
            last2_ip_risk_std.append(0)
            last2_ip_risk_type.append(0)
            real_last1_in_last1_risk_ip.append(0)
            real_last1_in_last2_risk_ip.append(0)
            real_last1_in_all_previous_ip.append(0)
            real_last2_in_last1_risk_ip.append(0)
            real_last2_in_last2_risk_ip.append(0)
            real_last2_in_all_previous_ip.append(0)
    features['last1_ip_risk_std'] = last1_ip_risk_std
    features['last1_ip_risk_type'] = last1_ip_risk_type
    features['last2_ip_risk_std'] = last2_ip_risk_std
    features['last2_ip_risk_type'] = last2_ip_risk_type
    features['real_last1_in_last1_risk_ip'] = real_last1_in_last1_risk_ip
    features['real_last1_in_last2_risk_ip'] = real_last1_in_last2_risk_ip
    features['real_last1_in_all_previous_ip'] = real_last1_in_all_previous_ip
    features['real_last2_in_last1_risk_ip'] = real_last2_in_last1_risk_ip
    features['real_last2_in_last2_risk_ip'] = real_last2_in_last2_risk_ip
    features['real_last2_in_all_previous_ip'] = real_last2_in_all_previous_ip
    print('Shape of Features 6:', features.shape)
    return features
def delete_bad_feat(features):
    """Drop feature columns that were found to hurt model performance.

    Mutates ``features`` in place and returns the same DataFrame.
    Raises KeyError if any listed column is missing, exactly like ``del``.
    """
    bad_columns = (
        'last_login_from',
        'last_login_is_scan',
        'last_login_is_sec',
        'count_login_is_sec_previous',
        'count_login_not_sec_previous',
        'ratio_login_is_sec',
        'last1_login_month_dis',
        'last2_login_month_dis',
    )
    for column in bad_columns:
        del features[column]
    return features
# Public entry point.
def load_data():
    """Build train/test feature matrices for the risk model.

    Returns (train_x, train_y, test_x, rowkey) where train_y is the
    ``is_risk`` label array and rowkey identifies the test trades.

    Bug fix: ``df_all_login`` was referenced by the one_hot calls but its
    definition was commented out, causing a NameError at runtime; it is now
    built with ``pd.concat`` (DataFrame ``+`` would add element-wise, not
    stack rows).
    """
    df_train_login, df_test_login, df_train_trade, df_test_trade = preprocess()
    rowkey = df_test_trade['rowkey'].values
    train_y = df_train_trade['is_risk'].values
    # del df_train_trade['is_risk']
    # Using train+test logins for the encoders fixes the division-by-zero
    # problem but leaks test information; otherwise add a filtering check.
    train_x = build_feat(df_train_login.copy(), df_train_trade.copy(), type='train', mode='Seperate', train_trade=None)
    test_x = build_feat(df_test_login.copy(), df_test_trade.copy(), type='test', mode='Seperate', train_trade=df_train_trade.copy())
    print("Start One-Hot Encoding ... ")
    # Encoders must see every category value from both splits so that
    # transform() on the test set never meets an unknown category.
    # NOTE(review): this leaks test-set category values into training — the
    # original author flagged this trade-off; confirm it is acceptable.
    df_all_login = pd.concat([df_train_login, df_test_login], ignore_index=True)
    train_x = one_hot(df_all_login.copy(), train_x, test_x, 1)
    test_x = one_hot(df_all_login.copy(), test_x, train_x, 2)
    return train_x, train_y, test_x, rowkey
# Script entry point: build and encode the full feature matrices when run directly.
if __name__ == '__main__':
    load_data()
特征工程 : OneHot
亲测Feature[0, 1, 5, 6] 经过OneHot后效果有提升, 能到0.82以上
对重要的连续型特征做分片后,能到0.85左右
def one_hot(df_all_login, df_target, df_other, num):
    """One-hot encode the categorical feature columns of ``df_target``.

    ``df_all_login`` supplies the full category vocabulary for the encoders;
    ``df_other`` is the opposite split (test when encoding train and vice
    versa), used to fit the shared feat-5 encoders and to keep both splits'
    columns in sync; ``num`` is 1 for train, 2 for test (some deletions run
    only on the second call so the first call still sees the columns).

    NOTE(review): OneHotEncoder is presumably the legacy
    sklearn.preprocessing.OneHotEncoder that accepts integer category codes;
    its import is not visible in this excerpt — confirm it exists at the top
    of the file.  The hard-coded column counts (11 results, 12 log_from,
    485 cities, 4 types, 5 time zones, 4 risk states) assume fixed category
    cardinalities in the data — TODO confirm.  ``process_time`` is defined
    elsewhere in this file; it presumably buckets a timestamp into one of
    5 time zones.  The comprehension variable ``num`` shadows the parameter
    but is comprehension-local in Python 3, so the parameter is unaffected.
    """
    # Fit one encoder per categorical login column; append a 0 so the
    # encoders also know the 0 sentinel used for "missing".
    enc_result = OneHotEncoder()
    enc_result.fit(np.append(df_all_login['result'].values, 0).reshape(-1, 1))
    enc_device = OneHotEncoder()
    enc_device.fit(np.append(df_all_login['device'].values, 0).reshape(-1, 1))
    enc_logfrom = OneHotEncoder()
    enc_logfrom.fit(np.append(df_all_login['log_from'].values, 0).reshape(-1, 1))
    enc_ip = OneHotEncoder()
    enc_ip.fit(np.append(df_all_login['ip'].values, 0).reshape(-1, 1))
    enc_city = OneHotEncoder()
    enc_city.fit(np.append(df_all_login['city'].values, 0).reshape(-1, 1))
    enc_type = OneHotEncoder()
    enc_type.fit(np.append(df_all_login['type'].values, 0).reshape(-1, 1))
    feature_1 = pd.DataFrame(enc_result.transform(df_target['last_login_result'].values.reshape(-1, 1)).toarray())
    feature_1.columns = ["last_login_result_"+str(num) for num in range(1, 12)]
    del df_target['last_login_result']
    # Feat 1 ##########################
    df_target['timezone'] = df_target['last_login_real_time'].apply(process_time)
    enc_time = OneHotEncoder()
    enc_time.fit(df_target['timezone'].values.reshape(-1, 1))
    feature_1_1 = pd.DataFrame(enc_time.transform(df_target['timezone'].values.reshape(-1, 1)).toarray())
    feature_1_1.columns = ['Last_Login_TimeZone_'+str(num) for num in range(1, 6)]
    del df_target['last_login_real_time']
    df_target['timezone'] = df_target['last_trade_real_time'].apply(process_time)
    enc_time = OneHotEncoder()
    enc_time.fit(df_target['timezone'].values.reshape(-1, 1))
    feature_1_2 = pd.DataFrame(enc_time.transform(df_target['timezone'].values.reshape(-1, 1)).toarray())
    feature_1_2.columns = ['Last_Trade_TimeZone_'+str(num) for num in range(1, 6)]
    del df_target['last_trade_real_time']
    del df_target['timezone']
    feature_1_3 = pd.DataFrame(enc_logfrom.transform(df_target['last_login_from'].values.reshape(-1, 1)).toarray())
    feature_1_3.columns = ["last_login_from_"+str(num) for num in range(1, 13)]
    del df_target['last_login_from']
    feature_1_4 = pd.DataFrame(enc_city.transform(df_target['last_login_city'].values.reshape(-1, 1)).toarray())
    feature_1_4.columns = ["last_login_city_" + str(num) for num in range(1, 486)]
    del df_target['last_login_city']
    feature_1_5 = pd.DataFrame(enc_type.transform(df_target['last_login_type'].values.reshape(-1, 1)).toarray())
    feature_1_5.columns = ["last_login_type_" + str(num) for num in range(1, 5)]
    del df_target['last_login_type']
    # Drop the weak features from feature set 3 (only on the second call, so
    # the first call leaves them in place for the shared fit below).
    if num == 2:
        del df_target['last1_login_month_dis']
        del df_other['last1_login_month_dis']
        del df_target['last1_login_day_dis']
        del df_other['last1_login_day_dis']
        del df_target['last1_login_hour_dis']
        del df_other['last1_login_hour_dis']
        del df_target['last1_login_minute_dis']
        del df_other['last1_login_minute_dis']
        del df_target['last2_login_month_dis']
        del df_other['last2_login_month_dis']
        del df_target['last2_login_day_dis']
        del df_other['last2_login_day_dis']
        del df_target['last2_login_hour_dis']
        del df_other['last2_login_hour_dis']
        del df_target['last2_login_minute_dis']
        del df_other['last2_login_minute_dis']
    # Feat 5 ##############################
    # Fit on both splits so train and test share the same category set.
    enc = OneHotEncoder()
    enc.fit(np.append(df_target['is_last1_trade_risk'].values, df_other['is_last1_trade_risk'].values).reshape(-1, 1))
    feature_5_1 = pd.DataFrame(
        enc.transform(df_target['is_last1_trade_risk'].values.reshape(-1, 1)).toarray())
    feature_5_1.columns = ['is_last1_trade_risk' + str(num) for num in range(1, 5)]
    if num == 2:
        del df_target['is_last1_trade_risk']
        del df_other['is_last1_trade_risk']
    enc = OneHotEncoder()
    enc.fit(np.append(df_target['is_last2_trade_risk'].values, df_other['is_last2_trade_risk']).reshape(-1, 1))
    feature_5_2 = pd.DataFrame(
        enc.transform(df_target['is_last2_trade_risk'].values.reshape(-1, 1)).toarray())
    feature_5_2.columns = ['is_last2_trade_risk' + str(num) for num in range(1, 5)]
    if num == 2:
        del df_target['is_last2_trade_risk']
        del df_other['is_last2_trade_risk']
    enc = OneHotEncoder()
    enc.fit(np.append(df_target['is_last3_trade_risk'].values, df_other['is_last3_trade_risk']).reshape(-1, 1))
    feature_5_3 = pd.DataFrame(
        enc.transform(df_target['is_last3_trade_risk'].values.reshape(-1, 1)).toarray())
    feature_5_3.columns = ['is_last3_trade_risk' + str(num) for num in range(1, 5)]
    if num == 2:
        del df_target['is_last3_trade_risk']
        del df_other['is_last3_trade_risk']
    # Concatenate the remaining raw columns with the encoded blocks
    # (positional index alignment — assumes default RangeIndex everywhere).
    all_features = pd.concat([df_target, feature_1], axis=1)
    all_features = pd.concat([all_features, feature_1_1], axis=1)
    all_features = pd.concat([all_features, feature_1_2], axis=1)
    all_features = pd.concat([all_features, feature_1_3], axis=1)
    all_features = pd.concat([all_features, feature_1_4], axis=1)
    all_features = pd.concat([all_features, feature_1_5], axis=1)
    all_features = pd.concat([all_features, feature_5_1], axis=1)
    all_features = pd.concat([all_features, feature_5_2], axis=1)
    all_features = pd.concat([all_features, feature_5_3], axis=1)
    print(all_features.info())
    return all_features
模型
主要用了LightGBM, XGBoost, Keras(MLP)
代码模板还是在这篇文章里 : http://blog.csdn.net/leyounger/article/details/78667538
Stacking
AI法官用的Stacking模型, 感觉没啥效果,可能是模型太相似了,学不到啥东西
分享一个Keras的分类可视化图,好多重叠啊 T.T
这只是MLP的效果,大家可以试试FNN, PNN,CCPM等FM(或FFM)与DNN结合的模型哦. [比心]
写在最后
本人陕西边家村读本科,保送成都清水寺读研,NLP方向烟酒僧一枚,本人勤快好学,如有大神收留,请收下我的膝盖 Orz