[比赛分享] 2017 JD京东 比赛代码开源分享

摘要

AI法官队友弃赛,吾心甚痛,欲置ML与身外,奈何毕业将至,身无长物,故,重新振作看论文,打比赛。。。

不扯了,上代码


特征工程

说明 : 粗简版Feature,未OneHot,很多不合理特征,没有不平衡数据处理

得分: 多模型Voting 0.79, 单模型0.75左右,未仔细调参

(不平衡数据的处理看这篇文章的最下边):
http://blog.csdn.net/leyounger/article/details/78667538

import pandas as pd
import numpy as np
import datetime
import time
import copy
import os
from collections import Counter
from model.view_risk import get_risk_ip_dic
from model.view_risk import get_most_risk_ip_dic


def get_datetime(timestamp):
    """Convert a unix timestamp (str, int or float) to a local datetime."""
    return datetime.datetime.fromtimestamp(float(timestamp))

def get_timestamp(dt):
    """Parse a 'YYYY-MM-DD HH:MM:SS' string into a unix-timestamp string.

    Uses time.mktime, i.e. the string is interpreted in local time.
    """
    parsed = datetime.datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
    return str(time.mktime(parsed.timetuple()))

def mytime(time):
    """Seconds elapsed since the start of the year for a 'YYYY-MM-DD HH:MM:SS' string.

    NOTE(review): the day offsets (Feb=+31, Mar=+61, ...) do not match real
    calendar month lengths; presumably the contest data uses these synthetic
    offsets deliberately — confirm before reusing elsewhere.
    The parameter name shadows the `time` module, but the module is not
    needed inside this function.
    """
    days_before_month = {1: 0, 2: 31, 3: 61, 4: 92, 5: 122, 6: 153, 7: 183, 8: 214}
    total_days = days_before_month[int(time[5:7])] + int(time[8:10])
    seconds_in_day = int(time[11:13]) * 3600 + int(time[14:16]) * 60 + int(time[17:19])
    return total_days * 24 * 60 * 60 + seconds_in_day


# 一系列提取特征的函数

# 判断target是否为array中的大多数
def is_majority(target, array, n):
    """Return 1 if target is among the n most frequent items of array, else 0.

    Returns the sentinel 2 when array holds a single item (undecidable with
    only one candidate).
    """
    if len(array) == 1:
        return 2

    top_items = Counter(array).most_common(n)
    return 1 if any(value == target for value, _count in top_items) else 0

# 判断target是否在array中至少出现n次
def is_exists(target, array, n):
    return 1 if array.count(target) >= n else 0



# 预处理步骤
# Load and normalise the four raw CSV files.
def preprocess():
    """Read train/test login and trade CSVs, normalise times, sort by user.

    Returns (df_train_login, df_test_login, df_train_trade, df_test_trade).
    """
    data_dir = '../data/train/'

    def load_login(filename):
        # Login rows already carry a numeric 'timestamp' column.
        df = pd.read_csv(data_dir + filename)
        df.sort_values(by=['id', 'timestamp', 'result'], inplace=True, ascending=True)
        return df

    def load_trade(filename):
        # Trade 'time' is a datetime string with a trailing '.0' fraction;
        # strip it and convert to a unix-timestamp string.
        df = pd.read_csv(data_dir + filename)
        df['time'] = df['time'].apply(lambda dt: get_timestamp(dt[:-2]))
        df.sort_values(by=['id', 'time'], inplace=True, ascending=True)
        return df

    df_train_login = load_login('t_login.csv')
    df_test_login = load_login('t_login_test.csv')
    df_train_trade = load_trade('t_trade.csv')
    df_test_trade = load_trade('t_trade_test.csv')

    return df_train_login, df_test_login, df_train_trade, df_test_trade


def build_feat(login, trade, type, mode, train_trade):
    """Build the full feature matrix for one dataset split.

    Parameters
    ----------
    login, trade : DataFrames with the login / trade records of this split.
    type : 'train' or 'test'; selects the HDF cache file prefix and whether
        the training trade history is injected for test-time features.
        (NOTE: shadows the builtin `type`; name kept for caller compatibility.)
    mode : cache sub-directory name under ../data/other/hdf/.
    train_trade : training-set trades; only used when type == 'test'.

    Returns
    -------
    DataFrame with all feature groups concatenated column-wise.
    """

    print('Start Initiating Features Array ... ')

    def _expand_time(df, src_col, prefix):
        # Derive calendar components (month..second) from a timestamp column.
        for unit in ('month', 'day', 'hour', 'minute', 'second'):
            df[prefix + '_real_' + unit] = df[src_col].apply(
                lambda x, u=unit: getattr(get_datetime(x), u))

    _expand_time(login, 'timestamp', 'login')
    _expand_time(trade, 'time', 'trade')

    login['time'] = login['timestamp']
    trade['time'] = trade['time'].apply(lambda x: float(x))
    del login['timestamp']
    del trade['rowkey']

    # result > 0 means the login attempt succeeded.
    login['login_result'] = login['result'].apply(lambda x: 1 if x > 0 else 0)
    login['is_scan'] = login['is_scan'].apply(lambda x: 1 if x else 0)
    login['is_sec'] = login['is_sec'].apply(lambda x: 1 if x else 0)

    # For the test split, pull in the training trade history so the
    # "previous trade" features can see across the train/test boundary.
    if type == 'test':
        _expand_time(train_trade, 'time', 'trade')
        train_trade['time'] = train_trade['time'].apply(lambda x: float(x))
        del train_trade['rowkey']
        previous_train_trade = train_trade
    else:
        previous_train_trade = trade

    def _load_or_build(num, builder, *args):
        # Each feature group is cached as an HDF5 file (key 'w') so that
        # re-runs skip the expensive per-trade loops.
        path = '../data/other/hdf/' + mode + '/' + type + '_feature_' + str(num) + '.hdf'
        print('Building Feature ' + str(num) + ' ... ', mode, type)
        if os.path.exists(path):
            return pd.read_hdf(path, 'w')
        feat = builder(*args)
        feat.to_hdf(path, 'w')
        return feat

    parts = [
        build_feat_0(login, trade),  # cheap, never cached
        _load_or_build(1, build_feat_1, login, trade),
        _load_or_build(2, build_feat_2, login, trade),
        _load_or_build(3, build_feat_3, login, trade),
        _load_or_build(4, build_feat_4, login, trade),
        # Feature 5 is derived from trade history only.
        _load_or_build(5, build_feat_5, trade, previous_train_trade),
        _load_or_build(6, build_feat_6, login, trade),
    ]

    all_features = pd.concat(parts, axis=1)

    print('Shape of All Features:', all_features.shape)
    print('----------------------- End of Feature Extracting ----------------------')
    return all_features

def build_feat_0(login, trade):
    """Feature group 0: the raw trade-time columns (unix time + calendar parts)."""
    features = pd.DataFrame()

    # Copy the pre-computed calendar columns straight from the trade frame.
    features['time'] = trade['time'].apply(float)
    for column in ('trade_real_month', 'trade_real_day', 'trade_real_hour',
                   'trade_real_minute', 'trade_real_second'):
        features[column] = trade[column]

    print('Shape of Features 0:', features.shape)
    return features

def build_feat_1(login, trade):
    """Feature group 1: per-trade features derived from the user's login history.

    For each trade row, all login records of the same user are looked up and
    three kinds of features are produced: attributes of the most recent login
    before the trade, count features, and categorical (usual / seen-before)
    flags.  Sentinel conventions: the -1 family means the user has logins but
    none before this trade; the -2 family means the user has no login records
    at all.  Assumes `login` and `trade` are sorted by (id, time) — TODO
    confirm against preprocess().
    """

    features = pd.DataFrame()

    # One accumulator list per feature column; every branch below must append
    # exactly once per trade row so the lists stay in lock-step.
    last_login_time = []
    last_login_result = []
    last_login_timelong = []
    last_login_device = []
    last_login_from = []
    last_login_ip = []
    last_login_city = []
    last_login_type = []
    last_login_is_scan = []
    last_login_is_sec = []
    last_trade_time = []



    # Count features
    count_login_all = []  # total number of logins for this user
    count_login_previous = []  # number of logins before this trade
    count_trade_all = []
    count_trade_previous = []
    count_login_fail_all = []  # total number of failed logins (result != 1)
    count_login_fail_previous = []
    count_login_succ_all = []
    count_login_succ_previous = []
    avg_login_previous = []  # mean timelong over all logins minus the last login's timelong
    max_login_previous = []
    min_login_previous = []
    count_login_ip_all = []  # occurrences of the last login's IP across ALL login rows; very frequent IPs look anomalous
    count_login_device_all = []
    count_login_city_all = []

    # Categorical features
    is_login_device_usual = []
    is_login_from_usual = []
    is_login_ip_usual = []
    is_login_city_usual = []
    is_login_type_usual = []
    is_login_device_exists_previous = []  # whether the last login's device was seen in earlier logins
    is_login_from_exists_previous = []
    is_login_ip_exists_previous = []
    is_login_city_exists_previous = []
    is_login_type_exists_previous = []


    # Pre-compute global frequency tables once; `login` may contain both the
    # train and test login rows, so these counts span the whole history.
    print('Start Pre-Calculating ... ')
    set_ip = Counter(login['ip'].tolist())
    set_device = Counter(login['device'].tolist())
    set_city = Counter(login['city'].tolist())

    # Iterate over every trade record
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000==0:
            print("Processing till line : ", index)

        # Current trade row
        each_line = trade.iloc[index]

        # id / time of the current trade (NOTE: `id` shadows the builtin)
        trade_time = float(each_line['time'])
        id = each_line['id']

        # All login records belonging to the same user id
        related_rows = login.loc[login['id'] == id]
        count_login_all.append(related_rows.shape[0])
        count_login_fail_all.append(len([1 for num in related_rows['result'].tolist() if num != 1]))
        count_login_succ_all.append(len([1 for num in related_rows['result'].tolist() if num == 1]))

        if related_rows.shape[0] != 0:
            # Login records strictly before the current trade
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            count_login_previous.append(previous_record.shape[0])
            count_login_fail_previous.append(len([1 for num in previous_record['result'].tolist() if num != 1]))
            count_login_succ_previous.append(len([1 for num in previous_record['result'].tolist() if num == 1]))

            if previous_record.shape[0] != 0:
                first_login_record = previous_record.iloc[0]  # the user's first login before the trade
                last_login_record = previous_record.iloc[-1]  # the most recent login before the trade

                last_login_time.append(trade_time - last_login_record['time'])
                last_login_result.append(last_login_record['result'])
                last_login_timelong.append(last_login_record['timelong'])
                last_login_device.append(last_login_record['device'])
                last_login_from.append(last_login_record['log_from'])
                last_login_ip.append(last_login_record['ip'])
                last_login_city.append(last_login_record['city'])
                last_login_type.append(last_login_record['type'])
                last_login_is_scan.append(last_login_record['is_scan'])
                last_login_is_sec.append(last_login_record['is_sec'])

                # Count features: last login's timelong vs. the user's stats
                avg_login_previous.append(np.average(related_rows['timelong']) - last_login_record['timelong'])
                max_login_previous.append(np.max(related_rows['timelong']) - last_login_record['timelong'])
                min_login_previous.append(np.min(related_rows['timelong']) - last_login_record['timelong'])
                count_login_ip_all.append(set_ip[last_login_record['ip']])
                count_login_device_all.append(set_device[last_login_record['device']])
                count_login_city_all.append(set_city[last_login_record['city']])

                # Categorical features
                is_login_device_usual.append(
                    is_majority(last_login_record['device'], related_rows['device'], 2))  # is the last login device one of the user's usual devices
                is_login_from_usual.append(is_majority(last_login_record['log_from'], related_rows['log_from'], 2))
                is_login_ip_usual.append(is_majority(last_login_record['ip'], related_rows['ip'], 2))
                is_login_city_usual.append(is_majority(last_login_record['city'], related_rows['city'], 2))
                is_login_type_usual.append(is_majority(last_login_record['type'], related_rows['type'], 2))

                is_login_device_exists_previous.append(
                    is_exists(last_login_record['device'], previous_record['device'].tolist(), 2))  # was the last login device already used before
                is_login_from_exists_previous.append(
                    is_exists(last_login_record['log_from'], previous_record['log_from'].tolist(), 2))
                is_login_ip_exists_previous.append(
                    is_exists(last_login_record['ip'], previous_record['ip'].tolist(), 2))
                is_login_city_exists_previous.append(
                    is_exists(last_login_record['city'], previous_record['city'].tolist(), 2))
                is_login_type_exists_previous.append(
                    is_exists(last_login_record['type'], previous_record['type'].tolist(), 2))

            else:
                last_login_time.append(-1)  # no login before this trade, but the user does log in later; fairly trustworthy, sentinel -1
                last_login_result.append(-40)
                last_login_timelong.append(-1)
                last_login_device.append(0)
                last_login_from.append(0)
                last_login_ip.append(0)
                last_login_city.append(0)
                last_login_type.append(0)
                last_login_is_scan.append(-1)
                last_login_is_sec.append(-1)

                # Count features
                avg_login_previous.append(0)
                max_login_previous.append(0)
                min_login_previous.append(0)
                count_login_ip_all.append(0)
                count_login_device_all.append(0)
                count_login_city_all.append(0)

                # Categorical features
                is_login_device_usual.append(0)
                is_login_from_usual.append(0)
                is_login_ip_usual.append(0)
                is_login_city_usual.append(0)
                is_login_type_usual.append(0)

                is_login_device_exists_previous.append(-1)  # unknown: no earlier login to compare against
                is_login_from_exists_previous.append(-1)
                is_login_ip_exists_previous.append(-1)
                is_login_city_exists_previous.append(-1)
                is_login_type_exists_previous.append(-1)


        else:
            # Users with no login records at all (neither before nor after
            # this trade) fall through here; sentinel -2 family.
            last_login_time.append(-2)
            last_login_result.append(-41)
            last_login_timelong.append(-2)
            last_login_device.append(0)
            last_login_from.append(0)
            last_login_ip.append(0)
            last_login_city.append(-1)
            last_login_type.append(-1)
            last_login_is_scan.append(-1)
            last_login_is_sec.append(-1)

            # Count features
            count_login_previous.append(0)
            avg_login_previous.append(0)
            max_login_previous.append(0)
            min_login_previous.append(0)
            count_login_succ_previous.append(0)
            count_login_fail_previous.append(0)
            count_login_ip_all.append(0)
            count_login_device_all.append(0)
            count_login_city_all.append(0)

            # Categorical features
            is_login_device_usual.append(0)
            is_login_from_usual.append(0)
            is_login_ip_usual.append(0)
            is_login_city_usual.append(0)
            is_login_type_usual.append(0)

            is_login_device_exists_previous.append(-2)
            is_login_from_exists_previous.append(-2)
            is_login_ip_exists_previous.append(-2)
            is_login_city_exists_previous.append(-2)
            is_login_type_exists_previous.append(-2)

        # All trade records of the same user id
        related_rows = trade.loc[trade['id'] == id]
        count_trade_all.append(related_rows.shape[0])
        if related_rows.shape[0] != 0:
            # Trades strictly before the current one
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            count_trade_previous.append(previous_record.shape[0])
        else:
            count_trade_previous.append(0)

        # Gap to the previous row of the (id, time)-sorted trade frame:
        # same user -> time delta; different user -> -1; very first row -> -2.
        if index > 0:
            last_trade_record = trade.iloc[index - 1]
            if last_trade_record['id'] == id:
                last_trade_time.append(trade_time - float(last_trade_record['time']))
            else:
                last_trade_time.append(-1)
        else:
            last_trade_time.append(-2)

    # Stitch the accumulated lists into the feature frame
    features['last_login_time'] = last_login_time
    features['last_login_result'] = last_login_result
    features['last_login_timelong'] = last_login_timelong
    features['last_login_device'] = last_login_device
    features['last_login_from'] = last_login_from
    features['last_login_ip'] = last_login_ip
    features['last_login_city'] = last_login_city
    features['last_login_type'] = last_login_type
    features['last_login_is_scan'] = last_login_is_scan
    features['last_login_is_sec'] = last_login_is_sec
    features['last_trade_time'] = last_trade_time

    # Count features
    features['count_login_all'] = count_login_all
    features['count_login_previous'] = count_login_previous
    features['count_trade_all'] = count_trade_all
    features['count_trade_previous'] = count_trade_previous
    features['avg_login_previous'] = avg_login_previous
    features['max_login_previous'] = max_login_previous
    features['min_login_previous'] = min_login_previous
    features['count_login_fail_all'] = count_login_fail_all
    features['count_login_fail_previous'] = count_login_fail_previous
    features['count_login_succ_all'] = count_login_succ_all
    features['count_login_succ_previous'] = count_login_succ_previous
    features['count_login_ip_all'] = count_login_ip_all  # occurrences of the last login's IP across all login rows
    features['count_login_device_all'] = count_login_device_all
    features['count_login_city_all'] = count_login_city_all

    # Categorical features
    features['is_login_device_usual'] = is_login_device_usual
    features['is_login_from_usual'] = is_login_from_usual
    features['is_login_ip_usual'] = is_login_ip_usual
    features['is_login_city_usual'] = is_login_city_usual
    features['is_login_type_usual'] = is_login_type_usual
    features['is_login_device_exists_previous'] = is_login_device_exists_previous
    features['is_login_from_exists_previous'] = is_login_from_exists_previous
    features['is_login_ip_exists_previous'] = is_login_ip_exists_previous
    features['is_login_city_exists_previous'] = is_login_city_exists_previous
    features['is_login_type_exists_previous'] = is_login_type_exists_previous

    print('Shape of Features 1:', features.shape)
    return features

def build_feat_2(login, trade):
    """Feature group 2: per-trade statistics over the user's login history.

    For each trade row, looks up the same user's login records and computes
    count, ratio and min/max/mean/std statistics over `timelong`, `time` and
    `city`, both over the full history ("all") and over logins strictly
    before the trade ("previous").

    Sentinels: "previous" features are -1 when the user has logins but none
    before this trade, and -2 when the user has no login records at all
    (the "all" statistics are 0 in that case).

    BUGFIX: the original stored `.max()` under the min_* columns and `.min()`
    under the max_* columns for both the timelong and city statistics
    (all/previous); the values now match the column names.
    """

    # Statistics that exist only when the user has any login history.
    all_stat_keys = [
        'avg_login_time_dis', 'std_login_time_dis', 'std_login_city_dis',
        'max_login_time', 'min_login_time', 'median_login_time',
        'max_min_login_time_dis', 'max_login_city_dis', 'min_login_city_dis',
        'median_login_city',
        'min_login_timelong_all', 'max_login_timelong_all',
        'avg_login_timelong_all', 'std_login_timelong_all',
        'min_login_city_all', 'max_login_city_all',
        'avg_login_city_all', 'std_login_city_all',
    ]
    # Statistics that exist only when there are logins BEFORE the trade.
    prev_stat_keys = [
        'count_10_times_timelong_previous', 'count_not_10_times_timelong_previous',
        'count_city_types_previous', 'count_log_from_types_previous',
        'ratio_city_all', 'ratio_city_previous',
        'ratio_log_from_all', 'ratio_log_from_previous',
        'is_login_10_times_timelong', 'is_login_city_the_very_first_time',
        'min_login_timelong_previous', 'max_login_timelong_previous',
        'avg_login_timelong_previous', 'std_login_timelong_previous',
        'min_login_city_previous', 'max_login_city_previous',
        'avg_login_city_previous', 'std_login_city_previous',
    ]
    # Output column order — kept identical to the original implementation so
    # the train/test matrices line up with cached features.
    columns = [
        'count_10_times_timelong_all', 'count_not_10_times_timelong_all',
        'count_10_times_timelong_previous', 'count_not_10_times_timelong_previous',
        'count_city_types', 'count_city_types_previous',
        'count_log_from_types', 'count_log_from_types_previous',
        'avg_login_time_dis', 'std_login_time_dis', 'std_login_city_dis',
        'ratio_log_from_all', 'ratio_log_from_previous',
        'ratio_city_all', 'ratio_city_previous', 'ratio_10_times_timelong',
        'is_login_10_times_timelong', 'is_login_city_the_very_first_time',
        'max_login_time', 'min_login_time', 'median_login_time',
        'max_min_login_time_dis', 'max_login_city_dis', 'min_login_city_dis',
        'median_login_city',
        'min_login_timelong_all', 'max_login_timelong_all',
        'avg_login_timelong_all', 'std_login_timelong_all',
        'min_login_timelong_previous', 'max_login_timelong_previous',
        'avg_login_timelong_previous', 'std_login_timelong_previous',
        'min_login_city_all', 'max_login_city_all',
        'avg_login_city_all', 'std_login_city_all',
        'min_login_city_previous', 'max_login_city_previous',
        'avg_login_city_previous', 'std_login_city_previous',
    ]

    rows = []

    # Iterate over every trade record
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000 == 0:
            print("Processing till line : ", index)

        each_line = trade.iloc[index]
        trade_time = float(each_line['time'])
        user_id = each_line['id']

        feat = {}

        # All login records belonging to the same user id
        related_rows = login.loc[login['id'] == user_id]
        n = related_rows.shape[0]
        timelongs = related_rows['timelong'].values

        # Many timelong values are exact multiples of 10, which suggests
        # scripted logins; count and ratio of such logins.
        feat['count_10_times_timelong_all'] = len([num for num in timelongs if num % 10 == 0])
        feat['count_not_10_times_timelong_all'] = len([num for num in timelongs if num % 10 != 0])
        feat['count_city_types'] = len(set(related_rows['city'].values))  # many cities -> multi-location logins
        feat['count_log_from_types'] = len(set(related_rows['log_from'].values))
        feat['ratio_10_times_timelong'] = (
            0 if n == 0 else feat['count_10_times_timelong_all'] / n)

        if n != 0:
            # Very small average gap between logins means frequent logins.
            feat['avg_login_time_dis'] = (related_rows['time'].max() - related_rows['time'].min()) / n
            feat['std_login_time_dis'] = 0 if n == 1 else related_rows['time'].std(skipna=True)
            feat['std_login_city_dis'] = 0 if n == 1 else related_rows['city'].std(skipna=True)

            feat['max_login_time'] = related_rows['time'].max()
            feat['min_login_time'] = related_rows['time'].min()
            feat['median_login_time'] = related_rows['time'].median()
            feat['max_min_login_time_dis'] = related_rows['time'].max() - related_rows['time'].min()
            feat['max_login_city_dis'] = related_rows['city'].max()
            feat['min_login_city_dis'] = related_rows['city'].min()
            feat['median_login_city'] = related_rows['city'].median()

            feat['min_login_timelong_all'] = related_rows['timelong'].min()
            feat['max_login_timelong_all'] = related_rows['timelong'].max()
            feat['avg_login_timelong_all'] = related_rows['timelong'].mean()
            feat['std_login_timelong_all'] = 0 if n == 1 else related_rows['timelong'].std(skipna=True)
            feat['min_login_city_all'] = related_rows['city'].min()
            feat['max_login_city_all'] = related_rows['city'].max()
            feat['avg_login_city_all'] = related_rows['city'].mean()
            feat['std_login_city_all'] = 0 if n == 1 else related_rows['city'].std(skipna=True)

            # Login records strictly before the current trade
            previous_record = related_rows.loc[related_rows['time'] < float(trade_time)]
            m = previous_record.shape[0]

            if m != 0:
                first_login_record = previous_record.iloc[0]   # first login before the trade
                last_login_record = previous_record.iloc[-1]   # most recent login before the trade
                prev_timelongs = previous_record['timelong'].values

                feat['count_10_times_timelong_previous'] = len([num for num in prev_timelongs if num % 10 == 0])
                feat['count_not_10_times_timelong_previous'] = len([num for num in prev_timelongs if num % 10 != 0])
                # frequency of the last login's city / log_from among earlier logins
                feat['count_city_types_previous'] = len(
                    [1 for city in previous_record['city'].values if city == last_login_record['city']])
                feat['count_log_from_types_previous'] = len(
                    [1 for log_from in previous_record['log_from'].values if log_from == last_login_record['log_from']])

                # Ratio features: share of the last login's city / log_from
                feat['ratio_city_all'] = len(
                    [1 for city in related_rows['city'].values if city == last_login_record['city']]) / n
                feat['ratio_city_previous'] = feat['count_city_types_previous'] / m
                feat['ratio_log_from_all'] = len(
                    [1 for log_from in related_rows['log_from'].values if log_from == last_login_record['log_from']]) / n
                feat['ratio_log_from_previous'] = feat['count_log_from_types_previous'] / m

                # Categorical features
                feat['is_login_10_times_timelong'] = 1 if last_login_record['timelong'] % 10 == 0 else 0
                feat['is_login_city_the_very_first_time'] = (
                    1 if last_login_record['city'] == first_login_record['city'] else 0)

                feat['min_login_timelong_previous'] = previous_record['timelong'].min()
                feat['max_login_timelong_previous'] = previous_record['timelong'].max()
                feat['avg_login_timelong_previous'] = previous_record['timelong'].mean()
                feat['std_login_timelong_previous'] = 0 if m == 1 else previous_record['timelong'].std(skipna=True)
                feat['min_login_city_previous'] = previous_record['city'].min()
                feat['max_login_city_previous'] = previous_record['city'].max()
                feat['avg_login_city_previous'] = previous_record['city'].mean()
                feat['std_login_city_previous'] = 0 if m == 1 else previous_record['city'].std(skipna=True)
            else:
                # Sentinel -1: the user only logs in after this trade.
                for key in prev_stat_keys:
                    feat[key] = -1
        else:
            # Sentinel -2: no login history at all; "all" statistics are 0.
            for key in all_stat_keys:
                feat[key] = 0
            for key in prev_stat_keys:
                feat[key] = -2

        rows.append(feat)

    features = pd.DataFrame(rows, columns=columns)

    print('Shape of Features 2:', features.shape)
    return features

def build_feat_3(login, trade):
    """Build feature group 3 for every trade row.

    Features: whether the user's last 2 / last 3 logins before the trade
    agree on source, device, ip, city and result; counts and ratios of
    scan/sec logins (over the whole login history and over the logins
    preceding the trade); and time-component distances between the most
    recent logins.

    Args:
        login: login records with columns 'id', 'time', 'is_scan',
            'is_sec', 'log_from', 'device', 'ip', 'city', 'result' and
            'login_real_month/day/hour/minute'. Assumed sorted by time
            within each id (see preprocess) -- TODO confirm.
        trade: trade records with columns 'id' and 'time'.

    Returns:
        A DataFrame with one row per row of `trade`, in the same order.

    Sentinels: -1 = not enough logins before the trade,
               -2 = the user has no login records at all.
    """
    features = pd.DataFrame()

    # ---- categorical features -----------------------------------------
    is_last2_same_log_from = []  # last two logins share the same source?
    is_last2_same_device = []
    is_last2_same_ip = []
    is_last2_same_city = []
    is_last2_same_result = []

    is_last3_same_log_from = []  # last three logins share the same source?
    is_last3_same_device = []
    is_last3_same_ip = []
    is_last3_same_city = []
    is_last3_same_result = []

    # ---- scan / sec counters and ratios -------------------------------
    count_login_is_scan_all = []
    count_login_not_scan_all = []
    count_login_is_sec_all = []
    count_login_not_sec_all = []
    count_login_is_scan_previous = []
    count_login_not_scan_previous = []
    count_login_is_sec_previous = []
    count_login_not_sec_previous = []
    ratio_login_is_scan = []
    ratio_login_is_sec = []

    # ---- time distances between the last login and the 2nd/3rd last ---
    # (the unused last*_trade_*_dis lists of the original were removed)
    last1_login_month_dis = []
    last1_login_day_dis = []
    last1_login_hour_dis = []
    last1_login_minute_dis = []

    last2_login_month_dis = []
    last2_login_day_dis = []
    last2_login_hour_dis = []
    last2_login_minute_dis = []

    # iterate over every trade record
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000 == 0:
            print("Processing till line : ", index)

        each_line = trade.iloc[index]
        trade_time = float(each_line['time'])
        id = each_line['id']

        # all login records of the same user
        related_rows = login.loc[login['id'] == id]
        n_all = related_rows.shape[0]
        n_scan_all = sum(1 for scan in related_rows['is_scan'].values if scan == 1)
        n_sec_all = sum(1 for sec in related_rows['is_sec'].values if sec == 1)
        count_login_is_scan_all.append(n_scan_all)
        count_login_not_scan_all.append(n_all - n_scan_all)
        count_login_is_sec_all.append(n_sec_all)
        count_login_not_sec_all.append(n_all - n_sec_all)
        if n_all != 0:
            ratio_login_is_scan.append(n_scan_all / n_all)
            ratio_login_is_sec.append(n_sec_all / n_all)
        else:
            # BUGFIX: the original divided by related_rows.shape[0] before
            # checking it, crashing with ZeroDivisionError for users that
            # have no login records; use the usual -2 sentinel instead.
            ratio_login_is_scan.append(-2)
            ratio_login_is_sec.append(-2)

        if n_all != 0:
            # logins that happened before this trade
            previous_record = related_rows.loc[related_rows['time'] < trade_time]

            n_prev = previous_record.shape[0]
            if n_prev != 0:
                n_scan_prev = sum(1 for scan in previous_record['is_scan'].values if scan == 1)
                n_sec_prev = sum(1 for sec in previous_record['is_sec'].values if sec == 1)
                count_login_is_scan_previous.append(n_scan_prev)
                count_login_not_scan_previous.append(n_prev - n_scan_prev)
                count_login_is_sec_previous.append(n_sec_prev)
                # BUGFIX: the original appended the "== 1" count here as
                # well, so this column silently duplicated the is_sec count.
                count_login_not_sec_previous.append(n_prev - n_sec_prev)
            else:
                count_login_is_scan_previous.append(-1)
                count_login_not_scan_previous.append(-1)
                count_login_is_sec_previous.append(-1)
                count_login_not_sec_previous.append(-1)

            if n_prev >= 2:
                last1_login_record = previous_record.iloc[-1]  # most recent login
                last2_login_record = previous_record.iloc[-2]  # second most recent

                is_last2_same_log_from.append(1 if last1_login_record['log_from'] == last2_login_record['log_from'] else 0)
                is_last2_same_device.append(1 if last1_login_record['device'] == last2_login_record['device'] else 0)
                is_last2_same_ip.append(1 if last1_login_record['ip'] == last2_login_record['ip'] else 0)
                is_last2_same_city.append(1 if last1_login_record['city'] == last2_login_record['city'] else 0)
                is_last2_same_result.append(1 if last1_login_record['result'] == last2_login_record['result'] else 0)

                last1_login_month_dis.append(last1_login_record['login_real_month'] - last2_login_record['login_real_month'])
                last1_login_day_dis.append(last1_login_record['login_real_day'] - last2_login_record['login_real_day'])
                last1_login_hour_dis.append(last1_login_record['login_real_hour'] - last2_login_record['login_real_hour'])
                last1_login_minute_dis.append(last1_login_record['login_real_minute'] - last2_login_record['login_real_minute'])
            else:
                is_last2_same_log_from.append(-1)
                is_last2_same_device.append(-1)
                is_last2_same_ip.append(-1)
                is_last2_same_city.append(-1)
                is_last2_same_result.append(-1)

                last1_login_month_dis.append(-1)
                last1_login_day_dis.append(-1)
                last1_login_hour_dis.append(-1)
                last1_login_minute_dis.append(-1)

            if n_prev >= 3:
                last1_login_record = previous_record.iloc[-1]
                last2_login_record = previous_record.iloc[-2]
                last3_login_record = previous_record.iloc[-3]

                # 1 iff all three logins share the same value
                is_last3_same_log_from.append(1 if len(set([last1_login_record['log_from'], last2_login_record['log_from'], last3_login_record['log_from']])) == 1 else 0)
                is_last3_same_device.append(1 if len(set([last1_login_record['device'], last2_login_record['device'], last3_login_record['device']])) == 1 else 0)
                is_last3_same_ip.append(1 if len(set([last1_login_record['ip'], last2_login_record['ip'], last3_login_record['ip']])) == 1 else 0)
                is_last3_same_city.append(1 if len(set([last1_login_record['city'], last2_login_record['city'], last3_login_record['city']])) == 1 else 0)
                is_last3_same_result.append(1 if len(set([last1_login_record['result'], last2_login_record['result'], last3_login_record['result']])) == 1 else 0)

                last2_login_month_dis.append(last1_login_record['login_real_month'] - last3_login_record['login_real_month'])
                last2_login_day_dis.append(last1_login_record['login_real_day'] - last3_login_record['login_real_day'])
                last2_login_hour_dis.append(last1_login_record['login_real_hour'] - last3_login_record['login_real_hour'])
                last2_login_minute_dis.append(last1_login_record['login_real_minute'] - last3_login_record['login_real_minute'])
            else:
                is_last3_same_log_from.append(-1)
                is_last3_same_device.append(-1)
                is_last3_same_ip.append(-1)
                is_last3_same_city.append(-1)
                is_last3_same_result.append(-1)

                last2_login_month_dis.append(-1)
                last2_login_day_dis.append(-1)
                last2_login_hour_dis.append(-1)
                last2_login_minute_dis.append(-1)

        else:
            # user has no login records at all
            is_last2_same_log_from.append(-2)
            is_last2_same_device.append(-2)
            is_last2_same_ip.append(-2)
            is_last2_same_city.append(-2)
            is_last2_same_result.append(-2)

            is_last3_same_log_from.append(-2)
            is_last3_same_device.append(-2)
            is_last3_same_ip.append(-2)
            is_last3_same_city.append(-2)
            is_last3_same_result.append(-2)

            count_login_is_scan_previous.append(-2)
            count_login_not_scan_previous.append(-2)
            count_login_is_sec_previous.append(-2)
            count_login_not_sec_previous.append(-2)

            last1_login_month_dis.append(-2)
            last1_login_day_dis.append(-2)
            last1_login_hour_dis.append(-2)
            last1_login_minute_dis.append(-2)
            last2_login_month_dis.append(-2)
            last2_login_day_dis.append(-2)
            last2_login_hour_dis.append(-2)
            last2_login_minute_dis.append(-2)

    # assemble the collected lists into the feature frame
    features['is_last2_same_log_from'] = is_last2_same_log_from
    features['is_last2_same_device'] = is_last2_same_device
    features['is_last2_same_ip'] = is_last2_same_ip
    features['is_last2_same_city'] = is_last2_same_city
    features['is_last2_same_result'] = is_last2_same_result

    features['is_last3_same_log_from'] = is_last3_same_log_from
    features['is_last3_same_device'] = is_last3_same_device
    features['is_last3_same_ip'] = is_last3_same_ip
    features['is_last3_same_city'] = is_last3_same_city
    features['is_last3_same_result'] = is_last3_same_result

    features['count_login_is_scan_all'] = count_login_is_scan_all
    features['count_login_not_scan_all'] = count_login_not_scan_all
    features['count_login_is_sec_all'] = count_login_is_sec_all
    features['count_login_not_sec_all'] = count_login_not_sec_all
    features['count_login_is_scan_previous'] = count_login_is_scan_previous
    features['count_login_not_scan_previous'] = count_login_not_scan_previous
    features['count_login_is_sec_previous'] = count_login_is_sec_previous
    features['count_login_not_sec_previous'] = count_login_not_sec_previous
    features['ratio_login_is_scan'] = ratio_login_is_scan
    features['ratio_login_is_sec'] = ratio_login_is_sec

    features['last1_login_month_dis'] = last1_login_month_dis
    features['last1_login_day_dis'] = last1_login_day_dis
    features['last1_login_hour_dis'] = last1_login_hour_dis
    features['last1_login_minute_dis'] = last1_login_minute_dis
    features['last2_login_month_dis'] = last2_login_month_dis
    features['last2_login_day_dis'] = last2_login_day_dis
    features['last2_login_hour_dis'] = last2_login_hour_dis
    features['last2_login_minute_dis'] = last2_login_minute_dis

    print('Shape of Features 3:', features.shape)
    return features

# 包含时序信息的特征
def build_feat_4(login, trade):
    """Build feature group 4: membership of the user's last one/two login
    IPs (before each trade) in the risky-IP dictionaries produced by
    get_risk_ip_dic().

    Returns a DataFrame with one row per trade row, in the same order.
    Sentinels: -1 = not enough earlier logins, -2 = user has no logins.
    """
    features = pd.DataFrame()
    last1_risk_ip_dic, last2_risk_ip_dic, all_previous_ip_dic = get_risk_ip_dic()
    # lookup tables in the column order used below
    dics = (last1_risk_ip_dic, last2_risk_ip_dic, all_previous_ip_dic)

    # accumulators: index 0/1/2 = lookup in last1/last2/all-previous dicts
    feat_last1 = ([], [], [])  # most recent login ip
    feat_last2 = ([], [], [])  # second most recent login ip

    print('Start Search All Trade Info ... ')
    n_trades = trade.shape[0]
    for row_idx in range(n_trades):
        if row_idx % 10000 == 0:
            print("Processing till line : ", row_idx)

        row = trade.iloc[row_idx]
        when = float(row['time'])
        user = row['id']

        user_logins = login.loc[login['id'] == user]
        if user_logins.shape[0] == 0:
            # this user never logged in at all
            for store in feat_last1 + feat_last2:
                store.append(-2)
            continue

        earlier = user_logins.loc[user_logins['time'] < when]
        n_earlier = earlier.shape[0]

        if n_earlier >= 1:
            ip1 = earlier.iloc[-1]['ip']  # most recent login before trade
            for store, dic in zip(feat_last1, dics):
                store.append(dic[ip1])
        else:
            for store in feat_last1:
                store.append(-1)

        if n_earlier >= 2:
            ip2 = earlier.iloc[-2]['ip']  # second most recent login
            for store, dic in zip(feat_last2, dics):
                store.append(dic[ip2])
        else:
            for store in feat_last2:
                store.append(-1)

    features['last1_in_last1_risk_ip'] = feat_last1[0]
    features['last1_in_last2_risk_ip'] = feat_last1[1]
    features['last2_in_last1_risk_ip'] = feat_last2[0]
    features['last2_in_last2_risk_ip'] = feat_last2[1]
    features['last1_in_all_previous_ip'] = feat_last1[2]
    features['last2_in_all_previous_ip'] = feat_last2[2]

    print('Shape of Features 4:', features.shape)
    return features

# 从训练集交易数据提取的信息
def build_feat_5(trade, previous_train_trade):
    """Build feature group 5 from the labelled training trades: recent
    is_risk flags, a hand-tuned trust weight over the last three trades,
    risk counts/ratios, and time gaps to the last three trades.

    Sentinels: -1 = not enough earlier trades, -2 = no labelled history
    for this user, -3 = no risky trade seen yet, -10/-20 = trustee
    sentinels for the same two cases.
    """
    features = pd.DataFrame()

    # weight assigned to the exact (last1, last2, last3) is_risk pattern;
    # any pattern not listed (including non-0/1 values) scores 0
    trustee_table = {
        (1, 1, 1): 100,
        (1, 1, 0): 70,
        (1, 0, 1): 60,
        (1, 0, 0): 40,
        (0, 1, 1): 30,
        (0, 0, 1): 20,
    }

    is_last1_trade_risk, is_last2_trade_risk, is_last3_trade_risk = [], [], []
    last_trade_risk_dis = []
    last3_risk_trustee = []
    count_trade_risk, ratio_trade_risk = [], []
    last1_trade_dis, last2_trade_dis, last3_trade_dis = [], [], []

    print('Start Search All Trade Info ... ')
    total = trade.shape[0]
    for pos in range(total):
        if pos % 10000 == 0:
            print("Processing till line : ", pos)

        row = trade.iloc[pos]
        when = float(row['time'])
        user = row['id']

        history = previous_train_trade.loc[previous_train_trade['id'] == user]
        if history.shape[0] < 1:
            # user absent from the labelled history
            is_last1_trade_risk.append(-2)
            is_last2_trade_risk.append(-2)
            is_last3_trade_risk.append(-2)
            last_trade_risk_dis.append(-2)
            last3_risk_trustee.append(-20)
            count_trade_risk.append(-2)
            ratio_trade_risk.append(-2)
            last1_trade_dis.append(-2)
            last2_trade_dis.append(-2)
            last3_trade_dis.append(-2)
            continue

        earlier = history.loc[history['time'] < when]
        risk_array = earlier['is_risk'].values
        n_earlier = earlier.shape[0]

        if n_earlier >= 1:
            newest = earlier.iloc[-1]
            is_last1_trade_risk.append(1 if newest['is_risk'] == 1 else 0)
            if 1 in risk_array:
                # distance (in records) back to the most recent risky trade
                last_trade_risk_dis.append(len(risk_array) - np.where(risk_array == 1)[0][-1])
            else:
                last_trade_risk_dis.append(-3)
            n_risky = np.sum(risk_array == 1)
            count_trade_risk.append(n_risky)
            ratio_trade_risk.append(n_risky / len(risk_array))
            last1_trade_dis.append(when - float(newest['time']))
        else:
            is_last1_trade_risk.append(-1)
            last_trade_risk_dis.append(-1)
            count_trade_risk.append(-1)
            ratio_trade_risk.append(0)
            last1_trade_dis.append(-1)

        if n_earlier >= 2:
            second = earlier.iloc[-2]
            is_last2_trade_risk.append(1 if second['is_risk'] == 1 else 0)
            last2_trade_dis.append(when - float(second['time']))
        else:
            is_last2_trade_risk.append(-1)
            last2_trade_dis.append(-1)

        if n_earlier >= 3:
            newest = earlier.iloc[-1]
            second = earlier.iloc[-2]
            third = earlier.iloc[-3]
            is_last3_trade_risk.append(1 if third['is_risk'] == 1 else 0)
            last3_trade_dis.append(when - float(third['time']))
            key = (newest['is_risk'], second['is_risk'], third['is_risk'])
            last3_risk_trustee.append(trustee_table.get(key, 0))
        else:
            is_last3_trade_risk.append(-1)
            last3_risk_trustee.append(-10)
            last3_trade_dis.append(-1)

    features['is_last1_trade_risk'] = is_last1_trade_risk
    features['is_last2_trade_risk'] = is_last2_trade_risk
    features['is_last3_trade_risk'] = is_last3_trade_risk
    features['last_trade_risk_dis'] = last_trade_risk_dis

    features['last3_risk_trustee'] = last3_risk_trustee

    features['count_trade_risk'] = count_trade_risk
    features['ratio_trade_risk'] = ratio_trade_risk

    features['last1_trade_dis'] = last1_trade_dis
    features['last2_trade_dis'] = last2_trade_dis
    features['last3_trade_dis'] = last3_trade_dis

    print('Shape of Features 5:', features.shape)
    return features

def build_feat_6(login, trade):
    """Build feature group 6: risk statistics of the last two *distinct*
    login IPs before each trade -- their risk std / risk type from the
    most-risky-IP tables, plus membership in the risky-IP dictionaries.

    Sentinel 0 = unknown / not enough data.
    NOTE(review): 0 is also used as the "no second distinct ip" marker
    (last2_ip = 0), so an actual ip value of 0 would be misread as
    missing -- confirm ip values are never 0.
    NOTE(review): the last*_risk_ip_dic / all_previous_ip_dic lookups use
    direct indexing and raise KeyError for IPs absent from those dicts --
    confirm get_risk_ip_dic() covers every ip seen here.

    BUGFIX: in the branch where a second distinct IP exists, the original
    filled the three real_last1_* features by looking up `last2_ip`
    (copy-paste); they now use `last1_ip`, matching the single-IP branch.
    """
    features = pd.DataFrame()

    ip_to_risk_std, ip_to_risk_type = get_most_risk_ip_dic()
    last1_risk_ip_dic, last2_risk_ip_dic, all_previous_ip_dic = get_risk_ip_dic()

    # risk level of the two most recent distinct ips
    last1_ip_risk_std = []
    last1_ip_risk_type = []
    last2_ip_risk_std = []
    last2_ip_risk_type = []

    # membership of those ips in the risky-ip dictionaries
    real_last1_in_last1_risk_ip = []
    real_last1_in_last2_risk_ip = []
    real_last2_in_last1_risk_ip = []
    real_last2_in_last2_risk_ip = []
    real_last1_in_all_previous_ip = []
    real_last2_in_all_previous_ip = []

    # iterate over every trade record
    print('Start Search All Trade Info ... ')
    for index in range(trade.shape[0]):
        if index % 10000 == 0:
            print("Processing till line : ", index)

        each_line = trade.iloc[index]
        trade_time = float(each_line['time'])
        id = each_line['id']

        related_rows = login.loc[login['id'] == id]
        if related_rows.shape[0] != 0:
            # logins that happened before this trade
            previous_record = related_rows.loc[related_rows['time'] < trade_time]

            if previous_record.shape[0] == 0:
                # the user logged in, but never before this trade
                last1_ip_risk_std.append(0)
                last1_ip_risk_type.append(0)
                last2_ip_risk_std.append(0)
                last2_ip_risk_type.append(0)

                real_last1_in_last1_risk_ip.append(0)
                real_last1_in_last2_risk_ip.append(0)
                real_last1_in_all_previous_ip.append(0)
                real_last2_in_last1_risk_ip.append(0)
                real_last2_in_last2_risk_ip.append(0)
                real_last2_in_all_previous_ip.append(0)
            else:
                ips = previous_record['ip'].tolist()
                last1_ip = ips[-1]
                # walk backwards for the most recent ip different from last1
                last2_ip = 0
                for ip in ips[::-1]:
                    if ip != last1_ip:
                        last2_ip = ip
                        break

                if last2_ip == 0:
                    # only one distinct ip in the user's history
                    if last1_ip in ip_to_risk_std:
                        last1_ip_risk_std.append(ip_to_risk_std[last1_ip])
                        last1_ip_risk_type.append(ip_to_risk_type[last1_ip])
                    else:
                        last1_ip_risk_std.append(0)
                        last1_ip_risk_type.append(0)
                    last2_ip_risk_std.append(0)
                    last2_ip_risk_type.append(0)

                    real_last1_in_last1_risk_ip.append(last1_risk_ip_dic[last1_ip])
                    real_last1_in_last2_risk_ip.append(last2_risk_ip_dic[last1_ip])
                    real_last1_in_all_previous_ip.append(all_previous_ip_dic[last1_ip])
                    real_last2_in_last1_risk_ip.append(0)
                    real_last2_in_last2_risk_ip.append(0)
                    real_last2_in_all_previous_ip.append(0)
                else:
                    if last1_ip in ip_to_risk_std:
                        last1_ip_risk_std.append(ip_to_risk_std[last1_ip])
                        last1_ip_risk_type.append(ip_to_risk_type[last1_ip])
                    else:
                        last1_ip_risk_std.append(0)
                        last1_ip_risk_type.append(0)
                    if last2_ip in ip_to_risk_std:
                        last2_ip_risk_std.append(ip_to_risk_std[last2_ip])
                        last2_ip_risk_type.append(ip_to_risk_type[last2_ip])
                    else:
                        last2_ip_risk_std.append(0)
                        last2_ip_risk_type.append(0)

                    # BUGFIX: these three lookups used last2_ip originally
                    real_last1_in_last1_risk_ip.append(last1_risk_ip_dic[last1_ip])
                    real_last1_in_last2_risk_ip.append(last2_risk_ip_dic[last1_ip])
                    real_last1_in_all_previous_ip.append(all_previous_ip_dic[last1_ip])
                    real_last2_in_last1_risk_ip.append(last1_risk_ip_dic[last2_ip])
                    real_last2_in_last2_risk_ip.append(last2_risk_ip_dic[last2_ip])
                    real_last2_in_all_previous_ip.append(all_previous_ip_dic[last2_ip])

        else:
            # user has no login records at all
            last1_ip_risk_std.append(0)
            last1_ip_risk_type.append(0)
            last2_ip_risk_std.append(0)
            last2_ip_risk_type.append(0)

            real_last1_in_last1_risk_ip.append(0)
            real_last1_in_last2_risk_ip.append(0)
            real_last1_in_all_previous_ip.append(0)
            real_last2_in_last1_risk_ip.append(0)
            real_last2_in_last2_risk_ip.append(0)
            real_last2_in_all_previous_ip.append(0)

    features['last1_ip_risk_std'] = last1_ip_risk_std
    features['last1_ip_risk_type'] = last1_ip_risk_type
    features['last2_ip_risk_std'] = last2_ip_risk_std
    features['last2_ip_risk_type'] = last2_ip_risk_type

    features['real_last1_in_last1_risk_ip'] = real_last1_in_last1_risk_ip
    features['real_last1_in_last2_risk_ip'] = real_last1_in_last2_risk_ip
    features['real_last1_in_all_previous_ip'] = real_last1_in_all_previous_ip
    features['real_last2_in_last1_risk_ip'] = real_last2_in_last1_risk_ip
    features['real_last2_in_last2_risk_ip'] = real_last2_in_last2_risk_ip
    features['real_last2_in_all_previous_ip'] = real_last2_in_all_previous_ip

    print('Shape of Features 6:', features.shape)
    return features

def delete_bad_feat(features):
    """Drop the feature columns that hurt validation performance.

    Mutates `features` in place and returns it.  Raises KeyError if any
    of the listed columns is missing, like the original `del` statements.
    """
    bad_columns = (
        'last_login_from',
        'last_login_is_scan',
        'last_login_is_sec',
        'count_login_is_sec_previous',
        'count_login_not_sec_previous',
        'ratio_login_is_sec',
        'last1_login_month_dis',
        'last2_login_month_dis',
    )
    for column in bad_columns:
        del features[column]

    return features


# 对外接口
def load_data():
    """Public entry point: build the train/test feature matrices.

    Returns:
        (train_x, train_y, test_x, rowkey) -- feature frames for both
        splits, the train labels, and the rowkey values identifying the
        test trades for submission.
    """
    df_train_login, df_test_login, df_train_trade, df_test_trade = preprocess()

    rowkey = df_test_trade['rowkey'].values
    train_y = df_train_trade['is_risk'].values
    # del df_train_trade['is_risk']

    train_x = build_feat(df_train_login.copy(), df_train_trade.copy(), type='train', mode='Seperate',  train_trade=None)
    test_x  = build_feat(df_test_login.copy(),  df_test_trade.copy(), type='test', mode='Seperate', train_trade=df_train_trade.copy())

    # BUGFIX: df_all_login was used below but its assignment was commented
    # out, so this function crashed with NameError.  The union of train and
    # test logins is only used by one_hot() to fit the encoders' category
    # vocabulary (feature values, not labels).  NOTE(review): if sharing the
    # test vocabulary still counts as leakage for this competition, fit on
    # df_train_login alone and map unseen test categories to the 0 sentinel.
    df_all_login = pd.concat([df_train_login, df_test_login], ignore_index=True)

    print("Start One-Hot Encoding ... ")
    train_x = one_hot(df_all_login.copy(), train_x, test_x, 1)
    test_x  = one_hot(df_all_login.copy(), test_x, train_x, 2)

    return train_x, train_y, test_x, rowkey

# Script entry point: build the full feature matrices when run directly.
if __name__ == '__main__':
    load_data()

特征工程 : OneHot

亲测Feature[0, 1, 5, 6] 经过OneHot后效果有提升, 能到0.82以上

对重要的连续型特征做分片后,能到0.85左右

def one_hot(df_all_login, df_target, df_other, num):
    """One-hot encode the categorical columns of a feature frame.

    Args:
        df_all_login: concatenation of the train and test login records;
            used only to fit the encoders so both splits share one
            category vocabulary (0 is appended when fitting so the
            "no previous login" sentinel is a known category).
        df_target: the feature frame being encoded; mutated in place
            (encoded source columns are deleted as they are consumed).
        df_other: the *other* split's frame (train when encoding test and
            vice versa); some shared columns are deleted from it too.
        num: 1 when encoding the train features, 2 for the test features.
            Several deletions run only when num == 2, i.e. on the second
            call, so the first call still sees the raw columns.

    Returns:
        df_target with the one-hot columns concatenated on the right.

    NOTE(review): OneHotEncoder is presumably sklearn.preprocessing's;
    its import is not visible in this chunk -- confirm it exists at the
    top of the file.
    NOTE(review): the hard-coded column counts below (12, 5, 13, 486, 5,
    4, ...) must equal the number of categories each encoder learned --
    confirm against the competition data, or derive them from the fitted
    encoders instead.
    NOTE(review): because the num == 2 deletions happen after the train
    frame was already returned by the first call, train and test matrices
    can end up with different column sets -- verify downstream alignment.
    """
    # Fit one encoder per categorical login column on the full vocabulary.
    enc_result = OneHotEncoder()
    enc_result.fit(np.append(df_all_login['result'].values, 0).reshape(-1, 1))
    enc_device = OneHotEncoder()
    enc_device.fit(np.append(df_all_login['device'].values, 0).reshape(-1, 1))
    enc_logfrom = OneHotEncoder()
    enc_logfrom.fit(np.append(df_all_login['log_from'].values, 0).reshape(-1, 1))
    enc_ip = OneHotEncoder()
    enc_ip.fit(np.append(df_all_login['ip'].values, 0).reshape(-1, 1))
    enc_city = OneHotEncoder()
    enc_city.fit(np.append(df_all_login['city'].values, 0).reshape(-1, 1))
    enc_type = OneHotEncoder()
    enc_type.fit(np.append(df_all_login['type'].values, 0).reshape(-1, 1))

    # (the comprehension's `num` shadows the parameter only inside the
    # comprehension itself -- Python 3 scoping)
    feature_1 = pd.DataFrame(enc_result.transform(df_target['last_login_result'].values.reshape(-1, 1)).toarray())
    feature_1.columns = ["last_login_result_"+str(num) for num in range(1, 12)]
    del df_target['last_login_result']

    # Feat 1 ##########################

    # bucket the last-login wall-clock time into coarse time zones
    df_target['timezone'] = df_target['last_login_real_time'].apply(process_time)
    enc_time = OneHotEncoder()
    enc_time.fit(df_target['timezone'].values.reshape(-1, 1))
    feature_1_1 = pd.DataFrame(enc_time.transform(df_target['timezone'].values.reshape(-1, 1)).toarray())
    feature_1_1.columns = ['Last_Login_TimeZone_'+str(num) for num in range(1, 6)]
    del df_target['last_login_real_time']

    # same bucketing for the last-trade wall-clock time
    df_target['timezone'] = df_target['last_trade_real_time'].apply(process_time)
    enc_time = OneHotEncoder()
    enc_time.fit(df_target['timezone'].values.reshape(-1, 1))
    feature_1_2 = pd.DataFrame(enc_time.transform(df_target['timezone'].values.reshape(-1, 1)).toarray())
    feature_1_2.columns = ['Last_Trade_TimeZone_'+str(num) for num in range(1, 6)]
    del df_target['last_trade_real_time']

    del df_target['timezone']

    feature_1_3 = pd.DataFrame(enc_logfrom.transform(df_target['last_login_from'].values.reshape(-1, 1)).toarray())
    feature_1_3.columns = ["last_login_from_"+str(num) for num in range(1, 13)]
    del df_target['last_login_from']

    feature_1_4 = pd.DataFrame(enc_city.transform(df_target['last_login_city'].values.reshape(-1, 1)).toarray())
    feature_1_4.columns = ["last_login_city_" + str(num) for num in range(1, 486)]
    del df_target['last_login_city']

    feature_1_5 = pd.DataFrame(enc_type.transform(df_target['last_login_type'].values.reshape(-1, 1)).toarray())
    feature_1_5.columns = ["last_login_type_" + str(num) for num in range(1, 5)]
    del df_target['last_login_type']



    # drop the low-value group-3 features (second call only; both frames)
    if num == 2:
        del df_target['last1_login_month_dis']
        del df_other['last1_login_month_dis']
        del df_target['last1_login_day_dis']
        del df_other['last1_login_day_dis']
        del df_target['last1_login_hour_dis']
        del df_other['last1_login_hour_dis']
        del df_target['last1_login_minute_dis']
        del df_other['last1_login_minute_dis']
        del df_target['last2_login_month_dis']
        del df_other['last2_login_month_dis']
        del df_target['last2_login_day_dis']
        del df_other['last2_login_day_dis']
        del df_target['last2_login_hour_dis']
        del df_other['last2_login_hour_dis']
        del df_target['last2_login_minute_dis']
        del df_other['last2_login_minute_dis']


    # Feat 5 ##############################

    # the group-5 encoders are fitted on both splits' values so train and
    # test share the same categories for the sentinel-valued risk flags
    enc = OneHotEncoder()
    enc.fit(np.append(df_target['is_last1_trade_risk'].values, df_other['is_last1_trade_risk'].values).reshape(-1, 1))
    feature_5_1 = pd.DataFrame(
        enc.transform(df_target['is_last1_trade_risk'].values.reshape(-1, 1)).toarray())
    feature_5_1.columns = ['is_last1_trade_risk' + str(num) for num in range(1, 5)]
    if num == 2:
        del df_target['is_last1_trade_risk']
        del df_other['is_last1_trade_risk']

    enc = OneHotEncoder()
    enc.fit(np.append(df_target['is_last2_trade_risk'].values, df_other['is_last2_trade_risk']).reshape(-1, 1))
    feature_5_2 = pd.DataFrame(
        enc.transform(df_target['is_last2_trade_risk'].values.reshape(-1, 1)).toarray())
    feature_5_2.columns = ['is_last2_trade_risk' + str(num) for num in range(1, 5)]
    if num == 2:
        del df_target['is_last2_trade_risk']
        del df_other['is_last2_trade_risk']

    enc = OneHotEncoder()
    enc.fit(np.append(df_target['is_last3_trade_risk'].values, df_other['is_last3_trade_risk']).reshape(-1, 1))
    feature_5_3 = pd.DataFrame(
        enc.transform(df_target['is_last3_trade_risk'].values.reshape(-1, 1)).toarray())
    feature_5_3.columns = ['is_last3_trade_risk' + str(num) for num in range(1, 5)]
    if num == 2:
        del df_target['is_last3_trade_risk']
        del df_other['is_last3_trade_risk']


    # glue the encoded blocks onto the remaining raw features column-wise
    all_features = pd.concat([df_target, feature_1], axis=1)
    all_features = pd.concat([all_features, feature_1_1], axis=1)
    all_features = pd.concat([all_features, feature_1_2], axis=1)
    all_features = pd.concat([all_features, feature_1_3], axis=1)
    all_features = pd.concat([all_features, feature_1_4], axis=1)
    all_features = pd.concat([all_features, feature_1_5], axis=1)
    all_features = pd.concat([all_features, feature_5_1], axis=1)
    all_features = pd.concat([all_features, feature_5_2], axis=1)
    all_features = pd.concat([all_features, feature_5_3], axis=1)

    print(all_features.info())
    return all_features

模型

主要用了LightGBM, XGBoost, Keras(MLP)

代码模板还是在这篇文章里 : http://blog.csdn.net/leyounger/article/details/78667538


Stacking

AI法官用的Stacking模型, 感觉没啥效果,可能是模型太相似了,学不到啥东西

代码放在我的Git上了 :
https://coding.net/u/wenyangsama/p/NLP_MachineTranslation/git/blob/master/Competition/%E8%AE%A9AI%E5%BD%93%E6%B3%95%E5%AE%98/model/stacking.py


分享一个Keras的分类可视化图,好多重叠啊 T.T

这只是MLP的效果,大家可以试试FNN, PNN,CCPM等FM(或FFM)与DNN结合的模型哦. [比心]

这里写图片描述


写在最后

本人陕西边家村读本科,保送成都清水寺读研,NLP方向烟酒僧一枚,本人勤快好学,如有大神收留,请收下我的膝盖 Orz

评论 8
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值