FinTech Application: Fraud Risk Identification (科技金融应用:欺诈风险识别)

https://www.dcic-china.com/competitions/10060

import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
import pickle
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import os
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder


def train_lgb_kfold(X_train, y_train, X_test, n_fold=5, cate_feats=None):
    '''train lightgbm with k-fold split'''
    gbms = []
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        logging.info(f'############ fold {fold} ###########')
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = lgb.Dataset(X_tr, y_tr, categorical_feature=cate_feats)
        dvalid = lgb.Dataset(X_val, y_val, categorical_feature=cate_feats, reference=dtrain)

        params = {
            'objective': 'binary',
            'metric': 'auc',
            'num_leaves': 31,
            # 'n_estimators': 1500,
            'learning_rate': 0.05,
            'min_data_in_leaf': 50,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'n_jobs': -1,
            'seed': 1024
        }

        gbm = lgb.train(params,
                        dtrain,
                        num_boost_round=300,
                        valid_sets=[dtrain, dvalid],
                        # lightgbm >= 4 moved verbose_eval / early_stopping_rounds
                        # into callbacks
                        callbacks=[lgb.log_evaluation(50),
                                   lgb.early_stopping(50)])

        oof_preds[val_index] = gbm.predict(X_val, num_iteration=gbm.best_iteration)
        test_preds += gbm.predict(X_test, num_iteration=gbm.best_iteration) / kfold.n_splits
        gbms.append(gbm)

    return gbms, oof_preds, test_preds
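
Before any thresholding, the out-of-fold vector gives an unbiased read on the fold scheme. A minimal sanity check (a sketch reusing roc_auc_score imported above; run it after the training call further down):

# gbms, oof_preds, test_preds = train_lgb_kfold(X_train, y_train, X_test, n_fold=5)
# print('OOF AUC:', roc_auc_score(y_train, oof_preds))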



def gen_thres_new(df_train, oof_preds):
    df_train['oof_preds'] = oof_preds
    quantile_point = df_train['black_flag'].mean()
    thres = df_train['oof_preds'].quantile(1 - quantile_point)

    _thresh = []
    for thres_item in np.arange(thres - 0.2, thres + 0.2, 0.001):
        _thresh.append(
            [thres_item, f1_score(df_train['black_flag'], np.where(oof_preds > thres_item, 1, 0), average='macro')])

    _thresh = np.array(_thresh)
    best_id = _thresh[:, 1].argmax()
    best_thresh = _thresh[best_id][0]

    print("阈值: {}\n训练集的f1: {}".format(best_thresh, _thresh[best_id][1]))
    return best_thresh
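
The search is seeded at the (1 - positive rate) quantile of the OOF scores, i.e. the cut-off at which the predicted positive rate matches the training prior; the scan over thres ± 0.2 then fine-tunes for macro-F1. A toy illustration of the seed (synthetic numbers, not competition data):

prior = 0.1                                # assume 10% positives
scores = pd.Series(np.random.rand(10000))  # stand-in for oof_preds
seed = scores.quantile(1 - prior)
print((scores > seed).mean())              # approximately equal to prior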



data = pd.read_csv('账户交易信息.csv')
data_static = pd.read_csv('账户静态信息.csv')
data_label = pd.read_csv('训练集标签.csv')
# combine transaction date (jyrq) and time (jysj) into epoch seconds
data['jyrq'] = pd.to_datetime(data['jyrq'] + " " + data['jysj']).astype('int64') // 10**9
# account-opening date (khrq) to epoch seconds
data_static['khrq'] = pd.to_datetime(data_static['khrq'] + " 00:00:00").astype('int64') // 10**9

d = {}
users = data.zhdh.unique().tolist()
for user_id in tqdm(users):
    lt = data[data.zhdh == user_id].reset_index(drop=True)
    lt = lt.sort_values(['jyrq']).reset_index(drop=True)

    all_cnt = lt.shape[0]  # number of transactions
    in_cnt = lt.jdbj.sum()  # inbound count; outbound count and inbound ratio below
    out_cnt = all_cnt - in_cnt
    in_ratio = in_cnt / all_cnt

    in_amt = lt[lt.jdbj == 1].jyje.sum()  # inbound / outbound amounts, inbound ratio, stds
    out_amt = lt[lt.jdbj == 0].jyje.sum()
    in_amt_ratio = in_amt / (in_amt + out_amt)
    in_amt_std = lt[lt.jdbj == 1].jyje.std()
    out_amt_std = lt[lt.jdbj == 0].jyje.std()
    zhengshu = lt['jyje'].apply(lambda x: 1 if x >= 1000 else 0).sum()  # count of transactions of 1000 or more
    # (disabled experiment: running-balance ratio features from zhye)
    # ratio = (lt.jyje / lt.zhye).mean()
    # inc = 0  # count of balance-increasing steps
    # dec = 0  # count of balance-decreasing steps
    # ratio = 0
    # for i in range(all_cnt):
    #     # if i + 1 < all_cnt and lt.iloc[i]['zhye'] <= lt.iloc[i + 1]['zhye']:
    #     #     inc += 1
    #     # if i + 1 < all_cnt and lt.iloc[i]['zhye'] >= lt.iloc[i + 1]['zhye']:
    #     #     dec += 1
    #     if lt.iloc[i].jdbj == 1:
    #         ratio += lt.iloc[i].jyje / (lt.iloc[i].zhye - lt.iloc[i].jyje)
    #     else:
    #         ratio += lt.iloc[i].jyje / (lt.iloc[i].zhye + lt.iloc[i].jyje)
    # ratio /= all_cnt

    in_user_cnt = lt[lt.jdbj == 1].dfzh.nunique()  # distinct inbound / outbound counterparties, inbound ratio
    out_user_cnt = lt[lt.jdbj == 0].dfzh.nunique()
    in_user_ratio = in_user_cnt / (in_user_cnt + out_user_cnt)

    in_jyqd = lt[lt.jdbj == 1].jyqd.nunique()  # distinct inbound / outbound channels, inbound ratio
    out_jyqd = lt[lt.jdbj == 0].jyqd.nunique()
    in_jyqd_ratio = in_jyqd / (in_jyqd + out_jyqd)

    # in_dfhh = lt[lt.jdbj == 1].dfhh.nunique() / in_cnt  # distinct inbound / outbound counterparty bank codes (disabled)
    # out_dfhh = lt[lt.jdbj == 0].dfhh.nunique() / out_cnt
    # in_dfhh_ratio = in_dfhh / (in_dfhh + out_dfhh)

    name_len = lt.dfmccd.mean()  # mean counterparty-name length
    date_cnt = lt.jyrq.nunique()  # distinct transaction timestamps
    date_std = lt.jyrq.std()  # std of transaction timestamps
    date_sum = lt.jyrq.sum()  # sum of transaction timestamps (rebased against khrq below)
    d[user_id] = [all_cnt, in_cnt, out_cnt, in_ratio,
                in_amt, out_amt, in_amt_ratio, in_amt_std, out_amt_std,
                # inc, dec, 
                # ratio
                zhengshu,
                in_user_cnt, out_user_cnt, in_user_ratio, 
                in_jyqd, out_jyqd, in_jyqd_ratio,
                # in_dfhh, out_dfhh, in_dfhh_ratio,
                name_len,
                date_cnt, date_std, date_sum]

data_df = pd.DataFrame.from_dict(d).T.reset_index()
data_df.columns = ['zhdh', 'all_cnt', 'in_cnt', 'out_cnt', 'in_ratio', 
                    'in_amt', 'out_amt', 'in_amt_ratio', 'in_amt_std', 'out_amt_std',
                    # 'inc', 'dec', 
                    # 'ratio'
                    'zhengshu',
                    'in_user_cnt', 'out_user_cnt', 'in_user_ratio',
                    'in_jyqd', 'out_jyqd', 'in_jyqd_ratio',
                    # 'in_dfhh', 'out_dfhh', 'in_dfhh_ratio',
                    'name_len',
                    'date_cnt', 'date_std', 'date_sum']
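
The per-user loop above rescans the full frame once per account, which gets slow on larger data; the same count and amount features can be computed in one pass with groupby. A sketch of the equivalent aggregation (my own restatement covering a subset of the features, assuming `data` as loaded above):

g = data.groupby('zhdh')
alt = pd.DataFrame({
    'all_cnt': g['jyje'].size(),     # transactions per account
    'in_cnt': g['jdbj'].sum(),       # inbound count (jdbj == 1)
    'date_cnt': g['jyrq'].nunique(), # distinct transaction timestamps
})
alt['out_cnt'] = alt['all_cnt'] - alt['in_cnt']
alt['in_ratio'] = alt['in_cnt'] / alt['all_cnt']
# direction-split amount sums, reindexed so accounts with no inbound/outbound rows get 0
alt['in_amt'] = data[data.jdbj == 1].groupby('zhdh')['jyje'].sum().reindex(alt.index).fillna(0)
alt['out_amt'] = data[data.jdbj == 0].groupby('zhdh')['jyje'].sum().reindex(alt.index).fillna(0)
alt['in_amt_ratio'] = alt['in_amt'] / (alt['in_amt'] + alt['out_amt'])
alt = alt.reset_index()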


data_static.columns = ['zhdh', 'khrq', 'khjgdh', 'xb', 'age']
df_feats = pd.merge(left=data_df, right=data_static[['zhdh', 'xb', 'age', 'khrq']], on='zhdh', how='left')
df_final = pd.merge(left=df_feats, right=data_label, on='zhdh', how='left')

# gap between transaction and account-opening timestamps: since khrq is constant
# per account, sum(t_i) - khrq * n equals sum(t_i - khrq), the total seconds
# since opening summed over all transactions
df_final['date_sum'] = df_final['date_sum'] - df_final['khrq'] * df_final['all_cnt']
print(df_final.head(5))

df_train = df_final[df_final.black_flag.notnull()].reset_index(drop=True)
df_test = df_final[df_final.black_flag.isnull()].reset_index(drop=True)

feats = df_train.columns[1:-1].tolist()
X_train = df_train[feats]
y_train = df_train['black_flag']
X_test = df_test[feats]


gbms, oof_preds, test_preds = train_lgb_kfold(X_train, y_train, X_test, n_fold=5)
best_thresh = gen_thres_new(df_train, oof_preds)

df_test['black_flag'] = np.where(test_preds > best_thresh, 1, 0)
df_test[['zhdh', 'black_flag']].to_csv('res.csv', index=False)
The second version of the solution swaps LightGBM for XGBoost:

import random
import pandas as pd
import numpy as np
from tqdm import tqdm
import lightgbm as lgb
import warnings
import gc
import torch
import os
warnings.filterwarnings('ignore')
from sklearn.metrics import roc_auc_score, auc, roc_curve, accuracy_score, f1_score
import pickle
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from sklearn.preprocessing import StandardScaler, QuantileTransformer, KBinsDiscretizer, LabelEncoder, MinMaxScaler, PowerTransformer, OrdinalEncoder
from collections import defaultdict, Counter
from gensim.models import Word2Vec
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
import sys
import argparse

'''
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple catboost

'''

def gen_thres_new(df_train, oof_preds):
    df_train['oof_preds'] = oof_preds
    quantile_point = df_train['black_flag'].mean()
    thres = df_train['oof_preds'].quantile(1 - quantile_point)

    _thresh = []
    for thres_item in np.arange(thres - 0.2, thres + 0.2, 0.01):
        _thresh.append(
            [thres_item, f1_score(df_train['black_flag'], np.where(oof_preds > thres_item, 1, 0), average='macro')])

    _thresh = np.array(_thresh)
    best_id = _thresh[:, 1].argmax()
    best_thresh = _thresh[best_id][0]

    print("阈值: {}\n训练集的f1: {}".format(best_thresh, _thresh[best_id][1]))
    return best_thresh


def train_xgb_kfold(X_train, y_train, X_test, n_fold=10):
    kfold = StratifiedKFold(n_splits=n_fold, random_state=1024, shuffle=True)
    oof_preds = np.zeros((X_train.shape[0],))
    test_preds = np.zeros((X_test.shape[0],))

    for fold, (train_index, val_index) in enumerate(kfold.split(X_train, y_train)):
        logging.info(f'############ fold {fold} ###########')
        X_tr, X_val, y_tr, y_val = X_train.iloc[train_index], X_train.iloc[val_index], y_train[train_index], y_train[val_index]
        dtrain = xgb.DMatrix(X_tr, y_tr)
        dvalid = xgb.DMatrix(X_val, y_val)
        dtest = xgb.DMatrix(X_test)

        # params={
        #     'booster':'gbtree',
        #     'objective': 'binary:logistic',
        #     'eval_metric': ['logloss', 'auc'],
        #     'max_depth': 8,
        #     'subsample':0.9,
        #     'min_child_weight': 10,
        #     'colsample_bytree':0.85,
        #     'lambda': 10,
        #     'eta': 0.02,
        #     'seed': 1024
        # }
        params = {'booster': 'gbtree',
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'gamma': 1,
            'min_child_weight': 1.5,
            'max_depth': 5,
            'lambda': 10,
            'subsample': 0.7,
            'colsample_bytree': 0.7,
            'colsample_bylevel': 0.7,
            'eta': 0.05,
            'tree_method': 'exact',
            'seed': 2020,
            'nthread': 8
        }
        watchlist = [(dtrain, 'train'), (dvalid, 'test')]

        # note: xgb.train takes its metric from params, not an eval_metric argument
        gbm = xgb.train(params,
                        dtrain,
                        num_boost_round=10000,
                        evals=watchlist,
                        verbose_eval=1000,
                        early_stopping_rounds=500)

        # iteration_range is end-exclusive, so include the best iteration itself
        oof_preds[val_index] = gbm.predict(dvalid, iteration_range=(0, gbm.best_iteration + 1))
        test_preds += gbm.predict(dtest, iteration_range=(0, gbm.best_iteration + 1)) / kfold.n_splits

    return oof_preds, test_preds
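
xgb.train reads its metric from params ('eval_metric': 'auc' here) rather than from an eval_metric argument; if a macro-F1 readout during training is wanted, a custom metric can be wired in instead. A sketch, assuming xgboost >= 1.6 (where custom_metric receives transformed probabilities under binary:logistic):

def f1_eval(preds, dmat):
    # preds arrive as probabilities when supplied via custom_metric
    y = dmat.get_label()
    return 'macro_f1', f1_score(y, (preds > 0.5).astype(int), average='macro')

# usage (hypothetical), replacing the xgb.train call inside the fold loop:
# gbm = xgb.train(params, dtrain, num_boost_round=10000, evals=watchlist,
#                 custom_metric=f1_eval, maximize=True,
#                 verbose_eval=1000, early_stopping_rounds=500)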



warnings.filterwarnings("ignore")
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
# after masking, the single visible card is exposed as cuda:0
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

data = pd.read_csv("账户交易信息.csv")
data_static = pd.read_csv("账户静态信息.csv")
data_label = pd.read_csv("训练集标签.csv")
test = pd.read_csv("test_dataset.csv")  # reassigned below from the unlabeled rows of `final`

# data = data.merge(fea, on='zhdh')
# data = data.merge(act, on='zhdh')

# data['year0'] = data['khrq'].str[:4]
# data['mon0'] = data['khrq'].str[5:7]
# data['day0'] = data['khrq'].str[8:10]
# data['year'] = data['jyrq'].str[:4]
# data['mon'] = data['jyrq'].str[5:7]
# data['day'] = data['jyrq'].str[8:10]
# data['hour'] = data['jysj'].str[:2]
# data['min'] = data['jysj'].str[3:5]
# data['sec'] = data['jysj'].str[6:]
# del data['khrq']
# del data['jyrq']
# del data['jysj']
# del data['jylsxh']

# tmp = pd.concat([data['dfzh'], data['zhdh']], ignore_index=True).unique()
# mapp = {tmp[i]: i for i in range(len(tmp))}
# data['zhdh'] = data['zhdh'].map(lambda x: mapp[x])
# data['dfzh'] = data['dfzh'].map(lambda x: mapp[x])

# tmp = data['zydh'].unique()
# mapp = {tmp[i]: i for i in range(len(tmp))}
# data['zydh'] = data['zydh'].map(lambda x: mapp[x])

# tmp = data['jyqd'].unique()
# mapp = {tmp[i]: i for i in range(len(tmp))}
# data['jyqd'] = data['jyqd'].map(lambda x: mapp[x])

# tmp = pd.concat([data['khjgdh'], data['dfhh']], ignore_index=True).unique()
# mapp = {tmp[i]: i for i in range(len(tmp))}
# data['khjgdh'] = data['khjgdh'].map(lambda x: mapp[x])
# data['dfhh'] = data['dfhh'].map(lambda x: mapp[x])

d = {}
users = data.zhdh.unique().tolist()
for user in tqdm(users):
    lt = data[data.zhdh == user].reset_index(drop=True)

    all_cnt = lt.shape[0]  # number of transactions
    in_cnt = lt.jdbj.sum()  # inbound count; outbound count and inbound ratio below
    out_cnt = all_cnt - in_cnt
    in_ratio = in_cnt / all_cnt

    in_amt = lt[lt.jdbj == 1].jyje.sum()  # inbound / outbound amounts, inbound ratio
    out_amt = lt[lt.jdbj == 0].jyje.sum()
    in_amt_ratio = in_amt / (in_amt + out_amt)

    in_user_cnt = lt[lt.jdbj == 1].dfzh.nunique()  # distinct inbound / outbound counterparties, inbound ratio
    out_user_cnt = lt[lt.jdbj == 0].dfzh.nunique()
    in_user_ratio = in_user_cnt / (in_user_cnt + out_user_cnt)

    date_cnt = lt.jyrq.nunique()  # number of distinct transaction dates
    d[user] = [all_cnt, in_cnt, out_cnt, in_ratio, in_amt, out_amt, in_amt_ratio, in_user_cnt, out_user_cnt, in_user_ratio, date_cnt]


data_df = pd.DataFrame.from_dict(d).T.reset_index()
data_df.columns = ['zhdh', 'all_cnt', 'in_cnt', 'out_cnt', 'in_ratio', 'in_amt', 'out_amt', 'in_amt_ratio', 'in_user_cnt', 'out_user_cnt', 'in_user_ratio', 'date_cnt']

data_static.columns = ['zhdh', 'khrq', 'khjgdh', 'xb', 'age']
feats = pd.merge(left=data_df, right=data_static[['zhdh', 'xb', 'age']], on='zhdh', how='left')
final = pd.merge(left=feats, right=data_label, on='zhdh', how='left')

train = final[final.black_flag.notnull()].reset_index(drop=True)
test = final[final.black_flag.isnull()].reset_index(drop=True)

# feat = ['zhdh', 'khjgdh', 'xb', 'nl', 'dfzh', 'jdbj', 'jyje', 'zhye', 'dfhh', 'jyqd', 'zydh', 'dfmccd', 'year0', 'mon0', 'day0', 'year', 'mon', 'day', 'hour', 'min', 'sec', 'in_num', 'out_num', 'in_mon', 'out_mon', 'cnt']
# drop zhdh (account id) and black_flag (label) from the feature list
feat = train.columns[1:-1].tolist()

# model = lgb.LGBMClassifier(
#     boosting_type="gbdt", num_leaves=128, reg_alpha=5, reg_lambda=5,
#     max_depth=-1, n_estimators=2000, subsample=0.8, colsample_bytree=0.8, n_jobs=-1,
#     subsample_freq=1, min_child_samples=50, learning_rate=0.1, random_state=2023
# )
# model.fit(train[feat], train['black_flag'])
# pred = model.predict_proba(test[feat])

X_train = train[feat]
y_train = train['black_flag']
X_test = test[feat]

oof_preds, test_preds = train_xgb_kfold(X_train, y_train, X_test)
best_thresh = gen_thres_new(train, oof_preds)

test['black_flag'] = np.where(test_preds > best_thresh, 1, 0)
test[['zhdh', 'black_flag']].to_csv('ans.csv', index=False)