利用通话、短信以及上网记录数据来预测风险用户
本文代码全部采用jupyter运行
先导入相关的包
# -*- coding: UTF-8 -*-
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics, model_selection
from sklearn.model_selection import train_test_split
处理电话、短信以及上网记录的训练数据
# Load the training split: the uid table plus the voice, SMS and
# internet-usage (wa) logs, all tab-separated.
# The time/date columns are forced to `str` so pandas does not infer a numeric
# dtype for them (presumably to keep the raw digit strings intact - confirm).
# The builtin `str` is used everywhere: the `np.str` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
uid_train = pd.read_csv('./new_data/train/uid_train.txt', sep='\t')
voice_train = pd.read_csv(
    './new_data/train/voice_train.txt',
    sep='\t',
    dtype={'start_time': str, 'end_time': str},
)
sms_train = pd.read_csv(
    './new_data/train/sms_train.txt',
    sep='\t',
    dtype={'start_time': str},
)
wa_train = pd.read_csv(
    './new_data/train/wa_train.txt',
    sep='\t',
    dtype={'date': str},
)
处理电话、短信以及上网记录的测试数据
# Load the test-split voice, SMS and internet-usage (wa) logs, all
# tab-separated.
# The time/date columns are forced to `str` so pandas does not infer a numeric
# dtype for them (presumably to keep the raw digit strings intact - confirm).
# The builtin `str` replaces `np.str`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24, where accessing it raises AttributeError.
voice_test = pd.read_csv(
    './new_data/test/voice_test.txt',
    sep='\t',
    dtype={'start_time': str, 'end_time': str},
)
sms_test = pd.read_csv(
    './new_data/test/sms_test.txt',
    sep='\t',
    dtype={'start_time': str},
)
wa_test = pd.read_csv(
    './new_data/test/wa_test.txt',
    sep='\t',
    dtype={'date': str},
)
合并电话、短信以及上网记录的训练数据和测试数据
# Read the test-split uid table, then stack every training log on top of its
# test counterpart (row-wise). The original row labels of both inputs are
# kept, because no index reset is requested.
uid_test = pd.read_csv('./new_data/test/uid_test.txt', sep='\t')
voice = pd.concat(objs=[voice_train, voice_test], axis='index')
sms = pd.concat(objs=[sms_train, sms_test], axis='index')
wa = pd.concat(objs=[wa_train, wa_test], axis='index')
数据去重。inplace=True表示直接在原来的DataFrame上删除重复项,而默认值False表示生成一个副本。
# Remove exact duplicate rows from each combined log. `inplace=True` mutates
# the existing DataFrame instead of returning a de-duplicated copy.
for log_frame in (voice, sms, wa):
    log_frame.drop_duplicates(inplace=True)
# Bare expression: as the last statement of a notebook cell it displays the
# three frames for inspection.
(voice, sms, wa)
输出的其中一部分结果
===================================================================
对用户短信记录进行特征工程处理
按不同取值分组,统计各用户的短信特征(例如发送/接收的不同号码数量)。unstack 将最内层的行索引转换为列,add_prefix 为列标签添加前缀,reset_index 重置索引
# Per user (vid): number of SMS records whose opp_head equals 0.
# Named aggregation is used here because the dict-renaming form
# `.agg({'0': func})` on a SeriesGroupBy was removed in pandas 1.0 and raises
# SpecificationError. The resulting column is still `sms_opp_head_0`.
sms_head_0_count = (
    sms.groupby('vid')['opp_head']
    .agg(sms_opp_head_0=lambda heads: np.sum(heads.values == 0))
    .reset_index()
    .fillna(0)
)
# Deviation of each user's count from the mean count over all users.
sms_head_0_count['sms_head_0_count_mean'] = (
    sms_head_0_count['sms_opp_head_0']
    - sms_head_0_count['sms_opp_head_0'].mean()
)
# Discard the opp_head == 0 records, so every later use of `sms` sees only
# the remaining rows.
sms = sms[sms['opp_head'] != 0]
# 各用户所发送\接收短信号码统计
# For every (vid, in_out) pair, count the distinct opp_num values, then pivot
# the in_out level into columns named `sms_in_out_unique_<in_out value>`.
# Users lacking one of the in_out values get 0 in that column.
sms_in_out_unique = (
    sms.groupby(['vid', 'in_out'])['opp_num']
    .nunique()
    .unstack()
    .add_prefix('sms_in_out_unique_')
    .reset_index()
    .fillna(0)
)
# Deviation of each user's in_out == 1 distinct-number count from the mean
# over all users (assumes in_out takes the value 1 - confirm against the data;
# the original comment describes this direction as "received").
sms_in_out_unique['sms_in_out_unique-mean'] = (
    sms_in_out_unique['sms_in_out_unique_1']
    - sms_in_out_unique['sms_in_out_unique_1'].mean()
)
# 各个用户接收\发送不同的短信号码统计
# Per user (vid): number of distinct opp_num values and number of non-null
# opp_num records.
# Named aggregation is used here because the dict-renaming form
# `.agg({'name': func})` on a SeriesGroupBy was removed in pandas 1.0 and
# raises SpecificationError. The output columns are unchanged:
# `sms_opp_num_unique_count` and `sms_opp_num_count`.
sms_opp_num = (
    sms.groupby('vid')['opp_num']
    .agg(
        # len(pd.unique(...)) is kept (rather than 'nunique') so a missing
        # value is still counted as one distinct entry, as before.
        sms_opp_num_unique_count=lambda nums: len(pd.unique(nums)),
        sms_opp_num_count='count',
    )
    .reset_index()
    .fillna(0)
)
# Deviation of each user's SMS record count from the mean over all users.
sms_opp_num['sms_count-mean'] = (
    sms_opp_num['sms_opp_num_count'] - sms_opp_num['sms_opp_num_count'].mean()
).astype('float')
# Deviation of each user's distinct-number count from the mean over all users.
sms_opp_num['sms_unique_count-mean'] = (
    sms_opp_num['sms_opp_num_unique_count']
    - sms_opp_num['sms_opp_num_unique_count'].mean()
).astype('float')
# 各个用户重复接收\发送某些短信号码的数量
sms_opp_num['sms_opp_num_diff']