目录:
一. 数据筛选
1.1 读取数据
import numpy as np
import pandas as pd
import csv
data_path = r'..\电费敏感预测\rawdata'
# 工单信息
file_jobinfo_train = '01_arc_s_95598_wkst_train.tsv'
file_jobinfo_test = '01_arc_s_95598_wkst_test.tsv'
# 通话信息记录
file_comm = '02_s_comm_rec.tsv'
# 应收电费信息
file_flow_train = '09_arc_a_rcvbl_flow.tsv'
file_flow_test = '09_arc_a_rcvbl_flow_test.tsv'
# 训练集
file_label = 'train_label.csv'
# 测试集
file_test = 'test_to_predict.csv'
train_info = pd.read_csv(data_path + '\processed_' + file_jobinfo_train,
sep = '\t', quoting = csv.QUOTE_NONE)
# quoting 防止文本里包含英文双引号导致报错
# 过滤CUST_NO为空的用户
train_info = train_info.loc[~train_info.CUST_NO.isnull()]
train_info['CUST_NO'] = train_info.CUST_NO.astype(np.int64)
train_info.head(2)
- 统计单数
train = train_info.CUST_NO.value_counts().to_frame().reset_index()
train.columns = ['CUST_NO', 'counts_of_jobinfo']
train.head()
1.2 加入label值
temp = pd.read_csv(data_path + '/' + file_label, header = None)
temp.columns = ['CUST_NO']
train['label'] = 0
train.loc[train.CUST_NO.isin(temp.CUST_NO), 'label'] = 1
train = train[['CUST_NO', 'label', 'counts_of_jobinfo']]
print(train.shape)
train.head(4)
- 测试集标签用-1 表示
test_info = pd.read_csv(data_path + 'processed_' + file_jobinfo_test, sep='\t', encoding='utf-8', quoting=csv.QUOTE_NONE)
test = test_info.CUST_NO.value_counts().to_frame().reset_index()
test.columns = ['CUST_NO', 'counts_of_jobinfo']
test['label'] = -1
test = test[['CUST_NO', 'label', 'counts_of_jobinfo']]
test.head()
df = train.append(test).copy()
del temp, train, test
1.3 留下一条工单记录的数据
df = df.loc[df.counts_of_jobinfo == 1].copy()
df.reset_index(drop = True, inplace = True)
train = df.loc[df.label != -1]
test = df.loc[df.label == -1]
print('低敏用户训练集: ', train.shape[0])
print('低敏用户正样本: ', train.loc[train.label == 1].shape[0])
print('低敏用户负样本: ', train.loc[train.label == 0].shape[0])
print('低敏用户测试集: ', test.shape[0])
df.drop(['counts_of_jobinfo'], axis = 1, inplace = True)
低敏用户训练集: 401626
低敏用户正样本: 13139
低敏用户负样本: 388487
低敏用户测试集: 327437
jobinfo = train_info.append(test_info).copy()
jobinfo = jobinfo.loc[jobinfo.CUST_NO.isin(df.CUST_NO)].copy()
jobinfo.reset_index(drop = True, inplace = True)
jobinfo = jobinfo.merge(df[['CUST_NO', 'label']], on = 'CUST_NO', how = 'left')
print(jobinfo.shape)
jobinfo.head()
1.4 加载通话数据
comm = pd.read_csv(data_path + '/' + file_comm, sep = '\t')
print('总数据量: ', comm.shape)
comm.drop_duplicates(inplace = True)
print('去掉重复无用的: ', comm.shape)
# 过滤掉没出现在jobinfo中的数据
comm = comm.loc[comm.APP_NO.isin(jobinfo.ID)]
comm = comm.rename(columns = {
'APP_NO': 'ID'})
comm = comm.merge(jobinfo[['ID', 'CUST_NO']], on = 'ID', how = 'left')
print('可用数据量: ', comm