# 数据准备阶段
## 导入基本的包
import os
import pandas as pd
import matplotlib.pyplot as plt
# plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
# plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
# plt.rcParams['font.family'] = ['sans-serif']
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import f1_score
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings("ignore")
## Load data (stage-2 train/test files)
path = './data/'
train = pd.read_csv(path + 'train_2.csv')
test = pd.read_csv(path + 'test_2.csv')
# Transfer-learning stacking features (disabled; produced by the last section of this notebook)
# train_stacking = pd.read_csv(path + '/stack/train.csv')
# test_stacking = pd.read_csv(path + '/stack/test.csv')
# print(len(train), len(test))
# train = train.merge(train_stacking, 'left', 'user_id')
# test = test.merge(test_stacking, 'left', 'user_id')
# print(len(train), len(test))
print(len(train), len(test))
train.info()
### 发现存在object类型的特征,需要进行转换
# The two fee columns are object-typed; print every value that cannot be
# parsed as float (these turn out to be the '\N' missing-value placeholder).
for fee_col in ('2_total_fee', '3_total_fee'):
    for raw in train[fee_col].values:
        try:
            float(raw)
        except Exception:
            print(raw)
# '\N' is the dataset's missing marker: replace with 0, then cast to numeric.
train = train.replace('\\N', 0)
train['2_total_fee'] = train['2_total_fee'].astype(float)
train['3_total_fee'] = train['3_total_fee'].astype(float)
train.info()
test.info()
# Same check for the test set; only 2_total_fee is object-typed here.
for raw in test['2_total_fee'].values:
    try:
        float(raw)
    except Exception:
        print(raw)
# Replace the '\N' placeholder and cast to numeric.
test = test.replace('\\N', 0)
test['2_total_fee'] = test['2_total_fee'].astype(float)
| 字段 | 中文名| 数据类型| 说明 |
|:-------:|:-------:|:-------:|:-------:|
|USERID| 用户ID| VARCHAR2(50)| 用户编码,标识用户的唯一字段|
|current_service| 套餐 |VARCHAR2(500) |/|
|service_type |套餐类型 |VARCHAR2(10) |0:23G融合,1:2I2C,2:2G,3:3G,4:4G|
|is_mix_service |是否固移融合套餐| VARCHAR2(10)| 1.是 0.否|
|online_time |在网时长| VARCHAR2(50) |/|
|1_total_fee| 当月总出账金额_月 |NUMBER| 单位:元|
|2_total_fee |当月前1月总出账金额_月| NUMBER |单位:元|
|3_total_fee| 当月前2月总出账金额_月| NUMBER| 单位:元|
|4_total_fee |当月前3月总出账金额_月 |NUMBER| 单位:元|
|month_traffic |当月累计-流量 |NUMBER| 单位:MB|
|many_over_bill| 连续超套 |VARCHAR2(500)| 1-是,0-否|
|contract_type| 合约类型| VARCHAR2(500) |ZBG_DIM.DIM_CBSS_ACTIVITY_TYPE|
|contract_time| 合约时长| VARCHAR2(500)| /|
|is_promise_low_consume |是否承诺低消用户| VARCHAR2(500) |1.是 0.否|
|net_service |网络口径用户| VARCHAR2(500) |20AAAAAA-2G|
|pay_times |交费次数 |NUMBER |单位:次|
|pay_num |交费金额 |NUMBER |单位:元|
|last_month_traffic |上月结转流量| NUMBER| 单位:MB|
|local_trafffic_month| 月累计-本地数据流量 |NUMBER |单位:MB|
|local_caller_time| 本地语音主叫通话时长| NUMBER| 单位:分钟|
|service1_caller_time |套外主叫通话时长| NUMBER |单位:分钟|
|service2_caller_time |Service2_caller_time| NUMBER |单位:分钟|
|gender| 性别 |varchar2(100) |01.男 02女|
|age| 年龄| varchar2(100)| /|
|complaint_level |投诉重要性| VARCHAR2(1000) |1:普通,2:重要,3:重大|
|former_complaint_num| 历史投诉总量| NUMBER |单位:次|
|former_complaint_fee| 历史执行补救费用交费金额 |NUMBER |单位:分|
# Data analysis
# Distribution of tariff plans (the prediction target) in the training set.
train['current_service'].value_counts()
999999套餐的样本特别少,初步看作是异常值——毕竟样本极少的类别很难让模型学习到有效信息。
## Univariate analysis
# Age distribution of customers.
train['age'].hist(bins=70)
由图中可以看出,客户数量最大的年龄段为17-31岁,其中22岁客户占比5%,远超平均水平1.7%。可见客户群体和年龄存在较大关联,且年轻人群占有较大比重。
# Gender distribution (0 is also present as a missing/default code).
train['gender'].hist(bins=70)
客户性别分布明显,性别1 客户数量为性别2 客户数量的2倍多,性别的影响非常大。
我们观察到性别中有0的缺省值。对于这部分,我们尝试了两种处理方法:一种是填充service_type(或更细粒度分组)对应字段的众数,另一种是保留原始值。最终我们选取了原始值,因为我们认为缺省性别在不同套餐中的转化率呈现了分布差异。
# Distribution and counts of service_type.
train['service_type'].hist(bins=70)
train['service_type'].value_counts()
服务类型集中在1和4,3很大可能为异常值
# Complaint-importance distribution.
train['complaint_level'].hist(bins=70)
投诉重要性大部分为0
## Multivariate analysis
# Overlay the 1_total_fee histogram of every tariff plan on one axis to see
# how well the current-month bill separates the plans.
fig, ax = plt.subplots(figsize=(10, 10))
service = train['current_service'].unique()
for plan in service:
    plan_fees = train[train['current_service'] == plan]['1_total_fee']
    plan_fees.hist(bins=100, label=str(plan), ax=ax, alpha=0.5)
plt.xlim([0, 500])
plt.legend()
观察1_total_fee(当月总出账金额_月)与套餐的分布,还是有很明显的区分度的,当月总出账金额很大程度上影响套餐类型的。后续可以多考虑从1_total_fee、2_total_fee、3_total_fee、4_total_fee中挖掘更多有用特征。
一般而言,有区分度的特征都为优质特征。
# Joint distribution of age vs current-month bill.
sns.jointplot(x='age',y='1_total_fee',data = train)
话费与年龄存在明显关系,除了年龄为0外,高花费集中区域很明显
# Joint distribution of age vs monthly traffic.
sns.jointplot(x='age',y='month_traffic',data = train)
# Per-plan count and mean of the current-month bill.
# fix: the original passed SET literals to .agg(), which produces a
# nondeterministic column order; lists keep the output order stable.
train.groupby(['current_service'])['1_total_fee'].agg(['count', 'mean']).reset_index()
# Plan/service_type cross counts (excluding the suspicious service_type 3).
train[train.service_type != 3].groupby(['current_service', 'service_type'])['user_id'].agg(['count'])
我们可以得到一个明显的规律,service_type可以将套餐分为两个部分,这两部分是没有交叉的,其中一类有8个,另外一类有3个。这给我们比赛带来一个思路是,可以分模型预测作为一个尝试的方向。(不考虑service_type=3的情况下)
# Feature engineering
## Data preprocessing
# Stack train and test; fillna(0) also gives test rows a 0 in current_service.
data = pd.concat([train, test], ignore_index=True).fillna(0)
# current_service is the prediction target (label 0 marks test rows after fillna).
data['label'] = data.current_service.astype(int)
data = data.replace('\\N', 0)
data['gender'] = data.gender.astype(int)
data['service_type'].value_counts()
# service_type 3 is very rare (suspected anomaly); fold it into type 4.
data.loc[data['service_type'] == 3, 'service_type'] = 4
**原始特征分类**
# Raw categorical features (low-cardinality codes).
origin_cate_feature = ['service_type', 'complaint_level', 'contract_type', 'gender', 'is_mix_service',
                       'is_promise_low_consume',
                       'many_over_bill', 'net_service']
# Raw numeric features.
origin_num_feature = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
                      'age', 'contract_time',
                      'former_complaint_fee', 'former_complaint_num',
                      'last_month_traffic', 'local_caller_time', 'local_trafffic_month', 'month_traffic',
                      'online_time', 'pay_num', 'pay_times', 'service1_caller_time', 'service2_caller_time']
**类型转换**
由于部分特征为object类型
# Cast every numeric column to float (several were read as object because
# of the '\N' placeholder, now replaced).
for num_col in origin_num_feature:
    data[num_col] = data[num_col].astype(float)
## Embedding 特征
这里使用Word2Vec构建embedding特征
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing
## Build Word2Vec embeddings over the four monthly fee columns.
# Each user's four fee values form one "sentence", so fee values that co-occur
# across months get similar embeddings.
L = 10  # embedding dimension
sentence = []
for line in list(data[['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']].values):
    sentence.append([str(float(v)) for v in line])
print('training...')
# NOTE(review): size=/iter= are gensim<4 keyword names (vector_size/epochs in
# gensim>=4) — keep the pinned gensim version in mind.
model = Word2Vec(sentence, size=L, window=2, min_count=1,
                 workers=multiprocessing.cpu_count(), iter=10)
print('outputing...')
# fix: the output directory may not exist yet; to_csv would fail without it.
os.makedirs('./data/w2v', exist_ok=True)
for fea in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    # One row per distinct fee value: [value, W0..W9].
    values = set(data[fea].values)
    print(len(values))
    w2v = []
    for v in values:
        row = [v]
        row.extend(model[str(float(v))])
        w2v.append(row)
    out_df = pd.DataFrame(w2v)
    name = [fea]
    for i in range(L):
        name.append(name[0] + 'W' + str(i))
    out_df.columns = name
    out_df.to_csv('./data/w2v/' + fea + '.csv', index=False)
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Load one of the saved embedding tables for visualisation.
df = pd.read_csv('./data/w2v/3_total_fee.csv')
# Point labels: the raw fee values as strings.
l = list(df['3_total_fee'].astype('str'))
# Column names: the fee column followed by its 10 embedding dimensions.
name = list(df)
def plot_with_labels(low_dim_embs, labels, filename = 'tsne.png'):
    """Scatter-plot 2-D embeddings, annotate each point with its label, save to file."""
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize= (10, 18))
    for point, label in zip(low_dim_embs, labels):
        x, y = point
        plt.scatter(x, y)
        plt.annotate(label, xy = (x, y), textcoords = 'offset points',
                     ha = 'right', va = 'bottom')
    plt.savefig(filename)
# Project the first 300 embedding vectors to 2-D with t-SNE and plot them.
tsne = TSNE(perplexity = 30, n_components = 2, init = 'pca', n_iter = 5000)
plot_only = 300
low_dim_embs = tsne.fit_transform(df.iloc[:plot_only][name[1:]])
labels = l[:plot_only]
plot_with_labels(low_dim_embs, labels)
### 考虑对Embedding特征进行聚类
观察聚类结果对套餐的区分度
# Merge the saved Word2Vec embedding columns back onto the full dataset,
# one fee column at a time; collect the new column names in w2v_features.
w2v_features = []
for col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    emb = pd.read_csv('./data/w2v/' + col + '.csv').drop_duplicates([col])
    emb_cols = list(emb)
    emb_cols.remove(col)
    w2v_features += emb_cols
    print(len(data))
    data = pd.merge(data, emb, on=col, how='left')
    print(len(data))
print(w2v_features)
## Count (frequency) features
# Top-20 most frequent current-month bill values.
train['1_total_fee'].value_counts().head(20).plot(kind='bar', figsize=(16,9))
**是否有区分度呢?**
# Plan distribution among users whose bill is exactly 106 — a spot check of
# whether an individual fee value discriminates between plans.
train[train['1_total_fee']==106]['current_service'].value_counts()
count_feature_list = []  # names of all generated count-feature columns


def feature_count(data, features=None):
    """Frequency-encode `features`: merge a `count_<f1>_<f2>...` column onto `data`.

    For each row the new column holds how many rows of `data` share that row's
    value combination of `features`.  When 'service_type' is among the
    features, an extra `train_<...>` column counts the same combination in the
    stage-1 training set `train_first`.

    Returns the merged DataFrame and appends the new column name(s) to the
    module-level `count_feature_list`.
    """
    # fix: avoid the mutable-default-argument anti-pattern (was features=[]).
    features = [] if features is None else features
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = 'count'
    for f in features:
        new_feature += '_' + f.replace('add_', '')
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features)
    count_feature_list.append(new_feature)
    # Transfer feature: same count computed on the stage-1 training data.
    # NOTE(review): train_first is only defined in the later transfer-learning
    # section; calling with 'service_type' before then raises NameError.
    if 'service_type' in features:
        temp_2 = train_first.groupby(features).size().reset_index().rename(
            columns={0: 'train_' + new_feature})
        data = data.merge(temp_2, 'left', on=features)
        count_feature_list.append('train_' + new_feature)
    return data
# Frequency-encode each of these columns individually (same order as before).
for count_col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
                  'former_complaint_fee', 'pay_num', 'contract_time',
                  'last_month_traffic', 'online_time']:
    data = feature_count(data, [count_col])
## Transfer features (pairwise counts conditioned on type; kept disabled —
## they require train_first from the transfer-learning section)
# for i in ['service_type', 'contract_type']:
#     data = feature_count(data, [i, '1_total_fee'])
#     data = feature_count(data, [i, '2_total_fee'])
#     data = feature_count(data, [i, '3_total_fee'])
#     data = feature_count(data, [i, '4_total_fee'])
#     data = feature_count(data, [i, 'former_complaint_fee'])
#     data = feature_count(data, [i, 'pay_num'])
#     data = feature_count(data, [i, 'contract_time'])
#     data = feature_count(data, [i, 'last_month_traffic'])
#     data = feature_count(data, [i, 'online_time'])
# Load the stage-1 data to compare its distribution with stage-2.
train1 = pd.read_csv(path + 'train_1.csv')
test1 = pd.read_csv(path + 'test_1.csv')
data1 = pd.concat([train1, test1], ignore_index=True).fillna(0)
# current_service is the target; test rows got 0 via fillna above.
data1['label'] = data1.current_service.astype(int)
data1 = data1.replace('\\N', 0)
data1['gender'] = data1.gender.astype(int)
# Compare plan distributions and the top bill-value counts across stages.
train['current_service'].value_counts()
train1['current_service'].value_counts()
train1['1_total_fee'].value_counts().head(20).plot(kind='bar', figsize=(16,9))
其实从最基本的count统计就能看出初赛数据和复赛数据之间的差异
## Difference / ratio features
diff_feature_list = ['diff_total_fee_1', 'diff_total_fee_2', 'diff_total_fee_3', 'last_month_traffic_rest',
                     'rest_traffic_ratio',
                     'total_fee_mean', 'total_fee_max', 'total_fee_min', 'total_caller_time', 'service2_caller_ratio',
                     'local_caller_ratio',
                     'total_month_traffic', 'month_traffic_ratio', 'last_month_traffic_ratio', 'pay_num_1_total_fee',
                     '1_total_fee_call_fee', '1_total_fee_call2_fee', '1_total_fee_trfc_fee']
# Month-over-month bill changes.
data['diff_total_fee_1'] = data['1_total_fee'] - data['2_total_fee']
data['diff_total_fee_2'] = data['2_total_fee'] - data['3_total_fee']
data['diff_total_fee_3'] = data['3_total_fee'] - data['4_total_fee']
# Gap between the amount paid and the current bill.
data['pay_num_1_total_fee'] = data['pay_num'] - data['1_total_fee']
# Traffic remaining after last month's carry-over, floored at 0.
data['last_month_traffic_rest'] = data['month_traffic'] - data['last_month_traffic']
# fix: use .loc instead of chained indexing (data[col][mask] = 0), which
# triggers SettingWithCopyWarning and can silently fail to write back.
data.loc[data['last_month_traffic_rest'] < 0, 'last_month_traffic_rest'] = 0
data['rest_traffic_ratio'] = (data['last_month_traffic_rest'] * 15 / 1024) / data['1_total_fee']
total_fee = []
for i in range(1, 5):
    total_fee.append(str(i) + '_total_fee')
# Row-wise stats over the four monthly fees.
data['total_fee_mean'] = data[total_fee].mean(1)
data['total_fee_max'] = data[total_fee].max(1)
data['total_fee_min'] = data[total_fee].min(1)
data['total_caller_time'] = data['service2_caller_time'] + data['service1_caller_time']
data['service2_caller_ratio'] = data['service2_caller_time'] / data['total_caller_time']
data['local_caller_ratio'] = data['local_caller_time'] / data['total_caller_time']
data['total_month_traffic'] = data['local_trafffic_month'] + data['month_traffic']
data['month_traffic_ratio'] = data['month_traffic'] / data['total_month_traffic']
data['last_month_traffic_ratio'] = data['last_month_traffic'] / data['total_month_traffic']
# Bill minus estimated call/traffic cost — assumes 0.15 yuan/min and 0.3 per
# traffic unit pricing; TODO confirm tariff constants.
data['1_total_fee_call_fee'] = data['1_total_fee'] - data['service1_caller_time'] * 0.15
data['1_total_fee_call2_fee'] = data['1_total_fee'] - data['service2_caller_time'] * 0.15
data['1_total_fee_trfc_fee'] = data['1_total_fee'] - (
    data['month_traffic'] - 2 * data['last_month_traffic']) * 0.3
# This estimate is not meaningful for service_type 1; blank it out.
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None
# Assemble the final feature lists: categorical + all engineered numerics.
cate_feature = origin_cate_feature
num_feature = origin_num_feature + count_feature_list + diff_feature_list + w2v_features
for cat_col in cate_feature:
    data[cat_col] = data[cat_col].astype('category')
for num_col in num_feature:
    data[num_col] = data[num_col].astype(float)
feature = cate_feature + num_feature
print(len(feature), feature)
# Train the model
def f1_score_vali(preds, data_vali):
    """Custom LightGBM feval: squared macro-F1 over the 11 classes.

    LightGBM passes `preds` as a flat class-major array, hence the
    reshape(11, -1) followed by argmax over axis 0.  The score is squared
    (presumably to amplify differences near 1 — behavior kept as-is);
    True marks the metric as higher-is-better.
    """
    labels = data_vali.get_label()
    pred_labels = np.argmax(preds.reshape(11, -1), axis=0)
    score_vali = f1_score(y_true=labels, y_pred=pred_labels, average='macro')
    return 'f1_score', score_vali ** 2, True
# Extract the training rows and their labels (label 0 = test rows,
# 999999 = the rare outlier plan; both excluded).
train_mask = (data.label != 0) & (data.label != 999999)
X = data[train_mask][feature].reset_index(drop=True)
y = data[train_mask].label.reset_index(drop=True)
# Map plan ids <-> contiguous class indices 0..10 (sorted for determinism).
sorted_plans = sorted(set(y))
label2current_service = dict(enumerate(sorted_plans))
current_service2label = {plan: idx for idx, plan in enumerate(sorted_plans)}
label2current_service
current_service2label
y = pd.Series(y).map(current_service2label)
# LightGBM multiclass training parameters.
params = {
    "learning_rate": 0.1,
    "boosting": 'gbdt',
    "lambda_l2": 0.1,          # L2 regularisation
    "max_depth": -1,           # unlimited depth; num_leaves is the capacity cap
    "num_leaves": 128,
    "bagging_fraction": 0.8,   # row subsampling
    "feature_fraction": 0.8,   # column subsampling
    "max_bin": 1500,
    "metric": None,            # evaluation comes from the custom feval (macro-F1)
    "objective": "multiclass",
    "num_class": 11,
    "silent": True,
    "nthread": 10,
    "verbose": -1
}
cv_pred = []  # per-fold predictions on the test set (stacked column-wise)
oof_pred = np.zeros(X.shape[0])  # out-of-fold predictions on the train set
skf = StratifiedKFold(n_splits=5, random_state=20181, shuffle=True)
for index, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(index)
    train_x, test_x, train_y, test_y = X.loc[train_index], X.loc[test_index], y.loc[train_index], y.loc[test_index]
    train_data = lgb.Dataset(train_x, label=train_y)
    valid_data = lgb.Dataset(test_x, label=test_y)
    clf = lgb.train(params, train_data, num_boost_round=2000, valid_sets=[valid_data], feval=f1_score_vali,
                    verbose_eval=20, early_stopping_rounds=50)
    # Predict the unlabeled test rows with this fold's model.
    y_test = clf.predict(data[data.label == 0][feature], num_iteration=clf.best_iteration)
    y_test = [np.argmax(x) for x in y_test]
    # fix: out-of-fold predictions must be made on the HELD-OUT fold
    # (test_index); the original used train_index, so oof_pred contained
    # in-fold predictions repeatedly overwritten across folds.
    oof_pred[test_index] = [np.argmax(x) for x in
                            clf.predict(X.loc[test_index][feature], num_iteration=clf.best_iteration)]
    # Stack each fold's test predictions as a new column for later voting.
    if index == 0:
        cv_pred = np.array(y_test).reshape(-1, 1)
    else:
        cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
# Majority vote across the 5 fold predictions for each test row.
submit = [np.argmax(np.bincount(row)) for row in cv_pred]
result = pd.DataFrame()
result['user_id'] = data[data.label == 0]['user_id']
result['predict'] = submit
result['predict'] = result['predict'].map(label2current_service)
# Hard-coded override for one specific user (presumably the lone 999999
# outlier sample — confirm against the training data).
result.loc[result['user_id'] == '4VNcD6kE0sjnAvFX', 'predict'] = 999999
print(len(result), result.predict.value_counts())
print(result.sort_values('user_id').head())
result[['user_id', 'predict']].to_csv(path + '/sub.csv', index=False)
# Error analysis
## Compare per-class F1
# NOTE(review): train_x/test_x/train_y/test_y are whatever the LAST CV fold
# left behind, so this analysis is on a single fold's split.
lgb_model = lgb.LGBMClassifier(
    boosting_type="gbdt", num_leaves=32, reg_alpha=0, reg_lambda=0.,
    max_depth=-1, n_estimators=100, objective='multiclass', metric="None",
    subsample=0.9, colsample_bytree=0.5, subsample_freq=1,
    learning_rate=0.2, random_state=2018, n_jobs=10
)
lgb_model.fit(train_x, train_y, categorical_feature=cate_feature)
print(lgb_model.best_score_)
# average=None returns one F1 score per class.
score = f1_score(y_true=test_y, y_pred=lgb_model.predict(test_x), average=None)
print(score)
from sklearn.metrics import confusion_matrix
conf_mx = confusion_matrix(test_y, y_pred=lgb_model.predict(test_x))
# Grey-scale image of the confusion matrix (bright = many samples).
plt.matshow(conf_mx, cmap=plt.cm.gray)
pd.DataFrame(conf_mx)
# Mapping from class index back to plan id, to read the matrix axes.
label2current_service
根据混淆矩阵可以看到,在830只有2000多个样本时,有接近一半,被错误的分类到尾数为166的套餐(以下简称166)中,而166对830分错较少
## 观察 99999830 与 89950166的差异
# Side-by-side EDA of the two frequently-confused plans (…830 vs …166),
# keeping the original plot order: fee, then traffic, then complaint level.
for plan in (99999830, 89950166):
    sns.jointplot(x='age', y='1_total_fee', data=train[train.current_service == plan])
for plan in (99999830, 89950166):
    sns.jointplot(x='age', y='month_traffic', data=train[train.current_service == plan])
for plan in (99999830, 89950166):
    train[train.current_service == plan]['complaint_level'].hist(bins=70)
再对830和166两类进行EDA,可以看到无论是流量,还是话费,两套餐都呈现出高度一致
所以选择单独对这两类套餐进行二分类,既将830数据视为正样本,166视为负样本对模型进行训练,预测最终模型结果中830套餐为166套餐的概率,并对概率0.5以下进行修改
# 迁移学习
import os
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")
path = './data/'
w2v_path = path + 'w2v'
# Stage-2 train/test plus the stage-1 training set for transfer learning.
train = pd.read_csv(path + 'train_2.csv')
test = pd.read_csv(path + 'test_2.csv')
train_first = pd.read_csv(path + 'train_1.csv')
# data_type flags the origin: 0 = stage-2 rows, 1 = stage-1 rows.
train['data_type'] = 0
test['data_type'] = 0
train_first['data_type'] = 1
data = pd.concat([train, test, train_first], ignore_index=True).fillna(0)
# current_service is the target; unlabeled test rows become 0 via fillna.
data['label'] = data.current_service.astype(int)
# NOTE(review): here '\N' is mapped to 999, unlike the 0 used in the main
# pipeline — confirm this asymmetry is intended.
data = data.replace('\\N', 999)
data['gender'] = data.gender.astype(int)
# Raw categorical / numeric feature lists (same as the main pipeline).
origin_cate_feature = ['service_type', 'complaint_level', 'contract_type', 'gender', 'is_mix_service',
                       'is_promise_low_consume',
                       'many_over_bill', 'net_service']
origin_num_feature = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
                      'age', 'contract_time',
                      'former_complaint_fee', 'former_complaint_num',
                      'last_month_traffic', 'local_caller_time', 'local_trafffic_month', 'month_traffic',
                      'online_time', 'pay_num', 'pay_times', 'service1_caller_time', 'service2_caller_time']
# Cast numeric columns (object-typed due to the '\N' placeholder) to float.
for i in origin_num_feature:
    data[i] = data[i].astype(float)
# Attach the previously saved Word2Vec embedding columns for each fee feature.
w2v_features = []
for col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    emb = pd.read_csv(w2v_path + '/' + col + '.csv').drop_duplicates([col])
    emb_cols = list(emb)
    emb_cols.remove(col)
    w2v_features += emb_cols
    data = pd.merge(data, emb, on=col, how='left')
count_feature_list = []  # names of all generated count-feature columns


def feature_count(data, features=None):
    """Frequency-encode `features`: merge a `count_<f1>_<f2>...` column onto `data`.

    For each row the new column holds how many rows of `data` share that row's
    value combination of `features`.  Re-running with the same features first
    removes the stale column so the merge cannot create suffixed duplicates.

    Returns the merged DataFrame and appends the new name to the module-level
    `count_feature_list`.
    """
    # fix: avoid the mutable-default-argument anti-pattern (was features=[]).
    features = [] if features is None else features
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = 'count'
    for f in features:
        new_feature += '_' + f.replace('add_', '')
    # fix: explicit membership test instead of a bare try/except around `del`,
    # which would also have swallowed unrelated errors.
    if new_feature in data.columns:
        del data[new_feature]
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features)
    count_feature_list.append(new_feature)
    return data
# Frequency-encode single columns, then the same columns paired with the
# service/contract type (order preserved from the original calls).
count_cols = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
              'former_complaint_fee', 'pay_num', 'contract_time',
              'last_month_traffic', 'online_time']
for col in count_cols:
    data = feature_count(data, [col])
for anchor in ['service_type', 'contract_type']:
    for col in count_cols:
        data = feature_count(data, [anchor, col])
# Difference / ratio features (same definitions as the main pipeline).
diff_feature_list = ['diff_total_fee_1', 'diff_total_fee_2', 'diff_total_fee_3', 'last_month_traffic_rest',
                     'rest_traffic_ratio',
                     'total_fee_mean', 'total_fee_max', 'total_fee_min', 'total_caller_time', 'service2_caller_ratio',
                     'local_caller_ratio',
                     'total_month_traffic', 'month_traffic_ratio', 'last_month_traffic_ratio', 'pay_num_1_total_fee',
                     '1_total_fee_call_fee', '1_total_fee_call2_fee', '1_total_fee_trfc_fee']
# Month-over-month bill changes.
data['diff_total_fee_1'] = data['1_total_fee'] - data['2_total_fee']
data['diff_total_fee_2'] = data['2_total_fee'] - data['3_total_fee']
data['diff_total_fee_3'] = data['3_total_fee'] - data['4_total_fee']
data['pay_num_1_total_fee'] = data['pay_num'] - data['1_total_fee']
# Traffic remaining after last month's carry-over, floored at 0.
data['last_month_traffic_rest'] = data['month_traffic'] - data['last_month_traffic']
# fix: use .loc instead of chained indexing (data[col][mask] = 0), which
# triggers SettingWithCopyWarning and can silently fail to write back.
data.loc[data['last_month_traffic_rest'] < 0, 'last_month_traffic_rest'] = 0
data['rest_traffic_ratio'] = (data['last_month_traffic_rest'] * 15 / 1024) / data['1_total_fee']
total_fee = []
for i in range(1, 5):
    total_fee.append(str(i) + '_total_fee')
# Row-wise stats over the four monthly fees.
data['total_fee_mean'] = data[total_fee].mean(1)
data['total_fee_max'] = data[total_fee].max(1)
data['total_fee_min'] = data[total_fee].min(1)
data['total_caller_time'] = data['service2_caller_time'] + data['service1_caller_time']
data['service2_caller_ratio'] = data['service2_caller_time'] / data['total_caller_time']
data['local_caller_ratio'] = data['local_caller_time'] / data['total_caller_time']
data['total_month_traffic'] = data['local_trafffic_month'] + data['month_traffic']
data['month_traffic_ratio'] = data['month_traffic'] / data['total_month_traffic']
data['last_month_traffic_ratio'] = data['last_month_traffic'] / data['total_month_traffic']
# Bill minus estimated call/traffic cost — assumes 0.15 yuan/min and 0.3 per
# traffic unit pricing; TODO confirm tariff constants.
data['1_total_fee_call_fee'] = data['1_total_fee'] - data['service1_caller_time'] * 0.15
data['1_total_fee_call2_fee'] = data['1_total_fee'] - data['service2_caller_time'] * 0.15
data['1_total_fee_trfc_fee'] = data['1_total_fee'] - (
    data['month_traffic'] - 2 * data['last_month_traffic']) * 0.3
# This estimate is not meaningful for service_type 1; blank it out.
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None
# Assemble the final feature lists: categorical + all engineered numerics.
cate_feature = origin_cate_feature
num_feature = origin_num_feature + count_feature_list + diff_feature_list + w2v_features
for cat_col in cate_feature:
    data[cat_col] = data[cat_col].astype('category')
for num_col in num_feature:
    data[num_col] = data[num_col].astype(float)
feature = cate_feature + num_feature
print(len(feature), feature)
# Train on the stage-1 rows (data_type == 1) and evaluate on the labelled
# stage-2 rows, after dropping the 999999 outlier plan.
data = data[data.label != 999999]
first_stage = data.data_type == 1
train_x = data[first_stage][feature]
train_y = data[first_stage].label
eval_mask = (data.data_type == 0) & (data.label != 0)
test_x = data[eval_mask][feature]
test_y = data[eval_mask].label
lgb_model = lgb.LGBMClassifier(
    boosting_type="gbdt", num_leaves=120, reg_alpha=0, reg_lambda=0.,
    max_depth=-1, n_estimators=2500, objective='multiclass', metric="None",
    subsample=0.9, colsample_bytree=0.5, subsample_freq=1,
    learning_rate=0.035, random_state=2018, n_jobs=10
)
lgb_model.fit(train_x, train_y, categorical_feature=cate_feature)
print(lgb_model.best_score_)
# Persist the class-probability ("stacking") features for the stage-2 rows,
# to be merged back in the main pipeline.
stacking_path = path + 'stack/'
if not os.path.exists(stacking_path):
    print(stacking_path)
    os.makedirs(stacking_path)
train_proba = lgb_model.predict_proba(test_x[feature])
test_proba = lgb_model.predict_proba(data[data.label == 0][feature])
print(len(train_proba), len(test_proba))
stacking_train = data[(data.data_type == 0) & (data.label != 0)][['user_id']]
stacking_test = data[data.label == 0][['user_id']]
for cls in range(11):
    stacking_train['stacking_' + str(cls)] = train_proba[:, cls]
    stacking_test['stacking_' + str(cls)] = test_proba[:, cls]
stacking_train.to_csv(stacking_path + 'train.csv', index=False)
stacking_test.to_csv(stacking_path + 'test.csv', index=False)
# Sanity check: per-class F1 of the transferred model on stage-2 labels.
score = f1_score(y_true=test_y, y_pred=lgb_model.predict(test_x), average=None)
print(score)