机器学习相关操作分享(五)

# 数据准备阶段
## 导入基本的包
import os
import pandas as pd

import matplotlib.pyplot as plt
# plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
# plt.rcParams['axes.unicode_minus']=False #用来正常显示负号
# plt.rcParams['font.family'] = ['sans-serif']
import seaborn as sns

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import f1_score

from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing

from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings("ignore")
## 导入数据
path = './data/'

# Round-2 ("fusai") competition train / test splits.
train = pd.read_csv(path + 'train_2.csv')
test = pd.read_csv(path + 'test_2.csv')


# Stacking features built via transfer learning (produced by the
# transfer-learning section at the bottom of this file); kept commented
# out until the ./data/stack csvs have been generated.
# train_stacking = pd.read_csv(path + '/stack/train.csv')
# test_stacking = pd.read_csv(path + '/stack/test.csv')

# print(len(train), len(test))
# train = train.merge(train_stacking, 'left', 'user_id')
# test = test.merge(test_stacking, 'left', 'user_id')
# print(len(train), len(test))
print(len(train), len(test))
train.info()
### 发现存在object类型的特征,需要进行转换
def _report_non_float(series):
    """Print every value in *series* that cannot be parsed as a float.

    Quick diagnostic to surface the sentinel strings (e.g. '\\N') hiding
    inside columns that should be numeric.
    """
    for value in series.values:
        try:
            float(value)
        # Only parsing failures are interesting; the original bare
        # `except:` would also have swallowed KeyboardInterrupt and bugs.
        except (TypeError, ValueError):
            print(value)


_report_non_float(train['2_total_fee'])
_report_non_float(train['3_total_fee'])
# '\N' marks missing values in the raw dump; map it to 0 so the fee
# columns can be cast to float.
train = train.replace('\\N', 0)
train['2_total_fee'] = train['2_total_fee'].astype(float)
train['3_total_fee'] = train['3_total_fee'].astype(float)
train.info()
test.info()
_report_non_float(test['2_total_fee'])
test = test.replace('\\N', 0)
test['2_total_fee'] = test['2_total_fee'].astype(float)
| 字段     | 中文名| 数据类型|  说明 |
|:-------:|:-------:|:-------:|:-------:|
|USERID|    用户ID|    VARCHAR2(50)|    用户编码,标识用户的唯一字段|
|current_service|    套餐    |VARCHAR2(500)    |/|
|service_type    |套餐类型    |VARCHAR2(10)    |0:23G融合,1:2I2C,2:2G,3:3G,4:4G|
|is_mix_service    |是否固移融合套餐|    VARCHAR2(10)|    1.是 0.否|
|online_time    |在网时长|    VARCHAR2(50)    |/|
|1_total_fee|    当月总出账金额_月    |NUMBER|    单位:元|
|2_total_fee    |当月前1月总出账金额_月|    NUMBER    |单位:元|
|3_total_fee|    当月前2月总出账金额_月|    NUMBER|    单位:元|
|4_total_fee    |当月前3月总出账金额_月    |NUMBER|    单位:元|
|month_traffic    |当月累计-流量    |NUMBER|    单位:MB|
|many_over_bill|    连续超套    |VARCHAR2(500)|    1-是,0-否|
|contract_type|    合约类型|    VARCHAR2(500)    |ZBG_DIM.DIM_CBSS_ACTIVITY_TYPE|
|contract_time|    合约时长|    VARCHAR2(500)|    /|
|is_promise_low_consume    |是否承诺低消用户|    VARCHAR2(500)    |1.是 0.否|
|net_service    |网络口径用户|    VARCHAR2(500)    |20AAAAAA-2G|
|pay_times    |交费次数    |NUMBER    |单位:次|
|pay_num    |交费金额    |NUMBER    |单位:元|
|last_month_traffic    |上月结转流量|    NUMBER|    单位:MB|
|local_trafffic_month|    月累计-本地数据流量    |NUMBER    |单位:MB|
|local_caller_time|    本地语音主叫通话时长|    NUMBER|    单位:分钟|
|service1_caller_time    |套外主叫通话时长|    NUMBER    |单位:分钟|
|service2_caller_time    |Service2_caller_time|    NUMBER    |单位:分钟|
|gender|    性别    |varchar2(100)    |01.男 02女|
|age|    年龄|    varchar2(100)|    /|
|complaint_level    |投诉重要性|    VARCHAR2(1000)    |1:普通,2:重要,3:重大|
|former_complaint_num|    历史投诉总量|    NUMBER    |单位:次|
|former_complaint_fee|    历史执行补救费用交费金额    |NUMBER    |单位:分|
# 数据分析
# 查看训练集中套餐的统计信息及分布
train['current_service'].value_counts()
999999套餐特别的少,初步看作是异常值,毕竟类别少的是很难学习到有效信息的
## 单变量分析
train['age'].hist(bins=70)
由图中可以看出,客户数量最大的年龄段为17-31岁,其中22岁客户占比5%,远超平均水平1.7%。可见客户群体和年龄存在较大关联,且年轻人群占有较大比重。
train['gender'].hist(bins=70)
客户性别分布明显,性别1 客户数量为性别2 客户数量的2倍多,性别的影响非常大。

我们观察到性别中有0 的缺省值,对于这部分,我们使用了两种方法处理,一种是填充service_type(或更细致)对应字段的众数,和原始值。最终我们选取了原始值,我们认为缺省性别在不同套餐中的转换率呈现了分布差异。
train['service_type'].hist(bins=70)
train['service_type'].value_counts()
服务类型集中在1和4,3很大可能为异常值
train['complaint_level'].hist(bins=70)
投诉重要性大部分为0
## 多变量分析
fig, ax = plt.subplots(figsize=(10, 10))
# Overlay the 1_total_fee distribution of every plan on a single axis so
# their separability can be judged visually.
for plan in train['current_service'].unique():
    fees = train.loc[train['current_service'] == plan, '1_total_fee']
    fees.hist(bins=100, label=str(plan), ax=ax, alpha=0.5)
plt.xlim([0, 500])
plt.legend()
观察1_total_fee(当月总出账金额_月)与套餐的分布,还是有很明显的区分度的,当月总出账金额很大程度上影响套餐类型的。后续可以多考虑从1_total_fee、2_total_fee、3_total_fee、4_total_fee中挖掘更多有用特征。

一般而言,有区分度的特征都为优质特征。
sns.jointplot(x='age',y='1_total_fee',data = train)
话费与年龄存在明显关系,除了年龄为0外,高花费集中区域很明显
sns.jointplot(x='age',y='month_traffic',data = train)
train.groupby(['current_service'])['1_total_fee'].agg({'count','mean'}).reset_index()
train[train.service_type!=3].groupby(['current_service','service_type'])['user_id'].agg({'count'})
我们可以得到一个明显的规律,service_type可以将套餐分为两个部分,这两部分是没有交叉的,其中一类有8个,另外一类有3个。这给我们比赛带来一个思路是,可以分模型预测作为一个尝试的方向。(不考虑service_type=3的情况下)
# 特征工程
## 数据预处理
# Stack train and test so every feature is computed consistently for both.
# Test rows have no current_service, so fillna(0) marks them with 0.
data = pd.concat([train, test], ignore_index=True).fillna(0)
# current_service is the prediction target; label == 0 identifies test
# rows from here on.
data['label'] = data.current_service.astype(int)
data = data.replace('\\N', 0)

data['gender'] = data.gender.astype(int)
data['service_type'].value_counts()
# service_type 3 is extremely rare (treated as noise during EDA); fold it
# into the dominant class 4.
data.loc[data['service_type'] == 3, 'service_type'] = 4
**原始特征分类**
# Raw categorical columns (cast to pandas 'category' further down so
# LightGBM treats them as categorical splits).
origin_cate_feature = ['service_type', 'complaint_level', 'contract_type', 'gender', 'is_mix_service',
                       'is_promise_low_consume',
                       'many_over_bill', 'net_service']

# Raw numeric columns; some arrive as object dtype and are cast to float
# in the loop below.
origin_num_feature = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
                      'age', 'contract_time',
                      'former_complaint_fee', 'former_complaint_num',
                      'last_month_traffic', 'local_caller_time', 'local_trafffic_month', 'month_traffic',
                      'online_time', 'pay_num', 'pay_times', 'service1_caller_time', 'service2_caller_time']
**类型转换**

由于部分特征为object类型
# Force every numeric column to float (some are object dtype in the raw csv).
for i in origin_num_feature:
    data[i] = data[i].astype(float)
## Embedding 特征
这里使用Word2Vec构建embedding特征
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing

L = 10  # embedding dimensionality

# Treat the four monthly fees of each user as one "sentence", so fee
# values that co-occur for the same user land close together in the
# embedding space.
fee_cols = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']
sentence = []
for line in list(data[fee_cols].values):
    sentence.append([str(float(l)) for l in line])

print('training...')
model = Word2Vec(sentence, size=L, window=2, min_count=1, workers=multiprocessing.cpu_count(),
                 iter=10)
print('outputing...')

# Make sure the output directory exists before writing the per-column
# embedding tables (the original crashed here on a fresh checkout).
os.makedirs('./data/w2v', exist_ok=True)

for fea in fee_cols:
    # One table row per distinct fee value: [value, w2v_0, ..., w2v_{L-1}].
    values = set(data[fea].values)
    print(len(values))
    w2v = []
    for i in values:
        row = [i]
        row.extend(model[str(float(i))])
        w2v.append(row)
    out_df = pd.DataFrame(w2v)

    name = [fea]
    for i in range(L):
        name.append(name[0] + 'W' + str(i))
    out_df.columns = name
    out_df.to_csv('./data/w2v/' + fea + '.csv', index=False)
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
# Inspect the learned embedding for one fee column with t-SNE.
df=pd.read_csv('./data/w2v/3_total_fee.csv')
l=list(df['3_total_fee'].astype('str'))  # point labels: the raw fee values
name=list(df)  # column names: [fee, feeW0 ... feeW9]

def plot_with_labels(low_dim_embs, labels, filename = 'tsne.png'):
    """Scatter-plot 2-D embeddings, annotating each point with its label.

    low_dim_embs: (n, 2) array of 2-D coordinates (e.g. t-SNE output).
    labels: sequence of at most n annotation strings.
    filename: path the figure is written to.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    fig = plt.figure(figsize= (10, 18))
    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        plt.scatter(x, y)
        plt.annotate(label, xy = (x, y), textcoords = 'offset points', ha = 'right', va = 'bottom')
    plt.savefig(filename)
    # Close the figure after saving so repeated calls do not leak
    # matplotlib figures (the original kept every figure alive).
    plt.close(fig)

# Project the L-dimensional embeddings of the first 300 fee values down
# to 2-D for visual inspection of the learned structure.
tsne = TSNE(perplexity = 30, n_components = 2, init = 'pca', n_iter = 5000)

plot_only = 300
low_dim_embs = tsne.fit_transform(df.iloc[:plot_only][name[1:]])
labels = [l[i] for i in range(plot_only)]
plot_with_labels(low_dim_embs, labels)
### 考虑对Embedding特征进行聚类
观察聚类结果对套餐的区分度
w2v_features = []  # names of every merged embedding column
for col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    df = pd.read_csv('./data/w2v/' + col + '.csv')
    # Defensive de-dup: each fee value must map to exactly one embedding
    # row, otherwise the left merge below would duplicate rows in `data`.
    df = df.drop_duplicates([col])
    fs = list(df)
    fs.remove(col)
    w2v_features += fs
    print(len(data))
    data = pd.merge(data, df, on=col, how='left')
    print(len(data))  # should match the previous print (merge is 1:1)
print(w2v_features)
## 统计特征
train['1_total_fee'].value_counts().head(20).plot(kind='bar', figsize=(16,9))
**是否有区分度呢?**
train[train['1_total_fee']==106]['current_service'].value_counts()
count_feature_list = []  # names of all count features created below

def feature_count(data, features=[]):
    """Left-merge a frequency ('count_<cols>') feature for `features`.

    NOTE(review): `features=[]` is a mutable default argument; it is never
    mutated here, but callers should always pass an explicit list.
    """
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = 'count'
    for i in features:
        new_feature += '_' + i.replace('add_', '')
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
    data = data.merge(temp, 'left', on=features)
    count_feature_list.append(new_feature)

    # Transfer feature: the same count computed on the round-1 train set.
    # NOTE(review): `train_first` is only defined in the transfer-learning
    # section much further down this file; calling this with
    # 'service_type' in `features` before that point raises NameError —
    # confirm the intended execution order.
    if 'service_type' in features:
        temp_2 = train_first.groupby(features).size().reset_index().rename(columns={0: 'train_' + new_feature})
        data = data.merge(temp_2, 'left', on=features)
        count_feature_list.append('train_' + new_feature)

    return data
# Frequency encodings of the high-cardinality columns. None of these
# include 'service_type', so the train_first branch inside feature_count
# is never triggered here.
data = feature_count(data, ['1_total_fee'])
data = feature_count(data, ['2_total_fee'])
data = feature_count(data, ['3_total_fee'])
data = feature_count(data, ['4_total_fee'])

data = feature_count(data, ['former_complaint_fee'])

data = feature_count(data, ['pay_num'])
data = feature_count(data, ['contract_time'])
data = feature_count(data, ['last_month_traffic'])
data = feature_count(data, ['online_time'])
## 迁移特征
# for i in ['service_type', 'contract_type']:
#     data = feature_count(data, [i, '1_total_fee'])
#     data = feature_count(data, [i, '2_total_fee'])
#     data = feature_count(data, [i, '3_total_fee'])
#     data = feature_count(data, [i, '4_total_fee'])

#     data = feature_count(data, [i, 'former_complaint_fee'])

#     data = feature_count(data, [i, 'pay_num'])
#     data = feature_count(data, [i, 'contract_time'])
#     data = feature_count(data, [i, 'last_month_traffic'])
#     data = feature_count(data, [i, 'online_time'])
# Round-1 ("chusai") data, loaded only to compare its distributions with
# the round-2 data used above.
train1 = pd.read_csv(path + 'train_1.csv')
test1 = pd.read_csv(path + 'test_1.csv')

data1 = pd.concat([train1, test1], ignore_index=True).fillna(0)
# current_service is the target
data1['label'] = data1.current_service.astype(int)
data1 = data1.replace('\\N', 0)

data1['gender'] = data1.gender.astype(int)
train['current_service'].value_counts()
train1['current_service'].value_counts()
train1['1_total_fee'].value_counts().head(20).plot(kind='bar', figsize=(16,9))
其实从最基本的count统计就能看出初赛数据和复赛数据之间的差异
## 差值特征
# Names of all engineered difference / ratio features; consumed later
# when the final feature list is assembled.
diff_feature_list = ['diff_total_fee_1', 'diff_total_fee_2', 'diff_total_fee_3', 'last_month_traffic_rest',
                     'rest_traffic_ratio',
                     'total_fee_mean', 'total_fee_max', 'total_fee_min', 'total_caller_time', 'service2_caller_ratio',
                     'local_caller_ratio',
                     'total_month_traffic', 'month_traffic_ratio', 'last_month_traffic_ratio', 'pay_num_1_total_fee',
                     '1_total_fee_call_fee', '1_total_fee_call2_fee', '1_total_fee_trfc_fee']

# Month-over-month changes in the bill.
data['diff_total_fee_1'] = data['1_total_fee'] - data['2_total_fee']
data['diff_total_fee_2'] = data['2_total_fee'] - data['3_total_fee']
data['diff_total_fee_3'] = data['3_total_fee'] - data['4_total_fee']

# Recharge amount minus the current bill.
data['pay_num_1_total_fee'] = data['pay_num'] - data['1_total_fee']

# Traffic used beyond last month's carried-over quota, floored at zero.
data['last_month_traffic_rest'] = data['month_traffic'] - data['last_month_traffic']
# Bug fix: use .loc instead of the original chained indexing
# (data['col'][mask] = 0), which can write into a temporary copy and
# silently leave `data` unchanged (pandas SettingWithCopy).
data.loc[data['last_month_traffic_rest'] < 0, 'last_month_traffic_rest'] = 0
# 15 yuan/GB appears to be the assumed out-of-quota tariff — TODO confirm.
data['rest_traffic_ratio'] = (data['last_month_traffic_rest'] * 15 / 1024) / data['1_total_fee']

# Aggregates over the four monthly fees.
total_fee = [str(i) + '_total_fee' for i in range(1, 5)]
data['total_fee_mean'] = data[total_fee].mean(1)
data['total_fee_max'] = data[total_fee].max(1)
data['total_fee_min'] = data[total_fee].min(1)

# Call-time totals and shares. Division by a zero total yields inf/NaN;
# left as in the original pipeline.
data['total_caller_time'] = data['service2_caller_time'] + data['service1_caller_time']
data['service2_caller_ratio'] = data['service2_caller_time'] / data['total_caller_time']
data['local_caller_ratio'] = data['local_caller_time'] / data['total_caller_time']

data['total_month_traffic'] = data['local_trafffic_month'] + data['month_traffic']
data['month_traffic_ratio'] = data['month_traffic'] / data['total_month_traffic']
data['last_month_traffic_ratio'] = data['last_month_traffic'] / data['total_month_traffic']

# Bill minus estimated call / traffic charges (0.15 yuan per minute,
# 0.3 yuan per MB — presumed tariffs, TODO confirm).
data['1_total_fee_call_fee'] = data['1_total_fee'] - data['service1_caller_time'] * 0.15
data['1_total_fee_call2_fee'] = data['1_total_fee'] - data['service2_caller_time'] * 0.15
data['1_total_fee_trfc_fee'] = data['1_total_fee'] - (
        data['month_traffic'] - 2 * data['last_month_traffic']) * 0.3
# Not meaningful for service_type 1; blank it out there.
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None

# Final feature lists fed to LightGBM.
cate_feature = origin_cate_feature
num_feature = origin_num_feature + count_feature_list + diff_feature_list + w2v_features

# LightGBM consumes pandas 'category' columns natively.
for i in cate_feature:
    data[i] = data[i].astype('category')
for i in num_feature:
    data[i] = data[i].astype(float)

feature = cate_feature + num_feature

print(len(feature), feature)
# 训练模型
def f1_score_vali(preds, data_vali):
    """Custom LightGBM eval: (squared) macro-F1 over the 11 classes.

    `preds` arrives flattened; reshaping to (11, -1) and taking argmax
    over axis 0 recovers each row's predicted class.
    NOTE(review): this assumes LightGBM's class-major flattening of the
    multiclass predictions — confirm for the installed version.
    NOTE(review): the score is squared (score ** 2), presumably to
    amplify small differences between rounds; the competition metric may
    be plain macro-F1.
    """
    labels = data_vali.get_label()
    preds = np.argmax(preds.reshape(11, -1), axis=0)
    score_vali = f1_score(y_true=labels, y_pred=preds, average='macro')
    return 'f1_score', score_vali ** 2, True
# 提取训练集和标签
# Training rows: label != 0 (0 marks test rows after concat/fillna) and
# != 999999 (the near-empty plan treated as an outlier during EDA).
X = data[(data.label != 0) & (data.label != 999999)][feature].reset_index(drop=True)
y = data[(data.label != 0) & (data.label != 999999)].label.reset_index(drop=True)
# Bidirectional mapping between the raw plan ids and dense 0..10 class
# indices required by LightGBM's multiclass objective.
label2current_service = dict(
    zip(range(0, len(set(y))), sorted(list(set(y)))))
current_service2label = dict(
    zip(sorted(list(set(y))), range(0, len(set(y)))))
label2current_service
current_service2label
y = pd.Series(y).map(current_service2label)
params = {
    "learning_rate": 0.1,
    "boosting": 'gbdt',  
    "lambda_l2": 0.1,  # L2 regularisation
    "max_depth": -1,  # unlimited depth; complexity bounded by num_leaves
    "num_leaves": 128,
    "bagging_fraction": 0.8,  # row subsampling
    "feature_fraction": 0.8,  # column subsampling
    "max_bin": 1500,
    "metric": None,  # rely on the custom feval (f1_score_vali) instead
    "objective": "multiclass",
    "num_class": 11,  # 11 plans remain after dropping 0 / 999999
    "silent": True,
    "nthread": 10,
    "verbose": -1
}
cv_pred = []  # per-fold test-set predictions, stacked column-wise
oof_pred = np.zeros(X.shape[0])  # out-of-fold predictions on the train set
skf = StratifiedKFold(n_splits=5, random_state=20181, shuffle=True)

for index, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(index)

    train_x, test_x, train_y, test_y = X.loc[train_index], X.loc[test_index], y.loc[train_index], y.loc[test_index]

    train_data = lgb.Dataset(train_x, label=train_y)
    valid_data = lgb.Dataset(test_x, label=test_y)

    clf = lgb.train(params, train_data, num_boost_round=2000, valid_sets=[valid_data], feval=f1_score_vali,
                    verbose_eval=20, early_stopping_rounds=50)

    # Predict the (unlabelled) test set with this fold's model.
    y_test = clf.predict(data[data.label == 0][feature], num_iteration=clf.best_iteration)
    y_test = [np.argmax(x) for x in y_test]

    # Bug fix: out-of-fold predictions must be made on the held-out fold
    # (test_index). The original predicted on train_index, which leaks
    # training data into oof_pred and overwrites most of it every fold.
    oof_pred[test_index] = [np.argmax(x) for x in
                            clf.predict(test_x, num_iteration=clf.best_iteration)]

    # Stack this fold's test predictions as a new column for the later
    # majority vote.
    if index == 0:
        cv_pred = np.array(y_test).reshape(-1, 1)
    else:
        cv_pred = np.hstack((cv_pred, np.array(y_test).reshape(-1, 1)))
# Majority vote across the 5 folds for each test row.
submit = []
for line in cv_pred:
    submit.append(np.argmax(np.bincount(line)))
result = pd.DataFrame()
result['user_id'] = data[data.label == 0]['user_id']
result['predict'] = submit
result['predict'] = result['predict'].map(label2current_service)
# Hand-patch the single known 999999 user (the outlier plan excluded
# from training). NOTE(review): hard-coded user id — confirm it still
# exists in the current test set.
result.loc[result['user_id'] == '4VNcD6kE0sjnAvFX', 'predict'] = 999999

print(len(result), result.predict.value_counts())
print(result.sort_values('user_id').head())
result[['user_id', 'predict']].to_csv(
    path + '/sub.csv', index=False)
# 误差分析
## 对比每个类别的F1
# Quick sklearn-API model for per-class error analysis.
# NOTE(review): train_x / test_x / train_y / test_y here are whatever the
# final CV iteration above left behind — acceptable for a rough analysis,
# but this is a single fold, not the full training set.
lgb_model = lgb.LGBMClassifier(
    boosting_type="gbdt", num_leaves=32, reg_alpha=0, reg_lambda=0.,
    max_depth=-1, n_estimators=100, objective='multiclass', metric="None",
    subsample=0.9, colsample_bytree=0.5, subsample_freq=1,
    learning_rate=0.2, random_state=2018, n_jobs=10
)
lgb_model.fit(train_x, train_y, categorical_feature=cate_feature)
print(lgb_model.best_score_)

# average=None returns one F1 score per class.
score = f1_score(y_true=test_y, y_pred=lgb_model.predict(test_x), average=None)
print(score)
from sklearn.metrics import confusion_matrix
conf_mx = confusion_matrix(test_y, y_pred=lgb_model.predict(test_x))
plt.matshow(conf_mx, cmap=plt.cm.gray)
pd.DataFrame(conf_mx)
label2current_service
label2current_service
根据混淆矩阵可以看到,在830只有2000多个样本时,有接近一半,被错误的分类到尾数为166的套餐(以下简称166)中,而166对830分错较少
## 观察 99999830 与 89950166的差异
sns.jointplot(x='age',y='1_total_fee',data = train[train.current_service==99999830])
sns.jointplot(x='age',y='1_total_fee',data = train[train.current_service==89950166])
sns.jointplot(x='age',y='month_traffic',data = train[train.current_service==99999830])
sns.jointplot(x='age',y='month_traffic',data = train[train.current_service==89950166])
train[train.current_service==99999830]['complaint_level'].hist(bins=70)
train[train.current_service==89950166]['complaint_level'].hist(bins=70)
再对830和166两类进行EDA,可以看到无论是流量,还是话费,两套餐都呈现出高度一致

所以选择单独对这两类套餐进行二分类,即将830数据视为正样本,166视为负样本对模型进行训练,预测最终模型结果中830套餐为166套餐的概率,并对概率0.5以下进行修改

# 迁移学习
import os
import pandas as pd

import lightgbm as lgb
import numpy as np
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")

path = './data/'

w2v_path = path + 'w2v'
train = pd.read_csv(path + 'train_2.csv')
test = pd.read_csv(path + 'test_2.csv')

# Round-1 training set: the source domain for the transfer features.
train_first = pd.read_csv(path + 'train_1.csv')
# data_type: 1 = round-1 rows (transfer source), 0 = round-2 rows.
train['data_type'] = 0
test['data_type'] = 0
train_first['data_type'] = 1
data = pd.concat([train, test, train_first], ignore_index=True).fillna(0)
data['label'] = data.current_service.astype(int)
# NOTE(review): '\N' is mapped to 999 here but to 0 in the first part of
# this file — confirm the inconsistency is intentional.
data = data.replace('\\N', 999)
data['gender'] = data.gender.astype(int)
# Same raw feature lists as in the main pipeline above.
origin_cate_feature = ['service_type', 'complaint_level', 'contract_type', 'gender', 'is_mix_service',
                       'is_promise_low_consume',
                       'many_over_bill', 'net_service']

origin_num_feature = ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee',
                      'age', 'contract_time',
                      'former_complaint_fee', 'former_complaint_num',
                      'last_month_traffic', 'local_caller_time', 'local_trafffic_month', 'month_traffic',
                      'online_time', 'pay_num', 'pay_times', 'service1_caller_time', 'service2_caller_time']

# Some numeric columns arrive as object dtype; force float.
for i in origin_num_feature:
    data[i] = data[i].astype(float)
# Merge the per-value Word2Vec embeddings written out earlier.
w2v_features = []
for col in ['1_total_fee', '2_total_fee', '3_total_fee', '4_total_fee']:
    df = pd.read_csv(w2v_path + '/' + col + '.csv')
    df = df.drop_duplicates([col])  # one embedding row per fee value
    fs = list(df)
    fs.remove(col)
    w2v_features += fs
    data = pd.merge(data, df, on=col, how='left')
count_feature_list = []  # names of the count features created below


def feature_count(data, features=None):
    """Left-merge a frequency ('count_<cols>') feature for `features`.

    Groups `data` by `features`, counts rows per group and merges the
    counts back, so every row carries the frequency of its column
    combination. The new column name is appended to the module-level
    `count_feature_list`.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to enrich; a new merged frame is returned.
    features : list[str] | None
        Columns to count by. Duplicates are rejected and the input frame
        is returned unchanged. (None replaces the original mutable
        default `[]` — behaviour is identical.)

    Returns
    -------
    pd.DataFrame
    """
    features = [] if features is None else features
    if len(set(features)) != len(features):
        print('equal feature !!!!')
        return data
    new_feature = 'count'
    for i in features:
        new_feature += '_' + i.replace('add_', '')
    # Drop a stale column from an earlier call. Replaces the original
    # bare try/except around `del`, which also hid unrelated errors.
    if new_feature in data.columns:
        data = data.drop(columns=[new_feature])
    temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})

    data = data.merge(temp, 'left', on=features)
    count_feature_list.append(new_feature)
    return data


# Frequency features computed on the pooled round-1 + round-2 data.
data = feature_count(data, ['1_total_fee'])
data = feature_count(data, ['2_total_fee'])
data = feature_count(data, ['3_total_fee'])
data = feature_count(data, ['4_total_fee'])

data = feature_count(data, ['former_complaint_fee'])

data = feature_count(data, ['pay_num'])
data = feature_count(data, ['contract_time'])
data = feature_count(data, ['last_month_traffic'])
data = feature_count(data, ['online_time'])

# Two-column counts: each base column crossed with plan / contract type.
for i in ['service_type', 'contract_type']:
    data = feature_count(data, [i, '1_total_fee'])
    data = feature_count(data, [i, '2_total_fee'])
    data = feature_count(data, [i, '3_total_fee'])
    data = feature_count(data, [i, '4_total_fee'])

    data = feature_count(data, [i, 'former_complaint_fee'])

    data = feature_count(data, [i, 'pay_num'])
    data = feature_count(data, [i, 'contract_time'])
    data = feature_count(data, [i, 'last_month_traffic'])
    data = feature_count(data, [i, 'online_time'])
# 差值特征
# Names of the engineered difference / ratio features (same recipe as in
# the main pipeline above).
diff_feature_list = ['diff_total_fee_1', 'diff_total_fee_2', 'diff_total_fee_3', 'last_month_traffic_rest',
                     'rest_traffic_ratio',
                     'total_fee_mean', 'total_fee_max', 'total_fee_min', 'total_caller_time', 'service2_caller_ratio',
                     'local_caller_ratio',
                     'total_month_traffic', 'month_traffic_ratio', 'last_month_traffic_ratio', 'pay_num_1_total_fee',
                     '1_total_fee_call_fee', '1_total_fee_call2_fee', '1_total_fee_trfc_fee']

# Month-over-month changes in the bill.
data['diff_total_fee_1'] = data['1_total_fee'] - data['2_total_fee']
data['diff_total_fee_2'] = data['2_total_fee'] - data['3_total_fee']
data['diff_total_fee_3'] = data['3_total_fee'] - data['4_total_fee']

# Recharge amount minus the current bill.
data['pay_num_1_total_fee'] = data['pay_num'] - data['1_total_fee']

# Traffic used beyond last month's carried-over quota, floored at zero.
data['last_month_traffic_rest'] = data['month_traffic'] - data['last_month_traffic']
# Bug fix: .loc instead of chained indexing (data['col'][mask] = 0),
# which can write into a temporary copy and leave `data` unchanged.
data.loc[data['last_month_traffic_rest'] < 0, 'last_month_traffic_rest'] = 0
data['rest_traffic_ratio'] = (data['last_month_traffic_rest'] * 15 / 1024) / data['1_total_fee']

# Aggregates over the four monthly fees.
total_fee = [str(i) + '_total_fee' for i in range(1, 5)]
data['total_fee_mean'] = data[total_fee].mean(1)
data['total_fee_max'] = data[total_fee].max(1)
data['total_fee_min'] = data[total_fee].min(1)

# Call-time totals and shares. Division by a zero total yields inf/NaN;
# left as in the original pipeline.
data['total_caller_time'] = data['service2_caller_time'] + data['service1_caller_time']
data['service2_caller_ratio'] = data['service2_caller_time'] / data['total_caller_time']
data['local_caller_ratio'] = data['local_caller_time'] / data['total_caller_time']

data['total_month_traffic'] = data['local_trafffic_month'] + data['month_traffic']
data['month_traffic_ratio'] = data['month_traffic'] / data['total_month_traffic']
data['last_month_traffic_ratio'] = data['last_month_traffic'] / data['total_month_traffic']

# Bill minus estimated call / traffic charges (presumed tariffs — TODO
# confirm: 0.15 yuan per minute, 0.3 yuan per MB).
data['1_total_fee_call_fee'] = data['1_total_fee'] - data['service1_caller_time'] * 0.15
data['1_total_fee_call2_fee'] = data['1_total_fee'] - data['service2_caller_time'] * 0.15
data['1_total_fee_trfc_fee'] = data['1_total_fee'] - (
        data['month_traffic'] - 2 * data['last_month_traffic']) * 0.3

# Not meaningful for service_type 1; blank it out there.
data.loc[data.service_type == 1, '1_total_fee_trfc_fee'] = None

# Final feature lists for the transfer model.
cate_feature = origin_cate_feature
num_feature = origin_num_feature + count_feature_list + diff_feature_list + w2v_features
# LightGBM consumes pandas 'category' columns natively.
for i in cate_feature:
    data[i] = data[i].astype('category')
for i in num_feature:
    data[i] = data[i].astype(float)

feature = cate_feature + num_feature

print(len(feature), feature)

# Drop the outlier plan, then split: round-1 rows (data_type == 1) are
# the training source; labelled round-2 rows are held out for evaluation.
data = data[data.label != 999999]
train_x = data[(data.data_type == 1)][feature]
train_y = data[(data.data_type == 1)].label

test_x = data[(data.data_type == 0) & (data.label != 0)][feature]
test_y = data[(data.data_type == 0) & (data.label != 0)].label
# Train on round-1 data and export its class probabilities on round-2
# rows as stacking features for the main pipeline at the top of the file.
lgb_model = lgb.LGBMClassifier(
    boosting_type="gbdt", num_leaves=120, reg_alpha=0, reg_lambda=0.,
    max_depth=-1, n_estimators=2500, objective='multiclass', metric="None",
    subsample=0.9, colsample_bytree=0.5, subsample_freq=1,
    learning_rate=0.035, random_state=2018, n_jobs=10
)
lgb_model.fit(train_x, train_y, categorical_feature=cate_feature)
print(lgb_model.best_score_)

stacking_path = path + 'stack/'
# Write the stacking csvs only on the first run (when the directory does
# not exist yet). NOTE(review): a half-finished previous run leaves the
# directory behind without the csvs — delete it to regenerate.
if not os.path.exists(stacking_path):
    print(stacking_path)
    os.makedirs(stacking_path)
    # Class probabilities for labelled round-2 rows (train features) and
    # unlabelled round-2 rows (test features).
    train_proba = lgb_model.predict_proba(test_x[feature])
    test_proba = lgb_model.predict_proba(data[data.label == 0][feature])
    print(len(train_proba), len(test_proba))
    stacking_train = data[(data.data_type == 0) & (data.label != 0)][['user_id']]
    stacking_test = data[data.label == 0][['user_id']]
    for i in range(11):
        stacking_train['stacking_' + str(i)] = train_proba[:, i]
        stacking_test['stacking_' + str(i)] = test_proba[:, i]
    stacking_train.to_csv(stacking_path + 'train.csv', index=False)
    stacking_test.to_csv(stacking_path + 'test.csv', index=False)

# Per-class F1 of the transferred model on the round-2 labelled rows.
score = f1_score(y_true=test_y, y_pred=lgb_model.predict(test_x), average=None)
print(score)


 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值