# 1. Model training
# -*- coding: utf-8 -*-
import sys
from pandas import DataFrame, Series
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
import math
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report, log_loss
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
## 特征分桶
def binning(x, nbins=5, strategy='quantile', bin_stat='mean'):
assert strategy in ['uniform', 'quantile']
assert bin_stat in ['mean', 'max', 'min']
_discretizer = KBinsDiscretizer(nbins, strategy=strategy, encode='ordinal')
_result = _discretizer.fit_transform(x.reshape((-1, 1)))
# result = []
# for i in _result:
# result.append(i[0])
# print(result)
return _result, _discretizer
# Clamp values for WOE when a bucket contains only events or only
# non-events: the raw log-ratio would be +/- infinity.
_WOE_MIN = -10
_WOE_MAX = 10
# After binning, WOE-encode the data inside each bucket
def woe_single_x(x, y, event=1.0):
    """Bin feature *x*, then compute per-bucket WOE values and the feature IV.

    Parameters
    ----------
    x : 1-D array of raw feature values.
    y : 1-D array of binary labels.
    event : label value treated as the positive ("event") class.

    Returns
    -------
    (woe_dict, iv, discretizer) : mapping bucket-id -> WOE value, the
        feature's information value, and the fitted discretizer (needed to
        bucket unseen data identically at prediction time).
    """
    nbins = 10
    if np.unique(x).size <= nbins:
        # Few distinct values: discrete-like feature, use equal-width bins.
        strategy = 'uniform'
    else:
        # Continuous feature: use equal-frequency (quantile) bins.
        strategy = 'quantile'
    x, _discretizer = binning(x, nbins=nbins, strategy=strategy)
    event_total, non_event_total = count_binary(y, event=event)
    woe_dict = {}
    iv = 0
    for x1 in np.unique(x):
        y1 = y[np.where(x == x1)[0]]
        event_count, non_event_count = count_binary(y1, event=event)
        rate_event = 1. * event_count / event_total
        rate_non_event = 1. * non_event_count / non_event_total
        # Clamp WOE when one class is absent from this bucket; the raw
        # log-ratio would be infinite.
        if rate_event == 0:
            woe1 = _WOE_MIN
        elif rate_non_event == 0:
            woe1 = _WOE_MAX
        else:
            woe1 = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woe1
        # print() call for consistency with the rest of the file
        # (works identically for a single argument on Python 2 and 3).
        print(",".join([str(x1), str(rate_event), str(woe1)]))
        iv += (rate_event - rate_non_event) * woe1
    return woe_dict, iv, _discretizer
def count_binary(a, event=1.0):
    """Split the length of label array *a* into (event, non-event) counts."""
    n_event = np.count_nonzero(a == event)
    n_non_event = a.shape[-1] - n_event
    return n_event, n_non_event
def woe_encode(arr, woe_dict, discretizer=None):
    """Map raw feature values to their WOE codes.

    Parameters
    ----------
    arr : 1-D array of raw feature values.
    woe_dict : mapping bucket-id -> WOE value; unseen buckets map to -1.
    discretizer : optional fitted KBinsDiscretizer. When given, values are
        bucketed before the lookup (matching how woe_dict was built).

    Returns
    -------
    1-D numpy array of WOE-encoded values.
    """
    values = np.asarray(arr).reshape(-1, 1)
    # Original code dereferenced the default None unconditionally and
    # crashed; guard so already-bucketed input can be encoded directly.
    if discretizer is not None:
        values = discretizer.transform(values)
    lookup = np.vectorize(lambda v: woe_dict.get(v, -1))
    return lookup(values).ravel()
def load_file(file_name):
    """Load a comma-separated numeric sample file into a 2-D float array.

    Each line is one sample; any field that fails float() parsing is
    replaced by 0.0 rather than aborting the load.
    """
    features = []
    # 'with' guarantees the file handle is closed even on error.
    with open(file_name) as fh:
        for line in fh:
            fe_array = []
            for fe in line.strip().split(","):
                try:
                    fe_array.append(float(fe))
                except ValueError:
                    # Malformed field -> neutral default (was a bare except).
                    fe_array.append(0.0)
            features.append(fe_array)
    return np.array(features)
def get_feature_conf(conf_file):
    """Read feature names (one per line) from *conf_file*, in file order."""
    # 'with' + comprehension: closes the handle and replaces the manual
    # append loop.
    with open(conf_file) as fh:
        return [line.strip() for line in fh]
if __name__ == "__main__":
    # Training entry point: argv[1] is the training-sample file (CSV).
    dataset = load_file(sys.argv[1])  # training samples
    # Columns 3..48 are features, column 2 is the binary label.
    X = dataset[:, 3:49]
    Y = dataset[:, 2]
    print(X.shape)
    # Split the data into train and test partitions.
    seed = 7
    test_size = 0.33
    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=test_size, random_state=seed)
    # Date stamp used to version the persisted model artifacts.
    today=datetime.date.today()
    formatted_today=today.strftime('%Y%m%d')
    # Feature names: id list plus id -> human-readable-name map.
    feature_ids = get_feature_conf("model/feature_name.conf")
    feature_id_to_name_dic = {}
    for line in open("model/feature_id_name_map.csv"):
        ll = line.strip().split(",")
        fe_id = ll[0]
        fe_name = ll[1]
        feature_id_to_name_dic[fe_id] = fe_name
    # Compute each feature's IV value and WOE encoding.
    trainX_woe = []
    woe_dic = {}
    iv_dic = {}
    for i in range(trainX.shape[-1]):
        # Associate the column with its configured feature id.
        f_index = i
        f_id = feature_ids[f_index]
        print(",".join([str(f_index), f_id, str(feature_id_to_name_dic.get(f_id))]))
        print ",".join(["bucket_index", "cheat_rate", "woe_value"])
        f_arr = trainX[:, i]
        woe_dict, iv, _discretizer = woe_single_x(f_arr, trainY)
        # Keep the per-feature WOE table together with its discretizer so
        # prediction-time encoding can bucket identically.
        woe_dic[f_id] = [woe_dict, _discretizer]
        iv_dic[f_id] = iv
    # Feature selection: keep the top-N% of features by IV.
    select_rate = 0.8
    iv_sort = sorted(iv_dic.items(), key=lambda k: k[1], reverse=True)
    select_index = int(len(iv_sort) * select_rate)
    print(select_index)
    iv_feature_select = iv_sort[:select_index]
    selected_features = set([item[0] for item in iv_feature_select])
    print(selected_features)
    # Persist the intermediate preprocessing parameters (WOE tables,
    # discretizers, selected feature ids) for the prediction script.
    mid_result = {"woe_dic": woe_dic,
                  "select_features": selected_features}
    joblib.dump(mid_result, 'model/woe_'+formatted_today+'.pkl')
    # WOE-encode the selected training features, column by column.
    for i in range(trainX.shape[-1]):
        f_index = i
        f_id = feature_ids[f_index]
        if f_id in selected_features:
            f_arr = trainX[:, i]
            woe_dict, _discretizer = woe_dic.get(f_id)
            # print(woe_dict, iv, _discretizer)
            f_woe_code = woe_encode(f_arr, woe_dict, _discretizer)
            f_woe_code = np.nan_to_num(f_woe_code)
            trainX_woe.append(f_woe_code)
    # Same encoding for the held-out test features.
    testX_woe = []
    for i in range(testX.shape[-1]):
        f_index = i
        f_id = feature_ids[f_index]
        if f_id in selected_features:
            f_arr = testX[:, i]
            woe_dict, _discretizer = woe_dic.get(f_id)
            f_woe_code = woe_encode(f_arr, woe_dict, _discretizer)
            f_woe_code = np.nan_to_num(f_woe_code)
            testX_woe.append(f_woe_code)
    # Columns were accumulated as rows; transpose back to (samples, features).
    testX = np.array(testX_woe).transpose()
    trainX = np.array(trainX_woe).transpose()
    print(testX.shape)
    print(trainX.shape)
    # Model training: L2-regularized logistic regression, grid search over
    # C and max_iter with 3-fold CV, optimizing AUC.
    lr = LogisticRegression(penalty='l2', solver='sag')
    parameters = {
        'C': np.arange(0.02, 0.1, 0.02),
        'max_iter': range(10, 100, 20)
    }
    # clf = LogisticRegressionCV(Cs=[100, 10, 1, .1, .01, .001, .0001], scoring='roc_auc')
    # clf = xgb.XGBClassifier(max_depth=4, n_estimators=300, )
    grid = GridSearchCV(lr, parameters, cv=3, scoring='roc_auc')
    grid.fit(trainX, trainY)
    print('================================')
    print("grid.best_params_", grid.best_params_)
    print("grid.best_score_", grid.best_score_)
    clf = grid.best_estimator_
    print("grid.best_estimator_:", grid.best_estimator_)
    # Refit the best estimator on the full training partition.
    clf.fit(trainX, trainY)
    #print("LR weight coefficients:", clf.coef_.flatten(), len(clf.coef_.flatten()))
    # NOTE(review): saved with a '_zjmj' suffix -- the prediction script
    # loads 'lr_model_<version>.pkl', so the version arg there presumably
    # must include '_zjmj'; verify.
    joblib.dump(clf, 'model/lr_model_'+formatted_today+'_zjmj.pkl')
    # Collect the learned per-feature weights for reporting.
    weight_map = {}
    for i in range(trainX.shape[-1]):
        f_index = i
        f_id = feature_ids[f_index]
        if f_id in selected_features:
            # NOTE(review): trainX now only holds the selected columns, so
            # index i into clf.coef_ assumes selected features keep their
            # original relative order -- confirm.
            weight_map.setdefault(f_id, 0)
            weight_map[f_id] = clf.coef_.flatten()[i]
    # Report id, name, IV, and LR weight for every feature (IV-sorted).
    print ",".join(["fe_id", "fe_name", "fe_iv", "fe_weight"])
    for k in iv_sort:
        fe_id = k[0]
        fe_name = feature_id_to_name_dic.get(fe_id)
        fe_weight = weight_map.get(fe_id)  # LR weight of the feature
        fe_iv = k[1]  # IV value of the feature
        print ",".join([str(fe_id), str(fe_name), str(fe_iv), str(fe_weight)])
    # Model prediction: positive-class probabilities.
    yhat_train = clf.predict_proba(trainX)[:, 1]
    yhat_test = clf.predict_proba(testX)[:, 1]
    # for i in range(len(testY)):
    #     print(','.join([str(testY[i]), str(yhat_test[i])]))
    # Model evaluation: AUC and log-loss on both partitions.
    _auc_score_train = roc_auc_score(y_true=trainY, y_score=yhat_train)
    print('Auc score of train set , {}'.format(_auc_score_train))
    _auc_score_test = roc_auc_score(y_true=testY, y_score=yhat_test)
    print('Auc score of test set , {}'.format(_auc_score_test))
    _log_loss_train = log_loss(trainY, yhat_train)
    print('Logloss of Train , {}'.format(_log_loss_train))
    _log_loss_test = log_loss(testY, yhat_test)
    print('Logloss of Test , {}'.format(_log_loss_test))
    _coef = clf.coef_
    # _coef_nums = len(_coef)
    #print('coef: {}'.format(_coef))
# 2. Model loading and prediction
# -*- coding: utf-8 -*-
import sys
import numpy as np
from sklearn.externals import joblib
def woe_encode(arr, woe_dict, discretizer=None):
    """Map raw feature values to their WOE codes.

    Parameters
    ----------
    arr : 1-D array of raw feature values.
    woe_dict : mapping bucket-id -> WOE value; unseen buckets map to -1,
        matching the training-time encoder.
    discretizer : optional fitted KBinsDiscretizer. When given, values are
        bucketed before the lookup.

    Returns
    -------
    1-D numpy array of WOE-encoded values.
    """
    values = np.asarray(arr).reshape(-1, 1)
    # Original code dereferenced the default None unconditionally; guard.
    if discretizer is not None:
        values = discretizer.transform(values)
    # Default -1 (was no default): .get(x) returned None for unseen
    # buckets, which blows up inside np.vectorize's float output array.
    # -1 matches the training script's woe_encode.
    lookup = np.vectorize(lambda v: woe_dict.get(v, -1))
    return lookup(values).ravel()
def load_file(file_name):
    """Load a tab-separated sample file into a 2-D numpy array.

    The first column is kept as an opaque id string; the remaining fields
    are parsed as floats, with 0.0 substituted for unparsable values.

    NOTE(review): because the id column stays a string, np.array() upcasts
    the whole matrix to strings -- downstream numeric use of the feature
    columns relies on later coercion; verify against the consumers.
    """
    features = []
    # 'with' guarantees the file handle is closed even on error.
    with open(file_name) as fh:
        for line in fh:
            ll = line.strip().split("\t")
            fe_array = [ll[0]]
            for fe in ll[1:]:
                try:
                    fe_array.append(float(fe))
                except ValueError:
                    # Malformed field -> neutral default (was a bare except).
                    fe_array.append(0.0)
            features.append(fe_array)
    return np.array(features)
def get_feature_conf(conf_file):
    """Return the feature names listed one per line in *conf_file*."""
    return [raw.strip() for raw in open(conf_file)]
if __name__ == "__main__":
    # Scoring entry point: argv[1] is the model version string embedded in
    # the saved artifact file names, argv[2] the tab-separated sample file.
    model_version = sys.argv[1]
    datasetTest = load_file(sys.argv[2])
    # Split data into feature columns and seller ids (column 0).
    testX = datasetTest[:, 1:47]
    seller_ids = datasetTest[:, 0]
    feature_ids = get_feature_conf("model/feature_name.conf")
    # Load the intermediate preprocessing results saved at training time.
    mid_result = joblib.load('model/woe_'+model_version+'.pkl')
    woe_dic_new = mid_result.get("woe_dic")
    selected_features = mid_result.get("select_features")
    # WOE-encode the selected features, same bucketing as training.
    # NOTE(review): load_file returns an all-string array (id column kept),
    # so f_arr here holds strings -- presumably the discretizer coerces
    # them to floats; verify.
    testX_woe = []
    for i in range(testX.shape[-1]):
        f_id = feature_ids[i]
        if f_id in selected_features:
            f_arr = testX[:, i]
            woe_dict, _discretizer = woe_dic_new.get(f_id)
            f_woe_code = woe_encode(f_arr, woe_dict, _discretizer)
            f_woe_code = np.nan_to_num(f_woe_code)
            testX_woe.append(f_woe_code)
    # Columns were accumulated as rows; transpose to (samples, features).
    testX = np.array(testX_woe).transpose()
    # print(testX.shape)
    # Load the trained model.
    # NOTE(review): training saves 'lr_model_<date>_zjmj.pkl', so
    # model_version presumably must include the '_zjmj' suffix -- confirm.
    clf = joblib.load('model/lr_model_'+model_version+'.pkl')
    # Model prediction: positive-class probability per sample.
    yhat = clf.predict_proba(testX)[:, 1]
    for i in range(len(seller_ids)):
        # Skip ids containing ',' that would corrupt the CSV output.
        if("," in seller_ids[i]):
            continue
        print(','.join([str(seller_ids[i]), str(yhat[i])]))