Demo: training and prediction code for a logistic regression (LR) model

1. Model training

# -*- coding: utf-8 -*-
import sys
from pandas import DataFrame, Series
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np
import math
import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, classification_report, log_loss
import joblib  # sklearn.externals.joblib was removed in modern scikit-learn
from sklearn.model_selection import train_test_split

## Feature binning
def binning(x, nbins=5, strategy='quantile', bin_stat='mean'):
    assert strategy in ['uniform', 'quantile']
    assert bin_stat in ['mean', 'max', 'min']

    _discretizer = KBinsDiscretizer(n_bins=nbins, encode='ordinal', strategy=strategy)

    _result = _discretizer.fit_transform(x.reshape((-1, 1)))

    return _result, _discretizer


# Clip WOE for buckets that contain only events or only non-events
_WOE_MIN = -10
_WOE_MAX = 10

# WOE-encode the bucketed values: for bucket i,
#   WOE_i = ln(P(bucket_i | event) / P(bucket_i | non-event)),
# and the feature's IV is sum_i (P(bucket_i | event) - P(bucket_i | non-event)) * WOE_i
def woe_single_x(x, y, event=1.0):
    nbins = 10
    if np.unique(x).size <= nbins:
        # Discrete feature: use equal-width binning
        strategy = 'uniform'
    else:
        # Continuous feature: use equal-frequency (quantile) binning
        strategy = 'quantile'

    x, _discretizer = binning(x, nbins=nbins, strategy=strategy)

    event_total, non_event_total = count_binary(y, event=event)
    x_labels = np.array(np.unique(x))
    woe_dict = {}
    iv = 0
    for x1 in x_labels:
        y1 = y[np.where(x == x1)[0]]
        event_count, non_event_count = count_binary(y1, event=event)
        rate_event = 1. * event_count / event_total
        rate_non_event = 1. * non_event_count / non_event_total
        if rate_event == 0:
            woe1 = _WOE_MIN
        elif rate_non_event == 0:
            woe1 = _WOE_MAX
        else:
            woe1 = math.log(rate_event / rate_non_event)
        woe_dict[x1] = woe1
        print ",".join([str(x1), str(rate_event), str(woe1)])
        iv += (rate_event - rate_non_event) * woe1
    return woe_dict, iv, _discretizer


def count_binary(a, event=1.0):
    event_count = (a == event).sum()
    non_event_count = a.shape[-1] - event_count
    return event_count, non_event_count


def woe_encode(arr, woe_dict, discretizer=None):
    # Map each binned value to its WOE; unseen buckets fall back to -1
    _vec = np.vectorize(lambda x: woe_dict.get(x, -1))
    arr = arr.reshape((-1, 1))
    if discretizer is not None:
        arr = discretizer.transform(arr)
    return _vec(arr).reshape((1, -1))[0]

def load_file(file_name):
    features = []
    for line in open(file_name):
        ll = line.strip().split(",")
        fe_array = []
        for fe in ll:
            try:
                fe_value = float(fe)
            except ValueError:
                fe_value = 0.0
            fe_array.append(fe_value)
        features.append(fe_array)
    featuresMat = np.array(features)

    return featuresMat
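# Illustrative input row (the values and column layout are made up for
# illustration): comma-separated, with id/meta columns first, the label in
# column 2, and features in columns 3..48, matching the slicing in __main__:
#   1001,2002,1.0,0.13,5.0,...,0.0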


def get_feature_conf(conf_file):
    feature_names = []
    for line in open(conf_file):
        fe_name = line.strip()
        feature_names.append(fe_name)

    return feature_names


if __name__ == "__main__":
    dataset = load_file(sys.argv[1])  # training samples
    X = dataset[:, 3:49]
    Y = dataset[:, 2]
    print(X.shape)

    # Split the dataset into training and test sets
    seed = 7
    test_size = 0.33
    trainX, testX, trainY, testY = train_test_split(X, Y, test_size=test_size, random_state=seed)
    today=datetime.date.today()
    formatted_today=today.strftime('%Y%m%d')


    # Feature names
    feature_ids = get_feature_conf("model/feature_name.conf")
    feature_id_to_name_dic = {}
    for line in open("model/feature_id_name_map.csv"):
        ll = line.strip().split(",")
        fe_id = ll[0]
        fe_name = ll[1]
        feature_id_to_name_dic[fe_id] = fe_name

    # Compute each feature's IV and WOE encoding
    trainX_woe = []
    woe_dic = {}
    iv_dic = {}
    for i in range(trainX.shape[-1]):
        # Look up the feature id for this column
        f_index = i
        f_id = feature_ids[f_index]

        print(",".join([str(f_index), f_id, str(feature_id_to_name_dic.get(f_id))]))
        print ",".join(["bucket_index", "cheat_rate", "woe_value"])
        f_arr = trainX[:, i]
        woe_dict, iv, _discretizer = woe_single_x(f_arr, trainY)
        woe_dic[f_id] = [woe_dict, _discretizer]
        iv_dic[f_id] = iv

    # Feature selection: keep the top N% of features by IV
    select_rate = 0.8
    iv_sort = sorted(iv_dic.items(), key=lambda k: k[1], reverse=True)

    select_index = int(len(iv_sort) * select_rate)
    print(select_index)
    iv_feature_select = iv_sort[:select_index]
    selected_features = set([item[0] for item in iv_feature_select])
    print(selected_features)

    # Persist intermediate preprocessing results (WOE maps and selected features)
    mid_result = {"woe_dic": woe_dic,
                  "select_features": selected_features}
    joblib.dump(mid_result, 'model/woe_'+formatted_today+'.pkl')

    # WOE-encode the selected training features
    for i in range(trainX.shape[-1]):
        f_index = i
        f_id = feature_ids[f_index]
        if f_id in selected_features:
            f_arr = trainX[:, i]
            woe_dict, _discretizer = woe_dic.get(f_id)
            # print(woe_dict, iv, _discretizer)
            f_woe_code = woe_encode(f_arr, woe_dict, _discretizer)
            f_woe_code = np.nan_to_num(f_woe_code)
            trainX_woe.append(f_woe_code)

    # WOE-encode the test features with the parameters fit on the training set
    testX_woe = []
    for i in range(testX.shape[-1]):
        f_index = i
        f_id = feature_ids[f_index]
        if f_id in selected_features:
            f_arr = testX[:, i]
            woe_dict, _discretizer = woe_dic.get(f_id)
            f_woe_code = woe_encode(f_arr, woe_dict, _discretizer)
            f_woe_code = np.nan_to_num(f_woe_code)
            testX_woe.append(f_woe_code)

    testX = np.array(testX_woe).transpose()
    trainX = np.array(trainX_woe).transpose()
    print(testX.shape)
    print(trainX.shape)

    # Model training
    lr = LogisticRegression(penalty='l2', solver='sag')
    parameters = {
        'C': np.arange(0.02, 0.1, 0.02),
        'max_iter': range(10, 100, 20)
    }
    # clf = LogisticRegressionCV(Cs=[100, 10, 1, .1, .01, .001, .0001], scoring='roc_auc')
    # clf = xgb.XGBClassifier(max_depth=4, n_estimators=300, )
    grid = GridSearchCV(lr, parameters, cv=3, scoring='roc_auc')
    grid.fit(trainX, trainY)
    print('================================')
    print("grid.best_params_", grid.best_params_)
    print("grid.best_score_", grid.best_score_)

    clf = grid.best_estimator_  # already refit on the full training set (refit=True)
    print("grid.best_estimator_:", grid.best_estimator_)
    # print("LR weights:", clf.coef_.flatten(), len(clf.coef_.flatten()))
    joblib.dump(clf, 'model/lr_model_'+formatted_today+'_zjmj.pkl')

    # Map LR coefficients back to feature ids; the coefficients follow the
    # order in which the selected features were encoded above (the original
    # loop indexed feature_ids by the coefficient position, which is wrong
    # once unselected columns are dropped)
    weight_map = {}
    coefs = clf.coef_.flatten()
    selected_in_order = [f_id for f_id in feature_ids if f_id in selected_features]
    for i, f_id in enumerate(selected_in_order):
        weight_map[f_id] = coefs[i]
    print ",".join(["fe_id", "fe_name", "fe_iv", "fe_weight"])
    for k in iv_sort:
        fe_id = k[0]
        fe_name = feature_id_to_name_dic.get(fe_id)
        fe_weight = weight_map.get(fe_id)  # LR weight (None if the feature was not selected)
        fe_iv = k[1]  # feature IV
        print(",".join([str(fe_id), str(fe_name), str(fe_iv), str(fe_weight)]))

    # Model prediction
    yhat_train = clf.predict_proba(trainX)[:, 1]
    yhat_test = clf.predict_proba(testX)[:, 1]
    # for i in range(len(testY)):
    #     print(','.join([str(testY[i]), str(yhat_test[i])]))
    # Model evaluation
    _auc_score_train = roc_auc_score(y_true=trainY, y_score=yhat_train)
    print('Train AUC: {}'.format(_auc_score_train))
    _auc_score_test = roc_auc_score(y_true=testY, y_score=yhat_test)
    print('Test AUC: {}'.format(_auc_score_test))
    _log_loss_train = log_loss(trainY, yhat_train)
    print('Train logloss: {}'.format(_log_loss_train))
    _log_loss_test = log_loss(testY, yhat_test)
    print('Test logloss: {}'.format(_log_loss_test))
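As a quick sanity check before running the full pipeline, the WOE helpers above can be exercised on synthetic data. This is a minimal sketch, assuming the functions in this script are importable; the distribution, sample size, and seed below are made up for illustration:

# Minimal sketch: exercise the binning/WOE helpers on synthetic data
import numpy as np

rng = np.random.RandomState(42)
x_demo = rng.normal(size=1000)                  # one continuous feature
y_demo = (rng.rand(1000) < 0.3).astype(float)   # binary labels, ~30% positives

woe_map, iv_value, disc = woe_single_x(x_demo, y_demo)   # bins, encodes, computes IV
encoded = woe_encode(x_demo, woe_map, disc)              # per-sample WOE values
print("IV:", iv_value)

Because the synthetic labels are independent of the feature, the IV should come out near zero; on real data, discriminative features score higher and are the ones the top-N% selection keeps. The training script itself is run as, e.g., python lr_train.py train_samples.csv (both names are placeholders) and expects model/feature_name.conf and model/feature_id_name_map.csv to exist.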

2. Model loading and prediction

# -*- coding: utf-8 -*-
import sys
import numpy as np
import joblib  # sklearn.externals.joblib was removed in modern scikit-learn

def woe_encode(arr, woe_dict, discretizer=None):
    # Map each binned value to its WOE; unseen buckets fall back to -1
    _vec = np.vectorize(lambda x: woe_dict.get(x, -1))
    arr = arr.reshape((-1, 1))
    if discretizer is not None:
        arr = discretizer.transform(arr)
    return _vec(arr).reshape((1, -1))[0]

def load_file(file_name):
    features = []
    for line in open(file_name):
        ll = line.strip().split("\t")
        fe_array = [ll[0]]
        for fe in ll[1:]:
            try:
                fe_value = float(fe)
            except ValueError:
                fe_value = 0.0
            fe_array.append(fe_value)
        features.append(fe_array)
    featuresMat = np.array(features)

    return featuresMat
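# Illustrative input row (values made up): tab-separated, seller_id first,
# followed by the raw feature values:
#   seller_1001<TAB>0.13<TAB>5.0<TAB>...<TAB>0.0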


def get_feature_conf(conf_file):
    feature_names = []
    for line in open(conf_file):
        fe_name = line.strip()
        feature_names.append(fe_name)

    return feature_names


if __name__ == "__main__":
    model_version = sys.argv[1]

    datasetTest = load_file(sys.argv[2])
    # split the data into ids and X; the string id column makes the whole
    # matrix a string array, so cast the feature columns back to float
    testX = datasetTest[:, 1:47].astype(float)
    seller_ids = datasetTest[:, 0]

    feature_ids = get_feature_conf("model/feature_name.conf")
    # Load intermediate preprocessing results
    mid_result = joblib.load('model/woe_'+model_version+'.pkl')
    woe_dic_new = mid_result.get("woe_dic")
    selected_features = mid_result.get("select_features")
    # WOE-encode the features with the training-time parameters
    testX_woe = []
    for i in range(testX.shape[-1]):
        f_id = feature_ids[i]
        if f_id in selected_features:
            f_arr = testX[:, i]
            woe_dict, _discretizer = woe_dic_new.get(f_id)
            f_woe_code = woe_encode(f_arr, woe_dict, _discretizer)
            f_woe_code = np.nan_to_num(f_woe_code)
            testX_woe.append(f_woe_code)

    testX = np.array(testX_woe).transpose()
    # print(testX.shape)

    # Load the model (the file name must match what the training script dumped)
    clf = joblib.load('model/lr_model_'+model_version+'_zjmj.pkl')

    # Model prediction
    yhat = clf.predict_proba(testX)[:, 1]
    for i in range(len(seller_ids)):
        if "," in seller_ids[i]:
            # skip malformed ids that would corrupt the CSV output
            continue
        print(','.join([str(seller_ids[i]), str(yhat[i])]))
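A minimal invocation sketch for scoring (script and file names are placeholders): the first argument must match the date suffix the training script used when dumping model/woe_<date>.pkl and model/lr_model_<date>_zjmj.pkl:

python lr_predict.py 20240101 test_samples.tsv > predictions.csv

Each output line is seller_id,probability, where the probability is the LR score for the positive class.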






