GBDT+LR学习记录
9.8模型在测试数据集表现(上)
新建check.py文件
新建函数check_core(后来03:30改名为run_check)
check_core我们这里与lr相比只重新选取一下打分函数即可
新建函数get_test_data(同lr)
新建函数run_check_core(同lr)
下面新建定义一下打分函数predict_by_tree
新建get_auc、get_accuracy函数
写main函数
最上边引入模块from __future__ import division
此时check.py如下
# -*- coding: utf-8 -*-
"""
==================================================
File Name: check
Description :
==================================================
"""
from __future__ import division
import sys
import numpy as np
import xgboost as xgb
from scipy.sparse import csc_matrix #稀疏矩阵,方便计算
import math
#9-8和lr测试模型在数据集上的表现一致
def get_test_data(test_file, feature_num_file):
    """Load test features and labels from a comma-separated file.

    Every row is read as float32. The first ``total_feature_num`` columns are
    returned as features and the last column as the label.

    Args:
        test_file: path of the comma-separated test file.
        feature_num_file: path of the file recording the feature dimension.
            NOTE(review): currently unused -- the feature count is fixed at
            103 below; confirm against the training pipeline before relying
            on this argument.

    Returns:
        A ``(test_feature, test_label)`` tuple: a 2-D float32 array with one
        row per sample, and a 1-D float32 array with one label per sample.

    Raises:
        ValueError: if the file has fewer columns than the feature count.
    """
    total_feature_num = 103
    # Parse the file once and slice it, instead of parsing it twice with
    # different ``usecols`` selections.
    data = np.genfromtxt(test_file, dtype=np.float32, delimiter=',')
    if data.ndim < 2:
        # A single-row file comes back 1-D; keep the (rows, columns) layout so
        # callers can always index samples along the first axis.
        data = data.reshape(1, -1)
    if data.shape[1] < total_feature_num:
        raise ValueError(
            "%s has %d columns, expected at least %d feature columns"
            % (test_file, data.shape[1], total_feature_num)
        )
    test_feature = data[:, :total_feature_num]
    test_label = data[:, -1]
    return test_feature, test_label
# 定义下打分函数
def predict_by_tree(test_feature, tree_model):
    """Score the test features with a tree model.

    Args:
        test_feature: feature data accepted by ``xgb.DMatrix``.
        tree_model: model object exposing ``predict``; ``run_check`` passes
            an ``xgb.Booster``.

    Returns:
        Whatever ``tree_model.predict`` returns for the wrapped features.
    """
    # The features are wrapped in xgboost's DMatrix container before scoring.
    wrapped_feature = xgb.DMatrix(test_feature)
    return tree_model.predict(wrapped_feature)
def get_auc(predict_list, test_label):
    """Compute, print and return the AUC of the predicted scores.

    Samples are ranked by ascending score (1-based) and the AUC is

        (sum of positive ranks - pos_num * (pos_num + 1) / 2)
            / (pos_num * neg_num)

    Samples sharing the same score all receive the average rank of their
    group, so a tied positive/negative pair counts as half a correct pair.

    Args:
        predict_list: model predicted scores, one per sample.
        test_label: labels of the test data; ``0`` is treated as negative and
            any other value as positive.

    Returns:
        The AUC score as a float. When there are no positive or no negative
        samples the AUC is undefined and ``0.0`` is returned.
    """
    # Pair every score with its label, then order the pairs by score.
    pairs = sorted(
        [(predict_score, test_label[index])
         for index, predict_score in enumerate(predict_list)],
        key=lambda pair: pair[0],
    )
    total_num = len(pairs)
    pos_num = 0
    total_pos_rank = 0.0
    start = 0
    while start < total_num:
        # [start, end) is a run of samples sharing one score. Starting at
        # start + 1 guarantees progress even for scores that never compare
        # equal to themselves (NaN).
        end = start + 1
        while end < total_num and pairs[end][0] == pairs[start][0]:
            end += 1
        # 1-based ranks of this run are start + 1 .. end; use their mean.
        average_rank = (start + 1 + end) / 2.0
        tied_pos_num = sum(1 for _, label in pairs[start:end] if label != 0)
        pos_num += tied_pos_num
        total_pos_rank += average_rank * tied_pos_num
        start = end
    neg_num = total_num - pos_num
    pair_num = pos_num * neg_num
    if pair_num == 0:
        # No positive/negative pair exists, so there is nothing to rank.
        auc_score = 0.0
    else:
        auc_score = (total_pos_rank - pos_num * (pos_num + 1) / 2.0) / pair_num
    print("auc: %5f " % (auc_score))
    return auc_score
def get_accuracy(predict_list, test_label):
    """Compute, print and return the accuracy at a 0.5 score threshold.

    A sample is predicted as ``1`` when its score is at least 0.5 and as
    ``0`` otherwise; the prediction is correct when it equals the label.

    Args:
        predict_list: model predicted scores, one per sample.
        test_label: labels of the test data, indexed like ``predict_list``.

    Returns:
        The fraction of correctly predicted samples as a float, or ``0.0``
        when ``predict_list`` is empty.
    """
    score_thr = 0.5
    total_num = len(predict_list)
    if total_num == 0:
        # Avoid dividing by zero when there is nothing to evaluate.
        accuracy_score = 0.0
    else:
        right_num = 0
        for index, predict_score in enumerate(predict_list):
            predict_label = 1 if predict_score >= score_thr else 0
            if predict_label == test_label[index]:
                right_num += 1
        accuracy_score = right_num / float(total_num)
    print("accuracy: %5f " % (accuracy_score))
    return accuracy_score
#同lr
def run_check_core(test_feature, test_label, model, score_func):
    """Score the test set and report its AUC and accuracy.

    Args:
        test_feature: features of the test data.
        test_label: labels of the test data.
        model: model object handed to ``score_func`` unchanged.
        score_func: callable invoked as ``score_func(test_feature, model)``
            that returns the predicted scores.
    """
    scores = score_func(test_feature, model)
    # Both metrics take the same (scores, labels) arguments; AUC runs first.
    for report_metric in (get_auc, get_accuracy):
        report_metric(scores, test_label)
#9-8
# 3个参数——测试文件,gbdt模型,记录所有特征维度的文件
def run_check(test_file, tree_model_file, feature_num_file):
    """Evaluate a saved GBDT model on a test file.

    Args:
        test_file: path of the test data file.
        tree_model_file: path of the saved xgboost model.
        feature_num_file: path of the file recording the feature dimension.
    """
    # Load the test features and labels first, then the saved model.
    features, labels = get_test_data(test_file, feature_num_file)
    booster = xgb.Booster(model_file=tree_model_file)
    # Same flow as the LR check; only the scoring function differs.
    run_check_core(
        test_feature=features,
        test_label=labels,
        model=booster,
        score_func=predict_by_tree,
    )
if __name__ == "__main__":
    # The course video passes "data/xgb.model" as the model path; this run
    # uses "data/gbdt.model" instead.
    run_check(
        test_file="data/gbdt_test_file",
        tree_model_file="data/gbdt.model",
        feature_num_file="data/gbdt_feature_num",
    )
但是运行报错
D:\develop\Anaconda3\python.exe "D:/develop/PyCharm 2018.3.5_workspace/p_recommendation_mys