# -*- coding: utf-8 -*-
import json
import time
import warnings
from collections import OrderedDict
import pandas as pd
import toad
from matplotlib import pyplot as plt
from toad.plot import bin_plot
from unidecode import unidecode
from common.expression import get_condition_string
from ng.data.LoanTraining import LoanTraining
from ng.data.ParamNeed import ParamNeed
from ng.v0.ParamCommon import extract_json_fields
class FeaturesChi:
    def __init__(self, config):
        self.loanObject = LoanTraining()
        self.paramNeed = ParamNeed()
        self.paramterConfig = config
        self.param_name = config.paramter_name
        self.variable = OrderedDict({
            config.paramter_category: config.paramter_category
        })
        self.file_name = f"G:\\data\\common\\parameters\\{self.param_name}.csv"
        self.bins = config.bins
        self.min_samples = config.min_samples
    def get_variable(self, var):
        # Load the variable definitions and the raw data for one category.
        d = self.loanObject.get_define(var)
        df_ne = self.loanObject.get_data(var)
        return d, df_ne
    def process(self):
        loan = self.loanObject.get_loan()
        df_needs = self.paramNeed.get_need()
        df_needs = df_needs[df_needs['param_key'] == self.param_name]
        df_bining = self.paramNeed.get_bining()
        for category_key, category_value in self.variable.items():
            df_nds = df_needs[df_needs['category_name'] == category_value]
            needs_arrays = df_nds['param_key'].values
            print('needs_arrays', needs_arrays)
            print(f"Training: start reading {category_value}")
            start_time = time.time()
            define, df_data = self.get_variable(category_value)
            execution_time = time.time() - start_time
            start_time = time.time()
            print(f"Training: finished reading {category_value} in {execution_time:.2f}s")
            # Extract every required field from the raw JSON column into its own column.
            for index, row in define.iterrows():
                var_key = row['var_key']
                var_name = row['var_name']
                var_pytype = row['var_pytype']
                var_pydefault_value = row['pydefault_value']
                if var_key in needs_arrays:
                    df_data[var_key] = df_data['var_data'].apply(lambda x: extract_json_fields(x, var_key))
                    if var_pytype in ('str', 'object'):
                        # Fill missing values first: unidecode raises on NaN.
                        df_data[var_key] = df_data[var_key].fillna('U')
                        df_data[var_key] = df_data[var_key].apply(unidecode)
                        df_data[var_key] = df_data[var_key].astype(var_pytype)
                        df_data[var_key] = df_data[var_key].str.strip()
                        df_data[var_key] = df_data[var_key].str.lower()
                        if var_key == "apply_city":
                            df_data[var_key] = df_data[var_key].str.replace("'", "")
                    else:
                        df_data[var_key] = df_data[var_key].replace('U', var_pydefault_value)
                        df_data[var_key] = df_data[var_key].replace('unknown', var_pydefault_value)
                        df_data[var_key] = df_data[var_key].fillna(var_pydefault_value)
                        if var_key in ('available_memory', 'residual_memory'):
                            # Strip the "GB"/"MB" unit suffixes and normalize decimal separators.
                            df_data[var_key] = df_data[var_key].str.replace('GB', '')
                            df_data[var_key] = df_data[var_key].str.replace('MB', '')
                            df_data[var_key] = df_data[var_key].str.replace(',', '.')
                            df_data[var_key] = df_data[var_key].str.strip()
                            df_data[var_key] = df_data[var_key].astype(var_pytype)
                            # Heuristic rescaling to GB: values above 256 get two
                            # divisions by 1024, values above 10000 get three.
                            df_data.loc[df_data[var_key] > 256, var_key] = \
                                pd.to_numeric(df_data.loc[df_data[var_key] > 256, var_key]) / 1024 / 1024
                            df_data.loc[df_data[var_key] > 10000, var_key] = \
                                pd.to_numeric(df_data.loc[df_data[var_key] > 10000, var_key]) / 1024 / 1024 / 1024
                            df_data[var_key] = df_data[var_key].round(2)
                            df_data[var_key] = df_data[var_key].astype(var_pytype)
                    # Apply the configured binning rules for this variable.
                    df_bining_script = df_bining[(df_bining['var_key'] == var_key) &
                                                 (df_bining['default_range'] == 'N')]
                    print(var_key)
                    var_key_plus = var_key + "_plus"
                    df_data['is_matched'] = False
                    if len(df_bining_script) > 0:
                        for i, r in df_bining_script.iterrows():
                            local_params = {var_key: var_key}
                            condition_string = get_condition_string(r['range_script'],
                                                                    local_params)
                            # Evaluate the configured range expression against the
                            # DataFrame; each row joins the first range it matches.
                            mask_reg = eval(condition_string, {'df': df_data})
                            mask = (mask_reg & ~df_data['is_matched'])
                            df_data.loc[mask, var_key_plus] = r['range_name']
                            df_data.loc[mask, 'is_matched'] = True
                        # Rows that matched no rule fall back to the default range.
                        if var_pytype in ('str', 'object'):
                            df_bining_script_u = df_bining[(df_bining['var_key'] == var_key) &
                                                           (df_bining['default_range'] == 'Y')]
                            if len(df_bining_script_u) > 0:
                                for i, r in df_bining_script_u.iterrows():
                                    mask = (~df_data['is_matched'])
                                    df_data.loc[mask, var_key_plus] = r['range_name']
                                    df_data.loc[mask, 'is_matched'] = True
                        df_data[var_key_plus] = df_data[var_key_plus].astype(var_pytype)
                        df_data = df_data.drop(var_key, axis=1)
                        df_data = df_data.rename(columns={var_key_plus: var_key})
            # Drop the raw var_data column, then merge the extracted
            # fields into the loan DataFrame on business_id.
            df_data.drop(columns=['var_data'], inplace=True)
            loan = loan.merge(df_data, on='business_id', how='inner')
            execution_time = time.time() - start_time
            print(f"Training: prepared {category_value} in "
                  f"{execution_time:.2f}s, rows: {loan.shape[0]}")
            print(loan.dtypes)
        loan.to_csv(self.file_name, index=False)
        loan['target'] = loan['target'].astype('int64')
        ex_lst = ['business_id', 'expire_date', 'is_matched']
        loan = loan.drop(ex_lst, axis=1)
        quality_result = toad.quality(loan, 'target')
        print(quality_result)
        ex_lis = []
        combiner = toad.transform.Combiner()
        combiner.fit(loan, loan['target'], method='chi', min_samples=self.min_samples,
                     n_bins=self.bins,
                     exclude=ex_lis, empty_separate=True)
        print("Plotting the chi-square binning results")
        feature_array = loan.columns
        for feature in feature_array:
            if feature != 'target':
                print(feature, combiner.export()[feature])
                bin_plot(combiner.transform(loan[[feature, 'target']], labels=True),
                         x=feature, target='target')
                # Maximize the plot window (TkAgg backend).
                plt.get_current_fig_manager().window.state('zoomed')
                plt.gcf().canvas.manager.set_window_title('Training')
                # Show each figure before moving on to the next feature.
                plt.show()
Chi-square binning with the toad library:

combiner = toad.transform.Combiner()
combiner.fit(loan, loan['target'], method='chi', min_samples=self.min_samples, n_bins=self.bins, exclude=ex_lis, empty_separate=True)

loan: the input feature matrix (the training dataset).
loan['target']: the target variable (labels).
method='chi': the binning method; here the chi-square test drives the merging of adjacent bins.
min_samples=self.min_samples: the minimum number of samples each bin must contain.
n_bins=self.bins: the number of bins used to discretize continuous features into ordered buckets.
exclude=ex_lis: a list of features to leave out of the binning step.
empty_separate=True: whether missing values are kept in a separate bin of their own.
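To make the call concrete, here is a minimal, self-contained sketch on invented toy data (the age values, labels, and the split points shown in the comments are made up for illustration):

import pandas as pd
import toad

# Toy data: one numeric feature and a binary target, invented for illustration.
df = pd.DataFrame({
    'age':    [22, 25, 31, 38, 41, 45, 52, 58, 63, 70],
    'target': [0, 0, 0, 1, 0, 1, 1, 1, 1, 1],
})

combiner = toad.transform.Combiner()
# y can be given as the target column name; a min_samples value >= 1 is read as a count.
combiner.fit(df, 'target', method='chi', min_samples=2, n_bins=3, empty_separate=True)

# export() returns the learned split points per feature, e.g. {'age': [38, 52]}.
print(combiner.export())
# transform(labels=True) maps each row into its labeled bin.
print(combiner.transform(df, labels=True))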
The method parameter accepts other values besides chi; I use chi (chi-square) here:

method='iv': uses Information Value (IV) as the criterion. IV measures how strongly a feature is associated with the target variable.
method='gini': uses the Gini coefficient, a measure of a feature's purity or impurity.
method='entropy': uses entropy, a measure of a feature's uncertainty and information content.
method='pearson': uses the Pearson correlation coefficient, which measures the linear correlation between a feature and the target.
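Whichever criterion you bin with, toad.quality (already called in the training code above) reports several of these metrics for every feature at once, which is a quick way to compare candidate variables; a small sketch on the same invented toy data:

import pandas as pd
import toad

df = pd.DataFrame({
    'age':    [22, 25, 31, 38, 41, 45, 52, 58, 63, 70],
    'city':   ['a', 'a', 'b', 'b', 'a', 'a', 'b', 'b', 'a', 'b'],
    'target': [0, 0, 0, 1, 0, 1, 1, 1, 1, 1],
})

# One row per feature with iv, gini, entropy and unique-value counts,
# sorted by IV in descending order.
print(toad.quality(df, 'target'))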
This article manages the variable metadata in a database table, which makes parameter configuration convenient:
CREATE TABLE `risk_variable_params` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT 'sequence number',
  `apanage` varchar(20) DEFAULT NULL COMMENT 'region',
  `model_type` varchar(100) DEFAULT NULL COMMENT 'model type',
  `model_sub_type` varchar(100) DEFAULT NULL COMMENT 'model sub-type',
  `param_key` varchar(50) DEFAULT NULL COMMENT 'parameter key',
  `param_name` varchar(200) DEFAULT NULL COMMENT 'parameter description',
  `param_type` varchar(50) DEFAULT NULL COMMENT 'Python type used for conversion',
  `param_sort` int(255) DEFAULT NULL COMMENT 'keeps training and prediction parameter order consistent',
  `iv` int(1) DEFAULT NULL COMMENT 'IV pass flag',
  `corr` int(1) DEFAULT NULL COMMENT 'correlation pass flag',
  `empty` int(1) DEFAULT NULL COMMENT 'empty flag',
  `categorical` int(1) DEFAULT NULL COMMENT 'categorical data kept for training',
  `original_categorical` int(1) DEFAULT NULL COMMENT 'original categorical data',
  `param_default_value` varchar(30) DEFAULT NULL,
  `param_estimate` varchar(30) DEFAULT NULL COMMENT 'est',
  `status` varchar(2) DEFAULT NULL COMMENT 'status',
  `source_dataset` varchar(100) DEFAULT NULL COMMENT 'source dataset',
  `source_key` varchar(100) DEFAULT NULL COMMENT 'source dataset key',
  `category_name` varchar(255) DEFAULT NULL COMMENT 'rule dataset',
  `create_time` datetime DEFAULT NULL,
  `update_time` datetime DEFAULT NULL,
  PRIMARY KEY (`id`),
  KEY `idx_model_type` (`model_type`),
  KEY `idx_sub_type` (`model_sub_type`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8mb4 ROW_FORMAT=DYNAMIC
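For completeness, a sketch of how a helper like ParamNeed.get_need() might read this table into pandas; the connection string, database name, and the status filter value are illustrative assumptions, not the article's actual implementation:

import pandas as pd
from sqlalchemy import create_engine

# Hypothetical DSN: point this at the MySQL instance hosting the table.
engine = create_engine('mysql+pymysql://user:password@localhost:3306/risk')

# Load parameter definitions in training order; filtering on status = '1'
# (assumed to mean "enabled") is a guess, not confirmed by the article.
sql = """
    SELECT param_key, param_name, param_type, param_sort, category_name
    FROM risk_variable_params
    WHERE status = '1'
    ORDER BY param_sort
"""
df_params = pd.read_sql(sql, engine)
print(df_params.head())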