1、binning_woe
- binning
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

def get_interval(df, label, split_func, bins_num=None, self_thres=None):
    """
    df : the DataFrame to process
    label : the column name of the label
    split_func : the method used to compute the threshold list ('chi' or 'tree')
    bins_num : the number of intervals to produce
    self_thres : if split_func is not in ['chi', 'tree'], you must supply
                 your own threshold dict here
    """
    df = df.fillna(0)
    cols = list(filter(lambda item: item != label, df.columns))
    y = df[label]
    if split_func == 'chi':
        threshold_list = [chi_merge(df, item, y, label, bins_num=bins_num) for item in cols]
        return dict(zip(cols, threshold_list))
    elif split_func == 'tree':
        threshold_list = [dtree_threshold(df[item], y, bins_num=bins_num) for item in cols]
        return dict(zip(cols, threshold_list))
    else:
        if isinstance(self_thres, dict):
            return self_thres
        else:
            raise ValueError("you must supply your own threshold dict via self_thres")
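get_interval calls chi_merge for the 'chi' strategy, but that helper is not defined in this section. A minimal ChiMerge sketch matching the call signature used above (the pairwise merge loop is an assumption, not necessarily the author's implementation):

def chi_merge(df, attr, y, label, bins_num=None):
    # hypothetical ChiMerge: start from one interval per unique value and
    # repeatedly merge the adjacent pair with the lowest chi-square statistic
    # until only bins_num intervals remain
    freq = pd.crosstab(df[attr], y).sort_index()
    values = freq.index.to_numpy(dtype=float)
    counts = freq.to_numpy(dtype=float)
    # one group per unique value: [lo, hi, class_counts]
    groups = [[values[i], values[i], counts[i]] for i in range(len(values))]
    while len(groups) > (bins_num or 5):
        chi2 = []
        for i in range(len(groups) - 1):
            obs = np.vstack([groups[i][2], groups[i + 1][2]])
            total = obs.sum()
            expected = np.outer(obs.sum(axis=1), obs.sum(axis=0)) / total
            with np.errstate(divide='ignore', invalid='ignore'):
                stat = np.sum(np.where(expected > 0,
                                       (obs - expected) ** 2 / expected, 0))
            chi2.append(stat)
        k = int(np.argmin(chi2))  # merge the most similar adjacent pair
        groups[k] = [groups[k][0], groups[k + 1][1],
                     groups[k][2] + groups[k + 1][2]]
        del groups[k + 1]
    # return [lo, hi] pairs, matching dtree_threshold's output format;
    # the boundaries here are the observed group min/max values
    return [[g[0], g[1]] for g in groups]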
def dtree_threshold(X, y, bins_num=None):
    # fit a shallow decision tree and use its split points as bin edges
    clf = DecisionTreeClassifier(max_leaf_nodes=bins_num)
    X = np.array(X).reshape(-1, 1)
    clf.fit(X, y)
    # thresholds of the internal nodes that split on the (only) feature
    interval = list(clf.tree_.threshold[clf.tree_.feature == 0])
    interval.append(X.min())
    interval.append(X.max())
    interval = sorted(interval)
    # turn the sorted edges into consecutive [lo, hi) pairs
    intervals = [[interval[i], interval[i + 1]] for i in range(len(interval) - 1)]
    new_intervals = check_length_interval(X, intervals)
    return new_intervals
def check_length_interval(X, intervals):
    # merge any bin holding fewer than 8% of the samples into its neighbour
    threshold_num = X.shape[0] * 0.08
    new_intervals = []
    big_set = set([X.min()])
    for index in range(len(intervals)):
        count_interval = len(np.where(np.logical_and(X >= intervals[index][0],
                                                     X < intervals[index][1]))[0])
        if count_interval < threshold_num:  # merge the sparse interval
            if index == len(intervals) - 1:
                t = intervals[index - 1] + intervals[index]
            else:
                t = intervals[index] + intervals[index + 1]
            append_item = [min(t), max(t)]
        else:
            append_item = intervals[index]
        # big_set tracks the right edge covered so far; only keep intervals
        # that start at or beyond it, so merged bins are not duplicated
        if min(append_item) >= max(big_set):
            big_set.add(max(append_item))
            new_intervals.append(append_item)
    return new_intervals
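With both helpers defined, a quick sanity check on a single iris feature, plus a made-up sparse-bin example showing the merge rule:

from sklearn.datasets import load_iris
iris = load_iris()
# bin sepal length into at most 5 tree-leaf intervals, then merge sparse ones;
# the result is consecutive [lo, hi] pairs spanning the feature's min and max
print(dtree_threshold(iris.data[:, 0], iris.target, bins_num=5))
# the middle bin below holds only 3 of 100 samples (< 8%), so it is folded
# into its right-hand neighbour
X_demo = np.array([1.0] * 50 + [2.0] * 3 + [3.0] * 47).reshape(-1, 1)
print(check_length_interval(X_demo, [[1.0, 2.0], [2.0, 2.5], [2.5, 3.0]]))
# -> [[1.0, 2.0], [2.0, 3.0]]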
from sklearn.base import BaseEstimator, TransformerMixin
class NumtoCategorical(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    bins_num : int, the number of bins
    self_thres : dict, an optional user-supplied split dict,
        e.g. {'col1': [[0, 2], [2, 5]]}
    num_cols : list, the numeric column names to bin
    Attributes
    ----------
    threshold_list : dict of intervals, e.g. {'col1': [[0, 2], [2, 5]]}
    Examples
    --------
    from sklearn.datasets import load_iris
    import pandas as pd
    iris = load_iris()
    df = pd.concat([pd.DataFrame(iris.data), pd.DataFrame(iris.target)],
                   ignore_index=True, axis=1)
    df.columns = iris.feature_names + ['target']
    # split the data
    Sp = NumtoCategorical(num_cols=iris.feature_names, bins_num=5)
    clf = Sp.fit(df, 'target', split_func='tree')
    dff = clf.transform()
    dff = pd.concat([dff, df], axis=1)
    """
    def __init__(self, bins_num=15, self_thres=None, num_cols=None):
        self.bins_num = bins_num
        self.self_thres = self_thres
        self.num_cols = num_cols

    def fit(self, df_all, label, split_func):
        """
        df_all : data containing only numeric columns plus the label column;
                 it must not contain categorical columns
        label : the label column name
        split_func : the split method, one of ['tree', 'chi']
        """
        if label is None:
            raise ValueError("you must pass the label column name")
        cols = self.num_cols + [label]
        # split the numeric columns into intervals
        self.threshold_list = get_interval(df_all[cols], label, split_func,
                                           bins_num=self.bins_num,
                                           self_thres=self.self_thres)
        self.df = df_all
        return self
    def transform(self, X=None, cat_style=True):
        """Transform X by mapping each numeric value to its bin.
        Parameters
        ----------
        X : dataframe; if omitted, the data passed to fit is used.
            It must not contain the label column.
        cat_style : bool; if True, bins are named by their edges ('lo_hi'),
            otherwise by the column name plus a bin index.
        Returns
        -------
        df : dataframe with the numeric columns replaced by bin labels
        """
        threshold_list = self.self_thres if self.self_thres is not None else self.threshold_list
        if X is not None:
            df = X
        else:
            df = self.df
        # missing values are marked with the string '-99' and binned as null
        df = df.fillna('-99')
        if cat_style:
            def split(x, col):
                for _, item in enumerate(threshold_list[col]):
                    if x == '-99':
                        return '_null'
                    elif item[0] <= x < item[1]:
                        return str(item[0]) + '_' + str(item[1])
                    # adjustable: how values outside the fitted range are labeled
                    elif x < threshold_list[col][0][0]:
                        return '<' + 'first'
                    elif x >= threshold_list[col][-1][1]:
                        return '>=' + 'last'
        else:
            def split(x, col):
                for index, item in enumerate(threshold_list[col]):
                    if x == '-99':
                        return col + '_null'
                    elif item[0] <= x < item[1]:
                        return col + '_' + str(index + 1)
                    elif x < threshold_list[col][0][0]:
                        return col + '_0'
                    elif x >= threshold_list[col][-1][1]:
                        return col + '_' + str(len(threshold_list[col]))
        for col in df.columns:
            if col in self.num_cols:
                df.loc[:, col] = df.loc[:, col].map(lambda x: split(x, col))
        return df
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
iris = load_iris()
df=pd.concat([pd.DataFrame(iris.data),pd.DataFrame(iris.target)],ignore_index=True,axis=1)
df.columns=iris.feature_names+['target']
print(df)
# split the data
Sp=NumtoCategorical(num_cols=iris.feature_names,bins_num=5)
clf=Sp.fit(df,'target',split_func='tree')
dff=clf.transform()
dff
- woe
_data_dir = r'C:\Users\kyrie\Desktop'
def woe_transform(df, label):
    # currently only binary targets are handled; for multi-class problems,
    # consider multiplying the WOE by each class's proportion, which amounts
    # to adding a prior probability
    save_path = _data_dir + r'\woe_iv1.xlsx'
    writer = pd.ExcelWriter(save_path)
    labels = df[label].unique()
    label_one = labels[0]
    label_two = labels[1]
    df['num'] = df.index
    offset = 0
    def woe_(attr, offset):
        pt = pd.pivot_table(df, index=label, columns=attr, values='num',
                            aggfunc='count').T
        if pt.empty:
            dict_v = dict(zip(df[attr].unique(), [0]))
            return dict_v, offset
            # todo
        else:
            # WOE_i = ln( P(bin | class one) / P(bin | class two) )
            pt['WOEi'] = np.log((pt[label_one] / pt[label_one].sum()) /
                                (pt[label_two] / pt[label_two].sum())).round(4)
            # IV_i = WOE_i * ( P(bin | class one) - P(bin | class two) )
            pt['IVi'] = pt.WOEi.mul((pt[label_one] / pt[label_one].sum()) -
                                    (pt[label_two] / pt[label_two].sum())).round(3)
            iv = pt.IVi.sum()  # total IV of this feature
            pt = pt.fillna(0)
            key = pt.index.tolist()
            value = pt.WOEi.tolist()
            dict_v = dict(zip(key, value))
            pt.to_excel(writer, 'woe_detail', startrow=offset)
            offset += (pt.shape[0] + 2)
            return dict_v, offset
    cols = list(filter(lambda item: item not in [label, 'num'], df.columns))
    woe_list = []
    for col in cols:
        dict_v, offset = woe_(col, offset)
        woe_list.append(dict_v)
    # writer.save() was removed in pandas 2.0; close() flushes the file
    writer.close()
    return dict(zip(cols, woe_list))
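As a sanity check on the formula: for one bin, WOE = ln(share of class one in the bin / share of class two in the bin), and its IV contribution is the WOE times the difference of the two shares. A hand-computed toy example (numbers invented for illustration):

# 20 of 100 class-one rows and 5 of 100 class-two rows fall in this bin
share_one, share_two = 20 / 100, 5 / 100
woe = np.log(share_one / share_two)        # ln(4) ≈ 1.3863
iv_i = woe * (share_one - share_two)       # ≈ 0.2079
print(round(woe, 4), round(iv_i, 4))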
from sklearn.base import BaseEstimator, TransformerMixin
class CattoWoe(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    label : the label column name
    self_woedict : dict, an optional pre-computed WOE dict
    Attributes
    ----------
    woe_dict : dict of WOE values per column, e.g. {'col1': {'xx': 0.235}}
    Examples
    --------
    please refer to the readme example
    """
    def __init__(self, label, self_woedict=None):
        self.label = label
        self.self_woedict = self_woedict

    def fit(self, df):
        """
        df : the dataframe to fit the WOE mapping on
        """
        self.df = df
        self.woe_dict = woe_transform(df, self.label)
        return self
    def transform(self, X=None):
        """Transform X by replacing each category with its WOE value.
        Parameters
        ----------
        X : dataframe; if omitted, the data passed to fit is used.
            It must not contain the label column.
        Returns
        -------
        df : dataframe with categories replaced by WOE values
        """
        df = X if X is not None else self.df
        # prefer a user-supplied WOE dict (e.g. loaded from file) if given
        woe_dict = self.self_woedict if self.self_woedict is not None else self.woe_dict
        cols = list(filter(lambda item: item not in [self.label, 'num'], df.columns))
        for attr in cols:
            df[attr] = df[attr].map(woe_dict[attr])
        if X is None:
            # drop the helper column added by woe_transform during fit
            df.drop(['num'], axis=1, inplace=True)
        return df
Cw=CattoWoe('target')
wclf=Cw.fit(dff)
wdf=wclf.transform()
wdf.head()
2、Split the data
from sklearn.utils import shuffle
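split_data is a project helper that is not shown in this section; a minimal sketch consistent with how it is called below (shuffle the rows, then separate the features from the label) might be:

def split_data(df, label):
    # hypothetical helper: shuffle rows and split into features X and label Y
    df = shuffle(df).reset_index(drop=True)
    Y = df[label]
    X = df.drop(columns=[label])
    return X, Y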
X,Y=split_data(wdf,'target')
Data standardization
from sklearn.preprocessing import StandardScaler
def pro_data(X, x_test=None, fit_func=StandardScaler(), save=False):
    # preprocess the data; the scaler can be swapped via fit_func
    normalizer = fit_func
    normalizer.fit(X)
    # copy=False would overwrite the original values in place, unless the
    # input is not an np.array / scipy.sparse CSR matrix, in which case the
    # data is copied anyway; with_mean must be False for sparse CSR/CSC
    # matrices, otherwise memory blows up
    X = normalizer.transform(X)
    x_mean = normalizer.mean_  # per-feature mean
    x_std = normalizer.var_   # per-feature variance; larger means more spread
    if save:
        save_modelf('standar', normalizer)
    if x_test is not None:
        x_test = normalizer.transform(x_test)
        return X, x_test
    else:
        return X
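save_modelf (called above when save=True, and again in rf_model below) and save_params (called in section 4) are also project helpers that are not shown. A plausible joblib/JSON sketch, with the output file names as assumptions:

import json
import joblib

def save_modelf(name, model):
    # hypothetical helper: persist a fitted estimator to disk
    joblib.dump(model, '{}.pkl'.format(name))

def save_params(params, name):
    # hypothetical helper: persist the best hyperparameters as JSON
    with open('{}_params.json'.format(name), 'w') as f:
        json.dump(params, f)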
Oversampling
from imblearn.over_sampling import SMOTE
def smote_sample(X, y):
    # synthesize minority-class samples until the classes are balanced
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled, y_resampled
X_resampled,y_resampled=smote_sample(X,Y)
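To verify the oversampling worked, compare the class counts before and after (Counter is from the standard library):

from collections import Counter
print('before:', Counter(Y))
print('after: ', Counter(y_resampled))  # classes should now be balanced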
3、Feature selection
import random
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
def lasso_func(X, Y, cols, alp, positive=False, line=0, save=False):
    # stability selection: fit 100 Lasso models with random alphas and
    # count how often each feature gets a non-zero coefficient
    Y1 = Y.copy()
    Y1[Y1 == 0] = -1  # recode labels to {-1, 1} for the regression
    # accumulator for the selection counts
    index = np.zeros([X.shape[1], ])
    for i in range(100):
        # draw alpha uniformly from the given range
        alpha = random.uniform(alp[0], alp[1])
        clf = Lasso(alpha=alpha, positive=positive)
        clf.fit(X, Y1)
        data = clf.coef_
        data[data != 0] = 1
        index = index + data
    # join with the column names to get the selected columns
    df_lasso = pd.DataFrame(index / 100)
    print(df_lasso.shape)
    df_lasso = pd.concat([df_lasso, pd.DataFrame(cols)],
                         ignore_index=True, axis=1)
    df_lasso.columns = ['percent', 'col']
    # keep only features selected more often than the cut-off `line`
    df_lasso = df_lasso[df_lasso['percent'] > line]
    df_lasso = df_lasso.sort_values(by=['percent'], ascending=False)
    if save:
        df_lasso.to_excel('lasso_select_temp.xlsx')
    return df_lasso
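A usage sketch on the resampled data; the alpha range [0.001, 0.1] and the 60% cut-off are illustrative choices, not tuned values:

feat_cols = [c for c in wdf.columns if c != 'target']
# 100 Lasso fits; keep features selected in more than 60% of them
df_lasso = lasso_func(X_resampled, y_resampled, feat_cols,
                      alp=[0.001, 0.1], line=0.6)
print(df_lasso)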
4、Model parameter tuning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
def create_model_rf(X, Y):
    model = RandomForestClassifier()
    # randomly sample candidate hyperparameter values
    n_estimators = [random.randint(100, 500) for i in range(5)]
    max_depth = [random.randint(10, 30) for i in range(5)]
    # 'auto' was removed in recent scikit-learn versions, so only
    # 'sqrt' and 'log2' are tried here
    max_features = ['sqrt', 'log2']
    param_grid = dict(n_estimators=n_estimators, max_depth=max_depth,
                      max_features=max_features)
    # grid search with the default (accuracy) scoring; pass scoring='roc_auc'
    # or scoring='recall' to change the criterion
    grid = GridSearchCV(estimator=model, param_grid=param_grid,
                        n_jobs=1)
    grid_result = grid.fit(X, Y)
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))
    # persist the best parameters
    params = grid_result.best_params_
    save_params(params, 'rf')
    return params
5、Model evaluation and prediction
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# split the WOE-encoded data (the forest needs numeric features)
cols = [c for c in wdf.columns if c != 'target']
X_train, X_test, y_train, y_test = train_test_split(wdf[cols], wdf['target'], test_size=0.3)
def get_table(y_pre, y_test, func_name):
    # predicted probability (in percent) next to the true label
    df_pro = pd.DataFrame(y_pre * 100)
    # reset the index so the rows align with the 0..n predictions
    df_result = pd.concat([df_pro, pd.DataFrame(y_test).reset_index(drop=True)],
                          ignore_index=True, axis=1)
    # decile buckets of the predicted probability, for a quick lift view
    bins = [x * 10 for x in range(11)]
    df_result['pre'] = pd.cut(df_result[0], bins, include_lowest=True)
    df_result['test'] = df_result[1]
    df_result['num'] = df_result.index
    print(df_result.head())
    df_result = df_result[[0, 1]]
    df_result.rename(columns={0: '{}0'.format(
        func_name), 1: '{}1'.format(func_name)}, inplace=True)
    return df_result
def rf_model(X, Y, x_test, y_test, params, save_model=False):
    clfs = {'random_forest': RandomForestClassifier(**params)}
    # build the classifier, train it, and score on train and test sets
    clf = clfs['random_forest']
    clf.fit(X, Y)
    clf_score = (clf.score(X, Y), clf.score(x_test, y_test))
    # predicted probability of the positive class
    y_pre = clf.predict_proba(x_test)[:, 1]
    score_test = classification_report(y_test, clf.predict(x_test))
    print(score_test)
    result = get_table(y_pre, y_test, 'rf')
    if save_model:
        save_modelf('rf', clf)
    return result, clf_score
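Tying sections 4 and 5 together, a sketch that assumes the train/test split above:

params = create_model_rf(X_train, y_train)
result, clf_score = rf_model(X_train, y_train, X_test, y_test, params)
print('train/test accuracy:', clf_score)
print(result.head())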