分箱处理
1、取出数值类型的列(num_cols)和过滤掉没有标签的数据(df)
# Every column except the target label is a candidate feature.
cols = [col for col in df.columns if col != 'classification']

# Encode the target: 'good' -> 1, 'bad' -> 0. Any other value is left as is
# and is removed by the row filter at the end of this cell.
df['classification'] = df['classification'].replace({'good': 1, 'bad': 0})
df['classification'].value_counts()  # notebook display of the class balance

# dtypes present in the frame that are treated as numeric (int64 / float64).
num_dtypes = [dtype for dtype in df.dtypes if dtype in ('int64', 'float64')]

# Numeric *feature* columns. They are drawn from `cols` rather than from
# `df.columns` because the label may have become numeric after the encoding
# above, and it must not be binned as if it were a feature.
num_cols = [col for col in cols if df[col].dtype in num_dtypes]

# Keep only the rows whose label is one of the two known classes.
df = df[df['classification'].isin([0, 1])]
2、定义分箱阈值的计算函数（卡方分箱 chi_merge、决策树分箱 dtree_threshold）以及分箱转换器 NumtoCategorical
def get_interval(df, label, split_func, bins_num=None, self_thres=None):
    """Build the bin intervals for every feature column of ``df``.

    Parameters
    ----------
    df : DataFrame holding the feature columns plus the label column.
    label : name of the label column; every other column is binned.
    split_func : ``'chi'`` to derive intervals with ``chi_merge``, ``'tree'``
        to derive them with ``dtree_threshold``; any other value means the
        caller supplies the intervals through ``self_thres``.
    bins_num : requested number of bins. When left as ``None`` the splitting
        function's own default is used.
    self_thres : dict of user-defined intervals; only consulted when
        ``split_func`` is neither ``'chi'`` nor ``'tree'``.

    Returns
    -------
    dict mapping each feature column name to its list of intervals, or
    ``self_thres`` itself on the user-defined path.

    Raises
    ------
    ValueError
        If ``split_func`` is not ``'chi'``/``'tree'`` and ``self_thres`` is
        not a dict.
    """
    if split_func not in ('chi', 'tree'):
        # User-defined thresholds: nothing has to be computed from the data.
        if isinstance(self_thres, dict):
            return self_thres
        raise ValueError("you need input yourself threshold_list")

    # Missing values are treated as 0 while the thresholds are learned.
    df = df.fillna(0)
    feature_cols = [col for col in df.columns if col != label]
    y = df[label]

    # Forward bins_num only when the caller set it; passing None explicitly
    # would override chi_merge's default and break its loop condition.
    extra = {} if bins_num is None else {'bins_num': bins_num}

    if split_func == 'chi':
        return {col: chi_merge(df, col, y, label, **extra) for col in feature_cols}
    return {col: dtree_threshold(df[col], y, **extra) for col in feature_cols}
def chi_merge(data, attr, y, label, bins_num=15):
    """Derive bin intervals for column ``attr`` by merging adjacent intervals.

    Starting from one interval per pair of consecutive distinct values, the
    adjacent pairs with the lowest chi-square statistic (computed from the
    label counts of the two intervals) are merged repeatedly until at most
    ``bins_num`` intervals remain. The result is then passed through
    ``check_length_interval``.

    Parameters
    ----------
    data : DataFrame containing both ``attr`` and ``label`` columns.
    attr : name of the column to bin.
    y : label values, used only to enumerate the possible classes.
    label : name of the label column inside ``data``.
    bins_num : maximum number of intervals before the final length check;
        ``None`` falls back to 15.

    Returns
    -------
    list of ``[low, high]`` intervals.

    Raises
    ------
    ValueError
        If ``bins_num`` is smaller than 1 (the merge loop could never end).
    """
    if bins_num is None:
        bins_num = 15
    if bins_num < 1:
        raise ValueError("bins_num must be at least 1")

    distinct_vals = sorted(set(data[attr]))
    labels = sorted(set(y))
    # Zero count per class, so every count vector has the same layout even
    # when a class is absent from an interval.
    empty_count = {l: 0 for l in labels}
    # One initial interval per pair of consecutive distinct values.
    intervals = [[low, high] for low, high in zip(distinct_vals[:-1], distinct_vals[1:])]

    while len(intervals) > bins_num:
        # Chi-square statistic of every pair of adjacent intervals.
        chi = []
        for i in range(len(intervals) - 1):
            # Series.between is inclusive at both ends by default.
            obs0 = data[data[attr].between(intervals[i][0], intervals[i][1])]
            obs1 = data[data[attr].between(intervals[i + 1][0], intervals[i + 1][1])]
            total = len(obs0) + len(obs1)
            count_0 = np.array(list({**empty_count, **Counter(obs0[label])}.values()))
            count_1 = np.array(list({**empty_count, **Counter(obs1[label])}.values()))
            count_total = count_0 + count_1
            expected_0 = count_total * count_0.sum() / total
            expected_1 = count_total * count_1.sum() / total
            # A class missing from both intervals yields 0/0; the resulting
            # NaN is turned into 0 right below, so silence the warning.
            with np.errstate(divide='ignore', invalid='ignore'):
                chi_ = ((count_0 - expected_0) ** 2 / expected_0
                        + (count_1 - expected_1) ** 2 / expected_1)
            chi_ = np.nan_to_num(chi_)
            chi.append(sum(chi_))

        # Pick the `step` pairs with the smallest chi-square for merging.
        sort_chi = sorted(enumerate(chi), key=lambda x: x[1], reverse=True)
        step = len(intervals) - bins_num
        merge_indices = {position for position, _ in sort_chi[-step:]}

        new_intervals = []
        # Upper bound covered so far; an interval starting below it overlaps
        # an interval that was already kept and is therefore skipped.
        covered_upto = min(distinct_vals)
        for index in range(len(intervals)):
            if index in merge_indices:
                # Merge with the right-hand neighbour.
                t = intervals[index] + intervals[index + 1]
                append_item = [min(t), max(t)]
            else:
                append_item = intervals[index]
            if min(append_item) >= covered_upto:
                covered_upto = max(append_item)
                new_intervals.append(append_item)
        intervals = new_intervals

    # Final pass: merge intervals that hold too few samples.
    intervals = check_length_interval(np.array(data[attr]), intervals)
    return intervals
from sklearn.tree import DecisionTreeClassifier
def dtree_threshold(X, y, bins_num=None):
    """Derive bin intervals for one feature from a decision tree's splits.

    A ``DecisionTreeClassifier`` limited to ``bins_num`` leaf nodes is fitted
    on the single feature ``X`` against ``y``. The thresholds of the nodes
    that split on that feature, together with the feature's minimum and
    maximum, become the interval edges. The intervals are then passed
    through ``check_length_interval``.

    Parameters
    ----------
    X : one-dimensional feature values (anything ``np.array`` accepts).
    y : label values aligned with ``X``.
    bins_num : forwarded as ``max_leaf_nodes`` of the tree.

    Returns
    -------
    list of ``[low, high]`` intervals.
    """
    # The classifier expects a 2-D array: one column, one row per sample.
    feature = np.array(X).reshape(-1, 1)

    tree = DecisionTreeClassifier(max_leaf_nodes=bins_num)
    tree.fit(feature, y)

    # Thresholds of the nodes that split on the only feature (index 0).
    splits_on_feature = tree.tree_.feature == 0
    edges = list(tree.tree_.threshold[splits_on_feature])
    edges.extend([feature.min(), feature.max()])
    edges.sort()

    # Consecutive edges delimit the candidate intervals.
    candidate_intervals = [[low, high] for low, high in zip(edges[:-1], edges[1:])]
    return check_length_interval(feature, candidate_intervals)
def check_length_interval(X, intervals, min_fraction=0.08):
    """Merge intervals that contain too few samples into a neighbour.

    An interval is under-populated when fewer than
    ``X.shape[0] * min_fraction`` values fall inside ``[low, high)``. Such an
    interval is merged with its right-hand neighbour, or with its left-hand
    neighbour when it is the last one. This is a single pass over the
    intervals; a merged interval is not re-checked.

    Parameters
    ----------
    X : numpy array of the feature values (1-D, or 2-D with one column).
    intervals : list of ``[low, high]`` pairs in ascending order.
    min_fraction : minimum share of samples an interval must hold
        (default 0.08, i.e. 8%).

    Returns
    -------
    New list of ``[low, high]`` intervals; the input list and its items are
    not modified.
    """
    min_count = X.shape[0] * min_fraction
    last_index = len(intervals) - 1
    kept = []
    # Upper bound covered so far; an interval starting below it overlaps an
    # interval that was already kept.
    covered_upto = X.min()

    for index, current in enumerate(intervals):
        inside = np.logical_and(X >= current[0], X < current[1])
        if np.count_nonzero(inside) < min_count:
            # Too few samples: merge with a neighbour.
            neighbour = intervals[index - 1] if index == last_index else intervals[index + 1]
            bounds = list(current) + list(neighbour)
            candidate = [min(bounds), max(bounds)]
        else:
            candidate = current

        if min(candidate) >= covered_upto:
            covered_upto = max(candidate)
            kept.append(candidate)
        elif index == last_index and kept and max(candidate) > covered_upto:
            # The sparse last interval was merged backwards. Its left
            # neighbour is already in `kept`, so stretch that one instead of
            # dropping the tail range. A new list is stored so the caller's
            # interval objects stay untouched.
            covered_upto = max(candidate)
            kept[-1] = [min(kept[-1]), covered_upto]

    return kept
from sklearn.base import BaseEstimator, TransformerMixin
class NumtoCategorical(BaseEstimator, TransformerMixin):
    """Replace the values of numeric columns by the label of their bin.

    Parameters
    ----------
    bins_num : requested number of bins, forwarded to ``get_interval``.
    self_thres : optional dict ``{column: [[low, high], ...]}`` of
        user-defined intervals. When set, ``transform`` uses it instead of
        the intervals learned by ``fit``.
    num_cols : list of the column names to bin.
    """

    # Placeholder written into missing cells before binning; a cell equal to
    # it is reported as the "null" bin.
    _NULL_MARK = '-99'

    def __init__(self, bins_num=15, self_thres=None, num_cols=None):
        self.bins_num = bins_num
        self.self_thres = self_thres
        self.num_cols = num_cols

    def fit(self, df_all, label, split_func):
        """Learn the bin intervals of every column in ``num_cols``.

        Parameters
        ----------
        df_all : DataFrame containing the columns to bin and the label.
        label : name of the label column.
        split_func : splitting method understood by ``get_interval``.

        Returns
        -------
        self, with ``threshold_list`` (column -> intervals) and ``df``
        (the frame passed in, reused by ``transform()`` without argument).

        Raises
        ------
        ValueError
            If ``label`` is None or ``num_cols`` was not provided.
        """
        if label is None:
            raise ValueError("you need confirm input label column name, got error")
        if self.num_cols is None:
            raise ValueError("num_cols must list the columns to bin")

        # The label must appear exactly once in the frame handed over, even
        # if the caller also listed it in num_cols.
        feature_cols = [col for col in self.num_cols if col != label]
        self.threshold_list = get_interval(
            df_all[feature_cols + [label]],
            label,
            split_func,
            bins_num=self.bins_num,
            self_thres=self.self_thres,
        )
        self.df = df_all
        return self

    def transform(self, X=None, cat_style=True):
        """Return a copy of the data with binned columns turned into labels.

        Parameters
        ----------
        X : DataFrame to transform; when None, the frame given to ``fit``.
        cat_style : if True, labels look like ``'<low>_<high>'``, ``'_null'``,
            ``'<first'`` and ``'>=last'``; if False, they look like
            ``'<col>_<k>'`` and ``'<col>_null'``.

        Returns
        -------
        DataFrame in which missing cells hold ``'-99'`` and every column of
        ``num_cols`` that has intervals holds bin labels. Columns without
        intervals (for example the label) are left unbinned.
        """
        thresholds = self.self_thres if self.self_thres is not None else self.threshold_list
        source = X if X is not None else self.df
        # fillna returns a new frame, so the caller's data is not modified.
        df = source.fillna(self._NULL_MARK)

        for col in df.columns:
            if col not in self.num_cols or col not in thresholds:
                continue
            intervals = thresholds[col]
            # Whole-column assignment: the numeric column is replaced by a
            # column of strings, so it must not be set in place via .loc.
            df[col] = df[col].map(
                lambda value, col=col, intervals=intervals:
                self._bin_label(value, col, intervals, cat_style)
            )
        return df

    def _bin_label(self, value, col, intervals, cat_style):
        """Return the bin label of one cell, or None when no bin matches."""
        if value == self._NULL_MARK:
            return '_null' if cat_style else col + '_null'
        if len(intervals) == 0:
            return None
        if value < intervals[0][0]:
            return '<first' if cat_style else col + '_0'
        if value >= intervals[-1][1]:
            # With cat_style=False this shares its label with the last
            # interval, exactly as the numbering has always worked.
            return '>=last' if cat_style else col + '_' + str(len(intervals))
        for index, item in enumerate(intervals):
            if item[0] <= value < item[1]:
                if cat_style:
                    return str(item[0]) + '_' + str(item[1])
                return col + '_' + str(index + 1)
        return None
# Bin every numeric feature: learn tree-based intervals (bins_num=5 is used
# as the tree's leaf limit), then replace each value by its interval label.
Sp = NumtoCategorical(bins_num=5, num_cols=num_cols)
clf = Sp.fit(df, label='classification', split_func='tree')
dff = clf.transform()
dff