Complete data analysis workflow

1、binning_woe

  • binning
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

def get_interval(df,label,split_func,bins_num=None,self_thres=None):
    """
    df : the DataFrame to process
    label : the name of the label column
    split_func : the method used to build the threshold list ('chi' or 'tree')
    bins_num : the desired number of intervals
    self_thres : if split_func is not 'chi' or 'tree', supply your own threshold dict here
    """
    df=df.fillna(0)
    cols=list(filter(lambda item:item !=label,df.columns))
    y=df[label]
    if split_func=='chi':
        # chi_merge is the author's chi-square merging helper (not shown in this article)
        threshold_list=[chi_merge(df,item,y,label,bins_num=bins_num) for item in cols]
        return dict(zip(cols,threshold_list))
    elif split_func=='tree':
        threshold_list=[dtree_threshold(df[item],y,bins_num=bins_num) for item in cols]
        return dict(zip(cols,threshold_list))
    else:
        if isinstance(self_thres,dict):
            return self_thres
        else:
            raise ValueError("you must supply your own threshold dict via self_thres")
 
def dtree_threshold(X,y,bins_num=None):
    # fit a shallow decision tree on a single feature and use its split points as bin edges
    clf = DecisionTreeClassifier(max_leaf_nodes=bins_num)
    X=np.array(X).reshape(-1,1)
    clf.fit(X,y)
    # thresholds of the internal nodes that split on this (only) feature
    interval=list(clf.tree_.threshold[clf.tree_.feature == 0])
    interval.append(X.min())
    interval.append(X.max())
    interval=sorted(interval)
    intervals=[[interval[i], interval[i+1]] for i in range(len(interval)-1)]
    new_intervals=check_length_interval(X,intervals)
    return new_intervals

def check_length_interval(X,intervals):
    # merge intervals that hold fewer than 8% of the samples into a neighbour
    threshold_num=X.shape[0]*0.08
    new_intervals=[]
    big_set=set([X.min()])
    for index in range(len(intervals)):
        count_interval= len(np.where(np.logical_and(X>=intervals[index][0], X<intervals[index][1]))[0])
        if count_interval<threshold_num: # merge with the previous/next interval
            if index==len(intervals)-1:
                t = intervals[index-1] + intervals[index]
            else:
                t = intervals[index] + intervals[index+1]
            append_item=[min(t), max(t)]
        else:
            append_item=intervals[index]
        # only keep the interval if it does not overlap what has already been kept
        if min(append_item)>=max(big_set):
            big_set.add(max(append_item))
            new_intervals.append(append_item)
    return new_intervals
from sklearn.base import BaseEstimator, TransformerMixin
import math
class NumtoCategorical(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    bins_num : int, the number of bins
    self_thres : dict, your own split dict, e.g. {'col1':[[0,2],[2,5]]}
    num_cols : list, the numeric columns to bin
    Attributes
    ----------
    threshold_list : dict of intervals, e.g. {'col1':[[0,2],[2,5]]}
    Examples
    --------
    from sklearn.datasets import load_iris
    import pandas as pd
    iris = load_iris()
    df=pd.concat([pd.DataFrame(iris.data),pd.DataFrame(iris.target)],ignore_index=True,axis=1)
    df.columns=iris.feature_names+['target']
    #split data
    Sp=NumtoCategorical(num_cols=iris.feature_names,bins_num=5)
    clf=Sp.fit(df,'target',split_func='tree')
    dff=clf.transform()
    dff=pd.concat([dff,df],axis=1)
    """

    def __init__(self,bins_num=15,self_thres=None,num_cols=None):
        self.bins_num = bins_num
        self.self_thres=self_thres
        self.num_cols=num_cols   

    def fit(self, df_all, label,split_func):
        """
        df_all : DataFrame containing only numeric feature columns and the label column (no categorical columns)
        label : the label column name
        split_func : the split function, one of ['tree','chi']
        """
        if label is None:
            raise ValueError("you need to pass the label column name")
        cols=self.num_cols+[label]
        # split the numeric columns into intervals
        self.threshold_list=get_interval(df_all[cols],label,split_func,bins_num=self.bins_num,
            self_thres=self.self_thres)
        self.df=df_all
        return self

    def transform(self, X=None,cat_style=True):
        """Map each numeric column to its interval label.
        Parameters
        ----------
        X : dataframe, if not given the data passed to fit is used;
            must not contain the label column
        cat_style : if True the interval bounds are used as the category name,
            otherwise the interval index is used
        Returns
        -------
        df : dataframe with the numeric columns replaced by interval labels
        """
        threshold_list= self.self_thres if self.self_thres is not None else self.threshold_list
        if X is not None:
            df=X
        else:
            df=self.df
        df=df.fillna('-99')
        if cat_style:
            def split(x,col):
                for _,item in enumerate(threshold_list[col]):
                    if x=='-99':
                        return '_null'
                    elif item[0] <= x < item[1]:
                        return str(item[0])+'_'+str(item[1])
                    # adjustable: how values outside the fitted range are labelled
                    elif x<threshold_list[col][0][0]:
                        return '<'+'first'
                    elif x>=threshold_list[col][-1][1]:
                        return '>='+'last'
        else:
            def split(x,col):
                for index,item in enumerate(threshold_list[col]):
                    if x=='-99':
                        return col+'_null'
                    elif item[0] <= x < item[1]:
                        return col+'_'+str(index+1)
                    elif x<threshold_list[col][0][0]:
                        return col+'_0'
                    elif x>=threshold_list[col][-1][1]:
                        return col+'_'+str(len(threshold_list[col]))

        for col in df.columns:
            if col in self.num_cols:
                df.loc[:, col] = df.loc[:, col].map(lambda x:split(x,col))
        return df
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
iris = load_iris()
df=pd.concat([pd.DataFrame(iris.data),pd.DataFrame(iris.target)],ignore_index=True,axis=1)
df.columns=iris.feature_names+['target']
print(df)
# split the data
Sp=NumtoCategorical(num_cols=iris.feature_names,bins_num=5)
clf=Sp.fit(df,'target',split_func='tree')
dff=clf.transform()
dff
  • woe
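For each binned column, the code below builds a pivot table of category counts per label class and computes, for bin i, WOEi = ln((n1,i/N1) / (n2,i/N2)) and IVi = WOEi * (n1,i/N1 - n2,i/N2), where N1 and N2 are the total counts of the two label classes; the column's IV is the sum of its IVi. The per-column tables are also written to an Excel file.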
_data_dir=r'C:\Users\kyrie\Desktop'
def woe_transform(df,label):
    # Currently this only handles binary targets; for multi-class problems you could
    # weight each class's WOE by the class proportion, which amounts to adding a prior probability.
    save_path = _data_dir+'\\woe_iv1.xlsx'
    writer = pd.ExcelWriter(save_path)
    labels=df[label].unique()
    label_one=labels[0]
    label_two=labels[1]
    df['num']=df.index
    offset = 0
    def woe_(attr,offset):
        pt = pd.pivot_table(df, index=label,columns=attr, values='num', aggfunc='count').T
        if pt.empty:
            dict_v=dict(zip(df[attr].unique(),[0]))
            return dict_v,offset
            #todo
        else:
            pt['WOEi'] = np.log((pt[label_one] / pt[label_one].sum()) /
                            (pt[label_two] / pt[label_two].sum())).round(4)
            pt['IVi'] = pt.WOEi.mul((pt[label_one] / pt[label_one].sum()) -
                            (pt[label_two] / pt[label_two].sum())).round(3)
            iv = pt.IVi.sum()
            pt = pt.fillna(0)
            key = pt.index.tolist()
            value = pt.WOEi.tolist()
            dict_v = dict(zip(key, value))
            pt.to_excel(writer, 'woe明细', startrow=offset)
            offset += (pt.shape[0] + 2)
            return dict_v,offset
    cols=list(filter(lambda item:item not in [label,'num'],df.columns))
    woe_list=[]
    for col in cols:
        dict_v,offset=woe_(col,offset)
        woe_list.append(dict_v)
    writer.save()
    return dict(zip(cols,woe_list))
from sklearn.base import BaseEstimator, TransformerMixin

class CattoWoe(BaseEstimator, TransformerMixin):
    """
    Parameters
    ----------
    label : the label column name
    self_woedict : a previously fitted WOE dict to reuse instead of fitting
    Attributes
    ----------
    woe_dict : dict mapping each column to its {category: WOE} dict, e.g. {'col1':{'xx':0.235}}
    Examples
    --------
    please refer to the readme example
    """

    def __init__(self,label,self_woedict=None):
        self.label=label
        self.self_woedict=self_woedict

    def fit(self, df):
        """
        df : the DataFrame to fit on (must contain the label column)
        """
        self.df=df
        self.woe_dict=woe_transform(df,self.label)
        return self

    def transform(self, X=None):
        """Replace each categorical value with its WOE value.
        Parameters
        ----------
        X : dataframe, if not given the data passed to fit is used;
            must not contain the label column
        Returns
        -------
        df : dataframe with the categorical columns mapped to WOE values
        """
        df= X if X is not None else self.df
        woe_dict= self.self_woedict if self.self_woedict is not None else self.woe_dict
        cols=list(filter(lambda item:item not in [self.label,'num'],df.columns))
        for attr in cols:
            df[attr] = df[attr].map(woe_dict[attr])
        if X is None:
            # drop the helper column added by woe_transform
            df.drop(['num'],axis=1,inplace=True)
        return df
Cw=CattoWoe('target')
wclf=Cw.fit(dff)
wdf=wclf.transform()
wdf.head()

2、Splitting the data

from sklearn.utils import shuffle
X,Y=split_data(wdf,'target')
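
split_data itself is not shown in the article; a minimal sketch of what it presumably does (shuffle the rows, then separate the feature matrix from the label column) could look like this:

def split_data(df, label):
    # hypothetical sketch, not the author's original implementation
    df = shuffle(df).reset_index(drop=True)   # randomize row order
    X = df.drop(columns=[label]).values       # feature matrix
    Y = df[label].values                      # label vector
    return X, Y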

Data standardization

from sklearn.preprocessing import StandardScaler
def pro_data(X, x_test=None, fit_func=StandardScaler(), save=False):
    # Preprocess the data; fit_func lets you swap in another scaler/normalizer.
    normalizer = fit_func
    normalizer.fit(X)
    # StandardScaler notes: with copy=False the original values are overwritten
    # (unless the input is not an np.array / scipy.sparse CSR matrix, in which
    # case it is still copied); with sparse CSR/CSC matrices set with_mean=False
    # or memory usage will blow up.
    X = normalizer.transform(X)
    x_mean = normalizer.mean_  # per-feature mean
    x_std = normalizer.var_    # per-feature variance; larger means more spread
    if save:
        # save_modelf is the author's helper for persisting the fitted scaler
        save_modelf('standar',normalizer)
    if x_test is not None:
        x_test = normalizer.transform(x_test)
        return X, x_test
    else:
        return X
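
The article does not show the call explicitly; one possible use, with X being the feature matrix from step 2 (a held-out test matrix could also be passed via x_test so it is scaled with the same statistics):

X = pro_data(X)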

Oversampling

from imblearn.over_sampling import SMOTE
def smote_sample(X, y):
    # oversample the minority class with SMOTE to balance the label distribution
    smote = SMOTE()
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return X_resampled,y_resampled

X_resampled,y_resampled=smote_sample(X,Y)
3、Feature selection

import random
import numpy as np
from sklearn.linear_model import Lasso
def lasso_func(X, Y, cols, alp, positive=False, line=0, save=False):
    Y1 = Y.copy()
    Y1[Y1 == 0] = -1
    # count how often each feature receives a non-zero Lasso coefficient
    index = np.zeros([X.shape[1], ])
    for i in range(100):
        # draw a random alpha from the given range
        alpha = random.uniform(alp[0], alp[1])
        clf = Lasso(alpha=alpha, positive=positive)
        clf.fit(X, Y1)
        data = clf.coef_
        data[data != 0] = 1
        index = index + data
    # attach the column names to get the selection frequency per feature
    df_lasso = pd.DataFrame(index / 100)
    print(df_lasso.shape)
    df_lasso = pd.concat([df_lasso, pd.DataFrame(cols)],
                         ignore_index=True, axis=1)
    df_lasso.columns = ['percent', 'col']
    # keep only the features selected more often than the given threshold
    df_lasso = df_lasso[df_lasso['percent'] > line]
    df_lasso = df_lasso.sort_values(by=['percent'], ascending=False)
    if save:
        df_lasso.to_excel('lasso_select_temp.xlsx')
    return df_lasso
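
A hedged usage sketch; the alpha range (0.001, 0.1) and the 0.5 selection cut-off are illustrative assumptions, not values from the article:

feature_cols = [c for c in wdf.columns if c != 'target']
df_lasso = lasso_func(X_resampled, y_resampled, feature_cols, alp=(0.001, 0.1), line=0.5)
cols = df_lasso['col'].tolist()   # columns kept for the modelling steps below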

4、Model hyperparameter tuning

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

def creat_model_rf(X, Y):
    model = RandomForestClassifier()
    # randomly sample candidate hyperparameter values
    n_estimators = [random.randint(100, 500) for i in range(5)]
    max_depth = [random.randint(10, 30) for i in range(5)]
    max_features = ['sqrt', 'log2', 'auto']  # 'auto' is deprecated in newer scikit-learn
    param_grid = dict(n_estimators=n_estimators, max_depth=max_depth,
                      max_features=max_features)
    # build the grid search; scoring uses GridSearchCV's default (accuracy),
    # pass scoring='roc_auc' or scoring='recall' to change the criterion
    grid = GridSearchCV(estimator=model, param_grid=param_grid,
                        n_jobs=1)
    grid_result = grid.fit(X, Y)
    print("Best: %f using %s" %
          (grid_result.best_score_, grid_result.best_params_))
    # save and return the best parameters (save_params is the author's helper)
    params=grid_result.best_params_
    save_params(params,'rf')
    return params
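
For example, using the resampled training data from step 2:

params = creat_model_rf(X_resampled, y_resampled)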

5、Model evaluation and prediction

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# cols: the feature columns kept after the lasso selection in step 3; the WOE-encoded
# frame wdf is used here because the random forest needs numeric inputs
X_train,X_test,y_train,y_test=train_test_split(wdf[cols],wdf['target'],test_size=0.3)
def get_table(y_pre, y_test, func_name):
    # put the predicted probability (as a percentage) next to the true label;
    # convert y_test to an array so the concat is not misaligned by its original index
    df_pro = pd.DataFrame(y_pre * 100)
    df_result = pd.concat([df_pro, pd.DataFrame(np.array(y_test))],
                          ignore_index=True, axis=1)
    # bucket the predicted probability into 10-point bins
    bins = [x * 10 for x in range(11)]
    df_result['pre'] = pd.cut(df_result[0], bins, include_lowest=True)
    df_result['test'] = df_result[1]
    df_result['num'] = df_result.index
    print(df_result.head())
    df_result = df_result[[0, 1]]
    df_result.rename(columns={0: '{}0'.format(
        func_name), 1: '{}1'.format(func_name)}, inplace=True)
    return df_result

def rf_model(X, Y, x_test, y_test, params, save_model=False):
    clfs = {'random_forest': RandomForestClassifier(**params)}
    # build the classifier, train on the training set and score both sets
    clf = clfs['random_forest']
    clf.fit(X, Y)
    clf_score = (clf.score(X, Y), clf.score(x_test, y_test))
    # predicted probability of the positive class
    y_pre = clf.predict_proba(x_test)[:, 1]
    score_test = classification_report(y_test, clf.predict(x_test))
    result = get_table(y_pre, y_test, 'rf')
    if save_model:
        save_modelf('rf',clf)
    return result, clf_score
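
Wiring the pieces together, a possible final call (the returned frame pairs the predicted probability, rf0, with the true label, rf1):

result, clf_score = rf_model(X_train, y_train, X_test, y_test, params)
print(clf_score)   # (train accuracy, test accuracy)
result.head()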