特征衍生方法汇集

weixin_45271076

已于 2024-05-28 13:08:27 修改

阅读量180

点赞数 3

文章标签：集成学习机器学习

于 2024-05-26 16:10:12 首次发布

本文链接：https://blog.csdn.net/weixin_45271076/article/details/139174894

版权

在这里插入图片描述

#交叉组合特征衍生
def Binary_cross_Combination(colNames,features,OneHot=True):
    colNames_new_l=[]
    features_new_l=[]
    #提取需要组合的交叉特征
    features=features[colNames]
    for col_index,col_name in enumerate(colNames):
        for col_sub_index in range(col_index+1,len(colNames)):
            newNames=col_name+'&'+colNames[col_sub_index]
            colNames_new_l.append(newNames)
            newDF=pd.Series(features[col_name].astype('str')+'&'+features[colNames[col_sub_index]].astype('str'),
                        name=newNames)
            features_new_l.append(newDF)
    features_new=pd.concat(features_new_l,axis=1)
    features_new.columns=colNames_new_l
    colNames_new=colNames_new_l
    if OneHot==True:
        enc=preprocessing.OneHotEncoder()
        enc.fit_transform(features_new)
        colNames_new=cate_colName(enc,colNames_new_l,drop='if_binary')
        features_new=pd.DataFrame(enc.fit_transform(features_new).toarray(),columns=colNames_new)
        return features_new



```c
colNames=['SeniorCitizen','Partner','Dependents']
features = tcc.drop(columns=[ID_col, target]).copy()
Binary_cross_Combination(colNames,features)

在这里插入图片描述

def cate_colName(Transformer,category_cols,drop='if_binary'):
    #离散字段独热编码后字段名函数
    #Transformer#独热编码转换器
    #category_cols，输入转化器的离散变量
    cate_cols_new=[]
    col_value=Transformer.categories_
    for index,name in enumerate(cate_cols):
        if (drop=='if_binary')&len(Transformer.categories_[index]==2):
            #如果是二分类，则不参与独热编码，列名就是子集
            cate_cols_new.append(name)
        else:
            for f in enc.categories_[index]:
                feature_name=name+'_'+f
                cate_cols_new.append(feature_name)
    return cate_cols_new

在这里插入图片描述

reset_index()可以把tenure拉回到列名的位置

在这里插入图片描述

def Binary_Group_Statistics(keyCol, 
                            features, 
                            col_num=None, 
                            col_cat=None, 
                            num_stat=['mean', 'var', 'max', 'min', 'skew', 'median'], 
                            cat_stat=['mean', 'var', 'max', 'min', 'median', 'count', 'nunique'], 
                            quant=True):
    """
    双变量分组统计特征衍生函数
    
    :param keyCol: 分组参考的关键变量
    :param features: 原始数据集
    :param col_num: 参与衍生的连续型变量
    :param col_cat: 参与衍生的离散型变量
    :param num_stat: 连续变量分组统计量
    :param cat_num: 离散变量分组统计量  
    :param quant: 是否计算分位数  

    :return：交叉衍生后的新特征和新特征的名称
    """
    
    # 当输入的特征有连续型特征时
    if col_num != None:
        aggs_num = {}
        colNames = col_num
        
        # 创建agg方法所需字典
        for col in col_num:
            aggs_num[col] = num_stat 
            
        # 创建衍生特征名称列表
        cols_num = [keyCol]
        for key in aggs_num.keys():
            cols_num.extend([key+'_'+keyCol+'_'+stat for stat in aggs_num[key]])
            
        # 创建衍生特征df
        features_num_new = features[col_num+[keyCol]].groupby(keyCol).agg(aggs_num).reset_index()
        features_num_new.columns = cols_num 
        
        # 当输入的特征有连续型也有离散型特征时
        if col_cat != None:        
            aggs_cat = {}
            colNames = col_num + col_cat

            # 创建agg方法所需字典
            for col in col_cat:
                aggs_cat[col] = cat_stat

            # 创建衍生特征名称列表
            cols_cat = [keyCol]
            for key in aggs_cat.keys():
                cols_cat.extend([key+'_'+keyCol+'_'+stat for stat in aggs_cat[key]])    

            # 创建衍生特征df
            features_cat_new = features[col_cat+[keyCol]].groupby(keyCol).agg(aggs_cat).reset_index()
            features_cat_new.columns = cols_cat
    
            # 合并连续变量衍生结果与离散变量衍生结果
            df_temp = pd.merge(features_num_new, features_cat_new, how='left',on=keyCol)
            features_new = pd.merge(features[keyCol], df_temp, how='left',on=keyCol)
            features_new.loc[:, ~features_new.columns.duplicated()]
            colNames_new = cols_num + cols_cat
            colNames_new.remove(keyCol)
            colNames_new.remove(keyCol)
         
        # 当只有连续变量时
        else:
            # merge连续变量衍生结果与原始数据，然后删除重复列
            features_new = pd.merge(features[keyCol], features_num_new, how='left',on=keyCol)
            features_new.loc[:, ~features_new.columns.duplicated()]
            colNames_new = cols_num
            colNames_new.remove(keyCol)
    
    # 当没有输入连续变量时
    else:
        # 但存在分类变量时，即只有分类变量时
        if col_cat != None:
            aggs_cat = {}
            colNames = col_cat

            for col in col_cat:
                aggs_cat[col] = cat_stat

            cols_cat = [keyCol]
            for key in aggs_cat.keys():
                cols_cat.extend([key+'_'+keyCol+'_'+stat for stat in aggs_cat[key]])    

            features_cat_new = features[col_cat+[keyCol]].groupby(keyCol).agg(aggs_cat).reset_index()
            features_cat_new.columns = cols_cat            
             
            features_new = pd.merge(features[keyCol], features_cat_new, how='left',on=keyCol)
            features_new.loc[:, ~features_new.columns.duplicated()]
            colNames_new = cols_cat
            colNames_new.remove(keyCol) 
    
    if quant:
        # 定义四分位计算函数
        def q1(x):
            """
            下四分位数
            """
            return x.quantile(0.25)

        def q2(x):
            """
            上四分位数
            """
            return x.quantile(0.75)

        aggs = {}
        for col in colNames:
            aggs[col] = ['q1', 'q2']

        cols = [keyCol]
        for key in aggs.keys():
            cols.extend([key+'_'+keyCol+'_'+stat for stat in aggs[key]])    

        aggs = {}
        for col in colNames:
            aggs[col] = [q1, q2]    

        features_temp = features[colNames+[keyCol]].groupby(keyCol).agg(aggs).reset_index()
        features_temp.columns = cols

        features_new = pd.merge(features_new, features_temp, how='left',on=keyCol)
        features_new.loc[:, ~features_new.columns.duplicated()]
        colNames_new = colNames_new + cols
        colNames_new.remove(keyCol)     
    
    features_new.drop([keyCol], axis=1, inplace=True)
        
    return features_new, colNames_new