数据竞赛模拟_房租预测（3 特征工程）

最新推荐文章于 2024-07-08 22:12:26 发布

一-叶知秋

最新推荐文章于 2024-07-08 22:12:26 发布

阅读量320

点赞数

分类专栏：编程文章标签：算法

本文链接：https://blog.csdn.net/weixin_44370010/article/details/103948281

版权

编程专栏收录该内容

23 篇文章 0 订阅

订阅专栏

任务3 特征工程

特征工程的分析：特征工程

1 任务2的部分预处理

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import IsolationForest #异常点检测，孤立森林算法
from sklearn.preprocessing import StandardScaler
ss = StandardScaler() 

#导入数据
data_train = pd.read_csv('./data/train_data.csv')
data_test = pd.read_csv('./data/test_a.csv')

"""
任务2的部分数据预处理
"""
def preprocessingData(data):
    # 填充缺失值,这里的未知方式也算一种类型
    data['rentType'][data['rentType'] == '--'] = '未知方式'

    # 处理pv和uv的空值
    data['pv'].fillna(data['pv'].mean(), inplace=True)
    data['uv'].fillna(data['uv'].mean(), inplace=True)
    data['pv'] = data['pv'].astype('int')
    data['uv'] = data['uv'].astype('int')
    
    # 去掉部分特征
    data.drop('city', axis=1, inplace=True)
    data.drop('ID', axis=1, inplace=True)
    return data

data_train = preprocessingData(data_train)

# clean data
def IF_drop(train):
    #异常点检测，孤立森林算法
    #contamination是用户设置样本中异常点的比例
    IForest = IsolationForest(contamination=0.01) 
    #fit(X)： Fit estimator.（无监督）
    IForest.fit(train["tradeMoney"].values.reshape(-1,1))  # reshape(-1,1)转换为1列，reshape(1,-1)是1行
    #predict(X)：返回值：+1 表示正常样本， -1表示异常样本。
    y_pred = IForest.predict(train["tradeMoney"].values.reshape(-1,1)) 
    drop_index = train.loc[y_pred==-1].index
    train.drop(drop_index,inplace=True)
    return train

data_train = IF_drop(data_train)

def dropData(train):
    # 丢弃部分异常值
    train = train[train.area <= 200]
    train = train[(train.tradeMoney <=16000) & (train.tradeMoney >=700)]
    train.drop(train[(train['totalFloor'] == 0)].index, inplace=True)
    return train  
#数据集异常值处理
data_train = dropData(data_train)

#深度清洗
def cleanData(data):
    data.drop(data[(data['region']=='RG00001') & (data['tradeMoney']<1000)&(data['area']>50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['tradeMoney']>25000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['area']>250)&(data['tradeMoney']<20000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['area']>400)&(data['tradeMoney']>50000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00001') & (data['area']>100)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00002') & (data['area']<100)&(data['tradeMoney']>60000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['area']<300)&(data['tradeMoney']>30000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']<500)&(data['area']<50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']<1500)&(data['area']>100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']<2000)&(data['area']>300)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['tradeMoney']>5000)&(data['area']<20)].index,inplace=True)
    data.drop(data[(data['region']=='RG00003') & (data['area']>600)&(data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00004') & (data['tradeMoney']<1000)&(data['area']>80)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']<200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']<2000)&(data['area']>180)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']>50000)&(data['area']<200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['area']>200)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00007') & (data['area']>100)&(data['tradeMoney']<2500)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['area']>200)&(data['tradeMoney']>25000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['area']>400)&(data['tradeMoney']<15000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['tradeMoney']<3000)&(data['area']>200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['tradeMoney']>7000)&(data['area']<75)].index,inplace=True)
    data.drop(data[(data['region']=='RG00010') & (data['tradeMoney']>12500)&(data['area']<100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00004') & (data['area']>400)&(data['tradeMoney']>20000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008') & (data['tradeMoney']<2000)&(data['area']>80)].index,inplace=True)
    data.drop(data[(data['region']=='RG00009') & (data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00009') & (data['area']>300)].index,inplace=True)
    data.drop(data[(data['region']=='RG00009') & (data['area']>100)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00011') & (data['tradeMoney']<10000)&(data['area']>390)].index,inplace=True)
    data.drop(data[(data['region']=='RG00012') & (data['area']>120)&(data['tradeMoney']<5000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00013') & (data['area']<100)&(data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00013') & (data['area']>400)&(data['tradeMoney']>50000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00013') & (data['area']>80)&(data['tradeMoney']<2000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['area']>300)&(data['tradeMoney']>40000)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<1300)&(data['area']>80)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<8000)&(data['area']>200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<1000)&(data['area']>20)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']>25000)&(data['area']>200)].index,inplace=True)
    data.drop(data[(data['region']=='RG00014') & (data['tradeMoney']<20000)&(data['area']>250)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']>30000)&(data['area']<100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']<50000)&(data['area']>600)].index,inplace=True)
    data.drop(data[(data['region']=='RG00005') & (data['tradeMoney']>50000)&(data['area']>350)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']>4000)&(data['area']<100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['tradeMoney']<600)&(data['area']>100)].index,inplace=True)
    data.drop(data[(data['region']=='RG00006') & (data['area']>165)].index,inplace=True)
    data.drop(data[(data['region']=='RG00012') & (data['tradeMoney']<800)&(data['area']<30)].index,inplace=True)
    data.drop(data[(data['region']=='RG00007') & (data['tradeMoney']<1100)&(data['area']>50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00004') & (data['tradeMoney']>8000)&(data['area']<80)].index,inplace=True)
    data.loc[(data['region']=='RG00002')&(data['area']>50)&(data['rentType']=='合租'),'rentType']='整租'
    data.loc[(data['region']=='RG00014')&(data['rentType']=='合租')&(data['area']>60),'rentType']='整租'
    data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']>15000)&(data['area']<110)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']>20000)&(data['area']>110)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008')&(data['tradeMoney']<1500)&(data['area']<50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00008')&(data['rentType']=='合租')&(data['area']>50)].index,inplace=True)
    data.drop(data[(data['region']=='RG00015') ].index,inplace=True)
    data.reset_index(drop=True, inplace=True)
    return data

data_train = cleanData(data_train)

2 特征工程

train = data_train
test = data_test
target_train = train.pop('tradeMoney')
target_test = test.pop('tradeMoney')

2.1 特征合并

"""
特征合并
"""
def newfeature(data):

    # 将houseType转为'Room'，'Hall'，'Bath'
    def Room(x):
        Room = int(x.split('室')[0])
        return Room
    def Hall(x):
        Hall = int(x.split("室")[1].split("厅")[0])
        return Hall
    def Bath(x):
        Bath = int(x.split("室")[1].split("厅")[1].split("卫")[0])
        return Bath

    data['Room'] = data['houseType'].apply(lambda x: Room(x))
    data['Hall'] = data['houseType'].apply(lambda x: Hall(x))
    data['Bath'] = data['houseType'].apply(lambda x: Bath(x))
    data['Room_Bath'] = (data['Bath']+1) / (data['Room']+1)
    # 填充租房类型
    data.loc[(data['rentType'] == '未知方式') & (data['Room'] <= 1), 'rentType'] = '整租'
    # print(data.loc[(data['rentType']=='未知方式')&(data['Room_Bath']>1),'rentType'])
    data.loc[(data['rentType'] == '未知方式') & (data['Room_Bath'] > 1), 'rentType'] = '合租'
    data.loc[(data['rentType'] == '未知方式') & (data['Room'] > 1) & (data['area'] < 50), 'rentType'] = '合租'
    data.loc[(data['rentType'] == '未知方式') & (data['area'] / data['Room'] < 20), 'rentType'] = '合租'
    # data.loc[(data['rentType']=='未知方式')&(data['area']>60),'rentType']='合租'
    data.loc[(data['rentType'] == '未知方式') & (data['area'] <= 50) & (data['Room'] == 2), 'rentType'] = '合租'
    data.loc[(data['rentType'] == '未知方式') & (data['area'] > 60) & (data['Room'] == 2), 'rentType'] = '整租'
    data.loc[(data['rentType'] == '未知方式') & (data['area'] <= 60) & (data['Room'] == 3), 'rentType'] = '合租'
    data.loc[(data['rentType'] == '未知方式') & (data['area'] > 60) & (data['Room'] == 3), 'rentType'] = '整租'
    data.loc[(data['rentType'] == '未知方式') & (data['area'] >= 100) & (data['Room'] > 3), 'rentType'] = '整租'

    # data.drop('Room_Bath', axis=1, inplace=True)
    # 提升0.0001
    def month(x):
        month = int(x.split('/')[1])
        return month
    # def day(x):
    #     day = int(x.split('/')[2])
    #     return day
    # 结果变差

    # 分割交易时间
    # data['year']=data['tradeTime'].apply(lambda x:year(x))
    data['month'] = data['tradeTime'].apply(lambda x: month(x))
    # data['day'] = data['tradeTime'].apply(lambda x: day(x))# 结果变差
    #     data['pv/uv'] = data['pv'] / data['uv']
    #     data['房间总数'] = data['室'] + data['厅'] + data['卫']

    # 合并部分配套设施特征
    data['trainsportNum'] = 5 * data['subwayStationNum'] / data['subwayStationNum'].mean() + data['busStationNum'] / \
                                                                                             data[
                                                                                                 'busStationNum'].mean()
    data['all_SchoolNum'] = 2 * data['interSchoolNum'] / data['interSchoolNum'].mean() + data['schoolNum'] / data[
        'schoolNum'].mean() \
                            + data['privateSchoolNum'] / data['privateSchoolNum'].mean()
    data['all_hospitalNum'] = 2 * data['hospitalNum'] / data['hospitalNum'].mean() + \
                              data['drugStoreNum'] / data['drugStoreNum'].mean()
    data['all_mall'] = data['mallNum'] / data['mallNum'].mean() + \
                       data['superMarketNum'] / data['superMarketNum'].mean()
    data['otherNum'] = data['gymNum'] / data['gymNum'].mean() + data['bankNum'] / data['bankNum'].mean() + \
                       data['shopNum'] / data['shopNum'].mean() + 2 * data['parkNum'] / data['parkNum'].mean()

    data.drop(['subwayStationNum', 'busStationNum',
               'interSchoolNum', 'schoolNum', 'privateSchoolNum',
               'hospitalNum', 'drugStoreNum', 'mallNum', 'superMarketNum', 'gymNum', 'bankNum', 'shopNum', 'parkNum'],
              axis=1, inplace=True)
    # 提升0.0005
    
#     data['houseType_1sumcsu']=data['Bath'].map(lambda x:str(x))+data['month'].map(lambda x:str(x))
#     data['houseType_2sumcsu']=data['Bath'].map(lambda x:str(x))+data['communityName']
#     data['houseType_3sumcsu']=data['Bath'].map(lambda x:str(x))+data['plate']
    
    data.drop('houseType', axis=1, inplace=True)
    data.drop('tradeTime', axis=1, inplace=True)
    
    data["area"] = data["area"].astype(int)


    # categorical_feats = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName','region', 'plate']
    categorical_feats = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration',  'region', 'plate','cluster']

    return data#, categorical_feats

train = newfeature(train)
test = newfeature(test)

2.2 计算统计特征

"""
计算统计特征
"""
def featureCount(train,test):
    train['data_type'] = 0
    test['data_type'] = 1
    data = pd.concat([train, test], axis=0, join='outer')
    def feature_count(data, features=[]):
        new_feature = 'count'
        for i in features:
            new_feature += '_' + i
        temp = data.groupby(features).size().reset_index().rename(columns={0: new_feature})
        data = data.merge(temp, 'left', on=features)
        return data

    data = feature_count(data, ['communityName'])
    data = feature_count(data, ['buildYear'])
    data = feature_count(data, ['totalFloor'])
    data = feature_count(data, ['communityName', 'totalFloor'])
    data = feature_count(data, ['communityName', 'newWorkers'])
    data = feature_count(data, ['communityName', 'totalTradeMoney'])
    new_train = data[data['data_type'] == 0]
    new_test = data[data['data_type'] == 1]
    new_train.drop('data_type', axis=1, inplace=True)
    new_test.drop(['data_type'], axis=1, inplace=True)
    return new_train, new_test
    
train, test = featureCount(train, test)

2.3 groupby生成统计特征：mean,std等

"""
groupby生成统计特征：mean,std等
"""
def gourpby(train,test):
    train['data_type'] = 0
    test['data_type'] = 1
    data = pd.concat([train, test], axis=0, join='outer')
    columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName', 'region', 'plate']
    for feature in columns:
        data[feature] = LabelEncoder().fit_transform(data[feature])
    
    # 将buildYear列转换为整型数据，mode()函数提取众数
    buildYearmean = pd.DataFrame(data[data['buildYear'] != '暂无信息']['buildYear'].mode())
    data.loc[data[data['buildYear'] == '暂无信息'].index, 'buildYear'] = buildYearmean.iloc[0, 0]
    data['buildYear'] = data['buildYear'].astype('int')  #转换为int类型
    
    temp = data.groupby('communityName')['area'].agg({'com_area_mean': 'mean', 'com_area_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='communityName', how='left')
    
    data['price_per_area'] = data.tradeMeanPrice / data.area * 100
    temp = data.groupby('communityName')['price_per_area'].agg(
        {'comm_price_mean': 'mean', 'comm_price_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='communityName', how='left')
   
    temp = data.groupby('plate')['price_per_area'].agg(
        {'plate_price_mean': 'mean', 'plate_price_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='plate', how='left')
    data.drop('price_per_area', axis=1, inplace=True)

    temp = data.groupby('plate')['area'].agg({'plate_area_mean': 'mean', 'plate_area_std': 'std'})
    temp.fillna(0, inplace=True)
    data = data.merge(temp, on='plate', how='left')
    
    temp = data.groupby(['plate'])['buildYear'].agg({'plate_year_mean': 'mean', 'plate_year_std': 'std'})
    data = data.merge(temp, on='plate', how='left')
    data.plate_year_mean = data.plate_year_mean.astype('int')
    data['comm_plate_year_diff'] = data.buildYear - data.plate_year_mean
    data.drop('plate_year_mean', axis=1, inplace=True)

    temp = data.groupby('plate')['trainsportNum'].agg('sum').reset_index(name='plate_trainsportNum')
    data = data.merge(temp, on='plate', how='left')
    temp = data.groupby(['communityName', 'plate'])['trainsportNum'].agg('sum').reset_index(name='com_trainsportNum')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')
    data['trainsportNum_ratio'] = list(map(lambda x, y: round(x / y, 3) if y != 0 else -1,
                                           data['com_trainsportNum'], data['plate_trainsportNum']))
    data = data.drop(['com_trainsportNum', 'plate_trainsportNum'], axis=1)

    temp = data.groupby('plate')['all_SchoolNum'].agg('sum').reset_index(name='plate_all_SchoolNum')
    data = data.merge(temp, on='plate', how='left')
    temp = data.groupby(['communityName', 'plate'])['all_SchoolNum'].agg('sum').reset_index(name='com_all_SchoolNum')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')
    data = data.drop(['com_all_SchoolNum', 'plate_all_SchoolNum'], axis=1)

    temp = data.groupby(['communityName', 'plate'])['all_mall'].agg('sum').reset_index(name='com_all_mall')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')

    temp = data.groupby('plate')['otherNum'].agg('sum').reset_index(name='plate_otherNum')
    data = data.merge(temp, on='plate', how='left')
    temp = data.groupby(['communityName', 'plate'])['otherNum'].agg('sum').reset_index(name='com_otherNum')
    data = data.merge(temp, on=['communityName', 'plate'], how='left')
    data['other_ratio'] = list(map(lambda x, y: round(x / y, 3) if y != 0 else -1,
                                   data['com_otherNum'], data['plate_otherNum']))
    data = data.drop(['com_otherNum', 'plate_otherNum'], axis=1)

    temp = data.groupby(['month', 'communityName']).size().reset_index(name='communityName_saleNum')
    data = data.merge(temp, on=['month', 'communityName'], how='left')
    temp = data.groupby(['month', 'plate']).size().reset_index(name='plate_saleNum')
    data = data.merge(temp, on=['month', 'plate'], how='left')

    data['sale_ratio'] = round((data.communityName_saleNum + 1) / (data.plate_saleNum + 1), 3)
    data['sale_newworker_differ'] = 3 * data.plate_saleNum - data.newWorkers
    data.drop(['communityName_saleNum', 'plate_saleNum'], axis=1, inplace=True)

    new_train = data[data['data_type'] == 0]
    new_test = data[data['data_type'] == 1]
    new_train.drop('data_type', axis=1, inplace=True)
    new_test.drop(['data_type'], axis=1, inplace=True)
    return new_train, new_test

train, test = gourpby(train, test)

2.4 聚类

"""
聚类
"""
def cluster(train,test):
    from sklearn.mixture import GaussianMixture

    train['data_type'] = 0
    test['data_type'] = 1
    data = pd.concat([train, test], axis=0, join='outer')
    col = ['totalFloor',
           'houseDecoration', 'communityName', 'region', 'plate', 'buildYear',

           'tradeMeanPrice', 'tradeSecNum', 'totalNewTradeMoney',
           'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum', 'remainNewNum',

           'landTotalPrice', 'landMeanPrice', 'totalWorkers',
           'newWorkers', 'residentPopulation', 'lookNum',
           'trainsportNum',
           'all_SchoolNum', 'all_hospitalNum', 'all_mall', 'otherNum']

    # EM
    gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
    data['cluster']= pd.DataFrame(gmm.fit(data[col]).predict(data[col]))


    col1 = ['totalFloor','houseDecoration', 'communityName', 'region', 'plate', 'buildYear']
    col2 = ['tradeMeanPrice', 'tradeSecNum', 'totalNewTradeMoney',
            'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum', 'remainNewNum',
            'landTotalPrice', 'landMeanPrice', 'totalWorkers',
            'newWorkers', 'residentPopulation', 'lookNum',
            'trainsportNum',
            'all_SchoolNum', 'all_hospitalNum', 'all_mall', 'otherNum']
    for feature1 in col1:
        for feature2 in col2:
        
            temp = data.groupby(['cluster',feature1])[feature2].agg('mean').reset_index(name=feature2+'_'+feature1+'_cluster_mean')
            temp.fillna(0, inplace=True)
       
            data = data.merge(temp, on=['cluster', feature1], how='left')
    
   
    new_train = data[data['data_type'] == 0]
    new_test = data[data['data_type'] == 1]
    new_train.drop('data_type', axis=1, inplace=True)
    new_test.drop(['data_type'], axis=1, inplace=True)
    
    return new_train, new_test

train, test = cluster(train, test)

2.5 log平滑

"""
log平滑
"""
import numpy as np

# 过大量级值取log平滑（针对线性模型有效）
big_num_cols = ['totalTradeMoney','totalTradeArea','tradeMeanPrice','totalNewTradeMoney', 'totalNewTradeArea',
                'tradeNewMeanPrice','remainNewNum', 'supplyNewNum', 'supplyLandArea',
                'tradeLandArea','landTotalPrice','landMeanPrice','totalWorkers','newWorkers',
                'residentPopulation','pv','uv']
for col in big_num_cols:
        train[col] = train[col].map(lambda x: np.log1p(x))
        test[col] = test[col].map(lambda x: np.log1p(x))

2.6 对比特征工程前后线性模型结果情况

train=train.fillna(0)
test=test.fillna(0)
test.drop('city', axis=1, inplace=True)
train.drop('city', axis=1, inplace=True)
# Lasso回归
from sklearn.linear_model import Lasso
lasso=Lasso(alpha=0.1)
lasso.fit(train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(train)
y_pred_test=lasso.predict(test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

结果：

训练集结果： 0.7360740903782804
测试集结果： 0.8267663520602939

3 特征选择

3.1 相关系数法特征选择

#相关系数法特征选择
from sklearn.feature_selection import SelectKBest

print(train.shape)

sk=SelectKBest(k=150)
new_train=sk.fit_transform(train,target_train)
print(new_train.shape)

# 获取对应列索引
select_columns=sk.get_support(indices = True)
# print(select_columns)

# 获取对应列名
# print(test.columns[select_columns])
select_columns_name=test.columns[select_columns]
new_test=test[select_columns_name]
print(new_test.shape)
# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

结果：

(40134, 174)
(40134, 150)
(2469, 150)
训练集结果： 0.7239005208289223
测试集结果： 0.8103133874714904

3.2 Wrapper

# Wrapper

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=160)
rfe.fit(train,target_train)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=40, step=1, verbose=0)

select_columns = [f for f, s in zip(train.columns, rfe.support_) if s]
print(select_columns)
new_train = train[select_columns]
new_test = test[select_columns]

# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

结果：

['Bath', 'Hall', 'ID', 'Room', 'Room_Bath', 'all_SchoolNum', 'all_hospitalNum', 'all_mall', 'area', 'buildYear', 'communityName', 'houseDecoration', 'houseFloor', 'houseToward', 'landMeanPrice', 'landTotalPrice', 'lookNum', 'month', 'newWorkers', 'otherNum', 'plate', 'pv', 'region', 'remainNewNum', 'rentType', 'residentPopulation', 'saleSecHouseNum', 'supplyLandArea', 'supplyLandNum', 'supplyNewNum', 'totalFloor', 'totalNewTradeArea', 'totalNewTradeMoney', 'totalTradeArea', 'totalTradeMoney', 'totalWorkers', 'tradeLandArea', 'tradeLandNum', 'tradeMeanPrice', 'tradeNewMeanPrice', 'tradeNewNum', 'tradeSecNum', 'trainsportNum', 'uv', 'count_communityName', 'count_buildYear', 'count_totalFloor', 'count_communityName_totalFloor', 'count_communityName_newWorkers', 'count_communityName_totalTradeMoney', 'com_area_mean', 'com_area_std', 'comm_price_mean', 'comm_price_std', 'plate_price_mean', 'plate_price_std', 'plate_area_mean', 'plate_area_std', 'plate_year_std', 'comm_plate_year_diff', 'trainsportNum_ratio', 'com_all_mall', 'other_ratio', 'sale_ratio', 'sale_newworker_differ', 'cluster', 'tradeMeanPrice_totalFloor_cluster_mean', 'tradeSecNum_totalFloor_cluster_mean', 'totalNewTradeArea_totalFloor_cluster_mean', 'tradeNewMeanPrice_totalFloor_cluster_mean', 'tradeNewNum_totalFloor_cluster_mean', 'remainNewNum_totalFloor_cluster_mean', 'landMeanPrice_totalFloor_cluster_mean', 'totalWorkers_totalFloor_cluster_mean', 'newWorkers_totalFloor_cluster_mean', 'residentPopulation_totalFloor_cluster_mean', 'lookNum_totalFloor_cluster_mean', 'trainsportNum_totalFloor_cluster_mean', 'all_SchoolNum_totalFloor_cluster_mean', 'all_hospitalNum_totalFloor_cluster_mean', 'all_mall_totalFloor_cluster_mean', 'otherNum_totalFloor_cluster_mean', 'tradeMeanPrice_houseDecoration_cluster_mean', 'tradeSecNum_houseDecoration_cluster_mean', 'totalNewTradeArea_houseDecoration_cluster_mean', 'tradeNewMeanPrice_houseDecoration_cluster_mean', 'tradeNewNum_houseDecoration_cluster_mean', 'remainNewNum_houseDecoration_cluster_mean', 'landMeanPrice_houseDecoration_cluster_mean', 'totalWorkers_houseDecoration_cluster_mean', 'newWorkers_houseDecoration_cluster_mean', 'residentPopulation_houseDecoration_cluster_mean', 'lookNum_houseDecoration_cluster_mean', 'trainsportNum_houseDecoration_cluster_mean', 'all_SchoolNum_houseDecoration_cluster_mean', 'all_hospitalNum_houseDecoration_cluster_mean', 'all_mall_houseDecoration_cluster_mean', 'otherNum_houseDecoration_cluster_mean', 'tradeMeanPrice_communityName_cluster_mean', 'tradeSecNum_communityName_cluster_mean', 'totalNewTradeArea_communityName_cluster_mean', 'tradeNewMeanPrice_communityName_cluster_mean', 'tradeNewNum_communityName_cluster_mean', 'remainNewNum_communityName_cluster_mean', 'landMeanPrice_communityName_cluster_mean', 'totalWorkers_communityName_cluster_mean', 'residentPopulation_communityName_cluster_mean', 'lookNum_communityName_cluster_mean', 'trainsportNum_communityName_cluster_mean', 'all_SchoolNum_communityName_cluster_mean', 'all_hospitalNum_communityName_cluster_mean', 'all_mall_communityName_cluster_mean', 'otherNum_communityName_cluster_mean', 'tradeMeanPrice_region_cluster_mean', 'tradeSecNum_region_cluster_mean', 'totalNewTradeArea_region_cluster_mean', 'tradeNewMeanPrice_region_cluster_mean', 'tradeNewNum_region_cluster_mean', 'remainNewNum_region_cluster_mean', 'landMeanPrice_region_cluster_mean', 'totalWorkers_region_cluster_mean', 'newWorkers_region_cluster_mean', 'residentPopulation_region_cluster_mean', 'lookNum_region_cluster_mean', 'trainsportNum_region_cluster_mean', 'all_SchoolNum_region_cluster_mean', 'all_hospitalNum_region_cluster_mean', 'all_mall_region_cluster_mean', 'otherNum_region_cluster_mean', 'tradeMeanPrice_plate_cluster_mean', 'tradeSecNum_plate_cluster_mean', 'totalNewTradeArea_plate_cluster_mean', 'tradeNewMeanPrice_plate_cluster_mean', 'tradeNewNum_plate_cluster_mean', 'remainNewNum_plate_cluster_mean', 'landMeanPrice_plate_cluster_mean', 'totalWorkers_plate_cluster_mean', 'newWorkers_plate_cluster_mean', 'residentPopulation_plate_cluster_mean', 'lookNum_plate_cluster_mean', 'trainsportNum_plate_cluster_mean', 'all_SchoolNum_plate_cluster_mean', 'all_hospitalNum_plate_cluster_mean', 'all_mall_plate_cluster_mean', 'otherNum_plate_cluster_mean', 'tradeMeanPrice_buildYear_cluster_mean', 'tradeSecNum_buildYear_cluster_mean', 'totalNewTradeArea_buildYear_cluster_mean', 'tradeNewMeanPrice_buildYear_cluster_mean', 'tradeNewNum_buildYear_cluster_mean', 'remainNewNum_buildYear_cluster_mean', 'landMeanPrice_buildYear_cluster_mean', 'newWorkers_buildYear_cluster_mean', 'residentPopulation_buildYear_cluster_mean', 'lookNum_buildYear_cluster_mean', 'trainsportNum_buildYear_cluster_mean', 'all_SchoolNum_buildYear_cluster_mean', 'all_hospitalNum_buildYear_cluster_mean', 'all_mall_buildYear_cluster_mean', 'otherNum_buildYear_cluster_mean']
训练集结果： 0.7337742338727071
测试集结果： 0.8241519576433839

3.3 Embedded

3.3.1 基于惩罚项的特征选择法

3.3.2 Lasso(l1)和Ridge(l2)

from sklearn.linear_model import Ridge

ridge = Ridge(alpha=5)
ridge.fit(train,target_train)

Ridge(alpha=5, copy_X=True, fit_intercept=True, max_iter=None, normalize=False,
      random_state=None, solver='auto', tol=0.001)

# 特征系数排序
coefSort = ridge.coef_.argsort()
print(coefSort)


# 特征系数
featureCoefSore=ridge.coef_[coefSort]
print(featureCoefSore)

select_columns = [f for f, s in zip(train.columns, featureCoefSore) if abs(s)> 0.0000005 ] 
# 选择绝对值大于0.0000005的特征

new_train = train[select_columns]
new_test = test[select_columns]
# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

结果：

[  4  63 151 136 118 115  38 117   5  81 173 119  78 133  21 132 150  62
  37  60 116  31 169 137 134   3  39  18   9 170  27 161  12 125  89  13
  23  56 121  11  14  71  34  26  90  44  97 126  94  41 107 144  99  87
  10 101 141  84 100  72  74  54 146 164 110  66 162 124  95  45  46  77
 166  52 149 113 148  64 111 147 106  53 165 122  91 163 158  68 127 104
 145   2 109  73 140  86  75 167 112  70 160 129 131  76 142 156 120 102
  55 105  93  49  69 138  67 130  88 128 159  48 123  40  92  96  65 157
  61 108 103  98  16  29 139  50  58  47  15  51  36 114  35 143  79  17
  85  30 168   8  59  57  32 153   6  20  83  80  33  28  19  43  24 155
   1  25 171   7 172 135 154 152  22  82   0  42]
[-6.48343154e+02 -6.43386809e+02 -5.86478794e+02 -4.40904393e+02
 -3.74805643e+02 -3.70796697e+02 -3.58726696e+02 -3.50009215e+02
 -3.03955333e+02 -2.93990684e+02 -2.80475875e+02 -2.80280143e+02
 -2.49345719e+02 -2.48621943e+02 -1.46655049e+02 -1.39662683e+02
 -1.28958930e+02 -1.17906802e+02 -1.02292358e+02 -9.84447458e+01
 -9.40963354e+01 -8.56294072e+01 -7.94868461e+01 -5.82133416e+01
 -5.52345542e+01 -5.21234053e+01 -4.77384036e+01 -3.95396396e+01
 -3.87146629e+01 -2.80988522e+01 -2.77664125e+01 -1.76457162e+01
 -1.31868976e+01 -1.27922726e+01 -1.20039309e+01 -1.12728676e+01
 -1.10588818e+01 -8.28208449e+00 -6.85243790e+00 -6.65123011e+00
 -5.30336374e+00 -4.92213421e+00 -4.41740868e+00 -3.01940888e+00
 -1.96959729e+00 -1.91931131e+00 -1.55742486e+00 -1.51455886e+00
 -1.32077604e+00 -1.16329747e+00 -1.09188881e+00 -8.52059044e-01
 -6.89624813e-01 -6.08365849e-01 -5.05241812e-01 -4.03049355e-01
 -2.52147356e-01 -1.73444016e-01 -1.67481439e-01 -7.52770516e-02
 -3.79671175e-02 -3.13661948e-02 -1.97448471e-02 -1.57121312e-02
 -1.48623557e-02 -1.23857775e-02 -1.22137219e-02 -1.08270677e-02
 -7.43043523e-03 -7.40427199e-03 -6.62598909e-03 -5.02353788e-03
 -4.87744879e-03 -3.42374157e-03 -3.42220589e-03 -3.36500779e-03
 -3.35415758e-03 -2.54265531e-03 -1.88348786e-03 -1.85492221e-03
 -5.83340298e-04 -5.66290557e-04 -9.15228283e-05 -1.98365723e-06
 -1.62124640e-06 -5.94419617e-07 -4.91145646e-07 -3.83441382e-07
 -3.30139349e-07 -2.92338200e-07 -2.08679113e-07  0.00000000e+00
  2.63140694e-07  4.80388648e-07  1.29260707e-06  2.34285655e-05
  7.27390846e-04  1.22979304e-03  2.38977891e-03  3.03774676e-03
  3.69073134e-03  4.90001355e-03  6.38544920e-03  7.23180674e-03
  1.21545772e-02  1.40642039e-02  1.67981423e-02  2.02902475e-02
  2.09275740e-02  2.70220287e-02  4.37831544e-02  4.57963569e-02
  5.23670019e-02  7.23187501e-02  7.93575695e-02  8.25057819e-02
  9.65967232e-02  1.47400786e-01  2.01784094e-01  2.29938621e-01
  2.31249283e-01  2.50619500e-01  2.52628227e-01  3.37755439e-01
  3.42010056e-01  4.19841592e-01  7.49289951e-01  7.83037573e-01
  1.28879559e+00  1.97537483e+00  2.14279362e+00  2.15400614e+00
  3.16311850e+00  3.17851198e+00  3.54129580e+00  4.38345495e+00
  4.53613697e+00  5.44565950e+00  5.87781978e+00  8.03668179e+00
  1.58250372e+01  1.66510304e+01  1.86480602e+01  2.42372556e+01
  3.01757973e+01  3.35529440e+01  3.51419478e+01  3.70521353e+01
  4.25485791e+01  5.35143372e+01  5.57407374e+01  7.08396767e+01
  8.51228810e+01  9.91591948e+01  1.01170717e+02  1.09909614e+02
  1.24397083e+02  1.29325288e+02  1.35207104e+02  1.45531428e+02
  1.51557158e+02  2.15557537e+02  2.61779178e+02  3.06492704e+02
  3.28281181e+02  3.44420365e+02  4.19593500e+02  4.24285294e+02
  4.72016405e+02  4.72385762e+02  5.57231782e+02  6.95200991e+02
  7.02750797e+02  1.09398478e+03]
训练集结果： 0.7359208614361752
测试集结果： 0.8267792618327826

3.3.3 基于树模型的特征选择法

3.3.4 随机森林平均不纯度减少（mean decrease impurity）

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
# 训练随机森林模型，并通过feature_importances_属性获取每个特征的重要性分数。rf = RandomForestRegressor()
rf.fit(train,target_train)
print("Features sorted by their score:")
print(sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), train.columns),
             reverse=True))

select_columns = [f for f, s in zip(train.columns, rf.feature_importances_) if abs(s)> 0.00005 ] 
# 选择绝对值大于0.00005的特征

new_train = train[select_columns]
new_test = test[select_columns]

# Lasso回归
from sklearn.linear_model import Lasso

lasso=Lasso(alpha=0.1)
lasso.fit(new_train,target_train)
#预测测试集和训练集结果
y_pred_train=lasso.predict(new_train)

y_pred_test=lasso.predict(new_test)

#对比结果
from sklearn.metrics import r2_score
score_train=r2_score(y_pred_train,target_train)
print("训练集结果：",score_train)
score_test=r2_score(y_pred_test, target_test)
print("测试集结果：",score_test)

结果：

Features sorted by their score:
[(0.4402, 'area'), (0.1756, 'tradeMeanPrice_plate_cluster_mean'), (0.0466, 'tradeMeanPrice_communityName_cluster_mean'), (0.0327, 'plate_area_mean'), (0.0321, 'com_area_mean'), (0.0265, 'plate_year_std'), (0.0179, 'plate_area_std'), (0.01, 'tradeNewMeanPrice_plate_cluster_mean'), (0.0094, 'comm_plate_year_diff'), (0.0084, 'totalFloor'), (0.008, 'tradeNewMeanPrice_communityName_cluster_mean'), (0.0074, 'comm_price_mean'), (0.0068, 'buildYear'), (0.0065, 'plate_price_mean'), (0.0056, 'remainNewNum_communityName_cluster_mean'), (0.0054, 'Room'), (0.0045, 'sale_ratio'), (0.0042, 'tradeSecNum_communityName_cluster_mean'), (0.0042, 'other_ratio'), (0.0035, 'com_all_mall'), (0.0034, 'count_communityName'), (0.0032, 'communityName'), (0.0031, 'trainsportNum_ratio'), (0.0031, 'com_area_std'), (0.003, 'trainsportNum_communityName_cluster_mean'), (0.0029, 'tradeNewMeanPrice'), (0.0029, 'Hall'), (0.0028, 'count_communityName_totalFloor'), (0.0026, 'tradeMeanPrice'), (0.0026, 'comm_price_std'), (0.0026, 'all_SchoolNum_communityName_cluster_mean'), (0.0023, 'tradeMeanPrice_buildYear_cluster_mean'), (0.0023, 'houseFloor'), (0.0022, 'count_totalFloor'), (0.0021, 'totalWorkers_buildYear_cluster_mean'), (0.0021, 'plate_price_std'), (0.002, 'totalWorkers'), (0.002, 'remainNewNum'), (0.0019, 'tradeSecNum_region_cluster_mean'), (0.0019, 'totalTradeArea'), (0.0018, 'residentPopulation_communityName_cluster_mean'), (0.0018, 'otherNum_communityName_cluster_mean'), (0.0018, 'count_buildYear'), (0.0018, 'all_SchoolNum_totalFloor_cluster_mean'), (0.0018, 'all_SchoolNum_buildYear_cluster_mean'), (0.0018, 'Bath'), (0.0017, 'residentPopulation'), (0.0017, 'houseToward'), (0.0017, 'count_communityName_totalTradeMoney'), (0.0017, 'all_mall_buildYear_cluster_mean'), (0.0016, 'tradeSecNum_totalFloor_cluster_mean'), (0.0016, 'all_mall_communityName_cluster_mean'), (0.0016, 'all_hospitalNum_communityName_cluster_mean'), (0.0016, 'Room_Bath'), (0.0015, 'uv'), (0.0015, 'tradeNewMeanPrice_buildYear_cluster_mean'), (0.0015, 'totalWorkers_totalFloor_cluster_mean'), (0.0015, 'totalNewTradeMoney_totalFloor_cluster_mean'), (0.0015, 'pv'), (0.0014, 'tradeSecNum'), (0.0014, 'tradeNewMeanPrice_totalFloor_cluster_mean'), (0.0014, 'totalWorkers_communityName_cluster_mean'), (0.0014, 'totalTradeMoney'), (0.0014, 'sale_newworker_differ'), (0.0013, 'tradeSecNum_buildYear_cluster_mean'), (0.0013, 'tradeNewNum_communityName_cluster_mean'), (0.0013, 'tradeMeanPrice_region_cluster_mean'), (0.0013, 'month'), (0.0013, 'count_communityName_newWorkers'), (0.0012, 'trainsportNum_plate_cluster_mean'), (0.0012, 'trainsportNum_buildYear_cluster_mean'), (0.0012, 'tradeMeanPrice_totalFloor_cluster_mean'), (0.0012, 'plate'), (0.0011, 'totalNewTradeArea_communityName_cluster_mean'), (0.0011, 'landTotalPrice_totalFloor_cluster_mean'), (0.0011, 'landMeanPrice_totalFloor_cluster_mean'), (0.0011, 'all_mall_totalFloor_cluster_mean'), (0.001, 'trainsportNum_totalFloor_cluster_mean'), (0.001, 'totalNewTradeMoney_communityName_cluster_mean'), (0.001, 'residentPopulation_totalFloor_cluster_mean'), (0.001, 'landTotalPrice_buildYear_cluster_mean'), (0.0009, 'totalNewTradeMoney_buildYear_cluster_mean'), (0.0009, 'totalNewTradeMoney'), (0.0009, 'totalNewTradeArea'), (0.0009, 'saleSecHouseNum'), (0.0009, 'remainNewNum_totalFloor_cluster_mean'), (0.0009, 'otherNum_totalFloor_cluster_mean'), (0.0009, 'otherNum_buildYear_cluster_mean'), (0.0009, 'all_hospitalNum_buildYear_cluster_mean'), (0.0009, 'all_SchoolNum_region_cluster_mean'), (0.0009, 'all_SchoolNum_plate_cluster_mean'), (0.0008, 'tradeNewNum_region_cluster_mean'), (0.0008, 'tradeNewNum_plate_cluster_mean'), (0.0008, 'totalWorkers_region_cluster_mean'), (0.0008, 'totalWorkers_plate_cluster_mean'), (0.0008, 'residentPopulation_plate_cluster_mean'), (0.0008, 'residentPopulation_buildYear_cluster_mean'), (0.0008, 'remainNewNum_region_cluster_mean'), (0.0008, 'remainNewNum_plate_cluster_mean'), (0.0008, 'landMeanPrice_buildYear_cluster_mean'), (0.0007, 'totalNewTradeMoney_plate_cluster_mean'), (0.0007, 'all_hospitalNum'), (0.0006, 'tradeNewNum'), (0.0006, 'totalWorkers_houseDecoration_cluster_mean'), (0.0006, 'totalNewTradeArea_totalFloor_cluster_mean'), (0.0006, 'otherNum_plate_cluster_mean'), (0.0006, 'all_mall_plate_cluster_mean'), (0.0006, 'all_hospitalNum_plate_cluster_mean'), (0.0005, 'tradeNewNum_totalFloor_cluster_mean'), (0.0005, 'tradeNewMeanPrice_region_cluster_mean'), (0.0005, 'supplyNewNum'), (0.0005, 'remainNewNum_buildYear_cluster_mean'), (0.0005, 'otherNum'), (0.0005, 'lookNum'), (0.0005, 'houseDecoration'), (0.0005, 'all_mall'), (0.0005, 'all_hospitalNum_totalFloor_cluster_mean'), (0.0004, 'tradeSecNum_plate_cluster_mean'), (0.0004, 'totalNewTradeArea_buildYear_cluster_mean'), (0.0004, 'residentPopulation_region_cluster_mean'), (0.0004, 'newWorkers_communityName_cluster_mean'), (0.0004, 'all_SchoolNum_houseDecoration_cluster_mean'), (0.0004, 'all_SchoolNum'), (0.0003, 'trainsportNum_region_cluster_mean'), (0.0003, 'trainsportNum'), (0.0003, 'tradeNewNum_buildYear_cluster_mean'), (0.0003, 'tradeNewMeanPrice_houseDecoration_cluster_mean'), (0.0003, 'tradeMeanPrice_houseDecoration_cluster_mean'), (0.0003, 'totalNewTradeMoney_houseDecoration_cluster_mean'), (0.0003, 'rentType'), (0.0003, 'landMeanPrice_communityName_cluster_mean'), (0.0003, 'all_mall_region_cluster_mean'), (0.0003, 'all_hospitalNum_region_cluster_mean'), (0.0002, 'trainsportNum_houseDecoration_cluster_mean'), (0.0002, 'tradeSecNum_houseDecoration_cluster_mean'), (0.0002, 'otherNum_region_cluster_mean'), (0.0002, 'newWorkers_totalFloor_cluster_mean'), (0.0002, 'newWorkers_buildYear_cluster_mean'), (0.0002, 'lookNum_totalFloor_cluster_mean'), (0.0002, 'lookNum_communityName_cluster_mean'), (0.0002, 'lookNum_buildYear_cluster_mean'), (0.0002, 'landTotalPrice_houseDecoration_cluster_mean'), (0.0002, 'landTotalPrice_communityName_cluster_mean'), (0.0002, 'landMeanPrice_region_cluster_mean'), (0.0002, 'landMeanPrice_houseDecoration_cluster_mean'), (0.0002, 'all_mall_houseDecoration_cluster_mean'), (0.0001, 'tradeNewNum_houseDecoration_cluster_mean'), (0.0001, 'totalNewTradeMoney_region_cluster_mean'), (0.0001, 'totalNewTradeArea_region_cluster_mean'), (0.0001, 'totalNewTradeArea_plate_cluster_mean'), (0.0001, 'totalNewTradeArea_houseDecoration_cluster_mean'), (0.0001, 'supplyLandNum'), (0.0001, 'supplyLandArea'), (0.0001, 'residentPopulation_houseDecoration_cluster_mean'), (0.0001, 'remainNewNum_houseDecoration_cluster_mean'), (0.0001, 'region'), (0.0001, 'otherNum_houseDecoration_cluster_mean'), (0.0001, 'newWorkers_plate_cluster_mean'), (0.0001, 'newWorkers'), (0.0001, 'lookNum_region_cluster_mean'), (0.0001, 'lookNum_plate_cluster_mean'), (0.0001, 'landTotalPrice_region_cluster_mean'), (0.0001, 'landMeanPrice_plate_cluster_mean'), (0.0001, 'all_hospitalNum_houseDecoration_cluster_mean'), (0.0, 'tradeLandNum'), (0.0, 'tradeLandArea'), (0.0, 'newWorkers_region_cluster_mean'), (0.0, 'newWorkers_houseDecoration_cluster_mean'), (0.0, 'lookNum_houseDecoration_cluster_mean'), (0.0, 'landTotalPrice_plate_cluster_mean'), (0.0, 'landTotalPrice'), (0.0, 'landMeanPrice'), (0.0, 'cluster'), (0.0, 'ID')]
训练集结果： 0.7357530365422338
测试集结果： 0.8267922424570837

总结：全靠大佬写的代码

一-叶知秋

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
数据竞赛模拟_房租预测（3 特征工程）

任务3 特征工程1 任务2的部分预处理2 特征工程2.1 特征合并2.2 计算统计特征2.3 groupby生成统计特征：mean,std等2.4 聚类2.5 log平滑2.6 对比特征工程前后线性模型结果情况3 特征选择3.1 相关系数法特征选择3.2 Wrapper3.3 Embedded3.3.1 基于惩罚项的特征选择法3.3.2 Lasso(l1)和Ridge(l2)3.3.3 基于树模型...
复制链接

扫一扫