特征工程
team-learning 数据竞赛(房租预测)
任务3 特征工程 & 特征选择(3天)
特征工程
# Core code examples
# Statistical features: aggregate the column(s) `fea` of `train` within each
# group defined by the key(s) `by`. Every statement rebinds `gp`, so each one
# is meant to be read (or copied) on its own.
grouped = train.groupby(by)[fea]
# Mean
gp = grouped.mean()
# Median
gp = grouped.median()
# Standard deviation (std() is the square root of the variance, not the variance)
gp = grouped.std()
# Maximum
gp = grouped.max()
# Minimum
gp = grouped.min()
# Number of rows in each group
gp = grouped.size()
# Statistical features generated with groupby: mean and std.
# Group the rows by communityName and compute the mean and the standard
# deviation of area inside each community.
# Named aggregation (keyword = output column name, value = aggregation) is used
# here because passing a renaming dict to a Series group-by's agg() is no
# longer accepted by current pandas releases.
temp = data_train.groupby('communityName')['area'].agg(com_area_mean='mean', com_area_std='std')
# How to read the line above: groupby('communityName') partitions the rows by
# the key given in the parentheses; ['area'] selects the column to aggregate;
# agg(...) then computes mean and std for every community.
# To apply a function along the rows or columns of a DataFrame use apply();
# axis=1 means the function is applied row by row.
特征合并
import pandas as pd
import warnings
warnings. filterwarnings ( 'ignore' )
from sklearn. preprocessing import LabelEncoder
# Training table: read it and detach the regression target (tradeMoney) so the
# remaining columns are features only.
train = pd.read_csv('./train_data.csv')
target_train = train.pop('tradeMoney')

# Test table: same treatment.
# NOTE(review): this raises KeyError if test_a.csv carries no tradeMoney
# column - confirm the test file actually contains the label.
test = pd.read_csv('./test_a.csv')
target_test = test.pop('tradeMoney')
def newfeature(data):
    """Derive model features from the raw rental table, modifying ``data`` in place.

    Steps, in order:

    1. Parse ``houseType`` (text shaped like ``<n>室<n>厅<n>卫``) into the integer
       columns ``Room``, ``Hall`` and ``Bath`` and add
       ``Room_Bath = (Bath + 1) / (Room + 1)``.
    2. Replace ``rentType == '未知方式'`` by '整租' or '合租' using an ordered list
       of room-count / area rules; rows matching no rule keep '未知方式'.
    3. Extract ``month`` from ``tradeTime`` (second '/'-separated field).
    4. Collapse the facility-count columns into five weighted sums of
       mean-normalised counts and drop the original count columns.
    5. Drop ``houseType`` and ``tradeTime`` and cast ``area`` to int.

    Args:
        data: pandas DataFrame containing every column read above.

    Returns:
        A tuple ``(data, categorical_feats)``: the same DataFrame object that
        was passed in, and the list of column names to treat as categorical.

    Raises:
        ValueError, IndexError: if a ``houseType`` or ``tradeTime`` value does
            not have the expected layout.
    """

    # --- houseType -> Room / Hall / Bath ---------------------------------
    def _rooms(text):
        return int(text.split('室')[0])

    def _halls(text):
        return int(text.split("室")[1].split("厅")[0])

    def _baths(text):
        return int(text.split("室")[1].split("厅")[1].split("卫")[0])

    data['Room'] = data['houseType'].apply(_rooms)
    data['Hall'] = data['houseType'].apply(_halls)
    data['Bath'] = data['houseType'].apply(_baths)
    # The +1 on both sides keeps the ratio finite when Room is 0.
    data['Room_Bath'] = (data['Bath'] + 1) / (data['Room'] + 1)

    # --- fill in unknown rent types --------------------------------------
    unknown = '未知方式'
    room = data['Room']
    area = data['area']
    # Order matters: a row is relabelled by the first rule it satisfies,
    # because afterwards it is no longer "unknown".
    rent_type_rules = [
        (room <= 1, '整租'),
        (data['Room_Bath'] > 1, '合租'),
        ((room > 1) & (area < 50), '合租'),
        (area / room < 20, '合租'),
        ((area <= 50) & (room == 2), '合租'),
        ((area > 60) & (room == 2), '整租'),
        ((area <= 60) & (room == 3), '合租'),
        ((area > 60) & (room == 3), '整租'),
        ((area >= 100) & (room > 3), '整租'),
    ]
    for condition, label in rent_type_rules:
        # The "still unknown" mask is re-evaluated for every rule on purpose.
        still_unknown = data['rentType'] == unknown
        data.loc[still_unknown & condition, 'rentType'] = label
    # Room_Bath is kept as a feature; the original code had a drop of it
    # commented out next to the author's note "improves by 0.0001".

    # --- trade month ------------------------------------------------------
    def _month(text):
        return int(text.split('/')[1])

    # Only the month is extracted; the original author noted that a
    # day-of-month feature made results worse.
    data['month'] = data['tradeTime'].apply(_month)

    # --- merge facility counts -------------------------------------------
    def _scaled(column, weight=1):
        # weight * count / mean(count), evaluated left to right.
        return weight * data[column] / data[column].mean()

    data['trainsportNum'] = _scaled('subwayStationNum', 5) + _scaled('busStationNum')
    data['all_SchoolNum'] = (
        _scaled('interSchoolNum', 2)
        + _scaled('schoolNum')
        + _scaled('privateSchoolNum')
    )
    data['all_hospitalNum'] = _scaled('hospitalNum', 2) + _scaled('drugStoreNum')
    data['all_mall'] = _scaled('mallNum') + _scaled('superMarketNum')
    data['otherNum'] = (
        _scaled('gymNum')
        + _scaled('bankNum')
        + _scaled('shopNum')
        + _scaled('parkNum', 2)
    )
    # The raw counts are now represented by the aggregates above
    # (author's note: this merge improved the score by 0.0005).
    data.drop(
        [
            'subwayStationNum', 'busStationNum',
            'interSchoolNum', 'schoolNum', 'privateSchoolNum',
            'hospitalNum', 'drugStoreNum', 'mallNum', 'superMarketNum',
            'gymNum', 'bankNum', 'shopNum', 'parkNum',
        ],
        axis=1,
        inplace=True,
    )

    # --- clean-up ---------------------------------------------------------
    data.drop('houseType', axis=1, inplace=True)
    data.drop('tradeTime', axis=1, inplace=True)
    data["area"] = data["area"].astype(int)

    # NOTE(review): 'cluster' is not created in this function - presumably it
    # is added elsewhere before these names are used; confirm. An earlier
    # variant of this list used 'communityName' instead of 'cluster'.
    categorical_feats = [
        'rentType', 'houseFloor', 'houseToward', 'houseDecoration',
        'region', 'plate', 'cluster',
    ]
    return data, categorical_feats
计算统计特征
# Count-based statistical features
def featureCount(train, test):
    """Add group-size ("count") features computed over train and test together.

    Both tables are stacked, and for each key combination listed below a
    column named ``count_<key1>[_<key2>...]`` is added holding the number of
    stacked rows that share that key combination. The stacked table is then
    split back into its train and test parts.

    Args:
        train: pandas DataFrame with the training rows.
        test: pandas DataFrame with the test rows.

    Returns:
        A tuple ``(new_train, new_test)`` of new DataFrames with the count
        columns added. The inputs are not modified.
    """

    def feature_count(data, features=()):
        """Return ``data`` plus a column counting rows per ``features`` group."""
        features = list(features)
        new_feature = '_'.join(['count'] + features)
        group_sizes = data.groupby(features).size().reset_index(name=new_feature)
        return data.merge(group_sizes, how='left', on=features)

    # Tag every row with its origin so the stacked table can be split again.
    # assign() works on copies, so the callers' frames keep their columns.
    data = pd.concat(
        [train.assign(data_type=0), test.assign(data_type=1)],
        axis=0,
        join='outer',
    )

    count_keys = [
        ['communityName'],
        ['buildYear'],
        ['totalFloor'],
        ['communityName', 'totalFloor'],
        ['communityName', 'newWorkers'],
        ['communityName', 'totalTradeMoney'],
    ]
    for keys in count_keys:
        data = feature_count(data, keys)

    # drop() without inplace returns fresh frames instead of mutating a
    # boolean-filtered slice of `data`.
    new_train = data[data['data_type'] == 0].drop(columns='data_type')
    new_test = data[data['data_type'] == 1].drop(columns='data_type')
    return new_train, new_test
# Add the count features and rebind train/test to the frames returned by featureCount.
train, test = featureCount ( train, test)