城市-房产租金预测之特征工程&特征选择

城市-房产租金预测之特征工程&特征选择

首先我们要对房间类型['houseType']进行拆分,将其中"室"、"厅"、"卫"的数量分别作为新的特征['Room']、['Hall']、['Path'],同时也需要从['tradeTime']中提取年、月、日。这些处理需要对train和test同时进行。

def newfeature(data):
    """Add date parts and room-layout counts to ``data`` in place.

    ``tradeTime`` is parsed with the ``%Y-%m-%d`` format and split into
    ``year``, ``day`` and ``month``.  ``houseType`` strings of the form
    ``'<n>室<n>厅<n>卫'`` are split into the integer columns ``Room``,
    ``Hall`` and ``Path`` (the count that precedes ``卫``), and
    ``Room_Path`` is set to ``(Path + 1) / (Room + 1)``.

    Returns the same DataFrame object that was passed in.
    """
    trade_dates = pd.to_datetime(data['tradeTime'], format='%Y-%m-%d')
    data['tradeTime'] = trade_dates
    for part in ('year', 'day', 'month'):
        data[part] = getattr(trade_dates.dt, part)

    def _layout_counts(layout):
        # Peel the string apart left to right: rooms, then halls, then the
        # number in front of '卫'.
        by_room = layout.split('室')
        n_room = int(by_room[0])
        by_hall = by_room[1].split('厅')
        n_hall = int(by_hall[0])
        n_toilet = int(by_hall[1].split('卫')[0])
        return n_room, n_hall, n_toilet

    counts = data['houseType'].map(_layout_counts)
    for position, column in enumerate(('Room', 'Hall', 'Path')):
        data[column] = counts.map(lambda triple, i=position: triple[i])

    # The +1 on both sides keeps the ratio finite when Room is 0.
    data['Room_Path'] = (data['Path'] + 1) / (data['Room'] + 1)
    return data
  
# Apply the same date / layout feature extraction to both data sets.
train  = newfeature(train)
test = newfeature(test)  
['rentType']中"未知方式"的取值太多,不宜采用众数或平均数填充,因此需要根据train和test中特征"area"、"Room"与"rentType"之间的关系进行填充。

```python
# Plot Room against area for the test set, one facet per rentType.
# The CJK-capable font must be configured before drawing, otherwise the
# Chinese rentType labels in the facet titles are rendered with the old font.
plt.rcParams['font.sans-serif'] = ['SimHei']
# sns.lmplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only opens an additional empty figure.
g = sns.lmplot(x="area", y="Room", hue="rentType", col="rentType", col_wrap=3,
               data=test, sharex=False, sharey=False)
plt.tight_layout()
plt.show()

```

在这里插入图片描述

# Plot Room against area for the training set, one facet per rentType.
# The CJK-capable font must be configured before drawing, otherwise the
# Chinese rentType labels in the facet titles are rendered with the old font.
plt.rcParams['font.sans-serif'] = ['SimHei']
# sns.lmplot is a figure-level function that creates its own figure, so a
# preceding plt.figure(figsize=...) only opens an additional empty figure.
g = sns.lmplot(x="area", y="Room", hue="rentType", col="rentType", col_wrap=3,
               data=train, sharex=False, sharey=False)
plt.tight_layout()
plt.show()

在这里插入图片描述

def clean_feature(data):
    """Replace the '未知方式' rentType with '整租' or '合租' using layout rules.

    The rules are applied in order, and each rule only touches rows whose
    ``rentType`` is still '未知方式' at that point, so earlier rules take
    precedence.  Rows matched by no rule keep '未知方式'.

    Reads the columns ``rentType``, ``Room``, ``Room_Path`` and ``area``;
    ``Room_Path`` is the ratio column produced by ``newfeature`` (the code
    previously asked for a non-existent ``Room_Bath`` column and raised
    ``KeyError``).

    The DataFrame is modified in place and also returned.
    """
    unknown_label = '未知方式'
    # (row predicate, rentType to assign), in priority order.
    rules = [
        (lambda d: d['Room'] <= 1, '整租'),
        (lambda d: d['Room_Path'] > 1, '合租'),
        (lambda d: (d['Room'] > 1) & (d['area'] < 50), '合租'),
        (lambda d: d['area'] / d['Room'] < 20, '合租'),
        (lambda d: (d['area'] <= 50) & (d['Room'] == 2), '合租'),
        (lambda d: (d['area'] > 60) & (d['Room'] == 2), '整租'),
        (lambda d: (d['area'] <= 60) & (d['Room'] == 3), '合租'),
        (lambda d: (d['area'] > 60) & (d['Room'] == 3), '整租'),
        (lambda d: (d['area'] >= 100) & (d['Room'] > 3), '整租'),
    ]
    for matches, rent_type in rules:
        # Recompute the mask every time: rows filled by an earlier rule must
        # not be overwritten by a later one.
        still_unknown = data['rentType'] == unknown_label
        data.loc[still_unknown & matches(data), 'rentType'] = rent_type

    return data
# Fill the unknown rentType values in every data split.
# NOTE(review): X_train_full / X_valid_full are not defined in the visible
# part of this file, while every other step operates on `train` and `test` —
# confirm which frames should be cleaned here.
X_train_full = clean_feature(X_train_full)
X_valid_full = clean_feature(X_valid_full)
test = clean_feature(test)

同时我们可以根据特征之间的关系来构造新的特征。

def new_features(data):
    """Collapse the per-facility count columns into weighted composite features.

    Every count column is divided by its own mean over ``data`` and the scaled
    columns are summed with fixed weights into ``trainsportNum``,
    ``all_SchoolNum``, ``all_hospitalNum``, ``all_mall`` and ``otherNum``.
    The source count columns plus ``houseType`` and ``tradeTime`` are then
    dropped and ``area`` is cast to ``int``.

    The bus-station term is now divided by the column mean like every other
    term; it was previously divided by the column itself, which yields a
    constant 1 (or NaN for zero counts).

    The DataFrame is modified in place and also returned.
    """
    def scaled(column, weight=1):
        # weight * x / mean(x); a column whose mean is 0 produces inf/NaN.
        return weight * data[column] / data[column].mean()

    data['trainsportNum'] = scaled('subwayStationNum', 5) + scaled('busStationNum')
    data['all_SchoolNum'] = (scaled('interSchoolNum', 2)
                             + scaled('schoolNum')
                             + scaled('privateSchoolNum'))
    data['all_hospitalNum'] = scaled('hospitalNum', 2) + scaled('drugStoreNum')
    data['all_mall'] = scaled('mallNum') + scaled('superMarketNum')
    data['otherNum'] = (scaled('gymNum') + scaled('bankNum')
                        + scaled('shopNum') + scaled('parkNum', 2))

    # The raw counts are replaced by the composites above; houseType and
    # tradeTime are dropped as well.
    data.drop(columns=['subwayStationNum', 'busStationNum',
                       'interSchoolNum', 'schoolNum', 'privateSchoolNum',
                       'hospitalNum', 'drugStoreNum', 'mallNum',
                       'superMarketNum', 'gymNum', 'bankNum', 'shopNum',
                       'parkNum', 'houseType', 'tradeTime'],
              inplace=True)
    data['area'] = data['area'].astype(int)
    return data
# Build the composite facility features on both data sets.
train= new_features(train)
test = new_features(test)

我们可以用groupby函数按小区(communityName)和板块(plate)分组,计算面积与每平米均价的均值和标准差,作为新的统计特征。

def gourpby(data):
    """Label-encode the categorical columns and add per-group statistics.

    Steps:
      1. Each of ``rentType``, ``houseFloor``, ``houseToward``,
         ``houseDecoration``, ``communityName``, ``region`` and ``plate`` is
         replaced in place by integer codes from a freshly fitted
         ``LabelEncoder``.  A new encoder is fitted on every call, so the
         codes are only comparable within the frame passed in.
      2. Mean and standard deviation of ``area`` per ``communityName`` are
         merged in as ``com_area_mean`` / ``com_area_std``.
      3. A temporary ``price_per_area`` (``tradeMeanPrice / area * 100``) is
         aggregated per ``communityName`` (``comm_price_*``) and per ``plate``
         (``plate_price_*``) and then dropped.
      4. Mean and standard deviation of ``area`` per ``plate`` are merged in
         as ``plate_area_mean`` / ``plate_area_std``.

    Missing statistics (e.g. the std of a single-row group) are stored as 0.

    Returns a new DataFrame produced by the merges; the encoded columns of the
    input frame are also overwritten in place.
    """
    def add_group_stats(frame, key, value, prefix):
        # Named aggregation: the dict-renaming form agg({'name': 'func'}) on a
        # single column is no longer accepted since pandas 1.0.
        stats = frame.groupby(key)[value].agg(
            **{prefix + '_mean': 'mean', prefix + '_std': 'std'})
        stats = stats.fillna(0)
        return frame.merge(stats, on=key, how='left')

    columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration',
               'communityName', 'region', 'plate']
    for feature in columns:
        data[feature] = LabelEncoder().fit_transform(data[feature])

    data = add_group_stats(data, 'communityName', 'area', 'com_area')

    data['price_per_area'] = data.tradeMeanPrice / data.area * 100
    data = add_group_stats(data, 'communityName', 'price_per_area', 'comm_price')
    data = add_group_stats(data, 'plate', 'price_per_area', 'plate_price')
    # price_per_area only exists to feed the two aggregations above.
    data = data.drop(columns='price_per_area')

    data = add_group_stats(data, 'plate', 'area', 'plate_area')

    return data
# Encode categoricals and attach the group statistics; each frame is
# processed independently of the other.
train = gourpby(train)
test = gourpby(test)   

我们使用聚类算法的高斯混合模型来发现数据集中的隐藏关系。

def cluster(train,test):
    """Cluster train and test jointly and add per-cluster mean features.

    The two frames are stacked, a 3-component ``GaussianMixture``
    (``covariance_type='full'``, ``random_state=0``) is fitted on the feature
    columns listed below, and the predicted component is stored in a new
    ``cluster`` column.  For every pair of a key feature and a value feature,
    the mean of the value feature within each ``(cluster, key)`` group is
    merged in as ``'<value>_<key>_cluster_mean'`` (missing means become 0).

    Args:
        train: training DataFrame.
        test: test DataFrame.

    Returns:
        ``(new_train, new_test)``: the enriched rows belonging to each input,
        in stacked order (train rows first).  The inputs are not modified.
        Columns present in only one input appear in both outputs (outer join)
        and are NaN where they were absent.
    """
    from sklearn.mixture import GaussianMixture

    key_features = ['totalFloor', 'houseDecoration', 'communityName',
                    'region', 'plate', 'buildYear']
    value_features = ['tradeMeanPrice', 'tradeSecNum', 'totalNewTradeMoney',
                      'totalNewTradeArea', 'tradeNewMeanPrice', 'tradeNewNum',
                      'remainNewNum', 'landTotalPrice', 'landMeanPrice',
                      'totalWorkers', 'newWorkers', 'residentPopulation',
                      'lookNum', 'trainsportNum', 'all_SchoolNum',
                      'all_hospitalNum', 'all_mall', 'otherNum']
    # The mixture model is fitted on the key and value features together.
    cluster_features = key_features + value_features

    # Tag the origin on copies so the caller's frames stay untouched, and
    # rebuild the index so every stacked row has a unique label.
    data = pd.concat([train.assign(data_type=0), test.assign(data_type=1)],
                     axis=0, join='outer', ignore_index=True)

    gmm = GaussianMixture(n_components=3, covariance_type='full', random_state=0)
    # Assign the label array positionally. Wrapping it in a DataFrame would
    # align on index labels, which goes wrong on a stacked frame whose index
    # labels repeat.
    data['cluster'] = gmm.fit_predict(data[cluster_features])

    for feature1 in key_features:
        for feature2 in value_features:
            temp = (data.groupby(['cluster', feature1])[feature2]
                    .agg('mean')
                    .reset_index(name=feature2 + '_' + feature1 + '_cluster_mean'))
            temp = temp.fillna(0)
            data = data.merge(temp, on=['cluster', feature1], how='left')

    # drop() without inplace returns new frames, so no filtered slice is
    # modified in place.
    new_train = data[data['data_type'] == 0].drop(columns='data_type')
    new_test = data[data['data_type'] == 1].drop(columns='data_type')
    return new_train, new_test
    	
# Replace both frames with their cluster-enriched versions.
train, test = cluster(train, test)   

过大量级值取log平滑(针对线性模型有效)

# Columns whose raw values span several orders of magnitude; they are
# compressed with log(1 + x) in both data sets.
big_num_cols = ['totalTradeMoney', 'totalTradeArea', 'tradeMeanPrice',
                'totalNewTradeMoney', 'totalNewTradeArea',
                'tradeNewMeanPrice', 'remainNewNum', 'supplyNewNum',
                'supplyLandArea', 'tradeLandArea', 'landTotalPrice',
                'landMeanPrice', 'totalWorkers', 'newWorkers',
                'residentPopulation', 'pv', 'uv']
for col in big_num_cols:
    # np.log1p is applied to the whole column at once instead of calling a
    # Python lambda for every element.
    train[col] = np.log1p(train[col])
    test[col] = np.log1p(test[col])

# Separate the regression target from the training features.
target_train = train.pop('tradeMoney')

查看模型评分的情况。

from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

# Baseline: fit a Lasso regression and report R^2 on the training data.
test = test.fillna(0)
lasso = Lasso(alpha=0.1)
lasso.fit(train, target_train)
y_pred_train = lasso.predict(train)
# r2_score takes (y_true, y_pred) and is not symmetric in its arguments, so
# the ground truth must come first. With the arguments swapped the reported
# value is a different number.
score_train = r2_score(target_train, y_pred_train)
print("训练集结果:", score_train)

运行结果:
训练集结果: 0.7054199659531692
使用特征选择

from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Recursive feature elimination driven by a plain linear regression.
lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=160)
rfe.fit(train, target_train)
# Notebook echo of the fitted estimator, kept as a comment because it is
# output rather than code. Executing it would build a throwaway estimator, and
# the `normalize` argument is not accepted by recent scikit-learn releases.
#   RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
#                                  normalize=False),
#       n_features_to_select=40, step=1, verbose=0)
# NOTE(review): the echo reports n_features_to_select=40 while the call above
# uses 160 — confirm which value is intended.

# rfe.support_ is a boolean mask over the training columns.
select_columns = train.columns[rfe.support_].tolist()
print(select_columns)
new_train = train[select_columns]
new_test = test[select_columns]
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值