# Statistical features
# Each statistic below is an alternative aggregation of column(s) `fea` within
# the groups defined by `by`; `gp` is rebound every time, so only the last
# assignment survives if the snippet is run top to bottom.
grouped = train.groupby(by)[fea]
# Mean
gp = grouped.mean()
# Median
gp = grouped.median()
# Standard deviation (std(), not the variance)
gp = grouped.std()
# Maximum
gp = grouped.max()
# Minimum
gp = grouped.min()
# Number of rows per group
gp = grouped.size()
# Group-by statistical features: mean and std
# Group by communityName and compute the mean and standard deviation of area.
# Named aggregation (new_column='func') is used because passing a
# {new_name: func} dict to SeriesGroupBy.agg raises SpecificationError
# ("nested renamer is not supported") in pandas >= 1.0.
temp = data.groupby('communityName')['area'].agg(com_area_mean='mean', com_area_std='std')
# Feature splitting
# Split houseType (a string laid out like '3室2厅1卫') into the numeric features 'Room', 'Hall' and 'Bath'
def Room(x):
    """Return the room count parsed from a house-type string.

    The count is the integer written before the first '室' in ``x``,
    e.g. ``'3室2厅1卫'`` -> ``3``.

    Raises:
        ValueError: if the text before '室' is not an integer (this includes
            strings that contain no '室' at all).
    """
    return int(x.split('室')[0])
def Hall(x):
    """Return the hall count parsed from a house-type string.

    The count is the integer written between the first '室' and the following
    '厅' in ``x``, e.g. ``'3室2厅1卫'`` -> ``2``.

    Raises:
        IndexError: if ``x`` contains no '室'.
        ValueError: if the text between '室' and '厅' is not an integer.
    """
    after_room = x.split("室")[1]
    return int(after_room.split("厅")[0])
def Bath(x):
    """Return the bathroom count parsed from a house-type string.

    The count is the integer written between '厅' and the following '卫' in
    the part of ``x`` after the first '室', e.g. ``'3室2厅1卫'`` -> ``1``.

    Raises:
        IndexError: if ``x`` contains no '室', or no '厅' after it.
        ValueError: if the text between '厅' and '卫' is not an integer.
    """
    after_room = x.split("室")[1]
    after_hall = after_room.split("厅")[1]
    return int(after_hall.split("卫")[0])
# Derive the three numeric columns by running each parser over every
# houseType value.
data['Room'] = data['houseType'].apply(Room)
data['Hall'] = data['houseType'].apply(Hall)
data['Bath'] = data['houseType'].apply(Bath)
# Feature merging
# Merge some of the supporting-facility features into one transport score:
# each station count is divided by its column mean, and the subway term is
# weighted 5x relative to the bus term.
data['trainsportNum'] = (
    5 * data['subwayStationNum'] / data['subwayStationNum'].mean()
    + data['busStationNum'] / data['busStationNum'].mean()
)
# Cross features: combine existing features with + - * /
# Ratio of (Bath + 1) to (Room + 1).
# NOTE(review): the +1 presumably guards against division by zero when Room is 0 — confirm.
data['Room_Bath'] = data['Bath'].add(1).div(data['Room'].add(1))
# Clustering feature
# Use GaussianMixture to derive a cluster-assignment feature.
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=4, covariance_type='full', random_state=0)
# fit_predict fits the mixture and returns one component label per row; the
# labels are kept in a variable instead of being discarded.
# NOTE(review): presumably `data` must contain only numeric, non-missing
# columns at this point — confirm before running.
cluster_labels = gmm.fit_predict(data)
# 特征编码
from sklearn.preprocessing import LabelEncoder
data['communityName'] = LabelEncoder().fit_transform(data['communityName'])
from sklearn import preprocessing.OneHotEncoder
data['communityName'] = OneHotEncoder().fit_transform(data['communityName'])
# Log-smooth features whose values span very large magnitudes (mainly useful
# for linear models). np.log1p computes log(1 + x) element-wise.
data[feature] = np.log1p(data[feature])