import pandas as pd
import numpy as np
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
def parseData(df):
"""
预处理数据
"""
df['rentType'][df['rentType']=='--'] = '未知方式'
def parseRoom(info, index):
res = int(info[index*2])
return res
df.insert(3,'室',None)
df.insert(4, '厅', None)
df.insert(5, '卫', None)
df['室'] = df['houseType'].apply(parseRoom, index=0)
df['厅'] = df['houseType'].apply(parseRoom, index=1)
df['卫'] = df['houseType'].apply(parseRoom, index=2)
df['交易月份'] = df['tradeTime'].apply(lambda x: int(x.split('/')[1]))
df['houseType_1sumcsu']=df['室'].map(lambda x:str(x))+df['communityName'].map(lambda x:str(x))
big_num_cols = ['totalTradeMoney','totalTradeArea','tradeMeanPrice','totalNewTradeMoney', 'totalNewTradeArea',
'tradeNewMeanPrice','remainNewNum', 'supplyNewNum', 'supplyLandArea',
'tradeLandArea','landTotalPrice','landMeanPrice','totalWorkers','newWorkers',
'residentPopulation','pv','uv']
for col in big_num_cols:
df[col] = df[col].map(lambda x: np.log1p(x))
'''
columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName', 'region', 'plate','houseType_1sumcsu','houseType_2sumcsu','houseType_3sumcsu']
for col in columns:
df[col] = df[col].astype('category')
'''
columns = ['rentType', 'houseFloor', 'houseToward', 'houseDecoration', 'communityName', 'region', 'plate','houseType_1sumcsu']
for col in columns:
df[col] = df[col].astype('category')
df['pv'].fillna(df['pv'].mean(),inplace=True)
df['uv'].fillna(df['uv'].mean(),inplace=True)
df['pv'] = df['pv'].astype('int')
df['uv'] = df['uv'].astype('int')
tmp = df['buildYear'].copy()
tmp2 = tmp[tmp!='暂无信息'].astype('int')
tmp[tmp=='暂无信息'] = tmp2.mode().iloc[0]
df['buildYear'] = tmp
df['buildYear'] = df['buildYear'].astype('int')
df.loc[(df['rentType'] == '未知方式') & (df['室'] <= 1), 'rentType'] = '整租'
df.loc[(df['rentType'] == '未知方式') & (df['室'] > 1) & (df['area'] < 50), 'rentType'] = '合租'
df.loc[(df['rentType'] == '未知方式') & (df['area'] / df['室'] < 20), 'rentType'] = '合租'
df.loc[(df['rentType'] == '未知方式') & (df['area'] <= 50) & (df['室'] == 2), 'rentType'] = '合租'
df.loc[(df['rentType'] == '未知方式') & (df['area'] > 60) & (df['室'] == 2), 'rentType'] = '整租'
df.loc[(df['rentType'] == '未知方式') & (df['area'] <= 60) & (df['室'] == 3), 'rentType'] = '合租'
df.loc[(df['rentType'] == '未知方式') & (df['area'] > 60) & (df['室'] == 3), 'rentType'] = '整租'
df.loc[(df['rentType'] == '未知方式') & (df['area'] >= 100) & (df['室'] > 3), 'rentType'] = '整租'
items=['area']
for item in items:
xiaoquname_mean=df.groupby('communityName',as_index=False)[item].agg({
item+'mean小区名':'mean',}
)
df = df.merge(xiaoquname_mean,on='communityName',how='left')
df.drop('city',axis=1,inplace=True)
df.drop('houseToward',axis=1,inplace=True)
df.drop('houseDecoration',axis=1,inplace=True)
df.drop(['ID'],axis=1,inplace=True)
return df
def washData(df_train, df_test):
    """
    Remove outlier rows from the training set.

    A training row is kept only when all of the following hold:
      * 6 < area <= 200
      * tradeMoney <= 100000
      * 25 <= tradeMoney / area <= 300
      * houseType is not '0室0厅1卫'
      * totalFloor is not 0
      * it is not both tradeMoney > 25000 and area < 100

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain ``area``, ``tradeMoney``, ``houseType``
        and ``totalFloor``.
    df_test : pandas.DataFrame
        Test rows; returned untouched.

    Returns
    -------
    tuple
        ``(filtered_train, df_test)``, where ``filtered_train`` is a new frame.
    """
    area = df_train['area']
    rent = df_train['tradeMoney']
    unit_rent = rent / area

    # One row-wise mask instead of a chain of drop-by-index-label calls, so
    # the result is also correct when index labels are not unique.
    keep = (
        (area > 6) & (area <= 200)
        & (rent <= 100000)
        & unit_rent.between(25, 300)
        & (df_train['houseType'] != '0室0厅1卫')
        & (df_train['totalFloor'] != 0)
        & ~((rent > 25000) & (area < 100))
    )
    # The former "(tradeMoney < 75000) & (area > 800)" rule could never match
    # once area is capped at 200, so it is omitted.
    return df_train[keep].copy(), df_test
def feature(df):
    """
    Build aggregate features and drop the raw columns they replace.

    Adds:
      * ``traffic``       = subwayStationNum + busStationNum
      * ``edu``           = interSchoolNum + schoolNum + privateSchoolNum
      * ``tradeMoneynew`` = tradeMeanPrice / tradeNewMeanPrice
      * ``meanarea``      = totalTradeArea / tradeSecNum
      * ``meanNewarea``   = totalNewTradeArea / tradeNewNum

    The ratios are plain column divisions, so rows with a zero denominator
    may come out as inf or NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns listed above plus the ones dropped
        below. It is not modified.

    Returns
    -------
    tuple
        ``(features_frame, categorical_feats)``, where ``categorical_feats``
        is the list of column names to treat as categorical.
    """
    # drop() returns a new frame, so the caller's DataFrame stays untouched.
    df = df.drop(columns=['houseType', 'tradeTime'])

    df['traffic'] = df['subwayStationNum'] + df['busStationNum']
    df['edu'] = df['interSchoolNum'] + df['schoolNum'] + df['privateSchoolNum']
    df['tradeMoneynew'] = df['tradeMeanPrice'] / df['tradeNewMeanPrice']
    df['meanarea'] = df['totalTradeArea'] / df['tradeSecNum']
    df['meanNewarea'] = df['totalNewTradeArea'] / df['tradeNewNum']

    # Raw columns that are folded into the aggregates above or unused
    # by the models.
    df = df.drop(columns=[
        'subwayStationNum', 'busStationNum', 'interSchoolNum', 'schoolNum',
        'privateSchoolNum', 'drugStoreNum', 'bankNum', 'shopNum', 'parkNum',
        'mallNum', 'superMarketNum', 'hospitalNum', 'gymNum',
        'tradeLandNum', 'landMeanPrice', 'supplyLandNum', 'landTotalPrice',
        'tradeLandArea', 'supplyNewNum', 'supplyLandArea', 'region',
        'tradeNewNum', 'lookNum', 'uv', 'saleSecHouseNum', 'pv',
    ])

    categorical_feats = ['rentType', 'houseFloor', 'plate', 'communityName',
                         'houseType_1sumcsu']
    return df, categorical_feats
def getData(feature,
            train_path=r'C:\Users\lxc\Desktop\featurecup\train_data.csv',
            test_path=r'C:\Users\lxc\Desktop\featurecup\test_b.csv'):
    """
    Load the train / test CSV files and run the full preparation pipeline.

    Both frames go through ``parseData`` and the ``feature`` callable, so
    they end up with the same engineered columns. Outlier removal
    (``washData``) only filters the training rows.

    Parameters
    ----------
    feature : callable
        Feature builder taking a DataFrame and returning
        ``(frame, categorical_column_names)``.
    train_path, test_path : str
        CSV locations; the defaults are the paths this script was written with.

    Returns
    -------
    tuple
        ``(train, test, target, features, categorical_feats)``: the training
        features, the test features restricted to the same columns in the same
        order, the ``tradeMoney`` target, the feature column index and the
        categorical column names.
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    train = parseData(train)
    # The test set needs the same preprocessing as the training set; without
    # it the two frames have different columns and the models cannot predict.
    # NOTE(review): per-community statistics are computed within each file
    # separately — confirm whether they should be shared across train/test.
    test = parseData(test)

    train, test = washData(train, test)

    train, categorical_feats = feature(train)
    test, _ = feature(test)

    target = train.pop('tradeMoney')
    features = train.columns
    # Give the test frame exactly the training columns, in the same order.
    test = test[features]
    return train, test, target, features, categorical_feats
# Build the modelling frames once for the whole script.
train, test, target, features, categorical_feats = getData(feature)

# LightGBM hyper-parameters for the regression model.
params = {
    'num_leaves': 31,
    'min_data_in_leaf': 20,
    'min_child_samples': 20,
    'objective': 'regression',
    'learning_rate': 0.01,
    "boosting": "gbdt",
    "feature_fraction": 0.8,
    "bagging_freq": 1,
    "bagging_fraction": 0.85,
    "bagging_seed": 23,
    "metric": 'rmse',
    "lambda_l1": 0.2,
    "nthread": 4,
}

# 5-fold CV: out-of-fold predictions on train, fold-averaged predictions on test.
folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
feature_importance_df = pd.DataFrame()
num_round = 10000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print(f"fold {fold_}")

    trn_data = lgb.Dataset(
        train.iloc[trn_idx],
        label=target.iloc[trn_idx],
        categorical_feature=categorical_feats,
    )
    val_data = lgb.Dataset(
        train.iloc[val_idx],
        label=target.iloc[val_idx],
        categorical_feature=categorical_feats,
    )

    clf = lgb.train(
        params,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=500,
        early_stopping_rounds=200,
    )

    # Score the held-out rows with the best iteration found by early stopping.
    oof_lgb[val_idx] = clf.predict(train.iloc[val_idx], num_iteration=clf.best_iteration)

    # Record this fold's feature importances.
    fold_importance_df = pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    })
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # Each fold contributes an equal share to the test prediction.
    predictions_lgb += clf.predict(test, num_iteration=clf.best_iteration) / folds.n_splits

print(f"CV Score: {r2_score(target, oof_lgb):<8.5f}")
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# XGBoost hyper-parameters (gamma regression objective, RMSE for monitoring).
xgb_params = {'eta': 0.05, 'max_depth': 5, 'subsample': 0.5, 'colsample_bytree': 0.5, 'alpha': 0.2,
              'objective': 'reg:gamma', 'eval_metric': 'rmse', 'silent': True, 'nthread': -1
              }

# Integer-encode the categorical columns for XGBoost.
# Only columns that actually exist in `train` are encoded; the
# houseType_2sumcsu / houseType_3sumcsu features are listed for completeness
# but are skipped unless they have been created.
label_cols = ['rentType', 'houseFloor', 'communityName', 'plate',
              'houseType_1sumcsu', 'houseType_2sumcsu', 'houseType_3sumcsu']
for col in label_cols:
    if col not in train.columns:
        continue
    # Fit one encoder on train and test together so that a given label maps
    # to the same integer in both frames.
    le = LabelEncoder()
    le.fit(pd.concat([train[col], test[col]]).astype(str))
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

folds = KFold(n_splits=5, shuffle=True, random_state=2333)
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))
# Reset here as before; this section does not collect feature importances.
feature_importance_df = pd.DataFrame()
num_round = 10000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold {}".format(fold_))
    trn_data = xgb.DMatrix(train.iloc[trn_idx], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(train.iloc[val_idx], label=target.iloc[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]

    clf = xgb.train(dtrain=trn_data, num_boost_round=num_round, evals=watchlist,
                    early_stopping_rounds=200, verbose_eval=1000, params=xgb_params)
    # Name kept from the original script; it holds the XGBoost booster.
    lgb_model = clf

    # KFold yields positional indices, so rows must be selected with iloc:
    # the frame's index labels are no longer contiguous after row filtering.
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(train.iloc[val_idx]),
                                   ntree_limit=clf.best_ntree_limit)
    predictions_xgb += clf.predict(xgb.DMatrix(test),
                                   ntree_limit=clf.best_ntree_limit) / folds.n_splits

print("CV Score: {:<8.5f}".format(r2_score(target, oof_xgb)))
from mlxtend.classifier import StackingClassifier
from mlxtend.regressor import StackingRegressor

# The target (tradeMoney) is continuous, so the stack must be a regressor,
# and its members must be estimator objects rather than the lightgbm module.
# Hyper-parameters mirror the LightGBM / XGBoost settings used earlier in
# this script. n_estimators is a fixed boosting budget because these
# scikit-learn style estimators are fit here without early stopping — tune it.
lgb_base = lgb.LGBMRegressor(
    num_leaves=31,
    min_child_samples=20,
    learning_rate=0.01,
    n_estimators=1000,
    colsample_bytree=0.8,
    subsample=0.85,
    subsample_freq=1,
    reg_alpha=0.2,
    n_jobs=4,
)
xgb_model = xgb.XGBRegressor(
    learning_rate=0.05,
    max_depth=5,
    n_estimators=1000,
    subsample=0.5,
    colsample_bytree=0.5,
    reg_alpha=0.2,
    objective='reg:gamma',
    n_jobs=-1,
)

# LightGBM is the base learner; XGBoost is fit on its predictions.
sclf = StackingRegressor(regressors=[lgb_base], meta_regressor=xgb_model)
sclf_score = sclf.fit(train, target)
test_predict = sclf.predict(test)
from sklearn.metrics import r2_score
def online_score(pred, reference_path=r'C:\Users\lxc\Desktop\featurecup\sub_b_919.csv'):
    """
    Compare predictions with a previously saved reference submission.

    Prints the prediction range and the R^2 between ``pred`` and the values
    stored in the reference CSV, then returns that R^2.

    Parameters
    ----------
    pred : array-like with ``max()`` / ``min()``
        Predictions, one per row of the reference file.
    reference_path : str
        Header-less CSV holding the reference submission.

    Returns
    -------
    float
        The R^2 score that is also printed.
    """
    print("预测结果最大值:{},预测结果最小值:{}".format(pred.max(), pred.min()))
    conmbine1 = pd.read_csv(reference_path, engine="python", header=None)
    # Argument order kept from the original: `pred` is passed in the y_true
    # position and the reference file in the y_pred position.
    score1 = r2_score(pred, conmbine1)
    print("对比919分数:{}".format(score1))
    return score1
# Report the stacked model's test predictions against the reference submission.
score = online_score(test_predict)
# Output recorded from a previous run:
# 预测结果最大值:19051.067151217972,预测结果最小值:1199.97082591554
# 对比919分数:0.981891385946527