Task 4: Mining Happiness Together
1. Code files
– preprocessing.py: data preprocessing; generates the x_train.csv, x_test.csv, and y_train.csv data files
– model_library.py: defines the candidate models and their hyperparameter search spaces
– model_param_opt.py: implements the hyperparameter-optimization logic (its source is not reproduced in section 3; a sketch of its assumed interface follows the generate_best_single_model.py code below)
– generate_best_single_model.py: finds the best parameters for each model and writes them to model_log_best_params.txt
– ensemble.py: builds a simple ensemble of the tuned models and produces the submission file
2. How to run
From the project folder, run the scripts in order:
– python preprocessing.py: preprocess the data and produce the training files
– python generate_best_single_model.py: find the best parameters for each candidate model and write them to model_log_best_params.txt
– python ensemble.py: build the ensemble and generate the final result
3. Code
# ===== preprocessing.py =====
import pandas as pd
import numpy as np
datatrain = pd.read_csv('happiness_train_complete.csv', encoding="gb2312")
datatest = pd.read_csv('happiness_test_complete.csv', encoding="gb2312")
dataplot = datatrain.copy()
datatrain = datatrain[datatrain["happiness"] != -8].reset_index(drop=True)
dataplot = dataplot[dataplot["happiness"] != -8].reset_index(drop=True)
target_col = "happiness"
target = datatrain[target_col]
del datatrain['id']
del datatest['id']
label = datatrain['happiness']
del datatrain['happiness']
dataproc = pd.concat([datatrain, datatest], ignore_index=True)
dataproc['survey_type'] = dataproc['survey_type'].map(lambda x: x - 1)  # map 1/2 to 0/1
count = []
for i in range(1, 32):
    count.append(dataplot.loc[dataplot['province'] == i, 'happiness'].mean())
count = [i if not pd.isnull(i) else 3 for i in count]  # provinces with no samples default to 3
# plt.scatter(range(1,32),count)
reg1 = [i for i in range(1, 32) if count[i - 1] < 3.2]
reg2 = [i for i in range(1, 32) if 3.2 < count[i - 1] < 3.9]
reg3 = [i for i in range(1, 32) if count[i - 1] >= 3.9]
def spl(x):
    # coarse two-way split of provinces
    if x in [2, 3, 8, 13, 14, 20, 23, 25, 26, 30]:
        return 0
    else:
        return 1
def spl1(x):
    # three groups by mean happiness of the province
    if x in reg1:
        return 0
    elif x in reg2:
        return 1
    elif x in reg3:
        return 2
dataproc['province_1'] = dataproc['province'].map(spl)  # two new province-level features
dataproc['province_2'] = dataproc['province'].map(spl1)
dataproc['gender'] = dataproc['gender'].map(lambda x: x - 1)  # map 1/2 to 0/1
dataproc['age'] = dataproc['survey_time'].map(lambda x: int(x[:4])) - dataproc['birth']
dataproc.loc[dataproc['nationality'] < 0, 'nationality'] = 1
dataproc = dataproc.join(pd.get_dummies(dataproc["nationality"], prefix="nationality"))
def nation(x):
    if x == 1:
        return 1
    else:
        return 0
dataproc['nationality1'] = dataproc['nationality'].map(nation)  # new feature: Han-ethnicity flag
del dataproc['nationality']
def relfreq(x):
    if x < 2:
        return 0
    elif x < 5:
        return 1
    else:
        return 2
dataproc['religion_freq'] = dataproc['religion_freq'].map(relfreq)
from scipy import stats
dataproc.loc[dataproc['edu'] < 0, 'edu'] = stats.mode(dataproc['edu'])[0][0]
del dataproc['edu_other']
dataproc = dataproc.join(pd.get_dummies(dataproc["edu_status"], prefix="edu_status"))
del dataproc["edu_status"]
def eduyr(x):
    if (x > 0) and (not pd.isnull(x)):
        return x
    else:
        return 0
dataproc['edu_yr'] = dataproc['edu_yr'].map(eduyr)
dataproc['edu_yr'] = dataproc['edu_yr'] - dataproc['birth']
def eduyr1(x):
    if x > 0:
        return x
    else:
        return 0
dataproc['edu_yr'] = dataproc['edu_yr'].map(eduyr1)
dataproc.loc[dataproc['income'] < 0, 'income'] = stats.mode(dataproc['income'])[0][0]
dataproc['income'] = dataproc['income'].map(lambda x: np.log(x + 1))
dataproc.loc[dataproc['political'] < 0, 'political'] = 1
dataproc = dataproc.join(pd.get_dummies(dataproc["political"], prefix="political"))
del dataproc['political']
def joinparty(x):
    if pd.isnull(x):
        return 0
    if x < 0:
        return 0
    else:
        return x
dataproc['join_party'] = (dataproc['join_party'] - dataproc['birth']).map(joinparty)
del dataproc['property_other']
dataproc.loc[(dataproc['weight_jin'] <= 80) & (dataproc['height_cm'] >= 160), 'weight_jin'] = dataproc['weight_jin'] * 2  # correct implausibly low weights (likely recorded in kg instead of jin)
dataproc.loc[dataproc['weight_jin'] <= 60, 'weight_jin'] = dataproc['weight_jin'] * 2
dataproc['bmi'] = dataproc['weight_jin'].map(lambda x: x / 2) / dataproc['height_cm'].map(lambda x: (x / 100) ** 2)
dataproc.loc[dataproc['health'] < 0, 'health'] = stats.mode(dataproc['health'])[0][0]
dataproc.loc[dataproc['health_problem'] < 0, 'health_problem'] = stats.mode(dataproc['health_problem'])[0][0]
dataproc.loc[dataproc['depression'] < 0, 'depression'] = stats.mode(dataproc['depression'])[0][0]
dataproc.loc[dataproc['media_1'] < 0, 'media_1'] = stats.mode(dataproc['media_1'])[0][0]
dataproc.loc[dataproc['media_2'] < 0, 'media_2'] = stats.mode(dataproc['media_2'])[0][0]
dataproc.loc[dataproc['media_3'] < 0, 'media_3'] = stats.mode(dataproc['media_3'])[0][0]
dataproc.loc[dataproc['media_4'] < 0, 'media_4'] = stats.mode(dataproc['media_4'])[0][0]
dataproc.loc[dataproc['media_5'] < 0, 'media_5'] = stats.mode(dataproc['media_5'])[0][0]
dataproc.loc[dataproc['media_6'] < 0, 'media_6'] = stats.mode(dataproc['media_6'])[0][0]
dataproc['media'] = (dataproc['media_1'] + dataproc['media_2'] + dataproc['media_3'] + dataproc['media_4'] +
dataproc['media_5'] + dataproc['media_6']).map(lambda x: x / 6)
for i in range(1, 13):
    dataproc.loc[dataproc['leisure_' + str(i)] < 0, 'leisure_' + str(i)] = stats.mode(dataproc['leisure_' + str(i)])[0][0]
dataproc['leisure'] = (dataproc['leisure_1'] + dataproc['leisure_2'] + dataproc['leisure_3'] + dataproc['leisure_4'] +
dataproc['leisure_5'] + dataproc['leisure_6'] + dataproc['leisure_7'] + dataproc['leisure_8'] +
dataproc['leisure_9'] + dataproc['leisure_10'] + dataproc['leisure_11'] +
dataproc['leisure_12']).map(lambda x: x / 12)
dataproc.loc[dataproc['socialize'] < 0, 'socialize'] = stats.mode(dataproc['socialize'])[0][0]
dataproc.loc[dataproc['relax'] < 0, 'relax'] = stats.mode(dataproc['relax'])[0][0]
dataproc.loc[dataproc['learn'] < 0, 'learn'] = stats.mode(dataproc['learn'])[0][0]
socialneimode = stats.mode(dataproc['social_neighbor'])[0][0]
def socialnei(x):
    if pd.isnull(x):
        return socialneimode
    if x < 0:
        return socialneimode
    else:
        return x
dataproc['social_neighbor'] = dataproc['social_neighbor'].map(socialnei)
socialfrimode = stats.mode(dataproc['social_friend'])[0][0]
def socialfri(x):
    if pd.isnull(x):
        return socialfrimode
    if x < 0:
        return socialfrimode
    else:
        return x
dataproc['social_friend'] = dataproc['social_friend'].map(socialfri)
dataproc.loc[dataproc['socia_outing'] < 0, 'socia_outing'] = stats.mode(dataproc['socia_outing'])[0][0]
dataproc.loc[dataproc['equity'] < 0, 'equity'] = stats.mode(dataproc['equity'])[0][0]
dataproc.loc[dataproc['class'] < 0, 'class'] = stats.mode(dataproc['class'])[0][0]
dataproc.loc[dataproc['class_10_before'] < 0, 'class_10_before'] = stats.mode(dataproc['class_10_before'])[0][0]
dataproc.loc[dataproc['class_10_after'] < 0, 'class_10_after'] = stats.mode(dataproc['class_10_after'])[0][0]
dataproc['class_new_1'] = dataproc['class'] - dataproc['class_10_before']  # new features: perceived class mobility
dataproc['class_new_2'] = dataproc['class'] - dataproc['class_10_after']
dataproc.loc[dataproc['class_14'] < 0, 'class_14'] = stats.mode(dataproc['class_14'])[0][0]
dataproc = dataproc.join(pd.get_dummies(dataproc["work_exper"], prefix="work_exper"))
def workstat(x):
    if pd.isnull(x):
        return 9
    if x < 0:
        return 9
    else:
        return x
dataproc['work_status'] = dataproc['work_status'].map(workstat)
dataproc = dataproc.join(pd.get_dummies(dataproc["work_status"], prefix="work_status"))
data = dataproc
# work_type
data['work_type'] = data['work_type'].fillna(100)
data = pd.concat([data, pd.get_dummies(data['work_type'], prefix='work_type')], 1)
del data['work_type']
# work_manage
data['work_manage'] = data['work_manage'].fillna('b')
data = pd.concat([data, pd.get_dummies(data['work_manage'], prefix='work_manage')], 1)
del data['work_manage']
# insur
data['insur_1'] = ['other' if i != 1 and i != 2 else i for i in list(data['insur_1'])]
data = pd.concat([data, pd.get_dummies(data['insur_1'], prefix='insur_1')], 1)
del data['insur_1']
data['insur_2'] = ['other' if i != 1 and i != 2 else i for i in list(data['insur_2'])]
data = pd.concat([data, pd.get_dummies(data['insur_2'], prefix='insur_2')], 1)
del data['insur_2']
data['insur_3'] = ['other' if i != 1 and i != 2 else i for i in list(data['insur_3'])]
data = pd.concat([data, pd.get_dummies(data['insur_3'], prefix='insur_3')], 1)
del data['insur_3']
data['insur_4'] = ['other' if i != 1 and i != 2 else i for i in list(data['insur_4'])]
data = pd.concat([data, pd.get_dummies(data['insur_4'], prefix='insur_4')], 1)
del data['insur_4']
# family income
median = np.median(data[data['family_income'] >= 0]['family_income'])
data['family_income'] = [i if i >= 0 else median for i in data['family_income']]
median = np.median(data[data['income'] >= 0]['income'])
data['income'] = [i if i >= 0 else median for i in data['income']]
data['income_family_income'] = data['income'] / data['family_income']
# car
data = pd.concat([data, pd.get_dummies(data['car'], prefix='car')], 1)
del data['car']
# invest_other
del data['invest_other']
# marital
data = pd.concat([data, pd.get_dummies(data['marital'], prefix='marital')], 1)
del data['marital']
del data['marital_1st']
# s_political
data = pd.concat([data, pd.get_dummies(data['s_political'], prefix='s_political')], 1)
del data['s_political']
# s_hukou
data = pd.concat([data, pd.get_dummies(data['s_hukou'], prefix='s_hukou')], 1)
del data['s_hukou']
# s_income
median = np.median(data[data['s_income'] >= 0]['s_income'])
data['s_income'] = [i if i >= 0 else median for i in data['s_income']]
# s_work_exper
data = pd.concat([data, pd.get_dummies(data['s_work_exper'], prefix='s_work_exper')], 1)
del data['s_work_exper']
# s_work_status
data = pd.concat([data, pd.get_dummies(data['s_work_status'], prefix='s_work_status')], 1)
del data['s_work_status']
# s_work_type
data = pd.concat([data, pd.get_dummies(data['s_work_type'], prefix='s_work_type')], 1)
del data['s_work_type']
# f_political
data = pd.concat([data, pd.get_dummies(data['f_political'], prefix='f_political')], 1)
del data['f_political']
# view
data = pd.concat([data, pd.get_dummies(data['view'], prefix='view')], 1)
del data['view']
# inc_exp
median = np.median(data[data['inc_exp'] >= 0]['inc_exp'])
data['inc_exp'] = [median if i < 0 else i for i in data['inc_exp']]
data['inc_exp_cha'] = data['inc_exp'] - data['income']
colnames = list(data.columns)
# minor_child
mode = data[colnames[77]].mode().values[0]
data['minor_child'] = [i if i >= 0 else mode for i in data['minor_child']]
# s_birth
del data['s_birth']
# marital_now
del data['marital_now']
# s_edu
mode = data[colnames[80]].mode().values[0]
data['s_edu'] = [i if i >= 0 else mode for i in data['s_edu']]
# work_yr
del data['work_yr']
# hukou_loc
del data['hukou_loc']
# income_family_income
data['income_family_income'] = data['income_family_income'].fillna(np.mean(data['income_family_income']))
del data['birth']
del data['province']
del data['city']
del data['county']
del data['f_birth']
def ff(x):
    # guard against inf produced by income / family_income when family_income is 0
    if x == np.inf:
        return 0
    else:
        return x
data['income_family_income'] = list(map(ff, data['income_family_income']))
train_shape = datatrain.shape[0]
use_fea = [col for col in data.columns if col != 'survey_time' and data[col].dtype != object]
x_train = data[:train_shape][use_fea].values
y_train = target
x_test = data[train_shape:][use_fea].values
pd.DataFrame(x_train, columns=use_fea).to_csv('x_train.csv', index=False)
pd.DataFrame(x_test, columns=use_fea).to_csv('x_test.csv', index=False)
pd.DataFrame(list(y_train), columns=['target']).to_csv('y_train.csv', index=False)
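Before moving on to model tuning, it can be worth sanity-checking the three files that preprocessing.py writes. The snippet below is not part of the original scripts; it is a minimal check, assuming the files were written to the project folder as above.
import pandas as pd

# quick sanity check of the preprocessing outputs
x_train = pd.read_csv('x_train.csv')
x_test = pd.read_csv('x_test.csv')
y_train = pd.read_csv('y_train.csv')
assert len(x_train) == len(y_train), "one target per training row"
assert list(x_train.columns) == list(x_test.columns), "train and test must share the same feature columns"
print(x_train.shape, x_test.shape, y_train.shape)
print("NaN cells in x_train:", int(x_train.isnull().sum().sum()))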
# ===== model_library.py =====
# defines the candidate models and their hyperparameter search spaces
from hyperopt import hp
import numpy as np
############
## Config ##
############
debug = False
## xgboost
xgb_random_seed = 2019
xgb_nthread = 2
xgb_dmatrix_silent = True
## sklearn
skl_random_seed = 2019
skl_n_jobs = 2
if debug:
    xgb_nthread = 1
    skl_n_jobs = 1
    xgb_min_num_round = 5
    xgb_max_num_round = 10
    xgb_num_round_step = 5
    skl_min_n_estimators = 5
    skl_max_n_estimators = 10
    skl_n_estimators_step = 5
    libfm_min_iter = 5
    libfm_max_iter = 10
    iter_step = 5
    hyperopt_param = {}
    hyperopt_param["xgb_max_evals"] = 1
    hyperopt_param["rf_max_evals"] = 1
    hyperopt_param["etr_max_evals"] = 1
    hyperopt_param["gbm_max_evals"] = 1
    hyperopt_param["lr_max_evals"] = 1
    hyperopt_param["ridge_max_evals"] = 1
    hyperopt_param["lasso_max_evals"] = 1
    hyperopt_param['svr_max_evals'] = 1
    hyperopt_param['dnn_max_evals'] = 1
    hyperopt_param['libfm_max_evals'] = 1
    hyperopt_param['rgf_max_evals'] = 1
else:
    xgb_min_num_round = 10
    xgb_max_num_round = 500
    xgb_num_round_step = 10
    skl_min_n_estimators = 100
    skl_max_n_estimators = 1000
    skl_n_estimators_step = 20
    libfm_min_iter = 10
    libfm_max_iter = 500
    iter_step = 10
    hyperopt_param = {}
    hyperopt_param["xgb_max_evals"] = 200
    hyperopt_param["rf_max_evals"] = 200
    hyperopt_param["etr_max_evals"] = 200
    hyperopt_param["gbm_max_evals"] = 200
    hyperopt_param["lr_max_evals"] = 200
    hyperopt_param["ridge_max_evals"] = 200
    hyperopt_param["lasso_max_evals"] = 200
    hyperopt_param['svr_max_evals'] = 200
    hyperopt_param['dnn_max_evals'] = 200
    hyperopt_param['libfm_max_evals'] = 200
    hyperopt_param['rgf_max_evals'] = 200
########################################
## Parameter Space for XGBoost models ##
########################################
## regression with linear booster
param_space_xgb_reg = {
'task': 'xgb_reg',
'eta' : hp.quniform('eta', 0.01, 1, 0.01),
'lambda' : hp.quniform('lambda', 0, 5, 0.05),
'alpha' : hp.quniform('alpha', 0, 0.5, 0.005),
'lambda_bias' : hp.quniform('lambda_bias', 0, 3, 0.1),
'num_round' : hp.quniform('num_round', xgb_min_num_round, xgb_max_num_round, xgb_num_round_step),
'nthread': xgb_nthread,
'silent' : 1,
'seed': xgb_random_seed,
"max_evals": hyperopt_param["xgb_max_evals"],
}
########################################
## Parameter Space for Sklearn Models ##
########################################
## random forest regressor
param_space_reg_skl_rf = {
'task': 'reg_skl_rf',
'n_estimators': hp.quniform("n_estimators", skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step),
'max_features': hp.quniform("max_features", 0.05, 1.0, 0.05),
'max_depth': hp.quniform('max_depth', 1, 30, 1),
'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
'n_jobs': skl_n_jobs,
'random_state': skl_random_seed,
"max_evals": hyperopt_param["rf_max_evals"],
}
## extra trees regressor
param_space_reg_skl_etr = {
'task': 'reg_skl_etr',
'n_estimators': hp.quniform("n_estimators", skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step),
'max_features': hp.quniform("max_features", 0.05, 1.0, 0.05),
'n_jobs': skl_n_jobs,
'random_state': skl_random_seed,
"max_evals": hyperopt_param["etr_max_evals"],
}
## gradient boosting regressor
param_space_reg_skl_gbm = {
'task': 'reg_skl_gbm',
'n_estimators': hp.quniform("n_estimators", skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step),
'learning_rate': hp.quniform("learning_rate", 0.01, 0.5, 0.01),
'max_features': hp.quniform("max_features", 0.05, 1.0, 0.05),
'max_depth': hp.quniform('max_depth', 1, 15, 1),
'subsample': hp.quniform('subsample', 0.5, 1, 0.1),
'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
'random_state': skl_random_seed,
"max_evals": hyperopt_param["gbm_max_evals"],
}
## support vector regression
param_space_reg_skl_svr = {
'task': 'reg_skl_svr',
'C': hp.loguniform("C", np.log(1), np.log(100)),
'gamma': hp.loguniform("gamma", np.log(0.001), np.log(0.1)),
'degree': hp.quniform('degree', 1, 5, 1),
'epsilon': hp.loguniform("epsilon", np.log(0.001), np.log(0.1)),
'kernel': hp.choice('kernel', ['rbf', 'poly']),
"max_evals": hyperopt_param["svr_max_evals"],
}
## ridge regression
param_space_reg_skl_ridge = {
'task': 'reg_skl_ridge',
'alpha': hp.loguniform("alpha", np.log(0.01), np.log(20)),
'random_state': skl_random_seed,
"max_evals": hyperopt_param["ridge_max_evals"],
}
## lasso
param_space_reg_skl_lasso = {
'task': 'reg_skl_lasso',
'alpha': hp.loguniform("alpha", np.log(0.00001), np.log(0.1)),
'random_state': skl_random_seed,
"max_evals": hyperopt_param["lasso_max_evals"],
}
####################################
## Integer-valued hyperparameters ##
####################################
## integer features that must be cast to int before fitting
int_feat = ["num_round", "n_estimators", "max_depth", "degree",'min_samples_split','min_samples_leaf',
"hidden_units", "hidden_layers", "batch_size", "nb_epoch",
"dim", "iter",
"max_leaf_forest", "num_iteration_opt", "num_tree_search", "min_pop", "opt_interval"]
####################
## All the Models ##
####################
feat_names = []
param_spaces = {}
#############
## xgboost ##
#############
## regression with xgboost tree booster
feat_name = "xgb_reg"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_xgb_reg
#############
## Sklearn ##
#############
## extra trees regressor
feat_name = "reg_skl_etr"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_etr
## random forest regressor
feat_name = "reg_skl_rf"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_rf
## gradient boosting regressor
feat_name = "reg_skl_gbm"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_gbm
## support vector regression
feat_name = "reg_skl_svr"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_svr
## ridge regression
feat_name = "reg_skl_ridge"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_ridge
## lasso
feat_name = "reg_skl_lasso"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_lasso
# ===== generate_best_single_model.py =====
import pandas as pd
import numpy as np
from model_param_opt import hyperopt_wrapper,hyperopt_train_test
from model_library import feat_names,param_spaces
from hyperopt import fmin,Trials,tpe
from sklearn.model_selection import train_test_split
def data_prepare():
    x_train = pd.read_csv('x_train.csv').values
    y_train = pd.read_csv('y_train.csv').values
    # x_test = pd.read_csv('x_test.csv').values
    y_train = [i[0] for i in y_train]
    data_used = list(train_test_split(x_train, y_train, test_size=0.3))
    return data_used
if __name__ == '__main__':
    data_used = data_prepare()
    with open('model_log_best_params.txt', 'w') as f:
        for i in range(len(feat_names)):
            param_space = param_spaces[feat_names[i]]
            trials = Trials()
            objective = lambda p: hyperopt_wrapper(p, data_used)
            best_params = fmin(objective, param_space, algo=tpe.suggest,
                               trials=trials, max_evals=param_space["max_evals"])
            print(best_params)
            f.write('%s;%s' % (feat_names[i], str(best_params)))
            f.write('\n')
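model_param_opt.py is listed in section 1 but its source is not included in section 3. Judging from how generate_best_single_model.py uses it, hyperopt_wrapper(param, data_used) only has to fit the model described by a sampled parameter dict on the training split and return a validation loss for fmin to minimize. The sketch below illustrates that assumed contract; it is not the original file, and the helper name _build_model and the exact dispatch logic are illustrative only.
# model_param_opt.py -- sketch of the interface assumed by generate_best_single_model.py;
# the real file may handle parameters differently.
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from model_library import int_feat

def _build_model(param):
    task = param['task']
    # drop bookkeeping keys and cast integer-valued hyperparameters (hyperopt samples floats)
    kwargs = {k: v for k, v in param.items() if k not in ('task', 'max_evals')}
    for f in int_feat:
        if f in kwargs:
            kwargs[f] = int(kwargs[f])
    if task == 'xgb_reg':
        # xgboost-specific keys such as num_round are passed straight through,
        # the same way ensemble.py later feeds the tuned parameters to XGBRegressor
        return xgb.XGBRegressor(**kwargs)
    if task == 'reg_skl_rf':
        return RandomForestRegressor(**kwargs)
    if task == 'reg_skl_etr':
        return ExtraTreesRegressor(**kwargs)
    if task == 'reg_skl_gbm':
        return GradientBoostingRegressor(**kwargs)
    if task == 'reg_skl_svr':
        # during the search hp.choice hands the kernel string itself to the objective;
        # only the final best_params stores its index, which ensemble.py maps back
        return SVR(**kwargs)
    if task == 'reg_skl_ridge':
        return Ridge(**kwargs)
    if task == 'reg_skl_lasso':
        return Lasso(**kwargs)
    raise ValueError('unknown task: %s' % task)

def hyperopt_train_test(param, data_used):
    x_tr, x_val, y_tr, y_val = data_used  # the train_test_split output from data_prepare()
    model = _build_model(dict(param))
    model.fit(x_tr, y_tr)
    return mean_squared_error(y_val, model.predict(x_val))

def hyperopt_wrapper(param, data_used):
    # fmin minimizes the returned value, so the validation MSE is returned directly
    return hyperopt_train_test(param, data_used)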
# ===== ensemble.py =====
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, RepeatedKFold
from model_library import int_feat
def int_feat_f(param):
    # hyperopt stores floats; cast the integer-valued parameters back to int
    for f in int_feat:
        if f in param:
            param[f] = int(param[f])
    return param
def ensemble_run():
    x_train = pd.read_csv('x_train.csv').values
    x_test = pd.read_csv('x_test.csv').values
    target = pd.read_csv('y_train.csv')
    y_train = np.array([i[0] for i in target.values])
    train_shape = len(y_train)
    X_train = x_train
    X_test = x_test
    with open('model_log_best_params.txt', 'r') as f:
        a = f.readlines()
    # each line is "feat_name;{param dict}"; the dict part is parsed with eval
    model_param_dict = {i.split(";")[0]: eval(i.split(";")[1][:-1]) for i in a}
    ##### xgb
    print('xgb')
    xgb_params = int_feat_f(model_param_dict['xgb_reg'])
    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_xgb = np.zeros(train_shape)
    predictions_xgb = np.zeros(len(X_test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = xgb.XGBRegressor(**xgb_params).fit(X_train[trn_idx], y_train[trn_idx])
        pre = model.predict(X_train[val_idx])
        oof_xgb[val_idx] = pre
        predictions_xgb += model.predict(X_test) / folds.n_splits
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, y_train)))
    ##### ExtraTreesRegressor
    print('ExtraTreesRegressor')
    etr_params = int_feat_f(model_param_dict['reg_skl_etr'])
    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_etr = np.zeros(train_shape)
    predictions_etr = np.zeros(len(X_test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = ExtraTreesRegressor(**etr_params).fit(X_train[trn_idx], y_train[trn_idx])
        pre = model.predict(X_train[val_idx])
        oof_etr[val_idx] = pre
        predictions_etr += model.predict(X_test) / folds.n_splits
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_etr, y_train)))
    ##### RandomForestRegressor
    print('RandomForestRegressor')
    rfr_params = int_feat_f(model_param_dict['reg_skl_rf'])
    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_rfr = np.zeros(train_shape)
    predictions_rfr = np.zeros(len(X_test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = RandomForestRegressor(**rfr_params).fit(X_train[trn_idx], y_train[trn_idx])
        pre = model.predict(X_train[val_idx])
        oof_rfr[val_idx] = pre
        predictions_rfr += model.predict(X_test) / folds.n_splits
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_rfr, y_train)))
    ##### GradientBoostingRegressor
    print('GradientBoostingRegressor')
    gbr_params = int_feat_f(model_param_dict['reg_skl_gbm'])
    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_gbr = np.zeros(train_shape)
    predictions_gbr = np.zeros(len(X_test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = GradientBoostingRegressor(**gbr_params).fit(X_train[trn_idx], y_train[trn_idx])
        pre = model.predict(X_train[val_idx])
        oof_gbr[val_idx] = pre
        predictions_gbr += model.predict(X_test) / folds.n_splits
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr, y_train)))
    ##### SVR
    print('SVR')
    svr_params = int_feat_f(model_param_dict['reg_skl_svr'])
    kernel_choice = ['rbf', 'poly']
    svr_params['kernel'] = kernel_choice[svr_params['kernel']]  # hp.choice saved the index of the chosen kernel
    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_svr = np.zeros(train_shape)
    predictions_svr = np.zeros(len(X_test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = SVR(**svr_params).fit(X_train[trn_idx], y_train[trn_idx])
        pre = model.predict(X_train[val_idx])
        oof_svr[val_idx] = pre
        predictions_svr += model.predict(X_test) / folds.n_splits
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_svr, y_train)))
    ##### Ridge
    print('Ridge')
    ridge_params = int_feat_f(model_param_dict['reg_skl_ridge'])
    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_ridge = np.zeros(train_shape)
    predictions_ridge = np.zeros(len(X_test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = Ridge(**ridge_params).fit(X_train[trn_idx], y_train[trn_idx])
        pre = model.predict(X_train[val_idx])
        oof_ridge[val_idx] = pre
        predictions_ridge += model.predict(X_test) / folds.n_splits
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_ridge, y_train)))
    ##### Lasso
    print('Lasso')
    Lasso_params = int_feat_f(model_param_dict['reg_skl_lasso'])
    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_Lasso = np.zeros(train_shape)
    predictions_Lasso = np.zeros(len(X_test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = Lasso(**Lasso_params).fit(X_train[trn_idx], y_train[trn_idx])
        pre = model.predict(X_train[val_idx])
        oof_Lasso[val_idx] = pre
        predictions_Lasso += model.predict(X_test) / folds.n_splits
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_Lasso, y_train)))
    # stacking: a BayesianRidge meta-model on the out-of-fold predictions
    print('stacking')
    train_stack = np.vstack([oof_xgb, oof_etr, oof_gbr, oof_Lasso, oof_rfr, oof_ridge, oof_svr]).transpose()
    test_stack = np.vstack([predictions_xgb, predictions_etr, predictions_gbr, predictions_Lasso,
                            predictions_rfr, predictions_ridge, predictions_svr]).transpose()
    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2019)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values
        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y.ravel())
        oof_stack[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 10  # 5 splits x 2 repeats
    print("stacking CV score: {:<8.8f}".format(mean_squared_error(target.values, oof_stack)))
    submit_example = pd.read_csv('happiness_submit.csv', encoding="gb2312")
    submit_example['happiness'] = predictions
    submit_example.to_csv('result.csv', index=False)
if __name__ == '__main__':
    ensemble_run()
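As a final step, the generated submission can be compared against the sample file before uploading. This check is not part of the original scripts; it is a minimal sketch assuming result.csv and happiness_submit.csv sit in the project folder.
import pandas as pd

# compare the generated submission against the sample submission file
sub = pd.read_csv('result.csv')
sample = pd.read_csv('happiness_submit.csv', encoding="gb2312")
assert len(sub) == len(sample), "row count must match the sample submission"
assert list(sub.columns) == list(sample.columns), "columns must match the sample submission"
print(sub['happiness'].describe())  # the stacked predictions are continuous values, not integer labels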