Machine Learning Task 4

Task 4: Mining Happiness Together

1. Code files
– preprocessing.py: data preprocessing; generates the x_train.csv, x_test.csv, and y_train.csv dataset files
– model_library.py: defines the candidate models and their hyperparameter search spaces
– model_param_opt.py: implements the hyperparameter-optimization logic
– generate_best_single_model.py: finds the best parameters for each model and writes them to model_log_best_params.txt
– ensemble.py: builds a simple ensemble of the tuned models and produces the submission file
2. Run order
From the project folder, run:
python preprocessing.py
preprocesses the data and produces the training files
python generate_best_single_model.py
finds the best parameters for each candidate model and writes them to model_log_best_params.txt
python ensemble.py
ensembles the models and generates the final submission

3. The code

# preprocessing.py

import pandas as pd
import numpy as np

datatrain = pd.read_csv('happiness_train_complete.csv', encoding="gb2312")
datatest = pd.read_csv('happiness_test_complete.csv', encoding="gb2312")
dataplot = datatrain.copy()

datatrain = datatrain[datatrain["happiness"] != -8].reset_index(drop=True)
dataplot = dataplot[dataplot["happiness"] != -8].reset_index(drop=True)

target_col = "happiness"
target = datatrain[target_col]

del datatrain['id']
del datatest['id']

del datatrain['happiness']
dataproc = pd.concat([datatrain, datatest], ignore_index=True)

dataproc['survey_type'] = dataproc['survey_type'].map(lambda x: x - 1)  # map to 0/1

count = []
for i in range(1, 32):
    count.append(dataplot.loc[dataplot['province'] == i, 'happiness'].mean())
count = [i if not pd.isnull(i) else 3 for i in count]  # provinces with no data get a neutral 3
# plt.scatter(range(1,32),count)
reg1 = [i for i in range(1, 32) if count[i - 1] < 3.2]
reg2 = [i for i in range(1, 32) if 3.2 <= count[i - 1] < 3.9]
reg3 = [i for i in range(1, 32) if count[i - 1] >= 3.9]


def spl(x):
    if x in [2, 3, 8, 13, 14, 20, 23, 25, 26, 30]:
        return 0
    else:
        return 1


def spl1(x):
    if x in reg1:
        return 0
    elif x in reg2:
        return 1
    elif x in reg3:
        return 2


dataproc['province_1'] = dataproc['province'].map(spl)  # two new province-level features
dataproc['province_2'] = dataproc['province'].map(spl1)
dataproc['gender'] = dataproc['gender'].map(lambda x: x - 1)  # map to 0/1
dataproc['age'] = dataproc['survey_time'].map(lambda x: int(x[:4])) - dataproc['birth']

dataproc.loc[dataproc['nationality'] < 0, 'nationality'] = 1
dataproc = dataproc.join(pd.get_dummies(dataproc["nationality"], prefix="nationality"))


def nation(x):
    if x == 1:
        return 1
    else:
        return 0


dataproc['nationality1'] = dataproc['nationality'].map(nation)  # new feature: Han-ethnicity indicator
del dataproc['nationality']


def relfreq(x):
    if x < 2:
        return 0
    elif x < 5:
        return 1
    else:
        return 2


dataproc['religion_freq'] = dataproc['religion_freq'].map(relfreq)

from scipy import stats

# note: stats.mode(...)[0][0] assumes an older scipy (< 1.9) where mode returns arrays
dataproc.loc[dataproc['edu'] < 0, 'edu'] = stats.mode(dataproc['edu'])[0][0]

del dataproc['edu_other']

dataproc = dataproc.join(pd.get_dummies(dataproc["edu_status"], prefix="edu_status"))
del dataproc["edu_status"]


def eduyr(x):
    if (x > 0) and (not pd.isnull(x)):
        return x
    else:
        return 0


dataproc['edu_yr'] = dataproc['edu_yr'].map(eduyr)
dataproc['edu_yr'] = dataproc['edu_yr'] - dataproc['birth']


def eduyr1(x):
    if x > 0:
        return x
    else:
        return 0


dataproc['edu_yr'] = dataproc['edu_yr'].map(eduyr1)

dataproc.loc[dataproc['income'] < 0, 'income'] = stats.mode(dataproc['income'])[0][0]
dataproc['income'] = dataproc['income'].map(lambda x: np.log(x + 1))

dataproc.loc[dataproc['political'] < 0, 'political'] = 1
dataproc = dataproc.join(pd.get_dummies(dataproc["political"], prefix="political"))
del dataproc['political']


def joinparty(x):
    if pd.isnull(x):
        return 0
    if x < 0:
        return 0
    else:
        return x


dataproc['join_party'] = (dataproc['join_party'] - dataproc['birth']).map(joinparty)

del dataproc['property_other']

# fix weights that appear to have been recorded in kg rather than jin
dataproc.loc[(dataproc['weight_jin'] <= 80) & (dataproc['height_cm'] >= 160), 'weight_jin'] = dataproc['weight_jin'] * 2
dataproc.loc[dataproc['weight_jin'] <= 60, 'weight_jin'] = dataproc['weight_jin'] * 2
dataproc['bmi'] = dataproc['weight_jin'].map(lambda x: x / 2) / dataproc['height_cm'].map(lambda x: (x / 100) ** 2)
dataproc.loc[dataproc['health'] < 0, 'health'] = stats.mode(dataproc['health'])[0][0]
dataproc.loc[dataproc['health_problem'] < 0, 'health_problem'] = stats.mode(dataproc['health_problem'])[0][0]
dataproc.loc[dataproc['depression'] < 0, 'depression'] = stats.mode(dataproc['depression'])[0][0]
dataproc.loc[dataproc['media_1'] < 0, 'media_1'] = stats.mode(dataproc['media_1'])[0][0]
dataproc.loc[dataproc['media_2'] < 0, 'media_2'] = stats.mode(dataproc['media_2'])[0][0]
dataproc.loc[dataproc['media_3'] < 0, 'media_3'] = stats.mode(dataproc['media_3'])[0][0]
dataproc.loc[dataproc['media_4'] < 0, 'media_4'] = stats.mode(dataproc['media_4'])[0][0]
dataproc.loc[dataproc['media_5'] < 0, 'media_5'] = stats.mode(dataproc['media_5'])[0][0]
dataproc.loc[dataproc['media_6'] < 0, 'media_6'] = stats.mode(dataproc['media_6'])[0][0]

dataproc['media'] = (dataproc['media_1'] + dataproc['media_2'] + dataproc['media_3'] + dataproc['media_4'] +
                     dataproc['media_5'] + dataproc['media_6']).map(lambda x: x / 6)

for i in range(1, 13):
    dataproc.loc[dataproc['leisure_' + str(i)] < 0, 'leisure_' + str(i)] = stats.mode(dataproc['leisure_' + str(i)])[0][0]

dataproc['leisure'] = (dataproc['leisure_1'] + dataproc['leisure_2'] + dataproc['leisure_3'] + dataproc['leisure_4'] +
                       dataproc['leisure_5'] + dataproc['leisure_6'] + dataproc['leisure_7'] + dataproc['leisure_8'] +
                       dataproc['leisure_9'] + dataproc['leisure_10'] + dataproc['leisure_11'] +
                       dataproc['leisure_12']).map(lambda x: x / 12)

dataproc.loc[dataproc['socialize'] < 0, 'socialize'] = stats.mode(dataproc['socialize'])[0][0]
dataproc.loc[dataproc['relax'] < 0, 'relax'] = stats.mode(dataproc['relax'])[0][0]
dataproc.loc[dataproc['learn'] < 0, 'learn'] = stats.mode(dataproc['learn'])[0][0]
socialneimode = stats.mode(dataproc['social_neighbor'])[0][0]


def socialnei(x):
    if pd.isnull(x):
        return socialneimode
    if x < 0:
        return socialneimode
    else:
        return x


dataproc['social_neighbor'] = dataproc['social_neighbor'].map(socialnei)
socialfrimode = stats.mode(dataproc['social_friend'])[0][0]


def socialfri(x):
    if pd.isnull(x):
        return socialfrimode
    if x < 0:
        return socialfrimode
    else:
        return x


dataproc['social_friend'] = dataproc['social_friend'].map(socialfri)
dataproc.loc[dataproc['socia_outing'] < 0, 'socia_outing'] = stats.mode(dataproc['socia_outing'])[0][0]
dataproc.loc[dataproc['equity'] < 0, 'equity'] = stats.mode(dataproc['equity'])[0][0]
dataproc.loc[dataproc['class'] < 0, 'class'] = stats.mode(dataproc['class'])[0][0]
dataproc.loc[dataproc['class_10_before'] < 0, 'class_10_before'] = stats.mode(dataproc['class_10_before'])[0][0]
dataproc.loc[dataproc['class_10_after'] < 0, 'class_10_after'] = stats.mode(dataproc['class_10_after'])[0][0]
dataproc['class_new_1'] = dataproc['class'] - dataproc['class_10_before']  # new features: class mobility
dataproc['class_new_2'] = dataproc['class'] - dataproc['class_10_after']
dataproc.loc[dataproc['class_14'] < 0, 'class_14'] = stats.mode(dataproc['class_14'])[0][0]
dataproc = dataproc.join(pd.get_dummies(dataproc["work_exper"], prefix="work_exper"))


def workstat(x):
    if pd.isnull(x):
        return 9
    if x < 0:
        return 9
    else:
        return x


dataproc['work_status'] = dataproc['work_status'].map(workstat)
dataproc = dataproc.join(pd.get_dummies(dataproc["work_status"], prefix="work_status"))

data = dataproc

# work_type

data['work_type'] = data['work_type'].fillna(100)
data = pd.concat([data, pd.get_dummies(data['work_type'], prefix='work_type')], axis=1)
del data['work_type']

# work_manage

data['work_manage'] = data['work_manage'].fillna('b')
data = pd.concat([data, pd.get_dummies(data['work_manage'], prefix='work_manage')], axis=1)
del data['work_manage']

# insur_1 .. insur_4: keep codes 1/2, lump everything else into 'other', then one-hot encode

for k in range(1, 5):
    col = 'insur_%d' % k
    data[col] = ['other' if i != 1 and i != 2 else i for i in data[col]]
    data = pd.concat([data, pd.get_dummies(data[col], prefix=col)], axis=1)
    del data[col]

# family income

median = np.median(data[data['family_income'] >= 0]['family_income'])
data['family_income'] = [i if i >= 0 else median for i in data['family_income']]

median = np.median(data[data['income'] >= 0]['income'])
data['income'] = [i if i >= 0 else median for i in data['income']]

data['income_family_income'] = data['income'] / data['family_income']

# car

data = pd.concat([data, pd.get_dummies(data['car'], prefix='car')], axis=1)
del data['car']

# invest_other

del data['invest_other']

# marital

data = pd.concat([data, pd.get_dummies(data['marital'], prefix='marital')], axis=1)
del data['marital']

del data['marital_1st']

# s_political

data = pd.concat([data, pd.get_dummies(data['s_political'], prefix='s_political')], axis=1)
del data['s_political']

# s_hukou

data = pd.concat([data, pd.get_dummies(data['s_hukou'], prefix='s_hukou')], axis=1)
del data['s_hukou']

# s_income

median = np.median(data[data['s_income'] >= 0]['s_income'])
data['s_income'] = [i if i >= 0 else median for i in data['s_income']]

# s_work_exper, s_work_status, s_work_type, f_political, view: one-hot encode

for col in ['s_work_exper', 's_work_status', 's_work_type', 'f_political', 'view']:
    data = pd.concat([data, pd.get_dummies(data[col], prefix=col)], axis=1)
    del data[col]

# inc_exp
median = np.median(data[data['inc_exp'] >= 0]['inc_exp'])
data['inc_exp'] = [median if i < 0 else i for i in data['inc_exp']]

data['inc_exp_cha'] = data['inc_exp'] - data['income']

# minor_child: replace negative codes with the column mode

mode = data['minor_child'].mode().values[0]
data['minor_child'] = [i if i >= 0 else mode for i in data['minor_child']]

# s_birth

del data['s_birth']
# marital_now

del data['marital_now']
# s_edu

mode = data['s_edu'].mode().values[0]
data['s_edu'] = [i if i >= 0 else mode for i in data['s_edu']]
# work_yr

del data['work_yr']
# hukou_loc

del data['hukou_loc']
# income_family_income: fill missing ratios with the column mean

data['income_family_income'] = data['income_family_income'].fillna(np.mean(data['income_family_income']))

del data['birth']
del data['province']
del data['city']
del data['county']
del data['f_birth']


def ff(x):
    # family_income == 0 makes the income ratio inf; map such values to 0
    if x == np.inf:
        return 0
    else:
        return x


data['income_family_income'] = list(map(ff, data['income_family_income']))

train_shape = datatrain.shape[0]

use_fea = [col for col in data.columns if col != 'survey_time' and data[col].dtype != object]
x_train = data[:train_shape][use_fea].values
y_train = target
x_test = data[train_shape:][use_fea].values

pd.DataFrame(x_train, columns=use_fea).to_csv('x_train.csv', index=False)
pd.DataFrame(x_test, columns=use_fea).to_csv('x_test.csv', index=False)
pd.DataFrame(list(y_train), columns=['target']).to_csv('y_train.csv', index=False)
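A quick sanity check (not part of the original pipeline) can confirm the three generated files line up before moving on to model tuning:

# optional sanity check on the generated files
import pandas as pd

x_tr = pd.read_csv('x_train.csv')
x_te = pd.read_csv('x_test.csv')
y_tr = pd.read_csv('y_train.csv')

assert len(x_tr) == len(y_tr), "one target per training row"
assert list(x_tr.columns) == list(x_te.columns), "train/test feature columns must match"
print(x_tr.shape, x_te.shape, y_tr.shape)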



## model_library.py: defines the candidate models and their hyperopt search spaces
from hyperopt import hp
import numpy as np

############
## Config ##
############

debug = False

## xgboost
xgb_random_seed = 2019
xgb_nthread = 2
xgb_dmatrix_silent = True

## sklearn
skl_random_seed = 2019
skl_n_jobs = 2

if debug:
    xgb_nthread = 1
    skl_n_jobs = 1
    xgb_min_num_round = 5
    xgb_max_num_round = 10
    xgb_num_round_step = 5
    skl_min_n_estimators = 5
    skl_max_n_estimators = 10
    skl_n_estimators_step = 5
    libfm_min_iter = 5
    libfm_max_iter = 10
    iter_step = 5
    hyperopt_param = {}
    hyperopt_param["xgb_max_evals"] = 1
    hyperopt_param["rf_max_evals"] = 1
    hyperopt_param["etr_max_evals"] = 1
    hyperopt_param["gbm_max_evals"] = 1
    hyperopt_param["lr_max_evals"] = 1
    hyperopt_param["ridge_max_evals"] = 1
    hyperopt_param["lasso_max_evals"] = 1
    hyperopt_param['svr_max_evals'] = 1
    hyperopt_param['dnn_max_evals'] = 1
    hyperopt_param['libfm_max_evals'] = 1
    hyperopt_param['rgf_max_evals'] = 1
else:
    xgb_min_num_round = 10
    xgb_max_num_round = 500
    xgb_num_round_step = 10
    skl_min_n_estimators = 100
    skl_max_n_estimators = 1000
    skl_n_estimators_step = 20
    libfm_min_iter = 10
    libfm_max_iter = 500
    iter_step = 10
    hyperopt_param = {}
    hyperopt_param["xgb_max_evals"] = 200
    hyperopt_param["rf_max_evals"] = 200
    hyperopt_param["etr_max_evals"] = 200
    hyperopt_param["gbm_max_evals"] = 200
    hyperopt_param["lr_max_evals"] = 200
    hyperopt_param["ridge_max_evals"] = 200
    hyperopt_param["lasso_max_evals"] = 200
    hyperopt_param['svr_max_evals'] = 200
    hyperopt_param['dnn_max_evals'] = 200
    hyperopt_param['libfm_max_evals'] = 200
    hyperopt_param['rgf_max_evals'] = 200


########################################
## Parameter Space for XGBoost models ##
########################################
## regression with the linear booster
param_space_xgb_reg = {
    'task': 'xgb_reg',
    'eta' : hp.quniform('eta', 0.01, 1, 0.01),
    'lambda' : hp.quniform('lambda', 0, 5, 0.05),
    'alpha' : hp.quniform('alpha', 0, 0.5, 0.005),
    'lambda_bias' : hp.quniform('lambda_bias', 0, 3, 0.1),
    'num_round' : hp.quniform('num_round', xgb_min_num_round, xgb_max_num_round, xgb_num_round_step),
    'nthread': xgb_nthread,
    'silent' : 1,
    'seed': xgb_random_seed,
    "max_evals": hyperopt_param["xgb_max_evals"],
}


########################################
## Parameter Space for Sklearn Models ##
########################################

## random forest regressor
param_space_reg_skl_rf = {
    'task': 'reg_skl_rf',
    'n_estimators': hp.quniform("n_estimators", skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step),
    'max_features': hp.quniform("max_features", 0.05, 1.0, 0.05),
    'max_depth': hp.quniform('max_depth', 1, 30, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
    'n_jobs': skl_n_jobs,
    'random_state': skl_random_seed,
    "max_evals": hyperopt_param["rf_max_evals"],
}

## extra trees regressor
param_space_reg_skl_etr = {
    'task': 'reg_skl_etr',
    'n_estimators': hp.quniform("n_estimators", skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step),
    'max_features': hp.quniform("max_features", 0.05, 1.0, 0.05),
    'n_jobs': skl_n_jobs,
    'random_state': skl_random_seed,
    "max_evals": hyperopt_param["etr_max_evals"],
}

## gradient boosting regressor
param_space_reg_skl_gbm = {
    'task': 'reg_skl_gbm',
    'n_estimators': hp.quniform("n_estimators", skl_min_n_estimators, skl_max_n_estimators, skl_n_estimators_step),
    'learning_rate': hp.quniform("learning_rate", 0.01, 0.5, 0.01),
    'max_features': hp.quniform("max_features", 0.05, 1.0, 0.05),
    'max_depth': hp.quniform('max_depth', 1, 15, 1),
    'subsample': hp.quniform('subsample', 0.5, 1, 0.1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 10, 1),
    'random_state': skl_random_seed,
    "max_evals": hyperopt_param["gbm_max_evals"],
}

## support vector regression
param_space_reg_skl_svr = {
    'task': 'reg_skl_svr',
    'C': hp.loguniform("C", np.log(1), np.log(100)),
    'gamma': hp.loguniform("gamma", np.log(0.001), np.log(0.1)),
    'degree': hp.quniform('degree', 1, 5, 1),
    'epsilon': hp.loguniform("epsilon", np.log(0.001), np.log(0.1)),
    'kernel': hp.choice('kernel', ['rbf', 'poly']),
    "max_evals": hyperopt_param["svr_max_evals"],
}

## ridge regression
param_space_reg_skl_ridge = {
    'task': 'reg_skl_ridge',
    'alpha': hp.loguniform("alpha", np.log(0.01), np.log(20)),
    'random_state': skl_random_seed,
    "max_evals": hyperopt_param["ridge_max_evals"],
}

## lasso
param_space_reg_skl_lasso = {
    'task': 'reg_skl_lasso',
    'alpha': hp.loguniform("alpha", np.log(0.00001), np.log(0.1)),
    'random_state': skl_random_seed,
    "max_evals": hyperopt_param["lasso_max_evals"],
}

######################################
## Parameter Space for Keras Models ##
######################################

## integer features
int_feat = ["num_round", "n_estimators", "max_depth", "degree",'min_samples_split','min_samples_leaf',
            "hidden_units", "hidden_layers", "batch_size", "nb_epoch",
            "dim", "iter",
            "max_leaf_forest", "num_iteration_opt", "num_tree_search", "min_pop", "opt_interval"]

####################
## All the Models ##
####################
feat_names = []
param_spaces = {}

#############
## xgboost ##
#############
## regression with xgboost tree booster
feat_name = "xgb_reg"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_xgb_reg

#############
## Sklearn ##
#############
## extra trees regressor
feat_name = "reg_skl_etr"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_etr

## random forest regressor
feat_name = "reg_skl_rf"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_rf

## gradient boosting regressor
feat_name = "reg_skl_gbm"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_gbm

## support vector regression
feat_name = "reg_skl_svr"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_svr

## ridge regression
feat_name = "reg_skl_ridge"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_ridge

## lasso
feat_name = "reg_skl_lasso"
feat_names.append( feat_name )
param_spaces[feat_name] = param_space_reg_skl_lasso
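The run order calls for model_param_opt.py, but the post never reproduces that file. Below is a minimal sketch of what hyperopt_wrapper (and the hyperopt_train_test it presumably wraps) could look like, reconstructed from how generate_best_single_model.py calls it and from the 'task' keys in the parameter spaces above; treat the function bodies as an assumption, not the author's original code.

# model_param_opt.py (hypothetical sketch; the original file is not shown in the post)
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

from model_library import int_feat

# map each sklearn 'task' name to its estimator class
MODEL_MAP = {
    'reg_skl_rf': RandomForestRegressor,
    'reg_skl_etr': ExtraTreesRegressor,
    'reg_skl_gbm': GradientBoostingRegressor,
    'reg_skl_svr': SVR,
    'reg_skl_ridge': Ridge,
    'reg_skl_lasso': Lasso,
}


def hyperopt_train_test(param, data_used):
    # data_used is the list returned by train_test_split: [x_tr, x_val, y_tr, y_val]
    x_tr, x_val, y_tr, y_val = data_used
    param = dict(param)
    task = param.pop('task')
    param.pop('max_evals', None)
    for f in int_feat:                       # hp.quniform samples floats; cast back to int
        if f in param:
            param[f] = int(param[f])
    if task == 'xgb_reg':
        # translate booster-style names to the sklearn-API names
        model = xgb.XGBRegressor(learning_rate=param['eta'],
                                 n_estimators=param['num_round'],
                                 reg_lambda=param['lambda'],
                                 reg_alpha=param['alpha'],
                                 n_jobs=param['nthread'],
                                 random_state=param['seed'])
    else:
        model = MODEL_MAP[task](**param)
    model.fit(x_tr, y_tr)
    return mean_squared_error(y_val, model.predict(x_val))


def hyperopt_wrapper(param, data_used):
    # fmin minimizes the returned value, so validation MSE is the loss
    return hyperopt_train_test(param, data_used)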

# generate_best_single_model.py
import pandas as pd
import numpy as np

from model_param_opt import hyperopt_wrapper
from model_library import feat_names, param_spaces
from hyperopt import fmin, Trials, tpe

from sklearn.model_selection import train_test_split

def data_prepare():
    x_train = pd.read_csv('x_train.csv').values
    y_train = pd.read_csv('y_train.csv').values
    # x_test = pd.read_csv('x_test.csv').values
    y_train = [i[0] for i in y_train]
    data_used = list(train_test_split(x_train, y_train, test_size=0.3))  # [x_tr, x_val, y_tr, y_val]
    return data_used

if __name__ == '__main__':
    data_used = data_prepare()
    with open('model_log_best_params.txt','w') as f:
        for i in range(len(feat_names)):
            param_space = param_spaces[feat_names[i]]
            trials = Trials()
            objective = lambda p: hyperopt_wrapper(p, data_used)
            best_params = fmin(objective, param_space, algo=tpe.suggest,
                           trials=trials, max_evals=param_space["max_evals"])
            print(best_params)
            f.write('%s;%s'%(feat_names[i],str(best_params)))
            f.write('\n')
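For reference, each line of model_log_best_params.txt pairs a task name with the dict that fmin returns, separated by a semicolon. The values below are purely illustrative, not real tuning output:

xgb_reg;{'alpha': 0.015, 'eta': 0.04, 'lambda': 0.55, 'lambda_bias': 1.2, 'num_round': 320.0}
reg_skl_ridge;{'alpha': 1.7320508075688772}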

# ensemble.py
import ast

import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, RepeatedKFold

from model_library import int_feat


def int_feat_f(param):
    # hyperopt returns floats; cast the integer hyperparameters back to int
    for f in int_feat:
        if f in param:
            param[f] = int(param[f])
    return param


def ensemble_run():
    x_train = pd.read_csv('x_train.csv').values
    x_test = pd.read_csv('x_test.csv').values

    target = pd.read_csv('y_train.csv')
    y_train = np.array([i[0] for i in target.values])
    train_shape = len(y_train)

    X_train = x_train
    X_test = x_test

    with open('model_log_best_params.txt', 'r') as f:
        lines = f.readlines()
    # each line is "<task_name>;<best-params dict>"; parse the dict safely
    model_param_dict = {ln.split(";")[0]: ast.literal_eval(ln.split(";")[1]) for ln in lines}

    ##### xgb
    print('xgb')
    xgb_params = int_feat_f(model_param_dict['xgb_reg'])
    xgb_params['n_estimators'] = xgb_params.pop('num_round')  # XGBRegressor's name for num_round
    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_xgb = np.zeros(train_shape)
    predictions_xgb = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = xgb.XGBRegressor(**xgb_params).fit(X_train[trn_idx], y_train[trn_idx])
        oof_xgb[val_idx] = model.predict(X_train[val_idx])
        predictions_xgb += model.predict(X_test) / folds.n_splits

    print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, y_train)))

    ##### ExtraTreesRegressor
    print('ExtraTreesRegressor')
    etr_params = int_feat_f(model_param_dict['reg_skl_etr'])

    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_etr = np.zeros(train_shape)
    predictions_etr = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = ExtraTreesRegressor(**etr_params).fit(X_train[trn_idx], y_train[trn_idx])
        oof_etr[val_idx] = model.predict(X_train[val_idx])
        predictions_etr += model.predict(X_test) / folds.n_splits

    print("CV score: {:<8.8f}".format(mean_squared_error(oof_etr, y_train)))

    ##### RandomForestRegressor
    print('RandomForestRegressor')
    rfr_params = int_feat_f(model_param_dict['reg_skl_rf'])

    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_rfr = np.zeros(train_shape)
    predictions_rfr = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = RandomForestRegressor(**rfr_params).fit(X_train[trn_idx], y_train[trn_idx])
        oof_rfr[val_idx] = model.predict(X_train[val_idx])
        predictions_rfr += model.predict(X_test) / folds.n_splits

    print("CV score: {:<8.8f}".format(mean_squared_error(oof_rfr, y_train)))

    ##### GradientBoostingRegressor
    print('GradientBoostingRegressor')
    gbr_params = int_feat_f(model_param_dict['reg_skl_gbm'])

    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_gbr = np.zeros(train_shape)
    predictions_gbr = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = GradientBoostingRegressor(**gbr_params).fit(X_train[trn_idx], y_train[trn_idx])
        oof_gbr[val_idx] = model.predict(X_train[val_idx])
        predictions_gbr += model.predict(X_test) / folds.n_splits

    print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr, y_train)))

    ##### SVR
    print('SVR')
    svr_params = int_feat_f(model_param_dict['reg_skl_svr'])
    kernel_choice = ['rbf', 'poly']  # fmin stores the index chosen by hp.choice
    svr_params['kernel'] = kernel_choice[svr_params['kernel']]

    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_svr = np.zeros(train_shape)
    predictions_svr = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = SVR(**svr_params).fit(X_train[trn_idx], y_train[trn_idx])
        oof_svr[val_idx] = model.predict(X_train[val_idx])
        predictions_svr += model.predict(X_test) / folds.n_splits

    print("CV score: {:<8.8f}".format(mean_squared_error(oof_svr, y_train)))

    ##### Ridge
    print('Ridge')
    ridge_params = int_feat_f(model_param_dict['reg_skl_ridge'])

    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_ridge = np.zeros(train_shape)
    predictions_ridge = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = Ridge(**ridge_params).fit(X_train[trn_idx], y_train[trn_idx])
        oof_ridge[val_idx] = model.predict(X_train[val_idx])
        predictions_ridge += model.predict(X_test) / folds.n_splits

    print("CV score: {:<8.8f}".format(mean_squared_error(oof_ridge, y_train)))

    ##### Lasso
    print('Lasso')
    Lasso_params = int_feat_f(model_param_dict['reg_skl_lasso'])

    folds = KFold(n_splits=5, shuffle=True, random_state=2019)
    oof_Lasso = np.zeros(train_shape)
    predictions_Lasso = np.zeros(len(X_test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        model = Lasso(**Lasso_params).fit(X_train[trn_idx], y_train[trn_idx])  # bug fix: was Ridge
        oof_Lasso[val_idx] = model.predict(X_train[val_idx])
        predictions_Lasso += model.predict(X_test) / folds.n_splits

    print("CV score: {:<8.8f}".format(mean_squared_error(oof_Lasso, y_train)))

    # stacking
    print('stacking')
    train_stack = np.vstack([oof_xgb, oof_etr, oof_gbr, oof_Lasso, oof_rfr, oof_ridge, oof_svr]).transpose()
    test_stack = np.vstack(
        [predictions_xgb, predictions_etr, predictions_gbr, predictions_Lasso, predictions_rfr, predictions_ridge,
         predictions_svr]).transpose()

    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2019)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y.ravel())

        oof_stack[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 10  # 5 splits x 2 repeats = 10 folds

    print("CV score: {:<8.8f}".format(mean_squared_error(target.values, oof_stack)))

    submit_example = pd.read_csv('happiness_submit.csv', encoding="gb2312")
    submit_example['happiness'] = predictions

    submit_example.to_csv('result.csv', index=False)


if __name__ == '__main__':
    ensemble_run()