These study notes are from the Alibaba Cloud Tianchi Dragon Ball Plan machine learning training camp; the course link is: https://tianchi.aliyun.com/competition/entrance/231702/introduction?spm=5176.20222472.J_3678908510.8.8f5e67c2RKrT98
Overall approach: build several individual learners with LightGBM, XGBoost, GBDT, and CatBoost (adding a bagging-style step by randomly resampling the data), then run the learners' outputs through a ridge regression to squeeze out a little more accuracy. A conceptual sketch of the fusion step follows; the full code appears below.
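Here is a minimal sketch of that second-level fusion idea. The base predictions are random stand-ins, not real model outputs; the Ridge settings mirror the ones used in the real meta-learner further down.

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
y = rng.randint(1, 6, size=200).astype(float)     # stand-in happiness labels in [1, 5]
base_preds = np.c_[y + rng.normal(0, 0.8, 200),   # stand-in for CatBoost predictions
                   y + rng.normal(0, 0.9, 200),   # stand-in for XGBoost predictions
                   y + rng.normal(0, 1.0, 200)]   # stand-in for GBDT predictions
meta = Ridge(alpha=75, fit_intercept=False)       # same settings as the real meta-learner below
meta.fit(base_preds, y)                           # learn one blending weight per base learner
print(meta.coef_)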
Possible improvements:
1. Analyze the fields in more detail; individual columns could get targeted preprocessing (e.g. the categorical survey codes).
2. The hyperparameters can still be tuned; I did not run a grid search, only some rough manual tuning (see the sketch right after this list).
3. If the goal is purely to push up the score, try several different random seeds and keep the best run.
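For improvement 2, a grid search could look roughly like the following. The grid values are purely illustrative, not a tuned recommendation, and X and Y are the feature matrix and labels built in the code below.

from sklearn.model_selection import GridSearchCV
from lightgbm.sklearn import LGBMRegressor

param_grid = {                          # illustrative ranges, not a tuned grid
    "num_leaves": [7, 11, 15],
    "learning_rate": [0.03, 0.051, 0.08],
    "n_estimators": [200, 400, 600],
}
search = GridSearchCV(LGBMRegressor(n_jobs=-1), param_grid,
                      scoring="neg_mean_squared_error", cv=5)
search.fit(X, Y)                        # X, Y as constructed below
print(search.best_params_, -search.best_score_)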
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv", encoding="GB2312")
df = df.sample(frac=1, replace=False, random_state=11)  # shuffle the rows once
df.reset_index(inplace=True)
df = df[df["happiness"] > 0]  # drop rows whose label is a negative "invalid" code
Y = df["happiness"]
# survey_time is a "year/month/day hour:minute" string; extract month, day and hour
df["survey_month"] = df["survey_time"].map(lambda line: line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line: line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line: line.split(" ")[1].split(":")[0]).astype("int64")
# drop ids, the raw timestamp and the free-text "other" columns
X = df.drop(columns=["id", "index", "happiness", "survey_time", "edu_other", "property_other", "invest_other"])
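The string slicing above assumes survey_time values like "2015/8/4 14:18". Under that same assumption, pandas' datetime accessor is an equivalent and somewhat sturdier alternative:

t = pd.to_datetime(df["survey_time"], format="%Y/%m/%d %H:%M")  # replaces the split() chain
df["survey_month"] = t.dt.month
df["survey_day"] = t.dt.day
df["survey_hour"] = t.dt.hour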
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_squared_error
import joblib  # was "from sklearn.externals import joblib", which newer scikit-learn removed
from sklearn.model_selection import KFold
kfold = KFold(n_splits=15, shuffle = True, random_state= 12)
model = LGBMRegressor(n_jobs=-1, learning_rate=0.051,
                      n_estimators=400,
                      num_leaves=11,
                      reg_alpha=2.0,
                      reg_lambda=2.1,
                      min_child_samples=6,
                      min_split_gain=0.5,
                      colsample_bytree=0.2)
mse = []
i=0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    model.fit(X_train, y_train)
    # model2.fit(model.predict(X_train, pred_leaf=True), y_train)
    # y_pred = model2.predict(model.predict(X=X_test, pred_leaf=True))
    y_pred = model.predict(X=X_test)
    e = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mse.append(e)
    print(e)
    joblib.dump(filename="light" + str(i), value=model)  # save the fold-i model for the fusion steps
    i += 1
print("lightgbm",np.mean(mse),mse)
#CatBoostRegressor
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=11)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import joblib  # was "from sklearn.externals import joblib"
kfold = KFold(n_splits=15, shuffle = True, random_state= 12)
model = CatBoostRegressor(colsample_bylevel=0.1, thread_count=6, silent=True, iterations=800,
                          depth=5,
                          learning_rate=0.051,
                          loss_function='RMSE',
                          l2_leaf_reg=3)
mse = []
i=0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    err = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mse.append(err)
    print(err)
    joblib.dump(filename="cat" + str(i), value=model)
    i += 1
print("catboost",np.mean(mse),mse)
#xgboost
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=11)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib  # was "from sklearn.externals import joblib"
from sklearn.model_selection import KFold
kfold = KFold(n_splits=15, shuffle = True, random_state= 11)
# note: newer xgboost uses objective='reg:squarederror' ('reg:linear' is the deprecated
# alias) and has dropped the silent argument
model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.1,
                     colsample_bytree=0.971, gamma=0.11, learning_rate=0.069, max_delta_step=0,
                     max_depth=3, min_child_weight=1, missing=None, n_estimators=499,
                     n_jobs=-1, nthread=50, objective='reg:linear', random_state=0,
                     reg_alpha=0.1, reg_lambda=1, scale_pos_weight=1, seed=None,
                     silent=True, subsample=1.0)
mse = []
i = 0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    xg_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mse.append(xg_mse)
    print("xgboost", xg_mse)
    joblib.dump(filename="xg" + str(i), value=model)
    i += 1
print("xgboost",np.mean(mse),mse)
#gbdt
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=11)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib  # was "from sklearn.externals import joblib"
from sklearn.model_selection import KFold
kfold = KFold(n_splits=15, shuffle = True, random_state= 12)
# note: newer scikit-learn renamed loss='ls' to 'squared_error' and removed the
# presort and min_impurity_split arguments
model = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                  learning_rate=0.051, loss='ls', max_depth=4, max_features=10,
                                  max_leaf_nodes=None, min_impurity_decrease=0.0,
                                  min_impurity_split=None, min_samples_leaf=1,
                                  min_samples_split=2, min_weight_fraction_leaf=0.0,
                                  n_estimators=600, presort='auto', random_state=3,
                                  subsample=0.98, verbose=0, warm_start=False)
X.fillna(-8, inplace=True)  # sklearn's GBDT cannot handle NaN, so impute with the -8 non-answer code
mse = []
i = 0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    gbdt_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mse.append(gbdt_mse)
    print("gbdt", gbdt_mse)
    joblib.dump(filename="gbdt" + str(i), value=model)
    i += 1
print("gbdt",np.mean(mse),mse)
# Weighted-average fusion of the saved CatBoostRegressor + xgboost + gbdt models
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=2000)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from catboost import Pool, CatBoostRegressor
from xgboost.sklearn import XGBRegressor                 # needed for the retraining below
from sklearn.ensemble import GradientBoostingRegressor   # needed for the retraining below
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import joblib  # was "from sklearn.externals import joblib"
# caveat: this shuffle (random_state=2000) and 10-fold split do not match the 15-fold
# setup the saved models were trained under, so the "test" folds below overlap those
# models' training data and the printed MSEs are optimistic
kfold = KFold(n_splits=10, shuffle=True, random_state=110)
catmse = []
lightmse = []
xgmse = []
gbdtmse = []
lrmse = []
i = 0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    cat = joblib.load(filename="cat" + str(i))
    light = joblib.load(filename="light" + str(i))
    xg = joblib.load(filename="xg" + str(i))
    gbdt = joblib.load(filename="gbdt" + str(i))
    catX = cat.predict(X_test)
    cat_mse = mean_squared_error(y_true=y_test, y_pred=catX)
    print("\ncat mse:", cat_mse)
    catmse.append(cat_mse)
    # X_test2 = X_test.drop(columns=["survey_day"])
    # lightX = light.predict(X_test2)
    # light_mse = mean_squared_error(y_true=y_test,y_pred=lightX)
    # print("light mse:",light_mse)
    # lightmse.append(light_mse)
    xgX = xg.predict(X_test)
    xg_mse = mean_squared_error(y_true=y_test, y_pred=xgX)
    print("xg mse:", xg_mse)
    xgmse.append(xg_mse)
    X_test2 = X_test.fillna(-8)
    gbdtX = gbdt.predict(X_test2)
    gbdt_mse = mean_squared_error(y_true=y_test, y_pred=gbdtX)
    print("gbdt mse:", gbdt_mse)
    gbdtmse.append(gbdt_mse)
    # weighted average: each model is weighted by the inverse of its fold MSE,
    # so the better a model did on this fold, the more it contributes
    res = np.c_[catX, xgX, gbdtX]
    e = np.array([1 / cat_mse, 1 / xg_mse, 1 / gbdt_mse])
    y_pred = np.sum(res * e, axis=1) / sum(e)
    lr_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    print("lr mse:", lr_mse)
    lrmse.append(lr_mse)
    i += 1
print("\n\ncatmse:",np.mean(catmse))
# print("lightmse:",np.mean(lightmse))
print("xgmse:",np.mean(xgmse))
print("gbdtmse:",np.mean(gbdtmse))
print("lrmse:",np.mean(lrmse))
# same hyperparameters as the per-fold models above
cat = CatBoostRegressor(colsample_bylevel=0.1, thread_count=6, silent=True, iterations=800,
                        depth=5,
                        learning_rate=0.051,
                        loss_function='RMSE',
                        l2_leaf_reg=3)
xg = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.1,
                  colsample_bytree=0.971, gamma=0.11, learning_rate=0.069, max_delta_step=0,
                  max_depth=3, min_child_weight=1, missing=None, n_estimators=499,
                  n_jobs=-1, nthread=50, objective='reg:linear', random_state=0,
                  reg_alpha=0.1, reg_lambda=1, scale_pos_weight=1, seed=None,
                  silent=True, subsample=1.0)
gbdt = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                 learning_rate=0.051, loss='ls', max_depth=4, max_features=10,
                                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                                 min_impurity_split=None, min_samples_leaf=1,
                                 min_samples_split=2, min_weight_fraction_leaf=0.0,
                                 n_estimators=600, presort='auto', random_state=3,
                                 subsample=0.98, verbose=0, warm_start=False)
# refit the three models on the full training set before predicting the test file
cat.fit(X, Y)
xg.fit(X, Y)
gbdt.fit(X.fillna(-8), Y)
df2 = pd.read_csv("happiness_test_complete.csv",encoding="GB2312")
df2["survey_month"] = df2["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df2["survey_day"] = df2["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df2["survey_hour"] = df2["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
out = df2[["id"]]
X = df2.drop(columns=["id","survey_time","edu_other","property_other","invest_other"])
X2 = X.drop(columns=["survey_day"])  # leftover from the disabled LightGBM path, not used below
catX = cat.predict(X)
xgX = xg.predict(X)
gbdtX = gbdt.predict(X.fillna(-8))
res = np.c_[catX, xgX, gbdtX]
e = np.array([1 / np.mean(catmse), 1 / np.mean(xgmse), 1 / np.mean(gbdtmse)])  # inverse mean-CV-MSE weights
y_pred = np.sum(res * e, axis=1) / sum(e)
out["happiness"] = y_pred
out.to_csv("happiness_submit.csv", index=False)
print("done")
print(e)  # the weight vector actually used
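The printed e is the blending weight vector: each model is weighted by the inverse of its mean cross-validation MSE, so the model with the lowest error contributes most, and dividing by sum(e) normalizes the weights. Note that the three models are refit on the full training set before predicting the test file, while the weights come from the (optimistic, per the caveat above) CV estimates.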
# Ridge (LR) fusion of the saved CatBoostRegressor + LightGBM + xgboost + gbdt models
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=11)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import joblib  # was "from sklearn.externals import joblib"
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split
kfold = KFold(n_splits=15, shuffle=True, random_state=12)
# this matches the split used for the LightGBM/CatBoost/GBDT folds, but the xgboost
# folds were saved with random_state=11, so those do not line up exactly
catmse = []
lightmse = []
xgmse = []
gbdtmse = []
lrmse = []
i = 0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    cat = joblib.load(filename="cat" + str(i))
    light = joblib.load(filename="light" + str(i))
    xg = joblib.load(filename="xg" + str(i))
    gbdt = joblib.load(filename="gbdt" + str(i))
    catX = cat.predict(X_test)
    cat_mse = mean_squared_error(y_true=y_test, y_pred=catX)
    print("\ncat mse:", cat_mse)
    catmse.append(cat_mse)
    lightX = light.predict(X_test)
    light_mse = mean_squared_error(y_true=y_test, y_pred=lightX)
    print("light mse:", light_mse)
    lightmse.append(light_mse)
    xgX = xg.predict(X_test)
    xg_mse = mean_squared_error(y_true=y_test, y_pred=xgX)
    print("xg mse:", xg_mse)
    xgmse.append(xg_mse)
    gbdtX = gbdt.predict(X_test.fillna(-8))
    gbdt_mse = mean_squared_error(y_true=y_test, y_pred=gbdtX)
    print("gbdt mse:", gbdt_mse)
    gbdtmse.append(gbdt_mse)
    # stack the four prediction columns and fit a ridge meta-model on them;
    # note it is fit and evaluated on the same fold, so lr_mse is optimistic
    res = np.c_[catX, lightX, xgX, gbdtX]
    lr = Ridge(fit_intercept=False, alpha=75)
    lr.fit(res, y_test)
    print(lr.coef_)
    y_pred = lr.predict(res)
    lr_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    print("lr mse:", lr_mse)
    lrmse.append(lr_mse)
    joblib.dump(filename="lr" + str(i), value=lr)
    i += 1
print("\ncatmse:",np.mean(catmse))
print("\n\nlightmse:",np.mean(lightmse))
print("xgmse:",np.mean(xgmse))
print("gbdtmse:",np.mean(gbdtmse))
print("lrmse:",np.mean(lrmse))
df2 = pd.read_csv("happiness_test_complete.csv",encoding="GB2312")
df2["survey_month"] = df2["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df2["survey_day"] = df2["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df2["survey_hour"] = df2["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
out = df2[["id"]]
X = df2.drop(columns=["id","survey_time","edu_other","property_other","invest_other"])
prediction = []
for i in range(15):
    cat = joblib.load(filename="cat" + str(i))
    light = joblib.load(filename="light" + str(i))
    xg = joblib.load(filename="xg" + str(i))
    gbdt = joblib.load(filename="gbdt" + str(i))
    lr = joblib.load(filename="lr" + str(i))
    catX = cat.predict(X)
    lightX = light.predict(X)
    xgX = xg.predict(X)
    gbdtX = gbdt.predict(X.fillna(-8))
    res = np.c_[catX, lightX, xgX, gbdtX]
    prediction.append(lr.predict(res))
a = np.array(prediction)  # only used by the commented-out weighted variant at the end

def cut(arr):
    # clip predictions into the valid happiness range [1, 5]
    arr2 = []
    for x in arr:
        if x < 1:
            arr2.append(1)
        elif x > 5:
            arr2.append(5)
        else:
            arr2.append(x)
    return arr2
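cut simply clips predictions into the valid label range [1, 5]; np.clip(arr, 1, 5) would do the same in one call. It is only used by the commented-out weighted variant at the very end.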
out["happiness"] = np.mean(np.array(prediction),axis=0)
out.to_csv("happiness_submit.csv",index=False)
print("done")
# out["happiness"] = cut(np.sum((1/np.array(lrmse)*a.T),axis=1)/np.sum(1/np.array(lrmse)))
# out.to_csv("happiness_submit.csv",index=False)
print("done")