These study notes are from the Alibaba Cloud Tianchi Dragon Ball Plan machine learning training camp; the course link is: https://tianchi.aliyun.com/competition/entrance/231702/introduction?spm=5176.20222472.J_3678908510.8.8f5e67c2RKrT98
Overall approach: build several individual learners with LightGBM, XGBoost, GBDT, and CatBoost (adding a bagging-style step by randomly resampling the data), then run the learners' outputs through a ridge regression to squeeze out a little more accuracy. A conceptual sketch of the fusion step follows; the full code appears below.
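Here is a minimal sketch of that second-level fusion idea. The base predictions are random stand-ins, not real model outputs; the Ridge settings mirror the ones used in the real meta-learner further down.

import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.RandomState(0)
y = rng.randint(1, 6, size=200).astype(float)     # stand-in happiness labels in [1, 5]
base_preds = np.c_[y + rng.normal(0, 0.8, 200),   # stand-in for CatBoost predictions
                   y + rng.normal(0, 0.9, 200),   # stand-in for XGBoost predictions
                   y + rng.normal(0, 1.0, 200)]   # stand-in for GBDT predictions
meta = Ridge(alpha=75, fit_intercept=False)       # same settings as the real meta-learner below
meta.fit(base_preds, y)                           # learn one blending weight per base learner
print(meta.coef_)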
Possible improvements:
1. Analyze the fields in more detail; individual columns could get targeted preprocessing (e.g. the categorical survey codes).
2. The hyperparameters can still be tuned; I did not run a grid search, only some rough manual tuning (see the sketch right after this list).
3. If the goal is purely to push up the score, try several different random seeds and keep the best run.
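For improvement 2, a grid search could look roughly like the following. The grid values are purely illustrative, not a tuned recommendation, and X and Y are the feature matrix and labels built in the code below.

from sklearn.model_selection import GridSearchCV
from lightgbm.sklearn import LGBMRegressor

param_grid = {                          # illustrative ranges, not a tuned grid
    "num_leaves": [7, 11, 15],
    "learning_rate": [0.03, 0.051, 0.08],
    "n_estimators": [200, 400, 600],
}
search = GridSearchCV(LGBMRegressor(n_jobs=-1), param_grid,
                      scoring="neg_mean_squared_error", cv=5)
search.fit(X, Y)                        # X, Y as constructed below
print(search.best_params_, -search.best_score_)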
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv", encoding="GB2312")
df = df.sample(frac=1, replace=False, random_state=11)  # shuffle the rows once
df.reset_index(inplace=True)
df = df[df["happiness"] > 0]  # drop rows whose label is a negative "invalid" code
Y = df["happiness"]
# survey_time is a "year/month/day hour:minute" string; extract month, day and hour
df["survey_month"] = df["survey_time"].map(lambda line: line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line: line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line: line.split(" ")[1].split(":")[0]).astype("int64")
# drop ids, the raw timestamp and the free-text "other" columns
X = df.drop(columns=["id", "index", "happiness", "survey_time", "edu_other", "property_other", "invest_other"])
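The string slicing above assumes survey_time values like "2015/8/4 14:18". Under that same assumption, pandas' datetime accessor is an equivalent and somewhat sturdier alternative:

t = pd.to_datetime(df["survey_time"], format="%Y/%m/%d %H:%M")  # replaces the split() chain
df["survey_month"] = t.dt.month
df["survey_day"] = t.dt.day
df["survey_hour"] = t.dt.hour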
from sklearn.model_selection import train_test_split
from lightgbm.sklearn import LGBMRegressor
from sklearn.metrics import mean_squared_error
import joblib  # was "from sklearn.externals import joblib", which newer scikit-learn removed
from sklearn.model_selection import KFold
kfold = KFold(n_splits=15, shuffle = True, random_state= 12)
model = LGBMRegressor(n_jobs=-1, learning_rate=0.051,
                      n_estimators=400,
                      num_leaves=11,
                      reg_alpha=2.0,
                      reg_lambda=2.1,
                      min_child_samples=6,
                      min_split_gain=0.5,
                      colsample_bytree=0.2)
mse = []
i=0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    model.fit(X_train, y_train)
    # model2.fit(model.predict(X_train, pred_leaf=True), y_train)
    # y_pred = model2.predict(model.predict(X=X_test, pred_leaf=True))
    y_pred = model.predict(X=X_test)
    e = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mse.append(e)
    print(e)
    joblib.dump(filename="light" + str(i), value=model)  # save the fold-i model for the fusion steps
    i += 1
print("lightgbm",np.mean(mse),mse)
#CatBoostRegressor
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=11)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import joblib  # was "from sklearn.externals import joblib"
kfold = KFold(n_splits=15, shuffle = True, random_state= 12)
model = CatBoostRegressor(colsample_bylevel=0.1, thread_count=6, silent=True, iterations=800,
                          depth=5,
                          learning_rate=0.051,
                          loss_function='RMSE',
                          l2_leaf_reg=3)
mse = []
i=0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    err = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mse.append(err)
    print(err)
    joblib.dump(filename="cat" + str(i), value=model)
    i += 1
print("catboost",np.mean(mse),mse)
#xgboost
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=11)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
import joblib  # was "from sklearn.externals import joblib"
from sklearn.model_selection import KFold
kfold = KFold(n_splits=15, shuffle = True, random_state= 11)
# note: newer xgboost uses objective='reg:squarederror' ('reg:linear' is the deprecated
# alias) and has dropped the silent argument
model = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.1,
                     colsample_bytree=0.971, gamma=0.11, learning_rate=0.069, max_delta_step=0,
                     max_depth=3, min_child_weight=1, missing=None, n_estimators=499,
                     n_jobs=-1, nthread=50, objective='reg:linear', random_state=0,
                     reg_alpha=0.1, reg_lambda=1, scale_pos_weight=1, seed=None,
                     silent=True, subsample=1.0)
mse = []
i = 0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    xg_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mse.append(xg_mse)
    print("xgboost", xg_mse)
    joblib.dump(filename="xg" + str(i), value=model)
    i += 1
print("xgboost",np.mean(mse),mse)
#gbdt
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=11)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import joblib  # was "from sklearn.externals import joblib"
from sklearn.model_selection import KFold
kfold = KFold(n_splits=15, shuffle = True, random_state= 12)
# note: newer scikit-learn renamed loss='ls' to 'squared_error' and removed the
# presort and min_impurity_split arguments
model = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                  learning_rate=0.051, loss='ls', max_depth=4, max_features=10,
                                  max_leaf_nodes=None, min_impurity_decrease=0.0,
                                  min_impurity_split=None, min_samples_leaf=1,
                                  min_samples_split=2, min_weight_fraction_leaf=0.0,
                                  n_estimators=600, presort='auto', random_state=3,
                                  subsample=0.98, verbose=0, warm_start=False)
X.fillna(-8, inplace=True)  # sklearn's GBDT cannot handle NaN, so impute with the -8 non-answer code
mse = []
i = 0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    gbdt_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    mse.append(gbdt_mse)
    print("gbdt", gbdt_mse)
    joblib.dump(filename="gbdt" + str(i), value=model)
    i += 1
print("gbdt",np.mean(mse),mse)
# Weighted-average fusion of the saved CatBoostRegressor + xgboost + gbdt models
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=2000)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from catboost import Pool, CatBoostRegressor
from xgboost.sklearn import XGBRegressor                 # needed for the retraining below
from sklearn.ensemble import GradientBoostingRegressor   # needed for the retraining below
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import joblib  # was "from sklearn.externals import joblib"
# caveat: this shuffle (random_state=2000) and 10-fold split do not match the 15-fold
# setup the saved models were trained under, so the "test" folds below overlap those
# models' training data and the printed MSEs are optimistic
kfold = KFold(n_splits=10, shuffle=True, random_state=110)
catmse = []
lightmse = []
xgmse = []
gbdtmse = []
lrmse = []
i = 0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    cat = joblib.load(filename="cat" + str(i))
    light = joblib.load(filename="light" + str(i))
    xg = joblib.load(filename="xg" + str(i))
    gbdt = joblib.load(filename="gbdt" + str(i))
    catX = cat.predict(X_test)
    cat_mse = mean_squared_error(y_true=y_test, y_pred=catX)
    print("\ncat mse:", cat_mse)
    catmse.append(cat_mse)
    # X_test2 = X_test.drop(columns=["survey_day"])
    # lightX = light.predict(X_test2)
    # light_mse = mean_squared_error(y_true=y_test,y_pred=lightX)
    # print("light mse:",light_mse)
    # lightmse.append(light_mse)
    xgX = xg.predict(X_test)
    xg_mse = mean_squared_error(y_true=y_test, y_pred=xgX)
    print("xg mse:", xg_mse)
    xgmse.append(xg_mse)
    X_test2 = X_test.fillna(-8)
    gbdtX = gbdt.predict(X_test2)
    gbdt_mse = mean_squared_error(y_true=y_test, y_pred=gbdtX)
    print("gbdt mse:", gbdt_mse)
    gbdtmse.append(gbdt_mse)
    # weighted average: each model is weighted by the inverse of its fold MSE,
    # so the better a model did on this fold, the more it contributes
    res = np.c_[catX, xgX, gbdtX]
    e = np.array([1 / cat_mse, 1 / xg_mse, 1 / gbdt_mse])
    y_pred = np.sum(res * e, axis=1) / sum(e)
    lr_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    print("lr mse:", lr_mse)
    lrmse.append(lr_mse)
    i += 1
print("\n\ncatmse:",np.mean(catmse))
# print("lightmse:",np.mean(lightmse))
print("xgmse:",np.mean(xgmse))
print("gbdtmse:",np.mean(gbdtmse))
print("lrmse:",np.mean(lrmse))
# same hyperparameters as the per-fold models above
cat = CatBoostRegressor(colsample_bylevel=0.1, thread_count=6, silent=True, iterations=800,
                        depth=5,
                        learning_rate=0.051,
                        loss_function='RMSE',
                        l2_leaf_reg=3)
xg = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=0.1,
                  colsample_bytree=0.971, gamma=0.11, learning_rate=0.069, max_delta_step=0,
                  max_depth=3, min_child_weight=1, missing=None, n_estimators=499,
                  n_jobs=-1, nthread=50, objective='reg:linear', random_state=0,
                  reg_alpha=0.1, reg_lambda=1, scale_pos_weight=1, seed=None,
                  silent=True, subsample=1.0)
gbdt = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                 learning_rate=0.051, loss='ls', max_depth=4, max_features=10,
                                 max_leaf_nodes=None, min_impurity_decrease=0.0,
                                 min_impurity_split=None, min_samples_leaf=1,
                                 min_samples_split=2, min_weight_fraction_leaf=0.0,
                                 n_estimators=600, presort='auto', random_state=3,
                                 subsample=0.98, verbose=0, warm_start=False)
# refit the three models on the full training set before predicting the test file
cat.fit(X, Y)
xg.fit(X, Y)
gbdt.fit(X.fillna(-8), Y)
df2 = pd.read_csv("happiness_test_complete.csv",encoding="GB2312")
df2["survey_month"] = df2["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df2["survey_day"] = df2["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df2["survey_hour"] = df2["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
out = df2[["id"]]
X = df2.drop(columns=["id","survey_time","edu_other","property_other","invest_other"])
X2 = X.drop(columns=["survey_day"])  # leftover from the disabled LightGBM path, not used below
catX = cat.predict(X)
xgX = xg.predict(X)
gbdtX = gbdt.predict(X.fillna(-8))
res = np.c_[catX, xgX, gbdtX]
e = np.array([1 / np.mean(catmse), 1 / np.mean(xgmse), 1 / np.mean(gbdtmse)])  # inverse mean-CV-MSE weights
y_pred = np.sum(res * e, axis=1) / sum(e)
out["happiness"] = y_pred
out.to_csv("happiness_submit.csv", index=False)
print("done")
print(e)  # the weight vector actually used
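The printed e is the blending weight vector: each model is weighted by the inverse of its mean cross-validation MSE, so the model with the lowest error contributes most, and dividing by sum(e) normalizes the weights. Note that the three models are refit on the full training set before predicting the test file, while the weights come from the (optimistic, per the caveat above) CV estimates.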
# Ridge (LR) fusion of the saved CatBoostRegressor + LightGBM + xgboost + gbdt models
import pandas as pd
import numpy as np
df = pd.read_csv("happiness_train_complete.csv",encoding="GB2312")
df = df.sample(frac=1,replace=False,random_state=11)
df.reset_index(inplace=True)
df = df[df["happiness"]>0]
Y = df["happiness"]
df["survey_month"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df["survey_day"] = df["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df["survey_hour"] = df["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
X = df.drop(columns=["id","index","happiness","survey_time","edu_other","property_other","invest_other"])
from catboost import Pool, CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import joblib  # was "from sklearn.externals import joblib"
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.model_selection import train_test_split
kfold = KFold(n_splits=15, shuffle=True, random_state=12)
# this matches the split used for the LightGBM/CatBoost/GBDT folds, but the xgboost
# folds were saved with random_state=11, so those do not line up exactly
catmse = []
lightmse = []
xgmse = []
gbdtmse = []
lrmse = []
i = 0
for train, test in kfold.split(X):
    X_train = X.iloc[train]
    y_train = Y.iloc[train]
    X_test = X.iloc[test]
    y_test = Y.iloc[test]
    cat = joblib.load(filename="cat" + str(i))
    light = joblib.load(filename="light" + str(i))
    xg = joblib.load(filename="xg" + str(i))
    gbdt = joblib.load(filename="gbdt" + str(i))
    catX = cat.predict(X_test)
    cat_mse = mean_squared_error(y_true=y_test, y_pred=catX)
    print("\ncat mse:", cat_mse)
    catmse.append(cat_mse)
    lightX = light.predict(X_test)
    light_mse = mean_squared_error(y_true=y_test, y_pred=lightX)
    print("light mse:", light_mse)
    lightmse.append(light_mse)
    xgX = xg.predict(X_test)
    xg_mse = mean_squared_error(y_true=y_test, y_pred=xgX)
    print("xg mse:", xg_mse)
    xgmse.append(xg_mse)
    gbdtX = gbdt.predict(X_test.fillna(-8))
    gbdt_mse = mean_squared_error(y_true=y_test, y_pred=gbdtX)
    print("gbdt mse:", gbdt_mse)
    gbdtmse.append(gbdt_mse)
    # stack the four prediction columns and fit a ridge meta-model on them;
    # note it is fit and evaluated on the same fold, so lr_mse is optimistic
    res = np.c_[catX, lightX, xgX, gbdtX]
    lr = Ridge(fit_intercept=False, alpha=75)
    lr.fit(res, y_test)
    print(lr.coef_)
    y_pred = lr.predict(res)
    lr_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    print("lr mse:", lr_mse)
    lrmse.append(lr_mse)
    joblib.dump(filename="lr" + str(i), value=lr)
    i += 1
print("\ncatmse:",np.mean(catmse))
print("\n\nlightmse:",np.mean(lightmse))
print("xgmse:",np.mean(xgmse))
print("gbdtmse:",np.mean(gbdtmse))
print("lrmse:",np.mean(lrmse))
df2 = pd.read_csv("happiness_test_complete.csv",encoding="GB2312")
df2["survey_month"] = df2["survey_time"].map(lambda line:line.split(" ")[0].split("/")[1]).astype("int64")
df2["survey_day"] = df2["survey_time"].map(lambda line:line.split(" ")[0].split("/")[2]).astype("int64")
df2["survey_hour"] = df2["survey_time"].map(lambda line:line.split(" ")[1].split(":")[0]).astype("int64")
out = df2[["id"]]
X = df2.drop(columns=["id","survey_time","edu_other","property_other","invest_other"])
prediction = []
for i in range(15):
    cat = joblib.load(filename="cat" + str(i))
    light = joblib.load(filename="light" + str(i))
    xg = joblib.load(filename="xg" + str(i))
    gbdt = joblib.load(filename="gbdt" + str(i))
    lr = joblib.load(filename="lr" + str(i))
    catX = cat.predict(X)
    lightX = light.predict(X)
    xgX = xg.predict(X)
    gbdtX = gbdt.predict(X.fillna(-8))
    res = np.c_[catX, lightX, xgX, gbdtX]
    prediction.append(lr.predict(res))
a = np.array(prediction)  # only used by the commented-out weighted variant at the end

def cut(arr):
    # clip predictions into the valid happiness range [1, 5]
    arr2 = []
    for x in arr:
        if x < 1:
            arr2.append(1)
        elif x > 5:
            arr2.append(5)
        else:
            arr2.append(x)
    return arr2
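cut simply clips predictions into the valid label range [1, 5]; np.clip(arr, 1, 5) would do the same in one call. It is only used by the commented-out weighted variant at the very end.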
out["happiness"] = np.mean(np.array(prediction),axis=0)
out.to_csv("happiness_submit.csv",index=False)
print("done")
# out["happiness"] = cut(np.sum((1/np.array(lrmse)*a.T),axis=1)/np.sum(1/np.array(lrmse)))
# out.to_csv("happiness_submit.csv",index=False)
print("done")