1. Overview of Key Learning Points
1.1 Introduction
Entry page for the Alibaba Tianchi happiness prediction competition ("挖掘幸福感"): https://tianchi.aliyun.com/competition/entrance/231702/introduction
1.2 Learning Goals
Build an end-to-end baseline for the competition: explore the data, do simple feature engineering, train several gradient-boosting models, and fuse them by stacking.
1.3 Code Workflow
1. Data exploration
2. Feature engineering
3. Modeling and prediction
4. Model tuning and fusion
2. Learning Content
1. Data Exploration
With 140 survey attributes, this baseline only does a minimal exploration before moving on to processing.
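A quick first look at the raw training file can confirm the two facts the feature engineering below relies on: the target is a 1-5 scale with -8 as an invalid-answer code, and many feature columns use negative codes for non-answers. This is a minimal sketch reusing the same input path and GBK encoding as the code below; the column count in the comment follows the author's note of 140 attributes.

import pandas as pd

train_data = pd.read_csv('./input/happiness_train_complete.csv', encoding='gbk')
print(train_data.shape)  # expect roughly 140 columns

# Target distribution: happiness is a 1-5 scale, with -8 marking an invalid answer.
print(train_data['happiness'].value_counts())

# How many cells per column use a negative code for a non-answer?
numeric = train_data.select_dtypes(include='number')
print((numeric < 0).sum().sort_values(ascending=False).head(20))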
2. Feature Engineering
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# The "complete" data files are GBK-encoded.
train_data = pd.read_csv('./input/happiness_train_complete.csv', encoding='gbk')
test_data = pd.read_csv('./input/happiness_test_complete.csv', encoding='gbk')

# -8 in the label means an invalid answer; map it to 3, the midpoint of the
# 1-5 happiness scale, so those rows can still be used for training.
y_train = train_data['happiness']
y_train = y_train.apply(lambda x: 3 if x == -8 else x)

# Concatenate train and test so both get identical preprocessing.
train = train_data.drop(columns=['happiness'])
data = pd.concat([train, test_data], axis=0)

# Drop the id, the survey timestamp, and the free-text "other" columns.
drop_cols = ['survey_time', 'edu_other', 'property_other', 'invest_other', 'id']
data.drop(drop_cols, axis=1, inplace=True)

# Remaining negative codes all mean some flavor of non-answer:
# turn them into NaN, then impute each column with its rounded median.
def replace_with_nan(x):
    if x < 0:
        return float('NaN')
    else:
        return x

data = data.applymap(replace_with_nan)
data = data.fillna(round(data.median(), 0))

# Split back into train/test; .copy() avoids a SettingWithCopyWarning
# when the label column is reattached.
train = data[:train.shape[0]].copy()
train['happiness'] = y_train
test = data[train.shape[0]:]

train.to_csv('./output/train_data.csv', index=False)
test.to_csv('./output/test_data.csv', index=False)
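Two deliberately crude baseline choices are worth flagging. Mapping the -8 label code to the scale midpoint keeps those rows in the training set instead of discarding them, and median imputation of all negative feature codes is the simplest treatment that lets every remaining column be fed to tree models without per-feature work. Both are natural first targets for later refinement.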
3. Modeling and Prediction
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

train_data = pd.read_csv('./output/train_data.csv')
test_data = pd.read_csv('./output/test_data.csv')
X_train = train_data.drop(['happiness'], axis=1).values
y_train = train_data['happiness'].values
X_test = test_data.values
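All three training functions below follow the same cross-validation template: a shuffled 10-fold split, out-of-fold (OOF) predictions collected fold by fold so that every training sample is predicted by a model that never saw it, and test-set predictions averaged over the folds. The OOF vectors give an honest CV score and later serve as the input features for stacking.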
def train_by_cv_xgb(X_train, y_train, X_test, n_splits):
    import xgboost as xgb
    xgb_params = {
        'booster': 'gbtree',
        'eta': 0.005,
        'max_depth': 7,
        'subsample': 0.7,
        'colsample_bytree': 0.8,
        # Squared-error objective ('reg:linear' is deprecated in recent XGBoost).
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'nthread': 8
    }
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
    oof_xgb = np.zeros(X_train.shape[0])
    predictions_xgb = np.zeros(X_test.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
        val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])
        watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
        clf = xgb.train(dtrain=trn_data, num_boost_round=20000, evals=watchlist,
                        early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
        # iteration_range replaces the deprecated ntree_limit (XGBoost >= 1.4).
        best = (0, clf.best_iteration + 1)
        oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), iteration_range=best)
        predictions_xgb += clf.predict(xgb.DMatrix(X_test), iteration_range=best) / folds.n_splits
    print("XGB CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_xgb)))
    return oof_xgb, predictions_xgb
def train_by_cv_lgb(X_train, y_train, X_test, n_splits):
    import lightgbm as lgb
    param = {
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        # min_child_samples is an alias of min_data_in_leaf, so only one is set.
        'min_data_in_leaf': 20,
        'objective': 'regression',
        'max_depth': 7,
        'learning_rate': 0.005,
        'feature_fraction': 0.8,
        'bagging_freq': 1,
        'bagging_fraction': 0.8,
        'bagging_seed': 11,
        'metric': 'mse',
        'lambda_l1': 0.1,
        'verbosity': -1
    }
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
    oof_lgb = np.zeros(X_train.shape[0])
    predictions_lgb = np.zeros(X_test.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
        val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
        num_round = 20000
        # LightGBM >= 4.0 takes logging/early stopping as callbacks rather than
        # the old verbose_eval / early_stopping_rounds keyword arguments.
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                        callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)])
        oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    print("LGB CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_lgb)))
    return oof_lgb, predictions_lgb
def train_by_cv_cb(X_train, y_train, X_test, n_splits):
    from catboost import CatBoostRegressor
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
    oof_cb = np.zeros(X_train.shape[0])
    predictions_cb = np.zeros(X_test.shape[0])
    cb_params = {
        'n_estimators': 20000,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'learning_rate': 0.005,
        'depth': 7,
        'use_best_model': True,
        'subsample': 0.6,
        'bootstrap_type': 'Bernoulli',
        'reg_lambda': 3
    }
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_X, trn_y = X_train[trn_idx], y_train[trn_idx]
        val_X, val_y = X_train[val_idx], y_train[val_idx]
        model_cb = CatBoostRegressor(**cb_params)
        model_cb.fit(trn_X, trn_y, eval_set=[(val_X, val_y)], verbose=100, early_stopping_rounds=50)
        oof_cb[val_idx] = model_cb.predict(val_X, ntree_end=model_cb.best_iteration_)
        predictions_cb += model_cb.predict(X_test, ntree_end=model_cb.best_iteration_) / folds.n_splits
    print("CB CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_cb)))
    return oof_cb, predictions_cb
oof_lgb, predictions_lgb = train_by_cv_lgb(X_train, y_train, X_test, 10)
oof_xgb, predictions_xgb = train_by_cv_xgb(X_train, y_train, X_test, 10)
oof_cb, predictions_cb = train_by_cv_cb(X_train, y_train, X_test, 10)
4. Model Tuning and Fusion
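Fusion here is a two-level stack: the three OOF vectors form a 3-column meta-feature matrix, a BayesianRidge model is trained on it under repeated K-fold cross-validation, and its test-set predictions are averaged over all n_splits × n_repeats fits. A regularized linear model is a sensible meta-learner here, since it only has to weight three strongly correlated inputs.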
def train_by_cv_xgb_lgb_cb_stack(oof_lgb, oof_xgb, oof_cb,
                                 predictions_lgb, predictions_xgb, predictions_cb,
                                 y_train, n_splits, n_repeats):
    from sklearn import linear_model
    from sklearn.model_selection import RepeatedKFold
    # Each base model's OOF predictions become one column of the meta-features.
    train_stack = np.vstack([oof_lgb, oof_xgb, oof_cb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb, predictions_cb]).transpose()
    folds_stack = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=2021)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y_train)):
        print("fold {}".format(fold_ + 1))
        trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
        clf_3 = linear_model.BayesianRidge()
        clf_3.fit(trn_data, trn_y)
        oof_stack[val_idx] = clf_3.predict(train_stack[val_idx])
        # Average the meta-model's test predictions over all n_splits * n_repeats fits.
        predictions += clf_3.predict(test_stack) / (n_splits * n_repeats)
    print("CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_stack)))
    return oof_stack, predictions

oof_stack, predictions = train_by_cv_xgb_lgb_cb_stack(
    oof_lgb, oof_xgb, oof_cb,
    predictions_lgb, predictions_xgb, predictions_cb,
    y_train, n_splits=10, n_repeats=2)
submit_data = pd.read_csv('./input/happiness_submit.csv')
submit_data['happiness'] = predictions
submit_data.to_csv('./result/res.csv', index=False)
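The stacked outputs are continuous values rather than integer labels; since the competition scores submissions by mean squared error, they can be submitted as-is. One small optional refinement, my own suggestion rather than part of the original pipeline, is to clamp predictions to the valid 1-5 range before writing the file:

submit_data['happiness'] = np.clip(predictions, 1, 5)
submit_data.to_csv('./result/res.csv', index=False)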
3. Problems Encountered and Solutions
1. Don't panic when handed a new dataset. I panicked the moment I saw the 140 attributes and had no idea how to analyze them; once I calmed down, I decided to ship a baseline first and keep the processing as simple as possible.
2. I am still very much a beginner: I have been doing machine learning for less than half a year and don't know data analysis well, so deeper feature work is something I still need to learn. I would appreciate anyone pointing out problems in my approach.
4. Summary
1. On first contact the data was overwhelming, so I only did a simple exploratory analysis and the simplest possible processing.
2. Quickly building a baseline and then fusing the models worked reasonably well.
3. The next step is to analyze each feature individually, construct new features, and keep exploring through trial and error.