Mining Happiness

I. Overview of Learning Points

1.1 Content introduction
  • Entry page of the Alibaba Tianchi "Mining Happiness" competition: https://tianchi.aliyun.com/competition/entrance/231702/introduction
1.2 Learning goal
  • Break into the top 500 of the leaderboard
1.3 Code workflow
  • 1. Data exploration
  • 2. Feature engineering
  • 3. Modeling and prediction
  • 4. Parameter tuning and model fusion
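Note: the leaderboard appears to score submissions by mean squared error, which matches the mean_squared_error CV scores printed throughout the code below; a minimal sketch of the metric (my own helper, not part of the competition kit):

import numpy as np

def competition_score(y_true, y_pred):
    # plain MSE, lower is better (assumption: matches the leaderboard metric)
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean((y_true - y_pred) ** 2)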

II. Learning Content

1. Data exploration
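Before any feature work it helps to look at the raw tables; a minimal exploration sketch, reusing the ./input paths and gbk encoding from the feature-engineering step below:

import pandas as pd
pd.set_option('display.max_columns', None)

train_data = pd.read_csv('./input/happiness_train_complete.csv', encoding='gbk')
print(train_data.shape)                        # roughly 8000 rows x 140 columns
print(train_data['happiness'].value_counts())  # scores 1-5 plus the invalid code -8
# near-constant or free-text columns carry little signal (this motivates drop_cols below)
print(train_data.nunique().sort_values().head(10))
# negative codes mark refused/missing answers (this motivates the NaN replacement below)
print((train_data.select_dtypes('number') < 0).sum().sort_values(ascending=False).head(10))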
2. Feature engineering
import numpy as np
import pandas as pd
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows', None)
# Load the data
train_data = pd.read_csv('./input/happiness_train_complete.csv', encoding='gbk')
test_data = pd.read_csv('./input/happiness_test_complete.csv', encoding='gbk')

# Preprocess the data
y_train = train_data['happiness']
# Remap the invalid label -8 to the middle score 3
y_train = y_train.apply(lambda x: 3 if x == -8 else x)

train = train_data.drop(columns=['happiness'])
# Concatenate train and test so features are processed consistently
data = pd.concat([train, test_data], axis=0)
# data.nunique() shows the following columns have no effect on the result, so drop them
drop_cols = ['survey_time', 'edu_other', 'property_other', 'invest_other', 'id']
data.drop(drop_cols, axis=1, inplace=True)
# Handle missing values: none of the remaining fields should be negative,
# so treat every negative entry as an invalid code and re-fill it
def replace_with_nan(x):
    if x < 0:
        return float('NaN')  # np.nan
    else:
        return x

data = data.applymap(replace_with_nan)  # renamed to DataFrame.map in pandas >= 2.1
# Fill missing values with each column's rounded median
data = data.fillna(round(data.median(), 0))
# Now the data can be saved for the modeling step
train = data[:train.shape[0]].copy()  # .copy() avoids a SettingWithCopyWarning on the next line
train['happiness'] = y_train
test = data[train.shape[0]:]

train.to_csv('./output/train_data.csv', index=False)
test.to_csv('./output/test_data.csv', index=False)
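A quick optional sanity check (my own addition) confirming that the negative codes are gone and every cell is filled:

assert train.drop(columns=['happiness']).ge(0).all().all()  # no negative codes remain
assert not test.isnull().any().any()                        # no missing values remain
print(train.shape, test.shape)  # same feature columns; train carries the extra label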
3. Modeling and prediction
import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings('ignore')
# Load the preprocessed data
train_data = pd.read_csv('./output/train_data.csv')
test_data = pd.read_csv('./output/test_data.csv')
X_train = train_data.drop(['happiness'], axis=1).values
y_train = train_data['happiness'].values
X_test = test_data.values
  • Modeling with XGBoost
# Tweak these parameters to trade training speed against model accuracy
def train_by_cv_xgb(X_train, y_train, X_test, n_splits):
    import xgboost as xgb
    xgb_params = {
        "booster": 'gbtree', 
        'eta': 0.005, 
        'max_depth': 7, 
        'subsample': 0.7, 
        'colsample_bytree': 0.8, 
        'objective': 'reg:linear',  # deprecated alias of 'reg:squarederror' in newer xgboost
        'eval_metric': 'rmse', 
        'silent': True,  # removed in newer xgboost; use 'verbosity': 0 there
        'nthread': 8
    }
    # Out-of-fold predictions via K-fold cross-validation
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
    oof_xgb = np.zeros(X_train.shape[0])
    predictions_xgb = np.zeros(X_test.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_+1))
        trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
        val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])
        watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
        clf = xgb.train(dtrain=trn_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
        # note: ntree_limit and best_ntree_limit were removed in xgboost 2.0;
        # there, use iteration_range=(0, clf.best_iteration + 1) instead
        oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit)
        predictions_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits
    print("XGB CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, y_train)))
    return oof_xgb, predictions_xgb
  • Modeling with LightGBM
def train_by_cv_lgb(X_train, y_train, X_test, n_splits):
    from sklearn.model_selection import KFold
    import lightgbm as lgb

    param = {
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'min_data_in_leaf': 20, 
        'objective':'regression',
        'max_depth':7,
        'learning_rate': 0.005,
        "min_child_samples": 30,
        "feature_fraction": 0.8,
        "bagging_freq": 1,
        "bagging_fraction": 0.8 ,
        "bagging_seed": 11,
        "metric": 'mse',
        "lambda_l1": 0.1,
        "verbosity": -1
    }
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
    oof_lgb = np.zeros(X_train.shape[0])
    predictions_lgb = np.zeros(X_test.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_+1))
        trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
        val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
        num_round = 20000
        # note: lightgbm >= 4.0 removed these kwargs; pass callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)] instead
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=200, early_stopping_rounds=100)
        oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    print("LGB CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, y_train)))
    return oof_lgb, predictions_lgb
  • Modeling with CatBoost
def train_by_cv_cb(X_train, y_train, X_test, n_splits):
    from catboost import Pool, CatBoostRegressor

    folds = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
    oof_cb = np.zeros(X_train.shape[0])
    predictions_cb = np.zeros(X_test.shape[0])
    cb_params = {
             'n_estimators': 20000,
             'loss_function': 'RMSE',
             'eval_metric':'RMSE',
             'learning_rate': 0.005,
             'depth': 7,
             'use_best_model': True,
             'subsample': 0.6,
             'bootstrap_type': 'Bernoulli',
             'reg_lambda': 3
        }

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_+1))
        trn_X, trn_y = X_train[trn_idx], y_train[trn_idx]
        val_X, val_y = X_train[val_idx], y_train[val_idx]

        model_cb = CatBoostRegressor(**cb_params)
        model_cb.fit(trn_X, trn_y, eval_set=[(val_X, val_y)], verbose=100, early_stopping_rounds=50)
        oof_cb[val_idx] = model_cb.predict(val_X, ntree_end=model_cb.best_iteration_)
        predictions_cb += model_cb.predict(X_test, ntree_end=model_cb.best_iteration_) / folds.n_splits
    print("CB CV score: {:<8.8f}".format(mean_squared_error(oof_cb, y_train)))
    return oof_cb, predictions_cb
# Train all three models with 10-fold cross-validation
oof_lgb, predictions_lgb = train_by_cv_lgb(X_train, y_train, X_test, 10)
oof_xgb, predictions_xgb = train_by_cv_xgb(X_train, y_train, X_test, 10)
oof_cb, predictions_cb = train_by_cv_cb(X_train, y_train, X_test, 10)
4. Parameter tuning and model fusion
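For the tuning half of this step, one cheap approach is a cross-validated comparison of a few candidate values; a hypothetical sketch (not part of the original baseline) using lgb.cv before committing to the full 10-fold training above:

# Hypothetical tuning sketch: compare a few max_depth values with a quick 5-fold run
import lightgbm as lgb

for depth in [5, 7, 9]:
    params = {'objective': 'regression', 'metric': 'mse',
              'learning_rate': 0.05, 'max_depth': depth, 'verbosity': -1}
    cv_res = lgb.cv(params, lgb.Dataset(X_train, y_train),
                    num_boost_round=2000, nfold=5, seed=2021,
                    stratified=False,  # required for regression targets
                    callbacks=[lgb.early_stopping(100)])
    key = [k for k in cv_res if k.endswith('l2-mean')][0]  # 'l2-mean' or 'valid l2-mean' depending on version
    print('max_depth={}: best CV MSE {:.5f}'.format(depth, min(cv_res[key])))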
  • Stacking XGB, LGB, and CB
def train_by_cv_xgb_lgb_cb_stack(oof_lgb, oof_xgb, oof_cb, predictions_lgb, predictions_xgb, predictions_cb, y_train, n_splits, n_repeats):
    from sklearn import linear_model
    from sklearn.model_selection import RepeatedKFold
    # stack the out-of-fold and test predictions of lgb, xgb and cb as meta-features
    train_stack = np.vstack([oof_lgb, oof_xgb, oof_cb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb, predictions_cb]).transpose()

    folds_stack = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=2021)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack,y_train)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
        val_data, val_y = train_stack[val_idx], y_train[val_idx]
        clf_3 = linear_model.BayesianRidge()
        # clf_3 = linear_model.Ridge()  # an alternative meta-learner
        clf_3.fit(trn_data, trn_y)
        oof_stack[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / (n_splits * n_repeats)
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_stack, y_train)))
    return oof_stack, predictions
oof_stack, predictions = train_by_cv_xgb_lgb_cb_stack(oof_lgb, oof_xgb, oof_cb, predictions_lgb, predictions_xgb, predictions_cb, y_train, n_splits=10, n_repeats=2)
# Finally, save the results for submission
submit_data = pd.read_csv('./input/happiness_submit.csv')
submit_data['happiness'] = predictions
submit_data.to_csv('./result/res.csv', index=False)
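An optional post-processing tweak (my own suggestion, not part of the baseline above): since the true scores lie in [1, 5] and the metric is squared error, clipping the stacked predictions into that range can never hurt the score:

import numpy as np
submit_data['happiness'] = np.clip(predictions, 1, 5)  # hypothetical tweak: clamp to the valid label range
submit_data.to_csv('./result/res_clipped.csv', index=False)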

III. Problems Encountered and Solutions

  • 1. Don't panic when you first get a dataset. I did: 140 attributes, how am I supposed to analyze all of that? After calming down, I decided to build a baseline first and do only the simplest processing.
  • 2. I am a complete beginner as well, with less than half a year of machine learning and little grounding in data analysis, so deeper feature work is something I still need to learn; I would appreciate anyone pointing out my mistakes.

IV. Summary

  • 1. At first the data was impossible to make sense of, so I only did a simple exploratory analysis and some simple preprocessing.
  • 2. Quickly building a baseline and fusing the models already gives a reasonable score.
  • 3. The next step is to analyze each feature and construct new ones, discovering and iterating through trial and error (see the sketch below).
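As a starting point for point 3, a minimal sketch of derived features. The column names birth, height_cm, weight_jin, income, and family_m, and the 2015 survey year, are my assumptions about the questionnaire fields; verify them against the official codebook before use.

# Hypothetical derived features for the next iteration (column names are
# assumptions; check them against the questionnaire codebook)
data['age'] = 2015 - data['birth']  # assumes the survey was conducted in 2015
data['bmi'] = (data['weight_jin'] / 2) / (data['height_cm'] / 100) ** 2  # jin -> kg
data['income_per_member'] = data['income'] / data['family_m'].clip(lower=1)  # avoid division by zero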