# coding:utf-8
import time
notebookstart = time.time()
import pandas as pd
from sklearn import metrics
from catboost import CatBoostRegressor
from pylab import *
mpl.rcParams['font.sans-serif'] = ['SimHei'] # use the SimHei font so Chinese text renders correctly in plots
"""
catboost 回归简单实现
"""
###### 1. Load the data
# Columns that are never fed to the model: "data" is the regression target
# and "sdate" is excluded together with it.
_NON_FEATURE_COLUMNS = ("sdate", "data")


def _feature_frame(frame):
    """Return *frame* restricted to every column that is not a non-feature column."""
    keep = [name for name in frame.columns if name not in _NON_FEATURE_COLUMNS]
    return frame[keep]


df_train = pd.read_csv(r"C:\Users\ld\Desktop\yc18\train4.csv", encoding="cp936")
df_valid = pd.read_csv(r"C:\Users\ld\Desktop\yc18\test4.csv", encoding="cp936")

X_train = _feature_frame(df_train)  # training features
y_train = df_train["data"]  # training target
X_valid = _feature_frame(df_valid)  # evaluation features (from test4.csv)
y_valid = df_valid["data"]  # evaluation target
###### 2. Positions of the categorical feature columns (label column excluded)
# 0-based column indices into the feature frame: column 9 followed by 3..8.
categorical_features_pos = [9, *range(3, 9)]
###### 3. Define and fit the model
# Hyperparameters are collected in one mapping so they can be read (and
# logged) in a single place before being handed to the regressor.
# NOTE(review): od_wait equals iterations (both 300), so the 'Iter'
# overfitting detector looks unable to stop training early - confirm intent.
cb_params = dict(
    iterations=300,
    learning_rate=0.3,
    depth=6,
    eval_metric='RMSE',
    random_seed=20,
    bagging_temperature=0.2,
    od_type='Iter',
    metric_period=5,
    od_wait=300,
)
cb_model = CatBoostRegressor(**cb_params)

# The evaluation frame is passed as eval_set and use_best_model is enabled;
# categorical columns are identified by position.
cb_model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_pos,
    eval_set=(X_valid, y_valid),
    use_best_model=True,
    verbose=True,
)
###### 4. Visualise feature importances
importance_table = pd.DataFrame(
    {'imp': cb_model.feature_importances_, 'col': X_train.columns}
)
# Sort ascending by importance (ties broken by column name, descending) and
# keep the last 30 rows, i.e. the 30 highest-importance features.
fea_imp = importance_table.sort_values(
    by=['imp', 'col'], ascending=[True, False]
).tail(30)
_ = fea_imp.plot.barh(x='col', y='imp', figsize=(20, 10))
plt.show()
###### 5. Predict and evaluate
# Run inference once and reuse the result for both the RMSE and the listing
# below (the model was previously asked to predict the same frame twice).
catpred = cb_model.predict(X_valid)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_valid, catpred)))
print("------------------------------pre------------------------>")
# One prediction per line, in the row order of X_valid.
for pred in catpred:
    print(pred)
#参考文献:
#https://github.com/Microstrong0305/WeChat-zhihu-csdnblog-code/blob/master/Ensemble%20Learning/CatBoost/catboost_regression.py
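# NOTE: the lines below are stray non-code residue that was appended to this
# file; they are kept as comments because a bare `03-18` is a SyntaxError in
# Python 3 (leading zero in an integer literal).
# 03-18
# 1580
# 12-27
# 1686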