天池工业蒸汽量预测
一、赛题背景
火力发电的基本原理是:燃料在燃烧时加热水生成蒸汽,蒸汽压力推动汽轮机旋转,然后汽轮机带动发电机旋转,产生电能。在这一系列的能量转化中,影响发电效率的核心是锅炉的燃烧效率,即燃料燃烧加热水产生高温高压蒸汽。锅炉的燃烧效率的影响因素很多,包括锅炉的可调参数,如燃烧给量,一二次风,引风,返料风,给水水量;以及锅炉的工况,比如锅炉床温、床压,炉膛温度、压力,过热器的温度等。
二、赛题描述
经脱敏后的锅炉传感器采集的数据(采集频率是分钟级别),根据锅炉的工况,预测产生的蒸汽量。
三、一起来看代码
# Imports (typos fixed: sklearn.liner_model -> linear_model, warning -> warnings;
# added StandardScaler / ExtraTreeRegressor / ExtraTreesRegressor used below)
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    AdaBoostRegressor,
    ExtraTreesRegressor,
)
from sklearn.tree import ExtraTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, StandardScaler

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

warnings.filterwarnings('ignore')
数据的预处理
# Load the raw train/test files (tab-separated) and tag every row with its
# origin so the two sets can be concatenated now and split apart again later.
train = pd.read_csv('./zhengqi_train.txt', sep='\t')
test = pd.read_csv('./zhengqi_test.txt', sep='\t')
for frame, tag in ((train, 'train'), (test, 'test')):
    frame['origin'] = tag
data_all = pd.concat([train, test])
data_all.head()
特征探索
# 38个特征,将一些不重要的特征删除
# 特征分布情况:训练和测试数据中分布不一致的特征,删除
# Compare the train vs. test distribution of every feature; features whose
# two distributions clearly disagree will be dropped below.
plt.figure(figsize=(9, 38 * 6))
# Fixed: original iterated `data_all[:-2]` (a ROW slice, whose column list
# still includes 'origin'/'target'); we want the feature COLUMNS instead.
for i, col in enumerate(data_all.columns[:-2]):
    train_col = data_all[col][data_all['origin'] == 'train']  # training rows
    test_col = data_all[col][data_all['origin'] == 'test']    # test rows
    axes = plt.subplot(38, 1, i + 1)
    ax = sns.kdeplot(train_col, shade=True)
    sns.kdeplot(test_col, shade=True, ax=ax)
    plt.legend(['train', 'test'])
    plt.xlabel(col)

plt.figure(figsize=(9, 6))
for col in data_all.columns[:-2]:
    # One small distribution plot per feature, faceted by origin.
    # Fixed typos: FaceGrid -> FacetGrid, data_all.col -> data_all,
    # sns.displot -> sns.distplot (displot cannot be used with FacetGrid.map).
    g = sns.FacetGrid(data_all, col='origin')
    g.map(sns.distplot, col)
通过上面两个步骤画出的图表进行分析,观察图表中的横坐标,判断测试数据和训练数据是否大致集中在相同的区域;图表中曲线的高度表示的是数据的密度,横坐标的位置为数据的分布区间。应该将数据分布区间相差较大的特征删除,比如['V11','V17','V22','V5']
# Features whose train/test distributions diverge badly (read off the KDE
# plots above) are removed from the combined frame.
drop_labels = ['V11', 'V17', 'V22', 'V5']
data_all.drop(labels=drop_labels, axis=1, inplace=True)
利用协方差进一步处理特征数据,协方差是两个属性之间的关系
# Covariance / correlation based feature screening.
# NOTE(review): `cov` is computed but never used afterwards.
cov = data_all.cov()
corr = data_all.corr() # correlation coefficients
# Find attributes whose absolute correlation with the target is weak (< 0.1).
cond = corr.loc['target'].abs() < 0.1
drop_labels = corr.loc['target'].index[cond]
# NOTE(review): the `drop_labels` computed above is immediately overwritten
# below, so only V14/V21 (chosen by visually inspecting their distributions)
# are actually dropped — confirm whether the low-correlation set was meant
# to be dropped as well.
drop_labels = ['V14','V21']
data_all.drop(drop_labels, axis=1, inplace = True)
画出热力图找出相关程度,颜色越红,二者之间的关系越密切
# Heat map of pairwise feature correlations: the redder a cell, the more
# strongly the two features are related.
plt.figure(figsize=(20, 16))
mcorr = train.corr()
# Fixed: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin bool is the correct dtype here.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True  # hide the redundant upper triangle
cmap = sns.diverging_palette(200, 10, as_cmap=True)  # matplotlib colormap
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()
继续对代码进行标准化处理
# Standardize the feature columns (zero mean, unit variance).  The last two
# columns ('target', 'origin') are not features, so they are excluded from
# scaling and re-attached afterwards — downstream code filters on 'origin'
# and reads 'target', which the original version accidentally dropped.
stand = StandardScaler()  # fixed typo: StandarScaler
data = data_all.iloc[:, :-2]
data2 = stand.fit_transform(data)
cols = data_all.columns
data_all_std = pd.DataFrame(data2, columns=cols[:-2])
# Attach by position via .values: data_all's concatenated index contains
# duplicates, so index-aligned assignment would mis-match rows.
data_all_std[cols[-2]] = data_all.iloc[:, -2].values
data_all_std[cols[-1]] = data_all.iloc[:, -1].values
使用不同的算法进行训练
# Fit a ridge regression on the training rows and flag samples whose
# prediction error is unusually large (> 0.8 * target std) as outliers.
# Fixed: a LIST of alphas requires the cross-validated RidgeCV, not Ridge.
ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])
cond = data_all_std['origin'] == 'train'   # fixed typo: data_all_sta
# Training features / ground-truth target.
X_train = data_all_std[cond].iloc[:, :-2]
y_train = data_all_std[cond]['target']
# No model fits perfectly; unusually large residuals are treated as outliers.
ridge.fit(X_train, y_train)
y_ = ridge.predict(X_train)
cond = abs(y_train - y_) > y_train.std() * 0.8
plt.figure(figsize=(12, 6))
# Scatter: prediction vs truth, outliers in red.
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)  # fixed typo: cod -> cond
# Scatter: residual vs truth.
axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, y_train - y_)
axes.scatter(y_train[cond], (y_train - y_)[cond], c='red')
# Histogram of residuals (fixed: plt.plot(1,3,3) -> plt.subplot so ax= works).
axes = plt.subplot(1, 3, 3)
(y_train - y_).plot.hist(bins=50, ax=axes)
(y_train - y_).loc[cond].plot.hist(bins=50, ax=axes, color='r')
红色的数据即为异常值
# Remove the rows flagged as outliers by the ridge residual check above.
drop_index = cond[cond].index
data_all_std.drop(labels=drop_index, axis=0, inplace=True)
聚合模型进行训练
def detect_model(estimators, data):
    """Fit every estimator on the train split and report held-out MSE and R^2.

    Parameters
    ----------
    estimators : dict[str, estimator]
        Mapping from a display name to an unfitted sklearn-style regressor.
    data : tuple
        Output of train_test_split: (X_train, X_test, y_train, y_test).
    """
    for key, estimator in estimators.items():
        estimator.fit(data[0], data[2])      # fit on the training split
        y_ = estimator.predict(data[1])      # predict on the held-out split
        mse = mean_squared_error(data[3], y_)
        print('------->mse%s' % (key), mse)  # fixed: undefined name `mes`
        r2 = estimator.score(data[1], data[3])
        print('------->scores%s' % (key), r2)
        print('\n')
# Build the train/validation split and benchmark a zoo of regressors.
cond = data_all_std['origin'] == 'train'
X = data_all_std[cond].iloc[:, :-2]
y = data_all_std[cond]['target']
data = train_test_split(X, y, test_size=0.2)  # (X_train, X_test, y_train, y_test)
estimators = {}
estimators['knn'] = KNeighborsRegressor()
estimators['linear'] = LinearRegression()     # fixed typo: LinearRegressor
estimators['ridge'] = Ridge()
estimators['lasso'] = Lasso()
estimators['elasticnet'] = ElasticNet()
estimators['forest'] = RandomForestRegressor()
estimators['gbdt'] = GradientBoostingRegressor()
estimators['ada'] = AdaBoostRegressor()
estimators['extreme'] = ExtraTreeRegressor()
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['svm_poly'] = SVR(kernel='poly')
estimators['light'] = LGBMRegressor()
estimators['xgb'] = XGBRegressor()            # fixed typo: XFBRegressor
# Train every model and print its MSE / R^2 on the held-out split.
detect_model(estimators, data)
# Aggregate the stronger models: average their test-set predictions.
estimators = {}
estimators['forest'] = RandomForestRegressor()
estimators['gbdt'] = GradientBoostingRegressor()
estimators['ada'] = AdaBoostRegressor()
estimators['extreme'] = ExtraTreesRegressor()
estimators['svm_rbf'] = SVR(kernel='rbf')
estimators['light'] = LGBMRegressor()
estimators['xgb'] = XGBRegressor()
# Fixed: original used assignment (=) instead of comparison (==), which would
# overwrite the whole 'origin' column with the string 'train'.
cond = data_all_std['origin'] == 'train'
X_train = data_all_std[cond].iloc[:, :-2]
y_train = data_all_std[cond]['target']
cond = data_all_std['origin'] == 'test'
X_test = data_all_std[cond].iloc[:, :-2]
# Collect every model's prediction, then take the element-wise mean.
y_pred = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    y_pred.append(y_)
y_ = np.mean(y_pred, axis=0)
# Write the averaged prediction for submission to the contest site.
pd.Series(y_).to_csv('./ensemble2.txt', index=False)
此时提交的结果与实际值相差较大,可能的原因是过拟合。我们可以把自己预测的结果作为新的特征加入训练数据:将预测值 y_ 当作新的特征列,再让算法去学习数据和目标值之间的关系。
# Append each model's own predictions as new stacked features.  Note this is
# sequential: later models are fitted on a matrix that already contains the
# prediction columns added by earlier models.
for key, model in estimators.items():
    model.fit(X_train, y_train)
    X_train[key] = model.predict(X_train)
    X_test[key] = model.predict(X_test)
归一化
# 此前已删除 4 个训练/测试分布不一致的特征和 2 个相关性系数较小的特征
# Min-max normalize the feature columns, then re-attach 'target'/'origin'.
data = data_all.iloc[:, :-2]
minmaxscaler = MinMaxScaler()
data3 = minmaxscaler.fit_transform(data)
data_all_norm = pd.DataFrame(data3, columns=data_all.columns[:-2])
# Fixed typo (`dara_all_norm`).  Also replaced pd.merge(..., left_index=True,
# right_index=True): data_all's concatenated index contains duplicate labels,
# so an index join would multiply rows; attach the columns by position instead.
data_all_norm[data_all.columns[-2]] = data_all.iloc[:, -2].values
data_all_norm[data_all.columns[-1]] = data_all.iloc[:, -1].values
def scale_minmax(data):
    """Linearly rescale *data* into [0, 1] (column-wise for a DataFrame).

    Works for any object supporting .min()/.max() and arithmetic
    (pandas Series/DataFrame, numpy arrays).
    """
    # Fixed typo: data.mmin() -> data.min()
    return (data - data.min()) / (data.max() - data.min())
# pip install scipy
from scipy import stats
fcols = 6
frows = len(data_all_norm.columns[:10])
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
# For each of the first 10 features draw six panels: raw distribution,
# probability plot (skewness), scatter vs target — then the same three
# after a Box-Cox transform.
for col in data_all_norm.columns[:10]:
    dat = data_all_norm[[col, 'target']].dropna()

    # 1) raw distribution with a fitted normal curve
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[col], fit=stats.norm)
    plt.title(col + ' Original')  # fixed: undefined name `var` -> col
    plt.xlabel('')

    # 2) probability plot; skewness measures departure from normality
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[col], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
    plt.xlabel('')
    plt.ylabel('')

    # 3) scatter of feature vs target, annotated with their correlation
    i += 1
    plt.subplot(frows, fcols, i)
    plt.scatter(dat[col], dat['target'], alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))

    # 4) distribution after Box-Cox (+1 keeps boxcox's input strictly positive)
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[col].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(col + ' Transformed')  # fixed: `var` -> col, 'Tramsformed' typo
    plt.xlabel('')

    # 5) skewness after the transform
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    # 6) scatter vs target after the transform
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
将数据进行Box-Cox转换
统计建模中常用的数据变化
数据更加的正态化,标准化
# Box-Cox transform every feature column toward normality, then rescale each
# back into [0, 1].  The +1 shift keeps boxcox's input strictly positive.
for col in data_all_norm.columns[:-2]:
    transformed, _maxlog = stats.boxcox(data_all_norm[col] + 1)
    data_all_norm[col] = scale_minmax(transformed)
过滤异常值
# Fit RidgeCV on the normalized training rows and drop any sample whose
# absolute residual exceeds one standard deviation of the target.
ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
y_train = data_all_norm[cond]['target']  # ground-truth target
ridge.fit(X_train, y_train)
# Predictions never match exactly; unusually large errors mark outliers.
y_ = ridge.predict(X_train)
cond = abs(y_ - y_train) > y_train.std()
print(cond.sum())

# Visualize the flagged samples (red) in three views.
plt.figure(figsize=(12, 6))
residual = y_train - y_
axes = plt.subplot(1, 3, 1)               # prediction vs truth
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)
axes = plt.subplot(1, 3, 2)               # residual vs truth
axes.scatter(y_train, residual)
axes.scatter(y_train[cond], residual[cond], c='red')
axes = plt.subplot(1, 3, 3)               # residual histogram
residual.plot.hist(bins=50, ax=axes)
residual.loc[cond].plot.hist(bins=50, ax=axes, color='r')

# Remove the flagged rows from the normalized frame.
index = cond[cond].index
data_all_norm.drop(index, axis=0, inplace=True)
# Rebuild the train/test matrices from the cleaned normalized data, fit the
# ensemble, and average the per-model predictions for the final submission.
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
y_train = data_all_norm[cond]['target']
cond = data_all_norm['origin'] == 'test'
X_test = data_all_norm[cond].iloc[:, :-2]

estimators = {
    'forest': RandomForestRegressor(n_estimators=300),
    'gbdt': GradientBoostingRegressor(n_estimators=300),
    'ada': AdaBoostRegressor(n_estimators=300),
    'extreme': ExtraTreesRegressor(n_estimators=300),
    'svm_rbf': SVR(kernel='rbf'),
    'light': LGBMRegressor(n_estimators=300),
    'xgb': XGBRegressor(n_estimators=300),
}

# One prediction vector per model; the final answer is their mean.
result = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    result.append(model.predict(X_test))
y_ = np.mean(result, axis=0)
pd.Series(y_).to_csv('./norm.txt', index=False)
成绩提交
总结:
代码其实还可以进一步的优化,之后会继续对代码进行优化,希望可以再提高自己的成绩。
望您:
“情深不寿,强极则辱,谦谦君子,温润如玉”。