天池工业蒸汽量
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler,PolynomialFeatures
数据聚合
# Load the tab-separated training and test sets. Each frame is tagged with an
# `origin` column so its rows can still be told apart after stacking.
train = pd.read_csv('./zhengqi_train.txt', sep='\t').assign(origin='train')
test = pd.read_csv('./zhengqi_test.txt', sep='\t').assign(origin='test')

# Stack both splits into one frame; the original row indices are kept as-is.
data_all = pd.concat([train, test])
print(data_all.shape)
data_all.head()
# Compare the train and test distribution of every feature: one KDE subplot
# per feature, stacked vertically in a single tall figure.
#
# The trailing two columns are skipped (`origin` is appended last when the
# data is loaded; presumably the other one is `target` -- confirm).
feature_cols = data_all.columns[:-2]

# The row masks do not depend on the feature, so build them once up front
# instead of on every loop iteration.
is_train = data_all['origin'] == 'train'
is_test = data_all['origin'] == 'test'

# Size the grid from the actual number of features rather than a fixed 38,
# so the subplot index can never run past the grid.
n_features = len(feature_cols)
plt.figure(figsize=(9, n_features * 6))
for i, col in enumerate(feature_cols):
    train_col = data_all[col][is_train]
    test_col = data_all[col][is_test]
    axes = plt.subplot(n_features, 1, i + 1)
    # NOTE(review): newer seaborn releases spell `shade` as `fill`; the
    # original keyword is kept to match the seaborn version this file targets.
    ax = sns.kdeplot(train_col, shade=True, ax=axes)
    sns.kdeplot(test_col, shade=True, ax=ax)
    plt.legend(['train', 'test'])
    plt.xlabel(col)
# For every feature, draw its distribution in a FacetGrid with one panel per
# `origin` value, so the train and test shapes sit side by side.
# NOTE(review): FacetGrid appears to create its own figure, so the figure
# opened here likely stays empty -- confirm before removing it.
plt.figure(figsize=(9, 6))
feature_names = data_all.columns[:-2]
for feature in feature_names:
    # `map` returns the grid itself, so `g` still refers to the FacetGrid.
    g = sns.FacetGrid(data_all, col='origin').map(sns.distplot, feature)
# Remove four features from the combined frame, in place.
# (presumably picked from the distribution plots earlier in the file -- confirm)
drop_labels = ['V11', 'V17', 'V22', 'V5']
data_all.drop(columns=drop_labels, inplace=True)
data_all.shape
相关性系数
# Covariance and correlation of the numeric columns.
# `data_all` also holds the string-valued `origin` tag. Recent pandas raises
# on non-numeric columns in cov()/corr() instead of silently skipping them,
# so restrict the frame to numeric columns explicitly.
numeric_all = data_all.select_dtypes(include=[np.number])
cov = numeric_all.cov()
cov.head()
corr = numeric_all.corr()
corr.head()

# Features whose absolute correlation with `target` is below 0.1.
target_corr = corr.loc['target']
cond = target_corr.abs() < 0.1
drop_labels = target_corr.index[cond]
drop_labels

# Only this hand-picked pair is actually removed; it replaces the list
# computed just above.
drop_labels = ['V14', 'V21']
data_all.drop(columns=drop_labels, inplace=True)
data_all.shape
# Lower-triangle heatmap of the pairwise correlations in the training set.
plt.figure(figsize=(20, 16))

# `train` carries the string-valued `origin` tag added when the data was
# loaded; keep numeric columns only so corr() does not fail on it.
mcorr = train.select_dtypes(include=[np.number]).corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# Builtin `bool` replaces the `np.bool` alias, which NumPy 1.24 removed.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.<