1.导包
import warnings
warnings. filterwarnings( 'ignore' )
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib. pyplot as plt
% matplotlib inline
import seaborn as sns
from sklearn. linear_model import LinearRegression, Lasso, Ridge, RidgeCV, ElasticNet
from sklearn. neighbors import KNeighborsRegressor
from sklearn. ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn. svm import SVR
from sklearn. metrics import mean_squared_error
from sklearn. model_selection import train_test_split
from sklearn. preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures
2.加载数据及特征探索
2.1数据聚合
# Load the industrial-steam train/test sets (tab-separated) and stack them so
# that feature exploration and preprocessing can run on both halves at once.
train = pd.read_csv('./zhengqi_train.txt', sep='\t')
test = pd.read_csv('./zhengqi_test.txt', sep='\t')

# Tag every row with its origin so the combined frame can be split back later.
train['origin'] = 'train'
test['origin'] = 'test'

data_all = pd.concat([train, test])
print(data_all.shape)
data_all.head()
2.2特征探索
# Overlay the train vs. test kernel-density estimate of every feature column;
# features whose two densities diverge badly are candidates for removal since
# they would not generalize from train to test.
feature_cols = data_all.columns[:-2]     # all columns except 'target'/'origin'
n_features = len(feature_cols)           # generalizes the hard-coded 38
plt.figure(figsize=(9, n_features * 6))
for i, col in enumerate(feature_cols):
    cond_train = data_all['origin'] == 'train'
    train_col = data_all[col][cond_train]
    cond_test = data_all['origin'] == 'test'
    test_col = data_all[col][cond_test]
    axes = plt.subplot(n_features, 1, i + 1)
    # NOTE(review): `shade=` is deprecated in newer seaborn (use `fill=`);
    # kept as-is for compatibility with the version this notebook targets.
    ax = sns.kdeplot(train_col, shade=True, ax=axes)
    sns.kdeplot(test_col, shade=True, ax=ax)
    plt.legend(['train', 'test'])
    plt.xlabel(col)
# One FacetGrid per feature: side-by-side train vs. test distribution plots.
plt.figure(figsize=(9, 38 * 6))
for feature in data_all.columns[:-2]:
    grid = sns.FacetGrid(data_all, col='origin')
    grid.map(sns.distplot, feature)

# Features whose train/test distributions clearly differ (judged from the
# plots above) are removed from both halves.
drop_labels = ["V5", "V9", "V11", "V17", "V22", "V28"]
data_all.drop(drop_labels, axis=1, inplace=True)
2.3相关系数
# Correlation of every remaining feature with the target.  `origin` is a
# string column: pandas >= 2.0 raises TypeError from DataFrame.corr() on
# non-numeric data instead of silently dropping it, so request numeric
# columns explicitly (same result as the old silent-drop behavior).
corr = data_all.corr(numeric_only=True)

# Candidate drops: features with |corr(feature, target)| below 0.15.
cond = corr.loc['target'].abs() < 0.15
drop_labels = corr.loc['target'].index[cond]
drop_labels

# Final, manually confirmed subset of weak features to remove.
drop_labels = ['V14', 'V21', 'V19', 'V35', ]
data_all.drop(drop_labels, axis=1, inplace=True)
# Lower-triangle heat map of pairwise correlations on the raw training set
# (helps spot multicollinear feature pairs).
plt.figure(figsize=(20, 16))
# `origin` is a string column -> numeric_only=True (required on pandas >= 2.0).
mcorr = train.corr(numeric_only=True)
# np.bool was removed in NumPy 1.24 -- use the builtin bool dtype instead.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True    # hide the redundant upper triangle
cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()
3.数据处理
3.1标准化操作
# Inspect the two halves, then z-score all feature columns with one
# StandardScaler.  NOTE(review): the scaler is fit on train+test combined,
# which leaks test statistics into the transform -- kept as-is to match the
# original pipeline.
data_all[data_all['origin'] == 'train'].describe()
data_all[data_all['origin'] == 'test'].describe()

stand = StandardScaler()
data = data_all.iloc[:, :-2]          # features only ('target'/'origin' excluded)
data2 = stand.fit_transform(data)
data2

cols = data_all.columns
data_all_std = pd.DataFrame(data2, columns=cols[:-2])

# Give data_all a fresh 0..n-1 index so it aligns with data_all_std's
# RangeIndex (generalizes the hard-coded row count 4813).
data_all.index = np.arange(data_all.shape[0])
data_all

# Re-attach the untouched 'target' and 'origin' columns by position.
data_all_std = pd.merge(data_all_std, data_all.iloc[:, -2:], right_index=True, left_index=True)
data_all_std
data_all_std.describe()
3.2过滤异常值
# Fit a cross-validated ridge regression on the standardized training rows and
# flag samples whose residual exceeds 0.9 * std(y) as outliers.
ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])
cond = data_all_std['origin'] == 'train'
X_train = data_all_std[cond].iloc[:, :-2]
y_train = data_all_std[cond]['target']
ridge.fit(X_train, y_train)
y_ = ridge.predict(X_train)

# Residual-based outlier mask.
resid = y_train - y_
cond = resid.abs() > y_train.std() * 0.9

plt.figure(figsize=(12, 6))

# Panel 1: predicted vs. true, outliers in red.
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)

# Panel 2: residual vs. true.
axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, resid)
axes.scatter(y_train[cond], resid[cond], c='red', s=20)

# Panel 3: residual histogram with the outliers overlaid in red.
axes = plt.subplot(1, 3, 3)
resid.plot.hist(bins=50, ax=axes)
resid.loc[cond].plot.hist(bins=50, ax=axes, color='red')

# Remove the flagged rows from the standardized frame.
drop_index = cond[cond].index
data_all_std.drop(drop_index, axis=0, inplace=True)
3.3归一化处理
# Min-max scale the (unstandardized) feature columns onto [0, 1], then
# re-attach 'target' and 'origin' by index.  Note this restarts from
# data_all, not from the outlier-filtered data_all_std.
minmaxscaler = MinMaxScaler()
data = data_all.iloc[:, :-2]
data3 = minmaxscaler.fit_transform(data)
data_all_norm = pd.DataFrame(data3, columns=data_all.columns[:-2])
data_all_norm = pd.merge(data_all_norm, data_all.iloc[:, -2:], left_index=True, right_index=True)
def scale_minmax(data):
    """Rescale *data* linearly onto [0, 1] (element-wise for array/Series input)."""
    lo, hi = data.min(), data.max()
    return (data - lo) / (hi - lo)
# For the first 10 features, draw a 6-panel diagnostic row per feature:
# (1) raw distribution, (2) raw Q-Q plot + skew, (3) raw scatter vs. target,
# (4) Box-Cox distribution, (5) Box-Cox Q-Q plot + skew, (6) Box-Cox scatter.
fcols = 6
frows = len(data_all_norm.columns[:10])
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
for col in data_all_norm.columns[:10]:
    dat = data_all_norm[[col, 'target']].dropna()

    # --- raw feature -----------------------------------------------------
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[col], fit=stats.norm)
    plt.title(col + ' Original')
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[col], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.scatter(dat[col], dat['target'], alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))

    # --- Box-Cox transformed feature (+1 shift keeps inputs positive) ----
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[col].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    # Fixed title typo: 'Tramsformed' -> 'Transformed'.
    plt.title(col + ' Transformed')
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
# Box-Cox-transform every feature column in place (+1 shift keeps the
# [0, 1]-scaled inputs strictly positive), then rescale back onto [0, 1].
for feature in data_all_norm.columns[:-2]:
    transformed, _best_lambda = stats.boxcox(data_all_norm[feature] + 1)
    data_all_norm[feature] = scale_minmax(transformed)
# Same 6-panel diagnostic rows as before, now drawn AFTER the in-place
# Box-Cox transform, to confirm the skew reduction on the first 10 features.
fcols = 6
frows = len(data_all_norm.columns[:10])
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
for col in data_all_norm.columns[:10]:
    dat = data_all_norm[[col, 'target']].dropna()

    # --- current (already transformed) feature ---------------------------
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[col], fit=stats.norm)
    plt.title(col + ' Original')
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[col], plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(dat[col])))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.scatter(dat[col], dat['target'], alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(dat[col], dat['target'])[0][1]))

    # --- a further Box-Cox pass, for comparison --------------------------
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[col].dropna() + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    # Fixed title typo: 'Tramsformed' -> 'Transformed'.
    plt.title(col + ' Transformed')
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew=' + '{:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr=' + '{:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
3.4再次过滤异常值
# Second outlier pass, this time on the Box-Cox-normalized frame: refit the
# cross-validated ridge model and drop rows with residuals above 0.9 * std(y).
ridge = RidgeCV(alphas=[0.0001, 0.001, 0.01, 0.1, 0.2, 0.5, 1, 2, 3, 4, 5, 10, 20, 30, 50])
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
y_train = data_all_norm[cond]['target']
ridge.fit(X_train, y_train)
y_ = ridge.predict(X_train)

# Residual-based outlier mask; report how many rows are flagged.
cond = abs(y_ - y_train) > y_train.std() * 0.9
print(cond.sum())

plt.figure(figsize=(12, 6))

# Panel 1: predicted vs. true, outliers in red.
axes = plt.subplot(1, 3, 1)
axes.scatter(y_train, y_)
axes.scatter(y_train[cond], y_[cond], c='red', s=20)

# Panel 2: residual vs. true.
axes = plt.subplot(1, 3, 2)
axes.scatter(y_train, y_train - y_)
axes.scatter(y_train[cond], (y_train - y_)[cond], c='red')

# Panel 3: residual histogram with the outliers overlaid.
axes = plt.subplot(1, 3, 3)
(y_train - y_).plot.hist(bins=50, ax=axes)
(y_train - y_).loc[cond].plot.hist(bins=50, ax=axes, color='r')

# Remove the flagged rows from the normalized frame.
index = cond[cond].index
data_all_norm.drop(index, axis=0, inplace=True)
4预测数据
# Three base regressors; the SVR prediction is appended twice, which gives
# the RBF-kernel SVM double weight in the final average.
estimators = {}
estimators['ada'] = AdaBoostRegressor(n_estimators=300)
estimators['extreme'] = ExtraTreesRegressor(n_estimators=300)
estimators['svm_rbf'] = SVR(kernel='rbf')

# Split the normalized frame back into train features/target and test features.
cond = data_all_norm['origin'] == 'train'
X_train = data_all_norm[cond].iloc[:, :-2]
y_train = data_all_norm[cond]['target']
cond = data_all_norm['origin'] == 'test'
X_test = data_all_norm[cond].iloc[:, :-2]

result = []
for key, model in estimators.items():
    model.fit(X_train, y_train)
    y_ = model.predict(X_test)
    result.append(y_)
    if key == 'svm_rbf':
        result.append(y_)   # double weight for the SVR
y_ = np.mean(result, axis=0)
# Self-training ("pseudo-labelling"): for 30 rounds, rebuild the target column
# using real targets for train rows and the latest ensemble predictions for
# test rows, refit every estimator on ALL rows, and re-predict the test rows.
data_all_norm.set_index(np.arange(data_all_norm.shape[0]), inplace=True)
for i in range(30):
    # Train rows come first in the frame, so concatenating real targets with
    # the current test predictions lines up with the 0..n-1 index.
    cond = data_all_norm['origin'] == 'train'
    train_target = list(data_all_norm[cond]['target'])
    test_target = list(y_)
    target_all = train_target + test_target
    target_all = pd.Series(target_all)

    # Replace the target column (moves it to the last position, after 'origin').
    data_all_norm = data_all_norm.drop(['target'], axis=1)
    data_all_norm['target'] = target_all

    # Fit on every row, pseudo-labels included ...
    X_train = data_all_norm.iloc[:, :-2]
    y_train = data_all_norm['target']

    # ... then predict the test rows again.
    cond = data_all_norm['origin'] == 'test'
    X_test = data_all_norm[cond].iloc[:, :-2]

    result = []
    for key, model in estimators.items():
        model.fit(X_train, y_train)
        y_ = model.predict(X_test)
        result.append(y_)
        if key == 'svm_rbf':
            result.append(y_)   # SVR kept at double weight
    y_ = np.mean(result, axis=0)

# Persist the final averaged test predictions for submission.
pd.Series(y_).to_csv('./predict.txt', index=False)
5.提交结果
去官网进行提交
工业蒸汽预测